Handle end of input. Speed improvements. Deleted legacy code.
author Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Thu, 12 Oct 2017 10:13:25 +0000 (13:13 +0300)
committer Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Thu, 12 Oct 2017 10:13:25 +0000 (13:13 +0300)
src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java

index 939ede9..722e17a 100755 (executable)
@@ -24,6 +24,9 @@ public class Tokenizer {
     private String source;
     private int currentIndex = 0;
 
+    int cachedTerminatorIndex = -1;
+    Terminator cachedTerminator;
+
     public Tokenizer(final String source) {
         this.source = source;
     }
@@ -34,6 +37,9 @@ public class Tokenizer {
         this.source = source;
         currentIndex = 0;
         tokenIndexes.clear();
+
+        cachedTerminatorIndex = -1;
+        cachedTerminator = null;
         return this;
     }
 
@@ -57,53 +63,80 @@ public class Tokenizer {
                     + "\" but got \"" + match.token + "\" instead.");
     }
 
+
+
     public TokenizerMatch getNextToken() throws InvalidSyntaxException {
         tokenIndexes.push(currentIndex);
 
-        StringBuilder token = new StringBuilder();
+        StringBuilder tokenAccumulator = new StringBuilder();
 
         while (true){
+
+            if (currentIndex >= source.length()){ // reached end of input
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null);
+                else
+                    return null;
+            }
+
             if (isOngoingToken()) {
-                token.append(source.charAt(currentIndex));
+                tokenAccumulator.append(source.charAt(currentIndex));
                 currentIndex++;
                 continue;
             }
 
-            Terminator tokenTerminator = findTokenTerminator();
-
-            if (tokenTerminator.termination == PRESERVE){
-                return buildPreservedToken(token, tokenTerminator);
-            } else if (tokenTerminator.termination == DROP){
-                if (hasAccumulatedToken(token)){
-                    currentIndex++;
-                    return new TokenizerMatch(token.toString(), "", tokenTerminator);
-                } else {
-                    currentIndex++;
-                }
+            Terminator terminator = getOrFindTokenTerminator();
+
+            if (terminator.termination == PRESERVE)
+                return buildPreservedToken(tokenAccumulator, terminator);
+            else if (terminator.termination == DROP){
+                skipUntilTerminatorEnd(terminator);
+
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
             }
         }
 
     }
 
+    private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
+        if (terminator.hasEndSequence())
+            currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
+        else
+            currentIndex += terminator.startSequence.length();
+    }
+
     private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException {
         if (hasAccumulatedToken(token))
-            return new TokenizerMatch(token.toString(), "", terminator);
+            return new TokenizerMatch(token.toString(), null, terminator);
 
-        if (terminator.hasEndSequence()){
-            int endSequenceIndex = source.indexOf(terminator.endSequence,
-                    currentIndex + terminator.startSequence.length());
+        if (terminator.hasEndSequence())
+            return buildComplexPreservedToken(terminator);
+        else
+            return buildSimplePreservedToken(terminator);
+    }
 
-            if (endSequenceIndex < 0)
-                throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
+    private TokenizerMatch buildSimplePreservedToken(Terminator terminator) {
+        currentIndex += terminator.startSequence.length();
+        return new TokenizerMatch(terminator.startSequence, null, terminator);
+    }
 
-            String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
-            currentIndex = endSequenceIndex + terminator.endSequence.length();
+    private TokenizerMatch buildComplexPreservedToken(Terminator terminator) throws InvalidSyntaxException {
+        int endSequenceIndex = getEndSequenceIndex(terminator);
+        String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
+        currentIndex = endSequenceIndex + terminator.endSequence.length();
 
-            return new TokenizerMatch(terminator.startSequence, reminder, terminator);
-        } else {
-            currentIndex += terminator.startSequence.length();
-            return new TokenizerMatch(terminator.startSequence, "", terminator);
-        }
+        return new TokenizerMatch(terminator.startSequence, reminder, terminator);
+    }
+
+    private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
+        int endSequenceIndex = source.indexOf(terminator.endSequence,
+                currentIndex + terminator.startSequence.length());
+
+        if (endSequenceIndex < 0)
+            throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
+
+        return endSequenceIndex;
     }
 
     private boolean hasAccumulatedToken(StringBuilder token) {
@@ -111,10 +144,26 @@ public class Tokenizer {
     }
 
     private boolean isOngoingToken() {
-        return findTokenTerminator() == null;
+        return getOrFindTokenTerminator() == null;
     }
 
-    public Terminator findTokenTerminator() {
+    public boolean hasMoreTokens(){
+        return currentIndex < source.length();
+    }
+
+    /**
+     * Attempts to cache terminator search result.
+     */
+    public Terminator getOrFindTokenTerminator() {
+        if (currentIndex == cachedTerminatorIndex)
+            return cachedTerminator;
+
+        cachedTerminatorIndex = currentIndex;
+        cachedTerminator = findTokenTerminator();
+        return cachedTerminator;
+    }
+
+    private Terminator findTokenTerminator() {
         for (Terminator terminator : terminators)
             if (terminator.matches(source, currentIndex))
                 return terminator;
@@ -144,35 +193,7 @@ public class Tokenizer {
         if (peekIsOneOf(possibilities))
             throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
     }
-
-
-    public boolean sequenceMatches(final String sequence) {
-        if ((currentIndex + sequence.length()) > source.length())
-            return false;
-
-        for (int i = 0; i < sequence.length(); i++)
-            if (sequence.charAt(i) != source.charAt(i + currentIndex))
-                return false;
-
-        return true;
-    }
-
-    public void skipUntilDataEnd() {
-        tokenIndexes.push(currentIndex);
-        currentIndex = source.length();
-    }
-
-    public void skipUntilSequence(final String sequence) {
-        while (currentIndex < source.length()) {
-            if (sequenceMatches(sequence)) {
-                currentIndex += sequence.length();
-                return;
-            }
-
-            currentIndex++;
-        }
-    }
-
+
     public void unreadToken() {
         currentIndex = tokenIndexes.pop();
     }
index ddb2662..84571e8 100644 (file)
@@ -5,21 +5,13 @@ import org.junit.Test;
 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
 
 public class TokenizerTest {
-    @Test
-    public void findTokenTerminator() throws Exception {
-
-        Tokenizer tokenizer = new Tokenizer("this /* comment */ a test")
-                .addTerminator("/*", "*/", PRESERVE);
-
-
-
-    }
-
 
     @Test
-    public void you_can_peek() throws Exception {
+    public void testPeeking() throws Exception {
         Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test")
                 .addTerminator(" ", DROP)
                 .addTerminator("N'", "'", PRESERVE);
@@ -34,56 +26,44 @@ public class TokenizerTest {
     }
 
     @Test
-    public void complexTerminator() throws Exception {
-        Tokenizer tokenizer = new Tokenizer("/* hello */ /** comment **/   ((  is a N'2015-03-18 09:48:54.360' test")
+    public void testTokenization() throws Exception {
+        Tokenizer tokenizer = new Tokenizer("\"hello\" /** comment **/   ((  is a N'2015-03-18 09:48:54.360' test")
                 .addTerminator(" ", DROP)
                 .addTerminator("(", PRESERVE)
                 .addTerminator("\"", "\"" , PRESERVE)
-                .addTerminator("/*", "*/" , PRESERVE)
+                .addTerminator("N'", "'" , PRESERVE)
+                .addTerminator("/*", "*/" , DROP)
                 ;
 
-        TokenizerMatch nextToken = tokenizer.getNextToken();
-        System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\"");
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-        System.out.println(tokenizer.getNextToken().token);
-
+        assertTokenEquals("\"", "hello", tokenizer);
+        assertTokenEquals("(", null, tokenizer);
+        assertTokenEquals("(", null, tokenizer);
+        assertTokenEquals("is", null, tokenizer);
+        assertTokenEquals("a", null, tokenizer);
+        assertTokenEquals("N'", "2015-03-18 09:48:54.360", tokenizer);
+        assertTokenEquals("test", null, tokenizer);
 
-//        tokenizer.expectAndConsumeNextToken("this");
-//
-//        assertEquals("is", tokenizer.peekNextToken().token);
-//
-//        assertEquals("is", tokenizer.peekNextToken().token);
-//
-//        assertEquals(true, tokenizer.peekIsOneOf("maybe", "is", "that"));
+        assertNull(tokenizer.getNextToken());
+        assertFalse(tokenizer.hasMoreTokens());
     }
 
+    private void assertTokenEquals(String token, String reminder, Tokenizer tokenizer) throws InvalidSyntaxException {
+        TokenizerMatch nextToken = tokenizer.getNextToken();
 
-    @Test
-    public void testComplexTerminator() throws Exception {
-        Tokenizer tokenizer = new Tokenizer("this N'2015-03-18 09:48:54.360'  /* thoe unto u */ test")
-                .addTerminator(" ", DROP)
-                .addTerminator("/*", "*/", PRESERVE);
-
-//        tokenizer.expectAndConsumeNextToken("this");
-
-//        assertEquals("2015-03-18 09:48:54.360", tokenizer.getNextToken().token);
-
-        System.out.println("1st: " + tokenizer.getNextToken().token);
-
-        System.out.println("2nd: " + tokenizer.getNextToken().token);
-
-        System.out.println("2nd: " + tokenizer.getNextToken().token);
-
-        System.out.println("2nd: " + tokenizer.getNextToken().token);
-
-        System.out.println("2nd: " + tokenizer.getNextToken().token);
+        assertEquals(token, nextToken.token);
 
+        if (reminder == null)
+            assertNull(nextToken.reminder);
+        else
+            assertEquals(reminder, nextToken.reminder);
     }
 
+    private void debugNextToken(Tokenizer tokenizer) throws InvalidSyntaxException {
+        TokenizerMatch nextToken = tokenizer.getNextToken();
+        if (nextToken == null)
+            System.out.println("null");
+        else
+            System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\"");
+    }
+
 }
\ No newline at end of file