Use regular expressions as terminators
[svjatoslav_commons.git] / src / main / java / eu / svjatoslav / commons / string / tokenizer / Tokenizer.java
index cc20369..6aa56b4 100755 (executable)
@@ -7,9 +7,9 @@ package eu.svjatoslav.commons.string.tokenizer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Stack;
+import java.util.regex.Matcher;
 import java.util.stream.Stream;
 
-import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
 import static java.lang.System.out;
 
@@ -29,9 +29,6 @@ public class Tokenizer {
 
     private int currentIndex = 0;
 
-    private int cachedTerminatorIndex = -1;
-    private Terminator cachedTerminator;
-
     public Tokenizer(final String source) {
         this.source = source;
     }
@@ -43,30 +40,29 @@ public class Tokenizer {
         this.source = source;
         currentIndex = 0;
         tokenIndexes.clear();
-
-        cachedTerminatorIndex = -1;
-        cachedTerminator = null;
         return this;
     }
 
-    public Tokenizer addTerminator(final String startSequence,
-                                   final Terminator.TerminationStrategy terminationStrategy) {
-        terminators.add(new Terminator(startSequence, terminationStrategy));
-        return this;
+    /** Registers a regexp terminator with no group (null) and returns the created terminator. */
+    public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+        // Delegate to the three-argument overload so construction and registration live in one place.
+        return addTerminator(terminationStrategy, regexp, null);
     }
 
-    public Tokenizer addTerminator(Terminator terminator) {
+    public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+                                    String regexp, String group) {
+        Terminator terminator = new Terminator(terminationStrategy, regexp,group);
         terminators.add(terminator);
-        return this;
+        return terminator;
     }
 
-    public Tokenizer addTerminator(final String startSequence,
-                                   final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
-        terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
-        return this;
+    /** Registers an already constructed terminator and returns that same instance. */
+    public Terminator addTerminator(Terminator terminator) {
+        terminators.add(terminator);
+        return terminator;
     }
 
-    public void expectAndConsumeNextToken(final String value)
+    public void expectAndConsumeNextStringToken(final String value)
             throws InvalidSyntaxException {
         final TokenizerMatch match = getNextToken();
         if (!value.equals(match.token))
@@ -74,11 +70,22 @@ public class Tokenizer {
                     + "\" but got \"" + match.token + "\" instead.");
     }
 
+    /**
+     * Consumes and returns the next token, which must come from the given terminator; throws at end of input too.
+     */
+    public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+            throws InvalidSyntaxException {
+        final TokenizerMatch match = getNextToken();
+        if (match == null || match.terminator != terminator) // getNextToken() yields null at end of input
+            throw new InvalidSyntaxException("Expected terminator \"" + terminator + "\" but got \""
+                    + (match == null ? "end of input" : match.terminator) + "\" instead.");
+        return match;
+    }
+
     /**
      * @return next @TokenizerMatch or <code>null</code> if end of input is reached.
-     * @throws InvalidSyntaxException
      */
-    public TokenizerMatch getNextToken() throws InvalidSyntaxException {
+    public TokenizerMatch getNextToken() {
         tokenIndexes.push(currentIndex);
 
         StringBuilder tokenAccumulator = new StringBuilder();
@@ -92,104 +99,49 @@ public class Tokenizer {
                     return null;
             }
 
-            if (isOngoingToken()) {
+            TokenizerMatch matchResult = findTerminatorMatch();
+            if (matchResult == null) {
                 tokenAccumulator.append(source.charAt(currentIndex));
                 currentIndex++;
                 continue;
             }
 
-            Terminator terminator = getOrFindTokenTerminator();
+            if (matchResult.terminator.termination == PRESERVE) {
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null);
 
-            if (terminator.termination == PRESERVE)
-                return buildPreservedToken(tokenAccumulator, terminator);
-            else if (terminator.termination == DROP) {
-                skipUntilTerminatorEnd(terminator);
+                currentIndex = matchResult.matcher.end();
+                return matchResult;
+            } else {
+                currentIndex = matchResult.matcher.end();
 
                 if (hasAccumulatedToken(tokenAccumulator))
-                    return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null);
             }
         }
-
     }
 
-    private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
-        if (terminator.hasEndSequence())
-            currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
-        else
-            currentIndex += terminator.startSequence.length();
-    }
-
-    /**
-     * @throws InvalidSyntaxException if end sequence is not found as is expected by given token.
-     */
-    private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator)
-            throws InvalidSyntaxException {
-        if (hasAccumulatedToken(token))
-            return new TokenizerMatch(token.toString(), null, terminator);
-
-        if (terminator.hasEndSequence())
-            return buildTokenWithExpectedENdSequence(terminator);
-        else
-            return buildTokenWithoutEndSequence(terminator);
-    }
-
-    private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) {
-        currentIndex += terminator.startSequence.length();
-        return new TokenizerMatch(terminator.startSequence, null, terminator);
-    }
-
-    private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException {
-        int endSequenceIndex = getEndSequenceIndex(terminator);
-        String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
-        currentIndex = endSequenceIndex + terminator.endSequence.length();
-
-        return new TokenizerMatch(terminator.startSequence, reminder, terminator);
-    }
-
-    /**
-     * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence.
-     */
-    private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
-        int endSequenceIndex = source.indexOf(terminator.endSequence,
-                currentIndex + terminator.startSequence.length());
-
-        if (endSequenceIndex < 0)
-            throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
-
-        return endSequenceIndex;
-    }
-
-    private boolean hasAccumulatedToken(StringBuilder token) {
-        return token.length() > 0;
+    /** Returns a match for the first active terminator whose matcher finds a hit, or null if none does. */
+    public TokenizerMatch findTerminatorMatch(){
+        for (Terminator candidate : terminators) {
+            if (!candidate.active) continue;
+            // NOTE(review): find() is unanchored; presumably Terminator.match() pins the search to currentIndex - confirm.
+            Matcher hit = candidate.match(source, currentIndex);
+            if (hit.find())
+                return new TokenizerMatch(source.substring(hit.start(), hit.end()), candidate, hit);
+        }
+        return null;
     }
 
-    private boolean isOngoingToken() {
-        return getOrFindTokenTerminator() == null;
+    private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) { // true once any character is buffered
+        return tokenAccumulator.length() > 0;
     }
 
     public boolean hasMoreContent() {
+        if (source == null) return false; // a null source holds no content; also avoids an NPE on length()
         return currentIndex < source.length();
     }
 
-    /**
-     * Attempts to cache terminator search result.
-     */
-    public Terminator getOrFindTokenTerminator() {
-        if (currentIndex == cachedTerminatorIndex)
-            return cachedTerminator;
-
-        cachedTerminatorIndex = currentIndex;
-        cachedTerminator = findTokenTerminator();
-        return cachedTerminator;
-    }
-
-    private Terminator findTokenTerminator() {
-        for (Terminator terminator : terminators)
-            if (terminator.matches(source, currentIndex))
-                return terminator;
-        return null;
-    }
-
     public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
         if (token.equals(getNextToken().token))
             return true;
@@ -224,13 +176,9 @@ public class Tokenizer {
     public void enlistRemainingTokens(){
         int redTokenCount = 0;
 
-        try {
-            while (hasMoreContent()) {
-                out.println(getNextToken().toString());
-                redTokenCount++;
-            }
-        } catch (InvalidSyntaxException e){
-            out.println("There is syntax exception");
+        while (hasMoreContent()) {
+            out.println(getNextToken().toString());
+            redTokenCount++;
         }
 
         // restore pointer to original location