Helper function to split string into groups based on regexp. Possibility to retrieve...

[svjatoslav_commons.git] / src / main / java / eu / svjatoslav / commons / string / tokenizer / Tokenizer.java
diff --git a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java

index 8b396ed..14554be 100755 (executable)
--- a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
+++ b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
@@ -1,41 +1,68 @@
  /*
- * Svjatoslav Commons - shared library of common functionality.
- * Copyright ©2012-2017, Svjatoslav Agejenko, svjatoslav@svjatoslav.eu
- * 
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 3 of the GNU Lesser General Public License
- * or later as published by the Free Software Foundation.
+ * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
+ * This project is released under Creative Commons Zero (CC0) license.
   */
-
  package eu.svjatoslav.commons.string.tokenizer;
  
  import java.util.ArrayList;
  import java.util.List;
  import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.stream.Stream;
+
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static java.lang.System.out;
  
  public class Tokenizer {
  
-    final Stack<Integer> tokenIndexes = new Stack<>();
+    /**
+     * Stack of token indexes. This allows walking back in history and un-consuming tokens.
+     */
+    private final Stack<Integer> tokenIndexes = new Stack<>();
+
+    /**
+     * Terminators that will be searched for by given tokenizer within given source string.
+     */
      private final List<Terminator> terminators = new ArrayList<>();
-    private final String source;
+
+    private String source; // string to be tokenized
+
      private int currentIndex = 0;
  
+    /**
+     * Creates a tokenizer that reads from the given string.
+     *
+     * @param source string to be tokenized
+     */
      public Tokenizer(final String source) {
          this.source = source;
      }
  
-    public void addTerminator(final String startSequence,
-                              final boolean ignoreTerminator) {
-        terminators.add(new Terminator(startSequence, ignoreTerminator));
+    /**
+     * Creates a tokenizer without a source string. A source has to be supplied
+     * through {@link #setSource(String)} before tokens are read.
+     */
+    public Tokenizer() {
      }
  
-    public void addTerminator(final String startSequence,
-                              final String endSequence, final boolean ignoreTerminator) {
-        terminators.add(new Terminator(startSequence, endSequence,
-                ignoreTerminator));
+    /**
+     * Replaces the string to be tokenized and rewinds this tokenizer: the
+     * current index returns to the start and the recorded token indexes are
+     * discarded. Registered terminators are kept.
+     *
+     * @param source new string to be tokenized
+     * @return this tokenizer, to allow call chaining
+     */
+    public Tokenizer setSource(String source) {
+        // forget everything remembered about the previous source
+        tokenIndexes.clear();
+        currentIndex = 0;
+
+        this.source = source;
+        return this;
+    }
+
+    /**
+     * Creates a terminator from the given strategy and regular expression,
+     * passing <code>null</code> as its group, and registers it.
+     *
+     * @param terminationStrategy strategy handed to the new {@link Terminator}
+     * @param regexp              regular expression handed to the new {@link Terminator}
+     * @return the newly created and registered terminator
+     */
+    public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+        final Terminator created = new Terminator(terminationStrategy, regexp, null);
+        terminators.add(created);
+        return created;
+    }
+
+    /**
+     * Creates a terminator from the given strategy, regular expression and
+     * group, and registers it.
+     *
+     * @param terminationStrategy strategy handed to the new {@link Terminator}
+     * @param regexp              regular expression handed to the new {@link Terminator}
+     * @param group               group handed to the new {@link Terminator}
+     * @return the newly created and registered terminator
+     */
+    public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+                                    String regexp, String group) {
+        final Terminator created = new Terminator(terminationStrategy, regexp, group);
+        terminators.add(created);
+        return created;
+    }
+
+
+    /**
+     * Registers an already constructed terminator.
+     *
+     * @param terminator terminator to add to this tokenizer
+     * @return the same terminator that was passed in
+     */
+    public Terminator addTerminator(Terminator terminator) {
+        terminators.add(terminator);
+        return terminator;
      }
  
-    public void expectNextToken(final String value)
+    public void expectAndConsumeNextStringToken(final String value)
              throws InvalidSyntaxException {
          final TokenizerMatch match = getNextToken();
          if (!value.equals(match.token))
@@ -43,49 +70,79 @@ public class Tokenizer {
                      + "\" but got \"" + match.token + "\" instead.");
      }
  
+    /**
+     * Reads the next token and verifies that it was produced by the given
+     * terminator.
+     *
+     * @param terminator terminator that must have produced the next token
+     *                   (compared by identity)
+     * @return the consumed match
+     * @throws InvalidSyntaxException if the input has ended or the next token
+     *                                was produced by a different terminator
+     */
+    public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+            throws InvalidSyntaxException {
+        final TokenizerMatch match = getNextToken();
+
+        // getNextToken() returns null at end of input; report that as a syntax
+        // error instead of failing with a NullPointerException below
+        if (match == null)
+            throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                    + "\" but reached end of input instead.");
+
+        if (match.terminator != terminator)
+            throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                    + "\" but got \"" + match.terminator + "\" instead.");
+
+        return match;
+    }
+
+
+    /**
+     * Reads the next token, starting at the current index. The index that was
+     * current on entry is pushed to the index stack first, so the read can be
+     * undone with {@link #unreadToken()}.
+     * <p>
+     * Characters are accumulated until {@link #findTerminatorMatch()} reports a
+     * terminator or the input ends. Accumulated text is returned as a match
+     * whose terminator and matcher are <code>null</code>. A terminator with the
+     * PRESERVE strategy is returned as a token of its own; any other terminator
+     * is consumed without being returned.
+     *
+     * @return next {@link TokenizerMatch} or <code>null</code> if end of input
+     * is reached with no text accumulated.
+     */
      public TokenizerMatch getNextToken() {
          tokenIndexes.push(currentIndex);
-        final StringBuilder result = new StringBuilder();
+
+        StringBuilder tokenAccumulator = new StringBuilder();
  
          while (true) {
-            if (currentIndex >= source.length())
-                return null;
-
-            boolean accumulateCurrentChar = true;
-
-            for (final Terminator terminator : terminators)
-                if (sequenceMatches(terminator.startSequence))
-
-                    if (terminator.ignoreTerminator) {
-                        currentIndex += terminator.startSequence.length();
-
-                        if (terminator.endSequence != null)
-                            skipUntilSequence(terminator.endSequence);
-
-                        if (result.length() > 0)
-                            return new TokenizerMatch(result.toString(),
-                                    terminator);
-                        else {
-                            accumulateCurrentChar = false;
-                            break;
-                        }
-                    } else if (result.length() > 0)
-                        return new TokenizerMatch(result.toString(), terminator);
-                    else {
-                        currentIndex += terminator.startSequence.length();
-                        return new TokenizerMatch(terminator.startSequence,
-                                terminator);
-                    }
-
-            if (accumulateCurrentChar) {
-                result.append(source.charAt(currentIndex));
+
+            if (currentIndex >= source.length()) { // reached end of input
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+                else
+                    return null;
+            }
+
+            TokenizerMatch matchResult = findTerminatorMatch();
+            if (matchResult == null) {
+                // no terminator reported: the character belongs to the plain-text token
+                tokenAccumulator.append(source.charAt(currentIndex));
                  currentIndex++;
+                continue;
+            }
+
+            if (matchResult.terminator.termination == PRESERVE) {
+                // pending text goes out first; the terminator is left unconsumed
+                // so that the next call returns it
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+
+                currentIndex = matchResult.matcher.end();
+                return matchResult;
+            } else {
+                // the terminator is consumed but never returned as a token of its own
+                currentIndex = matchResult.matcher.end();
+
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
              }
          }
+    }
+
+    /**
+     * Asks every active terminator, in registration order, to match against the
+     * source at the current index and reports the first one that succeeds.
+     * The current index is not changed.
+     *
+     * @return match describing the first matching terminator, or
+     * <code>null</code> if no active terminator matches
+     */
+    public TokenizerMatch findTerminatorMatch(){
+        for (final Terminator candidate : terminators) {
+            if (!candidate.active)
+                continue; // inactive terminators are not consulted
+
+            final Matcher matcher = candidate.match(source, currentIndex);
+            if (!matcher.find())
+                continue;
+
+            final String matchedText = source.substring(matcher.start(), matcher.end());
+            return new TokenizerMatch(matchedText, candidate, matcher, this);
+        }
+        return null;
+    }
  
+    /**
+     * @return <code>true</code> if at least one character has been accumulated
+     */
+    private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
+        return tokenAccumulator.length() != 0;
      }
  
-    public boolean probeNextToken(final String token) {
+    /**
+     * @return <code>true</code> if a source is set and the current index has
+     * not yet reached its end
+     */
+    public boolean hasMoreContent() {
+        return source != null && currentIndex < source.length();
+    }
+
+    public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
          if (token.equals(getNextToken().token))
              return true;
  
@@ -93,35 +150,45 @@ public class Tokenizer {
          return false;
      }
  
-    public boolean sequenceMatches(final String sequence) {
-        if ((currentIndex + sequence.length()) > source.length())
-            return false;
+    /**
+     * Reads the next token and immediately un-reads it, so the current index is
+     * the same after the call as before it.
+     *
+     * @return the token that {@link #getNextToken()} would return next; may be
+     * <code>null</code>
+     */
+    public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
+        final TokenizerMatch upcoming = getNextToken();
+        unreadToken(); // step back to where we started
+        return upcoming;
+    }
  
-        for (int i = 0; i < sequence.length(); i++)
-            if (sequence.charAt(i) != source.charAt(i + currentIndex))
-                return false;
+    /**
+     * Checks, without consuming anything, whether the next token equals one of
+     * the given strings.
+     *
+     * @param possibilities token values to compare the next token against
+     * @return <code>true</code> if the next token equals any of the possibilities;
+     * <code>false</code> otherwise, including when no token is left
+     * @throws InvalidSyntaxException propagated from {@link #peekNextToken()}
+     */
+    public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
+        final TokenizerMatch next = peekNextToken();
+
+        // peekNextToken() yields null at end of input: nothing can match then
+        if (next == null)
+            return false;
+
+        final String nextToken = next.token;
+        return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
+    }
  
-        return true;
+    /**
+     * Verifies, without consuming anything, that the next token is none of the
+     * given strings.
+     *
+     * @param possibilities token values that must not come next
+     * @throws InvalidSyntaxException if the next token equals one of the possibilities
+     */
+    public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
+        if (!peekIsOneOf(possibilities))
+            return;
+
+        final String offendingToken = peekNextToken().token;
+        throw new InvalidSyntaxException("Not expected \"" + offendingToken + "\" here.");
      }
  
-    public void skipUntilDataEnd() {
-        tokenIndexes.push(currentIndex);
-        currentIndex = source.length();
+    /**
+     * Restores the current index to the value saved by the most recent
+     * {@link #getNextToken()} or {@link #skipUntilDataEnd()} call that has not
+     * been undone yet. May be called repeatedly to step further back.
+     */
+    public void unreadToken() {
+        currentIndex = tokenIndexes.pop();
      }
  
-    public void skipUntilSequence(final String sequence) {
-        while (currentIndex < source.length()) {
-            if (sequenceMatches(sequence)) {
-                currentIndex += sequence.length();
-                return;
-            }
+    /**
+     * Prints every remaining token to standard output, one per line, and then
+     * restores the current index to where it was before the call.
+     * For debugging.
+     */
+    public void enlistRemainingTokens(){
+        int readTokenCount = 0;
  
-            currentIndex++;
+        while (hasMoreContent()) {
+            // getNextToken() returns null when only non-PRESERVE terminators
+            // remain; println(Object) prints "null" then instead of throwing
+            out.println(getNextToken());
+            readTokenCount++;
          }
+
+        // restore pointer to original location
+        for (int i = 0; i < readTokenCount; i++) unreadToken();
      }
  
-    public void unreadToken() {
-        currentIndex = tokenIndexes.pop();
+
+    /**
+     * Moves the current index to the end of the source, so that no content
+     * remains to be read. The previous index is pushed to the index stack
+     * first, so the skip can be undone with {@link #unreadToken()}.
+     */
+    public void skipUntilDataEnd() {
+        tokenIndexes.push(currentIndex);
+        currentIndex = source.length();
      }
  
  }