/*
- * Svjatoslav Commons - shared library of common functionality.
- * Copyright ©2012-2014, Svjatoslav Agejenko, svjatoslav@svjatoslav.eu
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 3 of the GNU Lesser General Public License
- * or later as published by the Free Software Foundation.
+ * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
+ * This project is released under Creative Commons Zero (CC0) license.
*/
-
package eu.svjatoslav.commons.string.tokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.stream.Stream;
+
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static java.lang.System.out;
public class Tokenizer {
- private final List<Terminator> terminators = new ArrayList<Terminator>();
- private final String source;
-
- Stack<Integer> tokenIndexes = new Stack<Integer>();
-
- private int currentIndex = 0;
-
- public Tokenizer(final String source) {
- this.source = source;
- }
-
- public void addTerminator(final String startSequence,
- final boolean ignoreTerminator) {
- terminators.add(new Terminator(startSequence, ignoreTerminator));
- }
-
- public void addTerminator(final String startSequence,
- final String endSequence, final boolean ignoreTerminator) {
- terminators.add(new Terminator(startSequence, endSequence,
- ignoreTerminator));
- }
-
- public void expectNextToken(final String value)
- throws InvalidSyntaxException {
- final TokenizerMatch match = getNextToken();
- if (!value.equals(match.token))
- throw new InvalidSyntaxException("Expected \"" + value
- + "\" but got \"" + match.token + "\" instead.");
- }
-
- public TokenizerMatch getNextToken() {
- tokenIndexes.push(currentIndex);
- final StringBuffer result = new StringBuffer();
-
- while (true) {
- if (currentIndex >= source.length())
- return null;
-
- boolean accumulateCurrentChar = true;
-
- findTerminator: for (final Terminator terminator : terminators)
- if (sequenceMatches(terminator.startSequence))
-
- if (terminator.ignoreTerminator) {
- currentIndex += terminator.startSequence.length();
-
- if (terminator.endSequence != null)
- skipUntilSequence(terminator.endSequence);
-
- if (result.length() > 0)
- return new TokenizerMatch(result.toString(),
- terminator);
- else {
- accumulateCurrentChar = false;
- break findTerminator;
- }
- } else if (result.length() > 0)
- return new TokenizerMatch(result.toString(), terminator);
- else {
- currentIndex += terminator.startSequence.length();
- return new TokenizerMatch(terminator.startSequence,
- terminator);
- }
-
- if (accumulateCurrentChar) {
- result.append(source.charAt(currentIndex));
- currentIndex++;
- }
- }
-
- }
-
- public boolean probeNextToken(final String token) {
- if (token.equals(getNextToken().token))
- return true;
-
- unreadToken();
- return false;
- }
-
- public boolean sequenceMatches(final String sequence) {
- if ((currentIndex + sequence.length()) > source.length())
- return false;
-
- for (int i = 0; i < sequence.length(); i++)
- if (sequence.charAt(i) != source.charAt(i + currentIndex))
- return false;
-
- return true;
- }
-
- public void skipUntilDataEnd() {
- tokenIndexes.push(currentIndex);
- currentIndex = source.length();
- }
-
- public void skipUntilSequence(final String sequence) {
- while (currentIndex < source.length()) {
- if (sequenceMatches(sequence)) {
- currentIndex += sequence.length();
- return;
- }
-
- currentIndex++;
- }
- }
-
- public void unreadToken() {
- currentIndex = tokenIndexes.pop();
- }
+ /**
+ * Stack of token indexes. This allows to walk back in history and un-consume the token.
+ * {@link #getNextToken()} pushes the current read position before every read and
+ * {@link #unreadToken()} pops it again.
+ */
+ private final Stack<Integer> tokenIndexes = new Stack<>();
+
+ /**
+ * Terminators that will be searched for by given tokenizer within given source string.
+ * They are tried in insertion order and the first active one that matches is used.
+ */
+ private final List<Terminator> terminators = new ArrayList<>();
+
+ // Stays null when the no-argument constructor is used, until setSource() assigns it.
+ private String source; // string to be tokenized
+
+ // Index within source of the next character to be read.
+ private int currentIndex = 0;
+
+ /**
+ * Creates a tokenizer that reads the given string from its first character.
+ *
+ * @param source string to be tokenized
+ */
+ public Tokenizer(final String source) {
+ this.source = source;
+ }
+
+ /**
+ * Creates a tokenizer without a source string. The source stays <code>null</code>
+ * until {@link #setSource(String)} is called, so it must be supplied before tokens are read.
+ */
+ public Tokenizer() {
+ }
+
+ /**
+  * Replaces the string being tokenized and rewinds the tokenizer: the read position
+  * returns to the start and the un-read history is discarded. Registered terminators are kept.
+  *
+  * @param source new string to be tokenized
+  * @return this tokenizer, to allow call chaining
+  */
+ public Tokenizer setSource(String source) {
+     tokenIndexes.clear();
+     currentIndex = 0;
+     this.source = source;
+     return this;
+ }
+
+ /**
+  * Registers a new terminator built from a regular expression, without a group.
+  *
+  * @param terminationStrategy strategy handed to the new terminator
+  * @param regexp              regular expression handed to the new terminator
+  * @return the newly created and registered terminator
+  */
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+     return addTerminator(new Terminator(terminationStrategy, regexp, null));
+ }
+
+ /**
+  * Registers a new terminator built from a regular expression and a group.
+  *
+  * @param terminationStrategy strategy handed to the new terminator
+  * @param regexp              regular expression handed to the new terminator
+  * @param group               group handed to the new terminator
+  * @return the newly created and registered terminator
+  */
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+                                 String regexp, String group) {
+     return addTerminator(new Terminator(terminationStrategy, regexp, group));
+ }
+
+
+ /**
+ * Registers an already constructed terminator. It is appended after the terminators
+ * registered earlier.
+ *
+ * @param terminator terminator to register
+ * @return the same terminator instance that was passed in
+ */
+ public Terminator addTerminator(Terminator terminator) {
+ terminators.add(terminator);
+ return terminator;
+ }
+
+ /**
+  * Consumes the next token and verifies that its text equals the expected value.
+  * The token stays consumed even when the check fails.
+  *
+  * @param value expected token text
+  * @throws InvalidSyntaxException if the end of input is reached or the next token
+  *                                differs from <code>value</code>
+  */
+ public void expectAndConsumeNextStringToken(final String value)
+         throws InvalidSyntaxException {
+     final TokenizerMatch match = getNextToken();
+
+     // getNextToken() returns null at end of input; report it as a syntax error
+     // instead of failing with a NullPointerException on match.token.
+     if (match == null)
+         throw new InvalidSyntaxException("Expected \"" + value
+                 + "\" but reached end of input instead.");
+
+     if (!value.equals(match.token))
+         throw new InvalidSyntaxException("Expected \"" + value
+                 + "\" but got \"" + match.token + "\" instead.");
+ }
+
+ /**
+  * Consumes the next token and verifies that it was produced by the given terminator.
+  * Terminators are compared by identity, so the very instance registered with this
+  * tokenizer must be passed. The token stays consumed even when the check fails.
+  *
+  * @param terminator terminator that is expected to have produced the next token
+  * @return the consumed match
+  * @throws InvalidSyntaxException if the end of input is reached or the next token
+  *                                came from a different terminator (or from none)
+  */
+ public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+         throws InvalidSyntaxException {
+     final TokenizerMatch match = getNextToken();
+
+     // getNextToken() returns null at end of input; report it as a syntax error
+     // instead of failing with a NullPointerException on match.terminator.
+     if (match == null)
+         throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                 + "\" but reached end of input instead.");
+
+     if (match.terminator != terminator)
+         throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                 + "\" but got \"" + match.terminator + "\" instead.");
+
+     return match;
+ }
+
+
+ /**
+  * Reads the next token. The read position before the call is pushed to the history
+  * stack, so {@link #unreadToken()} can undo this read.
+  * <p>
+  * Characters are collected until a terminator matches. Collected text is returned as
+  * a match without terminator and matcher. A terminator with the PRESERVE strategy is
+  * returned as a token of its own; when text precedes it, the text is returned first and
+  * the terminator is left in place for the next call. Terminators with any other
+  * strategy are consumed without being returned.
+  *
+  * @return next {@link TokenizerMatch} or <code>null</code> if end of input is reached.
+  */
+ public TokenizerMatch getNextToken() {
+     tokenIndexes.push(currentIndex);
+
+     final StringBuilder pendingText = new StringBuilder();
+
+     while (currentIndex < source.length()) {
+         final TokenizerMatch terminatorMatch = findTerminatorMatch();
+
+         if (terminatorMatch == null) {
+             // ordinary character: collect it and move on
+             pendingText.append(source.charAt(currentIndex));
+             currentIndex++;
+             continue;
+         }
+
+         final boolean preserved = terminatorMatch.terminator.termination == PRESERVE;
+         final boolean hasPendingText = pendingText.length() > 0;
+
+         // A preserved terminator that follows collected text stays unconsumed,
+         // so that the next call returns it. In every other case step over it.
+         if (!preserved || !hasPendingText)
+             currentIndex = terminatorMatch.matcher.end();
+
+         if (hasPendingText)
+             return new TokenizerMatch(pendingText.toString(), null, null);
+
+         if (preserved)
+             return terminatorMatch;
+     }
+
+     // reached end of input
+     if (pendingText.length() > 0)
+         return new TokenizerMatch(pendingText.toString(), null, null);
+
+     return null;
+ }
+
+ /**
+  * Tries the registered terminators, in registration order, against the source at the
+  * current read position. Inactive terminators are skipped. The read position is not moved.
+  * NOTE(review): how the match is tied to the current position is decided by
+  * Terminator.match, which is not visible here.
+  *
+  * @return match of the first active terminator whose matcher finds something,
+  *         or <code>null</code> when none does
+  */
+ public TokenizerMatch findTerminatorMatch(){
+     for (final Terminator candidate : terminators) {
+         if (!candidate.active)
+             continue;
+
+         final Matcher matcher = candidate.match(source, currentIndex);
+         if (!matcher.find())
+             continue;
+
+         final String matchedText = source.substring(matcher.start(), matcher.end());
+         return new TokenizerMatch(matchedText, candidate, matcher);
+     }
+
+     return null;
+ }
+
+ /**
+  * @param tokenAccumulator buffer holding the characters collected so far
+  * @return <code>true</code> if at least one character has been collected
+  */
+ private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
+     final int collectedLength = tokenAccumulator.length();
+     return collectedLength != 0;
+ }
+
+ /**
+  * Tells whether unread characters remain. Remaining characters do not guarantee a
+  * further token, because {@link #getNextToken()} consumes non-preserved terminators
+  * without returning them.
+  *
+  * @return <code>true</code> if a source is set and the read position is before its end
+  */
+ public boolean hasMoreContent() {
+     return source != null && currentIndex < source.length();
+ }
+
+ /**
+  * Consumes the next token only if its text equals the given value. Otherwise the
+  * read position is restored, so the token can be read again.
+  *
+  * @param token expected token text
+  * @return <code>true</code> if the next token matched and was consumed,
+  *         <code>false</code> if it differs or the end of input is reached
+  * @throws InvalidSyntaxException never thrown by this implementation; kept for
+  *                                compatibility with existing callers
+  */
+ public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
+     final TokenizerMatch next = getNextToken();
+
+     // getNextToken() returns null at end of input: that is simply "no match",
+     // not a reason to fail with a NullPointerException.
+     if (next != null && token.equals(next.token))
+         return true;
+
+     unreadToken();
+     return false;
+ }
+
+ /**
+ * Reads the next token and immediately restores the read position, so the same
+ * token is returned again by the following read.
+ *
+ * @return the upcoming token, or <code>null</code> if end of input is reached
+ */
+ public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
+ TokenizerMatch result = getNextToken();
+ // undo the read; this pops the position that getNextToken() pushed
+ unreadToken();
+ return result;
+ }
+
+ /**
+  * Checks, without consuming anything, whether the text of the upcoming token equals
+  * one of the given values.
+  *
+  * @param possibilities candidate token texts
+  * @return <code>true</code> if the upcoming token equals one of the candidates,
+  *         <code>false</code> if it equals none of them or the end of input is reached
+  */
+ public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
+     final TokenizerMatch upcoming = peekNextToken();
+
+     // peekNextToken() returns null at end of input; nothing can match then.
+     if (upcoming == null)
+         return false;
+
+     final String nextToken = upcoming.token;
+     return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
+ }
+
+ /**
+  * Verifies, without consuming anything, that the text of the upcoming token equals
+  * none of the given values. End of input passes the check.
+  *
+  * @param possibilities token texts that are not allowed at this point
+  * @throws InvalidSyntaxException if the upcoming token equals one of the given values
+  */
+ public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
+     // Peek once and reuse the result for both the check and the error message.
+     final TokenizerMatch upcoming = peekNextToken();
+
+     // peekNextToken() returns null at end of input; there is no token to reject.
+     if (upcoming == null)
+         return;
+
+     final String nextToken = upcoming.token;
+     if (Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken)))
+         throw new InvalidSyntaxException("Not expected \"" + nextToken + "\" here.");
+ }
+
+ /**
+ * Un-consumes the most recently read token by restoring the read position that was
+ * saved before it was read. Can be called repeatedly to walk further back.
+ * Throws java.util.EmptyStackException when no saved position is left.
+ */
+ public void unreadToken() {
+ currentIndex = tokenIndexes.pop();
+ }
+
+ /**
+  * For debugging: prints every remaining token to standard output, one per line,
+  * and then restores the read position to where it was before the call.
+  */
+ public void enlistRemainingTokens(){
+     int readTokenCount = 0;
+
+     while (hasMoreContent()) {
+         final TokenizerMatch match = getNextToken();
+         // Count the read before inspecting the result: getNextToken() has saved
+         // a position even when it returns null.
+         readTokenCount++;
+
+         // Remaining input may consist only of dropped terminators; then
+         // getNextToken() returns null although hasMoreContent() was true.
+         if (match == null)
+             break;
+
+         out.println(match.toString());
+     }
+
+     // restore pointer to original location
+     for (int i = 0; i < readTokenCount; i++)
+         unreadToken();
+ }
+
+
+ /**
+ * Moves the read position to the end of the source, discarding all remaining input.
+ * The previous position is saved first, so {@link #unreadToken()} can undo the skip.
+ */
+ public void skipUntilDataEnd() {
+ tokenIndexes.push(currentIndex);
+ currentIndex = source.length();
+ }
}