import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
+import java.util.regex.Matcher;
import java.util.stream.Stream;
-import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static java.lang.System.out;
public class Tokenizer {
+ /**
+ * Stack of token indexes. It allows walking back in history and un-consuming tokens.
+ */
private final Stack<Integer> tokenIndexes = new Stack<>();
+
+ /**
+ * Terminators that this tokenizer searches for within the source string.
+ */
private final List<Terminator> terminators = new ArrayList<>();
- private String source;
- private int currentIndex = 0;
- private int cachedTerminatorIndex = -1;
- private Terminator cachedTerminator;
+ private String source; // string to be tokenized
+
+ private int currentIndex = 0;
public Tokenizer(final String source) {
this.source = source;
this.source = source;
currentIndex = 0;
tokenIndexes.clear();
-
- cachedTerminatorIndex = -1;
- cachedTerminator = null;
return this;
}
- public Tokenizer addTerminator(final String startSequence,
- final Terminator.TerminationStrategy terminationStrategy) {
- terminators.add(new Terminator(startSequence, terminationStrategy));
- return this;
+ /**
+ * Creates a terminator from the given termination strategy and regular expression,
+ * passing <code>null</code> as the third constructor argument, and adds it to the
+ * list of terminators of this tokenizer.
+ *
+ * @param terminationStrategy termination strategy handed to the new terminator
+ * @param regexp regular expression handed to the new terminator
+ * @return the newly created terminator that was added to this tokenizer
+ */
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+ Terminator terminator = new Terminator(terminationStrategy, regexp,null);
+ terminators.add(terminator);
+ return terminator;
}
- public Tokenizer addTerminator(final String startSequence,
- final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
- terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
- return this;
+ /**
+ * Creates a terminator from the given termination strategy, regular expression and
+ * group, and adds it to the list of terminators of this tokenizer.
+ *
+ * @param terminationStrategy termination strategy handed to the new terminator
+ * @param regexp regular expression handed to the new terminator
+ * @param group third constructor argument of the new terminator
+ * (NOTE(review): presumably a group name for the terminator; confirm against Terminator)
+ * @return the newly created terminator that was added to this tokenizer
+ */
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+ String regexp, String group) {
+ Terminator terminator = new Terminator(terminationStrategy, regexp,group);
+ terminators.add(terminator);
+ return terminator;
}
- public void expectAndConsumeNextToken(final String value)
+
+ /**
+ * Adds an already constructed terminator to the list of terminators of this tokenizer.
+ *
+ * @param terminator terminator to add
+ * @return the same terminator instance that was passed in
+ */
+ public Terminator addTerminator(Terminator terminator) {
+ terminators.add(terminator);
+ return terminator;
+ }
+
+ public void expectAndConsumeNextStringToken(final String value)
throws InvalidSyntaxException {
final TokenizerMatch match = getNextToken();
if (!value.equals(match.token))
+ "\" but got \"" + match.token + "\" instead.");
}
+    /**
+     * Consumes the next token and verifies that it was produced by the given terminator.
+     *
+     * @param terminator terminator that the next token must originate from (compared by identity)
+     * @return the consumed match
+     * @throws InvalidSyntaxException if the end of input is reached or the next token
+     *                                was produced by a different terminator
+     */
+    public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+            throws InvalidSyntaxException {
+        final TokenizerMatch match = getNextToken();
+
+        // getNextToken() returns null at end of input; report that as a syntax error
+        // instead of failing with a NullPointerException on match.terminator below.
+        if (match == null)
+            throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                    + "\" but reached end of input instead.");
+
+        if (match.terminator != terminator)
+            throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                    + "\" but got \"" + match.terminator + "\" instead.");
+
+        return match;
+    }
+
- public TokenizerMatch getNextToken() throws InvalidSyntaxException {
+ /**
+ * @return next {@link TokenizerMatch} or <code>null</code> if end of input is reached.
+ */
+ public TokenizerMatch getNextToken() {
tokenIndexes.push(currentIndex);
StringBuilder tokenAccumulator = new StringBuilder();
return null;
}
- if (isOngoingToken()) {
+ TokenizerMatch matchResult = findTerminatorMatch();
+ if (matchResult == null) {
tokenAccumulator.append(source.charAt(currentIndex));
currentIndex++;
continue;
}
- Terminator terminator = getOrFindTokenTerminator();
+ if (matchResult.terminator.termination == PRESERVE) {
+ if (hasAccumulatedToken(tokenAccumulator))
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null);
- if (terminator.termination == PRESERVE)
- return buildPreservedToken(tokenAccumulator, terminator);
- else if (terminator.termination == DROP) {
- skipUntilTerminatorEnd(terminator);
+ currentIndex = matchResult.matcher.end();
+ return matchResult;
+ } else {
+ currentIndex = matchResult.matcher.end();
if (hasAccumulatedToken(tokenAccumulator))
- return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null);
}
}
-
- }
-
- private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
- if (terminator.hasEndSequence())
- currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
- else
- currentIndex += terminator.startSequence.length();
- }
-
- private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException {
- if (hasAccumulatedToken(token))
- return new TokenizerMatch(token.toString(), null, terminator);
-
- if (terminator.hasEndSequence())
- return buildComplexPreservedToken(terminator);
- else
- return buildSimplePreservedToken(terminator);
- }
-
- private TokenizerMatch buildSimplePreservedToken(Terminator terminator) {
- currentIndex += terminator.startSequence.length();
- return new TokenizerMatch(terminator.startSequence, null, terminator);
- }
-
- private TokenizerMatch buildComplexPreservedToken(Terminator terminator) throws InvalidSyntaxException {
- int endSequenceIndex = getEndSequenceIndex(terminator);
- String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
- currentIndex = endSequenceIndex + terminator.endSequence.length();
-
- return new TokenizerMatch(terminator.startSequence, reminder, terminator);
}
- private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
- int endSequenceIndex = source.indexOf(terminator.endSequence,
- currentIndex + terminator.startSequence.length());
-
- if (endSequenceIndex < 0)
- throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
-
- return endSequenceIndex;
- }
-
- private boolean hasAccumulatedToken(StringBuilder token) {
- return token.length() > 0;
+ /**
+ * Asks every active terminator, in the order the terminators were added, for a matcher
+ * over the source at the current index and returns the first successful match.
+ * Terminators whose <code>active</code> flag is not set are skipped.
+ *
+ * NOTE(review): find() is used here, so this relies on Terminator.match() restricting
+ * the matcher so that it can only match starting at the current index; confirm.
+ *
+ * @return match holding the matched text, the terminator and its matcher,
+ * or <code>null</code> if no active terminator matched
+ */
+ public TokenizerMatch findTerminatorMatch(){
+ for (Terminator terminator : terminators)
+ if (terminator.active) {
+ Matcher match = terminator.match(source, currentIndex);
+ if (match.find()) {
+ // matched text is the part of the source between the match boundaries
+ String token = source.substring(match.start(), match.end());
+ return new TokenizerMatch(token, terminator, match);
+ }
+ }
+ return null;
}
- private boolean isOngoingToken() {
- return getOrFindTokenTerminator() == null;
+ /**
+ * @param tokenAccumulator buffer holding the characters collected so far
+ * @return <code>true</code> if the buffer contains at least one character
+ */
+ private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
+ return tokenAccumulator.length() > 0;
}
- public boolean hasMoreTokens() {
+ /**
+ * @return <code>true</code> if a source string is set and the current index
+ * has not yet reached its end, <code>false</code> otherwise
+ */
+ public boolean hasMoreContent() {
+ if (source == null) return false;
return currentIndex < source.length();
}
- /**
- * Attempts to cache terminator search result.
- */
- public Terminator getOrFindTokenTerminator() {
- if (currentIndex == cachedTerminatorIndex)
- return cachedTerminator;
-
- cachedTerminatorIndex = currentIndex;
- cachedTerminator = findTokenTerminator();
- return cachedTerminator;
- }
-
- private Terminator findTokenTerminator() {
- for (Terminator terminator : terminators)
- if (terminator.matches(source, currentIndex))
- return terminator;
- return null;
- }
-
public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
if (token.equals(getNextToken().token))
return true;
currentIndex = tokenIndexes.pop();
}
+    /**
+     * Prints every remaining token to standard output, one per line, and afterwards
+     * un-reads all of them again so that the read pointer is restored.
+     * Intended for debugging only.
+     */
+    public void enlistRemainingTokens(){
+        int readTokenCount = 0;
+
+        while (hasMoreContent()) {
+            final TokenizerMatch match = getNextToken();
+            // every getNextToken() call records an index, so it is counted even when
+            // no token comes back
+            readTokenCount++;
+
+            // getNextToken() returns null when end of input is reached without
+            // producing a token; stop instead of failing with a NullPointerException
+            if (match == null) break;
+
+            out.println(match.toString());
+        }
+
+        // restore pointer to original location
+        for (int i = 0; i < readTokenCount; i++) unreadToken();
+    }
+
+
public void skipUntilDataEnd() {
tokenIndexes.push(currentIndex);
currentIndex = source.length();