2 * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
3 * This project is released under Creative Commons Zero (CC0) license.
5 package eu.svjatoslav.commons.string.tokenizer;
7 import java.util.ArrayList;
9 import java.util.Stack;
10 import java.util.stream.Stream;
12 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
13 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
15 public class Tokenizer {
// History of read positions: getNextToken() and skipUntilDataEnd() push the index they start
// from, and unreadToken() pops the latest one to rewind.
17 private final Stack<Integer> tokenIndexes = new Stack<>();
// Terminators registered through addTerminator(); findTokenTerminator() iterates them in list order.
18 private final List<Terminator> terminators = new ArrayList<>();
// The string being tokenized.
19 private String source;
// Index into source of the next character to be examined.
20 private int currentIndex = 0;
// Index for which cachedTerminator was computed; setSource() resets it to -1.
22 private int cachedTerminatorIndex = -1;
// Result of the last terminator lookup (null when no terminator matched at cachedTerminatorIndex).
23 private Terminator cachedTerminator;
/**
 * Creates a tokenizer for the given string.
 * NOTE(review): presumably stores the argument in the {@code source} field - confirm against
 * the constructor body.
 *
 * @param source the string to tokenize
 */
25 public Tokenizer(final String source) {
/**
 * Sets the string to be tokenized and invalidates the terminator lookup cache.
 * NOTE(review): the return type suggests a fluent API that returns this instance - confirm.
 *
 * @param source the new string to tokenize
 */
32 public Tokenizer setSource(String source) {
// Drop the cached lookup so that a result computed for the previous source is never
// returned by getOrFindTokenTerminator() for the same index of the new source.
37 cachedTerminatorIndex = -1;
38 cachedTerminator = null;
/**
 * Registers a terminator that is recognised by its start sequence alone.
 * <p>
 * getNextToken() reports a PRESERVE terminator as a token of its own and silently skips a
 * DROP terminator.
 * NOTE(review): the return type suggests this instance is returned for chaining - confirm.
 *
 * @param startSequence       text that marks the terminator
 * @param terminationStrategy whether the terminator is preserved as a token or dropped
 */
42 public Tokenizer addTerminator(final String startSequence,
43 final Terminator.TerminationStrategy terminationStrategy) {
44 terminators.add(new Terminator(startSequence, terminationStrategy));
/**
 * Registers a terminator delimited by a start sequence and an end sequence.
 * <p>
 * When such a terminator is found, everything up to and including the end sequence is
 * consumed together with it; a missing end sequence is reported as InvalidSyntaxException
 * while reading tokens.
 * NOTE(review): the return type suggests this instance is returned for chaining - confirm.
 *
 * @param startSequence       text that opens the terminator
 * @param endSequence         text that closes the terminator
 * @param terminationStrategy whether the terminator is preserved as a token or dropped
 */
48 public Tokenizer addTerminator(final String startSequence,
49 final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
50 terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
54 public void expectAndConsumeNextToken(final String value)
55 throws InvalidSyntaxException {
56 final TokenizerMatch match = getNextToken();
57 if (!value.equals(match.token))
58 throw new InvalidSyntaxException("Expected \"" + value
59 + "\" but got \"" + match.token + "\" instead.");
/**
 * Reads the next token starting at the current position.
 * <p>
 * Characters are collected for as long as no registered terminator matches at the current
 * index. A PRESERVE terminator is handed to buildPreservedToken(); a DROP terminator is
 * skipped and only ends the token collected so far.
 *
 * @return the match describing the token and the terminator that ended it
 * @throws InvalidSyntaxException if a terminator's end sequence cannot be found
 */
63 public TokenizerMatch getNextToken() throws InvalidSyntaxException {
// Remember where this read started so that unreadToken() can rewind to it.
64 tokenIndexes.push(currentIndex);
66 StringBuilder tokenAccumulator = new StringBuilder();
70 if (currentIndex >= source.length()) { // reached end of input
// Input ended while a token was being collected: return it with no terminator.
71 if (hasAccumulatedToken(tokenAccumulator))
72 return new TokenizerMatch(tokenAccumulator.toString(), null, null);
// No terminator starts here, so the character belongs to the token being collected.
77 if (isOngoingToken()) {
78 tokenAccumulator.append(source.charAt(currentIndex));
// The lookup is cached per index, so this repeats no search when isOngoingToken()
// has just run at the same index.
83 Terminator terminator = getOrFindTokenTerminator();
// PRESERVE: the terminator is reported to the caller as a token.
85 if (terminator.termination == PRESERVE)
86 return buildPreservedToken(tokenAccumulator, terminator);
// DROP: the terminator is consumed without being reported.
87 else if (terminator.termination == DROP) {
88 skipUntilTerminatorEnd(terminator);
// Text collected before the dropped terminator is returned as the token.
90 if (hasAccumulatedToken(tokenAccumulator))
91 return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
/**
 * Advances currentIndex past the given terminator.
 *
 * @param terminator the terminator that starts at the current index
 * @throws InvalidSyntaxException if the terminator has an end sequence that is not found
 */
97 private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
// Terminator with an end sequence: continue right after that end sequence.
98 if (terminator.hasEndSequence())
99 currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
// NOTE(review): as written here this increment is not guarded by an else; it is presumably
// meant only for terminators without an end sequence - confirm.
101 currentIndex += terminator.startSequence.length();
104 private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException {
105 if (hasAccumulatedToken(token))
106 return new TokenizerMatch(token.toString(), null, terminator);
108 if (terminator.hasEndSequence())
109 return buildComplexPreservedToken(terminator);
111 return buildSimplePreservedToken(terminator);
114 private TokenizerMatch buildSimplePreservedToken(Terminator terminator) {
115 currentIndex += terminator.startSequence.length();
116 return new TokenizerMatch(terminator.startSequence, null, terminator);
119 private TokenizerMatch buildComplexPreservedToken(Terminator terminator) throws InvalidSyntaxException {
120 int endSequenceIndex = getEndSequenceIndex(terminator);
121 String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
122 currentIndex = endSequenceIndex + terminator.endSequence.length();
124 return new TokenizerMatch(terminator.startSequence, reminder, terminator);
127 private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
128 int endSequenceIndex = source.indexOf(terminator.endSequence,
129 currentIndex + terminator.startSequence.length());
131 if (endSequenceIndex < 0)
132 throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
134 return endSequenceIndex;
137 private boolean hasAccumulatedToken(StringBuilder token) {
138 return token.length() > 0;
141 private boolean isOngoingToken() {
142 return getOrFindTokenTerminator() == null;
145 public boolean hasMoreTokens() {
146 return currentIndex < source.length();
150 * Attempts to cache terminator search result.
152 public Terminator getOrFindTokenTerminator() {
153 if (currentIndex == cachedTerminatorIndex)
154 return cachedTerminator;
156 cachedTerminatorIndex = currentIndex;
157 cachedTerminator = findTokenTerminator();
158 return cachedTerminator;
/**
 * Checks the registered terminators, in registration order, against the source at the
 * current index.
 * NOTE(review): presumably returns the first terminator that matches and null when none
 * does (isOngoingToken() treats null as "no terminator here") - confirm.
 */
161 private Terminator findTokenTerminator() {
162 for (Terminator terminator : terminators)
163 if (terminator.matches(source, currentIndex))
/**
 * Reads the next token and compares it with the given text.
 * NOTE(review): judging by the name, a matching token stays consumed and a non-matching one
 * is presumably put back with unreadToken() - confirm.
 *
 * @param token the token text to look for
 * @throws InvalidSyntaxException if reading the next token fails
 */
168 public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
169 if (token.equals(getNextToken().token))
/**
 * Obtains the next token by reading it with getNextToken().
 * NOTE(review): judging by the name, the read is presumably undone with unreadToken() before
 * the result is returned, leaving the position unchanged - confirm.
 *
 * @throws InvalidSyntaxException if reading the next token fails
 */
176 public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
177 TokenizerMatch result = getNextToken();
182 public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
183 String nextToken = peekNextToken().token;
184 return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
187 public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
188 if (peekIsOneOf(possibilities))
189 throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
192 public void unreadToken() {
193 currentIndex = tokenIndexes.pop();
/**
 * Moves the current position to the end of the source.
 */
196 public void skipUntilDataEnd() {
// Save the position first so that unreadToken() can undo the skip.
197 tokenIndexes.push(currentIndex);
198 currentIndex = source.length();