2 * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
3 * This project is released under Creative Commons Zero (CC0) license.
5 package eu.svjatoslav.commons.string.tokenizer;
7 import java.util.ArrayList;
9 import java.util.Stack;
10 import java.util.stream.Stream;
12 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
13 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
15 public class Tokenizer {
17 private final Stack<Integer> tokenIndexes = new Stack<>();
18 private final List<Terminator> terminators = new ArrayList<>();
19 private String source;
20 private int currentIndex = 0;
22 private int cachedTerminatorIndex = -1;
23 private Terminator cachedTerminator;
/**
 * Creates a tokenizer that reads from the given text.
 *
 * @param source text to tokenize
 */
public Tokenizer(final String source) {
    this.source = source;
}
32 public Tokenizer setSource(String source) {
37 cachedTerminatorIndex = -1;
38 cachedTerminator = null;
42 public Tokenizer addTerminator(final String startSequence,
43 final Terminator.TerminationStrategy terminationStrategy) {
44 terminators.add(new Terminator(startSequence, terminationStrategy));
48 public Tokenizer addTerminator(final String startSequence,
49 final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
50 terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
54 public void expectAndConsumeNextToken(final String value)
55 throws InvalidSyntaxException {
56 final TokenizerMatch match = getNextToken();
57 if (!value.equals(match.token))
58 throw new InvalidSyntaxException("Expected \"" + value
59 + "\" but got \"" + match.token + "\" instead.");
65 * @return next @TokenizerMatch or <code>null</code> if end of input is reached.
66 * @throws InvalidSyntaxException
68 public TokenizerMatch getNextToken() throws InvalidSyntaxException {
69 tokenIndexes.push(currentIndex);
71 StringBuilder tokenAccumulator = new StringBuilder();
75 if (currentIndex >= source.length()) { // reached end of input
76 if (hasAccumulatedToken(tokenAccumulator))
77 return new TokenizerMatch(tokenAccumulator.toString(), null, null);
82 if (isOngoingToken()) {
83 tokenAccumulator.append(source.charAt(currentIndex));
88 Terminator terminator = getOrFindTokenTerminator();
90 if (terminator.termination == PRESERVE)
91 return buildPreservedToken(tokenAccumulator, terminator);
92 else if (terminator.termination == DROP) {
93 skipUntilTerminatorEnd(terminator);
95 if (hasAccumulatedToken(tokenAccumulator))
96 return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
102 private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
103 if (terminator.hasEndSequence())
104 currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
106 currentIndex += terminator.startSequence.length();
110 * @throws InvalidSyntaxException if end sequence is not found as is expected by given token.
112 private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator)
113 throws InvalidSyntaxException {
114 if (hasAccumulatedToken(token))
115 return new TokenizerMatch(token.toString(), null, terminator);
117 if (terminator.hasEndSequence())
118 return buildTokenWithExpectedENdSequence(terminator);
120 return buildTokenWithoutEndSequence(terminator);
123 private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) {
124 currentIndex += terminator.startSequence.length();
125 return new TokenizerMatch(terminator.startSequence, null, terminator);
128 private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException {
129 int endSequenceIndex = getEndSequenceIndex(terminator);
130 String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
131 currentIndex = endSequenceIndex + terminator.endSequence.length();
133 return new TokenizerMatch(terminator.startSequence, reminder, terminator);
137 * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence.
139 private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
140 int endSequenceIndex = source.indexOf(terminator.endSequence,
141 currentIndex + terminator.startSequence.length());
143 if (endSequenceIndex < 0)
144 throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
146 return endSequenceIndex;
149 private boolean hasAccumulatedToken(StringBuilder token) {
150 return token.length() > 0;
153 private boolean isOngoingToken() {
154 return getOrFindTokenTerminator() == null;
157 public boolean hasMoreTokens() {
158 return currentIndex < source.length();
162 * Attempts to cache terminator search result.
164 public Terminator getOrFindTokenTerminator() {
165 if (currentIndex == cachedTerminatorIndex)
166 return cachedTerminator;
168 cachedTerminatorIndex = currentIndex;
169 cachedTerminator = findTokenTerminator();
170 return cachedTerminator;
173 private Terminator findTokenTerminator() {
174 for (Terminator terminator : terminators)
175 if (terminator.matches(source, currentIndex))
180 public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
181 if (token.equals(getNextToken().token))
188 public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
189 TokenizerMatch result = getNextToken();
194 public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
195 String nextToken = peekNextToken().token;
196 return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
199 public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
200 if (peekIsOneOf(possibilities))
201 throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
204 public void unreadToken() {
205 currentIndex = tokenIndexes.pop();
208 public void skipUntilDataEnd() {
209 tokenIndexes.push(currentIndex);
210 currentIndex = source.length();