/*
 * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
 * This project is released under Creative Commons Zero (CC0) license.
 */
5 package eu.svjatoslav.commons.string.tokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.stream.Stream;

import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
import static java.lang.System.out;
16 public class Tokenizer {
19 * Stack of token indexes. This allows to walk back in history and un-consume the token.
21 private final Stack<Integer> tokenIndexes = new Stack<>();
24 * Terminators that will be searched for by given tokenizer within given source string.
26 private final List<Terminator> terminators = new ArrayList<>();
28 private String source; // string to be tokenized
30 private int currentIndex = 0;
32 private int cachedTerminatorIndex = -1;
33 private Terminator cachedTerminator;
/**
 * Creates a tokenizer over the given source string.
 *
 * @param source string to be tokenized
 */
public Tokenizer(final String source) {
    // Assign directly rather than via setSource(): all other state is still at its field defaults here.
    this.source = source;
}
42 public Tokenizer setSource(String source) {
47 cachedTerminatorIndex = -1;
48 cachedTerminator = null;
52 public Tokenizer addTerminator(final String startSequence,
53 final Terminator.TerminationStrategy terminationStrategy) {
54 terminators.add(new Terminator(startSequence, terminationStrategy));
58 public Tokenizer addTerminator(Terminator terminator) {
59 terminators.add(terminator);
63 public Tokenizer addTerminator(final String startSequence,
64 final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
65 terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
69 public void expectAndConsumeNextToken(final String value)
70 throws InvalidSyntaxException {
71 final TokenizerMatch match = getNextToken();
72 if (!value.equals(match.token))
73 throw new InvalidSyntaxException("Expected \"" + value
74 + "\" but got \"" + match.token + "\" instead.");
78 * @return next @TokenizerMatch or <code>null</code> if end of input is reached.
79 * @throws InvalidSyntaxException
81 public TokenizerMatch getNextToken() throws InvalidSyntaxException {
82 tokenIndexes.push(currentIndex);
84 StringBuilder tokenAccumulator = new StringBuilder();
88 if (currentIndex >= source.length()) { // reached end of input
89 if (hasAccumulatedToken(tokenAccumulator))
90 return new TokenizerMatch(tokenAccumulator.toString(), null, null);
95 if (isOngoingToken()) {
96 tokenAccumulator.append(source.charAt(currentIndex));
101 Terminator terminator = getOrFindTokenTerminator();
103 if (terminator.termination == PRESERVE)
104 return buildPreservedToken(tokenAccumulator, terminator);
105 else if (terminator.termination == DROP) {
106 skipUntilTerminatorEnd(terminator);
108 if (hasAccumulatedToken(tokenAccumulator))
109 return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
115 private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
116 if (terminator.hasEndSequence())
117 currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
119 currentIndex += terminator.startSequence.length();
123 * @throws InvalidSyntaxException if end sequence is not found as is expected by given token.
125 private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator)
126 throws InvalidSyntaxException {
127 if (hasAccumulatedToken(token))
128 return new TokenizerMatch(token.toString(), null, terminator);
130 if (terminator.hasEndSequence())
131 return buildTokenWithExpectedENdSequence(terminator);
133 return buildTokenWithoutEndSequence(terminator);
136 private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) {
137 currentIndex += terminator.startSequence.length();
138 return new TokenizerMatch(terminator.startSequence, null, terminator);
141 private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException {
142 int endSequenceIndex = getEndSequenceIndex(terminator);
143 String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
144 currentIndex = endSequenceIndex + terminator.endSequence.length();
146 return new TokenizerMatch(terminator.startSequence, reminder, terminator);
150 * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence.
152 private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
153 int endSequenceIndex = source.indexOf(terminator.endSequence,
154 currentIndex + terminator.startSequence.length());
156 if (endSequenceIndex < 0)
157 throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
159 return endSequenceIndex;
162 private boolean hasAccumulatedToken(StringBuilder token) {
163 return token.length() > 0;
166 private boolean isOngoingToken() {
167 return getOrFindTokenTerminator() == null;
170 public boolean hasMoreContent() {
171 return currentIndex < source.length();
175 * Attempts to cache terminator search result.
177 public Terminator getOrFindTokenTerminator() {
178 if (currentIndex == cachedTerminatorIndex)
179 return cachedTerminator;
181 cachedTerminatorIndex = currentIndex;
182 cachedTerminator = findTokenTerminator();
183 return cachedTerminator;
186 private Terminator findTokenTerminator() {
187 for (Terminator terminator : terminators)
188 if (terminator.matches(source, currentIndex))
193 public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
194 if (token.equals(getNextToken().token))
201 public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
202 TokenizerMatch result = getNextToken();
207 public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
208 String nextToken = peekNextToken().token;
209 return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
212 public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
213 if (peekIsOneOf(possibilities))
214 throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
217 public void unreadToken() {
218 currentIndex = tokenIndexes.pop();
224 public void enlistRemainingTokens(){
225 int redTokenCount = 0;
228 while (hasMoreContent()) {
229 out.println(getNextToken().toString());
232 } catch (InvalidSyntaxException e){
233 out.println("There is syntax exception");
236 // restore pointer to original location
237 for (int i = 0; i< redTokenCount; i++ ) unreadToken();
241 public void skipUntilDataEnd() {
242 tokenIndexes.push(currentIndex);
243 currentIndex = source.length();