b716989269dba2d37c586d17cda890b7de39ea1b
[svjatoslav_commons.git] / src / main / java / eu / svjatoslav / commons / string / tokenizer / Tokenizer.java
1 /*
2  * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
3  * This project is released under Creative Commons Zero (CC0) license.
4  */
5 package eu.svjatoslav.commons.string.tokenizer;
6
7 import java.util.ArrayList;
8 import java.util.List;
9 import java.util.Stack;
10 import java.util.stream.Stream;
11
12 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
13 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
14
15 public class Tokenizer {
16
17     private final Stack<Integer> tokenIndexes = new Stack<>();
18     private final List<Terminator> terminators = new ArrayList<>();
19     private String source;
20     private int currentIndex = 0;
21
22     private int cachedTerminatorIndex = -1;
23     private Terminator cachedTerminator;
24
25     public Tokenizer(final String source) {
26         this.source = source;
27     }
28
29     public Tokenizer() {
30     }
31
32     public Tokenizer setSource(String source) {
33         this.source = source;
34         currentIndex = 0;
35         tokenIndexes.clear();
36
37         cachedTerminatorIndex = -1;
38         cachedTerminator = null;
39         return this;
40     }
41
42     public Tokenizer addTerminator(final String startSequence,
43                                    final Terminator.TerminationStrategy terminationStrategy) {
44         terminators.add(new Terminator(startSequence, terminationStrategy));
45         return this;
46     }
47
48     public Tokenizer addTerminator(final String startSequence,
49                                    final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
50         terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
51         return this;
52     }
53
54     public void expectAndConsumeNextToken(final String value)
55             throws InvalidSyntaxException {
56         final TokenizerMatch match = getNextToken();
57         if (!value.equals(match.token))
58             throw new InvalidSyntaxException("Expected \"" + value
59                     + "\" but got \"" + match.token + "\" instead.");
60     }
61
62
63     /**
64      *
65      * @return next @TokenizerMatch or <code>null</code> if end of input is reached.
66      * @throws InvalidSyntaxException
67      */
68     public TokenizerMatch getNextToken() throws InvalidSyntaxException {
69         tokenIndexes.push(currentIndex);
70
71         StringBuilder tokenAccumulator = new StringBuilder();
72
73         while (true) {
74
75             if (currentIndex >= source.length()) { // reached end of input
76                 if (hasAccumulatedToken(tokenAccumulator))
77                     return new TokenizerMatch(tokenAccumulator.toString(), null, null);
78                 else
79                     return null;
80             }
81
82             if (isOngoingToken()) {
83                 tokenAccumulator.append(source.charAt(currentIndex));
84                 currentIndex++;
85                 continue;
86             }
87
88             Terminator terminator = getOrFindTokenTerminator();
89
90             if (terminator.termination == PRESERVE)
91                 return buildPreservedToken(tokenAccumulator, terminator);
92             else if (terminator.termination == DROP) {
93                 skipUntilTerminatorEnd(terminator);
94
95                 if (hasAccumulatedToken(tokenAccumulator))
96                     return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
97             }
98         }
99
100     }
101
102     private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
103         if (terminator.hasEndSequence())
104             currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
105         else
106             currentIndex += terminator.startSequence.length();
107     }
108
109     /**
110      * @throws InvalidSyntaxException if end sequence is not found as is expected by given token.
111      */
112     private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator)
113             throws InvalidSyntaxException {
114         if (hasAccumulatedToken(token))
115             return new TokenizerMatch(token.toString(), null, terminator);
116
117         if (terminator.hasEndSequence())
118             return buildTokenWithExpectedENdSequence(terminator);
119         else
120             return buildTokenWithoutEndSequence(terminator);
121     }
122
123     private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) {
124         currentIndex += terminator.startSequence.length();
125         return new TokenizerMatch(terminator.startSequence, null, terminator);
126     }
127
128     private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException {
129         int endSequenceIndex = getEndSequenceIndex(terminator);
130         String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
131         currentIndex = endSequenceIndex + terminator.endSequence.length();
132
133         return new TokenizerMatch(terminator.startSequence, reminder, terminator);
134     }
135
136     /**
137      * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence.
138      */
139     private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
140         int endSequenceIndex = source.indexOf(terminator.endSequence,
141                 currentIndex + terminator.startSequence.length());
142
143         if (endSequenceIndex < 0)
144             throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
145
146         return endSequenceIndex;
147     }
148
149     private boolean hasAccumulatedToken(StringBuilder token) {
150         return token.length() > 0;
151     }
152
153     private boolean isOngoingToken() {
154         return getOrFindTokenTerminator() == null;
155     }
156
157     public boolean hasMoreTokens() {
158         return currentIndex < source.length();
159     }
160
161     /**
162      * Attempts to cache terminator search result.
163      */
164     public Terminator getOrFindTokenTerminator() {
165         if (currentIndex == cachedTerminatorIndex)
166             return cachedTerminator;
167
168         cachedTerminatorIndex = currentIndex;
169         cachedTerminator = findTokenTerminator();
170         return cachedTerminator;
171     }
172
173     private Terminator findTokenTerminator() {
174         for (Terminator terminator : terminators)
175             if (terminator.matches(source, currentIndex))
176                 return terminator;
177         return null;
178     }
179
180     public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
181         if (token.equals(getNextToken().token))
182             return true;
183
184         unreadToken();
185         return false;
186     }
187
188     public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
189         TokenizerMatch result = getNextToken();
190         unreadToken();
191         return result;
192     }
193
194     public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
195         String nextToken = peekNextToken().token;
196         return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
197     }
198
199     public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
200         if (peekIsOneOf(possibilities))
201             throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
202     }
203
204     public void unreadToken() {
205         currentIndex = tokenIndexes.pop();
206     }
207
208     public void skipUntilDataEnd() {
209         tokenIndexes.push(currentIndex);
210         currentIndex = source.length();
211     }
212
213 }