cc2036949557778171a48380e1b991226518bb73
[svjatoslav_commons.git] / src / main / java / eu / svjatoslav / commons / string / tokenizer / Tokenizer.java
1 /*
2  * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
3  * This project is released under Creative Commons Zero (CC0) license.
4  */
5 package eu.svjatoslav.commons.string.tokenizer;
6
7 import java.util.ArrayList;
8 import java.util.List;
9 import java.util.Stack;
10 import java.util.stream.Stream;
11
12 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
13 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
14 import static java.lang.System.out;
15
16 public class Tokenizer {
17
18     /**
19      * Stack of token indexes. This allows to walk back in history and un-consume the token.
20      */
21     private final Stack<Integer> tokenIndexes = new Stack<>();
22
23     /**
24      * Terminators that will be searched for by given tokenizer within given source string.
25      */
26     private final List<Terminator> terminators = new ArrayList<>();
27
28     private String source; // string to be tokenized
29
30     private int currentIndex = 0;
31
32     private int cachedTerminatorIndex = -1;
33     private Terminator cachedTerminator;
34
35     public Tokenizer(final String source) {
36         this.source = source;
37     }
38
39     public Tokenizer() {
40     }
41
42     public Tokenizer setSource(String source) {
43         this.source = source;
44         currentIndex = 0;
45         tokenIndexes.clear();
46
47         cachedTerminatorIndex = -1;
48         cachedTerminator = null;
49         return this;
50     }
51
52     public Tokenizer addTerminator(final String startSequence,
53                                    final Terminator.TerminationStrategy terminationStrategy) {
54         terminators.add(new Terminator(startSequence, terminationStrategy));
55         return this;
56     }
57
58     public Tokenizer addTerminator(Terminator terminator) {
59         terminators.add(terminator);
60         return this;
61     }
62
63     public Tokenizer addTerminator(final String startSequence,
64                                    final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
65         terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
66         return this;
67     }
68
69     public void expectAndConsumeNextToken(final String value)
70             throws InvalidSyntaxException {
71         final TokenizerMatch match = getNextToken();
72         if (!value.equals(match.token))
73             throw new InvalidSyntaxException("Expected \"" + value
74                     + "\" but got \"" + match.token + "\" instead.");
75     }
76
77     /**
78      * @return next @TokenizerMatch or <code>null</code> if end of input is reached.
79      * @throws InvalidSyntaxException
80      */
81     public TokenizerMatch getNextToken() throws InvalidSyntaxException {
82         tokenIndexes.push(currentIndex);
83
84         StringBuilder tokenAccumulator = new StringBuilder();
85
86         while (true) {
87
88             if (currentIndex >= source.length()) { // reached end of input
89                 if (hasAccumulatedToken(tokenAccumulator))
90                     return new TokenizerMatch(tokenAccumulator.toString(), null, null);
91                 else
92                     return null;
93             }
94
95             if (isOngoingToken()) {
96                 tokenAccumulator.append(source.charAt(currentIndex));
97                 currentIndex++;
98                 continue;
99             }
100
101             Terminator terminator = getOrFindTokenTerminator();
102
103             if (terminator.termination == PRESERVE)
104                 return buildPreservedToken(tokenAccumulator, terminator);
105             else if (terminator.termination == DROP) {
106                 skipUntilTerminatorEnd(terminator);
107
108                 if (hasAccumulatedToken(tokenAccumulator))
109                     return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
110             }
111         }
112
113     }
114
115     private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
116         if (terminator.hasEndSequence())
117             currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
118         else
119             currentIndex += terminator.startSequence.length();
120     }
121
122     /**
123      * @throws InvalidSyntaxException if end sequence is not found as is expected by given token.
124      */
125     private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator)
126             throws InvalidSyntaxException {
127         if (hasAccumulatedToken(token))
128             return new TokenizerMatch(token.toString(), null, terminator);
129
130         if (terminator.hasEndSequence())
131             return buildTokenWithExpectedENdSequence(terminator);
132         else
133             return buildTokenWithoutEndSequence(terminator);
134     }
135
136     private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) {
137         currentIndex += terminator.startSequence.length();
138         return new TokenizerMatch(terminator.startSequence, null, terminator);
139     }
140
141     private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException {
142         int endSequenceIndex = getEndSequenceIndex(terminator);
143         String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
144         currentIndex = endSequenceIndex + terminator.endSequence.length();
145
146         return new TokenizerMatch(terminator.startSequence, reminder, terminator);
147     }
148
149     /**
150      * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence.
151      */
152     private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
153         int endSequenceIndex = source.indexOf(terminator.endSequence,
154                 currentIndex + terminator.startSequence.length());
155
156         if (endSequenceIndex < 0)
157             throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
158
159         return endSequenceIndex;
160     }
161
162     private boolean hasAccumulatedToken(StringBuilder token) {
163         return token.length() > 0;
164     }
165
166     private boolean isOngoingToken() {
167         return getOrFindTokenTerminator() == null;
168     }
169
170     public boolean hasMoreContent() {
171         return currentIndex < source.length();
172     }
173
174     /**
175      * Attempts to cache terminator search result.
176      */
177     public Terminator getOrFindTokenTerminator() {
178         if (currentIndex == cachedTerminatorIndex)
179             return cachedTerminator;
180
181         cachedTerminatorIndex = currentIndex;
182         cachedTerminator = findTokenTerminator();
183         return cachedTerminator;
184     }
185
186     private Terminator findTokenTerminator() {
187         for (Terminator terminator : terminators)
188             if (terminator.matches(source, currentIndex))
189                 return terminator;
190         return null;
191     }
192
193     public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
194         if (token.equals(getNextToken().token))
195             return true;
196
197         unreadToken();
198         return false;
199     }
200
201     public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
202         TokenizerMatch result = getNextToken();
203         unreadToken();
204         return result;
205     }
206
207     public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
208         String nextToken = peekNextToken().token;
209         return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
210     }
211
212     public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
213         if (peekIsOneOf(possibilities))
214             throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
215     }
216
217     public void unreadToken() {
218         currentIndex = tokenIndexes.pop();
219     }
220
221     /**
222      * For debugging
223      */
224     public void enlistRemainingTokens(){
225         int redTokenCount = 0;
226
227         try {
228             while (hasMoreContent()) {
229                 out.println(getNextToken().toString());
230                 redTokenCount++;
231             }
232         } catch (InvalidSyntaxException e){
233             out.println("There is syntax exception");
234         }
235
236         // restore pointer to original location
237         for (int i = 0; i< redTokenCount; i++ ) unreadToken();
238     }
239
240
241     public void skipUntilDataEnd() {
242         tokenIndexes.push(currentIndex);
243         currentIndex = source.length();
244     }
245
246 }