Helper function to split string into groups based on regexp. Possibility to retrieve...
[svjatoslav_commons.git] / src / main / java / eu / svjatoslav / commons / string / tokenizer / Tokenizer.java
1 /*
2  * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
3  * This project is released under Creative Commons Zero (CC0) license.
4  */
5 package eu.svjatoslav.commons.string.tokenizer;
6
7 import java.util.ArrayList;
8 import java.util.List;
9 import java.util.Stack;
10 import java.util.regex.Matcher;
11 import java.util.stream.Stream;
12
13 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
14 import static java.lang.System.out;
15
16 public class Tokenizer {
17
18     /**
19      * Stack of token indexes. This allows to walk back in history and un-consume the token.
20      */
21     private final Stack<Integer> tokenIndexes = new Stack<>();
22
23     /**
24      * Terminators that will be searched for by given tokenizer within given source string.
25      */
26     private final List<Terminator> terminators = new ArrayList<>();
27
28     private String source; // string to be tokenized
29
30     private int currentIndex = 0;
31
32     public Tokenizer(final String source) {
33         this.source = source;
34     }
35
36     public Tokenizer() {
37     }
38
39     public Tokenizer setSource(String source) {
40         this.source = source;
41         currentIndex = 0;
42         tokenIndexes.clear();
43         return this;
44     }
45
46     public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
47         Terminator terminator = new Terminator(terminationStrategy, regexp,null);
48         terminators.add(terminator);
49         return terminator;
50     }
51
52     public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
53                                     String regexp, String group) {
54         Terminator terminator = new Terminator(terminationStrategy, regexp,group);
55         terminators.add(terminator);
56         return terminator;
57     }
58
59
60     public Terminator addTerminator(Terminator terminator) {
61         terminators.add(terminator);
62         return terminator;
63     }
64
65     public void expectAndConsumeNextStringToken(final String value)
66             throws InvalidSyntaxException {
67         final TokenizerMatch match = getNextToken();
68         if (!value.equals(match.token))
69             throw new InvalidSyntaxException("Expected \"" + value
70                     + "\" but got \"" + match.token + "\" instead.");
71     }
72
73     public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
74             throws InvalidSyntaxException {
75         final TokenizerMatch match = getNextToken();
76
77         if (match.terminator != terminator)
78             throw new InvalidSyntaxException("Expected terminator \"" + terminator
79                     + "\" but got \"" + match.terminator + "\" instead.");
80
81         return match;
82     }
83
84
85     /**
86      * @return next @TokenizerMatch or <code>null</code> if end of input is reached.
87      */
88     public TokenizerMatch getNextToken() {
89         tokenIndexes.push(currentIndex);
90
91         StringBuilder tokenAccumulator = new StringBuilder();
92
93         while (true) {
94
95             if (currentIndex >= source.length()) { // reached end of input
96                 if (hasAccumulatedToken(tokenAccumulator))
97                     return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
98                 else
99                     return null;
100             }
101
102             TokenizerMatch matchResult = findTerminatorMatch();
103             if (matchResult == null) {
104                 tokenAccumulator.append(source.charAt(currentIndex));
105                 currentIndex++;
106                 continue;
107             }
108
109             if (matchResult.terminator.termination == PRESERVE) {
110                 if (hasAccumulatedToken(tokenAccumulator))
111                     return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
112
113                 currentIndex = matchResult.matcher.end();
114                 return matchResult;
115             } else {
116                 currentIndex = matchResult.matcher.end();
117
118                 if (hasAccumulatedToken(tokenAccumulator))
119                     return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
120             }
121         }
122     }
123
124     public TokenizerMatch findTerminatorMatch(){
125         for (Terminator terminator : terminators)
126             if (terminator.active) {
127                 Matcher match = terminator.match(source, currentIndex);
128                 if (match.find()) {
129                     String token = source.substring(match.start(), match.end());
130                     return new TokenizerMatch(token, terminator, match, this);
131                 }
132             }
133         return null;
134     }
135
136     private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
137         return tokenAccumulator.length() > 0;
138     }
139
140     public boolean hasMoreContent() {
141         if (source == null) return false;
142         return currentIndex < source.length();
143     }
144
145     public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
146         if (token.equals(getNextToken().token))
147             return true;
148
149         unreadToken();
150         return false;
151     }
152
153     public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
154         TokenizerMatch result = getNextToken();
155         unreadToken();
156         return result;
157     }
158
159     public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
160         String nextToken = peekNextToken().token;
161         return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
162     }
163
164     public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
165         if (peekIsOneOf(possibilities))
166             throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
167     }
168
169     public void unreadToken() {
170         currentIndex = tokenIndexes.pop();
171     }
172
173     /**
174      * For debugging
175      */
176     public void enlistRemainingTokens(){
177         int redTokenCount = 0;
178
179         while (hasMoreContent()) {
180             out.println(getNextToken().toString());
181             redTokenCount++;
182         }
183
184         // restore pointer to original location
185         for (int i = 0; i< redTokenCount; i++ ) unreadToken();
186     }
187
188
189     public void skipUntilDataEnd() {
190         tokenIndexes.push(currentIndex);
191         currentIndex = source.length();
192     }
193
194 }