Handle complex content preserving terminators.
[svjatoslav_commons.git] / src / main / java / eu / svjatoslav / commons / string / tokenizer / Tokenizer.java
1 /*
2  * Svjatoslav Commons - shared library of common functionality.
3  * Copyright ©2012-2017, Svjatoslav Agejenko, svjatoslav@svjatoslav.eu
4  * 
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of version 3 of the GNU Lesser General Public License
7  * or later as published by the Free Software Foundation.
8  */
9
10 package eu.svjatoslav.commons.string.tokenizer;
11
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.Stack;
15 import java.util.stream.Stream;
16
17 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
18 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
19
20 public class Tokenizer {
21
22     final Stack<Integer> tokenIndexes = new Stack<>();
23     private final List<Terminator> terminators = new ArrayList<>();
24     private String source;
25     private int currentIndex = 0;
26
27     public Tokenizer(final String source) {
28         this.source = source;
29     }
30
31     public Tokenizer(){}
32
33     public Tokenizer setSource(String source){
34         this.source = source;
35         currentIndex = 0;
36         tokenIndexes.clear();
37         return this;
38     }
39
40     public Tokenizer addTerminator(final String startSequence,
41                                    final Terminator.TerminationStrategy terminationStrategy) {
42         terminators.add(new Terminator(startSequence, terminationStrategy));
43         return this;
44     }
45
46     public Tokenizer addTerminator(final String startSequence,
47                                    final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
48         terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
49         return this;
50     }
51
52     public void expectAndConsumeNextToken(final String value)
53             throws InvalidSyntaxException {
54         final TokenizerMatch match = getNextToken();
55         if (!value.equals(match.token))
56             throw new InvalidSyntaxException("Expected \"" + value
57                     + "\" but got \"" + match.token + "\" instead.");
58     }
59
60     public TokenizerMatch getNextToken() throws InvalidSyntaxException {
61         tokenIndexes.push(currentIndex);
62
63         StringBuilder token = new StringBuilder();
64
65         while (true){
66             if (isOngoingToken()) {
67                 token.append(source.charAt(currentIndex));
68                 currentIndex++;
69                 continue;
70             }
71
72             Terminator tokenTerminator = findTokenTerminator();
73
74             if (tokenTerminator.termination == PRESERVE){
75                 return buildPreservedToken(token, tokenTerminator);
76             } else if (tokenTerminator.termination == DROP){
77                 if (hasAccumulatedToken(token)){
78                     currentIndex++;
79                     return new TokenizerMatch(token.toString(), "", tokenTerminator);
80                 } else {
81                     currentIndex++;
82                 }
83             }
84         }
85
86     }
87
88     private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException {
89         if (hasAccumulatedToken(token))
90             return new TokenizerMatch(token.toString(), "", terminator);
91
92         if (terminator.hasEndSequence()){
93             int endSequenceIndex = source.indexOf(terminator.endSequence,
94                     currentIndex + terminator.startSequence.length());
95
96             if (endSequenceIndex < 0)
97                 throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
98
99             String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
100             currentIndex = endSequenceIndex + terminator.endSequence.length();
101
102             return new TokenizerMatch(terminator.startSequence, reminder, terminator);
103         } else {
104             currentIndex += terminator.startSequence.length();
105             return new TokenizerMatch(terminator.startSequence, "", terminator);
106         }
107     }
108
109     private boolean hasAccumulatedToken(StringBuilder token) {
110         return token.length() > 0;
111     }
112
113     private boolean isOngoingToken() {
114         return findTokenTerminator() == null;
115     }
116
117     public Terminator findTokenTerminator() {
118         for (Terminator terminator : terminators)
119             if (terminator.matches(source, currentIndex))
120                 return terminator;
121         return null;
122     }
123
124     public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
125         if (token.equals(getNextToken().token))
126             return true;
127
128         unreadToken();
129         return false;
130     }
131
132     public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
133         TokenizerMatch result = getNextToken();
134         unreadToken();
135         return result;
136     }
137
138     public boolean peekIsOneOf(String ... possibilities) throws InvalidSyntaxException {
139         String nextToken = peekNextToken().token;
140         return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
141     }
142
143     public void peekExpectNoneOf(String ... possibilities) throws InvalidSyntaxException {
144         if (peekIsOneOf(possibilities))
145             throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
146     }
147
148
149     public boolean sequenceMatches(final String sequence) {
150         if ((currentIndex + sequence.length()) > source.length())
151             return false;
152
153         for (int i = 0; i < sequence.length(); i++)
154             if (sequence.charAt(i) != source.charAt(i + currentIndex))
155                 return false;
156
157         return true;
158     }
159
160     public void skipUntilDataEnd() {
161         tokenIndexes.push(currentIndex);
162         currentIndex = source.length();
163     }
164
165     public void skipUntilSequence(final String sequence) {
166         while (currentIndex < source.length()) {
167             if (sequenceMatches(sequence)) {
168                 currentIndex += sequence.length();
169                 return;
170             }
171
172             currentIndex++;
173         }
174     }
175
176     public void unreadToken() {
177         currentIndex = tokenIndexes.pop();
178     }
179
180 }