*/
package eu.svjatoslav.commons.string.tokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
public class Terminator {

    /**
     * Source regular expression this terminator was built from.
     * Kept for diagnostics ({@link #toString()}).
     */
    public final String regexp;

    /** Strategy deciding whether tokens matched by this terminator are returned or dropped. */
    public final TerminationStrategy termination;

    /** Optional group name used to classify matches; may be {@code null}. */
    public final String group;

    /** Inactive terminators are skipped during token matching. Mutable by design. */
    public boolean active = true;

    /** Compiled pattern, anchored so it can only match at the inspected position. */
    public final Pattern pattern;

    /**
     * @param termination strategy deciding whether matched tokens are preserved or dropped.
     * @param regexp      regular expression identifying tokens of this terminator.
     * @param group       optional group name; may be {@code null}.
     */
    public Terminator(TerminationStrategy termination, String regexp, String group) {
        this.termination = termination;
        this.group = group;
        this.regexp = regexp;
        // "^" anchors the pattern to the start of the matcher region configured in
        // match(), so the terminator matches exactly at the requested index and
        // nowhere later in the input.
        this.pattern = Pattern.compile("^" + regexp);
    }

    /**
     * Builds a matcher for this terminator whose match region starts at the given index.
     * Anchoring bounds are on by default, so the leading "^" of {@link #pattern}
     * matches at the region start rather than only at the true start of the input.
     *
     * @param source string to match against.
     * @param index  position in {@code source} where matching may begin.
     * @return matcher positioned over {@code source[index..]}; callers invoke
     *         {@link Matcher#find()} to test for a match.
     */
    public Matcher match(String source, int index) {
        Matcher matcher = pattern.matcher(source);
        matcher.region(index, source.length());
        return matcher;
    }

    @Override
    public String toString() {
        return "Terminator{" +
                "regexp='" + regexp + '\'' +
                ", termination=" + termination +
                ", group='" + group + '\'' +
                ", active=" + active +
                '}';
    }

    public enum TerminationStrategy {
        /**
         * Preserve the token that is identified by the terminator and return it for
         * processing. For example, when building a language parser, use this for
         * statements you want to capture.
         */
        PRESERVE,

        /**
         * Tokens matched by the terminator are identified but dropped and never
         * returned for consumption. For example, when building a language parser you
         * might use this strategy for whitespace and comments: they act as separators
         * between useful tokens but are not consumed themselves.
         */
        DROP
    }
}
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
+import java.util.regex.Matcher;
import java.util.stream.Stream;
-import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
import static java.lang.System.out;
private int currentIndex = 0;
- private int cachedTerminatorIndex = -1;
- private Terminator cachedTerminator;
-
/**
 * Creates a tokenizer over the given input.
 *
 * @param source string to be tokenized.
 */
public Tokenizer(final String source) {
    this.source = source;
}
this.source = source;
currentIndex = 0;
tokenIndexes.clear();
-
- cachedTerminatorIndex = -1;
- cachedTerminator = null;
return this;
}
- public Tokenizer addTerminator(final String startSequence,
- final Terminator.TerminationStrategy terminationStrategy) {
- terminators.add(new Terminator(startSequence, terminationStrategy));
- return this;
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+ Terminator terminator = new Terminator(terminationStrategy, regexp,null);
+ terminators.add(terminator);
+ return terminator;
}
- public Tokenizer addTerminator(Terminator terminator) {
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+ String regexp, String group) {
+ Terminator terminator = new Terminator(terminationStrategy, regexp,group);
terminators.add(terminator);
- return this;
+ return terminator;
}
- public Tokenizer addTerminator(final String startSequence,
- final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
- terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
- return this;
+
+ public Terminator addTerminator(Terminator terminator) {
+ terminators.add(terminator);
+ return terminator;
}
- public void expectAndConsumeNextToken(final String value)
+ public void expectAndConsumeNextStringToken(final String value)
throws InvalidSyntaxException {
final TokenizerMatch match = getNextToken();
if (!value.equals(match.token))
+ "\" but got \"" + match.token + "\" instead.");
}
+ public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+ throws InvalidSyntaxException {
+ final TokenizerMatch match = getNextToken();
+
+ if (match.terminator != terminator)
+ throw new InvalidSyntaxException("Expected terminator \"" + terminator
+ + "\" but got \"" + match.terminator + "\" instead.");
+
+ return match;
+ }
+
+
/**
* @return next @TokenizerMatch or <code>null</code> if end of input is reached.
- * @throws InvalidSyntaxException
*/
- public TokenizerMatch getNextToken() throws InvalidSyntaxException {
+ public TokenizerMatch getNextToken() {
tokenIndexes.push(currentIndex);
StringBuilder tokenAccumulator = new StringBuilder();
return null;
}
- if (isOngoingToken()) {
+ TokenizerMatch matchResult = findTerminatorMatch();
+ if (matchResult == null) {
tokenAccumulator.append(source.charAt(currentIndex));
currentIndex++;
continue;
}
- Terminator terminator = getOrFindTokenTerminator();
+ if (matchResult.terminator.termination == PRESERVE) {
+ if (hasAccumulatedToken(tokenAccumulator))
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null);
- if (terminator.termination == PRESERVE)
- return buildPreservedToken(tokenAccumulator, terminator);
- else if (terminator.termination == DROP) {
- skipUntilTerminatorEnd(terminator);
+ currentIndex = matchResult.matcher.end();
+ return matchResult;
+ } else {
+ currentIndex = matchResult.matcher.end();
if (hasAccumulatedToken(tokenAccumulator))
- return new TokenizerMatch(tokenAccumulator.toString(), null, terminator);
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null);
}
}
-
}
- private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException {
- if (terminator.hasEndSequence())
- currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length();
- else
- currentIndex += terminator.startSequence.length();
- }
-
- /**
- * @throws InvalidSyntaxException if end sequence is not found as is expected by given token.
- */
- private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator)
- throws InvalidSyntaxException {
- if (hasAccumulatedToken(token))
- return new TokenizerMatch(token.toString(), null, terminator);
-
- if (terminator.hasEndSequence())
- return buildTokenWithExpectedENdSequence(terminator);
- else
- return buildTokenWithoutEndSequence(terminator);
- }
-
- private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) {
- currentIndex += terminator.startSequence.length();
- return new TokenizerMatch(terminator.startSequence, null, terminator);
- }
-
- private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException {
- int endSequenceIndex = getEndSequenceIndex(terminator);
- String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
- currentIndex = endSequenceIndex + terminator.endSequence.length();
-
- return new TokenizerMatch(terminator.startSequence, reminder, terminator);
- }
-
- /**
- * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence.
- */
- private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException {
- int endSequenceIndex = source.indexOf(terminator.endSequence,
- currentIndex + terminator.startSequence.length());
-
- if (endSequenceIndex < 0)
- throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
-
- return endSequenceIndex;
- }
-
- private boolean hasAccumulatedToken(StringBuilder token) {
- return token.length() > 0;
+ public TokenizerMatch findTerminatorMatch(){
+ for (Terminator terminator : terminators)
+ if (terminator.active) {
+ Matcher match = terminator.match(source, currentIndex);
+ if (match.find()) {
+ String token = source.substring(match.start(), match.end());
+ return new TokenizerMatch(token, terminator, match);
+ }
+ }
+ return null;
}
- private boolean isOngoingToken() {
- return getOrFindTokenTerminator() == null;
+ private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
+ return tokenAccumulator.length() > 0;
}
public boolean hasMoreContent() {
+ if (source == null) return false;
return currentIndex < source.length();
}
- /**
- * Attempts to cache terminator search result.
- */
- public Terminator getOrFindTokenTerminator() {
- if (currentIndex == cachedTerminatorIndex)
- return cachedTerminator;
-
- cachedTerminatorIndex = currentIndex;
- cachedTerminator = findTokenTerminator();
- return cachedTerminator;
- }
-
- private Terminator findTokenTerminator() {
- for (Terminator terminator : terminators)
- if (terminator.matches(source, currentIndex))
- return terminator;
- return null;
- }
-
public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
if (token.equals(getNextToken().token))
return true;
public void enlistRemainingTokens(){
int redTokenCount = 0;
- try {
- while (hasMoreContent()) {
- out.println(getNextToken().toString());
- redTokenCount++;
- }
- } catch (InvalidSyntaxException e){
- out.println("There is syntax exception");
+ while (hasMoreContent()) {
+ out.println(getNextToken().toString());
+ redTokenCount++;
}
// restore pointer to original location
*/
package eu.svjatoslav.commons.string.tokenizer;
+import java.util.regex.Matcher;
+
public class TokenizerMatch {
public final String token;
- public final String reminder;
+
+ /**
+ * {@link Terminator} that matched current token
+ */
public final Terminator terminator;
- public TokenizerMatch(final String token, final String reminder, final Terminator terminator) {
+ public final Matcher matcher;
+
+
/**
 * @param token      matched token text.
 * @param terminator terminator that produced this match; null for plain tokens
 *                   accumulated between terminator matches.
 * @param matcher    regex matcher state behind the match; null for plain tokens.
 */
public TokenizerMatch(final String token, final Terminator terminator, Matcher matcher) {
    this.terminator = terminator;
    this.matcher = matcher;
    this.token = token;
}
+
+ public boolean isGroup(String group){
+ if (terminator == null){
+ return group == null;
+ }
+
+ if (terminator.group == null){
+ return group == null;
+ }
+
+ return terminator.group.equals(group);
}
@Override
public String toString() {
    // Produces e.g. TokenizerMatch{token='abc', terminator=...}
    final StringBuilder text = new StringBuilder("TokenizerMatch{");
    text.append("token='").append(token).append('\'');
    text.append(", terminator=").append(terminator);
    text.append('}');
    return text.toString();
}
@Test
public void testMatches() {
Terminator terminator = new Terminator(
- "/*", "*/", Terminator.TerminationStrategy.PRESERVE);
+ Terminator.TerminationStrategy.PRESERVE,
+ "/\\*.+\\*/",
+ "test");
// must find
- assertTrue(terminator.matches("/* bla bla bla */", 0));
+ assertTrue(terminator.match("/* bla bla bla */", 0).find());
// must not find
- assertFalse(terminator.matches("/* bla bla bla */", 1));
+ assertFalse(terminator.match("/* bla bla bla */", 1).find());
// must not overflow
- assertFalse(terminator.matches("/", 0));
+ assertFalse(terminator.match("/", 0).find());
}
}
\ No newline at end of file
@Test
public void testPeeking() throws Exception {
- Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test")
- .addTerminator(" ", DROP)
- .addTerminator("N'", "'", PRESERVE);
+ Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test");
+ tokenizer.addTerminator(DROP, "\\s");
+ tokenizer.addTerminator(PRESERVE, "N'.*'");
- tokenizer.expectAndConsumeNextToken("this");
+ tokenizer.expectAndConsumeNextStringToken("this");
assertEquals("is", tokenizer.peekNextToken().token);
assertEquals("is", tokenizer.peekNextToken().token);
- assertEquals(true, tokenizer.peekIsOneOf("maybe", "is", "that"));
+ assertTrue(tokenizer.peekIsOneOf("maybe", "is", "that"));
}
@Test
public void testTokenization() throws Exception {
- Tokenizer tokenizer = new Tokenizer("\"hello\" /** comment **/ (( is a N'2015-03-18 09:48:54.360' test")
- .addTerminator(" ", DROP)
- .addTerminator("(", PRESERVE)
- .addTerminator("\"", "\"", PRESERVE)
- .addTerminator("N'", "'", PRESERVE)
- .addTerminator("/*", "*/", DROP);
-
- assertTokenEquals("\"", "hello", tokenizer);
- assertTokenEquals("(", null, tokenizer);
- assertTokenEquals("(", null, tokenizer);
- assertTokenEquals("is", null, tokenizer);
- assertTokenEquals("a", null, tokenizer);
- assertTokenEquals("N'", "2015-03-18 09:48:54.360", tokenizer);
- assertTokenEquals("test", null, tokenizer);
+ Tokenizer tokenizer = new Tokenizer("\"hello world\" /** comment **/ (( is a N'2015-03-18 09:48:54.360' test");
+ tokenizer.addTerminator(DROP,"\\s");
+ tokenizer.addTerminator(PRESERVE,"\\(");
+ tokenizer.addTerminator(PRESERVE, "\\\".*\\\"");
+ tokenizer.addTerminator(PRESERVE, "N'.*'");
+ tokenizer.addTerminator(DROP,"/\\*.*\\*/");
+
+ assertTokenEquals("\"hello world\"", tokenizer);
+ assertTokenEquals("(", tokenizer);
+ assertTokenEquals("(", tokenizer);
+ assertTokenEquals("is", tokenizer);
+ assertTokenEquals("a", tokenizer);
+ assertTokenEquals("N'2015-03-18 09:48:54.360'", tokenizer);
+ assertTokenEquals("test", tokenizer);
assertNull(tokenizer.getNextToken());
assertFalse(tokenizer.hasMoreContent());
}
- private void assertTokenEquals(String token, String reminder, Tokenizer tokenizer) throws InvalidSyntaxException {
- TokenizerMatch nextToken = tokenizer.getNextToken();
-
- assertEquals(token, nextToken.token);
-
- if (reminder == null)
- assertNull(nextToken.reminder);
- else
- assertEquals(reminder, nextToken.reminder);
- }
-
- private void debugNextToken(Tokenizer tokenizer) throws InvalidSyntaxException {
- TokenizerMatch nextToken = tokenizer.getNextToken();
- if (nextToken == null)
- System.out.println("null");
- else
- System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\"");
+ private void assertTokenEquals(String expectedValue, Tokenizer tokenizer){
+ assertEquals(expectedValue, tokenizer.getNextToken().token);
}
}
\ No newline at end of file