X-Git-Url: http://www2.svjatoslav.eu/gitweb/?p=svjatoslav_commons.git;a=blobdiff_plain;f=src%2Fmain%2Fjava%2Feu%2Fsvjatoslav%2Fcommons%2Fstring%2Ftokenizer%2FTokenizer.java;h=14554beb886acffc4e638b31b5e50c49cb6ada51;hp=939ede9dc5a8a7ecfe687745261254ba3b9b235b;hb=b1ffc7025cc976821987469570f07a7298ea16c9;hpb=3bc3db3ceb288b82e48c349bf27dfafda2bcd444

diff --git a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
index 939ede9..14554be 100755
--- a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
+++ b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
@@ -1,55 +1,68 @@
 /*
- * Svjatoslav Commons - shared library of common functionality.
- * Copyright ©2012-2017, Svjatoslav Agejenko, svjatoslav@svjatoslav.eu
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 3 of the GNU Lesser General Public License
- * or later as published by the Free Software Foundation.
+ * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
+ * This project is released under Creative Commons Zero (CC0) license.
  */
-
 package eu.svjatoslav.commons.string.tokenizer;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Stack;
+import java.util.regex.Matcher;
 import java.util.stream.Stream;
 
-import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static java.lang.System.out;
 
 public class Tokenizer {
 
-    final Stack<Integer> tokenIndexes = new Stack<>();
+    /**
+     * Stack of token indexes. This allows walking back in history and un-consuming tokens.
+     */
+    private final Stack<Integer> tokenIndexes = new Stack<>();
+
+    /**
+     * Terminators that this tokenizer will search for within the source string.
+     */
     private final List<Terminator> terminators = new ArrayList<>();
-    private String source;
+
+    private String source; // string to be tokenized
+
     private int currentIndex = 0;
 
     public Tokenizer(final String source) {
         this.source = source;
     }
 
-    public Tokenizer(){}
+    public Tokenizer() {
+    }
 
-    public Tokenizer setSource(String source){
+    public Tokenizer setSource(String source) {
         this.source = source;
         currentIndex = 0;
         tokenIndexes.clear();
         return this;
     }
 
-    public Tokenizer addTerminator(final String startSequence,
-                                   final Terminator.TerminationStrategy terminationStrategy) {
-        terminators.add(new Terminator(startSequence, terminationStrategy));
-        return this;
+    public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+        Terminator terminator = new Terminator(terminationStrategy, regexp, null);
+        terminators.add(terminator);
+        return terminator;
     }
 
-    public Tokenizer addTerminator(final String startSequence,
-                                   final String endSequence, final Terminator.TerminationStrategy terminationStrategy) {
-        terminators.add(new Terminator(startSequence, endSequence, terminationStrategy));
-        return this;
+    public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+                                    String regexp, String group) {
+        Terminator terminator = new Terminator(terminationStrategy, regexp, group);
+        terminators.add(terminator);
+        return terminator;
+    }
+
+
+    public Terminator addTerminator(Terminator terminator) {
+        terminators.add(terminator);
+        return terminator;
    }
 
-    public void expectAndConsumeNextToken(final String value)
+    public void expectAndConsumeNextStringToken(final String value)
             throws InvalidSyntaxException {
         final TokenizerMatch match = getNextToken();
         if (!value.equals(match.token))
@@ -57,68 +70,76 @@ public class Tokenizer {
                     + "\" but got \"" + match.token + "\" instead.");
     }
 
-    public TokenizerMatch getNextToken() throws InvalidSyntaxException {
-        tokenIndexes.push(currentIndex);
+    public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+            throws InvalidSyntaxException {
+        final TokenizerMatch match = getNextToken();
 
-        StringBuilder token = new StringBuilder();
+        if (match.terminator != terminator)
+            throw new InvalidSyntaxException("Expected terminator \"" + terminator
+                    + "\" but got \"" + match.terminator + "\" instead.");
 
-        while (true){
-            if (isOngoingToken()) {
-                token.append(source.charAt(currentIndex));
-                currentIndex++;
-                continue;
-            }
+        return match;
+    }
 
-            Terminator tokenTerminator = findTokenTerminator();
-            if (tokenTerminator.termination == PRESERVE){
-                return buildPreservedToken(token, tokenTerminator);
-            } else if (tokenTerminator.termination == DROP){
-                if (hasAccumulatedToken(token)){
-                    currentIndex++;
-                    return new TokenizerMatch(token.toString(), "", tokenTerminator);
-                } else {
-                    currentIndex++;
-                }
-            }
-        }
+    /**
+     * @return the next {@link TokenizerMatch}, or null if the end of input is reached.
+     */
+    public TokenizerMatch getNextToken() {
+        tokenIndexes.push(currentIndex);
 
-    }
+        StringBuilder tokenAccumulator = new StringBuilder();
+
+        while (true) {
 
-    private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException {
-        if (hasAccumulatedToken(token))
-            return new TokenizerMatch(token.toString(), "", terminator);
+            if (currentIndex >= source.length()) { // reached end of input
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+                else
+                    return null;
+            }
 
-        if (terminator.hasEndSequence()){
-            int endSequenceIndex = source.indexOf(terminator.endSequence,
-                    currentIndex + terminator.startSequence.length());
+            TokenizerMatch matchResult = findTerminatorMatch();
+            if (matchResult == null) {
+                tokenAccumulator.append(source.charAt(currentIndex));
+                currentIndex++;
+                continue;
+            }
 
-            if (endSequenceIndex < 0)
-                throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
+            if (matchResult.terminator.termination == PRESERVE) {
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
 
-            String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
-            currentIndex = endSequenceIndex + terminator.endSequence.length();
+                currentIndex = matchResult.matcher.end();
+                return matchResult;
+            } else {
+                currentIndex = matchResult.matcher.end();
 
-            return new TokenizerMatch(terminator.startSequence, reminder, terminator);
-        } else {
-            currentIndex += terminator.startSequence.length();
-            return new TokenizerMatch(terminator.startSequence, "", terminator);
+                if (hasAccumulatedToken(tokenAccumulator))
+                    return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+            }
         }
 
     }
 
-    private boolean hasAccumulatedToken(StringBuilder token) {
-        return token.length() > 0;
+    public TokenizerMatch findTerminatorMatch() {
+        for (Terminator terminator : terminators)
+            if (terminator.active) {
+                Matcher match = terminator.match(source, currentIndex);
+                if (match.find()) {
+                    String token = source.substring(match.start(), match.end());
+                    return new TokenizerMatch(token, terminator, match, this);
+                }
+            }
+        return null;
     }
 
-    private boolean isOngoingToken() {
-        return findTokenTerminator() == null;
+    private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
+        return tokenAccumulator.length() > 0;
     }
 
-    public Terminator findTokenTerminator() {
-        for (Terminator terminator : terminators)
-            if (terminator.matches(source, currentIndex))
-                return terminator;
-        return null;
+    public boolean hasMoreContent() {
+        if (source == null) return false;
+        return currentIndex < source.length();
     }
 
     public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
@@ -135,46 +156,39 @@ public class Tokenizer {
         return result;
     }
 
-    public boolean peekIsOneOf(String ... possibilities) throws InvalidSyntaxException {
+    public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
         String nextToken = peekNextToken().token;
         return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
     }
 
-    public void peekExpectNoneOf(String ... possibilities) throws InvalidSyntaxException {
+    public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
         if (peekIsOneOf(possibilities))
             throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
     }
 
+    public void unreadToken() {
+        currentIndex = tokenIndexes.pop();
+    }
 
-    public boolean sequenceMatches(final String sequence) {
-        if ((currentIndex + sequence.length()) > source.length())
-            return false;
+    /**
+     * For debugging: print the remaining tokens, then restore the read position.
+     */
+    public void enlistRemainingTokens() {
+        int readTokenCount = 0;
 
-        for (int i = 0; i < sequence.length(); i++)
-            if (sequence.charAt(i) != source.charAt(i + currentIndex))
-                return false;
+        while (hasMoreContent()) {
+            out.println(getNextToken().toString());
+            readTokenCount++;
+        }
 
-        return true;
+        // restore pointer to original location
+        for (int i = 0; i < readTokenCount; i++) unreadToken();
     }
 
+
     public void skipUntilDataEnd() {
         tokenIndexes.push(currentIndex);
         currentIndex = source.length();
     }
 
-    public void skipUntilSequence(final String sequence) {
-        while (currentIndex < source.length()) {
-            if (sequenceMatches(sequence)) {
-                currentIndex += sequence.length();
-                return;
-            }
-
-            currentIndex++;
-        }
-    }
-
-    public void unreadToken() {
-        currentIndex = tokenIndexes.pop();
-    }
-
 }
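
Usage sketch (editor's addition, not part of the commit): the hunks above replace the start/end-sequence terminators with regex-driven ones, so a caller now registers a termination strategy plus a pattern and pulls TokenizerMatch objects until getNextToken() returns null. The example below is a minimal, hypothetical sketch based only on the signatures visible in this diff; the patterns, the sample input, and the assumption that TerminationStrategy still offers DROP alongside PRESERVE are not confirmed here.

    import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
    import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;

    import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
    import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;

    public class TokenizerUsageSketch {
        public static void main(String[] args) {
            Tokenizer tokenizer = new Tokenizer("alpha = beta + 42");

            // Hypothetical patterns: whitespace is dropped, "=" and "+" are preserved as tokens.
            tokenizer.addTerminator(DROP, "\\s+");
            tokenizer.addTerminator(PRESERVE, "[=+]");

            // getNextToken() returns null once the end of input is reached (see its javadoc above).
            while (tokenizer.hasMoreContent()) {
                TokenizerMatch match = tokenizer.getNextToken();
                if (match == null)
                    break;
                System.out.println(match); // exact output depends on TokenizerMatch.toString()
            }
        }
    }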