From ba10494483d7eaf1e9d58ddb1402806c8fc58178 Mon Sep 17 00:00:00 2001 From: Svjatoslav Agejenko Date: Thu, 12 Oct 2017 13:13:25 +0300 Subject: [PATCH] Handle end of input. Speed improvements. Deleted legacy code. --- .../commons/string/tokenizer/Tokenizer.java | 135 ++++++++++-------- .../string/tokenizer/TokenizerTest.java | 82 ++++------- 2 files changed, 109 insertions(+), 108 deletions(-) diff --git a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java index 939ede9..722e17a 100755 --- a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java +++ b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java @@ -24,6 +24,9 @@ public class Tokenizer { private String source; private int currentIndex = 0; + int cachedTerminatorIndex = -1; + Terminator cachedTerminator; + public Tokenizer(final String source) { this.source = source; } @@ -34,6 +37,9 @@ public class Tokenizer { this.source = source; currentIndex = 0; tokenIndexes.clear(); + + cachedTerminatorIndex = -1; + cachedTerminator = null; return this; } @@ -57,53 +63,80 @@ public class Tokenizer { + "\" but got \"" + match.token + "\" instead."); } + + public TokenizerMatch getNextToken() throws InvalidSyntaxException { tokenIndexes.push(currentIndex); - StringBuilder token = new StringBuilder(); + StringBuilder tokenAccumulator = new StringBuilder(); while (true){ + + if (currentIndex >= source.length()){ // reached end of input + if (hasAccumulatedToken(tokenAccumulator)) + return new TokenizerMatch(tokenAccumulator.toString(), null, null); + else + return null; + } + if (isOngoingToken()) { - token.append(source.charAt(currentIndex)); + tokenAccumulator.append(source.charAt(currentIndex)); currentIndex++; continue; } - Terminator tokenTerminator = findTokenTerminator(); - - if (tokenTerminator.termination == PRESERVE){ - return buildPreservedToken(token, tokenTerminator); - } else if (tokenTerminator.termination == DROP){ - if (hasAccumulatedToken(token)){ - currentIndex++; - return new TokenizerMatch(token.toString(), "", tokenTerminator); - } else { - currentIndex++; - } + Terminator terminator = getOrFindTokenTerminator(); + + if (terminator.termination == PRESERVE) + return buildPreservedToken(tokenAccumulator, terminator); + else if (terminator.termination == DROP){ + skipUntilTerminatorEnd(terminator); + + if (hasAccumulatedToken(tokenAccumulator)) + return new TokenizerMatch(tokenAccumulator.toString(), null, terminator); } } } + private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException { + if (terminator.hasEndSequence()) + currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length(); + else + currentIndex += terminator.startSequence.length(); + } + private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException { if (hasAccumulatedToken(token)) - return new TokenizerMatch(token.toString(), "", terminator); + return new TokenizerMatch(token.toString(), null, terminator); - if (terminator.hasEndSequence()){ - int endSequenceIndex = source.indexOf(terminator.endSequence, - currentIndex + terminator.startSequence.length()); + if (terminator.hasEndSequence()) + return buildComplexPreservedToken(terminator); + else + return buildSimplePreservedToken(terminator); + } - if (endSequenceIndex < 0) - throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found."); + private TokenizerMatch 
buildSimplePreservedToken(Terminator terminator) { + currentIndex += terminator.startSequence.length(); + return new TokenizerMatch(terminator.startSequence, null, terminator); + } - String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex); - currentIndex = endSequenceIndex + terminator.endSequence.length(); + private TokenizerMatch buildComplexPreservedToken(Terminator terminator) throws InvalidSyntaxException { + int endSequenceIndex = getEndSequenceIndex(terminator); + String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex); + currentIndex = endSequenceIndex + terminator.endSequence.length(); - return new TokenizerMatch(terminator.startSequence, reminder, terminator); - } else { - currentIndex += terminator.startSequence.length(); - return new TokenizerMatch(terminator.startSequence, "", terminator); - } + return new TokenizerMatch(terminator.startSequence, reminder, terminator); + } + + private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException { + int endSequenceIndex = source.indexOf(terminator.endSequence, + currentIndex + terminator.startSequence.length()); + + if (endSequenceIndex < 0) + throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found."); + + return endSequenceIndex; } private boolean hasAccumulatedToken(StringBuilder token) { @@ -111,10 +144,26 @@ public class Tokenizer { } private boolean isOngoingToken() { - return findTokenTerminator() == null; + return getOrFindTokenTerminator() == null; } - public Terminator findTokenTerminator() { + public boolean hasMoreTokens(){ + return currentIndex < source.length(); + } + + /** + * Attempts to cache terminator search result. + */ + public Terminator getOrFindTokenTerminator() { + if (currentIndex == cachedTerminatorIndex) + return cachedTerminator; + + cachedTerminatorIndex = currentIndex; + cachedTerminator = findTokenTerminator(); + return cachedTerminator; + } + + private Terminator findTokenTerminator() { for (Terminator terminator : terminators) if (terminator.matches(source, currentIndex)) return terminator; @@ -144,35 +193,7 @@ public class Tokenizer { if (peekIsOneOf(possibilities)) throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here."); } - - - public boolean sequenceMatches(final String sequence) { - if ((currentIndex + sequence.length()) > source.length()) - return false; - - for (int i = 0; i < sequence.length(); i++) - if (sequence.charAt(i) != source.charAt(i + currentIndex)) - return false; - - return true; - } - - public void skipUntilDataEnd() { - tokenIndexes.push(currentIndex); - currentIndex = source.length(); - } - - public void skipUntilSequence(final String sequence) { - while (currentIndex < source.length()) { - if (sequenceMatches(sequence)) { - currentIndex += sequence.length(); - return; - } - - currentIndex++; - } - } - + public void unreadToken() { currentIndex = tokenIndexes.pop(); } diff --git a/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java b/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java index ddb2662..84571e8 100644 --- a/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java +++ b/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java @@ -5,21 +5,13 @@ import org.junit.Test; import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP; import static 
eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; public class TokenizerTest { - @Test - public void findTokenTerminator() throws Exception { - - Tokenizer tokenizer = new Tokenizer("this /* comment */ a test") - .addTerminator("/*", "*/", PRESERVE); - - - - } - @Test - public void you_can_peek() throws Exception { + public void testPeeking() throws Exception { Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test") .addTerminator(" ", DROP) .addTerminator("N'", "'", PRESERVE); @@ -34,56 +26,44 @@ public class TokenizerTest { } @Test - public void complexTerminator() throws Exception { - Tokenizer tokenizer = new Tokenizer("/* hello */ /** comment **/ (( is a N'2015-03-18 09:48:54.360' test") + public void testTokenization() throws Exception { + Tokenizer tokenizer = new Tokenizer("\"hello\" /** comment **/ (( is a N'2015-03-18 09:48:54.360' test") .addTerminator(" ", DROP) .addTerminator("(", PRESERVE) .addTerminator("\"", "\"" , PRESERVE) - .addTerminator("/*", "*/" , PRESERVE) + .addTerminator("N'", "'" , PRESERVE) + .addTerminator("/*", "*/" , DROP) ; - TokenizerMatch nextToken = tokenizer.getNextToken(); - System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\""); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - System.out.println(tokenizer.getNextToken().token); - + assertTokenEquals("\"", "hello", tokenizer); + assertTokenEquals("(", null, tokenizer); + assertTokenEquals("(", null, tokenizer); + assertTokenEquals("is", null, tokenizer); + assertTokenEquals("a", null, tokenizer); + assertTokenEquals("N'", "2015-03-18 09:48:54.360", tokenizer); + assertTokenEquals("test", null, tokenizer); -// tokenizer.expectAndConsumeNextToken("this"); -// -// assertEquals("is", tokenizer.peekNextToken().token); -// -// assertEquals("is", tokenizer.peekNextToken().token); -// -// assertEquals(true, tokenizer.peekIsOneOf("maybe", "is", "that")); + assertNull(tokenizer.getNextToken()); + assertFalse(tokenizer.hasMoreTokens()); } + private void assertTokenEquals(String token, String reminder, Tokenizer tokenizer) throws InvalidSyntaxException { + TokenizerMatch nextToken = tokenizer.getNextToken(); - @Test - public void testComplexTerminator() throws Exception { - Tokenizer tokenizer = new Tokenizer("this N'2015-03-18 09:48:54.360' /* thoe unto u */ test") - .addTerminator(" ", DROP) - .addTerminator("/*", "*/", PRESERVE); - -// tokenizer.expectAndConsumeNextToken("this"); - -// assertEquals("2015-03-18 09:48:54.360", tokenizer.getNextToken().token); - - System.out.println("1st: " + tokenizer.getNextToken().token); - - System.out.println("2nd: " + tokenizer.getNextToken().token); - - System.out.println("2nd: " + tokenizer.getNextToken().token); - - System.out.println("2nd: " + tokenizer.getNextToken().token); - - System.out.println("2nd: " + tokenizer.getNextToken().token); + assertEquals(token, nextToken.token); + if (reminder == null) + assertNull(nextToken.reminder); + else + assertEquals(reminder, nextToken.reminder); } + private void debugNextToken(Tokenizer 
tokenizer) throws InvalidSyntaxException { + TokenizerMatch nextToken = tokenizer.getNextToken(); + if (nextToken == null) + System.out.println("null"); + else + System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\""); + } + } \ No newline at end of file -- 2.20.1
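
For context, a minimal usage sketch of the Tokenizer API this patch touches, built only from calls visible in the hunks above (chained addTerminator, getNextToken returning null at end of input, hasMoreTokens). The package locations are assumed from the file paths in the diff; the class name, main method, and sample input string are illustrative only, not part of the library.

    import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
    import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;

    import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException;
    import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
    import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;

    public class TokenizerUsageSketch {

        public static void main(String[] args) throws InvalidSyntaxException {
            // Whitespace terminators are dropped; the N'...' region is preserved,
            // with the enclosed text returned via TokenizerMatch.reminder.
            Tokenizer tokenizer = new Tokenizer("select N'2015-03-18' from table")
                    .addTerminator(" ", DROP)
                    .addTerminator("N'", "'", PRESERVE);

            // With this patch, getNextToken() returns null once the end of input
            // is reached, so the whole source can be consumed with a plain loop.
            for (TokenizerMatch match = tokenizer.getNextToken();
                 match != null;
                 match = tokenizer.getNextToken())
                System.out.println("token: " + match.token + ", reminder: " + match.reminder);

            // hasMoreTokens() reports whether unread input remains (false here).
            System.out.println("more tokens: " + tokenizer.hasMoreTokens());
        }
    }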