From 9bb7c6bf73ebbcbc66f4abd0fabf2f698c42d42c Mon Sep 17 00:00:00 2001 From: Svjatoslav Agejenko Date: Tue, 4 Aug 2020 21:30:20 +0300 Subject: [PATCH] Use regular expressions as terminators --- .../commons/string/tokenizer/Terminator.java | 61 +++---- .../commons/string/tokenizer/Tokenizer.java | 158 ++++++------------ .../string/tokenizer/TokenizerMatch.java | 27 ++- .../string/tokenizer/TerminatorTest.java | 10 +- .../string/tokenizer/TokenizerTest.java | 57 +++---- 5 files changed, 134 insertions(+), 179 deletions(-) diff --git a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Terminator.java b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Terminator.java index 1a6c5ee..8946b32 100755 --- a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Terminator.java +++ b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Terminator.java @@ -4,52 +4,53 @@ */ package eu.svjatoslav.commons.string.tokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + public class Terminator { - public final String startSequence; - public final String endSequence; + String regexp; public final TerminationStrategy termination; + public final String group; + public boolean active = true; + public final Pattern pattern; - public Terminator(final String startSequence, TerminationStrategy termination) { - this.startSequence = startSequence; - this.endSequence = null; - this.termination = termination; - } - - public Terminator(final String startSequence, final String endSequence, TerminationStrategy termination) { - this.startSequence = startSequence; - this.endSequence = endSequence; + public Terminator(TerminationStrategy termination, String regexp, String group) { this.termination = termination; + this.group = group; + this.regexp = regexp; + this.pattern = Pattern.compile("^"+regexp); } - public boolean matches(String source, int index) { - // boundary check - if (source.length() < (index + startSequence.length())) - return false; - - // match check - for (int i = 0; i < startSequence.length(); i++) - if (startSequence.charAt(i) != source.charAt(index + i)) - return false; - - return true; - } - - public boolean hasEndSequence() { - return endSequence != null; + public Matcher match(String source, int index) { + Matcher matcher = pattern.matcher(source); + matcher.region(index, source.length()); + return matcher; } @Override public String toString() { return "Terminator{" + - "startSequence='" + startSequence + '\'' + - ", endSequence='" + endSequence + '\'' + + "regexp='" + regexp + '\'' + ", termination=" + termination + + ", group='" + group + '\'' + + ", active=" + active + '}'; } public enum TerminationStrategy { - PRESERVE, // Identify and return such tokens for further processing. - DROP // Identify but ignore such tokens, do not return them. Good for handling comments in scripts. + /** + * Preserve token that is identified within Terminator and return it for processing. For example when + * building language parser, it could be used for statements that you want to capture. + */ + PRESERVE, + + /** + * While tokens that are marked by Terminator are identified, they are dropped and not returned for consumption. + * For example, when building language parser, you might use such strategy for whitespace and comments. + * That is, those tokens act as separators between actually useful tokens, but you don't want to consume such + * separators or comments in your code. + */ + DROP } } diff --git a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java index cc20369..6aa56b4 100755 --- a/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java +++ b/src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java @@ -7,9 +7,9 @@ package eu.svjatoslav.commons.string.tokenizer; import java.util.ArrayList; import java.util.List; import java.util.Stack; +import java.util.regex.Matcher; import java.util.stream.Stream; -import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP; import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE; import static java.lang.System.out; @@ -29,9 +29,6 @@ public class Tokenizer { private int currentIndex = 0; - private int cachedTerminatorIndex = -1; - private Terminator cachedTerminator; - public Tokenizer(final String source) { this.source = source; } @@ -43,30 +40,29 @@ public class Tokenizer { this.source = source; currentIndex = 0; tokenIndexes.clear(); - - cachedTerminatorIndex = -1; - cachedTerminator = null; return this; } - public Tokenizer addTerminator(final String startSequence, - final Terminator.TerminationStrategy terminationStrategy) { - terminators.add(new Terminator(startSequence, terminationStrategy)); - return this; + public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) { + Terminator terminator = new Terminator(terminationStrategy, regexp,null); + terminators.add(terminator); + return terminator; } - public Tokenizer addTerminator(Terminator terminator) { + public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, + String regexp, String group) { + Terminator terminator = new Terminator(terminationStrategy, regexp,group); terminators.add(terminator); - return this; + return terminator; } - public Tokenizer addTerminator(final String startSequence, - final String endSequence, final Terminator.TerminationStrategy terminationStrategy) { - terminators.add(new Terminator(startSequence, endSequence, terminationStrategy)); - return this; + + public Terminator addTerminator(Terminator terminator) { + terminators.add(terminator); + return terminator; } - public void expectAndConsumeNextToken(final String value) + public void expectAndConsumeNextStringToken(final String value) throws InvalidSyntaxException { final TokenizerMatch match = getNextToken(); if (!value.equals(match.token)) @@ -74,11 +70,22 @@ public class Tokenizer { + "\" but got \"" + match.token + "\" instead."); } + public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator) + throws InvalidSyntaxException { + final TokenizerMatch match = getNextToken(); + + if (match.terminator != terminator) + throw new InvalidSyntaxException("Expected terminator \"" + terminator + + "\" but got \"" + match.terminator + "\" instead."); + + return match; + } + + /** * @return next @TokenizerMatch or null if end of input is reached. - * @throws InvalidSyntaxException */ - public TokenizerMatch getNextToken() throws InvalidSyntaxException { + public TokenizerMatch getNextToken() { tokenIndexes.push(currentIndex); StringBuilder tokenAccumulator = new StringBuilder(); @@ -92,104 +99,49 @@ public class Tokenizer { return null; } - if (isOngoingToken()) { + TokenizerMatch matchResult = findTerminatorMatch(); + if (matchResult == null) { tokenAccumulator.append(source.charAt(currentIndex)); currentIndex++; continue; } - Terminator terminator = getOrFindTokenTerminator(); + if (matchResult.terminator.termination == PRESERVE) { + if (hasAccumulatedToken(tokenAccumulator)) + return new TokenizerMatch(tokenAccumulator.toString(), null, null); - if (terminator.termination == PRESERVE) - return buildPreservedToken(tokenAccumulator, terminator); - else if (terminator.termination == DROP) { - skipUntilTerminatorEnd(terminator); + currentIndex = matchResult.matcher.end(); + return matchResult; + } else { + currentIndex = matchResult.matcher.end(); if (hasAccumulatedToken(tokenAccumulator)) - return new TokenizerMatch(tokenAccumulator.toString(), null, terminator); + return new TokenizerMatch(tokenAccumulator.toString(), null, null); } } - } - private void skipUntilTerminatorEnd(Terminator terminator) throws InvalidSyntaxException { - if (terminator.hasEndSequence()) - currentIndex = getEndSequenceIndex(terminator) + terminator.endSequence.length(); - else - currentIndex += terminator.startSequence.length(); - } - - /** - * @throws InvalidSyntaxException if end sequence is not found as is expected by given token. - */ - private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) - throws InvalidSyntaxException { - if (hasAccumulatedToken(token)) - return new TokenizerMatch(token.toString(), null, terminator); - - if (terminator.hasEndSequence()) - return buildTokenWithExpectedENdSequence(terminator); - else - return buildTokenWithoutEndSequence(terminator); - } - - private TokenizerMatch buildTokenWithoutEndSequence(Terminator terminator) { - currentIndex += terminator.startSequence.length(); - return new TokenizerMatch(terminator.startSequence, null, terminator); - } - - private TokenizerMatch buildTokenWithExpectedENdSequence(Terminator terminator) throws InvalidSyntaxException { - int endSequenceIndex = getEndSequenceIndex(terminator); - String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex); - currentIndex = endSequenceIndex + terminator.endSequence.length(); - - return new TokenizerMatch(terminator.startSequence, reminder, terminator); - } - - /** - * @throws InvalidSyntaxException if end of input is reached without finding expected end sequence. - */ - private int getEndSequenceIndex(Terminator terminator) throws InvalidSyntaxException { - int endSequenceIndex = source.indexOf(terminator.endSequence, - currentIndex + terminator.startSequence.length()); - - if (endSequenceIndex < 0) - throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found."); - - return endSequenceIndex; - } - - private boolean hasAccumulatedToken(StringBuilder token) { - return token.length() > 0; + public TokenizerMatch findTerminatorMatch(){ + for (Terminator terminator : terminators) + if (terminator.active) { + Matcher match = terminator.match(source, currentIndex); + if (match.find()) { + String token = source.substring(match.start(), match.end()); + return new TokenizerMatch(token, terminator, match); + } + } + return null; } - private boolean isOngoingToken() { - return getOrFindTokenTerminator() == null; + private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) { + return tokenAccumulator.length() > 0; } public boolean hasMoreContent() { + if (source == null) return false; return currentIndex < source.length(); } - /** - * Attempts to cache terminator search result. - */ - public Terminator getOrFindTokenTerminator() { - if (currentIndex == cachedTerminatorIndex) - return cachedTerminator; - - cachedTerminatorIndex = currentIndex; - cachedTerminator = findTokenTerminator(); - return cachedTerminator; - } - - private Terminator findTokenTerminator() { - for (Terminator terminator : terminators) - if (terminator.matches(source, currentIndex)) - return terminator; - return null; - } - public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException { if (token.equals(getNextToken().token)) return true; @@ -224,13 +176,9 @@ public class Tokenizer { public void enlistRemainingTokens(){ int redTokenCount = 0; - try { - while (hasMoreContent()) { - out.println(getNextToken().toString()); - redTokenCount++; - } - } catch (InvalidSyntaxException e){ - out.println("There is syntax exception"); + while (hasMoreContent()) { + out.println(getNextToken().toString()); + redTokenCount++; } // restore pointer to original location diff --git a/src/main/java/eu/svjatoslav/commons/string/tokenizer/TokenizerMatch.java b/src/main/java/eu/svjatoslav/commons/string/tokenizer/TokenizerMatch.java index a620010..2b9b1f8 100755 --- a/src/main/java/eu/svjatoslav/commons/string/tokenizer/TokenizerMatch.java +++ b/src/main/java/eu/svjatoslav/commons/string/tokenizer/TokenizerMatch.java @@ -4,23 +4,42 @@ */ package eu.svjatoslav.commons.string.tokenizer; +import java.util.regex.Matcher; + public class TokenizerMatch { public final String token; - public final String reminder; + + /** + * {@link Terminator} that matched current token + */ public final Terminator terminator; - public TokenizerMatch(final String token, final String reminder, final Terminator terminator) { + public final Matcher matcher; + + + public TokenizerMatch(final String token, final Terminator terminator, Matcher matcher) { this.token = token; - this.reminder = reminder; this.terminator = terminator; + this.matcher = matcher; + } + + public boolean isGroup(String group){ + if (terminator == null){ + return group == null; + } + + if (terminator.group == null){ + return group == null; + } + + return terminator.group.equals(group); } @Override public String toString() { return "TokenizerMatch{" + "token='" + token + '\'' + - ", reminder='" + reminder + '\'' + ", terminator=" + terminator + '}'; } diff --git a/src/test/java/eu/svjatoslav/commons/string/tokenizer/TerminatorTest.java b/src/test/java/eu/svjatoslav/commons/string/tokenizer/TerminatorTest.java index 1b6173a..31aa3b1 100644 --- a/src/test/java/eu/svjatoslav/commons/string/tokenizer/TerminatorTest.java +++ b/src/test/java/eu/svjatoslav/commons/string/tokenizer/TerminatorTest.java @@ -10,16 +10,18 @@ public class TerminatorTest { @Test public void testMatches() { Terminator terminator = new Terminator( - "/*", "*/", Terminator.TerminationStrategy.PRESERVE); + Terminator.TerminationStrategy.PRESERVE, + "/\\*.+\\*/", + "test"); // must find - assertTrue(terminator.matches("/* bla bla bla */", 0)); + assertTrue(terminator.match("/* bla bla bla */", 0).find()); // must not find - assertFalse(terminator.matches("/* bla bla bla */", 1)); + assertFalse(terminator.match("/* bla bla bla */", 1).find()); // must not overflow - assertFalse(terminator.matches("/", 0)); + assertFalse(terminator.match("/", 0).find()); } } \ No newline at end of file diff --git a/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java b/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java index 9f35367..519b6d1 100644 --- a/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java +++ b/src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java @@ -10,57 +10,42 @@ public class TokenizerTest { @Test public void testPeeking() throws Exception { - Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test") - .addTerminator(" ", DROP) - .addTerminator("N'", "'", PRESERVE); + Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test"); + tokenizer.addTerminator(DROP, "\\s"); + tokenizer.addTerminator(PRESERVE, "N'.*'"); - tokenizer.expectAndConsumeNextToken("this"); + tokenizer.expectAndConsumeNextStringToken("this"); assertEquals("is", tokenizer.peekNextToken().token); assertEquals("is", tokenizer.peekNextToken().token); - assertEquals(true, tokenizer.peekIsOneOf("maybe", "is", "that")); + assertTrue(tokenizer.peekIsOneOf("maybe", "is", "that")); } @Test public void testTokenization() throws Exception { - Tokenizer tokenizer = new Tokenizer("\"hello\" /** comment **/ (( is a N'2015-03-18 09:48:54.360' test") - .addTerminator(" ", DROP) - .addTerminator("(", PRESERVE) - .addTerminator("\"", "\"", PRESERVE) - .addTerminator("N'", "'", PRESERVE) - .addTerminator("/*", "*/", DROP); - - assertTokenEquals("\"", "hello", tokenizer); - assertTokenEquals("(", null, tokenizer); - assertTokenEquals("(", null, tokenizer); - assertTokenEquals("is", null, tokenizer); - assertTokenEquals("a", null, tokenizer); - assertTokenEquals("N'", "2015-03-18 09:48:54.360", tokenizer); - assertTokenEquals("test", null, tokenizer); + Tokenizer tokenizer = new Tokenizer("\"hello world\" /** comment **/ (( is a N'2015-03-18 09:48:54.360' test"); + tokenizer.addTerminator(DROP,"\\s"); + tokenizer.addTerminator(PRESERVE,"\\("); + tokenizer.addTerminator(PRESERVE, "\\\".*\\\""); + tokenizer.addTerminator(PRESERVE, "N'.*'"); + tokenizer.addTerminator(DROP,"/\\*.*\\*/"); + + assertTokenEquals("\"hello world\"", tokenizer); + assertTokenEquals("(", tokenizer); + assertTokenEquals("(", tokenizer); + assertTokenEquals("is", tokenizer); + assertTokenEquals("a", tokenizer); + assertTokenEquals("N'2015-03-18 09:48:54.360'", tokenizer); + assertTokenEquals("test", tokenizer); assertNull(tokenizer.getNextToken()); assertFalse(tokenizer.hasMoreContent()); } - private void assertTokenEquals(String token, String reminder, Tokenizer tokenizer) throws InvalidSyntaxException { - TokenizerMatch nextToken = tokenizer.getNextToken(); - - assertEquals(token, nextToken.token); - - if (reminder == null) - assertNull(nextToken.reminder); - else - assertEquals(reminder, nextToken.reminder); - } - - private void debugNextToken(Tokenizer tokenizer) throws InvalidSyntaxException { - TokenizerMatch nextToken = tokenizer.getNextToken(); - if (nextToken == null) - System.out.println("null"); - else - System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\""); + private void assertTokenEquals(String expectedValue, Tokenizer tokenizer){ + assertEquals(expectedValue, tokenizer.getNextToken().token); } } \ No newline at end of file -- 2.20.1