Handle complex content preserving terminators.
author: Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Thu, 12 Oct 2017 09:29:33 +0000 (12:29 +0300)
committer: Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Thu, 12 Oct 2017 09:29:33 +0000 (12:29 +0300)
src/main/java/eu/svjatoslav/commons/string/tokenizer/Terminator.java
src/main/java/eu/svjatoslav/commons/string/tokenizer/Tokenizer.java
src/test/java/eu/svjatoslav/commons/string/tokenizer/TokenizerTest.java

index a298538..c1d1983 100755 (executable)
@@ -45,6 +45,10 @@ public class Terminator {
         DROP
     }
 
+    public boolean hasEndSequence(){
+        return endSequence != null;
+    }
+
     @Override
     public String toString() {
         return "Terminator{" +
index e92ccd7..939ede9 100755 (executable)
@@ -57,45 +57,61 @@ public class Tokenizer {
                     + "\" but got \"" + match.token + "\" instead.");
     }
 
-    public TokenizerMatch getNextToken() {
+    public TokenizerMatch getNextToken() throws InvalidSyntaxException {
         tokenIndexes.push(currentIndex);
 
         StringBuilder token = new StringBuilder();
 
         while (true){
-            if (isTokenTermination()){
-                Terminator tokenTerminator = findTokenTerminator();
-
-                if (tokenTerminator.termination == PRESERVE){
-                    if (hasAccumulatedToken(token)){
-                        // already assembled some token
-                        return new TokenizerMatch(token.toString(), "", tokenTerminator);
-                    } else {
-                        currentIndex++;
-                        return new TokenizerMatch(tokenTerminator.startSequence, "", tokenTerminator);
-                    }
-                } else if (tokenTerminator.termination == DROP){
-                    if (hasAccumulatedToken(token)){
-                        currentIndex++;
-                        return new TokenizerMatch(token.toString(), "", tokenTerminator);
-                    } else {
-                        currentIndex++;
-                    }
-                }
-            } else {
+            if (isOngoingToken()) {
                 token.append(source.charAt(currentIndex));
                 currentIndex++;
+                continue;
+            }
+
+            Terminator tokenTerminator = findTokenTerminator();
+
+            if (tokenTerminator.termination == PRESERVE){
+                return buildPreservedToken(token, tokenTerminator);
+            } else if (tokenTerminator.termination == DROP){
+                if (hasAccumulatedToken(token)){
+                    currentIndex++;
+                    return new TokenizerMatch(token.toString(), "", tokenTerminator);
+                } else {
+                    currentIndex++;
+                }
             }
         }
 
     }
 
+    private TokenizerMatch buildPreservedToken(StringBuilder token, Terminator terminator) throws InvalidSyntaxException {
+        if (hasAccumulatedToken(token))
+            return new TokenizerMatch(token.toString(), "", terminator);
+
+        if (terminator.hasEndSequence()){
+            int endSequenceIndex = source.indexOf(terminator.endSequence,
+                    currentIndex + terminator.startSequence.length());
+
+            if (endSequenceIndex < 0)
+                throw new InvalidSyntaxException("Expected \"" + terminator.endSequence + "\" but not found.");
+
+            String reminder = source.substring(currentIndex + terminator.startSequence.length(), endSequenceIndex);
+            currentIndex = endSequenceIndex + terminator.endSequence.length();
+
+            return new TokenizerMatch(terminator.startSequence, reminder, terminator);
+        } else {
+            currentIndex += terminator.startSequence.length();
+            return new TokenizerMatch(terminator.startSequence, "", terminator);
+        }
+    }
+
     private boolean hasAccumulatedToken(StringBuilder token) {
         return token.length() > 0;
     }
 
-    private boolean isTokenTermination() {
-        return findTokenTerminator() != null;
+    private boolean isOngoingToken() {
+        return findTokenTerminator() == null;
     }
 
     public Terminator findTokenTerminator() {
@@ -105,7 +121,7 @@ public class Tokenizer {
         return null;
     }
 
-    public boolean consumeIfNextToken(final String token) {
+    public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
         if (token.equals(getNextToken().token))
             return true;
 
@@ -113,13 +129,13 @@ public class Tokenizer {
         return false;
     }
 
-    public TokenizerMatch peekNextToken(){
+    public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
         TokenizerMatch result = getNextToken();
         unreadToken();
         return result;
     }
 
-    public boolean peekIsOneOf(String ... possibilities){
+    public boolean peekIsOneOf(String ... possibilities) throws InvalidSyntaxException {
         String nextToken = peekNextToken().token;
         return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
     }
index e72b936..ddb2662 100644 (file)
@@ -2,6 +2,7 @@ package eu.svjatoslav.commons.string.tokenizer;
 
 import org.junit.Test;
 
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
 import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
 import static org.junit.Assert.assertEquals;
 
@@ -20,7 +21,7 @@ public class TokenizerTest {
     @Test
     public void you_can_peek() throws Exception {
         Tokenizer tokenizer = new Tokenizer("this is a N'2015-03-18 09:48:54.360' test")
-                .addTerminator(" ", Terminator.TerminationStrategy.DROP)
+                .addTerminator(" ", DROP)
                 .addTerminator("N'", "'", PRESERVE);
 
         tokenizer.expectAndConsumeNextToken("this");
@@ -34,14 +35,15 @@ public class TokenizerTest {
 
     @Test
     public void complexTerminator() throws Exception {
-        Tokenizer tokenizer = new Tokenizer("   this((\"hello\"  /* comment */   ((  is a N'2015-03-18 09:48:54.360' test")
-                .addTerminator(" ", Terminator.TerminationStrategy.DROP)
-                .addTerminator("(", Terminator.TerminationStrategy.PRESERVE)
-                .addTerminator("\"", "\"" ,Terminator.TerminationStrategy.PRESERVE)
-                .addTerminator("/*", "*/" ,Terminator.TerminationStrategy.DROP)
+        Tokenizer tokenizer = new Tokenizer("/* hello */ /** comment **/   ((  is a N'2015-03-18 09:48:54.360' test")
+                .addTerminator(" ", DROP)
+                .addTerminator("(", PRESERVE)
+                .addTerminator("\"", "\"" , PRESERVE)
+                .addTerminator("/*", "*/" , PRESERVE)
                 ;
 
-        System.out.println(tokenizer.getNextToken().token);
+        TokenizerMatch nextToken = tokenizer.getNextToken();
+        System.out.println("T: \"" + nextToken.token + "\", R: \"" + nextToken.reminder + "\"");
         System.out.println(tokenizer.getNextToken().token);
         System.out.println(tokenizer.getNextToken().token);
         System.out.println(tokenizer.getNextToken().token);
@@ -65,7 +67,7 @@ public class TokenizerTest {
     @Test
     public void testComplexTerminator() throws Exception {
         Tokenizer tokenizer = new Tokenizer("this N'2015-03-18 09:48:54.360'  /* thoe unto u */ test")
-                .addTerminator(" ", Terminator.TerminationStrategy.DROP)
+                .addTerminator(" ", DROP)
                 .addTerminator("/*", "*/", PRESERVE);
 
 //        tokenizer.expectAndConsumeNextToken("this");