Use regular expression tokenizer. WIP
author Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Tue, 4 Aug 2020 18:35:25 +0000 (21:35 +0300)
committer Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Tue, 4 Aug 2020 18:35:25 +0000 (21:35 +0300)
src/main/java/eu/svjatoslav/sixth/core/document/Document.java
src/main/java/eu/svjatoslav/sixth/core/document/Heading.java
src/main/java/eu/svjatoslav/sixth/core/document/Helper.java [new file with mode: 0644]
src/main/java/eu/svjatoslav/sixth/core/document/ListElement.java [new file with mode: 0644]
src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java
src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java
src/main/java/eu/svjatoslav/sixth/core/document/content/Content.java [new file with mode: 0644]
src/main/java/eu/svjatoslav/sixth/core/document/content/PropertyCollection.java [new file with mode: 0644]
src/main/java/eu/svjatoslav/sixth/core/document/content/TextBlock.java [new file with mode: 0644]
src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java
src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java

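diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/Document.java b/src/main/java/eu/svjatoslav/sixth/core/document/Document.java
index 2385284..e626304 100644 (file)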
@@ -1,12 +1,17 @@
 package eu.svjatoslav.sixth.core.document;
 
+import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException;
+import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
+import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;
 import eu.svjatoslav.sixth.core.document.text.FormattedText;
 
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static eu.svjatoslav.sixth.core.document.Helper.*;
 import static eu.svjatoslav.sixth.core.document.text.FormattedText.fromOrg;
 
 public class Document {
-    public final Heading rootHeading = new Heading( fromOrg("<root>") , 0, null);
-
+    public final Heading rootHeading = new Heading( null , 0, null);
     private Heading currentHeading = rootHeading;
 
     public Heading createHeading(FormattedText name, int targetLevel){
@@ -22,9 +27,58 @@ public class Document {
             return createHeading(name, targetLevel);
         }
 
-        Heading missingIntermediate = new Heading(fromOrg("<noname>"), currentHeading.level + 1, currentHeading);
-        currentHeading.addChild(missingIntermediate);
-        currentHeading = missingIntermediate;
-        return createHeading(name, targetLevel);
+        try {
+            Heading missingIntermediate = new Heading(fromOrg("<noname>"), currentHeading.level + 1, currentHeading);
+            currentHeading.addChild(missingIntermediate);
+            currentHeading = missingIntermediate;
+            return createHeading(name, targetLevel);
+        } catch (InvalidSyntaxException e) {
+            throw new IllegalStateException("impossible situation");
+        }
+    }
+
+    public Heading getCurrentHeading(){
+        return currentHeading;
+    }
+
+    private void parseHeading(TokenizerMatch token) throws InvalidSyntaxException {
+        System.out.println("HEADING!! " + token.token);
+        int level = token.token.length()-1;
+//        createHeading(fromOrg(token.reminder), level);
+    }
+
+    public void parse(String fileContentsAsString) throws InvalidSyntaxException {
+        final Tokenizer tokenizer = new Tokenizer(fileContentsAsString);
+
+        // Org heading:
+        // "*** Example Heading 1234"
+        tokenizer.addTerminator(PRESERVE, "\\*+\\s.*\\r?\\n", TG_HEADING);
+
+        // Org list. Examples:
+        // "   + my list title"
+        // "+"
+        tokenizer.addTerminator(PRESERVE, "\\s*(\\+|-)(\\s.*)?\\r?\\n", TG_LIST);
+        tokenizer.addTerminator(PRESERVE, "\\s+\\*(\\s.*)?\\r?\\n", TG_LIST);
+
+        // DocumentProperty:
+        // "#+OPTIONS: H:20 num:20"
+        tokenizer.addTerminator(PRESERVE, "#\\+.+:.*\\r?\\n", TG_DOCUMENT_PROPERTY);
+
+        // newline
+        tokenizer.addTerminator(DROP,"\\r?\\n", TG_NEWLINE);
+
+
+        while (tokenizer.hasMoreContent()) {
+            final TokenizerMatch tm = tokenizer.getNextToken();
+
+            if (tm.isGroup(TG_HEADING)){
+                parseHeading(tm);
+                continue;
+            }
+
+            tokenizer.unreadToken();
+            currentHeading.parse(tokenizer);
+        }
+
     }
 }
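diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/Heading.java b/src/main/java/eu/svjatoslav/sixth/core/document/Heading.java
index 6159e1d..fe335f5 100644 (file)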
@@ -1,16 +1,26 @@
 package eu.svjatoslav.sixth.core.document;
 
+import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException;
+import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
+import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;
 import eu.svjatoslav.sixth.core.document.text.FormattedText;
 
 import java.util.ArrayList;
 import java.util.List;
 
+import static eu.svjatoslav.sixth.core.document.Helper.TG_DOCUMENT_PROPERTY;
+import static eu.svjatoslav.sixth.core.document.Helper.TG_LIST;
+import static eu.svjatoslav.sixth.core.document.text.FormattedText.fromOrg;
+
 public class Heading {
     public final FormattedText name;
     public final int level;
     public final Heading parent;
     private final List<Heading> children = new ArrayList<>();
 
+    public final ListElement rootListElement = new ListElement(null, 0, null);
+    private ListElement currentListElement = rootListElement;
+
     public Heading(FormattedText name, int level, Heading parent){
         this.level = level;
         this.name = name;
@@ -25,4 +35,77 @@ public class Heading {
         return children;
     }
 
+    public String toMD () {
+        StringBuilder sb = new StringBuilder();
+
+        if (level > 0) sb.append(enlistTitleInMD());
+
+       // sb.append(unparsedContent);
+
+
+        children.stream().map(Heading::toMD).forEach(sb::append);
+
+        return sb.toString();
+    }
+
+    private String enlistTitleInMD() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < level; i++)
+            sb.append("#");
+
+        sb.append(" ").append(name.compileMd()).append("\n");
+        return sb.toString();
+    }
+
+    public ListElement createListElement(FormattedText name, int targetLevel){
+        if (currentListElement.level == (targetLevel - 1)){
+            ListElement newListElement = new ListElement(name, targetLevel, currentListElement);
+            currentListElement.addChild(newListElement);
+            currentListElement = newListElement;
+            return newListElement;
+        }
+
+        if (currentListElement.level > (targetLevel - 1)){
+            currentListElement = currentListElement.parent;
+            return createListElement(name, targetLevel);
+        }
+
+        try {
+            ListElement missingIntermediate = new ListElement(
+                    fromOrg("<noname>"), currentListElement.level + 1, currentListElement);
+            currentListElement.addChild(missingIntermediate);
+            currentListElement = missingIntermediate;
+            return createListElement(name, targetLevel);
+        } catch (InvalidSyntaxException e) {
+            throw new IllegalStateException("impossible situation");
+        }
+    }
+
+    public ListElement getCurrentHeading(){
+        return currentListElement;
+    }
+
+    public void parse(Tokenizer tokenizer){
+        while (tokenizer.hasMoreContent()) {
+            final TokenizerMatch tm = tokenizer.getNextToken();
+
+            if (tm.isGroup(TG_LIST)){
+                System.out.println("LIST!: " + tm.token);
+                continue;
+            }
+
+            if (tm.isGroup(TG_DOCUMENT_PROPERTY)){
+//                System.out.println("DOCUMENT PROPERTY!!!: " + tm.token);
+                continue;
+            }
+
+            if (tm.isGroup(null)){
+//                System.out.println("    HC: " + tm.token);
+                continue;
+            }
+
+            tokenizer.unreadToken();
+            break;
+        }
+    }
 }
diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/Helper.java b/src/main/java/eu/svjatoslav/sixth/core/document/Helper.java
new file mode 100644 (file)
index 0000000..2d9c228
--- /dev/null
@@ -0,0 +1,12 @@
+package eu.svjatoslav.sixth.core.document;
+
+public class Helper {
+
+    public static final String TG_NEWLINE = "newline";
+    public static final String TG_HYPERLINK = "hyperlink";
+    public static final String TG_HEADING = "heading";
+    public static final String TG_LIST = "list";
+    public static final String TG_DOCUMENT_PROPERTY = "document property";
+
+
+}
diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/ListElement.java b/src/main/java/eu/svjatoslav/sixth/core/document/ListElement.java
new file mode 100644 (file)
index 0000000..908b190
--- /dev/null
@@ -0,0 +1,49 @@
+package eu.svjatoslav.sixth.core.document;
+
+import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
+import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;
+import eu.svjatoslav.sixth.core.document.text.FormattedText;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static eu.svjatoslav.sixth.core.document.Helper.TG_DOCUMENT_PROPERTY;
+
+public class ListElement {
+    public final FormattedText name;
+    public final int level;
+    public final ListElement parent;
+    private final List<ListElement> children = new ArrayList<>();
+
+    public ListElement(FormattedText name, int level, ListElement parent){
+        this.level = level;
+        this.name = name;
+        this.parent = parent;
+    }
+
+    public void addChild(ListElement listElement){
+        children.add(listElement);
+    }
+
+    public void parse(Tokenizer tokenizer){
+        while (tokenizer.hasMoreContent()) {
+            final TokenizerMatch tm = tokenizer.getNextToken();
+
+            if (tm.isGroup(TG_DOCUMENT_PROPERTY)){
+                System.out.println("DOCUMENT PROPERT!!!: " + tm.token);
+                continue;
+            }
+
+            if (tm.isGroup(null)){
+                System.out.println("    HC: " + tm.token);
+                continue;
+            }
+
+            tokenizer.unreadToken();
+            break;
+        }
+    }
+
+
+
+}
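diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java b/src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java
index 90f1b1a..fbc8320 100644 (file)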
@@ -12,23 +12,10 @@ public class MdGenerator {
     public void generate(Document document, File file) throws IOException {
         sb = new StringBuilder();
 
-        enlistHeading(document.rootHeading);
+        sb.append(document.rootHeading.toMD());
 
         saveToFile(file, sb.toString());
     }
 
-    private void enlistHeading(Heading heading) {
-        if (heading.level > 0) enlistHeadingTitle(heading);
-
-        heading.getChildren().forEach(this::enlistHeading);
-    }
-
-    private void enlistHeadingTitle(Heading heading) {
-        for (int i = 0; i < heading.level; i++)
-            sb.append("#");
-
-        sb.append(" ").append(heading.name.compileMd()).append("\n");
-    }
-
 
 }
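diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java b/src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java
index fdbd41b..8822d21 100644 (file)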
@@ -1,17 +1,11 @@
 package eu.svjatoslav.sixth.core.document;
 
-import eu.svjatoslav.commons.string.String2;
 import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException;
-import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
-import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;
 
 import java.io.File;
 import java.io.IOException;
 
 import static eu.svjatoslav.commons.file.IOHelper.getFileContentsAsString;
-import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP;
-import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
-import static eu.svjatoslav.sixth.core.document.text.FormattedText.fromOrg;
 
 public class OrgParser {
 
@@ -20,125 +14,11 @@ public class OrgParser {
     public Document parse(File file) throws IOException, InvalidSyntaxException {
         document = new Document();
 
-        Tokenizer lineTokenizer = getFileToLineTokenizer(getFileContentsAsString(file));
+        String fileContentsAsString = getFileContentsAsString(file);
 
-        while (true) {
-            final TokenizerMatch line = lineTokenizer.getNextToken();
-            if (line == null)
-                break; // EOF
-
-            parseLine(line.token);
-        }
+        document.parse(fileContentsAsString);
 
         return document;
     }
 
-    private void parseLine(String line) throws InvalidSyntaxException {
-        Tokenizer lineTokenizer = getLineTokenizer(line + "\n");
-
-        TokenizerMatch token = lineTokenizer.getNextToken();
-        if (token == null) return;
-
-        if (token.terminator == null)
-            return;
-
-        if (token.token.startsWith("*")){
-            parseHeading(token);
-            return;
-        }
-    }
-
-    private void parseHeading(TokenizerMatch token) {
-        int level = token.token.length()-1;
-        document.createHeading(fromOrg(token.reminder), level);
-   }
-
-    private Tokenizer getLineTokenizer(String contents) {
-        final Tokenizer tokenizer = new Tokenizer(contents);
-        for (int i = 1; i<50; i++){
-            String prefix = new String2("*").repeat(i).toString();
-            tokenizer.addTerminator(prefix +" ","\n", PRESERVE);
-        }
-        return tokenizer;
-    }
-
-    private Tokenizer getFileToLineTokenizer(String contents) {
-        final Tokenizer tokenizer = new Tokenizer(contents);
-
-        // empty space
-//        tokenizer.addTerminator(" ", DROP);
-//        tokenizer.addTerminator("\t", DROP);
-//        tokenizer.addTerminator("\n", DROP);
-
-        // newline
-        tokenizer.addTerminator("\n", DROP);
-
-//        tokenizer.addTerminator(";", PRESERVE);
-//        tokenizer.addTerminator("{", PRESERVE);
-//        tokenizer.addTerminator("}", PRESERVE);
-//        tokenizer.addTerminator("(", PRESERVE);
-//        tokenizer.addTerminator(")", PRESERVE);
-//        tokenizer.addTerminator("[", PRESERVE);
-//        tokenizer.addTerminator("]", PRESERVE);
-//        tokenizer.addTerminator("<", PRESERVE);
-//        tokenizer.addTerminator(">", PRESERVE);
-//        tokenizer.addTerminator(",", PRESERVE);
-//        tokenizer.addTerminator("@", PRESERVE);
-
-        // comments
-//        tokenizer.addTerminator("//", "\n", DROP);
-//        tokenizer.addTerminator("/*", "*/", DROP);
-        return tokenizer;
-    }
-//
-//    private void parseImport(final Tokenizer tokenizer)
-//            throws InvalidSyntaxException {
-//
-//        final Import imp = new Import();
-//
-//        final TokenizerMatch match = tokenizer.getNextToken();
-//
-//        if (match.token.equals("static")) {
-//            imp.isStatic = true;
-//            imp.path = tokenizer.getNextToken().token;
-//        } else
-//            imp.path = match.token;
-//
-//        imports.add(imp);
-//
-//        tokenizer.expectAndConsumeNextToken(";");
-//    }
-//
-//    private void parseInterface(final Tokenizer tokenizer)
-//            throws InvalidSyntaxException {
-//
-//        final TokenizerMatch match = tokenizer.getNextToken();
-//        final Clazz clazz = new Clazz(packageName, match.token, tokenizer, true);
-//        // System.out.println(clazz.toString());
-//        classes.add(clazz);
-//    }
-//
-//    private void parsePackage(final Tokenizer tokenizer)
-//            throws InvalidSyntaxException {
-//
-//        final TokenizerMatch match = tokenizer.getNextToken();
-//
-//        packageName = match.token;
-//
-//        tokenizer.expectAndConsumeNextToken(";");
-//    }
-
-    public void skipUntilSemicolon(final Tokenizer tokenizer) throws InvalidSyntaxException {
-        while (true) {
-            final TokenizerMatch token = tokenizer.getNextToken();
-
-            if (token == null)
-                return;
-
-            if (token.token.equals(";"))
-                return;
-        }
-    }
-
-
 }
diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/content/Content.java b/src/main/java/eu/svjatoslav/sixth/core/document/content/Content.java
new file mode 100644 (file)
index 0000000..d57b01b
--- /dev/null
@@ -0,0 +1,4 @@
+package eu.svjatoslav.sixth.core.document.content;
+
+public interface Content {
+}
diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/content/PropertyCollection.java b/src/main/java/eu/svjatoslav/sixth/core/document/content/PropertyCollection.java
new file mode 100644 (file)
index 0000000..f3840b4
--- /dev/null
@@ -0,0 +1,10 @@
+package eu.svjatoslav.sixth.core.document.content;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class PropertyCollection implements Content {
+
+    private Map<String, String> propertyToValue = new HashMap<>();
+
+}
diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/content/TextBlock.java b/src/main/java/eu/svjatoslav/sixth/core/document/content/TextBlock.java
new file mode 100644 (file)
index 0000000..e5bd4ff
--- /dev/null
@@ -0,0 +1,7 @@
+package eu.svjatoslav.sixth.core.document.content;
+
+import eu.svjatoslav.sixth.core.document.text.FormattedText;
+
+public class TextBlock implements Content {
+    private FormattedText text;
+}
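diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java b/src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java
index b9d42a2..ae6a421 100644 (file)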
@@ -1,17 +1,34 @@
 package eu.svjatoslav.sixth.core.document.text;
 
+import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException;
+import eu.svjatoslav.commons.string.tokenizer.Tokenizer;
+import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;
+
 import java.util.ArrayList;
 import java.util.List;
 
 public class FormattedText {
     List<FormattedTextElement> elements = new ArrayList<>();
 
-    public void parseOrgSyntax(String orgText){
-        PlainText plainText = new PlainText(orgText);
-        elements.add(plainText);
+    public void parseOrgSyntax(String orgText) throws InvalidSyntaxException {
+
+        Tokenizer tokenizer = getTokenizer(orgText);
+        while (tokenizer.hasMoreContent()) {
+            final TokenizerMatch token = tokenizer.getNextToken();
+
+            if (token.terminator == Hyperlink.orgTerminator){
+                elements.add(Hyperlink.fromOrg(token));
+                continue;
+            }
+
+            PlainText plainText = new PlainText(token.token);
+            elements.add(plainText);
+        }
+
     }
 
-    public static FormattedText fromOrg(String orgText){
+
+    public static FormattedText fromOrg(String orgText) throws InvalidSyntaxException {
         FormattedText formattedText = new FormattedText();
         formattedText.parseOrgSyntax(orgText);
         return formattedText;
@@ -25,4 +42,11 @@ public class FormattedText {
 
         return sb.toString();
     }
+
+    private Tokenizer getTokenizer(String contents) {
+        final Tokenizer tokenizer = new Tokenizer(contents);
+        tokenizer.addTerminator(Hyperlink.orgTerminator);
+        return tokenizer;
+    }
+
 }
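diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java b/src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java
index 9f64b09..5e6884c 100644 (file)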
@@ -1,9 +1,41 @@
 package eu.svjatoslav.sixth.core.document.text;
 
+import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException;
+import eu.svjatoslav.commons.string.tokenizer.Terminator;
+import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch;
+
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static eu.svjatoslav.sixth.core.document.Helper.TG_HYPERLINK;
+
 public class Hyperlink implements FormattedTextElement {
 
+    public static final Terminator orgTerminator =
+            new Terminator(PRESERVE, "\\[\\[.*\\]\\]", TG_HYPERLINK);
+
+    private String label;
+    private String URL;
+
     @Override
     public String compileMd() {
-        return "-TODO-";
+        return "<URL: " + URL + ", LABEL: " + label + ">";
     }
+
+    public static Hyperlink fromOrg(TokenizerMatch tokenizerMatch) throws InvalidSyntaxException {
+        Hyperlink hyperlink = new Hyperlink();
+        hyperlink.parseOrgSyntax(tokenizerMatch);
+        return hyperlink;
+    }
+
+    private void parseOrgSyntax(TokenizerMatch tokenizerMatch) throws InvalidSyntaxException {
+//        Tokenizer tokenizer = new Tokenizer(tokenizerMatch.reminder);
+//        Terminator linkSeparator = tokenizer.addTerminator("][", PRESERVE);
+//
+//        URL = tokenizer.expectAndConsumeNextTerminatorToken(null).token;
+//
+//        if (tokenizer.hasMoreContent()){ // link label is optional
+//            tokenizer.expectAndConsumeNextTerminatorToken(linkSeparator);
+//            label = tokenizer.expectAndConsumeNextTerminatorToken(null).token;
+//        }
+    }
+
 }