From: Svjatoslav Agejenko Date: Tue, 4 Aug 2020 18:35:25 +0000 (+0300) Subject: Use regular expression tokenizer. WIP X-Git-Url: http://www2.svjatoslav.eu/gitweb/?a=commitdiff_plain;h=e37d8d2a8afaf35a27bc65d8d700eeea5ed5bd46;p=sixth.git Use regular expression tokenizer. WIP --- diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/Document.java b/src/main/java/eu/svjatoslav/sixth/core/document/Document.java index 2385284..e626304 100644 --- a/src/main/java/eu/svjatoslav/sixth/core/document/Document.java +++ b/src/main/java/eu/svjatoslav/sixth/core/document/Document.java @@ -1,12 +1,17 @@ package eu.svjatoslav.sixth.core.document; +import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException; +import eu.svjatoslav.commons.string.tokenizer.Tokenizer; +import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch; import eu.svjatoslav.sixth.core.document.text.FormattedText; +import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP; +import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE; +import static eu.svjatoslav.sixth.core.document.Helper.*; import static eu.svjatoslav.sixth.core.document.text.FormattedText.fromOrg; public class Document { - public final Heading rootHeading = new Heading( fromOrg("") , 0, null); - + public final Heading rootHeading = new Heading( null , 0, null); private Heading currentHeading = rootHeading; public Heading createHeading(FormattedText name, int targetLevel){ @@ -22,9 +27,58 @@ public class Document { return createHeading(name, targetLevel); } - Heading missingIntermediate = new Heading(fromOrg(""), currentHeading.level + 1, currentHeading); - currentHeading.addChild(missingIntermediate); - currentHeading = missingIntermediate; - return createHeading(name, targetLevel); + try { + Heading missingIntermediate = new Heading(fromOrg(""), currentHeading.level + 1, currentHeading); + currentHeading.addChild(missingIntermediate); + currentHeading = missingIntermediate; + return createHeading(name, targetLevel); + } catch (InvalidSyntaxException e) { + throw new IllegalStateException("impossible situation"); + } + } + + public Heading getCurrentHeading(){ + return currentHeading; + } + + private void parseHeading(TokenizerMatch token) throws InvalidSyntaxException { + System.out.println("HEADING!! " + token.token); + int level = token.token.length()-1; +// createHeading(fromOrg(token.reminder), level); + } + + public void parse(String fileContentsAsString) throws InvalidSyntaxException { + final Tokenizer tokenizer = new Tokenizer(fileContentsAsString); + + // Org heading: + // "*** Example Heading 1234" + tokenizer.addTerminator(PRESERVE, "\\*+\\s.*\\r?\\n", TG_HEADING); + + // Org list. Examples: + // " + my list title" + // "+" + tokenizer.addTerminator(PRESERVE, "\\s*(\\+|-)(\\s.*)?\\r?\\n", TG_LIST); + tokenizer.addTerminator(PRESERVE, "\\s+\\*(\\s.*)?\\r?\\n", TG_LIST); + + // DocumentProperty: + // "#+OPTIONS: H:20 num:20" + tokenizer.addTerminator(PRESERVE, "#\\+.+:.*\\r?\\n", TG_DOCUMENT_PROPERTY); + + // newline + tokenizer.addTerminator(DROP,"\\r?\\n", TG_NEWLINE); + + + while (tokenizer.hasMoreContent()) { + final TokenizerMatch tm = tokenizer.getNextToken(); + + if (tm.isGroup(TG_HEADING)){ + parseHeading(tm); + continue; + } + + tokenizer.unreadToken(); + currentHeading.parse(tokenizer); + } + } } diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/Heading.java b/src/main/java/eu/svjatoslav/sixth/core/document/Heading.java index 6159e1d..fe335f5 100644 --- a/src/main/java/eu/svjatoslav/sixth/core/document/Heading.java +++ b/src/main/java/eu/svjatoslav/sixth/core/document/Heading.java @@ -1,16 +1,26 @@ package eu.svjatoslav.sixth.core.document; +import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException; +import eu.svjatoslav.commons.string.tokenizer.Tokenizer; +import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch; import eu.svjatoslav.sixth.core.document.text.FormattedText; import java.util.ArrayList; import java.util.List; +import static eu.svjatoslav.sixth.core.document.Helper.TG_DOCUMENT_PROPERTY; +import static eu.svjatoslav.sixth.core.document.Helper.TG_LIST; +import static eu.svjatoslav.sixth.core.document.text.FormattedText.fromOrg; + public class Heading { public final FormattedText name; public final int level; public final Heading parent; private final List children = new ArrayList<>(); + public final ListElement rootListElement = new ListElement(null, 0, null); + private ListElement currentListElement = rootListElement; + public Heading(FormattedText name, int level, Heading parent){ this.level = level; this.name = name; @@ -25,4 +35,77 @@ public class Heading { return children; } + public String toMD () { + StringBuilder sb = new StringBuilder(); + + if (level > 0) sb.append(enlistTitleInMD()); + + // sb.append(unparsedContent); + + + children.stream().map(Heading::toMD).forEach(sb::append); + + return sb.toString(); + } + + private String enlistTitleInMD() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < level; i++) + sb.append("#"); + + sb.append(" ").append(name.compileMd()).append("\n"); + return sb.toString(); + } + + public ListElement createListElement(FormattedText name, int targetLevel){ + if (currentListElement.level == (targetLevel - 1)){ + ListElement newListElement = new ListElement(name, targetLevel, currentListElement); + currentListElement.addChild(newListElement); + currentListElement = newListElement; + return newListElement; + } + + if (currentListElement.level > (targetLevel - 1)){ + currentListElement = currentListElement.parent; + return createListElement(name, targetLevel); + } + + try { + ListElement missingIntermediate = new ListElement( + fromOrg(""), currentListElement.level + 1, currentListElement); + currentListElement.addChild(missingIntermediate); + currentListElement = missingIntermediate; + return createListElement(name, targetLevel); + } catch (InvalidSyntaxException e) { + throw new IllegalStateException("impossible situation"); + } + } + + public ListElement getCurrentHeading(){ + return currentListElement; + } + + public void parse(Tokenizer tokenizer){ + while (tokenizer.hasMoreContent()) { + final TokenizerMatch tm = tokenizer.getNextToken(); + + if (tm.isGroup(TG_LIST)){ + System.out.println("LIST!: " + tm.token); + continue; + } + + if (tm.isGroup(TG_DOCUMENT_PROPERTY)){ +// System.out.println("DOCUMENT PROPERTY!!!: " + tm.token); + continue; + } + + if (tm.isGroup(null)){ +// System.out.println(" HC: " + tm.token); + continue; + } + + tokenizer.unreadToken(); + break; + } + } } diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/Helper.java b/src/main/java/eu/svjatoslav/sixth/core/document/Helper.java new file mode 100644 index 0000000..2d9c228 --- /dev/null +++ b/src/main/java/eu/svjatoslav/sixth/core/document/Helper.java @@ -0,0 +1,12 @@ +package eu.svjatoslav.sixth.core.document; + +public class Helper { + + public static final String TG_NEWLINE = "newline"; + public static final String TG_HYPERLINK = "hyperlink"; + public static final String TG_HEADING = "heading"; + public static final String TG_LIST = "list"; + public static final String TG_DOCUMENT_PROPERTY = "document property"; + + +} diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/ListElement.java b/src/main/java/eu/svjatoslav/sixth/core/document/ListElement.java new file mode 100644 index 0000000..908b190 --- /dev/null +++ b/src/main/java/eu/svjatoslav/sixth/core/document/ListElement.java @@ -0,0 +1,49 @@ +package eu.svjatoslav.sixth.core.document; + +import eu.svjatoslav.commons.string.tokenizer.Tokenizer; +import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch; +import eu.svjatoslav.sixth.core.document.text.FormattedText; + +import java.util.ArrayList; +import java.util.List; + +import static eu.svjatoslav.sixth.core.document.Helper.TG_DOCUMENT_PROPERTY; + +public class ListElement { + public final FormattedText name; + public final int level; + public final ListElement parent; + private final List children = new ArrayList<>(); + + public ListElement(FormattedText name, int level, ListElement parent){ + this.level = level; + this.name = name; + this.parent = parent; + } + + public void addChild(ListElement listElement){ + children.add(listElement); + } + + public void parse(Tokenizer tokenizer){ + while (tokenizer.hasMoreContent()) { + final TokenizerMatch tm = tokenizer.getNextToken(); + + if (tm.isGroup(TG_DOCUMENT_PROPERTY)){ + System.out.println("DOCUMENT PROPERT!!!: " + tm.token); + continue; + } + + if (tm.isGroup(null)){ + System.out.println(" HC: " + tm.token); + continue; + } + + tokenizer.unreadToken(); + break; + } + } + + + +} diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java b/src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java index 90f1b1a..fbc8320 100644 --- a/src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java +++ b/src/main/java/eu/svjatoslav/sixth/core/document/MdGenerator.java @@ -12,23 +12,10 @@ public class MdGenerator { public void generate(Document document, File file) throws IOException { sb = new StringBuilder(); - enlistHeading(document.rootHeading); + sb.append(document.rootHeading.toMD()); saveToFile(file, sb.toString()); } - private void enlistHeading(Heading heading) { - if (heading.level > 0) enlistHeadingTitle(heading); - - heading.getChildren().forEach(this::enlistHeading); - } - - private void enlistHeadingTitle(Heading heading) { - for (int i = 0; i < heading.level; i++) - sb.append("#"); - - sb.append(" ").append(heading.name.compileMd()).append("\n"); - } - } diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java b/src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java index fdbd41b..8822d21 100644 --- a/src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java +++ b/src/main/java/eu/svjatoslav/sixth/core/document/OrgParser.java @@ -1,17 +1,11 @@ package eu.svjatoslav.sixth.core.document; -import eu.svjatoslav.commons.string.String2; import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException; -import eu.svjatoslav.commons.string.tokenizer.Tokenizer; -import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch; import java.io.File; import java.io.IOException; import static eu.svjatoslav.commons.file.IOHelper.getFileContentsAsString; -import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.DROP; -import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE; -import static eu.svjatoslav.sixth.core.document.text.FormattedText.fromOrg; public class OrgParser { @@ -20,125 +14,11 @@ public class OrgParser { public Document parse(File file) throws IOException, InvalidSyntaxException { document = new Document(); - Tokenizer lineTokenizer = getFileToLineTokenizer(getFileContentsAsString(file)); + String fileContentsAsString = getFileContentsAsString(file); - while (true) { - final TokenizerMatch line = lineTokenizer.getNextToken(); - if (line == null) - break; // EOF - - parseLine(line.token); - } + document.parse(fileContentsAsString); return document; } - private void parseLine(String line) throws InvalidSyntaxException { - Tokenizer lineTokenizer = getLineTokenizer(line + "\n"); - - TokenizerMatch token = lineTokenizer.getNextToken(); - if (token == null) return; - - if (token.terminator == null) - return; - - if (token.token.startsWith("*")){ - parseHeading(token); - return; - } - } - - private void parseHeading(TokenizerMatch token) { - int level = token.token.length()-1; - document.createHeading(fromOrg(token.reminder), level); - } - - private Tokenizer getLineTokenizer(String contents) { - final Tokenizer tokenizer = new Tokenizer(contents); - for (int i = 1; i<50; i++){ - String prefix = new String2("*").repeat(i).toString(); - tokenizer.addTerminator(prefix +" ","\n", PRESERVE); - } - return tokenizer; - } - - private Tokenizer getFileToLineTokenizer(String contents) { - final Tokenizer tokenizer = new Tokenizer(contents); - - // empty space -// tokenizer.addTerminator(" ", DROP); -// tokenizer.addTerminator("\t", DROP); -// tokenizer.addTerminator("\n", DROP); - - // newline - tokenizer.addTerminator("\n", DROP); - -// tokenizer.addTerminator(";", PRESERVE); -// tokenizer.addTerminator("{", PRESERVE); -// tokenizer.addTerminator("}", PRESERVE); -// tokenizer.addTerminator("(", PRESERVE); -// tokenizer.addTerminator(")", PRESERVE); -// tokenizer.addTerminator("[", PRESERVE); -// tokenizer.addTerminator("]", PRESERVE); -// tokenizer.addTerminator("<", PRESERVE); -// tokenizer.addTerminator(">", PRESERVE); -// tokenizer.addTerminator(",", PRESERVE); -// tokenizer.addTerminator("@", PRESERVE); - - // comments -// tokenizer.addTerminator("//", "\n", DROP); -// tokenizer.addTerminator("/*", "*/", DROP); - return tokenizer; - } -// -// private void parseImport(final Tokenizer tokenizer) -// throws InvalidSyntaxException { -// -// final Import imp = new Import(); -// -// final TokenizerMatch match = tokenizer.getNextToken(); -// -// if (match.token.equals("static")) { -// imp.isStatic = true; -// imp.path = tokenizer.getNextToken().token; -// } else -// imp.path = match.token; -// -// imports.add(imp); -// -// tokenizer.expectAndConsumeNextToken(";"); -// } -// -// private void parseInterface(final Tokenizer tokenizer) -// throws InvalidSyntaxException { -// -// final TokenizerMatch match = tokenizer.getNextToken(); -// final Clazz clazz = new Clazz(packageName, match.token, tokenizer, true); -// // System.out.println(clazz.toString()); -// classes.add(clazz); -// } -// -// private void parsePackage(final Tokenizer tokenizer) -// throws InvalidSyntaxException { -// -// final TokenizerMatch match = tokenizer.getNextToken(); -// -// packageName = match.token; -// -// tokenizer.expectAndConsumeNextToken(";"); -// } - - public void skipUntilSemicolon(final Tokenizer tokenizer) throws InvalidSyntaxException { - while (true) { - final TokenizerMatch token = tokenizer.getNextToken(); - - if (token == null) - return; - - if (token.token.equals(";")) - return; - } - } - - } diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/content/Content.java b/src/main/java/eu/svjatoslav/sixth/core/document/content/Content.java new file mode 100644 index 0000000..d57b01b --- /dev/null +++ b/src/main/java/eu/svjatoslav/sixth/core/document/content/Content.java @@ -0,0 +1,4 @@ +package eu.svjatoslav.sixth.core.document.content; + +public interface Content { +} diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/content/PropertyCollection.java b/src/main/java/eu/svjatoslav/sixth/core/document/content/PropertyCollection.java new file mode 100644 index 0000000..f3840b4 --- /dev/null +++ b/src/main/java/eu/svjatoslav/sixth/core/document/content/PropertyCollection.java @@ -0,0 +1,10 @@ +package eu.svjatoslav.sixth.core.document.content; + +import java.util.HashMap; +import java.util.Map; + +public class PropertyCollection implements Content { + + private Map propertyToValue = new HashMap<>(); + +} diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/content/TextBlock.java b/src/main/java/eu/svjatoslav/sixth/core/document/content/TextBlock.java new file mode 100644 index 0000000..e5bd4ff --- /dev/null +++ b/src/main/java/eu/svjatoslav/sixth/core/document/content/TextBlock.java @@ -0,0 +1,7 @@ +package eu.svjatoslav.sixth.core.document.content; + +import eu.svjatoslav.sixth.core.document.text.FormattedText; + +public class TextBlock implements Content { + private FormattedText text; +} diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java b/src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java index b9d42a2..ae6a421 100644 --- a/src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java +++ b/src/main/java/eu/svjatoslav/sixth/core/document/text/FormattedText.java @@ -1,17 +1,34 @@ package eu.svjatoslav.sixth.core.document.text; +import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException; +import eu.svjatoslav.commons.string.tokenizer.Tokenizer; +import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch; + import java.util.ArrayList; import java.util.List; public class FormattedText { List elements = new ArrayList<>(); - public void parseOrgSyntax(String orgText){ - PlainText plainText = new PlainText(orgText); - elements.add(plainText); + public void parseOrgSyntax(String orgText) throws InvalidSyntaxException { + + Tokenizer tokenizer = getTokenizer(orgText); + while (tokenizer.hasMoreContent()) { + final TokenizerMatch token = tokenizer.getNextToken(); + + if (token.terminator == Hyperlink.orgTerminator){ + elements.add(Hyperlink.fromOrg(token)); + continue; + } + + PlainText plainText = new PlainText(token.token); + elements.add(plainText); + } + } - public static FormattedText fromOrg(String orgText){ + + public static FormattedText fromOrg(String orgText) throws InvalidSyntaxException { FormattedText formattedText = new FormattedText(); formattedText.parseOrgSyntax(orgText); return formattedText; @@ -25,4 +42,11 @@ public class FormattedText { return sb.toString(); } + + private Tokenizer getTokenizer(String contents) { + final Tokenizer tokenizer = new Tokenizer(contents); + tokenizer.addTerminator(Hyperlink.orgTerminator); + return tokenizer; + } + } diff --git a/src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java b/src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java index 9f64b09..5e6884c 100644 --- a/src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java +++ b/src/main/java/eu/svjatoslav/sixth/core/document/text/Hyperlink.java @@ -1,9 +1,41 @@ package eu.svjatoslav.sixth.core.document.text; +import eu.svjatoslav.commons.string.tokenizer.InvalidSyntaxException; +import eu.svjatoslav.commons.string.tokenizer.Terminator; +import eu.svjatoslav.commons.string.tokenizer.TokenizerMatch; + +import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE; +import static eu.svjatoslav.sixth.core.document.Helper.TG_HYPERLINK; + public class Hyperlink implements FormattedTextElement { + public static final Terminator orgTerminator = + new Terminator(PRESERVE, "\\[\\[.*\\]\\]", TG_HYPERLINK); + + private String label; + private String URL; + @Override public String compileMd() { - return "-TODO-"; + return ""; } + + public static Hyperlink fromOrg(TokenizerMatch tokenizerMatch) throws InvalidSyntaxException { + Hyperlink hyperlink = new Hyperlink(); + hyperlink.parseOrgSyntax(tokenizerMatch); + return hyperlink; + } + + private void parseOrgSyntax(TokenizerMatch tokenizerMatch) throws InvalidSyntaxException { +// Tokenizer tokenizer = new Tokenizer(tokenizerMatch.reminder); +// Terminator linkSeparator = tokenizer.addTerminator("][", PRESERVE); +// +// URL = tokenizer.expectAndConsumeNextTerminatorToken(null).token; +// +// if (tokenizer.hasMoreContent()){ // link label is optional +// tokenizer.expectAndConsumeNextTerminatorToken(linkSeparator); +// label = tokenizer.expectAndConsumeNextTerminatorToken(null).token; +// } + } + }