+/*
+ * Svjatoslav Commons - shared library of common functionality. Author: Svjatoslav Agejenko.
+ * This project is released under Creative Commons Zero (CC0) license.
+ */
+package eu.svjatoslav.commons.string.tokenizer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.stream.Stream;
+
+import static eu.svjatoslav.commons.string.tokenizer.Terminator.TerminationStrategy.PRESERVE;
+import static java.lang.System.out;
+
+/**
+ * A regex-based tokenizer for parsing structured text into tokens.
+ *
+ * <p>The Tokenizer breaks down source text into tokens based on regular
+ * expression patterns called "terminators". Terminators define how to
+ * identify and handle token boundaries:</p>
+ *
+ * <ul>
+ * <li>{@link Terminator.TerminationStrategy#PRESERVE PRESERVE} - Return matched
+ * tokens for processing (useful for syntax elements you want to capture)</li>
+ * <li>{@link Terminator.TerminationStrategy#DROP DROP} - Silently discard matched
+ * tokens (useful for whitespace, comments, or other separators)</li>
+ * </ul>
+ *
+ * <p>Key features:</p>
+ * <ul>
+ * <li>Pattern-based token identification using regex</li>
+ * <li>Peek ahead without consuming tokens</li>
+ * <li>Unread tokens to backtrack</li>
+ * <li>Expect specific tokens (throws on mismatch)</li>
+ * <li>Group-based token categorization</li>
+ * </ul>
+ *
+ * <p>Example usage:</p>
+ * <pre>{@code
+ * Tokenizer tokenizer = new Tokenizer("hello, world! 123");
+ * tokenizer.addTerminator(DROP, "\\s+"); // Drop whitespace
+ * tokenizer.addTerminator(PRESERVE, "\\w+"); // Preserve words
+ * tokenizer.addTerminator(PRESERVE, ","); // Preserve comma
+ * tokenizer.addTerminator(PRESERVE, "!"); // Preserve exclamation
+ * tokenizer.addTerminator(PRESERVE, "\\d+"); // Preserve numbers
+ *
+ * while (tokenizer.hasMoreContent()) {
+ * TokenizerMatch match = tokenizer.getNextToken();
+ * System.out.println(match.token);
+ * }
 * // Output (one token per line): hello, ",", world, !, 123
+ * }</pre>
+ *
+ * <p>The tokenizer maintains a history stack, allowing you to unread
+ * tokens and backtrack during parsing:</p>
+ * <pre>{@code
+ * TokenizerMatch first = tokenizer.getNextToken();
+ * TokenizerMatch second = tokenizer.getNextToken();
+ * tokenizer.unreadToken(); // Go back one token
+ * tokenizer.unreadToken(); // Go back another token
+ * TokenizerMatch again = tokenizer.getNextToken(); // Same as first
+ * }</pre>
+ *
+ * <p>You can also peek without consuming:</p>
+ * <pre>{@code
+ * TokenizerMatch peeked = tokenizer.peekNextToken(); // Look ahead
+ * TokenizerMatch actual = tokenizer.getNextToken(); // Same as peeked
+ * }</pre>
+ *
+ * @see Terminator
+ * @see TokenizerMatch
+ * @see InvalidSyntaxException
+ */
+public class Tokenizer {
+
+ /**
+ * Stack of token indexes for backtracking.
+ *
+ * <p>Each time a token is consumed, the current index is pushed onto
+ * this stack. Calling {@link #unreadToken()} pops the stack and
+ * restores the previous position, allowing the same token to be
+ * read again.</p>
+ */
+ private final Stack<Integer> tokenIndexes = new Stack<>();
+
+ /**
+ * List of terminators that define token boundaries.
+ *
+ * <p>Terminators are checked in order during tokenization. When a
+ * terminator matches at the current position, it determines how
+ * the match is handled (preserved or dropped).</p>
+ */
+ private final List<Terminator> terminators = new ArrayList<>();
+
+ /**
+ * The source string being tokenized.
+ *
+ * <p>This is the text that will be broken down into tokens. Can be
+ * null initially and set later via {@link #setSource(String)}.</p>
+ */
+ private String source;
+
+ /**
+ * Current reading position within the source string.
+ *
+ * <p>This index advances as tokens are consumed. It starts at 0
+ * and moves forward through the source string.</p>
+ */
+ private int currentIndex = 0;
+
+ /**
+ * Creates a new tokenizer for the specified source string.
+ *
+ * <p>The source string will be processed when {@link #getNextToken()}
+ * is called. Add terminators before calling getNextToken() to
+ * define how tokens should be identified.</p>
+ *
+ * @param source the text to tokenize. May be null (use setSource later).
+ */
+ public Tokenizer(final String source) {
+ this.source = source;
+ }
+
+ /**
+ * Creates an empty tokenizer without a source string.
+ *
+ * <p>Use {@link #setSource(String)} to provide text for tokenization
+ * before calling {@link #getNextToken()}.</p>
+ */
+ public Tokenizer() {
+ }
+
+ /**
+ * Sets or replaces the source string to tokenize.
+ *
+ * <p>This resets the tokenizer state: the reading position is set to 0,
+ * and the token history stack is cleared. Use this to tokenize a new
+ * string with the same terminator configuration.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * Tokenizer tokenizer = new Tokenizer();
+ * tokenizer.addTerminator(DROP, "\\s+");
+ *
+ * tokenizer.setSource("first string");
+ * // tokenize first string...
+ *
+ * tokenizer.setSource("second string");
+ * // tokenize second string with same rules...
+ * }</pre>
+ *
+ * @param source the new text to tokenize. May be null.
+ * @return this Tokenizer instance for fluent method chaining.
+ */
+ public Tokenizer setSource(String source) {
+ this.source = source;
+ currentIndex = 0;
+ tokenIndexes.clear();
+ return this;
+ }
+
+ /**
+ * Adds a terminator with a termination strategy and regex pattern.
+ *
+ * <p>The terminator will match tokens based on the regex pattern.
+ * The termination strategy determines whether matched tokens are
+ * preserved (returned) or dropped (silently discarded).</p>
+ *
+ * <p>The pattern is anchored to match only at the current position
+ * (prepended with "^").</p>
+ *
+ * @param terminationStrategy how to handle matched tokens
+ * (PRESERVE or DROP).
+ * @param regexp the regex pattern to match tokens.
+ * @return the created Terminator object, which can be further configured
+ * (e.g., setting the active flag or group).
+ */
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy, String regexp) {
+ Terminator terminator = new Terminator(terminationStrategy, regexp,null);
+ terminators.add(terminator);
+ return terminator;
+ }
+
+ /**
+ * Adds a terminator with a termination strategy, regex pattern, and group name.
+ *
+ * <p>The group name allows categorizing tokens by type, which can be
+ * checked using {@link TokenizerMatch#isGroup(String)}.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * tokenizer.addTerminator(PRESERVE, "\\d+", "number");
+ * tokenizer.addTerminator(PRESERVE, "\\w+", "word");
+ *
+ * TokenizerMatch match = tokenizer.getNextToken();
+ * if (match.isGroup("number")) {
+ * // Handle number token...
+ * }
+ * }</pre>
+ *
+ * @param terminationStrategy how to handle matched tokens
+ * (PRESERVE or DROP).
+ * @param regexp the regex pattern to match tokens.
+ * @param group the group name for categorizing this token type.
+ * May be null.
+ * @return the created Terminator object.
+ */
+ public Terminator addTerminator(final Terminator.TerminationStrategy terminationStrategy,
+ String regexp, String group) {
+ Terminator terminator = new Terminator(terminationStrategy, regexp,group);
+ terminators.add(terminator);
+ return terminator;
+ }
+
+
+ /**
+ * Adds a pre-configured terminator to this tokenizer.
+ *
+ * <p>Use this when you need to create a Terminator with custom
+ * configuration before adding it.</p>
+ *
+ * @param terminator the terminator to add. Must not be null.
+ * @return the same terminator that was added.
+ */
+ public Terminator addTerminator(Terminator terminator) {
+ terminators.add(terminator);
+ return terminator;
+ }
+
+ /**
+ * Consumes the next token and verifies it matches the expected value.
+ *
+ * <p>This is a convenience method for parsing where you expect a
+ * specific token at a specific position. If the token doesn't match,
+ * an exception is thrown.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * tokenizer.expectAndConsumeNextStringToken("if");
+ * // Consumes "if" token, throws if next token is not "if"
+ * }</pre>
+ *
+ * @param value the expected token value. Must not be null.
+ * @throws InvalidSyntaxException if the next token does not match
+ * the expected value.
+ */
+ public void expectAndConsumeNextStringToken(final String value)
+ throws InvalidSyntaxException {
+ final TokenizerMatch match = getNextToken();
+ if (!value.equals(match.token))
+ throw new InvalidSyntaxException("Expected \"" + value
+ + "\" but got \"" + match.token + "\" instead.");
+ }
+
+ /**
+ * Consumes the next token and verifies it was matched by the expected terminator.
+ *
+ * <p>This is useful when you need to ensure a specific terminator matched
+ * the token, not just that the token has a certain value.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * Terminator stringTerminator = tokenizer.addTerminator(PRESERVE, "\".*\"");
+ * tokenizer.expectAndConsumeNextTerminatorToken(stringTerminator);
+ * }</pre>
+ *
+ * @param terminator the expected terminator that should have matched.
+ * @return the TokenizerMatch containing the matched token.
+ * @throws InvalidSyntaxException if the next token was matched by
+ * a different terminator.
+ */
+ public TokenizerMatch expectAndConsumeNextTerminatorToken(Terminator terminator)
+ throws InvalidSyntaxException {
+ final TokenizerMatch match = getNextToken();
+
+ if (match.terminator != terminator)
+ throw new InvalidSyntaxException("Expected terminator \"" + terminator
+ + "\" but got \"" + match.terminator + "\" instead.");
+
+ return match;
+ }
+
+
+ /**
+ * Returns the next token from the source string.
+ *
+ * <p>This method advances the reading position. The token is identified
+ * based on the configured terminators:</p>
+ * <ul>
+ * <li>If a PRESERVE terminator matches, that matched text is returned</li>
+ * <li>If a DROP terminator matches, it is discarded and the next token is sought</li>
+ * <li>If no terminator matches, characters accumulate until a terminator matches</li>
+ * </ul>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * TokenizerMatch match = tokenizer.getNextToken();
+ * if (match != null) {
+ * System.out.println(match.token);
+ * }
+ * }</pre>
+ *
+ * @return the next TokenizerMatch, or {@code null} if the end of the
+ * source string is reached.
+ */
+ public TokenizerMatch getNextToken() {
+ tokenIndexes.push(currentIndex);
+
+ StringBuilder tokenAccumulator = new StringBuilder();
+
+ while (true) {
+
+ if (currentIndex >= source.length()) { // reached end of input
+ if (hasAccumulatedToken(tokenAccumulator))
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+ else
+ return null;
+ }
+
+ TokenizerMatch matchResult = findTerminatorMatch();
+ if (matchResult == null) {
+ tokenAccumulator.append(source.charAt(currentIndex));
+ currentIndex++;
+ continue;
+ }
+
+ if (matchResult.terminator.termination == PRESERVE) {
+ if (hasAccumulatedToken(tokenAccumulator))
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+
+ currentIndex = matchResult.matcher.end();
+ return matchResult;
+ } else {
+ currentIndex = matchResult.matcher.end();
+
+ if (hasAccumulatedToken(tokenAccumulator))
+ return new TokenizerMatch(tokenAccumulator.toString(), null, null, this);
+ }
+ }
+ }
+
+ /**
+ * Finds a terminator that matches at the current position.
+ *
+ * <p>This checks all active terminators (in order) to see if any
+ * matches at the current index. The first matching terminator
+ * is returned.</p>
+ *
+ * <p>Terminators with {@code active = false} are skipped.</p>
+ *
+ * @return a TokenizerMatch if a terminator matches, or {@code null}
+ * if no terminator matches at the current position.
+ */
+ public TokenizerMatch findTerminatorMatch(){
+ for (Terminator terminator : terminators)
+ if (terminator.active) {
+ Matcher match = terminator.match(source, currentIndex);
+ if (match.find()) {
+ String token = source.substring(match.start(), match.end());
+ return new TokenizerMatch(token, terminator, match, this);
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Checks if the token accumulator has any content.
+ *
+ * <p>This is used internally to determine if accumulated characters
+ * should be returned as a token.</p>
+ *
+ * @param tokenAccumulator the StringBuilder containing accumulated characters.
+ * @return {@code true} if there are accumulated characters, {@code false} otherwise.
+ */
+ private boolean hasAccumulatedToken(StringBuilder tokenAccumulator) {
+ return tokenAccumulator.length() > 0;
+ }
+
+ /**
+ * Checks if there is more content to read.
+ *
+ * <p>Returns true if the current position is before the end of the
+ * source string. Note that even if this returns true, getNextToken()
+ * might return null if remaining content is dropped by terminators.</p>
+ *
+ * @return {@code true} if there is more content, {@code false} if at
+ * the end of source or source is null.
+ */
+ public boolean hasMoreContent() {
+ if (source == null) return false;
+ return currentIndex < source.length();
+ }
+
+ /**
+ * Consumes the next token if it matches the expected value.
+ *
+ * <p>If the next token matches, it is consumed and {@code true} is returned.
+ * If it doesn't match, the token is unread and {@code false} is returned.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * if (tokenizer.consumeIfNextToken("else")) {
+ * // Handle else clause
+ * } else {
+ * // Token was not "else", position unchanged
+ * }
+ * }</pre>
+ *
+ * @param token the expected token value. Must not be null.
+ * @return {@code true} if the next token matched and was consumed,
+ * {@code false} otherwise (position unchanged).
+ * @throws InvalidSyntaxException if parsing fails.
+ */
+ public boolean consumeIfNextToken(final String token) throws InvalidSyntaxException {
+ if (token.equals(getNextToken().token))
+ return true;
+
+ unreadToken();
+ return false;
+ }
+
+ /**
+ * Returns the next token without consuming it.
+ *
+ * <p>This looks ahead at the next token and returns it, then immediately
+ * unread to restore the position. Use this to examine what's coming
+ * without advancing.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * TokenizerMatch peeked = tokenizer.peekNextToken();
+ * System.out.println("Next will be: " + peeked.token);
+ * TokenizerMatch actual = tokenizer.getNextToken(); // Same as peeked
+ * }</pre>
+ *
+ * @return the next TokenizerMatch without advancing the position.
+ * @throws InvalidSyntaxException if parsing fails.
+ */
+ public TokenizerMatch peekNextToken() throws InvalidSyntaxException {
+ TokenizerMatch result = getNextToken();
+ unreadToken();
+ return result;
+ }
+
+ /**
+ * Checks if the next token is one of the specified possibilities.
+ *
+ * <p>This peeks at the next token and checks if its value equals any
+ * of the given strings. The position is unchanged after this call.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * if (tokenizer.peekIsOneOf("if", "else", "while")) {
+ * // Next token is a control keyword
+ * }
+ * }</pre>
+ *
+ * @param possibilities the token values to check against.
+ * Must not be null or empty.
+ * @return {@code true} if the next token matches one of the possibilities,
+ * {@code false} otherwise.
+ * @throws InvalidSyntaxException if parsing fails.
+ */
+ public boolean peekIsOneOf(String... possibilities) throws InvalidSyntaxException {
+ String nextToken = peekNextToken().token;
+ return Stream.of(possibilities).anyMatch(possibility -> possibility.equals(nextToken));
+ }
+
+ /**
+ * Verifies the next token is NOT one of the specified possibilities.
+ *
+ * <p>If the next token matches any possibility, an exception is thrown.
+ * Use this for negative assertions in parsing.</p>
+ *
+ * <p>Example:</p>
+ * <pre>{@code
+ * tokenizer.peekExpectNoneOf("}", "end");
+ * // Throws if next token is } or end
+ * }</pre>
+ *
+ * @param possibilities the token values that should NOT appear next.
+ * @throws InvalidSyntaxException if the next token matches any possibility.
+ */
+ public void peekExpectNoneOf(String... possibilities) throws InvalidSyntaxException {
+ if (peekIsOneOf(possibilities))
+ throw new InvalidSyntaxException("Not expected \"" + peekNextToken().token + "\" here.");
+ }
+
+ /**
+ * Unreads the most recently consumed token.
+ *
+ * <p>This restores the reading position to before the last token was
+ * read. The token can be read again with getNextToken().</p>
+ *
+ * <p>You can unread multiple times to backtrack further:</p>
+ * <pre>{@code
+ * TokenizerMatch first = tokenizer.getNextToken();
+ * TokenizerMatch second = tokenizer.getNextToken();
+ * TokenizerMatch third = tokenizer.getNextToken();
+ *
+ * tokenizer.unreadToken(); // Back to after second
+ * tokenizer.unreadToken(); // Back to after first
+ *
+ * TokenizerMatch again = tokenizer.getNextToken(); // Same as second
+ * }</pre>
+ */
+ public void unreadToken() {
+ currentIndex = tokenIndexes.pop();
+ }
+
+ /**
+ * Prints all remaining tokens for debugging purposes.
+ *
+ * <p>This reads and prints all remaining tokens without permanently
+ * consuming them. After printing, the position is restored to the
+ * original location.</p>
+ *
+ * <p>Output is printed to stdout with each token on a new line.</p>
+ */
+ public void enlistRemainingTokens(){
+ int redTokenCount = 0;
+
+ while (hasMoreContent()) {
+ out.println(getNextToken().toString());
+ redTokenCount++;
+ }
+
+ // restore pointer to original location
+ for (int i = 0; i< redTokenCount; i++ ) unreadToken();
+ }
+
+
+ /**
+ * Skips to the end of the source string without consuming tokens.
+ *
+ * <p>This advances directly to the end, skipping all remaining content.
+ * After calling this, {@link #hasMoreContent()} will return {@code false}.</p>
+ *
+ * <p>The current position is saved on the stack, so you can unread
+ * to restore it if needed.</p>
+ */
+ public void skipUntilDataEnd() {
+ tokenIndexes.push(currentIndex);
+ currentIndex = source.length();
+ }
+
+}
\ No newline at end of file