colorful-parse 0.2.1

Structural prose parser (logos lexer + sentence segmenter): a Parser adapter.
Documentation
//! The structural prose parser — a [`Parser`] adapter.
//!
//! A `logos` lexer turns text into mechanical tokens (words, numbers, sentence
//! terminators, quotes, punctuation), and a small segmenter groups them into
//! [`Node::Sentence`] runs (absorbing a trailing closing quote or bracket). It
//! produces *structure only* — it makes no part-of-speech decisions; that is the
//! lexicon and annotator's job. (This is sentence segmentation, not a deep
//! recursive-descent grammar; the structure is intentionally shallow in `v0`.)
//!
//! Parsing is **total**: any input, including malformed or non-ASCII text,
//! yields a [`Tree`] without panicking. Characters the lexer cannot otherwise
//! classify are emitted as punctuation rather than dropped.

#![forbid(unsafe_code)]
#![warn(missing_docs)]

use colorful_core::{Node, Parser, Span, Tree};
use logos::Logos;

/// Mechanical token kinds produced by the lexer.
#[derive(Logos, Debug, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n\u{000C}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{202F}\u{205F}\u{3000}]+")]
enum Tok {
    /// A letter-initial word, allowing internal digits, apostrophes, and hyphens
    /// (`don't`, `well-being`, `covid19`, `H2O`). A digit-initial token is a
    /// [`Tok::Number`] instead.
    #[regex(r"\p{L}[\p{L}\p{N}]*(?:['\u{2019}\-][\p{L}\p{N}]+)*")]
    Word,
    /// A numeric token with optional internal separators (`150`, `3.14`,
    /// `1,000`).
    #[regex(r"\p{N}+(?:[.,]\p{N}+)*")]
    Number,
    /// A run of sentence-ending punctuation (`.`, `!`, `?`, `?!`, `...`).
    #[regex(r"[.!?]+")]
    SentenceEnd,
    /// A quotation mark (straight or typographic).
    #[regex(r#"["'\u{201C}\u{201D}\u{2018}\u{2019}\u{00AB}\u{00BB}]"#)]
    Quote,
    /// Other punctuation.
    #[regex(r"[,;:\u{2026}\u{2014}\u{2013}()\[\]{}/\\@#$%^&*+=<>~|_-]")]
    Punct,
}

/// The structural prose parser.
#[derive(Debug, Default, Clone, Copy)]
pub struct ProseParser;

impl ProseParser {
    /// Create a new parser.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}

impl Parser for ProseParser {
    fn parse(&self, text: &str) -> Tree {
        let mut sentences: Vec<Node> = Vec::new();
        let mut parts: Vec<Node> = Vec::new();
        let mut sent_start: usize = 0;
        let mut sent_end: usize = 0;
        // After a sentence terminator we defer the flush, so a closing quote or
        // bracket sitting immediately after it is absorbed into the sentence
        // rather than leaking into the next one.
        let mut pending_flush = false;

        let mut lexer = Tok::lexer(text);
        while let Some(result) = lexer.next() {
            let range = lexer.span();
            let span = Span::new(range.start, range.end);
            let is_closer =
                matches!(result, Ok(Tok::Quote)) || matches!(span.slice(text), ")" | "]" | "}");

            if pending_flush {
                if is_closer && span.start == sent_end {
                    // Adjacent trailing closer: keep it in the current sentence.
                    parts.push(Node::Punct { span });
                    sent_end = span.end;
                    continue;
                }
                // The sentence is complete; flush it before this token.
                sentences.push(Node::Sentence {
                    span: Span::new(sent_start, sent_end),
                    parts: std::mem::take(&mut parts),
                });
                pending_flush = false;
            }

            if parts.is_empty() {
                sent_start = span.start;
            }
            sent_end = span.end;

            match result {
                Ok(Tok::Word | Tok::Number) => parts.push(Node::Word { span }),
                Ok(Tok::SentenceEnd) => {
                    parts.push(Node::Punct { span });
                    pending_flush = true;
                }
                // Quotes, other punctuation, and any unrecognized character all
                // become punctuation nodes — parsing stays total.
                Ok(Tok::Quote | Tok::Punct) | Err(()) => parts.push(Node::Punct { span }),
            }
        }

        // Flush a trailing, unterminated sentence.
        if !parts.is_empty() {
            sentences.push(Node::Sentence {
                span: Span::new(sent_start, sent_end),
                parts,
            });
        }

        Tree::document(sentences)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use colorful_core::Node;

    fn word(start: usize, end: usize) -> Node {
        Node::Word {
            span: Span::new(start, end),
        }
    }

    fn punct(start: usize, end: usize) -> Node {
        Node::Punct {
            span: Span::new(start, end),
        }
    }

    fn sentence(start: usize, end: usize, parts: Vec<Node>) -> Node {
        Node::Sentence {
            span: Span::new(start, end),
            parts,
        }
    }

    fn parse(text: &str) -> Vec<Node> {
        let Node::Document(sentences) = ProseParser::new().parse(text).root else {
            unreachable!("root is always a document");
        };
        sentences
    }

    #[test]
    fn single_sentence_words_and_terminator() {
        // "The cat sat."
        assert_eq!(
            parse("The cat sat."),
            vec![sentence(
                0,
                12,
                vec![word(0, 3), word(4, 7), word(8, 11), punct(11, 12)],
            )]
        );
    }

    #[test]
    fn splits_on_sentence_terminators() {
        // "Hi. Go!"
        assert_eq!(
            parse("Hi. Go!"),
            vec![
                sentence(0, 3, vec![word(0, 2), punct(2, 3)]),
                sentence(4, 7, vec![word(4, 6), punct(6, 7)]),
            ]
        );
    }

    #[test]
    fn unterminated_text_is_one_sentence() {
        // "hello world" (no terminator) flushes as a single sentence.
        assert_eq!(
            parse("hello world"),
            vec![sentence(0, 11, vec![word(0, 5), word(6, 11)])]
        );
    }

    #[test]
    fn contractions_and_hyphens_stay_one_word() {
        assert_eq!(parse("don't"), vec![sentence(0, 5, vec![word(0, 5)])]);
        assert_eq!(
            parse("well-being"),
            vec![sentence(0, 10, vec![word(0, 10)])]
        );
    }

    #[test]
    fn sentence_absorbs_trailing_closing_quote() {
        // The closing quote sitting right after the period stays in the first
        // sentence; the next sentence starts at "Go".
        assert_eq!(
            parse("\"Hi.\" Go."),
            vec![
                sentence(
                    0,
                    5,
                    vec![punct(0, 1), word(1, 3), punct(3, 4), punct(4, 5)]
                ),
                sentence(6, 9, vec![word(6, 8), punct(8, 9)]),
            ]
        );
    }

    #[test]
    fn opening_quote_after_terminator_starts_new_sentence() {
        // A quote separated from the terminator by a space is an opening quote
        // and begins the next sentence (it is not absorbed).
        assert_eq!(
            parse("Hi. \"Go.\""),
            vec![
                sentence(0, 3, vec![word(0, 2), punct(2, 3)]),
                sentence(
                    4,
                    9,
                    vec![punct(4, 5), word(5, 7), punct(7, 8), punct(8, 9)]
                ),
            ]
        );
    }

    #[test]
    fn unicode_spaces_are_skipped() {
        // A thin space (U+2009, 3 bytes) separates words and is not punctuation.
        assert_eq!(
            parse("a\u{2009}b"),
            vec![sentence(0, 5, vec![word(0, 1), word(4, 5)])]
        );
    }

    #[test]
    fn quotes_are_separate_punctuation() {
        // "hi" -> quote, word, quote
        assert_eq!(
            parse("\"hi\""),
            vec![sentence(0, 4, vec![punct(0, 1), word(1, 3), punct(3, 4)])]
        );
    }

    #[test]
    fn alphanumeric_words_stay_together() {
        // A letter-initial word keeps its internal digits (it is one word, not
        // word + number), so the lexicon sees `covid19`, not `covid` + `19`.
        assert_eq!(parse("covid19"), vec![sentence(0, 7, vec![word(0, 7)])]);
        assert_eq!(parse("H2O"), vec![sentence(0, 3, vec![word(0, 3)])]);
        // A digit-initial token is still a number.
        assert_eq!(parse("3.5"), vec![sentence(0, 3, vec![word(0, 3)])]);
    }

    #[test]
    fn numbers_are_word_nodes() {
        // "I have 3.5" -> three word nodes (the number is a word node).
        assert_eq!(
            parse("I have 3.5"),
            vec![sentence(0, 10, vec![word(0, 1), word(2, 6), word(7, 10)])]
        );
    }

    #[test]
    fn non_ascii_letters_join_words() {
        // "café" is a single word (é is a Unicode letter); 5 bytes.
        assert_eq!(parse("café"), vec![sentence(0, 5, vec![word(0, 5)])]);
    }

    #[test]
    fn empty_input_is_empty_document() {
        assert_eq!(parse(""), Vec::<Node>::new());
        assert_eq!(parse("   \n\t "), Vec::<Node>::new());
    }

    /// Collect the leaf (word/punct) spans of a parsed tree in order.
    fn leaf_spans(text: &str) -> Vec<Span> {
        let mut spans = Vec::new();
        let Node::Document(sentences) = ProseParser::new().parse(text).root else {
            unreachable!();
        };
        for sentence in sentences {
            let Node::Sentence { parts, .. } = sentence else {
                continue;
            };
            for part in parts {
                match part {
                    Node::Word { span } | Node::Punct { span } => spans.push(span),
                    _ => {}
                }
            }
        }
        spans
    }

    #[test]
    fn parsing_is_total_and_spans_are_well_formed() {
        // `logos` lowers its lexer to a loop only with optimizations; in debug
        // builds it recurses once per character, so a pathologically long single
        // token can exhaust a small default test stack. Shipped binaries are
        // release (no such recursion), so run the property on a generous stack to
        // exercise long tokens honestly in debug too.
        std::thread::Builder::new()
            .stack_size(16 * 1024 * 1024)
            .spawn(check_total_and_well_formed)
            .expect("spawn checker thread")
            .join()
            .expect("parser must not panic on adversarial input");
    }

    fn check_total_and_well_formed() {
        // Adversarial inputs must not panic, and every leaf span must be
        // non-empty, in bounds, and strictly ordered (no overlaps).
        let long_word = "a".repeat(10_000);
        let inputs: [&str; 14] = [
            "",
            "?!?!?!",
            "\u{1F600}\u{1F4A9}", // emoji
            long_word.as_str(),
            "no terminator here",
            "Mix3d 1,000 things\u{2014}and \u{00AB}quotes\u{00BB}.",
            "\t\n  \u{00A0}",
            "don''t",
            "....",
            "He said \u{201C}hi\u{201D} to O'Brien.",
            "cafe\u{0301} combining mark",  // combining acute
            "a\u{200D}b zero\u{200B}width", // ZWJ + ZWSP
            "\u{202E}reversed\u{202C} direction marks", // RTL override
            "z\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}algo text", // stacked combining
        ];
        for &input in &inputs {
            let spans = leaf_spans(input);
            let mut prev_end = 0usize;
            for span in spans {
                assert!(span.start < span.end, "empty span in {input:?}");
                assert!(span.end <= input.len(), "out-of-bounds span in {input:?}");
                assert!(span.start >= prev_end, "overlapping spans in {input:?}");
                assert!(
                    input.is_char_boundary(span.start) && input.is_char_boundary(span.end),
                    "span not on char boundary in {input:?}"
                );
                prev_end = span.end;
            }
        }
    }
}