Skip to main content

colorful_parse/
lib.rs

1//! The structural prose parser — a [`Parser`] adapter.
2//!
3//! A `logos` lexer turns text into mechanical tokens (words, numbers, sentence
4//! terminators, quotes, punctuation), and a small segmenter groups them into
5//! [`Node::Sentence`] runs (absorbing a trailing closing quote or bracket). It
6//! produces *structure only* — it makes no part-of-speech decisions; that is the
7//! lexicon and annotator's job. (This is sentence segmentation, not a deep
8//! recursive-descent grammar; the structure is intentionally shallow in `v0`.)
9//!
10//! Parsing is **total**: any input, including malformed or non-ASCII text,
11//! yields a [`Tree`] without panicking. Characters the lexer cannot otherwise
12//! classify are emitted as punctuation rather than dropped.
13
14#![forbid(unsafe_code)]
15#![warn(missing_docs)]
16
17use colorful_core::{Node, Parser, Span, Tree};
18use logos::Logos;
19
20/// Mechanical token kinds produced by the lexer.
21#[derive(Logos, Debug, PartialEq, Eq)]
22#[logos(skip r"[ \t\r\n\u{000C}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{202F}\u{205F}\u{3000}]+")]
23enum Tok {
24    /// A letter-initial word, allowing internal digits, apostrophes, and hyphens
25    /// (`don't`, `well-being`, `covid19`, `H2O`). A digit-initial token is a
26    /// [`Tok::Number`] instead.
27    #[regex(r"\p{L}[\p{L}\p{N}]*(?:['\u{2019}\-][\p{L}\p{N}]+)*")]
28    Word,
29    /// A numeric token with optional internal separators (`150`, `3.14`,
30    /// `1,000`).
31    #[regex(r"\p{N}+(?:[.,]\p{N}+)*")]
32    Number,
33    /// A run of sentence-ending punctuation (`.`, `!`, `?`, `?!`, `...`).
34    #[regex(r"[.!?]+")]
35    SentenceEnd,
36    /// A quotation mark (straight or typographic).
37    #[regex(r#"["'\u{201C}\u{201D}\u{2018}\u{2019}\u{00AB}\u{00BB}]"#)]
38    Quote,
39    /// Other punctuation.
40    #[regex(r"[,;:\u{2026}\u{2014}\u{2013}()\[\]{}/\\@#$%^&*+=<>~|_-]")]
41    Punct,
42}
43
44/// The structural prose parser.
45#[derive(Debug, Default, Clone, Copy)]
46pub struct ProseParser;
47
48impl ProseParser {
49    /// Create a new parser.
50    #[must_use]
51    pub fn new() -> Self {
52        Self
53    }
54}
55
56impl Parser for ProseParser {
57    fn parse(&self, text: &str) -> Tree {
58        let mut sentences: Vec<Node> = Vec::new();
59        let mut parts: Vec<Node> = Vec::new();
60        let mut sent_start: usize = 0;
61        let mut sent_end: usize = 0;
62        // After a sentence terminator we defer the flush, so a closing quote or
63        // bracket sitting immediately after it is absorbed into the sentence
64        // rather than leaking into the next one.
65        let mut pending_flush = false;
66
67        let mut lexer = Tok::lexer(text);
68        while let Some(result) = lexer.next() {
69            let range = lexer.span();
70            let span = Span::new(range.start, range.end);
71            let is_closer =
72                matches!(result, Ok(Tok::Quote)) || matches!(span.slice(text), ")" | "]" | "}");
73
74            if pending_flush {
75                if is_closer && span.start == sent_end {
76                    // Adjacent trailing closer: keep it in the current sentence.
77                    parts.push(Node::Punct { span });
78                    sent_end = span.end;
79                    continue;
80                }
81                // The sentence is complete; flush it before this token.
82                sentences.push(Node::Sentence {
83                    span: Span::new(sent_start, sent_end),
84                    parts: std::mem::take(&mut parts),
85                });
86                pending_flush = false;
87            }
88
89            if parts.is_empty() {
90                sent_start = span.start;
91            }
92            sent_end = span.end;
93
94            match result {
95                Ok(Tok::Word | Tok::Number) => parts.push(Node::Word { span }),
96                Ok(Tok::SentenceEnd) => {
97                    parts.push(Node::Punct { span });
98                    pending_flush = true;
99                }
100                // Quotes, other punctuation, and any unrecognized character all
101                // become punctuation nodes — parsing stays total.
102                Ok(Tok::Quote | Tok::Punct) | Err(()) => parts.push(Node::Punct { span }),
103            }
104        }
105
106        // Flush a trailing, unterminated sentence.
107        if !parts.is_empty() {
108            sentences.push(Node::Sentence {
109                span: Span::new(sent_start, sent_end),
110                parts,
111            });
112        }
113
114        Tree::document(sentences)
115    }
116}
117
118#[cfg(test)]
119mod tests {
120    use super::*;
121    use colorful_core::Node;
122
123    fn word(start: usize, end: usize) -> Node {
124        Node::Word {
125            span: Span::new(start, end),
126        }
127    }
128
129    fn punct(start: usize, end: usize) -> Node {
130        Node::Punct {
131            span: Span::new(start, end),
132        }
133    }
134
135    fn sentence(start: usize, end: usize, parts: Vec<Node>) -> Node {
136        Node::Sentence {
137            span: Span::new(start, end),
138            parts,
139        }
140    }
141
142    fn parse(text: &str) -> Vec<Node> {
143        let Node::Document(sentences) = ProseParser::new().parse(text).root else {
144            unreachable!("root is always a document");
145        };
146        sentences
147    }
148
149    #[test]
150    fn single_sentence_words_and_terminator() {
151        // "The cat sat."
152        assert_eq!(
153            parse("The cat sat."),
154            vec![sentence(
155                0,
156                12,
157                vec![word(0, 3), word(4, 7), word(8, 11), punct(11, 12)],
158            )]
159        );
160    }
161
162    #[test]
163    fn splits_on_sentence_terminators() {
164        // "Hi. Go!"
165        assert_eq!(
166            parse("Hi. Go!"),
167            vec![
168                sentence(0, 3, vec![word(0, 2), punct(2, 3)]),
169                sentence(4, 7, vec![word(4, 6), punct(6, 7)]),
170            ]
171        );
172    }
173
174    #[test]
175    fn unterminated_text_is_one_sentence() {
176        // "hello world" (no terminator) flushes as a single sentence.
177        assert_eq!(
178            parse("hello world"),
179            vec![sentence(0, 11, vec![word(0, 5), word(6, 11)])]
180        );
181    }
182
183    #[test]
184    fn contractions_and_hyphens_stay_one_word() {
185        assert_eq!(parse("don't"), vec![sentence(0, 5, vec![word(0, 5)])]);
186        assert_eq!(
187            parse("well-being"),
188            vec![sentence(0, 10, vec![word(0, 10)])]
189        );
190    }
191
192    #[test]
193    fn sentence_absorbs_trailing_closing_quote() {
194        // The closing quote sitting right after the period stays in the first
195        // sentence; the next sentence starts at "Go".
196        assert_eq!(
197            parse("\"Hi.\" Go."),
198            vec![
199                sentence(
200                    0,
201                    5,
202                    vec![punct(0, 1), word(1, 3), punct(3, 4), punct(4, 5)]
203                ),
204                sentence(6, 9, vec![word(6, 8), punct(8, 9)]),
205            ]
206        );
207    }
208
209    #[test]
210    fn opening_quote_after_terminator_starts_new_sentence() {
211        // A quote separated from the terminator by a space is an opening quote
212        // and begins the next sentence (it is not absorbed).
213        assert_eq!(
214            parse("Hi. \"Go.\""),
215            vec![
216                sentence(0, 3, vec![word(0, 2), punct(2, 3)]),
217                sentence(
218                    4,
219                    9,
220                    vec![punct(4, 5), word(5, 7), punct(7, 8), punct(8, 9)]
221                ),
222            ]
223        );
224    }
225
226    #[test]
227    fn unicode_spaces_are_skipped() {
228        // A thin space (U+2009, 3 bytes) separates words and is not punctuation.
229        assert_eq!(
230            parse("a\u{2009}b"),
231            vec![sentence(0, 5, vec![word(0, 1), word(4, 5)])]
232        );
233    }
234
235    #[test]
236    fn quotes_are_separate_punctuation() {
237        // "hi" -> quote, word, quote
238        assert_eq!(
239            parse("\"hi\""),
240            vec![sentence(0, 4, vec![punct(0, 1), word(1, 3), punct(3, 4)])]
241        );
242    }
243
244    #[test]
245    fn alphanumeric_words_stay_together() {
246        // A letter-initial word keeps its internal digits (it is one word, not
247        // word + number), so the lexicon sees `covid19`, not `covid` + `19`.
248        assert_eq!(parse("covid19"), vec![sentence(0, 7, vec![word(0, 7)])]);
249        assert_eq!(parse("H2O"), vec![sentence(0, 3, vec![word(0, 3)])]);
250        // A digit-initial token is still a number.
251        assert_eq!(parse("3.5"), vec![sentence(0, 3, vec![word(0, 3)])]);
252    }
253
254    #[test]
255    fn numbers_are_word_nodes() {
256        // "I have 3.5" -> three word nodes (the number is a word node).
257        assert_eq!(
258            parse("I have 3.5"),
259            vec![sentence(0, 10, vec![word(0, 1), word(2, 6), word(7, 10)])]
260        );
261    }
262
263    #[test]
264    fn non_ascii_letters_join_words() {
265        // "café" is a single word (é is a Unicode letter); 5 bytes.
266        assert_eq!(parse("café"), vec![sentence(0, 5, vec![word(0, 5)])]);
267    }
268
269    #[test]
270    fn empty_input_is_empty_document() {
271        assert_eq!(parse(""), Vec::<Node>::new());
272        assert_eq!(parse("   \n\t "), Vec::<Node>::new());
273    }
274
275    /// Collect the leaf (word/punct) spans of a parsed tree in order.
276    fn leaf_spans(text: &str) -> Vec<Span> {
277        let mut spans = Vec::new();
278        let Node::Document(sentences) = ProseParser::new().parse(text).root else {
279            unreachable!();
280        };
281        for sentence in sentences {
282            let Node::Sentence { parts, .. } = sentence else {
283                continue;
284            };
285            for part in parts {
286                match part {
287                    Node::Word { span } | Node::Punct { span } => spans.push(span),
288                    _ => {}
289                }
290            }
291        }
292        spans
293    }
294
295    #[test]
296    fn parsing_is_total_and_spans_are_well_formed() {
297        // `logos` lowers its lexer to a loop only with optimizations; in debug
298        // builds it recurses once per character, so a pathologically long single
299        // token can exhaust a small default test stack. Shipped binaries are
300        // release (no such recursion), so run the property on a generous stack to
301        // exercise long tokens honestly in debug too.
302        std::thread::Builder::new()
303            .stack_size(16 * 1024 * 1024)
304            .spawn(check_total_and_well_formed)
305            .expect("spawn checker thread")
306            .join()
307            .expect("parser must not panic on adversarial input");
308    }
309
310    fn check_total_and_well_formed() {
311        // Adversarial inputs must not panic, and every leaf span must be
312        // non-empty, in bounds, and strictly ordered (no overlaps).
313        let long_word = "a".repeat(10_000);
314        let inputs: [&str; 14] = [
315            "",
316            "?!?!?!",
317            "\u{1F600}\u{1F4A9}", // emoji
318            long_word.as_str(),
319            "no terminator here",
320            "Mix3d 1,000 things\u{2014}and \u{00AB}quotes\u{00BB}.",
321            "\t\n  \u{00A0}",
322            "don''t",
323            "....",
324            "He said \u{201C}hi\u{201D} to O'Brien.",
325            "cafe\u{0301} combining mark",  // combining acute
326            "a\u{200D}b zero\u{200B}width", // ZWJ + ZWSP
327            "\u{202E}reversed\u{202C} direction marks", // RTL override
328            "z\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}algo text", // stacked combining
329        ];
330        for &input in &inputs {
331            let spans = leaf_spans(input);
332            let mut prev_end = 0usize;
333            for span in spans {
334                assert!(span.start < span.end, "empty span in {input:?}");
335                assert!(span.end <= input.len(), "out-of-bounds span in {input:?}");
336                assert!(span.start >= prev_end, "overlapping spans in {input:?}");
337                assert!(
338                    input.is_char_boundary(span.start) && input.is_char_boundary(span.end),
339                    "span not on char boundary in {input:?}"
340                );
341                prev_end = span.end;
342            }
343        }
344    }
345}