1#![forbid(unsafe_code)]
15#![warn(missing_docs)]
16
17use colorful_core::{Node, Parser, Span, Tree};
18use logos::Logos;
19
20#[derive(Logos, Debug, PartialEq, Eq)]
22#[logos(skip r"[ \t\r\n\u{000C}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{202F}\u{205F}\u{3000}]+")]
23enum Tok {
24 #[regex(r"\p{L}[\p{L}\p{N}]*(?:['\u{2019}\-][\p{L}\p{N}]+)*")]
28 Word,
29 #[regex(r"\p{N}+(?:[.,]\p{N}+)*")]
32 Number,
33 #[regex(r"[.!?]+")]
35 SentenceEnd,
36 #[regex(r#"["'\u{201C}\u{201D}\u{2018}\u{2019}\u{00AB}\u{00BB}]"#)]
38 Quote,
39 #[regex(r"[,;:\u{2026}\u{2014}\u{2013}()\[\]{}/\\@#$%^&*+=<>~|_-]")]
41 Punct,
42}
43
44#[derive(Debug, Default, Clone, Copy)]
46pub struct ProseParser;
47
48impl ProseParser {
49 #[must_use]
51 pub fn new() -> Self {
52 Self
53 }
54}
55
56impl Parser for ProseParser {
57 fn parse(&self, text: &str) -> Tree {
58 let mut sentences: Vec<Node> = Vec::new();
59 let mut parts: Vec<Node> = Vec::new();
60 let mut sent_start: usize = 0;
61 let mut sent_end: usize = 0;
62 let mut pending_flush = false;
66
67 let mut lexer = Tok::lexer(text);
68 while let Some(result) = lexer.next() {
69 let range = lexer.span();
70 let span = Span::new(range.start, range.end);
71 let is_closer =
72 matches!(result, Ok(Tok::Quote)) || matches!(span.slice(text), ")" | "]" | "}");
73
74 if pending_flush {
75 if is_closer && span.start == sent_end {
76 parts.push(Node::Punct { span });
78 sent_end = span.end;
79 continue;
80 }
81 sentences.push(Node::Sentence {
83 span: Span::new(sent_start, sent_end),
84 parts: std::mem::take(&mut parts),
85 });
86 pending_flush = false;
87 }
88
89 if parts.is_empty() {
90 sent_start = span.start;
91 }
92 sent_end = span.end;
93
94 match result {
95 Ok(Tok::Word | Tok::Number) => parts.push(Node::Word { span }),
96 Ok(Tok::SentenceEnd) => {
97 parts.push(Node::Punct { span });
98 pending_flush = true;
99 }
100 Ok(Tok::Quote | Tok::Punct) | Err(()) => parts.push(Node::Punct { span }),
103 }
104 }
105
106 if !parts.is_empty() {
108 sentences.push(Node::Sentence {
109 span: Span::new(sent_start, sent_end),
110 parts,
111 });
112 }
113
114 Tree::document(sentences)
115 }
116}
117
118#[cfg(test)]
119mod tests {
120 use super::*;
121 use colorful_core::Node;
122
123 fn word(start: usize, end: usize) -> Node {
124 Node::Word {
125 span: Span::new(start, end),
126 }
127 }
128
129 fn punct(start: usize, end: usize) -> Node {
130 Node::Punct {
131 span: Span::new(start, end),
132 }
133 }
134
135 fn sentence(start: usize, end: usize, parts: Vec<Node>) -> Node {
136 Node::Sentence {
137 span: Span::new(start, end),
138 parts,
139 }
140 }
141
142 fn parse(text: &str) -> Vec<Node> {
143 let Node::Document(sentences) = ProseParser::new().parse(text).root else {
144 unreachable!("root is always a document");
145 };
146 sentences
147 }
148
149 #[test]
150 fn single_sentence_words_and_terminator() {
151 assert_eq!(
153 parse("The cat sat."),
154 vec![sentence(
155 0,
156 12,
157 vec![word(0, 3), word(4, 7), word(8, 11), punct(11, 12)],
158 )]
159 );
160 }
161
162 #[test]
163 fn splits_on_sentence_terminators() {
164 assert_eq!(
166 parse("Hi. Go!"),
167 vec![
168 sentence(0, 3, vec![word(0, 2), punct(2, 3)]),
169 sentence(4, 7, vec![word(4, 6), punct(6, 7)]),
170 ]
171 );
172 }
173
174 #[test]
175 fn unterminated_text_is_one_sentence() {
176 assert_eq!(
178 parse("hello world"),
179 vec![sentence(0, 11, vec![word(0, 5), word(6, 11)])]
180 );
181 }
182
183 #[test]
184 fn contractions_and_hyphens_stay_one_word() {
185 assert_eq!(parse("don't"), vec![sentence(0, 5, vec![word(0, 5)])]);
186 assert_eq!(
187 parse("well-being"),
188 vec![sentence(0, 10, vec![word(0, 10)])]
189 );
190 }
191
192 #[test]
193 fn sentence_absorbs_trailing_closing_quote() {
194 assert_eq!(
197 parse("\"Hi.\" Go."),
198 vec![
199 sentence(
200 0,
201 5,
202 vec![punct(0, 1), word(1, 3), punct(3, 4), punct(4, 5)]
203 ),
204 sentence(6, 9, vec![word(6, 8), punct(8, 9)]),
205 ]
206 );
207 }
208
209 #[test]
210 fn opening_quote_after_terminator_starts_new_sentence() {
211 assert_eq!(
214 parse("Hi. \"Go.\""),
215 vec![
216 sentence(0, 3, vec![word(0, 2), punct(2, 3)]),
217 sentence(
218 4,
219 9,
220 vec![punct(4, 5), word(5, 7), punct(7, 8), punct(8, 9)]
221 ),
222 ]
223 );
224 }
225
226 #[test]
227 fn unicode_spaces_are_skipped() {
228 assert_eq!(
230 parse("a\u{2009}b"),
231 vec![sentence(0, 5, vec![word(0, 1), word(4, 5)])]
232 );
233 }
234
235 #[test]
236 fn quotes_are_separate_punctuation() {
237 assert_eq!(
239 parse("\"hi\""),
240 vec![sentence(0, 4, vec![punct(0, 1), word(1, 3), punct(3, 4)])]
241 );
242 }
243
244 #[test]
245 fn alphanumeric_words_stay_together() {
246 assert_eq!(parse("covid19"), vec![sentence(0, 7, vec![word(0, 7)])]);
249 assert_eq!(parse("H2O"), vec![sentence(0, 3, vec![word(0, 3)])]);
250 assert_eq!(parse("3.5"), vec![sentence(0, 3, vec![word(0, 3)])]);
252 }
253
254 #[test]
255 fn numbers_are_word_nodes() {
256 assert_eq!(
258 parse("I have 3.5"),
259 vec![sentence(0, 10, vec![word(0, 1), word(2, 6), word(7, 10)])]
260 );
261 }
262
263 #[test]
264 fn non_ascii_letters_join_words() {
265 assert_eq!(parse("café"), vec![sentence(0, 5, vec![word(0, 5)])]);
267 }
268
269 #[test]
270 fn empty_input_is_empty_document() {
271 assert_eq!(parse(""), Vec::<Node>::new());
272 assert_eq!(parse(" \n\t "), Vec::<Node>::new());
273 }
274
275 fn leaf_spans(text: &str) -> Vec<Span> {
277 let mut spans = Vec::new();
278 let Node::Document(sentences) = ProseParser::new().parse(text).root else {
279 unreachable!();
280 };
281 for sentence in sentences {
282 let Node::Sentence { parts, .. } = sentence else {
283 continue;
284 };
285 for part in parts {
286 match part {
287 Node::Word { span } | Node::Punct { span } => spans.push(span),
288 _ => {}
289 }
290 }
291 }
292 spans
293 }
294
295 #[test]
296 fn parsing_is_total_and_spans_are_well_formed() {
297 std::thread::Builder::new()
303 .stack_size(16 * 1024 * 1024)
304 .spawn(check_total_and_well_formed)
305 .expect("spawn checker thread")
306 .join()
307 .expect("parser must not panic on adversarial input");
308 }
309
310 fn check_total_and_well_formed() {
311 let long_word = "a".repeat(10_000);
314 let inputs: [&str; 14] = [
315 "",
316 "?!?!?!",
317 "\u{1F600}\u{1F4A9}", long_word.as_str(),
319 "no terminator here",
320 "Mix3d 1,000 things\u{2014}and \u{00AB}quotes\u{00BB}.",
321 "\t\n \u{00A0}",
322 "don''t",
323 "....",
324 "He said \u{201C}hi\u{201D} to O'Brien.",
325 "cafe\u{0301} combining mark", "a\u{200D}b zero\u{200B}width", "\u{202E}reversed\u{202C} direction marks", "z\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}algo text", ];
330 for &input in &inputs {
331 let spans = leaf_spans(input);
332 let mut prev_end = 0usize;
333 for span in spans {
334 assert!(span.start < span.end, "empty span in {input:?}");
335 assert!(span.end <= input.len(), "out-of-bounds span in {input:?}");
336 assert!(span.start >= prev_end, "overlapping spans in {input:?}");
337 assert!(
338 input.is_char_boundary(span.start) && input.is_char_boundary(span.end),
339 "span not on char boundary in {input:?}"
340 );
341 prev_end = span.end;
342 }
343 }
344 }
345}