#![forbid(unsafe_code)]
#![warn(missing_docs)]
use colorful_core::{Node, Parser, Span, Tree};
use logos::Logos;
/// Lexical token classes for prose, recognised by the `logos`-derived lexer.
///
/// The `skip` pattern discards runs of whitespace between tokens: space, tab, CR, LF,
/// form feed (U+000C), no-break space (U+00A0), U+1680, U+2000 through U+200A, U+202F,
/// U+205F and U+3000. Input that matches neither the skip pattern nor any variant is
/// surfaced by the lexer as an error item, which the parser in this file handles via
/// its `Err(())` arm.
#[derive(Logos, Debug, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n\u{000C}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{202F}\u{205F}\u{3000}]+")]
enum Tok {
/// A word: one letter (`\p{L}`) followed by any mix of letters and numbers (`\p{N}`),
/// optionally extended by groups that start with `'`, U+2019 or `-` and contain at
/// least one letter or number. This keeps `don't` and `well-being` as single tokens.
#[regex(r"\p{L}[\p{L}\p{N}]*(?:['\u{2019}\-][\p{L}\p{N}]+)*")]
Word,
/// A number: one or more `\p{N}` characters, optionally followed by groups made of a
/// single `.` or `,` and more `\p{N}` characters (for example `3.5` or `1,000`).
#[regex(r"\p{N}+(?:[.,]\p{N}+)*")]
Number,
/// A run of one or more `.`, `!` or `?` characters.
#[regex(r"[.!?]+")]
SentenceEnd,
/// A single quotation character: ASCII `"` or `'`, U+201C, U+201D, U+2018, U+2019,
/// U+00AB or U+00BB.
#[regex(r#"["'\u{201C}\u{201D}\u{2018}\u{2019}\u{00AB}\u{00BB}]"#)]
Quote,
/// A single character of other punctuation or symbol: `,`, `;`, `:`, U+2026, U+2014,
/// U+2013, round/square/curly brackets, `/`, `\`, and the ASCII symbols
/// `@ # $ % ^ & * + = < > ~ | _ -`.
#[regex(r"[,;:\u{2026}\u{2014}\u{2013}()\[\]{}/\\@#$%^&*+=<>~|_-]")]
Punct,
}
/// Stateless parser that turns prose text into a document of sentence nodes.
///
/// The type has no fields, so values are zero-sized and freely copyable; all of the
/// work happens in its `Parser` implementation.
#[derive(Clone, Copy, Debug, Default)]
pub struct ProseParser;

impl ProseParser {
    /// Creates a new parser.
    ///
    /// The returned value is the same unit value that `ProseParser::default()` yields.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}
impl Parser for ProseParser {
    /// Lexes `text` and groups the resulting tokens into sentence nodes.
    ///
    /// Words and numbers become `Node::Word`; every other token, including input the
    /// lexer cannot classify, becomes `Node::Punct`. A run of `.`, `!` or `?` marks the
    /// end of a sentence, and any quote characters or `)`, `]`, `}` that follow it with
    /// no gap in between are kept in that same sentence. Each sentence's span runs from
    /// the start of its first token to the end of its last one. Text that produces no
    /// tokens yields a document built from an empty sentence list.
    fn parse(&self, text: &str) -> Tree {
        let mut finished: Vec<Node> = Vec::new();
        let mut current: Vec<Node> = Vec::new();
        let (mut first_byte, mut last_byte) = (0_usize, 0_usize);
        // Set once the current sentence has seen its terminator; from then on it only
        // accepts closers that sit directly against the previous token.
        let mut terminated = false;

        for (token, range) in Tok::lexer(text).spanned() {
            let span = Span::new(range.start, range.end);
            // Any quote token counts as a closer, as do the three closing brackets.
            let closes = match token {
                Ok(Tok::Quote) => true,
                _ => matches!(span.slice(text), ")" | "]" | "}"),
            };

            if terminated {
                if closes && span.start == last_byte {
                    // Adjacent closer: extend the already-terminated sentence.
                    last_byte = span.end;
                    current.push(Node::Punct { span });
                    continue;
                }
                // Anything else starts a new sentence, so emit the finished one first.
                finished.push(Node::Sentence {
                    span: Span::new(first_byte, last_byte),
                    parts: std::mem::take(&mut current),
                });
                terminated = false;
            }

            if current.is_empty() {
                first_byte = span.start;
            }
            last_byte = span.end;

            let node = match token {
                Ok(Tok::Word | Tok::Number) => Node::Word { span },
                Ok(Tok::SentenceEnd) => {
                    terminated = true;
                    Node::Punct { span }
                }
                Ok(Tok::Quote | Tok::Punct) | Err(()) => Node::Punct { span },
            };
            current.push(node);
        }

        // Emit whatever is left, whether or not it ended with a terminator.
        if !current.is_empty() {
            let span = Span::new(first_byte, last_byte);
            finished.push(Node::Sentence {
                span,
                parts: current,
            });
        }
        Tree::document(finished)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use colorful_core::Node;

    /// Builds a word leaf whose span is `start..end`.
    fn word(start: usize, end: usize) -> Node {
        let span = Span::new(start, end);
        Node::Word { span }
    }

    /// Builds a punctuation leaf whose span is `start..end`.
    fn punct(start: usize, end: usize) -> Node {
        let span = Span::new(start, end);
        Node::Punct { span }
    }

    /// Builds a sentence node spanning `start..end` with the given children.
    fn sentence(start: usize, end: usize, parts: Vec<Node>) -> Node {
        let span = Span::new(start, end);
        Node::Sentence { span, parts }
    }

    /// Parses `text` and returns the sentence list held by the document root.
    fn parse(text: &str) -> Vec<Node> {
        match ProseParser::new().parse(text).root {
            Node::Document(sentences) => sentences,
            _ => unreachable!("root is always a document"),
        }
    }

    // A terminated sentence keeps its words and its final full stop.
    #[test]
    fn single_sentence_words_and_terminator() {
        let expected = vec![sentence(
            0,
            12,
            vec![word(0, 3), word(4, 7), word(8, 11), punct(11, 12)],
        )];
        assert_eq!(parse("The cat sat."), expected);
    }

    // Each terminator closes its own sentence.
    #[test]
    fn splits_on_sentence_terminators() {
        let first = sentence(0, 3, vec![word(0, 2), punct(2, 3)]);
        let second = sentence(4, 7, vec![word(4, 6), punct(6, 7)]);
        assert_eq!(parse("Hi. Go!"), vec![first, second]);
    }

    // Text without a terminator still comes back as a single sentence.
    #[test]
    fn unterminated_text_is_one_sentence() {
        let expected = vec![sentence(0, 11, vec![word(0, 5), word(6, 11)])];
        assert_eq!(parse("hello world"), expected);
    }

    // Apostrophes and hyphens inside a word do not split it.
    #[test]
    fn contractions_and_hyphens_stay_one_word() {
        for (text, len) in [("don't", 5), ("well-being", 10)] {
            assert_eq!(parse(text), vec![sentence(0, len, vec![word(0, len)])]);
        }
    }

    // A quote directly after the terminator belongs to the sentence it closes.
    #[test]
    fn sentence_absorbs_trailing_closing_quote() {
        let quoted = sentence(
            0,
            5,
            vec![punct(0, 1), word(1, 3), punct(3, 4), punct(4, 5)],
        );
        let plain = sentence(6, 9, vec![word(6, 8), punct(8, 9)]);
        assert_eq!(parse("\"Hi.\" Go."), vec![quoted, plain]);
    }

    // A quote separated from the terminator by a space opens the next sentence.
    #[test]
    fn opening_quote_after_terminator_starts_new_sentence() {
        let plain = sentence(0, 3, vec![word(0, 2), punct(2, 3)]);
        let quoted = sentence(
            4,
            9,
            vec![punct(4, 5), word(5, 7), punct(7, 8), punct(8, 9)],
        );
        assert_eq!(parse("Hi. \"Go.\""), vec![plain, quoted]);
    }

    // U+2009 (three bytes in UTF-8) separates tokens without producing one.
    #[test]
    fn unicode_spaces_are_skipped() {
        let expected = vec![sentence(0, 5, vec![word(0, 1), word(4, 5)])];
        assert_eq!(parse("a\u{2009}b"), expected);
    }

    // Quote characters are emitted as their own punctuation leaves.
    #[test]
    fn quotes_are_separate_punctuation() {
        let expected = vec![sentence(
            0,
            4,
            vec![punct(0, 1), word(1, 3), punct(3, 4)],
        )];
        assert_eq!(parse("\"hi\""), expected);
    }

    // Mixed letter/digit tokens, and a decimal number, each form one word leaf.
    #[test]
    fn alphanumeric_words_stay_together() {
        for (text, len) in [("covid19", 7), ("H2O", 3), ("3.5", 3)] {
            assert_eq!(parse(text), vec![sentence(0, len, vec![word(0, len)])]);
        }
    }

    // Number tokens are reported as word leaves alongside ordinary words.
    #[test]
    fn numbers_are_word_nodes() {
        let expected = vec![sentence(
            0,
            10,
            vec![word(0, 1), word(2, 6), word(7, 10)],
        )];
        assert_eq!(parse("I have 3.5"), expected);
    }

    // A non-ASCII letter stays inside the word; spans count bytes, hence 5.
    #[test]
    fn non_ascii_letters_join_words() {
        let expected = vec![sentence(0, 5, vec![word(0, 5)])];
        assert_eq!(parse("café"), expected);
    }

    // Empty and whitespace-only input produce no sentences.
    #[test]
    fn empty_input_is_empty_document() {
        let nothing: Vec<Node> = Vec::new();
        assert_eq!(parse(""), nothing);
        assert_eq!(parse(" \n\t "), nothing);
    }

    /// Collects the spans of every word and punctuation leaf, in document order.
    fn leaf_spans(text: &str) -> Vec<Span> {
        let Node::Document(sentences) = ProseParser::new().parse(text).root else {
            unreachable!();
        };
        sentences
            .into_iter()
            .filter_map(|node| match node {
                Node::Sentence { parts, .. } => Some(parts),
                _ => None,
            })
            .flatten()
            .filter_map(|leaf| match leaf {
                Node::Word { span } | Node::Punct { span } => Some(span),
                _ => None,
            })
            .collect()
    }

    // Runs the robustness check on a thread with a 16 MiB stack and fails if that
    // thread panics.
    #[test]
    fn parsing_is_total_and_spans_are_well_formed() {
        let checker = std::thread::Builder::new()
            .stack_size(16 * 1024 * 1024)
            .spawn(check_total_and_well_formed)
            .expect("spawn checker thread");
        checker
            .join()
            .expect("parser must not panic on adversarial input");
    }

    /// Parses a set of awkward inputs and checks that every leaf span is non-empty,
    /// in bounds, ordered without overlap, and aligned to UTF-8 character boundaries.
    fn check_total_and_well_formed() {
        let long_word = "a".repeat(10_000);
        let inputs: [&str; 14] = [
            "",
            "?!?!?!",
            "\u{1F600}\u{1F4A9}",
            long_word.as_str(),
            "no terminator here",
            "Mix3d 1,000 things\u{2014}and \u{00AB}quotes\u{00BB}.",
            "\t\n \u{00A0}",
            "don''t",
            "....",
            "He said \u{201C}hi\u{201D} to O'Brien.",
            "cafe\u{0301} combining mark",
            "a\u{200D}b zero\u{200B}width",
            "\u{202E}reversed\u{202C} direction marks",
            "z\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}algo text",
        ];
        for input in inputs.iter().copied() {
            let mut prev_end = 0usize;
            for span in leaf_spans(input) {
                assert!(span.start < span.end, "empty span in {input:?}");
                assert!(span.end <= input.len(), "out-of-bounds span in {input:?}");
                assert!(span.start >= prev_end, "overlapping spans in {input:?}");
                assert!(
                    input.is_char_boundary(span.start) && input.is_char_boundary(span.end),
                    "span not on char boundary in {input:?}"
                );
                prev_end = span.end;
            }
        }
    }
}