pub(crate) mod properties;
pub(crate) mod transitions;
use crate::uax29::Action;
use properties::{ASCII_SENTENCE_BREAK_PROP, lookup_sentence_break_property};
use transitions::{State, TRANSITION_TABLE, Transition};
/// Configuration for [`tokenize`].
///
/// The struct currently has no fields. It is marked `#[non_exhaustive]`, so
/// code outside this crate cannot build it with a struct literal and should
/// obtain a value through [`Default::default`] instead.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct Options {}
/// Finds sentence boundaries in `text` and appends their byte offsets to
/// `breakpoints`.
///
/// The text is scanned one character at a time. The character's sentence-break
/// property and the current state select an entry of `TRANSITION_TABLE`, which
/// supplies the next state together with the [`Action`] to perform.
///
/// * Empty input returns immediately and leaves `breakpoints` untouched.
/// * For non-empty input the last offset appended is always `text.len()`.
/// * Offsets are appended; anything already in `breakpoints` is kept.
///
/// `_options` is accepted but not read.
///
/// # Panics
///
/// Panics if a deferred break has to be emitted (either via
/// `Action::DeferredBreak` or because the scan ends in a deferred state) while
/// no candidate position has been recorded. Presumably the transition table
/// never produces that situation.
pub fn tokenize(text: &str, breakpoints: &mut Vec<usize>, _options: Options) {
    let total_len = text.len();
    if total_len == 0 {
        return;
    }

    let raw = text.as_bytes();
    let mut current = State::StartOfText;
    // Offset of the earliest character seen since entering a deferred state;
    // this is where the boundary goes if the deferred break is confirmed.
    let mut pending: Option<usize> = None;
    let mut offset = 0;

    while offset < total_len {
        let lead = raw[offset];
        // ASCII bytes are classified through a direct table lookup; anything
        // else is decoded to a full `char` first.
        let (class, width) = if lead >= 0x80 {
            let ch = text[offset..].chars().next().unwrap();
            (lookup_sentence_break_property(ch), ch.len_utf8())
        } else {
            (ASCII_SENTENCE_BREAK_PROP[lead as usize], 1usize)
        };

        let Transition(target, action) = TRANSITION_TABLE[current as usize][class as usize];
        match action {
            // The character is skipped without touching the state.
            Action::Transparent => offset += width,
            // Boundary immediately before the current character.
            Action::Break => {
                breakpoints.push(offset);
                current = target;
                offset += width;
            }
            Action::NoBreak => {
                if target.is_deferred() {
                    // Keep the earliest candidate; later characters in the
                    // same deferred run must not overwrite it.
                    pending.get_or_insert(offset);
                } else {
                    pending = None;
                }
                current = target;
                offset += width;
            }
            // Emit the remembered boundary and re-examine the same character
            // from the new state (the offset is deliberately not advanced).
            Action::DeferredBreak => {
                let boundary = pending.take().unwrap();
                breakpoints.push(boundary);
                current = target;
            }
        }
    }

    // Input ended while a deferred decision was still open: emit the
    // remembered boundary.
    if current.is_deferred() {
        let boundary = pending.take().unwrap();
        breakpoints.push(boundary);
    }
    breakpoints.push(total_len);
}
#[cfg(test)]
mod tests {
    use super::{Options, tokenize};
    use crate::uax29::test_helpers::test_against_uax29_break_tests;

    /// Runs the tokenizer over every case in `testdata/SentenceBreakTest.txt`
    /// and requires exactly 512 passes and no failures.
    #[test]
    fn test_sentence_break_against_uax29_tests() {
        let (passed, failed) =
            test_against_uax29_break_tests("testdata/SentenceBreakTest.txt", |s, breakpoints| {
                tokenize(s, breakpoints, Options::default())
            });
        assert_eq!(
            (512, 0),
            (passed, failed),
            "{} / {} tests passed",
            passed,
            passed + failed
        );
    }

    /// Hand-written spot checks. All expected offsets are byte offsets.
    #[test]
    fn tokenizer_sanity() {
        /// Tokenizes `s` into a fresh vector and compares it with `expected`.
        fn assert_breaks(s: &str, expected: Vec<usize>) {
            let mut breakpoints = Vec::new();
            tokenize(s, &mut breakpoints, Options::default());
            assert_eq!(breakpoints, expected, "input: {:?}", s);
        }

        // Empty input yields no breakpoints at all.
        assert_breaks("", vec![]);

        // A single sentence is bracketed by 0 and its byte length.
        assert_breaks("a", vec![0, 1]);
        assert_breaks(".", vec![0, 1]);
        assert_breaks("Hello world", vec![0, 11]);

        // Line terminators: the break comes after the terminator, and CR LF
        // is never split.
        assert_breaks("\r\n", vec![0, 2]);
        assert_breaks("a\nb", vec![0, 2, 3]);
        assert_breaks("a\r\nb", vec![0, 3, 4]);
        assert_breaks("a\rb", vec![0, 2, 3]);

        // A combining mark (U+0308, two bytes in UTF-8) does not introduce a
        // boundary.
        assert_breaks("a\u{0308}b", vec![0, 4]);

        // Periods that do not end a sentence.
        assert_breaks("3.4", vec![0, 3]);
        assert_breaks("U.S.A.", vec![0, 6]);
        assert_breaks("U.S.", vec![0, 4]);
        assert_breaks("c.D", vec![0, 3]);
        assert_breaks("c.d", vec![0, 3]);
        assert_breaks("etc. the", vec![0, 8]);
        assert_breaks("the resp. leaders are", vec![0, 21]);
        assert_breaks("etc.)'\u{a0}the", vec![0, 11]);
        assert_breaks("etc.)\u{2019}\u{a0}\u{2018}(the", vec![0, 17]);

        // A terminator directly followed by more punctuation.
        assert_breaks(".,", vec![0, 2]);
        assert_breaks("..", vec![0, 2]);
        assert_breaks("!,", vec![0, 2]);
        assert_breaks("!.", vec![0, 2]);

        // Genuine sentence breaks: the boundary falls after the trailing
        // close punctuation and whitespace.
        assert_breaks("Hello. World", vec![0, 7, 12]);
        assert_breaks("Hello!) World", vec![0, 8, 13]);
        // Two spaces after the period: the break comes after both of them.
        assert_breaks("Hello.  World", vec![0, 8, 13]);
        assert_breaks("Hello.\nWorld", vec![0, 7, 12]);
        assert_breaks("Hello! world", vec![0, 7, 12]);
        assert_breaks(
            "She said \"See spot run.\" John shook his head.",
            vec![0, 25, 45],
        );
    }
}