pub(crate) mod properties;
pub(crate) mod transitions;
use crate::uax29::Action;
use properties::{
ASCII_WORD_BREAK_PROP, WordBreakProperty, lookup_word_break_property_from_dictionary,
};
use transitions::{State, TABLE, Transition};
/// Configuration for [`tokenize`].
///
/// The struct currently has no fields. It is marked `#[non_exhaustive]` so
/// that settings can be added later without breaking downstream crates;
/// build it with `Options::default()`.
#[derive(Debug, Default, Clone)]
#[non_exhaustive]
pub struct Options {}
/// Finds word boundaries in `text` and appends them to `breakpoints`.
///
/// Boundaries are byte offsets into `text`. Nothing is pushed for empty
/// input; otherwise the final offset pushed is always `text.len()`.
/// `breakpoints` is only appended to, never cleared, so callers that reuse a
/// buffer must clear it themselves.
///
/// The segmentation is driven by the `TABLE` state machine: every character is
/// mapped to a word-break property and the `(state, property)` pair selects
/// the next state together with an [`Action`].
///
/// `_options` is accepted for forward compatibility and is currently unused.
///
/// # Panics
///
/// Panics if the transition table signals a deferred break while no deferred
/// position has been recorded; that would indicate an inconsistent table.
pub fn tokenize(text: &str, breakpoints: &mut Vec<usize>, _options: Options) {
    if text.is_empty() {
        return;
    }

    let bytes = text.as_bytes();
    let mut state = State::StartOfText;
    // Position of a potential break whose fate depends on the following
    // characters (set while the machine sits in a "deferred" state).
    let mut deferred_break_pos: Option<usize> = None;
    let mut pos = 0;
    // True when the most recently consumed character was a ZWJ (needed for
    // the "ZWJ x Extended_Pictographic" rule, WB3c).
    let mut last_was_zwj = false;

    while pos < bytes.len() {
        // Fast path: inside a word-like state, swallow a whole run of ASCII
        // letters, digits and underscores without consulting the table.
        if matches!(
            state,
            State::ALetter | State::Numeric | State::ExtendNumLet | State::HLetter
        ) {
            let scan_start = pos;
            while pos < bytes.len() && bytes[pos] < 0x80 && WORD_CONTINUE[bytes[pos] as usize] {
                pos += 1;
            }
            if pos > scan_start {
                // The state after the run is determined by its last byte only.
                state = match bytes[pos - 1] {
                    b'0'..=b'9' => State::Numeric,
                    b'_' => State::ExtendNumLet,
                    _ => State::ALetter,
                };
                last_was_zwj = false;
                continue;
            }
        }

        // Decode the character at `pos`; ASCII uses a direct lookup table.
        let b = bytes[pos];
        let (c, prop, char_len) = if b < 0x80 {
            (b as char, ASCII_WORD_BREAK_PROP[b as usize], 1usize)
        } else {
            let c = text[pos..]
                .chars()
                .next()
                .expect("pos is below text.len() and on a char boundary");
            (
                c,
                lookup_word_break_property_from_dictionary(c),
                c.len_utf8(),
            )
        };

        let Transition(next_state, action) = TABLE[state as usize][prop as usize];
        match action {
            Action::Break => {
                let boundary = pos;
                pos += char_len;
                // WB3c: never break between a ZWJ and an Extended_Pictographic
                // character, even though the table asked for a break.
                let suppressed = last_was_zwj && WordBreakProperty::is_ext_pictographic(c);
                last_was_zwj = prop == WordBreakProperty::ZWJ;
                // The character has been consumed either way, so the machine
                // must advance; otherwise text following the pictograph would
                // be judged against the state that preceded it.
                state = next_state;
                if !suppressed {
                    breakpoints.push(boundary);
                }
            }
            Action::NoBreak => {
                last_was_zwj = false;
                if next_state.is_deferred() {
                    // Remember only the first position of a deferred stretch.
                    if deferred_break_pos.is_none() {
                        deferred_break_pos = Some(pos);
                    }
                } else {
                    deferred_break_pos = None;
                }
                state = next_state;
                pos += char_len;
            }
            Action::DeferredBreak => {
                // The pending break is confirmed. The current character is NOT
                // consumed here: it is dispatched again from `next_state` on
                // the next iteration. `last_was_zwj` is therefore left as-is
                // so WB3c still applies to that re-dispatched character.
                let boundary = deferred_break_pos
                    .take()
                    .expect("a deferred break requires a recorded position");
                state = next_state;
                breakpoints.push(boundary);
            }
            Action::Transparent => {
                // Characters that do not affect the state are skipped, but a
                // ZWJ must be remembered for WB3c.
                last_was_zwj = prop == WordBreakProperty::ZWJ;
                pos += char_len;
            }
        }
    }

    // Input ended while a break was still pending: it becomes a real break.
    if state.is_deferred() {
        breakpoints.push(
            deferred_break_pos
                .take()
                .expect("a deferred state requires a recorded position"),
        );
    }
    breakpoints.push(text.len());
}
/// Lookup table over the 128 ASCII byte values: an entry is `true` exactly
/// for ASCII letters, ASCII digits and `_`. It is consulted by the
/// tokenizer's ASCII fast path.
const WORD_CONTINUE: [bool; 128] = {
    let mut table = [false; 128];
    let mut idx = 0usize;
    // `for` loops are not available in const evaluation, hence the `while`.
    while idx < 128 {
        let byte = idx as u8;
        table[idx] = byte.is_ascii_alphanumeric() || byte == b'_';
        idx += 1;
    }
    table
};
#[cfg(test)]
mod tests {
    use super::{Options, tokenize};
    use crate::uax29::test_helpers::test_against_uax29_break_tests;

    /// Runs the tokenizer over every case in `testdata/WordBreakTest.txt` and
    /// requires all 1944 of them to pass.
    #[test]
    fn test_word_break_against_uax29_tests() {
        let (passed, failed) =
            test_against_uax29_break_tests("testdata/WordBreakTest.txt", |s, breakpoints| {
                tokenize(s, breakpoints, Options::default())
            });
        assert_eq!(
            (1944, 0),
            (passed, failed),
            "{} / {} tests passed",
            passed,
            passed + failed
        );
    }

    /// Hand-written spot checks; expectations are byte offsets.
    #[test]
    fn tokenizer_sanity() {
        fn assert_breaks(s: &str, expected: Vec<usize>) {
            let mut breakpoints = Vec::new();
            tokenize(s, &mut breakpoints, Options::default());
            assert_eq!(breakpoints, expected, "input: {:?}", s);
        }

        // Empty and single-character inputs.
        assert_breaks("", vec![]);
        assert_breaks("a", vec![0, 1]);
        assert_breaks(".", vec![0, 1]);
        assert_breaks("\n", vec![0, 1]);

        // Runs of ASCII letters and digits stay together.
        assert_breaks("hello", vec![0, 5]);
        assert_breaks("123", vec![0, 3]);
        assert_breaks("abc123", vec![0, 6]);
        assert_breaks("123abc", vec![0, 6]);
        assert_breaks("a1b2", vec![0, 4]);

        // Line endings.
        assert_breaks("\r\n", vec![0, 2]);
        assert_breaks("\r\n\r\n", vec![0, 2, 4]);
        assert_breaks("\r", vec![0, 1]);
        assert_breaks("\n\n", vec![0, 1, 2]);
        assert_breaks("a\r\nb", vec![0, 1, 3, 4]);
        assert_breaks("ab\r\ncd", vec![0, 2, 4, 6]);

        // A three-byte space between two letters. The separator is written as
        // an escape so the non-ASCII character is visible in the source.
        assert_breaks("a\u{3000}c", vec![0, 1, 4, 5]);

        // Punctuation inside and at the end of words.
        assert_breaks("e.g. hello", vec![0, 3, 4, 5, 10]);
        assert_breaks("example.com", vec![0, 11]);
        assert_breaks("won't", vec![0, 5]);
        assert_breaks("a_1", vec![0, 3]);
        assert_breaks("_a", vec![0, 2]);
        assert_breaks("can'", vec![0, 3, 4]);
        assert_breaks("can' hi", vec![0, 3, 4, 5, 7]);

        // Hebrew letters followed by a single quote.
        assert_breaks("א'", vec![0, "א'".len()]);
        assert_breaks("א'א", vec![0, "א'א".len()]);
        assert_breaks("א'\u{2060}א", vec![0, "א'\u{2060}א".len()]);
        assert_breaks("א'a", vec![0, "א'a".len()]);
        assert_breaks("הצ'קרות", vec![0, "הצ'קרות".len()]);
        assert_breaks(
            "לייף אנרג'י",
            vec![0, "לייף".len(), "לייף ".len(), "לייף אנרג'י".len()],
        );

        // Emoji joined by ZWJ, and regional-indicator pairs.
        assert_breaks("👨\u{200D}👩", vec![0, 11]);
        assert_breaks("👨👩", vec![0, 4, 8]);
        assert_breaks("🇦", vec![0, 4]);
        assert_breaks("🇦🇦", vec![0, 8]);
        assert_breaks("🇦🇦🇦", vec![0, 8, 12]);
        assert_breaks("\u{200d}Ⓜ", vec![0, 6]);
    }
}