pub(crate) mod properties;
pub(crate) mod transitions;
use crate::uax29::Action;
use properties::{
ASCII_WORD_BREAK_PROP, WordBreakProperty, is_word_like_strict,
lookup_word_break_property_from_dictionary,
};
use transitions::{State, TABLE, Transition};
/// Configuration for [`tokenize`].
///
/// The struct currently has no fields. It is marked `#[non_exhaustive]`, so code outside
/// this crate has to obtain a value through [`Default`] rather than a struct literal.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, Default)]
pub struct Options {}
/// Compact bit set describing the characters that make up one token.
///
/// The default value has no bits set, which reads as "ASCII-only, not word-like".
/// Values are combined with `|=`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct TokenProperties(u8);

impl TokenProperties {
    /// Bit recording that the token contains a word-like character.
    const WORD_LIKE_MASK: u8 = 1 << 0;
    /// Bit recording that the token contains a non-ASCII character.
    const NON_ASCII_MASK: u8 = 1 << 1;

    /// Property value with only the word-like bit set.
    pub(crate) const WORD_LIKE: Self = Self(Self::WORD_LIKE_MASK);
    /// Property value with only the non-ASCII bit set.
    pub(crate) const NON_ASCII: Self = Self(Self::NON_ASCII_MASK);

    /// Returns `true` when any bit of `mask` is set in this value.
    #[inline]
    fn has_any(&self, mask: u8) -> bool {
        (self.0 & mask) != 0
    }

    /// Returns `true` if the word-like bit is set.
    pub fn is_word_like(&self) -> bool {
        self.has_any(Self::WORD_LIKE_MASK)
    }

    /// Returns `true` if the non-ASCII bit is *not* set.
    pub fn is_ascii(&self) -> bool {
        !self.has_any(Self::NON_ASCII_MASK)
    }
}

impl std::ops::BitOrAssign for TokenProperties {
    /// Merges every bit set in `rhs` into `self`.
    #[inline]
    fn bitor_assign(&mut self, rhs: Self) {
        let Self(extra_bits) = rhs;
        self.0 |= extra_bits;
    }
}
/// Splits `text` at word boundaries and reports every boundary to `on_breakpoint`.
///
/// The tokenizer is a table-driven state machine: the current state and the word-break
/// property of the next character index into `TABLE`, which yields the next state and an
/// `Action` telling the loop whether a boundary precedes that character.
///
/// # Callback
///
/// `on_breakpoint(offset, props)` receives the byte offset of a boundary together with
/// the accumulated [`TokenProperties`] of the text between the previously reported
/// boundary and `offset`. A final call is always made with `offset == text.len()`.
/// Returning `false` from the callback stops tokenization immediately; the return value
/// of the final call is ignored. For empty `text` the callback is never invoked.
///
/// # Parameters
///
/// * `text` - the string to segment.
/// * `_options` - currently unused.
/// * `on_breakpoint` - boundary sink described above.
///
/// # Panics
///
/// Panics if the transition table requests a deferred break (or the text ends in a
/// deferred state) while no candidate boundary has been recorded. That would be a bug in
/// the table, not a property of the input.
pub fn tokenize(
    text: &str,
    _options: Options,
    mut on_breakpoint: impl FnMut(usize, TokenProperties) -> bool,
) {
    if text.is_empty() {
        return;
    }

    let bytes = text.as_bytes();
    let mut state = State::StartOfText;
    // Candidate boundary remembered while the machine is in a "deferred" state, i.e. it
    // needs to see more input before it knows whether that boundary is real.
    let mut deferred_break_pos = None;
    let mut pos = 0;
    // Whether the character immediately before `pos` was a ZWJ (needed for WB3c).
    let mut last_was_zwj = false;
    // Properties of the token currently being built.
    let mut token_props = TokenProperties::default();
    // Properties of the characters seen since `deferred_break_pos`; they are assigned to
    // the current or the next token once the deferral is resolved.
    let mut deferred_props = TokenProperties::default();

    while pos < text.len() {
        // Fast path: inside a run of letters / digits / underscores, further ASCII
        // alphanumerics and `_` are consumed without consulting the table.
        if matches!(
            state,
            State::ALetter | State::Numeric | State::ExtendNumLet | State::HLetter
        ) {
            let scan_start = pos;
            let mut fast_acc: u8 = 0;
            while pos < text.len() && bytes[pos] < 0x80 {
                let info = ASCII_BYTE_INFO[bytes[pos] as usize];
                if info & ASCII_WORD_CONTINUE == 0 {
                    break;
                }
                fast_acc |= info;
                pos += 1;
            }
            if pos > scan_start {
                // Strip the table-internal marker bit before merging into the token.
                token_props.0 |= fast_acc & !ASCII_WORD_CONTINUE;
                // The state after the run depends only on its last byte.
                state = match bytes[pos - 1] {
                    b'0'..=b'9' => State::Numeric,
                    b'_' => State::ExtendNumLet,
                    _ => State::ALetter,
                };
                last_was_zwj = false;
                continue;
            }
        }

        // Decode one character and look up its word-break property and the token
        // properties it contributes.
        let b = bytes[pos];
        let (c, prop, char_len, char_props) = if b < 0x80 {
            (
                b as char,
                ASCII_WORD_BREAK_PROP[b as usize],
                1usize,
                TokenProperties(ASCII_BYTE_INFO[b as usize] & !ASCII_WORD_CONTINUE),
            )
        } else {
            let c = text[pos..]
                .chars()
                .next()
                .expect("`pos` is below `text.len()` and on a char boundary");
            let prop = lookup_word_break_property_from_dictionary(c);
            let mut char_props = TokenProperties::NON_ASCII;
            char_props |= WORD_BREAK_CONTRIB[prop as usize];
            if !char_props.is_word_like() && is_word_like_strict(c) {
                char_props |= TokenProperties::WORD_LIKE;
            }
            (c, prop, c.len_utf8(), char_props)
        };

        let Transition(next_state, action) = TABLE[state as usize][prop as usize];
        match action {
            Action::Break => {
                let boundary = pos;
                pos += char_len;
                // The character is consumed either way, so the machine must advance to
                // the state that describes it. Previously the WB3c path below skipped
                // this, leaving the pre-ZWJ state in place; e.g. a digit following
                // `1 ZWJ <emoji>` was then glued to the emoji by the ASCII fast path.
                state = next_state;
                if last_was_zwj && WordBreakProperty::is_ext_pictographic(c) {
                    // WB3c: ZWJ x Extended_Pictographic - suppress this boundary.
                    last_was_zwj = false;
                    token_props |= char_props;
                    continue;
                }
                last_was_zwj = prop == WordBreakProperty::ZWJ;
                if !on_breakpoint(boundary, std::mem::take(&mut token_props)) {
                    return;
                }
                token_props |= char_props;
            }
            Action::NoBreak => {
                last_was_zwj = false;
                if next_state.is_deferred() {
                    // Remember the earliest candidate boundary of this deferral.
                    if deferred_break_pos.is_none() {
                        deferred_break_pos = Some(pos);
                    }
                    deferred_props |= char_props;
                } else {
                    // The deferral (if any) resolved to "no break": the deferred
                    // characters belong to the current token.
                    if deferred_break_pos.take().is_some() {
                        token_props |= std::mem::take(&mut deferred_props);
                    }
                    token_props |= char_props;
                }
                state = next_state;
                pos += char_len;
            }
            Action::DeferredBreak => {
                // The candidate boundary turned out to be real. `pos` is NOT advanced:
                // the current character is processed again from `next_state`.
                //
                // `last_was_zwj` is deliberately left untouched. It still describes the
                // character right before `pos`, and the reprocessing step needs it to
                // apply WB3c (previously it was cleared here, so `a : ZWJ <emoji>` broke
                // between the ZWJ and the emoji).
                let boundary = deferred_break_pos
                    .take()
                    .expect("DeferredBreak requires a recorded candidate boundary");
                state = next_state;
                if !on_breakpoint(boundary, std::mem::take(&mut token_props)) {
                    return;
                }
                // Characters seen after the boundary start the next token.
                token_props |= std::mem::take(&mut deferred_props);
            }
            Action::Transparent => {
                // The character does not change the state; only track ZWJ for WB3c and
                // attribute its properties to whichever side is currently accumulating.
                last_was_zwj = prop == WordBreakProperty::ZWJ;
                pos += char_len;
                if deferred_break_pos.is_some() {
                    deferred_props |= char_props;
                } else {
                    token_props |= char_props;
                }
            }
        }
    }

    // End of text while a deferral is pending: the candidate boundary is a real one.
    if state.is_deferred() {
        let breakpoint = deferred_break_pos
            .take()
            .expect("a deferred state requires a recorded candidate boundary");
        if !on_breakpoint(breakpoint, std::mem::take(&mut token_props)) {
            return;
        }
        token_props |= std::mem::take(&mut deferred_props);
    }
    _ = on_breakpoint(text.len(), token_props);
}
/// Token properties contributed by a character, indexed by `WordBreakProperty as usize`.
///
/// Only `ALetter`, `HebrewLetter` and `Numeric` contribute the word-like bit; every other
/// property contributes nothing.
const WORD_BREAK_CONTRIB: [TokenProperties; WordBreakProperty::NUM_VARIANTS] = {
    let word_like = TokenProperties::WORD_LIKE;
    let mut contrib = [TokenProperties(0); WordBreakProperty::NUM_VARIANTS];
    contrib[WordBreakProperty::Numeric as usize] = word_like;
    contrib[WordBreakProperty::HebrewLetter as usize] = word_like;
    contrib[WordBreakProperty::ALetter as usize] = word_like;
    contrib
};
/// Marker bit in [`ASCII_BYTE_INFO`] entries: the byte may be consumed by the ASCII fast
/// path of `tokenize`. It is table-internal and is masked out before an entry is merged
/// into a `TokenProperties` value.
const ASCII_WORD_CONTINUE: u8 = 0b1000_0000;

/// Per-byte lookup table for the ASCII range.
///
/// * ASCII letters and digits: `ASCII_WORD_CONTINUE | TokenProperties::WORD_LIKE_MASK`
/// * `_`: `ASCII_WORD_CONTINUE` only
/// * everything else: `0`
const ASCII_BYTE_INFO: [u8; 128] = {
    let mut table = [0u8; 128];
    let mut idx = 0usize;
    while idx < 128 {
        // `idx` is below 128, so the narrowing cast is lossless.
        let byte = idx as u8;
        if byte.is_ascii_alphanumeric() {
            table[idx] = ASCII_WORD_CONTINUE | TokenProperties::WORD_LIKE_MASK;
        } else if byte == b'_' {
            table[idx] = ASCII_WORD_CONTINUE;
        }
        idx += 1;
    }
    table
};
#[cfg(test)]
mod tests {
    use super::{Options, tokenize};
    use crate::uax29::test_helpers::test_against_uax29_break_tests;

    /// Runs the tokenizer against the word-break conformance data file and requires
    /// every case to pass.
    #[test]
    fn test_word_break_against_uax29_tests() {
        let (passed, failed) =
            test_against_uax29_break_tests("testdata/WordBreakTest.txt", |s, breakpoints| {
                tokenize(s, Options::default(), |bp, _props| {
                    breakpoints.push(bp);
                    true
                });
            });
        assert_eq!(
            (1944, 0),
            (passed, failed),
            "{} / {} tests passed",
            passed,
            passed + failed
        );
    }

    /// Checks the reported boundary offsets for a set of hand-written inputs.
    #[test]
    fn tokenizer_sanity() {
        /// Tokenizes `s` and compares the collected byte offsets with `expected`.
        fn assert_breaks(s: &str, expected: Vec<usize>) {
            let mut breakpoints = Vec::new();
            tokenize(s, Options::default(), |bp, _props| {
                breakpoints.push(bp);
                true
            });
            assert_eq!(breakpoints, expected, "input: {:?}", s);
        }

        // Empty input and single characters.
        assert_breaks("", vec![]);
        assert_breaks("a", vec![0, 1]);
        assert_breaks(".", vec![0, 1]);
        assert_breaks("\n", vec![0, 1]);

        // ASCII letter / digit runs.
        assert_breaks("hello", vec![0, 5]);
        assert_breaks("123", vec![0, 3]);
        assert_breaks("abc123", vec![0, 6]);
        assert_breaks("123abc", vec![0, 6]);
        assert_breaks("a1b2", vec![0, 4]);

        // Newlines.
        assert_breaks("\r\n", vec![0, 2]);
        assert_breaks("\r\n\r\n", vec![0, 2, 4]);
        assert_breaks("\r", vec![0, 1]);
        assert_breaks("\n\n", vec![0, 1, 2]);
        assert_breaks("a\r\nb", vec![0, 1, 3, 4]);
        assert_breaks("ab\r\ncd", vec![0, 2, 4, 6]);

        // Three consecutive spaces form a single token. The spaces are written as
        // escapes so that the count cannot be silently altered; with a single space
        // the expected offsets 4 and 5 would lie beyond the end of the input.
        assert_breaks("a\u{0020}\u{0020}\u{0020}c", vec![0, 1, 4, 5]);

        // Punctuation between and after words.
        assert_breaks("e.g. hello", vec![0, 3, 4, 5, 10]);
        assert_breaks("example.com", vec![0, 11]);
        assert_breaks("won't", vec![0, 5]);
        assert_breaks("a_1", vec![0, 3]);
        assert_breaks("_a", vec![0, 2]);
        assert_breaks("can'", vec![0, 3, 4]);
        assert_breaks("can' hi", vec![0, 3, 4, 5, 7]);

        // Hebrew letters with single quotes.
        assert_breaks("א'", vec![0, "א'".len()]);
        assert_breaks("א'א", vec![0, "א'א".len()]);
        assert_breaks("א'\u{2060}א", vec![0, "א'\u{2060}א".len()]);
        assert_breaks("א'a", vec![0, "א'a".len()]);
        assert_breaks("הצ'קרות", vec![0, "הצ'קרות".len()]);
        assert_breaks(
            "לייף אנרג'י",
            vec![0, "לייף".len(), "לייף ".len(), "לייף אנרג'י".len()],
        );

        // Hebrew letters with U+05F4.
        assert_breaks("צה\u{05F4}ל", vec![0, "צה\u{05F4}ל".len()]);
        assert_breaks(
            "אקספרס\u{05F4} מהיום",
            vec![
                0,
                "אקספרס".len(),
                "אקספרס\u{05F4}".len(),
                "אקספרס\u{05F4} ".len(),
                "אקספרס\u{05F4} מהיום".len(),
            ],
        );
        assert_breaks(
            "\u{05F4}אקספרס\u{05F4} מהיום",
            vec![
                0,
                "\u{05F4}".len(),
                "\u{05F4}אקספרס".len(),
                "\u{05F4}אקספרס\u{05F4}".len(),
                "\u{05F4}אקספרס\u{05F4} ".len(),
                "\u{05F4}אקספרס\u{05F4} מהיום".len(),
            ],
        );

        // Emoji, ZWJ sequences and regional indicators.
        assert_breaks("👨\u{200D}👩", vec![0, 11]);
        assert_breaks("👨👩", vec![0, 4, 8]);
        assert_breaks("🇦", vec![0, 4]);
        assert_breaks("🇦🇦", vec![0, 8]);
        assert_breaks("🇦🇦🇦", vec![0, 8, 12]);
        assert_breaks("\u{200d}Ⓜ", vec![0, 6]);
    }

    /// Checks the `is_ascii` flag reported with each boundary.
    #[test]
    fn tokenizer_properties_sanity() {
        /// Tokenizes `s` and compares `(offset, is_ascii)` pairs with `expected`.
        fn assert_props(s: &str, expected: Vec<(usize, bool)>) {
            let mut got: Vec<(usize, bool)> = Vec::new();
            tokenize(s, Options::default(), |bp, props| {
                got.push((bp, props.is_ascii()));
                true
            });
            assert_eq!(got, expected, "input: {:?}", s);
        }

        assert_props("hello", vec![(0, true), (5, true)]);
        assert_props("🛑", vec![(0, true), (4, false)]);
        assert_props("ab🛑", vec![(0, true), (2, true), (6, false)]);
    }

    /// Tokenizes `s` and compares `(offset, is_word_like)` pairs with `expected`.
    fn assert_word_like(s: &str, expected: Vec<(usize, bool)>) {
        let mut got: Vec<(usize, bool)> = Vec::new();
        tokenize(s, Options::default(), |bp, props| {
            got.push((bp, props.is_word_like()));
            true
        });
        assert_eq!(got, expected, "input: {:?}", s);
    }

    /// Checks the word-like flag for ASCII-only inputs.
    #[test]
    fn tokenizer_word_like_ascii_sanity() {
        assert_word_like("hello", vec![(0, false), (5, true)]);
        assert_word_like("123", vec![(0, false), (3, true)]);
        assert_word_like("abc123", vec![(0, false), (6, true)]);
        assert_word_like("won't", vec![(0, false), (5, true)]);
        assert_word_like("___", vec![(0, false), (3, false)]);
        // Three spaces, written as escapes; a one-byte input could never report a
        // boundary at offset 3.
        assert_word_like("\u{0020}\u{0020}\u{0020}", vec![(0, false), (3, false)]);
        assert_word_like("!!!", vec![(0, false), (1, false), (2, false), (3, false)]);
    }

    /// Checks the word-like flag for single non-ASCII characters.
    #[test]
    fn tokenizer_word_like_strict_sanity() {
        assert_word_like("ש", vec![(0, false), (2, true)]);
        assert_word_like("中", vec![(0, false), (3, true)]);
        assert_word_like("々", vec![(0, false), (3, true)]);
        assert_word_like("①", vec![(0, false), (3, true)]);
        assert_word_like("अ", vec![(0, false), (3, true)]);
        assert_word_like("ก", vec![(0, false), (3, true)]);
        assert_word_like("👍", vec![(0, false), (4, true)]);
        assert_word_like("リ", vec![(0, false), (3, true)]);
        assert_word_like("ー", vec![(0, false), (3, false)]);
        assert_word_like("\u{05F4}", vec![(0, false), ("\u{05F4}".len(), true)]);
    }

    /// Checks that properties collected while a boundary decision is pending end up on
    /// the token the characters actually belong to.
    #[test]
    fn deferred_break_does_not_misattribute_props() {
        let s = "אקספרס\u{05F4} ";
        assert_word_like(
            s,
            vec![
                (0, false),
                ("אקספרס".len(), true),
                ("אקספרס\u{05F4}".len(), true),
                (s.len(), false),
            ],
        );

        let s = "\u{05F4}אקספרס\u{05F4} מהיום";
        assert_word_like(
            s,
            vec![
                (0, false),
                ("\u{05F4}".len(), true),
                ("\u{05F4}אקספרס".len(), true),
                ("\u{05F4}אקספרס\u{05F4}".len(), true),
                ("\u{05F4}אקספרס\u{05F4} ".len(), false),
                (s.len(), true),
            ],
        );
    }
}