1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
pub(crate) mod properties;
pub(crate) mod transitions;
use crate::uax29::Action;
use properties::{ASCII_SENTENCE_BREAK_PROP, lookup_sentence_break_property};
use transitions::{State, TRANSITION_TABLE, Transition};
/// Configuration for [`tokenize`].
///
/// The struct currently has no fields. It is marked `#[non_exhaustive]` so
/// that fields can be added later without breaking downstream code; callers
/// outside this crate should therefore build it with `Options::default()`
/// rather than a struct literal.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct Options {}
/// Walks `text` and reports sentence boundaries to `on_breakpoint` as byte
/// offsets into `text`.
///
/// Every character is classified (ASCII bytes through
/// `ASCII_SENTENCE_BREAK_PROP`, everything else through
/// `lookup_sentence_break_property`) and fed through `TRANSITION_TABLE`. The
/// resulting `Action` decides whether a boundary is reported before the
/// character, whether the decision is postponed, or whether the character is
/// simply consumed.
///
/// * Empty input produces no callbacks at all.
/// * For non-empty input the last callback is `text.len()` (SB2), unless the
///   callback asked to stop earlier.
/// * `on_breakpoint` returns `true` to keep going; returning `false` stops
///   tokenization immediately and no further offsets are reported.
///
/// `_options` is currently unused.
///
/// # Panics
///
/// Panics if a deferred break has to be reported but no deferred position was
/// recorded, which would indicate an inconsistent transition table.
pub fn tokenize(text: &str, _options: Options, mut on_breakpoint: impl FnMut(usize) -> bool) {
    let bytes = text.as_bytes();
    let end = bytes.len();
    if end == 0 {
        return;
    }

    let mut state = State::StartOfText;
    // Offset of the first character that moved the machine into a deferred
    // state; this is where the boundary goes if the deferral does not pan out.
    let mut pending: Option<usize> = None;
    let mut cursor = 0;

    while cursor < end {
        // Fast path for ASCII: a direct table lookup and a fixed width of 1.
        let lead = bytes[cursor];
        let (prop, width) = if lead.is_ascii() {
            (ASCII_SENTENCE_BREAK_PROP[lead as usize], 1usize)
        } else {
            let ch = text[cursor..].chars().next().unwrap();
            (lookup_sentence_break_property(ch), ch.len_utf8())
        };

        let Transition(next_state, action) = TRANSITION_TABLE[state as usize][prop as usize];
        match action {
            // The character is consumed but the state is left as it is.
            Action::Transparent => cursor += width,
            Action::NoBreak => {
                // Keep the earliest candidate while we stay deferred; forget
                // it as soon as we land in a non-deferred state.
                pending = if next_state.is_deferred() {
                    pending.or(Some(cursor))
                } else {
                    None
                };
                state = next_state;
                cursor += width;
            }
            Action::Break => {
                // Boundary immediately before the current character.
                state = next_state;
                if !on_breakpoint(cursor) {
                    return;
                }
                cursor += width;
            }
            Action::DeferredBreak => {
                // Report the postponed boundary. The cursor is deliberately
                // left alone so the current character is examined again in
                // the new state on the next iteration.
                let boundary = pending.take().unwrap();
                state = next_state;
                if !on_breakpoint(boundary) {
                    return;
                }
            }
        }
    }

    // The text ended while a decision was still postponed: the deferral
    // failed, so the candidate boundary is confirmed.
    if state.is_deferred() && !on_breakpoint(pending.unwrap()) {
        return;
    }

    // SB2: Any ÷ eot (break at end of text)
    let _ = on_breakpoint(end);
}
/// Unit tests for the sentence tokenizer: the official UAX #29 conformance
/// file plus a set of hand-written cases, one group per segmentation rule.
#[cfg(test)]
mod tests {
    use super::{Options, tokenize};
    use crate::uax29::test_helpers::test_against_uax29_break_tests;

    /// Runs the tokenizer against `testdata/SentenceBreakTest.txt` and expects
    /// all 512 cases to pass.
    #[test]
    fn test_sentence_break_against_uax29_tests() {
        let (passed, failed) =
            test_against_uax29_break_tests("testdata/SentenceBreakTest.txt", |s, breakpoints| {
                tokenize(s, Options::default(), |bp| {
                    breakpoints.push(bp);
                    true
                });
            });
        assert_eq!(
            (512, 0),
            (passed, failed),
            "{} / {} tests passed",
            passed,
            passed + failed
        );
    }

    /// Hand-written cases; every expected vector lists byte offsets.
    #[test]
    fn tokenizer_sanity() {
        /// Collects every breakpoint reported for `s` and compares it with
        /// `expected`.
        fn assert_breaks(s: &str, expected: Vec<usize>) {
            let mut breakpoints = Vec::new();
            tokenize(s, Options::default(), |bp| {
                breakpoints.push(bp);
                true
            });
            assert_eq!(breakpoints, expected, "input: {:?}", s);
        }
        // Empty string yields no breakpoints.
        assert_breaks("", vec![]);
        // Non-empty strings break at the start & end.
        assert_breaks("a", vec![0, 1]);
        assert_breaks(".", vec![0, 1]);
        // SB998: don't break within a sentence.
        assert_breaks("Hello world", vec![0, 11]);
        // SB3: CR × LF (don't break between CR and LF)
        assert_breaks("\r\n", vec![0, 2]);
        // SB4: Break after paragraph separators (Sep, CR, LF).
        assert_breaks("a\nb", vec![0, 2, 3]);
        assert_breaks("a\r\nb", vec![0, 3, 4]);
        assert_breaks("a\rb", vec![0, 2, 3]);
        // SB5: Extend and Format are transparent.
        assert_breaks("a\u{0308}b", vec![0, 4]); // a + combining diaeresis + b
        // SB6: ATerm × Numeric — don't break between "." and a digit.
        assert_breaks("3.4", vec![0, 3]);
        // SB7: (Upper | Lower) ATerm × Upper — abbreviations like U.S.A.
        assert_breaks("U.S.A.", vec![0, 6]);
        assert_breaks("U.S.", vec![0, 4]);
        assert_breaks("c.D", vec![0, 3]);
        // SB8: ATerm Close* Sp* × (¬(OLetter|Upper|Lower|ParaSep|SATerm))* Lower
        // Don't break after "." when eventually followed by a lowercase letter.
        assert_breaks("c.d", vec![0, 3]);
        assert_breaks("etc. the", vec![0, 8]);
        assert_breaks("the resp. leaders are", vec![0, 21]);
        // SB8: with Close and Sp between ATerm and Lower.
        assert_breaks("etc.)'\u{a0}the", vec![0, 11]);
        // SB8a: SATerm Close* Sp* × (SContinue | SATerm)
        // Don't break before continuation punctuation after sentence terminators.
        assert_breaks(".,", vec![0, 2]); // ATerm × SContinue
        assert_breaks("..", vec![0, 2]); // ATerm × ATerm
        assert_breaks("!,", vec![0, 2]); // STerm × SContinue
        assert_breaks("!.", vec![0, 2]); // STerm × ATerm
        // SB9/SB10/SB11: Break after sentence terminators,
        // but include trailing Close, Sp, and ParaSep in the sentence.
        assert_breaks("Hello. World", vec![0, 7, 12]);
        assert_breaks("Hello!) World", vec![0, 8, 13]);
        // Two spaces after the period (Sp*). They are written as escapes so
        // that whitespace normalisation cannot collapse them into one and
        // silently turn this into a 12-byte input.
        assert_breaks("Hello.\u{20}\u{20}World", vec![0, 8, 13]);
        assert_breaks("Hello.\nWorld", vec![0, 7, 12]);
        // SB11: STerm breaks even when followed by lowercase.
        assert_breaks("Hello! world", vec![0, 7, 12]);
        // SB8 vs SB11: ATerm followed by OLetter or Upper DOES break (SB8 fails).
        assert_breaks("Hello. World", vec![0, 7, 12]);
        // Figures 3 & 4 from the spec:
        // Figure 3: Forbidden breaks on "." (should NOT break)
        assert_breaks("c.d", vec![0, 3]);
        assert_breaks("3.4", vec![0, 3]);
        assert_breaks("U.S.", vec![0, 4]);
        assert_breaks("the resp. leaders are", vec![0, 21]);
        assert_breaks("etc.)\u{2019}\u{a0}\u{2018}(the", vec![0, 17]);
        // Figure 4: Allowed breaks on "." (SHOULD break)
        assert_breaks(
            "She said \"See spot run.\" John shook his head.",
            vec![0, 25, 45],
        );
    }
}