kham_core/sentence.rs
1//! Thai sentence segmentation.
2//!
3//! Splits Thai (and mixed-script) text into sentences by detecting sentence-ending
4//! delimiters: Thai terminators (`๚` `๛`), Paiyannoi (`ฯ` — but not in `ฯลฯ`),
5//! universal punctuation (`!` `?` `.`), and newlines.
6//!
7//! ## Delimiters
8//!
9//! | Char | Unicode | Rule |
10//! |------|---------|------|
11//! | `๚` | U+0E5A | Always ends a sentence |
12//! | `๛` | U+0E5B | Always ends a sentence |
13//! | `ฯ` | U+0E2F | Ends a sentence unless it is the first or last character of `ฯลฯ` |
14//! | `\n` | U+000A | Always ends a sentence |
15//! | `!` | U+0021 | Always ends a sentence |
16//! | `?` | U+003F | Always ends a sentence |
17//! | `.` | U+002E | Ends a sentence when not a decimal point and followed by whitespace or end-of-string |
18//!
19//! ## No-split cases
20//!
21//! - `ฯลฯ` ("etc.") — neither `ฯ` character in the sequence is a split point.
22//! - `3.14` — a period between two ASCII digits is a decimal point, not a boundary.
23//! - `A.B.C.` — a period not followed by whitespace or end-of-string is not a boundary
24//! (handles abbreviations like `ก.ค.`, `พ.ศ.`, `A.D.`).
25//!
26//! # Examples
27//!
28//! ```rust
29//! use kham_core::sentence::split_sentences;
30//!
31//! let sents = split_sentences("วันนี้อากาศดี\nพรุ่งนี้จะฝนตก");
32//! assert_eq!(sents.len(), 2);
33//! assert_eq!(sents[0].text.trim(), "วันนี้อากาศดี");
34//! assert_eq!(sents[1].text.trim(), "พรุ่งนี้จะฝนตก");
35//!
36//! // ฯลฯ is not a sentence boundary
37//! let sents2 = split_sentences("กินข้าวฯลฯทุกวัน");
38//! assert_eq!(sents2.len(), 1);
39//! ```
40
41use alloc::vec::Vec;
42use core::ops::Range;
43
44// ---------------------------------------------------------------------------
45// Public types
46// ---------------------------------------------------------------------------
47
/// One sentence located inside a source string.
///
/// Nothing is copied: `text` borrows directly from the input. The slice keeps
/// the delimiter that closed the sentence (when there was one) together with
/// any whitespace around it, so use `.text.trim()` when only the words are
/// wanted. `span` locates the slice in bytes and `char_span` locates it in
/// Unicode scalar values, both relative to the start of the source string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Sentence<'a> {
    /// Borrowed slice of the source text, terminator included.
    pub text: &'a str,
    /// Half-open byte range `[start, end)` of `text` within the source string.
    pub span: Range<usize>,
    /// Half-open range `[start, end)` of `text` counted in Unicode scalar values.
    pub char_span: Range<usize>,
}
63
64// ---------------------------------------------------------------------------
65// Segmenter
66// ---------------------------------------------------------------------------
67
/// Sentence splitter for Thai and mixed-script text.
///
/// The type carries no state today. Once options become necessary (for
/// example, switching newline splitting on or off) a builder API is planned.
///
/// ```rust
/// use kham_core::sentence::SentenceSegmenter;
///
/// let seg = SentenceSegmenter::new();
/// let sents = seg.split("กินข้าว\nดื่มน้ำ");
/// assert_eq!(sents.len(), 2);
/// ```
#[derive(Debug, Default, Clone)]
pub struct SentenceSegmenter;
82
83impl SentenceSegmenter {
84 /// Create a sentence segmenter with default settings.
85 pub fn new() -> Self {
86 Self
87 }
88
89 /// Split `text` into sentences.
90 ///
91 /// Empty and whitespace-only spans between delimiters are silently dropped.
92 /// The returned slices are zero-copy references into `text`.
93 pub fn split<'a>(&self, text: &'a str) -> Vec<Sentence<'a>> {
94 if text.is_empty() {
95 return Vec::new();
96 }
97
98 // Collect (byte_offset, char) pairs once for O(1) lookahead/lookbehind.
99 let chars: Vec<(usize, char)> = text.char_indices().collect();
100 let n = chars.len();
101
102 let mut result = Vec::new();
103 let mut seg_byte_start = 0usize;
104 let mut seg_char_start = 0usize;
105
106 for i in 0..n {
107 if !is_boundary(&chars, i) {
108 continue;
109 }
110
111 let byte_end = if i + 1 < n {
112 chars[i + 1].0
113 } else {
114 text.len()
115 };
116 let char_end = i + 1;
117
118 let slice = &text[seg_byte_start..byte_end];
119 if !slice.trim().is_empty() {
120 result.push(Sentence {
121 text: slice,
122 span: seg_byte_start..byte_end,
123 char_span: seg_char_start..char_end,
124 });
125 }
126 seg_byte_start = byte_end;
127 seg_char_start = char_end;
128 }
129
130 // Remaining text after the last delimiter.
131 if seg_byte_start < text.len() {
132 let slice = &text[seg_byte_start..];
133 if !slice.trim().is_empty() {
134 result.push(Sentence {
135 text: slice,
136 span: seg_byte_start..text.len(),
137 char_span: seg_char_start..n,
138 });
139 }
140 }
141
142 result
143 }
144}
145
146// ---------------------------------------------------------------------------
147// Free function
148// ---------------------------------------------------------------------------
149
150/// Split `text` into sentences.
151///
152/// Convenience wrapper over [`SentenceSegmenter::split`].
153///
154/// # Examples
155///
156/// ```rust
157/// use kham_core::sentence::split_sentences;
158///
159/// let sents = split_sentences("กินข้าว\nดื่มน้ำ");
160/// assert_eq!(sents.len(), 2);
161/// assert_eq!(sents[0].text.trim(), "กินข้าว");
162/// assert_eq!(sents[1].text.trim(), "ดื่มน้ำ");
163/// ```
164pub fn split_sentences(text: &str) -> Vec<Sentence<'_>> {
165 SentenceSegmenter::new().split(text)
166}
167
168// ---------------------------------------------------------------------------
169// Boundary detection
170// ---------------------------------------------------------------------------
171
/// Return `true` if `chars[i]` is the last character of a sentence.
///
/// `chars` holds the whole input as `(byte_offset, char)` pairs; only the
/// `char` component is inspected here. The decision looks at most two
/// characters to either side of `i`.
///
/// Rules:
/// - `๚` (U+0E5A), `๛` (U+0E5B), `\n`, `!` and `?` always end a sentence.
/// - `ฯ` (U+0E2F) ends a sentence unless it is the first or last character of
///   the abbreviation `ฯลฯ`.
/// - `.` ends a sentence when it is followed by whitespace or end-of-input and
///   is not a decimal point (an ASCII digit on *both* sides).
///
/// # Panics
///
/// Panics if `i >= chars.len()`.
fn is_boundary(chars: &[(usize, char)], i: usize) -> bool {
    const PAIYANNOI: char = '\u{0E2F}'; // ฯ
    const LO_LING: char = '\u{0E25}'; // ล

    // Checked neighbour lookup: `None` when the index falls outside the input.
    let at = |idx: usize| chars.get(idx).map(|&(_, ch)| ch);
    let prev = i.checked_sub(1).and_then(at);
    let next = at(i + 1);

    match chars[i].1 {
        // Thai terminators, line breaks and universal sentence-ending
        // punctuation are unconditional boundaries.
        '\u{0E5A}' | '\u{0E5B}' | '\n' | '!' | '?' => true,

        // ฯลฯ = U+0E2F U+0E25 U+0E2F ("etc."): neither ฯ of that sequence is a
        // split point; any other ฯ is.
        PAIYANNOI => {
            let opens_etc = next == Some(LO_LING) && at(i + 2) == Some(PAIYANNOI);
            let closes_etc =
                prev == Some(LO_LING) && i.checked_sub(2).and_then(at) == Some(PAIYANNOI);
            !(opens_etc || closes_etc)
        }

        // Period.
        // - A decimal point needs a digit on BOTH sides. A digit on only one
        //   side (e.g. "ปี 2567. ") must not suppress the boundary, so the two
        //   digit tests are AND-ed before negating.
        // - A period followed by anything other than whitespace / end-of-input
        //   is an interior dot (ก.ค., A.B.C.) and never a boundary.
        '.' => {
            let is_decimal_point = prev.is_some_and(|p| p.is_ascii_digit())
                && next.is_some_and(|n| n.is_ascii_digit());
            let ends_token = match next {
                None => true,
                Some(following) => following.is_whitespace(),
            };
            !is_decimal_point && ends_token
        }

        _ => false,
    }
}
217
218// ---------------------------------------------------------------------------
219// Tests
220// ---------------------------------------------------------------------------
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Trimmed text of every sentence, for compact whole-result comparisons.
    fn trimmed<'a>(sents: &'a [Sentence<'a>]) -> Vec<&'a str> {
        sents.iter().map(|s| s.text.trim()).collect()
    }

    // ── basic splitting ───────────────────────────────────────────────────────

    #[test]
    fn empty_returns_empty() {
        assert!(split_sentences("").is_empty());
    }

    #[test]
    fn whitespace_only_returns_empty() {
        assert!(split_sentences(" \n\t ").is_empty());
    }

    #[test]
    fn single_sentence_no_delimiter() {
        let sents = split_sentences("กินข้าวกับปลา");
        assert_eq!(trimmed(&sents), &["กินข้าวกับปลา"]);
    }

    #[test]
    fn split_on_newline() {
        let sents = split_sentences("กินข้าว\nดื่มน้ำ");
        assert_eq!(trimmed(&sents), &["กินข้าว", "ดื่มน้ำ"]);
    }

    #[test]
    fn double_newline_no_empty_sentence() {
        // The empty span between two newlines must be dropped.
        let sents = split_sentences("กินข้าว\n\nดื่มน้ำ");
        assert_eq!(trimmed(&sents), &["กินข้าว", "ดื่มน้ำ"]);
    }

    #[test]
    fn trailing_newline_no_empty_sentence() {
        let sents = split_sentences("กินข้าว\n");
        assert_eq!(trimmed(&sents), &["กินข้าว"]);
        // The raw slice keeps its terminator.
        assert_eq!(sents[0].text, "กินข้าว\n");
    }

    #[test]
    fn three_sentences_via_newlines() {
        let sents = split_sentences("ประโยคหนึ่ง\nประโยคสอง\nประโยคสาม");
        assert_eq!(trimmed(&sents), &["ประโยคหนึ่ง", "ประโยคสอง", "ประโยคสาม"]);
    }

    // ── Thai terminators ──────────────────────────────────────────────────────

    #[test]
    fn angkhankhu_splits() {
        // ๚ (U+0E5A) is the Thai sentence mark; it stays with the sentence it ends.
        let sents = split_sentences("กินข้าว๚ดื่มน้ำ");
        assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
        assert_eq!(sents[0].text, "กินข้าว๚");
        assert_eq!(sents[1].text, "ดื่มน้ำ");
    }

    #[test]
    fn khomut_splits() {
        // ๛ (U+0E5B) is the Thai chapter/section mark.
        let sents = split_sentences("บทที่หนึ่ง๛บทที่สอง");
        assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
        assert_eq!(sents[0].text, "บทที่หนึ่ง๛");
        assert_eq!(sents[1].text, "บทที่สอง");
    }

    // ── Paiyannoi ฯ rules ─────────────────────────────────────────────────────

    #[test]
    fn paiyannoi_alone_splits() {
        // Standalone ฯ (not part of ฯลฯ) ends the sentence.
        let sents = split_sentences("กินข้าวฯดื่มน้ำ");
        assert_eq!(sents.len(), 2, "ฯ should split: {:?}", trimmed(&sents));
        assert_eq!(sents[0].text, "กินข้าวฯ");
    }

    #[test]
    fn ฯลฯ_does_not_split() {
        // ฯลฯ is an abbreviation ("etc.") — must not be treated as a sentence boundary.
        let sents = split_sentences("กินข้าวฯลฯทุกวัน");
        assert_eq!(
            sents.len(),
            1,
            "ฯลฯ should not split: {:?}",
            trimmed(&sents)
        );
    }

    #[test]
    fn ฯลฯ_in_middle_preserves_two_sentences() {
        // ฯลฯ in the middle of a sentence, split by newline at end.
        let sents = split_sentences("กินข้าวฯลฯทุกวัน\nพรุ่งนี้จะฝน");
        assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
        assert!(
            trimmed(&sents)[0].contains("ฯลฯ"),
            "ฯลฯ should remain in first sentence"
        );
    }

    // ── period rules ─────────────────────────────────────────────────────────

    #[test]
    fn period_before_space_splits() {
        let sents = split_sentences("Hello world. Goodbye world.");
        assert_eq!(trimmed(&sents), &["Hello world.", "Goodbye world."]);
    }

    #[test]
    fn period_at_end_of_string_does_not_add_empty_sentence() {
        let sents = split_sentences("Hello world.");
        assert_eq!(trimmed(&sents), &["Hello world."]);
    }

    #[test]
    fn decimal_point_does_not_split() {
        // Period between two ASCII digits is a decimal point.
        let sents = split_sentences("ราคา3.14บาท");
        assert_eq!(
            sents.len(),
            1,
            "decimal point should not split: {:?}",
            trimmed(&sents)
        );
    }

    #[test]
    fn abbreviation_dot_not_followed_by_space_does_not_split() {
        // ก.ค. — period not followed by whitespace or end: not a boundary.
        let sents = split_sentences("วันที่5ก.ค.2567");
        assert_eq!(
            sents.len(),
            1,
            "abbreviation dots should not split: {:?}",
            trimmed(&sents)
        );
    }

    // ── exclamation and question marks ────────────────────────────────────────

    #[test]
    fn exclamation_splits() {
        let sents = split_sentences("ดีมาก!แย่มาก");
        assert_eq!(trimmed(&sents), &["ดีมาก!", "แย่มาก"]);
    }

    #[test]
    fn question_splits() {
        let sents = split_sentences("ไปไหน?ไปตลาด");
        assert_eq!(trimmed(&sents), &["ไปไหน?", "ไปตลาด"]);
    }

    // ── span correctness ──────────────────────────────────────────────────────

    #[test]
    fn byte_spans_are_valid_utf8_slices() {
        let text = "กินข้าว\nดื่มน้ำ";
        for s in split_sentences(text) {
            // Slicing must not panic (spans fall on char boundaries) and must
            // reproduce the sentence text exactly.
            assert_eq!(s.text, &text[s.span]);
        }
    }

    #[test]
    fn char_spans_match_text() {
        let text = "กินข้าว\nดื่มน้ำ";
        let all_chars: Vec<char> = text.chars().collect();
        for s in split_sentences(text) {
            let by_char: alloc::string::String = all_chars[s.char_span.clone()].iter().collect();
            assert_eq!(s.text, by_char, "char_span mismatch for '{}'", s.text);
        }
    }

    #[test]
    fn spans_cover_full_input() {
        // With no whitespace-only gaps in the input, concatenating the raw
        // sentence slices must give back the original text.
        let text = "ประโยคหนึ่ง\nประโยคสอง\nประโยคสาม";
        let sents = split_sentences(text);
        let reconstructed: alloc::string::String = sents.iter().map(|s| s.text).collect();
        assert_eq!(reconstructed, text);
    }

    // ── mixed script ──────────────────────────────────────────────────────────

    #[test]
    fn mixed_thai_english_newline() {
        // "กินข้าว\n" → 1; "Hello world." → 2 (period before whitespace);
        // the lone "\n" that follows is whitespace-only and dropped; "ดื่มน้ำ" → 3.
        let sents = split_sentences("กินข้าว\nHello world.\nดื่มน้ำ");
        assert_eq!(trimmed(&sents), &["กินข้าว", "Hello world.", "ดื่มน้ำ"]);
    }

    // ── SentenceSegmenter struct ──────────────────────────────────────────────

    #[test]
    fn segmenter_new_and_default_agree() {
        let text = "กินข้าว\nดื่มน้ำ";
        let via_new = SentenceSegmenter::new().split(text);
        let via_default = SentenceSegmenter::default().split(text);
        let via_free_fn = split_sentences(text);
        assert_eq!(via_new, via_default);
        assert_eq!(via_new, via_free_fn);
    }
}