// kham_core/sentence.rs

//! Thai sentence segmentation.
//!
//! Splits Thai (and mixed-script) text into sentences by detecting sentence-ending
//! delimiters: Thai terminators (`๚` `๛`), Paiyannoi (`ฯ` — but not in `ฯลฯ`),
//! universal punctuation (`!` `?` `.`), and newlines.
//!
//! ## Delimiters
//!
//! | Char | Unicode | Rule |
//! |------|---------|------|
//! | `๚`  | U+0E5A  | Always ends a sentence |
//! | `๛`  | U+0E5B  | Always ends a sentence |
//! | `ฯ`  | U+0E2F  | Ends a sentence unless it is the first or last character of `ฯลฯ` |
//! | `\n` | U+000A  | Always ends a sentence |
//! | `!`  | U+0021  | Always ends a sentence |
//! | `?`  | U+003F  | Always ends a sentence |
//! | `.`  | U+002E  | Ends a sentence when not a decimal point and followed by whitespace or end-of-string |
//!
//! ## No-split cases
//!
//! - `ฯลฯ` ("etc.") — neither `ฯ` character in the sequence is a split point.
//! - `3.14` — a period between two ASCII digits is a decimal point, not a boundary.
//! - `A.B.C.` — a period not followed by whitespace or end-of-string is not a boundary
//!   (handles the interior dots of abbreviations like `ก.ค.`, `พ.ศ.`, `A.D.`).
//!   Only interior dots are protected: the final dot of an abbreviation still
//!   ends a sentence when it is followed by whitespace or end-of-string.
//!
//! # Examples
//!
//! ```rust
//! use kham_core::sentence::split_sentences;
//!
//! let sents = split_sentences("วันนี้อากาศดี\nพรุ่งนี้จะฝนตก");
//! assert_eq!(sents.len(), 2);
//! assert_eq!(sents[0].text.trim(), "วันนี้อากาศดี");
//! assert_eq!(sents[1].text.trim(), "พรุ่งนี้จะฝนตก");
//!
//! // ฯลฯ is not a sentence boundary
//! let sents2 = split_sentences("กินข้าวฯลฯทุกวัน");
//! assert_eq!(sents2.len(), 1);
//! ```
40
41use alloc::vec::Vec;
42use core::ops::Range;
43
44// ---------------------------------------------------------------------------
45// Public types
46// ---------------------------------------------------------------------------
47
/// A sentence extracted from source text.
///
/// `text` is a zero-copy slice of the original input. It includes the
/// terminating delimiter (if any) and surrounding whitespace — call
/// `.text.trim()` to strip those. `span` and `char_span` describe the same
/// slice as offsets into the source string, in bytes and in Unicode scalar
/// values respectively.
///
/// All fields are hashable, so `Hash` is derived alongside `Eq` to allow
/// sentences to be used as set members or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Sentence<'a> {
    /// Zero-copy slice of the source text (includes terminator).
    pub text: &'a str,
    /// Byte offsets `[start, end)` of this sentence in the source string,
    /// i.e. `&source[span.clone()] == text`.
    pub span: Range<usize>,
    /// Unicode scalar-value offsets `[start, end)` of this sentence, i.e.
    /// indices into `source.chars()` rather than into the UTF-8 bytes.
    pub char_span: Range<usize>,
}
63
64// ---------------------------------------------------------------------------
65// Segmenter
66// ---------------------------------------------------------------------------
67
68/// Splits text into sentences.
69///
70/// Currently stateless; a builder API will be added when configurable options
71/// (e.g., toggling newline splitting) are required.
72///
73/// ```rust
74/// use kham_core::sentence::SentenceSegmenter;
75///
76/// let seg = SentenceSegmenter::new();
77/// let sents = seg.split("กินข้าว\nดื่มน้ำ");
78/// assert_eq!(sents.len(), 2);
79/// ```
80#[derive(Debug, Default, Clone)]
81pub struct SentenceSegmenter;
82
83impl SentenceSegmenter {
84    /// Create a sentence segmenter with default settings.
85    pub fn new() -> Self {
86        Self
87    }
88
89    /// Split `text` into sentences.
90    ///
91    /// Empty and whitespace-only spans between delimiters are silently dropped.
92    /// The returned slices are zero-copy references into `text`.
93    pub fn split<'a>(&self, text: &'a str) -> Vec<Sentence<'a>> {
94        if text.is_empty() {
95            return Vec::new();
96        }
97
98        // Collect (byte_offset, char) pairs once for O(1) lookahead/lookbehind.
99        let chars: Vec<(usize, char)> = text.char_indices().collect();
100        let n = chars.len();
101
102        let mut result = Vec::new();
103        let mut seg_byte_start = 0usize;
104        let mut seg_char_start = 0usize;
105
106        for i in 0..n {
107            if !is_boundary(&chars, i) {
108                continue;
109            }
110
111            let byte_end = if i + 1 < n {
112                chars[i + 1].0
113            } else {
114                text.len()
115            };
116            let char_end = i + 1;
117
118            let slice = &text[seg_byte_start..byte_end];
119            if !slice.trim().is_empty() {
120                result.push(Sentence {
121                    text: slice,
122                    span: seg_byte_start..byte_end,
123                    char_span: seg_char_start..char_end,
124                });
125            }
126            seg_byte_start = byte_end;
127            seg_char_start = char_end;
128        }
129
130        // Remaining text after the last delimiter.
131        if seg_byte_start < text.len() {
132            let slice = &text[seg_byte_start..];
133            if !slice.trim().is_empty() {
134                result.push(Sentence {
135                    text: slice,
136                    span: seg_byte_start..text.len(),
137                    char_span: seg_char_start..n,
138                });
139            }
140        }
141
142        result
143    }
144}
145
146// ---------------------------------------------------------------------------
147// Free function
148// ---------------------------------------------------------------------------
149
150/// Split `text` into sentences.
151///
152/// Convenience wrapper over [`SentenceSegmenter::split`].
153///
154/// # Examples
155///
156/// ```rust
157/// use kham_core::sentence::split_sentences;
158///
159/// let sents = split_sentences("กินข้าว\nดื่มน้ำ");
160/// assert_eq!(sents.len(), 2);
161/// assert_eq!(sents[0].text.trim(), "กินข้าว");
162/// assert_eq!(sents[1].text.trim(), "ดื่มน้ำ");
163/// ```
164pub fn split_sentences(text: &str) -> Vec<Sentence<'_>> {
165    SentenceSegmenter::new().split(text)
166}
167
168// ---------------------------------------------------------------------------
169// Boundary detection
170// ---------------------------------------------------------------------------
171
/// Return `true` if `chars[i]` is the last character of a sentence.
///
/// `chars` is the `(byte_offset, char)` sequence of the text being segmented;
/// only the `char` half is inspected. The decision looks at most two
/// characters behind and two ahead of position `i`.
///
/// Rules:
/// - `๚` (U+0E5A), `๛` (U+0E5B), `\n`, `!` and `?` always end a sentence.
/// - `ฯ` (U+0E2F) ends a sentence unless it is the first or last character of
///   the abbreviation `ฯลฯ`.
/// - `.` ends a sentence only when followed by whitespace or end-of-input.
/// - Every other character is not a boundary.
///
/// # Panics
///
/// Panics if `i >= chars.len()`.
fn is_boundary(chars: &[(usize, char)], i: usize) -> bool {
    const PAIYANNOI: char = '\u{0E2F}'; // ฯ
    const LO_LING: char = '\u{0E25}'; // ล

    // Checked lookup so lookahead past the end of input is simply `None`.
    let char_at = |idx: usize| chars.get(idx).map(|&(_, ch)| ch);
    let next = char_at(i + 1);

    match chars[i].1 {
        // Thai terminators, newline and universal sentence-ending punctuation.
        '\u{0E5A}' | '\u{0E5B}' | '\n' | '!' | '?' => true,

        // Paiyannoi — a boundary unless it belongs to ฯลฯ (U+0E2F U+0E25 U+0E2F).
        //   First ฯ: followed by ล and then ฯ.
        //   Last  ฯ: preceded by ล, which is itself preceded by ฯ.
        PAIYANNOI => {
            let opens_etc = next == Some(LO_LING) && char_at(i + 2) == Some(PAIYANNOI);
            let closes_etc =
                i >= 2 && chars[i - 1].1 == LO_LING && chars[i - 2].1 == PAIYANNOI;
            !opens_etc && !closes_etc
        }

        // Period — a boundary only when followed by whitespace or end-of-input.
        // That single condition already excludes both documented no-split cases:
        // a decimal point (digit on BOTH sides) and a mid-abbreviation dot
        // (ก.ค., A.B.C.) each have a non-whitespace successor. A digit merely
        // preceding the period must not suppress the split, otherwise a
        // sentence ending in a number ("… 2567. ") would never terminate.
        '.' => match next {
            None => true,
            Some(following) => following.is_whitespace(),
        },

        _ => false,
    }
}
217
218// ---------------------------------------------------------------------------
219// Tests
220// ---------------------------------------------------------------------------
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitespace-trimmed text of every sentence, in order.
    fn trimmed<'a>(sents: &'a [Sentence<'a>]) -> Vec<&'a str> {
        sents.iter().map(|s| s.text.trim()).collect()
    }

    // ── basic splitting ───────────────────────────────────────────────────────

    #[test]
    fn empty_returns_empty() {
        assert!(split_sentences("").is_empty());
    }

    #[test]
    fn whitespace_only_returns_empty() {
        assert!(split_sentences("   \n\t  ").is_empty());
    }

    #[test]
    fn single_sentence_no_delimiter() {
        let sents = split_sentences("กินข้าวกับปลา");
        assert_eq!(trimmed(&sents), &["กินข้าวกับปลา"]);
    }

    #[test]
    fn split_on_newline() {
        let sents = split_sentences("กินข้าว\nดื่มน้ำ");
        assert_eq!(trimmed(&sents), &["กินข้าว", "ดื่มน้ำ"]);
    }

    #[test]
    fn double_newline_no_empty_sentence() {
        // The empty span between two newlines must be dropped.
        let sents = split_sentences("กินข้าว\n\nดื่มน้ำ");
        assert_eq!(trimmed(&sents), &["กินข้าว", "ดื่มน้ำ"]);
    }

    #[test]
    fn trailing_newline_no_empty_sentence() {
        let sents = split_sentences("กินข้าว\n");
        assert_eq!(sents.len(), 1);
        assert_eq!(sents[0].text.trim(), "กินข้าว");
    }

    #[test]
    fn three_sentences_via_newlines() {
        let sents = split_sentences("ประโยคหนึ่ง\nประโยคสอง\nประโยคสาม");
        assert_eq!(
            trimmed(&sents),
            &["ประโยคหนึ่ง", "ประโยคสอง", "ประโยคสาม"]
        );
    }

    // ── Thai terminators ──────────────────────────────────────────────────────

    #[test]
    fn angkhankhu_splits() {
        // ๚ (U+0E5A) is the Thai sentence mark.
        let sents = split_sentences("กินข้าว๚ดื่มน้ำ");
        assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
        assert!(sents[0].text.contains("กินข้าว"));
        assert!(sents[1].text.contains("ดื่มน้ำ"));
    }

    #[test]
    fn khomut_splits() {
        // ๛ (U+0E5B) is the Thai chapter/section mark; it stays attached to
        // the sentence it terminates.
        let sents = split_sentences("บทที่หนึ่ง๛บทที่สอง");
        assert_eq!(trimmed(&sents), &["บทที่หนึ่ง๛", "บทที่สอง"]);
    }

    // ── Paiyannoi ฯ rules ─────────────────────────────────────────────────────

    #[test]
    fn paiyannoi_alone_splits() {
        // Standalone ฯ (not part of ฯลฯ) ends the sentence.
        let sents = split_sentences("กินข้าวฯดื่มน้ำ");
        assert_eq!(sents.len(), 2, "ฯ should split: {:?}", trimmed(&sents));
    }

    #[test]
    fn ฯลฯ_does_not_split() {
        // ฯลฯ is an abbreviation ("etc.") — must not be treated as a sentence boundary.
        let sents = split_sentences("กินข้าวฯลฯทุกวัน");
        assert_eq!(
            sents.len(),
            1,
            "ฯลฯ should not split: {:?}",
            trimmed(&sents)
        );
    }

    #[test]
    fn ฯลฯ_in_middle_preserves_two_sentences() {
        // ฯลฯ in the middle of a sentence, split by newline at end.
        let sents = split_sentences("กินข้าวฯลฯทุกวัน\nพรุ่งนี้จะฝน");
        assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
        assert!(
            trimmed(&sents)[0].contains("ฯลฯ"),
            "ฯลฯ should remain in first sentence"
        );
    }

    // ── period rules ─────────────────────────────────────────────────────────

    #[test]
    fn period_before_space_splits() {
        let sents = split_sentences("Hello world. Goodbye world.");
        assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
        assert_eq!(sents[0].text.trim(), "Hello world.");
        assert_eq!(sents[1].text.trim(), "Goodbye world.");
    }

    #[test]
    fn period_at_end_of_string_does_not_add_empty_sentence() {
        let sents = split_sentences("Hello world.");
        assert_eq!(sents.len(), 1);
        assert_eq!(sents[0].text.trim(), "Hello world.");
    }

    #[test]
    fn decimal_point_does_not_split() {
        // Period between two ASCII digits is a decimal point.
        let sents = split_sentences("ราคา3.14บาท");
        assert_eq!(
            sents.len(),
            1,
            "decimal point should not split: {:?}",
            trimmed(&sents)
        );
    }

    #[test]
    fn abbreviation_dot_not_followed_by_space_does_not_split() {
        // ก.ค. — period not followed by whitespace or end: not a boundary.
        let sents = split_sentences("วันที่5ก.ค.2567");
        assert_eq!(
            sents.len(),
            1,
            "abbreviation dots should not split: {:?}",
            trimmed(&sents)
        );
    }

    // ── exclamation and question marks ────────────────────────────────────────

    #[test]
    fn exclamation_splits() {
        let sents = split_sentences("ดีมาก!แย่มาก");
        assert_eq!(trimmed(&sents), &["ดีมาก!", "แย่มาก"]);
    }

    #[test]
    fn question_splits() {
        let sents = split_sentences("ไปไหน?ไปตลาด");
        assert_eq!(trimmed(&sents), &["ไปไหน?", "ไปตลาด"]);
    }

    // ── span correctness ──────────────────────────────────────────────────────

    #[test]
    fn byte_spans_are_valid_utf8_slices() {
        let text = "กินข้าว\nดื่มน้ำ";
        for s in split_sentences(text) {
            // Slicing panics if a span does not fall on char boundaries.
            let _ = &text[s.span.clone()];
            assert_eq!(s.text, &text[s.span]);
        }
    }

    #[test]
    fn char_spans_match_text() {
        let text = "กินข้าว\nดื่มน้ำ";
        let all_chars: Vec<char> = text.chars().collect();
        for s in split_sentences(text) {
            let by_char: alloc::string::String = all_chars[s.char_span.clone()].iter().collect();
            assert_eq!(s.text, by_char, "char_span mismatch for '{}'", s.text);
        }
    }

    #[test]
    fn spans_cover_full_input() {
        // With no whitespace-only gaps in the input, concatenating the
        // sentence slices must reproduce the text exactly.
        let text = "ประโยคหนึ่ง\nประโยคสอง\nประโยคสาม";
        let sents = split_sentences(text);
        let reconstructed: alloc::string::String = sents.iter().map(|s| s.text).collect();
        assert_eq!(reconstructed, text);
    }

    // ── mixed script ──────────────────────────────────────────────────────────

    #[test]
    fn mixed_thai_english_newline() {
        // First \n ends sentence 1; the period before the second \n ends
        // sentence 2; that \n alone is a blank span and is dropped; the
        // remaining text is sentence 3.
        let sents = split_sentences("กินข้าว\nHello world.\nดื่มน้ำ");
        assert_eq!(trimmed(&sents), &["กินข้าว", "Hello world.", "ดื่มน้ำ"]);
    }

    // ── SentenceSegmenter struct ──────────────────────────────────────────────

    #[test]
    fn segmenter_new_and_default_agree() {
        let text = "กินข้าว\nดื่มน้ำ";
        let a = SentenceSegmenter::new().split(text);
        // The unit-struct literal is the value the derived `Default` yields.
        let b = SentenceSegmenter.split(text);
        assert_eq!(a, b);
    }
}