use alloc::vec::Vec;
use core::ops::Range;
/// One sentence produced by [`SentenceSegmenter::split`], borrowed from the input text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Sentence<'a> {
/// The sentence as an untrimmed slice of the input: it includes the closing
/// boundary character (if any) and whatever whitespace surrounds the sentence.
pub text: &'a str,
/// Byte range of `text` within the input string (`&input[span] == text`).
pub span: Range<usize>,
/// Range of `text` within the input measured in `char` indices rather than bytes.
pub char_span: Range<usize>,
}
/// Rule-based sentence splitter.
///
/// This is a unit struct: it carries no configuration or state, so every
/// instance behaves the same.
#[derive(Debug, Default, Clone)]
pub struct SentenceSegmenter;
impl SentenceSegmenter {
pub fn new() -> Self {
Self
}
pub fn split<'a>(&self, text: &'a str) -> Vec<Sentence<'a>> {
if text.is_empty() {
return Vec::new();
}
let chars: Vec<(usize, char)> = text.char_indices().collect();
let n = chars.len();
let mut result = Vec::new();
let mut seg_byte_start = 0usize;
let mut seg_char_start = 0usize;
for i in 0..n {
if !is_boundary(&chars, i) {
continue;
}
let byte_end = if i + 1 < n {
chars[i + 1].0
} else {
text.len()
};
let char_end = i + 1;
let slice = &text[seg_byte_start..byte_end];
if !slice.trim().is_empty() {
result.push(Sentence {
text: slice,
span: seg_byte_start..byte_end,
char_span: seg_char_start..char_end,
});
}
seg_byte_start = byte_end;
seg_char_start = char_end;
}
if seg_byte_start < text.len() {
let slice = &text[seg_byte_start..];
if !slice.trim().is_empty() {
result.push(Sentence {
text: slice,
span: seg_byte_start..text.len(),
char_span: seg_char_start..n,
});
}
}
result
}
}
/// Convenience wrapper: splits `text` into sentences with a freshly created
/// [`SentenceSegmenter`].
///
/// The returned sentences borrow from `text`.
pub fn split_sentences(text: &str) -> Vec<Sentence<'_>> {
    let segmenter = SentenceSegmenter::new();
    segmenter.split(text)
}
/// Returns `true` when the character at index `i` closes a sentence.
///
/// `chars` is the `(byte offset, char)` sequence of the text being split; only
/// the `char` half of each pair is inspected.
///
/// Rules, by character:
/// - U+0E5A (angkhankhu), U+0E5B (khomut) and `'\n'` always end a sentence.
/// - U+0E2F (paiyannoi) ends a sentence unless it is the first or last
///   character of the three-character sequence U+0E2F U+0E25 U+0E2F ("ฯลฯ").
/// - `'!'` and `'?'` end a sentence only when the next character is not another
///   `'!'` / `'?'`, so a run such as `?!` or `!!!` produces a single break after
///   its last character instead of one break per character.
/// - `'.'` ends a sentence when it is followed by whitespace or the end of the
///   text and is not preceded by an ASCII digit.
/// - Every other character is not a boundary.
///
/// # Panics
///
/// Panics if `i >= chars.len()`.
fn is_boundary(chars: &[(usize, char)], i: usize) -> bool {
    const PAIYANNOI: char = '\u{0E2F}';
    const LO_LING: char = '\u{0E25}';

    // Checked lookup of the character at an arbitrary index.
    let at = |idx: usize| chars.get(idx).map(|&(_, ch)| ch);

    let current = chars[i].1;
    let prev = i.checked_sub(1).and_then(at);
    let next = at(i + 1);

    match current {
        '\u{0E5A}' | '\u{0E5B}' | '\n' => true,
        PAIYANNOI => {
            let opens_etc = next == Some(LO_LING) && at(i + 2) == Some(PAIYANNOI);
            let closes_etc =
                prev == Some(LO_LING) && i.checked_sub(2).and_then(at) == Some(PAIYANNOI);
            !(opens_etc || closes_etc)
        }
        // Break only after the last terminator of a run; otherwise the caller
        // would emit punctuation-only sentences such as "!".
        '!' | '?' => !matches!(next, Some('!' | '?')),
        '.' => {
            let after_digit = matches!(prev, Some(p) if p.is_ascii_digit());
            // "Whitespace or end of text" already rules out a following digit,
            // so no separate next-is-digit check is needed.
            let before_gap = match next {
                None => true,
                Some(ch) => ch.is_whitespace(),
            };
            !after_digit && before_gap
        }
        _ => false,
    }
}
// Unit tests for `split_sentences` / `SentenceSegmenter::split`: one group per
// boundary rule, followed by checks on the reported byte and char spans.
#[cfg(test)]
mod tests {
use super::*;
// Helper: the text of every sentence with surrounding whitespace trimmed,
// so assertions do not depend on where the boundary character ended up.
fn trimmed<'a>(sents: &'a [Sentence<'a>]) -> Vec<&'a str> {
sents.iter().map(|s| s.text.trim()).collect()
}
// Empty input produces no sentences.
#[test]
fn empty_returns_empty() {
assert!(split_sentences("").is_empty());
}
// Input made only of whitespace (including a newline boundary) produces no sentences.
#[test]
fn whitespace_only_returns_empty() {
assert!(split_sentences(" \n\t ").is_empty());
}
// Text without any boundary character comes back as a single sentence.
#[test]
fn single_sentence_no_delimiter() {
let sents = split_sentences("กินข้าวกับปลา");
assert_eq!(trimmed(&sents), &["กินข้าวกับปลา"]);
}
// A newline separates two sentences.
#[test]
fn split_on_newline() {
let sents = split_sentences("กินข้าว\nดื่มน้ำ");
assert_eq!(trimmed(&sents), &["กินข้าว", "ดื่มน้ำ"]);
}
// Two consecutive newlines must not yield an empty sentence between them.
#[test]
fn double_newline_no_empty_sentence() {
let sents = split_sentences("กินข้าว\n\nดื่มน้ำ");
assert_eq!(trimmed(&sents), &["กินข้าว", "ดื่มน้ำ"]);
}
// A trailing newline must not yield an empty final sentence.
#[test]
fn trailing_newline_no_empty_sentence() {
let sents = split_sentences("กินข้าว\n");
assert_eq!(sents.len(), 1);
assert_eq!(sents[0].text.trim(), "กินข้าว");
}
// Two newlines give three sentences.
#[test]
fn three_sentences_via_newlines() {
let sents = split_sentences("ประโยคหนึ่ง\nประโยคสอง\nประโยคสาม");
assert_eq!(sents.len(), 3);
}
// The angkhankhu mark (U+0E5A) is a sentence boundary.
#[test]
fn angkhankhu_splits() {
let sents = split_sentences("กินข้าว๚ดื่มน้ำ");
assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
assert!(sents[0].text.contains("กินข้าว"));
assert!(sents[1].text.contains("ดื่มน้ำ"));
}
// The khomut mark (U+0E5B) is a sentence boundary.
#[test]
fn khomut_splits() {
let sents = split_sentences("บทที่หนึ่ง๛บทที่สอง");
assert_eq!(sents.len(), 2);
}
// A lone paiyannoi (U+0E2F) is a sentence boundary.
#[test]
fn paiyannoi_alone_splits() {
let sents = split_sentences("กินข้าวฯดื่มน้ำ");
assert_eq!(sents.len(), 2, "ฯ should split: {:?}", trimmed(&sents));
}
// Neither paiyannoi inside the sequence "ฯลฯ" may act as a boundary.
#[test]
fn ฯลฯ_does_not_split() {
let sents = split_sentences("กินข้าวฯลฯทุกวัน");
assert_eq!(
sents.len(),
1,
"ฯลฯ should not split: {:?}",
trimmed(&sents)
);
}
// "ฯลฯ" stays intact inside the first sentence while the newline still splits.
#[test]
fn ฯลฯ_in_middle_preserves_two_sentences() {
let sents = split_sentences("กินข้าวฯลฯทุกวัน\nพรุ่งนี้จะฝน");
assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
assert!(
trimmed(&sents)[0].contains("ฯลฯ"),
"ฯลฯ should remain in first sentence"
);
}
// A period followed by a space ends a sentence; the period stays with it.
#[test]
fn period_before_space_splits() {
let sents = split_sentences("Hello world. Goodbye world.");
assert_eq!(sents.len(), 2, "sents: {:?}", trimmed(&sents));
assert_eq!(sents[0].text.trim(), "Hello world.");
assert_eq!(sents[1].text.trim(), "Goodbye world.");
}
// A period at the very end of the input must not create an empty trailing sentence.
#[test]
fn period_at_end_of_string_does_not_add_empty_sentence() {
let sents = split_sentences("Hello world.");
assert_eq!(sents.len(), 1);
assert_eq!(sents[0].text.trim(), "Hello world.");
}
// A period between ASCII digits is a decimal point, not a boundary.
#[test]
fn decimal_point_does_not_split() {
let sents = split_sentences("ราคา3.14บาท");
assert_eq!(
sents.len(),
1,
"decimal point should not split: {:?}",
trimmed(&sents)
);
}
// Periods that are not followed by whitespace (abbreviation dots) are not boundaries.
#[test]
fn abbreviation_dot_not_followed_by_space_does_not_split() {
let sents = split_sentences("วันที่5ก.ค.2567");
assert_eq!(
sents.len(),
1,
"abbreviation dots should not split: {:?}",
trimmed(&sents)
);
}
// '!' is a boundary even without following whitespace.
#[test]
fn exclamation_splits() {
let sents = split_sentences("ดีมาก!แย่มาก");
assert_eq!(sents.len(), 2, "! should split: {:?}", trimmed(&sents));
}
// '?' is a boundary even without following whitespace.
#[test]
fn question_splits() {
let sents = split_sentences("ไปไหน?ไปตลาด");
assert_eq!(sents.len(), 2, "? should split: {:?}", trimmed(&sents));
}
// `span` must lie on UTF-8 char boundaries (slicing would panic otherwise)
// and must select exactly `text`.
#[test]
fn byte_spans_are_valid_utf8_slices() {
let text = "กินข้าว\nดื่มน้ำ";
for s in split_sentences(text) {
let _ = &text[s.span.clone()];
assert_eq!(s.text, &text[s.span]);
}
}
// `char_span` indexes the input by `char`, and selects exactly `text`.
#[test]
fn char_spans_match_text() {
let text = "กินข้าว\nดื่มน้ำ";
let all_chars: Vec<char> = text.chars().collect();
for s in split_sentences(text) {
let by_char: alloc::string::String = all_chars[s.char_span.clone()].iter().collect();
assert_eq!(s.text, by_char, "char_span mismatch for '{}'", s.text);
}
}
// Concatenating the sentence texts reproduces the input.
// NOTE(review): this only holds for inputs without whitespace-only segments,
// because `split` drops those; this input has none.
#[test]
fn spans_cover_full_input() {
let text = "ประโยคหนึ่ง\nประโยคสอง\nประโยคสาม";
let sents = split_sentences(text);
let reconstructed: alloc::string::String = sents.iter().map(|s| s.text).collect();
assert_eq!(reconstructed, text);
}
// Mixed Thai/English input separated by newlines; only a lower bound on the
// number of sentences is asserted.
#[test]
fn mixed_thai_english_newline() {
let sents = split_sentences("กินข้าว\nHello world.\nดื่มน้ำ");
assert!(
sents.len() >= 2,
"expected ≥ 2 sentences, got {:?}",
trimmed(&sents)
);
}
// Two segmenter instances produce identical output.
// NOTE(review): `b` is built from the unit-struct literal, not from
// `Default::default()`, so the derived `Default` impl itself is not exercised
// despite the test name — confirm whether that is intended.
#[test]
fn segmenter_new_and_default_agree() {
let text = "กินข้าว\nดื่มน้ำ";
let a = SentenceSegmenter::new().split(text);
let b = SentenceSegmenter.split(text);
assert_eq!(a, b);
}
}