use alloc::vec;
use alloc::vec::Vec;
/// Returns `true` when `c` lies in U+0E01..=U+0E2E, the Thai consonant letters.
///
/// The range also contains U+0E24 and U+0E26, which `scan_one_tcc` treats as
/// bases that take no following marks.
#[inline]
fn is_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}
/// Returns `true` when `c` lies in U+0E40..=U+0E44, the vowels written before
/// the consonant they belong to.
#[inline]
fn is_lead_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
/// Returns `true` for the non-spacing marks that occupy the vowel slot directly
/// above or below a consonant.
///
/// Despite the name, the class is broader than "upper" vowels. It matches:
/// - U+0E31 (mai han-akat),
/// - U+0E34..=U+0E3A (sara i through phinthu, which includes the below-base
///   vowels U+0E38 and U+0E39),
/// - U+0E47 (mai taikhu).
///
/// U+0E47 is a combining mark that is always written over a base consonant.
/// Leaving it out of every mark class made the scanner emit it as a cluster of
/// its own, splitting words such as "ก็" in the middle of a grapheme.
#[inline]
fn is_upper_vowel(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}' | '\u{0E47}')
}
/// Returns `true` when `c` lies in U+0E48..=U+0E4B, the four Thai tone marks.
#[inline]
fn is_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
/// Returns `true` only for U+0E4C (thanthakat).
#[inline]
fn is_thanthakat(c: char) -> bool {
    const THANTHAKAT: char = '\u{0E4C}';
    c == THANTHAKAT
}
/// Returns `true` only for U+0E4D (nikhahit).
#[inline]
fn is_nikhahit(c: char) -> bool {
    const NIKHAHIT: char = '\u{0E4D}';
    c == NIKHAHIT
}
/// Returns `true` for the vowels written after the consonant: U+0E30 (sara a),
/// U+0E32 (sara aa) and U+0E33 (sara am).
#[inline]
fn is_follow_vowel(c: char) -> bool {
    matches!(c, '\u{0E30}' | '\u{0E32}' | '\u{0E33}')
}
/// Returns `true` when `c` belongs to the Thai Unicode block, U+0E00..=U+0E7F.
#[inline]
fn is_thai(c: char) -> bool {
    matches!(u32::from(c), 0x0E00..=0x0E7F)
}
/// Forward-only reader over `text[pos..]` that records how far it has consumed.
struct Cursor<'a> {
    /// Characters not yet consumed; their offsets are relative to `base`.
    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
    /// Byte offset in the full text at which this cursor started.
    base: usize,
    /// Byte offset in the full text just past the last consumed character.
    end: usize,
}

impl<'a> Cursor<'a> {
    /// Creates a cursor that starts reading at byte offset `pos` of `text`.
    ///
    /// Slicing `text[pos..]` panics unless `pos` is a char boundary that is
    /// no greater than `text.len()`.
    fn new(text: &'a str, pos: usize) -> Self {
        let chars = text[pos..].char_indices().peekable();
        Self {
            chars,
            base: pos,
            end: pos,
        }
    }

    /// Returns the next character without consuming it.
    #[inline]
    fn peek(&mut self) -> Option<char> {
        let &(_, ch) = self.chars.peek()?;
        Some(ch)
    }

    /// Consumes the next character and moves `end` just past it.
    #[inline]
    fn advance(&mut self) -> Option<char> {
        self.chars.next().map(|(rel, ch)| {
            // `rel` is relative to the slice start, so shift it back by `base`.
            self.end = self.base + rel + ch.len_utf8();
            ch
        })
    }

    /// Consumes the next character only when `pred` accepts it; reports whether it did.
    #[inline]
    fn advance_if(&mut self, pred: impl Fn(char) -> bool) -> bool {
        let accepted = matches!(self.peek(), Some(ch) if pred(ch));
        if accepted {
            self.advance();
        }
        accepted
    }

    /// Consumes characters for as long as `pred` keeps accepting them.
    #[inline]
    fn advance_while(&mut self, pred: impl Fn(char) -> bool) {
        while let Some(ch) = self.peek() {
            if !pred(ch) {
                break;
            }
            self.advance();
        }
    }
}
/// Consumes every following character that lies outside the Thai block, so a
/// run of non-Thai text ends up in a single cluster.
fn scan_non_thai(cur: &mut Cursor<'_>) {
    while cur.advance_if(|ch| !is_thai(ch)) {}
}
/// Determines the base consonant of the cluster whose first character is `first`.
///
/// `first` has already been consumed by the caller.
/// - If it is a leading vowel, the next character is consumed and returned
///   when it is a consonant.
/// - If it is a consonant, it is returned as-is.
/// - In every other case the result is `None` and nothing further is consumed.
fn scan_head(cur: &mut Cursor<'_>, first: char) -> Option<char> {
    if !is_lead_vowel(first) {
        return if is_consonant(first) {
            Some(first)
        } else {
            None
        };
    }
    // A leading vowel only forms a cluster head when a consonant follows it.
    let base = cur.peek().filter(|&ch| is_consonant(ch))?;
    cur.advance();
    Some(base)
}
/// Consumes the run of characters accepted by `is_upper_vowel`.
fn scan_upper_vowels(cur: &mut Cursor<'_>) {
    while cur.advance_if(is_upper_vowel) {}
}
/// Consumes the run of characters accepted by `is_tone`.
fn scan_tone_marks(cur: &mut Cursor<'_>) {
    while cur.advance_if(is_tone) {}
}
/// Consumes at most one trailing character: a following vowel, a thanthakat
/// or a nikhahit.
fn scan_trailing(cur: &mut Cursor<'_>) {
    let closes_cluster =
        |ch: char| is_follow_vowel(ch) || is_thanthakat(ch) || is_nikhahit(ch);
    // Whether anything was consumed does not matter to the caller.
    let _ = cur.advance_if(closes_cluster);
}
/// Scans a single cluster starting at byte offset `pos` and returns the byte
/// offset just past it.
///
/// Returns `None` when there is no character at `pos`. Otherwise:
/// - a non-Thai first character absorbs the whole following non-Thai run;
/// - a Thai character that yields no base consonant forms a cluster on its own;
/// - a base consonant is followed by its upper vowels, tone marks and at most
///   one trailing character, except that U+0E24 and U+0E26 take no marks.
fn scan_one_tcc(text: &str, pos: usize) -> Option<usize> {
    let mut cursor = Cursor::new(text, pos);
    let lead = cursor.advance()?;

    if is_thai(lead) {
        if let Some(base) = scan_head(&mut cursor, lead) {
            let takes_marks = base != '\u{0E24}' && base != '\u{0E26}';
            if takes_marks {
                scan_upper_vowels(&mut cursor);
                scan_tone_marks(&mut cursor);
                scan_trailing(&mut cursor);
            }
        }
    } else {
        scan_non_thai(&mut cursor);
    }

    Some(cursor.end)
}
/// Returns the byte offsets that delimit the clusters of `text`.
///
/// The result always starts with `0`; consecutive entries bracket one cluster
/// each, so for non-empty input the last entry is `text.len()`. Empty input
/// yields `[0]`.
pub fn tcc_boundaries(text: &str) -> Vec<usize> {
    let total = text.len();
    if total == 0 {
        return vec![0];
    }

    let mut offsets = Vec::with_capacity(total / 3 + 2);
    offsets.push(0);

    let mut cursor = 0;
    while cursor < total {
        // Safety net: if the scanner reports no progress, step over exactly one
        // character so the loop always terminates.
        let end = scan_one_tcc(text, cursor)
            .filter(|&candidate| candidate > cursor)
            .unwrap_or_else(|| {
                text[cursor..]
                    .chars()
                    .next()
                    .map_or(total, |ch| cursor + ch.len_utf8())
            });
        offsets.push(end);
        cursor = end;
    }

    offsets
}
/// Returns a lazy iterator over the clusters of `text`, each yielded as a
/// subslice of `text`, in order.
pub fn tcc_iter(text: &str) -> impl Iterator<Item = &str> {
    let start = 0;
    TccIter { pos: start, text }
}
/// Iterator state behind `tcc_iter`: the text being segmented and the byte
/// offset of the next cluster to emit.
struct TccIter<'a> {
    /// Full input text; yielded items borrow from it.
    text: &'a str,
    /// Byte offset at which the next cluster starts.
    pos: usize,
}

impl<'a> Iterator for TccIter<'a> {
    type Item = &'a str;

    /// Yields the next cluster, or `None` once the whole text has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        let (start, text) = (self.pos, self.text);
        if start < text.len() {
            let end = scan_one_tcc(text, start)?;
            self.pos = end;
            Some(&text[start..end])
        } else {
            None
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Collects the clusters of `text` so they can be compared against a literal.
    fn tccs(text: &str) -> Vec<&str> {
        tcc_iter(text).collect::<Vec<_>>()
    }

    // --- Degenerate input ---------------------------------------------------

    #[test]
    fn empty() {
        assert_eq!(tcc_boundaries(""), vec![0]);
        assert_eq!(tccs(""), Vec::<&str>::new());
    }

    // --- Consonants with above/below marks ----------------------------------

    #[test]
    fn single_consonant() {
        let clusters = tccs("ก");
        assert_eq!(clusters, vec!["ก"]);
    }

    #[test]
    fn consonant_upper_vowel() {
        let clusters = tccs("กิ");
        assert_eq!(clusters, vec!["กิ"]);
    }

    #[test]
    fn consonant_upper_tone() {
        let clusters = tccs("กิ้");
        assert_eq!(clusters, vec!["กิ้"]);
    }

    #[test]
    fn two_consonants() {
        let clusters = tccs("กน");
        assert_eq!(clusters, vec!["ก", "น"]);
    }

    #[test]
    fn gin_two_tccs() {
        assert_eq!(tccs("กิน"), vec!["กิ", "น"]);
        // Each Thai character is three bytes in UTF-8.
        assert_eq!(tcc_boundaries("กิน"), vec![0, 6, 9]);
    }

    // --- Leading and following vowels ---------------------------------------

    #[test]
    fn lead_vowel() {
        let clusters = tccs("เก");
        assert_eq!(clusters, vec!["เก"]);
    }

    #[test]
    fn lead_vowel_with_tone() {
        let clusters = tccs("เก้");
        assert_eq!(clusters, vec!["เก้"]);
    }

    #[test]
    fn follow_vowel_aa() {
        let clusters = tccs("กา");
        assert_eq!(clusters, vec!["กา"]);
    }

    #[test]
    fn follow_vowel_sara_am() {
        let clusters = tccs("กำ");
        assert_eq!(clusters, vec!["กำ"]);
    }

    #[test]
    fn thanthakat() {
        let clusters = tccs("กร์");
        assert_eq!(clusters, vec!["ก", "ร์"]);
    }

    // --- Non-Thai and mixed text --------------------------------------------

    #[test]
    fn non_thai_run() {
        let clusters = tccs("hello");
        assert_eq!(clusters, vec!["hello"]);
    }

    #[test]
    fn mixed_script() {
        let clusters = tccs("hiกิน");
        assert_eq!(clusters, vec!["hi", "กิ", "น"]);
    }

    #[test]
    fn thai_digit() {
        let clusters = tccs("๑");
        assert_eq!(clusters, vec!["๑"]);
    }

    // --- Whole words ---------------------------------------------------------

    #[test]
    fn sawasdee() {
        let clusters = tccs("สวัสดี");
        assert_eq!(clusters.len(), 4);
        assert_eq!(clusters.concat(), "สวัสดี");
    }

    #[test]
    fn boundary_coverage() {
        let text = "ธนาคาร100แห่ง";
        let bounds = tcc_boundaries(text);

        // The offsets must span the whole input.
        assert_eq!(bounds.first().copied(), Some(0));
        assert_eq!(bounds.last().copied(), Some(text.len()));

        // Every offset must be usable as a slice index.
        for &b in &bounds {
            assert!(
                text.is_char_boundary(b),
                "offset {b} is not a char boundary"
            );
        }

        // Gluing the slices back together must reproduce the input.
        let rebuilt: alloc::string::String = bounds
            .windows(2)
            .map(|pair| &text[pair[0]..pair[1]])
            .collect();
        assert_eq!(rebuilt, text);
    }
}