use std::collections::HashSet;
use std::sync::OnceLock;
/// Languages for which a bundled word list is available.
///
/// The variant selects which embedded dictionary `segment_words` matches
/// against; `language_for_codepoint` maps a code point to one of these.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DictLanguage {
/// Thai; backed by `thai.dict`.
Thai,
/// Lao; backed by `lao.dict`.
Lao,
/// Khmer; backed by `khmer.dict`.
Khmer,
/// Myanmar; backed by `myanmar.dict`.
Myanmar,
}
// Raw dictionary text, embedded into the binary at compile time by
// `include_str!` (paths are resolved relative to this source file).
// `Dictionary::from_data` reads each of these as one entry per line.
static THAI_DATA: &str = include_str!("../data/dictionaries/thai.dict");
static LAO_DATA: &str = include_str!("../data/dictionaries/lao.dict");
static KHMER_DATA: &str = include_str!("../data/dictionaries/khmer.dict");
static MYANMAR_DATA: &str = include_str!("../data/dictionaries/myanmar.dict");
// Parsed dictionaries, one cell per language. Each cell starts empty and is
// filled by `get_dict` the first time that language is requested; later
// requests reuse the same `Dictionary`.
static THAI_DICT: OnceLock<Dictionary> = OnceLock::new();
static LAO_DICT: OnceLock<Dictionary> = OnceLock::new();
static KHMER_DICT: OnceLock<Dictionary> = OnceLock::new();
static MYANMAR_DICT: OnceLock<Dictionary> = OnceLock::new();
/// An in-memory word list used for dictionary-based segmentation.
///
/// Entries are slices of the `'static` source text, so building a dictionary
/// does not copy the words themselves.
struct Dictionary {
    /// Every distinct non-blank line of the source data, whitespace-trimmed.
    words: HashSet<&'static str>,
    /// Length in bytes (not characters) of the longest entry; `0` when the
    /// dictionary is empty.
    max_word_len: usize,
}

impl Dictionary {
    /// Builds a dictionary from `data`, treating each line as one entry.
    ///
    /// Lines are trimmed of surrounding whitespace and blank lines are
    /// skipped. Duplicate entries collapse into one.
    fn from_data(data: &'static str) -> Self {
        let words: HashSet<&'static str> = data
            .lines()
            .map(str::trim)
            .filter(|entry| !entry.is_empty())
            .collect();
        // Measured in bytes because callers slice the input text by byte offset.
        let max_word_len = words.iter().map(|entry| entry.len()).max().unwrap_or(0);
        Self {
            words,
            max_word_len,
        }
    }

    /// Returns `true` when `word` is exactly one of the stored entries.
    fn contains(&self, word: &str) -> bool {
        self.words.contains(word)
    }
}
fn get_dict(lang: DictLanguage) -> &'static Dictionary {
match lang {
DictLanguage::Thai => THAI_DICT.get_or_init(|| Dictionary::from_data(THAI_DATA)),
DictLanguage::Lao => LAO_DICT.get_or_init(|| Dictionary::from_data(LAO_DATA)),
DictLanguage::Khmer => KHMER_DICT.get_or_init(|| Dictionary::from_data(KHMER_DATA)),
DictLanguage::Myanmar => {
MYANMAR_DICT.get_or_init(|| Dictionary::from_data(MYANMAR_DATA))
}
}
}
/// Maps a Unicode code point to the dictionary language that handles it.
///
/// Recognised ranges (inclusive):
/// - Thai: U+0E01–U+0E3A, U+0E40–U+0E4E, U+0E50–U+0E5B
/// - Lao: U+0E81–U+0EDF
/// - Myanmar: U+1000–U+109F, U+AA60–U+AA7F
/// - Khmer: U+1780–U+17FF, U+19E0–U+19FF
///
/// Returns `None` for every other value.
pub fn language_for_codepoint(cp: u32) -> Option<DictLanguage> {
    if matches!(cp, 0x0E01..=0x0E3A | 0x0E40..=0x0E4E | 0x0E50..=0x0E5B) {
        return Some(DictLanguage::Thai);
    }
    if matches!(cp, 0x0E81..=0x0EDF) {
        return Some(DictLanguage::Lao);
    }
    if matches!(cp, 0x1000..=0x109F | 0xAA60..=0xAA7F) {
        return Some(DictLanguage::Myanmar);
    }
    if matches!(cp, 0x1780..=0x17FF | 0x19E0..=0x19FF) {
        return Some(DictLanguage::Khmer);
    }
    None
}
/// Splits `text` into words by greedy longest-match against the dictionary
/// for `lang`.
///
/// Scanning left to right, the longest dictionary entry that starts at the
/// current position is consumed as one word. When no entry matches, a single
/// character is consumed on its own.
///
/// Returns the byte offsets into `text` at which one segment ends and the
/// next begins, in ascending order. The offsets `0` and `text.len()` are
/// never included, so empty or single-segment input yields an empty vector.
pub fn segment_words(text: &str, lang: DictLanguage) -> Vec<usize> {
    let dict = get_dict(lang);
    let total = text.len();
    let mut cuts: Vec<usize> = Vec::new();
    let mut cursor: usize = 0;

    while cursor < total {
        let rest = &text[cursor..];
        // No candidate can be longer than the longest dictionary entry.
        let window = rest.len().min(dict.max_word_len);

        // Try the longest prefix first; lengths that would split a UTF-8
        // sequence are skipped before slicing.
        let longest_match = (1..=window)
            .rev()
            .find(|&n| rest.is_char_boundary(n) && dict.contains(&rest[..n]));

        let step = match longest_match {
            Some(n) => n,
            // Unknown text: step over exactly one character.
            None => rest.chars().next().map_or(1, char::len_utf8),
        };

        cursor += step;
        if cursor < total {
            cuts.push(cursor);
        }
    }

    cuts
}
#[cfg(test)]
mod tests {
    use super::*;

    // The size thresholds below are sanity checks that the embedded word
    // lists were actually bundled and parsed, not exact counts.

    #[test]
    fn test_thai_dict_loads() {
        let dict = get_dict(DictLanguage::Thai);
        assert!(dict.words.len() > 20000);
        // U+0E01 U+0E23 is expected to be an entry in the Thai list.
        assert!(dict.contains("\u{0E01}\u{0E23}"));
    }

    #[test]
    fn test_lao_dict_loads() {
        let dict = get_dict(DictLanguage::Lao);
        assert!(dict.words.len() > 20000);
    }

    #[test]
    fn test_khmer_dict_loads() {
        let dict = get_dict(DictLanguage::Khmer);
        assert!(dict.words.len() > 50000);
    }

    #[test]
    fn test_myanmar_dict_loads() {
        let dict = get_dict(DictLanguage::Myanmar);
        assert!(dict.words.len() > 30000);
    }

    #[test]
    fn test_language_for_codepoint() {
        assert_eq!(language_for_codepoint(0x0E01), Some(DictLanguage::Thai));
        assert_eq!(language_for_codepoint(0x0E44), Some(DictLanguage::Thai));
        assert_eq!(language_for_codepoint(0x0E81), Some(DictLanguage::Lao));
        assert_eq!(language_for_codepoint(0x1000), Some(DictLanguage::Myanmar));
        assert_eq!(language_for_codepoint(0x1780), Some(DictLanguage::Khmer));
        assert_eq!(language_for_codepoint(0x0041), None);
        // Secondary ranges and the gaps between the Thai ranges.
        assert_eq!(language_for_codepoint(0xAA60), Some(DictLanguage::Myanmar));
        assert_eq!(language_for_codepoint(0x19E0), Some(DictLanguage::Khmer));
        assert_eq!(language_for_codepoint(0x0E3F), None);
        assert_eq!(language_for_codepoint(0x0E4F), None);
    }

    #[test]
    fn test_segment_basic_thai() {
        let text = "\u{0E2A}\u{0E27}\u{0E31}\u{0E2A}\u{0E14}\u{0E35}";
        let boundaries = segment_words(text, DictLanguage::Thai);
        // The exact split depends on dictionary content, so check the
        // properties every result must have: strictly ascending interior
        // offsets that never fall inside a UTF-8 sequence.
        assert!(boundaries.windows(2).all(|pair| pair[0] < pair[1]));
        assert!(boundaries
            .iter()
            .all(|&b| b > 0 && b < text.len() && text.is_char_boundary(b)));
    }

    #[test]
    fn test_segment_empty_input() {
        assert!(segment_words("", DictLanguage::Thai).is_empty());
    }
}