use uniworld::linebreak::dictionary::{segment_words, DictLanguage, language_for_codepoint};
use uniworld::linebreak::{line_break_opportunities, line_break_opportunities_with_dictionary, BreakAction};
/// Segments a six-codepoint Thai string and sanity-checks each reported boundary.
///
/// Every boundary must land on a UTF-8 character boundary and lie strictly inside
/// the text (neither 0 nor `text.len()`). The checks pass vacuously when the
/// segmenter reports no boundaries at all.
#[test]
fn thai_segment_known_phrase() {
    let text = "\u{0E2A}\u{0E27}\u{0E31}\u{0E2A}\u{0E14}\u{0E35}";
    let total_bytes = text.len();
    let cuts = segment_words(text, DictLanguage::Thai);
    for b in cuts.iter().copied() {
        assert!(text.is_char_boundary(b), "boundary at {b} is not a char boundary");
        // Interior offsets only: 1 ..= total_bytes - 1.
        assert!((1..total_bytes).contains(&b), "boundary {b} out of range");
    }
}
/// Segments the five-codepoint Thai phrase the assertion message calls 'khon thai'.
///
/// At least one boundary must be reported, and the first one must be at byte
/// offset 6, i.e. after the first two codepoints (three UTF-8 bytes each).
#[test]
fn thai_segment_multi_word() {
    let text = "\u{0E04}\u{0E19}\u{0E44}\u{0E17}\u{0E22}";
    let word_ends = segment_words(text, DictLanguage::Thai);

    let found_any = !word_ends.is_empty();
    assert!(found_any, "Expected at least one word boundary in 'khon thai'");

    let first_end = word_ends[0];
    assert_eq!(first_end, 6, "Expected boundary after first word");
}
/// Segments a nine-codepoint Thai string and checks that every reported boundary
/// falls on a UTF-8 character boundary.
///
/// No particular number or position of boundaries is required.
#[test]
fn thai_segment_longer_text() {
    let text = "\u{0E1B}\u{0E23}\u{0E30}\u{0E40}\u{0E17}\u{0E28}\u{0E44}\u{0E17}\u{0E22}";
    let cuts = segment_words(text, DictLanguage::Thai);
    for offset in cuts.iter().copied() {
        assert!(text.is_char_boundary(offset));
    }
}
/// Checks that the dictionary-aware line breaker allows a break at byte offset 6
/// of a five-codepoint Thai string (after the first two three-byte codepoints).
///
/// The plain line breaker is also invoked on the same text, but its result is
/// not inspected.
#[test]
fn thai_dictionary_line_breaks() {
    // Byte offset at which the break is expected.
    const WORD_BOUNDARY: usize = 6;

    let text = "\u{0E04}\u{0E19}\u{0E44}\u{0E17}\u{0E22}";
    // Only has to complete; nothing is asserted about the plain result.
    let _plain = line_break_opportunities(text);
    let with_dict = line_break_opportunities_with_dictionary(text);

    let action_at_boundary = with_dict[WORD_BOUNDARY];
    assert_eq!(
        action_at_boundary,
        BreakAction::Allowed,
        "Expected Allowed break between Thai words with dictionary"
    );
}
/// Verifies that the dictionary-aware line breaker reports no `Allowed` break at
/// any interior character start of a six-codepoint Thai string, which this test
/// treats as a single word.
///
/// Only offsets where a character begins are inspected; offset 0 (start of text)
/// and `text.len()` (end of text) are not checked.
#[test]
fn thai_no_break_within_word() {
    let text = "\u{0E1B}\u{0E23}\u{0E30}\u{0E40}\u{0E17}\u{0E28}";
    let breaks = line_break_opportunities_with_dictionary(text);

    // `char_indices` only yields offsets strictly below `text.len()`, so no range
    // guard is needed; `skip(1)` leaves out offset 0.
    for (pos, _) in text.char_indices().skip(1) {
        assert_ne!(
            breaks[pos],
            BreakAction::Allowed,
            "Unexpected break within Thai word at byte {pos}"
        );
    }
}
/// Segments a seven-codepoint Lao string and checks that every reported boundary
/// falls on a UTF-8 character boundary.
///
/// No particular number or position of boundaries is required.
#[test]
fn lao_segment_basic() {
    let text = "\u{0EAA}\u{0EB0}\u{0E9A}\u{0EB2}\u{0E8D}\u{0E94}\u{0EB5}";
    let cuts = segment_words(text, DictLanguage::Lao);
    for offset in cuts.iter().copied() {
        assert!(text.is_char_boundary(offset));
    }
}
/// Confirms that codepoints U+0E81 and U+0EAA are both classified as Lao.
#[test]
fn lao_language_detection() {
    let expected = Some(DictLanguage::Lao);

    let detected_first = language_for_codepoint(0x0E81);
    assert_eq!(detected_first, expected);

    let detected_second = language_for_codepoint(0x0EAA);
    assert_eq!(detected_second, expected);
}
/// Segments a four-codepoint Khmer string and checks that every reported boundary
/// falls on a UTF-8 character boundary.
///
/// No particular number or position of boundaries is required.
#[test]
fn khmer_segment_basic() {
    let text = "\u{1797}\u{17B6}\u{179F}\u{17B6}";
    let cuts = segment_words(text, DictLanguage::Khmer);
    for offset in cuts.iter().copied() {
        assert!(text.is_char_boundary(offset));
    }
}
/// Confirms that codepoints U+1780 and U+1797 are both classified as Khmer.
#[test]
fn khmer_language_detection() {
    let expected = Some(DictLanguage::Khmer);

    let detected_first = language_for_codepoint(0x1780);
    assert_eq!(detected_first, expected);

    let detected_second = language_for_codepoint(0x1797);
    assert_eq!(detected_second, expected);
}
/// Segments a six-codepoint Myanmar string and checks that every reported boundary
/// falls on a UTF-8 character boundary.
///
/// No particular number or position of boundaries is required.
#[test]
fn myanmar_segment_basic() {
    let text = "\u{1019}\u{103C}\u{1014}\u{103A}\u{1019}\u{102C}";
    let cuts = segment_words(text, DictLanguage::Myanmar);
    for offset in cuts.iter().copied() {
        assert!(text.is_char_boundary(offset));
    }
}
/// Confirms that codepoints U+1000 and U+1019 are both classified as Myanmar.
#[test]
fn myanmar_language_detection() {
    let expected = Some(DictLanguage::Myanmar);

    let detected_first = language_for_codepoint(0x1000);
    assert_eq!(detected_first, expected);

    let detected_second = language_for_codepoint(0x1019);
    assert_eq!(detected_second, expected);
}
/// Runs the dictionary-aware line breaker over Latin text with an embedded Thai
/// run and checks that a break is allowed right after the leading "Hello ".
#[test]
fn mixed_thai_latin() {
    let text = "Hello \u{0E2A}\u{0E27}\u{0E31}\u{0E2A}\u{0E14}\u{0E35} World";
    let breaks = line_break_opportunities_with_dictionary(text);

    // The first space is at byte 5; the entry inspected is the one just past it,
    // which is where the Thai run starts.
    let after_first_space = 5 + 1;
    assert_eq!(
        breaks[after_first_space],
        BreakAction::Allowed,
        "Expected break opportunity after 'Hello '"
    );
}
/// Checks that, for plain ASCII text, the dictionary-aware line breaker returns
/// exactly the same result as the plain line breaker.
#[test]
fn non_sa_text_unchanged() {
    let text = "Hello World";
    // Plain result on the left, dictionary-aware result on the right.
    assert_eq!(
        line_break_opportunities(text),
        line_break_opportunities_with_dictionary(text)
    );
}
/// Checks that the dictionary-aware line breaker returns exactly one entry for
/// an empty string.
#[test]
fn empty_text() {
    let entry_count = line_break_opportunities_with_dictionary("").len();
    assert_eq!(entry_count, 1);
}
/// Checks that, for a single Thai codepoint, the dictionary-aware line breaker
/// returns one entry per UTF-8 byte plus one.
#[test]
fn single_thai_char() {
    let text = "\u{0E01}";
    let expected_entries = text.len() + 1;
    let breaks = line_break_opportunities_with_dictionary(text);
    assert_eq!(breaks.len(), expected_entries);
}