use icu_segmenter::WordBreakSegmenter;
/// Segments Thai text through both the UTF-16 and the UTF-8 entry points and
/// compares the reported break offsets with fixed expectations.
///
/// Every Thai character in these inputs is one UTF-16 code unit and three
/// UTF-8 bytes, which is why the `segment_str` expectations are three times
/// the `segment_utf16` ones for the same string.
#[test]
fn word_break_th() {
    let word_segmenter = WordBreakSegmenter::try_new_unstable(&icu_testdata::unstable())
        .expect("Data exists");

    // Thai-only input, UTF-16 view.
    let thai = "ภาษาไทยภาษาไทย";
    let thai_units: Vec<u16> = thai.encode_utf16().collect();
    let utf16_breaks: Vec<usize> = word_segmenter.segment_utf16(&thai_units).collect();
    assert_eq!(
        utf16_breaks,
        vec![0, 4, 7, 11, 14],
        "word segmenter with Thai"
    );

    // Same Thai-only input, UTF-8 view.
    let utf8_breaks: Vec<usize> = word_segmenter.segment_str(thai).collect();
    assert_eq!(
        utf8_breaks,
        vec![0, 12, 21, 33, 42],
        "word segmenter with Thai"
    );

    // Thai run wrapped in single ASCII letters, UTF-16 view.
    let wrapped = "aภาษาไทยภาษาไทยb";
    let wrapped_units: Vec<u16> = wrapped.encode_utf16().collect();
    let wrapped_breaks: Vec<usize> = word_segmenter.segment_utf16(&wrapped_units).collect();
    assert_eq!(
        wrapped_breaks,
        vec![0, 1, 5, 8, 12, 15, 16],
        "word segmenter with Thai and ascii"
    );
}
/// Segments a Burmese string through the UTF-16 entry point and compares the
/// break offsets with fixed expectations.
///
/// The test is ignored; the attribute below records the reason.
#[ignore = "testdata doesn't have Burmese data"]
#[test]
fn word_break_my() {
    let word_segmenter = WordBreakSegmenter::try_new_unstable(&icu_testdata::unstable())
        .expect("Data exists");

    let burmese = "မြန်မာစာမြန်မာစာမြန်မာစာ";
    let burmese_units: Vec<u16> = burmese.encode_utf16().collect();
    let breaks: Vec<usize> = word_segmenter.segment_utf16(&burmese_units).collect();

    assert_eq!(
        breaks,
        vec![0, 8, 16, 22, 24],
        "word segmenter with Burmese"
    );
}
/// Segments a Hiragana-only string through the UTF-8 entry point and compares
/// the break offsets with fixed expectations.
///
/// The input is six characters of three UTF-8 bytes each (18 bytes total).
#[test]
fn word_break_hiragana() {
    let word_segmenter = WordBreakSegmenter::try_new_unstable(&icu_testdata::unstable())
        .expect("Data exists");

    let hiragana = "うなぎうなじ";
    let breaks: Vec<usize> = word_segmenter.segment_str(hiragana).collect();

    assert_eq!(breaks, vec![0, 9, 18], "word segmenter with Hiragana");
}
/// Segments a string that mixes Latin letters with Han characters through the
/// UTF-8 entry point and compares the break offsets with fixed expectations.
///
/// The input is 7 ASCII bytes, two runs of three Han characters (9 bytes
/// each), then 7 more ASCII bytes, for 32 bytes total.
#[test]
fn word_break_mixed_han() {
    let word_segmenter = WordBreakSegmenter::try_new_unstable(&icu_testdata::unstable())
        .expect("Data exists");

    let mixed = "Welcome龟山岛龟山岛Welcome";
    let breaks: Vec<usize> = word_segmenter.segment_str(mixed).collect();

    assert_eq!(
        breaks,
        vec![0, 7, 16, 25, 32],
        "word segmenter with Chinese and letter"
    );
}