use super::Segmenter;
/// Arabic specialized [`Segmenter`].
///
/// Its `segment_str` implementation separates a leading definite-article
/// prefix ("ال" or one of its alef variants followed by lam) from the rest of
/// the input, e.g. "السلام" is yielded as "ال" followed by "سلام".
pub struct ArabicSegmenter;
impl Segmenter for ArabicSegmenter {
    /// Splits a leading definite-article prefix off `to_segment`.
    ///
    /// If the input is longer than four bytes and begins with one of the
    /// article spellings listed below, two slices are yielded: the first four
    /// bytes (the article) and everything after them. In every other case the
    /// input is yielded unchanged as a single item.
    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        // Lam preceded by each of the alef forms recognised as an article.
        const ARTICLE_SPELLINGS: [&str; 5] = ["ال", "أل", "إل", "آل", "ٱل"];
        // Byte offset at which a matching input is cut (two 2-byte letters).
        const SPLIT_AT: usize = 4;

        let begins_with_article =
            ARTICLE_SPELLINGS.iter().any(|&spelling| to_segment.starts_with(spelling));

        // A bare article (exactly four bytes) is left whole: there would be
        // nothing after the cut.
        if to_segment.len() <= SPLIT_AT || !begins_with_article {
            return Box::new(std::iter::once(to_segment));
        }

        let (article, remainder) = to_segment.split_at(SPLIT_AT);
        Box::new(std::iter::once(article).chain(std::iter::once(remainder)))
    }
}
// Fixture data for the Arabic segmenter, handed to the shared
// `test_segmenter!` macro at the bottom of the module.
#[cfg(test)]
mod test {
use crate::segmenter::test::test_segmenter;
// Sample sentence: Arabic words (two of them starting with an article, "ال"
// and "ٱل"), Arabic punctuation, parentheses, spaces and ASCII digits.
const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال) 123 456";
// Expected pieces of `TEXT` in order: words, punctuation marks and spaces are
// separate entries, and the leading articles "ال" / "ٱل" are split from the
// word that follows them. Joined together the entries reproduce `TEXT`.
const SEGMENTED: &[&str] = &[
"ال",
"سلام",
" ",
"عليكم",
"،",
" ",
"كيف",
" ",
"حالكم",
"؟",
" ",
"(",
"أتمنى",
" ",
"أن",
" ",
"تكونوا",
" ",
"بأفضل",
" ",
"ٱل",
"أحوال",
")",
" ",
"123",
" ",
"456",
];
// Same sequence as `SEGMENTED`, except that "أ" and "ٱ" appear as plain "ا"
// and the final "ى" of "أتمنى" appears as "ي".
// NOTE(review): presumably the expected output after the tokenizer's
// normalization step — confirm against the `test_segmenter!` definition.
const TOKENIZED: &[&str] = &[
"ال",
"سلام",
" ",
"عليكم",
"،",
" ",
"كيف",
" ",
"حالكم",
"؟",
" ",
"(",
"اتمني",
" ",
"ان",
" ",
"تكونوا",
" ",
"بافضل",
" ",
"ال",
"احوال",
")",
" ",
"123",
" ",
"456",
];
// The macro is defined elsewhere in the crate; it receives the segmenter type,
// the three fixtures above, and the script/language the text belongs to.
test_segmenter!(ArabicSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Arabic, Language::Ara);
}