use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::slice::Iter;
use core::str::Chars;
/// Coarse language class assigned to a single code point by [`get_language`].
///
/// The iterators in this file use it to decide where one same-language run
/// ends and the next begins.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Language {
    /// Code points that [`get_language`] maps to Burmese.
    Burmese,
    /// Code points that [`get_language`] maps to Chinese or Japanese; the two
    /// are deliberately not told apart here.
    ChineseOrJapanese,
    /// Code points that [`get_language`] maps to Khmer.
    Khmer,
    /// Code points that [`get_language`] maps to Lao.
    Lao,
    /// Code points that [`get_language`] maps to Thai.
    Thai,
    /// Every code point not covered by one of the other variants.
    Unknown,
}
/// Classifies a Unicode code point into the [`Language`] whose range table
/// below contains it.
///
/// Any value that falls outside every listed range, including values that are
/// not valid scalar values, yields [`Language::Unknown`].
pub fn get_language(codepoint: u32) -> Language {
    match codepoint {
        0x0E01..=0x0E7F => Language::Thai,

        0x0E80..=0x0EFF => Language::Lao,

        // Main Myanmar range plus the two extension ranges.
        0x1000..=0x109F | 0xA9E0..=0xA9FF | 0xAA60..=0xAA7F => Language::Burmese,

        0x1780..=0x17FF | 0x19E0..=0x19FF => Language::Khmer,

        // Ideographs, radicals and kana, in the BMP and in supplementary planes.
        0x2E80..=0x2EFF
        | 0x2F00..=0x2FDF
        | 0x3040..=0x30FF
        | 0x31F0..=0x31FF
        | 0x32D0..=0x32FE
        | 0x3400..=0x4DBF
        | 0x4E00..=0x9FFF
        | 0xF900..=0xFAFF
        | 0xFF66..=0xFF9D
        | 0x16FE2..=0x16FE3
        | 0x16FF0..=0x16FF1
        | 0x1AFF0..=0x1B16F
        | 0x1F200
        | 0x20000..=0x2FA1F
        | 0x30000..=0x3134F => Language::ChineseOrJapanese,

        _ => Language::Unknown,
    }
}
/// Iterator over a `&str` that yields maximal runs of consecutive characters
/// sharing the same [`Language`], each run as an owned `String`.
pub struct LanguageIterator<'s> {
    /// Character that starts the next run, already pulled from `input`;
    /// `None` once the text is exhausted.
    current_ch: Option<char>,
    /// Characters that follow `current_ch`.
    input: Chars<'s>,
}
impl<'s> LanguageIterator<'s> {
    /// Creates an iterator over the same-language runs of `input`.
    ///
    /// The first character is read eagerly and stored in `current_ch`, so an
    /// empty `input` produces an iterator that is finished from the start.
    // NOTE(review): `dead_code` is presumably allowed because this constructor
    // is not used in every build configuration — confirm.
    #[allow(dead_code)]
    pub fn new(input: &'s str) -> Self {
        let mut rest = input.chars();
        Self {
            current_ch: rest.next(),
            input: rest,
        }
    }
}
impl<'s> Iterator for LanguageIterator<'s> {
    type Item = String;

    /// Returns the next maximal run of characters that [`get_language`] maps
    /// to the same [`Language`], or `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // `current_ch` is the first character of the run about to be built.
        let first = self.current_ch?;
        let lang = get_language(first as u32);
        let mut segment = first.to_string();

        loop {
            // Record the look-ahead character before inspecting it: if it
            // belongs to another language it starts the next run.
            self.current_ch = self.input.next();
            match self.current_ch {
                Some(c) if get_language(c as u32) == lang => segment.push(c),
                // Language changed, or the input ran out.
                _ => return Some(segment),
            }
        }
    }
}
/// Iterator over a UTF-16 slice that yields runs of consecutive code units
/// classified as the same [`Language`], each run as an owned `Vec<u16>`.
pub struct LanguageIteratorUtf16<'s> {
    /// Code unit that starts the next run, already pulled from `input`;
    /// `None` once the slice is exhausted.
    current_ch: Option<&'s u16>,
    /// Code units that follow `current_ch`.
    input: Iter<'s, u16>,
}
impl<'s> LanguageIteratorUtf16<'s> {
    /// Creates an iterator over the same-language runs of the UTF-16 slice
    /// `input`.
    ///
    /// The first code unit is read eagerly and stored in `current_ch`, so an
    /// empty slice produces an iterator that is finished from the start.
    // NOTE(review): `dead_code` is presumably allowed because this constructor
    // is not used in every build configuration — confirm.
    #[allow(dead_code)]
    pub fn new(input: &'s [u16]) -> Self {
        let mut rest = input.iter();
        Self {
            current_ch: rest.next(),
            input: rest,
        }
    }
}
impl<'s> Iterator for LanguageIteratorUtf16<'s> {
    type Item = Vec<u16>;

    /// Returns the next maximal run of UTF-16 code units whose code points
    /// [`get_language`] maps to the same [`Language`], or `None` once the
    /// input is exhausted.
    ///
    /// A well-formed surrogate pair is decoded to its supplementary-plane code
    /// point before classification and is never split across two runs, so the
    /// result matches what the `&str` iterator produces for the same text.
    /// An unpaired surrogate is classified by its own code unit value.
    fn next(&mut self) -> Option<Self::Item> {
        // `current_ch` is the first code unit of the run about to be built;
        // `input` is positioned just after it.
        let first = self.current_ch?;
        let (code_point, trail) = decode_utf16_code_point(*first, self.input.as_slice());
        let lang = get_language(code_point);

        let mut segment: Vec<u16> = Vec::new();
        segment.push(*first);
        if let Some(trail) = trail {
            segment.push(trail);
            // Step over the trail surrogate that was just consumed.
            self.input.next();
        }

        // A `for` loop would keep `input` mutably borrowed, but the body has
        // to peek at and advance `input` itself, hence `while let`.
        while let Some(unit) = self.input.next() {
            let (code_point, trail) = decode_utf16_code_point(*unit, self.input.as_slice());
            if get_language(code_point) != lang {
                // `unit` starts the next run; its trail surrogate (if any) is
                // still unread in `input`.
                self.current_ch = Some(unit);
                return Some(segment);
            }
            segment.push(*unit);
            if let Some(trail) = trail {
                segment.push(trail);
                self.input.next();
            }
        }

        self.current_ch = None;
        Some(segment)
    }
}

/// Decodes the code point that starts with the code unit `lead`, where `rest`
/// holds the code units that follow it.
///
/// Returns the code point, plus the trail surrogate when `lead` and the first
/// unit of `rest` form a valid surrogate pair. In every other case the value
/// of `lead` itself is returned together with `None`.
fn decode_utf16_code_point(lead: u16, rest: &[u16]) -> (u32, Option<u16>) {
    match rest.first() {
        Some(&trail)
            if (0xD800..=0xDBFF).contains(&lead) && (0xDC00..=0xDFFF).contains(&trail) =>
        {
            let high = u32::from(lead) - 0xD800;
            let low = u32::from(trail) - 0xDC00;
            (0x1_0000 + ((high << 10) | low), Some(trail))
        }
        _ => (u32::from(lead), None),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Sample text shared by the tests below.
    const TEST_STR_THAI: &str = "ภาษาไทยภาษาไทย";
    const TEST_STR_BURMESE: &str = "ဗမာနွယ်ဘာသာစကားမျာ";

    /// Text in a single language comes back as exactly one segment, after
    /// which both iterators report that they are finished.
    #[test]
    fn test_thai_only() {
        let s = TEST_STR_THAI;
        let utf16: Vec<u16> = s.encode_utf16().collect();

        let mut iter = LanguageIteratorUtf16::new(&utf16);
        assert_eq!(
            iter.next(),
            Some(utf16.clone()),
            "Thai language only with UTF-16"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-16 is finished");

        let mut iter = LanguageIterator::new(s);
        assert_eq!(
            iter.next(),
            Some(s.to_string()),
            "Thai language only with UTF-8"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-8 is finished");
    }

    /// Thai text followed directly by Burmese text is split at the point
    /// where the language changes.
    #[test]
    fn test_combine() {
        let mut s = String::from(TEST_STR_THAI);
        s.push_str(TEST_STR_BURMESE);

        let utf16: Vec<u16> = s.encode_utf16().collect();
        let thai_utf16: Vec<u16> = TEST_STR_THAI.encode_utf16().collect();
        let burmese_utf16: Vec<u16> = TEST_STR_BURMESE.encode_utf16().collect();

        let mut iter = LanguageIteratorUtf16::new(&utf16);
        assert_eq!(
            iter.next(),
            Some(thai_utf16),
            "Thai language with UTF-16 at first"
        );
        assert_eq!(
            iter.next(),
            Some(burmese_utf16),
            "Burmese language with UTF-16 at second"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-16 is finished");

        let mut iter = LanguageIterator::new(&s);
        assert_eq!(
            iter.next(),
            Some(TEST_STR_THAI.to_string()),
            "Thai language with UTF-8 at first"
        );
        assert_eq!(
            iter.next(),
            Some(TEST_STR_BURMESE.to_string()),
            "Burmese language with UTF-8 at second"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-8 is finished");
    }

    /// Empty input produces no segments at all.
    #[test]
    fn test_empty() {
        let empty_utf16: [u16; 0] = [];
        let mut iter = LanguageIteratorUtf16::new(&empty_utf16);
        assert_eq!(iter.next(), None, "Empty UTF-16 input has no segment");

        let mut iter = LanguageIterator::new("");
        assert_eq!(iter.next(), None, "Empty UTF-8 input has no segment");
    }
}