use std::sync::LazyLock;
use fancy_regex::Regex;
use unicode_normalization::{is_nfc, UnicodeNormalization};
/// A span of Unicode code points stored as `(first, last)`.
///
/// The range helpers in this file treat both endpoints as part of the span.
pub type CodepointRange = (u32, u32);

/// Ideograph ranges: CJK Unified Ideographs (U+4E00..=U+9FFF) and
/// CJK Unified Ideographs Extension A (U+3400..=U+4DBF).
pub const CJK_IDEOGRAPH_RANGES: &[CodepointRange] = &[
    (0x4E00, 0x9FFF),
    (0x3400, 0x4DBF),
];

/// CJK Symbols and Punctuation (U+3000..=U+303F).
pub const CJK_PUNCTUATION_RANGE: CodepointRange = (0x3000, 0x303F);

/// Halfwidth and Fullwidth Forms (U+FF00..=U+FFEF).
pub const FULLWIDTH_CHARACTER_RANGES: &[CodepointRange] = &[
    (0xFF00, 0xFFEF),
];

/// Bopomofo (U+3100..=U+312F).
pub const BOPOMOFO_RANGE: CodepointRange = (0x3100, 0x312F);

/// Bopomofo Extended (U+31A0..=U+31BF).
pub const BOPOMOFO_EXTENDED_RANGE: CodepointRange = (0x31A0, 0x31BF);

/// Ideographic Symbols and Punctuation (U+16FE0..=U+16FFF).
pub const IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION_RANGE: CodepointRange = (0x16FE0, 0x16FFF);

/// Small Form Variants (U+FE50..=U+FE6F).
pub const SMALL_FORM_RANGE: CodepointRange = (0xFE50, 0xFE6F);

/// Vertical Forms (U+FE10..=U+FE1F).
pub const VERTICAL_FORM_RANGE: CodepointRange = (0xFE10, 0xFE1F);

/// Every range that counts as "Chinese" for the detection helpers.
///
/// The first two entries repeat the contents of `CJK_IDEOGRAPH_RANGES` and the
/// fourth repeats `FULLWIDTH_CHARACTER_RANGES`; they are spelled out here
/// because a const slice cannot splice another slice in, so keep them in sync
/// by hand.
pub const CHINESE_RANGES: &[CodepointRange] = &[
    // Same as CJK_IDEOGRAPH_RANGES.
    (0x4E00, 0x9FFF),
    (0x3400, 0x4DBF),
    CJK_PUNCTUATION_RANGE,
    // Same as FULLWIDTH_CHARACTER_RANGES.
    (0xFF00, 0xFFEF),
    BOPOMOFO_RANGE,
    BOPOMOFO_EXTENDED_RANGE,
    IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION_RANGE,
    SMALL_FORM_RANGE,
    VERTICAL_FORM_RANGE,
];
/// Reports whether `code_point` falls inside at least one of `ranges`.
///
/// Each range is checked with both endpoints included. An empty `ranges`
/// slice, or a range whose first value exceeds its last, never matches.
pub fn is_code_point_in_ranges(code_point: u32, ranges: &[CodepointRange]) -> bool {
    ranges
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code_point))
}
pub fn is_string_partially_chinese(s: &str) -> bool {
if s.is_empty() {
return false;
}
for c in s.chars() {
if is_code_point_chinese(c as u32) {
return true;
}
}
false
}
/// Reports whether `code_point` lies in any range listed in `CHINESE_RANGES`.
///
/// This is a thin wrapper that delegates the lookup to
/// `is_code_point_in_ranges`.
pub fn is_code_point_chinese(code_point: u32) -> bool {
    is_code_point_in_ranges(code_point, CHINESE_RANGES)
}
/// Pattern for the text that `normalize_pinyin` removes.
///
/// It matches a single whitespace character, `・`, `:`, `'`, `’` or `-`, or the
/// two-character sequence `//`. The regex is compiled on first use and then
/// shared for the lifetime of the program.
static PINYIN_CLEANUP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    // The pattern is a fixed literal, so a compile failure is a programming
    // error; name the offending static in the panic message.
    Regex::new(r"[\s・:'’-]|\/\/")
        .expect("PINYIN_CLEANUP_REGEX is a hard-coded pattern and must compile")
});
/// Produces a canonical form of a pinyin string for comparison.
///
/// The input is converted to Unicode NFC, lowercased, and then every match of
/// `PINYIN_CLEANUP_REGEX` is deleted. The result is always a newly allocated
/// `String`.
pub fn normalize_pinyin(s: &str) -> String {
    // Compose first so that base letter + combining tone mark pairs become
    // single code points before case folding.
    let folded = s.nfc().collect::<String>().to_lowercase();
    PINYIN_CLEANUP_REGEX.replace_all(&folded, "").into_owned()
}
/// Unit tests for the Chinese detection helpers and pinyin normalisation.
///
/// Gated behind `cfg(test)` so the module is only compiled for test builds,
/// and every case is a `#[test]` with assertions so the harness runs it.
#[cfg(test)]
mod zh_tests {
    use super::{is_code_point_chinese, is_string_partially_chinese, normalize_pinyin};
    use unicode_normalization::{is_nfc, UnicodeNormalization};

    #[test]
    fn detects_partially_chinese_strings() {
        assert!(is_string_partially_chinese("你好世界"));
        assert!(!is_string_partially_chinese("Hello"));
        assert!(!is_string_partially_chinese(""));
    }

    #[test]
    fn detects_chinese_code_points() {
        assert!(is_code_point_chinese('世' as u32));
        assert!(!is_code_point_chinese('A' as u32));
    }

    #[test]
    fn normalizes_pinyin() {
        // Expected values use escapes so they are precomposed (NFC) no matter
        // how an editor stores this file.
        assert_eq!(normalize_pinyin("Nǐ hǎo"), "n\u{01D0}h\u{01CE}o");
        assert_eq!(
            normalize_pinyin("ni³ hao³ // comment"),
            "ni\u{00B3}hao\u{00B3}comment"
        );
        assert_eq!(
            normalize_pinyin("Pīn・yīn: 'test' - example"),
            "p\u{012B}ny\u{012B}ntestexample"
        );
    }

    #[test]
    fn nfc_composes_decomposed_input() {
        // Spelled with escapes: the two forms render identically, so literal
        // characters would make the decomposed case impossible to tell apart.
        let precomposed = "\u{00E9}";
        let decomposed = "e\u{0301}";

        assert!(is_nfc(precomposed));
        assert!(!is_nfc(decomposed));

        let recomposed: String = decomposed.nfc().collect();
        assert_eq!(recomposed, precomposed);
    }
}