use pinyin::ToPinyin;
use prettify_pinyin::prettify;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// One dictionary record, deserialized from a single line of the embedded
/// `tocfl_words.json` data by `load_tocfl_dictionary`.
///
/// Field meanings that are not exercised by code in this file are inferred
/// from their names only and are marked as such — TODO confirm against the
/// data source.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct Entry {
/// Record identifier (presumably unique per record — verify against the data).
pub id: u64,
/// Primary written form; together with `pinyin` it forms the main lookup key.
pub text: String,
/// Alternative written forms; paired position-by-position with `pinyin_alt`
/// to create additional lookup keys.
pub text_alt: Vec<String>,
/// Category label (presumably a part-of-speech or word class — TODO confirm).
pub category: String,
/// TOCFL level of the word (value range not visible here — TODO confirm).
pub tocfl_level: u32,
/// Usage situation / topic label (semantics not visible here — TODO confirm).
pub situation: String,
/// Frequency figure for written usage; `compile_common_chars` sums this value
/// per character.
pub written_per_million: u64,
/// Frequency figure for spoken usage (presumably per million words — TODO confirm).
pub spoken_per_million: u64,
/// Component description of the word (format not visible here — TODO confirm).
pub components: String,
/// Zhuyin (Bopomofo) reading of `text` (inferred from the name).
pub zhuyin: String,
/// Alternative Zhuyin readings (inferred from the name).
pub zhuyin_alt: Vec<String>,
/// Pinyin reading of `text`; whitespace is stripped from it when the lookup
/// key is built.
pub pinyin: String,
/// Alternative Pinyin readings; paired position-by-position with `text_alt`.
pub pinyin_alt: Vec<String>,
}
/// Lookup table keyed by `(written form, whitespace-free pinyin)`.
///
/// An empty pinyin string in the key acts as the "pinyin unknown" fallback
/// slot that `get_entry_no_pinyin` and `get_entry_multiple` consult.
///
/// `Debug` and `Clone` are derived so the dictionary can be inspected in
/// logs/tests and duplicated; both are only available when `V` implements
/// the corresponding trait.
#[derive(Debug, Clone)]
pub struct TOCFLDictionary<V> {
    /// Backing map from `(written form, pinyin)` to the stored value.
    pub hashmap: HashMap<(String, String), V>,
}
/// Returns `s` with every Unicode whitespace character removed.
fn remove_whitespace(s: String) -> String {
    s.chars().filter(|c| !c.is_whitespace()).collect()
}
/// Brings a pinyin string into the form used for dictionary keys.
///
/// The input is passed through `prettify` and then stripped of all
/// whitespace; per the tests in this file, both `"yì shì"` and `"yi4 shi4"`
/// come out as `"yìshì"`.
fn normalize_pinyin(pinyin: &str) -> String {
    remove_whitespace(prettify(pinyin.to_owned()))
}
impl<V> TOCFLDictionary<V> {
    /// Looks up the value stored for `traditional` with the given `pinyin`.
    ///
    /// `pinyin` is normalized with `normalize_pinyin` before the lookup.
    /// Returns `None` when no such key exists.
    pub fn get_entry(&self, traditional: &str, pinyin: &str) -> Option<&V> {
        let key = (traditional.to_owned(), normalize_pinyin(pinyin));
        self.hashmap.get(&key)
    }

    /// Looks up the value stored for `traditional` under the empty-pinyin key.
    pub fn get_entry_no_pinyin(&self, traditional: &str) -> Option<&V> {
        let key = (traditional.to_owned(), String::new());
        self.hashmap.get(&key)
    }

    /// Tries each candidate in `pinyin` in order and returns the first hit.
    ///
    /// When none of the candidates match (or the slice is empty), falls back
    /// to the empty-pinyin key for `traditional`.
    pub fn get_entry_multiple(&self, traditional: &str, pinyin: &[&str]) -> Option<&V> {
        pinyin
            .iter()
            .find_map(|candidate| self.get_entry(traditional, candidate))
            .or_else(|| self.get_entry_no_pinyin(traditional))
    }

    /// Iterates over all stored values in the map's (unspecified) order.
    ///
    /// A value that is stored under several keys is yielded once per key.
    pub fn iter(&self) -> impl Iterator<Item = &V> + '_ {
        self.hashmap.values()
    }
}
/// Builds a per-character table of summed `written_per_million` values.
///
/// Pass 1 collects, for every single-character dictionary word, the non-empty
/// pinyin readings it is stored under. Pass 2 walks every multi-character
/// word and, for each of its characters, adds the word's
/// `written_per_million` to that character's totals:
///
/// * exactly one known reading: credited under that reading and under the
///   empty-pinyin key;
/// * no known reading: credited under the empty-pinyin key and, when the
///   `pinyin` crate knows the character, under its toned reading;
/// * several known readings: the character is skipped.
///
/// NOTE(review): the loaded dictionary stores each entry under several keys
/// (primary pinyin, empty pinyin, alternates), and every key is visited here,
/// so a word's frequency is added once per key. The tests in this file pin
/// the resulting totals — confirm this is intended before changing it.
pub fn compile_common_chars() -> TOCFLDictionary<u64> {
    let TOCFLDictionary { hashmap: words } = load_tocfl_dictionary();

    // Pass 1: known readings of characters that appear as standalone words.
    let mut readings_by_char: HashMap<char, Vec<String>> = HashMap::new();
    for (word, pinyin) in words.keys() {
        let mut chars = word.chars();
        if let (Some(cha), None) = (chars.next(), chars.next()) {
            // The map slot is created even when the reading is blank.
            let readings = readings_by_char.entry(cha).or_default();
            if !pinyin.trim().is_empty() {
                readings.push(pinyin.clone());
            }
        }
    }

    // Pass 2: accumulate frequencies from multi-character words.
    let mut totals: HashMap<(String, String), u64> = HashMap::new();
    for ((word, _), entry) in &words {
        if word.chars().count() <= 1 {
            continue;
        }
        let weight = entry.written_per_million;
        let mut credit = |cha: char, reading: &str| {
            let key = (cha.to_string(), remove_whitespace(reading.to_string()));
            *totals.entry(key).or_default() += weight;
        };
        for cha in word.chars() {
            let known: &[String] = readings_by_char
                .get(&cha)
                .map(Vec::as_slice)
                .unwrap_or_default();
            match known {
                [only] => {
                    credit(cha, only);
                    credit(cha, "");
                }
                [] => {
                    credit(cha, "");
                    if let Some(guessed) = cha.to_pinyin() {
                        credit(cha, guessed.with_tone());
                    }
                }
                // Ambiguous characters (several readings) are not counted.
                _ => {}
            }
        }
    }

    TOCFLDictionary { hashmap: totals }
}
/// Parses the embedded `tocfl_words.json` (one JSON object per line) into a
/// dictionary.
///
/// Every entry is stored under several keys, inserted in this order:
/// `(text, pinyin)`, `(text, "")`, then each `(text_alt[i], pinyin_alt[i])`
/// pair. Whitespace is removed from the pinyin part of each key. When two
/// keys collide, the entry inserted later replaces the earlier one.
///
/// # Panics
///
/// Panics if a line of the embedded data is not a valid `Entry`.
pub fn load_tocfl_dictionary() -> TOCFLDictionary<Entry> {
    let raw = include_str!("../tocfl_words.json");
    let mut hashmap: HashMap<(String, String), Entry> = HashMap::new();

    for line in raw.lines() {
        let entry: Entry = serde_json::from_str(line).unwrap();

        let mut keys = Vec::with_capacity(2 + entry.text_alt.len());
        keys.push((entry.text.clone(), entry.pinyin.clone()));
        // Empty pinyin is the fallback slot for lookups without a reading.
        keys.push((entry.text.clone(), String::new()));
        keys.extend(
            entry
                .text_alt
                .iter()
                .cloned()
                .zip(entry.pinyin_alt.iter().cloned()),
        );

        for (text, pinyin) in keys {
            hashmap.insert((text, remove_whitespace(pinyin)), entry.clone());
        }
    }

    TOCFLDictionary { hashmap }
}
#[test]
fn test_normalize() {
    // Tone-mark and tone-number spellings must collapse to the same key.
    let expected = "yìshì";
    for raw in ["yì shì", "yi4 shi4"].iter() {
        assert_eq!(normalize_pinyin(raw), expected);
    }
}
#[test]
fn entry_test1() {
    // Pinyin given without a space between the syllables.
    let dict = load_tocfl_dictionary();
    let found = dict.get_entry("爸爸", "bàba");
    found.unwrap();
}
#[test]
fn entry_test2() {
    // Pinyin given with a space between the syllables.
    let dict = load_tocfl_dictionary();
    let found = dict.get_entry("爸爸", "bà ba");
    found.unwrap();
}
#[test]
fn entry_awareness() {
    // Spaced, unspaced and tone-number spellings must all find the word.
    let dict = load_tocfl_dictionary();
    for spelling in ["yì shì", "yìshì", "yi4 shi4"].iter() {
        dict.get_entry("意識", spelling).unwrap();
    }
}
#[test]
fn entry_test3() {
    // Single-character word lookup.
    let dict = load_tocfl_dictionary();
    let found = dict.get_entry("爸", "bà");
    found.unwrap();
}
#[test]
fn entry_test4() {
    // Two-syllable word looked up with spaced pinyin.
    let dict = load_tocfl_dictionary();
    let found = dict.get_entry("安靜", "ān jìng");
    found.unwrap();
}
#[test]
fn entry_test_fen1() {
    // Tone-mark and tone-number spellings of the same reading both resolve.
    let dict = load_tocfl_dictionary();
    for spelling in ["fēn", "fen1"].iter() {
        dict.get_entry("分", spelling).unwrap();
    }
}
#[test]
fn entry_test_pian_yi() {
    // The word must be reachable through the empty-pinyin fallback key.
    // (The leftover `dbg!` dump was replaced by an explicit assertion.)
    let dict = load_tocfl_dictionary();
    assert!(
        dict.get_entry_no_pinyin("便宜").is_some(),
        "expected an entry for 便宜 under the empty-pinyin key"
    );
}
#[test]
fn entry_test_fen2() {
    // The fourth-tone reading of this character is not a dictionary key.
    let dict = load_tocfl_dictionary();
    let lookup = dict.get_entry("分", "fèn");
    assert_eq!(lookup, None);
}
#[test]
fn entry_test_taberu() {
    // Same total expected with and without an explicit reading.
    let common = compile_common_chars();
    let expected = Some(&712);
    assert_eq!(common.get_entry_no_pinyin("食"), expected);
    assert_eq!(common.get_entry("食", "shí"), expected);
}
#[test]
fn entry_test_hui_painting() {
    // Same total expected with a tone-number reading and with no reading.
    let common = compile_common_chars();
    let expected = Some(&120);
    assert_eq!(common.get_entry("繪", "hui4"), expected);
    assert_eq!(common.get_entry_no_pinyin("繪"), expected);
}
#[test]
fn entry_test_hui_meeting() {
    // Same total expected with a tone-number reading and with no reading.
    let common = compile_common_chars();
    let expected = Some(&3624);
    assert_eq!(common.get_entry("會", "hui4"), expected);
    assert_eq!(common.get_entry_no_pinyin("會"), expected);
}