use prettify_pinyin::prettify;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// A single vocabulary entry from the bundled TOCFL word list.
///
/// Deserialized one-per-line from `tocfl_words.json` (see
/// `load_tocfl_dictionary`).
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Entry {
// Unique identifier of the entry.
pub id: u64,
// Primary traditional-character form of the word.
pub text: String,
// Alternate written forms; paired index-wise with `pinyin_alt`
// when building dictionary keys.
pub text_alt: Vec<String>,
pub category: String,
// TOCFL proficiency level the word is assigned to.
pub tocfl_level: u32,
pub situation: String,
// Frequency counts (occurrences per million words) in written
// and spoken corpora respectively.
pub written_per_million: u64,
pub spoken_per_million: u64,
pub components: String,
// Zhuyin (bopomofo) reading and alternates.
pub zhuyin: String,
pub zhuyin_alt: Vec<String>,
// Pinyin reading and alternates; may contain spaces/tone numbers
// before normalization.
pub pinyin: String,
pub pinyin_alt: Vec<String>,
}
/// Dictionary keyed by `(traditional text, whitespace-free pinyin)` pairs.
///
/// An empty-string pinyin key acts as a fallback for lookups where the
/// pinyin is unknown (see `get_entry`).
pub struct TOCFLDictionary<V> {
pub hashmap: HashMap<(String, String), V>,
}
/// Returns `s` with every Unicode whitespace character removed.
fn remove_whitespace(s: String) -> String {
    s.chars().filter(|c| !c.is_whitespace()).collect()
}
/// Converts pinyin to its tone-marked ("pretty") form and strips all
/// whitespace, yielding the canonical dictionary-key representation.
///
/// E.g. `"bà ba"` and `"bàba"` both normalize to `"bàba"`.
fn normalize_pinyin(pinyin: &str) -> String {
    // Single expression replaces the previous redundant
    // `let`-then-return chain (clippy: `let_and_return`).
    remove_whitespace(prettify(pinyin.to_string()))
}
impl<V> TOCFLDictionary<V> {
    /// Looks up a value by traditional characters and pinyin.
    ///
    /// The pinyin is normalized (tone-marked, whitespace stripped) before
    /// the lookup. If no exact match exists, falls back to an entry keyed
    /// with an empty pinyin string, when present.
    pub fn get_entry(&self, traditional: &str, pinyin: &str) -> Option<&V> {
        self.hashmap
            .get(&(traditional.to_string(), normalize_pinyin(pinyin)))
            // `or_else` defers building the fallback key — `Option::or`
            // would allocate the `(String, String)` tuple on every call,
            // even when the primary lookup already succeeded.
            .or_else(|| self.hashmap.get(&(traditional.to_string(), String::new())))
    }
    /// Iterates over all values in the dictionary (arbitrary order).
    pub fn iter(&self) -> impl Iterator<Item = &V> + '_ {
        self.hashmap.values()
    }
}
/// Builds a per-character frequency dictionary from multi-character words.
///
/// Single-character dictionary entries determine each character's pinyin;
/// every character occurring in a longer word then accumulates that word's
/// `written_per_million` count under its `(character, pinyin)` key.
/// Characters with no single-character entry fall back to an empty-string
/// pinyin key, which `get_entry` uses as its fallback.
pub fn compile_common_chars() -> TOCFLDictionary<u64> {
    let dict = load_tocfl_dictionary();
    let hashmap = dict.hashmap;
    // Map each character to the pinyin of its single-character entry.
    let mut cha_to_pinyin = HashMap::new();
    for (word, pinyin) in hashmap.keys() {
        if word.chars().count() != 1 {
            continue;
        }
        for cha in word.chars() {
            // `or_insert_with` defers the String allocation to the miss
            // case; `or_insert` would allocate on every iteration.
            cha_to_pinyin.entry(cha).or_insert_with(|| pinyin.to_string());
        }
    }
    let mut char_hash_map: HashMap<(String, String), u64> = HashMap::new();
    let empty_fall_back = "".to_string();
    for ((word, _pinyin), v) in hashmap.iter() {
        // Only multi-character words contribute to the counts.
        if word.chars().count() <= 1 {
            continue;
        }
        for cha in word.chars() {
            let pinyin = cha_to_pinyin.get(&cha).unwrap_or(&empty_fall_back);
            let key = (cha.to_string(), remove_whitespace(pinyin.to_string()));
            // Single entry-API lookup replaces the previous
            // `or_insert_with(Default::default)` + separate add.
            *char_hash_map.entry(key).or_default() += v.written_per_million;
        }
    }
    TOCFLDictionary {
        hashmap: char_hash_map,
    }
}
/// Loads the bundled TOCFL word list (one JSON `Entry` per line) into a
/// dictionary keyed by `(traditional text, whitespace-free pinyin)`.
///
/// Each entry is inserted under its primary `text`/`pinyin` pair and under
/// every alternate pair formed from `text_alt`/`pinyin_alt`.
///
/// # Panics
/// Panics if any line of the embedded JSON fails to deserialize.
pub fn load_tocfl_dictionary() -> TOCFLDictionary<Entry> {
    let rows = include_str!("../tocfl_words.json");
    let hashmap: HashMap<(String, String), Entry> = rows
        .lines()
        .flat_map(|line| {
            // The word list is compiled into the binary, so a parse failure
            // is a build-data bug; fail loudly with context.
            let entry: Entry =
                serde_json::from_str(line).expect("invalid JSON line in tocfl_words.json");
            // `clone()` instead of `to_string()` on values that are already
            // `String` (avoids the Display round-trip).
            let mut keys = vec![(entry.text.clone(), entry.pinyin.clone())];
            // NOTE(review): `zip` silently drops unmatched items if
            // `text_alt` and `pinyin_alt` differ in length — confirm the
            // data guarantees equal lengths.
            let alternates = entry
                .text_alt
                .iter()
                .cloned()
                .zip(entry.pinyin_alt.iter().cloned());
            keys.extend(alternates);
            keys.into_iter()
                .map(move |(chin, pin)| ((chin, remove_whitespace(pin)), entry.clone()))
        })
        .collect();
    TOCFLDictionary { hashmap }
}
#[test]
fn entry_test1() {
    // Exact lookup: tone-marked pinyin with no spaces.
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸爸", "bàba").is_some());
}
#[test]
fn entry_test2() {
    // Lookup pinyin containing spaces must normalize to the same key.
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸爸", "bà ba").is_some());
}
#[test]
fn entry_test3() {
    // Single-character entries are reachable too.
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸", "bà").is_some());
}
#[test]
fn entry_test4() {
    // Multi-syllable word with spaced pinyin input.
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("安靜", "ān jìng").is_some());
}