1use pinyin::ToPinyin;
2use prettify_pinyin::prettify;
3use serde::{Deserialize, Serialize};
4
5use std::collections::HashMap;
6
7#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
8pub struct Entry {
9 pub id: u64,
11 pub text: String,
13 pub text_alt: Vec<String>,
15 pub category: String,
18 pub tocfl_level: u32,
20 pub situation: String,
22
23 pub written_per_million: u64,
25 pub spoken_per_million: u64,
27
28 pub components: String,
31
32 pub zhuyin: String,
34 pub zhuyin_alt: Vec<String>,
36
37 pub pinyin: String,
39 pub pinyin_alt: Vec<String>,
41}
42
/// Lookup table from a `(word, pinyin)` pair to a value of type `V`.
///
/// The pinyin half of a key is stored without whitespace; an empty pinyin string is used as
/// the "reading unknown" key (see `get_entry_no_pinyin`).
pub struct TOCFLDictionary<V> {
    /// Backing map; the key is `(word, whitespace-free pinyin)`.
    pub hashmap: HashMap<(String, String), V>,
}
46
/// Returns `s` with every character for which `char::is_whitespace` is true removed.
fn remove_whitespace(s: String) -> String {
    s.chars().filter(|c| !c.is_whitespace()).collect()
}
51
52fn normalize_pinyin(pinyin: &str) -> String {
53 let normalized: String = prettify(pinyin.to_string());
54
55 remove_whitespace(normalized)
56}
57
58impl<V> TOCFLDictionary<V> {
59 pub fn get_entry(&self, traditional: &str, pinyin: &str) -> Option<&V> {
70 self.hashmap
71 .get(&(traditional.to_string(), normalize_pinyin(pinyin)))
72 }
73
74 pub fn get_entry_no_pinyin(&self, traditional: &str) -> Option<&V> {
76 self.hashmap.get(&(traditional.to_string(), "".to_string()))
77 }
78
79 pub fn get_entry_multiple(&self, traditional: &str, pinyin: &[&str]) -> Option<&V> {
81 for pinyin in pinyin {
82 if let Some(entry) = self
83 .hashmap
84 .get(&(traditional.to_string(), normalize_pinyin(pinyin)))
85 {
86 return Some(entry);
87 }
88 }
89 self.hashmap.get(&(traditional.to_string(), "".to_string()))
91 }
92
93 pub fn iter(&self) -> impl Iterator<Item = &V> + '_ {
95 self.hashmap.values()
96 }
97}
98
99pub fn compile_common_chars() -> TOCFLDictionary<u64> {
103 let dict = load_tocfl_dictionary();
104
105 let hashmap = dict.hashmap;
106
107 let mut cha_to_pinyin: HashMap<char, Vec<String>> = HashMap::new();
109 for (word, pinyin) in hashmap.keys() {
110 if word.chars().count() != 1 {
111 continue;
112 }
113 for cha in word.chars() {
114 let pinyins = cha_to_pinyin.entry(cha).or_default();
115
116 if pinyin.trim().is_empty() {
117 continue;
118 }
119 pinyins.push(pinyin.to_string());
120 }
121 }
122
123 let mut char_hash_map = HashMap::new();
126 let empty_fall_back = vec![];
127 for ((word, _pinyin), v) in hashmap.iter() {
128 if word.chars().count() <= 1 {
129 continue;
130 }
131 let mut add_entry = |cha: char, pinyin: &str| {
132 let key = (cha.to_string(), remove_whitespace(pinyin.to_string()));
133 let entry = char_hash_map.entry(key).or_insert_with(Default::default);
134 *entry += v.written_per_million;
135 };
136 for cha in word.chars() {
138 let pinyin = cha_to_pinyin.get(&cha).unwrap_or(&empty_fall_back);
139
140 if pinyin.len() == 1 {
141 let pinyin = &pinyin[0];
142 add_entry(cha, &remove_whitespace(pinyin.to_string()));
143
144 add_entry(cha, "");
146 }
147 if pinyin.is_empty() {
148 add_entry(cha, "");
150 if let Some(pinyin) = cha.to_pinyin() {
152 add_entry(cha, pinyin.with_tone());
153 }
154 }
155 }
156 }
157
158 TOCFLDictionary {
159 hashmap: char_hash_map,
160 }
161}
162
163pub fn load_tocfl_dictionary() -> TOCFLDictionary<Entry> {
164 let rows = include_str!("../tocfl_words.json");
165 let hashmap: HashMap<(String, String), Entry> = rows
166 .lines()
167 .flat_map(|line| {
168 let entry: Entry = serde_json::from_str(line).unwrap();
169 let mut first_and_pinyin_fallback = vec![
170 (entry.text.to_string(), entry.pinyin.to_string()),
171 (entry.text.to_string(), "".to_string()),
172 ];
173 let other = entry
174 .text_alt
175 .iter()
176 .map(ToString::to_string)
177 .zip(entry.pinyin_alt.iter().map(ToString::to_string));
178 first_and_pinyin_fallback.extend(other);
179 first_and_pinyin_fallback
180 .into_iter()
181 .map(move |(chin, pin)| ((chin.to_string(), remove_whitespace(pin)), entry.clone()))
182 })
183 .collect();
184
185 TOCFLDictionary { hashmap }
186}
/// Spaced tone-marked and tone-numbered spellings normalize to the same key.
#[test]
fn test_normalize() {
    for input in ["yì shì", "yi4 shi4"].iter() {
        assert_eq!(normalize_pinyin(input), "yìshì");
    }
}
194
/// A two-character word is found by its unspaced, tone-marked pinyin.
#[test]
fn entry_test1() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸爸", "bàba").is_some());
}
199
/// A space between the syllables does not prevent the lookup from matching.
#[test]
fn entry_test2() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸爸", "bà ba").is_some());
}
204
/// The same word is reachable via spaced, unspaced and tone-numbered pinyin.
#[test]
fn entry_awareness() {
    let dict = load_tocfl_dictionary();
    for spelling in ["yì shì", "yìshì", "yi4 shi4"].iter() {
        dict.get_entry("意識", spelling).unwrap();
    }
}
219
/// A single-character word is found by its tone-marked pinyin.
#[test]
fn entry_test3() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸", "bà").is_some());
}
224
/// Spaced pinyin with two different tone marks resolves to the word.
#[test]
fn entry_test4() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("安靜", "ān jìng").is_some());
}
/// The first-tone reading is found both as tone mark and as tone number.
#[test]
fn entry_test_fen1() {
    let dict = load_tocfl_dictionary();
    for spelling in ["fēn", "fen1"].iter() {
        dict.get_entry("分", spelling).unwrap();
    }
}
/// A word can be looked up without supplying any pinyin.
///
/// The previous version wrapped the lookup in `dbg!`, which only printed the entry; the
/// intent of the test is now stated as an explicit assertion instead.
#[test]
fn entry_test_pian_yi() {
    let dict = load_tocfl_dictionary();
    assert!(
        dict.get_entry_no_pinyin("便宜").is_some(),
        "expected an entry under the empty-pinyin key"
    );
}
240
/// The fourth-tone reading of this character has no entry in the word list.
#[test]
fn entry_test_fen2() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("分", "fèn").is_none());
}
245
/// The character total is the same under the empty-pinyin key and under its reading.
#[test]
fn entry_test_taberu() {
    let chars = compile_common_chars();
    assert_eq!(chars.get_entry_no_pinyin("食").copied(), Some(712));
    assert_eq!(chars.get_entry("食", "shí").copied(), Some(712));
}
251
/// Total for 繪, queried with tone-numbered pinyin and without pinyin.
#[test]
fn entry_test_hui_painting() {
    let chars = compile_common_chars();
    assert_eq!(chars.get_entry("繪", "hui4").copied(), Some(120));
    assert_eq!(chars.get_entry_no_pinyin("繪").copied(), Some(120));
}
257
/// Total for 會, queried with tone-numbered pinyin and without pinyin.
#[test]
fn entry_test_hui_meeting() {
    let chars = compile_common_chars();
    assert_eq!(chars.get_entry("會", "hui4").copied(), Some(3624));
    assert_eq!(chars.get_entry_no_pinyin("會").copied(), Some(3624));
}