//! tocfl/lib.rs — TOCFL (Test of Chinese as a Foreign Language) vocabulary dictionary.

1use pinyin::ToPinyin;
2use prettify_pinyin::prettify;
3use serde::{Deserialize, Serialize};
4
5use std::collections::HashMap;
6
/// A single TOCFL vocabulary entry, deserialized from one JSON line of `tocfl_words.json`.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct Entry {
    /// Ordinal
    pub id: u64,
    /// Traditional Chinese
    pub text: String,
    /// Other traditional-text variants
    pub text_alt: Vec<String>,
    /// Category
    /// Base, advanced etc. in Chinese, e.g. 基礎
    pub category: String,
    /// TOCFL Level, 1-7
    pub tocfl_level: u32,
    /// Situation the word is used
    pub situation: String,

    /// Count per million written
    pub written_per_million: u64,
    /// Count per million spoken
    pub spoken_per_million: u64,

    /// Component breakdown of the word; exact semantics unclear from the
    /// source data — TODO confirm against the upstream dataset.
    /// e.g. [['的', ['7457', '8310', '8568']]]
    pub components: String,

    /// Zhuyin
    pub zhuyin: String,
    /// Zhuyin other variants (presumably parallel to `text_alt` — verify against data)
    pub zhuyin_alt: Vec<String>,

    /// Pinyin
    pub pinyin: String,
    /// Pinyin other variants (zipped positionally with `text_alt` when building lookup keys)
    pub pinyin_alt: Vec<String>,
}
42
/// Dictionary keyed by `(traditional_text, whitespace-free pinyin)`.
///
/// An empty pinyin string in the key serves as the fallback for
/// pinyin-less lookups (see `get_entry_no_pinyin`).
pub struct TOCFLDictionary<V> {
    pub hashmap: HashMap<(String, String), V>,
}
46
/// Returns `s` with every Unicode whitespace character removed.
fn remove_whitespace(s: String) -> String {
    s.chars().filter(|ch| !ch.is_whitespace()).collect()
}
51
52fn normalize_pinyin(pinyin: &str) -> String {
53    let normalized: String = prettify(pinyin.to_string());
54
55    remove_whitespace(normalized)
56}
57
58impl<V> TOCFLDictionary<V> {
59    /// Get an entry for its traditional chinese character + pinyin combination
60    /// Prefer to use this to differentiate between different characters that have multiple pronounciations
61    ///
62    /// Note that some characters have multiple pronounciations, e.g. 分 fēn and fèn
63    ///
64    /// The pinyin can have the format "yì" or "yi4"
65    ///
66    /// # Limitation
67    /// Note that some characters don't have a pinyin, e.g. 食.
68    ///
69    pub fn get_entry(&self, traditional: &str, pinyin: &str) -> Option<&V> {
70        self.hashmap
71            .get(&(traditional.to_string(), normalize_pinyin(pinyin)))
72    }
73
74    /// Get an entry for its traditional chinese character
75    pub fn get_entry_no_pinyin(&self, traditional: &str) -> Option<&V> {
76        self.hashmap.get(&(traditional.to_string(), "".to_string()))
77    }
78
79    /// Get an entry for its traditional + [&pinyin] combination
80    pub fn get_entry_multiple(&self, traditional: &str, pinyin: &[&str]) -> Option<&V> {
81        for pinyin in pinyin {
82            if let Some(entry) = self
83                .hashmap
84                .get(&(traditional.to_string(), normalize_pinyin(pinyin)))
85            {
86                return Some(entry);
87            }
88        }
89        //fallback remove pinyin
90        self.hashmap.get(&(traditional.to_string(), "".to_string()))
91    }
92
93    /// Iterator over all entries
94    pub fn iter(&self) -> impl Iterator<Item = &V> + '_ {
95        self.hashmap.values()
96    }
97}
98
/// Compile a hashmap of `HashMap<(Char, Pinyin), CountPerMillion>` by building a commonness HashMap of chars from words
///
/// Those chars may not be common themselves and may be more common in words
pub fn compile_common_chars() -> TOCFLDictionary<u64> {
    let dict = load_tocfl_dictionary();

    let hashmap = dict.hashmap;

    // Pass 1: map each character that appears as a single-character dictionary
    // word to its known pinyin spellings.
    // Note that we only add pinyin if there is only one character in the word.
    let mut cha_to_pinyin: HashMap<char, Vec<String>> = HashMap::new();
    for (word, pinyin) in hashmap.keys() {
        if word.chars().count() != 1 {
            continue;
        }
        for cha in word.chars() {
            let pinyins = cha_to_pinyin.entry(cha).or_default();

            // Skip the empty-pinyin fallback keys that load_tocfl_dictionary inserts.
            if pinyin.trim().is_empty() {
                continue;
            }
            pinyins.push(pinyin.to_string());
        }
    }

    // Pass 2: credit each multi-character word's written frequency to every
    // character it contains.
    // We add the word parts to the chars, although single chars may be not that common
    // e.g. 午 on its own is uncommon, but 下午 [xiawu] is quite common
    let mut char_hash_map = HashMap::new();
    let empty_fall_back = vec![];
    for ((word, _pinyin), v) in hashmap.iter() {
        // Only multi-character words contribute counts; single-character words
        // only seeded the pinyin map in pass 1.
        if word.chars().count() <= 1 {
            continue;
        }
        // Accumulate this word's written-per-million count under (char, pinyin).
        let mut add_entry = |cha: char, pinyin: &str| {
            let key = (cha.to_string(), remove_whitespace(pinyin.to_string()));
            let entry = char_hash_map.entry(key).or_insert_with(Default::default);
            *entry += v.written_per_million;
        };
        // TODO tokenize _pinyin and use that would be better
        for cha in word.chars() {
            let pinyin = cha_to_pinyin.get(&cha).unwrap_or(&empty_fall_back);

            // Attribute a pinyin only when it is unambiguous (exactly one known
            // pronunciation). NOTE(review): duplicates pushed in pass 1 could make
            // a character with one pronunciation look ambiguous here — verify.
            if pinyin.len() == 1 {
                let pinyin = &pinyin[0];
                add_entry(cha, &remove_whitespace(pinyin.to_string()));

                // Add empty fallback
                add_entry(cha, "");
            }
            if pinyin.is_empty() {
                // Add empty fallback
                add_entry(cha, "");
                // Add default from character to pinyin conversion
                if let Some(pinyin) = cha.to_pinyin() {
                    add_entry(cha, pinyin.with_tone());
                }
            }
        }
    }

    TOCFLDictionary {
        hashmap: char_hash_map,
    }
}
162
163pub fn load_tocfl_dictionary() -> TOCFLDictionary<Entry> {
164    let rows = include_str!("../tocfl_words.json");
165    let hashmap: HashMap<(String, String), Entry> = rows
166        .lines()
167        .flat_map(|line| {
168            let entry: Entry = serde_json::from_str(line).unwrap();
169            let mut first_and_pinyin_fallback = vec![
170                (entry.text.to_string(), entry.pinyin.to_string()),
171                (entry.text.to_string(), "".to_string()),
172            ];
173            let other = entry
174                .text_alt
175                .iter()
176                .map(ToString::to_string)
177                .zip(entry.pinyin_alt.iter().map(ToString::to_string));
178            first_and_pinyin_fallback.extend(other);
179            first_and_pinyin_fallback
180                .into_iter()
181                .map(move |(chin, pin)| ((chin.to_string(), remove_whitespace(pin)), entry.clone()))
182        })
183        .collect();
184
185    TOCFLDictionary { hashmap }
186}
#[test]
fn test_normalize() {
    // Tone-marked and numbered pinyin normalize to the same compact form.
    assert_eq!(normalize_pinyin("yì shì"), "yìshì");
    assert_eq!(normalize_pinyin("yi4 shi4"), "yìshì");
    // Unspaced numbered pinyin ("yi4shi4") would need a tokenizer to split.
}
194
#[test]
fn entry_test1() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸爸", "bàba").is_some());
}
199
#[test]
fn entry_test2() {
    // Spaced pinyin is normalized before lookup.
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸爸", "bà ba").is_some());
}
204
#[test]
fn entry_awareness() {
    let dict = load_tocfl_dictionary();
    // Spaced and unspaced tone-marked pinyin both resolve.
    assert!(dict.get_entry("意識", "yì shì").is_some());
    assert!(dict.get_entry("意識", "yìshì").is_some());
    // Numbered pinyin ("yi4 shi4") is prettified to tone marks before lookup.
    assert!(dict.get_entry("意識", "yi4 shi4").is_some());
    // Unspaced numbered pinyin ("yi4shi4") is unsupported without a tokenizer.
}
219
#[test]
fn entry_test3() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("爸", "bà").is_some());
}
224
#[test]
fn entry_test4() {
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("安靜", "ān jìng").is_some());
}
#[test]
fn entry_test_fen1() {
    // 分 (fēn) resolves from both tone-marked and numbered pinyin.
    let dict = load_tocfl_dictionary();
    assert!(dict.get_entry("分", "fēn").is_some());
    assert!(dict.get_entry("分", "fen1").is_some());
}
#[test]
fn entry_test_pian_yi() {
    let dict = load_tocfl_dictionary();
    let entry = dict.get_entry_no_pinyin("便宜").unwrap();
    dbg!(entry);
}
240
#[test]
fn entry_test_fen2() {
    // 分 is stored under fēn, not fèn, so this lookup must miss.
    let dict = load_tocfl_dictionary();
    assert_eq!(dict.get_entry("分", "fèn"), None);
}
245
#[test]
fn entry_test_taberu() {
    let chars = compile_common_chars();
    // 食 gets its count from words containing it; the pinyin-less and
    // shí lookups agree.
    assert_eq!(chars.get_entry_no_pinyin("食"), Some(&712));
    assert_eq!(chars.get_entry("食", "shí"), Some(&712));
}
251
#[test]
fn entry_test_hui_painting() {
    let chars = compile_common_chars();
    assert_eq!(chars.get_entry("繪", "hui4"), Some(&120));
    assert_eq!(chars.get_entry_no_pinyin("繪"), Some(&120));
}
257
#[test]
fn entry_test_hui_meeting() {
    let chars = compile_common_chars();
    assert_eq!(chars.get_entry("會", "hui4"), Some(&3624));
    assert_eq!(chars.get_entry_no_pinyin("會"), Some(&3624));
}