haoxue_dict/lib.rs

#![doc = include_str!("../README.md")]
pub use cedict::DictEntry;
use either::Either;
use itertools::Itertools;
#[cfg(feature = "embed-dict")]
use once_cell::sync::Lazy;
#[cfg(feature = "embed-dict")]
use std::io::Cursor;
use std::{
    collections::{BTreeMap, HashMap},
    ops::RangeFrom,
};

#[cfg(feature = "embed-dict")]
static DEFAULT_DICT: &str = include_str!("../data/cedict-2024-06-07.txt");

#[cfg(feature = "embed-dict")]
static DEFAULT_WF: &str = include_str!("../data/SUBTLEX-CH-WF.utf8.txt");

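// Hand-curated splits that the frequency-based scorer would otherwise get
// wrong. When a candidate segmentation ends with one of these sequences,
// `Fragment::push` boosts its score so this split wins.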
static SEGMENTATION_EXCEPTIONS: &[&[&str]] = &[
    &["家", "中餐馆"],
    &["这", "位子"],
    &["十", "分钟"],
    &["一", "点钟"],
    &["合上", "书"],
    &["第二", "天性"],
    &["都", "会"],
    &["上", "都"],
    &["把", "手举"],
    &["天", "下雨"],
    &["四十", "分"],
    &["写", "作文"],
    &["得", "很"],
    &["家", "的"],
    &["的", "话"],
];

#[cfg(feature = "embed-dict")]
/// Built-in dictionary. Requires the `embed-dict` feature.
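///
/// ```rust
/// # use haoxue_dict::DICTIONARY;
/// // 你好 appears in the lookup examples below, so it must have an entry.
/// assert!(DICTIONARY.get_entry("你好").is_some());
/// ```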
pub static DICTIONARY: Lazy<Dictionary> = Lazy::new(Dictionary::new);

/// A Chinese dictionary and word segmenter.
pub struct Dictionary {
    /// CC-CEDICT entries keyed by simplified form; sorted for prefix lookups.
    entries: BTreeMap<String, Vec<DictEntry<String>>>,
    /// Word to log-frequency mapping from SUBTLEX-CH-WF.
    word_frequency: HashMap<String, f64>,
}

impl Dictionary {
    /// Create a dictionary from the embedded CC-CEDICT and SUBTLEX-CH-WF data.
    #[cfg(feature = "embed-dict")]
    pub fn new() -> Self {
        Self::new_from_reader(Cursor::new(DEFAULT_DICT), Cursor::new(DEFAULT_WF))
    }

    /// Create a new dictionary from readers. The dictionary file must follow the
    /// [cedict](https://en.wikipedia.org/wiki/CEDICT) format. The word frequency
    /// file must be tab-separated with the header columns `word`, `wcount`,
    /// `wmillion`, `logw`, `wcd`, `wcdp`, `logwcd`. See
    /// [SUBTLEX-CH-WF](https://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch).
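    ///
    /// # Examples
    ///
    /// A minimal sketch; the file names are placeholders for local copies of
    /// the two data sets:
    ///
    /// ```rust,no_run
    /// use haoxue_dict::Dictionary;
    /// use std::fs::File;
    ///
    /// let dict = Dictionary::new_from_reader(
    ///     File::open("cedict.txt").unwrap(),
    ///     File::open("SUBTLEX-CH-WF.utf8.txt").unwrap(),
    /// );
    /// assert!(dict.frequency("我") > 0.0);
    /// ```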
    pub fn new_from_reader<R: std::io::Read>(dict_reader: R, wf_reader: R) -> Self {
        Dictionary {
            entries: cedict::parse_reader(dict_reader)
                // Drop entries that are pure ASCII (no Chinese characters).
                .filter(|entry| !entry.simplified().chars().all(|c| c.is_ascii()))
                .sorted_by(|a, b| a.simplified().cmp(&b.simplified()))
                .chunk_by(|entry| entry.simplified().to_string())
                .into_iter()
                .map(|(key, entries)| (key, entries.collect()))
                .collect(),
            word_frequency: csv::ReaderBuilder::new()
                .delimiter(b'\t')
                .has_headers(true)
                .from_reader(wf_reader)
                .deserialize()
                .map(|x| x.unwrap())
                // Keep only the word and its log frequency (`logw`).
                .map(
                    |(word, _wcount, _wmillion, logw, _wcd, _wcdp, _logwcd): (
                        String,
                        u64,
                        f64,
                        f64,
                        f64,
                        f64,
                        f64,
                    )| (word, logw),
                )
                .collect(),
        }
    }

    /// Get the `log(frequency)` of a word. If a word isn't in the frequency
    /// table, its frequency is estimated from the component characters by
    /// summing their log-frequencies and taking the `n`-th root, where `n`
    /// is the number of characters.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// // 大学 ('university') is less frequent than the plural suffix 们.
    /// assert!(DICTIONARY.frequency("大学") < DICTIONARY.frequency("们"));
    /// ```
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// // 我 ('I') is more frequent than 参议员 ('senator').
    /// assert_eq!(DICTIONARY.frequency("我"), 6.2259_f64);
    /// assert_eq!(DICTIONARY.frequency("参议员"), 2.9154_f64);
    /// ```
    pub fn frequency(&self, word: &str) -> f64 {
        self.word_frequency.get(word).copied().unwrap_or_else(|| {
            // Fall back to combining the per-character log-frequencies.
            word.chars()
                .map(|c| {
                    let mut buf = [0; 4];
                    let result = c.encode_utf8(&mut buf);
                    self.word_frequency.get(result).copied().unwrap_or(0f64)
                })
                .sum::<f64>()
                .powf((word.chars().count() as f64).recip())
        })
    }

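    // Prefix probe against the sorted entry map. Returns:
    // - `None` if no dictionary word starts with `entry` (stop extending),
    // - `Some(None)` if `entry` is a proper prefix of a word but not a word itself,
    // - `Some(Some(entries))` if `entry` is a word with these entries.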
    fn lookup_entry<'a>(&'a self, entry: &str) -> Option<Option<&'a Vec<DictEntry<String>>>> {
        let (first_entry, dict_entry): (&String, &Vec<DictEntry<String>>) = self
            .entries
            .range(RangeFrom {
                start: entry.to_string(),
            })
            .next()?;
        if !first_entry.starts_with(entry) {
            None
        } else if entry == first_entry {
            Some(Some(dict_entry))
        } else {
            Some(None)
        }
    }

    /// Look up possible words that match the given text. Since Chinese doesn't
    /// separate words with spaces, it is ambiguous how to segment a text. This
    /// function returns the dictionary entries for every prefix of the text
    /// that is itself a word.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// let mut entries = DICTIONARY.lookup_entries("好久不见");
    /// assert_eq!(entries.next().unwrap().simplified(), "好"); // hǎo variant
    /// assert_eq!(entries.next().unwrap().simplified(), "好"); // hào variant
    /// assert_eq!(entries.next().unwrap().simplified(), "好久");
    /// assert_eq!(entries.next().unwrap().simplified(), "好久不见");
    /// assert!(entries.next().is_none());
    /// ```
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// let mut entries = DICTIONARY.lookup_entries("你好吗?");
    /// assert_eq!(entries.next().unwrap().simplified(), "你");
    /// assert_eq!(entries.next().unwrap().simplified(), "你"); // Taiwan variant
    /// assert_eq!(entries.next().unwrap().simplified(), "你好");
    /// assert!(entries.next().is_none());
    /// ```
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// let mut entries = DICTIONARY.lookup_entries("");
    /// assert!(entries.next().is_none());
    /// ```
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// let mut entries = DICTIONARY.lookup_entries("English!");
    /// assert!(entries.next().is_none());
    /// ```
    pub fn lookup_entries<'a: 'b, 'b>(
        &'a self,
        text: &'b str,
    ) -> impl Iterator<Item = &'a DictEntry<String>> + 'b {
        string_inits(text)
            // Stop at the first prefix that cannot begin any dictionary word.
            .map_while(|prefix| self.lookup_entry(prefix))
            // Skip prefixes that aren't words; yield the entries of those that are.
            .flatten()
            .flatten()
    }

    /// Get the first entry for a word.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use haoxue_dict::DICTIONARY;
    /// #
    /// assert_eq!(DICTIONARY.get_entry("参议员").unwrap().definitions().next(), Some("senator"));
    /// ```
    pub fn get_entry(&self, text: &str) -> Option<&DictEntry<String>> {
        self.lookup_entry(text)??.first()
    }

    // Example: segmenting 十分钟
    // 0. segments: [[]]
    // 1. segments: [[十], [十分]]
    // 2. segments: [[十, 分], [十, 分钟], [十分]]
    // 3. segments: [[十, 分, 钟], [十, 分钟], [十分, 钟]]
    // 4. pick best: [十, 分钟]
    fn segment_step<'a>(&'a self, text: &str) -> Vec<&'a DictEntry<String>> {
        let mut fragments = Fragments::new();
        fragments.push_fragment(Fragment::new());
        loop {
            // Advance the shortest candidate segmentations first.
            let (offset, smallest) = fragments.pop();

            assert!(
                !smallest.is_empty(),
                "There must always be at least 1 smallest fragment."
            );

            let mut end_of_entries = true;

            for entry in self.lookup_entries(&text[offset..]) {
                end_of_entries = false;
                for mut fragment in smallest.clone() {
                    fragment.push(self, entry);
                    fragments.push_fragment(fragment);
                }
            }

            // Return if all fragments have the same size.
            if let Some(fragment) = fragments.has_winner() {
                return fragment.words.clone();
            }

            if end_of_entries {
                return vec![];
            }
        }
    }

    /// Segment a text into words and non-Chinese characters. This function uses
    /// word frequencies and heuristics to pick the best segmentation. It is not
    /// 100% accurate, and it changes often. Do not rely on the output being
    /// stable.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use haoxue_dict::{DICTIONARY, DictEntry};
    /// # use either::Either;
    /// let segments = DICTIONARY.segment("我是大学生。")
    ///                  .iter()
    ///                  .map(|x| x.map_left(DictEntry::simplified))
    ///                  .collect::<Vec<_>>();
    /// assert_eq!(segments, vec![ Either::Left("我")
    ///                          , Either::Left("是")
    ///                          , Either::Left("大学生")
    ///                          , Either::Right("。")
    ///                          ]);
    /// ```
    ///
    /// ```rust
    /// # use haoxue_dict::{DICTIONARY, DictEntry};
    /// # use either::Either;
    /// let segments = DICTIONARY.segment("十分钟")
    ///                  .iter()
    ///                  .map(|x| x.map_left(DictEntry::simplified))
    ///                  .collect::<Vec<_>>();
    /// assert_eq!(segments, vec![ Either::Left("十")
    ///                          , Either::Left("分钟")
    ///                          ]);
    ///
    /// let segments = DICTIONARY.segment("十分")
    ///                  .iter()
    ///                  .map(|x| x.map_left(DictEntry::simplified))
    ///                  .collect::<Vec<_>>();
    /// assert_eq!(segments, vec![ Either::Left("十分")]);
    /// ```
    ///
    /// ```rust
    /// # use haoxue_dict::{DICTIONARY, DictEntry};
    /// # use either::Either;
    /// let segments = DICTIONARY.segment("我叫Lemmih。")
    ///                  .iter()
    ///                  .map(|x| x.map_left(DictEntry::simplified))
    ///                  .collect::<Vec<_>>();
    /// assert_eq!(segments, vec![ Either::Left("我")
    ///                          , Either::Left("叫")
    ///                          , Either::Right("Lemmih。")
    ///                          ]);
    /// ```
    pub fn segment<'a, 'b>(&'a self, text: &'b str) -> Vec<Either<&'a DictEntry<String>, &'b str>> {
        let mut non_chinese_start = 0;
        let mut result = vec![];
        let mut offset = 0;
        while offset < text.len() {
            let segment = self.segment_step(&text[offset..]);
            if segment.is_empty() {
                // No dictionary match here; skip ahead to the next char
                // boundary and treat the span as non-Chinese text.
                let mut n = offset + 1;
                while !text.is_char_boundary(n) {
                    n += 1;
                }
                offset = n;
            } else {
                // Flush any pending non-Chinese run before the matched words.
                if non_chinese_start != offset {
                    result.push(Either::Right(&text[non_chinese_start..offset]));
                }
                offset += segment.iter().map(|x| x.simplified().len()).sum::<usize>();
                non_chinese_start = offset;
                for word in segment {
                    result.push(Either::Left(word));
                }
            }
        }
        if non_chinese_start != offset {
            result.push(Either::Right(&text[non_chinese_start..offset]));
        }
        result
    }
}

// Candidate segmentations, bucketed by how many bytes of input they cover.
// Invariant: while `segment_step` is searching, at least one bucket is present.
struct Fragments<'a> {
    fragments: BTreeMap<usize, Vec<Fragment<'a>>>,
}

impl<'a> Fragments<'a> {
    fn new() -> Self {
        Fragments {
            fragments: BTreeMap::new(),
        }
    }

    // If all fragments have the same non-zero length, return the fragment with
    // the highest score.
    fn has_winner(&self) -> Option<&Fragment<'a>> {
        if self.fragments.len() != 1 {
            return None;
        }

        let (&len, fragments) = self.fragments.iter().next()?;
        if len == 0 {
            return None;
        }

        fragments
            .iter()
            .max_by(|a, b| a.score().total_cmp(&b.score()))
    }

    fn push_fragment(&mut self, fragment: Fragment<'a>) {
        let len = fragment.len;
        self.fragments.entry(len).or_default().push(fragment);
    }

    // Remove and return the bucket with the shortest covered length.
    fn pop(&mut self) -> (usize, Vec<Fragment<'a>>) {
        self.fragments.pop_first().unwrap_or_default()
    }
}

#[derive(Clone, Debug)]
struct Fragment<'a> {
    words: Vec<&'a DictEntry<String>>,
    scores: Vec<f64>,
    len: usize, // in bytes
}

impl<'a> Fragment<'a> {
    fn new() -> Self {
        Fragment {
            words: vec![],
            scores: vec![],
            len: 0,
        }
    }

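    // Geometric mean of the per-word scores, minus a flat penalty of 10 per
    // word, so segmentations with fewer, longer words are preferred.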
    fn score(&self) -> f64 {
        self.scores
            .iter()
            .product::<f64>()
            .powf((self.scores.len() as f64).recip())
            - self.scores.len() as f64 * 10_f64
    }

    fn push(&mut self, dict: &Dictionary, word: &'a DictEntry<String>) {
        let mut score = dict.frequency(word.simplified());
        self.words.push(word);

        // Boost the score when the fragment now ends in a known-good split.
        for &exception in SEGMENTATION_EXCEPTIONS {
            let tail = self
                .words
                .iter()
                .map(|x| DictEntry::simplified(x))
                .rev()
                .take(exception.len())
                .rev()
                .collect::<Vec<_>>();

            if tail == exception {
                score += 100_000_f64;
            }
        }

        self.scores.push(score);
        self.len += word.simplified().len();
    }
}

// "ABC" => ["A", "AB", "ABC"]
// "你好吗" => ["你", "你好", "你好吗"]
fn string_inits(s: &str) -> impl Iterator<Item = &str> {
    s.char_indices()
        .skip(1)
        .map(|(n, _)| &s[..n])
        .chain(std::iter::once(s))
}

// fn string_tails(s: &str) -> impl Iterator<Item = &str> {
//     s.char_indices().map(|(n, _)| &s[n..])
// }

// "ABC"
// AB C
// A BC
// A B C

#[cfg(test)]
mod plain_tests {
    // #[test]
    // fn string_tails_sanity() {
    //     assert_eq!(
    //         super::string_tails("ABC").collect::<Vec<&str>>(),
    //         vec!["ABC", "BC", "C"]
    //     );
    //     assert_eq!(
    //         super::string_tails("你好吗").collect::<Vec<&str>>(),
    //         vec!["你好吗", "好吗", "吗"]
    //     );
    // }

    #[test]
    fn string_inits_sanity() {
        assert_eq!(
            super::string_inits("ABC").collect::<Vec<&str>>(),
            vec!["A", "AB", "ABC"]
        );
        assert_eq!(
            super::string_inits("你好吗").collect::<Vec<&str>>(),
            vec!["你", "你好", "你好吗"]
        );
    }
}

#[cfg(all(test, feature = "embed-dict"))]
mod dictionary_tests {
    use super::{DictEntry, DICTIONARY};

    // 会 should return both hui4 and kuai4.
    #[test]
    fn multiple_entries() {
        assert_eq!(
            DICTIONARY
                .lookup_entries("会")
                .map(|entry| entry.pinyin().to_string())
                .collect::<Vec<String>>(),
            &["hui4", "kuai4"]
        );
    }

    // 了 has multiple entries spread over multiple lines.
    #[test]
    fn entries_for_le_liao() {
        assert_eq!(
            DICTIONARY
                .lookup_entries("了")
                .map(|entry| entry.pinyin().to_string())
                .collect::<Vec<String>>(),
            &["le5", "liao3", "liao3", "liao4"]
        );
    }

    #[track_caller]
    fn assert_segment_step(text: &str, expected: &str) {
        assert_eq!(
            DICTIONARY
                .segment_step(text)
                .into_iter()
                .map(DictEntry::simplified)
                .collect::<Vec<_>>(),
            expected
                .split(' ')
                .filter(|s| !s.is_empty())
                .collect::<Vec<_>>()
        );
    }

    #[track_caller]
    fn assert_segment(text: &str, expected: &str) {
        assert_eq!(
            DICTIONARY
                .segment(text)
                .into_iter()
                .map(|ret| ret.right_or_else(DictEntry::simplified))
                .collect::<Vec<_>>(),
            expected.split(' ').collect::<Vec<_>>()
        );
    }

    #[test]
    fn segment_step_sanity_1() {
        assert_segment_step("", "");
    }

    #[test]
    fn segment_step_sanity_2() {
        assert_segment_step("我ABC", "我");
    }

    #[test]
    fn segment_step_sanity_3() {
        assert_segment_step("你好", "你好");
    }

    #[test]
    fn segment_step_sanity_4() {
        assert_segment_step("多工作", "多 工作");
        assert_segment_step("有电话", "有 电话");
        assert_segment_step("回电话", "回 电话");
        assert_segment_step("不知道", "不 知道");
        assert_segment_step("定时间", "定 时间");
        assert_segment_step("这位子", "这 位子");
        assert_segment_step("十分钟", "十 分钟");
        assert_segment_step("有电梯", "有 电梯");
        assert_segment_step("中午前", "中午 前");
        assert_segment_step("想要点", "想要 点");
        // This one is questionable.
        // assert_segment_step("得很", "得 很"); // fails
        assert_segment_step("外套", "外套");
        assert_segment_step("家中餐馆", "家");
        assert_segment_step("后生活", "后 生活");
        assert_segment_step("不愿意", "不 愿意");
        assert_segment_step("点出发", "点 出发");
        assert_segment_step("老婆婆", "老 婆婆");
        assert_segment_step("不会跳舞", "不会");
        assert_segment_step("穿上外套", "穿上 外套");
        assert_segment_step("建议", "建议");
        assert_segment_step("怎么不知道", "怎么");
        assert_segment_step("蛋糕发起来", "蛋糕");
        assert_segment_step("管理的人才", "管理");
        assert_segment_step("轻快乐曲", "轻快 乐曲");
        assert_segment_step("高明和", "高明 和");
        assert_segment_step("一下子之间", "一下子");
        assert_segment_step("我绝没想到", "我");
        assert_segment_step("绝没想到", "绝");
        assert_segment_step("没想到", "没想到");
        assert_segment_step("没想到会", "没想到");
    }

    #[test]
    fn segment_sanity_mixed() {
        assert_segment("我叫David", "我 叫 David");
        assert_segment("English!", "English!");
        assert_segment("告诉ABC屁股", "告诉 ABC 屁股");
    }

    #[test]
    fn segment_sanity() {
        assert_segment("节日里人们", "节日 里 人们");
        assert_segment("我可没有时间闲呆着", "我 可 没有 时间 闲 呆 着");
        assert_segment("我要看病", "我 要 看病");
        assert_segment("你好像不太舒服", "你 好像 不 太 舒服");
        assert_segment("我非常想见到她", "我 非常 想 见到 她");
        assert_segment("婚后生活怎么样", "婚 后 生活 怎么样");
        assert_segment(
            "为了照顾家人,我放弃了升职的机会",
            "为了 照顾 家人 , 我 放弃 了 升职 的 机会",
        );
        assert_segment("我有好多事要干", "我 有 好多 事 要 干");

        assert_segment("我不知道这张表怎么填", "我 不 知道 这 张 表 怎么 填");
        assert_segment("他今天有很多事情要做", "他 今天 有 很 多 事情 要 做");
        assert_segment("我不知道他在想什么", "我 不 知道 他 在 想 什么");
        assert_segment("我是个不顾家的人", "我 是 个 不顾 家 的 人");
        assert_segment("你真有胆量", "你 真 有胆量");
        // assert_segment("夏天到了", "夏天 到 了");
        assert_segment("我合上书准备离开", "我 合上 书 准备 离开");
        assert_segment("他的话", "他 的 话");
        assert_segment("你用什么方法学习", "你 用 什么 方法 学习");
        /*
        assert_segment("你定时间吧", "你 定 时间 吧");
        // assert_segment("这位子有人吗", "这 位子 有人 吗");

        assert_segment("我先做作业再吃晚饭", "我 先 做 作业 再 吃 晚饭");
        assert_segment("现在一点钟了", "现在 一 点钟 了");

        assert_segment("AAA", "AAA");
        assert_segment("BBB", "BBB");

        assert_segment("习惯是第二天性", "习惯 是 第二 天性");
        assert_segment("一切都会好的", "一切 都 会 好 的");
        assert_segment("上帝什么都会", "上帝 什么 都 会");
        // assert_segment("他比我高一个头", "他 比 我 高 一 个 头"); // What's the right way to tokenize here?
        assert_segment("每张桌子上都有菜单", "每 张 桌子 上 都 有 菜单");
        assert_segment("把手举起来", "把 手举 起来");
        assert_segment("若天下雨", "若 天 下雨");
        assert_segment("现在是五点四十分", "现在 是 五 点 四十 分");
        assert_segment("她在写作文", "她 在 写 作文");
        */
    }

    #[test]
    fn default_dict_is_valid() {
        assert_eq!(DICTIONARY.entries.len(), 119002);
    }

    #[test]
    fn default_wf_is_valid() {
        assert_eq!(DICTIONARY.word_frequency.len(), 99121);
    }

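    // Sanity check derived from the doc examples on `Dictionary::frequency`:
    // 我 ('I') scores higher than 参议员 ('senator').
    #[test]
    fn frequency_prefers_common_words() {
        assert!(DICTIONARY.frequency("我") > DICTIONARY.frequency("参议员"));
    }
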
    #[test]
    fn multi_lookup() {
        assert_eq!(
            DICTIONARY
                .lookup_entries("一个人")
                .map(DictEntry::simplified)
                .map(str::to_string)
                .collect::<Vec<String>>(),
            vec!["一", "一个人"]
        );
    }
}
632}