1#![doc = include_str!("../README.md")]
2pub use cedict::DictEntry;
3use either::Either;
4use itertools::Itertools;
5#[cfg(feature = "embed-dict")]
6use once_cell::sync::Lazy;
7#[cfg(feature = "embed-dict")]
8use std::io::Cursor;
9use std::{
10 collections::{BTreeMap, HashMap},
11 ops::RangeFrom,
12};
13
/// Bundled CC-CEDICT dictionary data (2024-06-07 release), embedded at
/// compile time when the `embed-dict` feature is enabled.
#[cfg(feature = "embed-dict")]
static DEFAULT_DICT: &str = include_str!("../data/cedict-2024-06-07.txt");

/// Bundled SUBTLEX-CH word-frequency table (UTF-8 re-encoding), embedded at
/// compile time when the `embed-dict` feature is enabled.
#[cfg(feature = "embed-dict")]
static DEFAULT_WF: &str = include_str!("../data/SUBTLEX-CH-WF.utf8.txt");
19
/// Word sequences the frequency-based scorer would otherwise mis-segment.
/// When a fragment's trailing words match one of these sequences it receives
/// a large score bonus (see `Fragment::push`), steering the segmenter toward
/// the listed split.
static SEGMENTATION_EXCEPTIONS: &[&[&str]] = &[
    &["家", "中餐馆"],
    &["这", "位子"],
    &["十", "分钟"],
    &["一", "点钟"],
    &["合上", "书"],
    &["第二", "天性"],
    &["都", "会"],
    &["上", "都"],
    &["把", "手举"],
    &["天", "下雨"],
    &["四十", "分"],
    &["写", "作文"],
    &["得", "很"],
    &["家", "的"],
    &["的", "话"],
];
37
/// Lazily-constructed global [`Dictionary`] built from the bundled dictionary
/// and word-frequency data (available with the `embed-dict` feature).
#[cfg(feature = "embed-dict")]
pub static DICTIONARY: Lazy<Dictionary> = Lazy::new(Dictionary::new);
41
/// A CC-CEDICT-backed Chinese dictionary with word-frequency data, supporting
/// prefix lookups and text segmentation.
pub struct Dictionary {
    // Simplified form -> all entries sharing that form. A BTreeMap so that
    // `lookup_entry` can do an ordered range scan for prefix detection.
    entries: BTreeMap<String, Vec<DictEntry<String>>>,
    // Word -> log frequency, loaded from the SUBTLEX-style frequency table.
    word_frequency: HashMap<String, f64>,
}
47
48impl Dictionary {
49 #[cfg(feature = "embed-dict")]
50 pub fn new() -> Self {
51 Self::new_from_reader(Cursor::new(DEFAULT_DICT), Cursor::new(DEFAULT_WF))
52 }
53
54 pub fn new_from_reader<R: std::io::Read>(dict_reader: R, wf_reader: R) -> Self {
60 Dictionary {
61 entries: cedict::parse_reader(dict_reader)
62 .filter(|entry| !entry.simplified().chars().all(|c| c.is_ascii()))
63 .sorted_by(|a, b| a.simplified().cmp(&b.simplified()))
64 .chunk_by(|entry| entry.simplified().to_string())
65 .into_iter()
66 .map(|(key, entries)| (key, entries.collect()))
67 .collect(),
68 word_frequency: csv::ReaderBuilder::new()
69 .delimiter(b'\t')
70 .has_headers(true)
71 .from_reader(wf_reader)
72 .deserialize()
73 .map(|x| x.unwrap())
74 .map(
75 |(word, _wcount, _wmillion, logw, _wcd, _wcdp, _logwcd): (
76 String,
77 u64,
78 f64,
79 f64,
80 f64,
81 f64,
82 f64,
83 )| (word, logw),
84 )
85 .collect(),
86 }
87 }
88
89 pub fn frequency(&self, word: &str) -> f64 {
110 self.word_frequency.get(word).copied().unwrap_or_else(|| {
111 word.chars()
112 .map(|c| {
113 let mut buf = [0; 4];
114 let result = c.encode_utf8(&mut buf);
115 self.word_frequency.get(result).copied().unwrap_or(0f64)
116 })
117 .sum::<f64>()
118 .powf((word.chars().count() as f64).recip())
119 })
120 }
121
122 fn lookup_entry<'a>(&'a self, entry: &str) -> Option<Option<&'a Vec<DictEntry<String>>>> {
123 let (first_entry, dict_entry): (&String, &Vec<DictEntry<String>>) = self
124 .entries
125 .range(RangeFrom {
126 start: entry.to_string(),
127 })
128 .next()?;
129 if !first_entry.starts_with(entry) {
130 None
131 } else if entry == first_entry {
132 Some(Some(dict_entry))
133 } else {
134 Some(None)
135 }
136 }
137
138 pub fn lookup_entries<'a: 'b, 'b>(
179 &'a self,
180 text: &'b str,
181 ) -> impl Iterator<Item = &'a DictEntry<String>> + 'b {
182 string_inits(text)
183 .map_while(|entry| self.lookup_entry(entry))
184 .filter_map(|x| std::convert::identity(x))
185 .flatten()
186 }
187
188 pub fn get_entry(&self, text: &str) -> Option<&DictEntry<String>> {
198 self.lookup_entry(text)??.first()
199 }
200
201 fn segment_step<'a>(&'a self, text: &str) -> Vec<&'a DictEntry<String>> {
208 let mut fragments = Fragments::new();
209 fragments.push_fragment(Fragment::new());
210 loop {
211 let (offset, smallest) = fragments.pop();
212
213 assert!(
214 smallest.len() > 0,
215 "There must always be at least 1 smallest fragment."
216 );
217
218 let mut end_of_entries = true;
219
220 for entry in self.lookup_entries(&text[offset..]) {
221 end_of_entries = false;
222 for mut fragment in smallest.clone() {
223 fragment.push(self, entry);
224 fragments.push_fragment(fragment);
225 }
226 }
227
228 if let Some(fragment) = fragments.has_winner() {
230 return fragment.words.clone();
231 }
232
233 if end_of_entries {
234 return vec![];
235 }
236 }
237 }
238
239 pub fn segment<'a, 'b>(&'a self, text: &'b str) -> Vec<Either<&'a DictEntry<String>, &'b str>> {
291 let mut non_chinese_start = 0;
292 let mut result = vec![];
293 let mut offset = 0;
294 while offset < text.len() {
295 let segment = self.segment_step(&text[offset..]);
296 if segment.is_empty() {
297 let mut n = offset + 1;
298 while !text.is_char_boundary(n) {
299 n += 1;
300 }
301 offset = n;
302 } else {
303 if non_chinese_start != offset {
304 result.push(Either::Right(&text[non_chinese_start..offset]));
305 }
306 offset += segment.iter().map(|x| x.simplified().len()).sum::<usize>();
307 non_chinese_start = offset;
308 for word in segment {
309 result.push(Either::Left(word));
310 }
311 }
312 }
313 if non_chinese_start != offset {
314 result.push(Either::Right(&text[non_chinese_start..offset]));
315 }
316 result
317 }
318}
319
/// Work queue of candidate segmentations, grouped by consumed byte length.
struct Fragments<'a> {
    // Fragment byte length -> all fragments of that length; the BTreeMap
    // keeps groups ordered so `pop` always yields the shortest first.
    fragments: BTreeMap<usize, Vec<Fragment<'a>>>,
}
324
325impl<'a> Fragments<'a> {
326 fn new() -> Self {
327 Fragments {
328 fragments: BTreeMap::new(),
329 }
330 }
331
332 fn has_winner(&self) -> Option<&Fragment<'a>> {
334 if self.fragments.len() != 1 {
335 return None;
336 }
337
338 let (&len, fragments) = self.fragments.iter().next()?;
339 if len == 0 {
340 return None;
341 }
342
343 fragments
344 .iter()
345 .max_by(|a, b| a.score().total_cmp(&b.score()))
346 }
347
348 fn push_fragment(&mut self, fragment: Fragment<'a>) {
349 let len = fragment.len;
350 self.fragments.entry(len).or_default().push(fragment);
351 }
352
353 fn pop(&mut self) -> (usize, Vec<Fragment<'a>>) {
354 self.fragments.pop_first().unwrap_or_default()
355 }
356}
357
358#[derive(Clone, Debug)]
359struct Fragment<'a> {
360 words: Vec<&'a DictEntry<String>>,
361 scores: Vec<f64>,
362 len: usize, }
364
365impl<'a> Fragment<'a> {
366 fn new() -> Self {
367 Fragment {
368 words: vec![],
369 scores: vec![],
370 len: 0,
371 }
372 }
373
374 fn score(&self) -> f64 {
375 self.scores
376 .iter()
377 .product::<f64>()
378 .powf((self.scores.len() as f64).recip())
379 - self.scores.len() as f64 * 10_f64
380 }
381
382 fn push(&mut self, dict: &Dictionary, word: &'a DictEntry<String>) {
383 let mut score = dict.frequency(word.simplified());
384 self.words.push(word);
385
386 for &exception in SEGMENTATION_EXCEPTIONS {
387 let x = self
388 .words
389 .iter()
390 .map(|x| DictEntry::simplified(x))
391 .rev()
392 .take(exception.len())
393 .rev()
394 .collect::<Vec<_>>();
395
396 if x == exception {
397 score += 100_000_f64;
398 }
399 }
400
401 self.scores.push(score);
402 self.len += word.simplified().len();
403 }
404}
405
/// Yields every prefix of `s` that ends on a character boundary, shortest
/// first, finishing with `s` itself. The empty prefix is skipped, so an empty
/// input yields exactly one item: `""`.
fn string_inits(s: &str) -> impl Iterator<Item = &str> {
    let proper_prefixes = s.char_indices().map(move |(end, _)| &s[..end]).skip(1);
    proper_prefixes.chain(std::iter::once(s))
}
414
#[cfg(test)]
mod plain_tests {
    /// `string_inits` must yield every prefix ending on a char boundary,
    /// shortest first, including the full string.
    #[test]
    fn string_inits_sanity() {
        assert_eq!(
            super::string_inits("ABC").collect::<Vec<&str>>(),
            vec!["A", "AB", "ABC"]
        );
        // Multi-byte UTF-8 input: prefixes split on character boundaries,
        // not on bytes.
        assert_eq!(
            super::string_inits("你好吗").collect::<Vec<&str>>(),
            vec!["你", "你好", "你好吗"]
        );
    }
}
450
/// Tests exercising the embedded dictionary. Expected values are tied to the
/// exact bundled CC-CEDICT and SUBTLEX-CH data files.
#[cfg(all(test, feature = "embed-dict"))]
mod dictionary_tests {
    use super::{DictEntry, DICTIONARY};

    // A single simplified form can carry several entries (readings).
    #[test]
    fn multiple_entries() {
        assert_eq!(
            DICTIONARY
                .lookup_entries("会")
                .map(|entry| entry.pinyin().to_string())
                .collect::<Vec<String>>(),
            &["hui4", "kuai4"]
        );
    }

    #[test]
    fn entries_for_le_liao() {
        assert_eq!(
            DICTIONARY
                .lookup_entries("了")
                .map(|entry| entry.pinyin().to_string())
                .collect::<Vec<String>>(),
            &["le5", "liao3", "liao3", "liao4"]
        );
    }

    /// Asserts that `segment_step(text)` yields exactly the space-separated
    /// words of `expected`; an empty `expected` means "no segmentation".
    #[track_caller]
    fn assert_segment_step(text: &str, expected: &str) {
        assert_eq!(
            DICTIONARY
                .segment_step(text)
                .into_iter()
                .map(DictEntry::simplified)
                .collect::<Vec<_>>(),
            expected
                .split(' ')
                .filter(|str| !str.is_empty())
                .collect::<Vec<_>>()
        );
    }

    /// Asserts that full segmentation of `text` yields exactly the
    /// space-separated tokens of `expected` (dictionary words and
    /// non-Chinese runs alike).
    #[track_caller]
    fn assert_segment(text: &str, expected: &str) {
        assert_eq!(
            DICTIONARY
                .segment(text)
                .into_iter()
                .map(|ret| ret.right_or_else(DictEntry::simplified))
                .collect::<Vec<_>>(),
            expected.split(' ').collect::<Vec<_>>()
        );
    }

    #[test]
    fn segment_step_sanity_1() {
        assert_segment_step("", "");
    }

    #[test]
    fn segment_step_sanity_2() {
        assert_segment_step("我ABC", "我");
    }

    #[test]
    fn segment_step_sanity_3() {
        assert_segment_step("你好", "你好");
    }

    // Note: several expectations below contain fewer words than the input —
    // `segment_step` only commits to the leading unambiguous word group.
    #[test]
    fn segment_step_sanity_4() {
        assert_segment_step("多工作", "多 工作");
        assert_segment_step("有电话", "有 电话");
        assert_segment_step("回电话", "回 电话");
        assert_segment_step("不知道", "不 知道");
        assert_segment_step("定时间", "定 时间");
        assert_segment_step("这位子", "这 位子");
        assert_segment_step("十分钟", "十 分钟");
        assert_segment_step("有电梯", "有 电梯");
        assert_segment_step("中午前", "中午 前");
        assert_segment_step("想要点", "想要 点");
        assert_segment_step("外套", "外套");
        assert_segment_step("家中餐馆", "家");
        assert_segment_step("后生活", "后 生活");
        assert_segment_step("不愿意", "不 愿意");
        assert_segment_step("点出发", "点 出发");
        assert_segment_step("老婆婆", "老 婆婆");
        assert_segment_step("不会跳舞", "不会");
        assert_segment_step("穿上外套", "穿上 外套");
        assert_segment_step("建议", "建议");
        assert_segment_step("怎么不知道", "怎么");
        assert_segment_step("蛋糕发起来", "蛋糕");
        assert_segment_step("管理的人才", "管理");
        assert_segment_step("轻快乐曲", "轻快 乐曲");
        assert_segment_step("高明和", "高明 和");
        assert_segment_step("一下子之间", "一下子");
        assert_segment_step("我绝没想到", "我");
        assert_segment_step("绝没想到", "绝");
        assert_segment_step("没想到", "没想到");
        assert_segment_step("没想到会", "没想到");
    }

    // Non-Chinese spans must come through verbatim as `Right` tokens.
    #[test]
    fn segment_sanity_mixed() {
        assert_segment("我叫David", "我 叫 David");
        assert_segment("English!", "English!");
        assert_segment("告诉ABC屁股", "告诉 ABC 屁股");
    }

    #[test]
    fn segment_sanity() {
        assert_segment("节日里人们", "节日 里 人们");
        assert_segment("我可没有时间闲呆着", "我 可 没有 时间 闲 呆 着");
        assert_segment("我要看病", "我 要 看病");
        assert_segment("你好像不太舒服", "你 好像 不 太 舒服");
        assert_segment("我非常想见到她", "我 非常 想 见到 她");
        assert_segment("婚后生活怎么样", "婚 后 生活 怎么样");
        assert_segment(
            "为了照顾家人,我放弃了升职的机会",
            "为了 照顾 家人 , 我 放弃 了 升职 的 机会",
        );
        assert_segment("我有好多事要干", "我 有 好多 事 要 干");

        assert_segment("我不知道这张表怎么填", "我 不 知道 这 张 表 怎么 填");
        assert_segment("他今天有很多事情要做", "他 今天 有 很 多 事情 要 做");
        assert_segment("我不知道他在想什么", "我 不 知道 他 在 想 什么");
        assert_segment("我是个不顾家的人", "我 是 个 不顾 家 的 人");
        assert_segment("你真有胆量", "你 真 有胆量");
        assert_segment("我合上书准备离开", "我 合上 书 准备 离开");
        assert_segment("他的话", "他 的 话");
        assert_segment("你用什么方法学习", "你 用 什么 方法 学习");
    }

    // Pins the number of distinct simplified forms in the bundled dictionary
    // (after the ASCII-only filter and grouping in `new_from_reader`).
    #[test]
    fn default_dict_is_valid() {
        assert_eq!(DICTIONARY.entries.len(), 119002);
    }

    // Pins the number of rows in the bundled word-frequency table.
    #[test]
    fn default_wf_is_valid() {
        assert_eq!(DICTIONARY.word_frequency.len(), 99121);
    }

    // `lookup_entries` yields every dictionary word that prefixes the input,
    // shortest first.
    #[test]
    fn multi_lookup() {
        assert_eq!(
            DICTIONARY
                .lookup_entries("一个人")
                .map(DictEntry::simplified)
                .map(str::to_string)
                .collect::<Vec<String>>(),
            vec!["一", "一个人"]
        );
    }
}