chinese_detection/
language_profiler.rs1use bincode::deserialize_from;
6use once_cell::sync::Lazy;
7use std::collections::HashMap;
8
9type Profile = HashMap<String, u64>;
10
11static CHINESE_DATA: Lazy<Profile> =
12 Lazy::new(|| deserialize_from(&include_bytes!("../profiles/zh.profile")[..]).unwrap());
13static ENGLISH_DATA: Lazy<Profile> =
14 Lazy::new(|| deserialize_from(&include_bytes!("../profiles/en.profile")[..]).unwrap());
15static PINYIN_DATA: Lazy<Profile> =
16 Lazy::new(|| deserialize_from(&include_bytes!("../profiles/py.profile")[..]).unwrap());
17static ENGLISH_TOTAL: Lazy<u64> = Lazy::new(|| calc_total(&ENGLISH_DATA));
18static CHINESE_TOTAL: Lazy<u64> = Lazy::new(|| calc_total(&CHINESE_DATA));
19static PINYIN_TOTAL: Lazy<u64> = Lazy::new(|| calc_total(&PINYIN_DATA));
20static GRAM_LENGTH: usize = 2;
21
/// Outcome of [`classify`]: the profile whose score was strictly the lowest.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub enum ClassificationResult {
    /// The Chinese profile scored strictly lowest.
    ZH,
    /// The English profile scored strictly lowest.
    EN,
    /// The pinyin profile scored strictly lowest.
    PY,
    /// No profile scored strictly lowest (for example, a tie).
    UN,
}
29
/// Returns the sum of every count stored in `profile_data`.
///
/// An empty profile yields `0`.
fn calc_total(profile_data: &HashMap<String, u64>) -> u64 {
    profile_data.values().sum()
}
38
/// Returns the negative base-10 logarithm of `substring`'s relative frequency
/// in `profile`, so more frequent tokens produce smaller values.
///
/// A token that is missing from `profile` is treated as if its count were 1.
/// `total` is the denominator of the relative frequency.
fn calc_prob(substring: &str, profile: &HashMap<String, u64>, total: f64) -> f64 {
    let count = match profile.get(substring) {
        Some(&seen) => seen,
        // Unseen tokens fall back to a count of one.
        None => 1,
    };
    let relative_frequency = count as f64 / total;

    -1_f64 * relative_frequency.log10()
}
44
45fn score(sentence: &str, profile: &HashMap<String, u64>, total: &u64) -> f64 {
46 let mut accumulator: f64 = 0.0;
47 let words: Vec<_> = sentence.split(' ').collect();
48 for word in words {
49 if word.chars().count() < GRAM_LENGTH {
50 accumulator += calc_prob(word, profile, *total as f64);
51 } else {
52 let mut i = 0;
53 while i < (word.chars().count() - (GRAM_LENGTH - 1)) {
54 let substring: String = word.chars().skip(i).take(GRAM_LENGTH).collect();
55 accumulator += calc_prob(&substring, profile, *total as f64);
56 i += 1;
57 }
58 }
59 }
60
61 accumulator
62}
63
64pub fn init() {
65 Lazy::force(&ENGLISH_TOTAL);
66 Lazy::force(&CHINESE_TOTAL);
67 Lazy::force(&PINYIN_TOTAL);
68 Lazy::force(&CHINESE_DATA);
69 Lazy::force(&ENGLISH_DATA);
70 Lazy::force(&PINYIN_DATA);
71}
72
73pub fn classify(sentence: &str) -> ClassificationResult {
74 let english_score = score(sentence, &ENGLISH_DATA, &ENGLISH_TOTAL);
75 let chinese_score = score(sentence, &CHINESE_DATA, &CHINESE_TOTAL);
76 let pinyin_score = score(sentence, &PINYIN_DATA, &PINYIN_TOTAL);
77
78 if chinese_score < english_score && chinese_score < pinyin_score {
79 ClassificationResult::ZH
80 } else if pinyin_score < english_score && pinyin_score < chinese_score {
81 ClassificationResult::PY
82 } else if english_score < pinyin_score && english_score < chinese_score {
83 ClassificationResult::EN
84 } else {
85 ClassificationResult::UN
86 }
87}