irg_kvariants/
lib.rs

1use once_cell::sync::Lazy;
2use serde::Deserialize;
3use std::collections::HashMap;
4
/// Kind of relationship between a source ideograph and its destination ideograph.
///
/// Each variant corresponds to one relation keyword of the bundled dictionary
/// (`wrong!`, `sem`, `simp`, `old` and `=`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KVariantClass {
    /// Dictionary relation `wrong!`.
    Wrong,
    /// Dictionary relation `sem`.
    ///
    /// The identifier keeps its historical spelling because it is part of the public API.
    SementicVariant,
    /// Dictionary relation `simp`.
    Simplified,
    /// Dictionary relation `old`.
    Old,
    /// Dictionary relation `=`.
    Equal,
}

/// A single dictionary entry: `source_ideograph` is a variant of `destination_ideograph`.
///
/// The type only holds two `char`s and a field-less enum, so it is `Copy`; callers
/// that get a `&KVariant` out of a map can cheaply take an owned value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct KVariant {
    /// The ideograph the entry is keyed by (left-hand column of the dictionary).
    pub source_ideograph: char,
    /// How `source_ideograph` relates to `destination_ideograph`.
    pub classification: KVariantClass,
    /// The ideograph the source is a variant of (right-hand column of the dictionary).
    pub destination_ideograph: char,
}
20
21#[derive(Deserialize)]
22pub struct TsvRow {
23    lhs: String,
24    relation: String,
25    rhs: String,
26}
27
28pub static KVARIANTS: Lazy<HashMap<char, KVariant>> = Lazy::new(|| {
29    // The tab separated format is like:
30    //
31    //   㨲 (U+3A32)	wrong!	㩍 (U+3A4D)
32    //   铿 (U+94FF)	simp	鏗 (U+93D7)
33    //   㓻 (U+34FB)	sem	    剛 (U+525B)
34    //   ...
35    //
36    let dictionary: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/kVariants.min.csv"));
37    let mut reader = csv::ReaderBuilder::new().has_headers(false).from_reader(dictionary);
38
39    let mut map: HashMap<char, KVariant> = HashMap::new();
40    for result in reader.deserialize() {
41        let line: TsvRow = result.unwrap();
42        let rhs = line.rhs.chars().next().unwrap();
43        let lhs = line.lhs.chars().next().unwrap();
44
45        if let Some(classification) = match line.relation.as_str() {
46            "wrong!" => Some(KVariantClass::Wrong),
47            "sem" => Some(KVariantClass::SementicVariant),
48            "simp" => Some(KVariantClass::Simplified),
49            "old" => Some(KVariantClass::Old),
50            "=" => Some(KVariantClass::Equal),
51            unexpected_classification => {
52                debug_assert!(
53                    false,
54                    "Unexpected classification {unexpected_classification:?} encountered. Consider handling or ignore explicaitly.",
55                );
56                None
57            }
58        } {
59            debug_assert!(
60                !map.contains_key(&lhs),
61                "Unexpected one source ideograph mapping to multiple destination ideographs.
62                 If this happens in the future when we update kVariants.tsv, we would need to handle it
63                 by, for example, deciding priorities for different classification types. "
64            );
65
66            map.insert(
67                lhs,
68                KVariant { source_ideograph: lhs, classification, destination_ideograph: rhs },
69            );
70        }
71    }
72
73    map
74});
75
#[cfg(test)]
mod test {
    use super::*;

    /// Spot-checks one entry per classification, plus one ideograph that must be absent.
    #[test]
    fn test_kvariants() {
        // (source, expected classification, expected destination)
        let expectations = [
            ('澚', KVariantClass::Wrong, '澳'),
            ('䀾', KVariantClass::SementicVariant, '䁈'),
            ('亚', KVariantClass::Simplified, '亞'),
            ('㮺', KVariantClass::Old, '本'),
            ('刄', KVariantClass::Equal, '刃'),
        ];

        for (source_ideograph, classification, destination_ideograph) in expectations {
            let expected = KVariant { source_ideograph, classification, destination_ideograph };
            assert_eq!(KVARIANTS.get(&source_ideograph), Some(&expected));
        }

        // '刃' only appears as a destination, never as a source.
        assert_eq!(KVARIANTS.get(&'刃'), None);
    }

    /// Follows every entry one step further and checks the chain does not stall.
    ///
    /// NOTE(review): the assertion only rules out a destination that maps to itself;
    /// a two-step cycle (A -> B and B -> A) would still pass. Confirm whether that
    /// stronger property is intended before tightening the check.
    #[test]
    fn test_no_loop() {
        for entry in KVARIANTS.values() {
            // When the destination is not itself a key (e.g. for "栄"), there is
            // nothing further to follow.
            // Otherwise (e.g. for "椉", which yields "椉 old 乘") the next hop must
            // lead somewhere else.
            if let Some(next_hop) = KVARIANTS.get(&entry.destination_ideograph) {
                assert_ne!(entry.destination_ideograph, next_hop.destination_ideograph);
            }
        }
    }
}