charabia 0.9.9

A simple library to detect the language, tokenize the text and normalize the tokens
Documentation
use std::collections::HashMap;

use serde::Deserialize;
use std::sync::LazyLock;

/// The kind of relation that links a source ideograph to its destination ideograph.
///
/// Each variant corresponds to one relation keyword found in the second column of
/// `kVariants.tsv`.
///
/// The type is a plain fieldless enum, so it is `Copy` and `Hash`: callers can copy a
/// classification out of a borrowed [`KVariant`] or use it as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KVariantClass {
    /// Parsed from the `wrong!` relation.
    Wrong,
    /// Parsed from the `sem` relation.
    ///
    /// The spelling (sic) is part of the public API and is kept for compatibility.
    SementicVariant,
    /// Parsed from the `simp` relation.
    Simplified,
    /// Parsed from the `old` relation.
    Old,
    /// Parsed from the `=` relation.
    Equal,
}

/// A single variant mapping between two ideographs, built from one row of `kVariants.tsv`.
#[derive(Debug, PartialEq, Eq)]
pub struct KVariant {
    /// The ideograph the mapping starts from (first column of the row).
    /// It is also the key under which this value is stored in `KVARIANTS`.
    pub source_ideograph: char,
    /// The relation between the source and the destination ideograph.
    pub classification: KVariantClass,
    /// The ideograph the source maps to (third column of the row).
    pub destination_ideograph: char,
}

/// One raw, uninterpreted row of the tab-separated `kVariants.tsv` dictionary.
///
/// The file has no header line; a row looks like `㨲 (U+3A32)<TAB>wrong!<TAB>㩍 (U+3A4D)`.
#[derive(Deserialize)]
pub struct TsvRow {
    /// First column: the source ideograph followed by its code point, e.g. `㨲 (U+3A32)`.
    lhs: String,
    /// Second column: the relation keyword, e.g. `wrong!`, `sem`, `simp`, `old` or `=`.
    relation: String,
    /// Third column: the destination ideograph followed by its code point, e.g. `㩍 (U+3A4D)`.
    rhs: String,
}

/// Lookup table from a source ideograph to its [`KVariant`] mapping.
///
/// The table is built lazily, on first access, by parsing the `kVariants.tsv` dictionary
/// that is embedded into the binary at compile time. Rows whose relation keyword is not
/// one of the known classifications are skipped in release builds.
///
/// # Panics
///
/// The first access panics if a row of the embedded file cannot be deserialized or if one
/// of its ideograph columns is empty. In debug builds it additionally panics when a row
/// carries an unknown relation keyword or when a source ideograph appears more than once.
pub static KVARIANTS: LazyLock<HashMap<char, KVariant>> = LazyLock::new(|| {
    // The tab separated format is like:
    //
    //   㨲 (U+3A32)	wrong!	㩍 (U+3A4D)
    //   铿 (U+94FF)	simp	鏗 (U+93D7)
    //   㓻 (U+34FB)	sem	    剛 (U+525B)
    //   ...
    //
    let tsv = include_str!("../../../dictionaries/txt/chinese/kVariants.tsv");
    let mut reader =
        csv::ReaderBuilder::new().delimiter(b'\t').has_headers(false).from_reader(tsv.as_bytes());

    let mut map: HashMap<char, KVariant> = HashMap::new();
    for result in reader.deserialize() {
        // The file is embedded at compile time, so a malformed row is a packaging bug.
        let line: TsvRow = result.expect("malformed row in embedded kVariants.tsv");

        // Only the leading character of each ideograph column is the ideograph itself.
        let lhs = line
            .lhs
            .chars()
            .next()
            .expect("empty source column in embedded kVariants.tsv"); // Extract "㨲" from "㨲 (U+3A32)"
        let rhs = line
            .rhs
            .chars()
            .next()
            .expect("empty destination column in embedded kVariants.tsv"); // Extract "㩍" from "㩍 (U+3A4D)"

        let classification = match line.relation.as_str() {
            "wrong!" => KVariantClass::Wrong,
            "sem" => KVariantClass::SementicVariant,
            "simp" => KVariantClass::Simplified,
            "old" => KVariantClass::Old,
            "=" => KVariantClass::Equal,
            unexpected_classification => {
                debug_assert!(
                    false,
                    "Unexpected classification {unexpected_classification:?} encountered. Consider handling or ignoring it explicitly."
                );
                // Release builds silently skip rows with an unknown relation.
                continue;
            }
        };

        // `insert` returns the previous value for the key, which detects duplicates
        // without a separate `contains_key` lookup.
        let previous = map.insert(
            lhs,
            KVariant { source_ideograph: lhs, classification, destination_ideograph: rhs },
        );
        debug_assert!(
            previous.is_none(),
            "Unexpected one source ideograph mapping to multiple destination ideographs.
                 If this happens in the future when we update kVariants.tsv, we would need to handle it
                 by, for example, deciding priorities for different classification types. "
        );
    }

    map
});

#[cfg(test)]
mod test {
    use super::*;

    /// Spot-checks `KVARIANTS`: one lookup per `KVariantClass` variant, followed by one
    /// lookup that is expected to be absent from the table.
    ///
    /// NOTE(review): the char literals in this test show up empty in some renderings of
    /// this file — confirm the ideographs are intact in the checked-in source.
    #[test]
    fn test_kvariants() {
        // Entry classified as `Wrong`.
        assert_eq!(
            KVARIANTS.get(&''),
            Some(&KVariant {
                source_ideograph: '',
                classification: KVariantClass::Wrong,
                destination_ideograph: ''
            }),
        );
        // Entry classified as `SementicVariant`.
        assert_eq!(
            KVARIANTS.get(&''),
            Some(&KVariant {
                source_ideograph: '',
                classification: KVariantClass::SementicVariant,
                destination_ideograph: '',
            }),
        );
        // Entry classified as `Simplified`.
        assert_eq!(
            KVARIANTS.get(&''),
            Some(&KVariant {
                source_ideograph: '',
                classification: KVariantClass::Simplified,
                destination_ideograph: '',
            }),
        );
        // Entry classified as `Old`.
        assert_eq!(
            KVARIANTS.get(&''),
            Some(&KVariant {
                source_ideograph: '',
                classification: KVariantClass::Old,
                destination_ideograph: '',
            }),
        );
        // Entry classified as `Equal`.
        assert_eq!(
            KVARIANTS.get(&''),
            Some(&KVariant {
                source_ideograph: '',
                classification: KVariantClass::Equal,
                destination_ideograph: '',
            }),
        );
        // An ideograph that is not a key of the table yields `None`.
        assert_eq!(KVARIANTS.get(&''), None);
    }

    /// Checks, for every mapping in `KVARIANTS`, that when its destination ideograph is
    /// itself a key of the table, that second entry does not map the ideograph to itself.
    #[test]
    fn test_no_loop() {
        for entry in KVARIANTS.values() {
            let target = entry.destination_ideograph;

            // Destinations that are not keys of the table (e.g. the target of "栄")
            // have no follow-up mapping, so there is nothing to verify for them.
            if let Some(follow_up) = KVARIANTS.get(&target) {
                // The destination is also a source ideograph (e.g. "椉 old 乘"):
                // its own destination must be a different ideograph.
                assert_ne!(target, follow_up.destination_ideograph);
            }
        }
    }
}