1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
use serde::{Deserialize, Serialize};

/// Settings attached to one character category.
///
/// Instances live in `CharacterDefinitions::category_definitions` and are
/// retrieved with `CharacterDefinitions::lookup_definition`.
///
/// NOTE(review): the three fields presumably mirror the INVOKE / GROUP / LENGTH
/// columns of a MeCab-style `char.def` category line — confirm against the
/// dictionary builder before relying on that reading.
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
pub struct CategoryData {
    // NOTE(review): meaning not visible in this file — presumably whether
    // unknown-word processing is always invoked for this category; confirm.
    pub invoke: bool,
    // NOTE(review): presumably whether consecutive characters of this category
    // are grouped into a single token; confirm.
    pub group: bool,
    // NOTE(review): presumably a maximum length used for unknown words of this
    // category; unit and semantics are not shown here — confirm.
    pub length: u32,
}

/// Identifier of a character category.
///
/// The wrapped `usize` is used directly as an index into both
/// `CharacterDefinitions::category_definitions` and
/// `CharacterDefinitions::category_names`.
#[derive(Serialize, Deserialize, Clone, Debug, Hash, Copy, PartialOrd, Ord, Eq, PartialEq)]
pub struct CategoryId(pub usize);

/// Character-category dictionary: category settings, category names and the
/// code-point → categories mapping.
///
/// NOTE(review): `CharacterDefinitions::load` decodes this struct with bincode,
/// so reordering or retyping the fields would presumably invalidate already
/// serialized data — confirm before changing the layout.
#[derive(Clone, Serialize, Deserialize)]
pub struct CharacterDefinitions {
    // Settings of each category, indexed by `CategoryId.0`.
    pub category_definitions: Vec<CategoryData>,
    // Name of each category, indexed by `CategoryId.0`.
    pub category_names: Vec<String>,
    // Maps a character's code point (`char as u32`) to its category ids.
    pub mapping: LookupTable<CategoryId>,
}

/// Piecewise-constant map from `u32` keys to lists of `T`.
///
/// Entry `i` of `values` is the list associated with the interval that starts
/// at `boundaries[i]`; `eval` selects the entry of the greatest boundary that
/// is less than or equal to the queried key.
#[derive(Serialize, Deserialize, Clone)]
pub struct LookupTable<T: Copy + Clone> {
    // Interval start points. When built through `from_fn` this is sorted in
    // ascending order and always includes 0.
    boundaries: Vec<u32>,
    // `values[i]` holds what the source function produced for `boundaries[i]`.
    values: Vec<Vec<T>>,
}

impl<T: Copy + Clone> LookupTable<T> {
    /// Builds a table by sampling `funct` once at every boundary.
    ///
    /// `boundaries` lists the start points of the intervals on which `funct`
    /// is assumed to be constant. The list does not need to be sorted or free
    /// of duplicates: it is sorted and deduplicated here, and `0` is added when
    /// missing so that every `u32` key falls into some interval.
    ///
    /// `funct(boundary, &mut out)` is called once per distinct boundary, in
    /// ascending order, with a fresh empty vector; whatever it pushes becomes
    /// the value returned by [`eval`](Self::eval) for that interval.
    pub fn from_fn(mut boundaries: Vec<u32>, funct: &dyn Fn(u32, &mut Vec<T>)) -> LookupTable<T> {
        if !boundaries.contains(&0) {
            boundaries.push(0);
        }
        boundaries.sort_unstable();
        // Repeated boundaries would only produce redundant rows and leave
        // `binary_search` free to pick any of them.
        boundaries.dedup();

        let values = boundaries
            .iter()
            .map(|&boundary| {
                let mut output = Vec::new();
                funct(boundary, &mut output);
                output
            })
            .collect();
        LookupTable { boundaries, values }
    }

    /// Returns the values of the interval containing `target`, i.e. the entry
    /// of the greatest boundary that is `<= target`.
    ///
    /// A table built with [`from_fn`](Self::from_fn) always has such an
    /// interval. A table obtained through deserialization may not (no boundary
    /// `<= target`, or fewer value rows than boundaries); in that case an empty
    /// slice is returned instead of panicking.
    pub fn eval(&self, target: u32) -> &[T] {
        let idx = match self.boundaries.binary_search(&target) {
            Ok(exact) => Some(exact),
            // `insertion` is the number of boundaries below `target`; the
            // interval we want starts at the one just before that position.
            Err(insertion) => insertion.checked_sub(1),
        };
        match idx.and_then(|i| self.values.get(i)) {
            Some(found) => found.as_slice(),
            None => &[],
        }
    }
}

impl CharacterDefinitions {
    /// Returns the names of all categories, in `CategoryId` index order.
    pub fn categories(&self) -> &[String] {
        self.category_names.as_slice()
    }

    /// Decodes character definitions from bincode-serialized bytes.
    ///
    /// # Panics
    ///
    /// Panics if `char_def_data` cannot be deserialized.
    pub fn load(char_def_data: &[u8]) -> CharacterDefinitions {
        let decoded: Result<CharacterDefinitions, _> = bincode::deserialize(char_def_data);
        decoded.expect("Failed to deserialize char definition data")
    }

    /// Returns the settings of the category identified by `category_id`.
    ///
    /// # Panics
    ///
    /// Panics if `category_id` is out of range for `category_definitions`.
    pub fn lookup_definition(&self, category_id: CategoryId) -> &CategoryData {
        let index = category_id.0;
        &self.category_definitions[index]
    }

    /// Returns the name of the category identified by `category_id`.
    ///
    /// # Panics
    ///
    /// Panics if `category_id` is out of range for `category_names`.
    pub fn category_name(&self, category_id: CategoryId) -> &str {
        let index = category_id.0;
        self.category_names[index].as_str()
    }

    /// Returns the ids of the categories that the character `c` belongs to.
    pub fn lookup_categories(&self, c: char) -> &[CategoryId] {
        let code_point = u32::from(c);
        self.mapping.eval(code_point)
    }
}

#[cfg(test)]
mod tests {
    use crate::core::character_definition::LookupTable;

    /// A table sampled from a step function must reproduce that function for
    /// every key, on both sides of the step at 10.
    #[test]
    fn test_lookup_table() {
        let classify = |code: u32, sink: &mut Vec<u32>| {
            let bucket = if code < 10u32 { 0u32 } else { 1u32 };
            sink.push(bucket);
        };
        let table = LookupTable::from_fn(vec![0u32, 10u32], &classify);
        (0..100u32).for_each(|code| {
            let mut expected = Vec::new();
            classify(code, &mut expected);
            assert_eq!(table.eval(code), expected.as_slice());
        });
    }

    // NOTE(review): the tests below are disabled. As written they call
    // `CharacterDefinitions::load()` without arguments, whereas `load` takes
    // the serialized definition bytes (`&[u8]`); re-enabling them requires
    // supplying dictionary data and importing `CharacterDefinitions`.

    //    #[test]
    //    fn test_bisa() {
    //        let char_definitions = CharacterDefinitions::load();
    //        let category_ids: Vec<&str> = char_definitions
    //            .lookup_categories('々')
    //            .iter()
    //            .map(|&category_id| char_definitions.category_name(category_id))
    //            .collect();
    //        assert_eq!(category_ids, &["KANJI", "SYMBOL"]);
    //    }

    //    #[test]
    //    fn test_jp_hyphen() {
    //        let char_definitions = CharacterDefinitions::load();
    //        let category_ids: Vec<&str> = char_definitions
    //            .lookup_categories('ー')
    //            .iter()
    //            .map(|&category_id| char_definitions.category_name(category_id))
    //            .collect();
    //        assert_eq!(category_ids, &["KATAKANA"]);
    //    }

    //    #[test]
    //    fn test_char_definitions() {
    //        let char_definitions = CharacterDefinitions::load();
    //        {
    //            let v = char_definitions.lookup_categories('あ');
    //            assert_eq!(v.len(), 1);
    //            assert_eq!(char_definitions.category_name(v[0]), "HIRAGANA");
    //        }
    //        {
    //            let v = char_definitions.lookup_categories('@');
    //            assert_eq!(v.len(), 1);
    //            assert_eq!(char_definitions.category_name(v[0]), "SYMBOL");
    //        }
    //        {
    //            let v = char_definitions.lookup_categories('一');
    //            assert_eq!(v.len(), 2);
    //            assert_eq!(char_definitions.category_name(v[0]), "KANJI");
    //            assert_eq!(char_definitions.category_name(v[1]), "KANJINUMERIC");
    //        }
    //    }
}