khmercut/
lib.rs

1use crfs::{Attribute, Model};
2use regex::Regex;
3use std::vec;
4
5pub fn text_tagger(input_str: &String) -> Vec<(&str, String)> {
6    let grapheme_raw_regex =
7        r"([\u1780-\u17FF](\u17d2[\u1780-\u17FF]|[\u17B6-\u17D1\u17D3\u17DD])*)";
8    let grapheme_regex: Regex = Regex::new(grapheme_raw_regex).unwrap();
9    let characters_regex = Regex::new(
10        r"([\u1780-\u17d3]+)|([\u17d4-\u17dd]+)|([\u17e0-\u17e9]+)|(\s+)|([^\u1780-\u17ff\s]+)",
11    )
12    .unwrap();
13
14    let mut outputs = vec![];
15    for capture in characters_regex.captures_iter(&input_str) {
16        let (data_type, text) = capture
17            .iter()
18            .enumerate()
19            .skip(1)
20            .find(|t| t.1.is_some())
21            .map(|t| (t.0, t.1.unwrap().as_str()))
22            .unwrap();
23
24        if data_type != 1 {
25            outputs.push((text, "NS".to_string()));
26            continue;
27        }
28
29        for match_item in grapheme_regex.find_iter(text) {
30            let grapheme = match_item.as_str();
31            let size = grapheme.chars().count();
32            let grapheme_type: String = if size == 1 {
33                "C".to_string()
34            } else {
35                format!("K{}", size)
36            };
37
38            outputs.push((grapheme, grapheme_type));
39        }
40    }
41
42    return outputs;
43}
44
45pub fn create_features(kccs: &Vec<(&str, String)>) -> Vec<Vec<Attribute>> {
46    let mut chunks: Vec<Vec<Attribute>> = vec![];
47    let size = kccs.len();
48
49    for (i, el) in kccs.iter().enumerate() {
50        // initial feature
51        let mut items = vec![
52            Attribute::new(format!("kcc:{}", el.0), 1.0),
53            Attribute::new(format!("t:{}", el.1), 1.0),
54            Attribute::new("ns".to_string(), if el.1 == "NS" { 1.0 } else { 0.0 }),
55        ];
56
57        // bos
58        if i == 0 {
59            items.push(Attribute::new("BOS", 1.0));
60        }
61
62        if i >= 1 {
63            items.push(Attribute::new(format!("kcc[-1]:{}", kccs[i - 1].0), 1.0));
64            items.push(Attribute::new(format!("kcc[-1]t:{}", kccs[i - 1].1), 1.0));
65            items.push(Attribute::new(
66                format!("kcc[-1:0]:{}{}", kccs[i - 1].0, el.0),
67                1.0,
68            ));
69            let ns_1 = if kccs[i - 1].1 == "NS" { 1.0 } else { 0.0 };
70            items.push(Attribute::new("ns-1".to_string(), ns_1));
71        }
72
73        if i >= 2 {
74            items.push(Attribute::new(format!("kcc[-2]:{}", kccs[i - 2].0), 1.0));
75            items.push(Attribute::new(format!("kcc[-2]t:{}", kccs[i - 2].1), 1.0));
76            items.push(Attribute::new(
77                format!("kcc[-2:-1]:{}{}", kccs[i - 2].0, kccs[i - 1].0),
78                1.0,
79            ));
80            items.push(Attribute::new(
81                format!("kcc[-2:0]:{}{}{}", kccs[i - 2].0, kccs[i - 1].0, kccs[i].0,),
82                1.0,
83            ));
84        }
85
86        if i >= 3 {
87            items.push(Attribute::new(format!("kcc[-3]:{}", kccs[i - 3].0), 1.0));
88            items.push(Attribute::new(format!("kcc[-3]t:{}", kccs[i - 3].1), 1.0));
89            items.push(Attribute::new(
90                format!(
91                    "kcc[-3:0]:{}{}{}{}",
92                    kccs[i - 3].0,
93                    kccs[i - 2].0,
94                    kccs[i - 1].0,
95                    kccs[i].0
96                ),
97                1.0,
98            ));
99            items.push(Attribute::new(
100                format!(
101                    "kcc[-3:-1]:{}{}{}",
102                    kccs[i - 3].0,
103                    kccs[i - 2].0,
104                    kccs[i - 1].0
105                ),
106                1.0,
107            ));
108            items.push(Attribute::new(
109                format!("kcc[-3:-2]:{}{}", kccs[i - 3].0, kccs[i - 2].0),
110                1.0,
111            ));
112        }
113
114        if size >= 1 && i < size - 1 {
115            items.push(Attribute::new(format!("kcc[+1]:{}", kccs[i + 1].0), 1.0));
116            items.push(Attribute::new(format!("kcc[+1]t:{}", kccs[i + 1].1), 1.0));
117            items.push(Attribute::new(
118                format!("kcc[+1:0]t:{}{}", kccs[i].0, kccs[i + 1].0),
119                1.0,
120            ));
121            items.push(Attribute::new(
122                format!("ns+1"),
123                if kccs[i + 1].1 == "NS" { 1.0 } else { 0.0 },
124            ));
125        }
126
127        if size >= 2 && i < size - 2 {
128            items.push(Attribute::new(format!("kcc[+2]:{}", kccs[i + 2].0), 1.0));
129            items.push(Attribute::new(format!("kcc[+2]t:{}", kccs[i + 2].1), 1.0));
130            items.push(Attribute::new(
131                format!("kcc[+1:+2]:{}{}", kccs[i + 1].0, kccs[i + 2].0),
132                1.0,
133            ));
134            items.push(Attribute::new(
135                format!("kcc[0:+2]:{}{}{}", kccs[i].0, kccs[i + 1].0, kccs[i + 2].0),
136                1.0,
137            ));
138            items.push(Attribute::new(
139                "ns+2",
140                if kccs[i + 2].1 == "NS" { 1.0 } else { 0.0 },
141            ));
142        }
143
144        if size >= 3 && i < size - 3 {
145            items.push(Attribute::new(format!("kcc[+3]:{}", kccs[i + 3].0), 1.0));
146            items.push(Attribute::new(format!("kcc[+3]t:{}", kccs[i + 3].1), 1.0));
147            items.push(Attribute::new(
148                format!("kcc[+2:+3]t:{}{}", kccs[i + 2].0, kccs[i + 3].0),
149                1.0,
150            ));
151            items.push(Attribute::new(
152                format!(
153                    "kcc[+1:+3]t:{}{}{}",
154                    kccs[i + 1].0,
155                    kccs[i + 2].0,
156                    kccs[i + 3].0
157                ),
158                1.0,
159            ));
160            items.push(Attribute::new(
161                format!(
162                    "kcc[0:+3]t:{}{}{}{}",
163                    kccs[i].0,
164                    kccs[i + 1].0,
165                    kccs[i + 2].0,
166                    kccs[i + 3].0
167                ),
168                1.0,
169            ));
170        }
171
172        if size >= 1 && i == size - 1 {
173            items.push(Attribute::new("EOS", 1.0));
174        }
175
176        chunks.push(items);
177    }
178
179    return chunks;
180}
181
182pub fn tokenize(model: &Model, input_str: &String) -> Vec<String> {
183    let normalized_text = input_str.replace("\u{200b}", "");
184    let graphemes = text_tagger(&normalized_text);
185    let features = create_features(&graphemes);
186    let mut tagger = model.tagger().unwrap();
187    let results: Vec<&str> = tagger.tag(&features).unwrap();
188    let mut tokens = vec![];
189
190    for (i, y) in results.iter().enumerate() {
191        let (c, _) = graphemes.get(i).unwrap();
192        let flag = y.parse::<i8>().unwrap();
193        if flag == 1 || i == 0 {
194            tokens.push(c.to_string());
195            continue;
196        }
197        tokens.last_mut().unwrap().push_str(c);
198    }
199    tokens
200}