1use crfs::{Attribute, Model};
2use regex::Regex;
3use std::vec;
4
5pub fn text_tagger(input_str: &String) -> Vec<(&str, String)> {
6 let grapheme_raw_regex =
7 r"([\u1780-\u17FF](\u17d2[\u1780-\u17FF]|[\u17B6-\u17D1\u17D3\u17DD])*)";
8 let grapheme_regex: Regex = Regex::new(grapheme_raw_regex).unwrap();
9 let characters_regex = Regex::new(
10 r"([\u1780-\u17d3]+)|([\u17d4-\u17dd]+)|([\u17e0-\u17e9]+)|(\s+)|([^\u1780-\u17ff\s]+)",
11 )
12 .unwrap();
13
14 let mut outputs = vec![];
15 for capture in characters_regex.captures_iter(&input_str) {
16 let (data_type, text) = capture
17 .iter()
18 .enumerate()
19 .skip(1)
20 .find(|t| t.1.is_some())
21 .map(|t| (t.0, t.1.unwrap().as_str()))
22 .unwrap();
23
24 if data_type != 1 {
25 outputs.push((text, "NS".to_string()));
26 continue;
27 }
28
29 for match_item in grapheme_regex.find_iter(text) {
30 let grapheme = match_item.as_str();
31 let size = grapheme.chars().count();
32 let grapheme_type: String = if size == 1 {
33 "C".to_string()
34 } else {
35 format!("K{}", size)
36 };
37
38 outputs.push((grapheme, grapheme_type));
39 }
40 }
41
42 return outputs;
43}
44
45pub fn create_features(kccs: &Vec<(&str, String)>) -> Vec<Vec<Attribute>> {
46 let mut chunks: Vec<Vec<Attribute>> = vec![];
47 let size = kccs.len();
48
49 for (i, el) in kccs.iter().enumerate() {
50 let mut items = vec![
52 Attribute::new(format!("kcc:{}", el.0), 1.0),
53 Attribute::new(format!("t:{}", el.1), 1.0),
54 Attribute::new("ns".to_string(), if el.1 == "NS" { 1.0 } else { 0.0 }),
55 ];
56
57 if i == 0 {
59 items.push(Attribute::new("BOS", 1.0));
60 }
61
62 if i >= 1 {
63 items.push(Attribute::new(format!("kcc[-1]:{}", kccs[i - 1].0), 1.0));
64 items.push(Attribute::new(format!("kcc[-1]t:{}", kccs[i - 1].1), 1.0));
65 items.push(Attribute::new(
66 format!("kcc[-1:0]:{}{}", kccs[i - 1].0, el.0),
67 1.0,
68 ));
69 let ns_1 = if kccs[i - 1].1 == "NS" { 1.0 } else { 0.0 };
70 items.push(Attribute::new("ns-1".to_string(), ns_1));
71 }
72
73 if i >= 2 {
74 items.push(Attribute::new(format!("kcc[-2]:{}", kccs[i - 2].0), 1.0));
75 items.push(Attribute::new(format!("kcc[-2]t:{}", kccs[i - 2].1), 1.0));
76 items.push(Attribute::new(
77 format!("kcc[-2:-1]:{}{}", kccs[i - 2].0, kccs[i - 1].0),
78 1.0,
79 ));
80 items.push(Attribute::new(
81 format!("kcc[-2:0]:{}{}{}", kccs[i - 2].0, kccs[i - 1].0, kccs[i].0,),
82 1.0,
83 ));
84 }
85
86 if i >= 3 {
87 items.push(Attribute::new(format!("kcc[-3]:{}", kccs[i - 3].0), 1.0));
88 items.push(Attribute::new(format!("kcc[-3]t:{}", kccs[i - 3].1), 1.0));
89 items.push(Attribute::new(
90 format!(
91 "kcc[-3:0]:{}{}{}{}",
92 kccs[i - 3].0,
93 kccs[i - 2].0,
94 kccs[i - 1].0,
95 kccs[i].0
96 ),
97 1.0,
98 ));
99 items.push(Attribute::new(
100 format!(
101 "kcc[-3:-1]:{}{}{}",
102 kccs[i - 3].0,
103 kccs[i - 2].0,
104 kccs[i - 1].0
105 ),
106 1.0,
107 ));
108 items.push(Attribute::new(
109 format!("kcc[-3:-2]:{}{}", kccs[i - 3].0, kccs[i - 2].0),
110 1.0,
111 ));
112 }
113
114 if size >= 1 && i < size - 1 {
115 items.push(Attribute::new(format!("kcc[+1]:{}", kccs[i + 1].0), 1.0));
116 items.push(Attribute::new(format!("kcc[+1]t:{}", kccs[i + 1].1), 1.0));
117 items.push(Attribute::new(
118 format!("kcc[+1:0]t:{}{}", kccs[i].0, kccs[i + 1].0),
119 1.0,
120 ));
121 items.push(Attribute::new(
122 format!("ns+1"),
123 if kccs[i + 1].1 == "NS" { 1.0 } else { 0.0 },
124 ));
125 }
126
127 if size >= 2 && i < size - 2 {
128 items.push(Attribute::new(format!("kcc[+2]:{}", kccs[i + 2].0), 1.0));
129 items.push(Attribute::new(format!("kcc[+2]t:{}", kccs[i + 2].1), 1.0));
130 items.push(Attribute::new(
131 format!("kcc[+1:+2]:{}{}", kccs[i + 1].0, kccs[i + 2].0),
132 1.0,
133 ));
134 items.push(Attribute::new(
135 format!("kcc[0:+2]:{}{}{}", kccs[i].0, kccs[i + 1].0, kccs[i + 2].0),
136 1.0,
137 ));
138 items.push(Attribute::new(
139 "ns+2",
140 if kccs[i + 2].1 == "NS" { 1.0 } else { 0.0 },
141 ));
142 }
143
144 if size >= 3 && i < size - 3 {
145 items.push(Attribute::new(format!("kcc[+3]:{}", kccs[i + 3].0), 1.0));
146 items.push(Attribute::new(format!("kcc[+3]t:{}", kccs[i + 3].1), 1.0));
147 items.push(Attribute::new(
148 format!("kcc[+2:+3]t:{}{}", kccs[i + 2].0, kccs[i + 3].0),
149 1.0,
150 ));
151 items.push(Attribute::new(
152 format!(
153 "kcc[+1:+3]t:{}{}{}",
154 kccs[i + 1].0,
155 kccs[i + 2].0,
156 kccs[i + 3].0
157 ),
158 1.0,
159 ));
160 items.push(Attribute::new(
161 format!(
162 "kcc[0:+3]t:{}{}{}{}",
163 kccs[i].0,
164 kccs[i + 1].0,
165 kccs[i + 2].0,
166 kccs[i + 3].0
167 ),
168 1.0,
169 ));
170 }
171
172 if size >= 1 && i == size - 1 {
173 items.push(Attribute::new("EOS", 1.0));
174 }
175
176 chunks.push(items);
177 }
178
179 return chunks;
180}
181
182pub fn tokenize(model: &Model, input_str: &String) -> Vec<String> {
183 let normalized_text = input_str.replace("\u{200b}", "");
184 let graphemes = text_tagger(&normalized_text);
185 let features = create_features(&graphemes);
186 let mut tagger = model.tagger().unwrap();
187 let results: Vec<&str> = tagger.tag(&features).unwrap();
188 let mut tokens = vec![];
189
190 for (i, y) in results.iter().enumerate() {
191 let (c, _) = graphemes.get(i).unwrap();
192 let flag = y.parse::<i8>().unwrap();
193 if flag == 1 || i == 0 {
194 tokens.push(c.to_string());
195 continue;
196 }
197 tokens.last_mut().unwrap().push_str(c);
198 }
199 tokens
200}