1pub mod dict;
2pub mod edge;
3pub mod edge_builders;
4pub mod errors;
5pub mod replacer;
6pub mod text_range;
7
8#[macro_use]
9extern crate lazy_static;
10extern crate prefixtree;
11#[macro_use]
12extern crate serde_derive;
13
14use self::prefixtree::{prefix_tree_from_str, PrefixTree};
15use crate::dict::Dict;
16use crate::edge::{Edge, EdgeType};
17use crate::edge_builders::{
18 DictEdgeBuilder, EdgeBuilder, EdgeBuildingContext, RuleBasedEdgeBuilder, UnkEdgeBuilder,
19};
20use crate::errors::WordcutError;
21use crate::text_range::TextRange;
22use regex_automata::dfa::dense;
23use regex_automata::dfa::Automaton;
24use regex_automata::meta::Regex;
25use regex_automata::util::primitives::StateID;
26use regex_automata::util::start;
27use regex_automata::Anchored;
28use std::fs::File;
29use std::io;
30use std::io::BufRead;
31use std::path::Path;
32
/// Expands to a `&'static Path` pointing at `data/<filename>` inside the
/// crate source tree (resolved via `CARGO_MANIFEST_DIR` at compile time).
macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}
38
/// Like `insert_prefix!` but expands to a `&'static str` instead of a `Path`.
macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}
44
/// Path to the bundled default word list (`data/mixed-wordlist.txt`).
pub fn default_dict_path() -> &'static Path {
    insert_prefix!("mixed-wordlist.txt")
}
48
/// Path (as an owned `String`) to the bundled Thai cluster rules file.
pub fn thai_cluster_path() -> Option<String> {
    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}
52
/// Path (as an owned `String`) to the bundled Thai replacement rules file.
pub fn thai_replace_rules_path() -> Option<String> {
    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}
56
/// Dense, byte-at-a-time DFA compiled from a cluster-rules file.
type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
/// Regex used by the rule-based edge builder to carve out special runs
/// (see `DEFAULT_THAI_SPLIT_RE` below for the default pattern).
type SplitRulesMatcher = Regex;
59
// Default split pattern: runs of whitespace, Latin letters, Arabic digits,
// Thai digits, or a left double quotation mark each form one token.
lazy_static! {
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}
64
// Cluster-rule matching is anchored: a rule must match starting exactly at
// the byte position where the match attempt was created.
lazy_static! {
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}
68
69pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
70 let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
71 prefix_tree_from_str(&words_payloads[..])
72}
73
/// Returns `true` when the span `s..e` does not cut through a character
/// cluster. `clusters[i]` is the cluster id of character `i` (0 = none);
/// a boundary is legal when it sits at the text edge, on an unclustered
/// character, or exactly between two different clusters.
#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    // Start boundary falls inside a cluster: reject immediately
    // (preserves the original short-circuit order).
    if s != 0 && clusters[s] != 0 && clusters[s] == clusters[s - 1] {
        return false;
    }
    // End boundary must likewise sit at the edge of the text or a cluster.
    e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1]
}
79
80#[inline]
81fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
82 let mut skip_edge = false;
83 if let Some(edge) = edge {
84 let s = edge.p;
85 let e = i + 1;
86 skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
87 }
88 skip_edge
89}
90
91fn build_path_with_clusters(
92 mut builders: Vec<&mut dyn EdgeBuilder>,
93 clusters: &[usize],
94 text: &[char],
95) -> Vec<Edge> {
96 let mut path = vec![];
97 path.push(Edge {
98 w: 0,
99 unk: 0,
100 p: 0,
101 etype: EdgeType::Init,
102 });
103
104 let mut context = EdgeBuildingContext {
105 text,
106 i: 0,
107 ch: '\0',
108 left_boundary: 0,
109 best_edge: None,
110 };
111
112 let text_len = text.len();
113 for i in 0..text_len {
114 context.ch = text[i];
115 context.i = i;
116 context.best_edge = None;
117 for builder in &mut builders {
118 let edge = builder.build(&context, &path);
119 if !should_skip_edge(&edge, i, text_len, clusters)
120 && Edge::better(&edge, &context.best_edge)
121 {
122 context.best_edge = edge
123 }
124 }
125 path.push(context.best_edge.unwrap());
126 if !context.best_edge.unwrap().is_unk() {
127 context.left_boundary = i + 1;
128 }
129 }
130 path
131}
132
/// An edge of the word lattice: a candidate token covering character
/// indices `s..e`, tagged with how it was produced.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    /// Start character index (inclusive).
    pub s: usize,
    /// End character index (exclusive).
    pub e: usize,
    /// Edge kind (`Init`, `Dict`, `Unk`, ...).
    pub etype: EdgeType,
}
139
/// Word lattice as adjacency buckets: `dag[e]` holds every edge that ends
/// at character boundary `e`.
pub type Dag = Vec<Vec<DagEdge>>;

/// Produces all DAG edges that end at the context's current position.
pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
145
impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
    /// Advances the dictionary match pointers by the current character and
    /// returns a `Dict` edge for every pointer sitting on a complete word
    /// (pointer semantics live in `edge_builders::DictEdgeBuilder`).
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
        self.add_pointer(context);
        self.update_pointers(context);
        self.pointers
            .iter()
            .filter(|p| p.is_final)
            .map(|p| DagEdge {
                s: p.s,
                e: context.i + 1,
                etype: EdgeType::Dict,
            })
            .collect()
    }
}
162
163pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
164 let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
165
166 let mut dag = Vec::with_capacity(text.len() + 1);
167
168 for _ in 0..text.len() + 1 {
169 dag.push(vec![]);
170 }
171 dag[0].push(DagEdge {
172 s: 0,
173 e: 0,
174 etype: EdgeType::Init,
175 });
176 let mut context = EdgeBuildingContext {
177 text: &text,
178 i: 0,
179 ch: '\0',
180 left_boundary: 0,
181 best_edge: None,
182 };
183
184 for i in 0..text.len() {
185 context.ch = text[i];
186 context.i = i;
187 context.best_edge = None;
188
189 for builder in &mut builders {
190 for edge in builder.build_dag_edges(&context) {
191 dag[edge.e].push(edge)
192 }
193 }
194 }
195
196 let mut left_boundary = 0;
197 for i in 1..text.len() + 1 {
198 if dag[i].len() == 0 {
199 dag[i].push(DagEdge {
200 s: left_boundary,
201 e: i,
202 etype: EdgeType::Unk,
203 });
204 } else {
205 left_boundary = i;
206 }
207 }
208
209 dag
210}
211
212pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
213 let len = path.len();
214
215 if len == 0 {
216 return vec![];
217 }
218
219 let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
220 let mut e = len - 1;
221 while e > 0 {
222 let edge = &path[e];
223 let s = edge.p;
224 ranges.push(TextRange { s, e });
225 e = s;
226 }
227 ranges.reverse();
228 ranges
229}
230
231pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
232 let char_ranges = path_to_ranges(path);
233 let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
234 let mut global_byte_offset = 0;
235 for r in char_ranges {
236 let mut word_byte_offset = 0;
237 for i in r.s..r.e {
238 word_byte_offset += text[i].len_utf8();
239 }
240 ranges.push(TextRange {
241 s: global_byte_offset,
242 e: global_byte_offset + word_byte_offset,
243 });
244 global_byte_offset += word_byte_offset;
245 }
246 ranges
247}
248
249pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
250 let ranges = path_to_ranges(path);
251 let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
252 for r in ranges {
253 let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
254 for i in r.s..r.e {
255 buf.push(text[i]);
256 }
257 str_vec.push(buf)
258 }
259 str_vec
260}
261
/// A dictionary-based word segmenter with optional cluster rules and a
/// configurable split regex.
pub struct Wordcut {
    // Prefix-tree dictionary of known words.
    dict: Dict,
    // Optional cluster-rule DFA; when present, non-unknown edges that would
    // split a character cluster are rejected.
    cluster_re: Option<ClusterRulesMatcher>,
    // Regex handed to the rule-based edge builder (defaults to
    // DEFAULT_THAI_SPLIT_RE).
    split_re: SplitRulesMatcher,
}
267
impl Wordcut {
    /// Creates a segmenter backed by `dict`, using the default split regex
    /// and no cluster rules.
    pub fn new(dict: Dict) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: None,
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    /// Like [`Wordcut::new`] but with a cluster-rule DFA so segmentation
    /// will not cut through character clusters.
    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    /// Fully-custom constructor: dictionary, cluster-rule DFA, and split
    /// regex.
    pub fn new_with_cluster_re_and_split_re(
        dict: Dict,
        cluster_re: ClusterRulesMatcher,
        split_re: SplitRulesMatcher,
    ) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re,
        }
    }

    /// Builds the best edge path for `text`. `text_chars` must be the chars
    /// of `text` (passed in so callers collect only once). Without a cluster
    /// DFA an all-zero cluster map is used, i.e. no cluster constraints.
    #[inline]
    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
        let mut unk_edge_builder = UnkEdgeBuilder::new();
        let mut rule_based_edge_builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
        let builders: Vec<&mut dyn EdgeBuilder> = vec![
            &mut dict_edge_builder,
            &mut unk_edge_builder,
            &mut rule_based_edge_builder,
        ];

        let clusters = if let Some(cluster_re) = &self.cluster_re {
            find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
        } else {
            let mut clusters = vec![];
            clusters.resize(text_chars.len() + 1, 0);
            clusters
        };
        build_path_with_clusters(builders, &clusters, text_chars)
    }

    /// Segments `text` into character-index ranges.
    #[allow(dead_code)]
    pub fn segment(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_ranges(&path)
    }

    /// Segments `text` into UTF-8 byte-offset ranges.
    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_byte_ranges(&path, &text_chars)
    }

    /// Segments `text` into owned token strings.
    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_str_vec(&path, &text_chars)
    }

    /// Segments `text` and joins the tokens with `delim`.
    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
        self.segment_into_strings(text).join(delim)
    }

    /// Builds the full word lattice for `text` using only the dictionary
    /// (no cluster or split rules).
    #[allow(dead_code)]
    pub fn build_dag(&self, text: &str) -> Dag {
        build_dag(&self.dict, &text.chars().collect())
    }
}
348
/// Maps every byte offset of `text` to the index of the character starting
/// there; UTF-8 continuation bytes map to 0 (they are never used as
/// boundaries). A final entry maps `text.len()` to the total char count.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.len() + 1);
    let mut char_idx = 0;
    for &byte in text.as_bytes() {
        // 0b10xx_xxxx marks a UTF-8 continuation byte (same predicate as
        // `(byte as i8) >= -0x40`, inverted).
        if byte & 0xC0 == 0x80 {
            map.push(0);
        } else {
            map.push(char_idx);
            char_idx += 1;
        }
    }
    map.push(char_idx);
    map
}
363
/// An in-flight anchored DFA match attempt: the current DFA state plus the
/// byte position `p` where the attempt started.
#[derive(Debug)]
pub struct ClusterPointer {
    state_id: StateID,
    p: usize,
}
369
/// One step of the byte-indexed cluster path.
#[derive(Debug)]
pub struct ClusterEdge {
    // Total bytes matched by cluster rules along the path so far.
    acc_pat_len: usize,
    // Total bytes not covered by any rule along the path so far.
    unk_cnt: usize,
    // Byte position where this edge starts (its parent in the path).
    p: usize,
    // Whether this edge is a fallback over unmatched bytes.
    is_unk: bool,
}
377
/// Runs the cluster DFA over the raw bytes of `text` and returns, for each
/// byte position, the best `ClusterEdge` ending there (index 0 is a
/// synthetic start edge, so the result has `text.len() + 1` entries).
/// "Best" means fewer unknown bytes first, then a longer accumulated
/// matched-pattern length.
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    // Active match attempts, one per still-alive anchored start position.
    let mut pointers = vec![];
    let mut ch_index = 0;
    let mut path = vec![];
    // Start of the current unmatched region, used for fallback edges.
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        // A new anchored match attempt begins at every byte.
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        // Advance every pointer by this byte, compacting out dead ones
        // in place (pointers[..new_pointer_index] survive).
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                // A rule matches the span p..=ch_index: candidate edge.
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    // Replace best_edge when the candidate has fewer unknown
                    // bytes, or equal unknowns but a longer matched length.
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        // No rule ends here: fall back to an unknown edge spanning from the
        // last matched boundary.
        if best_edge.is_none() {
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}
447
/// Assigns a cluster id to each character of `text` (0 = not in a cluster).
/// `len` must be the char count of `text`. Ids are handed out while walking
/// the cluster path backwards, so clusters later in the text get smaller
/// ids; only the grouping matters to `does_not_break_cluster`, not the id
/// values.
pub fn find_clusters(
    text: &str,
    byte_to_char_idx_map: &[usize],
    dfa: &ClusterRulesMatcher,
    len: usize,
) -> Vec<usize> {
    let mut clusters = vec![];
    clusters.resize(len, 0);
    let mut id = 1;
    let path = find_cluster_path(dfa, text);
    // Walk edges backwards over the byte-indexed path; `me`/`ms` are the
    // end/start byte offsets of each edge.
    let mut me = path.len() - 1;
    while me > 0 {
        let edge = &path[me];
        let ms = edge.p;
        // Convert byte offsets to character indices.
        let s = byte_to_char_idx_map[ms];
        let e = byte_to_char_idx_map[me];
        if !edge.is_unk {
            // Mark every character of a matched cluster; unknown spans keep 0.
            for i in s..e {
                clusters[i] = id;
            }
            id += 1;
        }
        me = ms;
    }
    clusters
}
474
/// Reads a word list, one word per line. Any I/O error — opening the file
/// or reading a line — is returned as `Err` instead of panicking (the
/// previous `line.unwrap()` aborted on a mid-file read error).
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let f = File::open(path.as_ref())?;
    // Collecting Iterator<Item = io::Result<String>> into io::Result<Vec<_>>
    // short-circuits on the first error.
    io::BufReader::new(f).lines().collect()
}
480
481pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
482 let wordlist = load_wordlist(path)?;
483 let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
484 Ok(create_prefix_tree(&wordlist))
485}
486
487pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
488 let f = File::open(path)
489 .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
490 let f = io::BufReader::new(f);
491 let mut rules = vec![];
492 for line in f.lines() {
493 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
494 rules.push(format!("({})", line.trim()));
495 }
496 let rules = rules.join("|");
497 let dfa =
498 dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
499 Ok(dfa)
500}
501
502pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
503 let f = File::open(path)
504 .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
505 let f = io::BufReader::new(f);
506 let mut rules = vec![];
507 for line in f.lines() {
508 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
509 rules.push(format!("({})", line.trim()));
510 }
511 let rules = rules.join("|");
512 Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
513}
514
// Unit tests. Fixtures under `data/` (word lists, cluster/split rule files)
// are resolved via CARGO_MANIFEST_DIR, so these run only from the crate tree.
#[cfg(test)]
mod tests {
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    // Seeking a stored char yields its node payload; an absent char, None.
    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    // Segmentation into character ranges; the longer dictionary word wins.
    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Same split reported as byte offsets (each Thai char is 3 UTF-8 bytes).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    // Segmentation into owned token strings.
    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    // Latin runs and spaces become their own tokens via the split regex.
    #[test]
    fn test_segment_to_strings_mixed() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("cat กากกา");
        let expected = vec![
            String::from("cat"),
            String::from(" "),
            String::from("กาก"),
            String::from("กา"),
        ];
        assert_eq!(toks, expected)
    }

    // Join segmented tokens with a delimiter string.
    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Word list loads line-by-line from a data file.
    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    // End-to-end: dictionary loaded from file drives segmentation.
    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Replacement rules normalize decomposed sara am before segmentation.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    // Replacement applies to every occurrence, not just the first.
    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    // A Latin run inside Thai text is kept together as one token.
    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    // A run of spaces is one token between dictionary words.
    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา มา", "|"),
            String::from("กา| |มา")
        )
    }

    // Spaces still split even when the surrounding words are unknown.
    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ ยย", "|"),
            String::from("แแ| |ยย")
        )
    }

    // The curly quote from the default split regex is its own token.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    // Full DAG: Init at 0, dictionary edges, and Unk fallbacks at gaps.
    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // Same DAG via the free function build_dag.
    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // Empty input yields a DAG holding only the Init edge.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // DagEdge serializes via serde with short field names and etype as string.
    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids per character: ids count down from the end; 0 = no cluster.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    // Cluster rules keep an unknown cluster span together as one token.
    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    // Clusters must not be broken when splitting a transliterated name.
    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    // Another loanword case; pins the current cluster-constrained output.
    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    // Cluster rules also work for Khmer script.
    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Drives RuleBasedEdgeBuilder directly through " ABก": pattern edges are
    // produced only at the end of a regex-matched run.
    #[test]
    fn test_rule_based_edge_builder() {
        let text = " ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    // Custom split rules loaded from file replace the default split regex.
    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB||| |||X|||(|||A|||)|||/|||12")
        );
    }

    // The cluster path is byte-indexed: len = byte count + 1, and the last
    // edge's parent points at the start byte of the final cluster.
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}