1pub mod dict;
2pub mod edge;
3pub mod edge_builders;
4pub mod errors;
5pub mod replacer;
6pub mod text_range;
7
8#[macro_use]
9extern crate lazy_static;
10extern crate prefixtree;
11#[macro_use]
12extern crate serde_derive;
13
14use self::prefixtree::{prefix_tree_from_str, PrefixTree};
15use crate::dict::Dict;
16use crate::edge::{Edge, EdgeType};
17use crate::edge_builders::{
18 DictEdgeBuilder, EdgeBuilder, EdgeBuildingContext, RuleBasedEdgeBuilder, UnkEdgeBuilder,
19};
20use crate::errors::WordcutError;
21use crate::text_range::TextRange;
22use regex_automata::dfa::dense;
23use regex_automata::dfa::Automaton;
24use regex_automata::meta::Regex;
25use regex_automata::util::primitives::StateID;
26use regex_automata::util::start;
27use regex_automata::Anchored;
28use std::fs::File;
29use std::io;
30use std::io::BufRead;
31use std::path::Path;
32
/// Expands to a `&'static Path` for `$filename` inside this crate's bundled
/// `data/` directory, resolved at compile time via `CARGO_MANIFEST_DIR`.
macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}
38
/// Like `insert_prefix!` but expands to a `&'static str` instead of a `Path`.
macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}
44
/// Path to the bundled default (mixed) wordlist shipped in `data/`.
pub fn default_dict_path() -> &'static Path {
    insert_prefix!("mixed-wordlist.txt")
}
48
/// Path to the bundled Thai cluster rules; always `Some` (the `Option` keeps
/// the signature compatible with callers that accept an optional path).
pub fn thai_cluster_path() -> Option<String> {
    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}
52
/// Path to the bundled Thai character-replacement rules (JSON); always `Some`.
pub fn thai_replace_rules_path() -> Option<String> {
    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}
56
/// Dense DFA that matches character-cluster rules (built by `load_cluster_rules`).
type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
/// Regex that matches split rules, i.e. spans forced into standalone tokens.
type SplitRulesMatcher = Regex;
59
lazy_static! {
    // Default split rules: whitespace runs, Latin-letter runs, ASCII-digit
    // runs, Thai-digit runs, and the left double quotation mark.
    // NOTE(review): std::sync::LazyLock could replace lazy_static on modern
    // toolchains — confirm the crate's MSRV before switching.
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}
64
lazy_static! {
    // Anchored start configuration: cluster-rule matches must begin exactly
    // at the position where a DFA run is started.
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}
68
69pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
70 let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
71 prefix_tree_from_str(&words_payloads[..])
72}
73
/// Returns `true` when the span `[s, e)` starts and ends on cluster
/// boundaries, i.e. placing a token edge there would not split a cluster.
/// `clusters[i]` is the cluster id of character `i` (0 = no cluster).
#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    // The start breaks a cluster when it sits strictly inside one: not at the
    // text start, inside a real cluster, and sharing that cluster's id with
    // the preceding character.
    if s != 0 && clusters[s] != 0 && clusters[s] == clusters[s - 1] {
        return false;
    }
    // Symmetrically for the end boundary.
    e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1]
}
79
80#[inline]
81fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
82 let mut skip_edge = false;
83 if let Some(edge) = edge {
84 let s = edge.p;
85 let e = i + 1;
86 skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
87 }
88 skip_edge
89}
90
91fn build_path_with_clusters(
92 mut builders: Vec<&mut dyn EdgeBuilder>,
93 clusters: &[usize],
94 text: &[char],
95) -> Vec<Edge> {
96 let mut path = vec![];
97 path.push(Edge {
98 w: 0,
99 unk: 0,
100 p: 0,
101 etype: EdgeType::Init,
102 });
103
104 let mut context = EdgeBuildingContext {
105 text,
106 i: 0,
107 ch: '\0',
108 left_boundary: 0,
109 best_edge: None,
110 };
111
112 let text_len = text.len();
113 for i in 0..text_len {
114 context.ch = text[i];
115 context.i = i;
116 context.best_edge = None;
117 for builder in &mut builders {
118 let edge = builder.build(&context, &path);
119 if !should_skip_edge(&edge, i, text_len, clusters)
120 && Edge::better(&edge, &context.best_edge)
121 {
122 context.best_edge = edge
123 }
124 }
125 path.push(context.best_edge.unwrap());
126 if !context.best_edge.unwrap().is_unk() {
127 context.left_boundary = i + 1;
128 }
129 }
130 path
131}
132
/// An edge of the word-candidate DAG: a span `[s, e)` in character indices
/// together with the kind of evidence that produced it.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    /// Start character index (inclusive).
    pub s: usize,
    /// End character index (exclusive).
    pub e: usize,
    /// What produced this edge (`Init`, `Dict`, `Unk`, ...).
    pub etype: EdgeType,
}
139
/// Word-candidate DAG: `dag[e]` holds every edge that ends at character
/// boundary `e`.
pub type Dag = Vec<Vec<DagEdge>>;

/// A builder that, for the current context position, emits every DAG edge
/// ending at that position.
pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
145
impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
    /// Advances the dictionary matcher by one character and returns a `Dict`
    /// edge for every active pointer that currently sits on a complete word.
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
        // Start a new match attempt at the current position, then advance
        // all in-flight attempts by the current character.
        self.add_pointer(context);
        self.update_pointers(context);
        self.pointers
            .iter()
            .filter(|p| p.is_final) // pointer has consumed a full dictionary word
            .map(|p| DagEdge {
                s: p.s,
                e: context.i + 1,
                etype: EdgeType::Dict,
            })
            .collect()
    }
}
162
163pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
164 let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
165
166 let mut dag = Vec::with_capacity(text.len() + 1);
167
168 for _ in 0..text.len() + 1 {
169 dag.push(vec![]);
170 }
171 dag[0].push(DagEdge {
172 s: 0,
173 e: 0,
174 etype: EdgeType::Init,
175 });
176 let mut context = EdgeBuildingContext {
177 text: &text,
178 i: 0,
179 ch: '\0',
180 left_boundary: 0,
181 best_edge: None,
182 };
183
184 for i in 0..text.len() {
185 context.ch = text[i];
186 context.i = i;
187 context.best_edge = None;
188
189 for builder in &mut builders {
190 for edge in builder.build_dag_edges(&context) {
191 dag[edge.e].push(edge)
192 }
193 }
194 }
195
196 let mut left_boundary = 0;
197 for i in 1..text.len() + 1 {
198 if dag[i].len() == 0 {
199 dag[i].push(DagEdge {
200 s: left_boundary,
201 e: i,
202 etype: EdgeType::Unk,
203 });
204 } else {
205 left_boundary = i;
206 }
207 }
208
209 dag
210}
211
212pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
213 let len = path.len();
214
215 if len == 0 {
216 return vec![];
217 }
218
219 let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
220 let mut e = len - 1;
221 while e > 0 {
222 let edge = &path[e];
223 let s = edge.p;
224 ranges.push(TextRange { s, e });
225 e = s;
226 }
227 ranges.reverse();
228 ranges
229}
230
231pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
232 let char_ranges = path_to_ranges(path);
233 let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
234 let mut global_byte_offset = 0;
235 for r in char_ranges {
236 let mut word_byte_offset = 0;
237 for i in r.s..r.e {
238 word_byte_offset += text[i].len_utf8();
239 }
240 ranges.push(TextRange {
241 s: global_byte_offset,
242 e: global_byte_offset + word_byte_offset,
243 });
244 global_byte_offset += word_byte_offset;
245 }
246 ranges
247}
248
249pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
250 let ranges = path_to_ranges(path);
251 let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
252 for r in ranges {
253 let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
254 for i in r.s..r.e {
255 buf.push(text[i]);
256 }
257 str_vec.push(buf)
258 }
259 str_vec
260}
261
/// A dictionary-based word tokenizer.
pub struct Wordcut {
    // Dictionary prefix tree of known words.
    dict: Dict,
    // Optional cluster rules: matched spans must never be split by a token
    // boundary.
    cluster_re: Option<ClusterRulesMatcher>,
    // Split rules: matched spans (whitespace, Latin runs, digits, ...) become
    // standalone tokens.
    split_re: SplitRulesMatcher,
}
267
268impl Wordcut {
269 pub fn new(dict: Dict) -> Wordcut {
270 Wordcut {
271 dict,
272 cluster_re: None,
273 split_re: DEFAULT_THAI_SPLIT_RE.clone(),
274 }
275 }
276
277 pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
278 Wordcut {
279 dict,
280 cluster_re: Some(cluster_re),
281 split_re: DEFAULT_THAI_SPLIT_RE.clone(),
282 }
283 }
284
285 pub fn new_with_cluster_re_and_split_re(
286 dict: Dict,
287 cluster_re: ClusterRulesMatcher,
288 split_re: SplitRulesMatcher,
289 ) -> Wordcut {
290 Wordcut {
291 dict,
292 cluster_re: Some(cluster_re),
293 split_re,
294 }
295 }
296
297 #[inline]
298 pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
299 let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
300 let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
301 let mut unk_edge_builder = UnkEdgeBuilder::new();
302 let mut rule_based_edge_builder =
303 RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
304 let builders: Vec<&mut dyn EdgeBuilder> = vec![
305 &mut dict_edge_builder,
306 &mut unk_edge_builder,
307 &mut rule_based_edge_builder,
308 ];
309
310 let clusters = if let Some(cluster_re) = &self.cluster_re {
311 find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
312 } else {
313 let mut clusters = vec![];
314 clusters.resize(text_chars.len() + 1, 0);
315 clusters
316 };
317 build_path_with_clusters(builders, &clusters, text_chars)
318 }
319
320 #[allow(dead_code)]
321 pub fn segment(&self, text: &str) -> Vec<TextRange> {
322 let text_chars: Vec<char> = text.chars().collect();
323 let path = self.build_path(text, &text_chars);
324 path_to_ranges(&path)
325 }
326
327 pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
328 let text_chars: Vec<char> = text.chars().collect();
329 let path = self.build_path(text, &text_chars);
330 path_to_byte_ranges(&path, &text_chars)
331 }
332
333 pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
334 let text_chars: Vec<char> = text.chars().collect();
335 let path = self.build_path(text, &text_chars);
336 path_to_str_vec(&path, &text_chars)
337 }
338
339 pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
340 self.segment_into_strings(text).join(delim)
341 }
342
343 #[allow(dead_code)]
344 pub fn build_dag(&self, text: &str) -> Dag {
345 build_dag(&self.dict, &text.chars().collect())
346 }
347}
348
/// Maps each byte offset of `text` to the index of the character starting at
/// that offset. UTF-8 continuation bytes get the filler value 0 — only
/// offsets that are character boundaries are meant to be looked up. A final
/// entry maps `text.len()` to the total character count.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let mut byte_to_char_map = Vec::with_capacity(text.len() + 1);
    let mut char_idx = 0;
    for b in text.bytes() {
        // Continuation bytes look like 0b10xxxxxx; everything else starts a
        // character (equivalent to the signed comparison `(b as i8) >= -0x40`).
        if b & 0xC0 == 0x80 {
            byte_to_char_map.push(0);
        } else {
            byte_to_char_map.push(char_idx);
            char_idx += 1;
        }
    }
    byte_to_char_map.push(char_idx);
    byte_to_char_map
}
363
/// An active anchored DFA run used by `find_cluster_path`.
#[derive(Debug)]
pub struct ClusterPointer {
    // Current DFA state of this run.
    state_id: StateID,
    // Byte index where this run started.
    p: usize,
}
369
/// An edge in the byte-level cluster path built by `find_cluster_path`.
#[derive(Debug)]
pub struct ClusterEdge {
    // Total rule-matched bytes accumulated along the best path so far.
    acc_pat_len: usize,
    // Total unmatched ("unknown") bytes accumulated along the best path so far.
    unk_cnt: usize,
    // Byte index where this edge starts (its predecessor boundary).
    p: usize,
    // True when this edge covers bytes no cluster rule matched.
    is_unk: bool,
}
377
/// Finds the best cover of `text` by cluster-rule matches, byte by byte.
///
/// Returns one `ClusterEdge` per byte boundary (`text.len() + 1` entries
/// including the initial sentinel at index 0); the edge at index `i` spans
/// bytes `edge.p..i`. At each byte a new anchored DFA run is started and all
/// live runs advance; any run whose state accepts proposes an edge.
/// Candidates are ranked by fewer accumulated unknown bytes first, then by
/// longer accumulated matched length. Bytes no rule matches become "unknown"
/// edges reaching back to the last boundary that ended with a match.
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    // Live DFA runs, one per start position that has not yet hit a dead state.
    let mut pointers = vec![];
    let mut ch_index = 0;
    let mut path = vec![];
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        // Start a fresh anchored run at the current byte.
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        // Advance every run one byte, compacting surviving runs in place.
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                // A match ending at this byte proposes an edge from the run's
                // start boundary.
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    // Keep the candidate with fewer unknown bytes; break ties
                    // by preferring the longer matched length.
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        // No rule match ends here: fall back to an unknown edge reaching back
        // to the last boundary that ended with a known edge.
        if best_edge.is_none() {
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}
447
448pub fn find_clusters(
449 text: &str,
450 byte_to_char_idx_map: &[usize],
451 dfa: &ClusterRulesMatcher,
452 len: usize,
453) -> Vec<usize> {
454 let mut clusters = vec![];
455 clusters.resize(len, 0);
456 let mut id = 1;
457 let path = find_cluster_path(dfa, text);
458 let mut me = path.len() - 1;
459 while me > 0 {
460 let edge = &path[me];
461 let ms = edge.p;
462 let s = byte_to_char_idx_map[ms];
463 let e = byte_to_char_idx_map[me];
464 if !edge.is_unk {
465 for i in s..e {
466 clusters[i] = id;
467 }
468 id += 1;
469 }
470 me = ms;
471 }
472 clusters
473}
474
/// Reads a wordlist file with one word per line.
///
/// Returns `Err` for any I/O failure, including failures while reading
/// individual lines (the previous version panicked on a mid-file read error
/// despite already returning `io::Result`).
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let f = File::open(path.as_ref())?;
    let reader = io::BufReader::new(f);
    // Collecting an iterator of io::Result<String> short-circuits on the
    // first error instead of unwrapping.
    reader.lines().collect()
}
480
481pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
482 let wordlist = load_wordlist(path)?;
483 let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
484 Ok(create_prefix_tree(&wordlist))
485}
486
487pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
488 let f = File::open(path)
489 .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
490 let f = io::BufReader::new(f);
491 let mut rules = vec![];
492 for line in f.lines() {
493 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
494 rules.push(format!("({})", line.trim()));
495 }
496 let rules = rules.join("|");
497 let dfa =
498 dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
499 Ok(dfa)
500}
501
502pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
503 let f = File::open(path)
504 .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
505 let f = io::BufReader::new(f);
506 let mut rules = vec![];
507 for line in f.lines() {
508 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
509 rules.push(format!("({})", line.trim()));
510 }
511 let rules = rules.join("|");
512 Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
513}
514
#[cfg(test)]
mod tests {
    //! End-to-end tests over the bundled `data/` wordlists and rule files,
    //! plus unit tests for the prefix tree, DAG construction, clustering,
    //! and the rule-based edge builder.
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    // A stored single-char word is found with its payload; others return None.
    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    // Longest-match segmentation into character ranges.
    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation expressed as UTF-8 byte ranges (Thai chars are 3 bytes).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation expressed as owned strings.
    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    // Tokens joined by a delimiter string.
    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Wordlist loading reads one word per line from the bundled data file.
    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    // Full pipeline: load dict from disk, then segment.
    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Replacement rules normalize decomposed sara am before segmentation.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    // Replacement applies to every occurrence, not just the first.
    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    // The default split rules make a Latin run a standalone token.
    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    // Whitespace between dictionary words becomes its own token.
    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา มา", "|"),
            String::from("กา| |มา")
        )
    }

    // Whitespace splitting also works when the surrounding runs are unknown words.
    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ ยย", "|"),
            String::from("แแ| |ยย")
        )
    }

    // The “ quotation mark in the default split rules separates cleanly.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    // The DAG holds every dictionary edge plus Unk gap fillers, bucketed by
    // end boundary.
    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // Same DAG via the free function instead of the Wordcut method.
    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // Empty input yields a DAG with only the Init edge bucket.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // DagEdge serde derives produce the expected JSON shape.
    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids are assigned right-to-left (rightmost cluster gets id 1);
    // unclustered characters stay 0.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    // Cluster rules keep the unknown cluster แฐแกกก์ intact during segmentation.
    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    // Transliterated name: clusters must not be broken by the segmentation.
    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    // Another loanword case exercising cluster-constrained segmentation.
    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    // The engine also works for Khmer with its own cluster rules and dictionary.
    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Steps the rule-based builder through " ABก" position by position:
    // edges appear only where a split-rule match ends.
    #[test]
    fn test_rule_based_edge_builder() {
        let text = " ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    // Custom split rules loaded from disk drive the segmentation of
    // punctuation-heavy Latin/digit input.
    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB||| |||X|||(|||A|||)|||/|||12")
        );
    }

    // The cluster path has one edge per byte (15 bytes + sentinel = 16) and
    // the final edge starts at byte 9.
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}
974}