1pub mod replacer;
2
3#[macro_use]
4extern crate lazy_static;
5extern crate prefixtree;
6#[macro_use]
7extern crate serde_derive;
8
9use self::prefixtree::{prefix_tree_from_str, PrefixTree};
10use regex_automata::dfa::dense;
11use regex_automata::dfa::Automaton;
12use regex_automata::meta::Regex;
13use regex_automata::util::primitives::StateID;
14use regex_automata::util::start;
15use regex_automata::Anchored;
16use std::fs::File;
17use std::io;
18use std::io::BufRead;
19use std::iter::Peekable;
20use std::path::Path;
21use thiserror::Error;
22
/// Builds a `&'static Path` to `<crate root>/data/<filename>`, resolved at
/// compile time via `CARGO_MANIFEST_DIR`.
macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}

/// Same as `insert_prefix!` but expands to a `&'static str` instead of a `&Path`.
macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}
34
/// Path to the bundled default (mixed-language) word list.
pub fn default_dict_path() -> &'static Path {
    insert_prefix!("mixed-wordlist.txt")
}

/// Path to the bundled Thai character-cluster rules, as an owned `String`.
pub fn thai_cluster_path() -> Option<String> {
    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}

/// Path to the bundled Thai replace rules (JSON), as an owned `String`.
pub fn thai_replace_rules_path() -> Option<String> {
    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}
46
/// Dictionary: a prefix tree keyed by `char`; the boolean marks word-final nodes.
pub type Dict = PrefixTree<char, bool>;

// Dense DFA compiled from cluster rules, stepped byte-by-byte in `find_cluster_path`.
type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
// Meta regex that forces split points (whitespace, Latin runs, digit runs, ...).
type SplitRulesMatcher = Regex;

lazy_static! {
    // Default split pattern: whitespace runs, Latin-letter runs, Arabic-digit
    // runs, Thai-digit runs, and the left double quotation mark.
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}

lazy_static! {
    // Anchored start configuration: cluster-rule DFA matches must begin exactly
    // at the position the scan seeds them.
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}
60
/// Errors raised while loading and compiling cluster/split rule files.
#[derive(Error, Debug)]
pub enum WordcutError {
    /// The cluster-rules file could not be opened; payload is the path.
    #[error("Cannot open cluster rules at `{0}`")]
    CannotOpenClusterRulesAt(String),
    /// A line of a rules file could not be read.
    /// NOTE(review): also reused by `load_split_rules` — there is no
    /// split-specific read variant.
    #[error("Cannot read a cluster rule")]
    CannotReadClusterRule,
    /// The joined cluster pattern failed to compile; payload is the pattern.
    #[error("Cannot compile cluster rules `{0}`")]
    CannotCompileClusterRules(String),
    /// The split-rules file could not be opened; payload is the path.
    #[error("Cannot open split rules at `{0}`")]
    CannotOpenSplitRulesAt(String),
    /// The joined split pattern failed to compile; payload is the pattern.
    #[error("Cannot compile split rules `{0}`")]
    CannotCompileSplitRules(String),
}
74
75pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
76 let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
77 prefix_tree_from_str(&words_payloads[..])
78}
79
/// Kind of a segmentation edge.
#[derive(Clone, PartialEq, Eq, Copy, Debug, Serialize, Deserialize)]
pub enum EdgeType {
    /// Synthetic edge at position 0 that anchors the path.
    Init,
    /// Edge covering a dictionary word.
    Dict,
    /// Edge covering an out-of-dictionary stretch.
    Unk,
    /// Edge produced by a split-rule (pattern) match.
    Pat,
}

/// One edge of the best-path lattice. `path[i]` stores the best edge ending at
/// character position `i`; `p` points back to its start position.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Edge {
    /// Number of segments along the best path ending with this edge.
    pub w: usize,
    /// Number of unknown segments along the best path ending with this edge.
    pub unk: usize,
    /// Start position (index of the source node in the path).
    pub p: usize,
    /// Edge kind.
    pub etype: EdgeType,
}
95
96impl Edge {
97 pub fn is_unk(&self) -> bool {
98 self.etype == EdgeType::Unk
99 }
100
101 pub fn better_than(&self, o: &Edge) -> bool {
102 if self.etype == EdgeType::Pat && o.etype == EdgeType::Unk {
103 return true;
104 }
105
106 if self.etype == EdgeType::Unk && o.etype == EdgeType::Pat {
107 return false;
108 }
109 if self.unk < o.unk {
110 return true;
111 }
112
113 if self.unk > o.unk {
114 return false;
115 }
116
117 if self.w < o.w {
118 return true;
119 }
120
121 if self.w > o.w {
122 return false;
123 }
124
125 if o.is_unk() && !self.is_unk() {
126 return true;
127 }
128
129 false
130 }
131
132 pub fn better(a: &Option<Edge>, b: &Option<Edge>) -> bool {
133 if a.is_none() {
134 return false;
135 }
136
137 if b.is_none() {
138 return true;
139 }
140
141 a.unwrap().better_than(&b.unwrap())
142 }
143}
144
/// Strategy interface: proposes the best edge ending at the current position,
/// or `None` when the strategy has nothing to offer there.
pub trait EdgeBuilder {
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
}

/// Per-position scan state shared by every `EdgeBuilder`.
#[derive(Debug)]
pub struct EdgeBuildingContext<'a> {
    /// The whole input as characters.
    pub text: &'a [char],
    /// Index of the character currently being processed.
    pub i: usize,
    /// The character at index `i`.
    pub ch: char,
    /// Position just after the last non-unknown edge, i.e. the start of the
    /// current unknown run.
    pub left_boundary: usize,
    /// Best edge found so far for position `i` in this round.
    pub best_edge: Option<Edge>,
}
157
/// Fallback `EdgeBuilder` that emits an unknown edge when no other builder
/// produced anything for the current position.
pub struct UnkEdgeBuilder {}

impl UnkEdgeBuilder {
    /// Creates a (stateless) unknown-edge builder.
    pub fn new() -> UnkEdgeBuilder {
        UnkEdgeBuilder {}
    }
}

// Clippy `new_without_default`: a no-argument `new` should be mirrored by
// `Default` so the type composes with derives and `..Default::default()`.
impl Default for UnkEdgeBuilder {
    fn default() -> Self {
        UnkEdgeBuilder::new()
    }
}
165
166impl EdgeBuilder for UnkEdgeBuilder {
167 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
168 if context.best_edge.is_some() {
169 return None;
170 }
171
172 let source = path[context.left_boundary];
173 Some(Edge {
174 p: context.left_boundary,
175 etype: EdgeType::Unk,
176 unk: source.unk + 1,
177 w: source.w + 1,
178 })
179 }
180}
181
/// Cursor walking the dictionary prefix tree while the text is scanned.
#[derive(Clone)]
struct Pointer {
    // Current node in the prefix tree.
    node_id: usize,
    // Character index in the text where this candidate word starts.
    s: usize,
    // Characters consumed since `s` (depth in the tree).
    offset: usize,
    // Whether the current node completes a dictionary word.
    is_final: bool,
}
189
190impl Pointer {
191 fn update(&mut self, dict: &Dict, ch: char) -> bool {
192 match dict.seek(&(self.node_id as u32, self.offset as u32, ch)) {
193 None => false,
194 Some(&(child_id, is_final, _)) => {
195 self.node_id = child_id as usize;
196 self.is_final = is_final;
197 self.offset += 1;
198 true
199 }
200 }
201 }
202
203 fn gen_edge(&self, path: &[Edge]) -> Edge {
204 let source = path[self.s];
205 Edge {
206 etype: EdgeType::Dict,
207 p: self.s,
208 w: source.w + 1,
209 unk: source.unk,
210 }
211 }
212}
213
/// `EdgeBuilder` that proposes edges for dictionary words, tracking every
/// candidate word start with a set of prefix-tree pointers.
pub struct DictEdgeBuilder<'a> {
    dict: &'a Dict,
    pointers: Vec<Pointer>,
}
218
219impl<'a> DictEdgeBuilder<'a> {
220 pub fn new(dict: &Dict) -> DictEdgeBuilder {
221 const MAX_SIZE: usize = 0xFF;
222 DictEdgeBuilder {
223 dict,
224 pointers: Vec::with_capacity(MAX_SIZE),
225 }
226 }
227
228 fn add_pointer(&mut self, context: &EdgeBuildingContext) {
229 self.pointers.push(Pointer {
230 node_id: 0,
231 offset: 0,
232 is_final: false,
233 s: context.i,
234 });
235 }
236
237 fn update_pointers(&mut self, context: &EdgeBuildingContext) {
238 let mut j = 0;
239 for i in 0..self.pointers.len() {
240 let valid = self.pointers[i].update(self.dict, context.ch);
241 if valid {
242 if j < i {
243 self.pointers[j] = self.pointers[i].clone()
244 }
245 j += 1
246 }
247 }
248 self.pointers.truncate(j);
249 }
250
251 fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
252 let mut best_edge: Option<Edge> = None;
253 for pointer in pointers {
254 if pointer.is_final {
255 let edge = pointer.gen_edge(path);
256 if best_edge.is_none() {
257 best_edge = Some(edge)
258 } else if edge.better_than(&best_edge.unwrap()) {
259 best_edge = Some(edge)
260 }
261 }
262 }
263 best_edge
264 }
265}
266
impl<'a> EdgeBuilder for DictEdgeBuilder<'a> {
    /// Opens a new candidate at the current position, advances all candidates
    /// by the current character, then returns the best completed-word edge.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        self.add_pointer(context);
        self.update_pointers(context);
        self.gen_edge(&self.pointers, path)
    }
}

/// `EdgeBuilder` that emits `Pat` edges for precomputed split-rule matches,
/// consumed left-to-right through a peekable iterator.
pub struct RuleBasedEdgeBuilder {
    range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
}
278
279impl RuleBasedEdgeBuilder {
280 pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
281 let mut ranges = vec![];
282 for m in re.find_iter(text.as_bytes()) {
283 let ms = m.start();
284 let me = m.end();
285 let s = byte_to_char_idx_map[ms];
286 let e = byte_to_char_idx_map[me];
287 ranges.push(TextRange { s, e });
288 }
289 RuleBasedEdgeBuilder {
290 range_peekable: ranges.into_iter().peekable(),
291 }
292 }
293}
294
295impl EdgeBuilder for RuleBasedEdgeBuilder {
296 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
297 loop {
298 if let Some(r) = self.range_peekable.peek() {
299 if context.i >= r.e {
300 self.range_peekable.next();
301 } else {
302 break;
303 }
304 } else {
305 return None;
306 }
307 }
308 if let Some(r) = self.range_peekable.peek() {
309 if r.e != context.i + 1 {
310 return None;
311 }
312 let source = path[r.s];
313 Some(Edge {
314 etype: EdgeType::Pat,
315 p: r.s,
316 w: source.w + 1,
317 unk: source.unk,
318 })
319 } else {
320 None
321 }
322 }
323}
324
/// True when the span `[s, e)` starts and ends on cluster boundaries, i.e. it
/// does not cut any cluster (cluster id 0 means "not in a cluster").
#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    let start_ok = s == 0 || clusters[s] == 0 || clusters[s] != clusters[s - 1];
    let end_ok = e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1];
    start_ok && end_ok
}
330
331#[inline]
332fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
333 let mut skip_edge = false;
334 if let Some(edge) = edge {
335 let s = edge.p;
336 let e = i + 1;
337 skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
338 }
339 skip_edge
340}
341
/// Runs all edge builders over `text` and returns the best-path lattice.
///
/// `path[i]` is the best edge ending at character position `i`; `path[0]` is a
/// synthetic `Init` edge. `clusters[i]` is the cluster id of character `i`
/// (0 = not clustered); known edges that would split a cluster are rejected
/// via `should_skip_edge`.
fn build_path_with_clusters(
    mut builders: Vec<&mut dyn EdgeBuilder>,
    clusters: &[usize],
    text: &[char],
) -> Vec<Edge> {
    let mut path = vec![];
    path.push(Edge {
        w: 0,
        unk: 0,
        p: 0,
        etype: EdgeType::Init,
    });

    let mut context = EdgeBuildingContext {
        text,
        i: 0,
        ch: '\0',
        left_boundary: 0,
        best_edge: None,
    };

    let text_len = text.len();
    for i in 0..text_len {
        context.ch = text[i];
        context.i = i;
        context.best_edge = None;
        for builder in &mut builders {
            let edge = builder.build(&context, &path);
            if !should_skip_edge(&edge, i, text_len, clusters)
                && Edge::better(&edge, &context.best_edge)
            {
                context.best_edge = edge
            }
        }
        // NOTE(review): unwrap assumes at least one builder always yields an
        // edge; UnkEdgeBuilder fills in whenever best_edge is still None.
        path.push(context.best_edge.unwrap());
        // A non-unknown edge closes the current unknown run.
        if !context.best_edge.unwrap().is_unk() {
            context.left_boundary = i + 1;
        }
    }
    path
}
383
/// Edge of the word DAG: half-open character range `[s, e)`.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    pub s: usize,
    pub e: usize,
    pub etype: EdgeType,
}

/// Word DAG: `dag[e]` lists every edge ending at character position `e`.
pub type Dag = Vec<Vec<DagEdge>>;

/// Strategy interface that yields all DAG edges ending at the current position.
pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
396
397impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
398 fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
399 self.add_pointer(context);
400 self.update_pointers(context);
401 self.pointers
403 .iter()
404 .filter(|p| p.is_final)
405 .map(|p| DagEdge {
406 s: p.s,
407 e: context.i + 1,
408 etype: EdgeType::Dict,
409 })
410 .collect()
411 }
412}
413
414pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
415 let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
416
417 let mut dag = Vec::with_capacity(text.len() + 1);
418
419 for _ in 0..text.len() + 1 {
420 dag.push(vec![]);
421 }
422 dag[0].push(DagEdge {
423 s: 0,
424 e: 0,
425 etype: EdgeType::Init,
426 });
427 let mut context = EdgeBuildingContext {
428 text: &text,
429 i: 0,
430 ch: '\0',
431 left_boundary: 0,
432 best_edge: None,
433 };
434
435 for i in 0..text.len() {
436 context.ch = text[i];
437 context.i = i;
438 context.best_edge = None;
439
440 for builder in &mut builders {
441 for edge in builder.build_dag_edges(&context) {
442 dag[edge.e].push(edge)
443 }
444 }
445 }
446
447 let mut left_boundary = 0;
448 for i in 1..text.len() + 1 {
449 if dag[i].len() == 0 {
450 dag[i].push(DagEdge {
451 s: left_boundary,
452 e: i,
453 etype: EdgeType::Unk,
454 });
455 } else {
456 left_boundary = i;
457 }
458 }
459
460 dag
461}
462
/// Half-open range `[s, e)`; used for both character and byte offsets.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct TextRange {
    pub s: usize,
    pub e: usize,
}
468
469pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
470 let len = path.len();
471
472 if len == 0 {
473 return vec![];
474 }
475
476 let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
477 let mut e = len - 1;
478 while e > 0 {
479 let edge = &path[e];
480 let s = edge.p;
481 ranges.push(TextRange { s, e });
482 e = s;
483 }
484 ranges.reverse();
485 ranges
486}
487
488pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
489 let char_ranges = path_to_ranges(path);
490 let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
491 let mut global_byte_offset = 0;
492 for r in char_ranges {
493 let mut word_byte_offset = 0;
494 for i in r.s..r.e {
495 word_byte_offset += text[i].len_utf8();
496 }
497 ranges.push(TextRange {
498 s: global_byte_offset,
499 e: global_byte_offset + word_byte_offset,
500 });
501 global_byte_offset += word_byte_offset;
502 }
503 ranges
504}
505
506pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
507 let ranges = path_to_ranges(path);
508 let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
509 for r in ranges {
510 let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
511 for i in r.s..r.e {
512 buf.push(text[i]);
513 }
514 str_vec.push(buf)
515 }
516 str_vec
517}
518
/// Word segmenter: a dictionary plus optional cluster rules (spans that must
/// not be split) and split rules (spans forced to become their own tokens).
pub struct Wordcut {
    dict: Dict,
    cluster_re: Option<ClusterRulesMatcher>,
    split_re: SplitRulesMatcher,
}
524
impl Wordcut {
    /// Creates a segmenter with the default Thai split rules and no cluster rules.
    pub fn new(dict: Dict) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: None,
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    /// Creates a segmenter with cluster rules and the default Thai split rules.
    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    /// Creates a segmenter with explicit cluster and split rules.
    pub fn new_with_cluster_re_and_split_re(
        dict: Dict,
        cluster_re: ClusterRulesMatcher,
        split_re: SplitRulesMatcher,
    ) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re,
        }
    }

    /// Computes the best segmentation path for `text`.
    ///
    /// `text_chars` must be `text.chars().collect()`: split/cluster rules
    /// match on bytes while the path works on characters, so both views are
    /// needed (bridged by the byte→char index map).
    #[inline]
    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
        let mut unk_edge_builder = UnkEdgeBuilder::new();
        let mut rule_based_edge_builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
        let builders: Vec<&mut dyn EdgeBuilder> = vec![
            &mut dict_edge_builder,
            &mut unk_edge_builder,
            &mut rule_based_edge_builder,
        ];

        let clusters = if let Some(cluster_re) = &self.cluster_re {
            find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
        } else {
            // NOTE(review): this fallback is one element longer than
            // find_clusters' output; the extra slot appears harmless since
            // does_not_break_cluster never indexes past e == text_len - 1.
            let mut clusters = vec![];
            clusters.resize(text_chars.len() + 1, 0);
            clusters
        };
        build_path_with_clusters(builders, &clusters, text_chars)
    }

    /// Segments `text`, returning character-offset ranges.
    #[allow(dead_code)]
    pub fn segment(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_ranges(&path)
    }

    /// Segments `text`, returning UTF-8 byte-offset ranges.
    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_byte_ranges(&path, &text_chars)
    }

    /// Segments `text`, returning the words as owned strings.
    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_str_vec(&path, &text_chars)
    }

    /// Segments `text` and joins the words with `delim`.
    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
        self.segment_into_strings(text).join(delim)
    }

    /// Builds the word DAG for `text` using this segmenter's dictionary.
    #[allow(dead_code)]
    pub fn build_dag(&self, text: &str) -> Dag {
        build_dag(&self.dict, &text.chars().collect())
    }
}
605
/// Maps each UTF-8 byte offset of `text` to its character index, with one
/// trailing entry equal to the total character count.
///
/// Only entries at character boundaries are meaningful: continuation bytes
/// are filled with 0, and callers index this map with match boundaries only.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.len() + 1);
    let mut char_idx = 0;
    for &b in text.as_bytes() {
        // A byte starts a character unless it is a UTF-8 continuation byte
        // (0b10xx_xxxx).
        if b & 0xC0 != 0x80 {
            map.push(char_idx);
            char_idx += 1;
        } else {
            map.push(0);
        }
    }
    map.push(char_idx);
    map
}
620
/// Live DFA state during cluster matching; `p` is the byte index where this
/// candidate match started.
#[derive(Debug)]
pub struct ClusterPointer {
    state_id: StateID,
    p: usize,
}

/// Edge of the byte-level cluster path.
#[derive(Debug)]
pub struct ClusterEdge {
    // Total bytes covered by rule matches along the best path so far.
    acc_pat_len: usize,
    // Total bytes not covered by any rule along the best path so far.
    unk_cnt: usize,
    // Byte index where this edge starts.
    p: usize,
    // Whether this edge covers unmatched bytes.
    is_unk: bool,
}
634
/// Byte-level dynamic program over the cluster-rule DFA.
///
/// Returns one `ClusterEdge` per byte of `text` plus a synthetic first edge:
/// `path[i + 1]` is the best edge ending after byte `i`, preferring fewer
/// unmatched bytes and then longer total rule coverage.
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    let mut pointers = vec![];
    let mut ch_index = 0;
    let mut path = vec![];
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        // A match may begin at any byte: seed a fresh anchored DFA state here.
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        // Advance all live pointers; survivors are compacted to the front.
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                // Feed EOI to test whether a rule could end exactly here.
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    // Prefer fewer unmatched bytes, then more pattern coverage.
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        // No rule ends at this byte: extend an unknown edge from the last
        // boundary that followed a matched edge.
        if best_edge.is_none() {
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}
704
/// Maps each character of `text` to a cluster id; 0 means "not in a cluster".
///
/// Walks the byte-level cluster path backwards; every matched (non-unknown)
/// edge gets a fresh id (starting at 1 from the end of the text), with byte
/// offsets translated to character offsets via `byte_to_char_idx_map`.
/// Callers only compare adjacent ids for equality, so the ordering of ids is
/// irrelevant. `len` is the character count of `text`.
pub fn find_clusters(
    text: &str,
    byte_to_char_idx_map: &[usize],
    dfa: &ClusterRulesMatcher,
    len: usize,
) -> Vec<usize> {
    let mut clusters = vec![];
    clusters.resize(len, 0);
    let mut id = 1;
    let path = find_cluster_path(dfa, text);
    // `me` is a path index, i.e. a byte-offset + 1; follow back-pointers.
    let mut me = path.len() - 1;
    while me > 0 {
        let edge = &path[me];
        let ms = edge.p;
        let s = byte_to_char_idx_map[ms];
        let e = byte_to_char_idx_map[me];
        if !edge.is_unk {
            for i in s..e {
                clusters[i] = id;
            }
            id += 1;
        }
        me = ms;
    }
    clusters
}
731
/// Reads a word list, one word per line.
///
/// # Errors
/// Returns any I/O error from opening or reading the file. (The previous
/// version called `unwrap()` on each line and panicked on a read error even
/// though the function already returns `io::Result`.)
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let f = File::open(path.as_ref())?;
    let f = io::BufReader::new(f);
    // Collecting Iterator<Item = io::Result<String>> into io::Result<Vec<_>>
    // short-circuits on the first read error instead of panicking.
    f.lines().collect()
}
737
738pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
739 let wordlist = load_wordlist(path)?;
740 let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
741 Ok(create_prefix_tree(&wordlist))
742}
743
744pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
745 let f = File::open(path)
746 .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
747 let f = io::BufReader::new(f);
748 let mut rules = vec![];
749 for line in f.lines() {
750 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
751 rules.push(format!("({})", line.trim()));
752 }
753 let rules = rules.join("|");
754 let dfa =
755 dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
756 Ok(dfa)
757}
758
759pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
760 let f = File::open(path)
761 .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
762 let f = io::BufReader::new(f);
763 let mut rules = vec![];
764 for line in f.lines() {
765 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
766 rules.push(format!("({})", line.trim()));
767 }
768 let rules = rules.join("|");
769 Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
770}
771
#[cfg(test)]
mod tests {
    //! Unit tests. Most rely on dictionary and rule files shipped under
    //! `data/` in the crate root.
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    // Seeking (node 0, offset 0, ch) yields (child id, is-word-final, payload).
    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation as above, but in UTF-8 byte offsets (3 bytes per Thai char).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Normalizing the decomposed sara am via the replacer enables dictionary hits.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    // Latin runs are split out by the default split rules.
    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา มา", "|"),
            String::from("กา| |มา")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ ยย", "|"),
            String::from("แแ| |ยย")
        )
    }

    // The left double quotation mark is a split rule of its own.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // Empty text still yields the synthetic Init bucket.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids are assigned from the end of the text; 0 = unclustered.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Drives RuleBasedEdgeBuilder position by position; an edge is proposed
    // only where a split-rule match ends (after the space at i=0, and after
    // the Latin run "AB" at i=2 -> observed at i=3 via r.e == i + 1).
    #[test]
    fn test_rule_based_edge_builder() {
        let text = " ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB||| |||X|||(|||A|||)|||/|||12")
        );
    }

    // The cluster path has one edge per byte plus the synthetic first edge
    // ("เกียำ" is 5 chars x 3 bytes = 15 bytes -> 16 entries).
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}