1pub mod replacer;
2
3#[macro_use]
4extern crate lazy_static;
5extern crate prefixtree;
6#[macro_use]
7extern crate serde_derive;
8
9use self::prefixtree::{prefix_tree_from_str, PrefixTree};
10use regex_automata::dfa::dense;
11use regex_automata::dfa::Automaton;
12use regex_automata::meta::Regex;
13use regex_automata::util::primitives::StateID;
14use regex_automata::util::start;
15use regex_automata::Anchored;
16use std::fs::File;
17use std::io;
18use std::io::BufRead;
19use std::iter::Peekable;
20use std::path::Path;
21use thiserror::Error;
22
/// Builds a `&'static Path` to `<crate root>/data/<filename>`, resolved at
/// compile time via `CARGO_MANIFEST_DIR`.
macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}

/// Same as `insert_prefix!` but expands to a `&'static str` instead of a `&Path`.
macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}
34
/// Path to the bundled default (mixed-language) word list.
pub fn default_dict_path() -> &'static Path {
    insert_prefix!("mixed-wordlist.txt")
}

/// Path to the bundled Thai character-cluster rules, as an owned `String`.
pub fn thai_cluster_path() -> Option<String> {
    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}

/// Path to the bundled Thai replace rules (JSON), as an owned `String`.
pub fn thai_replace_rules_path() -> Option<String> {
    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}
46
/// Dictionary: a prefix tree keyed by `char`; the boolean marks word-final nodes.
pub type Dict = PrefixTree<char, bool>;

// Dense DFA compiled from cluster rules, stepped byte-by-byte in `find_cluster_path`.
type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
// Meta regex that forces split points (whitespace, Latin runs, digit runs, ...).
type SplitRulesMatcher = Regex;

lazy_static! {
    // Default split pattern: whitespace runs, Latin-letter runs, Arabic-digit
    // runs, Thai-digit runs, and the left double quotation mark.
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}

lazy_static! {
    // Anchored start configuration: cluster-rule DFA matches must begin exactly
    // at the position the scan seeds them.
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}
60
/// Errors raised while loading and compiling cluster/split rule files.
#[derive(Error, Debug)]
pub enum WordcutError {
    /// The cluster-rules file could not be opened; payload is the path.
    #[error("Cannot open cluster rules at `{0}`")]
    CannotOpenClusterRulesAt(String),
    /// A line of a rules file could not be read.
    /// NOTE(review): also reused by `load_split_rules` — there is no
    /// split-specific read variant.
    #[error("Cannot read a cluster rule")]
    CannotReadClusterRule,
    /// The joined cluster pattern failed to compile; payload is the pattern.
    #[error("Cannot compile cluster rules `{0}`")]
    CannotCompileClusterRules(String),
    /// The split-rules file could not be opened; payload is the path.
    #[error("Cannot open split rules at `{0}`")]
    CannotOpenSplitRulesAt(String),
    /// The joined split pattern failed to compile; payload is the pattern.
    #[error("Cannot compile split rules `{0}`")]
    CannotCompileSplitRules(String),
}
74
75pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
76 let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
77 prefix_tree_from_str(&words_payloads[..])
78}
79
/// Kind of a segmentation edge.
#[derive(Clone, PartialEq, Eq, Copy, Debug, Serialize, Deserialize)]
pub enum EdgeType {
    /// Synthetic edge at position 0 that anchors the path.
    Init,
    /// Edge covering a dictionary word.
    Dict,
    /// Edge covering an out-of-dictionary stretch.
    Unk,
    /// Edge produced by a split-rule (pattern) match.
    Pat,
}

/// One edge of the best-path lattice. `path[i]` stores the best edge ending at
/// character position `i`; `p` points back to its start position.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Edge {
    /// Number of segments along the best path ending with this edge.
    pub w: usize,
    /// Number of unknown segments along the best path ending with this edge.
    pub unk: usize,
    /// Start position (index of the source node in the path).
    pub p: usize,
    /// Edge kind.
    pub etype: EdgeType,
}
95
96impl Edge {
97 pub fn is_unk(&self) -> bool {
98 self.etype == EdgeType::Unk
99 }
100
101 pub fn better_than(&self, o: &Edge) -> bool {
102 if self.etype == EdgeType::Pat && o.etype == EdgeType::Unk {
103 return true;
104 }
105
106 if self.etype == EdgeType::Unk && o.etype == EdgeType::Pat {
107 return false;
108 }
109 if self.unk < o.unk {
110 return true;
111 }
112
113 if self.unk > o.unk {
114 return false;
115 }
116
117 if self.w < o.w {
118 return true;
119 }
120
121 if self.w > o.w {
122 return false;
123 }
124
125 if o.is_unk() && !self.is_unk() {
126 return true;
127 }
128
129 false
130 }
131
132 pub fn better(a: &Option<Edge>, b: &Option<Edge>) -> bool {
133 if a.is_none() {
134 return false;
135 }
136
137 if b.is_none() {
138 return true;
139 }
140
141 a.unwrap().better_than(&b.unwrap())
142 }
143}
144
/// Strategy interface: proposes the best edge ending at the current position,
/// or `None` when the strategy has nothing to offer there.
pub trait EdgeBuilder {
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
}

/// Per-position scan state shared by every `EdgeBuilder`.
#[derive(Debug)]
pub struct EdgeBuildingContext<'a> {
    /// The whole input as characters.
    pub text: &'a [char],
    /// Index of the character currently being processed.
    pub i: usize,
    /// The character at index `i`.
    pub ch: char,
    /// Position just after the last non-unknown edge, i.e. the start of the
    /// current unknown run.
    pub left_boundary: usize,
    /// Best edge found so far for position `i` in this round.
    pub best_edge: Option<Edge>,
}
157
/// Fallback `EdgeBuilder` that emits an unknown edge when no other builder
/// produced anything for the current position.
pub struct UnkEdgeBuilder {}

impl UnkEdgeBuilder {
    /// Creates a (stateless) unknown-edge builder.
    pub fn new() -> UnkEdgeBuilder {
        UnkEdgeBuilder {}
    }
}

// Clippy `new_without_default`: a no-argument `new` should be mirrored by
// `Default` so the type composes with derives and `..Default::default()`.
impl Default for UnkEdgeBuilder {
    fn default() -> Self {
        UnkEdgeBuilder::new()
    }
}
165
166impl EdgeBuilder for UnkEdgeBuilder {
167 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
168 if context.best_edge.is_some() {
169 return None;
170 }
171
172 let source = path[context.left_boundary];
173 Some(Edge {
174 p: context.left_boundary,
175 etype: EdgeType::Unk,
176 unk: source.unk + 1,
177 w: source.w + 1,
178 })
179 }
180}
181
/// Cursor walking the dictionary prefix tree while the text is scanned.
#[derive(Clone)]
struct Pointer {
    // Current node in the prefix tree.
    node_id: usize,
    // Character index in the text where this candidate word starts.
    s: usize,
    // Characters consumed since `s` (depth in the tree).
    offset: usize,
    // Whether the current node completes a dictionary word.
    is_final: bool,
}
189
190impl Pointer {
191 fn update(&mut self, dict: &Dict, ch: char) -> bool {
192 match dict.seek(&(self.node_id as u32, self.offset as u32, ch)) {
193 None => false,
194 Some(&(child_id, is_final, _)) => {
195 self.node_id = child_id as usize;
196 self.is_final = is_final;
197 self.offset += 1;
198 true
199 }
200 }
201 }
202
203 fn gen_edge(&self, path: &[Edge]) -> Edge {
204 let source = path[self.s];
205 Edge {
206 etype: EdgeType::Dict,
207 p: self.s,
208 w: source.w + 1,
209 unk: source.unk,
210 }
211 }
212}
213
/// `EdgeBuilder` that proposes edges for dictionary words, tracking every
/// candidate word start with a set of prefix-tree pointers.
pub struct DictEdgeBuilder<'a> {
    dict: &'a Dict,
    pointers: Vec<Pointer>,
}
218
219impl<'a> DictEdgeBuilder<'a> {
220 pub fn new(dict: &Dict) -> DictEdgeBuilder {
221 const MAX_SIZE: usize = 0xFF;
222 DictEdgeBuilder {
223 dict,
224 pointers: Vec::with_capacity(MAX_SIZE),
225 }
226 }
227
228 fn add_pointer(&mut self, context: &EdgeBuildingContext) {
229 self.pointers.push(Pointer {
230 node_id: 0,
231 offset: 0,
232 is_final: false,
233 s: context.i,
234 });
235 }
236
237 fn update_pointers(&mut self, context: &EdgeBuildingContext) {
238 let mut j = 0;
239 for i in 0..self.pointers.len() {
240 let valid = self.pointers[i].update(self.dict, context.ch);
241 if valid {
242 if j < i {
243 self.pointers[j] = self.pointers[i].clone()
244 }
245 j += 1
246 }
247 }
248 self.pointers.truncate(j);
249 }
250
251 fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
252 let mut best_edge: Option<Edge> = None;
253 for pointer in pointers {
254 if pointer.is_final {
255 let edge = pointer.gen_edge(path);
256 if best_edge.is_none() {
257 best_edge = Some(edge)
258 } else if edge.better_than(&best_edge.unwrap()) {
259 best_edge = Some(edge)
260 }
261 }
262 }
263 best_edge
264 }
265}
266
impl<'a> EdgeBuilder for DictEdgeBuilder<'a> {
    /// Opens a new candidate at the current position, advances all candidates
    /// by the current character, then returns the best completed-word edge.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        self.add_pointer(context);
        self.update_pointers(context);
        self.gen_edge(&self.pointers, path)
    }
}

/// `EdgeBuilder` that emits `Pat` edges for precomputed split-rule matches,
/// consumed left-to-right through a peekable iterator.
pub struct RuleBasedEdgeBuilder {
    range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
}
278
279impl RuleBasedEdgeBuilder {
280 pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
281 let mut ranges = vec![];
282 for m in re.find_iter(text.as_bytes()) {
283 let ms = m.start();
284 let me = m.end();
285 let s = byte_to_char_idx_map[ms];
286 let e = byte_to_char_idx_map[me];
287 ranges.push(TextRange { s, e });
288 }
289 RuleBasedEdgeBuilder {
290 range_peekable: ranges.into_iter().peekable(),
291 }
292 }
293}
294
295impl EdgeBuilder for RuleBasedEdgeBuilder {
296 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
297 loop {
298 if let Some(r) = self.range_peekable.peek() {
299 if context.i >= r.e {
300 self.range_peekable.next();
301 } else {
302 break;
303 }
304 } else {
305 return None;
306 }
307 }
308 if let Some(r) = self.range_peekable.peek() {
309 if r.e != context.i + 1 {
310 return None;
311 }
312 let source = path[r.s];
313 Some(Edge {
314 etype: EdgeType::Pat,
315 p: r.s,
316 w: source.w + 1,
317 unk: source.unk,
318 })
319 } else {
320 None
321 }
322 }
323}
324
/// True when the span `[s, e)` starts and ends on cluster boundaries, i.e. it
/// does not cut any cluster (cluster id 0 means "not in a cluster").
#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    let start_ok = s == 0 || clusters[s] == 0 || clusters[s] != clusters[s - 1];
    let end_ok = e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1];
    start_ok && end_ok
}
330
331#[inline]
332fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
333 let mut skip_edge = false;
334 if let Some(edge) = edge {
335 let s = edge.p;
336 let e = i + 1;
337 skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
338 }
339 skip_edge
340}
341
/// Runs all edge builders over `text` and returns the best-path lattice.
///
/// `path[i]` is the best edge ending at character position `i`; `path[0]` is a
/// synthetic `Init` edge. `clusters[i]` is the cluster id of character `i`
/// (0 = not clustered); known edges that would split a cluster are rejected
/// via `should_skip_edge`.
fn build_path_with_clusters(
    mut builders: Vec<&mut dyn EdgeBuilder>,
    clusters: &[usize],
    text: &[char],
) -> Vec<Edge> {
    let mut path = vec![];
    path.push(Edge {
        w: 0,
        unk: 0,
        p: 0,
        etype: EdgeType::Init,
    });

    let mut context = EdgeBuildingContext {
        text,
        i: 0,
        ch: '\0',
        left_boundary: 0,
        best_edge: None,
    };

    let text_len = text.len();
    for i in 0..text_len {
        context.ch = text[i];
        context.i = i;
        context.best_edge = None;
        for builder in &mut builders {
            let edge = builder.build(&context, &path);
            if !should_skip_edge(&edge, i, text_len, clusters)
                && Edge::better(&edge, &context.best_edge)
            {
                context.best_edge = edge
            }
        }
        // NOTE(review): unwrap assumes at least one builder always yields an
        // edge; UnkEdgeBuilder fills in whenever best_edge is still None.
        path.push(context.best_edge.unwrap());
        // A non-unknown edge closes the current unknown run.
        if !context.best_edge.unwrap().is_unk() {
            context.left_boundary = i + 1;
        }
    }
    path
}
383
/// Edge of the word DAG: half-open character range `[s, e)`.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    pub s: usize,
    pub e: usize,
    pub etype: EdgeType,
}

/// Word DAG: `dag[e]` lists every edge ending at character position `e`.
pub type Dag = Vec<Vec<DagEdge>>;

/// Strategy interface that yields all DAG edges ending at the current position.
pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
396
397impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
398 fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
399 self.add_pointer(context);
400 self.update_pointers(context);
401 self.pointers
403 .iter()
404 .filter(|p| p.is_final)
405 .map(|p| DagEdge {
406 s: p.s,
407 e: context.i + 1,
408 etype: EdgeType::Dict,
409 })
410 .collect()
411 }
412}
413
414pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
415 let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
416
417 let mut dag = Vec::with_capacity(text.len() + 1);
418
419 for _ in 0..text.len() + 1 {
420 dag.push(vec![]);
421 }
422 dag[0].push(DagEdge {
423 s: 0,
424 e: 0,
425 etype: EdgeType::Init,
426 });
427 let mut context = EdgeBuildingContext {
428 text: &text,
429 i: 0,
430 ch: '\0',
431 left_boundary: 0,
432 best_edge: None,
433 };
434
435 for i in 0..text.len() {
436 context.ch = text[i];
437 context.i = i;
438 context.best_edge = None;
439
440 for builder in &mut builders {
441 for edge in builder.build_dag_edges(&context) {
442 dag[edge.e].push(edge)
443 }
444 }
445 }
446
447 let mut left_boundary = 0;
448 for i in 1..text.len() + 1 {
449 if dag[i].len() == 0 {
450 dag[i].push(DagEdge {
451 s: left_boundary,
452 e: i,
453 etype: EdgeType::Unk,
454 });
455 } else {
456 left_boundary = i;
457 }
458 }
459
460 dag
461}
462
/// Half-open range `[s, e)`; used for both character and byte offsets.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct TextRange {
    pub s: usize,
    pub e: usize,
}
468
469pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
470 let len = path.len();
471
472 if len == 0 {
473 return vec![];
474 }
475
476 let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
477 let mut e = len - 1;
478 while e > 0 {
479 let edge = &path[e];
480 let s = edge.p;
481 ranges.push(TextRange { s, e });
482 e = s;
483 }
484 ranges.reverse();
485 ranges
486}
487
488pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
489 let char_ranges = path_to_ranges(path);
490 let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
491 let mut global_byte_offset = 0;
492 for r in char_ranges {
493 let mut word_byte_offset = 0;
494 for i in r.s..r.e {
495 word_byte_offset += text[i].len_utf8();
496 }
497 ranges.push(TextRange {
498 s: global_byte_offset,
499 e: global_byte_offset + word_byte_offset,
500 });
501 global_byte_offset += word_byte_offset;
502 }
503 ranges
504}
505
506pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
507 let ranges = path_to_ranges(path);
508 let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
509 for r in ranges {
510 let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
511 for i in r.s..r.e {
512 buf.push(text[i]);
513 }
514 str_vec.push(buf)
515 }
516 str_vec
517}
518
/// Word segmenter: a dictionary plus optional cluster rules (spans that must
/// not be split) and split rules (spans forced to become their own tokens).
pub struct Wordcut {
    dict: Dict,
    cluster_re: Option<ClusterRulesMatcher>,
    split_re: SplitRulesMatcher,
}
524
impl Wordcut {
    /// Creates a segmenter with the default Thai split rules and no cluster rules.
    pub fn new(dict: Dict) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: None,
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    /// Creates a segmenter with cluster rules and the default Thai split rules.
    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    /// Creates a segmenter with explicit cluster and split rules.
    pub fn new_with_cluster_re_and_split_re(
        dict: Dict,
        cluster_re: ClusterRulesMatcher,
        split_re: SplitRulesMatcher,
    ) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re,
        }
    }

    /// Computes the best segmentation path for `text`.
    ///
    /// `text_chars` must be `text.chars().collect()`: split/cluster rules
    /// match on bytes while the path works on characters, so both views are
    /// needed (bridged by the byte→char index map).
    #[inline]
    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
        let mut unk_edge_builder = UnkEdgeBuilder::new();
        let mut rule_based_edge_builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
        let builders: Vec<&mut dyn EdgeBuilder> = vec![
            &mut dict_edge_builder,
            &mut unk_edge_builder,
            &mut rule_based_edge_builder,
        ];

        let clusters = if let Some(cluster_re) = &self.cluster_re {
            find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
        } else {
            // NOTE(review): this fallback is one element longer than
            // find_clusters' output; the extra slot appears harmless since
            // does_not_break_cluster never indexes past e == text_len - 1.
            let mut clusters = vec![];
            clusters.resize(text_chars.len() + 1, 0);
            clusters
        };
        build_path_with_clusters(builders, &clusters, text_chars)
    }

    /// Segments `text`, returning character-offset ranges.
    #[allow(dead_code)]
    pub fn segment(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_ranges(&path)
    }

    /// Segments `text`, returning UTF-8 byte-offset ranges.
    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_byte_ranges(&path, &text_chars)
    }

    /// Segments `text`, returning the words as owned strings.
    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_str_vec(&path, &text_chars)
    }

    /// Segments `text` and joins the words with `delim`.
    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
        self.segment_into_strings(text).join(delim)
    }

    /// Builds the word DAG for `text` using this segmenter's dictionary.
    #[allow(dead_code)]
    pub fn build_dag(&self, text: &str) -> Dag {
        build_dag(&self.dict, &text.chars().collect())
    }
}
605
/// Maps each UTF-8 byte offset of `text` to its character index, with one
/// trailing entry equal to the total character count.
///
/// Only entries at character boundaries are meaningful: continuation bytes
/// are filled with 0, and callers index this map with match boundaries only.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.len() + 1);
    let mut char_idx = 0;
    for &b in text.as_bytes() {
        // A byte starts a character unless it is a UTF-8 continuation byte
        // (0b10xx_xxxx).
        if b & 0xC0 != 0x80 {
            map.push(char_idx);
            char_idx += 1;
        } else {
            map.push(0);
        }
    }
    map.push(char_idx);
    map
}
620
/// Live DFA state during cluster matching; `p` is the byte index where this
/// candidate match started.
#[derive(Debug)]
pub struct ClusterPointer {
    state_id: StateID,
    p: usize,
}

/// Edge of the byte-level cluster path.
#[derive(Debug)]
pub struct ClusterEdge {
    // Total bytes covered by rule matches along the best path so far.
    acc_pat_len: usize,
    // Total bytes not covered by any rule along the best path so far.
    unk_cnt: usize,
    // Byte index where this edge starts.
    p: usize,
    // Whether this edge covers unmatched bytes.
    is_unk: bool,
}
634
/// Byte-level dynamic program over the cluster-rule DFA.
///
/// Returns one `ClusterEdge` per byte of `text` plus a synthetic first edge:
/// `path[i + 1]` is the best edge ending after byte `i`, preferring fewer
/// unmatched bytes and then longer total rule coverage.
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    let mut pointers = vec![];
    let mut ch_index = 0;
    let mut path = vec![];
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        // A match may begin at any byte: seed a fresh anchored DFA state here.
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        // Advance all live pointers; survivors are compacted to the front.
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                // Feed EOI to test whether a rule could end exactly here.
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    // Prefer fewer unmatched bytes, then more pattern coverage.
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        // No rule ends at this byte: extend an unknown edge from the last
        // boundary that followed a matched edge.
        if best_edge.is_none() {
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}
704
/// Maps each character of `text` to a cluster id; 0 means "not in a cluster".
///
/// Walks the byte-level cluster path backwards; every matched (non-unknown)
/// edge gets a fresh id (starting at 1 from the end of the text), with byte
/// offsets translated to character offsets via `byte_to_char_idx_map`.
/// Callers only compare adjacent ids for equality, so the ordering of ids is
/// irrelevant. `len` is the character count of `text`.
pub fn find_clusters(
    text: &str,
    byte_to_char_idx_map: &[usize],
    dfa: &ClusterRulesMatcher,
    len: usize,
) -> Vec<usize> {
    let mut clusters = vec![];
    clusters.resize(len, 0);
    let mut id = 1;
    let path = find_cluster_path(dfa, text);
    // `me` is a path index, i.e. a byte-offset + 1; follow back-pointers.
    let mut me = path.len() - 1;
    while me > 0 {
        let edge = &path[me];
        let ms = edge.p;
        let s = byte_to_char_idx_map[ms];
        let e = byte_to_char_idx_map[me];
        if !edge.is_unk {
            for i in s..e {
                clusters[i] = id;
            }
            id += 1;
        }
        me = ms;
    }
    clusters
}
731
/// Reads a word list, one word per line.
///
/// # Errors
/// Returns any I/O error from opening or reading the file. (The previous
/// version called `unwrap()` on each line and panicked on a read error even
/// though the function already returns `io::Result`.)
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let f = File::open(path.as_ref())?;
    let f = io::BufReader::new(f);
    // Collecting Iterator<Item = io::Result<String>> into io::Result<Vec<_>>
    // short-circuits on the first read error instead of panicking.
    f.lines().collect()
}
737
738pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
739 let wordlist = load_wordlist(path)?;
740 let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
741 Ok(create_prefix_tree(&wordlist))
742}
743
744pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
745 let f = File::open(path)
746 .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
747 let f = io::BufReader::new(f);
748 let mut rules = vec![];
749 for line in f.lines() {
750 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
751 rules.push(format!("({})", line.trim()));
752 }
753 let rules = rules.join("|");
754 let dfa =
755 dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
756 Ok(dfa)
757}
758
759pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
760 let f = File::open(path)
761 .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
762 let f = io::BufReader::new(f);
763 let mut rules = vec![];
764 for line in f.lines() {
765 let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
766 rules.push(format!("({})", line.trim()));
767 }
768 let rules = rules.join("|");
769 Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
770}
771
#[cfg(test)]
mod tests {
    //! Unit tests. Most rely on dictionary and rule files shipped under
    //! `data/` in the crate root.
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    // Seeking (node 0, offset 0, ch) yields (child id, is-word-final, payload).
    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation as above, but in UTF-8 byte offsets (3 bytes per Thai char).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Normalizing the decomposed sara am via the replacer enables dictionary hits.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    // Latin runs are split out by the default split rules.
    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา มา", "|"),
            String::from("กา| |มา")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ ยย", "|"),
            String::from("แแ| |ยย")
        )
    }

    // The left double quotation mark is a split rule of its own.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // Empty text still yields the synthetic Init bucket.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids are assigned from the end of the text; 0 = unclustered.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Drives RuleBasedEdgeBuilder position by position; an edge is proposed
    // only where a split-rule match ends (after the space at i=0, and after
    // the Latin run "AB" at i=2 -> observed at i=3 via r.e == i + 1).
    #[test]
    fn test_rule_based_edge_builder() {
        let text = " ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB||| |||X|||(|||A|||)|||/|||12")
        );
    }

    // The cluster path has one edge per byte plus the synthetic first edge
    // ("เกียำ" is 5 chars x 3 bytes = 15 bytes -> 16 entries).
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}