wordcut_engine/lib.rs

1pub mod dict;
2pub mod edge;
3pub mod edge_builders;
4pub mod errors;
5pub mod replacer;
6pub mod text_range;
7
8#[macro_use]
9extern crate lazy_static;
10extern crate prefixtree;
11#[macro_use]
12extern crate serde_derive;
13
14use self::prefixtree::{prefix_tree_from_str, PrefixTree};
15use crate::dict::Dict;
16use crate::edge::{Edge, EdgeType};
17use crate::edge_builders::{
18    DictEdgeBuilder, EdgeBuilder, EdgeBuildingContext, RuleBasedEdgeBuilder, UnkEdgeBuilder,
19};
20use crate::errors::WordcutError;
21use crate::text_range::TextRange;
22use regex_automata::dfa::dense;
23use regex_automata::dfa::Automaton;
24use regex_automata::meta::Regex;
25use regex_automata::util::primitives::StateID;
26use regex_automata::util::start;
27use regex_automata::Anchored;
28use std::fs::File;
29use std::io;
30use std::io::BufRead;
31use std::path::Path;
32
// Expands, at compile time, to a `&'static Path` pointing at
// `<CARGO_MANIFEST_DIR>/data/<filename>` inside the crate source tree.
macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}
38
// Expands, at compile time, to a `&'static str` of
// `<CARGO_MANIFEST_DIR>/data/<filename>` (string form of `insert_prefix!`).
macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}
44
45pub fn default_dict_path() -> &'static Path {
46    insert_prefix!("mixed-wordlist.txt")
47}
48
49pub fn thai_cluster_path() -> Option<String> {
50    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
51}
52
53pub fn thai_replace_rules_path() -> Option<String> {
54    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
55}
56
// Cluster rules are compiled into a dense DFA that is stepped byte-by-byte.
type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
// Split rules use the higher-level meta regex engine.
type SplitRulesMatcher = Regex;

lazy_static! {
    // Default splitter: whitespace runs, Latin letter runs, Arabic digit
    // runs, Thai digit runs, and the left double quotation mark each form
    // one token.
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}

lazy_static! {
    // DFA starts are anchored: a cluster match must begin exactly at the
    // position where the pointer was spawned.
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}
68
69pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
70    let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
71    prefix_tree_from_str(&words_payloads[..])
72}
73
/// Returns true when the span `s..e` (char indices) does not cut through a
/// cluster: both endpoints must sit on a cluster boundary or outside any
/// cluster (id 0). `clusters[i]` is the cluster id of character `i`.
#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    let start_on_boundary = s == 0 || clusters[s] == 0 || clusters[s] != clusters[s - 1];
    let end_on_boundary = e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1];
    start_on_boundary && end_on_boundary
}
79
80#[inline]
81fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
82    let mut skip_edge = false;
83    if let Some(edge) = edge {
84        let s = edge.p;
85        let e = i + 1;
86        skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
87    }
88    skip_edge
89}
90
91fn build_path_with_clusters(
92    mut builders: Vec<&mut dyn EdgeBuilder>,
93    clusters: &[usize],
94    text: &[char],
95) -> Vec<Edge> {
96    let mut path = vec![];
97    path.push(Edge {
98        w: 0,
99        unk: 0,
100        p: 0,
101        etype: EdgeType::Init,
102    });
103
104    let mut context = EdgeBuildingContext {
105        text,
106        i: 0,
107        ch: '\0',
108        left_boundary: 0,
109        best_edge: None,
110    };
111
112    let text_len = text.len();
113    for i in 0..text_len {
114        context.ch = text[i];
115        context.i = i;
116        context.best_edge = None;
117        for builder in &mut builders {
118            let edge = builder.build(&context, &path);
119            if !should_skip_edge(&edge, i, text_len, clusters)
120                && Edge::better(&edge, &context.best_edge)
121            {
122                context.best_edge = edge
123            }
124        }
125        path.push(context.best_edge.unwrap());
126        if !context.best_edge.unwrap().is_unk() {
127            context.left_boundary = i + 1;
128        }
129    }
130    path
131}
132
/// An edge of the segmentation DAG: a candidate token spanning characters
/// `s..e` (half-open char indices), tagged with how it was produced.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    pub s: usize,
    pub e: usize,
    pub etype: EdgeType,
}

/// The DAG indexed by end boundary: `dag[e]` holds all edges ending at `e`.
pub type Dag = Vec<Vec<DagEdge>>;

/// Produces every DAG edge that ends at the context's current position.
pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
145
146impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
147    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
148        self.add_pointer(context);
149        self.update_pointers(context);
150        //self.gen_edge(&self.pointers, path)
151        self.pointers
152            .iter()
153            .filter(|p| p.is_final)
154            .map(|p| DagEdge {
155                s: p.s,
156                e: context.i + 1,
157                etype: EdgeType::Dict,
158            })
159            .collect()
160    }
161}
162
163pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
164    let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
165
166    let mut dag = Vec::with_capacity(text.len() + 1);
167
168    for _ in 0..text.len() + 1 {
169        dag.push(vec![]);
170    }
171    dag[0].push(DagEdge {
172        s: 0,
173        e: 0,
174        etype: EdgeType::Init,
175    });
176    let mut context = EdgeBuildingContext {
177        text: &text,
178        i: 0,
179        ch: '\0',
180        left_boundary: 0,
181        best_edge: None,
182    };
183
184    for i in 0..text.len() {
185        context.ch = text[i];
186        context.i = i;
187        context.best_edge = None;
188
189        for builder in &mut builders {
190            for edge in builder.build_dag_edges(&context) {
191                dag[edge.e].push(edge)
192            }
193        }
194    }
195
196    let mut left_boundary = 0;
197    for i in 1..text.len() + 1 {
198        if dag[i].len() == 0 {
199            dag[i].push(DagEdge {
200                s: left_boundary,
201                e: i,
202                etype: EdgeType::Unk,
203            });
204        } else {
205            left_boundary = i;
206        }
207    }
208
209    dag
210}
211
212pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
213    let len = path.len();
214
215    if len == 0 {
216        return vec![];
217    }
218
219    let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
220    let mut e = len - 1;
221    while e > 0 {
222        let edge = &path[e];
223        let s = edge.p;
224        ranges.push(TextRange { s, e });
225        e = s;
226    }
227    ranges.reverse();
228    ranges
229}
230
231pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
232    let char_ranges = path_to_ranges(path);
233    let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
234    let mut global_byte_offset = 0;
235    for r in char_ranges {
236        let mut word_byte_offset = 0;
237        for i in r.s..r.e {
238            word_byte_offset += text[i].len_utf8();
239        }
240        ranges.push(TextRange {
241            s: global_byte_offset,
242            e: global_byte_offset + word_byte_offset,
243        });
244        global_byte_offset += word_byte_offset;
245    }
246    ranges
247}
248
249pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
250    let ranges = path_to_ranges(path);
251    let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
252    for r in ranges {
253        let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
254        for i in r.s..r.e {
255            buf.push(text[i]);
256        }
257        str_vec.push(buf)
258    }
259    str_vec
260}
261
/// A word segmenter: a dictionary plus optional cluster rules and the
/// split (pattern) rules used for non-dictionary runs.
pub struct Wordcut {
    dict: Dict,
    // When present, segmentation never cuts inside a matched cluster.
    cluster_re: Option<ClusterRulesMatcher>,
    split_re: SplitRulesMatcher,
}
267
268impl Wordcut {
269    pub fn new(dict: Dict) -> Wordcut {
270        Wordcut {
271            dict,
272            cluster_re: None,
273            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
274        }
275    }
276
277    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
278        Wordcut {
279            dict,
280            cluster_re: Some(cluster_re),
281            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
282        }
283    }
284
285    pub fn new_with_cluster_re_and_split_re(
286        dict: Dict,
287        cluster_re: ClusterRulesMatcher,
288        split_re: SplitRulesMatcher,
289    ) -> Wordcut {
290        Wordcut {
291            dict,
292            cluster_re: Some(cluster_re),
293            split_re,
294        }
295    }
296
297    #[inline]
298    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
299        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
300        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
301        let mut unk_edge_builder = UnkEdgeBuilder::new();
302        let mut rule_based_edge_builder =
303            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
304        let builders: Vec<&mut dyn EdgeBuilder> = vec![
305            &mut dict_edge_builder,
306            &mut unk_edge_builder,
307            &mut rule_based_edge_builder,
308        ];
309
310        let clusters = if let Some(cluster_re) = &self.cluster_re {
311            find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
312        } else {
313            let mut clusters = vec![];
314            clusters.resize(text_chars.len() + 1, 0);
315            clusters
316        };
317        build_path_with_clusters(builders, &clusters, text_chars)
318    }
319
320    #[allow(dead_code)]
321    pub fn segment(&self, text: &str) -> Vec<TextRange> {
322        let text_chars: Vec<char> = text.chars().collect();
323        let path = self.build_path(text, &text_chars);
324        path_to_ranges(&path)
325    }
326
327    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
328        let text_chars: Vec<char> = text.chars().collect();
329        let path = self.build_path(text, &text_chars);
330        path_to_byte_ranges(&path, &text_chars)
331    }
332
333    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
334        let text_chars: Vec<char> = text.chars().collect();
335        let path = self.build_path(text, &text_chars);
336        path_to_str_vec(&path, &text_chars)
337    }
338
339    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
340        self.segment_into_strings(text).join(delim)
341    }
342
343    #[allow(dead_code)]
344    pub fn build_dag(&self, text: &str) -> Dag {
345        build_dag(&self.dict, &text.chars().collect())
346    }
347}
348
/// Maps each byte offset of `text` to a character index.
///
/// A lead byte (ASCII or the start of a UTF-8 sequence) maps to the index
/// of the character it begins; continuation bytes map to 0, which is fine
/// because callers only look the map up at character boundaries. A final
/// entry maps `text.len()` to the total character count.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.len() + 1);
    let mut char_idx = 0;
    for &b in text.as_bytes() {
        if b & 0b1100_0000 == 0b1000_0000 {
            // UTF-8 continuation byte (0b10xxxxxx): not a char boundary.
            map.push(0);
        } else {
            map.push(char_idx);
            char_idx += 1;
        }
    }
    map.push(char_idx);
    map
}
363
/// A live DFA traversal during cluster matching: the current DFA state and
/// the byte position `p` where this traversal was started.
#[derive(Debug)]
pub struct ClusterPointer {
    state_id: StateID,
    p: usize,
}
369
/// An edge of the cluster path over the text's bytes.
#[derive(Debug)]
pub struct ClusterEdge {
    // Total bytes covered by matched rules along the best path so far.
    acc_pat_len: usize,
    // Total bytes covered by no rule along the best path so far.
    unk_cnt: usize,
    // Byte position where this edge starts (back-pointer into the path).
    p: usize,
    is_unk: bool,
}
377
/// Finds the best cluster segmentation over the *bytes* of `text`.
///
/// At every byte a fresh anchored DFA pointer is spawned, all live
/// pointers are stepped, and each pointer that reaches a match state
/// proposes an edge back to its start. The best candidate minimizes the
/// unknown-byte count, breaking ties by the larger accumulated pattern
/// length. Bytes covered by no rule are folded into "unknown" edges. The
/// result holds one initial entry plus one entry per byte; entry `i`
/// points back to the start of its covering edge via `p`.
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    let mut pointers = vec![];
    let mut ch_index = 0; // byte index, despite the name
    let mut path = vec![];
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        // A match may start here: spawn a fresh anchored pointer.
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        // Step every live pointer, compacting dead ones out in place.
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    // This pointer matched a full rule ending at the
                    // current byte: propose an edge from its start.
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    // Prefer fewer unknown bytes; tie-break on the longer
                    // accumulated pattern length.
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        if best_edge.is_none() {
            // No rule covers this byte: extend an unknown edge from the
            // last position where a match ended.
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}
447
448pub fn find_clusters(
449    text: &str,
450    byte_to_char_idx_map: &[usize],
451    dfa: &ClusterRulesMatcher,
452    len: usize,
453) -> Vec<usize> {
454    let mut clusters = vec![];
455    clusters.resize(len, 0);
456    let mut id = 1;
457    let path = find_cluster_path(dfa, text);
458    let mut me = path.len() - 1;
459    while me > 0 {
460        let edge = &path[me];
461        let ms = edge.p;
462        let s = byte_to_char_idx_map[ms];
463        let e = byte_to_char_idx_map[me];
464        if !edge.is_unk {
465            for i in s..e {
466                clusters[i] = id;
467            }
468            id += 1;
469        }
470        me = ms;
471    }
472    clusters
473}
474
/// Loads a word list from `path`, one word per line.
///
/// # Errors
/// Returns `Err` if the file cannot be opened *or* if any line fails to
/// read (the previous implementation panicked on read errors via
/// `line.unwrap()` despite returning `io::Result`).
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let file = File::open(path.as_ref())?;
    let reader = io::BufReader::new(file);
    // Collecting io::Result<String> lines into io::Result<Vec<String>>
    // short-circuits on the first read error.
    reader.lines().collect()
}
480
481pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
482    let wordlist = load_wordlist(path)?;
483    let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
484    Ok(create_prefix_tree(&wordlist))
485}
486
487pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
488    let f = File::open(path)
489        .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
490    let f = io::BufReader::new(f);
491    let mut rules = vec![];
492    for line in f.lines() {
493        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
494        rules.push(format!("({})", line.trim()));
495    }
496    let rules = rules.join("|");
497    let dfa =
498        dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
499    Ok(dfa)
500}
501
502pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
503    let f = File::open(path)
504        .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
505    let f = io::BufReader::new(f);
506    let mut rules = vec![];
507    for line in f.lines() {
508        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
509        rules.push(format!("({})", line.trim()));
510    }
511    let rules = rules.join("|");
512    Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
513}
514
#[cfg(test)]
mod tests {
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    // Seeking from the root finds the first character of a stored word and
    // rejects a character that starts no word.
    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    // Dictionary segmentation into char-index ranges.
    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation in UTF-8 byte offsets (each Thai char is 3 bytes).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation returned as owned strings.
    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    // Tokens joined with a delimiter.
    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Reads one word per line from the bundled fixture file.
    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    // End-to-end: load a dictionary from disk, then segment with it.
    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // A JSON replace rule applied to the text before segmentation makes
    // the dictionary words match.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    // The replace rule applies to every occurrence, not just the first.
    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    // The default split rules keep a Latin-letter run as a single token.
    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    // A whitespace run between dictionary words is one token.
    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา  มา", "|"),
            String::from("กา|  |มา")
        )
    }

    // Whitespace splitting also applies between out-of-dictionary runs.
    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ  ยย", "|"),
            String::from("แแ|  |ยย")
        )
    }

    // The left double quotation mark becomes a token of its own.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    // DAG built through the Wordcut wrapper: bucket i holds the edges
    // ending at boundary i, with Unk fillers where the dictionary has none.
    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }], // 1
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }], // 2
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }], // 3
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }], // 4
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }], // 5
        ];
        assert_eq!(dag, expected);
    }

    // Same DAG produced via the free function.
    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }], // 1
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }], // 2
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }], // 3
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }], // 4
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }], // 5
        ];
        assert_eq!(dag, expected);
    }

    // Empty input yields only the Init bucket.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
        ];
        assert_eq!(dag, expected);
    }

    // DagEdge serializes with the expected JSON field names.
    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids are assigned from the end of the text backwards; 0 means
    // the character belongs to no cluster.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    // A matched cluster is never split across tokens.
    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    // Cluster rules with a transliterated word ("Portsmouth") in context.
    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    // Another cluster-aware segmentation case.
    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    // Cluster rules and dictionary for Khmer work through the same API.
    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Drives RuleBasedEdgeBuilder position by position: inside a pattern
    // match it emits nothing until the match's last character, where it
    // yields a Pat edge spanning the whole match.
    #[test]
    fn test_rule_based_edge_builder() {
        let text = "  ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    // Custom split rules loaded from file override the default splitter.
    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB   X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB|||   |||X|||(|||A|||)|||/|||12")
        );
    }

    // The cluster path has one entry per byte plus the initial one, and its
    // last entry points back to the covering edge's start byte.
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}