//! wordcut_engine — word and cluster segmentation library (crate root, lib.rs).

1pub mod dict;
2pub mod edge;
3pub mod edge_builders;
4pub mod errors;
5pub mod replacer;
6pub mod text_range;
7
8#[macro_use]
9extern crate lazy_static;
10extern crate prefixtree;
11#[macro_use]
12extern crate serde_derive;
13
14use self::prefixtree::{prefix_tree_from_str, PrefixTree};
15use crate::dict::Dict;
16use crate::edge::{Edge, EdgeType};
17use crate::edge_builders::{
18    DictEdgeBuilder, EdgeBuilder, EdgeBuildingContext, RuleBasedEdgeBuilder, UnkEdgeBuilder,
19};
20use crate::errors::WordcutError;
21use crate::text_range::TextRange;
22use regex_automata::dfa::dense;
23use regex_automata::dfa::Automaton;
24use regex_automata::meta::Regex;
25use regex_automata::util::primitives::StateID;
26use regex_automata::util::start;
27use regex_automata::Anchored;
28use std::fs::File;
29use std::io;
30use std::io::BufRead;
31use std::path::Path;
32
// Builds a `&'static Path` to a file in this crate's `data/` directory,
// resolved at compile time from CARGO_MANIFEST_DIR.
macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}
38
// Like `insert_prefix!` but yields a `&'static str` instead of a `&Path`.
macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}
44
/// Returns the path of the bundled default (mixed) word list.
pub fn default_dict_path() -> &'static Path {
    insert_prefix!("mixed-wordlist.txt")
}
48
/// Returns the path of the bundled Thai cluster rules as an owned String.
pub fn thai_cluster_path() -> Option<String> {
    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}
52
/// Returns the path of the bundled Thai replace rules (JSON) as an owned String.
pub fn thai_replace_rules_path() -> Option<String> {
    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}
56
// Dense DFA used to match character-cluster rules (byte-level, anchored runs).
type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
// Meta regex used to match split (pattern) rules.
type SplitRulesMatcher = Regex;
59
lazy_static! {
    // Default split pattern: runs of whitespace, Latin letters, Arabic digits,
    // Thai digits, or a left double quotation mark.
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}
64
lazy_static! {
    // The cluster DFA is always started anchored at the current byte position.
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}
68
69pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
70    let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
71    prefix_tree_from_str(&words_payloads[..])
72}
73
#[inline]
/// True when the span [s, e) neither starts nor ends inside a character
/// cluster. `clusters[i]` is a 1-based cluster id, 0 meaning "no cluster".
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    // The span may start at s if s is the text start, s is cluster-free, or
    // s sits on a boundary between two different clusters.
    let start_ok = s == 0 || clusters[s] == 0 || clusters[s] != clusters[s - 1];
    // Symmetric condition for the end position e.
    let end_ok = e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1];
    start_ok && end_ok
}
79
80#[inline]
81fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
82    let mut skip_edge = false;
83    if let Some(edge) = edge {
84        let s = edge.p;
85        let e = i + 1;
86        skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
87    }
88    skip_edge
89}
90
91fn build_path_with_clusters(
92    mut builders: Vec<&mut dyn EdgeBuilder>,
93    clusters: &[usize],
94    text: &[char],
95) -> Vec<Edge> {
96    let mut path = vec![];
97    path.push(Edge {
98        w: 0,
99        unk: 0,
100        p: 0,
101        etype: EdgeType::Init,
102    });
103
104    let mut context = EdgeBuildingContext {
105        text,
106        i: 0,
107        ch: '\0',
108        left_boundary: 0,
109        best_edge: None,
110    };
111
112    let text_len = text.len();
113    for i in 0..text_len {
114        context.ch = text[i];
115        context.i = i;
116        context.best_edge = None;
117        for builder in &mut builders {
118            let edge = builder.build(&context, &path);
119            if !should_skip_edge(&edge, i, text_len, clusters)
120                && Edge::better(&edge, &context.best_edge)
121            {
122                context.best_edge = edge
123            }
124        }
125        path.push(context.best_edge.unwrap());
126        if !context.best_edge.unwrap().is_unk() {
127            context.left_boundary = i + 1;
128        }
129    }
130    path
131}
132
/// An edge of the word DAG: a span [s, e) in character indices plus how the
/// span was produced (dictionary hit, unknown fallback, or the Init sentinel).
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    pub s: usize,
    pub e: usize,
    pub etype: EdgeType,
}
139
/// Word DAG: `dag[e]` lists every edge that ends at character position e.
pub type Dag = Vec<Vec<DagEdge>>;

/// Builder that emits all DAG edges ending at the context's current position.
pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
145
146impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
147    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
148        self.add_pointer(context);
149        self.update_pointers(context);
150        //self.gen_edge(&self.pointers, path)
151        self.pointers
152            .iter()
153            .filter(|p| p.is_final)
154            .map(|p| DagEdge {
155                s: p.s,
156                e: context.i + 1,
157                etype: EdgeType::Dict,
158            })
159            .collect()
160    }
161}
162
163pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
164    let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
165
166    let mut dag = Vec::with_capacity(text.len() + 1);
167
168    for _ in 0..text.len() + 1 {
169        dag.push(vec![]);
170    }
171    dag[0].push(DagEdge {
172        s: 0,
173        e: 0,
174        etype: EdgeType::Init,
175    });
176    let mut context = EdgeBuildingContext {
177        text: &text,
178        i: 0,
179        ch: '\0',
180        left_boundary: 0,
181        best_edge: None,
182    };
183
184    for i in 0..text.len() {
185        context.ch = text[i];
186        context.i = i;
187        context.best_edge = None;
188
189        for builder in &mut builders {
190            for edge in builder.build_dag_edges(&context) {
191                dag[edge.e].push(edge)
192            }
193        }
194    }
195
196    let mut left_boundary = 0;
197    for i in 1..text.len() + 1 {
198        if dag[i].len() == 0 {
199            dag[i].push(DagEdge {
200                s: left_boundary,
201                e: i,
202                etype: EdgeType::Unk,
203            });
204        } else {
205            left_boundary = i;
206        }
207    }
208
209    dag
210}
211
212pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
213    let len = path.len();
214
215    if len == 0 {
216        return vec![];
217    }
218
219    let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
220    let mut e = len - 1;
221    while e > 0 {
222        let edge = &path[e];
223        let s = edge.p;
224        ranges.push(TextRange { s, e });
225        e = s;
226    }
227    ranges.reverse();
228    ranges
229}
230
231pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
232    let char_ranges = path_to_ranges(path);
233    let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
234    let mut global_byte_offset = 0;
235    for r in char_ranges {
236        let mut word_byte_offset = 0;
237        for i in r.s..r.e {
238            word_byte_offset += text[i].len_utf8();
239        }
240        ranges.push(TextRange {
241            s: global_byte_offset,
242            e: global_byte_offset + word_byte_offset,
243        });
244        global_byte_offset += word_byte_offset;
245    }
246    ranges
247}
248
249pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
250    let ranges = path_to_ranges(path);
251    let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
252    for r in ranges {
253        let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
254        for i in r.s..r.e {
255            buf.push(text[i]);
256        }
257        str_vec.push(buf)
258    }
259    str_vec
260}
261
/// Word segmenter: a dictionary plus optional cluster rules and a split-rule
/// regex.
pub struct Wordcut {
    dict: Dict,
    // When present, segmentation avoids breaking matched character clusters.
    cluster_re: Option<ClusterRulesMatcher>,
    // Regex producing rule-based (pattern) edges, e.g. whitespace/Latin runs.
    split_re: SplitRulesMatcher,
}
267
268impl Wordcut {
269    pub fn new(dict: Dict) -> Wordcut {
270        Wordcut {
271            dict,
272            cluster_re: None,
273            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
274        }
275    }
276
277    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
278        Wordcut {
279            dict,
280            cluster_re: Some(cluster_re),
281            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
282        }
283    }
284
285    pub fn new_with_cluster_re_and_split_re(
286        dict: Dict,
287        cluster_re: ClusterRulesMatcher,
288        split_re: SplitRulesMatcher,
289    ) -> Wordcut {
290        Wordcut {
291            dict,
292            cluster_re: Some(cluster_re),
293            split_re,
294        }
295    }
296
297    #[inline]
298    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
299        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
300        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
301        let mut unk_edge_builder = UnkEdgeBuilder::new();
302        let mut rule_based_edge_builder =
303            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
304        let builders: Vec<&mut dyn EdgeBuilder> = vec![
305            &mut dict_edge_builder,
306            &mut unk_edge_builder,
307            &mut rule_based_edge_builder,
308        ];
309
310        let clusters = if let Some(cluster_re) = &self.cluster_re {
311            find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
312        } else {
313            let mut clusters = vec![];
314            clusters.resize(text_chars.len() + 1, 0);
315            clusters
316        };
317        build_path_with_clusters(builders, &clusters, text_chars)
318    }
319
320    #[allow(dead_code)]
321    pub fn segment(&self, text: &str) -> Vec<TextRange> {
322        let text_chars: Vec<char> = text.chars().collect();
323        let path = self.build_path(text, &text_chars);
324        path_to_ranges(&path)
325    }
326
327    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
328        let text_chars: Vec<char> = text.chars().collect();
329        let path = self.build_path(text, &text_chars);
330        path_to_byte_ranges(&path, &text_chars)
331    }
332
333    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
334        let text_chars: Vec<char> = text.chars().collect();
335        let path = self.build_path(text, &text_chars);
336        path_to_str_vec(&path, &text_chars)
337    }
338
339    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
340        self.segment_into_strings(text).join(delim)
341    }
342
343    #[allow(dead_code)]
344    pub fn build_dag(&self, text: &str) -> Dag {
345        build_dag(&self.dict, &text.chars().collect())
346    }
347}
348
/// Maps each byte offset of `text` to the index of the character that starts
/// there; UTF-8 continuation bytes map to 0. A final extra entry holds the
/// total character count, so the map has `text.len() + 1` entries.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    // Preallocate: exactly one entry per byte plus the trailing sentinel.
    let mut byte_to_char_map = Vec::with_capacity(text.len() + 1);
    let mut char_idx = 0;
    for &b in text.as_bytes() {
        // A UTF-8 continuation byte has the bit pattern 10xxxxxx; any other
        // byte starts a new character. (Equivalent to the original signed
        // comparison `(b as i8) >= -0x40`.)
        if b & 0xC0 != 0x80 {
            byte_to_char_map.push(char_idx);
            char_idx += 1;
        } else {
            byte_to_char_map.push(0);
        }
    }
    byte_to_char_map.push(char_idx);
    byte_to_char_map
}
363
/// A live anchored DFA run: the current DFA state plus the byte position `p`
/// where the run started.
#[derive(Debug)]
pub struct ClusterPointer {
    state_id: StateID,
    p: usize,
}
369
/// One step in a cluster path (positions are byte offsets).
#[derive(Debug)]
pub struct ClusterEdge {
    // Total bytes covered by matched rules along the best path so far.
    acc_pat_len: usize,
    // Total bytes not covered by any rule along the best path so far.
    unk_cnt: usize,
    // Byte position where this edge starts (parent pointer into the path).
    p: usize,
    // True when this edge spans bytes that no rule matched.
    is_unk: bool,
}
377
/// Segments `text` at the byte level into cluster edges using the anchored
/// cluster DFA. Returns a path of `text.len() + 1` entries: entry 0 is a
/// sentinel and entry i + 1 is the best cluster edge ending after byte i.
///
/// Edge preference: fewer unknown bytes first, then longer accumulated
/// matched-pattern length. NOTE(review): positions here are byte offsets —
/// `find_clusters` converts them to char indices via the byte-to-char map.
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    // Active DFA runs: one pointer per still-alive start position.
    let mut pointers = vec![];
    let mut ch_index = 0;
    let mut path = vec![];
    // Start of the current unknown run, used when no rule matches.
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        // Start a fresh anchored run at the current byte.
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        // Advance every live run one byte, compacting out dead runs in place.
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                // next_eoi_state reports whether this run would match if the
                // input ended here, i.e. some rule matches the span so far.
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    // Prefer fewer unknown bytes; break ties by the longer
                    // accumulated matched length.
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        // No rule matched an edge ending here: extend the unknown run.
        if best_edge.is_none() {
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}
447
448pub fn find_clusters(
449    text: &str,
450    byte_to_char_idx_map: &[usize],
451    dfa: &ClusterRulesMatcher,
452    len: usize,
453) -> Vec<usize> {
454    let mut clusters = vec![];
455    clusters.resize(len, 0);
456    let mut id = 1;
457    let path = find_cluster_path(dfa, text);
458    let mut me = path.len() - 1;
459    while me > 0 {
460        let edge = &path[me];
461        let ms = edge.p;
462        let s = byte_to_char_idx_map[ms];
463        let e = byte_to_char_idx_map[me];
464        if !edge.is_unk {
465            for i in s..e {
466                clusters[i] = id;
467            }
468            id += 1;
469        }
470        me = ms;
471    }
472    clusters
473}
474
/// Reads `path` as a UTF-8 text file and returns its lines (without line
/// terminators).
///
/// # Errors
/// Returns any I/O error from opening or reading the file. (The previous
/// version panicked on a read error encountered mid-file.)
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let f = File::open(path.as_ref())?;
    let f = io::BufReader::new(f);
    // Collecting Iterator<Item = io::Result<String>> into io::Result<Vec<_>>
    // short-circuits on the first read error instead of panicking.
    f.lines().collect()
}
480
481pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
482    let wordlist = load_wordlist(path)?;
483    let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
484    Ok(create_prefix_tree(&wordlist))
485}
486
487pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
488    let f = File::open(path)
489        .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
490    let f = io::BufReader::new(f);
491    let mut rules = vec![];
492    for line in f.lines() {
493        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
494        rules.push(format!("({})", line.trim()));
495    }
496    let rules = rules.join("|");
497    let dfa =
498        dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
499    Ok(dfa)
500}
501
502pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
503    let f = File::open(path)
504        .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
505    let f = io::BufReader::new(f);
506    let mut rules = vec![];
507    for line in f.lines() {
508        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
509        rules.push(format!("({})", line.trim()));
510    }
511    let rules = rules.join("|");
512    Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
513}
514
// Unit tests. Many of these load bundled dictionaries and rule files from
// the crate's data/ directory, so they rely on CARGO_MANIFEST_DIR.
#[cfg(test)]
mod tests {
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    // Longest-match segmentation into character ranges.
    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Same segmentation, but ranges are UTF-8 byte offsets (Thai chars are
    // 3 bytes each).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    // Latin runs and whitespace come from the default split rules, not the
    // dictionary.
    #[test]
    fn test_segment_to_strings_mixed() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("cat กากกา");
        let expected = vec![
            String::from("cat"),
            String::from(" "),
            String::from("กาก"),
            String::from("กา"),
        ];
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // The replacer normalizes the decomposed sara-am sequence before
    // segmentation.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา  มา", "|"),
            String::from("กา|  |มา")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ  ยย", "|"),
            String::from("แแ|  |ยย")
        )
    }

    // The left double quote is part of the default split pattern.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }], // 1
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }], // 2
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }], // 3
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }], // 4
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }], // 5
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }], // 1
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }], // 2
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }], // 3
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }], // 4
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }], // 5
        ];
        assert_eq!(dag, expected);
    }

    // Empty input still yields the Init sentinel entry.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids are assigned right-to-left (rightmost cluster gets id 1);
    // 0 marks characters outside any cluster.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    // Cluster rules also ship for Khmer.
    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Drives RuleBasedEdgeBuilder position by position; the builder only
    // yields an edge where a split-rule match ends.
    #[test]
    fn test_rule_based_edge_builder() {
        let text = "  ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB   X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB|||   |||X|||(|||A|||)|||/|||12")
        );
    }

    // Cluster path entries are byte-indexed: "เกียำ" is 15 bytes → 16 entries.
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}