wordcut-engine 1.2.2

pub mod dict;
pub mod edge;
pub mod edge_builders;
pub mod errors;
pub mod replacer;
pub mod text_range;

#[macro_use]
extern crate lazy_static;
extern crate prefixtree;
#[macro_use]
extern crate serde_derive;

use self::prefixtree::{prefix_tree_from_str, PrefixTree};
use crate::dict::Dict;
use crate::edge::{Edge, EdgeType};
use crate::edge_builders::{
    DictEdgeBuilder, EdgeBuilder, EdgeBuildingContext, RuleBasedEdgeBuilder, UnkEdgeBuilder,
};
use crate::errors::WordcutError;
use crate::text_range::TextRange;
use regex_automata::dfa::dense;
use regex_automata::dfa::Automaton;
use regex_automata::meta::Regex;
use regex_automata::util::primitives::StateID;
use regex_automata::util::start;
use regex_automata::Anchored;
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::path::Path;

macro_rules! insert_prefix {
    ($filename:expr) => {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
    };
}

macro_rules! insert_prefix_str {
    ($filename:expr) => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
    };
}

pub fn default_dict_path() -> &'static Path {
    insert_prefix!("mixed-wordlist.txt")
}

pub fn thai_cluster_path() -> Option<String> {
    Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}

pub fn thai_replace_rules_path() -> Option<String> {
    Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}

type ClusterRulesMatcher = dense::DFA<Vec<u32>>;
type SplitRulesMatcher = Regex;

lazy_static! {
    static ref DEFAULT_THAI_SPLIT_RE: Regex =
        Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}

lazy_static! {
    static ref DFA_START_CONFIG: start::Config = start::Config::new().anchored(Anchored::Yes);
}

pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
    let words_payloads: Vec<(&str, bool)> = words.iter().map(|&word| (word, true)).collect();
    prefix_tree_from_str(&words_payloads[..])
}

#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    (s == 0 || clusters[s] == 0 || clusters[s] != clusters[s - 1])
        && (e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1])
}

#[inline]
fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
    let mut skip_edge = false;
    if let Some(edge) = edge {
        let s = edge.p;
        let e = i + 1;
        skip_edge = !edge.is_unk() && !does_not_break_cluster(s, e, text_len, clusters);
    }
    skip_edge
}

fn build_path_with_clusters(
    mut builders: Vec<&mut dyn EdgeBuilder>,
    clusters: &[usize],
    text: &[char],
) -> Vec<Edge> {
    let mut path = vec![];
    path.push(Edge {
        w: 0,
        unk: 0,
        p: 0,
        etype: EdgeType::Init,
    });

    let mut context = EdgeBuildingContext {
        text,
        i: 0,
        ch: '\0',
        left_boundary: 0,
        best_edge: None,
    };

    let text_len = text.len();
    for i in 0..text_len {
        context.ch = text[i];
        context.i = i;
        context.best_edge = None;
        for builder in &mut builders {
            let edge = builder.build(&context, &path);
            if !should_skip_edge(&edge, i, text_len, clusters)
                && Edge::better(&edge, &context.best_edge)
            {
                context.best_edge = edge
            }
        }
        path.push(context.best_edge.unwrap());
        if !context.best_edge.unwrap().is_unk() {
            context.left_boundary = i + 1;
        }
    }
    path
}

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
    pub s: usize,
    pub e: usize,
    pub etype: EdgeType,
}

pub type Dag = Vec<Vec<DagEdge>>;

pub trait DagEdgeBuilder {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}

impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
        self.add_pointer(context);
        self.update_pointers(context);
        //self.gen_edge(&self.pointers, path)
        self.pointers
            .iter()
            .filter(|p| p.is_final)
            .map(|p| DagEdge {
                s: p.s,
                e: context.i + 1,
                etype: EdgeType::Dict,
            })
            .collect()
    }
}

pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
    let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];

    let mut dag = Vec::with_capacity(text.len() + 1);

    for _ in 0..text.len() + 1 {
        dag.push(vec![]);
    }
    dag[0].push(DagEdge {
        s: 0,
        e: 0,
        etype: EdgeType::Init,
    });
    let mut context = EdgeBuildingContext {
        text: &text,
        i: 0,
        ch: '\0',
        left_boundary: 0,
        best_edge: None,
    };

    for i in 0..text.len() {
        context.ch = text[i];
        context.i = i;
        context.best_edge = None;

        for builder in &mut builders {
            for edge in builder.build_dag_edges(&context) {
                dag[edge.e].push(edge)
            }
        }
    }

    let mut left_boundary = 0;
    for i in 1..text.len() + 1 {
        if dag[i].len() == 0 {
            dag[i].push(DagEdge {
                s: left_boundary,
                e: i,
                etype: EdgeType::Unk,
            });
        } else {
            left_boundary = i;
        }
    }

    dag
}

pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
    let len = path.len();

    if len == 0 {
        return vec![];
    }

    let mut ranges: Vec<TextRange> = Vec::with_capacity(len);
    let mut e = len - 1;
    while e > 0 {
        let edge = &path[e];
        let s = edge.p;
        ranges.push(TextRange { s, e });
        e = s;
    }
    ranges.reverse();
    ranges
}

pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
    let char_ranges = path_to_ranges(path);
    let mut ranges: Vec<TextRange> = Vec::with_capacity(char_ranges.len());
    let mut global_byte_offset = 0;
    for r in char_ranges {
        let mut word_byte_offset = 0;
        for i in r.s..r.e {
            word_byte_offset += text[i].len_utf8();
        }
        ranges.push(TextRange {
            s: global_byte_offset,
            e: global_byte_offset + word_byte_offset,
        });
        global_byte_offset += word_byte_offset;
    }
    ranges
}

pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
    let ranges = path_to_ranges(path);
    let mut str_vec: Vec<String> = Vec::with_capacity(ranges.len());
    for r in ranges {
        let mut buf = String::with_capacity(3 * (r.e - r.s + 1));
        for i in r.s..r.e {
            buf.push(text[i]);
        }
        str_vec.push(buf)
    }
    str_vec
}

pub struct Wordcut {
    dict: Dict,
    cluster_re: Option<ClusterRulesMatcher>,
    split_re: SplitRulesMatcher,
}

impl Wordcut {
    pub fn new(dict: Dict) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: None,
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
        }
    }

    pub fn new_with_cluster_re_and_split_re(
        dict: Dict,
        cluster_re: ClusterRulesMatcher,
        split_re: SplitRulesMatcher,
    ) -> Wordcut {
        Wordcut {
            dict,
            cluster_re: Some(cluster_re),
            split_re,
        }
    }

    #[inline]
    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
        let mut unk_edge_builder = UnkEdgeBuilder::new();
        let mut rule_based_edge_builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
        let builders: Vec<&mut dyn EdgeBuilder> = vec![
            &mut dict_edge_builder,
            &mut unk_edge_builder,
            &mut rule_based_edge_builder,
        ];

        let clusters = if let Some(cluster_re) = &self.cluster_re {
            find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
        } else {
            let mut clusters = vec![];
            clusters.resize(text_chars.len() + 1, 0);
            clusters
        };
        build_path_with_clusters(builders, &clusters, text_chars)
    }

    #[allow(dead_code)]
    pub fn segment(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_ranges(&path)
    }

    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_byte_ranges(&path, &text_chars)
    }

    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
        let text_chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &text_chars);
        path_to_str_vec(&path, &text_chars)
    }

    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
        self.segment_into_strings(text).join(delim)
    }

    #[allow(dead_code)]
    pub fn build_dag(&self, text: &str) -> Dag {
        build_dag(&self.dict, &text.chars().collect())
    }
}

pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let mut byte_to_char_map = vec![];
    let mut i = 0;
    for b in text.as_bytes() {
        if (*b as i8) >= -0x40 {
            byte_to_char_map.push(i);
            i += 1;
        } else {
            byte_to_char_map.push(0);
        }
    }
    byte_to_char_map.push(i);
    byte_to_char_map
}

#[derive(Debug)]
pub struct ClusterPointer {
    state_id: StateID,
    p: usize,
}

#[derive(Debug)]
pub struct ClusterEdge {
    acc_pat_len: usize,
    unk_cnt: usize,
    p: usize,
    is_unk: bool,
}

pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
    let mut pointers = vec![];
    let mut ch_index = 0;
    let mut path = vec![];
    let mut left_boundary = 0;
    path.push(ClusterEdge {
        p: 0,
        acc_pat_len: 0,
        unk_cnt: 0,
        is_unk: false,
    });
    for ch_byte in text.as_bytes() {
        let mut best_edge: Option<ClusterEdge> = None;
        pointers.push(ClusterPointer {
            state_id: dfa
                .start_state(&DFA_START_CONFIG)
                .expect("DFA state started"),
            p: ch_index,
        });
        let mut new_pointer_index = 0;
        for pointer_index in 0..pointers.len() {
            let next_id = dfa.next_state(pointers[pointer_index].state_id, *ch_byte);
            if !dfa.is_dead_state(next_id) {
                pointers[new_pointer_index] = ClusterPointer {
                    state_id: next_id,
                    p: pointers[pointer_index].p,
                };
                new_pointer_index += 1;
                if dfa.is_match_state(dfa.next_eoi_state(next_id)) {
                    let source = &path[pointers[pointer_index].p];
                    let edge = ClusterEdge {
                        p: pointers[pointer_index].p,
                        acc_pat_len: source.acc_pat_len
                            + (ch_index - pointers[pointer_index].p + 1),
                        unk_cnt: source.unk_cnt,
                        is_unk: false,
                    };
                    if match &best_edge {
                        Some(b_edge) => {
                            b_edge.unk_cnt > edge.unk_cnt
                                || (b_edge.unk_cnt == edge.unk_cnt
                                    && b_edge.acc_pat_len < edge.acc_pat_len)
                        }
                        None => true,
                    } {
                        best_edge = Some(edge);
                    }
                }
            }
        }
        pointers.truncate(new_pointer_index);
        if best_edge.is_none() {
            let source = &path[left_boundary];
            best_edge = Some(ClusterEdge {
                p: left_boundary,
                acc_pat_len: source.acc_pat_len,
                unk_cnt: source.unk_cnt + (ch_index - left_boundary + 1),
                is_unk: true,
            });
        }
        let best_edge = best_edge.unwrap();
        if !best_edge.is_unk {
            left_boundary = ch_index + 1;
        }
        path.push(best_edge);
        ch_index += 1;
    }
    path
}

pub fn find_clusters(
    text: &str,
    byte_to_char_idx_map: &[usize],
    dfa: &ClusterRulesMatcher,
    len: usize,
) -> Vec<usize> {
    let mut clusters = vec![];
    clusters.resize(len, 0);
    let mut id = 1;
    let path = find_cluster_path(dfa, text);
    let mut me = path.len() - 1;
    while me > 0 {
        let edge = &path[me];
        let ms = edge.p;
        let s = byte_to_char_idx_map[ms];
        let e = byte_to_char_idx_map[me];
        if !edge.is_unk {
            for i in s..e {
                clusters[i] = id;
            }
            id += 1;
        }
        me = ms;
    }
    clusters
}

pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let f = File::open(path.as_ref())?;
    let f = io::BufReader::new(f);
    Ok(f.lines().map(|line| line.unwrap()).collect())
}

pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
    let wordlist = load_wordlist(path)?;
    let wordlist: Vec<_> = wordlist.iter().map(|w| &w[..]).collect();
    Ok(create_prefix_tree(&wordlist))
}

pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
    let f = File::open(path)
        .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
    let f = io::BufReader::new(f);
    let mut rules = vec![];
    for line in f.lines() {
        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
        rules.push(format!("({})", line.trim()));
    }
    let rules = rules.join("|");
    let dfa =
        dense::DFA::new(&rules).map_err(|_| WordcutError::CannotCompileClusterRules(rules))?;
    Ok(dfa)
}

pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
    let f = File::open(path)
        .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
    let f = io::BufReader::new(f);
    let mut rules = vec![];
    for line in f.lines() {
        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
        rules.push(format!("({})", line.trim()));
    }
    let rules = rules.join("|");
    Ok(Regex::new(&rules).map_err(|_| WordcutError::CannotCompileSplitRules(rules))?)
}

#[cfg(test)]
mod tests {
    extern crate serde_json;
    use super::*;

    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_segment_to_strings_mixed() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("cat กากกา");
        let expected = vec![
            String::from("cat"),
            String::from(" "),
            String::from("กาก"),
            String::from("กา"),
        ];
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา  มา", "|"),
            String::from("กา|  |มา")
        )
    }

    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ  ยย", "|"),
            String::from("แแ|  |ยย")
        )
    }

    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }], // 1
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }], // 2
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }], // 3
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }], // 4
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }], // 5
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }], // 1
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }], // 2
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }], // 3
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }], // 4
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }], // 5
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
        ];
        assert_eq!(dag, expected);
    }

    #[test]
    fn test_dag_to_json() {
        let dag = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }], // 0
        ];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    #[test]
    fn test_rule_based_edge_builder() {
        let text = "  ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });

        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB   X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));

        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB|||   |||X|||(|||A|||)|||/|||12")
        );
    }

    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}