chamkho 1.4.3

Khmer, Lao, Myanmar, and Thai word segmentation (word breaking) library and command-line tool
pub extern crate wordcut_engine;

use self::wordcut_engine::Dict;
use std::io::Result;
use std::path::Path;

/// Alias for `wordcut_engine::Wordcut`, made available from this module so
/// callers can write `Wordcut` without spelling out the engine crate's path.
pub type Wordcut = self::wordcut_engine::Wordcut;

/// Expands to a `&'static Path` pointing at the data file `$file`.
///
/// * With the `onedir` feature enabled the file is looked up next to the
///   working directory (`./<file>`).
/// * Otherwise the path is `<CARGO_MANIFEST_DIR>/data/<file>`, where the
///   manifest directory is substituted at compile time.
///
/// `cfg!` is an ordinary boolean expression, so both string literals are
/// always built by the compiler; only the selection happens in the `if`.
macro_rules! insert_prefix {
    ($file:expr) => {{
        // Pick the string form first, then wrap it exactly once.
        let location: &'static str = if cfg!(feature = "onedir") {
            concat!("./", $file)
        } else {
            concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $file)
        };
        Path::new(location)
    }};
}

/// Expands to a `&'static str` holding the location of the data file `$file`.
///
/// The default build yields `<CARGO_MANIFEST_DIR>/data/<file>` (manifest
/// directory substituted at compile time); a build with the `onedir` feature
/// yields `./<file>` instead.
macro_rules! insert_prefix_str {
    ($file:expr) => {
        if cfg!(not(feature = "onedir")) {
            // Default layout: data files live in the crate's `data/` directory.
            concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $file)
        } else {
            // `onedir` layout: data files sit in the current directory.
            concat!("./", $file)
        }
    };
}
  
/// Returns the location of the default dictionary file, `words_th.txt`.
///
/// With the `onedir` feature the result is `./words_th.txt`; otherwise it is
/// `<CARGO_MANIFEST_DIR>/data/words_th.txt`, with the manifest directory
/// fixed at compile time. The function only builds the path; it does not
/// check that the file exists.
pub fn default_path() -> &'static Path {
    insert_prefix!("words_th.txt")
}

/// Returns the location of the Lao dictionary file, `laowords.txt`.
///
/// With the `onedir` feature the result is `./laowords.txt`; otherwise it is
/// `<CARGO_MANIFEST_DIR>/data/laowords.txt`, with the manifest directory
/// fixed at compile time. The function only builds the path; it does not
/// check that the file exists.
pub fn lao_path() -> &'static Path {
    insert_prefix!("laowords.txt")
}

/// Returns the location of the Khmer dictionary file, `khmerdict.txt`.
///
/// With the `onedir` feature the result is `./khmerdict.txt`; otherwise it is
/// `<CARGO_MANIFEST_DIR>/data/khmerdict.txt`, with the manifest directory
/// fixed at compile time. The function only builds the path; it does not
/// check that the file exists.
pub fn khmer_dict_path() -> &'static Path {
    insert_prefix!("khmerdict.txt")
}

/// Returns the location of the Myanmar dictionary file, `myanmar-dict.txt`.
///
/// With the `onedir` feature the result is `./myanmar-dict.txt`; otherwise it
/// is `<CARGO_MANIFEST_DIR>/data/myanmar-dict.txt`, with the manifest
/// directory fixed at compile time. The function only builds the path; it
/// does not check that the file exists.
pub fn myanmar_dict_path() -> &'static Path {
    insert_prefix!("myanmar-dict.txt")
}

/// Returns the location of the Thai cluster rules file,
/// `thai_cluster_rules.txt`, as an owned string.
///
/// The result is always `Some`; the `Option` return type matches the other
/// `*_cluster(s)_path` functions, some of which return `None`.
pub fn thai_cluster_path() -> Option<String> {
    let rules_file: &str = insert_prefix_str!("thai_cluster_rules.txt");
    Some(String::from(rules_file))
}

/// Returns the location of the Lao cluster rules file, if there is one.
///
/// This always returns `None`: no Lao cluster rules file is referenced here.
pub fn lao_clusters_path() -> Option<String> {
    None
}

/// Returns the location of the Khmer cluster rules file,
/// `khmer_cluster_rules.txt`, as an owned string.
///
/// The result is always `Some`; the `Option` return type matches the other
/// `*_cluster(s)_path` functions, some of which return `None`.
pub fn khmer_clusters_path() -> Option<String> {
    let rules_file: &str = insert_prefix_str!("khmer_cluster_rules.txt");
    Some(String::from(rules_file))
}

/// Returns the location of the Myanmar cluster rules file, if there is one.
///
/// This always returns `None`: no Myanmar cluster rules file is referenced
/// here.
pub fn myanmar_clusters_path() -> Option<String> {
    None
}

/// Loads a dictionary from the file at `path`.
///
/// This is a thin wrapper that forwards to `wordcut_engine::load_dict`.
///
/// # Errors
///
/// Returns the `std::io::Error` reported by the engine's loader
/// (presumably when the file cannot be opened or read).
pub fn load_dict(path: &Path) -> Result<Dict> {
    self::wordcut_engine::load_dict(path)
}

/// Returns the location of the Thai replace rules file,
/// `thai-replace-rules.json`, as an owned string.
///
/// The result is always `Some`.
pub fn thai_replace_rules_path() -> Option<String> {
    let rules_file: &str = insert_prefix_str!("thai-replace-rules.json");
    Some(String::from(rules_file))
}

#[cfg(test)]
mod tests {
    use super::wordcut_engine::TextRange;
    use super::Wordcut;
    use std::path::Path;

    /// Builds a segmenter from the `data/thai.txt` dictionary.
    ///
    /// The path is relative, so these tests must run with the crate root as
    /// the working directory.
    fn thai_wordcut() -> Wordcut {
        let dict = super::load_dict(Path::new("data/thai.txt")).unwrap();
        Wordcut::new(dict)
    }

    /// Converts string literals into the owned form returned by
    /// `segment_into_strings`, so expectations can be written compactly.
    fn owned(words: &[&str]) -> Vec<String> {
        words.iter().map(|w| w.to_string()).collect()
    }

    #[test]
    fn test_default_dict() {
        let dict = super::load_dict(super::default_path());
        assert!(dict.is_ok());
    }

    #[test]
    fn test_segment_into_ranges() {
        let wordcut = thai_wordcut();
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected);
    }

    #[test]
    fn test_segment_into_strings() {
        let wordcut = thai_wordcut();
        let words = wordcut.segment_into_strings("กากกา");
        assert_eq!(words, owned(&["กาก", "กา"]));
    }

    #[test]
    fn test_segment_with_punc() {
        let wordcut = thai_wordcut();
        let words = wordcut.segment_into_strings("\"ฆกากา\"");
        // The ฆ between the opening quote and the first กา must not be
        // dropped: it is expected as a token of its own, not as "".
        assert_eq!(words, owned(&["\"", "ฆ", "กา", "กา", "\""]));
    }

    #[test]
    fn test_segment_with_parentheses() {
        let wordcut = thai_wordcut();
        let words = wordcut.segment_into_strings("(ฆกากา)");
        // Same as above: the ฆ is expected as its own token, not as "".
        assert_eq!(words, owned(&["(", "ฆ", "กา", "กา", ")"]));
    }

    #[test]
    fn test_segment_unknown_sandwich() {
        let wordcut = thai_wordcut();
        let words = wordcut.segment_into_strings("ฮฮกาฮฮ");
        assert_eq!(words, owned(&["ฮฮ", "กา", "ฮฮ"]));
    }

    #[test]
    fn test_one_word() {
        let wordcut = thai_wordcut();
        let words = wordcut.segment_into_strings("มาตรา");
        assert_eq!(words, owned(&["มาตรา"]));
    }

    #[test]
    fn test_from_law() {
        let wordcut = thai_wordcut();
        let words = wordcut.segment_into_strings("มาตรา 482 ผู้ขายไม่");
        assert_eq!(
            words,
            owned(&["มาตรา", " ", "482", " ", "ผู้", "ขาย", "ไม่"])
        );
    }

    #[test]
    fn test_space() {
        let wordcut = thai_wordcut();
        let ranges = wordcut.segment("a cat");
        let expected = vec![
            TextRange { s: 0, e: 1 },
            TextRange { s: 1, e: 2 },
            TextRange { s: 2, e: 5 },
        ];
        assert_eq!(ranges, expected);
    }

    #[test]
    fn test_lao() {
        let dict = super::load_dict(super::lao_path()).unwrap();
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("ພາສາລາວມີ");
        let expected = vec![
            TextRange { s: 0, e: 4 },
            TextRange { s: 4, e: 7 },
            TextRange { s: 7, e: 9 },
        ];
        assert_eq!(ranges, expected);
    }
}