chamkho 1.4.3

Khmer, Lao, Myanmar, and Thai word segmentation/breaking library and command-line tool
mod init;
use clap::Parser;
use std::io;
use std::io::BufRead;
use std::path::{Path, PathBuf};
use wordcut_engine::replacer;

// Languages that `main` can pick default dictionary and cluster-rule paths
// for. Deriving `clap::ValueEnum` lets the `lang` command-line option be
// parsed directly into one of these variants.
//
// NOTE: plain `//` comments are used on purpose; clap's derive macros turn
// `///` doc comments into user-visible help text.
#[derive(clap::ValueEnum, Clone, Debug)]
enum Lang {
    Lao,
    Khmer,
    Myanmar,
    // `main` treats a missing language the same way as `Thai`.
    Thai,
}

// Command-line options, parsed with clap's derive API (`Args::parse()` in
// `main`). Every option is optional; `main` supplies a fallback for each.
//
// NOTE: plain `//` comments are used on purpose; clap's derive macros turn
// `///` doc comments into user-visible help text.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    // Path of the dictionary file; when absent, `main` picks a per-language
    // default from `init`.
    #[clap(short, long, value_parser)]
    dict_path: Option<String>,
    // Path of the cluster rules file; when absent, `main` picks a
    // per-language default from `init`.
    #[clap(short, long, value_parser)]
    cluster_rules_path: Option<String>,
    // Delimiter put between segmented words; `main` falls back to "|".
    // The short flag is set explicitly to `s`.
    #[clap(short = 's', long, value_parser)]
    word_delimiter: Option<String>,
    // Path of the replace rules file applied to each line before
    // segmentation. The short flag is set explicitly to `r`.
    #[clap(short = 'r', long, value_parser)]
    replace_rules_path: Option<String>,
    // Language used to select default data paths when they are not given
    // explicitly.
    #[clap(short, long, value_parser, value_enum)]
    lang: Option<Lang>,
}

pub fn join_data_path(base_path: &str) -> PathBuf {
    let mut buf = PathBuf::new();
    if !cfg!(feature = "onedir") {
        buf.push(env!("CARGO_MANIFEST_DIR"));
	buf.push("data");
    }
    buf.push(base_path);
    return buf
}

/// Command-line entry point: reads lines from standard input, applies the
/// replace rules, segments each line into words and prints it with the word
/// delimiter inserted between words.
///
/// Paths given on the command line take precedence over the per-language
/// defaults provided by `init`. When no language is given, the same defaults
/// as for `Lang::Thai` are used. The delimiter defaults to `"|"`.
///
/// # Panics
///
/// Panics if the dictionary, the cluster rules or the replace rules cannot be
/// loaded, or if a line cannot be read from standard input.
fn main() {
    let args = Args::parse();
    let lang = args.lang;

    let dict_path = match args.dict_path.as_deref() {
        Some(dict_path) => Path::new(dict_path),
        None => match lang {
            Some(Lang::Lao) => init::lao_path(),
            Some(Lang::Khmer) => init::khmer_dict_path(),
            Some(Lang::Myanmar) => init::myanmar_dict_path(),
            Some(Lang::Thai) | None => init::default_path(),
        },
    };
    let word_delim = args.word_delimiter.as_deref().unwrap_or("|");
    let dict = init::load_dict(dict_path).expect("Load dictionary");

    // An explicit path wins; otherwise fall back to the language default,
    // which may itself be absent.
    let cluster_rule_path = args.cluster_rules_path.or_else(|| match lang {
        Some(Lang::Lao) => init::lao_clusters_path(),
        Some(Lang::Khmer) => init::khmer_clusters_path(),
        Some(Lang::Myanmar) => init::myanmar_clusters_path(),
        Some(Lang::Thai) | None => init::thai_cluster_path(),
    });

    let wordcut = match cluster_rule_path {
        Some(cluster_rule_path) => {
            let cluster_re =
                init::wordcut_engine::load_cluster_rules(Path::new(&cluster_rule_path))
                    .expect("Load cluster rules");
            init::Wordcut::new_with_cluster_re(dict, cluster_re)
        }
        None => init::Wordcut::new(dict),
    };

    // Only Thai (or no language) has default replace rules. The variants are
    // listed explicitly so that adding a language forces a decision here.
    let replace_rules_path = args.replace_rules_path.or_else(|| match lang {
        Some(Lang::Thai) | None => init::thai_replace_rules_path(),
        Some(Lang::Lao) | Some(Lang::Khmer) | Some(Lang::Myanmar) => None,
    });

    let replace_rules = match replace_rules_path {
        Some(replace_rules_path) => {
            replacer::load_imm_rules(&replace_rules_path).expect("Load replace rules")
        }
        None => vec![],
    };

    for line_opt in io::stdin().lock().lines() {
        // `lines()` already strips the trailing line terminator, so the line
        // is used as is, without a further trim or copy.
        let line = line_opt.unwrap_or_else(|e| panic!("Cannot read line {}", e));
        let mod_line = replacer::replace(&replace_rules, &line);
        let segmented_string = wordcut.put_delimiters(&mod_line, word_delim);
        println!("{}", segmented_string);
    }
}