mecab-ko-core 0.7.2

한국어 형태소 분석 핵심 엔진 - Lattice, Viterbi, 토크나이저
Documentation
//! Analyze NR (numeral) tagging errors against a sample TSV dataset.

use mecab_ko_core::evaluate::TestDataset;
use mecab_ko_core::sejong::SejongConverter;
use mecab_ko_core::tokenizer::Tokenizer;

/// Lists gold `NR` tokens from the sample dataset that the pipeline does not tag as `NR`.
///
/// Loads the dictionary and the evaluation TSV from hard-coded relative paths,
/// tokenizes each sentence, converts the tokens with `SejongConverter`, and
/// compares the converted tokens against the gold tokens **by position**.
/// Every gold `NR` token whose same-index prediction is missing or does not
/// end in `/NR` is collected and printed as `text: gold -> pred`.
///
/// # Errors
///
/// Returns the error from `Tokenizer::with_dict` or `TestDataset::from_tsv`
/// if the dictionary or dataset cannot be loaded.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dict_path = "../../../data/mecab-ko-dic-2.1.1-20180720";
    let mut tokenizer = Tokenizer::with_dict(dict_path)?;
    let converter = SejongConverter::new();
    let dataset = TestDataset::from_tsv("../../../data/eval/sample.tsv")?;

    // NR error analysis: collect (sentence text, gold surface, prediction) triples.
    let mut errors = Vec::new();
    for sentence in &dataset.sentences {
        let tokens = tokenizer.tokenize(&sentence.text);
        let sejong_tokens = converter.convert_tokens(&tokens);

        // NOTE(review): gold and predicted tokens are paired purely by index, so a
        // segmentation difference earlier in the sentence shifts every later
        // comparison — confirm this is acceptable for the analysis.
        for (i, gold) in sentence.tokens.iter().enumerate() {
            if gold.pos != "NR" {
                continue;
            }

            // Checked lookup: the prediction may have fewer tokens than the gold.
            let pred = sejong_tokens.get(i).map_or_else(
                || "MISSING".to_string(),
                |token| format!("{}/{}", token.surface, token.pos),
            );

            if !pred.ends_with("/NR") {
                errors.push((sentence.text.clone(), gold.surface.clone(), pred));
            }
        }
    }

    println!("NR 오류 {}개:\n", errors.len());
    for (text, gold, pred) in &errors {
        // Separate the gold surface from the prediction; without a delimiter the
        // two strings run together and the report is ambiguous.
        println!("{text}: {gold} -> {pred}");
    }

    Ok(())
}