Module vibrato::trainer

source ·
Expand description

Module for training models.

Examples

use std::fs::File;
use vibrato::trainer::{Corpus, Trainer, TrainerConfig};
use vibrato::{SystemDictionaryBuilder, Tokenizer};

// Step 1: open the five definition files and build the trainer configuration.
// The readers are passed in the order: lexicon, character properties,
// unknown-word handler, feature templates, rewrite rules.
let lex_file = File::open("src/tests/resources/train_lex.csv")?;
let char_file = File::open("src/tests/resources/char.def")?;
let unk_file = File::open("src/tests/resources/train_unk.def")?;
let feature_file = File::open("src/tests/resources/feature.def")?;
let rewrite_file = File::open("src/tests/resources/rewrite.def")?;
let trainer_config =
    TrainerConfig::from_readers(lex_file, char_file, unk_file, feature_file, rewrite_file)?;

// Step 2: create the trainer and apply the training options one at a time.
let trainer = Trainer::new(trainer_config)?;
let trainer = trainer.regularization_cost(0.01);
let trainer = trainer.max_iter(300);
let trainer = trainer.num_threads(20);

// Step 3: read the training corpus.
let training_corpus = Corpus::from_reader(File::open("src/tests/resources/corpus.txt")?)?;

// Step 4: in-memory buffers that receive the trained dictionary data.
let mut lexicon_out: Vec<u8> = Vec::new();
let mut connector_out: Vec<u8> = Vec::new();
let mut unk_out: Vec<u8> = Vec::new();
let mut user_lexicon_out: Vec<u8> = Vec::new();

// Step 5: train, then serialize the resulting model into the buffers.
let mut trained_model = trainer.train(training_corpus)?;
trained_model.write_dictionary(
    &mut lexicon_out,
    &mut connector_out,
    &mut unk_out,
    &mut user_lexicon_out,
)?;

// Step 6: build a system dictionary from the trained data. The character
// definition file is opened a second time because the first reader was
// consumed by the trainer configuration.
let trained_dict = SystemDictionaryBuilder::from_readers(
    lexicon_out.as_slice(),
    connector_out.as_slice(),
    File::open("src/tests/resources/char.def")?,
    unk_out.as_slice(),
)?;

// Step 7: tokenize a sample sentence with the freshly trained dictionary.
let tokenizer = Tokenizer::new(trained_dict);
let mut worker = tokenizer.new_worker();
worker.reset_sentence("外国人参政権");
worker.tokenize();

// Expected segmentation: 外国/人/参政/権
assert_eq!(worker.num_tokens(), 4);

Structs

  • Corpus — Representation of a corpus.
  • Example — Representation of a sentence.
  • Model — Tokenization model.
  • Trainer — Trainer of a morphological analyzer.
  • TrainerConfig — Configuration for a trainer.
  • Word — Representation of a pair of a surface and features.