Expand description
§Vibrato-rkyv
Vibrato is a fast implementation of tokenization (or morphological analysis) based on the Viterbi algorithm.
§Examples
use vibrato_rkyv::{Dictionary, SystemDictionaryBuilder, Tokenizer};
let lexicon_csv = "京都,4,4,5,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5
東京都,5,5,9,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,*";
let matrix_def = "10 10\n0 4 -5\n0 5 -9";
let char_def = "DEFAULT 0 1 0";
let unk_def = "DEFAULT,0,0,100,DEFAULT,名詞,普通名詞,*,*,*,*,*,*,*,*,*,*,*,*";
let dict = SystemDictionaryBuilder::from_readers(
lexicon_csv.as_bytes(),
matrix_def.as_bytes(),
char_def.as_bytes(),
unk_def.as_bytes(),
)?;
let tokenizer = Tokenizer::from_inner(dict);
let mut worker = tokenizer.new_worker();
worker.reset_sentence("京都東京都");
worker.tokenize();
assert_eq!(worker.num_tokens(), 2);
let t0 = worker.token(0);
assert_eq!(t0.surface(), "京都");
assert_eq!(t0.range_char(), 0..2);
assert_eq!(t0.range_byte(), 0..6);
assert_eq!(t0.feature(), "京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5");
let t1 = worker.token(1);
assert_eq!(t1.surface(), "東京都");
assert_eq!(t1.range_char(), 2..5);
assert_eq!(t1.range_byte(), 6..15);
assert_eq!(t1.feature(), "東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,*");
Re-exports§
pub use dictionary::CacheStrategy;
pub use dictionary::Dictionary;
pub use dictionary::LoadMode;
pub use dictionary::SystemDictionaryBuilder;
pub use tokenizer::Tokenizer;
Modules§
- common
- Common settings in Vibrato.
- dictionary
- Dictionary for tokenization.
- errors
- Definition of errors.
- mecab
- Utilities to support MeCab models. (Available on crate feature `train` only.)
- token
- Container of resultant tokens.
- tokenizer
- Viterbi-based tokenizer.
- trainer
- Module for training models. (Available on crate feature `train` only.)
Constants§
- VERSION
- Version number of this library.