1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
//! IPADIC data bundled for the yoin tokenizer.
//!
//! The dictionary files under `../data/` are embedded into the binary with
//! `include_bytes!`, and this module exposes helpers that wrap those bytes in
//! the corresponding `yoin_core` types.

extern crate yoin_core as core;

use core::dic::{FstDic, Matrix};
use core::dic::unknown::CompiledUnkDic;
use core::sysdic::SysDic;
use core::tokenizer::Tokenizer;

/// Raw contents of `../data/ipadic.dic`, embedded at compile time.
pub const BYTECODE: &'static [u8] = include_bytes!("../data/ipadic.dic");
/// Raw contents of `../data/ipadic.morph`, embedded at compile time.
pub const MORPHS: &'static [u8] = include_bytes!("../data/ipadic.morph");
/// Raw contents of `../data/ipadic.matrix`, embedded at compile time.
pub const MATRIX: &'static [u8] = include_bytes!("../data/ipadic.matrix");
/// Raw contents of `../data/ipadic.unk`, embedded at compile time.
///
/// The name is a misspelling of "UNKNOWN"; it is kept as-is because it is
/// part of the public API.
pub const UNKOWN: &'static [u8] = include_bytes!("../data/ipadic.unk");

/// Builds an `FstDic` over the embedded `BYTECODE` and `MORPHS` bytes.
pub fn dictionary() -> FstDic<&'static [u8]> {
    // SAFETY: NOTE(review) - the contract of `FstDic::from_bytes` is not
    // visible from this file. This presumably relies on the embedded files
    // having been produced by the matching yoin dictionary builder; confirm.
    unsafe { FstDic::from_bytes(BYTECODE, MORPHS) }
}

/// Decodes the embedded `MATRIX` bytes into a `Matrix`.
pub fn matrix() -> Matrix<&'static [i16]> {
    // SAFETY: NOTE(review) - the result is viewed as `&'static [i16]`, so
    // `Matrix::decode` presumably requires `MATRIX` to be laid out (and
    // aligned) the way the matrix encoder wrote it; confirm against
    // `yoin_core`.
    unsafe { Matrix::decode(MATRIX) }
}

/// Decodes the embedded `UNKOWN` bytes into a `CompiledUnkDic`.
///
/// The function name carries the same "unkown" misspelling as the constant;
/// it is kept for backward compatibility.
pub fn unkown_dic() -> CompiledUnkDic<'static> {
    // SAFETY: NOTE(review) - the contract of `CompiledUnkDic::decode` is not
    // visible from this file. This presumably relies on `UNKOWN` being the
    // output of the matching unknown-dictionary compiler; confirm.
    unsafe { CompiledUnkDic::decode(UNKOWN) }
}

/// Assembles a `SysDic` from the embedded dictionary, connection matrix and
/// unknown-word dictionary.
pub fn sysdic() -> SysDic {
    SysDic {
        dic: dictionary(),
        matrix: matrix(),
        unknown_dic: unkown_dic(),
    }
}

/// Creates a `Tokenizer` backed by the bundled system dictionary.
pub fn tokenizer<'a>() -> Tokenizer<'a> {
    Tokenizer::new(sysdic())
}

#[cfg(test)]
mod tests {
    use super::*;
    use core::dic::unknown::{UnknownDic, CharCategorize, Category};

    #[test]
    fn test_unknown_dic() {
        let dic = unkown_dic();

        let cate = dic.categorize('ビ');
        assert_eq!(cate,
                   Category {
                       invoke: true,
                       group: true,
                       length: 2,
                   });

        let id = dic.category_id('ビ');
        for e in dic.fetch_entries(id) {
            assert!(e.contents.contains("名詞") || e.contents.contains("感動詞"),
                    "KATAKANA entry should be either '名詞' or '感動詞', got: {:?}",
                    e);
        }

        // Every ASCII digit must map to the same category as '0'.
        let zero_category = dic.category_id('0');
        for digit in 0..10 {
            let c = ::std::char::from_digit(digit, 10).unwrap();
            assert_eq!(dic.category_id(c), zero_category);
        }
    }

    #[test]
    fn test_tokenize() {
        let input = "すもももももももものうち";
        let expected = vec!["すもも", "も", "もも", "も", "もも", "の", "うち"];

        let tokenizer = tokenizer();
        let tokens = tokenizer.tokenize(input);

        // `zip` stops at the shorter side, so without this guard a tokenizer
        // that returned too few tokens (or none at all) would pass the loop
        // below without a single comparison being made.
        assert!(tokens.len() >= expected.len(),
                "expected at least {} tokens, got {}",
                expected.len(),
                tokens.len());

        for (tok, e) in tokens.iter().zip(expected) {
            assert_eq!(tok.surface(), e);
            // The reported byte range must slice the same surface out of the input.
            assert_eq!(&input[tok.start()..tok.end()], e);
        }
    }
}