//! jpreprocess_dictionary/tokenizer/mod.rs — tokenizer abstraction layer.

use jpreprocess_core::{word_entry::WordEntry, JPreprocessResult};
use lindera_dictionary::dictionary::UNK;

4pub mod default;
5mod identify_dictionary;
6pub mod jpreprocess;
7
8pub trait Tokenizer {
9 fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>>;
10}
11
12pub trait Token {
13 fn fetch(&mut self) -> JPreprocessResult<(&str, WordEntry)>;
14}
15
16impl Tokenizer for lindera::tokenizer::Tokenizer {
17 fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>> {
18 Ok(self.tokenize(text)?)
19 }
20}
21
22impl Token for lindera::token::Token<'_> {
23 fn fetch(&mut self) -> JPreprocessResult<(&str, WordEntry)> {
24 let mut details = self.details();
25 let entry = if details == *UNK {
26 WordEntry::default()
27 } else {
28 details.resize(12, "");
29 WordEntry::load(&details)?
30 };
31
32 Ok((&self.surface, entry))
33 }
34}