jpreprocess_dictionary/tokenizer/mod.rs

1use jpreprocess_core::{word_entry::WordEntry, JPreprocessResult};
2use lindera_dictionary::dictionary::UNK;
3
4pub mod default;
5mod identify_dictionary;
6pub mod jpreprocess;
7
/// Splits input text into a sequence of [`Token`]s.
///
/// Implemented for Lindera's tokenizer in this module; see also the
/// [`default`] and [`jpreprocess`] submodules for other implementations.
pub trait Tokenizer {
    /// Tokenizes `text`. Each returned token may borrow from both the
    /// tokenizer and `text`, so neither may outlive lifetime `'a`.
    fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>>;
}
11
/// A single token produced by a [`Tokenizer`].
pub trait Token {
    /// Returns the token's surface form together with its dictionary entry.
    ///
    /// Takes `&mut self` because fetching may populate details lazily
    /// inside the underlying token — NOTE(review): inferred from the
    /// lindera impl below calling `self.details()`; confirm for other impls.
    fn fetch(&mut self) -> JPreprocessResult<(&str, WordEntry)>;
}
15
16impl Tokenizer for lindera::tokenizer::Tokenizer {
17    fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>> {
18        Ok(self.tokenize(text)?)
19    }
20}
21
22impl Token for lindera::token::Token<'_> {
23    fn fetch(&mut self) -> JPreprocessResult<(&str, WordEntry)> {
24        let mut details = self.details();
25        let entry = if details == *UNK {
26            WordEntry::default()
27        } else {
28            details.resize(12, "");
29            WordEntry::load(&details)?
30        };
31
32        Ok((&self.surface, entry))
33    }
34}