jpreprocess_dictionary/tokenizer/default.rs

use jpreprocess_core::{word_entry::WordEntry, JPreprocessResult};
use lindera_dictionary::dictionary::prefix_dictionary::PrefixDictionary;

use super::{
    identify_dictionary::DictionaryIdent,
    jpreprocess::{JPreprocessToken, JPreprocessTokenizer},
    Token, Tokenizer,
};

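/// Tokenizer that wraps a [`lindera::tokenizer::Tokenizer`] and decodes each
/// token with the word-entry format of the dictionary it came from: either
/// the JPreprocess binary format or the plain Lindera format.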
pub struct DefaultTokenizer {
    lindera_tokenizer: lindera::tokenizer::Tokenizer,
    system: TokenizerType,
    user: Option<TokenizerType>,
}

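/// Which word-entry format a dictionary was detected to contain.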
enum TokenizerType {
    JPreprocessTokenizer,
    LinderaTokenizer,
}

impl DefaultTokenizer {
    pub fn new(tokenizer: lindera::tokenizer::Tokenizer) -> Self {
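        // Helper: inspect a prefix dictionary's raw word data to decide
        // which entry format it stores.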
        fn identify_tokenizer(prefix_dictionary: &PrefixDictionary) -> TokenizerType {
            let ident = DictionaryIdent::from_idx_data(
                &prefix_dictionary.words_idx_data,
                &prefix_dictionary.words_data,
            );
            match ident {
                DictionaryIdent::JPreprocess => TokenizerType::JPreprocessTokenizer,
                DictionaryIdent::Lindera => TokenizerType::LinderaTokenizer,
            }
        }

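        // Detect the format of the system and (optional) user dictionaries
        // once, at construction time, so tokenization can branch cheaply.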
        Self {
            system: identify_tokenizer(&tokenizer.segmenter.dictionary.prefix_dictionary),
            user: tokenizer
                .segmenter
                .user_dictionary
                .as_ref()
                .map(|d| identify_tokenizer(&d.dict)),
            lindera_tokenizer: tokenizer,
        }
    }
}

impl Tokenizer for DefaultTokenizer {
    fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>> {
        let tokens = self.lindera_tokenizer.tokenize(text)?;

        tokens
            .into_iter()
            .map(|token| {
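                // Unknown words have no dictionary entry to decode, so the
                // Lindera token is passed through as-is.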
                if token.word_id.is_unknown() {
                    Ok(DefaultToken::from_token(token))
                } else if token.word_id.is_system() {
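                    // The token comes from the system dictionary: decode it
                    // with the format detected at construction time.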
                    match self.system {
                        TokenizerType::JPreprocessTokenizer => {
                            Ok(DefaultToken::from_token(JPreprocessToken::new(
                                token.surface,
                                JPreprocessTokenizer::get_word_from_prefixdict(
                                    &token.dictionary.prefix_dictionary,
                                    token.word_id,
                                )?,
                            )))
                        }
                        TokenizerType::LinderaTokenizer => Ok(DefaultToken::from_token(token)),
                    }
                } else {
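                    // The remaining word ids come from the user dictionary;
                    // decode with its detected format, if one was loaded.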
                    match self.user {
                        Some(TokenizerType::JPreprocessTokenizer) => {
                            Ok(DefaultToken::from_token(JPreprocessToken::new(
                                token.surface,
                                JPreprocessTokenizer::get_word_from_prefixdict(
                                    &token.user_dictionary.as_ref().unwrap().dict,
                                    token.word_id,
                                )?,
                            )))
                        }
                        Some(TokenizerType::LinderaTokenizer) => {
                            Ok(DefaultToken::from_token(token))
                        }
                        None => Ok(DefaultToken::from_token(token)),
                    }
                }
            })
            .collect()
    }
}

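/// Type-erasing wrapper that lets `tokenize` return one concrete token type
/// even though the branches above produce different underlying tokens.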
struct DefaultToken<'a> {
    inner: Box<dyn 'a + Token>,
}

impl<'a> DefaultToken<'a> {
    fn from_token(inner: impl 'a + Token) -> Self {
        DefaultToken {
            inner: Box::new(inner),
        }
    }
}

impl Token for DefaultToken<'_> {
    fn fetch(&mut self) -> JPreprocessResult<(&str, WordEntry)> {
        self.inner.fetch()
    }
}
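
// A minimal usage sketch (not part of the original file). It assumes a
// `lindera::tokenizer::Tokenizer` named `lindera_tokenizer` has already been
// built elsewhere, e.g. from a JPreprocess-format system dictionary:
//
//     let tokenizer = DefaultTokenizer::new(lindera_tokenizer);
//     let mut tokens = tokenizer.tokenize("日本語のテキスト")?;
//     for token in &mut tokens {
//         let (surface, entry) = token.fetch()?;
//         // `surface` is the matched substring; `entry` is its WordEntry,
//         // decoded with the format detected for the source dictionary.
//     }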