// jpreprocess_dictionary/tokenizer/default.rs
use jpreprocess_core::{word_entry::WordEntry, JPreprocessResult};
2use lindera_dictionary::dictionary::prefix_dictionary::PrefixDictionary;
3
4use super::{
5 identify_dictionary::DictionaryIdent,
6 jpreprocess::{JPreprocessToken, JPreprocessTokenizer},
7 Token, Tokenizer,
8};
9
/// Tokenizer that wraps a Lindera tokenizer and, per token, selects the
/// word-entry decoding scheme matching the dictionary the token came from.
pub struct DefaultTokenizer {
    /// Underlying Lindera tokenizer that performs the actual segmentation.
    lindera_tokenizer: lindera::tokenizer::Tokenizer,
    /// Entry format detected for the system (built-in) dictionary.
    system: TokenizerType,
    /// Entry format detected for the user dictionary, if one is attached.
    user: Option<TokenizerType>,
}
15
/// Word-entry format a dictionary was built with, as detected from its raw
/// word data via `DictionaryIdent`.
enum TokenizerType {
    /// Dictionary entries are in JPreprocess's serialized format.
    JPreprocessTokenizer,
    /// Dictionary entries are in plain Lindera format.
    LinderaTokenizer,
}
20
21impl DefaultTokenizer {
22 pub fn new(tokenizer: lindera::tokenizer::Tokenizer) -> Self {
23 fn identify_tokenizer(prefix_dictionary: &PrefixDictionary) -> TokenizerType {
24 let ident = DictionaryIdent::from_idx_data(
25 &prefix_dictionary.words_idx_data,
26 &prefix_dictionary.words_data,
27 );
28 match ident {
29 DictionaryIdent::JPreprocess => TokenizerType::JPreprocessTokenizer,
30 DictionaryIdent::Lindera => TokenizerType::LinderaTokenizer,
31 }
32 }
33
34 Self {
35 system: identify_tokenizer(&tokenizer.segmenter.dictionary.prefix_dictionary),
36 user: tokenizer
37 .segmenter
38 .user_dictionary
39 .as_ref()
40 .map(|d| identify_tokenizer(&d.dict)),
41 lindera_tokenizer: tokenizer,
42 }
43 }
44}
45
46impl Tokenizer for DefaultTokenizer {
47 fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>> {
48 let tokens = self.lindera_tokenizer.tokenize(text)?;
49
50 tokens
51 .into_iter()
52 .map(|token| {
53 if token.word_id.is_unknown() {
54 Ok(DefaultToken::from_token(token))
55 } else if token.word_id.is_system() {
56 match self.system {
57 TokenizerType::JPreprocessTokenizer => {
58 Ok(DefaultToken::from_token(JPreprocessToken::new(
59 token.surface,
60 JPreprocessTokenizer::get_word_from_prefixdict(
61 &token.dictionary.prefix_dictionary,
62 token.word_id,
63 )?,
64 )))
65 }
66 TokenizerType::LinderaTokenizer => Ok(DefaultToken::from_token(token)),
67 }
68 } else {
69 match self.user {
70 Some(TokenizerType::JPreprocessTokenizer) => {
71 Ok(DefaultToken::from_token(JPreprocessToken::new(
72 token.surface,
73 JPreprocessTokenizer::get_word_from_prefixdict(
74 &token.user_dictionary.as_ref().unwrap().dict,
75 token.word_id,
76 )?,
77 )))
78 }
79 Some(TokenizerType::LinderaTokenizer) => {
80 Ok(DefaultToken::from_token(token))
81 }
82 None => Ok(DefaultToken::from_token(token)),
83 }
84 }
85 })
86 .collect()
87 }
88}
89
/// Type-erased token wrapper so `tokenize` can return one concrete type even
/// though tokens are produced by two different decoding paths.
struct DefaultToken<'a> {
    /// Boxed inner token; `'a` ties it to the borrows held by the wrapped
    /// token (the input text and tokenizer state).
    inner: Box<dyn 'a + Token>,
}
93
94impl<'a> DefaultToken<'a> {
95 fn from_token(inner: impl 'a + Token) -> Self {
96 DefaultToken {
97 inner: Box::new(inner),
98 }
99 }
100}
101
impl Token for DefaultToken<'_> {
    /// Delegates to the wrapped token's `fetch`, yielding the surface string
    /// and its decoded `WordEntry`.
    fn fetch(&mut self) -> JPreprocessResult<(&str, WordEntry)> {
        self.inner.fetch()
    }
}