lindera_filter/
token_filter.rs

1pub mod japanese_base_form;
2pub mod japanese_compound_word;
3pub mod japanese_kana;
4pub mod japanese_katakana_stem;
5pub mod japanese_keep_tags;
6pub mod japanese_number;
7pub mod japanese_reading_form;
8pub mod japanese_stop_tags;
9pub mod keep_words;
10pub mod korean_keep_tags;
11pub mod korean_reading_form;
12pub mod korean_stop_tags;
13pub mod length;
14pub mod lowercase;
15pub mod mapping;
16pub mod stop_words;
17pub mod uppercase;
18
19use serde_json::Value;
20use std::ops::Deref;
21
22use lindera_core::error::LinderaErrorKind;
23use lindera_core::LinderaResult;
24
25use crate::parse_cli_flag;
26use crate::token::Token;
27use crate::token_filter::japanese_base_form::{
28    JapaneseBaseFormTokenFilter, JapaneseBaseFormTokenFilterConfig,
29    JAPANESE_BASE_FORM_TOKEN_FILTER_NAME,
30};
31use crate::token_filter::japanese_compound_word::{
32    JapaneseCompoundWordTokenFilter, JapaneseCompoundWordTokenFilterConfig,
33    JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME,
34};
35use crate::token_filter::japanese_kana::{
36    JapaneseKanaTokenFilter, JapaneseKanaTokenFilterConfig, JAPANESE_KANA_TOKEN_FILTER_NAME,
37};
38use crate::token_filter::japanese_katakana_stem::{
39    JapaneseKatakanaStemTokenFilter, JapaneseKatakanaStemTokenFilterConfig,
40    JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME,
41};
42use crate::token_filter::japanese_keep_tags::{
43    JapaneseKeepTagsTokenFilter, JapaneseKeepTagsTokenFilterConfig,
44    JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME,
45};
46use crate::token_filter::japanese_number::{
47    JapaneseNumberTokenFilter, JapaneseNumberTokenFilterConfig, JAPANESE_NUMBER_TOKEN_FILTER_NAME,
48};
49use crate::token_filter::japanese_reading_form::{
50    JapaneseReadingFormTokenFilter, JapaneseReadingFormTokenFilterConfig,
51    JAPANESE_READING_FORM_TOKEN_FILTER_NAME,
52};
53use crate::token_filter::japanese_stop_tags::{
54    JapaneseStopTagsTokenFilter, JapaneseStopTagsTokenFilterConfig,
55    JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME,
56};
57use crate::token_filter::keep_words::{
58    KeepWordsTokenFilter, KeepWordsTokenFilterConfig, KEEP_WORDS_TOKEN_FILTER_NAME,
59};
60use crate::token_filter::korean_keep_tags::{
61    KoreanKeepTagsTokenFilter, KoreanKeepTagsTokenFilterConfig, KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME,
62};
63use crate::token_filter::korean_reading_form::{
64    KoreanReadingFormTokenFilter, KOREAN_READING_FORM_TOKEN_FILTER_NAME,
65};
66use crate::token_filter::korean_stop_tags::{
67    KoreanStopTagsTokenFilter, KoreanStopTagsTokenFilterConfig, KOREAN_STOP_TAGS_TOKEN_FILTER_NAME,
68};
69use crate::token_filter::length::{
70    LengthTokenFilter, LengthTokenFilterConfig, LENGTH_TOKEN_FILTER_NAME,
71};
72use crate::token_filter::lowercase::{LowercaseTokenFilter, LOWERCASE_TOKEN_FILTER_NAME};
73use crate::token_filter::mapping::{
74    MappingTokenFilter, MappingTokenFilterConfig, MAPPING_TOKEN_FILTER_NAME,
75};
76use crate::token_filter::stop_words::{
77    StopWordsTokenFilter, StopWordsTokenFilterConfig, STOP_WORDS_TOKEN_FILTER_NAME,
78};
79use crate::token_filter::uppercase::{UppercaseTokenFilter, UPPERCASE_TOKEN_FILTER_NAME};
80
81pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
82    fn name(&self) -> &str;
83    fn apply(&self, tokens: &mut Vec<Token>) -> LinderaResult<()>;
84}
85
86pub struct BoxTokenFilter(Box<dyn TokenFilter + 'static + Send + Sync>);
87
88impl Deref for BoxTokenFilter {
89    type Target = dyn TokenFilter;
90
91    fn deref(&self) -> &dyn TokenFilter {
92        &*self.0
93    }
94}
95
96impl<T: TokenFilter> From<T> for BoxTokenFilter {
97    fn from(token_filter: T) -> BoxTokenFilter {
98        BoxTokenFilter(Box::new(token_filter))
99    }
100}
101
102pub trait TokenFilterClone {
103    fn box_clone(&self) -> BoxTokenFilter;
104}
105
106impl<T: TokenFilter + Clone + 'static> TokenFilterClone for T {
107    fn box_clone(&self) -> BoxTokenFilter {
108        BoxTokenFilter::from(self.clone())
109    }
110}
111
/// Factory for building [`BoxTokenFilter`]s from a filter name plus its
/// configuration.
///
/// The type carries no state; its functionality lives in associated
/// functions (`load_from_value`, `load_from_cli_flag`).
pub struct TokenFilterLoader {}
113
114impl TokenFilterLoader {
115    pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxTokenFilter> {
116        let token_filter = match kind {
117            JAPANESE_BASE_FORM_TOKEN_FILTER_NAME => {
118                let config = JapaneseBaseFormTokenFilterConfig::from_value(value)?;
119                BoxTokenFilter::from(JapaneseBaseFormTokenFilter::new(config))
120            }
121            JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME => {
122                let config = JapaneseCompoundWordTokenFilterConfig::from_value(value)?;
123                BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::new(config))
124            }
125            JAPANESE_KANA_TOKEN_FILTER_NAME => {
126                let config = JapaneseKanaTokenFilterConfig::from_value(value)?;
127                BoxTokenFilter::from(JapaneseKanaTokenFilter::new(config))
128            }
129            JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME => {
130                let config = JapaneseKatakanaStemTokenFilterConfig::from_value(value)?;
131                BoxTokenFilter::from(JapaneseKatakanaStemTokenFilter::new(config))
132            }
133            JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME => {
134                let config = JapaneseKeepTagsTokenFilterConfig::from_value(value)?;
135                BoxTokenFilter::from(JapaneseKeepTagsTokenFilter::new(config))
136            }
137            JAPANESE_NUMBER_TOKEN_FILTER_NAME => {
138                let config = JapaneseNumberTokenFilterConfig::from_value(value)?;
139                BoxTokenFilter::from(JapaneseNumberTokenFilter::new(config))
140            }
141            JAPANESE_READING_FORM_TOKEN_FILTER_NAME => {
142                let config = JapaneseReadingFormTokenFilterConfig::from_value(value)?;
143                BoxTokenFilter::from(JapaneseReadingFormTokenFilter::new(config))
144            }
145            JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME => {
146                let config = JapaneseStopTagsTokenFilterConfig::from_value(value)?;
147                BoxTokenFilter::from(JapaneseStopTagsTokenFilter::new(config))
148            }
149            KEEP_WORDS_TOKEN_FILTER_NAME => {
150                let config = KeepWordsTokenFilterConfig::from_value(value)?;
151                BoxTokenFilter::from(KeepWordsTokenFilter::new(config))
152            }
153            KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME => {
154                let config = KoreanKeepTagsTokenFilterConfig::from_value(value)?;
155                BoxTokenFilter::from(KoreanKeepTagsTokenFilter::new(config))
156            }
157            KOREAN_READING_FORM_TOKEN_FILTER_NAME => {
158                BoxTokenFilter::from(KoreanReadingFormTokenFilter::new())
159            }
160            KOREAN_STOP_TAGS_TOKEN_FILTER_NAME => {
161                let config = KoreanStopTagsTokenFilterConfig::from_value(value)?;
162                BoxTokenFilter::from(KoreanStopTagsTokenFilter::new(config))
163            }
164            LENGTH_TOKEN_FILTER_NAME => {
165                let config = LengthTokenFilterConfig::from_value(value)?;
166                BoxTokenFilter::from(LengthTokenFilter::new(config))
167            }
168            LOWERCASE_TOKEN_FILTER_NAME => BoxTokenFilter::from(LowercaseTokenFilter::new()),
169            MAPPING_TOKEN_FILTER_NAME => {
170                let config = MappingTokenFilterConfig::from_value(value)?;
171                BoxTokenFilter::from(MappingTokenFilter::new(config)?)
172            }
173            STOP_WORDS_TOKEN_FILTER_NAME => {
174                let config = StopWordsTokenFilterConfig::from_value(value)?;
175                BoxTokenFilter::from(StopWordsTokenFilter::new(config))
176            }
177            UPPERCASE_TOKEN_FILTER_NAME => BoxTokenFilter::from(UppercaseTokenFilter::new()),
178            _ => {
179                return Err(LinderaErrorKind::Deserialize
180                    .with_error(anyhow::anyhow!("unsupported token filter: {}", kind)));
181            }
182        };
183
184        Ok(token_filter)
185    }
186
187    pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxTokenFilter> {
188        let (kind, args) = parse_cli_flag(cli_flag)?;
189
190        let character_filter = Self::load_from_value(kind, &args)?;
191
192        Ok(character_filter)
193    }
194}