lindera_filter/
token_filter.rs1pub mod japanese_base_form;
2pub mod japanese_compound_word;
3pub mod japanese_kana;
4pub mod japanese_katakana_stem;
5pub mod japanese_keep_tags;
6pub mod japanese_number;
7pub mod japanese_reading_form;
8pub mod japanese_stop_tags;
9pub mod keep_words;
10pub mod korean_keep_tags;
11pub mod korean_reading_form;
12pub mod korean_stop_tags;
13pub mod length;
14pub mod lowercase;
15pub mod mapping;
16pub mod stop_words;
17pub mod uppercase;
18
19use serde_json::Value;
20use std::ops::Deref;
21
22use lindera_core::error::LinderaErrorKind;
23use lindera_core::LinderaResult;
24
25use crate::parse_cli_flag;
26use crate::token::Token;
27use crate::token_filter::japanese_base_form::{
28 JapaneseBaseFormTokenFilter, JapaneseBaseFormTokenFilterConfig,
29 JAPANESE_BASE_FORM_TOKEN_FILTER_NAME,
30};
31use crate::token_filter::japanese_compound_word::{
32 JapaneseCompoundWordTokenFilter, JapaneseCompoundWordTokenFilterConfig,
33 JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME,
34};
35use crate::token_filter::japanese_kana::{
36 JapaneseKanaTokenFilter, JapaneseKanaTokenFilterConfig, JAPANESE_KANA_TOKEN_FILTER_NAME,
37};
38use crate::token_filter::japanese_katakana_stem::{
39 JapaneseKatakanaStemTokenFilter, JapaneseKatakanaStemTokenFilterConfig,
40 JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME,
41};
42use crate::token_filter::japanese_keep_tags::{
43 JapaneseKeepTagsTokenFilter, JapaneseKeepTagsTokenFilterConfig,
44 JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME,
45};
46use crate::token_filter::japanese_number::{
47 JapaneseNumberTokenFilter, JapaneseNumberTokenFilterConfig, JAPANESE_NUMBER_TOKEN_FILTER_NAME,
48};
49use crate::token_filter::japanese_reading_form::{
50 JapaneseReadingFormTokenFilter, JapaneseReadingFormTokenFilterConfig,
51 JAPANESE_READING_FORM_TOKEN_FILTER_NAME,
52};
53use crate::token_filter::japanese_stop_tags::{
54 JapaneseStopTagsTokenFilter, JapaneseStopTagsTokenFilterConfig,
55 JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME,
56};
57use crate::token_filter::keep_words::{
58 KeepWordsTokenFilter, KeepWordsTokenFilterConfig, KEEP_WORDS_TOKEN_FILTER_NAME,
59};
60use crate::token_filter::korean_keep_tags::{
61 KoreanKeepTagsTokenFilter, KoreanKeepTagsTokenFilterConfig, KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME,
62};
63use crate::token_filter::korean_reading_form::{
64 KoreanReadingFormTokenFilter, KOREAN_READING_FORM_TOKEN_FILTER_NAME,
65};
66use crate::token_filter::korean_stop_tags::{
67 KoreanStopTagsTokenFilter, KoreanStopTagsTokenFilterConfig, KOREAN_STOP_TAGS_TOKEN_FILTER_NAME,
68};
69use crate::token_filter::length::{
70 LengthTokenFilter, LengthTokenFilterConfig, LENGTH_TOKEN_FILTER_NAME,
71};
72use crate::token_filter::lowercase::{LowercaseTokenFilter, LOWERCASE_TOKEN_FILTER_NAME};
73use crate::token_filter::mapping::{
74 MappingTokenFilter, MappingTokenFilterConfig, MAPPING_TOKEN_FILTER_NAME,
75};
76use crate::token_filter::stop_words::{
77 StopWordsTokenFilter, StopWordsTokenFilterConfig, STOP_WORDS_TOKEN_FILTER_NAME,
78};
79use crate::token_filter::uppercase::{UppercaseTokenFilter, UPPERCASE_TOKEN_FILTER_NAME};
80
81pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
82 fn name(&self) -> &str;
83 fn apply(&self, tokens: &mut Vec<Token>) -> LinderaResult<()>;
84}
85
86pub struct BoxTokenFilter(Box<dyn TokenFilter + 'static + Send + Sync>);
87
88impl Deref for BoxTokenFilter {
89 type Target = dyn TokenFilter;
90
91 fn deref(&self) -> &dyn TokenFilter {
92 &*self.0
93 }
94}
95
96impl<T: TokenFilter> From<T> for BoxTokenFilter {
97 fn from(token_filter: T) -> BoxTokenFilter {
98 BoxTokenFilter(Box::new(token_filter))
99 }
100}
101
102pub trait TokenFilterClone {
103 fn box_clone(&self) -> BoxTokenFilter;
104}
105
106impl<T: TokenFilter + Clone + 'static> TokenFilterClone for T {
107 fn box_clone(&self) -> BoxTokenFilter {
108 BoxTokenFilter::from(self.clone())
109 }
110}
111
/// Stateless namespace for the functions that construct a [`BoxTokenFilter`] from a filter
/// name plus configuration.
pub struct TokenFilterLoader {}
113
114impl TokenFilterLoader {
115 pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxTokenFilter> {
116 let token_filter = match kind {
117 JAPANESE_BASE_FORM_TOKEN_FILTER_NAME => {
118 let config = JapaneseBaseFormTokenFilterConfig::from_value(value)?;
119 BoxTokenFilter::from(JapaneseBaseFormTokenFilter::new(config))
120 }
121 JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME => {
122 let config = JapaneseCompoundWordTokenFilterConfig::from_value(value)?;
123 BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::new(config))
124 }
125 JAPANESE_KANA_TOKEN_FILTER_NAME => {
126 let config = JapaneseKanaTokenFilterConfig::from_value(value)?;
127 BoxTokenFilter::from(JapaneseKanaTokenFilter::new(config))
128 }
129 JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME => {
130 let config = JapaneseKatakanaStemTokenFilterConfig::from_value(value)?;
131 BoxTokenFilter::from(JapaneseKatakanaStemTokenFilter::new(config))
132 }
133 JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME => {
134 let config = JapaneseKeepTagsTokenFilterConfig::from_value(value)?;
135 BoxTokenFilter::from(JapaneseKeepTagsTokenFilter::new(config))
136 }
137 JAPANESE_NUMBER_TOKEN_FILTER_NAME => {
138 let config = JapaneseNumberTokenFilterConfig::from_value(value)?;
139 BoxTokenFilter::from(JapaneseNumberTokenFilter::new(config))
140 }
141 JAPANESE_READING_FORM_TOKEN_FILTER_NAME => {
142 let config = JapaneseReadingFormTokenFilterConfig::from_value(value)?;
143 BoxTokenFilter::from(JapaneseReadingFormTokenFilter::new(config))
144 }
145 JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME => {
146 let config = JapaneseStopTagsTokenFilterConfig::from_value(value)?;
147 BoxTokenFilter::from(JapaneseStopTagsTokenFilter::new(config))
148 }
149 KEEP_WORDS_TOKEN_FILTER_NAME => {
150 let config = KeepWordsTokenFilterConfig::from_value(value)?;
151 BoxTokenFilter::from(KeepWordsTokenFilter::new(config))
152 }
153 KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME => {
154 let config = KoreanKeepTagsTokenFilterConfig::from_value(value)?;
155 BoxTokenFilter::from(KoreanKeepTagsTokenFilter::new(config))
156 }
157 KOREAN_READING_FORM_TOKEN_FILTER_NAME => {
158 BoxTokenFilter::from(KoreanReadingFormTokenFilter::new())
159 }
160 KOREAN_STOP_TAGS_TOKEN_FILTER_NAME => {
161 let config = KoreanStopTagsTokenFilterConfig::from_value(value)?;
162 BoxTokenFilter::from(KoreanStopTagsTokenFilter::new(config))
163 }
164 LENGTH_TOKEN_FILTER_NAME => {
165 let config = LengthTokenFilterConfig::from_value(value)?;
166 BoxTokenFilter::from(LengthTokenFilter::new(config))
167 }
168 LOWERCASE_TOKEN_FILTER_NAME => BoxTokenFilter::from(LowercaseTokenFilter::new()),
169 MAPPING_TOKEN_FILTER_NAME => {
170 let config = MappingTokenFilterConfig::from_value(value)?;
171 BoxTokenFilter::from(MappingTokenFilter::new(config)?)
172 }
173 STOP_WORDS_TOKEN_FILTER_NAME => {
174 let config = StopWordsTokenFilterConfig::from_value(value)?;
175 BoxTokenFilter::from(StopWordsTokenFilter::new(config))
176 }
177 UPPERCASE_TOKEN_FILTER_NAME => BoxTokenFilter::from(UppercaseTokenFilter::new()),
178 _ => {
179 return Err(LinderaErrorKind::Deserialize
180 .with_error(anyhow::anyhow!("unsupported token filter: {}", kind)));
181 }
182 };
183
184 Ok(token_filter)
185 }
186
187 pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxTokenFilter> {
188 let (kind, args) = parse_cli_flag(cli_flag)?;
189
190 let character_filter = Self::load_from_value(kind, &args)?;
191
192 Ok(character_filter)
193 }
194}