lindera_filter/token_filter/
length.rs1use serde::{Deserialize, Serialize};
2
3use lindera_core::error::LinderaErrorKind;
4use lindera_core::LinderaResult;
5
6use crate::token::Token;
7use crate::token_filter::TokenFilter;
8
/// Identifier of the length token filter; this is the value reported by
/// `LengthTokenFilter`'s `TokenFilter::name` implementation.
pub const LENGTH_TOKEN_FILTER_NAME: &str = "length";
10
11#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
12pub struct LengthTokenFilterConfig {
13 min: Option<usize>,
14 max: Option<usize>,
15}
16
17impl LengthTokenFilterConfig {
18 pub fn new(min: Option<usize>, max: Option<usize>) -> Self {
19 Self { min, max }
20 }
21
22 pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
23 serde_json::from_slice::<LengthTokenFilterConfig>(data)
24 .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
25 }
26
27 pub fn from_value(value: &serde_json::Value) -> LinderaResult<Self> {
28 serde_json::from_value::<LengthTokenFilterConfig>(value.clone())
29 .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
30 }
31}
32
33#[derive(Clone, Debug)]
36pub struct LengthTokenFilter {
37 config: LengthTokenFilterConfig,
38}
39
40impl LengthTokenFilter {
41 pub fn new(config: LengthTokenFilterConfig) -> Self {
42 Self { config }
43 }
44
45 pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
46 Ok(Self::new(LengthTokenFilterConfig::from_slice(data)?))
47 }
48}
49
50impl TokenFilter for LengthTokenFilter {
51 fn name(&self) -> &'static str {
52 LENGTH_TOKEN_FILTER_NAME
53 }
54
55 fn apply<'a>(&self, tokens: &mut Vec<Token>) -> LinderaResult<()> {
56 tokens.retain(|token| {
57 let len = token.text.chars().count();
58 if let Some(min) = self.config.min {
59 if len < min {
60 return false;
61 }
62 }
63 if let Some(max) = self.config.max {
64 if len > max {
65 return false;
66 }
67 }
68 true
69 });
70
71 Ok(())
72 }
73}
74
#[cfg(test)]
mod tests {
    #[cfg(feature = "ipadic")]
    use lindera_core::word_entry::WordId;

    use crate::token_filter::length::{LengthTokenFilter, LengthTokenFilterConfig};
    #[cfg(feature = "ipadic")]
    use crate::{token::Token, token_filter::TokenFilter};

    /// Builds the owned `details` list of a token from string literals, so the
    /// fixtures below stay readable.
    #[cfg(feature = "ipadic")]
    fn details(fields: &[&str]) -> Vec<String> {
        fields.iter().copied().map(String::from).collect()
    }

    /// Both bounds, only `min`, and only `max` must all deserialize; a missing
    /// key becomes `None`.
    #[test]
    fn test_length_token_filter_config_from_slice() {
        let config_str = r#"
        {
            "min": 1,
            "max": 3
        }
        "#;
        let config = LengthTokenFilterConfig::from_slice(config_str.as_bytes()).unwrap();

        assert_eq!(config.min, Some(1));
        assert_eq!(config.max, Some(3));

        let config_str = r#"
        {
            "min": 1
        }
        "#;
        let config = LengthTokenFilterConfig::from_slice(config_str.as_bytes()).unwrap();

        assert_eq!(config.min, Some(1));
        assert_eq!(config.max, None);

        let config_str = r#"
        {
            "max": 2
        }
        "#;
        let config = LengthTokenFilterConfig::from_slice(config_str.as_bytes()).unwrap();

        assert_eq!(config.min, None);
        assert_eq!(config.max, Some(2));
    }

    /// The filter itself can be built from each of the accepted config shapes.
    #[test]
    fn test_length_token_filter_from_slice() {
        let config_str = r#"
        {
            "min": 1,
            "max": 3
        }
        "#;
        let result = LengthTokenFilter::from_slice(config_str.as_bytes());

        assert!(result.is_ok());

        let config_str = r#"
        {
            "min": 1
        }
        "#;
        let result = LengthTokenFilter::from_slice(config_str.as_bytes());

        assert!(result.is_ok());

        let config_str = r#"
        {
            "max": 2
        }
        "#;
        let result = LengthTokenFilter::from_slice(config_str.as_bytes());

        assert!(result.is_ok());
    }

    /// With `min = 2` and `max = 3`, the single-character particles are dropped
    /// and the two- and three-character tokens survive in their original order.
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_length_token_filter_apply_ipadic() {
        let config_str = r#"
        {
            "min": 2,
            "max": 3
        }
        "#;
        let filter = LengthTokenFilter::from_slice(config_str.as_bytes()).unwrap();

        let mut tokens: Vec<Token> = vec![
            Token {
                text: "すもも".to_string(),
                byte_start: 0,
                byte_end: 9,
                position: 0,
                position_length: 1,
                word_id: WordId(36165, true),
                details: details(&[
                    "名詞",
                    "一般",
                    "*",
                    "*",
                    "*",
                    "*",
                    "すもも",
                    "スモモ",
                    "スモモ",
                ]),
            },
            Token {
                text: "も".to_string(),
                byte_start: 9,
                byte_end: 12,
                position: 1,
                position_length: 1,
                word_id: WordId(73246, true),
                details: details(&["助詞", "係助詞", "*", "*", "*", "*", "も", "モ", "モ"]),
            },
            Token {
                text: "もも".to_string(),
                byte_start: 12,
                byte_end: 18,
                position: 2,
                position_length: 1,
                word_id: WordId(74990, true),
                details: details(&["名詞", "一般", "*", "*", "*", "*", "もも", "モモ", "モモ"]),
            },
            Token {
                text: "も".to_string(),
                byte_start: 18,
                byte_end: 21,
                position: 3,
                position_length: 1,
                word_id: WordId(73246, true),
                details: details(&["助詞", "係助詞", "*", "*", "*", "*", "も", "モ", "モ"]),
            },
            Token {
                text: "もも".to_string(),
                byte_start: 21,
                byte_end: 27,
                position: 4,
                position_length: 1,
                word_id: WordId(74990, true),
                details: details(&["名詞", "一般", "*", "*", "*", "*", "もも", "モモ", "モモ"]),
            },
            Token {
                text: "の".to_string(),
                byte_start: 27,
                byte_end: 30,
                position: 5,
                position_length: 1,
                word_id: WordId(55831, true),
                details: details(&["助詞", "連体化", "*", "*", "*", "*", "の", "ノ", "ノ"]),
            },
            Token {
                text: "うち".to_string(),
                byte_start: 30,
                byte_end: 36,
                position: 6,
                position_length: 1,
                word_id: WordId(8029, true),
                details: details(&[
                    "名詞",
                    "非自立",
                    "副詞可能",
                    "*",
                    "*",
                    "*",
                    "うち",
                    "ウチ",
                    "ウチ",
                ]),
            },
        ];

        filter.apply(&mut tokens).unwrap();

        // Comparing the whole sequence reports every surviving text on failure.
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, ["すもも", "もも", "もも", "うち"]);
    }
}