lindera_filter/token_filter/length.rs

use serde::{Deserialize, Serialize};

use lindera_core::error::LinderaErrorKind;
use lindera_core::LinderaResult;

use crate::token::Token;
use crate::token_filter::TokenFilter;

pub const LENGTH_TOKEN_FILTER_NAME: &str = "length";

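/// Configuration for [`LengthTokenFilter`]: optional lower and upper bounds on
/// token length, measured in characters. Both bounds are inclusive.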
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub struct LengthTokenFilterConfig {
    min: Option<usize>,
    max: Option<usize>,
}

impl LengthTokenFilterConfig {
    pub fn new(min: Option<usize>, max: Option<usize>) -> Self {
        Self { min, max }
    }

    pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
        serde_json::from_slice::<LengthTokenFilterConfig>(data)
            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
    }

    pub fn from_value(value: &serde_json::Value) -> LinderaResult<Self> {
        serde_json::from_value::<LengthTokenFilterConfig>(value.clone())
            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
    }
}

/// Keep only tokens whose text length, counted in characters (not bytes),
/// falls within the configured `min`/`max` range. Both bounds are optional
/// and inclusive.
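///
/// The snippet below is a minimal sketch rather than a compiled doctest; it
/// assumes this module is reachable from outside the crate as
/// `lindera_filter::token_filter::length`.
///
/// ```ignore
/// use lindera_filter::token_filter::length::LengthTokenFilter;
///
/// // Keep only tokens that are 2 or 3 characters long.
/// let filter =
///     LengthTokenFilter::from_slice(r#"{"min": 2, "max": 3}"#.as_bytes()).unwrap();
/// ```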
#[derive(Clone, Debug)]
pub struct LengthTokenFilter {
    config: LengthTokenFilterConfig,
}

impl LengthTokenFilter {
    pub fn new(config: LengthTokenFilterConfig) -> Self {
        Self { config }
    }

    pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
        Ok(Self::new(LengthTokenFilterConfig::from_slice(data)?))
    }
}

impl TokenFilter for LengthTokenFilter {
    fn name(&self) -> &'static str {
        LENGTH_TOKEN_FILTER_NAME
    }

    fn apply<'a>(&self, tokens: &mut Vec<Token>) -> LinderaResult<()> {
        // Keep a token only if its character count satisfies the optional
        // `min` and `max` bounds; tokens outside the range are dropped.
        tokens.retain(|token| {
            let len = token.text.chars().count();
            if let Some(min) = self.config.min {
                if len < min {
                    return false;
                }
            }
            if let Some(max) = self.config.max {
                if len > max {
                    return false;
                }
            }
            true
        });

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    #[cfg(feature = "ipadic")]
    use lindera_core::word_entry::WordId;

    use crate::token_filter::length::{LengthTokenFilter, LengthTokenFilterConfig};
    #[cfg(feature = "ipadic")]
    use crate::{token::Token, token_filter::TokenFilter};

    #[test]
    fn test_length_token_filter_config_from_slice() {
        let config_str = r#"
            {
                "min": 1,
                "max": 3
            }
            "#;
        let config = LengthTokenFilterConfig::from_slice(config_str.as_bytes()).unwrap();

        assert_eq!(config.min.unwrap(), 1);
        assert_eq!(config.max.unwrap(), 3);

        let config_str = r#"
            {
                "min": 1
            }
            "#;
        let config = LengthTokenFilterConfig::from_slice(config_str.as_bytes()).unwrap();

        assert_eq!(config.min.unwrap(), 1);
        assert_eq!(config.max, None);

        let config_str = r#"
            {
                "max": 2
            }
            "#;
        let config = LengthTokenFilterConfig::from_slice(config_str.as_bytes()).unwrap();

        assert_eq!(config.min, None);
        assert_eq!(config.max.unwrap(), 2);
    }

    #[test]
    fn test_length_token_filter_from_slice() {
        let config_str = r#"
            {
                "min": 1,
                "max": 3
            }
            "#;
        let result = LengthTokenFilter::from_slice(config_str.as_bytes());

        assert!(result.is_ok());

        let config_str = r#"
            {
                "min": 1
            }
            "#;
        let result = LengthTokenFilter::from_slice(config_str.as_bytes());

        assert!(result.is_ok());

        let config_str = r#"
            {
                "max": 2
            }
            "#;
        let result = LengthTokenFilter::from_slice(config_str.as_bytes());

        assert!(result.is_ok());
    }

    #[test]
    #[cfg(feature = "ipadic")]
    fn test_length_token_filter_apply_ipadic() {
        let config_str = r#"
            {
                "min": 2,
                "max": 3
            }
            "#;
        let filter = LengthTokenFilter::from_slice(config_str.as_bytes()).unwrap();

        let mut tokens: Vec<Token> = vec![
            Token {
                text: "すもも".to_string(),
                byte_start: 0,
                byte_end: 9,
                position: 0,
                position_length: 1,
                word_id: WordId(36165, true),
                details: vec![
                    "名詞".to_string(),
                    "一般".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "すもも".to_string(),
                    "スモモ".to_string(),
                    "スモモ".to_string(),
                ],
            },
            Token {
                text: "も".to_string(),
                byte_start: 9,
                byte_end: 12,
                position: 1,
                position_length: 1,
                word_id: WordId(73246, true),
                details: vec![
                    "助詞".to_string(),
                    "係助詞".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "も".to_string(),
                    "モ".to_string(),
                    "モ".to_string(),
                ],
            },
            Token {
                text: "もも".to_string(),
                byte_start: 12,
                byte_end: 18,
                position: 2,
                position_length: 1,
                word_id: WordId(74990, true),
                details: vec![
                    "名詞".to_string(),
                    "一般".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "もも".to_string(),
                    "モモ".to_string(),
                    "モモ".to_string(),
                ],
            },
            Token {
                text: "も".to_string(),
                byte_start: 18,
                byte_end: 21,
                position: 3,
                position_length: 1,
                word_id: WordId(73246, true),
                details: vec![
                    "助詞".to_string(),
                    "係助詞".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "も".to_string(),
                    "モ".to_string(),
                    "モ".to_string(),
                ],
            },
            Token {
                text: "もも".to_string(),
                byte_start: 21,
                byte_end: 27,
                position: 4,
                position_length: 1,
                word_id: WordId(74990, true),
                details: vec![
                    "名詞".to_string(),
                    "一般".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "もも".to_string(),
                    "モモ".to_string(),
                    "モモ".to_string(),
                ],
            },
            Token {
                text: "の".to_string(),
                byte_start: 27,
                byte_end: 30,
                position: 5,
                position_length: 1,
                word_id: WordId(55831, true),
                details: vec![
                    "助詞".to_string(),
                    "連体化".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "の".to_string(),
                    "ノ".to_string(),
                    "ノ".to_string(),
                ],
            },
            Token {
                text: "うち".to_string(),
                byte_start: 30,
                byte_end: 36,
                position: 6,
                position_length: 1,
                word_id: WordId(8029, true),
                details: vec![
                    "名詞".to_string(),
                    "非自立".to_string(),
                    "副詞可能".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "*".to_string(),
                    "うち".to_string(),
                    "ウチ".to_string(),
                    "ウチ".to_string(),
                ],
            },
        ];

        filter.apply(&mut tokens).unwrap();

        assert_eq!(tokens.len(), 4);
        assert_eq!(&tokens[0].text, "すもも");
        assert_eq!(&tokens[1].text, "もも");
        assert_eq!(&tokens[2].text, "もも");
        assert_eq!(&tokens[3].text, "うち");
    }
}