lindera_filter/token_filter/
lowercase.rs

1use lindera_core::LinderaResult;
2
3use crate::token::Token;
4use crate::token_filter::TokenFilter;
5
/// Identifier of the lowercase token filter; this is the value returned by
/// `LowercaseTokenFilter::name`.
pub const LOWERCASE_TOKEN_FILTER_NAME: &str = "lowercase";
7
/// Token filter that converts the text of every token to lowercase.
///
/// The filter has no fields, so it carries no configuration or state.
#[derive(Debug, Clone)]
pub struct LowercaseTokenFilter {}
12
13impl LowercaseTokenFilter {
14    pub fn new() -> Self {
15        Self {}
16    }
17}
18
19impl Default for LowercaseTokenFilter {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl TokenFilter for LowercaseTokenFilter {
26    fn name(&self) -> &'static str {
27        LOWERCASE_TOKEN_FILTER_NAME
28    }
29
30    fn apply<'a>(&self, tokens: &mut Vec<Token>) -> LinderaResult<()> {
31        for token in tokens.iter_mut() {
32            token.text = token.text.to_lowercase();
33        }
34
35        Ok(())
36    }
37}
38
#[cfg(test)]
mod tests {
    #[cfg(feature = "ipadic")]
    use lindera_core::word_entry::WordId;

    #[cfg(feature = "ipadic")]
    use crate::{
        token::Token,
        token_filter::{lowercase::LowercaseTokenFilter, TokenFilter},
    };

    /// Applying the filter to a single mixed-case token must keep the token
    /// count unchanged and lowercase the token text.
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_lowercase_token_filter_apply_ipadic() {
        // Fixture: one token whose text contains an uppercase letter.
        let input = Token {
            text: "Rust".to_string(),
            byte_start: 0,
            byte_end: 4,
            position: 0,
            position_length: 1,
            word_id: WordId(4294967295, true),
            details: vec!["UNK".to_string()],
        };
        let mut tokens: Vec<Token> = vec![input];

        let filter = LowercaseTokenFilter::default();
        filter.apply(&mut tokens).unwrap();

        assert_eq!(tokens.len(), 1);
        assert_eq!(&tokens[0].text, "rust");
    }
}