lindera_filter/character_filter/
regex.rs

1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use serde_json::Value;
4
5use lindera_core::error::LinderaErrorKind;
6use lindera_core::LinderaResult;
7
8use crate::character_filter::{add_offset_diff, CharacterFilter};
9
/// Name returned by `RegexCharacterFilter`'s `CharacterFilter::name` implementation.
pub const REGEX_CHARACTER_FILTER_NAME: &str = "regex";
11
/// Configuration for the regex character filter: a pattern and the text that replaces its matches.
///
/// The type is (de)serializable with serde; the JSON form uses the keys `pattern` and
/// `replacement`.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub struct RegexCharacterFilterConfig {
    /// Regular expression source; it is compiled when a `RegexCharacterFilter` is constructed.
    pub pattern: String,
    /// Text substituted for every match of `pattern`.
    pub replacement: String,
}
17
18impl RegexCharacterFilterConfig {
19    pub fn new(pattern: String, replacement: String) -> Self {
20        Self {
21            pattern,
22            replacement,
23        }
24    }
25
26    pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
27        serde_json::from_slice::<RegexCharacterFilterConfig>(data)
28            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
29    }
30
31    pub fn from_value(value: &Value) -> LinderaResult<Self> {
32        serde_json::from_value::<RegexCharacterFilterConfig>(value.clone())
33            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
34    }
35}
36
/// Character filter that replaces every match of a regular expression with a replacement string.
///
/// The pattern and replacement come from a [`RegexCharacterFilterConfig`].
#[derive(Clone, Debug)]
pub struct RegexCharacterFilter {
    // Configuration this filter was built from; supplies the replacement text.
    config: RegexCharacterFilterConfig,
    // Compiled form of `config.pattern`.
    regex: Regex,
}
44
45impl RegexCharacterFilter {
46    pub fn new(config: RegexCharacterFilterConfig) -> LinderaResult<Self> {
47        let regex =
48            Regex::new(&config.pattern).map_err(|err| LinderaErrorKind::Args.with_error(err))?;
49
50        Ok(Self { config, regex })
51    }
52
53    pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
54        Self::new(RegexCharacterFilterConfig::from_slice(data)?)
55    }
56}
57
58impl CharacterFilter for RegexCharacterFilter {
59    fn name(&self) -> &'static str {
60        REGEX_CHARACTER_FILTER_NAME
61    }
62
63    fn apply(&self, text: &str) -> LinderaResult<(String, Vec<usize>, Vec<i64>)> {
64        let mut offsets: Vec<usize> = Vec::new();
65        let mut diffs: Vec<i64> = Vec::new();
66
67        self.regex.find_iter(text).for_each(|mat| {
68            let input_start = mat.start();
69            let input_text = mat.as_str();
70            let input_len = input_text.len();
71            let replacement_text = self.config.replacement.as_str();
72            let replacement_len = replacement_text.len();
73            let diff_len = input_len as i64 - replacement_len as i64;
74            let input_offset = input_start + input_len;
75
76            if diff_len != 0 {
77                let prev_diff = *diffs.last().unwrap_or(&0);
78
79                if diff_len > 0 {
80                    // Replacement is shorter than matched surface.
81                    let offset = (input_offset as i64 - diff_len - prev_diff) as usize;
82                    let diff = prev_diff + diff_len;
83                    add_offset_diff(&mut offsets, &mut diffs, offset, diff);
84                } else {
85                    // Replacement is longer than matched surface.
86                    let output_start = (input_offset as i64 + -prev_diff) as usize;
87                    for extra_idx in 0..diff_len.unsigned_abs() as usize {
88                        let offset = output_start + extra_idx;
89                        let diff = prev_diff - extra_idx as i64 - 1;
90                        add_offset_diff(&mut offsets, &mut diffs, offset, diff);
91                    }
92                }
93            }
94        });
95
96        let new_text = self
97            .regex
98            .replace_all(text, &self.config.replacement)
99            .to_string();
100
101        Ok((new_text, offsets, diffs))
102    }
103}
104
#[cfg(test)]
mod tests {
    use crate::character_filter::regex::{RegexCharacterFilter, RegexCharacterFilterConfig};
    use crate::character_filter::{correct_offset, CharacterFilter};

    /// The configuration is parsed from JSON bytes with both fields preserved.
    #[test]
    fn test_regex_character_filter_config_from_slice() {
        let config_str = r#"
        {
            "pattern": "リンデラ",
            "replacement": "Lindera"
        }
        "#;
        let config = RegexCharacterFilterConfig::from_slice(config_str.as_bytes()).unwrap();
        assert_eq!("リンデラ", config.pattern);
        assert_eq!("Lindera", config.replacement);
    }

    /// The filter itself (not just its configuration) can be built from JSON bytes, and an
    /// invalid pattern is reported as an error instead of panicking.
    #[test]
    fn test_regex_character_filter_from_slice() {
        let config_str = r#"
        {
            "pattern": "リンデラ",
            "replacement": "Lindera"
        }
        "#;
        let result = RegexCharacterFilter::from_slice(config_str.as_bytes());
        assert!(result.is_ok());

        // An unclosed group is not a valid regular expression.
        let invalid_config_str = r#"
        {
            "pattern": "(",
            "replacement": ""
        }
        "#;
        let result = RegexCharacterFilter::from_slice(invalid_config_str.as_bytes());
        assert!(result.is_err());
    }

    /// Filtered text and offset corrections for replacements shorter than the match.
    #[test]
    fn test_regex_character_filter_apply() {
        {
            let config_str = r#"
            {
                "pattern": "リンデラ",
                "replacement": "Lindera"
            }
            "#;
            let filter = RegexCharacterFilter::from_slice(config_str.as_bytes()).unwrap();
            let text = "リンデラは形態素解析器です。";
            let (filtered_text, offsets, diffs) = filter.apply(text).unwrap();
            assert_eq!("Linderaは形態素解析器です。", filtered_text);
            assert_eq!(vec![7], offsets);
            assert_eq!(vec![5], diffs);
            let start = 0;
            let end = 7;
            assert_eq!("Lindera", &filtered_text[start..end]);
            let correct_start = correct_offset(start, &offsets, &diffs, filtered_text.len());
            let correct_end = correct_offset(end, &offsets, &diffs, filtered_text.len());
            assert_eq!(0, correct_start);
            assert_eq!(12, correct_end);
            assert_eq!("リンデラ", &text[correct_start..correct_end]);
        }

        {
            let config_str = r#"
            {
                "pattern": "\\s{2,}",
                "replacement": " "
            }
            "#;
            let filter = RegexCharacterFilter::from_slice(config_str.as_bytes()).unwrap();
            let text = "a     b     c";
            let (filtered_text, offsets, diffs) = filter.apply(text).unwrap();
            assert_eq!("a b c", filtered_text);
            assert_eq!(vec![2, 4], offsets);
            assert_eq!(vec![4, 8], diffs);
            let start = 2;
            let end = 3;
            assert_eq!("b", &filtered_text[start..end]);
            let correct_start = correct_offset(start, &offsets, &diffs, filtered_text.len());
            let correct_end = correct_offset(end, &offsets, &diffs, filtered_text.len());
            assert_eq!(6, correct_start);
            assert_eq!(7, correct_end);
            assert_eq!("b", &text[correct_start..correct_end]);
        }
    }
}