lindera_filter/character_filter/
regex.rs1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use serde_json::Value;
4
5use lindera_core::error::LinderaErrorKind;
6use lindera_core::LinderaResult;
7
8use crate::character_filter::{add_offset_diff, CharacterFilter};
9
/// Identifier of the regex character filter; this is the value returned by
/// `RegexCharacterFilter::name`.
pub const REGEX_CHARACTER_FILTER_NAME: &str = "regex";
11
12#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
13pub struct RegexCharacterFilterConfig {
14 pub pattern: String,
15 pub replacement: String,
16}
17
18impl RegexCharacterFilterConfig {
19 pub fn new(pattern: String, replacement: String) -> Self {
20 Self {
21 pattern,
22 replacement,
23 }
24 }
25
26 pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
27 serde_json::from_slice::<RegexCharacterFilterConfig>(data)
28 .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
29 }
30
31 pub fn from_value(value: &Value) -> LinderaResult<Self> {
32 serde_json::from_value::<RegexCharacterFilterConfig>(value.clone())
33 .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))
34 }
35}
36
37#[derive(Clone, Debug)]
40pub struct RegexCharacterFilter {
41 config: RegexCharacterFilterConfig,
42 regex: Regex,
43}
44
45impl RegexCharacterFilter {
46 pub fn new(config: RegexCharacterFilterConfig) -> LinderaResult<Self> {
47 let regex =
48 Regex::new(&config.pattern).map_err(|err| LinderaErrorKind::Args.with_error(err))?;
49
50 Ok(Self { config, regex })
51 }
52
53 pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
54 Self::new(RegexCharacterFilterConfig::from_slice(data)?)
55 }
56}
57
58impl CharacterFilter for RegexCharacterFilter {
59 fn name(&self) -> &'static str {
60 REGEX_CHARACTER_FILTER_NAME
61 }
62
63 fn apply(&self, text: &str) -> LinderaResult<(String, Vec<usize>, Vec<i64>)> {
64 let mut offsets: Vec<usize> = Vec::new();
65 let mut diffs: Vec<i64> = Vec::new();
66
67 self.regex.find_iter(text).for_each(|mat| {
68 let input_start = mat.start();
69 let input_text = mat.as_str();
70 let input_len = input_text.len();
71 let replacement_text = self.config.replacement.as_str();
72 let replacement_len = replacement_text.len();
73 let diff_len = input_len as i64 - replacement_len as i64;
74 let input_offset = input_start + input_len;
75
76 if diff_len != 0 {
77 let prev_diff = *diffs.last().unwrap_or(&0);
78
79 if diff_len > 0 {
80 let offset = (input_offset as i64 - diff_len - prev_diff) as usize;
82 let diff = prev_diff + diff_len;
83 add_offset_diff(&mut offsets, &mut diffs, offset, diff);
84 } else {
85 let output_start = (input_offset as i64 + -prev_diff) as usize;
87 for extra_idx in 0..diff_len.unsigned_abs() as usize {
88 let offset = output_start + extra_idx;
89 let diff = prev_diff - extra_idx as i64 - 1;
90 add_offset_diff(&mut offsets, &mut diffs, offset, diff);
91 }
92 }
93 }
94 });
95
96 let new_text = self
97 .regex
98 .replace_all(text, &self.config.replacement)
99 .to_string();
100
101 Ok((new_text, offsets, diffs))
102 }
103}
104
#[cfg(test)]
mod tests {
    use crate::character_filter::regex::{RegexCharacterFilter, RegexCharacterFilterConfig};
    use crate::character_filter::{correct_offset, CharacterFilter};

    /// The JSON configuration is parsed into its two fields.
    #[test]
    fn test_regex_character_filter_config_from_slice() {
        let config_str = r#"
        {
            "pattern": "リンデラ",
            "replacement": "Lindera"
        }
        "#;
        let config = RegexCharacterFilterConfig::from_slice(config_str.as_bytes()).unwrap();
        assert_eq!("リンデラ", config.pattern);
        assert_eq!("Lindera", config.replacement);
    }

    /// A filter can be constructed directly from a JSON configuration.
    #[test]
    fn test_regex_character_filter_from_slice() {
        let config_str = r#"
        {
            "pattern": "リンデラ",
            "replacement": "Lindera"
        }
        "#;
        // Exercise the filter constructor (parse + regex compilation), not
        // just the configuration parser covered by the test above.
        let result = RegexCharacterFilter::from_slice(config_str.as_bytes());
        assert!(result.is_ok());
    }

    /// Filtering replaces matches and yields offsets/diffs that map positions
    /// in the filtered text back onto the original text.
    #[test]
    fn test_regex_character_filter_apply() {
        // Replacement shorter than the match: 12 bytes become 7.
        {
            let config_str = r#"
            {
                "pattern": "リンデラ",
                "replacement": "Lindera"
            }
            "#;
            let filter = RegexCharacterFilter::from_slice(config_str.as_bytes()).unwrap();
            let text = "リンデラは形態素解析器です。";
            let (filtered_text, offsets, diffs) = filter.apply(text).unwrap();
            assert_eq!("Linderaは形態素解析器です。", filtered_text);
            assert_eq!(vec![7], offsets);
            assert_eq!(vec![5], diffs);
            let start = 0;
            let end = 7;
            assert_eq!("Lindera", &filtered_text[start..end]);
            let correct_start = correct_offset(start, &offsets, &diffs, filtered_text.len());
            let correct_end = correct_offset(end, &offsets, &diffs, filtered_text.len());
            assert_eq!(0, correct_start);
            assert_eq!(12, correct_end);
            assert_eq!("リンデラ", &text[correct_start..correct_end]);
        }

        // Two runs of five spaces each collapse to a single space, so the
        // accumulated difference grows by 4 bytes per run.
        {
            let config_str = r#"
            {
                "pattern": "\\s{2,}",
                "replacement": " "
            }
            "#;
            let filter = RegexCharacterFilter::from_slice(config_str.as_bytes()).unwrap();
            let text = "a     b     c";
            let (filtered_text, offsets, diffs) = filter.apply(text).unwrap();
            assert_eq!("a b c", filtered_text);
            assert_eq!(vec![2, 4], offsets);
            assert_eq!(vec![4, 8], diffs);
            let start = 2;
            let end = 3;
            assert_eq!("b", &filtered_text[start..end]);
            let correct_start = correct_offset(start, &offsets, &diffs, filtered_text.len());
            let correct_end = correct_offset(end, &offsets, &diffs, filtered_text.len());
            assert_eq!(6, correct_start);
            assert_eq!(7, correct_end);
            assert_eq!("b", &text[correct_start..correct_end]);
        }
    }
}