/// One entry in the table that maps offsets in filtered text back to the
/// text the filter was given.
///
/// The filters in this file append an entry each time they remove or replace
/// a span; `correct_offset` consumes the resulting list.
#[derive(Clone, Debug)]
pub struct OffsetCorrection {
    /// Byte offset in the *filtered* text from which this entry applies.
    /// The filters set it to the output length at the moment the entry is
    /// recorded.
    pub filtered: usize,
    /// Signed amount `correct_offset` adds to a filtered offset at or after
    /// `filtered`. The filters accumulate it as (bytes consumed from the
    /// input) minus (bytes written to the output).
    pub cumulative_diff: isize,
}
20
/// A text transformation that also reports how it shifted offsets.
///
/// Implementors must be `Send + Sync`.
pub trait CharFilter: Send + Sync {
    /// Transforms `text` and returns the filtered string together with the
    /// list of offset corrections recorded while producing it.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>);
}
34
/// Stateless filter whose `CharFilter` implementation removes `<...>` markup
/// and decodes the character references understood by `decode_entity`.
pub struct HtmlStripCharFilter;
41
42impl CharFilter for HtmlStripCharFilter {
43 fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
44 let mut result = String::with_capacity(text.len());
45 let mut corrections = Vec::new();
46 let mut cumulative_diff: isize = 0;
47 let mut i = 0;
48 let bytes = text.as_bytes();
49
50 while i < bytes.len() {
51 if bytes[i] == b'<' {
52 let tag_start = i;
54 while i < bytes.len() && bytes[i] != b'>' {
55 i += 1;
56 }
57 if i < bytes.len() {
58 i += 1; }
60 let removed = i - tag_start;
61 cumulative_diff += removed as isize;
62 corrections.push(OffsetCorrection {
63 filtered: result.len(),
64 cumulative_diff,
65 });
66 } else if bytes[i] == b'&' {
67 if let Some((decoded, consumed)) = decode_entity(&text[i..]) {
69 let old_len = result.len();
70 result.push_str(&decoded);
71 let new_bytes = result.len() - old_len;
72 cumulative_diff += consumed as isize - new_bytes as isize;
73 corrections.push(OffsetCorrection {
74 filtered: result.len(),
75 cumulative_diff,
76 });
77 i += consumed;
78 } else {
79 result.push('&');
80 i += 1;
81 }
82 } else {
83 let ch = text[i..].chars().next().unwrap();
86 result.push(ch);
87 i += ch.len_utf8();
88 }
89 }
90
91 (result, corrections)
92 }
93}
94
/// Decodes a single character reference at the start of `s`.
///
/// `s` must begin with `&`. The reference body (the text between `&` and the
/// terminating `;`) may be at most nine bytes long. Recognised bodies are the
/// names `amp`, `lt`, `gt`, `quot`, `apos`, `nbsp`, and numeric references in
/// decimal (`#65`) or hexadecimal (`#x41` / `#X41`) form.
///
/// Returns the decoded text and the number of bytes of `s` consumed
/// (including `&` and `;`), or `None` when `s` does not start with a
/// recognised, well-formed reference.
fn decode_entity(s: &str) -> Option<(String, usize)> {
    /// Longest accepted reference body, in bytes (enough for `#x10FFFF`).
    const MAX_BODY_LEN: usize = 9;

    let body = s.strip_prefix('&')?;
    // Only look at the first few bytes: scanning the whole remainder for ';'
    // on every '&' would be quadratic on text with many bare ampersands.
    let body_len = body
        .bytes()
        .take(MAX_BODY_LEN + 1)
        .position(|b| b == b';')?;
    // ';' is ASCII, so `body_len` is guaranteed to be a char boundary.
    let entity = &body[..body_len];
    let consumed = body_len + 2; // '&' + body + ';'

    let decoded = match entity {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => '\u{00A0}',
        _ => {
            let reference = entity.strip_prefix('#')?;
            let (digits, radix) = match reference
                .strip_prefix('x')
                .or_else(|| reference.strip_prefix('X'))
            {
                Some(hex) => (hex, 16),
                None => (reference, 10),
            };
            // `from_str_radix` tolerates a leading '+', which is not part of
            // a valid numeric character reference.
            if digits.starts_with('+') {
                return None;
            }
            let code_point = u32::from_str_radix(digits, radix).ok()?;
            // Rejects surrogates and values above U+10FFFF.
            char::from_u32(code_point)?
        }
    };

    Some((decoded.to_string(), consumed))
}
126
/// Filter that substitutes fixed strings according to a table of
/// `(from, to)` pairs.
pub struct MappingCharFilter {
    /// Substitution pairs, kept ordered from the longest `from` string to the
    /// shortest (pairs of equal length keep the order they were supplied in).
    mappings: Vec<(String, String)>,
}

impl MappingCharFilter {
    /// Builds a filter from `(from, to)` pairs.
    ///
    /// The pairs are stably reordered so that longer `from` strings come
    /// before shorter ones.
    pub fn new(mut mappings: Vec<(String, String)>) -> Self {
        mappings.sort_by(|lhs, rhs| lhs.0.len().cmp(&rhs.0.len()).reverse());
        MappingCharFilter { mappings }
    }
}
144
145impl CharFilter for MappingCharFilter {
146 fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
147 if self.mappings.is_empty() {
148 return (text.to_string(), Vec::new());
149 }
150
151 let mut result = text.to_string();
152 let mut corrections = Vec::new();
153 let mut cumulative_diff: isize = 0;
154
155 for (from, to) in &self.mappings {
156 let mut new_result = String::with_capacity(result.len());
157 let mut search_start = 0;
158
159 while let Some(pos) = result[search_start..].find(from.as_str()) {
160 let abs_pos = search_start + pos;
161 new_result.push_str(&result[search_start..abs_pos]);
162 new_result.push_str(to);
163
164 let len_diff = from.len() as isize - to.len() as isize;
165 cumulative_diff += len_diff;
166 corrections.push(OffsetCorrection {
167 filtered: new_result.len(),
168 cumulative_diff,
169 });
170
171 search_start = abs_pos + from.len();
172 }
173
174 new_result.push_str(&result[search_start..]);
175 result = new_result;
176 }
177
178 (result, corrections)
179 }
180}
181
182pub struct PatternReplaceCharFilter {
188 pattern: regex::Regex,
189 replacement: String,
190}
191
192impl PatternReplaceCharFilter {
193 pub fn new(pattern: &str, replacement: &str) -> Result<Self, regex::Error> {
194 Ok(Self {
195 pattern: regex::Regex::new(pattern)?,
196 replacement: replacement.to_string(),
197 })
198 }
199}
200
201impl CharFilter for PatternReplaceCharFilter {
202 fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
203 let mut corrections = Vec::new();
204 let mut cumulative_diff: isize = 0;
205 let mut result = String::with_capacity(text.len());
206 let mut last_end = 0;
207
208 for m in self.pattern.find_iter(text) {
209 result.push_str(&text[last_end..m.start()]);
210 result.push_str(&self.replacement);
211
212 let match_len = m.end() - m.start();
213 let replace_len = self.replacement.len();
214 cumulative_diff += match_len as isize - replace_len as isize;
215 corrections.push(OffsetCorrection {
216 filtered: result.len(),
217 cumulative_diff,
218 });
219
220 last_end = m.end();
221 }
222
223 result.push_str(&text[last_end..]);
224 (result, corrections)
225 }
226}
227
228pub fn correct_offset(filtered_offset: usize, corrections: &[OffsetCorrection]) -> usize {
230 let diff = match corrections
232 .iter()
233 .rev()
234 .find(|c| c.filtered <= filtered_offset)
235 {
236 Some(c) => c.cumulative_diff,
237 None => 0,
238 };
239 (filtered_offset as isize + diff) as usize
240}
241
#[cfg(test)]
mod tests {
    use super::*;

    // ---- HtmlStripCharFilter ----

    /// Nested tags are removed and the text between them is kept.
    #[test]
    fn html_strip_basic() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("<p>Hello <b>World</b></p>");
        assert_eq!(result, "Hello World");
    }

    /// Named character references are decoded to the characters they stand for.
    #[test]
    fn html_strip_entities() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("foo &amp; bar &lt; baz");
        assert_eq!(result, "foo & bar < baz");
    }

    /// Decimal and hexadecimal numeric references are both decoded.
    #[test]
    fn html_strip_numeric_entity() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("&#65;&#x42;");
        assert_eq!(result, "AB");
    }

    /// Text without markup passes through and records no corrections.
    #[test]
    fn html_strip_no_html() {
        let filter = HtmlStripCharFilter;
        let (result, corrections) = filter.filter("plain text");
        assert_eq!(result, "plain text");
        assert!(corrections.is_empty());
    }

    #[test]
    fn html_strip_empty() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("");
        assert_eq!(result, "");
    }

    /// The start of "Hello" in the filtered text maps back to just after `<b>`.
    #[test]
    fn html_strip_offset_correction() {
        let filter = HtmlStripCharFilter;
        let (result, corrections) = filter.filter("<b>Hello</b>");
        assert_eq!(result, "Hello");
        let original_start = correct_offset(0, &corrections);
        assert_eq!(original_start, 3);
    }

    // ---- MappingCharFilter ----

    #[test]
    fn mapping_basic() {
        let filter = MappingCharFilter::new(vec![
            (":)".to_string(), "_happy_".to_string()),
            (":(".to_string(), "_sad_".to_string()),
        ]);
        let (result, _) = filter.filter("I am :) and not :(");
        assert_eq!(result, "I am _happy_ and not _sad_");
    }

    /// An empty mapping table leaves the text alone and records nothing.
    #[test]
    fn mapping_empty() {
        let filter = MappingCharFilter::new(vec![]);
        let (result, corrections) = filter.filter("no change");
        assert_eq!(result, "no change");
        assert!(corrections.is_empty());
    }

    #[test]
    fn mapping_no_match() {
        let filter = MappingCharFilter::new(vec![("xyz".to_string(), "abc".to_string())]);
        let (result, _) = filter.filter("hello world");
        assert_eq!(result, "hello world");
    }

    // ---- PatternReplaceCharFilter ----

    #[test]
    fn pattern_replace_basic() {
        let filter = PatternReplaceCharFilter::new(r"\d+", "#").unwrap();
        let (result, _) = filter.filter("abc123def456");
        assert_eq!(result, "abc#def#");
    }

    #[test]
    fn pattern_replace_no_match() {
        let filter = PatternReplaceCharFilter::new(r"\d+", "#").unwrap();
        let (result, corrections) = filter.filter("no digits");
        assert_eq!(result, "no digits");
        assert!(corrections.is_empty());
    }

    #[test]
    fn pattern_replace_empty() {
        let filter = PatternReplaceCharFilter::new(r"\d+", "#").unwrap();
        let (result, _) = filter.filter("");
        assert_eq!(result, "");
    }

    // ---- correct_offset ----

    /// Without any corrections an offset maps to itself.
    #[test]
    fn correct_offset_no_corrections() {
        assert_eq!(correct_offset(5, &[]), 5);
    }

    /// A single correction at position 0 shifts every offset by its diff.
    #[test]
    fn correct_offset_single_removal() {
        let corrections = vec![OffsetCorrection {
            filtered: 0,
            cumulative_diff: 3,
        }];
        assert_eq!(correct_offset(0, &corrections), 3);
        assert_eq!(correct_offset(5, &corrections), 8);
    }
}