/// One entry of the offset map produced by a [`CharFilter`].
///
/// For offsets in the filtered text at or after `filtered` (until a later
/// entry takes over), adding `cumulative_diff` yields the matching offset in
/// the original text; see [`correct_offset`]. Offsets are byte offsets.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct OffsetCorrection {
    /// Byte offset in the filtered text from which this correction applies.
    pub filtered: usize,
    /// Net number of bytes the filter has removed (positive) or added
    /// (negative) up to this point.
    pub cumulative_diff: isize,
}
/// A text-to-text transformation that also reports how byte offsets shifted.
///
/// Implementors must be `Send + Sync`.
pub trait CharFilter: Send + Sync {
/// Transforms `text` and returns the filtered string together with the
/// offset corrections recorded while producing it. The corrections are in
/// the form consumed by `correct_offset`.
fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>);
}
/// Character filter that strips markup tags and decodes character references.
///
/// * A `<` that is followed by an ASCII letter, `/`, `!` or `?` starts a tag;
///   everything up to and including the next `>` (or the end of the input when
///   the tag is never closed) is removed.
/// * Any other `<` is ordinary text and is kept, so input such as `"a < b"` is
///   not truncated.
/// * `&...;` sequences recognised by `decode_entity` are replaced by the
///   character they denote; an unrecognised `&` is kept literally.
pub struct HtmlStripCharFilter;

impl CharFilter for HtmlStripCharFilter {
    /// Returns the stripped text plus one [`OffsetCorrection`] per removed tag
    /// and per decoded character reference.
    ///
    /// Each correction is recorded at the filtered byte offset that follows the
    /// edit, carrying the total number of bytes dropped so far.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
        // Mirrors the HTML tokenizer's "tag open" rule: only these bytes after
        // `<` begin markup, anything else means the `<` is plain text.
        fn opens_markup(next: Option<u8>) -> bool {
            match next {
                Some(b) => b.is_ascii_alphabetic() || b == b'/' || b == b'!' || b == b'?',
                None => false,
            }
        }

        let bytes = text.as_bytes();
        let mut result = String::with_capacity(text.len());
        let mut corrections = Vec::new();
        let mut cumulative_diff: isize = 0;
        // Invariant: `i` always sits on a UTF-8 character boundary, because it
        // only ever advances past ASCII delimiters or whole plain-text runs.
        let mut i = 0;

        while i < bytes.len() {
            if bytes[i] == b'<' && opens_markup(bytes.get(i + 1).copied()) {
                // Drop through the closing `>`; an unterminated tag swallows
                // the rest of the input.
                let tag_end = bytes[i..]
                    .iter()
                    .position(|&b| b == b'>')
                    .map_or(bytes.len(), |p| i + p + 1);
                cumulative_diff += (tag_end - i) as isize;
                corrections.push(OffsetCorrection {
                    filtered: result.len(),
                    cumulative_diff,
                });
                i = tag_end;
            } else if bytes[i] == b'&' {
                match decode_entity(&text[i..]) {
                    Some((decoded, consumed)) => {
                        result.push_str(&decoded);
                        cumulative_diff += consumed as isize - decoded.len() as isize;
                        corrections.push(OffsetCorrection {
                            filtered: result.len(),
                            cumulative_diff,
                        });
                        i += consumed;
                    }
                    None => {
                        // Not a recognised reference: keep the ampersand.
                        result.push('&');
                        i += 1;
                    }
                }
            } else {
                // Copy the whole run up to the next byte that might need
                // special handling. `<` and `&` are ASCII, so the run always
                // ends on a character boundary.
                let run_end = bytes[i + 1..]
                    .iter()
                    .position(|&b| b == b'<' || b == b'&')
                    .map_or(bytes.len(), |p| i + 1 + p);
                result.push_str(&text[i..run_end]);
                i = run_end;
            }
        }

        (result, corrections)
    }
}
/// Decodes a single character reference at the start of `s`.
///
/// `s` is expected to begin with `&`. Recognised forms are the named
/// references `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp`, decimal references
/// (`&#65;`) and hexadecimal references (`&#x41;` / `&#X41;`).
///
/// Returns the decoded text and the number of bytes of `s` that the reference
/// occupies (including `&` and `;`), or `None` when `s` does not start with a
/// recognised, `;`-terminated reference whose body is at most nine bytes long,
/// or when the number is not a valid Unicode scalar value.
fn decode_entity(s: &str) -> Option<(String, usize)> {
    // Longest accepted text between `&` and `;`.
    const MAX_BODY_LEN: usize = 9;

    let rest = s.strip_prefix('&')?;
    // Only look at the bytes that could still form an acceptable reference;
    // scanning the entire remainder for every `&` would be quadratic on input
    // with many ampersands and no semicolons.
    let body_len = rest
        .bytes()
        .take(MAX_BODY_LEN + 1)
        .position(|b| b == b';')?;
    // `;` is ASCII, so `body_len` is a character boundary.
    let body = &rest[..body_len];

    let decoded = match body {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => '\u{00A0}',
        _ => {
            let reference = body.strip_prefix('#')?;
            let (digits, radix) = match reference
                .strip_prefix('x')
                .or_else(|| reference.strip_prefix('X'))
            {
                Some(hex) => (hex, 16),
                None => (reference, 10),
            };
            // Validate explicitly: the integer parsers would otherwise accept
            // a leading `+`, which is not part of a character reference.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            char::from_u32(u32::from_str_radix(digits, radix).ok()?)?
        }
    };

    // `&` + body + `;`
    Some((decoded.to_string(), body_len + 2))
}
/// Character filter driven by a list of `(from, to)` string pairs.
pub struct MappingCharFilter {
    /// `(from, to)` pairs, kept ordered from the longest `from` to the shortest.
    mappings: Vec<(String, String)>,
}

impl MappingCharFilter {
    /// Builds a filter from `(from, to)` pairs.
    ///
    /// The pairs are reordered so that longer `from` strings come first; pairs
    /// whose `from` strings have equal length keep their relative order.
    pub fn new(mut mappings: Vec<(String, String)>) -> Self {
        // `sort_by_key` is stable, so ties stay in caller order.
        mappings.sort_by_key(|(from, _)| std::cmp::Reverse(from.len()));
        Self { mappings }
    }
}
impl CharFilter for MappingCharFilter {
    /// Replaces every occurrence of a mapping's `from` string with its `to`
    /// string in a single left-to-right pass.
    ///
    /// At each position the mappings are tried in their stored order (longest
    /// `from` first, as arranged by `MappingCharFilter::new`) and the first one
    /// that matches is applied. Replacement text is never scanned again, and
    /// mappings with an empty `from` string are ignored.
    ///
    /// One [`OffsetCorrection`] is recorded per replacement, positioned just
    /// after the inserted text; because the input is processed in one pass the
    /// corrections are ordered by filtered offset.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
        // An empty key matches everywhere without consuming input, which
        // would never terminate; drop such rules up front.
        let rules: Vec<(&str, &str)> = self
            .mappings
            .iter()
            .filter(|(from, _)| !from.is_empty())
            .map(|(from, to)| (from.as_str(), to.as_str()))
            .collect();
        if rules.is_empty() {
            return (text.to_string(), Vec::new());
        }

        let mut result = String::with_capacity(text.len());
        let mut corrections = Vec::new();
        let mut cumulative_diff: isize = 0;
        let mut rest = text;

        while !rest.is_empty() {
            if let Some(&(from, to)) = rules.iter().find(|(from, _)| rest.starts_with(*from)) {
                result.push_str(to);
                cumulative_diff += from.len() as isize - to.len() as isize;
                corrections.push(OffsetCorrection {
                    filtered: result.len(),
                    cumulative_diff,
                });
                // `from` matched as a prefix, so its length is a boundary.
                rest = &rest[from.len()..];
            } else {
                // No rule applies here: copy exactly one character.
                let ch_len = rest.chars().next().map_or(0, char::len_utf8);
                result.push_str(&rest[..ch_len]);
                rest = &rest[ch_len..];
            }
        }

        (result, corrections)
    }
}
/// Character filter that substitutes a fixed string for every regex match.
pub struct PatternReplaceCharFilter {
    /// Compiled pattern whose matches are replaced.
    pattern: regex::Regex,
    /// Text inserted as-is in place of each match.
    replacement: String,
}

impl PatternReplaceCharFilter {
    /// Compiles `pattern` and pairs it with `replacement`.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` reported by the regex compiler when
    /// `pattern` is not a valid expression.
    pub fn new(pattern: &str, replacement: &str) -> Result<Self, regex::Error> {
        regex::Regex::new(pattern).map(|pattern| Self {
            pattern,
            replacement: replacement.to_owned(),
        })
    }
}
impl CharFilter for PatternReplaceCharFilter {
    /// Replaces each match of the pattern in `text` with the stored
    /// replacement string, which is inserted as-is.
    ///
    /// One [`OffsetCorrection`] is recorded per match, positioned just after
    /// the inserted replacement and carrying the running difference between
    /// matched and inserted byte counts.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
        let replacement = self.replacement.as_str();
        let mut output = String::with_capacity(text.len());
        let mut corrections = Vec::new();
        let mut shift: isize = 0;
        // End of the previous match, i.e. start of the text not yet copied.
        let mut cursor = 0;

        for found in self.pattern.find_iter(text) {
            let (start, end) = (found.start(), found.end());
            output.push_str(&text[cursor..start]);
            output.push_str(replacement);
            shift += (end - start) as isize - replacement.len() as isize;
            corrections.push(OffsetCorrection {
                filtered: output.len(),
                cumulative_diff: shift,
            });
            cursor = end;
        }

        // Whatever follows the last match is copied unchanged.
        output.push_str(&text[cursor..]);
        (output, corrections)
    }
}
/// Maps a byte offset in filtered text back to the original text.
///
/// The correction used is the last entry in `corrections` whose `filtered`
/// position is less than or equal to `filtered_offset`; its `cumulative_diff`
/// is added to the offset. With no applicable entry the offset is returned
/// unchanged.
///
/// The result saturates instead of wrapping: a correction that would move the
/// offset below zero yields `0`, and one that would exceed `usize::MAX` yields
/// `usize::MAX`.
pub fn correct_offset(filtered_offset: usize, corrections: &[OffsetCorrection]) -> usize {
    let diff = corrections
        .iter()
        .rev()
        .find(|c| c.filtered <= filtered_offset)
        .map_or(0, |c| c.cumulative_diff);

    // Apply the signed shift without going through a wrapping `as` cast.
    if diff >= 0 {
        filtered_offset.saturating_add(diff.unsigned_abs())
    } else {
        filtered_offset.saturating_sub(diff.unsigned_abs())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn html_strip_basic() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("<p>Hello <b>World</b></p>");
        assert_eq!(result, "Hello World");
    }

    // The input must contain real character references; a literal `&` or `<`
    // would exercise tag stripping instead of entity decoding.
    #[test]
    fn html_strip_entities() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("foo &amp; bar &lt; baz");
        assert_eq!(result, "foo & bar < baz");
    }

    // Covers both the decimal and the hexadecimal numeric forms.
    #[test]
    fn html_strip_numeric_entity() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("&#65;&#x42;");
        assert_eq!(result, "AB");
    }

    #[test]
    fn html_strip_no_html() {
        let filter = HtmlStripCharFilter;
        let (result, corrections) = filter.filter("plain text");
        assert_eq!(result, "plain text");
        assert!(corrections.is_empty());
    }

    #[test]
    fn html_strip_empty() {
        let filter = HtmlStripCharFilter;
        let (result, _) = filter.filter("");
        assert_eq!(result, "");
    }

    #[test]
    fn html_strip_offset_correction() {
        let filter = HtmlStripCharFilter;
        let (result, corrections) = filter.filter("<b>Hello</b>");
        assert_eq!(result, "Hello");
        // "Hello" starts after the three bytes of `<b>` in the original.
        let original_start = correct_offset(0, &corrections);
        assert_eq!(original_start, 3);
    }

    #[test]
    fn mapping_basic() {
        let filter = MappingCharFilter::new(vec![
            (":)".to_string(), "_happy_".to_string()),
            (":(".to_string(), "_sad_".to_string()),
        ]);
        let (result, _) = filter.filter("I am :) and not :(");
        assert_eq!(result, "I am _happy_ and not _sad_");
    }

    #[test]
    fn mapping_empty() {
        let filter = MappingCharFilter::new(vec![]);
        let (result, corrections) = filter.filter("no change");
        assert_eq!(result, "no change");
        assert!(corrections.is_empty());
    }

    #[test]
    fn mapping_no_match() {
        let filter = MappingCharFilter::new(vec![("xyz".to_string(), "abc".to_string())]);
        let (result, _) = filter.filter("hello world");
        assert_eq!(result, "hello world");
    }

    #[test]
    fn pattern_replace_basic() {
        let filter = PatternReplaceCharFilter::new(r"\d+", "#").unwrap();
        let (result, _) = filter.filter("abc123def456");
        assert_eq!(result, "abc#def#");
    }

    #[test]
    fn pattern_replace_no_match() {
        let filter = PatternReplaceCharFilter::new(r"\d+", "#").unwrap();
        let (result, corrections) = filter.filter("no digits");
        assert_eq!(result, "no digits");
        assert!(corrections.is_empty());
    }

    #[test]
    fn pattern_replace_empty() {
        let filter = PatternReplaceCharFilter::new(r"\d+", "#").unwrap();
        let (result, _) = filter.filter("");
        assert_eq!(result, "");
    }

    #[test]
    fn correct_offset_no_corrections() {
        assert_eq!(correct_offset(5, &[]), 5);
    }

    #[test]
    fn correct_offset_single_removal() {
        let corrections = vec![OffsetCorrection {
            filtered: 0,
            cumulative_diff: 3,
        }];
        assert_eq!(correct_offset(0, &corrections), 3);
        assert_eq!(correct_offset(5, &corrections), 8);
    }
}