use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use std::time::Instant;
use tracing::{debug, instrument, trace};
/// Configuration for [`ContentProcessor`].
#[derive(Debug, Clone)]
pub struct ContentProcessorConfig {
    /// Maximum length of the extracted text in bytes; `0` disables truncation.
    pub max_length: usize,
    /// When true, block-level elements are separated by newlines; otherwise
    /// all extracted parts are joined with single spaces.
    pub preserve_structure: bool,
    /// Minimum content length.
    /// NOTE(review): not enforced anywhere in the visible pipeline — confirm
    /// whether callers elsewhere rely on it.
    pub min_content_length: usize,
    /// Tag names whose entire content is dropped (e.g. "script", "style").
    pub remove_tags: Vec<String>,
    /// When true, HTML entities in text nodes are decoded to characters.
    pub decode_entities: bool,
}
impl Default for ContentProcessorConfig {
    /// Sensible defaults: no length cap, structure preserved, entities
    /// decoded, and the usual non-content tags stripped.
    fn default() -> Self {
        let remove_tags = ["script", "style", "noscript", "template", "svg", "math"]
            .iter()
            .map(|tag| tag.to_string())
            .collect();
        Self {
            max_length: 0,
            preserve_structure: true,
            min_content_length: 10,
            remove_tags,
            decode_entities: true,
        }
    }
}
/// Extracts readable text from raw HTML according to a
/// [`ContentProcessorConfig`].
#[derive(Debug, Clone)]
pub struct ContentProcessor {
    /// Immutable configuration applied to every `process` call.
    config: ContentProcessorConfig,
}
/// Result of [`ContentProcessor::process`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ProcessedContent {
    /// The cleaned, extracted text.
    pub text: String,
    /// Whitespace-separated word count of `text`.
    pub word_count: usize,
    /// Unicode scalar count of `text` (characters, not bytes).
    pub char_count: usize,
    /// True when `text` was cut down to `max_length`.
    pub was_truncated: bool,
    /// Wall-clock processing duration in microseconds.
    pub processing_time_us: u64,
}
impl ContentProcessor {
/// Creates a processor with the given configuration.
pub fn new(config: ContentProcessorConfig) -> Self {
    Self { config }
}

/// Creates a processor using [`ContentProcessorConfig::default`].
pub fn with_defaults() -> Self {
    Self::new(ContentProcessorConfig::default())
}

/// Creates a processor with default settings except for `max_length`
/// (maximum output length in bytes; `0` disables truncation).
pub fn with_max_length(max_length: usize) -> Self {
    Self::new(ContentProcessorConfig {
        max_length,
        ..Default::default()
    })
}
/// Runs the full pipeline on `raw_html`: strip unwanted markup, extract
/// text, normalize whitespace, and optionally truncate to `max_length`.
/// Returns the text together with basic metrics.
#[instrument(skip(self, raw_html), fields(html_len = raw_html.len()))]
pub fn process(&self, raw_html: &str) -> ProcessedContent {
    let started = Instant::now();
    trace!("Starting content processing");
    // Pipeline: strip unwanted markup, pull out text, collapse whitespace.
    let sanitized = self.remove_scripts_styles(raw_html);
    let normalized = self.normalize_whitespace(&self.extract_text(&sanitized));
    // Apply the optional byte-length cap (0 means unlimited).
    let limit = self.config.max_length;
    let needs_cut = limit > 0 && normalized.len() > limit;
    let text = if needs_cut {
        self.truncate_with_ellipsis(&normalized, limit)
    } else {
        normalized
    };
    let word_count = text.split_whitespace().count();
    let char_count = text.chars().count();
    let processing_time_us = started.elapsed().as_micros() as u64;
    debug!(
        "Processed content: {} words, {} chars, truncated={}, time={}us",
        word_count, char_count, needs_cut, processing_time_us
    );
    ProcessedContent {
        text,
        word_count,
        char_count,
        was_truncated: needs_cut,
        processing_time_us,
    }
}
/// Parses `html` and returns its concatenated text content.
///
/// Prefers the `<body>` element when one exists; otherwise walks the whole
/// document from the root (covers fragments without a body). Parts are
/// joined with newlines when `preserve_structure` is set, single spaces
/// otherwise.
#[instrument(skip(self, html), fields(html_len = html.len()))]
pub fn extract_text(&self, html: &str) -> String {
    let document = Html::parse_document(html);
    let mut text_parts: Vec<String> = Vec::new();
    // "body" is a valid static selector, so this unwrap cannot fail.
    let body_selector = Selector::parse("body").unwrap();
    if let Some(body) = document.select(&body_selector).next() {
        self.extract_text_from_element(&body, &mut text_parts);
    } else {
        // Fragment or malformed input without a <body>: walk from the root.
        let root = document.root_element();
        self.extract_text_from_element(&root, &mut text_parts);
    }
    if self.config.preserve_structure {
        text_parts.join("\n")
    } else {
        text_parts.join(" ")
    }
}
/// Recursively collects text from `element` into `text_parts`.
///
/// Elements whose tag name is listed in `remove_tags` are skipped entirely
/// (their whole subtree is dropped). When `preserve_structure` is enabled,
/// an empty string is pushed before and after block-level elements;
/// `extract_text` later joins parts with '\n', so each empty part becomes
/// a paragraph break in the output.
fn extract_text_from_element(
    &self,
    element: &scraper::ElementRef<'_>,
    text_parts: &mut Vec<String>,
) {
    let tag_name = element.value().name().to_lowercase();
    if self.config.remove_tags.contains(&tag_name) {
        return;
    }
    // Tags that start a new line when HTML is rendered.
    let is_block = matches!(
        tag_name.as_str(),
        "p" | "div"
            | "section"
            | "article"
            | "header"
            | "footer"
            | "main"
            | "aside"
            | "nav"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "li"
            | "dt"
            | "dd"
            | "blockquote"
            | "pre"
            | "table"
            | "tr"
            | "br"
            | "hr"
    );
    // Separator before the block — only if the previous part is non-empty,
    // which prevents stacking multiple blank separators in a row.
    if is_block && self.config.preserve_structure && !text_parts.is_empty() {
        if let Some(last) = text_parts.last() {
            if !last.is_empty() {
                text_parts.push(String::new());
            }
        }
    }
    for child in element.children() {
        if let Some(text_node) = child.value().as_text() {
            let trimmed = text_node.trim();
            if !trimmed.is_empty() {
                let decoded = if self.config.decode_entities {
                    Self::decode_html_entities(trimmed)
                } else {
                    trimmed.to_string()
                };
                text_parts.push(decoded);
            }
        } else if let Some(child_element) = scraper::ElementRef::wrap(child) {
            // Non-text node: recurse into the child element.
            self.extract_text_from_element(&child_element, text_parts);
        }
    }
    // Separator after the block, mirroring the one added before it.
    if is_block && self.config.preserve_structure && !text_parts.is_empty() {
        if let Some(last) = text_parts.last() {
            if !last.is_empty() {
                text_parts.push(String::new());
            }
        }
    }
}
/// Strips HTML comments, the configured `remove_tags`, inline event
/// handlers, and `javascript:` hrefs from raw HTML via regex substitution.
///
/// NOTE(review): regex-based HTML stripping is best-effort — an attribute
/// value containing '>' defeats the `[^>]*` patterns, and the bare
/// open-tag pattern below can leave a stray closing tag behind. The
/// DOM-based extraction in `extract_text` also skips `remove_tags`, which
/// covers most of these gaps.
#[instrument(skip(self, html), fields(html_len = html.len()))]
pub fn remove_scripts_styles(&self, html: &str) -> String {
    let mut result = html.to_string();
    // HTML comments, including multi-line ones.
    result = Self::remove_pattern(&result, r"<!--[\s\S]*?-->");
    for tag in &self.config.remove_tags {
        // Paired form: <tag ...> ... </tag>, case-insensitive, non-greedy.
        let pattern = format!(r"(?is)<{}\b[^>]*>[\s\S]*?</{}>", tag, tag);
        result = Self::remove_pattern(&result, &pattern);
        // Leftover self-closing or unmatched opening tags.
        let self_closing_pattern = format!(r"(?i)<{}\b[^>]*/?>", tag);
        result = Self::remove_pattern(&result, &self_closing_pattern);
    }
    // Inline event handlers: quoted, then unquoted attribute values.
    result = Self::remove_pattern(&result, r#"(?i)\s+on\w+\s*=\s*["'][^"']*["']"#);
    result = Self::remove_pattern(&result, r#"(?i)\s+on\w+\s*=\s*[^\s>]+"#);
    // javascript: URLs in href attributes.
    result = Self::remove_pattern(&result, r#"(?i)href\s*=\s*["']javascript:[^"']*["']"#);
    trace!(
        "Removed scripts/styles: {} -> {} bytes",
        html.len(),
        result.len()
    );
    result
}
/// Deletes every match of `pattern` from `text`.
///
/// A pattern that fails to compile is treated as "nothing to remove" and
/// `text` is returned unchanged rather than propagating the error.
fn remove_pattern(text: &str, pattern: &str) -> String {
    regex::Regex::new(pattern)
        .map(|re| re.replace_all(text, "").into_owned())
        .unwrap_or_else(|_| text.to_string())
}
/// Canonicalizes whitespace in `text`.
///
/// Exotic spaces become ASCII spaces, zero-width characters are dropped,
/// and line endings become '\n'. With `preserve_structure`, runs of
/// horizontal whitespace collapse to one space, blank runs are capped at
/// one empty line, and each line is edge-trimmed; otherwise all
/// whitespace collapses to single spaces. The result is trimmed.
#[instrument(skip(self, text), fields(text_len = text.len()))]
pub fn normalize_whitespace(&self, text: &str) -> String {
    // Map exotic spaces to ASCII space, drop zero-width characters, and
    // canonicalize tabs and line endings.
    let unified = text
        .replace(
            ['\u{00A0}', '\u{2002}', '\u{2003}', '\u{2009}', '\u{200A}'],
            " ",
        )
        .replace(['\u{200B}', '\u{FEFF}'], "")
        .replace('\t', " ")
        .replace("\r\n", "\n")
        .replace('\r', "\n");
    let collapsed = if self.config.preserve_structure {
        // Squeeze horizontal whitespace, cap consecutive newlines at two,
        // then strip every line's edges.
        let horizontal = regex::Regex::new(r"[^\S\n]+").unwrap();
        let squeezed = horizontal.replace_all(&unified, " ");
        let newline_runs = regex::Regex::new(r"\n{3,}").unwrap();
        let limited = newline_runs.replace_all(&squeezed, "\n\n");
        limited
            .lines()
            .map(str::trim)
            .collect::<Vec<_>>()
            .join("\n")
    } else {
        // Flatten everything, newlines included, to single spaces.
        let any_ws = regex::Regex::new(r"\s+").unwrap();
        any_ws.replace_all(&unified, " ").into_owned()
    };
    collapsed.trim().to_string()
}
/// Truncates `text` to at most `max` bytes, appending "..." and preferring
/// to cut at a word boundary.
///
/// `max` is a byte budget; the returned string is at most `max` bytes.
/// Fix: the previous version sliced at a raw byte offset
/// (`text[..effective_max]`), which panics when that offset falls inside a
/// multi-byte UTF-8 sequence. We now back up to the nearest char boundary
/// before slicing.
#[instrument(skip(self, text), fields(text_len = text.len(), max = max))]
pub fn truncate_with_ellipsis(&self, text: &str, max: usize) -> String {
    // Nothing to do when the text already fits the byte budget.
    if text.len() <= max {
        return text.to_string();
    }
    // Reserve three bytes for the "..." suffix.
    let effective_max = max.saturating_sub(3);
    if effective_max == 0 {
        return "...".to_string();
    }
    // `effective_max` may land inside a multi-byte UTF-8 sequence; walk
    // back to a char boundary so the slices below cannot panic.
    // (Terminates: index 0 is always a boundary.)
    let mut boundary = effective_max;
    while !text.is_char_boundary(boundary) {
        boundary -= 1;
    }
    // Prefer cutting at the last whitespace before the limit so words stay
    // intact, unless that would discard more than ~80% of the budget.
    let truncate_at = text[..boundary]
        .rfind(|c: char| c.is_whitespace())
        .unwrap_or(boundary);
    let min_length = effective_max / 5;
    let truncate_at = if truncate_at < min_length {
        boundary
    } else {
        truncate_at
    };
    let mut result = text[..truncate_at].trim_end().to_string();
    result.push_str("...");
    trace!("Truncated from {} to {} chars", text.len(), result.len());
    result
}
pub fn decode_html_entities(text: &str) -> String {
let mut result = text.to_string();
let named_entities = [
("&", "&"),
("<", "<"),
(">", ">"),
(""", "\""),
("'", "'"),
(" ", " "),
("–", "\u{2013}"),
("—", "\u{2014}"),
("‘", "\u{2018}"),
("’", "\u{2019}"),
("“", "\u{201C}"),
("”", "\u{201D}"),
("…", "\u{2026}"),
("™", "\u{2122}"),
("©", "\u{00A9}"),
("®", "\u{00AE}"),
("°", "\u{00B0}"),
("±", "\u{00B1}"),
("×", "\u{00D7}"),
("÷", "\u{00F7}"),
("€", "\u{20AC}"),
("£", "\u{00A3}"),
("¥", "\u{00A5}"),
("¢", "\u{00A2}"),
];
for (entity, replacement) in named_entities {
result = result.replace(entity, replacement);
}
if result.contains("&#") {
let decimal_re = regex::Regex::new(r"&#(\d+);").unwrap();
result = decimal_re
.replace_all(&result, |caps: ®ex::Captures| {
caps.get(1)
.and_then(|m| m.as_str().parse::<u32>().ok())
.and_then(char::from_u32)
.map(|c| c.to_string())
.unwrap_or_else(|| caps[0].to_string())
})
.to_string();
let hex_re = regex::Regex::new(r"(?i)&#x([0-9a-f]+);").unwrap();
result = hex_re
.replace_all(&result, |caps: ®ex::Captures| {
caps.get(1)
.and_then(|m| u32::from_str_radix(m.as_str(), 16).ok())
.and_then(char::from_u32)
.map(|c| c.to_string())
.unwrap_or_else(|| caps[0].to_string())
})
.to_string();
}
result
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic_processing() {
    // A trivial document round-trips to its visible text.
    let p = ContentProcessor::with_defaults();
    let processed = p.process("<html><body><p>Hello world!</p></body></html>");
    assert_eq!(processed.text.trim(), "Hello world!");
    assert_eq!(processed.word_count, 2);
    assert!(!processed.was_truncated);
}
#[test]
fn test_script_removal() {
    // Script bodies must never leak into the extracted text.
    let p = ContentProcessor::with_defaults();
    let page = r#"
<html>
<head><script>alert('evil');</script></head>
<body>
<p>Safe content</p>
<script type="text/javascript">
malicious_code();
</script>
</body>
</html>
"#;
    let processed = p.process(page);
    assert!(processed.text.contains("Safe content"));
    assert!(!processed.text.contains("evil"));
    assert!(!processed.text.contains("malicious"));
}
#[test]
fn test_style_removal() {
    // CSS rules must never leak into the extracted text.
    let p = ContentProcessor::with_defaults();
    let page = r#"
<html>
<head><style>.hidden { display: none; }</style></head>
<body>
<p>Visible text</p>
<style>
body { background: red; }
</style>
</body>
</html>
"#;
    let processed = p.process(page);
    assert!(processed.text.contains("Visible text"));
    assert!(!processed.text.contains("display"));
    assert!(!processed.text.contains("background"));
}
#[test]
fn test_entity_decoding() {
    // Fix: the entity references in this fixture had been garbled into
    // their decoded characters, making the test vacuous. Restore them so
    // the test actually exercises entity decoding.
    let processor = ContentProcessor::with_defaults();
    let html = "<p>Tom &amp; Jerry &lt;3 &quot;cheese&quot;</p>";
    let result = processor.process(html);
    assert!(result.text.contains("Tom & Jerry"));
    assert!(result.text.contains("<3"));
    assert!(result.text.contains("\"cheese\""));
}
#[test]
fn test_numeric_entity_decoding() {
    // Fix: the numeric references had been garbled into literal
    // apostrophes, turning this into an identity check. Decimal (&#39;)
    // and hex (&#x27;) references both decode to an apostrophe.
    let decoded = ContentProcessor::decode_html_entities("&#39;hello&#39; &#x27;world&#x27;");
    assert_eq!(decoded, "'hello' 'world'");
}
#[test]
fn test_whitespace_normalization() {
    // Fix: the fixture's whitespace/entities had been collapsed in
    // transit, leaving two identical strings OR-ed together and a
    // single-space `contains` check. Restore multi-space input so the
    // test actually exercises collapsing.
    let processor = ContentProcessor::with_defaults();
    let html = "<p>Too&nbsp;&nbsp;many   spaces</p>";
    let result = processor.process(html);
    assert!(!result.text.contains("  "));
    assert!(result.text.contains("Too many spaces"));
}
#[test]
fn test_structure_preservation() {
    // Both paragraphs survive when structure preservation is on.
    let cfg = ContentProcessorConfig {
        preserve_structure: true,
        ..Default::default()
    };
    let p = ContentProcessor::new(cfg);
    let processed = p.process("<p>Paragraph 1</p><p>Paragraph 2</p>");
    assert!(processed.text.contains("Paragraph 1"));
    assert!(processed.text.contains("Paragraph 2"));
}
#[test]
fn test_truncation_with_ellipsis() {
    // Output must respect the cap and carry the ellipsis suffix.
    let p = ContentProcessor::with_max_length(20);
    let processed =
        p.process("<p>This is a very long piece of text that should be truncated.</p>");
    assert!(processed.was_truncated);
    assert!(processed.text.ends_with("..."));
    assert!(processed.text.len() <= 20);
}

#[test]
fn test_truncation_at_word_boundary() {
    let p = ContentProcessor::with_defaults();
    let clipped = p.truncate_with_ellipsis("Hello world how are you doing today", 15);
    assert!(clipped.ends_with("..."));
    // The cut should land on a word boundary, not mid-word.
    assert!(!clipped.contains("wor...") || clipped == "Hello world...");
}

#[test]
fn test_no_truncation_for_short_content() {
    // Content well under the cap passes through untouched.
    let p = ContentProcessor::with_max_length(1000);
    assert!(!p.process("<p>Short content</p>").was_truncated);
}
#[test]
fn test_noscript_removal() {
    // <noscript> fallbacks are configured for removal by default.
    let p = ContentProcessor::with_defaults();
    let page = r#"
<body>
<noscript>Enable JavaScript!</noscript>
<p>Content</p>
</body>
"#;
    let processed = p.process(page);
    assert!(processed.text.contains("Content"));
    assert!(!processed.text.contains("JavaScript"));
}

#[test]
fn test_comment_removal() {
    // Single-line and multi-line comments are both stripped.
    let p = ContentProcessor::with_defaults();
    let page = r#"
<body>
<!-- This is a comment -->
<p>Visible</p>
<!-- Another comment
with multiple lines -->
</body>
"#;
    let stripped = p.remove_scripts_styles(page);
    assert!(!stripped.contains("This is a comment"));
    assert!(!stripped.contains("Another comment"));
}

#[test]
fn test_inline_event_handler_removal() {
    let p = ContentProcessor::with_defaults();
    let stripped = p.remove_scripts_styles(r#"<button onclick="evil()">Click</button>"#);
    assert!(!stripped.contains("onclick"));
    assert!(!stripped.contains("evil"));
}

#[test]
fn test_javascript_href_removal() {
    let p = ContentProcessor::with_defaults();
    let stripped = p.remove_scripts_styles(r#"<a href="javascript:alert('xss')">Click</a>"#);
    assert!(!stripped.contains("javascript:"));
}
#[test]
fn test_special_whitespace_normalization() {
    // Fix: the final assertion OR-ed two identical strings (a garbling
    // artifact); a single check is sufficient. U+00A0 (nbsp) and U+2003
    // (em space) must become plain ASCII spaces.
    let processor = ContentProcessor::with_defaults();
    let text_with_nbsp = "Hello\u{00A0}world\u{2003}test";
    let normalized = processor.normalize_whitespace(text_with_nbsp);
    assert!(!normalized.contains('\u{00A0}'));
    assert!(!normalized.contains('\u{2003}'));
    assert!(normalized.contains("Hello world test"));
}
#[test]
fn test_processed_content_metrics() {
    // Word/char counters reflect the extracted text.
    let p = ContentProcessor::with_defaults();
    let processed = p.process("<p>One two three four five</p>");
    assert_eq!(processed.word_count, 5);
    assert!(processed.char_count > 0);
    let _ = processed.processing_time_us;
}

#[test]
fn test_empty_html() {
    // An empty body yields empty text / zero words.
    let p = ContentProcessor::with_defaults();
    let processed = p.process("<html><body></body></html>");
    assert!(processed.text.is_empty() || processed.word_count == 0);
}

#[test]
fn test_deeply_nested_content() {
    // Recursion reaches text at any nesting depth.
    let p = ContentProcessor::with_defaults();
    let deep = "<div><div><div><span><p>Deep content</p></span></div></div></div>";
    assert!(p.process(deep).text.contains("Deep content"));
}
#[test]
fn test_mixed_content() {
    // End-to-end: head junk is removed while body text, including text
    // inside inline elements, is preserved.
    let p = ContentProcessor::with_defaults();
    let page = r#"
<html>
<head>
<title>Test Page</title>
<script>bad();</script>
<style>.foo { color: red; }</style>
</head>
<body>
<header><nav>Menu</nav></header>
<main>
<article>
<h1>Article Title</h1>
<p>First paragraph with <strong>bold</strong> text.</p>
<p>Second paragraph with a <a href="http://example.com">link</a>.</p>
</article>
</main>
<footer>© 2024</footer>
</body>
</html>
"#;
    let processed = p.process(page);
    assert!(processed.text.contains("Article Title"));
    assert!(processed.text.contains("First paragraph"));
    assert!(processed.text.contains("bold"));
    assert!(processed.text.contains("link"));
    assert!(!processed.text.contains("bad()"));
    assert!(!processed.text.contains("color: red"));
}
#[test]
fn test_unicode_content() {
    // Emoji and accented characters must pass through untouched.
    let p = ContentProcessor::with_defaults();
    let processed = p.process("<p>Hello \u{1F600} World! Caf\u{00E9}</p>");
    assert!(processed.text.contains("\u{1F600}"));
    assert!(processed.text.contains("Caf\u{00E9}"));
}

#[test]
fn test_custom_remove_tags() {
    // A caller-supplied removal list replaces the default one.
    let cfg = ContentProcessorConfig {
        remove_tags: vec!["script".to_string(), "style".to_string(), "nav".to_string()],
        ..Default::default()
    };
    let p = ContentProcessor::new(cfg);
    let processed = p.process("<nav>Navigation</nav><p>Content</p>");
    assert!(!processed.text.contains("Navigation"));
    assert!(processed.text.contains("Content"));
}
#[test]
fn test_without_entity_decoding() {
    // Fix: the fixture's entity references had been garbled into bare
    // characters and the assertion OR-ed two identical strings. Restore
    // the encoded input.
    let config = ContentProcessorConfig {
        decode_entities: false,
        ..Default::default()
    };
    let processor = ContentProcessor::new(config);
    let html = "<p>&amp; &lt; &gt;</p>";
    let result = processor.process(html);
    // The HTML parser itself may decode entities in text nodes, so accept
    // either the raw entity or the decoded character.
    assert!(result.text.contains("&amp;") || result.text.contains("&"));
}
#[test]
fn test_extract_text_directly() {
    // extract_text alone keeps text from inline children.
    let p = ContentProcessor::with_defaults();
    let text = p.extract_text("<p>Direct <em>extraction</em> test</p>");
    for expected in ["Direct", "extraction", "test"] {
        assert!(text.contains(expected));
    }
}

#[test]
fn test_remove_scripts_styles_directly() {
    // remove_scripts_styles alone drops script/style but keeps markup.
    let p = ContentProcessor::with_defaults();
    let stripped =
        p.remove_scripts_styles("<script>bad();</script><p>Good</p><style>.x{}</style>");
    assert!(!stripped.contains("bad()"));
    assert!(!stripped.contains(".x{}"));
    assert!(stripped.contains("<p>Good</p>"));
}
#[test]
fn test_normalize_whitespace_directly() {
    // Fix: the fixture's space runs and the double-space assertion were
    // collapsed to single spaces in transit (a single-space `contains`
    // would always fail on any multi-word output). Restore multi-space
    // input and the two-space check.
    let processor = ContentProcessor::with_defaults();
    let text = "  Multiple   spaces and\n\n\n\nmany newlines  ";
    let normalized = processor.normalize_whitespace(text);
    assert!(!normalized.starts_with(' '));
    assert!(!normalized.ends_with(' '));
    // Interior runs of spaces are squeezed to a single space.
    assert!(!normalized.contains("  "));
}
}