1use scraper::{Html, Selector};
31use serde::{Deserialize, Serialize};
32use std::time::Instant;
33use tracing::{debug, instrument, trace};
34
/// Settings that control how [`ContentProcessor`] turns HTML into plain text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentProcessorConfig {
    /// Upper bound on the processed text, compared against its byte length.
    /// `0` disables truncation.
    pub max_length: usize,
    /// When `true`, extracted fragments are joined with newlines and blank
    /// lines mark block-level element boundaries; when `false`, fragments are
    /// joined with single spaces.
    pub preserve_structure: bool,
    /// Minimum content length.
    ///
    /// NOTE(review): no code in this file reads this field; confirm whether a
    /// caller elsewhere relies on it.
    pub min_content_length: usize,
    /// Names of tags whose elements are dropped together with their contents.
    pub remove_tags: Vec<String>,
    /// Whether extracted text nodes are passed through
    /// `ContentProcessor::decode_html_entities`.
    pub decode_entities: bool,
}
49
50impl Default for ContentProcessorConfig {
51 fn default() -> Self {
52 Self {
53 max_length: 0, preserve_structure: true,
55 min_content_length: 10,
56 remove_tags: vec![
57 "script".to_string(),
58 "style".to_string(),
59 "noscript".to_string(),
60 "template".to_string(),
61 "svg".to_string(),
62 "math".to_string(),
63 ],
64 decode_entities: true,
65 }
66 }
67}
68
69#[derive(Debug, Clone)]
73pub struct ContentProcessor {
74 config: ContentProcessorConfig,
75}
76
77#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
79pub struct ProcessedContent {
80 pub text: String,
82 pub word_count: usize,
84 pub char_count: usize,
86 pub was_truncated: bool,
88 pub processing_time_us: u64,
90}
91
92impl ContentProcessor {
93 pub fn new(config: ContentProcessorConfig) -> Self {
95 Self { config }
96 }
97
98 pub fn with_defaults() -> Self {
100 Self::new(ContentProcessorConfig::default())
101 }
102
103 pub fn with_max_length(max_length: usize) -> Self {
105 Self::new(ContentProcessorConfig {
106 max_length,
107 ..Default::default()
108 })
109 }
110
111 #[instrument(skip(self, raw_html), fields(html_len = raw_html.len()))]
120 pub fn process(&self, raw_html: &str) -> ProcessedContent {
121 let start = Instant::now();
122 trace!("Starting content processing");
123
124 let cleaned_html = self.remove_scripts_styles(raw_html);
126
127 let extracted_text = self.extract_text(&cleaned_html);
129
130 let normalized = self.normalize_whitespace(&extracted_text);
132
133 let (text, was_truncated) =
135 if self.config.max_length > 0 && normalized.len() > self.config.max_length {
136 let truncated = self.truncate_with_ellipsis(&normalized, self.config.max_length);
137 (truncated, true)
138 } else {
139 (normalized, false)
140 };
141
142 let word_count = text.split_whitespace().count();
144 let char_count = text.chars().count();
145 let processing_time_us = start.elapsed().as_micros() as u64;
146
147 debug!(
148 "Processed content: {} words, {} chars, truncated={}, time={}us",
149 word_count, char_count, was_truncated, processing_time_us
150 );
151
152 ProcessedContent {
153 text,
154 word_count,
155 char_count,
156 was_truncated,
157 processing_time_us,
158 }
159 }
160
161 #[instrument(skip(self, html), fields(html_len = html.len()))]
166 pub fn extract_text(&self, html: &str) -> String {
167 let document = Html::parse_document(html);
168 let mut text_parts: Vec<String> = Vec::new();
169
170 let body_selector = Selector::parse("body").unwrap();
172
173 if let Some(body) = document.select(&body_selector).next() {
174 self.extract_text_from_element(&body, &mut text_parts);
175 } else {
176 let root = document.root_element();
178 self.extract_text_from_element(&root, &mut text_parts);
179 }
180
181 if self.config.preserve_structure {
182 text_parts.join("\n")
183 } else {
184 text_parts.join(" ")
185 }
186 }
187
188 fn extract_text_from_element(
190 &self,
191 element: &scraper::ElementRef<'_>,
192 text_parts: &mut Vec<String>,
193 ) {
194 let tag_name = element.value().name().to_lowercase();
195
196 if self.config.remove_tags.contains(&tag_name) {
198 return;
199 }
200
201 let is_block = matches!(
203 tag_name.as_str(),
204 "p" | "div"
205 | "section"
206 | "article"
207 | "header"
208 | "footer"
209 | "main"
210 | "aside"
211 | "nav"
212 | "h1"
213 | "h2"
214 | "h3"
215 | "h4"
216 | "h5"
217 | "h6"
218 | "li"
219 | "dt"
220 | "dd"
221 | "blockquote"
222 | "pre"
223 | "table"
224 | "tr"
225 | "br"
226 | "hr"
227 );
228
229 if is_block && self.config.preserve_structure && !text_parts.is_empty() {
231 if let Some(last) = text_parts.last() {
232 if !last.is_empty() {
233 text_parts.push(String::new());
234 }
235 }
236 }
237
238 for child in element.children() {
240 if let Some(text_node) = child.value().as_text() {
241 let trimmed = text_node.trim();
242 if !trimmed.is_empty() {
243 let decoded = if self.config.decode_entities {
244 Self::decode_html_entities(trimmed)
245 } else {
246 trimmed.to_string()
247 };
248 text_parts.push(decoded);
249 }
250 } else if let Some(child_element) = scraper::ElementRef::wrap(child) {
251 self.extract_text_from_element(&child_element, text_parts);
252 }
253 }
254
255 if is_block && self.config.preserve_structure && !text_parts.is_empty() {
257 if let Some(last) = text_parts.last() {
258 if !last.is_empty() {
259 text_parts.push(String::new());
260 }
261 }
262 }
263 }
264
265 #[instrument(skip(self, html), fields(html_len = html.len()))]
274 pub fn remove_scripts_styles(&self, html: &str) -> String {
275 let mut result = html.to_string();
276
277 result = Self::remove_pattern(&result, r"<!--[\s\S]*?-->");
279
280 for tag in &self.config.remove_tags {
282 let pattern = format!(r"(?is)<{}\b[^>]*>[\s\S]*?</{}>", tag, tag);
284 result = Self::remove_pattern(&result, &pattern);
285
286 let self_closing_pattern = format!(r"(?i)<{}\b[^>]*/?>", tag);
288 result = Self::remove_pattern(&result, &self_closing_pattern);
289 }
290
291 result = Self::remove_pattern(&result, r#"(?i)\s+on\w+\s*=\s*["'][^"']*["']"#);
293 result = Self::remove_pattern(&result, r#"(?i)\s+on\w+\s*=\s*[^\s>]+"#);
294 result = Self::remove_pattern(&result, r#"(?i)href\s*=\s*["']javascript:[^"']*["']"#);
295
296 trace!(
297 "Removed scripts/styles: {} -> {} bytes",
298 html.len(),
299 result.len()
300 );
301 result
302 }
303
304 fn remove_pattern(text: &str, pattern: &str) -> String {
306 match regex::Regex::new(pattern) {
307 Ok(re) => re.replace_all(text, "").to_string(),
308 Err(_) => text.to_string(),
309 }
310 }
311
312 #[instrument(skip(self, text), fields(text_len = text.len()))]
320 pub fn normalize_whitespace(&self, text: &str) -> String {
321 let mut result = text.to_string();
322
323 result = result
325 .replace(
326 ['\u{00A0}', '\u{2002}', '\u{2003}', '\u{2009}', '\u{200A}'],
327 " ",
328 )
329 .replace(['\u{200B}', '\u{FEFF}'], ""); result = result.replace('\t', " ");
333
334 result = result.replace("\r\n", "\n").replace('\r', "\n");
336
337 if self.config.preserve_structure {
338 let space_re = regex::Regex::new(r"[^\S\n]+").unwrap();
340 result = space_re.replace_all(&result, " ").to_string();
341
342 let newline_re = regex::Regex::new(r"\n{3,}").unwrap();
344 result = newline_re.replace_all(&result, "\n\n").to_string();
345
346 result = result
348 .lines()
349 .map(|line| line.trim())
350 .collect::<Vec<_>>()
351 .join("\n");
352 } else {
353 let ws_re = regex::Regex::new(r"\s+").unwrap();
355 result = ws_re.replace_all(&result, " ").to_string();
356 }
357
358 result.trim().to_string()
359 }
360
361 #[instrument(skip(self, text), fields(text_len = text.len(), max = max))]
367 pub fn truncate_with_ellipsis(&self, text: &str, max: usize) -> String {
368 if text.len() <= max {
369 return text.to_string();
370 }
371
372 let effective_max = max.saturating_sub(3);
374 if effective_max == 0 {
375 return "...".to_string();
376 }
377
378 let truncate_at = text[..effective_max]
380 .rfind(|c: char| c.is_whitespace())
381 .unwrap_or(effective_max);
382
383 let min_length = effective_max / 5;
385 let truncate_at = if truncate_at < min_length {
386 effective_max
387 } else {
388 truncate_at
389 };
390
391 let mut result = text[..truncate_at].trim_end().to_string();
392 result.push_str("...");
393
394 trace!("Truncated from {} to {} chars", text.len(), result.len());
395 result
396 }
397
/// Decodes HTML character references in `text`.
///
/// Supported forms:
/// * a fixed set of named references (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
///   `&apos;`, `&nbsp;`, common punctuation, symbols and currency signs),
///   matched case-sensitively;
/// * decimal references such as `&#39;`;
/// * hexadecimal references such as `&#x27;` or `&#X27;`.
///
/// The input is scanned once from left to right and every reference is
/// decoded exactly once, so `&amp;lt;` yields the literal text `&lt;`
/// rather than `<`. Unknown names, bare ampersands and numeric references
/// that are not valid Unicode scalar values are copied through unchanged.
pub fn decode_html_entities(text: &str) -> String {
    // Reference names without the surrounding `&` and `;`.
    const NAMED: &[(&str, char)] = &[
        ("amp", '&'),
        ("lt", '<'),
        ("gt", '>'),
        ("quot", '"'),
        ("apos", '\''),
        ("nbsp", ' '),
        ("ndash", '\u{2013}'),
        ("mdash", '\u{2014}'),
        ("lsquo", '\u{2018}'),
        ("rsquo", '\u{2019}'),
        ("ldquo", '\u{201C}'),
        ("rdquo", '\u{201D}'),
        ("hellip", '\u{2026}'),
        ("trade", '\u{2122}'),
        ("copy", '\u{00A9}'),
        ("reg", '\u{00AE}'),
        ("deg", '\u{00B0}'),
        ("plusmn", '\u{00B1}'),
        ("times", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("euro", '\u{20AC}'),
        ("pound", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
    ];

    // Maps the text between `&` and `;` to the character it denotes.
    fn resolve(reference: &str) -> Option<char> {
        if let Some(number) = reference.strip_prefix('#') {
            let code_point = match number.strip_prefix(['x', 'X']) {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => number.parse::<u32>().ok()?,
            };
            return char::from_u32(code_point);
        }
        NAMED
            .iter()
            .find(|(name, _)| *name == reference)
            .map(|&(_, ch)| ch)
    }

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // A reference body is a run of ASCII alphanumerics or `#`, directly
        // followed by `;`. Bounding the scan this way keeps the pass linear.
        let body_len = after
            .bytes()
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
            .count();
        let resolved = if after.as_bytes().get(body_len) == Some(&b';') {
            resolve(&after[..body_len])
        } else {
            None
        };

        match resolved {
            Some(ch) => {
                decoded.push(ch);
                rest = &after[body_len + 1..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally and
                // resume right after it.
                decoded.push('&');
                rest = after;
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
466}
467
#[cfg(test)]
mod tests {
    //! Unit tests for the HTML content processor.
    //!
    //! Inputs that exercise entity handling spell the entities out
    //! (`&amp;`, `&#39;`, ...) and whitespace tests use real runs of spaces,
    //! so each test actually covers the behaviour it is named after.

    use super::*;

    #[test]
    fn test_basic_processing() {
        let processor = ContentProcessor::with_defaults();
        let html = "<html><body><p>Hello world!</p></body></html>";
        let result = processor.process(html);

        assert_eq!(result.text.trim(), "Hello world!");
        assert_eq!(result.word_count, 2);
        assert!(!result.was_truncated);
    }

    #[test]
    fn test_script_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <html>
            <head><script>alert('evil');</script></head>
            <body>
                <p>Safe content</p>
                <script type="text/javascript">
                    malicious_code();
                </script>
            </body>
            </html>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Safe content"));
        assert!(!result.text.contains("evil"));
        assert!(!result.text.contains("malicious"));
    }

    #[test]
    fn test_style_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <html>
            <head><style>.hidden { display: none; }</style></head>
            <body>
                <p>Visible text</p>
                <style>
                    body { background: red; }
                </style>
            </body>
            </html>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Visible text"));
        assert!(!result.text.contains("display"));
        assert!(!result.text.contains("background"));
    }

    #[test]
    fn test_entity_decoding() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Tom &amp; Jerry &lt;3 &quot;cheese&quot;</p>";
        let result = processor.process(html);

        assert!(result.text.contains("Tom & Jerry"));
        assert!(result.text.contains("<3"));
        assert!(result.text.contains("\"cheese\""));
    }

    #[test]
    fn test_numeric_entity_decoding() {
        let decoded =
            ContentProcessor::decode_html_entities("&#39;hello&#39; &#x27;world&#x27;");
        assert_eq!(decoded, "'hello' 'world'");
    }

    #[test]
    fn test_whitespace_normalization() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Too    many   spaces</p>";
        let result = processor.process(html);

        assert!(!result.text.contains("  "));
        assert!(result.text.contains("Too many spaces"));
    }

    #[test]
    fn test_structure_preservation() {
        let config = ContentProcessorConfig {
            preserve_structure: true,
            ..Default::default()
        };
        let processor = ContentProcessor::new(config);
        let html = "<p>Paragraph 1</p><p>Paragraph 2</p>";
        let result = processor.process(html);

        assert!(result.text.contains("Paragraph 1"));
        // Separate paragraphs must end up on separate lines.
        assert!(result.text.contains('\n'));
        assert!(result.text.contains("Paragraph 2"));
    }

    #[test]
    fn test_truncation_with_ellipsis() {
        let processor = ContentProcessor::with_max_length(20);
        let html = "<p>This is a very long piece of text that should be truncated.</p>";
        let result = processor.process(html);

        assert!(result.was_truncated);
        assert!(result.text.ends_with("..."));
        assert!(result.text.len() <= 20);
    }

    #[test]
    fn test_truncation_at_word_boundary() {
        let processor = ContentProcessor::with_defaults();
        let text = "Hello world how are you doing today";
        let truncated = processor.truncate_with_ellipsis(text, 15);

        // Budget is 12 bytes; the last whitespace inside it follows "world".
        assert_eq!(truncated, "Hello world...");
    }

    #[test]
    fn test_no_truncation_for_short_content() {
        let processor = ContentProcessor::with_max_length(1000);
        let html = "<p>Short content</p>";
        let result = processor.process(html);

        assert!(!result.was_truncated);
    }

    #[test]
    fn test_noscript_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <body>
                <noscript>Enable JavaScript!</noscript>
                <p>Content</p>
            </body>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Content"));
        assert!(!result.text.contains("JavaScript"));
    }

    #[test]
    fn test_comment_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <body>
                <!-- This is a comment -->
                <p>Visible</p>
                <!-- Another comment
                     with multiple lines -->
            </body>
        "#;
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("This is a comment"));
        assert!(!cleaned.contains("Another comment"));
    }

    #[test]
    fn test_inline_event_handler_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"<button onclick="evil()">Click</button>"#;
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("onclick"));
        assert!(!cleaned.contains("evil"));
    }

    #[test]
    fn test_javascript_href_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"<a href="javascript:alert('xss')">Click</a>"#;
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("javascript:"));
    }

    #[test]
    fn test_special_whitespace_normalization() {
        let processor = ContentProcessor::with_defaults();
        let text_with_nbsp = "Hello\u{00A0}world\u{2003}test";
        let normalized = processor.normalize_whitespace(text_with_nbsp);

        assert!(!normalized.contains('\u{00A0}'));
        assert!(!normalized.contains('\u{2003}'));
        assert_eq!(normalized, "Hello world test");
    }

    #[test]
    fn test_processed_content_metrics() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>One two three four five</p>";
        let result = processor.process(html);

        assert_eq!(result.word_count, 5);
        assert!(result.char_count > 0);
        let _ = result.processing_time_us;
    }

    #[test]
    fn test_empty_html() {
        let processor = ContentProcessor::with_defaults();
        let html = "<html><body></body></html>";
        let result = processor.process(html);

        assert!(result.text.is_empty() || result.word_count == 0);
    }

    #[test]
    fn test_deeply_nested_content() {
        let processor = ContentProcessor::with_defaults();
        let html = "<div><div><div><span><p>Deep content</p></span></div></div></div>";
        let result = processor.process(html);

        assert!(result.text.contains("Deep content"));
    }

    #[test]
    fn test_mixed_content() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <html>
            <head>
                <title>Test Page</title>
                <script>bad();</script>
                <style>.foo { color: red; }</style>
            </head>
            <body>
                <header><nav>Menu</nav></header>
                <main>
                    <article>
                        <h1>Article Title</h1>
                        <p>First paragraph with <strong>bold</strong> text.</p>
                        <p>Second paragraph with a <a href="http://example.com">link</a>.</p>
                    </article>
                </main>
                <footer>&copy; 2024</footer>
            </body>
            </html>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Article Title"));
        assert!(result.text.contains("First paragraph"));
        assert!(result.text.contains("bold"));
        assert!(result.text.contains("link"));
        assert!(!result.text.contains("bad()"));
        assert!(!result.text.contains("color: red"));
    }

    #[test]
    fn test_unicode_content() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Hello \u{1F600} World! Caf\u{00E9}</p>";
        let result = processor.process(html);

        assert!(result.text.contains("\u{1F600}"));
        assert!(result.text.contains("Caf\u{00E9}"));
    }

    #[test]
    fn test_custom_remove_tags() {
        let config = ContentProcessorConfig {
            remove_tags: vec!["script".to_string(), "style".to_string(), "nav".to_string()],
            ..Default::default()
        };
        let processor = ContentProcessor::new(config);
        let html = "<nav>Navigation</nav><p>Content</p>";
        let result = processor.process(html);

        assert!(!result.text.contains("Navigation"));
        assert!(result.text.contains("Content"));
    }

    #[test]
    fn test_without_entity_decoding() {
        let config = ContentProcessorConfig {
            decode_entities: false,
            ..Default::default()
        };
        let processor = ContentProcessor::new(config);
        let html = "<p>&amp; &lt; &gt;</p>";
        let result = processor.process(html);

        // The HTML parser may already have resolved the references, so
        // either spelling of the ampersand is accepted.
        assert!(result.text.contains("&amp;") || result.text.contains("&"));
    }

    #[test]
    fn test_extract_text_directly() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Direct <em>extraction</em> test</p>";
        let text = processor.extract_text(html);

        assert!(text.contains("Direct"));
        assert!(text.contains("extraction"));
        assert!(text.contains("test"));
    }

    #[test]
    fn test_remove_scripts_styles_directly() {
        let processor = ContentProcessor::with_defaults();
        let html = "<script>bad();</script><p>Good</p><style>.x{}</style>";
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("bad()"));
        assert!(!cleaned.contains(".x{}"));
        assert!(cleaned.contains("<p>Good</p>"));
    }

    #[test]
    fn test_normalize_whitespace_directly() {
        let processor = ContentProcessor::with_defaults();
        let text = "  Multiple   spaces   and\n\n\n\nmany   newlines  ";
        let normalized = processor.normalize_whitespace(text);

        assert!(!normalized.starts_with(' '));
        assert!(!normalized.ends_with(' '));
        assert!(!normalized.contains("  "));
        assert!(!normalized.contains("\n\n\n"));
    }
}