reasonkit_web/
processing.rs

1//! DOM Content Processing Utilities
2//!
3//! This module provides high-performance utilities for processing raw HTML content,
4//! extracting clean text, and normalizing web page content for downstream consumption.
5//!
6//! # Features
7//!
8//! - **HTML Cleaning**: Remove scripts, styles, and other non-content elements
9//! - **Text Extraction**: Convert HTML to clean, readable text
10//! - **Entity Decoding**: Properly decode HTML entities
11//! - **Whitespace Normalization**: Clean up excessive whitespace while preserving structure
12//! - **Truncation**: Intelligently truncate content with ellipsis
13//!
14//! # Example
15//!
16//! ```rust
17//! use reasonkit_web::processing::{ContentProcessor, ContentProcessorConfig};
18//!
19//! let config = ContentProcessorConfig::default();
20//! let processor = ContentProcessor::new(config);
21//!
22//! let html = r#"<html><head><script>evil();</script></head>
23//!               <body><p>Hello &amp; welcome!</p></body></html>"#;
24//!
25//! let result = processor.process(html);
26//! assert!(result.text.contains("Hello & welcome!"));
27//! assert!(!result.text.contains("evil"));
28//! ```
29
30use scraper::{Html, Selector};
31use serde::{Deserialize, Serialize};
32use std::time::Instant;
33use tracing::{debug, instrument, trace};
34
/// Tunable settings that control how a `ContentProcessor` cleans and shapes text
#[derive(Clone, Debug)]
pub struct ContentProcessorConfig {
    /// Upper bound on the processed text, compared against its byte length
    /// (`0` disables truncation)
    pub max_length: usize,
    /// When `true`, block-level elements are separated by blank lines and line
    /// breaks survive whitespace normalization
    pub preserve_structure: bool,
    /// Smallest amount of text regarded as real content
    ///
    /// NOTE(review): none of the processor methods visible in this file read this
    /// field — confirm whether a caller elsewhere relies on it.
    pub min_content_length: usize,
    /// Element names whose markup *and* contents are discarded
    pub remove_tags: Vec<String>,
    /// When `true`, extracted text nodes are passed through entity decoding
    pub decode_entities: bool,
}

impl Default for ContentProcessorConfig {
    /// Defaults: no length cap, structure preserved, entity decoding enabled, and the
    /// usual non-content elements (`script`, `style`, `noscript`, `template`, `svg`,
    /// `math`) removed.
    fn default() -> Self {
        let remove_tags = ["script", "style", "noscript", "template", "svg", "math"]
            .iter()
            .copied()
            .map(String::from)
            .collect();

        Self {
            max_length: 0, // zero means "unlimited"
            preserve_structure: true,
            min_content_length: 10,
            remove_tags,
            decode_entities: true,
        }
    }
}
68
69/// Content processor for HTML documents
70///
71/// Provides methods to clean, extract, and normalize HTML content.
72#[derive(Debug, Clone)]
73pub struct ContentProcessor {
74    config: ContentProcessorConfig,
75}
76
77/// Result of content processing
78#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
79pub struct ProcessedContent {
80    /// The extracted and cleaned text content
81    pub text: String,
82    /// Number of words in the processed content
83    pub word_count: usize,
84    /// Number of characters in the processed content
85    pub char_count: usize,
86    /// Whether the content was truncated
87    pub was_truncated: bool,
88    /// Processing time in microseconds
89    pub processing_time_us: u64,
90}
91
92impl ContentProcessor {
93    /// Create a new content processor with the given configuration
94    pub fn new(config: ContentProcessorConfig) -> Self {
95        Self { config }
96    }
97
98    /// Create a content processor with default settings
99    pub fn with_defaults() -> Self {
100        Self::new(ContentProcessorConfig::default())
101    }
102
103    /// Create a content processor with a maximum length limit
104    pub fn with_max_length(max_length: usize) -> Self {
105        Self::new(ContentProcessorConfig {
106            max_length,
107            ..Default::default()
108        })
109    }
110
111    /// Process raw HTML and return cleaned content
112    ///
113    /// This is the main entry point for content processing. It:
114    /// 1. Removes script, style, and other non-content elements
115    /// 2. Extracts text from remaining HTML
116    /// 3. Decodes HTML entities
117    /// 4. Normalizes whitespace
118    /// 5. Optionally truncates to max_length
119    #[instrument(skip(self, raw_html), fields(html_len = raw_html.len()))]
120    pub fn process(&self, raw_html: &str) -> ProcessedContent {
121        let start = Instant::now();
122        trace!("Starting content processing");
123
124        // Step 1: Remove unwanted elements (scripts, styles, etc.)
125        let cleaned_html = self.remove_scripts_styles(raw_html);
126
127        // Step 2: Extract text from HTML
128        let extracted_text = self.extract_text(&cleaned_html);
129
130        // Step 3: Normalize whitespace
131        let normalized = self.normalize_whitespace(&extracted_text);
132
133        // Step 4: Truncate if needed
134        let (text, was_truncated) =
135            if self.config.max_length > 0 && normalized.len() > self.config.max_length {
136                let truncated = self.truncate_with_ellipsis(&normalized, self.config.max_length);
137                (truncated, true)
138            } else {
139                (normalized, false)
140            };
141
142        // Calculate metrics
143        let word_count = text.split_whitespace().count();
144        let char_count = text.chars().count();
145        let processing_time_us = start.elapsed().as_micros() as u64;
146
147        debug!(
148            "Processed content: {} words, {} chars, truncated={}, time={}us",
149            word_count, char_count, was_truncated, processing_time_us
150        );
151
152        ProcessedContent {
153            text,
154            word_count,
155            char_count,
156            was_truncated,
157            processing_time_us,
158        }
159    }
160
161    /// Extract text content from HTML
162    ///
163    /// Uses the scraper crate to parse HTML and extract text nodes,
164    /// preserving paragraph structure if configured.
165    #[instrument(skip(self, html), fields(html_len = html.len()))]
166    pub fn extract_text(&self, html: &str) -> String {
167        let document = Html::parse_document(html);
168        let mut text_parts: Vec<String> = Vec::new();
169
170        // Try to extract from body first, fall back to root
171        let body_selector = Selector::parse("body").unwrap();
172
173        if let Some(body) = document.select(&body_selector).next() {
174            self.extract_text_from_element(&body, &mut text_parts);
175        } else {
176            // No body, extract from root
177            let root = document.root_element();
178            self.extract_text_from_element(&root, &mut text_parts);
179        }
180
181        if self.config.preserve_structure {
182            text_parts.join("\n")
183        } else {
184            text_parts.join(" ")
185        }
186    }
187
188    /// Extract text from an element and its children using scraper's public API
189    fn extract_text_from_element(
190        &self,
191        element: &scraper::ElementRef<'_>,
192        text_parts: &mut Vec<String>,
193    ) {
194        let tag_name = element.value().name().to_lowercase();
195
196        // Skip removed tags entirely
197        if self.config.remove_tags.contains(&tag_name) {
198            return;
199        }
200
201        // Check if this is a block element
202        let is_block = matches!(
203            tag_name.as_str(),
204            "p" | "div"
205                | "section"
206                | "article"
207                | "header"
208                | "footer"
209                | "main"
210                | "aside"
211                | "nav"
212                | "h1"
213                | "h2"
214                | "h3"
215                | "h4"
216                | "h5"
217                | "h6"
218                | "li"
219                | "dt"
220                | "dd"
221                | "blockquote"
222                | "pre"
223                | "table"
224                | "tr"
225                | "br"
226                | "hr"
227        );
228
229        // Add blank line before block elements (for structure preservation)
230        if is_block && self.config.preserve_structure && !text_parts.is_empty() {
231            if let Some(last) = text_parts.last() {
232                if !last.is_empty() {
233                    text_parts.push(String::new());
234                }
235            }
236        }
237
238        // Process children
239        for child in element.children() {
240            if let Some(text_node) = child.value().as_text() {
241                let trimmed = text_node.trim();
242                if !trimmed.is_empty() {
243                    let decoded = if self.config.decode_entities {
244                        Self::decode_html_entities(trimmed)
245                    } else {
246                        trimmed.to_string()
247                    };
248                    text_parts.push(decoded);
249                }
250            } else if let Some(child_element) = scraper::ElementRef::wrap(child) {
251                self.extract_text_from_element(&child_element, text_parts);
252            }
253        }
254
255        // Add blank line after block elements
256        if is_block && self.config.preserve_structure && !text_parts.is_empty() {
257            if let Some(last) = text_parts.last() {
258                if !last.is_empty() {
259                    text_parts.push(String::new());
260                }
261            }
262        }
263    }
264
265    /// Remove script, style, and other non-content elements from HTML
266    ///
267    /// This method performs a comprehensive cleanup of HTML by:
268    /// - Removing `<script>` tags and their content
269    /// - Removing `<style>` tags and their content
270    /// - Removing `<noscript>` tags and their content
271    /// - Removing HTML comments
272    /// - Removing other configured tags
273    #[instrument(skip(self, html), fields(html_len = html.len()))]
274    pub fn remove_scripts_styles(&self, html: &str) -> String {
275        let mut result = html.to_string();
276
277        // Remove HTML comments first
278        result = Self::remove_pattern(&result, r"<!--[\s\S]*?-->");
279
280        // Remove each configured tag type
281        for tag in &self.config.remove_tags {
282            // Pattern for tags with content: <tag ...>...</tag>
283            let pattern = format!(r"(?is)<{}\b[^>]*>[\s\S]*?</{}>", tag, tag);
284            result = Self::remove_pattern(&result, &pattern);
285
286            // Pattern for self-closing tags: <tag ... />
287            let self_closing_pattern = format!(r"(?i)<{}\b[^>]*/?>", tag);
288            result = Self::remove_pattern(&result, &self_closing_pattern);
289        }
290
291        // Also remove inline event handlers and javascript: hrefs for security
292        result = Self::remove_pattern(&result, r#"(?i)\s+on\w+\s*=\s*["'][^"']*["']"#);
293        result = Self::remove_pattern(&result, r#"(?i)\s+on\w+\s*=\s*[^\s>]+"#);
294        result = Self::remove_pattern(&result, r#"(?i)href\s*=\s*["']javascript:[^"']*["']"#);
295
296        trace!(
297            "Removed scripts/styles: {} -> {} bytes",
298            html.len(),
299            result.len()
300        );
301        result
302    }
303
304    /// Helper to remove a regex pattern from text
305    fn remove_pattern(text: &str, pattern: &str) -> String {
306        match regex::Regex::new(pattern) {
307            Ok(re) => re.replace_all(text, "").to_string(),
308            Err(_) => text.to_string(),
309        }
310    }
311
312    /// Normalize whitespace in text
313    ///
314    /// This method:
315    /// - Collapses multiple spaces into single spaces
316    /// - Normalizes different whitespace characters (tabs, nbsp, etc.)
317    /// - Preserves paragraph breaks (double newlines) if structure preservation is enabled
318    /// - Trims leading and trailing whitespace
319    #[instrument(skip(self, text), fields(text_len = text.len()))]
320    pub fn normalize_whitespace(&self, text: &str) -> String {
321        let mut result = text.to_string();
322
323        // Replace non-breaking spaces and other special whitespace with regular space
324        result = result
325            .replace(
326                ['\u{00A0}', '\u{2002}', '\u{2003}', '\u{2009}', '\u{200A}'],
327                " ",
328            )
329            .replace(['\u{200B}', '\u{FEFF}'], ""); // Zero-width space and BOM
330
331        // Replace tabs with spaces
332        result = result.replace('\t', " ");
333
334        // Replace carriage returns with newlines
335        result = result.replace("\r\n", "\n").replace('\r', "\n");
336
337        if self.config.preserve_structure {
338            // Collapse multiple spaces (but not newlines)
339            let space_re = regex::Regex::new(r"[^\S\n]+").unwrap();
340            result = space_re.replace_all(&result, " ").to_string();
341
342            // Collapse multiple newlines (3+ becomes 2)
343            let newline_re = regex::Regex::new(r"\n{3,}").unwrap();
344            result = newline_re.replace_all(&result, "\n\n").to_string();
345
346            // Trim each line
347            result = result
348                .lines()
349                .map(|line| line.trim())
350                .collect::<Vec<_>>()
351                .join("\n");
352        } else {
353            // Collapse all whitespace into single spaces
354            let ws_re = regex::Regex::new(r"\s+").unwrap();
355            result = ws_re.replace_all(&result, " ").to_string();
356        }
357
358        result.trim().to_string()
359    }
360
361    /// Truncate text with ellipsis at a word boundary
362    ///
363    /// This method truncates text to approximately the given maximum length,
364    /// breaking at word boundaries to avoid cutting words in half.
365    /// Appends "..." to indicate truncation.
366    #[instrument(skip(self, text), fields(text_len = text.len(), max = max))]
367    pub fn truncate_with_ellipsis(&self, text: &str, max: usize) -> String {
368        if text.len() <= max {
369            return text.to_string();
370        }
371
372        // Reserve space for ellipsis
373        let effective_max = max.saturating_sub(3);
374        if effective_max == 0 {
375            return "...".to_string();
376        }
377
378        // Find the last space before the limit
379        let truncate_at = text[..effective_max]
380            .rfind(|c: char| c.is_whitespace())
381            .unwrap_or(effective_max);
382
383        // Avoid truncating too short (at least 20% of max)
384        let min_length = effective_max / 5;
385        let truncate_at = if truncate_at < min_length {
386            effective_max
387        } else {
388            truncate_at
389        };
390
391        let mut result = text[..truncate_at].trim_end().to_string();
392        result.push_str("...");
393
394        trace!("Truncated from {} to {} chars", text.len(), result.len());
395        result
396    }
397
398    /// Decode HTML entities in text
399    ///
400    /// Handles common HTML entities including:
401    /// - Named entities (&amp;, &lt;, &gt;, &quot;, &nbsp;, etc.)
402    /// - Numeric entities (&#39;, &#x27;, etc.)
403    pub fn decode_html_entities(text: &str) -> String {
404        let mut result = text.to_string();
405
406        // Named entities (most common first for performance)
407        let named_entities = [
408            ("&amp;", "&"),
409            ("&lt;", "<"),
410            ("&gt;", ">"),
411            ("&quot;", "\""),
412            ("&apos;", "'"),
413            ("&nbsp;", " "),
414            ("&ndash;", "\u{2013}"),
415            ("&mdash;", "\u{2014}"),
416            ("&lsquo;", "\u{2018}"),
417            ("&rsquo;", "\u{2019}"),
418            ("&ldquo;", "\u{201C}"),
419            ("&rdquo;", "\u{201D}"),
420            ("&hellip;", "\u{2026}"),
421            ("&trade;", "\u{2122}"),
422            ("&copy;", "\u{00A9}"),
423            ("&reg;", "\u{00AE}"),
424            ("&deg;", "\u{00B0}"),
425            ("&plusmn;", "\u{00B1}"),
426            ("&times;", "\u{00D7}"),
427            ("&divide;", "\u{00F7}"),
428            ("&euro;", "\u{20AC}"),
429            ("&pound;", "\u{00A3}"),
430            ("&yen;", "\u{00A5}"),
431            ("&cent;", "\u{00A2}"),
432        ];
433
434        for (entity, replacement) in named_entities {
435            result = result.replace(entity, replacement);
436        }
437
438        // Decimal numeric entities (&#123;)
439        if result.contains("&#") {
440            let decimal_re = regex::Regex::new(r"&#(\d+);").unwrap();
441            result = decimal_re
442                .replace_all(&result, |caps: &regex::Captures| {
443                    caps.get(1)
444                        .and_then(|m| m.as_str().parse::<u32>().ok())
445                        .and_then(char::from_u32)
446                        .map(|c| c.to_string())
447                        .unwrap_or_else(|| caps[0].to_string())
448                })
449                .to_string();
450
451            // Hexadecimal numeric entities (&#x1F;)
452            let hex_re = regex::Regex::new(r"(?i)&#x([0-9a-f]+);").unwrap();
453            result = hex_re
454                .replace_all(&result, |caps: &regex::Captures| {
455                    caps.get(1)
456                        .and_then(|m| u32::from_str_radix(m.as_str(), 16).ok())
457                        .and_then(char::from_u32)
458                        .map(|c| c.to_string())
459                        .unwrap_or_else(|| caps[0].to_string())
460                })
461                .to_string();
462        }
463
464        result
465    }
466}
467
#[cfg(test)]
mod tests {
    //! Unit tests for the content processor: the full `process` pipeline plus the
    //! individual cleaning, extraction, normalization, and truncation steps.

    use super::*;

    #[test]
    fn test_basic_processing() {
        let processor = ContentProcessor::with_defaults();
        let html = "<html><body><p>Hello world!</p></body></html>";
        let result = processor.process(html);

        assert_eq!(result.text.trim(), "Hello world!");
        assert_eq!(result.word_count, 2);
        assert!(!result.was_truncated);
    }

    #[test]
    fn test_script_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <html>
            <head><script>alert('evil');</script></head>
            <body>
                <p>Safe content</p>
                <script type="text/javascript">
                    malicious_code();
                </script>
            </body>
            </html>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Safe content"));
        assert!(!result.text.contains("evil"));
        assert!(!result.text.contains("malicious"));
    }

    #[test]
    fn test_style_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <html>
            <head><style>.hidden { display: none; }</style></head>
            <body>
                <p>Visible text</p>
                <style>
                    body { background: red; }
                </style>
            </body>
            </html>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Visible text"));
        assert!(!result.text.contains("display"));
        assert!(!result.text.contains("background"));
    }

    #[test]
    fn test_entity_decoding() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Tom &amp; Jerry &lt;3 &quot;cheese&quot;</p>";
        let result = processor.process(html);

        assert!(result.text.contains("Tom & Jerry"));
        assert!(result.text.contains("<3"));
        assert!(result.text.contains("\"cheese\""));
    }

    #[test]
    fn test_numeric_entity_decoding() {
        let decoded = ContentProcessor::decode_html_entities("&#39;hello&#39; &#x27;world&#x27;");
        assert_eq!(decoded, "'hello' 'world'");
    }

    #[test]
    fn test_whitespace_normalization() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Too    many     spaces</p>";
        let result = processor.process(html);

        // Runs of spaces collapse to exactly one space
        assert_eq!(result.text, "Too many spaces");
    }

    #[test]
    fn test_structure_preservation() {
        let config = ContentProcessorConfig {
            preserve_structure: true,
            ..Default::default()
        };
        let processor = ContentProcessor::new(config);
        let html = "<p>Paragraph 1</p><p>Paragraph 2</p>";
        let result = processor.process(html);

        // Paragraphs are separated by exactly one blank line
        assert_eq!(result.text, "Paragraph 1\n\nParagraph 2");
    }

    #[test]
    fn test_truncation_with_ellipsis() {
        let processor = ContentProcessor::with_max_length(20);
        let html = "<p>This is a very long piece of text that should be truncated.</p>";
        let result = processor.process(html);

        assert!(result.was_truncated);
        assert!(result.text.ends_with("..."));
        assert!(result.text.len() <= 20);
    }

    #[test]
    fn test_truncation_at_word_boundary() {
        let processor = ContentProcessor::with_defaults();
        let text = "Hello world how are you doing today";
        let truncated = processor.truncate_with_ellipsis(text, 15);

        // 12 bytes remain after reserving the ellipsis; the last whitespace inside
        // that window follows "world", so the cut lands on a word boundary.
        assert_eq!(truncated, "Hello world...");
    }

    #[test]
    fn test_no_truncation_for_short_content() {
        let processor = ContentProcessor::with_max_length(1000);
        let html = "<p>Short content</p>";
        let result = processor.process(html);

        assert!(!result.was_truncated);
    }

    #[test]
    fn test_noscript_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <body>
                <noscript>Enable JavaScript!</noscript>
                <p>Content</p>
            </body>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Content"));
        assert!(!result.text.contains("JavaScript"));
    }

    #[test]
    fn test_comment_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <body>
                <!-- This is a comment -->
                <p>Visible</p>
                <!-- Another comment
                     with multiple lines -->
            </body>
        "#;
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("This is a comment"));
        assert!(!cleaned.contains("Another comment"));
        assert!(cleaned.contains("<p>Visible</p>"));
    }

    #[test]
    fn test_inline_event_handler_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"<button onclick="evil()">Click</button>"#;
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("onclick"));
        assert!(!cleaned.contains("evil"));
    }

    #[test]
    fn test_javascript_href_removal() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"<a href="javascript:alert('xss')">Click</a>"#;
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("javascript:"));
    }

    #[test]
    fn test_special_whitespace_normalization() {
        let processor = ContentProcessor::with_defaults();
        let text_with_nbsp = "Hello\u{00A0}world\u{2003}test";
        let normalized = processor.normalize_whitespace(text_with_nbsp);

        // The non-breaking space and em space both become plain spaces
        assert_eq!(normalized, "Hello world test");
    }

    #[test]
    fn test_processed_content_metrics() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>One two three four five</p>";
        let result = processor.process(html);

        assert_eq!(result.word_count, 5);
        assert!(result.char_count > 0);
        let _ = result.processing_time_us;
    }

    #[test]
    fn test_empty_html() {
        let processor = ContentProcessor::with_defaults();
        let html = "<html><body></body></html>";
        let result = processor.process(html);

        assert!(result.text.is_empty() || result.word_count == 0);
    }

    #[test]
    fn test_deeply_nested_content() {
        let processor = ContentProcessor::with_defaults();
        let html = "<div><div><div><span><p>Deep content</p></span></div></div></div>";
        let result = processor.process(html);

        assert!(result.text.contains("Deep content"));
    }

    #[test]
    fn test_mixed_content() {
        let processor = ContentProcessor::with_defaults();
        let html = r#"
            <html>
            <head>
                <title>Test Page</title>
                <script>bad();</script>
                <style>.foo { color: red; }</style>
            </head>
            <body>
                <header><nav>Menu</nav></header>
                <main>
                    <article>
                        <h1>Article Title</h1>
                        <p>First paragraph with <strong>bold</strong> text.</p>
                        <p>Second paragraph with a <a href="http://example.com">link</a>.</p>
                    </article>
                </main>
                <footer>&copy; 2024</footer>
            </body>
            </html>
        "#;
        let result = processor.process(html);

        assert!(result.text.contains("Article Title"));
        assert!(result.text.contains("First paragraph"));
        assert!(result.text.contains("bold"));
        assert!(result.text.contains("link"));
        assert!(!result.text.contains("bad()"));
        assert!(!result.text.contains("color: red"));
    }

    #[test]
    fn test_unicode_content() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Hello \u{1F600} World! Caf\u{00E9}</p>";
        let result = processor.process(html);

        assert!(result.text.contains("\u{1F600}")); // emoji preserved
        assert!(result.text.contains("Caf\u{00E9}")); // accented char preserved
    }

    #[test]
    fn test_custom_remove_tags() {
        let config = ContentProcessorConfig {
            remove_tags: vec!["script".to_string(), "style".to_string(), "nav".to_string()],
            ..Default::default()
        };
        let processor = ContentProcessor::new(config);
        let html = "<nav>Navigation</nav><p>Content</p>";
        let result = processor.process(html);

        assert!(!result.text.contains("Navigation"));
        assert!(result.text.contains("Content"));
    }

    #[test]
    fn test_without_entity_decoding() {
        let config = ContentProcessorConfig {
            decode_entities: false,
            ..Default::default()
        };
        let processor = ContentProcessor::new(config);
        let html = "<p>&amp; &lt; &gt;</p>";
        let result = processor.process(html);

        // The HTML parser resolves character references while parsing, so the text
        // nodes already hold the decoded characters even with `decode_entities` off;
        // the flag only disables the processor's own additional decoding pass.
        assert_eq!(result.text, "& < >");
    }

    #[test]
    fn test_extract_text_directly() {
        let processor = ContentProcessor::with_defaults();
        let html = "<p>Direct <em>extraction</em> test</p>";
        let text = processor.extract_text(html);

        assert!(text.contains("Direct"));
        assert!(text.contains("extraction"));
        assert!(text.contains("test"));
    }

    #[test]
    fn test_remove_scripts_styles_directly() {
        let processor = ContentProcessor::with_defaults();
        let html = "<script>bad();</script><p>Good</p><style>.x{}</style>";
        let cleaned = processor.remove_scripts_styles(html);

        assert!(!cleaned.contains("bad()"));
        assert!(!cleaned.contains(".x{}"));
        assert!(cleaned.contains("<p>Good</p>"));
    }

    #[test]
    fn test_normalize_whitespace_directly() {
        let processor = ContentProcessor::with_defaults();
        let text = "  Multiple   spaces   and\n\n\n\nmany newlines  ";
        let normalized = processor.normalize_whitespace(text);

        // Spaces collapse, four newlines shrink to one paragraph break, ends are trimmed
        assert_eq!(normalized, "Multiple spaces and\n\nmany newlines");
    }
}