// reasonkit_web/extraction/content.rs

//! Main content extraction
//!
//! This module extracts the main content from web pages, converting it
//! to clean text or Markdown format.
5
6use crate::browser::PageHandle;
7use crate::error::{ExtractionError, Result};
8use serde::{Deserialize, Serialize};
9use tracing::{debug, info, instrument};
10
11/// Extracted content from a page
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct ExtractedContent {
14    /// Plain text content
15    pub text: String,
16    /// Content as markdown (if converted)
17    pub markdown: Option<String>,
18    /// HTML of the main content
19    pub html: String,
20    /// Word count
21    pub word_count: usize,
22    /// Character count
23    pub char_count: usize,
24    /// Whether content was extracted from article/main element
25    pub from_main: bool,
26}
27
/// Stateless entry point for content extraction; all functionality lives in
/// its associated functions.
pub struct ContentExtractor;
30
31impl ContentExtractor {
32    /// Extract main content from the page
33    #[instrument(skip(page))]
34    pub async fn extract_main_content(page: &PageHandle) -> Result<ExtractedContent> {
35        info!("Extracting main content");
36
37        // Try to find the main content using various strategies
38        let (html, from_main) = Self::find_main_content(&page.page).await?;
39        let text = Self::html_to_text(&html);
40        let markdown = Self::html_to_markdown(&html);
41
42        let word_count = text.split_whitespace().count();
43        let char_count = text.chars().count();
44
45        debug!(
46            "Extracted {} words, {} chars, from_main={}",
47            word_count, char_count, from_main
48        );
49
50        Ok(ExtractedContent {
51            text,
52            markdown: Some(markdown),
53            html,
54            word_count,
55            char_count,
56            from_main,
57        })
58    }
59
60    /// Extract content from a specific selector
61    #[instrument(skip(page))]
62    pub async fn extract_from_selector(
63        page: &PageHandle,
64        selector: &str,
65    ) -> Result<ExtractedContent> {
66        info!("Extracting from selector: {}", selector);
67
68        let script = format!(
69            r#"
70            (() => {{
71                const el = document.querySelector('{}');
72                if (!el) return null;
73                return {{
74                    html: el.innerHTML,
75                    text: el.innerText
76                }};
77            }})()
78            "#,
79            selector.replace('\'', "\\'")
80        );
81
82        let result: Option<serde_json::Value> = page
83            .page
84            .evaluate(script.as_str())
85            .await
86            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
87            .into_value()
88            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
89
90        let result =
91            result.ok_or_else(|| ExtractionError::ElementNotFound(selector.to_string()))?;
92
93        let html = result["html"].as_str().unwrap_or("").to_string();
94        let text = result["text"].as_str().unwrap_or("").to_string();
95
96        let markdown = Self::html_to_markdown(&html);
97        let word_count = text.split_whitespace().count();
98        let char_count = text.chars().count();
99
100        Ok(ExtractedContent {
101            text,
102            markdown: Some(markdown),
103            html,
104            word_count,
105            char_count,
106            from_main: false,
107        })
108    }
109
110    /// Extract all text from the page body
111    #[instrument(skip(page))]
112    pub async fn extract_all_text(page: &PageHandle) -> Result<String> {
113        let script = r#"
114            document.body.innerText
115        "#;
116
117        let text: String = page
118            .page
119            .evaluate(script)
120            .await
121            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
122            .into_value()
123            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
124
125        Ok(text)
126    }
127
128    /// Find the main content element using various strategies
129    async fn find_main_content(page: &chromiumoxide::Page) -> Result<(String, bool)> {
130        let script = r#"
131            (() => {
132                // Strategy 1: Look for article or main elements
133                const mainSelectors = [
134                    'article',
135                    'main',
136                    '[role="main"]',
137                    '[role="article"]',
138                    '.article',
139                    '.post',
140                    '.content',
141                    '.entry-content',
142                    '.post-content',
143                    '#content',
144                    '#main-content',
145                    '.main-content'
146                ];
147
148                for (const selector of mainSelectors) {
149                    const el = document.querySelector(selector);
150                    if (el && el.innerText.length > 200) {
151                        return { html: el.innerHTML, fromMain: true };
152                    }
153                }
154
155                // Strategy 2: Find the largest text block
156                const textBlocks = [];
157                const walker = document.createTreeWalker(
158                    document.body,
159                    NodeFilter.SHOW_ELEMENT,
160                    {
161                        acceptNode: (node) => {
162                            const tag = node.tagName.toLowerCase();
163                            if (['script', 'style', 'nav', 'header', 'footer', 'aside', 'noscript'].includes(tag)) {
164                                return NodeFilter.FILTER_REJECT;
165                            }
166                            return NodeFilter.FILTER_ACCEPT;
167                        }
168                    }
169                );
170
171                let node;
172                while (node = walker.nextNode()) {
173                    const text = node.innerText || '';
174                    if (text.length > 200) {
175                        textBlocks.push({
176                            el: node,
177                            length: text.length
178                        });
179                    }
180                }
181
182                if (textBlocks.length > 0) {
183                    // Sort by length and get the longest
184                    textBlocks.sort((a, b) => b.length - a.length);
185                    return { html: textBlocks[0].el.innerHTML, fromMain: false };
186                }
187
188                // Fallback: return body
189                return { html: document.body.innerHTML, fromMain: false };
190            })()
191        "#;
192
193        let result: serde_json::Value = page
194            .evaluate(script)
195            .await
196            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
197            .into_value()
198            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
199
200        let html = result["html"].as_str().unwrap_or("").to_string();
201        let from_main = result["fromMain"].as_bool().unwrap_or(false);
202
203        Ok((html, from_main))
204    }
205
206    /// Convert HTML to plain text
207    pub fn html_to_text(html: &str) -> String {
208        // Remove script and style tags
209        let mut text = html.to_string();
210
211        // Remove script tags and content
212        let script_re = regex::Regex::new(r"<script[^>]*>[\s\S]*?</script>").unwrap();
213        text = script_re.replace_all(&text, "").to_string();
214
215        // Remove style tags and content
216        let style_re = regex::Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
217        text = style_re.replace_all(&text, "").to_string();
218
219        // Replace block elements with newlines
220        let block_re = regex::Regex::new(r"</(p|div|br|li|h[1-6])>").unwrap();
221        text = block_re.replace_all(&text, "\n").to_string();
222
223        // Remove all remaining HTML tags
224        let tag_re = regex::Regex::new(r"<[^>]+>").unwrap();
225        text = tag_re.replace_all(&text, "").to_string();
226
227        // Decode common HTML entities
228        text = Self::decode_html_entities(&text);
229
230        // Normalize whitespace
231        let ws_re = regex::Regex::new(r"\s+").unwrap();
232        text = ws_re.replace_all(&text, " ").to_string();
233
234        // Normalize newlines
235        let nl_re = regex::Regex::new(r"\n\s*\n+").unwrap();
236        text = nl_re.replace_all(&text, "\n\n").to_string();
237
238        text.trim().to_string()
239    }
240
241    /// Decode common HTML entities
242    pub fn decode_html_entities(text: &str) -> String {
243        text.replace("&nbsp;", " ")
244            .replace("&lt;", "<")
245            .replace("&gt;", ">")
246            .replace("&amp;", "&")
247            .replace("&quot;", "\"")
248            .replace("&#39;", "'")
249            .replace("&apos;", "'")
250            .replace("&#x27;", "'")
251            .replace("&#x2F;", "/")
252            .replace("&copy;", "(c)")
253            .replace("&reg;", "(R)")
254            .replace("&trade;", "(TM)")
255            .replace("&ndash;", "-")
256            .replace("&mdash;", "--")
257            .replace("&hellip;", "...")
258            .replace("&lsquo;", "'")
259            .replace("&rsquo;", "'")
260            .replace("&ldquo;", "\"")
261            .replace("&rdquo;", "\"")
262    }
263
264    /// Convert HTML to markdown
265    pub fn html_to_markdown(html: &str) -> String {
266        let mut md = html.to_string();
267
268        // Remove script and style
269        let script_re = regex::Regex::new(r"<script[^>]*>[\s\S]*?</script>").unwrap();
270        md = script_re.replace_all(&md, "").to_string();
271        let style_re = regex::Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
272        md = style_re.replace_all(&md, "").to_string();
273
274        // Convert headers
275        for i in (1..=6).rev() {
276            let h_re = regex::Regex::new(&format!(r"<h{}[^>]*>(.*?)</h{}>", i, i)).unwrap();
277            let prefix = "#".repeat(i);
278            md = h_re
279                .replace_all(&md, format!("{} $1\n\n", prefix))
280                .to_string();
281        }
282
283        // Convert paragraphs
284        let p_re = regex::Regex::new(r"<p[^>]*>(.*?)</p>").unwrap();
285        md = p_re.replace_all(&md, "$1\n\n").to_string();
286
287        // Convert line breaks
288        let br_re = regex::Regex::new(r"<br\s*/?>").unwrap();
289        md = br_re.replace_all(&md, "\n").to_string();
290
291        // Convert bold
292        let b_re = regex::Regex::new(r"<(b|strong)[^>]*>(.*?)</(b|strong)>").unwrap();
293        md = b_re.replace_all(&md, "**$2**").to_string();
294
295        // Convert italic
296        let i_re = regex::Regex::new(r"<(i|em)[^>]*>(.*?)</(i|em)>").unwrap();
297        md = i_re.replace_all(&md, "*$2*").to_string();
298
299        // Convert links
300        let a_re = regex::Regex::new(r#"<a[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#).unwrap();
301        md = a_re.replace_all(&md, "[$2]($1)").to_string();
302
303        // Convert code
304        let code_re = regex::Regex::new(r"<code[^>]*>(.*?)</code>").unwrap();
305        md = code_re.replace_all(&md, "`$1`").to_string();
306
307        // Convert pre blocks (use [\s\S]*? to match across newlines)
308        let pre_re = regex::Regex::new(r"<pre[^>]*>([\s\S]*?)</pre>").unwrap();
309        md = pre_re.replace_all(&md, "```\n$1\n```").to_string();
310
311        // Convert lists
312        let li_re = regex::Regex::new(r"<li[^>]*>(.*?)</li>").unwrap();
313        md = li_re.replace_all(&md, "- $1\n").to_string();
314
315        // Remove remaining tags
316        let tag_re = regex::Regex::new(r"<[^>]+>").unwrap();
317        md = tag_re.replace_all(&md, "").to_string();
318
319        // Decode HTML entities
320        md = Self::decode_html_entities(&md);
321
322        // Clean up whitespace
323        let ws_re = regex::Regex::new(r"\n{3,}").unwrap();
324        md = ws_re.replace_all(&md, "\n\n").to_string();
325
326        md.trim().to_string()
327    }
328
329    /// Normalize whitespace in text
330    pub fn normalize_whitespace(text: &str) -> String {
331        let ws_re = regex::Regex::new(r"\s+").unwrap();
332        ws_re.replace_all(text.trim(), " ").to_string()
333    }
334
335    /// Truncate text to a maximum length, adding ellipsis if truncated
336    pub fn truncate(text: &str, max_len: usize) -> String {
337        if text.len() <= max_len {
338            text.to_string()
339        } else if max_len <= 3 {
340            text.chars().take(max_len).collect()
341        } else {
342            let truncated: String = text.chars().take(max_len - 3).collect();
343            format!("{}...", truncated)
344        }
345    }
346}
347
/// Unit tests for the pure (browser-independent) helpers of
/// `ContentExtractor` and for the `ExtractedContent` data type.
#[cfg(test)]
mod tests {
    use super::*;

    // ========================================================================
    // HTML to Text Conversion Tests
    // ========================================================================

    #[test]
    fn test_html_to_text() {
        let html = "<p>Hello <b>world</b>!</p><p>Second paragraph.</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
        assert!(!text.contains("<"));
    }

    #[test]
    fn test_html_to_text_removes_scripts() {
        let html = "<p>Content</p><script>evil();</script><p>More</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("evil"));
        assert!(text.contains("Content"));
        assert!(text.contains("More"));
    }

    #[test]
    fn test_html_to_text_removes_styles() {
        let html = "<p>Content</p><style>.hidden { display: none; }</style><p>More</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("hidden"));
        assert!(!text.contains("display"));
        assert!(text.contains("Content"));
        assert!(text.contains("More"));
    }

    #[test]
    fn test_html_to_text_multiline_script() {
        let html = r#"
            <p>Before</p>
            <script type="text/javascript">
                function evil() {
                    console.log("bad");
                }
                evil();
            </script>
            <p>After</p>
        "#;
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("evil"));
        assert!(!text.contains("console"));
        assert!(text.contains("Before"));
        assert!(text.contains("After"));
    }

    #[test]
    fn test_html_to_text_preserves_newlines_for_blocks() {
        let html = "<p>Para 1</p><p>Para 2</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Para 1"));
        assert!(text.contains("Para 2"));
        // The paragraphs must be separated, not glued together.
        assert!(!text.contains("Para 1Para 2"));
    }

    #[test]
    fn test_html_to_text_strips_all_tags() {
        let html = "<div class=\"container\"><span id=\"test\">Hello</span></div>";
        let text = ContentExtractor::html_to_text(html);
        assert_eq!(text, "Hello");
        assert!(!text.contains("<"));
        assert!(!text.contains(">"));
        assert!(!text.contains("class"));
    }

    // ========================================================================
    // HTML Entity Decoding Tests
    // ========================================================================

    #[test]
    fn test_html_entity_decode_basic() {
        assert_eq!(
            ContentExtractor::decode_html_entities("&lt;div&gt;"),
            "<div>"
        );
        assert_eq!(ContentExtractor::decode_html_entities("&amp;"), "&");
        assert_eq!(ContentExtractor::decode_html_entities("&quot;"), "\"");
    }

    #[test]
    fn test_html_entity_decode_quotes() {
        assert_eq!(ContentExtractor::decode_html_entities("&#39;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&apos;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&#x27;"), "'");
    }

    #[test]
    fn test_html_entity_decode_typography() {
        assert_eq!(ContentExtractor::decode_html_entities("&ndash;"), "-");
        assert_eq!(ContentExtractor::decode_html_entities("&mdash;"), "--");
        assert_eq!(ContentExtractor::decode_html_entities("&hellip;"), "...");
        assert_eq!(ContentExtractor::decode_html_entities("&lsquo;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&rsquo;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&ldquo;"), "\"");
        assert_eq!(ContentExtractor::decode_html_entities("&rdquo;"), "\"");
    }

    #[test]
    fn test_html_entity_decode_symbols() {
        assert_eq!(ContentExtractor::decode_html_entities("&copy;"), "(c)");
        assert_eq!(ContentExtractor::decode_html_entities("&reg;"), "(R)");
        assert_eq!(ContentExtractor::decode_html_entities("&trade;"), "(TM)");
    }

    #[test]
    fn test_html_entity_decode_nbsp() {
        assert_eq!(
            ContentExtractor::decode_html_entities("Hello&nbsp;World"),
            "Hello World"
        );
    }

    #[test]
    fn test_html_entity_decode_mixed() {
        let input = "Copyright &copy; 2024 &mdash; All rights reserved &amp; more";
        let output = ContentExtractor::decode_html_entities(input);
        assert_eq!(output, "Copyright (c) 2024 -- All rights reserved & more");
    }

    // ========================================================================
    // Script Removal Tests
    // ========================================================================

    #[test]
    fn test_script_removal_inline() {
        let html = "<script>alert('xss')</script><p>Safe</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("alert"));
        assert!(!text.contains("xss"));
        assert!(text.contains("Safe"));
    }

    #[test]
    fn test_script_removal_with_attributes() {
        let html = "<script type=\"text/javascript\" src=\"bad.js\">code()</script><p>Safe</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("code"));
        assert!(!text.contains("javascript"));
        assert!(text.contains("Safe"));
    }

    #[test]
    fn test_script_removal_multiple() {
        let html = "<script>one()</script><p>Middle</p><script>two()</script>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("one"));
        assert!(!text.contains("two"));
        assert!(text.contains("Middle"));
    }

    // ========================================================================
    // Whitespace Normalization Tests
    // ========================================================================

    #[test]
    fn test_whitespace_normalization_spaces() {
        let text = "Hello    world";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_tabs() {
        let text = "Hello\t\tworld";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_newlines() {
        let text = "Hello\n\n\nworld";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_mixed() {
        let text = "  Hello   \t\n  world  ";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_empty() {
        let text = "   ";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "");
    }

    #[test]
    fn test_whitespace_normalization_single_word() {
        let text = "  Hello  ";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello");
    }

    // ========================================================================
    // Truncation Tests
    // ========================================================================

    #[test]
    fn test_truncation_short_text() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 10);
        assert_eq!(truncated, "Hello");
    }

    #[test]
    fn test_truncation_exact_length() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 5);
        assert_eq!(truncated, "Hello");
    }

    #[test]
    fn test_truncation_adds_ellipsis() {
        let text = "Hello World";
        let truncated = ContentExtractor::truncate(text, 8);
        assert_eq!(truncated, "Hello...");
        assert_eq!(truncated.len(), 8);
    }

    #[test]
    fn test_truncation_very_short_limit() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 3);
        assert_eq!(truncated, "Hel");
    }

    #[test]
    fn test_truncation_zero_limit() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 0);
        assert_eq!(truncated, "");
    }

    #[test]
    fn test_truncation_empty_text() {
        let text = "";
        let truncated = ContentExtractor::truncate(text, 10);
        assert_eq!(truncated, "");
    }

    #[test]
    fn test_truncation_unicode() {
        // Multi-byte characters must be cut on character boundaries and the
        // limit must be counted in characters (the ellipsis counts as 3).
        let text = "h\u{e9}llo w\u{f6}rld";
        let truncated = ContentExtractor::truncate(text, 8);
        assert_eq!(truncated, "h\u{e9}llo...");
        assert_eq!(truncated.chars().count(), 8);
    }

    // ========================================================================
    // HTML to Markdown Conversion Tests
    // ========================================================================

    #[test]
    fn test_html_to_markdown() {
        let html = "<h1>Title</h1><p>Para with <b>bold</b> and <a href=\"http://example.com\">link</a>.</p>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("# Title"));
        assert!(md.contains("**bold**"));
        assert!(md.contains("[link](http://example.com)"));
    }

    #[test]
    fn test_html_to_markdown_headers() {
        let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("# H1"));
        assert!(md.contains("## H2"));
        assert!(md.contains("### H3"));
        assert!(md.contains("#### H4"));
        assert!(md.contains("##### H5"));
        assert!(md.contains("###### H6"));
    }

    #[test]
    fn test_html_to_markdown_emphasis() {
        let html = "<p><b>bold</b> and <strong>strong</strong> and <i>italic</i> and <em>emphasis</em></p>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("**bold**"));
        assert!(md.contains("**strong**"));
        assert!(md.contains("*italic*"));
        assert!(md.contains("*emphasis*"));
    }

    #[test]
    fn test_html_to_markdown_code() {
        let html = "<p>Use <code>println!</code> for output.</p>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("`println!`"));
    }

    #[test]
    fn test_html_to_markdown_pre() {
        let html = "<pre>fn main() {\n    println!(\"Hello\");\n}</pre>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("```"));
        assert!(md.contains("fn main()"));
    }

    #[test]
    fn test_html_to_markdown_list() {
        let html = "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("- Item 1"));
        assert!(md.contains("- Item 2"));
        assert!(md.contains("- Item 3"));
    }

    #[test]
    fn test_html_to_markdown_removes_scripts() {
        let html = "<p>Safe</p><script>evil()</script>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(!md.contains("evil"));
        assert!(md.contains("Safe"));
    }

    #[test]
    fn test_html_to_markdown_line_breaks() {
        let html = "Line 1<br>Line 2<br/>Line 3";
        let md = ContentExtractor::html_to_markdown(html);
        // Both `<br>` spellings become exactly one newline.
        assert_eq!(md, "Line 1\nLine 2\nLine 3");
    }

    // ========================================================================
    // ExtractedContent Structure Tests
    // ========================================================================

    #[test]
    fn test_extracted_content_structure() {
        let content = ExtractedContent {
            text: "Hello world".to_string(),
            markdown: Some("Hello world".to_string()),
            html: "<p>Hello world</p>".to_string(),
            word_count: 2,
            char_count: 11,
            from_main: true,
        };
        assert_eq!(content.word_count, 2);
        assert!(content.from_main);
    }

    #[test]
    fn test_extracted_content_serialization() {
        let content = ExtractedContent {
            text: "Hello".to_string(),
            markdown: Some("Hello".to_string()),
            html: "<p>Hello</p>".to_string(),
            word_count: 1,
            char_count: 5,
            from_main: false,
        };

        let json = serde_json::to_string(&content).unwrap();
        assert!(json.contains("\"text\":\"Hello\""));
        assert!(json.contains("\"word_count\":1"));
        assert!(json.contains("\"from_main\":false"));

        let deserialized: ExtractedContent = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.text, "Hello");
        assert_eq!(deserialized.word_count, 1);
    }

    #[test]
    fn test_extracted_content_empty() {
        let content = ExtractedContent {
            text: String::new(),
            markdown: None,
            html: String::new(),
            word_count: 0,
            char_count: 0,
            from_main: false,
        };
        assert_eq!(content.word_count, 0);
        assert_eq!(content.char_count, 0);
        assert!(content.markdown.is_none());
    }

    // ========================================================================
    // Edge Cases Tests
    // ========================================================================

    #[test]
    fn test_html_to_text_nested_tags() {
        let html = "<div><p><span><b>Nested</b> content</span></p></div>";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Nested"));
        assert!(text.contains("content"));
        assert!(!text.contains("<"));
    }

    #[test]
    fn test_html_to_text_malformed_html() {
        let html = "<p>Unclosed paragraph <b>bold";
        let text = ContentExtractor::html_to_text(html);
        // Should still extract text even with malformed HTML
        assert!(text.contains("Unclosed"));
        assert!(text.contains("bold"));
    }

    #[test]
    fn test_html_to_text_self_closing_tags() {
        let html = "Hello<br/>World<hr/>Done";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(text.contains("Done"));
    }

    #[test]
    fn test_html_to_text_comments() {
        let html = "<p>Before</p><!-- This is a comment --><p>After</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("comment"));
        assert!(text.contains("Before"));
        assert!(text.contains("After"));
    }

    #[test]
    fn test_html_to_text_empty() {
        let html = "";
        let text = ContentExtractor::html_to_text(html);
        assert_eq!(text, "");
    }

    #[test]
    fn test_html_to_text_only_whitespace() {
        let html = "   \n\t   ";
        let text = ContentExtractor::html_to_text(html);
        assert_eq!(text, "");
    }
}
// End of reasonkit_web/extraction/content.rs