// markdown_harvest/content_processor.rs

1use crate::patterns::{
2    additional_cleanup, content_selectors, media_elements, text_selectors, unwanted_elements,
3    unwanted_text_patterns,
4};
5
6use regex::Regex;
7use scraper::{Html, Selector};
8
9/// Component responsible for HTML cleaning and Markdown conversion.
10///
11/// `ContentProcessor` handles all aspects of content processing including HTML parsing,
12/// content extraction, cleaning unwanted elements, and converting to Markdown format.
13/// This component reuses the original functions from MarkdownHarvester to maintain
14/// compatibility and behavior.
15#[derive(Default, Clone)]
16pub struct ContentProcessor {}
17
18impl ContentProcessor {
19    /// Creates a new ContentProcessor instance.
20    pub fn new() -> Self {
21        Self {}
22    }
23
24    /// Converts HTML content to clean Markdown format.
25    pub fn html_to_markdown(&self, html: &str) -> String {
26        extract_and_clean_content(html)
27    }
28}
29
30/// Extracts the main content from HTML using a priority-based strategy.
31///
32/// Priority order:
33/// 1. Semantic HTML5 tags (article, main, [role='main'])
34/// 2. Content-specific class selectors (.content, .article, .post, .entry)
35/// 3. Fallback to body tag
36///
37/// # Arguments
38/// * `document` - Parsed HTML document
39///
40/// # Returns
41/// * String containing the extracted HTML content
42fn extract_main_content(document: &Html) -> String {
43    // Priority 1: Semantic HTML5 tags
44    if let Some(content) = try_semantic_tags(document) {
45        return content;
46    }
47
48    // Priority 2: Content selectors
49    if let Some(content) = try_content_selectors_direct(document) {
50        return content;
51    }
52
53    // Priority 3: Fallback to body
54    fallback_to_body_tag(document)
55}
56
57/// Attempts to extract content from semantic HTML5 tags.
58///
59/// Tries in order: <article>, <main>, [role='main']
60/// Returns the first match found.
61///
62/// # Arguments
63/// * `document` - Parsed HTML document
64///
65/// # Returns
66/// * Some(String) if a semantic tag is found, None otherwise
67fn try_semantic_tags(document: &Html) -> Option<String> {
68    // Define semantic selectors in priority order
69    let semantic_selectors = ["article", "main", "[role='main']"];
70
71    for selector_str in semantic_selectors.iter() {
72        if let Ok(selector) = Selector::parse(selector_str) {
73            if let Some(element) = document.select(&selector).next() {
74                // Found semantic tag, return its HTML
75                return Some(element.html());
76            }
77        }
78    }
79
80    None // No semantic tags found
81}
82
83/// Attempts to extract content using content-specific class selectors.
84///
85/// Tries: .content, .article, .post, .entry
86/// Returns the first match found.
87///
88/// # Arguments
89/// * `document` - Parsed HTML document
90///
91/// # Returns
92/// * Some(String) if a content selector matches, None otherwise
93fn try_content_selectors_direct(document: &Html) -> Option<String> {
94    let class_selectors = [".content", ".article", ".post", ".entry"];
95
96    for selector_str in class_selectors.iter() {
97        if let Ok(selector) = Selector::parse(selector_str) {
98            if let Some(element) = document.select(&selector).next() {
99                return Some(element.html());
100            }
101        }
102    }
103
104    None
105}
106
107/// Fallback extraction using the body tag.
108///
109/// Returns the entire body content as last resort.
110///
111/// # Arguments
112/// * `document` - Parsed HTML document
113///
114/// # Returns
115/// * String containing body HTML, or empty string if no body tag found
116fn fallback_to_body_tag(document: &Html) -> String {
117    let body_selector = Selector::parse("body").unwrap();
118
119    match document.select(&body_selector).next() {
120        Some(body_element) => body_element.html(),
121        None => String::new(), // No body tag found
122    }
123}
124
125fn extract_and_clean_content(html: &str) -> String {
126    // Step 1: Parse document
127    let document = Html::parse_document(html);
128
129    // Step 2: Smart content extraction (NEW!)
130    let extracted_html = extract_main_content(&document);
131
132    // Check if extraction was successful
133    if extracted_html.is_empty() {
134        return String::new();
135    }
136
137    // Step 3: Clean the extracted content
138    let relevant_html = clear_content(extracted_html);
139
140    // Step 4: Convert to Markdown
141    let markdown_content = html2md::parse_html(&relevant_html);
142
143    // Step 5: Final cleanup
144    final_clean_from_markdown(markdown_content)
145}
146
147// DEPRECATED: Kept for backwards compatibility during transition
148// Will be removed in future version
149#[allow(dead_code)]
150fn extract_and_clean_body(html: &str) -> String {
151    // Step 1: Extract only the body content from the HTML
152    let document = Html::parse_document(html);
153    let body_selector = Selector::parse("body").unwrap();
154
155    let body_html = match document.select(&body_selector).next() {
156        Some(body_element) => body_element.html(),
157        None => return String::new(), // Return empty if no body found
158    };
159
160    // Step 2: Clean the body content by removing unwanted elements
161    let relevant_html = clear_content(body_html);
162
163    // Step 3: Convert the cleaned HTML to Markdown
164    let markdown_content = html2md::parse_html(&relevant_html);
165
166    // Step 4: Final cleanup
167    // Remove unwanted elements while preserving Markdown structure
168    final_clean_from_markdown(markdown_content)
169}
170
171fn clear_content(content_html: String) -> String {
172    let mut cleaned_body = content_html;
173
174    // Remove script blocks
175    let script_regex = Regex::new(r"(?i)<script[^>]*>[\s\S]*?</script>").unwrap();
176    cleaned_body = script_regex.replace_all(&cleaned_body, "").to_string();
177
178    // Remove style blocks
179    let style_regex = Regex::new(r"(?i)<style[^>]*>[\s\S]*?</style>").unwrap();
180    cleaned_body = style_regex.replace_all(&cleaned_body, "").to_string();
181
182    // Remove images, iframes, and other non-textual elements
183    for pattern in media_elements().iter() {
184        let regex = Regex::new(pattern).unwrap();
185        cleaned_body = regex.replace_all(&cleaned_body, "").to_string();
186    }
187
188    // Remove navigation, header, footer, sidebar and advertising elements
189    for pattern in unwanted_elements().iter() {
190        let regex = Regex::new(pattern).unwrap();
191        cleaned_body = regex.replace_all(&cleaned_body, "").to_string();
192    }
193
194    // Parse the cleaned body HTML and use scraper to extract only text content elements
195    let cleaned_document =
196        Html::parse_document(&format!("<html><body>{}</body></html>", cleaned_body));
197
198    // Select only content-relevant elements and extract their inner HTML
199
200    let mut relevant_html = String::new();
201    let mut found_main_content = false;
202
203    // First try to find main content containers
204    for selector_str in content_selectors().iter() {
205        if let Ok(selector) = Selector::parse(selector_str) {
206            for element in cleaned_document.select(&selector) {
207                relevant_html.push_str(&element.html());
208                relevant_html.push('\n');
209                found_main_content = true;
210            }
211        }
212    }
213
214    // If no main content containers found, extract individual text elements
215    if !found_main_content {
216        for selector_str in text_selectors().iter() {
217            if let Ok(selector) = Selector::parse(selector_str) {
218                for element in cleaned_document.select(&selector) {
219                    relevant_html.push_str(&element.html());
220                    relevant_html.push('\n');
221                }
222            }
223        }
224    }
225
226    // If still no content found, fallback to the entire cleaned body
227    if relevant_html.trim().is_empty() {
228        relevant_html = cleaned_body;
229    }
230
231    // Additional cleanup before markdown conversion - remove remaining unwanted elements
232    for pattern in additional_cleanup().iter() {
233        let regex = Regex::new(pattern).unwrap();
234        relevant_html = regex.replace_all(&relevant_html, "").to_string();
235    }
236
237    return relevant_html;
238}
239
240fn final_clean_from_markdown(markdown_content: String) -> String {
241    let mut result = markdown_content;
242
243    // Remove any remaining HTML tags that might have been missed
244    let html_tag_regex = Regex::new(r"<[^>]+>").unwrap();
245    result = html_tag_regex.replace_all(&result, "").to_string();
246
247    // Remove Markdown links [text](url) and keep only the text part
248    let link_regex = Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap();
249    result = link_regex.replace_all(&result, "$1").to_string();
250
251    // Remove standalone URLs that might remain
252    let url_regex = Regex::new(r"https?://[^\s]+").unwrap();
253    result = url_regex.replace_all(&result, "").to_string();
254
255    // Keep Markdown formatting but clean up problematic patterns
256    // Remove code blocks (usually not relevant content)
257    let code_block_regex = Regex::new(r"```[\s\S]*?```").unwrap();
258    result = code_block_regex.replace_all(&result, "").to_string();
259
260    // Remove excessive whitespace and normalize line breaks
261    let space_regex = Regex::new(r"[ \t]+").unwrap();
262    result = space_regex.replace_all(&result, " ").to_string();
263
264    let newline_regex = Regex::new(r"\n{3,}").unwrap();
265    result = newline_regex.replace_all(&result, "\n\n").to_string();
266
267    // Remove common advertising/navigation text patterns but preserve line structure
268    for pattern in unwanted_text_patterns().iter() {
269        let regex = Regex::new(pattern).unwrap();
270        result = regex.replace_all(&result, "").to_string();
271    }
272
273    // Clean up empty lines and extra spacing
274    let cleanup_regex = Regex::new(r"\n\s*\n\s*\n").unwrap();
275    result = cleanup_regex.replace_all(&result, "\n\n").to_string();
276
277    // Remove lines that are likely metadata or navigation while preserving markdown structure
278    result = remove_lines_metadata_or_navigation(result.lines().collect()).join("\n");
279
280    // Clean up excessive empty lines but preserve paragraph structure
281    let excessive_newlines_regex = Regex::new(r"\n{4,}").unwrap();
282    result = excessive_newlines_regex
283        .replace_all(&result, "\n\n\n")
284        .to_string();
285
286    result.trim().to_string()
287}
288
/// Filters out lines that look like metadata or navigation while preserving
/// Markdown structure.
///
/// Rules, in order:
/// 1. Markdown header lines (`#`…) are always kept.
/// 2. Empty lines are kept for spacing; other lines shorter than 5
///    characters are dropped as not meaningful.
/// 3. Single words matching common navigation terms are dropped.
/// 4. Lines starting with "http", containing '@', or equal to common
///    structural names (footer, header, sidebar, …) are dropped.
/// 5. Everything else is kept.
///
/// # Arguments
/// * `lines` - Lines of the Markdown document
///
/// # Returns
/// * The lines considered meaningful content, in their original order
fn remove_lines_metadata_or_navigation(lines: Vec<&str>) -> Vec<&str> {
    // Single words that, alone on a line, are almost certainly navigation or
    // metadata. Entries shorter than 5 characters are also caught by the
    // length rule below; they are kept here for completeness.
    const NAVIGATION_TERMS: [&str; 20] = [
        "home", "about", "contact", "menu", "search", "login", "register",
        "subscribe", "share", "follow", "back", "next", "prev", "more",
        "advertisement", "ads", "sponsored", "cookie", "privacy", "terms",
    ];

    lines
        .into_iter()
        .filter(|line| {
            let trimmed = line.trim();

            // Always keep markdown headers. A '#' prefix already covers "##",
            // so the previously duplicated check was removed.
            if trimmed.starts_with('#') {
                return true;
            }

            // Keep empty lines for spacing; drop other very short lines.
            // Because every line shorter than 5 characters returns here, the
            // old trailing `len < 2` check was unreachable and was removed.
            if trimmed.len() < 5 {
                return trimmed.is_empty();
            }

            let lower = trimmed.to_lowercase();

            // Drop single words that are likely navigation/metadata terms.
            if !trimmed.contains(' ') && NAVIGATION_TERMS.contains(&lower.as_str()) {
                return false;
            }

            // Drop obvious metadata/navigation patterns.
            if lower.starts_with("http")
                || lower.contains('@')
                || matches!(
                    lower.as_str(),
                    "menu" | "navigation" | "nav" | "footer" | "header" | "sidebar"
                )
            {
                return false;
            }

            // Keep everything else, including single words that could be content.
            true
        })
        .collect()
}
360
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new() {
        let processor = ContentProcessor::new();
        assert_eq!(std::mem::size_of_val(&processor), 0);
    }

    #[test]
    fn test_extract_and_clean_body_with_empty_html() {
        let empty_html = "";
        let result = extract_and_clean_body(empty_html);
        assert_eq!(result, "");
    }

    #[test]
    fn test_extract_and_clean_body_with_no_body() {
        let html_without_body = "<html><head><title>Test</title></head></html>";
        let result = extract_and_clean_body(html_without_body);
        assert_eq!(result, "");
    }

    #[test]
    fn test_extract_and_clean_body_with_simple_content() {
        let simple_html =
            "<html><body><h1>Test Title</h1><p>Test paragraph content.</p></body></html>";
        let result = extract_and_clean_body(simple_html);

        // Should contain the content without HTML tags
        assert!(result.contains("Test Title"));
        assert!(result.contains("Test paragraph content"));
        assert!(!result.contains("<h1>"));
        assert!(!result.contains("<p>"));
    }

    #[test]
    fn test_html_to_markdown() {
        let processor = ContentProcessor::new();
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let result = processor.html_to_markdown(html);

        assert!(result.contains("Title"));
        assert!(result.contains("Content"));
        assert!(!result.contains("<html>"));
        assert!(!result.contains("<body>"));
    }

    // ============================================================================
    // TESTS FOR SEMANTIC HTML5 TAG EXTRACTION (Issue #40)
    // ============================================================================
    // These tests validate the smart article extraction algorithm that prioritizes
    // semantic HTML5 tags (article, main) over body tag extraction. They exercise
    // `extract_and_clean_content`, which implements the priority-based strategy;
    // the deprecated `extract_and_clean_body` only ever extracts the body tag and
    // therefore cannot demonstrate selection priority.

    #[test]
    fn test_extract_article_tag_priority() {
        let html = r#"
            <html>
            <body>
                <nav>Navigation menu</nav>
                <article>
                    <h1>Article Title</h1>
                    <p>Article main content here.</p>
                </article>
                <footer>Footer content</footer>
            </body>
            </html>
        "#;

        let result = extract_and_clean_content(html);

        // Should extract article content
        assert!(
            result.contains("Article Title"),
            "Expected to find 'Article Title' in extracted content"
        );
        assert!(
            result.contains("Article main content"),
            "Expected to find 'Article main content' in extracted content"
        );

        // Should NOT contain navigation or footer
        assert!(
            !result.contains("Navigation menu"),
            "Should not contain navigation content"
        );
        assert!(
            !result.contains("Footer content"),
            "Should not contain footer content"
        );
    }

    #[test]
    fn test_extract_main_tag() {
        let html = r#"
            <html>
            <body>
                <header>Site header</header>
                <main>
                    <h1>Main Content Title</h1>
                    <p>This is the main content area.</p>
                </main>
                <aside>Sidebar content</aside>
            </body>
            </html>
        "#;

        let result = extract_and_clean_content(html);

        // Should extract main content
        assert!(
            result.contains("Main Content Title"),
            "Expected to find 'Main Content Title' in extracted content"
        );
        assert!(
            result.contains("main content area"),
            "Expected to find main content text"
        );

        // Should NOT contain header or sidebar
        assert!(
            !result.contains("Site header"),
            "Should not contain header content"
        );
        assert!(
            !result.contains("Sidebar content"),
            "Should not contain sidebar content"
        );
    }

    #[test]
    fn test_fallback_to_body_when_no_semantic_tags() {
        let html = r#"
            <html>
            <body>
                <div class="wrapper">
                    <h1>Legacy Page Title</h1>
                    <p>Content without semantic tags.</p>
                </div>
            </body>
            </html>
        "#;

        let result = extract_and_clean_content(html);

        // Should fallback to body extraction and still work
        assert!(
            result.contains("Legacy Page Title"),
            "Expected to find title in extracted content"
        );
        assert!(
            result.contains("Content without semantic tags"),
            "Expected to find content text"
        );
    }

    #[test]
    fn test_article_takes_priority_over_body_clutter() {
        let html = r#"
            <html>
            <body>
                <header>
                    <nav>
                        <a href="/">Home</a>
                        <a href="/about">About</a>
                    </nav>
                </header>
                <div class="sidebar">
                    <h3>Related Links</h3>
                    <ul>
                        <li><a href="/link1">Link 1</a></li>
                        <li><a href="/link2">Link 2</a></li>
                    </ul>
                </div>
                <article>
                    <h1>Patterns for Defensive Programming in Rust</h1>
                    <p>This article explains defensive programming techniques.</p>
                    <h2>Introduction</h2>
                    <p>Defensive programming is essential for building robust systems.</p>
                </article>
                <footer>
                    <p>Copyright 2024</p>
                </footer>
            </body>
            </html>
        "#;

        let result = extract_and_clean_content(html);

        // Should extract article content
        assert!(
            result.contains("Patterns for Defensive Programming"),
            "Expected to find article title"
        );
        assert!(
            result.contains("defensive programming techniques"),
            "Expected to find article content"
        );
        assert!(
            result.contains("Introduction"),
            "Expected to find article section heading"
        );

        // Should NOT contain navigation, sidebar, or footer
        assert!(
            !result.contains("Home") || !result.contains("About"),
            "Should not contain navigation links"
        );
        assert!(
            !result.contains("Related Links"),
            "Should not contain sidebar content"
        );
        assert!(
            !result.contains("Copyright"),
            "Should not contain footer content"
        );
    }

    #[test]
    fn test_multiple_articles_extracts_first() {
        let html = r#"
            <html>
            <body>
                <article>
                    <h1>First Article</h1>
                    <p>First article content.</p>
                </article>
                <article>
                    <h1>Second Article</h1>
                    <p>Second article content.</p>
                </article>
            </body>
            </html>
        "#;

        let result = extract_and_clean_content(html);

        // Should extract only the first article (or both, depending on implementation)
        assert!(
            result.contains("First Article"),
            "Expected to find first article"
        );
        // Note: Implementation may choose to extract all articles or just the first one
        // This test validates that at least the first article is extracted
    }

    #[test]
    fn test_role_main_attribute() {
        let html = r#"
            <html>
            <body>
                <nav>Navigation</nav>
                <div role="main">
                    <h1>Main Content via Role</h1>
                    <p>Content identified by role attribute.</p>
                </div>
                <aside>Sidebar</aside>
            </body>
            </html>
        "#;

        let result = extract_and_clean_content(html);

        // Should extract content with role="main"
        assert!(
            result.contains("Main Content via Role"),
            "Expected to find content with role='main'"
        );
        assert!(
            result.contains("role attribute"),
            "Expected to find main content text"
        );

        // Should NOT contain navigation or sidebar
        assert!(
            !result.contains("Navigation"),
            "Should not contain navigation"
        );
        assert!(
            !result.contains("Sidebar"),
            "Should not contain sidebar"
        );
    }

    #[test]
    #[ignore] // This test requires real HTTP fetch, run manually with: cargo test -- --ignored
    fn test_corrode_dev_article_extraction() {
        // This test validates the specific URL from Issue #40
        // URL: https://corrode.dev/blog/defensive-programming/
        //
        // This test is ignored by default because it requires:
        // 1. Network access to fetch the URL
        // 2. The website to be available
        // 3. The website structure to remain consistent
        //
        // To run this test manually:
        // cargo test test_corrode_dev_article_extraction -- --ignored
        //
        // Expected behavior after implementation:
        // - Should extract the main article content
        // - Should contain the article title "Patterns for Defensive Programming in Rust"
        // - Should have substantial content (> 1000 characters)
        // - Should NOT be empty

        use crate::http_client::HttpClient;
        use crate::http_config::HttpConfig;

        let text_with_url = "Check this article: https://corrode.dev/blog/defensive-programming/";

        // Fetch HTML using the existing API
        let http_config = HttpConfig::default();
        let http_client = HttpClient::new();

        let results = http_client.fetch_content_from_text(text_with_url, http_config);

        if results.is_empty() {
            eprintln!("Failed to fetch URL - network issue or URL unavailable");
            eprintln!("Skipping test");
            return;
        }

        // Get the HTML content
        let (_url, html) = &results[0];

        // Process content
        let processor = ContentProcessor::new();
        let result = processor.html_to_markdown(html);

        // Validate extraction
        assert!(
            !result.is_empty(),
            "Extracted content should not be empty"
        );

        assert!(
            result.len() > 1000,
            "Article should have substantial content (got {} characters)",
            result.len()
        );

        assert!(
            result.contains("Defensive Programming") || result.contains("defensive programming"),
            "Should contain article title or main topic"
        );

        // Print result for manual inspection
        println!("\n=== Extracted Content (first 500 chars) ===");
        println!("{}", &result.chars().take(500).collect::<String>());
        println!("\n=== Total length: {} characters ===", result.len());
    }
}
713}