1use crate::patterns::{
2 additional_cleanup, content_selectors, media_elements, text_selectors, unwanted_elements,
3 unwanted_text_patterns,
4};
5
6use regex::Regex;
7use scraper::{Html, Selector};
8
/// Stateless converter that turns raw HTML documents into cleaned
/// Markdown suitable for downstream text processing.
#[derive(Default, Clone)]
pub struct ContentProcessor {}
17
18impl ContentProcessor {
19 pub fn new() -> Self {
21 Self {}
22 }
23
24 pub fn html_to_markdown(&self, html: &str) -> String {
26 extract_and_clean_content(html)
27 }
28}
29
30fn extract_main_content(document: &Html) -> String {
43 if let Some(content) = try_semantic_tags(document) {
45 return content;
46 }
47
48 if let Some(content) = try_content_selectors_direct(document) {
50 return content;
51 }
52
53 fallback_to_body_tag(document)
55}
56
57fn try_semantic_tags(document: &Html) -> Option<String> {
68 let semantic_selectors = ["article", "main", "[role='main']"];
70
71 for selector_str in semantic_selectors.iter() {
72 if let Ok(selector) = Selector::parse(selector_str) {
73 if let Some(element) = document.select(&selector).next() {
74 return Some(element.html());
76 }
77 }
78 }
79
80 None }
82
83fn try_content_selectors_direct(document: &Html) -> Option<String> {
94 let class_selectors = [".content", ".article", ".post", ".entry"];
95
96 for selector_str in class_selectors.iter() {
97 if let Ok(selector) = Selector::parse(selector_str) {
98 if let Some(element) = document.select(&selector).next() {
99 return Some(element.html());
100 }
101 }
102 }
103
104 None
105}
106
107fn fallback_to_body_tag(document: &Html) -> String {
117 let body_selector = Selector::parse("body").unwrap();
118
119 match document.select(&body_selector).next() {
120 Some(body_element) => body_element.html(),
121 None => String::new(), }
123}
124
125fn extract_and_clean_content(html: &str) -> String {
126 let document = Html::parse_document(html);
128
129 let extracted_html = extract_main_content(&document);
131
132 if extracted_html.is_empty() {
134 return String::new();
135 }
136
137 let relevant_html = clear_content(extracted_html);
139
140 let markdown_content = html2md::parse_html(&relevant_html);
142
143 final_clean_from_markdown(markdown_content)
145}
146
147#[allow(dead_code)]
150fn extract_and_clean_body(html: &str) -> String {
151 let document = Html::parse_document(html);
153 let body_selector = Selector::parse("body").unwrap();
154
155 let body_html = match document.select(&body_selector).next() {
156 Some(body_element) => body_element.html(),
157 None => return String::new(), };
159
160 let relevant_html = clear_content(body_html);
162
163 let markdown_content = html2md::parse_html(&relevant_html);
165
166 final_clean_from_markdown(markdown_content)
169}
170
171fn clear_content(content_html: String) -> String {
172 let mut cleaned_body = content_html;
173
174 let script_regex = Regex::new(r"(?i)<script[^>]*>[\s\S]*?</script>").unwrap();
176 cleaned_body = script_regex.replace_all(&cleaned_body, "").to_string();
177
178 let style_regex = Regex::new(r"(?i)<style[^>]*>[\s\S]*?</style>").unwrap();
180 cleaned_body = style_regex.replace_all(&cleaned_body, "").to_string();
181
182 for pattern in media_elements().iter() {
184 let regex = Regex::new(pattern).unwrap();
185 cleaned_body = regex.replace_all(&cleaned_body, "").to_string();
186 }
187
188 for pattern in unwanted_elements().iter() {
190 let regex = Regex::new(pattern).unwrap();
191 cleaned_body = regex.replace_all(&cleaned_body, "").to_string();
192 }
193
194 let cleaned_document =
196 Html::parse_document(&format!("<html><body>{}</body></html>", cleaned_body));
197
198 let mut relevant_html = String::new();
201 let mut found_main_content = false;
202
203 for selector_str in content_selectors().iter() {
205 if let Ok(selector) = Selector::parse(selector_str) {
206 for element in cleaned_document.select(&selector) {
207 relevant_html.push_str(&element.html());
208 relevant_html.push('\n');
209 found_main_content = true;
210 }
211 }
212 }
213
214 if !found_main_content {
216 for selector_str in text_selectors().iter() {
217 if let Ok(selector) = Selector::parse(selector_str) {
218 for element in cleaned_document.select(&selector) {
219 relevant_html.push_str(&element.html());
220 relevant_html.push('\n');
221 }
222 }
223 }
224 }
225
226 if relevant_html.trim().is_empty() {
228 relevant_html = cleaned_body;
229 }
230
231 for pattern in additional_cleanup().iter() {
233 let regex = Regex::new(pattern).unwrap();
234 relevant_html = regex.replace_all(&relevant_html, "").to_string();
235 }
236
237 return relevant_html;
238}
239
240fn final_clean_from_markdown(markdown_content: String) -> String {
241 let mut result = markdown_content;
242
243 let html_tag_regex = Regex::new(r"<[^>]+>").unwrap();
245 result = html_tag_regex.replace_all(&result, "").to_string();
246
247 let link_regex = Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap();
249 result = link_regex.replace_all(&result, "$1").to_string();
250
251 let url_regex = Regex::new(r"https?://[^\s]+").unwrap();
253 result = url_regex.replace_all(&result, "").to_string();
254
255 let code_block_regex = Regex::new(r"```[\s\S]*?```").unwrap();
258 result = code_block_regex.replace_all(&result, "").to_string();
259
260 let space_regex = Regex::new(r"[ \t]+").unwrap();
262 result = space_regex.replace_all(&result, " ").to_string();
263
264 let newline_regex = Regex::new(r"\n{3,}").unwrap();
265 result = newline_regex.replace_all(&result, "\n\n").to_string();
266
267 for pattern in unwanted_text_patterns().iter() {
269 let regex = Regex::new(pattern).unwrap();
270 result = regex.replace_all(&result, "").to_string();
271 }
272
273 let cleanup_regex = Regex::new(r"\n\s*\n\s*\n").unwrap();
275 result = cleanup_regex.replace_all(&result, "\n\n").to_string();
276
277 result = remove_lines_metadata_or_navigation(result.lines().collect()).join("\n");
279
280 let excessive_newlines_regex = Regex::new(r"\n{4,}").unwrap();
282 result = excessive_newlines_regex
283 .replace_all(&result, "\n\n\n")
284 .to_string();
285
286 result.trim().to_string()
287}
288
/// Filters out lines that look like navigation, metadata, or site
/// boilerplate rather than article text.
///
/// Kept: Markdown headings (`#`...), empty lines (paragraph breaks),
/// and ordinary prose. Dropped: non-empty fragments shorter than 5
/// characters, single-word navigation labels, URL lines, lines
/// containing `@`, and layout-section names.
///
/// Cleanups vs. the previous version (all behavior-preserving):
/// - `starts_with("##")` was redundant after `starts_with('#')`;
/// - the trailing `len() < 2` check was unreachable (every non-empty
///   line shorter than 5 chars is already dropped by the length gate);
/// - `lower == "menu"` / `lower == "nav"` were dead for the same reason.
fn remove_lines_metadata_or_navigation(lines: Vec<&str>) -> Vec<&str> {
    lines
        .into_iter()
        .filter(|line| {
            let trimmed = line.trim();

            // Markdown headings are always content.
            if trimmed.starts_with('#') {
                return true;
            }

            // Keep blank lines as paragraph separators; drop other
            // fragments that are too short to be prose.
            if trimmed.len() < 5 {
                return trimmed.is_empty();
            }

            let lower = trimmed.to_lowercase();

            // Single words matching common navigation labels are page
            // chrome, not content. (Terms shorter than 5 chars can never
            // reach this point — the length gate above removed them — but
            // they are kept in the list to document intent.)
            if !trimmed.contains(' ') {
                let navigation_terms = [
                    "home",
                    "about",
                    "contact",
                    "menu",
                    "search",
                    "login",
                    "register",
                    "subscribe",
                    "share",
                    "follow",
                    "back",
                    "next",
                    "prev",
                    "more",
                    "advertisement",
                    "ads",
                    "sponsored",
                    "cookie",
                    "privacy",
                    "terms",
                ];
                if navigation_terms.iter().any(|&term| lower == term) {
                    return false;
                }
            }

            // URLs, email-like lines, and layout-section labels.
            if lower.starts_with("http")
                || lower.contains('@')
                || lower == "navigation"
                || lower == "footer"
                || lower == "header"
                || lower == "sidebar"
            {
                return false;
            }

            true
        })
        .collect()
}
360
// Unit and integration tests for the content-extraction pipeline.
// Most tests go through `extract_and_clean_body` and assert only on the
// presence/absence of text, since the exact Markdown layout produced by
// html2md is not part of the contract.
#[cfg(test)]
mod tests {
    use super::*;

    // ContentProcessor carries no state, so it should be zero-sized.
    #[test]
    fn test_new() {
        let processor = ContentProcessor::new();
        assert_eq!(std::mem::size_of_val(&processor), 0);
    }

    #[test]
    fn test_extract_and_clean_body_with_empty_html() {
        let empty_html = "";
        let result = extract_and_clean_body(empty_html);
        assert_eq!(result, "");
    }

    // A document without <body> yields an empty string rather than panicking.
    #[test]
    fn test_extract_and_clean_body_with_no_body() {
        let html_without_body = "<html><head><title>Test</title></head></html>";
        let result = extract_and_clean_body(html_without_body);
        assert_eq!(result, "");
    }

    // Text content survives the conversion; HTML tags do not.
    #[test]
    fn test_extract_and_clean_body_with_simple_content() {
        let simple_html =
            "<html><body><h1>Test Title</h1><p>Test paragraph content.</p></body></html>";
        let result = extract_and_clean_body(simple_html);

        assert!(result.contains("Test Title"));
        assert!(result.contains("Test paragraph content"));
        assert!(!result.contains("<h1>"));
        assert!(!result.contains("<p>"));
    }

    #[test]
    fn test_html_to_markdown() {
        let processor = ContentProcessor::new();
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let result = processor.html_to_markdown(html);

        assert!(result.contains("Title"));
        assert!(result.contains("Content"));
        assert!(!result.contains("<html>"));
        assert!(!result.contains("<body>"));
    }

    // <article> content is kept while surrounding chrome is dropped.
    #[test]
    fn test_extract_article_tag_priority() {
        let html = r#"
            <html>
            <body>
                <nav>Navigation menu</nav>
                <article>
                    <h1>Article Title</h1>
                    <p>Article main content here.</p>
                </article>
                <footer>Footer content</footer>
            </body>
            </html>
        "#;

        let result = extract_and_clean_body(html);

        assert!(
            result.contains("Article Title"),
            "Expected to find 'Article Title' in extracted content"
        );
        assert!(
            result.contains("Article main content"),
            "Expected to find 'Article main content' in extracted content"
        );

        assert!(
            !result.contains("Navigation menu"),
            "Should not contain navigation content"
        );
        assert!(
            !result.contains("Footer content"),
            "Should not contain footer content"
        );
    }

    // <main> is treated the same way as <article>.
    #[test]
    fn test_extract_main_tag() {
        let html = r#"
            <html>
            <body>
                <header>Site header</header>
                <main>
                    <h1>Main Content Title</h1>
                    <p>This is the main content area.</p>
                </main>
                <aside>Sidebar content</aside>
            </body>
            </html>
        "#;

        let result = extract_and_clean_body(html);

        assert!(
            result.contains("Main Content Title"),
            "Expected to find 'Main Content Title' in extracted content"
        );
        assert!(
            result.contains("main content area"),
            "Expected to find main content text"
        );

        assert!(
            !result.contains("Site header"),
            "Should not contain header content"
        );
        assert!(
            !result.contains("Sidebar content"),
            "Should not contain sidebar content"
        );
    }

    // Pages without semantic tags still produce their body text.
    #[test]
    fn test_fallback_to_body_when_no_semantic_tags() {
        let html = r#"
            <html>
            <body>
                <div class="wrapper">
                    <h1>Legacy Page Title</h1>
                    <p>Content without semantic tags.</p>
                </div>
            </body>
            </html>
        "#;

        let result = extract_and_clean_body(html);

        assert!(
            result.contains("Legacy Page Title"),
            "Expected to find title in extracted content"
        );
        assert!(
            result.contains("Content without semantic tags"),
            "Expected to find content text"
        );
    }

    // A realistic page: nav, sidebar and footer clutter must not leak
    // into the extracted article.
    #[test]
    fn test_article_takes_priority_over_body_clutter() {
        let html = r#"
            <html>
            <body>
                <header>
                    <nav>
                        <a href="/">Home</a>
                        <a href="/about">About</a>
                    </nav>
                </header>
                <div class="sidebar">
                    <h3>Related Links</h3>
                    <ul>
                        <li><a href="/link1">Link 1</a></li>
                        <li><a href="/link2">Link 2</a></li>
                    </ul>
                </div>
                <article>
                    <h1>Patterns for Defensive Programming in Rust</h1>
                    <p>This article explains defensive programming techniques.</p>
                    <h2>Introduction</h2>
                    <p>Defensive programming is essential for building robust systems.</p>
                </article>
                <footer>
                    <p>Copyright 2024</p>
                </footer>
            </body>
            </html>
        "#;

        let result = extract_and_clean_body(html);

        assert!(
            result.contains("Patterns for Defensive Programming"),
            "Expected to find article title"
        );
        assert!(
            result.contains("defensive programming techniques"),
            "Expected to find article content"
        );
        assert!(
            result.contains("Introduction"),
            "Expected to find article section heading"
        );

        assert!(
            !result.contains("Home") || !result.contains("About"),
            "Should not contain navigation links"
        );
        assert!(
            !result.contains("Related Links"),
            "Should not contain sidebar content"
        );
        assert!(
            !result.contains("Copyright"),
            "Should not contain footer content"
        );
    }

    // Only asserts the first article is present; whether later articles
    // are included as well is not pinned down here.
    #[test]
    fn test_multiple_articles_extracts_first() {
        let html = r#"
            <html>
            <body>
                <article>
                    <h1>First Article</h1>
                    <p>First article content.</p>
                </article>
                <article>
                    <h1>Second Article</h1>
                    <p>Second article content.</p>
                </article>
            </body>
            </html>
        "#;

        let result = extract_and_clean_body(html);

        assert!(
            result.contains("First Article"),
            "Expected to find first article"
        );
    }

    // ARIA role="main" is honored like a semantic tag.
    #[test]
    fn test_role_main_attribute() {
        let html = r#"
            <html>
            <body>
                <nav>Navigation</nav>
                <div role="main">
                    <h1>Main Content via Role</h1>
                    <p>Content identified by role attribute.</p>
                </div>
                <aside>Sidebar</aside>
            </body>
            </html>
        "#;

        let result = extract_and_clean_body(html);

        assert!(
            result.contains("Main Content via Role"),
            "Expected to find content with role='main'"
        );
        assert!(
            result.contains("role attribute"),
            "Expected to find main content text"
        );

        assert!(
            !result.contains("Navigation"),
            "Should not contain navigation"
        );
        assert!(
            !result.contains("Sidebar"),
            "Should not contain sidebar"
        );
    }

    // Live-network smoke test; #[ignore]d by default because it performs
    // a real HTTP fetch and silently returns if the fetch fails.
    #[test]
    #[ignore]
    fn test_corrode_dev_article_extraction() {
        use crate::http_client::HttpClient;
        use crate::http_config::HttpConfig;

        let text_with_url = "Check this article: https://corrode.dev/blog/defensive-programming/";

        let http_config = HttpConfig::default();
        let http_client = HttpClient::new();

        let results = http_client.fetch_content_from_text(text_with_url, http_config);

        if results.is_empty() {
            eprintln!("Failed to fetch URL - network issue or URL unavailable");
            eprintln!("Skipping test");
            return;
        }

        let (_url, html) = &results[0];

        let processor = ContentProcessor::new();
        let result = processor.html_to_markdown(html);

        assert!(
            !result.is_empty(),
            "Extracted content should not be empty"
        );

        assert!(
            result.len() > 1000,
            "Article should have substantial content (got {} characters)",
            result.len()
        );

        assert!(
            result.contains("Defensive Programming") || result.contains("defensive programming"),
            "Should contain article title or main topic"
        );

        println!("\n=== Extracted Content (first 500 chars) ===");
        println!("{}", &result.chars().take(500).collect::<String>());
        println!("\n=== Total length: {} characters ===", result.len());
    }
}