1use crate::browser::PageHandle;
7use crate::error::{ExtractionError, Result};
8use serde::{Deserialize, Serialize};
9use tracing::{debug, info, instrument};
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct ExtractedContent {
14 pub text: String,
16 pub markdown: Option<String>,
18 pub html: String,
20 pub word_count: usize,
22 pub char_count: usize,
24 pub from_main: bool,
26}
27
/// Stateless namespace for the content-extraction routines; all functionality
/// is exposed through associated functions.
pub struct ContentExtractor;
30
31impl ContentExtractor {
32 #[instrument(skip(page))]
34 pub async fn extract_main_content(page: &PageHandle) -> Result<ExtractedContent> {
35 info!("Extracting main content");
36
37 let (html, from_main) = Self::find_main_content(&page.page).await?;
39 let text = Self::html_to_text(&html);
40 let markdown = Self::html_to_markdown(&html);
41
42 let word_count = text.split_whitespace().count();
43 let char_count = text.chars().count();
44
45 debug!(
46 "Extracted {} words, {} chars, from_main={}",
47 word_count, char_count, from_main
48 );
49
50 Ok(ExtractedContent {
51 text,
52 markdown: Some(markdown),
53 html,
54 word_count,
55 char_count,
56 from_main,
57 })
58 }
59
60 #[instrument(skip(page))]
62 pub async fn extract_from_selector(
63 page: &PageHandle,
64 selector: &str,
65 ) -> Result<ExtractedContent> {
66 info!("Extracting from selector: {}", selector);
67
68 let script = format!(
69 r#"
70 (() => {{
71 const el = document.querySelector('{}');
72 if (!el) return null;
73 return {{
74 html: el.innerHTML,
75 text: el.innerText
76 }};
77 }})()
78 "#,
79 selector.replace('\'', "\\'")
80 );
81
82 let result: Option<serde_json::Value> = page
83 .page
84 .evaluate(script.as_str())
85 .await
86 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
87 .into_value()
88 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
89
90 let result =
91 result.ok_or_else(|| ExtractionError::ElementNotFound(selector.to_string()))?;
92
93 let html = result["html"].as_str().unwrap_or("").to_string();
94 let text = result["text"].as_str().unwrap_or("").to_string();
95
96 let markdown = Self::html_to_markdown(&html);
97 let word_count = text.split_whitespace().count();
98 let char_count = text.chars().count();
99
100 Ok(ExtractedContent {
101 text,
102 markdown: Some(markdown),
103 html,
104 word_count,
105 char_count,
106 from_main: false,
107 })
108 }
109
110 #[instrument(skip(page))]
112 pub async fn extract_all_text(page: &PageHandle) -> Result<String> {
113 let script = r#"
114 document.body.innerText
115 "#;
116
117 let text: String = page
118 .page
119 .evaluate(script)
120 .await
121 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
122 .into_value()
123 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
124
125 Ok(text)
126 }
127
128 async fn find_main_content(page: &chromiumoxide::Page) -> Result<(String, bool)> {
130 let script = r#"
131 (() => {
132 // Strategy 1: Look for article or main elements
133 const mainSelectors = [
134 'article',
135 'main',
136 '[role="main"]',
137 '[role="article"]',
138 '.article',
139 '.post',
140 '.content',
141 '.entry-content',
142 '.post-content',
143 '#content',
144 '#main-content',
145 '.main-content'
146 ];
147
148 for (const selector of mainSelectors) {
149 const el = document.querySelector(selector);
150 if (el && el.innerText.length > 200) {
151 return { html: el.innerHTML, fromMain: true };
152 }
153 }
154
155 // Strategy 2: Find the largest text block
156 const textBlocks = [];
157 const walker = document.createTreeWalker(
158 document.body,
159 NodeFilter.SHOW_ELEMENT,
160 {
161 acceptNode: (node) => {
162 const tag = node.tagName.toLowerCase();
163 if (['script', 'style', 'nav', 'header', 'footer', 'aside', 'noscript'].includes(tag)) {
164 return NodeFilter.FILTER_REJECT;
165 }
166 return NodeFilter.FILTER_ACCEPT;
167 }
168 }
169 );
170
171 let node;
172 while (node = walker.nextNode()) {
173 const text = node.innerText || '';
174 if (text.length > 200) {
175 textBlocks.push({
176 el: node,
177 length: text.length
178 });
179 }
180 }
181
182 if (textBlocks.length > 0) {
183 // Sort by length and get the longest
184 textBlocks.sort((a, b) => b.length - a.length);
185 return { html: textBlocks[0].el.innerHTML, fromMain: false };
186 }
187
188 // Fallback: return body
189 return { html: document.body.innerHTML, fromMain: false };
190 })()
191 "#;
192
193 let result: serde_json::Value = page
194 .evaluate(script)
195 .await
196 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
197 .into_value()
198 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
199
200 let html = result["html"].as_str().unwrap_or("").to_string();
201 let from_main = result["fromMain"].as_bool().unwrap_or(false);
202
203 Ok((html, from_main))
204 }
205
206 pub fn html_to_text(html: &str) -> String {
208 let mut text = html.to_string();
210
211 let script_re = regex::Regex::new(r"<script[^>]*>[\s\S]*?</script>").unwrap();
213 text = script_re.replace_all(&text, "").to_string();
214
215 let style_re = regex::Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
217 text = style_re.replace_all(&text, "").to_string();
218
219 let block_re = regex::Regex::new(r"</(p|div|br|li|h[1-6])>").unwrap();
221 text = block_re.replace_all(&text, "\n").to_string();
222
223 let tag_re = regex::Regex::new(r"<[^>]+>").unwrap();
225 text = tag_re.replace_all(&text, "").to_string();
226
227 text = Self::decode_html_entities(&text);
229
230 let ws_re = regex::Regex::new(r"\s+").unwrap();
232 text = ws_re.replace_all(&text, " ").to_string();
233
234 let nl_re = regex::Regex::new(r"\n\s*\n+").unwrap();
236 text = nl_re.replace_all(&text, "\n\n").to_string();
237
238 text.trim().to_string()
239 }
240
/// Decodes a fixed set of common HTML entities into ASCII-friendly text.
///
/// Supported entities: `&nbsp;`, `&lt;`, `&gt;`, `&amp;`, `&quot;`, `&#39;`,
/// `&#x27;`, `&apos;`, `&#x2F;`, `&copy;`, `&reg;`, `&trade;`, `&ndash;`,
/// `&mdash;`, `&hellip;`, `&lsquo;`, `&rsquo;`, `&ldquo;` and `&rdquo;`.
/// Typographic symbols are mapped to ASCII approximations (for example
/// `&mdash;` becomes `--` and `&copy;` becomes `(c)`).
///
/// The input is scanned once from left to right, so the output of one
/// replacement is never decoded again: `&amp;lt;` yields the literal text
/// `&lt;`, not `<`. Unknown entities and bare `&` characters are kept as-is.
pub fn decode_html_entities(text: &str) -> String {
    const ENTITIES: &[(&str, &str)] = &[
        ("&nbsp;", " "),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&amp;", "&"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&#x2F;", "/"),
        ("&copy;", "(c)"),
        ("&reg;", "(R)"),
        ("&trade;", "(TM)"),
        ("&ndash;", "-"),
        ("&mdash;", "--"),
        ("&hellip;", "..."),
        ("&lsquo;", "'"),
        ("&rsquo;", "'"),
        ("&ldquo;", "\""),
        ("&rdquo;", "\""),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        // Copy everything before the ampersand verbatim.
        decoded.push_str(&rest[..amp]);
        rest = &rest[amp..];

        match ENTITIES.iter().find(|entry| rest.starts_with(entry.0)) {
            Some(&(entity, replacement)) => {
                decoded.push_str(replacement);
                rest = &rest[entity.len()..];
            }
            None => {
                // Not a known entity: keep the `&` (one byte) and move on.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
263
264 pub fn html_to_markdown(html: &str) -> String {
266 let mut md = html.to_string();
267
268 let script_re = regex::Regex::new(r"<script[^>]*>[\s\S]*?</script>").unwrap();
270 md = script_re.replace_all(&md, "").to_string();
271 let style_re = regex::Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
272 md = style_re.replace_all(&md, "").to_string();
273
274 for i in (1..=6).rev() {
276 let h_re = regex::Regex::new(&format!(r"<h{}[^>]*>(.*?)</h{}>", i, i)).unwrap();
277 let prefix = "#".repeat(i);
278 md = h_re
279 .replace_all(&md, format!("{} $1\n\n", prefix))
280 .to_string();
281 }
282
283 let p_re = regex::Regex::new(r"<p[^>]*>(.*?)</p>").unwrap();
285 md = p_re.replace_all(&md, "$1\n\n").to_string();
286
287 let br_re = regex::Regex::new(r"<br\s*/?>").unwrap();
289 md = br_re.replace_all(&md, "\n").to_string();
290
291 let b_re = regex::Regex::new(r"<(b|strong)[^>]*>(.*?)</(b|strong)>").unwrap();
293 md = b_re.replace_all(&md, "**$2**").to_string();
294
295 let i_re = regex::Regex::new(r"<(i|em)[^>]*>(.*?)</(i|em)>").unwrap();
297 md = i_re.replace_all(&md, "*$2*").to_string();
298
299 let a_re = regex::Regex::new(r#"<a[^>]*href=["']([^"']+)["'][^>]*>(.*?)</a>"#).unwrap();
301 md = a_re.replace_all(&md, "[$2]($1)").to_string();
302
303 let code_re = regex::Regex::new(r"<code[^>]*>(.*?)</code>").unwrap();
305 md = code_re.replace_all(&md, "`$1`").to_string();
306
307 let pre_re = regex::Regex::new(r"<pre[^>]*>([\s\S]*?)</pre>").unwrap();
309 md = pre_re.replace_all(&md, "```\n$1\n```").to_string();
310
311 let li_re = regex::Regex::new(r"<li[^>]*>(.*?)</li>").unwrap();
313 md = li_re.replace_all(&md, "- $1\n").to_string();
314
315 let tag_re = regex::Regex::new(r"<[^>]+>").unwrap();
317 md = tag_re.replace_all(&md, "").to_string();
318
319 md = Self::decode_html_entities(&md);
321
322 let ws_re = regex::Regex::new(r"\n{3,}").unwrap();
324 md = ws_re.replace_all(&md, "\n\n").to_string();
325
326 md.trim().to_string()
327 }
328
/// Trims `text` and collapses every run of whitespace (spaces, tabs, line
/// breaks and other Unicode whitespace) into a single ASCII space.
///
/// Returns an empty string when `text` is empty or whitespace-only.
pub fn normalize_whitespace(text: &str) -> String {
    // `split_whitespace` skips leading/trailing whitespace and splits on
    // Unicode `White_Space` runs, so no regex needs compiling per call.
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
334
/// Shortens `text` to at most `max_len` characters (`char`s).
///
/// * Text that already fits is returned unchanged.
/// * When `max_len > 3`, the result is the first `max_len - 3` characters
///   followed by `"..."`.
/// * When `max_len <= 3` there is no room for an ellipsis and the first
///   `max_len` characters are returned.
///
/// Lengths are measured in characters throughout; comparing the byte length
/// against `max_len` would truncate multi-byte text that actually fits.
pub fn truncate(text: &str, max_len: usize) -> String {
    // No character at index `max_len` means the text has at most `max_len`
    // characters; this stops scanning as soon as the limit is exceeded.
    if text.chars().nth(max_len).is_none() {
        return text.to_string();
    }

    let (keep, suffix) = if max_len > 3 {
        (max_len - 3, "...")
    } else {
        (max_len, "")
    };

    // Byte offset of the first character to drop; always a char boundary.
    let cut = text
        .char_indices()
        .nth(keep)
        .map_or(text.len(), |(offset, _)| offset);

    format!("{}{}", &text[..cut], suffix)
}
346}
347
#[cfg(test)]
mod tests {
    //! Unit tests for the pure (browser-independent) helpers of
    //! `ContentExtractor` and for `ExtractedContent` (de)serialization.

    use super::*;

    // ---------------------------------------------------------------------
    // html_to_text
    // ---------------------------------------------------------------------

    #[test]
    fn test_html_to_text() {
        let html = "<p>Hello <b>world</b>!</p><p>Second paragraph.</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
        assert!(!text.contains("<"));
    }

    #[test]
    fn test_html_to_text_removes_scripts() {
        let html = "<p>Content</p><script>evil();</script><p>More</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("evil"));
        assert!(text.contains("Content"));
        assert!(text.contains("More"));
    }

    #[test]
    fn test_html_to_text_removes_styles() {
        let html = "<p>Content</p><style>.hidden { display: none; }</style><p>More</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("hidden"));
        assert!(!text.contains("display"));
        assert!(text.contains("Content"));
        assert!(text.contains("More"));
    }

    #[test]
    fn test_html_to_text_multiline_script() {
        let html = r#"
            <p>Before</p>
            <script type="text/javascript">
                function evil() {
                    console.log("bad");
                }
                evil();
            </script>
            <p>After</p>
        "#;
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("evil"));
        assert!(!text.contains("console"));
        assert!(text.contains("Before"));
        assert!(text.contains("After"));
    }

    #[test]
    fn test_html_to_text_preserves_newlines_for_blocks() {
        let html = "<p>Para 1</p><p>Para 2</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Para 1"));
        assert!(text.contains("Para 2"));
    }

    #[test]
    fn test_html_to_text_strips_all_tags() {
        let html = "<div class=\"container\"><span id=\"test\">Hello</span></div>";
        let text = ContentExtractor::html_to_text(html);
        assert_eq!(text, "Hello");
        assert!(!text.contains("<"));
        assert!(!text.contains(">"));
        assert!(!text.contains("class"));
    }

    // ---------------------------------------------------------------------
    // decode_html_entities
    // ---------------------------------------------------------------------

    #[test]
    fn test_html_entity_decode_basic() {
        assert_eq!(
            ContentExtractor::decode_html_entities("&lt;div&gt;"),
            "<div>"
        );
        assert_eq!(ContentExtractor::decode_html_entities("&amp;"), "&");
        assert_eq!(ContentExtractor::decode_html_entities("&quot;"), "\"");
    }

    #[test]
    fn test_html_entity_decode_quotes() {
        assert_eq!(ContentExtractor::decode_html_entities("&#39;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&#x27;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&apos;"), "'");
    }

    #[test]
    fn test_html_entity_decode_typography() {
        assert_eq!(ContentExtractor::decode_html_entities("&ndash;"), "-");
        assert_eq!(ContentExtractor::decode_html_entities("&mdash;"), "--");
        assert_eq!(ContentExtractor::decode_html_entities("&hellip;"), "...");
        assert_eq!(ContentExtractor::decode_html_entities("&lsquo;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&rsquo;"), "'");
        assert_eq!(ContentExtractor::decode_html_entities("&ldquo;"), "\"");
        assert_eq!(ContentExtractor::decode_html_entities("&rdquo;"), "\"");
    }

    #[test]
    fn test_html_entity_decode_symbols() {
        assert_eq!(ContentExtractor::decode_html_entities("&copy;"), "(c)");
        assert_eq!(ContentExtractor::decode_html_entities("&reg;"), "(R)");
        assert_eq!(ContentExtractor::decode_html_entities("&trade;"), "(TM)");
    }

    #[test]
    fn test_html_entity_decode_nbsp() {
        assert_eq!(
            ContentExtractor::decode_html_entities("Hello&nbsp;World"),
            "Hello World"
        );
    }

    #[test]
    fn test_html_entity_decode_mixed() {
        let input = "Copyright &copy; 2024 &mdash; All rights reserved &amp; more";
        let output = ContentExtractor::decode_html_entities(input);
        assert_eq!(output, "Copyright (c) 2024 -- All rights reserved & more");
    }

    // ---------------------------------------------------------------------
    // Script removal
    // ---------------------------------------------------------------------

    #[test]
    fn test_script_removal_inline() {
        let html = "<script>alert('xss')</script><p>Safe</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("alert"));
        assert!(!text.contains("xss"));
        assert!(text.contains("Safe"));
    }

    #[test]
    fn test_script_removal_with_attributes() {
        let html = "<script type=\"text/javascript\" src=\"bad.js\">code()</script><p>Safe</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("code"));
        assert!(!text.contains("javascript"));
        assert!(text.contains("Safe"));
    }

    #[test]
    fn test_script_removal_multiple() {
        let html = "<script>one()</script><p>Middle</p><script>two()</script>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("one"));
        assert!(!text.contains("two"));
        assert!(text.contains("Middle"));
    }

    // ---------------------------------------------------------------------
    // normalize_whitespace
    // ---------------------------------------------------------------------

    #[test]
    fn test_whitespace_normalization_spaces() {
        // Several consecutive spaces must collapse into one.
        let text = "Hello    world";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_tabs() {
        let text = "Hello\t\tworld";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_newlines() {
        let text = "Hello\n\n\nworld";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_mixed() {
        let text = "  Hello  \t\n  world  ";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello world");
    }

    #[test]
    fn test_whitespace_normalization_empty() {
        let text = "   ";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "");
    }

    #[test]
    fn test_whitespace_normalization_single_word() {
        let text = "  Hello  ";
        let normalized = ContentExtractor::normalize_whitespace(text);
        assert_eq!(normalized, "Hello");
    }

    // ---------------------------------------------------------------------
    // truncate
    // ---------------------------------------------------------------------

    #[test]
    fn test_truncation_short_text() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 10);
        assert_eq!(truncated, "Hello");
    }

    #[test]
    fn test_truncation_exact_length() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 5);
        assert_eq!(truncated, "Hello");
    }

    #[test]
    fn test_truncation_adds_ellipsis() {
        let text = "Hello World";
        let truncated = ContentExtractor::truncate(text, 8);
        assert_eq!(truncated, "Hello...");
        assert_eq!(truncated.len(), 8);
    }

    #[test]
    fn test_truncation_very_short_limit() {
        // No room for an ellipsis: plain prefix.
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 3);
        assert_eq!(truncated, "Hel");
    }

    #[test]
    fn test_truncation_zero_limit() {
        let text = "Hello";
        let truncated = ContentExtractor::truncate(text, 0);
        assert_eq!(truncated, "");
    }

    #[test]
    fn test_truncation_empty_text() {
        let text = "";
        let truncated = ContentExtractor::truncate(text, 10);
        assert_eq!(truncated, "");
    }

    #[test]
    fn test_truncation_unicode() {
        // 19 characters, more than 19 bytes: truncation must cut on a
        // character boundary and never panic.
        let text = "H\u{e9}llo W\u{f6}rld \u{fc}n\u{ef}c\u{f6}d\u{e9}";
        let truncated = ContentExtractor::truncate(text, 10);
        assert!(truncated.len() <= 10 || truncated.ends_with("..."));
        assert_eq!(truncated.chars().count(), 10);
    }

    // ---------------------------------------------------------------------
    // html_to_markdown
    // ---------------------------------------------------------------------

    #[test]
    fn test_html_to_markdown() {
        let html = "<h1>Title</h1><p>Para with <b>bold</b> and <a href=\"http://example.com\">link</a>.</p>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("# Title"));
        assert!(md.contains("**bold**"));
        assert!(md.contains("[link](http://example.com)"));
    }

    #[test]
    fn test_html_to_markdown_headers() {
        let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("# H1"));
        assert!(md.contains("## H2"));
        assert!(md.contains("### H3"));
        assert!(md.contains("#### H4"));
        assert!(md.contains("##### H5"));
        assert!(md.contains("###### H6"));
    }

    #[test]
    fn test_html_to_markdown_emphasis() {
        let html = "<p><b>bold</b> and <strong>strong</strong> and <i>italic</i> and <em>emphasis</em></p>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("**bold**"));
        assert!(md.contains("**strong**"));
        assert!(md.contains("*italic*"));
        assert!(md.contains("*emphasis*"));
    }

    #[test]
    fn test_html_to_markdown_code() {
        let html = "<p>Use <code>println!</code> for output.</p>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("`println!`"));
    }

    #[test]
    fn test_html_to_markdown_pre() {
        let html = "<pre>fn main() {\n    println!(\"Hello\");\n}</pre>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("```"));
        assert!(md.contains("fn main()"));
    }

    #[test]
    fn test_html_to_markdown_list() {
        let html = "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("- Item 1"));
        assert!(md.contains("- Item 2"));
        assert!(md.contains("- Item 3"));
    }

    #[test]
    fn test_html_to_markdown_removes_scripts() {
        let html = "<p>Safe</p><script>evil()</script>";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(!md.contains("evil"));
        assert!(md.contains("Safe"));
    }

    #[test]
    fn test_html_to_markdown_line_breaks() {
        let html = "Line 1<br>Line 2<br/>Line 3";
        let md = ContentExtractor::html_to_markdown(html);
        assert!(md.contains("Line 1"));
        assert!(md.contains("Line 2"));
        assert!(md.contains("Line 3"));
    }

    // ---------------------------------------------------------------------
    // ExtractedContent
    // ---------------------------------------------------------------------

    #[test]
    fn test_extracted_content_structure() {
        let content = ExtractedContent {
            text: "Hello world".to_string(),
            markdown: Some("Hello world".to_string()),
            html: "<p>Hello world</p>".to_string(),
            word_count: 2,
            char_count: 11,
            from_main: true,
        };
        assert_eq!(content.word_count, 2);
        assert!(content.from_main);
    }

    #[test]
    fn test_extracted_content_serialization() {
        let content = ExtractedContent {
            text: "Hello".to_string(),
            markdown: Some("Hello".to_string()),
            html: "<p>Hello</p>".to_string(),
            word_count: 1,
            char_count: 5,
            from_main: false,
        };

        let json = serde_json::to_string(&content).unwrap();
        assert!(json.contains("\"text\":\"Hello\""));
        assert!(json.contains("\"word_count\":1"));
        assert!(json.contains("\"from_main\":false"));

        // Round trip back into the struct.
        let deserialized: ExtractedContent = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.text, "Hello");
        assert_eq!(deserialized.word_count, 1);
    }

    #[test]
    fn test_extracted_content_empty() {
        let content = ExtractedContent {
            text: String::new(),
            markdown: None,
            html: String::new(),
            word_count: 0,
            char_count: 0,
            from_main: false,
        };
        assert_eq!(content.word_count, 0);
        assert_eq!(content.char_count, 0);
        assert!(content.markdown.is_none());
    }

    // ---------------------------------------------------------------------
    // html_to_text edge cases
    // ---------------------------------------------------------------------

    #[test]
    fn test_html_to_text_nested_tags() {
        let html = "<div><p><span><b>Nested</b> content</span></p></div>";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Nested"));
        assert!(text.contains("content"));
        assert!(!text.contains("<"));
    }

    #[test]
    fn test_html_to_text_malformed_html() {
        // Unclosed tags must not prevent the text from being extracted.
        let html = "<p>Unclosed paragraph <b>bold";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Unclosed"));
        assert!(text.contains("bold"));
    }

    #[test]
    fn test_html_to_text_self_closing_tags() {
        let html = "Hello<br/>World<hr/>Done";
        let text = ContentExtractor::html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(text.contains("Done"));
    }

    #[test]
    fn test_html_to_text_comments() {
        let html = "<p>Before</p><!-- This is a comment --><p>After</p>";
        let text = ContentExtractor::html_to_text(html);
        assert!(!text.contains("comment"));
        assert!(text.contains("Before"));
        assert!(text.contains("After"));
    }

    #[test]
    fn test_html_to_text_empty() {
        let html = "";
        let text = ContentExtractor::html_to_text(html);
        assert_eq!(text, "");
    }

    #[test]
    fn test_html_to_text_only_whitespace() {
        let html = "   \n\t  ";
        let text = ContentExtractor::html_to_text(html);
        assert_eq!(text, "");
    }
}