1pub mod readability;
2pub mod spa_detection;
3
4use html5ever::parse_document;
5use html5ever::tendril::TendrilSink;
6use markup5ever_rcdom::RcDom;
7
8use crate::document::{ExtractedContent, RawHtml, SpaDetection};
9use self::readability::ReadabilityExtractor;
10use self::spa_detection::{detect_spa, extract_text_length};
11
/// Tunable settings for [`ContentExtractor`].
#[derive(Debug, Clone)]
pub struct ExtractorConfig {
    /// Length threshold forwarded to SPA detection as its third argument.
    pub min_content_length: usize,
    /// Selectors describing noise elements.
    ///
    /// NOTE(review): not read anywhere in this module; presumably consumed
    /// elsewhere or reserved for future use — confirm.
    pub noise_selectors: Vec<String>,
    /// Forwarded to the readability extractor's `preserve_links` field.
    pub preserve_links: bool,
}
18
19impl Default for ExtractorConfig {
20 fn default() -> Self {
21 Self {
22 min_content_length: 500,
23 noise_selectors: vec![],
24 preserve_links: true,
25 }
26 }
27}
28
29#[derive(Debug, thiserror::Error)]
30pub enum ExtractionError {
31 #[error("Failed to decode HTML: {0}")]
32 Decode(String),
33 #[error("No content found")]
34 NoContent,
35}
36
37pub struct ContentExtractor {
38 config: ExtractorConfig,
39}
40
41impl ContentExtractor {
42 pub fn new(config: ExtractorConfig) -> Self {
43 Self { config }
44 }
45
46 pub fn extract(&self, raw: &RawHtml) -> Result<ExtractedContent, ExtractionError> {
48 let html_str = self.decode_bytes(raw)?;
49 let dom = self.parse_html(&html_str);
50 let root = dom.document.clone();
51
52 let text_len = extract_text_length(&root);
53 let _spa = detect_spa(&root, text_len, self.config.min_content_length);
54
55 let extractor = ReadabilityExtractor {
56 preserve_links: self.config.preserve_links,
57 };
58 let content = extractor.extract(&root, &raw.url);
59
60 Ok(content)
61 }
62
63 pub fn detect_spa_for(&self, raw: &RawHtml) -> Result<SpaDetection, ExtractionError> {
64 let html_str = self.decode_bytes(raw)?;
65 let dom = self.parse_html(&html_str);
66 let root = dom.document.clone();
67 let text_len = extract_text_length(&root);
68 Ok(detect_spa(&root, text_len, self.config.min_content_length))
69 }
70
71 fn decode_bytes(&self, raw: &RawHtml) -> Result<String, ExtractionError> {
72 if let Ok(s) = std::str::from_utf8(&raw.bytes) {
75 return Ok(s.to_owned());
76 }
77 Ok(raw.bytes.iter().map(|&b| b as char).collect())
79 }
80
81 fn parse_html(&self, html: &str) -> RcDom {
82 parse_document(RcDom::default(), Default::default())
83 .from_utf8()
84 .read_from(&mut html.as_bytes())
85 .unwrap_or_default()
86 }
87}