1pub mod readability;
2pub mod spa_detection;
3
4use html5ever::parse_document;
5use html5ever::tendril::TendrilSink;
6use markup5ever_rcdom::{NodeData, RcDom};
7use url::Url;
8
9use crate::document::{ExtractedContent, RawHtml, SpaDetection};
10use self::readability::ReadabilityExtractor;
11use self::spa_detection::{detect_spa, extract_text_length};
12
/// Tunable settings consumed by [`ContentExtractor`].
#[derive(Debug, Clone)]
pub struct ExtractorConfig {
    /// Threshold passed to SPA detection alongside the document's text length.
    pub min_content_length: usize,
    /// Selector strings. Not read anywhere in this module.
    /// NOTE(review): presumably used elsewhere to strip boilerplate nodes — confirm.
    pub noise_selectors: Vec<String>,
    /// Forwarded to the readability extractor's `preserve_links` field.
    pub preserve_links: bool,
}

impl Default for ExtractorConfig {
    /// Defaults: a 500 threshold, no noise selectors, links preserved.
    fn default() -> Self {
        ExtractorConfig {
            min_content_length: 500,
            noise_selectors: Vec::new(),
            preserve_links: true,
        }
    }
}
29
30#[derive(Debug, thiserror::Error)]
31pub enum ExtractionError {
32 #[error("Failed to decode HTML: {0}")]
33 Decode(String),
34 #[error("No content found")]
35 NoContent,
36}
37
38pub struct ContentExtractor {
39 config: ExtractorConfig,
40}
41
42impl ContentExtractor {
43 pub fn new(config: ExtractorConfig) -> Self {
44 Self { config }
45 }
46
47 pub fn extract(&self, raw: &RawHtml) -> Result<ExtractedContent, ExtractionError> {
49 let html_str = decode_bytes(raw);
50 let dom = parse_html(&html_str);
51 let root = dom.document.clone();
52
53 let text_len = extract_text_length(&root);
54 let _spa = detect_spa(&root, text_len, self.config.min_content_length);
55
56 let extractor = ReadabilityExtractor {
57 preserve_links: self.config.preserve_links,
58 };
59 let content = extractor.extract(&root, &raw.url);
60
61 Ok(content)
62 }
63
64 pub fn detect_spa_for(&self, raw: &RawHtml) -> Result<SpaDetection, ExtractionError> {
65 let html_str = decode_bytes(raw);
66 let dom = parse_html(&html_str);
67 let root = dom.document.clone();
68 let text_len = extract_text_length(&root);
69 Ok(detect_spa(&root, text_len, self.config.min_content_length))
70 }
71
72 pub fn detect_frames(&self, raw: &RawHtml) -> Vec<Url> {
74 let html_str = decode_bytes(raw);
75 let dom = parse_html(&html_str);
76 collect_frame_srcs(&dom.document, &raw.url)
77 }
78}
79
80fn decode_bytes(raw: &RawHtml) -> String {
83 let charset = extract_charset_from_content_type(&raw.content_type)
84 .or_else(|| sniff_charset_from_bytes(&raw.bytes));
85
86 if let Some(label) = charset {
87 if let Some(enc) = encoding_rs::Encoding::for_label(label.as_bytes()) {
88 let (cow, _, _) = enc.decode(&raw.bytes);
89 return cow.into_owned();
90 }
91 }
92
93 if let Ok(s) = std::str::from_utf8(&raw.bytes) {
94 return s.to_owned();
95 }
96
97 raw.bytes.iter().map(|&b| b as char).collect()
99}
100
101fn parse_html(html: &str) -> RcDom {
102 parse_document(RcDom::default(), Default::default())
103 .from_utf8()
104 .read_from(&mut html.as_bytes())
105 .unwrap_or_default()
106}
107
/// Extracts the `charset` parameter from a `Content-Type` header value.
///
/// The header is split on `;`, and the first parameter named `charset` with a non-empty
/// value is returned. The name is compared ASCII case-insensitively, because media-type
/// parameter names are case-insensitive. Whitespace around the name and value is
/// ignored, and surrounding double or single quotes are stripped.
///
/// Returns `None` when no such parameter exists or its value is empty.
fn extract_charset_from_content_type(content_type: &str) -> Option<String> {
    content_type.split(';').find_map(|param| {
        // Segments without `=` (such as the media type itself) are not parameters.
        let (name, value) = param.split_once('=')?;
        if !name.trim().eq_ignore_ascii_case("charset") {
            return None;
        }

        let value = value
            .trim()
            .trim_matches(|c: char| c == '"' || c == '\'')
            .trim();
        if value.is_empty() {
            None
        } else {
            Some(value.to_owned())
        }
    })
}
118
/// Looks for a `charset` declaration in the first 4096 bytes of a document.
///
/// The scan finds `charset` (ASCII case-insensitive) followed by optional whitespace,
/// `=`, optional whitespace and an optional opening quote. The label then runs up to
/// the next quote, `;`, `>` or ASCII whitespace. Occurrences that have no `=`, an empty
/// label, or a label that is not valid UTF-8 are skipped and the scan continues.
///
/// Returns the first label found, or `None` if there is none.
fn sniff_charset_from_bytes(bytes: &[u8]) -> Option<String> {
    const SNIFF_LIMIT: usize = 4096;
    const KEY: &[u8] = b"charset";

    /// Drops leading ASCII whitespace.
    fn skip_whitespace(input: &[u8]) -> &[u8] {
        let skipped = input
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count();
        &input[skipped..]
    }

    /// Parses `= label` (optionally quoted) from the bytes that follow the key.
    fn label_after_key(rest: &[u8]) -> Option<String> {
        let rest = skip_whitespace(rest).strip_prefix(b"=")?;
        let rest = skip_whitespace(rest);
        let rest = rest
            .strip_prefix(b"\"")
            .or_else(|| rest.strip_prefix(b"'"))
            .unwrap_or(rest);

        let label: Vec<u8> = rest
            .iter()
            .copied()
            .take_while(|&byte| {
                !(byte.is_ascii_whitespace() || matches!(byte, b'"' | b'\'' | b';' | b'>'))
            })
            .collect();
        if label.is_empty() {
            return None;
        }
        String::from_utf8(label).ok()
    }

    let head = &bytes[..bytes.len().min(SNIFF_LIMIT)];
    let mut cursor = 0;
    while let Some(found) = head[cursor..]
        .windows(KEY.len())
        .position(|window| window.eq_ignore_ascii_case(KEY))
    {
        // Advance past this occurrence so a rejected candidate is never revisited.
        cursor += found + KEY.len();
        if let Some(label) = label_after_key(&head[cursor..]) {
            return Some(label);
        }
    }
    None
}
141
142fn collect_frame_srcs(handle: &markup5ever_rcdom::Handle, base: &Url) -> Vec<Url> {
144 let mut result = Vec::new();
145 collect_frame_srcs_inner(handle, base, &mut result);
146 result
147}
148
149fn collect_frame_srcs_inner(
150 handle: &markup5ever_rcdom::Handle,
151 base: &Url,
152 out: &mut Vec<Url>,
153) {
154 if let NodeData::Element { name, attrs, .. } = &handle.data {
155 let tag = name.local.as_ref();
156 if tag == "frame" || tag == "iframe" {
157 if let Some(src) = attrs
158 .borrow()
159 .iter()
160 .find(|a| a.name.local.as_ref() == "src")
161 .map(|a| a.value.as_ref().to_owned())
162 {
163 if let Ok(url) = base.join(&src) {
164 out.push(url);
165 }
166 }
167 }
168 }
169 for child in handle.children.borrow().iter() {
170 collect_frame_srcs_inner(child, base, out);
171 }
172}