1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
4
/// Element names whose whole subtree is treated as page chrome rather than content.
const NOISE_TAGS: &[&str] = &[
    "nav", "header", "footer", "aside", "script", "style", "noscript", "iframe", "form",
];

/// Substrings that, when found in a lowercased `class` or `id` value, mark an element as noise.
///
/// Matching is by substring, so short entries such as `"ad"` also hit values
/// like `"header"` or `"shadow"`.
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "nav", "sidebar", "menu", "ad", "banner", "comment", "footer", "header", "widget",
];

/// Substrings that, when found in a lowercased `class` or `id` value, raise an element's score.
const CONTENT_CLASS_PATTERNS: &[&str] = &[
    "article", "content", "main", "post", "entry", "body", "text",
];
8
/// Extracts the main readable content of a parsed HTML document.
///
/// The only configuration is [`preserve_links`](Self::preserve_links).
#[derive(Debug, Clone)]
pub struct ReadabilityExtractor {
    /// When `true`, `<a>` elements are kept in the serialized HTML and every
    /// link whose `href` resolves against the base URL is collected.
    /// When `false`, anchors are unwrapped to their children and no links
    /// are collected.
    pub preserve_links: bool,
}
12
13impl ReadabilityExtractor {
14 pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
15 let title = extract_title(root);
16 let metadata = extract_metadata(root, base_url);
17 let body = find_body(root);
18
19 let main_node = body.as_ref()
20 .and_then(|b| find_main_content(b))
21 .or(body.clone());
22
23 let (body_html, body_text, links) = main_node
24 .as_ref()
25 .map(|n| self.serialize_content(n, base_url))
26 .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
27
28 ExtractedContent {
29 url: base_url.clone(),
30 title: title.unwrap_or_else(|| base_url.to_string()),
31 byline: metadata.og_title.clone(),
32 body_text,
33 body_html,
34 links,
35 metadata,
36 }
37 }
38
39 fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
40 let mut html = String::new();
41 let mut text = String::new();
42 let mut links = Vec::new();
43 serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
44 (html, text, links)
45 }
46}
47
48fn find_body(root: &Handle) -> Option<Handle> {
49 find_tag(root, "body")
50}
51
52fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
53 if let NodeData::Element { name, .. } = &handle.data {
54 if name.local.as_ref() == tag_name {
55 return Some(handle.clone());
56 }
57 }
58 for child in handle.children.borrow().iter() {
59 if let Some(found) = find_tag(child, tag_name) {
60 return Some(found);
61 }
62 }
63 None
64}
65
66fn find_main_content(body: &Handle) -> Option<Handle> {
67 if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
69 return Some(node);
70 }
71
72 let mut best: Option<(Handle, f64)> = None;
74 score_nodes(body, &mut best);
75 best.map(|(node, _)| node)
76}
77
78fn score_nodes(handle: &Handle, best: &mut Option<(Handle, f64)>) {
79 if is_noise(handle) {
80 return;
81 }
82
83 if let NodeData::Element { name, .. } = &handle.data {
84 let tag = name.local.as_ref();
85 let score = compute_score(handle, tag);
86 if score > 20.0 {
87 match best {
88 None => *best = Some((handle.clone(), score)),
89 Some((_, best_score)) if score > *best_score => {
90 *best = Some((handle.clone(), score));
91 }
92 _ => {}
93 }
94 }
95 }
96
97 for child in handle.children.borrow().iter() {
98 score_nodes(child, best);
99 }
100}
101
102fn compute_score(handle: &Handle, tag: &str) -> f64 {
103 let base = match tag {
104 "article" => 30.0,
105 "section" => 10.0,
106 "div" => 5.0,
107 "p" => 3.0,
108 "td" => 3.0,
109 "blockquote" => 3.0,
110 "pre" => 3.0,
111 _ => 0.0,
112 };
113
114 if base == 0.0 {
115 return 0.0;
116 }
117
118 let class_bonus = class_score(handle);
120
121 let text_len = count_text(handle) as f64;
122 let link_len = count_link_text(handle) as f64;
123
124 let link_density = if text_len > 0.0 { link_len / text_len } else { 0.0 };
126 let density_penalty = link_density * 50.0;
127
128 base + class_bonus + (text_len * 0.1).min(30.0) - density_penalty
129}
130
131fn class_score(handle: &Handle) -> f64 {
132 let attrs = match &handle.data {
133 NodeData::Element { attrs, .. } => attrs.borrow(),
134 _ => return 0.0,
135 };
136
137 let mut score = 0.0;
138 for attr in attrs.iter() {
139 let name = attr.name.local.as_ref();
140 if name != "class" && name != "id" {
141 continue;
142 }
143 let val = attr.value.as_ref().to_lowercase();
144 for pattern in CONTENT_CLASS_PATTERNS {
145 if val.contains(pattern) {
146 score += 10.0;
147 }
148 }
149 for pattern in NOISE_CLASS_PATTERNS {
150 if val.contains(pattern) {
151 score -= 10.0;
152 }
153 }
154 }
155 score
156}
157
158fn is_noise(handle: &Handle) -> bool {
159 match &handle.data {
160 NodeData::Element { name, attrs, .. } => {
161 let tag = name.local.as_ref();
162 if NOISE_TAGS.contains(&tag) {
163 return true;
164 }
165 let attrs = attrs.borrow();
166 for attr in attrs.iter() {
167 let aname = attr.name.local.as_ref();
168 if aname != "class" && aname != "id" {
169 continue;
170 }
171 let val = attr.value.as_ref().to_lowercase();
172 for pattern in NOISE_CLASS_PATTERNS {
173 if val.contains(pattern) {
174 return true;
175 }
176 }
177 }
178 false
179 }
180 _ => false,
181 }
182}
183
184fn count_text(handle: &Handle) -> usize {
185 let mut total = 0;
186 count_text_inner(handle, &mut total);
187 total
188}
189
190fn count_text_inner(handle: &Handle, total: &mut usize) {
191 match &handle.data {
192 NodeData::Text { contents } => {
193 *total += contents.borrow().trim().len();
194 }
195 NodeData::Element { name, .. } => {
196 let tag = name.local.as_ref();
197 if tag == "script" || tag == "style" {
198 return;
199 }
200 for child in handle.children.borrow().iter() {
201 count_text_inner(child, total);
202 }
203 }
204 _ => {
205 for child in handle.children.borrow().iter() {
206 count_text_inner(child, total);
207 }
208 }
209 }
210}
211
212fn count_link_text(handle: &Handle) -> usize {
213 let mut total = 0;
214 count_link_text_inner(handle, &mut total, false);
215 total
216}
217
218fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
219 match &handle.data {
220 NodeData::Text { contents } if in_link => {
221 *total += contents.borrow().trim().len();
222 }
223 NodeData::Element { name, .. } => {
224 let tag = name.local.as_ref();
225 let is_link = tag == "a";
226 for child in handle.children.borrow().iter() {
227 count_link_text_inner(child, total, in_link || is_link);
228 }
229 }
230 _ => {}
231 }
232}
233
234fn serialize_node(
235 handle: &Handle,
236 html: &mut String,
237 text: &mut String,
238 links: &mut Vec<ExtractedLink>,
239 base_url: &Url,
240 preserve_links: bool,
241) {
242 if is_noise(handle) {
243 return;
244 }
245
246 match &handle.data {
247 NodeData::Text { contents } => {
248 let t = contents.borrow();
249 let trimmed = t.as_ref();
250 if !trimmed.trim().is_empty() {
251 html.push_str(&html_escape(trimmed));
252 text.push_str(trimmed);
253 }
254 }
255 NodeData::Element { name, attrs, .. } => {
256 let tag = name.local.as_ref();
257 let attrs_ref = attrs.borrow();
258
259 match tag {
260 "script" | "style" | "noscript" | "iframe" => return,
261 "a" if preserve_links => {
262 let href = attrs_ref.iter()
263 .find(|a| a.name.local.as_ref() == "href")
264 .map(|a| a.value.as_ref().to_owned());
265 let rel = attrs_ref.iter()
266 .find(|a| a.name.local.as_ref() == "rel")
267 .map(|a| a.value.as_ref().to_owned());
268
269 let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
270
271 html.push_str("<a");
272 if let Some(ref h) = href {
273 html.push_str(&format!(" href=\"{}\"", html_escape(h)));
274 }
275 html.push('>');
276
277 let mut link_text = String::new();
278 let mut link_html = String::new();
279 for child in handle.children.borrow().iter() {
280 serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
281 collect_text(child, &mut link_text);
282 }
283 html.push_str(&link_html);
284 html.push_str("</a>");
285
286 if let Some(href_url) = resolved {
287 links.push(ExtractedLink {
288 text: link_text.trim().to_owned(),
289 href: href_url,
290 rel,
291 });
292 }
293 return;
294 }
295 _ => {
296 let is_block = matches!(tag, "p" | "div" | "section" | "article" |
298 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
299 "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
300 "table" | "tr" | "td" | "th" | "thead" | "tbody");
301
302 if is_block {
303 html.push('<');
304 html.push_str(tag);
305 html.push('>');
306 if tag == "br" || tag == "hr" {
307 } else {
309 for child in handle.children.borrow().iter() {
310 serialize_node(child, html, text, links, base_url, preserve_links);
311 }
312 html.push_str("</");
313 html.push_str(tag);
314 html.push('>');
315 }
316 } else {
317 for child in handle.children.borrow().iter() {
319 serialize_node(child, html, text, links, base_url, preserve_links);
320 }
321 }
322 return;
323 }
324 }
325 }
326 _ => {}
327 }
328}
329
330fn collect_text(handle: &Handle, out: &mut String) {
331 match &handle.data {
332 NodeData::Text { contents } => {
333 out.push_str(contents.borrow().as_ref());
334 }
335 _ => {
336 for child in handle.children.borrow().iter() {
337 collect_text(child, out);
338 }
339 }
340 }
341}
342
/// Escapes `&`, `<`, `>` and `"` so that `s` is safe to place in HTML text
/// or inside a double-quoted attribute value.
///
/// The input is escaped in a single pass, so an `&` that is already part of
/// an entity is escaped again (`&amp;` becomes `&amp;amp;`).
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
349
350fn extract_title(root: &Handle) -> Option<String> {
351 if let Some(title_node) = find_tag(root, "title") {
353 let mut text = String::new();
354 collect_text(&title_node, &mut text);
355 let trimmed = text.trim().to_owned();
356 if !trimmed.is_empty() {
357 return Some(trimmed);
358 }
359 }
360 if let Some(h1) = find_tag(root, "h1") {
361 let mut text = String::new();
362 collect_text(&h1, &mut text);
363 let trimmed = text.trim().to_owned();
364 if !trimmed.is_empty() {
365 return Some(trimmed);
366 }
367 }
368 None
369}
370
371fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
372 let mut meta = PageMetadata {
373 description: None,
374 og_title: None,
375 og_image: None,
376 canonical: None,
377 published_at: None,
378 };
379 collect_meta(root, &mut meta, base_url);
380 meta
381}
382
383fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
384 if let NodeData::Element { name, attrs, .. } = &handle.data {
385 let tag = name.local.as_ref();
386 let attrs_ref = attrs.borrow();
387
388 if tag == "meta" {
389 let name_attr = attrs_ref.iter()
390 .find(|a| a.name.local.as_ref() == "name")
391 .map(|a| a.value.as_ref().to_lowercase());
392 let property_attr = attrs_ref.iter()
393 .find(|a| a.name.local.as_ref() == "property")
394 .map(|a| a.value.as_ref().to_lowercase());
395 let content = attrs_ref.iter()
396 .find(|a| a.name.local.as_ref() == "content")
397 .map(|a| a.value.as_ref().to_owned());
398
399 match (name_attr.as_deref(), property_attr.as_deref(), content) {
400 (Some("description"), _, Some(c)) => meta.description = Some(c),
401 (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
402 (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
403 _ => {}
404 }
405 } else if tag == "link" {
406 let is_canonical = attrs_ref.iter()
407 .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
408 if is_canonical {
409 if let Some(href) = attrs_ref.iter()
410 .find(|a| a.name.local.as_ref() == "href")
411 .and_then(|a| base_url.join(a.value.as_ref()).ok())
412 {
413 meta.canonical = Some(href);
414 }
415 }
416 }
417 }
418
419 for child in handle.children.borrow().iter() {
420 collect_meta(child, meta, base_url);
421 }
422}