1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use std::collections::HashMap;
4use std::rc::Rc;
5use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
6
/// Element names that `is_noise` rejects outright, regardless of attributes.
const NOISE_TAGS: &[&str] = &[
    "nav", "header", "footer", "aside", "script", "style", "noscript", "iframe", "form",
];

/// Lowercase fragments searched for in `class` / `id` attribute values to flag
/// boilerplate (used by `is_noise` and as a penalty in `class_score`).
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "nav", "sidebar", "menu", "ad", "banner", "comment", "footer", "header", "widget",
];

/// Lowercase fragments searched for in `class` / `id` attribute values that
/// earn a bonus in `class_score`.
const CONTENT_CLASS_PATTERNS: &[&str] = &[
    "article", "content", "main", "post", "entry", "body", "text",
];
10
/// Extracts the main readable content of a parsed HTML document.
pub struct ReadabilityExtractor {
    /// When `true`, `<a>` elements are kept as anchors in the serialized HTML
    /// and recorded as extracted links; when `false`, only their text is kept.
    pub preserve_links: bool,
}
14
15impl ReadabilityExtractor {
16 pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
17 let title = extract_title(root);
18 let metadata = extract_metadata(root, base_url);
19 let body = find_body(root);
20
21 let main_node = body.as_ref()
22 .and_then(|b| find_main_content(b))
23 .or(body.clone());
24
25 let (mh, mt, ml) = main_node
26 .as_ref()
27 .map(|n| self.serialize_content(n, base_url))
28 .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
29
30 let (body_html, body_text, links) = if mt.trim().len() < 200 {
33 if let Some(b) = body.as_ref() {
34 let (bh, bt, bl) = self.serialize_content(b, base_url);
35 if bt.trim().len() > mt.trim().len() {
36 (bh, bt, bl)
37 } else {
38 (mh, mt, ml)
39 }
40 } else {
41 (mh, mt, ml)
42 }
43 } else {
44 (mh, mt, ml)
45 };
46
47 ExtractedContent {
48 url: base_url.clone(),
49 title: title.unwrap_or_else(|| base_url.to_string()),
50 byline: metadata.og_title.clone(),
51 body_text,
52 body_html,
53 links,
54 metadata,
55 }
56 }
57
58 fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
59 let mut html = String::new();
60 let mut text = String::new();
61 let mut links = Vec::new();
62 serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
63 (html, text, links)
64 }
65}
66
67fn find_body(root: &Handle) -> Option<Handle> {
68 find_tag(root, "body")
69}
70
71fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
72 if let NodeData::Element { name, .. } = &handle.data {
73 if name.local.as_ref() == tag_name {
74 return Some(handle.clone());
75 }
76 }
77 for child in handle.children.borrow().iter() {
78 if let Some(found) = find_tag(child, tag_name) {
79 return Some(found);
80 }
81 }
82 None
83}
84
85fn find_main_content(body: &Handle) -> Option<Handle> {
86 if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
88 return Some(node);
89 }
90
91 let mut candidates: HashMap<usize, (Handle, f64)> = HashMap::new();
94 let mut ancestors: Vec<Handle> = Vec::new();
95 collect_candidate_scores(body, &mut ancestors, &mut candidates);
96
97 let best = candidates
98 .into_values()
99 .map(|(h, raw)| {
100 let text_len = count_text(&h) as f64;
101 let link_len = count_link_text(&h) as f64;
102 let density = if text_len > 0.0 { link_len / text_len } else { 1.0 };
103 let bonus = class_score(&h);
104 let score = (raw + bonus) * (1.0 - density);
105 (h, score)
106 })
107 .filter(|(_, s)| *s > 0.0)
108 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
109 .map(|(node, _)| node);
110
111 if let Some(ref node) = best {
114 if let Some(expanded) = try_sibling_expand(node) {
115 return Some(expanded);
116 }
117 }
118 best
119}
120
121fn try_sibling_expand(node: &Handle) -> Option<Handle> {
127 let mut current = node.clone();
128 for _ in 0..3 {
129 let parent = {
131 let weak = current.parent.take();
132 current.parent.set(weak.clone());
133 weak?.upgrade()?
134 };
135
136 let current_tag = match ¤t.data {
137 NodeData::Element { name, .. } => name.local.as_ref().to_owned(),
138 _ => return None,
139 };
140
141 let same_tag_count = parent.children.borrow().iter()
142 .filter(|c| matches!(&c.data,
143 NodeData::Element { name, .. } if name.local.as_ref() == current_tag))
144 .count();
145
146 if same_tag_count >= 3 {
147 return Some(parent);
148 }
149
150 current = parent;
151 }
152 None
153}
154
155fn collect_candidate_scores(
158 handle: &Handle,
159 ancestors: &mut Vec<Handle>,
160 candidates: &mut HashMap<usize, (Handle, f64)>,
161) {
162 if is_noise(handle) {
163 return;
164 }
165
166 if let NodeData::Element { name, .. } = &handle.data {
167 let tag = name.local.as_ref();
168 let score = leaf_content_score(handle, tag);
169
170 if score > 0.0 {
171 let mut weight = 1.0;
172 let mut levels = 0usize;
173 for ancestor in ancestors.iter().rev() {
174 if let NodeData::Element { name: aname, .. } = &ancestor.data {
175 if is_candidate_tag(aname.local.as_ref()) {
176 let key = Rc::as_ptr(ancestor) as usize;
177 candidates
178 .entry(key)
179 .or_insert_with(|| (ancestor.clone(), 0.0))
180 .1 += score * weight;
181 weight *= 0.5;
182 levels += 1;
183 if levels >= 4 {
184 break;
185 }
186 }
187 }
188 }
189 }
190 }
191
192 ancestors.push(handle.clone());
193 for child in handle.children.borrow().iter() {
194 collect_candidate_scores(child, ancestors, candidates);
195 }
196 ancestors.pop();
197}
198
199fn leaf_content_score(handle: &Handle, tag: &str) -> f64 {
201 let text_len = count_text(handle) as f64;
202 if text_len < 20.0 {
203 return 0.0;
204 }
205 match tag {
206 "p" => 1.0 + (text_len / 100.0).min(3.0),
207 "pre" | "blockquote" => 3.0 + (text_len / 100.0).min(3.0),
208 "td" => (text_len / 50.0).min(3.0),
209 "li" => 0.5 + (text_len / 200.0).min(1.0),
210 _ => 0.0,
211 }
212}
213
/// Returns `true` when `tag` names an element that may accumulate content
/// score as an ancestor in `collect_candidate_scores`.
fn is_candidate_tag(tag: &str) -> bool {
    const CANDIDATE_TAGS: [&str; 9] = [
        "div",
        "section",
        "article",
        "main",
        "blockquote",
        "pre",
        "td",
        "tbody",
        "p",
    ];
    CANDIDATE_TAGS.contains(&tag)
}
218
219fn class_score(handle: &Handle) -> f64 {
220 let attrs = match &handle.data {
221 NodeData::Element { attrs, .. } => attrs.borrow(),
222 _ => return 0.0,
223 };
224
225 let mut score = 0.0;
226 for attr in attrs.iter() {
227 let name = attr.name.local.as_ref();
228 if name != "class" && name != "id" {
229 continue;
230 }
231 let val = attr.value.as_ref().to_lowercase();
232 for pattern in CONTENT_CLASS_PATTERNS {
233 if val.contains(pattern) {
234 score += 10.0;
235 }
236 }
237 for pattern in NOISE_CLASS_PATTERNS {
238 if val.contains(pattern) {
239 score -= 10.0;
240 }
241 }
242 }
243 score
244}
245
246fn is_noise(handle: &Handle) -> bool {
247 match &handle.data {
248 NodeData::Element { name, attrs, .. } => {
249 let tag = name.local.as_ref();
250 if NOISE_TAGS.contains(&tag) {
251 return true;
252 }
253 let attrs = attrs.borrow();
254 for attr in attrs.iter() {
255 let aname = attr.name.local.as_ref();
256 if aname != "class" && aname != "id" {
257 continue;
258 }
259 let val = attr.value.as_ref().to_lowercase();
260 for pattern in NOISE_CLASS_PATTERNS {
261 if val.contains(pattern) {
262 return true;
263 }
264 }
265 }
266 false
267 }
268 _ => false,
269 }
270}
271
272fn count_text(handle: &Handle) -> usize {
273 let mut total = 0;
274 count_text_inner(handle, &mut total);
275 total
276}
277
278fn count_text_inner(handle: &Handle, total: &mut usize) {
279 match &handle.data {
280 NodeData::Text { contents } => {
281 *total += contents.borrow().trim().len();
282 }
283 NodeData::Element { name, .. } => {
284 let tag = name.local.as_ref();
285 if tag == "script" || tag == "style" {
286 return;
287 }
288 for child in handle.children.borrow().iter() {
289 count_text_inner(child, total);
290 }
291 }
292 _ => {
293 for child in handle.children.borrow().iter() {
294 count_text_inner(child, total);
295 }
296 }
297 }
298}
299
300fn count_link_text(handle: &Handle) -> usize {
301 let mut total = 0;
302 count_link_text_inner(handle, &mut total, false);
303 total
304}
305
306fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
307 match &handle.data {
308 NodeData::Text { contents } if in_link => {
309 *total += contents.borrow().trim().len();
310 }
311 NodeData::Element { name, .. } => {
312 let tag = name.local.as_ref();
313 let is_link = tag == "a";
314 for child in handle.children.borrow().iter() {
315 count_link_text_inner(child, total, in_link || is_link);
316 }
317 }
318 _ => {}
319 }
320}
321
322fn serialize_node(
323 handle: &Handle,
324 html: &mut String,
325 text: &mut String,
326 links: &mut Vec<ExtractedLink>,
327 base_url: &Url,
328 preserve_links: bool,
329) {
330 if is_noise(handle) {
331 return;
332 }
333
334 match &handle.data {
335 NodeData::Text { contents } => {
336 let t = contents.borrow();
337 let trimmed = t.as_ref();
338 if !trimmed.trim().is_empty() {
339 html.push_str(&html_escape(trimmed));
340 text.push_str(trimmed);
341 }
342 }
343 NodeData::Element { name, attrs, .. } => {
344 let tag = name.local.as_ref();
345 let attrs_ref = attrs.borrow();
346
347 match tag {
348 "script" | "style" | "noscript" | "iframe" => return,
349 "a" if preserve_links => {
350 let href = attrs_ref.iter()
351 .find(|a| a.name.local.as_ref() == "href")
352 .map(|a| a.value.as_ref().to_owned());
353 let rel = attrs_ref.iter()
354 .find(|a| a.name.local.as_ref() == "rel")
355 .map(|a| a.value.as_ref().to_owned());
356
357 let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
358
359 html.push_str("<a");
360 if let Some(ref h) = href {
361 html.push_str(&format!(" href=\"{}\"", html_escape(h)));
362 }
363 html.push('>');
364
365 let mut link_text = String::new();
366 let mut link_html = String::new();
367 for child in handle.children.borrow().iter() {
368 serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
369 collect_text(child, &mut link_text);
370 }
371 html.push_str(&link_html);
372 html.push_str("</a>");
373
374 if let Some(href_url) = resolved {
375 let trimmed = link_text.trim().to_owned();
376 if !trimmed.is_empty() {
378 links.push(ExtractedLink {
379 text: trimmed,
380 href: href_url,
381 rel,
382 });
383 }
384 }
385 return;
386 }
387 _ => {
388 let is_block = matches!(tag, "p" | "div" | "section" | "article" |
390 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
391 "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
392 "table" | "tr" | "td" | "th" | "thead" | "tbody");
393
394 if is_block {
395 html.push('<');
396 html.push_str(tag);
397 html.push('>');
398 if tag == "br" || tag == "hr" {
399 } else {
401 for child in handle.children.borrow().iter() {
402 serialize_node(child, html, text, links, base_url, preserve_links);
403 }
404 html.push_str("</");
405 html.push_str(tag);
406 html.push('>');
407 }
408 } else {
409 for child in handle.children.borrow().iter() {
411 serialize_node(child, html, text, links, base_url, preserve_links);
412 }
413 }
414 return;
415 }
416 }
417 }
418 _ => {}
419 }
420}
421
422fn collect_text(handle: &Handle, out: &mut String) {
423 match &handle.data {
424 NodeData::Text { contents } => {
425 out.push_str(contents.borrow().as_ref());
426 }
427 _ => {
428 for child in handle.children.borrow().iter() {
429 collect_text(child, out);
430 }
431 }
432 }
433}
434
/// Escapes `&`, `<`, `>` and `"` so `s` can be embedded in HTML text or in a
/// double-quoted attribute value.
///
/// All other characters are copied through unchanged. The input is scanned
/// once, so an ampersand produced by an earlier replacement is never escaped
/// a second time.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
441
442fn extract_title(root: &Handle) -> Option<String> {
443 if let Some(title_node) = find_tag(root, "title") {
445 let mut text = String::new();
446 collect_text(&title_node, &mut text);
447 let trimmed = text.trim().to_owned();
448 if !trimmed.is_empty() {
449 return Some(trimmed);
450 }
451 }
452 if let Some(h1) = find_tag(root, "h1") {
453 let mut text = String::new();
454 collect_text(&h1, &mut text);
455 let trimmed = text.trim().to_owned();
456 if !trimmed.is_empty() {
457 return Some(trimmed);
458 }
459 }
460 None
461}
462
463fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
464 let mut meta = PageMetadata {
465 description: None,
466 og_title: None,
467 og_image: None,
468 canonical: None,
469 published_at: None,
470 };
471 collect_meta(root, &mut meta, base_url);
472 meta
473}
474
475fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
476 if let NodeData::Element { name, attrs, .. } = &handle.data {
477 let tag = name.local.as_ref();
478 let attrs_ref = attrs.borrow();
479
480 if tag == "meta" {
481 let name_attr = attrs_ref.iter()
482 .find(|a| a.name.local.as_ref() == "name")
483 .map(|a| a.value.as_ref().to_lowercase());
484 let property_attr = attrs_ref.iter()
485 .find(|a| a.name.local.as_ref() == "property")
486 .map(|a| a.value.as_ref().to_lowercase());
487 let content = attrs_ref.iter()
488 .find(|a| a.name.local.as_ref() == "content")
489 .map(|a| a.value.as_ref().to_owned());
490
491 match (name_attr.as_deref(), property_attr.as_deref(), content) {
492 (Some("description"), _, Some(c)) => meta.description = Some(c),
493 (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
494 (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
495 _ => {}
496 }
497 } else if tag == "link" {
498 let is_canonical = attrs_ref.iter()
499 .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
500 if is_canonical {
501 if let Some(href) = attrs_ref.iter()
502 .find(|a| a.name.local.as_ref() == "href")
503 .and_then(|a| base_url.join(a.value.as_ref()).ok())
504 {
505 meta.canonical = Some(href);
506 }
507 }
508 }
509 }
510
511 for child in handle.children.borrow().iter() {
512 collect_meta(child, meta, base_url);
513 }
514}