1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use std::collections::HashMap;
4use std::rc::Rc;
5use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
6
/// Element names whose whole subtree is treated as boilerplate and skipped.
const NOISE_TAGS: &[&str] = &[
    "nav",
    "header",
    "footer",
    "aside",
    "script",
    "style",
    "noscript",
    "iframe",
    "form",
];

/// `class`/`id` name fragments that mark an element as boilerplate.
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "nav",
    "sidebar",
    "menu",
    "ad",
    "banner",
    "comment",
    "footer",
    "header",
    "widget",
];

/// `class`/`id` name fragments that earn an element a scoring bonus.
const CONTENT_CLASS_PATTERNS: &[&str] = &[
    "article",
    "content",
    "main",
    "post",
    "entry",
    "body",
    "text",
];
10
11pub struct ReadabilityExtractor {
12 pub preserve_links: bool,
13}
14
15impl ReadabilityExtractor {
16 pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
17 let title = extract_title(root);
18 let metadata = extract_metadata(root, base_url);
19 let body = find_body(root);
20
21 let main_node = body.as_ref()
22 .and_then(|b| find_main_content(b))
23 .or(body.clone());
24
25 let (mh, mt, ml) = main_node
26 .as_ref()
27 .map(|n| self.serialize_content(n, base_url))
28 .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
29
30 let (body_html, body_text, links) = if mt.trim().len() < 200 {
33 if let Some(b) = body.as_ref() {
34 let (bh, bt, bl) = self.serialize_content(b, base_url);
35 if bt.trim().len() > mt.trim().len() {
36 (bh, bt, bl)
37 } else {
38 (mh, mt, ml)
39 }
40 } else {
41 (mh, mt, ml)
42 }
43 } else {
44 (mh, mt, ml)
45 };
46
47 ExtractedContent {
48 url: base_url.clone(),
49 title: title.unwrap_or_else(|| base_url.to_string()),
50 byline: metadata.og_title.clone(),
51 body_text,
52 body_html,
53 links,
54 metadata,
55 }
56 }
57
58 fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
59 let mut html = String::new();
60 let mut text = String::new();
61 let mut links = Vec::new();
62 serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
63 (html, text, links)
64 }
65}
66
67fn find_body(root: &Handle) -> Option<Handle> {
68 find_tag(root, "body")
69}
70
71fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
72 if let NodeData::Element { name, .. } = &handle.data {
73 if name.local.as_ref() == tag_name {
74 return Some(handle.clone());
75 }
76 }
77 for child in handle.children.borrow().iter() {
78 if let Some(found) = find_tag(child, tag_name) {
79 return Some(found);
80 }
81 }
82 None
83}
84
85fn find_main_content(body: &Handle) -> Option<Handle> {
86 if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
88 return Some(node);
89 }
90
91 let mut candidates: HashMap<usize, (Handle, f64)> = HashMap::new();
94 let mut ancestors: Vec<Handle> = Vec::new();
95 collect_candidate_scores(body, &mut ancestors, &mut candidates);
96
97 let best = candidates
98 .into_values()
99 .map(|(h, raw)| {
100 let text_len = count_text(&h) as f64;
101 let link_len = count_link_text(&h) as f64;
102 let density = if text_len > 0.0 { link_len / text_len } else { 1.0 };
103 let bonus = class_score(&h);
104 let score = (raw + bonus) * (1.0 - density);
105 (h, score)
106 })
107 .filter(|(_, s)| *s > 0.0)
108 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
109 .map(|(node, _)| node);
110
111 if let Some(ref node) = best {
114 if let Some(expanded) = try_sibling_expand(node) {
115 return Some(expanded);
116 }
117 }
118 best
119}
120
121fn try_sibling_expand(node: &Handle) -> Option<Handle> {
127 let mut current = node.clone();
128 for _ in 0..3 {
129 let parent = {
131 let weak = current.parent.take();
132 current.parent.set(weak.clone());
133 weak?.upgrade()?
134 };
135
136 let current_tag = match ¤t.data {
137 NodeData::Element { name, .. } => name.local.as_ref().to_owned(),
138 _ => return None,
139 };
140
141 let same_tag_count = parent.children.borrow().iter()
142 .filter(|c| matches!(&c.data,
143 NodeData::Element { name, .. } if name.local.as_ref() == current_tag))
144 .count();
145
146 if same_tag_count >= 3 {
147 return Some(parent);
148 }
149
150 current = parent;
151 }
152 None
153}
154
155fn collect_candidate_scores(
158 handle: &Handle,
159 ancestors: &mut Vec<Handle>,
160 candidates: &mut HashMap<usize, (Handle, f64)>,
161) {
162 if is_noise(handle) {
163 return;
164 }
165
166 if let NodeData::Element { name, .. } = &handle.data {
167 let tag = name.local.as_ref();
168 let score = leaf_content_score(handle, tag);
169
170 if score > 0.0 {
171 let mut weight = 1.0;
172 let mut levels = 0usize;
173 for ancestor in ancestors.iter().rev() {
174 if let NodeData::Element { name: aname, .. } = &ancestor.data {
175 if is_candidate_tag(aname.local.as_ref()) {
176 let key = Rc::as_ptr(ancestor) as usize;
177 candidates
178 .entry(key)
179 .or_insert_with(|| (ancestor.clone(), 0.0))
180 .1 += score * weight;
181 weight *= 0.5;
182 levels += 1;
183 if levels >= 4 {
184 break;
185 }
186 }
187 }
188 }
189 }
190 }
191
192 ancestors.push(handle.clone());
193 for child in handle.children.borrow().iter() {
194 collect_candidate_scores(child, ancestors, candidates);
195 }
196 ancestors.pop();
197}
198
199fn leaf_content_score(handle: &Handle, tag: &str) -> f64 {
201 let text_len = count_text(handle) as f64;
202 if text_len < 20.0 {
203 return 0.0;
204 }
205 match tag {
206 "p" => 1.0 + (text_len / 100.0).min(3.0),
207 "pre" | "blockquote" => 3.0 + (text_len / 100.0).min(3.0),
208 "td" => (text_len / 50.0).min(3.0),
209 "li" => 0.5 + (text_len / 200.0).min(1.0),
210 _ => 0.0,
211 }
212}
213
/// Reports whether an element with this (lower-case) tag name may act as a
/// content container that accumulates score from its descendants.
fn is_candidate_tag(tag: &str) -> bool {
    const CONTAINER_TAGS: [&str; 9] = [
        "div",
        "section",
        "article",
        "main",
        "blockquote",
        "pre",
        "td",
        "tbody",
        "p",
    ];
    CONTAINER_TAGS.contains(&tag)
}
218
219fn class_score(handle: &Handle) -> f64 {
220 let attrs = match &handle.data {
221 NodeData::Element { attrs, .. } => attrs.borrow(),
222 _ => return 0.0,
223 };
224
225 let mut score = 0.0;
226 for attr in attrs.iter() {
227 let name = attr.name.local.as_ref();
228 if name != "class" && name != "id" {
229 continue;
230 }
231 let val = attr.value.as_ref().to_lowercase();
232 for pattern in CONTENT_CLASS_PATTERNS {
233 if class_contains_pattern(&val, pattern) {
234 score += 10.0;
235 }
236 }
237 for pattern in NOISE_CLASS_PATTERNS {
238 if class_contains_pattern(&val, pattern) {
239 score -= 10.0;
240 }
241 }
242 }
243 score
244}
245
246fn is_noise(handle: &Handle) -> bool {
247 match &handle.data {
248 NodeData::Element { name, attrs, .. } => {
249 let tag = name.local.as_ref();
250 if NOISE_TAGS.contains(&tag) {
251 return true;
252 }
253 let attrs = attrs.borrow();
254 for attr in attrs.iter() {
255 let aname = attr.name.local.as_ref();
256 if aname != "class" && aname != "id" {
257 continue;
258 }
259 let val = attr.value.as_ref().to_lowercase();
260 for pattern in NOISE_CLASS_PATTERNS {
261 if class_contains_pattern(&val, pattern) {
262 return true;
263 }
264 }
265 }
266 false
267 }
268 _ => false,
269 }
270}
271
/// Reports whether any whitespace-separated token of `class_val` contains
/// `pattern` as a whole hyphen-delimited segment.
///
/// Everything up to and including the last `:` in a token is ignored first
/// (so `md:site-nav` is examined as `site-nav`). Matching is exact and
/// case-sensitive: `navigation` does not match `nav`.
fn class_contains_pattern(class_val: &str, pattern: &str) -> bool {
    for token in class_val.split_whitespace() {
        let bare = match token.rfind(':') {
            Some(colon) => &token[colon + 1..],
            None => token,
        };
        if bare.split('-').any(|segment| segment == pattern) {
            return true;
        }
    }
    false
}
283
284fn count_text(handle: &Handle) -> usize {
285 let mut total = 0;
286 count_text_inner(handle, &mut total);
287 total
288}
289
290fn count_text_inner(handle: &Handle, total: &mut usize) {
291 match &handle.data {
292 NodeData::Text { contents } => {
293 *total += contents.borrow().trim().len();
294 }
295 NodeData::Element { name, .. } => {
296 let tag = name.local.as_ref();
297 if tag == "script" || tag == "style" {
298 return;
299 }
300 for child in handle.children.borrow().iter() {
301 count_text_inner(child, total);
302 }
303 }
304 _ => {
305 for child in handle.children.borrow().iter() {
306 count_text_inner(child, total);
307 }
308 }
309 }
310}
311
312fn count_link_text(handle: &Handle) -> usize {
313 let mut total = 0;
314 count_link_text_inner(handle, &mut total, false);
315 total
316}
317
318fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
319 match &handle.data {
320 NodeData::Text { contents } if in_link => {
321 *total += contents.borrow().trim().len();
322 }
323 NodeData::Element { name, .. } => {
324 let tag = name.local.as_ref();
325 let is_link = tag == "a";
326 for child in handle.children.borrow().iter() {
327 count_link_text_inner(child, total, in_link || is_link);
328 }
329 }
330 _ => {}
331 }
332}
333
334fn serialize_node(
335 handle: &Handle,
336 html: &mut String,
337 text: &mut String,
338 links: &mut Vec<ExtractedLink>,
339 base_url: &Url,
340 preserve_links: bool,
341) {
342 if is_noise(handle) {
343 return;
344 }
345
346 match &handle.data {
347 NodeData::Text { contents } => {
348 let t = contents.borrow();
349 let trimmed = t.as_ref();
350 if !trimmed.trim().is_empty() {
351 html.push_str(&html_escape(trimmed));
352 text.push_str(trimmed);
353 }
354 }
355 NodeData::Element { name, attrs, .. } => {
356 let tag = name.local.as_ref();
357 let attrs_ref = attrs.borrow();
358
359 match tag {
360 "script" | "style" | "noscript" | "iframe" => return,
361 "a" if preserve_links => {
362 let href = attrs_ref.iter()
363 .find(|a| a.name.local.as_ref() == "href")
364 .map(|a| a.value.as_ref().to_owned());
365 let rel = attrs_ref.iter()
366 .find(|a| a.name.local.as_ref() == "rel")
367 .map(|a| a.value.as_ref().to_owned());
368
369 let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
370
371 html.push_str("<a");
372 if let Some(ref h) = href {
373 html.push_str(&format!(" href=\"{}\"", html_escape(h)));
374 }
375 html.push('>');
376
377 let mut link_text = String::new();
378 let mut link_html = String::new();
379 for child in handle.children.borrow().iter() {
380 serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
381 collect_text(child, &mut link_text);
382 }
383 html.push_str(&link_html);
384 html.push_str("</a>");
385
386 if let Some(href_url) = resolved {
387 let trimmed = link_text.trim().to_owned();
388 if !trimmed.is_empty() {
390 links.push(ExtractedLink {
391 text: trimmed,
392 href: href_url,
393 rel,
394 });
395 }
396 }
397 return;
398 }
399 _ => {
400 let is_block = matches!(tag, "p" | "div" | "section" | "article" |
402 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
403 "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
404 "table" | "tr" | "td" | "th" | "thead" | "tbody");
405
406 if is_block {
407 html.push('<');
408 html.push_str(tag);
409 html.push('>');
410 if tag == "br" || tag == "hr" {
411 } else {
413 for child in handle.children.borrow().iter() {
414 serialize_node(child, html, text, links, base_url, preserve_links);
415 }
416 html.push_str("</");
417 html.push_str(tag);
418 html.push('>');
419 }
420 } else {
421 for child in handle.children.borrow().iter() {
423 serialize_node(child, html, text, links, base_url, preserve_links);
424 }
425 }
426 return;
427 }
428 }
429 }
430 _ => {}
431 }
432}
433
434fn collect_text(handle: &Handle, out: &mut String) {
435 match &handle.data {
436 NodeData::Text { contents } => {
437 out.push_str(contents.borrow().as_ref());
438 }
439 _ => {
440 for child in handle.children.borrow().iter() {
441 collect_text(child, out);
442 }
443 }
444 }
445}
446
/// Escapes `s` for inclusion in HTML text or a double-quoted attribute value.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;`; every other character is copied unchanged. The input is walked
/// once, so an `&` produced by one replacement is never escaped again.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
453
454fn extract_title(root: &Handle) -> Option<String> {
455 if let Some(title_node) = find_tag(root, "title") {
457 let mut text = String::new();
458 collect_text(&title_node, &mut text);
459 let trimmed = text.trim().to_owned();
460 if !trimmed.is_empty() {
461 return Some(trimmed);
462 }
463 }
464 if let Some(h1) = find_tag(root, "h1") {
465 let mut text = String::new();
466 collect_text(&h1, &mut text);
467 let trimmed = text.trim().to_owned();
468 if !trimmed.is_empty() {
469 return Some(trimmed);
470 }
471 }
472 None
473}
474
475fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
476 let mut meta = PageMetadata {
477 description: None,
478 og_title: None,
479 og_image: None,
480 canonical: None,
481 published_at: None,
482 };
483 collect_meta(root, &mut meta, base_url);
484 meta
485}
486
487fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
488 if let NodeData::Element { name, attrs, .. } = &handle.data {
489 let tag = name.local.as_ref();
490 let attrs_ref = attrs.borrow();
491
492 if tag == "meta" {
493 let name_attr = attrs_ref.iter()
494 .find(|a| a.name.local.as_ref() == "name")
495 .map(|a| a.value.as_ref().to_lowercase());
496 let property_attr = attrs_ref.iter()
497 .find(|a| a.name.local.as_ref() == "property")
498 .map(|a| a.value.as_ref().to_lowercase());
499 let content = attrs_ref.iter()
500 .find(|a| a.name.local.as_ref() == "content")
501 .map(|a| a.value.as_ref().to_owned());
502
503 match (name_attr.as_deref(), property_attr.as_deref(), content) {
504 (Some("description"), _, Some(c)) => meta.description = Some(c),
505 (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
506 (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
507 _ => {}
508 }
509 } else if tag == "link" {
510 let is_canonical = attrs_ref.iter()
511 .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
512 if is_canonical {
513 if let Some(href) = attrs_ref.iter()
514 .find(|a| a.name.local.as_ref() == "href")
515 .and_then(|a| base_url.join(a.value.as_ref()).ok())
516 {
517 meta.canonical = Some(href);
518 }
519 }
520 }
521 }
522
523 for child in handle.children.borrow().iter() {
524 collect_meta(child, meta, base_url);
525 }
526}