1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use std::collections::HashMap;
4use std::rc::Rc;
5use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
6
/// Element names that `is_noise` treats as boilerplate regardless of their attributes.
const NOISE_TAGS: &[&str] = &["nav", "header", "footer", "aside", "script", "style", "noscript", "iframe", "form"];
/// Lowercase fragments looked for in `class`/`id` attribute values to flag boilerplate
/// (used by `is_noise`, and as a penalty in `class_score`).
const NOISE_CLASS_PATTERNS: &[&str] = &["nav", "sidebar", "menu", "ad", "banner", "comment", "footer", "header", "widget"];
/// Lowercase fragments looked for in `class`/`id` attribute values that earn a
/// candidate a bonus in `class_score`.
const CONTENT_CLASS_PATTERNS: &[&str] = &["article", "content", "main", "post", "entry", "body", "text"];
10
/// Heuristic extractor that pulls the title, metadata and main readable content
/// out of a parsed HTML tree.
pub struct ReadabilityExtractor {
    /// When `true`, `<a>` elements are kept in the serialized HTML and recorded as
    /// extracted links; when `false`, only their children are serialized.
    pub preserve_links: bool,
}
14
15impl ReadabilityExtractor {
16 pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
17 let title = extract_title(root);
18 let metadata = extract_metadata(root, base_url);
19 let body = find_body(root);
20
21 let main_node = body.as_ref()
22 .and_then(|b| find_main_content(b))
23 .or(body.clone());
24
25 let (mh, mt, ml) = main_node
26 .as_ref()
27 .map(|n| self.serialize_content(n, base_url))
28 .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
29
30 let (body_html, body_text, links) = if mt.trim().len() < 200 {
33 if let Some(b) = body.as_ref() {
34 let (bh, bt, bl) = self.serialize_content(b, base_url);
35 if bt.trim().len() > mt.trim().len() {
36 (bh, bt, bl)
37 } else {
38 (mh, mt, ml)
39 }
40 } else {
41 (mh, mt, ml)
42 }
43 } else {
44 (mh, mt, ml)
45 };
46
47 ExtractedContent {
48 url: base_url.clone(),
49 title: title.unwrap_or_else(|| base_url.to_string()),
50 byline: metadata.og_title.clone(),
51 body_text,
52 body_html,
53 links,
54 metadata,
55 }
56 }
57
58 fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
59 let mut html = String::new();
60 let mut text = String::new();
61 let mut links = Vec::new();
62 serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
63 (html, text, links)
64 }
65}
66
67fn find_body(root: &Handle) -> Option<Handle> {
68 find_tag(root, "body")
69}
70
71fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
72 if let NodeData::Element { name, .. } = &handle.data {
73 if name.local.as_ref() == tag_name {
74 return Some(handle.clone());
75 }
76 }
77 for child in handle.children.borrow().iter() {
78 if let Some(found) = find_tag(child, tag_name) {
79 return Some(found);
80 }
81 }
82 None
83}
84
85fn find_main_content(body: &Handle) -> Option<Handle> {
86 if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
88 return Some(node);
89 }
90
91 let mut candidates: HashMap<usize, (Handle, f64)> = HashMap::new();
94 let mut ancestors: Vec<Handle> = Vec::new();
95 collect_candidate_scores(body, &mut ancestors, &mut candidates);
96
97 candidates
98 .into_values()
99 .map(|(h, raw)| {
100 let text_len = count_text(&h) as f64;
101 let link_len = count_link_text(&h) as f64;
102 let density = if text_len > 0.0 { link_len / text_len } else { 1.0 };
103 let bonus = class_score(&h);
104 let score = (raw + bonus) * (1.0 - density);
105 (h, score)
106 })
107 .filter(|(_, s)| *s > 0.0)
108 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
109 .map(|(node, _)| node)
110}
111
112fn collect_candidate_scores(
115 handle: &Handle,
116 ancestors: &mut Vec<Handle>,
117 candidates: &mut HashMap<usize, (Handle, f64)>,
118) {
119 if is_noise(handle) {
120 return;
121 }
122
123 if let NodeData::Element { name, .. } = &handle.data {
124 let tag = name.local.as_ref();
125 let score = leaf_content_score(handle, tag);
126
127 if score > 0.0 {
128 let mut weight = 1.0;
129 let mut levels = 0usize;
130 for ancestor in ancestors.iter().rev() {
131 if let NodeData::Element { name: aname, .. } = &ancestor.data {
132 if is_candidate_tag(aname.local.as_ref()) {
133 let key = Rc::as_ptr(ancestor) as usize;
134 candidates
135 .entry(key)
136 .or_insert_with(|| (ancestor.clone(), 0.0))
137 .1 += score * weight;
138 weight *= 0.5;
139 levels += 1;
140 if levels >= 4 {
141 break;
142 }
143 }
144 }
145 }
146 }
147 }
148
149 ancestors.push(handle.clone());
150 for child in handle.children.borrow().iter() {
151 collect_candidate_scores(child, ancestors, candidates);
152 }
153 ancestors.pop();
154}
155
156fn leaf_content_score(handle: &Handle, tag: &str) -> f64 {
158 let text_len = count_text(handle) as f64;
159 if text_len < 20.0 {
160 return 0.0;
161 }
162 match tag {
163 "p" => 1.0 + (text_len / 100.0).min(3.0),
164 "pre" | "blockquote" => 3.0 + (text_len / 100.0).min(3.0),
165 "td" => (text_len / 50.0).min(3.0),
166 "li" => 0.5 + (text_len / 200.0).min(1.0),
167 _ => 0.0,
168 }
169}
170
/// Returns `true` for element names that may accumulate a content score as a
/// container.
fn is_candidate_tag(tag: &str) -> bool {
    const CANDIDATE_TAGS: [&str; 9] = [
        "div",
        "section",
        "article",
        "main",
        "blockquote",
        "pre",
        "td",
        "tbody",
        "p",
    ];
    CANDIDATE_TAGS.contains(&tag)
}
175
176fn class_score(handle: &Handle) -> f64 {
177 let attrs = match &handle.data {
178 NodeData::Element { attrs, .. } => attrs.borrow(),
179 _ => return 0.0,
180 };
181
182 let mut score = 0.0;
183 for attr in attrs.iter() {
184 let name = attr.name.local.as_ref();
185 if name != "class" && name != "id" {
186 continue;
187 }
188 let val = attr.value.as_ref().to_lowercase();
189 for pattern in CONTENT_CLASS_PATTERNS {
190 if val.contains(pattern) {
191 score += 10.0;
192 }
193 }
194 for pattern in NOISE_CLASS_PATTERNS {
195 if val.contains(pattern) {
196 score -= 10.0;
197 }
198 }
199 }
200 score
201}
202
203fn is_noise(handle: &Handle) -> bool {
204 match &handle.data {
205 NodeData::Element { name, attrs, .. } => {
206 let tag = name.local.as_ref();
207 if NOISE_TAGS.contains(&tag) {
208 return true;
209 }
210 let attrs = attrs.borrow();
211 for attr in attrs.iter() {
212 let aname = attr.name.local.as_ref();
213 if aname != "class" && aname != "id" {
214 continue;
215 }
216 let val = attr.value.as_ref().to_lowercase();
217 for pattern in NOISE_CLASS_PATTERNS {
218 if val.contains(pattern) {
219 return true;
220 }
221 }
222 }
223 false
224 }
225 _ => false,
226 }
227}
228
229fn count_text(handle: &Handle) -> usize {
230 let mut total = 0;
231 count_text_inner(handle, &mut total);
232 total
233}
234
235fn count_text_inner(handle: &Handle, total: &mut usize) {
236 match &handle.data {
237 NodeData::Text { contents } => {
238 *total += contents.borrow().trim().len();
239 }
240 NodeData::Element { name, .. } => {
241 let tag = name.local.as_ref();
242 if tag == "script" || tag == "style" {
243 return;
244 }
245 for child in handle.children.borrow().iter() {
246 count_text_inner(child, total);
247 }
248 }
249 _ => {
250 for child in handle.children.borrow().iter() {
251 count_text_inner(child, total);
252 }
253 }
254 }
255}
256
257fn count_link_text(handle: &Handle) -> usize {
258 let mut total = 0;
259 count_link_text_inner(handle, &mut total, false);
260 total
261}
262
263fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
264 match &handle.data {
265 NodeData::Text { contents } if in_link => {
266 *total += contents.borrow().trim().len();
267 }
268 NodeData::Element { name, .. } => {
269 let tag = name.local.as_ref();
270 let is_link = tag == "a";
271 for child in handle.children.borrow().iter() {
272 count_link_text_inner(child, total, in_link || is_link);
273 }
274 }
275 _ => {}
276 }
277}
278
279fn serialize_node(
280 handle: &Handle,
281 html: &mut String,
282 text: &mut String,
283 links: &mut Vec<ExtractedLink>,
284 base_url: &Url,
285 preserve_links: bool,
286) {
287 if is_noise(handle) {
288 return;
289 }
290
291 match &handle.data {
292 NodeData::Text { contents } => {
293 let t = contents.borrow();
294 let trimmed = t.as_ref();
295 if !trimmed.trim().is_empty() {
296 html.push_str(&html_escape(trimmed));
297 text.push_str(trimmed);
298 }
299 }
300 NodeData::Element { name, attrs, .. } => {
301 let tag = name.local.as_ref();
302 let attrs_ref = attrs.borrow();
303
304 match tag {
305 "script" | "style" | "noscript" | "iframe" => return,
306 "a" if preserve_links => {
307 let href = attrs_ref.iter()
308 .find(|a| a.name.local.as_ref() == "href")
309 .map(|a| a.value.as_ref().to_owned());
310 let rel = attrs_ref.iter()
311 .find(|a| a.name.local.as_ref() == "rel")
312 .map(|a| a.value.as_ref().to_owned());
313
314 let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
315
316 html.push_str("<a");
317 if let Some(ref h) = href {
318 html.push_str(&format!(" href=\"{}\"", html_escape(h)));
319 }
320 html.push('>');
321
322 let mut link_text = String::new();
323 let mut link_html = String::new();
324 for child in handle.children.borrow().iter() {
325 serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
326 collect_text(child, &mut link_text);
327 }
328 html.push_str(&link_html);
329 html.push_str("</a>");
330
331 if let Some(href_url) = resolved {
332 let trimmed = link_text.trim().to_owned();
333 if !trimmed.is_empty() {
335 links.push(ExtractedLink {
336 text: trimmed,
337 href: href_url,
338 rel,
339 });
340 }
341 }
342 return;
343 }
344 _ => {
345 let is_block = matches!(tag, "p" | "div" | "section" | "article" |
347 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
348 "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
349 "table" | "tr" | "td" | "th" | "thead" | "tbody");
350
351 if is_block {
352 html.push('<');
353 html.push_str(tag);
354 html.push('>');
355 if tag == "br" || tag == "hr" {
356 } else {
358 for child in handle.children.borrow().iter() {
359 serialize_node(child, html, text, links, base_url, preserve_links);
360 }
361 html.push_str("</");
362 html.push_str(tag);
363 html.push('>');
364 }
365 } else {
366 for child in handle.children.borrow().iter() {
368 serialize_node(child, html, text, links, base_url, preserve_links);
369 }
370 }
371 return;
372 }
373 }
374 }
375 _ => {}
376 }
377}
378
379fn collect_text(handle: &Handle, out: &mut String) {
380 match &handle.data {
381 NodeData::Text { contents } => {
382 out.push_str(contents.borrow().as_ref());
383 }
384 _ => {
385 for child in handle.children.borrow().iter() {
386 collect_text(child, out);
387 }
388 }
389 }
390}
391
/// Escapes `&`, `<`, `>` and `"` so `s` can be embedded in HTML text or inside a
/// double-quoted attribute value.
///
/// The input is processed in a single pass, so an `&` that is already part of an
/// entity is escaped again (`&lt;` becomes `&amp;lt;`).
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
398
399fn extract_title(root: &Handle) -> Option<String> {
400 if let Some(title_node) = find_tag(root, "title") {
402 let mut text = String::new();
403 collect_text(&title_node, &mut text);
404 let trimmed = text.trim().to_owned();
405 if !trimmed.is_empty() {
406 return Some(trimmed);
407 }
408 }
409 if let Some(h1) = find_tag(root, "h1") {
410 let mut text = String::new();
411 collect_text(&h1, &mut text);
412 let trimmed = text.trim().to_owned();
413 if !trimmed.is_empty() {
414 return Some(trimmed);
415 }
416 }
417 None
418}
419
420fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
421 let mut meta = PageMetadata {
422 description: None,
423 og_title: None,
424 og_image: None,
425 canonical: None,
426 published_at: None,
427 };
428 collect_meta(root, &mut meta, base_url);
429 meta
430}
431
432fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
433 if let NodeData::Element { name, attrs, .. } = &handle.data {
434 let tag = name.local.as_ref();
435 let attrs_ref = attrs.borrow();
436
437 if tag == "meta" {
438 let name_attr = attrs_ref.iter()
439 .find(|a| a.name.local.as_ref() == "name")
440 .map(|a| a.value.as_ref().to_lowercase());
441 let property_attr = attrs_ref.iter()
442 .find(|a| a.name.local.as_ref() == "property")
443 .map(|a| a.value.as_ref().to_lowercase());
444 let content = attrs_ref.iter()
445 .find(|a| a.name.local.as_ref() == "content")
446 .map(|a| a.value.as_ref().to_owned());
447
448 match (name_attr.as_deref(), property_attr.as_deref(), content) {
449 (Some("description"), _, Some(c)) => meta.description = Some(c),
450 (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
451 (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
452 _ => {}
453 }
454 } else if tag == "link" {
455 let is_canonical = attrs_ref.iter()
456 .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
457 if is_canonical {
458 if let Some(href) = attrs_ref.iter()
459 .find(|a| a.name.local.as_ref() == "href")
460 .and_then(|a| base_url.join(a.value.as_ref()).ok())
461 {
462 meta.canonical = Some(href);
463 }
464 }
465 }
466 }
467
468 for child in handle.children.borrow().iter() {
469 collect_meta(child, meta, base_url);
470 }
471}