article_extractor/full_text_parser/readability/
mod.rs1mod state;
2
3#[cfg(test)]
4mod tests;
5
6use self::state::State;
7use super::error::FullTextParserError;
8use crate::{constants, util::Util};
9use libxml::tree::{Document, Node};
10use std::cmp::Ordering;
11
/// Namespace type for the readability-style article extraction routines.
///
/// The struct carries no data; everything is exposed as associated functions
/// (see [`Readability::extract`]).
pub struct Readability;
16
17impl Readability {
18 pub fn extract(html: &str, base_url: Option<url::Url>) -> Result<String, FullTextParserError> {
26 libxml::tree::node::set_node_rc_guard(10);
27 let empty_config = crate::full_text_parser::config::ConfigEntry::default();
28
29 let url =
30 base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
31 let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
32 let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
33
34 crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None);
35 let mut article = crate::article::Article {
36 title: None,
37 author: None,
38 url,
39 date: None,
40 thumbnail_url: None,
41 html: None,
42 };
43
44 let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
45 let mut root =
46 Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
47 article_document.set_root_element(&root);
48
49 crate::full_text_parser::metadata::extract(&xpath_ctx, None, None, &mut article);
50 super::Readability::extract_body(document, &mut root, article.title.as_deref())?;
51 crate::FullTextParser::post_process_document(&article_document)?;
52
53 let html = Util::serialize_node(&article_document, &root);
54 article.html.replace(html.clone());
55
56 Ok(html)
57 }
58
59 pub(crate) fn extract_body(
60 document: Document,
61 root: &mut Node,
62 title: Option<&str>,
63 ) -> Result<bool, FullTextParserError> {
64 let mut state = State::default();
65 let mut document = document;
66 let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
67 let document_cache = document
68 .dup()
69 .map_err(|()| FullTextParserError::Readability)?;
70
71 loop {
72 let mut elements_to_score = Vec::new();
73 let mut node: Option<Node> = document.clone().get_root_element();
74
75 while let Some(node_ref) = node.as_mut() {
76 let tag_name = node_ref.get_name().to_uppercase();
77
78 if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
79 node = Util::next_node(node_ref, true);
80 continue;
81 }
82
83 let match_string = Util::get_signature(node_ref);
84
85 if !Util::is_probably_visible(node_ref) {
86 log::debug!("removing hidden node {match_string}");
87 node = Util::remove_and_next(node_ref);
88 continue;
89 }
90
91 if Self::check_byline(node_ref, &match_string, &mut state) {
92 node = Util::remove_and_next(node_ref);
93 continue;
94 }
95
96 if state.should_remove_title_header
97 && Util::header_duplicates_title(node_ref, title)
98 {
99 state.should_remove_title_header = false;
100 node = Util::remove_and_next(node_ref);
101 continue;
102 }
103
104 if state.strip_unlikely {
106 if constants::UNLIELY_CANDIDATES.is_match(&match_string)
107 && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
108 && !Util::has_ancestor_tag(
109 node_ref,
110 "table",
111 None,
112 None::<fn(&Node) -> bool>,
113 )
114 && !Util::has_ancestor_tag(
115 node_ref,
116 "code",
117 None,
118 None::<fn(&Node) -> bool>,
119 )
120 && tag_name != "BODY"
121 && tag_name != "A"
122 {
123 node = Util::remove_and_next(node_ref);
124 continue;
125 }
126
127 if let Some(role) = node_ref.get_attribute("role") {
128 if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
129 node = Util::remove_and_next(node_ref);
130 continue;
131 }
132 }
133 }
134
135 if (tag_name == "DIV"
137 || tag_name == "SECTION"
138 || tag_name == "HEADER"
139 || tag_name == "H1"
140 || tag_name == "H2"
141 || tag_name == "H3"
142 || tag_name == "H4"
143 || tag_name == "H5"
144 || tag_name == "H6")
145 && Util::is_element_without_content(node_ref)
146 {
147 node = Util::remove_and_next(node_ref);
148 continue;
149 }
150
151 if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
152 elements_to_score.push(node_ref.clone());
153 }
154
155 if tag_name == "DIV" {
157 let mut p: Option<Node> = None;
159 for mut child in node_ref.get_child_nodes().into_iter() {
160 if child.is_null() {
161 continue;
162 }
163
164 if Util::is_phrasing_content(&child) {
165 if let Some(p) = p.as_mut() {
166 child.unlink();
167 p.add_child(&mut child).map_err(|error| {
168 log::error!("{error}");
169 FullTextParserError::Readability
170 })?;
171 } else if !Util::is_whitespace(&child) {
172 let mut new_node = Node::new("p", None, &document)
173 .map_err(|()| FullTextParserError::Readability)?;
174 let mut old_node = node_ref
175 .replace_child_node(new_node.clone(), child)
176 .map_err(|error| {
177 log::error!("{error}");
178 FullTextParserError::Readability
179 })?;
180
181 new_node.add_child(&mut old_node).map_err(|error| {
182 log::error!("{error}");
183 FullTextParserError::Readability
184 })?;
185 p.replace(new_node);
186 }
187 } else if p.is_some() {
188 if let Some(p) = p.as_mut() {
189 for mut r_node in p.get_child_nodes().into_iter().rev() {
190 if r_node.is_null() {
191 continue;
192 }
193
194 if Util::is_whitespace(&r_node) {
195 r_node.unlink();
196 continue;
197 }
198 break;
199 }
200 }
201 _ = p.take();
202 }
203 }
204
205 if Util::has_single_tag_inside_element(node_ref, "P")
210 && Util::get_link_density(node_ref) < 0.25
211 {
212 if let Some(new_node) = node_ref.get_first_element_child() {
213 if let Some(mut parent) = node_ref.get_parent() {
214 parent
215 .replace_child_node(new_node.clone(), node_ref.clone())
216 .map_err(|error| {
217 log::error!("{error}");
218 FullTextParserError::Readability
219 })?;
220 node = Util::next_node(&new_node, false);
221 elements_to_score.push(new_node.clone());
222 continue;
223 }
224 }
225 } else if !Util::has_child_block_element(node_ref)
226 && node_ref.set_name("P").is_ok()
227 {
228 elements_to_score.push(node_ref.clone());
229 }
230 }
231
232 node = Util::next_node(node_ref, false);
233 }
234
235 let mut candidates = Vec::new();
236 for element_to_score in elements_to_score.drain(..) {
240 if element_to_score.get_parent().is_none() {
241 continue;
242 }
243
244 let inner_text = Util::get_inner_text(&element_to_score, true);
245 let inner_text_len = inner_text.len();
246
247 if inner_text_len < 25 {
249 continue;
250 }
251
252 let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
254 if ancestors.is_empty() {
255 continue;
256 }
257
258 let mut content_score = 0.0;
259
260 content_score += 1.0;
262
263 content_score += inner_text.split(',').count() as f64;
265
266 content_score += f64::min(f64::floor(inner_text.len() as f64 / 100.0), 3.0);
268
269 for (level, mut ancestor) in ancestors.into_iter().enumerate() {
271 let tag_name = ancestor.get_name().to_uppercase();
272
273 if ancestor.get_parent().is_none() || tag_name == "HTML" {
274 continue;
275 }
276
277 if Self::get_content_score(&ancestor).is_none() {
278 Self::initialize_node(&mut ancestor, &state)?;
279 candidates.push(ancestor.clone());
280 }
281
282 let score_divider = if level == 0 {
287 1.0
288 } else if level == 1 {
289 2.0
290 } else {
291 level as f64 * 3.0
292 };
293
294 if let Some(score) = Self::get_content_score(&ancestor) {
295 let add_score = content_score / score_divider;
296 let new_score = score + add_score;
297 log::debug!(
298 "{}: {score} + {add_score} = {new_score}",
299 ancestor.get_name()
300 );
301 Self::set_content_score(&mut ancestor, new_score)?;
302 }
303 }
304 }
305
306 for candidate in candidates.iter_mut() {
311 if let Some(content_score) = Self::get_content_score(candidate) {
315 let candidate_score = content_score * (1.0 - Util::get_link_density(candidate));
316 Self::set_content_score(candidate, candidate_score)?;
317 }
318 }
319
320 candidates.sort_by(|a, b| {
321 if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
322 {
323 b.partial_cmp(&a).unwrap_or(Ordering::Equal)
324 } else {
325 Ordering::Equal
326 }
327 });
328
329 let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
330
331 for candidate in top_candidates.iter() {
332 log::debug!(
333 "candidate: {} {:?}",
334 candidate.get_name(),
335 candidate.get_attributes()
336 );
337 }
338 let mut needed_to_create_top_candidate = false;
339 let mut top_candidate = if let Some(t) = top_candidates.first() {
340 t.clone()
341 } else {
342 let Some(mut root) = document.get_root_element() else {
345 log::error!("document has no root element");
346 return Err(FullTextParserError::Xml);
347 };
348
349 if let Some(body) = root
350 .get_child_elements()
351 .into_iter()
352 .find(|n| n.get_name().to_uppercase() == "BODY")
353 {
354 root = body;
355 }
356
357 let mut new_top_candidate =
358 Node::new("DIV", None, &document).expect("can't create new node");
359
360 for mut child in root.get_child_elements().drain(..) {
361 if child.is_null() {
362 continue;
363 }
364
365 child.unlink();
366 new_top_candidate.add_child(&mut child).unwrap();
367 }
368
369 root.add_child(&mut new_top_candidate).unwrap();
370
371 Self::initialize_node(&mut new_top_candidate, &state)
372 .expect("init should not fail");
373 needed_to_create_top_candidate = true;
374 new_top_candidate
375 };
376
377 let mut alternative_candidate_ancestors = Vec::new();
380 if let Some(top_score) = Self::get_content_score(&top_candidate) {
383 for candidate in top_candidates.iter().skip(1) {
384 let score = Self::get_content_score(candidate).unwrap_or(0.0);
385 if score / top_score >= 0.75 {
386 alternative_candidate_ancestors
387 .push(Util::get_node_ancestors(candidate, None));
388 }
389 }
390 }
391
392 if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
393 let mut parent_of_top_candidate = top_candidate.get_parent();
394
395 while let Some(parent) = &parent_of_top_candidate {
396 if parent.get_name().to_uppercase() == "BODY" {
397 break;
398 }
399
400 let mut lists_containing_this_ancestor = 0;
401 let tmp = usize::min(
402 alternative_candidate_ancestors.len(),
403 constants::MINIMUM_TOPCANDIDATES,
404 );
405 for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
406 lists_containing_this_ancestor +=
407 ancestors.iter().filter(|n| n == &parent).count();
408 }
409
410 if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
411 top_candidate = parent.clone();
412 break;
413 }
414
415 parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
416 }
417 }
418
419 if Self::get_content_score(&top_candidate).is_none() {
420 Self::initialize_node(&mut top_candidate, &state)?;
421 }
422
423 let mut parent_of_top_candidate = top_candidate.get_parent();
433 let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
434
435 let score_threshold = last_score / 3.0;
437
438 while parent_of_top_candidate.is_some()
439 && !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
440 {
441 if parent_of_top_candidate
442 .as_ref()
443 .map(|n| Self::get_content_score(n).is_none())
444 .unwrap_or(false)
445 {
446 parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
447 continue;
448 }
449
450 let parent_score = parent_of_top_candidate
451 .as_ref()
452 .and_then(Self::get_content_score)
453 .unwrap_or(0.0);
454 if parent_score < score_threshold {
455 break;
456 }
457
458 if parent_score > last_score {
459 if let Some(parent) = parent_of_top_candidate {
461 top_candidate = parent;
462 }
463 break;
464 }
465
466 last_score = parent_of_top_candidate
467 .as_ref()
468 .and_then(Self::get_content_score)
469 .unwrap_or(0.0);
470 parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
471 }
472
473 parent_of_top_candidate = top_candidate.get_parent();
476
477 while !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
478 && parent_of_top_candidate
479 .as_ref()
480 .map(|n| n.get_child_elements().len() == 1)
481 .unwrap_or(false)
482 {
483 top_candidate = parent_of_top_candidate.ok_or(FullTextParserError::Readability)?;
484 parent_of_top_candidate = top_candidate.get_parent();
485 }
486
487 if Self::get_content_score(&top_candidate).is_none() {
488 Self::initialize_node(&mut top_candidate, &state)?;
489 }
490
491 let mut article_content =
495 Node::new("DIV", None, &document).map_err(|()| FullTextParserError::Readability)?;
496
497 let sibling_score_threshold = f64::max(
498 10.0,
499 Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2,
500 );
501 parent_of_top_candidate = top_candidate.get_parent();
503 let siblings = parent_of_top_candidate
504 .as_ref()
505 .map(|n| n.get_child_elements());
506
507 if let Some(mut siblings) = siblings {
508 for mut sibling in siblings.drain(..) {
509 if sibling.is_null() {
510 continue;
511 }
512
513 let mut append = false;
514
515 let score = Self::get_content_score(&sibling).unwrap_or(0.0);
516 log::debug!(
517 "Looking at sibling node: {} ({:?}) with score {score}",
518 sibling.get_name(),
519 sibling.get_attribute("class")
520 );
521
522 if top_candidate == sibling {
523 append = true;
524 } else {
525 let mut content_bonus = 0.0;
526
527 let sibling_classes = sibling.get_class_names();
529 let tc_classes = top_candidate.get_class_names();
530
531 if !tc_classes.is_empty()
532 && !sibling_classes.is_empty()
533 && sibling_classes
534 .iter()
535 .all(|class| tc_classes.contains(class))
536 {
537 content_bonus +=
538 Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
539 }
540
541 if score + content_bonus >= sibling_score_threshold {
542 append = true;
543 } else if sibling.get_name().to_uppercase() == "P" {
544 let link_density = Util::get_link_density(&sibling);
545 let node_content = Util::get_inner_text(&sibling, true);
546 let node_length = node_content.len();
547
548 if node_length > 80
549 && (link_density < 0.25
550 || (node_length > 0
551 && link_density == 0.0
552 && constants::SIBLING_CONTENT.is_match(&node_content)))
553 {
554 append = true;
555 }
556 }
557 }
558
559 if append {
560 log::debug!(
561 "Appending node: {} ({:?})",
562 sibling.get_name(),
563 sibling.get_attribute("class")
564 );
565
566 if !constants::ALTER_TO_DIV_EXCEPTIONS
567 .contains(sibling.get_name().to_uppercase().as_str())
568 {
569 log::debug!(
572 "Altering sibling: {} ({:?})",
573 sibling.get_name(),
574 sibling.get_attribute("class")
575 );
576
577 sibling.set_name("DIV").map_err(|error| {
578 log::error!("{error}");
579 FullTextParserError::Readability
580 })?;
581 }
582
583 sibling.unlink();
584 article_content.add_child(&mut sibling).map_err(|error| {
585 log::error!("{error}");
586 FullTextParserError::Readability
587 })?;
588 }
589 }
590 }
591
592 if state.clean_conditionally {
593 crate::FullTextParser::post_process_page(&mut article_content)?;
594 }
595
596 if needed_to_create_top_candidate {
597 top_candidate
602 .set_property("id", "readability-page-1")
603 .map_err(|error| {
604 log::error!("{error}");
605 FullTextParserError::Readability
606 })?;
607 } else {
608 let mut div = Node::new("DIV", None, &document)
609 .map_err(|()| FullTextParserError::Readability)?;
610 div.set_property("id", "readability-page-1")
611 .map_err(|error| {
612 log::error!("{error}");
613 FullTextParserError::Readability
614 })?;
615
616 for mut child in article_content.get_child_nodes() {
617 if child.is_null() {
618 continue;
619 }
620
621 child.unlink();
622 div.add_child(&mut child).map_err(|error| {
623 log::error!("{error}");
624 FullTextParserError::Readability
625 })?;
626 }
627 article_content.add_child(&mut div).map_err(|error| {
628 log::error!("{error}");
629 FullTextParserError::Readability
630 })?;
631 }
632
633 let mut parse_successful = true;
634
635 let text = Util::get_inner_text(&article_content, true);
641 let text_length = text.len();
642
643 if text_length < constants::DEFAULT_CHAR_THRESHOLD {
644 parse_successful = false;
645
646 if state.strip_unlikely {
647 state.strip_unlikely = false;
648 attempts.push((article_content, text_length, document));
649 } else if state.weigh_classes {
650 state.weigh_classes = false;
651 attempts.push((article_content, text_length, document));
652 } else if state.clean_conditionally {
653 state.clean_conditionally = false;
654 attempts.push((article_content, text_length, document));
655 } else {
656 attempts.push((article_content, text_length, document));
657 attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
660
661 if let Some((best_attempt, _len, _document)) = attempts.pop() {
663 for mut child in best_attempt.get_child_nodes() {
664 if child.is_null() {
665 continue;
666 }
667
668 child.unlink();
669 root.add_child(&mut child).map_err(|error| {
670 log::error!("{error}");
671 FullTextParserError::Readability
672 })?;
673 }
674 parse_successful = true;
675 }
676
677 return Ok(parse_successful);
678 }
679
680 document = document_cache
681 .dup()
682 .map_err(|()| FullTextParserError::Readability)?;
683 } else {
684 for mut child in article_content.get_child_nodes() {
685 if child.is_null() {
686 continue;
687 }
688
689 child.unlink();
690 root.add_child(&mut child).map_err(|error| {
691 log::error!("{error}");
692 FullTextParserError::Readability
693 })?;
694 }
695 return Ok(parse_successful);
696 }
697 }
698 }
699
700 fn get_content_score(node: &Node) -> Option<f64> {
701 node.get_attribute(constants::SCORE_ATTR)
702 .and_then(|a| a.parse::<f64>().ok())
703 }
704
705 fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
706 node.set_attribute(constants::SCORE_ATTR, &score.to_string())
707 .map_err(|err| {
708 log::error!("failed to set content score: {err}");
709 FullTextParserError::Readability
710 })
711 }
712
713 fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
714 if state.byline.is_some() {
715 return false;
716 }
717
718 let rel = node
719 .get_attribute("rel")
720 .map(|rel| rel == "author")
721 .unwrap_or(false);
722 let itemprop = node
723 .get_attribute("itemprop")
724 .map(|prop| prop.contains("author"))
725 .unwrap_or(false);
726
727 let content = node.get_content();
728 if rel
729 || itemprop
730 || constants::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content)
731 {
732 state.byline = Some(content.trim().into());
733 true
734 } else {
735 false
736 }
737 }
738
739 fn is_valid_byline(line: &str) -> bool {
742 let len = line.trim().len();
743 len > 0 && len < 100
744 }
745
746 fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
749 let score = match node.get_name().to_uppercase().as_str() {
750 "DIV" => 5,
751 "PRE" | "TD" | "BLOCKQUITE" => 3,
752 "ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => -3,
753 "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
754 _ => 0,
755 };
756 let class_weight = if state.weigh_classes {
757 Util::get_class_weight(node)
758 } else {
759 0
760 };
761 let score = score + class_weight;
762 log::debug!(
763 "initialize node {} {}: {score}",
764 node.get_name(),
765 node.get_attribute("class").unwrap_or_default()
766 );
767 Self::set_content_score(node, score as f64)?;
768 Ok(())
769 }
770}