1use crate::errors::ParserError;
2use std::collections::{BTreeMap, HashMap, HashSet};
3use std::str::FromStr;
4use html5ever::{LocalName, Namespace, QualName};
5use kuchiki::{
6 iter::{Descendants, Elements, Select},
7 traits::*,
8 NodeData, NodeRef,
9};
10use log::info;
11use url::Url;
12
13
// NOTE(review): not referenced in the visible part of this file; by its name this
// is the default minimum character count for accepted content — confirm at use site.
const DEFAULT_CHAR_THRESHOLD: usize = 500;

// Bit flags that are OR-ed together into `Readability::flags`.
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
/// When set, `get_class_weight` scores the `id`/`class` attributes of a node.
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;

/// Name of the attribute in which a node's score is stored as a string.
const READABILITY_SCORE: &str = "readability-score";
/// Namespace given to every element this file creates.
const HTML_NS: &str = "http://www.w3.org/1999/xhtml";

/// Tag names that `is_phrasing_content` always treats as phrasing content.
const PHRASING_ELEMS: [&str; 39] = [
    "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
    "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
    "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
    "sub", "sup", "textarea", "time", "var", "wbr",
];

// NOTE(review): the next three tables are not referenced in the visible part of the
// file; their names suggest scoring targets, tags exempt from conversion to <div>,
// and attributes stripped as presentational — confirm at the use sites.
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
    ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
    "align",
    "background",
    "bgcolor",
    "border",
    "cellpadding",
    "cellspacing",
    "frame",
    "hspace",
    "rules",
    "style",
    "valign",
    "vspace",
];

/// Descendant tags whose presence makes `mark_data_tables` flag a table as a data table.
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
// NOTE(review): not referenced in the visible part of the file — presumably the
// elements whose deprecated size attributes get removed; confirm.
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
50
51pub mod regexes;
52pub mod errors;
53
54pub struct Readability {
55 root_node: NodeRef,
56 byline: Option<String>,
57 article_title: String,
58 pub article_node: Option<NodeRef>,
59 article_dir: Option<String>,
60 flags: u32,
61 pub metadata: MetaData,
62}
63
/// Row and column totals of a table, as computed by `get_row_and_column_count`.
#[derive(Debug, PartialEq)]
struct SizeInfo {
    /// Sum of the row spans of the table's rows.
    rows: usize,
    /// Largest per-row sum of cell column spans.
    columns: usize,
}
69
70impl Readability {
71 pub fn new(html_str: &str) -> Self {
72 Self {
73 root_node: kuchiki::parse_html().one(html_str),
74 byline: None,
75 article_title: "".into(),
76 article_node: None,
77 article_dir: None,
78 flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
79 metadata: MetaData::new(),
80 }
81 }
82 pub fn parse(&mut self, url: &str) -> Result<(), errors::ParserError> {
83 self.unwrap_no_script_tags();
84 self.remove_scripts();
85 self.prep_document();
86 self.metadata = self.get_article_metadata();
87 self.article_title = self.metadata.title.clone();
88 self.grab_article()?;
89 self.post_process_content(url);
90 Ok(())
91 }
92
93 fn is_single_image(node_ref: &NodeRef) -> bool {
96 if let Some(element) = node_ref.as_element() {
97 if &element.name.local == "img" {
98 return true;
99 }
100 }
101
102 if node_ref.children().filter(Self::has_content).count() != 1
103 || !node_ref.text_contents().trim().is_empty()
104 {
105 return false;
106 }
107
108 return Readability::is_single_image(
109 &node_ref
110 .children()
111 .filter(Self::has_content)
112 .next()
113 .expect("Unable to get first child which should exist"),
114 );
115 }
116
117 fn has_content(node_ref: &NodeRef) -> bool {
118 match node_ref.data() {
119 NodeData::Text(text) => !text.borrow().trim().is_empty(),
120 _ => true,
121 }
122 }
123
124 fn unwrap_no_script_tags(&mut self) {
128 if let Ok(imgs) = self.root_node.select("img") {
129 let mut nodes = imgs.filter(|img_node_ref| {
130 let img_attrs = img_node_ref.attributes.borrow();
131 !img_attrs.map.iter().any(|(name, attr)| {
132 &name.local == "src"
133 || &name.local == "srcset"
134 || &name.local == "data-src"
135 || &name.local == "data-srcset"
136 || regexes::is_match_img_ext(&attr.value)
137 })
138 });
139 let mut node_ref = nodes.next();
140 while let Some(img_ref) = node_ref {
141 node_ref = nodes.next();
142 img_ref.as_node().detach();
143 }
144 }
145
146 if let Ok(noscripts) = self.root_node.select("noscript") {
147 for noscript in noscripts {
148 let inner_node_ref = kuchiki::parse_fragment(
149 QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
150 Vec::new(),
151 )
152 .one(noscript.text_contents());
153 if !Self::is_single_image(&inner_node_ref) {
154 continue;
155 }
156 if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
157 while prev_elem.as_element().is_none() {
159 match prev_elem.previous_sibling() {
160 Some(new_prev) => prev_elem = new_prev,
161 None => break,
162 };
163 }
164
165 if Self::is_single_image(&prev_elem) && prev_elem.as_element().is_some() {
166 let prev_img = if &prev_elem.as_element().unwrap().name.local != "img" {
167 prev_elem.select_first("img").unwrap().as_node().clone()
168 } else {
169 prev_elem.clone()
170 };
171 let new_img = inner_node_ref.select_first("img").unwrap();
172 let prev_attrs = prev_img.as_element().unwrap().attributes.borrow();
173 let prev_attrs = prev_attrs.map.iter().filter(|(attr, val)| {
174 !val.value.trim().is_empty()
175 && (&attr.local == "src"
176 || &attr.local == "srcset"
177 || regexes::is_match_img_ext(&val.value))
178 });
179 for (prev_attr, prev_value) in prev_attrs {
180 match new_img.attributes.borrow().get(&prev_attr.local) {
181 Some(value) => {
182 if value == prev_value.value {
183 continue;
184 }
185 }
186 None => (),
187 }
188
189 let attr_name: &str = &prev_attr.local;
190 let mut attr_name = attr_name.to_owned();
191 if new_img.attributes.borrow().contains(attr_name.clone()) {
192 let new_name = format!("data-old-{}", &attr_name);
193 attr_name = new_name;
194 }
195 new_img
196 .attributes
197 .borrow_mut()
198 .insert(attr_name, prev_value.value.clone());
199 }
200 prev_elem.insert_after(new_img.as_node().clone());
201 prev_elem.detach();
202 }
203 }
204 }
205 }
206 }
207
208 fn remove_scripts(&mut self) {
210 match self.root_node.select("script") {
211 Ok(mut script_elems) => {
212 let mut next_script = script_elems.next();
213 while let Some(next_script_ref) = next_script {
214 next_script = script_elems.next();
215 next_script_ref.as_node().detach();
216 }
217 }
218 Err(_) => (),
219 }
220 match self.root_node.select("noscript") {
221 Ok(mut noscript_elems) => {
222 let mut next_noscript = noscript_elems.next();
223 while let Some(noscript_ref) = next_noscript {
224 next_noscript = noscript_elems.next();
225 noscript_ref.as_node().detach();
226 }
227 }
228 Err(_) => (),
229 }
230 }
231
232 fn prep_document(&mut self) {
235 match self.root_node.select("style") {
236 Ok(mut style_elems) => {
237 let mut style_elem = style_elems.next();
238 while let Some(style_ref) = style_elem {
239 style_elem = style_elems.next();
240 style_ref.as_node().detach();
241 }
242 }
243 Err(_) => (),
244 }
245 self.replace_brs();
246 match self.root_node.select("font") {
247 Ok(nodes_iter) => Self::replace_node_tags(nodes_iter, "span"),
248 Err(_) => (),
249 }
250 }
251
252 fn replace_brs(&mut self) {
258 if let Ok(mut br_tags) = self.root_node.select("br") {
259 while let Some(br_tag) = br_tags.next() {
261 let mut next = Self::next_element(br_tag.as_node().next_sibling(), false);
262 let mut replaced = false;
263 while let Some(next_elem) = next {
264 if next_elem.as_element().is_some()
265 && &next_elem.as_element().as_ref().unwrap().name.local == "br"
266 {
267 replaced = true;
268 let br_sibling = next_elem.next_sibling();
269 next = Self::next_element(br_sibling, false);
270 next_elem.detach();
271 } else {
272 break;
273 }
274 }
275 if replaced {
276 let p = NodeRef::new_element(
277 QualName { prefix: None, ns:Namespace::from(HTML_NS), local:LocalName::from("p")},
278 BTreeMap::new(),
279 );
280 br_tag.as_node().insert_before(p);
281 let p = br_tag.as_node().previous_sibling().unwrap();
282 br_tag.as_node().detach();
283
284 next = p.next_sibling();
285 while next.is_some() {
286 let next_sibling = next.unwrap();
287 if let Some(next_elem) = next_sibling.as_element() {
288 if &next_elem.name.local == "br" {
289 if let Some(second_sibling) = next_sibling.next_sibling() {
290 if second_sibling.as_element().is_some()
291 && "br" == &second_sibling.as_element().unwrap().name.local
292 {
293 break;
294 }
295 }
296 }
297 }
298
299 if !Self::is_phrasing_content(&next_sibling) {
300 break;
301 }
302
303 let sibling = next_sibling.next_sibling();
304 p.append(next_sibling);
305 next = sibling;
306 }
307
308 while let Some(first_child) = p.first_child() {
309 if Self::is_whitespace(&first_child) {
310 first_child.detach();
311 } else {
312 break;
313 }
314 }
315
316 while let Some(last_child) = p.last_child() {
317 if Self::is_whitespace(&last_child) {
318 last_child.detach();
319 } else {
320 break;
321 }
322 }
323
324 if let Some(parent) = p.parent() {
325 if &parent.as_element().as_ref().unwrap().name.local == "p" {
326 Self::set_node_tag(&parent, "div");
327 }
328 }
329 }
330 }
331 }
332 }
333
334 fn replace_node_tags(nodes: Select<Elements<Descendants>>, name: &str) {
336 for node in nodes {
337 Self::set_node_tag(node.as_node(), name);
338 }
339 }
340
341 fn set_node_tag(node_ref: &NodeRef, name: &str) -> NodeRef {
344 match node_ref.as_element() {
345 Some(elem) => {
346 let attributes = elem.attributes.borrow().clone().map.into_iter();
347 let replacement = NodeRef::new_element(
348 QualName::new(None, Namespace::from(HTML_NS), LocalName::from(name)),
349 attributes,
350 );
351 for child in node_ref.children() {
352 replacement.append(child);
353 }
354 node_ref.insert_before(replacement);
355 let new_node = node_ref.previous_sibling().unwrap();
356 node_ref.detach();
357 return new_node;
358 }
359 None => (),
360 }
361 node_ref.clone()
362 }
363
364 fn is_whitespace(node_ref: &NodeRef) -> bool {
365 match node_ref.data() {
366 NodeData::Element(elem_data) => &elem_data.name.local == "br",
367 NodeData::Text(text_ref) => text_ref.borrow().trim().len() == 0,
368 _ => false,
369 }
370 }
371
372 fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
378 let mut node_ref = node_ref;
380 while node_ref.is_some() {
381 match node_ref.as_ref().unwrap().data() {
382 NodeData::Element(_) => break,
383 _ => {
384 if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
385 node_ref = node_ref.as_ref().unwrap().next_sibling();
386 } else if must_be_element
387 && !node_ref.as_ref().unwrap().text_contents().trim().is_empty()
388 {
389 node_ref = node_ref.as_ref().unwrap().next_sibling();
390 } else {
391 break;
392 }
393 }
394 }
395 }
396 node_ref
397 }
398
399 fn is_phrasing_content(node_ref: &NodeRef) -> bool {
402 node_ref.as_text().is_some()
403 || match node_ref.as_element() {
404 Some(elem) => {
405 let name: &str = &elem.name.local;
406 PHRASING_ELEMS.contains(&name)
407 || ((name == "a" || name == "del" || name == "ins")
408 && node_ref
409 .children()
410 .all(|child_ref| Self::is_phrasing_content(&child_ref)))
411 }
412 None => false,
413 }
414 }
415
416 fn get_article_metadata(&self) -> MetaData {
418 let mut values: HashMap<String, String> = HashMap::new();
419 let mut meta_data = MetaData::new();
420 if let Ok(meta_elems) = self.root_node.select("meta") {
421 meta_elems
422 .filter(|node_ref| {
423 let node_attr = node_ref.attributes.borrow();
424 node_attr.get("content").is_some()
425 })
426 .for_each(|node_ref| {
427 let node_attr = node_ref.attributes.borrow();
428 let content = node_attr.get("content").unwrap();
429 let name_attr = node_attr.get("name");
430 let mut matches = None;
431 if let Some(property) = node_attr.get("property") {
432 matches = regexes::PROPERTY_REGEX.captures(property);
433 if let Some(captures) = &matches {
434 for capture in captures.iter() {
435 let mut name = capture.unwrap().as_str().to_lowercase();
436 name = regexes::REPLACE_WHITESPACE_REGEX
437 .replace_all(&name, "")
438 .to_string();
439 values.insert(name, content.trim().to_string());
440 }
441 }
442 }
443 if matches.is_none() && name_attr.is_some() {
444 let name_val = name_attr.unwrap();
445 if regexes::is_match_name_pattern(name_val) {
446 let name = name_val.to_lowercase();
447 let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, "");
448 let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":");
449 values.insert(name.to_string(), content.trim().to_string());
450 }
451 }
452 });
453 }
454
455 let meta_title_keys = [
456 "dc:title",
457 "dcterm:title",
458 "og:title",
459 "weibo:article:title",
460 "weibo:webpage:title",
461 "title",
462 "twitter:title",
463 ];
464 meta_data.title = if let Some(key) = meta_title_keys
465 .iter()
466 .find(|key| values.contains_key(**key))
467 {
468 let title = values.get(*key).map(|title| title.to_owned()).unwrap();
469 if title.is_empty() {
470 self.get_article_title()
471 } else {
472 title
473 }
474 } else {
475 self.get_article_title()
476 };
477
478 let meta_byline_keys = ["dc:creator", "dcterm:creator", "author"];
479 meta_data.byline = {
480 let possible_key = meta_byline_keys
481 .iter()
482 .find(|key| values.contains_key(**key));
483 if let Some(actual_key) = possible_key {
484 values.get(*actual_key).map(|byline| byline.to_owned())
485 } else {
486 None
487 }
488 };
489
490 let meta_excerpt_keys = [
491 "dc:description",
492 "dcterm:description",
493 "og:description",
494 "weibo:article:description",
495 "weibo:webpage:description",
496 "description",
497 "twitter:description",
498 ];
499 meta_data.excerpt = {
500 let possible_key = meta_excerpt_keys
501 .iter()
502 .find(|key| values.contains_key(**key));
503 if let Some(actual_key) = possible_key {
504 values.get(*actual_key).map(|excerpt| excerpt.to_owned())
505 } else {
506 None
507 }
508 };
509
510 meta_data.site_name = values
511 .get("og:site_name")
512 .map(|site_name| site_name.to_owned());
513
514 Self::unescape_html_entities(&mut meta_data.title);
515 if meta_data.byline.is_some() {
516 Self::unescape_html_entities(&mut meta_data.byline.as_mut().unwrap());
517 }
518
519 if meta_data.excerpt.is_some() {
520 Self::unescape_html_entities(&mut meta_data.excerpt.as_mut().unwrap());
521 }
522
523 if meta_data.site_name.is_some() {
524 Self::unescape_html_entities(&mut meta_data.site_name.as_mut().unwrap());
525 }
526
527 meta_data
528 }
529
530 fn unescape_html_entities(value: &mut String) {
532 if !value.is_empty() {
533 let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
535 html_escape_map.insert("lt", "<");
536 html_escape_map.insert("gt", ">");
537 html_escape_map.insert("amp", "&");
538 html_escape_map.insert("quot", "\"");
539 html_escape_map.insert("apos", "'");
540 let mut new_value = regexes::REPLACE_HTML_ESCAPE_REGEX
541 .replace_all(&value, |captures: ®ex::Captures| {
542 html_escape_map[&captures[1]].to_string()
543 })
544 .to_string();
545 new_value = regexes::REPLACE_HEX_REGEX
546 .replace_all(&new_value, |captures: ®ex::Captures| {
547 let num = if let Some(hex_capture) = captures.get(1) {
548 u16::from_str_radix(hex_capture.as_str(), 16)
549 } else if let Some(dec_capture) = captures.get(2) {
550 u16::from_str(dec_capture.as_str())
551 } else {
552 unreachable!("Unable to match any of the captures");
553 };
554 String::from_utf16_lossy(&[num.unwrap()])
555 })
556 .to_string();
557 *value = new_value;
558 }
559 }
560
561 fn get_article_title(&self) -> String {
563 let mut cur_title = self
564 .root_node
565 .select_first("title")
566 .map(|title| title.text_contents().trim().to_string())
567 .unwrap_or("".to_string());
568 let orig_title = cur_title.clone();
569 let mut title_had_hierarchical_separators = false;
570 let word_count = |s: &str| -> usize { s.split_whitespace().count() };
571 if regexes::is_match_title_separator(&cur_title) {
572 title_had_hierarchical_separators = regexes::is_match_has_title_separator(&cur_title);
573 cur_title = regexes::REPLACE_START_SEPARATOR_REGEX
574 .replace_all(&orig_title, "$start")
575 .to_string();
576 if word_count(&cur_title) < 3 {
577 cur_title = regexes::REPLACE_END_SEPARATOR_REGEX
578 .replace_all(&orig_title, "$end")
579 .to_string();
580 }
581 } else if cur_title.contains(": ") {
582 let trimmed_title = cur_title.trim();
583 let is_match_heading = self
584 .root_node
585 .select("h1, h2")
586 .unwrap()
587 .any(|heading| heading.text_contents().trim() == trimmed_title);
588 if !is_match_heading {
589 let mut idx = orig_title.rfind(":").unwrap() + 1;
590 let mut new_title = &orig_title[idx..];
591 if word_count(new_title) < 3 {
592 idx = orig_title.find(":").unwrap() + 1;
593 new_title = &orig_title[idx..];
594 } else if word_count(&orig_title[0..orig_title.find(":").unwrap()]) > 5 {
595 new_title = &orig_title;
596 }
597 cur_title = new_title.to_string();
598 }
599 } else if cur_title.len() > 150 || cur_title.len() < 15 {
600 let mut h1_nodes = self.root_node.select("h1").unwrap();
601 let h1_count = self.root_node.select("h1").unwrap().count();
602 if h1_count == 1 {
603 cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None);
604 }
605 }
606 cur_title = regexes::NORMALIZE_REGEX
607 .replace_all(cur_title.trim(), " ")
608 .to_string();
609 let cur_word_count = word_count(&cur_title);
610
611 if cur_word_count <= 4
612 && (!title_had_hierarchical_separators
613 || cur_word_count
614 != word_count(
615 ®exes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""),
616 ) - 1)
617 {
618 cur_title = orig_title;
619 }
620 cur_title
621 }
622
623 fn clean_classes(&mut self) {
626 let classes_to_preserve: HashSet<&str> = HashSet::new();
628 if let Some(article_node) = &mut self.article_node {
629 for elem in article_node.inclusive_descendants().elements() {
630 let mut elem_attrs = elem.attributes.borrow_mut();
631 if let Some(class_list) = elem_attrs.get_mut("class") {
632 let filtered_class: String = class_list
633 .split_whitespace()
634 .filter(|class| classes_to_preserve.contains(class))
635 .fold("".to_string(), |acc, x| acc + " " + x);
636 if filtered_class.is_empty() {
637 elem_attrs.remove("class");
638 } else {
639 *class_list = filtered_class;
640 }
641 }
642 }
643 }
644 }
645
646 fn fix_relative_uris(&mut self, document_uri: &str) {
648 if let Some(article_node) = &mut self.article_node {
649 let document_uri =
650 Url::parse(document_uri).expect("Unable to parse the document's URI");
651 let base_uri = self
652 .root_node
653 .select("base")
654 .unwrap()
655 .filter(|node_ref| {
656 let node_attrs = node_ref.attributes.borrow();
657 node_attrs.contains("href")
658 })
659 .map(|node_ref| {
660 let node_attrs = node_ref.attributes.borrow();
661 let href = node_attrs.get("href").unwrap();
662
663 match Url::parse(href) {
664 Ok(url) => url,
665 Err(e) => match e {
666 url::ParseError::RelativeUrlWithoutBase => {
667 match document_uri.join(href) {
668 Ok(joined_url) => joined_url,
669 Err(e) => panic!(
670 "{:} unable to parse url {:?} on element {}",
671 e, href, &node_ref.name.local
672 ),
673 }
674 }
675 e => panic!(
676 "{:} unable to parse url {:?} on element {}",
677 e, href, &node_ref.name.local
678 ),
679 },
680 }
681 })
682 .next()
683 .unwrap_or(document_uri.clone());
684 let to_absolute_uri = |uri_str: &str| -> String {
685 if base_uri == document_uri && uri_str.starts_with("#") {
686 return uri_str.to_string();
687 }
688
689 if let Ok(new_uri) = Url::parse(uri_str) {
690 if new_uri.has_host() {
691 return new_uri.to_string();
692 }
693 } else if let Ok(joined_uri) = base_uri.join(uri_str) {
694 return joined_uri.to_string();
695 }
696
697 uri_str.to_string()
698 };
699 let mut links = article_node.select("a").unwrap().filter(|a_ref| {
700 let link_attrs = a_ref.attributes.borrow();
701 link_attrs.contains("href")
702 });
703 let mut link = links.next();
704 while let Some(link_ref) = link {
705 link = links.next();
706 let mut link_attrs = link_ref.attributes.borrow_mut();
707 let href = link_attrs.get("href").map(|val| val.to_string()).unwrap();
708 if href.starts_with("javascript:") {
709 let link_node = link_ref.as_node();
710 if link_node.children().count() == 1
711 && link_node
712 .first_child()
713 .map(|node_ref| node_ref.as_text().is_some())
714 .unwrap()
715 {
716 let text_node = NodeRef::new_text(link_node.text_contents());
717 link_node.insert_before(text_node);
718 link_node.detach();
719 } else {
720 let container = NodeRef::new_element(
721 QualName::new(None, Namespace::from(HTML_NS), LocalName::from("span")),
722 BTreeMap::new(),
723 );
724 let mut children = link_node.children();
725 let mut child = children.next();
726 while let Some(child_ref) = child {
727 child = children.next();
728 container.append(child_ref);
729 }
730 link_node.insert_before(container);
731 link_node.detach();
732 }
733 } else {
734 link_attrs.insert("href", to_absolute_uri(&href));
735 }
736 }
737 let media_nodes = article_node
738 .select("img, picture, figure, video, audio, source")
739 .unwrap();
740 for media_node in media_nodes {
741 let mut media_attrs = media_node.attributes.borrow_mut();
742 if let Some(src) = media_attrs.get_mut("src") {
743 *src = to_absolute_uri(&src);
744 }
745
746 if let Some(poster) = media_attrs.get_mut("poster") {
747 *poster = to_absolute_uri(&poster);
748 }
749
750 if let Some(srcset) = media_attrs.get_mut("srcset") {
751 let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
752 &srcset,
753 |captures: ®ex::Captures| {
754 to_absolute_uri(&captures[1])
755 + &captures.get(2).map(|cap| cap.as_str()).unwrap_or("")
756 + &captures[3]
757 },
758 );
759 *srcset = new_srcset.to_string();
760 }
761 }
762 }
763 }
764
765 fn clean_readability_attrs(&mut self) {
767 if let Some(article_node) = &mut self.article_node {
768 for node in article_node.inclusive_descendants().elements() {
769 let mut node_attrs = node.attributes.borrow_mut();
770 node_attrs.remove(READABILITY_SCORE);
771 node_attrs.remove("readability-data-table");
772 }
773 }
774 }
775
776 fn post_process_content(&mut self, url: &str) {
778 self.fix_relative_uris(url);
779 self.clean_classes();
781 self.clean_readability_attrs();
782 }
783
/// Parses the value of an inline `style` attribute into a property → value map.
///
/// Declarations are separated by `;` and split at the first `:`. Property
/// names and values are trimmed. Single- or double-quoted sections of a value
/// are copied verbatim (quotes included), so `;` and `:` inside quotes do not
/// split. A later declaration of the same property overwrites an earlier one.
/// Text without a `:` is ignored, as is a final value that is completely empty
/// or whose quote is never closed.
fn inline_css_str_to_map(css_str: &str) -> HashMap<String, String> {
    enum State {
        ReadProp,
        ReadVal,
        ReadQuot,
        ReadDquot,
    }

    let mut declarations: HashMap<String, String> = HashMap::new();
    // Name of the declaration whose value is currently being read.
    let mut property: Option<String> = None;
    let mut token = String::new();
    let mut state = State::ReadProp;

    for c in css_str.chars() {
        match state {
            State::ReadProp => {
                if c == ':' {
                    property = Some(token.trim().to_string());
                    token.clear();
                    state = State::ReadVal;
                } else {
                    token.push(c);
                }
            }
            State::ReadVal => match c {
                '\'' => {
                    token.push(c);
                    state = State::ReadQuot;
                }
                '"' => {
                    token.push(c);
                    state = State::ReadDquot;
                }
                ';' => {
                    if let Some(name) = property.take() {
                        declarations.insert(name, token.trim().to_string());
                    }
                    token.clear();
                    state = State::ReadProp;
                }
                _ => token.push(c),
            },
            State::ReadQuot => {
                token.push(c);
                if c == '\'' {
                    state = State::ReadVal;
                }
            }
            State::ReadDquot => {
                token.push(c);
                if c == '"' {
                    state = State::ReadVal;
                }
            }
        }
    }

    // A last declaration without a terminating ';' still counts.
    if let State::ReadVal = state {
        if !token.is_empty() {
            if let Some(name) = property.take() {
                declarations.insert(name, token.trim().to_string());
            }
        }
    }

    declarations
}
855
856 fn is_probably_visible(node_ref: &NodeRef) -> bool {
857 if let Some(elem_data) = node_ref.as_element() {
858 let attributes = elem_data.attributes.borrow();
859 (if let Some(css_str) = attributes.get("style"){
860 let style_map = Self::inline_css_str_to_map(css_str);
861 if let Some(display_val) = style_map.get("display") {
862 display_val != &"none"
863 } else {
864 true
865 }
866 } else {
867 true
868 })
869 && !attributes.contains("hidden")
870 &&
872 (!attributes.contains("aria-hidden") ||
873 attributes.get("aria-hidden").map(|val| val != "true").unwrap_or(true) ||
874 attributes.get("class").map(|class_list| class_list.split(" ").collect::<Vec<&str>>().contains(&"fallback-image")).unwrap_or(false))
875 } else {
876 true
878 }
879 }
880
/// A byline is valid when, after trimming, it is non-empty and shorter than
/// 100 characters.
///
/// The length is counted in characters rather than UTF-8 bytes so that
/// non-ASCII names are not rejected early.
fn is_valid_byline(input: &str) -> bool {
    // Counting at most 100 characters is enough to decide.
    let char_count = input.trim().chars().take(100).count();
    char_count > 0 && char_count < 100
}
886
887 fn check_byline(&mut self, node_ref: &NodeRef, match_string: &str) -> bool {
888 if self.byline.is_none() {
889 if let Some(elem_data) = node_ref.as_element() {
890 let elem_attrs = elem_data.attributes.borrow();
891 let rel_attr = elem_attrs.get("rel");
892 let itemprop_attr = elem_attrs.get("itemprop");
893 let is_byline = (if rel_attr.is_some() {
894 rel_attr.unwrap() == "author"
895 } else if itemprop_attr.is_some() {
896 itemprop_attr.unwrap().contains("author")
897 } else {
898 regexes::is_match_byline(match_string)
899 }) && Self::is_valid_byline(&node_ref.text_contents());
900 if is_byline {
901 self.byline = Some(node_ref.text_contents().trim().to_owned());
902 }
903 is_byline
904 } else {
905 false
906 }
907 } else {
908 false
909 }
910 }
911
912 fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> {
918 let has_elem_children = node_ref.children().elements().count();
920 if !ignore_self_and_kids && has_elem_children > 0 {
921 Self::next_element(node_ref.first_child(), true)
922 } else if let Some(next_sibling) = Self::next_element(node_ref.next_sibling(), true) {
923 Some(next_sibling)
924 } else {
925 let mut node = node_ref.parent();
927 while let Some(parent) = node {
928 if let Some(next_sibling) = Self::next_element(parent.next_sibling(), true) {
929 return Some(next_sibling);
930 } else {
931 node = parent.parent();
932 }
933 }
934 None
935 }
936 }
937
938 fn remove_and_get_next(node_ref: NodeRef) -> Option<NodeRef> {
940 let next_node = Self::get_next_node(&node_ref, true);
941 node_ref.detach();
942 next_node
943 }
944
945 fn has_ancestor_tag(
948 node_ref: &NodeRef,
949 tag_name: &str,
950 max_depth: Option<i32>,
951 filter_fn: Option<fn(&NodeRef) -> bool>,
952 ) -> bool {
953 let mut depth = 0;
954 let max_depth = max_depth.or(Some(3)).unwrap();
955 let mut parent = node_ref.parent();
956 while parent.is_some() {
957 let parent_node = parent.as_ref().unwrap();
958 if parent_node.as_element().is_none() {
959 return false;
961 }
962 let parent_node_elem = parent_node.as_element().unwrap();
963 if max_depth > 0 && depth > max_depth {
964 return false;
965 }
966 if &parent_node_elem.name.local == tag_name
967 && (filter_fn.is_none() || filter_fn.unwrap()(parent_node))
968 {
969 return true;
970 }
971 parent = parent_node.parent();
972 depth += 1;
973 }
974 false
975 }
976
977 fn is_element_without_content(node_ref: &NodeRef) -> bool {
978 let child_count = node_ref.children().count();
979 node_ref.as_element().is_some()
980 && node_ref.text_contents().trim().is_empty()
981 && (child_count == 0
982 || child_count
983 == node_ref.select("br").unwrap().count()
984 + node_ref.select("hr").unwrap().count())
985 }
986
987 fn has_single_tag_inside_element(node_ref: &NodeRef, tag_name: &str) -> bool {
991 let first_child = node_ref.children().elements().next();
992 if node_ref.children().elements().count() != 1
993 || (first_child.is_some() && &first_child.unwrap().name.local != tag_name)
994 {
995 return false;
996 }
997 !node_ref.children().any(|node| {
998 node.as_text().is_some()
999 && regexes::is_match_has_content(&node.text_contents().trim_end())
1000 })
1001 }
1002
1003 fn get_inner_text(node_ref: &NodeRef, normalize_spaces: Option<bool>) -> String {
1004 let will_normalize = normalize_spaces.unwrap_or(true);
1005 let text = node_ref.text_contents();
1006 let text = text.trim();
1007 if will_normalize {
1008 return regexes::NORMALIZE_REGEX.replace_all(&text, " ").to_string();
1009 }
1010 text.to_owned()
1011 }
1012
1013 fn get_link_density(node_ref: &NodeRef) -> f32 {
1016 let text_length = Self::get_inner_text(node_ref, None).len() as f32;
1017 if text_length == 0_f32 {
1018 return 0_f32;
1019 }
1020 node_ref
1021 .select("a")
1022 .unwrap()
1023 .map(|a_node| Self::get_inner_text(a_node.as_node(), None).len() as f32)
1024 .sum::<f32>()
1025 / text_length
1026 }
1027
1028 fn has_child_block_element(node_ref: &NodeRef) -> bool {
1030 let block_level_elems: [&str; 32] = [
1032 "address",
1033 "article",
1034 "aside",
1035 "blockquote",
1036 "details",
1037 "dialog",
1038 "dd",
1039 "div",
1040 "dl",
1041 "dt",
1042 "fieldset",
1043 "figcaption",
1044 "footer",
1045 "form",
1046 "h1",
1047 "h2",
1048 "h3",
1049 "h4",
1050 "h5",
1051 "h6",
1052 "header",
1053 "hgroup",
1054 "hr",
1055 "li",
1056 "main",
1057 "nav",
1058 "ol",
1059 "p",
1060 "pre",
1061 "section",
1062 "table",
1063 "ul",
1064 ];
1065 node_ref.children().any(|child_node| {
1066 if child_node.as_element().is_some() {
1067 let child_elem = child_node.as_element().unwrap();
1068 block_level_elems.contains(&&*child_elem.name.local)
1069 || Self::has_child_block_element(&child_node)
1070 } else {
1071 false
1072 }
1073 })
1074 }
1075
1076 fn get_node_ancestors(node_ref: &NodeRef, max_depth: Option<usize>) -> Vec<NodeRef> {
1078 node_ref.ancestors().take(max_depth.unwrap_or(1)).collect()
1079 }
1080
1081 fn get_class_weight(&self, node_ref: &NodeRef) -> i32 {
1084 if !self.flag_is_active(FLAG_WEIGHT_CLASSES) {
1085 return 0;
1086 }
1087 let mut weight = 0;
1088 let node_elem = node_ref.as_element().unwrap();
1089 let node_attrs = node_elem.attributes.borrow();
1090 if let Some(id) = node_attrs.get("id") {
1091 if !id.trim().is_empty() {
1092 weight = if regexes::is_match_positive(id) {
1093 weight + 25
1094 } else if regexes::is_match_negative(id) {
1095 weight - 25
1096 } else {
1097 weight
1098 }
1099 }
1100 }
1101 if let Some(class) = node_attrs.get("class") {
1102 if !class.trim().is_empty() {
1103 weight = if regexes::is_match_positive(class) {
1104 weight + 25
1105 } else if regexes::is_match_negative(class) {
1106 weight - 25
1107 } else {
1108 weight
1109 }
1110 }
1111 }
1112 weight
1113 }
1114
1115 fn initialize_node(&self, node_ref: &mut NodeRef) {
1118 if let Some(element) = node_ref.as_element() {
1119 let mut score = 0.0;
1120 score += self.get_class_weight(node_ref) as f32;
1123 let mut elem_attrs = element.attributes.borrow_mut();
1124 elem_attrs.insert(READABILITY_SCORE, score.to_string());
1125 let readability = elem_attrs.get_mut(READABILITY_SCORE);
1126 match &*element.name.local {
1127 "div" => score += 5.0,
1128 "pre" | "td" | "blockquote" => score += 3.0,
1129 "address" | "ol" | "ul" | "dl" | "dd" | "dt" | "li" | "form" => score -= 3.0,
1130 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "th" => score -= 5.0,
1131 _ => (),
1132 }
1133 if let Some(x) = readability {
1134 *x = score.to_string();
1135 }
1136 }
1137 }
1138
1139 fn get_row_and_column_count(node_ref: &NodeRef) -> SizeInfo {
1140 let mut rows = 0;
1141 let mut columns = 0;
1142 if let Ok(trs) = node_ref.select("tr") {
1143 for tr in trs {
1144 let tr_node = tr.as_node();
1145 let tr_attr = tr.attributes.borrow();
1146 let rowspan = tr_attr
1147 .get("rowspan")
1148 .map(|x| {
1149 x.parse::<usize>()
1150 .expect("Unable to parse rowspan value to usize")
1151 })
1152 .unwrap_or(1);
1153 rows += rowspan;
1154 let mut columns_in_row = 0;
1155 if let Ok(cells) = tr_node.select("td") {
1156 for cell in cells {
1157 let cell_attr = cell.attributes.borrow();
1158 let colspan = cell_attr
1159 .get("colspan")
1160 .map(|x| {
1161 x.parse::<usize>()
1162 .expect("Unable to parse colspan value to usize")
1163 })
1164 .unwrap_or(1);
1165 columns_in_row += colspan;
1166 }
1167 }
1168 columns = columns.max(columns_in_row);
1169 }
1170 }
1171 SizeInfo { rows, columns }
1172 }
1173
1174 fn mark_data_tables(&mut self) {
1177 if let Ok(tables) = self.root_node.select("table") {
1178 for table in tables {
1179 let mut table_attr = table.attributes.borrow_mut();
1180 let table_node = table.as_node();
1181 if table_attr.get("role") == Some("presentation") {
1182 table_attr.insert("readability-data-table", "false".to_string());
1183 continue;
1184 }
1185 if table_attr.get("datatable") == Some("0") {
1186 table_attr.insert("readability-data-table", "false".to_string());
1187 continue;
1188 }
1189
1190 if table_attr.contains("summary") {
1191 table_attr.insert("readability-data-table", "true".to_string());
1192 continue;
1193 }
1194 if let Ok(caption) = table_node.select_first("caption") {
1195 if caption.as_node().children().count() > 0 {
1196 table_attr.insert("readability-data-table", "true".to_string());
1197 continue;
1198 }
1199 }
1200
1201 if DATA_TABLE_DESCENDANTS
1202 .iter()
1203 .any(|tag_name| table_node.select_first(tag_name).is_ok())
1204 {
1205 table_attr.insert("readability-data-table", "true".to_string());
1206 continue;
1207 }
1208
1209 if table_node.select("table").unwrap().count() > 1 {
1210 table_attr.insert("readability-data-table", "false".to_string());
1211 continue;
1212 }
1213
1214 let size_info = Self::get_row_and_column_count(table_node);
1215 if size_info.rows >= 10 || size_info.columns > 4 {
1216 table_attr.insert("readability-data-table", "true".to_string());
1217 continue;
1218 }
1219
1220 if (size_info.rows * size_info.columns) > 10 {
1221 table_attr.insert("readability-data-table", "true".to_string());
1222 continue;
1223 } else {
1224 table_attr.insert("readability-data-table", "false".to_string());
1225 continue;
1226 }
1227 }
1228 }
1229 }
1230
1231 fn fix_lazy_images(node_ref: &mut NodeRef) {
1233 let nodes = node_ref.select("img, picture, figure").unwrap();
1234 for node in nodes {
1235 let mut node_attr = node.attributes.borrow_mut();
1236 if let Some(src) = node_attr.get("src") {
1237 let src_captures = regexes::B64_DATA_URL_REGEX.captures(src);
1238 if src_captures.is_some() {
1239 let svg_capture = src_captures.unwrap().get(1);
1240 if svg_capture.is_some() && svg_capture.unwrap().as_str() == "image/svg+xml" {
1241 continue;
1242 }
1243
1244 let src_could_be_removed = node_attr
1245 .map
1246 .iter()
1247 .filter(|(name, _)| &name.local != "src")
1248 .filter(|(_, val)| regexes::is_match_img_ext(&val.value))
1249 .count()
1250 > 0;
1251
1252 if src_could_be_removed {
1253 let b64_start = regexes::BASE64_REGEX.find(src).unwrap().start();
1254 let b64_length = src.len() - b64_start;
1255 if b64_length < 133 {
1256 node_attr.remove("src");
1257 }
1258 }
1259 }
1260 }
1261 let src = node_attr.get("src");
1262 let srcset = node_attr.get("srcset");
1263 let class = node_attr.get("class");
1264 if (src.is_some() || srcset.is_some())
1265 && class.and_then(|classname| classname.find("lazy")).is_none()
1266 {
1267 continue;
1268 }
1269
1270 node_attr
1271 .map
1272 .clone()
1273 .iter()
1274 .filter(|(key, _)| !(&key.local == "src" || &key.local == "srcset"))
1275 .for_each(|(_, val)| {
1276 let mut copy_to = "";
1277 if regexes::is_match_srcset(&val.value) {
1278 copy_to = "srcset";
1279 } else if regexes::is_match_src_regex(&val.value) {
1280 copy_to = "src";
1281 }
1282 if copy_to.len() > 0 {
1283 let new_val = val.value.clone();
1284 let tag_name = &node.name.local;
1285 if tag_name == "img" || tag_name == "picture" {
1286 node_attr.insert(copy_to, new_val);
1287 } else if tag_name == "figure" {
1288 let node_ref = node.as_node();
1289 let img_picture_nodes = node_ref.select("img, picture").unwrap();
1290 if img_picture_nodes.count() > 0 {
1291 let img = NodeRef::new_element(
1292 QualName::new(
1293 None,
1294 Namespace::from(HTML_NS),
1295 LocalName::from("img"),
1296 ),
1297 BTreeMap::new(),
1298 );
1299 {
1300 let mut img_attr =
1301 img.as_element().unwrap().attributes.borrow_mut();
1302 img_attr.insert(copy_to, new_val);
1303 }
1304 node_ref.append(img);
1305 }
1306 }
1307 }
1308 });
1309 }
1310 }
1311
1312 fn clean_conditionally(&self, node_ref: &mut NodeRef, tag_name: &str) {
1315 if !self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
1316 return;
1317 }
1318 let is_list = tag_name == "ul" || tag_name == "ol";
1319 let is_data_table = |node_ref: &NodeRef| {
1320 let node_elem = node_ref.as_element().unwrap();
1321 let attrs = node_elem.attributes.borrow();
1322 attrs.get("readability-data-table") == Some("true")
1323 };
1324 let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
1325
1326 let mut nodes = node_ref
1327 .descendants()
1328 .select(tag_name)
1329 .unwrap()
1330 .filter(|node_data_ref| {
1332 !(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
1333 })
1334 .filter(|node_data_ref| {
1336 !Self::has_ancestor_tag(
1337 node_data_ref.as_node(),
1338 tag_name,
1339 Some(-1),
1340 Some(is_data_table),
1341 )
1342 });
1343 let mut next_node = nodes.next();
1344 while let Some(node_data_ref) = next_node {
1345 next_node = nodes.next();
1346 let node = node_data_ref.as_node();
1347 let weight = self.get_class_weight(node);
1348 if weight < 0 {
1350 node.detach();
1351 continue;
1352 }
1353
1354 if get_char_count(node) >= 10 {
1355 continue;
1356 }
1357 let mut embeds = node_data_ref
1358 .as_node()
1359 .select("object, embed, iframe")
1360 .unwrap();
1361 let can_skip_embed = embeds.any(|node_data_ref| {
1362 &node_data_ref.name.local == "object" || {
1363 let attrs = node_data_ref.attributes.borrow();
1364
1365 attrs
1366 .map
1367 .iter()
1368 .any(|(_, val)| regexes::is_match_videos(&val.value))
1369 }
1370 });
1371 if can_skip_embed {
1372 continue;
1373 }
1374
1375 let p_nodes = node_data_ref.as_node().select("p").unwrap().count();
1376 let img_nodes = node_data_ref.as_node().select("img").unwrap().count();
1377 let li_nodes = node_data_ref.as_node().select("li").unwrap().count() as i32 - 100;
1378 let input_nodes = node_data_ref.as_node().select("input").unwrap().count();
1379
1380 let p = p_nodes as f32;
1381 let img = img_nodes as f32;
1382
1383 let embed_count = node.select("object, embed, iframe").unwrap().count();
1384 let link_density = Self::get_link_density(node);
1385 let content_length = Self::get_inner_text(node, None).len();
1386 let has_figure_ancestor = Self::has_ancestor_tag(node, "figure", None, None);
1387 let have_to_remove = (img_nodes > 1 && p / img < 0.5 && !has_figure_ancestor)
1388 || (!is_list && li_nodes > p_nodes as i32)
1389 || (input_nodes > (p_nodes / 3))
1390 || (!is_list
1391 && content_length < 25
1392 && (img_nodes == 0 || img_nodes > 2)
1393 && !has_figure_ancestor)
1394 || (!is_list && weight < 25 && link_density > 0.2)
1395 || (weight >= 25 && link_density > 0.5)
1396 || ((embed_count == 1 && content_length < 75) || embed_count > 1);
1397 if have_to_remove {
1398 node.detach();
1399 }
1400 }
1401 }
1402
1403 fn clean(node_ref: &mut NodeRef, tag_name: &str) {
1405 let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name);
1407 let mut nodes = node_ref
1408 .descendants()
1409 .select(tag_name)
1410 .unwrap()
1411 .filter(|node_data_ref| {
1412 !is_embed
1413 || {
1414 let attrs = node_data_ref.attributes.borrow();
1415 !attrs
1416 .map
1417 .iter()
1418 .any(|(_, val)| regexes::is_match_videos(&val.value))
1419 }
1420 || &node_data_ref.name.local == "object" });
1422 let mut node = nodes.next();
1423 while let Some(node_data_ref) = node {
1424 node = nodes.next();
1425 node_data_ref.as_node().detach()
1426 }
1427 }
1428
1429 fn clean_headers(&self, node_ref: &mut NodeRef) {
1431 let mut nodes = node_ref
1432 .descendants()
1433 .select("h1, h2")
1434 .unwrap()
1435 .filter(|node_data_ref| self.get_class_weight(node_data_ref.as_node()) < 0);
1436 let mut node = nodes.next();
1437
1438 while let Some(node_data_ref) = node {
1439 node = nodes.next();
1440 node_data_ref.as_node().detach();
1441 }
1442 }
1443
1444 fn clean_styles(node_ref: &mut NodeRef) {
1446 node_ref
1447 .inclusive_descendants()
1448 .elements()
1449 .filter(|node| &node.name.local != "svg")
1450 .for_each(|node_data_ref| {
1451 let mut attrs = node_data_ref.attributes.borrow_mut();
1452 PRESENTATIONAL_ATTRIBUTES.iter().for_each(|pres_attr| {
1453 attrs.remove(*pres_attr);
1454 });
1455 if DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(&node_data_ref.name.local.as_ref()) {
1456 attrs.remove("width");
1457 attrs.remove("height");
1458 }
1459 });
1460 }
1461
1462 fn clean_matched_nodes(node_ref: &mut NodeRef, filter_fn: impl Fn(&NodeRef, &str) -> bool) {
1464 let end_of_search_marker_node = Self::get_next_node(node_ref, true);
1465 let mut next_node = Self::get_next_node(node_ref, false);
1466 while next_node.is_some() && next_node != end_of_search_marker_node {
1467 let node = next_node.unwrap();
1468 let attrs = node.as_element().unwrap().attributes.borrow();
1469 let class = attrs.get("class").unwrap_or("");
1470 let id = attrs.get("id").unwrap_or("");
1471 if filter_fn(&node, &(class.to_string() + " " + id)) {
1472 next_node = Self::remove_and_get_next(node.clone());
1473 } else {
1474 next_node = Self::get_next_node(&node, false);
1475 }
1476 }
1477 }
1478
1479 fn prep_article(&mut self, node_ref: &mut NodeRef) {
1482 Self::clean_styles(node_ref);
1483 self.mark_data_tables();
1484 Self::fix_lazy_images(node_ref);
1485 self.clean_conditionally(node_ref, "form");
1486 self.clean_conditionally(node_ref, "fieldset");
1487 Self::clean(node_ref, "object");
1488 Self::clean(node_ref, "embed");
1489 Self::clean(node_ref, "h1");
1490 Self::clean(node_ref, "footer");
1491 Self::clean(node_ref, "link");
1492 Self::clean(node_ref, "aside");
1493
1494 node_ref.children().for_each(|mut node| {
1495 Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| {
1496 regexes::is_match_share_elems(match_string)
1497 && node.text_contents().len() < DEFAULT_CHAR_THRESHOLD
1498 });
1499 });
1500
1501 let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>();
1502 if h2_nodes.len() == 1 {
1503 let h2_node = h2_nodes[0].as_node();
1504 let length_similar_rate = ((h2_node.text_contents().len() as isize
1505 - self.article_title.len() as isize) as f32)
1506 / self.article_title.len() as f32;
1507 if length_similar_rate.abs() < 0.5 {
1508 let titles_match = if length_similar_rate > 0.0 {
1509 h2_node.text_contents().contains(&self.article_title)
1510 } else {
1511 self.article_title.contains(&h2_node.text_contents())
1512 };
1513 if titles_match {
1514 Self::clean(node_ref, "h2");
1515 }
1516 }
1517 }
1518
1519 Self::clean(node_ref, "iframe");
1520 Self::clean(node_ref, "input");
1521 Self::clean(node_ref, "textarea");
1522 Self::clean(node_ref, "select");
1523 Self::clean(node_ref, "button");
1524 self.clean_headers(node_ref);
1525
1526 self.clean_conditionally(node_ref, "table");
1527 self.clean_conditionally(node_ref, "ul");
1528 self.clean_conditionally(node_ref, "div");
1529
1530 let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
1531 let p_node = node_data_ref.as_node();
1532 let img_count = p_node.select("img").unwrap().count();
1533 let embed_count = p_node.select("embed").unwrap().count();
1534 let object_count = p_node.select("object").unwrap().count();
1535 let iframe_count = p_node.select("iframe").unwrap().count();
1536 let total = img_count + embed_count + object_count + iframe_count;
1537 total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty()
1538 });
1539 let mut p_node = p_nodes.next();
1540 while let Some(p_node_ref) = p_node {
1541 p_node = p_nodes.next();
1542 p_node_ref.as_node().detach();
1543 }
1544
1545 let mut br_nodes = node_ref.select("br").unwrap().filter(|node_data_ref| {
1546 let br_node = node_data_ref.as_node();
1547 let next_node = Self::next_element(br_node.next_sibling(), true);
1549 next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
1550 });
1551 let mut br_node = br_nodes.next();
1552 while let Some(br_node_ref) = br_node {
1553 br_node = br_nodes.next();
1554 br_node_ref.as_node().detach();
1555 }
1556
1557 let mut table_nodes = node_ref.select("table").unwrap();
1558 let mut table_node = table_nodes.next();
1559 while let Some(table_node_ref) = table_node {
1560 table_node = table_nodes.next();
1561 let table_node = table_node_ref.as_node();
1562 let table_child = Self::next_element(table_node.first_child(), true);
1564 let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
1565 table_child.as_ref().unwrap()
1566 } else {
1567 table_node
1568 };
1569
1570 if Self::has_single_tag_inside_element(&tbody, "tr") {
1572 let row = Self::next_element(tbody.first_child(), true).unwrap();
1573 if Self::has_single_tag_inside_element(&row, "td") {
1574 let mut cell = Self::next_element(row.first_child(), true).unwrap();
1575 let tag = if cell
1576 .children()
1577 .all(|cell_child| Self::is_phrasing_content(&cell_child))
1578 {
1579 "p"
1580 } else {
1581 "div"
1582 };
1583 cell = Self::set_node_tag(&cell, tag);
1584 if let Some(parent) = table_node.parent() {
1585 parent.append(cell);
1586 table_node.detach();
1587 }
1588 }
1589 }
1590 }
1591 }
1592
1593 fn flag_is_active(&self, flag: u32) -> bool {
1594 self.flags & flag > 0
1595 }
1596
1597 fn remove_flag(&mut self, flag: u32) {
1598 self.flags = self.flags & !flag;
1599 }
1600
1601 fn grab_article(&mut self) -> Result<(), ParserError> {
1604 info!("Grabbing article {:?}", self.metadata.title);
1605 let page = self.root_node.select_first("body");
1609 if page.is_err() {
1610 return Err(ParserError::new("Document has no <body>".into()).into());
1611 }
1612 let page = page.unwrap();
1613 let mut attempts: Vec<ExtractAttempt> = Vec::new();
1614
1615 loop {
1619 let strip_unlikely_candidates = self.flag_is_active(FLAG_STRIP_UNLIKELYS);
1621
1622 let mut elements_to_score: Vec<NodeRef> = Vec::new();
1626 let mut node = self
1627 .root_node
1628 .select_first("html")
1629 .ok()
1630 .map(|n| n.as_node().clone());
1631
1632 while let Some(node_ref) = node {
1633 let node_elem = node_ref.as_element().unwrap();
1634 let node_name: &str = node_elem.name.local.as_ref();
1635 let match_string = {
1636 let node_attrs = node_elem.attributes.borrow();
1637 node_attrs.get("class").unwrap_or("").to_string()
1638 + " "
1639 + node_attrs.get("id").unwrap_or("")
1640 };
1641 if !Self::is_probably_visible(&node_ref) {
1642 node = Self::remove_and_get_next(node_ref);
1643 continue;
1644 }
1645
1646 if self.check_byline(&node_ref, &match_string) {
1647 node = Self::remove_and_get_next(node_ref);
1648 continue;
1649 }
1650
1651 if strip_unlikely_candidates {
1652 if regexes::is_match_unlikely(&match_string)
1653 && !regexes::is_match_ok_maybe(&match_string)
1654 && !Self::has_ancestor_tag(&node_ref, "table", None, None)
1655 && node_name != "body"
1656 && node_name != "a"
1657 {
1658 node = Self::remove_and_get_next(node_ref);
1659 continue;
1660 }
1661
1662 let is_complementary = {
1663 let node_attrs = node_elem.attributes.borrow();
1664 node_attrs.get("role") == Some("complementary")
1665 };
1666 if is_complementary {
1667 node = Self::remove_and_get_next(node_ref);
1668 continue;
1669 }
1670 }
1671
1672 match node_name {
1673 "div" | "section" | "header" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
1674 if Self::is_element_without_content(&node_ref) {
1675 node = Self::remove_and_get_next(node_ref);
1676 continue;
1677 }
1678 }
1679 _ => (),
1680 }
1681 if DEFAULT_TAGS_TO_SCORE.contains(&node_name) {
1682 elements_to_score.push(node_ref.clone());
1683 }
1684 if node_name == "div" {
1685 let mut p: Option<NodeRef> = None;
1686 let mut child_node = node_ref.first_child();
1687 while let Some(child_node_ref) = child_node {
1688 let next_sibling = child_node_ref.next_sibling();
1689 if Self::is_phrasing_content(&child_node_ref) {
1690 if let Some(ref p_node) = p {
1691 p_node.append(child_node_ref);
1692 } else if !Self::is_whitespace(&child_node_ref) {
1693 let new_p_node = NodeRef::new_element(
1694 QualName::new(
1695 None,
1696 Namespace::from(HTML_NS),
1697 LocalName::from("p"),
1698 ),
1699 BTreeMap::new(),
1700 );
1701 child_node_ref.insert_before(new_p_node);
1702 p = child_node_ref.previous_sibling();
1703 p.as_mut().unwrap().append(child_node_ref);
1705 }
1706 } else if let Some(ref p_node) = p {
1707 while let Some(last_child) = p_node.last_child() {
1708 if Self::is_whitespace(&last_child) {
1709 last_child.detach();
1710 } else {
1711 break;
1712 }
1713 }
1714 p = None;
1715 }
1716 child_node = next_sibling;
1717 }
1718 if Self::has_single_tag_inside_element(&node_ref, "p")
1719 && Self::get_link_density(&node_ref) < 0.25
1720 {
1721 let new_node = Self::next_element(node_ref.first_child(), true).unwrap();
1723 elements_to_score.push(new_node.clone());
1724 node_ref.insert_before(new_node);
1725 let new_node = node_ref.previous_sibling();
1726 node_ref.detach();
1727 node = new_node;
1728 elements_to_score.push(node.clone().unwrap());
1729 } else if !Self::has_child_block_element(&node_ref) {
1730 node = Some(Self::set_node_tag(&node_ref, "p"));
1731 elements_to_score.push(node.clone().unwrap());
1732 }
1733 }
1734 node = Self::get_next_node(&node_ref, false);
1735 }
1736
1737 let mut candidates: Vec<NodeRef> = Vec::new();
1738 elements_to_score
1739 .iter()
1740 .filter(|node_ref| {
1741 let parent = node_ref.parent();
1742 parent.is_some() && parent.unwrap().as_element().is_some()
1743 })
1744 .map(|node_ref| (node_ref, Self::get_inner_text(&node_ref, None)))
1745 .filter(|(_, inner_text)| inner_text.len() >= 25)
1746 .map(|(node_ref, inner_text)| {
1747 (inner_text, Self::get_node_ancestors(&node_ref, Some(3)))
1748 })
1749 .filter(|(_, ancestors)| ancestors.len() != 0)
1750 .for_each(|(inner_text, ancestors)| {
1751 let mut content_score = 0;
1752 content_score += 1;
1753 content_score += inner_text.split(",").count();
1754 content_score += (3).min(inner_text.len() / 100);
1755 ancestors
1756 .into_iter()
1757 .enumerate()
1758 .filter(|(_, node)| {
1759 node.parent().is_some() && node.parent().unwrap().as_element().is_some()
1760 })
1761 .for_each(|(level, mut ancestor)| {
1762 let has_readability = {
1763 let ancestor_attrs =
1764 ancestor.as_element().unwrap().attributes.borrow();
1765 ancestor_attrs.contains(READABILITY_SCORE)
1766 };
1767 if !has_readability {
1768 self.initialize_node(&mut ancestor);
1769 candidates.push(ancestor.clone());
1770 }
1771
1772 let score_divider = if level == 0 {
1773 1.0
1774 } else if level == 1 {
1775 2.0
1776 } else {
1777 level as f32 * 3.0
1778 };
1779 let mut ancestor_attrs =
1780 ancestor.as_element().unwrap().attributes.borrow_mut();
1781 if let Some(readability_score) =
1782 ancestor_attrs.get_mut(READABILITY_SCORE)
1783 {
1784 *readability_score = (readability_score.parse::<f32>().unwrap()
1785 + (content_score as f32 / score_divider))
1786 .to_string();
1787 }
1788 });
1789 });
1790
1791 let mut top_candidates: Vec<NodeRef> = Vec::new();
1792 for candidate in candidates {
1793 let mut candidate_score = 0.0;
1794 {
1795 let mut candidate_attr =
1796 candidate.as_element().unwrap().attributes.borrow_mut();
1797 if let Some(readability_score) = candidate_attr.get_mut(READABILITY_SCORE) {
1798 candidate_score = readability_score.parse::<f32>().unwrap()
1799 * (1.0 - Self::get_link_density(&candidate));
1800 *readability_score = candidate_score.to_string();
1801 }
1802 }
1803 let nb_top_candidates = 5;
1804 for i in 0..nb_top_candidates {
1805 let top_candidate = top_candidates.get(i);
1806 let top_candidate_score = top_candidate
1807 .as_ref()
1808 .map(|node_ref| node_ref.as_element().unwrap().attributes.borrow())
1809 .map(|attrs| {
1810 attrs
1811 .get(READABILITY_SCORE)
1812 .unwrap_or("0")
1813 .parse::<f32>()
1814 .unwrap()
1815 });
1816 if top_candidate.is_none() || candidate_score > top_candidate_score.unwrap() {
1817 top_candidates.splice(i..i, vec![candidate].into_iter());
1818 if top_candidates.len() > nb_top_candidates {
1819 top_candidates.pop();
1820 }
1821 break;
1822 }
1823 }
1824 }
1825
1826 let possible_top_candidate = top_candidates.get(0);
1827 let mut top_candidate;
1828 let mut needed_to_create_top_candidate = false;
1829 let mut parent_of_top_candidate: NodeRef;
1830
1831 if possible_top_candidate.is_none()
1832 || possible_top_candidate
1833 .map(|node| &node.as_element().unwrap().name.local)
1834 .as_ref()
1835 .unwrap()
1836 == &"body"
1837 {
1838 top_candidate = NodeRef::new_element(
1839 QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
1840 BTreeMap::new(),
1841 );
1842 needed_to_create_top_candidate = true;
1843 let mut page_children = page.as_node().children();
1844 let mut page_child = page_children.next();
1845 while let Some(child_node) = page_child {
1846 page_child = page_children.next();
1847 top_candidate.append(child_node);
1848 }
1849 page.as_node().append(top_candidate.clone());
1850 self.initialize_node(&mut top_candidate);
1851 } else {
1852 let alternative_candidate_ancestors: Vec<Vec<NodeRef>>;
1853 top_candidate = top_candidates.get(0).unwrap().clone();
1854 let top_candidate_score = {
1855 let top_candidate_node_attrs =
1856 top_candidate.as_element().unwrap().attributes.borrow();
1857 top_candidate_node_attrs
1858 .get(READABILITY_SCORE)
1859 .unwrap()
1860 .parse::<f32>()
1861 .unwrap()
1862 };
1863
1864 alternative_candidate_ancestors = top_candidates
1865 .iter()
1866 .skip(1)
1867 .filter(|top_candidate_node| {
1868 let candidate_node_score = {
1869 let top_candidate_node_attrs =
1870 top_candidate_node.as_element().unwrap().attributes.borrow();
1871 top_candidate_node_attrs
1872 .get(READABILITY_SCORE)
1873 .unwrap()
1874 .parse::<f32>()
1875 .unwrap()
1876 };
1877 (candidate_node_score / top_candidate_score) >= 0.75
1878 })
1879 .map(|node| Self::get_node_ancestors(&node, None))
1880 .collect();
1881
1882 let minimum_top_candidates = 3;
1883 if alternative_candidate_ancestors.len() >= minimum_top_candidates {
1884 parent_of_top_candidate = top_candidate.parent().unwrap();
1885 while &parent_of_top_candidate.as_element().unwrap().name.local != "body" {
1886 let mut lists_containing_this_ancestor = alternative_candidate_ancestors
1887 .iter()
1888 .filter(|node_vec| node_vec.contains(&parent_of_top_candidate))
1889 .count();
1890 lists_containing_this_ancestor =
1891 lists_containing_this_ancestor.min(minimum_top_candidates);
1892 if lists_containing_this_ancestor >= minimum_top_candidates {
1893 top_candidate = parent_of_top_candidate;
1894 break;
1895 }
1896 parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
1897 }
1898 }
1899
1900 let top_candidate_readability = {
1901 let top_candidate_attrs =
1902 top_candidate.as_element().unwrap().attributes.borrow();
1903 top_candidate_attrs
1904 .get(READABILITY_SCORE)
1905 .map(|x| x.to_owned())
1906 };
1907
1908 if top_candidate_readability.is_none() {
1909 self.initialize_node(&mut top_candidate);
1910 }
1911 parent_of_top_candidate = top_candidate.parent().unwrap();
1912
1913 let mut last_score = {
1914 let top_candidate_node_attrs =
1915 top_candidate.as_element().unwrap().attributes.borrow();
1916 top_candidate_node_attrs
1917 .get(READABILITY_SCORE)
1918 .unwrap()
1919 .parse::<f32>()
1920 .unwrap()
1921 };
1922 let score_threshold = last_score / 3.0;
1923 while parent_of_top_candidate
1924 .as_element()
1925 .map(|elem| elem.name.local.as_ref())
1926 .unwrap()
1927 != "body"
1928 {
1929 let parent_readability = {
1930 let parent_attrs = parent_of_top_candidate
1931 .as_element()
1932 .unwrap()
1933 .attributes
1934 .borrow();
1935 parent_attrs
1936 .get(READABILITY_SCORE)
1937 .map(|score| score.parse::<f32>().unwrap())
1938 };
1939 if parent_readability.is_none() {
1940 parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
1941 continue;
1942 }
1943 if parent_readability.as_ref().unwrap() < &score_threshold {
1944 break;
1945 }
1946 if parent_readability.as_ref().unwrap() > &last_score {
1947 top_candidate = parent_of_top_candidate;
1948 break;
1949 }
1950 last_score = parent_readability.unwrap();
1951 parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
1952 }
1953
1954 parent_of_top_candidate = top_candidate.parent().unwrap();
1955 while &parent_of_top_candidate.as_element().unwrap().name.local != "body"
1956 && parent_of_top_candidate.children().count() == 1
1957 {
1958 top_candidate = parent_of_top_candidate;
1959 parent_of_top_candidate = top_candidate.parent().unwrap();
1960 }
1961 let top_candidate_readability = {
1962 let top_candidate_attrs =
1963 top_candidate.as_element().unwrap().attributes.borrow();
1964 top_candidate_attrs
1965 .get(READABILITY_SCORE)
1966 .map(|score| score.to_string())
1967 };
1968 if top_candidate_readability.is_none() {
1969 self.initialize_node(&mut top_candidate);
1970 }
1971 }
1972 let mut article_content = NodeRef::new_element(
1973 QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
1974 BTreeMap::new(),
1975 );
1976 let top_candidate_score = {
1977 let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
1978 top_candidate_attrs
1979 .get(READABILITY_SCORE)
1980 .map(|score| score.parse::<f32>().unwrap())
1981 .unwrap()
1982 };
1983
1984 let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2);
1985 parent_of_top_candidate = top_candidate.parent().unwrap();
1986
1987 let mut siblings = parent_of_top_candidate
1988 .children()
1989 .filter(|node| node.as_element().is_some());
1990
1991 let (top_candidate_class, top_candidate_score) = {
1992 let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
1993 let class = top_candidate_attrs
1994 .get("class")
1995 .map(|class| class.to_string())
1996 .unwrap_or("".to_string());
1997 let score = top_candidate_attrs
1998 .get(READABILITY_SCORE)
1999 .map(|score| score.parse::<f32>().unwrap())
2000 .unwrap();
2001 (class, score)
2002 };
2003 let mut next_sibling = siblings.next();
2004 while let Some(sibling) = next_sibling {
2005 next_sibling = siblings.next();
2006 let mut append = false;
2007 if sibling == top_candidate {
2008 append = true;
2009 } else {
2010 let mut content_bonus = 0.0;
2011 let sibling_attrs = sibling.as_element().unwrap().attributes.borrow();
2012
2013 let sibling_class = sibling_attrs
2014 .get("class")
2015 .map(|class| class.to_string())
2016 .unwrap_or("".to_string());
2017 let sibling_score = sibling_attrs
2018 .get(READABILITY_SCORE)
2019 .map(|score| score.parse::<f32>().unwrap());
2020
2021 if sibling_class == top_candidate_class && !top_candidate_class.is_empty() {
2022 content_bonus += top_candidate_score * 0.2;
2023 }
2024
2025 if sibling_score.is_some()
2026 && (sibling_score.unwrap() + content_bonus) >= sibling_score_threshold
2027 {
2028 append = true;
2029 } else if sibling.as_element().map(|elem| elem.name.local.as_ref()) == Some("p")
2030 {
2031 let link_density = Self::get_link_density(&sibling);
2032 let node_content = Self::get_inner_text(&sibling, None);
2033 let node_length = node_content.len();
2034 if node_length > 80 && link_density < 0.25 {
2035 append = true;
2036 } else if node_length < 80
2037 && node_length > 0
2038 && link_density == 0.0
2039 && !regexes::is_match_node_content(&node_content)
2040 {
2041 append = true;
2042 }
2043 }
2044 }
2045 if append {
2046 let new_article_child = if !ALTER_TO_DIV_EXCEPTIONS.contains(
2047 &sibling
2048 .as_element()
2049 .map(|elem| elem.name.local.as_ref())
2050 .unwrap(),
2051 ) {
2052 Self::set_node_tag(&sibling, "div")
2053 } else {
2054 sibling
2055 };
2056 article_content.append(new_article_child);
2057 }
2058 }
2059 self.prep_article(&mut article_content);
2060 if needed_to_create_top_candidate {
2061 let mut top_candidate_attrs =
2062 top_candidate.as_element().unwrap().attributes.borrow_mut();
2063 top_candidate_attrs.insert("id", "readability-page-1".to_string());
2064 top_candidate_attrs.insert("class", "page".to_string());
2065 } else {
2066 let div = NodeRef::new_element(
2067 QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
2068 BTreeMap::new(),
2069 );
2070 {
2071 let mut div_attrs = div.as_element().unwrap().attributes.borrow_mut();
2072 div_attrs.insert("id", "readability-page-1".to_string());
2073 div_attrs.insert("class", "page".to_string());
2074 }
2075 for child in article_content.children() {
2076 div.append(child);
2077 }
2078 article_content.append(div);
2079 }
2080
2081 let text_length = Self::get_inner_text(&article_content, Some(true)).len();
2082 let mut parse_successful = true;
2083 if text_length < DEFAULT_CHAR_THRESHOLD {
2084 parse_successful = false;
2085 if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
2086 self.remove_flag(FLAG_STRIP_UNLIKELYS);
2087 attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2088 } else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
2089 self.remove_flag(FLAG_WEIGHT_CLASSES);
2090 attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2091 } else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
2092 self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
2093 attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2094 } else {
2095 attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2096 attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
2097 if attempts.first().as_ref().unwrap().length == 0 {
2098 return Err(ParserError::new(
2099 "Unable to extract content".into(),
2100 )
2101 .into());
2102 }
2103 article_content = attempts[0].article.clone();
2104 parse_successful = true;
2105 }
2106 }
2107 if parse_successful {
2108 let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None);
2109 let ancestors = vec![
2110 vec![parent_of_top_candidate, top_candidate],
2111 parent_ancestors,
2112 ]
2113 .concat();
2114 ancestors.iter().any(|node| {
2115 let node_elem = node.as_element();
2116 if node_elem.is_none() {
2117 return false;
2118 }
2119 let node_attrs = node_elem.unwrap().attributes.borrow();
2120 if let Some(dir_attr) = node_attrs.get("dir") {
2121 self.article_dir = Some(dir_attr.to_string());
2122 return true;
2123 }
2124 false
2125 });
2126 self.article_node = Some(article_content);
2127 info!("Successfully grabbed article {:?}", self.metadata.title);
2128 return Ok(());
2129 }
2130 }
2131 }
2132}
2133
2134#[derive(Debug)]
2136struct ExtractAttempt {
2137 article: NodeRef,
2138 length: usize,
2139}
2140
2141impl ExtractAttempt {
2142 pub fn new(article: NodeRef, length: usize) -> Self {
2143 ExtractAttempt { article, length }
2144 }
2145}
2146
/// Metadata collected about an article: byline, excerpt, site name and title.
///
/// The default value has an empty title and no other fields set.
#[derive(Debug, Default, PartialEq)]
pub struct MetaData {
    byline: Option<String>,
    excerpt: Option<String>,
    site_name: Option<String>,
    title: String,
}

impl MetaData {
    /// Creates empty metadata; equivalent to `MetaData::default()`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the article title, which is empty when none has been set.
    pub fn title(&self) -> &str {
        &self.title
    }

    /// Returns the byline, if one has been set.
    pub fn byline(&self) -> Option<&String> {
        self.byline.as_ref()
    }
}
2173
2174#[cfg(test)]
2175mod test {
2176 use super::{
2177 MetaData, Readability, SizeInfo, FLAG_CLEAN_CONDITIONALLY, FLAG_STRIP_UNLIKELYS,
2178 FLAG_WEIGHT_CLASSES, HTML_NS, READABILITY_SCORE,
2179 };
2180 use html5ever::{LocalName, Namespace, QualName};
2181 use kuchiki::traits::*;
2182 use kuchiki::NodeRef;
2183
2184 const TEST_HTML: &'static str = r#"
2186 <!DOCTYPE html>
2187 <html lang="en">
2188
2189 <head>
2190 <title>Sample Document</title>
2191 </head>
2192
2193 <body>
2194 <h1>Some text in h1</h1>
2195 <img src="inexistent.png">
2196 <div class="invalid-elems">
2197 <!-- This div contains invalid elements -->
2198 <h1>Imagine some lorem ipsum</h1>
2199 <img>
2200 </div>
2201 <!-- Test that the no-script content is copied over -->
2202 <img src="lazy-load.png">
2203 <noscript>
2204 <div class="parent">
2205 <img src="eager-load.png" id="lazy-load">
2206 </div>
2207 </noscript>
2208 </body>
2209
2210 </html>
2211
2212 "#;
2213
2214 #[test]
2215 fn test_unwrap_no_script_tags() {
2216 let mut readability = Readability::new(TEST_HTML);
2217 let img_count = readability.root_node.select("img").unwrap().count();
2218 assert_eq!(3, img_count);
2219 readability.unwrap_no_script_tags();
2220 let img_count = readability.root_node.select("img").unwrap().count();
2221 assert_eq!(2, img_count);
2222
2223 let updated_img = readability.root_node.select_first("img#lazy-load").unwrap();
2225 let updated_img_attrs = updated_img.attributes.borrow();
2226 assert_eq!(true, updated_img_attrs.contains("data-old-src"));
2227 assert_eq!(Some("lazy-load.png"), updated_img_attrs.get("data-old-src"));
2228 assert_eq!(Some("eager-load.png"), updated_img_attrs.get("src"));
2229 }
2230
2231 #[test]
2232 fn test_is_single_image() {
2233 let readability = Readability::new(TEST_HTML);
2234
2235 let img_elem_ref = readability.root_node.select_first("img").unwrap();
2236 assert_eq!(true, Readability::is_single_image(&img_elem_ref.as_node()));
2237
2238 let noscript_elem_ref = readability.root_node.select_first("noscript").unwrap();
2239 assert_eq!(
2240 false,
2241 Readability::is_single_image(&noscript_elem_ref.as_node())
2242 );
2243
2244 let div_elem_ref = readability
2245 .root_node
2246 .select_first("div.invalid-elems")
2247 .unwrap();
2248 assert_eq!(false, Readability::is_single_image(&div_elem_ref.as_node()));
2249
2250 let div_elem_ref = kuchiki::parse_fragment(
2251 QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
2252 Vec::new(),
2253 )
2254 .one(noscript_elem_ref.as_node().text_contents().trim());
2255
2256 assert_eq!(true, Readability::is_single_image(&div_elem_ref));
2257 }
2258
2259 #[test]
2260 fn test_remove_scripts() {
2261 let mut readability = Readability::new(TEST_HTML);
2262
2263 let noscript_elems = readability.root_node.select("noscript").unwrap();
2264 assert_eq!(1, noscript_elems.count());
2265 readability.remove_scripts();
2266 let noscript_elems = readability.root_node.select("noscript").unwrap();
2267 assert_eq!(0, noscript_elems.count());
2268 }
2269
2270 #[test]
2271 fn test_next_element() {
2272 let html_str = r#"
2273 <p id="a">This is a node</p>
2274 <!-- Commented content -->
2275 <p id="b">This is another node. The next line is just whitespace</p>
2276
2277 This is standalone text
2278 <p> Some <span>more</span> text</p>"#;
2279 let doc = Readability::new(html_str);
2280 let p = doc.root_node.select_first("#a").unwrap();
2281 let p = p.as_node();
2282 let mut p_node_option: Option<NodeRef> = Some(p.clone());
2283 p_node_option = Readability::next_element(p_node_option, false);
2284 assert_eq!(Some(p.clone()), p_node_option);
2285
2286 let p_node_option = p_node_option.unwrap();
2287 let p_node_option = p_node_option.as_element();
2288 let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
2289 assert_eq!("a", p_node_option_attr.get("id").unwrap());
2290
2291 let next = Readability::next_element(p.next_sibling(), false);
2292
2293 let next = next.unwrap();
2294 let next_elem = next.as_element();
2295 let next_attr = next_elem.unwrap().attributes.borrow();
2296 assert_eq!("b", next_attr.get("id").unwrap());
2297
2298 let next = Readability::next_element(next.next_sibling(), false);
2299
2300 let next = next.unwrap();
2301 assert_eq!(true, next.as_text().is_some());
2302 assert_eq!("This is standalone text", next.text_contents().trim());
2303
2304 let next = Readability::next_element(None, false);
2305 assert_eq!(None, next);
2306 }
2307
2308 #[test]
2309 fn test_is_phrasing_content() {
2310 let html_str = r#"
2311 Some text node
2312 <b>This is a phrasing content node</b>
2313 <p>This is not a phrasing content node</p>
2314 <a href="\#"><i>This is also a phrasing content</i></a>
2315 <a href="\#"><p>This is not a phrasing content</p></a>
2316 "#;
2317 let doc = Readability::new(html_str);
2318 let body = doc.root_node.select_first("body").unwrap();
2319 let body = body.as_node();
2320 let mut body_children = body.children();
2321 let mut node = body_children.next().unwrap();
2322 assert_eq!(true, node.as_text().is_some());
2323 assert_eq!(true, Readability::is_phrasing_content(&node));
2324
2325 node = node.next_sibling().unwrap();
2326 assert_eq!("b", &node.as_element().unwrap().name.local);
2327 assert_eq!(true, Readability::is_phrasing_content(&node));
2328
2329 node = node.next_sibling().unwrap(); node = node.next_sibling().unwrap();
2331 assert_eq!("p", &node.as_element().unwrap().name.local);
2332 assert_eq!(false, Readability::is_phrasing_content(&node));
2333
2334 node = node.next_sibling().unwrap(); node = node.next_sibling().unwrap();
2336 assert_eq!("a", &node.as_element().unwrap().name.local);
2337 assert_eq!(true, Readability::is_phrasing_content(&node));
2338
2339 node = node.next_sibling().unwrap(); node = node.next_sibling().unwrap();
2341 assert_eq!("a", &node.as_element().unwrap().name.local);
2342 assert_eq!(false, Readability::is_phrasing_content(&node));
2343 }
2344
2345 #[test]
2346 fn test_is_whitespace() {
2347 let html_str = r#"
2348 <p>Definitely not whitespace</p>
2349 I am also not whitespace
2350 <p> </p>
2351 <br>
2352 "#;
2353 let doc = Readability::new(html_str);
2354 let body = doc.root_node.select_first("body").unwrap();
2355
2356 let mut node = body.as_node().first_child().unwrap();
2357 assert_eq!("p", &node.as_element().unwrap().name.local);
2358 assert_eq!(false, Readability::is_whitespace(&node));
2359
2360 node = node.next_sibling().unwrap();
2361 assert_eq!(true, node.as_text().is_some());
2362 assert_eq!(false, Readability::is_whitespace(&node));
2363
2364 node = node.next_sibling().unwrap();
2365 assert_eq!("p", &node.as_element().unwrap().name.local);
2366 assert_eq!(
2367 true,
2368 Readability::is_whitespace(&node.first_child().unwrap())
2369 );
2370
2371 node = node.next_sibling().unwrap();
2373 assert_eq!(true, node.as_text().is_some());
2374 assert_eq!(true, Readability::is_whitespace(&node));
2375
2376 node = node.next_sibling().unwrap();
2377 assert_eq!("br", &node.as_element().unwrap().name.local);
2378 assert_eq!(true, Readability::is_whitespace(&node));
2379 }
2380
2381 #[test]
2382 fn test_set_node_tag() {
2383 let html_str = r#"
2384 <div id="target" class="some random class" tabindex="0"><p>Child 1</p><p>Child 2</p></div>
2385 <div id="not-the-target">The div above is being replaced</div>
2386 "#;
2387 let doc = Readability::new(html_str);
2388 let target = doc.root_node.select_first("#target").unwrap();
2389 let children_count = doc.root_node.children().count();
2390 let target_children_count = target.as_node().children().count();
2391
2392 assert_eq!("div", &target.name.local);
2393 let new_node = Readability::set_node_tag(target.as_node(), "section");
2394
2395 assert_eq!(children_count, doc.root_node.children().count());
2396 let target = doc.root_node.select_first("#target").unwrap();
2397 assert_eq!(&new_node, target.as_node());
2398 assert_eq!("section", &target.name.local);
2399 assert_eq!(target_children_count, target.as_node().children().count());
2400
2401 let target_attrs = target.as_node().as_element().unwrap().attributes.borrow();
2402 assert_eq!(3, target_attrs.map.len());
2403
2404 let old_div = doc.root_node.select_first("div#target");
2405 assert_eq!(true, old_div.is_err());
2406 }
2407
2408 #[test]
2409 fn test_replace_node_tags() {
2410 let html_str = r#"
2411 <div id="replace-p">
2412 <p>Tag 1</p><p>Tag 2</p><p>Tag 3</p>
2413 </div>
2414 "#;
2415 let doc = Readability::new(html_str);
2416 let target_parent = doc.root_node.select_first("div#replace-p").unwrap();
2417 let target_parent_child_count = target_parent.as_node().children().count();
2418 let nodes = target_parent.as_node().select("p").unwrap();
2419
2420 Readability::replace_node_tags(nodes, "span");
2421 assert_eq!(
2422 target_parent_child_count,
2423 target_parent.as_node().children().count()
2424 );
2425
2426 let nodes = target_parent.as_node().select("p").unwrap();
2427 assert_eq!(0, nodes.count());
2428 let nodes = target_parent.as_node().select("span").unwrap();
2429 assert_eq!(3, nodes.count());
2430 }
2431
2432 #[test]
2433 fn test_replace_brs() {
2434 let html_str = r#"
2435 <div>foo<br>bar<br> <br><br>abc</div>
2436 "#;
2437 let mut doc = Readability::new(html_str);
2438 let div = doc.root_node.select_first("div").unwrap();
2439 let br_count = div.as_node().select("br").unwrap().count();
2440 let p_count = div.as_node().select("p").unwrap().count();
2441 assert_eq!(4, br_count);
2442 assert_eq!(0, p_count);
2443
2444 doc.replace_brs();
2445 let br_count = div.as_node().select("br").unwrap().count();
2446 let p_count = div.as_node().select("p").unwrap().count();
2447 assert_eq!(1, br_count);
2448 assert_eq!(1, p_count);
2449
2450 let p_node = div.as_node().select_first("p").unwrap();
2451 assert_eq!("abc", p_node.as_node().text_contents());
2452
2453 let html_str = r#"
2454 <p>foo<br>bar<br> <br><br>abc</p>
2455 "#;
2456 doc = Readability::new(html_str);
2457 let p = doc.root_node.select_first("p").unwrap();
2458 let div_count = doc.root_node.select("div").unwrap().count();
2459 let br_count = p.as_node().select("br").unwrap().count();
2460 assert_eq!(4, br_count);
2461 assert_eq!(0, div_count);
2462
2463 doc.replace_brs();
2464 let br_count = doc.root_node.select("br").unwrap().count();
2465 let div_count = doc.root_node.select("div").unwrap().count();
2466 let p_count = doc.root_node.select("p").unwrap().count();
2467 assert_eq!(1, br_count);
2468 assert_eq!(1, div_count);
2469 assert_eq!(1, p_count);
2470 let p_node = doc.root_node.select_first("p").unwrap();
2471 assert_eq!("abc", p_node.as_node().text_contents());
2472 }
2473
2474 #[test]
2475 fn test_prep_document() {
2476 let html_str = r#"
2477 <!DOCTYPE html>
2478 <html>
2479 <head>
2480 <style>div {padding: 20px; border-bottom: 2px solid black; }</style>
2481 </head>
2482 <body>
2483 <font face="Times New Roman" size="10">Times New Roman</font>
2484 <div>foo<br>bar<br> <br><br>abc</div>
2485 </body>
2486 </html>
2487 "#;
2488 let mut doc = Readability::new(html_str);
2489 doc.prep_document();
2490
2491 let style_nodes = doc.root_node.select("style").unwrap();
2492 let font_nodes = doc.root_node.select("font").unwrap();
2493 let p_nodes = doc.root_node.select("p").unwrap();
2494 let br_nodes = doc.root_node.select("br").unwrap();
2495 assert_eq!(0, style_nodes.count());
2496 assert_eq!(0, font_nodes.count());
2497 assert_eq!(1, p_nodes.count());
2498 assert_eq!(1, br_nodes.count());
2499 }
2500
2501 #[test]
2502 fn test_inline_css_str_to_map() {
2503 use std::collections::HashMap;
2504 let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
2505 let mut css_map = HashMap::new();
2506 css_map.insert("display".to_string(), "flex".to_string());
2507 css_map.insert("height".to_string(), "200px".to_string());
2508 css_map.insert("width".to_string(), "250px".to_string());
2509 css_map.insert("justify-content".to_string(), "center".to_string());
2510 css_map.insert("align-items".to_string(), "center".to_string());
2511 css_map.insert("border".to_string(), "2px solid black".to_string());
2512
2513 let css_str_to_map = Readability::inline_css_str_to_map(css_str);
2514 assert_eq!(css_map, css_str_to_map);
2515 let mut css_map = HashMap::new();
2516 css_map.insert("color".to_string(), "red".to_string());
2517 css_map.insert("background-image".to_string(), "url('')".to_string());
2518 assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('')"));
2519
2520 let empty_map = HashMap::new();
2521 assert_eq!(empty_map, Readability::inline_css_str_to_map(" \n \t \r"));
2522 assert_eq!(empty_map, Readability::inline_css_str_to_map("color"));
2523
2524 let mut css_map = HashMap::new();
2525 css_map.insert("color".to_string(), "red".to_string());
2526 css_map.insert("height".to_string(), "300px".to_string());
2527 assert_eq!(
2528 css_map,
2529 Readability::inline_css_str_to_map("color: red;height: 300px;width")
2530 );
2531 }
2532
2533 #[test]
2534 fn test_is_probably_visible() {
2535 let html_str = r#"
2536 <!DOCTYPE html>
2537 <html>
2538 <body>
2539 <p id="visible">Lorem ipsum dolores</p>
2540 <div id="hidden-div" style="display: none">
2541 <p>This is hidden and so is the parent</p>
2542 </div>
2543 <input value="Some good CSRF token" hidden>
2544 <div id="hidden-aria" style="display: flex;" aria-hidden="true">
2545 <p>This is not considered visible</p>
2546 </div>
2547 <div id="visible-aria" style="display: flex;" aria-hidden="false">
2548 <p>This is considered visible</p>
2549 </div>
2550 <img src="./some-img.png" class="fallback-image">
2551 <div id="visible-div" style="display: block" class="visible" aria-hidden="false">
2552 <p>This is fully visible</p>
2553 </div>
2554 </body>
2555 </html>
2556 "#;
2557 let doc = Readability::new(html_str);
2558 let div_node = doc.root_node.select_first("div#hidden-div").unwrap();
2559 let p_node = doc.root_node.select_first("p#visible").unwrap();
2560 let input_node = doc.root_node.select_first("input").unwrap();
2561 let hidden_aria_div_node = doc.root_node.select_first("div#hidden-aria").unwrap();
2562 let visible_aria_div_node = doc.root_node.select_first("div#visible-aria").unwrap();
2563 let img_node = doc.root_node.select_first("img").unwrap();
2564 let visible_div_node = doc.root_node.select_first("div#visible-div").unwrap();
2565 assert_eq!(true, Readability::is_probably_visible(&p_node.as_node()));
2566 assert_eq!(false, Readability::is_probably_visible(&div_node.as_node()));
2567 assert_eq!(
2568 false,
2569 Readability::is_probably_visible(&input_node.as_node())
2570 );
2571 assert_eq!(
2572 false,
2573 Readability::is_probably_visible(&hidden_aria_div_node.as_node())
2574 );
2575 assert_eq!(
2576 true,
2577 Readability::is_probably_visible(&visible_aria_div_node.as_node())
2578 );
2579 assert_eq!(true, Readability::is_probably_visible(&img_node.as_node()));
2580 assert_eq!(
2581 true,
2582 Readability::is_probably_visible(&visible_div_node.as_node())
2583 );
2584 }
2585
2586 #[test]
2587 fn test_check_byline() {
2588 let html_str = r#"
2589 <!DOCTYPE html>
2590 <html>
2591 <body>
2592 <p class="byline description" id="author">
2593This test is used to find out whether a given node is a byline. This works by checking whether
2594a node has a rel attribute with "author" as its value, or if "author"
2595is part of its value in the itemprop attribute. If neither is the case then it checks whether the classes and id
2596of the node match a regex of a potential byline. If any condition is met, then the content must be less than 100
2597characters. For that reason, this <p> tag could not be a byline because it's too long.
2598 </p>
2599 <p class="author">A Paperoni maintainer</p>
2600 <p class="authors not-byline"></p>
2601 <p rel="author">Maintainer of Paperoni</p>
2602 </body>
2603 </html>
2604 "#;
2605 let mut doc = Readability::new(html_str);
2606 assert_eq!(&None, &doc.byline);
2607 let p1_node = doc.root_node.select_first("p.byline").unwrap();
2608 let p2_node = doc.root_node.select_first("p.author").unwrap();
2609 let p3_node = doc.root_node.select_first("p.not-byline").unwrap();
2610 let p4_node = doc.root_node.select_first(r#"p[rel="author""#).unwrap();
2611 assert_eq!(
2612 false,
2613 doc.check_byline(p1_node.as_node(), "byline description author")
2614 );
2615 assert_eq!(true, doc.check_byline(p2_node.as_node(), "author"));
2616 assert_eq!(
2617 false,
2618 doc.check_byline(p3_node.as_node(), "authors not-byline")
2619 );
2620 assert_eq!(Some("A Paperoni maintainer".into()), doc.byline);
2621 assert_eq!(false, doc.check_byline(p4_node.as_node(), ""));
2623 }
2624
2625 #[test]
2626 fn test_get_next_node() {
2627 let html_str = r#"
2628 <!DOCTYPE html>
2629 <html>
2630 <body>
2631 <div id="body-child-1">
2632 <p id="start">Foobar content</p>
2633 <div id="start-sib">
2634 <span>First child</span>
2635 </div>
2636 </div>
2637 <div id="body-child-2"><span>This will not be reached</p></div>
2638 <p id="body-child-last">Last element</p>
2639 </body>
2640 </html>
2641 "#;
2642 let doc = Readability::new(html_str);
2643 let node = doc.root_node.select_first("p#start").unwrap();
2644 let next_node = Readability::get_next_node(node.as_node(), false);
2645 assert_eq!(true, next_node.is_some());
2646 let next_node = next_node.unwrap();
2647 let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2648 assert_eq!(Some("start-sib"), next_node_attr.get("id"));
2649
2650 let next_node = Readability::get_next_node(&next_node, false);
2651 assert_eq!(true, next_node.is_some());
2652 let next_node = next_node.unwrap();
2653 assert_eq!("span", &next_node.as_element().unwrap().name.local);
2654
2655 let next_node = Readability::get_next_node(&next_node, false);
2656 assert_eq!(true, next_node.is_some());
2657 let next_node = next_node.unwrap();
2658 let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2659 assert_eq!(Some("body-child-2"), next_node_attr.get("id"));
2660
2661 let next_node = Readability::get_next_node(&next_node, true);
2662 assert_eq!(true, next_node.is_some());
2663 let next_node = next_node.unwrap();
2664 let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2665 assert_eq!(Some("body-child-last"), next_node_attr.get("id"));
2666
2667 let next_node = Readability::get_next_node(&next_node, true);
2668 assert_eq!(None, next_node);
2669 }
2670
2671 #[test]
2672 fn test_remove_and_get_next() {
2673 let html_str = r#"
2674 <!DOCTYPE html>
2675 <html>
2676 <body>
2677 <div id="body-child-1">
2678 <p id="start">Foobar content</p>
2679 <div id="start-sib">
2680 <span>First child</span>
2681 </div>
2682 </div>
2683 <div id="body-child-2"><span>This will not be reached</p></div>
2684 <p id="body-child-last">Last element</p>
2685 </body>
2686 </html>
2687 "#;
2688 let doc = Readability::new(html_str);
2689 let node = doc.root_node.select_first("div#body-child-1").unwrap();
2690 let p_node = Readability::get_next_node(node.as_node(), false).unwrap();
2691 let next_node = Readability::remove_and_get_next(p_node);
2692 assert_eq!(true, next_node.is_some());
2693
2694 let next_node = next_node.unwrap();
2695 let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2696 assert_eq!(Some("start-sib"), next_node_attr.get("id"));
2697
2698 let p_node = doc.root_node.select_first("p#start");
2700 assert_eq!(true, p_node.is_err());
2701 }
2702
2703 #[test]
2704 fn test_has_ancestor_tag() {
2705 let html_str = r#"
2706 <!DOCTYPE html>
2707 <html>
2708 <body>
2709 <div>
2710 <main>
2711 <p>
2712 <span>Target node</span>
2713 </p>
2714 </main>
2715 </div>
2716 </body>
2717 </html>
2718 "#;
2719 let doc = Readability::new(html_str);
2720 let target = doc.root_node.select_first("span").unwrap();
2721 assert_eq!(
2722 true,
2723 Readability::has_ancestor_tag(target.as_node(), "div", None, None)
2724 );
2725 assert_eq!(
2726 false,
2727 Readability::has_ancestor_tag(target.as_node(), "div", Some(1), None)
2728 );
2729 assert_eq!(
2730 false,
2731 Readability::has_ancestor_tag(
2732 target.as_node(),
2733 "div",
2734 Some(5),
2735 Some(|node_ref| {
2736 let node_attrs = node_ref.as_element().unwrap().attributes.borrow();
2737 node_attrs.contains("class")
2738 })
2739 )
2740 );
2741 }
2742
2743 #[test]
2744 fn test_is_element_without_content() {
2745 let html_str = r#"
2746 <!DOCTYPE html>
2747 <html>
2748 <body>
2749 <p>Node with content</p><!-- A comment node which is regarded as not having content -->
2750 <p id="empty"></p>
2751 <div id="contentful">
2752 <p>
2753 <span>Target node</span>
2754 </p>
2755 </div>
2756 <div id="no-content"><br><br><br><br><br><br><hr><hr><br></div>
2757 </body>
2758 </html>
2759 "#;
2760 let doc = Readability::new(html_str);
2761 let target = doc.root_node.select_first("p").unwrap();
2762 assert_eq!(
2763 false,
2764 Readability::is_element_without_content(target.as_node())
2765 );
2766
2767 let target = target.as_node().next_sibling().unwrap();
2768 assert_eq!(true, target.as_comment().is_some());
2769 assert_eq!(false, Readability::is_element_without_content(&target));
2770
2771 let mut target = doc.root_node.select_first("p#empty").unwrap();
2772 assert_eq!(
2773 true,
2774 Readability::is_element_without_content(target.as_node())
2775 );
2776
2777 target = doc.root_node.select_first("div#contentful").unwrap();
2778 assert_eq!(
2779 false,
2780 Readability::is_element_without_content(target.as_node())
2781 );
2782
2783 target = doc.root_node.select_first("div#no-content").unwrap();
2784 assert_eq!(
2785 true,
2786 Readability::is_element_without_content(target.as_node())
2787 );
2788 }
2789
2790 #[test]
2791 fn test_has_single_tag_inside_element() {
2792 let html_str = r#"
2793 <!DOCTYPE html>
2794 <html>
2795 <body>
2796 <p id="one">No element tags here</p>
2797 <p id="two"><span>The p tag has only one tag</span></p>
2798 <p id="three">
2799 <span>Target node</span>
2800 <span>
2801 The parent has multiple children
2802 </span>
2803 </p>
2804 <p id="four">
2805 The text here means this div doesn't have a single tag
2806 <span>Target node</span>
2807 </p>
2808 </body>
2809 </html>
2810 "#;
2811 let doc = Readability::new(html_str);
2812 let mut target = doc.root_node.select_first("p#one").unwrap();
2813 assert_eq!(
2814 false,
2815 Readability::has_single_tag_inside_element(target.as_node(), "span")
2816 );
2817
2818 target = doc.root_node.select_first("p#two").unwrap();
2819 assert_eq!(
2820 true,
2821 Readability::has_single_tag_inside_element(target.as_node(), "span")
2822 );
2823
2824 target = doc.root_node.select_first("p#three").unwrap();
2825 assert_eq!(
2826 false,
2827 Readability::has_single_tag_inside_element(target.as_node(), "span")
2828 );
2829
2830 target = doc.root_node.select_first("p#four").unwrap();
2831 assert_eq!(
2832 false,
2833 Readability::has_single_tag_inside_element(target.as_node(), "span")
2834 );
2835 }
2836
2837 #[test]
2838 fn test_get_inner_text() {
2839 let html_str = r#"
2840 <!DOCTYPE html>
2841 <html>
2842 <body>
2843 <p>The quick brown fox jumps over the lazy dog</p>
2844 </body>
2845 </html>
2846 "#;
2847 let doc = Readability::new(html_str);
2848 let target = doc.root_node.select_first("p").unwrap();
2849 assert_eq!(
2850 49,
2851 Readability::get_inner_text(target.as_node(), Some(false)).len()
2852 );
2853 assert_eq!(
2854 43,
2855 Readability::get_inner_text(target.as_node(), None).len()
2856 );
2857 }
2858
2859 #[test]
2860 fn test_get_link_density() {
2861 let html_str = r#"
2862 <!DOCTYPE html>
2863 <html>
2864 <body>
2865 <p id="one">Zero link density</p>
2866 <p id="two">Link density > 0 <a href="https://www.rust-lang.org/">The Rust home page</a></p>
2867 <p id="three"><a></a><a></a></p>
2868 </body>
2869 </html>
2870 "#;
2871 let doc = Readability::new(html_str);
2872 let mut target = doc.root_node.select_first("p#one").unwrap();
2873 assert_eq!(0_f32, Readability::get_link_density(target.as_node()));
2874
2875 target = doc.root_node.select_first("p#two").unwrap();
2876 assert_eq!(
2877 18_f32 / 35_f32,
2878 Readability::get_link_density(target.as_node())
2879 );
2880
2881 target = doc.root_node.select_first("p#three").unwrap();
2882 assert_eq!(0_f32, Readability::get_link_density(target.as_node()));
2883 }
2884
2885 #[test]
2886 fn test_has_child_block_element() {
2887 let html_str = r#"
2888 <!DOCTYPE html>
2889 <html>
2890 <body>
2891 <p id="one">Has no <span>block level</span> elements</p>
2892 <p id="two">Link density > 0 <a href="https://www.rust-lang.org/">The Rust home page</a></p>
2893 <div id="three">
2894 <p>This is a block level element</p>
2895 </div>
2896 </body>
2897 </html>
2898 "#;
2899 let doc = Readability::new(html_str);
2900 let mut target = doc.root_node.select_first("p#one").unwrap();
2901 assert_eq!(
2902 false,
2903 Readability::has_child_block_element(target.as_node())
2904 );
2905
2906 target = doc.root_node.select_first("p#two").unwrap();
2907 assert_eq!(
2908 false,
2909 Readability::has_child_block_element(target.as_node())
2910 );
2911
2912 target = doc.root_node.select_first("div#three").unwrap();
2913 assert_eq!(true, Readability::has_child_block_element(target.as_node()));
2914 }
2915
2916 #[test]
2917 fn test_get_node_ancestors() {
2918 let html_str = r#"
2919 <!DOCTYPE html>
2920 <html>
2921 <body>
2922 <section>
2923 <div>
2924 <p><span></span></p>
2925 </div>
2926 </section>
2927 </body>
2928 </html>
2929 "#;
2930 let doc = Readability::new(html_str);
2931 let mut target = doc.root_node.select_first("span").unwrap();
2932 assert_eq!(
2933 1,
2934 Readability::get_node_ancestors(target.as_node(), None).len()
2935 );
2936 assert_eq!(
2937 3,
2938 Readability::get_node_ancestors(target.as_node(), Some(3)).len()
2939 );
2940 assert_eq!(
2941 5,
2942 Readability::get_node_ancestors(target.as_node(), Some(5)).len()
2943 );
2944 assert_eq!(
2945 6,
2946 Readability::get_node_ancestors(target.as_node(), Some(200)).len()
2947 );
2948
2949 target = doc.root_node.select_first("html").unwrap();
2950 assert_eq!(
2951 1,
2952 Readability::get_node_ancestors(target.as_node(), Some(4)).len()
2953 );
2954 }
2955
2956 #[test]
2957 fn test_get_class_weight() {
2958 let html_str = r#"
2959 <!DOCTYPE html>
2960 <html>
2961 <body>
2962 <div id="blog" class="main">
2963 <h1 class="hidden">Up next...</h1>
2964 <p id="story">A story is told...</p>
2965 </div>
2966 <div id="comments">
2967 Tell us what you think
2968 <p class="comment">Great read...</p>
2969 </div>
2970 </body>
2971 </html>
2972 "#;
2973 let doc = Readability::new(html_str);
2974 let mut target = doc.root_node.select_first("body").unwrap();
2975 assert_eq!(0, doc.get_class_weight(target.as_node()));
2976
2977 target = doc.root_node.select_first("div#blog").unwrap();
2978 assert_eq!(50, doc.get_class_weight(target.as_node()));
2979
2980 target = doc.root_node.select_first("h1.hidden").unwrap();
2981 assert_eq!(-25, doc.get_class_weight(target.as_node()));
2982
2983 target = doc.root_node.select_first("p#story").unwrap();
2984 assert_eq!(25, doc.get_class_weight(target.as_node()));
2985
2986 target = doc.root_node.select_first("div#comments").unwrap();
2987 assert_eq!(-25, doc.get_class_weight(target.as_node()));
2988
2989 target = doc.root_node.select_first("p.comment").unwrap();
2990 assert_eq!(-25, doc.get_class_weight(target.as_node()));
2991 }
2992
2993 #[test]
2994 fn test_initialize_node() {
2995 let html_str = r#"
2996 <!DOCTYPE html>
2997 <html>
2998 <body>
2999 <div id="blog" class="main">
3000 <h1 class="hidden">Up next...</h1>
3001 <p id="story">A story is told...</p>
3002 </div>
3003 <div id="comments">
3004 Tell us what you think
3005 <pre class="comment">Great read...</pre>
3006 </div>
3007 </body>
3008 </html>
3009 "#;
3010 let doc = Readability::new(html_str);
3011 let mut target = doc.root_node.select_first("div#blog").unwrap();
3012 let mut node = target.as_node().clone();
3013 doc.initialize_node(&mut node);
3014 let node_attrs = node.as_element().unwrap().attributes.borrow();
3015 assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE));
3016
3017 target = doc.root_node.select_first("h1.hidden").unwrap();
3018 let mut node = target.as_node().clone();
3019 doc.initialize_node(&mut node);
3020 let node_attrs = node.as_element().unwrap().attributes.borrow();
3021 assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE));
3022
3023 target = doc.root_node.select_first("p#story").unwrap();
3024 let mut node = target.as_node().clone();
3025 doc.initialize_node(&mut node);
3026 let node_attrs = node.as_element().unwrap().attributes.borrow();
3027 assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE));
3028
3029 target = doc.root_node.select_first("div#comments").unwrap();
3030 let mut node = target.as_node().clone();
3031 doc.initialize_node(&mut node);
3032 let node_attrs = node.as_element().unwrap().attributes.borrow();
3033 assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE));
3034
3035 target = doc.root_node.select_first("pre.comment").unwrap();
3036 let mut node = target.as_node().clone();
3037 doc.initialize_node(&mut node);
3038 let node_attrs = node.as_element().unwrap().attributes.borrow();
3039 assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE));
3040 }
3041
3042 #[test]
3043 fn test_get_row_and_column_count() {
3044 let html_str = r#"
3045 <!DOCTYPE html>
3046 <html>
3047 <body>
3048 <table>
3049 <tbody>
3050 <tr>
3051 <td> </td><td> </td><td> </td><td> </td>
3052 </tr>
3053 <tr>
3054 <td> </td><td> </td><td> </td><td rowspan="2"> </td>
3055 </tr>
3056 <tr>
3057 <td> </td><td> </td><td> </td>
3058 </tr>
3059 <tr>
3060 <td> </td><td colspan="2"> </td><td> </td>
3061 </tr>
3062 <tr>
3063 <td> </td><td> </td><td> </td><td> </td>
3064 </tr>
3065 <tr>
3066 <td colspan="4"> </td>
3067 </tr>
3068 </tbody>
3069 </table>
3070 </body>
3071 </html>
3072 "#;
3073 let doc = Readability::new(html_str);
3074 let target = doc.root_node.select_first("table").unwrap();
3075 assert_eq!(
3076 SizeInfo {
3077 rows: 6,
3078 columns: 4
3079 },
3080 Readability::get_row_and_column_count(target.as_node())
3081 );
3082 }
3083
3084 #[test]
3085 fn test_mark_data_tables() {
3086 let html_str = r#"
3087 <!DOCTYPE html>
3088 <html>
3089 <body>
3090 <table id="one"></table>
3091 <table width="100%" border="0" id="two">
3092 <tr valign="top">
3093 <td width="20%">Left</td>
3094 <td height="200" width="60%">Main</td>
3095 <td width="20%">Right</td>
3096 </tr>
3097 </table>
3098 <table id="three">
3099 <caption>Monthly savings</caption>
3100 <tr>
3101 <th>Month</th>
3102 <th>Savings</th>
3103 </tr>
3104 <tr>
3105 <td>January</td>
3106 <td>$100</td>
3107 </tr>
3108 <tr>
3109 <td>February</td>
3110 <td>$50</td>
3111 </tr>
3112 </table>
3113 <table id="four">
3114 <tbody>
3115 <tr>
3116 <td> </td><td> </td><td> </td><td> </td>
3117 </tr>
3118 <tr>
3119 <td> </td><td> </td><td> </td><td rowspan="2"> </td>
3120 </tr>
3121 <tr>
3122 <td> </td><td> </td><td> </td>
3123 </tr>
3124 <tr>
3125 <td> </td><td colspan="2"> </td><td> </td>
3126 </tr>
3127 <tr>
3128 <td> </td><td> </td><td> </td><td> </td>
3129 </tr>
3130 <tr>
3131 <td colspan="4"> </td>
3132 </tr>
3133 </tbody>
3134 </table>
3135 <table id="five">
3136 <table>
3137 <tbody>
3138 <tr>
3139 <td> </td><td> </td><td> </td><td> </td>
3140 </tr>
3141 <tr>
3142 <td> </td><td> </td><td> </td><td rowspan="2"> </td>
3143 </tr>
3144 <tr>
3145 <td> </td><td> </td><td> </td>
3146 </tr>
3147 <tr>
3148 <td> </td><td colspan="2"> </td><td> </td>
3149 </tr>
3150 <tr>
3151 <td> </td><td> </td><td> </td><td> </td>
3152 </tr>
3153 <tr>
3154 <td colspan="4"> </td>
3155 </tr>
3156 </tbody>
3157 </table>
3158 </table>
3159 </body>
3160 </html>
3161 "#;
3162 let mut doc = Readability::new(html_str);
3163 doc.mark_data_tables();
3164 let target = doc.root_node.select_first("table#one").unwrap();
3165 let target_attr = target.attributes.borrow();
3166 assert_eq!(Some("false"), target_attr.get("readability-data-table"));
3167
3168 let target = doc.root_node.select_first("table#two").unwrap();
3169 let target_attr = target.attributes.borrow();
3170 assert_eq!(Some("false"), target_attr.get("readability-data-table"));
3171
3172 let target = doc.root_node.select_first("table#three").unwrap();
3173 let target_attr = target.attributes.borrow();
3174 assert_eq!(Some("true"), target_attr.get("readability-data-table"));
3175
3176 let target = doc.root_node.select_first("table#four").unwrap();
3177 let target_atrr = target.attributes.borrow();
3178 assert_eq!(Some("true"), target_atrr.get("readability-data-table"));
3179
3180 let target = doc.root_node.select_first("table#five").unwrap();
3181 let target_atrr = target.attributes.borrow();
3182 assert_eq!(Some("false"), target_atrr.get("readability-data-table"));
3183 }
3184
3185 #[test]
3186 fn test_fix_lazy_images() {
3187 let html_str = r#"
3188 <!DOCTYPE html>
3189 <html>
3190 <body>
3191 <img id="svg-uri" alt="Basketball" src="" />
3192 <img id="normal-src" src="./foo.jpg">
3193 <img id="gif-uri" src="" alt="star" width="16" height="16">
3194 <img id="gif-uri-remove-src" data-src="./not-real-gif.png" src="" alt="star" width="16" height="16">
3195 <img id="lazy-loaded" class="lazy" src="placeholder.jpg" data-src="./720x640.jpg">
3196 <picture>
3197 <source media="(min-width:650px)" srcset="img_pink_flowers.jpg">
3198 <source media="(min-width:465px)" srcset="img_white_flower.jpg">
3199 <img src="img_orange_flowers.jpg" alt="Flowers" style="width:auto;">
3200 </picture>
3201 <img id="no-lazy-class" src="https://image.url/" data-attrs="{"src":"https://other.url/1.png","alt":""}"/>
3202 </body>
3203 </html>
3204 "#;
3205 let doc = Readability::new(html_str);
3206 let svg_uri = doc.root_node.select_first("#svg-uri").unwrap();
3207 let normal_src = doc.root_node.select_first("#normal-src").unwrap();
3208 let gif_uri = doc.root_node.select_first("#gif-uri").unwrap();
3209 let picture = doc.root_node.select_first("picture").unwrap();
3210 Readability::fix_lazy_images(&mut doc.root_node.clone());
3211 assert_eq!(svg_uri, doc.root_node.select_first("#svg-uri").unwrap());
3212 assert_eq!(
3213 normal_src,
3214 doc.root_node.select_first("#normal-src").unwrap()
3215 );
3216 assert_eq!(gif_uri, doc.root_node.select_first("#gif-uri").unwrap());
3217 assert_eq!(picture, doc.root_node.select_first("picture").unwrap());
3218
3219 let gif_uri_remove_src = doc.root_node.select_first("#gif-uri-remove-src").unwrap();
3220 let gif_uri_remove_src_attrs = gif_uri_remove_src.attributes.borrow();
3221 assert_eq!(
3222 gif_uri_remove_src_attrs.get("data-src"),
3223 gif_uri_remove_src_attrs.get("src")
3224 );
3225 let lazy_loaded = doc.root_node.select_first("#lazy-loaded").unwrap();
3226 let lazy_loaded_attrs = lazy_loaded.attributes.borrow();
3227 assert_eq!(
3228 lazy_loaded_attrs.get("data-src"),
3229 lazy_loaded_attrs.get("src")
3230 );
3231
3232 let no_lazy_class = doc.root_node.select_first("#no-lazy-class").unwrap();
3233 let no_lazy_class_attrs = no_lazy_class.attributes.borrow();
3234 assert_eq!(
3235 no_lazy_class_attrs.get("src").unwrap(),
3236 "https://image.url/"
3237 );
3238 }
3239
3240 #[test]
3241 fn test_clean_conditionally() {
3242 let html_str = r#"
3243 <!DOCTYPE html>
3244 <html>
3245 <body>
3246 <table id="data-table">
3247 <caption>Monthly savings</caption>
3248 <tr>
3249 <th>Month</th>
3250 <th>Savings</th>
3251 </tr>
3252 <tr>
3253 <td>January</td>
3254 <td>$100</td>
3255 </tr>
3256 <tr>
3257 <td>February</td>
3258 <td>$50</td>
3259 </tr>
3260 </table>
3261 <table width="100%" border="0" id="display-table">
3262 <tr valign="top">
3263 <td width="20%">Left</td>
3264 <td height="200" width="60%">Main</td>
3265 <td width="20%">Right</td>
3266 </tr>
3267 </table>
3268 <table width="100%" border="0" id="display-table-removed" class="comment">
3269 <tr valign="top">
3270 <td width="40%">One</td>
3271 <td width="60%">Two</td>
3272 </tr>
3273 </table>
3274 <div class="comment">
3275 <p>The parent div will be deleted due to negative weight classes</p>
3276 </div>
3277 <div id="some-content">
3278 The days of the week: Mon, Tue, Wed, Thur, Fri, Sat, Sun.
3279 The months of the year: Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Oct, Nov, Dec.
3280 </div>
3281 <div id="embeds">
3282 <iframe width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ"></iframe>
3283 </div>
3284 <div id="footer">
3285 <p>Check out more articles</p>
3286 <ul>
3287 <li><img src="article.jpg"><p>Article 1</p></li>
3288 <li><img src="article.jpg"><p>Article 2</p></li>
3289 <li><img src="article.jpg"><p>Article 3</p></li>
3290 </ul>
3291 </div>
3292 </body>
3293 </html>
3294 "#;
3295 let mut doc = Readability::new(html_str);
3296 let body = doc.root_node.select_first("body").unwrap();
3297 doc.mark_data_tables();
3298 doc.clean_conditionally(&mut body.as_node().clone(), "table");
3299 assert_eq!(true, doc.root_node.select_first("#data-table").is_ok());
3300 assert_eq!(false, doc.root_node.select_first("#display-table").is_ok());
3301 assert_eq!(
3302 false,
3303 doc.root_node.select_first("#display-table-removed").is_ok()
3304 );
3305 doc.clean_conditionally(&mut body.as_node().clone(), "div");
3306 assert_eq!(false, doc.root_node.select_first("div.comment").is_ok());
3307 assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok());
3308 assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok());
3309 assert_eq!(false, doc.root_node.select_first("div#footer").is_ok());
3310 }
3311
3312 #[test]
3313 fn test_clean() {
3314 let html_str = r#"
3315 <!DOCTYPE html>
3316 <html>
3317 <body>
3318 <pre>A Paperoni test</pre>
3319 <iframe width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ">
3320 </iframe>
3321 <iframe src="https://www.rust-lang.org/" name="rust_iframe" height="300px" width="100%" title="Rustlang Homepage">
3322 </iframe>
3323 <iframe src="https://crates.io/" name="crates_iframe" height="300px" width="100%" title="Crates.io Homepage">
3324 </iframe>
3325 <pre></pre>
3326 </body>
3327 </html>
3328 "#;
3329 let doc = Readability::new(html_str);
3330 Readability::clean(&mut doc.root_node.clone(), "pre");
3331 let pre_count = doc.root_node.select("pre").unwrap().count();
3332 assert_eq!(0, pre_count);
3333
3334 Readability::clean(&mut doc.root_node.clone(), "iframe");
3335 let iframe_count = doc.root_node.select("iframe").unwrap().count();
3336 assert_eq!(1, iframe_count);
3337 let iframe = doc.root_node.select_first("iframe").unwrap();
3338 let iframe_attrs = iframe.attributes.borrow();
3339 assert_eq!(
3340 Some("https://www.youtube.com/embed/dQw4w9WgXcQ"),
3341 iframe_attrs.get("src")
3342 );
3343 }
3344
3345 #[test]
3346 fn test_clean_headers() {
3347 let html_str = r#"
3348 <!DOCTYPE html>
3349 <html>
3350 <body>
3351 <h1 class="tags">#blog, #rust</h1>
3352 <h2>A blog in Rust</h2>
3353 <p>Foo bar baz quux</p>
3354 <h1 class="footer">Copyright info</h1>
3355 </body>
3356 </html>
3357 "#;
3358 let doc = Readability::new(html_str);
3359 let body = doc.root_node.select_first("body").unwrap();
3360 let h1_count = doc.root_node.select("h1").unwrap().count();
3361 let h2_count = doc.root_node.select("h2").unwrap().count();
3362 assert_eq!(2, h1_count);
3363 assert_eq!(1, h2_count);
3364 doc.clean_headers(&mut body.as_node().clone());
3365 let h1_count = doc.root_node.select("h1").unwrap().count();
3366 let h2_count = doc.root_node.select("h2").unwrap().count();
3367 assert_eq!(0, h1_count);
3368 assert_eq!(1, h2_count);
3369 }
3370
3371 #[test]
3372 fn test_clean_styles() {
3373 let html_str = r#"
3374 <!DOCTYPE html>
3375 <html>
3376 <body>
3377 <div style="color:red; padding: 10px" id="red">A red box</div>
3378 <div height="100px" style="color:blue; padding: 10px" id="blue">
3379 A blue box
3380 </div>
3381 <svg width="100" height="100">
3382 <circle cx="50" cy="50" r="40" fill="green" />
3383 </svg>
3384 <table width="100%" bgcolor="yellow">
3385 <tr>
3386 <th>Col 1</th>
3387 <th>Col 2</th>
3388 </tr>
3389 </table>
3390 </body>
3391 </html>
3392 "#;
3393 let doc = Readability::new(html_str);
3394 Readability::clean_styles(&mut doc.root_node.clone());
3395 let red_div = doc.root_node.select_first("#red").unwrap();
3396 let blue_div = doc.root_node.select_first("#blue").unwrap();
3397 let svg = doc.root_node.select_first("svg").unwrap();
3398 let table = doc.root_node.select_first("table").unwrap();
3399
3400 let red_div_attrs = red_div.attributes.borrow();
3401 let blue_div_attrs = blue_div.attributes.borrow();
3402 let svg_attrs = svg.attributes.borrow();
3403 let table_attrs = table.attributes.borrow();
3404
3405 assert_eq!(1, red_div_attrs.map.len());
3406 assert_eq!(false, red_div_attrs.contains("style"));
3407 assert_eq!(2, blue_div_attrs.map.len());
3408 assert_eq!(false, blue_div_attrs.contains("style"));
3409 assert_eq!(true, blue_div_attrs.contains("height"));
3410 assert_eq!(2, svg_attrs.map.len());
3411 assert_eq!(0, table_attrs.map.len());
3412 }
3413
3414 #[test]
3415 fn test_clean_matched_nodes() {
3416 let html_str = r#"
3417 <!DOCTYPE html>
3418 <html>
3419 <body>
3420 <p class="example">In Rust you can have 3 kinds of variables</p>
3421 <ul>
3422 <li class="example">Immutable</li>
3423 <li class="example">Mutable</li>
3424 <li class="example">Constant</li>
3425 </ul>
3426 <p>Onto more tests</p>
3427 </body>
3428 </html>
3429 "#;
3430 let doc = Readability::new(html_str);
3431 let body = doc.root_node.select_first("body").unwrap();
3432 Readability::clean_matched_nodes(&mut body.as_node().clone(), |node_ref, match_str| {
3433 &node_ref.as_element().unwrap().name.local == "li" && match_str.contains("example")
3434 });
3435 let p_count = doc.root_node.select("p").unwrap().count();
3436 let li_count = doc.root_node.select("li").unwrap().count();
3437 assert_eq!(2, p_count);
3438 assert_eq!(0, li_count);
3439 }
3440
3441 #[test]
3442 fn test_prep_article() {
3443 let html_str = r#"
3444 <!DOCTYPE html>
3445 <html>
3446 <head>
3447 <title>A test HTML file</title>
3448 </head>
3449 <body>
3450 <h2>A test HTML file</h2>
3451 <div class="search">
3452 Search for other posts
3453 <input type="search" placeholder="Type here...">
3454 <button id="search-btn">Search</button>
3455 </div>
3456 <aside>Some content aside</aside>
3457 <h1>A h1 tag</h1>
3458 <h1 class="banner">A h1 tag to be removed</h1>
3459 <table id="tbl-one"></table>
3460 <table width="100%" border="0" id="tbl-two">
3461 <tr valign="top">
3462 <td width="20%">Left</td>
3463 <td height="200" width="60%">Main Content of the system</td>
3464 <td width="20%">Right</td>
3465 </tr>
3466 </table>
3467 <div style="color:red; padding: 10px" id="red">A red box</div>
3468 <div height="100px" style="color:blue; padding: 10px" id="blue">
3469 A blue box
3470 </div>
3471 <svg width="100" height="100">
3472 <circle cx="50" cy="50" r="40" fill="green" />
3473 </svg>
3474 <ul>
3475 <li>one</li>
3476 <li>two</li>
3477 <li>three</li>
3478 </ul>
3479 <object data="obj.html" width="500" height="200"></object>
3480 <table id="tbl-three">
3481 <caption>Monthly savings</caption>
3482 <tr>
3483 <th>Month</th>
3484 <th>Savings</th>
3485 </tr>
3486 <tr>
3487 <td>January</td>
3488 <td>$100</td>
3489 </tr>
3490 <tr>
3491 <td>February</td>
3492 <td>$50</td>
3493 </tr>
3494 </table>
3495 <iframe id="yt" width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ">
3496 </iframe>
3497 <div id="foo">
3498 <form action="">
3499 <fieldset>
3500 <legend>Personal details:</legend>
3501 <label for="fname">First name:</label>
3502 <input type="text" id="fname" name="fname"><br><br>
3503 <label for="lname">Last name:</label>
3504 <input type="text" id="lname" name="lname"><br><br>
3505 </fieldset>
3506 </form>
3507 <br>
3508 <p id="p-link">
3509 omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus.
3510 <a href="\#">Rerum repellat totam quam nobis harum fuga consequatur</a>
3511 corrupti?
3512 </p>
3513 <br>
3514 <iframe src="https://www.rust-lang.org/" name="rust_iframe" height="300px" width="100%" title="Rustlang Homepage">
3515 </iframe>
3516 </div>
3517 <iframe src="https://crates.io/" name="crates_iframe" height="300px" width="100%" title="Crates.io Homepage">
3518 </iframe>
3519 <table id="tbl-replace-p">
3520 <tr valign="top">
3521 <td width="20%" id="td-to-p"><span>One cell table. This is going to be replaced</span></td>
3522 </tr>
3523 </table>
3524 <embed type="video/webm" src="video.mp4" width="400" height="300">
3525 <br>
3526 <embed type="image/jpg" src="foo.jpg" width="300" height="200">
3527 <div>
3528 <form action="">
3529 <div>
3530 <label>Join our newsletter</label>
3531 <input type="email" placeholder="Your email address">
3532 </div>
3533 <button>Sign up</button>
3534 </form>
3535 </div>
3536 <div id="div-p">
3537 <p class="share">Share this as a <a href="\#">Tweet</a></p>
3538 <br>
3539 <p id="share">
3540 Lorem ipsum dolor, sit amet consectetur adipisicing elit. Minima quia numquam aperiam dolores ipsam, eos perferendis cupiditate adipisci perspiciatis
3541 dolore, sunt, iusto nobis? Nulla molestiae id repellat quibusdam nobis quia. Lorem ipsum dolor sit amet consectetur, adipisicing elit. Voluptas
3542 laudantium omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus. Rerum repellat totam quam nobis harum fuga consequatur
3543 corrupti? Lorem ipsum dolor sit amet consectetur, adipisicing elit. Iure excepturi accusamus nemo voluptatibus laborum minus dicta blanditiis totam
3544 aperiam velit amet cupiditate hic a molestias odio nam, fugiat facere iusto.
3545 </p>
3546 </div>
3547 <table id="tbl-replace-div">
3548 <tr>
3549 <td id="td-to-div"><pre>One cell table. This is going to be replaced</pre></td>
3550 </tr>
3551 </table>
3552 <footer>A Paperoni test</footer>
3553 <footer>Copyright 2020</footer>
3554 </body>
3555 </html>
3556 "#;
3557 let mut doc = Readability::new(html_str);
3558 doc.article_title = "A test HTML file".into();
3559 let body = doc.root_node.select_first("body").unwrap();
3560 doc.prep_article(&mut body.as_node().clone());
3561
3562 let table_node = doc.root_node.select_first("table").unwrap();
3564 let node_attr = table_node.attributes.borrow();
3565 assert_eq!(true, node_attr.get("readability-data-table").is_some());
3566
3567 let forms_and_fieldsets = doc.root_node.select("form, fieldset").unwrap();
3568 assert_eq!(0, forms_and_fieldsets.count());
3569
3570 let nodes = doc
3571 .root_node
3572 .select("h1, object, embed, footer, link, aside")
3573 .unwrap();
3574 assert_eq!(0, nodes.count());
3575
3576 assert_eq!(2, doc.root_node.select("p").unwrap().count());
3577 assert_eq!(true, doc.root_node.select_first("p.share").is_err());
3578 assert_eq!(true, doc.root_node.select_first("p#share").is_ok());
3579 assert_eq!(true, doc.root_node.select_first("p#td-to-p").is_ok());
3580
3581 let node = doc.root_node.select_first("h2");
3582 assert_eq!(true, node.is_err());
3583
3584 let nodes = doc
3585 .root_node
3586 .select("input, textarea, select, button")
3587 .unwrap();
3588 assert_eq!(0, nodes.count());
3589
3590 let nodes = doc.root_node.select("iframe").unwrap();
3591 assert_eq!(1, nodes.count());
3592 let node = doc.root_node.select_first("iframe#yt");
3593 assert_eq!(true, node.is_ok());
3594
3595 let nodes = doc.root_node.select("h1").unwrap();
3596 assert_eq!(0, nodes.count());
3597
3598 let nodes = doc
3599 .root_node
3600 .select("#tbl-one, #tbl-replace-p, #tbl-replace-div")
3601 .unwrap();
3602 assert_eq!(0, nodes.count());
3603
3604 let tables = doc.root_node.select("#tbl-two, #tbl-three").unwrap();
3605 assert_eq!(2, tables.count());
3606
3607 assert_eq!(true, doc.root_node.select_first("ul").is_ok());
3608
3609 assert_eq!(2, doc.root_node.select("div").unwrap().count());
3610 assert_eq!(true, doc.root_node.select_first("div#div-p").is_ok());
3611 assert_eq!(true, doc.root_node.select_first("div#td-to-div").is_ok());
3612
3613 assert_eq!(1, doc.root_node.select("br").unwrap().count());
3614 let node_ref = doc.root_node.select_first("br").unwrap();
3615 assert_eq!(
3616 "div",
3617 &node_ref
3618 .as_node()
3619 .following_siblings()
3620 .elements()
3621 .next()
3622 .unwrap()
3623 .name
3624 .local
3625 );
3626 }
3627
3628 #[test]
3629 fn test_get_article_title() {
3630 let mut html_str = r#"
3631 <!DOCTYPE html>
3632 <html>
3633 <head>
3634 <title>Porting Readability to Rust</title>
3635 </head>
3636 <body>
3637 <p></p>
3638 </body>
3639 </html>
3640 "#;
3641 let doc = Readability::new(html_str);
3642 assert_eq!("Porting Readability to Rust", doc.get_article_title());
3643
3644 html_str = r#"
3645 <!DOCTYPE html>
3646 <html>
3647 <head>
3648 <title>Crates.io: The Rust package repository</title>
3649 </head>
3650 <body>
3651 <p></p>
3652 </body>
3653 </html>
3654 "#;
3655 let doc = Readability::new(html_str);
3656 assert_eq!(
3657 "Crates.io: The Rust package repository",
3658 doc.get_article_title()
3659 );
3660
3661 html_str = r#"
3662 <!DOCTYPE html>
3663 <html>
3664 <head>
3665 <title>Crates.io: The Rust package repository</title>
3666 </head>
3667 <body>
3668 <h1>Crates.io: The Rust package repository</h1>
3669 </body>
3670 </html>
3671 "#;
3672 let doc = Readability::new(html_str);
3673 assert_eq!(
3674 "Crates.io: The Rust package repository",
3675 doc.get_article_title()
3676 );
3677
3678 html_str = r#"
3679 <!DOCTYPE html>
3680 <html>
3681 <head>
3682 <title>Crates.io: A package repository</title>
3683 </head>
3684 <body>
3685 <h1>Crates.io: A Rust package repository</h1>
3686 </body>
3687 </html>
3688 "#;
3689 let doc = Readability::new(html_str);
3690 assert_eq!("Crates.io: A package repository", doc.get_article_title());
3691
3692 html_str = r#"
3693 <!DOCTYPE html>
3694 <html>
3695 <head>
3696 <title>Foo developer \ Blog</title>
3697 </head>
3698 <body>
3699 <p></p>
3700 </body>
3701 </html>
3702 "#;
3703 let doc = Readability::new(html_str);
3704 assert_eq!("Foo developer \\ Blog", doc.get_article_title());
3705
3706 html_str = r#"
3707 <!DOCTYPE html>
3708 <html>
3709 <head>
3710 <title>Foo developer » Blog Post on Foo bar stuff</title>
3711 </head>
3712 <body>
3713 <p></p>
3714 </body>
3715 </html>
3716 "#;
3717 let doc = Readability::new(html_str);
3718 assert_eq!("Blog Post on Foo bar stuff", doc.get_article_title());
3719
3720 html_str = r#"
3721 <!DOCTYPE html>
3722 <html>
3723 <head>
3724 <title>Blog</title>
3725 </head>
3726 <body>
3727 <h1>Getting started with Rust</h1>
3728 </body>
3729 </html>
3730 "#;
3731 let doc = Readability::new(html_str);
3732 assert_eq!("Blog", doc.get_article_title());
3733 }
3734
3735 #[test]
3736 fn test_unescape_html_entities() {
3737 let mut input = "Therefore, 5 > 3".to_string();
3738 Readability::unescape_html_entities(&mut input);
3739 assert_eq!("Therefore, 5 > 3", &input);
3740 input = "Logical AND (&&)".to_string();
3741 Readability::unescape_html_entities(&mut input);
3742 assert_eq!("Logical AND (&&)", &input);
3743 input = "u + e = ü".to_string();
3744 Readability::unescape_html_entities(&mut input);
3745 assert_eq!("u + e = ü", input);
3746 input = "Řŭšţ".to_string();
3747 Readability::unescape_html_entities(&mut input);
3748 assert_eq!("Řŭšţ", input);
3749 }
3750
3751 #[test]
3752 fn test_get_article_metadata() {
3753 let mut html_str = r#"
3754 <!DOCTYPE html>
3755 <html>
3756 <head>
3757 <meta charset="utf-8"/>
3758 <meta name="description" content="A post on how hard it is to work with text."/>
3759 <meta name="viewport" content="width=device-width"/>
3760 <title>Foo Coder / Blog on the difficulty of using utf-8</title>
3761 <meta name="author" content="Foo Coder"/>
3762 </head>
3763 <body></body>
3764 </html>
3765 "#;
3766 let doc = Readability::new(html_str);
3767 let mut result = MetaData::new();
3768 result.byline = Some("Foo Coder".to_string());
3769 result.excerpt = Some("A post on how hard it is to work with text.".to_string());
3770 result.title = "Blog on the difficulty of using utf-8".to_string();
3771 assert_eq!(result, doc.get_article_metadata());
3772
3773 html_str = r#"
3774 <!DOCTYPE html>
3775 <html>
3776 <head>
3777 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
3778 <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" user-scalable="no" />
3779 <meta name="title" content="A Long Title" />
3780 <meta name="description" content="Foo bar baz boß" />
3781 <meta property="og:site_name" content="Blog Place" />
3782 <meta property="og:title" content="A Longer Title" />
3783 <meta property="og:description" content="Foo bar baz boß" />
3784 <meta name="author" content="Föo Coder" />
3785 <meta name="dc:creator" content="Föo Coder" />
3786 <meta name="twitter:card" content="summary_large_image" />
3787 <title>The Longest Title</title>
3788 </head>
3789 </html>
3790 "#;
3791 let doc = Readability::new(html_str);
3792 result = MetaData::new();
3793 result.byline = Some("Föo Coder".to_string());
3794 result.excerpt = Some("Foo bar baz boß".to_string());
3795 result.site_name = Some("Blog Place".to_string());
3796 result.title = "A Longer Title".to_string();
3797 assert_eq!(result, doc.get_article_metadata());
3798 }
3799
3800 #[test]
3801 fn test_fix_relative_uris() {
3802 let html_str = r##"
3803 <!DOCTYPE html>
3804 <html>
3805 <body>
3806 <h1><a href="../home.html">Go back</a></h1>
3807 <img id="ex-1" src="https://example.image.com/images/1.jpg" alt="Ex 1">
3808 <img id="ex-2" src="https://example.image.com/images/2.jpg" alt="Ex 2">
3809 <img id="ex-3" src="../images/2.jpg" alt="Ex 3">
3810 <img id="ex-4" src="./images/1.jpg" alt="Ex 4">
3811 <img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
3812 <img id="ex-6" src="/images/1.jpg" alt="Ex 6">
3813 <p><a href="#ex-1">First image</a></p>
3814 </body>
3815 </html>
3816 "##;
3817 let mut doc = Readability::new(html_str);
3818 doc.article_node = doc
3819 .root_node
3820 .select_first("body")
3821 .ok()
3822 .map(|node_ref| node_ref.as_node().clone());
3823 doc.fix_relative_uris("https://example.image.com/blog/");
3824
3825 let node = doc.root_node.select_first("img#ex-1").unwrap();
3826 let node_attrs = node.attributes.borrow();
3827 assert_eq!(
3828 Some("https://example.image.com/images/1.jpg"),
3829 node_attrs.get("src")
3830 );
3831
3832 let node = doc.root_node.select_first("img#ex-2").unwrap();
3833 let node_attrs = node.attributes.borrow();
3834 assert_eq!(
3835 Some("https://example.image.com/images/2.jpg"),
3836 node_attrs.get("src")
3837 );
3838
3839 let node = doc.root_node.select_first("img#ex-3").unwrap();
3840 let node_attrs = node.attributes.borrow();
3841 assert_eq!(
3842 Some("https://example.image.com/images/2.jpg"),
3843 node_attrs.get("src")
3844 );
3845
3846 let node = doc.root_node.select_first("img#ex-4").unwrap();
3847 let node_attrs = node.attributes.borrow();
3848 assert_eq!(
3849 Some("https://example.image.com/blog/images/1.jpg"),
3850 node_attrs.get("src")
3851 );
3852
3853 let node = doc.root_node.select_first("img#ex-5").unwrap();
3854 let node_attrs = node.attributes.borrow();
3855 assert_eq!(
3856 Some("https://images.com/images/1.jpg"),
3857 node_attrs.get("src")
3858 );
3859
3860 let node = doc.root_node.select_first("img#ex-6").unwrap();
3861 let node_attrs = node.attributes.borrow();
3862 assert_eq!(
3863 Some("https://example.image.com/images/1.jpg"),
3864 node_attrs.get("src")
3865 );
3866
3867 let node = doc.root_node.select_first("p a").unwrap();
3868 let node_attrs = node.attributes.borrow();
3869 assert_eq!(Some("#ex-1"), node_attrs.get("href"));
3870
3871 let node = doc.root_node.select_first("h1 a").unwrap();
3872 let node_attrs = node.attributes.borrow();
3873 assert_eq!(
3874 Some("https://example.image.com/home.html"),
3875 node_attrs.get("href")
3876 );
3877 }
3878
3879 #[test]
3880 fn test_clean_classes() {
3881 let html_str = r#"
3883 <!DOCTYPE html>
3884 <html>
3885 <body>
3886 <p class="a b c d">One</p>
3887 <p class="b c d e">Two</p>
3888 <div class="a b c div">Three</div>
3889 <div class="b c d e">Four</div>
3890 <ul class="a b c d">
3891 <li class="a b c d">One</li>
3892 <li class="b c d e">Two</li>
3893 <li class="b c d e">Three</li>
3894 </ul>
3895 </body>
3896 </html>
3897 "#;
3898 let mut doc = Readability::new(html_str);
3899 doc.article_node = doc
3900 .root_node
3901 .select_first("body")
3902 .ok()
3903 .map(|node_ref| node_ref.as_node().clone());
3904 doc.clean_classes();
3905
3906 assert_eq!(
3907 true,
3908 doc.root_node
3909 .inclusive_descendants()
3910 .elements()
3911 .all(|node_elem| {
3912 let node_attrs = node_elem.attributes.borrow();
3913 !node_attrs.contains("class")
3914 })
3915 );
3916 }
3917
3918 #[test]
3919 fn test_clean_readability_attrs() {
3920 let html_str = r#"
3921 <!DOCTYPE html>
3922 <html>
3923 <body>
3924 <div readability-score="0.921487">
3925 <p readability-score="0.8102">Welcome to this awesome blog post. Only good content is here. No spam.</p>
3926 <p readability-score="0.6004">Let's look at some statistics</p>
3927 <table readability-score="0.719275" readability-data-table="true">
3928 <caption>Monthly savings</caption>
3929 <tr>
3930 <th>Month</th>
3931 <th>Savings</th>
3932 </tr>
3933 <tr>
3934 <td>January</td>
3935 <td>$100</td>
3936 </tr>
3937 <tr>
3938 <td>February</td>
3939 <td>$50</td>
3940 </tr>
3941 </table>
3942 </div>
3943 </body>
3944 </html>
3945 "#;
3946 let mut doc = Readability::new(html_str);
3947 doc.article_node = doc
3948 .root_node
3949 .select_first("body")
3950 .ok()
3951 .map(|node_ref| node_ref.as_node().clone());
3952 doc.clean_readability_attrs();
3953 assert_eq!(
3954 true,
3955 doc.root_node
3956 .inclusive_descendants()
3957 .elements()
3958 .all(|node| {
3959 let node_attrs = node.attributes.borrow();
3960 node_attrs.map.len() == 0
3961 })
3962 );
3963 }
3964
3965 #[test]
3966 fn test_post_process_content() {
3967 let html_str = r##"
3968 <!DOCTYPE html>
3969 <html>
3970 <body>
3971 <p class="a b c d">One</p>
3972 <p class="b c d e">Two</p>
3973 <div class="a b c div">Three</div>
3974 <div class="b c d e">
3975 <img src="./img.jpg" class="lazy">
3976 </div>
3977 <ul class="a b c d">
3978 <li class="a b c d"><a href="#home">One</a></li>
3979 <li class="b c d e">Two</li>
3980 <li class="b c d e">Three</li>
3981 </ul>
3982 </body>
3983 </html>
3984 "##;
3985 let mut doc = Readability::new(html_str);
3986 doc.article_node = doc
3987 .root_node
3988 .select_first("body")
3989 .ok()
3990 .map(|node_ref| node_ref.as_node().clone());
3991 doc.post_process_content("https://foo.blog/post/");
3992 let has_class_attr = doc
3993 .root_node
3994 .inclusive_descendants()
3995 .elements()
3996 .any(|node_ref| {
3997 let attrs = node_ref.attributes.borrow();
3998 attrs.contains("class")
3999 });
4000 assert_eq!(false, has_class_attr);
4001 let a_node = doc.root_node.select_first("a").unwrap();
4002 let a_node_attrs = a_node.attributes.borrow();
4003 assert_eq!(Some("#home"), a_node_attrs.get("href"));
4004 let img_node = doc.root_node.select_first("img").unwrap();
4005 let img_attrs = img_node.attributes.borrow();
4006 assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
4007 }
4008
4009 #[test]
4010 fn test_flag_is_active() {
4011 let html_str = r"
4012 <!DOCTYPE html>
4013 <html>
4014 <body>
4015 </body>
4016 </html>
4017 ";
4018 let doc = Readability::new(html_str);
4019 assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
4020 assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
4021 assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
4022 }
4023
4024 #[test]
4025 fn test_remove_flag() {
4026 let html_str = r"
4027 <!DOCTYPE html>
4028 <html>
4029 <body>
4030 </body>
4031 </html>
4032 ";
4033 let mut doc = Readability::new(html_str);
4034 assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
4035 doc.remove_flag(FLAG_CLEAN_CONDITIONALLY);
4036 assert_eq!(false, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
4037 assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
4038 doc.remove_flag(FLAG_WEIGHT_CLASSES);
4039 assert_eq!(false, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
4040 assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
4041 }
4042}