mozilla_readability/
lib.rs

1use crate::errors::ParserError;
2use std::collections::{BTreeMap, HashMap, HashSet};
3use std::str::FromStr;
4use html5ever::{LocalName, Namespace, QualName};
5use kuchiki::{
6    iter::{Descendants, Elements, Select},
7    traits::*,
8    NodeData, NodeRef,
9};
10use log::info;
11use url::Url;
12
13
/// Default character threshold.
// NOTE(review): presumably the minimum accepted article length; not used in this part of the file.
const DEFAULT_CHAR_THRESHOLD: usize = 500;

// Bit flags combined into `Readability::flags`; all three are switched on by `Readability::new`.
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;

// NOTE(review): looks like the attribute name under which a node's score is stored — confirm.
const READABILITY_SCORE: &str = "readability-score";
/// XHTML namespace used for every element this module creates.
const HTML_NS: &str = "http://www.w3.org/1999/xhtml";

/// Element names that `is_phrasing_content` accepts unconditionally.
// TODO: Change to HashSet
const PHRASING_ELEMS: [&str; 39] = [
    "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
    "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
    "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
    "sub", "sup", "textarea", "time", "var", "wbr",
];

// TODO: Change to HashSet
const DEFAULT_TAGS_TO_SCORE: [&str; 9] = [
    "section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre",
];

// TODO: Change to HashSet
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];

/// Attribute names that only carry presentation information.
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
    "align",
    "background",
    "bgcolor",
    "border",
    "cellpadding",
    "cellspacing",
    "frame",
    "hspace",
    "rules",
    "style",
    "valign",
    "vspace",
];

const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];

// TODO: Change to HashSet
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
50
51pub mod regexes;
52pub mod errors;
53
/// State for extracting the readable article out of one HTML document.
///
/// Built with [`Readability::new`] from an HTML string and driven by [`Readability::parse`].
pub struct Readability {
    /// Tree of the parsed input document.
    root_node: NodeRef,
    /// Article byline; starts out as `None`.
    byline: Option<String>,
    /// Article title; `parse` copies it from `metadata.title`.
    article_title: String,
    /// Extracted article subtree; `None` until one has been produced.
    // NOTE(review): presumably filled in by `grab_article` — confirm.
    pub article_node: Option<NodeRef>,
    /// Text direction of the article; starts out as `None`.
    article_dir: Option<String>,
    /// Bitwise OR of the `FLAG_*` constants that are currently enabled.
    flags: u32,
    /// Metadata gathered from the document's `<meta>` elements during `parse`.
    pub metadata: MetaData,
}
63
/// A pair of row and column counts.
///
/// Both fields are plain `usize` values, so the type is cheap to copy and has total equality;
/// `Clone`, `Copy` and `Eq` are derived alongside `PartialEq` to reflect that.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SizeInfo {
    /// Number of rows.
    rows: usize,
    /// Number of columns.
    columns: usize,
}
69
70impl Readability {
71    pub fn new(html_str: &str) -> Self {
72        Self {
73            root_node: kuchiki::parse_html().one(html_str),
74            byline: None,
75            article_title: "".into(),
76            article_node: None,
77            article_dir: None,
78            flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
79            metadata: MetaData::new(),
80        }
81    }
82    pub fn parse(&mut self, url: &str) -> Result<(), errors::ParserError> {
83        self.unwrap_no_script_tags();
84        self.remove_scripts();
85        self.prep_document();
86        self.metadata = self.get_article_metadata();
87        self.article_title = self.metadata.title.clone();
88        self.grab_article()?;
89        self.post_process_content(url);
90        Ok(())
91    }
92
93    /// Recursively check if node is image, or if node contains exactly only one image
94    /// whether as a direct child or as its descendants.
95    fn is_single_image(node_ref: &NodeRef) -> bool {
96        if let Some(element) = node_ref.as_element() {
97            if &element.name.local == "img" {
98                return true;
99            }
100        }
101
102        if node_ref.children().filter(Self::has_content).count() != 1
103            || !node_ref.text_contents().trim().is_empty()
104        {
105            return false;
106        }
107
108        return Readability::is_single_image(
109            &node_ref
110                .children()
111                .filter(Self::has_content)
112                .next()
113                .expect("Unable to get first child which should exist"),
114        );
115    }
116
117    fn has_content(node_ref: &NodeRef) -> bool {
118        match node_ref.data() {
119            NodeData::Text(text) => !text.borrow().trim().is_empty(),
120            _ => true,
121        }
122    }
123
124    /// Find all <noscript> that are located after <img> nodes, and which contain only one <img> element.
125    /// Replace the first image with the image from inside the <noscript> tag, and remove the <noscript> tag.
126    /// This improves the quality of the images we use on some sites (e.g. Medium).
127    fn unwrap_no_script_tags(&mut self) {
128        if let Ok(imgs) = self.root_node.select("img") {
129            let mut nodes = imgs.filter(|img_node_ref| {
130                let img_attrs = img_node_ref.attributes.borrow();
131                !img_attrs.map.iter().any(|(name, attr)| {
132                    &name.local == "src"
133                        || &name.local == "srcset"
134                        || &name.local == "data-src"
135                        || &name.local == "data-srcset"
136                        || regexes::is_match_img_ext(&attr.value)
137                })
138            });
139            let mut node_ref = nodes.next();
140            while let Some(img_ref) = node_ref {
141                node_ref = nodes.next();
142                img_ref.as_node().detach();
143            }
144        }
145
146        if let Ok(noscripts) = self.root_node.select("noscript") {
147            for noscript in noscripts {
148                let inner_node_ref = kuchiki::parse_fragment(
149                    QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
150                    Vec::new(),
151                )
152                .one(noscript.text_contents());
153                if !Self::is_single_image(&inner_node_ref) {
154                    continue;
155                }
156                if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
157                    // TODO: Fix this to have a better way of extracting nodes that are elements
158                    while prev_elem.as_element().is_none() {
159                        match prev_elem.previous_sibling() {
160                            Some(new_prev) => prev_elem = new_prev,
161                            None => break,
162                        };
163                    }
164
165                    if Self::is_single_image(&prev_elem) && prev_elem.as_element().is_some() {
166                        let prev_img = if &prev_elem.as_element().unwrap().name.local != "img" {
167                            prev_elem.select_first("img").unwrap().as_node().clone()
168                        } else {
169                            prev_elem.clone()
170                        };
171                        let new_img = inner_node_ref.select_first("img").unwrap();
172                        let prev_attrs = prev_img.as_element().unwrap().attributes.borrow();
173                        let prev_attrs = prev_attrs.map.iter().filter(|(attr, val)| {
174                            !val.value.trim().is_empty()
175                                && (&attr.local == "src"
176                                    || &attr.local == "srcset"
177                                    || regexes::is_match_img_ext(&val.value))
178                        });
179                        for (prev_attr, prev_value) in prev_attrs {
180                            match new_img.attributes.borrow().get(&prev_attr.local) {
181                                Some(value) => {
182                                    if value == prev_value.value {
183                                        continue;
184                                    }
185                                }
186                                None => (),
187                            }
188
189                            let attr_name: &str = &prev_attr.local;
190                            let mut attr_name = attr_name.to_owned();
191                            if new_img.attributes.borrow().contains(attr_name.clone()) {
192                                let new_name = format!("data-old-{}", &attr_name);
193                                attr_name = new_name;
194                            }
195                            new_img
196                                .attributes
197                                .borrow_mut()
198                                .insert(attr_name, prev_value.value.clone());
199                        }
200                        prev_elem.insert_after(new_img.as_node().clone());
201                        prev_elem.detach();
202                    }
203                }
204            }
205        }
206    }
207
208    /// Removes script tags from the document.
209    fn remove_scripts(&mut self) {
210        match self.root_node.select("script") {
211            Ok(mut script_elems) => {
212                let mut next_script = script_elems.next();
213                while let Some(next_script_ref) = next_script {
214                    next_script = script_elems.next();
215                    next_script_ref.as_node().detach();
216                }
217            }
218            Err(_) => (),
219        }
220        match self.root_node.select("noscript") {
221            Ok(mut noscript_elems) => {
222                let mut next_noscript = noscript_elems.next();
223                while let Some(noscript_ref) = next_noscript {
224                    next_noscript = noscript_elems.next();
225                    noscript_ref.as_node().detach();
226                }
227            }
228            Err(_) => (),
229        }
230    }
231
232    /// Prepare the HTML document for readability to scrape it. This includes things like stripping
233    /// CSS, and handling terrible markup.
234    fn prep_document(&mut self) {
235        match self.root_node.select("style") {
236            Ok(mut style_elems) => {
237                let mut style_elem = style_elems.next();
238                while let Some(style_ref) = style_elem {
239                    style_elem = style_elems.next();
240                    style_ref.as_node().detach();
241                }
242            }
243            Err(_) => (),
244        }
245        self.replace_brs();
246        match self.root_node.select("font") {
247            Ok(nodes_iter) => Self::replace_node_tags(nodes_iter, "span"),
248            Err(_) => (),
249        }
250    }
251
252    /// Replaces 2 or more successive <br> elements with a single <p>.
253    /// Whitespace between <br> elements are ignored. For example:
254    ///  <div>foo<br>bar<br> <br><br>abc</div>
255    /// will become:
256    ///   <div>foo<br>bar<p>abc</p></div>
257    fn replace_brs(&mut self) {
258        if let Ok(mut br_tags) = self.root_node.select("br") {
259            // The uses of `next_element` here are safe as it explicitly ensures the next element is an element node
260            while let Some(br_tag) = br_tags.next() {
261                let mut next = Self::next_element(br_tag.as_node().next_sibling(), false);
262                let mut replaced = false;
263                while let Some(next_elem) = next {
264                    if next_elem.as_element().is_some()
265                        && &next_elem.as_element().as_ref().unwrap().name.local == "br"
266                    {
267                        replaced = true;
268                        let br_sibling = next_elem.next_sibling();
269                        next = Self::next_element(br_sibling, false);
270                        next_elem.detach();
271                    } else {
272                        break;
273                    }
274                }
275                if replaced {
276                    let p = NodeRef::new_element(
277                        QualName { prefix: None, ns:Namespace::from(HTML_NS), local:LocalName::from("p")},
278                        BTreeMap::new(),
279                    );
280                    br_tag.as_node().insert_before(p);
281                    let p = br_tag.as_node().previous_sibling().unwrap();
282                    br_tag.as_node().detach();
283
284                    next = p.next_sibling();
285                    while next.is_some() {
286                        let next_sibling = next.unwrap();
287                        if let Some(next_elem) = next_sibling.as_element() {
288                            if &next_elem.name.local == "br" {
289                                if let Some(second_sibling) = next_sibling.next_sibling() {
290                                    if second_sibling.as_element().is_some()
291                                        && "br" == &second_sibling.as_element().unwrap().name.local
292                                    {
293                                        break;
294                                    }
295                                }
296                            }
297                        }
298
299                        if !Self::is_phrasing_content(&next_sibling) {
300                            break;
301                        }
302
303                        let sibling = next_sibling.next_sibling();
304                        p.append(next_sibling);
305                        next = sibling;
306                    }
307
308                    while let Some(first_child) = p.first_child() {
309                        if Self::is_whitespace(&first_child) {
310                            first_child.detach();
311                        } else {
312                            break;
313                        }
314                    }
315
316                    while let Some(last_child) = p.last_child() {
317                        if Self::is_whitespace(&last_child) {
318                            last_child.detach();
319                        } else {
320                            break;
321                        }
322                    }
323
324                    if let Some(parent) = p.parent() {
325                        if &parent.as_element().as_ref().unwrap().name.local == "p" {
326                            Self::set_node_tag(&parent, "div");
327                        }
328                    }
329                }
330            }
331        }
332    }
333
334    /// Iterates over a Select, and calls set_node_tag for each node.
335    fn replace_node_tags(nodes: Select<Elements<Descendants>>, name: &str) {
336        for node in nodes {
337            Self::set_node_tag(node.as_node(), name);
338        }
339    }
340
341    /// Replaces the specified NodeRef by replacing its name. This works by copying over its
342    /// children and its attributes.
343    fn set_node_tag(node_ref: &NodeRef, name: &str) -> NodeRef {
344        match node_ref.as_element() {
345            Some(elem) => {
346                let attributes = elem.attributes.borrow().clone().map.into_iter();
347                let replacement = NodeRef::new_element(
348                    QualName::new(None, Namespace::from(HTML_NS), LocalName::from(name)),
349                    attributes,
350                );
351                for child in node_ref.children() {
352                    replacement.append(child);
353                }
354                node_ref.insert_before(replacement);
355                let new_node = node_ref.previous_sibling().unwrap();
356                node_ref.detach();
357                return new_node;
358            }
359            None => (),
360        }
361        node_ref.clone()
362    }
363
364    fn is_whitespace(node_ref: &NodeRef) -> bool {
365        match node_ref.data() {
366            NodeData::Element(elem_data) => &elem_data.name.local == "br",
367            NodeData::Text(text_ref) => text_ref.borrow().trim().len() == 0,
368            _ => false,
369        }
370    }
371
372    /// Finds the next element, starting from the given node, and ignoring
373    /// whitespace in between. If the given node is an element, the same node is
374    /// returned.
375    /// The must_be_element argument ensure the next element is actually an element node.
376    /// This is likely to factored out into a new function.
377    fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
378        // TODO: Could probably be refactored to use the elements method
379        let mut node_ref = node_ref;
380        while node_ref.is_some() {
381            match node_ref.as_ref().unwrap().data() {
382                NodeData::Element(_) => break,
383                _ => {
384                    if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
385                        node_ref = node_ref.as_ref().unwrap().next_sibling();
386                    } else if must_be_element
387                        && !node_ref.as_ref().unwrap().text_contents().trim().is_empty()
388                    {
389                        node_ref = node_ref.as_ref().unwrap().next_sibling();
390                    } else {
391                        break;
392                    }
393                }
394            }
395        }
396        node_ref
397    }
398
399    /// Determine if a node qualifies as phrasing content.
400    /// https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
401    fn is_phrasing_content(node_ref: &NodeRef) -> bool {
402        node_ref.as_text().is_some()
403            || match node_ref.as_element() {
404                Some(elem) => {
405                    let name: &str = &elem.name.local;
406                    PHRASING_ELEMS.contains(&name)
407                        || ((name == "a" || name == "del" || name == "ins")
408                            && node_ref
409                                .children()
410                                .all(|child_ref| Self::is_phrasing_content(&child_ref)))
411                }
412                None => false,
413            }
414    }
415
416    ///Attempts to get excerpt and byline metadata for the article. @return Object with optional "excerpt" and "byline" properties
417    fn get_article_metadata(&self) -> MetaData {
418        let mut values: HashMap<String, String> = HashMap::new();
419        let mut meta_data = MetaData::new();
420        if let Ok(meta_elems) = self.root_node.select("meta") {
421            meta_elems
422                .filter(|node_ref| {
423                    let node_attr = node_ref.attributes.borrow();
424                    node_attr.get("content").is_some()
425                })
426                .for_each(|node_ref| {
427                    let node_attr = node_ref.attributes.borrow();
428                    let content = node_attr.get("content").unwrap();
429                    let name_attr = node_attr.get("name");
430                    let mut matches = None;
431                    if let Some(property) = node_attr.get("property") {
432                        matches = regexes::PROPERTY_REGEX.captures(property);
433                        if let Some(captures) = &matches {
434                            for capture in captures.iter() {
435                                let mut name = capture.unwrap().as_str().to_lowercase();
436                                name = regexes::REPLACE_WHITESPACE_REGEX
437                                    .replace_all(&name, "")
438                                    .to_string();
439                                values.insert(name, content.trim().to_string());
440                            }
441                        }
442                    }
443                    if matches.is_none() && name_attr.is_some() {
444                        let name_val = name_attr.unwrap();
445                        if regexes::is_match_name_pattern(name_val) {
446                            let name = name_val.to_lowercase();
447                            let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, "");
448                            let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":");
449                            values.insert(name.to_string(), content.trim().to_string());
450                        }
451                    }
452                });
453        }
454
455        let meta_title_keys = [
456            "dc:title",
457            "dcterm:title",
458            "og:title",
459            "weibo:article:title",
460            "weibo:webpage:title",
461            "title",
462            "twitter:title",
463        ];
464        meta_data.title = if let Some(key) = meta_title_keys
465            .iter()
466            .find(|key| values.contains_key(**key))
467        {
468            let title = values.get(*key).map(|title| title.to_owned()).unwrap();
469            if title.is_empty() {
470                self.get_article_title()
471            } else {
472                title
473            }
474        } else {
475            self.get_article_title()
476        };
477
478        let meta_byline_keys = ["dc:creator", "dcterm:creator", "author"];
479        meta_data.byline = {
480            let possible_key = meta_byline_keys
481                .iter()
482                .find(|key| values.contains_key(**key));
483            if let Some(actual_key) = possible_key {
484                values.get(*actual_key).map(|byline| byline.to_owned())
485            } else {
486                None
487            }
488        };
489
490        let meta_excerpt_keys = [
491            "dc:description",
492            "dcterm:description",
493            "og:description",
494            "weibo:article:description",
495            "weibo:webpage:description",
496            "description",
497            "twitter:description",
498        ];
499        meta_data.excerpt = {
500            let possible_key = meta_excerpt_keys
501                .iter()
502                .find(|key| values.contains_key(**key));
503            if let Some(actual_key) = possible_key {
504                values.get(*actual_key).map(|excerpt| excerpt.to_owned())
505            } else {
506                None
507            }
508        };
509
510        meta_data.site_name = values
511            .get("og:site_name")
512            .map(|site_name| site_name.to_owned());
513
514        Self::unescape_html_entities(&mut meta_data.title);
515        if meta_data.byline.is_some() {
516            Self::unescape_html_entities(&mut meta_data.byline.as_mut().unwrap());
517        }
518
519        if meta_data.excerpt.is_some() {
520            Self::unescape_html_entities(&mut meta_data.excerpt.as_mut().unwrap());
521        }
522
523        if meta_data.site_name.is_some() {
524            Self::unescape_html_entities(&mut meta_data.site_name.as_mut().unwrap());
525        }
526
527        meta_data
528    }
529
530    /// Converts some of the common HTML entities in string to their corresponding characters.
531    fn unescape_html_entities(value: &mut String) {
532        if !value.is_empty() {
533            // TODO: Extract this
534            let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
535            html_escape_map.insert("lt", "<");
536            html_escape_map.insert("gt", ">");
537            html_escape_map.insert("amp", "&");
538            html_escape_map.insert("quot", "\"");
539            html_escape_map.insert("apos", "'");
540            let mut new_value = regexes::REPLACE_HTML_ESCAPE_REGEX
541                .replace_all(&value, |captures: &regex::Captures| {
542                    html_escape_map[&captures[1]].to_string()
543                })
544                .to_string();
545            new_value = regexes::REPLACE_HEX_REGEX
546                .replace_all(&new_value, |captures: &regex::Captures| {
547                    let num = if let Some(hex_capture) = captures.get(1) {
548                        u16::from_str_radix(hex_capture.as_str(), 16)
549                    } else if let Some(dec_capture) = captures.get(2) {
550                        u16::from_str(dec_capture.as_str())
551                    } else {
552                        unreachable!("Unable to match any of the captures");
553                    };
554                    String::from_utf16_lossy(&[num.unwrap()])
555                })
556                .to_string();
557            *value = new_value;
558        }
559    }
560
561    /// Get the article title as an H1.
562    fn get_article_title(&self) -> String {
563        let mut cur_title = self
564            .root_node
565            .select_first("title")
566            .map(|title| title.text_contents().trim().to_string())
567            .unwrap_or("".to_string());
568        let orig_title = cur_title.clone();
569        let mut title_had_hierarchical_separators = false;
570        let word_count = |s: &str| -> usize { s.split_whitespace().count() };
571        if regexes::is_match_title_separator(&cur_title) {
572            title_had_hierarchical_separators = regexes::is_match_has_title_separator(&cur_title);
573            cur_title = regexes::REPLACE_START_SEPARATOR_REGEX
574                .replace_all(&orig_title, "$start")
575                .to_string();
576            if word_count(&cur_title) < 3 {
577                cur_title = regexes::REPLACE_END_SEPARATOR_REGEX
578                    .replace_all(&orig_title, "$end")
579                    .to_string();
580            }
581        } else if cur_title.contains(": ") {
582            let trimmed_title = cur_title.trim();
583            let is_match_heading = self
584                .root_node
585                .select("h1, h2")
586                .unwrap()
587                .any(|heading| heading.text_contents().trim() == trimmed_title);
588            if !is_match_heading {
589                let mut idx = orig_title.rfind(":").unwrap() + 1;
590                let mut new_title = &orig_title[idx..];
591                if word_count(new_title) < 3 {
592                    idx = orig_title.find(":").unwrap() + 1;
593                    new_title = &orig_title[idx..];
594                } else if word_count(&orig_title[0..orig_title.find(":").unwrap()]) > 5 {
595                    new_title = &orig_title;
596                }
597                cur_title = new_title.to_string();
598            }
599        } else if cur_title.len() > 150 || cur_title.len() < 15 {
600            let mut h1_nodes = self.root_node.select("h1").unwrap();
601            let h1_count = self.root_node.select("h1").unwrap().count();
602            if h1_count == 1 {
603                cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None);
604            }
605        }
606        cur_title = regexes::NORMALIZE_REGEX
607            .replace_all(cur_title.trim(), " ")
608            .to_string();
609        let cur_word_count = word_count(&cur_title);
610
611        if cur_word_count <= 4
612            && (!title_had_hierarchical_separators
613                || cur_word_count
614                    != word_count(
615                        &regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""),
616                    ) - 1)
617        {
618            cur_title = orig_title;
619        }
620        cur_title
621    }
622
623    /// Removes the class="" attribute from every element in the given subtree, except those that
624    /// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
625    fn clean_classes(&mut self) {
626        // TODO: This should accessed from Self
627        let classes_to_preserve: HashSet<&str> = HashSet::new();
628        if let Some(article_node) = &mut self.article_node {
629            for elem in article_node.inclusive_descendants().elements() {
630                let mut elem_attrs = elem.attributes.borrow_mut();
631                if let Some(class_list) = elem_attrs.get_mut("class") {
632                    let filtered_class: String = class_list
633                        .split_whitespace()
634                        .filter(|class| classes_to_preserve.contains(class))
635                        .fold("".to_string(), |acc, x| acc + " " + x);
636                    if filtered_class.is_empty() {
637                        elem_attrs.remove("class");
638                    } else {
639                        *class_list = filtered_class;
640                    }
641                }
642            }
643        }
644    }
645
646    ///  Converts each <a> and <img> uri in the given element to an absolute URI, ignoring #ref URIs.
647    fn fix_relative_uris(&mut self, document_uri: &str) {
648        if let Some(article_node) = &mut self.article_node {
649            let document_uri =
650                Url::parse(document_uri).expect("Unable to parse the document's URI");
651            let base_uri = self
652                .root_node
653                .select("base")
654                .unwrap()
655                .filter(|node_ref| {
656                    let node_attrs = node_ref.attributes.borrow();
657                    node_attrs.contains("href")
658                })
659                .map(|node_ref| {
660                    let node_attrs = node_ref.attributes.borrow();
661                    let href = node_attrs.get("href").unwrap();
662
663                    match Url::parse(href) {
664                        Ok(url) => url,
665                        Err(e) => match e {
666                            url::ParseError::RelativeUrlWithoutBase => {
667                                match document_uri.join(href) {
668                                    Ok(joined_url) => joined_url,
669                                    Err(e) => panic!(
670                                        "{:} unable to parse url {:?} on element {}",
671                                        e, href, &node_ref.name.local
672                                    ),
673                                }
674                            }
675                            e => panic!(
676                                "{:} unable to parse url {:?} on element {}",
677                                e, href, &node_ref.name.local
678                            ),
679                        },
680                    }
681                })
682                .next()
683                .unwrap_or(document_uri.clone());
684            let to_absolute_uri = |uri_str: &str| -> String {
685                if base_uri == document_uri && uri_str.starts_with("#") {
686                    return uri_str.to_string();
687                }
688
689                if let Ok(new_uri) = Url::parse(uri_str) {
690                    if new_uri.has_host() {
691                        return new_uri.to_string();
692                    }
693                } else if let Ok(joined_uri) = base_uri.join(uri_str) {
694                    return joined_uri.to_string();
695                }
696
697                uri_str.to_string()
698            };
699            let mut links = article_node.select("a").unwrap().filter(|a_ref| {
700                let link_attrs = a_ref.attributes.borrow();
701                link_attrs.contains("href")
702            });
703            let mut link = links.next();
704            while let Some(link_ref) = link {
705                link = links.next();
706                let mut link_attrs = link_ref.attributes.borrow_mut();
707                let href = link_attrs.get("href").map(|val| val.to_string()).unwrap();
708                if href.starts_with("javascript:") {
709                    let link_node = link_ref.as_node();
710                    if link_node.children().count() == 1
711                        && link_node
712                            .first_child()
713                            .map(|node_ref| node_ref.as_text().is_some())
714                            .unwrap()
715                    {
716                        let text_node = NodeRef::new_text(link_node.text_contents());
717                        link_node.insert_before(text_node);
718                        link_node.detach();
719                    } else {
720                        let container = NodeRef::new_element(
721                            QualName::new(None, Namespace::from(HTML_NS), LocalName::from("span")),
722                            BTreeMap::new(),
723                        );
724                        let mut children = link_node.children();
725                        let mut child = children.next();
726                        while let Some(child_ref) = child {
727                            child = children.next();
728                            container.append(child_ref);
729                        }
730                        link_node.insert_before(container);
731                        link_node.detach();
732                    }
733                } else {
734                    link_attrs.insert("href", to_absolute_uri(&href));
735                }
736            }
737            let media_nodes = article_node
738                .select("img, picture, figure, video, audio, source")
739                .unwrap();
740            for media_node in media_nodes {
741                let mut media_attrs = media_node.attributes.borrow_mut();
742                if let Some(src) = media_attrs.get_mut("src") {
743                    *src = to_absolute_uri(&src);
744                }
745
746                if let Some(poster) = media_attrs.get_mut("poster") {
747                    *poster = to_absolute_uri(&poster);
748                }
749
750                if let Some(srcset) = media_attrs.get_mut("srcset") {
751                    let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
752                        &srcset,
753                        |captures: &regex::Captures| {
754                            to_absolute_uri(&captures[1])
755                                + &captures.get(2).map(|cap| cap.as_str()).unwrap_or("")
756                                + &captures[3]
757                        },
758                    );
759                    *srcset = new_srcset.to_string();
760                }
761            }
762        }
763    }
764
765    /// Removes readability attributes from DOM nodes as they are not needed in the final article
766    fn clean_readability_attrs(&mut self) {
767        if let Some(article_node) = &mut self.article_node {
768            for node in article_node.inclusive_descendants().elements() {
769                let mut node_attrs = node.attributes.borrow_mut();
770                node_attrs.remove(READABILITY_SCORE);
771                node_attrs.remove("readability-data-table");
772            }
773        }
774    }
775
    /// Run any post-process modifications to article content as necessary.
    ///
    /// Runs three passes in order: `fix_relative_uris` (given `url`),
    /// `clean_classes`, and finally `clean_readability_attrs`.
    fn post_process_content(&mut self, url: &str) {
        self.fix_relative_uris(url);
        // TODO: Add flag check
        // NOTE(review): presumably a "keep classes" option should guard this call — confirm.
        self.clean_classes();
        self.clean_readability_attrs();
    }
783
784    /// Converts an inline CSS string to a [HashMap] of property and value(s)
785    fn inline_css_str_to_map(css_str: &str) -> HashMap<String, String> {
786        enum State {
787            ReadProp,
788            ReadVal,
789            ReadQuot,
790            ReadDquot,
791        }
792        let mut decl: (Option<String>, Option<String>) = (None, None);
793        let mut chars = css_str.chars();
794        let mut state = State::ReadProp;
795        let mut token = String::new();
796        let mut tokens = vec![];
797        while let Some(c) = chars.next() {
798            match state {
799                State::ReadProp => {
800                    if c != ':' {
801                        token.push(c);
802                    } else {
803                        state = State::ReadVal;
804                        decl.0 = Some(token.trim().to_string());
805                        token.clear();
806                    }
807                }
808                State::ReadVal => {
809                    if c == '\'' {
810                        state = State::ReadQuot;
811                        token.push(c);
812                    } else if c == '"' {
813                        state = State::ReadDquot;
814                        token.push(c);
815                    } else if c == ';' {
816                        state = State::ReadProp;
817                        decl.1 = Some(token.trim().to_string());
818                        tokens.push(decl.clone());
819                        decl = (None, None);
820                        token.clear();
821                    } else {
822                        token.push(c);
823                    }
824                }
825                State::ReadQuot => {
826                    token.push(c);
827                    if c == '\'' {
828                        state = State::ReadVal;
829                    }
830                }
831                State::ReadDquot => {
832                    token.push(c);
833                    if c == '"' {
834                        state = State::ReadVal;
835                    }
836                }
837            }
838        }
839        if !token.is_empty() {
840            match state {
841                State::ReadVal => {
842                    decl.1 = Some(token.trim().to_string());
843                    tokens.push(decl);
844                }
845                _ => (),
846            }
847        }
848
849        tokens
850            .into_iter()
851            .filter(|tok_pair| tok_pair.0.is_some() && tok_pair.1.is_some())
852            .map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap()))
853            .collect()
854    }
855
856    fn is_probably_visible(node_ref: &NodeRef) -> bool {
857        if let Some(elem_data) = node_ref.as_element() {
858            let attributes = elem_data.attributes.borrow();
859            (if let Some(css_str) = attributes.get("style"){
860                let style_map = Self::inline_css_str_to_map(css_str);
861                if let Some(display_val) = style_map.get("display") {
862                    display_val != &"none"
863                } else {
864                    true
865                }
866            } else {
867                true
868            })
869                && !attributes.contains("hidden")
870            // check for "fallback-image" so that wikimedia math images are displayed
871                &&
872                    (!attributes.contains("aria-hidden") ||
873                    attributes.get("aria-hidden").map(|val| val != "true").unwrap_or(true) ||
874                    attributes.get("class").map(|class_list| class_list.split(" ").collect::<Vec<&str>>().contains(&"fallback-image")).unwrap_or(false))
875        } else {
876            // Technically, it should not matter what value is returned here
877            true
878        }
879    }
880
881    /// Check whether the input string could be a byline, i.e is less than 100 chars
882    fn is_valid_byline(input: &str) -> bool {
883        let text = input.trim();
884        text.len() > 0 && text.len() < 100
885    }
886
887    fn check_byline(&mut self, node_ref: &NodeRef, match_string: &str) -> bool {
888        if self.byline.is_none() {
889            if let Some(elem_data) = node_ref.as_element() {
890                let elem_attrs = elem_data.attributes.borrow();
891                let rel_attr = elem_attrs.get("rel");
892                let itemprop_attr = elem_attrs.get("itemprop");
893                let is_byline = (if rel_attr.is_some() {
894                    rel_attr.unwrap() == "author"
895                } else if itemprop_attr.is_some() {
896                    itemprop_attr.unwrap().contains("author")
897                } else {
898                    regexes::is_match_byline(match_string)
899                }) && Self::is_valid_byline(&node_ref.text_contents());
900                if is_byline {
901                    self.byline = Some(node_ref.text_contents().trim().to_owned());
902                }
903                is_byline
904            } else {
905                false
906            }
907        } else {
908            false
909        }
910    }
911
912    /// Traverse the DOM from node to node, starting at the node passed in.
913    /// Pass true for the second parameter to indicate this node itself
914    /// (and its kids) are going away, and we want the next node over.
915    ///
916    /// Calling this in a loop will traverse the DOM depth-first.
917    fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> {
918        // WARN: The uses of `next_element` here assume it returns an element node.
919        let has_elem_children = node_ref.children().elements().count();
920        if !ignore_self_and_kids && has_elem_children > 0 {
921            Self::next_element(node_ref.first_child(), true)
922        } else if let Some(next_sibling) = Self::next_element(node_ref.next_sibling(), true) {
923            Some(next_sibling)
924        } else {
925            // Keep walking up the node hierarchy until a parent with element siblings is found
926            let mut node = node_ref.parent();
927            while let Some(parent) = node {
928                if let Some(next_sibling) = Self::next_element(parent.next_sibling(), true) {
929                    return Some(next_sibling);
930                } else {
931                    node = parent.parent();
932                }
933            }
934            None
935        }
936    }
937
938    /// Removes the node_ref passed in and returns the next possible node by calling [get_next_node]
939    fn remove_and_get_next(node_ref: NodeRef) -> Option<NodeRef> {
940        let next_node = Self::get_next_node(&node_ref, true);
941        node_ref.detach();
942        next_node
943    }
944
945    /// Check if a given node has one of its ancestor tag name matching the
946    /// provided one.
947    fn has_ancestor_tag(
948        node_ref: &NodeRef,
949        tag_name: &str,
950        max_depth: Option<i32>,
951        filter_fn: Option<fn(&NodeRef) -> bool>,
952    ) -> bool {
953        let mut depth = 0;
954        let max_depth = max_depth.or(Some(3)).unwrap();
955        let mut parent = node_ref.parent();
956        while parent.is_some() {
957            let parent_node = parent.as_ref().unwrap();
958            if parent_node.as_element().is_none() {
959                // The recursion may go up the DOM tree upto a document node at which point it must stop
960                return false;
961            }
962            let parent_node_elem = parent_node.as_element().unwrap();
963            if max_depth > 0 && depth > max_depth {
964                return false;
965            }
966            if &parent_node_elem.name.local == tag_name
967                && (filter_fn.is_none() || filter_fn.unwrap()(parent_node))
968            {
969                return true;
970            }
971            parent = parent_node.parent();
972            depth += 1;
973        }
974        false
975    }
976
977    fn is_element_without_content(node_ref: &NodeRef) -> bool {
978        let child_count = node_ref.children().count();
979        node_ref.as_element().is_some()
980            && node_ref.text_contents().trim().is_empty()
981            && (child_count == 0
982                || child_count
983                    == node_ref.select("br").unwrap().count()
984                        + node_ref.select("hr").unwrap().count())
985    }
986
987    /// Check if this node has only whitespace and a single element with given tag
988    /// Returns false if the <div> node contains non-empty text nodes
989    /// or if it contains no element with given tag or more than 1 element.
990    fn has_single_tag_inside_element(node_ref: &NodeRef, tag_name: &str) -> bool {
991        let first_child = node_ref.children().elements().next();
992        if node_ref.children().elements().count() != 1
993            || (first_child.is_some() && &first_child.unwrap().name.local != tag_name)
994        {
995            return false;
996        }
997        !node_ref.children().any(|node| {
998            node.as_text().is_some()
999                && regexes::is_match_has_content(&node.text_contents().trim_end())
1000        })
1001    }
1002
1003    fn get_inner_text(node_ref: &NodeRef, normalize_spaces: Option<bool>) -> String {
1004        let will_normalize = normalize_spaces.unwrap_or(true);
1005        let text = node_ref.text_contents();
1006        let text = text.trim();
1007        if will_normalize {
1008            return regexes::NORMALIZE_REGEX.replace_all(&text, " ").to_string();
1009        }
1010        text.to_owned()
1011    }
1012
1013    /// Get the density of links as a percentage of the content
1014    /// This is the amount of text that is inside a link divided by the total text in the node.
1015    fn get_link_density(node_ref: &NodeRef) -> f32 {
1016        let text_length = Self::get_inner_text(node_ref, None).len() as f32;
1017        if text_length == 0_f32 {
1018            return 0_f32;
1019        }
1020        node_ref
1021            .select("a")
1022            .unwrap()
1023            .map(|a_node| Self::get_inner_text(a_node.as_node(), None).len() as f32)
1024            .sum::<f32>()
1025            / text_length
1026    }
1027
1028    /// Determine whether element has any children block level elements.
1029    fn has_child_block_element(node_ref: &NodeRef) -> bool {
1030        // TODO: Refer to a static HashSet
1031        let block_level_elems: [&str; 32] = [
1032            "address",
1033            "article",
1034            "aside",
1035            "blockquote",
1036            "details",
1037            "dialog",
1038            "dd",
1039            "div",
1040            "dl",
1041            "dt",
1042            "fieldset",
1043            "figcaption",
1044            "footer",
1045            "form",
1046            "h1",
1047            "h2",
1048            "h3",
1049            "h4",
1050            "h5",
1051            "h6",
1052            "header",
1053            "hgroup",
1054            "hr",
1055            "li",
1056            "main",
1057            "nav",
1058            "ol",
1059            "p",
1060            "pre",
1061            "section",
1062            "table",
1063            "ul",
1064        ];
1065        node_ref.children().any(|child_node| {
1066            if child_node.as_element().is_some() {
1067                let child_elem = child_node.as_element().unwrap();
1068                block_level_elems.contains(&&*child_elem.name.local)
1069                    || Self::has_child_block_element(&child_node)
1070            } else {
1071                false
1072            }
1073        })
1074    }
1075
1076    /// Returns a [Vec] of ancestors
1077    fn get_node_ancestors(node_ref: &NodeRef, max_depth: Option<usize>) -> Vec<NodeRef> {
1078        node_ref.ancestors().take(max_depth.unwrap_or(1)).collect()
1079    }
1080
1081    /// Get an element's class/id weight using regular expressions to tell if this
1082    /// element looks good or bad.
1083    fn get_class_weight(&self, node_ref: &NodeRef) -> i32 {
1084        if !self.flag_is_active(FLAG_WEIGHT_CLASSES) {
1085            return 0;
1086        }
1087        let mut weight = 0;
1088        let node_elem = node_ref.as_element().unwrap();
1089        let node_attrs = node_elem.attributes.borrow();
1090        if let Some(id) = node_attrs.get("id") {
1091            if !id.trim().is_empty() {
1092                weight = if regexes::is_match_positive(id) {
1093                    weight + 25
1094                } else if regexes::is_match_negative(id) {
1095                    weight - 25
1096                } else {
1097                    weight
1098                }
1099            }
1100        }
1101        if let Some(class) = node_attrs.get("class") {
1102            if !class.trim().is_empty() {
1103                weight = if regexes::is_match_positive(class) {
1104                    weight + 25
1105                } else if regexes::is_match_negative(class) {
1106                    weight - 25
1107                } else {
1108                    weight
1109                }
1110            }
1111        }
1112        weight
1113    }
1114
1115    /// Initialize a node with the readability attribute. Also checks the
1116    /// className/id for special names to add to its score.
1117    fn initialize_node(&self, node_ref: &mut NodeRef) {
1118        if let Some(element) = node_ref.as_element() {
1119            let mut score = 0.0;
1120            // This must be computed first because it borrows the NodeRef which
1121            // should not also be mutably borrowed
1122            score += self.get_class_weight(node_ref) as f32;
1123            let mut elem_attrs = element.attributes.borrow_mut();
1124            elem_attrs.insert(READABILITY_SCORE, score.to_string());
1125            let readability = elem_attrs.get_mut(READABILITY_SCORE);
1126            match &*element.name.local {
1127                "div" => score += 5.0,
1128                "pre" | "td" | "blockquote" => score += 3.0,
1129                "address" | "ol" | "ul" | "dl" | "dd" | "dt" | "li" | "form" => score -= 3.0,
1130                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "th" => score -= 5.0,
1131                _ => (),
1132            }
1133            if let Some(x) = readability {
1134                *x = score.to_string();
1135            }
1136        }
1137    }
1138
1139    fn get_row_and_column_count(node_ref: &NodeRef) -> SizeInfo {
1140        let mut rows = 0;
1141        let mut columns = 0;
1142        if let Ok(trs) = node_ref.select("tr") {
1143            for tr in trs {
1144                let tr_node = tr.as_node();
1145                let tr_attr = tr.attributes.borrow();
1146                let rowspan = tr_attr
1147                    .get("rowspan")
1148                    .map(|x| {
1149                        x.parse::<usize>()
1150                            .expect("Unable to parse rowspan value to usize")
1151                    })
1152                    .unwrap_or(1);
1153                rows += rowspan;
1154                let mut columns_in_row = 0;
1155                if let Ok(cells) = tr_node.select("td") {
1156                    for cell in cells {
1157                        let cell_attr = cell.attributes.borrow();
1158                        let colspan = cell_attr
1159                            .get("colspan")
1160                            .map(|x| {
1161                                x.parse::<usize>()
1162                                    .expect("Unable to parse colspan value to usize")
1163                            })
1164                            .unwrap_or(1);
1165                        columns_in_row += colspan;
1166                    }
1167                }
1168                columns = columns.max(columns_in_row);
1169            }
1170        }
1171        SizeInfo { rows, columns }
1172    }
1173
1174    /// Look for 'data' (as opposed to 'layout') tables, for which we use similar checks as
1175    /// https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
1176    fn mark_data_tables(&mut self) {
1177        if let Ok(tables) = self.root_node.select("table") {
1178            for table in tables {
1179                let mut table_attr = table.attributes.borrow_mut();
1180                let table_node = table.as_node();
1181                if table_attr.get("role") == Some("presentation") {
1182                    table_attr.insert("readability-data-table", "false".to_string());
1183                    continue;
1184                }
1185                if table_attr.get("datatable") == Some("0") {
1186                    table_attr.insert("readability-data-table", "false".to_string());
1187                    continue;
1188                }
1189
1190                if table_attr.contains("summary") {
1191                    table_attr.insert("readability-data-table", "true".to_string());
1192                    continue;
1193                }
1194                if let Ok(caption) = table_node.select_first("caption") {
1195                    if caption.as_node().children().count() > 0 {
1196                        table_attr.insert("readability-data-table", "true".to_string());
1197                        continue;
1198                    }
1199                }
1200
1201                if DATA_TABLE_DESCENDANTS
1202                    .iter()
1203                    .any(|tag_name| table_node.select_first(tag_name).is_ok())
1204                {
1205                    table_attr.insert("readability-data-table", "true".to_string());
1206                    continue;
1207                }
1208
1209                if table_node.select("table").unwrap().count() > 1 {
1210                    table_attr.insert("readability-data-table", "false".to_string());
1211                    continue;
1212                }
1213
1214                let size_info = Self::get_row_and_column_count(table_node);
1215                if size_info.rows >= 10 || size_info.columns > 4 {
1216                    table_attr.insert("readability-data-table", "true".to_string());
1217                    continue;
1218                }
1219
1220                if (size_info.rows * size_info.columns) > 10 {
1221                    table_attr.insert("readability-data-table", "true".to_string());
1222                    continue;
1223                } else {
1224                    table_attr.insert("readability-data-table", "false".to_string());
1225                    continue;
1226                }
1227            }
1228        }
1229    }
1230
1231    /// Convert images and figures that have properties like data-src into images that can be loaded without JS
1232    fn fix_lazy_images(node_ref: &mut NodeRef) {
1233        let nodes = node_ref.select("img, picture, figure").unwrap();
1234        for node in nodes {
1235            let mut node_attr = node.attributes.borrow_mut();
1236            if let Some(src) = node_attr.get("src") {
1237                let src_captures = regexes::B64_DATA_URL_REGEX.captures(src);
1238                if src_captures.is_some() {
1239                    let svg_capture = src_captures.unwrap().get(1);
1240                    if svg_capture.is_some() && svg_capture.unwrap().as_str() == "image/svg+xml" {
1241                        continue;
1242                    }
1243
1244                    let src_could_be_removed = node_attr
1245                        .map
1246                        .iter()
1247                        .filter(|(name, _)| &name.local != "src")
1248                        .filter(|(_, val)| regexes::is_match_img_ext(&val.value))
1249                        .count()
1250                        > 0;
1251
1252                    if src_could_be_removed {
1253                        let b64_start = regexes::BASE64_REGEX.find(src).unwrap().start();
1254                        let b64_length = src.len() - b64_start;
1255                        if b64_length < 133 {
1256                            node_attr.remove("src");
1257                        }
1258                    }
1259                }
1260            }
1261            let src = node_attr.get("src");
1262            let srcset = node_attr.get("srcset");
1263            let class = node_attr.get("class");
1264            if (src.is_some() || srcset.is_some())
1265                && class.and_then(|classname| classname.find("lazy")).is_none()
1266            {
1267                continue;
1268            }
1269
1270            node_attr
1271                .map
1272                .clone()
1273                .iter()
1274                .filter(|(key, _)| !(&key.local == "src" || &key.local == "srcset"))
1275                .for_each(|(_, val)| {
1276                    let mut copy_to = "";
1277                    if regexes::is_match_srcset(&val.value) {
1278                        copy_to = "srcset";
1279                    } else if regexes::is_match_src_regex(&val.value) {
1280                        copy_to = "src";
1281                    }
1282                    if copy_to.len() > 0 {
1283                        let new_val = val.value.clone();
1284                        let tag_name = &node.name.local;
1285                        if tag_name == "img" || tag_name == "picture" {
1286                            node_attr.insert(copy_to, new_val);
1287                        } else if tag_name == "figure" {
1288                            let node_ref = node.as_node();
1289                            let img_picture_nodes = node_ref.select("img, picture").unwrap();
1290                            if img_picture_nodes.count() > 0 {
1291                                let img = NodeRef::new_element(
1292                                    QualName::new(
1293                                        None,
1294                                        Namespace::from(HTML_NS),
1295                                        LocalName::from("img"),
1296                                    ),
1297                                    BTreeMap::new(),
1298                                );
1299                                {
1300                                    let mut img_attr =
1301                                        img.as_element().unwrap().attributes.borrow_mut();
1302                                    img_attr.insert(copy_to, new_val);
1303                                }
1304                                node_ref.append(img);
1305                            }
1306                        }
1307                    }
1308                });
1309        }
1310    }
1311
1312    /// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm
1313    /// based on content length, classnames, link density, number of images & embeds, etc.
1314    fn clean_conditionally(&self, node_ref: &mut NodeRef, tag_name: &str) {
1315        if !self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
1316            return;
1317        }
1318        let is_list = tag_name == "ul" || tag_name == "ol";
1319        let is_data_table = |node_ref: &NodeRef| {
1320            let node_elem = node_ref.as_element().unwrap();
1321            let attrs = node_elem.attributes.borrow();
1322            attrs.get("readability-data-table") == Some("true")
1323        };
1324        let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
1325
1326        let mut nodes = node_ref
1327            .descendants()
1328            .select(tag_name)
1329            .unwrap()
1330            // Do not remove data tables
1331            .filter(|node_data_ref| {
1332                !(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
1333            })
1334            // Do not remove if it is a child of a data table
1335            .filter(|node_data_ref| {
1336                !Self::has_ancestor_tag(
1337                    node_data_ref.as_node(),
1338                    tag_name,
1339                    Some(-1),
1340                    Some(is_data_table),
1341                )
1342            });
1343        let mut next_node = nodes.next();
1344        while let Some(node_data_ref) = next_node {
1345            next_node = nodes.next();
1346            let node = node_data_ref.as_node();
1347            let weight = self.get_class_weight(node);
1348            // Remove all elements with negative class weights
1349            if weight < 0 {
1350                node.detach();
1351                continue;
1352            }
1353
1354            if get_char_count(node) >= 10 {
1355                continue;
1356            }
1357            let mut embeds = node_data_ref
1358                .as_node()
1359                .select("object, embed, iframe")
1360                .unwrap();
1361            let can_skip_embed = embeds.any(|node_data_ref| {
1362                &node_data_ref.name.local == "object" || {
1363                    let attrs = node_data_ref.attributes.borrow();
1364
1365                    attrs
1366                        .map
1367                        .iter()
1368                        .any(|(_, val)| regexes::is_match_videos(&val.value))
1369                }
1370            });
1371            if can_skip_embed {
1372                continue;
1373            }
1374
1375            let p_nodes = node_data_ref.as_node().select("p").unwrap().count();
1376            let img_nodes = node_data_ref.as_node().select("img").unwrap().count();
1377            let li_nodes = node_data_ref.as_node().select("li").unwrap().count() as i32 - 100;
1378            let input_nodes = node_data_ref.as_node().select("input").unwrap().count();
1379
1380            let p = p_nodes as f32;
1381            let img = img_nodes as f32;
1382
1383            let embed_count = node.select("object, embed, iframe").unwrap().count();
1384            let link_density = Self::get_link_density(node);
1385            let content_length = Self::get_inner_text(node, None).len();
1386            let has_figure_ancestor = Self::has_ancestor_tag(node, "figure", None, None);
1387            let have_to_remove = (img_nodes > 1 && p / img < 0.5 && !has_figure_ancestor)
1388                || (!is_list && li_nodes > p_nodes as i32)
1389                || (input_nodes > (p_nodes / 3))
1390                || (!is_list
1391                    && content_length < 25
1392                    && (img_nodes == 0 || img_nodes > 2)
1393                    && !has_figure_ancestor)
1394                || (!is_list && weight < 25 && link_density > 0.2)
1395                || (weight >= 25 && link_density > 0.5)
1396                || ((embed_count == 1 && content_length < 75) || embed_count > 1);
1397            if have_to_remove {
1398                node.detach();
1399            }
1400        }
1401    }
1402
1403    /// Clean a node of all elements of type "tag". (Unless it's a YouTube or Vimeo video)
1404    fn clean(node_ref: &mut NodeRef, tag_name: &str) {
1405        // Can be changed to a HashSet
1406        let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name);
1407        let mut nodes = node_ref
1408            .descendants()
1409            .select(tag_name)
1410            .unwrap()
1411            .filter(|node_data_ref| {
1412                !is_embed
1413                    || {
1414                        let attrs = node_data_ref.attributes.borrow();
1415                        !attrs
1416                            .map
1417                            .iter()
1418                            .any(|(_, val)| regexes::is_match_videos(&val.value))
1419                    }
1420                    || &node_data_ref.name.local == "object" // This currently does not check the innerHTML.
1421            });
1422        let mut node = nodes.next();
1423        while let Some(node_data_ref) = node {
1424            node = nodes.next();
1425            node_data_ref.as_node().detach()
1426        }
1427    }
1428
1429    /// Clean out spurious headers from an Element. Checks things like classnames and link density.
1430    fn clean_headers(&self, node_ref: &mut NodeRef) {
1431        let mut nodes = node_ref
1432            .descendants()
1433            .select("h1, h2")
1434            .unwrap()
1435            .filter(|node_data_ref| self.get_class_weight(node_data_ref.as_node()) < 0);
1436        let mut node = nodes.next();
1437
1438        while let Some(node_data_ref) = node {
1439            node = nodes.next();
1440            node_data_ref.as_node().detach();
1441        }
1442    }
1443
1444    /// Remove the style attribute on every element and descendants.
1445    fn clean_styles(node_ref: &mut NodeRef) {
1446        node_ref
1447            .inclusive_descendants()
1448            .elements()
1449            .filter(|node| &node.name.local != "svg")
1450            .for_each(|node_data_ref| {
1451                let mut attrs = node_data_ref.attributes.borrow_mut();
1452                PRESENTATIONAL_ATTRIBUTES.iter().for_each(|pres_attr| {
1453                    attrs.remove(*pres_attr);
1454                });
1455                if DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(&node_data_ref.name.local.as_ref()) {
1456                    attrs.remove("width");
1457                    attrs.remove("height");
1458                }
1459            });
1460    }
1461
1462    /// Clean out elements that match the specified conditions
1463    fn clean_matched_nodes(node_ref: &mut NodeRef, filter_fn: impl Fn(&NodeRef, &str) -> bool) {
1464        let end_of_search_marker_node = Self::get_next_node(node_ref, true);
1465        let mut next_node = Self::get_next_node(node_ref, false);
1466        while next_node.is_some() && next_node != end_of_search_marker_node {
1467            let node = next_node.unwrap();
1468            let attrs = node.as_element().unwrap().attributes.borrow();
1469            let class = attrs.get("class").unwrap_or("");
1470            let id = attrs.get("id").unwrap_or("");
1471            if filter_fn(&node, &(class.to_string() + " " + id)) {
1472                next_node = Self::remove_and_get_next(node.clone());
1473            } else {
1474                next_node = Self::get_next_node(&node, false);
1475            }
1476        }
1477    }
1478
1479    /// Prepare the article node for display. Clean out any inline styles, iframes,
1480    /// forms, strip extraneous <p> tags, etc.
1481    fn prep_article(&mut self, node_ref: &mut NodeRef) {
1482        Self::clean_styles(node_ref);
1483        self.mark_data_tables();
1484        Self::fix_lazy_images(node_ref);
1485        self.clean_conditionally(node_ref, "form");
1486        self.clean_conditionally(node_ref, "fieldset");
1487        Self::clean(node_ref, "object");
1488        Self::clean(node_ref, "embed");
1489        Self::clean(node_ref, "h1");
1490        Self::clean(node_ref, "footer");
1491        Self::clean(node_ref, "link");
1492        Self::clean(node_ref, "aside");
1493
1494        node_ref.children().for_each(|mut node| {
1495            Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| {
1496                regexes::is_match_share_elems(match_string)
1497                    && node.text_contents().len() < DEFAULT_CHAR_THRESHOLD
1498            });
1499        });
1500
1501        let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>();
1502        if h2_nodes.len() == 1 {
1503            let h2_node = h2_nodes[0].as_node();
1504            let length_similar_rate = ((h2_node.text_contents().len() as isize
1505                - self.article_title.len() as isize) as f32)
1506                / self.article_title.len() as f32;
1507            if length_similar_rate.abs() < 0.5 {
1508                let titles_match = if length_similar_rate > 0.0 {
1509                    h2_node.text_contents().contains(&self.article_title)
1510                } else {
1511                    self.article_title.contains(&h2_node.text_contents())
1512                };
1513                if titles_match {
1514                    Self::clean(node_ref, "h2");
1515                }
1516            }
1517        }
1518
1519        Self::clean(node_ref, "iframe");
1520        Self::clean(node_ref, "input");
1521        Self::clean(node_ref, "textarea");
1522        Self::clean(node_ref, "select");
1523        Self::clean(node_ref, "button");
1524        self.clean_headers(node_ref);
1525
1526        self.clean_conditionally(node_ref, "table");
1527        self.clean_conditionally(node_ref, "ul");
1528        self.clean_conditionally(node_ref, "div");
1529
1530        let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
1531            let p_node = node_data_ref.as_node();
1532            let img_count = p_node.select("img").unwrap().count();
1533            let embed_count = p_node.select("embed").unwrap().count();
1534            let object_count = p_node.select("object").unwrap().count();
1535            let iframe_count = p_node.select("iframe").unwrap().count();
1536            let total = img_count + embed_count + object_count + iframe_count;
1537            total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty()
1538        });
1539        let mut p_node = p_nodes.next();
1540        while let Some(p_node_ref) = p_node {
1541            p_node = p_nodes.next();
1542            p_node_ref.as_node().detach();
1543        }
1544
1545        let mut br_nodes = node_ref.select("br").unwrap().filter(|node_data_ref| {
1546            let br_node = node_data_ref.as_node();
1547            // WARN: This assumes `next_element` returns an element node.
1548            let next_node = Self::next_element(br_node.next_sibling(), true);
1549            next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
1550        });
1551        let mut br_node = br_nodes.next();
1552        while let Some(br_node_ref) = br_node {
1553            br_node = br_nodes.next();
1554            br_node_ref.as_node().detach();
1555        }
1556
1557        let mut table_nodes = node_ref.select("table").unwrap();
1558        let mut table_node = table_nodes.next();
1559        while let Some(table_node_ref) = table_node {
1560            table_node = table_nodes.next();
1561            let table_node = table_node_ref.as_node();
1562            // WARN: This assumes `next_element` returns an element node.
1563            let table_child = Self::next_element(table_node.first_child(), true);
1564            let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
1565                table_child.as_ref().unwrap()
1566            } else {
1567                table_node
1568            };
1569
1570            // WARN: This block assumes `next_element` returns an element node
1571            if Self::has_single_tag_inside_element(&tbody, "tr") {
1572                let row = Self::next_element(tbody.first_child(), true).unwrap();
1573                if Self::has_single_tag_inside_element(&row, "td") {
1574                    let mut cell = Self::next_element(row.first_child(), true).unwrap();
1575                    let tag = if cell
1576                        .children()
1577                        .all(|cell_child| Self::is_phrasing_content(&cell_child))
1578                    {
1579                        "p"
1580                    } else {
1581                        "div"
1582                    };
1583                    cell = Self::set_node_tag(&cell, tag);
1584                    if let Some(parent) = table_node.parent() {
1585                        parent.append(cell);
1586                        table_node.detach();
1587                    }
1588                }
1589            }
1590        }
1591    }
1592
1593    fn flag_is_active(&self, flag: u32) -> bool {
1594        self.flags & flag > 0
1595    }
1596
1597    fn remove_flag(&mut self, flag: u32) {
1598        self.flags = self.flags & !flag;
1599    }
1600
1601    /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
1602    /// a user wants to read. Then return it wrapped up in a div.
1603    fn grab_article(&mut self) -> Result<(), ParserError> {
1604        info!("Grabbing article {:?}", self.metadata.title);
1605        // var doc = this._doc;
1606        // var isPaging = (page !== null ? true: false);
1607        // page = page ? page : this._doc.body;
1608        let page = self.root_node.select_first("body");
1609        if page.is_err() {
1610            return Err(ParserError::new("Document has no <body>".into()).into());
1611        }
1612        let page = page.unwrap();
1613        let mut attempts: Vec<ExtractAttempt> = Vec::new();
1614
1615        // var pageCacheHtml = page.innerHTML;
1616        //TODO: Add page cache
1617
1618        loop {
1619            //   var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
1620            let strip_unlikely_candidates = self.flag_is_active(FLAG_STRIP_UNLIKELYS);
1621
1622            //   // First, node prepping. Trash nodes that look cruddy (like ones with the
1623            //   // class name "comment", etc), and turn divs into P tags where they have been
1624            //   // used inappropriately (as in, where they contain no other block level elements.)
1625            let mut elements_to_score: Vec<NodeRef> = Vec::new();
1626            let mut node = self
1627                .root_node
1628                .select_first("html")
1629                .ok()
1630                .map(|n| n.as_node().clone());
1631
1632            while let Some(node_ref) = node {
1633                let node_elem = node_ref.as_element().unwrap();
1634                let node_name: &str = node_elem.name.local.as_ref();
1635                let match_string = {
1636                    let node_attrs = node_elem.attributes.borrow();
1637                    node_attrs.get("class").unwrap_or("").to_string()
1638                        + " "
1639                        + node_attrs.get("id").unwrap_or("")
1640                };
1641                if !Self::is_probably_visible(&node_ref) {
1642                    node = Self::remove_and_get_next(node_ref);
1643                    continue;
1644                }
1645
1646                if self.check_byline(&node_ref, &match_string) {
1647                    node = Self::remove_and_get_next(node_ref);
1648                    continue;
1649                }
1650
1651                if strip_unlikely_candidates {
1652                    if regexes::is_match_unlikely(&match_string)
1653                        && !regexes::is_match_ok_maybe(&match_string)
1654                        && !Self::has_ancestor_tag(&node_ref, "table", None, None)
1655                        && node_name != "body"
1656                        && node_name != "a"
1657                    {
1658                        node = Self::remove_and_get_next(node_ref);
1659                        continue;
1660                    }
1661
1662                    let is_complementary = {
1663                        let node_attrs = node_elem.attributes.borrow();
1664                        node_attrs.get("role") == Some("complementary")
1665                    };
1666                    if is_complementary {
1667                        node = Self::remove_and_get_next(node_ref);
1668                        continue;
1669                    }
1670                }
1671
1672                match node_name {
1673                    "div" | "section" | "header" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
1674                        if Self::is_element_without_content(&node_ref) {
1675                            node = Self::remove_and_get_next(node_ref);
1676                            continue;
1677                        }
1678                    }
1679                    _ => (),
1680                }
1681                if DEFAULT_TAGS_TO_SCORE.contains(&node_name) {
1682                    elements_to_score.push(node_ref.clone());
1683                }
1684                if node_name == "div" {
1685                    let mut p: Option<NodeRef> = None;
1686                    let mut child_node = node_ref.first_child();
1687                    while let Some(child_node_ref) = child_node {
1688                        let next_sibling = child_node_ref.next_sibling();
1689                        if Self::is_phrasing_content(&child_node_ref) {
1690                            if let Some(ref p_node) = p {
1691                                p_node.append(child_node_ref);
1692                            } else if !Self::is_whitespace(&child_node_ref) {
1693                                let new_p_node = NodeRef::new_element(
1694                                    QualName::new(
1695                                        None,
1696                                        Namespace::from(HTML_NS),
1697                                        LocalName::from("p"),
1698                                    ),
1699                                    BTreeMap::new(),
1700                                );
1701                                child_node_ref.insert_before(new_p_node);
1702                                p = child_node_ref.previous_sibling();
1703                                // Append will implicitly detach the child_node_ref
1704                                p.as_mut().unwrap().append(child_node_ref);
1705                            }
1706                        } else if let Some(ref p_node) = p {
1707                            while let Some(last_child) = p_node.last_child() {
1708                                if Self::is_whitespace(&last_child) {
1709                                    last_child.detach();
1710                                } else {
1711                                    break;
1712                                }
1713                            }
1714                            p = None;
1715                        }
1716                        child_node = next_sibling;
1717                    }
1718                    if Self::has_single_tag_inside_element(&node_ref, "p")
1719                        && Self::get_link_density(&node_ref) < 0.25
1720                    {
1721                        // WARN: This assumes `next_element` returns an element node.
1722                        let new_node = Self::next_element(node_ref.first_child(), true).unwrap();
1723                        elements_to_score.push(new_node.clone());
1724                        node_ref.insert_before(new_node);
1725                        let new_node = node_ref.previous_sibling();
1726                        node_ref.detach();
1727                        node = new_node;
1728                        elements_to_score.push(node.clone().unwrap());
1729                    } else if !Self::has_child_block_element(&node_ref) {
1730                        node = Some(Self::set_node_tag(&node_ref, "p"));
1731                        elements_to_score.push(node.clone().unwrap());
1732                    }
1733                }
1734                node = Self::get_next_node(&node_ref, false);
1735            }
1736
1737            let mut candidates: Vec<NodeRef> = Vec::new();
1738            elements_to_score
1739                .iter()
1740                .filter(|node_ref| {
1741                    let parent = node_ref.parent();
1742                    parent.is_some() && parent.unwrap().as_element().is_some()
1743                })
1744                .map(|node_ref| (node_ref, Self::get_inner_text(&node_ref, None)))
1745                .filter(|(_, inner_text)| inner_text.len() >= 25)
1746                .map(|(node_ref, inner_text)| {
1747                    (inner_text, Self::get_node_ancestors(&node_ref, Some(3)))
1748                })
1749                .filter(|(_, ancestors)| ancestors.len() != 0)
1750                .for_each(|(inner_text, ancestors)| {
1751                    let mut content_score = 0;
1752                    content_score += 1;
1753                    content_score += inner_text.split(",").count();
1754                    content_score += (3).min(inner_text.len() / 100);
1755                    ancestors
1756                        .into_iter()
1757                        .enumerate()
1758                        .filter(|(_, node)| {
1759                            node.parent().is_some() && node.parent().unwrap().as_element().is_some()
1760                        })
1761                        .for_each(|(level, mut ancestor)| {
1762                            let has_readability = {
1763                                let ancestor_attrs =
1764                                    ancestor.as_element().unwrap().attributes.borrow();
1765                                ancestor_attrs.contains(READABILITY_SCORE)
1766                            };
1767                            if !has_readability {
1768                                self.initialize_node(&mut ancestor);
1769                                candidates.push(ancestor.clone());
1770                            }
1771
1772                            let score_divider = if level == 0 {
1773                                1.0
1774                            } else if level == 1 {
1775                                2.0
1776                            } else {
1777                                level as f32 * 3.0
1778                            };
1779                            let mut ancestor_attrs =
1780                                ancestor.as_element().unwrap().attributes.borrow_mut();
1781                            if let Some(readability_score) =
1782                                ancestor_attrs.get_mut(READABILITY_SCORE)
1783                            {
1784                                *readability_score = (readability_score.parse::<f32>().unwrap()
1785                                    + (content_score as f32 / score_divider))
1786                                    .to_string();
1787                            }
1788                        });
1789                });
1790
1791            let mut top_candidates: Vec<NodeRef> = Vec::new();
1792            for candidate in candidates {
1793                let mut candidate_score = 0.0;
1794                {
1795                    let mut candidate_attr =
1796                        candidate.as_element().unwrap().attributes.borrow_mut();
1797                    if let Some(readability_score) = candidate_attr.get_mut(READABILITY_SCORE) {
1798                        candidate_score = readability_score.parse::<f32>().unwrap()
1799                            * (1.0 - Self::get_link_density(&candidate));
1800                        *readability_score = candidate_score.to_string();
1801                    }
1802                }
1803                let nb_top_candidates = 5;
1804                for i in 0..nb_top_candidates {
1805                    let top_candidate = top_candidates.get(i);
1806                    let top_candidate_score = top_candidate
1807                        .as_ref()
1808                        .map(|node_ref| node_ref.as_element().unwrap().attributes.borrow())
1809                        .map(|attrs| {
1810                            attrs
1811                                .get(READABILITY_SCORE)
1812                                .unwrap_or("0")
1813                                .parse::<f32>()
1814                                .unwrap()
1815                        });
1816                    if top_candidate.is_none() || candidate_score > top_candidate_score.unwrap() {
1817                        top_candidates.splice(i..i, vec![candidate].into_iter());
1818                        if top_candidates.len() > nb_top_candidates {
1819                            top_candidates.pop();
1820                        }
1821                        break;
1822                    }
1823                }
1824            }
1825
1826            let possible_top_candidate = top_candidates.get(0);
1827            let mut top_candidate;
1828            let mut needed_to_create_top_candidate = false;
1829            let mut parent_of_top_candidate: NodeRef;
1830
1831            if possible_top_candidate.is_none()
1832                || possible_top_candidate
1833                    .map(|node| &node.as_element().unwrap().name.local)
1834                    .as_ref()
1835                    .unwrap()
1836                    == &"body"
1837            {
1838                top_candidate = NodeRef::new_element(
1839                    QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
1840                    BTreeMap::new(),
1841                );
1842                needed_to_create_top_candidate = true;
1843                let mut page_children = page.as_node().children();
1844                let mut page_child = page_children.next();
1845                while let Some(child_node) = page_child {
1846                    page_child = page_children.next();
1847                    top_candidate.append(child_node);
1848                }
1849                page.as_node().append(top_candidate.clone());
1850                self.initialize_node(&mut top_candidate);
1851            } else {
1852                let alternative_candidate_ancestors: Vec<Vec<NodeRef>>;
1853                top_candidate = top_candidates.get(0).unwrap().clone();
1854                let top_candidate_score = {
1855                    let top_candidate_node_attrs =
1856                        top_candidate.as_element().unwrap().attributes.borrow();
1857                    top_candidate_node_attrs
1858                        .get(READABILITY_SCORE)
1859                        .unwrap()
1860                        .parse::<f32>()
1861                        .unwrap()
1862                };
1863
1864                alternative_candidate_ancestors = top_candidates
1865                    .iter()
1866                    .skip(1)
1867                    .filter(|top_candidate_node| {
1868                        let candidate_node_score = {
1869                            let top_candidate_node_attrs =
1870                                top_candidate_node.as_element().unwrap().attributes.borrow();
1871                            top_candidate_node_attrs
1872                                .get(READABILITY_SCORE)
1873                                .unwrap()
1874                                .parse::<f32>()
1875                                .unwrap()
1876                        };
1877                        (candidate_node_score / top_candidate_score) >= 0.75
1878                    })
1879                    .map(|node| Self::get_node_ancestors(&node, None))
1880                    .collect();
1881
1882                let minimum_top_candidates = 3;
1883                if alternative_candidate_ancestors.len() >= minimum_top_candidates {
1884                    parent_of_top_candidate = top_candidate.parent().unwrap();
1885                    while &parent_of_top_candidate.as_element().unwrap().name.local != "body" {
1886                        let mut lists_containing_this_ancestor = alternative_candidate_ancestors
1887                            .iter()
1888                            .filter(|node_vec| node_vec.contains(&parent_of_top_candidate))
1889                            .count();
1890                        lists_containing_this_ancestor =
1891                            lists_containing_this_ancestor.min(minimum_top_candidates);
1892                        if lists_containing_this_ancestor >= minimum_top_candidates {
1893                            top_candidate = parent_of_top_candidate;
1894                            break;
1895                        }
1896                        parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
1897                    }
1898                }
1899
1900                let top_candidate_readability = {
1901                    let top_candidate_attrs =
1902                        top_candidate.as_element().unwrap().attributes.borrow();
1903                    top_candidate_attrs
1904                        .get(READABILITY_SCORE)
1905                        .map(|x| x.to_owned())
1906                };
1907
1908                if top_candidate_readability.is_none() {
1909                    self.initialize_node(&mut top_candidate);
1910                }
1911                parent_of_top_candidate = top_candidate.parent().unwrap();
1912
1913                let mut last_score = {
1914                    let top_candidate_node_attrs =
1915                        top_candidate.as_element().unwrap().attributes.borrow();
1916                    top_candidate_node_attrs
1917                        .get(READABILITY_SCORE)
1918                        .unwrap()
1919                        .parse::<f32>()
1920                        .unwrap()
1921                };
1922                let score_threshold = last_score / 3.0;
1923                while parent_of_top_candidate
1924                    .as_element()
1925                    .map(|elem| elem.name.local.as_ref())
1926                    .unwrap()
1927                    != "body"
1928                {
1929                    let parent_readability = {
1930                        let parent_attrs = parent_of_top_candidate
1931                            .as_element()
1932                            .unwrap()
1933                            .attributes
1934                            .borrow();
1935                        parent_attrs
1936                            .get(READABILITY_SCORE)
1937                            .map(|score| score.parse::<f32>().unwrap())
1938                    };
1939                    if parent_readability.is_none() {
1940                        parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
1941                        continue;
1942                    }
1943                    if parent_readability.as_ref().unwrap() < &score_threshold {
1944                        break;
1945                    }
1946                    if parent_readability.as_ref().unwrap() > &last_score {
1947                        top_candidate = parent_of_top_candidate;
1948                        break;
1949                    }
1950                    last_score = parent_readability.unwrap();
1951                    parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
1952                }
1953
1954                parent_of_top_candidate = top_candidate.parent().unwrap();
1955                while &parent_of_top_candidate.as_element().unwrap().name.local != "body"
1956                    && parent_of_top_candidate.children().count() == 1
1957                {
1958                    top_candidate = parent_of_top_candidate;
1959                    parent_of_top_candidate = top_candidate.parent().unwrap();
1960                }
1961                let top_candidate_readability = {
1962                    let top_candidate_attrs =
1963                        top_candidate.as_element().unwrap().attributes.borrow();
1964                    top_candidate_attrs
1965                        .get(READABILITY_SCORE)
1966                        .map(|score| score.to_string())
1967                };
1968                if top_candidate_readability.is_none() {
1969                    self.initialize_node(&mut top_candidate);
1970                }
1971            }
1972            let mut article_content = NodeRef::new_element(
1973                QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
1974                BTreeMap::new(),
1975            );
1976            let top_candidate_score = {
1977                let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
1978                top_candidate_attrs
1979                    .get(READABILITY_SCORE)
1980                    .map(|score| score.parse::<f32>().unwrap())
1981                    .unwrap()
1982            };
1983
1984            let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2);
1985            parent_of_top_candidate = top_candidate.parent().unwrap();
1986
1987            let mut siblings = parent_of_top_candidate
1988                .children()
1989                .filter(|node| node.as_element().is_some());
1990
1991            let (top_candidate_class, top_candidate_score) = {
1992                let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
1993                let class = top_candidate_attrs
1994                    .get("class")
1995                    .map(|class| class.to_string())
1996                    .unwrap_or("".to_string());
1997                let score = top_candidate_attrs
1998                    .get(READABILITY_SCORE)
1999                    .map(|score| score.parse::<f32>().unwrap())
2000                    .unwrap();
2001                (class, score)
2002            };
2003            let mut next_sibling = siblings.next();
2004            while let Some(sibling) = next_sibling {
2005                next_sibling = siblings.next();
2006                let mut append = false;
2007                if sibling == top_candidate {
2008                    append = true;
2009                } else {
2010                    let mut content_bonus = 0.0;
2011                    let sibling_attrs = sibling.as_element().unwrap().attributes.borrow();
2012
2013                    let sibling_class = sibling_attrs
2014                        .get("class")
2015                        .map(|class| class.to_string())
2016                        .unwrap_or("".to_string());
2017                    let sibling_score = sibling_attrs
2018                        .get(READABILITY_SCORE)
2019                        .map(|score| score.parse::<f32>().unwrap());
2020
2021                    if sibling_class == top_candidate_class && !top_candidate_class.is_empty() {
2022                        content_bonus += top_candidate_score * 0.2;
2023                    }
2024
2025                    if sibling_score.is_some()
2026                        && (sibling_score.unwrap() + content_bonus) >= sibling_score_threshold
2027                    {
2028                        append = true;
2029                    } else if sibling.as_element().map(|elem| elem.name.local.as_ref()) == Some("p")
2030                    {
2031                        let link_density = Self::get_link_density(&sibling);
2032                        let node_content = Self::get_inner_text(&sibling, None);
2033                        let node_length = node_content.len();
2034                        if node_length > 80 && link_density < 0.25 {
2035                            append = true;
2036                        } else if node_length < 80
2037                            && node_length > 0
2038                            && link_density == 0.0
2039                            && !regexes::is_match_node_content(&node_content)
2040                        {
2041                            append = true;
2042                        }
2043                    }
2044                }
2045                if append {
2046                    let new_article_child = if !ALTER_TO_DIV_EXCEPTIONS.contains(
2047                        &sibling
2048                            .as_element()
2049                            .map(|elem| elem.name.local.as_ref())
2050                            .unwrap(),
2051                    ) {
2052                        Self::set_node_tag(&sibling, "div")
2053                    } else {
2054                        sibling
2055                    };
2056                    article_content.append(new_article_child);
2057                }
2058            }
2059            self.prep_article(&mut article_content);
2060            if needed_to_create_top_candidate {
2061                let mut top_candidate_attrs =
2062                    top_candidate.as_element().unwrap().attributes.borrow_mut();
2063                top_candidate_attrs.insert("id", "readability-page-1".to_string());
2064                top_candidate_attrs.insert("class", "page".to_string());
2065            } else {
2066                let div = NodeRef::new_element(
2067                    QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
2068                    BTreeMap::new(),
2069                );
2070                {
2071                    let mut div_attrs = div.as_element().unwrap().attributes.borrow_mut();
2072                    div_attrs.insert("id", "readability-page-1".to_string());
2073                    div_attrs.insert("class", "page".to_string());
2074                }
2075                for child in article_content.children() {
2076                    div.append(child);
2077                }
2078                article_content.append(div);
2079            }
2080
2081            let text_length = Self::get_inner_text(&article_content, Some(true)).len();
2082            let mut parse_successful = true;
2083            if text_length < DEFAULT_CHAR_THRESHOLD {
2084                parse_successful = false;
2085                if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
2086                    self.remove_flag(FLAG_STRIP_UNLIKELYS);
2087                    attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2088                } else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
2089                    self.remove_flag(FLAG_WEIGHT_CLASSES);
2090                    attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2091                } else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
2092                    self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
2093                    attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2094                } else {
2095                    attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
2096                    attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
2097                    if attempts.first().as_ref().unwrap().length == 0 {
2098                        return Err(ParserError::new(
2099                            "Unable to extract content".into(),
2100                        )
2101                        .into());
2102                    }
2103                    article_content = attempts[0].article.clone();
2104                    parse_successful = true;
2105                }
2106            }
2107            if parse_successful {
2108                let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None);
2109                let ancestors = vec![
2110                    vec![parent_of_top_candidate, top_candidate],
2111                    parent_ancestors,
2112                ]
2113                .concat();
2114                ancestors.iter().any(|node| {
2115                    let node_elem = node.as_element();
2116                    if node_elem.is_none() {
2117                        return false;
2118                    }
2119                    let node_attrs = node_elem.unwrap().attributes.borrow();
2120                    if let Some(dir_attr) = node_attrs.get("dir") {
2121                        self.article_dir = Some(dir_attr.to_string());
2122                        return true;
2123                    }
2124                    false
2125                });
2126                self.article_node = Some(article_content);
2127                info!("Successfully grabbed article {:?}", self.metadata.title);
2128                return Ok(());
2129            }
2130        }
2131    }
2132}
2133
2134/// This represents the article node extracted after running the grab_article method
2135#[derive(Debug)]
2136struct ExtractAttempt {
2137    article: NodeRef,
2138    length: usize,
2139}
2140
2141impl ExtractAttempt {
2142    pub fn new(article: NodeRef, length: usize) -> Self {
2143        ExtractAttempt { article, length }
2144    }
2145}
2146
/// Metadata collected about an article.
///
/// Every field starts out empty; see [`MetaData::new`] / [`MetaData::default`].
#[derive(Debug, Default, PartialEq)]
pub struct MetaData {
    byline: Option<String>,
    excerpt: Option<String>,
    site_name: Option<String>,
    title: String,
}

impl MetaData {
    /// Creates metadata with no byline, excerpt or site name and an empty title.
    ///
    /// Equivalent to [`MetaData::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the title (an empty string if none has been set).
    pub fn title(&self) -> &str {
        &self.title
    }

    /// Returns the byline, or `None` if none has been set.
    pub fn byline(&self) -> Option<&String> {
        self.byline.as_ref()
    }
}
2173
2174#[cfg(test)]
2175mod test {
2176    use super::{
2177        MetaData, Readability, SizeInfo, FLAG_CLEAN_CONDITIONALLY, FLAG_STRIP_UNLIKELYS,
2178        FLAG_WEIGHT_CLASSES, HTML_NS, READABILITY_SCORE,
2179    };
2180    use html5ever::{LocalName, Namespace, QualName};
2181    use kuchiki::traits::*;
2182    use kuchiki::NodeRef;
2183
    // Shared HTML fixture for the tests below. The body holds a top-level <h1> and <img>, a
    // `div.invalid-elems` wrapping a heading and an <img> without `src`, and an <img>
    // followed by a <noscript> whose only markup is a div containing a single <img>.
    // TODO: Refactor not to use test file possibly
    const TEST_HTML: &'static str = r#"
    <!DOCTYPE html>
    <html lang="en">
    
    <head>
        <title>Sample Document</title>
    </head>
    
    <body>
        <h1>Some text in h1</h1>
        <img src="inexistent.png">
        <div class="invalid-elems">
            <!-- This div contains invalid elements -->
            <h1>Imagine some lorem ipsum</h1>
            <img>
        </div>
        <!-- Test that the no-script content is copied over -->
        <img src="lazy-load.png">
        <noscript>
            <div class="parent">
                <img src="eager-load.png" id="lazy-load">
            </div>
        </noscript>
    </body>
    
    </html>
    
    "#;
2213
2214    #[test]
2215    fn test_unwrap_no_script_tags() {
2216        let mut readability = Readability::new(TEST_HTML);
2217        let img_count = readability.root_node.select("img").unwrap().count();
2218        assert_eq!(3, img_count);
2219        readability.unwrap_no_script_tags();
2220        let img_count = readability.root_node.select("img").unwrap().count();
2221        assert_eq!(2, img_count);
2222
2223        // Ensure attributes were copied over
2224        let updated_img = readability.root_node.select_first("img#lazy-load").unwrap();
2225        let updated_img_attrs = updated_img.attributes.borrow();
2226        assert_eq!(true, updated_img_attrs.contains("data-old-src"));
2227        assert_eq!(Some("lazy-load.png"), updated_img_attrs.get("data-old-src"));
2228        assert_eq!(Some("eager-load.png"), updated_img_attrs.get("src"));
2229    }
2230
2231    #[test]
2232    fn test_is_single_image() {
2233        let readability = Readability::new(TEST_HTML);
2234
2235        let img_elem_ref = readability.root_node.select_first("img").unwrap();
2236        assert_eq!(true, Readability::is_single_image(&img_elem_ref.as_node()));
2237
2238        let noscript_elem_ref = readability.root_node.select_first("noscript").unwrap();
2239        assert_eq!(
2240            false,
2241            Readability::is_single_image(&noscript_elem_ref.as_node())
2242        );
2243
2244        let div_elem_ref = readability
2245            .root_node
2246            .select_first("div.invalid-elems")
2247            .unwrap();
2248        assert_eq!(false, Readability::is_single_image(&div_elem_ref.as_node()));
2249
2250        let div_elem_ref = kuchiki::parse_fragment(
2251            QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
2252            Vec::new(),
2253        )
2254        .one(noscript_elem_ref.as_node().text_contents().trim());
2255
2256        assert_eq!(true, Readability::is_single_image(&div_elem_ref));
2257    }
2258
2259    #[test]
2260    fn test_remove_scripts() {
2261        let mut readability = Readability::new(TEST_HTML);
2262
2263        let noscript_elems = readability.root_node.select("noscript").unwrap();
2264        assert_eq!(1, noscript_elems.count());
2265        readability.remove_scripts();
2266        let noscript_elems = readability.root_node.select("noscript").unwrap();
2267        assert_eq!(0, noscript_elems.count());
2268    }
2269
2270    #[test]
2271    fn test_next_element() {
2272        let html_str = r#"
2273         <p id="a">This is a node</p>
2274         <!-- Commented content  -->
2275         <p id="b">This is another node. The next line is just whitespace</p>
2276
2277         This is standalone text
2278         <p> Some <span>more</span> text</p>"#;
2279        let doc = Readability::new(html_str);
2280        let p = doc.root_node.select_first("#a").unwrap();
2281        let p = p.as_node();
2282        let mut p_node_option: Option<NodeRef> = Some(p.clone());
2283        p_node_option = Readability::next_element(p_node_option, false);
2284        assert_eq!(Some(p.clone()), p_node_option);
2285
2286        let p_node_option = p_node_option.unwrap();
2287        let p_node_option = p_node_option.as_element();
2288        let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
2289        assert_eq!("a", p_node_option_attr.get("id").unwrap());
2290
2291        let next = Readability::next_element(p.next_sibling(), false);
2292
2293        let next = next.unwrap();
2294        let next_elem = next.as_element();
2295        let next_attr = next_elem.unwrap().attributes.borrow();
2296        assert_eq!("b", next_attr.get("id").unwrap());
2297
2298        let next = Readability::next_element(next.next_sibling(), false);
2299
2300        let next = next.unwrap();
2301        assert_eq!(true, next.as_text().is_some());
2302        assert_eq!("This is standalone text", next.text_contents().trim());
2303
2304        let next = Readability::next_element(None, false);
2305        assert_eq!(None, next);
2306    }
2307
2308    #[test]
2309    fn test_is_phrasing_content() {
2310        let html_str = r#"
2311        Some text node
2312        <b>This is a phrasing content node</b>
2313        <p>This is not a phrasing content node</p>
2314        <a href="\#"><i>This is also a phrasing content</i></a>
2315        <a href="\#"><p>This is not a phrasing content</p></a>
2316        "#;
2317        let doc = Readability::new(html_str);
2318        let body = doc.root_node.select_first("body").unwrap();
2319        let body = body.as_node();
2320        let mut body_children = body.children();
2321        let mut node = body_children.next().unwrap();
2322        assert_eq!(true, node.as_text().is_some());
2323        assert_eq!(true, Readability::is_phrasing_content(&node));
2324
2325        node = node.next_sibling().unwrap();
2326        assert_eq!("b", &node.as_element().unwrap().name.local);
2327        assert_eq!(true, Readability::is_phrasing_content(&node));
2328
2329        node = node.next_sibling().unwrap(); // Skips the text node from the new line character
2330        node = node.next_sibling().unwrap();
2331        assert_eq!("p", &node.as_element().unwrap().name.local);
2332        assert_eq!(false, Readability::is_phrasing_content(&node));
2333
2334        node = node.next_sibling().unwrap(); // Skips the text node from the new line character
2335        node = node.next_sibling().unwrap();
2336        assert_eq!("a", &node.as_element().unwrap().name.local);
2337        assert_eq!(true, Readability::is_phrasing_content(&node));
2338
2339        node = node.next_sibling().unwrap(); // Skips the text node from the new line character
2340        node = node.next_sibling().unwrap();
2341        assert_eq!("a", &node.as_element().unwrap().name.local);
2342        assert_eq!(false, Readability::is_phrasing_content(&node));
2343    }
2344
2345    #[test]
2346    fn test_is_whitespace() {
2347        let html_str = r#"
2348        <p>Definitely not whitespace</p>
2349        I am also not whitespace
2350        <p>     </p>
2351        <br>
2352        "#;
2353        let doc = Readability::new(html_str);
2354        let body = doc.root_node.select_first("body").unwrap();
2355
2356        let mut node = body.as_node().first_child().unwrap();
2357        assert_eq!("p", &node.as_element().unwrap().name.local);
2358        assert_eq!(false, Readability::is_whitespace(&node));
2359
2360        node = node.next_sibling().unwrap();
2361        assert_eq!(true, node.as_text().is_some());
2362        assert_eq!(false, Readability::is_whitespace(&node));
2363
2364        node = node.next_sibling().unwrap();
2365        assert_eq!("p", &node.as_element().unwrap().name.local);
2366        assert_eq!(
2367            true,
2368            Readability::is_whitespace(&node.first_child().unwrap())
2369        );
2370
2371        // This is testing the new line character in between the <p> and <br> tags
2372        node = node.next_sibling().unwrap();
2373        assert_eq!(true, node.as_text().is_some());
2374        assert_eq!(true, Readability::is_whitespace(&node));
2375
2376        node = node.next_sibling().unwrap();
2377        assert_eq!("br", &node.as_element().unwrap().name.local);
2378        assert_eq!(true, Readability::is_whitespace(&node));
2379    }
2380
2381    #[test]
2382    fn test_set_node_tag() {
2383        let html_str = r#"
2384        <div id="target" class="some random class" tabindex="0"><p>Child 1</p><p>Child 2</p></div>
2385        <div id="not-the-target">The div above is being replaced</div>
2386        "#;
2387        let doc = Readability::new(html_str);
2388        let target = doc.root_node.select_first("#target").unwrap();
2389        let children_count = doc.root_node.children().count();
2390        let target_children_count = target.as_node().children().count();
2391
2392        assert_eq!("div", &target.name.local);
2393        let new_node = Readability::set_node_tag(target.as_node(), "section");
2394
2395        assert_eq!(children_count, doc.root_node.children().count());
2396        let target = doc.root_node.select_first("#target").unwrap();
2397        assert_eq!(&new_node, target.as_node());
2398        assert_eq!("section", &target.name.local);
2399        assert_eq!(target_children_count, target.as_node().children().count());
2400
2401        let target_attrs = target.as_node().as_element().unwrap().attributes.borrow();
2402        assert_eq!(3, target_attrs.map.len());
2403
2404        let old_div = doc.root_node.select_first("div#target");
2405        assert_eq!(true, old_div.is_err());
2406    }
2407
2408    #[test]
2409    fn test_replace_node_tags() {
2410        let html_str = r#"
2411        <div id="replace-p">
2412          <p>Tag 1</p><p>Tag 2</p><p>Tag 3</p>
2413        </div>
2414        "#;
2415        let doc = Readability::new(html_str);
2416        let target_parent = doc.root_node.select_first("div#replace-p").unwrap();
2417        let target_parent_child_count = target_parent.as_node().children().count();
2418        let nodes = target_parent.as_node().select("p").unwrap();
2419
2420        Readability::replace_node_tags(nodes, "span");
2421        assert_eq!(
2422            target_parent_child_count,
2423            target_parent.as_node().children().count()
2424        );
2425
2426        let nodes = target_parent.as_node().select("p").unwrap();
2427        assert_eq!(0, nodes.count());
2428        let nodes = target_parent.as_node().select("span").unwrap();
2429        assert_eq!(3, nodes.count());
2430    }
2431
2432    #[test]
2433    fn test_replace_brs() {
2434        let html_str = r#"
2435        <div>foo<br>bar<br> <br><br>abc</div>
2436        "#;
2437        let mut doc = Readability::new(html_str);
2438        let div = doc.root_node.select_first("div").unwrap();
2439        let br_count = div.as_node().select("br").unwrap().count();
2440        let p_count = div.as_node().select("p").unwrap().count();
2441        assert_eq!(4, br_count);
2442        assert_eq!(0, p_count);
2443
2444        doc.replace_brs();
2445        let br_count = div.as_node().select("br").unwrap().count();
2446        let p_count = div.as_node().select("p").unwrap().count();
2447        assert_eq!(1, br_count);
2448        assert_eq!(1, p_count);
2449
2450        let p_node = div.as_node().select_first("p").unwrap();
2451        assert_eq!("abc", p_node.as_node().text_contents());
2452
2453        let html_str = r#"
2454        <p>foo<br>bar<br> <br><br>abc</p>
2455        "#;
2456        doc = Readability::new(html_str);
2457        let p = doc.root_node.select_first("p").unwrap();
2458        let div_count = doc.root_node.select("div").unwrap().count();
2459        let br_count = p.as_node().select("br").unwrap().count();
2460        assert_eq!(4, br_count);
2461        assert_eq!(0, div_count);
2462
2463        doc.replace_brs();
2464        let br_count = doc.root_node.select("br").unwrap().count();
2465        let div_count = doc.root_node.select("div").unwrap().count();
2466        let p_count = doc.root_node.select("p").unwrap().count();
2467        assert_eq!(1, br_count);
2468        assert_eq!(1, div_count);
2469        assert_eq!(1, p_count);
2470        let p_node = doc.root_node.select_first("p").unwrap();
2471        assert_eq!("abc", p_node.as_node().text_contents());
2472    }
2473
2474    #[test]
2475    fn test_prep_document() {
2476        let html_str = r#"
2477        <!DOCTYPE html>
2478        <html>
2479          <head>
2480            <style>div {padding: 20px; border-bottom: 2px solid black; }</style>
2481          </head>
2482          <body>
2483            <font face="Times New Roman" size="10">Times New Roman</font>
2484            <div>foo<br>bar<br> <br><br>abc</div>
2485          </body>
2486        </html>
2487        "#;
2488        let mut doc = Readability::new(html_str);
2489        doc.prep_document();
2490
2491        let style_nodes = doc.root_node.select("style").unwrap();
2492        let font_nodes = doc.root_node.select("font").unwrap();
2493        let p_nodes = doc.root_node.select("p").unwrap();
2494        let br_nodes = doc.root_node.select("br").unwrap();
2495        assert_eq!(0, style_nodes.count());
2496        assert_eq!(0, font_nodes.count());
2497        assert_eq!(1, p_nodes.count());
2498        assert_eq!(1, br_nodes.count());
2499    }
2500
2501    #[test]
2502    fn test_inline_css_str_to_map() {
2503        use std::collections::HashMap;
2504        let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
2505        let mut css_map = HashMap::new();
2506        css_map.insert("display".to_string(), "flex".to_string());
2507        css_map.insert("height".to_string(), "200px".to_string());
2508        css_map.insert("width".to_string(), "250px".to_string());
2509        css_map.insert("justify-content".to_string(), "center".to_string());
2510        css_map.insert("align-items".to_string(), "center".to_string());
2511        css_map.insert("border".to_string(), "2px solid black".to_string());
2512
2513        let css_str_to_map = Readability::inline_css_str_to_map(css_str);
2514        assert_eq!(css_map, css_str_to_map);
2515        let mut css_map = HashMap::new();
2516        css_map.insert("color".to_string(), "red".to_string());
2517        css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string());
2518        assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')"));
2519
2520        let empty_map = HashMap::new();
2521        assert_eq!(empty_map, Readability::inline_css_str_to_map(" \n \t \r"));
2522        assert_eq!(empty_map, Readability::inline_css_str_to_map("color"));
2523
2524        let mut css_map = HashMap::new();
2525        css_map.insert("color".to_string(), "red".to_string());
2526        css_map.insert("height".to_string(), "300px".to_string());
2527        assert_eq!(
2528            css_map,
2529            Readability::inline_css_str_to_map("color: red;height: 300px;width")
2530        );
2531    }
2532
2533    #[test]
2534    fn test_is_probably_visible() {
2535        let html_str = r#"
2536        <!DOCTYPE html>
2537        <html>
2538          <body>
2539            <p id="visible">Lorem ipsum dolores</p>
2540            <div id="hidden-div" style="display: none">
2541              <p>This is hidden and so is the parent</p>
2542            </div>
2543            <input value="Some good CSRF token" hidden>
2544            <div id="hidden-aria" style="display: flex;" aria-hidden="true">
2545              <p>This is not considered visible</p>
2546            </div>
2547            <div id="visible-aria" style="display: flex;" aria-hidden="false">
2548              <p>This is considered visible</p>
2549            </div>
2550            <img src="./some-img.png" class="fallback-image">
2551            <div id="visible-div" style="display: block" class="visible" aria-hidden="false">
2552              <p>This is fully visible</p>
2553            </div>
2554          </body>
2555        </html>
2556      "#;
2557        let doc = Readability::new(html_str);
2558        let div_node = doc.root_node.select_first("div#hidden-div").unwrap();
2559        let p_node = doc.root_node.select_first("p#visible").unwrap();
2560        let input_node = doc.root_node.select_first("input").unwrap();
2561        let hidden_aria_div_node = doc.root_node.select_first("div#hidden-aria").unwrap();
2562        let visible_aria_div_node = doc.root_node.select_first("div#visible-aria").unwrap();
2563        let img_node = doc.root_node.select_first("img").unwrap();
2564        let visible_div_node = doc.root_node.select_first("div#visible-div").unwrap();
2565        assert_eq!(true, Readability::is_probably_visible(&p_node.as_node()));
2566        assert_eq!(false, Readability::is_probably_visible(&div_node.as_node()));
2567        assert_eq!(
2568            false,
2569            Readability::is_probably_visible(&input_node.as_node())
2570        );
2571        assert_eq!(
2572            false,
2573            Readability::is_probably_visible(&hidden_aria_div_node.as_node())
2574        );
2575        assert_eq!(
2576            true,
2577            Readability::is_probably_visible(&visible_aria_div_node.as_node())
2578        );
2579        assert_eq!(true, Readability::is_probably_visible(&img_node.as_node()));
2580        assert_eq!(
2581            true,
2582            Readability::is_probably_visible(&visible_div_node.as_node())
2583        );
2584    }
2585
2586    #[test]
2587    fn test_check_byline() {
2588        let html_str = r#"
2589        <!DOCTYPE html>
2590        <html>
2591        <body>
2592          <p class="byline description" id="author">
2593This test is used to find out whether a given node is a byline. This works by checking whether
2594a node has a rel attribute with "author" as its value, or if "author"
2595is part of its value in the itemprop attribute. If neither is the case then it checks whether the classes and id
2596of the node match a regex of a potential byline. If any condition is met, then the content must be less than 100
2597characters. For that reason, this <p> tag could not be a byline because it's too long.
2598          </p>
2599          <p class="author">A Paperoni maintainer</p>
2600          <p class="authors not-byline"></p>
2601          <p rel="author">Maintainer of Paperoni</p>
2602        </body>
2603        </html>
2604        "#;
2605        let mut doc = Readability::new(html_str);
2606        assert_eq!(&None, &doc.byline);
2607        let p1_node = doc.root_node.select_first("p.byline").unwrap();
2608        let p2_node = doc.root_node.select_first("p.author").unwrap();
2609        let p3_node = doc.root_node.select_first("p.not-byline").unwrap();
2610        let p4_node = doc.root_node.select_first(r#"p[rel="author""#).unwrap();
2611        assert_eq!(
2612            false,
2613            doc.check_byline(p1_node.as_node(), "byline description author")
2614        );
2615        assert_eq!(true, doc.check_byline(p2_node.as_node(), "author"));
2616        assert_eq!(
2617            false,
2618            doc.check_byline(p3_node.as_node(), "authors not-byline")
2619        );
2620        assert_eq!(Some("A Paperoni maintainer".into()), doc.byline);
2621        // The test below is false because there is already an existing byline.
2622        assert_eq!(false, doc.check_byline(p4_node.as_node(), ""));
2623    }
2624
2625    #[test]
2626    fn test_get_next_node() {
2627        let html_str = r#"
2628        <!DOCTYPE html>
2629        <html>
2630          <body>
2631            <div id="body-child-1">
2632              <p id="start">Foobar content</p>
2633              <div id="start-sib">
2634                <span>First child</span>
2635              </div>
2636            </div>
2637            <div id="body-child-2"><span>This will not be reached</p></div>
2638            <p id="body-child-last">Last element</p>
2639          </body>
2640        </html>
2641        "#;
2642        let doc = Readability::new(html_str);
2643        let node = doc.root_node.select_first("p#start").unwrap();
2644        let next_node = Readability::get_next_node(node.as_node(), false);
2645        assert_eq!(true, next_node.is_some());
2646        let next_node = next_node.unwrap();
2647        let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2648        assert_eq!(Some("start-sib"), next_node_attr.get("id"));
2649
2650        let next_node = Readability::get_next_node(&next_node, false);
2651        assert_eq!(true, next_node.is_some());
2652        let next_node = next_node.unwrap();
2653        assert_eq!("span", &next_node.as_element().unwrap().name.local);
2654
2655        let next_node = Readability::get_next_node(&next_node, false);
2656        assert_eq!(true, next_node.is_some());
2657        let next_node = next_node.unwrap();
2658        let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2659        assert_eq!(Some("body-child-2"), next_node_attr.get("id"));
2660
2661        let next_node = Readability::get_next_node(&next_node, true);
2662        assert_eq!(true, next_node.is_some());
2663        let next_node = next_node.unwrap();
2664        let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2665        assert_eq!(Some("body-child-last"), next_node_attr.get("id"));
2666
2667        let next_node = Readability::get_next_node(&next_node, true);
2668        assert_eq!(None, next_node);
2669    }
2670
2671    #[test]
2672    fn test_remove_and_get_next() {
2673        let html_str = r#"
2674        <!DOCTYPE html>
2675        <html>
2676          <body>
2677            <div id="body-child-1">
2678              <p id="start">Foobar content</p>
2679              <div id="start-sib">
2680                <span>First child</span>
2681              </div>
2682            </div>
2683            <div id="body-child-2"><span>This will not be reached</p></div>
2684            <p id="body-child-last">Last element</p>
2685          </body>
2686        </html>
2687        "#;
2688        let doc = Readability::new(html_str);
2689        let node = doc.root_node.select_first("div#body-child-1").unwrap();
2690        let p_node = Readability::get_next_node(node.as_node(), false).unwrap();
2691        let next_node = Readability::remove_and_get_next(p_node);
2692        assert_eq!(true, next_node.is_some());
2693
2694        let next_node = next_node.unwrap();
2695        let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
2696        assert_eq!(Some("start-sib"), next_node_attr.get("id"));
2697
2698        // Confirm the p node no longer exists
2699        let p_node = doc.root_node.select_first("p#start");
2700        assert_eq!(true, p_node.is_err());
2701    }
2702
2703    #[test]
2704    fn test_has_ancestor_tag() {
2705        let html_str = r#"
2706        <!DOCTYPE html>
2707        <html>
2708          <body>
2709            <div>
2710              <main>
2711                <p>
2712                  <span>Target node</span>
2713                </p>
2714              </main>
2715            </div>
2716          </body>
2717        </html>
2718        "#;
2719        let doc = Readability::new(html_str);
2720        let target = doc.root_node.select_first("span").unwrap();
2721        assert_eq!(
2722            true,
2723            Readability::has_ancestor_tag(target.as_node(), "div", None, None)
2724        );
2725        assert_eq!(
2726            false,
2727            Readability::has_ancestor_tag(target.as_node(), "div", Some(1), None)
2728        );
2729        assert_eq!(
2730            false,
2731            Readability::has_ancestor_tag(
2732                target.as_node(),
2733                "div",
2734                Some(5),
2735                Some(|node_ref| {
2736                    let node_attrs = node_ref.as_element().unwrap().attributes.borrow();
2737                    node_attrs.contains("class")
2738                })
2739            )
2740        );
2741    }
2742
2743    #[test]
2744    fn test_is_element_without_content() {
2745        let html_str = r#"
2746        <!DOCTYPE html>
2747        <html>
2748          <body>
2749            <p>Node with content</p><!-- A comment node which is regarded as not having content -->
2750            <p id="empty"></p>
2751            <div id="contentful">
2752              <p>
2753                <span>Target node</span>
2754              </p>
2755            </div>
2756            <div id="no-content"><br><br><br><br><br><br><hr><hr><br></div>
2757          </body>
2758        </html>
2759        "#;
2760        let doc = Readability::new(html_str);
2761        let target = doc.root_node.select_first("p").unwrap();
2762        assert_eq!(
2763            false,
2764            Readability::is_element_without_content(target.as_node())
2765        );
2766
2767        let target = target.as_node().next_sibling().unwrap();
2768        assert_eq!(true, target.as_comment().is_some());
2769        assert_eq!(false, Readability::is_element_without_content(&target));
2770
2771        let mut target = doc.root_node.select_first("p#empty").unwrap();
2772        assert_eq!(
2773            true,
2774            Readability::is_element_without_content(target.as_node())
2775        );
2776
2777        target = doc.root_node.select_first("div#contentful").unwrap();
2778        assert_eq!(
2779            false,
2780            Readability::is_element_without_content(target.as_node())
2781        );
2782
2783        target = doc.root_node.select_first("div#no-content").unwrap();
2784        assert_eq!(
2785            true,
2786            Readability::is_element_without_content(target.as_node())
2787        );
2788    }
2789
2790    #[test]
2791    fn test_has_single_tag_inside_element() {
2792        let html_str = r#"
2793        <!DOCTYPE html>
2794        <html>
2795          <body>
2796            <p id="one">No element tags here</p>
2797            <p id="two"><span>The p tag has only one tag</span></p>
2798            <p id="three">
2799              <span>Target node</span>
2800              <span>
2801                The parent has multiple children
2802              </span>
2803            </p>
2804            <p id="four">
2805              The text here means this div doesn't have a single tag
2806              <span>Target node</span>
2807            </p>
2808          </body>
2809        </html>
2810        "#;
2811        let doc = Readability::new(html_str);
2812        let mut target = doc.root_node.select_first("p#one").unwrap();
2813        assert_eq!(
2814            false,
2815            Readability::has_single_tag_inside_element(target.as_node(), "span")
2816        );
2817
2818        target = doc.root_node.select_first("p#two").unwrap();
2819        assert_eq!(
2820            true,
2821            Readability::has_single_tag_inside_element(target.as_node(), "span")
2822        );
2823
2824        target = doc.root_node.select_first("p#three").unwrap();
2825        assert_eq!(
2826            false,
2827            Readability::has_single_tag_inside_element(target.as_node(), "span")
2828        );
2829
2830        target = doc.root_node.select_first("p#four").unwrap();
2831        assert_eq!(
2832            false,
2833            Readability::has_single_tag_inside_element(target.as_node(), "span")
2834        );
2835    }
2836
2837    #[test]
2838    fn test_get_inner_text() {
2839        let html_str = r#"
2840        <!DOCTYPE html>
2841        <html>
2842          <body>
2843            <p>The quick brown fox jumps       over the lazy dog</p>
2844           </body>
2845        </html>
2846         "#;
2847        let doc = Readability::new(html_str);
2848        let target = doc.root_node.select_first("p").unwrap();
2849        assert_eq!(
2850            49,
2851            Readability::get_inner_text(target.as_node(), Some(false)).len()
2852        );
2853        assert_eq!(
2854            43,
2855            Readability::get_inner_text(target.as_node(), None).len()
2856        );
2857    }
2858
2859    #[test]
2860    fn test_get_link_density() {
2861        let html_str = r#"
2862        <!DOCTYPE html>
2863        <html>
2864          <body>
2865            <p id="one">Zero link density</p>
2866            <p id="two">Link density > 0 <a href="https://www.rust-lang.org/">The Rust home page</a></p>
2867            <p id="three"><a></a><a></a></p>
2868           </body>
2869        </html>
2870         "#;
2871        let doc = Readability::new(html_str);
2872        let mut target = doc.root_node.select_first("p#one").unwrap();
2873        assert_eq!(0_f32, Readability::get_link_density(target.as_node()));
2874
2875        target = doc.root_node.select_first("p#two").unwrap();
2876        assert_eq!(
2877            18_f32 / 35_f32,
2878            Readability::get_link_density(target.as_node())
2879        );
2880
2881        target = doc.root_node.select_first("p#three").unwrap();
2882        assert_eq!(0_f32, Readability::get_link_density(target.as_node()));
2883    }
2884
2885    #[test]
2886    fn test_has_child_block_element() {
2887        let html_str = r#"
2888        <!DOCTYPE html>
2889        <html>
2890          <body>
2891            <p id="one">Has no <span>block level</span> elements</p>
2892            <p id="two">Link density > 0 <a href="https://www.rust-lang.org/">The Rust home page</a></p>
2893            <div id="three">
2894              <p>This is a block level element</p>
2895            </div>
2896           </body>
2897        </html>
2898        "#;
2899        let doc = Readability::new(html_str);
2900        let mut target = doc.root_node.select_first("p#one").unwrap();
2901        assert_eq!(
2902            false,
2903            Readability::has_child_block_element(target.as_node())
2904        );
2905
2906        target = doc.root_node.select_first("p#two").unwrap();
2907        assert_eq!(
2908            false,
2909            Readability::has_child_block_element(target.as_node())
2910        );
2911
2912        target = doc.root_node.select_first("div#three").unwrap();
2913        assert_eq!(true, Readability::has_child_block_element(target.as_node()));
2914    }
2915
2916    #[test]
2917    fn test_get_node_ancestors() {
2918        let html_str = r#"
2919        <!DOCTYPE html>
2920        <html>
2921          <body>
2922            <section>
2923              <div>
2924                <p><span></span></p>
2925              </div>
2926            </section>
2927          </body>
2928        </html>
2929        "#;
2930        let doc = Readability::new(html_str);
2931        let mut target = doc.root_node.select_first("span").unwrap();
2932        assert_eq!(
2933            1,
2934            Readability::get_node_ancestors(target.as_node(), None).len()
2935        );
2936        assert_eq!(
2937            3,
2938            Readability::get_node_ancestors(target.as_node(), Some(3)).len()
2939        );
2940        assert_eq!(
2941            5,
2942            Readability::get_node_ancestors(target.as_node(), Some(5)).len()
2943        );
2944        assert_eq!(
2945            6,
2946            Readability::get_node_ancestors(target.as_node(), Some(200)).len()
2947        );
2948
2949        target = doc.root_node.select_first("html").unwrap();
2950        assert_eq!(
2951            1,
2952            Readability::get_node_ancestors(target.as_node(), Some(4)).len()
2953        );
2954    }
2955
2956    #[test]
2957    fn test_get_class_weight() {
2958        let html_str = r#"
2959        <!DOCTYPE html>
2960        <html>
2961          <body>
2962            <div id="blog" class="main">
2963              <h1 class="hidden">Up next...</h1>
2964              <p id="story">A story is told...</p>
2965            </div>
2966            <div id="comments">
2967              Tell us what you think
2968              <p class="comment">Great read...</p>
2969            </div>
2970          </body>
2971        </html>
2972        "#;
2973        let doc = Readability::new(html_str);
2974        let mut target = doc.root_node.select_first("body").unwrap();
2975        assert_eq!(0, doc.get_class_weight(target.as_node()));
2976
2977        target = doc.root_node.select_first("div#blog").unwrap();
2978        assert_eq!(50, doc.get_class_weight(target.as_node()));
2979
2980        target = doc.root_node.select_first("h1.hidden").unwrap();
2981        assert_eq!(-25, doc.get_class_weight(target.as_node()));
2982
2983        target = doc.root_node.select_first("p#story").unwrap();
2984        assert_eq!(25, doc.get_class_weight(target.as_node()));
2985
2986        target = doc.root_node.select_first("div#comments").unwrap();
2987        assert_eq!(-25, doc.get_class_weight(target.as_node()));
2988
2989        target = doc.root_node.select_first("p.comment").unwrap();
2990        assert_eq!(-25, doc.get_class_weight(target.as_node()));
2991    }
2992
2993    #[test]
2994    fn test_initialize_node() {
2995        let html_str = r#"
2996        <!DOCTYPE html>
2997        <html>
2998          <body>
2999            <div id="blog" class="main">
3000              <h1 class="hidden">Up next...</h1>
3001              <p id="story">A story is told...</p>
3002            </div>
3003            <div id="comments">
3004              Tell us what you think
3005              <pre class="comment">Great read...</pre>
3006            </div>
3007          </body>
3008        </html>
3009        "#;
3010        let doc = Readability::new(html_str);
3011        let mut target = doc.root_node.select_first("div#blog").unwrap();
3012        let mut node = target.as_node().clone();
3013        doc.initialize_node(&mut node);
3014        let node_attrs = node.as_element().unwrap().attributes.borrow();
3015        assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE));
3016
3017        target = doc.root_node.select_first("h1.hidden").unwrap();
3018        let mut node = target.as_node().clone();
3019        doc.initialize_node(&mut node);
3020        let node_attrs = node.as_element().unwrap().attributes.borrow();
3021        assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE));
3022
3023        target = doc.root_node.select_first("p#story").unwrap();
3024        let mut node = target.as_node().clone();
3025        doc.initialize_node(&mut node);
3026        let node_attrs = node.as_element().unwrap().attributes.borrow();
3027        assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE));
3028
3029        target = doc.root_node.select_first("div#comments").unwrap();
3030        let mut node = target.as_node().clone();
3031        doc.initialize_node(&mut node);
3032        let node_attrs = node.as_element().unwrap().attributes.borrow();
3033        assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE));
3034
3035        target = doc.root_node.select_first("pre.comment").unwrap();
3036        let mut node = target.as_node().clone();
3037        doc.initialize_node(&mut node);
3038        let node_attrs = node.as_element().unwrap().attributes.borrow();
3039        assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE));
3040    }
3041
3042    #[test]
3043    fn test_get_row_and_column_count() {
3044        let html_str = r#"
3045        <!DOCTYPE html>
3046        <html>
3047          <body>
3048            <table>
3049              <tbody>
3050                <tr>
3051                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3052                </tr>
3053                <tr>
3054                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td rowspan="2">&nbsp;</td>
3055                </tr>
3056                <tr>
3057                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3058                </tr>
3059                <tr>
3060                  <td>&nbsp;</td><td colspan="2">&nbsp;</td><td>&nbsp;</td>
3061                </tr>
3062                <tr>
3063                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3064                </tr>
3065                <tr>
3066                  <td colspan="4">&nbsp;</td>
3067                </tr>
3068              </tbody>
3069            </table>
3070          </body>
3071        </html>
3072        "#;
3073        let doc = Readability::new(html_str);
3074        let target = doc.root_node.select_first("table").unwrap();
3075        assert_eq!(
3076            SizeInfo {
3077                rows: 6,
3078                columns: 4
3079            },
3080            Readability::get_row_and_column_count(target.as_node())
3081        );
3082    }
3083
3084    #[test]
3085    fn test_mark_data_tables() {
3086        let html_str = r#"
3087        <!DOCTYPE html>
3088        <html>
3089          <body>
3090            <table id="one"></table>
3091            <table width="100%" border="0" id="two">
3092              <tr valign="top">
3093                <td width="20%">Left</td>
3094                <td height="200" width="60%">Main</td>
3095                <td width="20%">Right</td>
3096              </tr>
3097            </table>
3098            <table id="three">
3099              <caption>Monthly savings</caption>
3100              <tr>
3101                <th>Month</th>
3102                <th>Savings</th>
3103              </tr>
3104              <tr>
3105                <td>January</td>
3106                <td>$100</td>
3107              </tr>
3108              <tr>
3109                <td>February</td>
3110                <td>$50</td>
3111              </tr>
3112            </table>
3113            <table id="four">
3114              <tbody>
3115                <tr>
3116                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3117                </tr>
3118                <tr>
3119                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td rowspan="2">&nbsp;</td>
3120                </tr>
3121                <tr>
3122                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3123                </tr>
3124                <tr>
3125                  <td>&nbsp;</td><td colspan="2">&nbsp;</td><td>&nbsp;</td>
3126                </tr>
3127                <tr>
3128                  <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3129                </tr>
3130                <tr>
3131                  <td colspan="4">&nbsp;</td>
3132                </tr>
3133              </tbody>
3134            </table>
3135            <table id="five">
3136              <table>
3137                <tbody>
3138                  <tr>
3139                    <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3140                  </tr>
3141                  <tr>
3142                    <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td rowspan="2">&nbsp;</td>
3143                  </tr>
3144                  <tr>
3145                    <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3146                  </tr>
3147                  <tr>
3148                    <td>&nbsp;</td><td colspan="2">&nbsp;</td><td>&nbsp;</td>
3149                  </tr>
3150                  <tr>
3151                    <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
3152                  </tr>
3153                  <tr>
3154                    <td colspan="4">&nbsp;</td>
3155                  </tr>
3156                </tbody>
3157              </table>
3158            </table>
3159          </body>
3160        </html>
3161        "#;
3162        let mut doc = Readability::new(html_str);
3163        doc.mark_data_tables();
3164        let target = doc.root_node.select_first("table#one").unwrap();
3165        let target_attr = target.attributes.borrow();
3166        assert_eq!(Some("false"), target_attr.get("readability-data-table"));
3167
3168        let target = doc.root_node.select_first("table#two").unwrap();
3169        let target_attr = target.attributes.borrow();
3170        assert_eq!(Some("false"), target_attr.get("readability-data-table"));
3171
3172        let target = doc.root_node.select_first("table#three").unwrap();
3173        let target_attr = target.attributes.borrow();
3174        assert_eq!(Some("true"), target_attr.get("readability-data-table"));
3175
3176        let target = doc.root_node.select_first("table#four").unwrap();
3177        let target_atrr = target.attributes.borrow();
3178        assert_eq!(Some("true"), target_atrr.get("readability-data-table"));
3179
3180        let target = doc.root_node.select_first("table#five").unwrap();
3181        let target_atrr = target.attributes.borrow();
3182        assert_eq!(Some("false"), target_atrr.get("readability-data-table"));
3183    }
3184
3185    #[test]
3186    fn test_fix_lazy_images() {
3187        let html_str = r#"
3188        <!DOCTYPE html>
3189        <html>
3190            <body>
3191                <img id="svg-uri" alt="Basketball" src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB2ZXJzaW9uPSIxLjEiIGlkPSJMYXllcl8xIiB4PSIwcHgiIHk9IjBweCIgdmlld0JveD0iMCAwIDEwMCAxMDAiIGVuYWJsZS1iYWNrZ3JvdW5kPSJuZXcgMCAwIDEwMCAxMDAiIHhtbDpzcGFjZT0icHJlc2VydmUiIGhlaWdodD0iMTAwcHgiIHdpZHRoPSIxMDBweCI+CjxnPgoJPHBhdGggZD0iTTI4LjEsMzYuNmM0LjYsMS45LDEyLjIsMS42LDIwLjksMS4xYzguOS0wLjQsMTktMC45LDI4LjksMC45YzYuMywxLjIsMTEuOSwzLjEsMTYuOCw2Yy0xLjUtMTIuMi03LjktMjMuNy0xOC42LTMxLjMgICBjLTQuOS0wLjItOS45LDAuMy0xNC44LDEuNEM0Ny44LDE3LjksMzYuMiwyNS42LDI4LjEsMzYuNnoiLz4KCTxwYXRoIGQ9Ik03MC4zLDkuOEM1Ny41LDMuNCw0Mi44LDMuNiwzMC41LDkuNWMtMyw2LTguNCwxOS42LTUuMywyNC45YzguNi0xMS43LDIwLjktMTkuOCwzNS4yLTIzLjFDNjMuNywxMC41LDY3LDEwLDcwLjMsOS44eiIvPgoJPHBhdGggZD0iTTE2LjUsNTEuM2MwLjYtMS43LDEuMi0zLjQsMi01LjFjLTMuOC0zLjQtNy41LTctMTEtMTAuOGMtMi4xLDYuMS0yLjgsMTIuNS0yLjMsMTguN0M5LjYsNTEuMSwxMy40LDUwLjIsMTYuNSw1MS4zeiIvPgoJPHBhdGggZD0iTTksMzEuNmMzLjUsMy45LDcuMiw3LjYsMTEuMSwxMS4xYzAuOC0xLjYsMS43LTMuMSwyLjYtNC42YzAuMS0wLjIsMC4zLTAuNCwwLjQtMC42Yy0yLjktMy4zLTMuMS05LjItMC42LTE3LjYgICBjMC44LTIuNywxLjgtNS4zLDIuNy03LjRjLTUuMiwzLjQtOS44LDgtMTMuMywxMy43QzEwLjgsMjcuOSw5LjgsMjkuNyw5LDMxLjZ6Ii8+Cgk8cGF0aCBkPSJNMTUuNCw1NC43Yy0yLjYtMS02LjEsMC43LTkuNywzLjRjMS4yLDYuNiwzLjksMTMsOCwxOC41QzEzLDY5LjMsMTMuNSw2MS44LDE1LjQsNTQuN3oiLz4KCTxwYXRoIGQ9Ik0zOS44LDU3LjZDNTQuMyw2Ni43LDcwLDczLDg2LjUsNzYuNGMwLjYtMC44LDEuMS0xLjYsMS43LTIuNWM0LjgtNy43LDctMTYuMyw2LjgtMjQuOGMtMTMuOC05LjMtMzEuMy04LjQtNDUuOC03LjcgICBjLTkuNSwwLjUtMTcuOCwwLjktMjMuMi0xLjdjLTAuMSwwLjEtMC4yLDAuMy0wLjMsMC40Yy0xLDEuNy0yLDMuNC0yLjksNS4xQzI4LjIsNDkuNywzMy44LDUzLjksMzkuOCw1Ny42eiIvPgoJPHBhdGggZD0iTTI2LjIsODguMmMzLjMsMiw2LjcsMy42LDEwLjIsNC43Yy0zLjUtNi4yLTYuMy0xMi42LTguOC0xOC41Yy0zLjEtNy4yLTUuOC0xMy41LTktMTcuMmMtMS45LDgtMiwxNi40LTAuMywyNC43ICAgQzIwLjYsODQuMiwyMy4yLDg2LjMsMjYuMiw4OC4yeiIvPgoJPHBhdGggZD0iTTMwLjksNzNjMi45LDYuOCw2LjEsMTQuNCwxMC41LDIxLjJjMTUuNiwzLDMyLTIuMyw0Mi42LTE0Lj
ZDNjcuNyw3Niw1Mi4yLDY5LjYsMzcuOSw2MC43QzMyLDU3LDI2LjUsNTMsMjEuMyw0OC42ICAgYy0wLjYsMS41LTEuMiwzLTEuNyw0LjZDMjQuMSw1Ny4xLDI3LjMsNjQuNSwzMC45LDczeiIvPgo8L2c+Cjwvc3ZnPg==" />
3192                <img id="normal-src" src="./foo.jpg">
3193                <img id="gif-uri" src="data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7" alt="star" width="16" height="16">
3194                <img id="gif-uri-remove-src" data-src="./not-real-gif.png" src="data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/" alt="star" width="16" height="16">
3195                <img id="lazy-loaded" class="lazy" src="placeholder.jpg" data-src="./720x640.jpg">
3196                <picture>
3197                    <source media="(min-width:650px)" srcset="img_pink_flowers.jpg">
3198                    <source media="(min-width:465px)" srcset="img_white_flower.jpg">
3199                    <img src="img_orange_flowers.jpg" alt="Flowers" style="width:auto;">
3200                </picture>
3201                <img id="no-lazy-class" src="https://image.url/" data-attrs="{&quot;src&quot;:&quot;https://other.url/1.png&quot;,&quot;alt&quot;:&quot;&quot;}"/>
3202            </body>
3203        </html>
3204        "#;
3205        let doc = Readability::new(html_str);
3206        let svg_uri = doc.root_node.select_first("#svg-uri").unwrap();
3207        let normal_src = doc.root_node.select_first("#normal-src").unwrap();
3208        let gif_uri = doc.root_node.select_first("#gif-uri").unwrap();
3209        let picture = doc.root_node.select_first("picture").unwrap();
3210        Readability::fix_lazy_images(&mut doc.root_node.clone());
3211        assert_eq!(svg_uri, doc.root_node.select_first("#svg-uri").unwrap());
3212        assert_eq!(
3213            normal_src,
3214            doc.root_node.select_first("#normal-src").unwrap()
3215        );
3216        assert_eq!(gif_uri, doc.root_node.select_first("#gif-uri").unwrap());
3217        assert_eq!(picture, doc.root_node.select_first("picture").unwrap());
3218
3219        let gif_uri_remove_src = doc.root_node.select_first("#gif-uri-remove-src").unwrap();
3220        let gif_uri_remove_src_attrs = gif_uri_remove_src.attributes.borrow();
3221        assert_eq!(
3222            gif_uri_remove_src_attrs.get("data-src"),
3223            gif_uri_remove_src_attrs.get("src")
3224        );
3225        let lazy_loaded = doc.root_node.select_first("#lazy-loaded").unwrap();
3226        let lazy_loaded_attrs = lazy_loaded.attributes.borrow();
3227        assert_eq!(
3228            lazy_loaded_attrs.get("data-src"),
3229            lazy_loaded_attrs.get("src")
3230        );
3231
3232        let no_lazy_class = doc.root_node.select_first("#no-lazy-class").unwrap();
3233        let no_lazy_class_attrs = no_lazy_class.attributes.borrow();
3234        assert_eq!(
3235            no_lazy_class_attrs.get("src").unwrap(),
3236            "https://image.url/"
3237        );
3238    }
3239
3240    #[test]
3241    fn test_clean_conditionally() {
3242        let html_str = r#"
3243        <!DOCTYPE html>
3244        <html>
3245            <body>
3246                <table id="data-table">
3247                    <caption>Monthly savings</caption>
3248                    <tr>
3249                        <th>Month</th>
3250                        <th>Savings</th>
3251                    </tr>
3252                    <tr>
3253                        <td>January</td>
3254                        <td>$100</td>
3255                    </tr>
3256                    <tr>
3257                        <td>February</td>
3258                        <td>$50</td>
3259                    </tr>
3260                </table>
3261                <table width="100%" border="0" id="display-table">
3262                    <tr valign="top">
3263                        <td width="20%">Left</td>
3264                        <td height="200" width="60%">Main</td>
3265                        <td width="20%">Right</td>
3266                    </tr>
3267                </table>
3268                <table width="100%" border="0" id="display-table-removed" class="comment">
3269                    <tr valign="top">
3270                        <td width="40%">One</td>
3271                        <td width="60%">Two</td>
3272                    </tr>
3273                </table>
3274                <div class="comment">
3275                    <p>The parent div will be deleted due to negative weight classes</p>
3276                </div>
3277                <div id="some-content">
3278                    The days of the week: Mon, Tue, Wed, Thur, Fri, Sat, Sun.
3279                    The months of the year: Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Oct, Nov, Dec.
3280                </div>
3281                <div id="embeds">
3282                    <iframe width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ"></iframe>
3283                </div>
3284                <div id="footer">
3285                    <p>Check out more articles</p>
3286                    <ul>
3287                        <li><img src="article.jpg"><p>Article 1</p></li>
3288                        <li><img src="article.jpg"><p>Article 2</p></li>
3289                        <li><img src="article.jpg"><p>Article 3</p></li>
3290                    </ul>
3291                </div>
3292            </body>
3293        </html>
3294        "#;
3295        let mut doc = Readability::new(html_str);
3296        let body = doc.root_node.select_first("body").unwrap();
3297        doc.mark_data_tables();
3298        doc.clean_conditionally(&mut body.as_node().clone(), "table");
3299        assert_eq!(true, doc.root_node.select_first("#data-table").is_ok());
3300        assert_eq!(false, doc.root_node.select_first("#display-table").is_ok());
3301        assert_eq!(
3302            false,
3303            doc.root_node.select_first("#display-table-removed").is_ok()
3304        );
3305        doc.clean_conditionally(&mut body.as_node().clone(), "div");
3306        assert_eq!(false, doc.root_node.select_first("div.comment").is_ok());
3307        assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok());
3308        assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok());
3309        assert_eq!(false, doc.root_node.select_first("div#footer").is_ok());
3310    }
3311
3312    #[test]
3313    fn test_clean() {
3314        let html_str = r#"
3315        <!DOCTYPE html>
3316        <html>
3317            <body>
3318                <pre>A Paperoni test</pre>
3319                <iframe width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ">
3320                </iframe>
3321                <iframe src="https://www.rust-lang.org/" name="rust_iframe" height="300px" width="100%" title="Rustlang Homepage">
3322                </iframe>
3323                <iframe src="https://crates.io/" name="crates_iframe" height="300px" width="100%" title="Crates.io Homepage">
3324                </iframe>
3325                <pre></pre>
3326            </body>
3327        </html>
3328        "#;
3329        let doc = Readability::new(html_str);
3330        Readability::clean(&mut doc.root_node.clone(), "pre");
3331        let pre_count = doc.root_node.select("pre").unwrap().count();
3332        assert_eq!(0, pre_count);
3333
3334        Readability::clean(&mut doc.root_node.clone(), "iframe");
3335        let iframe_count = doc.root_node.select("iframe").unwrap().count();
3336        assert_eq!(1, iframe_count);
3337        let iframe = doc.root_node.select_first("iframe").unwrap();
3338        let iframe_attrs = iframe.attributes.borrow();
3339        assert_eq!(
3340            Some("https://www.youtube.com/embed/dQw4w9WgXcQ"),
3341            iframe_attrs.get("src")
3342        );
3343    }
3344
3345    #[test]
3346    fn test_clean_headers() {
3347        let html_str = r#"
3348        <!DOCTYPE html>
3349        <html>
3350            <body>
3351                <h1 class="tags">#blog, #rust</h1>
3352                <h2>A blog in Rust</h2>
3353                <p>Foo bar baz quux</p>
3354                <h1 class="footer">Copyright info</h1>
3355            </body>
3356        </html>
3357        "#;
3358        let doc = Readability::new(html_str);
3359        let body = doc.root_node.select_first("body").unwrap();
3360        let h1_count = doc.root_node.select("h1").unwrap().count();
3361        let h2_count = doc.root_node.select("h2").unwrap().count();
3362        assert_eq!(2, h1_count);
3363        assert_eq!(1, h2_count);
3364        doc.clean_headers(&mut body.as_node().clone());
3365        let h1_count = doc.root_node.select("h1").unwrap().count();
3366        let h2_count = doc.root_node.select("h2").unwrap().count();
3367        assert_eq!(0, h1_count);
3368        assert_eq!(1, h2_count);
3369    }
3370
3371    #[test]
3372    fn test_clean_styles() {
3373        let html_str = r#"
3374        <!DOCTYPE html>
3375        <html>
3376            <body>
3377                <div style="color:red; padding: 10px" id="red">A red box</div>
3378                <div height="100px" style="color:blue; padding: 10px" id="blue">
3379                    A blue box
3380                </div>
3381                <svg width="100" height="100">
3382                    <circle cx="50" cy="50" r="40" fill="green" />
3383                </svg>
3384                <table width="100%" bgcolor="yellow">
3385                    <tr>
3386                        <th>Col 1</th>
3387                        <th>Col 2</th>
3388                    </tr>
3389                </table>
3390            </body>
3391        </html>
3392        "#;
3393        let doc = Readability::new(html_str);
3394        Readability::clean_styles(&mut doc.root_node.clone());
3395        let red_div = doc.root_node.select_first("#red").unwrap();
3396        let blue_div = doc.root_node.select_first("#blue").unwrap();
3397        let svg = doc.root_node.select_first("svg").unwrap();
3398        let table = doc.root_node.select_first("table").unwrap();
3399
3400        let red_div_attrs = red_div.attributes.borrow();
3401        let blue_div_attrs = blue_div.attributes.borrow();
3402        let svg_attrs = svg.attributes.borrow();
3403        let table_attrs = table.attributes.borrow();
3404
3405        assert_eq!(1, red_div_attrs.map.len());
3406        assert_eq!(false, red_div_attrs.contains("style"));
3407        assert_eq!(2, blue_div_attrs.map.len());
3408        assert_eq!(false, blue_div_attrs.contains("style"));
3409        assert_eq!(true, blue_div_attrs.contains("height"));
3410        assert_eq!(2, svg_attrs.map.len());
3411        assert_eq!(0, table_attrs.map.len());
3412    }
3413
3414    #[test]
3415    fn test_clean_matched_nodes() {
3416        let html_str = r#"
3417        <!DOCTYPE html>
3418        <html>
3419            <body>
3420                <p class="example">In Rust you can have 3 kinds of variables</p>
3421                <ul>
3422                    <li class="example">Immutable</li>
3423                    <li class="example">Mutable</li>
3424                    <li class="example">Constant</li>
3425                </ul>
3426                <p>Onto more tests</p>
3427            </body>
3428        </html>
3429        "#;
3430        let doc = Readability::new(html_str);
3431        let body = doc.root_node.select_first("body").unwrap();
3432        Readability::clean_matched_nodes(&mut body.as_node().clone(), |node_ref, match_str| {
3433            &node_ref.as_element().unwrap().name.local == "li" && match_str.contains("example")
3434        });
3435        let p_count = doc.root_node.select("p").unwrap().count();
3436        let li_count = doc.root_node.select("li").unwrap().count();
3437        assert_eq!(2, p_count);
3438        assert_eq!(0, li_count);
3439    }
3440
3441    #[test]
3442    fn test_prep_article() {
3443        let html_str = r#"
3444        <!DOCTYPE html>
3445        <html>
3446            <head>
3447                <title>A test HTML file</title>
3448            </head>
3449            <body>
3450                <h2>A test HTML file</h2>
3451                <div class="search">
3452                    Search for other posts
3453                    <input type="search" placeholder="Type here...">
3454                    <button id="search-btn">Search</button>
3455                </div>
3456                <aside>Some content aside</aside>
3457                <h1>A h1 tag</h1>
3458                <h1 class="banner">A h1 tag to be removed</h1>
3459                <table id="tbl-one"></table>
3460                <table width="100%" border="0" id="tbl-two">
3461                    <tr valign="top">
3462                        <td width="20%">Left</td>
3463                        <td height="200" width="60%">Main Content of the system</td>
3464                        <td width="20%">Right</td>
3465                    </tr>
3466                </table>
3467                <div style="color:red; padding: 10px" id="red">A red box</div>
3468                <div height="100px" style="color:blue; padding: 10px" id="blue">
3469                    A blue box
3470                </div>
3471                <svg width="100" height="100">
3472                    <circle cx="50" cy="50" r="40" fill="green" />
3473                </svg>
3474                <ul>
3475                    <li>one</li>
3476                    <li>two</li>
3477                    <li>three</li>
3478                </ul>
3479                <object data="obj.html" width="500" height="200"></object>
3480                <table id="tbl-three">
3481                    <caption>Monthly savings</caption>
3482                    <tr>
3483                        <th>Month</th>
3484                        <th>Savings</th>
3485                    </tr>
3486                    <tr>
3487                        <td>January</td>
3488                        <td>$100</td>
3489                    </tr>
3490                    <tr>
3491                        <td>February</td>
3492                        <td>$50</td>
3493                    </tr>
3494                </table>
3495                <iframe id="yt" width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ">
3496                </iframe>
3497                <div id="foo">
3498                    <form action="">
3499                        <fieldset>
3500                            <legend>Personal details:</legend>
3501                            <label for="fname">First name:</label>
3502                            <input type="text" id="fname" name="fname"><br><br>
3503                            <label for="lname">Last name:</label>
3504                            <input type="text" id="lname" name="lname"><br><br>
3505                        </fieldset>
3506                    </form>
3507                    <br>
3508                    <p id="p-link">
3509                        omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus.
3510                        <a href="\#">Rerum repellat totam quam nobis harum fuga consequatur</a>
3511                        corrupti?
3512                    </p>
3513                    <br>
3514                    <iframe src="https://www.rust-lang.org/" name="rust_iframe" height="300px" width="100%" title="Rustlang Homepage">
3515                    </iframe>
3516                </div>
3517                <iframe src="https://crates.io/" name="crates_iframe" height="300px" width="100%" title="Crates.io Homepage">
3518                </iframe>
3519                <table id="tbl-replace-p">
3520                    <tr valign="top">
3521                        <td width="20%" id="td-to-p"><span>One cell table. This is going to be replaced</span></td>
3522                    </tr>
3523                </table>
3524                <embed type="video/webm" src="video.mp4" width="400" height="300">
3525                <br>
3526                <embed type="image/jpg" src="foo.jpg" width="300" height="200">
3527                <div>
3528                    <form action="">
3529                        <div>
3530                            <label>Join our newsletter</label>
3531                            <input type="email" placeholder="Your email address">
3532                        </div>
3533                        <button>Sign up</button>
3534                    </form>
3535                </div>
3536                <div id="div-p">
3537                    <p class="share">Share this as a <a href="\#">Tweet</a></p>
3538                    <br>
3539                    <p id="share">
3540                        Lorem ipsum dolor, sit amet consectetur adipisicing elit. Minima quia numquam aperiam dolores ipsam, eos perferendis cupiditate adipisci perspiciatis
3541                        dolore, sunt, iusto nobis? Nulla molestiae id repellat quibusdam nobis quia. Lorem ipsum dolor sit amet consectetur, adipisicing elit. Voluptas
3542                        laudantium omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus. Rerum repellat totam quam nobis harum fuga consequatur
3543                        corrupti? Lorem ipsum dolor sit amet consectetur, adipisicing elit. Iure excepturi accusamus nemo voluptatibus laborum minus dicta blanditiis totam
3544                        aperiam velit amet cupiditate hic a molestias odio nam, fugiat facere iusto.
3545                    </p>
3546                </div>
3547                <table id="tbl-replace-div">
3548                    <tr>
3549                        <td id="td-to-div"><pre>One cell table. This is going to be replaced</pre></td>
3550                    </tr>
3551                </table>
3552                <footer>A Paperoni test</footer>
3553                <footer>Copyright 2020</footer>
3554            </body>
3555        </html>
3556        "#;
3557        let mut doc = Readability::new(html_str);
3558        doc.article_title = "A test HTML file".into();
3559        let body = doc.root_node.select_first("body").unwrap();
3560        doc.prep_article(&mut body.as_node().clone());
3561
3562        // Ensure tables were assigned their data table scores
3563        let table_node = doc.root_node.select_first("table").unwrap();
3564        let node_attr = table_node.attributes.borrow();
3565        assert_eq!(true, node_attr.get("readability-data-table").is_some());
3566
3567        let forms_and_fieldsets = doc.root_node.select("form, fieldset").unwrap();
3568        assert_eq!(0, forms_and_fieldsets.count());
3569
3570        let nodes = doc
3571            .root_node
3572            .select("h1, object, embed, footer, link, aside")
3573            .unwrap();
3574        assert_eq!(0, nodes.count());
3575
3576        assert_eq!(2, doc.root_node.select("p").unwrap().count());
3577        assert_eq!(true, doc.root_node.select_first("p.share").is_err());
3578        assert_eq!(true, doc.root_node.select_first("p#share").is_ok());
3579        assert_eq!(true, doc.root_node.select_first("p#td-to-p").is_ok());
3580
3581        let node = doc.root_node.select_first("h2");
3582        assert_eq!(true, node.is_err());
3583
3584        let nodes = doc
3585            .root_node
3586            .select("input, textarea, select, button")
3587            .unwrap();
3588        assert_eq!(0, nodes.count());
3589
3590        let nodes = doc.root_node.select("iframe").unwrap();
3591        assert_eq!(1, nodes.count());
3592        let node = doc.root_node.select_first("iframe#yt");
3593        assert_eq!(true, node.is_ok());
3594
3595        let nodes = doc.root_node.select("h1").unwrap();
3596        assert_eq!(0, nodes.count());
3597
3598        let nodes = doc
3599            .root_node
3600            .select("#tbl-one, #tbl-replace-p, #tbl-replace-div")
3601            .unwrap();
3602        assert_eq!(0, nodes.count());
3603
3604        let tables = doc.root_node.select("#tbl-two, #tbl-three").unwrap();
3605        assert_eq!(2, tables.count());
3606
3607        assert_eq!(true, doc.root_node.select_first("ul").is_ok());
3608
3609        assert_eq!(2, doc.root_node.select("div").unwrap().count());
3610        assert_eq!(true, doc.root_node.select_first("div#div-p").is_ok());
3611        assert_eq!(true, doc.root_node.select_first("div#td-to-div").is_ok());
3612
3613        assert_eq!(1, doc.root_node.select("br").unwrap().count());
3614        let node_ref = doc.root_node.select_first("br").unwrap();
3615        assert_eq!(
3616            "div",
3617            &node_ref
3618                .as_node()
3619                .following_siblings()
3620                .elements()
3621                .next()
3622                .unwrap()
3623                .name
3624                .local
3625        );
3626    }
3627
3628    #[test]
3629    fn test_get_article_title() {
3630        let mut html_str = r#"
3631        <!DOCTYPE html>
3632        <html>
3633            <head>
3634                <title>Porting Readability to Rust</title>
3635            </head>
3636            <body>
3637                <p></p>
3638            </body>
3639        </html>
3640        "#;
3641        let doc = Readability::new(html_str);
3642        assert_eq!("Porting Readability to Rust", doc.get_article_title());
3643
3644        html_str = r#"
3645        <!DOCTYPE html>
3646        <html>
3647            <head>
3648                <title>Crates.io: The Rust package repository</title>
3649            </head>
3650            <body>
3651                <p></p>
3652            </body>
3653        </html>
3654        "#;
3655        let doc = Readability::new(html_str);
3656        assert_eq!(
3657            "Crates.io: The Rust package repository",
3658            doc.get_article_title()
3659        );
3660
3661        html_str = r#"
3662        <!DOCTYPE html>
3663        <html>
3664            <head>
3665                <title>Crates.io: The Rust package repository</title>
3666            </head>
3667            <body>
3668                <h1>Crates.io: The Rust package repository</h1>
3669            </body>
3670        </html>
3671        "#;
3672        let doc = Readability::new(html_str);
3673        assert_eq!(
3674            "Crates.io: The Rust package repository",
3675            doc.get_article_title()
3676        );
3677
3678        html_str = r#"
3679        <!DOCTYPE html>
3680        <html>
3681            <head>
3682                <title>Crates.io: A package repository</title>
3683            </head>
3684            <body>
3685                <h1>Crates.io: A Rust package repository</h1>
3686            </body>
3687        </html>
3688        "#;
3689        let doc = Readability::new(html_str);
3690        assert_eq!("Crates.io: A package repository", doc.get_article_title());
3691
3692        html_str = r#"
3693        <!DOCTYPE html>
3694        <html>
3695            <head>
3696                <title>Foo developer \ Blog</title>
3697            </head>
3698            <body>
3699                <p></p>
3700            </body>
3701        </html>
3702        "#;
3703        let doc = Readability::new(html_str);
3704        assert_eq!("Foo developer \\ Blog", doc.get_article_title());
3705
3706        html_str = r#"
3707        <!DOCTYPE html>
3708        <html>
3709            <head>
3710                <title>Foo developer » Blog Post on Foo bar stuff</title>
3711            </head>
3712            <body>
3713                <p></p>
3714            </body>
3715        </html>
3716        "#;
3717        let doc = Readability::new(html_str);
3718        assert_eq!("Blog Post on Foo bar stuff", doc.get_article_title());
3719
3720        html_str = r#"
3721        <!DOCTYPE html>
3722        <html>
3723            <head>
3724                <title>Blog</title>
3725            </head>
3726            <body>
3727                <h1>Getting started with Rust</h1>
3728            </body>
3729        </html>
3730        "#;
3731        let doc = Readability::new(html_str);
3732        assert_eq!("Blog", doc.get_article_title());
3733    }
3734
3735    #[test]
3736    fn test_unescape_html_entities() {
3737        let mut input = "Therefore, 5 &gt; 3".to_string();
3738        Readability::unescape_html_entities(&mut input);
3739        assert_eq!("Therefore, 5 > 3", &input);
3740        input = "Logical AND (&amp;&amp;)".to_string();
3741        Readability::unescape_html_entities(&mut input);
3742        assert_eq!("Logical AND (&&)", &input);
3743        input = "&#117; &#43; &#101; = &#252;".to_string();
3744        Readability::unescape_html_entities(&mut input);
3745        assert_eq!("u + e = ü", input);
3746        input = "&#x0158;&#x016d;&#x0161;&#x0163;".to_string();
3747        Readability::unescape_html_entities(&mut input);
3748        assert_eq!("Řŭšţ", input);
3749    }
3750
3751    #[test]
3752    fn test_get_article_metadata() {
3753        let mut html_str = r#"
3754        <!DOCTYPE html>
3755        <html>
3756            <head>
3757                <meta charset="utf-8"/>
3758                <meta name="description" content="A post on how hard it is to work with text."/>
3759                <meta name="viewport" content="width=device-width"/>
3760                <title>Foo Coder / Blog on the difficulty of using utf-8</title>
3761                <meta name="author" content="Foo Coder"/>
3762            </head>
3763            <body></body>
3764        </html>
3765        "#;
3766        let doc = Readability::new(html_str);
3767        let mut result = MetaData::new();
3768        result.byline = Some("Foo Coder".to_string());
3769        result.excerpt = Some("A post on how hard it is to work with text.".to_string());
3770        result.title = "Blog on the difficulty of using utf-8".to_string();
3771        assert_eq!(result, doc.get_article_metadata());
3772
3773        html_str = r#"
3774        <!DOCTYPE html>
3775        <html>
3776            <head>
3777                <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
3778                <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" user-scalable="no" />
3779                <meta name="title" content="A Long Title" />
3780                <meta name="description" content="Foo bar baz bo&#223;" />
3781                <meta property="og:site_name" content="Blog Place" />
3782                <meta property="og:title" content="A Longer Title" />
3783                <meta property="og:description" content="Foo bar baz bo&#223;" />
3784                <meta name="author" content="F&#x00f6;o Coder" />
3785                <meta name="dc:creator" content="F&#x00f6;o Coder" />
3786                <meta name="twitter:card" content="summary_large_image" />
3787                <title>The Longest Title</title>
3788            </head>
3789        </html>
3790        "#;
3791        let doc = Readability::new(html_str);
3792        result = MetaData::new();
3793        result.byline = Some("Föo Coder".to_string());
3794        result.excerpt = Some("Foo bar baz boß".to_string());
3795        result.site_name = Some("Blog Place".to_string());
3796        result.title = "A Longer Title".to_string();
3797        assert_eq!(result, doc.get_article_metadata());
3798    }
3799
3800    #[test]
3801    fn test_fix_relative_uris() {
3802        let html_str = r##"
3803        <!DOCTYPE html>
3804        <html>
3805            <body>
3806                <h1><a href="../home.html">Go back</a></h1>
3807                <img id="ex-1" src="https://example.image.com/images/1.jpg" alt="Ex 1">
3808                <img id="ex-2" src="https://example.image.com/images/2.jpg" alt="Ex 2">
3809                <img id="ex-3" src="../images/2.jpg" alt="Ex 3">
3810                <img id="ex-4" src="./images/1.jpg" alt="Ex 4">
3811                <img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
3812                <img id="ex-6" src="/images/1.jpg" alt="Ex 6">
3813                <p><a href="#ex-1">First image</a></p>
3814            </body>
3815        </html>
3816        "##;
3817        let mut doc = Readability::new(html_str);
3818        doc.article_node = doc
3819            .root_node
3820            .select_first("body")
3821            .ok()
3822            .map(|node_ref| node_ref.as_node().clone());
3823        doc.fix_relative_uris("https://example.image.com/blog/");
3824
3825        let node = doc.root_node.select_first("img#ex-1").unwrap();
3826        let node_attrs = node.attributes.borrow();
3827        assert_eq!(
3828            Some("https://example.image.com/images/1.jpg"),
3829            node_attrs.get("src")
3830        );
3831
3832        let node = doc.root_node.select_first("img#ex-2").unwrap();
3833        let node_attrs = node.attributes.borrow();
3834        assert_eq!(
3835            Some("https://example.image.com/images/2.jpg"),
3836            node_attrs.get("src")
3837        );
3838
3839        let node = doc.root_node.select_first("img#ex-3").unwrap();
3840        let node_attrs = node.attributes.borrow();
3841        assert_eq!(
3842            Some("https://example.image.com/images/2.jpg"),
3843            node_attrs.get("src")
3844        );
3845
3846        let node = doc.root_node.select_first("img#ex-4").unwrap();
3847        let node_attrs = node.attributes.borrow();
3848        assert_eq!(
3849            Some("https://example.image.com/blog/images/1.jpg"),
3850            node_attrs.get("src")
3851        );
3852
3853        let node = doc.root_node.select_first("img#ex-5").unwrap();
3854        let node_attrs = node.attributes.borrow();
3855        assert_eq!(
3856            Some("https://images.com/images/1.jpg"),
3857            node_attrs.get("src")
3858        );
3859
3860        let node = doc.root_node.select_first("img#ex-6").unwrap();
3861        let node_attrs = node.attributes.borrow();
3862        assert_eq!(
3863            Some("https://example.image.com/images/1.jpg"),
3864            node_attrs.get("src")
3865        );
3866
3867        let node = doc.root_node.select_first("p a").unwrap();
3868        let node_attrs = node.attributes.borrow();
3869        assert_eq!(Some("#ex-1"), node_attrs.get("href"));
3870
3871        let node = doc.root_node.select_first("h1 a").unwrap();
3872        let node_attrs = node.attributes.borrow();
3873        assert_eq!(
3874            Some("https://example.image.com/home.html"),
3875            node_attrs.get("href")
3876        );
3877    }
3878
3879    #[test]
3880    fn test_clean_classes() {
3881        // TODO: This test will later be edited to ensure it checks to only remove certain classes
3882        let html_str = r#"
3883        <!DOCTYPE html>
3884        <html>
3885            <body>
3886                <p class="a b c d">One</p>
3887                <p class="b c d e">Two</p>
3888                <div class="a b c div">Three</div>
3889                <div class="b c d e">Four</div>
3890                <ul class="a b c d">
3891                    <li class="a b c d">One</li>
3892                    <li class="b c d e">Two</li>
3893                    <li class="b c d e">Three</li>
3894                </ul>
3895            </body>
3896        </html>
3897        "#;
3898        let mut doc = Readability::new(html_str);
3899        doc.article_node = doc
3900            .root_node
3901            .select_first("body")
3902            .ok()
3903            .map(|node_ref| node_ref.as_node().clone());
3904        doc.clean_classes();
3905
3906        assert_eq!(
3907            true,
3908            doc.root_node
3909                .inclusive_descendants()
3910                .elements()
3911                .all(|node_elem| {
3912                    let node_attrs = node_elem.attributes.borrow();
3913                    !node_attrs.contains("class")
3914                })
3915        );
3916    }
3917
3918    #[test]
3919    fn test_clean_readability_attrs() {
3920        let html_str = r#"
3921        <!DOCTYPE html>
3922        <html>
3923            <body>
3924                <div readability-score="0.921487">
3925                    <p readability-score="0.8102">Welcome to this awesome blog post. Only good content is here. No spam.</p>
3926                    <p readability-score="0.6004">Let's look at some statistics</p>
3927                    <table readability-score="0.719275" readability-data-table="true">
3928                        <caption>Monthly savings</caption>
3929                        <tr>
3930                            <th>Month</th>
3931                            <th>Savings</th>
3932                        </tr>
3933                        <tr>
3934                            <td>January</td>
3935                            <td>$100</td>
3936                        </tr>
3937                        <tr>
3938                            <td>February</td>
3939                            <td>$50</td>
3940                        </tr>
3941                    </table>
3942                </div>
3943            </body>
3944        </html>
3945        "#;
3946        let mut doc = Readability::new(html_str);
3947        doc.article_node = doc
3948            .root_node
3949            .select_first("body")
3950            .ok()
3951            .map(|node_ref| node_ref.as_node().clone());
3952        doc.clean_readability_attrs();
3953        assert_eq!(
3954            true,
3955            doc.root_node
3956                .inclusive_descendants()
3957                .elements()
3958                .all(|node| {
3959                    let node_attrs = node.attributes.borrow();
3960                    node_attrs.map.len() == 0
3961                })
3962        );
3963    }
3964
3965    #[test]
3966    fn test_post_process_content() {
3967        let html_str = r##"
3968        <!DOCTYPE html>
3969        <html>
3970            <body>
3971                <p class="a b c d">One</p>
3972                <p class="b c d e">Two</p>
3973                <div class="a b c div">Three</div>
3974                <div class="b c d e">
3975                    <img src="./img.jpg" class="lazy">
3976                </div>
3977                <ul class="a b c d">
3978                    <li class="a b c d"><a href="#home">One</a></li>
3979                    <li class="b c d e">Two</li>
3980                    <li class="b c d e">Three</li>
3981                </ul>
3982            </body>
3983        </html>
3984        "##;
3985        let mut doc = Readability::new(html_str);
3986        doc.article_node = doc
3987            .root_node
3988            .select_first("body")
3989            .ok()
3990            .map(|node_ref| node_ref.as_node().clone());
3991        doc.post_process_content("https://foo.blog/post/");
3992        let has_class_attr = doc
3993            .root_node
3994            .inclusive_descendants()
3995            .elements()
3996            .any(|node_ref| {
3997                let attrs = node_ref.attributes.borrow();
3998                attrs.contains("class")
3999            });
4000        assert_eq!(false, has_class_attr);
4001        let a_node = doc.root_node.select_first("a").unwrap();
4002        let a_node_attrs = a_node.attributes.borrow();
4003        assert_eq!(Some("#home"), a_node_attrs.get("href"));
4004        let img_node = doc.root_node.select_first("img").unwrap();
4005        let img_attrs = img_node.attributes.borrow();
4006        assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
4007    }
4008
4009    #[test]
4010    fn test_flag_is_active() {
4011        let html_str = r"
4012        <!DOCTYPE html>
4013        <html>
4014            <body>
4015            </body>
4016        </html>
4017        ";
4018        let doc = Readability::new(html_str);
4019        assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
4020        assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
4021        assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
4022    }
4023
4024    #[test]
4025    fn test_remove_flag() {
4026        let html_str = r"
4027        <!DOCTYPE html>
4028        <html>
4029            <body>
4030            </body>
4031        </html>
4032        ";
4033        let mut doc = Readability::new(html_str);
4034        assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
4035        doc.remove_flag(FLAG_CLEAN_CONDITIONALLY);
4036        assert_eq!(false, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
4037        assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
4038        doc.remove_flag(FLAG_WEIGHT_CLASSES);
4039        assert_eq!(false, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
4040        assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
4041    }
4042}