readable_readability/
lib.rs

1use std::cmp;
2use std::iter;
3use std::f32;
4use std::fmt;
5
6use regex::Regex;
7use html5ever::{QualName, local_name, namespace_url, ns};
8use kuchiki::{NodeRef, NodeDataRef, NodeData, ElementData, Attributes};
9use kuchiki::traits::TendrilSink;
10use kuchiki::iter::NodeIterator;
11use lazy_static::lazy_static;
12use log::trace;
13use url::Url;
14
15pub use metadata::Metadata;
16use node_cache::NodeCache;
17
18mod metadata;
19mod node_cache;
20
21// TODO: add examples.
22// TODO: document it!
23
24type ElemRef = NodeDataRef<ElementData>;
25
26trait NodeRefExt {
27    fn node_ref(&self) -> &NodeRef;
28
29    fn is(&self, name: QualName) -> bool {
30        self.node_ref().as_element().map_or(false, |e| e.name == name)
31    }
32
33    fn replace<N: NodeRefExt>(&self, node: &N) {
34        self.node_ref().insert_before(node.node_ref().clone());
35        self.remove();
36    }
37
38    fn remove(&self) {
39        self.node_ref().detach();
40    }
41
42    fn rename(&self, name: QualName) -> NodeRef {
43        let node = self.node_ref();
44
45        if let Some(elem) = node.as_element() {
46            // I'd like find a way to do this without clone(), but 
47            // I'm not sure how because BTreeMap doesn't have drain()
48            let attributes = elem.attributes.borrow();
49            let replacement = NodeRef::new_element(name, attributes.map.clone());
50
51            for child in node.children() {
52                replacement.append(child);
53            }
54
55            node.replace(&replacement);
56
57            replacement
58        } else {
59            node.clone()
60        }
61    }
62
63    fn previous_element(&self) -> Option<ElemRef> {
64        self.node_ref().preceding_siblings().elements().next()
65    }
66}
67
68impl NodeRefExt for NodeRef {
69    #[inline]
70    fn node_ref(&self) -> &NodeRef {
71        self
72    }
73}
74
75impl NodeRefExt for ElemRef {
76    #[inline]
77    fn node_ref(&self) -> &NodeRef {
78        self.as_node()
79    }
80
81    fn is(&self, name: QualName) -> bool {
82        self.name == name
83    }
84}
85
86lazy_static! {
87    static ref UNLIKELY_CANDIDATE: Regex = Regex::new(r"(?xi)
88        -ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|
89        modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|
90        ad-break|agegate|pagination|pager|popup|yom-remote
91    ").unwrap();
92
93    static ref MAYBE_CANDIDATE: Regex = Regex::new(r"(?xi)
94        and|article|body|column|main|shadow
95    ").unwrap();
96
97    static ref POSITIVE: Regex = Regex::new(r"(?xi)
98        article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story
99    ").unwrap();
100
101    static ref NEGATIVE: Regex = Regex::new(r"(?xi)
102        -ad-|hidden|^hid$|\shid$|\shid\s|^hid\s|banner|combx|comment|com-|contact|foot|footer|footnote|
103        gdpr|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|
104        sponsor|shopping|tags|tool|widget
105    ").unwrap();
106
107    // TODO: restore byline parsing.
108    //static ref BYLINE: Regex = Regex::new(r"(?xi)
109        //byline|author|dateline|writtenby|p-author
110    //").unwrap();
111
112    static ref VIDEO: Regex = Regex::new(r"(?xi)
113        //(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com
114    ").unwrap();
115
116    static ref PROTOCOL: Regex = Regex::new(r"^\w+:").unwrap();
117}
118
119macro_rules! tag {
120    ($name:tt) => { 
121        QualName {
122            prefix: None,
123            ns: ns!(html),
124            local: local_name!($name),
125        }
126    };
127}
128
129macro_rules! attrib {
130    ($name:tt) => { local_name!($name) };
131}
132
133fn extract_byline(elem: &ElemRef) -> Option<String> {
134    let attributes = elem.attributes.borrow();
135
136    let rel = attributes.get(attrib!("rel")).unwrap_or("");
137    //let classes = attributes.get(attrib!("class")).unwrap_or("");
138    //let id = attributes.get(attrib!("id")).unwrap_or("");
139
140    // TODO: uncomment it after traverse repearing.
141    //let is_byline = rel == "author" || BYLINE.is_match(classes) || BYLINE.is_match(id);
142    let is_byline = rel == "author";
143
144    if is_byline {
145        // TODO: traverse subtrees manually to preserve spaces?
146        let text = elem.text_contents();
147        let byline = text.trim();
148
149        #[allow(clippy::len_zero)]
150        if 0 < byline.len() && byline.len() < 100 {
151            Some(byline.to_string())
152        } else {
153            None
154        }
155    } else {
156        None
157    }
158}
159
160fn is_unlikely_candidate(elem: &ElemRef) -> bool {
161    match elem.name {
162        tag!("a") | tag!("body") => return false,
163        _ => {}
164    }
165
166    let attributes = elem.attributes.borrow();
167
168    // Ok, check classes.
169    let classes = attributes.get(attrib!("class")).unwrap_or("");
170    let id = attributes.get(attrib!("id")).unwrap_or("");
171
172    (UNLIKELY_CANDIDATE.is_match(classes) || UNLIKELY_CANDIDATE.is_match(id)) &&
173        !(MAYBE_CANDIDATE.is_match(classes) || MAYBE_CANDIDATE.is_match(id))
174}
175
176fn transform_div(div: &ElemRef) {
177    debug_assert_eq!(div.name, tag!("div"));
178
179    let node = div.as_node();
180
181    if has_single_p(node) {
182        trace!("    => replacing <{}> with inner <p>", format_tag(node));
183        let p = node.children().elements().next().unwrap();
184        node.replace(&p);
185    } else if !has_block_elem(node) {
186        trace!("    => altering <{}> to <p>", format_tag(node));
187        node.rename(tag!("p"));
188    } else {
189        // TODO: move to upper level.
190        for child in node.children() {
191            if let Some(text) = child.as_text() {
192                trace!("    => moving text node in <{}> with <p>", format_tag(node));
193                let text = text.borrow();
194
195                if text.trim().is_empty() {
196                    continue;
197                }
198
199                let p = NodeRef::new_element(tag!("p"), iter::empty());
200                child.replace(&p);
201                p.append(child.clone());
202            }
203        }
204    }
205}
206
207fn has_single_p(node: &NodeRef) -> bool {
208    let mut child_it = node.children().elements();
209    let child = child_it.next();
210
211    if child.is_none() || child_it.next().is_some() {
212        return false;
213    }
214
215    let child = child.unwrap();
216    if !child.is(tag!("p")) {
217        return false;
218    }
219
220    node.children().text_nodes().all(|t| t.borrow().trim().is_empty())
221}
222
223fn has_block_elem(node: &NodeRef) -> bool {
224    node.descendants().elements().any(|elem| matches!{
225        elem.name,
226        tag!("a") | tag!("blockquote") | tag!("dl") | tag!("div") | tag!("img") | tag!("ol") |
227        tag!("p") | tag!("pre") | tag!("table") | tag!("ul") | tag!("select")
228    })
229}
230
231fn count_chars(text: &str) -> (u32, u32) {
232    let mut char_cnt = 0;
233    let mut comma_cnt = 0;
234
235    // TODO: what about graphemes?
236    let mut iter = text.trim().chars().peekable();
237
238    while let Some(ch) = iter.next() {
239        if ch.is_whitespace() {
240            if !iter.peek().unwrap().is_whitespace() {
241                char_cnt += 1;
242            }
243        } else if ch == ',' {
244            if iter.peek().map_or(true, |&c| c != ',') {
245                char_cnt += 1;
246                comma_cnt += 1;
247            }
248        } else {
249            char_cnt += 1;
250        }
251    }
252
253    (char_cnt, comma_cnt)
254}
255
256fn is_tag_to_score(tag: &QualName) -> bool {
257    matches!{
258        *tag,
259        tag!("section") | tag!("p") | tag!("td") | tag!("pre") |
260        tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6")
261    }
262}
263
264fn tag_score(tag: &QualName) -> f32 {
265    match *tag {
266        tag!("section") => 15.,
267        tag!("div") => 5.,
268        tag!("pre") | tag!("td") | tag!("blockquote") => 3.,
269        tag!("address") | tag!("form") => -3.,
270        tag!("dl") | tag!("dt") | tag!("dd") => -3.,
271        tag!("li") | tag!("ol") | tag!("ul") => -3.,
272        tag!("body") => -5.,
273        tag!("h1") | tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") => -5.,
274        tag!("th") => -5.,
275        _ => 0.
276    }
277}
278
279fn class_score(elem: &ElemRef) -> f32 {
280    let attributes = elem.attributes.borrow();
281    let mut score = 0.;
282
283    if let Some(classes) = attributes.get(attrib!("class")) {
284        if POSITIVE.is_match(classes) { score += 25.; }
285        if NEGATIVE.is_match(classes) { score -= 25.; }
286    }
287
288    if let Some(id) = attributes.get(attrib!("id")) {
289        if POSITIVE.is_match(id) { score += 25.; }
290        if NEGATIVE.is_match(id) { score -= 25.; }
291    }
292
293    score
294}
295
296fn is_stuffed(elem: &ElemRef, info: &NodeInfo) -> bool {
297    match elem.name {
298        // TODO: remove <object>, <embed> etc.
299        tag!("h1") | tag!("footer") | tag!("button") => false,
300
301        tag!("div") | tag!("section") | tag!("header") |
302        tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") => {
303            if info.text_len == 0 {
304                let children_count = elem.as_node().children().count() as u32;
305
306                if children_count == 0 || children_count == info.br_count + info.hr_count {
307                    return false;
308                }
309            }
310
311            true
312        },
313
314        tag!("thead") | tag!("tbody") | tag!("th") | tag!("tr") | tag!("td") =>
315            // TODO: add <video> and <audio> counters to the sum.
316            info.text_len > 0 || info.img_count + info.embed_count + info.iframe_count > 0,
317
318        tag!("p") | tag!("pre") | tag!("blockquote") =>
319            // TODO: add <video> and <audio> counters to the sum.
320            info.img_count + info.embed_count + info.iframe_count > 0 ||
321            // TODO: calculate length without construction the string.
322                !elem.text_contents().trim().is_empty(),
323
324        _ => true
325    }
326}
327
328fn clean_attributes(attributes: &mut Attributes) {
329    // TODO: what about removing all except for `alt`, `href`, `src` and `title`?
330    attributes.remove(attrib!("style"));
331}
332
333fn fix_relative_urls(attributes: &mut Attributes, base_url: &Url) {
334    fn fix(url: &mut String, base: &Url) {
335        // Ignore absolute and hash urls.
336        if url.is_empty() || PROTOCOL.is_match(url) || url.starts_with('#') {
337            return;
338        }
339
340        if let Ok(resolved) = base.join(url) {
341            *url = resolved.into();
342        }
343    }
344
345    if let Some(attr) = attributes.get_mut(attrib!("href")) {
346        fix(attr, base_url);
347    }
348
349    if let Some(attr) = attributes.get_mut(attrib!("src")) {
350        fix(attr, base_url);
351    }
352}
353
354fn is_acceptable_top_level(tag: &QualName) -> bool {
355    matches!(*tag, tag!("div") | tag!("article") | tag!("section") | tag!("p"))
356}
357
358#[derive(Default, PartialEq, Clone)]
359struct NodeInfo {
360    content_score: f32,
361    text_len: u32,
362    link_len: u32,
363    commas: u32,
364    is_candidate: bool,
365    is_shabby: bool,
366
367    p_count: u32,
368    img_count: u32,
369    li_count: u32,
370    input_count: u32,
371    embed_count: u32,
372    iframe_count: u32,
373    br_count: u32,
374    hr_count: u32,
375}
376
377impl fmt::Debug for NodeInfo {
378    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
379        let mut s = fmt.debug_struct("");
380
381        if self.content_score > 0. { s.field("content_score", &self.content_score); }
382        if self.text_len > 0 { s.field("text", &self.text_len); }
383        if self.link_len > 0 { s.field("link", &self.link_len); }
384        if self.commas > 0 { s.field("commas", &self.commas); }
385        if self.is_candidate { s.field("candidate", &self.is_candidate); }
386        if self.is_shabby { s.field("shabby", &self.is_shabby); }
387        if self.p_count > 0 { s.field("p", &self.p_count); }
388        if self.img_count > 0 { s.field("img", &self.img_count); }
389        if self.li_count > 0 { s.field("li", &self.li_count); }
390        if self.input_count > 0 { s.field("input", &self.input_count); }
391        if self.embed_count > 0 { s.field("embed", &self.embed_count); }
392        if self.iframe_count > 0 { s.field("iframe", &self.iframe_count); }
393        if self.br_count > 0 { s.field("br", &self.br_count); }
394        if self.hr_count > 0 { s.field("hr", &self.hr_count); }
395
396        s.finish()
397    }
398}
399
400pub struct Readability {
401    info: NodeCache<NodeInfo>,
402    candidates: Vec<ElemRef>,
403    byline: Option<String>,
404
405    strip_unlikelys: bool,
406    weight_classes: bool,
407    clean_conditionally: bool,
408    clean_attributes: bool,
409    base_url: Option<Url>
410}
411
412impl Default for Readability {
413    fn default() -> Self {
414        Self::new()
415    }
416}
417
418impl Readability {
419    pub fn new() -> Readability {
420        Readability {
421            info: NodeCache::new(),
422            candidates: Vec::new(),
423            byline: None,
424
425            strip_unlikelys: true,
426            weight_classes: true,
427            clean_conditionally: true,
428            clean_attributes: true,
429            base_url: None,
430        }
431    }
432
433    pub fn strip_unlikelys(&mut self, enabled: bool) -> &mut Self {
434        self.strip_unlikelys = enabled;
435        self
436    }
437
438    pub fn weight_classes(&mut self, enabled: bool) -> &mut Self {
439        self.weight_classes = enabled;
440        self
441    }
442
443    pub fn clean_conditionally(&mut self, enabled: bool) -> &mut Self {
444        self.clean_conditionally = enabled;
445        self
446    }
447
448    pub fn clean_attributes(&mut self, enabled: bool) -> &mut Self {
449        self.clean_attributes = enabled;
450        self
451    }
452
453    pub fn base_url<U>(&mut self, url: U) -> &mut Self
454        where U: Into<Option<Url>>
455    {
456        self.base_url = url.into();
457        self
458    }
459
460    pub fn parse(&mut self, html: &str) -> (NodeRef, Metadata) {
461        let top_level = kuchiki::parse_html().one(html);
462
463        let metadata = metadata::extract(&top_level);
464
465        let top_level = top_level.select("html > body").unwrap().next()
466            .map_or(top_level, |b| b.as_node().clone());
467
468        top_level.detach();
469
470        // TODO: retry with fewer restrictions.
471        (self.readify(top_level), metadata)
472    }
473
474    fn readify(&mut self, top_level: NodeRef) -> NodeRef {
475        let mut current = top_level.clone();
476        let mut bubbling = false;
477
478        // TODO: refactor this shitty traverse!
479        // TODO: ignore or remove empty text nodes.
480        loop {
481            if !bubbling {
482                self.on_capturing(&current);
483
484                if let Some(child) = current.first_child() {
485                    current = child;
486                    continue;
487                }
488
489                bubbling = true;
490            }
491
492            let (ncurrent, nbubbling) = if let Some(next) = current.next_sibling() {
493                (next, false)
494            } else if let Some(parent) = current.parent() {
495                (parent, bubbling)
496            } else {
497                if bubbling { self.on_bubbling(&current); }
498                break;
499            };
500
501            if bubbling { self.on_bubbling(&current); }
502
503            bubbling = nbubbling;
504            current = ncurrent;
505        }
506
507        if self.candidates.is_empty() {
508            return top_level;
509        }
510
511        self.score_candidates();
512
513        if self.candidates.is_empty() {
514            return top_level;
515        }
516
517        let top_candidate = self.find_common_candidate();
518        self.correct_candidate(top_candidate)
519
520        // TODO: combine top candidates together.
521    }
522
523    // Capturing stage: remove unlikely candidates, unpack divs etc.
524    fn on_capturing(&mut self, node: &NodeRef) {
525        if node.as_element().is_some() {
526            trace!("<{}>", format_tag(node));
527        }
528
529        for child in node.children() {
530            let remove = match *child.data() {
531                NodeData::Comment(_) | NodeData::DocumentFragment => true,
532                NodeData::Text(ref data) => data.borrow().trim().is_empty(),
533                NodeData::Element(ref elem) => {
534                    matches!(elem.name, tag!("script") | tag!("style") | tag!("noscript"))
535                },
536                _ => false
537            };
538
539            if remove {
540                if child.as_element().is_some() {
541                    trace!("    => removing <{}> as useless element", format_tag(&child));
542                }
543
544                child.remove();
545            }
546
547            if let Some(child) = child.into_element_ref() {
548                // TODO: mozilla/readability takes into account only first occurrence.
549                //if self.byline.is_none() {
550                    if let Some(byline) = extract_byline(&child) {
551                        self.byline = Some(byline);
552                        trace!("    => removing <{}> as byline container", format_tag(&child));
553                        child.remove();
554
555                        continue;
556                    }
557                //}
558
559                if self.strip_unlikelys && is_unlikely_candidate(&child) {
560                    trace!("    => removing <{}> as unlikely candidate", format_tag(&child));
561                    child.remove();
562                } else if child.is(tag!("div")) {
563                    transform_div(&child);
564                } else if child.is(tag!("font")) {
565                    trace!("    => altering <{}> to <div>", format_tag(&child));
566                    child.rename(tag!("span"));
567                }
568            }
569        }
570    }
571
572    // Bubbling stage: collect info based on children and score elements.
573    fn on_bubbling(&mut self, node: &NodeRef) {
574        match *node.data() {
575            NodeData::Text(ref data) => {
576                let (char_cnt, comma_cnt) = count_chars(&data.borrow()[..]);
577
578                let parent = node.parent().unwrap();
579
580                let parent_info = self.info.get_or_create(&parent);
581                parent_info.text_len += char_cnt;
582                parent_info.commas += comma_cnt;
583            },
584            NodeData::Element(ElementData { ref name, ref attributes, .. }) => {
585                trace!("</{}> {}", format_tag(node), format_info(self.info.get(node)));
586
587                // FIXME: don't propagate info of bad nodes.
588                self.propagate_info(node);
589
590                if is_tag_to_score(name) {
591                    self.score_node(node);
592                }
593
594                let elem = node.clone().into_element_ref().unwrap();
595
596                // TODO: don't create info if it's not necessary.
597                if !is_stuffed(&elem, self.info.get_or_create(node)) {
598                    node.remove();
599                    trace!("    => removed (it's not stuffed)");
600
601                    return;
602                }
603
604                if self.clean_conditionally && !self.is_conditionally_acceptable(&elem) {
605                    if let Some(info) = self.info.get(node) {
606                        info.is_candidate = false;
607                    }
608
609                    if let Some(info) = node.parent().map(|parent| self.info.get_or_create(&parent)) {
610                        info.is_shabby = true;
611                    }
612
613                    node.remove();
614                    trace!("    => removed (it's conditionally unacceptable)");
615
616                    return;
617                }
618
619                // Remove <br> preceding <p>.
620                if node.is(tag!("p")) {
621                    if let Some(elem) = node.previous_element() {
622                        if elem.is(tag!("br")) {
623                            elem.remove();
624                        }
625                    }
626                }
627
628                let mut attributes = attributes.borrow_mut();
629
630                if self.clean_attributes {
631                    clean_attributes(&mut attributes);
632                }
633
634                if let Some(ref base_url) = self.base_url {
635                    if *name == tag!("a") || *name == tag!("img") {
636                        fix_relative_urls(&mut attributes, base_url);
637                    }
638                }
639            },
640            _ => {}
641        };
642    }
643
644    fn propagate_info(&mut self, node: &NodeRef) {
645        let parent = match node.parent() {
646            Some(parent) => parent,
647            None => return
648        };
649
650        let is_a = node.is(tag!("a"));
651        // TODO: avoid extra cloning.
652        let info = self.info.get_or_create(node).clone();
653
654        let parent_info = self.info.get_or_create(&parent);
655
656        if let Some(elem) = node.as_element() {
657            match elem.name {
658                tag!("p") => parent_info.p_count += 1,
659                tag!("img") => parent_info.img_count += 1,
660                tag!("li") => parent_info.li_count += 1,
661                tag!("input") => parent_info.input_count += 1,
662                tag!("br") => parent_info.br_count += 1,
663                tag!("hr") => parent_info.hr_count += 1,
664                tag!("iframe") => parent_info.iframe_count += 1,
665                tag!("embed") => {
666                    let attribs = elem.attributes.borrow();
667                    let src = attribs.get(attrib!("src")).unwrap_or("");
668
669                    if !VIDEO.is_match(src) {
670                        parent_info.embed_count += 1;
671                    }
672                },
673                _ => {}
674            };
675        }
676
677        parent_info.link_len += if is_a { info.text_len } else { info.link_len };
678        parent_info.text_len += info.text_len;
679        parent_info.commas += info.commas;
680        parent_info.p_count += info.p_count;
681        parent_info.img_count += info.img_count;
682        parent_info.li_count += info.li_count;
683        parent_info.input_count += info.input_count;
684        parent_info.embed_count += info.embed_count;
685        parent_info.iframe_count += info.iframe_count;
686        parent_info.br_count += info.br_count;
687        parent_info.hr_count += info.hr_count;
688    }
689
690    fn is_conditionally_acceptable(&mut self, elem: &ElemRef) -> bool {
691        let is_list = match elem.name {
692            tag!("form") | tag!("fieldset") | tag!("table") | tag!("div") => false,
693            tag!("ul") | tag!("ol") => true,
694            _ => return true
695        };
696
697        // TODO: cache the score to prevent extra calculations.
698        let class_score = if self.weight_classes { class_score(elem) } else { 0. };
699
700        if class_score < 0. {
701            return false;
702        }
703
704        let info = self.info.get_or_create(elem.as_node());
705
706        if info.commas >= 10 {
707            return true;
708        }
709
710        let link_density = info.link_len as f32 / info.text_len as f32;
711        let p_img_ratio = info.p_count as f32 / info.img_count as f32;
712
713        // TODO: take into account ancestor tags (check "figure").
714        !(
715            (info.img_count > 1 && p_img_ratio < 0.5) ||
716            (!is_list && info.li_count > info.p_count + 100) ||
717            (info.input_count * 3 > info.p_count) ||
718            (!is_list && info.text_len < 25 && (info.img_count == 0 || info.img_count > 2)) ||
719            (!is_list && class_score < 25. && link_density > 0.2) ||
720            (class_score >= 25. && link_density > 0.5) ||
721            ((info.embed_count == 1 && info.text_len < 75) || info.embed_count > 1)
722         )
723    }
724
725    fn score_node(&mut self, node: &NodeRef) {
726        if let Some(content_score) = self.calculate_content_score(node) {
727            self.propagate_score(node, content_score);
728        }
729    }
730
731    fn calculate_content_score(&mut self, node: &NodeRef) -> Option<f32> {
732        let parent_elem = node.parent().and_then(|p| p.into_element_ref());
733
734        #[allow(clippy::question_mark)]
735        if parent_elem.is_none() {
736            return None;
737        }
738
739        let info = self.info.get_or_create(node);
740
741        if info.text_len < 25 {
742            return None;
743        }
744
745        // Add a point for the paragraph itself as a base.
746        let mut content_score = 1;
747
748        // Add points for any commas within this paragraph.
749        content_score += info.commas;
750
751        // For every 100 characters in this paragraph, add another point. Up to 3 points.
752        let total_len = info.text_len + info.link_len;
753        content_score += cmp::min(total_len / 100, 3);
754
755        Some(content_score as f32)
756    }
757
758    fn propagate_score(&mut self, node: &NodeRef, content_score: f32) {
759        for (level, ancestor) in node.ancestors().elements().enumerate().take(3) {
760            let div = match level {
761                0 => 1.,
762                1 => 2.,
763                _ => 3. * level as f32
764            };
765
766            let addition = content_score / div;
767
768            let info = self.info.get_or_create(ancestor.as_node());
769
770            info.content_score += addition;
771
772            if !info.is_candidate {
773                self.candidates.push(ancestor);
774                info.is_candidate = true;
775            }
776        }
777    }
778
779    fn score_candidates(&mut self) {
780        trace!("Found {} candidates. Scoring...", self.candidates.len());
781
782        let mut scored_candidates = Vec::with_capacity(self.candidates.len());
783
784        for candidate in self.candidates.drain(..) {
785            trace!("Candidate: <{}> {}",
786                   format_tag(&candidate), format_info(self.info.get(candidate.as_node())));
787
788            let info = self.info.get(candidate.as_node()).unwrap();
789
790            // The node has been removed.
791            if !info.is_candidate {
792                trace!("    => the node has been removed!");
793                continue;
794            }
795
796            debug_assert!(info.text_len > 0);
797            debug_assert!(info.text_len >= info.link_len);
798
799            // Add content points.
800            let mut score = info.content_score;
801
802            // Add points for tag name.
803            score += tag_score(&candidate.name);
804
805            // Add points for an class/id weight.
806            if self.weight_classes {
807                score += class_score(&candidate);
808            }
809
810            // Scale the final score based on link density. Good content should have a relatively
811            // small link density (5% or less) and be mostly unaffected by this operation.
812            score *= 1. - info.link_len as f32 / info.text_len as f32;
813
814            trace!("    => score: {}", score);
815
816            debug_assert!(score.is_finite());
817
818            scored_candidates.push((score, candidate));
819        }
820
821        if scored_candidates.is_empty() {
822            trace!("There are no actual candidates!");
823            return;
824        }
825
826        scored_candidates.sort_by(|&(a, _), &(b, _)| b.partial_cmp(&a).unwrap());
827
828        let score_threshold = scored_candidates[0].0 * 0.75;
829
830        let top_candidate_it = scored_candidates.into_iter()
831            .take_while(|&(score, _)| score >= score_threshold)
832            .map(|(_, candidate)| candidate);
833
834        self.candidates.extend(top_candidate_it);
835
836        trace!("After scoring left {} candidates", self.candidates.len());
837    }
838
839    fn find_common_candidate(&self) -> NodeRef {
840        // TODO: mozilla/readability uses 3 here, but we still have problems.
841        const MIN_CANDIDATES: usize = 4;
842
843        trace!("Searching for common parent...");
844
845        let best = self.candidates[0].as_node();
846
847        if self.candidates.len() < MIN_CANDIDATES ||
848           best.is(tag!("body")) || best.parent().map_or(true, |p| p.is(tag!("body"))) {
849            return best.clone();
850        }
851
852        for common in best.ancestors().take_while(|n| !n.is(tag!("body"))) {
853            let mut n = 0;
854
855            for candidate in &self.candidates[1..] {
856                if candidate.as_node().ancestors().any(|a| a == common) {
857                    n += 1;
858                }
859
860                if n == MIN_CANDIDATES {
861                    trace!("Found common parent of top candidates: <{}>", format_tag(&common));
862                    return common;
863                }
864            }
865        }
866
867        best.clone()
868    }
869
870    fn correct_candidate(&mut self, mut candidate: NodeRef) -> NodeRef {
871        trace!("Correcting candidate...");
872
873        // Because of the bonus system, parents of candidates might have scores themselves.
874        // if we see the score going *up* in the first few steps up the tree, that's a decent sign
875        // that there might be more content lurking in other places that we want to unify in.
876        let mut last_score = self.info.get_or_create(&candidate).content_score;
877        let score_threshold = last_score / 3.;
878
879        for parent in candidate.ancestors().take_while(|n| !n.is(tag!("body"))) {
880            let parent_score = self.info.get_or_create(&parent).content_score;
881
882            if parent_score < score_threshold {
883                break;
884            }
885
886            if parent_score > last_score {
887                candidate = parent;
888                trace!("New candidate: <{}> (enough score)", format_tag(&candidate));
889            }
890
891            last_score = parent_score;
892        }
893
894        // If the top candidate is the only child, use parent instead. This will help sibling
895        // joining logic when adjacent content is actually located in parent's sibling node.
896        let parent_it = candidate.ancestors().take_while(|parent| {
897            let mut child_it = parent.children();
898
899            !parent.is(tag!("body")) && child_it.next().is_some() && child_it.next().is_none() &&
900                self.info.get(parent).map_or(true, |info| !info.is_shabby)
901        });
902
903        let result = parent_it.last().map_or(candidate, |parent| {
904            trace!("New candidate: <{}> (single child)", format_tag(&parent));
905            parent
906        });
907
908        if !is_acceptable_top_level(&result.as_element().unwrap().name) {
909            trace!("Altering result: <{}> to <div>", format_tag(&result));
910            result.rename(tag!("div"))
911        } else {
912            result
913        }
914    }
915}
916
917fn format_tag<N: NodeRefExt>(node: &N) -> String {
918    let elem = node.node_ref().as_element().unwrap();
919    let tag = &elem.name.local;
920    let attributes = elem.attributes.borrow();
921
922    match (attributes.get("id"), attributes.get("class")) {
923        (Some(id), Some(class)) => format!("{} id=\"{}\" class=\"{}\"", tag, id, class),
924        (Some(id), None) => format!("{} id=\"{}\"", tag, id),
925        (None, Some(class)) => format!("{} class=\"{}\"", tag, class),
926        (None, None) => format!("{}", tag)
927    }
928}
929
930fn format_info(info: Option<&mut NodeInfo>) -> String {
931    info.map_or_else(String::new, |i| format!("{:?}", i))
932}
readable_readability/lib.rs

readable_readability/
lib.rs