1use std::cmp;
2use std::iter;
3use std::f32;
4use std::fmt;
5
6use regex::Regex;
7use html5ever::{QualName, local_name, namespace_url, ns};
8use kuchiki::{NodeRef, NodeDataRef, NodeData, ElementData, Attributes};
9use kuchiki::traits::TendrilSink;
10use kuchiki::iter::NodeIterator;
11use lazy_static::lazy_static;
12use log::trace;
13use url::Url;
14
15pub use metadata::Metadata;
16use node_cache::NodeCache;
17
18mod metadata;
19mod node_cache;
20
/// Shorthand for a node handle that is known to point at element data.
type ElemRef = NodeDataRef<ElementData>;
25
26trait NodeRefExt {
27 fn node_ref(&self) -> &NodeRef;
28
29 fn is(&self, name: QualName) -> bool {
30 self.node_ref().as_element().map_or(false, |e| e.name == name)
31 }
32
33 fn replace<N: NodeRefExt>(&self, node: &N) {
34 self.node_ref().insert_before(node.node_ref().clone());
35 self.remove();
36 }
37
38 fn remove(&self) {
39 self.node_ref().detach();
40 }
41
42 fn rename(&self, name: QualName) -> NodeRef {
43 let node = self.node_ref();
44
45 if let Some(elem) = node.as_element() {
46 let attributes = elem.attributes.borrow();
49 let replacement = NodeRef::new_element(name, attributes.map.clone());
50
51 for child in node.children() {
52 replacement.append(child);
53 }
54
55 node.replace(&replacement);
56
57 replacement
58 } else {
59 node.clone()
60 }
61 }
62
63 fn previous_element(&self) -> Option<ElemRef> {
64 self.node_ref().preceding_siblings().elements().next()
65 }
66}
67
/// A plain `NodeRef` is its own underlying node; every other helper comes from
/// the trait's default methods.
impl NodeRefExt for NodeRef {
    #[inline]
    fn node_ref(&self) -> &NodeRef {
        self
    }
}
74
/// Element handles expose their node through `as_node()`.
impl NodeRefExt for ElemRef {
    #[inline]
    fn node_ref(&self) -> &NodeRef {
        self.as_node()
    }

    // The handle already points at element data, so the name can be compared
    // directly instead of going through `as_element()` as the default does.
    fn is(&self, name: QualName) -> bool {
        self.name == name
    }
}
85
// Regular expressions compiled once on first use.
//
// Patterns that start with `(?xi)` are case-insensitive (`i`) and ignore
// whitespace inside the pattern (`x`), which is why they can span several lines.
lazy_static! {
    // `class`/`id` keywords that make `is_unlikely_candidate` reject an element...
    static ref UNLIKELY_CANDIDATE: Regex = Regex::new(r"(?xi)
 -ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|
 modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|
 ad-break|agegate|pagination|pager|popup|yom-remote
 ").unwrap();

    // ...unless one of these keywords is present as well.
    static ref MAYBE_CANDIDATE: Regex = Regex::new(r"(?xi)
 and|article|body|column|main|shadow
 ").unwrap();

    // `class`/`id` keywords that add 25 points in `class_score`.
    static ref POSITIVE: Regex = Regex::new(r"(?xi)
 article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story
 ").unwrap();

    // `class`/`id` keywords that subtract 25 points in `class_score`.
    static ref NEGATIVE: Regex = Regex::new(r"(?xi)
 -ad-|hidden|^hid$|\shid$|\shid\s|^hid\s|banner|combx|comment|com-|contact|foot|footer|footnote|
 gdpr|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|
 sponsor|shopping|tags|tool|widget
 ").unwrap();

    // Hosts of video players; an `<embed>` whose `src` matches is not counted
    // in `embed_count` by `propagate_info`.
    static ref VIDEO: Regex = Regex::new(r"(?xi)
 //(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com
 ").unwrap();

    // Matches a leading `scheme:`; `fix_relative_urls` leaves such URLs alone.
    static ref PROTOCOL: Regex = Regex::new(r"^\w+:").unwrap();
}
118
/// Expands to the `QualName` of the HTML element called `$name`: no prefix,
/// the HTML namespace, and `$name` as the local name.
///
/// This file uses the expansion both as a value (e.g. `node.is(tag!("p"))`)
/// and as a pattern (e.g. `match elem.name { tag!("a") => ... }`).
macro_rules! tag {
    ($name:tt) => {
        QualName {
            prefix: None,
            ns: ns!(html),
            local: local_name!($name),
        }
    };
}
128
/// Expands to the local name of the attribute called `$name`, in the form the
/// attribute map lookups in this file (`get`, `get_mut`, `remove`) are given.
macro_rules! attrib {
    ($name:tt) => { local_name!($name) };
}
132
133fn extract_byline(elem: &ElemRef) -> Option<String> {
134 let attributes = elem.attributes.borrow();
135
136 let rel = attributes.get(attrib!("rel")).unwrap_or("");
137 let is_byline = rel == "author";
143
144 if is_byline {
145 let text = elem.text_contents();
147 let byline = text.trim();
148
149 #[allow(clippy::len_zero)]
150 if 0 < byline.len() && byline.len() < 100 {
151 Some(byline.to_string())
152 } else {
153 None
154 }
155 } else {
156 None
157 }
158}
159
160fn is_unlikely_candidate(elem: &ElemRef) -> bool {
161 match elem.name {
162 tag!("a") | tag!("body") => return false,
163 _ => {}
164 }
165
166 let attributes = elem.attributes.borrow();
167
168 let classes = attributes.get(attrib!("class")).unwrap_or("");
170 let id = attributes.get(attrib!("id")).unwrap_or("");
171
172 (UNLIKELY_CANDIDATE.is_match(classes) || UNLIKELY_CANDIDATE.is_match(id)) &&
173 !(MAYBE_CANDIDATE.is_match(classes) || MAYBE_CANDIDATE.is_match(id))
174}
175
176fn transform_div(div: &ElemRef) {
177 debug_assert_eq!(div.name, tag!("div"));
178
179 let node = div.as_node();
180
181 if has_single_p(node) {
182 trace!(" => replacing <{}> with inner <p>", format_tag(node));
183 let p = node.children().elements().next().unwrap();
184 node.replace(&p);
185 } else if !has_block_elem(node) {
186 trace!(" => altering <{}> to <p>", format_tag(node));
187 node.rename(tag!("p"));
188 } else {
189 for child in node.children() {
191 if let Some(text) = child.as_text() {
192 trace!(" => moving text node in <{}> with <p>", format_tag(node));
193 let text = text.borrow();
194
195 if text.trim().is_empty() {
196 continue;
197 }
198
199 let p = NodeRef::new_element(tag!("p"), iter::empty());
200 child.replace(&p);
201 p.append(child.clone());
202 }
203 }
204 }
205}
206
207fn has_single_p(node: &NodeRef) -> bool {
208 let mut child_it = node.children().elements();
209 let child = child_it.next();
210
211 if child.is_none() || child_it.next().is_some() {
212 return false;
213 }
214
215 let child = child.unwrap();
216 if !child.is(tag!("p")) {
217 return false;
218 }
219
220 node.children().text_nodes().all(|t| t.borrow().trim().is_empty())
221}
222
223fn has_block_elem(node: &NodeRef) -> bool {
224 node.descendants().elements().any(|elem| matches!{
225 elem.name,
226 tag!("a") | tag!("blockquote") | tag!("dl") | tag!("div") | tag!("img") | tag!("ol") |
227 tag!("p") | tag!("pre") | tag!("table") | tag!("ul") | tag!("select")
228 })
229}
230
/// Measures a piece of text, returning `(characters, commas)`.
///
/// The text is trimmed first. A run of whitespace counts as one character,
/// and a run of commas counts as one character and one comma; every other
/// character counts individually.
fn count_chars(text: &str) -> (u32, u32) {
    let mut chars: u32 = 0;
    let mut commas: u32 = 0;

    let mut rest = text.trim().chars().peekable();

    while let Some(current) = rest.next() {
        let following = rest.peek().copied();

        if current.is_whitespace() {
            // Only the last whitespace of a run is counted. The text is trimmed,
            // so a whitespace character always has a successor.
            if matches!(following, Some(c) if !c.is_whitespace()) {
                chars += 1;
            }
        } else if current == ',' {
            // Only the last comma of a run is counted.
            if following != Some(',') {
                chars += 1;
                commas += 1;
            }
        } else {
            chars += 1;
        }
    }

    (chars, commas)
}
255
256fn is_tag_to_score(tag: &QualName) -> bool {
257 matches!{
258 *tag,
259 tag!("section") | tag!("p") | tag!("td") | tag!("pre") |
260 tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6")
261 }
262}
263
264fn tag_score(tag: &QualName) -> f32 {
265 match *tag {
266 tag!("section") => 15.,
267 tag!("div") => 5.,
268 tag!("pre") | tag!("td") | tag!("blockquote") => 3.,
269 tag!("address") | tag!("form") => -3.,
270 tag!("dl") | tag!("dt") | tag!("dd") => -3.,
271 tag!("li") | tag!("ol") | tag!("ul") => -3.,
272 tag!("body") => -5.,
273 tag!("h1") | tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") => -5.,
274 tag!("th") => -5.,
275 _ => 0.
276 }
277}
278
279fn class_score(elem: &ElemRef) -> f32 {
280 let attributes = elem.attributes.borrow();
281 let mut score = 0.;
282
283 if let Some(classes) = attributes.get(attrib!("class")) {
284 if POSITIVE.is_match(classes) { score += 25.; }
285 if NEGATIVE.is_match(classes) { score -= 25.; }
286 }
287
288 if let Some(id) = attributes.get(attrib!("id")) {
289 if POSITIVE.is_match(id) { score += 25.; }
290 if NEGATIVE.is_match(id) { score -= 25.; }
291 }
292
293 score
294}
295
296fn is_stuffed(elem: &ElemRef, info: &NodeInfo) -> bool {
297 match elem.name {
298 tag!("h1") | tag!("footer") | tag!("button") => false,
300
301 tag!("div") | tag!("section") | tag!("header") |
302 tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") => {
303 if info.text_len == 0 {
304 let children_count = elem.as_node().children().count() as u32;
305
306 if children_count == 0 || children_count == info.br_count + info.hr_count {
307 return false;
308 }
309 }
310
311 true
312 },
313
314 tag!("thead") | tag!("tbody") | tag!("th") | tag!("tr") | tag!("td") =>
315 info.text_len > 0 || info.img_count + info.embed_count + info.iframe_count > 0,
317
318 tag!("p") | tag!("pre") | tag!("blockquote") =>
319 info.img_count + info.embed_count + info.iframe_count > 0 ||
321 !elem.text_contents().trim().is_empty(),
323
324 _ => true
325 }
326}
327
/// Strips presentational attributes from an element; at present this is only
/// the `style` attribute.
fn clean_attributes(attributes: &mut Attributes) {
    attributes.remove(attrib!("style"));
}
332
333fn fix_relative_urls(attributes: &mut Attributes, base_url: &Url) {
334 fn fix(url: &mut String, base: &Url) {
335 if url.is_empty() || PROTOCOL.is_match(url) || url.starts_with('#') {
337 return;
338 }
339
340 if let Ok(resolved) = base.join(url) {
341 *url = resolved.into();
342 }
343 }
344
345 if let Some(attr) = attributes.get_mut(attrib!("href")) {
346 fix(attr, base_url);
347 }
348
349 if let Some(attr) = attributes.get_mut(attrib!("src")) {
350 fix(attr, base_url);
351 }
352}
353
354fn is_acceptable_top_level(tag: &QualName) -> bool {
355 matches!(*tag, tag!("div") | tag!("article") | tag!("section") | tag!("p"))
356}
357
/// Statistics gathered for one node while the tree is traversed.
///
/// The counters are cumulative: `propagate_info` adds a node's totals to its
/// parent when the node is left.
#[derive(Default, PartialEq, Clone)]
struct NodeInfo {
    /// Score accumulated through `propagate_score`.
    content_score: f32,
    /// Number of text characters below the node, as counted by `count_chars`.
    text_len: u32,
    /// The part of `text_len` that comes from `<a>` descendants.
    link_len: u32,
    /// Number of commas, as counted by `count_chars`.
    commas: u32,
    /// Set once the node has been pushed to the candidate list; cleared when
    /// the node is removed as conditionally unacceptable.
    is_candidate: bool,
    /// Set on a parent when one of its children has been removed as
    /// conditionally unacceptable.
    is_shabby: bool,

    /// Number of `<p>` descendants.
    p_count: u32,
    /// Number of `<img>` descendants.
    img_count: u32,
    /// Number of `<li>` descendants.
    li_count: u32,
    /// Number of `<input>` descendants.
    input_count: u32,
    /// Number of `<embed>` descendants whose `src` does not match `VIDEO`.
    embed_count: u32,
    /// Number of `<iframe>` descendants.
    iframe_count: u32,
    /// Number of `<br>` descendants.
    br_count: u32,
    /// Number of `<hr>` descendants.
    hr_count: u32,
}
376
377impl fmt::Debug for NodeInfo {
378 fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
379 let mut s = fmt.debug_struct("");
380
381 if self.content_score > 0. { s.field("content_score", &self.content_score); }
382 if self.text_len > 0 { s.field("text", &self.text_len); }
383 if self.link_len > 0 { s.field("link", &self.link_len); }
384 if self.commas > 0 { s.field("commas", &self.commas); }
385 if self.is_candidate { s.field("candidate", &self.is_candidate); }
386 if self.is_shabby { s.field("shabby", &self.is_shabby); }
387 if self.p_count > 0 { s.field("p", &self.p_count); }
388 if self.img_count > 0 { s.field("img", &self.img_count); }
389 if self.li_count > 0 { s.field("li", &self.li_count); }
390 if self.input_count > 0 { s.field("input", &self.input_count); }
391 if self.embed_count > 0 { s.field("embed", &self.embed_count); }
392 if self.iframe_count > 0 { s.field("iframe", &self.iframe_count); }
393 if self.br_count > 0 { s.field("br", &self.br_count); }
394 if self.hr_count > 0 { s.field("hr", &self.hr_count); }
395
396 s.finish()
397 }
398}
399
/// Article extractor: configure it with the builder-style setters, then call
/// `parse` on an HTML document.
pub struct Readability {
    /// Per-node statistics collected during the traversal.
    info: NodeCache<NodeInfo>,
    /// Elements that received a content score and may become the article root.
    candidates: Vec<ElemRef>,
    /// Author line found by `extract_byline`, if any.
    byline: Option<String>,

    /// Remove elements for which `is_unlikely_candidate` is true.
    strip_unlikelys: bool,
    /// Take `class_score` into account.
    weight_classes: bool,
    /// Remove elements rejected by `is_conditionally_acceptable`.
    clean_conditionally: bool,
    /// Run the free function `clean_attributes` on kept elements.
    clean_attributes: bool,
    /// Base used to resolve relative `href`/`src` values of `<a>` and `<img>`.
    base_url: Option<Url>
}
411
/// The default extractor is the one built by `Readability::new()`.
impl Default for Readability {
    fn default() -> Self {
        Self::new()
    }
}
417
418impl Readability {
419 pub fn new() -> Readability {
420 Readability {
421 info: NodeCache::new(),
422 candidates: Vec::new(),
423 byline: None,
424
425 strip_unlikelys: true,
426 weight_classes: true,
427 clean_conditionally: true,
428 clean_attributes: true,
429 base_url: None,
430 }
431 }
432
433 pub fn strip_unlikelys(&mut self, enabled: bool) -> &mut Self {
434 self.strip_unlikelys = enabled;
435 self
436 }
437
438 pub fn weight_classes(&mut self, enabled: bool) -> &mut Self {
439 self.weight_classes = enabled;
440 self
441 }
442
443 pub fn clean_conditionally(&mut self, enabled: bool) -> &mut Self {
444 self.clean_conditionally = enabled;
445 self
446 }
447
448 pub fn clean_attributes(&mut self, enabled: bool) -> &mut Self {
449 self.clean_attributes = enabled;
450 self
451 }
452
453 pub fn base_url<U>(&mut self, url: U) -> &mut Self
454 where U: Into<Option<Url>>
455 {
456 self.base_url = url.into();
457 self
458 }
459
460 pub fn parse(&mut self, html: &str) -> (NodeRef, Metadata) {
461 let top_level = kuchiki::parse_html().one(html);
462
463 let metadata = metadata::extract(&top_level);
464
465 let top_level = top_level.select("html > body").unwrap().next()
466 .map_or(top_level, |b| b.as_node().clone());
467
468 top_level.detach();
469
470 (self.readify(top_level), metadata)
472 }
473
474 fn readify(&mut self, top_level: NodeRef) -> NodeRef {
475 let mut current = top_level.clone();
476 let mut bubbling = false;
477
478 loop {
481 if !bubbling {
482 self.on_capturing(¤t);
483
484 if let Some(child) = current.first_child() {
485 current = child;
486 continue;
487 }
488
489 bubbling = true;
490 }
491
492 let (ncurrent, nbubbling) = if let Some(next) = current.next_sibling() {
493 (next, false)
494 } else if let Some(parent) = current.parent() {
495 (parent, bubbling)
496 } else {
497 if bubbling { self.on_bubbling(¤t); }
498 break;
499 };
500
501 if bubbling { self.on_bubbling(¤t); }
502
503 bubbling = nbubbling;
504 current = ncurrent;
505 }
506
507 if self.candidates.is_empty() {
508 return top_level;
509 }
510
511 self.score_candidates();
512
513 if self.candidates.is_empty() {
514 return top_level;
515 }
516
517 let top_candidate = self.find_common_candidate();
518 self.correct_candidate(top_candidate)
519
520 }
522
523 fn on_capturing(&mut self, node: &NodeRef) {
525 if node.as_element().is_some() {
526 trace!("<{}>", format_tag(node));
527 }
528
529 for child in node.children() {
530 let remove = match *child.data() {
531 NodeData::Comment(_) | NodeData::DocumentFragment => true,
532 NodeData::Text(ref data) => data.borrow().trim().is_empty(),
533 NodeData::Element(ref elem) => {
534 matches!(elem.name, tag!("script") | tag!("style") | tag!("noscript"))
535 },
536 _ => false
537 };
538
539 if remove {
540 if child.as_element().is_some() {
541 trace!(" => removing <{}> as useless element", format_tag(&child));
542 }
543
544 child.remove();
545 }
546
547 if let Some(child) = child.into_element_ref() {
548 if let Some(byline) = extract_byline(&child) {
551 self.byline = Some(byline);
552 trace!(" => removing <{}> as byline container", format_tag(&child));
553 child.remove();
554
555 continue;
556 }
557 if self.strip_unlikelys && is_unlikely_candidate(&child) {
560 trace!(" => removing <{}> as unlikely candidate", format_tag(&child));
561 child.remove();
562 } else if child.is(tag!("div")) {
563 transform_div(&child);
564 } else if child.is(tag!("font")) {
565 trace!(" => altering <{}> to <div>", format_tag(&child));
566 child.rename(tag!("span"));
567 }
568 }
569 }
570 }
571
572 fn on_bubbling(&mut self, node: &NodeRef) {
574 match *node.data() {
575 NodeData::Text(ref data) => {
576 let (char_cnt, comma_cnt) = count_chars(&data.borrow()[..]);
577
578 let parent = node.parent().unwrap();
579
580 let parent_info = self.info.get_or_create(&parent);
581 parent_info.text_len += char_cnt;
582 parent_info.commas += comma_cnt;
583 },
584 NodeData::Element(ElementData { ref name, ref attributes, .. }) => {
585 trace!("</{}> {}", format_tag(node), format_info(self.info.get(node)));
586
587 self.propagate_info(node);
589
590 if is_tag_to_score(name) {
591 self.score_node(node);
592 }
593
594 let elem = node.clone().into_element_ref().unwrap();
595
596 if !is_stuffed(&elem, self.info.get_or_create(node)) {
598 node.remove();
599 trace!(" => removed (it's not stuffed)");
600
601 return;
602 }
603
604 if self.clean_conditionally && !self.is_conditionally_acceptable(&elem) {
605 if let Some(info) = self.info.get(node) {
606 info.is_candidate = false;
607 }
608
609 if let Some(info) = node.parent().map(|parent| self.info.get_or_create(&parent)) {
610 info.is_shabby = true;
611 }
612
613 node.remove();
614 trace!(" => removed (it's conditionally unacceptable)");
615
616 return;
617 }
618
619 if node.is(tag!("p")) {
621 if let Some(elem) = node.previous_element() {
622 if elem.is(tag!("br")) {
623 elem.remove();
624 }
625 }
626 }
627
628 let mut attributes = attributes.borrow_mut();
629
630 if self.clean_attributes {
631 clean_attributes(&mut attributes);
632 }
633
634 if let Some(ref base_url) = self.base_url {
635 if *name == tag!("a") || *name == tag!("img") {
636 fix_relative_urls(&mut attributes, base_url);
637 }
638 }
639 },
640 _ => {}
641 };
642 }
643
644 fn propagate_info(&mut self, node: &NodeRef) {
645 let parent = match node.parent() {
646 Some(parent) => parent,
647 None => return
648 };
649
650 let is_a = node.is(tag!("a"));
651 let info = self.info.get_or_create(node).clone();
653
654 let parent_info = self.info.get_or_create(&parent);
655
656 if let Some(elem) = node.as_element() {
657 match elem.name {
658 tag!("p") => parent_info.p_count += 1,
659 tag!("img") => parent_info.img_count += 1,
660 tag!("li") => parent_info.li_count += 1,
661 tag!("input") => parent_info.input_count += 1,
662 tag!("br") => parent_info.br_count += 1,
663 tag!("hr") => parent_info.hr_count += 1,
664 tag!("iframe") => parent_info.iframe_count += 1,
665 tag!("embed") => {
666 let attribs = elem.attributes.borrow();
667 let src = attribs.get(attrib!("src")).unwrap_or("");
668
669 if !VIDEO.is_match(src) {
670 parent_info.embed_count += 1;
671 }
672 },
673 _ => {}
674 };
675 }
676
677 parent_info.link_len += if is_a { info.text_len } else { info.link_len };
678 parent_info.text_len += info.text_len;
679 parent_info.commas += info.commas;
680 parent_info.p_count += info.p_count;
681 parent_info.img_count += info.img_count;
682 parent_info.li_count += info.li_count;
683 parent_info.input_count += info.input_count;
684 parent_info.embed_count += info.embed_count;
685 parent_info.iframe_count += info.iframe_count;
686 parent_info.br_count += info.br_count;
687 parent_info.hr_count += info.hr_count;
688 }
689
690 fn is_conditionally_acceptable(&mut self, elem: &ElemRef) -> bool {
691 let is_list = match elem.name {
692 tag!("form") | tag!("fieldset") | tag!("table") | tag!("div") => false,
693 tag!("ul") | tag!("ol") => true,
694 _ => return true
695 };
696
697 let class_score = if self.weight_classes { class_score(elem) } else { 0. };
699
700 if class_score < 0. {
701 return false;
702 }
703
704 let info = self.info.get_or_create(elem.as_node());
705
706 if info.commas >= 10 {
707 return true;
708 }
709
710 let link_density = info.link_len as f32 / info.text_len as f32;
711 let p_img_ratio = info.p_count as f32 / info.img_count as f32;
712
713 !(
715 (info.img_count > 1 && p_img_ratio < 0.5) ||
716 (!is_list && info.li_count > info.p_count + 100) ||
717 (info.input_count * 3 > info.p_count) ||
718 (!is_list && info.text_len < 25 && (info.img_count == 0 || info.img_count > 2)) ||
719 (!is_list && class_score < 25. && link_density > 0.2) ||
720 (class_score >= 25. && link_density > 0.5) ||
721 ((info.embed_count == 1 && info.text_len < 75) || info.embed_count > 1)
722 )
723 }
724
725 fn score_node(&mut self, node: &NodeRef) {
726 if let Some(content_score) = self.calculate_content_score(node) {
727 self.propagate_score(node, content_score);
728 }
729 }
730
731 fn calculate_content_score(&mut self, node: &NodeRef) -> Option<f32> {
732 let parent_elem = node.parent().and_then(|p| p.into_element_ref());
733
734 #[allow(clippy::question_mark)]
735 if parent_elem.is_none() {
736 return None;
737 }
738
739 let info = self.info.get_or_create(node);
740
741 if info.text_len < 25 {
742 return None;
743 }
744
745 let mut content_score = 1;
747
748 content_score += info.commas;
750
751 let total_len = info.text_len + info.link_len;
753 content_score += cmp::min(total_len / 100, 3);
754
755 Some(content_score as f32)
756 }
757
758 fn propagate_score(&mut self, node: &NodeRef, content_score: f32) {
759 for (level, ancestor) in node.ancestors().elements().enumerate().take(3) {
760 let div = match level {
761 0 => 1.,
762 1 => 2.,
763 _ => 3. * level as f32
764 };
765
766 let addition = content_score / div;
767
768 let info = self.info.get_or_create(ancestor.as_node());
769
770 info.content_score += addition;
771
772 if !info.is_candidate {
773 self.candidates.push(ancestor);
774 info.is_candidate = true;
775 }
776 }
777 }
778
779 fn score_candidates(&mut self) {
780 trace!("Found {} candidates. Scoring...", self.candidates.len());
781
782 let mut scored_candidates = Vec::with_capacity(self.candidates.len());
783
784 for candidate in self.candidates.drain(..) {
785 trace!("Candidate: <{}> {}",
786 format_tag(&candidate), format_info(self.info.get(candidate.as_node())));
787
788 let info = self.info.get(candidate.as_node()).unwrap();
789
790 if !info.is_candidate {
792 trace!(" => the node has been removed!");
793 continue;
794 }
795
796 debug_assert!(info.text_len > 0);
797 debug_assert!(info.text_len >= info.link_len);
798
799 let mut score = info.content_score;
801
802 score += tag_score(&candidate.name);
804
805 if self.weight_classes {
807 score += class_score(&candidate);
808 }
809
810 score *= 1. - info.link_len as f32 / info.text_len as f32;
813
814 trace!(" => score: {}", score);
815
816 debug_assert!(score.is_finite());
817
818 scored_candidates.push((score, candidate));
819 }
820
821 if scored_candidates.is_empty() {
822 trace!("There are no actual candidates!");
823 return;
824 }
825
826 scored_candidates.sort_by(|&(a, _), &(b, _)| b.partial_cmp(&a).unwrap());
827
828 let score_threshold = scored_candidates[0].0 * 0.75;
829
830 let top_candidate_it = scored_candidates.into_iter()
831 .take_while(|&(score, _)| score >= score_threshold)
832 .map(|(_, candidate)| candidate);
833
834 self.candidates.extend(top_candidate_it);
835
836 trace!("After scoring left {} candidates", self.candidates.len());
837 }
838
839 fn find_common_candidate(&self) -> NodeRef {
840 const MIN_CANDIDATES: usize = 4;
842
843 trace!("Searching for common parent...");
844
845 let best = self.candidates[0].as_node();
846
847 if self.candidates.len() < MIN_CANDIDATES ||
848 best.is(tag!("body")) || best.parent().map_or(true, |p| p.is(tag!("body"))) {
849 return best.clone();
850 }
851
852 for common in best.ancestors().take_while(|n| !n.is(tag!("body"))) {
853 let mut n = 0;
854
855 for candidate in &self.candidates[1..] {
856 if candidate.as_node().ancestors().any(|a| a == common) {
857 n += 1;
858 }
859
860 if n == MIN_CANDIDATES {
861 trace!("Found common parent of top candidates: <{}>", format_tag(&common));
862 return common;
863 }
864 }
865 }
866
867 best.clone()
868 }
869
870 fn correct_candidate(&mut self, mut candidate: NodeRef) -> NodeRef {
871 trace!("Correcting candidate...");
872
873 let mut last_score = self.info.get_or_create(&candidate).content_score;
877 let score_threshold = last_score / 3.;
878
879 for parent in candidate.ancestors().take_while(|n| !n.is(tag!("body"))) {
880 let parent_score = self.info.get_or_create(&parent).content_score;
881
882 if parent_score < score_threshold {
883 break;
884 }
885
886 if parent_score > last_score {
887 candidate = parent;
888 trace!("New candidate: <{}> (enough score)", format_tag(&candidate));
889 }
890
891 last_score = parent_score;
892 }
893
894 let parent_it = candidate.ancestors().take_while(|parent| {
897 let mut child_it = parent.children();
898
899 !parent.is(tag!("body")) && child_it.next().is_some() && child_it.next().is_none() &&
900 self.info.get(parent).map_or(true, |info| !info.is_shabby)
901 });
902
903 let result = parent_it.last().map_or(candidate, |parent| {
904 trace!("New candidate: <{}> (single child)", format_tag(&parent));
905 parent
906 });
907
908 if !is_acceptable_top_level(&result.as_element().unwrap().name) {
909 trace!("Altering result: <{}> to <div>", format_tag(&result));
910 result.rename(tag!("div"))
911 } else {
912 result
913 }
914 }
915}
916
917fn format_tag<N: NodeRefExt>(node: &N) -> String {
918 let elem = node.node_ref().as_element().unwrap();
919 let tag = &elem.name.local;
920 let attributes = elem.attributes.borrow();
921
922 match (attributes.get("id"), attributes.get("class")) {
923 (Some(id), Some(class)) => format!("{} id=\"{}\" class=\"{}\"", tag, id, class),
924 (Some(id), None) => format!("{} id=\"{}\"", tag, id),
925 (None, Some(class)) => format!("{} class=\"{}\"", tag, class),
926 (None, None) => format!("{}", tag)
927 }
928}
929
930fn format_info(info: Option<&mut NodeInfo>) -> String {
931 info.map_or_else(String::new, |i| format!("{:?}", i))
932}