readability/
scorer.rs

1use crate::dom;
2use html5ever::{
3    namespace_url, ns,
4    tree_builder::{ElementFlags, NodeOrText, TreeSink},
5    LocalName, QualName,
6};
7use lazy_static::lazy_static;
8use markup5ever_rcdom::{
9    Handle, Node,
10    NodeData::{Comment, Doctype, Document, Element, ProcessingInstruction, Text},
11    RcDom,
12};
13use regex::Regex;
14use std::{borrow::Cow, cell::Cell, collections::BTreeMap, path::Path, rc::Rc};
15use url::Url;
16
17const PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
18// TODO: remove "comment" from unlikely candidates
19const UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
20     |remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\
21     |pagination|pager|popup|tweet|twitter\
22     |ssba";
23const LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\
24                                              |content|hentry";
25const POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
26     |pagination|post|text|blog|story";
27// TODO: remove "comment" and "com" from unlikely candidates
28const NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\
29     |masthead|media|meta|outbrain|promo|related\
30     |scroll|shoutbox|sidebar|sponsor|shopping\
31     |tags|tool|widget|form|textfield\
32     |uiScale|hidden";
33const BLOCK_CHILD_TAGS: [&str; 10] = [
34    "a",
35    "blockquote",
36    "dl",
37    "div",
38    "img",
39    "ol",
40    "p",
41    "pre",
42    "table",
43    "ul",
44];
45lazy_static! {
46    static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap();
47    static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap();
48    static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
49    static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
50    static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
51}
52
53#[derive(Clone)]
54pub struct Candidate {
55    pub node: Rc<Node>,
56    pub score: Cell<f32>,
57}
58
59pub struct TopCandidate<'a> {
60    id: Cow<'a, str>,
61    candidate: Cow<'a, Candidate>,
62}
63
64impl<'a> TopCandidate<'a> {
65    pub fn new(id: &'a str, candidate: Candidate) -> Self {
66        Self {
67            id: Cow::Borrowed(id),
68            candidate: Cow::Owned(candidate),
69        }
70    }
71
72    pub fn id(&self) -> &str {
73        self.id.as_ref()
74    }
75
76    pub fn candidate(&self) -> &Candidate {
77        &self.candidate
78    }
79
80    pub fn node(&self) -> &Rc<Node> {
81        &self.candidate.node
82    }
83
84    pub fn score(&self) -> &Cell<f32> {
85        &self.candidate.score
86    }
87}
88
89/// Distribution of the content score among parent nodes.
90#[derive(Debug)]
91pub enum CandidateScore {
92    /// The same weight for all parent nodes.
93    EqualWeight,
94    /// The weight decreases with the level of the parent node.
95    ///
96    /// For example, a parent node will be weighted more than a grandparent.
97    LevelWeight,
98}
99
100#[derive(Debug)]
101pub struct ScorerOptions<'a> {
102    /// The minimum word length of candidates.
103    pub min_candidate_length: usize,
104    /// The maximal number of parent nodes that will be traversed.
105    pub max_candidate_parents: usize,
106    /// Distribution of the content score among parent nodes.
107    pub candidate_score: CandidateScore,
108    pub punctuations: &'a Regex,
109    pub unlikely_candidates: &'a Regex,
110    pub likely_candidates: &'a Regex,
111    pub positive_candidates: &'a Regex,
112    pub positive_candidate_weight: f32,
113    pub negative_candidates: &'a Regex,
114    pub negative_candidate_weight: f32,
115    pub block_child_tags: &'a [&'a str],
116}
117
118impl Default for ScorerOptions<'_> {
119    fn default() -> Self {
120        Self {
121            min_candidate_length: 20,
122            max_candidate_parents: 10,
123            candidate_score: CandidateScore::EqualWeight,
124            punctuations: &PUNCTUATIONS,
125            likely_candidates: &LIKELY,
126            unlikely_candidates: &UNLIKELY,
127            positive_candidates: &POSITIVE,
128            positive_candidate_weight: 25.0,
129            negative_candidates: &NEGATIVE,
130            negative_candidate_weight: 25.0,
131            block_child_tags: &BLOCK_CHILD_TAGS,
132        }
133    }
134}
135
136pub struct Scorer<'a> {
137    options: ScorerOptions<'a>,
138}
139
140impl<'a> Scorer<'a> {
141    pub fn new(options: ScorerOptions<'a>) -> Self {
142        Scorer { options }
143    }
144
145    pub fn preprocess(&self, dom: &mut RcDom, handle: Handle, title: &mut String) -> bool {
146        if let Element {
147            ref name,
148            ref attrs,
149            ..
150        } = handle.clone().data
151        {
152            let tag_name = name.local.as_ref();
153            match tag_name.to_lowercase().as_ref() {
154                "script" | "link" | "style" => return true,
155                "title" => dom::extract_text(handle.clone(), title, true),
156                _ => (),
157            }
158            for name in ["id", "class"].iter() {
159                if let Some(val) = dom::attr(name, &attrs.borrow()) {
160                    if tag_name != "body"
161                        && self.options.unlikely_candidates.is_match(&val)
162                        && !self.options.likely_candidates.is_match(&val)
163                    {
164                        return true;
165                    }
166                }
167            }
168        }
169        let mut useless_nodes = vec![];
170        let mut paragraph_nodes = vec![];
171        let mut br_count = 0;
172        for child in handle.children.borrow().iter() {
173            if self.preprocess(dom, child.clone(), title) {
174                useless_nodes.push(child.clone());
175            }
176            let c = child.clone();
177            match c.data {
178                Element { ref name, .. } => {
179                    let tag_name = name.local.as_ref();
180                    if "br" == tag_name.to_lowercase() {
181                        br_count += 1
182                    } else {
183                        br_count = 0
184                    }
185                }
186                Text { ref contents } => {
187                    let s = contents.borrow();
188                    if br_count >= 2 && !s.trim().is_empty() {
189                        paragraph_nodes.push(child.clone());
190                        br_count = 0
191                    }
192                }
193                _ => (),
194            }
195        }
196        for node in useless_nodes.iter() {
197            dom.remove_from_parent(node);
198        }
199        for node in paragraph_nodes.iter() {
200            let name = QualName::new(None, ns!(), LocalName::from("p"));
201            let p = dom.create_element(name, vec![], ElementFlags::default());
202            dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
203            dom.remove_from_parent(node);
204            if let Text { ref contents } = node.clone().data {
205                let text = contents.clone().into_inner().clone();
206                dom.append(&p, NodeOrText::AppendText(text))
207            }
208        }
209        false
210    }
211
212    /// Find candidate tags in DOM node, and distribute score among them.
213    pub fn find_candidates(
214        &self,
215        node_id: &Path,
216        handle: Handle,
217        candidates: &mut BTreeMap<String, Candidate>,
218        nodes: &mut BTreeMap<String, Rc<Node>>,
219    ) {
220        if let Some(id) = node_id
221            .to_str()
222            .map(|candidate_id| candidate_id.to_string())
223        {
224            nodes.insert(id, handle.clone());
225        }
226
227        if self.is_candidate(handle.clone()) {
228            let content_score = self.calculate_content_score(handle.clone());
229
230            let mut current_node_id = Some(node_id.to_path_buf());
231            let mut level = 1;
232
233            // Traverse all parent nodes and distribute content score.
234            while let Some(current_id) = current_node_id {
235                // Break traversal for performance reasons.
236                if level > self.options.max_candidate_parents {
237                    break;
238                }
239
240                if let Some(candidate) =
241                    current_id.to_str().map(|id| id.to_string()).and_then(|id| {
242                        // Only parent nodes are valid candidates.
243                        if current_id != node_id {
244                            self.find_or_create_candidate(Path::new(&id), candidates, nodes)
245                        } else {
246                            None
247                        }
248                    })
249                {
250                    let adjusted_content_score = match self.options.candidate_score {
251                        CandidateScore::EqualWeight => content_score,
252                        CandidateScore::LevelWeight => content_score / (level as f32),
253                    };
254                    candidate
255                        .score
256                        .set(candidate.score.get() + adjusted_content_score);
257
258                    // Ignore candidates above the `body` node.
259                    if dom::get_tag_name(candidate.node.clone()).as_deref() == Some("body") {
260                        break;
261                    }
262                }
263
264                current_node_id = current_id.parent().map(|pid| pid.to_path_buf());
265                level += 1;
266            }
267        }
268
269        for (i, child) in handle.children.borrow().iter().enumerate() {
270            self.find_candidates(
271                node_id.join(i.to_string()).as_path(),
272                child.clone(),
273                candidates,
274                nodes,
275            )
276        }
277    }
278
279    // TODO: find top candidates with similar score.
280    pub fn find_top_candidate(
281        &self,
282        candidates: &'a BTreeMap<String, Candidate>,
283    ) -> Option<TopCandidate<'a>> {
284        let mut top_candidate: Option<TopCandidate> = None;
285
286        for (id, candidate) in candidates.iter() {
287            let score = candidate.score.get() * (1.0 - get_link_density(candidate.node.clone()));
288            candidate.score.set(score);
289
290            if top_candidate
291                .as_ref()
292                .map_or(true, |top| score > top.candidate.score.get())
293            {
294                top_candidate = Some(TopCandidate {
295                    id: Cow::Borrowed(id),
296                    candidate: Cow::Borrowed(candidate),
297                });
298            }
299        }
300
301        top_candidate
302    }
303
304    pub fn clean(
305        &self,
306        dom: &mut RcDom,
307        id: &Path,
308        handle: Handle,
309        url: &Url,
310        candidates: &BTreeMap<String, Candidate>,
311    ) -> bool {
312        let mut useless = false;
313        match handle.data {
314            Document => (),
315            Doctype { .. } => (),
316            Text { ref contents } => {
317                let s = contents.borrow();
318                if s.trim().is_empty() {
319                    useless = true
320                }
321            }
322            Comment { .. } => useless = true,
323            Element {
324                ref name,
325                ref attrs,
326                ..
327            } => {
328                let tag_name = name.local.as_ref();
329                match tag_name.to_lowercase().as_ref() {
330                    "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object"
331                    | "header" | "footer" | "aside" => useless = true,
332                    "form" | "table" | "ul" | "div" => {
333                        useless = self.is_useless(id, handle.clone(), candidates)
334                    }
335                    "img" => useless = !fix_img_path(handle.clone(), url),
336                    "a" => useless = !fix_anchor_path(handle.clone(), url),
337                    _ => (),
338                }
339                dom::clean_attr("id", &mut attrs.borrow_mut());
340                dom::clean_attr("class", &mut attrs.borrow_mut());
341                dom::clean_attr("style", &mut attrs.borrow_mut());
342            }
343            ProcessingInstruction { .. } => unreachable!(),
344        }
345        let mut useless_nodes = vec![];
346        for (i, child) in handle.children.borrow().iter().enumerate() {
347            let pid = id.join(i.to_string());
348            if self.clean(dom, pid.as_path(), child.clone(), url, candidates) {
349                useless_nodes.push(child.clone());
350            }
351        }
352        for node in useless_nodes.iter() {
353            dom.remove_from_parent(node);
354        }
355        if dom::is_empty(handle) {
356            useless = true
357        }
358        useless
359    }
360
361    fn calculate_content_score(&self, handle: Handle) -> f32 {
362        let mut score: f32 = 1.0;
363        let mut text = String::new();
364        dom::extract_text(handle.clone(), &mut text, true);
365        let mat = self.options.punctuations.find_iter(&text);
366        score += mat.count() as f32;
367        score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
368        score
369    }
370
371    fn get_class_weight(&self, handle: Handle) -> f32 {
372        let mut weight: f32 = 0.0;
373        if let Element {
374            name: _, ref attrs, ..
375        } = handle.data
376        {
377            for name in ["id", "class"].iter() {
378                if let Some(val) = dom::attr(name, &attrs.borrow()) {
379                    if self.options.positive_candidates.is_match(&val) {
380                        weight += self.options.positive_candidate_weight
381                    };
382                    if self.options.negative_candidates.is_match(&val) {
383                        weight -= self.options.negative_candidate_weight
384                    }
385                }
386            }
387        };
388        weight
389    }
390
391    fn init_content_score(&self, handle: Handle) -> f32 {
392        let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
393        let score = match tag_name.as_ref() {
394            "article" => 10.0,
395            "div" => 5.0,
396            "pre" | "td" | "blockquote" => 3.0,
397            "address" | "ol" | "ul" | "dl" | "dd" | "dt" | "li" | "form" => -3.0,
398            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "th" => -5.0,
399            _ => 0.0,
400        };
401        score + self.get_class_weight(handle.clone())
402    }
403
404    fn find_or_create_candidate(
405        &self,
406        id: &Path,
407        candidates: &'a mut BTreeMap<String, Candidate>,
408        nodes: &BTreeMap<String, Rc<Node>>,
409    ) -> Option<&'a Candidate> {
410        if let Some(id) = id.to_str().map(|id| id.to_string()) {
411            if let Some(node) = nodes.get(&id) {
412                if candidates.get(&id).is_none() {
413                    candidates.insert(
414                        id.clone(),
415                        Candidate {
416                            node: node.clone(),
417                            score: Cell::new(self.init_content_score(node.clone())),
418                        },
419                    );
420                }
421                return candidates.get(&id);
422            }
423        }
424        None
425    }
426
427    fn is_useless(
428        &self,
429        id: &Path,
430        handle: Handle,
431        candidates: &BTreeMap<String, Candidate>,
432    ) -> bool {
433        let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
434        let weight = self.get_class_weight(handle.clone());
435        let score = id
436            .to_str()
437            .and_then(|id| candidates.get(id))
438            .map(|c| c.score.get())
439            .unwrap_or(0.0);
440        if weight + score < 0.0 {
441            return true;
442        }
443        let text_nodes_len = dom::text_children_count(handle.clone());
444        let mut p_nodes: Vec<Rc<Node>> = vec![];
445        let mut img_nodes: Vec<Rc<Node>> = vec![];
446        let mut li_nodes: Vec<Rc<Node>> = vec![];
447        let mut input_nodes: Vec<Rc<Node>> = vec![];
448        let mut embed_nodes: Vec<Rc<Node>> = vec![];
449        dom::find_node(handle.clone(), "p", &mut p_nodes);
450        dom::find_node(handle.clone(), "img", &mut img_nodes);
451        dom::find_node(handle.clone(), "li", &mut li_nodes);
452        dom::find_node(handle.clone(), "input", &mut input_nodes);
453        dom::find_node(handle.clone(), "embed", &mut embed_nodes);
454        let p_count = p_nodes.len();
455        let img_count = img_nodes.len();
456        let li_count = li_nodes.len() as i32 - 100;
457        let input_count = input_nodes.len();
458        let embed_count = embed_nodes.len();
459        let link_density = get_link_density(handle.clone());
460        let content_length = dom::text_len(handle.clone());
461        let para_count = text_nodes_len + p_count;
462
463        if img_count > para_count + text_nodes_len {
464            return true;
465        }
466        if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
467            return true;
468        }
469        if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
470            return true;
471        }
472        if content_length < 25 && (img_count == 0 || img_count > 2) {
473            return true;
474        }
475        if weight < 25.0 && link_density > 0.2 {
476            return true;
477        }
478        if (embed_count == 1 && content_length < 35) || embed_count > 1 {
479            return true;
480        }
481        false
482    }
483
484    fn is_candidate(&self, handle: Handle) -> bool {
485        let text_len = dom::text_len(handle.clone());
486        if text_len < self.options.min_candidate_length {
487            return false;
488        }
489        let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default();
490        match n {
491            "p" => true,
492            "div" | "article" | "center" | "section" => {
493                !dom::has_nodes(handle.clone(), self.options.block_child_tags)
494            }
495            _ => false,
496        }
497    }
498}
499
500pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
501    let src = dom::get_attr("src", handle.clone());
502    let s = match src {
503        Some(src) => src,
504        None => return false,
505    };
506    if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
507        if let Ok(new_url) = url.join(&s) {
508            dom::set_attr("src", new_url.as_str(), handle)
509        }
510    }
511    true
512}
513
514pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool {
515    let src = dom::get_attr("href", handle.clone());
516    let s = match src {
517        Some(src) => src,
518        None => return false,
519    };
520    if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
521        if let Ok(new_url) = url.join(&s) {
522            dom::set_attr("href", new_url.as_str(), handle)
523        }
524    }
525    true
526}
527
528pub fn get_link_density(handle: Handle) -> f32 {
529    let text_length = dom::text_len(handle.clone()) as f32;
530    if text_length == 0.0 {
531        return 0.0;
532    }
533    let mut link_length = 0.0;
534    let mut links: Vec<Rc<Node>> = vec![];
535    dom::find_node(handle.clone(), "a", &mut links);
536    for link in links.iter() {
537        link_length += dom::text_len(link.clone()) as f32;
538    }
539    link_length / text_length
540}
541
542#[cfg(test)]
543mod tests {
544    use super::*;
545    use crate::utils::{debug_candidates, CandidateTag};
546    use html5ever::{parse_document, tendril::TendrilSink};
547    use std::{fs::File, io::Read};
548
549    #[test]
550    fn test_find_candidates_basic() {
551        let html = r#"
552        <!DOCTYPE html>
553        <html>
554            <head><title>Test Title</title></head>
555            <body>
556                <h1>Welcome</h1>
557                <p>This is a test paragraph with more than 25 characters.</p>
558            </body>
559        </html>"#;
560        let options = ScorerOptions::default();
561        let scorer = Scorer::new(options);
562        let dom = parse_document(RcDom::default(), Default::default())
563            .from_utf8()
564            .read_from(&mut html.as_bytes())
565            .unwrap();
566
567        assert!(dom.errors.is_empty(), "{:?}", dom.errors);
568
569        let mut candidates = BTreeMap::new();
570        let mut nodes = BTreeMap::new();
571
572        scorer.find_candidates(Path::new("/"), dom.document, &mut candidates, &mut nodes);
573
574        let tags = dbg!(debug_candidates(&candidates));
575
576        assert_eq!(candidates.len(), 1);
577        assert!(tags.contains(&CandidateTag::new("body", None, 1.0)));
578    }
579
580    #[test]
581    fn test_find_candidates_comments() {
582        let mut file = File::open("data/comments/input.html").unwrap();
583        let mut html = String::new();
584        file.read_to_string(&mut html).unwrap();
585
586        let options = ScorerOptions {
587            unlikely_candidates: &Regex::new(
588                "combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|ssba",
589            )
590            .unwrap(),
591            negative_candidates: &Regex::new("combx|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|form|textfield|uiScale|hidden").unwrap(),
592            positive_candidates: &Regex::new("article|body|content|entry|hentry|main|page|pagination|post|blog|story").unwrap(),
593            ..Default::default()
594        };
595        let scorer = Scorer::new(options);
596        let dom = parse_document(RcDom::default(), Default::default())
597            .from_utf8()
598            .read_from(&mut html.as_bytes())
599            .unwrap();
600
601        assert!(dom.errors.is_empty(), "{:?}", dom.errors);
602
603        let mut candidates = BTreeMap::new();
604        let mut nodes = BTreeMap::new();
605
606        scorer.find_candidates(Path::new("/"), dom.document, &mut candidates, &mut nodes);
607
608        let tags = dbg!(debug_candidates(&candidates));
609
610        assert_eq!(candidates.len(), 15);
611
612        assert!(tags.contains(&CandidateTag::new("tbody", None, 6.0)));
613        assert!(tags.contains(&CandidateTag::new("tr", Some("tr_2"), 6.0)));
614        assert!(tags.contains(&CandidateTag::new("table", Some("table_2"), 6.0)));
615        assert!(tags.contains(&CandidateTag::new("td", Some("td_0"), 9.0)));
616        assert!(tags.contains(&CandidateTag::new("td", Some("td_1"), 5.0)));
617        assert!(tags.contains(&CandidateTag::new("td", Some("td_2"), 5.0)));
618        assert!(tags.contains(&CandidateTag::new("td", Some("td_3"), 5.0)));
619        assert!(tags.contains(&CandidateTag::new("div", Some("comment_1"), 7.0)));
620        assert!(tags.contains(&CandidateTag::new("div", Some("comment_2"), 7.0)));
621        assert!(tags.contains(&CandidateTag::new("div", Some("comment_3"), 7.0)));
622        assert!(tags.contains(&CandidateTag::new("div", Some("commtext_1"), 7.0)));
623        assert!(tags.contains(&CandidateTag::new("div", Some("commtext_2"), 7.0)));
624        assert!(tags.contains(&CandidateTag::new("div", Some("commtext_3"), 7.0)));
625    }
626}