readability/
scorer.rs

1use dom;
2use html5ever::tree_builder::TreeSink;
3use html5ever::tree_builder::{ElementFlags, NodeOrText};
4use html5ever::{LocalName, QualName};
5use markup5ever_rcdom::Handle;
6use markup5ever_rcdom::Node;
7use markup5ever_rcdom::NodeData::{Comment, Doctype, Document, ProcessingInstruction};
8use markup5ever_rcdom::NodeData::{Element, Text};
9use markup5ever_rcdom::RcDom;
10use regex::Regex;
11use std::cell::Cell;
12use std::collections::BTreeMap;
13use std::path::Path;
14use std::rc::Rc;
15use url::Url;
16
/// Regex source for the punctuation matches counted by `calc_content_score`
/// (compiled once into `PUNCTUATIONS`).
pub static PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
/// Regex source matched against `id`/`class` values in `preprocess`: an element (other than
/// `body`) whose value matches this and does not match `LIKELY_CANDIDATES` is removed.
pub static UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
     |remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\
     |pagination|pager|popup|tweet|twitter\
     |ssba";
/// Regex source that rescues an element from the `UNLIKELY_CANDIDATES` check in `preprocess`.
pub static LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\
                                              |content|hentry";
/// Regex source for `id`/`class` values that add 25 points in `get_class_weight`.
pub static POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
     |pagination|post|text|blog|story";
/// Regex source for `id`/`class` values that subtract 25 points in `get_class_weight`.
///
/// NOTE(review): the alternatives are unanchored substrings, so the bare `com` entry also
/// matches values such as `component` or `welcome` and makes `combx`/`comment` redundant.
/// Mozilla's Readability uses `com-` here — confirm whether the broader match is intended.
pub static NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\
     |masthead|media|meta|outbrain|promo|related\
     |scroll|shoutbox|sidebar|sponsor|shopping\
     |tags|tool|widget|form|textfield\
     |uiScale|hidden";
/// Tag names passed to `dom::has_nodes` by `is_candidate`: a `div`, `article`, `center` or
/// `section` for which `dom::has_nodes` reports any of these tags is not a candidate itself.
static BLOCK_CHILD_TAGS: [&str; 10] = [
    "a",
    "blockquote",
    "dl",
    "div",
    "img",
    "ol",
    "p",
    "pre",
    "table",
    "ul",
];
// Compiled forms of the pattern sources above. Each regex is built on first use and then
// shared; the `unwrap` can only fail if one of the constant patterns is not a valid regex.
lazy_static! {
    static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap();
    static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap();
    static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
    static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
    static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
}
50
/// A DOM node paired with the content score accumulated for it.
pub struct Candidate {
    /// The DOM node this score belongs to.
    pub node: Rc<Node>,
    /// Running score; a `Cell` so it can be updated through a shared reference
    /// (as `find_candidates` does).
    pub score: Cell<f32>,
}
55
56pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
57    let src = dom::get_attr("src", handle.clone());
58    let s = match src {
59        Some(src) => src,
60        None => return false,
61    };
62    if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
63        if let Ok(new_url) = url.join(&s) {
64            dom::set_attr("src", new_url.as_str(), handle)
65        }
66    }
67    true
68}
69
70pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool {
71    let src = dom::get_attr("href", handle.clone());
72    let s = match src {
73        Some(src) => src,
74        None => return false,
75    };
76    if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
77        if let Ok(new_url) = url.join(&s) {
78            dom::set_attr("href", new_url.as_str(), handle)
79        }
80    }
81    true
82}
83
84pub fn get_link_density(handle: Handle) -> f32 {
85    let text_length = dom::text_len(handle.clone()) as f32;
86    if text_length == 0.0 {
87        return 0.0;
88    }
89    let mut link_length = 0.0;
90    let mut links: Vec<Rc<Node>> = vec![];
91    dom::find_node(handle.clone(), "a", &mut links);
92    for link in links.iter() {
93        link_length += dom::text_len(link.clone()) as f32;
94    }
95    link_length / text_length
96}
97
98pub fn is_candidate(handle: Handle) -> bool {
99    let text_len = dom::text_len(handle.clone());
100    if text_len < 20 {
101        return false;
102    }
103    let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default();
104    match n {
105        "p" => true,
106        "div" | "article" | "center" | "section" => {
107            !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.to_vec())
108        }
109        _ => false,
110    }
111}
112
113pub fn init_content_score(handle: Handle) -> f32 {
114    let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
115    let score = match tag_name.as_ref() {
116        "article" => 10.0,
117        "div" => 5.0,
118        "blockquote" => 3.0,
119        "form" => -3.0,
120        "th" => 5.0,
121        _ => 0.0,
122    };
123    score + get_class_weight(handle.clone())
124}
125
126pub fn calc_content_score(handle: Handle) -> f32 {
127    let mut score: f32 = 1.0;
128    let mut text = String::new();
129    dom::extract_text(handle.clone(), &mut text, true);
130    let mat = PUNCTUATIONS.find_iter(&text);
131    score += mat.count() as f32;
132    score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
133    score
134}
135
136pub fn get_class_weight(handle: Handle) -> f32 {
137    let mut weight: f32 = 0.0;
138    if let Element {
139        name: _, ref attrs, ..
140    } = handle.data
141    {
142        for name in ["id", "class"].iter() {
143            if let Some(val) = dom::attr(name, &attrs.borrow()) {
144                if POSITIVE.is_match(&val) {
145                    weight += 25.0
146                };
147                if NEGATIVE.is_match(&val) {
148                    weight -= 25.0
149                }
150            }
151        }
152    };
153    weight
154}
155
/// Recursively prunes `handle`'s subtree before scoring and collects the page title.
///
/// Returns `true` when `handle` itself should be removed by its parent: it is a `script`,
/// `link` or `style` element, or a non-`body` element whose `id`/`class` matches `UNLIKELY`
/// without matching `LIKELY`. Otherwise the children are processed and `false` is returned.
///
/// While walking the children, a non-blank text node that follows two or more consecutive
/// `br` elements is wrapped in a new `p` element.
///
/// The text of every `title` element encountered is extracted into `title`.
/// NOTE(review): this is not limited to the document's `<head>` title, so e.g. a `title`
/// inside inline SVG is passed to `dom::extract_text` as well — confirm that is intended.
pub fn preprocess(dom: &mut RcDom, handle: Handle, title: &mut String) -> bool {
    if let Element {
        ref name,
        ref attrs,
        ..
    } = handle.clone().data
    {
        let tag_name = name.local.as_ref();
        match tag_name.to_lowercase().as_ref() {
            "script" | "link" | "style" => return true,
            "title" => dom::extract_text(handle.clone(), title, true),
            _ => (),
        }
        for name in ["id", "class"].iter() {
            if let Some(val) = dom::attr(name, &attrs.borrow()) {
                // `body` is never dropped, whatever its id/class says.
                if tag_name != "body" && UNLIKELY.is_match(&val) && !LIKELY.is_match(&val) {
                    return true;
                }
            }
        }
    }
    // Removals and rewrites are collected here and applied after the loop, because the
    // children list is borrowed for the whole iteration.
    let mut useless_nodes = vec![];
    let mut paragraph_nodes = vec![];
    // Number of consecutive `br` elements seen so far among the children.
    let mut br_count = 0;
    for child in handle.children.borrow().iter() {
        if preprocess(dom, child.clone(), title) {
            useless_nodes.push(child.clone());
        }
        let c = child.clone();
        match c.data {
            Element { ref name, .. } => {
                let tag_name = name.local.as_ref();
                if "br" == tag_name.to_lowercase() {
                    br_count += 1
                } else {
                    // Any other element breaks the run of `br`s.
                    br_count = 0
                }
            }
            Text { ref contents } => {
                let s = contents.borrow();
                // Blank text between `br`s neither counts nor resets the run.
                if br_count >= 2 && !s.trim().is_empty() {
                    paragraph_nodes.push(child.clone());
                    br_count = 0
                }
            }
            _ => (),
        }
    }
    for node in useless_nodes.iter() {
        dom.remove_from_parent(node);
    }
    for node in paragraph_nodes.iter() {
        // Replace the bare text node with `<p>text</p>` at the same position.
        let name = QualName::new(None, ns!(), LocalName::from("p"));
        let p = dom.create_element(name, vec![], ElementFlags::default());
        dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
        dom.remove_from_parent(node);
        if let Text { ref contents } = node.clone().data {
            let text = contents.clone().into_inner().clone();
            dom.append(&p, NodeOrText::AppendText(text))
        }
    }
    false
}
219
220pub fn find_candidates(
221    id: &Path,
222    handle: Handle,
223    candidates: &mut BTreeMap<String, Candidate>,
224    nodes: &mut BTreeMap<String, Rc<Node>>,
225) {
226    if let Some(id) = id.to_str().map(|id| id.to_string()) {
227        nodes.insert(id, handle.clone());
228    }
229
230    if is_candidate(handle.clone()) {
231        let score = calc_content_score(handle.clone());
232        if let Some(c) = id
233            .parent()
234            .and_then(|pid| find_or_create_candidate(pid, candidates, nodes))
235        {
236            c.score.set(c.score.get() + score)
237        }
238        if let Some(c) = id
239            .parent()
240            .and_then(|pid| pid.parent())
241            .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes))
242        {
243            c.score.set(c.score.get() + score / 2.0)
244        }
245    }
246
247    if is_candidate(handle.clone()) {
248        let score = calc_content_score(handle.clone());
249        if let Some(c) = id
250            .to_str()
251            .map(|id| id.to_string())
252            .and_then(|id| candidates.get(&id))
253        {
254            c.score.set(c.score.get() + score)
255        }
256        if let Some(c) = id
257            .parent()
258            .and_then(|pid| pid.to_str())
259            .map(|id| id.to_string())
260            .and_then(|pid| candidates.get(&pid))
261        {
262            c.score.set(c.score.get() + score)
263        }
264        if let Some(c) = id
265            .parent()
266            .and_then(|p| p.parent())
267            .and_then(|pid| pid.to_str())
268            .map(|id| id.to_string())
269            .and_then(|pid| candidates.get(&pid))
270        {
271            c.score.set(c.score.get() + score)
272        }
273    }
274
275    for (i, child) in handle.children.borrow().iter().enumerate() {
276        find_candidates(
277            id.join(i.to_string()).as_path(),
278            child.clone(),
279            candidates,
280            nodes,
281        )
282    }
283}
284
285fn find_or_create_candidate<'a>(
286    id: &Path,
287    candidates: &'a mut BTreeMap<String, Candidate>,
288    nodes: &BTreeMap<String, Rc<Node>>,
289) -> Option<&'a Candidate> {
290    if let Some(id) = id.to_str().map(|id| id.to_string()) {
291        if let Some(node) = nodes.get(&id) {
292            if candidates.get(&id).is_none() {
293                candidates.insert(
294                    id.clone(),
295                    Candidate {
296                        node: node.clone(),
297                        score: Cell::new(init_content_score(node.clone())),
298                    },
299                );
300            }
301            return candidates.get(&id);
302        }
303    }
304    None
305}
306
307pub fn clean(
308    dom: &mut RcDom,
309    id: &Path,
310    handle: Handle,
311    url: &Url,
312    candidates: &BTreeMap<String, Candidate>,
313) -> bool {
314    let mut useless = false;
315    match handle.data {
316        Document => (),
317        Doctype { .. } => (),
318        Text { ref contents } => {
319            let s = contents.borrow();
320            if s.trim().is_empty() {
321                useless = true
322            }
323        }
324        Comment { .. } => useless = true,
325        Element {
326            ref name,
327            ref attrs,
328            ..
329        } => {
330            let tag_name = name.local.as_ref();
331            match tag_name.to_lowercase().as_ref() {
332                "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object" | "header"
333                | "footer" | "aside" => useless = true,
334                "form" | "table" | "ul" | "div" => {
335                    useless = is_useless(id, handle.clone(), candidates)
336                }
337                "img" => useless = !fix_img_path(handle.clone(), url),
338                "a" => useless = !fix_anchor_path(handle.clone(), url),
339                _ => (),
340            }
341            dom::clean_attr("id", &mut attrs.borrow_mut());
342            dom::clean_attr("class", &mut attrs.borrow_mut());
343            dom::clean_attr("style", &mut attrs.borrow_mut());
344        }
345        ProcessingInstruction { .. } => unreachable!(),
346    }
347    let mut useless_nodes = vec![];
348    for (i, child) in handle.children.borrow().iter().enumerate() {
349        let pid = id.join(i.to_string());
350        if clean(dom, pid.as_path(), child.clone(), url, candidates) {
351            useless_nodes.push(child.clone());
352        }
353    }
354    for node in useless_nodes.iter() {
355        dom.remove_from_parent(node);
356    }
357    if dom::is_empty(handle) {
358        useless = true
359    }
360    useless
361}
362
363pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap<String, Candidate>) -> bool {
364    let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
365    let weight = get_class_weight(handle.clone());
366    let score = id
367        .to_str()
368        .and_then(|id| candidates.get(id))
369        .map(|c| c.score.get())
370        .unwrap_or(0.0);
371    if weight + score < 0.0 {
372        return true;
373    }
374    let text_nodes_len = dom::text_children_count(handle.clone());
375    let mut p_nodes: Vec<Rc<Node>> = vec![];
376    let mut img_nodes: Vec<Rc<Node>> = vec![];
377    let mut li_nodes: Vec<Rc<Node>> = vec![];
378    let mut input_nodes: Vec<Rc<Node>> = vec![];
379    let mut embed_nodes: Vec<Rc<Node>> = vec![];
380    dom::find_node(handle.clone(), "p", &mut p_nodes);
381    dom::find_node(handle.clone(), "img", &mut img_nodes);
382    dom::find_node(handle.clone(), "li", &mut li_nodes);
383    dom::find_node(handle.clone(), "input", &mut input_nodes);
384    dom::find_node(handle.clone(), "embed", &mut embed_nodes);
385    let p_count = p_nodes.len();
386    let img_count = img_nodes.len();
387    let li_count = li_nodes.len() as i32 - 100;
388    let input_count = input_nodes.len();
389    let embed_count = embed_nodes.len();
390    let link_density = get_link_density(handle.clone());
391    let content_length = dom::text_len(handle.clone());
392    let para_count = text_nodes_len + p_count;
393
394    if img_count > para_count + text_nodes_len {
395        return true;
396    }
397    if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
398        return true;
399    }
400    if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
401        return true;
402    }
403    if content_length < 25 && (img_count == 0 || img_count > 2) {
404        return true;
405    }
406    if weight < 25.0 && link_density > 0.2 {
407        return true;
408    }
409    if (embed_count == 1 && content_length < 35) || embed_count > 1 {
410        return true;
411    }
412    false
413}