readability_fork/
scorer.rs

1use std::rc::Rc;
2use std::path::Path;
3use std::cell::Cell;
4use std::collections::BTreeMap;
5use url::Url;
6use regex::Regex;
7use lazy_static::lazy_static;
8use html5ever::tree_builder::TreeSink;
9use markup5ever_rcdom::Node;
10use markup5ever_rcdom::NodeData::{Element, Text};
11use markup5ever_rcdom::Handle;
12use markup5ever_rcdom::NodeData::{
13    Document,
14    Doctype,
15    Comment,
16    ProcessingInstruction
17};
18use markup5ever_rcdom::RcDom;
19use html5ever::{QualName, LocalName};
20use html5ever::tree_builder::{NodeOrText, ElementFlags};
21use html5ever::{ns, namespace_url};
22use crate::dom;
23
/// Regex source used to count punctuation occurrences in extracted text
/// (see `calc_content_score`).
pub static PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";

/// Alternation of `id`/`class` fragments that make `preprocess` drop an element,
/// unless the same attribute value also matches `LIKELY_CANDIDATES`.
pub static UNLIKELY_CANDIDATES: &str = concat!(
    "combx|comment|community|disqus|extra|foot|header|menu",
    "|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate",
    "|pagination|pager|popup|tweet|twitter",
    "|ssba",
);

/// Alternation of `id`/`class` fragments that override an `UNLIKELY_CANDIDATES` match.
pub static LIKELY_CANDIDATES: &str = concat!(
    "and|article|body|column|main|shadow",
    "|content|hentry",
);

/// Alternation of `id`/`class` fragments that add to a node's class weight.
pub static POSITIVE_CANDIDATES: &str = concat!(
    "article|body|content|entry|hentry|main|page",
    "|pagination|post|text|blog|story",
);

/// Alternation of `id`/`class` fragments that subtract from a node's class weight.
pub static NEGATIVE_CANDIDATES: &str = concat!(
    "combx|comment|com|contact|foot|footer|footnote",
    "|masthead|media|meta|outbrain|promo|related",
    "|scroll|shoutbox|sidebar|sponsor|shopping",
    "|tags|tool|widget|form|textfield",
    "|uiScale|hidden",
);

/// Tag names handed to `dom::has_nodes` by `is_candidate`; a container for which
/// that lookup succeeds is not treated as a candidate.
static BLOCK_CHILD_TAGS: [&str; 10] = [
    "a",
    "blockquote",
    "dl",
    "div",
    "img",
    "ol",
    "p",
    "pre",
    "table",
    "ul",
];
44lazy_static! {
45    static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap();
46    static ref LIKELY:       Regex = Regex::new(LIKELY_CANDIDATES).unwrap();
47    static ref UNLIKELY:     Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
48    static ref POSITIVE:     Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
49    static ref NEGATIVE:     Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
50}
51
52pub struct Candidate {
53    pub node:  Rc<Node>,
54    pub score: Cell<f32>,
55}
56
57pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
58    let src = dom::get_attr("src", handle.clone());
59    if src.is_none() {
60        return false
61    }
62    let s = src.unwrap();
63    if !s.starts_with("//") && !s.starts_with("http://") && s.starts_with("https://") {
64        match url.join(&s) {
65            Ok(new_url) => dom::set_attr("src", new_url.as_str(), handle),
66            Err(_)      => (),
67        }
68    }
69    true
70}
71
72pub fn get_link_density(handle: Handle) -> f32 {
73    let text_length = dom::text_len(handle.clone()) as f32;
74    if text_length == 0.0 {
75        return 0.0;
76    }
77    let mut link_length = 0.0;
78    let mut links: Vec<Rc<Node>> = vec![];
79    dom::find_node(handle.clone(), "a", &mut links);
80    for link in links.iter() {
81        link_length += dom::text_len(link.clone()) as f32;
82    }
83    link_length / text_length
84}
85
86pub fn is_candidate(handle: Handle) -> bool {
87    let text_len = dom::text_len(handle.clone());
88    if text_len < 20 {
89        return false
90    }
91    let n: &str = &dom::get_tag_name(handle. clone()).unwrap_or_default();
92    match n {
93        "p" => true,
94        "div" | "article" | "center" | "section" =>
95            !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.iter().map(|t| *t).collect()),
96        _ => false
97    }
98}
99
100pub fn init_content_score(handle: Handle) -> f32 {
101    let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
102    let score = match tag_name.as_ref() {
103        "article"    => 10.0,
104        "div"        => 5.0,
105        "blockquote" => 3.0,
106        "form"       => -3.0,
107        "th"         => 5.0,
108        _            => 0.0,
109    };
110    score + get_class_weight(handle.clone())
111}
112
113pub fn calc_content_score(handle: Handle) -> f32 {
114    let mut score: f32 = 1.0;
115    let mut text = String::new();
116    dom::extract_text(handle.clone(), &mut text, true);
117    let mat = PUNCTUATIONS.find_iter(&text);
118    score += mat.count() as f32;
119    score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
120    return score
121}
122
123pub fn get_class_weight(handle: Handle) -> f32 {
124    let mut weight: f32 = 0.0;
125    match handle.data {
126        Element { name: _, ref attrs, .. } => {
127            for name in ["id", "class"].iter() {
128                if let Some(val) = dom::attr(name, &attrs.borrow()) {
129                    if POSITIVE.is_match(&val) {
130                        weight += 25.0
131                    };
132                    if NEGATIVE.is_match(&val) {
133                        weight -= 25.0
134                    }
135                }
136            }
137        },
138        _ => (),
139    };
140    weight
141}
142
143pub fn preprocess(mut dom: &mut RcDom, handle: Handle, mut title: &mut String) -> bool {
144    match handle.clone().data {
145        Element { ref name, ref attrs, .. } => {
146            let tag_name = name.local.as_ref();
147            match tag_name.to_lowercase().as_ref() {
148                "script" | "link" | "style"  => {
149                    return true
150                },
151                "title" => dom::extract_text(handle.clone(), &mut title, true),
152                _     => (),
153            }
154            for name in ["id", "class"].iter() {
155                if let Some(val) = dom::attr(name, &attrs.borrow()) {
156                    if tag_name != "body" && UNLIKELY.is_match(&val) {
157                        if !LIKELY.is_match(&val) {
158                            return true
159                        }
160                    }
161                }
162            }
163        },
164        _ => (),
165    }
166    let mut useless_nodes = vec![];
167    let mut paragraph_nodes = vec![];
168    let mut br_count = 0;
169    for child in handle.children.borrow().iter() {
170        if preprocess(&mut dom, child.clone(), &mut title) {
171            useless_nodes.push(child.clone());
172        }
173        let c = child.clone();
174        match c.data {
175            Element { ref name, .. } => {
176                let tag_name = name.local.as_ref();
177                if "br" == tag_name.to_lowercase() {
178                    br_count += 1
179                } else {
180                    br_count = 0
181                }
182            },
183            Text { ref contents } => {
184                let s = contents.borrow();
185                if br_count >= 2 && s.trim().len() > 0 {
186                    paragraph_nodes.push(child.clone());
187                    br_count = 0
188                }
189            },
190            _ => ()
191        }
192    }
193    for node in useless_nodes.iter() {
194        dom.remove_from_parent(node);
195    }
196    for node in paragraph_nodes.iter() {
197        let name = QualName::new(None, ns!(), LocalName::from("p"));
198        let p = dom.create_element(name, vec![], ElementFlags::default());
199        dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
200        dom.remove_from_parent(node);
201        match node.clone().data {
202            Text { ref contents } => {
203                let text = contents.clone().into_inner().clone();
204                dom.append(&p, NodeOrText::AppendText(text))
205            },
206            _ => (),
207        }
208    }
209    false
210}
211
212pub fn find_candidates(mut dom:    &mut RcDom,
213                       id:         &Path,
214                       handle:     Handle,
215                       candidates: &mut BTreeMap<String, Candidate>,
216                       nodes:      &mut BTreeMap<String, Rc<Node>>) {
217
218    if let Some(id) = id.to_str().map(|id| id.to_string()) {
219        nodes.insert(id, handle.clone());
220    }
221
222    if is_candidate(handle.clone()) {
223        let score = calc_content_score(handle.clone());
224        if let Some(c) = id.parent()
225            .and_then(|pid| find_or_create_candidate(pid, candidates, nodes))
226        {
227            c.score.set(c.score.get() + score)
228        }
229        if let Some(c) = id.parent()
230            .and_then(|pid| pid.parent())
231            .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes))
232        {
233            c.score.set(c.score.get() + score / 2.0)
234        }
235    }
236
237
238    if is_candidate(handle.clone()) {
239        let score = calc_content_score(handle.clone());
240        if let Some(c) = id.to_str()
241            .map(|id| id.to_string())
242            .and_then(|id| candidates.get(&id)) {
243                c.score.set(c.score.get() + score)
244            }
245        if let Some(c) = id.parent()
246            .and_then(|pid| pid.to_str())
247            .map(|id| id.to_string())
248            .and_then(|pid| candidates.get(&pid)) {
249                c.score.set(c.score.get() + score)
250            }
251        if let Some(c) = id.parent()
252            .and_then(|p| p.parent())
253            .and_then(|pid| pid.to_str())
254            .map(|id| id.to_string())
255            .and_then(|pid| candidates.get(&pid)) {
256                c.score.set(c.score.get() + score)
257            }
258    }
259
260    for (i, child) in handle.children.borrow().iter().enumerate() {
261        find_candidates(&mut dom,
262                        id.join(i.to_string()).as_path(),
263                        child.clone(),
264                        candidates,
265                        nodes)
266    }
267}
268
269fn find_or_create_candidate<'a>(id: &Path,
270                                candidates: &'a mut BTreeMap<String, Candidate>,
271                                nodes: &BTreeMap<String, Rc<Node>>) -> Option<&'a Candidate> {
272    if let Some(id) = id.to_str().map(|id| id.to_string()) {
273        if let Some(node) = nodes.get(&id) {
274            if candidates.get(&id).is_none() {
275                candidates.insert(id.clone(), Candidate {
276                    node:  node.clone(),
277                    score: Cell::new(init_content_score(node.clone())),
278                });
279            }
280            return candidates.get(&id)
281        }
282    }
283    None
284}
285
286pub fn clean(mut dom: &mut RcDom, id: &Path, handle: Handle, url: &Url, candidates: &BTreeMap<String, Candidate>) -> bool {
287    let mut useless = false;
288    match handle.data {
289        Document       => (),
290        Doctype { .. } => (),
291        Text { ref contents } => {
292            let s = contents.borrow();
293            if s.trim().len() == 0 {
294                useless = true
295            }
296        },
297        Comment { .. } => useless = true,
298        Element { ref name, ref attrs, .. } => {
299            let tag_name = name.local.as_ref();
300            match tag_name.to_lowercase().as_ref() {
301                "script" | "link" | "style" | "noscript" | "meta"
302                    | "h1" | "object" | "header" | "footer" | "aside" => {
303                    useless = true
304                },
305                "form" | "table" | "ul" | "div" => {
306                    useless = is_useless(id, handle.clone(), candidates)
307                },
308                "img" => useless = !fix_img_path(handle.clone(), url),
309                _     => (),
310            }
311            dom::clean_attr("id"   , &mut *attrs.borrow_mut());
312            dom::clean_attr("class", &mut *attrs.borrow_mut());
313            dom::clean_attr("style", &mut *attrs.borrow_mut());
314        },
315        ProcessingInstruction { .. } => unreachable!()
316    }
317    let mut useless_nodes = vec![];
318    for (i, child) in handle.children.borrow().iter().enumerate() {
319        let pid = id.join(i.to_string());
320        if clean(&mut dom, pid.as_path(), child.clone(), url, candidates) {
321            useless_nodes.push(child.clone());
322        }
323    }
324    for node in useless_nodes.iter() {
325        dom.remove_from_parent(node);
326    }
327    if dom::is_empty(handle) {
328        useless = true
329    }
330    useless
331}
332
333pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap<String, Candidate>) -> bool {
334    let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
335    let weight = get_class_weight(handle.clone());
336    let score = id.to_str()
337        .and_then(|id| candidates.get(id))
338        .map(|c| c.score.get()).unwrap_or(0.0);
339    if weight + score < 0.0 {
340        return true
341    }
342    let text_nodes_len = dom::text_children_count(handle.clone());
343    let mut p_nodes:     Vec<Rc<Node>> = vec![];
344    let mut img_nodes:   Vec<Rc<Node>> = vec![];
345    let mut li_nodes:    Vec<Rc<Node>> = vec![];
346    let mut input_nodes: Vec<Rc<Node>> = vec![];
347    let mut embed_nodes: Vec<Rc<Node>> = vec![];
348    dom::find_node(handle.clone(), "p"     , &mut p_nodes);
349    dom::find_node(handle.clone(), "img"   , &mut img_nodes);
350    dom::find_node(handle.clone(), "li"    , &mut li_nodes);
351    dom::find_node(handle.clone(), "input" , &mut input_nodes);
352    dom::find_node(handle.clone(), "embed" , &mut embed_nodes);
353    let p_count        = p_nodes.len();
354    let img_count      = img_nodes.len();
355    let li_count       = li_nodes.len() as i32 - 100;
356    let input_count    = input_nodes.len();
357    let embed_count    = embed_nodes.len();
358    let link_density   = get_link_density(handle.clone());
359    let content_length = dom::text_len(handle.clone());
360    let para_count = text_nodes_len + p_count;
361
362    if img_count > para_count + text_nodes_len {
363        return true
364    }
365    if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
366        return true
367    }
368    if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
369        return true
370    }
371    if content_length < 25 && (img_count == 0 || img_count > 2) {
372        return true
373    }
374    if weight < 25.0 && link_density > 0.2 {
375        return true
376    }
377    if (embed_count == 1 && content_length < 35) || embed_count > 1 {
378        return true
379    }
380    return false
381}