1use std::rc::Rc;
2use std::path::Path;
3use std::cell::Cell;
4use std::collections::BTreeMap;
5use url::Url;
6use regex::Regex;
7use lazy_static::lazy_static;
8use html5ever::tree_builder::TreeSink;
9use markup5ever_rcdom::Node;
10use markup5ever_rcdom::NodeData::{Element, Text};
11use markup5ever_rcdom::Handle;
12use markup5ever_rcdom::NodeData::{
13 Document,
14 Doctype,
15 Comment,
16 ProcessingInstruction
17};
18use markup5ever_rcdom::RcDom;
19use html5ever::{QualName, LocalName};
20use html5ever::tree_builder::{NodeOrText, ElementFlags};
21use html5ever::{ns, namespace_url};
22use crate::dom;
23
/// Pattern for the punctuation marks counted when scoring a block of text.
///
/// The character class holds only CJK / full-width marks. ASCII `.` and `,` are
/// deliberately left out of the class: they are matched by the later alternatives,
/// which require a following non-alphanumeric (for `.`) or non-digit (for `,`)
/// character, so that decimal points ("3.14") and digit grouping ("1,000") are not
/// counted as sentence punctuation.
pub static PUNCTUATIONS_REGEX: &'static str = r"([、。，．！？]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
/// Alternation of `id`/`class` fragments that mark an element as unlikely to be
/// article content (compiled into the `UNLIKELY` regex).
pub static UNLIKELY_CANDIDATES: &str = concat!(
    "combx|comment|community|disqus|extra|foot|header|menu",
    "|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate",
    "|pagination|pager|popup|tweet|twitter",
    "|ssba",
);

/// Alternation of `id`/`class` fragments that rescue an element which also matched
/// `UNLIKELY_CANDIDATES` (compiled into the `LIKELY` regex).
pub static LIKELY_CANDIDATES: &str = concat!(
    "and|article|body|column|main|shadow",
    "|content|hentry",
);

/// Alternation of `id`/`class` fragments that raise an element's class weight
/// (compiled into the `POSITIVE` regex).
pub static POSITIVE_CANDIDATES: &str = concat!(
    "article|body|content|entry|hentry|main|page",
    "|pagination|post|text|blog|story",
);

/// Alternation of `id`/`class` fragments that lower an element's class weight
/// (compiled into the `NEGATIVE` regex).
pub static NEGATIVE_CANDIDATES: &str = concat!(
    "combx|comment|com|contact|foot|footer|footnote",
    "|masthead|media|meta|outbrain|promo|related",
    "|scroll|shoutbox|sidebar|sponsor|shopping",
    "|tags|tool|widget|form|textfield",
    "|uiScale|hidden",
);

/// Tag names whose presence inside a container stops that container from being
/// treated as a paragraph-like candidate in `is_candidate`.
static BLOCK_CHILD_TAGS: [&str; 10] = [
    "a",
    "blockquote",
    "dl",
    "div",
    "img",
    "ol",
    "p",
    "pre",
    "table",
    "ul",
];
44lazy_static! {
45 static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap();
46 static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap();
47 static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
48 static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
49 static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
50}
51
52pub struct Candidate {
53 pub node: Rc<Node>,
54 pub score: Cell<f32>,
55}
56
57pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
58 let src = dom::get_attr("src", handle.clone());
59 if src.is_none() {
60 return false
61 }
62 let s = src.unwrap();
63 if !s.starts_with("//") && !s.starts_with("http://") && s.starts_with("https://") {
64 match url.join(&s) {
65 Ok(new_url) => dom::set_attr("src", new_url.as_str(), handle),
66 Err(_) => (),
67 }
68 }
69 true
70}
71
72pub fn get_link_density(handle: Handle) -> f32 {
73 let text_length = dom::text_len(handle.clone()) as f32;
74 if text_length == 0.0 {
75 return 0.0;
76 }
77 let mut link_length = 0.0;
78 let mut links: Vec<Rc<Node>> = vec![];
79 dom::find_node(handle.clone(), "a", &mut links);
80 for link in links.iter() {
81 link_length += dom::text_len(link.clone()) as f32;
82 }
83 link_length / text_length
84}
85
86pub fn is_candidate(handle: Handle) -> bool {
87 let text_len = dom::text_len(handle.clone());
88 if text_len < 20 {
89 return false
90 }
91 let n: &str = &dom::get_tag_name(handle. clone()).unwrap_or_default();
92 match n {
93 "p" => true,
94 "div" | "article" | "center" | "section" =>
95 !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.iter().map(|t| *t).collect()),
96 _ => false
97 }
98}
99
100pub fn init_content_score(handle: Handle) -> f32 {
101 let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
102 let score = match tag_name.as_ref() {
103 "article" => 10.0,
104 "div" => 5.0,
105 "blockquote" => 3.0,
106 "form" => -3.0,
107 "th" => 5.0,
108 _ => 0.0,
109 };
110 score + get_class_weight(handle.clone())
111}
112
113pub fn calc_content_score(handle: Handle) -> f32 {
114 let mut score: f32 = 1.0;
115 let mut text = String::new();
116 dom::extract_text(handle.clone(), &mut text, true);
117 let mat = PUNCTUATIONS.find_iter(&text);
118 score += mat.count() as f32;
119 score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
120 return score
121}
122
123pub fn get_class_weight(handle: Handle) -> f32 {
124 let mut weight: f32 = 0.0;
125 match handle.data {
126 Element { name: _, ref attrs, .. } => {
127 for name in ["id", "class"].iter() {
128 if let Some(val) = dom::attr(name, &attrs.borrow()) {
129 if POSITIVE.is_match(&val) {
130 weight += 25.0
131 };
132 if NEGATIVE.is_match(&val) {
133 weight -= 25.0
134 }
135 }
136 }
137 },
138 _ => (),
139 };
140 weight
141}
142
143pub fn preprocess(mut dom: &mut RcDom, handle: Handle, mut title: &mut String) -> bool {
144 match handle.clone().data {
145 Element { ref name, ref attrs, .. } => {
146 let tag_name = name.local.as_ref();
147 match tag_name.to_lowercase().as_ref() {
148 "script" | "link" | "style" => {
149 return true
150 },
151 "title" => dom::extract_text(handle.clone(), &mut title, true),
152 _ => (),
153 }
154 for name in ["id", "class"].iter() {
155 if let Some(val) = dom::attr(name, &attrs.borrow()) {
156 if tag_name != "body" && UNLIKELY.is_match(&val) {
157 if !LIKELY.is_match(&val) {
158 return true
159 }
160 }
161 }
162 }
163 },
164 _ => (),
165 }
166 let mut useless_nodes = vec![];
167 let mut paragraph_nodes = vec![];
168 let mut br_count = 0;
169 for child in handle.children.borrow().iter() {
170 if preprocess(&mut dom, child.clone(), &mut title) {
171 useless_nodes.push(child.clone());
172 }
173 let c = child.clone();
174 match c.data {
175 Element { ref name, .. } => {
176 let tag_name = name.local.as_ref();
177 if "br" == tag_name.to_lowercase() {
178 br_count += 1
179 } else {
180 br_count = 0
181 }
182 },
183 Text { ref contents } => {
184 let s = contents.borrow();
185 if br_count >= 2 && s.trim().len() > 0 {
186 paragraph_nodes.push(child.clone());
187 br_count = 0
188 }
189 },
190 _ => ()
191 }
192 }
193 for node in useless_nodes.iter() {
194 dom.remove_from_parent(node);
195 }
196 for node in paragraph_nodes.iter() {
197 let name = QualName::new(None, ns!(), LocalName::from("p"));
198 let p = dom.create_element(name, vec![], ElementFlags::default());
199 dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
200 dom.remove_from_parent(node);
201 match node.clone().data {
202 Text { ref contents } => {
203 let text = contents.clone().into_inner().clone();
204 dom.append(&p, NodeOrText::AppendText(text))
205 },
206 _ => (),
207 }
208 }
209 false
210}
211
212pub fn find_candidates(mut dom: &mut RcDom,
213 id: &Path,
214 handle: Handle,
215 candidates: &mut BTreeMap<String, Candidate>,
216 nodes: &mut BTreeMap<String, Rc<Node>>) {
217
218 if let Some(id) = id.to_str().map(|id| id.to_string()) {
219 nodes.insert(id, handle.clone());
220 }
221
222 if is_candidate(handle.clone()) {
223 let score = calc_content_score(handle.clone());
224 if let Some(c) = id.parent()
225 .and_then(|pid| find_or_create_candidate(pid, candidates, nodes))
226 {
227 c.score.set(c.score.get() + score)
228 }
229 if let Some(c) = id.parent()
230 .and_then(|pid| pid.parent())
231 .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes))
232 {
233 c.score.set(c.score.get() + score / 2.0)
234 }
235 }
236
237
238 if is_candidate(handle.clone()) {
239 let score = calc_content_score(handle.clone());
240 if let Some(c) = id.to_str()
241 .map(|id| id.to_string())
242 .and_then(|id| candidates.get(&id)) {
243 c.score.set(c.score.get() + score)
244 }
245 if let Some(c) = id.parent()
246 .and_then(|pid| pid.to_str())
247 .map(|id| id.to_string())
248 .and_then(|pid| candidates.get(&pid)) {
249 c.score.set(c.score.get() + score)
250 }
251 if let Some(c) = id.parent()
252 .and_then(|p| p.parent())
253 .and_then(|pid| pid.to_str())
254 .map(|id| id.to_string())
255 .and_then(|pid| candidates.get(&pid)) {
256 c.score.set(c.score.get() + score)
257 }
258 }
259
260 for (i, child) in handle.children.borrow().iter().enumerate() {
261 find_candidates(&mut dom,
262 id.join(i.to_string()).as_path(),
263 child.clone(),
264 candidates,
265 nodes)
266 }
267}
268
269fn find_or_create_candidate<'a>(id: &Path,
270 candidates: &'a mut BTreeMap<String, Candidate>,
271 nodes: &BTreeMap<String, Rc<Node>>) -> Option<&'a Candidate> {
272 if let Some(id) = id.to_str().map(|id| id.to_string()) {
273 if let Some(node) = nodes.get(&id) {
274 if candidates.get(&id).is_none() {
275 candidates.insert(id.clone(), Candidate {
276 node: node.clone(),
277 score: Cell::new(init_content_score(node.clone())),
278 });
279 }
280 return candidates.get(&id)
281 }
282 }
283 None
284}
285
286pub fn clean(mut dom: &mut RcDom, id: &Path, handle: Handle, url: &Url, candidates: &BTreeMap<String, Candidate>) -> bool {
287 let mut useless = false;
288 match handle.data {
289 Document => (),
290 Doctype { .. } => (),
291 Text { ref contents } => {
292 let s = contents.borrow();
293 if s.trim().len() == 0 {
294 useless = true
295 }
296 },
297 Comment { .. } => useless = true,
298 Element { ref name, ref attrs, .. } => {
299 let tag_name = name.local.as_ref();
300 match tag_name.to_lowercase().as_ref() {
301 "script" | "link" | "style" | "noscript" | "meta"
302 | "h1" | "object" | "header" | "footer" | "aside" => {
303 useless = true
304 },
305 "form" | "table" | "ul" | "div" => {
306 useless = is_useless(id, handle.clone(), candidates)
307 },
308 "img" => useless = !fix_img_path(handle.clone(), url),
309 _ => (),
310 }
311 dom::clean_attr("id" , &mut *attrs.borrow_mut());
312 dom::clean_attr("class", &mut *attrs.borrow_mut());
313 dom::clean_attr("style", &mut *attrs.borrow_mut());
314 },
315 ProcessingInstruction { .. } => unreachable!()
316 }
317 let mut useless_nodes = vec![];
318 for (i, child) in handle.children.borrow().iter().enumerate() {
319 let pid = id.join(i.to_string());
320 if clean(&mut dom, pid.as_path(), child.clone(), url, candidates) {
321 useless_nodes.push(child.clone());
322 }
323 }
324 for node in useless_nodes.iter() {
325 dom.remove_from_parent(node);
326 }
327 if dom::is_empty(handle) {
328 useless = true
329 }
330 useless
331}
332
333pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap<String, Candidate>) -> bool {
334 let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
335 let weight = get_class_weight(handle.clone());
336 let score = id.to_str()
337 .and_then(|id| candidates.get(id))
338 .map(|c| c.score.get()).unwrap_or(0.0);
339 if weight + score < 0.0 {
340 return true
341 }
342 let text_nodes_len = dom::text_children_count(handle.clone());
343 let mut p_nodes: Vec<Rc<Node>> = vec![];
344 let mut img_nodes: Vec<Rc<Node>> = vec![];
345 let mut li_nodes: Vec<Rc<Node>> = vec![];
346 let mut input_nodes: Vec<Rc<Node>> = vec![];
347 let mut embed_nodes: Vec<Rc<Node>> = vec![];
348 dom::find_node(handle.clone(), "p" , &mut p_nodes);
349 dom::find_node(handle.clone(), "img" , &mut img_nodes);
350 dom::find_node(handle.clone(), "li" , &mut li_nodes);
351 dom::find_node(handle.clone(), "input" , &mut input_nodes);
352 dom::find_node(handle.clone(), "embed" , &mut embed_nodes);
353 let p_count = p_nodes.len();
354 let img_count = img_nodes.len();
355 let li_count = li_nodes.len() as i32 - 100;
356 let input_count = input_nodes.len();
357 let embed_count = embed_nodes.len();
358 let link_density = get_link_density(handle.clone());
359 let content_length = dom::text_len(handle.clone());
360 let para_count = text_nodes_len + p_count;
361
362 if img_count > para_count + text_nodes_len {
363 return true
364 }
365 if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
366 return true
367 }
368 if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
369 return true
370 }
371 if content_length < 25 && (img_count == 0 || img_count > 2) {
372 return true
373 }
374 if weight < 25.0 && link_density > 0.2 {
375 return true
376 }
377 if (embed_count == 1 && content_length < 35) || embed_count > 1 {
378 return true
379 }
380 return false
381}