1use dom;
2use html5ever::tree_builder::TreeSink;
3use html5ever::tree_builder::{ElementFlags, NodeOrText};
4use html5ever::{LocalName, QualName};
5use markup5ever_rcdom::Handle;
6use markup5ever_rcdom::Node;
7use markup5ever_rcdom::NodeData::{Comment, Doctype, Document, ProcessingInstruction};
8use markup5ever_rcdom::NodeData::{Element, Text};
9use markup5ever_rcdom::RcDom;
10use regex::Regex;
11use std::cell::Cell;
12use std::collections::BTreeMap;
13use std::path::Path;
14use std::rc::Rc;
15use url::Url;
16
/// Regex source used to count sentence-like punctuation when scoring text.
///
/// Alternatives, in order:
/// * CJK / full-width punctuation: `、`, `。`, and the full-width comma (U+FF0C),
///   full stop (U+FF0E), exclamation mark (U+FF01) and question mark (U+FF1F).
///   The last four are written as `\x{...}` regex escapes so that they survive
///   width normalisation of this source file;
/// * an ASCII `.` that is not followed by an ASCII letter or digit
///   (so `3.14` and `example.com` are not counted);
/// * an ASCII `,` that is not followed by a digit (so `1,000` is not counted);
/// * ASCII `!` and `?`.
///
/// The leading character class must not contain the ASCII `,.!?` characters:
/// that would match every period and comma unconditionally and make the
/// guarded alternatives after it dead.
pub static PUNCTUATIONS_REGEX: &str =
    r"([、。\x{FF0C}\x{FF0E}\x{FF01}\x{FF1F}]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
/// Alternation of `id` / `class` fragments that mark an element as unlikely to
/// be article content. `preprocess` drops a non-`body` element whose `id` or
/// `class` matches this pattern and does not match [`LIKELY_CANDIDATES`].
pub static UNLIKELY_CANDIDATES: &str = concat!(
    "combx|comment|community|disqus|extra|foot|header|menu",
    "|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate",
    "|pagination|pager|popup|tweet|twitter",
    "|ssba",
);

/// Alternation of `id` / `class` fragments that rescue an element which would
/// otherwise be dropped for matching [`UNLIKELY_CANDIDATES`].
pub static LIKELY_CANDIDATES: &str = concat!(
    "and|article|body|column|main|shadow",
    "|content|hentry",
);

/// Alternation of `id` / `class` fragments that add to an element's class
/// weight (see `get_class_weight`).
pub static POSITIVE_CANDIDATES: &str = concat!(
    "article|body|content|entry|hentry|main|page",
    "|pagination|post|text|blog|story",
);

/// Alternation of `id` / `class` fragments that subtract from an element's
/// class weight (see `get_class_weight`).
pub static NEGATIVE_CANDIDATES: &str = concat!(
    "combx|comment|com|contact|foot|footer|footnote",
    "|masthead|media|meta|outbrain|promo|related",
    "|scroll|shoutbox|sidebar|sponsor|shopping",
    "|tags|tool|widget|form|textfield",
    "|uiScale|hidden",
);
/// Tag names passed to `dom::has_nodes` by `is_candidate`: a `div`, `article`,
/// `center` or `section` for which `has_nodes` reports any of these is not
/// treated as a scoring candidate itself.
static BLOCK_CHILD_TAGS: [&str; 10] = [
    "a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul",
];
// Compiled regexes for the pattern strings declared in this file.
// Each `unwrap` panics if its pattern string fails to compile; the patterns are
// fixed string constants, so such a failure would be a programming error.
lazy_static! {
    // Counts punctuation in extracted text (used by `calc_content_score`).
    static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap();
    // Rescues elements matched by `UNLIKELY` (used by `preprocess`).
    static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap();
    // Marks elements for removal during `preprocess`.
    static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
    // Adds to the class weight in `get_class_weight`.
    static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
    // Subtracts from the class weight in `get_class_weight`.
    static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
}
50
/// A DOM node that has received a content score, together with that score.
pub struct Candidate {
    /// The DOM node this candidate refers to.
    pub node: Rc<Node>,
    /// Running content score. Stored in a `Cell` so it can be updated through
    /// a shared reference to the candidate (as `find_candidates` does).
    pub score: Cell<f32>,
}
55
56pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
57 let src = dom::get_attr("src", handle.clone());
58 let s = match src {
59 Some(src) => src,
60 None => return false,
61 };
62 if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
63 if let Ok(new_url) = url.join(&s) {
64 dom::set_attr("src", new_url.as_str(), handle)
65 }
66 }
67 true
68}
69
70pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool {
71 let src = dom::get_attr("href", handle.clone());
72 let s = match src {
73 Some(src) => src,
74 None => return false,
75 };
76 if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
77 if let Ok(new_url) = url.join(&s) {
78 dom::set_attr("href", new_url.as_str(), handle)
79 }
80 }
81 true
82}
83
84pub fn get_link_density(handle: Handle) -> f32 {
85 let text_length = dom::text_len(handle.clone()) as f32;
86 if text_length == 0.0 {
87 return 0.0;
88 }
89 let mut link_length = 0.0;
90 let mut links: Vec<Rc<Node>> = vec![];
91 dom::find_node(handle.clone(), "a", &mut links);
92 for link in links.iter() {
93 link_length += dom::text_len(link.clone()) as f32;
94 }
95 link_length / text_length
96}
97
98pub fn is_candidate(handle: Handle) -> bool {
99 let text_len = dom::text_len(handle.clone());
100 if text_len < 20 {
101 return false;
102 }
103 let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default();
104 match n {
105 "p" => true,
106 "div" | "article" | "center" | "section" => {
107 !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.to_vec())
108 }
109 _ => false,
110 }
111}
112
113pub fn init_content_score(handle: Handle) -> f32 {
114 let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
115 let score = match tag_name.as_ref() {
116 "article" => 10.0,
117 "div" => 5.0,
118 "blockquote" => 3.0,
119 "form" => -3.0,
120 "th" => 5.0,
121 _ => 0.0,
122 };
123 score + get_class_weight(handle.clone())
124}
125
126pub fn calc_content_score(handle: Handle) -> f32 {
127 let mut score: f32 = 1.0;
128 let mut text = String::new();
129 dom::extract_text(handle.clone(), &mut text, true);
130 let mat = PUNCTUATIONS.find_iter(&text);
131 score += mat.count() as f32;
132 score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
133 score
134}
135
136pub fn get_class_weight(handle: Handle) -> f32 {
137 let mut weight: f32 = 0.0;
138 if let Element {
139 name: _, ref attrs, ..
140 } = handle.data
141 {
142 for name in ["id", "class"].iter() {
143 if let Some(val) = dom::attr(name, &attrs.borrow()) {
144 if POSITIVE.is_match(&val) {
145 weight += 25.0
146 };
147 if NEGATIVE.is_match(&val) {
148 weight -= 25.0
149 }
150 }
151 }
152 };
153 weight
154}
155
156pub fn preprocess(dom: &mut RcDom, handle: Handle, title: &mut String) -> bool {
157 if let Element {
158 ref name,
159 ref attrs,
160 ..
161 } = handle.clone().data
162 {
163 let tag_name = name.local.as_ref();
164 match tag_name.to_lowercase().as_ref() {
165 "script" | "link" | "style" => return true,
166 "title" => dom::extract_text(handle.clone(), title, true),
167 _ => (),
168 }
169 for name in ["id", "class"].iter() {
170 if let Some(val) = dom::attr(name, &attrs.borrow()) {
171 if tag_name != "body" && UNLIKELY.is_match(&val) && !LIKELY.is_match(&val) {
172 return true;
173 }
174 }
175 }
176 }
177 let mut useless_nodes = vec![];
178 let mut paragraph_nodes = vec![];
179 let mut br_count = 0;
180 for child in handle.children.borrow().iter() {
181 if preprocess(dom, child.clone(), title) {
182 useless_nodes.push(child.clone());
183 }
184 let c = child.clone();
185 match c.data {
186 Element { ref name, .. } => {
187 let tag_name = name.local.as_ref();
188 if "br" == tag_name.to_lowercase() {
189 br_count += 1
190 } else {
191 br_count = 0
192 }
193 }
194 Text { ref contents } => {
195 let s = contents.borrow();
196 if br_count >= 2 && !s.trim().is_empty() {
197 paragraph_nodes.push(child.clone());
198 br_count = 0
199 }
200 }
201 _ => (),
202 }
203 }
204 for node in useless_nodes.iter() {
205 dom.remove_from_parent(node);
206 }
207 for node in paragraph_nodes.iter() {
208 let name = QualName::new(None, ns!(), LocalName::from("p"));
209 let p = dom.create_element(name, vec![], ElementFlags::default());
210 dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
211 dom.remove_from_parent(node);
212 if let Text { ref contents } = node.clone().data {
213 let text = contents.clone().into_inner().clone();
214 dom.append(&p, NodeOrText::AppendText(text))
215 }
216 }
217 false
218}
219
220pub fn find_candidates(
221 id: &Path,
222 handle: Handle,
223 candidates: &mut BTreeMap<String, Candidate>,
224 nodes: &mut BTreeMap<String, Rc<Node>>,
225) {
226 if let Some(id) = id.to_str().map(|id| id.to_string()) {
227 nodes.insert(id, handle.clone());
228 }
229
230 if is_candidate(handle.clone()) {
231 let score = calc_content_score(handle.clone());
232 if let Some(c) = id
233 .parent()
234 .and_then(|pid| find_or_create_candidate(pid, candidates, nodes))
235 {
236 c.score.set(c.score.get() + score)
237 }
238 if let Some(c) = id
239 .parent()
240 .and_then(|pid| pid.parent())
241 .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes))
242 {
243 c.score.set(c.score.get() + score / 2.0)
244 }
245 }
246
247 if is_candidate(handle.clone()) {
248 let score = calc_content_score(handle.clone());
249 if let Some(c) = id
250 .to_str()
251 .map(|id| id.to_string())
252 .and_then(|id| candidates.get(&id))
253 {
254 c.score.set(c.score.get() + score)
255 }
256 if let Some(c) = id
257 .parent()
258 .and_then(|pid| pid.to_str())
259 .map(|id| id.to_string())
260 .and_then(|pid| candidates.get(&pid))
261 {
262 c.score.set(c.score.get() + score)
263 }
264 if let Some(c) = id
265 .parent()
266 .and_then(|p| p.parent())
267 .and_then(|pid| pid.to_str())
268 .map(|id| id.to_string())
269 .and_then(|pid| candidates.get(&pid))
270 {
271 c.score.set(c.score.get() + score)
272 }
273 }
274
275 for (i, child) in handle.children.borrow().iter().enumerate() {
276 find_candidates(
277 id.join(i.to_string()).as_path(),
278 child.clone(),
279 candidates,
280 nodes,
281 )
282 }
283}
284
285fn find_or_create_candidate<'a>(
286 id: &Path,
287 candidates: &'a mut BTreeMap<String, Candidate>,
288 nodes: &BTreeMap<String, Rc<Node>>,
289) -> Option<&'a Candidate> {
290 if let Some(id) = id.to_str().map(|id| id.to_string()) {
291 if let Some(node) = nodes.get(&id) {
292 if candidates.get(&id).is_none() {
293 candidates.insert(
294 id.clone(),
295 Candidate {
296 node: node.clone(),
297 score: Cell::new(init_content_score(node.clone())),
298 },
299 );
300 }
301 return candidates.get(&id);
302 }
303 }
304 None
305}
306
307pub fn clean(
308 dom: &mut RcDom,
309 id: &Path,
310 handle: Handle,
311 url: &Url,
312 candidates: &BTreeMap<String, Candidate>,
313) -> bool {
314 let mut useless = false;
315 match handle.data {
316 Document => (),
317 Doctype { .. } => (),
318 Text { ref contents } => {
319 let s = contents.borrow();
320 if s.trim().is_empty() {
321 useless = true
322 }
323 }
324 Comment { .. } => useless = true,
325 Element {
326 ref name,
327 ref attrs,
328 ..
329 } => {
330 let tag_name = name.local.as_ref();
331 match tag_name.to_lowercase().as_ref() {
332 "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object" | "header"
333 | "footer" | "aside" => useless = true,
334 "form" | "table" | "ul" | "div" => {
335 useless = is_useless(id, handle.clone(), candidates)
336 }
337 "img" => useless = !fix_img_path(handle.clone(), url),
338 "a" => useless = !fix_anchor_path(handle.clone(), url),
339 _ => (),
340 }
341 dom::clean_attr("id", &mut attrs.borrow_mut());
342 dom::clean_attr("class", &mut attrs.borrow_mut());
343 dom::clean_attr("style", &mut attrs.borrow_mut());
344 }
345 ProcessingInstruction { .. } => unreachable!(),
346 }
347 let mut useless_nodes = vec![];
348 for (i, child) in handle.children.borrow().iter().enumerate() {
349 let pid = id.join(i.to_string());
350 if clean(dom, pid.as_path(), child.clone(), url, candidates) {
351 useless_nodes.push(child.clone());
352 }
353 }
354 for node in useless_nodes.iter() {
355 dom.remove_from_parent(node);
356 }
357 if dom::is_empty(handle) {
358 useless = true
359 }
360 useless
361}
362
363pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap<String, Candidate>) -> bool {
364 let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
365 let weight = get_class_weight(handle.clone());
366 let score = id
367 .to_str()
368 .and_then(|id| candidates.get(id))
369 .map(|c| c.score.get())
370 .unwrap_or(0.0);
371 if weight + score < 0.0 {
372 return true;
373 }
374 let text_nodes_len = dom::text_children_count(handle.clone());
375 let mut p_nodes: Vec<Rc<Node>> = vec![];
376 let mut img_nodes: Vec<Rc<Node>> = vec![];
377 let mut li_nodes: Vec<Rc<Node>> = vec![];
378 let mut input_nodes: Vec<Rc<Node>> = vec![];
379 let mut embed_nodes: Vec<Rc<Node>> = vec![];
380 dom::find_node(handle.clone(), "p", &mut p_nodes);
381 dom::find_node(handle.clone(), "img", &mut img_nodes);
382 dom::find_node(handle.clone(), "li", &mut li_nodes);
383 dom::find_node(handle.clone(), "input", &mut input_nodes);
384 dom::find_node(handle.clone(), "embed", &mut embed_nodes);
385 let p_count = p_nodes.len();
386 let img_count = img_nodes.len();
387 let li_count = li_nodes.len() as i32 - 100;
388 let input_count = input_nodes.len();
389 let embed_count = embed_nodes.len();
390 let link_density = get_link_density(handle.clone());
391 let content_length = dom::text_len(handle.clone());
392 let para_count = text_nodes_len + p_count;
393
394 if img_count > para_count + text_nodes_len {
395 return true;
396 }
397 if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
398 return true;
399 }
400 if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
401 return true;
402 }
403 if content_length < 25 && (img_count == 0 || img_count > 2) {
404 return true;
405 }
406 if weight < 25.0 && link_density > 0.2 {
407 return true;
408 }
409 if (embed_count == 1 && content_length < 35) || embed_count > 1 {
410 return true;
411 }
412 false
413}