1use crate::dom;
2use html5ever::{
3 namespace_url, ns,
4 tree_builder::{ElementFlags, NodeOrText, TreeSink},
5 LocalName, QualName,
6};
7use lazy_static::lazy_static;
8use markup5ever_rcdom::{
9 Handle, Node,
10 NodeData::{Comment, Doctype, Document, Element, ProcessingInstruction, Text},
11 RcDom,
12};
13use regex::Regex;
14use std::{borrow::Cow, cell::Cell, collections::BTreeMap, path::Path, rc::Rc};
15use url::Url;
16
17const PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
18const UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
20 |remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\
21 |pagination|pager|popup|tweet|twitter\
22 |ssba";
23const LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\
24 |content|hentry";
25const POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
26 |pagination|post|text|blog|story";
27const NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\
29 |masthead|media|meta|outbrain|promo|related\
30 |scroll|shoutbox|sidebar|sponsor|shopping\
31 |tags|tool|widget|form|textfield\
32 |uiScale|hidden";
33const BLOCK_CHILD_TAGS: [&str; 10] = [
34 "a",
35 "blockquote",
36 "dl",
37 "div",
38 "img",
39 "ol",
40 "p",
41 "pre",
42 "table",
43 "ul",
44];
45lazy_static! {
46 static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap();
47 static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap();
48 static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
49 static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
50 static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
51}
52
53#[derive(Clone)]
54pub struct Candidate {
55 pub node: Rc<Node>,
56 pub score: Cell<f32>,
57}
58
59pub struct TopCandidate<'a> {
60 id: Cow<'a, str>,
61 candidate: Cow<'a, Candidate>,
62}
63
64impl<'a> TopCandidate<'a> {
65 pub fn new(id: &'a str, candidate: Candidate) -> Self {
66 Self {
67 id: Cow::Borrowed(id),
68 candidate: Cow::Owned(candidate),
69 }
70 }
71
72 pub fn id(&self) -> &str {
73 self.id.as_ref()
74 }
75
76 pub fn candidate(&self) -> &Candidate {
77 &self.candidate
78 }
79
80 pub fn node(&self) -> &Rc<Node> {
81 &self.candidate.node
82 }
83
84 pub fn score(&self) -> &Cell<f32> {
85 &self.candidate.score
86 }
87}
88
89#[derive(Debug)]
91pub enum CandidateScore {
92 EqualWeight,
94 LevelWeight,
98}
99
100#[derive(Debug)]
101pub struct ScorerOptions<'a> {
102 pub min_candidate_length: usize,
104 pub max_candidate_parents: usize,
106 pub candidate_score: CandidateScore,
108 pub punctuations: &'a Regex,
109 pub unlikely_candidates: &'a Regex,
110 pub likely_candidates: &'a Regex,
111 pub positive_candidates: &'a Regex,
112 pub positive_candidate_weight: f32,
113 pub negative_candidates: &'a Regex,
114 pub negative_candidate_weight: f32,
115 pub block_child_tags: &'a [&'a str],
116}
117
118impl Default for ScorerOptions<'_> {
119 fn default() -> Self {
120 Self {
121 min_candidate_length: 20,
122 max_candidate_parents: 10,
123 candidate_score: CandidateScore::EqualWeight,
124 punctuations: &PUNCTUATIONS,
125 likely_candidates: &LIKELY,
126 unlikely_candidates: &UNLIKELY,
127 positive_candidates: &POSITIVE,
128 positive_candidate_weight: 25.0,
129 negative_candidates: &NEGATIVE,
130 negative_candidate_weight: 25.0,
131 block_child_tags: &BLOCK_CHILD_TAGS,
132 }
133 }
134}
135
136pub struct Scorer<'a> {
137 options: ScorerOptions<'a>,
138}
139
140impl<'a> Scorer<'a> {
141 pub fn new(options: ScorerOptions<'a>) -> Self {
142 Scorer { options }
143 }
144
145 pub fn preprocess(&self, dom: &mut RcDom, handle: Handle, title: &mut String) -> bool {
146 if let Element {
147 ref name,
148 ref attrs,
149 ..
150 } = handle.clone().data
151 {
152 let tag_name = name.local.as_ref();
153 match tag_name.to_lowercase().as_ref() {
154 "script" | "link" | "style" => return true,
155 "title" => dom::extract_text(handle.clone(), title, true),
156 _ => (),
157 }
158 for name in ["id", "class"].iter() {
159 if let Some(val) = dom::attr(name, &attrs.borrow()) {
160 if tag_name != "body"
161 && self.options.unlikely_candidates.is_match(&val)
162 && !self.options.likely_candidates.is_match(&val)
163 {
164 return true;
165 }
166 }
167 }
168 }
169 let mut useless_nodes = vec![];
170 let mut paragraph_nodes = vec![];
171 let mut br_count = 0;
172 for child in handle.children.borrow().iter() {
173 if self.preprocess(dom, child.clone(), title) {
174 useless_nodes.push(child.clone());
175 }
176 let c = child.clone();
177 match c.data {
178 Element { ref name, .. } => {
179 let tag_name = name.local.as_ref();
180 if "br" == tag_name.to_lowercase() {
181 br_count += 1
182 } else {
183 br_count = 0
184 }
185 }
186 Text { ref contents } => {
187 let s = contents.borrow();
188 if br_count >= 2 && !s.trim().is_empty() {
189 paragraph_nodes.push(child.clone());
190 br_count = 0
191 }
192 }
193 _ => (),
194 }
195 }
196 for node in useless_nodes.iter() {
197 dom.remove_from_parent(node);
198 }
199 for node in paragraph_nodes.iter() {
200 let name = QualName::new(None, ns!(), LocalName::from("p"));
201 let p = dom.create_element(name, vec![], ElementFlags::default());
202 dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
203 dom.remove_from_parent(node);
204 if let Text { ref contents } = node.clone().data {
205 let text = contents.clone().into_inner().clone();
206 dom.append(&p, NodeOrText::AppendText(text))
207 }
208 }
209 false
210 }
211
212 pub fn find_candidates(
214 &self,
215 node_id: &Path,
216 handle: Handle,
217 candidates: &mut BTreeMap<String, Candidate>,
218 nodes: &mut BTreeMap<String, Rc<Node>>,
219 ) {
220 if let Some(id) = node_id
221 .to_str()
222 .map(|candidate_id| candidate_id.to_string())
223 {
224 nodes.insert(id, handle.clone());
225 }
226
227 if self.is_candidate(handle.clone()) {
228 let content_score = self.calculate_content_score(handle.clone());
229
230 let mut current_node_id = Some(node_id.to_path_buf());
231 let mut level = 1;
232
233 while let Some(current_id) = current_node_id {
235 if level > self.options.max_candidate_parents {
237 break;
238 }
239
240 if let Some(candidate) =
241 current_id.to_str().map(|id| id.to_string()).and_then(|id| {
242 if current_id != node_id {
244 self.find_or_create_candidate(Path::new(&id), candidates, nodes)
245 } else {
246 None
247 }
248 })
249 {
250 let adjusted_content_score = match self.options.candidate_score {
251 CandidateScore::EqualWeight => content_score,
252 CandidateScore::LevelWeight => content_score / (level as f32),
253 };
254 candidate
255 .score
256 .set(candidate.score.get() + adjusted_content_score);
257
258 if dom::get_tag_name(candidate.node.clone()).as_deref() == Some("body") {
260 break;
261 }
262 }
263
264 current_node_id = current_id.parent().map(|pid| pid.to_path_buf());
265 level += 1;
266 }
267 }
268
269 for (i, child) in handle.children.borrow().iter().enumerate() {
270 self.find_candidates(
271 node_id.join(i.to_string()).as_path(),
272 child.clone(),
273 candidates,
274 nodes,
275 )
276 }
277 }
278
279 pub fn find_top_candidate(
281 &self,
282 candidates: &'a BTreeMap<String, Candidate>,
283 ) -> Option<TopCandidate<'a>> {
284 let mut top_candidate: Option<TopCandidate> = None;
285
286 for (id, candidate) in candidates.iter() {
287 let score = candidate.score.get() * (1.0 - get_link_density(candidate.node.clone()));
288 candidate.score.set(score);
289
290 if top_candidate
291 .as_ref()
292 .map_or(true, |top| score > top.candidate.score.get())
293 {
294 top_candidate = Some(TopCandidate {
295 id: Cow::Borrowed(id),
296 candidate: Cow::Borrowed(candidate),
297 });
298 }
299 }
300
301 top_candidate
302 }
303
304 pub fn clean(
305 &self,
306 dom: &mut RcDom,
307 id: &Path,
308 handle: Handle,
309 url: &Url,
310 candidates: &BTreeMap<String, Candidate>,
311 ) -> bool {
312 let mut useless = false;
313 match handle.data {
314 Document => (),
315 Doctype { .. } => (),
316 Text { ref contents } => {
317 let s = contents.borrow();
318 if s.trim().is_empty() {
319 useless = true
320 }
321 }
322 Comment { .. } => useless = true,
323 Element {
324 ref name,
325 ref attrs,
326 ..
327 } => {
328 let tag_name = name.local.as_ref();
329 match tag_name.to_lowercase().as_ref() {
330 "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object"
331 | "header" | "footer" | "aside" => useless = true,
332 "form" | "table" | "ul" | "div" => {
333 useless = self.is_useless(id, handle.clone(), candidates)
334 }
335 "img" => useless = !fix_img_path(handle.clone(), url),
336 "a" => useless = !fix_anchor_path(handle.clone(), url),
337 _ => (),
338 }
339 dom::clean_attr("id", &mut attrs.borrow_mut());
340 dom::clean_attr("class", &mut attrs.borrow_mut());
341 dom::clean_attr("style", &mut attrs.borrow_mut());
342 }
343 ProcessingInstruction { .. } => unreachable!(),
344 }
345 let mut useless_nodes = vec![];
346 for (i, child) in handle.children.borrow().iter().enumerate() {
347 let pid = id.join(i.to_string());
348 if self.clean(dom, pid.as_path(), child.clone(), url, candidates) {
349 useless_nodes.push(child.clone());
350 }
351 }
352 for node in useless_nodes.iter() {
353 dom.remove_from_parent(node);
354 }
355 if dom::is_empty(handle) {
356 useless = true
357 }
358 useless
359 }
360
361 fn calculate_content_score(&self, handle: Handle) -> f32 {
362 let mut score: f32 = 1.0;
363 let mut text = String::new();
364 dom::extract_text(handle.clone(), &mut text, true);
365 let mat = self.options.punctuations.find_iter(&text);
366 score += mat.count() as f32;
367 score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
368 score
369 }
370
371 fn get_class_weight(&self, handle: Handle) -> f32 {
372 let mut weight: f32 = 0.0;
373 if let Element {
374 name: _, ref attrs, ..
375 } = handle.data
376 {
377 for name in ["id", "class"].iter() {
378 if let Some(val) = dom::attr(name, &attrs.borrow()) {
379 if self.options.positive_candidates.is_match(&val) {
380 weight += self.options.positive_candidate_weight
381 };
382 if self.options.negative_candidates.is_match(&val) {
383 weight -= self.options.negative_candidate_weight
384 }
385 }
386 }
387 };
388 weight
389 }
390
391 fn init_content_score(&self, handle: Handle) -> f32 {
392 let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
393 let score = match tag_name.as_ref() {
394 "article" => 10.0,
395 "div" => 5.0,
396 "pre" | "td" | "blockquote" => 3.0,
397 "address" | "ol" | "ul" | "dl" | "dd" | "dt" | "li" | "form" => -3.0,
398 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "th" => -5.0,
399 _ => 0.0,
400 };
401 score + self.get_class_weight(handle.clone())
402 }
403
404 fn find_or_create_candidate(
405 &self,
406 id: &Path,
407 candidates: &'a mut BTreeMap<String, Candidate>,
408 nodes: &BTreeMap<String, Rc<Node>>,
409 ) -> Option<&'a Candidate> {
410 if let Some(id) = id.to_str().map(|id| id.to_string()) {
411 if let Some(node) = nodes.get(&id) {
412 if candidates.get(&id).is_none() {
413 candidates.insert(
414 id.clone(),
415 Candidate {
416 node: node.clone(),
417 score: Cell::new(self.init_content_score(node.clone())),
418 },
419 );
420 }
421 return candidates.get(&id);
422 }
423 }
424 None
425 }
426
427 fn is_useless(
428 &self,
429 id: &Path,
430 handle: Handle,
431 candidates: &BTreeMap<String, Candidate>,
432 ) -> bool {
433 let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
434 let weight = self.get_class_weight(handle.clone());
435 let score = id
436 .to_str()
437 .and_then(|id| candidates.get(id))
438 .map(|c| c.score.get())
439 .unwrap_or(0.0);
440 if weight + score < 0.0 {
441 return true;
442 }
443 let text_nodes_len = dom::text_children_count(handle.clone());
444 let mut p_nodes: Vec<Rc<Node>> = vec![];
445 let mut img_nodes: Vec<Rc<Node>> = vec![];
446 let mut li_nodes: Vec<Rc<Node>> = vec![];
447 let mut input_nodes: Vec<Rc<Node>> = vec![];
448 let mut embed_nodes: Vec<Rc<Node>> = vec![];
449 dom::find_node(handle.clone(), "p", &mut p_nodes);
450 dom::find_node(handle.clone(), "img", &mut img_nodes);
451 dom::find_node(handle.clone(), "li", &mut li_nodes);
452 dom::find_node(handle.clone(), "input", &mut input_nodes);
453 dom::find_node(handle.clone(), "embed", &mut embed_nodes);
454 let p_count = p_nodes.len();
455 let img_count = img_nodes.len();
456 let li_count = li_nodes.len() as i32 - 100;
457 let input_count = input_nodes.len();
458 let embed_count = embed_nodes.len();
459 let link_density = get_link_density(handle.clone());
460 let content_length = dom::text_len(handle.clone());
461 let para_count = text_nodes_len + p_count;
462
463 if img_count > para_count + text_nodes_len {
464 return true;
465 }
466 if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
467 return true;
468 }
469 if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
470 return true;
471 }
472 if content_length < 25 && (img_count == 0 || img_count > 2) {
473 return true;
474 }
475 if weight < 25.0 && link_density > 0.2 {
476 return true;
477 }
478 if (embed_count == 1 && content_length < 35) || embed_count > 1 {
479 return true;
480 }
481 false
482 }
483
484 fn is_candidate(&self, handle: Handle) -> bool {
485 let text_len = dom::text_len(handle.clone());
486 if text_len < self.options.min_candidate_length {
487 return false;
488 }
489 let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default();
490 match n {
491 "p" => true,
492 "div" | "article" | "center" | "section" => {
493 !dom::has_nodes(handle.clone(), self.options.block_child_tags)
494 }
495 _ => false,
496 }
497 }
498}
499
500pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
501 let src = dom::get_attr("src", handle.clone());
502 let s = match src {
503 Some(src) => src,
504 None => return false,
505 };
506 if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
507 if let Ok(new_url) = url.join(&s) {
508 dom::set_attr("src", new_url.as_str(), handle)
509 }
510 }
511 true
512}
513
514pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool {
515 let src = dom::get_attr("href", handle.clone());
516 let s = match src {
517 Some(src) => src,
518 None => return false,
519 };
520 if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
521 if let Ok(new_url) = url.join(&s) {
522 dom::set_attr("href", new_url.as_str(), handle)
523 }
524 }
525 true
526}
527
528pub fn get_link_density(handle: Handle) -> f32 {
529 let text_length = dom::text_len(handle.clone()) as f32;
530 if text_length == 0.0 {
531 return 0.0;
532 }
533 let mut link_length = 0.0;
534 let mut links: Vec<Rc<Node>> = vec![];
535 dom::find_node(handle.clone(), "a", &mut links);
536 for link in links.iter() {
537 link_length += dom::text_len(link.clone()) as f32;
538 }
539 link_length / text_length
540}
541
542#[cfg(test)]
543mod tests {
544 use super::*;
545 use crate::utils::{debug_candidates, CandidateTag};
546 use html5ever::{parse_document, tendril::TendrilSink};
547 use std::{fs::File, io::Read};
548
549 #[test]
550 fn test_find_candidates_basic() {
551 let html = r#"
552 <!DOCTYPE html>
553 <html>
554 <head><title>Test Title</title></head>
555 <body>
556 <h1>Welcome</h1>
557 <p>This is a test paragraph with more than 25 characters.</p>
558 </body>
559 </html>"#;
560 let options = ScorerOptions::default();
561 let scorer = Scorer::new(options);
562 let dom = parse_document(RcDom::default(), Default::default())
563 .from_utf8()
564 .read_from(&mut html.as_bytes())
565 .unwrap();
566
567 assert!(dom.errors.is_empty(), "{:?}", dom.errors);
568
569 let mut candidates = BTreeMap::new();
570 let mut nodes = BTreeMap::new();
571
572 scorer.find_candidates(Path::new("/"), dom.document, &mut candidates, &mut nodes);
573
574 let tags = dbg!(debug_candidates(&candidates));
575
576 assert_eq!(candidates.len(), 1);
577 assert!(tags.contains(&CandidateTag::new("body", None, 1.0)));
578 }
579
580 #[test]
581 fn test_find_candidates_comments() {
582 let mut file = File::open("data/comments/input.html").unwrap();
583 let mut html = String::new();
584 file.read_to_string(&mut html).unwrap();
585
586 let options = ScorerOptions {
587 unlikely_candidates: &Regex::new(
588 "combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|ssba",
589 )
590 .unwrap(),
591 negative_candidates: &Regex::new("combx|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|form|textfield|uiScale|hidden").unwrap(),
592 positive_candidates: &Regex::new("article|body|content|entry|hentry|main|page|pagination|post|blog|story").unwrap(),
593 ..Default::default()
594 };
595 let scorer = Scorer::new(options);
596 let dom = parse_document(RcDom::default(), Default::default())
597 .from_utf8()
598 .read_from(&mut html.as_bytes())
599 .unwrap();
600
601 assert!(dom.errors.is_empty(), "{:?}", dom.errors);
602
603 let mut candidates = BTreeMap::new();
604 let mut nodes = BTreeMap::new();
605
606 scorer.find_candidates(Path::new("/"), dom.document, &mut candidates, &mut nodes);
607
608 let tags = dbg!(debug_candidates(&candidates));
609
610 assert_eq!(candidates.len(), 15);
611
612 assert!(tags.contains(&CandidateTag::new("tbody", None, 6.0)));
613 assert!(tags.contains(&CandidateTag::new("tr", Some("tr_2"), 6.0)));
614 assert!(tags.contains(&CandidateTag::new("table", Some("table_2"), 6.0)));
615 assert!(tags.contains(&CandidateTag::new("td", Some("td_0"), 9.0)));
616 assert!(tags.contains(&CandidateTag::new("td", Some("td_1"), 5.0)));
617 assert!(tags.contains(&CandidateTag::new("td", Some("td_2"), 5.0)));
618 assert!(tags.contains(&CandidateTag::new("td", Some("td_3"), 5.0)));
619 assert!(tags.contains(&CandidateTag::new("div", Some("comment_1"), 7.0)));
620 assert!(tags.contains(&CandidateTag::new("div", Some("comment_2"), 7.0)));
621 assert!(tags.contains(&CandidateTag::new("div", Some("comment_3"), 7.0)));
622 assert!(tags.contains(&CandidateTag::new("div", Some("commtext_1"), 7.0)));
623 assert!(tags.contains(&CandidateTag::new("div", Some("commtext_2"), 7.0)));
624 assert!(tags.contains(&CandidateTag::new("div", Some("commtext_3"), 7.0)));
625 }
626}