1use crate::logging::logger::*;
2use crate::logging::logging_defs::*;
3use crate::parser::{NodeExt, NodeRef, new_html_element, parse_html};
4
5use regex::Regex;
6use std::collections::{HashMap, HashSet};
7use std::sync::LazyLock;
8
/// Tag names that count as phrasing (inline-level) content when deciding
/// whether a node is text-like (see `is_phrasing_content`).
pub static PHRASING_ELEMENTS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn",
        "em", "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter",
        "noscript", "object", "output", "progress", "q", "ruby", "samp", "script", "select",
        "small", "span", "strong", "sub", "sup", "textarea", "time", "var", "wbr",
    ])
});

/// Legacy presentation-only HTML attributes; presumably stripped from the
/// extracted markup by cleanup code elsewhere — usage not visible here.
pub static PRESENTATIONAL_ATTRIBUTES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    vec![
        "align",
        "background",
        "bgcolor",
        "border",
        "cellpadding",
        "cellspacing",
        "frame",
        "hspace",
        "rules",
        "style",
        "valign",
        "vspace",
    ]
});

/// Elements associated with deprecated size attributes; consumers of this set
/// are outside this file — verify against callers.
pub static DEPRECATED_SIZE_ATTRIBUTE_ELEMS: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| HashSet::from(["table", "th", "td", "hr", "pre"]));

/// HTML void elements: they can never contain content, so `is_empty_node`
/// never treats them as empty.
pub static SELF_CLOSING_TAGS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "param", "source",
        "track", "wbr",
    ])
});
52
/// Matches a string whose final character is non-whitespace, i.e. the text
/// carries real content (used on text children in `contains_single_tag_in_element`).
pub static HAS_CONTENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\S$").unwrap());

/// class/id tokens suggesting a node holds main article content (+25 weight).
pub static POSITIVE_CLASSES_AND_IDS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story").unwrap()
});
/// class/id tokens suggesting boilerplate/chrome (-25 weight).
pub static NEGATIVE_CLASSES_AND_IDS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget").unwrap()
});
/// Hosts of well-known video embeds worth preserving
/// (see `is_possibly_useful_video_node`).
pub static VIDEO_ATTRS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live\.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)").unwrap()
});
/// "share"/"sharedaddy" as a whole word or underscore-delimited token.
pub static SHARE_ELEMENTS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)(\b|_)(share|sharedaddy)(\b|_)").unwrap());
/// class/id tokens marking subtrees unlikely to be article content
/// (see `strip_unlikely_and_get_next`).
pub static UNLIKELY_CANDIDATES_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote").unwrap()
});
/// Tokens that rescue a node otherwise flagged by `UNLIKELY_CANDIDATES_REGEX`.
pub static MAYBE_A_CANDIDATE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)and|article|body|column|content|main|mathjax|shadow").unwrap());
/// Token separator for `text_similarity`: any run of non-word characters.
pub static TOKENIZE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^A-Za-z0-9_]+").unwrap());
/// base64 `data:` URL prefix; capture 1 is the MIME type.
pub static B64_DATA_URL: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,").unwrap());
/// One srcset entry: (1) URL, (2) optional width/density descriptor,
/// (3) separator/terminator (see `resolve_srcset`).
pub static SRCSET_URL: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap());
/// Comma code points across scripts (ASCII, Arabic, vertical/CJK forms, …).
pub static COMMA_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C").unwrap()
});
/// The five predefined XML/HTML named character references.
pub static UNESCAPE_NAMED_ENTITIES: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"&(?:quot|amp|apos|lt|gt);").unwrap());
/// Numeric character references: capture 1 = hex digits, capture 2 = decimal.
pub static UNESCAPE_NUMERIC_ENTITIES: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"&#(?:x([0-9a-fA-F]+)|([0-9]+));").unwrap());
/// A raster-image file extension anywhere in an attribute value.
pub static IMAGE_EXTENSION: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)\.(jpg|jpeg|png|webp)").unwrap());
/// Image extension followed by a srcset descriptor digit.
pub static SRCSET_EXTENSION: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\.(jpg|jpeg|png|webp)\s+\d").unwrap());
/// A single URL-like token ending in an image extension.
pub static SRC_EXTENSION: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$").unwrap());
/// A period closing a sentence (followed by a space or end of input).
pub static SENTENCE_END: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\.( |$)").unwrap());
/// CDATA section delimiters, for stripping.
pub static CDATA_STRIP: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<!\[CDATA\[|\]\]>").unwrap());
/// ARIA roles whose subtrees are removed by `strip_unlikely_and_get_next`.
pub static UNLIKELY_ROLES: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "menu",
        "menubar",
        "complementary",
        "navigation",
        "alert",
        "alertdialog",
        "dialog",
    ])
});

/// Attributes that make an element referenceable from elsewhere; nodes
/// carrying them are never considered empty (see `is_empty_node`).
pub const REFERENCING_ATTRIBUTES: &[&str] = &["id", "name"];

/// Default ancestor-walk depth; 0 means "unlimited" in the ancestor helpers.
pub const DEFAULT_MAX_ANCESTORS_DEPTH: i16 = 0;

/// Bounded depth for targeted ancestor lookups (e.g. table/code ancestry).
pub const DEFAULT_MAX_ANCESTORS_LOOKUP_DEPTH: i16 = 3;
123
/// Joins two optional strings with `sep`, dropping whichever side is `None`.
/// Both `None` yields the empty string; a single `Some` is returned as-is
/// (no separator).
pub fn concat_optionals(l: Option<String>, r: Option<String>, sep: &str) -> String {
    match (l, r) {
        (Some(left), Some(right)) => [left, right].join(sep),
        (Some(only), None) | (None, Some(only)) => only,
        (None, None) => String::new(),
    }
}
142
143pub fn remove_tags_with_selector(node: &NodeRef, selector: &str) {
145 for n in select_descendants(node, selector) {
146 n.detach();
147 }
148}
149
150pub fn select_descendants(node: &NodeRef, selector: &str) -> Vec<NodeRef> {
153 match node.select(selector) {
154 Ok(iter) => iter
155 .filter_map(|e| {
156 let n = e.as_node();
157 if n == node { None } else { Some(n.clone()) }
158 })
159 .collect(),
160 Err(_) => vec![],
161 }
162}
163
164pub fn remove_comment_nodes(node: &NodeRef) {
167 let descendants: Vec<_> = node.descendants().collect();
168 for n in descendants {
169 if n.as_comment().is_some() {
170 n.detach();
171 }
172 }
173}
174
/// Removes temporary `data-readability-p-wrap` wrapper divs.
///
/// A marked div is unwrapped (its children hoisted into its place) only when
/// its parent has no non-blank text children and exactly one element child;
/// otherwise just the marker attribute is stripped.
pub fn cleanup_readability_p_wrappers(root: &NodeRef) {
    let mut divs = vec![];
    if let Ok(iter) = root.select("div") {
        for d in iter {
            divs.push(d.as_node().clone());
        }
    }

    // Reverse document order: inner wrappers are processed before their
    // ancestors, so hoisting cannot disturb divs still in the list.
    for div in divs.into_iter().rev() {
        if div.attr_value("data-readability-p-wrap").is_none() {
            continue;
        }

        let parent = match div.parent() {
            Some(p) => p,
            None => {
                // Detached wrapper: nothing to unwrap, just drop the marker.
                if let Some(e) = div.as_element() {
                    e.attributes.borrow_mut().remove("data-readability-p-wrap");
                }
                continue;
            }
        };

        // Does the parent carry any non-whitespace text of its own?
        let mut parent_has_text = false;
        for child in parent.children() {
            if child.as_text().is_some() && !child.text_contents().trim().is_empty() {
                parent_has_text = true;
                break;
            }
        }

        let parent_elements = parent.element_children();
        let should_unwrap = !parent_has_text && parent_elements.len() == 1;

        if should_unwrap {
            // Hoist the wrapper's children into its position, then drop it.
            let children: Vec<_> = div.children().collect();
            for child in children {
                child.detach();
                div.insert_before(child);
            }
            div.detach();
        } else if let Some(e) = div.as_element() {
            e.attributes.borrow_mut().remove("data-readability-p-wrap");
        }
    }
}
225
226pub fn is_element_without_content(node: &NodeRef) -> bool {
229 if node.as_element().is_none() {
230 return false;
231 }
232 if !node.text_contents().trim().is_empty() {
233 return false;
234 }
235 if !select_descendants(node, "img").is_empty() {
236 return false;
237 }
238 let children = node.element_children();
239 if children.is_empty() {
240 return true;
241 }
242 let brs = select_descendants(node, "br").len();
243 let hrs = select_descendants(node, "hr").len();
244 children.len() == brs + hrs
245}
246
/// Flattens pointless nesting in the extracted article: contentless
/// `div`/`section` nodes are removed, and a `div`/`section` whose only
/// element child is itself a `div`/`section` is replaced by that child
/// (inheriting the parent's attributes). Containers whose id starts with
/// "readability" are left untouched.
pub fn simplify_nested_elements(article_content: &NodeRef) {
    let mut node = Some(article_content.clone());
    while let Some(current) = node {
        // Default successor; overridden below when `current` is removed
        // or replaced by its child.
        let mut next = get_next_node(&current, false);
        if current.parent().is_some() {
            if let Some(name) = current.element_name() {
                let name = name.to_lowercase();
                let id_is_readability = current
                    .attr_value("id")
                    .map(|id| id.starts_with("readability"))
                    .unwrap_or(false);
                if (name == "div" || name == "section") && !id_is_readability {
                    if is_element_without_content(&current) {
                        next = remove_and_get_next(&current);
                        node = next;
                        continue;
                    }
                    if contains_single_tag_in_element(&current, "div")
                        || contains_single_tag_in_element(&current, "section")
                    {
                        if let Some(child) = current.element_children().get(0).cloned() {
                            // Copy the wrapper's attributes onto the child
                            // before splicing the child into its place.
                            if let (Some(parent_e), Some(child_e)) =
                                (current.as_element(), child.as_element())
                            {
                                for (attr_name, attr) in
                                    parent_e.attributes.borrow().map.clone()
                                {
                                    child_e.attributes.borrow_mut().insert(
                                        attr_name.local.to_string(),
                                        attr.value.clone(),
                                    );
                                }
                            }
                            current.insert_before(child.clone());
                            current.detach();
                            // Re-examine the hoisted child: it may itself be
                            // an unwrappable container.
                            node = Some(child);
                            continue;
                        }
                    }
                }
            }
        }
        node = next;
    }
}
302
303pub fn get_next_node(node: &NodeRef, ignore_self_and_children: bool) -> Option<NodeRef> {
313 let first_child = node.first_element_child();
315 if !ignore_self_and_children && first_child.is_some() {
316 return first_child;
317 }
318 if let Some(next_sibling) = node.next_element_sibling() {
320 return Some(next_sibling);
321 }
322
323 let mut current = node.parent();
327 while let Some(p) = current {
328 if let Some(sibling) = p.next_element_sibling() {
329 return Some(sibling);
330 }
331 current = p.parent();
332 }
333 None
334}
335
336pub fn remove_and_get_next(node: &NodeRef) -> Option<NodeRef> {
339 let next = get_next_node(node, true);
340 node.detach();
341 next
342}
343
344pub fn match_string_for_node(node: &NodeRef) -> String {
347 concat_optionals(node.attr_value("class"), node.attr_value("id"), " ")
348}
349
/// Walks the subtree under `node` in element pre-order, detaching every
/// element for which `predicate(node, class_and_id_string)` returns true.
pub fn remove_matched_nodes<F>(node: &NodeRef, predicate: F)
where
    F: Fn(&NodeRef, &str) -> bool,
{
    // First element *after* the subtree; reaching it (or None) ends the walk.
    let end_of_search_marker = get_next_node(node, true);
    let mut next = get_next_node(node, false);
    while next.is_some() && next != end_of_search_marker {
        let n = next.clone().unwrap();
        let match_str = match_string_for_node(&n);
        if predicate(&n, match_str.as_str()) {
            // Removing also advances past the removed node's subtree.
            next = remove_and_get_next(&n);
        } else {
            next = get_next_node(&n, false);
        }
    }
}
368
369pub fn next_element(node: Option<NodeRef>) -> Option<NodeRef> {
373 let mut next = node;
374 while let Some(ref n) = next {
375 if n.as_element().is_some() || !n.text_contents().trim().is_empty() {
376 break;
377 }
378 next = n.next_sibling();
379 }
380 next
381}
382
383pub fn rename_tags_with_selector(node: &NodeRef, selector: &str, new_tag_name: &str) {
386 for n in select_descendants(node, selector) {
387 n.clone().clone_and_rename_element(new_tag_name);
388 }
389}
390
391pub fn is_whitespace_node(node: &NodeRef) -> bool {
393 if (node.as_text().is_some() && node.text_contents().trim().is_empty())
394 || node.element_name() == Some("br")
395 {
396 return true;
397 }
398
399 false
400}
401
402pub fn is_empty_node(node: &NodeRef, logger: &PerfLogger) -> bool {
407 if let Some(name) = node.element_name() {
410 if SELF_CLOSING_TAGS.contains(name) {
411 return false;
412 }
413 }
414
415 if is_phrasing_content(node) {
417 return false;
418 }
419
420 for attr_name in REFERENCING_ATTRIBUTES {
423 if node.attr_value(attr_name).is_some() {
424 return false;
425 }
426 }
427
428 let txt = get_normalized_text_content(node, logger);
429
430 txt.trim().is_empty() && select_descendants(node, "img").is_empty()
431}
432
433pub fn is_phrasing_content(node: &NodeRef) -> bool {
438 if node.as_text().is_some() {
439 return true;
440 }
441
442 if let Some(name) = node.element_name() {
443 if PHRASING_ELEMENTS.contains(name) {
444 return true;
445 }
446 }
447 if (node.element_name() == Some("a")
448 || node.element_name() == Some("del")
449 || node.element_name() == Some("ins"))
450 && test_all_siblings(node.first_child(), is_phrasing_content)
451 {
452 return true;
453 }
454 false
455}
456
457pub fn test_all_siblings<F>(node: Option<NodeRef>, test_func: F) -> bool
461where
462 F: Fn(&NodeRef) -> bool,
463{
464 let mut next = node;
465 while next.is_some() {
466 let n = next.clone().unwrap();
467 if !test_func(&n) {
468 return false;
469 }
470 next = n.next_sibling();
471 }
472 true
473}
474
475pub fn move_children(from: &NodeRef, to: &NodeRef) {
478 let mut child = from.first_child();
479 while child.is_some() {
480 let child_unwraped = child.clone().unwrap();
481 child = child_unwraped.next_sibling();
482 to.append(child_unwraped);
483 }
484}
485
486pub fn test_any_node_by_selector<F>(node: &NodeRef, sel: &str, test_func: F) -> bool
489where
490 F: Fn(&NodeRef) -> bool,
491{
492 for n in select_descendants(node, sel) {
493 if test_func(&n) {
494 return true;
495 }
496 }
497 false
498}
499
500pub fn contains_single_tag_in_element(node: &NodeRef, tag_name: &str) -> bool {
503 let mut elements = vec![];
504 let mut non_elements = vec![];
505 for c in node.children() {
506 if c.as_element().is_some() {
507 elements.push(c);
508 } else {
509 non_elements.push(c);
510 }
511 }
512 if elements.len() != 1 || elements.get(0).unwrap().element_name() != Some(tag_name) {
514 return false;
515 }
516
517 for c in non_elements {
519 if HAS_CONTENT.is_match(c.text_contents().as_str()) {
520 return false;
521 }
522 }
523
524 true
525}
526
527pub fn has_ancestor_tag(node: &NodeRef, ancestor_tag_name: &str, max_depth: i16) -> bool {
531 has_ancestor_tag_with_predicate(node, ancestor_tag_name, max_depth, |_| true)
532}
533
534pub fn has_ancestor_tag_with_predicate<F>(
538 node: &NodeRef,
539 ancestor_tag_name: &str,
540 max_depth: i16,
541 predicate: F,
542) -> bool
543where
544 F: Fn(&NodeRef) -> bool,
545{
546 let mut depth = 0;
547 let mut node = node.clone();
548 while let Some(p) = node.parent() {
549 depth += 1;
550 if max_depth > 0 && depth > max_depth {
551 return false;
552 }
553 if p.element_name() == Some(ancestor_tag_name) && predicate(&p) {
554 return true;
555 }
556 node = p;
557 }
558 false
559}
560
561pub fn concate_nodes_with_selectors(container: &NodeRef, selectors: Vec<&str>) -> Vec<NodeRef> {
564 let mut res = vec![];
565 for s in selectors {
566 res.extend(select_descendants(container, s));
567 }
568 res
569}
570
571pub fn get_node_ancestors(node: &NodeRef, max_depth: i16) -> Vec<NodeRef> {
575 let mut ancestors = vec![];
576 let mut depth = 1;
577 let mut current = node.parent();
578 while let Some(p) = current {
579 ancestors.push(p.clone());
580 if max_depth > 0 && depth == max_depth {
581 break;
582 }
583 depth += 1;
584 current = p.parent();
585 }
586 ancestors
587}
588
/// Ratio of link text length to total normalized text length under `node`;
/// 0.0 when the node has no text. Pure in-page anchors (`href` starting
/// with '#') are discounted to 30% of their length.
pub fn get_link_density(node: &NodeRef, logger: &PerfLogger) -> f64 {
    start_span!(logger, GET_LINK_DENSITY);
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "get_node_normalized_text_begin");
    let text_length = get_normalized_text_length(node, logger);
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "get_node_normalized_text_end");
    if text_length == 0 {
        annotate_span_str!(
            logger,
            GET_LINK_DENSITY,
            "early return because node content is empty"
        );
        end_span!(logger, GET_LINK_DENSITY);
        return 0.0;
    }

    let mut link_length = 0.0_f64;
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "sum_link_text_lengths_begin");
    for a in select_descendants(node, "a") {
        // Fragment-only links count at reduced weight.
        let mut coefficient = 1.0;
        if let Some(href) = a.attr_value("href") {
            if href.trim().starts_with('#') {
                coefficient = 0.3;
            }
        }
        link_length += get_normalized_text_length(&a, logger) as f64 * coefficient;
    }
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "sum_link_text_lengths_end");
    let result = link_length / (text_length as f64);
    end_span!(logger, GET_LINK_DENSITY);
    result
}
624
/// Length in `char`s of the node's normalized (whitespace-collapsed,
/// trimmed) text content.
fn get_normalized_text_length(node: &NodeRef, logger: &PerfLogger) -> usize {
    start_span!(logger, NORMALIZE_AND_COUNT_CHARS);
    add_point_to_span_str!(
        logger,
        NORMALIZE_AND_COUNT_CHARS,
        "get_normalized_txt_begin"
    );
    let txt = get_normalized_text_content(node, logger);
    add_point_to_span_str!(logger, NORMALIZE_AND_COUNT_CHARS, "get_normalized_txt_end");

    add_point_to_span_str!(logger, NORMALIZE_AND_COUNT_CHARS, "count_chars_begin");
    // chars().count(), not len(): counts code points, not UTF-8 bytes.
    let count = txt.chars().count();
    add_point_to_span_str!(logger, NORMALIZE_AND_COUNT_CHARS, "count_chars_end");
    end_span!(logger, NORMALIZE_AND_COUNT_CHARS);
    count
}
641
/// The node's text content, trimmed and with whitespace runs collapsed to
/// single spaces (via `normalize_text`).
pub fn get_normalized_text_content(node: &NodeRef, logger: &PerfLogger) -> String {
    start_span!(logger, NORMALIZE_NODE_TEXT);

    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "get_text_contents_begin");
    let txt = node.text_contents();
    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "get_text_contents_end");
    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "remove_duplicate_spaces_begin");
    let txt = normalize_text(txt.trim());
    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "remove_duplicate_spaces_end");
    end_span!(logger, NORMALIZE_NODE_TEXT);
    txt
}
656
657static NORMALIZE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s{2,}").unwrap());
658
659pub fn normalize_text(src: &str) -> String {
662 NORMALIZE_REGEX.replace_all(src, " ").to_string()
663}
664
/// True when `text` is an advertising marker or a localized "loading"
/// placeholder. Checked twice: on the trimmed text, and on a compacted copy
/// with all whitespace and zero-width/BOM characters removed (so spaced-out
/// variants like "a d" still match).
pub fn matches_ad_or_loading(text: &str) -> bool {
    // Localized "loading" words, optionally suffixed with "..." or "…".
    fn is_loading_word(s: &str) -> bool {
        const BASES: [&str; 5] = ["loading", "正在加载", "загрузка", "chargement", "cargando"];
        let lowered = s.to_lowercase();
        BASES.iter().any(|base| {
            lowered == *base
                || lowered == format!("{}...", base)
                || lowered == format!("{}…", base)
        })
    }

    // Exact (case-insensitive) advertising markers in several languages.
    fn is_ad_word(s: &str) -> bool {
        matches!(
            s.to_lowercase().as_str(),
            "ad" | "advertising"
                | "advertisement"
                | "pub"
                | "publicite"
                | "publicité"
                | "werb"
                | "werbung"
                | "广告"
                | "реклама"
                | "anuncio"
        )
    }

    let trimmed = text.trim();
    if is_ad_word(trimmed) || is_loading_word(trimmed) {
        return true;
    }

    let compact: String = trimmed
        .chars()
        .filter(|c| !c.is_whitespace() && *c != '\u{200b}' && *c != '\u{feff}')
        .collect();
    is_ad_word(compact.as_str()) || is_loading_word(compact.as_str())
}
723
/// Collapses runs of ASCII whitespace to a single space while leaving
/// non-breaking and other fixed-width Unicode spaces untouched (a preserved
/// space also resets the collapsing state).
pub fn normalize_text_preserve_nbsp(src: &str) -> String {
    let mut result = String::with_capacity(src.len());
    let mut collapsing = false;
    for c in src.chars() {
        if is_preserved_unicode_space(c) {
            result.push(c);
            collapsing = false;
        } else if is_ascii_whitespace(c) {
            // Emit one space for the whole run, drop the rest.
            if !collapsing {
                result.push(' ');
                collapsing = true;
            }
        } else {
            result.push(c);
            collapsing = false;
        }
    }
    result
}

/// ASCII whitespace per the HTML spec: space, tab, LF, CR, form feed.
fn is_ascii_whitespace(ch: char) -> bool {
    matches!(ch, ' ' | '\t' | '\n' | '\r' | '\x0C')
}

/// Unicode spaces kept verbatim: NBSP, Ogham space, U+2000..U+200A,
/// narrow NBSP, math space, ideographic space.
fn is_preserved_unicode_space(ch: char) -> bool {
    matches!(
        ch,
        '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200A}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}'
    )
}
766
767pub fn text_similarity(text_a: &str, text_b: &str) -> f64 {
775 let tokens_a: HashSet<String> = TOKENIZE_REGEX
776 .split(&text_a.to_lowercase())
777 .filter(|s| !s.is_empty())
778 .map(|s| s.to_string())
779 .collect();
780 let tokens_b: Vec<String> = TOKENIZE_REGEX
781 .split(&text_b.to_lowercase())
782 .filter(|s| !s.is_empty())
783 .map(|s| s.to_string())
784 .collect();
785
786 if tokens_a.is_empty() || tokens_b.is_empty() {
787 return 0.0;
788 }
789
790 let uniq_tokens_b: Vec<&String> = tokens_b.iter().filter(|t| !tokens_a.contains(*t)).collect();
791 let uniq_len: usize = uniq_tokens_b.iter().map(|s| s.len()).sum::<usize>()
792 + uniq_tokens_b.len().saturating_sub(1);
793 let total_len: usize =
794 tokens_b.iter().map(|s| s.len()).sum::<usize>() + tokens_b.len().saturating_sub(1);
795 let distance_b = uniq_len as f64 / total_len as f64;
796 1.0 - distance_b
797}
798
799pub fn unescape_html_entities(value: &str) -> String {
803 if value.is_empty() {
804 return value.to_string();
805 }
806
807 let replaced = UNESCAPE_NAMED_ENTITIES.replace_all(value, |caps: ®ex::Captures| {
808 match caps.get(0).map(|m| m.as_str()).unwrap_or_default() {
809 """ => "\"",
810 "&" => "&",
811 "'" => "'",
812 "<" => "<",
813 ">" => ">",
814 _ => "",
815 }
816 });
817
818 let replaced = UNESCAPE_NUMERIC_ENTITIES.replace_all(&replaced, |caps: ®ex::Captures| {
819 let num = if let Some(hex) = caps.get(1) {
820 u32::from_str_radix(hex.as_str(), 16).unwrap_or(0)
821 } else if let Some(dec) = caps.get(2) {
822 dec.as_str().parse::<u32>().unwrap_or(0)
823 } else {
824 0
825 };
826
827 let num = if num == 0 || num > 0x10FFFF || (0xD800..=0xDFFF).contains(&num) {
828 0xFFFD
829 } else {
830 num
831 };
832 std::char::from_u32(num).unwrap_or('\u{FFFD}').to_string()
833 });
834
835 replaced.into_owned()
836}
837
/// Normalizes every text node under `root` (outside whitespace-sensitive
/// containers): merges adjacent text siblings, collapses ASCII whitespace
/// runs (NBSP-family spaces preserved), and removes whitespace-only text
/// nodes — re-injecting a single boundary space into a phrasing-content
/// neighbor so words don't fuse.
pub fn normalize_text_nodes(root: &NodeRef) {
    // Containers whose text must be kept verbatim.
    let skip_tags = [
        "pre",
        "code",
        "textarea",
        "script",
        "style",
        "svg",
        "math",
    ];
    // Snapshot: the loop mutates the tree while iterating.
    let nodes: Vec<_> = root.descendants().collect();
    for n in nodes {
        if let Some(text) = n.as_text() {
            if let Some(parent) = n.parent() {
                if let Some(tag) = parent.element_name() {
                    let tag = tag.to_lowercase();
                    if skip_tags.contains(&tag.as_str()) {
                        continue;
                    }
                }
            }
            let current = text.borrow().to_string();
            // Adjacent text siblings: merge into one normalized node.
            if let Some(next) = n.next_sibling() {
                if let Some(next_text) = next.as_text() {
                    let merged = format!("{}{}", current, next_text.borrow());
                    let normalized = normalize_text_preserve_nbsp(merged.as_str());
                    let new_node = NodeRef::new_text(normalized);
                    n.insert_after(new_node);
                    n.detach();
                    next.detach();
                    continue;
                }
            }
            let normalized = normalize_text_preserve_nbsp(current.as_str());
            if normalized.trim().is_empty() {
                // Whitespace-only node: drop it, but first push a boundary
                // space into an adjacent phrasing-content neighbor when its
                // facing edge has no whitespace of its own.
                let prev = n.previous_sibling();
                let next = n.next_sibling();
                match (prev.clone(), next.clone()) {
                    (Some(prev_node), Some(next_node)) => {
                        if let Some(prev_text) = prev_node.as_text() {
                            let mut prev_val = prev_text.borrow().to_string();
                            if let Some(last) = prev_val.chars().last() {
                                if is_phrasing_content(&prev_node)
                                    && !is_ascii_whitespace(last)
                                    && !is_preserved_unicode_space(last)
                                {
                                    prev_val.push(' ');
                                    let new_prev =
                                        NodeRef::new_text(normalize_text_preserve_nbsp(
                                            prev_val.as_str(),
                                        ));
                                    prev_node.insert_before(new_prev);
                                    prev_node.detach();
                                }
                            }
                        }
                        if let Some(next_text) = next_node.as_text() {
                            let mut next_val = next_text.borrow().to_string();
                            if let Some(first) = next_val.chars().next() {
                                if is_phrasing_content(&next_node)
                                    && !is_ascii_whitespace(first)
                                    && !is_preserved_unicode_space(first)
                                {
                                    next_val.insert(0, ' ');
                                    let new_next =
                                        NodeRef::new_text(normalize_text_preserve_nbsp(
                                            next_val.as_str(),
                                        ));
                                    next_node.insert_before(new_next);
                                    next_node.detach();
                                }
                            }
                        }
                    }
                    (None, Some(next_node)) => {
                        if is_phrasing_content(&next_node) {
                            if let Some(next_text) = next_node.as_text() {
                                let mut next_val = next_text.borrow().to_string();
                                if let Some(first) = next_val.chars().next() {
                                    if !is_ascii_whitespace(first)
                                        && !is_preserved_unicode_space(first)
                                    {
                                        next_val.insert(0, ' ');
                                        let new_next = NodeRef::new_text(
                                            normalize_text_preserve_nbsp(next_val.as_str()),
                                        );
                                        next_node.insert_before(new_next);
                                        next_node.detach();
                                    }
                                }
                            } else {
                                // Element neighbor: add a standalone space node.
                                next_node.insert_before(NodeRef::new_text(" "));
                            }
                        }
                    }
                    (Some(prev_node), None) => {
                        if is_phrasing_content(&prev_node) {
                            if let Some(prev_text) = prev_node.as_text() {
                                let mut prev_val = prev_text.borrow().to_string();
                                if let Some(last) = prev_val.chars().last() {
                                    if !is_ascii_whitespace(last)
                                        && !is_preserved_unicode_space(last)
                                    {
                                        prev_val.push(' ');
                                        let new_prev = NodeRef::new_text(
                                            normalize_text_preserve_nbsp(prev_val.as_str()),
                                        );
                                        prev_node.insert_before(new_prev);
                                        prev_node.detach();
                                    }
                                }
                            } else {
                                prev_node.insert_after(NodeRef::new_text(" "));
                            }
                        }
                    }
                    _ => {}
                }
                n.detach();
                continue;
            }
            // NOTE(review): no-op rebinding, kept as-is.
            let normalized = normalized;
            if normalized != current {
                let new_node = NodeRef::new_text(normalized);
                n.insert_after(new_node);
                n.detach();
            }
        }
    }
}
972
973fn get_class_and_id_attr_weight(attr_value: &str) -> i64 {
974 let mut weight = 0;
975 if NEGATIVE_CLASSES_AND_IDS.is_match(attr_value) {
976 weight -= 25;
977 }
978
979 if POSITIVE_CLASSES_AND_IDS.is_match(attr_value) {
980 weight += 25;
981 }
982
983 weight
984}
985
986pub fn get_class_and_id_weight(node: &NodeRef) -> i64 {
990 let mut weight = 0;
991 if let Some(class_name) = node.attr_value("class") {
992 weight += get_class_and_id_attr_weight(class_name.as_str())
993 }
994
995 if let Some(tag_id) = node.attr_value("id") {
996 weight += get_class_and_id_attr_weight(tag_id.as_str())
997 }
998
999 weight
1000}
1001
1002pub fn node_contains_any_tag_of(node: &NodeRef, look_up_tag_names: &[&str]) -> bool {
1005 for tag in look_up_tag_names {
1006 if !select_descendants(node, tag).is_empty() {
1007 return true;
1008 }
1009 }
1010 false
1011}
1012
/// Repairs lazy-loaded images under `node`:
/// 1. drops tiny (placeholder-sized) non-SVG base64 `src` values when a real
///    image URL exists in another attribute;
/// 2. for `img`/`picture`/`figure` nodes that lack a usable `src`/`srcset`
///    (or are class-marked "lazy"), promotes image-URL-bearing data
///    attributes into `src`/`srcset` (creating an `<img>` inside bare
///    `<figure>`s).
pub fn fix_lazy_images(node: &NodeRef) {
    apply(node, &["img", "picture", "figure"], |n, tag_name| {
        let src = n.attr_value("src");
        if let Some(src_val) = src.clone() {
            if let Some(caps) = B64_DATA_URL.captures(src_val.as_str()) {
                let mime = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
                // Inline SVG placeholders are legitimate; leave them alone.
                if mime != "image/svg+xml" {
                    // Only drop the base64 src if some other attribute holds
                    // an image URL we could fall back to.
                    let mut src_could_be_removed = false;
                    if let Some(e) = n.as_element() {
                        for (name, attr) in e.attributes.borrow().clone().map {
                            if name.local.to_string() == "src" {
                                continue;
                            }
                            if IMAGE_EXTENSION.is_match(attr.value.as_str()) {
                                src_could_be_removed = true;
                                break;
                            }
                        }
                    }
                    if src_could_be_removed {
                        // < 133 bytes of base64 (~100 decoded bytes) cannot
                        // be a real image — treat as placeholder.
                        let b64starts = caps.get(0).map(|m| m.as_str().len()).unwrap_or(0);
                        let b64length = src_val.len().saturating_sub(b64starts);
                        if b64length < 133 {
                            if let Some(e) = n.as_element() {
                                e.attributes.borrow_mut().remove("src");
                            }
                        }
                    }
                }
            }
        }

        let src = n.attr_value("src");
        let srcset = n.attr_value("srcset");
        let class_name = n.attr_value("class").unwrap_or_default().to_lowercase();

        // Already has a usable source and isn't lazy-marked: nothing to do.
        if (src.is_some() || (srcset.is_some() && srcset.as_deref() != Some("null")))
            && !class_name.contains("lazy")
        {
            return;
        }

        if let Some(e) = n.as_element() {
            // Buffer insertions: the attribute map is borrowed during the scan.
            let mut tmp: HashMap<String, String> = HashMap::new();
            for (name, attr) in e.attributes.borrow().clone().map {
                let local_name = name.local.to_string();
                if local_name == "src" || local_name == "srcset" || local_name == "alt" {
                    continue;
                }

                // Decide whether this value looks like a srcset or a bare src.
                let copy_to_attr: Option<&str> = if SRCSET_EXTENSION.is_match(attr.value.as_str()) {
                    Some("srcset")
                } else if SRC_EXTENSION.is_match(attr.value.as_str()) {
                    Some("src")
                } else {
                    None
                };

                if let Some(copy_to_attr) = copy_to_attr {
                    if tag_name == "img" || tag_name == "picture" {
                        tmp.insert(copy_to_attr.to_string(), attr.value.clone());
                    } else if tag_name == "figure"
                        && n.select_first("img").is_err()
                        && n.select_first("picture").is_err()
                    {
                        // Bare figure: synthesize the missing <img>.
                        let img = new_html_element("img");
                        img.as_element()
                            .unwrap()
                            .attributes
                            .borrow_mut()
                            .insert(copy_to_attr, attr.value.clone());
                        n.append(img);
                    }
                }
            }

            for (name, value) in tmp {
                e.attributes.borrow_mut().insert(name, value);
            }
        }
    });
}
1096
1097fn is_single_image(node: &NodeRef) -> bool {
1098 let mut current = node.clone();
1099 loop {
1100 if current.element_name() == Some("img") {
1101 return true;
1102 }
1103 let children = current.element_children();
1104 if children.len() != 1 || !current.text_contents().trim().is_empty() {
1105 return false;
1106 }
1107 current = children[0].clone();
1108 }
1109}
1110
/// Replaces lazy-loading placeholder images with the real `<img>` found in
/// an adjacent `<noscript>` fallback.
///
/// Pass 1 drops `<img>` elements with no image-bearing attribute at all.
/// Pass 2: for each `<noscript>` that wraps a single image and follows a
/// single-image sibling, the noscript's `<img>` replaces that sibling,
/// inheriting its image attributes (clashes saved as `data-old-*`).
pub fn unwrap_noscript_images(doc: &NodeRef) {
    let imgs = select_descendants(doc, "img");
    for img in imgs {
        if let Some(e) = img.as_element() {
            let mut has_image_attr = false;
            for (name, attr) in e.attributes.borrow().clone().map {
                let local = name.local.to_string();
                match local.as_str() {
                    "src" | "srcset" | "data-src" | "data-srcset" => {
                        has_image_attr = true;
                        break;
                    }
                    _ => {
                        // Any attribute holding an image URL also counts.
                        if IMAGE_EXTENSION.is_match(attr.value.as_str()) {
                            has_image_attr = true;
                            break;
                        }
                    }
                }
            }
            if !has_image_attr {
                img.detach();
            }
        }
    }

    let noscripts = select_descendants(doc, "noscript");
    for noscript in noscripts {
        if !is_single_image(&noscript) {
            continue;
        }
        // noscript content is raw text in the DOM; re-parse it to get nodes.
        let inner = noscript.inner_html();
        let tmp = parse_html(format!("<div>{}</div>", inner).as_str());
        let new_img = tmp.select_first("img").ok().map(|n| n.as_node().clone());
        if new_img.is_none() {
            continue;
        }

        if let Some(prev_element) = noscript.previous_element_sibling() {
            if is_single_image(&prev_element) {
                // The sibling may wrap its img; dig down to the actual <img>.
                let mut prev_img = prev_element.clone();
                if prev_img.element_name() != Some("img") {
                    if let Ok(img) = prev_element.select_first("img") {
                        prev_img = img.as_node().clone();
                    }
                }

                if let (Some(prev_e), Some(new_e)) =
                    (prev_img.as_element(), new_img.clone().unwrap().as_element())
                {
                    // Carry the placeholder's image attributes over.
                    for (name, attr) in prev_e.attributes.borrow().clone().map {
                        if attr.value.is_empty() {
                            continue;
                        }
                        let local = name.local.to_string();
                        let should_copy = local == "src"
                            || local == "srcset"
                            || IMAGE_EXTENSION.is_match(attr.value.as_str());
                        if !should_copy {
                            continue;
                        }
                        // Identical value already present: skip.
                        if let Some(existing) = new_e.attributes.borrow().get(local.as_str()) {
                            if existing == attr.value.as_str() {
                                continue;
                            }
                        }
                        // Conflicting value: keep it under data-old-<name>.
                        let mut attr_name = local.clone();
                        if new_e.attributes.borrow().contains(local.as_str()) {
                            attr_name = format!("data-old-{}", local);
                        }
                        new_e
                            .attributes
                            .borrow_mut()
                            .insert(attr_name.as_str(), attr.value.clone());
                    }
                }

                let new_img_node = new_img.unwrap();
                prev_element.insert_after(new_img_node);
                prev_element.detach();
            }
        }
    }
}
1205
1206pub fn is_possibly_useful_video_node(node: &NodeRef, tag: &str) -> bool {
1211 if tag != "embed" && tag != "object" && tag != "iframe" {
1212 return false;
1213 }
1214
1215 if let Some(e) = node.as_element() {
1216 for (_, attr) in e.attributes.borrow().clone().map {
1218 if VIDEO_ATTRS_REGEX.is_match(attr.value.as_str()) {
1219 return true;
1220 }
1221 }
1222
1223 if node.element_name() == Some("object")
1225 && VIDEO_ATTRS_REGEX.is_match(node.inner_html().as_str())
1226 {
1227 return true;
1228 }
1229 }
1230
1231 false
1232}
1233
1234pub fn strip_unlikely_and_get_next(node: &NodeRef, matching_str: &str) -> Option<NodeRef> {
1239 if let Some(role) = node.attr_value("role") {
1240 if UNLIKELY_ROLES.contains(role.as_str()) {
1241 return remove_and_get_next(node);
1242 }
1243 }
1244 if UNLIKELY_CANDIDATES_REGEX.is_match(matching_str)
1245 && !MAYBE_A_CANDIDATE.is_match(matching_str)
1246 && !has_ancestor_tag(node, "table", DEFAULT_MAX_ANCESTORS_LOOKUP_DEPTH)
1247 && !has_ancestor_tag(node, "code", DEFAULT_MAX_ANCESTORS_LOOKUP_DEPTH)
1248 && node.element_name() != Some("body")
1249 && node.element_name() != Some("a")
1250 {
1251 return remove_and_get_next(node);
1253 }
1254 None
1255}
1256
1257pub fn remove_nodes<F>(container: &NodeRef, tag_name: &str, condition: F)
1262where
1263 F: Fn(&NodeRef, &str) -> bool,
1264{
1265 for node in select_descendants(container, tag_name).into_iter().rev() {
1266 if condition(&node, tag_name) {
1267 node.detach();
1268 }
1269 }
1270}
1271
/// Number of whitespace-delimited tokens in `text`
/// (Unicode-whitespace aware; 0 for blank input).
pub fn word_count(text: &str) -> usize {
    let words = text.split_whitespace();
    words.count()
}
1286
1287pub fn apply<F>(root_node: &NodeRef, selectors: &[&str], func: F)
1305where
1306 F: Fn(&NodeRef, &str),
1307{
1308 for s in selectors {
1309 for n in select_descendants(root_node, s).into_iter().rev() {
1310 func(&n, s);
1311 }
1312 }
1313}
1314
1315pub fn resolve_base_uri(doc_uri: &str, base_path: &str) -> String {
1318 if base_path.is_empty() {
1319 return doc_uri.to_string();
1320 }
1321 if let Ok(parsed_url) = url::Url::parse(doc_uri) {
1322 if let Ok(base) = parsed_url.join(base_path) {
1323 return base.to_string();
1324 }
1325 }
1326 base_path.to_string()
1327}
1328
1329pub fn to_absolute_uri(uri: &str, doc_uri: &str, base_path: &str) -> String {
1333 let uri = uri.trim();
1336 if let Ok(parsed) = url::Url::parse(uri) {
1337 return parsed.into();
1338 }
1339 if base_path.is_empty() && uri.starts_with('#') {
1340 return String::from(uri);
1341 }
1342 let base_uri = resolve_base_uri(doc_uri, base_path);
1343 if base_uri == doc_uri && uri.starts_with('#') {
1344 return String::from(uri);
1345 }
1346
1347 if let Ok(parsed_url) = url::Url::parse(base_uri.as_str()) {
1348 if let Ok(parsed_url) = parsed_url.join(uri) {
1349 return parsed_url.into();
1350 }
1351 }
1352
1353 uri.to_string()
1354}
1355
1356fn resolve_srcset(srcset: &str, doc_uri: &str, base_path: &str) -> String {
1358 let mut out = String::new();
1359 for caps in SRCSET_URL.captures_iter(srcset) {
1360 let url = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
1361 let descriptor = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1362 let trailing = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1363 let absolute = to_absolute_uri(url, doc_uri, base_path);
1364 out.push_str(absolute.as_str());
1365 out.push_str(descriptor);
1366 out.push_str(trailing);
1367 }
1368 if out.is_empty() {
1369 srcset.to_string()
1370 } else {
1371 out
1372 }
1373}
1374
/// Rewrites URLs in the subtree under `node` so extracted content works
/// outside its original document:
///   * `javascript:` links are replaced by their text (or a `<span>` keeping
///     `id`/`name` so in-page anchors still resolve),
///   * a fragment link that targets the element's own `id` becomes a
///     `<span>` retaining all attributes except `href`,
///   * every other `href`, plus media `src`/`poster`/`srcset`, is resolved
///     to an absolute URI via `to_absolute_uri` / `resolve_srcset`.
pub fn replace_relative_urls_with_absolute(node: &NodeRef, doc_uri: &str, base_path: &str) {
    for link in select_descendants(node, "a") {
        if let Some(href) = link.attr_value("href") {
            // Replaces `link` with a bare text node (single text child, no
            // attributes worth keeping) or a <span> that adopts its children
            // and — when `preserve_attrs` — every attribute except `href`.
            let replace_link = |link: &NodeRef, preserve_attrs: bool| {
                let mut child_count = 0usize;
                let mut single_text_child = false;
                for child in link.children() {
                    child_count += 1;
                    // Ends up true only when the sole child is a text node;
                    // any subsequent child resets it to false.
                    if child_count == 1 && child.as_text().is_some() {
                        single_text_child = true;
                    } else {
                        single_text_child = false;
                    }
                }

                if !preserve_attrs && child_count == 1 && single_text_child {
                    let text_node = NodeRef::new_text(link.text_contents());
                    link.insert_before(text_node);
                    link.detach();
                    return;
                }

                let container = new_html_element("span");
                if preserve_attrs {
                    if let (Some(src), Some(dst)) = (link.as_element(), container.as_element()) {
                        // Clone the map so we are not holding the source
                        // borrow while mutating the container's attributes.
                        for (attr_name, attr) in src.attributes.borrow().map.clone() {
                            if attr_name.local.to_string() == "href" {
                                continue;
                            }
                            dst.attributes
                                .borrow_mut()
                                .insert(attr_name.local.to_string(), attr.value.clone());
                        }
                    }
                }
                // Move (not copy) all children into the replacement <span>.
                while let Some(child) = link.first_child() {
                    container.append(child);
                }
                link.insert_before(container);
                link.detach();
            };

            if href.starts_with("javascript:") {
                // Keep an anchor-capable <span> only if the link was itself
                // an anchor target.
                let preserve_attrs =
                    link.attr_value("id").is_some() || link.attr_value("name").is_some();
                replace_link(&link, preserve_attrs);
                continue;
            }

            if href.starts_with('#') {
                // Link pointing at its own id: the href is useless, but the
                // element must survive as an anchor target.
                if let Some(id) = link.attr_value("id") {
                    if href == format!("#{}", id) {
                        replace_link(&link, true);
                        continue;
                    }
                }
                let absolute = to_absolute_uri(href.as_str(), doc_uri, base_path);
                if let Some(e) = link.as_element() {
                    e.attributes.borrow_mut().insert("href", absolute);
                }
                continue;
            }
            let absolute = to_absolute_uri(href.as_str(), doc_uri, base_path);
            if let Some(e) = link.as_element() {
                e.attributes.borrow_mut().insert("href", absolute);
            }
        }
    }

    // Media elements: absolutize src, poster and srcset where present.
    for tag in ["img", "picture", "figure", "video", "audio", "source"] {
        for media in select_descendants(node, tag) {
            if let Some(src) = media.attr_value("src") {
                let absolute = to_absolute_uri(src.as_str(), doc_uri, base_path);
                if let Some(e) = media.as_element() {
                    e.attributes.borrow_mut().insert("src", absolute);
                }
            }
            if let Some(poster) = media.attr_value("poster") {
                let absolute = to_absolute_uri(poster.as_str(), doc_uri, base_path);
                if let Some(e) = media.as_element() {
                    e.attributes.borrow_mut().insert("poster", absolute);
                }
            }
            if let Some(srcset) = media.attr_value("srcset") {
                let absolute = resolve_srcset(srcset.as_str(), doc_uri, base_path);
                if let Some(e) = media.as_element() {
                    e.attributes.borrow_mut().insert("srcset", absolute);
                }
            }
        }
    }
}
1478
/// Test helper: number of elements under `doc` matching `tag_name`.
/// Panics on an invalid selector, which is acceptable in tests.
#[cfg(test)]
pub(crate) fn count_elements(doc: &NodeRef, tag_name: &str) -> usize {
    let matches = doc.select(tag_name).unwrap();
    matches.count()
}
1484
#[cfg(test)]
mod tests {
    use crate::parser::parse_html;
    use crate::utils::*;

    #[test]
    fn b64_data_url_matches_valid_data_uri() {
        let input = "data: image/png ; base64 , iVBORw0KGgo=";
        assert!(
            B64_DATA_URL.is_match(input),
            "B64_DATA_URL should match a valid data URI; got no match on: {input:?}"
        );
        let caps = B64_DATA_URL.captures(input).unwrap();
        assert_eq!(caps.get(1).unwrap().as_str(), "image/png");
    }

    #[test]
    fn b64_data_url_does_not_match_non_data_uri() {
        assert!(!B64_DATA_URL.is_match("https://example.com/img.png"));
    }

    #[test]
    fn srcset_url_parses_single_entry() {
        let input = "https://example.com/img.png 2x";
        let caps: Vec<_> = SRCSET_URL.captures_iter(input).collect();
        assert!(!caps.is_empty(), "SRCSET_URL should match a srcset entry");
        assert_eq!(caps[0].get(1).unwrap().as_str(), "https://example.com/img.png");
        assert_eq!(caps[0].get(2).unwrap().as_str().trim(), "2x");
    }

    #[test]
    fn srcset_url_parses_multiple_entries() {
        let input = "small.jpg 480w, large.jpg 800w";
        let caps: Vec<_> = SRCSET_URL.captures_iter(input).collect();
        assert_eq!(caps.len(), 2, "SRCSET_URL should match both srcset entries");
        assert_eq!(caps[0].get(1).unwrap().as_str(), "small.jpg");
        assert_eq!(caps[1].get(1).unwrap().as_str(), "large.jpg");
    }

    #[test]
    fn has_ancestor_tag_with_depth_3_finds_at_exactly_3() {
        let doc = parse_html("<div><section><span><p>leaf</p></span></section></div>");
        let p = doc.select_first("p").unwrap().as_node().clone();
        assert!(
            has_ancestor_tag(&p, "div", 3),
            "should find 'div' ancestor at depth 3"
        );
        assert!(
            !has_ancestor_tag(&p, "div", 2),
            "should NOT find 'div' ancestor when max_depth=2"
        );
    }

    #[test]
    fn test_negative_classes_weight() {
        // Tokens matchable mid-string; `-ad-` and the anchored `hid` forms
        // are exercised separately below.
        let negatives = "hidden|banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget".split('|').collect::<Vec<_>>();
        for n in negatives {
            let attr_value = format!("some random value {}", n);
            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), -25);
            let attr_value = attr_value.to_uppercase();
            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), -25);
        }

        // `hid` must match only as a whole word, at any position.
        assert_eq!(get_class_and_id_attr_weight("hid"), -25);
        assert_eq!(get_class_and_id_attr_weight("aaassaa hid"), -25);
        assert_eq!(get_class_and_id_attr_weight("aaassaa hid aaaaa"), -25);
        assert_eq!(get_class_and_id_attr_weight("hid dfsdss"), -25);

        // ... and case-insensitively.
        assert_eq!(get_class_and_id_attr_weight("hId"), -25);
        assert_eq!(get_class_and_id_attr_weight("aaassaa Hid"), -25);
        assert_eq!(get_class_and_id_attr_weight("aaassaa hiD aaaaa"), -25);
        assert_eq!(get_class_and_id_attr_weight("HiD dfsdss"), -25);
    }

    #[test]
    fn test_positive_classes_weight() {
        let positives =
            "article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"
                .split('|')
                .collect::<Vec<_>>();
        for n in positives {
            let attr_value = format!("some random value {}", n);
            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), 25);
            let attr_value = attr_value.to_uppercase();
            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), 25);
        }
    }

    #[test]
    fn test_replace_relative_urls_with_rel_img_src_with_base() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <img src="images/img.png" />
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "..");
        let e = doc.select_first("img").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "img");
        assert_eq!(
            node.attr_value("src").unwrap(),
            "http://www.example.com/images/img.png"
        );
        assert_eq!(doc.select("img").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_rel_img_src_without_base() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <img src="images/img.png" />
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
        let e = doc.select_first("img").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "img");
        assert_eq!(
            node.attr_value("src").unwrap(),
            "http://www.example.com/world/images/img.png"
        );
        assert_eq!(doc.select("img").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_abs_img_src() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <img src="https://google.com/images/img.png" />
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
        let e = doc.select_first("img").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "img");
        assert_eq!(
            node.attr_value("src").unwrap(),
            "https://google.com/images/img.png"
        );
        assert_eq!(doc.select("img").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_hash_link_to_self() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="#self_id" id="self_id">Self ID</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        assert_eq!(
            doc.select_first("#self_id")
                .unwrap()
                .as_node()
                .element_name()
                .unwrap(),
            "a"
        );
        replace_relative_urls_with_absolute(&doc, "http://www.example.com", "");
        // Self-referencing anchor is demoted to a <span> that keeps its id.
        assert_eq!(
            doc.select_first("#self_id")
                .unwrap()
                .as_node()
                .element_name()
                .unwrap(),
            "span"
        );
        assert_eq!(doc.select("a").unwrap().count(), 0);
    }

    #[test]
    fn test_replace_relative_urls_with_hash_link_to_js() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="javascript:" id="js_link">JS Link</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        assert_eq!(
            doc.select_first("#js_link")
                .unwrap()
                .as_node()
                .element_name()
                .unwrap(),
            "a"
        );
        replace_relative_urls_with_absolute(&doc, "http://www.example.com", "");
        // javascript: link with an id becomes a <span> keeping that id.
        assert_eq!(
            doc.select_first("#js_link")
                .unwrap()
                .as_node()
                .element_name()
                .unwrap(),
            "span"
        );
        assert_eq!(doc.select("a").unwrap().count(), 0);
    }

    #[test]
    fn test_replace_relative_urls_with_hash_link_to_other() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="#sib_id" id="other_id">Self ID</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        assert_eq!(
            doc.select_first("#other_id")
                .unwrap()
                .as_node()
                .element_name()
                .unwrap(),
            "a"
        );
        replace_relative_urls_with_absolute(&doc, "http://www.example.com", "");
        // Fragment pointing at a *different* element is left untouched.
        let e = doc.select_first("#other_id").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "a");
        assert_eq!(node.attr_value("href").unwrap(), "#sib_id");
        assert_eq!(doc.select("a").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_rel_link_without_base_and_with_hash() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="hello_world#hash" id="hello">Self ID</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
        let e = doc.select_first("#hello").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "a");
        assert_eq!(
            node.attr_value("href").unwrap(),
            "http://www.example.com/world/hello_world#hash"
        );
        assert_eq!(doc.select("a").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_rel_link_with_base_and_hash() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="hello_world#hash" id="hello">Self ID</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "../");
        let e = doc.select_first("#hello").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "a");
        assert_eq!(
            node.attr_value("href").unwrap(),
            "http://www.example.com/hello_world#hash"
        );
        assert_eq!(doc.select("a").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_rel_link_with_base() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="hello_world" id="hello">Self ID</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "../");
        let e = doc.select_first("#hello").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "a");
        assert_eq!(
            node.attr_value("href").unwrap(),
            "http://www.example.com/hello_world"
        );
        assert_eq!(doc.select("a").unwrap().count(), 1);
    }

    #[test]
    fn test_replace_relative_urls_with_rel_link_without_base() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
            <title>Example Domain</title>
        </head>
        <body>
        <div>
            <a href="hello_world" id="hello">Self ID</a>
        </div>
        </body>
        </html>"###;

        let doc = parse_html(TEST_INPUT);
        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
        let e = doc.select_first("#hello").unwrap();
        let node = e.as_node();
        assert_eq!(node.element_name().unwrap(), "a");
        assert_eq!(
            node.attr_value("href").unwrap(),
            "http://www.example.com/world/hello_world"
        );
        assert_eq!(doc.select("a").unwrap().count(), 1);
    }

    #[test]
    fn test_word_count() {
        assert_eq!(word_count("Hello World Another word"), 4);
        assert_eq!(word_count("Hello ."), 2);
    }

    #[test]
    fn test_apply_with_valid_selector() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
<title>Example Domain</title>
</head>
<body>
<div id="that_node" background="black" border="1px">
<table height="100" width="100" style="width:100%">
<tr><th>Firstname</th>
<p align="center" style="border:1px solid;">Text Here</p>
<p>Another P</p>
</tr><tr><td>Jill</td></tr>
</table>
</div>
</body>
</html>"###;
        use std::sync::atomic::{AtomicUsize, Ordering};

        let doc = parse_html(TEST_INPUT);
        let counter: AtomicUsize = AtomicUsize::new(0);
        apply(&doc, &["p", "tr"], |_, s| {
            assert!(s == "p" || s == "tr");
            counter.fetch_add(1, Ordering::Relaxed);
        });
        assert_eq!(4, counter.load(Ordering::Relaxed));
    }

    #[test]
    fn test_apply_with_invalid_selector() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
<title>Example Domain</title>
</head>
<body>
<div id="that_node" background="black" border="1px">
<table height="100" width="100" style="width:100%">
<tr><th>Firstname</th>
<p align="center" style="border:1px solid;">Text Here</p>
<p>Another P</p>
</tr><tr><td>Jill</td></tr>
</table>
</div>
</body>
</html>"###;
        use std::sync::atomic::{AtomicUsize, Ordering};

        let doc = parse_html(TEST_INPUT);
        let counter: AtomicUsize = AtomicUsize::new(0);
        // The invalid selector must be ignored; only "p" matches count.
        apply(&doc, &["p", "-123"], |_, s| {
            assert!(s == "p");
            counter.fetch_add(1, Ordering::Relaxed);
        });
        assert_eq!(2, counter.load(Ordering::Relaxed));
    }

    #[test]
    fn test_resolve_url_with_normal_base_and_relative() {
        let result = to_absolute_uri("index.html", "http://example.com", "");
        assert_eq!(result, "http://example.com/index.html");
    }

    #[test]
    fn test_resolve_url_with_normal_base_as_file_url_and_relative() {
        let result = to_absolute_uri("foo/bar/index.html", "http://fakehost/test/page.html", "");
        assert_eq!(result, "http://fakehost/test/foo/bar/index.html");
    }

    #[test]
    fn test_resolve_url_with_base_trailing_slash_and_normal_relative() {
        let result = to_absolute_uri("index.html", "http://example.com/", "");
        assert_eq!(result, "http://example.com/index.html");
    }

    #[test]
    fn test_resolve_url_with_normal_base_and_relative_starting_with_slash() {
        let result = to_absolute_uri("/index.html", "http://example.com", "");
        assert_eq!(result, "http://example.com/index.html");
    }

    #[test]
    fn test_resolve_url_with_base_trailing_slash_and_relative_starting_with_slash() {
        let result = to_absolute_uri("/index.html", "http://example.com/", "");
        assert_eq!(result, "http://example.com/index.html");
    }

    #[test]
    fn test_resolve_url_with_full_url() {
        let result = to_absolute_uri("http://example.com/index.html", "http://example.com/", "");
        assert_eq!(result, "http://example.com/index.html");
    }

    #[test]
    fn test_rename_tag_with_selector_with_by_id_selector() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
<title>Example Domain</title>
</head>
<body>
<div>foo <p id="rename_it"><br>bar<br> <br><br>abc</p></div>
</body>
</html>"###;
        let doc = parse_html(TEST_INPUT);
        assert_eq!(count_elements(&doc, "br"), 4);
        assert_eq!(count_elements(&doc, "p"), 1);
        assert_eq!(count_elements(&doc, "div"), 1);
        let body = doc.select("body").unwrap().next().unwrap();
        let n = body.as_node();
        rename_tags_with_selector(n, "#rename_it", "div");
        assert_eq!(count_elements(&doc, "br"), 4);
        assert_eq!(count_elements(&doc, "p"), 0);
        assert_eq!(count_elements(&doc, "div"), 2);
    }

    #[test]
    fn test_rename_tag_with_selector_with_by_class_selector() {
        const TEST_INPUT: &str = r###"<!doctype html><html><head>
<title>Example Domain</title>
</head>
<body>
<div>foo <p class="rename_it"><br>bar<br> <br><br>abc</p>
<p class="rename_it">
nothing special in here
</p>
</div>
</body>
</html>"###;
        let doc = parse_html(TEST_INPUT);
        assert_eq!(count_elements(&doc, "br"), 4);
        assert_eq!(count_elements(&doc, "p"), 2);
        assert_eq!(count_elements(&doc, "div"), 1);
        let body = doc.select("body").unwrap().next().unwrap();
        let n = body.as_node();
        rename_tags_with_selector(n, ".rename_it", "div");
        assert_eq!(count_elements(&doc, "br"), 4);
        assert_eq!(count_elements(&doc, "p"), 0);
        assert_eq!(count_elements(&doc, "div"), 3);
    }

    #[test]
    fn test_link_to_itself_with_positive_absolute_url() {
        let n = new_html_element("a");
        n.as_element()
            .unwrap()
            .attributes
            .borrow_mut()
            .insert("id", String::from("content"));
        let href = "http://www.something.com/#content";
        let doc_uri = "http://www.something.com/";
        assert!(link_to_itself(&n, href, doc_uri));
    }

    /// Test-local helper: does `href` point back at `node`'s own `id`,
    /// either as a bare fragment (`#id`) or as `doc_uri` + `#id`?
    fn link_to_itself(node: &NodeRef, href: &str, doc_uri: &str) -> bool {
        if !href.starts_with('#') && !href.starts_with(doc_uri) {
            return false;
        }
        if let Some(id) = node.attr_value("id") {
            if href.is_empty()
                || id == href[1..]
                || (href.starts_with(doc_uri)
                    && href.len() > doc_uri.len() + 1
                    && id == href[doc_uri.len() + 1..])
            {
                return true;
            }
        }

        false
    }
}