Skip to main content

readable_rs/
utils.rs

1use crate::logging::logger::*;
2use crate::logging::logging_defs::*;
3use crate::parser::{NodeExt, NodeRef, new_html_element, parse_html};
4
5use regex::Regex;
6use std::collections::{HashMap, HashSet};
7use std::sync::LazyLock;
8
/// HTML phrasing-content tag names per the [HTML spec](https://html.spec.whatwg.org/multipage/dom.html#phrasing-content).
pub static PHRASING_ELEMENTS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist",
        "dfn", "em", "embed", "i", "img", "input", "kbd", "label", "mark", "math",
        "meter", "noscript", "object", "output", "progress", "q", "ruby", "samp",
        "script", "select", "small", "span", "strong", "sub", "sup", "textarea", "time",
        "var", "wbr",
    ]
    .into_iter()
    .collect()
});
18
/// HTML attributes that are purely presentational and are stripped during
/// the cleanup phase.
pub static PRESENTATIONAL_ATTRIBUTES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    [
        "align", "background", "bgcolor", "border", "cellpadding", "cellspacing",
        "frame", "hspace", "rules", "style", "valign", "vspace",
    ]
    .to_vec()
});
37
/// Element tags that historically accepted (now-deprecated) `width`/`height`
/// attributes.  Those attributes are stripped when such elements are renamed.
pub static DEPRECATED_SIZE_ATTRIBUTE_ELEMS: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| ["table", "th", "td", "hr", "pre"].into_iter().collect());
42
/// Void (self-closing) HTML elements — they have no closing tag and therefore
/// no children.  Used to avoid treating empty void elements as "empty nodes"
/// that should be pruned.
pub static SELF_CLOSING_TAGS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "param",
        "source", "track", "wbr",
    ]
    .into_iter()
    .collect()
});
52
/// Matches a string that ends with a non-whitespace character (i.e. has
/// visible content).  Note: `$` here anchors to the end of the haystack
/// (no `(?m)` flag), so only the final character is inspected.
pub static HAS_CONTENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\S$").unwrap());

/// Class / id tokens that suggest a node is *content* (article body, blog
/// post, etc.).
///
/// NOTE(review): these token lists look like they track Mozilla
/// Readability's positive/negative regexes — confirm against upstream
/// before editing.
pub static POSITIVE_CLASSES_AND_IDS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story").unwrap()
});
/// Class / id tokens that suggest a node is *non-content* (ads, sidebars,
/// navigation, footers, etc.).  The ` hid ` / `^hid$` alternations match the
/// standalone token "hid" without firing on words that merely contain it.
pub static NEGATIVE_CLASSES_AND_IDS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget").unwrap()
});
/// Matches embed URLs of known video hosts (YouTube, Vimeo, Dailymotion,
/// Twitch, bilibili, v.qq, archive.org, Wikimedia uploads); used to keep
/// video iframes/embeds during cleanup.
pub static VIDEO_ATTRS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live\.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)").unwrap()
});
/// Matches "share" / "sharedaddy" tokens delimited by a word boundary or an
/// underscore (underscores are word characters, so `\b` alone misses them).
pub static SHARE_ELEMENTS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)(\b|_)(share|sharedaddy)(\b|_)").unwrap());
/// Class / id tokens that mark a node as unlikely article content
/// (boilerplate such as comments, navigation, social widgets).
pub static UNLIKELY_CANDIDATES_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote").unwrap()
});
/// Tokens that rescue a node flagged by [`UNLIKELY_CANDIDATES_REGEX`]
/// (a match here means the node may still be a candidate).
pub static MAYBE_A_CANDIDATE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)and|article|body|column|content|main|mathjax|shadow").unwrap());
/// Splits text on runs of characters outside `[A-Za-z0-9_]`; used for
/// word-level tokenization (e.g. title similarity).
pub static TOKENIZE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^A-Za-z0-9_]+").unwrap());
/// Matches the prefix of a base64 `data:` URL; capture group 1 is the
/// MIME type.
pub static B64_DATA_URL: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,").unwrap());
/// Parses one `srcset` entry: group 1 = URL, optional group 2 = density or
/// width descriptor (`2x`, `640w`), group 3 = trailing comma / end.
pub static SRCSET_URL: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap());
/// Matches the ASCII comma plus Arabic, presentation-form, and
/// fullwidth/ideographic comma variants.
pub static COMMA_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C").unwrap()
});
/// The five predefined XML named entities; see [`unescape_html_entities`].
pub static UNESCAPE_NAMED_ENTITIES: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"&(?:quot|amp|apos|lt|gt);").unwrap());
/// Numeric character references: group 1 = hex digits (`&#x…;`),
/// group 2 = decimal digits (`&#…;`).
pub static UNESCAPE_NUMERIC_ENTITIES: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"&#(?:x([0-9a-fA-F]+)|([0-9]+));").unwrap());
/// Case-insensitive raster-image extension appearing anywhere in a string
/// (deliberately not anchored to the end — URLs may carry query strings).
pub static IMAGE_EXTENSION: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?i)\.(jpg|jpeg|png|webp)").unwrap());
/// Image extension followed by whitespace and a digit — the shape of a
/// `srcset` entry with a descriptor.
pub static SRCSET_EXTENSION: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\.(jpg|jpeg|png|webp)\s+\d").unwrap());
/// A whole string that is a single non-whitespace token containing an
/// image extension (a bare image URL, possibly with a suffix).
pub static SRC_EXTENSION: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$").unwrap());
/// A period followed by a space or end-of-string — a crude sentence
/// terminator test.
pub static SENTENCE_END: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\.( |$)").unwrap());
/// Matches the `<![CDATA[` / `]]>` delimiters so they can be stripped.
pub static CDATA_STRIP: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<!\[CDATA\[|\]\]>").unwrap());
/// ARIA `role` values that indicate non-content containers (menus,
/// dialogs, complementary landmarks, …).
pub static UNLIKELY_ROLES: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "menu",
        "menubar",
        "complementary",
        "navigation",
        "alert",
        "alertdialog",
        "dialog",
    ])
});
111
/// HTML attributes that can serve as anchor targets.  Nodes carrying any of
/// these are kept as placeholders even when otherwise empty.
pub const REFERENCING_ATTRIBUTES: &[&str] = &["id", "name"];

/// Sentinel value meaning "walk all the way to the root" when passed as
/// `max_depth` to ancestor-lookup helpers.  Any value `<= 0` is treated as
/// "no limit" by those helpers (they only bound depth when `max_depth > 0`).
pub const DEFAULT_MAX_ANCESTORS_DEPTH: i16 = 0;

/// Default depth limit for ancestor lookups that are used as quick
/// heuristic checks (e.g. "is this node inside a `<table>`?").
pub const DEFAULT_MAX_ANCESTORS_LOOKUP_DEPTH: i16 = 3;
123
/// Join two optional strings with a separator, omitting the separator when
/// either side is `None`.
///
/// The examples below are illustrative (a `text` fence, not a doctest: the
/// previous ```` ```rust ```` fence compiled as a doctest and failed on the
/// illustrative `readable_rs::shared_utils` path).
///
/// ```text
/// concat_optionals(Some("a".into()), Some("b".into()), " ") == "a b"
/// concat_optionals(None,             Some("b".into()), " ") == "b"
/// concat_optionals(Some("a".into()), None,             " ") == "a"
/// concat_optionals(None,             None,             " ") == ""
/// ```
pub fn concat_optionals(l: Option<String>, r: Option<String>, sep: &str) -> String {
    match (l, r) {
        (Some(l), Some(r)) => format!("{}{}{}", l, sep, r),
        (Some(l), None) => l,
        (None, Some(r)) => r,
        (None, None) => String::new(),
    }
}
142
143/// Detach every descendant of `node` that matches the CSS `selector`.
144pub fn remove_tags_with_selector(node: &NodeRef, selector: &str) {
145    for n in select_descendants(node, selector) {
146        n.detach();
147    }
148}
149
150/// Return all descendants of `node` that match `selector`, excluding `node`
151/// itself.  An invalid selector returns an empty `Vec` rather than panicking.
152pub fn select_descendants(node: &NodeRef, selector: &str) -> Vec<NodeRef> {
153    match node.select(selector) {
154        Ok(iter) => iter
155            .filter_map(|e| {
156                let n = e.as_node();
157                if n == node { None } else { Some(n.clone()) }
158            })
159            .collect(),
160        Err(_) => vec![],
161    }
162}
163
164/// Remove all HTML comment nodes (`<!-- … -->`) from the subtree rooted at
165/// `node`.
166pub fn remove_comment_nodes(node: &NodeRef) {
167    let descendants: Vec<_> = node.descendants().collect();
168    for n in descendants {
169        if n.as_comment().is_some() {
170            n.detach();
171        }
172    }
173}
174
/// Undo the `<div data-readability-p-wrap>` wrappers that were inserted
/// earlier to group inline content.  A wrapper is removed (its children
/// are spliced in place) when it is the only element child of its parent
/// and the parent has no non-whitespace text of its own.
pub fn cleanup_readability_p_wrappers(root: &NodeRef) {
    // Snapshot the divs first: the loop below mutates the tree, which
    // would invalidate a live selection iterator.
    let mut divs = vec![];
    if let Ok(iter) = root.select("div") {
        for d in iter {
            divs.push(d.as_node().clone());
        }
    }

    // Iterate in reverse document order so a nested wrapper is handled
    // before the wrapper that contains it.
    for div in divs.into_iter().rev() {
        // Only our own marker divs are candidates.
        if div.attr_value("data-readability-p-wrap").is_none() {
            continue;
        }

        let parent = match div.parent() {
            Some(p) => p,
            None => {
                // Orphaned wrapper: nothing to splice into — just strip
                // the internal marker attribute so it never leaks out.
                if let Some(e) = div.as_element() {
                    e.attributes.borrow_mut().remove("data-readability-p-wrap");
                }
                continue;
            }
        };

        // Does the parent carry any non-whitespace text of its own?
        let mut parent_has_text = false;
        for child in parent.children() {
            if child.as_text().is_some() && !child.text_contents().trim().is_empty() {
                parent_has_text = true;
                break;
            }
        }

        // Unwrap only when the wrapper is the parent's sole element child
        // and the parent contributes no text — otherwise the wrapper is
        // structurally meaningful and must stay.
        let parent_elements = parent.element_children();
        let should_unwrap = !parent_has_text && parent_elements.len() == 1;

        if should_unwrap {
            // Splice the wrapper's children in front of it (preserving
            // order), then drop the now-empty wrapper.
            let children: Vec<_> = div.children().collect();
            for child in children {
                child.detach();
                div.insert_before(child);
            }
            div.detach();
        } else if let Some(e) = div.as_element() {
            // Keep the div but remove the internal marker attribute.
            e.attributes.borrow_mut().remove("data-readability-p-wrap");
        }
    }
}
225
226/// Returns true if the element has no meaningful content: no text, no images,
227/// and the only child elements (if any) are <br> or <hr>.
228pub fn is_element_without_content(node: &NodeRef) -> bool {
229    if node.as_element().is_none() {
230        return false;
231    }
232    if !node.text_contents().trim().is_empty() {
233        return false;
234    }
235    if !select_descendants(node, "img").is_empty() {
236        return false;
237    }
238    let children = node.element_children();
239    if children.is_empty() {
240        return true;
241    }
242    let brs = select_descendants(node, "br").len();
243    let hrs = select_descendants(node, "hr").len();
244    children.len() == brs + hrs
245}
246
/// Flatten redundant single-child `<div>` / `<section>` wrappers.
///
/// Two cases are handled:
/// * The element is completely empty → it is removed.
/// * The element's only child is itself a `<div>` or `<section>` → the
///   wrapper's attributes are merged onto the child and the wrapper is
///   replaced by the child.
///
/// Nodes whose `id` starts with `"readability"` are left untouched so
/// that the algorithm's own marker elements survive.
pub fn simplify_nested_elements(article_content: &NodeRef) {
    let mut node = Some(article_content.clone());
    while let Some(current) = node {
        // Compute the DFS successor up-front: `current` may be detached below.
        let mut next = get_next_node(&current, false);
        // The root passed in (no parent) is never simplified.
        if current.parent().is_some() {
            if let Some(name) = current.element_name() {
                let name = name.to_lowercase();
                let id_is_readability = current
                    .attr_value("id")
                    .map(|id| id.starts_with("readability"))
                    .unwrap_or(false);
                if (name == "div" || name == "section") && !id_is_readability {
                    // Case 1: completely empty wrapper — drop it and resume
                    // from its DFS successor.
                    if is_element_without_content(&current) {
                        next = remove_and_get_next(&current);
                        node = next;
                        continue;
                    }
                    // Case 2: sole child is another div/section — hoist it.
                    if contains_single_tag_in_element(&current, "div")
                        || contains_single_tag_in_element(&current, "section")
                    {
                        if let Some(child) = current.element_children().get(0).cloned() {
                            if let (Some(parent_e), Some(child_e)) =
                                (current.as_element(), child.as_element())
                            {
                                // Copy the wrapper's attributes onto the child;
                                // on a name clash the wrapper's value wins
                                // (insert overwrites).
                                for (attr_name, attr) in
                                    parent_e.attributes.borrow().map.clone()
                                {
                                    child_e.attributes.borrow_mut().insert(
                                        attr_name.local.to_string(),
                                        attr.value.clone(),
                                    );
                                }
                            }
                            current.insert_before(child.clone());
                            current.detach();
                            // Re-examine the promoted child: it may itself be
                            // a collapsible wrapper.
                            node = Some(child);
                            continue;
                        }
                    }
                }
            }
        }
        node = next;
    }
}
302
303/// Depth-first DOM iterator step.  Returns the next element node in a
304/// depth-first traversal.
305///
306/// * `ignore_self_and_children = false` – descend into `node`'s children
307///   first (normal DFS step).
308/// * `ignore_self_and_children = true` – skip `node` and its subtree
309///   entirely; useful when `node` is about to be detached.
310///
311/// Returns `None` when the end of the tree is reached.
312pub fn get_next_node(node: &NodeRef, ignore_self_and_children: bool) -> Option<NodeRef> {
313    // First check for kids if those aren't being ignored
314    let first_child = node.first_element_child();
315    if !ignore_self_and_children && first_child.is_some() {
316        return first_child;
317    }
318    // Then for siblings...
319    if let Some(next_sibling) = node.next_element_sibling() {
320        return Some(next_sibling);
321    }
322
323    // And finally, move up the parent chain *and* find a sibling
324    // (because this is depth-first traversal, we will have already
325    // seen the parent nodes themselves).
326    let mut current = node.parent();
327    while let Some(p) = current {
328        if let Some(sibling) = p.next_element_sibling() {
329            return Some(sibling);
330        }
331        current = p.parent();
332    }
333    None
334}
335
/// Detach `node` from the tree and return the next node in DFS order
/// (equivalent to `get_next_node(node, true)` followed by `node.detach()`).
pub fn remove_and_get_next(node: &NodeRef) -> Option<NodeRef> {
    // The successor must be computed *before* detaching: `detach` severs
    // the parent/sibling links that `get_next_node` walks.
    let next = get_next_node(node, true);
    node.detach();
    next
}
343
344/// Build the string used for class/id regex matching: the node's `class`
345/// and `id` attributes joined by a space.
346pub fn match_string_for_node(node: &NodeRef) -> String {
347    concat_optionals(node.attr_value("class"), node.attr_value("id"), " ")
348}
349
/// Walk the subtree rooted at `node` in DFS order and detach every
/// descendant for which `predicate(node, class_id_string)` returns `true`.
pub fn remove_matched_nodes<F>(node: &NodeRef, predicate: F)
where
    F: Fn(&NodeRef, &str) -> bool,
{
    // The DFS successor of the *whole subtree* — reaching it means every
    // descendant of `node` has been visited.
    let end_of_search_marker = get_next_node(node, true);
    let mut next = get_next_node(node, false);
    while next.is_some() && next != end_of_search_marker {
        let n = next.clone().unwrap();
        // Predicate receives the node plus its "class id" match string.
        let match_str = match_string_for_node(&n);
        if predicate(&n, match_str.as_str()) {
            // Detaching skips the node's subtree in one step.
            next = remove_and_get_next(&n);
        } else {
            next = get_next_node(&n, false);
        }
    }
}
368
369/// Advance through the sibling list starting at `node` until an element
370/// node or a non-whitespace text node is found.  Returns `None` at the end
371/// of the sibling list.
372pub fn next_element(node: Option<NodeRef>) -> Option<NodeRef> {
373    let mut next = node;
374    while let Some(ref n) = next {
375        if n.as_element().is_some() || !n.text_contents().trim().is_empty() {
376            break;
377        }
378        next = n.next_sibling();
379    }
380    next
381}
382
383/// Replace every descendant element matching `selector` with a copy that
384/// has tag name `new_tag_name`, preserving attributes and children.
385pub fn rename_tags_with_selector(node: &NodeRef, selector: &str, new_tag_name: &str) {
386    for n in select_descendants(node, selector) {
387        n.clone().clone_and_rename_element(new_tag_name);
388    }
389}
390
391/// Return `true` if `node` is a whitespace-only text node or a `<br>`.
392pub fn is_whitespace_node(node: &NodeRef) -> bool {
393    if (node.as_text().is_some() && node.text_contents().trim().is_empty())
394        || node.element_name() == Some("br")
395    {
396        return true;
397    }
398
399    false
400}
401
402/// Return `true` if `node` carries no meaningful content and can be safely
403/// pruned.  Void elements, phrasing content, and nodes with an `id` or
404/// `name` attribute are never considered empty (they may be referenced
405/// elsewhere).
406pub fn is_empty_node(node: &NodeRef, logger: &PerfLogger) -> bool {
407    // self closing tags, aren't expected to have any children
408    // and shouldn't be treated as empty
409    if let Some(name) = node.element_name() {
410        if SELF_CLOSING_TAGS.contains(name) {
411            return false;
412        }
413    }
414
415    // phrasing elements should remain even if they are empty
416    if is_phrasing_content(node) {
417        return false;
418    }
419
420    // if the node has an id or name then this node
421    // can be potentially needed as a placeholder
422    for attr_name in REFERENCING_ATTRIBUTES {
423        if node.attr_value(attr_name).is_some() {
424            return false;
425        }
426    }
427
428    let txt = get_normalized_text_content(node, logger);
429
430    txt.trim().is_empty() && select_descendants(node, "img").is_empty()
431}
432
433/// Return `true` if `node` is [phrasing content](https://html.spec.whatwg.org/multipage/dom.html#phrasing-content)
434/// per the HTML spec.  Text nodes, elements in [`PHRASING_ELEMENTS`], and
435/// `<a>` / `<del>` / `<ins>` whose *entire* child list is also phrasing
436/// content all qualify.
437pub fn is_phrasing_content(node: &NodeRef) -> bool {
438    if node.as_text().is_some() {
439        return true;
440    }
441
442    if let Some(name) = node.element_name() {
443        if PHRASING_ELEMENTS.contains(name) {
444            return true;
445        }
446    }
447    if (node.element_name() == Some("a")
448        || node.element_name() == Some("del")
449        || node.element_name() == Some("ins"))
450        && test_all_siblings(node.first_child(), is_phrasing_content)
451    {
452        return true;
453    }
454    false
455}
456
457/// Return `true` if every sibling starting at `node` (inclusive) satisfies
458/// `test_func`.  An empty sibling list (i.e. `node = None`) vacuously
459/// returns `true`.
460pub fn test_all_siblings<F>(node: Option<NodeRef>, test_func: F) -> bool
461where
462    F: Fn(&NodeRef) -> bool,
463{
464    let mut next = node;
465    while next.is_some() {
466        let n = next.clone().unwrap();
467        if !test_func(&n) {
468            return false;
469        }
470        next = n.next_sibling();
471    }
472    true
473}
474
475/// Move every child node of `from` (in order) to be the last children of
476/// `to`.  After the call, `from` has no children.
477pub fn move_children(from: &NodeRef, to: &NodeRef) {
478    let mut child = from.first_child();
479    while child.is_some() {
480        let child_unwraped = child.clone().unwrap();
481        child = child_unwraped.next_sibling();
482        to.append(child_unwraped);
483    }
484}
485
486/// Return `true` if at least one descendant of `node` matching `sel`
487/// satisfies `test_func`.
488pub fn test_any_node_by_selector<F>(node: &NodeRef, sel: &str, test_func: F) -> bool
489where
490    F: Fn(&NodeRef) -> bool,
491{
492    for n in select_descendants(node, sel) {
493        if test_func(&n) {
494            return true;
495        }
496    }
497    false
498}
499
500/// Returns true if the passed node contains only single
501/// node that matches the tag_name, false otherwise
502pub fn contains_single_tag_in_element(node: &NodeRef, tag_name: &str) -> bool {
503    let mut elements = vec![];
504    let mut non_elements = vec![];
505    for c in node.children() {
506        if c.as_element().is_some() {
507            elements.push(c);
508        } else {
509            non_elements.push(c);
510        }
511    }
512    // There should be exactly 1 element child with given tag
513    if elements.len() != 1 || elements.get(0).unwrap().element_name() != Some(tag_name) {
514        return false;
515    }
516
517    // And there should be no text nodes with real content
518    for c in non_elements {
519        if HAS_CONTENT.is_match(c.text_contents().as_str()) {
520            return false;
521        }
522    }
523
524    true
525}
526
/// Return `true` if any ancestor of `node` (up to `max_depth` levels)
/// has tag name `ancestor_tag_name`.  Pass `0` for `max_depth` to search
/// all the way to the root.
pub fn has_ancestor_tag(node: &NodeRef, ancestor_tag_name: &str, max_depth: i16) -> bool {
    // Delegates to the predicate variant with an always-true predicate,
    // so only the tag name is checked.
    has_ancestor_tag_with_predicate(node, ancestor_tag_name, max_depth, |_| true)
}
533
534/// Like [`has_ancestor_tag`], but the matching ancestor must also satisfy
535/// `predicate`.  Useful for checking properties on the ancestor (e.g.
536/// whether a `<table>` ancestor has been classified as a data table).
537pub fn has_ancestor_tag_with_predicate<F>(
538    node: &NodeRef,
539    ancestor_tag_name: &str,
540    max_depth: i16,
541    predicate: F,
542) -> bool
543where
544    F: Fn(&NodeRef) -> bool,
545{
546    let mut depth = 0;
547    let mut node = node.clone();
548    while let Some(p) = node.parent() {
549        depth += 1;
550        if max_depth > 0 && depth > max_depth {
551            return false;
552        }
553        if p.element_name() == Some(ancestor_tag_name) && predicate(&p) {
554            return true;
555        }
556        node = p;
557    }
558    false
559}
560
561/// Collect all descendants of `container` matching *any* of the given
562/// CSS selectors into a single flat `Vec`.
563pub fn concate_nodes_with_selectors(container: &NodeRef, selectors: Vec<&str>) -> Vec<NodeRef> {
564    let mut res = vec![];
565    for s in selectors {
566        res.extend(select_descendants(container, s));
567    }
568    res
569}
570
571/// Walk up the parent chain from `node` and collect ancestors into a `Vec`
572/// (nearest ancestor first).  Stop after `max_depth` levels; pass `0` to
573/// collect all ancestors up to the root.
574pub fn get_node_ancestors(node: &NodeRef, max_depth: i16) -> Vec<NodeRef> {
575    let mut ancestors = vec![];
576    let mut depth = 1;
577    let mut current = node.parent();
578    while let Some(p) = current {
579        ancestors.push(p.clone());
580        if max_depth > 0 && depth == max_depth {
581            break;
582        }
583        depth += 1;
584        current = p.parent();
585    }
586    ancestors
587}
588
/// Compute the ratio of link-text length to total text length inside `node`.
/// Anchor links whose `href` is a bare hash (`#…`) contribute at only 30 %
/// of their actual length, because in-page navigation links are common in
/// legitimate content.  Returns `0.0` when the node has no text at all.
pub fn get_link_density(node: &NodeRef, logger: &PerfLogger) -> f64 {
    start_span!(logger, GET_LINK_DENSITY);
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "get_node_normalized_text_begin");
    let text_length = get_normalized_text_length(node, logger);
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "get_node_normalized_text_end");
    // Guard against division by zero below (and skip the link walk).
    if text_length == 0 {
        annotate_span_str!(
            logger,
            GET_LINK_DENSITY,
            "early return because node content is empty"
        );
        end_span!(logger, GET_LINK_DENSITY);
        return 0.0;
    }

    let mut link_length = 0.0_f64;
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "sum_link_text_lengths_begin");
    for a in select_descendants(node, "a") {
        // In-page hash links are discounted to 30 % of their length.
        let mut coefficient = 1.0;
        if let Some(href) = a.attr_value("href") {
            if href.trim().starts_with('#') {
                coefficient = 0.3;
            }
        }
        link_length += get_normalized_text_length(&a, logger) as f64 * coefficient;
    }
    add_point_to_span_str!(logger, GET_LINK_DENSITY, "sum_link_text_lengths_end");
    let result = link_length / (text_length as f64);
    end_span!(logger, GET_LINK_DENSITY);
    result
}
624
/// Normalize the text content of `node` (whitespace-collapsed, trimmed) and
/// return its length in Unicode scalar values (`chars().count()`), not bytes.
fn get_normalized_text_length(node: &NodeRef, logger: &PerfLogger) -> usize {
    start_span!(logger, NORMALIZE_AND_COUNT_CHARS);
    add_point_to_span_str!(
        logger,
        NORMALIZE_AND_COUNT_CHARS,
        "get_normalized_txt_begin"
    );
    let txt = get_normalized_text_content(node, logger);
    add_point_to_span_str!(logger, NORMALIZE_AND_COUNT_CHARS, "get_normalized_txt_end");

    add_point_to_span_str!(logger, NORMALIZE_AND_COUNT_CHARS, "count_chars_begin");
    // chars().count() counts scalar values, so multi-byte text isn't inflated.
    let count = txt.chars().count();
    add_point_to_span_str!(logger, NORMALIZE_AND_COUNT_CHARS, "count_chars_end");
    end_span!(logger, NORMALIZE_AND_COUNT_CHARS);
    count
}
641
/// Extract the full text content of `node` and collapse all runs of
/// whitespace into single spaces (leading/trailing whitespace is trimmed).
pub fn get_normalized_text_content(node: &NodeRef, logger: &PerfLogger) -> String {
    start_span!(logger, NORMALIZE_NODE_TEXT);

    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "get_text_contents_begin");
    let txt = node.text_contents();
    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "get_text_contents_end");
    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "remove_duplicate_spaces_begin");
    // Trim first, then collapse interior whitespace runs to single spaces.
    let txt = normalize_text(txt.trim());
    add_point_to_span_str!(logger, NORMALIZE_NODE_TEXT, "remove_duplicate_spaces_end");
    end_span!(logger, NORMALIZE_NODE_TEXT);
    txt
}
656
657static NORMALIZE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s{2,}").unwrap());
658
659/// Collapse every run of two or more whitespace characters in `src` into a
660/// single ASCII space.
661pub fn normalize_text(src: &str) -> String {
662    NORMALIZE_REGEX.replace_all(src, " ").to_string()
663}
664
/// Return `true` if `text` (after trimming and zero-width-space removal)
/// is a known ad or "loading …" placeholder in any supported language
/// (English, French, Spanish, German, Chinese, Russian).
pub fn matches_ad_or_loading(text: &str) -> bool {
    // "Loading" placeholders, bare or followed by "..." / "…".
    const LOADING_BASES: [&str; 5] =
        ["loading", "正在加载", "загрузка", "chargement", "cargando"];
    // Single-word ad markers across the supported languages.
    const AD_WORDS: [&str; 11] = [
        "ad", "advertising", "advertisement", "pub", "publicite", "publicité",
        "werb", "werbung", "广告", "реклама", "anuncio",
    ];

    // Case-insensitive check against both word tables.
    fn is_placeholder(s: &str) -> bool {
        let lowered = s.to_lowercase();
        if AD_WORDS.iter().any(|w| lowered == *w) {
            return true;
        }
        LOADING_BASES.iter().any(|base| {
            lowered == *base
                || lowered == format!("{base}...")
                || lowered == format!("{base}…")
        })
    }

    let trimmed = text.trim();
    if is_placeholder(trimmed) {
        return true;
    }

    // Retry with interior whitespace and zero-width characters stripped,
    // so e.g. "广 告" or "l o a d i n g" still match.
    let compact: String = trimmed
        .chars()
        .filter(|c| !c.is_whitespace() && *c != '\u{200b}' && *c != '\u{feff}')
        .collect();
    is_placeholder(compact.as_str())
}
723
/// Like [`normalize_text`], but preserves Unicode non-breaking and
/// typographic space characters (U+00A0, U+2000–U+200A, etc.) instead
/// of collapsing them.  Used when generating the final output so that
/// intentional non-breaking spaces survive.
pub fn normalize_text_preserve_nbsp(src: &str) -> String {
    let mut result = String::with_capacity(src.len());
    // True when the last emitted character was a collapsed ASCII space.
    let mut last_was_collapsed_space = false;
    for ch in src.chars() {
        if is_preserved_unicode_space(ch) {
            // Typographic spaces pass through untouched and reset the run.
            result.push(ch);
            last_was_collapsed_space = false;
        } else if is_ascii_whitespace(ch) {
            // Emit at most one ASCII space per whitespace run.
            if !last_was_collapsed_space {
                result.push(' ');
                last_was_collapsed_space = true;
            }
        } else {
            result.push(ch);
            last_was_collapsed_space = false;
        }
    }
    result
}

fn is_ascii_whitespace(ch: char) -> bool {
    // Identical set to `char::is_ascii_whitespace`: space, tab, LF, FF, CR.
    ch.is_ascii_whitespace()
}

fn is_preserved_unicode_space(ch: char) -> bool {
    matches!(ch,
        '\u{00A0}' |               // NO-BREAK SPACE
        '\u{1680}' |               // OGHAM SPACE MARK
        '\u{2000}'..='\u{200A}' |  // EN/EM/THIN/HAIR etc. spaces
        '\u{202F}' |               // NARROW NO-BREAK SPACE
        '\u{205F}' |               // MEDIUM MATHEMATICAL SPACE
        '\u{3000}'                 // IDEOGRAPHIC SPACE
    )
}
766
767/// Compute a similarity score between two strings in the range `[0.0, 1.0]`.
768///
769/// The algorithm tokenises both strings on non-alphanumeric boundaries,
770/// lowercases, then measures how much of `text_b`'s token sequence is
771/// *not* present in `text_a`.  A score of `1.0` means `text_b`'s tokens
772/// are a subset of `text_a`'s; `0.0` means no overlap.  Used to detect
773/// when an `<h1>` / `<h2>` duplicates the page title.
774pub fn text_similarity(text_a: &str, text_b: &str) -> f64 {
775    let tokens_a: HashSet<String> = TOKENIZE_REGEX
776        .split(&text_a.to_lowercase())
777        .filter(|s| !s.is_empty())
778        .map(|s| s.to_string())
779        .collect();
780    let tokens_b: Vec<String> = TOKENIZE_REGEX
781        .split(&text_b.to_lowercase())
782        .filter(|s| !s.is_empty())
783        .map(|s| s.to_string())
784        .collect();
785
786    if tokens_a.is_empty() || tokens_b.is_empty() {
787        return 0.0;
788    }
789
790    let uniq_tokens_b: Vec<&String> = tokens_b.iter().filter(|t| !tokens_a.contains(*t)).collect();
791    let uniq_len: usize = uniq_tokens_b.iter().map(|s| s.len()).sum::<usize>()
792        + uniq_tokens_b.len().saturating_sub(1);
793    let total_len: usize =
794        tokens_b.iter().map(|s| s.len()).sum::<usize>() + tokens_b.len().saturating_sub(1);
795    let distance_b = uniq_len as f64 / total_len as f64;
796    1.0 - distance_b
797}
798
/// Decode the five named HTML entities (`&quot;` `&amp;` `&apos;` `&lt;`
/// `&gt;`) and all numeric character references (`&#…;` / `&#x…;`) in
/// `value`.  Invalid code points are replaced with U+FFFD.
pub fn unescape_html_entities(value: &str) -> String {
    if value.is_empty() {
        return value.to_string();
    }

    // Pass 1: the five predefined XML named entities.
    let replaced = UNESCAPE_NAMED_ENTITIES.replace_all(value, |caps: &regex::Captures| {
        match caps.get(0).map(|m| m.as_str()).unwrap_or_default() {
            "&quot;" => "\"",
            "&amp;" => "&",
            "&apos;" => "'",
            "&lt;" => "<",
            "&gt;" => ">",
            // Unreachable given the regex alternation, but keeps the match total.
            _ => "",
        }
    });

    // Pass 2: numeric character references — group 1 is hex (&#x…;),
    // group 2 is decimal (&#…;).
    let replaced = UNESCAPE_NUMERIC_ENTITIES.replace_all(&replaced, |caps: &regex::Captures| {
        let num = if let Some(hex) = caps.get(1) {
            u32::from_str_radix(hex.as_str(), 16).unwrap_or(0)
        } else if let Some(dec) = caps.get(2) {
            dec.as_str().parse::<u32>().unwrap_or(0)
        } else {
            0
        };

        // NUL, out-of-range values, and UTF-16 surrogates are not valid
        // scalar values — substitute U+FFFD REPLACEMENT CHARACTER.
        let num = if num == 0 || num > 0x10FFFF || (0xD800..=0xDFFF).contains(&num) {
            0xFFFD
        } else {
            num
        };
        std::char::from_u32(num).unwrap_or('\u{FFFD}').to_string()
    });

    replaced.into_owned()
}
837
/// Normalise whitespace in every text node under `root`, merging adjacent
/// text nodes and collapsing runs of whitespace while preserving
/// non-breaking spaces.  Content inside `<pre>`, `<code>`, `<textarea>`,
/// `<script>`, `<style>`, `<svg>`, and `<math>` is left untouched.
///
/// Whitespace-only text nodes are removed; before removal a single space
/// is attached to (or inserted between) the adjacent siblings when they
/// are phrasing content, so inline elements are not visually glued
/// together once the node is gone.
pub fn normalize_text_nodes(root: &NodeRef) {
    // Elements whose text is whitespace-sensitive and must not be collapsed.
    let skip_tags = [
        "pre",
        "code",
        "textarea",
        "script",
        "style",
        "svg",
        "math",
    ];
    // Snapshot the descendants up front: the loop detaches and replaces
    // nodes, which would invalidate a live traversal.
    let nodes: Vec<_> = root.descendants().collect();
    for n in nodes {
        if let Some(text) = n.as_text() {
            // Skip text directly inside a whitespace-sensitive element.
            if let Some(parent) = n.parent() {
                if let Some(tag) = parent.element_name() {
                    let tag = tag.to_lowercase();
                    if skip_tags.contains(&tag.as_str()) {
                        continue;
                    }
                }
            }
            let current = text.borrow().to_string();
            // Merge with an immediately-following text sibling; the merged
            // node re-enters processing via the snapshot order.
            if let Some(next) = n.next_sibling() {
                if let Some(next_text) = next.as_text() {
                    let merged = format!("{}{}", current, next_text.borrow());
                    let normalized = normalize_text_preserve_nbsp(merged.as_str());
                    let new_node = NodeRef::new_text(normalized);
                    n.insert_after(new_node);
                    n.detach();
                    next.detach();
                    continue;
                }
            }
            let normalized = normalize_text_preserve_nbsp(current.as_str());
            if normalized.trim().is_empty() {
                // The node is whitespace-only and will be dropped; decide
                // whether a separating space must survive on a neighbour.
                let prev = n.previous_sibling();
                let next = n.next_sibling();
                match (prev.clone(), next.clone()) {
                    // Between two siblings: pad whichever side is a
                    // phrasing text node lacking trailing/leading space.
                    // NOTE(review): `is_phrasing_content` is applied to the
                    // text sibling itself here — confirm this is intended
                    // rather than checking the nearest element sibling.
                    (Some(prev_node), Some(next_node)) => {
                        if let Some(prev_text) = prev_node.as_text() {
                            let mut prev_val = prev_text.borrow().to_string();
                            if let Some(last) = prev_val.chars().last() {
                                if is_phrasing_content(&prev_node)
                                    && !is_ascii_whitespace(last)
                                    && !is_preserved_unicode_space(last)
                                {
                                    prev_val.push(' ');
                                    let new_prev =
                                        NodeRef::new_text(normalize_text_preserve_nbsp(
                                            prev_val.as_str(),
                                        ));
                                    prev_node.insert_before(new_prev);
                                    prev_node.detach();
                                }
                            }
                        }
                        if let Some(next_text) = next_node.as_text() {
                            let mut next_val = next_text.borrow().to_string();
                            if let Some(first) = next_val.chars().next() {
                                if is_phrasing_content(&next_node)
                                    && !is_ascii_whitespace(first)
                                    && !is_preserved_unicode_space(first)
                                {
                                    next_val.insert(0, ' ');
                                    let new_next =
                                        NodeRef::new_text(normalize_text_preserve_nbsp(
                                            next_val.as_str(),
                                        ));
                                    next_node.insert_before(new_next);
                                    next_node.detach();
                                }
                            }
                        }
                    }
                    // First child: only the following sibling can need padding.
                    (None, Some(next_node)) => {
                        if is_phrasing_content(&next_node) {
                            if let Some(next_text) = next_node.as_text() {
                                let mut next_val = next_text.borrow().to_string();
                                if let Some(first) = next_val.chars().next() {
                                    if !is_ascii_whitespace(first)
                                        && !is_preserved_unicode_space(first)
                                    {
                                        next_val.insert(0, ' ');
                                        let new_next = NodeRef::new_text(
                                            normalize_text_preserve_nbsp(next_val.as_str()),
                                        );
                                        next_node.insert_before(new_next);
                                        next_node.detach();
                                    }
                                }
                            } else {
                                // Phrasing element sibling: keep a space node before it.
                                next_node.insert_before(NodeRef::new_text(" "));
                            }
                        }
                    }
                    // Last child: only the preceding sibling can need padding.
                    (Some(prev_node), None) => {
                        if is_phrasing_content(&prev_node) {
                            if let Some(prev_text) = prev_node.as_text() {
                                let mut prev_val = prev_text.borrow().to_string();
                                if let Some(last) = prev_val.chars().last() {
                                    if !is_ascii_whitespace(last)
                                        && !is_preserved_unicode_space(last)
                                    {
                                        prev_val.push(' ');
                                        let new_prev = NodeRef::new_text(
                                            normalize_text_preserve_nbsp(prev_val.as_str()),
                                        );
                                        prev_node.insert_before(new_prev);
                                        prev_node.detach();
                                    }
                                }
                            } else {
                                // Phrasing element sibling: keep a space node after it.
                                prev_node.insert_after(NodeRef::new_text(" "));
                            }
                        }
                    }
                    // Only child: nothing to pad.
                    _ => {}
                }
                n.detach();
                continue;
            }
            // NOTE(review): no-op rebinding; kept byte-identical on purpose.
            let normalized = normalized;
            // Replace the node only when normalisation actually changed it.
            if normalized != current {
                let new_node = NodeRef::new_text(normalized);
                n.insert_after(new_node);
                n.detach();
            }
        }
    }
}
972
973fn get_class_and_id_attr_weight(attr_value: &str) -> i64 {
974    let mut weight = 0;
975    if NEGATIVE_CLASSES_AND_IDS.is_match(attr_value) {
976        weight -= 25;
977    }
978
979    if POSITIVE_CLASSES_AND_IDS.is_match(attr_value) {
980        weight += 25;
981    }
982
983    weight
984}
985
986/// Score a node's `class` and `id` attributes against the positive and
987/// negative word-lists.  Each attribute contributes +25 (positive match)
988/// or −25 (negative match); the two are summed.
989pub fn get_class_and_id_weight(node: &NodeRef) -> i64 {
990    let mut weight = 0;
991    if let Some(class_name) = node.attr_value("class") {
992        weight += get_class_and_id_attr_weight(class_name.as_str())
993    }
994
995    if let Some(tag_id) = node.attr_value("id") {
996        weight += get_class_and_id_attr_weight(tag_id.as_str())
997    }
998
999    weight
1000}
1001
1002/// Returns true if the passed node has at least one child with name matches
1003/// any of the names in the look_up_tag_names
1004pub fn node_contains_any_tag_of(node: &NodeRef, look_up_tag_names: &[&str]) -> bool {
1005    for tag in look_up_tag_names {
1006        if !select_descendants(node, tag).is_empty() {
1007            return true;
1008        }
1009    }
1010    false
1011}
1012
/// Convert images and figures that carry lazy-loading attributes (e.g.
/// `data-src`) into images that can be loaded without JavaScript, by
/// promoting image-looking attribute values into real `src` / `srcset`.
pub fn fix_lazy_images(node: &NodeRef) {
    apply(node, &["img", "picture", "figure"], |n, tag_name| {
        // Step 1: drop tiny base64 placeholder `src` values when another
        // attribute looks like it holds the real image URL.
        let src = n.attr_value("src");
        if let Some(src_val) = src.clone() {
            if let Some(caps) = B64_DATA_URL.captures(src_val.as_str()) {
                let mime = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
                // Inline SVG data URIs may be meaningful content; keep them.
                if mime != "image/svg+xml" {
                    // Does any *other* attribute look like an image URL?
                    let mut src_could_be_removed = false;
                    if let Some(e) = n.as_element() {
                        for (name, attr) in e.attributes.borrow().clone().map {
                            if name.local.to_string() == "src" {
                                continue;
                            }
                            if IMAGE_EXTENSION.is_match(attr.value.as_str()) {
                                src_could_be_removed = true;
                                break;
                            }
                        }
                    }
                    if src_could_be_removed {
                        // Length of the matched data-URI prefix; the rest of
                        // `src` is the base64 payload itself.
                        let b64starts = caps.get(0).map(|m| m.as_str().len()).unwrap_or(0);
                        let b64length = src_val.len().saturating_sub(b64starts);
                        // A payload this short can only be a spacer/spinner.
                        if b64length < 133 {
                            if let Some(e) = n.as_element() {
                                e.attributes.borrow_mut().remove("src");
                            }
                        }
                    }
                }
            }
        }

        // Step 2: re-read the (possibly stripped) attributes.
        let src = n.attr_value("src");
        let srcset = n.attr_value("srcset");
        let class_name = n.attr_value("class").unwrap_or_default().to_lowercase();

        // Already has a usable src/srcset and is not marked lazy: done.
        if (src.is_some() || (srcset.is_some() && srcset.as_deref() != Some("null")))
            && !class_name.contains("lazy")
        {
            return;
        }

        // Step 3: promote image-looking attribute values to src/srcset.
        if let Some(e) = n.as_element() {
            // Staged writes; applied after the iteration so the borrow of
            // the attribute map is not held while inserting.
            let mut tmp: HashMap<String, String> = HashMap::new();
            for (name, attr) in e.attributes.borrow().clone().map {
                let local_name = name.local.to_string();
                if local_name == "src" || local_name == "srcset" || local_name == "alt" {
                    continue;
                }

                // Values with size descriptors become srcset; plain image
                // URLs become src; anything else is ignored.
                let copy_to_attr: Option<&str> = if SRCSET_EXTENSION.is_match(attr.value.as_str()) {
                    Some("srcset")
                } else if SRC_EXTENSION.is_match(attr.value.as_str()) {
                    Some("src")
                } else {
                    None
                };

                if let Some(copy_to_attr) = copy_to_attr {
                    if tag_name == "img" || tag_name == "picture" {
                        tmp.insert(copy_to_attr.to_string(), attr.value.clone());
                    } else if tag_name == "figure"
                        && n.select_first("img").is_err()
                        && n.select_first("picture").is_err()
                    {
                        // A <figure> without any image: synthesise an <img>
                        // child carrying the recovered URL.
                        let img = new_html_element("img");
                        img.as_element()
                            .unwrap()
                            .attributes
                            .borrow_mut()
                            .insert(copy_to_attr, attr.value.clone());
                        n.append(img);
                    }
                }
            }

            for (name, value) in tmp {
                e.attributes.borrow_mut().insert(name, value);
            }
        }
    });
}
1096
1097fn is_single_image(node: &NodeRef) -> bool {
1098    let mut current = node.clone();
1099    loop {
1100        if current.element_name() == Some("img") {
1101            return true;
1102        }
1103        let children = current.element_children();
1104        if children.len() != 1 || !current.text_contents().trim().is_empty() {
1105            return false;
1106        }
1107        current = children[0].clone();
1108    }
1109}
1110
/// Replace JS-dependent image placeholders with their `<noscript>` fallbacks.
///
/// Two passes are performed:
/// 1. Any `<img>` that has no recognised image attribute (`src`, `srcset`,
///    `data-src`, `data-srcset`, or an attribute whose value looks like an
///    image URL) is removed entirely.
/// 2. Each `<noscript>` that contains exactly one image is parsed; if the
///    preceding sibling is also a single-image wrapper the placeholder is
///    replaced with the noscript image, inheriting any `src`/`srcset` from
///    the old placeholder under a `data-old-*` key.
pub fn unwrap_noscript_images(doc: &NodeRef) {
    // Pass 1: remove <img> elements that carry no image-like attribute.
    let imgs = select_descendants(doc, "img");
    for img in imgs {
        if let Some(e) = img.as_element() {
            let mut has_image_attr = false;
            for (name, attr) in e.attributes.borrow().clone().map {
                let local = name.local.to_string();
                match local.as_str() {
                    "src" | "srcset" | "data-src" | "data-srcset" => {
                        has_image_attr = true;
                        break;
                    }
                    _ => {
                        // Any other attribute whose value looks like an
                        // image URL also counts.
                        if IMAGE_EXTENSION.is_match(attr.value.as_str()) {
                            has_image_attr = true;
                            break;
                        }
                    }
                }
            }
            if !has_image_attr {
                img.detach();
            }
        }
    }

    // Pass 2: swap single-image placeholders for their noscript fallback.
    let noscripts = select_descendants(doc, "noscript");
    for noscript in noscripts {
        if !is_single_image(&noscript) {
            continue;
        }
        // Parsers keep <noscript> content as raw text, so re-parse it to
        // obtain a real <img> node.
        let inner = noscript.inner_html();
        let tmp = parse_html(format!("<div>{}</div>", inner).as_str());
        let new_img = tmp.select_first("img").ok().map(|n| n.as_node().clone());
        if new_img.is_none() {
            continue;
        }

        if let Some(prev_element) = noscript.previous_element_sibling() {
            if is_single_image(&prev_element) {
                // Drill down to the actual <img> inside the placeholder wrapper.
                let mut prev_img = prev_element.clone();
                if prev_img.element_name() != Some("img") {
                    if let Ok(img) = prev_element.select_first("img") {
                        prev_img = img.as_node().clone();
                    }
                }

                // Copy image-relevant attributes from the placeholder onto
                // the fallback image; conflicting keys become `data-old-*`.
                if let (Some(prev_e), Some(new_e)) =
                    (prev_img.as_element(), new_img.clone().unwrap().as_element())
                {
                    for (name, attr) in prev_e.attributes.borrow().clone().map {
                        if attr.value.is_empty() {
                            continue;
                        }
                        let local = name.local.to_string();
                        let should_copy = local == "src"
                            || local == "srcset"
                            || IMAGE_EXTENSION.is_match(attr.value.as_str());
                        if !should_copy {
                            continue;
                        }
                        // Identical value already present: nothing to do.
                        if let Some(existing) = new_e.attributes.borrow().get(local.as_str()) {
                            if existing == attr.value.as_str() {
                                continue;
                            }
                        }
                        let mut attr_name = local.clone();
                        if new_e.attributes.borrow().contains(local.as_str()) {
                            attr_name = format!("data-old-{}", local);
                        }
                        new_e
                            .attributes
                            .borrow_mut()
                            .insert(attr_name.as_str(), attr.value.clone());
                    }
                }

                // Replace the placeholder wrapper with the fallback image.
                let new_img_node = new_img.unwrap();
                prev_element.insert_after(new_img_node);
                prev_element.detach();
            }
        }
    }
}
1205
1206/// Return `true` if `node` (an `<embed>`, `<object>`, or `<iframe>`) looks
1207/// like an embedded video from a known provider (YouTube, Vimeo, Dailymotion,
1208/// etc.).  Such embeds are preserved even though they would otherwise be
1209/// stripped as non-content.
1210pub fn is_possibly_useful_video_node(node: &NodeRef, tag: &str) -> bool {
1211    if tag != "embed" && tag != "object" && tag != "iframe" {
1212        return false;
1213    }
1214
1215    if let Some(e) = node.as_element() {
1216        // If this embed has attribute that matches video regex, don't delete it.
1217        for (_, attr) in e.attributes.borrow().clone().map {
1218            if VIDEO_ATTRS_REGEX.is_match(attr.value.as_str()) {
1219                return true;
1220            }
1221        }
1222
1223        // For embed with <object> tag, check inner HTML as well.
1224        if node.element_name() == Some("object")
1225            && VIDEO_ATTRS_REGEX.is_match(node.inner_html().as_str())
1226        {
1227            return true;
1228        }
1229    }
1230
1231    false
1232}
1233
1234/// If `node` is an unlikely content candidate (its role or class/id string
1235/// matches the unlikely regex and does *not* also match the "maybe a
1236/// candidate" exception list), detach it and return the next DFS node.
1237/// Otherwise return `None` (the node is kept).
1238pub fn strip_unlikely_and_get_next(node: &NodeRef, matching_str: &str) -> Option<NodeRef> {
1239    if let Some(role) = node.attr_value("role") {
1240        if UNLIKELY_ROLES.contains(role.as_str()) {
1241            return remove_and_get_next(node);
1242        }
1243    }
1244    if UNLIKELY_CANDIDATES_REGEX.is_match(matching_str)
1245        && !MAYBE_A_CANDIDATE.is_match(matching_str)
1246        && !has_ancestor_tag(node, "table", DEFAULT_MAX_ANCESTORS_LOOKUP_DEPTH)
1247        && !has_ancestor_tag(node, "code", DEFAULT_MAX_ANCESTORS_LOOKUP_DEPTH)
1248        && node.element_name() != Some("body")
1249        && node.element_name() != Some("a")
1250    {
1251        // Remove unlikely candidate
1252        return remove_and_get_next(node);
1253    }
1254    None
1255}
1256
1257/// Detach every descendant of `container` with tag `tag_name` for which
1258/// `condition(node, tag_name)` returns `true`.  Iteration is done in
1259/// reverse document order so that detaching a node does not invalidate
1260/// the remaining iterator.
1261pub fn remove_nodes<F>(container: &NodeRef, tag_name: &str, condition: F)
1262where
1263    F: Fn(&NodeRef, &str) -> bool,
1264{
1265    for node in select_descendants(container, tag_name).into_iter().rev() {
1266        if condition(&node, tag_name) {
1267            node.detach();
1268        }
1269    }
1270}
1271
/// Count the number of whitespace-delimited tokens in `text`.
///
/// # Examples
///
/// ```rust
/// use readable_rs::utils::word_count;
///
/// assert_eq!(word_count("Hello World      Another word"), 4);
/// assert_eq!(word_count(""), 0);
/// assert_eq!(word_count("   "), 0);
/// ```
pub fn word_count(text: &str) -> usize {
    // `split_whitespace` skips leading/trailing/consecutive whitespace,
    // so empty and blank inputs naturally count as 0.
    text.split_whitespace().count()
}
1286
1287/// Apply `func(node, selector)` to every descendant of `root_node` that
1288/// matches any selector in `selectors`.  Each selector's matches are
1289/// visited in reverse document order (safe for detach).  Invalid CSS
1290/// selectors are silently skipped.
1291///
1292/// # Examples
1293///
1294/// ```rust
1295/// use std::cell::Cell;
1296/// use readable_rs::parser::parse_html;
1297/// use readable_rs::shared_utils::apply;
1298///
1299/// let doc = parse_html("<div><p>one</p><p>two</p></div>");
1300/// let count = Cell::new(0usize);
1301/// apply(&doc, &["p"], |_node, _sel| { count.set(count.get() + 1); });
1302/// assert_eq!(count.get(), 2);
1303/// ```
1304pub fn apply<F>(root_node: &NodeRef, selectors: &[&str], func: F)
1305where
1306    F: Fn(&NodeRef, &str),
1307{
1308    for s in selectors {
1309        for n in select_descendants(root_node, s).into_iter().rev() {
1310            func(&n, s);
1311        }
1312    }
1313}
1314
1315/// Resolve a `<base href>` value against the document URI.  If
1316/// `base_path` is empty the document URI is returned unchanged.
1317pub fn resolve_base_uri(doc_uri: &str, base_path: &str) -> String {
1318    if base_path.is_empty() {
1319        return doc_uri.to_string();
1320    }
1321    if let Ok(parsed_url) = url::Url::parse(doc_uri) {
1322        if let Ok(base) = parsed_url.join(base_path) {
1323            return base.to_string();
1324        }
1325    }
1326    base_path.to_string()
1327}
1328
1329/// Convert a potentially-relative URI to an absolute one using the
1330/// document URI and an optional `<base href>` path.  Bare hash links
1331/// (`#…`) are left as-is when they resolve to the same document.
1332pub fn to_absolute_uri(uri: &str, doc_uri: &str, base_path: &str) -> String {
1333    // Leave hash links alone if the base_path is empty
1334    // or if it not a relative uri
1335    let uri = uri.trim();
1336    if let Ok(parsed) = url::Url::parse(uri) {
1337        return parsed.into();
1338    }
1339    if base_path.is_empty() && uri.starts_with('#') {
1340        return String::from(uri);
1341    }
1342    let base_uri = resolve_base_uri(doc_uri, base_path);
1343    if base_uri == doc_uri && uri.starts_with('#') {
1344        return String::from(uri);
1345    }
1346
1347    if let Ok(parsed_url) = url::Url::parse(base_uri.as_str()) {
1348        if let Ok(parsed_url) = parsed_url.join(uri) {
1349            return parsed_url.into();
1350        }
1351    }
1352
1353    uri.to_string()
1354}
1355
1356/// Resolve each URL in a `srcset` attribute against the document base.
1357fn resolve_srcset(srcset: &str, doc_uri: &str, base_path: &str) -> String {
1358    let mut out = String::new();
1359    for caps in SRCSET_URL.captures_iter(srcset) {
1360        let url = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
1361        let descriptor = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1362        let trailing = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1363        let absolute = to_absolute_uri(url, doc_uri, base_path);
1364        out.push_str(absolute.as_str());
1365        out.push_str(descriptor);
1366        out.push_str(trailing);
1367    }
1368    if out.is_empty() {
1369        srcset.to_string()
1370    } else {
1371        out
1372    }
1373}
1374
/// Rewrite all relative URLs in `<a>`, `<img>`, `<picture>`, `<figure>`,
/// `<video>`, `<audio>`, and `<source>` elements under `node` to absolute
/// URLs using `doc_uri` and the optional `<base href>` path.
///
/// Special handling for `<a>` links:
/// * `javascript:` hrefs → the `<a>` is replaced by a `<span>` (attributes
///   like `id` / `name` are preserved if present).
/// * Hash links that point to the element's own `id` → same replacement,
///   because the anchor target is preserved as a `<span id>`.
/// * All other hrefs (relative or absolute) → resolved normally.
pub fn replace_relative_urls_with_absolute(node: &NodeRef, doc_uri: &str, base_path: &str) {
    for link in select_descendants(node, "a") {
        if let Some(href) = link.attr_value("href") {
            // Replace `link` with either a bare text node (single text
            // child, no attributes to keep) or a <span> carrying its
            // children — and, when `preserve_attrs`, every attribute
            // except `href`.
            let replace_link = |link: &NodeRef, preserve_attrs: bool| {
                let mut child_count = 0usize;
                let mut single_text_child = false;
                for child in link.children() {
                    child_count += 1;
                    if child_count == 1 && child.as_text().is_some() {
                        single_text_child = true;
                    } else {
                        single_text_child = false;
                    }
                }

                // Cheapest case: a text-only link with nothing worth
                // keeping collapses to plain text.
                if !preserve_attrs && child_count == 1 && single_text_child {
                    let text_node = NodeRef::new_text(link.text_contents());
                    link.insert_before(text_node);
                    link.detach();
                    return;
                }

                let container = new_html_element("span");
                if preserve_attrs {
                    if let (Some(src), Some(dst)) = (link.as_element(), container.as_element()) {
                        for (attr_name, attr) in src.attributes.borrow().map.clone() {
                            if attr_name.local.to_string() == "href" {
                                continue;
                            }
                            dst.attributes
                                .borrow_mut()
                                .insert(attr_name.local.to_string(), attr.value.clone());
                        }
                    }
                }
                // Move every child into the replacement <span>.
                while let Some(child) = link.first_child() {
                    container.append(child);
                }
                link.insert_before(container);
                link.detach();
            };

            if href.starts_with("javascript:") {
                // Match Readability.js behavior for javascript: links.
                let preserve_attrs =
                    link.attr_value("id").is_some() || link.attr_value("name").is_some();
                replace_link(&link, preserve_attrs);
                continue;
            }

            if href.starts_with('#') {
                // A link targeting its own id is a pure anchor; keep the
                // target by converting it to a <span id>.
                if let Some(id) = link.attr_value("id") {
                    if href == format!("#{}", id) {
                        replace_link(&link, true);
                        continue;
                    }
                }
                let absolute = to_absolute_uri(href.as_str(), doc_uri, base_path);
                if let Some(e) = link.as_element() {
                    e.attributes.borrow_mut().insert("href", absolute);
                }
                continue;
            }
            // Ordinary href: resolve against the document base.
            let absolute = to_absolute_uri(href.as_str(), doc_uri, base_path);
            if let Some(e) = link.as_element() {
                e.attributes.borrow_mut().insert("href", absolute);
            }
        }
    }

    // Media elements: rewrite src, poster, and every srcset entry.
    for tag in ["img", "picture", "figure", "video", "audio", "source"] {
        for media in select_descendants(node, tag) {
            if let Some(src) = media.attr_value("src") {
                let absolute = to_absolute_uri(src.as_str(), doc_uri, base_path);
                if let Some(e) = media.as_element() {
                    e.attributes.borrow_mut().insert("src", absolute);
                }
            }
            if let Some(poster) = media.attr_value("poster") {
                let absolute = to_absolute_uri(poster.as_str(), doc_uri, base_path);
                if let Some(e) = media.as_element() {
                    e.attributes.borrow_mut().insert("poster", absolute);
                }
            }
            if let Some(srcset) = media.attr_value("srcset") {
                let absolute = resolve_srcset(srcset.as_str(), doc_uri, base_path);
                if let Some(e) = media.as_element() {
                    e.attributes.borrow_mut().insert("srcset", absolute);
                }
            }
        }
    }
}
1478
/// Shared test helper: count elements matching a CSS selector in a document.
/// Panics if the selector is invalid (acceptable in test code).
#[cfg(test)]
pub(crate) fn count_elements(doc: &NodeRef, tag_name: &str) -> usize {
    let matched = doc.select(tag_name).unwrap();
    matched.count()
}
1484
1485#[cfg(test)]
1486mod tests {
1487    use crate::parser::parse_html;
1488    use crate::utils::*;
1489
1490    // --- 4c: B64_DATA_URL and SRCSET_URL regex correctness ---
1491
1492    #[test]
1493    fn b64_data_url_matches_valid_data_uri() {
1494        // A well-formed data: URI with whitespace around the mime / base64 parts
1495        let input = "data: image/png ; base64 , iVBORw0KGgo=";
1496        assert!(
1497            B64_DATA_URL.is_match(input),
1498            "B64_DATA_URL should match a valid data URI; got no match on: {input:?}"
1499        );
1500        // Capture group 1 should be the mime type
1501        let caps = B64_DATA_URL.captures(input).unwrap();
1502        assert_eq!(caps.get(1).unwrap().as_str(), "image/png");
1503    }
1504
1505    #[test]
1506    fn b64_data_url_does_not_match_non_data_uri() {
1507        assert!(!B64_DATA_URL.is_match("https://example.com/img.png"));
1508    }
1509
1510    #[test]
1511    fn srcset_url_parses_single_entry() {
1512        let input = "https://example.com/img.png 2x";
1513        let caps: Vec<_> = SRCSET_URL.captures_iter(input).collect();
1514        assert!(!caps.is_empty(), "SRCSET_URL should match a srcset entry");
1515        assert_eq!(caps[0].get(1).unwrap().as_str(), "https://example.com/img.png");
1516        assert_eq!(caps[0].get(2).unwrap().as_str().trim(), "2x");
1517    }
1518
1519    #[test]
1520    fn srcset_url_parses_multiple_entries() {
1521        let input = "small.jpg 480w, large.jpg 800w";
1522        let caps: Vec<_> = SRCSET_URL.captures_iter(input).collect();
1523        assert_eq!(caps.len(), 2, "SRCSET_URL should match both srcset entries");
1524        assert_eq!(caps[0].get(1).unwrap().as_str(), "small.jpg");
1525        assert_eq!(caps[1].get(1).unwrap().as_str(), "large.jpg");
1526    }
1527
    // --- 4d: has_ancestor_tag depth check ---
1529
    #[test]
    fn has_ancestor_tag_with_depth_3_finds_at_exactly_3() {
        // Structure: <div><section><span><p>leaf</p></span></section></div>
        // Parent chain of <p>: span (depth 1), section (depth 2), div (depth 3).
        let doc = parse_html("<div><section><span><p>leaf</p></span></section></div>");
        let p = doc.select_first("p").unwrap().as_node().clone();
        // depth limit = 3 should find "div"
        assert!(
            has_ancestor_tag(&p, "div", 3),
            "should find 'div' ancestor at depth 3"
        );
        // depth limit = 2 should NOT find "div"
        assert!(
            !has_ancestor_tag(&p, "div", 2),
            "should NOT find 'div' ancestor when max_depth=2"
        );
    }
1548
1549    #[test]
1550    fn test_negative_classes_weight() {
1551        let negatives = "hidden|banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget".split('|').collect::<Vec<_>>();
1552        for n in negatives {
1553            let attr_value = format!("some random value {}", n);
1554            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), -25);
1555            let attr_value = attr_value.to_uppercase();
1556            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), -25);
1557        }
1558
1559        assert_eq!(get_class_and_id_attr_weight("hid"), -25);
1560        assert_eq!(get_class_and_id_attr_weight("aaassaa hid"), -25);
1561        assert_eq!(get_class_and_id_attr_weight("aaassaa hid aaaaa"), -25);
1562        assert_eq!(get_class_and_id_attr_weight("hid dfsdss"), -25);
1563
1564        assert_eq!(get_class_and_id_attr_weight("hId"), -25);
1565        assert_eq!(get_class_and_id_attr_weight("aaassaa Hid"), -25);
1566        assert_eq!(get_class_and_id_attr_weight("aaassaa hiD aaaaa"), -25);
1567        assert_eq!(get_class_and_id_attr_weight("HiD dfsdss"), -25);
1568    }
1569
1570    #[test]
1571    fn test_positive_classes_weight() {
1572        let positives =
1573            "article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story"
1574                .split('|')
1575                .collect::<Vec<_>>();
1576        for n in positives {
1577            let attr_value = format!("some random value {}", n);
1578            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), 25);
1579            let attr_value = attr_value.to_uppercase();
1580            assert_eq!(get_class_and_id_attr_weight(attr_value.as_str()), 25);
1581        }
1582    }
1583
1584    #[test]
1585    fn test_replace_relative_urls_with_rel_img_src_with_base() {
1586        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1587        <title>Example Domain</title>
1588        </head>
1589        <body>
1590        <div>
1591        <img src="images/img.png" />
1592        </div>
1593        </body>
1594        </html>"###;
1595
1596        let doc = parse_html(TEST_INPUT);
1597        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "..");
1598        let e = doc.select_first("img").unwrap();
1599        let node = e.as_node();
1600        assert_eq!(node.element_name().unwrap(), "img");
1601        assert_eq!(
1602            node.attr_value("src").unwrap(),
1603            "http://www.example.com/images/img.png"
1604        );
1605        assert_eq!(doc.select("img").unwrap().count(), 1);
1606    }
1607
1608    #[test]
1609    fn test_replace_relative_urls_with_rel_img_src_without_base() {
1610        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1611        <title>Example Domain</title>
1612        </head>
1613        <body>
1614        <div>
1615        <img src="images/img.png" />
1616        </div>
1617        </body>
1618        </html>"###;
1619
1620        let doc = parse_html(TEST_INPUT);
1621        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
1622        let e = doc.select_first("img").unwrap();
1623        let node = e.as_node();
1624        assert_eq!(node.element_name().unwrap(), "img");
1625        assert_eq!(
1626            node.attr_value("src").unwrap(),
1627            "http://www.example.com/world/images/img.png"
1628        );
1629        assert_eq!(doc.select("img").unwrap().count(), 1);
1630    }
1631
1632    #[test]
1633    fn test_replace_relative_urls_with_abs_img_src() {
1634        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1635        <title>Example Domain</title>
1636        </head>
1637        <body>
1638        <div>
1639        <img src="https://google.com/images/img.png" />
1640        </div>
1641        </body>
1642        </html>"###;
1643
1644        let doc = parse_html(TEST_INPUT);
1645        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
1646        let e = doc.select_first("img").unwrap();
1647        let node = e.as_node();
1648        assert_eq!(node.element_name().unwrap(), "img");
1649        assert_eq!(
1650            node.attr_value("src").unwrap(),
1651            "https://google.com/images/img.png"
1652        );
1653        assert_eq!(doc.select("img").unwrap().count(), 1);
1654    }
1655
1656    #[test]
1657    fn test_replace_relative_urls_with_hash_link_to_self() {
1658        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1659        <title>Example Domain</title>
1660        </head>
1661        <body>
1662        <div>
1663        <a href="#self_id" id="self_id">Self ID</a>
1664        </div>
1665        </body>
1666        </html>"###;
1667
1668        let doc = parse_html(TEST_INPUT);
1669        assert_eq!(
1670            doc.select_first("#self_id")
1671                .unwrap()
1672                .as_node()
1673                .element_name()
1674                .unwrap(),
1675            "a"
1676        );
1677        replace_relative_urls_with_absolute(&doc, "http://www.example.com", "");
1678        assert_eq!(
1679            doc.select_first("#self_id")
1680                .unwrap()
1681                .as_node()
1682                .element_name()
1683                .unwrap(),
1684            "span"
1685        );
1686        assert_eq!(doc.select("a").unwrap().count(), 0);
1687    }
1688
1689    #[test]
1690    fn test_replace_relative_urls_with_hash_link_to_js() {
1691        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1692        <title>Example Domain</title>
1693        </head>
1694        <body>
1695        <div>
1696        <a href="javascript:" id="js_link">JS Link</a>
1697        </div>
1698        </body>
1699        </html>"###;
1700
1701        let doc = parse_html(TEST_INPUT);
1702        assert_eq!(
1703            doc.select_first("#js_link")
1704                .unwrap()
1705                .as_node()
1706                .element_name()
1707                .unwrap(),
1708            "a"
1709        );
1710        replace_relative_urls_with_absolute(&doc, "http://www.example.com", "");
1711        assert_eq!(
1712            doc.select_first("#js_link")
1713                .unwrap()
1714                .as_node()
1715                .element_name()
1716                .unwrap(),
1717            "span"
1718        );
1719        assert_eq!(doc.select("a").unwrap().count(), 0);
1720    }
1721
1722    #[test]
1723    fn test_replace_relative_urls_with_hash_link_to_other() {
1724        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1725        <title>Example Domain</title>
1726        </head>
1727        <body>
1728        <div>
1729        <a href="#sib_id" id="other_id">Self ID</a>
1730        </div>
1731        </body>
1732        </html>"###;
1733
1734        let doc = parse_html(TEST_INPUT);
1735        assert_eq!(
1736            doc.select_first("#other_id")
1737                .unwrap()
1738                .as_node()
1739                .element_name()
1740                .unwrap(),
1741            "a"
1742        );
1743        replace_relative_urls_with_absolute(&doc, "http://www.example.com", "");
1744        let e = doc.select_first("#other_id").unwrap();
1745        let node = e.as_node();
1746        assert_eq!(node.element_name().unwrap(), "a");
1747        assert_eq!(node.attr_value("href").unwrap(), "#sib_id");
1748        assert_eq!(doc.select("a").unwrap().count(), 1);
1749    }
1750
1751    #[test]
1752    fn test_replace_relative_urls_with_rel_link_without_base_and_with_hash() {
1753        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1754        <title>Example Domain</title>
1755        </head>
1756        <body>
1757        <div>
1758        <a href="hello_world#hash" id="hello">Self ID</a>
1759        </div>
1760        </body>
1761        </html>"###;
1762
1763        let doc = parse_html(TEST_INPUT);
1764        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
1765        let e = doc.select_first("#hello").unwrap();
1766        let node = e.as_node();
1767        assert_eq!(node.element_name().unwrap(), "a");
1768        assert_eq!(
1769            node.attr_value("href").unwrap(),
1770            "http://www.example.com/world/hello_world#hash"
1771        );
1772        assert_eq!(doc.select("a").unwrap().count(), 1);
1773    }
1774
1775    #[test]
1776    fn test_replace_relative_urls_with_rel_link_with_base_and_hash() {
1777        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1778        <title>Example Domain</title>
1779        </head>
1780        <body>
1781        <div>
1782        <a href="hello_world#hash" id="hello">Self ID</a>
1783        </div>
1784        </body>
1785        </html>"###;
1786
1787        let doc = parse_html(TEST_INPUT);
1788        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "../");
1789        let e = doc.select_first("#hello").unwrap();
1790        let node = e.as_node();
1791        assert_eq!(node.element_name().unwrap(), "a");
1792        assert_eq!(
1793            node.attr_value("href").unwrap(),
1794            "http://www.example.com/hello_world#hash"
1795        );
1796        assert_eq!(doc.select("a").unwrap().count(), 1);
1797    }
1798
1799    #[test]
1800    fn test_replace_relative_urls_with_rel_link_with_base() {
1801        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1802        <title>Example Domain</title>
1803        </head>
1804        <body>
1805        <div>
1806        <a href="hello_world" id="hello">Self ID</a>
1807        </div>
1808        </body>
1809        </html>"###;
1810
1811        let doc = parse_html(TEST_INPUT);
1812        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "../");
1813        let e = doc.select_first("#hello").unwrap();
1814        let node = e.as_node();
1815        assert_eq!(node.element_name().unwrap(), "a");
1816        assert_eq!(
1817            node.attr_value("href").unwrap(),
1818            "http://www.example.com/hello_world"
1819        );
1820        assert_eq!(doc.select("a").unwrap().count(), 1);
1821    }
1822
1823    #[test]
1824    fn test_replace_relative_urls_with_rel_link_without_base() {
1825        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1826        <title>Example Domain</title>
1827        </head>
1828        <body>
1829        <div>
1830        <a href="hello_world" id="hello">Self ID</a>
1831        </div>
1832        </body>
1833        </html>"###;
1834
1835        let doc = parse_html(TEST_INPUT);
1836        replace_relative_urls_with_absolute(&doc, "http://www.example.com/world/", "");
1837        let e = doc.select_first("#hello").unwrap();
1838        let node = e.as_node();
1839        assert_eq!(node.element_name().unwrap(), "a");
1840        assert_eq!(
1841            node.attr_value("href").unwrap(),
1842            "http://www.example.com/world/hello_world"
1843        );
1844        assert_eq!(doc.select("a").unwrap().count(), 1);
1845    }
1846
1847    #[test]
1848    fn test_word_count() {
1849        assert_eq!(word_count("Hello World      Another word"), 4);
1850        assert_eq!(word_count("Hello ."), 2);
1851    }
1852
1853    #[test]
1854    fn test_apply_with_valid_selector() {
1855        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1856<title>Example Domain</title>
1857</head>
1858<body>
1859<div id="that_node" background="black" border="1px">
1860<table height="100" width="100" style="width:100%">
1861<tr><th>Firstname</th>
1862<p align="center" style="border:1px solid;">Text Here</p>
1863<p>Another P</p>
1864</tr><tr><td>Jill</td></tr>
1865</table>
1866</div>
1867</body>
1868</html>"###;
1869        use std::sync::atomic::{AtomicUsize, Ordering};
1870
1871        let doc = parse_html(TEST_INPUT);
1872        let counter: AtomicUsize = AtomicUsize::new(0);
1873        apply(&doc, &["p", "tr"], |_, s| {
1874            assert!(s == "p" || s == "tr");
1875            counter.fetch_add(1, Ordering::Relaxed);
1876        });
1877        assert_eq!(4, counter.load(Ordering::Relaxed));
1878    }
1879
1880    #[test]
1881    fn test_apply_with_invalid_selector() {
1882        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1883<title>Example Domain</title>
1884</head>
1885<body>
1886<div id="that_node" background="black" border="1px">
1887<table height="100" width="100" style="width:100%">
1888<tr><th>Firstname</th>
1889<p align="center" style="border:1px solid;">Text Here</p>
1890<p>Another P</p>
1891</tr><tr><td>Jill</td></tr>
1892</table>
1893</div>
1894</body>
1895</html>"###;
1896        use std::sync::atomic::{AtomicUsize, Ordering};
1897
1898        let doc = parse_html(TEST_INPUT);
1899        let counter: AtomicUsize = AtomicUsize::new(0);
1900        apply(&doc, &["p", "-123"], |_, s| {
1901            assert!(s == "p");
1902            counter.fetch_add(1, Ordering::Relaxed);
1903        });
1904        assert_eq!(2, counter.load(Ordering::Relaxed));
1905    }
1906
1907    #[test]
1908    fn test_resolve_url_with_normal_base_and_relative() {
1909        let result = to_absolute_uri("index.html", "http://example.com", "");
1910        assert_eq!(result, "http://example.com/index.html");
1911    }
1912
1913    #[test]
1914    fn test_resolve_url_with_normal_base_as_file_url_and_relative() {
1915        let result = to_absolute_uri("foo/bar/index.html", "http://fakehost/test/page.html", "");
1916        assert_eq!(result, "http://fakehost/test/foo/bar/index.html");
1917    }
1918
1919    #[test]
1920    fn test_resolve_url_with_base_trailing_slash_and_normal_relative() {
1921        let result = to_absolute_uri("index.html", "http://example.com/", "");
1922        assert_eq!(result, "http://example.com/index.html");
1923    }
1924
1925    #[test]
1926    fn test_resolve_url_with_normal_base_and_relative_starting_with_slash() {
1927        let result = to_absolute_uri("/index.html", "http://example.com", "");
1928        assert_eq!(result, "http://example.com/index.html");
1929    }
1930
1931    #[test]
1932    fn test_resolve_url_with_base_trailing_slash_and_relative_starting_with_slash() {
1933        let result = to_absolute_uri("/index.html", "http://example.com/", "");
1934        assert_eq!(result, "http://example.com/index.html");
1935    }
1936
1937    #[test]
1938    fn test_resolve_url_with_full_url() {
1939        let result = to_absolute_uri("http://example.com/index.html", "http://example.com/", "");
1940        assert_eq!(result, "http://example.com/index.html");
1941    }
1942
1943    #[test]
1944    fn test_rename_tag_with_selector_with_by_id_selector() {
1945        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1946<title>Example Domain</title>
1947</head>
1948<body>
1949<div>foo <p id="rename_it"><br>bar<br> <br><br>abc</p></div>
1950</body>
1951</html>"###;
1952        let doc = parse_html(TEST_INPUT);
1953        assert_eq!(count_elements(&doc, "br"), 4);
1954        assert_eq!(count_elements(&doc, "p"), 1);
1955        assert_eq!(count_elements(&doc, "div"), 1);
1956        let body = doc.select("body").unwrap().next().unwrap();
1957        let n = body.as_node();
1958        rename_tags_with_selector(n, "#rename_it", "div");
1959        assert_eq!(count_elements(&doc, "br"), 4);
1960        assert_eq!(count_elements(&doc, "p"), 0);
1961        assert_eq!(count_elements(&doc, "div"), 2);
1962    }
1963
1964    #[test]
1965    fn test_rename_tag_with_selector_with_by_class_selector() {
1966        const TEST_INPUT: &str = r###"<!doctype html><html><head>
1967<title>Example Domain</title>
1968</head>
1969<body>
1970<div>foo <p class="rename_it"><br>bar<br> <br><br>abc</p>
1971<p class="rename_it">
1972nothing special in here
1973</p>
1974</div>
1975</body>
1976</html>"###;
1977        let doc = parse_html(TEST_INPUT);
1978        assert_eq!(count_elements(&doc, "br"), 4);
1979        assert_eq!(count_elements(&doc, "p"), 2);
1980        assert_eq!(count_elements(&doc, "div"), 1);
1981        let body = doc.select("body").unwrap().next().unwrap();
1982        let n = body.as_node();
1983        rename_tags_with_selector(n, ".rename_it", "div");
1984        assert_eq!(count_elements(&doc, "br"), 4);
1985        assert_eq!(count_elements(&doc, "p"), 0);
1986        assert_eq!(count_elements(&doc, "div"), 3);
1987    }
1988
1989    #[test]
1990    fn test_link_to_itself_with_postivie_absolute_url() {
1991        let n = new_html_element("a");
1992        n.as_element()
1993            .unwrap()
1994            .attributes
1995            .borrow_mut()
1996            .insert("id", String::from("content"));
1997        let href = "http://www.something.com/#content";
1998        let doc_uri = "http://www.something.com/";
1999        assert!(link_to_itself(&n, href, doc_uri));
2000    }
2001
2002    fn link_to_itself(node: &NodeRef, href: &str, doc_uri: &str) -> bool {
2003        if !href.starts_with('#') && !href.starts_with(doc_uri) {
2004            return false;
2005        }
2006        if let Some(id) = node.attr_value("id") {
2007            if href.is_empty()
2008                || id == href[1..]
2009                || (href.starts_with(doc_uri)
2010            // account for the hash
2011            && href.len() > doc_uri.len() + 1
2012            && id == href[doc_uri.len() + 1..])
2013            {
2014                return true;
2015            }
2016        }
2017
2018        false
2019    }
2020}