use crate::constants::{PHRASING_ELEMS, flags::*, regexps};
use crate::dom::{NodeDataStore, NodeStats, get_tag_name, node_select_matcher};
use crate::selectors::Selectors;
use dom_query::{Matcher, Node};
/// Computes text statistics for `node` from its whitespace-normalized inner text.
///
/// The returned `NodeStats` records the number of characters in that text, how many
/// of them are comma characters (ASCII `,` plus several Unicode comma variants), and
/// whether `has_sentence_end` reports a sentence ending.
pub fn compute_node_stats(node: &Node<'_>) -> NodeStats {
    let inner = get_inner_text(node, true);
    // One pass over the characters, carrying (total characters, commas) as the state.
    let (text_length, comma_count) =
        inner
            .chars()
            .fold((0usize, 0usize), |(chars_seen, commas_seen), ch| {
                let is_comma = matches!(
                    ch,
                    ',' | '\u{060C}'
                        | '\u{FE50}'
                        | '\u{FE10}'
                        | '\u{FE11}'
                        | '\u{2E41}'
                        | '\u{2E34}'
                        | '\u{2E32}'
                        | '\u{FF0C}'
                );
                (chars_seen + 1, commas_seen + usize::from(is_comma))
            });
    NodeStats {
        text_length,
        comma_count,
        has_sentence_end: has_sentence_end(&inner),
    }
}
/// Computes the same statistics as `compute_node_stats` and also hands back the
/// whitespace-normalized inner text they were derived from.
///
/// Returns `(stats, text)`; `stats.text_length` is the character count of `text`.
pub fn compute_node_stats_with_text(node: &Node<'_>) -> (NodeStats, String) {
    // Characters counted as commas: ASCII `,` plus several Unicode comma variants.
    const COMMAS: [char; 9] = [
        ',', '\u{060C}', '\u{FE50}', '\u{FE10}', '\u{FE11}', '\u{2E41}', '\u{2E34}', '\u{2E32}',
        '\u{FF0C}',
    ];

    let text = get_inner_text(node, true);
    let text_length = text.chars().count();
    let comma_count = text.chars().filter(|ch| COMMAS.contains(ch)).count();
    let has_sentence_end = has_sentence_end(&text);

    (
        NodeStats {
            text_length,
            comma_count,
            has_sentence_end,
        },
        text,
    )
}
/// Reports whether `text` contains a full stop that closes a sentence: a `.` that is
/// either the very last byte of the text or is immediately followed by an ASCII space.
fn has_sentence_end(text: &str) -> bool {
    text.ends_with('.') || text.contains(". ")
}
/// Returns `true` when `s` starts with `#` and has at least one more byte after it,
/// i.e. it looks like a non-empty fragment-only URL.
#[inline]
fn is_hash_url(s: &str) -> bool {
    s.strip_prefix('#')
        .is_some_and(|fragment| !fragment.is_empty())
}
/// Returns the statistics stored for `node` in `store`, computing and storing them
/// first when no entry exists yet.
pub fn get_or_compute_stats(node: &Node<'_>, store: &mut NodeDataStore) -> NodeStats {
    match store.get_stats(&node.id).copied() {
        Some(cached) => cached,
        None => {
            let fresh = compute_node_stats(node);
            store.set_stats(node.id, fresh);
            fresh
        }
    }
}
/// Returns the statistics and normalized inner text for `node`, using `store` as a
/// cache for both.
///
/// Three cases are handled:
/// - stats and text are both stored: both are returned as-is (the text is copied);
/// - only stats are stored: the text is recomputed and stored, the stats are reused;
/// - nothing is stored: both are computed together and stored.
pub fn get_or_compute_stats_with_text(
    node: &Node<'_>,
    store: &mut NodeDataStore,
) -> (NodeStats, String) {
    let Some(stats) = store.get_stats(&node.id).copied() else {
        // Cold path: nothing stored for this node yet.
        let (fresh_stats, fresh_text) = compute_node_stats_with_text(node);
        store.set_stats(node.id, fresh_stats);
        store.set_text(node.id, fresh_text.clone());
        return (fresh_stats, fresh_text);
    };

    if let Some(cached_text) = store.get_text(&node.id) {
        return (stats, cached_text.to_string());
    }

    // Stats were stored without their text; fill in the missing half.
    let recomputed = get_inner_text(node, true);
    store.set_text(node.id, recomputed.clone());
    (stats, recomputed)
}
/// Builds the starting `ReadabilityData` for `node`.
///
/// The score is a per-tag base value (`DIV` +5; `PRE`/`TD`/`BLOCKQUOTE` +3; list,
/// address and form tags -3; headings and `TH` -5; anything else 0) plus the result
/// of `get_class_weight(node, flags)`.
pub fn compute_initial_readability_data(
    node: &Node<'_>,
    flags: u32,
) -> crate::dom::ReadabilityData {
    let tag = get_tag_name(node);
    let base_score = match tag.as_deref() {
        Some("DIV") => 5.0,
        Some("PRE" | "TD" | "BLOCKQUOTE") => 3.0,
        Some("ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM") => -3.0,
        Some("H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH") => -5.0,
        _ => 0.0,
    };
    let weight_bonus = f64::from(get_class_weight(node, flags));
    crate::dom::ReadabilityData::with_score(base_score + weight_bonus)
}
/// Computes the initial readability data for `node` and stores it in `store` under
/// the node's id.
pub fn initialize_node(node: &Node<'_>, store: &mut NodeDataStore, flags: u32) {
    let initial_data = compute_initial_readability_data(node, flags);
    store.set(node.id, initial_data);
}
/// Scores the `class` and `id` attributes of `node` against `regexps::CLASS_WEIGHT_SET`.
///
/// Returns 0 when `FLAG_WEIGHT_CLASSES` is not set in `flags`. Otherwise each
/// non-empty attribute contributes -25 if pattern 0 of the set matches and +25 if
/// pattern 1 matches; both may apply to the same attribute.
/// NOTE(review): pattern 0 is presumably the "negative" list and pattern 1 the
/// "positive" one; confirm against the regex set definition.
pub fn get_class_weight(node: &Node<'_>, flags: u32) -> i32 {
    if (flags & FLAG_WEIGHT_CLASSES) == 0 {
        return 0;
    }

    let mut total: i32 = 0;
    for attr_name in ["class", "id"] {
        let Some(attr) = node.attr(attr_name) else {
            continue;
        };
        let value = attr.as_ref();
        if value.is_empty() {
            continue;
        }
        let hits = regexps::CLASS_WEIGHT_SET.matches(value);
        if hits.matched(0) {
            total -= 25;
        }
        if hits.matched(1) {
            total += 25;
        }
    }
    total
}
/// Returns the text of `node` with leading and trailing whitespace trimmed.
///
/// When `normalize_spaces` is `true` the trimmed text is additionally passed through
/// `normalize_whitespace`.
pub fn get_inner_text(node: &Node<'_>, normalize_spaces: bool) -> String {
    let raw = node.text();
    let core = raw.trim();
    if !normalize_spaces {
        return String::from(core);
    }
    normalize_whitespace(core)
}
/// Collapses every run of whitespace in `s` into a single ASCII space.
///
/// Whitespace is decided by [`char::is_whitespace`], so Unicode spaces (for example
/// U+00A0 NO-BREAK SPACE) are handled exactly like ASCII ones. Leading and trailing
/// whitespace is collapsed to one space, not removed.
fn normalize_whitespace(s: &str) -> String {
    // Fast path: the input is already normalized when every whitespace character is
    // a lone ASCII space. This check deliberately uses the same whitespace definition
    // as the rewrite loop below, so the output never depends on which path ran.
    let mut after_ws = false;
    let needs_normalize = s.chars().any(|c| {
        let ws = c.is_whitespace();
        let offending = ws && (after_ws || c != ' ');
        after_ws = ws;
        offending
    });
    if !needs_normalize {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut prev_ws = false;
    for c in s.chars() {
        if c.is_whitespace() {
            // Emit one space for the first character of a run, nothing for the rest.
            if !prev_ws {
                result.push(' ');
            }
            prev_ws = true;
        } else {
            result.push(c);
            prev_ws = false;
        }
    }
    result
}
/// Computes the ratio of link text to total text for `node`.
///
/// The denominator is the character count of `node_text` when supplied, otherwise
/// of the node's normalized inner text; a zero count yields `0.0`. The numerator
/// sums the character count of every element selected by `selectors.a`, where links
/// whose `href` satisfies `is_hash_url` are weighted by 0.3 and all others by 1.0.
pub fn get_link_density_with_text(
    node: &Node<'_>,
    node_text: Option<&str>,
    selectors: &Selectors,
) -> f64 {
    let total_chars = node_text.map_or_else(
        || get_inner_text(node, true).chars().count(),
        |given| given.chars().count(),
    );
    if total_chars == 0 {
        return 0.0;
    }

    let weighted_link_chars = node_select_matcher(node, &selectors.a)
        .nodes()
        .iter()
        .fold(0.0, |acc, anchor| {
            let is_fragment_link = anchor
                .attr("href")
                .is_some_and(|href| is_hash_url(href.as_ref()));
            let weight = if is_fragment_link { 0.3 } else { 1.0 };
            acc + get_inner_text(anchor, true).chars().count() as f64 * weight
        });

    weighted_link_chars / total_chars as f64
}
/// Computes the link density of `node` without any precomputed text; the node's
/// inner text is derived by `get_link_density_with_text`.
pub fn get_link_density(node: &Node<'_>, selectors: &Selectors) -> f64 {
    let precomputed_text: Option<&str> = None;
    get_link_density_with_text(node, precomputed_text, selectors)
}
/// Computes link density for `node` against a caller-supplied `parent_text_length`,
/// taking each link's text length from `store` via `get_or_compute_stats`.
///
/// Returns `0.0` when `parent_text_length` is zero. Links whose `href` satisfies
/// `is_hash_url` are weighted by 0.3, all others by 1.0.
pub fn get_link_density_cached(
    node: &Node<'_>,
    parent_text_length: usize,
    store: &mut NodeDataStore,
    selectors: &Selectors,
) -> f64 {
    if parent_text_length == 0 {
        return 0.0;
    }

    let mut weighted_link_chars = 0.0;
    for anchor in node_select_matcher(node, &selectors.a).nodes().iter() {
        let anchor_chars = get_or_compute_stats(anchor, store).text_length;
        let is_fragment_link = anchor
            .attr("href")
            .is_some_and(|href| is_hash_url(href.as_ref()));
        let weight = if is_fragment_link { 0.3 } else { 1.0 };
        weighted_link_chars += anchor_chars as f64 * weight;
    }

    weighted_link_chars / parent_text_length as f64
}
/// Returns the summed text length of every node under `node` selected by `matcher`,
/// divided by `parent_text_length`.
///
/// Per-node lengths come from `store` via `get_or_compute_stats`. Returns `0.0` when
/// `parent_text_length` is zero.
pub fn get_text_density_cached(
    node: &Node<'_>,
    parent_text_length: usize,
    matcher: &Matcher,
    store: &mut NodeDataStore,
) -> f64 {
    if parent_text_length == 0 {
        return 0.0;
    }

    let matched_chars: usize = node_select_matcher(node, matcher)
        .nodes()
        .iter()
        .map(|matched| get_or_compute_stats(matched, store).text_length)
        .sum();

    matched_chars as f64 / parent_text_length as f64
}
/// Reports whether `node` counts as whitespace: a text node whose text is empty
/// after trimming, or an element whose tag name is `BR`.
pub fn is_whitespace(node: &Node<'_>) -> bool {
    if node.is_text() {
        return node.text().trim().is_empty();
    }
    node.is_element() && get_tag_name(node).is_some_and(|tag| tag == "BR")
}
/// Reports whether `node` is phrasing content, starting the depth-limited check in
/// `is_phrasing_content_depth` at depth 0.
pub fn is_phrasing_content(node: &Node<'_>) -> bool {
    const ROOT_DEPTH: u32 = 0;
    is_phrasing_content_depth(node, ROOT_DEPTH)
}
/// Depth-tracking worker behind `is_phrasing_content`.
///
/// A node is phrasing content when it is a text node, when its tag is listed in
/// `PHRASING_ELEMS`, or when it is an `A`, `DEL` or `INS` element at `depth < 10`
/// whose children are all phrasing content themselves. Nodes without a tag name
/// that are not text are not phrasing content.
fn is_phrasing_content_depth(node: &Node<'_>, depth: u32) -> bool {
    if node.is_text() {
        return true;
    }
    let Some(tag) = get_tag_name(node) else {
        return false;
    };
    if PHRASING_ELEMS.contains(&*tag) {
        return true;
    }

    // Only these wrappers defer to their children, and only up to the depth limit.
    let defers_to_children = tag == "A" || tag == "DEL" || tag == "INS";
    if !defers_to_children || depth >= 10 {
        return false;
    }
    for child in node.children().iter() {
        if !is_phrasing_content_depth(child, depth + 1) {
            return false;
        }
    }
    true
}
/// Wraps each run of consecutive phrasing-content children of `div` in a new `p`
/// element.
///
/// The child list is captured once up front and every index below refers to that
/// snapshot. For each maximal run of children accepted by `is_phrasing_content`:
/// - a run with no content (only whitespace-only text nodes) is left untouched;
/// - otherwise nodes accepted by `is_whitespace` are trimmed from both ends of the
///   run; if anything remains, a new `p` is inserted before the first remaining
///   node, the remaining nodes are appended to it, and the trimmed-off nodes are
///   removed from their parent.
pub fn wrap_phrasing_content_in_p(div: &Node<'_>) {
// Snapshot of the children; the tree is mutated below but this list is not.
let children: Vec<_> = div.children();
let mut i = 0;
while i < children.len() {
let child = &children[i];
if is_phrasing_content(child) {
// Collect the snapshot indices of the maximal phrasing run starting at `i`.
let mut phrasing_nodes = Vec::new();
let mut j = i;
while j < children.len() && is_phrasing_content(&children[j]) {
phrasing_nodes.push(j);
j += 1;
}
// A run has content if it holds any non-text node or any non-blank text node.
let has_content = phrasing_nodes.iter().any(|&idx| {
let n = &children[idx];
if n.is_text() {
!n.text().trim().is_empty()
} else {
true
}
});
if has_content && !phrasing_nodes.is_empty() {
// Narrow [start, end) so the run neither starts nor ends with a whitespace node.
let mut start = 0;
let mut end = phrasing_nodes.len();
while start < end && is_whitespace(&children[phrasing_nodes[start]]) {
start += 1;
}
while start < end && is_whitespace(&children[phrasing_nodes[end - 1]]) {
end -= 1;
}
// `start == end` means the run was entirely whitespace nodes; nothing is wrapped.
if start < end {
let trimmed_nodes = &phrasing_nodes[start..end];
if let Some(first_node) = children.get(trimmed_nodes[0]) {
// Place the new paragraph where the run begins, then move the run into it.
let p = div.tree.new_element("p");
first_node.insert_before(&p);
for &idx in trimmed_nodes {
if let Some(n) = children.get(idx) {
p.append_child(n);
}
}
// Drop the leading and trailing whitespace nodes that were trimmed off the run.
for &idx in phrasing_nodes[..start]
.iter()
.chain(phrasing_nodes[end..].iter())
{
if let Some(n) = children.get(idx) {
n.remove_from_parent();
}
}
}
}
}
// Continue with the first child after the run.
i = j;
} else {
i += 1;
}
}
}
/// Reports whether `node` is an element that carries no content.
///
/// That is the case when its text is empty after trimming and it either has no
/// element children, or its number of element children equals the number of nodes
/// selected by `selectors.br` plus those selected by `selectors.hr`.
pub fn is_element_without_content(node: &Node<'_>, selectors: &Selectors) -> bool {
    if !node.is_element() || !node.text().trim().is_empty() {
        return false;
    }

    let child_count = node.element_children().len();
    if child_count == 0 {
        return true;
    }

    let breaks = node_select_matcher(node, &selectors.br).length();
    let rules = node_select_matcher(node, &selectors.hr).length();
    child_count == breaks + rules
}
/// Reports whether `node` has exactly one element child, that child's tag name
/// equals `tag`, and none of `node`'s direct text children ends with a
/// non-whitespace character.
pub fn has_single_tag_inside_element(node: &Node<'_>, tag: &str) -> bool {
    let element_kids = node.element_children();
    if element_kids.len() != 1 {
        return false;
    }

    let tag_matches = get_tag_name(&element_kids[0]).is_some_and(|child_tag| child_tag == tag);
    if !tag_matches {
        return false;
    }

    node.children().iter().all(|child| {
        let ends_with_visible_char = child.is_text()
            && child
                .text()
                .chars()
                .next_back()
                .is_some_and(|last| !last.is_whitespace());
        !ends_with_visible_char
    })
}
/// Reports whether any descendant of `node` has a tag name listed in
/// `DIV_TO_P_ELEMS`.
pub fn has_child_block_element(node: &Node<'_>) -> bool {
    use crate::constants::DIV_TO_P_ELEMS;

    for descendant in node.descendants_it() {
        let Some(tag) = get_tag_name(&descendant) else {
            continue;
        };
        if DIV_TO_P_ELEMS.contains(&*tag) {
            return true;
        }
    }
    false
}
/// Heuristically decides whether `node` is visible, based only on its attributes.
///
/// A node is treated as hidden when:
/// - its `style` attribute contains `display:none` or `visibility:hidden`
///   (ignoring ASCII whitespace and ASCII case), or
/// - it has a `hidden` attribute, or
/// - its `aria-hidden` attribute is exactly `"true"` and its `class` attribute is
///   missing or does not contain `fallback-image`.
pub fn is_probably_visible(node: &Node<'_>) -> bool {
    let hidden_by_style = node.attr("style").is_some_and(|style| {
        let css: &str = style.as_ref();
        contains_ignore_ascii_ws_case(css, b"display:none")
            || contains_ignore_ascii_ws_case(css, b"visibility:hidden")
    });
    if hidden_by_style || node.has_attr("hidden") {
        return false;
    }

    let aria_hidden = node
        .attr("aria-hidden")
        .is_some_and(|value| value.as_ref() == "true");
    if !aria_hidden {
        return true;
    }

    // `aria-hidden="true"` hides the node unless it is marked as a fallback image.
    node.attr("class")
        .is_some_and(|class| class.as_ref().contains("fallback-image"))
}
/// Reports whether `node` looks like a usable byline.
///
/// The node must be flagged as a byline (`rel="author"`, an `itemprop` containing
/// `author`, or `match_string` matching `regexps::BYLINE`), and its trimmed text
/// must be non-empty, shorter than 400 bytes and shorter than 100 characters.
pub fn is_valid_byline(node: &Node<'_>, match_string: &str) -> bool {
    let has_author_rel = || node.attr("rel").is_some_and(|rel| rel.as_ref() == "author");
    let has_author_itemprop = || {
        node.attr("itemprop")
            .is_some_and(|ip| ip.as_ref().contains("author"))
    };
    if !(has_author_rel() || has_author_itemprop() || regexps::BYLINE.is_match(match_string)) {
        return false;
    }

    let raw = node.text();
    let candidate = raw.trim();
    // The byte-length test runs before the character count, as a cheaper first filter.
    if candidate.is_empty() || candidate.len() >= 400 {
        return false;
    }
    candidate.chars().count() < 100
}
/// Reports whether `node` is an `IMG` element, or a chain of elements that each
/// have exactly one element child and no non-blank text, ending in an `IMG`.
pub fn is_single_image(node: &Node<'_>) -> bool {
    let mut cursor = *node;
    loop {
        if get_tag_name(&cursor).is_some_and(|tag| tag == "IMG") {
            return true;
        }
        let element_kids = cursor.element_children();
        if element_kids.len() != 1 || !cursor.text().trim().is_empty() {
            return false;
        }
        // Exactly one element child: descend into it and repeat the checks.
        cursor = element_kids[0];
    }
}
/// Searches `haystack` for `needle`, skipping ASCII whitespace in the haystack and
/// ASCII-lowercasing its bytes before comparing.
///
/// `needle` is compared as given, so it should already be lowercase and free of
/// whitespace to be matchable. An empty `needle` matches any non-empty haystack.
fn contains_ignore_ascii_ws_case(haystack: &str, needle: &[u8]) -> bool {
    let bytes = haystack.as_bytes();
    // Try every start offset: the next `needle.len()` significant bytes must equal it.
    (0..bytes.len()).any(|start| {
        bytes[start..]
            .iter()
            .filter(|byte| !byte.is_ascii_whitespace())
            .map(u8::to_ascii_lowercase)
            .take(needle.len())
            .eq(needle.iter().copied())
    })
}