use scraper::{ElementRef, Element};
use url::Url;
use std::collections::HashSet;
pub const PHRASING_ELEMS: &[&str] = &[
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
"DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
"MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS",
"Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
"SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
];
pub const PRESENTATIONAL_ATTRIBUTES: &[&str] = &[
"align", "background", "bgcolor", "border", "cellpadding", "cellspacing",
"frame", "hspace", "rules", "style", "valign", "vspace"
];
pub fn to_absolute_uri(uri: &str, base_uri: &str) -> String {
if uri.starts_with('#') {
return uri.to_string();
}
match Url::parse(base_uri) {
Ok(base) => {
match base.join(uri) {
Ok(absolute_url) => absolute_url.to_string(),
Err(_) => uri.to_string(), }
}
Err(_) => uri.to_string(), }
}
pub fn is_url(text: &str) -> bool {
Url::parse(text).is_ok()
}
pub fn get_inner_text(element: &ElementRef, normalize_spaces: bool) -> String {
let text = element.text().collect::<Vec<_>>().join(" ");
if normalize_spaces {
normalize_whitespace(&text)
} else {
text
}
}
pub fn normalize_whitespace(text: &str) -> String {
let mut result = String::new();
let mut prev_was_space = false;
for ch in text.chars() {
if ch.is_whitespace() {
if !prev_was_space {
result.push(' ');
prev_was_space = true;
}
} else {
result.push(ch);
prev_was_space = false;
}
}
result.trim().to_string()
}
pub fn get_char_count(text: &str, separator: Option<char>) -> usize {
if let Some(sep) = separator {
text.matches(sep).count()
} else {
text.chars().count()
}
}
pub fn is_phrasing_content(tag_name: &str) -> bool {
PHRASING_ELEMS.contains(&tag_name.to_uppercase().as_str())
}
pub fn is_single_image(element: &ElementRef) -> bool {
let tag_name = element.value().name().to_uppercase();
if tag_name == "IMG" {
return true;
}
let children: Vec<_> = element.children().collect();
if children.len() == 1 {
if let Some(child_element) = children[0].value().as_element() {
return child_element.name().to_uppercase() == "IMG";
}
}
false
}
pub fn is_node_visible(element: &ElementRef) -> bool {
let style = element.value().attr("style").unwrap_or("");
if style.contains("display:none") || style.contains("display: none") {
return false;
}
if style.contains("visibility:hidden") || style.contains("visibility: hidden") {
return false;
}
if element.value().attr("hidden").is_some() {
return false;
}
if element.value().attr("aria-hidden") == Some("true") {
return false;
}
true
}
pub fn has_ancestor_tag(
element: &ElementRef,
tag_name: &str,
max_depth: Option<usize>,
filter_fn: Option<fn(&ElementRef) -> bool>
) -> bool {
let target_tag = tag_name.to_uppercase();
let mut current = element.parent_element();
let mut depth = 0;
while let Some(parent) = current {
if let Some(max) = max_depth {
if depth >= max {
break;
}
}
if parent.value().name().to_uppercase() == target_tag {
if let Some(filter) = filter_fn {
if filter(&parent) {
return true;
}
} else {
return true;
}
}
current = parent.parent_element();
depth += 1;
}
false
}
pub fn get_node_ancestors<'a>(element: &'a ElementRef<'a>, max_depth: usize) -> Vec<ElementRef<'a>> {
let mut ancestors = Vec::new();
let mut current = element.parent();
let mut depth = 0;
while let Some(parent) = current {
if depth >= max_depth {
break;
}
if let Some(parent_element) = ElementRef::wrap(parent) {
ancestors.push(parent_element);
current = parent.parent();
depth += 1;
} else {
break;
}
}
ancestors
}
pub fn is_element_without_content(element: &ElementRef) -> bool {
let tag_name = element.value().name().to_uppercase();
match tag_name.as_str() {
"IMG" | "VIDEO" | "AUDIO" | "EMBED" | "OBJECT" | "IFRAME" => false,
_ => {
let text_content = get_inner_text(element, true);
text_content.is_empty()
}
}
}
pub fn has_single_tag_inside_element(element: &ElementRef, tag: &str) -> bool {
let children: Vec<_> = element.children()
.filter_map(|child| child.value().as_element())
.collect();
children.len() == 1 &&
children[0].name().eq_ignore_ascii_case(tag)
}
pub fn has_child_block_element(element: &ElementRef) -> bool {
for child in element.children() {
if let Some(child_element) = child.value().as_element() {
let tag_name = child_element.name().to_uppercase();
if !is_phrasing_content(&tag_name) {
return true;
}
}
}
false
}
pub fn should_clean_attribute(attr_name: &str) -> bool {
PRESENTATIONAL_ATTRIBUTES.contains(&attr_name.to_lowercase().as_str())
}
pub fn extract_text_content(element: &ElementRef) -> String {
element.text().collect::<Vec<_>>().join(" ")
}
pub fn word_count(text: &str) -> usize {
text.split_whitespace().count()
}
pub fn is_title_candidate(text: &str, current_title: Option<&str>) -> bool {
let word_count = word_count(text);
if word_count < 2 || word_count > 10 || text.len() > 80 {
return false;
}
if let Some(title) = current_title {
let similarity = text_similarity(text, title);
similarity > 0.3 } else {
true
}
}
pub fn text_similarity(text_a: &str, text_b: &str) -> f64 {
let words_a: HashSet<&str> = text_a.split_whitespace().collect();
let words_b: HashSet<&str> = text_b.split_whitespace().collect();
if words_a.is_empty() && words_b.is_empty() {
return 1.0;
}
if words_a.is_empty() || words_b.is_empty() {
return 0.0;
}
let intersection = words_a.intersection(&words_b).count();
let union = words_a.union(&words_b).count();
intersection as f64 / union as f64
}
pub fn unescape_html_entities(text: &str) -> String {
let text = text.replace("&", "&");
text.replace("<", "<")
.replace(">", ">")
.replace(""", "\"")
.replace("'", "'")
.replace("'", "'")
}
pub fn clean_text(text: &str) -> String {
let unescaped = unescape_html_entities(text);
normalize_whitespace(&unescaped)
}
pub fn get_link_density(element: &ElementRef) -> f64 {
let total_text_length = get_inner_text(element, false).len();
if total_text_length == 0 {
return 0.0;
}
let mut link_text_length = 0;
for descendant in element.descendants() {
if let Some(descendant_element) = descendant.value().as_element() {
if descendant_element.name().eq_ignore_ascii_case("a") {
let link_element = ElementRef::wrap(descendant).unwrap();
link_text_length += get_inner_text(&link_element, false).len();
}
}
}
link_text_length as f64 / total_text_length as f64
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_normalize_whitespace() {
assert_eq!(normalize_whitespace("hello world\n\ntest"), "hello world test");
assert_eq!(normalize_whitespace(" \n\t "), "");
assert_eq!(normalize_whitespace("single"), "single");
}
#[test]
fn test_word_count() {
assert_eq!(word_count("hello world"), 2);
assert_eq!(word_count(" hello world test "), 3);
assert_eq!(word_count(""), 0);
}
#[test]
fn test_text_similarity() {
assert_eq!(text_similarity("hello world", "hello world"), 1.0);
assert!(text_similarity("hello world", "hello there") > 0.0);
assert!(text_similarity("hello world", "hello there") < 1.0);
assert_eq!(text_similarity("hello", "world"), 0.0);
assert_eq!(text_similarity("", ""), 1.0);
}
#[test]
fn test_is_url() {
assert!(is_url("https://example.com"));
assert!(is_url("http://example.com"));
assert!(!is_url("not a url"));
assert!(!is_url(""));
}
#[test]
fn test_to_absolute_uri() {
let base = "https://example.com/path/";
assert_eq!(to_absolute_uri("#anchor", base), "#anchor");
assert_eq!(to_absolute_uri("/absolute", base), "https://example.com/absolute");
assert_eq!(to_absolute_uri("relative", base), "https://example.com/path/relative");
}
#[test]
fn test_is_phrasing_content() {
assert!(is_phrasing_content("span"));
assert!(is_phrasing_content("STRONG"));
assert!(!is_phrasing_content("div"));
assert!(!is_phrasing_content("section"));
}
#[test]
fn test_unescape_html_entities() {
assert_eq!(unescape_html_entities("<div>"), "<div>");
assert_eq!(unescape_html_entities(""hello""), "\"hello\"");
assert_eq!(unescape_html_entities("&nbsp;"), " ");
}
#[test]
fn test_is_title_candidate() {
assert!(is_title_candidate("A Great Article Title", None));
assert!(!is_title_candidate("A", None)); assert!(!is_title_candidate("This is way too long to be a reasonable title for an article", None)); }
#[test]
fn test_get_char_count() {
assert_eq!(get_char_count("hello,world,test", Some(',')), 2);
assert_eq!(get_char_count("hello world", None), 11);
}
}