pub mod language;
pub(crate) mod lru;
pub(crate) mod regex_patterns;
pub(crate) mod text;
pub(crate) mod url;
use std::path::Path;
use unicode_normalization::UnicodeNormalization;
use crate::dom::{Document, NodeId};
/// Normalizes a piece of text for comparison or display.
///
/// The steps, in order:
/// 1. every soft hyphen (U+00AD) is removed;
/// 2. leading/trailing whitespace is dropped and each inner run of
///    whitespace is collapsed to a single ASCII space;
/// 3. the result is converted to Unicode NFC.
pub fn trim(s: &str) -> String {
    // Soft hyphens must go first: they are not whitespace, so a token made
    // only of soft hyphens has to vanish before the words are split.
    let cleaned = s.replace('\u{00AD}', "");

    let mut collapsed = String::with_capacity(cleaned.len());
    for word in cleaned.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }

    collapsed.nfc().collect()
}
/// Counts the whitespace-separated words in `s`.
///
/// Leading, trailing and repeated whitespace never produce empty words, so an
/// empty or all-whitespace string yields `0`.
pub(crate) fn str_word_count(s: &str) -> usize {
    let words = s.split_whitespace();
    words.count()
}
/// Returns the first non-empty string in `args`.
///
/// Falls back to `""` when `args` is empty or contains only empty strings.
pub(crate) fn str_or<'a>(args: &[&'a str]) -> &'a str {
    for &candidate in args {
        if !candidate.is_empty() {
            return candidate;
        }
    }
    ""
}
/// Reports whether element `id` of `doc` has an attribute whose value is
/// accepted by [`is_image_file`].
///
/// The attributes `src`, `data-src` and `data-srcset` are tested first; after
/// that every attribute whose name starts with `data-src` is tested. The
/// function returns `true` on the first match and `false` when none matches.
pub(crate) fn is_image_element(doc: &Document, id: NodeId) -> bool {
    const DIRECT_ATTRS: [&str; 3] = ["src", "data-src", "data-srcset"];

    let direct_hit = DIRECT_ATTRS
        .iter()
        .any(|&name| matches!(doc.get_attribute(id, name), Some(value) if is_image_file(&value)));
    if direct_hit {
        return true;
    }

    // Catch-all for other `data-src*` attribute names present on the element.
    doc.attribute_names(id).into_iter().any(|name| {
        name.starts_with("data-src")
            && matches!(doc.get_attribute(id, &name), Some(value) if is_image_file(&value))
    })
}
/// Returns `true` when `src` (a file path or URL) ends in a known image
/// file extension.
///
/// Everything from the first `?` (query string) or `#` (fragment) onward is
/// ignored, so `photo.webp?size=large` and `sprite.svg#icon` are both
/// recognised, while `manual.pdf#figure.png` is not. The extension comparison
/// is case-insensitive. An empty string, or a path without an extension,
/// yields `false`.
pub(crate) fn is_image_file(src: &str) -> bool {
    // Cut at whichever of `?` / `#` comes first; neither belongs to the path,
    // and leaving a fragment attached would corrupt the parsed extension.
    let path_part = src.split(['?', '#']).next().unwrap_or(src);

    let ext = Path::new(path_part)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");

    // All accepted extensions are ASCII, so ASCII case folding is sufficient.
    matches!(
        ext.to_ascii_lowercase().as_str(),
        "jpg" | "jpeg" | "png" | "gif" | "webp" | "svg" | "bmp" | "avif" | "tiff" | "tif" | "ico"
    )
}
/// Splits each input into list entries and returns the distinct entries in
/// order of first appearance.
///
/// For every input string the delimiter is `;` when it contains strictly more
/// semicolons than commas, otherwise `,`. Each entry is passed through
/// [`trim`], then has all `"` and `'` characters removed; entries that end up
/// empty are skipped. Duplicates are detected across all inputs.
pub(crate) fn uniquify_lists(inputs: &[&str]) -> Vec<String> {
    let mut unique = Vec::new();
    let mut known = std::collections::HashSet::new();

    for list in inputs.iter().copied() {
        let semicolons = list.matches(';').count();
        let commas = list.matches(',').count();
        let delimiter = if semicolons > commas { ';' } else { ',' };

        let cleaned = list
            .split(delimiter)
            .map(|raw| trim(raw).replace(['"', '\''], ""))
            .filter(|item| !item.is_empty());

        for item in cleaned {
            // `insert` returns false for an entry that was already recorded.
            if known.insert(item.clone()) {
                unique.push(item);
            }
        }
    }

    unique
}
/// Decodes a limited set of HTML character references in `s`.
///
/// Supported references:
/// - named: `amp`, `lt`, `gt`, `quot`, `apos`, `nbsp`, `ndash`, `mdash`,
///   `hellip`, `laquo`, `raquo`, `copy`, `reg`, `trade`;
/// - numeric: decimal (`&#65;`) and hexadecimal (`&#x41;` / `&#X41;`) values
///   that map to a valid `char`.
///
/// After an `&`, the reference name is read up to a `;`, up to the first
/// character that is neither alphanumeric nor `#`, or until more than 12
/// bytes have been collected. A recognised name is decoded even when the
/// terminating `;` is missing. Anything unrecognised is copied to the output
/// exactly as it appeared in the input.
pub(crate) fn unescape_html(s: &str) -> String {
    // Replacement text for a supported entity name (given without `&` / `;`).
    fn named(name: &str) -> Option<&'static str> {
        Some(match name {
            "amp" => "&",
            "lt" => "<",
            "gt" => ">",
            "quot" => "\"",
            "apos" => "'",
            "nbsp" => "\u{00A0}",
            "ndash" => "\u{2013}",
            "mdash" => "\u{2014}",
            "hellip" => "\u{2026}",
            "laquo" => "\u{00AB}",
            "raquo" => "\u{00BB}",
            "copy" => "\u{00A9}",
            "reg" => "\u{00AE}",
            "trade" => "\u{2122}",
            _ => return None,
        })
    }

    // Character for a numeric reference body such as `#65` or `#x41`.
    fn numeric(name: &str) -> Option<char> {
        let digits = name.strip_prefix('#')?;
        let code = match digits.strip_prefix(['x', 'X']) {
            Some(hex) => u32::from_str_radix(hex, 16),
            None => digits.parse::<u32>(),
        };
        code.ok().and_then(char::from_u32)
    }

    // Fast path: without an ampersand there is nothing to decode.
    if !s.contains('&') {
        return s.to_owned();
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s.chars().peekable();

    while let Some(current) = rest.next() {
        if current != '&' {
            out.push(current);
            continue;
        }

        // Collect the candidate reference name that follows the ampersand.
        let mut name = String::new();
        let mut terminated = false;
        while let Some(&next) = rest.peek() {
            if next == ';' {
                rest.next();
                terminated = true;
                break;
            }
            let is_name_char = next.is_alphanumeric() || next == '#';
            if name.len() > 12 || !is_name_char {
                // Leave `next` in the stream; it is handled as ordinary text.
                break;
            }
            name.push(next);
            rest.next();
        }

        if let Some(text) = named(&name) {
            out.push_str(text);
        } else if let Some(decoded) = numeric(&name) {
            out.push(decoded);
        } else {
            // Not a reference we know: reproduce exactly what was consumed.
            out.push('&');
            out.push_str(&name);
            if terminated {
                out.push(';');
            }
        }
    }

    out
}
/// Returns `s` with every character from a fixed set of code point ranges
/// removed.
///
/// The stripped ranges are U+2600–U+27BF, U+2900–U+2BFF, U+FE00–U+FE0F,
/// U+1F000–U+1FFFF and U+E0000–U+E007F. All other characters are kept
/// unchanged, including any whitespace that surrounded a removed character.
pub(crate) fn remove_emojis(s: &str) -> String {
    // True for characters that fall in one of the stripped ranges.
    fn is_stripped(c: char) -> bool {
        matches!(
            c,
            '\u{2600}'..='\u{27BF}'
                | '\u{2900}'..='\u{2BFF}'
                | '\u{FE00}'..='\u{FE0F}'
                | '\u{1F000}'..='\u{1FFFF}'
                | '\u{E0000}'..='\u{E007F}'
        )
    }

    s.chars().filter(|&c| !is_stripped(c)).collect()
}
// Unit tests for the string and HTML helpers defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_trim() {
        assert_eq!(trim(" hello world "), "hello world");
        assert_eq!(trim("single"), "single");
        assert_eq!(trim(""), "");
        assert_eq!(trim(" "), "");
        assert_eq!(trim("a b\tc"), "a b c");
    }

    #[test]
    fn test_str_word_count() {
        assert_eq!(str_word_count("hello world"), 2);
        assert_eq!(str_word_count(" one two three "), 3);
        assert_eq!(str_word_count(""), 0);
        assert_eq!(str_word_count("single"), 1);
    }

    #[test]
    fn test_str_or() {
        assert_eq!(str_or(&["", "second", "third"]), "second");
        assert_eq!(str_or(&["first", "second"]), "first");
        assert_eq!(str_or(&["", ""]), "");
        assert_eq!(str_or(&[]), "");
    }

    #[test]
    fn test_is_image_file() {
        assert!(is_image_file("photo.jpg"));
        assert!(is_image_file("image.PNG"));
        assert!(is_image_file(
            "https://cdn.example.com/img/photo.webp?size=large"
        ));
        assert!(!is_image_file("document.pdf"));
        assert!(!is_image_file("script.js"));
        assert!(!is_image_file(""));
        assert!(!is_image_file("noextension"));
    }

    #[test]
    fn test_uniquify_lists() {
        let result = uniquify_lists(&["one, two, three", "two, four"]);
        assert!(result.contains(&"one".to_string()));
        assert!(result.contains(&"two".to_string()));
        assert!(result.contains(&"three".to_string()));
        assert!(result.contains(&"four".to_string()));
        assert_eq!(result.iter().filter(|&s| s == "two").count(), 1);
    }

    #[test]
    fn test_uniquify_lists_semicolon() {
        let result = uniquify_lists(&["alpha; beta; gamma"]);
        assert_eq!(result.len(), 3);
    }

    #[test]
    fn test_uniquify_lists_strips_quotes() {
        let result = uniquify_lists(&[r#""rust", "python""#]);
        assert!(result.contains(&"rust".to_string()));
        assert!(result.contains(&"python".to_string()));
    }

    // Inputs must be entity-encoded; otherwise nothing is being decoded.
    #[test]
    fn test_unescape_html_common_entities() {
        assert_eq!(unescape_html("&amp;"), "&");
        assert_eq!(unescape_html("&lt;tag&gt;"), "<tag>");
        assert_eq!(unescape_html("&quot;hello&quot;"), "\"hello\"");
        assert_eq!(unescape_html("A &amp; B"), "A & B");
        assert_eq!(unescape_html("no entities"), "no entities");
    }

    // Decimal, hexadecimal and beyond-BMP numeric character references.
    #[test]
    fn test_unescape_html_numeric_refs() {
        assert_eq!(unescape_html("&#65;"), "A");
        assert_eq!(unescape_html("&#x41;"), "A");
        assert_eq!(unescape_html("&#x1F600;"), "\u{1F600}");
    }

    // An over-long unknown entity name must not swallow or alter later text.
    #[test]
    fn test_unescape_html_long_entity_no_corruption() {
        let input = "&verylongentityname; rest";
        let result = unescape_html(input);
        assert!(result.ends_with(" rest"), "got: {result}");
        assert!(result.contains('&'), "got: {result}");
    }

    #[test]
    fn test_unescape_html_non_entity_ampersand() {
        assert_eq!(unescape_html("a & b"), "a & b");
    }

    #[test]
    fn test_remove_emojis_basic() {
        // Only the emoji is removed, so both surrounding spaces remain.
        assert_eq!(remove_emojis("hello \u{1F600} world"), "hello  world");
        assert_eq!(remove_emojis("love \u{2764}"), "love ");
        assert_eq!(remove_emojis("plain text"), "plain text");
    }

    #[test]
    fn test_remove_emojis_extended_ranges() {
        assert_eq!(remove_emojis("rating \u{2B50}"), "rating ");
        assert_eq!(remove_emojis("up \u{2B06}"), "up ");
    }
}