use once_cell::sync::Lazy;
use regex::Regex;
use scraper::Html;
use crate::selectors::LINK_SELECTOR;
/// Lazily compiled, case-insensitive pattern for email-like tokens.
///
/// Shape of a match: a local part made of ASCII letters, digits and `._%+-`, an `@`,
/// one or more dot-separated domain labels that each start and end with a letter or
/// digit (hyphens allowed in between), and a final purely alphabetic label of 2 to 24
/// letters. Both ends of the match are anchored on a word boundary.
static EMAIL_REGEX: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"(?i)\b[a-z0-9._%+-]+@[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*\.[a-z]{2,24}\b";
    // The pattern is a compile-time constant, so a failure here is a programming error.
    Regex::new(PATTERN).expect("valid regex")
});
/// Lazily compiled pattern for phone-number-like tokens in a 3-3-4 digit layout.
///
/// Shape of a match: an optional leading `1` (itself optionally prefixed with `+`),
/// a three-digit group optionally wrapped in parentheses, then a three-digit and a
/// four-digit group. Each group may be followed by a single `-`, `.` or whitespace
/// separator. The pattern carries no boundary anchors, so a match may sit inside a
/// longer run of digits.
static PHONE_REGEX: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}";
    // The pattern is a compile-time constant, so a failure here is a programming error.
    Regex::new(PATTERN).expect("valid regex")
});
/// Collects the targets of links whose `href` starts with the given URI scheme prefix.
///
/// Visits every element matched by `LINK_SELECTOR`, keeps the `href` values that begin
/// with `prefix` (for example `"mailto:"` or `"tel:"`), and returns what follows the
/// prefix with everything from the first `?` onwards removed and surrounding whitespace
/// trimmed. Links whose remaining value is empty are skipped. Results appear in the
/// order the selector yields them; duplicates are kept.
///
/// The prefix is compared ASCII case-insensitively because URI schemes are
/// case-insensitive: `MAILTO:a@b.c` yields `a@b.c` just like `mailto:a@b.c`.
fn extract_with_prefix(doc: &Html, prefix: &str) -> Vec<String> {
    let mut results = Vec::new();
    for link in doc.select(&LINK_SELECTOR) {
        let href = match link.value().attr("href") {
            Some(href) => href,
            None => continue,
        };
        // `str::get` returns `None` when the href is shorter than the prefix or when the
        // cut would land inside a multi-byte character, so the slice below cannot panic.
        let value = match href.get(..prefix.len()) {
            Some(scheme) if scheme.eq_ignore_ascii_case(prefix) => &href[prefix.len()..],
            _ => continue,
        };
        // Drop URI parameters such as `?subject=...`; `split` always yields a first piece.
        let clean = value.split('?').next().unwrap_or(value).trim();
        if !clean.is_empty() {
            results.push(clean.to_string());
        }
    }
    results
}
/// Collects every non-overlapping match of `regex` found in the document's text.
///
/// The text pieces yielded by `doc.root_element().text()` are joined with a single
/// space before matching, so a value that is split across separate pieces will have a
/// space inserted in the middle and may not match. Matches are returned as owned
/// strings in the order the regex iterator yields them; duplicates are kept.
fn extract_with_regex(doc: &Html, regex: &Regex) -> Vec<String> {
    let text = doc.root_element().text().collect::<Vec<_>>().join(" ");
    // Only the overall match is needed, so `find_iter` is used instead of
    // `captures_iter`, which would also resolve capture groups that are never read.
    regex
        .find_iter(&text)
        .map(|found| found.as_str().to_string())
        .collect()
}
pub(super) fn extract_email_elements(html: &str) -> Vec<String> {
let doc = Html::parse_fragment(html);
crate::merge!(
extract_with_prefix(&doc, "mailto:"),
extract_with_regex(&doc, &EMAIL_REGEX)
)
}
pub(super) fn extract_phone_elements(html: &str) -> Vec<String> {
let doc = Html::parse_fragment(html);
crate::merge!(
extract_with_prefix(&doc, "tel:"),
extract_with_regex(&doc, &PHONE_REGEX)
)
}
/// Looks up the first usable value for any of `keys` in a list of key/value pairs.
///
/// `keys` is treated as a priority list: every pair is checked against the first key
/// before the second key is considered, and so on. Key comparison ignores ASCII case.
/// Values are trimmed, and a pair whose trimmed value is empty is passed over so a
/// later pair (or a later key) can still supply the result.
///
/// Returns the trimmed value as an owned `String`, or `None` when nothing qualifies.
pub(super) fn find_metadata_value(pairs: &[(String, String)], keys: &[&str]) -> Option<String> {
    keys.iter().find_map(|wanted| {
        pairs
            .iter()
            .filter(|pair| pair.0.eq_ignore_ascii_case(wanted))
            .map(|pair| pair.1.trim())
            .find(|trimmed| !trimmed.is_empty())
            .map(str::to_string)
    })
}
/// Appends `value` to `items` unless an equal string is already present.
///
/// Equality is exact (case-sensitive). Existing entries are never reordered or
/// removed, so the vector keeps its first-seen order.
pub(super) fn push_unique(items: &mut Vec<String>, value: String) {
    let already_present = items.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    items.push(value);
}