use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::time::Duration;
/// A social-media profile link discovered while crawling a site.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SocialProfile {
/// Platform label (one of the names produced by `build_social_patterns`).
pub platform: String,
/// Account / handle segment captured from the matched profile link.
pub username: String,
/// The matched link text, prefixed with `https://` when it did not already start with `http`.
pub url: String,
/// URL of the crawled page on which the profile was found; omitted from
/// serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub found_on: Option<String>,
}
/// Aggregated outcome of a `crawl_contacts` run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContactSpyResult {
/// The crawled host: the input with any `http://` / `https://` prefix and path removed.
pub domain: String,
/// Unique, lower-cased e-mail addresses found in page text and link targets.
pub emails: Vec<String>,
/// Unique phone numbers reduced to their digits, with a leading `+` kept when present.
pub phones: Vec<String>,
/// Social profiles, de-duplicated by platform and lower-cased username.
pub social_media: Vec<SocialProfile>,
/// The same profiles as `social_media`, grouped by platform label.
pub social_media_by_platform: HashMap<String, Vec<SocialProfile>>,
/// Number of distinct URLs the crawler visited (URLs whose fetch failed are counted too).
pub pages_scanned: usize,
/// Length of `emails`.
pub total_emails: usize,
/// Length of `phones`.
pub total_phones: usize,
/// Length of `social_media`.
pub total_social_media: usize,
}
/// Path segments that commonly follow a social-network domain but are not
/// account names (share dialogs, login pages, generic site sections, ...).
///
/// `is_valid_social_username` rejects any candidate whose lower-cased form
/// appears in this table.
const INVALID_USERNAMES: &[&str] = &[
    "share", "sharer", "intent", "oauth", "login",
    "register", "signup", "api", "www", "mobile",
    "m", "help", "support", "about", "privacy",
    "terms", "contact", "home", "index", "main",
    "page", "site", "web", "app", "download",
    "install", "get", "go", "redirect", "link",
    "url", "http", "https", "com", "org",
    "net", "plugins", "dialog", "p", "explore",
    "accounts",
];
/// Lower-case file suffixes of resources that are never worth crawling
/// (documents, images, archives, video, stylesheets, scripts and fonts).
const SKIP_EXTENSIONS: &[&str] = &[
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".zip", ".doc",
    ".mp4", ".css", ".js", ".svg", ".ico", ".woff", ".ttf",
];
/// Lower-case directory fragments that mark static-asset locations; any URL
/// containing one of them is excluded from the crawl.
const SKIP_DIRS: &[&str] = &[
    "/assets/", "/images/", "/css/", "/js/",
    "/fonts/", "/media/", "/wp-content/uploads/", "/static/",
];
/// Substrings that disqualify a lower-cased e-mail candidate: placeholder
/// addresses, no-reply mailboxes, image file names that merely look like
/// addresses, and addresses belonging to third-party tooling.
const EMAIL_SKIP_PATTERNS: &[&str] = &[
    // Placeholders and automated senders.
    "example.", "test@", "noreply@", "no-reply@",
    "admin@example", "user@example", "email@example", "name@example",
    // Asset names such as "logo.png@2x".
    ".jpg@", ".png@",
    // Tooling / platform noise.
    "wixpress.", "sentry.", "webpack.",
];
/// Regex sources (applied to a digits-only string) describing digit runs that
/// look like phone numbers but almost never are.
///
/// Every pattern must be valid for the `regex` crate, which has no
/// backreferences; the "one digit repeated" rule is therefore spelled out as
/// an explicit alternation instead of `(\d)\1{6,}`.
const PHONE_FALSE_POSITIVES: &[&str] = &[
    // Starts with 19/20 followed by 6-8 digits: dates and timestamps.
    r"^(19|20)\d{6,8}$",
    // A single digit repeated seven or more times.
    r"^(?:0{7,}|1{7,}|2{7,}|3{7,}|4{7,}|5{7,}|6{7,}|7{7,}|8{7,}|9{7,})$",
    // Keyboard-run triples repeated at least twice, e.g. 123456.
    r"^(123|456|789|987|654|321){2,}$",
];
/// Crawls a site and collects the e-mail addresses, phone numbers and
/// social-media profile links found on its pages.
///
/// `domain` may be a bare host (`example.com`) or a full `http(s)://` URL;
/// a bare host is crawled over HTTPS. At most `max_pages` URLs are visited,
/// and only links accepted by `is_valid_crawl_url` are followed.
///
/// E-mails are searched in the body text and in link targets, phone numbers
/// in the body text only. E-mails and phones are returned sorted so repeated
/// runs over the same pages give the same output.
///
/// # Errors
///
/// Returns an error only when the HTTP client cannot be built. Pages that
/// fail to download, answer with a non-success status or cannot be decoded
/// are skipped (they still count towards `max_pages` and `pages_scanned`).
pub async fn crawl_contacts(
    domain: &str,
    max_pages: usize,
) -> Result<ContactSpyResult, Box<dyn std::error::Error + Send + Sync>> {
    // Require a real scheme prefix: a bare host such as "httpbin.org" also
    // starts with "http" and must still get "https://" prepended.
    let has_scheme = domain.starts_with("http://") || domain.starts_with("https://");
    let base_url = if has_scheme {
        domain.to_string()
    } else {
        format!("https://{}", domain)
    };
    let clean_domain = domain
        .trim_start_matches("https://")
        .trim_start_matches("http://")
        .split('/')
        .next()
        .unwrap_or(domain);

    let client = Client::builder()
        .timeout(Duration::from_secs(15))
        // NOTE(review): TLS certificate validation is disabled here; confirm
        // this is intentional before pointing the crawler at untrusted hosts.
        .danger_accept_invalid_certs(true)
        .redirect(reqwest::redirect::Policy::limited(3))
        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        .build()?;

    let email_regex = Regex::new(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
        .expect("built-in e-mail pattern must compile");
    let phone_regex =
        Regex::new(r"[\+]?[\d\s\-\(\)\.]{8,20}").expect("built-in phone pattern must compile");
    let social_patterns = build_social_patterns();
    // Best effort: a pattern the regex engine rejects is skipped rather than
    // aborting the whole crawl.
    let phone_fp_regexes: Vec<Regex> = PHONE_FALSE_POSITIVES
        .iter()
        .filter_map(|p| Regex::new(p).ok())
        .collect();

    // The selectors are loop-invariant, so parse them once.
    let text_sel = Selector::parse("body").expect("static selector must parse");
    let link_sel = Selector::parse("a[href]").expect("static selector must parse");

    let mut all_emails: HashSet<String> = HashSet::new();
    let mut all_phones: HashSet<String> = HashSet::new();
    let mut all_social: Vec<SocialProfile> = Vec::new();
    let mut seen_profiles: HashSet<String> = HashSet::new();
    let mut visited: HashSet<String> = HashSet::new();
    let mut to_visit: Vec<String> = vec![base_url.clone()];

    while let Some(current_url) = to_visit.pop() {
        if visited.len() >= max_pages {
            break;
        }
        // `insert` returns false when the URL was already visited.
        if !visited.insert(current_url.clone()) {
            continue;
        }

        let resp = match client.get(&current_url).send().await {
            Ok(r) if r.status().is_success() => r,
            _ => continue,
        };
        // Relative links have to be resolved against the page that contains
        // them (after redirects), not against the crawl's starting URL.
        let page_url = resp.url().to_string();
        let html = match resp.text().await {
            Ok(t) => t,
            Err(_) => continue,
        };

        // `doc` is created after the last `.await` of this iteration and is
        // dropped before the next one.
        let doc = Html::parse_document(&html);

        let mut clean_text = String::new();
        if let Some(body) = doc.select(&text_sel).next() {
            for node in body.text() {
                clean_text.push(' ');
                clean_text.push_str(node);
            }
        }
        let mut all_hrefs = String::new();
        for el in doc.select(&link_sel) {
            if let Some(href) = el.value().attr("href") {
                all_hrefs.push(' ');
                all_hrefs.push_str(href);
            }
        }
        let full_text = format!("{} {}", clean_text, all_hrefs);

        // E-mail addresses: visible text plus link targets (mailto: etc.).
        for mat in email_regex.find_iter(&full_text) {
            let email = mat.as_str().to_lowercase();
            if email.len() > 5
                && email.chars().filter(|c| *c == '@').count() == 1
                && !EMAIL_SKIP_PATTERNS.iter().any(|skip| email.contains(*skip))
            {
                all_emails.insert(email);
            }
        }

        // Phone numbers: visible text only; keep digits and a leading '+'.
        for mat in phone_regex.find_iter(&clean_text) {
            let raw = mat.as_str().trim();
            let digits: String = raw
                .chars()
                .filter(|c| c.is_ascii_digit() || *c == '+')
                .collect();
            let digits_only: String = digits.replace('+', "");
            // Numbers without an international prefix need at least 10 digits.
            if is_valid_phone(&digits_only, &phone_fp_regexes)
                && (digits.starts_with('+') || digits_only.len() >= 10)
            {
                all_phones.insert(digits);
            }
        }

        // Social profiles, de-duplicated by platform + lower-cased username.
        for (platform, regex) in &social_patterns {
            for caps in regex.captures_iter(&full_text) {
                let username = caps
                    .get(caps.len() - 1)
                    .or_else(|| caps.get(1))
                    .map(|m| m.as_str().to_string())
                    .unwrap_or_default();
                if username.is_empty() || !is_valid_social_username(&username, platform) {
                    continue;
                }
                let profile_id = format!("{}:{}", platform, username.to_lowercase());
                if !seen_profiles.insert(profile_id) {
                    continue;
                }
                let full_url = caps
                    .get(0)
                    .map(|m| {
                        let u = m.as_str().to_string();
                        if u.starts_with("http") {
                            u
                        } else {
                            format!("https://{}", u)
                        }
                    })
                    .unwrap_or_default();
                all_social.push(SocialProfile {
                    platform: platform.to_string(),
                    username,
                    url: full_url,
                    found_on: Some(current_url.clone()),
                });
            }
        }

        // Queue same-site links, unless the page budget is already used up.
        if visited.len() < max_pages {
            for el in doc.select(&link_sel) {
                if let Some(href) = el.value().attr("href") {
                    if let Some(abs_url) = resolve_url(&page_url, href) {
                        if is_valid_crawl_url(&abs_url, clean_domain)
                            && !visited.contains(&abs_url)
                        {
                            to_visit.push(abs_url);
                        }
                    }
                }
            }
        }
    }

    let mut by_platform: HashMap<String, Vec<SocialProfile>> = HashMap::new();
    for profile in &all_social {
        by_platform
            .entry(profile.platform.clone())
            .or_default()
            .push(profile.clone());
    }

    // HashSet iteration order is arbitrary; sort for reproducible output.
    let mut emails: Vec<String> = all_emails.into_iter().collect();
    emails.sort_unstable();
    let mut phones: Vec<String> = all_phones.into_iter().collect();
    phones.sort_unstable();

    let total_emails = emails.len();
    let total_phones = phones.len();
    let total_social = all_social.len();
    Ok(ContactSpyResult {
        domain: clean_domain.to_string(),
        emails,
        phones,
        social_media: all_social,
        social_media_by_platform: by_platform,
        pages_scanned: visited.len(),
        total_emails,
        total_phones,
        total_social_media: total_social,
    })
}
/// Builds the `(platform label, regex)` table used to spot social-media
/// profile links.
///
/// Each pattern is case-insensitive and has exactly one capture group holding
/// the account name. The domain is anchored with a word boundary (`\b`) so it
/// only matches as a whole label: without it `x\.com/` would also match inside
/// `netflix.com/`, `dropbox.com/` or `wix.com/` and report bogus Twitter
/// profiles. `\b` is zero-width, so the overall match text is unchanged.
fn build_social_patterns() -> Vec<(String, Regex)> {
    const PATTERNS: [(&str, &str); 7] = [
        ("Facebook", r"(?i)\bfacebook\.com/([a-zA-Z0-9._-]+)"),
        ("Twitter", r"(?i)\b(?:twitter\.com|x\.com)/([a-zA-Z0-9._-]+)"),
        ("Instagram", r"(?i)\binstagram\.com/([a-zA-Z0-9._-]+)"),
        ("LinkedIn", r"(?i)\blinkedin\.com/(?:in|company)/([a-zA-Z0-9._-]+)"),
        ("YouTube", r"(?i)\byoutube\.com/(?:channel/|user/|c/|@)([a-zA-Z0-9._-]+)"),
        ("GitHub", r"(?i)\bgithub\.com/([a-zA-Z0-9._-]+)"),
        ("TikTok", r"(?i)\btiktok\.com/@([a-zA-Z0-9._-]+)"),
    ];

    PATTERNS
        .iter()
        .map(|&(platform, pattern)| {
            // The patterns are compile-time constants; failing to compile is a bug.
            let regex = Regex::new(pattern).expect("built-in social pattern must compile");
            (platform.to_string(), regex)
        })
        .collect()
}
/// Decides whether a digit string is a plausible phone number.
///
/// `digits` must be 7 to 15 characters long, consist solely of ASCII digits,
/// and match none of the false-positive patterns in `fp_regexes`.
fn is_valid_phone(digits: &str, fp_regexes: &[Regex]) -> bool {
    let plausible_length = (7..=15).contains(&digits.len());
    plausible_length
        && digits.bytes().all(|b| b.is_ascii_digit())
        && !fp_regexes.iter().any(|pattern| pattern.is_match(digits))
}
/// Checks a captured path segment against generic and per-platform username
/// rules.
///
/// A candidate is rejected when it is shorter than two bytes or when its
/// lower-cased form is listed in `INVALID_USERNAMES`. Known platforms
/// additionally impose a maximum length and, for Twitter and GitHub, a
/// forbidden first character; unknown platforms accept anything else.
fn is_valid_social_username(username: &str, platform: &str) -> bool {
    let length = username.len();
    if length < 2 {
        return false;
    }
    let normalized = username.to_lowercase();
    if INVALID_USERNAMES.iter().any(|reserved| *reserved == normalized) {
        return false;
    }

    // (maximum length, first character that is not allowed)
    let (max_len, banned_first): (usize, Option<char>) = match platform {
        "Twitter" => (15, Some('_')),
        "Instagram" => (30, None),
        "LinkedIn" | "YouTube" => (100, None),
        "GitHub" => (39, Some('-')),
        "Facebook" => (50, None),
        "TikTok" => (24, None),
        _ => return true,
    };
    let bad_start = banned_first.map_or(false, |c| username.starts_with(c));
    length <= max_len && !bad_start
}
/// Decides whether `url` should be queued for crawling.
///
/// A URL qualifies when its host is `base_domain` itself or one of its
/// subdomains (compared case-insensitively, ignoring user-info and port) and
/// its path neither ends in one of `SKIP_EXTENSIONS` nor contains one of
/// `SKIP_DIRS`. Fragment-only, `javascript:` and `mailto:` links never qualify.
///
/// The host is compared structurally rather than by substring search, so
/// `https://evil.test/?next=example.com` or `https://notexample.com/` are not
/// mistaken for pages of `example.com`; the query string is ignored for the
/// extension check so `report.pdf?download=1` is still skipped.
fn is_valid_crawl_url(url: &str, base_domain: &str) -> bool {
    fn is_delim(c: char) -> bool {
        matches!(c, '/' | '?' | '#')
    }

    if url.starts_with('#') || url.starts_with("javascript:") || url.starts_with("mailto:") {
        return false;
    }

    let lower = url.to_lowercase();
    // Drop "scheme://", but only when it precedes the authority: a "://"
    // appearing later (e.g. inside a query value) is not a scheme separator.
    let rest = match lower.find("://") {
        Some(idx) if !lower[..idx].contains(is_delim) => &lower[idx + 3..],
        _ => lower.as_str(),
    };
    let authority_end = rest.find(is_delim).unwrap_or(rest.len());
    let (authority, tail) = rest.split_at(authority_end);

    // authority = [userinfo@]host[:port]
    let host_port = authority.rsplit('@').next().unwrap_or(authority);
    let host = host_port.split(':').next().unwrap_or(host_port);

    let domain_lower = base_domain.to_lowercase();
    let domain_host = domain_lower.split(':').next().unwrap_or("");
    let on_site = !domain_host.is_empty()
        && (host == domain_host
            || host
                .strip_suffix(domain_host)
                .map_or(false, |prefix| prefix.ends_with('.')));
    if !on_site {
        return false;
    }

    // Only the path takes part in the asset checks.
    let path_end = tail
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(tail.len());
    let path = &tail[..path_end];

    !SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(*ext))
        && !SKIP_DIRS.iter().any(|dir| path.contains(*dir))
}
/// Turns an `href` found on the page at `base` into an absolute URL.
///
/// Returns `None` for links that cannot be fetched as pages (`javascript:`,
/// `mailto:`, `tel:` and fragment-only links). Otherwise:
///
/// * `http://` / `https://` links are returned unchanged;
/// * protocol-relative links (`//host/path`) inherit the scheme of `base`
///   (falling back to `https` when `base` has none);
/// * root-relative links (`/path`) are appended to the origin of `base`;
/// * any other link is resolved against the directory of `base`'s path, with
///   the query string and fragment of `base` ignored.
///
/// Dot segments (`./`, `../`) are not collapsed.
fn resolve_url(base: &str, href: &str) -> Option<String> {
    const NOT_FETCHABLE: [&str; 4] = ["javascript:", "#", "mailto:", "tel:"];
    if NOT_FETCHABLE.iter().any(|prefix| href.starts_with(*prefix)) {
        return None;
    }
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href.to_string());
    }

    // The query string and fragment of the base play no part in resolution.
    let base = base
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(base);

    // Locate where the authority (host[:port]) starts so that the slashes of
    // "scheme://" are never mistaken for path separators.
    let (scheme, authority_start) = match base.find("://") {
        Some(idx) => (&base[..idx], idx + 3),
        None => ("https", 0),
    };
    if href.starts_with("//") {
        return Some(format!("{}:{}", scheme, href));
    }

    let origin_end = base[authority_start..]
        .find('/')
        .map_or(base.len(), |offset| authority_start + offset);
    let (origin, path) = base.split_at(origin_end);

    if href.starts_with('/') {
        return Some(format!("{}{}", origin, href));
    }

    // Directory of the base path, including the trailing slash; a base
    // without any path resolves against the site root.
    let directory = match path.rfind('/') {
        Some(idx) => &path[..=idx],
        None => "/",
    };
    Some(format!("{}{}{}", origin, directory, href))
}