1use regex::Regex;
2use reqwest::Client;
3use scraper::{Html, Selector};
4use serde::{Deserialize, Serialize};
5use std::collections::{HashMap, HashSet};
6use std::time::Duration;
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct SocialProfile {
12 pub platform: String,
13 pub username: String,
14 pub url: String,
15 #[serde(skip_serializing_if = "Option::is_none")]
16 pub found_on: Option<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct ContactSpyResult {
21 pub domain: String,
22 pub emails: Vec<String>,
23 pub phones: Vec<String>,
24 pub social_media: Vec<SocialProfile>,
25 pub social_media_by_platform: HashMap<String, Vec<SocialProfile>>,
26 pub pages_scanned: usize,
27 pub total_emails: usize,
28 pub total_phones: usize,
29 pub total_social_media: usize,
30}
31
/// Lower-case path segments that follow a social-media domain but are site features
/// (share dialogs, login pages, generic words) rather than account handles.
const INVALID_USERNAMES: &[&str] = &[
    "share", "sharer", "intent", "oauth", "login", "register", "signup",
    "api", "www", "mobile", "m",
    "help", "support", "about", "privacy", "terms", "contact",
    "home", "index", "main", "page", "site", "web", "app",
    "download", "install", "get", "go", "redirect", "link", "url",
    "http", "https", "com", "org", "net",
    "plugins", "dialog", "p", "explore", "accounts",
];
40
/// Lower-case file extensions (leading dot included) of documents, media, archives, styles,
/// scripts and fonts; URLs ending in one of these are not crawled.
const SKIP_EXTENSIONS: &[&str] = &[
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".zip", ".doc",
    ".mp4", ".css", ".js", ".svg", ".ico", ".woff", ".ttf",
];
47
/// Lower-case directory fragments (slash-delimited on both sides) that mark asset folders;
/// URLs containing one of these are not crawled.
const SKIP_DIRS: &[&str] = &[
    "/assets/", "/images/", "/css/", "/js/",
    "/fonts/", "/media/", "/wp-content/uploads/", "/static/",
];
58
/// Lower-case substrings that disqualify a matched e-mail address: placeholder and
/// do-not-reply mailboxes, image file names that happen to contain `@`, and addresses
/// belonging to tooling/hosting services.
const EMAIL_SKIP_PATTERNS: &[&str] = &[
    "example.", "test@", "noreply@", "no-reply@",
    "admin@example", "user@example", "email@example", "name@example",
    ".jpg@", ".png@",
    "wixpress.", "sentry.", "webpack.",
];
76
/// Regex sources matching digit-only strings that look like phone numbers but are not.
///
/// Every pattern must stay within the syntax supported by the `regex` crate, which has no
/// backreferences or look-around. A pattern that fails to compile is easy to miss, so keep
/// these to plain alternations and repetitions.
const PHONE_FALSE_POSITIVES: &[&str] = &[
    // Date- or timestamp-like runs that start with a 19xx/20xx year.
    r"^(19|20)\d{6,8}$",
    // One digit repeated seven or more times. Spelled out per digit because the shorter
    // `(\d)\1{6,}` form needs a backreference and is rejected by the `regex` crate.
    r"^(0{7,}|1{7,}|2{7,}|3{7,}|4{7,}|5{7,}|6{7,}|7{7,}|8{7,}|9{7,})$",
    // Two or more concatenated three-digit keypad runs, e.g. "123456" or "987654321".
    r"^(123|456|789|987|654|321){2,}$",
];
84
85pub async fn crawl_contacts(
88 domain: &str,
89 max_pages: usize,
90) -> Result<ContactSpyResult, Box<dyn std::error::Error + Send + Sync>> {
91 let base_url = if domain.starts_with("http") {
92 domain.to_string()
93 } else {
94 format!("https://{}", domain)
95 };
96
97 let clean_domain = domain
98 .trim_start_matches("https://")
99 .trim_start_matches("http://")
100 .split('/')
101 .next()
102 .unwrap_or(domain);
103
104 let client = Client::builder()
105 .timeout(Duration::from_secs(15))
106 .danger_accept_invalid_certs(true)
107 .redirect(reqwest::redirect::Policy::limited(3))
108 .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
109 .build()?;
110
111 let email_regex = Regex::new(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}").unwrap();
112 let phone_regex = Regex::new(r"[\+]?[\d\s\-\(\)\.]{8,20}").unwrap();
113
114 let social_patterns = build_social_patterns();
115 let phone_fp_regexes: Vec<Regex> = PHONE_FALSE_POSITIVES
116 .iter()
117 .filter_map(|p| Regex::new(p).ok())
118 .collect();
119
120 let mut all_emails: HashSet<String> = HashSet::new();
121 let mut all_phones: HashSet<String> = HashSet::new();
122 let mut all_social: Vec<SocialProfile> = Vec::new();
123 let mut seen_profiles: HashSet<String> = HashSet::new();
124
125 let mut visited: HashSet<String> = HashSet::new();
126 let mut to_visit: Vec<String> = vec![base_url.clone()];
127
128 while let Some(current_url) = to_visit.pop() {
130 if visited.len() >= max_pages {
131 break;
132 }
133 if visited.contains(¤t_url) {
134 continue;
135 }
136 visited.insert(current_url.clone());
137
138 let resp = match client.get(¤t_url).send().await {
139 Ok(r) if r.status().is_success() => r,
140 _ => continue,
141 };
142
143 let html = match resp.text().await {
144 Ok(t) => t,
145 Err(_) => continue,
146 };
147 let doc = Html::parse_document(&html);
148
149 let text_sel = Selector::parse("body").unwrap();
151 let link_sel = Selector::parse("a[href]").unwrap();
152
153 let mut clean_text = String::new();
155 if let Some(body) = doc.select(&text_sel).next() {
156 for node in body.text() {
157 clean_text.push(' ');
158 clean_text.push_str(node);
159 }
160 }
161
162 let mut all_hrefs = String::new();
164 for el in doc.select(&link_sel) {
165 if let Some(href) = el.value().attr("href") {
166 all_hrefs.push(' ');
167 all_hrefs.push_str(href);
168 }
169 }
170
171 let full_text = format!("{} {}", clean_text, all_hrefs);
172
173 for mat in email_regex.find_iter(&full_text) {
175 let email = mat.as_str().to_lowercase();
176 if email.len() > 5
177 && email.chars().filter(|c| *c == '@').count() == 1
178 && !EMAIL_SKIP_PATTERNS.iter().any(|skip| email.contains(skip))
179 {
180 all_emails.insert(email);
181 }
182 }
183
184 for mat in phone_regex.find_iter(&clean_text) {
186 let raw = mat.as_str().trim();
187 let digits: String = raw
188 .chars()
189 .filter(|c| c.is_ascii_digit() || *c == '+')
190 .collect();
191 let digits_only: String = digits.replace('+', "");
192
193 if is_valid_phone(&digits_only, &phone_fp_regexes)
194 && (digits.starts_with('+') || digits_only.len() >= 10) {
195 all_phones.insert(digits);
196 }
197 }
198
199 for (platform, regex) in &social_patterns {
201 for caps in regex.captures_iter(&full_text) {
202 let username = caps
203 .get(caps.len() - 1)
204 .or_else(|| caps.get(1))
205 .map(|m| m.as_str().to_string())
206 .unwrap_or_default();
207
208 if username.is_empty() {
209 continue;
210 }
211 if !is_valid_social_username(&username, platform) {
212 continue;
213 }
214
215 let profile_id = format!("{}:{}", platform, username.to_lowercase());
216 if !seen_profiles.insert(profile_id) {
217 continue;
218 }
219
220 let full_url = caps
221 .get(0)
222 .map(|m| {
223 let u = m.as_str().to_string();
224 if u.starts_with("http") {
225 u
226 } else {
227 format!("https://{}", u)
228 }
229 })
230 .unwrap_or_default();
231
232 all_social.push(SocialProfile {
233 platform: platform.to_string(),
234 username,
235 url: full_url,
236 found_on: Some(current_url.clone()),
237 });
238 }
239 }
240
241 if visited.len() < max_pages {
243 for el in doc.select(&link_sel) {
244 if let Some(href) = el.value().attr("href") {
245 if let Some(abs_url) = resolve_url(&base_url, href) {
246 if is_valid_crawl_url(&abs_url, clean_domain) && !visited.contains(&abs_url)
247 {
248 to_visit.push(abs_url);
249 }
250 }
251 }
252 }
253 }
254 }
255
256 let mut by_platform: HashMap<String, Vec<SocialProfile>> = HashMap::new();
258 for profile in &all_social {
259 by_platform
260 .entry(profile.platform.clone())
261 .or_default()
262 .push(profile.clone());
263 }
264
265 let total_emails = all_emails.len();
266 let total_phones = all_phones.len();
267 let total_social = all_social.len();
268
269 Ok(ContactSpyResult {
270 domain: clean_domain.to_string(),
271 emails: all_emails.into_iter().collect(),
272 phones: all_phones.into_iter().collect(),
273 social_media: all_social,
274 social_media_by_platform: by_platform,
275 pages_scanned: visited.len(),
276 total_emails,
277 total_phones,
278 total_social_media: total_social,
279 })
280}
281
282fn build_social_patterns() -> Vec<(String, Regex)> {
285 vec![
286 (
287 "Facebook".into(),
288 Regex::new(r"(?i)facebook\.com/([a-zA-Z0-9._-]+)").unwrap(),
289 ),
290 (
291 "Twitter".into(),
292 Regex::new(r"(?i)(?:twitter\.com|x\.com)/([a-zA-Z0-9._-]+)")
293 .unwrap(),
294 ),
295 (
296 "Instagram".into(),
297 Regex::new(r"(?i)instagram\.com/([a-zA-Z0-9._-]+)").unwrap(),
298 ),
299 (
300 "LinkedIn".into(),
301 Regex::new(r"(?i)linkedin\.com/(?:in|company)/([a-zA-Z0-9._-]+)").unwrap(),
302 ),
303 (
304 "YouTube".into(),
305 Regex::new(r"(?i)youtube\.com/(?:channel/|user/|c/|@)([a-zA-Z0-9._-]+)").unwrap(),
306 ),
307 (
308 "GitHub".into(),
309 Regex::new(r"(?i)github\.com/([a-zA-Z0-9._-]+)").unwrap(),
310 ),
311 (
312 "TikTok".into(),
313 Regex::new(r"(?i)tiktok\.com/@([a-zA-Z0-9._-]+)").unwrap(),
314 ),
315 ]
316}
317
318fn is_valid_phone(digits: &str, fp_regexes: &[Regex]) -> bool {
321 if digits.len() < 7 || digits.len() > 15 {
322 return false;
323 }
324 if !digits.chars().all(|c| c.is_ascii_digit()) {
325 return false;
326 }
327 if fp_regexes.iter().any(|rx| rx.is_match(digits)) {
328 return false;
329 }
330 true
331}
332
333fn is_valid_social_username(username: &str, platform: &str) -> bool {
334 if username.len() < 2 {
335 return false;
336 }
337 if INVALID_USERNAMES.contains(&username.to_lowercase().as_str()) {
338 return false;
339 }
340
341 match platform {
343 "Twitter" => username.len() <= 15 && !username.starts_with('_'),
344 "Instagram" => username.len() <= 30,
345 "LinkedIn" => username.len() <= 100,
346 "GitHub" => username.len() <= 39 && !username.starts_with('-'),
347 "YouTube" => username.len() <= 100,
348 "Facebook" => username.len() <= 50,
349 "TikTok" => username.len() <= 24,
350 _ => true,
351 }
352}
353
354fn is_valid_crawl_url(url: &str, base_domain: &str) -> bool {
355 let lower = url.to_lowercase();
356
357 if !lower.contains(base_domain) {
359 return false;
360 }
361
362 if SKIP_EXTENSIONS.iter().any(|ext| lower.ends_with(ext)) {
364 return false;
365 }
366 if SKIP_DIRS.iter().any(|dir| lower.contains(dir)) {
367 return false;
368 }
369
370 if url.starts_with('#') || url.starts_with("javascript:") || url.starts_with("mailto:") {
372 return false;
373 }
374
375 true
376}
377
/// Turns an `href` found on a page into an absolute URL, using `base` as the reference URL.
///
/// Returns `None` for targets that cannot be fetched as a page: empty hrefs, fragment-only
/// links (`#...`) and `javascript:`, `mailto:` or `tel:` links.
///
/// Resolution rules:
/// * `http://` / `https://` hrefs are returned unchanged;
/// * protocol-relative hrefs (`//host/path`) are given the `https:` scheme;
/// * root-relative hrefs (`/path`) are appended to the scheme and host of `base`;
/// * any other href is appended to the directory of `base`'s path (query and fragment of
///   `base` are ignored).
///
/// Dot segments (`./`, `../`) are not normalised.
fn resolve_url(base: &str, href: &str) -> Option<String> {
    // Surrounding whitespace is not part of the link target.
    let href = href.trim();
    if href.is_empty() || href.starts_with('#') {
        return None;
    }
    if ["javascript:", "mailto:", "tel:"]
        .iter()
        .any(|scheme| href.starts_with(*scheme))
    {
        return None;
    }

    if href.starts_with("//") {
        return Some(format!("https:{}", href));
    }
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href.to_string());
    }

    // Split `base` into origin (`scheme://host[:port]`) and everything after it. Searching
    // for the last '/' in the whole string would hit the "//" of the scheme whenever the
    // base has no path, dropping the host.
    let authority_start = base.find("://").map_or(0, |idx| idx + 3);
    let authority_len = base[authority_start..]
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(base.len() - authority_start);
    let (origin, tail) = base.split_at(authority_start + authority_len);

    if href.starts_with('/') {
        return Some(format!("{}{}", origin, href));
    }

    // Path-relative: keep the directory part of the base path (up to its last '/').
    let path = tail
        .split(|c: char| matches!(c, '?' | '#'))
        .next()
        .unwrap_or("");
    let directory = match path.rfind('/') {
        Some(idx) => &path[..=idx],
        None => "/",
    };
    Some(format!("{}{}{}", origin, directory, href))
}