// web_analyzer/contact_spy.rs

1use regex::Regex;
2use reqwest::Client;
3use scraper::{Html, Selector};
4use serde::{Deserialize, Serialize};
5use std::collections::{HashMap, HashSet};
6use std::time::Duration;
7
// ── Structs ─────────────────────────────────────────────────────────────────
9
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct SocialProfile {
12    pub platform: String,
13    pub username: String,
14    pub url: String,
15    #[serde(skip_serializing_if = "Option::is_none")]
16    pub found_on: Option<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct ContactSpyResult {
21    pub domain: String,
22    pub emails: Vec<String>,
23    pub phones: Vec<String>,
24    pub social_media: Vec<SocialProfile>,
25    pub social_media_by_platform: HashMap<String, Vec<SocialProfile>>,
26    pub pages_scanned: usize,
27    pub total_emails: usize,
28    pub total_phones: usize,
29    pub total_social_media: usize,
30}
31
// ── Invalid social media usernames ──────────────────────────────────────────
33
/// Path segments that follow a social-network host in a link but are site
/// features (share dialogs, login pages, ...) rather than account handles.
/// Candidate usernames are lower-cased before being compared against this list.
const INVALID_USERNAMES: &[&str] = &[
    "share", "sharer", "intent", "oauth", "login", "register", "signup",
    "api", "www", "mobile", "m",
    "help", "support", "about", "privacy", "terms", "contact",
    "home", "index", "main", "page", "site", "web", "app",
    "download", "install", "get", "go", "redirect", "link", "url",
    "http", "https", "com", "org", "net",
    "plugins", "dialog", "p", "explore", "accounts",
];
40
// ── Skip extensions and directories ─────────────────────────────────────────
42
/// Lower-case file suffixes of static assets; URLs ending in one of these are
/// not queued for crawling.
const SKIP_EXTENSIONS: &[&str] = &[
    ".pdf",
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".zip",
    ".doc",
    ".mp4",
    ".css",
    ".js",
    ".svg",
    ".ico",
    ".woff",
    ".ttf",
];
47
/// Directory fragments that mark asset folders; URLs containing one of these
/// are not queued for crawling.
const SKIP_DIRS: &[&str] = &[
    "/assets/", "/images/", "/css/", "/js/",
    "/fonts/", "/media/", "/wp-content/uploads/", "/static/",
];
58
// ── Email false positive filters ────────────────────────────────────────────
60
/// Substrings that disqualify a lower-cased e-mail match: placeholder addresses,
/// no-reply mailboxes, image-file names and tooling hosts.
const EMAIL_SKIP_PATTERNS: &[&str] = &[
    // Placeholders and automated senders
    "example.", "test@", "noreply@", "no-reply@",
    "admin@example", "user@example", "email@example", "name@example",
    // Image file names that happen to contain '@'
    ".jpg@", ".png@",
    // Third-party tooling hosts
    "wixpress.", "sentry.", "webpack.",
];
76
// ── Phone false positive patterns (regex) ───────────────────────────────────
78
/// Regex sources matched against a digits-only phone candidate; a match means
/// the candidate is rejected as a false positive.
///
/// Every pattern must be valid for the `regex` crate, which supports neither
/// backreferences nor look-around. The repeated-digit rule is therefore written
/// as an explicit alternation instead of `(\d)\1{6,}`, which fails to compile
/// with that crate.
const PHONE_FALSE_POSITIVES: &[&str] = &[
    // Date-like values starting with 19 or 20.
    // NOTE(review): `{6,8}` also matches 9- and 10-digit numbers that start with
    // 19/20 (e.g. North American area codes 201-209) — confirm this is intended.
    r"^(19|20)\d{6,8}$",
    // The same digit repeated 7 or more times.
    r"^(?:0{7,}|1{7,}|2{7,}|3{7,}|4{7,}|5{7,}|6{7,}|7{7,}|8{7,}|9{7,})$",
    // Two or more sequential triples back to back (123456, 987654321, ...).
    r"^(123|456|789|987|654|321){2,}$",
];
84
// ── Main function ───────────────────────────────────────────────────────────
86
87pub async fn crawl_contacts(
88    domain: &str,
89    max_pages: usize,
90) -> Result<ContactSpyResult, Box<dyn std::error::Error + Send + Sync>> {
91    let base_url = if domain.starts_with("http") {
92        domain.to_string()
93    } else {
94        format!("https://{}", domain)
95    };
96
97    let clean_domain = domain
98        .trim_start_matches("https://")
99        .trim_start_matches("http://")
100        .split('/')
101        .next()
102        .unwrap_or(domain);
103
104    let client = Client::builder()
105        .timeout(Duration::from_secs(15))
106        .danger_accept_invalid_certs(true)
107        .redirect(reqwest::redirect::Policy::limited(3))
108        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
109        .build()?;
110
111    let email_regex = Regex::new(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}").unwrap();
112    let phone_regex = Regex::new(r"[\+]?[\d\s\-\(\)\.]{8,20}").unwrap();
113
114    let social_patterns = build_social_patterns();
115    let phone_fp_regexes: Vec<Regex> = PHONE_FALSE_POSITIVES
116        .iter()
117        .filter_map(|p| Regex::new(p).ok())
118        .collect();
119
120    let mut all_emails: HashSet<String> = HashSet::new();
121    let mut all_phones: HashSet<String> = HashSet::new();
122    let mut all_social: Vec<SocialProfile> = Vec::new();
123    let mut seen_profiles: HashSet<String> = HashSet::new();
124
125    let mut visited: HashSet<String> = HashSet::new();
126    let mut to_visit: Vec<String> = vec![base_url.clone()];
127
128    // ── BFS Crawl ───────────────────────────────────────────────────────
129    while let Some(current_url) = to_visit.pop() {
130        if visited.len() >= max_pages {
131            break;
132        }
133        if visited.contains(&current_url) {
134            continue;
135        }
136        visited.insert(current_url.clone());
137
138        let resp = match client.get(&current_url).send().await {
139            Ok(r) if r.status().is_success() => r,
140            _ => continue,
141        };
142
143        let html = match resp.text().await {
144            Ok(t) => t,
145            Err(_) => continue,
146        };
147        let doc = Html::parse_document(&html);
148
149        // Remove script/style content — extract clean text
150        let text_sel = Selector::parse("body").unwrap();
151        let link_sel = Selector::parse("a[href]").unwrap();
152
153        // Collect clean text (excluding script/style)
154        let mut clean_text = String::new();
155        if let Some(body) = doc.select(&text_sel).next() {
156            for node in body.text() {
157                clean_text.push(' ');
158                clean_text.push_str(node);
159            }
160        }
161
162        // Collect all href values for social media extraction
163        let mut all_hrefs = String::new();
164        for el in doc.select(&link_sel) {
165            if let Some(href) = el.value().attr("href") {
166                all_hrefs.push(' ');
167                all_hrefs.push_str(href);
168            }
169        }
170
171        let full_text = format!("{} {}", clean_text, all_hrefs);
172
173        // ── Extract emails ──────────────────────────────────────────────
174        for mat in email_regex.find_iter(&full_text) {
175            let email = mat.as_str().to_lowercase();
176            if email.len() > 5
177                && email.chars().filter(|c| *c == '@').count() == 1
178                && !EMAIL_SKIP_PATTERNS.iter().any(|skip| email.contains(skip))
179            {
180                all_emails.insert(email);
181            }
182        }
183
184        // ── Extract phones (from clean text only) ───────────────────────
185        for mat in phone_regex.find_iter(&clean_text) {
186            let raw = mat.as_str().trim();
187            let digits: String = raw
188                .chars()
189                .filter(|c| c.is_ascii_digit() || *c == '+')
190                .collect();
191            let digits_only: String = digits.replace('+', "");
192
193            if is_valid_phone(&digits_only, &phone_fp_regexes)
194                && (digits.starts_with('+') || digits_only.len() >= 10) {
195                    all_phones.insert(digits);
196                }
197        }
198
199        // ── Extract social media ────────────────────────────────────────
200        for (platform, regex) in &social_patterns {
201            for caps in regex.captures_iter(&full_text) {
202                let username = caps
203                    .get(caps.len() - 1)
204                    .or_else(|| caps.get(1))
205                    .map(|m| m.as_str().to_string())
206                    .unwrap_or_default();
207
208                if username.is_empty() {
209                    continue;
210                }
211                if !is_valid_social_username(&username, platform) {
212                    continue;
213                }
214
215                let profile_id = format!("{}:{}", platform, username.to_lowercase());
216                if !seen_profiles.insert(profile_id) {
217                    continue;
218                }
219
220                let full_url = caps
221                    .get(0)
222                    .map(|m| {
223                        let u = m.as_str().to_string();
224                        if u.starts_with("http") {
225                            u
226                        } else {
227                            format!("https://{}", u)
228                        }
229                    })
230                    .unwrap_or_default();
231
232                all_social.push(SocialProfile {
233                    platform: platform.to_string(),
234                    username,
235                    url: full_url,
236                    found_on: Some(current_url.clone()),
237                });
238            }
239        }
240
241        // ── Discover new links for crawling ─────────────────────────────
242        if visited.len() < max_pages {
243            for el in doc.select(&link_sel) {
244                if let Some(href) = el.value().attr("href") {
245                    if let Some(abs_url) = resolve_url(&base_url, href) {
246                        if is_valid_crawl_url(&abs_url, clean_domain) && !visited.contains(&abs_url)
247                        {
248                            to_visit.push(abs_url);
249                        }
250                    }
251                }
252            }
253        }
254    }
255
256    // ── Group social by platform ────────────────────────────────────────
257    let mut by_platform: HashMap<String, Vec<SocialProfile>> = HashMap::new();
258    for profile in &all_social {
259        by_platform
260            .entry(profile.platform.clone())
261            .or_default()
262            .push(profile.clone());
263    }
264
265    let total_emails = all_emails.len();
266    let total_phones = all_phones.len();
267    let total_social = all_social.len();
268
269    Ok(ContactSpyResult {
270        domain: clean_domain.to_string(),
271        emails: all_emails.into_iter().collect(),
272        phones: all_phones.into_iter().collect(),
273        social_media: all_social,
274        social_media_by_platform: by_platform,
275        pages_scanned: visited.len(),
276        total_emails,
277        total_phones,
278        total_social_media: total_social,
279    })
280}
281
// ── Social media patterns ───────────────────────────────────────────────────
283
284fn build_social_patterns() -> Vec<(String, Regex)> {
285    vec![
286        (
287            "Facebook".into(),
288            Regex::new(r"(?i)facebook\.com/([a-zA-Z0-9._-]+)").unwrap(),
289        ),
290        (
291            "Twitter".into(),
292            Regex::new(r"(?i)(?:twitter\.com|x\.com)/([a-zA-Z0-9._-]+)")
293                .unwrap(),
294        ),
295        (
296            "Instagram".into(),
297            Regex::new(r"(?i)instagram\.com/([a-zA-Z0-9._-]+)").unwrap(),
298        ),
299        (
300            "LinkedIn".into(),
301            Regex::new(r"(?i)linkedin\.com/(?:in|company)/([a-zA-Z0-9._-]+)").unwrap(),
302        ),
303        (
304            "YouTube".into(),
305            Regex::new(r"(?i)youtube\.com/(?:channel/|user/|c/|@)([a-zA-Z0-9._-]+)").unwrap(),
306        ),
307        (
308            "GitHub".into(),
309            Regex::new(r"(?i)github\.com/([a-zA-Z0-9._-]+)").unwrap(),
310        ),
311        (
312            "TikTok".into(),
313            Regex::new(r"(?i)tiktok\.com/@([a-zA-Z0-9._-]+)").unwrap(),
314        ),
315    ]
316}
317
// ── Validation helpers ──────────────────────────────────────────────────────
319
320fn is_valid_phone(digits: &str, fp_regexes: &[Regex]) -> bool {
321    if digits.len() < 7 || digits.len() > 15 {
322        return false;
323    }
324    if !digits.chars().all(|c| c.is_ascii_digit()) {
325        return false;
326    }
327    if fp_regexes.iter().any(|rx| rx.is_match(digits)) {
328        return false;
329    }
330    true
331}
332
333fn is_valid_social_username(username: &str, platform: &str) -> bool {
334    if username.len() < 2 {
335        return false;
336    }
337    if INVALID_USERNAMES.contains(&username.to_lowercase().as_str()) {
338        return false;
339    }
340
341    // Platform-specific length limits
342    match platform {
343        "Twitter" => username.len() <= 15 && !username.starts_with('_'),
344        "Instagram" => username.len() <= 30,
345        "LinkedIn" => username.len() <= 100,
346        "GitHub" => username.len() <= 39 && !username.starts_with('-'),
347        "YouTube" => username.len() <= 100,
348        "Facebook" => username.len() <= 50,
349        "TikTok" => username.len() <= 24,
350        _ => true,
351    }
352}
353
354fn is_valid_crawl_url(url: &str, base_domain: &str) -> bool {
355    let lower = url.to_lowercase();
356
357    // Must contain the domain
358    if !lower.contains(base_domain) {
359        return false;
360    }
361
362    // Skip static assets
363    if SKIP_EXTENSIONS.iter().any(|ext| lower.ends_with(ext)) {
364        return false;
365    }
366    if SKIP_DIRS.iter().any(|dir| lower.contains(dir)) {
367        return false;
368    }
369
370    // Skip fragments and javascript
371    if url.starts_with('#') || url.starts_with("javascript:") || url.starts_with("mailto:") {
372        return false;
373    }
374
375    true
376}
377
/// Turns an `href` found on a page into an absolute URL, using `base` as the
/// URL it is relative to.
///
/// Returns `None` for references that cannot be crawled: fragment-only links
/// and any scheme other than `http`/`https` (`javascript:`, `mailto:`, `tel:`,
/// `data:`, ...; the scheme check is case-insensitive).
///
/// Resolution rules:
/// * `//host/path` (protocol-relative) gets `https:` prepended;
/// * absolute `http(s)` URLs are returned unchanged;
/// * `/path` is appended to the origin (`scheme://host[:port]`) of `base`;
/// * `?query` replaces the query of `base`;
/// * anything else is appended to the directory of `base`'s path.
///
/// Any query or fragment on `base` is ignored. Dot segments (`./`, `../`) are
/// kept as written, not collapsed.
fn resolve_url(base: &str, href: &str) -> Option<String> {
    let href = href.trim();
    if href.starts_with('#') {
        return None;
    }
    if href.starts_with("//") {
        return Some(format!("https:{}", href));
    }

    // A leading `name:` made of scheme characters marks an absolute reference.
    if let Some((scheme, _)) = href.split_once(':') {
        let is_scheme = scheme.chars().next().map_or(false, |c| c.is_ascii_alphabetic())
            && scheme
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
        if is_scheme {
            let is_web =
                scheme.eq_ignore_ascii_case("http") || scheme.eq_ignore_ascii_case("https");
            return if is_web { Some(href.to_string()) } else { None };
        }
    }

    // Only scheme, authority and path of the base take part in resolution.
    let base_end = base
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(base.len());
    let base = &base[..base_end];

    // Split the base into origin and path. The search for the path must start
    // after `://`, otherwise the slashes of the scheme separator are mistaken
    // for the path (turning `https://example.com` + `about` into `https://about`).
    let authority_start = base.find("://").map_or(0, |i| i + 3);
    let (origin, base_path) = match base[authority_start..].find('/') {
        Some(rel) => base.split_at(authority_start + rel),
        None => (base, "/"),
    };

    if href.starts_with('/') {
        return Some(format!("{}{}", origin, href));
    }
    if href.starts_with('?') {
        return Some(format!("{}{}{}", origin, base_path, href));
    }

    // Relative path: keep the base path up to and including its last '/'.
    let dir_end = base_path.rfind('/').map_or(0, |i| i + 1);
    Some(format!("{}{}{}", origin, &base_path[..dir_end], href))
}