halldyll_parser/
contact.rs

1//! Contact information extraction for halldyll-parser
2//!
3//! This module handles extraction of:
4//! - Email addresses
5//! - Phone numbers
6//! - Physical addresses
7//! - Social media links
8//! - Business information
9
10use regex::Regex;
11use scraper::{Html, Selector, ElementRef};
12use serde::{Deserialize, Serialize};
13use lazy_static::lazy_static;
14use std::collections::HashSet;
15
16use crate::types::ParserResult;
17
18// ============================================================================
19// TYPES
20// ============================================================================
21
/// All contact information found on a page.
///
/// The derived `Default` yields empty collections and `None` for the optional
/// fields; `extract_contact_info` fills every field from a parsed document.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct ContactInfo {
    /// Email addresses found, in discovery order.
    pub emails: Vec<Email>,
    /// Phone numbers found, in discovery order.
    pub phones: Vec<Phone>,
    /// Physical addresses found.
    pub addresses: Vec<Address>,
    /// Social media links found.
    pub social_links: Vec<SocialLink>,
    /// Contact page URL (the raw `href` value) if one was found.
    pub contact_page: Option<String>,
    /// Business name if one was found.
    pub business_name: Option<String>,
}
38
39impl ContactInfo {
40    pub fn new() -> Self {
41        Self::default()
42    }
43
44    /// Check if any contact info was found
45    pub fn has_contact_info(&self) -> bool {
46        !self.emails.is_empty() ||
47        !self.phones.is_empty() ||
48        !self.addresses.is_empty() ||
49        !self.social_links.is_empty()
50    }
51
52    /// Get primary email (first non-generic email)
53    pub fn primary_email(&self) -> Option<&Email> {
54        self.emails.iter().find(|e| !e.is_generic)
55            .or_else(|| self.emails.first())
56    }
57
58    /// Get primary phone
59    pub fn primary_phone(&self) -> Option<&Phone> {
60        self.phones.first()
61    }
62
63    /// Get all unique email addresses as strings
64    pub fn email_addresses(&self) -> Vec<&str> {
65        self.emails.iter().map(|e| e.address.as_str()).collect()
66    }
67}
68
/// An email address with metadata.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Email {
    /// The email address as stored by the extractor.
    pub address: String,
    /// Label (e.g., "support", "sales", "info"); `None` when no label was detected.
    pub label: Option<String>,
    /// Whether this is a generic email (info@, contact@, etc.).
    pub is_generic: bool,
    /// Where the address was found (mailto link, page text, structured data, meta tag).
    pub source: EmailSource,
}
81
82impl Email {
83    pub fn new(address: String) -> Self {
84        let is_generic = Self::check_is_generic(&address);
85        Self {
86            address,
87            label: None,
88            is_generic,
89            source: EmailSource::Text,
90        }
91    }
92
93    fn check_is_generic(address: &str) -> bool {
94        let local = address.split('@').next().unwrap_or("").to_lowercase();
95        let generic_prefixes = [
96            "info", "contact", "hello", "hi", "support", "help",
97            "sales", "admin", "webmaster", "noreply", "no-reply",
98            "mail", "email", "enquiries", "enquiry", "general",
99        ];
100        generic_prefixes.iter().any(|p| local == *p || local.starts_with(&format!("{}.", p)))
101    }
102}
103
/// Source of email extraction.
///
/// `Text` is the derived default and is also what `Email::new` assigns.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum EmailSource {
    /// From a `mailto:` link
    MailtoLink,
    /// From page text
    #[default]
    Text,
    /// From structured data (`itemprop` / `property` attributes)
    StructuredData,
    /// From a meta tag
    MetaTag,
}
117
/// A phone number with metadata.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Phone {
    /// Raw phone number as found
    pub raw: String,
    /// Normalized phone number: the ASCII digits and `+` characters of `raw`,
    /// with every other character removed
    pub normalized: String,
    /// International format if parseable (`Phone::new` leaves this as `None`)
    pub international: Option<String>,
    /// Phone type if detectable
    pub phone_type: PhoneType,
    /// Label (e.g., "main", "fax", "mobile")
    pub label: Option<String>,
}
132
133impl Phone {
134    pub fn new(raw: String) -> Self {
135        let normalized = Self::normalize(&raw);
136        Self {
137            raw,
138            normalized,
139            international: None,
140            phone_type: PhoneType::Unknown,
141            label: None,
142        }
143    }
144
145    fn normalize(raw: &str) -> String {
146        raw.chars().filter(|c| c.is_ascii_digit() || *c == '+').collect()
147    }
148}
149
/// Type of phone number.
///
/// `Unknown` is the derived default and the value `Phone::new` assigns.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum PhoneType {
    /// Main business line
    Main,
    /// Mobile/cell phone
    Mobile,
    /// Fax number
    Fax,
    /// Toll-free number
    TollFree,
    /// Unknown type
    #[default]
    Unknown,
}
165
/// A physical address.
///
/// Only `full_text` is always present; each component is `None` when it was
/// not found.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct Address {
    /// Full address as text
    pub full_text: String,
    /// Street address
    pub street: Option<String>,
    /// City
    pub city: Option<String>,
    /// State/Province
    pub state: Option<String>,
    /// Postal/ZIP code
    pub postal_code: Option<String>,
    /// Country
    pub country: Option<String>,
    /// Coordinates if available
    pub coordinates: Option<Coordinates>,
}
184
/// Geographic coordinates.
// NOTE(review): units and valid ranges are not established in this file —
// presumably decimal degrees; confirm against whatever populates this type.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub struct Coordinates {
    /// Latitude component
    pub latitude: f64,
    /// Longitude component
    pub longitude: f64,
}
191
/// A social media link.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SocialLink {
    /// Platform the URL was recognised as belonging to
    pub platform: SocialPlatform,
    /// Full URL, exactly as it appeared in the link's `href`
    pub url: String,
    /// Username/handle if extractable
    pub username: Option<String>,
}
202
/// Social media platforms recognised by the link extractor.
///
/// `Twitter` also covers `x.com` links (see `detect_social_platform`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum SocialPlatform {
    Facebook,
    Twitter,
    Instagram,
    LinkedIn,
    YouTube,
    TikTok,
    Pinterest,
    GitHub,
    Reddit,
    Discord,
    Telegram,
    WhatsApp,
    Snapchat,
    Tumblr,
    Medium,
    Twitch,
    Vimeo,
    Flickr,
    /// Not produced by `detect_social_platform` in this file.
    Other,
}
226
227// ============================================================================
228// LAZY STATIC PATTERNS
229// ============================================================================
230
// Every pattern below is compiled on first use; `unwrap()` would panic at that
// point if a literal were malformed. None of the patterns is anchored, so
// `is_match` succeeds when the pattern occurs anywhere in the haystack.
lazy_static! {
    /// Email regex pattern (case-insensitive): local part, `@`, domain labels
    /// and a final alphabetic label of at least two letters.
    static ref EMAIL_PATTERN: Regex = Regex::new(
        r"(?i)[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}"
    ).unwrap();

    /// Phone number patterns (various formats).
    ///
    /// First alternative: a 3-3-4 digit number with an optional `1` / `+1`
    /// prefix and optional `-`, `.` or whitespace separators. Second
    /// alternative: a `+`-prefixed number made of up to four digit groups.
    static ref PHONE_PATTERN: Regex = Regex::new(
        r"(?x)
        (?:\+?1[-.\s]?)?          # Optional country code
        (?:\(?\d{3}\)?[-.\s]?)    # Area code
        \d{3}[-.\s]?              # Exchange
        \d{4}                     # Subscriber
        |
        \+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}  # International
        "
    ).unwrap();

    /// US/CA toll-free pattern: one of the listed toll-free prefixes followed
    /// by seven digits, with an optional leading `1` and optional separators.
    static ref TOLL_FREE_PATTERN: Regex = Regex::new(
        r"(?i)1?[-.\s]?(?:800|888|877|866|855|844|833)[-.\s]?\d{3}[-.\s]?\d{4}"
    ).unwrap();

    /// Postal code patterns
    /// US ZIP: five digits with an optional `-dddd` extension, on word boundaries.
    static ref US_ZIP_PATTERN: Regex = Regex::new(r"\b\d{5}(?:-\d{4})?\b").unwrap();
    /// UK postcode shape (case-insensitive), with optional whitespace between
    /// the two halves.
    static ref UK_POSTAL_PATTERN: Regex = Regex::new(
        r"(?i)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b"
    ).unwrap();
    /// Canadian postal code shape `A1A 1A1` (case-insensitive, whitespace optional).
    static ref CA_POSTAL_PATTERN: Regex = Regex::new(
        r"(?i)\b[A-Z]\d[A-Z]\s*\d[A-Z]\d\b"
    ).unwrap();
}
263
264// ============================================================================
265// EXTRACTION FUNCTIONS
266// ============================================================================
267
268/// Extract all contact information from HTML document
269pub fn extract_contact_info(document: &Html) -> ParserResult<ContactInfo> {
270    let mut info = ContactInfo::new();
271
272    // Extract emails
273    info.emails = extract_emails(document);
274
275    // Extract phones
276    info.phones = extract_phones(document);
277
278    // Extract addresses
279    info.addresses = extract_addresses(document);
280
281    // Extract social links
282    info.social_links = extract_social_links(document);
283
284    // Find contact page link
285    info.contact_page = find_contact_page(document);
286
287    // Try to find business name
288    info.business_name = extract_business_name(document);
289
290    Ok(info)
291}
292
293/// Extract email addresses from document
294pub fn extract_emails(document: &Html) -> Vec<Email> {
295    let mut emails = Vec::new();
296    let mut seen = HashSet::new();
297
298    // From mailto: links
299    if let Ok(sel) = Selector::parse("a[href^='mailto:']") {
300        for el in document.select(&sel) {
301            if let Some(href) = el.value().attr("href") {
302                let addr = href.trim_start_matches("mailto:")
303                    .split('?').next()
304                    .unwrap_or("")
305                    .to_lowercase();
306                
307                if EMAIL_PATTERN.is_match(&addr) && !seen.contains(&addr) {
308                    seen.insert(addr.clone());
309                    let mut email = Email::new(addr);
310                    email.source = EmailSource::MailtoLink;
311                    email.label = extract_email_label(&el);
312                    emails.push(email);
313                }
314            }
315        }
316    }
317
318    // From text content
319    let text = document.root_element().text().collect::<String>();
320    for caps in EMAIL_PATTERN.find_iter(&text) {
321        let addr = caps.as_str().to_lowercase();
322        if !seen.contains(&addr) && is_valid_email(&addr) {
323            seen.insert(addr.clone());
324            emails.push(Email::new(addr));
325        }
326    }
327
328    // From structured data
329    if let Ok(sel) = Selector::parse("[itemprop='email'], [property='email']") {
330        for el in document.select(&sel) {
331            let content = el.value().attr("content")
332                .or_else(|| el.value().attr("href"))
333                .map(|s| s.trim_start_matches("mailto:").to_lowercase())
334                .or_else(|| Some(el.text().collect::<String>().trim().to_lowercase()));
335            
336            if let Some(addr) = content {
337                if EMAIL_PATTERN.is_match(&addr) && !seen.contains(&addr) {
338                    seen.insert(addr.clone());
339                    let mut email = Email::new(addr);
340                    email.source = EmailSource::StructuredData;
341                    emails.push(email);
342                }
343            }
344        }
345    }
346
347    emails
348}
349
/// Filter out regex matches that are really image file names.
///
/// Returns `false` when `email` ends with one of the listed image extensions
/// (for example `logo@2x.png`) and `true` otherwise. The comparison is
/// case-sensitive.
fn is_valid_email(email: &str) -> bool {
    const IMAGE_SUFFIXES: [&str; 6] = [".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg"];

    for suffix in IMAGE_SUFFIXES.iter() {
        if email.ends_with(*suffix) {
            return false;
        }
    }
    true
}
356
357/// Extract label for email from surrounding context
358fn extract_email_label(element: &ElementRef) -> Option<String> {
359    let text = element.text().collect::<String>().trim().to_lowercase();
360    
361    let labels = ["support", "sales", "info", "contact", "help", "billing", "press", "careers", "jobs"];
362    for label in labels {
363        if text.contains(label) {
364            return Some(label.to_string());
365        }
366    }
367    
368    None
369}
370
371/// Extract phone numbers from document
372pub fn extract_phones(document: &Html) -> Vec<Phone> {
373    let mut phones = Vec::new();
374    let mut seen = HashSet::new();
375
376    // From tel: links
377    if let Ok(sel) = Selector::parse("a[href^='tel:']") {
378        for el in document.select(&sel) {
379            if let Some(href) = el.value().attr("href") {
380                let raw = href.trim_start_matches("tel:").to_string();
381                let normalized = Phone::new(raw.clone()).normalized.clone();
382                
383                if normalized.len() >= 10 && !seen.contains(&normalized) {
384                    seen.insert(normalized);
385                    let mut phone = Phone::new(raw);
386                    phone.label = extract_phone_label(&el);
387                    phone.phone_type = detect_phone_type(&phone);
388                    phones.push(phone);
389                }
390            }
391        }
392    }
393
394    // From text content
395    let text = document.root_element().text().collect::<String>();
396    for caps in PHONE_PATTERN.find_iter(&text) {
397        let raw = caps.as_str().to_string();
398        let normalized = Phone::new(raw.clone()).normalized.clone();
399        
400        if normalized.len() >= 10 && !seen.contains(&normalized) {
401            seen.insert(normalized);
402            let mut phone = Phone::new(raw);
403            phone.phone_type = detect_phone_type(&phone);
404            phones.push(phone);
405        }
406    }
407
408    // From structured data
409    if let Ok(sel) = Selector::parse("[itemprop='telephone'], [property='telephone']") {
410        for el in document.select(&sel) {
411            let content = el.value().attr("content")
412                .map(|s| s.to_string())
413                .or_else(|| Some(el.text().collect::<String>().trim().to_string()));
414            
415            if let Some(raw) = content {
416                let normalized = Phone::new(raw.clone()).normalized.clone();
417                if normalized.len() >= 10 && !seen.contains(&normalized) {
418                    seen.insert(normalized);
419                    phones.push(Phone::new(raw));
420                }
421            }
422        }
423    }
424
425    phones
426}
427
428/// Extract phone label from context
429fn extract_phone_label(element: &ElementRef) -> Option<String> {
430    let text = element.text().collect::<String>().to_lowercase();
431    
432    if text.contains("fax") {
433        Some("fax".to_string())
434    } else if text.contains("mobile") || text.contains("cell") {
435        Some("mobile".to_string())
436    } else if text.contains("main") || text.contains("office") {
437        Some("main".to_string())
438    } else {
439        None
440    }
441}
442
443/// Detect phone type
444fn detect_phone_type(phone: &Phone) -> PhoneType {
445    if TOLL_FREE_PATTERN.is_match(&phone.raw) {
446        PhoneType::TollFree
447    } else if phone.label.as_ref().map(|l| l.contains("fax")).unwrap_or(false) {
448        PhoneType::Fax
449    } else if phone.label.as_ref().map(|l| l.contains("mobile") || l.contains("cell")).unwrap_or(false) {
450        PhoneType::Mobile
451    } else {
452        PhoneType::Unknown
453    }
454}
455
456/// Extract physical addresses from document
457pub fn extract_addresses(document: &Html) -> Vec<Address> {
458    let mut addresses = Vec::new();
459
460    // From structured data (Schema.org)
461    if let Ok(sel) = Selector::parse("[itemtype*='PostalAddress'], [itemprop='address']") {
462        for el in document.select(&sel) {
463            if let Some(addr) = extract_structured_address(&el) {
464                addresses.push(addr);
465            }
466        }
467    }
468
469    // From address elements
470    if let Ok(sel) = Selector::parse("address") {
471        for el in document.select(&sel) {
472            let text = el.text().collect::<String>().trim().to_string();
473            if !text.is_empty() && text.len() > 10 {
474                let mut addr = Address {
475                    full_text: text,
476                    ..Default::default()
477                };
478                extract_address_components(&mut addr);
479                addresses.push(addr);
480            }
481        }
482    }
483
484    // Deduplicate
485    addresses.dedup_by(|a, b| a.full_text == b.full_text);
486
487    addresses
488}
489
490/// Extract address from structured data element
491fn extract_structured_address(element: &ElementRef) -> Option<Address> {
492    let mut addr = Address::default();
493
494    let selectors = [
495        ("streetAddress", "street"),
496        ("addressLocality", "city"),
497        ("addressRegion", "state"),
498        ("postalCode", "postal_code"),
499        ("addressCountry", "country"),
500    ];
501
502    for (itemprop, field) in selectors {
503        if let Ok(sel) = Selector::parse(&format!("[itemprop='{}']", itemprop)) {
504            if let Some(el) = element.select(&sel).next() {
505                let value = el.value().attr("content")
506                    .map(|s| s.to_string())
507                    .or_else(|| Some(el.text().collect::<String>().trim().to_string()));
508                
509                match field {
510                    "street" => addr.street = value,
511                    "city" => addr.city = value,
512                    "state" => addr.state = value,
513                    "postal_code" => addr.postal_code = value,
514                    "country" => addr.country = value,
515                    _ => {}
516                }
517            }
518        }
519    }
520
521    // Build full text
522    let parts: Vec<&str> = [
523        addr.street.as_deref(),
524        addr.city.as_deref(),
525        addr.state.as_deref(),
526        addr.postal_code.as_deref(),
527        addr.country.as_deref(),
528    ].into_iter().flatten().collect();
529
530    if parts.is_empty() {
531        return None;
532    }
533
534    addr.full_text = parts.join(", ");
535    Some(addr)
536}
537
538/// Extract address components from full text
539fn extract_address_components(addr: &mut Address) {
540    // Try to extract postal code
541    if let Some(caps) = US_ZIP_PATTERN.find(&addr.full_text) {
542        addr.postal_code = Some(caps.as_str().to_string());
543    } else if let Some(caps) = UK_POSTAL_PATTERN.find(&addr.full_text) {
544        addr.postal_code = Some(caps.as_str().to_string());
545    } else if let Some(caps) = CA_POSTAL_PATTERN.find(&addr.full_text) {
546        addr.postal_code = Some(caps.as_str().to_string());
547    }
548}
549
550/// Extract social media links from document
551pub fn extract_social_links(document: &Html) -> Vec<SocialLink> {
552    let mut links = Vec::new();
553    let mut seen = HashSet::new();
554
555    if let Ok(sel) = Selector::parse("a[href]") {
556        for el in document.select(&sel) {
557            if let Some(href) = el.value().attr("href") {
558                if let Some(platform) = detect_social_platform(href) {
559                    let url = href.to_string();
560                    if !seen.contains(&url) {
561                        seen.insert(url.clone());
562                        links.push(SocialLink {
563                            platform,
564                            url: url.clone(),
565                            username: extract_social_username(&url, platform),
566                        });
567                    }
568                }
569            }
570        }
571    }
572
573    links
574}
575
576/// Detect social platform from URL
577fn detect_social_platform(url: &str) -> Option<SocialPlatform> {
578    let url_lower = url.to_lowercase();
579    
580    let platforms = [
581        ("facebook.com", SocialPlatform::Facebook),
582        ("fb.com", SocialPlatform::Facebook),
583        ("twitter.com", SocialPlatform::Twitter),
584        ("x.com", SocialPlatform::Twitter),
585        ("instagram.com", SocialPlatform::Instagram),
586        ("linkedin.com", SocialPlatform::LinkedIn),
587        ("youtube.com", SocialPlatform::YouTube),
588        ("youtu.be", SocialPlatform::YouTube),
589        ("tiktok.com", SocialPlatform::TikTok),
590        ("pinterest.com", SocialPlatform::Pinterest),
591        ("github.com", SocialPlatform::GitHub),
592        ("reddit.com", SocialPlatform::Reddit),
593        ("discord.gg", SocialPlatform::Discord),
594        ("discord.com", SocialPlatform::Discord),
595        ("t.me", SocialPlatform::Telegram),
596        ("telegram.me", SocialPlatform::Telegram),
597        ("wa.me", SocialPlatform::WhatsApp),
598        ("whatsapp.com", SocialPlatform::WhatsApp),
599        ("snapchat.com", SocialPlatform::Snapchat),
600        ("tumblr.com", SocialPlatform::Tumblr),
601        ("medium.com", SocialPlatform::Medium),
602        ("twitch.tv", SocialPlatform::Twitch),
603        ("vimeo.com", SocialPlatform::Vimeo),
604        ("flickr.com", SocialPlatform::Flickr),
605    ];
606
607    for (domain, platform) in platforms {
608        if url_lower.contains(domain) {
609            return Some(platform);
610        }
611    }
612
613    None
614}
615
616/// Extract username from social URL
617fn extract_social_username(url: &str, platform: SocialPlatform) -> Option<String> {
618    let url_lower = url.to_lowercase();
619    
620    // Remove protocol and www
621    let cleaned = url_lower
622        .trim_start_matches("https://")
623        .trim_start_matches("http://")
624        .trim_start_matches("www.");
625
626    // Split by /
627    let parts: Vec<&str> = cleaned.split('/').collect();
628    
629    match platform {
630        SocialPlatform::Twitter | SocialPlatform::Instagram | 
631        SocialPlatform::GitHub | SocialPlatform::TikTok => {
632            // Format: platform.com/username
633            parts.get(1).filter(|s| !s.is_empty() && !s.starts_with('?')).map(|s| s.to_string())
634        }
635        SocialPlatform::Facebook => {
636            // Format: facebook.com/username or facebook.com/pages/name/id
637            parts.get(1).filter(|s| !s.is_empty() && **s != "pages" && !s.starts_with('?')).map(|s| s.to_string())
638        }
639        SocialPlatform::YouTube => {
640            // Format: youtube.com/c/channel or youtube.com/@handle
641            if let Some(p) = parts.get(1) {
642                if *p == "c" || *p == "channel" || *p == "user" {
643                    return parts.get(2).map(|s| s.to_string());
644                }
645                if p.starts_with('@') {
646                    return Some(p.to_string());
647                }
648            }
649            None
650        }
651        _ => None,
652    }
653}
654
655/// Find contact page link
656fn find_contact_page(document: &Html) -> Option<String> {
657    let contact_patterns = ["contact", "kontakt", "contacto", "contato"];
658    
659    if let Ok(sel) = Selector::parse("a[href]") {
660        for el in document.select(&sel) {
661            if let Some(href) = el.value().attr("href") {
662                let href_lower = href.to_lowercase();
663                let text_lower = el.text().collect::<String>().to_lowercase();
664                
665                for pattern in contact_patterns {
666                    if href_lower.contains(pattern) || text_lower.contains(pattern) {
667                        return Some(href.to_string());
668                    }
669                }
670            }
671        }
672    }
673
674    None
675}
676
677/// Extract business name from document
678fn extract_business_name(document: &Html) -> Option<String> {
679    // From structured data
680    if let Ok(sel) = Selector::parse("[itemprop='name'], [property='og:site_name']") {
681        if let Some(el) = document.select(&sel).next() {
682            let text_content = el.text().collect::<String>();
683            let name = el.value().attr("content")
684                .or(Some(text_content.as_str()))
685                .map(|s| s.trim().to_string());
686            
687            if let Some(ref n) = name {
688                if !n.is_empty() && n.len() < 100 {
689                    return name;
690                }
691            }
692        }
693    }
694
695    None
696}
697
698// ============================================================================
699// CONVENIENCE FUNCTIONS
700// ============================================================================
701
702/// Check if page has contact information
703pub fn has_contact_info(document: &Html) -> bool {
704    extract_contact_info(document)
705        .map(|c| c.has_contact_info())
706        .unwrap_or(false)
707}
708
709/// Get all emails from page
710pub fn get_emails(document: &Html) -> Vec<String> {
711    extract_emails(document)
712        .into_iter()
713        .map(|e| e.address)
714        .collect()
715}
716
717/// Get all phone numbers from page
718pub fn get_phones(document: &Html) -> Vec<String> {
719    extract_phones(document)
720        .into_iter()
721        .map(|p| p.raw)
722        .collect()
723}
724
725/// Get all social links from page
726pub fn get_social_links(document: &Html) -> Vec<String> {
727    extract_social_links(document)
728        .into_iter()
729        .map(|s| s.url)
730        .collect()
731}
732
733// ============================================================================
734// TESTS
735// ============================================================================
736
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse an HTML snippet as a full document for use in the tests below.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // A mailto link yields one email, tagged as MailtoLink and flagged generic.
    #[test]
    fn test_extract_mailto_email() {
        let html = r#"
            <a href="mailto:contact@example.com">Contact Us</a>
        "#;

        let doc = parse_html(html);
        let emails = extract_emails(&doc);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].address, "contact@example.com");
        assert_eq!(emails[0].source, EmailSource::MailtoLink);
        assert!(emails[0].is_generic);
    }

    // An address in plain text is found and is not considered generic.
    #[test]
    fn test_extract_text_email() {
        let html = r#"
            <p>Email us at john.doe@company.org for inquiries.</p>
        "#;

        let doc = parse_html(html);
        let emails = extract_emails(&doc);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].address, "john.doe@company.org");
        assert!(!emails[0].is_generic);
    }

    // A tel: link is normalized to digits plus the leading '+'.
    #[test]
    fn test_extract_phone_tel() {
        let html = r#"
            <a href="tel:+1-555-123-4567">Call Us</a>
        "#;

        let doc = parse_html(html);
        let phones = extract_phones(&doc);

        assert_eq!(phones.len(), 1);
        assert_eq!(phones[0].normalized, "+15551234567");
    }

    // A 1-800 number found in text is classified as toll-free.
    #[test]
    fn test_extract_toll_free() {
        let html = r#"
            <p>Call us at 1-800-555-1234</p>
        "#;

        let doc = parse_html(html);
        let phones = extract_phones(&doc);

        assert!(!phones.is_empty());
        assert_eq!(phones[0].phone_type, PhoneType::TollFree);
    }

    // Each of three distinct platform links is detected once.
    #[test]
    fn test_extract_social_links() {
        let html = r#"
            <a href="https://twitter.com/example">Twitter</a>
            <a href="https://facebook.com/example">Facebook</a>
            <a href="https://github.com/example">GitHub</a>
        "#;

        let doc = parse_html(html);
        let links = extract_social_links(&doc);

        assert_eq!(links.len(), 3);
        assert!(links.iter().any(|l| l.platform == SocialPlatform::Twitter));
        assert!(links.iter().any(|l| l.platform == SocialPlatform::Facebook));
        assert!(links.iter().any(|l| l.platform == SocialPlatform::GitHub));
    }

    // The first path segment is the username for Twitter and GitHub.
    #[test]
    fn test_extract_social_username() {
        assert_eq!(
            extract_social_username("https://twitter.com/johndoe", SocialPlatform::Twitter),
            Some("johndoe".to_string())
        );
        assert_eq!(
            extract_social_username("https://github.com/rust-lang", SocialPlatform::GitHub),
            Some("rust-lang".to_string())
        );
    }

    // Schema.org PostalAddress microdata is split into its components.
    #[test]
    fn test_extract_structured_address() {
        let html = r#"
            <div itemscope itemtype="https://schema.org/PostalAddress">
                <span itemprop="streetAddress">123 Main St</span>
                <span itemprop="addressLocality">Springfield</span>
                <span itemprop="addressRegion">IL</span>
                <span itemprop="postalCode">62701</span>
                <span itemprop="addressCountry">USA</span>
            </div>
        "#;

        let doc = parse_html(html);
        let addresses = extract_addresses(&doc);

        assert_eq!(addresses.len(), 1);
        assert_eq!(addresses[0].street, Some("123 Main St".to_string()));
        assert_eq!(addresses[0].city, Some("Springfield".to_string()));
        assert_eq!(addresses[0].postal_code, Some("62701".to_string()));
    }

    // The first link mentioning "contact" is returned; "/about" is skipped.
    #[test]
    fn test_find_contact_page() {
        let html = r#"
            <nav>
                <a href="/about">About</a>
                <a href="/contact">Contact Us</a>
            </nav>
        "#;

        let doc = parse_html(html);
        let contact = find_contact_page(&doc);

        assert_eq!(contact, Some("/contact".to_string()));
    }

    // Well-known role mailboxes are generic; a personal address is not.
    #[test]
    fn test_email_is_generic() {
        assert!(Email::new("info@example.com".to_string()).is_generic);
        assert!(Email::new("contact@example.com".to_string()).is_generic);
        assert!(Email::new("support@example.com".to_string()).is_generic);
        assert!(!Email::new("john.doe@example.com".to_string()).is_generic);
    }

    // primary_email prefers the first non-generic address over an earlier generic one.
    #[test]
    fn test_primary_email() {
        let mut info = ContactInfo::new();
        info.emails = vec![
            Email::new("info@example.com".to_string()),
            Email::new("john@example.com".to_string()),
        ];

        let primary = info.primary_email().unwrap();
        assert_eq!(primary.address, "john@example.com");
    }

    // The convenience check is true with a mailto link and false for a bare page.
    #[test]
    fn test_has_contact_info() {
        let html = r#"
            <a href="mailto:test@example.com">Email</a>
        "#;

        let doc = parse_html(html);
        assert!(has_contact_info(&doc));

        let html_empty = "<html><body><p>No contact info</p></body></html>";
        let doc_empty = parse_html(html_empty);
        assert!(!has_contact_info(&doc_empty));
    }

    // x.com links are reported under the Twitter platform.
    #[test]
    fn test_x_twitter_detection() {
        let html = r#"
            <a href="https://x.com/example">X (Twitter)</a>
        "#;

        let doc = parse_html(html);
        let links = extract_social_links(&doc);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].platform, SocialPlatform::Twitter);
    }

    // YouTube handles keep their '@'; /c/<name> yields the name.
    #[test]
    fn test_youtube_username_extraction() {
        assert_eq!(
            extract_social_username("https://youtube.com/@handle", SocialPlatform::YouTube),
            Some("@handle".to_string())
        );
        assert_eq!(
            extract_social_username("https://youtube.com/c/channelname", SocialPlatform::YouTube),
            Some("channelname".to_string())
        );
    }
}
923}