
email_extract/extracted.rs

//! Entity extraction from email content

use regex::Regex;
use serde::{Deserialize, Serialize};

/// All entities extracted from email content
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractedEntities {
    /// Email addresses found in body
    pub emails: Vec<ExtractedEmail>,

    /// Phone numbers found
    pub phone_numbers: Vec<PhoneNumber>,

    /// URLs found
    pub urls: Vec<ExtractedUrl>,

    /// Possible person names
    pub names: Vec<String>,

    /// Company names detected
    pub companies: Vec<String>,

    /// Dates mentioned
    pub dates: Vec<String>,

    /// Monetary amounts
    pub amounts: Vec<MonetaryAmount>,

    /// Physical addresses
    pub addresses: Vec<String>,

    /// Social media handles
    pub social_handles: Vec<SocialHandle>,
}

/// Extracted email address
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedEmail {
    pub address: String,
    pub context: String, // surrounding text
    pub position: usize, // byte offset in body
}

/// Phone number with type detection
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhoneNumber {
    pub raw: String,
    pub normalized: String,
    pub phone_type: PhoneType,
    pub country_code: Option<String>,
}

/// Type of phone number
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum PhoneType {
    Mobile,
    Landline,
    TollFree,
    Unknown,
}

/// Extracted URL with analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedUrl {
    pub url: String,
    pub domain: String,
    pub is_tracking: bool,
    pub url_type: UrlType,
}

/// Type of URL
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum UrlType {
    Website,
    SocialMedia,
    Unsubscribe,
    Tracking,
    Calendar,
    Document,
    Other,
}

/// Monetary amount
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonetaryAmount {
    pub raw: String,
    pub value: f64,
    pub currency: String,
}

/// Social media handle
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SocialHandle {
    pub platform: SocialPlatform,
    pub handle: String,
}

/// Social media platform
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum SocialPlatform {
    Twitter,
    LinkedIn,
    Instagram,
    Facebook,
    GitHub,
    Other(String),
}

// Regex patterns
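// Pragmatic e-mail pattern: local part, '@', dotted domain with a 2+ letter TLD.
// Far simpler than the full RFC 5322 grammar.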
static EMAIL_REGEX: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
    Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap()
});

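// North American-style numbers: optional +1, optional (area code), and '-', '.',
// or space separators between digit groups.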
static PHONE_REGEX: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
    Regex::new(r"(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}").unwrap()
});

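// http/https URLs, read up to the next whitespace or common delimiter character.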
static URL_REGEX: std::sync::LazyLock<Regex> =
    std::sync::LazyLock::new(|| Regex::new(r"https?://[^\s<>\[\]{}|\\^]+").unwrap());

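// Amounts either prefixed with a currency symbol ($, €, £, ¥) or suffixed with a
// currency code (USD, EUR, GBP, CAD, AUD), with optional thousands separators.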
static AMOUNT_REGEX: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
    Regex::new(r"(?:[$€£¥])\s*[\d,]+(?:\.\d{2})?|[\d,]+(?:\.\d{2})?\s*(?:USD|EUR|GBP|CAD|AUD)")
        .unwrap()
});

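// '@handle' mentions (1-15 word characters). Note this also matches the '@domain'
// portion of plain e-mail addresses, so some false positives are expected.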
static TWITTER_REGEX: std::sync::LazyLock<Regex> =
    std::sync::LazyLock::new(|| Regex::new(r"@([a-zA-Z0-9_]{1,15})").unwrap());

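// Public LinkedIn profile slugs of the form linkedin.com/in/<handle>.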
static LINKEDIN_REGEX: std::sync::LazyLock<Regex> =
    std::sync::LazyLock::new(|| Regex::new(r"linkedin\.com/in/([a-zA-Z0-9-]+)").unwrap());

/// Snap a byte index to the nearest valid UTF-8 char boundary (backwards)
const fn snap_to_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    let mut i = idx;
    while !s.is_char_boundary(i) && i > 0 {
        i -= 1;
    }
    i
}

impl ExtractedEntities {
    /// Extract all entities from text content
    pub fn extract(text: &str) -> Self {
        let mut entities = Self::default();

        // Extract emails
        for cap in EMAIL_REGEX.find_iter(text) {
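            // Keep ~30 bytes of surrounding text on each side, snapped back to the
            // nearest char boundary so the slice below cannot split a UTF-8 sequence.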
            let start = snap_to_char_boundary(text, cap.start().saturating_sub(30));
            let end = snap_to_char_boundary(text, (cap.end() + 30).min(text.len()));
            let context = text[start..end].to_string();

            entities.emails.push(ExtractedEmail {
                address: cap.as_str().to_string(),
                context,
                position: cap.start(),
            });
        }

        // Extract phone numbers
        for cap in PHONE_REGEX.find_iter(text) {
            let raw = cap.as_str().to_string();
            let normalized = normalize_phone(&raw);
            let phone_type = detect_phone_type(&normalized);

            entities.phone_numbers.push(PhoneNumber {
                raw,
                normalized,
                phone_type,
                country_code: None,
            });
        }

        // Extract URLs
        for cap in URL_REGEX.find_iter(text) {
            let url = cap.as_str().to_string();
            let domain = extract_domain(&url);
            let is_tracking = is_tracking_url(&url);
            let url_type = detect_url_type(&url, &domain);

            entities.urls.push(ExtractedUrl {
                url,
                domain,
                is_tracking,
                url_type,
            });
        }

        // Extract monetary amounts
        for cap in AMOUNT_REGEX.find_iter(text) {
            if let Some(amount) = parse_amount(cap.as_str()) {
                entities.amounts.push(amount);
            }
        }

        // Extract social handles
        for cap in TWITTER_REGEX.captures_iter(text) {
            if let Some(handle) = cap.get(1) {
                entities.social_handles.push(SocialHandle {
                    platform: SocialPlatform::Twitter,
                    handle: handle.as_str().to_string(),
                });
            }
        }

        for cap in LINKEDIN_REGEX.captures_iter(text) {
            if let Some(handle) = cap.get(1) {
                entities.social_handles.push(SocialHandle {
                    platform: SocialPlatform::LinkedIn,
                    handle: handle.as_str().to_string(),
                });
            }
        }

        entities
    }

    /// Check if any entities were extracted
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.emails.is_empty()
            && self.phone_numbers.is_empty()
            && self.urls.is_empty()
            && self.amounts.is_empty()
            && self.social_handles.is_empty()
    }

    /// Get count of all extracted entities
    #[must_use]
    pub const fn total_count(&self) -> usize {
        self.emails.len()
            + self.phone_numbers.len()
            + self.urls.len()
            + self.amounts.len()
            + self.social_handles.len()
    }
}

fn normalize_phone(phone: &str) -> String {
    phone
        .chars()
        .filter(|c| c.is_ascii_digit() || *c == '+')
        .collect()
}

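// Coarse classification: only the 1-800/1-888/1-877 toll-free prefixes are
// recognized; everything else falls through to `Unknown`.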
fn detect_phone_type(normalized: &str) -> PhoneType {
    let digits: String = normalized.chars().filter(char::is_ascii_digit).collect();

    if digits.starts_with("1800") || digits.starts_with("1888") || digits.starts_with("1877") {
        PhoneType::TollFree
    } else {
        PhoneType::Unknown
    }
}

fn extract_domain(url: &str) -> String {
    url.trim_start_matches("https://")
        .trim_start_matches("http://")
        .split('/')
        .next()
        .unwrap_or("")
        .to_string()
}

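// Heuristic only: substring checks catch most redirect/analytics links, but any
// URL that merely contains one of these words will also be flagged.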
fn is_tracking_url(url: &str) -> bool {
    let lower = url.to_lowercase();
    lower.contains("track")
        || lower.contains("click")
        || lower.contains("redirect")
        || lower.contains("utm_")
        || lower.contains("mc_eid")
        || lower.contains("trk")
}

#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn detect_url_type(url: &str, domain: &str) -> UrlType {
    let lower = url.to_lowercase();
    let domain_lower = domain.to_lowercase();

    if lower.contains("unsubscribe") || lower.contains("optout") {
        UrlType::Unsubscribe
    } else if is_tracking_url(url) {
        UrlType::Tracking
    } else if domain_lower.contains("linkedin")
        || domain_lower.contains("twitter")
        || domain_lower.contains("facebook")
        || domain_lower.contains("instagram")
    {
        UrlType::SocialMedia
    } else if lower.contains("calendar") || lower.contains(".ics") {
        UrlType::Calendar
    } else if lower.ends_with(".pdf")
        || lower.ends_with(".doc")
        || lower.ends_with(".docx")
        || lower.ends_with(".xls")
    {
        UrlType::Document
    } else {
        UrlType::Website
    }
}

fn parse_amount(raw: &str) -> Option<MonetaryAmount> {
    let clean: String = raw
        .chars()
        .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',')
        .collect();

    let clean = clean.replace(',', "");

    let value: f64 = clean.parse().ok()?;

    // Map the matched symbol or code to a currency label; '¥' is ambiguous between
    // JPY and CNY, so JPY is assumed here, and anything unrecognized falls back to USD.
    let currency = if raw.contains('$') || raw.contains("USD") {
        "USD"
    } else if raw.contains('€') || raw.contains("EUR") {
        "EUR"
    } else if raw.contains('£') || raw.contains("GBP") {
        "GBP"
    } else if raw.contains('¥') {
        "JPY"
    } else if raw.contains("CAD") {
        "CAD"
    } else if raw.contains("AUD") {
        "AUD"
    } else {
        "USD"
    };

    Some(MonetaryAmount {
        raw: raw.to_string(),
        value,
        currency: currency.to_string(),
    })
}
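
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal smoke test sketching typical usage. The sample text and the expected
    // counts below are illustrative additions, not part of the original module.
    #[test]
    fn extracts_basic_entities() {
        let text = "Contact jane.doe@example.com or call (555) 123-4567. \
                    Invoice total: $1,299.50 via https://example.com/invoice.pdf";

        let entities = ExtractedEntities::extract(text);

        assert_eq!(entities.emails.len(), 1);
        assert_eq!(entities.emails[0].address, "jane.doe@example.com");

        assert_eq!(entities.phone_numbers.len(), 1);
        assert_eq!(entities.phone_numbers[0].normalized, "5551234567");

        assert_eq!(entities.urls.len(), 1);
        assert_eq!(entities.urls[0].url_type, UrlType::Document);

        assert_eq!(entities.amounts.len(), 1);
        assert!((entities.amounts[0].value - 1299.50).abs() < f64::EPSILON);

        assert!(!entities.is_empty());
    }
}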