1use regex::Regex;
4use serde::{Deserialize, Serialize};
5
6#[derive(Debug, Clone, Default, Serialize, Deserialize)]
8pub struct ExtractedEntities {
9 pub emails: Vec<ExtractedEmail>,
11
12 pub phone_numbers: Vec<PhoneNumber>,
14
15 pub urls: Vec<ExtractedUrl>,
17
18 pub names: Vec<String>,
20
21 pub companies: Vec<String>,
23
24 pub dates: Vec<String>,
26
27 pub amounts: Vec<MonetaryAmount>,
29
30 pub addresses: Vec<String>,
32
33 pub social_handles: Vec<SocialHandle>,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct ExtractedEmail {
40 pub address: String,
41 pub context: String, pub position: usize, }
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct PhoneNumber {
48 pub raw: String,
49 pub normalized: String,
50 pub phone_type: PhoneType,
51 pub country_code: Option<String>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
56pub enum PhoneType {
57 Mobile,
58 Landline,
59 TollFree,
60 Unknown,
61}
62
63#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct ExtractedUrl {
66 pub url: String,
67 pub domain: String,
68 pub is_tracking: bool,
69 pub url_type: UrlType,
70}
71
72#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
74pub enum UrlType {
75 Website,
76 SocialMedia,
77 Unsubscribe,
78 Tracking,
79 Calendar,
80 Document,
81 Other,
82}
83
84#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct MonetaryAmount {
87 pub raw: String,
88 pub value: f64,
89 pub currency: String,
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize)]
94pub struct SocialHandle {
95 pub platform: SocialPlatform,
96 pub handle: String,
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
101pub enum SocialPlatform {
102 Twitter,
103 LinkedIn,
104 Instagram,
105 Facebook,
106 GitHub,
107 Other(String),
108}
109
110static EMAIL_REGEX: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
112 Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap()
113});
114
115static PHONE_REGEX: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
116 Regex::new(r"(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}").unwrap()
117});
118
119static URL_REGEX: std::sync::LazyLock<Regex> =
120 std::sync::LazyLock::new(|| Regex::new(r"https?://[^\s<>\[\]{}|\\^]+").unwrap());
121
122static AMOUNT_REGEX: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
123 Regex::new(r"(?:[$€£¥])\s*[\d,]+(?:\.\d{2})?|[\d,]+(?:\.\d{2})?\s*(?:USD|EUR|GBP|CAD|AUD)")
124 .unwrap()
125});
126
127static TWITTER_REGEX: std::sync::LazyLock<Regex> =
128 std::sync::LazyLock::new(|| Regex::new(r"@([a-zA-Z0-9_]{1,15})").unwrap());
129
130static LINKEDIN_REGEX: std::sync::LazyLock<Regex> =
131 std::sync::LazyLock::new(|| Regex::new(r"linkedin\.com/in/([a-zA-Z0-9-]+)").unwrap());
132
/// Returns the largest UTF-8 character boundary in `s` that is `<= idx`.
///
/// Indices at or beyond the end of the string clamp to `s.len()`. The result
/// is always safe to use as a slice endpoint for `s`.
const fn snap_to_char_boundary(s: &str, idx: usize) -> usize {
    let len = s.len();
    // Clamp first so the walk below always starts inside the string.
    let mut pos = if idx < len { idx } else { len };
    loop {
        // Offset 0 is always a boundary, so the walk terminates there at the latest.
        if pos == 0 || s.is_char_boundary(pos) {
            return pos;
        }
        pos -= 1;
    }
}
144
145impl ExtractedEntities {
146 pub fn extract(text: &str) -> Self {
148 let mut entities = Self::default();
149
150 for cap in EMAIL_REGEX.find_iter(text) {
152 let start = snap_to_char_boundary(text, cap.start().saturating_sub(30));
153 let end = snap_to_char_boundary(text, (cap.end() + 30).min(text.len()));
154 let context = text[start..end].to_string();
155
156 entities.emails.push(ExtractedEmail {
157 address: cap.as_str().to_string(),
158 context,
159 position: cap.start(),
160 });
161 }
162
163 for cap in PHONE_REGEX.find_iter(text) {
165 let raw = cap.as_str().to_string();
166 let normalized = normalize_phone(&raw);
167 let phone_type = detect_phone_type(&normalized);
168
169 entities.phone_numbers.push(PhoneNumber {
170 raw,
171 normalized,
172 phone_type,
173 country_code: None,
174 });
175 }
176
177 for cap in URL_REGEX.find_iter(text) {
179 let url = cap.as_str().to_string();
180 let domain = extract_domain(&url);
181 let is_tracking = is_tracking_url(&url);
182 let url_type = detect_url_type(&url, &domain);
183
184 entities.urls.push(ExtractedUrl {
185 url,
186 domain,
187 is_tracking,
188 url_type,
189 });
190 }
191
192 for cap in AMOUNT_REGEX.find_iter(text) {
194 if let Some(amount) = parse_amount(cap.as_str()) {
195 entities.amounts.push(amount);
196 }
197 }
198
199 for cap in TWITTER_REGEX.captures_iter(text) {
201 if let Some(handle) = cap.get(1) {
202 entities.social_handles.push(SocialHandle {
203 platform: SocialPlatform::Twitter,
204 handle: handle.as_str().to_string(),
205 });
206 }
207 }
208
209 for cap in LINKEDIN_REGEX.captures_iter(text) {
210 if let Some(handle) = cap.get(1) {
211 entities.social_handles.push(SocialHandle {
212 platform: SocialPlatform::LinkedIn,
213 handle: handle.as_str().to_string(),
214 });
215 }
216 }
217
218 entities
219 }
220
221 #[must_use]
223 pub const fn is_empty(&self) -> bool {
224 self.emails.is_empty()
225 && self.phone_numbers.is_empty()
226 && self.urls.is_empty()
227 && self.amounts.is_empty()
228 }
229
230 #[must_use]
232 pub const fn total_count(&self) -> usize {
233 self.emails.len()
234 + self.phone_numbers.len()
235 + self.urls.len()
236 + self.amounts.len()
237 + self.social_handles.len()
238 }
239}
240
/// Strips everything from `phone` except ASCII digits and `+` signs,
/// preserving their order.
fn normalize_phone(phone: &str) -> String {
    let mut kept = String::with_capacity(phone.len());
    for ch in phone.chars() {
        if matches!(ch, '0'..='9' | '+') {
            kept.push(ch);
        }
    }
    kept
}
247
248fn detect_phone_type(normalized: &str) -> PhoneType {
249 let digits: String = normalized.chars().filter(char::is_ascii_digit).collect();
250
251 if digits.starts_with("1800") || digits.starts_with("1888") || digits.starts_with("1877") {
252 PhoneType::TollFree
253 } else {
254 PhoneType::Unknown
255 }
256}
257
/// Returns the host name of an `http://` or `https://` URL.
///
/// The scheme is removed once (if present), the authority is cut at the first
/// `/`, `?` or `#`, and any `user:password@` prefix and `:port` suffix are
/// dropped. Input without a recognised scheme is processed the same way from
/// its first character. Returns an empty string when no host remains.
fn extract_domain(url: &str) -> String {
    // `strip_prefix` removes at most one scheme, unlike `trim_start_matches`.
    let rest = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);

    // A query or fragment can follow the host directly, with no path slash.
    let authority = rest.split(['/', '?', '#']).next().unwrap_or("");

    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_userinfo, host)| host);

    let host = host_port
        .split_once(':')
        .map_or(host_port, |(host, _port)| host);

    host.to_string()
}
266
/// Heuristically decides whether `url` is a tracking link.
///
/// Returns `true` when the lowercased URL contains any of the marker
/// substrings below, anywhere in the string (host, path or query).
fn is_tracking_url(url: &str) -> bool {
    const MARKERS: [&str; 6] = ["track", "click", "redirect", "utm_", "mc_eid", "trk"];

    let haystack = url.to_lowercase();
    MARKERS.iter().any(|&marker| haystack.contains(marker))
}
276
277#[allow(clippy::case_sensitive_file_extension_comparisons)]
278fn detect_url_type(url: &str, domain: &str) -> UrlType {
279 let lower = url.to_lowercase();
280 let domain_lower = domain.to_lowercase();
281
282 if lower.contains("unsubscribe") || lower.contains("optout") {
283 UrlType::Unsubscribe
284 } else if is_tracking_url(url) {
285 UrlType::Tracking
286 } else if domain_lower.contains("linkedin")
287 || domain_lower.contains("twitter")
288 || domain_lower.contains("facebook")
289 || domain_lower.contains("instagram")
290 {
291 UrlType::SocialMedia
292 } else if lower.contains("calendar") || lower.contains(".ics") {
293 UrlType::Calendar
294 } else if lower.ends_with(".pdf")
295 || lower.ends_with(".doc")
296 || lower.ends_with(".docx")
297 || lower.ends_with(".xls")
298 {
299 UrlType::Document
300 } else {
301 UrlType::Website
302 }
303}
304
305fn parse_amount(raw: &str) -> Option<MonetaryAmount> {
306 let clean: String = raw
307 .chars()
308 .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',')
309 .collect();
310
311 let clean = clean.replace(',', "");
312
313 let value: f64 = clean.parse().ok()?;
314
315 let currency = if raw.contains('$') || raw.contains("USD") {
316 "USD"
317 } else if raw.contains('€') || raw.contains("EUR") {
318 "EUR"
319 } else if raw.contains('£') || raw.contains("GBP") {
320 "GBP"
321 } else {
322 "USD"
323 };
324
325 Some(MonetaryAmount {
326 raw: raw.to_string(),
327 value,
328 currency: currency.to_string(),
329 })
330}