1use crate::error::{ParseError, Result};
4use crate::extracted::ExtractedEntities;
5use crate::types::{
6 AuthResult, AuthenticationResults, Body, CategoryHint, Email, EmailAddress, EmailMetadata,
7 Headers, MessageId, Priority, Sentiment, SpamIndicator, Subject, ThreadInfo, Urgency,
8};
9use chrono::{DateTime, Utc};
10use tracing::debug;
11
12pub fn parse_email(uid: u32, raw: &[u8]) -> Result<Email> {
14 let parsed = mailparse::parse_mail(raw).map_err(|e| ParseError::Structure(e.to_string()))?;
15
16 let headers = parse_headers(&parsed.headers)?;
17 let message_id = extract_message_id(&parsed.headers, uid);
18 let from = extract_from(&parsed.headers)?;
19 let to = extract_addresses(&parsed.headers, "to");
20 let cc = extract_addresses(&parsed.headers, "cc");
21 let bcc = extract_addresses(&parsed.headers, "bcc");
22 let reply_to = extract_reply_to(&parsed.headers);
23 let subject = extract_subject(&parsed.headers);
24 let date = extract_date(&parsed.headers);
25 let thread = extract_thread_info(&parsed.headers, &subject);
26 let body = extract_body(&parsed);
27
28 let extracted = ExtractedEntities::extract(body.best_text());
30
31 let metadata = analyze_metadata(&from, &headers, &subject, &body, &extracted);
33
34 debug!("Parsed email: {} from {}", subject.original, from.address);
35
36 Ok(Email {
37 message_id,
38 uid,
39 from,
40 to,
41 cc,
42 bcc,
43 reply_to,
44 subject,
45 body,
46 date,
47 headers,
48 thread,
49 extracted,
50 metadata,
51 })
52}
53
54#[allow(clippy::unnecessary_wraps)]
55fn parse_headers(headers: &[mailparse::MailHeader]) -> Result<Headers> {
56 let all: Vec<(String, String)> = headers
57 .iter()
58 .map(|h| (h.get_key().to_lowercase(), h.get_value()))
59 .collect();
60
61 let content_type = headers
62 .iter()
63 .find(|h| h.get_key().to_lowercase() == "content-type")
64 .map(mailparse::MailHeader::get_value);
65
66 let mailer = headers
67 .iter()
68 .find(|h| {
69 let key = h.get_key().to_lowercase();
70 key == "x-mailer" || key == "user-agent"
71 })
72 .map(mailparse::MailHeader::get_value);
73
74 let priority = headers
75 .iter()
76 .find(|h| h.get_key().to_lowercase() == "x-priority")
77 .map(|h| Priority::from_header(&h.get_value()));
78
79 let list_unsubscribe = headers
80 .iter()
81 .find(|h| h.get_key().to_lowercase() == "list-unsubscribe")
82 .map(mailparse::MailHeader::get_value);
83
84 let authentication = parse_authentication_results(headers);
85
86 let custom: Vec<(String, String)> = headers
87 .iter()
88 .filter(|h| h.get_key().to_lowercase().starts_with("x-"))
89 .map(|h| (h.get_key(), h.get_value()))
90 .collect();
91
92 Ok(Headers {
93 all,
94 content_type,
95 mailer,
96 priority,
97 list_unsubscribe,
98 authentication,
99 custom,
100 })
101}
102
103fn parse_authentication_results(headers: &[mailparse::MailHeader]) -> AuthenticationResults {
104 let mut results = AuthenticationResults::default();
105
106 for header in headers {
107 if header.get_key().to_lowercase() == "authentication-results" {
108 let value = header.get_value().to_lowercase();
109
110 if value.contains("spf=pass") {
111 results.spf = Some(AuthResult::Pass);
112 } else if value.contains("spf=fail") {
113 results.spf = Some(AuthResult::Fail);
114 }
115
116 if value.contains("dkim=pass") {
117 results.dkim = Some(AuthResult::Pass);
118 } else if value.contains("dkim=fail") {
119 results.dkim = Some(AuthResult::Fail);
120 }
121
122 if value.contains("dmarc=pass") {
123 results.dmarc = Some(AuthResult::Pass);
124 } else if value.contains("dmarc=fail") {
125 results.dmarc = Some(AuthResult::Fail);
126 }
127 }
128 }
129
130 results
131}
132
133fn extract_message_id(headers: &[mailparse::MailHeader], uid: u32) -> MessageId {
134 headers
135 .iter()
136 .find(|h| h.get_key().to_lowercase() == "message-id")
137 .map_or_else(
138 || MessageId::synthetic(uid),
139 |h| MessageId::new(h.get_value()),
140 )
141}
142
143fn extract_from(headers: &[mailparse::MailHeader]) -> Result<EmailAddress> {
144 let from_header = headers
145 .iter()
146 .find(|h| h.get_key().to_lowercase() == "from")
147 .map(mailparse::MailHeader::get_value)
148 .ok_or_else(|| ParseError::MissingHeader("From".into()))?;
149
150 EmailAddress::parse(&from_header).ok_or_else(|| ParseError::InvalidHeader {
151 header: "From".into(),
152 details: format!("Could not parse: {from_header}"),
153 })
154}
155
156fn extract_addresses(headers: &[mailparse::MailHeader], header_name: &str) -> Vec<EmailAddress> {
157 headers
158 .iter()
159 .find(|h| h.get_key().to_lowercase() == header_name)
160 .map(|h| {
161 h.get_value()
162 .split(',')
163 .filter_map(|addr| EmailAddress::parse(addr.trim()))
164 .collect()
165 })
166 .unwrap_or_default()
167}
168
169fn extract_reply_to(headers: &[mailparse::MailHeader]) -> Option<EmailAddress> {
170 headers
171 .iter()
172 .find(|h| h.get_key().to_lowercase() == "reply-to")
173 .and_then(|h| EmailAddress::parse(&h.get_value()))
174}
175
176fn extract_subject(headers: &[mailparse::MailHeader]) -> Subject {
177 let subject_text = headers
178 .iter()
179 .find(|h| h.get_key().to_lowercase() == "subject")
180 .map_or_else(
181 || "(no subject)".to_string(),
182 mailparse::MailHeader::get_value,
183 );
184
185 Subject::parse(&subject_text)
186}
187
188fn extract_date(headers: &[mailparse::MailHeader]) -> DateTime<Utc> {
189 headers
190 .iter()
191 .find(|h| h.get_key().to_lowercase() == "date")
192 .and_then(|h| DateTime::parse_from_rfc2822(&h.get_value()).ok())
193 .map_or_else(Utc::now, |dt| dt.with_timezone(&Utc))
194}
195
196fn extract_thread_info(headers: &[mailparse::MailHeader], subject: &Subject) -> ThreadInfo {
197 let in_reply_to = headers
198 .iter()
199 .find(|h| h.get_key().to_lowercase() == "in-reply-to")
200 .map(|h| MessageId::new(h.get_value()));
201
202 let references: Vec<MessageId> = headers
203 .iter()
204 .find(|h| h.get_key().to_lowercase() == "references")
205 .map(|h| {
206 h.get_value()
207 .split_whitespace()
208 .map(|s| MessageId::new(s.to_string()))
209 .collect()
210 })
211 .unwrap_or_default();
212
213 let is_reply = in_reply_to.is_some() || subject.reply_depth > 0;
214 #[allow(clippy::cast_possible_truncation)]
215 let thread_position = if is_reply {
216 references.len() as u32 + 1
217 } else {
218 0
219 };
220
221 ThreadInfo {
222 in_reply_to,
223 references,
224 is_reply,
225 thread_position,
226 }
227}
228
229fn extract_body(parsed: &mailparse::ParsedMail) -> Body {
230 let (text, html) = extract_body_parts(parsed);
231
232 let text_from_html = if text.is_empty() {
234 html.as_ref().map(|h| strip_html(h))
235 } else {
236 None
237 };
238
239 let best_text = if !text.is_empty() {
240 &text
241 } else if let Some(ref html_text) = text_from_html {
242 html_text
243 } else {
244 ""
245 };
246
247 let (content_without_signature, signature) = separate_signature(best_text);
249
250 Body {
251 word_count: best_text.split_whitespace().count(),
252 char_count: best_text.len(),
253 line_count: best_text.lines().count(),
254 text,
255 html,
256 text_from_html,
257 language: None, has_attachments: false, signature,
260 content_without_signature,
261 }
262}
263
264fn extract_body_parts(parsed: &mailparse::ParsedMail) -> (String, Option<String>) {
265 let mut text = String::new();
266 let mut html: Option<String> = None;
267
268 if parsed.subparts.is_empty() {
269 let content_type = parsed.ctype.mimetype.to_lowercase();
270 if let Ok(body) = parsed.get_body() {
271 if content_type.contains("text/html") {
272 html = Some(body);
273 } else {
274 text = body;
275 }
276 }
277 } else {
278 extract_body_recursive(parsed, &mut text, &mut html);
279 }
280
281 (text, html)
282}
283
284fn extract_body_recursive(
285 parsed: &mailparse::ParsedMail,
286 text: &mut String,
287 html: &mut Option<String>,
288) {
289 for part in &parsed.subparts {
290 let content_type = part.ctype.mimetype.to_lowercase();
291
292 if part.subparts.is_empty() {
293 if let Ok(body) = part.get_body() {
294 if content_type.contains("text/plain") && text.is_empty() {
295 *text = body;
296 } else if content_type.contains("text/html") && html.is_none() {
297 *html = Some(body);
298 }
299 }
300 } else {
301 extract_body_recursive(part, text, html);
302 }
303 }
304}
305
/// Converts an HTML fragment to plain text.
///
/// Everything between `<` and `>` is dropped, as is the content of `<script>`
/// and `<style>` elements. A line break is emitted after `<br>` and after
/// closing tags whose name starts with `p`, `div`, `li` or `h`. The entities
/// `&nbsp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;` are
/// decoded. Finally every line is trimmed and blank lines are removed.
///
/// This is a lightweight heuristic, not an HTML parser: it does not handle
/// comments, CDATA, or `>` inside attribute values.
fn strip_html(html: &str) -> String {
    // Closing/void tags after which a line break is inserted.
    const LINE_BREAK_TAGS: [&str; 5] = ["br", "/p", "/div", "/li", "/h"];

    // True when `haystack` begins with exactly the characters of `prefix`.
    fn has_prefix(haystack: &[char], prefix: &str) -> bool {
        let mut rest = haystack.iter();
        prefix.chars().all(|expected| rest.next() == Some(&expected))
    }

    let chars: Vec<char> = html.chars().collect();
    // ASCII lowercasing maps one char to one char, so `lower` stays
    // index-aligned with `chars` (full Unicode lowercasing can change the
    // number of characters). Tag names are ASCII, so nothing is lost.
    let lower: Vec<char> = chars.iter().map(char::to_ascii_lowercase).collect();

    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut in_script = false;
    let mut in_style = false;
    let mut tag_start: usize = 0;

    for (i, &ch) in chars.iter().enumerate() {
        if !in_tag && ch == '<' {
            tag_start = i;
            let rest = &lower[i..];
            if has_prefix(rest, "<script") {
                in_script = true;
            } else if has_prefix(rest, "<style") {
                in_style = true;
            } else if has_prefix(rest, "</script") {
                in_script = false;
            } else if has_prefix(rest, "</style") {
                in_style = false;
            }
            in_tag = true;
        } else if in_tag && ch == '>' {
            in_tag = false;
            let tag = &lower[tag_start + 1..i];
            if LINE_BREAK_TAGS.iter().any(|name| has_prefix(tag, name)) {
                result.push('\n');
            }
        } else if !in_tag && !in_script && !in_style {
            result.push(ch);
        }
    }

    // `&amp;` is decoded last so that an escaped entity such as `&amp;lt;`
    // becomes the literal text `&lt;` instead of being decoded twice.
    let decoded = result
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&");

    decoded
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}
367
/// Splits `text` into `(content, signature)` using common signature markers.
///
/// Markers are tried in priority order:
///
/// 1. a line consisting solely of `--`, then `-- `, then `---`, terminated by
///    `\n` or `\r\n`;
/// 2. the first occurrence anywhere in the text of `Best regards`, then
///    `Kind regards`, then `Regards,` (case-sensitive).
///
/// Everything from the marker onwards becomes the signature; both halves are
/// trimmed. When no marker is found the text is returned unchanged with
/// `None`.
///
/// Dash markers must make up a whole line. A `--` that ends an ordinary line,
/// or that is merely the tail of `---`, is not treated as a separator.
fn separate_signature(text: &str) -> (String, Option<String>) {
    const DASH_MARKERS: [&str; 3] = ["--", "-- ", "---"];
    const CLOSING_PHRASES: [&str; 3] = ["Best regards", "Kind regards", "Regards,"];

    // Byte offset of the first newline-terminated line equal to `marker`.
    fn find_marker_line(text: &str, marker: &str) -> Option<usize> {
        let mut offset = 0;
        for line in text.split_inclusive('\n') {
            // An unterminated final line has no signature after it.
            if let Some(content) = line.strip_suffix('\n') {
                let content = content.strip_suffix('\r').unwrap_or(content);
                if content == marker {
                    return Some(offset);
                }
            }
            offset += line.len();
        }
        None
    }

    // Cuts `text` at `pos`. The signature half starts with a marker, so it is
    // never empty after trimming.
    fn split_at_marker(text: &str, pos: usize) -> (String, Option<String>) {
        let content = text[..pos].trim().to_string();
        let signature = text[pos..].trim().to_string();
        (content, Some(signature))
    }

    for marker in DASH_MARKERS {
        if let Some(pos) = find_marker_line(text, marker) {
            return split_at_marker(text, pos);
        }
    }

    for phrase in CLOSING_PHRASES {
        if let Some(pos) = text.find(phrase) {
            return split_at_marker(text, pos);
        }
    }

    (text.to_string(), None)
}
391
392fn analyze_metadata(
393 from: &EmailAddress,
394 headers: &Headers,
395 subject: &Subject,
396 body: &Body,
397 extracted: &ExtractedEntities,
398) -> EmailMetadata {
399 let mut spam_indicators = Vec::new();
400 let mut spam_score: f32 = 0.0;
401
402 if from.is_noreply() {
404 spam_indicators.push(SpamIndicator {
405 indicator: "noreply_sender".into(),
406 weight: 0.1,
407 });
408 spam_score += 0.1;
409 }
410
411 let tracking_count = extracted.urls.iter().filter(|u| u.is_tracking).count();
413 if tracking_count > 3 {
414 spam_indicators.push(SpamIndicator {
415 indicator: "excessive_tracking".into(),
416 weight: 0.2,
417 });
418 spam_score += 0.2;
419 }
420
421 let subject_lower = subject.original.to_lowercase();
423 if subject_lower.contains("urgent")
424 || subject_lower.contains("act now")
425 || subject_lower.contains("limited time")
426 {
427 spam_indicators.push(SpamIndicator {
428 indicator: "urgency_language".into(),
429 weight: 0.15,
430 });
431 spam_score += 0.15;
432 }
433
434 let urgency = if subject_lower.contains("urgent")
436 || subject_lower.contains("asap")
437 || subject_lower.contains("emergency")
438 || headers.priority == Some(Priority::High)
439 || headers.priority == Some(Priority::Highest)
440 {
441 Urgency::High
442 } else {
443 Urgency::Normal
444 };
445
446 let mut category_hints = Vec::new();
448
449 if headers.list_unsubscribe.is_some() {
450 category_hints.push(CategoryHint {
451 category: "newsletter".into(),
452 confidence: 0.9,
453 reason: "Has List-Unsubscribe header".into(),
454 });
455 }
456
457 if from.is_noreply() {
458 category_hints.push(CategoryHint {
459 category: "automated".into(),
460 confidence: 0.8,
461 reason: "From noreply address".into(),
462 });
463 }
464
465 if !extracted.phone_numbers.is_empty() && !extracted.companies.is_empty() {
466 category_hints.push(CategoryHint {
467 category: "lead".into(),
468 confidence: 0.6,
469 reason: "Contains contact information".into(),
470 });
471 }
472
473 let is_automated = from.is_noreply() || headers.mailer.is_some();
474 let is_mailing_list = headers.list_unsubscribe.is_some();
475
476 let text_lower = body.best_text().to_lowercase();
478 let sentiment = if text_lower.contains("thank")
479 || text_lower.contains("appreciate")
480 || text_lower.contains("great")
481 || text_lower.contains("excellent")
482 {
483 Sentiment::Positive
484 } else if text_lower.contains("complaint")
485 || text_lower.contains("frustrated")
486 || text_lower.contains("disappointed")
487 || text_lower.contains("problem")
488 {
489 Sentiment::Negative
490 } else {
491 Sentiment::Neutral
492 };
493
494 EmailMetadata {
495 spam_score: spam_score.min(1.0),
496 spam_indicators,
497 urgency,
498 category_hints,
499 is_automated,
500 is_mailing_list,
501 sentiment,
502 }
503}