Skip to main content

email_extract/
parser.rs

1//! Main email parser implementation
2
3use crate::error::{ParseError, Result};
4use crate::extracted::ExtractedEntities;
5use crate::types::{
6    AuthResult, AuthenticationResults, Body, CategoryHint, Email, EmailAddress, EmailMetadata,
7    Headers, MessageId, Priority, Sentiment, SpamIndicator, Subject, ThreadInfo, Urgency,
8};
9use chrono::{DateTime, Utc};
10use tracing::debug;
11
12/// Parse raw email bytes into a structured Email
13pub fn parse_email(uid: u32, raw: &[u8]) -> Result<Email> {
14    let parsed = mailparse::parse_mail(raw).map_err(|e| ParseError::Structure(e.to_string()))?;
15
16    let headers = parse_headers(&parsed.headers)?;
17    let message_id = extract_message_id(&parsed.headers, uid);
18    let from = extract_from(&parsed.headers)?;
19    let to = extract_addresses(&parsed.headers, "to");
20    let cc = extract_addresses(&parsed.headers, "cc");
21    let bcc = extract_addresses(&parsed.headers, "bcc");
22    let reply_to = extract_reply_to(&parsed.headers);
23    let subject = extract_subject(&parsed.headers);
24    let date = extract_date(&parsed.headers);
25    let thread = extract_thread_info(&parsed.headers, &subject);
26    let body = extract_body(&parsed);
27
28    // Extract entities from body
29    let extracted = ExtractedEntities::extract(body.best_text());
30
31    // Analyze email metadata
32    let metadata = analyze_metadata(&from, &headers, &subject, &body, &extracted);
33
34    debug!("Parsed email: {} from {}", subject.original, from.address);
35
36    Ok(Email {
37        message_id,
38        uid,
39        from,
40        to,
41        cc,
42        bcc,
43        reply_to,
44        subject,
45        body,
46        date,
47        headers,
48        thread,
49        extracted,
50        metadata,
51    })
52}
53
54#[allow(clippy::unnecessary_wraps)]
55fn parse_headers(headers: &[mailparse::MailHeader]) -> Result<Headers> {
56    let all: Vec<(String, String)> = headers
57        .iter()
58        .map(|h| (h.get_key().to_lowercase(), h.get_value()))
59        .collect();
60
61    let content_type = headers
62        .iter()
63        .find(|h| h.get_key().to_lowercase() == "content-type")
64        .map(mailparse::MailHeader::get_value);
65
66    let mailer = headers
67        .iter()
68        .find(|h| {
69            let key = h.get_key().to_lowercase();
70            key == "x-mailer" || key == "user-agent"
71        })
72        .map(mailparse::MailHeader::get_value);
73
74    let priority = headers
75        .iter()
76        .find(|h| h.get_key().to_lowercase() == "x-priority")
77        .map(|h| Priority::from_header(&h.get_value()));
78
79    let list_unsubscribe = headers
80        .iter()
81        .find(|h| h.get_key().to_lowercase() == "list-unsubscribe")
82        .map(mailparse::MailHeader::get_value);
83
84    let authentication = parse_authentication_results(headers);
85
86    let custom: Vec<(String, String)> = headers
87        .iter()
88        .filter(|h| h.get_key().to_lowercase().starts_with("x-"))
89        .map(|h| (h.get_key(), h.get_value()))
90        .collect();
91
92    Ok(Headers {
93        all,
94        content_type,
95        mailer,
96        priority,
97        list_unsubscribe,
98        authentication,
99        custom,
100    })
101}
102
103fn parse_authentication_results(headers: &[mailparse::MailHeader]) -> AuthenticationResults {
104    let mut results = AuthenticationResults::default();
105
106    for header in headers {
107        if header.get_key().to_lowercase() == "authentication-results" {
108            let value = header.get_value().to_lowercase();
109
110            if value.contains("spf=pass") {
111                results.spf = Some(AuthResult::Pass);
112            } else if value.contains("spf=fail") {
113                results.spf = Some(AuthResult::Fail);
114            }
115
116            if value.contains("dkim=pass") {
117                results.dkim = Some(AuthResult::Pass);
118            } else if value.contains("dkim=fail") {
119                results.dkim = Some(AuthResult::Fail);
120            }
121
122            if value.contains("dmarc=pass") {
123                results.dmarc = Some(AuthResult::Pass);
124            } else if value.contains("dmarc=fail") {
125                results.dmarc = Some(AuthResult::Fail);
126            }
127        }
128    }
129
130    results
131}
132
133fn extract_message_id(headers: &[mailparse::MailHeader], uid: u32) -> MessageId {
134    headers
135        .iter()
136        .find(|h| h.get_key().to_lowercase() == "message-id")
137        .map_or_else(
138            || MessageId::synthetic(uid),
139            |h| MessageId::new(h.get_value()),
140        )
141}
142
143fn extract_from(headers: &[mailparse::MailHeader]) -> Result<EmailAddress> {
144    let from_header = headers
145        .iter()
146        .find(|h| h.get_key().to_lowercase() == "from")
147        .map(mailparse::MailHeader::get_value)
148        .ok_or_else(|| ParseError::MissingHeader("From".into()))?;
149
150    EmailAddress::parse(&from_header).ok_or_else(|| ParseError::InvalidHeader {
151        header: "From".into(),
152        details: format!("Could not parse: {from_header}"),
153    })
154}
155
156fn extract_addresses(headers: &[mailparse::MailHeader], header_name: &str) -> Vec<EmailAddress> {
157    headers
158        .iter()
159        .find(|h| h.get_key().to_lowercase() == header_name)
160        .map(|h| {
161            h.get_value()
162                .split(',')
163                .filter_map(|addr| EmailAddress::parse(addr.trim()))
164                .collect()
165        })
166        .unwrap_or_default()
167}
168
169fn extract_reply_to(headers: &[mailparse::MailHeader]) -> Option<EmailAddress> {
170    headers
171        .iter()
172        .find(|h| h.get_key().to_lowercase() == "reply-to")
173        .and_then(|h| EmailAddress::parse(&h.get_value()))
174}
175
176fn extract_subject(headers: &[mailparse::MailHeader]) -> Subject {
177    let subject_text = headers
178        .iter()
179        .find(|h| h.get_key().to_lowercase() == "subject")
180        .map_or_else(
181            || "(no subject)".to_string(),
182            mailparse::MailHeader::get_value,
183        );
184
185    Subject::parse(&subject_text)
186}
187
188fn extract_date(headers: &[mailparse::MailHeader]) -> DateTime<Utc> {
189    headers
190        .iter()
191        .find(|h| h.get_key().to_lowercase() == "date")
192        .and_then(|h| DateTime::parse_from_rfc2822(&h.get_value()).ok())
193        .map_or_else(Utc::now, |dt| dt.with_timezone(&Utc))
194}
195
196fn extract_thread_info(headers: &[mailparse::MailHeader], subject: &Subject) -> ThreadInfo {
197    let in_reply_to = headers
198        .iter()
199        .find(|h| h.get_key().to_lowercase() == "in-reply-to")
200        .map(|h| MessageId::new(h.get_value()));
201
202    let references: Vec<MessageId> = headers
203        .iter()
204        .find(|h| h.get_key().to_lowercase() == "references")
205        .map(|h| {
206            h.get_value()
207                .split_whitespace()
208                .map(|s| MessageId::new(s.to_string()))
209                .collect()
210        })
211        .unwrap_or_default();
212
213    let is_reply = in_reply_to.is_some() || subject.reply_depth > 0;
214    #[allow(clippy::cast_possible_truncation)]
215    let thread_position = if is_reply {
216        references.len() as u32 + 1
217    } else {
218        0
219    };
220
221    ThreadInfo {
222        in_reply_to,
223        references,
224        is_reply,
225        thread_position,
226    }
227}
228
229fn extract_body(parsed: &mailparse::ParsedMail) -> Body {
230    let (text, html) = extract_body_parts(parsed);
231
232    // Extract text from HTML if no plain text
233    let text_from_html = if text.is_empty() {
234        html.as_ref().map(|h| strip_html(h))
235    } else {
236        None
237    };
238
239    let best_text = if !text.is_empty() {
240        &text
241    } else if let Some(ref html_text) = text_from_html {
242        html_text
243    } else {
244        ""
245    };
246
247    // Separate signature from content
248    let (content_without_signature, signature) = separate_signature(best_text);
249
250    Body {
251        word_count: best_text.split_whitespace().count(),
252        char_count: best_text.len(),
253        line_count: best_text.lines().count(),
254        text,
255        html,
256        text_from_html,
257        language: None,         // Could add language detection
258        has_attachments: false, // Would need to check multipart structure
259        signature,
260        content_without_signature,
261    }
262}
263
264fn extract_body_parts(parsed: &mailparse::ParsedMail) -> (String, Option<String>) {
265    let mut text = String::new();
266    let mut html: Option<String> = None;
267
268    if parsed.subparts.is_empty() {
269        let content_type = parsed.ctype.mimetype.to_lowercase();
270        if let Ok(body) = parsed.get_body() {
271            if content_type.contains("text/html") {
272                html = Some(body);
273            } else {
274                text = body;
275            }
276        }
277    } else {
278        extract_body_recursive(parsed, &mut text, &mut html);
279    }
280
281    (text, html)
282}
283
284fn extract_body_recursive(
285    parsed: &mailparse::ParsedMail,
286    text: &mut String,
287    html: &mut Option<String>,
288) {
289    for part in &parsed.subparts {
290        let content_type = part.ctype.mimetype.to_lowercase();
291
292        if part.subparts.is_empty() {
293            if let Ok(body) = part.get_body() {
294                if content_type.contains("text/plain") && text.is_empty() {
295                    *text = body;
296                } else if content_type.contains("text/html") && html.is_none() {
297                    *html = Some(body);
298                }
299            }
300        } else {
301            extract_body_recursive(part, text, html);
302        }
303    }
304}
305
/// Converts an HTML fragment to plain text.
///
/// Tags are removed, the contents of `<script>` and `<style>` elements are
/// dropped, a line break is emitted after `<br>` and after closing
/// paragraph/div/list-item/heading tags, a handful of common entities are
/// decoded, and blank lines and surrounding whitespace are removed.
///
/// Everything after an unterminated tag, or after an unterminated
/// `<script>`/`<style>` element, is discarded.
fn strip_html(html: &str) -> String {
    // Tag prefixes (lowercase, without the leading `<`) that end a line.
    const LINE_BREAKING_TAGS: [&str; 5] = ["br", "/p", "/div", "/li", "/h"];

    // ASCII-only lowercasing keeps byte offsets identical to `html`, so
    // positions found in one string are valid in the other.
    let lower = html.to_ascii_lowercase();
    let mut result = String::with_capacity(html.len());
    let mut pos = 0;

    loop {
        let tag_start = match html[pos..].find('<') {
            Some(offset) => pos + offset,
            None => {
                result.push_str(&html[pos..]);
                break;
            }
        };
        result.push_str(&html[pos..tag_start]);

        let tag_end = match html[tag_start..].find('>') {
            Some(offset) => tag_start + offset,
            None => break,
        };
        let tag_content = &lower[tag_start + 1..tag_end];
        pos = tag_end + 1;

        if let Some(marker) = raw_text_closing_marker(tag_content) {
            // Script/style bodies may contain `<` and `>`; jump straight to the
            // closing tag instead of interpreting them as markup.
            match lower[pos..].find(marker) {
                Some(offset) => pos += offset,
                None => break,
            }
        } else if LINE_BREAKING_TAGS
            .iter()
            .any(|prefix| tag_content.starts_with(*prefix))
        {
            result.push('\n');
        }
    }

    // Decode HTML entities. `&amp;` goes last so that an escaped entity such as
    // `&amp;lt;` decodes to the literal text `&lt;` rather than to `<`.
    let decoded = result
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&");

    // Clean up whitespace
    decoded
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}

/// Returns the closing-tag marker to search for when `tag_content` (lowercase,
/// without the leading `<`) opens a `<script>` or `<style>` element.
fn raw_text_closing_marker(tag_content: &str) -> Option<&'static str> {
    const RAW_TEXT_ELEMENTS: [(&str, &str); 2] = [("script", "</script"), ("style", "</style")];

    for (name, marker) in RAW_TEXT_ELEMENTS.iter() {
        if let Some(rest) = tag_content.strip_prefix(*name) {
            // Require a tag-name boundary so `<scripture>` is not mistaken for `<script>`.
            let at_boundary = rest
                .chars()
                .next()
                .map_or(true, |c| !(c.is_ascii_alphanumeric() || c == '-'));
            if at_boundary {
                return Some(*marker);
            }
        }
    }
    None
}
367
/// Splits `text` into `(content, signature)` at the first signature line.
///
/// A signature line is either
/// * a line consisting solely of `--` or `---` (trailing whitespace, including
///   a carriage return, is ignored, so the conventional `-- ` separator and
///   CRLF line endings both match), or
/// * a line that starts, after optional leading whitespace, with
///   `Best regards`, `Kind regards` or `Regards,`.
///
/// The earliest such line in the text wins. Both halves are trimmed. When no
/// signature line is found, the whole text is returned unchanged with `None`.
fn separate_signature(text: &str) -> (String, Option<String>) {
    const CLOSING_PHRASES: [&str; 3] = ["Best regards", "Kind regards", "Regards,"];

    // Byte offset of the start of the line currently being examined.
    let mut offset = 0;

    for raw_line in text.split_inclusive('\n') {
        let line = raw_line.trim_end();

        let is_dash_separator = line == "--" || line == "---";
        let is_closing_phrase = CLOSING_PHRASES
            .iter()
            .any(|phrase| line.trim_start().starts_with(*phrase));

        if is_dash_separator || is_closing_phrase {
            let content = text[..offset].trim().to_string();
            let signature = text[offset..].trim().to_string();
            return (content, Some(signature));
        }

        offset += raw_line.len();
    }

    (text.to_string(), None)
}
391
392fn analyze_metadata(
393    from: &EmailAddress,
394    headers: &Headers,
395    subject: &Subject,
396    body: &Body,
397    extracted: &ExtractedEntities,
398) -> EmailMetadata {
399    let mut spam_indicators = Vec::new();
400    let mut spam_score: f32 = 0.0;
401
402    // Check spam indicators
403    if from.is_noreply() {
404        spam_indicators.push(SpamIndicator {
405            indicator: "noreply_sender".into(),
406            weight: 0.1,
407        });
408        spam_score += 0.1;
409    }
410
411    // Check for tracking URLs
412    let tracking_count = extracted.urls.iter().filter(|u| u.is_tracking).count();
413    if tracking_count > 3 {
414        spam_indicators.push(SpamIndicator {
415            indicator: "excessive_tracking".into(),
416            weight: 0.2,
417        });
418        spam_score += 0.2;
419    }
420
421    // Check subject for spam patterns
422    let subject_lower = subject.original.to_lowercase();
423    if subject_lower.contains("urgent")
424        || subject_lower.contains("act now")
425        || subject_lower.contains("limited time")
426    {
427        spam_indicators.push(SpamIndicator {
428            indicator: "urgency_language".into(),
429            weight: 0.15,
430        });
431        spam_score += 0.15;
432    }
433
434    // Determine urgency
435    let urgency = if subject_lower.contains("urgent")
436        || subject_lower.contains("asap")
437        || subject_lower.contains("emergency")
438        || headers.priority == Some(Priority::High)
439        || headers.priority == Some(Priority::Highest)
440    {
441        Urgency::High
442    } else {
443        Urgency::Normal
444    };
445
446    // Category hints
447    let mut category_hints = Vec::new();
448
449    if headers.list_unsubscribe.is_some() {
450        category_hints.push(CategoryHint {
451            category: "newsletter".into(),
452            confidence: 0.9,
453            reason: "Has List-Unsubscribe header".into(),
454        });
455    }
456
457    if from.is_noreply() {
458        category_hints.push(CategoryHint {
459            category: "automated".into(),
460            confidence: 0.8,
461            reason: "From noreply address".into(),
462        });
463    }
464
465    if !extracted.phone_numbers.is_empty() && !extracted.companies.is_empty() {
466        category_hints.push(CategoryHint {
467            category: "lead".into(),
468            confidence: 0.6,
469            reason: "Contains contact information".into(),
470        });
471    }
472
473    let is_automated = from.is_noreply() || headers.mailer.is_some();
474    let is_mailing_list = headers.list_unsubscribe.is_some();
475
476    // Simple sentiment detection
477    let text_lower = body.best_text().to_lowercase();
478    let sentiment = if text_lower.contains("thank")
479        || text_lower.contains("appreciate")
480        || text_lower.contains("great")
481        || text_lower.contains("excellent")
482    {
483        Sentiment::Positive
484    } else if text_lower.contains("complaint")
485        || text_lower.contains("frustrated")
486        || text_lower.contains("disappointed")
487        || text_lower.contains("problem")
488    {
489        Sentiment::Negative
490    } else {
491        Sentiment::Neutral
492    };
493
494    EmailMetadata {
495        spam_score: spam_score.min(1.0),
496        spam_indicators,
497        urgency,
498        category_hints,
499        is_automated,
500        is_mailing_list,
501        sentiment,
502    }
503}