use std::collections::HashMap;
use anyhow::{anyhow, Result};
use log::{debug, warn};
use mailparse::{addrparse, MailAddr, MailHeaderMap, ParsedMail};
/// Result of [`EmailParser::parse`], as a positional tuple:
///
/// 0. subject (empty when the message has no `Subject` header),
/// 1. display name taken from the `From` header, if any,
/// 2. final text body,
/// 3. HTML body, if the message has a `text/html` part,
/// 4. headers whose names matched one of the requested prefixes.
pub type ParsedEmail = (
    String,
    Option<String>,
    String,
    Option<String>,
    HashMap<String, String>,
);

/// Stateless namespace for the email parsing entry point.
pub struct EmailParser;
impl EmailParser {
pub fn parse(raw_data: &[u8], header_prefixes: &[String]) -> Result<ParsedEmail> {
let mail = mailparse::parse_mail(raw_data).map_err(|e| anyhow!("Mail parsing failed: {}", e))?;
let subject = mail.headers.get_first_value("Subject")
.unwrap_or_else(|| {
debug!("Subject header not found");
String::new() });
debug!("Extracted subject: {}", subject);
let from_name = mail.headers.get_first_value("From") .and_then(|reply_to_str| {
match addrparse(&reply_to_str) {
Ok(addrs) => {
addrs.first().and_then(|mail_addr| {
match mail_addr {
MailAddr::Single(spec) => spec.display_name.clone(),
MailAddr::Group(group) => Some(group.group_name.clone()),
}
})
},
Err(e) => {
warn!("Failed to parse From header '{}': {}", reply_to_str, e); None }
}
});
debug!("Extracted From name: {:?}", from_name);
let matched_headers = if header_prefixes.is_empty() {
HashMap::new()
} else {
let lowercase_prefixes: Vec<String> = header_prefixes
.iter()
.map(|p| p.to_lowercase())
.collect();
let mut headers_map = HashMap::new();
for header in &mail.headers {
let key_lower = header.get_key().to_lowercase();
if lowercase_prefixes.iter().any(|prefix| key_lower.starts_with(prefix)) {
let key = header.get_key();
let value = header.get_value();
debug!("Matched header: {} = {}", key, value);
headers_map.insert(key, value);
}
}
headers_map
};
debug!("Matched {} headers against prefixes", matched_headers.len());
let mut text_body: Option<String> = None;
let mut html_body: Option<String> = None;
process_mail_part(&mail, &mut text_body, &mut html_body)?;
let final_text_body = if let Some(ref html) = html_body {
debug!("HTML part found, generating final text body from HTML using html2text.");
match html2text::from_read(html.as_bytes(), 80) {
Ok(converted_text) => converted_text,
Err(e) => {
warn!("Failed to convert HTML body to text using html2text, falling back to plain text part if available: {}", e);
text_body.unwrap_or_else(|| {
warn!("HTML conversion failed and no plain text part found, using empty string.");
String::new()
})
}
}
} else if let Some(text) = text_body {
debug!("Using found text/plain part for final text body (no HTML part found).");
text
} else {
debug!("No text/plain or text/html body part found.");
String::new() };
Ok((subject, from_name, final_text_body, html_body, matched_headers))
}
}
/// Recursively walks the MIME tree rooted at `part` and records the first
/// `text/plain` and the first `text/html` leaf bodies it encounters.
///
/// * `part` - the message or MIME part to inspect.
/// * `text_body` - filled with the decoded `text/plain` body if still `None`.
/// * `html_body` - filled with the decoded `text/html` body if still `None`.
///
/// Leaf parts whose `Content-Disposition` is `attachment` are skipped, so an
/// attached `.txt` or `.html` file is never mistaken for the message body.
/// Traversal of a container stops as soon as both bodies have been found.
///
/// # Errors
///
/// Returns an error when the body of a selected part cannot be decoded.
fn process_mail_part(
    part: &ParsedMail,
    text_body: &mut Option<String>,
    html_body: &mut Option<String>,
) -> Result<()> {
    if !part.subparts.is_empty() {
        debug!(
            "Processing multipart container ({}) with {} subparts.",
            part.ctype.mimetype,
            part.subparts.len()
        );
        for subpart in &part.subparts {
            if text_body.is_some() && html_body.is_some() {
                debug!("Found both text and html parts, stopping search in this branch.");
                break;
            }
            process_mail_part(subpart, text_body, html_body)?;
        }
        return Ok(());
    }

    let content_type_str = part.ctype.mimetype.as_str();
    debug!("Processing leaf part with Content-Type: {}", content_type_str);

    // Attachments can carry text/plain or text/html too; they are files, not
    // the message body, so they must not be captured here.
    if is_attachment_part(part) {
        debug!("Skipping attachment part with Content-Type: {}", content_type_str);
        return Ok(());
    }

    match content_type_str {
        "text/plain" if text_body.is_none() => {
            let body_str = part
                .get_body()
                .map_err(|e| anyhow!("Failed to get/decode plain text body: {}", e))?;
            debug!("Found and decoded text/plain part.");
            *text_body = Some(body_str);
        }
        "text/html" if html_body.is_none() => {
            let body_str = part
                .get_body()
                .map_err(|e| anyhow!("Failed to get/decode HTML body: {}", e))?;
            debug!("Found and decoded text/html part.");
            *html_body = Some(body_str);
        }
        _ => {
            debug!("Ignoring part with Content-Type: {}", content_type_str);
        }
    }
    Ok(())
}

/// Reports whether `part` declares `Content-Disposition: attachment`.
///
/// Only the disposition type (the token before the first `;`) is examined,
/// compared case-insensitively. A part without the header is not an attachment.
fn is_attachment_part(part: &ParsedMail) -> bool {
    part.headers
        .get_first_value("Content-Disposition")
        .map_or(false, |value| {
            value
                .split(';')
                .next()
                .unwrap_or("")
                .trim()
                .eq_ignore_ascii_case("attachment")
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain message without MIME headers yields the body as text and no HTML.
    #[test]
    fn test_parse_simple_email() {
        let email = "From: sender@example.com\r\n\
            To: recipient@example.com\r\n\
            Subject: Test Email\r\n\
            \r\n\
            This is a test email.\r\n\
            It has multiple lines.\r\n";
        let (subject, from_name, text_body, html_body, _) =
            EmailParser::parse(email.as_bytes(), &[]).expect("Parsing failed for simple email");
        assert!(from_name.is_none(), "From name should be None for simple email with only address");
        assert_eq!(subject, "Test Email");
        assert_eq!(text_body.trim(), "This is a test email.\r\nIt has multiple lines.".trim());
        assert!(html_body.is_none(), "HTML body should be None for plain text email");
    }

    /// A single-part `text/html` message: the raw HTML is returned and the text
    /// body is derived from it.
    #[test]
    fn test_parse_email_with_html_content_type() {
        let email = "From: sender@example.com\r\n\
            To: recipient@example.com\r\n\
            Subject: HTML Email\r\n\
            Content-Type: text/html; charset=utf-8\r\n\
            \r\n\
            Plain text part that might be ignored by html2text if not in tags.\r\n\
            <html><body>\r\n\
            <p>HTML content that should be ignored.</p>\r\n\
            </body></html>\r\n\
            Another plain line.\r\n";
        let (subject, from_name, text_body, html_body, _) =
            EmailParser::parse(email.as_bytes(), &[]).expect("Parsing failed for HTML email");
        assert!(from_name.is_none(), "From name should be None for HTML email with only address");
        assert_eq!(subject, "HTML Email");
        let expected_text_fragment_1 = "Plain text part that might be ignored by html2text if not in tags.";
        let expected_text_fragment_2 = "HTML content that should be ignored.";
        let expected_text_fragment_3 = "Another plain line.";
        assert!(text_body.contains(expected_text_fragment_1), "Text body missing first plain part. Got: {}", text_body);
        assert!(text_body.contains(expected_text_fragment_2), "Text body missing HTML content part. Got: {}", text_body);
        assert!(text_body.contains(expected_text_fragment_3), "Text body missing second plain part. Got: {}", text_body);
        assert!(html_body.is_some(), "HTML body should be Some for HTML email");
        let html_content = html_body.unwrap();
        assert!(html_content.contains("<html>"), "HTML body missing <html> tag");
        assert!(html_content.contains("<p>HTML content that should be ignored.</p>"), "HTML body missing <p> tag content");
        assert!(html_content.contains("</html>"), "HTML body missing </html> tag");
        assert!(html_content.contains("Plain text part that might be ignored"), "HTML body missing plain text part");
    }

    /// Without a `Content-Type` header the part is treated as plain text, so
    /// HTML markup in the body is passed through untouched.
    #[test]
    fn test_parse_html_with_links_and_formatting_no_content_type() {
        let email = "Subject: Complex HTML Heuristic\r\n\r\n<html><body><h1>Title</h1><p>This is <strong>bold</strong> text and a <a href=\"http://example.com\">link</a>.</p><div>Another section</div></body></html>";
        let (subject, from_name, text_body, html_body, _) =
            EmailParser::parse(email.as_bytes(), &[]).expect("Parsing failed for complex HTML heuristic");
        assert!(from_name.is_none(), "From name should be None when From header is missing");
        assert_eq!(subject, "Complex HTML Heuristic");
        assert!(text_body.contains("<h1>Title</h1>"), "Text body missing raw h1 tag. Got: {}", text_body);
        assert!(text_body.contains("<strong>bold</strong>"), "Text body missing raw strong tag. Got: {}", text_body);
        assert!(text_body.contains("<a href=\"http://example.com\">link</a>"), "Text body missing raw a tag. Got: {}", text_body);
        assert!(text_body.contains("<div>Another section</div>"), "Text body missing raw div tag. Got: {}", text_body);
        assert!(html_body.is_none(), "HTML body should be None when Content-Type is missing and mailparse defaults to text/plain");
    }

    /// A missing `Subject` header produces an empty subject, not an error.
    #[test]
    fn test_parse_no_subject() {
        let email = "From: sender@example.com\r\n\
            To: recipient@example.com\r\n\
            \r\n\
            Body only.\r\n";
        let (subject, from_name, text_body, html_body, _) =
            EmailParser::parse(email.as_bytes(), &[]).expect("Parsing failed for no-subject email");
        assert!(from_name.is_none(), "From name should be None for no-subject email with only address");
        assert!(subject.is_empty(), "Subject should be empty when not present");
        assert_eq!(text_body.trim(), "Body only.".trim());
        assert!(html_body.is_none(), "HTML body should be None for plain text email");
    }

    /// Headers followed by nothing yield an empty text body.
    #[test]
    fn test_parse_empty_body() {
        let email = "From: sender@example.com\r\n\
            Subject: Empty Body Test\r\n\
            \r\n";
        let (subject, from_name, text_body, html_body, _) =
            EmailParser::parse(email.as_bytes(), &[]).expect("Parsing failed for empty-body email");
        assert!(from_name.is_none(), "From name should be None for empty-body email with only address");
        assert_eq!(subject, "Empty Body Test");
        assert!(text_body.is_empty(), "Text body should be empty");
        assert!(html_body.is_none(), "HTML body should be None for empty body email");
    }

    /// Covers the `From` variants: name plus address, bare address in angle
    /// brackets, bare address, and no `From` header at all.
    #[test]
    fn test_parse_from_name() {
        let email_with_name = "From: Kangaroo Roo <roo@example.com>\r\n\
            Subject: Test With Name\r\n\
            \r\n\
            Body.";
        let (subject1, name1, body1, html1, _) =
            EmailParser::parse(email_with_name.as_bytes(), &[]).expect("Parsing failed for From with name");
        assert_eq!(subject1, "Test With Name");
        assert_eq!(name1.as_deref(), Some("Kangaroo Roo"), "From name mismatch");
        assert_eq!(body1.trim(), "Body.");
        assert!(html1.is_none());

        let email_only_addr_angle = "From: <just_email@example.com>\r\n\
            Subject: Test Email Only Angle\r\n\
            \r\n\
            Body.";
        let (subject2, name2, body2, html2, _) =
            EmailParser::parse(email_only_addr_angle.as_bytes(), &[]).expect("Parsing failed for From email only angle");
        assert_eq!(subject2, "Test Email Only Angle");
        assert!(name2.is_none(), "Name should be None when From only has email (angle)");
        assert_eq!(body2.trim(), "Body.");
        assert!(html2.is_none());

        let email_only_addr_plain = "From: plain_email@example.com\r\n\
            Subject: Test Email Only Plain\r\n\
            \r\n\
            Body.";
        let (subject3, name3, body3, html3, _) =
            EmailParser::parse(email_only_addr_plain.as_bytes(), &[]).expect("Parsing failed for From email only plain");
        assert_eq!(subject3, "Test Email Only Plain");
        assert!(name3.is_none(), "Name should be None when From only has email (plain)");
        assert_eq!(body3.trim(), "Body.");
        assert!(html3.is_none());

        let email_no_from = "Subject: Test No From\r\n\
            \r\n\
            Body.";
        let (subject4, name4, body4, html4, _) =
            EmailParser::parse(email_no_from.as_bytes(), &[]).expect("Parsing failed for no From");
        assert_eq!(subject4, "Test No From");
        assert!(name4.is_none(), "Name should be None when From header is missing");
        assert_eq!(body4.trim(), "Body.");
        assert!(html4.is_none());
    }

    /// A `multipart/alternative` message with both a plain and an HTML part:
    /// the HTML part is returned and the text body is rendered from it.
    #[test]
    fn test_parse_multipart_alternative() {
        // The blank lines in this fixture are structural: an empty line must
        // separate the headers from the body, both at the top level and inside
        // every MIME part, otherwise the parts are read as header lines.
        let email_data = r#"MIME-Version: 1.0
Date: Sun, 6 Apr 2025 02:37:39 -0500
Message-ID: <CALGz_fUk-EJ9wi-VSkZMuAgcHa9bK+kFKnsKdSLrxX62LU1inA@mail.gmail.com>
Subject: hopefully no html
From: Roland Rodriguez <rolandrodriguez@gmail.com>
Reply-To: "Another Name" <another@example.com>
To: design@my.stickerai.shop
Content-Type: multipart/alternative; boundary="0000000000005e994006321734d8"

--0000000000005e994006321734d8
Content-Type: text/plain; charset="UTF-8"

trying to make sure all email is stripped from this message. Thanks!
*Yours truly,*
*ME*
*https://govcraft.ai <https://govcraft.ai>*
--0000000000005e994006321734d8
Content-Type: text/html; charset="UTF-8"

<div dir="ltr">trying to make sure all email is stripped from this message. Thanks!<br><br><b>Yours truly,</b><div><i>ME</i></div><div><i><a href="https://govcraft.ai">https://govcraft.ai</a></i></div><div><i><br></i></div></div>
--0000000000005e994006321734d8--
"#;
        let (subject, from_name, text_body, html_body_opt, _) =
            EmailParser::parse(email_data.as_bytes(), &[]).expect("Parsing multipart failed");
        assert_eq!(subject, "hopefully no html");
        assert_eq!(from_name.as_deref(), Some("Roland Rodriguez"), "From name mismatch in multipart test");
        // NOTE(review): this expectation assumes html2text renders <b>/<i>
        // without emphasis markers and links in reference style; confirm it
        // against the html2text version the crate is pinned to.
        let expected_markdown = "trying to make sure all email is stripped from this message. Thanks!\n\nYours truly,\nME\n[https://govcraft.ai][1]\n\n\n[1]: https://govcraft.ai";
        assert_eq!(text_body.trim(), expected_markdown.trim());
        assert!(html_body_opt.is_some(), "HTML body should be present");
        let html_body = html_body_opt.unwrap();
        let expected_html_fragment = "<div dir=\"ltr\">trying to make sure all email is stripped from this message.";
        assert!(html_body.contains(expected_html_fragment), "HTML body missing expected content. Got: {}", html_body);
        assert!(html_body.contains("<b>Yours truly,</b>"), "HTML body missing bold tag");
        assert!(html_body.contains("<a href=\"https://govcraft.ai\">"), "HTML body missing link tag");
    }

    /// Only headers starting with a requested prefix are returned.
    #[test]
    fn test_parse_headers_with_prefixes() {
        let email = "From: sender@example.com\r\n\
            Subject: Header Test\r\n\
            X-Custom-Foo: value1\r\n\
            X-Custom-Bar: value2\r\n\
            X-Other: should-not-match\r\n\
            \r\n\
            Body.\r\n";
        let prefixes = vec!["X-Custom".to_string()];
        let (_, _, _, _, matched) = EmailParser::parse(email.as_bytes(), &prefixes)
            .expect("Parsing failed for header prefix test");
        assert_eq!(matched.len(), 2, "Expected 2 matched headers, got {}", matched.len());
        assert_eq!(matched.get("X-Custom-Foo").map(String::as_str), Some("value1"));
        assert_eq!(matched.get("X-Custom-Bar").map(String::as_str), Some("value2"));
        assert!(!matched.contains_key("X-Other"), "X-Other should not be matched");
    }

    /// Prefix matching ignores the case of both the prefix and the header name.
    #[test]
    fn test_parse_headers_case_insensitive() {
        let email = "From: sender@example.com\r\n\
            Subject: Case Test\r\n\
            X-MY-HEADER: upper-value\r\n\
            x-my-other: lower-value\r\n\
            \r\n\
            Body.\r\n";
        let prefixes = vec!["x-my".to_string()];
        let (_, _, _, _, matched) = EmailParser::parse(email.as_bytes(), &prefixes)
            .expect("Parsing failed for case-insensitive header test");
        assert_eq!(matched.len(), 2, "Expected 2 matched headers (case-insensitive), got {}", matched.len());
        assert!(matched.values().any(|v| v == "upper-value"), "Missing upper-value header");
        assert!(matched.values().any(|v| v == "lower-value"), "Missing lower-value header");
    }

    /// A prefix that matches no header yields an empty map.
    #[test]
    fn test_parse_headers_no_match() {
        let email = "From: sender@example.com\r\n\
            Subject: No Match Test\r\n\
            X-Something: value\r\n\
            \r\n\
            Body.\r\n";
        let prefixes = vec!["X-Nonexistent".to_string()];
        let (_, _, _, _, matched) = EmailParser::parse(email.as_bytes(), &prefixes)
            .expect("Parsing failed for no-match header test");
        assert!(matched.is_empty(), "Expected no matched headers, got {}", matched.len());
    }

    /// An empty prefix list never returns any header.
    #[test]
    fn test_parse_headers_empty_prefixes() {
        let email = "From: sender@example.com\r\n\
            Subject: Empty Prefixes Test\r\n\
            X-Custom: value\r\n\
            \r\n\
            Body.\r\n";
        let (_, _, _, _, matched) = EmailParser::parse(email.as_bytes(), &[])
            .expect("Parsing failed for empty prefixes test");
        assert!(matched.is_empty(), "Expected no matched headers when prefixes are empty");
    }

    /// Arbitrary bytes must not panic; either outcome is acceptable.
    #[test]
    fn test_parse_malformed_email_data() {
        let garbage: &[u8] = &[0xFF, 0xFE, 0x00, 0x01, 0x80, 0x90];
        let result = EmailParser::parse(garbage, &[]);
        match result {
            Ok((subject, _, _, _, _)) => {
                assert!(subject.is_empty(), "Subject should be empty for garbage data");
            }
            Err(_) => {
                // Rejecting the input is fine; the test only guards against panics.
            }
        }
    }

    /// Empty input must not panic; if it parses, every field is empty.
    #[test]
    fn test_parse_empty_input() {
        let empty: &[u8] = b"";
        let result = EmailParser::parse(empty, &[]);
        match result {
            Ok((subject, from_name, text_body, html_body, headers)) => {
                assert!(subject.is_empty());
                assert!(from_name.is_none());
                assert!(text_body.is_empty());
                assert!(html_body.is_none());
                assert!(headers.is_empty());
            }
            Err(_) => {
                // Rejecting the input is fine; the test only guards against panics.
            }
        }
    }
}