use mail_parser::{Address, MessageParser};
use scraper::Html;
use super::super::LoaderError;
/// Renders a raw email message as plain text.
///
/// The output consists of up to two sections separated by a blank line:
///
/// 1. A header section with one line each for `From`, `To`, `Subject` and
///    `Date`, in that order. A header is omitted when it is absent from the
///    message or (for the first three) formats to an empty string.
/// 2. The message body: the first text body part, trimmed; if that is
///    missing or blank, the first HTML body part reduced to text by
///    [`strip_html`].
///
/// Either section is left out entirely when it has no content, so a message
/// with neither headers nor body produces an empty string.
///
/// # Errors
///
/// Returns [`LoaderError::ExtractionFailed`] when the parser cannot produce a
/// message from `bytes`.
pub(crate) fn extract(bytes: &[u8]) -> Result<String, LoaderError> {
    let Some(message) = MessageParser::default().parse(bytes) else {
        return Err(LoaderError::ExtractionFailed(
            "Failed to parse email message".to_string(),
        ));
    };

    // Each header becomes `Some(line)` only when it has something to show.
    let sender = message
        .from()
        .map(|addr| format_address(addr))
        .filter(|rendered| !rendered.is_empty())
        .map(|rendered| format!("From: {rendered}"));
    let recipients = message
        .to()
        .map(|addr| format_address(addr))
        .filter(|rendered| !rendered.is_empty())
        .map(|rendered| format!("To: {rendered}"));
    let subject = message
        .subject()
        .filter(|subject| !subject.is_empty())
        .map(|subject| format!("Subject: {subject}"));
    let date = message.date().map(|date| format!("Date: {date}"));

    let header_lines: Vec<String> = [sender, recipients, subject, date]
        .into_iter()
        .flatten()
        .collect();

    let mut sections: Vec<String> = Vec::new();
    if !header_lines.is_empty() {
        sections.push(header_lines.join("\n"));
    }

    // Prefer the text body; fall back to the HTML body only when the text
    // body is missing or contains nothing but whitespace.
    let plain = message
        .body_text(0)
        .map(|text| text.trim().to_string())
        .filter(|text| !text.is_empty());
    let body = plain.or_else(|| {
        message
            .body_html(0)
            .map(|html| strip_html(&html))
            .filter(|text| !text.is_empty())
    });
    sections.extend(body);

    Ok(sections.join("\n\n"))
}
/// Formats an address header value as a single display string.
///
/// * A plain list is rendered as its entries joined by `", "`.
/// * A set of groups is rendered group by group, joined by `"; "`. Each
///   group is its members joined by `", "`, prefixed with `"<name>: "` when
///   the group has a name.
///
/// Individual entries are rendered by [`format_single_addr`].
fn format_address(addr: &Address<'_>) -> String {
    match addr {
        Address::List(entries) => {
            let rendered: Vec<String> = entries
                .iter()
                .map(|entry| format_single_addr(entry))
                .collect();
            rendered.join(", ")
        }
        Address::Group(groups) => {
            let mut rendered: Vec<String> = Vec::with_capacity(groups.len());
            for group in groups {
                let members = group
                    .addresses
                    .iter()
                    .map(|member| format_single_addr(member))
                    .collect::<Vec<String>>()
                    .join(", ");
                let line = match &group.name {
                    Some(label) => format!("{label}: {members}"),
                    None => members,
                };
                rendered.push(line);
            }
            rendered.join("; ")
        }
    }
}
/// Formats one mailbox for display.
///
/// * name and address present: `name <address>`
/// * address only: the address
/// * name only: the name
/// * neither: an empty string
fn format_single_addr(addr: &mail_parser::Addr<'_>) -> String {
    // Without an address, the best we can show is the display name (if any).
    let Some(email) = &addr.address else {
        return addr
            .name
            .as_ref()
            .map(|name| name.to_string())
            .unwrap_or_default();
    };

    match &addr.name {
        Some(name) => format!("{name} <{email}>"),
        None => email.to_string(),
    }
}
/// Reduces an HTML document to its human-readable text.
///
/// The input is parsed as a full HTML document. Every text node is visited
/// in document order, trimmed, and kept if non-empty; the surviving pieces
/// are joined with a single space.
///
/// Text that lives inside `<script>` or `<style>` elements is skipped: it is
/// code or styling rather than message content, and HTML email routinely
/// embeds `<style>` blocks that would otherwise leak CSS into the result.
fn strip_html(html: &str) -> String {
    let document = Html::parse_document(html);

    let pieces: Vec<&str> = document
        .root_element()
        .descendants()
        .filter_map(|node| {
            // Only text nodes contribute to the output.
            let text = node.value().as_text()?;

            // Drop text nested anywhere below a <script> or <style> element.
            let is_non_content = node.ancestors().any(|ancestor| {
                ancestor
                    .value()
                    .as_element()
                    .is_some_and(|element| matches!(element.name(), "script" | "style"))
            });
            if is_non_content {
                return None;
            }

            let trimmed = text.trim();
            (!trimmed.is_empty()).then_some(trimmed)
        })
        .collect();

    pieces.join(" ")
}
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    /// A message with the usual headers and a plain-text body surfaces all
    /// of them in the extracted text.
    #[test]
    fn parse_simple_email() {
        let raw = b"From: sender@example.com\r\n\
            To: recipient@example.com\r\n\
            Subject: Test Email\r\n\
            Date: Mon, 1 Jan 2024 00:00:00 +0000\r\n\
            \r\n\
            Hello, this is a test email body.\r\n";

        let text = extract(raw).unwrap();

        let expected_fragments = [
            "From:",
            "sender@example.com",
            "To:",
            "recipient@example.com",
            "Subject: Test Email",
            "Hello, this is a test email body.",
        ];
        for fragment in expected_fragments {
            assert!(text.contains(fragment));
        }
    }

    /// A message without any body still yields its headers.
    #[test]
    fn parse_email_without_body() {
        let raw = b"From: sender@example.com\r\n\
            Subject: No Body\r\n\
            \r\n";

        let text = extract(raw).unwrap();

        assert!(text.contains("Subject: No Body"));
    }

    /// Empty input is rejected with `ExtractionFailed`.
    #[test]
    fn invalid_email_returns_error() {
        let outcome = extract(b"");
        assert!(outcome.is_err());

        let failure = outcome.unwrap_err();
        assert!(
            matches!(failure, LoaderError::ExtractionFailed(_)),
            "expected ExtractionFailed, got {failure:?}"
        );
    }

    /// Markup is removed while the enclosed text is kept.
    #[test]
    fn strip_html_basic() {
        let text = strip_html("<html><body><p>Hello</p></body></html>");

        assert!(text.contains("Hello"));
        assert!(!text.contains("<p>"));
    }
}