1use chrono::{DateTime, Utc};
2use mail_parser::{Message, MessageParser, MimeHeaders};
3use mxr_core::types::{
4 Address, CalendarMetadata, MessageMetadata, TextPlainFormat, UnsubscribeMethod,
5};
6use regex::Regex;
7use std::sync::OnceLock;
8use url::Url;
9
10#[derive(Debug, Clone)]
11pub struct ParsedHeaders {
12 pub from: Option<Address>,
13 pub to: Vec<Address>,
14 pub cc: Vec<Address>,
15 pub bcc: Vec<Address>,
16 pub subject: String,
17 pub date: DateTime<Utc>,
18 pub message_id_header: Option<String>,
19 pub in_reply_to: Option<String>,
20 pub references: Vec<String>,
21 pub unsubscribe: UnsubscribeMethod,
22 pub metadata: MessageMetadata,
23}
24
25#[derive(Debug, thiserror::Error)]
26pub enum ParseError {
27 #[error("failed to parse RFC 5322 headers")]
28 InvalidMessage,
29}
30
/// Serializes `(name, value)` pairs into a raw header block.
///
/// Every pair becomes one `name: value` line terminated by CRLF. Names and
/// values are written verbatim; an empty slice yields an empty string.
pub fn raw_headers_from_pairs(headers: &[(String, String)]) -> String {
    let mut block = String::new();
    for (name, value) in headers {
        block.push_str(name);
        block.push_str(": ");
        block.push_str(value);
        block.push_str("\r\n");
    }
    block
}
37
38pub fn parse_headers_from_pairs(
39 headers: &[(String, String)],
40 fallback_date: Option<DateTime<Utc>>,
41) -> Result<ParsedHeaders, ParseError> {
42 parse_headers_from_raw(&raw_headers_from_pairs(headers), fallback_date)
43}
44
45pub fn parse_headers_from_raw(
46 raw_headers: &str,
47 fallback_date: Option<DateTime<Utc>>,
48) -> Result<ParsedHeaders, ParseError> {
49 let mut raw_message = normalize_header_block(raw_headers);
50 raw_message.push_str("\r\n");
51 let parsed = MessageParser::default()
52 .parse(raw_message.as_bytes())
53 .ok_or(ParseError::InvalidMessage)?;
54 Ok(extract_parsed_headers(
55 &parsed,
56 Some(normalize_header_block(raw_headers)),
57 fallback_date,
58 ))
59}
60
61pub fn parse_address_list(raw: &str) -> Vec<Address> {
62 if raw.trim().is_empty() {
63 return Vec::new();
64 }
65
66 parse_headers_from_pairs(
67 &[("To".to_string(), raw.to_string())],
68 Some(Utc::now()),
69 )
70 .map(|parsed| parsed.to)
71 .unwrap_or_default()
72}
73
74pub fn parse_message_metadata_from_raw(raw_message: &[u8]) -> Result<MessageMetadata, ParseError> {
75 let parsed = MessageParser::default()
76 .parse(raw_message)
77 .ok_or(ParseError::InvalidMessage)?;
78 let raw_headers = extract_raw_header_block(raw_message);
79 Ok(extract_metadata(&parsed, raw_headers))
80}
81
82pub fn body_unsubscribe_from_html(html: &str) -> Option<UnsubscribeMethod> {
83 static HREF_RE: OnceLock<Regex> = OnceLock::new();
84 let re = HREF_RE.get_or_init(|| {
85 Regex::new(r#"(?is)href\s*=\s*["']([^"']*(unsubscribe|opt-out|preferences)[^"']*)["']"#)
86 .unwrap()
87 });
88 re.captures(html).and_then(|caps| {
89 caps.get(1).map(|url| UnsubscribeMethod::BodyLink {
90 url: html_unescape(url.as_str()),
91 })
92 })
93}
94
95pub fn decode_format_flowed(text: &str, delsp: bool) -> String {
96 let mut out = String::new();
97 let mut current = String::new();
98
99 for line in text.lines() {
100 if line == "-- " {
101 flush_paragraph(&mut out, &mut current);
102 out.push_str("-- \n");
103 continue;
104 }
105
106 if line.is_empty() {
107 flush_paragraph(&mut out, &mut current);
108 out.push('\n');
109 continue;
110 }
111
112 let flowed = line.ends_with(' ');
113 let segment = if flowed && delsp {
114 line.trim_end_matches(' ')
115 } else {
116 line
117 };
118
119 current.push_str(segment);
120 if flowed {
121 if !delsp {
122 current.push(' ');
123 }
124 } else {
125 flush_paragraph(&mut out, &mut current);
126 }
127 }
128
129 flush_paragraph(&mut out, &mut current);
130 out.trim_end().to_string()
131}
132
133pub fn calendar_metadata_from_text(calendar_text: &str) -> Option<CalendarMetadata> {
134 let mut method = None;
135 let mut summary = None;
136
137 for line in calendar_text.lines() {
138 let line = line.trim();
139 if method.is_none() {
140 method = line.strip_prefix("METHOD:").map(|value| value.trim().to_string());
141 }
142 if summary.is_none() {
143 summary = line.strip_prefix("SUMMARY:").map(|value| value.trim().to_string());
144 }
145 if method.is_some() && summary.is_some() {
146 break;
147 }
148 }
149
150 if method.is_some() || summary.is_some() {
151 Some(CalendarMetadata { method, summary })
152 } else {
153 None
154 }
155}
156
157pub fn extract_parsed_headers(
158 message: &Message<'_>,
159 raw_headers: Option<String>,
160 fallback_date: Option<DateTime<Utc>>,
161) -> ParsedHeaders {
162 ParsedHeaders {
163 from: message.from().and_then(extract_first_addr),
164 to: message.to().map(extract_addrs).unwrap_or_default(),
165 cc: message.cc().map(extract_addrs).unwrap_or_default(),
166 bcc: message.bcc().map(extract_addrs).unwrap_or_default(),
167 subject: message
168 .subject()
169 .map(|subject| subject.to_string())
170 .unwrap_or_default(),
171 date: message
172 .date()
173 .and_then(|date| DateTime::from_timestamp(date.to_timestamp(), 0))
174 .or(fallback_date)
175 .unwrap_or_else(Utc::now),
176 message_id_header: message.message_id().map(|id| format!("<{id}>")),
177 in_reply_to: message
178 .in_reply_to()
179 .as_text_list()
180 .and_then(|ids| ids.first().map(|id| format!("<{id}>"))),
181 references: message
182 .references()
183 .as_text_list()
184 .map(|ids| ids.iter().map(|id| format!("<{id}>")).collect())
185 .unwrap_or_default(),
186 unsubscribe: parse_list_unsubscribe(message),
187 metadata: extract_metadata(message, raw_headers),
188 }
189}
190
191fn extract_metadata(message: &Message<'_>, raw_headers: Option<String>) -> MessageMetadata {
192 let content_language = message
193 .header_values("Content-Language")
194 .flat_map(|value| {
195 value
196 .as_text()
197 .unwrap_or_default()
198 .split(',')
199 .map(|lang| lang.trim().to_string())
200 .collect::<Vec<_>>()
201 })
202 .filter(|lang| !lang.is_empty())
203 .collect();
204
205 let auth_results = message
206 .header_values("Authentication-Results")
207 .filter_map(|value| value.as_text().map(|value| value.to_string()))
208 .collect();
209
210 let list_id = message.list_id().as_text().map(|value| value.to_string());
211 let text_plain_format = message.content_type().and_then(parse_text_plain_format);
212
213 MessageMetadata {
214 list_id,
215 auth_results,
216 content_language,
217 text_plain_format,
218 calendar: None,
219 raw_headers,
220 }
221}
222
223fn parse_text_plain_format(content_type: &mail_parser::ContentType<'_>) -> Option<TextPlainFormat> {
224 if !content_type
225 .ctype()
226 .eq_ignore_ascii_case("text")
227 || !content_type
228 .subtype()
229 .unwrap_or_default()
230 .eq_ignore_ascii_case("plain")
231 {
232 return None;
233 }
234
235 let format = content_type.attribute("format");
236 let delsp = content_type
237 .attribute("delsp")
238 .map(|value| value.eq_ignore_ascii_case("yes"))
239 .unwrap_or(false);
240
241 match format {
242 Some(value) if value.eq_ignore_ascii_case("flowed") => {
243 Some(TextPlainFormat::Flowed { delsp })
244 }
245 _ => Some(TextPlainFormat::Fixed),
246 }
247}
248
249fn parse_list_unsubscribe(message: &Message<'_>) -> UnsubscribeMethod {
250 let entries: Vec<String> = match message.list_unsubscribe().as_address() {
251 Some(mail_parser::Address::List(list)) => list
252 .iter()
253 .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
254 .collect(),
255 Some(mail_parser::Address::Group(groups)) => groups
256 .iter()
257 .flat_map(|group| group.addresses.iter())
258 .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
259 .collect(),
260 None => Vec::new(),
261 };
262 if entries.is_empty() {
263 return UnsubscribeMethod::None;
264 }
265
266 let one_click = message
267 .header_raw("List-Unsubscribe-Post")
268 .map(|value| value.to_ascii_lowercase())
269 .map(|value| value.contains("list-unsubscribe=one-click"))
270 .unwrap_or(false);
271
272 if one_click {
273 if let Some(url) = entries
274 .iter()
275 .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
276 {
277 return UnsubscribeMethod::OneClick {
278 url: url.to_string(),
279 };
280 }
281 }
282
283 for entry in &entries {
284 if let Some(mailto) = entry.strip_prefix("mailto:") {
285 return parse_mailto_unsubscribe(mailto);
286 }
287 }
288
289 if let Some(url) = entries
290 .iter()
291 .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
292 {
293 return UnsubscribeMethod::HttpLink {
294 url: url.to_string(),
295 };
296 }
297
298 UnsubscribeMethod::None
299}
300
301fn parse_mailto_unsubscribe(mailto: &str) -> UnsubscribeMethod {
302 let mut subject = None;
303 let address = if let Some((address, query)) = mailto.split_once('?') {
304 for (key, value) in url::form_urlencoded::parse(query.as_bytes()) {
305 if key.eq_ignore_ascii_case("subject") {
306 subject = Some(value.to_string());
307 }
308 }
309 address.to_string()
310 } else if let Ok(url) = Url::parse(&format!("mailto:{mailto}")) {
311 for (key, value) in url.query_pairs() {
312 if key.eq_ignore_ascii_case("subject") {
313 subject = Some(value.to_string());
314 }
315 }
316 url.path().to_string()
317 } else {
318 mailto.to_string()
319 };
320
321 UnsubscribeMethod::Mailto { address, subject }
322}
323
324fn extract_first_addr(addr: &mail_parser::Address<'_>) -> Option<Address> {
325 match addr {
326 mail_parser::Address::List(list) => list.first().map(to_address),
327 mail_parser::Address::Group(groups) => groups
328 .first()
329 .and_then(|group| group.addresses.first())
330 .map(to_address),
331 }
332}
333
334fn extract_addrs(addr: &mail_parser::Address<'_>) -> Vec<Address> {
335 match addr {
336 mail_parser::Address::List(list) => list.iter().map(to_address).collect(),
337 mail_parser::Address::Group(groups) => groups
338 .iter()
339 .flat_map(|group| group.addresses.iter())
340 .map(to_address)
341 .collect(),
342 }
343}
344
345fn to_address(addr: &mail_parser::Addr<'_>) -> Address {
346 Address {
347 name: addr.name().map(|name| name.to_string()),
348 email: addr.address().unwrap_or_default().to_string(),
349 }
350}
351
/// Rewrites a header block so that lines are separated by CRLF.
///
/// Any mix of LF and CRLF line endings is accepted, stray trailing `\r`
/// characters are removed from each line, and the result carries no trailing
/// line terminator.
fn normalize_header_block(raw_headers: &str) -> String {
    let mut normalized = String::with_capacity(raw_headers.len());
    for (index, line) in raw_headers.lines().enumerate() {
        if index > 0 {
            normalized.push_str("\r\n");
        }
        normalized.push_str(line.trim_end_matches('\r'));
    }
    normalized
}
359
360pub fn extract_raw_header_block(raw_message: &[u8]) -> Option<String> {
361 let raw = String::from_utf8_lossy(raw_message);
362 let header_block = raw
363 .split("\r\n\r\n")
364 .next()
365 .or_else(|| raw.split("\n\n").next())?;
366 Some(normalize_header_block(header_block))
367}
368
/// Moves the pending paragraph in `current` to `out`, followed by a newline.
///
/// Does nothing when `current` is empty; otherwise `current` is left empty.
fn flush_paragraph(out: &mut String, current: &mut String) {
    if !current.is_empty() {
        out.extend(current.drain(..));
        out.push('\n');
    }
}
377
/// Decodes the basic HTML character references found in attribute values.
///
/// Recognized references: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`,
/// `&#x27;` and `&apos;`. Anything else, including a lone `&`, is copied
/// through unchanged.
///
/// The input is decoded in a single left-to-right pass, so text produced by
/// decoding is never decoded again (`&amp;lt;` becomes `&lt;`, not `<`).
fn html_unescape(value: &str) -> String {
    const ENTITIES: [(&str, char); 7] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&#x27;", '\''),
        ("&apos;", '\''),
    ];

    let mut decoded = String::with_capacity(value.len());
    let mut rest = value;
    while let Some(pos) = rest.find('&') {
        decoded.push_str(&rest[..pos]);
        rest = &rest[pos..];
        match ENTITIES
            .iter()
            .find(|(entity, _)| rest.starts_with(*entity))
        {
            Some((entity, ch)) => {
                decoded.push(*ch);
                rest = &rest[entity.len()..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
386
#[cfg(test)]
mod tests {
    use super::*;
    use mxr_test_support::{fixture_stem, standards_fixture_bytes, standards_fixture_names};
    use serde_json::json;

    /// Snapshot representation of a single address.
    fn address_json(addr: &Address) -> serde_json::Value {
        json!({"name": addr.name.clone(), "email": addr.email.clone()})
    }

    /// Snapshot representation of an address list, in order.
    fn address_list_json(addrs: &[Address]) -> Vec<serde_json::Value> {
        addrs.iter().map(address_json).collect()
    }

    #[test]
    fn parses_address_list_with_comments_and_quotes() {
        let raw = "\"Last, First\" <first@example.com>, second@example.com";
        let addresses = parse_address_list(raw);
        assert_eq!(addresses.len(), 2);
        assert_eq!(addresses[0].name.as_deref(), Some("Last, First"));
        assert_eq!(addresses[1].email, "second@example.com");
    }

    #[test]
    fn parses_unsubscribe_mailto_subject() {
        let pairs = [(
            "List-Unsubscribe".to_string(),
            "<mailto:list@example.com?subject=unsubscribe>".to_string(),
        )];
        let parsed = parse_headers_from_pairs(&pairs, Some(Utc::now())).unwrap();
        assert!(
            matches!(
                &parsed.unsubscribe,
                UnsubscribeMethod::Mailto {
                    address,
                    subject: Some(subject)
                } if address == "list@example.com" && subject == "unsubscribe"
            ),
            "{:?}",
            parsed.unsubscribe
        );
    }

    #[test]
    fn decodes_format_flowed() {
        let text = "Hello there \r\nworld\r\n\r\nNext paragraph\r\n";
        let decoded = decode_format_flowed(text, false);
        assert_eq!(decoded, "Hello there world\n\nNext paragraph");
    }

    #[test]
    fn extracts_body_unsubscribe_link() {
        let html = r#"<a href="https://example.com/unsubscribe?id=1">unsubscribe</a>"#;
        let found = body_unsubscribe_from_html(html);
        assert!(matches!(
            found,
            Some(UnsubscribeMethod::BodyLink { url }) if url.contains("unsubscribe")
        ));
    }

    #[test]
    fn standards_fixture_folded_flowed_headers_snapshot() {
        let raw = standards_fixture_bytes("folded-flowed.eml");
        let metadata = parse_message_metadata_from_raw(&raw).unwrap();
        let header_block = extract_raw_header_block(&raw).unwrap();
        let headers = parse_headers_from_raw(&header_block, Some(Utc::now())).unwrap();

        insta::assert_yaml_snapshot!(
            "folded_flowed_headers",
            json!({
                "from": headers.from.as_ref().map(address_json),
                "subject": headers.subject,
                "message_id": headers.message_id_header,
                "in_reply_to": headers.in_reply_to,
                "references": headers.references,
                "unsubscribe": format!("{:?}", headers.unsubscribe),
                "list_id": metadata.list_id,
                "auth_results": metadata.auth_results,
                "content_language": metadata.content_language,
                "text_plain_format": format!("{:?}", metadata.text_plain_format),
            })
        );
    }

    #[test]
    fn standards_fixture_minimal_message_metadata_snapshot() {
        let raw = standards_fixture_bytes("malformed-minimal.eml");
        let metadata = parse_message_metadata_from_raw(&raw).unwrap();
        insta::assert_yaml_snapshot!(
            "malformed_minimal_metadata",
            json!({
                "list_id": metadata.list_id,
                "auth_results": metadata.auth_results,
                "content_language": metadata.content_language,
                "text_plain_format": format!("{:?}", metadata.text_plain_format),
                "raw_headers_present": metadata.raw_headers.is_some(),
            })
        );
    }

    #[test]
    fn standards_fixture_header_matrix_snapshots() {
        for fixture in standards_fixture_names() {
            let raw = standards_fixture_bytes(fixture);
            let header_block = extract_raw_header_block(&raw).unwrap_or_default();
            let parsed = parse_headers_from_raw(&header_block, Some(Utc::now())).unwrap();
            let metadata = parse_message_metadata_from_raw(&raw).unwrap();

            insta::assert_yaml_snapshot!(
                format!("fixture_headers__{}", fixture_stem(fixture)),
                json!({
                    "from": parsed.from.as_ref().map(address_json),
                    "to": address_list_json(&parsed.to),
                    "cc": address_list_json(&parsed.cc),
                    "bcc": address_list_json(&parsed.bcc),
                    "subject": parsed.subject,
                    "message_id": parsed.message_id_header,
                    "in_reply_to": parsed.in_reply_to,
                    "references": parsed.references,
                    "unsubscribe": format!("{:?}", parsed.unsubscribe),
                    "list_id": metadata.list_id,
                    "auth_results": metadata.auth_results,
                    "content_language": metadata.content_language,
                    "text_plain_format": format!("{:?}", metadata.text_plain_format),
                    "raw_headers_present": metadata.raw_headers.is_some(),
                })
            );
        }
    }
}