1use chrono::{DateTime, Utc};
2use mail_parser::{Message, MessageParser, MimeHeaders};
3use mxr_core::types::{
4 Address, CalendarMetadata, MessageMetadata, TextPlainFormat, UnsubscribeMethod,
5};
6use regex::Regex;
7use std::sync::OnceLock;
8use url::Url;
9
/// Structured view of the header fields pulled out of a single message.
///
/// Values are produced by [`extract_parsed_headers`].
#[derive(Debug, Clone)]
pub struct ParsedHeaders {
    /// First address of the `From` header, if one is present.
    pub from: Option<Address>,
    /// Every address of the `To` header; empty when the header is absent.
    pub to: Vec<Address>,
    /// Every address of the `Cc` header; empty when the header is absent.
    pub cc: Vec<Address>,
    /// Every address of the `Bcc` header; empty when the header is absent.
    pub bcc: Vec<Address>,
    /// Subject text; empty when the header is absent.
    pub subject: String,
    /// Timestamp of the `Date` header. When that is unavailable the
    /// caller-supplied fallback is used, and failing that the time at which
    /// extraction ran.
    pub date: DateTime<Utc>,
    /// `Message-ID` value wrapped in angle brackets.
    pub message_id_header: Option<String>,
    /// First identifier of `In-Reply-To`, wrapped in angle brackets.
    pub in_reply_to: Option<String>,
    /// Identifiers of `References`, each wrapped in angle brackets.
    pub references: Vec<String>,
    /// Unsubscribe mechanism derived from the `List-Unsubscribe` headers.
    pub unsubscribe: UnsubscribeMethod,
    /// Additional header-derived metadata (list id, authentication results,
    /// content language, text/plain format, raw header block).
    pub metadata: MessageMetadata,
}
24
/// Error type returned by the parsing entry points in this file.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// The underlying message parser returned no message for the input.
    #[error("failed to parse RFC 5322 headers")]
    InvalidMessage,
}
30
/// Serialises `(name, value)` header pairs into a raw header block.
///
/// Each pair becomes one `name: value` line terminated by CRLF. Pairs are
/// written in slice order and both parts are copied verbatim; an empty slice
/// yields an empty string.
pub fn raw_headers_from_pairs(headers: &[(String, String)]) -> String {
    let mut block = String::new();
    for (name, value) in headers {
        block.push_str(name);
        block.push_str(": ");
        block.push_str(value);
        block.push_str("\r\n");
    }
    block
}
37
38pub fn parse_headers_from_pairs(
39 headers: &[(String, String)],
40 fallback_date: Option<DateTime<Utc>>,
41) -> Result<ParsedHeaders, ParseError> {
42 parse_headers_from_raw(&raw_headers_from_pairs(headers), fallback_date)
43}
44
45pub fn parse_headers_from_raw(
46 raw_headers: &str,
47 fallback_date: Option<DateTime<Utc>>,
48) -> Result<ParsedHeaders, ParseError> {
49 let mut raw_message = normalize_header_block(raw_headers);
50 raw_message.push_str("\r\n");
51 let parsed = MessageParser::default()
52 .parse(raw_message.as_bytes())
53 .ok_or(ParseError::InvalidMessage)?;
54 Ok(extract_parsed_headers(
55 &parsed,
56 Some(normalize_header_block(raw_headers)),
57 fallback_date,
58 ))
59}
60
61pub fn parse_address_list(raw: &str) -> Vec<Address> {
62 if raw.trim().is_empty() {
63 return Vec::new();
64 }
65
66 parse_headers_from_pairs(&[("To".to_string(), raw.to_string())], Some(Utc::now()))
67 .map(|parsed| parsed.to)
68 .unwrap_or_default()
69}
70
71pub fn parse_message_metadata_from_raw(raw_message: &[u8]) -> Result<MessageMetadata, ParseError> {
72 let parsed = MessageParser::default()
73 .parse(raw_message)
74 .ok_or(ParseError::InvalidMessage)?;
75 let raw_headers = extract_raw_header_block(raw_message);
76 Ok(extract_metadata(&parsed, raw_headers))
77}
78
79pub fn body_unsubscribe_from_html(html: &str) -> Option<UnsubscribeMethod> {
80 static HREF_RE: OnceLock<Regex> = OnceLock::new();
81 let re = HREF_RE.get_or_init(|| {
82 Regex::new(r#"(?is)href\s*=\s*["']([^"']*(unsubscribe|opt-out|preferences)[^"']*)["']"#)
83 .unwrap()
84 });
85 re.captures(html).and_then(|caps| {
86 caps.get(1).map(|url| UnsubscribeMethod::BodyLink {
87 url: html_unescape(url.as_str()),
88 })
89 })
90}
91
92pub fn decode_format_flowed(text: &str, delsp: bool) -> String {
93 let mut out = String::new();
94 let mut current = String::new();
95
96 for line in text.lines() {
97 if line == "-- " {
98 flush_paragraph(&mut out, &mut current);
99 out.push_str("-- \n");
100 continue;
101 }
102
103 if line.is_empty() {
104 flush_paragraph(&mut out, &mut current);
105 out.push('\n');
106 continue;
107 }
108
109 let flowed = line.ends_with(' ');
110 let segment = if flowed && delsp {
111 line.trim_end_matches(' ')
112 } else {
113 line
114 };
115
116 current.push_str(segment);
117 if flowed {
118 if !delsp {
119 current.push(' ');
120 }
121 } else {
122 flush_paragraph(&mut out, &mut current);
123 }
124 }
125
126 flush_paragraph(&mut out, &mut current);
127 out.trim_end().to_string()
128}
129
130pub fn calendar_metadata_from_text(calendar_text: &str) -> Option<CalendarMetadata> {
131 let mut method = None;
132 let mut summary = None;
133
134 for line in calendar_text.lines() {
135 let line = line.trim();
136 if method.is_none() {
137 method = line
138 .strip_prefix("METHOD:")
139 .map(|value| value.trim().to_string());
140 }
141 if summary.is_none() {
142 summary = line
143 .strip_prefix("SUMMARY:")
144 .map(|value| value.trim().to_string());
145 }
146 if method.is_some() && summary.is_some() {
147 break;
148 }
149 }
150
151 if method.is_some() || summary.is_some() {
152 Some(CalendarMetadata { method, summary })
153 } else {
154 None
155 }
156}
157
158pub fn extract_parsed_headers(
159 message: &Message<'_>,
160 raw_headers: Option<String>,
161 fallback_date: Option<DateTime<Utc>>,
162) -> ParsedHeaders {
163 ParsedHeaders {
164 from: message.from().and_then(extract_first_addr),
165 to: message.to().map(extract_addrs).unwrap_or_default(),
166 cc: message.cc().map(extract_addrs).unwrap_or_default(),
167 bcc: message.bcc().map(extract_addrs).unwrap_or_default(),
168 subject: message
169 .subject()
170 .map(|subject| subject.to_string())
171 .unwrap_or_default(),
172 date: message
173 .date()
174 .and_then(|date| DateTime::from_timestamp(date.to_timestamp(), 0))
175 .or(fallback_date)
176 .unwrap_or_else(Utc::now),
177 message_id_header: message.message_id().map(|id| format!("<{id}>")),
178 in_reply_to: message
179 .in_reply_to()
180 .as_text_list()
181 .and_then(|ids| ids.first().map(|id| format!("<{id}>"))),
182 references: message
183 .references()
184 .as_text_list()
185 .map(|ids| ids.iter().map(|id| format!("<{id}>")).collect())
186 .unwrap_or_default(),
187 unsubscribe: parse_list_unsubscribe(message),
188 metadata: extract_metadata(message, raw_headers),
189 }
190}
191
192fn extract_metadata(message: &Message<'_>, raw_headers: Option<String>) -> MessageMetadata {
193 let content_language = message
194 .header_values("Content-Language")
195 .flat_map(|value| {
196 value
197 .as_text()
198 .unwrap_or_default()
199 .split(',')
200 .map(|lang| lang.trim().to_string())
201 .collect::<Vec<_>>()
202 })
203 .filter(|lang| !lang.is_empty())
204 .collect();
205
206 let auth_results = message
207 .header_values("Authentication-Results")
208 .filter_map(|value| value.as_text().map(|value| value.to_string()))
209 .collect();
210
211 let list_id = message.list_id().as_text().map(|value| value.to_string());
212 let text_plain_format = message.content_type().and_then(parse_text_plain_format);
213
214 MessageMetadata {
215 list_id,
216 auth_results,
217 content_language,
218 text_plain_format,
219 calendar: None,
220 raw_headers,
221 }
222}
223
224fn parse_text_plain_format(content_type: &mail_parser::ContentType<'_>) -> Option<TextPlainFormat> {
225 if !content_type.ctype().eq_ignore_ascii_case("text")
226 || !content_type
227 .subtype()
228 .unwrap_or_default()
229 .eq_ignore_ascii_case("plain")
230 {
231 return None;
232 }
233
234 let format = content_type.attribute("format");
235 let delsp = content_type
236 .attribute("delsp")
237 .map(|value| value.eq_ignore_ascii_case("yes"))
238 .unwrap_or(false);
239
240 match format {
241 Some(value) if value.eq_ignore_ascii_case("flowed") => {
242 Some(TextPlainFormat::Flowed { delsp })
243 }
244 _ => Some(TextPlainFormat::Fixed),
245 }
246}
247
248fn parse_list_unsubscribe(message: &Message<'_>) -> UnsubscribeMethod {
249 let entries: Vec<String> = match message.list_unsubscribe().as_address() {
250 Some(mail_parser::Address::List(list)) => list
251 .iter()
252 .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
253 .collect(),
254 Some(mail_parser::Address::Group(groups)) => groups
255 .iter()
256 .flat_map(|group| group.addresses.iter())
257 .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
258 .collect(),
259 None => Vec::new(),
260 };
261 if entries.is_empty() {
262 return UnsubscribeMethod::None;
263 }
264
265 let one_click = message
266 .header_raw("List-Unsubscribe-Post")
267 .map(|value| value.to_ascii_lowercase())
268 .map(|value| value.contains("list-unsubscribe=one-click"))
269 .unwrap_or(false);
270
271 if one_click {
272 if let Some(url) = entries
273 .iter()
274 .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
275 {
276 return UnsubscribeMethod::OneClick {
277 url: url.to_string(),
278 };
279 }
280 }
281
282 for entry in &entries {
283 if let Some(mailto) = entry.strip_prefix("mailto:") {
284 return parse_mailto_unsubscribe(mailto);
285 }
286 }
287
288 if let Some(url) = entries
289 .iter()
290 .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
291 {
292 return UnsubscribeMethod::HttpLink {
293 url: url.to_string(),
294 };
295 }
296
297 UnsubscribeMethod::None
298}
299
300fn parse_mailto_unsubscribe(mailto: &str) -> UnsubscribeMethod {
301 let mut subject = None;
302 let address = if let Some((address, query)) = mailto.split_once('?') {
303 for (key, value) in url::form_urlencoded::parse(query.as_bytes()) {
304 if key.eq_ignore_ascii_case("subject") {
305 subject = Some(value.to_string());
306 }
307 }
308 address.to_string()
309 } else if let Ok(url) = Url::parse(&format!("mailto:{mailto}")) {
310 for (key, value) in url.query_pairs() {
311 if key.eq_ignore_ascii_case("subject") {
312 subject = Some(value.to_string());
313 }
314 }
315 url.path().to_string()
316 } else {
317 mailto.to_string()
318 };
319
320 UnsubscribeMethod::Mailto { address, subject }
321}
322
323fn extract_first_addr(addr: &mail_parser::Address<'_>) -> Option<Address> {
324 match addr {
325 mail_parser::Address::List(list) => list.first().map(to_address),
326 mail_parser::Address::Group(groups) => groups
327 .first()
328 .and_then(|group| group.addresses.first())
329 .map(to_address),
330 }
331}
332
333fn extract_addrs(addr: &mail_parser::Address<'_>) -> Vec<Address> {
334 match addr {
335 mail_parser::Address::List(list) => list.iter().map(to_address).collect(),
336 mail_parser::Address::Group(groups) => groups
337 .iter()
338 .flat_map(|group| group.addresses.iter())
339 .map(to_address)
340 .collect(),
341 }
342}
343
344fn to_address(addr: &mail_parser::Addr<'_>) -> Address {
345 Address {
346 name: addr.name().map(|name| name.to_string()),
347 email: addr.address().unwrap_or_default().to_string(),
348 }
349}
350
/// Rewrites a header block so that its lines are separated by CRLF.
///
/// Trailing carriage returns are removed from every line and the result has
/// no trailing line terminator.
fn normalize_header_block(raw_headers: &str) -> String {
    let mut normalized = String::with_capacity(raw_headers.len());
    for (index, line) in raw_headers.lines().enumerate() {
        if index > 0 {
            normalized.push_str("\r\n");
        }
        normalized.push_str(line.trim_end_matches('\r'));
    }
    normalized
}
358
359pub fn extract_raw_header_block(raw_message: &[u8]) -> Option<String> {
360 let raw = String::from_utf8_lossy(raw_message);
361 let header_block = raw
362 .split("\r\n\r\n")
363 .next()
364 .or_else(|| raw.split("\n\n").next())?;
365 Some(normalize_header_block(header_block))
366}
367
/// Moves the pending paragraph into `out`.
///
/// When `current` is non-empty it is appended to `out` followed by `\n` and
/// then cleared; an empty `current` leaves both strings untouched.
fn flush_paragraph(out: &mut String, current: &mut String) {
    if !current.is_empty() {
        out.push_str(current.as_str());
        out.push('\n');
        current.clear();
    }
}
376
/// Decodes the HTML character references that commonly occur in attribute
/// values: `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;`.
///
/// `&amp;` is decoded last so that text which was escaped twice (for example
/// `&amp;lt;`) is unescaped by exactly one level instead of collapsing all the
/// way to `<`.
fn html_unescape(value: &str) -> String {
    value
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
385
// Unit and snapshot tests for the parsing helpers in this file.
#[cfg(test)]
mod tests {
    use super::*;
    use mxr_test_support::{fixture_stem, standards_fixture_bytes, standards_fixture_names};
    use serde_json::json;

    // A quoted display name that contains a comma must stay one address.
    #[test]
    fn parses_address_list_with_comments_and_quotes() {
        let addresses =
            parse_address_list("\"Last, First\" <first@example.com>, second@example.com");
        assert_eq!(addresses.len(), 2);
        assert_eq!(addresses[0].name.as_deref(), Some("Last, First"));
        assert_eq!(addresses[1].email, "second@example.com");
    }

    // A mailto unsubscribe entry is split into address and `subject` parameter.
    #[test]
    fn parses_unsubscribe_mailto_subject() {
        let parsed = parse_headers_from_pairs(
            &[(
                "List-Unsubscribe".to_string(),
                "<mailto:list@example.com?subject=unsubscribe>".to_string(),
            )],
            Some(Utc::now()),
        )
        .unwrap();
        assert!(
            matches!(
                &parsed.unsubscribe,
                UnsubscribeMethod::Mailto {
                    address,
                    subject: Some(subject)
                } if address == "list@example.com" && subject == "unsubscribe"
            ),
            "{:?}",
            parsed.unsubscribe
        );
    }

    // A line ending in a space is joined with the following line; an empty
    // line separates paragraphs.
    #[test]
    fn decodes_format_flowed() {
        let text = "Hello there \r\nworld\r\n\r\nNext paragraph\r\n";
        assert_eq!(
            decode_format_flowed(text, false),
            "Hello there world\n\nNext paragraph"
        );
    }

    // An href containing "unsubscribe" is reported as a body link.
    #[test]
    fn extracts_body_unsubscribe_link() {
        let html = r#"<a href="https://example.com/unsubscribe?id=1">unsubscribe</a>"#;
        assert!(matches!(
            body_unsubscribe_from_html(html),
            Some(UnsubscribeMethod::BodyLink { url }) if url.contains("unsubscribe")
        ));
    }

    // Snapshot of the headers and metadata parsed from the
    // `folded-flowed.eml` fixture.
    #[test]
    fn standards_fixture_folded_flowed_headers_snapshot() {
        let raw = standards_fixture_bytes("folded-flowed.eml");
        let parsed = parse_message_metadata_from_raw(&raw).unwrap();
        let headers =
            parse_headers_from_raw(&extract_raw_header_block(&raw).unwrap(), Some(Utc::now()))
                .unwrap();

        insta::assert_yaml_snapshot!(
            "folded_flowed_headers",
            json!({
                "from": headers.from.as_ref().map(|addr| json!({"name": addr.name.clone(), "email": addr.email.clone()})),
                "subject": headers.subject,
                "message_id": headers.message_id_header,
                "in_reply_to": headers.in_reply_to,
                "references": headers.references,
                "unsubscribe": format!("{:?}", headers.unsubscribe),
                "list_id": parsed.list_id,
                "auth_results": parsed.auth_results,
                "content_language": parsed.content_language,
                "text_plain_format": format!("{:?}", parsed.text_plain_format),
            })
        );
    }

    // Snapshot of the metadata parsed from the `malformed-minimal.eml`
    // fixture.
    #[test]
    fn standards_fixture_minimal_message_metadata_snapshot() {
        let raw = standards_fixture_bytes("malformed-minimal.eml");
        let parsed = parse_message_metadata_from_raw(&raw).unwrap();
        insta::assert_yaml_snapshot!(
            "malformed_minimal_metadata",
            json!({
                "list_id": parsed.list_id,
                "auth_results": parsed.auth_results,
                "content_language": parsed.content_language,
                "text_plain_format": format!("{:?}", parsed.text_plain_format),
                "raw_headers_present": parsed.raw_headers.is_some(),
            })
        );
    }

    // One snapshot per standards fixture, named after the fixture stem, that
    // covers both the parsed headers and the message metadata.
    #[test]
    fn standards_fixture_header_matrix_snapshots() {
        for fixture in standards_fixture_names() {
            let raw = standards_fixture_bytes(fixture);
            let headers = extract_raw_header_block(&raw).unwrap_or_default();
            let parsed = parse_headers_from_raw(&headers, Some(Utc::now())).unwrap();
            let metadata = parse_message_metadata_from_raw(&raw).unwrap();

            insta::assert_yaml_snapshot!(
                format!("fixture_headers__{}", fixture_stem(fixture)),
                json!({
                    "from": parsed.from.as_ref().map(|addr| json!({"name": addr.name.clone(), "email": addr.email.clone()})),
                    "to": parsed.to.iter().map(|addr| json!({"name": addr.name.clone(), "email": addr.email.clone()})).collect::<Vec<_>>(),
                    "cc": parsed.cc.iter().map(|addr| json!({"name": addr.name.clone(), "email": addr.email.clone()})).collect::<Vec<_>>(),
                    "bcc": parsed.bcc.iter().map(|addr| json!({"name": addr.name.clone(), "email": addr.email.clone()})).collect::<Vec<_>>(),
                    "subject": parsed.subject,
                    "message_id": parsed.message_id_header,
                    "in_reply_to": parsed.in_reply_to,
                    "references": parsed.references,
                    "unsubscribe": format!("{:?}", parsed.unsubscribe),
                    "list_id": metadata.list_id,
                    "auth_results": metadata.auth_results,
                    "content_language": metadata.content_language,
                    "text_plain_format": format!("{:?}", metadata.text_plain_format),
                    "raw_headers_present": metadata.raw_headers.is_some(),
                })
            );
        }
    }
}