Skip to main content

mxr_compose/
parse.rs

1use chrono::{DateTime, Utc};
2use mail_parser::{Message, MessageParser, MimeHeaders};
3use mxr_core::types::{
4    Address, CalendarMetadata, MessageMetadata, TextPlainFormat, UnsubscribeMethod,
5};
6use regex::Regex;
7use std::sync::OnceLock;
8use url::Url;
9
10#[derive(Debug, Clone)]
11pub struct ParsedHeaders {
12    pub from: Option<Address>,
13    pub to: Vec<Address>,
14    pub cc: Vec<Address>,
15    pub bcc: Vec<Address>,
16    pub subject: String,
17    pub date: DateTime<Utc>,
18    pub message_id_header: Option<String>,
19    pub in_reply_to: Option<String>,
20    pub references: Vec<String>,
21    pub unsubscribe: UnsubscribeMethod,
22    pub metadata: MessageMetadata,
23}
24
25#[derive(Debug, thiserror::Error)]
26pub enum ParseError {
27    #[error("failed to parse RFC 5322 headers")]
28    InvalidMessage,
29}
30
/// Serializes `(name, value)` pairs into a raw header block.
///
/// Each pair becomes one `Name: value` line terminated by CRLF; names and
/// values are written verbatim. An empty slice yields an empty string.
pub fn raw_headers_from_pairs(headers: &[(String, String)]) -> String {
    let mut block = String::new();
    for (name, value) in headers {
        block.push_str(name);
        block.push_str(": ");
        block.push_str(value);
        block.push_str("\r\n");
    }
    block
}
37
38pub fn parse_headers_from_pairs(
39    headers: &[(String, String)],
40    fallback_date: Option<DateTime<Utc>>,
41) -> Result<ParsedHeaders, ParseError> {
42    parse_headers_from_raw(&raw_headers_from_pairs(headers), fallback_date)
43}
44
45pub fn parse_headers_from_raw(
46    raw_headers: &str,
47    fallback_date: Option<DateTime<Utc>>,
48) -> Result<ParsedHeaders, ParseError> {
49    let mut raw_message = normalize_header_block(raw_headers);
50    raw_message.push_str("\r\n");
51    let parsed = MessageParser::default()
52        .parse(raw_message.as_bytes())
53        .ok_or(ParseError::InvalidMessage)?;
54    Ok(extract_parsed_headers(
55        &parsed,
56        Some(normalize_header_block(raw_headers)),
57        fallback_date,
58    ))
59}
60
61pub fn parse_address_list(raw: &str) -> Vec<Address> {
62    if raw.trim().is_empty() {
63        return Vec::new();
64    }
65
66    parse_headers_from_pairs(&[("To".to_string(), raw.to_string())], Some(Utc::now()))
67        .map(|parsed| parsed.to)
68        .unwrap_or_default()
69}
70
71pub fn parse_message_metadata_from_raw(raw_message: &[u8]) -> Result<MessageMetadata, ParseError> {
72    let parsed = MessageParser::default()
73        .parse(raw_message)
74        .ok_or(ParseError::InvalidMessage)?;
75    let raw_headers = extract_raw_header_block(raw_message);
76    Ok(extract_metadata(&parsed, raw_headers))
77}
78
79pub fn body_unsubscribe_from_html(html: &str) -> Option<UnsubscribeMethod> {
80    static HREF_RE: OnceLock<Regex> = OnceLock::new();
81    let re = HREF_RE.get_or_init(|| {
82        Regex::new(r#"(?is)href\s*=\s*["']([^"']*(unsubscribe|opt-out|preferences)[^"']*)["']"#)
83            .unwrap()
84    });
85    re.captures(html).and_then(|caps| {
86        caps.get(1).map(|url| UnsubscribeMethod::BodyLink {
87            url: html_unescape(url.as_str()),
88        })
89    })
90}
91
92pub fn decode_format_flowed(text: &str, delsp: bool) -> String {
93    let mut out = String::new();
94    let mut current = String::new();
95
96    for line in text.lines() {
97        if line == "-- " {
98            flush_paragraph(&mut out, &mut current);
99            out.push_str("-- \n");
100            continue;
101        }
102
103        if line.is_empty() {
104            flush_paragraph(&mut out, &mut current);
105            out.push('\n');
106            continue;
107        }
108
109        let flowed = line.ends_with(' ');
110        let segment = if flowed && delsp {
111            line.trim_end_matches(' ')
112        } else {
113            line
114        };
115
116        current.push_str(segment);
117        if flowed {
118            if !delsp {
119                current.push(' ');
120            }
121        } else {
122            flush_paragraph(&mut out, &mut current);
123        }
124    }
125
126    flush_paragraph(&mut out, &mut current);
127    out.trim_end().to_string()
128}
129
130pub fn calendar_metadata_from_text(calendar_text: &str) -> Option<CalendarMetadata> {
131    let mut method = None;
132    let mut summary = None;
133
134    for line in calendar_text.lines() {
135        let line = line.trim();
136        if method.is_none() {
137            method = line
138                .strip_prefix("METHOD:")
139                .map(|value| value.trim().to_string());
140        }
141        if summary.is_none() {
142            summary = line
143                .strip_prefix("SUMMARY:")
144                .map(|value| value.trim().to_string());
145        }
146        if method.is_some() && summary.is_some() {
147            break;
148        }
149    }
150
151    if method.is_some() || summary.is_some() {
152        Some(CalendarMetadata { method, summary })
153    } else {
154        None
155    }
156}
157
158pub fn extract_parsed_headers(
159    message: &Message<'_>,
160    raw_headers: Option<String>,
161    fallback_date: Option<DateTime<Utc>>,
162) -> ParsedHeaders {
163    ParsedHeaders {
164        from: message.from().and_then(extract_first_addr),
165        to: message.to().map(extract_addrs).unwrap_or_default(),
166        cc: message.cc().map(extract_addrs).unwrap_or_default(),
167        bcc: message.bcc().map(extract_addrs).unwrap_or_default(),
168        subject: message
169            .subject()
170            .map(|subject| subject.to_string())
171            .unwrap_or_default(),
172        date: message
173            .date()
174            .and_then(|date| DateTime::from_timestamp(date.to_timestamp(), 0))
175            .or(fallback_date)
176            .unwrap_or_else(Utc::now),
177        message_id_header: message.message_id().map(|id| format!("<{id}>")),
178        in_reply_to: message
179            .in_reply_to()
180            .as_text_list()
181            .and_then(|ids| ids.first().map(|id| format!("<{id}>"))),
182        references: message
183            .references()
184            .as_text_list()
185            .map(|ids| ids.iter().map(|id| format!("<{id}>")).collect())
186            .unwrap_or_default(),
187        unsubscribe: parse_list_unsubscribe(message),
188        metadata: extract_metadata(message, raw_headers),
189    }
190}
191
192fn extract_metadata(message: &Message<'_>, raw_headers: Option<String>) -> MessageMetadata {
193    let content_language = message
194        .header_values("Content-Language")
195        .flat_map(|value| {
196            value
197                .as_text()
198                .unwrap_or_default()
199                .split(',')
200                .map(|lang| lang.trim().to_string())
201                .collect::<Vec<_>>()
202        })
203        .filter(|lang| !lang.is_empty())
204        .collect();
205
206    let auth_results = message
207        .header_values("Authentication-Results")
208        .filter_map(|value| value.as_text().map(|value| value.to_string()))
209        .collect();
210
211    let list_id = message.list_id().as_text().map(|value| value.to_string());
212    let text_plain_format = message.content_type().and_then(parse_text_plain_format);
213
214    MessageMetadata {
215        list_id,
216        auth_results,
217        content_language,
218        text_plain_format,
219        calendar: None,
220        raw_headers,
221    }
222}
223
224fn parse_text_plain_format(content_type: &mail_parser::ContentType<'_>) -> Option<TextPlainFormat> {
225    if !content_type.ctype().eq_ignore_ascii_case("text")
226        || !content_type
227            .subtype()
228            .unwrap_or_default()
229            .eq_ignore_ascii_case("plain")
230    {
231        return None;
232    }
233
234    let format = content_type.attribute("format");
235    let delsp = content_type
236        .attribute("delsp")
237        .map(|value| value.eq_ignore_ascii_case("yes"))
238        .unwrap_or(false);
239
240    match format {
241        Some(value) if value.eq_ignore_ascii_case("flowed") => {
242            Some(TextPlainFormat::Flowed { delsp })
243        }
244        _ => Some(TextPlainFormat::Fixed),
245    }
246}
247
248fn parse_list_unsubscribe(message: &Message<'_>) -> UnsubscribeMethod {
249    let entries: Vec<String> = match message.list_unsubscribe().as_address() {
250        Some(mail_parser::Address::List(list)) => list
251            .iter()
252            .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
253            .collect(),
254        Some(mail_parser::Address::Group(groups)) => groups
255            .iter()
256            .flat_map(|group| group.addresses.iter())
257            .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
258            .collect(),
259        None => Vec::new(),
260    };
261    if entries.is_empty() {
262        return UnsubscribeMethod::None;
263    }
264
265    let one_click = message
266        .header_raw("List-Unsubscribe-Post")
267        .map(|value| value.to_ascii_lowercase())
268        .map(|value| value.contains("list-unsubscribe=one-click"))
269        .unwrap_or(false);
270
271    if one_click {
272        if let Some(url) = entries
273            .iter()
274            .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
275        {
276            return UnsubscribeMethod::OneClick {
277                url: url.to_string(),
278            };
279        }
280    }
281
282    for entry in &entries {
283        if let Some(mailto) = entry.strip_prefix("mailto:") {
284            return parse_mailto_unsubscribe(mailto);
285        }
286    }
287
288    if let Some(url) = entries
289        .iter()
290        .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
291    {
292        return UnsubscribeMethod::HttpLink {
293            url: url.to_string(),
294        };
295    }
296
297    UnsubscribeMethod::None
298}
299
300fn parse_mailto_unsubscribe(mailto: &str) -> UnsubscribeMethod {
301    let mut subject = None;
302    let address = if let Some((address, query)) = mailto.split_once('?') {
303        for (key, value) in url::form_urlencoded::parse(query.as_bytes()) {
304            if key.eq_ignore_ascii_case("subject") {
305                subject = Some(value.to_string());
306            }
307        }
308        address.to_string()
309    } else if let Ok(url) = Url::parse(&format!("mailto:{mailto}")) {
310        for (key, value) in url.query_pairs() {
311            if key.eq_ignore_ascii_case("subject") {
312                subject = Some(value.to_string());
313            }
314        }
315        url.path().to_string()
316    } else {
317        mailto.to_string()
318    };
319
320    UnsubscribeMethod::Mailto { address, subject }
321}
322
323fn extract_first_addr(addr: &mail_parser::Address<'_>) -> Option<Address> {
324    match addr {
325        mail_parser::Address::List(list) => list.first().map(to_address),
326        mail_parser::Address::Group(groups) => groups
327            .first()
328            .and_then(|group| group.addresses.first())
329            .map(to_address),
330    }
331}
332
333fn extract_addrs(addr: &mail_parser::Address<'_>) -> Vec<Address> {
334    match addr {
335        mail_parser::Address::List(list) => list.iter().map(to_address).collect(),
336        mail_parser::Address::Group(groups) => groups
337            .iter()
338            .flat_map(|group| group.addresses.iter())
339            .map(to_address)
340            .collect(),
341    }
342}
343
344fn to_address(addr: &mail_parser::Addr<'_>) -> Address {
345    Address {
346        name: addr.name().map(|name| name.to_string()),
347        email: addr.address().unwrap_or_default().to_string(),
348    }
349}
350
/// Rewrites a header block so that its lines are joined by CRLF.
///
/// Trailing carriage returns are stripped from every line and the result has
/// no trailing line terminator.
fn normalize_header_block(raw_headers: &str) -> String {
    let mut normalized = String::with_capacity(raw_headers.len());
    for (index, line) in raw_headers.lines().enumerate() {
        if index > 0 {
            normalized.push_str("\r\n");
        }
        normalized.push_str(line.trim_end_matches('\r'));
    }
    normalized
}
358
359pub fn extract_raw_header_block(raw_message: &[u8]) -> Option<String> {
360    let raw = String::from_utf8_lossy(raw_message);
361    let header_block = raw
362        .split("\r\n\r\n")
363        .next()
364        .or_else(|| raw.split("\n\n").next())?;
365    Some(normalize_header_block(header_block))
366}
367
/// Moves the pending paragraph in `current` to the end of `out`, followed by
/// a newline, and leaves `current` empty. Does nothing when `current` is
/// already empty.
fn flush_paragraph(out: &mut String, current: &mut String) {
    if !current.is_empty() {
        out.extend(current.drain(..));
        out.push('\n');
    }
}
376
/// Decodes the HTML entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;`.
///
/// The input is decoded in a single left-to-right pass, so text produced by
/// decoding one entity is never decoded again (`&amp;lt;` becomes `&lt;`, not
/// `<`). Unknown entities and bare ampersands are copied through unchanged.
fn html_unescape(value: &str) -> String {
    const ENTITIES: [(&str, char); 5] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
    ];

    let mut decoded = String::with_capacity(value.len());
    let mut rest = value;
    while let Some(pos) = rest.find('&') {
        decoded.push_str(&rest[..pos]);
        rest = &rest[pos..];
        match ENTITIES
            .iter()
            .find(|(entity, _)| rest.starts_with(*entity))
        {
            Some((entity, replacement)) => {
                decoded.push(*replacement);
                rest = &rest[entity.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand literally.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
385
#[cfg(test)]
mod tests {
    use super::*;
    use mxr_test_support::{fixture_stem, standards_fixture_bytes, standards_fixture_names};
    use serde_json::json;

    /// Renders an address the way the header snapshots record it.
    fn address_json(addr: &Address) -> serde_json::Value {
        json!({"name": addr.name.clone(), "email": addr.email.clone()})
    }

    #[test]
    fn parses_address_list_with_comments_and_quotes() {
        let parsed = parse_address_list("\"Last, First\" <first@example.com>, second@example.com");
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].name.as_deref(), Some("Last, First"));
        assert_eq!(parsed[1].email, "second@example.com");
    }

    #[test]
    fn parses_unsubscribe_mailto_subject() {
        let header = (
            "List-Unsubscribe".to_string(),
            "<mailto:list@example.com?subject=unsubscribe>".to_string(),
        );
        let parsed = parse_headers_from_pairs(&[header], Some(Utc::now())).unwrap();

        let is_expected = matches!(
            &parsed.unsubscribe,
            UnsubscribeMethod::Mailto {
                address,
                subject: Some(subject)
            } if address == "list@example.com" && subject == "unsubscribe"
        );
        assert!(is_expected, "{:?}", parsed.unsubscribe);
    }

    #[test]
    fn decodes_format_flowed() {
        let flowed = "Hello there \r\nworld\r\n\r\nNext paragraph\r\n";
        let decoded = decode_format_flowed(flowed, false);
        assert_eq!(decoded, "Hello there  world\n\nNext paragraph");
    }

    #[test]
    fn extracts_body_unsubscribe_link() {
        let html = r#"<a href="https://example.com/unsubscribe?id=1">unsubscribe</a>"#;
        let found = body_unsubscribe_from_html(html);
        assert!(matches!(
            found,
            Some(UnsubscribeMethod::BodyLink { url }) if url.contains("unsubscribe")
        ));
    }

    #[test]
    fn standards_fixture_folded_flowed_headers_snapshot() {
        let raw = standards_fixture_bytes("folded-flowed.eml");
        let parsed = parse_message_metadata_from_raw(&raw).unwrap();
        let header_block = extract_raw_header_block(&raw).unwrap();
        let headers = parse_headers_from_raw(&header_block, Some(Utc::now())).unwrap();

        insta::assert_yaml_snapshot!(
            "folded_flowed_headers",
            json!({
                "from": headers.from.as_ref().map(address_json),
                "subject": headers.subject,
                "message_id": headers.message_id_header,
                "in_reply_to": headers.in_reply_to,
                "references": headers.references,
                "unsubscribe": format!("{:?}", headers.unsubscribe),
                "list_id": parsed.list_id,
                "auth_results": parsed.auth_results,
                "content_language": parsed.content_language,
                "text_plain_format": format!("{:?}", parsed.text_plain_format),
            })
        );
    }

    #[test]
    fn standards_fixture_minimal_message_metadata_snapshot() {
        let raw = standards_fixture_bytes("malformed-minimal.eml");
        let parsed = parse_message_metadata_from_raw(&raw).unwrap();
        insta::assert_yaml_snapshot!(
            "malformed_minimal_metadata",
            json!({
                "list_id": parsed.list_id,
                "auth_results": parsed.auth_results,
                "content_language": parsed.content_language,
                "text_plain_format": format!("{:?}", parsed.text_plain_format),
                "raw_headers_present": parsed.raw_headers.is_some(),
            })
        );
    }

    #[test]
    fn standards_fixture_header_matrix_snapshots() {
        // One snapshot per fixture, named after the fixture's stem.
        for fixture in standards_fixture_names() {
            let raw = standards_fixture_bytes(fixture);
            let headers = extract_raw_header_block(&raw).unwrap_or_default();
            let parsed = parse_headers_from_raw(&headers, Some(Utc::now())).unwrap();
            let metadata = parse_message_metadata_from_raw(&raw).unwrap();

            insta::assert_yaml_snapshot!(
                format!("fixture_headers__{}", fixture_stem(fixture)),
                json!({
                    "from": parsed.from.as_ref().map(address_json),
                    "to": parsed.to.iter().map(address_json).collect::<Vec<_>>(),
                    "cc": parsed.cc.iter().map(address_json).collect::<Vec<_>>(),
                    "bcc": parsed.bcc.iter().map(address_json).collect::<Vec<_>>(),
                    "subject": parsed.subject,
                    "message_id": parsed.message_id_header,
                    "in_reply_to": parsed.in_reply_to,
                    "references": parsed.references,
                    "unsubscribe": format!("{:?}", parsed.unsubscribe),
                    "list_id": metadata.list_id,
                    "auth_results": metadata.auth_results,
                    "content_language": metadata.content_language,
                    "text_plain_format": format!("{:?}", metadata.text_plain_format),
                    "raw_headers_present": metadata.raw_headers.is_some(),
                })
            );
        }
    }
}