Skip to main content

mxr_compose/
parse.rs

1use chrono::{DateTime, Utc};
2use mail_parser::{Message, MessageParser, MimeHeaders};
3use mxr_core::types::{
4    Address, CalendarMetadata, MessageMetadata, TextPlainFormat, UnsubscribeMethod,
5};
6use regex::Regex;
7use std::sync::OnceLock;
8use url::Url;
9
10#[derive(Debug, Clone)]
11pub struct ParsedHeaders {
12    pub from: Option<Address>,
13    pub to: Vec<Address>,
14    pub cc: Vec<Address>,
15    pub bcc: Vec<Address>,
16    pub subject: String,
17    pub date: DateTime<Utc>,
18    pub message_id_header: Option<String>,
19    pub in_reply_to: Option<String>,
20    pub references: Vec<String>,
21    pub unsubscribe: UnsubscribeMethod,
22    pub metadata: MessageMetadata,
23}
24
25#[derive(Debug, thiserror::Error)]
26pub enum ParseError {
27    #[error("failed to parse RFC 5322 headers")]
28    InvalidMessage,
29}
30
/// Serializes `(name, value)` pairs into a header block.
///
/// Each pair becomes one `name: value` line terminated by CRLF, in input
/// order. Names and values are written verbatim (no folding or escaping).
pub fn raw_headers_from_pairs(headers: &[(String, String)]) -> String {
    let mut block = String::new();
    for (name, value) in headers {
        block.push_str(name);
        block.push_str(": ");
        block.push_str(value);
        block.push_str("\r\n");
    }
    block
}
37
38pub fn parse_headers_from_pairs(
39    headers: &[(String, String)],
40    fallback_date: Option<DateTime<Utc>>,
41) -> Result<ParsedHeaders, ParseError> {
42    parse_headers_from_raw(&raw_headers_from_pairs(headers), fallback_date)
43}
44
45pub fn parse_headers_from_raw(
46    raw_headers: &str,
47    fallback_date: Option<DateTime<Utc>>,
48) -> Result<ParsedHeaders, ParseError> {
49    let mut raw_message = normalize_header_block(raw_headers);
50    raw_message.push_str("\r\n");
51    let parsed = MessageParser::default()
52        .parse(raw_message.as_bytes())
53        .ok_or(ParseError::InvalidMessage)?;
54    Ok(extract_parsed_headers(
55        &parsed,
56        Some(normalize_header_block(raw_headers)),
57        fallback_date,
58    ))
59}
60
61pub fn parse_address_list(raw: &str) -> Vec<Address> {
62    if raw.trim().is_empty() {
63        return Vec::new();
64    }
65
66    parse_headers_from_pairs(
67        &[("To".to_string(), raw.to_string())],
68        Some(Utc::now()),
69    )
70    .map(|parsed| parsed.to)
71    .unwrap_or_default()
72}
73
74pub fn parse_message_metadata_from_raw(raw_message: &[u8]) -> Result<MessageMetadata, ParseError> {
75    let parsed = MessageParser::default()
76        .parse(raw_message)
77        .ok_or(ParseError::InvalidMessage)?;
78    let raw_headers = extract_raw_header_block(raw_message);
79    Ok(extract_metadata(&parsed, raw_headers))
80}
81
82pub fn body_unsubscribe_from_html(html: &str) -> Option<UnsubscribeMethod> {
83    static HREF_RE: OnceLock<Regex> = OnceLock::new();
84    let re = HREF_RE.get_or_init(|| {
85        Regex::new(r#"(?is)href\s*=\s*["']([^"']*(unsubscribe|opt-out|preferences)[^"']*)["']"#)
86            .unwrap()
87    });
88    re.captures(html).and_then(|caps| {
89        caps.get(1).map(|url| UnsubscribeMethod::BodyLink {
90            url: html_unescape(url.as_str()),
91        })
92    })
93}
94
95pub fn decode_format_flowed(text: &str, delsp: bool) -> String {
96    let mut out = String::new();
97    let mut current = String::new();
98
99    for line in text.lines() {
100        if line == "-- " {
101            flush_paragraph(&mut out, &mut current);
102            out.push_str("-- \n");
103            continue;
104        }
105
106        if line.is_empty() {
107            flush_paragraph(&mut out, &mut current);
108            out.push('\n');
109            continue;
110        }
111
112        let flowed = line.ends_with(' ');
113        let segment = if flowed && delsp {
114            line.trim_end_matches(' ')
115        } else {
116            line
117        };
118
119        current.push_str(segment);
120        if flowed {
121            if !delsp {
122                current.push(' ');
123            }
124        } else {
125            flush_paragraph(&mut out, &mut current);
126        }
127    }
128
129    flush_paragraph(&mut out, &mut current);
130    out.trim_end().to_string()
131}
132
133pub fn calendar_metadata_from_text(calendar_text: &str) -> Option<CalendarMetadata> {
134    let mut method = None;
135    let mut summary = None;
136
137    for line in calendar_text.lines() {
138        let line = line.trim();
139        if method.is_none() {
140            method = line.strip_prefix("METHOD:").map(|value| value.trim().to_string());
141        }
142        if summary.is_none() {
143            summary = line.strip_prefix("SUMMARY:").map(|value| value.trim().to_string());
144        }
145        if method.is_some() && summary.is_some() {
146            break;
147        }
148    }
149
150    if method.is_some() || summary.is_some() {
151        Some(CalendarMetadata { method, summary })
152    } else {
153        None
154    }
155}
156
157pub fn extract_parsed_headers(
158    message: &Message<'_>,
159    raw_headers: Option<String>,
160    fallback_date: Option<DateTime<Utc>>,
161) -> ParsedHeaders {
162    ParsedHeaders {
163        from: message.from().and_then(extract_first_addr),
164        to: message.to().map(extract_addrs).unwrap_or_default(),
165        cc: message.cc().map(extract_addrs).unwrap_or_default(),
166        bcc: message.bcc().map(extract_addrs).unwrap_or_default(),
167        subject: message
168            .subject()
169            .map(|subject| subject.to_string())
170            .unwrap_or_default(),
171        date: message
172            .date()
173            .and_then(|date| DateTime::from_timestamp(date.to_timestamp(), 0))
174            .or(fallback_date)
175            .unwrap_or_else(Utc::now),
176        message_id_header: message.message_id().map(|id| format!("<{id}>")),
177        in_reply_to: message
178            .in_reply_to()
179            .as_text_list()
180            .and_then(|ids| ids.first().map(|id| format!("<{id}>"))),
181        references: message
182            .references()
183            .as_text_list()
184            .map(|ids| ids.iter().map(|id| format!("<{id}>")).collect())
185            .unwrap_or_default(),
186        unsubscribe: parse_list_unsubscribe(message),
187        metadata: extract_metadata(message, raw_headers),
188    }
189}
190
191fn extract_metadata(message: &Message<'_>, raw_headers: Option<String>) -> MessageMetadata {
192    let content_language = message
193        .header_values("Content-Language")
194        .flat_map(|value| {
195            value
196                .as_text()
197                .unwrap_or_default()
198                .split(',')
199                .map(|lang| lang.trim().to_string())
200                .collect::<Vec<_>>()
201        })
202        .filter(|lang| !lang.is_empty())
203        .collect();
204
205    let auth_results = message
206        .header_values("Authentication-Results")
207        .filter_map(|value| value.as_text().map(|value| value.to_string()))
208        .collect();
209
210    let list_id = message.list_id().as_text().map(|value| value.to_string());
211    let text_plain_format = message.content_type().and_then(parse_text_plain_format);
212
213    MessageMetadata {
214        list_id,
215        auth_results,
216        content_language,
217        text_plain_format,
218        calendar: None,
219        raw_headers,
220    }
221}
222
223fn parse_text_plain_format(content_type: &mail_parser::ContentType<'_>) -> Option<TextPlainFormat> {
224    if !content_type
225        .ctype()
226        .eq_ignore_ascii_case("text")
227        || !content_type
228            .subtype()
229            .unwrap_or_default()
230            .eq_ignore_ascii_case("plain")
231    {
232        return None;
233    }
234
235    let format = content_type.attribute("format");
236    let delsp = content_type
237        .attribute("delsp")
238        .map(|value| value.eq_ignore_ascii_case("yes"))
239        .unwrap_or(false);
240
241    match format {
242        Some(value) if value.eq_ignore_ascii_case("flowed") => {
243            Some(TextPlainFormat::Flowed { delsp })
244        }
245        _ => Some(TextPlainFormat::Fixed),
246    }
247}
248
249fn parse_list_unsubscribe(message: &Message<'_>) -> UnsubscribeMethod {
250    let entries: Vec<String> = match message.list_unsubscribe().as_address() {
251        Some(mail_parser::Address::List(list)) => list
252            .iter()
253            .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
254            .collect(),
255        Some(mail_parser::Address::Group(groups)) => groups
256            .iter()
257            .flat_map(|group| group.addresses.iter())
258            .filter_map(|addr| addr.address.as_ref().map(|value| value.to_string()))
259            .collect(),
260        None => Vec::new(),
261    };
262    if entries.is_empty() {
263        return UnsubscribeMethod::None;
264    }
265
266    let one_click = message
267        .header_raw("List-Unsubscribe-Post")
268        .map(|value| value.to_ascii_lowercase())
269        .map(|value| value.contains("list-unsubscribe=one-click"))
270        .unwrap_or(false);
271
272    if one_click {
273        if let Some(url) = entries
274            .iter()
275            .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
276        {
277            return UnsubscribeMethod::OneClick {
278                url: url.to_string(),
279            };
280        }
281    }
282
283    for entry in &entries {
284        if let Some(mailto) = entry.strip_prefix("mailto:") {
285            return parse_mailto_unsubscribe(mailto);
286        }
287    }
288
289    if let Some(url) = entries
290        .iter()
291        .find(|entry| entry.starts_with("https://") || entry.starts_with("http://"))
292    {
293        return UnsubscribeMethod::HttpLink {
294            url: url.to_string(),
295        };
296    }
297
298    UnsubscribeMethod::None
299}
300
301fn parse_mailto_unsubscribe(mailto: &str) -> UnsubscribeMethod {
302    let mut subject = None;
303    let address = if let Some((address, query)) = mailto.split_once('?') {
304        for (key, value) in url::form_urlencoded::parse(query.as_bytes()) {
305            if key.eq_ignore_ascii_case("subject") {
306                subject = Some(value.to_string());
307            }
308        }
309        address.to_string()
310    } else if let Ok(url) = Url::parse(&format!("mailto:{mailto}")) {
311        for (key, value) in url.query_pairs() {
312            if key.eq_ignore_ascii_case("subject") {
313                subject = Some(value.to_string());
314            }
315        }
316        url.path().to_string()
317    } else {
318        mailto.to_string()
319    };
320
321    UnsubscribeMethod::Mailto { address, subject }
322}
323
324fn extract_first_addr(addr: &mail_parser::Address<'_>) -> Option<Address> {
325    match addr {
326        mail_parser::Address::List(list) => list.first().map(to_address),
327        mail_parser::Address::Group(groups) => groups
328            .first()
329            .and_then(|group| group.addresses.first())
330            .map(to_address),
331    }
332}
333
334fn extract_addrs(addr: &mail_parser::Address<'_>) -> Vec<Address> {
335    match addr {
336        mail_parser::Address::List(list) => list.iter().map(to_address).collect(),
337        mail_parser::Address::Group(groups) => groups
338            .iter()
339            .flat_map(|group| group.addresses.iter())
340            .map(to_address)
341            .collect(),
342    }
343}
344
345fn to_address(addr: &mail_parser::Addr<'_>) -> Address {
346    Address {
347        name: addr.name().map(|name| name.to_string()),
348        email: addr.address().unwrap_or_default().to_string(),
349    }
350}
351
/// Rewrites a header block so that its lines are separated by CRLF.
///
/// Trailing carriage returns are removed from every line, and the result has
/// no line break after the last line.
fn normalize_header_block(raw_headers: &str) -> String {
    let mut normalized = String::with_capacity(raw_headers.len());
    for (index, line) in raw_headers.lines().enumerate() {
        if index > 0 {
            normalized.push_str("\r\n");
        }
        normalized.push_str(line.trim_end_matches('\r'));
    }
    normalized
}
359
360pub fn extract_raw_header_block(raw_message: &[u8]) -> Option<String> {
361    let raw = String::from_utf8_lossy(raw_message);
362    let header_block = raw
363        .split("\r\n\r\n")
364        .next()
365        .or_else(|| raw.split("\n\n").next())?;
366    Some(normalize_header_block(header_block))
367}
368
/// Moves the pending paragraph in `current` to the end of `out`, followed by
/// a newline, and leaves `current` empty. Does nothing when `current` is
/// already empty.
fn flush_paragraph(out: &mut String, current: &mut String) {
    if !current.is_empty() {
        out.extend(current.drain(..));
        out.push('\n');
    }
}
377
/// Decodes the five HTML entities `&lt;`, `&gt;`, `&quot;`, `&#39;` and
/// `&amp;` in `value`. Other entities are left untouched.
fn html_unescape(value: &str) -> String {
    // `&amp;` must be decoded last: decoding it first would turn an escaped
    // entity such as `&amp;lt;` into `&lt;` and then wrongly into `<`.
    value
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}
386
#[cfg(test)]
mod tests {
    use super::*;
    use mxr_test_support::{fixture_stem, standards_fixture_bytes, standards_fixture_names};
    use serde_json::json;

    /// Renders one address as the `{name, email}` object used in snapshots.
    fn address_json(addr: &Address) -> serde_json::Value {
        json!({"name": addr.name.clone(), "email": addr.email.clone()})
    }

    /// Renders a list of addresses as an array of `{name, email}` objects.
    fn address_list_json(addrs: &[Address]) -> Vec<serde_json::Value> {
        addrs.iter().map(address_json).collect()
    }

    #[test]
    fn parses_address_list_with_comments_and_quotes() {
        let raw = "\"Last, First\" <first@example.com>, second@example.com";
        let addresses = parse_address_list(raw);
        assert_eq!(addresses.len(), 2);
        assert_eq!(addresses[0].name.as_deref(), Some("Last, First"));
        assert_eq!(addresses[1].email, "second@example.com");
    }

    #[test]
    fn parses_unsubscribe_mailto_subject() {
        let headers = [(
            "List-Unsubscribe".to_string(),
            "<mailto:list@example.com?subject=unsubscribe>".to_string(),
        )];
        let parsed = parse_headers_from_pairs(&headers, Some(Utc::now())).unwrap();
        assert!(
            matches!(
                &parsed.unsubscribe,
                UnsubscribeMethod::Mailto {
                    address,
                    subject: Some(subject)
                } if address == "list@example.com" && subject == "unsubscribe"
            ),
            "{:?}",
            parsed.unsubscribe
        );
    }

    #[test]
    fn decodes_format_flowed() {
        let text = "Hello there \r\nworld\r\n\r\nNext paragraph\r\n";
        let decoded = decode_format_flowed(text, false);
        assert_eq!(decoded, "Hello there  world\n\nNext paragraph");
    }

    #[test]
    fn extracts_body_unsubscribe_link() {
        let html = r#"<a href="https://example.com/unsubscribe?id=1">unsubscribe</a>"#;
        let found = body_unsubscribe_from_html(html);
        assert!(matches!(
            found,
            Some(UnsubscribeMethod::BodyLink { url }) if url.contains("unsubscribe")
        ));
    }

    #[test]
    fn standards_fixture_folded_flowed_headers_snapshot() {
        let raw = standards_fixture_bytes("folded-flowed.eml");
        let parsed = parse_message_metadata_from_raw(&raw).unwrap();
        let header_block = extract_raw_header_block(&raw).unwrap();
        let headers = parse_headers_from_raw(&header_block, Some(Utc::now())).unwrap();

        insta::assert_yaml_snapshot!(
            "folded_flowed_headers",
            json!({
                "from": headers.from.as_ref().map(address_json),
                "subject": headers.subject,
                "message_id": headers.message_id_header,
                "in_reply_to": headers.in_reply_to,
                "references": headers.references,
                "unsubscribe": format!("{:?}", headers.unsubscribe),
                "list_id": parsed.list_id,
                "auth_results": parsed.auth_results,
                "content_language": parsed.content_language,
                "text_plain_format": format!("{:?}", parsed.text_plain_format),
            })
        );
    }

    #[test]
    fn standards_fixture_minimal_message_metadata_snapshot() {
        let raw = standards_fixture_bytes("malformed-minimal.eml");
        let parsed = parse_message_metadata_from_raw(&raw).unwrap();
        insta::assert_yaml_snapshot!(
            "malformed_minimal_metadata",
            json!({
                "list_id": parsed.list_id,
                "auth_results": parsed.auth_results,
                "content_language": parsed.content_language,
                "text_plain_format": format!("{:?}", parsed.text_plain_format),
                "raw_headers_present": parsed.raw_headers.is_some(),
            })
        );
    }

    #[test]
    fn standards_fixture_header_matrix_snapshots() {
        for fixture in standards_fixture_names() {
            let raw = standards_fixture_bytes(fixture);
            let headers = extract_raw_header_block(&raw).unwrap_or_default();
            let parsed = parse_headers_from_raw(&headers, Some(Utc::now())).unwrap();
            let metadata = parse_message_metadata_from_raw(&raw).unwrap();

            insta::assert_yaml_snapshot!(
                format!("fixture_headers__{}", fixture_stem(fixture)),
                json!({
                    "from": parsed.from.as_ref().map(address_json),
                    "to": address_list_json(&parsed.to),
                    "cc": address_list_json(&parsed.cc),
                    "bcc": address_list_json(&parsed.bcc),
                    "subject": parsed.subject,
                    "message_id": parsed.message_id_header,
                    "in_reply_to": parsed.in_reply_to,
                    "references": parsed.references,
                    "unsubscribe": format!("{:?}", parsed.unsubscribe),
                    "list_id": metadata.list_id,
                    "auth_results": metadata.auth_results,
                    "content_language": metadata.content_language,
                    "text_plain_format": format!("{:?}", metadata.text_plain_format),
                    "raw_headers_present": metadata.raw_headers.is_some(),
                })
            );
        }
    }
}