1use std::{fs, iter::Peekable, path::Path, sync::LazyLock};
2
3use regex::Regex;
4
5use crate::eml::*;
6use crate::errors::EmlError;
7
/// States of the line-break state machine that `EmlParser::read_field_body`
/// runs while scanning a header field body.
///
/// Each variant name spells out the line-break characters consumed since the
/// last piece of field content.
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
#[derive(Debug)]
enum LwspState {
    // No line break is pending; characters belong to the field body.
    ReadingContent,
    // `LF`, `CR`, `CRLF`: a line break has been consumed and the next character
    // decides whether the field continues (whitespace) or ends.
    // `CRLFCR`: a CRLF followed by a lone CR; only an LF may follow.
    // `EndOfHeader_*`: a blank line was consumed, ending the header section.
    LF, CR, CRLF, CRLFCR, EndOfHeader_LFLF,
    EndOfHeader_CRCR,
    EndOfHeader_CRLFCRLF,
}
20
/// Classification of a single input character, as produced by
/// `EmlParser::next_char_type` and consumed by the field-body state machine.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug)]
enum InputType {
    // Carriage return (`'\r'`).
    CR,
    // Line feed (`'\n'`).
    LF,
    // Whitespace other than CR/LF (space, tab, other ASCII whitespace).
    WSP,
    // Any other character.
    NonWsp,
}
29
/// Selects how much of the message body `EmlParser::parse_body` returns.
#[derive(Debug)]
enum BodyHandling {
    // Do not return a body at all.
    None,
    // Return at most this many bytes of the body (cut back to a char boundary).
    Preview(usize),
    // Return everything after the header section.
    All,
}
36
/// Parser for the text of an email message: header fields followed by a body.
///
/// Build one with `from_file` or `from_string`, optionally adjust body handling
/// with `ignore_body` / `with_body` / `with_body_preview`, then call `parse`.
#[derive(Debug)]
pub struct EmlParser {
    // Complete text of the message being parsed.
    content: String,
    // Byte offset into `content` of the parse cursor.
    position: usize,

    // How much of the body `parse` should return.
    body_handling: BodyHandling,
}
44
45static NAME_ADDR_RE: LazyLock<Regex> =
47 LazyLock::new(|| Regex::new(r#"^"(.?+)" <\s*([^>]+)\s*>[ ,]*"#).unwrap());
48
/// Matches one leading `<address>` entry (angle brackets, no display name)
/// plus any trailing spaces/commas. Group 1 is the address.
static ADDR_RE1: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"^\s*<\s*([^>]+)\s*>[ ,]*"#).unwrap());
52
/// Matches one leading bare `local@domain` entry (no angle brackets, no quotes)
/// plus any trailing spaces/commas. Group 1 is the address.
static ADDR_RE2: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"^\s*([^"<>@]+@[^"<>@\s,]+)[ ,]*"#).unwrap());
56
57impl EmlParser {
58 pub fn from_file(filename: impl AsRef<Path>) -> Result<Self, EmlError> {
64 let content = fs::read_to_string(filename)?;
65
66 Ok(EmlParser {
67 content,
68 position: 0,
69 body_handling: BodyHandling::All,
70 })
71 }
72
73 pub fn from_string(content: String) -> Self {
74 EmlParser {
75 content,
76 position: 0,
77 body_handling: BodyHandling::All,
78 }
79 }
80
81 pub fn ignore_body(mut self) -> Self {
83 self.body_handling = BodyHandling::None;
84 self
85 }
86
87 pub fn with_body(mut self) -> Self {
88 self.body_handling = BodyHandling::All;
89 self
90 }
91
92 pub fn with_body_preview(mut self, bytes: usize) -> Self {
93 self.body_handling = BodyHandling::Preview(bytes);
94 self
95 }
96
97 pub fn parse(&mut self) -> Result<Eml, EmlError> {
98 if self.content.is_empty() {
99 return Err(EmlError::UnexpectedEndOfStream(String::from("Empty input")));
100 }
101
102 let content = self.content.to_string(); let chars = content.chars();
104 let mut char_input = chars.peekable();
105 let eml = self.parse_email(&mut char_input)?;
106
107 Ok(eml)
108 }
109
110 fn parse_email<T: Iterator<Item = char>>(
111 &mut self,
112 char_input: &mut Peekable<T>,
113 ) -> Result<Eml, EmlError> {
114 let headers = self.parse_header_fields(char_input)?;
115
116 let mut result = Eml {
117 body: self.parse_body(),
118 ..Default::default()
119 };
120
121 for header in headers {
122 match (&header.name[..], &header.value) {
123 ("To", _) => result.to = Some(header.value),
124 ("From", _) => result.from = Some(header.value),
125 ("Subject", HeaderFieldValue::Unstructured(subj)) => {
126 result.subject = Some((*subj).to_string())
127 }
128 _ => result.headers.push(header),
129 }
130 }
131
132 Ok(result)
133 }
134
135 fn parse_header_fields<T: Iterator<Item = char>>(
136 &mut self,
137 char_input: &mut Peekable<T>,
138 ) -> Result<Vec<HeaderField>, EmlError> {
139 use HeaderFieldValue::*;
140 let mut headers = Vec::new();
141
142 while let Some((name, value, eoh)) = self.read_raw_header_field(char_input)? {
143 let value = match (&name[..], value) {
145 ("From", v)
146 | ("To", v)
147 | ("Reply-To", v)
148 | ("Delivered-To", v)
149 | ("X-Original-To", v)
150 | ("Return-Path", v) => EmlParser::parse_email_address(v),
151 (_, v) if v.is_empty() => Empty,
152 (_, v) => match rfc2047_decoder::decode(&v) {
153 Ok(decoded) => Unstructured(decoded),
154 Err(_) => Unstructured(v),
155 },
156 };
157 headers.push(HeaderField { name, value });
158
159 if eoh {
160 break;
161 }
162 }
163 Ok(headers)
164 }
165
166 fn parse_email_address(value: String) -> HeaderFieldValue {
167 let mut remaining = value.replace(['\n', '\r'], "");
169
170 let mut found_addresses = Vec::new();
171
172 while !remaining.is_empty() {
173 if let Some(cap) = NAME_ADDR_RE.captures(&remaining) {
174 let name = cap.get(1).unwrap().as_str().to_string();
175 let address = cap.get(2).unwrap().as_str().to_string();
176 found_addresses.push(EmailAddress::NameAndEmailAddress { name, address });
177
178 let entire_match = cap.get(0).unwrap();
179 remaining = remaining[entire_match.end()..].to_string();
180 } else if let Some(cap) = ADDR_RE1.captures(&remaining) {
181 let address = cap.get(1).unwrap().as_str().to_string();
182 found_addresses.push(EmailAddress::AddressOnly { address });
183
184 let entire_match = cap.get(0).unwrap();
185 remaining = remaining[entire_match.end()..].to_string();
186 } else if let Some(cap) = ADDR_RE2.captures(&remaining) {
187 let address = cap.get(1).unwrap().as_str().to_string();
188 found_addresses.push(EmailAddress::AddressOnly { address });
189
190 let entire_match = cap.get(0).unwrap();
191 remaining = remaining[entire_match.end()..].to_string();
192 } else {
193 return HeaderFieldValue::Unstructured(value);
195 }
196 }
197
198 if found_addresses.len() == 1 {
200 HeaderFieldValue::SingleEmailAddress(found_addresses.into_iter().next().unwrap())
201 } else {
202 HeaderFieldValue::MultipleEmailAddresses(found_addresses)
203 }
204 }
205
206 fn read_raw_header_field<T: Iterator<Item = char>>(
207 &mut self,
208 char_input: &mut Peekable<T>,
209 ) -> Result<Option<(String, String, bool)>, EmlError> {
210 match char_input.peek() {
211 Some('\n') | Some('\r') => return Ok(None), Some(_) => {}
213 None => {
214 return Err(EmlError::UnexpectedEndOfStream(String::from(
215 "Expected the beginning of a header field name",
216 )))
217 }
218 };
219
220 if let Some(name) = self.read_field_name(char_input)? {
221 match char_input.peek() {
222 Some(':') => {
223 self.position += 1;
224 char_input.next();
225 }
226 Some(c) => {
227 return Err(EmlError::UnexpectedContent(format!(
228 "Expected ':' to terminate header field '{}'; got '{}' (byte value {})",
229 name, c, *c as u8
230 )))
231 }
232 None => {
233 return Err(EmlError::UnexpectedEndOfStream(format!(
234 "Expected ':' to terminate header field '{}'",
235 name
236 )))
237 }
238 };
239
240 match char_input.peek() {
241 Some(' ') => {
242 self.position += 1;
243 char_input.next();
244 }
245 Some(_) => {}
246 None => {
247 return Err(EmlError::UnexpectedEndOfStream(format!(
248 "Expected non-empty content for header field '{}'",
249 name
250 )))
251 }
252 };
253
254 let (value, eoh) = self.read_field_body(char_input)?;
255
256 Ok(Some((name, value, eoh)))
257 } else {
258 Ok(None)
259 }
260 }
261
262 fn read_field_name<T: Iterator<Item = char>>(
264 &mut self,
265 char_input: &mut Peekable<T>,
266 ) -> Result<Option<String>, EmlError> {
267 let start_position = self.position;
268 let mut end_position = self.position;
269
270 while let Some(c) = char_input.peek() {
271 if c == &'\n' || c == &'\r' {
272 return Ok(None);
275 } else if c != &' ' && c != &':' && !c.is_control() {
276 end_position += c.len_utf8();
277 char_input.next();
278 } else {
279 break;
280 }
281 }
282
283 if end_position == self.content.len() {
284 Err(EmlError::UnexpectedEndOfStream(String::from(
285 "Expected content for header field",
286 )))
287 } else {
288 self.position = end_position;
289 Ok(Some(String::from(
290 &self.content[start_position..end_position],
291 )))
292 }
293 }
294
295 fn read_field_body<T: Iterator<Item = char>>(
299 &mut self,
300 char_input: &mut Peekable<T>,
301 ) -> Result<(String, bool), EmlError> {
302 let start_position = self.position;
303 let mut end_position = self.position;
304 let mut state = LwspState::ReadingContent;
305
306 while let Some(next_char) = char_input.peek() {
307 let ws = EmlParser::next_char_type(*next_char);
308 let len = next_char.len_utf8();
309
310 match (&state, ws) {
311 (LwspState::ReadingContent, InputType::WSP)
312 | (LwspState::ReadingContent, InputType::NonWsp) => {
313 char_input.next();
315 end_position += len;
316 }
317
318 (LwspState::ReadingContent, InputType::CR) => {
319 state = LwspState::CR;
320 char_input.next();
321 end_position += len;
322 }
323
324 (LwspState::ReadingContent, InputType::LF) => {
325 state = LwspState::LF;
326 char_input.next();
327 end_position += len;
328 }
329
330 (LwspState::LF, InputType::WSP)
331 | (LwspState::CR, InputType::WSP)
332 | (LwspState::CRLF, InputType::WSP) => {
333 state = LwspState::ReadingContent;
339 char_input.next();
340 end_position += len;
341 }
342
343 (LwspState::LF, InputType::NonWsp)
344 | (LwspState::CR, InputType::NonWsp)
345 | (LwspState::CRLF, InputType::NonWsp) => {
346 break;
349 }
350
351 (LwspState::LF, InputType::LF) => {
352 state = LwspState::EndOfHeader_LFLF;
354 char_input.next();
355 end_position += len;
356 break;
357 }
358 (LwspState::CR, InputType::CR) => {
359 state = LwspState::EndOfHeader_CRCR;
361 char_input.next();
362 end_position += len;
363 break;
364 }
365 (LwspState::CRLFCR, InputType::LF) => {
366 state = LwspState::EndOfHeader_CRLFCRLF;
368 char_input.next();
369 end_position += len;
370 break;
371 }
372
373 (LwspState::CR, InputType::LF) => {
374 state = LwspState::CRLF;
376 char_input.next();
377 end_position += len;
378 }
379
380 (LwspState::CRLF, InputType::CR) => {
381 state = LwspState::CRLFCR;
383 char_input.next();
384 end_position += len;
385 }
386
387 (LwspState::CRLFCR, _) => {
396 return Err(EmlError::UnexpectedContent(String::from(
398 "Found CRLF+CR in header without expected LF",
399 )));
400 }
401
402 (LwspState::CRLF, InputType::LF) => {
403 return Err(EmlError::UnexpectedContent(String::from(
405 "Found CRLF+LF in header without expected CR first",
406 )));
407 }
408
409 (LwspState::LF, InputType::CR) => {
410 return Err(EmlError::UnexpectedContent(String::from(
412 "Found LF+CR in header as line delimeter",
413 )));
414 }
415
416 (LwspState::EndOfHeader_LFLF, _)
418 | (LwspState::EndOfHeader_CRCR, _)
419 | (LwspState::EndOfHeader_CRLFCRLF, _) => unreachable!(),
420 }
421 }
422
423 self.position = end_position;
424
425 let value_end = end_position
428 - match state {
429 LwspState::LF => 1,
430 LwspState::CR => 1,
431 LwspState::CRLF => 2,
432 LwspState::EndOfHeader_LFLF => 2,
433 LwspState::EndOfHeader_CRCR => 2,
434 LwspState::EndOfHeader_CRLFCRLF => 4,
435 LwspState::ReadingContent | LwspState::CRLFCR => unreachable!(),
436 };
437
438 let end_of_header = matches!(
439 state,
440 LwspState::EndOfHeader_LFLF
441 | LwspState::EndOfHeader_CRCR
442 | LwspState::EndOfHeader_CRLFCRLF
443 );
444
445 Ok((
446 String::from(&self.content[start_position..value_end]),
447 end_of_header,
448 ))
449 }
450
451 fn next_char_type(c: char) -> InputType {
452 match c {
453 '\n' => InputType::LF,
454 '\r' => InputType::CR,
455 ' ' | '\t' => InputType::WSP,
456 c if c.is_ascii_whitespace() => InputType::WSP,
461 _ => InputType::NonWsp,
462 }
463 }
464
465 fn parse_body(&mut self) -> Option<String> {
466 match self.body_handling {
467 BodyHandling::None => None,
468 BodyHandling::Preview(bytes) => {
469 let bytes_remaining = self.content.len() - self.position;
470 let bytes = bytes.min(bytes_remaining);
471
472 Some(String::from(
473 &self.content
474 [self.position..self.content.floor_char_boundary(self.position + bytes)],
475 ))
476 }
477 BodyHandling::All => Some(String::from(&self.content[self.position..])),
478 }
479 }
480}
481
482#[cfg(test)]
483mod tests {
484 use super::*;
485
    // Sample message shared by several tests: five header fields (two of them
    // folded onto a continuation line), a blank line, then a one-line body.
    const TEST_HEADER: &str = r#"Delivered-To: john.public@example.com
Received: by 2002:ac9:700e:0:0:0:0:0 with SMTP id w14csp4493771ocr;
 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)
X-Google-Smtp-Source: APiQypIbRnWumT0t4TOJHlvDOVkxfqZ8A8HBzdR39kgdjVQQfKUsY/DkKFeZI53Ux1Z3reMRqaCl
X-Received: by 2002:a37:aa8e:: with SMTP id t136mr9744838qke.175.1586811847065;
 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)
foo: bar

This is the start of the body
"#;
496
    // Parses TEST_HEADER with the full body and checks every header field
    // (name and value, in order) as well as the body text.
    #[test]
    fn basic_test() {
        let eml = EmlParser::from_string(TEST_HEADER.to_string())
            .with_body()
            .parse();

        assert!(eml.is_ok());
        let eml = eml.unwrap();

        assert_eq!(5, eml.headers.len());

        // Address-bearing field: parsed into a single bare address.
        let delivered_to: &HeaderField = &eml.headers[0];
        assert_eq!("Delivered-To", delivered_to.name);
        assert_eq!(
            HeaderFieldValue::SingleEmailAddress(EmailAddress::AddressOnly {
                address: ("john.public@example.com".to_string())
            }),
            delivered_to.value
        );

        // Folded field: the continuation line stays part of the value.
        let received: &HeaderField = &eml.headers[1];
        assert_eq!("Received", received.name);
        assert_eq!(
            HeaderFieldValue::Unstructured(
                r#"by 2002:ac9:700e:0:0:0:0:0 with SMTP id w14csp4493771ocr;
 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)"#
                    .to_string()
            ),
            received.value
        );

        assert_eq!("X-Google-Smtp-Source".to_string(), eml.headers[2].name);
        assert_eq!(
            HeaderFieldValue::Unstructured(
                "APiQypIbRnWumT0t4TOJHlvDOVkxfqZ8A8HBzdR39kgdjVQQfKUsY/DkKFeZI53Ux1Z3reMRqaCl"
                    .to_string()
            ),
            eml.headers[2].value
        );

        // Second folded field.
        assert_eq!("X-Received".to_string(), eml.headers[3].name);
        assert_eq!(
            HeaderFieldValue::Unstructured(
                r#"by 2002:a37:aa8e:: with SMTP id t136mr9744838qke.175.1586811847065;
 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)"#
                    .to_string()
            ),
            eml.headers[3].value
        );

        assert_eq!("foo".to_string(), eml.headers[4].name);
        assert_eq!(
            HeaderFieldValue::Unstructured("bar".to_string()),
            eml.headers[4].value
        );

        // Everything after the blank line is the body.
        assert!(eml.body.is_some());
        let body = eml.body.unwrap();
        assert_eq!("This is the start of the body\n", body);
    }
557
558 #[test]
559 fn basic_test_with_truncated_body() {
560 let eml: Eml = EmlParser::from_string(TEST_HEADER.to_string())
561 .with_body_preview(15)
562 .parse()
563 .unwrap(); let body = eml.body.unwrap();
566 let expected = &"This is the start of the body\n"[0..15];
567 assert_eq!(expected, body);
568 }
569
570 #[test]
571 fn basic_test_with_truncation_gt_body_length() {
572 let eml: Eml = EmlParser::from_string(TEST_HEADER.to_string())
573 .with_body_preview(150)
574 .parse()
575 .unwrap(); assert_eq!(5, eml.headers.len());
578
579 let body = eml.body.unwrap();
580 assert_eq!("This is the start of the body\n", body);
581 }
582
583 #[test]
584 fn body_truncated_in_multibyte_char() {
585 let result = EmlParser::from_string("Foo: ok\n\nBá".to_string())
586 .with_body_preview(2)
587 .parse()
588 .unwrap();
589
590 let body = result.body.unwrap();
591 assert_eq!("B", body);
592 }
593
594 #[test]
595 fn parse_emails() {
596 let parsed =
597 EmlParser::parse_email_address(r#""John Smith" <jsmith@example.com>"#.to_string());
598
599 let jsmith = EmailAddress::NameAndEmailAddress {
600 name: "John Smith".to_string(),
601 address: "jsmith@example.com".to_string(),
602 };
603 let expected = HeaderFieldValue::SingleEmailAddress(jsmith);
604
605 assert_eq!(parsed, expected);
606 }
607
608 #[test]
609 fn parse_and_display_emails() {
610 let single = r#""John Q. Public" < john@example.com>, "#.to_string();
611 let parsed = EmlParser::parse_email_address(single);
612
613 match &parsed {
614 HeaderFieldValue::SingleEmailAddress(EmailAddress::NameAndEmailAddress {
615 name,
616 address,
617 }) => {
618 assert_eq!(name, "John Q. Public");
619 assert_eq!(address, "john@example.com");
620 }
621 _ => panic!("Expected SingleEmailAddress, got something else"),
622 };
623
624 assert_eq!(parsed.to_string(), r#""John Q. Public" <john@example.com>"#);
625 }
626
627 #[test]
628 fn test_errors() {
629 let filename = "nonexistent.eml";
630 let parsed = EmlParser::from_file(filename);
631 assert!(parsed.is_err());
632
633 let errval = parsed.unwrap_err();
634 assert!(matches!(errval, EmlError::IoError(_inner)));
635 }
636
637 #[test]
638 fn last_header_empty() {
639 let eml: Eml = EmlParser::from_string("Foo: ok\nBar: \n\nHello".to_string())
640 .with_body()
641 .parse()
642 .unwrap();
643
644 assert_eq!(2, eml.headers.len());
645
646 let foo = &eml.headers[0];
647 let HeaderField { name, value } = foo;
648 assert_eq!("Foo", name);
649 assert_eq!(&HeaderFieldValue::Unstructured("ok".to_string()), value);
650
651 let bar = &eml.headers[1];
652 let HeaderField { name, value } = bar;
653 assert_eq!("Bar", name);
654 assert_eq!(&HeaderFieldValue::Empty, value);
655
656 assert_eq!(Some("Hello".to_string()), eml.body);
657 }
658
659 #[test]
660 fn last_header_get_full_value() {
661 let eml: Eml = EmlParser::from_string("Foo: ok\nBar: super\n\nHello".to_string())
662 .with_body()
663 .parse()
664 .unwrap();
665
666 assert_eq!(2, eml.headers.len());
667
668 let foo = &eml.headers[0];
669 let HeaderField { name, value } = foo;
670 assert_eq!("Foo", name);
671 assert_eq!(&HeaderFieldValue::Unstructured("ok".to_string()), value);
672
673 let bar = &eml.headers[1];
674 let HeaderField { name, value } = bar;
675 assert_eq!("Bar", name);
676 assert_eq!(&HeaderFieldValue::Unstructured("super".to_string()), value);
677
678 assert_eq!(Some("Hello".to_string()), eml.body);
679 }
680
681 #[test]
683 fn nonascii() {
684 let result = EmlParser::from_string("Foo: tést\nBar: bar\n\nHello".to_string())
686 .ignore_body()
687 .parse()
688 .expect("Should parse");
689
690 let headers = result.headers;
691 assert_eq!(2, headers.len());
692
693 let HeaderField { name, value } = &headers[0];
694 assert_eq!("Foo", name);
695 assert_eq!("tést", value.to_string());
696
697 let HeaderField { name, value } = &headers[1];
698 assert_eq!("Bar", name);
699 assert_eq!("bar", value.to_string());
700
701 let result = EmlParser::from_string("Foo: testé\nBar: bar\n\nHello".to_string())
703 .ignore_body()
704 .parse()
705 .expect("Should parse");
706
707 let headers = result.headers;
708 assert_eq!(2, headers.len());
709
710 let HeaderField { name, value } = &headers[0];
711 assert_eq!("Foo", name);
712 assert_eq!("testé", value.to_string());
713
714 let HeaderField { name, value } = &headers[1];
715 assert_eq!("Bar", name);
716 assert_eq!("bar", value.to_string());
717
718 let result = EmlParser::from_string("ō: test\n\n".to_string())
720 .ignore_body()
721 .parse()
722 .unwrap();
723
724 let headers = result.headers;
725 assert_eq!(1, headers.len());
726
727 let HeaderField { name, value } = &headers[0];
728 assert_eq!("ō", name);
729 assert_eq!("test", value.to_string());
730 }
731
732 #[test]
733 fn test_parse_phishing_emails() {
734 for n in 0..10 {
735 let filename = format!("test_emails/{n}.eml");
736
737 let mut e = EmlParser::from_file(&filename).expect("Load file");
738 let _parsed = e.parse().expect("Parse file");
739 }
740 }
741
742 #[test]
743 fn test_parse_rfc2047() {
744 let mut e = EmlParser::from_file("test_emails/rfc2047.eml").unwrap();
745 let parsed = e.parse().expect("Parse rfc2047.eml");
746 let schöne = HeaderFieldValue::Unstructured("Schöne Grüße".to_string());
747
748 for h in parsed.headers {
749 if h.name == "Salutation" {
750 assert_eq!(h.value, schöne);
751 }
752 }
753 }
754}