1use crate::eml::*;
2use crate::errors::EmlError;
3use regex::Regex;
4use std::fs;
5use std::iter::Peekable;
6use std::path::Path;
7
/// Line-terminator state tracked by `EmlParser::read_field_body` while it scans a header value.
///
/// Each variant names the run of line-break characters seen since the last content character.
/// The `EndOfHeader_*` variants mean a blank line (the end of the header section) was consumed.
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
#[derive(Debug)]
enum LwspState {
    /// Inside the value; no pending line break.
    ReadingContent,
    /// A single `\n` has just been read.
    LF,
    /// A single `\r` has just been read.
    CR,
    /// `\r\n` has just been read.
    CRLF,
    /// `\r\n\r` has just been read; only `\n` may follow.
    CRLFCR,
    /// `\n\n` was read.
    EndOfHeader_LFLF,
    /// `\r\r` was read.
    EndOfHeader_CRCR,
    /// `\r\n\r\n` was read.
    EndOfHeader_CRLFCRLF,
}
20
/// Classification of a single input character, as produced by `EmlParser::next_char_type`.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug)]
enum InputType {
    /// Carriage return, `\r`.
    CR,
    /// Line feed, `\n`.
    LF,
    /// Whitespace other than `\r` / `\n`.
    WSP,
    /// Any other character.
    NonWsp,
}
29
/// How much of the message body `EmlParser::parse_body` returns.
#[derive(Debug)]
enum BodyHandling {
    /// Do not return a body at all.
    None,
    /// Return at most this many bytes from the start of the body.
    Preview(usize),
    /// Return everything after the header section.
    All,
}
36
37#[derive(Debug)]
38pub struct EmlParser {
39 content: String,
40 position: usize,
41
42 body_handling: BodyHandling,
43}
44
45impl EmlParser {
46 pub fn from_file(filename: impl AsRef<Path>) -> Result<Self, EmlError> {
52 let content = fs::read_to_string(filename)?;
53
54 Ok(EmlParser {
55 content,
56 position: 0,
57 body_handling: BodyHandling::All,
58 })
59 }
60
61 pub fn from_string(content: String) -> Self {
62 EmlParser {
63 content,
64 position: 0,
65 body_handling: BodyHandling::All,
66 }
67 }
68
69 pub fn ignore_body(mut self) -> Self {
71 self.body_handling = BodyHandling::None;
72 self
73 }
74
75 pub fn with_body(mut self) -> Self {
76 self.body_handling = BodyHandling::All;
77 self
78 }
79
80 pub fn with_body_preview(mut self, bytes: usize) -> Self {
81 self.body_handling = BodyHandling::Preview(bytes);
82 self
83 }
84
85 pub fn parse(&mut self) -> Result<Eml, EmlError> {
86 if self.content.is_empty() {
87 return Err(EmlError::UnexpectedEndOfStream(String::from("Empty input")));
88 }
89
90 let content = self.content.to_string(); let chars = content.chars();
92 let mut char_input = chars.peekable();
93 let eml = self.parse_email(&mut char_input)?;
94
95 Ok(eml)
96 }
97
98 fn parse_email<T: Iterator<Item = char>>(
99 &mut self,
100 char_input: &mut Peekable<T>,
101 ) -> Result<Eml, EmlError> {
102 let headers = self.parse_header_fields(char_input)?;
103
104 let mut result = Eml {
105 body: self.parse_body(),
106 ..Default::default()
107 };
108 for header in headers {
112 match (&header.name[..], &header.value) {
113 ("To", _) => result.to = Some(header.value),
114 ("From", _) => result.from = Some(header.value),
115 ("Subject", HeaderFieldValue::Unstructured(subj)) => {
116 result.subject = Some((*subj).to_string())
117 }
118 _ => result.headers.push(header),
119 }
120 }
121
122 Ok(result)
123 }
124
125 fn parse_header_fields<T: Iterator<Item = char>>(
126 &mut self,
127 char_input: &mut Peekable<T>,
128 ) -> Result<Vec<HeaderField>, EmlError> {
129 use HeaderFieldValue::*;
130 let mut headers = Vec::new();
131
132 while let Some((name, value, eoh)) = self.read_raw_header_field(char_input)? {
133 let value = match (&name[..], value) {
135 ("From", v)
136 | ("To", v)
137 | ("Reply-To", v)
138 | ("Delivered-To", v)
139 | ("X-Original-To", v)
140 | ("Return-Path", v) => EmlParser::parse_email_address(v),
141 (_, v) if v.is_empty() => Empty,
142 (_, v) => match rfc2047_decoder::decode(&v) {
143 Ok(decoded) => Unstructured(decoded),
144 Err(_) => Unstructured(v),
145 },
146 };
147 headers.push(HeaderField { name, value });
148
149 if eoh {
150 break;
151 }
152 }
153 Ok(headers)
154 }
155
156 fn parse_email_address(value: String) -> HeaderFieldValue {
157 let mut remaining = value.replace(['\n', '\r'], "");
159
160 let mut found_addresses = Vec::new();
161
162 let name_addr_re = Regex::new(r#"^"(.?+)" <\s*([^>]+)\s*>[ ,]*"#).unwrap(); let addr_re1 = Regex::new(r#"^\s*<\s*([^>]+)\s*>[ ,]*"#).unwrap(); let addr_re2 = Regex::new(r#"^\s*([^"<>@]+@[^"<>@\s,]+)[ ,]*"#).unwrap(); while !remaining.is_empty() {
167 if let Some(cap) = name_addr_re.captures(&remaining) {
168 let name = cap.get(1).unwrap().as_str().to_string();
169 let address = cap.get(2).unwrap().as_str().to_string();
170 found_addresses.push(EmailAddress::NameAndEmailAddress { name, address });
171
172 let entire_match = cap.get(0).unwrap();
173 remaining = remaining[entire_match.end()..].to_string();
174 } else if let Some(cap) = addr_re1.captures(&remaining) {
175 let address = cap.get(1).unwrap().as_str().to_string();
176 found_addresses.push(EmailAddress::AddressOnly { address });
177
178 let entire_match = cap.get(0).unwrap();
179 remaining = remaining[entire_match.end()..].to_string();
180 } else if let Some(cap) = addr_re2.captures(&remaining) {
181 let address = cap.get(1).unwrap().as_str().to_string();
182 found_addresses.push(EmailAddress::AddressOnly { address });
183
184 let entire_match = cap.get(0).unwrap();
185 remaining = remaining[entire_match.end()..].to_string();
186 } else {
187 return HeaderFieldValue::Unstructured(value);
189 }
190 }
191
192 if found_addresses.len() == 1 {
194 HeaderFieldValue::SingleEmailAddress(found_addresses.into_iter().next().unwrap())
195 } else {
196 HeaderFieldValue::MultipleEmailAddresses(found_addresses)
197 }
198 }
199
200 fn read_raw_header_field<T: Iterator<Item = char>>(
201 &mut self,
202 char_input: &mut Peekable<T>,
203 ) -> Result<Option<(String, String, bool)>, EmlError> {
204 match char_input.peek() {
205 Some('\n') | Some('\r') => return Ok(None), Some(_) => {}
207 None => {
208 return Err(EmlError::UnexpectedEndOfStream(String::from(
209 "Expected the beginning of a header field name",
210 )))
211 }
212 };
213
214 if let Some(name) = self.read_field_name(char_input)? {
215 match char_input.peek() {
216 Some(':') => {
217 self.position += 1;
218 char_input.next();
219 }
220 Some(c) => {
221 return Err(EmlError::UnexpectedContent(format!(
222 "Expected ':' to terminate header field '{}'; got '{}' (byte value {})",
223 name, c, *c as u8
224 )))
225 }
226 None => {
227 return Err(EmlError::UnexpectedEndOfStream(format!(
228 "Expected ':' to terminate header field '{}'",
229 name
230 )))
231 }
232 };
233
234 match char_input.peek() {
235 Some(' ') => {
236 self.position += 1;
237 char_input.next();
238 }
239 Some(_) => {}
240 None => {
241 return Err(EmlError::UnexpectedEndOfStream(format!(
242 "Expected non-empty content for header field '{}'",
243 name
244 )))
245 }
246 };
247
248 let (value, eoh) = self.read_field_body(char_input)?;
249
250 Ok(Some((name, value, eoh)))
251 } else {
252 Ok(None)
253 }
254 }
255
256 fn read_field_name<T: Iterator<Item = char>>(
258 &mut self,
259 char_input: &mut Peekable<T>,
260 ) -> Result<Option<String>, EmlError> {
261 let start_position = self.position;
262 let mut end_position = self.position;
263
264 while let Some(c) = char_input.peek() {
265 if c == &'\n' || c == &'\r' {
266 return Ok(None);
269 } else if c != &' ' && c != &':' && !c.is_control() {
270 char_input.next();
271 end_position += 1;
272 } else {
273 break;
274 }
275 }
276
277 if end_position == self.content.len() {
278 Err(EmlError::UnexpectedEndOfStream(String::from(
279 "Expected content for header field",
280 )))
281 } else {
282 self.position = end_position;
283 Ok(Some(String::from(
284 &self.content[start_position..end_position],
285 )))
286 }
287 }
288
289 fn read_field_body<T: Iterator<Item = char>>(
293 &mut self,
294 char_input: &mut Peekable<T>,
295 ) -> Result<(String, bool), EmlError> {
296 let start_position = self.position;
297 let mut end_position = self.position;
298 let mut state = LwspState::ReadingContent;
299
300 while let Some(next_char) = char_input.peek() {
301 let ws = EmlParser::next_char_type(*next_char);
302 let len = next_char.len_utf8();
303
304 match (&state, ws) {
305 (LwspState::ReadingContent, InputType::WSP)
306 | (LwspState::ReadingContent, InputType::NonWsp) => {
307 char_input.next();
309 end_position += len;
310 }
311
312 (LwspState::ReadingContent, InputType::CR) => {
313 state = LwspState::CR;
314 char_input.next();
315 end_position += len;
316 }
317
318 (LwspState::ReadingContent, InputType::LF) => {
319 state = LwspState::LF;
320 char_input.next();
321 end_position += len;
322 }
323
324 (LwspState::LF, InputType::WSP)
325 | (LwspState::CR, InputType::WSP)
326 | (LwspState::CRLF, InputType::WSP) => {
327 state = LwspState::ReadingContent;
333 char_input.next();
334 end_position += len;
335 }
336
337 (LwspState::LF, InputType::NonWsp)
338 | (LwspState::CR, InputType::NonWsp)
339 | (LwspState::CRLF, InputType::NonWsp) => {
340 break;
343 }
344
345 (LwspState::LF, InputType::LF) => {
346 state = LwspState::EndOfHeader_LFLF;
348 char_input.next();
349 end_position += len;
350 break;
351 }
352 (LwspState::CR, InputType::CR) => {
353 state = LwspState::EndOfHeader_CRCR;
355 char_input.next();
356 end_position += len;
357 break;
358 }
359 (LwspState::CRLFCR, InputType::LF) => {
360 state = LwspState::EndOfHeader_CRLFCRLF;
362 char_input.next();
363 end_position += len;
364 break;
365 }
366
367 (LwspState::CR, InputType::LF) => {
368 state = LwspState::CRLF;
370 char_input.next();
371 end_position += len;
372 }
373
374 (LwspState::CRLF, InputType::CR) => {
375 state = LwspState::CRLFCR;
377 char_input.next();
378 end_position += len;
379 }
380
381 (LwspState::CRLFCR, _) => {
390 return Err(EmlError::UnexpectedContent(String::from(
392 "Found CRLF+CR in header without expected LF",
393 )));
394 }
395
396 (LwspState::CRLF, InputType::LF) => {
397 return Err(EmlError::UnexpectedContent(String::from(
399 "Found CRLF+LF in header without expected CR first",
400 )));
401 }
402
403 (LwspState::LF, InputType::CR) => {
404 return Err(EmlError::UnexpectedContent(String::from(
406 "Found LF+CR in header as line delimeter",
407 )));
408 }
409
410 (LwspState::EndOfHeader_LFLF, _)
412 | (LwspState::EndOfHeader_CRCR, _)
413 | (LwspState::EndOfHeader_CRLFCRLF, _) => unreachable!(),
414 }
415 }
416
417 self.position = end_position;
418
419 let value_end = end_position
422 - match state {
423 LwspState::LF => 1,
424 LwspState::CR => 1,
425 LwspState::CRLF => 2,
426 LwspState::EndOfHeader_LFLF => 2,
427 LwspState::EndOfHeader_CRCR => 2,
428 LwspState::EndOfHeader_CRLFCRLF => 4,
429 LwspState::ReadingContent | LwspState::CRLFCR => unreachable!(),
430 };
431
432 let end_of_header = matches!(
433 state,
434 LwspState::EndOfHeader_LFLF
435 | LwspState::EndOfHeader_CRCR
436 | LwspState::EndOfHeader_CRLFCRLF
437 );
438
439 Ok((
440 String::from(&self.content[start_position..value_end]),
441 end_of_header,
442 ))
443 }
444
445 fn next_char_type(c: char) -> InputType {
446 match c {
447 '\n' => InputType::LF,
448 '\r' => InputType::CR,
449 ' ' | '\t' => InputType::WSP,
450 c if c.is_ascii_whitespace() => InputType::WSP,
455 _ => InputType::NonWsp,
456 }
457 }
458
459 fn parse_body(&mut self) -> Option<String> {
460 match self.body_handling {
461 BodyHandling::None => None,
462 BodyHandling::Preview(bytes) => {
463 let bytes_remaining = self.content.len() - self.position;
464 let bytes = std::cmp::min(bytes, bytes_remaining);
465
466 Some(String::from(
469 &self.content[self.position..self.position + bytes],
470 ))
471 }
472 BodyHandling::All => Some(String::from(&self.content[self.position..])),
473 }
474 }
475}
476
477#[cfg(test)]
478mod tests {
479 use super::HeaderFieldValue;
480 use super::*;
481
482 const TEST_HEADER: &str = r#"Delivered-To: john.public@example.com
483Received: by 2002:ac9:700e:0:0:0:0:0 with SMTP id w14csp4493771ocr;
484 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)
485X-Google-Smtp-Source: APiQypIbRnWumT0t4TOJHlvDOVkxfqZ8A8HBzdR39kgdjVQQfKUsY/DkKFeZI53Ux1Z3reMRqaCl
486X-Received: by 2002:a37:aa8e:: with SMTP id t136mr9744838qke.175.1586811847065;
487 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)
488foo: bar
489
490This is the start of the body
491"#;
492
493 #[test]
494 fn basic_test() {
495 let eml = EmlParser::from_string(TEST_HEADER.to_string())
496 .with_body()
497 .parse();
498
499 assert!(eml.is_ok());
500 let eml = eml.unwrap();
501
502 assert_eq!(5, eml.headers.len());
503
504 let delivered_to: &HeaderField = &eml.headers[0];
505 assert_eq!("Delivered-To", delivered_to.name);
506 assert_eq!(
507 HeaderFieldValue::SingleEmailAddress(EmailAddress::AddressOnly {
508 address: ("john.public@example.com".to_string())
509 }),
510 delivered_to.value
511 );
512
513 let received: &HeaderField = &eml.headers[1];
514 assert_eq!("Received", received.name);
515 assert_eq!(
516 HeaderFieldValue::Unstructured(
517 r#"by 2002:ac9:700e:0:0:0:0:0 with SMTP id w14csp4493771ocr;
518 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)"#
519 .to_string()
520 ),
521 received.value
522 );
523
524 assert_eq!("X-Google-Smtp-Source".to_string(), eml.headers[2].name);
525 assert_eq!(
526 HeaderFieldValue::Unstructured(
527 "APiQypIbRnWumT0t4TOJHlvDOVkxfqZ8A8HBzdR39kgdjVQQfKUsY/DkKFeZI53Ux1Z3reMRqaCl"
528 .to_string()
529 ),
530 eml.headers[2].value
531 );
532
533 assert_eq!("X-Received".to_string(), eml.headers[3].name);
534 assert_eq!(
535 HeaderFieldValue::Unstructured(
536 r#"by 2002:a37:aa8e:: with SMTP id t136mr9744838qke.175.1586811847065;
537 Mon, 13 Apr 2020 14:04:07 -0700 (PDT)"#
538 .to_string()
539 ),
540 eml.headers[3].value
541 );
542
543 assert_eq!("foo".to_string(), eml.headers[4].name);
544 assert_eq!(
545 HeaderFieldValue::Unstructured("bar".to_string()),
546 eml.headers[4].value
547 );
548
549 assert!(eml.body.is_some());
550 let body = eml.body.unwrap();
551 assert_eq!("This is the start of the body\n", body);
552 }
553
554 #[test]
555 fn basic_test_with_truncated_body() {
556 let eml: Eml = EmlParser::from_string(TEST_HEADER.to_string())
557 .with_body_preview(15)
558 .parse()
559 .unwrap(); let body = eml.body.unwrap();
562 let expected = &"This is the start of the body\n"[0..15];
563 assert_eq!(expected, body);
564 }
565
566 #[test]
567 fn basic_test_with_truncation_gt_body_length() {
568 let eml: Eml = EmlParser::from_string(TEST_HEADER.to_string())
569 .with_body_preview(150)
570 .parse()
571 .unwrap(); assert_eq!(5, eml.headers.len());
574
575 let body = eml.body.unwrap();
576 assert_eq!("This is the start of the body\n", body);
577 }
578
579 #[test]
580 fn parse_emails() {
581 let parsed =
582 EmlParser::parse_email_address(r#""John Smith" <jsmith@example.com>"#.to_string());
583
584 let jsmith = EmailAddress::NameAndEmailAddress {
585 name: "John Smith".to_string(),
586 address: "jsmith@example.com".to_string(),
587 };
588 let expected = HeaderFieldValue::SingleEmailAddress(jsmith);
589
590 assert_eq!(parsed, expected);
591 }
592
593 #[test]
594 fn parse_and_display_emails() {
595 let single = r#""John Q. Public" < john@example.com>, "#.to_string();
596 let parsed = EmlParser::parse_email_address(single);
597
598 match &parsed {
599 HeaderFieldValue::SingleEmailAddress(EmailAddress::NameAndEmailAddress {
600 name,
601 address,
602 }) => {
603 assert_eq!(name, "John Q. Public");
604 assert_eq!(address, "john@example.com");
605 }
606 _ => panic!("Expected SingleEmailAddress, got something else"),
607 };
608
609 assert_eq!(parsed.to_string(), r#""John Q. Public" <john@example.com>"#);
610 }
611
612 #[test]
613 fn test_errors() {
614 let filename = "nonexistent.eml";
615 let parsed = EmlParser::from_file(filename);
616 assert!(parsed.is_err());
617
618 let errval = parsed.unwrap_err();
619 assert!(matches!(errval, EmlError::IoError(_inner)));
620 }
621
622 #[test]
623 fn last_header_empty() {
624 let eml: Eml = EmlParser::from_string("Foo: ok\nBar: \n\nHello".to_string())
625 .with_body()
626 .parse()
627 .unwrap();
628
629 assert_eq!(2, eml.headers.len());
630
631 let foo = &eml.headers[0];
632 let HeaderField { name, value } = foo;
633 assert_eq!("Foo", name);
634 assert_eq!(&HeaderFieldValue::Unstructured("ok".to_string()), value);
635
636 let bar = &eml.headers[1];
637 let HeaderField { name, value } = bar;
638 assert_eq!("Bar", name);
639 assert_eq!(&HeaderFieldValue::Empty, value);
640
641 assert_eq!(Some("Hello".to_string()), eml.body);
642 }
643
644 #[test]
645 fn last_header_get_full_value() {
646 let eml: Eml = EmlParser::from_string("Foo: ok\nBar: super\n\nHello".to_string())
647 .with_body()
648 .parse()
649 .unwrap();
650
651 assert_eq!(2, eml.headers.len());
652
653 let foo = &eml.headers[0];
654 let HeaderField { name, value } = foo;
655 assert_eq!("Foo", name);
656 assert_eq!(&HeaderFieldValue::Unstructured("ok".to_string()), value);
657
658 let bar = &eml.headers[1];
659 let HeaderField { name, value } = bar;
660 assert_eq!("Bar", name);
661 assert_eq!(&HeaderFieldValue::Unstructured("super".to_string()), value);
662
663 assert_eq!(Some("Hello".to_string()), eml.body);
664 }
665
666 #[test]
668 fn nonascii() {
669 let result = EmlParser::from_string("Foo: tést\nBar: bar\n\nHello".to_string())
671 .ignore_body()
672 .parse()
673 .expect("Should parse");
674
675 let headers = result.headers;
676 assert_eq!(2, headers.len());
677
678 let HeaderField { name, value } = &headers[0];
679 assert_eq!("Foo", name);
680 assert_eq!("tést", value.to_string());
681
682 let HeaderField { name, value } = &headers[1];
683 assert_eq!("Bar", name);
684 assert_eq!("bar", value.to_string());
685
686 let result = EmlParser::from_string("Foo: testé\nBar: bar\n\nHello".to_string())
688 .ignore_body()
689 .parse()
690 .expect("Should parse");
691
692 let headers = result.headers;
693 assert_eq!(2, headers.len());
694
695 let HeaderField { name, value } = &headers[0];
696 assert_eq!("Foo", name);
697 assert_eq!("testé", value.to_string());
698
699 let HeaderField { name, value } = &headers[1];
700 assert_eq!("Bar", name);
701 assert_eq!("bar", value.to_string());
702 }
703
704 #[test]
705 fn test_parse_phishing_emails() {
706 for n in 0..10 {
707 let filename = format!("test_emails/{n}.eml");
708
709 let mut e = EmlParser::from_file(&filename).expect("Load file");
710 let _parsed = e.parse().expect("Parse file");
711 }
712 }
713
714 #[test]
715 fn test_parse_rfc2047() {
716 let mut e = EmlParser::from_file("test_emails/rfc2047.eml").unwrap();
717 let parsed = e.parse().expect("Parse rfc2047.eml");
718 let schöne = HeaderFieldValue::Unstructured("Schöne Grüße".to_string());
719
720 for h in parsed.headers {
721 if h.name == "Salutation" {
722 assert_eq!(h.value, schöne);
723 }
724 }
725 }
726}