use crate::eml::*;
use crate::errors::EmlError;
use regex::Regex;
use std::fs;
use std::iter::Peekable;
use std::path::Path;
#[allow(non_camel_case_types)]
#[derive(Debug)]
enum LwspState {
ReadingContent,
LF,
CR,
CRLF,
CRLFCR,
EndOfHeader_LFLF,
EndOfHeader_CRCR,
EndOfHeader_CRLFCRLF,
}
#[derive(Debug)]
enum InputType {
CR,
LF,
WSP,
NonWsp,
}
#[derive(Debug)]
enum BodyHandling {
None,
Preview(usize),
All,
}
#[derive(Debug)]
pub struct EmlParser {
content: String,
position: usize,
body_handling: BodyHandling,
}
impl EmlParser {
pub fn from_file(filename: impl AsRef<Path>) -> Result<Self, EmlError> {
let content = fs::read_to_string(filename)?;
Ok(EmlParser {
content,
position: 0,
body_handling: BodyHandling::All,
})
}
pub fn from_string(content: String) -> Self {
EmlParser {
content,
position: 0,
body_handling: BodyHandling::All,
}
}
pub fn ignore_body(mut self) -> Self {
self.body_handling = BodyHandling::None;
self
}
pub fn with_body(mut self) -> Self {
self.body_handling = BodyHandling::All;
self
}
pub fn with_body_preview(mut self, bytes: usize) -> Self {
self.body_handling = BodyHandling::Preview(bytes);
self
}
pub fn parse(&mut self) -> Result<Eml, EmlError> {
if self.content.is_empty() {
return Err(EmlError::UnexpectedEndOfStream(String::from("Empty input")));
}
let content = self.content.to_string();
let chars = content.chars();
let mut char_input = chars.peekable();
let eml = self.parse_email(&mut char_input)?;
Ok(eml)
}
fn parse_email<T: Iterator<Item = char>>(
&mut self,
mut char_input: &mut Peekable<T>,
) -> Result<Eml, EmlError> {
let headers = self.parse_header_fields(&mut char_input)?;
let mut result = Eml::default();
result.body = self.parse_body();
for header in headers {
match (&header.name[..], &header.value) {
("To", _) => result.to = Some(header.value),
("From", _) => result.from = Some(header.value),
("Subject", HeaderFieldValue::Unstructured(subj)) => {
result.subject = Some((*subj).to_string())
}
_ => result.headers.push(header),
}
}
Ok(result)
}
fn parse_header_fields<T: Iterator<Item = char>>(
&mut self,
mut char_input: &mut Peekable<T>,
) -> Result<Vec<HeaderField>, EmlError> {
use HeaderFieldValue::*;
let mut headers = Vec::new();
while let Some((name, value, eoh)) = self.read_raw_header_field(&mut char_input)? {
let value = match (&name[..], value) {
("From", v)
| ("To", v)
| ("Reply-To", v)
| ("Delivered-To", v)
| ("X-Original-To", v)
| ("Return-Path", v) => EmlParser::parse_email_address(v),
(_, v) if v.is_empty() => Empty,
(_, v) => Unstructured(v),
};
headers.push(HeaderField { name, value });
if eoh {
break;
}
}
Ok(headers)
}
fn parse_email_address(value: String) -> HeaderFieldValue {
let mut remaining = value.replace("\n", "").replace("\r", "");
let mut found_addresses = Vec::new();
let name_addr_re = Regex::new(r#"^"(.?+)" <\s*([^>]+)\s*>[ ,]*"#).unwrap();
let addr_re1 = Regex::new(r#"^\s*<\s*([^>]+)\s*>[ ,]*"#).unwrap();
let addr_re2 = Regex::new(r#"^\s*([^"<>@]+@[^"<>@\s,]+)[ ,]*"#).unwrap();
while !remaining.is_empty() {
if let Some(cap) = name_addr_re.captures(&remaining) {
let name = cap.get(1).unwrap().as_str().to_string();
let address = cap.get(2).unwrap().as_str().to_string();
found_addresses.push(EmailAddress::NameAndEmailAddress { name, address });
let entire_match = cap.get(0).unwrap();
remaining = remaining[entire_match.end()..].to_string();
} else if let Some(cap) = addr_re1.captures(&remaining) {
let address = cap.get(1).unwrap().as_str().to_string();
found_addresses.push(EmailAddress::AddressOnly { address });
let entire_match = cap.get(0).unwrap();
remaining = remaining[entire_match.end()..].to_string();
} else if let Some(cap) = addr_re2.captures(&remaining) {
let address = cap.get(1).unwrap().as_str().to_string();
found_addresses.push(EmailAddress::AddressOnly { address });
let entire_match = cap.get(0).unwrap();
remaining = remaining[entire_match.end()..].to_string();
} else {
return HeaderFieldValue::Unstructured(value);
}
}
if found_addresses.len() == 1 {
HeaderFieldValue::SingleEmailAddress(found_addresses.into_iter().next().unwrap())
} else {
HeaderFieldValue::MultipleEmailAddresses(found_addresses)
}
}
fn read_raw_header_field<T: Iterator<Item = char>>(
&mut self,
mut char_input: &mut Peekable<T>,
) -> Result<Option<(String, String, bool)>, EmlError> {
match char_input.peek() {
Some('\n') | Some('\r') => return Ok(None),
Some(_) => {}
None => {
return Err(EmlError::UnexpectedEndOfStream(String::from(
"Expected the beginning of a header field name",
)))
}
};
if let Some(name) = self.read_field_name(&mut char_input)? {
match char_input.peek() {
Some(':') => {
self.position += 1;
char_input.next();
}
Some(c) => {
return Err(EmlError::UnexpectedContent(format!(
"Expected ':' to terminate header field '{}'; got '{}' (byte value {})",
name, c, *c as u8
)))
}
None => {
return Err(EmlError::UnexpectedEndOfStream(format!(
"Expected ':' to terminate header field '{}'",
name
)))
}
};
match char_input.peek() {
Some(' ') => {
self.position += 1;
char_input.next();
}
Some(_) => {}
None => {
return Err(EmlError::UnexpectedEndOfStream(format!(
"Expected non-empty content for header field '{}'",
name
)))
}
};
let (value, eoh) = self.read_field_body(&mut char_input)?;
Ok(Some((name, value, eoh)))
} else {
Ok(None)
}
}
fn read_field_name<T: Iterator<Item = char>>(
&mut self,
char_input: &mut Peekable<T>,
) -> Result<Option<String>, EmlError> {
let start_position = self.position;
let mut end_position = self.position;
while let Some(c) = char_input.peek() {
if c == &'\n' || c == &'\r' {
return Ok(None);
} else if c != &' ' && c != &':' && !c.is_control() {
char_input.next();
end_position += 1;
} else {
break;
}
}
if end_position == self.content.len() {
Err(EmlError::UnexpectedEndOfStream(String::from(
"Expected content for header field",
)))
} else {
self.position = end_position;
Ok(Some(String::from(
&self.content[start_position..end_position],
)))
}
}
fn read_field_body<T: Iterator<Item = char>>(
&mut self,
char_input: &mut Peekable<T>,
) -> Result<(String, bool), EmlError> {
let start_position = self.position;
let mut end_position = self.position;
let mut state = LwspState::ReadingContent;
while let Some(next_char) = char_input.peek() {
let ws = EmlParser::next_char_type(*next_char);
match (&state, ws) {
(LwspState::ReadingContent, InputType::WSP)
| (LwspState::ReadingContent, InputType::NonWsp) => {
char_input.next();
end_position += 1;
}
(LwspState::ReadingContent, InputType::CR) => {
state = LwspState::CR;
char_input.next();
end_position += 1;
}
(LwspState::ReadingContent, InputType::LF) => {
state = LwspState::LF;
char_input.next();
end_position += 1;
}
(LwspState::LF, InputType::WSP)
| (LwspState::CR, InputType::WSP)
| (LwspState::CRLF, InputType::WSP) => {
state = LwspState::ReadingContent;
char_input.next();
end_position += 1;
}
(LwspState::LF, InputType::NonWsp)
| (LwspState::CR, InputType::NonWsp)
| (LwspState::CRLF, InputType::NonWsp) => {
break;
}
(LwspState::LF, InputType::LF) => {
state = LwspState::EndOfHeader_LFLF;
char_input.next();
end_position += 1;
break;
}
(LwspState::CR, InputType::CR) => {
state = LwspState::EndOfHeader_CRCR;
char_input.next();
end_position += 1;
break;
}
(LwspState::CRLFCR, InputType::LF) => {
state = LwspState::EndOfHeader_CRLFCRLF;
char_input.next();
end_position += 1;
break;
}
(LwspState::CR, InputType::LF) => {
state = LwspState::CRLF;
char_input.next();
end_position += 1;
}
(LwspState::CRLF, InputType::CR) => {
state = LwspState::CRLFCR;
char_input.next();
end_position += 1;
}
(LwspState::CRLFCR, _) => {
return Err(EmlError::UnexpectedContent(String::from(
"Found CRLF+CR in header without expected LF",
)));
}
(LwspState::CRLF, InputType::LF) => {
return Err(EmlError::UnexpectedContent(String::from(
"Found CRLF+LF in header without expected CR first",
)));
}
(LwspState::LF, InputType::CR) => {
return Err(EmlError::UnexpectedContent(String::from(
"Found LF+CR in header as line delimeter",
)));
}
(LwspState::EndOfHeader_LFLF, _)
| (LwspState::EndOfHeader_CRCR, _)
| (LwspState::EndOfHeader_CRLFCRLF, _) => unreachable!(),
}
}
self.position = end_position;
let value_end = end_position
- match state {
LwspState::LF => 1,
LwspState::CR => 1,
LwspState::CRLF => 2,
LwspState::EndOfHeader_LFLF => 2,
LwspState::EndOfHeader_CRCR => 2,
LwspState::EndOfHeader_CRLFCRLF => 4,
LwspState::ReadingContent | LwspState::CRLFCR => unreachable!(),
};
let end_of_header = match state {
LwspState::EndOfHeader_LFLF
| LwspState::EndOfHeader_CRCR
| LwspState::EndOfHeader_CRLFCRLF => true,
_ => false,
};
Ok((
String::from(&self.content[start_position..value_end]),
end_of_header,
))
}
fn next_char_type(c: char) -> InputType {
match c {
'\n' => InputType::LF,
'\r' => InputType::CR,
' ' | '\t' => InputType::WSP,
c if c.is_ascii_whitespace() => InputType::WSP,
_ => InputType::NonWsp,
}
}
fn parse_body(&mut self) -> Option<String> {
match self.body_handling {
BodyHandling::None => None,
BodyHandling::Preview(bytes) => {
let bytes_remaining = self.content.len() - self.position;
let bytes = std::cmp::min(bytes, bytes_remaining);
Some(String::from(
&self.content[self.position..self.position + bytes],
))
}
BodyHandling::All => Some(String::from(&self.content[self.position..])),
}
}
}
#[cfg(test)]
mod tests {
use super::HeaderFieldValue;
use super::*;
const TEST_HEADER: &str = r#"Delivered-To: john.public@example.com
Received: by 2002:ac9:700e:0:0:0:0:0 with SMTP id w14csp4493771ocr;
Mon, 13 Apr 2020 14:04:07 -0700 (PDT)
X-Google-Smtp-Source: APiQypIbRnWumT0t4TOJHlvDOVkxfqZ8A8HBzdR39kgdjVQQfKUsY/DkKFeZI53Ux1Z3reMRqaCl
X-Received: by 2002:a37:aa8e:: with SMTP id t136mr9744838qke.175.1586811847065;
Mon, 13 Apr 2020 14:04:07 -0700 (PDT)
foo: bar
This is the start of the body
"#;
#[test]
fn basic_test() {
let eml = EmlParser::from_string(TEST_HEADER.to_string())
.with_body()
.parse();
assert!(eml.is_ok());
let eml = eml.unwrap();
assert_eq!(5, eml.headers.len());
let delivered_to: &HeaderField = &eml.headers[0];
assert_eq!("Delivered-To", delivered_to.name);
assert_eq!(
HeaderFieldValue::SingleEmailAddress(EmailAddress::AddressOnly {
address: ("john.public@example.com".to_string())
}),
delivered_to.value
);
let received: &HeaderField = &eml.headers[1];
assert_eq!("Received", received.name);
assert_eq!(
HeaderFieldValue::Unstructured(
r#"by 2002:ac9:700e:0:0:0:0:0 with SMTP id w14csp4493771ocr;
Mon, 13 Apr 2020 14:04:07 -0700 (PDT)"#
.to_string()
),
received.value
);
assert_eq!("X-Google-Smtp-Source".to_string(), eml.headers[2].name);
assert_eq!(
HeaderFieldValue::Unstructured(
"APiQypIbRnWumT0t4TOJHlvDOVkxfqZ8A8HBzdR39kgdjVQQfKUsY/DkKFeZI53Ux1Z3reMRqaCl"
.to_string()
),
eml.headers[2].value
);
assert_eq!("X-Received".to_string(), eml.headers[3].name);
assert_eq!(
HeaderFieldValue::Unstructured(
r#"by 2002:a37:aa8e:: with SMTP id t136mr9744838qke.175.1586811847065;
Mon, 13 Apr 2020 14:04:07 -0700 (PDT)"#.to_string()
),
eml.headers[3].value
);
assert_eq!("foo".to_string(), eml.headers[4].name);
assert_eq!(
HeaderFieldValue::Unstructured("bar".to_string()),
eml.headers[4].value
);
assert!(eml.body.is_some());
let body = eml.body.unwrap();
assert_eq!("This is the start of the body\n", body);
}
#[test]
fn basic_test_with_truncated_body() {
let eml: Eml = EmlParser::from_string(TEST_HEADER.to_string())
.with_body_preview(15)
.parse()
.unwrap();
let body = eml.body.unwrap();
let expected = &"This is the start of the body\n"[0..15];
assert_eq!(expected, body);
}
#[test]
fn basic_test_with_truncation_gt_body_length() {
let eml: Eml = EmlParser::from_string(TEST_HEADER.to_string())
.with_body_preview(150)
.parse()
.unwrap();
assert_eq!(5, eml.headers.len());
let body = eml.body.unwrap();
assert_eq!("This is the start of the body\n", body);
}
#[test]
fn parse_emails() {
let parsed =
EmlParser::parse_email_address(r#""John Smith" <jsmith@example.com>"#.to_string());
let jsmith = EmailAddress::NameAndEmailAddress {
name: "John Smith".to_string(),
address: "jsmith@example.com".to_string(),
};
let expected = HeaderFieldValue::SingleEmailAddress(jsmith);
assert_eq!(parsed, expected);
}
#[test]
fn parse_and_display_emails() {
let single = r#""John Q. Public" < john@example.com>, "#.to_string();
let parsed = EmlParser::parse_email_address(single);
match &parsed {
HeaderFieldValue::SingleEmailAddress(EmailAddress::NameAndEmailAddress {
name,
address,
}) => {
assert_eq!(name, "John Q. Public");
assert_eq!(address, "john@example.com");
}
_ => panic!("Expected SingleEmailAddress, got something else"),
};
assert_eq!(parsed.to_string(), r#""John Q. Public" <john@example.com>"#);
}
#[test]
fn test_errors() {
let filename = "nonexistent.eml";
let parsed = EmlParser::from_file(filename);
assert!(parsed.is_err());
let errval = parsed.unwrap_err();
assert!(matches!(errval, EmlError::IoError(_inner)));
}
#[test]
fn last_header_empty() {
let eml: Eml = EmlParser::from_string("Foo: ok\nBar: \n\nHello".to_string())
.with_body()
.parse()
.unwrap();
assert_eq!(2, eml.headers.len());
let foo = &eml.headers[0];
let HeaderField { name, value } = foo;
assert_eq!("Foo", name);
assert_eq!(&HeaderFieldValue::Unstructured("ok".to_string()), value);
let bar = &eml.headers[1];
let HeaderField { name, value } = bar;
assert_eq!("Bar", name);
assert_eq!(&HeaderFieldValue::Empty, value);
assert_eq!(Some("Hello".to_string()), eml.body);
}
#[test]
fn last_header_get_full_value() {
let eml: Eml = EmlParser::from_string("Foo: ok\nBar: super\n\nHello".to_string())
.with_body()
.parse()
.unwrap();
assert_eq!(2, eml.headers.len());
let foo = &eml.headers[0];
let HeaderField { name, value } = foo;
assert_eq!("Foo", name);
assert_eq!(&HeaderFieldValue::Unstructured("ok".to_string()), value);
let bar = &eml.headers[1];
let HeaderField { name, value } = bar;
assert_eq!("Bar", name);
assert_eq!(&HeaderFieldValue::Unstructured("super".to_string()), value);
assert_eq!(Some("Hello".to_string()), eml.body);
}
}