use ::regex::bytes::{RegexBuilder, Regex, Captures};
use std::collections::HashMap;
use std::iter::Peekable;
use memchr::{memchr, memchr_iter};
use charset::Charset;
use std::borrow::Cow;
use lazy_static::lazy_static;
use crate::decode::{base64_decode_into_buf, qp_decode_into_buf};
/// One unit of output produced by the email parser.
enum Element {
    /// A complete header field. Continuation lines have been appended to the
    /// first line after stripping the line break that preceded them.
    HeaderField { data: Vec<u8> },
    /// A run of body lines, together with the metadata of the part that was
    /// active when the run was emitted.
    Body {
        /// Raw body bytes, line terminators included.
        data: Vec<u8>,
        /// Lower-cased `Content-Transfer-Encoding` of the part, if any.
        encoding: Option<String>,
        /// Lower-cased media type of the part, if any.
        content_type: Option<String>,
        /// Lower-cased charset label of the part, if any.
        charset: Option<String>,
    },
    /// A line that is passed through unchanged: the blank line ending a
    /// header section, or a multipart boundary line.
    Verbatim { data: Vec<u8> },
}
/// MIME metadata collected from the header fields of a single message part.
///
/// Every field starts out as `None` and is filled in as the matching header
/// field is seen.
#[derive(Default)]
struct Part {
    /// Lower-cased `Content-Transfer-Encoding` value.
    encoding: Option<String>,
    /// Lower-cased media type taken from `Content-Type`.
    content_type: Option<String>,
    /// Lower-cased `charset` parameter taken from `Content-Type`.
    charset: Option<String>,
    /// Boundary declared by this part when it is itself a multipart container.
    subpart_boundary: Option<Vec<u8>>,
}

impl Part {
    /// Creates a part with no metadata set.
    fn new() -> Self {
        Self::default()
    }
}
/// Iterator over the lines of a byte buffer.
///
/// Each yielded slice keeps its terminating `\n` (and any `\r` before it).
/// A final line without a terminator is yielded as-is; an empty remainder
/// ends the iteration.
pub struct SliceLines<'a> {
    /// The whole buffer being split.
    buf: &'a [u8],
    /// Offset of the first byte that has not been yielded yet.
    last: usize,
}

impl<'a> Iterator for SliceLines<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<&'a [u8]> {
        let rest = &self.buf[self.last..];
        if rest.is_empty() {
            return None;
        }
        // Length of the next line: up to and including the newline, or the
        // whole remainder when no newline is left.
        let line_len = memchr(b'\n', rest).map_or(rest.len(), |newline| newline + 1);
        self.last += line_len;
        Some(&rest[..line_len])
    }
}
/// Line-oriented email parser that yields [`Element`]s through its
/// `Iterator` implementation.
struct EmailParser<'a> {
    /// Remaining input lines, with one line of lookahead.
    lines: Peekable<SliceLines<'a>>,
    /// Stack of parts currently open; the last entry is the part being read.
    part_stack: Vec<Part>,
    /// `true` while the parser is reading a header section.
    in_header: bool,
    /// Boundary that body lines are compared against; empty when none applies.
    active_boundary: Vec<u8>,
    /// Matches a `Content-Transfer-Encoding` field and captures its value.
    content_encoding_regex: Regex,
    /// Matches a `Content-Type` field and captures the media type and,
    /// optionally, the charset.
    content_type_regex: Regex,
    /// Matches a multipart `Content-Type` field and captures its boundary.
    boundary_regex: Regex,
}
impl<'a> EmailParser<'a> {
    /// Creates a parser over `buf`, starting inside the top-level header with
    /// a single root part on the stack and no active boundary.
    ///
    /// # Panics
    ///
    /// Panics if one of the hard-coded regular expressions fails to compile.
    fn new(buf: &'a [u8]) -> Self {
        // Anchored with `^` like the two `Content-Type` patterns below, so a
        // field that merely contains this text in its name or value (for
        // example `X-Original-Content-Transfer-Encoding:`) is not mistaken
        // for the real field.
        let content_encoding_regex =
            RegexBuilder::new(r"^Content-Transfer-Encoding:\s*([[:alnum:]-]+)")
                .case_insensitive(true)
                .build()
                .unwrap();
        let content_type_regex =
            RegexBuilder::new(r#"^Content-Type:\s*([^;]+)\s*(?:;\s*charset\s*=\s*"?([[:alnum:]_:\-\.]+))?"?"#)
                .case_insensitive(true)
                .build()
                .unwrap();
        let boundary_regex =
            RegexBuilder::new(r#"^Content-Type:\s*multipart/.*boundary\s*=\s*"?([[:alnum:]'_,/:=\(\)\+\-\.\?]+)"?"#)
                .case_insensitive(true)
                .build()
                .unwrap();
        EmailParser {
            lines: SliceLines { buf, last: 0 }.peekable(),
            part_stack: vec![Part::new()],
            in_header: true,
            active_boundary: Vec::new(),
            content_encoding_regex,
            content_type_regex,
            boundary_regex,
        }
    }

    /// Media type of the part currently being read, if one was declared.
    fn active_content_type(&self) -> Option<String> {
        self.part_stack.last()?.content_type.clone()
    }

    /// Transfer encoding of the part currently being read, if one was declared.
    fn active_encoding(&self) -> Option<String> {
        self.part_stack.last()?.encoding.clone()
    }

    /// Charset of the part currently being read, if one was declared.
    fn active_charset(&self) -> Option<String> {
        self.part_stack.last()?.charset.clone()
    }

    /// Returns `true` when the part on top of the stack is the multipart
    /// container that declared the active boundary.
    fn top_declares_active_boundary(&self) -> bool {
        self.part_stack
            .last()
            .and_then(|part| part.subpart_boundary.as_ref())
            == Some(&self.active_boundary)
    }

    /// Handles an opening delimiter (`--boundary`).
    ///
    /// If the top of the stack is the container that declared the boundary,
    /// this is its first child and a new level is pushed. Otherwise the top
    /// is a finished sibling, which is replaced by a fresh part.
    fn begin_part(&mut self) {
        if self.top_declares_active_boundary() {
            self.part_stack.push(Part::new());
        } else {
            let sibling = self
                .part_stack
                .last_mut()
                .expect("part stack always holds the root part");
            *sibling = Part::new();
        }
    }

    /// Handles a closing delimiter (`--boundary--`).
    ///
    /// Drops the last child of the container being closed (if it had any),
    /// clears the container's boundary, and makes the innermost boundary
    /// still open the active one.
    fn end_part(&mut self) {
        // When the top of the stack is the container itself it had no
        // children, so there is nothing to pop.
        if !self.top_declares_active_boundary() {
            self.part_stack.pop();
        }
        self.part_stack
            .last_mut()
            .expect("part stack always holds the root part")
            .subpart_boundary = None;
        // Search from the innermost part outwards and stop at the first
        // boundary found. Without stopping, the outermost boundary would win
        // and the remaining delimiters of an intermediate multipart level
        // would no longer be recognised.
        self.active_boundary = self
            .part_stack
            .iter()
            .rev()
            .find_map(|part| part.subpart_boundary.clone())
            .unwrap_or_default();
    }

    /// Updates the active part from one complete header field.
    ///
    /// `field` is the unfolded header field. At most one of the three
    /// patterns is applied, in this order: transfer encoding, multipart
    /// boundary, content type (with optional charset).
    fn update_active_part_from_header_field(&mut self, field: &[u8]) {
        let part = self
            .part_stack
            .last_mut()
            .expect("part stack always holds the root part");
        if let Some(captures) = self.content_encoding_regex.captures(field) {
            part.encoding = Some(String::from_utf8_lossy(&captures[1]).to_lowercase());
        } else if let Some(captures) = self.boundary_regex.captures(field) {
            let boundary = captures[1].to_vec();
            self.active_boundary.clone_from(&boundary);
            part.subpart_boundary = Some(boundary);
        } else if let Some(captures) = self.content_type_regex.captures(field) {
            part.content_type = Some(String::from_utf8_lossy(&captures[1]).to_lowercase());
            if let Some(charset) = captures.get(2) {
                part.charset = Some(String::from_utf8_lossy(charset.as_bytes()).to_lowercase());
            }
        }
    }
}
/// Removes every trailing `\n` and `\r` byte from `line`, in place.
fn vec_trim_end_newline(line: &mut Vec<u8>) {
    // Index just past the last byte that is not a line-break character;
    // zero when the vector holds nothing else.
    let keep = line
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |idx| idx + 1);
    line.truncate(keep);
}
/// Returns `line` without any trailing `\n` or `\r` bytes.
fn slice_trim_end_newline(mut line: &[u8]) -> &[u8] {
    // Peel one trailing line-break byte per iteration until none is left.
    while let [head @ .., b'\n' | b'\r'] = line {
        line = head;
    }
    line
}
/// Tells whether `line` is a multipart delimiter for `boundary`.
///
/// A delimiter is `--` followed by the boundary, optionally followed by a
/// closing `--`, with trailing `\r`/`\n` bytes ignored. An empty `boundary`
/// never matches.
fn is_boundary_line(line: &[u8], boundary: &[u8]) -> bool {
    if boundary.is_empty() || !line.starts_with(b"--") {
        return false;
    }
    let trimmed = slice_trim_end_newline(line);
    // Drop the closing `--` if there is one.
    let delimiter = trimmed.strip_suffix(b"--").unwrap_or(trimmed);
    // `get` covers lines such as `--` or `---`, which become shorter than the
    // leading dashes once the suffix is removed.
    delimiter.get(2..).map_or(false, |rest| rest == boundary)
}
impl Iterator for EmailParser<'_> {
    type Item = Element;

    /// Produces the next element of the message.
    ///
    /// In a header section this is one header field (continuation lines
    /// joined onto it) or the blank line that ends the section. In a body
    /// it is either a boundary line or the run of lines up to the next
    /// boundary line. Returns `None` once the input is exhausted.
    fn next(&mut self) -> Option<Element> {
        // Bytes of the header field or body run being assembled.
        let mut pending: Vec<u8> = Vec::new();
        // Set directly for lines that are passed through verbatim.
        let mut produced: Option<Element> = None;

        while let Some(line) = self.lines.next() {
            if self.in_header {
                match line[0] {
                    // A blank line terminates the header section.
                    b'\n' | b'\r' => {
                        self.in_header = false;
                        produced = Some(Element::Verbatim { data: line.to_vec() });
                        break;
                    }
                    // Continuation line: join it onto the field, dropping
                    // the line break that preceded it.
                    b' ' | b'\t' => {
                        vec_trim_end_newline(&mut pending);
                        pending.extend_from_slice(line);
                    }
                    _ => pending = line.to_vec(),
                }
                // Keep reading while the next line continues this field, or
                // when there is no next line (the loop then ends by itself).
                let field_continues = self
                    .lines
                    .peek()
                    .map_or(true, |next| next[0] == b' ' || next[0] == b'\t');
                if !field_continues {
                    break;
                }
                continue;
            }

            if is_boundary_line(line, &self.active_boundary) {
                // A trailing `--` marks the closing delimiter.
                if slice_trim_end_newline(line).ends_with(b"--") {
                    self.end_part();
                } else {
                    self.begin_part();
                    self.in_header = true;
                }
                produced = Some(Element::Verbatim { data: line.to_vec() });
                break;
            }

            pending.extend_from_slice(line);
            // Stop before a boundary line so it is emitted on its own.
            let boundary_follows = match self.lines.peek() {
                Some(next) => is_boundary_line(next, &self.active_boundary),
                None => false,
            };
            if boundary_follows {
                break;
            }
        }

        if !pending.is_empty() {
            // A verbatim line is only ever produced with nothing pending.
            assert!(produced.is_none());
            produced = Some(if self.in_header {
                Element::HeaderField { data: pending }
            } else {
                Element::Body {
                    data: pending,
                    encoding: self.active_encoding(),
                    content_type: self.active_content_type(),
                    charset: self.active_charset(),
                }
            });
        }

        // Header fields may carry the metadata that later body runs need.
        if let Some(Element::HeaderField { data }) = produced.as_ref() {
            self.update_active_part_from_header_field(data);
        }
        produced
    }
}
/// Decodes textual `data` and appends the result to `out`.
///
/// * `encoding` - lower-cased content transfer encoding. `base64` and
///   `quoted-printable` are decoded; `7bit`, `8bit` and `binary` are identity
///   encodings and are copied as-is. With `None` the data is also copied
///   as-is.
/// * `charset` - charset label of `data`; `us-ascii` is assumed when `None`.
/// * `out` - buffer the result is appended to; existing content is kept.
///
/// When the encoding is unknown or decoding fails, the raw `data` is appended
/// and no charset conversion is attempted. Otherwise the appended bytes are
/// replaced by the charset decoder's output whenever it produced a new string.
fn decode_text_data_to_buf(
    data: &[u8],
    encoding: Option<&str>,
    charset: Option<&str>,
    mut out: &mut Vec<u8>,
) {
    const CRLF: &[u8] = b"\r\n";
    const LF: &[u8] = b"\n";

    let initial_len = out.len();
    let mut should_convert_charset = true;

    if let Some(encoding) = encoding {
        let result = match encoding {
            "base64" => base64_decode_into_buf(&data, &mut out),
            "quoted-printable" => qp_decode_into_buf(&data, &mut out),
            // "7bit" is an identity encoding just like "8bit" and "binary";
            // treating it as unknown would skip charset conversion for it.
            "7bit" | "8bit" | "binary" => {
                out.extend(data);
                Ok(())
            }
            _ => Err("unknown encoding".into()),
        };
        if result.is_ok() {
            // Keep the line terminator of the encoded data if decoding
            // dropped it.
            if data.ends_with(CRLF) && !out.ends_with(CRLF) {
                out.extend(CRLF);
            } else if data.ends_with(LF) && !out.ends_with(LF) {
                out.extend(LF);
            }
        } else {
            // Discard any partial output; the raw data is appended below.
            out.truncate(initial_len);
            should_convert_charset = false;
        }
    }

    // Nothing was appended (no encoding, failed decode, or empty result):
    // fall back to the raw data.
    if out.len() == initial_len {
        out.extend(data);
    }

    if should_convert_charset {
        let label = charset.unwrap_or("us-ascii");
        if let Some(chr) = Charset::for_label(label.as_bytes()) {
            let (text, _, _) = chr.decode(&out[initial_len..]);
            // A borrowed result means the decoder handed the bytes back
            // unchanged, so only an owned result has to be written back.
            if let Cow::Owned(converted) = text {
                out.truncate(initial_len);
                out.extend_from_slice(converted.as_bytes());
            }
        }
    }
}
/// Cheap pre-check for encoded words: reports whether `data` contains the
/// byte pair `?=`, which terminates every encoded word.
///
/// A `true` result only means the full regular expressions are worth running.
fn maybe_contains_encoded_word(data: &[u8]) -> bool {
    memchr_iter(b'?', data).any(|question_mark| data.get(question_mark + 1) == Some(&b'='))
}
fn decode_encoded_word_from_captures(caps: &Captures) -> Vec<u8> {
let charset = String::from_utf8_lossy(&caps[1]).to_lowercase();
let encoding = match &caps[2] {
b"q" | b"Q" => "quoted-printable",
b"b" | b"B" => "base64",
_ => "",
};
let mut data = Cow::from(&caps[3]);
if encoding == "quoted-printable" {
let space_positions: Vec<_> = memchr_iter(b'_', &data).collect();
for pos in space_positions {
data.to_mut()[pos] = b' ';
}
}
let mut decoded = Vec::new();
decode_text_data_to_buf(&data, Some(encoding), Some(&charset), &mut decoded);
decoded
}
pub fn normalize_email(data: &[u8]) -> (Vec<u8>, HashMap<String, Vec<String>>) {
lazy_static! {
static ref ENCODED_WORD_REGEX: Regex =
RegexBuilder::new(r"=\?([^?]+)\?([^?]+)\?([^? \t]+)\?=")
.case_insensitive(true)
.build().unwrap();
static ref ENCODED_WORD_WSP_REGEX: Regex =
RegexBuilder::new(r"\?([^?]+)\?=\s*=\?([^?]+)\?")
.case_insensitive(true)
.build().unwrap();
}
let parser = EmailParser::new(&data);
let mut normalized = Vec::new();
let mut fields = HashMap::new();
for element in parser {
match element {
Element::HeaderField{data} => {
let initial_len = normalized.len();
if maybe_contains_encoded_word(&data) {
let data = ENCODED_WORD_WSP_REGEX.replace_all(
&data, "?$1?==?$2?".as_bytes());
let data = ENCODED_WORD_REGEX.replace_all(
&data, decode_encoded_word_from_captures);
normalized.extend(data.as_ref());
} else {
normalized.extend(&data);
}
let field_str = String::from_utf8_lossy(&normalized[initial_len..]);
let field_str = field_str.trim();
let mut split = field_str.splitn(2, ':');
let name = split.next().map(|n| n.to_lowercase()).unwrap();
let value = split.next().unwrap_or("").to_owned();
fields.entry(name).or_insert(Vec::new()).push(value);
},
Element::Body{data, encoding, content_type, charset} => {
match content_type {
Some(ref content_type) if !content_type.starts_with("text/") => {
normalized.extend(&data);
},
_ => {
decode_text_data_to_buf(
&data,
encoding.as_ref().map(String::as_str),
charset.as_ref().map(String::as_str),
&mut normalized);
}
};
},
Element::Verbatim{data} => {
normalized.extend(&data);
},
}
}
(normalized, fields)
}