// mailparse/header.rs — MIME header tokenization: RFC 2047 encoded-word decoding and line unfolding.
use charset::Charset;

use crate::find_from;
/// Token kinds that may appear while scanning a MIME header value.
/// Only a subset of the token grammar from the RFC is modelled here --
/// just enough to support RFC 2047 encoded-word handling and line
/// unfolding; further variants can be added as the need arises.
pub enum HeaderToken<'a> {
    /// A run of not-encoded text; may mix whitespace and
    /// non-whitespace characters.
    Text(&'a str),
    /// A run consisting purely of whitespace characters.
    Whitespace(&'a str),
    /// An end-of-line marker. `None` means a raw CRLF that has not yet
    /// been line-unfolded; `Some(ws)` holds the whitespace produced
    /// around that CRLF during unfolding, which may include whitespace
    /// carried over from the end of the previous line.
    Newline(Option<String>),
    /// The decoded value of an encoded word found in the header.
    DecodedWord(String),
}
24
/// Returns true if the character at byte offset `ix` of `line` is a
/// "boundary" character for encoded-word purposes (whitespace or one of
/// `"`, `(`, `)`, `<`, `>`, `,`), or if there is no character there at
/// all (`ix` is `None`, at/past the end of the line, or not on a UTF-8
/// character boundary) -- start/end of line counts as a boundary.
///
/// Callers derive `ix` from substring searches, i.e. it is a *byte*
/// offset. The previous implementation used `line.chars().nth(v)`,
/// which treats `v` as a *char* index; the two diverge as soon as the
/// line contains multibyte characters, so the lookup is done by byte
/// offset here. Offsets that land inside a multibyte character resolve
/// to `None` and therefore `true`, matching the old out-of-range
/// behavior (see the multibyte regression test for bug #26).
fn is_boundary(line: &str, ix: Option<usize>) -> bool {
    ix.and_then(|v| line.get(v..))
        .and_then(|tail| tail.chars().next())
        .map(|c| c.is_whitespace() || matches!(c, '"' | '(' | ')' | '<' | '>' | ','))
        .unwrap_or(true)
}
38
39fn decode_word(encoded: &str) -> Option<String> {
40 let ix_delim1 = encoded.find('?')?;
41 let ix_delim2 = find_from(encoded, ix_delim1 + 1, "?")?;
42
43 let charset = &encoded[0..ix_delim1];
44 let transfer_coding = &encoded[ix_delim1 + 1..ix_delim2];
45 let input = &encoded[ix_delim2 + 1..];
46
47 let decoded = match transfer_coding {
48 "B" | "b" => data_encoding::BASE64_MIME_PERMISSIVE
49 .decode(input.as_bytes())
50 .ok()?,
51 "Q" | "q" => {
52 // The quoted_printable module does a trim_end on the input, so if
53 // that affects the output we should save and restore the trailing
54 // whitespace
55 let to_decode = input.replace('_', " ");
56 let trimmed = to_decode.trim_end();
57 let mut d = quoted_printable::decode(trimmed, quoted_printable::ParseMode::Robust);
58 if d.is_ok() && to_decode.len() != trimmed.len() {
59 d.as_mut()
60 .unwrap()
61 .extend_from_slice(to_decode[trimmed.len()..].as_bytes());
62 }
63 d.ok()?
64 }
65 _ => return None,
66 };
67 let charset = Charset::for_label_no_replacement(charset.as_bytes())?;
68 let (cow, _) = charset.decode_without_bom_handling(&decoded);
69 Some(cow.into_owned())
70}
71
/// Tokenizes a single line of the header and produces a vector of
/// tokens. Because this only processes a single line, it will never
/// generate `HeaderToken::Newline` tokens.
fn tokenize_header_line(line: &str) -> Vec<HeaderToken> {
    // Classify a raw slice: pure-whitespace runs become Whitespace,
    // anything else becomes Text.
    fn maybe_whitespace(text: &str) -> HeaderToken {
        if text.trim_end().is_empty() {
            HeaderToken::Whitespace(text)
        } else {
            HeaderToken::Text(text)
        }
    }

    let mut result = Vec::new();
    // Byte offset of the next unconsumed position in `line`.
    let mut ix_search = 0;
    loop {
        match find_from(line, ix_search, "=?") {
            Some(v) => {
                // ix_begin points just past the "=?" opener.
                let ix_begin = v + 2;
                // An encoded word must be preceded by a boundary char
                // (or start of line, via checked_sub underflow -> None);
                // otherwise this "=?" is plain text, consume through it
                // and keep scanning.
                if !is_boundary(line, ix_begin.checked_sub(3)) {
                    result.push(HeaderToken::Text(&line[ix_search..ix_begin]));
                    ix_search = ix_begin;
                    continue;
                }
                // Emit everything between the last consumed position
                // and the "=?" marker.
                result.push(maybe_whitespace(&line[ix_search..ix_begin - 2]));
                let mut ix_end_search = ix_begin;
                loop {
                    match find_from(line, ix_end_search, "?=") {
                        Some(ix_end) => {
                            // The closing "?=" must be followed by a
                            // boundary char (or end of line); otherwise
                            // keep looking for the real terminator.
                            if !is_boundary(line, ix_end.checked_add(2)) {
                                ix_end_search = ix_end + 2;
                                continue;
                            }
                            // Decode the interior; on failure keep the
                            // entire "=?...?=" span as literal text.
                            match decode_word(&line[ix_begin..ix_end]) {
                                Some(v) => result.push(HeaderToken::DecodedWord(v)),
                                None => {
                                    result.push(HeaderToken::Text(&line[ix_begin - 2..ix_end + 2]));
                                }
                            };
                            // Resume at the "?=" (the += 2 below skips it).
                            ix_search = ix_end;
                        }
                        None => {
                            // No terminator anywhere: emit the opener as
                            // text and resume right after it (ix_begin - 2
                            // plus the += 2 below lands on ix_begin).
                            result.push(HeaderToken::Text("=?"));
                            ix_search = ix_begin - 2;
                        }
                    };
                    break;
                }
                // Skip the two marker chars we stopped on.
                ix_search += 2;
                continue;
            }
            None => {
                // No more "=?" markers; emit the remainder of the line.
                result.push(maybe_whitespace(&line[ix_search..]));
                break;
            }
        };
    }
    result
}
130
131/// Tokenize an entire header, including newlines. This includes
132/// decoded words, but doesn't do line unfolding, so any `HeaderToken::Newline`
133/// tokens will always have a `None` inner value. Whitespace preceding
134/// the newline will be in a separate `HeaderToken::Whitespace` or
135/// `HeaderToken::Text` token. Semantically the `HeaderToken::Newline`
136/// tokens that come out of this still represent the CRLF newline.
137fn tokenize_header(value: &str) -> Vec<HeaderToken> {
138 let mut tokens = Vec::new();
139 let mut lines = value.lines();
140 let mut first = true;
141 while let Some(line) = lines.next().map(str::trim_start) {
142 if first {
143 first = false;
144 } else {
145 tokens.push(HeaderToken::Newline(None));
146 }
147 let mut line_tokens = tokenize_header_line(line);
148 tokens.append(&mut line_tokens);
149 }
150 tokens
151}
152
/// Takes in a list of tokens and processes them to normalize the whitespace
/// per the RFC. This includes dropping any whitespace between two adjacent
/// encoded words, and also doing line unfolding. As a result, the `HeaderToken::Newline`
/// tokens that come out of this no longer represent the CRLF newline, but instead
/// their contained `Option<String>` will be populated with whatever whitespace gets
/// generated from unfolding the line. This might include end-of-line whitespace from
/// the previous line.
fn normalize_header_whitespace(tokens: Vec<HeaderToken>) -> Vec<HeaderToken> {
    let mut result = Vec::<HeaderToken>::new();

    // One-token lookbehind: holds a token whose fate (emit vs. drop)
    // depends on what the *next* token turns out to be.
    let mut saved_token = None;
    // See RFC 2047 section 6.2 for what's going on here. Basically whitespace
    // that's between two adjacent encoded words should be thrown away.
    for tok in tokens {
        match &tok {
            HeaderToken::Text(_) => {
                // If we saved some whitespace, put it in since we encountered
                // non-whitespace chars that weren't part of an encoded word.
                if let Some(HeaderToken::Whitespace(_)) = &saved_token {
                    result.push(saved_token.unwrap());
                } else if let Some(HeaderToken::Newline(Some(_))) = &saved_token {
                    result.push(saved_token.unwrap());
                }
                // Also put the actual non-whitespace chars.
                result.push(tok);
                saved_token = None;
            }
            HeaderToken::Whitespace(_) => {
                // If the previous token was an encoded word, save the whitespace
                // as whitespace that's between two encoded words should be dropped.
                // We only know if this whitespace goes into `result` after parsing
                // the next token.
                if let Some(HeaderToken::DecodedWord(_)) = saved_token {
                    saved_token = Some(tok);
                } else {
                    result.push(tok);
                    saved_token = None;
                }
            }
            HeaderToken::Newline(_) => {
                // If we saved whitespace at the end of the line, add an extra space
                // to it from the line unfolding.
                if let Some(HeaderToken::Whitespace(ws)) = saved_token {
                    let new_ws = ws.to_owned() + " ";
                    saved_token = Some(HeaderToken::Newline(Some(new_ws)));
                // If the end of the line had an encoded word, save the space from
                // line unfolding.
                } else if let Some(HeaderToken::DecodedWord(_)) = saved_token {
                    saved_token = Some(HeaderToken::Newline(Some(" ".to_string())));
                } else {
                    result.push(HeaderToken::Newline(Some(" ".to_string())));
                    saved_token = None;
                }
            }
            HeaderToken::DecodedWord(_) => {
                // Note that saved_token might be a whitespace thing here. But we
                // throw it away because that means it fell between two adjacent
                // encoded words.
                // (Only the variant matters for the lookbehind, so an empty
                // placeholder string is saved rather than cloning the word.)
                saved_token = Some(HeaderToken::DecodedWord(String::new()));
                result.push(tok);
            }
        }
    }
    result
}
218
219pub fn normalized_tokens(raw_value: &str) -> Vec<HeaderToken> {
220 normalize_header_whitespace(tokenize_header(raw_value))
221}
222
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for bug #26: `is_boundary` must not panic (or
    /// misbehave) when handed a byte offset that doesn't correspond to
    /// a character in a multibyte string.
    #[test]
    fn test_is_boundary_multibyte() {
        // Six REPLACEMENT CHARACTER chars, but 18 bytes of UTF-8 data.
        let multibyte = "\u{FFFD}".repeat(6);
        assert!(is_boundary(&multibyte, Some(8)));
    }
}
234}