mailparse/
header.rs

1use charset::Charset;
2
3use crate::find_from;
4
/// Some types of tokens that might be present in a MIME header. This
/// list is incomplete relative to the types of tokens defined in the RFC,
/// but can be expanded as needed. Currently the list of tokens is
/// sufficient to properly handle encoded words and line unfolding.
pub enum HeaderToken<'a> {
    /// A bunch of not-encoded text. This can include whitespace and
    /// non-whitespace chars.
    Text(&'a str),
    /// A bunch of text that is purely whitespace.
    Whitespace(&'a str),
    /// An end-of-line marker. If it contains None, then it represents
    /// a raw CRLF that has not yet been line-unfolded. If it contains
    /// a string, that represents the whitespace that was produced
    /// around that CRLF during line unfolding. This may include whitespace
    /// from the end of the previous line.
    Newline(Option<String>),
    /// The decoded value of an encoded word found in the header. Both the
    /// transfer encoding and the declared charset have already been applied.
    DecodedWord(String),
}
24
/// Returns true if the character at byte offset `ix` of `line` is a
/// "boundary" for encoded-word detection purposes: whitespace or one of a
/// small set of RFC 2047 delimiter characters. A `None` index, an
/// out-of-range offset, or an offset falling inside a multi-byte character
/// are all conservatively treated as boundaries.
fn is_boundary(line: &str, ix: Option<usize>) -> bool {
    // Callers derive `ix` from byte-oriented searches (`find_from` /
    // `str::find`), so it must be interpreted as a byte offset. The previous
    // `chars().nth(v)` treated it as a char index, which inspected the wrong
    // character whenever multi-byte text preceded the offset. `str::get`
    // returns None for out-of-range or non-char-boundary offsets, which
    // `unwrap_or(true)` maps to "is a boundary".
    ix.and_then(|v| line.get(v..))
        .and_then(|tail| tail.chars().next())
        .map(|c| {
            c.is_whitespace()
                || c == '"'
                || c == '('
                || c == ')'
                || c == '<'
                || c == '>'
                || c == ','
        })
        .unwrap_or(true)
}
38
/// Attempts to decode the payload of an RFC 2047 encoded word. `encoded` is
/// the text between the leading "=?" and trailing "?=" markers, i.e.
/// "charset?transfer-coding?encoded-text". Returns None if the structure is
/// malformed, the transfer coding is unsupported, the payload fails to
/// decode, or the charset label is unknown.
fn decode_word(encoded: &str) -> Option<String> {
    // Locate the two '?' separators that split charset / coding / payload.
    let ix_delim1 = encoded.find('?')?;
    let ix_delim2 = find_from(encoded, ix_delim1 + 1, "?")?;

    let charset = &encoded[0..ix_delim1];
    let transfer_coding = &encoded[ix_delim1 + 1..ix_delim2];
    let input = &encoded[ix_delim2 + 1..];

    let decoded = match transfer_coding {
        // "B" encoding: MIME base64, decoded permissively.
        "B" | "b" => data_encoding::BASE64_MIME_PERMISSIVE
            .decode(input.as_bytes())
            .ok()?,
        // "Q" encoding: quoted-printable variant where '_' encodes a space.
        "Q" | "q" => {
            // The quoted_printable module does a trim_end on the input, so if
            // that affects the output we should save and restore the trailing
            // whitespace
            let to_decode = input.replace('_', " ");
            let trimmed = to_decode.trim_end();
            let mut d = quoted_printable::decode(trimmed, quoted_printable::ParseMode::Robust);
            if d.is_ok() && to_decode.len() != trimmed.len() {
                // Re-append the whitespace that trim_end removed.
                d.as_mut()
                    .unwrap()
                    .extend_from_slice(to_decode[trimmed.len()..].as_bytes());
            }
            d.ok()?
        }
        // Any other transfer coding is unsupported.
        _ => return None,
    };
    // Interpret the decoded bytes in the declared charset; an unrecognized
    // charset label yields None rather than silently substituting a default.
    let charset = Charset::for_label_no_replacement(charset.as_bytes())?;
    let (cow, _) = charset.decode_without_bom_handling(&decoded);
    Some(cow.into_owned())
}
71
/// Tokenizes a single line of the header and produces a vector of
/// tokens. Because this only processes a single line, it will never
/// generate `HeaderToken::Newline` tokens.
fn tokenize_header_line(line: &str) -> Vec<HeaderToken> {
    // Classifies a run of non-encoded text: all-whitespace spans become
    // Whitespace tokens, anything else becomes a Text token.
    fn maybe_whitespace(text: &str) -> HeaderToken {
        if text.trim_end().is_empty() {
            HeaderToken::Whitespace(text)
        } else {
            HeaderToken::Text(text)
        }
    }

    let mut result = Vec::new();
    // Byte offset at which the next scan for an encoded word begins.
    let mut ix_search = 0;
    loop {
        match find_from(line, ix_search, "=?") {
            Some(v) => {
                // Byte offset just past the "=?" opening marker.
                let ix_begin = v + 2;
                // An encoded word must be preceded by a boundary character
                // (or start-of-line). If it isn't, emit the text up to and
                // including the "=?" as plain text and keep scanning.
                if !is_boundary(line, ix_begin.checked_sub(3)) {
                    result.push(HeaderToken::Text(&line[ix_search..ix_begin]));
                    ix_search = ix_begin;
                    continue;
                }
                // Emit whatever preceded the "=?" marker.
                result.push(maybe_whitespace(&line[ix_search..ix_begin - 2]));
                let mut ix_end_search = ix_begin;
                loop {
                    match find_from(line, ix_end_search, "?=") {
                        Some(ix_end) => {
                            // The closing "?=" must be followed by a boundary
                            // character (or end-of-line); otherwise try the
                            // next "?=" further along.
                            if !is_boundary(line, ix_end.checked_add(2)) {
                                ix_end_search = ix_end + 2;
                                continue;
                            }
                            // Decode the "charset?coding?payload" between the
                            // markers; on failure, keep the raw text with the
                            // markers included.
                            match decode_word(&line[ix_begin..ix_end]) {
                                Some(v) => result.push(HeaderToken::DecodedWord(v)),
                                None => {
                                    result.push(HeaderToken::Text(&line[ix_begin - 2..ix_end + 2]));
                                }
                            };
                            ix_search = ix_end;
                        }
                        None => {
                            // No closing marker: treat the "=?" as literal
                            // text and resume scanning right after it
                            // (ix_begin - 2 + 2 below).
                            result.push(HeaderToken::Text("=?"));
                            ix_search = ix_begin - 2;
                        }
                    };
                    break;
                }
                // Skip past the two-byte marker just handled.
                ix_search += 2;
                continue;
            }
            None => {
                // No more encoded words; emit the remainder of the line.
                result.push(maybe_whitespace(&line[ix_search..]));
                break;
            }
        };
    }
    result
}
130
131/// Tokenize an entire header, including newlines. This includes
132/// decoded words, but doesn't do line unfolding, so any `HeaderToken::Newline`
133/// tokens will always have a `None` inner value. Whitespace preceding
134/// the newline will be in a separate `HeaderToken::Whitespace` or
135/// `HeaderToken::Text` token. Semantically the `HeaderToken::Newline`
136/// tokens that come out of this still represent the CRLF newline.
137fn tokenize_header(value: &str) -> Vec<HeaderToken> {
138    let mut tokens = Vec::new();
139    let mut lines = value.lines();
140    let mut first = true;
141    while let Some(line) = lines.next().map(str::trim_start) {
142        if first {
143            first = false;
144        } else {
145            tokens.push(HeaderToken::Newline(None));
146        }
147        let mut line_tokens = tokenize_header_line(line);
148        tokens.append(&mut line_tokens);
149    }
150    tokens
151}
152
/// Takes in a list of tokens and processes them to normalize the whitespace
/// per the RFC. This includes dropping any whitespace between two adjacent
/// encoded words, and also doing line unfolding. As a result, the `HeaderToken::Newline`
/// tokens that come out of this no longer represent the CRLF newline, but instead
/// their contained `Option<String>` will be populated with whatever whitespace gets
/// generated from unfolding the line. This might include end-of-line whitespace from
/// the previous line.
fn normalize_header_whitespace(tokens: Vec<HeaderToken>) -> Vec<HeaderToken> {
    let mut result = Vec::<HeaderToken>::new();

    // One token of lookbehind: holds either pending whitespace whose fate
    // depends on the next token, or a marker recording what kind of token
    // came last.
    let mut saved_token = None;
    // See RFC 2047 section 6.2 for what's going on here. Basically whitespace
    // that's between two adjacent encoded words should be thrown away.
    for tok in tokens {
        match &tok {
            HeaderToken::Text(_) => {
                // If we saved some whitespace, put it in since we encountered
                // non-whitespace chars that weren't part of an encoded word.
                if let Some(HeaderToken::Whitespace(_)) = &saved_token {
                    result.push(saved_token.unwrap());
                } else if let Some(HeaderToken::Newline(Some(_))) = &saved_token {
                    result.push(saved_token.unwrap());
                }
                // Also put the actual non-whitespace chars.
                result.push(tok);
                saved_token = None;
            }
            HeaderToken::Whitespace(_) => {
                // If the previous token was an encoded word, save the whitespace
                // as whitespace that's between two encoded words should be dropped.
                // We only know if this whitespace goes into `result` after parsing
                // the next token.
                if let Some(HeaderToken::DecodedWord(_)) = saved_token {
                    saved_token = Some(tok);
                } else {
                    result.push(tok);
                    saved_token = None;
                }
            }
            HeaderToken::Newline(_) => {
                // If we saved whitespace at the end of the line, add an extra space
                // to it from the line unfolding.
                if let Some(HeaderToken::Whitespace(ws)) = saved_token {
                    let new_ws = ws.to_owned() + " ";
                    saved_token = Some(HeaderToken::Newline(Some(new_ws)));
                // If the end of the line had an encoded word, save the space from
                // line unfolding.
                } else if let Some(HeaderToken::DecodedWord(_)) = saved_token {
                    saved_token = Some(HeaderToken::Newline(Some(" ".to_string())));
                } else {
                    result.push(HeaderToken::Newline(Some(" ".to_string())));
                    saved_token = None;
                }
            }
            HeaderToken::DecodedWord(_) => {
                // Note that saved_token might be a whitespace thing here. But we
                // throw it away because that means it fell between two adjacent
                // encoded words.
                // The empty-string DecodedWord is only a marker meaning "the
                // previous token was an encoded word"; its contents are never
                // read.
                saved_token = Some(HeaderToken::DecodedWord(String::new()));
                result.push(tok);
            }
        }
    }
    result
}
218
219pub fn normalized_tokens(raw_value: &str) -> Vec<HeaderToken> {
220    normalize_header_whitespace(tokenize_header(raw_value))
221}
222
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_boundary_multibyte() {
        // Regression check for bug #26 (incorrect unwrap() guard in
        // is_boundary()): six REPLACEMENT CHARACTERs span 18 bytes, so an
        // index of 8 is in range byte-wise but must still be handled safely.
        let input = "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}";
        assert!(is_boundary(input, Some(8)));
    }
}