
rfc2047_decoder/lexer/mod.rs

pub mod encoded_word;

use chumsky::{extra, prelude::*, text::whitespace, Parser};
use std::{collections::HashSet, fmt::Display};
use thiserror::Error;

use crate::decoder::RecoverStrategy;

use self::encoded_word::EncodedWord;

pub const QUESTION_MARK: u8 = b'?';
const SPACE: u8 = b' ';

/// A helper struct that wraps the encoded words which are too long as a
/// `Vec<String>` and implements [std::fmt::Display] for them.
///
/// # Example
/// ```
/// use rfc2047_decoder::{self, decode, RecoverStrategy, LexerError};
///
/// // the first and the third string are longer than 75 characters, so they
/// // are actually invalid encoded words
/// let message = concat![
///     "=?utf-8?B?bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb==?=",
///     "among us",
///     "=?utf-8?B?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa==?=",
/// ];
///
/// let result = decode(message).unwrap_err();
/// if let rfc2047_decoder::Error::Lexer(LexerError::ParseEncodedWordTooLongError(invalid_encoded_words)) = result {
///     assert_eq!(invalid_encoded_words.0[0], "=?utf-8?B?bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb==?=");
///     assert_eq!(invalid_encoded_words.0[1], "=?utf-8?B?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa==?=");
/// } else {
///     assert!(false);
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TooLongEncodedWords(pub Vec<String>);

impl TooLongEncodedWords {
    pub fn new(encoded_words: Vec<String>) -> Self {
        Self(encoded_words)
    }
}

impl Display for TooLongEncodedWords {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut message = String::new();

        if !self.0.is_empty() {
            message = self.0[0].clone();

            for encoded_word in self.0.iter().skip(1) {
                message.push_str(&format!(", {}", encoded_word));
            }
        }

        f.write_str(&message)
    }
}
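
// A minimal sketch of how `TooLongEncodedWords` renders: the entries are
// joined into a single comma-separated list, e.g.
//
//     let words = TooLongEncodedWords::new(vec!["=?a?=".into(), "=?b?=".into()]);
//     assert_eq!(words.to_string(), "=?a?=, =?b?=");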

/// All errors which the lexer can return.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum Error {
    #[error("cannot parse bytes into tokens: {0}")]
    ParseBytesError(String),
    #[error("cannot parse the following encoded words because they are too long: {0}")]
    ParseEncodedWordTooLongError(TooLongEncodedWords),
}

pub type Tokens = Vec<Token>;

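/// A single lexical unit of the input: either a run of bytes that is passed
/// through as-is, or one encoded word of the form
/// `=?charset?encoding?encoded-text?=`.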
#[derive(Debug, Clone, PartialEq, Hash, Eq)]
pub enum Token {
    ClearText(Vec<u8>),
    EncodedWord(EncodedWord),
}

impl Token {
    /// Returns the number of bytes the token holds.
    pub fn len(&self) -> usize {
        match self {
            Self::ClearText(clear_text) => clear_text.len(),
            Self::EncodedWord(encoded_word) => encoded_word.len(),
        }
    }
}

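/// Runs the lexer on the raw input bytes.
///
/// Every parser error is collected into a single [Error::ParseBytesError];
/// afterwards the resulting tokens are validated against the chosen
/// [RecoverStrategy], which may reject encoded words that are too long.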
pub fn run(encoded_bytes: &[u8], strategy: RecoverStrategy) -> Result<Tokens, Error> {
    let tokens = get_parser(strategy)
        .parse(encoded_bytes)
        .into_result()
        .map_err(|err| {
            let mut msg = String::new();

            if !err.is_empty() {
                for e in err {
                    msg.push_str(&format!("{}\n", e));
                }
            }

            Error::ParseBytesError(msg)
        })?;

    validate_tokens(tokens, strategy)
}
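
// A minimal usage sketch for `run`, mirroring the `encoded_from_1` test below:
//
//     let tokens = run(b"=?ISO-8859-1?Q?a?=", RecoverStrategy::Abort)?;
//     assert_eq!(tokens.len(), 1); // a single `Token::EncodedWord`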

fn get_parser<'src>(
    strategy: RecoverStrategy,
) -> impl Parser<'src, &'src [u8], Tokens, extra::Err<Simple<'src, u8>>> {
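    // An encoded word that is followed, apart from white space, by another
    // encoded word. The lookahead is `rewind()`-ed, so the next word stays in
    // the input for the following iteration of `repeated()`, while the
    // separating white space is consumed and dropped, as RFC 2047 requires.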
    let encoded_words_in_a_row = {
        let following_encoded_word =
            whitespace().ignore_then(encoded_word_parser(strategy).rewind());
        encoded_word_parser(strategy).then_ignore(following_encoded_word)
    };

    let single_encoded_word = encoded_word_parser(strategy);
    let single_clear_text = clear_text_parser(strategy);

    encoded_words_in_a_row
        .or(single_encoded_word)
        .or(single_clear_text)
        .repeated()
        .collect()
}

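/// Parses a run of arbitrary bytes into a [Token::ClearText].
///
/// Bytes are consumed up to (but not including) the start of the next encoded
/// word or the end of input; empty runs are rejected so that the surrounding
/// `repeated()` combinator always makes progress.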
fn clear_text_parser<'src>(
    skip_encoded_word_length: RecoverStrategy,
) -> impl Parser<'src, &'src [u8], Token, extra::Err<Simple<'src, u8>>> {
    any()
        .and_is(
            encoded_word_parser(skip_encoded_word_length)
                .rewind()
                .ignored()
                .or(end())
                .not(),
        )
        .repeated()
        .collect::<Vec<u8>>()
        .try_map(|chars, span| {
            if chars.is_empty() {
                Err(Simple::new(None, span))
            } else {
                Ok(Token::ClearText(chars))
            }
        })
}

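/// Parses a single encoded word of the form `=?charset?encoding?encoded-text?=`.
///
/// The charset and encoding tokens may not contain spaces, ASCII control
/// characters or especials; the encoded text may not contain `?` or spaces.
/// With [RecoverStrategy::Skip], an encoded word that exceeds the maximum
/// length is downgraded to [Token::ClearText].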
fn encoded_word_parser<'src>(
    skip_encoded_word_length: RecoverStrategy,
) -> impl Parser<'src, &'src [u8], Token, extra::Err<Simple<'src, u8>>> {
    let convert_to_token = move |encoded_word: EncodedWord| {
        if encoded_word.len() > encoded_word::MAX_LENGTH
            && skip_encoded_word_length == RecoverStrategy::Skip
        {
            Token::ClearText(encoded_word.get_bytes(true))
        } else {
            Token::EncodedWord(encoded_word)
        }
    };

    let is_especial = |c: u8| get_especials().contains(&c);

    let token = any().filter(move |&c: &u8| c != SPACE && !c.is_ascii_control() && !is_especial(c));
    let charset = token.repeated().at_least(1).collect::<Vec<u8>>();
    let encoding = token.repeated().at_least(1).collect::<Vec<u8>>();
    let encoded_text = any()
        .filter(|&c: &u8| c != QUESTION_MARK && c != SPACE)
        .repeated()
        .collect::<Vec<u8>>();

    just(encoded_word::PREFIX)
        .ignore_then(charset)
        .then_ignore(just(QUESTION_MARK))
        .then(encoding)
        .then_ignore(just(QUESTION_MARK))
        .then(encoded_text)
        .then_ignore(just(encoded_word::SUFFIX))
        .map(EncodedWord::from_parser)
        .map(convert_to_token)
}

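/// Returns the set of "especial" characters (cf. RFC 2047) which may not
/// appear inside the charset and encoding tokens of an encoded word.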
fn get_especials() -> HashSet<u8> {
    "()<>@,;:/[]?.=".bytes().collect()
}

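/// Rejects the token stream if it contains encoded words that are too long and
/// [RecoverStrategy::Abort] is selected; otherwise returns the tokens
/// unchanged.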
fn validate_tokens(tokens: Tokens, strategy: RecoverStrategy) -> Result<Tokens, Error> {
    if let Some(too_long_encoded_words) = get_too_long_encoded_words(&tokens, strategy) {
        return Err(Error::ParseEncodedWordTooLongError(too_long_encoded_words));
    }

    Ok(tokens)
}

fn get_too_long_encoded_words(
    tokens: &Tokens,
    strategy: RecoverStrategy,
) -> Option<TooLongEncodedWords> {
    let mut too_long_encoded_words: Vec<String> = Vec::new();

    for token in tokens.iter() {
        if let Token::EncodedWord(encoded_word) = token {
            if token.len() > encoded_word::MAX_LENGTH && strategy == RecoverStrategy::Abort {
                too_long_encoded_words.push(encoded_word.to_string());
            }
        }
    }

    if too_long_encoded_words.is_empty() {
        None
    } else {
        Some(TooLongEncodedWords::new(too_long_encoded_words))
    }
}

#[cfg(test)]
mod tests {
    use crate::{
        lexer::{encoded_word::EncodedWord, get_parser, run, Error, Token, TooLongEncodedWords},
        RecoverStrategy,
    };
    use chumsky::Parser;

    #[test]
    fn encoded_word() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?Yeet?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "Yeet".as_bytes().to_vec(),
            })]
        );
    }

    #[test]
    fn clear_text() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "I use Arch by the way".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::ClearText(
                "I use Arch by the way".as_bytes().to_vec()
            )]
        );
    }

    // The following examples are from the encoded-form table in section 8:
    // https://datatracker.ietf.org/doc/html/rfc2047#section-8
    #[test]
    fn encoded_from_1() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "a".as_bytes().to_vec()
            })]
        );
    }

    // see encoded_from_1
    #[test]
    fn encoded_from_2() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?= b".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::ClearText(" b".as_bytes().to_vec()),
            ]
        );
    }

    // see encoded_from_1
    #[test]
    fn encoded_from_3() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    /// Tests that the parser can parse multiple encoded words in a row.
    /// See: https://datatracker.ietf.org/doc/html/rfc2047#section-8
    #[test]
    fn multiple_encoded_words() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?= =?ISO-8859-1?Q?c?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "c".as_bytes().to_vec()
                })
            ]
        );
    }

    #[test]
    fn ignore_multiple_spaces_between_encoded_words() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message =
            "=?ISO-8859-1?Q?a?=                               =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    /// An encoded word with more than 75 chars should result in an error
    #[test]
    fn err_on_too_long_encoded_word() {
        // "=?" (2) + "ISO-8859-1" (10) + "?" (1) + "Q" (1) + "?" (1) + 'a' (60) + "?=" (2)
        // = 2 + 10 + 1 + 1 + 1 + 60 + 2
        // = 77 => too long
        let message =
            "=?ISO-8859-1?Q?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa?="
                .as_bytes();
        let parsed = run(message, RecoverStrategy::Abort);

        assert_eq!(
            parsed,
            Err(Error::ParseEncodedWordTooLongError(
                TooLongEncodedWords::new(vec![EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                        .as_bytes()
                        .to_vec()
                }
                .to_string()])
            ))
        );
    }

    #[test]
    fn encoded_word_has_especials() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1(?Q?a?=".as_bytes();
        let parsed = parser.parse(message).unwrap();

        assert_eq!(parsed, vec![Token::ClearText(message.to_vec())]);
    }
}