// rfc2047_decoder/lexer/mod.rs

1pub mod encoded_word;
2
3use chumsky::{prelude::Simple, text::whitespace, Parser};
4use std::{collections::HashSet, fmt::Display, result};
5use thiserror::Error;
6
7use crate::{decoder::RecoverStrategy, Decoder};
8
9use self::encoded_word::EncodedWord;
10
/// The `?` delimiter separating the fields of an encoded word
/// (`=?charset?encoding?text?=`).
pub const QUESTION_MARK: u8 = b'?';
/// ASCII space; treated as a token delimiter by the parsers below.
const SPACE: u8 = b' ';

/// A helper struct which implements [std::fmt::Display] for `Vec<String>` and
/// which contains the encoded words which are too long as a `String`.
///
/// # Example
/// ```
/// use rfc2047_decoder::{self, decode, RecoverStrategy, LexerError};
///
/// // the first string and the third string are more than 75 characters, hence
/// // they are actually invalid encoded words
/// let message = concat![
///     "=?utf-8?B?bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb==?=",
///     "among us",
///     "=?utf-8?B?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa==?=",
/// ];
///
/// let result = decode(message).unwrap_err();
/// if let rfc2047_decoder::Error::Lexer(LexerError::ParseEncodedWordTooLongError(invalid_encoded_words)) = result {
///     assert_eq!(invalid_encoded_words.0[0], "=?utf-8?B?bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb==?=");
///     assert_eq!(invalid_encoded_words.0[1], "=?utf-8?B?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa==?=");
/// } else {
///     assert!(false);
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TooLongEncodedWords(pub Vec<String>);
39
impl TooLongEncodedWords {
    /// Wraps the given list of raw (too long) encoded words.
    pub fn new(encoded_words: Vec<String>) -> Self {
        Self(encoded_words)
    }
}
45
46impl Display for TooLongEncodedWords {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        let mut message = String::new();
49
50        if !self.0.is_empty() {
51            message = self.0[0].clone();
52
53            for encoded_word in self.0.iter().skip(1) {
54                message.push_str(&format!(", {}", encoded_word));
55            }
56        }
57
58        f.write_str(&message)
59    }
60}
61
/// All errors which the lexer can throw.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum Error {
    /// The input bytes could not be tokenized; carries the raw chumsky
    /// parser errors.
    #[error("cannot parse bytes into tokens")]
    ParseBytesError(Vec<Simple<u8>>),
    /// One or more encoded words exceed `encoded_word::MAX_LENGTH`
    /// (75 characters, per RFC 2047) while the decoder's recover strategy
    /// requires aborting.
    #[error("Cannot parse the following encoded words, because they are too long: {0}")]
    ParseEncodedWordTooLongError(TooLongEncodedWords),
}
70
/// Convenience alias for lexer results.
type Result<T> = result::Result<T, Error>;

/// The sequence of tokens produced by the lexer.
pub type Tokens = Vec<Token>;

/// A single lexical unit of an RFC 2047 header value.
#[derive(Debug, Clone, PartialEq, Hash, Eq)]
pub enum Token {
    /// Raw bytes that are not part of any encoded word.
    ClearText(Vec<u8>),
    /// A parsed `=?charset?encoding?text?=` encoded word.
    EncodedWord(EncodedWord),
}
80
81impl Token {
82    /// Returns the amount of bytes which the token holds
83    pub fn len(&self) -> usize {
84        match self {
85            Self::ClearText(clear_text) => clear_text.len(),
86            Self::EncodedWord(encoded_word) => encoded_word.len(),
87        }
88    }
89}
90
91pub fn run(encoded_bytes: &[u8], decoder: Decoder) -> Result<Tokens> {
92    let tokens = get_parser(&decoder)
93        .parse(encoded_bytes)
94        .map_err(Error::ParseBytesError)?;
95
96    validate_tokens(tokens, &decoder)
97}
98
/// Builds the top-level parser which splits the input into a sequence of
/// encoded-word and clear-text tokens.
fn get_parser(decoder: &Decoder) -> impl Parser<u8, Tokens, Error = Simple<u8>> {
    use chumsky::prelude::*;

    // An encoded word that is followed — after any whitespace — by another
    // encoded word. The follower is only peeked at (`rewind`), so it remains
    // in the input for the next iteration; the whitespace between the two
    // words, however, IS consumed and thereby dropped. This matches the
    // RFC 2047 rule that whitespace between adjacent encoded words is
    // ignored (see the `multiple encoded words` tests below).
    let encoded_words_in_a_row = {
        let following_encoded_word =
            whitespace().ignore_then(encoded_word_parser(decoder).rewind());
        encoded_word_parser(decoder).then_ignore(following_encoded_word)
    };

    let single_encoded_word = encoded_word_parser(decoder);
    let single_clear_text = clear_text_parser(decoder);

    // Order matters: first try the whitespace-swallowing two-words form,
    // then a lone encoded word, and only then fall back to clear text.
    encoded_words_in_a_row
        .or(single_encoded_word)
        .or(single_clear_text)
        .repeated()
}
116
/// Builds a parser that collects raw bytes up to (but not including) the
/// start of the next encoded word, or up to the end of input.
fn clear_text_parser(decoder: &Decoder) -> impl Parser<u8, Token, Error = Simple<u8>> {
    use chumsky::prelude::*;

    const DEFAULT_EMPTY_INPUT_ERROR_MESSAGE: &str = "got empty input";

    // `rewind` makes sure the detected encoded word is NOT consumed here, so
    // it can still be lexed as its own token afterwards. An empty match is
    // rejected: a parser that succeeds on zero bytes would make the
    // surrounding `repeated()` in `get_parser` loop forever.
    take_until(encoded_word_parser(decoder).rewind().ignored().or(end())).try_map(
        |(chars, ()), span| {
            if chars.is_empty() {
                Err(Simple::custom(span, DEFAULT_EMPTY_INPUT_ERROR_MESSAGE))
            } else {
                Ok(Token::ClearText(chars))
            }
        },
    )
}
132
133fn encoded_word_parser(decoder: &Decoder) -> impl Parser<u8, Token, Error = Simple<u8>> {
134    use chumsky::prelude::*;
135
136    let skip_encoded_word_length = decoder.too_long_encoded_word;
137
138    let convert_to_token = move |encoded_word: EncodedWord| {
139        if encoded_word.len() > encoded_word::MAX_LENGTH
140            && skip_encoded_word_length == RecoverStrategy::Skip
141        {
142            Token::ClearText(encoded_word.get_bytes(true))
143        } else {
144            Token::EncodedWord(encoded_word)
145        }
146    };
147
148    let is_especial = |c: u8| get_especials().contains(&c);
149
150    let token = filter(move |&c: &u8| c != SPACE && !c.is_ascii_control() && !is_especial(c));
151    let charset = token.repeated().at_least(1).collect::<Vec<u8>>();
152    let encoding = token.repeated().at_least(1).collect::<Vec<u8>>();
153    let encoded_text = filter(|&c: &u8| c != QUESTION_MARK && c != SPACE)
154        .repeated()
155        .collect::<Vec<u8>>();
156
157    just(encoded_word::PREFIX)
158        .ignore_then(charset)
159        .then_ignore(just(QUESTION_MARK))
160        .then(encoding)
161        .then_ignore(just(QUESTION_MARK))
162        .then(encoded_text)
163        .then_ignore(just(encoded_word::SUFFIX))
164        .map(EncodedWord::from_parser)
165        .map(convert_to_token)
166}
167
/// Returns the set of "especial" bytes that may not occur inside the
/// charset/encoding tokens of an encoded word.
///
/// NOTE(review): RFC 2047 also lists `"` (double quote) among the
/// especials; it is absent here — confirm whether that is intentional.
fn get_especials() -> HashSet<u8> {
    let mut especials = HashSet::new();

    for byte in "()<>@,;:/[]?.=".bytes() {
        especials.insert(byte);
    }

    especials
}
171
172fn validate_tokens(tokens: Tokens, decoder: &Decoder) -> Result<Tokens> {
173    if let Some(too_long_encoded_words) = get_too_long_encoded_words(&tokens, decoder) {
174        return Err(Error::ParseEncodedWordTooLongError(too_long_encoded_words));
175    }
176
177    Ok(tokens)
178}
179
180fn get_too_long_encoded_words(tokens: &Tokens, decoder: &Decoder) -> Option<TooLongEncodedWords> {
181    let strategy = decoder.too_long_encoded_word;
182    let mut too_long_encoded_words: Vec<String> = Vec::new();
183
184    for token in tokens.iter() {
185        if let Token::EncodedWord(encoded_word) = token {
186            if token.len() > encoded_word::MAX_LENGTH && strategy == RecoverStrategy::Abort {
187                too_long_encoded_words.push(encoded_word.to_string());
188            }
189        }
190    }
191
192    if too_long_encoded_words.is_empty() {
193        None
194    } else {
195        Some(TooLongEncodedWords::new(too_long_encoded_words))
196    }
197}
198
#[cfg(test)]
mod tests {
    use crate::{
        lexer::{encoded_word::EncodedWord, run, Token},
        Decoder,
    };

    use super::{get_parser, Error, TooLongEncodedWords};
    use chumsky::Parser;

    #[test]
    fn encoded_word() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?Yeet?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "Yeet".as_bytes().to_vec(),
            })]
        );
    }

    #[test]
    fn clear_text() {
        let parser = get_parser(&Decoder::new());
        let message = "I use Arch by the way".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::ClearText(
                "I use Arch by the way".as_bytes().to_vec()
            )]
        );
    }

    // The following examples are from the encoded-form table in section 8:
    // https://datatracker.ietf.org/doc/html/rfc2047#section-8
    #[test]
    fn encoded_form_1() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "a".as_bytes().to_vec()
            })]
        );
    }

    // see encoded_form_1
    #[test]
    fn encoded_form_2() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?= b".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::ClearText(" b".as_bytes().to_vec()),
            ]
        );
    }

    // see encoded_form_1
    #[test]
    fn encoded_form_3() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    /// Test if parser can parse multiple encoded words in a row
    /// See: https://datatracker.ietf.org/doc/html/rfc2047#section-8
    #[test]
    fn multiple_encoded_words() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?= =?ISO-8859-1?Q?c?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "c".as_bytes().to_vec()
                })
            ]
        );
    }

    #[test]
    fn ignore_multiple_spaces_between_encoded_words() {
        let parser = get_parser(&Decoder::new());
        let message =
            "=?ISO-8859-1?Q?a?=                               =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    /// An encoded word with more than 75 chars should be rejected with an
    /// error (the lexer returns `Err`, it does not panic).
    #[test]
    fn err_on_too_long_encoded_word() {
        // "=?" (2) + "ISO-8859-1" (10) + "?" (1) + "Q" (1) + "?" (1) + 'a' (60) + "?=" (2)
        // = 2 + 10 + 1 + 1 + 1 + 60 + 2
        // = 77 => too long
        let message =
            "=?ISO-8859-1?Q?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa?="
                .as_bytes();
        let parsed = run(message, Decoder::new());

        assert_eq!(
            parsed,
            Err(Error::ParseEncodedWordTooLongError(
                TooLongEncodedWords::new(vec![EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                        .as_bytes()
                        .to_vec()
                }
                .to_string()])
            ))
        );
    }

    #[test]
    fn encoded_word_has_especials() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1(?Q?a?=".as_bytes();
        let parsed = parser.parse(message).unwrap();

        assert_eq!(parsed, vec![Token::ClearText(message.to_vec())]);
    }
}
396}