1pub mod encoded_word;
2
3use chumsky::{extra, prelude::*, text::whitespace, Parser};
4use std::{collections::HashSet, fmt::Display};
5use thiserror::Error;
6
7use crate::decoder::RecoverStrategy;
8
9use self::encoded_word::EncodedWord;
10
11pub const QUESTION_MARK: u8 = b'?';
12const SPACE: u8 = b' ';
13
/// The string representations of encoded words that exceeded the maximum
/// allowed length, collected for error reporting.
///
/// Used as the payload of `Error::ParseEncodedWordTooLongError`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TooLongEncodedWords(pub Vec<String>);
39
40impl TooLongEncodedWords {
41 pub fn new(encoded_words: Vec<String>) -> Self {
42 Self(encoded_words)
43 }
44}
45
46impl Display for TooLongEncodedWords {
47 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48 let mut message = String::new();
49
50 if !self.0.is_empty() {
51 message = self.0[0].clone();
52
53 for encoded_word in self.0.iter().skip(1) {
54 message.push_str(&format!(", {}", encoded_word));
55 }
56 }
57
58 f.write_str(&message)
59 }
60}
61
/// Errors the lexer can produce.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum Error {
    /// The byte stream could not be parsed into tokens; the payload holds the
    /// concatenated parser error messages (one per line).
    #[error("cannot parse bytes into tokens: {0}")]
    ParseBytesError(String),
    /// One or more encoded words exceeded the maximum allowed length.
    /// Only produced when the strategy is `RecoverStrategy::Abort`.
    #[error("Cannot parse the following encoded words, because they are too long: {0}")]
    ParseEncodedWordTooLongError(TooLongEncodedWords),
}
70
/// The sequence of tokens produced by the lexer.
pub type Tokens = Vec<Token>;

/// A lexed fragment of the input: either raw clear-text bytes or a parsed
/// encoded word of the form `=?charset?encoding?encoded-text?=`.
#[derive(Debug, Clone, PartialEq, Hash, Eq)]
pub enum Token {
    ClearText(Vec<u8>),
    EncodedWord(EncodedWord),
}
78
79impl Token {
80 pub fn len(&self) -> usize {
82 match self {
83 Self::ClearText(clear_text) => clear_text.len(),
84 Self::EncodedWord(encoded_word) => encoded_word.len(),
85 }
86 }
87}
88
89pub fn run(encoded_bytes: &[u8], strategy: RecoverStrategy) -> Result<Tokens, Error> {
90 let tokens = get_parser(strategy)
91 .parse(encoded_bytes)
92 .into_result()
93 .map_err(|err| {
94 let mut msg = String::new();
95
96 if !err.is_empty() {
97 for e in err {
98 msg.push_str(&format!("{}\n", e));
99 }
100 }
101
102 Error::ParseBytesError(msg)
103 })?;
104
105 validate_tokens(tokens, strategy)
106}
107
/// Builds the top-level token parser.
///
/// Alternative order matters: first try an encoded word that is directly
/// followed (apart from whitespace) by another encoded word — in that case
/// the separating whitespace is consumed and dropped — then a single
/// encoded word, and finally plain clear text.
fn get_parser<'src>(
    strategy: RecoverStrategy,
) -> impl Parser<'src, &'src [u8], Tokens, extra::Err<Simple<'src, u8>>> {
    let encoded_words_in_a_row = {
        // `rewind` makes this a pure lookahead: the *next* encoded word is
        // matched but not consumed, so it is lexed as its own token on the
        // following iteration of `repeated`.
        let following_encoded_word =
            whitespace().ignore_then(encoded_word_parser(strategy).rewind());
        encoded_word_parser(strategy).then_ignore(following_encoded_word)
    };

    let single_encoded_word = encoded_word_parser(strategy);
    let single_clear_text = clear_text_parser(strategy);

    encoded_words_in_a_row
        .or(single_encoded_word)
        .or(single_clear_text)
        .repeated()
        .collect()
}
126
/// Builds a parser that consumes bytes as clear text until the start of the
/// next encoded word, or the end of input, is reached.
fn clear_text_parser<'src>(
    skip_encoded_word_length: RecoverStrategy,
) -> impl Parser<'src, &'src [u8], Token, extra::Err<Simple<'src, u8>>> {
    any()
        // Accept any byte as long as we are NOT standing at the start of an
        // encoded word and NOT at the end of input. `rewind` keeps the
        // lookahead from consuming the encoded word itself.
        .and_is(
            encoded_word_parser(skip_encoded_word_length)
                .rewind()
                .ignored()
                .or(end())
                .not(),
        )
        .repeated()
        .collect::<Vec<u8>>()
        // Reject an empty match: at least one byte must have been consumed,
        // otherwise the enclosing `repeated()` could loop without progress.
        .try_map(|chars, span| {
            if chars.is_empty() {
                Err(Simple::new(None, span))
            } else {
                Ok(Token::ClearText(chars))
            }
        })
}
148
149fn encoded_word_parser<'src>(
150 skip_encoded_word_length: RecoverStrategy,
151) -> impl Parser<'src, &'src [u8], Token, extra::Err<Simple<'src, u8>>> {
152 let convert_to_token = move |encoded_word: EncodedWord| {
153 if encoded_word.len() > encoded_word::MAX_LENGTH
154 && skip_encoded_word_length == RecoverStrategy::Skip
155 {
156 Token::ClearText(encoded_word.get_bytes(true))
157 } else {
158 Token::EncodedWord(encoded_word)
159 }
160 };
161
162 let is_especial = |c: u8| get_especials().contains(&c);
163
164 let token = any().filter(move |&c: &u8| c != SPACE && !c.is_ascii_control() && !is_especial(c));
165 let charset = token.repeated().at_least(1).collect::<Vec<u8>>();
166 let encoding = token.repeated().at_least(1).collect::<Vec<u8>>();
167 let encoded_text = any()
168 .filter(|&c: &u8| c != QUESTION_MARK && c != SPACE)
169 .repeated()
170 .collect::<Vec<u8>>();
171
172 just(encoded_word::PREFIX)
173 .ignore_then(charset)
174 .then_ignore(just(QUESTION_MARK))
175 .then(encoding)
176 .then_ignore(just(QUESTION_MARK))
177 .then(encoded_text)
178 .then_ignore(just(encoded_word::SUFFIX))
179 .map(EncodedWord::from_parser)
180 .map(convert_to_token)
181}
182
/// Returns the set of "especial" bytes that may not appear in the charset or
/// encoding section of an encoded word.
fn get_especials() -> HashSet<u8> {
    b"()<>@,;:/[]?.=".iter().copied().collect()
}
186
187fn validate_tokens(tokens: Tokens, strategy: RecoverStrategy) -> Result<Tokens, Error> {
188 if let Some(too_long_encoded_words) = get_too_long_encoded_words(&tokens, strategy) {
189 return Err(Error::ParseEncodedWordTooLongError(too_long_encoded_words));
190 }
191
192 Ok(tokens)
193}
194
195fn get_too_long_encoded_words(
196 tokens: &Tokens,
197 strategy: RecoverStrategy,
198) -> Option<TooLongEncodedWords> {
199 let mut too_long_encoded_words: Vec<String> = Vec::new();
200
201 for token in tokens.iter() {
202 if let Token::EncodedWord(encoded_word) = token {
203 if token.len() > encoded_word::MAX_LENGTH && strategy == RecoverStrategy::Abort {
204 too_long_encoded_words.push(encoded_word.to_string());
205 }
206 }
207 }
208
209 if too_long_encoded_words.is_empty() {
210 None
211 } else {
212 Some(TooLongEncodedWords::new(too_long_encoded_words))
213 }
214}
215
#[cfg(test)]
mod tests {
    use crate::{
        lexer::{encoded_word::EncodedWord, get_parser, run, Error, Token, TooLongEncodedWords},
        RecoverStrategy,
    };
    use chumsky::Parser;

    // A single well-formed encoded word is lexed as one EncodedWord token.
    #[test]
    fn encoded_word() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?Yeet?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "Yeet".as_bytes().to_vec(),
            })]
        );
    }

    // Input without any encoded word becomes a single ClearText token.
    #[test]
    fn clear_text() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "I use Arch by the way".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::ClearText(
                "I use Arch by the way".as_bytes().to_vec()
            )]
        );
    }

    // Minimal encoded word: a one-byte encoded text.
    #[test]
    fn encoded_from_1() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "a".as_bytes().to_vec()
            })]
        );
    }

    // Clear text following an encoded word keeps its leading space.
    #[test]
    fn encoded_from_2() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?= b".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::ClearText(" b".as_bytes().to_vec()),
            ]
        );
    }

    // Whitespace between two consecutive encoded words is dropped.
    #[test]
    fn encoded_from_3() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    // Same as above, with three encoded words in a row.
    #[test]
    fn multiple_encoded_words() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?= =?ISO-8859-1?Q?c?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "c".as_bytes().to_vec()
                })
            ]
        );
    }

    // Runs of whitespace between encoded words are also dropped, not just a
    // single space.
    #[test]
    fn ignore_mutiple_spaces_between_encoded_words() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message =
            "=?ISO-8859-1?Q?a?=        =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    // With the Abort strategy an over-long encoded word becomes an error.
    // NOTE(review): both 'a'-runs below must contain the same number of
    // characters (and push the whole word past MAX_LENGTH) for the
    // comparison to hold.
    #[test]
    fn err_on_too_long_encoded_word() {
        let message =
            "=?ISO-8859-1?Q?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa?="
                .as_bytes();
        let parsed = run(message, RecoverStrategy::Abort);

        assert_eq!(
            parsed,
            Err(Error::ParseEncodedWordTooLongError(
                TooLongEncodedWords::new(vec![EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                        .as_bytes()
                        .to_vec()
                }
                .to_string()])
            ))
        );
    }

    // An especial character ('(') inside the charset section invalidates the
    // encoded word, so the whole input falls back to clear text.
    #[test]
    fn encoded_word_has_especials() {
        let parser = get_parser(RecoverStrategy::Abort);
        let message = "=?ISO-8859-1(?Q?a?=".as_bytes();
        let parsed = parser.parse(message).unwrap();

        assert_eq!(parsed, vec![Token::ClearText(message.to_vec())]);
    }
}