pub mod encoded_word;

use chumsky::{prelude::Simple, text::whitespace, Parser};
use std::{collections::HashSet, fmt::Display, result};
use thiserror::Error;

use crate::{decoder::RecoverStrategy, Decoder};

use self::encoded_word::EncodedWord;

/// Separator byte between the sections of an encoded word.
pub const QUESTION_MARK: u8 = b'?';
/// ASCII space; never allowed inside the charset/encoding/text sections.
const SPACE: u8 = b' ';
13
/// Newtype over the display forms of encoded words that exceeded the
/// maximum allowed length.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TooLongEncodedWords(pub Vec<String>);

impl TooLongEncodedWords {
    /// Wraps the given list of offending encoded words.
    pub fn new(encoded_words: Vec<String>) -> Self {
        Self(encoded_words)
    }
}
45
46impl Display for TooLongEncodedWords {
47 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48 let mut message = String::new();
49
50 if !self.0.is_empty() {
51 message = self.0[0].clone();
52
53 for encoded_word in self.0.iter().skip(1) {
54 message.push_str(&format!(", {}", encoded_word));
55 }
56 }
57
58 f.write_str(&message)
59 }
60}
61
/// Errors that can occur while lexing header bytes into tokens.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum Error {
    /// The chumsky parser failed on the raw input; carries the individual
    /// parse errors it produced.
    #[error("cannot parse bytes into tokens")]
    ParseBytesError(Vec<Simple<u8>>),
    /// One or more encoded words exceeded `encoded_word::MAX_LENGTH` while
    /// the decoder's recover strategy was `Abort`.
    #[error("Cannot parse the following encoded words, because they are too long: {0}")]
    ParseEncodedWordTooLongError(TooLongEncodedWords),
}
70
/// Convenience alias binding this module's [`Error`] type.
type Result<T> = result::Result<T, Error>;

/// The lexer's output: a flat sequence of tokens.
pub type Tokens = Vec<Token>;
74
/// A single lexed unit of the input.
#[derive(Debug, Clone, PartialEq, Hash, Eq)]
pub enum Token {
    /// Raw bytes that are not part of an encoded word.
    ClearText(Vec<u8>),
    /// A parsed encoded word (charset / encoding / encoded text triple).
    EncodedWord(EncodedWord),
}
80
81impl Token {
82 pub fn len(&self) -> usize {
84 match self {
85 Self::ClearText(clear_text) => clear_text.len(),
86 Self::EncodedWord(encoded_word) => encoded_word.len(),
87 }
88 }
89}
90
91pub fn run(encoded_bytes: &[u8], decoder: Decoder) -> Result<Tokens> {
92 let tokens = get_parser(&decoder)
93 .parse(encoded_bytes)
94 .map_err(Error::ParseBytesError)?;
95
96 validate_tokens(tokens, &decoder)
97}
98
99fn get_parser(decoder: &Decoder) -> impl Parser<u8, Tokens, Error = Simple<u8>> {
100 use chumsky::prelude::*;
101
102 let encoded_words_in_a_row = {
103 let following_encoded_word =
104 whitespace().ignore_then(encoded_word_parser(decoder).rewind());
105 encoded_word_parser(decoder).then_ignore(following_encoded_word)
106 };
107
108 let single_encoded_word = encoded_word_parser(decoder);
109 let single_clear_text = clear_text_parser(decoder);
110
111 encoded_words_in_a_row
112 .or(single_encoded_word)
113 .or(single_clear_text)
114 .repeated()
115}
116
117fn clear_text_parser(decoder: &Decoder) -> impl Parser<u8, Token, Error = Simple<u8>> {
118 use chumsky::prelude::*;
119
120 const DEFAULT_EMPTY_INPUT_ERROR_MESSAGE: &str = "got empty input";
121
122 take_until(encoded_word_parser(decoder).rewind().ignored().or(end())).try_map(
123 |(chars, ()), span| {
124 if chars.is_empty() {
125 Err(Simple::custom(span, DEFAULT_EMPTY_INPUT_ERROR_MESSAGE))
126 } else {
127 Ok(Token::ClearText(chars))
128 }
129 },
130 )
131}
132
133fn encoded_word_parser(decoder: &Decoder) -> impl Parser<u8, Token, Error = Simple<u8>> {
134 use chumsky::prelude::*;
135
136 let skip_encoded_word_length = decoder.too_long_encoded_word;
137
138 let convert_to_token = move |encoded_word: EncodedWord| {
139 if encoded_word.len() > encoded_word::MAX_LENGTH
140 && skip_encoded_word_length == RecoverStrategy::Skip
141 {
142 Token::ClearText(encoded_word.get_bytes(true))
143 } else {
144 Token::EncodedWord(encoded_word)
145 }
146 };
147
148 let is_especial = |c: u8| get_especials().contains(&c);
149
150 let token = filter(move |&c: &u8| c != SPACE && !c.is_ascii_control() && !is_especial(c));
151 let charset = token.repeated().at_least(1).collect::<Vec<u8>>();
152 let encoding = token.repeated().at_least(1).collect::<Vec<u8>>();
153 let encoded_text = filter(|&c: &u8| c != QUESTION_MARK && c != SPACE)
154 .repeated()
155 .collect::<Vec<u8>>();
156
157 just(encoded_word::PREFIX)
158 .ignore_then(charset)
159 .then_ignore(just(QUESTION_MARK))
160 .then(encoding)
161 .then_ignore(just(QUESTION_MARK))
162 .then(encoded_text)
163 .then_ignore(just(encoded_word::SUFFIX))
164 .map(EncodedWord::from_parser)
165 .map(convert_to_token)
166}
167
/// The set of "especial" bytes that may not appear inside the charset or
/// encoding sections of an encoded word.
fn get_especials() -> HashSet<u8> {
    let mut especials = HashSet::new();
    especials.extend(b"()<>@,;:/[]?.=".iter().copied());
    especials
}
171
172fn validate_tokens(tokens: Tokens, decoder: &Decoder) -> Result<Tokens> {
173 if let Some(too_long_encoded_words) = get_too_long_encoded_words(&tokens, decoder) {
174 return Err(Error::ParseEncodedWordTooLongError(too_long_encoded_words));
175 }
176
177 Ok(tokens)
178}
179
180fn get_too_long_encoded_words(tokens: &Tokens, decoder: &Decoder) -> Option<TooLongEncodedWords> {
181 let strategy = decoder.too_long_encoded_word;
182 let mut too_long_encoded_words: Vec<String> = Vec::new();
183
184 for token in tokens.iter() {
185 if let Token::EncodedWord(encoded_word) = token {
186 if token.len() > encoded_word::MAX_LENGTH && strategy == RecoverStrategy::Abort {
187 too_long_encoded_words.push(encoded_word.to_string());
188 }
189 }
190 }
191
192 if too_long_encoded_words.is_empty() {
193 None
194 } else {
195 Some(TooLongEncodedWords::new(too_long_encoded_words))
196 }
197}
198
#[cfg(test)]
mod tests {
    use crate::{
        lexer::{encoded_word::EncodedWord, run, Token},
        Decoder,
    };

    use super::{get_parser, Error, TooLongEncodedWords};
    use chumsky::Parser;

    // A single well-formed encoded word lexes into one EncodedWord token.
    #[test]
    fn encoded_word() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?Yeet?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "Yeet".as_bytes().to_vec(),
            })]
        );
    }

    // Input without any encoded word becomes a single ClearText token.
    #[test]
    fn clear_text() {
        let parser = get_parser(&Decoder::new());
        let message = "I use Arch by the way".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::ClearText(
                "I use Arch by the way".as_bytes().to_vec()
            )]
        );
    }

    // Minimal encoded word: a single-byte encoded text.
    #[test]
    fn encoded_from_1() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![Token::EncodedWord(EncodedWord {
                charset: "ISO-8859-1".as_bytes().to_vec(),
                encoding: "Q".as_bytes().to_vec(),
                encoded_text: "a".as_bytes().to_vec()
            })]
        );
    }

    // Trailing clear text after an encoded word keeps its leading space.
    #[test]
    fn encoded_from_2() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?= b".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::ClearText(" b".as_bytes().to_vec()),
            ]
        );
    }

    // The whitespace between two adjacent encoded words is dropped.
    #[test]
    fn encoded_from_3() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    // Three encoded words in a row all lex as EncodedWord tokens.
    #[test]
    fn multiple_encoded_words() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?= =?ISO-8859-1?Q?c?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "c".as_bytes().to_vec()
                })
            ]
        );
    }

    // A run of many spaces between encoded words is swallowed just like a
    // single space.
    #[test]
    fn ignore_mutiple_spaces_between_encoded_words() {
        let parser = get_parser(&Decoder::new());
        let message =
            "=?ISO-8859-1?Q?a?=                    =?ISO-8859-1?Q?b?=".as_bytes();

        let parsed = parser.parse(message).unwrap();

        assert_eq!(
            parsed,
            vec![
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "a".as_bytes().to_vec(),
                }),
                Token::EncodedWord(EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "b".as_bytes().to_vec()
                })
            ]
        );
    }

    // With the default decoder the Abort strategy applies, so an encoded
    // word over the maximum length makes `run` fail with
    // ParseEncodedWordTooLongError carrying the word's display form.
    #[test]
    fn err_on_too_long_encoded_word() {
        let message =
            "=?ISO-8859-1?Q?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa?="
                .as_bytes();
        let parsed = run(message, Decoder::new());

        assert_eq!(
            parsed,
            Err(Error::ParseEncodedWordTooLongError(
                TooLongEncodedWords::new(vec![EncodedWord {
                    charset: "ISO-8859-1".as_bytes().to_vec(),
                    encoding: "Q".as_bytes().to_vec(),
                    encoded_text: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                        .as_bytes()
                        .to_vec()
                }
                .to_string()])
            ))
        );
    }

    // An especial character in the charset section makes the encoded-word
    // branch fail, so the whole message falls back to clear text.
    #[test]
    fn encoded_word_has_especials() {
        let parser = get_parser(&Decoder::new());
        let message = "=?ISO-8859-1(?Q?a?=".as_bytes();
        let parsed = parser.parse(message).unwrap();

        assert_eq!(parsed, vec![Token::ClearText(message.to_vec())]);
    }
}