1use regex::bytes::{Captures, NoExpand, Regex};
6use thiserror::Error;
7
8use crate::charset::Charset;
9use crate::defects::Defect;
10
lazy_static::lazy_static! {
    // Matches every underscore; `decode_q` replaces each match with a space.
    static ref Q_BYTE_RE_1: Regex = Regex::new(r"(_)").unwrap();
    // Matches an `=XX` escape and captures its two hexadecimal digits
    // (either case) so `decode_q` can turn them back into a single byte.
    static ref Q_BYTE_RE_2: Regex = Regex::new(r"=([a-fA-F0-9]{2})").unwrap();
}
19
20fn decode_q<T: AsRef<[u8]>>(encoded: T) -> Vec<u8> {
21 let one = Q_BYTE_RE_1.replace_all(encoded.as_ref(), NoExpand(b" "));
22 Q_BYTE_RE_2
23 .replace_all(one.as_ref(), |caps: &Captures| {
24 hex::decode(caps[1].as_ref()).expect("invalid regex capture")
25 })
26 .to_vec()
27}
28
/// Writes the "q" encoding of a single byte to `writer`.
///
/// A space is written as `_`; ASCII letters, digits and the characters
/// `-!*+/` are written as themselves; every other byte is written as `=XX`
/// with two upper-case hexadecimal digits.
fn write_q_byte<T: std::fmt::Write>(mut writer: T, byte: u8) -> std::fmt::Result {
    if byte == b' ' {
        return writer.write_char('_');
    }

    let passes_through = byte.is_ascii_alphanumeric() || b"-!*+/".contains(&byte);
    if passes_through {
        writer.write_char(char::from(byte))
    } else {
        write!(writer, "={:02X}", byte)
    }
}
38
39fn encode_q<T: AsRef<[u8]>>(bstring: T) -> String {
40 let mut out = String::with_capacity(bstring.as_ref().len());
41
42 for byte in bstring.as_ref() {
43 write_q_byte(&mut out, *byte).expect("String writes always succeed");
44 }
45
46 out
47}
48
49fn len_q<T: AsRef<[u8]>>(bstring: T) -> usize {
50 bstring.as_ref().iter().copied().map(len_q_byte).sum()
51}
52
/// Returns how many characters one byte occupies once "q"-encoded: 1 for a
/// space, an ASCII letter or digit, or one of `-!*+/`; 3 (`=XX`) otherwise.
fn len_q_byte(byte: u8) -> usize {
    let single_char =
        byte == b' ' || byte.is_ascii_alphanumeric() || b"-!*+/".contains(&byte);

    if single_char {
        1
    } else {
        3
    }
}
60
61fn decode_b<T: AsRef<[u8]>>(encoded: T) -> (Vec<u8>, Vec<Defect>) {
64 let mut defects = Vec::new();
65
66 let config =
67 base64::Config::new(base64::CharacterSet::Standard, true).decode_allow_trailing_bits(true);
68 match base64::decode_config(&encoded, config) {
70 Ok(decoded) => {
71 let pad_err = encoded.as_ref().len() % 4;
72 if pad_err > 0 {
73 defects.push(Defect::InvalidBase64Padding);
74 }
75
76 (decoded, defects)
77 }
78 Err(err) => match err {
79 base64::DecodeError::InvalidByte(_offset, byte) => {
80 defects.push(Defect::InvalidBase64Characters { byte });
81
82 let encoded: Vec<u8> = encoded
84 .as_ref()
85 .iter()
86 .copied()
87 .filter(|b| match b {
88 0..=42 => false,
89 43 => true,
90 44..=46 => false,
91 47..=57 => true,
92 58..=64 => false,
93 65..=90 => true,
94 91..=96 => false,
95 97..=122 => true,
96 _ => false,
97 })
98 .collect();
99
100 if encoded.len() % 4 > 0 {
101 defects.push(Defect::InvalidBase64Padding);
102 }
103
104 match base64::decode_config(&encoded, config) {
105 Ok(decoded) => (decoded, defects),
106 Err(_err) => {
107 (encoded.to_vec(), defects)
109 }
110 }
111 }
112 base64::DecodeError::InvalidLastSymbol(_offset, _byte) => {
113 unreachable!("config disables this error");
114 }
115 base64::DecodeError::InvalidLength => {
116 defects.push(Defect::InvalidBase64Length);
118 (encoded.as_ref().to_vec(), defects)
119 }
120 },
121 }
122}
123
124fn encode_b<T: AsRef<[u8]>>(bstring: T) -> String {
125 base64::encode(&bstring)
126}
127
/// Returns the length of the padded base64 encoding of `bstring`, without
/// building the encoded string.
fn len_b<T: AsRef<[u8]>>(bstring: T) -> usize {
    let byte_count = bstring.as_ref().len();

    // Each started group of three input bytes yields four output characters;
    // a partial final group is padded out to a full four.
    let has_partial_group = byte_count % 3 != 0;
    (byte_count / 3 + usize::from(has_partial_group)) * 4
}
137
/// The outcome of successfully decoding an encoded word with [`decode`].
#[derive(Debug, Clone, PartialEq)]
pub struct DecodingResult {
    /// The decoded text.
    pub decoded: String,
    /// The charset the payload was decoded with. This is `Charset::Ascii`
    /// when the charset named in the encoded word was not recognised.
    pub charset: Charset,
    /// The language tag that followed `*` in the charset section, or an empty
    /// string when there was none.
    pub lang: String,
    /// Recoverable problems noticed while decoding, in the order they were
    /// recorded.
    pub defects: Vec<Defect>,
}
146
/// Errors that prevent an encoded word from being decoded at all.
#[derive(Clone, Debug, Error, PartialEq, Eq)]
pub enum DecodingError {
    /// The input lacks the `?`-separated charset, encoding and text sections,
    /// or names an encoding other than "q" or "b".
    #[error("Malformed input")]
    MalformedInput,
    /// A charset label was not recognised.
    ///
    /// NOTE(review): `decode` in this file does not construct this variant; it
    /// records a `Defect::InvalidCharset` and falls back to ASCII instead.
    /// Confirm whether other code still relies on it.
    #[error("Unknown charset {}", charset)]
    UnknownCharset { charset: String },
}
154
155pub fn decode<T: AsRef<str>>(ew: T) -> Result<DecodingResult, DecodingError> {
172 let mut split = ew.as_ref().split('?');
173 let _ = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
174 let charset = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
175 let cte = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
176 let cte_string = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
177
178 let (charset, lang) = if let Some(index) = charset.find('*') {
179 let (charset, lang) = charset.split_at(index);
180 (charset, &lang[1..])
181 } else {
182 (charset, "")
183 };
184
185 let mut defects = Vec::new();
186
187 let charset = if charset == "latin-1" {
188 Charset::for_label(b"latin1").unwrap()
190 } else {
191 match Charset::for_label(charset.as_bytes()) {
192 Some(c) => c,
193 None => {
194 if charset != "unknown-8bit" {
195 defects.push(Defect::InvalidCharset {
196 charset: charset.into(),
197 })
198 }
199 Charset::Ascii
200 }
201 }
202 };
203
204 let cte = cte.to_lowercase();
205
206 let (bstring, has_invalid_ascii) = Charset::Ascii.encode(cte_string);
208 if has_invalid_ascii {
209 defects.push(Defect::UndecodableBytes);
210 }
211 let (bstring, new_defects) = match cte.as_str() {
212 "q" => (decode_q(bstring), Vec::new()),
213 "b" => decode_b(bstring),
214 _ => return Err(DecodingError::MalformedInput),
215 };
216 defects.extend_from_slice(&new_defects);
217
218 let (decoded, has_invalid_bytes) = charset.decode_without_bom_handling(&bstring);
220
221 if has_invalid_bytes {
222 defects.push(Defect::UndecodableBytes);
223 }
224
225 Ok(DecodingResult {
226 decoded: decoded.into(),
227 charset,
228 lang: lang.into(),
229 defects,
230 })
231}
232
/// Tells [`encode`] how to pick the content transfer encoding.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum EncodingFlag {
    /// Always use the "q" encoding.
    QuotedPrintable,
    /// Always use the "b" (base64) encoding.
    Base64,
    /// Compare the encoded lengths and pick one automatically; "q" is
    /// preferred unless it is at least five characters longer than "b".
    Shortest,
}
242
/// A concrete content transfer encoding for an encoded word.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Encoding {
    /// The "q" encoding, written as `q` in an encoded word.
    QuotedPrintable,
    /// The "b" (base64) encoding, written as `b` in an encoded word.
    Base64,
}
248
249impl Encoding {
250 pub fn decode<T: AsRef<[u8]>>(self, ew: T) -> (Vec<u8>, Vec<Defect>) {
251 match self {
252 Encoding::QuotedPrintable => (decode_q(ew), Vec::new()),
253 Encoding::Base64 => decode_b(ew),
254 }
255 }
256
257 pub fn encode<T: AsRef<[u8]>>(self, bstring: T) -> String {
258 match self {
259 Encoding::QuotedPrintable => encode_q(bstring),
260 Encoding::Base64 => encode_b(bstring),
261 }
262 }
263 pub fn char(self) -> char {
264 match self {
265 Encoding::QuotedPrintable => 'q',
266 Encoding::Base64 => 'b',
267 }
268 }
269}
270
271pub fn encode<T: AsRef<str>>(
284 ew: T,
285 charset: Option<Charset>,
286 encoding_flag: EncodingFlag,
287 lang: Option<&str>,
288) -> String {
289 let charset = charset.unwrap_or_else(|| Charset::Encoding(encoding_rs::UTF_8));
291 let (bstring, _) = charset.encode(ew.as_ref());
292
293 let encoding = match encoding_flag {
294 EncodingFlag::Base64 => Encoding::Base64,
295 EncodingFlag::QuotedPrintable => Encoding::QuotedPrintable,
296 EncodingFlag::Shortest => {
297 let q_len = len_q(&bstring);
298 let b_len = len_b(&bstring);
299
300 if q_len as isize - (b_len as isize) < 5 {
302 Encoding::QuotedPrintable
303 } else {
304 Encoding::Base64
305 }
306 }
307 };
308
309 let encoded = encoding.encode(&bstring);
310 if let Some(lang) = lang {
311 format!(
312 "=?{}*{}?{}?{}?=",
313 charset.name().to_lowercase(),
314 lang,
315 encoding.char(),
316 encoded
317 )
318 } else {
319 format!(
320 "=?{}?{}?{}?=",
321 charset.name().to_lowercase(),
322 encoding.char(),
323 encoded
324 )
325 }
326}
327
/// Unit tests for the "q"/"b" helpers and the public `decode`/`encode` API.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- decode_q ----

    #[test]
    fn test_decode_q_no_encoded() {
        assert_eq!(&decode_q(b"foobar"), b"foobar");
    }

    #[test]
    fn test_decode_q_spaces() {
        assert_eq!(&decode_q(b"foo=20bar=20"), b"foo bar ");
        assert_eq!(&decode_q(b"foo_bar_"), b"foo bar ");
    }

    #[test]
    fn test_decode_q_encoded() {
        // Two `=20` escapes decode to two spaces.
        assert_eq!(&decode_q(b"foo=20=20=21=2Cbar"), b"foo  !,bar");
    }

    // ---- decode_b ----

    #[test]
    fn test_decode_b_simple() {
        assert_eq!(decode_b(b"Zm9v"), (b"foo".to_vec(), Vec::new()));
    }

    #[test]
    fn test_decode_b_missing_padding() {
        // One padding character missing.
        assert_eq!(
            decode_b(b"dmk"),
            (b"vi".to_vec(), vec![Defect::InvalidBase64Padding])
        );
        // Two padding characters missing.
        assert_eq!(
            decode_b(b"dg"),
            (b"v".to_vec(), vec![Defect::InvalidBase64Padding])
        );
    }

    #[test]
    fn test_decode_b_invalid_character() {
        assert_eq!(
            decode_b(b"dm\x01k==="),
            (
                b"vi".to_vec(),
                vec![
                    Defect::InvalidBase64Characters { byte: b'\x01' },
                    Defect::InvalidBase64Padding
                ]
            )
        );
    }

    #[test]
    fn test_decode_b_invalid_character_and_bad_padding() {
        assert_eq!(
            decode_b(b"dm\x01k"),
            (
                b"vi".to_vec(),
                vec![
                    Defect::InvalidBase64Characters { byte: b'\x01' },
                    Defect::InvalidBase64Padding
                ]
            )
        );
    }

    #[test]
    fn test_decode_b_invalid_length() {
        // Undecodable input is handed back unchanged.
        assert_eq!(
            decode_b(b"abcde"),
            (b"abcde".to_vec(), vec![Defect::InvalidBase64Length])
        );
    }

    // ---- decode ----

    #[test]
    fn test_decode_wrong_format_input() {
        assert_eq!(decode("=?badone?="), Err(DecodingError::MalformedInput));
        assert_eq!(decode("=?"), Err(DecodingError::MalformedInput));
        assert_eq!(decode(""), Err(DecodingError::MalformedInput));
        assert_eq!(
            decode("=?utf-9?X?somevalue?="),
            Err(DecodingError::MalformedInput)
        );
    }

    #[test]
    fn test_decode_simple_q() {
        assert_eq!(
            decode("=?us-ascii?q?foo?=").unwrap(),
            DecodingResult {
                decoded: "foo".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: Vec::new(),
            }
        );
    }

    #[test]
    fn test_decode_simple_b() {
        assert_eq!(
            decode("=?us-ascii?b?dmk=?=").unwrap(),
            DecodingResult {
                decoded: "vi".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: Vec::new(),
            }
        );
    }

    #[test]
    fn test_decode_case_ignored_q() {
        assert_eq!(
            decode("=?us-ascii?Q?foo?=").unwrap(),
            DecodingResult {
                decoded: "foo".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: Vec::new(),
            }
        );
    }

    #[test]
    fn test_decode_case_ignored_b() {
        assert_eq!(
            decode("=?us-ascii?B?dmk=?=").unwrap(),
            DecodingResult {
                decoded: "vi".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: Vec::new(),
            }
        );
    }

    #[test]
    fn test_decode_non_trivial_q() {
        assert_eq!(
            decode("=?latin-1?q?=20F=fcr=20Elise=20?=").unwrap(),
            DecodingResult {
                decoded: " Für Elise ".into(),
                charset: Charset::for_label(b"latin1").unwrap(),
                lang: "".into(),
                defects: Vec::new(),
            }
        );
    }

    #[test]
    fn test_decode_escaped_bytes_preserved_q() {
        assert_eq!(
            decode("=?us-ascii?q?=20\u{AC}foo?=").unwrap(),
            DecodingResult {
                decoded: " \u{AC}foo".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: vec![],
            }
        );
    }

    #[test]
    fn test_decode_undecodable_bytes_ignored_with_defect_b() {
        assert_eq!(
            decode("=?us-ascii?b?dm\u{AC}k?=").unwrap(),
            DecodingResult {
                decoded: "vi".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: vec![
                    Defect::InvalidBase64Characters { byte: 172 },
                    Defect::InvalidBase64Padding
                ],
            }
        );
    }

    #[test]
    fn test_decode_invalid_bytes_ignored_with_defect_b() {
        assert_eq!(
            decode("=?us-ascii?b?dm\x01k===?=").unwrap(),
            DecodingResult {
                decoded: "vi".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: vec![
                    Defect::InvalidBase64Characters { byte: 1 },
                    Defect::InvalidBase64Padding
                ],
            }
        );
    }

    #[test]
    fn test_decode_padding_defect_b() {
        assert_eq!(
            decode("=?us-ascii?b?dmk?=").unwrap(),
            DecodingResult {
                decoded: "vi".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: vec![Defect::InvalidBase64Padding],
            }
        );
    }

    #[test]
    fn test_decode_nonnull_lang() {
        assert_eq!(
            decode("=?us-ascii*jive?q?test?=").unwrap(),
            DecodingResult {
                decoded: "test".into(),
                charset: Charset::Ascii,
                lang: "jive".into(),
                defects: vec![],
            }
        );
    }

    #[test]
    fn test_decode_unknown_8bit_charset() {
        assert_eq!(
            decode("=?unknown-8bit?q?foo=ACbar?=").unwrap(),
            DecodingResult {
                decoded: "foo\u{ac}bar".into(),
                charset: Charset::Unknown8Bit,
                lang: "".into(),
                defects: vec![],
            }
        );
    }

    #[test]
    fn test_decode_unknown_charset() {
        assert_eq!(
            decode("=?foobar?q?foo=ACbar?=").unwrap(),
            DecodingResult {
                decoded: "foo\u{ac}bar".into(),
                charset: Charset::Ascii,
                lang: "".into(),
                defects: vec![Defect::InvalidCharset {
                    charset: "foobar".into()
                }],
            }
        );
    }

    #[test]
    fn test_decode_nonascii_q() {
        assert_eq!(
            decode("=?utf-8?q?=C3=89ric?=").unwrap(),
            DecodingResult {
                decoded: "Éric".into(),
                charset: Charset::for_label(b"utf-8").unwrap(),
                lang: "".into(),
                defects: vec![],
            }
        );
    }

    // ---- encode_q / encode_b and their length helpers ----

    #[test]
    fn test_encode_q_all_safe() {
        assert_eq!(&encode_q(b"foobar"), "foobar");
    }

    #[test]
    fn test_encode_q_spaces() {
        assert_eq!(&encode_q(b"foo bar "), "foo_bar_");
    }

    #[test]
    fn test_encode_q_encodables() {
        // Two spaces become two underscores; each comma becomes `=2C`.
        assert_eq!(&encode_q(b"foo  ,,bar"), "foo__=2C=2Cbar");
        assert_eq!(len_q(b"foo  ,,bar"), b"foo__=2C=2Cbar".len());
    }

    #[test]
    fn test_encode_b_simple() {
        assert_eq!(&encode_b(b"foo"), "Zm9v");
        assert_eq!(len_b(b"foo"), b"Zm9v".len());
    }

    #[test]
    fn test_encode_b_padding() {
        assert_eq!(&encode_b(b"vi"), "dmk=");
        assert_eq!(len_b(b"vi"), b"dmk=".len());
    }

    // ---- encode ----

    #[test]
    fn test_encode_simple_q() {
        assert_eq!(
            &encode(
                "foo",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::QuotedPrintable,
                None,
            ),
            "=?utf-8?q?foo?="
        );
    }

    #[test]
    fn test_encode_simple_b() {
        assert_eq!(
            &encode(
                "foo",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::Base64,
                None,
            ),
            "=?utf-8?b?Zm9v?="
        );
    }

    #[test]
    fn test_encode_auto_q() {
        assert_eq!(
            &encode(
                "foo",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::Shortest,
                None,
            ),
            "=?utf-8?q?foo?="
        );
    }

    #[test]
    fn test_encode_auto_q_if_short_mostly_safe() {
        assert_eq!(
            &encode(
                "vi.",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::Shortest,
                None,
            ),
            "=?utf-8?q?vi=2E?="
        );
    }

    #[test]
    fn test_encode_auto_b_if_enough_unsafe() {
        assert_eq!(
            &encode(
                ".....",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::Shortest,
                None,
            ),
            "=?utf-8?b?Li4uLi4=?="
        );
    }

    #[test]
    fn test_encode_auto_b_if_long_unsafe() {
        assert_eq!(
            &encode(
                "vi.vi.vi.vi.vi.",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::Shortest,
                None,
            ),
            "=?utf-8?b?dmkudmkudmkudmkudmku?="
        );
    }

    #[test]
    fn test_encode_auto_q_if_mostly_safe() {
        assert_eq!(
            &encode(
                "vi vi vi.vi ",
                Some(encoding_rs::UTF_8.into()),
                EncodingFlag::Shortest,
                None,
            ),
            "=?utf-8?q?vi_vi_vi=2Evi_?="
        );
    }

    #[test]
    fn test_encode_utf8_default() {
        assert_eq!(
            &encode("foo", None, EncodingFlag::Shortest, None,),
            "=?utf-8?q?foo?="
        );
    }

    #[test]
    fn test_encode_lang() {
        assert_eq!(
            &encode("foo", None, EncodingFlag::Shortest, Some("jive")),
            "=?utf-8*jive?q?foo?="
        );
    }
}