mail_parser/decoders/charsets/
utf.rs1use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
8
9use crate::decoders::base64::BASE64_MAP;
10
/// Scratch state shared between `decoder_utf7` and `add_utf16_bytes` while a
/// Base64 shift sequence is being decoded.
struct Utf7DecoderState {
    /// Bits of the Base64 group currently being assembled; the decoded bytes
    /// are read from it in little-endian order.
    b64_bytes: u32,
    /// First (high-order) byte of a UTF-16 code unit whose second byte has
    /// not been decoded yet.
    pending_byte: Option<u8>,
    /// Complete UTF-16 code units collected for the current shift sequence.
    utf16_bytes: Vec<u16>,
}
16
17fn add_utf16_bytes(state: &mut Utf7DecoderState, n_bytes: usize) {
18 debug_assert!(n_bytes < std::mem::size_of::<u32>());
19
20 for byte in state.b64_bytes.to_le_bytes()[0..n_bytes].iter() {
21 if let Some(pending_byte) = state.pending_byte {
22 state
23 .utf16_bytes
24 .push(u16::from_be_bytes([pending_byte, *byte]));
25 state.pending_byte = None;
26 } else {
27 state.pending_byte = Some(*byte);
28 }
29 }
30}
31
32pub(crate) fn decoder_utf7(bytes: &[u8]) -> String {
33 let mut result = String::with_capacity(bytes.len());
34 let mut byte_count: u8 = 0;
35 let mut in_b64 = false;
36
37 let mut state = Utf7DecoderState {
38 utf16_bytes: Vec::with_capacity(10),
39 pending_byte: None,
40 b64_bytes: 0,
41 };
42
43 for byte in bytes {
44 if in_b64 {
45 let val = BASE64_MAP[byte_count as usize][*byte as usize];
46
47 if val < 0x01ffffff {
48 byte_count = (byte_count + 1) & 3;
49
50 if byte_count == 1 {
51 state.b64_bytes = val;
52 } else {
53 state.b64_bytes |= val;
54
55 if byte_count == 0 {
56 add_utf16_bytes(&mut state, 3);
57 }
58 }
59 } else {
60 match byte_count {
61 1 | 2 => {
62 add_utf16_bytes(&mut state, 1);
63 }
64 3 => {
65 add_utf16_bytes(&mut state, 2);
66 }
67 _ => (),
68 }
69
70 if !state.utf16_bytes.is_empty() {
71 result.push_str(
72 decode_utf16(state.utf16_bytes.drain(..))
73 .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
74 .collect::<String>()
75 .as_str(),
76 );
77 } else if byte_count > 0 || state.pending_byte.is_some() {
78 result.push(REPLACEMENT_CHARACTER);
79 } else {
80 result.push('+');
81 result.push(char::from(*byte));
82 }
83
84 state.pending_byte = None;
85 byte_count = 0;
86 in_b64 = false;
87 }
88 } else if byte == &b'+' {
89 in_b64 = true;
90 } else {
91 result.push(char::from(*byte));
92 }
93 }
94
95 result.shrink_to_fit();
96 result
97}
98
/// Decodes `bytes` as UTF-16, using `fnc` to turn each pair of bytes into a
/// code unit (this is what selects the byte order).
///
/// Invalid surrogates are replaced with U+FFFD. A trailing odd byte is
/// ignored, and input shorter than two bytes yields an empty string.
fn decoder_utf16_(bytes: &[u8], fnc: fn([u8; 2]) -> u16) -> String {
    // Not even one full code unit available.
    if bytes.len() < 2 {
        return String::new();
    }

    let code_units = bytes.chunks_exact(2).map(|pair| fnc([pair[0], pair[1]]));

    let mut text = String::new();
    for decoded in decode_utf16(code_units) {
        text.push(decoded.unwrap_or(REPLACEMENT_CHARACTER));
    }
    text
}
108
109#[inline(always)]
110pub(crate) fn decoder_utf16_le(bytes: &[u8]) -> String {
111 decoder_utf16_(bytes, u16::from_le_bytes)
112}
113
114#[inline(always)]
115pub(crate) fn decoder_utf16_be(bytes: &[u8]) -> String {
116 decoder_utf16_(bytes, u16::from_be_bytes)
117}
118
119#[allow(clippy::type_complexity)]
120pub(crate) fn decoder_utf16(bytes: &[u8]) -> String {
121 let (bytes, fnc): (&[u8], fn([u8; 2]) -> u16) = match bytes.get(0..2) {
123 Some([0xfe, 0xff]) => (bytes.get(2..).unwrap_or(&[]), u16::from_be_bytes),
124 Some([0xff, 0xfe]) => (bytes.get(2..).unwrap_or(&[]), u16::from_le_bytes),
125 _ => (bytes, u16::from_le_bytes),
126 };
127
128 decoder_utf16_(bytes, fnc)
129}
130
#[cfg(test)]
mod tests {
    use crate::decoders::charsets::utf::decoder_utf7;

    /// Checks `decoder_utf7` against a table of (encoded, expected) pairs.
    #[test]
    fn decode_utf7() {
        let cases = [
            ("Hello, World+ACE-", "Hello, World!"),
            ("Hi Mom -+Jjo--!", "Hi Mom -☺-!"),
            ("+ZeVnLIqe-", "日本語"),
            ("Item 3 is +AKM-1.", "Item 3 is £1."),
            ("Plus minus +- -+ +--", "Plus minus +- -+ +--"),
            (
                "+APw-ber ihre mi+AN8-liche Lage+ADs- +ACI-wir",
                "über ihre mißliche Lage; \"wir",
            ),
            (
                concat!(
                    "+ACI-The sayings of Confucius,+ACI- James R. Ware, trans. +U/BTFw-:\n",
                    "+ZYeB9FH6ckh5Pg-, 1980.\n",
                    "+Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-: +Ti1XC2b4Xpc-, 1990."
                ),
                concat!(
                    "\"The sayings of Confucius,\" James R. Ware, trans. 台北:\n",
                    "文致出版社, 1980.\n",
                    "四書五經, 宋元人注, 北京: 中國書店, 1990."
                ),
            ),
        ];

        for (encoded, expected) in cases {
            assert_eq!(decoder_utf7(encoded.as_bytes()), expected);
        }
    }
}