mail_parser/decoders/charsets/
utf.rs

1/*
2 * SPDX-FileCopyrightText: 2020 Stalwart Labs LLC <hello@stalw.art>
3 *
4 * SPDX-License-Identifier: Apache-2.0 OR MIT
5 */
6
7use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
8
9use crate::decoders::base64::BASE64_MAP;
10
/// Scratch state kept while a UTF-7 base64 shift sequence is being decoded.
struct Utf7DecoderState {
    /// Values of the current group of base64 characters, ORed together.
    /// The decoded bytes are read from it in little-endian order.
    b64_bytes: u32,
    /// High byte of a UTF-16 code unit whose low byte has not arrived yet.
    pending_byte: Option<u8>,
    /// Complete UTF-16 code units waiting to be converted into text.
    utf16_bytes: Vec<u16>,
}
16
17fn add_utf16_bytes(state: &mut Utf7DecoderState, n_bytes: usize) {
18    debug_assert!(n_bytes < std::mem::size_of::<u32>());
19
20    for byte in state.b64_bytes.to_le_bytes()[0..n_bytes].iter() {
21        if let Some(pending_byte) = state.pending_byte {
22            state
23                .utf16_bytes
24                .push(u16::from_be_bytes([pending_byte, *byte]));
25            state.pending_byte = None;
26        } else {
27            state.pending_byte = Some(*byte);
28        }
29    }
30}
31
32pub(crate) fn decoder_utf7(bytes: &[u8]) -> String {
33    let mut result = String::with_capacity(bytes.len());
34    let mut byte_count: u8 = 0;
35    let mut in_b64 = false;
36
37    let mut state = Utf7DecoderState {
38        utf16_bytes: Vec::with_capacity(10),
39        pending_byte: None,
40        b64_bytes: 0,
41    };
42
43    for byte in bytes {
44        if in_b64 {
45            let val = BASE64_MAP[byte_count as usize][*byte as usize];
46
47            if val < 0x01ffffff {
48                byte_count = (byte_count + 1) & 3;
49
50                if byte_count == 1 {
51                    state.b64_bytes = val;
52                } else {
53                    state.b64_bytes |= val;
54
55                    if byte_count == 0 {
56                        add_utf16_bytes(&mut state, 3);
57                    }
58                }
59            } else {
60                match byte_count {
61                    1 | 2 => {
62                        add_utf16_bytes(&mut state, 1);
63                    }
64                    3 => {
65                        add_utf16_bytes(&mut state, 2);
66                    }
67                    _ => (),
68                }
69
70                if !state.utf16_bytes.is_empty() {
71                    result.push_str(
72                        decode_utf16(state.utf16_bytes.drain(..))
73                            .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
74                            .collect::<String>()
75                            .as_str(),
76                    );
77                } else if byte_count > 0 || state.pending_byte.is_some() {
78                    result.push(REPLACEMENT_CHARACTER);
79                } else {
80                    result.push('+');
81                    result.push(char::from(*byte));
82                }
83
84                state.pending_byte = None;
85                byte_count = 0;
86                in_b64 = false;
87            }
88        } else if byte == &b'+' {
89            in_b64 = true;
90        } else {
91            result.push(char::from(*byte));
92        }
93    }
94
95    result.shrink_to_fit();
96    result
97}
98
/// Decodes `bytes` as UTF-16, using `fnc` to turn each pair of bytes into a code unit.
///
/// The input is consumed two bytes at a time; a trailing odd byte is ignored, so inputs
/// shorter than two bytes produce an empty string. Unpaired surrogates are replaced with
/// U+FFFD.
fn decoder_utf16_(bytes: &[u8], fnc: fn([u8; 2]) -> u16) -> String {
    let code_units = bytes.chunks_exact(2).map(|pair| fnc([pair[0], pair[1]]));

    decode_utf16(code_units)
        .map(|unit| unit.unwrap_or(REPLACEMENT_CHARACTER))
        .collect()
}
108
/// Decodes `bytes` as UTF-16 with little-endian code units.
///
/// Thin wrapper around `decoder_utf16_` with `u16::from_le_bytes` as the byte-pair
/// converter. A leading byte-order mark is not stripped here; use `decoder_utf16` for
/// BOM detection.
#[inline(always)]
pub(crate) fn decoder_utf16_le(bytes: &[u8]) -> String {
    decoder_utf16_(bytes, u16::from_le_bytes)
}
113
/// Decodes `bytes` as UTF-16 with big-endian code units.
///
/// Thin wrapper around `decoder_utf16_` with `u16::from_be_bytes` as the byte-pair
/// converter. A leading byte-order mark is not stripped here; use `decoder_utf16` for
/// BOM detection.
#[inline(always)]
pub(crate) fn decoder_utf16_be(bytes: &[u8]) -> String {
    decoder_utf16_(bytes, u16::from_be_bytes)
}
118
119#[allow(clippy::type_complexity)]
120pub(crate) fn decoder_utf16(bytes: &[u8]) -> String {
121    // Read BOM
122    let (bytes, fnc): (&[u8], fn([u8; 2]) -> u16) = match bytes.get(0..2) {
123        Some([0xfe, 0xff]) => (bytes.get(2..).unwrap_or(&[]), u16::from_be_bytes),
124        Some([0xff, 0xfe]) => (bytes.get(2..).unwrap_or(&[]), u16::from_le_bytes),
125        _ => (bytes, u16::from_le_bytes),
126    };
127
128    decoder_utf16_(bytes, fnc)
129}
130
#[cfg(test)]
mod tests {
    use super::decoder_utf7;

    /// `(UTF-7 input, expected decoded text)` pairs.
    const UTF7_CASES: &[(&str, &str)] = &[
        ("Hello, World+ACE-", "Hello, World!"),
        ("Hi Mom -+Jjo--!", "Hi Mom -☺-!"),
        ("+ZeVnLIqe-", "日本語"),
        ("Item 3 is +AKM-1.", "Item 3 is £1."),
        // A `+` that is not followed by base64 data is passed through unchanged.
        ("Plus minus +- -+ +--", "Plus minus +- -+ +--"),
        (
            "+APw-ber ihre mi+AN8-liche Lage+ADs- +ACI-wir",
            "über ihre mißliche Lage; \"wir",
        ),
        (
            concat!(
                "+ACI-The sayings of Confucius,+ACI- James R. Ware, trans.  +U/BTFw-:\n",
                "+ZYeB9FH6ckh5Pg-, 1980.\n",
                "+Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-:  +Ti1XC2b4Xpc-, 1990."
            ),
            concat!(
                "\"The sayings of Confucius,\" James R. Ware, trans.  台北:\n",
                "文致出版社, 1980.\n",
                "四書五經, 宋元人注, 北京:  中國書店, 1990."
            ),
        ),
    ];

    #[test]
    fn decode_utf7() {
        for (encoded, expected) in UTF7_CASES {
            assert_eq!(decoder_utf7(encoded.as_bytes()), *expected);
        }
    }
}