Skip to main content

anyxml_encoding/
utf16.rs

1use std::iter::once;
2
3use crate::{DecodeError, Decoder, EncodeError, Encoder};
4
/// Name reported by the UTF-16 encoder and decoder for their encoding.
pub const UTF16_NAME: &str = "UTF-16";
7
8/// Encoder for UTF-16.
9#[derive(Debug, Default)]
10pub struct UTF16Encoder {
11    init: bool,
12}
13impl Encoder for UTF16Encoder {
14    fn name(&self) -> &'static str {
15        UTF16_NAME
16    }
17
18    fn encode(
19        &mut self,
20        src: &str,
21        dst: &mut [u8],
22        finish: bool,
23    ) -> Result<(usize, usize), EncodeError> {
24        if src.is_empty() {
25            return Err(EncodeError::InputIsEmpty);
26        }
27        if dst.len() < 4 {
28            return Err(EncodeError::OutputTooShort);
29        }
30
31        if !self.init {
32            self.init = true;
33            // Write BOM as LE
34            dst[0] = 0xFF;
35            dst[1] = 0xFE;
36            return Ok((0, 2));
37        }
38        UTF16LEEncoder.encode(src, dst, finish)
39    }
40}
41
42/// Decoder for UTF-16.
43pub struct UTF16Decoder {
44    read: usize,
45    top: [u8; 2],
46    be: bool,
47}
48impl Decoder for UTF16Decoder {
49    fn name(&self) -> &'static str {
50        UTF16_NAME
51    }
52
53    fn decode(
54        &mut self,
55        mut src: &[u8],
56        dst: &mut String,
57        finish: bool,
58    ) -> Result<(usize, usize), DecodeError> {
59        if src.is_empty() {
60            return Err(DecodeError::InputIsEmpty);
61        }
62        if dst.capacity() - dst.len() < 4 {
63            return Err(DecodeError::OutputTooShort);
64        }
65
66        let mut base = 0;
67        if self.read < 2 {
68            let orig = src.len();
69            while self.read < 2 && !src.is_empty() {
70                self.top[self.read] = src[0];
71                src = &src[1..];
72                self.read += 1;
73            }
74            base = orig - src.len();
75            if self.read == 2 {
76                // If the first 2 bytes of the buffer are 0xFF, 0xFE, it is LE; otherwise, it is BE.
77                if matches!(self.top[..], [0xFF, 0xFE]) {
78                    self.be = false;
79                    return Ok((base, 0));
80                } else if matches!(self.top[..], [0xFE, 0xFF]) {
81                    self.be = true;
82                    return Ok((base, 0));
83                } else {
84                    self.be = true;
85                    // Since the first two bytes were not BOM,
86                    // try decoding using the first two bytes that have already been acquired.
87                };
88            } else {
89                return Ok((base, 0));
90            }
91        }
92
93        if self.be && !matches!(self.top[..], [0xFE, 0xFF]) {
94            let mut read = 0;
95            let mut write = 0;
96            for c in char::decode_utf16(
97                once(((self.top[0] as u16) << 8) | self.top[1] as u16).chain(
98                    src.chunks_exact(2)
99                        .map(|v| ((v[0] as u16) << 8) | v[1] as u16),
100                ),
101            ) {
102                if let Ok(c) = c {
103                    read += c.len_utf16() * 2;
104                    write += c.len_utf8();
105                    dst.push(c);
106                } else {
107                    let rem = src.len() - (read - 2);
108                    if !finish && rem < 4 {
109                        // If this is not the last buffer and the unread buffer is less than 2 bytes,
110                        // return `Ok` because the corresponding surrogate pair may be at the beginning of the next buffer to be input.
111                        break;
112                    } else {
113                        // If this is the last buffer, or if there is sufficient data to form a surrogate pair but an error occurs,
114                        // it is simply an invalid byte sequence.
115                        return Err(DecodeError::Malformed {
116                            read: read + 2,
117                            write,
118                            length: 2,
119                            offset: 0,
120                        });
121                    }
122                }
123
124                if dst.capacity() - dst.len() < 4 {
125                    break;
126                }
127            }
128            return if read > 0 {
129                self.top = [0xFE, 0xFF];
130                read -= 2 - base;
131                Ok((read, write))
132            } else {
133                Ok((base, 0))
134            };
135        }
136
137        if self.be {
138            UTF16BEDecoder.decode(src, dst, finish)
139        } else {
140            UTF16LEDecoder.decode(src, dst, finish)
141        }
142    }
143}
144
145impl Default for UTF16Decoder {
146    fn default() -> Self {
147        Self {
148            read: 0,
149            top: [0; 2],
150            be: true,
151        }
152    }
153}
154
/// Name reported by the UTF-16BE encoder and decoder for their encoding.
pub const UTF16BE_NAME: &str = "UTF-16BE";
157
158/// Encoder for UTF-16BE.
159pub struct UTF16BEEncoder;
160impl Encoder for UTF16BEEncoder {
161    fn name(&self) -> &'static str {
162        UTF16BE_NAME
163    }
164
165    fn encode(
166        &mut self,
167        src: &str,
168        mut dst: &mut [u8],
169        _finish: bool,
170    ) -> Result<(usize, usize), EncodeError> {
171        if src.is_empty() {
172            return Err(EncodeError::InputIsEmpty);
173        }
174        if dst.len() < 4 {
175            return Err(EncodeError::OutputTooShort);
176        }
177
178        let mut buf = [0u16; 2];
179        let mut read = 0;
180        let mut write = 0;
181        for c in src.chars() {
182            read += c.len_utf8();
183            let b = c.encode_utf16(&mut buf);
184            dst[..2].copy_from_slice(&b[0].to_be_bytes());
185            dst = &mut dst[2..];
186            write += 2;
187            if b.len() == 2 {
188                dst[..2].copy_from_slice(&b[1].to_be_bytes());
189                dst = &mut dst[2..];
190                write += 2;
191            }
192            if dst.len() < 4 {
193                break;
194            }
195        }
196        Ok((read, write))
197    }
198}
199
200/// Decoder for UTF-16BE.
201pub struct UTF16BEDecoder;
202impl Decoder for UTF16BEDecoder {
203    fn name(&self) -> &'static str {
204        UTF16BE_NAME
205    }
206
207    fn decode(
208        &mut self,
209        src: &[u8],
210        dst: &mut String,
211        finish: bool,
212    ) -> Result<(usize, usize), DecodeError> {
213        if src.is_empty() {
214            return Err(DecodeError::InputIsEmpty);
215        }
216        let cap = dst.capacity() - dst.len();
217        if cap < 4 {
218            return Err(DecodeError::OutputTooShort);
219        }
220
221        let mut read = 0;
222        let mut write = 0;
223        for c in char::decode_utf16(
224            src.chunks_exact(2)
225                .map(|v| u16::from_be_bytes([v[0], v[1]])),
226        ) {
227            if let Ok(c) = c {
228                read += c.len_utf16() * 2;
229                write += c.len_utf8();
230                dst.push(c);
231            } else {
232                let rem = src.len() - read;
233                if !finish && rem < 4 {
234                    break;
235                } else {
236                    return Err(DecodeError::Malformed {
237                        read: read + 2,
238                        write,
239                        length: 2,
240                        offset: 0,
241                    });
242                }
243            }
244
245            if dst.capacity() - dst.len() < 4 {
246                break;
247            }
248        }
249
250        Ok((read, write))
251    }
252}
253
/// Name reported by the UTF-16LE encoder and decoder for their encoding.
pub const UTF16LE_NAME: &str = "UTF-16LE";
256
257/// Encoder for UTF-16LE.
258pub struct UTF16LEEncoder;
259impl Encoder for UTF16LEEncoder {
260    fn name(&self) -> &'static str {
261        UTF16LE_NAME
262    }
263
264    fn encode(
265        &mut self,
266        src: &str,
267        mut dst: &mut [u8],
268        _finish: bool,
269    ) -> Result<(usize, usize), EncodeError> {
270        if src.is_empty() {
271            return Err(EncodeError::InputIsEmpty);
272        }
273        if dst.len() < 4 {
274            return Err(EncodeError::OutputTooShort);
275        }
276
277        let mut buf = [0u16; 2];
278        let mut read = 0;
279        let mut write = 0;
280        for c in src.chars() {
281            read += c.len_utf8();
282            let b = c.encode_utf16(&mut buf);
283            dst[..2].copy_from_slice(&b[0].to_le_bytes());
284            dst = &mut dst[2..];
285            write += 2;
286            if b.len() == 2 {
287                dst[..2].copy_from_slice(&b[1].to_le_bytes());
288                dst = &mut dst[2..];
289                write += 2;
290            }
291            if dst.len() < 4 {
292                break;
293            }
294        }
295        Ok((read, write))
296    }
297}
298
299/// Decoder for UTF-16LE.
300pub struct UTF16LEDecoder;
301impl Decoder for UTF16LEDecoder {
302    fn name(&self) -> &'static str {
303        UTF16LE_NAME
304    }
305
306    fn decode(
307        &mut self,
308        src: &[u8],
309        dst: &mut String,
310        finish: bool,
311    ) -> Result<(usize, usize), DecodeError> {
312        if src.is_empty() {
313            return Err(DecodeError::InputIsEmpty);
314        }
315        let cap = dst.capacity() - dst.len();
316        if cap < 4 {
317            return Err(DecodeError::OutputTooShort);
318        }
319
320        let mut read = 0;
321        let mut write = 0;
322        for c in char::decode_utf16(
323            src.chunks_exact(2)
324                .map(|v| u16::from_le_bytes([v[0], v[1]])),
325        ) {
326            if let Ok(c) = c {
327                read += c.len_utf16() * 2;
328                write += c.len_utf8();
329                dst.push(c);
330            } else {
331                let rem = src.len() - read;
332                if !finish && rem < 4 {
333                    break;
334                } else {
335                    return Err(DecodeError::Malformed {
336                        read: read + 2,
337                        write,
338                        length: 2,
339                        offset: 0,
340                    });
341                }
342            }
343
344            if dst.capacity() - dst.len() < 4 {
345                break;
346            }
347        }
348
349        Ok((read, write))
350    }
351}