Skip to main content

anyxml_encoding/
ucs4.rs

1use crate::{DecodeError, Decoder, EncodeError, Encoder};
2
3/// Encoding name for UTF-32.
4pub const UTF32_NAME: &str = "UTF-32";
5
6/// Encoder for UTF-32.
7#[derive(Debug, Default)]
8pub struct UTF32Encoder {
9    init: bool,
10}
11impl Encoder for UTF32Encoder {
12    fn name(&self) -> &'static str {
13        UTF32_NAME
14    }
15
16    fn encode(
17        &mut self,
18        src: &str,
19        dst: &mut [u8],
20        finish: bool,
21    ) -> Result<(usize, usize), EncodeError> {
22        if src.is_empty() {
23            return Err(EncodeError::InputIsEmpty);
24        }
25        if dst.len() < 4 {
26            return Err(EncodeError::OutputTooShort);
27        }
28
29        if !self.init {
30            self.init = true;
31            // Write BOM as LE
32            dst[0] = 0xFF;
33            dst[1] = 0xFE;
34            dst[2] = 0x00;
35            dst[3] = 0x00;
36            return Ok((0, 4));
37        }
38        UTF32LEEncoder.encode(src, dst, finish)
39    }
40}
41
42/// Decoder for UTF-32.
43pub struct UTF32Decoder {
44    read: usize,
45    top: [u8; 4],
46    be: bool,
47}
48impl Decoder for UTF32Decoder {
49    fn name(&self) -> &'static str {
50        UTF32_NAME
51    }
52
53    fn decode(
54        &mut self,
55        mut src: &[u8],
56        dst: &mut String,
57        finish: bool,
58    ) -> Result<(usize, usize), DecodeError> {
59        if src.is_empty() {
60            return Err(DecodeError::InputIsEmpty);
61        }
62        if dst.capacity() - dst.len() < 4 {
63            return Err(DecodeError::OutputTooShort);
64        }
65
66        let mut base = 0;
67        if self.read < 4 {
68            let orig = src.len();
69            while self.read < 4 && !src.is_empty() {
70                self.top[self.read] = src[0];
71                src = &src[1..];
72                self.read += 1;
73            }
74            base = orig - src.len();
75            if self.read == 4 {
76                // If the first 4 bytes of the buffer are 0xFF, 0xFE, 0x00, 0x00, it is LE;
77                // otherwise, it is BE.
78                if matches!(self.top[..], [0xFF, 0xFE, 0x00, 0x00]) {
79                    self.be = false;
80                    return Ok((base, 0));
81                } else if matches!(self.top[..], [0x00, 0x00, 0xFE, 0xFF]) {
82                    self.be = true;
83                    return Ok((base, 0));
84                } else {
85                    self.be = true;
86                    // Since the first two bytes were not BOM,
87                    // try decoding using the first two bytes that have already been acquired.
88                };
89            } else {
90                return Ok((base, 0));
91            }
92        }
93
94        if self.be && !matches!(self.top[..], [0x00, 0x00, 0xFE, 0xFF]) {
95            let codepoint = u32::from_be_bytes(self.top);
96            let mut write = 0;
97            match char::from_u32(codepoint) {
98                Some(c) => {
99                    write += c.len_utf8();
100                    dst.push(c);
101                }
102                None => {
103                    // If this is the last buffer, or if there is sufficient data to form a surrogate pair but an error occurs,
104                    // it is simply an invalid byte sequence.
105                    return Err(DecodeError::Malformed {
106                        read: 4,
107                        write,
108                        length: 4,
109                        offset: 0,
110                    });
111                }
112            }
113
114            self.top = [0x00, 0x00, 0xFE, 0xFF];
115            return Ok((base, write));
116        }
117
118        if self.be {
119            UTF32BEDecoder.decode(src, dst, finish)
120        } else {
121            UTF32LEDecoder.decode(src, dst, finish)
122        }
123    }
124}
125
126impl Default for UTF32Decoder {
127    fn default() -> Self {
128        Self {
129            read: 0,
130            top: [0; 4],
131            be: true,
132        }
133    }
134}
135
136/// Encoding name for UTF-32BE.
137pub const UTF32BE_NAME: &str = "UTF-32BE";
138
139/// Encoder for UTF-32BE.
140pub struct UTF32BEEncoder;
141impl Encoder for UTF32BEEncoder {
142    fn name(&self) -> &'static str {
143        UTF32BE_NAME
144    }
145
146    fn encode(
147        &mut self,
148        src: &str,
149        mut dst: &mut [u8],
150        _finish: bool,
151    ) -> Result<(usize, usize), EncodeError> {
152        if src.is_empty() {
153            return Err(EncodeError::InputIsEmpty);
154        }
155        if dst.len() < 4 {
156            return Err(EncodeError::OutputTooShort);
157        }
158
159        let mut read = 0;
160        let mut write = 0;
161        for c in src.chars() {
162            read += c.len_utf8();
163            dst[..4].copy_from_slice(&(c as u32).to_be_bytes()[..]);
164            dst = &mut dst[4..];
165            write += 4;
166            if dst.len() < 4 {
167                break;
168            }
169        }
170        Ok((read, write))
171    }
172}
173
174/// Decoder for UTF-32BE.
175pub struct UTF32BEDecoder;
176impl Decoder for UTF32BEDecoder {
177    fn name(&self) -> &'static str {
178        UTF32BE_NAME
179    }
180
181    fn decode(
182        &mut self,
183        src: &[u8],
184        dst: &mut String,
185        finish: bool,
186    ) -> Result<(usize, usize), DecodeError> {
187        if src.is_empty() {
188            return Err(DecodeError::InputIsEmpty);
189        }
190        let cap = dst.capacity() - dst.len();
191        if cap < 4 {
192            return Err(DecodeError::OutputTooShort);
193        }
194
195        let mut read = 0;
196        let mut write = 0;
197        for bytes in src.chunks_exact(4) {
198            read += 4;
199            let codepoint = u32::from_be_bytes(bytes.try_into().unwrap());
200            match char::from_u32(codepoint) {
201                Some(c) => {
202                    write += c.len_utf8();
203                    dst.push(c);
204                }
205                None => {
206                    return Err(DecodeError::Malformed {
207                        read,
208                        write,
209                        length: 4,
210                        offset: 0,
211                    });
212                }
213            }
214            if dst.capacity() - dst.len() < 4 {
215                break;
216            }
217        }
218
219        let rem = src.len() - read;
220        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
221            return Err(DecodeError::Malformed {
222                read: src.len(),
223                write,
224                length: src.len() - read,
225                offset: 0,
226            });
227        }
228
229        Ok((read, write))
230    }
231}
232
233/// Encoding name for UTF-32LE.
234pub const UTF32LE_NAME: &str = "UTF-32LE";
235
236/// Encoder for UTF-32LE.
237pub struct UTF32LEEncoder;
238impl Encoder for UTF32LEEncoder {
239    fn name(&self) -> &'static str {
240        UTF32LE_NAME
241    }
242
243    fn encode(
244        &mut self,
245        src: &str,
246        mut dst: &mut [u8],
247        _finish: bool,
248    ) -> Result<(usize, usize), EncodeError> {
249        if src.is_empty() {
250            return Err(EncodeError::InputIsEmpty);
251        }
252        if dst.len() < 4 {
253            return Err(EncodeError::OutputTooShort);
254        }
255
256        let mut read = 0;
257        let mut write = 0;
258        for c in src.chars() {
259            read += c.len_utf8();
260            dst[..4].copy_from_slice(&(c as u32).to_le_bytes()[..]);
261            dst = &mut dst[4..];
262            write += 4;
263            if dst.len() < 4 {
264                break;
265            }
266        }
267        Ok((read, write))
268    }
269}
270
271/// Decoder for UTF-32LE.
272pub struct UTF32LEDecoder;
273impl Decoder for UTF32LEDecoder {
274    fn name(&self) -> &'static str {
275        UTF32LE_NAME
276    }
277
278    fn decode(
279        &mut self,
280        src: &[u8],
281        dst: &mut String,
282        finish: bool,
283    ) -> Result<(usize, usize), DecodeError> {
284        if src.is_empty() {
285            return Err(DecodeError::InputIsEmpty);
286        }
287        let cap = dst.capacity() - dst.len();
288        if cap < 4 {
289            return Err(DecodeError::OutputTooShort);
290        }
291
292        let mut read = 0;
293        let mut write = 0;
294        for bytes in src.chunks_exact(4) {
295            read += 4;
296            let codepoint = u32::from_le_bytes(bytes.try_into().unwrap());
297            match char::from_u32(codepoint) {
298                Some(c) => {
299                    write += c.len_utf8();
300                    dst.push(c);
301                }
302                None => {
303                    return Err(DecodeError::Malformed {
304                        read,
305                        write,
306                        length: 4,
307                        offset: 0,
308                    });
309                }
310            }
311            if dst.capacity() - dst.len() < 4 {
312                break;
313            }
314        }
315
316        let rem = src.len() - read;
317        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
318            return Err(DecodeError::Malformed {
319                read: src.len(),
320                write,
321                length: src.len() - read,
322                offset: 0,
323            });
324        }
325
326        Ok((read, write))
327    }
328}
329
330// This is not an officially registered name, but it is set for convenience.
331const UCS4_UNUSUAL_2143_NAME: &str = "UCS4-UNUSUAL-2143";
332
333pub struct UCS4Unusual2143Encoder;
334impl Encoder for UCS4Unusual2143Encoder {
335    fn name(&self) -> &'static str {
336        UCS4_UNUSUAL_2143_NAME
337    }
338
339    fn encode(
340        &mut self,
341        src: &str,
342        mut dst: &mut [u8],
343        _finish: bool,
344    ) -> Result<(usize, usize), EncodeError> {
345        if src.is_empty() {
346            return Err(EncodeError::InputIsEmpty);
347        }
348        if dst.len() < 4 {
349            return Err(EncodeError::OutputTooShort);
350        }
351
352        let mut read = 0;
353        let mut write = 0;
354        for c in src.chars() {
355            read += c.len_utf8();
356            let bytes = (c as u32).to_be_bytes();
357            dst[0] = bytes[1];
358            dst[1] = bytes[0];
359            dst[2] = bytes[3];
360            dst[3] = bytes[2];
361            dst = &mut dst[4..];
362            write += 4;
363            if dst.len() < 4 {
364                break;
365            }
366        }
367        Ok((read, write))
368    }
369}
370
371pub struct UCS4Unusual2143Decoder;
372impl Decoder for UCS4Unusual2143Decoder {
373    fn name(&self) -> &'static str {
374        UCS4_UNUSUAL_2143_NAME
375    }
376
377    fn decode(
378        &mut self,
379        src: &[u8],
380        dst: &mut String,
381        finish: bool,
382    ) -> Result<(usize, usize), DecodeError> {
383        if src.is_empty() {
384            return Err(DecodeError::InputIsEmpty);
385        }
386        let cap = dst.capacity() - dst.len();
387        if cap < 4 {
388            return Err(DecodeError::OutputTooShort);
389        }
390
391        let mut read = 0;
392        let mut write = 0;
393        for bytes in src.chunks_exact(4) {
394            read += 4;
395            let codepoint = u32::from_le_bytes([bytes[2], bytes[3], bytes[0], bytes[1]]);
396            match char::from_u32(codepoint) {
397                Some(c) => {
398                    write += c.len_utf8();
399                    dst.push(c);
400                }
401                None => {
402                    return Err(DecodeError::Malformed {
403                        read,
404                        write,
405                        length: 4,
406                        offset: 0,
407                    });
408                }
409            }
410            if dst.capacity() - dst.len() < 4 {
411                break;
412            }
413        }
414
415        let rem = src.len() - read;
416        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
417            return Err(DecodeError::Malformed {
418                read: src.len(),
419                write,
420                length: src.len() - read,
421                offset: 0,
422            });
423        }
424
425        Ok((read, write))
426    }
427}
428
429// This is not an officially registered name, but it is set for convenience.
430const UCS4_UNUSUAL_3412_NAME: &str = "UCS4-UNUSUAL-3412";
431
432pub struct UCS4Unusual3412Encoder;
433impl Encoder for UCS4Unusual3412Encoder {
434    fn name(&self) -> &'static str {
435        UCS4_UNUSUAL_3412_NAME
436    }
437
438    fn encode(
439        &mut self,
440        src: &str,
441        mut dst: &mut [u8],
442        _finish: bool,
443    ) -> Result<(usize, usize), EncodeError> {
444        if src.is_empty() {
445            return Err(EncodeError::InputIsEmpty);
446        }
447        if dst.len() < 4 {
448            return Err(EncodeError::OutputTooShort);
449        }
450
451        let mut read = 0;
452        let mut write = 0;
453        for c in src.chars() {
454            read += c.len_utf8();
455            let bytes = (c as u32).to_be_bytes();
456            dst[0] = bytes[2];
457            dst[1] = bytes[3];
458            dst[2] = bytes[0];
459            dst[3] = bytes[1];
460            dst = &mut dst[4..];
461            write += 4;
462            if dst.len() < 4 {
463                break;
464            }
465        }
466        Ok((read, write))
467    }
468}
469
470pub struct UCS4Unusual3412Decoder;
471impl Decoder for UCS4Unusual3412Decoder {
472    fn name(&self) -> &'static str {
473        UCS4_UNUSUAL_3412_NAME
474    }
475
476    fn decode(
477        &mut self,
478        src: &[u8],
479        dst: &mut String,
480        finish: bool,
481    ) -> Result<(usize, usize), DecodeError> {
482        if src.is_empty() {
483            return Err(DecodeError::InputIsEmpty);
484        }
485        let cap = dst.capacity() - dst.len();
486        if cap < 4 {
487            return Err(DecodeError::OutputTooShort);
488        }
489
490        let mut read = 0;
491        let mut write = 0;
492        for bytes in src.chunks_exact(4) {
493            read += 4;
494            let codepoint = u32::from_le_bytes([bytes[1], bytes[0], bytes[2], bytes[3]]);
495            match char::from_u32(codepoint) {
496                Some(c) => {
497                    write += c.len_utf8();
498                    dst.push(c);
499                }
500                None => {
501                    return Err(DecodeError::Malformed {
502                        read,
503                        write,
504                        length: 4,
505                        offset: 0,
506                    });
507                }
508            }
509            if dst.capacity() - dst.len() < 4 {
510                break;
511            }
512        }
513
514        let rem = src.len() - read;
515        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
516            return Err(DecodeError::Malformed {
517                read: src.len(),
518                write,
519                length: src.len() - read,
520                offset: 0,
521            });
522        }
523
524        Ok((read, write))
525    }
526}