Skip to main content

anyxml_encoding/
ucs4.rs

1use crate::{DecodeError, Decoder, EncodeError, Encoder};
2
3pub const UTF32_NAME: &str = "UTF-32";
4
5#[derive(Debug, Default)]
6pub struct UTF32Encoder {
7    init: bool,
8}
9impl Encoder for UTF32Encoder {
10    fn name(&self) -> &'static str {
11        UTF32_NAME
12    }
13
14    fn encode(
15        &mut self,
16        src: &str,
17        dst: &mut [u8],
18        finish: bool,
19    ) -> Result<(usize, usize), EncodeError> {
20        if src.is_empty() {
21            return Err(EncodeError::InputIsEmpty);
22        }
23        if dst.len() < 4 {
24            return Err(EncodeError::OutputTooShort);
25        }
26
27        if !self.init {
28            self.init = true;
29            // Write BOM as LE
30            dst[0] = 0xFF;
31            dst[1] = 0xFE;
32            dst[2] = 0x00;
33            dst[3] = 0x00;
34            return Ok((0, 4));
35        }
36        UTF32LEEncoder.encode(src, dst, finish)
37    }
38}
39
40pub struct UTF32Decoder {
41    read: usize,
42    top: [u8; 4],
43    be: bool,
44}
45impl Decoder for UTF32Decoder {
46    fn name(&self) -> &'static str {
47        UTF32_NAME
48    }
49
50    fn decode(
51        &mut self,
52        mut src: &[u8],
53        dst: &mut String,
54        finish: bool,
55    ) -> Result<(usize, usize), DecodeError> {
56        if src.is_empty() {
57            return Err(DecodeError::InputIsEmpty);
58        }
59        if dst.capacity() - dst.len() < 4 {
60            return Err(DecodeError::OutputTooShort);
61        }
62
63        let mut base = 0;
64        if self.read < 4 {
65            let orig = src.len();
66            while self.read < 4 && !src.is_empty() {
67                self.top[self.read] = src[0];
68                src = &src[1..];
69                self.read += 1;
70            }
71            base = orig - src.len();
72            if self.read == 4 {
73                // If the first 4 bytes of the buffer are 0xFF, 0xFE, 0x00, 0x00, it is LE;
74                // otherwise, it is BE.
75                if matches!(self.top[..], [0xFF, 0xFE, 0x00, 0x00]) {
76                    self.be = false;
77                    return Ok((base, 0));
78                } else if matches!(self.top[..], [0x00, 0x00, 0xFE, 0xFF]) {
79                    self.be = true;
80                    return Ok((base, 0));
81                } else {
82                    self.be = true;
83                    // Since the first two bytes were not BOM,
84                    // try decoding using the first two bytes that have already been acquired.
85                };
86            } else {
87                return Ok((base, 0));
88            }
89        }
90
91        if self.be && !matches!(self.top[..], [0x00, 0x00, 0xFE, 0xFF]) {
92            let codepoint = u32::from_be_bytes(self.top);
93            let mut write = 0;
94            match char::from_u32(codepoint) {
95                Some(c) => {
96                    write += c.len_utf8();
97                    dst.push(c);
98                }
99                None => {
100                    // If this is the last buffer, or if there is sufficient data to form a surrogate pair but an error occurs,
101                    // it is simply an invalid byte sequence.
102                    return Err(DecodeError::Malformed {
103                        read: 4,
104                        write,
105                        length: 4,
106                        offset: 0,
107                    });
108                }
109            }
110
111            self.top = [0x00, 0x00, 0xFE, 0xFF];
112            return Ok((base, write));
113        }
114
115        if self.be {
116            UTF32BEDecoder.decode(src, dst, finish)
117        } else {
118            UTF32LEDecoder.decode(src, dst, finish)
119        }
120    }
121}
122
123impl Default for UTF32Decoder {
124    fn default() -> Self {
125        Self {
126            read: 0,
127            top: [0; 4],
128            be: true,
129        }
130    }
131}
132
133pub const UTF32BE_NAME: &str = "UTF-32BE";
134
135pub struct UTF32BEEncoder;
136impl Encoder for UTF32BEEncoder {
137    fn name(&self) -> &'static str {
138        UTF32BE_NAME
139    }
140
141    fn encode(
142        &mut self,
143        src: &str,
144        mut dst: &mut [u8],
145        _finish: bool,
146    ) -> Result<(usize, usize), EncodeError> {
147        if src.is_empty() {
148            return Err(EncodeError::InputIsEmpty);
149        }
150        if dst.len() < 4 {
151            return Err(EncodeError::OutputTooShort);
152        }
153
154        let mut read = 0;
155        let mut write = 0;
156        for c in src.chars() {
157            read += c.len_utf8();
158            dst[..4].copy_from_slice(&(c as u32).to_be_bytes()[..]);
159            dst = &mut dst[4..];
160            write += 4;
161            if dst.len() < 4 {
162                break;
163            }
164        }
165        Ok((read, write))
166    }
167}
168
169pub struct UTF32BEDecoder;
170impl Decoder for UTF32BEDecoder {
171    fn name(&self) -> &'static str {
172        UTF32BE_NAME
173    }
174
175    fn decode(
176        &mut self,
177        src: &[u8],
178        dst: &mut String,
179        finish: bool,
180    ) -> Result<(usize, usize), DecodeError> {
181        if src.is_empty() {
182            return Err(DecodeError::InputIsEmpty);
183        }
184        let cap = dst.capacity() - dst.len();
185        if cap < 4 {
186            return Err(DecodeError::OutputTooShort);
187        }
188
189        let mut read = 0;
190        let mut write = 0;
191        for bytes in src.chunks_exact(4) {
192            read += 4;
193            let codepoint = u32::from_be_bytes(bytes.try_into().unwrap());
194            match char::from_u32(codepoint) {
195                Some(c) => {
196                    write += c.len_utf8();
197                    dst.push(c);
198                }
199                None => {
200                    return Err(DecodeError::Malformed {
201                        read,
202                        write,
203                        length: 4,
204                        offset: 0,
205                    });
206                }
207            }
208            if dst.capacity() - dst.len() < 4 {
209                break;
210            }
211        }
212
213        let rem = src.len() - read;
214        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
215            return Err(DecodeError::Malformed {
216                read: src.len(),
217                write,
218                length: src.len() - read,
219                offset: 0,
220            });
221        }
222
223        Ok((read, write))
224    }
225}
226
227pub const UTF32LE_NAME: &str = "UTF-32LE";
228
229pub struct UTF32LEEncoder;
230impl Encoder for UTF32LEEncoder {
231    fn name(&self) -> &'static str {
232        UTF32LE_NAME
233    }
234
235    fn encode(
236        &mut self,
237        src: &str,
238        mut dst: &mut [u8],
239        _finish: bool,
240    ) -> Result<(usize, usize), EncodeError> {
241        if src.is_empty() {
242            return Err(EncodeError::InputIsEmpty);
243        }
244        if dst.len() < 4 {
245            return Err(EncodeError::OutputTooShort);
246        }
247
248        let mut read = 0;
249        let mut write = 0;
250        for c in src.chars() {
251            read += c.len_utf8();
252            dst[..4].copy_from_slice(&(c as u32).to_le_bytes()[..]);
253            dst = &mut dst[4..];
254            write += 4;
255            if dst.len() < 4 {
256                break;
257            }
258        }
259        Ok((read, write))
260    }
261}
262
263pub struct UTF32LEDecoder;
264impl Decoder for UTF32LEDecoder {
265    fn name(&self) -> &'static str {
266        UTF32LE_NAME
267    }
268
269    fn decode(
270        &mut self,
271        src: &[u8],
272        dst: &mut String,
273        finish: bool,
274    ) -> Result<(usize, usize), DecodeError> {
275        if src.is_empty() {
276            return Err(DecodeError::InputIsEmpty);
277        }
278        let cap = dst.capacity() - dst.len();
279        if cap < 4 {
280            return Err(DecodeError::OutputTooShort);
281        }
282
283        let mut read = 0;
284        let mut write = 0;
285        for bytes in src.chunks_exact(4) {
286            read += 4;
287            let codepoint = u32::from_le_bytes(bytes.try_into().unwrap());
288            match char::from_u32(codepoint) {
289                Some(c) => {
290                    write += c.len_utf8();
291                    dst.push(c);
292                }
293                None => {
294                    return Err(DecodeError::Malformed {
295                        read,
296                        write,
297                        length: 4,
298                        offset: 0,
299                    });
300                }
301            }
302            if dst.capacity() - dst.len() < 4 {
303                break;
304            }
305        }
306
307        let rem = src.len() - read;
308        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
309            return Err(DecodeError::Malformed {
310                read: src.len(),
311                write,
312                length: src.len() - read,
313                offset: 0,
314            });
315        }
316
317        Ok((read, write))
318    }
319}
320
321// This is not an officially registered name, but it is set for convenience.
322const UCS4_UNUSUAL_2143_NAME: &str = "UCS4-UNUSUAL-2143";
323
324pub struct UCS4Unusual2143Encoder;
325impl Encoder for UCS4Unusual2143Encoder {
326    fn name(&self) -> &'static str {
327        UCS4_UNUSUAL_2143_NAME
328    }
329
330    fn encode(
331        &mut self,
332        src: &str,
333        mut dst: &mut [u8],
334        _finish: bool,
335    ) -> Result<(usize, usize), EncodeError> {
336        if src.is_empty() {
337            return Err(EncodeError::InputIsEmpty);
338        }
339        if dst.len() < 4 {
340            return Err(EncodeError::OutputTooShort);
341        }
342
343        let mut read = 0;
344        let mut write = 0;
345        for c in src.chars() {
346            read += c.len_utf8();
347            let bytes = (c as u32).to_be_bytes();
348            dst[0] = bytes[1];
349            dst[1] = bytes[0];
350            dst[2] = bytes[3];
351            dst[3] = bytes[2];
352            dst = &mut dst[4..];
353            write += 4;
354            if dst.len() < 4 {
355                break;
356            }
357        }
358        Ok((read, write))
359    }
360}
361
362pub struct UCS4Unusual2143Decoder;
363impl Decoder for UCS4Unusual2143Decoder {
364    fn name(&self) -> &'static str {
365        UCS4_UNUSUAL_2143_NAME
366    }
367
368    fn decode(
369        &mut self,
370        src: &[u8],
371        dst: &mut String,
372        finish: bool,
373    ) -> Result<(usize, usize), DecodeError> {
374        if src.is_empty() {
375            return Err(DecodeError::InputIsEmpty);
376        }
377        let cap = dst.capacity() - dst.len();
378        if cap < 4 {
379            return Err(DecodeError::OutputTooShort);
380        }
381
382        let mut read = 0;
383        let mut write = 0;
384        for bytes in src.chunks_exact(4) {
385            read += 4;
386            let codepoint = u32::from_le_bytes([bytes[2], bytes[3], bytes[0], bytes[1]]);
387            match char::from_u32(codepoint) {
388                Some(c) => {
389                    write += c.len_utf8();
390                    dst.push(c);
391                }
392                None => {
393                    return Err(DecodeError::Malformed {
394                        read,
395                        write,
396                        length: 4,
397                        offset: 0,
398                    });
399                }
400            }
401            if dst.capacity() - dst.len() < 4 {
402                break;
403            }
404        }
405
406        let rem = src.len() - read;
407        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
408            return Err(DecodeError::Malformed {
409                read: src.len(),
410                write,
411                length: src.len() - read,
412                offset: 0,
413            });
414        }
415
416        Ok((read, write))
417    }
418}
419
420// This is not an officially registered name, but it is set for convenience.
421const UCS4_UNUSUAL_3412_NAME: &str = "UCS4-UNUSUAL-3412";
422
423pub struct UCS4Unusual3412Encoder;
424impl Encoder for UCS4Unusual3412Encoder {
425    fn name(&self) -> &'static str {
426        UCS4_UNUSUAL_3412_NAME
427    }
428
429    fn encode(
430        &mut self,
431        src: &str,
432        mut dst: &mut [u8],
433        _finish: bool,
434    ) -> Result<(usize, usize), EncodeError> {
435        if src.is_empty() {
436            return Err(EncodeError::InputIsEmpty);
437        }
438        if dst.len() < 4 {
439            return Err(EncodeError::OutputTooShort);
440        }
441
442        let mut read = 0;
443        let mut write = 0;
444        for c in src.chars() {
445            read += c.len_utf8();
446            let bytes = (c as u32).to_be_bytes();
447            dst[0] = bytes[2];
448            dst[1] = bytes[3];
449            dst[2] = bytes[0];
450            dst[3] = bytes[1];
451            dst = &mut dst[4..];
452            write += 4;
453            if dst.len() < 4 {
454                break;
455            }
456        }
457        Ok((read, write))
458    }
459}
460
461pub struct UCS4Unusual3412Decoder;
462impl Decoder for UCS4Unusual3412Decoder {
463    fn name(&self) -> &'static str {
464        UCS4_UNUSUAL_3412_NAME
465    }
466
467    fn decode(
468        &mut self,
469        src: &[u8],
470        dst: &mut String,
471        finish: bool,
472    ) -> Result<(usize, usize), DecodeError> {
473        if src.is_empty() {
474            return Err(DecodeError::InputIsEmpty);
475        }
476        let cap = dst.capacity() - dst.len();
477        if cap < 4 {
478            return Err(DecodeError::OutputTooShort);
479        }
480
481        let mut read = 0;
482        let mut write = 0;
483        for bytes in src.chunks_exact(4) {
484            read += 4;
485            let codepoint = u32::from_le_bytes([bytes[1], bytes[0], bytes[2], bytes[3]]);
486            match char::from_u32(codepoint) {
487                Some(c) => {
488                    write += c.len_utf8();
489                    dst.push(c);
490                }
491                None => {
492                    return Err(DecodeError::Malformed {
493                        read,
494                        write,
495                        length: 4,
496                        offset: 0,
497                    });
498                }
499            }
500            if dst.capacity() - dst.len() < 4 {
501                break;
502            }
503        }
504
505        let rem = src.len() - read;
506        if finish && rem < 4 && rem != 0 && dst.capacity() - dst.len() >= 4 {
507            return Err(DecodeError::Malformed {
508                read: src.len(),
509                write,
510                length: src.len() - read,
511                offset: 0,
512            });
513        }
514
515        Ok((read, write))
516    }
517}