sipp 0.2.1

Simple parser package
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
use std::io::{Error, ErrorKind, Read};

use crate::{buffer::ByteBuffer, decoder::ByteStreamCharDecoder};

/// Mask of the value bits of a UTF-8 continuation byte.
///
/// A continuation byte has the form `10xxxxxx`: and-ing it with this mask clears the two
/// high-order marker bits and keeps the six low-order bits which contribute to the codepoint.
const CONT_MASK: u8 = 0b0011_1111;

/**
A decoder for a byte stream which is using UTF-8 character encoding.

The decoder owns a `ByteBuffer` and pulls bytes from it one at a time whenever a character is
requested. The struct itself places no bound on `R`; the decoding methods are implemented for
any `R` which implements `std::io::Read`.
*/
pub struct Utf8Decoder<R> {
    // Source of the raw bytes which are decoded into characters.
    byte_buffer: ByteBuffer<R>,
}

impl<R: Read> ByteStreamCharDecoder<R> for Utf8Decoder<R> {
    /**
    Creates a decoder which reads bytes from the given `Read` type and interprets them as UTF-8
    encoded characters.

    # Examples

    A hardcoded string is always UTF-8 encoded in Rust, so its byte slice can be handed to
    `wrap` and the characters read back out one at a time.

    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf8Decoder, ByteStreamCharDecoder};
    use std::io::Error;

    fn main() -> Result<(), Error> {
        let input = "Some input text!";
        let mut decoder = Utf8Decoder::wrap(input.as_bytes());
        while let Some(c) = decoder.decode_char()? {
            println!("Found character {}", c);
        }
        println!("No more input available!");
        Ok(())
    }
    ```
    */
    fn wrap(reader: R) -> Utf8Decoder<R> {
        Self::wrap_buffer(ByteBuffer::wrap(reader))
    }

    /**
    Creates a decoder which reads bytes from the given `ByteBuffer` and interprets them as
    UTF-8 encoded characters.

    # Examples

    When reading a UTF-8 file (or any other `Read` type) which may begin with a BOM (byte-order
    mark), wrap the `File` in a `ByteBuffer` first, inspect and skip the BOM, and only then
    hand the `ByteBuffer` over to a `Utf8Decoder` to read the actual content one character at
    a time.

    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::{buffer::ByteBuffer, decoder::{Utf8Decoder, ByteStreamCharDecoder}};
    use std::fs::File;
    use std::io::Error;

    // A UTF-8 BOM is three bytes: 0xEF 0xBB 0xBF
    const BOM_UTF8: &[u8] = &[0xEF, 0xBB, 0xBF];

    fn main() -> Result<(), Error> {
        let file = File::open("test_resources/xml_utf8_BOM.xml")?;
        let mut byte_buffer = ByteBuffer::wrap(file);
        let first_bytes = byte_buffer.peek()?;
        # let mut found_byte_order_mark = false;
        if first_bytes.len() > 2 && first_bytes[0..3] == *BOM_UTF8 {
            println!("Found input which starts with UTF-8 BOM!");
            # found_byte_order_mark = true;
            // Now read past the three bytes which make up the UTF-8 BOM.
            assert_eq!(byte_buffer.read_next()?, Some(0xEF));
            assert_eq!(byte_buffer.read_next()?, Some(0xBB));
            assert_eq!(byte_buffer.read_next()?, Some(0xBF));
        } else {
            println!("No BOM found!");
        }
        # assert!(found_byte_order_mark);
        // Now the BOM is out of the way, you can wrap the ByteBuffer with Utf8Decoder so that
        // it's ready to decode actual character content.
        let mut decoder = Utf8Decoder::wrap_buffer(byte_buffer);
        # assert_eq!(decoder.decode_char()?, Some('<'));
        Ok(())
    }
    ```
    */
    fn wrap_buffer(byte_buffer: ByteBuffer<R>) -> Self {
        Utf8Decoder { byte_buffer }
    }

    /**
    Decodes and returns the next character of the byte stream, or `None` once the stream has
    no bytes left.

    This method will not (must not) return Unicode surrogate codepoint characters.

    # Errors

    An `std::io::Error` is returned if the bytes read do not decode to a valid character, if
    the stream ends in the middle of a multi-byte sequence, or if reading from the underlying
    byte stream fails.

    # Examples

    Using a hardcoded short string as an example, you can see how `decode_char` works.

    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf8Decoder, ByteStreamCharDecoder};
    use std::io::Error;

    fn main() -> Result<(), Error> {
        let input = "Short";
        let mut decoder = Utf8Decoder::wrap(input.as_bytes());
        // While there is content, Some(c) will be returned.
        assert_eq!(decoder.decode_char()?, Some('S'));
        assert_eq!(decoder.decode_char()?, Some('h'));
        assert_eq!(decoder.decode_char()?, Some('o'));
        assert_eq!(decoder.decode_char()?, Some('r'));
        assert_eq!(decoder.decode_char()?, Some('t'));
        // Once we've consumed all of the content, None will be returned.
        assert_eq!(decoder.decode_char()?, None);
        Ok(())
    }
    ```

    As an example of what might cause an error to be returned, see what happens if we ask
    `Utf8Decoder` to decode a byte sequence which is not a valid character.

    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf8Decoder, ByteStreamCharDecoder};
    use std::io::Error;

    fn main() -> Result<(), Error> {
        // UTF-8 representation of "Hello" followed by Unicode high surrogate codepoint U+D800
        let bytes: &[u8] = &[0x48, 0x65, 0x6C, 0x6C, 0x6F, 0xED, 0xA0, 0x80];
        let mut decoder = Utf8Decoder::wrap(bytes);
        // Reading works fine while we have valid UTF-8 bytes to decode:
        assert_eq!(decoder.decode_char()?, Some('H'));
        assert_eq!(decoder.decode_char()?, Some('e'));
        assert_eq!(decoder.decode_char()?, Some('l'));
        assert_eq!(decoder.decode_char()?, Some('l'));
        assert_eq!(decoder.decode_char()?, Some('o'));
        // But once the decoder reaches the byte sequence for a high surrogate codepoint, which
        // is not a valid character, then an error will be returned.
        let invalid_read = decoder.decode_char();
        assert!(invalid_read.is_err());
        # let invalid_read = decoder.decode_char()?;
        # assert!(invalid_read.is_none());
        Ok(())
    }
    ```

    While you may be able to keep reading after an error has been returned, it is recommended
    that an error is considered to indicate an invalid or corrupt UTF-8 stream, and no further
    reading should be attempted.
    */
    fn decode_char(&mut self) -> Result<Option<char>, Error> {
        // The decoding steps follow the method next_code_point found in the Rust source code:
        // https://doc.rust-lang.org/src/core/str/validations.rs.html#36-70
        let lead_byte = match self.byte_buffer.read_next()? {
            Some(byte) => byte,
            // The stream is exhausted, so there is no character to report.
            None => return Ok(None),
        };

        let sequence_len = Self::determine_utf8_byte_count(lead_byte)?;
        if sequence_len == 1 {
            // A single-byte sequence is its own codepoint.
            return Self::u32_to_char(u32::from(lead_byte));
        }

        // Keep the low five bits of the lead byte; they seed the codepoint being assembled.
        let lead_bits = u32::from(lead_byte & 0x1F);
        let second_byte = self.byte_buffer.read_next()?.ok_or_else(|| {
            Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: second byte missing.",
            )
        })?;
        self.accumulate_multibyte_sequence(sequence_len, lead_bits, second_byte)
    }
}

impl<R: Read> Utf8Decoder<R> {
    /// Returns the total number of bytes (1 to 4) in the UTF-8 sequence which begins with
    /// `start_byte`.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidData` error if `start_byte` cannot begin a well-formed sequence,
    /// which is the case for the intervals 0x80 to 0xC1 and 0xF5 to 0xFF.
    fn determine_utf8_byte_count(start_byte: u8) -> Result<usize, Error> {
        // Based on table 3-7 in section https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf#G7404
        match start_byte {
            0x00..=0x7F => Ok(1),
            0x80..=0xC1 => Err(Self::invalid_data(
                "Invalid UTF-8: start byte from illegal 0x80 to 0xC1 interval.",
            )),
            0xC2..=0xDF => Ok(2),
            0xE0..=0xEF => Ok(3),
            0xF0..=0xF4 => Ok(4),
            0xF5..=0xFF => Err(Self::invalid_data(
                "Invalid UTF-8: start byte from illegal 0xF5 to 0xFF interval.",
            )),
        }
    }

    /// Converts `codepoint` into a `char`.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidData` error if `codepoint` is not a Unicode scalar value, i.e. it is
    /// a surrogate or lies beyond U+10FFFF.
    fn u32_to_char(codepoint: u32) -> Result<Option<char>, Error> {
        char::from_u32(codepoint).map(Some).ok_or_else(|| {
            Self::invalid_data("Invalid UTF-8: byte sequence maps to illegal codepoint.")
        })
    }

    /// Reads the remaining bytes of a multi-byte sequence and assembles the character.
    ///
    /// `width` is the total sequence length as reported by `determine_utf8_byte_count`, `init`
    /// holds the low five bits of the start byte and `second_byte` is the byte which followed
    /// the start byte. Any `width` other than 2 or 3 is treated as a four-byte sequence.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidData` error if the stream ends before the sequence is complete, if a
    /// trailing byte is not a continuation byte (0x80 to 0xBF), if the sequence is an overlong
    /// encoding, or if the resulting codepoint is not a valid `char`. Errors from reading the
    /// underlying byte stream are passed through unchanged.
    fn accumulate_multibyte_sequence(
        &mut self,
        width: usize,
        init: u32,
        second_byte: u8,
    ) -> Result<Option<char>, Error> {
        // Each trailing byte is validated as soon as it is available, so a malformed sequence
        // is rejected without pulling further bytes from the stream.
        let second = Self::continuation_bits(second_byte)?;
        if width == 2 {
            return Self::shortest_form_to_char((init << 6) | second, 0x80);
        }

        let third = self.read_continuation("Invalid UTF-8: third byte missing.")?;
        let second_third = (second << 6) | third;
        if width == 3 {
            return Self::shortest_form_to_char((init << 12) | second_third, 0x800);
        }

        let fourth = self.read_continuation("Invalid UTF-8: fourth byte missing.")?;
        let last_three = (second_third << 6) | fourth;
        // Only the low three bits of a four-byte start byte carry codepoint data.
        Self::shortest_form_to_char(((init & 7) << 18) | last_three, 0x1_0000)
    }

    /// Builds the `InvalidData` error used for every malformed-input condition.
    fn invalid_data(message: &'static str) -> Error {
        Error::new(ErrorKind::InvalidData, message)
    }

    /// Returns the six value bits of `byte`, or an error if `byte` does not have the
    /// `10xxxxxx` form required of a continuation byte.
    fn continuation_bits(byte: u8) -> Result<u32, Error> {
        if byte & !CONT_MASK == 0b1000_0000 {
            Ok(u32::from(byte & CONT_MASK))
        } else {
            Err(Self::invalid_data(
                "Invalid UTF-8: continuation byte outside 0x80 to 0xBF interval.",
            ))
        }
    }

    /// Reads the next byte from the stream and returns its continuation value bits, failing
    /// with `missing_message` if the stream has ended.
    fn read_continuation(&mut self, missing_message: &'static str) -> Result<u32, Error> {
        match self.byte_buffer.read_next()? {
            Some(byte) => Self::continuation_bits(byte),
            None => Err(Self::invalid_data(missing_message)),
        }
    }

    /// Converts `codepoint` into a `char` after checking that it is not smaller than
    /// `smallest_allowed`, the lowest codepoint which needs a sequence of the decoded length.
    /// Anything smaller was encoded with more bytes than necessary (an overlong encoding),
    /// which UTF-8 forbids.
    fn shortest_form_to_char(
        codepoint: u32,
        smallest_allowed: u32,
    ) -> Result<Option<char>, Error> {
        if codepoint < smallest_allowed {
            return Err(Self::invalid_data(
                "Invalid UTF-8: overlong encoding of codepoint.",
            ));
        }
        Self::u32_to_char(codepoint)
    }
}

#[cfg(test)]
mod tests {
    // Everything under test lives in the parent module.
    use super::*;

    /// Decodes every character available from `bytes` and collects them into a `String`.
    fn decode_all(bytes: &[u8]) -> Result<String, Error> {
        let mut decoder = Utf8Decoder::wrap(bytes);
        let mut decoded = String::new();
        while let Some(c) = decoder.decode_char()? {
            decoded.push(c);
        }
        Ok(decoded)
    }

    /// Asserts that the first character read from `bytes` produces an error.
    fn assert_first_read_fails(bytes: &[u8]) {
        let first_read = Utf8Decoder::wrap(bytes).decode_char();
        assert!(first_read.is_err());
    }

    #[test]
    fn test_utf8_empty() -> Result<(), Error> {
        let original = "";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn test_ascii() -> Result<(), Error> {
        let original = "simple ASCII string";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn test_utf8_two_bytes() -> Result<(), Error> {
        let original = "Swedish: Svenska är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige.
        Ukrainian: Украї́нська мо́ва - національна мова українців. Належить до східнослов'янської групи слов'янських мов, що входять до індоєвропейської мовної сім'ї, поряд з романськими, германськими, кельтськими, грецькою, албанською, вірменською та найближче спорідненими зі слов'янськими балтійськими мовами.
        Greek: Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου, ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου. Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn test_utf8_three_bytes() -> Result<(), Error> {
        let original = "Japanese: 日本語 は、日本国内や、かつての日本領だった国、そして国外移民や移住者を含む日本人同士の間で使用されている言語。
        Mathematical symbols: ∀ x ∃ ∅ ∌ x";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn test_utf8_four_bytes() -> Result<(), Error> {
        let original = "Emoticons: 😀 😄 😌 🙄";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn test_utf8_right_to_left_script() -> Result<(), Error> {
        let original = "Arabic: هذه المقالة عن اللغة العربية. لمعانٍ أخرى، طالع عربية (توضيح).
        Uyghur: ھەممە ئادەم زانىدىنلا ئەركىن، ئىززەت-ھۆرمەت ۋە ھوقۇقتا باپباراۋەر بولۇپ تۇغۇلغان.";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn test_utf8_combining_diacritical_marks() -> Result<(), Error> {
        let original = "c\u{30C} = č ≍ č ≠ ć ≍ ć = c\u{301}";
        assert_eq!(decode_all(original.as_bytes())?, original);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_high_surrogate_first() -> Result<(), Error> {
        // UTF-8 representation of first high surrogate codepoint U+D800
        assert_first_read_fails(&[0xED, 0xA0, 0x80]);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_high_surrogate_last() -> Result<(), Error> {
        // UTF-8 representation of last high surrogate codepoint U+DBFF
        assert_first_read_fails(&[0xED, 0xAF, 0xBF]);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_low_surrogate_first() -> Result<(), Error> {
        // UTF-8 representation of first low surrogate codepoint U+DC00
        assert_first_read_fails(&[0xED, 0xB0, 0x80]);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_low_surrogate_last() -> Result<(), Error> {
        // UTF-8 representation of last low surrogate codepoint U+DFFF
        assert_first_read_fails(&[0xED, 0xBF, 0xBF]);
        Ok(())
    }
}