chisel_decoders/
utf8.rs

1#![allow(dead_code)]
2#![allow(clippy::transmute_int_to_char)]
3//! A character-oriented decoder implementation that will take an underlying [std::u8] (byte) source
4//! and produce a stream of decoded Unicode (UTF-8) characters
5use std::io::BufRead;
6use std::mem::transmute;
7
8use crate::common::*;
9use crate::utf8::SequenceType::Unrecognised;
10use crate::{decoder_error, invalid_byte_sequence};
11
/// Classification of a UTF-8 sequence, determined from the bit pattern of its leading byte.
///
/// The derives make the classification cheap to pass around and usable in assertions and
/// diagnostic output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SequenceType {
    /// One byte sequence (leading byte of the form `0xxxxxxx`)
    Single,
    /// Two byte sequence (leading byte of the form `110xxxxx`)
    Pair,
    /// Three byte sequence (leading byte of the form `1110xxxx`)
    Triple,
    /// Four byte sequence (leading byte of the form `11110xxx`)
    Quad,
    /// The leading byte matches none of the patterns above
    Unrecognised,
}
19
/// Selects the 7 payload bits of a single byte sequence (`0111_1111`)
const SINGLE_BYTE_MASK: u32 = 0x7f;
/// Selects the 5 payload bits in the leading byte of a double byte UTF-8 sequence (`0001_1111`)
const DOUBLE_BYTE_MASK: u32 = 0x1f;
/// Selects the 4 payload bits in the leading byte of a triple byte UTF-8 sequence (`0000_1111`)
const TRIPLE_BYTE_MASK: u32 = 0x0f;
/// Selects the 3 payload bits in the leading byte of a quad byte UTF-8 sequence (`0000_0111`)
const QUAD_BYTE_MASK: u32 = 0x07;
/// Selects the 6 payload bits of every following byte in a UTF-8 sequence (`0011_1111`)
const FOLLOWING_BYTE_MASK: u32 = 0x3f;

/// First value of the range that a triple byte sequence is not allowed to decode to
const TRIPLE_EXCLUDED_LOW_BOUND: u32 = 0xD800;

/// Last value of the range that a triple byte sequence is not allowed to decode to
const TRIPLE_EXCLUDED_HIGH_BOUND: u32 = 0xDFFF;

/// Largest value that a quad byte sequence is allowed to decode to
const QUAD_HIGH_BOUND: u32 = 0x0010_FFFF;
39
/// Evaluates to `true` when everything above the low 7 bits of the given byte is zero,
/// i.e. the byte has the form `0xxxxxxx`
macro_rules! single_byte_sequence {
    ($lead : expr) => {
        matches!($lead >> 7, 0)
    };
}
46
/// Evaluates to `true` when the given byte, with its low 5 bits shifted out, equals `110`,
/// i.e. the byte has the form `110xxxxx`
macro_rules! double_byte_sequence {
    ($lead : expr) => {
        matches!($lead >> 5, 0b110)
    };
}
53
/// Evaluates to `true` when the given byte, with its low 4 bits shifted out, equals `1110`,
/// i.e. the byte has the form `1110xxxx`
macro_rules! triple_byte_sequence {
    ($lead : expr) => {
        matches!($lead >> 4, 0b1110)
    };
}
60
/// Evaluates to `true` when the given byte, with its low 3 bits shifted out, equals `11110`,
/// i.e. the byte has the form `11110xxx`
macro_rules! quad_byte_sequence {
    ($lead : expr) => {
        matches!($lead >> 3, 0b1_1110)
    };
}
67
/// Assemble the `u32` value of a two byte sequence from the first two elements of the given
/// indexable expression: 5 bits from the leading byte followed by 6 bits from the second byte.
/// No validation is performed here.
macro_rules! decode_pair {
    ($bytes : expr) => {{
        // The trailing byte is read first so that indexing happens in the same order as before
        let low = ($bytes[1] as u32) & FOLLOWING_BYTE_MASK;
        let high = (($bytes[0] as u32) & DOUBLE_BYTE_MASK) << 6;
        high | low
    }};
}
73
/// Assemble the `u32` value of a three byte sequence from the first three elements of the
/// given indexable expression: 4 bits from the leading byte followed by 6 bits from each of
/// the next two bytes. No validation is performed here.
macro_rules! decode_triple {
    ($bytes : expr) => {{
        // Bytes are read last-to-first so that indexing happens in the same order as before
        let low = ($bytes[2] as u32) & FOLLOWING_BYTE_MASK;
        let middle = (($bytes[1] as u32) & FOLLOWING_BYTE_MASK) << 6;
        let high = (($bytes[0] as u32) & TRIPLE_BYTE_MASK) << 12;
        high | middle | low
    }};
}
81
/// Assemble the `u32` value of a four byte sequence from the first four elements of the given
/// indexable expression: 3 bits from the leading byte followed by 6 bits from each of the
/// next three bytes. No validation is performed here.
macro_rules! decode_quad {
    ($bytes : expr) => {{
        // Bytes are read last-to-first so that indexing happens in the same order as before
        let b3 = ($bytes[3] as u32) & FOLLOWING_BYTE_MASK;
        let b2 = (($bytes[2] as u32) & FOLLOWING_BYTE_MASK) << 6;
        let b1 = (($bytes[1] as u32) & FOLLOWING_BYTE_MASK) << 12;
        let b0 = (($bytes[0] as u32) & QUAD_BYTE_MASK) << 18;
        b0 | b1 | b2 | b3
    }};
}
90
91/// Determine what kind of UTF-8 sequence we're dealing with
92#[inline]
93fn sequence_type(b: u8) -> SequenceType {
94    if single_byte_sequence!(b) {
95        return SequenceType::Single;
96    }
97    if triple_byte_sequence!(b) {
98        return SequenceType::Triple;
99    }
100    if double_byte_sequence!(b) {
101        return SequenceType::Pair;
102    }
103    if quad_byte_sequence!(b) {
104        return SequenceType::Quad;
105    }
106    Unrecognised
107}
108
/// A UTF-8 decoder which borrows a [BufRead] instance as its byte source.
pub struct Utf8Decoder<'a, B>
where
    B: BufRead,
{
    /// The borrowed input stream
    input: &'a mut B,
    /// Staging buffer that the input is read into before decoding
    buffer: Vec<u8>,
    /// Set once the input has been read into the staging buffer
    init: bool,
    /// Offset into the staging buffer of the next byte to decode
    index: usize,
}
118
119impl<'a, Buffer: BufRead> Utf8Decoder<'a, Buffer> {
120    /// Create a new decoder with a default buffer size
121    pub fn new(r: &'a mut Buffer) -> Self {
122        Utf8Decoder {
123            input: r,
124            buffer: vec![],
125            init: false,
126            index: 0,
127        }
128    }
129
130    /// Initialise and read the input into an internal buffer for decoding
131    fn init(&mut self) -> DecoderResult<()> {
132        match self.input.read_to_end(&mut self.buffer) {
133            Ok(_) => {
134                self.init = true;
135                Ok(())
136            }
137            Err(_) => Err(decoder_error!(
138                DecoderErrorCode::StreamFailure,
139                "failed to read input"
140            )),
141        }
142    }
143
144    /// Attempt to decode the next character in the underlying stream. Assumes the maximum
145    /// number of unicode bytes is 4 *not* 6
146    fn decode_next(&mut self) -> DecoderResult<char> {
147        if !self.init {
148            self.init()?;
149        }
150
151        if self.index >= self.buffer.len() {
152            return Err(decoder_error!(
153                DecoderErrorCode::EndOfInput,
154                "end of input reached"
155            ));
156        }
157
158        match sequence_type(self.buffer[self.index]) {
159            SequenceType::Single => unsafe {
160                self.index += 1;
161                Ok(transmute(self.buffer[self.index - 1] as u32))
162            },
163            SequenceType::Pair => unsafe {
164                self.index += 2;
165                Ok(transmute(decode_pair!(
166                    &self.buffer[self.index - 2..self.index]
167                )))
168            },
169            SequenceType::Triple => unsafe {
170                self.index += 3;
171                let value = decode_triple!(&self.buffer[self.index - 3..self.index]);
172                if (TRIPLE_EXCLUDED_LOW_BOUND..=TRIPLE_EXCLUDED_HIGH_BOUND).contains(&value) {
173                    Err(decoder_error!(
174                        DecoderErrorCode::OutOfRange,
175                        "value falls within forbidden range [0xd800, 0xdfff]"
176                    ))
177                } else {
178                    Ok(transmute(value))
179                }
180            },
181            SequenceType::Quad => unsafe {
182                self.index += 4;
183                let value = decode_quad!(&self.buffer[self.index - 4..self.index]);
184                if value > QUAD_HIGH_BOUND {
185                    Err(decoder_error!(
186                        DecoderErrorCode::OutOfRange,
187                        "value falls outside maximum bound 0x10ffff"
188                    ))
189                } else {
190                    Ok(transmute(value))
191                }
192            },
193            Unrecognised => {
194                invalid_byte_sequence!()
195            }
196        }
197    }
198}
199
200impl<'a, B: BufRead> Iterator for Utf8Decoder<'a, B> {
201    type Item = char;
202    /// Decode the next character from the underlying stream
203    fn next(&mut self) -> Option<Self::Item> {
204        match self.decode_next() {
205            Ok(c) => Some(c),
206            Err(_) => None,
207        }
208    }
209}
210
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::time::Instant;

    use crate::utf8::Utf8Decoder;

    /// Open the fuzz fixture; panics if the fixture is missing
    fn fuzz_file() -> File {
        File::open("fixtures/fuzz.txt").unwrap()
    }

    /// Open the large JSON fixture; panics if the fixture is missing
    fn complex_file() -> File {
        File::open("fixtures/json/bench/utf8/twitter.json").unwrap()
    }

    /// Call `decode_next` until it first reports an error, returning the number of characters
    /// decoded up to that point
    fn count_decoded<B: BufRead>(decoder: &mut Utf8Decoder<'_, B>) -> usize {
        let mut count = 0;
        while decoder.decode_next().is_ok() {
            count += 1;
        }
        count
    }

    #[test]
    fn can_create_from_array() {
        let buffer: &[u8] = &[0x10, 0x12, 0x23, 0x12];
        let mut reader = BufReader::new(buffer);
        let mut decoder = Utf8Decoder::new(&mut reader);
        // Four single byte sequences, so exactly four characters are expected
        assert_eq!(count_decoded(&mut decoder), 4);
    }

    #[test]
    fn can_create_from_file() {
        let mut reader = BufReader::new(fuzz_file());
        let _decoder = Utf8Decoder::new(&mut reader);
    }

    #[test]
    fn pass_a_fuzz_test() {
        let start = Instant::now();
        let mut reader = BufReader::new(fuzz_file());
        let mut decoder = Utf8Decoder::new(&mut reader);
        assert_eq!(count_decoded(&mut decoder), 35283);
        println!("Decoded fuzz file in {:?}", start.elapsed());
    }

    #[test]
    fn decode_a_complex_document() {
        let mut reader = BufReader::new(complex_file());
        let mut decoder = Utf8Decoder::new(&mut reader);
        assert_eq!(count_decoded(&mut decoder), 567916);
    }

    #[test]
    fn should_be_an_iterator() {
        let start = Instant::now();
        let mut reader = BufReader::new(fuzz_file());
        let decoder = Utf8Decoder::new(&mut reader);
        assert_eq!(decoder.count(), 35283);
        println!("Counted fuzz file in {:?}", start.elapsed());
    }
}