1#![allow(dead_code)]
2#![allow(clippy::transmute_int_to_char)]
3use std::io::BufRead;
6use std::mem::transmute;
7
8use crate::common::*;
9use crate::utf8::SequenceType::Unrecognised;
10use crate::{decoder_error, invalid_byte_sequence};
11
/// Classification of a UTF-8 sequence, derived from the bit pattern of its first byte.
enum SequenceType {
    /// One-byte sequence: the lead byte has its top bit clear (`0xxxxxxx`).
    Single,
    /// Two-byte sequence: the lead byte matches `110xxxxx`.
    Pair,
    /// Three-byte sequence: the lead byte matches `1110xxxx`.
    Triple,
    /// Four-byte sequence: the lead byte matches `11110xxx`.
    Quad,
    /// The byte matches none of the lead-byte patterns above.
    Unrecognised,
}
19
/// Low seven bits of a byte (not referenced by the code visible in this file).
const SINGLE_BYTE_MASK: u32 = 0b0111_1111;
/// Payload bits (low five) of the lead byte of a two-byte sequence.
const DOUBLE_BYTE_MASK: u32 = 0b0001_1111;
/// Payload bits (low four) of the lead byte of a three-byte sequence.
const TRIPLE_BYTE_MASK: u32 = 0b0000_1111;
/// Payload bits (low three) of the lead byte of a four-byte sequence.
const QUAD_BYTE_MASK: u32 = 0b0000_0111;
/// Payload bits (low six) of every byte that follows the lead byte.
const FOLLOWING_BYTE_MASK: u32 = 0b0011_1111;

/// Lowest value of the range rejected by the decoder for three-byte sequences.
const TRIPLE_EXCLUDED_LOW_BOUND: u32 = 0xd800;

/// Highest value of the range rejected by the decoder for three-byte sequences.
const TRIPLE_EXCLUDED_HIGH_BOUND: u32 = 0xdfff;

/// Largest value the decoder accepts from a four-byte sequence.
const QUAD_HIGH_BOUND: u32 = 0x10ffff;
39
/// Evaluates to `true` when `$byte` has no bits set above bit 6, i.e. it is
/// a complete one-byte sequence (`0xxxxxxx`).
macro_rules! single_byte_sequence {
    ($byte:expr) => {
        matches!($byte >> 7, 0)
    };
}
46
/// Evaluates to `true` when the bits of `$byte` above bit 4 are exactly `110`,
/// the lead-byte pattern of a two-byte sequence (`110xxxxx`).
macro_rules! double_byte_sequence {
    ($byte:expr) => {
        matches!($byte >> 5, 0b110)
    };
}
53
/// Evaluates to `true` when the bits of `$byte` above bit 3 are exactly `1110`,
/// the lead-byte pattern of a three-byte sequence (`1110xxxx`).
macro_rules! triple_byte_sequence {
    ($byte:expr) => {
        matches!($byte >> 4, 0b1110)
    };
}
60
/// Evaluates to `true` when the bits of `$byte` above bit 2 are exactly `11110`,
/// the lead-byte pattern of a four-byte sequence (`11110xxx`).
macro_rules! quad_byte_sequence {
    ($byte:expr) => {
        matches!($byte >> 3, 0b1_1110)
    };
}
67
/// Combines the payload bits of a two-byte sequence held in `$buf` into a `u32`.
///
/// `$buf` must be indexable at positions 0 and 1. The bytes are only masked,
/// not validated. `DOUBLE_BYTE_MASK` and `FOLLOWING_BYTE_MASK` must be in
/// scope where the macro is invoked.
macro_rules! decode_pair {
    ($buf:expr) => {{
        let trailing = $buf[1] as u32 & FOLLOWING_BYTE_MASK;
        let leading = $buf[0] as u32 & DOUBLE_BYTE_MASK;
        (leading << 6) | trailing
    }};
}
73
/// Combines the payload bits of a three-byte sequence held in `$buf` into a `u32`.
///
/// `$buf` must be indexable at positions 0 through 2. The bytes are only
/// masked, not validated. `TRIPLE_BYTE_MASK` and `FOLLOWING_BYTE_MASK` must be
/// in scope where the macro is invoked.
macro_rules! decode_triple {
    ($buf:expr) => {{
        let last = $buf[2] as u32 & FOLLOWING_BYTE_MASK;
        let middle = $buf[1] as u32 & FOLLOWING_BYTE_MASK;
        let leading = $buf[0] as u32 & TRIPLE_BYTE_MASK;
        (leading << 12) | (middle << 6) | last
    }};
}
81
/// Combines the payload bits of a four-byte sequence held in `$buf` into a `u32`.
///
/// `$buf` must be indexable at positions 0 through 3. The bytes are only
/// masked, not validated. `QUAD_BYTE_MASK` and `FOLLOWING_BYTE_MASK` must be
/// in scope where the macro is invoked.
macro_rules! decode_quad {
    ($buf:expr) => {{
        let fourth = $buf[3] as u32 & FOLLOWING_BYTE_MASK;
        let third = $buf[2] as u32 & FOLLOWING_BYTE_MASK;
        let second = $buf[1] as u32 & FOLLOWING_BYTE_MASK;
        let leading = $buf[0] as u32 & QUAD_BYTE_MASK;
        (leading << 18) | (second << 12) | (third << 6) | fourth
    }};
}
90
91#[inline]
93fn sequence_type(b: u8) -> SequenceType {
94 if single_byte_sequence!(b) {
95 return SequenceType::Single;
96 }
97 if triple_byte_sequence!(b) {
98 return SequenceType::Triple;
99 }
100 if double_byte_sequence!(b) {
101 return SequenceType::Pair;
102 }
103 if quad_byte_sequence!(b) {
104 return SequenceType::Quad;
105 }
106 Unrecognised
107}
108
/// A decoder that turns the bytes of a buffered reader into `char` values.
pub struct Utf8Decoder<'a, B: BufRead> {
    /// The reader the bytes are taken from.
    input: &'a mut B,
    /// Bytes read from `input`; filled by a single `read_to_end` call.
    buffer: Vec<u8>,
    /// Whether `buffer` has been filled from `input` yet.
    init: bool,
    /// Offset into `buffer` of the next byte to decode.
    index: usize,
}
118
119impl<'a, Buffer: BufRead> Utf8Decoder<'a, Buffer> {
120 pub fn new(r: &'a mut Buffer) -> Self {
122 Utf8Decoder {
123 input: r,
124 buffer: vec![],
125 init: false,
126 index: 0,
127 }
128 }
129
130 fn init(&mut self) -> DecoderResult<()> {
132 match self.input.read_to_end(&mut self.buffer) {
133 Ok(_) => {
134 self.init = true;
135 Ok(())
136 }
137 Err(_) => Err(decoder_error!(
138 DecoderErrorCode::StreamFailure,
139 "failed to read input"
140 )),
141 }
142 }
143
144 fn decode_next(&mut self) -> DecoderResult<char> {
147 if !self.init {
148 self.init()?;
149 }
150
151 if self.index >= self.buffer.len() {
152 return Err(decoder_error!(
153 DecoderErrorCode::EndOfInput,
154 "end of input reached"
155 ));
156 }
157
158 match sequence_type(self.buffer[self.index]) {
159 SequenceType::Single => unsafe {
160 self.index += 1;
161 Ok(transmute(self.buffer[self.index - 1] as u32))
162 },
163 SequenceType::Pair => unsafe {
164 self.index += 2;
165 Ok(transmute(decode_pair!(
166 &self.buffer[self.index - 2..self.index]
167 )))
168 },
169 SequenceType::Triple => unsafe {
170 self.index += 3;
171 let value = decode_triple!(&self.buffer[self.index - 3..self.index]);
172 if (TRIPLE_EXCLUDED_LOW_BOUND..=TRIPLE_EXCLUDED_HIGH_BOUND).contains(&value) {
173 Err(decoder_error!(
174 DecoderErrorCode::OutOfRange,
175 "value falls within forbidden range [0xd800, 0xdfff]"
176 ))
177 } else {
178 Ok(transmute(value))
179 }
180 },
181 SequenceType::Quad => unsafe {
182 self.index += 4;
183 let value = decode_quad!(&self.buffer[self.index - 4..self.index]);
184 if value > QUAD_HIGH_BOUND {
185 Err(decoder_error!(
186 DecoderErrorCode::OutOfRange,
187 "value falls outside maximum bound 0x10ffff"
188 ))
189 } else {
190 Ok(transmute(value))
191 }
192 },
193 Unrecognised => {
194 invalid_byte_sequence!()
195 }
196 }
197 }
198}
199
200impl<'a, B: BufRead> Iterator for Utf8Decoder<'a, B> {
201 type Item = char;
202 fn next(&mut self) -> Option<Self::Item> {
204 match self.decode_next() {
205 Ok(c) => Some(c),
206 Err(_) => None,
207 }
208 }
209}
210
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::iter;
    use std::time::Instant;

    use crate::utf8::Utf8Decoder;

    /// Opens the fuzz fixture, panicking when it is missing.
    fn fuzz_file() -> File {
        File::open("fixtures/fuzz.txt").unwrap()
    }

    /// Opens the large JSON fixture, panicking when it is missing.
    fn complex_file() -> File {
        File::open("fixtures/json/bench/utf8/twitter.json").unwrap()
    }

    /// Calls `decode_next` until it first fails and reports how many
    /// characters were decoded before that.
    fn count_decoded<B: BufRead>(decoder: &mut Utf8Decoder<'_, B>) -> usize {
        iter::from_fn(|| decoder.decode_next().ok()).count()
    }

    #[test]
    fn can_create_from_array() {
        let bytes: &[u8] = &[0x10, 0x12, 0x23, 0x12];
        let mut reader = BufReader::new(bytes);
        let mut decoder = Utf8Decoder::new(&mut reader);
        // Only checks that decoding runs to completion; the count is not asserted.
        let _decoded = count_decoded(&mut decoder);
    }

    #[test]
    fn can_create_from_file() {
        let mut reader = BufReader::new(fuzz_file());
        let _decoder = Utf8Decoder::new(&mut reader);
    }

    #[test]
    fn pass_a_fuzz_test() {
        let started = Instant::now();
        let mut reader = BufReader::new(fuzz_file());
        let mut decoder = Utf8Decoder::new(&mut reader);
        let decoded = count_decoded(&mut decoder);
        assert_eq!(decoded, 35283);
        println!("Decoded fuzz file in {:?}", started.elapsed());
    }

    #[test]
    fn decode_a_complex_document() {
        let mut reader = BufReader::new(complex_file());
        let mut decoder = Utf8Decoder::new(&mut reader);
        let decoded = count_decoded(&mut decoder);
        assert_eq!(decoded, 567916);
    }

    #[test]
    fn should_be_an_iterator() {
        let started = Instant::now();
        let mut reader = BufReader::new(fuzz_file());
        let decoder = Utf8Decoder::new(&mut reader);
        assert_eq!(decoder.count(), 35283);
        println!("Counted fuzz file in {:?}", started.elapsed());
    }
}