1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#![allow(dead_code)]
#![allow(clippy::transmute_int_to_char)]
//! A character-oriented decoder implementation that takes an underlying [`u8`] (byte) source
//! and produces a stream of decoded ASCII characters.
use std::io::BufRead;
use std::mem::transmute;

use crate::common::*;
use crate::decoder_error;

/// An ASCII decoder, which takes a mutable ref to a [BufRead] instance and decodes its bytes
/// into [char]s.
///
/// The whole input is read into `buffer` on the first decode attempt (see `init`), after which
/// characters are produced by walking `index` across that buffer.
pub struct AsciiDecoder<'a, B: BufRead> {
    /// The borrowed input stream, read to its end during initialisation
    input: &'a mut B,
    /// Staging buffer holding the bytes read from `input`
    buffer: Vec<u8>,
    /// Initialisation flag, set to `true` once `input` has been read into `buffer` successfully
    init: bool,
    /// The current index into `buffer`, i.e. the position of the next byte to decode
    index: usize,
}

impl<'a, Buffer: BufRead> AsciiDecoder<'a, Buffer> {
    /// Create a new decoder over the given reader.
    ///
    /// No I/O is performed here: the staging buffer starts out empty and the input is only
    /// read when the first character is requested.
    pub fn new(r: &'a mut Buffer) -> Self {
        AsciiDecoder {
            input: r,
            buffer: Vec::new(),
            init: false,
            index: 0,
        }
    }

    /// Initialise the decoder by reading the entire input into the internal staging buffer.
    ///
    /// # Errors
    /// Returns a `DecoderErrorCode::StreamFailure` error if the underlying read fails. The
    /// initialisation flag is left unset in that case, so the next decode attempt will try
    /// to read the input again.
    fn init(&mut self) -> DecoderResult<()> {
        if self.input.read_to_end(&mut self.buffer).is_err() {
            return Err(decoder_error!(
                DecoderErrorCode::StreamFailure,
                "failed to read input"
            ));
        }
        self.init = true;
        Ok(())
    }

    /// Attempt to decode the next character in the underlying stream.
    ///
    /// The input is lazily read on the first call. On success the decoder advances by one
    /// byte; on any error the position is left untouched.
    ///
    /// # Errors
    /// - `DecoderErrorCode::StreamFailure` if the lazy initialisation read fails
    /// - `DecoderErrorCode::EndOfInput` once every buffered byte has been consumed
    /// - `DecoderErrorCode::OutOfRange` if the next byte has its high bit set (non-ASCII)
    fn decode_next(&mut self) -> DecoderResult<char> {
        if !self.init {
            self.init()?;
        }

        match self.buffer.get(self.index).copied() {
            None => Err(decoder_error!(
                DecoderErrorCode::EndOfInput,
                "end of input reached"
            )),
            Some(byte) if byte.is_ascii() => {
                self.index += 1;
                // Every u8 maps directly onto a valid scalar value, so no `unsafe`
                // transmute is needed to build the `char`.
                Ok(char::from(byte))
            }
            Some(_) => Err(decoder_error!(
                DecoderErrorCode::OutOfRange,
                "non-ascii character detected"
            )),
        }
    }
}

impl<'a, B: BufRead> Iterator for AsciiDecoder<'a, B> {
    type Item = char;

    /// Yield the next decoded character from the underlying stream.
    ///
    /// Any decoding error (whatever its code) is discarded and reported as `None`, which
    /// ends the iteration.
    fn next(&mut self) -> Option<Self::Item> {
        self.decode_next().ok()
    }
}

#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::BufReader;
    use std::time::Instant;

    use crate::ascii::AsciiDecoder;
    use crate::common::DecoderErrorCode;

    /// Number of characters the tests expect to decode from the ASCII fixture file
    const ASCII_FIXTURE_CHARS: usize = 6406307;

    /// Open the UTF-8 fuzz fixture (panics if the fixture is missing)
    fn utf8_fuzz_file() -> File {
        File::open("fixtures/fuzz.txt").unwrap()
    }
    /// Open the ASCII fixture (panics if the fixture is missing)
    fn ascii_fuzz_file() -> File {
        File::open("fixtures/json/bench/ascii/asciiart.json").unwrap()
    }
    /// Open the twitter JSON fixture (panics if the fixture is missing)
    fn complex_file() -> File {
        File::open("fixtures/json/bench/utf8/twitter.json").unwrap()
    }

    #[test]
    fn can_create_from_array() {
        let buffer: &[u8] = &[0x10, 0x12, 0x23, 0x12];
        let mut reader = BufReader::new(buffer);
        let mut decoder = AsciiDecoder::new(&mut reader);
        let mut count = 0;
        while decoder.decode_next().is_ok() {
            count += 1;
        }
        // Every input byte is below 0x80, so each one must decode to a character.
        assert_eq!(count, buffer.len());
    }

    #[test]
    fn can_create_from_file() {
        let mut reader = BufReader::new(utf8_fuzz_file());
        let _decoder = AsciiDecoder::new(&mut reader);
    }

    #[test]
    fn should_out_of_range_utf8() {
        let mut reader = BufReader::new(utf8_fuzz_file());
        let mut decoder = AsciiDecoder::new(&mut reader);
        // Decode until the first failure, which is asserted to be an out-of-range error.
        loop {
            match decoder.decode_next() {
                Ok(_) => (),
                Err(e) => {
                    assert_eq!(e.code, DecoderErrorCode::OutOfRange);
                    break;
                }
            }
        }
    }

    #[test]
    fn should_pass_an_ascii_fuzz_test() {
        let mut reader = BufReader::new(ascii_fuzz_file());
        let mut decoder = AsciiDecoder::new(&mut reader);
        let mut count = 0;
        while decoder.decode_next().is_ok() {
            count += 1;
        }
        assert_eq!(count, ASCII_FIXTURE_CHARS);
    }

    #[test]
    fn should_be_an_iterator() {
        let start = Instant::now();
        let mut reader = BufReader::new(ascii_fuzz_file());
        let decoder = AsciiDecoder::new(&mut reader);
        assert_eq!(decoder.count(), ASCII_FIXTURE_CHARS);
        println!("Counted fuzz file in {:?}", start.elapsed());
    }
}