json_codec_wasm/
utf8.rs

1// Translation of Bjoern Hoehrmann's "Flexible and Economical UTF-8 Decoder".
2// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
3//
4// License
5//
6// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
7//
8// Permission is hereby granted, free of charge, to any person obtaining
9// a copy of this software and associated documentation files
10// (the "Software"), to deal in the Software without restriction, including
11// without limitation the rights to use, copy, modify, merge, publish,
12// distribute, sublicense, and/or sell copies of the Software, and to
13// permit persons to whom the Software is furnished to do so, subject
14// to the following conditions:
15//
16// The above copyright notice and this permission notice shall be included
17// in all copies or substantial portions of the Software.
18//
19// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26//
27
28use std::char::from_u32_unchecked;
29use std::error::Error;
30use std::fmt;
31use std::io::{self, Read};
32
// Combined lookup table of the decoding automaton (256 + 108 = 364 entries).
// `decode` reads entry `byte` to obtain the character class of a byte and
// entry `256 + state + class` to obtain the follow-up state. States are
// multiples of 12 (0, 12, ..., 96) and classes range from 0 to 11, so the
// sum of both selects one cell of the transition table.
const UTF8D: [u8; 364] = [
    // The first part of the table maps bytes to character classes in order
    // to reduce the size of the transition table and to create bitmasks.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8,
    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    // Row 0 is the accepting state; row 12 (the rejecting state) only
    // ever leads back to itself.
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
];
53
/// Automaton state meaning "a complete code point has been decoded".
/// It is also the state a decoder starts in before any byte was consumed.
pub const UTF8_ACCEPT: u32 = 0;
/// Automaton state meaning "the bytes consumed so far are not valid UTF-8".
/// Once entered, the transition table never leaves this state again.
pub const UTF8_REJECT: u32 = 12;
56
57#[inline]
58pub fn decode(state: u32, byte: u32, codep: &mut u32) -> u32 {
59    let typ = UTF8D[byte as usize] as u32;
60
61    *codep = if state != UTF8_ACCEPT {
62        (byte & 0x3f) | (*codep << 6)
63    } else {
64        (0xff >> typ) & byte
65    };
66
67    let ix = 256 + state + typ;
68    UTF8D[ix as usize] as u32
69}
70
71pub struct Chars<R> {
72    reader: R,
73    state: u32,
74    codep: u32,
75}
76
77impl<R: Read> Chars<R> {
78    pub fn new(r: R) -> Chars<R> {
79        Chars {
80            reader: r,
81            state: UTF8_ACCEPT,
82            codep: 0,
83        }
84    }
85}
86
87impl<R: Read> Iterator for Chars<R> {
88    type Item = Result<char, ReadError>;
89
90    fn next(&mut self) -> Option<Result<char, ReadError>> {
91        loop {
92            match read_byte(&mut self.reader) {
93                Some(Ok(b)) => {
94                    self.state = decode(self.state, b as u32, &mut self.codep);
95                    match self.state {
96                        UTF8_ACCEPT => unsafe { return Some(Ok(from_u32_unchecked(self.codep))) },
97                        UTF8_REJECT => return Some(Err(ReadError::InvalidUtf8)),
98                        _ => {}
99                    }
100                }
101                Some(Err(e)) => return Some(Err(e.into())),
102                None => return None,
103            }
104        }
105    }
106}
107
/// Reads a single byte from `r`.
///
/// Returns `None` when the reader reports end of input (a zero-length read),
/// `Some(Ok(byte))` on success and `Some(Err(_))` when the read fails.
/// Reads that fail with `ErrorKind::Interrupted` are retried.
fn read_byte<R: Read>(r: &mut R) -> Option<io::Result<u8>> {
    let mut buf = [0];
    loop {
        match r.read(&mut buf) {
            Ok(0) => return None,
            Ok(_) => return Some(Ok(buf[0])),
            // Interrupted reads are transient: go around and try again.
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
            Err(e) => return Some(Err(e)),
        }
    }
}
124
/// Error produced while decoding UTF-8 text from a reader.
#[derive(Debug)]
pub enum ReadError {
    /// The input bytes are not valid UTF-8.
    InvalidUtf8,
    /// The underlying reader reported an I/O error.
    Io(io::Error),
}

impl fmt::Display for ReadError {
    /// Writes a short human-readable message; for `Io` the message of the
    /// wrapped error is appended.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        match *self {
            ReadError::InvalidUtf8 => write!(f, "invalid utf-8 encoding"),
            ReadError::Io(ref e) => write!(f, "i/o: {}", e),
        }
    }
}

impl Error for ReadError {
    // `description` is deprecated in favour of `Display`, but it is kept so
    // that existing callers continue to receive the same static messages.
    #[allow(deprecated)]
    fn description(&self) -> &str {
        match *self {
            ReadError::InvalidUtf8 => "invalid utf-8 encoding",
            ReadError::Io(_) => "i/o error",
        }
    }

    /// Returns the wrapped `io::Error` for the `Io` variant, `None` otherwise.
    ///
    /// Implementing `source` (rather than only the deprecated `cause`) makes
    /// the underlying error visible to code that walks the error chain; the
    /// trait's default `cause` forwards to this method, so `cause()` callers
    /// keep working.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match *self {
            ReadError::Io(ref e) => Some(e),
            ReadError::InvalidUtf8 => None,
        }
    }
}

impl From<io::Error> for ReadError {
    /// Wraps an I/O error, which lets `?` and `.into()` convert it.
    fn from(e: io::Error) -> ReadError {
        ReadError::Io(e)
    }
}