json/
utf8.rs

1// Translation of Bjoern Hoehrmann's "Flexible and Economical UTF-8 Decoder".
2// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
3//
4// License
5//
6// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
7//
8// Permission is hereby granted, free of charge, to any person obtaining
9// a copy of this software and associated documentation files
10// (the "Software"), to deal in the Software without restriction, including
11// without limitation the rights to use, copy, modify, merge, publish,
12// distribute, sublicense, and/or sell copies of the Software, and to
13// permit persons to whom the Software is furnished to do so, subject
14// to the following conditions:
15//
16// The above copyright notice and this permission notice shall be included
17// in all copies or substantial portions of the Software.
18//
19// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26//
27
28use std::char::from_u32_unchecked;
29use std::error::Error;
30use std::fmt;
31use std::io::{self, Read};
32
// Lookup table for the decoder automaton: 256 byte-class entries followed by
// 108 transition entries (9 states x 12 character classes).
const UTF8D: [u8; 364] = [
    // The first part of the table maps each byte value to a character class.
    // This reduces the size of the transition table, and `decode` also uses
    // the class of a lead byte to build the bitmask for its payload bits.
    // Each line below covers 32 consecutive byte values, starting at 0x00.
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    // States are stored pre-multiplied by 12 (the number of classes), so the
    // entry for (state, class) sits at offset 256 + state + class. Each full
    // line below holds the rows of two consecutive states.
     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
];
53
/// Automaton state meaning "at a character boundary": the decoder starts
/// here, and `decode` returns it once a complete character has been read.
pub const UTF8_ACCEPT: u32 = 0;
/// Automaton state meaning "invalid input". Every transition out of this
/// state leads back to it, so it persists until the caller resets the state.
pub const UTF8_REJECT: u32 = 12;
56
57#[inline]
58pub fn decode(state: u32, byte: u32, codep: &mut u32) -> u32 {
59    let typ = UTF8D[byte as usize] as u32;
60
61    *codep =
62        if state != UTF8_ACCEPT {
63            (byte & 0x3f) | (*codep << 6)
64        } else {
65            (0xff >> typ) & byte
66        };
67
68    let ix = 256 + state + typ;
69    UTF8D[ix as usize] as u32
70}
71
/// Adapter that decodes the bytes of a reader into `char`s one at a time.
pub struct Chars<R> {
    /// Byte source being decoded.
    reader: R,
    /// Current state of the decoding automaton.
    state: u32,
    /// Code point assembled so far for the character in progress.
    codep: u32,
}
77
78impl<R: Read> Chars<R> {
79    pub fn new(r: R) -> Chars<R> {
80        Chars {
81            reader: r,
82            state:  UTF8_ACCEPT,
83            codep:  0
84        }
85    }
86}
87
88impl<R: Read> Iterator for Chars<R> {
89    type Item = Result<char, ReadError>;
90
91    fn next(&mut self) -> Option<Result<char, ReadError>> {
92        loop {
93            match read_byte(&mut self.reader) {
94                Some(Ok(b)) => {
95                    self.state = decode(self.state, b as u32, &mut self.codep);
96                    match self.state {
97                        UTF8_ACCEPT => unsafe { return Some(Ok(from_u32_unchecked(self.codep))) },
98                        UTF8_REJECT => return Some(Err(ReadError::InvalidUtf8)),
99                        _           => {}
100                    }
101                }
102                Some(Err(e)) => return Some(Err(e.into())),
103                None         => return None
104            }
105        }
106    }
107}
108
/// Reads a single byte from `r`.
///
/// Returns `None` when the reader reports end of input (a read of zero
/// bytes), `Some(Ok(byte))` on success and `Some(Err(_))` on failure.
/// Reads that fail with `ErrorKind::Interrupted` are retried transparently.
fn read_byte<R: Read>(r: &mut R) -> Option<io::Result<u8>> {
    let mut buf = [0u8; 1];
    loop {
        return match r.read(&mut buf) {
            Ok(0) => None,
            Ok(_) => Some(Ok(buf[0])),
            // An interrupted read is not a real failure; try again.
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => Some(Err(e)),
        };
    }
}
124
/// Errors produced while decoding characters from a reader.
#[derive(Debug)]
pub enum ReadError {
    /// The input bytes do not form valid UTF-8.
    InvalidUtf8,
    /// The underlying reader returned an I/O error.
    Io(io::Error),
}
130
131impl fmt::Display for ReadError {
132    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
133        match *self {
134            ReadError::InvalidUtf8 => write!(f, "invalid utf-8 encoding"),
135            ReadError::Io(ref e)   => write!(f, "i/o: {}", e)
136        }
137    }
138}
139
140impl Error for ReadError {
141    fn description(&self) -> &str {
142        match *self {
143            ReadError::InvalidUtf8 => "invalid utf-8 encoding",
144            ReadError::Io(_)       => "i/o error"
145        }
146
147    }
148
149    fn cause(&self) -> Option<&Error> {
150        match *self {
151            ReadError::Io(ref e) => Some(e),
152            _                    => None
153        }
154    }
155}
156
157impl From<io::Error> for ReadError {
158    fn from(e: io::Error) -> ReadError {
159        ReadError::Io(e)
160    }
161}
162