domain_core/master/
source.rs

1//! Character sources.
2//!
3//! This is here so we can read from things that aren’t ASCII or UTF-8.
4
5use std::{char, io};
6use std::io::Read;
7use std::fs::File;
8use std::path::Path;
9use failure::Fail;
10use super::scan::CharSource;
11
12
13//------------ str -----------------------------------------------------------
14
15impl<'a> CharSource for &'a str {
16    fn next(&mut self) -> Result<Option<char>, io::Error> {
17        let res = match self.chars().next() {
18            Some(ch) => ch,
19            None => return Ok(None),
20        };
21        *self = &self[res.len_utf8()..];
22        Ok(Some(res))
23    }
24}
25
26
27//------------ AsciiFile -----------------------------------------------------
28
29/// A file that contains only ASCII characters.
30///
31//  This isn’t built atop a BufReader because we can optimize for our
32//  strategy of reading from the buffer byte by byte.
33pub struct AsciiFile {
34    file: File,
35    buf: Option<(Box<[u8]>, usize, usize)>,
36}
37
38impl AsciiFile {
39    pub fn new(file: File) -> Self {
40        AsciiFile {
41            file,
42            buf: unsafe {
43                let mut buffer = Vec::with_capacity(CAP);
44                buffer.set_len(CAP);
45                Some((buffer.into_boxed_slice(), 0, 0))
46            }
47        }
48    }
49
50    /// Opens a file at the given path as an ASCII-only file.
51    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, io::Error> {
52        File::open(path).map(Self::new)
53    }
54}
55
56impl CharSource for AsciiFile {
57    fn next(&mut self) -> Result<Option<char>, io::Error> {
58        let err = if let Some((ref mut buf, ref mut len, ref mut pos))
59                                = self.buf {
60            if *pos < *len {
61                let res = buf[*pos];
62                if res.is_ascii() {
63                    *pos += 1;
64                    return Ok(Some(res as char))
65                }
66                Err(io::Error::new(
67                    io::ErrorKind::InvalidData, AsciiError(res).compat()
68                ))
69            }
70            else {
71                match self.file.read(buf) {
72                    Ok(0) => Ok(None),
73                    Ok(read_len) => {
74                        *len = read_len;
75                        let res = buf[0];
76                        if res.is_ascii() {
77                            *pos = 1;
78                            return Ok(Some(res as char))
79                        }
80                        Err(io::Error::new(
81                            io::ErrorKind::InvalidData,
82                            AsciiError(res).compat()
83                        ))
84                    }
85                    Err(err) => Err(err)
86                }
87            }
88        }
89        else {
90            return Ok(None);
91        };
92        self.buf = None;
93        err
94    }
95}
96
97
98//------------ AsciiError ----------------------------------------------------
99
100/// An error happened while reading an ASCII-only file.
101#[derive(Clone, Copy, Debug, Eq, Fail, PartialEq)]
102#[fail(display="invalid ASCII character '{}'", _0)]
103pub struct AsciiError(u8);
104
105
106//------------ Utf8File ------------------------------------------------------
107
108/// A file that contains UTF-8 encoded text.
109pub struct Utf8File(OctetFile);
110
111impl Utf8File {
112    pub fn new(file: File) -> Self {
113        Utf8File(OctetFile::new(file))
114    }
115
116    /// Opens a file at the given path as an ASCII-only file.
117    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, io::Error> {
118        File::open(path).map(Self::new)
119    }
120}
121
122impl CharSource for Utf8File {
123    fn next(&mut self) -> Result<Option<char>, io::Error> {
124        let first = match self.0.next()? {
125            Some(ch) => ch,
126            None => return Ok(None)
127        };
128        if first.is_ascii() { //first < 0x80  {
129            return Ok(Some(first as char))
130        }
131        let second = match self.0.next()? {
132            Some(ch) => ch,
133            None => {
134                return Err(io::Error::new(
135                    io::ErrorKind::UnexpectedEof, "unexpected EOF"
136                ))
137            }
138        };
139        if first < 0xC0 || second < 0x80 {
140            return Err(Utf8Error.into())
141        }
142        if first < 0xE0 {
143            return Ok(Some(unsafe {
144                char::from_u32_unchecked(
145                    (u32::from(first & 0x1F)) << 6 |
146                    u32::from(second & 0x3F)
147                )
148            }))
149        }
150        let third = match self.0.next()? {
151            Some(ch) => ch,
152            None => {
153                return Err(io::Error::new(
154                    io::ErrorKind::UnexpectedEof, "unexpected EOF"
155                ))
156            }
157        };
158        if third < 0x80 {
159            return Err(Utf8Error.into())
160        }
161        if first < 0xF0 {
162            return Ok(Some(unsafe {
163                char::from_u32_unchecked(
164                    (u32::from(first & 0x0F)) << 12 |
165                    (u32::from(second & 0x3F)) << 6 |
166                    u32::from(third & 0x3F)
167                )
168            }))
169        }
170        let fourth = match self.0.next()? {
171            Some(ch) => ch,
172            None => {
173                return Err(io::Error::new(
174                    io::ErrorKind::UnexpectedEof, "unexpected EOF"
175                ))
176            }
177        };
178        if first > 0xF7 || fourth < 0x80 {
179            return Err(Utf8Error.into())
180        }
181        Ok(Some(unsafe {
182            char::from_u32_unchecked(
183                (u32::from(first & 0x07)) << 18 |
184                (u32::from(second & 0x3F)) << 12 |
185                (u32::from(third & 0x3F)) << 6 |
186                u32::from(fourth & 0x3F)
187            )
188        }))
189    }
190}
191
192
193//------------ Utf8Error -----------------------------------------------------
194
195/// An error happened while reading an ASCII-only file.
196#[derive(Clone, Copy, Debug, Eq, Fail, PartialEq)]
197#[fail(display="invalid UTF-8 sequence")]
198pub struct Utf8Error;
199
200impl From<Utf8Error> for io::Error {
201    fn from(err: Utf8Error) -> Self {
202        io::Error::new(io::ErrorKind::Other, err.compat())
203    }
204}
205
206
207//------------ OctetFile -----------------------------------------------------
208
/// A file that is read as a plain sequence of octets.
//
//  This isn't built atop a BufReader because we can optimize for our
//  strategy of reading from the buffer byte by byte.
pub struct OctetFile {
    /// The file the octets are read from.
    file: File,

    /// The read buffer as `(data, len, pos)`: `data[..len]` holds the octets
    /// of the most recent read and `pos` is the index of the next octet to
    /// hand out. Set to `None` once end of file or an error was encountered.
    buf: Option<(Box<[u8]>, usize, usize)>,
}

/// The size of the read buffer in octets.
const CAP: usize = 8 * 1024;

impl OctetFile {
    /// Creates a new octet source atop an already open file.
    pub fn new(file: File) -> Self {
        OctetFile {
            file,
            // The buffer is zero-filled rather than left uninitialized:
            // handing uninitialized memory to `Read::read` as a `&mut [u8]`
            // is undefined behaviour, and a one-off fill of `CAP` octets is
            // negligible.
            buf: Some((vec![0u8; CAP].into_boxed_slice(), 0, 0)),
        }
    }

    /// Opens a file at the given path as a source of raw octets.
    ///
    /// # Errors
    ///
    /// Returns whatever error `File::open` produced for `path`.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, io::Error> {
        File::open(path).map(Self::new)
    }

    /// Returns the next octet of the file.
    ///
    /// Yields `Ok(None)` at end of file. Read errors are passed through
    /// unchanged. Once end of file or an error has been seen, the buffer is
    /// dropped and every later call returns `Ok(None)`.
    #[inline]
    fn next(&mut self) -> Result<Option<u8>, io::Error> {
        let res = if let Some((ref mut buf, ref mut len, ref mut pos))
                                = self.buf {
            // Fast path: there still is an unread octet in the buffer.
            if *pos < *len {
                let res = buf[*pos];
                *pos += 1;
                return Ok(Some(res))
            }
            // The buffer is used up, so refill it from the file. Every
            // octet value is acceptable here -- restricting the first octet
            // of a fresh buffer to ASCII would break multi-octet UTF-8
            // sequences that start at or straddle a buffer boundary.
            match self.file.read(buf) {
                Ok(0) => Ok(None),
                Ok(read_len) => {
                    *len = read_len;
                    *pos = 1;
                    return Ok(Some(buf[0]))
                }
                Err(err) => Err(err)
            }
        }
        else {
            return Ok(None);
        };
        // Only end of file and errors get here: drop the buffer so that
        // all subsequent calls report the end of the source.
        self.buf = None;
        res
    }
}