// html5gum/reader.rs

use std::cmp::min;
use std::convert::Infallible;
use std::fmt::Debug;
use std::fs::File;
use std::io::{self, Read};
6
/// An object that provides characters to the tokenizer.
///
/// See [`crate::Tokenizer::new`] for more information.
pub trait Reader {
    /// The error returned by this reader.
    type Error: std::error::Error;

    /// Return a new byte from the input stream, or `None` once the input is exhausted.
    ///
    /// The input stream does **not** have to be preprocessed in any way, it can contain standalone
    /// surrogates and have inconsistent newlines.
    fn read_byte(&mut self) -> Result<Option<u8>, Self::Error>;

    /// Attempt to read an entire string at once, either case-insensitively or not.
    ///
    /// `case_sensitive=false` means that characters of the input stream should be compared while
    /// ignoring ASCII-casing.
    ///
    /// It can be assumed that this function is never called with a string that contains `\r` or
    /// `\n`.
    ///
    /// If the next characters are equal to `s`, this function consumes the respective characters
    /// from the input stream and returns `true`. If not, it does nothing and returns `false`.
    fn try_read_string(&mut self, s: &[u8], case_sensitive: bool) -> Result<bool, Self::Error>;

    /// Read an arbitrary amount of characters up until and including the next character that
    /// matches an array entry in `needle`.
    ///
    /// Return either:
    ///
    /// 1. A chunk of consumed characters that does not contain any characters from `needle`. The
    ///    chunk can be arbitrarily large or small.
    /// 2. If the next character is included in `needle`, a string with just that character and
    ///    nothing else.
    /// 3. `None` if the input is exhausted.
    ///
    /// In other words, case 1 means "we didn't find the needle yet, but here's some read data",
    /// while case 2 means "we have found the needle".
    ///
    /// The default implementation simply reads one byte with [`Reader::read_byte`], stores it in
    /// `char_buf` and returns it, ignoring the needle entirely. It is recommended to manually
    /// implement `read_until` if there is any sort of in-memory buffer where some sort of
    /// efficient string search (see `memchr` or `jetscii` crate) can be run on.
    ///
    /// The return value is usually borrowed from underlying buffers. If that's not possible, a
    /// small buffer is provided as `char_buf` to put a single character into.
    ///
    /// # Example
    ///
    /// Here is how [`StringReader`] behaves:
    ///
    /// ```rust
    /// use html5gum::{Reader, Readable};
    ///
    /// let mut reader = "hello world".to_reader();
    /// let mut eof = false;
    /// let mut chunks = Vec::new();
    /// while !eof {
    ///     let mut char_buf = [0; 4];
    ///     let xs = reader.read_until(&[b' ', b'r'], &mut char_buf).unwrap();
    ///     if let Some(xs) = xs {
    ///         chunks.push(std::str::from_utf8(xs).unwrap().to_owned());
    ///     } else {
    ///         eof = true;
    ///     }
    /// }
    ///
    /// assert_eq!(chunks, &["hello", " ", "wo", "r", "ld"]);
    /// ```
    ///
    /// The inefficient default implementation produces:
    ///
    /// ```text
    /// ["h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"]
    /// ```
    #[inline(always)]
    fn read_until<'b>(
        &'b mut self,
        needle: &[u8],
        char_buf: &'b mut [u8; 4],
    ) -> Result<Option<&'b [u8]>, Self::Error> {
        // The fallback hands out one byte at a time, so the needle never matters here.
        let _ = needle;

        match self.read_byte()? {
            Some(byte) => {
                char_buf[0] = byte;
                Ok(Some(&char_buf[..1]))
            }
            None => Ok(None),
        }
    }
}
96
97/// An object that can be converted into a [`crate::Reader`].
98///
99/// For example, any utf8-string can be converted into a `StringReader`, such that
100/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::new("foo"))` work.
101pub trait Readable<'a> {
102    /// The reader type to which this type should be converted.
103    type Reader: Reader + 'a;
104
105    /// Convert self to some sort of reader.
106    fn to_reader(self) -> Self::Reader;
107}
108
109impl<'a, R: 'a + Reader> Readable<'a> for R {
110    type Reader = Self;
111
112    fn to_reader(self) -> Self::Reader {
113        self
114    }
115}
116
/// A [`Reader`] over an in-memory byte slice. Used by the tokenizer to read HTML from strings.
///
/// Example:
///
/// ```rust
/// use std::fmt::Write;
/// use html5gum::{Tokenizer, Token};
///
/// let html = "<title   >hello world</title>";
/// let mut new_html = String::new();
///
/// for Ok(token) in Tokenizer::new(html) {
///     match token {
///         Token::StartTag(tag) => {
///             write!(new_html, "<{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         Token::String(hello_world) => {
///             write!(new_html, "{}", String::from_utf8_lossy(&hello_world)).unwrap();
///         }
///         Token::EndTag(tag) => {
///             write!(new_html, "</{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         _ => panic!("unexpected input"),
///     }
/// }
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct StringReader<'a> {
    /// The part of the input that has not been consumed yet.
    input: &'a [u8],
}
150
151impl<'a> StringReader<'a> {
152    fn new(input: &'a [u8]) -> Self {
153        StringReader { input }
154    }
155}
156
157impl<'a> Reader for StringReader<'a> {
158    type Error = Infallible;
159
160    #[inline(always)]
161    fn read_byte(&mut self) -> Result<Option<u8>, Self::Error> {
162        if self.input.is_empty() {
163            Ok(None)
164        } else {
165            let rv = self.input[0];
166            self.input = &self.input[1..];
167            Ok(Some(rv))
168        }
169    }
170
171    #[inline(always)]
172    fn read_until<'b>(
173        &'b mut self,
174        needle: &[u8],
175        _: &'b mut [u8; 4],
176    ) -> Result<Option<&'b [u8]>, Self::Error> {
177        if self.input.is_empty() {
178            return Ok(None);
179        }
180
181        if let Some(needle_pos) = fast_find(needle, self.input) {
182            if needle_pos == 0 {
183                let (rv, new_input) = self.input.split_at(1);
184                self.input = new_input;
185                Ok(Some(rv))
186            } else {
187                let (rv, new_input) = self.input.split_at(needle_pos);
188                self.input = new_input;
189                Ok(Some(rv))
190            }
191        } else {
192            let rv = self.input;
193            self.input = b"";
194            Ok(Some(rv))
195        }
196    }
197
198    #[inline(always)]
199    fn try_read_string(&mut self, s1: &[u8], case_sensitive: bool) -> Result<bool, Self::Error> {
200        // we do not need to call validate_char here because `s` hopefully does not contain invalid
201        // characters
202        if let Some(s2) = self.input.get(..s1.len()) {
203            if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
204                self.input = &self.input[s1.len()..];
205                return Ok(true);
206            }
207        }
208
209        Ok(false)
210    }
211}
212
213impl<'a> Readable<'a> for &'a str {
214    type Reader = StringReader<'a>;
215
216    fn to_reader(self) -> Self::Reader {
217        StringReader::new(self.as_bytes())
218    }
219}
220
221impl<'a> Readable<'a> for &'a String {
222    type Reader = StringReader<'a>;
223
224    fn to_reader(self) -> Self::Reader {
225        StringReader::new(self.as_bytes())
226    }
227}
228
229impl<'a> Readable<'a> for &'a Vec<u8> {
230    type Reader = StringReader<'a>;
231
232    fn to_reader(self) -> Self::Reader {
233        StringReader::new(self.as_slice())
234    }
235}
236
237impl<'a> Readable<'a> for &'a [u8] {
238    type Reader = StringReader<'a>;
239
240    fn to_reader(self) -> Self::Reader {
241        StringReader::new(self)
242    }
243}
244
/// A [`IoReader`] can be used to construct a tokenizer from any type that implements
/// `std::io::Read`.
///
/// Because of trait impl conflicts, `IoReader` needs to be explicitly constructed. The exception
/// to that is `File`, which can be directly passed to `Tokenizer::new`.
///
/// When passing `Read`-types into html5gum, no I/O buffering is required. html5gum maintains its
/// own read-buffer (16kb, heap-allocated) such that it can be accessed directly. Put more simply,
/// it's wasteful to wrap your `File` in a `std::io::BufReader` before passing it to html5gum.
///
/// Example:
///
/// ```rust
/// use std::fmt::Write;
/// use html5gum::{Token, IoReader, Tokenizer};
///
/// let tokenizer = Tokenizer::new(IoReader::new("<title>hello world</title>".as_bytes()));
/// // more realistically: Tokenizer::new(File::open("index.html")?)
/// // long-form: Tokenizer::new(IoReader::new(File::open("index.html")?))
///
/// let mut new_html = String::new();
///
/// for token in tokenizer {
///     let token = token.unwrap();
///
///     match token {
///         Token::StartTag(tag) => {
///             write!(new_html, "<{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         Token::String(hello_world) => {
///             write!(new_html, "{}", String::from_utf8_lossy(&hello_world)).unwrap();
///         }
///         Token::EndTag(tag) => {
///             write!(new_html, "</{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         _ => panic!("unexpected input"),
///     }
///
/// }
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct IoReader<R: Read, Buffer: AsRef<[u8]> + AsMut<[u8]> = Box<[u8]>> {
    /// Backing storage for bytes pulled from `reader`.
    buf: Buffer,
    /// Index into `buf` of the next byte to hand out.
    read_cursor: usize,
    /// Index into `buf` one past the last byte received from `reader`.
    write_cursor: usize,
    /// The underlying byte source.
    reader: R,
}
294
295impl<R: Read> IoReader<R> {
296    /// Construct a new `BufReadReader` from any type that implements `Read`.
297    pub fn new(reader: R) -> Self {
298        Self::new_with_buffer_size::<16384>(reader)
299    }
300
301    /// Construct a new `BufReadReader` with a specific internal buffer size.
302    ///
303    /// `new` defaults to a heap-allocated buffer of size 16kB.
304    pub fn new_with_buffer_size<const BUF_SIZE: usize>(reader: R) -> Self {
305        Self::new_with_buffer_impl(reader, Box::new([0; BUF_SIZE]))
306    }
307}
308
309impl<'a, R: Read> IoReader<R, &'a mut [u8]> {
310    /// Instantiate `IoReader` with a custom kind of buffer.
311    ///
312    /// Buffers do not need to be zero-initialized.
313    pub fn new_with_buffer(reader: R, buf: &'a mut [u8]) -> Self {
314        Self::new_with_buffer_impl(reader, buf)
315    }
316}
317
318impl<R: Read, Buffer: AsRef<[u8]> + AsMut<[u8]>> IoReader<R, Buffer> {
319    // new_with_buffer_impl is not exposed because we cannot use any kind of AsMut. It has to be
320    // one where we can be sure that the size of the buffer does not change with repeated calls to
321    // `as_mut()`. There are complex solutions to this sort of thing, but for now it seems simpler
322    // to allow either Box<[u8; _]> or &mut [u8], and nothing else.
323    //
324    // See discussion at https://users.rust-lang.org/t/cowmut-or-borrowed-owned-mutable-temp-buffers/96595
325    fn new_with_buffer_impl(reader: R, buf: Buffer) -> Self {
326        IoReader {
327            buf,
328            read_cursor: 0,
329            write_cursor: 0,
330            reader,
331        }
332    }
333
334    /// Ensure that the buffer contains at leaast `min_read_len` bytes to read.
335    ///
336    /// Shift all to-be-read buffer contents between `self.read_cursor` and `self.write_cursor` to
337    /// the beginning of the buffer, and read extra bytes if necessary.
338    #[inline(always)]
339    fn prepare_buf(&mut self, min_read_len: usize) -> Result<(), io::Error> {
340        let mut readable_len = self.write_cursor - self.read_cursor;
341        debug_assert!(min_read_len <= self.buf.as_mut().len());
342        debug_assert!(readable_len <= self.buf.as_mut().len());
343        if readable_len < min_read_len {
344            let mut raw_buf = &mut self.buf.as_mut()[..];
345            raw_buf.copy_within(self.read_cursor..self.write_cursor, 0);
346            raw_buf = &mut raw_buf[readable_len..];
347            while readable_len < min_read_len {
348                let n = self.reader.read(raw_buf)?;
349                if n == 0 {
350                    break;
351                }
352                readable_len += n;
353                raw_buf = &mut raw_buf[n..];
354            }
355            self.write_cursor = readable_len;
356            self.read_cursor = 0;
357        }
358        Ok(())
359    }
360}
361
362impl<R: Read, Buffer: AsRef<[u8]> + AsMut<[u8]>> Reader for IoReader<R, Buffer> {
363    type Error = io::Error;
364
365    #[inline(always)]
366    fn read_byte(&mut self) -> Result<Option<u8>, Self::Error> {
367        self.prepare_buf(1)?;
368        if self.read_cursor == self.write_cursor {
369            return Ok(None);
370        }
371        let rv = self.buf.as_mut().get(self.read_cursor).copied();
372        if rv.is_some() {
373            self.read_cursor += 1;
374        }
375        Ok(rv)
376    }
377
378    #[inline(always)]
379    fn try_read_string(&mut self, s1: &[u8], case_sensitive: bool) -> Result<bool, Self::Error> {
380        debug_assert!(!s1.contains(&b'\r'));
381        debug_assert!(!s1.contains(&b'\n'));
382
383        self.prepare_buf(s1.len())?;
384        let s2 = &self.buf.as_mut()
385            [self.read_cursor..min(self.read_cursor + s1.len(), self.write_cursor)];
386        if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
387            self.read_cursor += s1.len();
388            Ok(true)
389        } else {
390            Ok(false)
391        }
392    }
393
394    #[inline(always)]
395    fn read_until<'b>(
396        &'b mut self,
397        needle: &[u8],
398        _: &'b mut [u8; 4],
399    ) -> Result<Option<&'b [u8]>, Self::Error> {
400        self.prepare_buf(4)?;
401        let buf = &self.buf.as_ref()[self.read_cursor..self.write_cursor];
402        if buf.is_empty() {
403            Ok(None)
404        } else if let Some(needle_pos) = fast_find(needle, buf) {
405            if needle_pos == 0 {
406                self.read_cursor += 1;
407                Ok(Some(&buf[..1]))
408            } else {
409                self.read_cursor += needle_pos;
410                Ok(Some(&buf[..needle_pos]))
411            }
412        } else {
413            self.read_cursor += buf.len();
414            Ok(Some(buf))
415        }
416    }
417}
418
419impl<'a> Readable<'a> for File {
420    type Reader = IoReader<File>;
421
422    fn to_reader(self) -> Self::Reader {
423        IoReader::new(self)
424    }
425}
426
/// Return the index of the first byte in `haystack` that is equal to any byte in `needle`, or
/// `None` if there is no such byte.
///
/// With the `jetscii` feature enabled the search is delegated to `jetscii::Bytes`, which supports
/// at most 16 needle bytes; otherwise a plain linear scan is used.
#[inline]
fn fast_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
    #[cfg(feature = "jetscii")]
    {
        debug_assert!(needle.len() <= 16);
        let mut needle_arr = [0; 16];
        needle_arr[..needle.len()].copy_from_slice(needle);
        #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
        jetscii::Bytes::new(needle_arr, needle.len() as i32, |b| needle.contains(&b)).find(haystack)
    }

    #[cfg(not(feature = "jetscii"))]
    haystack.iter().position(|b| needle.contains(b))
}