html5gum/reader.rs
1use std::cmp::min;
2use std::convert::Infallible;
3use std::fmt::Debug;
4use std::fs::File;
5use std::io::{self, Read};
6
/// A source of input bytes for the tokenizer.
///
/// See [`crate::Tokenizer::new`] for more information.
pub trait Reader {
    /// The error type produced when reading fails.
    type Error: std::error::Error;

    /// Fetch the next byte of the input stream, or `None` once the input is exhausted.
    ///
    /// The input stream does **not** have to be preprocessed in any way, it can contain standalone
    /// surrogates and have inconsistent newlines.
    fn read_byte(&mut self) -> Result<Option<u8>, Self::Error>;

    /// Try to consume the whole string `s` in one go, comparing either exactly or while ignoring
    /// ASCII case.
    ///
    /// With `case_sensitive=false`, bytes of the input stream are compared to `s` without regard
    /// to ASCII-casing.
    ///
    /// Implementations may assume that `s` never contains `\r` or `\n`.
    ///
    /// When the upcoming input equals `s`, those bytes are consumed and `true` is returned.
    /// Otherwise nothing is consumed and `false` is returned.
    fn try_read_string(&mut self, s: &[u8], case_sensitive: bool) -> Result<bool, Self::Error>;

    /// Consume some amount of input, stopping at (and including) the next byte that is listed in
    /// `needle`.
    ///
    /// The returned slice is one of:
    ///
    /// 1. A chunk of consumed bytes, none of which occur in `needle`. The chunk may be of any
    ///    size.
    /// 2. If the very next byte occurs in `needle`, a slice holding exactly that one byte.
    ///
    /// Put differently, case 1 says "no needle yet, but here is some data that was read", and
    /// case 2 says "the needle has been found". `None` signals the end of the input.
    ///
    /// The default implementation reads a single byte through [`Reader::read_byte`] and returns
    /// it as a one-byte chunk without ever inspecting `needle`. Readers that have an in-memory
    /// buffer should override `read_until` so that an efficient search (see the `memchr` or
    /// `jetscii` crates) can run over that buffer.
    ///
    /// The returned slice is normally borrowed from the reader's own buffers. When that is not
    /// possible, `char_buf` offers a small scratch area to store a single character in.
    ///
    /// # Example
    ///
    /// Here is how [`StringReader`] behaves:
    ///
    /// ```rust
    /// use html5gum::{Reader, Readable};
    ///
    /// let mut reader = "hello world".to_reader();
    /// let mut eof = false;
    /// let mut chunks = Vec::new();
    /// while !eof {
    ///     let mut char_buf = [0; 4];
    ///     let xs = reader.read_until(&[b' ', b'r'], &mut char_buf).unwrap();
    ///     if let Some(xs) = xs {
    ///         chunks.push(std::str::from_utf8(xs).unwrap().to_owned());
    ///     } else {
    ///         eof = true;
    ///     }
    /// }
    ///
    /// assert_eq!(chunks, &["hello", " ", "wo", "r", "ld"]);
    /// ```
    ///
    /// The inefficient default implementation produces:
    ///
    /// ```text
    /// ["h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"]
    /// ```
    #[inline(always)]
    fn read_until<'b>(
        &'b mut self,
        needle: &[u8],
        char_buf: &'b mut [u8; 4],
    ) -> Result<Option<&'b [u8]>, Self::Error> {
        // The byte-at-a-time fallback has nothing to search, so the needle is unused.
        let _ = needle;

        let byte = match self.read_byte()? {
            Some(byte) => byte,
            None => return Ok(None),
        };

        // Hand the byte back through the caller-provided scratch buffer.
        char_buf[0] = byte;
        Ok(Some(&char_buf[..1]))
    }
}
96
97/// An object that can be converted into a [`crate::Reader`].
98///
99/// For example, any utf8-string can be converted into a `StringReader`, such that
100/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::new("foo"))` work.
101pub trait Readable<'a> {
102 /// The reader type to which this type should be converted.
103 type Reader: Reader + 'a;
104
105 /// Convert self to some sort of reader.
106 fn to_reader(self) -> Self::Reader;
107}
108
109impl<'a, R: 'a + Reader> Readable<'a> for R {
110 type Reader = Self;
111
112 fn to_reader(self) -> Self::Reader {
113 self
114 }
115}
116
/// A reader over an in-memory byte slice. The tokenizer uses it to read HTML from strings.
///
/// Example:
///
/// ```rust
/// use std::fmt::Write;
/// use html5gum::{Tokenizer, Token};
///
/// let html = "<title   >hello world</title>";
/// let mut new_html = String::new();
///
/// for Ok(token) in Tokenizer::new(html) {
///     match token {
///         Token::StartTag(tag) => {
///             write!(new_html, "<{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         Token::String(hello_world) => {
///             write!(new_html, "{}", String::from_utf8_lossy(&hello_world)).unwrap();
///         }
///         Token::EndTag(tag) => {
///             write!(new_html, "</{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         _ => panic!("unexpected input"),
///     }
/// }
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct StringReader<'a> {
    // The not-yet-consumed remainder of the input; it shrinks from the front as bytes are read.
    input: &'a [u8],
}

impl<'a> StringReader<'a> {
    /// Wrap `input` so that reading starts at its first byte.
    fn new(input: &'a [u8]) -> Self {
        Self { input }
    }
}
156
157impl<'a> Reader for StringReader<'a> {
158 type Error = Infallible;
159
160 #[inline(always)]
161 fn read_byte(&mut self) -> Result<Option<u8>, Self::Error> {
162 if self.input.is_empty() {
163 Ok(None)
164 } else {
165 let rv = self.input[0];
166 self.input = &self.input[1..];
167 Ok(Some(rv))
168 }
169 }
170
171 #[inline(always)]
172 fn read_until<'b>(
173 &'b mut self,
174 needle: &[u8],
175 _: &'b mut [u8; 4],
176 ) -> Result<Option<&'b [u8]>, Self::Error> {
177 if self.input.is_empty() {
178 return Ok(None);
179 }
180
181 if let Some(needle_pos) = fast_find(needle, self.input) {
182 if needle_pos == 0 {
183 let (rv, new_input) = self.input.split_at(1);
184 self.input = new_input;
185 Ok(Some(rv))
186 } else {
187 let (rv, new_input) = self.input.split_at(needle_pos);
188 self.input = new_input;
189 Ok(Some(rv))
190 }
191 } else {
192 let rv = self.input;
193 self.input = b"";
194 Ok(Some(rv))
195 }
196 }
197
198 #[inline(always)]
199 fn try_read_string(&mut self, s1: &[u8], case_sensitive: bool) -> Result<bool, Self::Error> {
200 // we do not need to call validate_char here because `s` hopefully does not contain invalid
201 // characters
202 if let Some(s2) = self.input.get(..s1.len()) {
203 if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
204 self.input = &self.input[s1.len()..];
205 return Ok(true);
206 }
207 }
208
209 Ok(false)
210 }
211}
212
213impl<'a> Readable<'a> for &'a str {
214 type Reader = StringReader<'a>;
215
216 fn to_reader(self) -> Self::Reader {
217 StringReader::new(self.as_bytes())
218 }
219}
220
221impl<'a> Readable<'a> for &'a String {
222 type Reader = StringReader<'a>;
223
224 fn to_reader(self) -> Self::Reader {
225 StringReader::new(self.as_bytes())
226 }
227}
228
229impl<'a> Readable<'a> for &'a Vec<u8> {
230 type Reader = StringReader<'a>;
231
232 fn to_reader(self) -> Self::Reader {
233 StringReader::new(self.as_slice())
234 }
235}
236
237impl<'a> Readable<'a> for &'a [u8] {
238 type Reader = StringReader<'a>;
239
240 fn to_reader(self) -> Self::Reader {
241 StringReader::new(self)
242 }
243}
244
/// A [`IoReader`] can be used to construct a tokenizer from any type that implements
/// `std::io::Read`.
///
/// Because of trait impl conflicts, `IoReader` needs to be explicitly constructed. The exception
/// to that is `File`, which can be directly passed to `Tokenizer::new`.
///
/// When passing `Read`-types into html5gum, no I/O buffering is required. html5gum maintains its
/// own read-buffer (16kb, heap-allocated) such that it can be accessed directly. Put more simply,
/// it's wasteful to wrap your `File` in a `std::io::BufReader` before passing it to html5gum.
///
/// Example:
///
/// ```rust
/// use std::fmt::Write;
/// use html5gum::{Token, IoReader, Tokenizer};
///
/// let tokenizer = Tokenizer::new(IoReader::new("<title>hello world</title>".as_bytes()));
/// // more realistically: Tokenizer::new(File::open("index.html")?)
/// // long-form: Tokenizer::new(IoReader::new(File::open("index.html")?))
///
/// let mut new_html = String::new();
///
/// for token in tokenizer {
///     let token = token.unwrap();
///
///     match token {
///         Token::StartTag(tag) => {
///             write!(new_html, "<{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         Token::String(hello_world) => {
///             write!(new_html, "{}", String::from_utf8_lossy(&hello_world)).unwrap();
///         }
///         Token::EndTag(tag) => {
///             write!(new_html, "</{}>", String::from_utf8_lossy(&tag.name)).unwrap();
///         }
///         _ => panic!("unexpected input"),
///     }
///
/// }
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct IoReader<R: Read, Buffer: AsRef<[u8]> + AsMut<[u8]> = Box<[u8]>> {
    // Backing storage; the bytes in `read_cursor..write_cursor` are read but not yet consumed.
    buf: Buffer,
    // Index of the next byte to hand out.
    read_cursor: usize,
    // Index one past the last byte obtained from `reader`.
    write_cursor: usize,
    // The underlying byte source.
    reader: R,
}

impl<R: Read> IoReader<R> {
    /// Construct a new `IoReader` from any type that implements `Read`.
    pub fn new(reader: R) -> Self {
        Self::new_with_buffer_size::<16384>(reader)
    }

    /// Construct a new `IoReader` with a specific internal buffer size.
    ///
    /// `new` defaults to a heap-allocated buffer of size 16kB.
    pub fn new_with_buffer_size<const BUF_SIZE: usize>(reader: R) -> Self {
        // Allocate directly on the heap: building a `[0; BUF_SIZE]` array first may place the
        // whole buffer on the stack before it is boxed, which is a problem for large sizes.
        Self::new_with_buffer_impl(reader, vec![0u8; BUF_SIZE].into_boxed_slice())
    }
}

impl<'a, R: Read> IoReader<R, &'a mut [u8]> {
    /// Instantiate `IoReader` with a custom kind of buffer.
    ///
    /// Buffers do not need to be zero-initialized.
    pub fn new_with_buffer(reader: R, buf: &'a mut [u8]) -> Self {
        Self::new_with_buffer_impl(reader, buf)
    }
}

impl<R: Read, Buffer: AsRef<[u8]> + AsMut<[u8]>> IoReader<R, Buffer> {
    // new_with_buffer_impl is not exposed because we cannot use any kind of AsMut. It has to be
    // one where we can be sure that the size of the buffer does not change with repeated calls to
    // `as_mut()`. There are complex solutions to this sort of thing, but for now it seems simpler
    // to allow either Box<[u8]> or &mut [u8], and nothing else.
    //
    // See discussion at https://users.rust-lang.org/t/cowmut-or-borrowed-owned-mutable-temp-buffers/96595
    fn new_with_buffer_impl(reader: R, buf: Buffer) -> Self {
        IoReader {
            buf,
            read_cursor: 0,
            write_cursor: 0,
            reader,
        }
    }

    /// Ensure that the buffer contains at least `min_read_len` bytes to read.
    ///
    /// If fewer bytes are currently readable, shift all to-be-read buffer contents between
    /// `self.read_cursor` and `self.write_cursor` to the beginning of the buffer, and read extra
    /// bytes until the requirement is met or the underlying reader reports end of input (a read
    /// of zero bytes). After end of input fewer than `min_read_len` bytes may be readable.
    ///
    /// # Errors
    ///
    /// Any error of the underlying reader is returned as-is. The cursors are kept in sync with
    /// the buffer contents at every step, so after an error the readable region still holds
    /// exactly the bytes obtained so far and the call can be repeated.
    #[inline(always)]
    fn prepare_buf(&mut self, min_read_len: usize) -> Result<(), io::Error> {
        let readable_len = self.write_cursor - self.read_cursor;
        debug_assert!(min_read_len <= self.buf.as_mut().len());
        debug_assert!(readable_len <= self.buf.as_mut().len());
        if readable_len >= min_read_len {
            return Ok(());
        }

        let raw_buf = self.buf.as_mut();
        raw_buf.copy_within(self.read_cursor..self.write_cursor, 0);
        // Record the new layout *before* the fallible reads below. Otherwise an early return on
        // error would leave the cursors pointing at the pre-compaction positions, whose bytes
        // may already have been overwritten.
        self.read_cursor = 0;
        self.write_cursor = readable_len;

        while self.write_cursor < min_read_len {
            let n = self.reader.read(&mut raw_buf[self.write_cursor..])?;
            if n == 0 {
                // End of input (or no free space left in the buffer).
                break;
            }
            // Account for the new bytes right away so they survive an error in a later read.
            self.write_cursor += n;
        }
        Ok(())
    }
}
361
362impl<R: Read, Buffer: AsRef<[u8]> + AsMut<[u8]>> Reader for IoReader<R, Buffer> {
363 type Error = io::Error;
364
365 #[inline(always)]
366 fn read_byte(&mut self) -> Result<Option<u8>, Self::Error> {
367 self.prepare_buf(1)?;
368 if self.read_cursor == self.write_cursor {
369 return Ok(None);
370 }
371 let rv = self.buf.as_mut().get(self.read_cursor).copied();
372 if rv.is_some() {
373 self.read_cursor += 1;
374 }
375 Ok(rv)
376 }
377
378 #[inline(always)]
379 fn try_read_string(&mut self, s1: &[u8], case_sensitive: bool) -> Result<bool, Self::Error> {
380 debug_assert!(!s1.contains(&b'\r'));
381 debug_assert!(!s1.contains(&b'\n'));
382
383 self.prepare_buf(s1.len())?;
384 let s2 = &self.buf.as_mut()
385 [self.read_cursor..min(self.read_cursor + s1.len(), self.write_cursor)];
386 if s1 == s2 || (!case_sensitive && s1.eq_ignore_ascii_case(s2)) {
387 self.read_cursor += s1.len();
388 Ok(true)
389 } else {
390 Ok(false)
391 }
392 }
393
394 #[inline(always)]
395 fn read_until<'b>(
396 &'b mut self,
397 needle: &[u8],
398 _: &'b mut [u8; 4],
399 ) -> Result<Option<&'b [u8]>, Self::Error> {
400 self.prepare_buf(4)?;
401 let buf = &self.buf.as_ref()[self.read_cursor..self.write_cursor];
402 if buf.is_empty() {
403 Ok(None)
404 } else if let Some(needle_pos) = fast_find(needle, buf) {
405 if needle_pos == 0 {
406 self.read_cursor += 1;
407 Ok(Some(&buf[..1]))
408 } else {
409 self.read_cursor += needle_pos;
410 Ok(Some(&buf[..needle_pos]))
411 }
412 } else {
413 self.read_cursor += buf.len();
414 Ok(Some(buf))
415 }
416 }
417}
418
419impl<'a> Readable<'a> for File {
420 type Reader = IoReader<File>;
421
422 fn to_reader(self) -> Self::Reader {
423 IoReader::new(self)
424 }
425}
426
/// Find the index of the first byte in `haystack` that is one of the bytes in `needle`.
///
/// Returns `None` when no byte of `haystack` occurs in `needle`. With the `jetscii` feature
/// enabled the search is delegated to `jetscii::Bytes`, which takes the needle as a 16-byte
/// array, so at most 16 needle bytes are supported there. Without the feature a plain linear
/// scan is used.
#[inline]
fn fast_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
    #[cfg(feature = "jetscii")]
    {
        debug_assert!(needle.len() <= 16);
        let mut needle_arr = [0; 16];
        needle_arr[..needle.len()].copy_from_slice(needle);
        #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
        jetscii::Bytes::new(needle_arr, needle.len() as i32, |b| needle.contains(&b)).find(haystack)
    }

    #[cfg(not(feature = "jetscii"))]
    haystack
        .iter()
        .position(|byte| needle.iter().any(|candidate| candidate == byte))
}