1use std::str;
2
3use derive_more::{Deref, DerefMut};
4
5use crate::Result;
6use crate::error::Error;
7
/// Largest `n` that may be passed to `Read::peek_n`; the implementations in this
/// file check it with `debug_assert!`.
pub(crate) const MAX_PEEK_N: usize = 7;

/// Length in bytes of the fixed buffer held by `StreamRead`.
pub(crate) const DEFAULT_BUFFER_SIZE: usize = 512;
12
/// Returns the encoded length, in bytes, of the whitespace character that
/// `bytes` starts with, or `0` when it does not start with one.
///
/// The input is inspected as UTF-8. Recognised characters, grouped by their
/// encoded length:
///
/// * 1 byte: U+0009..=U+000D, U+0020 and U+001C..=U+001F
/// * 2 bytes: U+0085 and U+00A0
/// * 3 bytes: U+1680, U+2000..=U+200A, U+2028, U+2029, U+202F, U+205F,
///   U+3000 and U+FEFF
///
/// An empty slice, or a slice holding only a prefix of a multi-byte sequence,
/// yields `0`.
#[inline]
pub fn leading_whitespace_bytes(bytes: &[u8]) -> usize {
    match bytes {
        // ASCII: tab, line feed, vertical tab, form feed, carriage return,
        // space, and the four information separators.
        [b'\t'..=b'\r' | b' ' | 0x1C..=0x1F, ..] => 1,

        // U+0085 (next line) and U+00A0 (no-break space).
        [0xC2, 0x85 | 0xA0, ..] => 2,

        // U+1680.
        [0xE1, 0x9A, 0x80, ..]
        // U+2000..=U+200A, U+2028, U+2029 and U+202F.
        | [0xE2, 0x80, 0x80..=0x8A | 0xA8 | 0xA9 | 0xAF, ..]
        // U+205F.
        | [0xE2, 0x81, 0x9F, ..]
        // U+3000.
        | [0xE3, 0x80, 0x80, ..]
        // U+FEFF.
        | [0xEF, 0xBB, 0xBF, ..] => 3,

        _ => 0,
    }
}
66
67fn parse_escaped_char<'de, R: Read<'de>>(reader: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
68 let ch = reader.next()?;
69 match ch {
70 b'"' => scratch.push(b'"'),
71 b'\\' => scratch.push(b'\\'),
72 b'/' => scratch.push(b'/'),
73 b'b' => scratch.push(b'\x08'),
74 b'f' => scratch.push(b'\x0c'),
75 b'n' => scratch.push(b'\n'),
76 b'r' => scratch.push(b'\r'),
77 b't' => scratch.push(b'\t'),
78 b'u' => parse_escaped_unicode(reader, scratch)?,
79 _ => return Err(Error::InvalidEscape),
80 }
81 Ok(())
82}
83
84fn parse_escaped_unicode<'de, R: Read<'de>>(reader: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
117 fn parse_hex16<'de, R: Read<'de>>(reader: &mut R) -> Result<u16> {
119 let mut n: u16 = 0;
120 for _ in 0..4 {
121 let b = reader.next()?;
122 n = match b {
123 b'0'..=b'9' => (n << 4) | (b - b'0') as u16,
124 b'a'..=b'f' => (n << 4) | (10 + b - b'a') as u16,
125 b'A'..=b'F' => (n << 4) | (10 + b - b'A') as u16,
126 _ => return Err(Error::InvalidEscape),
127 };
128 }
129 Ok(n)
130 }
131
132 let mut n = parse_hex16(reader)? as u32;
134
135 if (0xD800..=0xDBFF).contains(&n) {
137 if reader.next()? != b'\\' || reader.next()? != b'u' {
139 return Err(Error::InvalidEscape);
140 }
141 let n2 = parse_hex16(reader)? as u32;
142 if !(0xDC00..=0xDFFF).contains(&n2) {
143 return Err(Error::InvalidEscape);
144 }
145 n = 0x10000 + (((n - 0xD800) << 10) | (n2 - 0xDC00));
147 }
148
149 if let Some(ch) = char::from_u32(n) {
151 let mut buf = [0u8; 4];
152 scratch.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
153 Ok(())
154 } else {
155 Err(Error::InvalidEscape)
156 }
157}
158
/// A line/column location in the input, as returned by `Read::position`.
///
/// The numbering origin is chosen by each `Read` implementation.
#[derive(Debug, Clone, Copy)]
pub struct Position {
    /// Line number.
    pub line: usize,
    /// Column within the line.
    pub column: usize,
}
164
/// A shared reference to a `T` that records which of two lifetimes it is tied
/// to.
///
/// `Read::parse_str` returns this so callers can tell a zero-copy borrow from
/// data that had to be assembled in the scratch buffer.
pub enum Reference<'b, 'c, T>
where
    T: ?Sized + 'static,
{
    /// Reference valid for `'b` (for `parse_str`, the input itself).
    Borrowed(&'b T),
    /// Reference valid for `'c` (for `parse_str`, the scratch buffer).
    Copied(&'c T),
}

/// Either variant dereferences to the wrapped `T`.
impl<'b, 'c, T> std::ops::Deref for Reference<'b, 'c, T>
where
    T: ?Sized + 'static,
{
    type Target = T;

    fn deref(&self) -> &Self::Target {
        match self {
            Self::Borrowed(from_input) => *from_input,
            Self::Copied(from_scratch) => *from_scratch,
        }
    }
}
186
187pub trait Read<'de> {
188 fn position(&self) -> Position;
189
190 fn peek_n(&mut self, n: usize) -> Result<&[u8]>;
191
192 #[inline]
193 fn peek(&mut self) -> Result<u8> {
194 let chars = self.peek_n(1)?;
195 Ok(chars[0])
196 }
197
198 #[inline]
199 fn peek2(&mut self) -> Result<(u8, u8)> {
200 let chars = self.peek_n(2)?;
201 Ok((chars[0], chars[1]))
202 }
203
204 fn next(&mut self) -> Result<u8>;
205
206 #[inline]
207 fn discard(&mut self, n: usize) -> Result<()> {
208 for _ in 0..n {
209 self.next()?;
210 }
211 Ok(())
212 }
213
214 fn parse_str<'s, F>(
215 &'s mut self,
216 escape: bool,
217 scratch: &'s mut Vec<u8>,
218 delimiter: F,
219 ) -> Result<Reference<'de, 's, str>>
220 where
221 F: Fn(&mut Self) -> Result<bool>;
222
223 #[inline]
224 fn peek_whitespace(&mut self) -> Result<Option<usize>> {
225 let n = match self.peek_n(3) {
226 Ok(bytes) => leading_whitespace_bytes(bytes),
227 Err(Error::Eof) => match self.peek_n(2) {
228 Ok(bytes) => leading_whitespace_bytes(bytes),
229 Err(Error::Eof) => match self.peek_n(1) {
230 Ok(bytes) => leading_whitespace_bytes(bytes),
231 Err(err) => {
232 return Err(err);
233 }
234 },
235 Err(err) => return Err(err),
236 },
237 Err(err) => return Err(err),
238 };
239 if n > 0 { Ok(Some(n)) } else { Ok(None) }
240 }
241
242 #[inline]
243 fn starts_with_whitespace(&mut self) -> Result<bool> {
244 self.peek_whitespace().map(|n| n.is_some())
245 }
246
247 #[inline]
248 fn peek_horizontal_whitespace(&mut self) -> Result<Option<usize>> {
249 if self.peek()? != b'\n' {
250 self.peek_whitespace()
251 } else {
252 Ok(None)
253 }
254 }
255
256 #[inline]
257 fn starts_with_horizontal_whitespace(&mut self) -> Result<bool> {
258 self.peek_horizontal_whitespace().map(|n| n.is_some())
259 }
260}
261
262pub struct StreamRead<R: std::io::Read> {
263 inner: R,
264 buffer: [u8; DEFAULT_BUFFER_SIZE],
265 head: usize,
266 tail: usize,
267 eof: bool,
268 line: usize,
269 col: usize,
270}
271
272impl<R: std::io::Read> StreamRead<R> {
273 pub fn new(reader: R) -> Self {
274 StreamRead {
275 inner: reader,
276 buffer: [0u8; _],
277 head: 0,
278 tail: 0,
279 eof: false,
280 line: 0,
281 col: 0,
282 }
283 }
284
285 fn fill_buf(&mut self) -> Result<()> {
286 if self.eof {
287 return Err(Error::Eof);
288 }
289
290 if self.tail == self.buffer.len() {
292 return Ok(());
293 }
294
295 let empty_buf = &mut self.buffer[self.tail..];
296 let n = self.inner.read(empty_buf)?;
297 if n == 0 {
298 self.eof = true;
299 }
300 self.tail += n;
301 Ok(())
302 }
303
304 #[inline]
305 fn available_data_len(&self) -> usize {
306 self.tail - self.head
307 }
308}
309
310impl<'de, R: std::io::Read> Read<'de> for StreamRead<R> {
311 fn position(&self) -> Position {
312 Position {
313 line: self.line,
314 column: self.col,
315 }
316 }
317
318 #[inline]
319 fn peek_n(&mut self, n: usize) -> Result<&[u8]> {
320 debug_assert!(n > 0 && n <= MAX_PEEK_N);
321
322 if self.available_data_len() < n && !self.eof {
323 if self.tail == self.buffer.len() && self.head > 0 {
325 let len = self.tail - self.head;
326 self.buffer.copy_within(self.head..self.tail, 0);
327 self.head = 0;
328 self.tail = len;
329 }
330 self.fill_buf()?;
331 }
332 if self.available_data_len() < n {
333 Err(Error::Eof)
334 } else {
335 Ok(&self.buffer[self.head..self.head + n])
336 }
337 }
338
339 #[inline]
340 fn next(&mut self) -> Result<u8> {
341 if self.available_data_len() == 0 && !self.eof {
342 self.fill_buf()?;
343 }
344 let byte = self.buffer[self.head];
345 if byte == b'\n' {
346 self.line += 1;
347 } else {
348 self.col += 1;
349 }
350 self.head += 1;
351 if self.head == self.tail {
352 self.head = 0;
353 self.tail = 0;
354 }
355 Ok(byte)
356 }
357
358 #[inline]
359 fn parse_str<'s, F>(
360 &'s mut self,
361 escape: bool,
362 scratch: &'s mut Vec<u8>,
363 delimiter: F,
364 ) -> Result<Reference<'de, 's, str>>
365 where
366 F: Fn(&mut Self) -> Result<bool>,
367 {
368 loop {
369 if !delimiter(self)? {
370 match self.next()? {
371 b'\\' if escape => {
372 parse_escaped_char(self, scratch)?;
373 }
374 ch => {
375 scratch.push(ch);
376 }
377 }
378 } else {
379 break;
380 }
381 }
382 str::from_utf8(scratch)
383 .map_err(|_| Error::InvalidUtf8)
384 .map(Reference::Copied)
385 }
386}
387
// Shared body of the `parse_str_bytes` methods of the slice-backed readers.
//
// Arguments:
// - `$self`: the reader; it must expose `index` and `slice` and be accepted
//   by `parse_escaped_char`.
// - `$escape`: when true, a backslash starts an escape sequence that is
//   decoded into `$scratch`.
// - `$scratch`: byte buffer that receives decoded content.
// - `$delimiter`: callable invoked with the reader before each byte;
//   `Ok(true)` ends the scan, an error aborts it.
// - `$result`: callable that converts the final bytes into the returned value.
//
// The expansion evaluates to `Reference::Borrowed` over the input when
// `$scratch` is empty after the scan, and to `Reference::Copied` over
// `$scratch` otherwise.
macro_rules! parse_str_bytes_impl {
    ($self:expr, $escape:expr, $scratch:expr, $delimiter:expr, $result:expr) => {{
        // Start of the run of input bytes not yet copied into the scratch buffer.
        let mut start = $self.index;
        loop {
            if !$delimiter($self)? {
                // Running out of input ends the scan without an error.
                if $self.index == $self.slice.len() {
                    break;
                }
                match $self.slice[$self.index] {
                    b'\\' if $escape => {
                        // Flush the literal run before the backslash, skip the
                        // backslash, decode the escape into the scratch buffer,
                        // then start a new run after it.
                        $scratch.extend_from_slice(&$self.slice[start..$self.index]);
                        $self.index += 1;
                        parse_escaped_char($self, $scratch)?;
                        start = $self.index;
                    }
                    _ => {
                        $self.index += 1;
                    }
                }
            } else {
                break;
            }
        }
        if $scratch.is_empty() {
            // Nothing was copied: hand out the input bytes directly.
            let borrowed = &$self.slice[start..$self.index];
            $result(borrowed).map(Reference::Borrowed)
        } else {
            // Append the trailing literal run so the scratch buffer holds the
            // whole string.
            $scratch.extend_from_slice(&$self.slice[start..$self.index]);
            $result($scratch).map(Reference::Copied)
        }
    }};
}
420
421pub struct SliceRead<'de> {
422 slice: &'de [u8],
423 index: usize,
424}
425
426impl<'de> SliceRead<'de> {
427 pub fn new(slice: &'de [u8]) -> Self {
428 SliceRead { slice, index: 0 }
429 }
430
431 fn position_of_index(&self, i: usize) -> Position {
432 let start_of_line = match memchr::memrchr(b'\n', &self.slice[..i]) {
433 Some(position) => position + 1,
434 None => 0,
435 };
436 Position {
437 line: 1 + memchr::memchr_iter(b'\n', &self.slice[..start_of_line]).count(),
438 column: i - start_of_line,
439 }
440 }
441
442 #[inline]
443 fn available_data_len(&self) -> usize {
444 self.slice.len() - self.index
445 }
446
447 pub(crate) fn rest(&self) -> &[u8] {
448 &self.slice[self.index..]
449 }
450
451 #[inline]
452 fn parse_str_bytes<'s, E, T, R>(
453 &'s mut self,
454 escape: bool,
455 scratch: &'s mut Vec<u8>,
456 delimiter: E,
457 result: R,
458 ) -> Result<Reference<'de, 's, T>>
459 where
460 T: ?Sized + 's,
461 E: Fn(&mut Self) -> Result<bool>,
462 R: for<'f> FnOnce(&'f [u8]) -> Result<&'f T>,
463 {
464 parse_str_bytes_impl!(self, escape, scratch, delimiter, result)
465 }
466}
467
468impl<'de> Read<'de> for SliceRead<'de> {
469 fn position(&self) -> Position {
470 self.position_of_index(self.index)
471 }
472
473 #[inline]
474 fn peek_n(&mut self, n: usize) -> Result<&[u8]> {
475 debug_assert!(n > 0 && n <= MAX_PEEK_N);
476 if self.available_data_len() < n {
477 Err(Error::Eof)
478 } else {
479 Ok(&self.slice[self.index..self.index + n])
480 }
481 }
482
483 #[inline]
484 fn next(&mut self) -> Result<u8> {
485 if self.index == self.slice.len() {
486 return Err(Error::Eof);
487 }
488 let byte = self.slice[self.index];
489 self.index += 1;
490 Ok(byte)
491 }
492
493 fn discard(&mut self, n: usize) -> Result<()> {
494 if self.available_data_len() < n {
495 Err(Error::Eof)
496 } else {
497 self.index += n;
498 Ok(())
499 }
500 }
501
502 #[inline]
503 fn parse_str<'s, F>(
504 &'s mut self,
505 escape: bool,
506 scratch: &'s mut Vec<u8>,
507 end: F,
508 ) -> Result<Reference<'de, 's, str>>
509 where
510 F: Fn(&mut Self) -> Result<bool>,
511 {
512 self.parse_str_bytes(escape, scratch, end, |bytes| {
513 str::from_utf8(bytes).map_err(|_| Error::InvalidUtf8)
514 })
515 }
516}
517
518#[derive(Deref, DerefMut)]
519pub struct StrRead<'de> {
520 delegate: SliceRead<'de>,
521}
522
523impl<'de> StrRead<'de> {
524 pub fn new(s: &'de str) -> Self {
525 Self {
526 delegate: SliceRead::new(s.as_bytes()),
527 }
528 }
529
530 pub fn rest(&self) -> Result<&str> {
531 str::from_utf8(self.delegate.rest()).map_err(|_| Error::InvalidUtf8)
532 }
533
534 #[inline]
535 fn parse_str_bytes<'s, E, T, R>(
536 &'s mut self,
537 no_escape: bool,
538 scratch: &'s mut Vec<u8>,
539 delimiter: E,
540 result: R,
541 ) -> Result<Reference<'de, 's, T>>
542 where
543 T: ?Sized + 's,
544 E: Fn(&mut Self) -> Result<bool>,
545 R: for<'f> FnOnce(&'f [u8]) -> Result<&'f T>,
546 {
547 parse_str_bytes_impl!(self, no_escape, scratch, delimiter, result)
548 }
549}
550
551impl<'de> Read<'de> for StrRead<'de> {
552 fn position(&self) -> Position {
553 self.delegate.position()
554 }
555
556 #[inline]
557 fn peek_n(&mut self, n: usize) -> Result<&[u8]> {
558 self.delegate.peek_n(n)
559 }
560
561 #[inline]
562 fn next(&mut self) -> Result<u8> {
563 self.delegate.next()
564 }
565
566 #[inline]
567 fn parse_str<'s, F>(
568 &'s mut self,
569 no_escape: bool,
570 scratch: &'s mut Vec<u8>,
571 end: F,
572 ) -> Result<Reference<'de, 's, str>>
573 where
574 F: Fn(&mut Self) -> Result<bool>,
575 {
576 self.parse_str_bytes(no_escape, scratch, end, |bytes| {
577 Ok(unsafe { str::from_utf8_unchecked(bytes) })
578 })
579 }
580}
581
#[cfg(test)]
mod tests {
    use crate::Result;
    use crate::parser::read::{Read, StreamRead, leading_whitespace_bytes};
    use rstest::rstest;

    /// Peeking leaves the input untouched; `discard` moves the window forward.
    #[test]
    fn test_stream_peek() -> Result<()> {
        let mut read = StreamRead::new("hello world".as_bytes());

        assert_eq!(read.peek()?, b'h');
        assert_eq!(read.peek2()?, (b'h', b'e'));
        assert_eq!(read.peek_n(3)?, b"hel");

        read.discard(3)?;

        assert_eq!(read.peek()?, b'l');
        assert_eq!(read.peek2()?, (b'l', b'o'));
        assert_eq!(read.peek_n(3)?, b"lo ");
        Ok(())
    }

    /// Each case pairs an input with the expected length of its leading
    /// whitespace character (`0` when there is none).
    #[rstest]
    // Empty input.
    #[case(&[] as &[u8], 0)]
    // Single-byte whitespace.
    #[case(b"\txyz", 1)]
    #[case(b"\nabc", 1)]
    #[case(&[0x0B, b'a', b'b'], 1)]
    #[case(&[0x0C, b'a', b'b'], 1)]
    #[case(b"\rHELLO", 1)]
    #[case(b" world", 1)]
    #[case(&[0x1C, b'X', b'Y'], 1)]
    #[case(&[0x1F, b'Z'], 1)]
    // Two-byte whitespace.
    #[case(&[0xC2, 0x85, b'a', b'b'], 2)]
    #[case(&[0xC2, 0xA0, b'X'], 2)]
    // Three-byte whitespace.
    #[case(&[0xE1, 0x9A, 0x80, b'!'], 3)]
    #[case(&[0xE2, 0x80, 0x80, b'a'], 3)]
    #[case(&[0xE2, 0x80, 0x87, b'b'], 3)]
    #[case(&[0xE2, 0x80, 0x8A, b'c'], 3)]
    #[case(&[0xE2, 0x80, 0xA8, b'x'], 3)]
    #[case(&[0xE2, 0x80, 0xA9, b'y'], 3)]
    #[case(&[0xE2, 0x80, 0xAF, b'Z'], 3)]
    #[case(&[0xE2, 0x81, 0x9F, b'M'], 3)]
    #[case(&[0xE3, 0x80, 0x80, b'A'], 3)]
    #[case(&[0xEF, 0xBB, 0xBF, b'h'], 3)]
    // Not whitespace, including truncated multi-byte prefixes.
    #[case(b"Hello", 0)]
    #[case(&[0xE6, 0x97, 0xA5, b'X'], 0)]
    #[case(&[0xC2], 0)]
    #[case(&[0xE2, 0x80], 0)]
    fn test_leading_whitespace_bytes(#[case] bytes: &[u8], #[case] expected: usize) {
        let actual = leading_whitespace_bytes(bytes);
        assert_eq!(actual, expected);
    }
}