json_event_parser_witespace/
read.rs

1#![allow(dead_code)]
2
3use crate::event::JsonEvent;
4use anyhow::{anyhow, Result};
5use std::borrow::Cow;
6use std::collections::VecDeque;
7use std::io::{BufRead, Error, ErrorKind, Seek, SeekFrom};
8use std::str;
9
10#[allow(dead_code)]
11pub struct JsonReader<R> {
12    reader: R,
13    state_stack: Vec<JsonState>,
14    element_read: bool,
15    remain_events: VecDeque<JsonEvent<'static>>,
16    max_stack_size: Option<usize>,
17}
18
19impl<R: BufRead + Seek> JsonReader<R> {
20    pub fn from_reader(reader: R) -> Self {
21        Self {
22            reader,
23            state_stack: Vec::new(),
24            element_read: false,
25            max_stack_size: None,
26            remain_events: VecDeque::new(),
27        }
28    }
29
30    /// Maximal allowed number of nested object and array openings. Infinite by default.
31    pub fn max_stack_size(&mut self, size: usize) -> &mut Self {
32        self.max_stack_size = Some(size);
33        self
34    }
35
36    pub fn read_event<'a>(&mut self, buffer: &'a mut Vec<u8>) -> Result<JsonEvent<'a>> {
37        if let Some(ev) = self.remain_events.pop_front() {
38            return Ok(ev);
39        }
40
41        match self.lookup_front_segment_whitespaces(buffer)? {
42            SkipWhitespace::EmptyBuffer => {
43                if self.state_stack.is_empty() && self.element_read {
44                    Ok(JsonEvent::Eof)
45                } else {
46                    Err(anyhow!(Error::from(ErrorKind::UnexpectedEof)))
47                }
48            }
49            SkipWhitespace::Skip(whitespaces, _) => Ok(JsonEvent::WhiteSpace(whitespaces)),
50            SkipWhitespace::NoSkip(front) => match front {
51                b'{' => {
52                    self.reader.consume(1);
53                    self.check_stack_size()?;
54                    self.state_stack.push(JsonState::FirstObjectKey);
55                    Ok(JsonEvent::StartObject)
56                }
57                b'}' => {
58                    self.reader.consume(1);
59                    if matches!(
60                        self.state_stack.pop(),
61                        Some(JsonState::FirstObjectKey) | Some(JsonState::LastObjectKey)
62                    ) {
63                        self.read_after_value(JsonEvent::EndObject, buffer)
64                    } else {
65                        Err(anyhow!(Error::new(
66                            ErrorKind::InvalidData,
67                            "Closing a not opened object",
68                        )))
69                    }
70                }
71                b'[' => {
72                    self.reader.consume(1);
73                    self.check_stack_size()?;
74                    self.state_stack.push(JsonState::FirstArray);
75                    Ok(JsonEvent::StartArray)
76                }
77                b']' => {
78                    self.reader.consume(1);
79                    if matches!(
80                        self.state_stack.pop(),
81                        Some(JsonState::FirstArray) | Some(JsonState::LastArray)
82                    ) {
83                        self.read_after_value(JsonEvent::EndArray, buffer)
84                    } else {
85                        Err(anyhow!(Error::new(
86                            ErrorKind::InvalidData,
87                            "Closing a not opened array",
88                        )))
89                    }
90                }
91                b'"' => self.parse_string(buffer),
92                b't' => self.parse_constant::<4>("true", JsonEvent::Boolean(true), buffer),
93                b'f' => self.parse_constant::<5>("false", JsonEvent::Boolean(false), buffer),
94                b'n' => self.parse_constant::<4>("null", JsonEvent::Null, buffer),
95                b'-' | b'0'..=b'9' => self.parse_number(front, buffer),
96                c => {
97                    self.reader.consume(1);
98                    Err(anyhow!(Error::new(
99                        ErrorKind::InvalidData,
100                        format!("Unexpected char: {}", char::from(c)),
101                    )))
102                }
103            },
104        }
105    }
106
107    fn parse_string<'a>(&mut self, output: &'a mut Vec<u8>) -> Result<JsonEvent<'a>> {
108        output.clear();
109        self.reader.consume(1);
110
111        #[derive(Eq, PartialEq, Copy, Clone)]
112        #[allow(dead_code)]
113        enum StringState {
114            Default,
115            Escape,
116        }
117
118        let mut state = StringState::Default;
119        loop {
120            match state {
121                StringState::Default => {
122                    let buffer = match self.reader.fill_buf() {
123                        Ok(buf) => {
124                            if buf.is_empty() {
125                                return Err(anyhow!(Error::from(ErrorKind::UnexpectedEof)));
126                            } else {
127                                buf
128                            }
129                        }
130                        Err(e) => {
131                            if e.kind() == ErrorKind::Interrupted {
132                                continue;
133                            } else {
134                                return Err(anyhow!(e));
135                            }
136                        }
137                    };
138                    let mut i = 0;
139                    for c in buffer {
140                        i += 1;
141                        match *c {
142                            b'"' => {
143                                self.reader.consume(i);
144                                return self.read_after_value(
145                                    JsonEvent::String(Cow::Owned(
146                                        String::from_utf8(output.clone())
147                                            .map_err(|e| Error::new(ErrorKind::InvalidData, e))?,
148                                    )),
149                                    output,
150                                );
151                            }
152                            b'\\' => {
153                                state = StringState::Escape;
154                                break;
155                            }
156                            0..=0x1F => {
157                                self.reader.consume(i);
158                                return Err(anyhow!(Error::new(
159                                    ErrorKind::InvalidData,
160                                    "Control characters are not allowed in JSON",
161                                )));
162                            }
163                            c => output.push(c),
164                        }
165                    }
166                    self.reader.consume(i);
167                }
168                StringState::Escape => {
169                    let c = self.lookup_mandatory_front()?;
170                    self.reader.consume(1);
171                    match c {
172                        b'"' => {
173                            output.push(b'"');
174                        }
175                        b'\\' => {
176                            output.push(b'\\');
177                        }
178                        b'/' => {
179                            output.push(b'/');
180                        }
181                        b'b' => {
182                            output.push(8);
183                        }
184                        b'f' => {
185                            output.push(12);
186                        }
187                        b'n' => {
188                            output.push(b'\n');
189                        }
190                        b'r' => {
191                            output.push(b'\r');
192                        }
193                        b't' => {
194                            output.push(b'\t');
195                        }
196                        b'u' => {
197                            let mut buf = [0u8; 4];
198                            self.reader.read_exact(&mut buf)?;
199                            let code_point = read_hexa_char(&buf)?;
200                            if let Some(c) = char::from_u32(code_point) {
201                                output.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
202                            } else {
203                                let high_surrogate = code_point;
204                                let mut buf = [0u8; 6];
205                                self.reader.read_exact(&mut buf)?;
206                                if !buf.starts_with(b"\\u") {
207                                    return Err(anyhow!(Error::new(
208                                        ErrorKind::InvalidData,
209                                        format!(
210                                            "\\u{:X} is a surrogate should be followed by an other surrogate",
211                                            high_surrogate
212                                        )),
213                                    ));
214                                }
215                                let low_surrogate = read_hexa_char(&buf[2..])?;
216                                let code_point = 0x10000
217                                    + ((high_surrogate & 0x03FF) << 10)
218                                    + (low_surrogate & 0x03FF);
219                                if let Some(c) = char::from_u32(code_point) {
220                                    output.extend_from_slice(c.encode_utf8(&mut buf).as_bytes())
221                                } else {
222                                    return Err(anyhow!(Error::new(
223                                        ErrorKind::InvalidData,
224                                        format!(
225                                            "\\u{:X}\\u{:X} is an invalid surrogate pair",
226                                            high_surrogate, low_surrogate
227                                        ),
228                                    )));
229                                }
230                            }
231                        }
232                        _ => {
233                            return Err(anyhow!(Error::new(
234                                ErrorKind::InvalidData,
235                                "Invalid string escape",
236                            )));
237                        }
238                    }
239                    state = StringState::Default;
240                }
241            }
242        }
243    }
244
245    fn parse_constant<'a, const SIZE: usize>(
246        &mut self,
247        expected: &str,
248        value: JsonEvent<'a>,
249        buffer: &mut Vec<u8>,
250    ) -> Result<JsonEvent<'a>> {
251        debug_assert_eq!(expected.len(), SIZE);
252        let mut buf = [0u8; SIZE];
253        self.reader.read_exact(&mut buf)?;
254        if buf == expected.as_bytes() {
255            self.read_after_value(value, buffer)
256        } else {
257            Err(anyhow!(Error::new(
258                ErrorKind::InvalidData,
259                format!(
260                    "{} expected, found {}",
261                    expected,
262                    str::from_utf8(&buf).map_err(|e| Error::new(ErrorKind::InvalidData, e))?
263                ),
264            )))
265        }
266    }
267
268    fn parse_number<'a>(
269        &mut self,
270        first_byte: u8,
271        output: &'a mut Vec<u8>,
272    ) -> Result<JsonEvent<'a>> {
273        output.clear();
274        if first_byte == b'-' {
275            output.push(b'-');
276            self.reader.consume(1);
277        }
278        // integer starting with first bytes
279        // TODO: avoid too many fill_buf
280        let c = self.lookup_mandatory_front()?;
281        match c {
282            b'0' => {
283                output.push(b'0');
284                self.reader.consume(1);
285            }
286            b'1'..=b'9' => {
287                output.push(c);
288                self.reader.consume(1);
289                self.read_digits(output)?;
290            }
291            _ => {
292                return Err(anyhow!(Error::new(
293                    ErrorKind::InvalidData,
294                    "Invalid number"
295                )))
296            }
297        }
298
299        // Dot
300        if self.lookup_front()? == Some(b'.') {
301            output.push(b'.');
302            self.reader.consume(1);
303            self.read_char(|c| matches!(c, b'0'..=b'9'), output)?;
304            self.read_digits(output)?;
305        }
306
307        // Exp
308        if let Some(c) = self.lookup_front()? {
309            if c == b'e' || c == b'E' {
310                output.push(c);
311                self.reader.consume(1);
312                let c = self.lookup_mandatory_front()?;
313                match c {
314                    b'-' | b'+' => {
315                        output.push(c);
316                        self.reader.consume(1);
317                        self.read_char(|c| matches!(c, b'0'..=b'9'), output)?;
318                    }
319                    b'0'..=b'9' => {
320                        output.push(c);
321                        self.reader.consume(1);
322                    }
323                    _ => {
324                        return Err(anyhow!(Error::new(
325                            ErrorKind::InvalidData,
326                            format!("Invalid number. Found char {}", char::from(c)),
327                        )))
328                    }
329                }
330                self.read_digits(output)?;
331            }
332        }
333
334        self.read_after_value(
335            JsonEvent::Number(Cow::Owned(
336                String::from_utf8(output.clone())
337                    .map_err(|e| Error::new(ErrorKind::InvalidData, e))?,
338            )),
339            output,
340        )
341    }
342
343    fn read_char(&mut self, valid: impl Fn(u8) -> bool, output: &mut Vec<u8>) -> Result<()> {
344        let c = self.lookup_mandatory_front()?;
345        if valid(c) {
346            output.push(c);
347            self.reader.consume(1);
348            Ok(())
349        } else {
350            Err(anyhow!(Error::new(
351                ErrorKind::InvalidData,
352                format!("Invalid number. Found char {}", char::from(c)),
353            )))
354        }
355    }
356
357    fn read_digits(&mut self, output: &mut Vec<u8>) -> Result<()> {
358        while let Some(c) = self.lookup_front()? {
359            if matches!(c, b'0'..=b'9') {
360                output.push(c);
361                self.reader.consume(1);
362            } else {
363                break;
364            }
365        }
366        Ok(())
367    }
368
369    fn read_after_value<'a>(
370        &mut self,
371        value: JsonEvent<'a>,
372        buffer: &mut Vec<u8>,
373    ) -> Result<JsonEvent<'a>> {
374        let JsonReader {
375            reader,
376            remain_events,
377            ..
378        } = self;
379
380        type SkipMatchValue<'a> = (JsonEvent<'a>, bool, Option<JsonEvent<'static>>);
381        let mut skip_with_match_before =
382            |skip_whitespace,
383             match_func: Box<dyn FnOnce(Option<u8>) -> Result<SkipMatchValue<'a>>>| {
384                match skip_whitespace {
385                    SkipWhitespace::NoSkip(front) => {
386                        let (event, _, next) = match_func(Some(front))?;
387                        if let Some(next) = next {
388                            remain_events.push_back(next)
389                        }
390                        Ok(event)
391                    }
392                    SkipWhitespace::Skip(whitespace, front) => {
393                        let (event, is_before, next) = match_func(front)?;
394
395                        if is_before {
396                            remain_events.push_back(event.into_owned());
397                            Ok(JsonEvent::WhiteSpace(whitespace))
398                        } else {
399                            remain_events.push_back(JsonEvent::WhiteSpace(whitespace));
400                            if let Some(next) = next {
401                                remain_events.push_back(next)
402                            }
403                            Ok(event.into_owned())
404                        }
405                    }
406                    SkipWhitespace::EmptyBuffer => Err(anyhow!(Error::new(
407                        ErrorKind::UnexpectedEof,
408                        "Unexpected end of input while parsing JSON",
409                    ))),
410                }
411            };
412
413        match self.state_stack.pop() {
414            Some(JsonState::FirstObjectKey) | Some(JsonState::NextObjectKey) => {
415                skip_with_match_before(
416                    lookup_front_segment_whitespaces_impl(reader, buffer)?,
417                    Box::new(|front| {
418                        if front == Some(b':') {
419                            self.reader.consume(1);
420                            self.state_stack.push(JsonState::ObjectValue);
421                            if let JsonEvent::String(value) = value {
422                                Ok((JsonEvent::ObjectKey(value), true, None))
423                            } else {
424                                Err(anyhow!(Error::new(
425                                    ErrorKind::InvalidData,
426                                    "Object keys should strings",
427                                )))
428                            }
429                        } else {
430                            Err(anyhow!(Error::new(
431                                ErrorKind::InvalidData,
432                                "Object keys should be followed by ':'",
433                            )))
434                        }
435                    }),
436                )
437            }
438            Some(JsonState::ObjectValue) => skip_with_match_before(
439                lookup_front_segment_whitespaces_impl(reader, buffer)?,
440                Box::new(|front| match front {
441                    Some(b',') => {
442                        self.reader.consume(1);
443                        self.state_stack.push(JsonState::NextObjectKey);
444                        Ok((value, false, Some(JsonEvent::NextObjectValue)))
445                    }
446                    Some(b'}') => {
447                        self.state_stack.push(JsonState::LastObjectKey);
448                        Ok((value, false, None))
449                    }
450                    _ => Err(anyhow!(Error::new(
451                        ErrorKind::InvalidData,
452                        "Object values should be followed by ',' or '}'",
453                    ))),
454                }),
455            ),
456            Some(JsonState::FirstArray) | Some(JsonState::NextArray) => skip_with_match_before(
457                lookup_front_segment_whitespaces_impl(reader, buffer)?,
458                Box::new(|front| match front {
459                    Some(b',') => {
460                        self.reader.consume(1);
461                        self.state_stack.push(JsonState::NextArray);
462                        Ok((value, false, Some(JsonEvent::NextArrayValue)))
463                    }
464                    Some(b']') => {
465                        self.state_stack.push(JsonState::LastArray);
466                        Ok((value, false, None))
467                    }
468                    _ => Err(anyhow!(Error::new(
469                        ErrorKind::InvalidData,
470                        "Array values should be followed by ',' or ']'",
471                    ))),
472                }),
473            ),
474            None => {
475                if self.element_read {
476                    Err(anyhow!(Error::new(
477                        ErrorKind::InvalidData,
478                        "JSON trailing content"
479                    )))
480                } else {
481                    self.element_read = true;
482                    Ok(value)
483                }
484            }
485            Some(JsonState::LastObjectKey) => Err(anyhow!(Error::new(
486                ErrorKind::InvalidData,
487                "JSON object elements should be separated by commas",
488            ))),
489            Some(JsonState::LastArray) => Err(anyhow!(Error::new(
490                ErrorKind::InvalidData,
491                "JSON array elements should be separated by commas",
492            ))),
493        }
494    }
495
496    fn peek_front_skipping_whitespaces(&mut self) -> Result<Option<u8>> {
497        let mut back_pos = None;
498        loop {
499            match self.reader.fill_buf() {
500                Ok(buf) => {
501                    if buf.is_empty() {
502                        return Ok(None);
503                    }
504                    let skipped = skip_whitespaces(buf);
505                    if skipped == buf.len() {
506                        back_pos = Some(self.reader.stream_position()?);
507                        self.reader.consume(skipped);
508                    } else {
509                        let result = Some(buf[skipped]);
510                        if let Some(bp) = back_pos {
511                            self.reader.seek(SeekFrom::Start(bp))?;
512                        }
513                        return Ok(result);
514                    }
515                }
516                Err(error) => {
517                    if error.kind() != ErrorKind::Interrupted {
518                        return Err(anyhow!(error));
519                    }
520                }
521            }
522        }
523    }
524
525    fn lookup_front_segment_whitespaces(&mut self, output: &mut Vec<u8>) -> Result<SkipWhitespace> {
526        lookup_front_segment_whitespaces_impl(&mut self.reader, output)
527    }
528
529    fn lookup_mandatory_front(&mut self) -> Result<u8> {
530        if let Some(v) = self.lookup_front()? {
531            Ok(v)
532        } else {
533            Err(anyhow!(Error::from(ErrorKind::UnexpectedEof)))
534        }
535    }
536
537    fn lookup_front(&mut self) -> Result<Option<u8>> {
538        loop {
539            match self.reader.fill_buf() {
540                Ok(buf) => return Ok(if buf.is_empty() { None } else { Some(buf[0]) }),
541                Err(error) => {
542                    if error.kind() != ErrorKind::Interrupted {
543                        return Err(anyhow!(error));
544                    }
545                }
546            }
547        }
548    }
549
550    fn check_stack_size(&self) -> Result<()> {
551        if let Some(max_stack_size) = self.max_stack_size {
552            if self.state_stack.len() > max_stack_size {
553                Err(anyhow!(Error::new(
554                    ErrorKind::InvalidData,
555                    format!(
556                        "Max stack size of {} reached on an object opening",
557                        max_stack_size
558                    ),
559                )))
560            } else {
561                Ok(())
562            }
563        } else {
564            Ok(())
565        }
566    }
567}
568
/// Position of the parser inside one open array or object.
///
/// One value is kept on the reader's state stack per open container.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum JsonState {
    /// Pushed when `[` is read: no element has been read yet.
    FirstArray,
    /// Pushed when `,` follows an array element: another element is expected.
    NextArray,
    /// Pushed when `]` follows an array element: only the closing bracket may come.
    LastArray,
    /// Pushed when `{` is read: no key has been read yet.
    FirstObjectKey,
    /// Pushed when `,` follows an object value: another key is expected.
    NextObjectKey,
    /// Pushed when `}` follows an object value: only the closing brace may come.
    LastObjectKey,
    /// Pushed when `:` follows an object key: the value of that key is expected.
    ObjectValue,
}
580
/// Returns the number of leading JSON whitespace bytes (space, tab, line feed,
/// carriage return) in `buf`, i.e. the index of the first other byte, or
/// `buf.len()` when there is none.
#[allow(dead_code)]
fn skip_whitespaces(buf: &[u8]) -> usize {
    buf.iter()
        .position(|byte| !matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(buf.len())
}
590fn read_hexa_char(input: &[u8]) -> Result<u32> {
591    let mut value = 0;
592    for c in input.iter().copied() {
593        value = value * 16
594            + match c {
595                b'0'..=b'9' => u32::from(c) - u32::from(b'0'),
596                b'a'..=b'f' => u32::from(c) - u32::from(b'a') + 10,
597                b'A'..=b'F' => u32::from(c) - u32::from(b'A') + 10,
598                _ => {
599                    return Err(anyhow!(Error::new(
600                        ErrorKind::InvalidData,
601                        "Unexpected character in a unicode escape",
602                    )))
603                }
604            }
605    }
606    Ok(value)
607}
608
609fn lookup_front_segment_whitespaces_impl<R: BufRead + Seek>(
610    reader: &mut R,
611    output: &mut Vec<u8>,
612) -> Result<SkipWhitespace> {
613    output.clear();
614    loop {
615        match reader.fill_buf() {
616            Ok(buf) => {
617                if buf.is_empty() {
618                    return if output.is_empty() {
619                        Ok(SkipWhitespace::EmptyBuffer)
620                    } else {
621                        Ok(SkipWhitespace::Skip(
622                            String::from_utf8(output.clone())?,
623                            None,
624                        ))
625                    };
626                }
627                let skipped = skip_whitespaces(buf);
628                if skipped == buf.len() {
629                    output.extend_from_slice(buf);
630                    reader.consume(skipped);
631                } else {
632                    let c = buf[skipped];
633                    output.extend_from_slice(&buf[0..skipped]);
634                    reader.consume(skipped);
635
636                    return if output.is_empty() {
637                        Ok(SkipWhitespace::NoSkip(c))
638                    } else {
639                        Ok(SkipWhitespace::Skip(
640                            String::from_utf8(output.clone())?,
641                            Some(c),
642                        ))
643                    };
644                }
645            }
646            Err(error) => {
647                if error.kind() != ErrorKind::Interrupted {
648                    return Err(anyhow!(error));
649                }
650            }
651        }
652    }
653}
654
/// Outcome of skipping the whitespace at the front of the input.
enum SkipWhitespace {
    /// The input was exhausted before anything could be read.
    EmptyBuffer,
    /// No whitespace was found; holds the (unconsumed) byte at the front.
    NoSkip(u8),
    /// Whitespace was consumed; holds its text and the (unconsumed) byte that
    /// follows it, `None` when the input ends after the whitespace.
    Skip(String, Option<u8>),
}
660
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{BufReader, Cursor};

    /// Smoke test: a small document must be readable up to `Eof` without any
    /// error. Each event is printed with `dbg!`.
    #[test]
    fn it_works() {
        let json_str = String::from(
            r#"{
    "nadeko": "cute",
    "sumire": "cute",
    "number": 1234,
    "numbers": [1, 2, 3]
}"#,
        );

        let input = BufReader::new(Cursor::new(json_str.as_bytes()));
        let mut reader = JsonReader::from_reader(input);
        let mut buffer = Vec::new();
        // Pull events until the end-of-file marker shows up.
        while !matches!(
            dbg!(reader.read_event(&mut buffer).unwrap()),
            JsonEvent::Eof
        ) {}
    }
}