json_threat_protection/read/
mod.rs

//! Defines the [`Read`] trait and provides implementations for [`std::io::Read`], [`&str`], and slices of [`u8`].
2
3mod io;
4mod slice;
5mod str;
6mod utils;
7pub use io::IoRead;
8pub use slice::SliceRead;
9pub use str::StrRead;
10use thiserror::Error;
11
12use utils::{decode_hex_sequence, IS_HEX, NEED_ESCAPE};
13
/// Consumes one JSON number (`-? int frac? exp?`) from `$self` without computing its value.
///
/// The expansion contains `return` statements, so it must be used as the body
/// (or tail) of a function returning `Result<(), ReadError>`.
///
/// The caller is expected to have peeked a `-` or an ASCII digit before
/// expanding this macro; any other first byte is reported as
/// [`ReadError::Bug`].
///
/// # Errors
///
/// * [`ReadError::UnexpectedEndOfInput`] if the input is empty or ends right
///   after the minus sign.
/// * [`ReadError::NonNumericalCharacter`] if the minus sign is followed by a
///   byte that is not a digit.
/// * [`ReadError::LeadingZerosInNumber`] if a leading `0` is followed by
///   another digit.
/// * Whatever `parse_float!` / `parse_exponent!` return once a `.` or an
///   `e`/`E` is reached.
macro_rules! parse_number {
    ($self:ident) => {{
        // Optional sign. Remember whether it was present: once a `-` has been
        // consumed, the following byte comes straight from the input and was
        // never vetted by the caller.
        let negative = match $self.peek()? {
            Some(b'-') => {
                $self.discard();
                true
            }
            Some(b'0'..=b'9') => false,
            Some(_) => return Err(ReadError::Bug{
                msg: "macro_rules! parse_number: assume the first character is a number or a minus sign".to_string(),
                position: $self.position(),
            }),
            None => return Err(ReadError::UnexpectedEndOfInput($self.position())),
        };

        let first = match $self.next()? {
            Some(n @ b'0'..=b'9') => n,
            // Malformed input such as `-` or `-x` is a user error, not an
            // internal invariant violation.
            None if negative => return Err(ReadError::UnexpectedEndOfInput($self.position())),
            Some(_) if negative => return Err(ReadError::NonNumericalCharacter($self.position())),
            // Without a sign the peek above already saw a digit, so reaching
            // this arm means the reader misbehaved.
            _ => return Err(ReadError::Bug {
                msg: "macro_rules! parse_number: assume the first character is a number".to_string(),
                position: $self.position(),
            }),
        };

        // A single digit followed by the end of input is a complete number.
        let second = $self.peek()?;
        if second.is_none() {
            return Ok(());
        }

        // JSON forbids `01`, `-007`, ...; a lone `0` is only valid when it is
        // not followed by another digit.
        if first == b'0' && matches!(second, Some(b'0'..=b'9')) {
            return Err(ReadError::LeadingZerosInNumber($self.position()));
        }

        // Remaining integer digits; hand over to the fraction / exponent
        // parsers as soon as their introducer shows up.
        loop {
            match $self.peek()? {
                Some(b'0'..=b'9') => $self.discard(),
                Some(b'.') => return parse_float!($self),
                Some(b'e') | Some(b'E') => return parse_exponent!($self),
                _ => break,
            }
        }

        Ok(())
    }};
}
55
/// Consumes the fraction part of a JSON number: a `.`, at least one digit,
/// then any further digits, delegating to `parse_exponent!` when an `e`/`E`
/// follows.
///
/// The expansion contains `return` statements, so it must be used inside a
/// function returning `Result<(), ReadError>`. The next byte of `$self` must
/// be the `.`; anything else is reported as [`ReadError::Bug`].
macro_rules! parse_float {
    ($self:ident) => {{
        // The caller only gets here after peeking a period.
        match $self.next()? {
            Some(b'.') => (),
            _ => {
                return Err(ReadError::Bug {
                    msg: "macro_rules! parse_float: assume the first character is a period".to_string(),
                    position: $self.position(),
                })
            }
        }

        // `1.` is not a number: one digit after the period is mandatory.
        let after_period = $self.peek()?;
        match after_period {
            None => return Err(ReadError::UnexpectedEndOfInput($self.position())),
            Some(b'0'..=b'9') => $self.discard(),
            Some(_) => return Err(ReadError::NoNumberCharactersAfterFraction($self.position())),
        }

        // Remaining fraction digits, optionally followed by an exponent.
        while let Some(byte) = $self.peek()? {
            match byte {
                b'0'..=b'9' => $self.discard(),
                b'e' | b'E' => return parse_exponent!($self),
                _ => break,
            }
        }

        Ok(())
    }};
}
82
/// Consumes the exponent part of a JSON number: `e` or `E`, an optional sign,
/// and at least one digit.
///
/// The expansion contains `return` statements, so it must be used inside a
/// function returning `Result<(), ReadError>`. The next byte of `$self` must
/// be the exponent marker; anything else is reported as [`ReadError::Bug`].
macro_rules! parse_exponent {
    ($self:ident) => {{
        // The caller only gets here after peeking `e` or `E`.
        match $self.next()? {
            Some(b'e') | Some(b'E') => (),
            _ => {
                return Err(ReadError::Bug {
                    msg: "macro_rules! parse_exponent: assume the first character is an exponent"
                        .to_string(),
                    position: $self.position(),
                })
            }
        }

        // Optional sign.
        if matches!($self.peek()?, Some(b'-') | Some(b'+')) {
            $self.discard();
        }

        // With or without a sign, at least one digit has to follow.
        match $self.peek()? {
            None => return Err(ReadError::UnexpectedEndOfInput($self.position())),
            Some(b'0'..=b'9') => (),
            Some(_) => return Err(ReadError::NoNumberCharactersAfterExponent($self.position())),
        }

        // Swallow the whole run of exponent digits.
        while matches!($self.peek()?, Some(b'0'..=b'9')) {
            $self.discard();
        }

        Ok(())
    }};
}
116
/// Reads exactly four bytes from `$self` and evaluates to them as a `[u8; 4]`,
/// requiring each one to be flagged as a hexadecimal digit by `IS_HEX`.
///
/// The expansion contains `return` statements, so it must be used inside a
/// function whose error type is `ReadError`:
///
/// * [`ReadError::UnexpectedEndOfInput`] if the input ends before four bytes
///   were read;
/// * [`ReadError::NonHexCharacterInUnicodeEscape`] on the first byte that is
///   not a hex digit (that byte has already been consumed).
macro_rules! next4_hex {
    ($self:ident) => {{
        let mut digits = [0u8; 4];
        for slot in digits.iter_mut() {
            *slot = match $self.next()? {
                Some(byte) if IS_HEX[byte as usize] => byte,
                Some(_) => {
                    return Err(ReadError::NonHexCharacterInUnicodeEscape($self.position()))
                }
                None => return Err(ReadError::UnexpectedEndOfInput($self.position())),
            };
        }
        digits
    }};
}
137
138pub use utils::Position;
139
140#[derive(Debug, Error)]
141/// An error that can occur when reading characters.
142pub enum ReadError {
143    /// Unexpected end of input.
144    #[error("unexpected end of input ({0})")]
145    UnexpectedEndOfInput(Position),
146
147    /// I/O error.
148    #[error("I/O Error ({0})")]
149    IoError(std::io::Error, Position),
150
151    /// Non numerical character.
152    #[error("non numirical character ({0})")]
153    NonNumericalCharacter(Position),
154
155    /// Unclosed string.
156    #[error("unclosed string ({0})")]
157    UnclosedString(Position),
158
159    /// Invalid escape sequence.
160    #[error("invalid escape sequence ({0})")]
161    InvalidEscapeSequence(Position),
162
163    /// Control character in string.
164    #[error("control character in string ({0})")]
165    ControlCharacterInString(Position),
166
167    /// Non hex character in unicode escape sequence.
168    #[error("non hex character in unicode escape sequence ({0})")]
169    NonHexCharacterInUnicodeEscape(Position),
170
171    /// Leading zeros in number.
172    #[error("leading zeros in number ({0})")]
173    LeadingZerosInNumber(Position),
174
175    /// No number characters after fraction.
176    #[error("no number characters after fraction ({0})")]
177    NoNumberCharactersAfterFraction(Position),
178
179    /// No number characters after exponent.
180    #[error("no number characters after exponent ({0})")]
181    NoNumberCharactersAfterExponent(Position),
182
183    /// Running into unexpected state.
184    #[error("running into unexpected state, please report this issue to the maintainer, ({msg}) ({position})")]
185    Bug {
186        /// Diagnostic message.
187        msg: String,
188
189        /// The position where the bug happened.
190        position: Position,
191    },
192}
193
194/// A trait for reading characters from a source.
195///
196/// # Performance
197///
198/// All provided methods might not be the most efficient way to read characters
199/// as it might return the [`ReadError`], which basically an [`std::io::Error`].
200/// However, reading characters from in memory source should raise [`std::io::Error`],
201/// so you could implement your own [`Read`] trait and its provided methods
202/// for better performance, such as [`SliceRead`].
203pub trait Read {
204    /// Get the current position of the reader.
205    fn position(&self) -> Position;
206
207    /// Peek the next character without consuming it.
208    fn peek(&mut self) -> Result<Option<u8>, ReadError>;
209
210    /// Get the next character and consume it.
211    fn next(&mut self) -> Result<Option<u8>, ReadError>;
212
213    /// Discard the next character.
214    ///
215    /// # Panic
216    ///
217    /// This method panics if the next character is None.
218    fn discard(&mut self) {
219        self.next().unwrap();
220    }
221
222    /// Get the next 4 characters and consume them.
223    fn next4(&mut self) -> Result<[u8; 4], ReadError> {
224        let mut buf = [0; 4];
225        for i in 0..4 {
226            match self.next()? {
227                Some(ch) => buf[i] = ch,
228                None => return Err(ReadError::UnexpectedEndOfInput(self.position())),
229            }
230        }
231        Ok(buf)
232    }
233
234    /// Get the next 5 characters and consume them.
235    fn next5(&mut self) -> Result<[u8; 5], ReadError> {
236        let mut buf = [0; 5];
237        for i in 0..5 {
238            match self.next()? {
239                Some(ch) => buf[i] = ch,
240                None => return Err(ReadError::UnexpectedEndOfInput(self.position())),
241            }
242        }
243        Ok(buf)
244    }
245
246    /// Skip whitespace characters (`' '`, `'\t'`, `'\n'`, `'\r'`).
247    fn skip_whitespace(&mut self) -> Result<(), ReadError> {
248        loop {
249            match self.peek()? {
250                Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'\r') => {
251                    self.next()?;
252                }
253                _ => break,
254            }
255        }
256        Ok(())
257    }
258
259    /// Parse a number and allow arbitrary precision.
260    fn next_number(&mut self) -> Result<(), ReadError> {
261        parse_number!(self)
262    }
263
264    /// Parse a string, but not guaranteed to be correct UTF-8.
265    fn next_likely_string(&mut self, buf: &mut Vec<u8>) -> Result<(), ReadError> {
266        if self.next()? != Some(b'"') {
267            return Err(ReadError::Bug {
268                msg: "Read.next_likely_string: assume the first character is a double quote"
269                    .to_string(),
270                position: self.position(),
271            });
272        }
273
274        while let Some(byte) = self.next()? {
275            if !NEED_ESCAPE[byte as usize] {
276                buf.push(byte);
277                continue;
278            }
279
280            match byte {
281                b'"' => return Ok(()),
282                b'\\' => {
283                    let mut simple_escape = true;
284
285                    match self.next()? {
286                        Some(b'"') => buf.push(b'"'),
287                        Some(b'\\') => buf.push(b'\\'),
288                        Some(b'/') => buf.push(b'/'),
289                        Some(b'b') => buf.push(b'\x08'),
290                        Some(b'f') => buf.push(b'\x0C'),
291                        Some(b'n') => buf.push(b'\n'),
292                        Some(b'r') => buf.push(b'\r'),
293                        Some(b't') => buf.push(b'\t'),
294                        Some(b'u') => simple_escape = false,
295                        Some(_) => return Err(ReadError::InvalidEscapeSequence(self.position())),
296                        None => return Err(ReadError::UnexpectedEndOfInput(self.position())),
297                    };
298
299                    if simple_escape {
300                        continue;
301                    }
302
303                    let hex = decode_hex_sequence(&next4_hex!(self));
304                    let ch = match hex {
305                        _n @ 0xDC00..=0xDFFF => {
306                            return Err(ReadError::InvalidEscapeSequence(self.position()));
307                        }
308                        n @ 0xD800..=0xDBFF => {
309                            let high = n;
310                            if self.next()? != Some(b'\\') {
311                                return Err(ReadError::InvalidEscapeSequence(self.position()));
312                            }
313                            if self.next()? != Some(b'u') {
314                                return Err(ReadError::InvalidEscapeSequence(self.position()));
315                            }
316                            let low = decode_hex_sequence(&next4_hex!(self));
317                            if !matches!(low, 0xDC00..=0xDFFF) {
318                                return Err(ReadError::InvalidEscapeSequence(self.position()));
319                            }
320
321                            let high = ((high & 0x03FF) << 10) as u32;
322                            let low = (low & 0x03FF) as u32;
323                            let codepoint = 0x10000u32 + high + low;
324
325                            match std::char::from_u32(codepoint) {
326                                Some(ch) => ch,
327                                None => {
328                                    return Err(ReadError::Bug {
329                                        msg:
330                                            "Read.next_likely_string: assume the codepoint is valid"
331                                                .to_string(),
332                                        position: self.position(),
333                                    })
334                                }
335                            }
336                        }
337                        n => match std::char::from_u32(n as u32) {
338                            Some(ch) => ch,
339                            None => {
340                                return Err(ReadError::Bug {
341                                    msg: "Read.next_likely_string: assume the codepoint is valid"
342                                        .to_string(),
343                                    position: self.position(),
344                                });
345                            }
346                        },
347                    };
348
349                    buf.extend_from_slice(ch.encode_utf8(&mut [0u8; 4]).as_bytes());
350                }
351                _ => return Err(ReadError::ControlCharacterInString(self.position())),
352            }
353        }
354
355        Err(ReadError::UnclosedString(self.position()))
356    }
357}