musli/json/parser/
string.rs

1#![allow(clippy::zero_prefixed_literal)]
2
3use crate::alloc::Vec;
4use crate::{Allocator, Context};
5
// Copied and adapted from the serde-json project under the MIT and Apache 2.0
// license.
//
// See: https://github.com/serde-rs/json
10
// Lookup table of bytes that terminate a run of plain string content. The entry
// at index `i` is `true` when byte `i` may not appear unescaped inside a string:
// the control characters \x00..=\x1F, the quote \x22 and the backslash \x5C.
static ESCAPE: [bool; 256] = {
    let mut table = [false; 256];

    // Control characters \x00..=\x1F.
    let mut byte = 0usize;

    while byte < 0x20 {
        table[byte] = true;
        byte += 1;
    }

    // Quote and backslash.
    table[b'"' as usize] = true;
    table[b'\\' as usize] = true;

    table
};
38
/// The result of parsing a string: either a slice borrowed straight from the
/// input, or a slice of the scratch buffer the string was copied into.
#[doc(hidden)]
pub enum StringReference<'de, 'scratch> {
    /// String data borrowed from the input with lifetime `'de`.
    Borrowed(&'de str),
    /// String data stored in scratch space with lifetime `'scratch`.
    Scratch(&'scratch str),
}

impl StringReference<'_, '_> {
    /// Access the underlying string regardless of where it is stored.
    #[inline]
    pub(crate) fn as_str(&self) -> &str {
        match *self {
            StringReference::Borrowed(borrowed) => borrowed,
            StringReference::Scratch(scratch) => scratch,
        }
    }
}
56
/// Accessor for a slice.
///
/// Bundles the input bytes with a cursor into them and the context that is
/// used to report errors and to record how many bytes have been consumed.
pub(crate) struct SliceAccess<'de, C> {
    /// Context used for producing errors, taking marks and advancing.
    cx: C,
    /// The input being read.
    slice: &'de [u8],
    /// Offset into `slice` of the next byte to read.
    pub(crate) index: usize,
}
63
64impl<'de, C> SliceAccess<'de, C>
65where
66    C: Context,
67{
68    #[inline]
69    pub(crate) fn new(cx: C, slice: &'de [u8], index: usize) -> Self {
70        Self { cx, slice, index }
71    }
72
73    #[inline]
74    fn next(&mut self) -> Result<u8, C::Error> {
75        let Some(b) = self.slice.get(self.index) else {
76            return Err(self.cx.message("End of input"));
77        };
78
79        self.cx.advance(1);
80        self.index += 1;
81        Ok(*b)
82    }
83
84    #[inline]
85    fn parse_hex_escape(&mut self) -> Result<u16, C::Error> {
86        let &[a, b, c, d, ..] = &self.slice[self.index..] else {
87            return Err(self.cx.message("Unexpected end of hex escape"));
88        };
89
90        let mut n = 0;
91        let start = self.cx.mark();
92
93        for b in [a, b, c, d] {
94            let Some(val) = decode_hex_val(b) else {
95                return Err(self
96                    .cx
97                    .message_at(&start, "Non-hex digit in escape sequence"));
98            };
99
100            n = (n << 4) + val;
101        }
102
103        self.index += 4;
104        self.cx.advance(4);
105        Ok(n)
106    }
107
108    /// Parses a JSON escape sequence and appends it into the scratch space. Assumes
109    /// the previous byte read was a backslash.
110    pub(crate) fn parse_escape(
111        &mut self,
112        validate: bool,
113        scratch: &mut Vec<u8, C::Allocator>,
114    ) -> Result<bool, C::Error> {
115        let start = self.cx.mark();
116        let b = self.next()?;
117
118        let extend = match b {
119            b'"' => scratch.push(b'"').is_ok(),
120            b'\\' => scratch.push(b'\\').is_ok(),
121            b'/' => scratch.push(b'/').is_ok(),
122            b'b' => scratch.push(b'\x08').is_ok(),
123            b'f' => scratch.push(b'\x0c').is_ok(),
124            b'n' => scratch.push(b'\n').is_ok(),
125            b'r' => scratch.push(b'\r').is_ok(),
126            b't' => scratch.push(b'\t').is_ok(),
127            b'u' => {
128                fn encode_surrogate(scratch: &mut Vec<u8, impl Allocator>, n: u16) -> bool {
129                    scratch
130                        .extend_from_slice(&[
131                            ((n >> 12) & 0b0000_1111) as u8 | 0b1110_0000,
132                            ((n >> 6) & 0b0011_1111) as u8 | 0b1000_0000,
133                            (n & 0b0011_1111) as u8 | 0b1000_0000,
134                        ])
135                        .is_ok()
136                }
137
138                let c = match self.parse_hex_escape()? {
139                    n @ 0xDC00..=0xDFFF => {
140                        return if validate {
141                            Err(self
142                                .cx
143                                .message_at(&start, "Lone leading surrogate in hex escape"))
144                        } else {
145                            Ok(encode_surrogate(scratch, n))
146                        };
147                    }
148
149                    // Non-BMP characters are encoded as a sequence of two hex
150                    // escapes, representing UTF-16 surrogates. If deserializing a
151                    // utf-8 string the surrogates are required to be paired,
152                    // whereas deserializing a byte string accepts lone surrogates.
153                    n1 @ 0xD800..=0xDBFF => {
154                        let pos = self.cx.mark();
155
156                        if self.next()? != b'\\' {
157                            return if validate {
158                                Err(self.cx.message_at(&pos, "Unexpected end of hex escape"))
159                            } else {
160                                Ok(encode_surrogate(scratch, n1))
161                            };
162                        }
163
164                        if self.next()? != b'u' {
165                            return if validate {
166                                Err(self.cx.message_at(&pos, "Unexpected end of hex escape"))
167                            } else {
168                                if !encode_surrogate(scratch, n1) {
169                                    return Ok(false);
170                                }
171
172                                // The \ prior to this byte started an escape sequence,
173                                // so we need to parse that now. This recursive call
174                                // does not blow the stack on malicious input because
175                                // the escape is not \u, so it will be handled by one
176                                // of the easy nonrecursive cases.
177                                self.parse_escape(validate, scratch)
178                            };
179                        }
180
181                        let n2 = self.parse_hex_escape()?;
182
183                        if !(0xDC00..=0xDFFF).contains(&n2) {
184                            return Err(self
185                                .cx
186                                .message_at(&start, "Lone leading surrogate in hex escape"));
187                        }
188
189                        let n = ((((n1 - 0xD800) as u32) << 10) | (n2 - 0xDC00) as u32) + 0x1_0000;
190
191                        match char::from_u32(n) {
192                            Some(c) => c,
193                            None => {
194                                return Err(self.cx.message_at(&start, "Invalid unicode"));
195                            }
196                        }
197                    }
198
199                    // Every u16 outside of the surrogate ranges above is guaranteed
200                    // to be a legal char.
201                    n => char::from_u32(n as u32).unwrap(),
202                };
203
204                scratch
205                    .extend_from_slice(c.encode_utf8(&mut [0u8; 4]).as_bytes())
206                    .is_ok()
207            }
208            _ => {
209                return Err(self.cx.message_at(&start, "Invalid string escape"));
210            }
211        };
212
213        Ok(extend)
214    }
215
216    /// Parses a JSON escape sequence and appends it into the scratch space. Assumes
217    /// the previous byte read was a backslash.
218    fn skip_escape(&mut self, validate: bool) -> Result<(), C::Error> {
219        let start = self.cx.mark();
220        let b = self.next()?;
221
222        match b {
223            b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => (),
224            b'u' => {
225                match self.parse_hex_escape()? {
226                    0xDC00..=0xDFFF => {
227                        return if validate {
228                            Err(self
229                                .cx
230                                .message_at(&start, "Lone leading surrogate in hex escape"))
231                        } else {
232                            Ok(())
233                        };
234                    }
235
236                    // Non-BMP characters are encoded as a sequence of two hex
237                    // escapes, representing UTF-16 surrogates. If deserializing a
238                    // utf-8 string the surrogates are required to be paired,
239                    // whereas deserializing a byte string accepts lone surrogates.
240                    n1 @ 0xD800..=0xDBFF => {
241                        let pos = self.cx.mark();
242
243                        if self.next()? != b'\\' {
244                            return if validate {
245                                Err(self.cx.message_at(&pos, "Unexpected end of hex escape"))
246                            } else {
247                                Ok(())
248                            };
249                        }
250
251                        if self.next()? != b'u' {
252                            return if validate {
253                                Err(self.cx.message_at(&pos, "Unexpected end of hex escape"))
254                            } else {
255                                // The \ prior to this byte started an escape sequence,
256                                // so we need to parse that now. This recursive call
257                                // does not blow the stack on malicious input because
258                                // the escape is not \u, so it will be handled by one
259                                // of the easy nonrecursive cases.
260                                self.skip_escape(validate)
261                            };
262                        }
263
264                        let n2 = self.parse_hex_escape()?;
265
266                        if !(0xDC00..=0xDFFF).contains(&n2) {
267                            return Err(self
268                                .cx
269                                .message_at(&start, "Lone leading surrogate in hex escape"));
270                        }
271
272                        let n = ((((n1 - 0xD800) as u32) << 10) | (n2 - 0xDC00) as u32) + 0x1_0000;
273
274                        if char::from_u32(n).is_none() {
275                            return Err(self.cx.message_at(&start, "Invalid unicode"));
276                        }
277                    }
278
279                    // Every u16 outside of the surrogate ranges above is guaranteed
280                    // to be a legal char.
281                    _ => (),
282                }
283            }
284            _ => {
285                return Err(self.cx.message_at(&start, "Invalid string escape"));
286            }
287        };
288
289        Ok(())
290    }
291
292    /// Reader implementation from a slice.
293    pub(crate) fn parse_string<'scratch>(
294        &mut self,
295        validate: bool,
296        start: &C::Mark,
297        scratch: &'scratch mut Vec<u8, C::Allocator>,
298    ) -> Result<StringReference<'de, 'scratch>, C::Error> {
299        // Index of the first byte not yet copied into the scratch space.
300        let mut open_mark = self.cx.mark();
301        let mut open = self.index;
302
303        loop {
304            while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
305                self.index = self.index.wrapping_add(1);
306                self.cx.advance(1);
307            }
308
309            if self.index == self.slice.len() {
310                return Err(self.cx.message("End of input"));
311            }
312
313            match self.slice[self.index] {
314                b'"' => {
315                    if scratch.is_empty() {
316                        // Fast path: return a slice of the raw JSON without any
317                        // copying.
318                        let borrowed = &self.slice[open..self.index];
319
320                        self.index = self.index.wrapping_add(1);
321                        self.cx.advance(1);
322
323                        self.check_utf8(borrowed, start)?;
324
325                        // SAFETY: we've checked each segment to be valid UTF-8.
326                        let borrowed = unsafe { core::str::from_utf8_unchecked(borrowed) };
327                        return Ok(StringReference::Borrowed(borrowed));
328                    } else {
329                        let slice = &self.slice[open..self.index];
330                        self.check_utf8(slice, start)?;
331
332                        if scratch.extend_from_slice(slice).is_err() {
333                            return Err(self.cx.message("Scratch buffer overflow"));
334                        }
335
336                        self.index = self.index.wrapping_add(1);
337                        self.cx.advance(1);
338
339                        // SAFETY: we've checked each segment to be valid UTF-8.
340                        let scratch = unsafe { core::str::from_utf8_unchecked(scratch.as_slice()) };
341                        return Ok(StringReference::Scratch(scratch));
342                    }
343                }
344                b'\\' => {
345                    let slice = &self.slice[open..self.index];
346                    self.check_utf8(slice, start)?;
347
348                    if scratch.extend_from_slice(slice).is_err() {
349                        return Err(self.cx.message("Scratch buffer overflow"));
350                    }
351
352                    self.index = self.index.wrapping_add(1);
353                    self.cx.advance(1);
354
355                    if !self.parse_escape(validate, scratch)? {
356                        return Err(self.cx.message_at(&open_mark, "Buffer overflow"));
357                    }
358
359                    open = self.index;
360                    open_mark = self.cx.mark();
361                }
362                _ => {
363                    if validate {
364                        return Err(self
365                            .cx
366                            .message_at(&open_mark, "Control character while parsing string"));
367                    }
368
369                    self.index = self.index.wrapping_add(1);
370                    self.cx.advance(1);
371                }
372            }
373        }
374    }
375
376    /// Reader implementation from a slice.
377    pub(crate) fn skip_string(&mut self) -> Result<(), C::Error> {
378        loop {
379            while let Some(b) = self.slice.get(self.index) {
380                if ESCAPE[*b as usize] {
381                    break;
382                }
383
384                self.index = self.index.wrapping_add(1);
385                self.cx.advance(1);
386            }
387
388            let b = self.next()?;
389
390            match b {
391                b'"' => {
392                    return Ok(());
393                }
394                b'\\' => {
395                    self.skip_escape(true)?;
396                }
397                _ => {
398                    return Err(self.cx.message("Control character while parsing string"));
399                }
400            }
401        }
402    }
403
404    /// Check that the given slice is valid UTF-8.
405    #[inline]
406    fn check_utf8(&self, bytes: &[u8], start: &C::Mark) -> Result<(), C::Error> {
407        if crate::str::from_utf8(bytes).is_err() {
408            Err(self.cx.message_at(start, "Invalid unicode string"))
409        } else {
410            Ok(())
411        }
412    }
413}
414
// Lookup table mapping an ASCII byte to the value of the hex digit it
// represents. Bytes that are not hex digits map to 255.
static HEX: [u8; 256] = {
    let mut table = [255u8; 256];

    // '0'..='9' map to 0..=9.
    let mut digit = 0u8;

    while digit < 10 {
        table[(b'0' + digit) as usize] = digit;
        digit += 1;
    }

    // 'A'..='F' and 'a'..='f' both map to 10..=15.
    let mut letter = 0u8;

    while letter < 6 {
        table[(b'A' + letter) as usize] = 10 + letter;
        table[(b'a' + letter) as usize] = 10 + letter;
        letter += 1;
    }

    table
};
437
438#[inline]
439pub(crate) fn decode_hex_val(val: u8) -> Option<u16> {
440    let n = HEX[val as usize] as u16;
441
442    if n == 255 {
443        None
444    } else {
445        Some(n)
446    }
447}