musli_json/parser/
string.rs

1#![allow(clippy::zero_prefixed_literal)]
2
3use musli::{Buf, Context};
4
5use crate::parser::{Parser, SliceParser};
6
// Copied and adapted from the serde-json project under the MIT and Apache 2.0
// license.
9//
10// See: https://github.com/serde-rs/json
11
// Lookup table of bytes that cannot appear verbatim inside a JSON string. A
// value of true at index i means that byte i interrupts the fast scanning
// loop: it is either a control character, the closing quote, or the backslash
// that introduces an escape sequence.
static ESCAPE: [bool; 256] = {
    let mut table = [false; 256];

    // Control characters \x00..=\x1F.
    let mut byte = 0usize;

    while byte < 0x20 {
        table[byte] = true;
        byte += 1;
    }

    // Quote \x22.
    table[b'"' as usize] = true;
    // Backslash \x5C.
    table[b'\\' as usize] = true;

    table
};
39
/// A parsed string reference.
///
/// The two variants distinguish where the string data lives, which determines
/// the lifetime the caller may hold on to it for.
#[doc(hidden)]
pub enum StringReference<'de, 'scratch> {
    /// The string is a sub-slice of the input being parsed and was returned
    /// without copying.
    Borrowed(&'de str),
    /// The string was assembled in the scratch buffer and borrows from it.
    Scratch(&'scratch str),
}
46
47/// Specialized reader implementation from a slice.
48pub(crate) fn parse_string_slice_reader<'de, 'scratch, C, S>(
49    cx: &C,
50    reader: &mut SliceParser<'de>,
51    validate: bool,
52    start: C::Mark,
53    scratch: &'scratch mut S,
54) -> Result<StringReference<'de, 'scratch>, C::Error>
55where
56    C: ?Sized + Context,
57    S: ?Sized + Buf,
58{
59    // Index of the first byte not yet copied into the scratch space.
60    let mut open_mark = cx.mark();
61    let mut open = reader.index;
62
63    loop {
64        while reader.index < reader.slice.len() && !ESCAPE[reader.slice[reader.index] as usize] {
65            reader.index = reader.index.wrapping_add(1);
66            cx.advance(1);
67        }
68
69        if reader.index == reader.slice.len() {
70            return Err(cx.message("End of input"));
71        }
72
73        match reader.slice[reader.index] {
74            b'"' => {
75                if scratch.is_empty() {
76                    // Fast path: return a slice of the raw JSON without any
77                    // copying.
78                    let borrowed = &reader.slice[open..reader.index];
79                    reader.index = reader.index.wrapping_add(1);
80                    cx.advance(1);
81                    check_utf8(cx, borrowed, start)?;
82                    // SAFETY: we've checked each segment to be valid UTF-8.
83                    let borrowed = unsafe { core::str::from_utf8_unchecked(borrowed) };
84                    return Ok(StringReference::Borrowed(borrowed));
85                } else {
86                    let slice = &reader.slice[open..reader.index];
87                    check_utf8(cx, slice, start)?;
88
89                    if !scratch.write(slice) {
90                        return Err(cx.message("Scratch buffer overflow"));
91                    }
92
93                    reader.index = reader.index.wrapping_add(1);
94                    cx.advance(1);
95                    // SAFETY: we've checked each segment to be valid UTF-8.
96                    let scratch = unsafe { core::str::from_utf8_unchecked(scratch.as_slice()) };
97                    return Ok(StringReference::Scratch(scratch));
98                }
99            }
100            b'\\' => {
101                let slice = &reader.slice[open..reader.index];
102                check_utf8(cx, slice, start)?;
103
104                if !scratch.write(slice) {
105                    return Err(cx.message("Scratch buffer overflow"));
106                }
107
108                reader.index = reader.index.wrapping_add(1);
109                cx.advance(1);
110
111                if !parse_escape(cx, reader, validate, scratch)? {
112                    return Err(cx.marked_message(open_mark, "Buffer overflow"));
113                }
114
115                open = reader.index;
116                open_mark = cx.mark();
117            }
118            _ => {
119                if validate {
120                    return Err(
121                        cx.marked_message(open_mark, "Control character while parsing string")
122                    );
123                }
124
125                reader.index = reader.index.wrapping_add(1);
126                cx.advance(1);
127            }
128        }
129    }
130}
131
132/// Check that the given slice is valid UTF-8.
133#[inline]
134fn check_utf8<C>(cx: &C, bytes: &[u8], start: C::Mark) -> Result<(), C::Error>
135where
136    C: ?Sized + Context,
137{
138    if crate::str::from_utf8(bytes).is_err() {
139        Err(cx.marked_message(start, "Invalid unicode string"))
140    } else {
141        Ok(())
142    }
143}
144
145/// Parses a JSON escape sequence and appends it into the scratch space. Assumes
146/// the previous byte read was a backslash.
147fn parse_escape<C, B>(
148    cx: &C,
149    parser: &mut SliceParser<'_>,
150    validate: bool,
151    scratch: &mut B,
152) -> Result<bool, C::Error>
153where
154    C: ?Sized + Context,
155    B: ?Sized + Buf,
156{
157    let start = cx.mark();
158    let b = parser.read_byte(cx)?;
159
160    let extend = match b {
161        b'"' => scratch.push(b'"'),
162        b'\\' => scratch.push(b'\\'),
163        b'/' => scratch.push(b'/'),
164        b'b' => scratch.push(b'\x08'),
165        b'f' => scratch.push(b'\x0c'),
166        b'n' => scratch.push(b'\n'),
167        b'r' => scratch.push(b'\r'),
168        b't' => scratch.push(b'\t'),
169        b'u' => {
170            fn encode_surrogate<B>(scratch: &mut B, n: u16) -> bool
171            where
172                B: ?Sized + Buf,
173            {
174                scratch.write(&[
175                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
176                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
177                    (n & 0b0011_1111) as u8 | 0b1000_0000,
178                ])
179            }
180
181            let c = match parser.parse_hex_escape(cx)? {
182                n @ 0xDC00..=0xDFFF => {
183                    return if validate {
184                        Err(cx.marked_message(start, "Lone leading surrogate in hex escape"))
185                    } else {
186                        Ok(encode_surrogate(scratch, n))
187                    };
188                }
189
190                // Non-BMP characters are encoded as a sequence of two hex
191                // escapes, representing UTF-16 surrogates. If deserializing a
192                // utf-8 string the surrogates are required to be paired,
193                // whereas deserializing a byte string accepts lone surrogates.
194                n1 @ 0xD800..=0xDBFF => {
195                    let pos = cx.mark();
196
197                    if parser.read_byte(cx)? != b'\\' {
198                        return if validate {
199                            Err(cx.marked_message(pos, "Unexpected end of hex escape"))
200                        } else {
201                            Ok(encode_surrogate(scratch, n1))
202                        };
203                    }
204
205                    if parser.read_byte(cx)? != b'u' {
206                        return if validate {
207                            Err(cx.marked_message(pos, "Unexpected end of hex escape"))
208                        } else {
209                            if !encode_surrogate(scratch, n1) {
210                                return Ok(false);
211                            }
212
213                            // The \ prior to this byte started an escape sequence,
214                            // so we need to parse that now. This recursive call
215                            // does not blow the stack on malicious input because
216                            // the escape is not \u, so it will be handled by one
217                            // of the easy nonrecursive cases.
218                            parse_escape(cx, parser, validate, scratch)
219                        };
220                    }
221
222                    let n2 = parser.parse_hex_escape(cx)?;
223
224                    if !(0xDC00..=0xDFFF).contains(&n2) {
225                        return Err(
226                            cx.marked_message(start, "Lone leading surrogate in hex escape")
227                        );
228                    }
229
230                    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
231
232                    match char::from_u32(n) {
233                        Some(c) => c,
234                        None => {
235                            return Err(cx.marked_message(start, "Invalid unicode"));
236                        }
237                    }
238                }
239
240                // Every u16 outside of the surrogate ranges above is guaranteed
241                // to be a legal char.
242                n => char::from_u32(n as u32).unwrap(),
243            };
244
245            scratch.write(c.encode_utf8(&mut [0u8; 4]).as_bytes())
246        }
247        _ => {
248            return Err(cx.marked_message(start, "Invalid string escape"));
249        }
250    };
251
252    Ok(extend)
253}
254
// Lookup table mapping an ASCII byte to the value of the hexadecimal digit it
// represents. Bytes which are not hex digits map to 255.
static HEX: [u8; 256] = {
    // Start out with every byte marked as "not a hex digit".
    let mut table = [255u8; 256];

    // '0'..='9' map to 0..=9.
    let mut digit = 0u8;

    while digit < 10 {
        table[(b'0' + digit) as usize] = digit;
        digit += 1;
    }

    // 'A'..='F' and 'a'..='f' both map to 10..=15.
    let mut offset = 0u8;

    while offset < 6 {
        table[(b'A' + offset) as usize] = 10 + offset;
        table[(b'a' + offset) as usize] = 10 + offset;
        offset += 1;
    }

    table
};
277
278pub(crate) fn decode_hex_val(val: u8) -> Option<u16> {
279    let n = HEX[val as usize] as u16;
280
281    if n == 255 {
282        None
283    } else {
284        Some(n)
285    }
286}
287
288/// Specialized reader implementation from a slice.
289pub(crate) fn skip_string<'de, P, C>(cx: &C, mut p: P, validate: bool) -> Result<(), C::Error>
290where
291    P: Parser<'de>,
292    C: ?Sized + Context,
293{
294    loop {
295        while let Some(b) = p.peek_byte(cx)? {
296            if ESCAPE[b as usize] {
297                break;
298            }
299
300            p.skip(cx, 1)?;
301        }
302
303        let b = p.read_byte(cx)?;
304
305        match b {
306            b'"' => {
307                return Ok(());
308            }
309            b'\\' => {
310                skip_escape(cx, p.borrow_mut(), validate)?;
311            }
312            _ => {
313                if validate {
314                    return Err(cx.message("Control character while parsing string"));
315                }
316            }
317        }
318    }
319}
320
321/// Parses a JSON escape sequence and appends it into the scratch space. Assumes
322/// the previous byte read was a backslash.
323fn skip_escape<'de, P, C>(cx: &C, mut p: P, validate: bool) -> Result<(), C::Error>
324where
325    P: Parser<'de>,
326    C: ?Sized + Context,
327{
328    let start = cx.mark();
329    let b = p.read_byte(cx)?;
330
331    match b {
332        b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => (),
333        b'u' => {
334            match p.parse_hex_escape(cx)? {
335                0xDC00..=0xDFFF => {
336                    return if validate {
337                        Err(cx.marked_message(start, "Lone leading surrogate in hex escape"))
338                    } else {
339                        Ok(())
340                    };
341                }
342
343                // Non-BMP characters are encoded as a sequence of two hex
344                // escapes, representing UTF-16 surrogates. If deserializing a
345                // utf-8 string the surrogates are required to be paired,
346                // whereas deserializing a byte string accepts lone surrogates.
347                n1 @ 0xD800..=0xDBFF => {
348                    let pos = cx.mark();
349
350                    if p.read_byte(cx)? != b'\\' {
351                        return if validate {
352                            Err(cx.marked_message(pos, "Unexpected end of hex escape"))
353                        } else {
354                            Ok(())
355                        };
356                    }
357
358                    if p.read_byte(cx)? != b'u' {
359                        return if validate {
360                            Err(cx.marked_message(pos, "Unexpected end of hex escape"))
361                        } else {
362                            // The \ prior to this byte started an escape sequence,
363                            // so we need to parse that now. This recursive call
364                            // does not blow the stack on malicious input because
365                            // the escape is not \u, so it will be handled by one
366                            // of the easy nonrecursive cases.
367                            skip_escape(cx, p, validate)
368                        };
369                    }
370
371                    let n2 = p.parse_hex_escape(cx)?;
372
373                    if !(0xDC00..=0xDFFF).contains(&n2) {
374                        return Err(
375                            cx.marked_message(start, "Lone leading surrogate in hex escape")
376                        );
377                    }
378
379                    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
380
381                    if char::from_u32(n).is_none() {
382                        return Err(cx.marked_message(start, "Invalid unicode"));
383                    }
384                }
385
386                // Every u16 outside of the surrogate ranges above is guaranteed
387                // to be a legal char.
388                _ => (),
389            }
390        }
391        _ => {
392            return Err(cx.marked_message(start, "Invalid string escape"));
393        }
394    };
395
396    Ok(())
397}