Skip to main content

osproxy_core/
json.rs

1//! Byte-level JSON scanning for the no-materialization body path (ADR-014).
2//!
3//! These routines read exactly what a tenancy transform needs, the set of
4//! top-level field names (to detect a spoofed reserved field) and a scalar at a
5//! path (to find the partition key or build an id), by scanning the raw body
6//! bytes, **without ever building a parsed JSON tree**. Retained memory is
7//! bounded by the few small key strings (or the one extracted scalar), never by
8//! document size (INV-MEM): every value the scan does not need is skipped without
9//! allocating.
10//!
//! It lives in `core` because it is dependency-free pure computation that both
//! the SPI (partition extraction utilities) and the transform layer (id
//! construction, field-splice injection) build on; the two sides cannot share a
//! helper that lives in either of them.
15//!
//! The scanner is strict: it parses the JSON grammar fully so a malformed body
//! is rejected here rather than mis-located. Key strings are decoded before they
//! are compared, so a client cannot smuggle a reserved field name past a
//! collision check by escaping it (e.g. `"\u005ftenant"` for `_tenant`).
20//
21// JUSTIFY(file-length): one cohesive recursive-descent JSON scanner, the
22// `Parser` and its grammar productions (value/object/array/string/number/escape)
23// are a single unit that must agree on cursor invariants; splitting the
24// productions across files would scatter that shared state for no readability
25// gain. Tests live separately in `json_tests.rs`.
26
27use thiserror::Error;
28
29/// A failure scanning raw JSON bytes.
30///
31/// Deliberately exhaustive (not `#[non_exhaustive]`): it is a small, closed set
32/// of JSON-shape failures, and downstream `From` conversions must map every
33/// variant, a new one should be a compile error to handle, not silently fall
34/// through a wildcard.
35#[derive(Debug, Error, PartialEq, Eq)]
36pub enum JsonError {
37    /// The bytes were not valid JSON.
38    #[error("not valid JSON")]
39    Invalid,
40
41    /// The document was expected to be a JSON object but was not.
42    #[error("not a JSON object")]
43    NotAnObject,
44
45    /// A path does not resolve to a scalar value in the document.
46    #[error("path does not resolve to a scalar value")]
47    PathNotScalar {
48        /// The dotted path that failed to resolve.
49        path: String,
50    },
51}
52
/// What the scanner learned about the root object of a document: the offset
/// at which injected fields can be spliced in, whether the object already has
/// members, and the decoded names of its top-level keys.
#[derive(Debug)]
pub struct TopLevel {
    /// Byte offset immediately after the opening `{`; injected fields are
    /// spliced in here.
    pub insert_at: usize,
    /// `true` when the object is `{}` (no members), in which case a splice
    /// needs no trailing comma.
    pub empty: bool,
    /// Top-level key names with escapes already resolved, ready for collision
    /// checks.
    pub keys: Vec<String>,
}
64
65/// Locates the top level of the JSON object in `body`, validating the whole
66/// document as it goes.
67///
68/// # Errors
69///
70/// [`JsonError::NotAnObject`] if `body` is valid JSON but not an object,
71/// [`JsonError::Invalid`] if it is not valid JSON.
72pub fn object_top_level(body: &[u8]) -> Result<TopLevel, JsonError> {
73    let mut p = Parser::new(body);
74    p.skip_ws();
75    if p.peek() != Some(b'{') {
76        // Not an object: distinguish malformed JSON from a non-object value so
77        // the caller can report the right error.
78        return Err(match validate(body) {
79            Ok(()) => JsonError::NotAnObject,
80            Err(e) => e,
81        });
82    }
83    let top = p.object_members()?;
84    p.skip_ws();
85    if p.peek().is_some() {
86        return Err(JsonError::Invalid);
87    }
88    Ok(top)
89}
90
91/// Follows `segments` into the object in `body` and returns the leaf scalar as a
92/// string: strings are decoded, numbers and bools use their source text.
93///
94/// # Errors
95///
96/// [`JsonError::PathNotScalar`] if a segment is missing or the leaf is an
97/// object, array, or null; [`JsonError::Invalid`] if `body` up to the leaf is
98/// not valid JSON.
99pub fn scalar_at_path<'a, I>(body: &[u8], segments: I) -> Result<String, JsonError>
100where
101    I: IntoIterator<Item = &'a str>,
102{
103    let mut p = Parser::new(body);
104    let mut walked: Vec<&str> = Vec::new();
105    for segment in segments {
106        walked.push(segment);
107        p.enter_field(segment)
108            .ok_or_else(|| JsonError::PathNotScalar {
109                path: walked.join("."),
110            })?;
111    }
112    p.skip_ws();
113    p.scalar_string().ok_or_else(|| JsonError::PathNotScalar {
114        path: walked.join("."),
115    })
116}
117
118/// Validates that `body` is a single well-formed JSON document (trailing
119/// whitespace allowed), allocating nothing.
120///
121/// # Errors
122///
123/// [`JsonError::Invalid`] if `body` is not valid JSON.
124pub fn validate(body: &[u8]) -> Result<(), JsonError> {
125    let mut p = Parser::new(body);
126    p.skip_value()?;
127    p.skip_ws();
128    if p.peek().is_some() {
129        return Err(JsonError::Invalid);
130    }
131    Ok(())
132}
133
/// A read cursor over the raw JSON bytes being scanned.
struct Parser<'a> {
    /// The complete input.
    b: &'a [u8],
    /// Index into `b` of the next unread byte.
    i: usize,
}
139
140impl<'a> Parser<'a> {
141    fn new(b: &'a [u8]) -> Self {
142        Self { b, i: 0 }
143    }
144
145    fn peek(&self) -> Option<u8> {
146        self.b.get(self.i).copied()
147    }
148
149    fn skip_ws(&mut self) {
150        while matches!(self.peek(), Some(b' ' | b'\t' | b'\n' | b'\r')) {
151            self.i += 1;
152        }
153    }
154
155    /// Parses the object at the cursor (which must be `{`), recording its top
156    /// level. Used for the document root by [`object_top_level`].
157    fn object_members(&mut self) -> Result<TopLevel, JsonError> {
158        debug_assert_eq!(self.peek(), Some(b'{'));
159        self.i += 1; // opening brace
160        let insert_at = self.i;
161        let mut keys = Vec::new();
162        self.skip_ws();
163        if self.peek() == Some(b'}') {
164            self.i += 1;
165            return Ok(TopLevel {
166                insert_at,
167                empty: true,
168                keys,
169            });
170        }
171        loop {
172            self.skip_ws();
173            keys.push(self.string_decode()?);
174            self.skip_ws();
175            self.expect(b':')?;
176            self.skip_value()?;
177            self.skip_ws();
178            match self.peek() {
179                Some(b',') => self.i += 1,
180                Some(b'}') => {
181                    self.i += 1;
182                    break;
183                }
184                _ => return Err(JsonError::Invalid),
185            }
186        }
187        Ok(TopLevel {
188            insert_at,
189            empty: false,
190            keys,
191        })
192    }
193
194    /// Positions the cursor at the value of object member `key`, returning
195    /// `Some(())` if found. On a miss (or a non-object), returns `None` and the
196    /// cursor position is unspecified.
197    fn enter_field(&mut self, key: &str) -> Option<()> {
198        self.skip_ws();
199        if self.peek() != Some(b'{') {
200            return None;
201        }
202        self.i += 1;
203        self.skip_ws();
204        if self.peek() == Some(b'}') {
205            return None;
206        }
207        loop {
208            self.skip_ws();
209            let k = self.string_decode().ok()?;
210            self.skip_ws();
211            self.expect(b':').ok()?;
212            self.skip_ws();
213            if k == key {
214                return Some(());
215            }
216            self.skip_value().ok()?;
217            self.skip_ws();
218            match self.peek() {
219                Some(b',') => self.i += 1,
220                _ => return None,
221            }
222        }
223    }
224
225    /// Reads the scalar at the cursor as a string: strings decoded, numbers and
226    /// bools as their source text. `None` for object/array/null/malformed.
227    fn scalar_string(&mut self) -> Option<String> {
228        match self.peek()? {
229            b'"' => self.string_decode().ok(),
230            b't' => self.literal(b"true").ok().map(|()| "true".to_owned()),
231            b'f' => self.literal(b"false").ok().map(|()| "false".to_owned()),
232            c if c == b'-' || c.is_ascii_digit() => {
233                let start = self.i;
234                self.number().ok()?;
235                std::str::from_utf8(&self.b[start..self.i])
236                    .ok()
237                    .map(str::to_owned)
238            }
239            _ => None,
240        }
241    }
242
243    fn expect(&mut self, byte: u8) -> Result<(), JsonError> {
244        if self.peek() == Some(byte) {
245            self.i += 1;
246            Ok(())
247        } else {
248            Err(JsonError::Invalid)
249        }
250    }
251
252    /// Skips one complete JSON value, allocating nothing.
253    fn skip_value(&mut self) -> Result<(), JsonError> {
254        self.skip_ws();
255        match self.peek().ok_or(JsonError::Invalid)? {
256            b'{' => self.skip_object(),
257            b'[' => self.skip_array(),
258            b'"' => self.skip_string(),
259            b't' => self.literal(b"true"),
260            b'f' => self.literal(b"false"),
261            b'n' => self.literal(b"null"),
262            c if c == b'-' || c.is_ascii_digit() => self.number(),
263            _ => Err(JsonError::Invalid),
264        }
265    }
266
267    fn skip_object(&mut self) -> Result<(), JsonError> {
268        self.i += 1; // '{'
269        self.skip_ws();
270        if self.peek() == Some(b'}') {
271            self.i += 1;
272            return Ok(());
273        }
274        loop {
275            self.skip_ws();
276            self.skip_string()?;
277            self.skip_ws();
278            self.expect(b':')?;
279            self.skip_value()?;
280            self.skip_ws();
281            match self.peek() {
282                Some(b',') => self.i += 1,
283                Some(b'}') => {
284                    self.i += 1;
285                    return Ok(());
286                }
287                _ => return Err(JsonError::Invalid),
288            }
289        }
290    }
291
292    fn skip_array(&mut self) -> Result<(), JsonError> {
293        self.i += 1; // '['
294        self.skip_ws();
295        if self.peek() == Some(b']') {
296            self.i += 1;
297            return Ok(());
298        }
299        loop {
300            self.skip_value()?;
301            self.skip_ws();
302            match self.peek() {
303                Some(b',') => self.i += 1,
304                Some(b']') => {
305                    self.i += 1;
306                    return Ok(());
307                }
308                _ => return Err(JsonError::Invalid),
309            }
310        }
311    }
312
313    /// Skips a string (cursor at the opening quote), handling escapes, no alloc.
314    fn skip_string(&mut self) -> Result<(), JsonError> {
315        self.expect(b'"')?;
316        loop {
317            match self.peek().ok_or(JsonError::Invalid)? {
318                b'"' => {
319                    self.i += 1;
320                    return Ok(());
321                }
322                b'\\' => {
323                    self.i += 1;
324                    // Consume the escaped char; `\u` carries four more hex digits.
325                    let esc = self.peek().ok_or(JsonError::Invalid)?;
326                    self.i += 1;
327                    if esc == b'u' {
328                        for _ in 0..4 {
329                            self.hex_digit()?;
330                        }
331                    }
332                }
333                c if c < 0x20 => return Err(JsonError::Invalid),
334                _ => self.i += 1,
335            }
336        }
337    }
338
339    /// Decodes a string (cursor at the opening quote) into an owned `String`.
340    fn string_decode(&mut self) -> Result<String, JsonError> {
341        self.expect(b'"')?;
342        // Accumulate raw bytes, not `char`s: a literal multi-byte UTF-8 sequence
343        // must be copied verbatim. Decoding each byte as a `char` (the Latin-1
344        // `char::from(u8)` mapping) would re-encode every continuation byte as its
345        // own code point, e.g. "café" → "café", corrupting any non-ASCII
346        // partition key or id-template input. The validation at the close rejects a
347        // string that is not valid UTF-8 (JSON must be UTF-8), keeping the scanner
348        // strict rather than silently producing mojibake.
349        let mut out: Vec<u8> = Vec::new();
350        loop {
351            match self.peek().ok_or(JsonError::Invalid)? {
352                b'"' => {
353                    self.i += 1;
354                    return String::from_utf8(out).map_err(|_| JsonError::Invalid);
355                }
356                b'\\' => {
357                    self.i += 1;
358                    self.decode_escape(&mut out)?;
359                }
360                c if c < 0x20 => return Err(JsonError::Invalid),
361                _ => {
362                    // A literal byte (ASCII, or one byte of a multi-byte sequence):
363                    // copy it verbatim. Continuation bytes are >= 0x80, so they are
364                    // never an escape or terminator and fall through here.
365                    out.push(self.b[self.i]);
366                    self.i += 1;
367                }
368            }
369        }
370    }
371
372    /// Decodes one escape sequence (cursor just past the backslash) into `out`.
373    fn decode_escape(&mut self, out: &mut Vec<u8>) -> Result<(), JsonError> {
374        let esc = self.peek().ok_or(JsonError::Invalid)?;
375        self.i += 1;
376        let ch = match esc {
377            b'"' => '"',
378            b'\\' => '\\',
379            b'/' => '/',
380            b'b' => '\u{0008}',
381            b'f' => '\u{000C}',
382            b'n' => '\n',
383            b'r' => '\r',
384            b't' => '\t',
385            b'u' => return self.decode_unicode_escape(out),
386            _ => return Err(JsonError::Invalid),
387        };
388        push_char(out, ch);
389        Ok(())
390    }
391
392    /// Decodes a `\u` escape (cursor just past the `u`), pairing surrogates.
393    fn decode_unicode_escape(&mut self, out: &mut Vec<u8>) -> Result<(), JsonError> {
394        let hi = self.hex4()?;
395        let code = if (0xD800..=0xDBFF).contains(&hi) {
396            // High surrogate: must be followed by `\u` + a low surrogate.
397            self.expect(b'\\')?;
398            self.expect(b'u')?;
399            let lo = self.hex4()?;
400            if !(0xDC00..=0xDFFF).contains(&lo) {
401                return Err(JsonError::Invalid);
402            }
403            0x1_0000 + ((u32::from(hi) - 0xD800) << 10) + (u32::from(lo) - 0xDC00)
404        } else if (0xDC00..=0xDFFF).contains(&hi) {
405            return Err(JsonError::Invalid); // lone low surrogate
406        } else {
407            u32::from(hi)
408        };
409        push_char(out, char::from_u32(code).ok_or(JsonError::Invalid)?);
410        Ok(())
411    }
412
413    /// Reads four hex digits as a `u16` (cursor at the first digit).
414    fn hex4(&mut self) -> Result<u16, JsonError> {
415        let mut v: u16 = 0;
416        for _ in 0..4 {
417            let d = self.hex_digit()?;
418            v = v * 16 + u16::from(d);
419        }
420        Ok(v)
421    }
422
423    /// Consumes one hex digit, returning its value.
424    fn hex_digit(&mut self) -> Result<u8, JsonError> {
425        let c = self.peek().ok_or(JsonError::Invalid)?;
426        let v = match c {
427            b'0'..=b'9' => c - b'0',
428            b'a'..=b'f' => c - b'a' + 10,
429            b'A'..=b'F' => c - b'A' + 10,
430            _ => return Err(JsonError::Invalid),
431        };
432        self.i += 1;
433        Ok(v)
434    }
435
436    /// Validates and skips a JSON number (cursor at `-` or a digit).
437    fn number(&mut self) -> Result<(), JsonError> {
438        if self.peek() == Some(b'-') {
439            self.i += 1;
440        }
441        match self.peek() {
442            Some(b'0') => self.i += 1,
443            Some(c) if c.is_ascii_digit() => self.digits(),
444            _ => return Err(JsonError::Invalid),
445        }
446        if self.peek() == Some(b'.') {
447            self.i += 1;
448            self.one_or_more_digits()?;
449        }
450        if matches!(self.peek(), Some(b'e' | b'E')) {
451            self.i += 1;
452            if matches!(self.peek(), Some(b'+' | b'-')) {
453                self.i += 1;
454            }
455            self.one_or_more_digits()?;
456        }
457        Ok(())
458    }
459
460    fn digits(&mut self) {
461        while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
462            self.i += 1;
463        }
464    }
465
466    fn one_or_more_digits(&mut self) -> Result<(), JsonError> {
467        if !matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
468            return Err(JsonError::Invalid);
469        }
470        self.digits();
471        Ok(())
472    }
473
474    /// Matches an exact literal (`true`/`false`/`null`) at the cursor.
475    fn literal(&mut self, lit: &[u8]) -> Result<(), JsonError> {
476        if self.b[self.i..].starts_with(lit) {
477            self.i += lit.len();
478            Ok(())
479        } else {
480            Err(JsonError::Invalid)
481        }
482    }
483}
484
/// Appends the UTF-8 encoding of `ch` to the end of `out`.
fn push_char(out: &mut Vec<u8>, ch: char) {
    // Grow the buffer by the encoded length, then encode straight into the
    // newly added tail.
    let start = out.len();
    out.resize(start + ch.len_utf8(), 0);
    ch.encode_utf8(&mut out[start..]);
}
490
491#[cfg(test)]
492#[path = "json_tests.rs"]
493mod tests;