osproxy_core/json.rs
1//! Byte-level JSON scanning for the no-materialization body path (ADR-014).
2//!
//! These routines read exactly what a tenancy transform needs — the set of
//! top-level field names (to detect a spoofed reserved field) and a scalar at a
//! path (to find the partition key or build an id) — by scanning the raw body
//! bytes, **without ever building a parsed JSON tree**. Retained memory is
//! bounded by the few small key strings (or the one extracted scalar), never by
//! document size (INV-MEM): every value the scan does not need is skipped
//! without allocating.
10//!
//! It lives in `core` because it is dependency-free pure computation that both
//! the SPI (partition extraction utilities) and the transform layer (id
//! construction, field-splice injection) build on; the two sides cannot share a
//! helper that lives in either of them.
15//!
//! The scanner is strict: it parses the JSON grammar fully so a malformed body
//! is rejected here rather than mis-located. Key strings are decoded before they
//! are compared, so a client cannot smuggle a reserved field name past a
//! collision check by escaping it (e.g. `"\u005ftenant"` for `_tenant`).
20//
21// JUSTIFY(file-length): one cohesive recursive-descent JSON scanner, the
22// `Parser` and its grammar productions (value/object/array/string/number/escape)
23// are a single unit that must agree on cursor invariants; splitting the
24// productions across files would scatter that shared state for no readability
25// gain. Tests live separately in `json_tests.rs`.
26
27use thiserror::Error;
28
29/// A failure scanning raw JSON bytes.
30///
31/// Deliberately exhaustive (not `#[non_exhaustive]`): it is a small, closed set
32/// of JSON-shape failures, and downstream `From` conversions must map every
33/// variant, a new one should be a compile error to handle, not silently fall
34/// through a wildcard.
35#[derive(Debug, Error, PartialEq, Eq)]
36pub enum JsonError {
37 /// The bytes were not valid JSON.
38 #[error("not valid JSON")]
39 Invalid,
40
41 /// The document was expected to be a JSON object but was not.
42 #[error("not a JSON object")]
43 NotAnObject,
44
45 /// A path does not resolve to a scalar value in the document.
46 #[error("path does not resolve to a scalar value")]
47 PathNotScalar {
48 /// The dotted path that failed to resolve.
49 path: String,
50 },
51}
52
/// Summary of a JSON object's top level, as produced by [`object_top_level`].
///
/// It carries what a caller needs to splice extra members into the object and
/// to check the existing member names for collisions.
#[derive(Debug)]
pub struct TopLevel {
    /// Byte offset immediately after the opening `{`; injected fields are
    /// spliced in here.
    pub insert_at: usize,
    /// `true` when the object is `{}` (no members), in which case a splice
    /// needs no trailing comma.
    pub empty: bool,
    /// Top-level key names in document order, with escape sequences already
    /// decoded; used for collision checks.
    pub keys: Vec<String>,
}
64
65/// Locates the top level of the JSON object in `body`, validating the whole
66/// document as it goes.
67///
68/// # Errors
69///
70/// [`JsonError::NotAnObject`] if `body` is valid JSON but not an object,
71/// [`JsonError::Invalid`] if it is not valid JSON.
72pub fn object_top_level(body: &[u8]) -> Result<TopLevel, JsonError> {
73 let mut p = Parser::new(body);
74 p.skip_ws();
75 if p.peek() != Some(b'{') {
76 // Not an object: distinguish malformed JSON from a non-object value so
77 // the caller can report the right error.
78 return Err(match validate(body) {
79 Ok(()) => JsonError::NotAnObject,
80 Err(e) => e,
81 });
82 }
83 let top = p.object_members()?;
84 p.skip_ws();
85 if p.peek().is_some() {
86 return Err(JsonError::Invalid);
87 }
88 Ok(top)
89}
90
91/// Follows `segments` into the object in `body` and returns the leaf scalar as a
92/// string: strings are decoded, numbers and bools use their source text.
93///
94/// # Errors
95///
96/// [`JsonError::PathNotScalar`] if a segment is missing or the leaf is an
97/// object, array, or null; [`JsonError::Invalid`] if `body` up to the leaf is
98/// not valid JSON.
99pub fn scalar_at_path<'a, I>(body: &[u8], segments: I) -> Result<String, JsonError>
100where
101 I: IntoIterator<Item = &'a str>,
102{
103 let mut p = Parser::new(body);
104 let mut walked: Vec<&str> = Vec::new();
105 for segment in segments {
106 walked.push(segment);
107 p.enter_field(segment)
108 .ok_or_else(|| JsonError::PathNotScalar {
109 path: walked.join("."),
110 })?;
111 }
112 p.skip_ws();
113 p.scalar_string().ok_or_else(|| JsonError::PathNotScalar {
114 path: walked.join("."),
115 })
116}
117
118/// Validates that `body` is a single well-formed JSON document (trailing
119/// whitespace allowed), allocating nothing.
120///
121/// # Errors
122///
123/// [`JsonError::Invalid`] if `body` is not valid JSON.
124pub fn validate(body: &[u8]) -> Result<(), JsonError> {
125 let mut p = Parser::new(body);
126 p.skip_value()?;
127 p.skip_ws();
128 if p.peek().is_some() {
129 return Err(JsonError::Invalid);
130 }
131 Ok(())
132}
133
/// Read cursor over a borrowed buffer of raw JSON bytes.
struct Parser<'a> {
    /// The input being scanned.
    b: &'a [u8],
    /// Index into `b` of the next unread byte.
    i: usize,
}
139
140impl<'a> Parser<'a> {
141 fn new(b: &'a [u8]) -> Self {
142 Self { b, i: 0 }
143 }
144
145 fn peek(&self) -> Option<u8> {
146 self.b.get(self.i).copied()
147 }
148
149 fn skip_ws(&mut self) {
150 while matches!(self.peek(), Some(b' ' | b'\t' | b'\n' | b'\r')) {
151 self.i += 1;
152 }
153 }
154
155 /// Parses the object at the cursor (which must be `{`), recording its top
156 /// level. Used for the document root by [`object_top_level`].
157 fn object_members(&mut self) -> Result<TopLevel, JsonError> {
158 debug_assert_eq!(self.peek(), Some(b'{'));
159 self.i += 1; // opening brace
160 let insert_at = self.i;
161 let mut keys = Vec::new();
162 self.skip_ws();
163 if self.peek() == Some(b'}') {
164 self.i += 1;
165 return Ok(TopLevel {
166 insert_at,
167 empty: true,
168 keys,
169 });
170 }
171 loop {
172 self.skip_ws();
173 keys.push(self.string_decode()?);
174 self.skip_ws();
175 self.expect(b':')?;
176 self.skip_value()?;
177 self.skip_ws();
178 match self.peek() {
179 Some(b',') => self.i += 1,
180 Some(b'}') => {
181 self.i += 1;
182 break;
183 }
184 _ => return Err(JsonError::Invalid),
185 }
186 }
187 Ok(TopLevel {
188 insert_at,
189 empty: false,
190 keys,
191 })
192 }
193
194 /// Positions the cursor at the value of object member `key`, returning
195 /// `Some(())` if found. On a miss (or a non-object), returns `None` and the
196 /// cursor position is unspecified.
197 fn enter_field(&mut self, key: &str) -> Option<()> {
198 self.skip_ws();
199 if self.peek() != Some(b'{') {
200 return None;
201 }
202 self.i += 1;
203 self.skip_ws();
204 if self.peek() == Some(b'}') {
205 return None;
206 }
207 loop {
208 self.skip_ws();
209 let k = self.string_decode().ok()?;
210 self.skip_ws();
211 self.expect(b':').ok()?;
212 self.skip_ws();
213 if k == key {
214 return Some(());
215 }
216 self.skip_value().ok()?;
217 self.skip_ws();
218 match self.peek() {
219 Some(b',') => self.i += 1,
220 _ => return None,
221 }
222 }
223 }
224
225 /// Reads the scalar at the cursor as a string: strings decoded, numbers and
226 /// bools as their source text. `None` for object/array/null/malformed.
227 fn scalar_string(&mut self) -> Option<String> {
228 match self.peek()? {
229 b'"' => self.string_decode().ok(),
230 b't' => self.literal(b"true").ok().map(|()| "true".to_owned()),
231 b'f' => self.literal(b"false").ok().map(|()| "false".to_owned()),
232 c if c == b'-' || c.is_ascii_digit() => {
233 let start = self.i;
234 self.number().ok()?;
235 std::str::from_utf8(&self.b[start..self.i])
236 .ok()
237 .map(str::to_owned)
238 }
239 _ => None,
240 }
241 }
242
243 fn expect(&mut self, byte: u8) -> Result<(), JsonError> {
244 if self.peek() == Some(byte) {
245 self.i += 1;
246 Ok(())
247 } else {
248 Err(JsonError::Invalid)
249 }
250 }
251
252 /// Skips one complete JSON value, allocating nothing.
253 fn skip_value(&mut self) -> Result<(), JsonError> {
254 self.skip_ws();
255 match self.peek().ok_or(JsonError::Invalid)? {
256 b'{' => self.skip_object(),
257 b'[' => self.skip_array(),
258 b'"' => self.skip_string(),
259 b't' => self.literal(b"true"),
260 b'f' => self.literal(b"false"),
261 b'n' => self.literal(b"null"),
262 c if c == b'-' || c.is_ascii_digit() => self.number(),
263 _ => Err(JsonError::Invalid),
264 }
265 }
266
267 fn skip_object(&mut self) -> Result<(), JsonError> {
268 self.i += 1; // '{'
269 self.skip_ws();
270 if self.peek() == Some(b'}') {
271 self.i += 1;
272 return Ok(());
273 }
274 loop {
275 self.skip_ws();
276 self.skip_string()?;
277 self.skip_ws();
278 self.expect(b':')?;
279 self.skip_value()?;
280 self.skip_ws();
281 match self.peek() {
282 Some(b',') => self.i += 1,
283 Some(b'}') => {
284 self.i += 1;
285 return Ok(());
286 }
287 _ => return Err(JsonError::Invalid),
288 }
289 }
290 }
291
292 fn skip_array(&mut self) -> Result<(), JsonError> {
293 self.i += 1; // '['
294 self.skip_ws();
295 if self.peek() == Some(b']') {
296 self.i += 1;
297 return Ok(());
298 }
299 loop {
300 self.skip_value()?;
301 self.skip_ws();
302 match self.peek() {
303 Some(b',') => self.i += 1,
304 Some(b']') => {
305 self.i += 1;
306 return Ok(());
307 }
308 _ => return Err(JsonError::Invalid),
309 }
310 }
311 }
312
313 /// Skips a string (cursor at the opening quote), handling escapes, no alloc.
314 fn skip_string(&mut self) -> Result<(), JsonError> {
315 self.expect(b'"')?;
316 loop {
317 match self.peek().ok_or(JsonError::Invalid)? {
318 b'"' => {
319 self.i += 1;
320 return Ok(());
321 }
322 b'\\' => {
323 self.i += 1;
324 // Consume the escaped char; `\u` carries four more hex digits.
325 let esc = self.peek().ok_or(JsonError::Invalid)?;
326 self.i += 1;
327 if esc == b'u' {
328 for _ in 0..4 {
329 self.hex_digit()?;
330 }
331 }
332 }
333 c if c < 0x20 => return Err(JsonError::Invalid),
334 _ => self.i += 1,
335 }
336 }
337 }
338
339 /// Decodes a string (cursor at the opening quote) into an owned `String`.
340 fn string_decode(&mut self) -> Result<String, JsonError> {
341 self.expect(b'"')?;
342 // Accumulate raw bytes, not `char`s: a literal multi-byte UTF-8 sequence
343 // must be copied verbatim. Decoding each byte as a `char` (the Latin-1
344 // `char::from(u8)` mapping) would re-encode every continuation byte as its
345 // own code point, e.g. "café" → "café", corrupting any non-ASCII
346 // partition key or id-template input. The validation at the close rejects a
347 // string that is not valid UTF-8 (JSON must be UTF-8), keeping the scanner
348 // strict rather than silently producing mojibake.
349 let mut out: Vec<u8> = Vec::new();
350 loop {
351 match self.peek().ok_or(JsonError::Invalid)? {
352 b'"' => {
353 self.i += 1;
354 return String::from_utf8(out).map_err(|_| JsonError::Invalid);
355 }
356 b'\\' => {
357 self.i += 1;
358 self.decode_escape(&mut out)?;
359 }
360 c if c < 0x20 => return Err(JsonError::Invalid),
361 _ => {
362 // A literal byte (ASCII, or one byte of a multi-byte sequence):
363 // copy it verbatim. Continuation bytes are >= 0x80, so they are
364 // never an escape or terminator and fall through here.
365 out.push(self.b[self.i]);
366 self.i += 1;
367 }
368 }
369 }
370 }
371
372 /// Decodes one escape sequence (cursor just past the backslash) into `out`.
373 fn decode_escape(&mut self, out: &mut Vec<u8>) -> Result<(), JsonError> {
374 let esc = self.peek().ok_or(JsonError::Invalid)?;
375 self.i += 1;
376 let ch = match esc {
377 b'"' => '"',
378 b'\\' => '\\',
379 b'/' => '/',
380 b'b' => '\u{0008}',
381 b'f' => '\u{000C}',
382 b'n' => '\n',
383 b'r' => '\r',
384 b't' => '\t',
385 b'u' => return self.decode_unicode_escape(out),
386 _ => return Err(JsonError::Invalid),
387 };
388 push_char(out, ch);
389 Ok(())
390 }
391
392 /// Decodes a `\u` escape (cursor just past the `u`), pairing surrogates.
393 fn decode_unicode_escape(&mut self, out: &mut Vec<u8>) -> Result<(), JsonError> {
394 let hi = self.hex4()?;
395 let code = if (0xD800..=0xDBFF).contains(&hi) {
396 // High surrogate: must be followed by `\u` + a low surrogate.
397 self.expect(b'\\')?;
398 self.expect(b'u')?;
399 let lo = self.hex4()?;
400 if !(0xDC00..=0xDFFF).contains(&lo) {
401 return Err(JsonError::Invalid);
402 }
403 0x1_0000 + ((u32::from(hi) - 0xD800) << 10) + (u32::from(lo) - 0xDC00)
404 } else if (0xDC00..=0xDFFF).contains(&hi) {
405 return Err(JsonError::Invalid); // lone low surrogate
406 } else {
407 u32::from(hi)
408 };
409 push_char(out, char::from_u32(code).ok_or(JsonError::Invalid)?);
410 Ok(())
411 }
412
413 /// Reads four hex digits as a `u16` (cursor at the first digit).
414 fn hex4(&mut self) -> Result<u16, JsonError> {
415 let mut v: u16 = 0;
416 for _ in 0..4 {
417 let d = self.hex_digit()?;
418 v = v * 16 + u16::from(d);
419 }
420 Ok(v)
421 }
422
423 /// Consumes one hex digit, returning its value.
424 fn hex_digit(&mut self) -> Result<u8, JsonError> {
425 let c = self.peek().ok_or(JsonError::Invalid)?;
426 let v = match c {
427 b'0'..=b'9' => c - b'0',
428 b'a'..=b'f' => c - b'a' + 10,
429 b'A'..=b'F' => c - b'A' + 10,
430 _ => return Err(JsonError::Invalid),
431 };
432 self.i += 1;
433 Ok(v)
434 }
435
436 /// Validates and skips a JSON number (cursor at `-` or a digit).
437 fn number(&mut self) -> Result<(), JsonError> {
438 if self.peek() == Some(b'-') {
439 self.i += 1;
440 }
441 match self.peek() {
442 Some(b'0') => self.i += 1,
443 Some(c) if c.is_ascii_digit() => self.digits(),
444 _ => return Err(JsonError::Invalid),
445 }
446 if self.peek() == Some(b'.') {
447 self.i += 1;
448 self.one_or_more_digits()?;
449 }
450 if matches!(self.peek(), Some(b'e' | b'E')) {
451 self.i += 1;
452 if matches!(self.peek(), Some(b'+' | b'-')) {
453 self.i += 1;
454 }
455 self.one_or_more_digits()?;
456 }
457 Ok(())
458 }
459
460 fn digits(&mut self) {
461 while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
462 self.i += 1;
463 }
464 }
465
466 fn one_or_more_digits(&mut self) -> Result<(), JsonError> {
467 if !matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
468 return Err(JsonError::Invalid);
469 }
470 self.digits();
471 Ok(())
472 }
473
474 /// Matches an exact literal (`true`/`false`/`null`) at the cursor.
475 fn literal(&mut self, lit: &[u8]) -> Result<(), JsonError> {
476 if self.b[self.i..].starts_with(lit) {
477 self.i += lit.len();
478 Ok(())
479 } else {
480 Err(JsonError::Invalid)
481 }
482 }
483}
484
/// Appends the UTF-8 encoding of `ch` (one to four bytes) to `out`.
fn push_char(out: &mut Vec<u8>, ch: char) {
    // Encode into a stack buffer and copy only the bytes actually used.
    let mut scratch = [0u8; 4];
    let used = ch.encode_utf8(&mut scratch).len();
    out.extend_from_slice(&scratch[..used]);
}
490
491#[cfg(test)]
492#[path = "json_tests.rs"]
493mod tests;