Skip to main content

vr_jcs/
strict_parse.rs

1//! Strict admission parser for untrusted JSON input.
2//!
3//! All public functions in this module enforce the same RFC 8785 / I-JSON
4//! invariants used by the strict canonical-emit pipeline:
5//!
6//! - **Duplicate property names** are rejected at parse time. Object
7//!   members are tracked in a [`BTreeSet`] (deterministic) so the
8//!   rejection error path itself is order-stable.
//! - **Forbidden Unicode noncharacters** in strings and property names
//!   are rejected. Specifically: the range `U+FDD0..=U+FDEF`, plus any
//!   code point whose bottom 16 bits are `0xFFFE` or `0xFFFF` (`U+xFFFE`
//!   or `U+xFFFF`).
12//! - **Nesting depth** is capped at [`crate::MAX_NESTING_DEPTH`]. The
13//!   limit is enforced via a sentinel-encoded serde error that
14//!   [`parse_json_value_no_duplicates`] unwraps back into
15//!   [`JcsError::NestingDepthExceeded`].
16//!
17//! Sibling crate `vertrule-schemas` consumes [`deserialize_json_value_no_duplicates`],
18//! [`validate_string_contents`], and [`is_safe_integer`] for its own
19//! schema-validation pipeline.
20//!
21//! # The `'$'`-prefix exception
22//!
23//! `serde_json` with `arbitrary_precision` enabled uses internal
24//! sentinel keys like `"$serde_json::private::Number"` during number
25//! deserialization. Those sentinels would otherwise look like ordinary
26//! property names to this visitor. We bypass [`validate_string_contents`]
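//! for any key starting with `'$'` so the sentinel survives. This
//! intentionally over-matches — a user key like `"$ref"` containing a
//! noncharacter would not be validated. This was judged acceptable on
//! the grounds that forbidden noncharacters in `'$'`-prefixed keys are
//! vanishingly unlikely in practice.
//!
//! NOTE(review): the sentinel key quoted above is pure ASCII, while every
//! forbidden noncharacter is at or above `U+FDD0`, so
//! [`validate_string_contents`] would accept the sentinel unchanged.
//! Confirm whether the bypass is still required; for untrusted input it
//! leaves `'$'`-prefixed keys unchecked.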
32
33use std::collections::BTreeSet;
34
35use serde::de::{self, DeserializeSeed, Error as DeError, MapAccess, SeqAccess, Visitor};
36use serde::Deserializer;
37use serde_json::{Number, Value};
38
39use crate::error::JcsError;
40use crate::MAX_NESTING_DEPTH;
41
/// Largest integer magnitude in the I-JSON safe range: `2^53 - 1`
/// (`9_007_199_254_740_991`).
pub const MAX_SAFE_INTEGER: i64 = (1_i64 << 53) - 1;

/// Message prefix that [`NoDuplicateValueSeed`] puts at the start of its
/// serde error when the nesting limit is breached.
/// [`parse_json_value_no_duplicates`] looks for this prefix and converts
/// a matching error into [`JcsError::NestingDepthExceeded`].
const DEPTH_EXCEEDED_SENTINEL: &str = "nesting depth exceeded maximum of ";
50
51/// Parse untrusted JSON bytes, rejecting duplicate property names and
52/// I-JSON-forbidden code points, enforcing [`MAX_NESTING_DEPTH`].
53///
54/// # Errors
55///
56/// - [`JcsError::Json`] for malformed JSON or duplicate property names.
57/// - [`JcsError::InvalidString`] for forbidden noncharacters.
58/// - [`JcsError::NestingDepthExceeded`] for depth limit breach.
59pub fn parse_json_value_no_duplicates(json: &[u8]) -> Result<Value, JcsError> {
60    let mut deserializer = serde_json::Deserializer::from_slice(json);
61    // Disable serde_json's built-in recursion limit — we enforce
62    // MAX_NESTING_DEPTH via NoDuplicateValueSeed instead.
63    deserializer.disable_recursion_limit();
64    let value = deserialize_json_value_no_duplicates(&mut deserializer).map_err(|e| {
65        if e.to_string().starts_with(DEPTH_EXCEEDED_SENTINEL) {
66            JcsError::NestingDepthExceeded
67        } else {
68            JcsError::Json(e)
69        }
70    })?;
71    deserializer.end()?;
72    Ok(value)
73}
74
75/// Deserialize a JSON value while rejecting duplicate property names.
76///
77/// Used by `vertrule-schemas` for ingestion validation.
78///
79/// # Errors
80///
81/// Returns an error if the input contains duplicate property names,
82/// forbidden noncharacters, or is otherwise invalid JSON.
83pub fn deserialize_json_value_no_duplicates<'de, D>(deserializer: D) -> Result<Value, D::Error>
84where
85    D: Deserializer<'de>,
86{
87    NoDuplicateValueSeed { depth: 0 }.deserialize(deserializer)
88}
89
90/// Validate that a string contains no I-JSON forbidden noncharacters.
91///
92/// # Errors
93///
94/// Returns a description of the violation if the string contains a
95/// forbidden Unicode noncharacter (U+FDD0..U+FDEF, U+xFFFE, U+xFFFF).
96pub fn validate_string_contents(value: &str, context: &str) -> Result<(), String> {
97    if let Some(ch) = value.chars().find(|&ch| is_noncharacter(ch)) {
98        return Err(format!(
99            "{context} contains the forbidden noncharacter U+{:04X}",
100            ch as u32
101        ));
102    }
103    Ok(())
104}
105
106/// Check if an integer is in the I-JSON safe integer range
107/// `[-2^53+1, 2^53-1]`.
108#[must_use]
109pub const fn is_safe_integer(value: i64) -> bool {
110    value >= -MAX_SAFE_INTEGER && value <= MAX_SAFE_INTEGER
111}
112
/// Report whether `ch` is one of the code points this module forbids:
/// anything in `U+FDD0..=U+FDEF`, or any code point whose low 16 bits are
/// `0xFFFE` or `0xFFFF`.
const fn is_noncharacter(ch: char) -> bool {
    let code = ch as u32;
    // Only the low 16 bits are inspected for the FFFE/FFFF rule, so the
    // check applies regardless of the higher bits of the code point.
    matches!(code, 0xFDD0..=0xFDEF) || matches!(code & 0xFFFF, 0xFFFE | 0xFFFF)
}
117
118struct NoDuplicateValueSeed {
119    depth: usize,
120}
121
122impl<'de> DeserializeSeed<'de> for NoDuplicateValueSeed {
123    type Value = Value;
124
125    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
126    where
127        D: Deserializer<'de>,
128    {
129        if self.depth > MAX_NESTING_DEPTH {
130            return Err(D::Error::custom(format!(
131                "{DEPTH_EXCEEDED_SENTINEL}{MAX_NESTING_DEPTH}"
132            )));
133        }
134        deserializer.deserialize_any(NoDuplicateValueVisitor { depth: self.depth })
135    }
136}
137
138struct NoDuplicateValueVisitor {
139    depth: usize,
140}
141
142impl<'de> Visitor<'de> for NoDuplicateValueVisitor {
143    type Value = Value;
144
145    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
146        formatter.write_str("a valid JSON value")
147    }
148
149    fn visit_bool<E>(self, value: bool) -> Result<Self::Value, E> {
150        Ok(Value::Bool(value))
151    }
152
153    fn visit_i64<E>(self, value: i64) -> Result<Self::Value, E> {
154        Ok(Value::Number(Number::from(value)))
155    }
156
157    fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E> {
158        Ok(Value::Number(Number::from(value)))
159    }
160
161    fn visit_f64<E>(self, value: f64) -> Result<Self::Value, E>
162    where
163        E: de::Error,
164    {
165        Number::from_f64(value)
166            .map(Value::Number)
167            .ok_or_else(|| E::custom("encountered a non-finite floating-point number"))
168    }
169
170    fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
171    where
172        E: de::Error,
173    {
174        validate_string_contents(value, "string value").map_err(E::custom)?;
175        Ok(Value::String(value.to_owned()))
176    }
177
178    fn visit_borrowed_str<E>(self, value: &'de str) -> Result<Self::Value, E>
179    where
180        E: de::Error,
181    {
182        self.visit_str(value)
183    }
184
185    fn visit_string<E>(self, value: String) -> Result<Self::Value, E>
186    where
187        E: de::Error,
188    {
189        validate_string_contents(&value, "string value").map_err(E::custom)?;
190        Ok(Value::String(value))
191    }
192
193    fn visit_none<E>(self) -> Result<Self::Value, E> {
194        Ok(Value::Null)
195    }
196
197    fn visit_unit<E>(self) -> Result<Self::Value, E> {
198        Ok(Value::Null)
199    }
200
201    fn visit_seq<A>(self, mut access: A) -> Result<Self::Value, A::Error>
202    where
203        A: SeqAccess<'de>,
204    {
205        let mut values = Vec::with_capacity(access.size_hint().unwrap_or(0));
206        while let Some(value) = access.next_element_seed(NoDuplicateValueSeed {
207            depth: self.depth + 1,
208        })? {
209            values.push(value);
210        }
211        Ok(Value::Array(values))
212    }
213
214    fn visit_map<A>(self, mut access: A) -> Result<Self::Value, A::Error>
215    where
216        A: MapAccess<'de>,
217    {
218        let Some(first_key) = access.next_key::<String>()? else {
219            return Ok(Value::Object(serde_json::Map::new()));
220        };
221
222        // See module-level docs: '$'-prefix bypass for serde_json
223        // arbitrary_precision sentinels.
224        if !first_key.starts_with('$') {
225            validate_string_contents(&first_key, "object property name")
226                .map_err(A::Error::custom)?;
227        }
228
229        let first_value = access.next_value_seed(NoDuplicateValueSeed {
230            depth: self.depth + 1,
231        })?;
232
233        let mut object = serde_json::Map::new();
234        object.insert(first_key.clone(), first_value);
235
236        let mut seen = BTreeSet::new();
237        seen.insert(first_key);
238
239        while let Some(key) = access.next_key::<String>()? {
240            // Same '$'-prefix bypass as above; see module-level docs.
241            if !key.starts_with('$') {
242                validate_string_contents(&key, "object property name").map_err(A::Error::custom)?;
243            }
244
245            if !seen.insert(key.clone()) {
246                return Err(A::Error::custom(format!("duplicate property name `{key}`")));
247            }
248
249            let value = access.next_value_seed(NoDuplicateValueSeed {
250                depth: self.depth + 1,
251            })?;
252            object.insert(key, value);
253        }
254
255        // If the map is a serde_json internal number representation,
256        // serde_json::from_value will reconstruct the proper Number.
257        // For real JSON objects, this is a no-op identity conversion.
258        serde_json::from_value(Value::Object(object)).map_err(A::Error::custom)
259    }
260}