Skip to main content

hedl_json/from_json/
config.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Configuration and error types for JSON to HEDL conversion
19
20use hedl_core::Value;
21
/// Default cap on nesting depth while converting JSON
///
/// 10,000 levels is far deeper than JSON documents normally go. The cap
/// exists so that maliciously or accidentally deep input is rejected
/// rather than recursed into without bound.
pub const DEFAULT_MAX_DEPTH: usize = 10_000;

/// Default cap on the number of elements in a single JSON array
///
/// 10,000,000 elements leaves room for the large arrays that are typical
/// of data-science and ML datasets.
pub const DEFAULT_MAX_ARRAY_SIZE: usize = 10_000_000;

/// Default cap on the length of a single JSON string
///
/// 100 MiB (104,857,600): large enough for base64-encoded binary payloads,
/// long text fields, and embedded documents.
pub const DEFAULT_MAX_STRING_LENGTH: usize = 100 * 1024 * 1024;

/// Default cap on the number of keys in a single JSON object
///
/// 100,000 keys accommodates property-heavy configuration files and
/// metadata-rich documents.
pub const DEFAULT_MAX_OBJECT_SIZE: usize = 100_000;
46
/// How unpaired UTF-16 surrogates found in JSON input are treated
///
/// A JSON `\uXXXX` escape carries one UTF-16 code unit. Any character
/// beyond the Basic Multilingual Plane (U+10000 and up, which includes
/// emoji) therefore needs two escapes in a row: a high surrogate
/// (0xD800-0xDBFF) immediately followed by a low surrogate (0xDC00-0xDFFF).
///
/// A surrogate without its partner is not valid Unicode, yet real-world
/// producers (e.g., JavaScript code that truncates strings, legacy
/// databases) do emit them.
///
/// # Example
///
/// ```text
/// use hedl_json::{FromJsonConfig, SurrogatePolicy};
///
/// // Default: reject unpaired surrogates
/// let strict = FromJsonConfig::default();
///
/// // Replace unpaired surrogates with U+FFFD
/// let lenient = FromJsonConfig::builder()
///     .surrogate_policy(SurrogatePolicy::ReplaceWithFFFD)
///     .build();
///
/// // Skip (remove) unpaired surrogates entirely
/// let skip = FromJsonConfig::builder()
///     .surrogate_policy(SurrogatePolicy::Skip)
///     .build();
/// ```
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SurrogatePolicy {
    /// Fail with an error on any unpaired surrogate (the default)
    ///
    /// The strictest choice: everything that gets through is valid
    /// Unicode. Pick this when data integrity matters.
    #[default]
    Reject,

    /// Substitute U+FFFD (the replacement character) for each unpaired surrogate
    ///
    /// Lets JSON with invalid Unicode be processed while keeping the
    /// string's structure; the replacement character marks where data
    /// was lost.
    ReplaceWithFFFD,

    /// Drop unpaired surrogates without leaving a trace
    ///
    /// Use with caution: string content changes with no indication that
    /// it did. Appropriate when the surrogates are known to be noise or
    /// artifacts.
    Skip,
}
97
98/// Errors that can occur during JSON to HEDL conversion
99#[derive(Debug, Clone, thiserror::Error)]
100pub enum JsonConversionError {
101    /// JSON parsing failed
102    #[error("JSON parse error: {0}")]
103    ParseError(String),
104
105    /// Root value must be an object
106    #[error("Root must be a JSON object, found {0}")]
107    InvalidRoot(String),
108
109    /// Invalid number value
110    #[error("Invalid number: {0}")]
111    InvalidNumber(String),
112
113    /// Invalid expression syntax
114    #[error("Invalid expression: {0}")]
115    InvalidExpression(String),
116
117    /// Invalid tensor element
118    #[error("Invalid tensor element - must be number or array")]
119    InvalidTensor,
120
121    /// Nested objects not allowed in scalar context
122    #[error("Nested objects not allowed in scalar context")]
123    NestedObject,
124
125    /// Reference parsing failed
126    #[error("Invalid reference: {0}")]
127    InvalidReference(String),
128
129    /// Invalid Unicode encoding
130    ///
131    /// This error occurs when JSON contains invalid Unicode sequences, such as:
132    /// - Unpaired UTF-16 surrogates (`\uD83D` without its low surrogate pair)
133    /// - Invalid surrogate pairs (low surrogate before high surrogate)
134    /// - Unescaped control characters in strings
135    ///
136    /// # UTF-16 Surrogate Background
137    ///
138    /// JSON's `\uXXXX` escapes use UTF-16 encoding. Characters outside the
139    /// Basic Multilingual Plane (U+10000 and above, including emoji) require
140    /// surrogate pairs: a high surrogate (0xD800-0xDBFF) followed by a low
141    /// surrogate (0xDC00-0xDFFF).
142    ///
143    /// # Solutions
144    ///
145    /// 1. **Use the `SurrogatePolicy::ReplaceWithFFFD` option**:
146    ///    Replace invalid surrogates with the Unicode replacement character.
147    ///
148    /// 2. **Preprocess the JSON** to fix or remove invalid sequences.
149    ///
150    /// 3. **Ensure the source system** produces valid UTF-8/UTF-16 pairs.
151    #[error("Invalid Unicode: {0}")]
152    InvalidUnicode(String),
153
154    /// Maximum recursion depth exceeded
155    #[error("Maximum recursion depth ({0}) exceeded - possible deeply nested structure")]
156    MaxDepthExceeded(usize),
157
158    /// Maximum array size exceeded
159    #[error("Maximum array size ({0}) exceeded - array has {1} elements")]
160    MaxArraySizeExceeded(usize, usize),
161
162    /// Maximum string length exceeded
163    #[error("Maximum string length ({0}) exceeded - string has {1} characters")]
164    MaxStringLengthExceeded(usize, usize),
165
166    /// Maximum object size exceeded
167    #[error("Maximum object size ({0}) exceeded - object has {1} keys")]
168    MaxObjectSizeExceeded(usize, usize),
169
170    /// Integer value outside i64 range
171    ///
172    /// JSON supports arbitrary-precision numbers, but HEDL's `Value::Int`
173    /// uses `i64` which has a fixed range: -9,223,372,036,854,775,808 to
174    /// 9,223,372,036,854,775,807.
175    ///
176    /// # Common Causes
177    ///
178    /// - Twitter/Snowflake IDs (often exceed `i64::MAX`)
179    /// - Unsigned 64-bit integers from other systems
180    /// - Large database auto-increment IDs
181    /// - Timestamps in nanoseconds beyond year 2262
182    ///
183    /// # Solutions
184    ///
185    /// 1. **Use strings for large IDs** (recommended):
186    ///    ```json
187    ///    {"tweet_id": "18446744073709551615"}
188    ///    ```
189    ///
190    /// 2. **Use hex encoding**:
191    ///    ```json
192    ///    {"large_number": "0xFFFFFFFFFFFFFFFF"}
193    ///    ```
194    ///
195    /// 3. **Split into high/low parts**:
196    ///    ```json
197    ///    {"value_high": 1844674407, "value_low": 3709551615}
198    ///    ```
199    ///
200    /// # Examples
201    ///
202    /// ```
203    /// use hedl_json::{from_json, FromJsonConfig};
204    ///
205    /// let json = r#"{"id": 18446744073709551615}"#;
206    /// let result = from_json(json, &FromJsonConfig::default());
207    ///
208    /// assert!(result.is_err());
209    /// assert!(result.unwrap_err().to_string().contains("Integer overflow"));
210    /// ```
211    #[error(
212        "Integer overflow: {value} exceeds i64 range [{min}..{max}]. \
213         Consider using a string for large IDs or timestamps."
214    )]
215    IntegerOverflow {
216        /// String representation of the overflowing value.
217        value: String,
218        /// Maximum valid i64 value.
219        max: i64,
220        /// Minimum valid i64 value.
221        min: i64,
222    },
223}
224
225impl From<serde_json::Error> for JsonConversionError {
226    fn from(err: serde_json::Error) -> Self {
227        let msg = err.to_string();
228
229        // Detect surrogate-related errors from serde_json
230        if msg.contains("lone surrogate")
231            || msg.contains("surrogate")
232            || msg.contains("invalid unicode")
233        {
234            JsonConversionError::InvalidUnicode(format!(
235                "Invalid UTF-16 surrogate sequence: {msg}. \
236                 JSON contains unpaired surrogates which cannot be represented \
237                 in Rust UTF-8 strings. Configure SurrogatePolicy::ReplaceWithFFFD \
238                 to replace with the Unicode replacement character (U+FFFD)."
239            ))
240        } else if msg.contains("control character") {
241            JsonConversionError::InvalidUnicode(format!(
242                "Unescaped control character in JSON string: {msg}. \
243                 Control characters (U+0000-U+001F) must be escaped as \\uXXXX \
244                 per RFC 8259."
245            ))
246        } else {
247            JsonConversionError::ParseError(msg)
248        }
249    }
250}
251
252/// Check if a `serde_json::Number` represents an integer outside i64 range
253///
254/// Returns `true` if the number is an integer (not a float) but cannot
255/// fit in i64 range. This happens when:
256/// - The value is larger than `i64::MAX` (9,223,372,036,854,775,807)
257/// - The value is smaller than `i64::MIN` (-9,223,372,036,854,775,808)
258///
259/// # Implementation Note
260///
261/// `serde_json::Number::as_i64()` returns `None` for both:
262/// 1. Numbers outside i64 range (overflow)
263/// 2. Floating point numbers
264///
265/// We use `as_u64()` to detect case 1: if `as_i64()` fails but `as_u64()`
266/// succeeds, the number is an unsigned integer too large for i64.
267/// We also check `is_i64()` to catch negative overflow cases.
268#[inline]
269pub(super) fn is_integer_overflow(n: &serde_json::Number) -> bool {
270    // If as_i64() fails but as_u64() succeeds, it's an unsigned int overflow
271    // Or if is_i64() is true but as_i64() is None, it's a signed int overflow
272    n.as_i64().is_none() && (n.as_u64().is_some() || n.is_i64())
273}
274
275/// Convert JSON number to HEDL Value with overflow detection
276///
277/// This function enforces strict integer validation to prevent silent
278/// precision loss from i64 overflow converting to f64.
279///
280/// # Behavior
281///
282/// 1. **i64 range integers**: Convert to `Value::Int(i64)`
283/// 2. **Overflow integers**: Return `IntegerOverflow` error
284/// 3. **Floating point**: Convert to `Value::Float(f64)`
285///
286/// # Implementation Details
287///
288/// - Valid i64 values are converted to `Value::Int`
289/// - Integer values outside i64 range trigger `IntegerOverflow` error
290/// - Floating point values are converted to `Value::Float`
291/// - Uses fast-path optimization for common i64 case
292#[inline]
293pub(super) fn json_number_to_value(n: &serde_json::Number) -> Result<Value, JsonConversionError> {
294    // Try i64 first (most common case - fast path)
295    if let Some(i) = n.as_i64() {
296        return Ok(Value::Int(i));
297    }
298
299    // Check for integer overflow
300    if is_integer_overflow(n) {
301        return Err(JsonConversionError::IntegerOverflow {
302            value: n.to_string(),
303            max: i64::MAX,
304            min: i64::MIN,
305        });
306    }
307
308    // Must be a float
309    if let Some(f) = n.as_f64() {
310        Ok(Value::Float(f))
311    } else {
312        // Should never happen with valid JSON
313        Err(JsonConversionError::InvalidNumber(n.to_string()))
314    }
315}
316
317/// Configuration for JSON import
318///
319/// Controls how JSON is converted to HEDL, including security limits
320/// to prevent denial-of-service attacks from malicious inputs.
321///
322/// # High Default Limits
323///
324/// The default limits are set intentionally high to handle large-scale
325/// data processing scenarios common in ML/AI applications:
326///
327/// - **10,000 depth**: Deep nesting in complex hierarchical data
328/// - **10,000,000 array size**: Large datasets and batches
329/// - **100 MB string length**: Base64-encoded binary data, embeddings
330/// - **100,000 object size**: Rich metadata and configuration objects
331///
332/// These defaults prioritize functionality over restrictiveness. For
333/// untrusted input, consider using the builder pattern with custom limits.
334///
335/// # Examples
336///
337/// ```text
338/// use hedl_json::FromJsonConfig;
339///
340/// // Default configuration with high limits for ML/data workloads
341/// let config = FromJsonConfig::default();
342///
343/// // Custom configuration using builder pattern
344/// let custom_config = FromJsonConfig::builder()
345///     .max_depth(1_000)
346///     .max_array_size(100_000)
347///     .max_string_length(10 * 1024 * 1024) // 10 MB
348///     .build();
349///
350/// // Strict configuration for untrusted input
351/// let strict_config = FromJsonConfig::builder()
352///     .max_depth(50)
353///     .max_array_size(10_000)
354///     .max_string_length(1_000_000)
355///     .max_object_size(1_000)
356///     .build();
357///
358/// // Unlimited configuration (use with caution)
359/// let unlimited_config = FromJsonConfig::builder()
360///     .unlimited()
361///     .build();
362/// ```
363#[derive(Debug, Clone)]
364pub struct FromJsonConfig {
365    /// Default type name for arrays without metadata
366    pub default_type_name: String,
367
368    /// HEDL version to use
369    pub version: (u32, u32),
370
371    /// Maximum recursion depth (default: 10,000)
372    ///
373    /// Prevents stack overflow from deeply nested JSON structures.
374    /// Set to `None` to disable (not recommended for untrusted input).
375    pub max_depth: Option<usize>,
376
377    /// Maximum array size (default: 10,000,000)
378    ///
379    /// Prevents memory exhaustion from extremely large arrays.
380    /// JSON arrays can contain large datasets, batches, or embeddings.
381    /// Set to `None` to disable (not recommended for untrusted input).
382    pub max_array_size: Option<usize>,
383
384    /// Maximum string length (default: 100 MB)
385    ///
386    /// Prevents memory exhaustion from extremely large strings.
387    /// JSON strings often contain base64-encoded binary data, large
388    /// text fields, or embedded documents requiring high limits.
389    /// Set to `None` to disable (not recommended for untrusted input).
390    pub max_string_length: Option<usize>,
391
392    /// Maximum object size (default: 100,000)
393    ///
394    /// Prevents memory exhaustion from objects with many keys.
395    /// Configuration files and metadata-rich objects can have many properties.
396    /// Set to `None` to disable (not recommended for untrusted input).
397    pub max_object_size: Option<usize>,
398
399    /// Policy for handling unpaired UTF-16 surrogates
400    ///
401    /// Some systems emit JSON with unpaired surrogates (e.g., truncated
402    /// JavaScript strings). This setting controls how to handle them.
403    ///
404    /// Default: `SurrogatePolicy::Reject` (strict validation)
405    pub surrogate_policy: SurrogatePolicy,
406
407    /// Enable lenient JSON parsing (JSON5-style trailing commas and comments)
408    ///
409    /// When enabled, the parser accepts:
410    /// - Trailing commas in arrays and objects
411    /// - Single-line (//) and multi-line (/* */) comments
412    ///
413    /// Requires the `lenient` feature flag.
414    ///
415    /// Default: false (strict RFC 8259 JSON)
416    #[cfg(feature = "lenient")]
417    pub lenient: bool,
418}
419
420impl Default for FromJsonConfig {
421    fn default() -> Self {
422        Self {
423            default_type_name: "Item".to_string(),
424            version: (2, 0),
425            max_depth: Some(DEFAULT_MAX_DEPTH),
426            max_array_size: Some(DEFAULT_MAX_ARRAY_SIZE),
427            max_string_length: Some(DEFAULT_MAX_STRING_LENGTH),
428            max_object_size: Some(DEFAULT_MAX_OBJECT_SIZE),
429            surrogate_policy: SurrogatePolicy::default(),
430            #[cfg(feature = "lenient")]
431            lenient: false,
432        }
433    }
434}
435
436impl FromJsonConfig {
437    /// Create a new builder for configuring JSON import
438    ///
439    /// # Examples
440    ///
441    /// ```text
442    /// use hedl_json::FromJsonConfig;
443    ///
444    /// let config = FromJsonConfig::builder()
445    ///     .max_depth(1_000)
446    ///     .max_array_size(100_000)
447    ///     .build();
448    /// ```
449    #[must_use]
450    pub fn builder() -> FromJsonConfigBuilder {
451        FromJsonConfigBuilder::default()
452    }
453}
454
455impl hedl_core::convert::ImportConfig for FromJsonConfig {
456    fn default_type_name(&self) -> &str {
457        &self.default_type_name
458    }
459
460    fn version(&self) -> (u32, u32) {
461        self.version
462    }
463}
464
465/// Builder for `FromJsonConfig`
466///
467/// Provides ergonomic configuration of JSON import limits and behavior.
468///
469/// # Examples
470///
471/// ```text
472/// use hedl_json::FromJsonConfig;
473///
474/// // Custom limits
475/// let config = FromJsonConfig::builder()
476///     .max_depth(1_000)
477///     .max_array_size(100_000)
478///     .max_string_length(10 * 1024 * 1024)
479///     .build();
480///
481/// // Strict limits for untrusted input
482/// let strict = FromJsonConfig::builder()
483///     .max_depth(50)
484///     .max_array_size(10_000)
485///     .max_string_length(1_000_000)
486///     .max_object_size(1_000)
487///     .build();
488///
489/// // Unlimited (use with caution!)
490/// let unlimited = FromJsonConfig::builder()
491///     .unlimited()
492///     .build();
493/// ```
494#[derive(Debug, Clone)]
495pub struct FromJsonConfigBuilder {
496    default_type_name: String,
497    version: (u32, u32),
498    max_depth: Option<usize>,
499    max_array_size: Option<usize>,
500    max_string_length: Option<usize>,
501    max_object_size: Option<usize>,
502    surrogate_policy: SurrogatePolicy,
503    #[cfg(feature = "lenient")]
504    lenient: bool,
505}
506
507impl Default for FromJsonConfigBuilder {
508    fn default() -> Self {
509        Self {
510            default_type_name: "Item".to_string(),
511            version: (2, 0),
512            max_depth: Some(DEFAULT_MAX_DEPTH),
513            max_array_size: Some(DEFAULT_MAX_ARRAY_SIZE),
514            max_string_length: Some(DEFAULT_MAX_STRING_LENGTH),
515            max_object_size: Some(DEFAULT_MAX_OBJECT_SIZE),
516            surrogate_policy: SurrogatePolicy::default(),
517            #[cfg(feature = "lenient")]
518            lenient: false,
519        }
520    }
521}
522
523impl FromJsonConfigBuilder {
524    /// Set the default type name for arrays without metadata
525    pub fn default_type_name(mut self, name: impl Into<String>) -> Self {
526        self.default_type_name = name.into();
527        self
528    }
529
530    /// Set the HEDL version to use
531    #[must_use]
532    pub fn version(mut self, major: u32, minor: u32) -> Self {
533        self.version = (major, minor);
534        self
535    }
536
537    /// Set the maximum recursion depth
538    ///
539    /// Use `None` to disable the limit (not recommended for untrusted input).
540    #[must_use]
541    pub fn max_depth(mut self, limit: usize) -> Self {
542        self.max_depth = Some(limit);
543        self
544    }
545
546    /// Set the maximum array size
547    ///
548    /// Use `None` to disable the limit (not recommended for untrusted input).
549    #[must_use]
550    pub fn max_array_size(mut self, limit: usize) -> Self {
551        self.max_array_size = Some(limit);
552        self
553    }
554
555    /// Set the maximum string length in bytes
556    ///
557    /// Use `None` to disable the limit (not recommended for untrusted input).
558    #[must_use]
559    pub fn max_string_length(mut self, limit: usize) -> Self {
560        self.max_string_length = Some(limit);
561        self
562    }
563
564    /// Set the maximum object size (number of keys)
565    ///
566    /// Use `None` to disable the limit (not recommended for untrusted input).
567    #[must_use]
568    pub fn max_object_size(mut self, limit: usize) -> Self {
569        self.max_object_size = Some(limit);
570        self
571    }
572
573    /// Set the policy for handling unpaired UTF-16 surrogates
574    ///
575    /// # Options
576    ///
577    /// - `SurrogatePolicy::Reject` (default): Error on invalid surrogates
578    /// - `SurrogatePolicy::ReplaceWithFFFD`: Replace with U+FFFD
579    /// - `SurrogatePolicy::Skip`: Remove invalid surrogates silently
580    ///
581    /// # Example
582    ///
583    /// ```text
584    /// use hedl_json::{FromJsonConfig, SurrogatePolicy};
585    ///
586    /// let config = FromJsonConfig::builder()
587    ///     .surrogate_policy(SurrogatePolicy::ReplaceWithFFFD)
588    ///     .build();
589    /// ```
590    #[must_use]
591    pub fn surrogate_policy(mut self, policy: SurrogatePolicy) -> Self {
592        self.surrogate_policy = policy;
593        self
594    }
595
596    /// Disable all limits (use with caution - only for trusted input)
597    ///
598    /// This removes all safety limits and can lead to memory exhaustion
599    /// or stack overflow with malicious or malformed JSON.
600    #[must_use]
601    pub fn unlimited(mut self) -> Self {
602        self.max_depth = None;
603        self.max_array_size = None;
604        self.max_string_length = None;
605        self.max_object_size = None;
606        self
607    }
608
609    /// Enable lenient JSON parsing (trailing commas, comments)
610    ///
611    /// When enabled, the parser accepts:
612    /// - Trailing commas in arrays and objects
613    /// - Single-line (//) and multi-line (/* */) comments
614    ///
615    /// Requires the `lenient` feature flag.
616    ///
617    /// # Examples
618    ///
619    /// ```text
620    /// use hedl_json::FromJsonConfig;
621    ///
622    /// let config = FromJsonConfig::builder()
623    ///     .lenient(true)
624    ///     .build();
625    ///
626    /// // Now you can parse JSON with trailing commas
627    /// let json = r#"{"name": "Alice", "age": 30,}"#;
628    /// ```
629    #[cfg(feature = "lenient")]
630    #[must_use]
631    pub fn lenient(mut self, lenient: bool) -> Self {
632        self.lenient = lenient;
633        self
634    }
635
636    /// Build the configuration
637    #[must_use]
638    pub fn build(self) -> FromJsonConfig {
639        FromJsonConfig {
640            default_type_name: self.default_type_name,
641            version: self.version,
642            max_depth: self.max_depth,
643            max_array_size: self.max_array_size,
644            max_string_length: self.max_string_length,
645            max_object_size: self.max_object_size,
646            surrogate_policy: self.surrogate_policy,
647            #[cfg(feature = "lenient")]
648            lenient: self.lenient,
649        }
650    }
651}
652
653/// Schema cache for avoiding redundant schema inference
654///
655/// When converting large JSON arrays to matrix lists, we often encounter the same
656/// structure repeatedly. Caching the inferred schema significantly improves performance
657/// by avoiding redundant key iteration and sorting.
658///
659/// # Performance Impact
660///
661/// - First schema inference: ~O(n*log(n)) where n is number of keys
662/// - Cached lookup: ~O(1) hash map lookup
663/// - Expected speedup: 30-50% for documents with repeated array structures
664pub(super) type SchemaCache = std::collections::HashMap<Vec<String>, Vec<String>>;