hedl_json/from_json/config.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Configuration and error types for JSON to HEDL conversion
19
20use hedl_core::Value;
21
/// Default maximum recursion depth for JSON parsing.
///
/// Set to 10,000 levels so that deeply nested JSON structures are still
/// accepted. This is far deeper than typical JSON documents; the limit
/// exists to bound recursion on malicious or malformed inputs.
///
/// Used as the default for `FromJsonConfig::max_depth`.
pub const DEFAULT_MAX_DEPTH: usize = 10_000;
28
/// Default maximum array size (number of elements) for JSON parsing.
///
/// Set to 10,000,000 elements to handle large datasets, including
/// large arrays commonly found in data science and ML applications.
///
/// Used as the default for `FromJsonConfig::max_array_size`.
pub const DEFAULT_MAX_ARRAY_SIZE: usize = 10_000_000;
34
/// Default maximum string length for JSON parsing.
///
/// Set to 100 MB (`100 * 1024 * 1024`) to handle large strings including
/// base64-encoded binary data, large text fields, and embedded documents.
///
/// Used as the default for `FromJsonConfig::max_string_length`.
pub const DEFAULT_MAX_STRING_LENGTH: usize = 100 * 1024 * 1024;
40
/// Default maximum object size (number of keys).
///
/// Set to 100,000 keys to handle objects with many properties,
/// common in configuration files and metadata-rich documents.
///
/// Used as the default for `FromJsonConfig::max_object_size`.
pub const DEFAULT_MAX_OBJECT_SIZE: usize = 100_000;
46
/// Policy for handling unpaired UTF-16 surrogates in JSON input.
///
/// JSON's `\uXXXX` escapes use UTF-16 encoding. Characters outside the
/// Basic Multilingual Plane (U+10000 and above, including emoji) require
/// surrogate pairs: a high surrogate (0xD800-0xDBFF) followed immediately
/// by a low surrogate (0xDC00-0xDFFF).
///
/// Some systems (e.g., JavaScript with truncated strings, legacy databases)
/// may emit unpaired surrogates, which are technically invalid Unicode but
/// may appear in real-world data.
///
/// The default variant is [`SurrogatePolicy::Reject`].
///
/// # Example
///
/// ```text
/// use hedl_json::{FromJsonConfig, SurrogatePolicy};
///
/// // Default: reject unpaired surrogates
/// let strict = FromJsonConfig::default();
///
/// // Replace unpaired surrogates with U+FFFD
/// let lenient = FromJsonConfig::builder()
///     .surrogate_policy(SurrogatePolicy::ReplaceWithFFFD)
///     .build();
///
/// // Skip (remove) unpaired surrogates entirely
/// let skip = FromJsonConfig::builder()
///     .surrogate_policy(SurrogatePolicy::Skip)
///     .build();
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum SurrogatePolicy {
    /// Reject unpaired surrogates with an error (default, strict).
    ///
    /// This is the safest option and ensures all processed JSON contains
    /// valid Unicode. Use this for data integrity requirements.
    #[default]
    Reject,

    /// Replace unpaired surrogates with U+FFFD (the replacement character).
    ///
    /// This allows processing of JSON with invalid Unicode while preserving
    /// string structure. The replacement character signals data loss.
    ReplaceWithFFFD,

    /// Skip (remove) unpaired surrogates silently.
    ///
    /// Use with caution: this modifies string content without indication.
    /// Suitable when the surrogates are known to be noise or artifacts.
    Skip,
}
97
/// Errors that can occur during JSON to HEDL conversion.
///
/// `Display` text for each variant comes from its `#[error(...)]`
/// attribute (via `thiserror`).
#[derive(Debug, Clone, thiserror::Error)]
pub enum JsonConversionError {
    /// JSON parsing failed; the payload is the parser's message.
    #[error("JSON parse error: {0}")]
    ParseError(String),

    /// Root value must be an object; the payload describes what was found.
    #[error("Root must be a JSON object, found {0}")]
    InvalidRoot(String),

    /// Invalid number value; the payload is the offending number as text.
    #[error("Invalid number: {0}")]
    InvalidNumber(String),

    /// Invalid expression syntax.
    #[error("Invalid expression: {0}")]
    InvalidExpression(String),

    /// Invalid tensor element (must be a number or an array).
    #[error("Invalid tensor element - must be number or array")]
    InvalidTensor,

    /// Nested objects not allowed in scalar context.
    #[error("Nested objects not allowed in scalar context")]
    NestedObject,

    /// Reference parsing failed.
    #[error("Invalid reference: {0}")]
    InvalidReference(String),

    /// Invalid Unicode encoding.
    ///
    /// This error occurs when JSON contains invalid Unicode sequences, such as:
    /// - Unpaired UTF-16 surrogates (`\uD83D` without its low surrogate pair)
    /// - Invalid surrogate pairs (low surrogate before high surrogate)
    /// - Unescaped control characters in strings
    ///
    /// # UTF-16 Surrogate Background
    ///
    /// JSON's `\uXXXX` escapes use UTF-16 encoding. Characters outside the
    /// Basic Multilingual Plane (U+10000 and above, including emoji) require
    /// surrogate pairs: a high surrogate (0xD800-0xDBFF) followed by a low
    /// surrogate (0xDC00-0xDFFF).
    ///
    /// # Solutions
    ///
    /// 1. **Use the `SurrogatePolicy::ReplaceWithFFFD` option**:
    ///    Replace invalid surrogates with the Unicode replacement character.
    ///
    /// 2. **Preprocess the JSON** to fix or remove invalid sequences.
    ///
    /// 3. **Ensure the source system** produces valid UTF-8/UTF-16 pairs.
    #[error("Invalid Unicode: {0}")]
    InvalidUnicode(String),

    /// Maximum recursion depth exceeded; the payload is the configured limit.
    #[error("Maximum recursion depth ({0}) exceeded - possible deeply nested structure")]
    MaxDepthExceeded(usize),

    /// Maximum array size exceeded.
    ///
    /// Fields are `(limit, actual element count)`, in the order used by the
    /// error message.
    #[error("Maximum array size ({0}) exceeded - array has {1} elements")]
    MaxArraySizeExceeded(usize, usize),

    /// Maximum string length exceeded.
    ///
    /// Fields are `(limit, actual length)`, in the order used by the error
    /// message.
    #[error("Maximum string length ({0}) exceeded - string has {1} characters")]
    MaxStringLengthExceeded(usize, usize),

    /// Maximum object size exceeded.
    ///
    /// Fields are `(limit, actual key count)`, in the order used by the
    /// error message.
    #[error("Maximum object size ({0}) exceeded - object has {1} keys")]
    MaxObjectSizeExceeded(usize, usize),

    /// Integer value outside i64 range.
    ///
    /// JSON supports arbitrary-precision numbers, but HEDL's `Value::Int`
    /// uses `i64` which has a fixed range: -9,223,372,036,854,775,808 to
    /// 9,223,372,036,854,775,807.
    ///
    /// # Common Causes
    ///
    /// - Unsigned 64-bit integers from other systems (IDs, counters, hashes)
    ///   whose value is above `i64::MAX`
    /// - Large database auto-increment IDs stored as unsigned values
    /// - Timestamps in nanoseconds beyond year 2262
    ///
    /// # Solutions
    ///
    /// 1. **Use strings for large IDs** (recommended):
    ///    ```json
    ///    {"tweet_id": "18446744073709551615"}
    ///    ```
    ///
    /// 2. **Use hex encoding**:
    ///    ```json
    ///    {"large_number": "0xFFFFFFFFFFFFFFFF"}
    ///    ```
    ///
    /// 3. **Split into high/low parts**:
    ///    ```json
    ///    {"value_high": 1844674407, "value_low": 3709551615}
    ///    ```
    ///
    /// # Examples
    ///
    /// ```
    /// use hedl_json::{from_json, FromJsonConfig};
    ///
    /// let json = r#"{"id": 18446744073709551615}"#;
    /// let result = from_json(json, &FromJsonConfig::default());
    ///
    /// assert!(result.is_err());
    /// assert!(result.unwrap_err().to_string().contains("Integer overflow"));
    /// ```
    #[error(
        "Integer overflow: {value} exceeds i64 range [{min}..{max}]. \
         Consider using a string for large IDs or timestamps."
    )]
    IntegerOverflow {
        /// String representation of the overflowing value.
        value: String,
        /// Maximum valid i64 value.
        max: i64,
        /// Minimum valid i64 value.
        min: i64,
    },
}
224
225impl From<serde_json::Error> for JsonConversionError {
226 fn from(err: serde_json::Error) -> Self {
227 let msg = err.to_string();
228
229 // Detect surrogate-related errors from serde_json
230 if msg.contains("lone surrogate")
231 || msg.contains("surrogate")
232 || msg.contains("invalid unicode")
233 {
234 JsonConversionError::InvalidUnicode(format!(
235 "Invalid UTF-16 surrogate sequence: {msg}. \
236 JSON contains unpaired surrogates which cannot be represented \
237 in Rust UTF-8 strings. Configure SurrogatePolicy::ReplaceWithFFFD \
238 to replace with the Unicode replacement character (U+FFFD)."
239 ))
240 } else if msg.contains("control character") {
241 JsonConversionError::InvalidUnicode(format!(
242 "Unescaped control character in JSON string: {msg}. \
243 Control characters (U+0000-U+001F) must be escaped as \\uXXXX \
244 per RFC 8259."
245 ))
246 } else {
247 JsonConversionError::ParseError(msg)
248 }
249 }
250}
251
252/// Check if a `serde_json::Number` represents an integer outside i64 range
253///
254/// Returns `true` if the number is an integer (not a float) but cannot
255/// fit in i64 range. This happens when:
256/// - The value is larger than `i64::MAX` (9,223,372,036,854,775,807)
257/// - The value is smaller than `i64::MIN` (-9,223,372,036,854,775,808)
258///
259/// # Implementation Note
260///
261/// `serde_json::Number::as_i64()` returns `None` for both:
262/// 1. Numbers outside i64 range (overflow)
263/// 2. Floating point numbers
264///
265/// We use `as_u64()` to detect case 1: if `as_i64()` fails but `as_u64()`
266/// succeeds, the number is an unsigned integer too large for i64.
267/// We also check `is_i64()` to catch negative overflow cases.
268#[inline]
269pub(super) fn is_integer_overflow(n: &serde_json::Number) -> bool {
270 // If as_i64() fails but as_u64() succeeds, it's an unsigned int overflow
271 // Or if is_i64() is true but as_i64() is None, it's a signed int overflow
272 n.as_i64().is_none() && (n.as_u64().is_some() || n.is_i64())
273}
274
275/// Convert JSON number to HEDL Value with overflow detection
276///
277/// This function enforces strict integer validation to prevent silent
278/// precision loss from i64 overflow converting to f64.
279///
280/// # Behavior
281///
282/// 1. **i64 range integers**: Convert to `Value::Int(i64)`
283/// 2. **Overflow integers**: Return `IntegerOverflow` error
284/// 3. **Floating point**: Convert to `Value::Float(f64)`
285///
286/// # Implementation Details
287///
288/// - Valid i64 values are converted to `Value::Int`
289/// - Integer values outside i64 range trigger `IntegerOverflow` error
290/// - Floating point values are converted to `Value::Float`
291/// - Uses fast-path optimization for common i64 case
292#[inline]
293pub(super) fn json_number_to_value(n: &serde_json::Number) -> Result<Value, JsonConversionError> {
294 // Try i64 first (most common case - fast path)
295 if let Some(i) = n.as_i64() {
296 return Ok(Value::Int(i));
297 }
298
299 // Check for integer overflow
300 if is_integer_overflow(n) {
301 return Err(JsonConversionError::IntegerOverflow {
302 value: n.to_string(),
303 max: i64::MAX,
304 min: i64::MIN,
305 });
306 }
307
308 // Must be a float
309 if let Some(f) = n.as_f64() {
310 Ok(Value::Float(f))
311 } else {
312 // Should never happen with valid JSON
313 Err(JsonConversionError::InvalidNumber(n.to_string()))
314 }
315}
316
/// Configuration for JSON import.
///
/// Controls how JSON is converted to HEDL, including security limits
/// to prevent denial-of-service attacks from malicious inputs.
///
/// # High Default Limits
///
/// The default limits are set intentionally high to handle large-scale
/// data processing scenarios common in ML/AI applications:
///
/// - **10,000 depth**: Deep nesting in complex hierarchical data
/// - **10,000,000 array size**: Large datasets and batches
/// - **100 MB string length**: Base64-encoded binary data, embeddings
/// - **100,000 object size**: Rich metadata and configuration objects
///
/// These defaults prioritize functionality over restrictiveness. For
/// untrusted input, consider using the builder pattern with custom limits.
///
/// # Examples
///
/// ```text
/// use hedl_json::FromJsonConfig;
///
/// // Default configuration with high limits for ML/data workloads
/// let config = FromJsonConfig::default();
///
/// // Custom configuration using builder pattern
/// let custom_config = FromJsonConfig::builder()
///     .max_depth(1_000)
///     .max_array_size(100_000)
///     .max_string_length(10 * 1024 * 1024) // 10 MB
///     .build();
///
/// // Strict configuration for untrusted input
/// let strict_config = FromJsonConfig::builder()
///     .max_depth(50)
///     .max_array_size(10_000)
///     .max_string_length(1_000_000)
///     .max_object_size(1_000)
///     .build();
///
/// // Unlimited configuration (use with caution)
/// let unlimited_config = FromJsonConfig::builder()
///     .unlimited()
///     .build();
/// ```
#[derive(Debug, Clone)]
pub struct FromJsonConfig {
    /// Default type name for arrays without metadata (default: `"Item"`).
    pub default_type_name: String,

    /// HEDL version to use, as `(major, minor)` (default: `(2, 0)`).
    pub version: (u32, u32),

    /// Maximum recursion depth (default: 10,000).
    ///
    /// Prevents stack overflow from deeply nested JSON structures.
    /// Set to `None` to disable (not recommended for untrusted input).
    pub max_depth: Option<usize>,

    /// Maximum array size (default: 10,000,000).
    ///
    /// Prevents memory exhaustion from extremely large arrays.
    /// JSON arrays can contain large datasets, batches, or embeddings.
    /// Set to `None` to disable (not recommended for untrusted input).
    pub max_array_size: Option<usize>,

    /// Maximum string length (default: 100 MB).
    ///
    /// Prevents memory exhaustion from extremely large strings.
    /// JSON strings often contain base64-encoded binary data, large
    /// text fields, or embedded documents requiring high limits.
    /// Set to `None` to disable (not recommended for untrusted input).
    pub max_string_length: Option<usize>,

    /// Maximum object size, in number of keys (default: 100,000).
    ///
    /// Prevents memory exhaustion from objects with many keys.
    /// Configuration files and metadata-rich objects can have many properties.
    /// Set to `None` to disable (not recommended for untrusted input).
    pub max_object_size: Option<usize>,

    /// Policy for handling unpaired UTF-16 surrogates.
    ///
    /// Some systems emit JSON with unpaired surrogates (e.g., truncated
    /// JavaScript strings). This setting controls how to handle them.
    ///
    /// Default: `SurrogatePolicy::Reject` (strict validation)
    pub surrogate_policy: SurrogatePolicy,

    /// Enable lenient JSON parsing (JSON5-style trailing commas and comments).
    ///
    /// When enabled, the parser accepts:
    /// - Trailing commas in arrays and objects
    /// - Single-line (//) and multi-line (/* */) comments
    ///
    /// This field only exists when the `lenient` feature flag is enabled.
    ///
    /// Default: false (strict RFC 8259 JSON)
    #[cfg(feature = "lenient")]
    pub lenient: bool,
}
419
420impl Default for FromJsonConfig {
421 fn default() -> Self {
422 Self {
423 default_type_name: "Item".to_string(),
424 version: (2, 0),
425 max_depth: Some(DEFAULT_MAX_DEPTH),
426 max_array_size: Some(DEFAULT_MAX_ARRAY_SIZE),
427 max_string_length: Some(DEFAULT_MAX_STRING_LENGTH),
428 max_object_size: Some(DEFAULT_MAX_OBJECT_SIZE),
429 surrogate_policy: SurrogatePolicy::default(),
430 #[cfg(feature = "lenient")]
431 lenient: false,
432 }
433 }
434}
435
impl FromJsonConfig {
    /// Create a new builder for configuring JSON import.
    ///
    /// The returned builder is `FromJsonConfigBuilder::default()`; call its
    /// setter methods and finish with `build()` to obtain a `FromJsonConfig`.
    ///
    /// # Examples
    ///
    /// ```text
    /// use hedl_json::FromJsonConfig;
    ///
    /// let config = FromJsonConfig::builder()
    ///     .max_depth(1_000)
    ///     .max_array_size(100_000)
    ///     .build();
    /// ```
    #[must_use]
    pub fn builder() -> FromJsonConfigBuilder {
        FromJsonConfigBuilder::default()
    }
}
454
// Exposes the shared import settings through hedl_core's `ImportConfig` trait.
impl hedl_core::convert::ImportConfig for FromJsonConfig {
    /// Returns the configured `default_type_name` field as a string slice.
    fn default_type_name(&self) -> &str {
        &self.default_type_name
    }

    /// Returns the configured HEDL `version` field as `(major, minor)`.
    fn version(&self) -> (u32, u32) {
        self.version
    }
}
464
/// Builder for `FromJsonConfig`.
///
/// Provides ergonomic configuration of JSON import limits and behavior.
/// Each field mirrors the field of the same name on `FromJsonConfig`;
/// `build()` copies them across unchanged.
///
/// # Examples
///
/// ```text
/// use hedl_json::FromJsonConfig;
///
/// // Custom limits
/// let config = FromJsonConfig::builder()
///     .max_depth(1_000)
///     .max_array_size(100_000)
///     .max_string_length(10 * 1024 * 1024)
///     .build();
///
/// // Strict limits for untrusted input
/// let strict = FromJsonConfig::builder()
///     .max_depth(50)
///     .max_array_size(10_000)
///     .max_string_length(1_000_000)
///     .max_object_size(1_000)
///     .build();
///
/// // Unlimited (use with caution!)
/// let unlimited = FromJsonConfig::builder()
///     .unlimited()
///     .build();
/// ```
#[derive(Debug, Clone)]
pub struct FromJsonConfigBuilder {
    // Default type name for arrays without metadata.
    default_type_name: String,
    // HEDL version as (major, minor).
    version: (u32, u32),
    // Size/depth limits; `None` means the corresponding limit is disabled.
    max_depth: Option<usize>,
    max_array_size: Option<usize>,
    max_string_length: Option<usize>,
    max_object_size: Option<usize>,
    // How unpaired UTF-16 surrogates are handled.
    surrogate_policy: SurrogatePolicy,
    // Lenient parsing toggle; only present with the `lenient` feature.
    #[cfg(feature = "lenient")]
    lenient: bool,
}
506
507impl Default for FromJsonConfigBuilder {
508 fn default() -> Self {
509 Self {
510 default_type_name: "Item".to_string(),
511 version: (2, 0),
512 max_depth: Some(DEFAULT_MAX_DEPTH),
513 max_array_size: Some(DEFAULT_MAX_ARRAY_SIZE),
514 max_string_length: Some(DEFAULT_MAX_STRING_LENGTH),
515 max_object_size: Some(DEFAULT_MAX_OBJECT_SIZE),
516 surrogate_policy: SurrogatePolicy::default(),
517 #[cfg(feature = "lenient")]
518 lenient: false,
519 }
520 }
521}
522
523impl FromJsonConfigBuilder {
524 /// Set the default type name for arrays without metadata
525 pub fn default_type_name(mut self, name: impl Into<String>) -> Self {
526 self.default_type_name = name.into();
527 self
528 }
529
530 /// Set the HEDL version to use
531 #[must_use]
532 pub fn version(mut self, major: u32, minor: u32) -> Self {
533 self.version = (major, minor);
534 self
535 }
536
537 /// Set the maximum recursion depth
538 ///
539 /// Use `None` to disable the limit (not recommended for untrusted input).
540 #[must_use]
541 pub fn max_depth(mut self, limit: usize) -> Self {
542 self.max_depth = Some(limit);
543 self
544 }
545
546 /// Set the maximum array size
547 ///
548 /// Use `None` to disable the limit (not recommended for untrusted input).
549 #[must_use]
550 pub fn max_array_size(mut self, limit: usize) -> Self {
551 self.max_array_size = Some(limit);
552 self
553 }
554
555 /// Set the maximum string length in bytes
556 ///
557 /// Use `None` to disable the limit (not recommended for untrusted input).
558 #[must_use]
559 pub fn max_string_length(mut self, limit: usize) -> Self {
560 self.max_string_length = Some(limit);
561 self
562 }
563
564 /// Set the maximum object size (number of keys)
565 ///
566 /// Use `None` to disable the limit (not recommended for untrusted input).
567 #[must_use]
568 pub fn max_object_size(mut self, limit: usize) -> Self {
569 self.max_object_size = Some(limit);
570 self
571 }
572
573 /// Set the policy for handling unpaired UTF-16 surrogates
574 ///
575 /// # Options
576 ///
577 /// - `SurrogatePolicy::Reject` (default): Error on invalid surrogates
578 /// - `SurrogatePolicy::ReplaceWithFFFD`: Replace with U+FFFD
579 /// - `SurrogatePolicy::Skip`: Remove invalid surrogates silently
580 ///
581 /// # Example
582 ///
583 /// ```text
584 /// use hedl_json::{FromJsonConfig, SurrogatePolicy};
585 ///
586 /// let config = FromJsonConfig::builder()
587 /// .surrogate_policy(SurrogatePolicy::ReplaceWithFFFD)
588 /// .build();
589 /// ```
590 #[must_use]
591 pub fn surrogate_policy(mut self, policy: SurrogatePolicy) -> Self {
592 self.surrogate_policy = policy;
593 self
594 }
595
596 /// Disable all limits (use with caution - only for trusted input)
597 ///
598 /// This removes all safety limits and can lead to memory exhaustion
599 /// or stack overflow with malicious or malformed JSON.
600 #[must_use]
601 pub fn unlimited(mut self) -> Self {
602 self.max_depth = None;
603 self.max_array_size = None;
604 self.max_string_length = None;
605 self.max_object_size = None;
606 self
607 }
608
609 /// Enable lenient JSON parsing (trailing commas, comments)
610 ///
611 /// When enabled, the parser accepts:
612 /// - Trailing commas in arrays and objects
613 /// - Single-line (//) and multi-line (/* */) comments
614 ///
615 /// Requires the `lenient` feature flag.
616 ///
617 /// # Examples
618 ///
619 /// ```text
620 /// use hedl_json::FromJsonConfig;
621 ///
622 /// let config = FromJsonConfig::builder()
623 /// .lenient(true)
624 /// .build();
625 ///
626 /// // Now you can parse JSON with trailing commas
627 /// let json = r#"{"name": "Alice", "age": 30,}"#;
628 /// ```
629 #[cfg(feature = "lenient")]
630 #[must_use]
631 pub fn lenient(mut self, lenient: bool) -> Self {
632 self.lenient = lenient;
633 self
634 }
635
636 /// Build the configuration
637 #[must_use]
638 pub fn build(self) -> FromJsonConfig {
639 FromJsonConfig {
640 default_type_name: self.default_type_name,
641 version: self.version,
642 max_depth: self.max_depth,
643 max_array_size: self.max_array_size,
644 max_string_length: self.max_string_length,
645 max_object_size: self.max_object_size,
646 surrogate_policy: self.surrogate_policy,
647 #[cfg(feature = "lenient")]
648 lenient: self.lenient,
649 }
650 }
651}
652
/// Schema cache for avoiding redundant schema inference.
///
/// When converting large JSON arrays to matrix lists, we often encounter the same
/// structure repeatedly. Caching the inferred schema avoids repeating key
/// iteration and sorting for each occurrence.
///
/// The cache is a `HashMap` from a `Vec<String>` key to a `Vec<String>` value.
/// NOTE(review): presumably the key is an object's list of field names and the
/// value is the schema inferred for it; confirm against the code that fills
/// the cache.
///
/// # Performance Impact
///
/// Figures below are the original author's estimates, not measured here:
///
/// - First schema inference: ~O(n*log(n)) where n is number of keys
/// - Cached lookup: a single hash map lookup (the key itself must still be hashed)
/// - Expected speedup: 30-50% for documents with repeated array structures
pub(super) type SchemaCache = std::collections::HashMap<Vec<String>, Vec<String>>;