Skip to main content

json_extractor/
lib.rs

1//! # JSON Fragment Scanner
2//!
3//! A high-performance two-stage JSON fragment scanner that identifies and extracts
4//! JSON objects and arrays from complete documents using SIMD acceleration.
5//!
6//! ## Features
7//!
8//! - **Two-stage pipeline**: Bulk character classification + fragment extraction
9//! - **SIMD-accelerated**: AVX2/SSE4.2 for maximum throughput (5-10 GiB/s)
10//! - **Fragment detection**: Identifies JSON objects (`{}`) and arrays (`[]`)
11//! - **Error reporting**: Detailed error information for invalid fragments
12//! - **Position tracking**: Absolute byte offsets for each fragment
13//! - **Nesting support**: Handles arbitrary levels of nesting
14//!
15//! ## Quick Start
16//!
17//! Extract the first JSON fragment from a string:
18//!
19//! ```
20//! use json_extractor::extract_first;
21//!
22//! let input = r#"some log prefix {"name": "Alice"} tail"#;
23//! assert_eq!(extract_first(input), Some(r#"{"name": "Alice"}"#));
24//! ```
25//!
26//! ## Advanced Usage
27//!
28//! Use [`StagedScanner`] for repeated scans with buffer reuse:
29//!
30//! ```
31//! use json_extractor::StagedScanner;
32//!
33//! let mut scanner = StagedScanner::new();
34//! let data = br#"{"name": "Alice"} {"age": 30}"#;
35//! let fragments = scanner.scan_fragments(data);
36//!
37//! assert_eq!(fragments.len(), 2);
38//! assert!(fragments[0].is_complete());
39//! assert_eq!(fragments[0].start, 0);
40//! ```
41
42// Two-stage pipeline modules
43mod stage1;
44mod stage2;
45
46// Character classification lookup table
47pub(crate) mod charclass;
48
/// Types of errors that can occur during parsing
///
/// Carried by [`FragmentStatus::Incomplete`] to say why a fragment was not
/// reported as complete. The [`Display`](std::fmt::Display) implementation
/// yields a short, single-line, human-readable message for each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorKind {
    /// Reached end of input while parsing a fragment
    UnexpectedEof,
    /// String was not terminated with a closing quote
    UnterminatedString,
    /// Object missing closing brace `}`
    MissingClosingBrace,
    /// Array missing closing bracket `]`
    MissingClosingBracket,
    /// Invalid character encountered in the given context (the offending raw byte)
    InvalidCharacter(u8),
    /// Invalid escape sequence in a string
    InvalidEscape,
    /// Missing colon after object key
    MissingColon,
    /// Missing comma between values
    MissingComma,
    /// Invalid value in the current context
    InvalidValue,
    /// Mismatched bracket (e.g., `[` closed with `}`)
    MismatchedBracket,
    /// Invalid JSON structure detected
    InvalidStructure,
}

impl std::fmt::Display for ErrorKind {
    /// Writes the human-readable message for this error.
    ///
    /// For [`ErrorKind::InvalidCharacter`] the byte is shown as the character
    /// itself when it is printable ASCII (graphic characters and the space),
    /// and as a two-digit hexadecimal code (`0xNN`) otherwise.
    #[cold]
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match *self {
            ErrorKind::InvalidCharacter(byte) => {
                // Control bytes would corrupt single-line log output, and bytes
                // >= 0x80 are not characters on their own (`char::from` would
                // reinterpret them as Latin-1), so only printable ASCII is
                // echoed verbatim.
                return if byte.is_ascii_graphic() || byte == b' ' {
                    write!(f, "Invalid character: {}", char::from(byte))
                } else {
                    write!(f, "Invalid character: 0x{:02X}", byte)
                };
            }
            ErrorKind::UnexpectedEof => "Unexpected end of input",
            ErrorKind::UnterminatedString => "Unterminated string",
            ErrorKind::MissingClosingBrace => "Missing closing brace",
            ErrorKind::MissingClosingBracket => "Missing closing bracket",
            ErrorKind::InvalidEscape => "Invalid escape sequence",
            ErrorKind::MissingColon => "Missing colon after object key",
            ErrorKind::MissingComma => "Missing comma between values",
            ErrorKind::InvalidValue => "Invalid value",
            ErrorKind::MismatchedBracket => "Mismatched bracket",
            ErrorKind::InvalidStructure => "Invalid JSON structure",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ErrorKind {}
98
99/// Fragment completion status
100#[derive(Debug, Clone, Copy, PartialEq, Eq)]
101pub enum FragmentStatus {
102    /// Fragment is complete and valid
103    Complete,
104    /// Fragment is incomplete with an error
105    Incomplete(ErrorKind),
106}
107
108/// Represents a found JSON fragment
109///
110/// A fragment identifies a JSON object or array in the input stream,
111/// including its position and completion status.
112#[derive(Debug, Clone, PartialEq, Eq)]
113pub struct Fragment {
114    /// Absolute byte offset from the start of the first chunk
115    pub start: usize,
116    /// Length of the fragment in bytes
117    pub length: usize,
118    /// Completion status
119    pub status: FragmentStatus,
120}
121
122impl Fragment {
123    /// Get the end position (exclusive)
124    ///
125    /// # Returns
126    /// The byte position immediately after the last byte of the fragment
127    #[inline]
128    pub fn end(&self) -> usize {
129        self.start + self.length
130    }
131
132    /// Check if the fragment is complete
133    ///
134    /// # Returns
135    /// `true` if the fragment was successfully parsed, `false` if it has errors
136    #[inline]
137    pub fn is_complete(&self) -> bool {
138        matches!(self.status, FragmentStatus::Complete)
139    }
140}
141
142/// Stateful JSON fragment scanner with buffer reuse
143///
144/// The scanner processes complete JSON documents to identify and extract JSON objects
145/// and arrays using a high-performance two-stage pipeline. Reuses internal buffers
146/// across multiple scans to eliminate allocation overhead.
147///
148/// # Example
149///
150/// ```
151/// use json_extractor::StagedScanner;
152///
153/// let mut scanner = StagedScanner::new();
154/// let data = br#"{"name": "Alice"} {"age": 30}"#;
155/// let fragments = scanner.scan_fragments(data);
156///
157/// assert_eq!(fragments.len(), 2);
158/// assert!(fragments[0].is_complete());
159/// ```
160pub struct StagedScanner {
161    /// Reusable Stage 1 output buffers
162    stage1_output: stage1::Stage1Output,
163    /// Reusable fragment output buffer
164    fragments: Vec<Fragment>,
165}
166
167impl StagedScanner {
168    /// Create a new scanner with empty buffers
169    ///
170    /// # Example
171    ///
172    /// ```
173    /// use json_extractor::StagedScanner;
174    ///
175    /// let mut scanner = StagedScanner::new();
176    /// ```
177    pub fn new() -> Self {
178        Self {
179            stage1_output: stage1::Stage1Output::new(),
180            fragments: Vec::new(),
181        }
182    }
183
184    /// Scan a complete JSON document and extract all fragments
185    ///
186    /// This method reuses internal buffers across calls for maximum performance.
187    /// Buffers are cleared but not deallocated, preserving capacity.
188    ///
189    /// # Arguments
190    ///
191    /// * `data` - Complete JSON document bytes
192    ///
193    /// # Returns
194    ///
195    /// Slice of all fragments found in the document (valid until next call)
196    ///
197    /// # Example
198    ///
199    /// ```
200    /// use json_extractor::StagedScanner;
201    ///
202    /// let mut scanner = StagedScanner::new();
203    /// let data = br#"{"name": "Alice"} {"age": 30}"#;
204    /// let fragments = scanner.scan_fragments(data);
205    ///
206    /// assert_eq!(fragments.len(), 2);
207    /// assert!(fragments[0].is_complete());
208    /// assert_eq!(fragments[0].start, 0);
209    /// ```
210    pub fn scan_fragments(&mut self, data: &[u8]) -> &[Fragment] {
211        // Clear buffers (preserves capacity)
212        self.fragments.clear();
213
214        // Stage 1: Find all structural character positions + metadata (reuses buffer)
215        stage1::find_structural_indices(data, &mut self.stage1_output);
216
217        // Stage 2: Extract fragments using precomputed bracket pairs and string ranges (reuses buffer)
218        stage2::extract_fragments(
219            data,
220            &self.stage1_output.structural_indices,
221            &self.stage1_output.bracket_pairs,
222            // &self.stage1_output.string_ranges,
223            &mut self.fragments,
224        );
225
226        &self.fragments
227    }
228}
229
230impl Default for StagedScanner {
231    fn default() -> Self {
232        Self::new()
233    }
234}
235
236/// Convenience stateless API
237///
238/// For better performance with repeated scans, use [`StagedScanner`] instead.
239pub struct JsonFragmentScanner;
240
241impl JsonFragmentScanner {
242    /// Scan a complete JSON document and extract all fragments
243    ///
244    /// This is the main entry point for the two-stage pipeline:
245    /// - Stage 1: Identify structural character positions using SIMD
246    /// - Stage 2: Extract fragments by matching brackets
247    ///
248    /// # Arguments
249    ///
250    /// * `data` - Complete JSON document bytes
251    ///
252    /// # Returns
253    ///
254    /// Vector of all fragments found in the document
255    ///
256    /// # Example
257    ///
258    /// ```
259    /// use json_extractor::JsonFragmentScanner;
260    ///
261    /// let data = br#"{"name": "Alice"} {"age": 30}"#;
262    /// let fragments = JsonFragmentScanner::scan_fragments(data);
263    ///
264    /// assert_eq!(fragments.len(), 2);
265    /// assert!(fragments[0].is_complete());
266    /// assert_eq!(fragments[0].start, 0);
267    /// ```
268    pub fn scan_fragments(data: &[u8]) -> Vec<Fragment> {
269        // Use stateful scanner for single scan (no reuse benefit)
270        let mut scanner = StagedScanner::new();
271        scanner.scan_fragments(data).to_vec()
272    }
273}
274
275/// Extract the first complete JSON fragment from a string.
276///
277/// Returns `None` if no complete JSON object or array is found.
278///
279/// # Example
280///
281/// ```
282/// use json_extractor::extract_first;
283///
284/// let input = r#"hello {"name": "Alice"} world"#;
285/// assert_eq!(extract_first(input), Some(r#"{"name": "Alice"}"#));
286///
287/// assert_eq!(extract_first("no json here"), None);
288/// ```
289pub fn extract_first(data: &str) -> Option<&str> {
290    let mut scanner = StagedScanner::new();
291    scanner
292        .scan_fragments(data.as_bytes())
293        .iter()
294        .find(|f| f.is_complete())
295        .map(|f| &data[f.start..f.end()])
296}
297
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the stateless scanner over `input` and returns everything it reports.
    fn scan(input: &[u8]) -> Vec<Fragment> {
        JsonFragmentScanner::scan_fragments(input)
    }

    /// Scans `input`, requires exactly one fragment, and returns it.
    #[track_caller]
    fn sole_fragment(input: &[u8]) -> Fragment {
        let mut found = scan(input);
        assert_eq!(found.len(), 1);
        found.remove(0)
    }

    /// Like `sole_fragment`, but also requires the fragment to be complete.
    #[track_caller]
    fn sole_complete_fragment(input: &[u8]) -> Fragment {
        let fragment = sole_fragment(input);
        assert!(fragment.is_complete());
        fragment
    }

    // Basic fragment detection

    #[test]
    fn test_single_complete_object() {
        let found = scan(br#"{"name": "Alice"}"#);

        assert_eq!(found.len(), 1, "Expected 1 fragment, got {}", found.len());
        // Put the status into the test log before asserting on it.
        if let Some(first) = found.first() {
            eprintln!("Fragment status: {:?}", first.status);
        }
        let first = &found[0];
        assert!(first.is_complete());
        assert_eq!(first.start, 0);
        assert_eq!(first.length, 17);
    }

    #[test]
    fn test_single_complete_array() {
        let fragment = sole_complete_fragment(br#"[1, 2, 3]"#);

        assert_eq!(fragment.start, 0);
        assert_eq!(fragment.length, 9);
    }

    #[test]
    fn test_multiple_fragments() {
        let found = scan(br#"{"name": "Alice"} {"age": 30}"#);
        assert_eq!(found.len(), 2);

        // (start, length) of each object, in input order
        let expected_spans = [(0, 17), (18, 11)];
        for (fragment, (start, length)) in found.iter().zip(expected_spans) {
            assert!(fragment.is_complete());
            assert_eq!(fragment.start, start);
            assert_eq!(fragment.length, length);
        }
    }

    #[test]
    fn test_nested_objects() {
        sole_complete_fragment(br#"{"outer": {"inner": 123}}"#);
    }

    #[test]
    fn test_nested_arrays() {
        sole_complete_fragment(br#"[[1, 2], [3, 4]]"#);
    }

    #[test]
    fn test_mixed_nesting() {
        sole_complete_fragment(br#"{"array": [1, {"nested": true}]}"#);
    }

    #[test]
    fn test_string_with_escapes() {
        sole_complete_fragment(br#"{"text": "hello \"world\""}"#);
    }

    #[test]
    fn test_string_with_brackets() {
        sole_complete_fragment(br#"{"text": "has { and ] chars"}"#);
    }

    #[test]
    fn test_incomplete_fragment() {
        // Input stops right after the colon: one fragment, reported as incomplete
        let fragment = sole_fragment(br#"{"field": "#);

        assert!(!fragment.is_complete());
        assert_eq!(fragment.start, 0);
    }

    #[test]
    fn test_incomplete_string() {
        // Input stops inside a string value: one fragment, reported as incomplete
        let fragment = sole_fragment(br#"{"text": "hel"#);

        assert!(!fragment.is_complete());
    }

    #[test]
    fn test_numbers() {
        sole_complete_fragment(br#"{"int": 123, "float": 45.67, "exp": 1.2e-10}"#);
    }

    #[test]
    fn test_booleans_and_null() {
        sole_complete_fragment(br#"{"bool": true, "other": false, "nothing": null}"#);
    }

    #[test]
    fn test_whitespace_handling() {
        sole_complete_fragment(b"  \n\t  {  \"test\"  :  123  }  \n  ");
    }

    #[test]
    fn test_text_before_fragment() {
        let fragment = sole_complete_fragment(br#"some random text {"json": "here"} more text"#);

        assert_eq!(fragment.start, 17);
    }

    #[test]
    fn test_empty_object() {
        let fragment = sole_complete_fragment(br#"{}"#);

        assert_eq!(fragment.length, 2);
    }

    #[test]
    fn test_empty_array() {
        let fragment = sole_complete_fragment(br#"[]"#);

        assert_eq!(fragment.length, 2);
    }

    #[test]
    fn test_simple_object_stage1_debug() {
        let json = br#"{"a":1}"#;
        let mut stage1_out = crate::stage1::Stage1Output::new();
        crate::stage1::find_structural_indices(json, &mut stage1_out);

        let indices = &stage1_out.structural_indices;
        let found = indices.len();
        println!("Simple JSON: {}", String::from_utf8_lossy(json));
        println!("Stage1 found {} structural indices", found);
        println!("Expected: 5 (1 left-brace + 2 quotes + 1 colon + 1 right-brace)");
        println!("Indices: {:?}", indices);
        assert!(found >= 5, "Should find at least 5 structural chars");
    }

    #[test]
    fn test_deeply_nested() {
        // Outer object wrapping 50 nested `"a":{` levels around a single value
        let mut deep = String::from("{");
        deep.push_str(&"\"a\":{".repeat(50));
        deep.push_str("\"value\":123");
        deep.push_str(&"}".repeat(50));
        deep.push('}');

        println!("Generated JSON: {} chars", deep.len());
        println!("First 100 chars: {}", &deep[..100.min(deep.len())]);

        // Stage 1 diagnostics for the test log
        let mut stage1_out = crate::stage1::Stage1Output::new();
        crate::stage1::find_structural_indices(deep.as_bytes(), &mut stage1_out);
        println!(
            "Stage1: {} structural indices, {} bracket pairs",
            stage1_out.structural_indices.len(),
            stage1_out.bracket_pairs.len(),
        );

        let found = scan(deep.as_bytes());
        println!("Fragments found: {}", found.len());
        for (index, fragment) in found.iter().enumerate() {
            println!(
                "  Fragment {}: start={}, len={}, complete={}",
                index,
                fragment.start,
                fragment.length,
                fragment.is_complete()
            );
        }
        assert_eq!(found.len(), 1, "Expected 1 fragment");
        assert!(found[0].is_complete(), "Fragment should be complete");
    }

    #[test]
    fn test_fragment_end_method() {
        let (start, length) = (10, 20);
        let fragment = Fragment {
            start,
            length,
            status: FragmentStatus::Complete,
        };

        assert_eq!(fragment.end(), 30);
    }

    #[test]
    fn test_trailing_comma_in_object() {
        // A trailing comma before `}` is expected to be accepted
        sole_complete_fragment(br#"{"a": 1,}"#);
    }

    #[test]
    fn test_trailing_comma_in_array() {
        // A trailing comma before `]` is expected to be accepted
        sole_complete_fragment(br#"[1, 2,]"#);
    }

    #[test]
    fn test_complex_valid_object() {
        sole_complete_fragment(br#"{"a": 1, "b": [2, 3], "c": {"d": true}}"#);
    }

    #[test]
    fn test_complex_valid_array() {
        sole_complete_fragment(br#"[1, "two", {"three": 3}, [4, 5], true, null]"#);
    }

    #[test]
    fn test_empty_string_as_key() {
        sole_complete_fragment(br#"{"": "value"}"#);
    }

    #[test]
    fn test_array_in_object_value_position() {
        sole_complete_fragment(br#"{"key": [1, 2, 3]}"#);
    }

    #[test]
    fn test_object_in_array() {
        sole_complete_fragment(br#"[{"a": 1}, {"b": 2}]"#);
    }

    // Edge case tests for lenient extraction

    #[test]
    fn test_utf8_multibyte_emoji() {
        // Emoji encode to multi-byte UTF-8 sequences
        sole_complete_fragment(r#"{"emoji": "👍 🚀 ✅"}"#.as_bytes());
    }

    #[test]
    fn test_utf8_multibyte_cjk() {
        // CJK characters encode to 3-byte UTF-8 sequences
        sole_complete_fragment(r#"{"text": "你好世界", "lang": "中文"}"#.as_bytes());
    }

    #[test]
    fn test_utf8_multibyte_mixed() {
        // ASCII next to 2-, 3- and 4-byte UTF-8 sequences
        sole_complete_fragment(r#"{"msg": "Hello мир 世界 👋!"}"#.as_bytes());
    }

    #[test]
    fn test_utf8_in_keys() {
        // Non-ASCII text used as object keys
        sole_complete_fragment(r#"{"名前": "Alice", "возраст": 30}"#.as_bytes());
    }

    #[test]
    fn test_single_escaped_quote() {
        // One backslash before a quote: the quote stays inside the string
        sole_complete_fragment(br#"{"text": "He said \"hello\""}"#);
    }

    #[test]
    fn test_double_backslash_then_quote() {
        // Two backslashes form an escaped backslash, so the quote ends the string
        sole_complete_fragment(br#"{"text": "path\\", "next": "value"}"#);
    }

    #[test]
    fn test_triple_backslash_then_quote() {
        // Escaped backslash followed by an escaped quote: still inside the string
        sole_complete_fragment(br#"{"text": "value\\\""}"#);
    }

    #[test]
    fn test_four_backslashes_then_quote() {
        // Two escaped backslashes, then a quote that ends the string
        sole_complete_fragment(br#"{"text": "path\\\\", "next": "value"}"#);
    }

    #[test]
    fn test_many_consecutive_backslashes() {
        // Long even run of backslashes right before the closing quote
        sole_complete_fragment(br#"{"text": "backslashes: \\\\\\\\"}"#);
    }

    #[test]
    fn test_escaped_backslash_in_middle() {
        // Escaped backslashes in the middle of a string value
        sole_complete_fragment(br#"{"path": "C:\\Users\\Alice\\file.txt"}"#);
    }

    #[test]
    fn test_bracket_mismatch_array_closed_with_brace() {
        // `[` closed by `}`: still one fragment, but not a complete one
        let fragment = sole_fragment(br#"[1, 2, 3}"#);

        assert!(!fragment.is_complete());
    }

    #[test]
    fn test_bracket_mismatch_object_closed_with_bracket() {
        // `{` closed by `]`: still one fragment, but not a complete one
        let fragment = sole_fragment(br#"{"key": "value"]"#);

        assert!(!fragment.is_complete());
    }

    #[test]
    fn test_bracket_mismatch_nested() {
        // The inner array is never closed before the object's `}`.
        // Only the fragment count is pinned here; completeness is deliberately
        // left unasserted for this lenient-extraction case.
        sole_fragment(br#"{"array": [1, 2}"#);
    }

    #[test]
    fn test_bracket_mismatch_multiple_fragments() {
        // A mismatched array followed by a well-formed object
        let found = scan(br#"[1, 2} {"valid": true}"#);

        assert!(!found.is_empty());
        // The leading, mismatched fragment must not be reported as complete
        assert!(!found[0].is_complete());
    }

    #[test]
    fn test_null_byte_in_string() {
        // NUL byte embedded in a string value
        let mut payload = br#"{"text": "before"#.to_vec();
        payload.push(0);
        payload.extend_from_slice(br#"after"}"#);

        // Must not panic; completeness is not pinned
        sole_fragment(&payload);
    }

    #[test]
    fn test_control_characters_in_string() {
        // SOH (0x01) and SI (0x0F) embedded in a string value
        let mut payload = br#"{"text": "hello"#.to_vec();
        payload.extend_from_slice(&[0x01, 0x0F]);
        payload.extend_from_slice(br#"world"}"#);

        // Must not panic
        sole_fragment(&payload);
    }

    #[test]
    fn test_tab_and_newline_in_string() {
        // Raw tab and newline inside a string (not allowed by strict JSON)
        sole_fragment(b"{\"text\": \"line1\nline2\tindented\"}");
    }

    #[test]
    fn test_high_byte_values() {
        // Bytes 0xFF 0xFE 0xFD inside a string; they are not valid UTF-8
        let mut payload = br#"{"data": ""#.to_vec();
        payload.extend_from_slice(&[0xFF, 0xFE, 0xFD]);
        payload.extend_from_slice(br#""}"#);

        // Must not panic
        sole_fragment(&payload);
    }

    #[test]
    fn test_extreme_nesting_1000_levels() {
        // 1000 nested objects, each opened as `{"level_N":`
        let mut json: String = (0..1000)
            .map(|level| format!("{{\"level_{}\":", level))
            .collect();
        json.push_str("\"value\"");
        json.push_str(&"}".repeat(1000));

        // Extreme depth must neither panic nor overflow the stack
        sole_complete_fragment(json.as_bytes());
    }

    #[test]
    fn test_extreme_nesting_mixed_brackets() {
        // 500 levels alternating between objects (even depth) and arrays (odd depth)
        let openers: String = (0..500)
            .map(|depth| {
                if depth % 2 == 0 {
                    format!("{{\"key_{}\":", depth)
                } else {
                    String::from("[")
                }
            })
            .collect();
        // Close in reverse order so that every bracket matches its opener
        let closers: String = (0..500)
            .rev()
            .map(|depth| if depth % 2 == 0 { '}' } else { ']' })
            .collect();
        let json = format!("{}42{}", openers, closers);

        sole_complete_fragment(json.as_bytes());
    }
}
855
#[cfg(test)]
mod proptest_tests {
    use super::*;
    use proptest::prelude::*;

    /// Strategy for scalar JSON values rendered as text: a quoted lowercase
    /// string of 1-10 letters, an integer in `-1000..1000`, or one of
    /// `true` / `false` / `null`.
    fn scalar_value() -> impl Strategy<Value = String> {
        prop_oneof![
            "[a-z]{1,10}".prop_map(|text| format!("\"{}\"", text)),
            (-1000i32..1000i32).prop_map(|number| number.to_string()),
            prop_oneof![
                Just("true".to_string()),
                Just("false".to_string()),
                Just("null".to_string()),
            ],
        ]
    }

    /// Strategy for JSON text whose top level is an object or an array.
    ///
    /// Values are built recursively from [`scalar_value`]; any result that is
    /// still a bare scalar is filtered out.
    fn json_fragment() -> impl Strategy<Value = String> {
        scalar_value()
            .prop_recursive(
                4,  // max depth
                32, // max nodes
                10, // items per collection
                |element| {
                    // `[v1,v2,...]` with up to four elements
                    let arrays = prop::collection::vec(element.clone(), 0..5)
                        .prop_map(|items| format!("[{}]", items.join(",")));
                    // `{"k1":v1,...}` with up to four members, keys of 1-5 letters
                    let objects = prop::collection::vec(("[a-z]{1,5}", element.clone()), 0..5)
                        .prop_map(|members| {
                            let rendered: Vec<String> = members
                                .into_iter()
                                .map(|(key, value)| format!("\"{}\":{}", key, value))
                                .collect();
                            format!("{{{}}}", rendered.join(","))
                        });
                    prop_oneof![arrays, objects]
                },
            )
            .prop_filter("Must be object or array", |text| {
                text.starts_with('{') || text.starts_with('[')
            })
    }

    #[test]
    fn proptest_multiple_fragments() {
        proptest!(|(documents in prop::collection::vec(json_fragment(), 1..5))| {
            // Space-separated documents must come back one fragment each
            let input = documents.join(" ");

            let found = JsonFragmentScanner::scan_fragments(input.as_bytes());

            prop_assert_eq!(found.len(), documents.len());

            // Every generated document is well-formed, so each fragment must be complete
            for fragment in &found {
                prop_assert!(fragment.is_complete());
            }
        });
    }

    #[test]
    fn proptest_random_bytes_no_panic() {
        proptest!(|(noise in prop::collection::vec(any::<u8>(), 0..100))| {
            // Arbitrary bytes: only the absence of a panic is checked
            let _ = JsonFragmentScanner::scan_fragments(&noise);
        });
    }
}