Skip to main content

pjson_rs/parser/
sonic.rs

//! Hybrid parser using sonic-rs for SIMD acceleration
//!
//! This module provides a high-performance parser that combines:
//! - sonic-rs for SIMD-accelerated JSON parsing
//! - PJS semantic analysis for intelligent chunking
6
7use crate::{
8    error::{Error, Result},
9    frame::Frame,
10    frame::{FrameFlags, FrameHeader},
11    security::{DepthTracker, SecurityValidator},
12    semantic::{NumericDType, SemanticMeta, SemanticType},
13};
14use bytes::Bytes;
15use smallvec::SmallVec;
16use sonic_rs::{JsonContainerTrait, JsonNumberTrait, JsonValueTrait, Value as SonicValue};
17
/// Marker for the cold side of a branch.
///
/// Calling a `#[cold]` function tells the optimizer that the path leading to
/// the call is rarely taken. The body is intentionally empty.
#[inline]
#[cold]
fn cold_path() {}

/// Branch prediction hint for conditions that are expected to be `false`.
///
/// Returns `b` unchanged. When `b` is `true` the call routes through
/// [`cold_path`], which lets the compiler lay out the `true` branch as the
/// unlikely one on stable Rust (no intrinsics required).
#[inline(always)]
fn unlikely(b: bool) -> bool {
    if b {
        cold_path();
    }
    b
}
23
/// Tunable settings for [`SonicParser`].
#[derive(Clone, Debug)]
pub struct SonicConfig {
    /// When `true`, `parse` attempts to classify the parsed document
    /// (numeric array, time series, table, ...) and attaches the result to
    /// the returned frame.
    pub detect_semantics: bool,
    /// Upper bound, in bytes, on the input accepted by `parse`.
    pub max_input_size: usize,
}
32
33impl Default for SonicConfig {
34    fn default() -> Self {
35        Self {
36            detect_semantics: true,
37            max_input_size: 100 * 1024 * 1024, // 100MB
38        }
39    }
40}
41
42/// High-performance parser using sonic-rs with PJS semantic analysis
43pub struct SonicParser {
44    config: SonicConfig,
45    validator: SecurityValidator,
46    stats: std::cell::RefCell<SonicStats>,
47}
48
/// Counters describing the work a [`SonicParser`] has done so far.
///
/// A snapshot is obtained with `SonicParser::get_stats`.
#[derive(Clone, Debug, Default)]
pub struct SonicStats {
    /// Number of `parse` calls recorded.
    pub total_parses: u64,
    /// Number of parses completed by the sonic-rs backend.
    pub sonic_successes: u64,
    /// Number of parses that fell back to the serde backend.
    ///
    /// Nothing in this module increments this counter.
    pub serde_fallbacks: u64,
    /// Running mean parse duration, in nanoseconds.
    pub avg_parse_time_ns: u64,
    /// Sum of the input lengths, in bytes, of the recorded parses.
    pub bytes_processed: u64,
}
63
64impl SonicParser {
65    /// Create a new SonicParser with default configuration
66    pub fn new() -> Self {
67        Self {
68            config: SonicConfig::default(),
69            validator: SecurityValidator::default(),
70            stats: std::cell::RefCell::new(SonicStats::default()),
71        }
72    }
73
74    /// Create a new SonicParser with custom configuration
75    pub fn with_config(config: SonicConfig) -> Self {
76        Self {
77            config,
78            validator: SecurityValidator::default(),
79            stats: std::cell::RefCell::new(SonicStats::default()),
80        }
81    }
82
83    /// Create a new SonicParser with security configuration
84    pub fn with_security_config(
85        config: SonicConfig,
86        security_config: crate::config::SecurityConfig,
87    ) -> Self {
88        Self {
89            config,
90            validator: SecurityValidator::new(security_config),
91            stats: std::cell::RefCell::new(SonicStats::default()),
92        }
93    }
94
95    /// Parse JSON input using sonic-rs with PJS semantics (optimized)
96    pub fn parse(&self, input: &[u8]) -> Result<Frame> {
97        let start_time = std::time::Instant::now();
98
99        // Security validation: check input size
100        self.validator.validate_input_size(input.len())?;
101
102        // Fast path: size check with branch prediction hint
103        if unlikely(input.len() > self.config.max_input_size) {
104            return Err(Error::Other(format!("Input too large: {}", input.len())));
105        }
106
107        // UTF-8 validation (safe approach)
108        let json_str = std::str::from_utf8(input)
109            .map_err(|e| Error::Other(format!("Invalid UTF-8 input: {}", e)))?;
110
111        // Pre-validate JSON structure for safety (before parsing)
112        self.pre_validate_json_string(json_str)?;
113
114        // Parse with sonic-rs SIMD acceleration
115        let value: SonicValue =
116            sonic_rs::from_str(json_str).map_err(|e| Error::invalid_json(0, e.to_string()))?;
117
118        // Post-validate parsed JSON structure for additional checks
119        self.validate_json_structure(&value)?;
120
121        // Fast semantic detection (only if enabled and small overhead)
122        let semantic_type = if self.config.detect_semantics && input.len() < 100_000 {
123            self.detect_semantic_type_sonic(&value)
124        } else {
125            SemanticType::Generic
126        };
127
128        // Zero-copy payload using Bytes::from_static when possible
129        let payload = if input.len() < 4096 {
130            // For small inputs, copy is fast and reduces fragmentation
131            Bytes::copy_from_slice(input)
132        } else {
133            // For larger inputs, prefer zero-copy when possible
134            Bytes::from(input.to_vec()) // Will be optimized to zero-copy in many cases
135        };
136
137        // Minimal frame header for performance
138        let header = FrameHeader {
139            version: 1,
140            flags: FrameFlags::empty(),
141            sequence: 0,
142            length: input.len() as u32,
143            schema_id: 0,
144            checksum: 0,
145        };
146
147        let semantics = if semantic_type != SemanticType::Generic {
148            Some(SemanticMeta::new(semantic_type))
149        } else {
150            None
151        };
152
153        // Update statistics
154        {
155            let mut stats = self.stats.borrow_mut();
156            stats.total_parses += 1;
157            stats.sonic_successes += 1;
158            stats.bytes_processed += input.len() as u64;
159
160            let elapsed_ns = start_time.elapsed().as_nanos() as u64;
161            stats.avg_parse_time_ns = (stats.avg_parse_time_ns * (stats.total_parses - 1)
162                + elapsed_ns)
163                / stats.total_parses;
164        }
165
166        Ok(Frame {
167            header,
168            payload,
169            semantics,
170        })
171    }
172
173    /// Get performance statistics
174    pub fn get_stats(&self) -> SonicStats {
175        self.stats.borrow().clone()
176    }
177
178    /// Pre-validate JSON string for basic safety checks (before parsing)
179    fn pre_validate_json_string(&self, json_str: &str) -> Result<()> {
180        // Count nesting depth by counting braces/brackets
181        let mut depth = 0;
182        let mut max_depth = 0;
183
184        for ch in json_str.chars() {
185            match ch {
186                '{' | '[' => {
187                    depth += 1;
188                    max_depth = max_depth.max(depth);
189                    self.validator.validate_json_depth(max_depth)?;
190                }
191                '}' | ']' => {
192                    depth = depth.saturating_sub(1);
193                }
194                _ => {}
195            }
196        }
197
198        Ok(())
199    }
200
201    /// Validate JSON structure for security (depth, complexity, etc.)
202    fn validate_json_structure(&self, value: &SonicValue) -> Result<()> {
203        let mut depth_tracker = DepthTracker::default();
204        self.validate_json_recursive(value, &mut depth_tracker)
205    }
206
207    /// Recursively validate JSON structure
208    fn validate_json_recursive(
209        &self,
210        value: &SonicValue,
211        depth_tracker: &mut DepthTracker,
212    ) -> Result<()> {
213        match value {
214            _ if value.is_object() => {
215                depth_tracker.enter()?;
216
217                if let Some(obj) = value.as_object() {
218                    // Validate object key count
219                    self.validator.validate_object_keys(obj.len())?;
220
221                    // Recursively validate object values
222                    for (key, val) in obj.iter() {
223                        // Validate key length
224                        self.validator.validate_string_length(key.len())?;
225                        self.validate_json_recursive(val, depth_tracker)?;
226                    }
227                }
228
229                depth_tracker.exit();
230            }
231            _ if value.is_array() => {
232                depth_tracker.enter()?;
233
234                if let Some(arr) = value.as_array() {
235                    // Validate array length
236                    self.validator.validate_array_length(arr.len())?;
237
238                    // Recursively validate array elements
239                    for element in arr.iter() {
240                        self.validate_json_recursive(element, depth_tracker)?;
241                    }
242                }
243
244                depth_tracker.exit();
245            }
246            _ if value.is_str() => {
247                if let Some(s) = value.as_str() {
248                    self.validator.validate_string_length(s.len())?;
249                }
250            }
251            _ => {
252                // Numbers, booleans, null are always valid
253            }
254        }
255
256        Ok(())
257    }
258
259    /// Detect semantic type using sonic-rs Value with SIMD acceleration
260    fn detect_semantic_type_sonic(&self, value: &SonicValue) -> SemanticType {
261        if value.is_array()
262            && let Some(arr) = value.as_array()
263        {
264            return self.analyze_array_semantics_simd(arr);
265        }
266
267        if value.is_object()
268            && let Some(obj) = value.as_object()
269        {
270            return self.analyze_object_semantics_simd(obj);
271        }
272
273        SemanticType::Generic
274    }
275
276    /// SIMD-optimized object semantic analysis
277    fn analyze_object_semantics_simd(&self, obj: &sonic_rs::Object) -> SemanticType {
278        let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
279
280        // Fast GeoJSON detection
281        if scan_result.has_type_field && scan_result.has_coordinates {
282            return SemanticType::Geospatial {
283                coordinate_system: "WGS84".to_string(),
284                geometry_type: obj
285                    .get(&"type")
286                    .and_then(|v| v.as_str())
287                    .unwrap_or("Point")
288                    .to_string(),
289            };
290        }
291
292        // Fast time series detection
293        if scan_result.has_timestamp {
294            let timestamp_field = if obj.contains_key(&"timestamp") {
295                "timestamp"
296            } else {
297                "time"
298            };
299
300            // Find numeric value fields efficiently
301            let value_fields: SmallVec<[String; 4]> = obj
302                .iter()
303                .filter_map(|(k, v)| {
304                    if k != timestamp_field && v.is_number() {
305                        Some(k.to_string())
306                    } else {
307                        None
308                    }
309                })
310                .collect();
311
312            if !value_fields.is_empty() {
313                return SemanticType::TimeSeries {
314                    timestamp_field: timestamp_field.to_string(),
315                    value_fields,
316                    interval_ms: None,
317                };
318            }
319        }
320
321        SemanticType::Generic
322    }
323
324    /// SIMD-optimized array semantic analysis
325    fn analyze_array_semantics_simd(&self, arr: &sonic_rs::Array) -> SemanticType {
326        let len = arr.len();
327        if len == 0 {
328            return SemanticType::Generic;
329        }
330
331        // Fast numeric array detection using SIMD
332        if crate::parser::simd::SimdClassifier::is_numeric_array(arr) {
333            let dtype = if let Some(first) = arr.first() {
334                if let Some(num) = first.as_number() {
335                    if num.is_i64() {
336                        NumericDType::I64
337                    } else if num.is_u64() {
338                        NumericDType::U64
339                    } else {
340                        NumericDType::F64
341                    }
342                } else {
343                    NumericDType::F64
344                }
345            } else {
346                NumericDType::F64
347            };
348
349            return SemanticType::NumericArray {
350                dtype,
351                length: Some(len),
352            };
353        }
354
355        // Fast time series detection
356        if len >= 2 {
357            let mut is_time_series = true;
358
359            // Use early exit strategy for performance
360            for value in arr.iter() {
361                if let Some(obj) = value.as_object() {
362                    let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
363                    if !scan_result.has_timestamp {
364                        is_time_series = false;
365                        break;
366                    }
367                } else {
368                    is_time_series = false;
369                    break;
370                }
371            }
372
373            if is_time_series {
374                return SemanticType::TimeSeries {
375                    timestamp_field: "timestamp".to_string(),
376                    value_fields: SmallVec::from_vec(vec!["value".to_string()]),
377                    interval_ms: None,
378                };
379            }
380        }
381
382        // Check for tabular data (array of objects with similar structure)
383        if len >= 3
384            && arr.iter().all(|v| v.is_object())
385            && let Some(first_obj) = arr.first().and_then(|v| v.as_object())
386        {
387            let first_scan = crate::parser::simd::SimdClassifier::scan_object_keys(first_obj);
388
389            // Simple homogeneity check - all objects should have similar key counts
390            let is_tabular = arr.iter().skip(1).filter_map(|v| v.as_object()).all(|obj| {
391                let scan = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
392                // Allow some variation (±20%)
393                let diff = scan.key_count as i32 - first_scan.key_count as i32;
394                diff.abs() <= (first_scan.key_count as i32 / 5)
395            });
396
397            if is_tabular {
398                // Extract columns from first object
399                let columns: SmallVec<[crate::semantic::ColumnMeta; 16]> = first_obj
400                    .iter()
401                    .map(|(k, v)| {
402                        let column_type = if v.is_number() {
403                            crate::semantic::ColumnType::Numeric(NumericDType::F64)
404                        } else if v.is_str() {
405                            crate::semantic::ColumnType::String
406                        } else if v.as_bool().is_some() {
407                            crate::semantic::ColumnType::Boolean
408                        } else {
409                            crate::semantic::ColumnType::Json
410                        };
411
412                        crate::semantic::ColumnMeta {
413                            name: k.to_string(),
414                            dtype: column_type,
415                            nullable: false,
416                        }
417                    })
418                    .collect();
419
420                return SemanticType::Table {
421                    columns: Box::new(columns),
422                    row_count: Some(len),
423                };
424            }
425        }
426
427        SemanticType::Generic
428    }
429}
430
431impl Default for SonicParser {
432    fn default() -> Self {
433        Self::new()
434    }
435}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `json` with a default parser and return the detected semantic
    /// type, or `None` when the frame carries no semantic metadata.
    fn detect(json: &[u8]) -> Option<SemanticType> {
        SonicParser::new()
            .parse(json)
            .expect("input should parse")
            .semantics
            .map(|meta| meta.semantic_type)
    }

    #[test]
    fn test_sonic_parser_creation() {
        let parser = SonicParser::new();
        assert!(parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 100 * 1024 * 1024);
    }

    #[test]
    fn test_sonic_basic_parsing() {
        let parser = SonicParser::new();
        let json = br#"{"name": "test", "value": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert_eq!(frame.header.version, 1);
        assert_eq!(frame.payload.len(), json.len());
    }

    #[test]
    fn test_sonic_numeric_array_detection() {
        let json = br#"[1.5, 2.7, 3.14, 4.2, 5.1]"#;

        assert!(matches!(
            detect(json),
            Some(SemanticType::NumericArray { .. })
        ));
    }

    #[test]
    fn test_sonic_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        assert!(matches!(
            detect(json),
            Some(SemanticType::TimeSeries { .. })
        ));
    }

    #[test]
    fn test_sonic_performance_config() {
        let config = SonicConfig {
            detect_semantics: false,
            max_input_size: 1024,
        };

        let parser = SonicParser::with_config(config);
        assert!(!parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 1024);
    }

    #[test]
    fn test_sonic_invalid_utf8_handling() {
        let parser = SonicParser::new();
        // None of these bytes can start a valid UTF-8 sequence
        let invalid_utf8 = &[0xFF, 0xFE, 0xFD];

        let result = parser.parse(invalid_utf8);
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Invalid UTF-8"));
    }

    #[test]
    fn test_sonic_input_size_limit() {
        let config = SonicConfig {
            detect_semantics: true,
            max_input_size: 10, // Very small limit
        };
        let parser = SonicParser::with_config(config);

        let large_json = b"[1,2,3,4,5,6,7,8,9,10]"; // Exceeds 10 bytes
        let result = parser.parse(large_json);

        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Input size") || error_msg.contains("Input too large"));
    }

    #[test]
    fn test_sonic_json_depth_validation() {
        let parser = SonicParser::new();

        // 65 levels of nesting: one more than the limit of 64, yet shallow
        // enough not to risk a stack overflow
        let levels = 65;
        let mut json = String::new();
        for _ in 0..levels {
            json.push_str("{\"a\":");
        }
        json.push_str("\"value\"");
        for _ in 0..levels {
            json.push('}');
        }

        let result = parser.parse(json.as_bytes());
        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("depth"));
    }

    #[test]
    fn test_sonic_large_string_validation() {
        let parser = SonicParser::new();

        // JSON with a very large (11MB) string value
        let large_string = "a".repeat(11 * 1024 * 1024);
        let json = format!("{{\"key\": \"{}\"}}", large_string);

        let result = parser.parse(json.as_bytes());
        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("String length"));
    }

    #[test]
    fn test_sonic_large_array_validation() {
        let parser = SonicParser::new();

        // 1001 elements: a sizeable array that must still be accepted.
        // Exercising the rejection path would need an array beyond the
        // validator's limit, which is too slow for a unit test.
        let elements: Vec<String> = (0..1001).map(|i| i.to_string()).collect();
        let json = format!("[{}]", elements.join(","));

        let result = parser.parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_many_object_keys_validation() {
        let parser = SonicParser::new();

        // 1000 keys: well under the limit, so parsing must succeed
        let members: Vec<String> = (0..1000).map(|i| format!("\"key{}\": {}", i, i)).collect();
        let json = format!("{{{}}}", members.join(","));

        let result = parser.parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_geojson_detection() {
        let json = br#"{
            "type": "Point",
            "coordinates": [125.6, 10.1]
        }"#;

        assert!(matches!(
            detect(json),
            Some(SemanticType::Geospatial { .. })
        ));
    }

    #[test]
    fn test_sonic_timeseries_with_time_field() {
        let json = br#"{
            "time": "2023-01-01T00:00:00Z",
            "temperature": 25.5,
            "humidity": 60.2
        }"#;

        match detect(json) {
            Some(SemanticType::TimeSeries {
                timestamp_field, ..
            }) => assert_eq!(timestamp_field, "time"),
            _ => panic!("Expected TimeSeries semantic type"),
        }
    }

    #[test]
    fn test_sonic_large_input_skips_semantics() {
        let parser = SonicParser::new();

        // Pad the document past the 100_000-byte detection threshold. The
        // numeric "value" field would make this a time series if detection
        // ran, so `None` below really proves it was skipped.
        let padding = "x".repeat(150_000);
        let json = format!(
            r#"{{
            "timestamp": "2023-01-01T00:00:00Z",
            "value": 1.5,
            "data": "{}"
        }}"#,
            padding
        );
        assert!(json.len() >= 100_000);

        let result = parser.parse(json.as_bytes()).unwrap();
        assert!(result.semantics.is_none());
    }

    #[test]
    fn test_sonic_tabular_data_detection() {
        let json = br#"[
            {"id": 1, "name": "Alice", "age": 30},
            {"id": 2, "name": "Bob", "age": 25},
            {"id": 3, "name": "Charlie", "age": 35}
        ]"#;

        assert!(matches!(detect(json), Some(SemanticType::Table { .. })));
    }

    #[test]
    fn test_sonic_non_tabular_heterogeneous_array() {
        // Rows with differing key counts must not be reported as a table;
        // any other classification (or none) is acceptable.
        let json = br#"[
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob", "extra": "field"},
            {"completely": "different"}
        ]"#;

        assert!(!matches!(detect(json), Some(SemanticType::Table { .. })));
    }
}