Skip to main content

pjson_rs/parser/
sonic.rs

1//! Hybrid parser using sonic-rs for SIMD acceleration
2//!
3//! This module provides a high-performance parser that combines:
4//! - sonic-rs for SIMD-accelerated JSON parsing
5//! - PJS semantic analysis for intelligent chunking
6
7use crate::{
8    error::{Error, Result},
9    frame::Frame,
10    frame::{FrameFlags, FrameHeader},
11    security::{DepthTracker, SecurityValidator},
12    semantic::{NumericDType, SemanticMeta, SemanticType},
13};
14use bytes::Bytes;
15use smallvec::SmallVec;
16use sonic_rs::{JsonContainerTrait, JsonNumberTrait, JsonValueTrait, Value as SonicValue};
17
/// Branch-prediction hint: marks the `true` outcome as the rarely-taken one.
///
/// Always returns `b` unchanged. When `b` is `true`, an empty `#[cold]`
/// function is called first; routing a path through a cold call is the
/// stable-Rust way to tell the optimizer that the path is unlikely, so the
/// common (`false`) path can be laid out as the fall-through.
#[inline(always)]
fn unlikely(b: bool) -> bool {
    /// Empty marker whose only purpose is to carry the `#[cold]` attribute.
    #[cold]
    #[inline]
    fn cold_path() {}

    if b {
        cold_path();
    }
    b
}
23
/// Configuration for the sonic hybrid parser.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SonicConfig {
    /// When `true`, the parser attempts to classify the parsed document
    /// (numeric array, time series, table, ...) and attaches the result to
    /// the produced frame.
    pub detect_semantics: bool,
    /// Maximum accepted input size in bytes; larger inputs are rejected
    /// before any parsing takes place.
    pub max_input_size: usize,
}

impl SonicConfig {
    /// Input size limit used by [`SonicConfig::default`]: 100 MiB.
    pub const DEFAULT_MAX_INPUT_SIZE: usize = 100 * 1024 * 1024;
}

impl Default for SonicConfig {
    /// Semantic detection enabled, input capped at
    /// [`SonicConfig::DEFAULT_MAX_INPUT_SIZE`].
    fn default() -> Self {
        Self {
            detect_semantics: true,
            max_input_size: Self::DEFAULT_MAX_INPUT_SIZE,
        }
    }
}
41
42/// High-performance parser using sonic-rs with PJS semantic analysis
43pub struct SonicParser {
44    config: SonicConfig,
45    validator: SecurityValidator,
46    stats: std::cell::RefCell<SonicStats>,
47}
48
/// Performance statistics for the sonic parser.
///
/// A snapshot is obtained with `SonicParser::get_stats`; all counters start
/// at zero.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct SonicStats {
    /// Number of parses recorded. The parser in this file records a parse
    /// only when it completes successfully.
    pub total_parses: u64,
    /// Number of parses completed by sonic-rs.
    pub sonic_successes: u64,
    /// Number of parses that fell back to serde. Nothing in this file
    /// increments it.
    pub serde_fallbacks: u64,
    /// Running mean of the wall-clock time per recorded parse, in
    /// nanoseconds (integer division, so fractions are dropped).
    pub avg_parse_time_ns: u64,
    /// Sum of the input lengths, in bytes, of all recorded parses.
    pub bytes_processed: u64,
}
58
59impl SonicParser {
60    /// Create a new SonicParser with default configuration
61    pub fn new() -> Self {
62        Self {
63            config: SonicConfig::default(),
64            validator: SecurityValidator::default(),
65            stats: std::cell::RefCell::new(SonicStats::default()),
66        }
67    }
68
69    /// Create a new SonicParser with custom configuration
70    pub fn with_config(config: SonicConfig) -> Self {
71        Self {
72            config,
73            validator: SecurityValidator::default(),
74            stats: std::cell::RefCell::new(SonicStats::default()),
75        }
76    }
77
78    /// Create a new SonicParser with security configuration
79    pub fn with_security_config(
80        config: SonicConfig,
81        security_config: crate::config::SecurityConfig,
82    ) -> Self {
83        Self {
84            config,
85            validator: SecurityValidator::new(security_config),
86            stats: std::cell::RefCell::new(SonicStats::default()),
87        }
88    }
89
90    /// Parse JSON input using sonic-rs with PJS semantics (optimized)
91    pub fn parse(&self, input: &[u8]) -> Result<Frame> {
92        let start_time = std::time::Instant::now();
93
94        // Security validation: check input size
95        self.validator.validate_input_size(input.len())?;
96
97        // Fast path: size check with branch prediction hint
98        if unlikely(input.len() > self.config.max_input_size) {
99            return Err(Error::Other(format!("Input too large: {}", input.len())));
100        }
101
102        // UTF-8 validation (safe approach)
103        let json_str = std::str::from_utf8(input)
104            .map_err(|e| Error::Other(format!("Invalid UTF-8 input: {}", e)))?;
105
106        // Pre-validate JSON structure for safety (before parsing)
107        self.pre_validate_json_string(json_str)?;
108
109        // Parse with sonic-rs SIMD acceleration
110        let value: SonicValue =
111            sonic_rs::from_str(json_str).map_err(|e| Error::invalid_json(0, e.to_string()))?;
112
113        // Post-validate parsed JSON structure for additional checks
114        self.validate_json_structure(&value)?;
115
116        // Fast semantic detection (only if enabled and small overhead)
117        let semantic_type = if self.config.detect_semantics && input.len() < 100_000 {
118            self.detect_semantic_type_sonic(&value)
119        } else {
120            SemanticType::Generic
121        };
122
123        // Zero-copy payload using Bytes::from_static when possible
124        let payload = if input.len() < 4096 {
125            // For small inputs, copy is fast and reduces fragmentation
126            Bytes::copy_from_slice(input)
127        } else {
128            // For larger inputs, prefer zero-copy when possible
129            Bytes::from(input.to_vec()) // Will be optimized to zero-copy in many cases
130        };
131
132        // Minimal frame header for performance
133        let header = FrameHeader {
134            version: 1,
135            flags: FrameFlags::empty(),
136            sequence: 0,
137            length: input.len() as u32,
138            schema_id: 0,
139            checksum: 0,
140        };
141
142        let semantics = if semantic_type != SemanticType::Generic {
143            Some(SemanticMeta::new(semantic_type))
144        } else {
145            None
146        };
147
148        // Update statistics
149        {
150            let mut stats = self.stats.borrow_mut();
151            stats.total_parses += 1;
152            stats.sonic_successes += 1;
153            stats.bytes_processed += input.len() as u64;
154
155            let elapsed_ns = start_time.elapsed().as_nanos() as u64;
156            stats.avg_parse_time_ns = (stats.avg_parse_time_ns * (stats.total_parses - 1)
157                + elapsed_ns)
158                / stats.total_parses;
159        }
160
161        Ok(Frame {
162            header,
163            payload,
164            semantics,
165        })
166    }
167
168    /// Get performance statistics
169    pub fn get_stats(&self) -> SonicStats {
170        self.stats.borrow().clone()
171    }
172
173    /// Pre-validate JSON string for basic safety checks (before parsing)
174    fn pre_validate_json_string(&self, json_str: &str) -> Result<()> {
175        // Count nesting depth by counting braces/brackets
176        let mut depth = 0;
177        let mut max_depth = 0;
178
179        for ch in json_str.chars() {
180            match ch {
181                '{' | '[' => {
182                    depth += 1;
183                    max_depth = max_depth.max(depth);
184                    self.validator.validate_json_depth(max_depth)?;
185                }
186                '}' | ']' => {
187                    depth = depth.saturating_sub(1);
188                }
189                _ => {}
190            }
191        }
192
193        Ok(())
194    }
195
196    /// Validate JSON structure for security (depth, complexity, etc.)
197    fn validate_json_structure(&self, value: &SonicValue) -> Result<()> {
198        let mut depth_tracker = DepthTracker::default();
199        self.validate_json_recursive(value, &mut depth_tracker)
200    }
201
202    /// Recursively validate JSON structure
203    fn validate_json_recursive(
204        &self,
205        value: &SonicValue,
206        depth_tracker: &mut DepthTracker,
207    ) -> Result<()> {
208        match value {
209            _ if value.is_object() => {
210                depth_tracker.enter()?;
211
212                if let Some(obj) = value.as_object() {
213                    // Validate object key count
214                    self.validator.validate_object_keys(obj.len())?;
215
216                    // Recursively validate object values
217                    for (key, val) in obj.iter() {
218                        // Validate key length
219                        self.validator.validate_string_length(key.len())?;
220                        self.validate_json_recursive(val, depth_tracker)?;
221                    }
222                }
223
224                depth_tracker.exit();
225            }
226            _ if value.is_array() => {
227                depth_tracker.enter()?;
228
229                if let Some(arr) = value.as_array() {
230                    // Validate array length
231                    self.validator.validate_array_length(arr.len())?;
232
233                    // Recursively validate array elements
234                    for element in arr.iter() {
235                        self.validate_json_recursive(element, depth_tracker)?;
236                    }
237                }
238
239                depth_tracker.exit();
240            }
241            _ if value.is_str() => {
242                if let Some(s) = value.as_str() {
243                    self.validator.validate_string_length(s.len())?;
244                }
245            }
246            _ => {
247                // Numbers, booleans, null are always valid
248            }
249        }
250
251        Ok(())
252    }
253
254    /// Detect semantic type using sonic-rs Value with SIMD acceleration
255    fn detect_semantic_type_sonic(&self, value: &SonicValue) -> SemanticType {
256        if value.is_array()
257            && let Some(arr) = value.as_array()
258        {
259            return self.analyze_array_semantics_simd(arr);
260        }
261
262        if value.is_object()
263            && let Some(obj) = value.as_object()
264        {
265            return self.analyze_object_semantics_simd(obj);
266        }
267
268        SemanticType::Generic
269    }
270
271    /// SIMD-optimized object semantic analysis
272    fn analyze_object_semantics_simd(&self, obj: &sonic_rs::Object) -> SemanticType {
273        let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
274
275        // Fast GeoJSON detection
276        if scan_result.has_type_field && scan_result.has_coordinates {
277            return SemanticType::Geospatial {
278                coordinate_system: "WGS84".to_string(),
279                geometry_type: obj
280                    .get(&"type")
281                    .and_then(|v| v.as_str())
282                    .unwrap_or("Point")
283                    .to_string(),
284            };
285        }
286
287        // Fast time series detection
288        if scan_result.has_timestamp {
289            let timestamp_field = if obj.contains_key(&"timestamp") {
290                "timestamp"
291            } else {
292                "time"
293            };
294
295            // Find numeric value fields efficiently
296            let value_fields: SmallVec<[String; 4]> = obj
297                .iter()
298                .filter_map(|(k, v)| {
299                    if k != timestamp_field && v.is_number() {
300                        Some(k.to_string())
301                    } else {
302                        None
303                    }
304                })
305                .collect();
306
307            if !value_fields.is_empty() {
308                return SemanticType::TimeSeries {
309                    timestamp_field: timestamp_field.to_string(),
310                    value_fields,
311                    interval_ms: None,
312                };
313            }
314        }
315
316        SemanticType::Generic
317    }
318
319    /// SIMD-optimized array semantic analysis
320    fn analyze_array_semantics_simd(&self, arr: &sonic_rs::Array) -> SemanticType {
321        let len = arr.len();
322        if len == 0 {
323            return SemanticType::Generic;
324        }
325
326        // Fast numeric array detection using SIMD
327        if crate::parser::simd::SimdClassifier::is_numeric_array(arr) {
328            let dtype = if let Some(first) = arr.first() {
329                if let Some(num) = first.as_number() {
330                    if num.is_i64() {
331                        NumericDType::I64
332                    } else if num.is_u64() {
333                        NumericDType::U64
334                    } else {
335                        NumericDType::F64
336                    }
337                } else {
338                    NumericDType::F64
339                }
340            } else {
341                NumericDType::F64
342            };
343
344            return SemanticType::NumericArray {
345                dtype,
346                length: Some(len),
347            };
348        }
349
350        // Fast time series detection
351        if len >= 2 {
352            let mut is_time_series = true;
353
354            // Use early exit strategy for performance
355            for value in arr.iter() {
356                if let Some(obj) = value.as_object() {
357                    let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
358                    if !scan_result.has_timestamp {
359                        is_time_series = false;
360                        break;
361                    }
362                } else {
363                    is_time_series = false;
364                    break;
365                }
366            }
367
368            if is_time_series {
369                return SemanticType::TimeSeries {
370                    timestamp_field: "timestamp".to_string(),
371                    value_fields: SmallVec::from_vec(vec!["value".to_string()]),
372                    interval_ms: None,
373                };
374            }
375        }
376
377        // Check for tabular data (array of objects with similar structure)
378        if len >= 3
379            && arr.iter().all(|v| v.is_object())
380            && let Some(first_obj) = arr.first().and_then(|v| v.as_object())
381        {
382            let first_scan = crate::parser::simd::SimdClassifier::scan_object_keys(first_obj);
383
384            // Simple homogeneity check - all objects should have similar key counts
385            let is_tabular = arr.iter().skip(1).filter_map(|v| v.as_object()).all(|obj| {
386                let scan = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
387                // Allow some variation (±20%)
388                let diff = scan.key_count as i32 - first_scan.key_count as i32;
389                diff.abs() <= (first_scan.key_count as i32 / 5)
390            });
391
392            if is_tabular {
393                // Extract columns from first object
394                let columns: SmallVec<[crate::semantic::ColumnMeta; 16]> = first_obj
395                    .iter()
396                    .map(|(k, v)| {
397                        let column_type = if v.is_number() {
398                            crate::semantic::ColumnType::Numeric(NumericDType::F64)
399                        } else if v.is_str() {
400                            crate::semantic::ColumnType::String
401                        } else if v.as_bool().is_some() {
402                            crate::semantic::ColumnType::Boolean
403                        } else {
404                            crate::semantic::ColumnType::Json
405                        };
406
407                        crate::semantic::ColumnMeta {
408                            name: k.to_string(),
409                            dtype: column_type,
410                            nullable: false,
411                        }
412                    })
413                    .collect();
414
415                return SemanticType::Table {
416                    columns: Box::new(columns),
417                    row_count: Some(len),
418                };
419            }
420        }
421
422        SemanticType::Generic
423    }
424}
425
426impl Default for SonicParser {
427    fn default() -> Self {
428        Self::new()
429    }
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `json` with a default parser and return the detected semantic
    /// type, or `None` when the frame carries no semantic metadata.
    fn detect(json: &[u8]) -> Option<SemanticType> {
        SonicParser::new()
            .parse(json)
            .expect("input should parse")
            .semantics
            .map(|meta| meta.semantic_type)
    }

    /// Build a timestamped object with one numeric field and a string field
    /// of `padding_len` bytes, used to control the total input size.
    fn timestamped_object(padding_len: usize) -> String {
        format!(
            r#"{{"timestamp": "2023-01-01T00:00:00Z", "value": 1.5, "data": "{}"}}"#,
            "x".repeat(padding_len)
        )
    }

    #[test]
    fn test_sonic_parser_creation() {
        let parser = SonicParser::new();
        assert!(parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 100 * 1024 * 1024);
    }

    #[test]
    fn test_sonic_basic_parsing() {
        let parser = SonicParser::new();
        let json = br#"{"name": "test", "value": 42}"#;

        let result = parser.parse(json);
        assert!(result.is_ok());

        let frame = result.unwrap();
        assert_eq!(frame.header.version, 1);
        assert_eq!(frame.payload.len(), json.len());
    }

    #[test]
    fn test_sonic_numeric_array_detection() {
        let json = br#"[1.5, 2.7, 3.14, 4.2, 5.1]"#;

        match detect(json) {
            Some(semantic_type) => {
                assert!(matches!(semantic_type, SemanticType::NumericArray { .. }));
            }
            None => panic!("Expected semantic metadata"),
        }
    }

    #[test]
    fn test_sonic_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        match detect(json) {
            Some(semantic_type) => {
                assert!(matches!(semantic_type, SemanticType::TimeSeries { .. }));
            }
            None => panic!("Expected semantic metadata"),
        }
    }

    #[test]
    fn test_sonic_performance_config() {
        let config = SonicConfig {
            detect_semantics: false,
            max_input_size: 1024,
        };

        let parser = SonicParser::with_config(config);
        assert!(!parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 1024);
    }

    #[test]
    fn test_sonic_invalid_utf8_handling() {
        let parser = SonicParser::new();
        // 0xFF can never appear in well-formed UTF-8.
        let invalid_utf8 = [0xFF_u8, 0xFE, 0xFD];

        let result = parser.parse(&invalid_utf8);
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Invalid UTF-8"));
    }

    #[test]
    fn test_sonic_input_size_limit() {
        let config = SonicConfig {
            detect_semantics: true,
            max_input_size: 10, // Very small limit
        };
        let parser = SonicParser::with_config(config);

        let large_json = b"[1,2,3,4,5,6,7,8,9,10]"; // Exceeds 10 bytes
        let result = parser.parse(large_json);

        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Input size") || error_msg.contains("Input too large"));
    }

    #[test]
    fn test_sonic_json_depth_validation() {
        let parser = SonicParser::new();

        // 65 nested objects: one more than the limit of 64, yet shallow
        // enough that nothing can overflow the stack.
        const LEVELS: usize = 65;
        let json = format!(
            "{}\"value\"{}",
            "{\"a\":".repeat(LEVELS),
            "}".repeat(LEVELS)
        );

        let result = parser.parse(json.as_bytes());
        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("depth"));
    }

    #[test]
    fn test_sonic_large_string_validation() {
        let parser = SonicParser::new();

        // Create JSON with very large string value
        let large_string = "a".repeat(11 * 1024 * 1024); // 11MB string
        let json = format!("{{\"key\": \"{}\"}}", large_string);

        let result = parser.parse(json.as_bytes());
        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("String length"));
    }

    #[test]
    fn test_sonic_large_array_validation() {
        let parser = SonicParser::new();

        // 1001 elements: large enough to exercise the array length check,
        // small enough to stay well under the limit and keep the test fast.
        let elements: Vec<String> = (0..1001).map(|i| i.to_string()).collect();
        let json = format!("[{}]", elements.join(","));

        let result = parser.parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_many_object_keys_validation() {
        let parser = SonicParser::new();

        // 1000 keys: well under the limit, so parsing must succeed.
        let entries: Vec<String> = (0..1000)
            .map(|i| format!("\"key{}\": {}", i, i))
            .collect();
        let json = format!("{{{}}}", entries.join(","));

        let result = parser.parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_geojson_detection() {
        let json = br#"{
            "type": "Point",
            "coordinates": [125.6, 10.1]
        }"#;

        let detected = detect(json);
        assert!(detected.is_some());
        assert!(matches!(detected, Some(SemanticType::Geospatial { .. })));
    }

    #[test]
    fn test_sonic_timeseries_with_time_field() {
        let json = br#"{
            "time": "2023-01-01T00:00:00Z",
            "temperature": 25.5,
            "humidity": 60.2
        }"#;

        let detected = detect(json);
        assert!(detected.is_some());
        match detected {
            Some(SemanticType::TimeSeries {
                timestamp_field, ..
            }) => assert_eq!(timestamp_field, "time"),
            _ => panic!("Expected TimeSeries semantic type"),
        }
    }

    #[test]
    fn test_sonic_large_input_skips_semantics() {
        // Control: a small document of this shape IS classified, so a
        // missing classification below can only be caused by the size.
        let small = timestamped_object(10);
        assert!(matches!(
            detect(small.as_bytes()),
            Some(SemanticType::TimeSeries { .. })
        ));

        // Same shape, padded past the 100 000-byte detection threshold.
        let large = timestamped_object(150_000);
        assert!(large.len() >= 100_000);

        let result = SonicParser::new().parse(large.as_bytes()).unwrap();
        // Semantic detection should be skipped for large inputs
        assert!(result.semantics.is_none());
    }

    #[test]
    fn test_sonic_tabular_data_detection() {
        let json = br#"[
            {"id": 1, "name": "Alice", "age": 30},
            {"id": 2, "name": "Bob", "age": 25},
            {"id": 3, "name": "Charlie", "age": 35}
        ]"#;

        let detected = detect(json);
        assert!(detected.is_some());
        assert!(matches!(detected, Some(SemanticType::Table { .. })));
    }

    #[test]
    fn test_sonic_non_tabular_heterogeneous_array() {
        // Array with different structures - should not be detected as tabular
        let json = br#"[
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob", "extra": "field"},
            {"completely": "different"}
        ]"#;

        // Any classification other than Table (including none) is fine.
        assert!(!matches!(detect(json), Some(SemanticType::Table { .. })));
    }
}