1use crate::{
8 error::{Error, Result},
9 frame::Frame,
10 frame::{FrameFlags, FrameHeader},
11 security::{DepthTracker, SecurityValidator},
12 semantic::{NumericDType, SemanticMeta, SemanticType},
13};
14use bytes::Bytes;
15use smallvec::SmallVec;
16use sonic_rs::{JsonContainerTrait, JsonNumberTrait, JsonValueTrait, Value as SonicValue};
17
/// Branch hint: tells the optimizer that `b` is expected to be `false`.
///
/// The argument is returned unchanged, so wrapping a condition in this call
/// never alters program logic. The hint is expressed on stable Rust by calling
/// an empty `#[cold]` function on the `true` path; it is advisory only.
#[inline(always)]
fn unlikely(b: bool) -> bool {
    // Empty on purpose: it exists only to carry the `#[cold]` attribute.
    #[cold]
    #[inline]
    fn cold_path() {}

    if b {
        cold_path();
    }
    b
}
23
/// Options controlling how [`SonicParser`] processes an input buffer.
#[derive(Debug, Clone)]
pub struct SonicConfig {
    /// When `true`, `parse` tries to classify the parsed document
    /// (numeric array, time series, table, ...) and attaches the result.
    pub detect_semantics: bool,
    /// Largest input, in bytes, that `parse` accepts.
    pub max_input_size: usize,
}

impl Default for SonicConfig {
    /// Semantic detection enabled and inputs limited to 100 MiB.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;

        SonicConfig {
            detect_semantics: true,
            max_input_size: 100 * MIB,
        }
    }
}
41
42pub struct SonicParser {
44 config: SonicConfig,
45 validator: SecurityValidator,
46 stats: std::cell::RefCell<SonicStats>,
47}
48
/// Counters accumulated by `SonicParser::parse`.
#[derive(Debug, Default, Clone)]
pub struct SonicStats {
    /// Number of `parse` calls that ran to completion.
    pub total_parses: u64,
    /// Number of parses completed by the sonic-rs path; bumped together with
    /// `total_parses`.
    pub sonic_successes: u64,
    /// Number of parses that fell back to serde.
    /// NOTE(review): nothing in this file writes this field - confirm it is
    /// updated elsewhere or drop it.
    pub serde_fallbacks: u64,
    /// Running mean of the wall-clock time per completed parse, in nanoseconds.
    pub avg_parse_time_ns: u64,
    /// Sum of the input lengths of all completed parses, in bytes.
    pub bytes_processed: u64,
}
63
64impl SonicParser {
65 pub fn new() -> Self {
67 Self {
68 config: SonicConfig::default(),
69 validator: SecurityValidator::default(),
70 stats: std::cell::RefCell::new(SonicStats::default()),
71 }
72 }
73
74 pub fn with_config(config: SonicConfig) -> Self {
76 Self {
77 config,
78 validator: SecurityValidator::default(),
79 stats: std::cell::RefCell::new(SonicStats::default()),
80 }
81 }
82
83 pub fn with_security_config(
85 config: SonicConfig,
86 security_config: crate::config::SecurityConfig,
87 ) -> Self {
88 Self {
89 config,
90 validator: SecurityValidator::new(security_config),
91 stats: std::cell::RefCell::new(SonicStats::default()),
92 }
93 }
94
95 pub fn parse(&self, input: &[u8]) -> Result<Frame> {
97 let start_time = std::time::Instant::now();
98
99 self.validator.validate_input_size(input.len())?;
101
102 if unlikely(input.len() > self.config.max_input_size) {
104 return Err(Error::Other(format!("Input too large: {}", input.len())));
105 }
106
107 let json_str = std::str::from_utf8(input)
109 .map_err(|e| Error::Other(format!("Invalid UTF-8 input: {}", e)))?;
110
111 self.pre_validate_json_string(json_str)?;
113
114 let value: SonicValue =
116 sonic_rs::from_str(json_str).map_err(|e| Error::invalid_json(0, e.to_string()))?;
117
118 self.validate_json_structure(&value)?;
120
121 let semantic_type = if self.config.detect_semantics && input.len() < 100_000 {
123 self.detect_semantic_type_sonic(&value)
124 } else {
125 SemanticType::Generic
126 };
127
128 let payload = if input.len() < 4096 {
130 Bytes::copy_from_slice(input)
132 } else {
133 Bytes::from(input.to_vec()) };
136
137 let header = FrameHeader {
139 version: 1,
140 flags: FrameFlags::empty(),
141 sequence: 0,
142 length: input.len() as u32,
143 schema_id: 0,
144 checksum: 0,
145 };
146
147 let semantics = if semantic_type != SemanticType::Generic {
148 Some(SemanticMeta::new(semantic_type))
149 } else {
150 None
151 };
152
153 {
155 let mut stats = self.stats.borrow_mut();
156 stats.total_parses += 1;
157 stats.sonic_successes += 1;
158 stats.bytes_processed += input.len() as u64;
159
160 let elapsed_ns = start_time.elapsed().as_nanos() as u64;
161 stats.avg_parse_time_ns = (stats.avg_parse_time_ns * (stats.total_parses - 1)
162 + elapsed_ns)
163 / stats.total_parses;
164 }
165
166 Ok(Frame {
167 header,
168 payload,
169 semantics,
170 })
171 }
172
173 pub fn get_stats(&self) -> SonicStats {
175 self.stats.borrow().clone()
176 }
177
178 fn pre_validate_json_string(&self, json_str: &str) -> Result<()> {
180 let mut depth = 0;
182 let mut max_depth = 0;
183
184 for ch in json_str.chars() {
185 match ch {
186 '{' | '[' => {
187 depth += 1;
188 max_depth = max_depth.max(depth);
189 self.validator.validate_json_depth(max_depth)?;
190 }
191 '}' | ']' => {
192 depth = depth.saturating_sub(1);
193 }
194 _ => {}
195 }
196 }
197
198 Ok(())
199 }
200
201 fn validate_json_structure(&self, value: &SonicValue) -> Result<()> {
203 let mut depth_tracker = DepthTracker::default();
204 self.validate_json_recursive(value, &mut depth_tracker)
205 }
206
207 fn validate_json_recursive(
209 &self,
210 value: &SonicValue,
211 depth_tracker: &mut DepthTracker,
212 ) -> Result<()> {
213 match value {
214 _ if value.is_object() => {
215 depth_tracker.enter()?;
216
217 if let Some(obj) = value.as_object() {
218 self.validator.validate_object_keys(obj.len())?;
220
221 for (key, val) in obj.iter() {
223 self.validator.validate_string_length(key.len())?;
225 self.validate_json_recursive(val, depth_tracker)?;
226 }
227 }
228
229 depth_tracker.exit();
230 }
231 _ if value.is_array() => {
232 depth_tracker.enter()?;
233
234 if let Some(arr) = value.as_array() {
235 self.validator.validate_array_length(arr.len())?;
237
238 for element in arr.iter() {
240 self.validate_json_recursive(element, depth_tracker)?;
241 }
242 }
243
244 depth_tracker.exit();
245 }
246 _ if value.is_str() => {
247 if let Some(s) = value.as_str() {
248 self.validator.validate_string_length(s.len())?;
249 }
250 }
251 _ => {
252 }
254 }
255
256 Ok(())
257 }
258
259 fn detect_semantic_type_sonic(&self, value: &SonicValue) -> SemanticType {
261 if value.is_array()
262 && let Some(arr) = value.as_array()
263 {
264 return self.analyze_array_semantics_simd(arr);
265 }
266
267 if value.is_object()
268 && let Some(obj) = value.as_object()
269 {
270 return self.analyze_object_semantics_simd(obj);
271 }
272
273 SemanticType::Generic
274 }
275
276 fn analyze_object_semantics_simd(&self, obj: &sonic_rs::Object) -> SemanticType {
278 let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
279
280 if scan_result.has_type_field && scan_result.has_coordinates {
282 return SemanticType::Geospatial {
283 coordinate_system: "WGS84".to_string(),
284 geometry_type: obj
285 .get(&"type")
286 .and_then(|v| v.as_str())
287 .unwrap_or("Point")
288 .to_string(),
289 };
290 }
291
292 if scan_result.has_timestamp {
294 let timestamp_field = if obj.contains_key(&"timestamp") {
295 "timestamp"
296 } else {
297 "time"
298 };
299
300 let value_fields: SmallVec<[String; 4]> = obj
302 .iter()
303 .filter_map(|(k, v)| {
304 if k != timestamp_field && v.is_number() {
305 Some(k.to_string())
306 } else {
307 None
308 }
309 })
310 .collect();
311
312 if !value_fields.is_empty() {
313 return SemanticType::TimeSeries {
314 timestamp_field: timestamp_field.to_string(),
315 value_fields,
316 interval_ms: None,
317 };
318 }
319 }
320
321 SemanticType::Generic
322 }
323
324 fn analyze_array_semantics_simd(&self, arr: &sonic_rs::Array) -> SemanticType {
326 let len = arr.len();
327 if len == 0 {
328 return SemanticType::Generic;
329 }
330
331 if crate::parser::simd::SimdClassifier::is_numeric_array(arr) {
333 let dtype = if let Some(first) = arr.first() {
334 if let Some(num) = first.as_number() {
335 if num.is_i64() {
336 NumericDType::I64
337 } else if num.is_u64() {
338 NumericDType::U64
339 } else {
340 NumericDType::F64
341 }
342 } else {
343 NumericDType::F64
344 }
345 } else {
346 NumericDType::F64
347 };
348
349 return SemanticType::NumericArray {
350 dtype,
351 length: Some(len),
352 };
353 }
354
355 if len >= 2 {
357 let mut is_time_series = true;
358
359 for value in arr.iter() {
361 if let Some(obj) = value.as_object() {
362 let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
363 if !scan_result.has_timestamp {
364 is_time_series = false;
365 break;
366 }
367 } else {
368 is_time_series = false;
369 break;
370 }
371 }
372
373 if is_time_series {
374 return SemanticType::TimeSeries {
375 timestamp_field: "timestamp".to_string(),
376 value_fields: SmallVec::from_vec(vec!["value".to_string()]),
377 interval_ms: None,
378 };
379 }
380 }
381
382 if len >= 3
384 && arr.iter().all(|v| v.is_object())
385 && let Some(first_obj) = arr.first().and_then(|v| v.as_object())
386 {
387 let first_scan = crate::parser::simd::SimdClassifier::scan_object_keys(first_obj);
388
389 let is_tabular = arr.iter().skip(1).filter_map(|v| v.as_object()).all(|obj| {
391 let scan = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
392 let diff = scan.key_count as i32 - first_scan.key_count as i32;
394 diff.abs() <= (first_scan.key_count as i32 / 5)
395 });
396
397 if is_tabular {
398 let columns: SmallVec<[crate::semantic::ColumnMeta; 16]> = first_obj
400 .iter()
401 .map(|(k, v)| {
402 let column_type = if v.is_number() {
403 crate::semantic::ColumnType::Numeric(NumericDType::F64)
404 } else if v.is_str() {
405 crate::semantic::ColumnType::String
406 } else if v.as_bool().is_some() {
407 crate::semantic::ColumnType::Boolean
408 } else {
409 crate::semantic::ColumnType::Json
410 };
411
412 crate::semantic::ColumnMeta {
413 name: k.to_string(),
414 dtype: column_type,
415 nullable: false,
416 }
417 })
418 .collect();
419
420 return SemanticType::Table {
421 columns: Box::new(columns),
422 row_count: Some(len),
423 };
424 }
425 }
426
427 SemanticType::Generic
428 }
429}
430
431impl Default for SonicParser {
432 fn default() -> Self {
433 Self::new()
434 }
435}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with a default parser and returns the detected semantic
    /// type, if any. Panics when the fixture does not parse.
    fn detect(json: &[u8]) -> Option<SemanticType> {
        SonicParser::new()
            .parse(json)
            .expect("fixture should parse")
            .semantics
            .map(|meta| meta.semantic_type)
    }

    #[test]
    fn test_sonic_parser_creation() {
        let parser = SonicParser::new();
        assert!(parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 100 * 1024 * 1024);
    }

    #[test]
    fn test_sonic_basic_parsing() {
        let parser = SonicParser::new();
        let json = br#"{"name": "test", "value": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert_eq!(frame.header.version, 1);
        assert_eq!(frame.payload.len(), json.len());
    }

    #[test]
    fn test_sonic_numeric_array_detection() {
        let detected = detect(br#"[1.5, 2.7, 3.14, 4.2, 5.1]"#).expect("Expected semantic metadata");
        assert!(matches!(detected, SemanticType::NumericArray { .. }));
    }

    #[test]
    fn test_sonic_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let detected = detect(json).expect("Expected semantic metadata");
        assert!(matches!(detected, SemanticType::TimeSeries { .. }));
    }

    #[test]
    fn test_sonic_performance_config() {
        let config = SonicConfig {
            detect_semantics: false,
            max_input_size: 1024,
        };

        let parser = SonicParser::with_config(config);
        assert!(!parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 1024);
    }

    #[test]
    fn test_sonic_invalid_utf8_handling() {
        let parser = SonicParser::new();
        let invalid_utf8 = &[0xFF, 0xFE, 0xFD];

        let error = parser
            .parse(invalid_utf8)
            .expect_err("invalid UTF-8 must be rejected");
        assert!(error.to_string().contains("Invalid UTF-8"));
    }

    #[test]
    fn test_sonic_input_size_limit() {
        let config = SonicConfig {
            detect_semantics: true,
            max_input_size: 10,
        };
        let parser = SonicParser::with_config(config);

        // 22 bytes, well above the 10-byte limit configured above.
        let large_json = b"[1,2,3,4,5,6,7,8,9,10]";
        let error = parser
            .parse(large_json)
            .expect_err("oversized input must be rejected");

        let error_msg = error.to_string();
        assert!(error_msg.contains("Input size") || error_msg.contains("Input too large"));
    }

    #[test]
    fn test_sonic_json_depth_validation() {
        let parser = SonicParser::new();

        // 65 nested objects; presumably one more than the default depth limit.
        let mut json = String::new();
        for _ in 0..65 {
            json.push('{');
            json.push_str("\"a\":");
        }
        json.push_str("\"value\"");
        for _ in 0..65 {
            json.push('}');
        }

        let error = parser
            .parse(json.as_bytes())
            .expect_err("over-deep nesting must be rejected");
        assert!(error.to_string().contains("depth"));
    }

    #[test]
    fn test_sonic_large_string_validation() {
        let parser = SonicParser::new();

        // 11 MiB string; presumably above the default string-length limit.
        let large_string = "a".repeat(11 * 1024 * 1024);
        let json = format!("{{\"key\": \"{}\"}}", large_string);

        let error = parser
            .parse(json.as_bytes())
            .expect_err("oversized string must be rejected");
        assert!(error.to_string().contains("String length"));
    }

    #[test]
    fn test_sonic_large_array_validation() {
        let parser = SonicParser::new();

        // 1001 elements: large, but expected to stay within the default limits.
        let mut json = String::from("[");
        for i in 0..1001 {
            if i > 0 {
                json.push(',');
            }
            json.push_str(&i.to_string());
        }
        json.push(']');

        let result = parser.parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_many_object_keys_validation() {
        let parser = SonicParser::new();

        let mut json = String::from("{");
        for i in 0..1000 {
            if i > 0 {
                json.push(',');
            }
            json.push_str(&format!("\"key{}\": {}", i, i));
        }
        json.push('}');

        let result = parser.parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_geojson_detection() {
        let json = br#"{
            "type": "Point",
            "coordinates": [125.6, 10.1]
        }"#;

        let detected = detect(json).expect("Expected semantic metadata");
        assert!(matches!(detected, SemanticType::Geospatial { .. }));
    }

    #[test]
    fn test_sonic_timeseries_with_time_field() {
        let json = br#"{
            "time": "2023-01-01T00:00:00Z",
            "temperature": 25.5,
            "humidity": 60.2
        }"#;

        let detected = detect(json).expect("Expected semantic metadata");
        if let SemanticType::TimeSeries {
            timestamp_field, ..
        } = detected
        {
            assert_eq!(timestamp_field, "time");
        } else {
            panic!("Expected TimeSeries semantic type");
        }
    }

    #[test]
    fn test_sonic_large_input_skips_semantics() {
        let parser = SonicParser::new();

        // Pads the document beyond the size up to which semantics are detected.
        let large_value = "x".repeat(50_000);
        let json = format!(
            r#"{{
                "timestamp": "2023-01-01T00:00:00Z",
                "data": "{}"
            }}"#,
            large_value
        );

        let result = parser
            .parse(json.as_bytes())
            .expect("large but valid JSON should parse");
        assert!(result.semantics.is_none());
    }

    #[test]
    fn test_sonic_tabular_data_detection() {
        let json = br#"[
            {"id": 1, "name": "Alice", "age": 30},
            {"id": 2, "name": "Bob", "age": 25},
            {"id": 3, "name": "Charlie", "age": 35}
        ]"#;

        let detected = detect(json).expect("Expected semantic metadata");
        assert!(matches!(detected, SemanticType::Table { .. }));
    }

    #[test]
    fn test_sonic_non_tabular_heterogeneous_array() {
        let json = br#"[
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob", "extra": "field"},
            {"completely": "different"}
        ]"#;

        // Having no metadata at all is acceptable; only a `Table` verdict is wrong.
        if let Some(detected) = detect(json) {
            assert!(!matches!(detected, SemanticType::Table { .. }));
        }
    }
}