1use crate::{
8 error::{Error, Result},
9 frame::Frame,
10 frame::{FrameFlags, FrameHeader},
11 security::{DepthTracker, SecurityValidator},
12 semantic::{NumericDType, SemanticMeta, SemanticType},
13};
14use bytes::Bytes;
15use smallvec::SmallVec;
16use sonic_rs::{JsonContainerTrait, JsonNumberTrait, JsonValueTrait, Value as SonicValue};
17
/// Branch hint: marks the `true` outcome of `b` as the unexpected path.
///
/// The value of `b` is returned unchanged, so the function can wrap any
/// condition without altering program logic. The `true` path calls an empty
/// `#[cold]` function so the optimizer can treat that branch as rarely taken.
// `inline(always)` is deliberate: the hint only helps if this wrapper is
// folded into the caller's branch.
#[inline(always)]
fn unlikely(b: bool) -> bool {
    #[cold]
    fn cold_path() {}

    if b {
        cold_path();
    }
    b
}
23
/// Tunable options for [`SonicParser`].
#[derive(Debug, Clone)]
pub struct SonicConfig {
    /// When `true`, the parser tries to classify the parsed document
    /// (numeric array, time series, table, ...) and attaches the result to
    /// the produced frame.
    pub detect_semantics: bool,
    /// Upper bound, in bytes, on the input accepted by the parser; longer
    /// inputs are rejected with an error.
    pub max_input_size: usize,
}
32
33impl Default for SonicConfig {
34 fn default() -> Self {
35 Self {
36 detect_semantics: true,
37 max_input_size: 100 * 1024 * 1024, }
39 }
40}
41
42pub struct SonicParser {
44 config: SonicConfig,
45 validator: SecurityValidator,
46 stats: std::cell::RefCell<SonicStats>,
47}
48
/// Counters describing the work a [`SonicParser`] has performed.
///
/// All counters start at zero (`Default`).
#[derive(Debug, Default, Clone)]
pub struct SonicStats {
    /// Number of parses recorded.
    pub total_parses: u64,
    /// Number of parses completed by the sonic backend.
    pub sonic_successes: u64,
    /// Number of parses that fell back to serde.
    // NOTE(review): nothing in this file increments this counter — confirm
    // whether it is still maintained elsewhere.
    pub serde_fallbacks: u64,
    /// Mean duration of the recorded parses, in nanoseconds.
    pub avg_parse_time_ns: u64,
    /// Total number of input bytes across the recorded parses.
    pub bytes_processed: u64,
}
58
59impl SonicParser {
60 pub fn new() -> Self {
62 Self {
63 config: SonicConfig::default(),
64 validator: SecurityValidator::default(),
65 stats: std::cell::RefCell::new(SonicStats::default()),
66 }
67 }
68
69 pub fn with_config(config: SonicConfig) -> Self {
71 Self {
72 config,
73 validator: SecurityValidator::default(),
74 stats: std::cell::RefCell::new(SonicStats::default()),
75 }
76 }
77
78 pub fn with_security_config(
80 config: SonicConfig,
81 security_config: crate::config::SecurityConfig,
82 ) -> Self {
83 Self {
84 config,
85 validator: SecurityValidator::new(security_config),
86 stats: std::cell::RefCell::new(SonicStats::default()),
87 }
88 }
89
90 pub fn parse(&self, input: &[u8]) -> Result<Frame> {
92 let start_time = std::time::Instant::now();
93
94 self.validator.validate_input_size(input.len())?;
96
97 if unlikely(input.len() > self.config.max_input_size) {
99 return Err(Error::Other(format!("Input too large: {}", input.len())));
100 }
101
102 let json_str = std::str::from_utf8(input)
104 .map_err(|e| Error::Other(format!("Invalid UTF-8 input: {}", e)))?;
105
106 self.pre_validate_json_string(json_str)?;
108
109 let value: SonicValue =
111 sonic_rs::from_str(json_str).map_err(|e| Error::invalid_json(0, e.to_string()))?;
112
113 self.validate_json_structure(&value)?;
115
116 let semantic_type = if self.config.detect_semantics && input.len() < 100_000 {
118 self.detect_semantic_type_sonic(&value)
119 } else {
120 SemanticType::Generic
121 };
122
123 let payload = if input.len() < 4096 {
125 Bytes::copy_from_slice(input)
127 } else {
128 Bytes::from(input.to_vec()) };
131
132 let header = FrameHeader {
134 version: 1,
135 flags: FrameFlags::empty(),
136 sequence: 0,
137 length: input.len() as u32,
138 schema_id: 0,
139 checksum: 0,
140 };
141
142 let semantics = if semantic_type != SemanticType::Generic {
143 Some(SemanticMeta::new(semantic_type))
144 } else {
145 None
146 };
147
148 {
150 let mut stats = self.stats.borrow_mut();
151 stats.total_parses += 1;
152 stats.sonic_successes += 1;
153 stats.bytes_processed += input.len() as u64;
154
155 let elapsed_ns = start_time.elapsed().as_nanos() as u64;
156 stats.avg_parse_time_ns = (stats.avg_parse_time_ns * (stats.total_parses - 1)
157 + elapsed_ns)
158 / stats.total_parses;
159 }
160
161 Ok(Frame {
162 header,
163 payload,
164 semantics,
165 })
166 }
167
168 pub fn get_stats(&self) -> SonicStats {
170 self.stats.borrow().clone()
171 }
172
173 fn pre_validate_json_string(&self, json_str: &str) -> Result<()> {
175 let mut depth = 0;
177 let mut max_depth = 0;
178
179 for ch in json_str.chars() {
180 match ch {
181 '{' | '[' => {
182 depth += 1;
183 max_depth = max_depth.max(depth);
184 self.validator.validate_json_depth(max_depth)?;
185 }
186 '}' | ']' => {
187 depth = depth.saturating_sub(1);
188 }
189 _ => {}
190 }
191 }
192
193 Ok(())
194 }
195
196 fn validate_json_structure(&self, value: &SonicValue) -> Result<()> {
198 let mut depth_tracker = DepthTracker::default();
199 self.validate_json_recursive(value, &mut depth_tracker)
200 }
201
202 fn validate_json_recursive(
204 &self,
205 value: &SonicValue,
206 depth_tracker: &mut DepthTracker,
207 ) -> Result<()> {
208 match value {
209 _ if value.is_object() => {
210 depth_tracker.enter()?;
211
212 if let Some(obj) = value.as_object() {
213 self.validator.validate_object_keys(obj.len())?;
215
216 for (key, val) in obj.iter() {
218 self.validator.validate_string_length(key.len())?;
220 self.validate_json_recursive(val, depth_tracker)?;
221 }
222 }
223
224 depth_tracker.exit();
225 }
226 _ if value.is_array() => {
227 depth_tracker.enter()?;
228
229 if let Some(arr) = value.as_array() {
230 self.validator.validate_array_length(arr.len())?;
232
233 for element in arr.iter() {
235 self.validate_json_recursive(element, depth_tracker)?;
236 }
237 }
238
239 depth_tracker.exit();
240 }
241 _ if value.is_str() => {
242 if let Some(s) = value.as_str() {
243 self.validator.validate_string_length(s.len())?;
244 }
245 }
246 _ => {
247 }
249 }
250
251 Ok(())
252 }
253
254 fn detect_semantic_type_sonic(&self, value: &SonicValue) -> SemanticType {
256 if value.is_array()
257 && let Some(arr) = value.as_array()
258 {
259 return self.analyze_array_semantics_simd(arr);
260 }
261
262 if value.is_object()
263 && let Some(obj) = value.as_object()
264 {
265 return self.analyze_object_semantics_simd(obj);
266 }
267
268 SemanticType::Generic
269 }
270
271 fn analyze_object_semantics_simd(&self, obj: &sonic_rs::Object) -> SemanticType {
273 let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
274
275 if scan_result.has_type_field && scan_result.has_coordinates {
277 return SemanticType::Geospatial {
278 coordinate_system: "WGS84".to_string(),
279 geometry_type: obj
280 .get(&"type")
281 .and_then(|v| v.as_str())
282 .unwrap_or("Point")
283 .to_string(),
284 };
285 }
286
287 if scan_result.has_timestamp {
289 let timestamp_field = if obj.contains_key(&"timestamp") {
290 "timestamp"
291 } else {
292 "time"
293 };
294
295 let value_fields: SmallVec<[String; 4]> = obj
297 .iter()
298 .filter_map(|(k, v)| {
299 if k != timestamp_field && v.is_number() {
300 Some(k.to_string())
301 } else {
302 None
303 }
304 })
305 .collect();
306
307 if !value_fields.is_empty() {
308 return SemanticType::TimeSeries {
309 timestamp_field: timestamp_field.to_string(),
310 value_fields,
311 interval_ms: None,
312 };
313 }
314 }
315
316 SemanticType::Generic
317 }
318
319 fn analyze_array_semantics_simd(&self, arr: &sonic_rs::Array) -> SemanticType {
321 let len = arr.len();
322 if len == 0 {
323 return SemanticType::Generic;
324 }
325
326 if crate::parser::simd::SimdClassifier::is_numeric_array(arr) {
328 let dtype = if let Some(first) = arr.first() {
329 if let Some(num) = first.as_number() {
330 if num.is_i64() {
331 NumericDType::I64
332 } else if num.is_u64() {
333 NumericDType::U64
334 } else {
335 NumericDType::F64
336 }
337 } else {
338 NumericDType::F64
339 }
340 } else {
341 NumericDType::F64
342 };
343
344 return SemanticType::NumericArray {
345 dtype,
346 length: Some(len),
347 };
348 }
349
350 if len >= 2 {
352 let mut is_time_series = true;
353
354 for value in arr.iter() {
356 if let Some(obj) = value.as_object() {
357 let scan_result = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
358 if !scan_result.has_timestamp {
359 is_time_series = false;
360 break;
361 }
362 } else {
363 is_time_series = false;
364 break;
365 }
366 }
367
368 if is_time_series {
369 return SemanticType::TimeSeries {
370 timestamp_field: "timestamp".to_string(),
371 value_fields: SmallVec::from_vec(vec!["value".to_string()]),
372 interval_ms: None,
373 };
374 }
375 }
376
377 if len >= 3
379 && arr.iter().all(|v| v.is_object())
380 && let Some(first_obj) = arr.first().and_then(|v| v.as_object())
381 {
382 let first_scan = crate::parser::simd::SimdClassifier::scan_object_keys(first_obj);
383
384 let is_tabular = arr.iter().skip(1).filter_map(|v| v.as_object()).all(|obj| {
386 let scan = crate::parser::simd::SimdClassifier::scan_object_keys(obj);
387 let diff = scan.key_count as i32 - first_scan.key_count as i32;
389 diff.abs() <= (first_scan.key_count as i32 / 5)
390 });
391
392 if is_tabular {
393 let columns: SmallVec<[crate::semantic::ColumnMeta; 16]> = first_obj
395 .iter()
396 .map(|(k, v)| {
397 let column_type = if v.is_number() {
398 crate::semantic::ColumnType::Numeric(NumericDType::F64)
399 } else if v.is_str() {
400 crate::semantic::ColumnType::String
401 } else if v.as_bool().is_some() {
402 crate::semantic::ColumnType::Boolean
403 } else {
404 crate::semantic::ColumnType::Json
405 };
406
407 crate::semantic::ColumnMeta {
408 name: k.to_string(),
409 dtype: column_type,
410 nullable: false,
411 }
412 })
413 .collect();
414
415 return SemanticType::Table {
416 columns: Box::new(columns),
417 row_count: Some(len),
418 };
419 }
420 }
421
422 SemanticType::Generic
423 }
424}
425
426impl Default for SonicParser {
427 fn default() -> Self {
428 Self::new()
429 }
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with a default parser and returns the detected semantic
    /// type, or `None` when the frame carries no semantic metadata.
    fn detect(json: &[u8]) -> Option<SemanticType> {
        SonicParser::new()
            .parse(json)
            .unwrap()
            .semantics
            .map(|meta| meta.semantic_type)
    }

    #[test]
    fn test_sonic_parser_creation() {
        let parser = SonicParser::new();
        assert!(parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 100 * 1024 * 1024);
    }

    #[test]
    fn test_sonic_basic_parsing() {
        let json = br#"{"name": "test", "value": 42}"#;

        let result = SonicParser::new().parse(json);
        assert!(result.is_ok());

        let frame = result.unwrap();
        assert_eq!(frame.header.version, 1);
        assert_eq!(frame.payload.len(), json.len());
    }

    #[test]
    fn test_sonic_numeric_array_detection() {
        let semantic_type =
            detect(br#"[1.5, 2.7, 3.14, 4.2, 5.1]"#).expect("Expected semantic metadata");

        assert!(matches!(semantic_type, SemanticType::NumericArray { .. }));
    }

    #[test]
    fn test_sonic_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let semantic_type = detect(json).expect("Expected semantic metadata");
        assert!(matches!(semantic_type, SemanticType::TimeSeries { .. }));
    }

    #[test]
    fn test_sonic_performance_config() {
        let parser = SonicParser::with_config(SonicConfig {
            detect_semantics: false,
            max_input_size: 1024,
        });

        assert!(!parser.config.detect_semantics);
        assert_eq!(parser.config.max_input_size, 1024);
    }

    #[test]
    fn test_sonic_invalid_utf8_handling() {
        let invalid_utf8 = &[0xFF, 0xFE, 0xFD];

        let result = SonicParser::new().parse(invalid_utf8);
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Invalid UTF-8"));
    }

    #[test]
    fn test_sonic_input_size_limit() {
        let parser = SonicParser::with_config(SonicConfig {
            detect_semantics: true,
            max_input_size: 10,
        });

        // 22 bytes, well above the 10-byte limit configured above.
        let large_json = b"[1,2,3,4,5,6,7,8,9,10]";
        let result = parser.parse(large_json);

        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Input size") || error_msg.contains("Input too large"));
    }

    #[test]
    fn test_sonic_json_depth_validation() {
        // 65 nested objects: {"a":{"a":...{"a":"value"}...}}
        let json = format!("{}\"value\"{}", "{\"a\":".repeat(65), "}".repeat(65));

        let result = SonicParser::new().parse(json.as_bytes());
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("depth"));
    }

    #[test]
    fn test_sonic_large_string_validation() {
        let large_string = "a".repeat(11 * 1024 * 1024);
        let json = format!("{{\"key\": \"{}\"}}", large_string);

        let result = SonicParser::new().parse(json.as_bytes());
        assert!(result.is_err());

        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("String length"));
    }

    #[test]
    fn test_sonic_large_array_validation() {
        // 1001 elements is expected to be accepted by the default limits.
        let elements: Vec<String> = (0..1001).map(|i| i.to_string()).collect();
        let json = format!("[{}]", elements.join(","));

        let result = SonicParser::new().parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_many_object_keys_validation() {
        // 1000 keys is expected to be accepted by the default limits.
        let members: Vec<String> = (0..1000)
            .map(|i| format!("\"key{}\": {}", i, i))
            .collect();
        let json = format!("{{{}}}", members.join(","));

        let result = SonicParser::new().parse(json.as_bytes());
        assert!(result.is_ok());
    }

    #[test]
    fn test_sonic_geojson_detection() {
        let json = br#"{
            "type": "Point",
            "coordinates": [125.6, 10.1]
        }"#;

        assert!(matches!(
            detect(json),
            Some(SemanticType::Geospatial { .. })
        ));
    }

    #[test]
    fn test_sonic_timeseries_with_time_field() {
        let json = br#"{
            "time": "2023-01-01T00:00:00Z",
            "temperature": 25.5,
            "humidity": 60.2
        }"#;

        match detect(json) {
            Some(SemanticType::TimeSeries {
                timestamp_field, ..
            }) => assert_eq!(timestamp_field, "time"),
            Some(_) => panic!("Expected TimeSeries semantic type"),
            None => panic!("Expected semantic metadata"),
        }
    }

    #[test]
    fn test_sonic_large_input_skips_semantics() {
        // The document has a timestamp and a numeric field, so it would be
        // classified if detection ran; the padding pushes it past the
        // 100,000-byte threshold at which `parse` skips detection.
        let large_value = "x".repeat(100_000);
        let json = format!(
            r#"{{
                "timestamp": "2023-01-01T00:00:00Z",
                "value": 1.5,
                "data": "{}"
            }}"#,
            large_value
        );
        assert!(json.len() >= 100_000);

        let result = SonicParser::new().parse(json.as_bytes()).unwrap();
        assert!(result.semantics.is_none());
    }

    #[test]
    fn test_sonic_tabular_data_detection() {
        let json = br#"[
            {"id": 1, "name": "Alice", "age": 30},
            {"id": 2, "name": "Bob", "age": 25},
            {"id": 3, "name": "Charlie", "age": 35}
        ]"#;

        assert!(matches!(detect(json), Some(SemanticType::Table { .. })));
    }

    #[test]
    fn test_sonic_non_tabular_heterogeneous_array() {
        // Rows with differing key counts must not be reported as a table.
        let json = br#"[
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob", "extra": "field"},
            {"completely": "different"}
        ]"#;

        assert!(!matches!(detect(json), Some(SemanticType::Table { .. })));
    }
}