1use crate::config::SecurityConfig;
7use crate::security::SecurityValidator;
8use crate::semantic::{SemanticMeta, SemanticType};
9use crate::{Error, Frame, Result};
10use bytes::Bytes;
11use serde_json::{self, Map, Value};
12use smallvec::SmallVec;
13
/// JSON parser that validates input with `serde_json` and wraps the raw
/// bytes in a `Frame`, optionally tagging it with a detected `SemanticType`.
pub struct SimpleParser {
    /// Parsing options supplied at construction time.
    config: ParseConfig,
    /// Validator consulted for the input length before any JSON parsing happens.
    validator: SecurityValidator,
}
19
/// Options controlling how `SimpleParser` treats its input.
#[derive(Debug, Clone)]
pub struct ParseConfig {
    /// When `true`, `SimpleParser::parse` inspects the parsed JSON to choose a
    /// semantic type; when `false`, frames are tagged as generic.
    pub detect_semantics: bool,
    /// Input size limit, expressed in megabytes.
    pub max_size_mb: usize,
    /// Whether large arrays should be streamed.
    ///
    /// NOTE(review): presumably consumed by streaming code outside this
    /// file — confirm.
    pub stream_large_arrays: bool,
    /// Threshold paired with `stream_large_arrays`.
    ///
    /// NOTE(review): presumably an element count — confirm the unit.
    pub stream_threshold: usize,
}

impl Default for ParseConfig {
    /// Returns the stock configuration: semantic detection on, a 100 MB size
    /// limit, and array streaming enabled with a threshold of 1000.
    fn default() -> Self {
        const DEFAULT_MAX_SIZE_MB: usize = 100;
        const DEFAULT_STREAM_THRESHOLD: usize = 1000;

        ParseConfig {
            detect_semantics: true,
            max_size_mb: DEFAULT_MAX_SIZE_MB,
            stream_large_arrays: true,
            stream_threshold: DEFAULT_STREAM_THRESHOLD,
        }
    }
}
43
44impl SimpleParser {
45 pub fn new() -> Self {
47 Self {
48 config: ParseConfig::default(),
49 validator: SecurityValidator::default(),
50 }
51 }
52
53 pub fn with_config(config: ParseConfig) -> Self {
55 Self {
56 config,
57 validator: SecurityValidator::default(),
58 }
59 }
60
61 pub fn with_security_config(config: ParseConfig, security_config: SecurityConfig) -> Self {
63 Self {
64 config,
65 validator: SecurityValidator::new(security_config),
66 }
67 }
68
69 pub fn parse(&self, input: &[u8]) -> Result<Frame> {
71 self.validator.validate_input_size(input.len())?;
73
74 let value: Value = serde_json::from_slice(input)
76 .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
77
78 let semantic_type = if self.config.detect_semantics {
80 self.detect_semantic_type(&value)
81 } else {
82 SemanticType::Generic
83 };
84
85 let semantics = Some(SemanticMeta::new(semantic_type));
87
88 let mut frame = Frame::new(Bytes::copy_from_slice(input));
90 frame.semantics = semantics;
91
92 Ok(frame)
93 }
94
95 pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
97 let mut frame = self.parse(input)?;
98 frame.semantics = Some(semantics.clone());
99 Ok(frame)
100 }
101
102 fn detect_semantic_type(&self, value: &Value) -> SemanticType {
104 match value {
105 Value::Array(arr) => self.detect_array_semantics(arr),
106 Value::Object(obj) => self.detect_object_semantics(obj),
107 _ => SemanticType::Generic,
108 }
109 }
110
111 fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
113 if arr.is_empty() {
114 return SemanticType::Generic;
115 }
116
117 if self.is_numeric_array(arr) {
119 let dtype = self.detect_numeric_dtype(&arr[0]);
120 return SemanticType::NumericArray {
121 dtype,
122 length: Some(arr.len()),
123 };
124 }
125
126 if self.is_time_series_array(arr) {
128 return SemanticType::TimeSeries {
129 timestamp_field: "timestamp".to_string(),
130 value_fields: SmallVec::from_vec(vec!["value".to_string()]),
131 interval_ms: None,
132 };
133 }
134
135 if self.is_tabular_data(arr) {
137 let columns = self.extract_table_columns(&arr[0]);
138 return SemanticType::Table {
139 columns: Box::new(columns),
140 row_count: Some(arr.len()),
141 };
142 }
143
144 SemanticType::Generic
145 }
146
147 fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
149 if obj.contains_key("type") && obj.contains_key("coordinates") {
151 return SemanticType::Geospatial {
152 coordinate_system: "WGS84".to_string(),
153 geometry_type: obj
154 .get("type")
155 .and_then(|v| v.as_str())
156 .unwrap_or("Point")
157 .to_string(),
158 };
159 }
160
161 if obj.contains_key("timestamp") || obj.contains_key("time") {
163 let timestamp_field = if obj.contains_key("timestamp") {
164 "timestamp"
165 } else {
166 "time"
167 };
168
169 let value_fields: SmallVec<[String; 4]> = obj
170 .keys()
171 .filter(|k| {
172 *k != timestamp_field && self.looks_like_numeric_value(obj.get(*k).unwrap())
174 })
175 .cloned()
176 .collect();
177
178 if !value_fields.is_empty() {
179 return SemanticType::TimeSeries {
180 timestamp_field: timestamp_field.to_string(),
181 value_fields,
182 interval_ms: None,
183 };
184 }
185 }
186
187 if obj.contains_key("data")
189 && obj.contains_key("shape")
190 && let (Some(Value::Array(_)), Some(Value::Array(shape))) =
191 (obj.get("data"), obj.get("shape"))
192 {
193 let dimensions: SmallVec<[usize; 4]> = shape
194 .iter()
195 .filter_map(|v| v.as_u64().map(|n| n as usize))
196 .collect();
197
198 if !dimensions.is_empty() {
199 return SemanticType::Matrix {
200 dimensions,
201 dtype: crate::semantic::NumericDType::F64, };
203 }
204 }
205
206 SemanticType::Generic
207 }
208
209 fn is_numeric_array(&self, arr: &[Value]) -> bool {
211 arr.len() > 2 && arr.iter().all(|v| v.is_number())
212 }
213
214 fn is_time_series_array(&self, arr: &[Value]) -> bool {
216 arr.len() >= 2
217 && arr.iter().all(|v| {
218 if let Value::Object(obj) = v {
219 obj.contains_key("timestamp") || obj.contains_key("time")
220 } else {
221 false
222 }
223 })
224 }
225
226 fn is_tabular_data(&self, arr: &[Value]) -> bool {
228 if arr.len() < 2 {
229 return false;
230 }
231
232 let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
234 first.keys().collect()
235 } else {
236 return false;
237 };
238
239 arr.iter().all(|v| {
240 if let Value::Object(obj) = v {
241 let keys: std::collections::HashSet<_> = obj.keys().collect();
242 let intersection = first_keys.intersection(&keys).count();
244 let union = first_keys.union(&keys).count();
245 intersection as f64 / union as f64 > 0.8
246 } else {
247 false
248 }
249 })
250 }
251
252 fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
254 match value {
255 Value::Number(n) => {
256 if n.is_i64() {
257 crate::semantic::NumericDType::I64
258 } else if n.is_u64() {
259 crate::semantic::NumericDType::U64
260 } else {
261 crate::semantic::NumericDType::F64
262 }
263 }
264 _ => crate::semantic::NumericDType::F64,
265 }
266 }
267
268 fn extract_table_columns(
270 &self,
271 first_obj: &Value,
272 ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
273 let mut columns = SmallVec::new();
274
275 if let Value::Object(obj) = first_obj {
276 for (key, value) in obj {
277 let column_type = self.detect_column_type(value);
278 columns.push(crate::semantic::ColumnMeta {
279 name: key.clone(),
280 dtype: column_type,
281 nullable: false, });
283 }
284 }
285
286 columns
287 }
288
289 fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
291 match value {
292 Value::Number(n) => {
293 if n.is_i64() {
294 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
295 } else if n.is_u64() {
296 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
297 } else {
298 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
299 }
300 }
301 Value::String(_) => crate::semantic::ColumnType::String,
302 Value::Bool(_) => crate::semantic::ColumnType::Boolean,
303 Value::Array(_) => {
304 crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
305 }
306 _ => crate::semantic::ColumnType::Json,
307 }
308 }
309
310 fn looks_like_numeric_value(&self, value: &Value) -> bool {
312 value.is_number()
313 }
314
315 pub fn stats(&self) -> ParseStats {
317 ParseStats {
318 total_parses: 0,
319 semantic_detections: 0,
320 avg_parse_time_ms: 0.0,
321 }
322 }
323}
324
/// Counters describing parser activity, as returned by `SimpleParser::stats`.
///
/// NOTE(review): no code visible in this file updates these fields — confirm
/// whether they are populated elsewhere.
#[derive(Debug, Default)]
pub struct ParseStats {
    /// Total number of parse calls (meaning inferred from the name).
    pub total_parses: u64,
    /// Number of semantic detections (meaning inferred from the name).
    pub semantic_detections: u64,
    /// Average parse time in milliseconds (meaning inferred from the name).
    pub avg_parse_time_ms: f64,
}
335
336impl Default for SimpleParser {
337 fn default() -> Self {
338 Self::new()
339 }
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    // Each detection test unwraps the frame's semantics with `expect` so that
    // a missing value fails the test instead of silently skipping the
    // assertion.

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_invalid_json_rejection() {
        let parser = SimpleParser::new();

        assert!(parser.parse(b"{not json").is_err());
    }

    #[test]
    fn test_detection_disabled_yields_generic() {
        let parser = SimpleParser::with_config(ParseConfig {
            detect_semantics: false,
            ..ParseConfig::default()
        });

        let result = parser.parse(b"[1, 2, 3, 4, 5]").unwrap();
        let semantics = result.semantics.expect("parse always attaches semantics");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_numeric_array_detection() {
        let parser = SimpleParser::new();
        let json = b"[1, 2, 3, 4, 5]";

        let result = parser.parse(json).unwrap();
        let semantics = result.semantics.expect("parse always attaches semantics");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray { .. }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let parser = SimpleParser::new();
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let result = parser.parse(json).unwrap();
        let semantics = result.semantics.expect("parse always attaches semantics");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let parser = SimpleParser::new();
        let json = br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#;

        let result = parser.parse(json).unwrap();
        let semantics = result.semantics.expect("parse always attaches semantics");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let parser = SimpleParser::new();
        let json = br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#;

        let result = parser.parse(json).unwrap();
        let semantics = result.semantics.expect("parse always attaches semantics");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table { .. }
        ));
    }

    #[test]
    fn test_large_input_rejection() {
        let mut parser = SimpleParser::new();
        parser.config.max_size_mb = 1;

        // 2 MiB of the byte `a`. Note this payload is also not valid JSON, so
        // the test only establishes that such input is rejected, not which
        // check rejected it.
        let large_json = vec![b'a'; 2 * 1024 * 1024];
        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}