1use crate::config::SecurityConfig;
7use crate::security::SecurityValidator;
8use crate::semantic::{SemanticMeta, SemanticType};
9use crate::{Error, Frame, Result};
10use bytes::Bytes;
11use serde_json::{self, Map, Value};
12use smallvec::SmallVec;
13
14pub struct SimpleParser {
16 config: ParseConfig,
17 validator: SecurityValidator,
18}
19
/// Options controlling how [`SimpleParser`] processes its input.
#[derive(Debug, Clone)]
pub struct ParseConfig {
    /// When `true`, `SimpleParser::parse` inspects the decoded JSON to pick a semantic
    /// type; when `false`, frames are tagged as generic.
    pub detect_semantics: bool,

    /// Intended upper bound on the input size, in megabytes.
    pub max_size_mb: usize,

    /// Streaming switch for large arrays.
    /// NOTE(review): not read by `SimpleParser` in the visible source — confirm consumers.
    pub stream_large_arrays: bool,

    /// Threshold associated with `stream_large_arrays`.
    /// NOTE(review): presumably an element count; not read in the visible source — confirm.
    pub stream_threshold: usize,
}
32
33impl Default for ParseConfig {
34 fn default() -> Self {
35 Self {
36 detect_semantics: true,
37 max_size_mb: 100,
38 stream_large_arrays: true,
39 stream_threshold: 1000,
40 }
41 }
42}
43
44impl SimpleParser {
45 pub fn new() -> Self {
47 Self {
48 config: ParseConfig::default(),
49 validator: SecurityValidator::default(),
50 }
51 }
52
53 pub fn with_config(config: ParseConfig) -> Self {
55 Self {
56 config,
57 validator: SecurityValidator::default(),
58 }
59 }
60
61 pub fn with_security_config(config: ParseConfig, security_config: SecurityConfig) -> Self {
63 Self {
64 config,
65 validator: SecurityValidator::new(security_config),
66 }
67 }
68
69 pub fn parse(&self, input: &[u8]) -> Result<Frame> {
71 self.validator.validate_input_size(input.len())?;
73
74 let value: Value = serde_json::from_slice(input)
76 .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
77
78 let semantic_type = if self.config.detect_semantics {
80 self.detect_semantic_type(&value)
81 } else {
82 SemanticType::Generic
83 };
84
85 let semantics = Some(SemanticMeta::new(semantic_type));
87
88 let mut frame = Frame::new(Bytes::copy_from_slice(input));
90 frame.semantics = semantics;
91
92 Ok(frame)
93 }
94
95 pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
97 let mut frame = self.parse(input)?;
98 frame.semantics = Some(semantics.clone());
99 Ok(frame)
100 }
101
102 fn detect_semantic_type(&self, value: &Value) -> SemanticType {
104 match value {
105 Value::Array(arr) => self.detect_array_semantics(arr),
106 Value::Object(obj) => self.detect_object_semantics(obj),
107 _ => SemanticType::Generic,
108 }
109 }
110
111 fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
113 if arr.is_empty() {
114 return SemanticType::Generic;
115 }
116
117 if self.is_numeric_array(arr) {
119 let dtype = self.detect_numeric_dtype(&arr[0]);
120 return SemanticType::NumericArray {
121 dtype,
122 length: Some(arr.len()),
123 };
124 }
125
126 if self.is_time_series_array(arr) {
128 return SemanticType::TimeSeries {
129 timestamp_field: "timestamp".to_string(),
130 value_fields: SmallVec::from_vec(vec!["value".to_string()]),
131 interval_ms: None,
132 };
133 }
134
135 if self.is_tabular_data(arr) {
137 let columns = self.extract_table_columns(&arr[0]);
138 return SemanticType::Table {
139 columns: Box::new(columns),
140 row_count: Some(arr.len()),
141 };
142 }
143
144 SemanticType::Generic
145 }
146
147 fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
149 if obj.contains_key("type") && obj.contains_key("coordinates") {
151 return SemanticType::Geospatial {
152 coordinate_system: "WGS84".to_string(),
153 geometry_type: obj
154 .get("type")
155 .and_then(|v| v.as_str())
156 .unwrap_or("Point")
157 .to_string(),
158 };
159 }
160
161 if obj.contains_key("timestamp") || obj.contains_key("time") {
163 let timestamp_field = if obj.contains_key("timestamp") {
164 "timestamp"
165 } else {
166 "time"
167 };
168
169 let value_fields: SmallVec<[String; 4]> = obj
170 .keys()
171 .filter_map(|k| {
172 let v = obj.get(k.as_str())?;
173 (*k != timestamp_field && self.looks_like_numeric_value(v)).then(|| k.clone())
174 })
175 .collect();
176
177 if !value_fields.is_empty() {
178 return SemanticType::TimeSeries {
179 timestamp_field: timestamp_field.to_string(),
180 value_fields,
181 interval_ms: None,
182 };
183 }
184 }
185
186 if obj.contains_key("data")
188 && obj.contains_key("shape")
189 && let (Some(Value::Array(_)), Some(Value::Array(shape))) =
190 (obj.get("data"), obj.get("shape"))
191 {
192 let dimensions: SmallVec<[usize; 4]> = shape
193 .iter()
194 .filter_map(|v| v.as_u64().map(|n| n as usize))
195 .collect();
196
197 if !dimensions.is_empty() {
198 return SemanticType::Matrix {
199 dimensions,
200 dtype: crate::semantic::NumericDType::F64, };
202 }
203 }
204
205 SemanticType::Generic
206 }
207
208 fn is_numeric_array(&self, arr: &[Value]) -> bool {
210 arr.len() > 2 && arr.iter().all(|v| v.is_number())
211 }
212
213 fn is_time_series_array(&self, arr: &[Value]) -> bool {
215 arr.len() >= 2
216 && arr.iter().all(|v| {
217 if let Value::Object(obj) = v {
218 obj.contains_key("timestamp") || obj.contains_key("time")
219 } else {
220 false
221 }
222 })
223 }
224
225 fn is_tabular_data(&self, arr: &[Value]) -> bool {
227 if arr.len() < 2 {
228 return false;
229 }
230
231 let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
233 first.keys().collect()
234 } else {
235 return false;
236 };
237
238 arr.iter().all(|v| {
239 if let Value::Object(obj) = v {
240 let keys: std::collections::HashSet<_> = obj.keys().collect();
241 let intersection = first_keys.intersection(&keys).count();
243 let union = first_keys.union(&keys).count();
244 intersection as f64 / union as f64 > 0.8
245 } else {
246 false
247 }
248 })
249 }
250
251 fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
253 match value {
254 Value::Number(n) => {
255 if n.is_i64() {
256 crate::semantic::NumericDType::I64
257 } else if n.is_u64() {
258 crate::semantic::NumericDType::U64
259 } else {
260 crate::semantic::NumericDType::F64
261 }
262 }
263 _ => crate::semantic::NumericDType::F64,
264 }
265 }
266
267 fn extract_table_columns(
269 &self,
270 first_obj: &Value,
271 ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
272 let mut columns = SmallVec::new();
273
274 if let Value::Object(obj) = first_obj {
275 for (key, value) in obj {
276 let column_type = self.detect_column_type(value);
277 columns.push(crate::semantic::ColumnMeta {
278 name: key.clone(),
279 dtype: column_type,
280 nullable: false, });
282 }
283 }
284
285 columns
286 }
287
288 fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
290 match value {
291 Value::Number(n) => {
292 if n.is_i64() {
293 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
294 } else if n.is_u64() {
295 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
296 } else {
297 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
298 }
299 }
300 Value::String(_) => crate::semantic::ColumnType::String,
301 Value::Bool(_) => crate::semantic::ColumnType::Boolean,
302 Value::Array(_) => {
303 crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
304 }
305 _ => crate::semantic::ColumnType::Json,
306 }
307 }
308
309 fn looks_like_numeric_value(&self, value: &Value) -> bool {
311 value.is_number()
312 }
313
314 pub fn stats(&self) -> ParseStats {
316 ParseStats {
317 total_parses: 0,
318 semantic_detections: 0,
319 avg_parse_time_ms: 0.0,
320 }
321 }
322}
323
/// Counters reported by `SimpleParser::stats`.
///
/// NOTE(review): field meanings below are inferred from their names; the visible
/// `stats()` implementation always reports zeros — confirm before relying on them.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct ParseStats {
    /// Presumably the number of parse calls performed.
    pub total_parses: u64,

    /// Presumably the number of parses for which semantic detection ran.
    pub semantic_detections: u64,

    /// Presumably the mean parse duration, in milliseconds.
    pub avg_parse_time_ms: f64,
}
334
335impl Default for SimpleParser {
336 fn default() -> Self {
337 Self::new()
338 }
339}
340
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with a default parser and returns the attached semantic metadata.
    ///
    /// Panics (failing the test) when parsing fails or no metadata is attached, so the
    /// detection tests cannot pass vacuously.
    fn parsed_semantics(json: &[u8]) -> SemanticMeta {
        SimpleParser::new()
            .parse(json)
            .expect("input should parse")
            .semantics
            .expect("parse() should attach semantic metadata")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_invalid_json_rejection() {
        let parser = SimpleParser::new();
        assert!(parser.parse(b"{not json").is_err());
    }

    #[test]
    fn test_semantic_detection_disabled() {
        let parser = SimpleParser::with_config(ParseConfig {
            detect_semantics: false,
            ..ParseConfig::default()
        });

        let frame = parser
            .parse(b"[1, 2, 3, 4, 5]")
            .expect("valid JSON should parse");
        let semantics = frame
            .semantics
            .expect("parse() should attach semantic metadata");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_numeric_array_detection() {
        let semantics = parsed_semantics(b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray { .. }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let semantics = parsed_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let json = br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#;

        let semantics = parsed_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let json = br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#;

        let semantics = parsed_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table { .. }
        ));
    }

    #[test]
    fn test_large_input_rejection() {
        let mut parser = SimpleParser::new();
        parser.config.max_size_mb = 1; // 1 MB limit

        // 2 MB payload. NOTE: a run of `a` bytes is also invalid JSON, so this test does
        // not isolate the size check from JSON validation.
        let large_json = vec![b'a'; 2 * 1024 * 1024];
        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}