1use crate::semantic::{SemanticMeta, SemanticType};
7use crate::{Error, Frame, Result};
8use bytes::Bytes;
9use serde_json::{self, Map, Value};
10use smallvec::SmallVec;
11
12pub struct SimpleParser {
14 config: ParseConfig,
15}
16
/// Settings for [`SimpleParser`].
///
/// `PartialEq`/`Eq` are derived so configurations can be compared directly
/// (for example in `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// When `true`, `parse` inspects the parsed JSON to pick a semantic type;
    /// when `false`, every frame is tagged as generic.
    pub detect_semantics: bool,
    /// Largest accepted input, in mebibytes (multiples of 1024 * 1024 bytes).
    pub max_size_mb: usize,
    /// NOTE(review): not read by `SimpleParser` in this file — presumably
    /// consumed by a streaming layer elsewhere; confirm before relying on it.
    pub stream_large_arrays: bool,
    /// NOTE(review): not read by `SimpleParser` in this file — presumably the
    /// element count above which arrays are streamed; confirm.
    pub stream_threshold: usize,
}
29
30impl Default for ParseConfig {
31 fn default() -> Self {
32 Self {
33 detect_semantics: true,
34 max_size_mb: 100,
35 stream_large_arrays: true,
36 stream_threshold: 1000,
37 }
38 }
39}
40
41impl SimpleParser {
42 pub fn new() -> Self {
44 Self {
45 config: ParseConfig::default(),
46 }
47 }
48
49 pub fn with_config(config: ParseConfig) -> Self {
51 Self { config }
52 }
53
54 pub fn parse(&self, input: &[u8]) -> Result<Frame> {
56 if input.len() > self.config.max_size_mb * 1024 * 1024 {
58 let input_mb = input.len() / (1024 * 1024);
59 let max_mb = self.config.max_size_mb;
60 return Err(Error::buffer(format!(
61 "Input too large: {input_mb} MB, max: {max_mb} MB"
62 )));
63 }
64
65 let value: Value = serde_json::from_slice(input)
67 .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
68
69 let semantic_type = if self.config.detect_semantics {
71 self.detect_semantic_type(&value)
72 } else {
73 SemanticType::Generic
74 };
75
76 let semantics = Some(SemanticMeta::new(semantic_type));
78
79 let mut frame = Frame::new(Bytes::copy_from_slice(input));
81 frame.semantics = semantics;
82
83 Ok(frame)
84 }
85
86 pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
88 let mut frame = self.parse(input)?;
89 frame.semantics = Some(semantics.clone());
90 Ok(frame)
91 }
92
93 fn detect_semantic_type(&self, value: &Value) -> SemanticType {
95 match value {
96 Value::Array(arr) => self.detect_array_semantics(arr),
97 Value::Object(obj) => self.detect_object_semantics(obj),
98 _ => SemanticType::Generic,
99 }
100 }
101
102 fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
104 if arr.is_empty() {
105 return SemanticType::Generic;
106 }
107
108 if self.is_numeric_array(arr) {
110 let dtype = self.detect_numeric_dtype(&arr[0]);
111 return SemanticType::NumericArray {
112 dtype,
113 length: Some(arr.len()),
114 };
115 }
116
117 if self.is_time_series_array(arr) {
119 return SemanticType::TimeSeries {
120 timestamp_field: "timestamp".to_string(),
121 value_fields: SmallVec::from_vec(vec!["value".to_string()]),
122 interval_ms: None,
123 };
124 }
125
126 if self.is_tabular_data(arr) {
128 let columns = self.extract_table_columns(&arr[0]);
129 return SemanticType::Table {
130 columns: Box::new(columns),
131 row_count: Some(arr.len()),
132 };
133 }
134
135 SemanticType::Generic
136 }
137
138 fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
140 if obj.contains_key("type") && obj.contains_key("coordinates") {
142 return SemanticType::Geospatial {
143 coordinate_system: "WGS84".to_string(),
144 geometry_type: obj
145 .get("type")
146 .and_then(|v| v.as_str())
147 .unwrap_or("Point")
148 .to_string(),
149 };
150 }
151
152 if obj.contains_key("timestamp") || obj.contains_key("time") {
154 let timestamp_field = if obj.contains_key("timestamp") {
155 "timestamp"
156 } else {
157 "time"
158 };
159
160 let value_fields: SmallVec<[String; 4]> = obj
161 .keys()
162 .filter(|k| {
163 *k != timestamp_field && self.looks_like_numeric_value(obj.get(*k).unwrap())
165 })
166 .map(|k| k.clone())
167 .collect();
168
169 if !value_fields.is_empty() {
170 return SemanticType::TimeSeries {
171 timestamp_field: timestamp_field.to_string(),
172 value_fields,
173 interval_ms: None,
174 };
175 }
176 }
177
178 if obj.contains_key("data") && obj.contains_key("shape") {
180 if let (Some(Value::Array(_)), Some(Value::Array(shape))) =
181 (obj.get("data"), obj.get("shape"))
182 {
183 let dimensions: SmallVec<[usize; 4]> = shape
184 .iter()
185 .filter_map(|v| v.as_u64().map(|n| n as usize))
186 .collect();
187
188 if !dimensions.is_empty() {
189 return SemanticType::Matrix {
190 dimensions,
191 dtype: crate::semantic::NumericDType::F64, };
193 }
194 }
195 }
196
197 SemanticType::Generic
198 }
199
200 fn is_numeric_array(&self, arr: &[Value]) -> bool {
202 arr.len() > 2 && arr.iter().all(|v| v.is_number())
203 }
204
205 fn is_time_series_array(&self, arr: &[Value]) -> bool {
207 arr.len() >= 2
208 && arr.iter().all(|v| {
209 if let Value::Object(obj) = v {
210 obj.contains_key("timestamp") || obj.contains_key("time")
211 } else {
212 false
213 }
214 })
215 }
216
217 fn is_tabular_data(&self, arr: &[Value]) -> bool {
219 if arr.len() < 2 {
220 return false;
221 }
222
223 let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
225 first.keys().collect()
226 } else {
227 return false;
228 };
229
230 arr.iter().all(|v| {
231 if let Value::Object(obj) = v {
232 let keys: std::collections::HashSet<_> = obj.keys().collect();
233 let intersection = first_keys.intersection(&keys).count();
235 let union = first_keys.union(&keys).count();
236 intersection as f64 / union as f64 > 0.8
237 } else {
238 false
239 }
240 })
241 }
242
243 fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
245 match value {
246 Value::Number(n) => {
247 if n.is_i64() {
248 crate::semantic::NumericDType::I64
249 } else if n.is_u64() {
250 crate::semantic::NumericDType::U64
251 } else {
252 crate::semantic::NumericDType::F64
253 }
254 }
255 _ => crate::semantic::NumericDType::F64,
256 }
257 }
258
259 fn extract_table_columns(
261 &self,
262 first_obj: &Value,
263 ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
264 let mut columns = SmallVec::new();
265
266 if let Value::Object(obj) = first_obj {
267 for (key, value) in obj {
268 let column_type = self.detect_column_type(value);
269 columns.push(crate::semantic::ColumnMeta {
270 name: key.clone(),
271 dtype: column_type,
272 nullable: false, });
274 }
275 }
276
277 columns
278 }
279
280 fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
282 match value {
283 Value::Number(n) => {
284 if n.is_i64() {
285 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
286 } else if n.is_u64() {
287 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
288 } else {
289 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
290 }
291 }
292 Value::String(_) => crate::semantic::ColumnType::String,
293 Value::Bool(_) => crate::semantic::ColumnType::Boolean,
294 Value::Array(_) => {
295 crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
296 }
297 _ => crate::semantic::ColumnType::Json,
298 }
299 }
300
301 fn looks_like_numeric_value(&self, value: &Value) -> bool {
303 value.is_number()
304 }
305
306 pub fn stats(&self) -> ParseStats {
308 ParseStats {
309 total_parses: 0,
310 semantic_detections: 0,
311 avg_parse_time_ms: 0.0,
312 }
313 }
314}
315
/// Counters returned by `SimpleParser::stats`.
///
/// `Clone` and `PartialEq` are derived so snapshots can be copied and compared.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct ParseStats {
    /// Number of parse calls.
    pub total_parses: u64,
    /// Number of semantic detections.
    pub semantic_detections: u64,
    /// Average parse time in milliseconds.
    pub avg_parse_time_ms: f64,
}
326
327impl Default for SimpleParser {
328 fn default() -> Self {
329 Self::new()
330 }
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with a default parser and returns the attached metadata.
    ///
    /// Panics when parsing fails or no metadata is attached, so a detection
    /// test can never pass without its assertion running.
    fn parsed_semantics(json: &[u8]) -> SemanticMeta {
        SimpleParser::new()
            .parse(json)
            .expect("valid JSON should parse")
            .semantics
            .expect("parse should always attach semantics")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_invalid_json_rejected() {
        let parser = SimpleParser::new();
        assert!(parser.parse(b"{not json").is_err());
    }

    #[test]
    fn test_detection_disabled_yields_generic() {
        let parser = SimpleParser::with_config(ParseConfig {
            detect_semantics: false,
            ..ParseConfig::default()
        });

        let frame = parser.parse(b"[1, 2, 3, 4, 5]").unwrap();
        let semantics = frame
            .semantics
            .expect("parse should always attach semantics");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_numeric_array_detection() {
        let semantics = parsed_semantics(b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray { .. }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let semantics = parsed_semantics(
            br#"[
                {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
                {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
            ]"#,
        );
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let semantics = parsed_semantics(br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let semantics = parsed_semantics(
            br#"[
                {"name": "John", "age": 30, "city": "New York"},
                {"name": "Jane", "age": 25, "city": "Boston"},
                {"name": "Bob", "age": 35, "city": "Chicago"}
            ]"#,
        );
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table { .. }
        ));
    }

    #[test]
    fn test_large_input_rejection() {
        let mut parser = SimpleParser::new();
        parser.config.max_size_mb = 1;

        // 2 MB of data against a 1 MB limit; the size check runs before any
        // JSON validation, so the content does not need to be valid JSON.
        let large_json = vec![b'a'; 2 * 1024 * 1024];
        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}