1use crate::semantic::{SemanticMeta, SemanticType};
7use crate::{Error, Frame, Result};
8use bytes::Bytes;
9use serde_json::{self, Map, Value};
10use smallvec::SmallVec;
11
12pub struct SimpleParser {
14 config: ParseConfig,
15}
16
/// Tunable settings for [`SimpleParser`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// When `true`, the parser inspects the parsed JSON and records a detected
    /// semantic type; when `false`, the generic type is recorded instead.
    pub detect_semantics: bool,
    /// Largest accepted input, in megabytes (the parser multiplies this by 1024 * 1024).
    pub max_size_mb: usize,
    /// Streaming switch for large arrays.
    // NOTE(review): not read by any code visible in this file — confirm where it is consumed.
    pub stream_large_arrays: bool,
    /// Threshold associated with `stream_large_arrays`.
    // NOTE(review): unit (elements vs. bytes) is not established in this file — confirm.
    pub stream_threshold: usize,
}
29
30impl Default for ParseConfig {
31 fn default() -> Self {
32 Self {
33 detect_semantics: true,
34 max_size_mb: 100,
35 stream_large_arrays: true,
36 stream_threshold: 1000,
37 }
38 }
39}
40
41impl SimpleParser {
42 pub fn new() -> Self {
44 Self {
45 config: ParseConfig::default(),
46 }
47 }
48
49 pub fn with_config(config: ParseConfig) -> Self {
51 Self { config }
52 }
53
54 pub fn parse(&self, input: &[u8]) -> Result<Frame> {
56 if input.len() > self.config.max_size_mb * 1024 * 1024 {
58 let input_mb = input.len() / (1024 * 1024);
59 let max_mb = self.config.max_size_mb;
60 return Err(Error::buffer(format!(
61 "Input too large: {input_mb} MB, max: {max_mb} MB"
62 )));
63 }
64
65 let value: Value = serde_json::from_slice(input)
67 .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
68
69 let semantic_type = if self.config.detect_semantics {
71 self.detect_semantic_type(&value)
72 } else {
73 SemanticType::Generic
74 };
75
76 let semantics = Some(SemanticMeta::new(semantic_type));
78
79 let mut frame = Frame::new(Bytes::copy_from_slice(input));
81 frame.semantics = semantics;
82
83 Ok(frame)
84 }
85
86 pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
88 let mut frame = self.parse(input)?;
89 frame.semantics = Some(semantics.clone());
90 Ok(frame)
91 }
92
93 fn detect_semantic_type(&self, value: &Value) -> SemanticType {
95 match value {
96 Value::Array(arr) => self.detect_array_semantics(arr),
97 Value::Object(obj) => self.detect_object_semantics(obj),
98 _ => SemanticType::Generic,
99 }
100 }
101
102 fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
104 if arr.is_empty() {
105 return SemanticType::Generic;
106 }
107
108 if self.is_numeric_array(arr) {
110 let dtype = self.detect_numeric_dtype(&arr[0]);
111 return SemanticType::NumericArray {
112 dtype,
113 length: Some(arr.len()),
114 };
115 }
116
117 if self.is_time_series_array(arr) {
119 return SemanticType::TimeSeries {
120 timestamp_field: "timestamp".to_string(),
121 value_fields: SmallVec::from_vec(vec!["value".to_string()]),
122 interval_ms: None,
123 };
124 }
125
126 if self.is_tabular_data(arr) {
128 let columns = self.extract_table_columns(&arr[0]);
129 return SemanticType::Table {
130 columns: Box::new(columns),
131 row_count: Some(arr.len()),
132 };
133 }
134
135 SemanticType::Generic
136 }
137
138 fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
140 if obj.contains_key("type") && obj.contains_key("coordinates") {
142 return SemanticType::Geospatial {
143 coordinate_system: "WGS84".to_string(),
144 geometry_type: obj
145 .get("type")
146 .and_then(|v| v.as_str())
147 .unwrap_or("Point")
148 .to_string(),
149 };
150 }
151
152 if obj.contains_key("timestamp") || obj.contains_key("time") {
154 let timestamp_field = if obj.contains_key("timestamp") {
155 "timestamp"
156 } else {
157 "time"
158 };
159
160 let value_fields: SmallVec<[String; 4]> = obj
161 .keys()
162 .filter(|k| {
163 *k != timestamp_field && self.looks_like_numeric_value(obj.get(*k).unwrap())
165 }).cloned()
166 .collect();
167
168 if !value_fields.is_empty() {
169 return SemanticType::TimeSeries {
170 timestamp_field: timestamp_field.to_string(),
171 value_fields,
172 interval_ms: None,
173 };
174 }
175 }
176
177 if obj.contains_key("data") && obj.contains_key("shape")
179 && let (Some(Value::Array(_)), Some(Value::Array(shape))) =
180 (obj.get("data"), obj.get("shape"))
181 {
182 let dimensions: SmallVec<[usize; 4]> = shape
183 .iter()
184 .filter_map(|v| v.as_u64().map(|n| n as usize))
185 .collect();
186
187 if !dimensions.is_empty() {
188 return SemanticType::Matrix {
189 dimensions,
190 dtype: crate::semantic::NumericDType::F64, };
192 }
193 }
194
195 SemanticType::Generic
196 }
197
198 fn is_numeric_array(&self, arr: &[Value]) -> bool {
200 arr.len() > 2 && arr.iter().all(|v| v.is_number())
201 }
202
203 fn is_time_series_array(&self, arr: &[Value]) -> bool {
205 arr.len() >= 2
206 && arr.iter().all(|v| {
207 if let Value::Object(obj) = v {
208 obj.contains_key("timestamp") || obj.contains_key("time")
209 } else {
210 false
211 }
212 })
213 }
214
215 fn is_tabular_data(&self, arr: &[Value]) -> bool {
217 if arr.len() < 2 {
218 return false;
219 }
220
221 let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
223 first.keys().collect()
224 } else {
225 return false;
226 };
227
228 arr.iter().all(|v| {
229 if let Value::Object(obj) = v {
230 let keys: std::collections::HashSet<_> = obj.keys().collect();
231 let intersection = first_keys.intersection(&keys).count();
233 let union = first_keys.union(&keys).count();
234 intersection as f64 / union as f64 > 0.8
235 } else {
236 false
237 }
238 })
239 }
240
241 fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
243 match value {
244 Value::Number(n) => {
245 if n.is_i64() {
246 crate::semantic::NumericDType::I64
247 } else if n.is_u64() {
248 crate::semantic::NumericDType::U64
249 } else {
250 crate::semantic::NumericDType::F64
251 }
252 }
253 _ => crate::semantic::NumericDType::F64,
254 }
255 }
256
257 fn extract_table_columns(
259 &self,
260 first_obj: &Value,
261 ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
262 let mut columns = SmallVec::new();
263
264 if let Value::Object(obj) = first_obj {
265 for (key, value) in obj {
266 let column_type = self.detect_column_type(value);
267 columns.push(crate::semantic::ColumnMeta {
268 name: key.clone(),
269 dtype: column_type,
270 nullable: false, });
272 }
273 }
274
275 columns
276 }
277
278 fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
280 match value {
281 Value::Number(n) => {
282 if n.is_i64() {
283 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
284 } else if n.is_u64() {
285 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
286 } else {
287 crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
288 }
289 }
290 Value::String(_) => crate::semantic::ColumnType::String,
291 Value::Bool(_) => crate::semantic::ColumnType::Boolean,
292 Value::Array(_) => {
293 crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
294 }
295 _ => crate::semantic::ColumnType::Json,
296 }
297 }
298
299 fn looks_like_numeric_value(&self, value: &Value) -> bool {
301 value.is_number()
302 }
303
304 pub fn stats(&self) -> ParseStats {
306 ParseStats {
307 total_parses: 0,
308 semantic_detections: 0,
309 avg_parse_time_ms: 0.0,
310 }
311 }
312}
313
/// Counters describing parser activity, as returned by `SimpleParser::stats`.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct ParseStats {
    /// Number of parse calls.
    pub total_parses: u64,
    /// Number of semantic detections.
    // NOTE(review): whether generic results count is not established in this file — confirm.
    pub semantic_detections: u64,
    /// Mean parse duration in milliseconds.
    pub avg_parse_time_ms: f64,
}
324
325impl Default for SimpleParser {
326 fn default() -> Self {
327 Self::new()
328 }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with `parser` and returns the attached semantic metadata.
    ///
    /// Panics (failing the test) when parsing fails or no metadata is attached, so a
    /// missing result can never make a detection test pass vacuously.
    fn semantics_of(parser: &SimpleParser, json: &[u8]) -> SemanticMeta {
        parser
            .parse(json)
            .expect("input should parse")
            .semantics
            .expect("parse should attach semantic metadata")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_invalid_json_rejection() {
        let parser = SimpleParser::new();
        assert!(parser.parse(b"{not valid json").is_err());
    }

    #[test]
    fn test_detection_disabled_is_generic() {
        let parser = SimpleParser::with_config(ParseConfig {
            detect_semantics: false,
            ..ParseConfig::default()
        });

        let meta = semantics_of(&parser, b"[1, 2, 3, 4, 5]");
        assert!(matches!(meta.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_numeric_array_detection() {
        let parser = SimpleParser::new();

        let meta = semantics_of(&parser, b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            meta.semantic_type,
            SemanticType::NumericArray {
                length: Some(5),
                ..
            }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let parser = SimpleParser::new();
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let meta = semantics_of(&parser, json);
        assert!(matches!(
            meta.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let parser = SimpleParser::new();
        let json = br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#;

        let meta = semantics_of(&parser, json);
        assert!(matches!(
            meta.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let parser = SimpleParser::new();
        let json = br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#;

        let meta = semantics_of(&parser, json);
        assert!(matches!(
            meta.semantic_type,
            SemanticType::Table {
                row_count: Some(3),
                ..
            }
        ));
    }

    #[test]
    fn test_large_input_rejection() {
        let parser = SimpleParser::with_config(ParseConfig {
            max_size_mb: 1,
            ..ParseConfig::default()
        });
        // 2 MB of bytes: over the 1 MB cap, so it must be rejected before JSON parsing.
        let large_json = vec![b'a'; 2 * 1024 * 1024];

        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}