pjson_rs/parser/
simple.rs

1//! Simplified serde-based parser for PJS MVP
2//!
3//! This parser uses serde_json as the foundation and focuses on PJS's
4//! unique features: semantic analysis, chunking, and streaming.
5
6use crate::semantic::{SemanticMeta, SemanticType};
7use crate::{Error, Frame, Result};
8use bytes::Bytes;
9use serde_json::{self, Map, Value};
10use smallvec::SmallVec;
11
12/// Simple parser based on serde_json
13pub struct SimpleParser {
14    config: ParseConfig,
15}
16
/// Parser configuration.
///
/// Derives `PartialEq`/`Eq` so two configurations can be compared directly
/// (useful in tests and when deciding whether a parser must be rebuilt).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Enable automatic semantic type detection.
    pub detect_semantics: bool,
    /// Maximum JSON input size accepted in one parse call, in megabytes
    /// (1 MB = 1024 * 1024 bytes).
    pub max_size_mb: usize,
    /// Enable streaming for large arrays.
    ///
    /// NOTE(review): nothing in this file reads this flag — confirm whether
    /// another module consumes it.
    pub stream_large_arrays: bool,
    /// Array size threshold for streaming.
    ///
    /// NOTE(review): nothing in this file reads this threshold — confirm whether
    /// another module consumes it.
    pub stream_threshold: usize,
}
29
30impl Default for ParseConfig {
31    fn default() -> Self {
32        Self {
33            detect_semantics: true,
34            max_size_mb: 100,
35            stream_large_arrays: true,
36            stream_threshold: 1000,
37        }
38    }
39}
40
41impl SimpleParser {
42    /// Create new simple parser with default config
43    pub fn new() -> Self {
44        Self {
45            config: ParseConfig::default(),
46        }
47    }
48
49    /// Create parser with custom config
50    pub fn with_config(config: ParseConfig) -> Self {
51        Self { config }
52    }
53
54    /// Parse JSON bytes into PJS Frame
55    pub fn parse(&self, input: &[u8]) -> Result<Frame> {
56        // Basic size check
57        if input.len() > self.config.max_size_mb * 1024 * 1024 {
58            let input_mb = input.len() / (1024 * 1024);
59            let max_mb = self.config.max_size_mb;
60            return Err(Error::buffer(format!(
61                "Input too large: {input_mb} MB, max: {max_mb} MB"
62            )));
63        }
64
65        // Parse with serde_json
66        let value: Value = serde_json::from_slice(input)
67            .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
68
69        // Detect semantic type
70        let semantic_type = if self.config.detect_semantics {
71            self.detect_semantic_type(&value)
72        } else {
73            SemanticType::Generic
74        };
75
76        // Create semantic metadata
77        let semantics = Some(SemanticMeta::new(semantic_type));
78
79        // Create frame
80        let mut frame = Frame::new(Bytes::copy_from_slice(input));
81        frame.semantics = semantics;
82
83        Ok(frame)
84    }
85
86    /// Parse with explicit semantic hints
87    pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
88        let mut frame = self.parse(input)?;
89        frame.semantics = Some(semantics.clone());
90        Ok(frame)
91    }
92
93    /// Detect semantic type from JSON structure
94    fn detect_semantic_type(&self, value: &Value) -> SemanticType {
95        match value {
96            Value::Array(arr) => self.detect_array_semantics(arr),
97            Value::Object(obj) => self.detect_object_semantics(obj),
98            _ => SemanticType::Generic,
99        }
100    }
101
102    /// Detect semantic type for JSON arrays
103    fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
104        if arr.is_empty() {
105            return SemanticType::Generic;
106        }
107
108        // Check if it's a numeric array
109        if self.is_numeric_array(arr) {
110            let dtype = self.detect_numeric_dtype(&arr[0]);
111            return SemanticType::NumericArray {
112                dtype,
113                length: Some(arr.len()),
114            };
115        }
116
117        // Check if it's time series data
118        if self.is_time_series_array(arr) {
119            return SemanticType::TimeSeries {
120                timestamp_field: "timestamp".to_string(),
121                value_fields: SmallVec::from_vec(vec!["value".to_string()]),
122                interval_ms: None,
123            };
124        }
125
126        // Check if it's tabular data
127        if self.is_tabular_data(arr) {
128            let columns = self.extract_table_columns(&arr[0]);
129            return SemanticType::Table {
130                columns: Box::new(columns),
131                row_count: Some(arr.len()),
132            };
133        }
134
135        SemanticType::Generic
136    }
137
138    /// Detect semantic type for JSON objects
139    fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
140        // GeoJSON detection
141        if obj.contains_key("type") && obj.contains_key("coordinates") {
142            return SemanticType::Geospatial {
143                coordinate_system: "WGS84".to_string(),
144                geometry_type: obj
145                    .get("type")
146                    .and_then(|v| v.as_str())
147                    .unwrap_or("Point")
148                    .to_string(),
149            };
150        }
151
152        // Time series single point
153        if obj.contains_key("timestamp") || obj.contains_key("time") {
154            let timestamp_field = if obj.contains_key("timestamp") {
155                "timestamp"
156            } else {
157                "time"
158            };
159
160            let value_fields: SmallVec<[String; 4]> = obj
161                .keys()
162                .filter(|k| {
163                    // TODO: Handle unwrap() - add proper error handling for object field access
164                    *k != timestamp_field && self.looks_like_numeric_value(obj.get(*k).unwrap())
165                })
166                .map(|k| k.clone())
167                .collect();
168
169            if !value_fields.is_empty() {
170                return SemanticType::TimeSeries {
171                    timestamp_field: timestamp_field.to_string(),
172                    value_fields,
173                    interval_ms: None,
174                };
175            }
176        }
177
178        // Matrix/image data detection
179        if obj.contains_key("data") && obj.contains_key("shape") {
180            if let (Some(Value::Array(_)), Some(Value::Array(shape))) =
181                (obj.get("data"), obj.get("shape"))
182            {
183                let dimensions: SmallVec<[usize; 4]> = shape
184                    .iter()
185                    .filter_map(|v| v.as_u64().map(|n| n as usize))
186                    .collect();
187
188                if !dimensions.is_empty() {
189                    return SemanticType::Matrix {
190                        dimensions,
191                        dtype: crate::semantic::NumericDType::F64, // Default
192                    };
193                }
194            }
195        }
196
197        SemanticType::Generic
198    }
199
200    /// Check if array contains only numeric values
201    fn is_numeric_array(&self, arr: &[Value]) -> bool {
202        arr.len() > 2 && arr.iter().all(|v| v.is_number())
203    }
204
205    /// Check if array looks like time series data
206    fn is_time_series_array(&self, arr: &[Value]) -> bool {
207        arr.len() >= 2
208            && arr.iter().all(|v| {
209                if let Value::Object(obj) = v {
210                    obj.contains_key("timestamp") || obj.contains_key("time")
211                } else {
212                    false
213                }
214            })
215    }
216
217    /// Check if array looks like tabular data (array of similar objects)
218    fn is_tabular_data(&self, arr: &[Value]) -> bool {
219        if arr.len() < 2 {
220            return false;
221        }
222
223        // Check if all elements are objects with similar structure
224        let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
225            first.keys().collect()
226        } else {
227            return false;
228        };
229
230        arr.iter().all(|v| {
231            if let Value::Object(obj) = v {
232                let keys: std::collections::HashSet<_> = obj.keys().collect();
233                // Allow some variation in keys (80% similarity)
234                let intersection = first_keys.intersection(&keys).count();
235                let union = first_keys.union(&keys).count();
236                intersection as f64 / union as f64 > 0.8
237            } else {
238                false
239            }
240        })
241    }
242
243    /// Detect numeric data type from Value
244    fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
245        match value {
246            Value::Number(n) => {
247                if n.is_i64() {
248                    crate::semantic::NumericDType::I64
249                } else if n.is_u64() {
250                    crate::semantic::NumericDType::U64
251                } else {
252                    crate::semantic::NumericDType::F64
253                }
254            }
255            _ => crate::semantic::NumericDType::F64,
256        }
257    }
258
259    /// Extract table columns from first object
260    fn extract_table_columns(
261        &self,
262        first_obj: &Value,
263    ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
264        let mut columns = SmallVec::new();
265
266        if let Value::Object(obj) = first_obj {
267            for (key, value) in obj {
268                let column_type = self.detect_column_type(value);
269                columns.push(crate::semantic::ColumnMeta {
270                    name: key.clone(),
271                    dtype: column_type,
272                    nullable: false, // Simplified - would need more analysis
273                });
274            }
275        }
276
277        columns
278    }
279
280    /// Detect column type from JSON value
281    fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
282        match value {
283            Value::Number(n) => {
284                if n.is_i64() {
285                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
286                } else if n.is_u64() {
287                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
288                } else {
289                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
290                }
291            }
292            Value::String(_) => crate::semantic::ColumnType::String,
293            Value::Bool(_) => crate::semantic::ColumnType::Boolean,
294            Value::Array(_) => {
295                crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
296            }
297            _ => crate::semantic::ColumnType::Json,
298        }
299    }
300
301    /// Check if value looks like a numeric measurement
302    fn looks_like_numeric_value(&self, value: &Value) -> bool {
303        value.is_number()
304    }
305
306    /// Get parser statistics (placeholder for metrics)
307    pub fn stats(&self) -> ParseStats {
308        ParseStats {
309            total_parses: 0,
310            semantic_detections: 0,
311            avg_parse_time_ms: 0.0,
312        }
313    }
314}
315
/// Parser statistics.
///
/// Derives `Clone` and `PartialEq` so snapshots can be stored and compared.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct ParseStats {
    /// Total number of parse operations.
    pub total_parses: u64,
    /// Number of successful semantic type detections.
    pub semantic_detections: u64,
    /// Average parse time in milliseconds.
    pub avg_parse_time_ms: f64,
}
326
327impl Default for SimpleParser {
328    fn default() -> Self {
329        Self::new()
330    }
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with a default parser and returns the attached semantics.
    ///
    /// Panics when parsing fails or no metadata is attached, so a detection test
    /// can never pass vacuously.
    fn parsed_semantics(json: &[u8]) -> SemanticMeta {
        SimpleParser::new()
            .parse(json)
            .expect("valid JSON should parse")
            .semantics
            .expect("parse() should attach semantic metadata")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_invalid_json_rejection() {
        let parser = SimpleParser::new();
        assert!(parser.parse(b"{not json").is_err());
    }

    #[test]
    fn test_detection_disabled_yields_generic() {
        let parser = SimpleParser::with_config(ParseConfig {
            detect_semantics: false,
            ..ParseConfig::default()
        });

        let semantics = parser
            .parse(b"[1, 2, 3, 4, 5]")
            .expect("valid JSON should parse")
            .semantics
            .expect("parse() should attach semantic metadata");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_numeric_array_detection() {
        let semantics = parsed_semantics(b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray { .. }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let semantics = parsed_semantics(
            br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#,
        );
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let semantics = parsed_semantics(br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let semantics = parsed_semantics(
            br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#,
        );
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table { .. }
        ));
    }

    #[test]
    fn test_explicit_semantics_are_attached() {
        let parser = SimpleParser::new();
        let hint = SemanticMeta::new(SemanticType::Generic);

        let frame = parser
            .parse_with_semantics(b"[1, 2, 3, 4, 5]", &hint)
            .expect("valid JSON should parse");
        let semantics = frame
            .semantics
            .expect("explicit semantics should be attached");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_large_input_rejection() {
        let parser = SimpleParser::with_config(ParseConfig {
            max_size_mb: 1, // 1MB limit
            ..ParseConfig::default()
        });

        let large_json = vec![b'a'; 2 * 1024 * 1024]; // 2MB
        assert!(parser.parse(&large_json).is_err());
    }
}