pjson_rs/parser/
simple.rs

1//! Simplified serde-based parser for PJS MVP
2//!
3//! This parser uses serde_json as the foundation and focuses on PJS's
4//! unique features: semantic analysis, chunking, and streaming.
5
6use crate::semantic::{SemanticMeta, SemanticType};
7use crate::{Error, Frame, Result};
8use bytes::Bytes;
9use serde_json::{self, Map, Value};
10use smallvec::SmallVec;
11
12/// Simple parser based on serde_json
13pub struct SimpleParser {
14    config: ParseConfig,
15}
16
/// Tunable options for the simple parser.
#[derive(Clone, Debug)]
pub struct ParseConfig {
    /// When `true`, the parser inspects the parsed JSON to infer a semantic type.
    pub detect_semantics: bool,
    /// Largest input accepted by a single parse call, in megabytes.
    pub max_size_mb: usize,
    /// Whether large arrays should be streamed.
    ///
    /// NOTE(review): not read by the parser code visible in this file — confirm where it is used.
    pub stream_large_arrays: bool,
    /// Array size threshold that controls streaming.
    ///
    /// NOTE(review): not read by the parser code visible in this file — confirm where it is used.
    pub stream_threshold: usize,
}
29
30impl Default for ParseConfig {
31    fn default() -> Self {
32        Self {
33            detect_semantics: true,
34            max_size_mb: 100,
35            stream_large_arrays: true,
36            stream_threshold: 1000,
37        }
38    }
39}
40
41impl SimpleParser {
42    /// Create new simple parser with default config
43    pub fn new() -> Self {
44        Self {
45            config: ParseConfig::default(),
46        }
47    }
48
49    /// Create parser with custom config
50    pub fn with_config(config: ParseConfig) -> Self {
51        Self { config }
52    }
53
54    /// Parse JSON bytes into PJS Frame
55    pub fn parse(&self, input: &[u8]) -> Result<Frame> {
56        // Basic size check
57        if input.len() > self.config.max_size_mb * 1024 * 1024 {
58            let input_mb = input.len() / (1024 * 1024);
59            let max_mb = self.config.max_size_mb;
60            return Err(Error::buffer(format!(
61                "Input too large: {input_mb} MB, max: {max_mb} MB"
62            )));
63        }
64
65        // Parse with serde_json
66        let value: Value = serde_json::from_slice(input)
67            .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
68
69        // Detect semantic type
70        let semantic_type = if self.config.detect_semantics {
71            self.detect_semantic_type(&value)
72        } else {
73            SemanticType::Generic
74        };
75
76        // Create semantic metadata
77        let semantics = Some(SemanticMeta::new(semantic_type));
78
79        // Create frame
80        let mut frame = Frame::new(Bytes::copy_from_slice(input));
81        frame.semantics = semantics;
82
83        Ok(frame)
84    }
85
86    /// Parse with explicit semantic hints
87    pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
88        let mut frame = self.parse(input)?;
89        frame.semantics = Some(semantics.clone());
90        Ok(frame)
91    }
92
93    /// Detect semantic type from JSON structure
94    fn detect_semantic_type(&self, value: &Value) -> SemanticType {
95        match value {
96            Value::Array(arr) => self.detect_array_semantics(arr),
97            Value::Object(obj) => self.detect_object_semantics(obj),
98            _ => SemanticType::Generic,
99        }
100    }
101
102    /// Detect semantic type for JSON arrays
103    fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
104        if arr.is_empty() {
105            return SemanticType::Generic;
106        }
107
108        // Check if it's a numeric array
109        if self.is_numeric_array(arr) {
110            let dtype = self.detect_numeric_dtype(&arr[0]);
111            return SemanticType::NumericArray {
112                dtype,
113                length: Some(arr.len()),
114            };
115        }
116
117        // Check if it's time series data
118        if self.is_time_series_array(arr) {
119            return SemanticType::TimeSeries {
120                timestamp_field: "timestamp".to_string(),
121                value_fields: SmallVec::from_vec(vec!["value".to_string()]),
122                interval_ms: None,
123            };
124        }
125
126        // Check if it's tabular data
127        if self.is_tabular_data(arr) {
128            let columns = self.extract_table_columns(&arr[0]);
129            return SemanticType::Table {
130                columns: Box::new(columns),
131                row_count: Some(arr.len()),
132            };
133        }
134
135        SemanticType::Generic
136    }
137
138    /// Detect semantic type for JSON objects
139    fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
140        // GeoJSON detection
141        if obj.contains_key("type") && obj.contains_key("coordinates") {
142            return SemanticType::Geospatial {
143                coordinate_system: "WGS84".to_string(),
144                geometry_type: obj
145                    .get("type")
146                    .and_then(|v| v.as_str())
147                    .unwrap_or("Point")
148                    .to_string(),
149            };
150        }
151
152        // Time series single point
153        if obj.contains_key("timestamp") || obj.contains_key("time") {
154            let timestamp_field = if obj.contains_key("timestamp") {
155                "timestamp"
156            } else {
157                "time"
158            };
159
160            let value_fields: SmallVec<[String; 4]> = obj
161                .keys()
162                .filter(|k| {
163                    // TODO: Handle unwrap() - add proper error handling for object field access
164                    *k != timestamp_field && self.looks_like_numeric_value(obj.get(*k).unwrap())
165                }).cloned()
166                .collect();
167
168            if !value_fields.is_empty() {
169                return SemanticType::TimeSeries {
170                    timestamp_field: timestamp_field.to_string(),
171                    value_fields,
172                    interval_ms: None,
173                };
174            }
175        }
176
177        // Matrix/image data detection
178        if obj.contains_key("data") && obj.contains_key("shape")
179            && let (Some(Value::Array(_)), Some(Value::Array(shape))) =
180                (obj.get("data"), obj.get("shape"))
181            {
182                let dimensions: SmallVec<[usize; 4]> = shape
183                    .iter()
184                    .filter_map(|v| v.as_u64().map(|n| n as usize))
185                    .collect();
186
187                if !dimensions.is_empty() {
188                    return SemanticType::Matrix {
189                        dimensions,
190                        dtype: crate::semantic::NumericDType::F64, // Default
191                    };
192                }
193            }
194
195        SemanticType::Generic
196    }
197
198    /// Check if array contains only numeric values
199    fn is_numeric_array(&self, arr: &[Value]) -> bool {
200        arr.len() > 2 && arr.iter().all(|v| v.is_number())
201    }
202
203    /// Check if array looks like time series data
204    fn is_time_series_array(&self, arr: &[Value]) -> bool {
205        arr.len() >= 2
206            && arr.iter().all(|v| {
207                if let Value::Object(obj) = v {
208                    obj.contains_key("timestamp") || obj.contains_key("time")
209                } else {
210                    false
211                }
212            })
213    }
214
215    /// Check if array looks like tabular data (array of similar objects)
216    fn is_tabular_data(&self, arr: &[Value]) -> bool {
217        if arr.len() < 2 {
218            return false;
219        }
220
221        // Check if all elements are objects with similar structure
222        let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
223            first.keys().collect()
224        } else {
225            return false;
226        };
227
228        arr.iter().all(|v| {
229            if let Value::Object(obj) = v {
230                let keys: std::collections::HashSet<_> = obj.keys().collect();
231                // Allow some variation in keys (80% similarity)
232                let intersection = first_keys.intersection(&keys).count();
233                let union = first_keys.union(&keys).count();
234                intersection as f64 / union as f64 > 0.8
235            } else {
236                false
237            }
238        })
239    }
240
241    /// Detect numeric data type from Value
242    fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
243        match value {
244            Value::Number(n) => {
245                if n.is_i64() {
246                    crate::semantic::NumericDType::I64
247                } else if n.is_u64() {
248                    crate::semantic::NumericDType::U64
249                } else {
250                    crate::semantic::NumericDType::F64
251                }
252            }
253            _ => crate::semantic::NumericDType::F64,
254        }
255    }
256
257    /// Extract table columns from first object
258    fn extract_table_columns(
259        &self,
260        first_obj: &Value,
261    ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
262        let mut columns = SmallVec::new();
263
264        if let Value::Object(obj) = first_obj {
265            for (key, value) in obj {
266                let column_type = self.detect_column_type(value);
267                columns.push(crate::semantic::ColumnMeta {
268                    name: key.clone(),
269                    dtype: column_type,
270                    nullable: false, // Simplified - would need more analysis
271                });
272            }
273        }
274
275        columns
276    }
277
278    /// Detect column type from JSON value
279    fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
280        match value {
281            Value::Number(n) => {
282                if n.is_i64() {
283                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
284                } else if n.is_u64() {
285                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
286                } else {
287                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
288                }
289            }
290            Value::String(_) => crate::semantic::ColumnType::String,
291            Value::Bool(_) => crate::semantic::ColumnType::Boolean,
292            Value::Array(_) => {
293                crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
294            }
295            _ => crate::semantic::ColumnType::Json,
296        }
297    }
298
299    /// Check if value looks like a numeric measurement
300    fn looks_like_numeric_value(&self, value: &Value) -> bool {
301        value.is_number()
302    }
303
304    /// Get parser statistics (placeholder for metrics)
305    pub fn stats(&self) -> ParseStats {
306        ParseStats {
307            total_parses: 0,
308            semantic_detections: 0,
309            avg_parse_time_ms: 0.0,
310        }
311    }
312}
313
/// Counters describing parser activity.
#[derive(Default, Debug)]
pub struct ParseStats {
    /// How many parse operations have been performed.
    pub total_parses: u64,
    /// How many semantic type detections succeeded.
    pub semantic_detections: u64,
    /// Mean duration of a parse, in milliseconds.
    pub avg_parse_time_ms: f64,
}
324
325impl Default for SimpleParser {
326    fn default() -> Self {
327        Self::new()
328    }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `json` with a default parser and return the attached semantic metadata.
    ///
    /// Panics (failing the calling test) when parsing fails or no metadata is
    /// attached, so a missing detection can never be mistaken for a pass.
    fn semantics_of(json: &[u8]) -> SemanticMeta {
        SimpleParser::new()
            .parse(json)
            .expect("test input should be valid JSON")
            .semantics
            .expect("parse should attach semantic metadata")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_invalid_json_rejection() {
        let parser = SimpleParser::new();
        assert!(parser.parse(b"{not json").is_err());
    }

    #[test]
    fn test_numeric_array_detection() {
        let semantics = semantics_of(b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray { .. }
        ));
    }

    #[test]
    fn test_empty_array_is_generic() {
        let semantics = semantics_of(b"[]");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let semantics = semantics_of(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let json = br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#;

        let semantics = semantics_of(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let json = br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#;

        let semantics = semantics_of(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table { .. }
        ));
    }

    #[test]
    fn test_semantic_detection_disabled() {
        let parser = SimpleParser::with_config(ParseConfig {
            detect_semantics: false,
            ..ParseConfig::default()
        });

        let frame = parser
            .parse(b"[1, 2, 3, 4, 5]")
            .expect("valid JSON should parse");
        let semantics = frame
            .semantics
            .expect("parse should attach semantic metadata");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_parse_with_semantics_overrides_detection() {
        let parser = SimpleParser::new();
        let hint = SemanticMeta::new(SemanticType::Generic);

        // The input would be detected as a numeric array, but the hint must win.
        let frame = parser
            .parse_with_semantics(b"[1, 2, 3, 4, 5]", &hint)
            .expect("valid JSON should parse");
        let semantics = frame
            .semantics
            .expect("explicit semantics should be attached");
        assert!(matches!(semantics.semantic_type, SemanticType::Generic));
    }

    #[test]
    fn test_large_input_rejection() {
        let parser = SimpleParser::with_config(ParseConfig {
            max_size_mb: 1, // 1MB limit
            ..ParseConfig::default()
        });

        let large_json = vec![b'a'; 2 * 1024 * 1024]; // 2MB
        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}
426}