pjson_rs/parser/
simple.rs

1//! Simplified serde-based parser for PJS MVP
2//!
3//! This parser uses serde_json as the foundation and focuses on PJS's
4//! unique features: semantic analysis, chunking, and streaming.
5
6use crate::config::SecurityConfig;
7use crate::security::SecurityValidator;
8use crate::semantic::{SemanticMeta, SemanticType};
9use crate::{Error, Frame, Result};
10use bytes::Bytes;
11use serde_json::{self, Map, Value};
12use smallvec::SmallVec;
13
14/// Simple parser based on serde_json
15pub struct SimpleParser {
16    config: ParseConfig,
17    validator: SecurityValidator,
18}
19
/// Tuning options for the parser.
///
/// Use [`ParseConfig::default`] for the standard settings and override
/// individual fields as needed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Enable automatic semantic type detection (default: `true`).
    pub detect_semantics: bool,
    /// Maximum JSON size to parse at once, in megabytes (default: `100`).
    pub max_size_mb: usize,
    /// Enable streaming for large arrays (default: `true`).
    pub stream_large_arrays: bool,
    /// Array length threshold for streaming (default: `1000`).
    pub stream_threshold: usize,
}

impl Default for ParseConfig {
    /// Returns the standard configuration: semantic detection on, a 100 MB
    /// size limit, and streaming enabled with a threshold of 1000.
    fn default() -> Self {
        Self {
            detect_semantics: true,
            max_size_mb: 100,
            stream_large_arrays: true,
            stream_threshold: 1000,
        }
    }
}
43
44impl SimpleParser {
45    /// Create new simple parser with default config
46    pub fn new() -> Self {
47        Self {
48            config: ParseConfig::default(),
49            validator: SecurityValidator::default(),
50        }
51    }
52
53    /// Create parser with custom config
54    pub fn with_config(config: ParseConfig) -> Self {
55        Self {
56            config,
57            validator: SecurityValidator::default(),
58        }
59    }
60
61    /// Create parser with custom security config
62    pub fn with_security_config(config: ParseConfig, security_config: SecurityConfig) -> Self {
63        Self {
64            config,
65            validator: SecurityValidator::new(security_config),
66        }
67    }
68
69    /// Parse JSON bytes into PJS Frame
70    pub fn parse(&self, input: &[u8]) -> Result<Frame> {
71        // Security validation
72        self.validator.validate_input_size(input.len())?;
73
74        // Parse with serde_json
75        let value: Value = serde_json::from_slice(input)
76            .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
77
78        // Detect semantic type
79        let semantic_type = if self.config.detect_semantics {
80            self.detect_semantic_type(&value)
81        } else {
82            SemanticType::Generic
83        };
84
85        // Create semantic metadata
86        let semantics = Some(SemanticMeta::new(semantic_type));
87
88        // Create frame
89        let mut frame = Frame::new(Bytes::copy_from_slice(input));
90        frame.semantics = semantics;
91
92        Ok(frame)
93    }
94
95    /// Parse with explicit semantic hints
96    pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
97        let mut frame = self.parse(input)?;
98        frame.semantics = Some(semantics.clone());
99        Ok(frame)
100    }
101
102    /// Detect semantic type from JSON structure
103    fn detect_semantic_type(&self, value: &Value) -> SemanticType {
104        match value {
105            Value::Array(arr) => self.detect_array_semantics(arr),
106            Value::Object(obj) => self.detect_object_semantics(obj),
107            _ => SemanticType::Generic,
108        }
109    }
110
111    /// Detect semantic type for JSON arrays
112    fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
113        if arr.is_empty() {
114            return SemanticType::Generic;
115        }
116
117        // Check if it's a numeric array
118        if self.is_numeric_array(arr) {
119            let dtype = self.detect_numeric_dtype(&arr[0]);
120            return SemanticType::NumericArray {
121                dtype,
122                length: Some(arr.len()),
123            };
124        }
125
126        // Check if it's time series data
127        if self.is_time_series_array(arr) {
128            return SemanticType::TimeSeries {
129                timestamp_field: "timestamp".to_string(),
130                value_fields: SmallVec::from_vec(vec!["value".to_string()]),
131                interval_ms: None,
132            };
133        }
134
135        // Check if it's tabular data
136        if self.is_tabular_data(arr) {
137            let columns = self.extract_table_columns(&arr[0]);
138            return SemanticType::Table {
139                columns: Box::new(columns),
140                row_count: Some(arr.len()),
141            };
142        }
143
144        SemanticType::Generic
145    }
146
147    /// Detect semantic type for JSON objects
148    fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
149        // GeoJSON detection
150        if obj.contains_key("type") && obj.contains_key("coordinates") {
151            return SemanticType::Geospatial {
152                coordinate_system: "WGS84".to_string(),
153                geometry_type: obj
154                    .get("type")
155                    .and_then(|v| v.as_str())
156                    .unwrap_or("Point")
157                    .to_string(),
158            };
159        }
160
161        // Time series single point
162        if obj.contains_key("timestamp") || obj.contains_key("time") {
163            let timestamp_field = if obj.contains_key("timestamp") {
164                "timestamp"
165            } else {
166                "time"
167            };
168
169            let value_fields: SmallVec<[String; 4]> = obj
170                .keys()
171                .filter(|k| {
172                    // TODO: Handle unwrap() - add proper error handling for object field access
173                    *k != timestamp_field && self.looks_like_numeric_value(obj.get(*k).unwrap())
174                })
175                .cloned()
176                .collect();
177
178            if !value_fields.is_empty() {
179                return SemanticType::TimeSeries {
180                    timestamp_field: timestamp_field.to_string(),
181                    value_fields,
182                    interval_ms: None,
183                };
184            }
185        }
186
187        // Matrix/image data detection
188        if obj.contains_key("data")
189            && obj.contains_key("shape")
190            && let (Some(Value::Array(_)), Some(Value::Array(shape))) =
191                (obj.get("data"), obj.get("shape"))
192        {
193            let dimensions: SmallVec<[usize; 4]> = shape
194                .iter()
195                .filter_map(|v| v.as_u64().map(|n| n as usize))
196                .collect();
197
198            if !dimensions.is_empty() {
199                return SemanticType::Matrix {
200                    dimensions,
201                    dtype: crate::semantic::NumericDType::F64, // Default
202                };
203            }
204        }
205
206        SemanticType::Generic
207    }
208
209    /// Check if array contains only numeric values
210    fn is_numeric_array(&self, arr: &[Value]) -> bool {
211        arr.len() > 2 && arr.iter().all(|v| v.is_number())
212    }
213
214    /// Check if array looks like time series data
215    fn is_time_series_array(&self, arr: &[Value]) -> bool {
216        arr.len() >= 2
217            && arr.iter().all(|v| {
218                if let Value::Object(obj) = v {
219                    obj.contains_key("timestamp") || obj.contains_key("time")
220                } else {
221                    false
222                }
223            })
224    }
225
226    /// Check if array looks like tabular data (array of similar objects)
227    fn is_tabular_data(&self, arr: &[Value]) -> bool {
228        if arr.len() < 2 {
229            return false;
230        }
231
232        // Check if all elements are objects with similar structure
233        let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
234            first.keys().collect()
235        } else {
236            return false;
237        };
238
239        arr.iter().all(|v| {
240            if let Value::Object(obj) = v {
241                let keys: std::collections::HashSet<_> = obj.keys().collect();
242                // Allow some variation in keys (80% similarity)
243                let intersection = first_keys.intersection(&keys).count();
244                let union = first_keys.union(&keys).count();
245                intersection as f64 / union as f64 > 0.8
246            } else {
247                false
248            }
249        })
250    }
251
252    /// Detect numeric data type from Value
253    fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
254        match value {
255            Value::Number(n) => {
256                if n.is_i64() {
257                    crate::semantic::NumericDType::I64
258                } else if n.is_u64() {
259                    crate::semantic::NumericDType::U64
260                } else {
261                    crate::semantic::NumericDType::F64
262                }
263            }
264            _ => crate::semantic::NumericDType::F64,
265        }
266    }
267
268    /// Extract table columns from first object
269    fn extract_table_columns(
270        &self,
271        first_obj: &Value,
272    ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
273        let mut columns = SmallVec::new();
274
275        if let Value::Object(obj) = first_obj {
276            for (key, value) in obj {
277                let column_type = self.detect_column_type(value);
278                columns.push(crate::semantic::ColumnMeta {
279                    name: key.clone(),
280                    dtype: column_type,
281                    nullable: false, // Simplified - would need more analysis
282                });
283            }
284        }
285
286        columns
287    }
288
289    /// Detect column type from JSON value
290    fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
291        match value {
292            Value::Number(n) => {
293                if n.is_i64() {
294                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
295                } else if n.is_u64() {
296                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
297                } else {
298                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
299                }
300            }
301            Value::String(_) => crate::semantic::ColumnType::String,
302            Value::Bool(_) => crate::semantic::ColumnType::Boolean,
303            Value::Array(_) => {
304                crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
305            }
306            _ => crate::semantic::ColumnType::Json,
307        }
308    }
309
310    /// Check if value looks like a numeric measurement
311    fn looks_like_numeric_value(&self, value: &Value) -> bool {
312        value.is_number()
313    }
314
315    /// Get parser statistics (placeholder for metrics)
316    pub fn stats(&self) -> ParseStats {
317        ParseStats {
318            total_parses: 0,
319            semantic_detections: 0,
320            avg_parse_time_ms: 0.0,
321        }
322    }
323}
324
/// Snapshot of parser statistics.
///
/// The default value has every counter at zero.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct ParseStats {
    /// Total number of parse operations
    pub total_parses: u64,
    /// Number of successful semantic type detections
    pub semantic_detections: u64,
    /// Average parse time in milliseconds
    pub avg_parse_time_ms: f64,
}
335
336impl Default for SimpleParser {
337    fn default() -> Self {
338        Self::new()
339    }
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json` with a default parser and returns the attached semantic
    /// metadata, failing the test if parsing fails or no metadata is present.
    fn parse_semantics(json: &[u8]) -> SemanticMeta {
        SimpleParser::new()
            .parse(json)
            .expect("input should parse")
            .semantics
            .expect("parsed frame should carry semantic metadata")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_numeric_array_detection() {
        let semantics = parse_semantics(b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray {
                length: Some(5),
                ..
            }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let semantics = parse_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let json = br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#;

        let semantics = parse_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let json = br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#;

        let semantics = parse_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table {
                row_count: Some(3),
                ..
            }
        ));
    }

    #[test]
    fn test_large_input_rejection() {
        let mut parser = SimpleParser::new();
        parser.config.max_size_mb = 1; // 1MB limit

        // NOTE(review): the payload is not valid JSON, so a parse failure alone
        // makes this test pass; it does not by itself prove that a size limit
        // rejected the input.
        let large_json = vec![b'a'; 2 * 1024 * 1024]; // 2MB
        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}