Skip to main content

pjson_rs/parser/
simple.rs

1//! Simplified serde-based parser for PJS MVP
2//!
3//! This parser uses serde_json as the foundation and focuses on PJS's
4//! unique features: semantic analysis, chunking, and streaming.
5
6use crate::config::SecurityConfig;
7use crate::security::SecurityValidator;
8use crate::semantic::{SemanticMeta, SemanticType};
9use crate::{Error, Frame, Result};
10use bytes::Bytes;
11use serde_json::{self, Map, Value};
12use smallvec::SmallVec;
13
14/// Simple parser based on serde_json
15pub struct SimpleParser {
16    config: ParseConfig,
17    validator: SecurityValidator,
18}
19
/// Options that control how the parser treats its input.
#[derive(Clone, Debug)]
pub struct ParseConfig {
    /// When `true`, the parsed JSON structure is inspected to pick a semantic
    /// type; when `false`, frames are tagged as generic.
    pub detect_semantics: bool,
    /// Upper bound, in megabytes, on the JSON accepted in a single parse.
    pub max_size_mb: usize,
    /// Whether large arrays should be streamed.
    pub stream_large_arrays: bool,
    /// Array length threshold for streaming.
    pub stream_threshold: usize,
}
32
33impl Default for ParseConfig {
34    fn default() -> Self {
35        Self {
36            detect_semantics: true,
37            max_size_mb: 100,
38            stream_large_arrays: true,
39            stream_threshold: 1000,
40        }
41    }
42}
43
44impl SimpleParser {
45    /// Create new simple parser with default config
46    pub fn new() -> Self {
47        Self {
48            config: ParseConfig::default(),
49            validator: SecurityValidator::default(),
50        }
51    }
52
53    /// Create parser with custom config
54    pub fn with_config(config: ParseConfig) -> Self {
55        Self {
56            config,
57            validator: SecurityValidator::default(),
58        }
59    }
60
61    /// Create parser with custom security config
62    pub fn with_security_config(config: ParseConfig, security_config: SecurityConfig) -> Self {
63        Self {
64            config,
65            validator: SecurityValidator::new(security_config),
66        }
67    }
68
69    /// Parse JSON bytes into PJS Frame
70    pub fn parse(&self, input: &[u8]) -> Result<Frame> {
71        // Security validation
72        self.validator.validate_input_size(input.len())?;
73
74        // Parse with serde_json
75        let value: Value = serde_json::from_slice(input)
76            .map_err(|e| Error::invalid_json(0, format!("serde_json error: {e}")))?;
77
78        // Detect semantic type
79        let semantic_type = if self.config.detect_semantics {
80            self.detect_semantic_type(&value)
81        } else {
82            SemanticType::Generic
83        };
84
85        // Create semantic metadata
86        let semantics = Some(SemanticMeta::new(semantic_type));
87
88        // Create frame
89        let mut frame = Frame::new(Bytes::copy_from_slice(input));
90        frame.semantics = semantics;
91
92        Ok(frame)
93    }
94
95    /// Parse with explicit semantic hints
96    pub fn parse_with_semantics(&self, input: &[u8], semantics: &SemanticMeta) -> Result<Frame> {
97        let mut frame = self.parse(input)?;
98        frame.semantics = Some(semantics.clone());
99        Ok(frame)
100    }
101
102    /// Detect semantic type from JSON structure
103    fn detect_semantic_type(&self, value: &Value) -> SemanticType {
104        match value {
105            Value::Array(arr) => self.detect_array_semantics(arr),
106            Value::Object(obj) => self.detect_object_semantics(obj),
107            _ => SemanticType::Generic,
108        }
109    }
110
111    /// Detect semantic type for JSON arrays
112    fn detect_array_semantics(&self, arr: &[Value]) -> SemanticType {
113        if arr.is_empty() {
114            return SemanticType::Generic;
115        }
116
117        // Check if it's a numeric array
118        if self.is_numeric_array(arr) {
119            let dtype = self.detect_numeric_dtype(&arr[0]);
120            return SemanticType::NumericArray {
121                dtype,
122                length: Some(arr.len()),
123            };
124        }
125
126        // Check if it's time series data
127        if self.is_time_series_array(arr) {
128            return SemanticType::TimeSeries {
129                timestamp_field: "timestamp".to_string(),
130                value_fields: SmallVec::from_vec(vec!["value".to_string()]),
131                interval_ms: None,
132            };
133        }
134
135        // Check if it's tabular data
136        if self.is_tabular_data(arr) {
137            let columns = self.extract_table_columns(&arr[0]);
138            return SemanticType::Table {
139                columns: Box::new(columns),
140                row_count: Some(arr.len()),
141            };
142        }
143
144        SemanticType::Generic
145    }
146
147    /// Detect semantic type for JSON objects
148    fn detect_object_semantics(&self, obj: &Map<String, Value>) -> SemanticType {
149        // GeoJSON detection
150        if obj.contains_key("type") && obj.contains_key("coordinates") {
151            return SemanticType::Geospatial {
152                coordinate_system: "WGS84".to_string(),
153                geometry_type: obj
154                    .get("type")
155                    .and_then(|v| v.as_str())
156                    .unwrap_or("Point")
157                    .to_string(),
158            };
159        }
160
161        // Time series single point
162        if obj.contains_key("timestamp") || obj.contains_key("time") {
163            let timestamp_field = if obj.contains_key("timestamp") {
164                "timestamp"
165            } else {
166                "time"
167            };
168
169            let value_fields: SmallVec<[String; 4]> = obj
170                .keys()
171                .filter_map(|k| {
172                    let v = obj.get(k.as_str())?;
173                    (*k != timestamp_field && self.looks_like_numeric_value(v)).then(|| k.clone())
174                })
175                .collect();
176
177            if !value_fields.is_empty() {
178                return SemanticType::TimeSeries {
179                    timestamp_field: timestamp_field.to_string(),
180                    value_fields,
181                    interval_ms: None,
182                };
183            }
184        }
185
186        // Matrix/image data detection
187        if obj.contains_key("data")
188            && obj.contains_key("shape")
189            && let (Some(Value::Array(_)), Some(Value::Array(shape))) =
190                (obj.get("data"), obj.get("shape"))
191        {
192            let dimensions: SmallVec<[usize; 4]> = shape
193                .iter()
194                .filter_map(|v| v.as_u64().map(|n| n as usize))
195                .collect();
196
197            if !dimensions.is_empty() {
198                return SemanticType::Matrix {
199                    dimensions,
200                    dtype: crate::semantic::NumericDType::F64, // Default
201                };
202            }
203        }
204
205        SemanticType::Generic
206    }
207
208    /// Check if array contains only numeric values
209    fn is_numeric_array(&self, arr: &[Value]) -> bool {
210        arr.len() > 2 && arr.iter().all(|v| v.is_number())
211    }
212
213    /// Check if array looks like time series data
214    fn is_time_series_array(&self, arr: &[Value]) -> bool {
215        arr.len() >= 2
216            && arr.iter().all(|v| {
217                if let Value::Object(obj) = v {
218                    obj.contains_key("timestamp") || obj.contains_key("time")
219                } else {
220                    false
221                }
222            })
223    }
224
225    /// Check if array looks like tabular data (array of similar objects)
226    fn is_tabular_data(&self, arr: &[Value]) -> bool {
227        if arr.len() < 2 {
228            return false;
229        }
230
231        // Check if all elements are objects with similar structure
232        let first_keys: std::collections::HashSet<_> = if let Value::Object(first) = &arr[0] {
233            first.keys().collect()
234        } else {
235            return false;
236        };
237
238        arr.iter().all(|v| {
239            if let Value::Object(obj) = v {
240                let keys: std::collections::HashSet<_> = obj.keys().collect();
241                // Allow some variation in keys (80% similarity)
242                let intersection = first_keys.intersection(&keys).count();
243                let union = first_keys.union(&keys).count();
244                intersection as f64 / union as f64 > 0.8
245            } else {
246                false
247            }
248        })
249    }
250
251    /// Detect numeric data type from Value
252    fn detect_numeric_dtype(&self, value: &Value) -> crate::semantic::NumericDType {
253        match value {
254            Value::Number(n) => {
255                if n.is_i64() {
256                    crate::semantic::NumericDType::I64
257                } else if n.is_u64() {
258                    crate::semantic::NumericDType::U64
259                } else {
260                    crate::semantic::NumericDType::F64
261                }
262            }
263            _ => crate::semantic::NumericDType::F64,
264        }
265    }
266
267    /// Extract table columns from first object
268    fn extract_table_columns(
269        &self,
270        first_obj: &Value,
271    ) -> SmallVec<[crate::semantic::ColumnMeta; 16]> {
272        let mut columns = SmallVec::new();
273
274        if let Value::Object(obj) = first_obj {
275            for (key, value) in obj {
276                let column_type = self.detect_column_type(value);
277                columns.push(crate::semantic::ColumnMeta {
278                    name: key.clone(),
279                    dtype: column_type,
280                    nullable: false, // Simplified - would need more analysis
281                });
282            }
283        }
284
285        columns
286    }
287
288    /// Detect column type from JSON value
289    fn detect_column_type(&self, value: &Value) -> crate::semantic::ColumnType {
290        match value {
291            Value::Number(n) => {
292                if n.is_i64() {
293                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::I64)
294                } else if n.is_u64() {
295                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::U64)
296                } else {
297                    crate::semantic::ColumnType::Numeric(crate::semantic::NumericDType::F64)
298                }
299            }
300            Value::String(_) => crate::semantic::ColumnType::String,
301            Value::Bool(_) => crate::semantic::ColumnType::Boolean,
302            Value::Array(_) => {
303                crate::semantic::ColumnType::Array(Box::new(crate::semantic::ColumnType::Json))
304            }
305            _ => crate::semantic::ColumnType::Json,
306        }
307    }
308
309    /// Check if value looks like a numeric measurement
310    fn looks_like_numeric_value(&self, value: &Value) -> bool {
311        value.is_number()
312    }
313
314    /// Get parser statistics (placeholder for metrics)
315    pub fn stats(&self) -> ParseStats {
316        ParseStats {
317            total_parses: 0,
318            semantic_detections: 0,
319            avg_parse_time_ms: 0.0,
320        }
321    }
322}
323
/// Counters describing parser activity.
#[derive(Default, Debug)]
pub struct ParseStats {
    /// How many parse operations have been performed.
    pub total_parses: u64,
    /// How many parses ended with a successful semantic type detection.
    pub semantic_detections: u64,
    /// Mean duration of a parse, in milliseconds.
    pub avg_parse_time_ms: f64,
}
334
335impl Default for SimpleParser {
336    fn default() -> Self {
337        Self::new()
338    }
339}
340
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `json` with a default parser and return its semantic metadata.
    ///
    /// Panics when parsing fails or when no metadata is attached, so a missing
    /// detection fails the test instead of silently skipping the assertion.
    fn parse_semantics(json: &[u8]) -> SemanticMeta {
        SimpleParser::new()
            .parse(json)
            .expect("input should parse")
            .semantics
            .expect("parsed frame should carry semantic metadata")
    }

    #[test]
    fn test_simple_parser_creation() {
        let parser = SimpleParser::new();
        assert!(parser.config.detect_semantics);
    }

    #[test]
    fn test_basic_json_parsing() {
        let parser = SimpleParser::new();
        let json = br#"{"hello": "world", "count": 42}"#;

        let frame = parser.parse(json).expect("valid JSON should parse");
        assert!(frame.semantics.is_some());
    }

    #[test]
    fn test_numeric_array_detection() {
        let semantics = parse_semantics(b"[1, 2, 3, 4, 5]");
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::NumericArray { .. }
        ));
    }

    #[test]
    fn test_time_series_detection() {
        let json = br#"[
            {"timestamp": "2023-01-01T00:00:00Z", "value": 1.5},
            {"timestamp": "2023-01-01T00:01:00Z", "value": 2.3}
        ]"#;

        let semantics = parse_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::TimeSeries { .. }
        ));
    }

    #[test]
    fn test_geospatial_detection() {
        let json = br#"{"type": "Point", "coordinates": [125.6, 10.1]}"#;

        let semantics = parse_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Geospatial { .. }
        ));
    }

    #[test]
    fn test_tabular_data_detection() {
        let json = br#"[
            {"name": "John", "age": 30, "city": "New York"},
            {"name": "Jane", "age": 25, "city": "Boston"},
            {"name": "Bob", "age": 35, "city": "Chicago"}
        ]"#;

        let semantics = parse_semantics(json);
        assert!(matches!(
            semantics.semantic_type,
            SemanticType::Table { .. }
        ));
    }

    #[test]
    fn test_large_input_rejection() {
        let mut parser = SimpleParser::new();
        parser.config.max_size_mb = 1; // 1MB limit

        // NOTE(review): this payload is not valid JSON, so parsing fails even
        // if no size limit is applied; the assertion below cannot tell a size
        // rejection from a syntax error. Consider an oversized *valid* document
        // once the limit is known to be enforced.
        let large_json = vec![b'a'; 2 * 1024 * 1024]; // 2MB
        let result = parser.parse(&large_json);

        assert!(result.is_err());
    }
}