Skip to main content

pjson_rs/parser/
value.rs

1//! Lazy JSON value types for zero-copy parsing
2
3use crate::parser::scanner::{Range, ScanResult};
4use crate::{Error, Result};
5use smallvec::SmallVec;
6
7/// Zero-copy JSON value representation
8#[derive(Debug, Clone)]
9pub enum JsonValue<'a> {
10    /// Raw bytes slice (not parsed yet)
11    Raw(&'a [u8]),
12    /// Parsed string (zero-copy)
13    String(&'a str),
14    /// Number stored as bytes for lazy parsing
15    Number(&'a [u8]),
16    /// Boolean value
17    Bool(bool),
18    /// Null value
19    Null,
20    /// Array with lazy evaluation
21    Array(LazyArray<'a>),
22    /// Object with lazy evaluation
23    Object(LazyObject<'a>),
24}
25
26/// Lazy array that parses elements on-demand
27#[derive(Debug, Clone)]
28pub struct LazyArray<'a> {
29    /// Raw JSON bytes
30    raw: &'a [u8],
31    /// Pre-computed element boundaries using SIMD scanning
32    boundaries: SmallVec<[Range; 32]>,
33}
34
35/// Lazy object that parses fields on-demand
36#[derive(Debug, Clone)]
37pub struct LazyObject<'a> {
38    /// Raw JSON bytes
39    raw: &'a [u8],
40    /// Pre-computed key-value boundaries
41    fields: SmallVec<[FieldRange; 16]>,
42}
43
44/// Field boundary information
45#[derive(Debug, Clone)]
46pub struct FieldRange {
47    /// Key range (without quotes)
48    key: Range,
49    /// Value range
50    value: Range,
51}
52
53impl<'a> JsonValue<'a> {
54    /// Get value as string if it's a string type
55    pub fn as_str(&self) -> Option<&str> {
56        match self {
57            JsonValue::String(s) => Some(s),
58            _ => None,
59        }
60    }
61
62    /// Get value as f64 if it's a number
63    pub fn as_f64(&self) -> Option<f64> {
64        match self {
65            JsonValue::Number(bytes) => std::str::from_utf8(bytes).ok()?.parse().ok(),
66            _ => None,
67        }
68    }
69
70    /// Get value as i64 if it's an integer number
71    pub fn as_i64(&self) -> Option<i64> {
72        match self {
73            JsonValue::Number(bytes) => std::str::from_utf8(bytes).ok()?.parse().ok(),
74            _ => None,
75        }
76    }
77
78    /// Get value as bool if it's a boolean
79    pub fn as_bool(&self) -> Option<bool> {
80        match self {
81            JsonValue::Bool(b) => Some(*b),
82            _ => None,
83        }
84    }
85
86    /// Check if value is null
87    pub fn is_null(&self) -> bool {
88        matches!(self, JsonValue::Null)
89    }
90
91    /// Get value as array
92    pub fn as_array(&self) -> Option<&LazyArray<'a>> {
93        match self {
94            JsonValue::Array(arr) => Some(arr),
95            _ => None,
96        }
97    }
98
99    /// Get value as object
100    pub fn as_object(&self) -> Option<&LazyObject<'a>> {
101        match self {
102            JsonValue::Object(obj) => Some(obj),
103            _ => None,
104        }
105    }
106
107    /// Force parse raw bytes into structured value
108    pub fn parse_raw(&mut self) -> Result<()> {
109        match self {
110            JsonValue::Raw(_bytes) => {
111                // This would use the main parser to parse the raw bytes
112                // For now, we'll leave this as a placeholder
113                *self = JsonValue::Null; // Simplified
114                Ok(())
115            }
116            _ => Ok(()),
117        }
118    }
119}
120
121impl<'a> LazyArray<'a> {
122    /// Create new lazy array from scan result
123    pub fn from_scan(raw: &'a [u8], scan_result: ScanResult) -> Self {
124        // Extract array element boundaries from scan result
125        let boundaries = Self::extract_element_boundaries(raw, &scan_result);
126
127        Self { raw, boundaries }
128    }
129
130    /// Get array length
131    pub fn len(&self) -> usize {
132        self.boundaries.len()
133    }
134
135    /// Check if array is empty
136    pub fn is_empty(&self) -> bool {
137        self.boundaries.is_empty()
138    }
139
140    /// Get element at index (simplified - returns raw bytes)
141    pub fn get(&self, index: usize) -> Option<&'a [u8]> {
142        if index >= self.boundaries.len() {
143            return None;
144        }
145
146        let range = self.boundaries[index];
147        Some(&self.raw[range.start..range.end])
148    }
149
150    /// Get element at index, parsing if necessary (simplified)
151    pub fn get_parsed(&self, index: usize) -> Option<JsonValue<'a>> {
152        self.get(index).map(JsonValue::Raw)
153    }
154
155    /// Iterator over array elements (lazy)
156    pub fn iter(&'a self) -> LazyArrayIter<'a> {
157        LazyArrayIter {
158            array: self,
159            index: 0,
160        }
161    }
162
163    /// Extract top-level element boundaries from a JSON array.
164    ///
165    /// Parses `raw` bytes assuming it is a JSON array (`[...]`) and returns
166    /// a `Range` for each top-level element, trimmed of surrounding whitespace.
167    /// Nested arrays/objects and strings (including escaped quotes) are treated
168    /// opaquely — only depth-0 commas and the closing `]` act as delimiters.
169    ///
170    /// # Invariant
171    ///
172    /// Assumes well-formed JSON. Mismatched brackets in nested content (e.g. `[{]}`) may
173    /// produce incorrect ranges without signalling an error.
174    fn extract_element_boundaries(raw: &[u8], _scan_result: &ScanResult) -> SmallVec<[Range; 32]> {
175        let mut result = SmallVec::new();
176        let len = raw.len();
177
178        // Find the opening '['.
179        let mut pos = 0;
180        while pos < len && raw[pos] != b'[' {
181            pos += 1;
182        }
183        if pos == len {
184            return result;
185        }
186        pos += 1; // skip '['
187
188        let mut depth: usize = 1;
189        let mut in_string = false;
190        let mut elem_start: Option<usize> = None;
191
192        while pos < len {
193            let b = raw[pos];
194
195            if in_string {
196                if b == b'\\' {
197                    // Skip the escaped character.
198                    pos += 1;
199                } else if b == b'"' {
200                    in_string = false;
201                }
202                pos += 1;
203                continue;
204            }
205
206            match b {
207                b'"' => {
208                    in_string = true;
209                    if elem_start.is_none() {
210                        elem_start = Some(pos);
211                    }
212                }
213                b'[' | b'{' => {
214                    depth += 1;
215                    if elem_start.is_none() {
216                        elem_start = Some(pos);
217                    }
218                }
219                b']' | b'}' => {
220                    depth -= 1;
221                    if depth == 0 {
222                        // Closing bracket of the top-level array — emit last element.
223                        if let Some(start) = elem_start {
224                            let end = trim_end(raw, start, pos);
225                            if end > start {
226                                result.push(Range::new(start, end));
227                            }
228                        }
229                        break;
230                    }
231                }
232                b',' if depth == 1 => {
233                    // Top-level separator — emit the current element.
234                    if let Some(start) = elem_start {
235                        let end = trim_end(raw, start, pos);
236                        if end > start {
237                            result.push(Range::new(start, end));
238                        }
239                    }
240                    elem_start = None;
241                }
242                b' ' | b'\t' | b'\n' | b'\r' => {
243                    // Whitespace before first non-space character of an element.
244                    pos += 1;
245                    continue;
246                }
247                _ => {
248                    if elem_start.is_none() {
249                        elem_start = Some(pos);
250                    }
251                }
252            }
253            pos += 1;
254        }
255
256        result
257    }
258
259    /// Check if this appears to be a numeric array for SIMD optimization
260    pub fn is_numeric(&self) -> bool {
261        // Heuristic: check first few elements
262        self.boundaries.len() > 4
263            && self.boundaries.iter().take(3).all(|range| {
264                let slice = &self.raw[range.start..range.end];
265                self.looks_like_number(slice)
266            })
267    }
268
269    fn looks_like_number(&self, bytes: &[u8]) -> bool {
270        if bytes.is_empty() {
271            return false;
272        }
273
274        bytes.iter().all(|&b| {
275            b.is_ascii_digit() || b == b'.' || b == b'-' || b == b'+' || b == b'e' || b == b'E'
276        })
277    }
278}
279
280impl<'a> LazyObject<'a> {
281    /// Create new lazy object from scan result
282    pub fn from_scan(raw: &'a [u8], scan_result: ScanResult) -> Self {
283        let fields = Self::extract_field_boundaries(raw, &scan_result);
284
285        Self { raw, fields }
286    }
287
288    /// Get number of fields
289    pub fn len(&self) -> usize {
290        self.fields.len()
291    }
292
293    /// Check if object is empty
294    pub fn is_empty(&self) -> bool {
295        self.fields.is_empty()
296    }
297
298    /// Get field value by key (simplified)
299    pub fn get(&self, key: &str) -> Option<&'a [u8]> {
300        // Find field by key
301        let field_range = self.fields.iter().find(|field| {
302            let key_bytes = &self.raw[field.key.start..field.key.end];
303            std::str::from_utf8(key_bytes) == Ok(key)
304        })?;
305
306        // Return value bytes
307        Some(&self.raw[field_range.value.start..field_range.value.end])
308    }
309
310    /// Get all field keys
311    pub fn keys(&self) -> Result<Vec<&str>> {
312        self.fields
313            .iter()
314            .map(|field| {
315                let key_bytes = &self.raw[field.key.start..field.key.end];
316                std::str::from_utf8(key_bytes).map_err(Error::from)
317            })
318            .collect()
319    }
320
321    /// Extract top-level field boundaries from a JSON object.
322    ///
323    /// Parses `raw` bytes assuming it is a JSON object (`{...}`) and returns a
324    /// `FieldRange` for each top-level field.  The `key` range covers the string
325    /// content **without** surrounding quotes; the `value` range covers the full
326    /// value representation (including quotes when the value is a string).
327    ///
328    /// # Invariant
329    ///
330    /// Assumes well-formed JSON. Malformed input (e.g. duplicate commas, mismatched
331    /// brackets) may produce incomplete results without signalling an error.
332    fn extract_field_boundaries(
333        raw: &[u8],
334        _scan_result: &ScanResult,
335    ) -> SmallVec<[FieldRange; 16]> {
336        let mut result = SmallVec::new();
337        let len = raw.len();
338
339        // Find the opening '{'.
340        let mut pos = 0;
341        while pos < len && raw[pos] != b'{' {
342            pos += 1;
343        }
344        if pos == len {
345            return result;
346        }
347        pos += 1; // skip '{'
348
349        loop {
350            // --- skip whitespace before key ---
351            while pos < len && raw[pos].is_ascii_whitespace() {
352                pos += 1;
353            }
354            if pos >= len || raw[pos] == b'}' {
355                break;
356            }
357            if raw[pos] != b'"' {
358                // Malformed input; stop.
359                break;
360            }
361            pos += 1; // skip opening '"'
362            let key_start = pos;
363            // Scan to closing '"', honouring backslash escapes.
364            while pos < len && raw[pos] != b'"' {
365                if raw[pos] == b'\\' {
366                    pos += 1; // skip escaped char
367                }
368                pos += 1;
369            }
370            let key_end = pos;
371            if pos < len {
372                pos += 1; // skip closing '"'
373            }
374
375            // --- skip whitespace and ':' ---
376            while pos < len && (raw[pos].is_ascii_whitespace() || raw[pos] == b':') {
377                pos += 1;
378            }
379            if pos >= len {
380                break;
381            }
382
383            // --- parse value with depth tracking ---
384            let value_start = pos;
385            let mut depth: usize = 0;
386            let mut in_str = false;
387
388            while pos < len {
389                let b = raw[pos];
390                if in_str {
391                    if b == b'\\' {
392                        pos += 1; // skip escaped char
393                    } else if b == b'"' {
394                        in_str = false;
395                        if depth == 0 {
396                            pos += 1;
397                            break;
398                        }
399                    }
400                    pos += 1;
401                    continue;
402                }
403                match b {
404                    b'"' => {
405                        in_str = true;
406                    }
407                    b'[' | b'{' => depth += 1,
408                    b']' | b'}' => {
409                        if depth == 0 {
410                            // Closing brace of the parent object — do not consume.
411                            break;
412                        }
413                        depth -= 1;
414                        if depth == 0 {
415                            pos += 1;
416                            break;
417                        }
418                    }
419                    b',' if depth == 0 => {
420                        // Separator between fields — do not consume.
421                        break;
422                    }
423                    _ => {}
424                }
425                pos += 1;
426            }
427
428            let value_end = trim_end(raw, value_start, pos);
429            if value_end > value_start {
430                result.push(FieldRange::new(
431                    Range::new(key_start, key_end),
432                    Range::new(value_start, value_end),
433                ));
434            }
435
436            // Skip ',' between fields (or '}' will exit on the next iteration).
437            while pos < len && (raw[pos].is_ascii_whitespace() || raw[pos] == b',') {
438                pos += 1;
439            }
440        }
441
442        result
443    }
444}
445
446/// Iterator for lazy array elements
447pub struct LazyArrayIter<'a> {
448    array: &'a LazyArray<'a>,
449    index: usize,
450}
451
452impl<'a> Iterator for LazyArrayIter<'a> {
453    type Item = &'a [u8]; // Raw element bytes
454
455    fn next(&mut self) -> Option<Self::Item> {
456        if self.index >= self.array.boundaries.len() {
457            return None;
458        }
459
460        let range = self.array.boundaries[self.index];
461        self.index += 1;
462
463        Some(&self.array.raw[range.start..range.end])
464    }
465}
466
467impl FieldRange {
468    /// Create new field range
469    pub fn new(key: Range, value: Range) -> Self {
470        Self { key, value }
471    }
472}
473
/// Return the index one past the last non-whitespace byte in `raw[start..end]`.
///
/// Used to strip trailing whitespace from element and value ranges. If the
/// span is empty or entirely ASCII whitespace the result does not exceed
/// `start`, which callers treat as "nothing to record".
///
/// `end` is clamped to `raw.len()`, so a cursor that overshot the buffer (for
/// example after skipping an escaped byte at the very end of truncated input)
/// cannot cause an out-of-bounds panic.
fn trim_end(raw: &[u8], start: usize, end: usize) -> usize {
    let end = end.min(raw.len());
    if start >= end {
        return end;
    }

    raw[start..end]
        .iter()
        .rposition(|byte| !byte.is_ascii_whitespace())
        // `offset` is relative to `start`; convert to an exclusive end index.
        .map_or(start, |offset| start + offset + 1)
}
484
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a lazy array over `bytes` with a fresh scan result.
    fn array_of(bytes: &[u8]) -> LazyArray<'_> {
        LazyArray::from_scan(bytes, ScanResult::new())
    }

    /// Builds a lazy object over `bytes` with a fresh scan result.
    fn object_of(bytes: &[u8]) -> LazyObject<'_> {
        LazyObject::from_scan(bytes, ScanResult::new())
    }

    #[test]
    fn test_json_value_types() {
        let value = JsonValue::String("hello");
        assert_eq!(value.as_str(), Some("hello"));
        assert!(value.as_f64().is_none());
    }

    #[test]
    fn test_lazy_array_creation() {
        let array = array_of(b"[1, 2, 3]");

        assert_eq!(array.len(), 3);
        let expected: [&[u8]; 3] = [b"1", b"2", b"3"];
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(array.get(index), Some(*want));
        }
    }

    #[test]
    fn test_lazy_array_empty() {
        let array = array_of(b"[]");
        assert_eq!(array.len(), 0);
        assert!(array.is_empty());
    }

    #[test]
    fn test_lazy_array_strings() {
        let array = array_of(br#"["hello", "world"]"#);
        assert_eq!(array.len(), 2);
        assert_eq!(array.get(0), Some(&br#""hello""#[..]));
    }

    #[test]
    fn test_lazy_array_nested() {
        let array = array_of(br#"[1, [2, 3], {"a": 4}]"#);

        assert_eq!(array.len(), 3);
        let expected: [&[u8]; 3] = [b"1", b"[2, 3]", br#"{"a": 4}"#];
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(array.get(index), Some(*want));
        }
    }

    #[test]
    fn test_lazy_array_escaped_string() {
        let array = array_of(br#"["say \"hi\"", "bye"]"#);
        assert_eq!(array.len(), 2);
    }

    #[test]
    fn test_lazy_object_creation() {
        let object = object_of(br#"{"a": 1, "b": 2}"#);
        assert_eq!(object.len(), 2);
        assert_eq!(object.get("a"), Some(&b"1"[..]));
        assert_eq!(object.get("b"), Some(&b"2"[..]));
    }

    #[test]
    fn test_lazy_object_empty() {
        let object = object_of(b"{}");
        assert_eq!(object.len(), 0);
        assert!(object.is_empty());
    }

    #[test]
    fn test_lazy_object_string_value() {
        let object = object_of(br#"{"name": "alice"}"#);
        assert_eq!(object.len(), 1);
        assert_eq!(object.get("name"), Some(&br#""alice""#[..]));
    }

    #[test]
    fn test_lazy_object_nested_value() {
        let object = object_of(br#"{"arr": [1, 2], "n": 42}"#);
        assert_eq!(object.len(), 2);
        assert_eq!(object.get("arr"), Some(&b"[1, 2]"[..]));
        assert_eq!(object.get("n"), Some(&b"42"[..]));
    }

    #[test]
    fn test_number_detection() {
        let array = array_of(b"[1.0, 2.5, 3.14]");

        assert!(array.looks_like_number(b"123.45"));
        assert!(!array.looks_like_number(br#""string""#));
    }
}