Skip to main content

pjson_rs/parser/
value.rs

1//! Lazy JSON value types for zero-copy parsing
2
3use crate::parser::scanner::{Range, ScanResult};
4use crate::{Error, Result};
5use smallvec::SmallVec;
6
/// Zero-copy JSON value representation.
///
/// Every borrowing variant holds a slice of the original input buffer
/// (lifetime `'a`) instead of an owned copy of the JSON text.
#[derive(Debug, Clone)]
pub enum JsonValue<'a> {
    /// Raw bytes slice (not parsed yet); `parse_raw` classifies it in place.
    Raw(&'a [u8]),
    /// Parsed string (zero-copy). When produced by `parse_raw` this is the
    /// text between the quotes and never contains escape sequences.
    String(&'a str),
    /// Number stored as bytes for lazy parsing (see `as_f64` / `as_i64`).
    Number(&'a [u8]),
    /// Boolean value
    Bool(bool),
    /// Null value
    Null,
    /// Array with lazy evaluation
    Array(LazyArray<'a>),
    /// Object with lazy evaluation
    Object(LazyObject<'a>),
}
25
/// Lazy array that parses elements on-demand.
///
/// Only the byte range of each top-level element is recorded when the array
/// is built; element contents stay unparsed until a caller asks for them.
#[derive(Debug, Clone)]
pub struct LazyArray<'a> {
    /// Raw JSON bytes of the whole array, including the surrounding brackets.
    raw: &'a [u8],
    /// Pre-computed element boundaries (indices into `raw`), one per
    /// top-level element. NOTE(review): the builder in this file uses a
    /// scalar byte scan and ignores the `ScanResult` it is given, so no SIMD
    /// data is involved yet.
    boundaries: SmallVec<[Range; 32]>,
}
34
/// Lazy object that parses fields on-demand.
///
/// Only the byte ranges of each top-level key and value are recorded when the
/// object is built; values stay unparsed until a caller asks for them.
#[derive(Debug, Clone)]
pub struct LazyObject<'a> {
    /// Raw JSON bytes of the whole object, including the surrounding braces.
    raw: &'a [u8],
    /// Pre-computed key-value boundaries (indices into `raw`), in the order
    /// the fields appear in the input.
    fields: SmallVec<[FieldRange; 16]>,
}
43
/// Field boundary information for one key/value pair of a [`LazyObject`].
///
/// Both ranges index into the object's raw byte buffer.
#[derive(Debug, Clone)]
pub struct FieldRange {
    /// Key range (without quotes); escape sequences are left undecoded.
    key: Range,
    /// Value range (a string value keeps its surrounding quotes).
    value: Range,
}
52
53impl<'a> JsonValue<'a> {
54    /// Get value as string if it's a string type
55    pub fn as_str(&self) -> Option<&str> {
56        match self {
57            JsonValue::String(s) => Some(s),
58            _ => None,
59        }
60    }
61
62    /// Get value as f64 if it's a number
63    pub fn as_f64(&self) -> Option<f64> {
64        match self {
65            JsonValue::Number(bytes) => std::str::from_utf8(bytes).ok()?.parse().ok(),
66            _ => None,
67        }
68    }
69
70    /// Get value as i64 if it's an integer number
71    pub fn as_i64(&self) -> Option<i64> {
72        match self {
73            JsonValue::Number(bytes) => std::str::from_utf8(bytes).ok()?.parse().ok(),
74            _ => None,
75        }
76    }
77
78    /// Get value as bool if it's a boolean
79    pub fn as_bool(&self) -> Option<bool> {
80        match self {
81            JsonValue::Bool(b) => Some(*b),
82            _ => None,
83        }
84    }
85
86    /// Check if value is null
87    pub fn is_null(&self) -> bool {
88        matches!(self, JsonValue::Null)
89    }
90
91    /// Get value as array
92    pub fn as_array(&self) -> Option<&LazyArray<'a>> {
93        match self {
94            JsonValue::Array(arr) => Some(arr),
95            _ => None,
96        }
97    }
98
99    /// Get value as object
100    pub fn as_object(&self) -> Option<&LazyObject<'a>> {
101        match self {
102            JsonValue::Object(obj) => Some(obj),
103            _ => None,
104        }
105    }
106
107    /// Force parse raw bytes into the appropriate structured variant.
108    ///
109    /// Classifies the underlying bytes by their first non-whitespace character
110    /// and replaces `JsonValue::Raw` with the matching typed variant
111    /// (`Null`, `Bool`, `Number`, `String`, `Array`, or `Object`). Variants other
112    /// than `Raw` are left unchanged.
113    ///
114    /// Numbers, strings, arrays, and objects keep zero-copy semantics by borrowing
115    /// from the original byte slice. Strings containing escape sequences cannot be
116    /// represented in `JsonValue::String<&str>` without allocation and are rejected.
117    ///
118    /// # Errors
119    ///
120    /// Returns [`Error::InvalidJson`] when the bytes are empty, contain an
121    /// unterminated string, contain an escaped string (zero-copy not possible),
122    /// or do not begin with a recognised JSON token.
123    /// Returns [`Error::Utf8`] when string contents are not valid UTF-8.
124    ///
125    /// # Examples
126    ///
127    /// ```
128    /// use pjson_rs::parser::JsonValue;
129    ///
130    /// let mut v = JsonValue::Raw(b"42");
131    /// v.parse_raw().unwrap();
132    /// assert_eq!(v.as_i64(), Some(42));
133    ///
134    /// let mut v = JsonValue::Raw(b"\"hello\"");
135    /// v.parse_raw().unwrap();
136    /// assert_eq!(v.as_str(), Some("hello"));
137    /// ```
138    pub fn parse_raw(&mut self) -> Result<()> {
139        let bytes = if let JsonValue::Raw(bytes) = self {
140            *bytes
141        } else {
142            return Ok(());
143        };
144
145        let Some(start) = bytes.iter().position(|b| !b.is_ascii_whitespace()) else {
146            return Err(Error::invalid_json(0, "empty input"));
147        };
148        let end = bytes
149            .iter()
150            .rposition(|b| !b.is_ascii_whitespace())
151            .map(|i| i + 1)
152            .unwrap_or(bytes.len());
153        let trimmed = &bytes[start..end];
154
155        *self = match trimmed[0] {
156            b'n' if trimmed == b"null" => JsonValue::Null,
157            b't' if trimmed == b"true" => JsonValue::Bool(true),
158            b'f' if trimmed == b"false" => JsonValue::Bool(false),
159            b'"' => {
160                if trimmed.len() < 2 || trimmed[trimmed.len() - 1] != b'"' {
161                    return Err(Error::invalid_json(start, "unterminated string"));
162                }
163                let inner = &trimmed[1..trimmed.len() - 1];
164                if inner.contains(&b'\\') {
165                    return Err(Error::invalid_json(
166                        start,
167                        "escaped strings cannot be represented zero-copy",
168                    ));
169                }
170                JsonValue::String(std::str::from_utf8(inner)?)
171            }
172            b'[' => JsonValue::Array(LazyArray::from_scan(trimmed, ScanResult::new())),
173            b'{' => JsonValue::Object(LazyObject::from_scan(trimmed, ScanResult::new())),
174            b'-' | b'0'..=b'9' => JsonValue::Number(trimmed),
175            _ => return Err(Error::invalid_json(start, "unrecognised JSON value")),
176        };
177
178        Ok(())
179    }
180}
181
182impl<'a> LazyArray<'a> {
183    /// Create new lazy array from scan result
184    pub fn from_scan(raw: &'a [u8], scan_result: ScanResult) -> Self {
185        // Extract array element boundaries from scan result
186        let boundaries = Self::extract_element_boundaries(raw, &scan_result);
187
188        Self { raw, boundaries }
189    }
190
191    /// Get array length
192    pub fn len(&self) -> usize {
193        self.boundaries.len()
194    }
195
196    /// Check if array is empty
197    pub fn is_empty(&self) -> bool {
198        self.boundaries.is_empty()
199    }
200
201    /// Get element at index (simplified - returns raw bytes)
202    pub fn get(&self, index: usize) -> Option<&'a [u8]> {
203        if index >= self.boundaries.len() {
204            return None;
205        }
206
207        let range = self.boundaries[index];
208        Some(&self.raw[range.start..range.end])
209    }
210
211    /// Get element at index, parsing if necessary (simplified)
212    pub fn get_parsed(&self, index: usize) -> Option<JsonValue<'a>> {
213        self.get(index).map(JsonValue::Raw)
214    }
215
216    /// Iterator over array elements (lazy)
217    pub fn iter(&'a self) -> LazyArrayIter<'a> {
218        LazyArrayIter {
219            array: self,
220            index: 0,
221        }
222    }
223
224    /// Extract top-level element boundaries from a JSON array.
225    ///
226    /// Parses `raw` bytes assuming it is a JSON array (`[...]`) and returns
227    /// a `Range` for each top-level element, trimmed of surrounding whitespace.
228    /// Nested arrays/objects and strings (including escaped quotes) are treated
229    /// opaquely — only depth-0 commas and the closing `]` act as delimiters.
230    ///
231    /// # Invariant
232    ///
233    /// Assumes well-formed JSON. Mismatched brackets in nested content (e.g. `[{]}`) may
234    /// produce incorrect ranges without signalling an error.
235    fn extract_element_boundaries(raw: &[u8], _scan_result: &ScanResult) -> SmallVec<[Range; 32]> {
236        let mut result = SmallVec::new();
237        let len = raw.len();
238
239        // Find the opening '['.
240        let mut pos = 0;
241        while pos < len && raw[pos] != b'[' {
242            pos += 1;
243        }
244        if pos == len {
245            return result;
246        }
247        pos += 1; // skip '['
248
249        let mut depth: usize = 1;
250        let mut in_string = false;
251        let mut elem_start: Option<usize> = None;
252
253        while pos < len {
254            let b = raw[pos];
255
256            if in_string {
257                if b == b'\\' {
258                    // Skip the escaped character.
259                    pos += 1;
260                } else if b == b'"' {
261                    in_string = false;
262                }
263                pos += 1;
264                continue;
265            }
266
267            match b {
268                b'"' => {
269                    in_string = true;
270                    if elem_start.is_none() {
271                        elem_start = Some(pos);
272                    }
273                }
274                b'[' | b'{' => {
275                    depth += 1;
276                    if elem_start.is_none() {
277                        elem_start = Some(pos);
278                    }
279                }
280                b']' | b'}' => {
281                    depth -= 1;
282                    if depth == 0 {
283                        // Closing bracket of the top-level array — emit last element.
284                        if let Some(start) = elem_start {
285                            let end = trim_end(raw, start, pos);
286                            if end > start {
287                                result.push(Range::new(start, end));
288                            }
289                        }
290                        break;
291                    }
292                }
293                b',' if depth == 1 => {
294                    // Top-level separator — emit the current element.
295                    if let Some(start) = elem_start {
296                        let end = trim_end(raw, start, pos);
297                        if end > start {
298                            result.push(Range::new(start, end));
299                        }
300                    }
301                    elem_start = None;
302                }
303                b' ' | b'\t' | b'\n' | b'\r' => {
304                    // Whitespace before first non-space character of an element.
305                    pos += 1;
306                    continue;
307                }
308                _ => {
309                    if elem_start.is_none() {
310                        elem_start = Some(pos);
311                    }
312                }
313            }
314            pos += 1;
315        }
316
317        result
318    }
319
320    /// Check if this appears to be a numeric array for SIMD optimization
321    pub fn is_numeric(&self) -> bool {
322        // Heuristic: check first few elements
323        self.boundaries.len() > 4
324            && self.boundaries.iter().take(3).all(|range| {
325                let slice = &self.raw[range.start..range.end];
326                self.looks_like_number(slice)
327            })
328    }
329
330    fn looks_like_number(&self, bytes: &[u8]) -> bool {
331        if bytes.is_empty() {
332            return false;
333        }
334
335        bytes.iter().all(|&b| {
336            b.is_ascii_digit() || b == b'.' || b == b'-' || b == b'+' || b == b'e' || b == b'E'
337        })
338    }
339}
340
341impl<'a> LazyObject<'a> {
342    /// Create new lazy object from scan result
343    pub fn from_scan(raw: &'a [u8], scan_result: ScanResult) -> Self {
344        let fields = Self::extract_field_boundaries(raw, &scan_result);
345
346        Self { raw, fields }
347    }
348
349    /// Get number of fields
350    pub fn len(&self) -> usize {
351        self.fields.len()
352    }
353
354    /// Check if object is empty
355    pub fn is_empty(&self) -> bool {
356        self.fields.is_empty()
357    }
358
359    /// Get field value by key (simplified)
360    pub fn get(&self, key: &str) -> Option<&'a [u8]> {
361        // Find field by key
362        let field_range = self.fields.iter().find(|field| {
363            let key_bytes = &self.raw[field.key.start..field.key.end];
364            std::str::from_utf8(key_bytes) == Ok(key)
365        })?;
366
367        // Return value bytes
368        Some(&self.raw[field_range.value.start..field_range.value.end])
369    }
370
371    /// Get all field keys
372    pub fn keys(&self) -> Result<Vec<&str>> {
373        self.fields
374            .iter()
375            .map(|field| {
376                let key_bytes = &self.raw[field.key.start..field.key.end];
377                std::str::from_utf8(key_bytes).map_err(Error::from)
378            })
379            .collect()
380    }
381
382    /// Extract top-level field boundaries from a JSON object.
383    ///
384    /// Parses `raw` bytes assuming it is a JSON object (`{...}`) and returns a
385    /// `FieldRange` for each top-level field.  The `key` range covers the string
386    /// content **without** surrounding quotes; the `value` range covers the full
387    /// value representation (including quotes when the value is a string).
388    ///
389    /// # Invariant
390    ///
391    /// Assumes well-formed JSON. Malformed input (e.g. duplicate commas, mismatched
392    /// brackets) may produce incomplete results without signalling an error.
393    fn extract_field_boundaries(
394        raw: &[u8],
395        _scan_result: &ScanResult,
396    ) -> SmallVec<[FieldRange; 16]> {
397        let mut result = SmallVec::new();
398        let len = raw.len();
399
400        // Find the opening '{'.
401        let mut pos = 0;
402        while pos < len && raw[pos] != b'{' {
403            pos += 1;
404        }
405        if pos == len {
406            return result;
407        }
408        pos += 1; // skip '{'
409
410        loop {
411            // --- skip whitespace before key ---
412            while pos < len && raw[pos].is_ascii_whitespace() {
413                pos += 1;
414            }
415            if pos >= len || raw[pos] == b'}' {
416                break;
417            }
418            if raw[pos] != b'"' {
419                // Malformed input; stop.
420                break;
421            }
422            pos += 1; // skip opening '"'
423            let key_start = pos;
424            // Scan to closing '"', honouring backslash escapes.
425            while pos < len && raw[pos] != b'"' {
426                if raw[pos] == b'\\' {
427                    pos += 1; // skip escaped char
428                }
429                pos += 1;
430            }
431            let key_end = pos;
432            if pos < len {
433                pos += 1; // skip closing '"'
434            }
435
436            // --- skip whitespace and ':' ---
437            while pos < len && (raw[pos].is_ascii_whitespace() || raw[pos] == b':') {
438                pos += 1;
439            }
440            if pos >= len {
441                break;
442            }
443
444            // --- parse value with depth tracking ---
445            let value_start = pos;
446            let mut depth: usize = 0;
447            let mut in_str = false;
448
449            while pos < len {
450                let b = raw[pos];
451                if in_str {
452                    if b == b'\\' {
453                        pos += 1; // skip escaped char
454                    } else if b == b'"' {
455                        in_str = false;
456                        if depth == 0 {
457                            pos += 1;
458                            break;
459                        }
460                    }
461                    pos += 1;
462                    continue;
463                }
464                match b {
465                    b'"' => {
466                        in_str = true;
467                    }
468                    b'[' | b'{' => depth += 1,
469                    b']' | b'}' => {
470                        if depth == 0 {
471                            // Closing brace of the parent object — do not consume.
472                            break;
473                        }
474                        depth -= 1;
475                        if depth == 0 {
476                            pos += 1;
477                            break;
478                        }
479                    }
480                    b',' if depth == 0 => {
481                        // Separator between fields — do not consume.
482                        break;
483                    }
484                    _ => {}
485                }
486                pos += 1;
487            }
488
489            let value_end = trim_end(raw, value_start, pos);
490            if value_end > value_start {
491                result.push(FieldRange::new(
492                    Range::new(key_start, key_end),
493                    Range::new(value_start, value_end),
494                ));
495            }
496
497            // Skip ',' between fields (or '}' will exit on the next iteration).
498            while pos < len && (raw[pos].is_ascii_whitespace() || raw[pos] == b',') {
499                pos += 1;
500            }
501        }
502
503        result
504    }
505}
506
/// Iterator for lazy array elements.
///
/// Created by `LazyArray::iter`; yields the raw bytes of each element in
/// order without parsing them.
pub struct LazyArrayIter<'a> {
    /// Array being iterated.
    array: &'a LazyArray<'a>,
    /// Index of the next element to yield.
    index: usize,
}
512
513impl<'a> Iterator for LazyArrayIter<'a> {
514    type Item = &'a [u8]; // Raw element bytes
515
516    fn next(&mut self) -> Option<Self::Item> {
517        if self.index >= self.array.boundaries.len() {
518            return None;
519        }
520
521        let range = self.array.boundaries[self.index];
522        self.index += 1;
523
524        Some(&self.array.raw[range.start..range.end])
525    }
526}
527
528impl FieldRange {
529    /// Create new field range
530    pub fn new(key: Range, value: Range) -> Self {
531        Self { key, value }
532    }
533}
534
/// Return the index past the last non-whitespace byte in `raw[start..end]`.
///
/// Used to strip trailing whitespace from element and value ranges.
///
/// `end` is clamped to `raw.len()`, so an out-of-range `end` never panics.
/// If the (clamped) range is empty or all whitespace, the result is no
/// greater than `start`, which callers treat as "nothing to emit".
fn trim_end(raw: &[u8], start: usize, end: usize) -> usize {
    let end = end.min(raw.len());
    if start >= end {
        return end;
    }
    raw[start..end]
        .iter()
        .rposition(|b| !b.is_ascii_whitespace())
        .map_or(start, |last| start + last + 1)
}
545
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_json_value_types() {
        let val = JsonValue::String("hello");
        assert_eq!(val.as_str(), Some("hello"));
        assert!(val.as_f64().is_none());
        assert!(val.as_bool().is_none());
        assert!(!val.is_null());
        assert!(JsonValue::Null.is_null());
        assert_eq!(JsonValue::Bool(true).as_bool(), Some(true));
    }

    #[test]
    fn test_number_accessors() {
        let int = JsonValue::Number(b"42");
        assert_eq!(int.as_i64(), Some(42));
        assert_eq!(int.as_f64(), Some(42.0));

        // A fractional number is not an integer.
        let frac = JsonValue::Number(b"2.5");
        assert_eq!(frac.as_i64(), None);
        assert_eq!(frac.as_f64(), Some(2.5));
    }

    #[test]
    fn test_parse_raw_scalars() {
        let mut v = JsonValue::Raw(b"42");
        assert!(v.parse_raw().is_ok());
        assert_eq!(v.as_i64(), Some(42));

        let mut v = JsonValue::Raw(b"\"hello\"");
        assert!(v.parse_raw().is_ok());
        assert_eq!(v.as_str(), Some("hello"));

        // Surrounding whitespace is ignored.
        let mut v = JsonValue::Raw(b"  true \n");
        assert!(v.parse_raw().is_ok());
        assert_eq!(v.as_bool(), Some(true));

        let mut v = JsonValue::Raw(b" null ");
        assert!(v.parse_raw().is_ok());
        assert!(v.is_null());
    }

    #[test]
    fn test_parse_raw_containers() {
        let mut v = JsonValue::Raw(b"[1, 2]");
        assert!(v.parse_raw().is_ok());
        assert_eq!(v.as_array().map(|a| a.len()), Some(2));

        let mut v = JsonValue::Raw(b"{\"a\": 1}");
        assert!(v.parse_raw().is_ok());
        assert_eq!(v.as_object().map(|o| o.len()), Some(1));
    }

    #[test]
    fn test_parse_raw_errors() {
        assert!(JsonValue::Raw(b"").parse_raw().is_err());
        assert!(JsonValue::Raw(b"   ").parse_raw().is_err());
        assert!(JsonValue::Raw(b"\"open").parse_raw().is_err());
        assert!(JsonValue::Raw(br#""a\nb""#).parse_raw().is_err());
        assert!(JsonValue::Raw(b"nul").parse_raw().is_err());
        assert!(JsonValue::Raw(b"?").parse_raw().is_err());
    }

    #[test]
    fn test_parse_raw_leaves_typed_values_alone() {
        let mut v = JsonValue::Bool(false);
        assert!(v.parse_raw().is_ok());
        assert_eq!(v.as_bool(), Some(false));
    }

    #[test]
    fn test_lazy_array_creation() {
        let raw = b"[1, 2, 3]";
        let scan_result = ScanResult::new();
        let array = LazyArray::from_scan(raw, scan_result);

        assert_eq!(array.len(), 3);
        assert_eq!(array.get(0), Some(b"1".as_ref()));
        assert_eq!(array.get(1), Some(b"2".as_ref()));
        assert_eq!(array.get(2), Some(b"3".as_ref()));
        // Out-of-range access is reported, not a panic.
        assert_eq!(array.get(3), None);
    }

    #[test]
    fn test_lazy_array_empty() {
        let array = LazyArray::from_scan(b"[]", ScanResult::new());
        assert_eq!(array.len(), 0);
        assert!(array.is_empty());
        assert_eq!(array.get(0), None);
    }

    #[test]
    fn test_lazy_array_strings() {
        let raw = b"[\"hello\", \"world\"]";
        let array = LazyArray::from_scan(raw, ScanResult::new());
        assert_eq!(array.len(), 2);
        assert_eq!(array.get(0), Some(b"\"hello\"".as_ref()));
        assert_eq!(array.get(1), Some(b"\"world\"".as_ref()));
    }

    #[test]
    fn test_lazy_array_nested() {
        let raw = b"[1, [2, 3], {\"a\": 4}]";
        let array = LazyArray::from_scan(raw, ScanResult::new());
        assert_eq!(array.len(), 3);
        assert_eq!(array.get(0), Some(b"1".as_ref()));
        assert_eq!(array.get(1), Some(b"[2, 3]".as_ref()));
        assert_eq!(array.get(2), Some(b"{\"a\": 4}".as_ref()));
    }

    #[test]
    fn test_lazy_array_escaped_string() {
        let raw = br#"["say \"hi\"", "bye"]"#;
        let array = LazyArray::from_scan(raw, ScanResult::new());
        assert_eq!(array.len(), 2);
        // The escaped quotes must not split the first element.
        assert_eq!(array.get(0), Some(&br#""say \"hi\"""#[..]));
        assert_eq!(array.get(1), Some(&b"\"bye\""[..]));
    }

    #[test]
    fn test_lazy_array_iter() {
        let array = LazyArray::from_scan(b"[1, \"a\", [2]]", ScanResult::new());
        let items: Vec<&[u8]> = array.iter().collect();
        assert_eq!(items, vec![&b"1"[..], &b"\"a\""[..], &b"[2]"[..]]);
    }

    #[test]
    fn test_lazy_array_get_parsed() {
        let array = LazyArray::from_scan(b"[7]", ScanResult::new());
        let mut value = array.get_parsed(0).expect("element 0 exists");
        assert!(value.parse_raw().is_ok());
        assert_eq!(value.as_i64(), Some(7));
        assert!(array.get_parsed(1).is_none());
    }

    #[test]
    fn test_lazy_object_creation() {
        let obj = LazyObject::from_scan(b"{\"a\": 1, \"b\": 2}", ScanResult::new());
        assert_eq!(obj.len(), 2);
        assert_eq!(obj.get("a"), Some(b"1".as_ref()));
        assert_eq!(obj.get("b"), Some(b"2".as_ref()));
        assert_eq!(obj.get("missing"), None);
    }

    #[test]
    fn test_lazy_object_empty() {
        let obj = LazyObject::from_scan(b"{}", ScanResult::new());
        assert_eq!(obj.len(), 0);
        assert!(obj.is_empty());
    }

    #[test]
    fn test_lazy_object_string_value() {
        let raw = b"{\"name\": \"alice\"}";
        let obj = LazyObject::from_scan(raw, ScanResult::new());
        assert_eq!(obj.len(), 1);
        assert_eq!(obj.get("name"), Some(b"\"alice\"".as_ref()));
    }

    #[test]
    fn test_lazy_object_nested_value() {
        let raw = b"{\"arr\": [1, 2], \"n\": 42}";
        let obj = LazyObject::from_scan(raw, ScanResult::new());
        assert_eq!(obj.len(), 2);
        assert_eq!(obj.get("arr"), Some(b"[1, 2]".as_ref()));
        assert_eq!(obj.get("n"), Some(b"42".as_ref()));
    }

    #[test]
    fn test_lazy_object_keys() {
        let obj = LazyObject::from_scan(b"{\"x\": 1, \"y\": [2]}", ScanResult::new());
        let keys = obj.keys().ok();
        assert_eq!(keys, Some(vec!["x", "y"]));
    }

    #[test]
    fn test_number_detection() {
        let raw = b"[1.0, 2.5, 3.14]";
        let scan_result = ScanResult::new();
        let array = LazyArray::from_scan(raw, scan_result);

        assert!(array.looks_like_number(b"123.45"));
        assert!(array.looks_like_number(b"-1e5"));
        assert!(!array.looks_like_number(b"\"string\""));
        assert!(!array.looks_like_number(b""));
        // The heuristic needs more than four elements.
        assert!(!array.is_numeric());
    }

    #[test]
    fn test_is_numeric_heuristic() {
        let numbers = LazyArray::from_scan(b"[1, 2, 3, 4, 5]", ScanResult::new());
        assert!(numbers.is_numeric());

        let strings =
            LazyArray::from_scan(b"[\"a\", \"b\", \"c\", \"d\", \"e\"]", ScanResult::new());
        assert!(!strings.is_numeric());
    }
}