Skip to main content

pjson_rs/parser/
scanner.rs

1//! JSON scanning interface and common types
2
3use crate::{Result, semantic::NumericDType};
4use smallvec::SmallVec;
5
/// Main scanning interface implemented by SIMD and scalar scanners.
///
/// Every method borrows the scanner immutably and receives the raw JSON bytes
/// as `input`. Fallible methods report failure through the crate-wide
/// [`Result`] alias; the concrete error conditions are defined by each
/// implementor and are not visible from this trait.
pub trait JsonScanner {
    /// Scan JSON input and return structural information.
    ///
    /// On success the returned [`ScanResult`] carries the structural,
    /// string, number and literal information the scanner collected.
    fn scan(&self, input: &[u8]) -> Result<ScanResult>;

    /// Check if this scanner supports SIMD operations.
    fn supports_simd(&self) -> bool;

    /// Parse numeric array with SIMD optimization if available.
    ///
    /// * `input` - raw bytes to parse.
    /// * `dtype` - requested numeric element type.
    /// * `length` - optional length hint; presumably the expected element
    ///   count (TODO confirm against implementors).
    ///
    /// NOTE(review): because the method takes `&self`, the elided `'_` in the
    /// return type is tied to the borrow of `self`, not to `input`. Confirm
    /// that this is the intended lifetime for the returned value.
    fn parse_numeric_array(
        &self,
        input: &[u8],
        dtype: NumericDType,
        length: Option<usize>,
    ) -> Result<crate::parser::JsonValue<'_>>;

    /// Find all string boundaries in the input.
    ///
    /// Returns one [`StringLocation`] per string found.
    fn find_strings(&self, input: &[u8]) -> Result<Vec<StringLocation>>;

    /// Find structural characters (`{}[],:`) positions.
    fn find_structural_chars(&self, input: &[u8]) -> Result<Vec<usize>>;
}
28
/// Result of scanning JSON input.
///
/// Groups what a scanner recorded about the input: structural characters,
/// the byte ranges of strings, numbers and literals, and (optionally) the
/// type of the root value.
#[derive(Debug, Clone)]
pub struct ScanResult {
    /// Positions of structural characters.
    ///
    /// NOTE(review): the heuristics in this file (`is_numeric_array`,
    /// `is_table_like`, `count_object_starts`) compare these entries against
    /// byte values such as `b'['` and `b'{'`, which does not match the
    /// "positions" description here. Confirm what scanners actually store.
    pub structural_chars: Vec<usize>,
    /// String boundary positions (up to 16 ranges are kept inline).
    pub string_bounds: SmallVec<[Range; 16]>,
    /// Number boundary positions (up to 16 ranges are kept inline).
    pub number_bounds: SmallVec<[Range; 16]>,
    /// Literal boundary positions (true/false/null); up to 8 ranges are kept
    /// inline.
    pub literal_bounds: SmallVec<[Range; 8]>,
    /// Detected root value type.
    ///
    /// When this is `None`, `determine_root_type` falls back to a heuristic
    /// based on which of the bounds collections are non-empty.
    pub root_type: Option<crate::parser::ValueType>,
}
43
/// Half-open byte range `[start, end)` representing start and end positions.
///
/// The type is a small plain value: it is `Copy`, comparable with `==`, and
/// hashable, so ranges can be asserted on in tests and used as set or map
/// keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Range {
    /// Inclusive byte offset of the first byte in the range.
    pub start: usize,
    /// Exclusive byte offset one past the last byte in the range.
    pub end: usize,
}
52
/// String location with metadata.
///
/// Locations compare equal only when every field matches, and they can be
/// hashed, so they are usable in assertions and as set or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct StringLocation {
    /// Start position of string (after opening quote)
    pub start: usize,
    /// End position of string (before closing quote)
    pub end: usize,
    /// Whether string contains escape sequences
    pub has_escapes: bool,
    /// Estimated length after unescaping; `None` when no estimate has been
    /// recorded.
    pub unescaped_len: Option<usize>,
}
65
66impl ScanResult {
67    /// Create new empty scan result
68    pub fn new() -> Self {
69        Self {
70            structural_chars: Vec::new(),
71            string_bounds: SmallVec::new(),
72            number_bounds: SmallVec::new(),
73            literal_bounds: SmallVec::new(),
74            root_type: None,
75        }
76    }
77
78    /// Determine the root JSON value type
79    pub fn determine_root_type(&self) -> crate::parser::ValueType {
80        if let Some(root_type) = self.root_type {
81            return root_type;
82        }
83
84        // Simplified type detection
85        if !self.string_bounds.is_empty() {
86            crate::parser::ValueType::String
87        } else if !self.number_bounds.is_empty() {
88            crate::parser::ValueType::Number
89        } else if !self.literal_bounds.is_empty() {
90            crate::parser::ValueType::Boolean // or Null
91        } else {
92            crate::parser::ValueType::Object // Default
93        }
94    }
95
96    /// Check if this appears to be a numeric array
97    pub fn is_numeric_array(&self) -> bool {
98        // Heuristic: starts with '[', has many numbers, few strings
99        self.structural_chars
100            .first()
101            .is_some_and(|&c| c as u8 == b'[')
102            && self.number_bounds.len() > 4
103            && self.string_bounds.len() < 2
104    }
105
106    /// Check if this appears to be a table/object array
107    pub fn is_table_like(&self) -> bool {
108        // Heuristic: starts with '[', has balanced objects and strings
109        self.structural_chars
110            .first()
111            .is_some_and(|&c| c as u8 == b'[')
112            && self.count_object_starts() > 2
113            && self.string_bounds.len() > self.number_bounds.len()
114    }
115
116    /// Count opening braces to estimate object count
117    fn count_object_starts(&self) -> usize {
118        self.structural_chars
119            .iter()
120            .filter(|&&pos| pos as u8 == b'{')
121            .count()
122    }
123}
124
125impl Default for ScanResult {
126    fn default() -> Self {
127        Self::new()
128    }
129}
130
131impl Range {
132    /// Create new range
133    pub fn new(start: usize, end: usize) -> Self {
134        Self { start, end }
135    }
136
137    /// Get range length
138    pub fn len(&self) -> usize {
139        self.end.saturating_sub(self.start)
140    }
141
142    /// Check if range is empty
143    pub fn is_empty(&self) -> bool {
144        self.len() == 0
145    }
146}
147
148impl StringLocation {
149    /// Create new string location
150    pub fn new(start: usize, end: usize) -> Self {
151        Self {
152            start,
153            end,
154            has_escapes: false,
155            unescaped_len: None,
156        }
157    }
158
159    /// Create with escape information
160    pub fn with_escapes(start: usize, end: usize, has_escapes: bool) -> Self {
161        Self {
162            start,
163            end,
164            has_escapes,
165            unescaped_len: None,
166        }
167    }
168
169    /// Get string length in bytes
170    pub fn len(&self) -> usize {
171        self.end.saturating_sub(self.start)
172    }
173
174    /// Check if empty
175    pub fn is_empty(&self) -> bool {
176        self.len() == 0
177    }
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed result holds no structural or string entries.
    #[test]
    fn test_scan_result_creation() {
        let fresh = ScanResult::new();
        assert!(fresh.structural_chars.is_empty());
        assert!(fresh.string_bounds.is_empty());
    }

    /// A non-degenerate range reports its span and is not empty.
    #[test]
    fn test_range_operations() {
        let span = Range::new(10, 20);
        assert_eq!(span.len(), 10);
        assert!(!span.is_empty());
    }

    /// The plain constructor yields the expected length and no escape flag.
    #[test]
    fn test_string_location() {
        let location = StringLocation::new(5, 15);
        assert_eq!(location.len(), 10);
        assert!(!location.has_escapes);
    }
}