rsonpath/classification/
structural.rs

//! Classification of structurally significant JSON bytes.
//!
//! Provides the [`Structural`] enum and the [`StructuralIterator`] trait
//! that allow efficient iteration over structural characters in a JSON document.
//!
//! Classifying [`Commas`](`Structural::Comma`) and [`Colons`](`Structural::Colon`) is disabled by default.
//! It can be enabled on demand by calling
//! [`StructuralIterator::turn_commas_on`]/[`StructuralIterator::turn_colons_on`].
//! This configuration is persisted across [`stop`](StructuralIterator::stop) and
//! [`resume`](StructuralIterator::resume) calls.
//!
//! A structural classifier needs ownership over a base
//! [`QuoteClassifiedIterator`](`crate::classification::quotes::QuoteClassifiedIterator`).
14use crate::{
15    classification::{quotes::QuoteClassifiedIterator, ResumeClassifierState},
16    input::{error::InputError, InputBlockIterator},
17    FallibleIterator, MaskType, BLOCK_SIZE,
18};
19
/// The two families of brackets recognized as structural characters.
///
/// The type is `#[repr(u8)]`, so the declaration order of the variants fixes
/// their discriminants (`Square` is `0`, `Curly` is `1`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum BracketType {
    /// The `[` / `]` pair.
    Square,
    /// The `{` / `}` pair.
    Curly,
}
29
30/// Defines structural characters in JSON documents.
31#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
32pub enum Structural {
33    /// Represents the closing square or curly brace, ']' or '}'.
34    Closing(BracketType, usize),
35    /// Represents the colon ':' character.
36    Colon(usize),
37    /// Represents the opening square or curly brace, '[' or '{'.
38    Opening(BracketType, usize),
39    /// Represents the comma ',' character.
40    Comma(usize),
41}
42use Structural::*;
43
44impl Structural {
45    /// Returns the index of the character in the document,
46    /// i.e. which byte it is counting from 0.
47    #[inline(always)]
48    #[must_use]
49    pub fn idx(self) -> usize {
50        match self {
51            Closing(_, idx) | Colon(idx) | Opening(_, idx) | Comma(idx) => idx,
52        }
53    }
54
55    /// Add a given amount to the structural's index.
56    ///
57    /// # Examples
58    /// ```rust
59    /// # use rsonpath::classification::structural::Structural;
60    ///
61    /// let structural = Structural::Colon(42);
62    /// let offset_structural = structural.offset(10);
63    ///
64    /// assert_eq!(structural.idx(), 42);
65    /// assert_eq!(offset_structural.idx(), 52);
66    /// ```
67    #[inline(always)]
68    #[must_use]
69    pub fn offset(self, amount: usize) -> Self {
70        match self {
71            Closing(b, idx) => Closing(b, idx + amount),
72            Colon(idx) => Colon(idx + amount),
73            Opening(b, idx) => Opening(b, idx + amount),
74            Comma(idx) => Comma(idx + amount),
75        }
76    }
77
78    /// Check if the structural represents a closing character,
79    /// i.e. a [`Closing`] with either of the [`BracketType`] variants.
80    ///
81    /// # Examples
82    /// ```rust
83    /// # use rsonpath::classification::structural::{BracketType, Structural};
84    ///
85    /// let brace = Structural::Closing(BracketType::Curly, 42);
86    /// let bracket = Structural::Closing(BracketType::Square, 43);
87    /// let neither = Structural::Comma(44);
88    ///
89    /// assert!(brace.is_closing());
90    /// assert!(bracket.is_closing());
91    /// assert!(!neither.is_closing());
92    /// ```
93    #[inline(always)]
94    #[must_use]
95    pub fn is_closing(&self) -> bool {
96        matches!(self, Closing(_, _))
97    }
98
99    /// Check if the structural represents an opening character,
100    /// i.e. an [`Opening`] with either of the [`BracketType`] variants.
101    ///
102    /// # Examples
103    /// ```rust
104    /// # use rsonpath::classification::structural::{BracketType, Structural};
105    ///
106    /// let brace = Structural::Opening(BracketType::Curly, 42);
107    /// let bracket = Structural::Opening(BracketType::Square, 43);
108    /// let neither = Structural::Comma(44);
109    ///
110    /// assert!(brace.is_opening());
111    /// assert!(bracket.is_opening());
112    /// assert!(!neither.is_opening());
113    /// ```
114    #[inline(always)]
115    #[must_use]
116    pub fn is_opening(&self) -> bool {
117        matches!(self, Opening(_, _))
118    }
119}
120
121/// Trait for classifier iterators, i.e. finite iterators of [`Structural`] characters
122/// that hold a reference to the JSON document valid for `'a`.
123pub trait StructuralIterator<'i, I, Q, M, const N: usize>:
124    FallibleIterator<Item = Structural, Error = InputError>
125where
126    I: InputBlockIterator<'i, N>,
127{
128    /// Stop classification and return a state object that can be used to resume
129    /// a classifier from the place in which the current one was stopped.
130    fn stop(self) -> ResumeClassifierState<'i, I, Q, M, N>;
131
132    /// Resume classification from a state retrieved by stopping a classifier.
133    fn resume(state: ResumeClassifierState<'i, I, Q, M, N>) -> Self;
134
135    /// Turn classification of [`Structural::Colon`] characters off.
136    fn turn_colons_off(&mut self);
137
138    /// Turn classification of [`Structural::Colon`] characters on.
139    ///
140    /// The `idx` passed should be the index of the byte in the input
141    /// from which commas are to be classified. Passing an `idx` that
142    /// does not match the index which the internal [`QuoteClassifiedIterator`]
143    /// reached may result in incorrect results.
144    fn turn_colons_on(&mut self, idx: usize);
145
146    /// Turn classification of [`Structural::Comma`] characters off.
147    fn turn_commas_off(&mut self);
148
149    /// Turn classification of [`Structural::Comma`] characters on.
150    ///
151    /// The `idx` passed should be the index of the byte in the input
152    /// from which commas are to be classified. Passing an `idx` that
153    /// does not match the index which the internal [`QuoteClassifiedIterator`]
154    /// reached may result in incorrect results.
155    fn turn_commas_on(&mut self, idx: usize);
156
157    /// Turn classification of both [`Structural::Comma`] and [`Structural::Colon`]
158    /// characters on. This is generally faster than calling
159    /// [`turn_colons_on`](`StructuralIterator::turn_colons_on`) and
160    /// [`turn_commas_on`](`StructuralIterator::turn_commas_on`)
161    /// in sequence.
162    fn turn_colons_and_commas_on(&mut self, idx: usize);
163
164    /// Turn classification of both [`Structural::Comma`] and [`Structural::Colon`]
165    /// characters off. This is generally faster than calling
166    /// [`turn_colons_on`](`StructuralIterator::turn_colons_off`) and
167    /// [`turn_commas_on`](`StructuralIterator::turn_commas_off`)
168    /// in sequence.
169    fn turn_colons_and_commas_off(&mut self);
170}
171
// Implementation submodules. `nosimd` and `shared` carry no `cfg` gate here,
// so they are compiled on every target.
pub(crate) mod nosimd;
pub(crate) mod shared;

// The remaining submodules are gated on `target_arch`: the `_32` modules are built
// only for `x86`, the `_64` modules only for `x86_64`.
// NOTE(review): presumably these hold the AVX2 and SSSE3 classifier variants,
// judging by the module names — confirm in the modules themselves.
#[cfg(target_arch = "x86")]
pub(crate) mod avx2_32;
#[cfg(target_arch = "x86_64")]
pub(crate) mod avx2_64;
#[cfg(target_arch = "x86")]
pub(crate) mod ssse3_32;
#[cfg(target_arch = "x86_64")]
pub(crate) mod ssse3_64;
183
184pub(crate) trait StructuralImpl {
185    type Classifier<'i, I, Q>: StructuralIterator<'i, I, Q, MaskType, BLOCK_SIZE>
186    where
187        I: InputBlockIterator<'i, BLOCK_SIZE>,
188        Q: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE>;
189
190    fn new<'i, I, Q>(iter: Q) -> Self::Classifier<'i, I, Q>
191    where
192        I: InputBlockIterator<'i, BLOCK_SIZE>,
193        Q: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE>;
194
195    fn resume<'i, I, Q>(state: ResumeClassifierState<'i, I, Q, MaskType, BLOCK_SIZE>) -> Self::Classifier<'i, I, Q>
196    where
197        I: InputBlockIterator<'i, BLOCK_SIZE>,
198        Q: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE>,
199    {
200        <Self::Classifier<'i, I, Q> as StructuralIterator<'i, I, Q, MaskType, BLOCK_SIZE>>::resume(state)
201    }
202}
203
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        classification::simd::{self, config_simd, Simd},
        input::{BorrowedBytes, Input},
        result::empty::EmptyRecorder,
    };

    // Document shared by the resumption tests. Byte positions of interest:
    // `{`@0, `:`@4, `[`@6, `,`@9, `,`@13, `{`@15, `:`@20, `{`@22, `:`@27, `,`@30.
    const NESTED_JSON: &str = r#"{"a": [42, 36, { "b": { "c": 1, "d": 2 } }]}"#;

    // With default settings only brackets are reported, before and after a stop/resume cycle.
    #[test]
    fn resumption_without_commas_or_colons() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let input = BorrowedBytes::new(NESTED_JSON.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let padding = input.leading_padding_len();

            let mut first_run = simd.classify_structural_characters(quote_classifier);

            assert_eq!(Some(Opening(Curly, padding)), first_run.next().unwrap());
            assert_eq!(Some(Opening(Square, 6 + padding)), first_run.next().unwrap());

            let stopped = first_run.stop();
            let mut second_run = simd.resume_structural_classification(stopped);

            assert_eq!(Some(Opening(Curly, 15 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Opening(Curly, 22 + padding)), second_run.next().unwrap());
        });
    }

    // Enabling commas must survive a stop/resume cycle, and colons must stay off.
    #[test]
    fn resumption_with_commas_but_no_colons() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let input = BorrowedBytes::new(NESTED_JSON.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let padding = input.leading_padding_len();

            let mut first_run = simd.classify_structural_characters(quote_classifier);
            first_run.turn_commas_on(0);

            assert_eq!(Some(Opening(Curly, padding)), first_run.next().unwrap());
            assert_eq!(Some(Opening(Square, 6 + padding)), first_run.next().unwrap());
            assert_eq!(Some(Comma(9 + padding)), first_run.next().unwrap());
            assert_eq!(Some(Comma(13 + padding)), first_run.next().unwrap());

            let stopped = first_run.stop();
            let mut second_run = simd.resume_structural_classification(stopped);

            assert_eq!(Some(Opening(Curly, 15 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Opening(Curly, 22 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Comma(30 + padding)), second_run.next().unwrap());
        });
    }

    // Enabling colons must survive a stop/resume cycle, and commas must stay off.
    #[test]
    fn resumption_with_colons_but_no_commas() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let input = BorrowedBytes::new(NESTED_JSON.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let padding = input.leading_padding_len();

            let mut first_run = simd.classify_structural_characters(quote_classifier);
            first_run.turn_colons_on(0);

            assert_eq!(Some(Opening(Curly, padding)), first_run.next().unwrap());
            assert_eq!(Some(Colon(4 + padding)), first_run.next().unwrap());
            assert_eq!(Some(Opening(Square, 6 + padding)), first_run.next().unwrap());

            let stopped = first_run.stop();
            let mut second_run = simd.resume_structural_classification(stopped);

            assert_eq!(Some(Opening(Curly, 15 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Colon(20 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Opening(Curly, 22 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Colon(27 + padding)), second_run.next().unwrap());
        });
    }

    // Both optional classifications enabled at once must survive a stop/resume cycle.
    #[test]
    fn resumption_with_commas_and_colons() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let input = BorrowedBytes::new(NESTED_JSON.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let padding = input.leading_padding_len();

            let mut first_run = simd.classify_structural_characters(quote_classifier);
            first_run.turn_commas_on(0);
            first_run.turn_colons_on(0);

            assert_eq!(Some(Opening(Curly, padding)), first_run.next().unwrap());
            assert_eq!(Some(Colon(4 + padding)), first_run.next().unwrap());
            assert_eq!(Some(Opening(Square, 6 + padding)), first_run.next().unwrap());
            assert_eq!(Some(Comma(9 + padding)), first_run.next().unwrap());
            assert_eq!(Some(Comma(13 + padding)), first_run.next().unwrap());

            let stopped = first_run.stop();
            let mut second_run = simd.resume_structural_classification(stopped);

            assert_eq!(Some(Opening(Curly, 15 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Colon(20 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Opening(Curly, 22 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Colon(27 + padding)), second_run.next().unwrap());
            assert_eq!(Some(Comma(30 + padding)), second_run.next().unwrap());
        });
    }

    // The closing brace sits 128 whitespace bytes after the opening one, so the
    // classifier is stopped long before the next structural character.
    #[test]
    fn resumption_at_block_boundary() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            // `{`, then 128 spaces, then `}` at byte 129.
            let document = format!("{{{}}}", " ".repeat(128));
            let input = BorrowedBytes::new(document.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let padding = input.leading_padding_len();

            let mut first_run = simd.classify_structural_characters(quote_classifier);

            assert_eq!(Some(Opening(Curly, padding)), first_run.next().unwrap());

            let stopped = first_run.stop();
            let mut second_run = simd.resume_structural_classification(stopped);

            assert_eq!(Some(Closing(Curly, 129 + padding)), second_run.next().unwrap());
        });
    }
}