Skip to main content

rsonpath/classification/
structural.rs

//! Classification of structurally significant JSON bytes.
//!
//! Provides the [`Structural`] enum and the [`StructuralIterator`] trait
//! that allow efficiently iterating over structural characters in a JSON document.
//!
//! Classifying [`Commas`](`Structural::Comma`) and [`Colons`](`Structural::Colon`) is disabled by default.
//! It can be enabled on demand by calling
//! [`StructuralIterator::turn_commas_on`]/[`StructuralIterator::turn_colons_on`].
//! This configuration is persisted across [`stop`](StructuralIterator::stop) and
//! [`resume`](StructuralIterator::resume) calls.
//!
//! A structural classifier needs ownership over a base
//! [`QuoteClassifiedIterator`](`crate::classification::quotes::QuoteClassifiedIterator`).
14use crate::{
15    classification::{quotes::QuoteClassifiedIterator, ResumeClassifierState},
16    input::{error::InputError, InputBlockIterator},
17    FallibleIterator, MaskType, BLOCK_SIZE,
18};
19
/// The two families of brackets that are recognised as structural characters.
///
/// A value of this type only says which pair a bracket belongs to; whether it is
/// the opening or the closing one is expressed by the [`Structural`] variant
/// that carries it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum BracketType {
    /// The square pair: `[` and `]`.
    Square,
    /// The curly pair: `{` and `}`.
    Curly,
}
29
30/// Defines structural characters in JSON documents.
31#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
32pub enum Structural {
33    /// Represents the closing square or curly brace, ']' or '}'.
34    Closing(BracketType, usize),
35    /// Represents the colon ':' character.
36    Colon(usize),
37    /// Represents the opening square or curly brace, '[' or '{'.
38    Opening(BracketType, usize),
39    /// Represents the comma ',' character.
40    Comma(usize),
41}
42use Structural::*;
43
44impl Structural {
45    /// Returns the index of the character in the document,
46    /// i.e. which byte it is counting from 0.
47    #[inline(always)]
48    #[must_use]
49    pub fn idx(self) -> usize {
50        match self {
51            Closing(_, idx) | Colon(idx) | Opening(_, idx) | Comma(idx) => idx,
52        }
53    }
54
55    /// Add a given amount to the structural's index.
56    ///
57    /// # Examples
58    /// ```rust
59    /// # use rsonpath::classification::structural::Structural;
60    ///
61    /// let structural = Structural::Colon(42);
62    /// let offset_structural = structural.offset(10);
63    ///
64    /// assert_eq!(structural.idx(), 42);
65    /// assert_eq!(offset_structural.idx(), 52);
66    /// ```
67    #[inline(always)]
68    #[must_use]
69    pub fn offset(self, amount: usize) -> Self {
70        match self {
71            Closing(b, idx) => Closing(b, idx + amount),
72            Colon(idx) => Colon(idx + amount),
73            Opening(b, idx) => Opening(b, idx + amount),
74            Comma(idx) => Comma(idx + amount),
75        }
76    }
77
78    /// Check if the structural represents a closing character,
79    /// i.e. a [`Closing`] with either of the [`BracketType`] variants.
80    ///
81    /// # Examples
82    /// ```rust
83    /// # use rsonpath::classification::structural::{BracketType, Structural};
84    ///
85    /// let brace = Structural::Closing(BracketType::Curly, 42);
86    /// let bracket = Structural::Closing(BracketType::Square, 43);
87    /// let neither = Structural::Comma(44);
88    ///
89    /// assert!(brace.is_closing());
90    /// assert!(bracket.is_closing());
91    /// assert!(!neither.is_closing());
92    /// ```
93    #[inline(always)]
94    #[must_use]
95    pub fn is_closing(&self) -> bool {
96        matches!(self, Closing(_, _))
97    }
98
99    /// Check if the structural represents an opening character,
100    /// i.e. an [`Opening`] with either of the [`BracketType`] variants.
101    ///
102    /// # Examples
103    /// ```rust
104    /// # use rsonpath::classification::structural::{BracketType, Structural};
105    ///
106    /// let brace = Structural::Opening(BracketType::Curly, 42);
107    /// let bracket = Structural::Opening(BracketType::Square, 43);
108    /// let neither = Structural::Comma(44);
109    ///
110    /// assert!(brace.is_opening());
111    /// assert!(bracket.is_opening());
112    /// assert!(!neither.is_opening());
113    /// ```
114    #[inline(always)]
115    #[must_use]
116    pub fn is_opening(&self) -> bool {
117        matches!(self, Opening(_, _))
118    }
119}
120
121/// Trait for classifier iterators, i.e. finite iterators of [`Structural`] characters
122/// that hold a reference to the JSON document valid for `'a`.
123pub trait StructuralIterator<'i, I, Q, M, const N: usize>:
124    FallibleIterator<Item = Structural, Error = InputError>
125where
126    I: InputBlockIterator<'i, N>,
127{
128    /// Stop classification and return a state object that can be used to resume
129    /// a classifier from the place in which the current one was stopped.
130    fn stop(self) -> ResumeClassifierState<'i, I, Q, M, N>;
131
132    /// Resume classification from a state retrieved by stopping a classifier.
133    fn resume(state: ResumeClassifierState<'i, I, Q, M, N>) -> Self;
134
135    /// Turn classification of [`Structural::Colon`] characters off.
136    fn turn_colons_off(&mut self);
137
138    /// Turn classification of [`Structural::Colon`] characters on.
139    ///
140    /// The `idx` passed should be the index of the byte in the input
141    /// from which commas are to be classified. Passing an `idx` that
142    /// does not match the index which the internal [`QuoteClassifiedIterator`]
143    /// reached may result in incorrect results.
144    fn turn_colons_on(&mut self, idx: usize);
145
146    /// Turn classification of [`Structural::Comma`] characters off.
147    fn turn_commas_off(&mut self);
148
149    /// Turn classification of [`Structural::Comma`] characters on.
150    ///
151    /// The `idx` passed should be the index of the byte in the input
152    /// from which commas are to be classified. Passing an `idx` that
153    /// does not match the index which the internal [`QuoteClassifiedIterator`]
154    /// reached may result in incorrect results.
155    fn turn_commas_on(&mut self, idx: usize);
156
157    /// Turn classification of both [`Structural::Comma`] and [`Structural::Colon`]
158    /// characters on. This is generally faster than calling
159    /// [`turn_colons_on`](`StructuralIterator::turn_colons_on`) and
160    /// [`turn_commas_on`](`StructuralIterator::turn_commas_on`)
161    /// in sequence.
162    fn turn_colons_and_commas_on(&mut self, idx: usize);
163
164    /// Turn classification of both [`Structural::Comma`] and [`Structural::Colon`]
165    /// characters off. This is generally faster than calling
166    /// [`turn_colons_on`](`StructuralIterator::turn_colons_off`) and
167    /// [`turn_commas_on`](`StructuralIterator::turn_commas_off`)
168    /// in sequence.
169    fn turn_colons_and_commas_off(&mut self);
170}
171
172pub(crate) mod nosimd;
173pub(crate) mod shared;
174
175#[cfg(target_arch = "x86")]
176pub(crate) mod avx2_32;
177#[cfg(target_arch = "x86_64")]
178pub(crate) mod avx2_64;
179#[cfg(target_arch = "x86_64")]
180pub(crate) mod avx512_64;
181#[cfg(target_arch = "aarch64")]
182pub(crate) mod neon_64;
183#[cfg(target_arch = "x86")]
184pub(crate) mod ssse3_32;
185#[cfg(target_arch = "x86_64")]
186pub(crate) mod ssse3_64;
187
188pub(crate) trait StructuralImpl {
189    type Classifier<'i, I, Q>: StructuralIterator<'i, I, Q, MaskType, BLOCK_SIZE>
190    where
191        I: InputBlockIterator<'i, BLOCK_SIZE>,
192        Q: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE>;
193
194    fn new<'i, I, Q>(iter: Q) -> Self::Classifier<'i, I, Q>
195    where
196        I: InputBlockIterator<'i, BLOCK_SIZE>,
197        Q: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE>;
198
199    fn resume<'i, I, Q>(state: ResumeClassifierState<'i, I, Q, MaskType, BLOCK_SIZE>) -> Self::Classifier<'i, I, Q>
200    where
201        I: InputBlockIterator<'i, BLOCK_SIZE>,
202        Q: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE>,
203    {
204        <Self::Classifier<'i, I, Q> as StructuralIterator<'i, I, Q, MaskType, BLOCK_SIZE>>::resume(state)
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        classification::simd::{self, config_simd, Simd},
        input::{BorrowedBytes, Input},
        result::empty::EmptyRecorder,
    };

    /// Document shared by the comma/colon resumption tests.
    ///
    /// Relevant byte positions: `{` at 0, 15 and 22; `[` at 6;
    /// `:` at 4, 20 and 27; `,` at 9, 13 and 30.
    const NESTED_JSON: &str = r#"{"a": [42, 36, { "b": { "c": 1, "d": 2 } }]}"#;

    /// With neither commas nor colons enabled, only brackets are reported,
    /// both before stopping and after resuming.
    #[test]
    fn resumption_without_commas_or_colons() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let document = NESTED_JSON.to_owned();
            let input = BorrowedBytes::new(document.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            // Reported indices are shifted by the input's leading padding.
            let offset = input.leading_padding_len();

            let mut classifier = simd.classify_structural_characters(quote_classifier);

            for expected in [Opening(Curly, offset), Opening(Square, 6 + offset)] {
                assert_eq!(Some(expected), classifier.next().unwrap());
            }

            let mut resumed = simd.resume_structural_classification(classifier.stop());

            for expected in [Opening(Curly, 15 + offset), Opening(Curly, 22 + offset)] {
                assert_eq!(Some(expected), resumed.next().unwrap());
            }
        });
    }

    /// Comma classification enabled before stopping is still in effect after resuming.
    #[test]
    fn resumption_with_commas_but_no_colons() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let document = NESTED_JSON.to_owned();
            let input = BorrowedBytes::new(document.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let offset = input.leading_padding_len();

            let mut classifier = simd.classify_structural_characters(quote_classifier);
            classifier.turn_commas_on(0);

            for expected in [
                Opening(Curly, offset),
                Opening(Square, 6 + offset),
                Comma(9 + offset),
                Comma(13 + offset),
            ] {
                assert_eq!(Some(expected), classifier.next().unwrap());
            }

            let mut resumed = simd.resume_structural_classification(classifier.stop());

            for expected in [
                Opening(Curly, 15 + offset),
                Opening(Curly, 22 + offset),
                Comma(30 + offset),
            ] {
                assert_eq!(Some(expected), resumed.next().unwrap());
            }
        });
    }

    /// Colon classification enabled before stopping is still in effect after resuming.
    #[test]
    fn resumption_with_colons_but_no_commas() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let document = NESTED_JSON.to_owned();
            let input = BorrowedBytes::new(document.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let offset = input.leading_padding_len();

            let mut classifier = simd.classify_structural_characters(quote_classifier);
            classifier.turn_colons_on(0);

            for expected in [
                Opening(Curly, offset),
                Colon(4 + offset),
                Opening(Square, 6 + offset),
            ] {
                assert_eq!(Some(expected), classifier.next().unwrap());
            }

            let mut resumed = simd.resume_structural_classification(classifier.stop());

            for expected in [
                Opening(Curly, 15 + offset),
                Colon(20 + offset),
                Opening(Curly, 22 + offset),
                Colon(27 + offset),
            ] {
                assert_eq!(Some(expected), resumed.next().unwrap());
            }
        });
    }

    /// Both comma and colon classification are still in effect after resuming.
    #[test]
    fn resumption_with_commas_and_colons() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            let document = NESTED_JSON.to_owned();
            let input = BorrowedBytes::new(document.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let offset = input.leading_padding_len();

            let mut classifier = simd.classify_structural_characters(quote_classifier);
            classifier.turn_commas_on(0);
            classifier.turn_colons_on(0);

            for expected in [
                Opening(Curly, offset),
                Colon(4 + offset),
                Opening(Square, 6 + offset),
                Comma(9 + offset),
                Comma(13 + offset),
            ] {
                assert_eq!(Some(expected), classifier.next().unwrap());
            }

            let mut resumed = simd.resume_structural_classification(classifier.stop());

            for expected in [
                Opening(Curly, 15 + offset),
                Colon(20 + offset),
                Opening(Curly, 22 + offset),
                Colon(27 + offset),
                Comma(30 + offset),
            ] {
                assert_eq!(Some(expected), resumed.next().unwrap());
            }
        });
    }

    /// Stopping right after the first character of a document that consists of
    /// `{`, 128 spaces and `}` must not lose the closing brace at index 129.
    #[test]
    fn resumption_at_block_boundary() {
        use BracketType::*;
        use Structural::*;

        let simd = simd::configure();
        config_simd!(simd => |simd| {
            // `{{` and `}}` are escaped braces: the result is `{`, the filler, then `}`.
            let document = format!("{{{}}}", " ".repeat(128));
            let input = BorrowedBytes::new(document.as_bytes());
            let blocks = input.iter_blocks(&EmptyRecorder);
            let quote_classifier = simd.classify_quoted_sequences(blocks);
            let offset = input.leading_padding_len();

            let mut classifier = simd.classify_structural_characters(quote_classifier);

            assert_eq!(Some(Opening(Curly, offset)), classifier.next().unwrap());

            let mut resumed = simd.resume_structural_classification(classifier.stop());

            assert_eq!(Some(Closing(Curly, 129 + offset)), resumed.next().unwrap());
        });
    }
}