rsonpath/classification/quotes.rs
//! Classification of bytes within JSON quote sequences.
2//!
3//! Provides the [`QuoteClassifiedBlock`] struct and [`QuoteClassifiedIterator`] trait
4//! that allow effectively enriching JSON inputs with quote sequence information.
5//!
6//! The output of quote classification is an iterator of [`QuoteClassifiedBlock`]
7//! which contain bitmasks whose lit bits signify characters that are within quotes
8//! in the source document. These characters need to be ignored.
9//!
10//! Note that the actual quote characters are not guaranteed to be classified
11//! as "within themselves" or otherwise. In particular the current implementation
12//! marks _opening_ quotes with lit bits, but _closing_ quotes are always unmarked.
13//! This behavior should not be presumed to be stable, though, and can change
14//! without a major semver bump.
15use crate::{
16 input::{error::InputError, InputBlock, InputBlockIterator},
17 FallibleIterator, MaskType, BLOCK_SIZE,
18};
19
20/// Result of the [`FallibleIterator`] for quote classification,
21/// and of the [`offset`](`QuoteClassifiedIterator::offset`) function.
22pub type QuoteIterResult<I, M, const N: usize> = Result<Option<QuoteClassifiedBlock<I, M, N>>, InputError>;
23
/// A block of input paired with a bitmask of its quoted characters.
///
/// Every character of the input that lies within quotes is guaranteed to have
/// its bit set in `within_quotes_mask`. Bit $0$ of the mask corresponds to the
/// last character in `block`, bit $1$ to the second-to-last character, and so on.
///
/// The quote characters delimiting a sequence carry no such guarantee:
/// depending on the implementation their bits may be lit or not.
pub struct QuoteClassifiedBlock<B, M, const N: usize> {
    /// The input block this classification refers to.
    pub block: B,
    /// Bitmask whose lit bits mark characters inside a quoted sequence.
    pub within_quotes_mask: M,
}
38
39/// Result of resuming quote classification, the resulting iterator
40/// and optionally the first block (already quote classified).
41pub struct ResumedQuoteClassifier<Q, B, M, const N: usize> {
42 /// Resumed iterator.
43 pub classifier: Q,
44 /// Optional first quote classified block.
45 pub first_block: Option<QuoteClassifiedBlock<B, M, N>>,
46}
47
48/// Trait for quote classifier iterators, i.e. finite iterators
49/// enriching blocks of input with quote bitmasks.
50/// Iterator is allowed to hold a reference to the JSON document valid for `'a`.
51pub trait QuoteClassifiedIterator<'i, I: InputBlockIterator<'i, N>, M, const N: usize>:
52 FallibleIterator<Item = QuoteClassifiedBlock<I::Block, M, N>, Error = InputError>
53{
54 /// Get the total offset in bytes from the beginning of input.
55 fn get_offset(&self) -> usize;
56
57 /// Move the iterator `count` blocks forward.
58 ///
59 /// # Errors
60 /// At least one new block is read from the underlying
61 /// [`InputBlockIterator`] implementation, which can fail.
62 fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, M, N>;
63
64 /// Flip the bit representing whether the last block ended with a nonescaped quote.
65 ///
66 /// This should be done only in very specific circumstances where the previous-block
67 /// state could have been damaged due to stopping and resuming the classification at a later point.
68 fn flip_quotes_bit(&mut self);
69}
70
/// A higher-level classifier wrapping an
/// [`Input::BlockIterator`](crate::input::Input::BlockIterator)
/// that can be given back by consuming the classifier.
pub trait InnerIter<I> {
    /// Consumes `self`, yielding the wrapped
    /// [`Input::BlockIterator`](crate::input::Input::BlockIterator).
    fn into_inner(self) -> I;
}
77
78impl<'i, B, M, const N: usize> QuoteClassifiedBlock<B, M, N>
79where
80 B: InputBlock<'i, N>,
81{
82 /// Returns the length of the classified block.
83 #[must_use]
84 #[inline(always)]
85 pub fn len(&self) -> usize {
86 self.block.len()
87 }
88
89 /// Whether the classified block is empty.
90 #[must_use]
91 #[inline(always)]
92 pub fn is_empty(&self) -> bool {
93 self.block.is_empty()
94 }
95}
96
97pub(crate) mod nosimd;
98pub(crate) mod shared;
99
100#[cfg(target_arch = "x86")]
101pub(crate) mod avx2_32;
102#[cfg(target_arch = "x86_64")]
103pub(crate) mod avx2_64;
104#[cfg(target_arch = "x86")]
105pub(crate) mod sse2_32;
106#[cfg(target_arch = "x86_64")]
107pub(crate) mod sse2_64;
108
109pub(crate) trait QuotesImpl {
110 type Classifier<'i, I>: QuoteClassifiedIterator<'i, I, MaskType, BLOCK_SIZE> + InnerIter<I>
111 where
112 I: InputBlockIterator<'i, BLOCK_SIZE>;
113
114 fn new<'i, I>(iter: I) -> Self::Classifier<'i, I>
115 where
116 I: InputBlockIterator<'i, BLOCK_SIZE>;
117
118 fn resume<'i, I>(
119 iter: I,
120 first_block: Option<I::Block>,
121 ) -> ResumedQuoteClassifier<Self::Classifier<'i, I>, I::Block, MaskType, BLOCK_SIZE>
122 where
123 I: InputBlockIterator<'i, BLOCK_SIZE>;
124}