rsonpath/
input.rs

1//! Input structures that can be fed into an [`Engine`](crate::engine::Engine).
2//!
3//! The engine itself is generic in the [`Input`] trait declared here.
4//! There are a couple of different built-in implementations, each
5//! suitable for a different scenario. Consult the module-level
6//! documentation of each type to determine which to use. Here's a quick
7//! cheat-sheet:
8//!
9//! | Input scenario                | Type to use       |
10//! |:------------------------------|:------------------|
11//! | memory based                  | [`BorrowedBytes`] |
12//! | memory based, take ownership  | [`OwnedBytes`]    |
13//! | file based                    | [`MmapInput`]     |
14//! | [`Read`](std::io::Read) based | [`BufferedInput`] |
15
16pub mod borrowed;
17pub mod buffered;
18pub mod error;
19pub mod mmap;
20pub mod owned;
21mod padding;
22mod slice;
23pub use borrowed::BorrowedBytes;
24pub use buffered::BufferedInput;
25pub use mmap::MmapInput;
26pub use owned::OwnedBytes;
27
28use self::error::InputError;
29use crate::{result::InputRecorder, string_pattern::StringPattern};
30use std::ops::Deref;
31
32/// Make the struct repr(C) with alignment equal to [`MAX_BLOCK_SIZE`].
33macro_rules! repr_align_block_size {
34    ($it:item) => {
35        #[repr(C, align(128))]
36        $it
37    };
38}
39pub(crate) use repr_align_block_size;
40
41/// Global padding guarantee for all [`Input`] implementations.
42/// Iterating over blocks of at most this size is guaranteed
43/// to produce only full blocks.
44///
45/// # Remarks
46/// This is set to `128` and unlikely to change.
47/// Widest available SIMD is AVX512, which has 64-byte blocks.
48/// The engine processes blocks in pairs, thus 128 is the highest possible request made to a block iterator.
49/// For this value to change a new, wider SIMD implementation would have to appear.
50pub const MAX_BLOCK_SIZE: usize = 128;
51
52/// UTF-8 encoded bytes representing a JSON document that support
53/// block-by-block iteration and basic seeking procedures.
54pub trait Input: Sized {
55    /// Type of the iterator used by [`iter_blocks`](Input::iter_blocks), parameterized
56    /// by the lifetime of source input and the size of the block.
57    type BlockIterator<'i, 'r, R, const N: usize>: InputBlockIterator<
58        'i,
59        N,
60        Block = Self::Block<'i, N>,
61        Error = Self::Error,
62    >
63    where
64        Self: 'i,
65        R: InputRecorder<Self::Block<'i, N>> + 'r;
66
67    /// Type of errors that can occur when operating on this [`Input`].
68    type Error: Into<InputError>;
69
70    /// Type of the blocks returned by the `BlockIterator`.
71    type Block<'i, const N: usize>: InputBlock<'i, N>
72    where
73        Self: 'i;
74
75    /// Return the length of the entire input, if known.
76    ///
77    /// This is meant to be used merely as a hint.
78    /// There are [`Input`] implementations that may not be able to know the entire
79    /// length a priori, and they should return [`None`].
80    #[inline(always)]
81    #[must_use]
82    fn len_hint(&self) -> Option<usize> {
83        None
84    }
85
86    /// Return the length of the padding added at the start of the input.
87    ///
88    /// This depends on the particular [`Input`] implementation, and may be zero.
89    /// In any case the length of the entire input should be equivalent to the length
90    /// of the source plus [`leading_padding_len`](`Input::leading_padding_len`) plus
91    /// [`trailing_padding_len`](`Input::trailing_padding_len`).
92    #[must_use]
93    fn leading_padding_len(&self) -> usize;
94
95    /// Return the length of the padding added at the end of the input.
96    ///
97    /// This depends on the particular [`Input`] implementation, and may be zero.
98    /// In any case the length of the entire input should be equivalent to the length
99    /// of the source plus [`leading_padding_len`](`Input::leading_padding_len`) plus
100    /// [`trailing_padding_len`](`Input::trailing_padding_len`).
101    #[must_use]
102    fn trailing_padding_len(&self) -> usize;
103
104    /// Iterate over blocks of size `N` of the input.
105    /// `N` has to be a power of two larger than 1.
106    #[must_use]
107    fn iter_blocks<'i, 'r, R, const N: usize>(&'i self, recorder: &'r R) -> Self::BlockIterator<'i, 'r, R, N>
108    where
109        R: InputRecorder<Self::Block<'i, N>>;
110
111    /// Search for an occurrence of `needle` in the input,
112    /// starting from `from` and looking back. Returns the index
113    /// of the first occurrence or `None` if the `needle` was not found.
114    #[must_use]
115    fn seek_backward(&self, from: usize, needle: u8) -> Option<usize>;
116
117    /// Search for an occurrence of any of the `needles` in the input,
118    /// starting from `from` and looking forward. Returns the index
119    /// of the first occurrence and the needle found, or `None` if none of the `needles` were not found.
120    ///
121    /// # Errors
122    /// This function can read more data from the input if no relevant characters are found
123    /// in the current buffer, which can fail.
124    fn seek_forward<const N: usize>(&self, from: usize, needles: [u8; N]) -> Result<Option<(usize, u8)>, Self::Error>;
125
126    /// Search for the first byte in the input that is not ASCII whitespace
127    /// starting from `from`. Returns a pair: the index of first such byte,
128    /// and the byte itself; or `None` if no non-whitespace characters
129    /// were found.
130    ///
131    /// # Errors
132    /// This function can read more data from the input if no relevant characters are found
133    /// in the current buffer, which can fail.
134    fn seek_non_whitespace_forward(&self, from: usize) -> Result<Option<(usize, u8)>, Self::Error>;
135
136    /// Search for the first byte in the input that is not ASCII whitespace
137    /// starting from `from` and looking back. Returns a pair:
138    /// the index of first such byte, and the byte itself;
139    /// or `None` if no non-whitespace characters were found.
140    #[must_use]
141    fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)>;
142
143    /// Decide whether the slice of input between `from` (inclusive)
144    /// and `to` (exclusive) matches the `member` (comparing bitwise,
145    /// including double quotes delimiters).
146    ///
147    /// This will also check if the leading double quote is not
148    /// escaped by a backslash character.
149    ///
150    /// # Errors
151    /// This function can read more data from the input if `to` falls beyond
152    /// the range that was already read, and the read operation can fail.
153    fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result<bool, Self::Error>;
154}
155
156/// An iterator over blocks of input of size `N`.
157/// Implementations MUST guarantee that the blocks returned from `next`
158/// are *exactly* of size `N`.
159pub trait InputBlockIterator<'i, const N: usize> {
160    /// The type of blocks returned.
161    type Block: InputBlock<'i, N>;
162
163    /// Type of errors that can occur when reading from this iterator.
164    type Error: Into<InputError>;
165
166    /// Advances the iterator and returns the next value.
167    ///
168    /// # Errors
169    /// May fail depending on the implementation.
170    fn next(&mut self) -> Result<Option<Self::Block>, Self::Error>;
171
172    /// Get the offset of the iterator in the input.
173    ///
174    /// The offset is the starting point of the block that will be returned next
175    /// from this iterator, if any. It starts as 0 and increases by `N` on every
176    /// block retrieved.
177    fn get_offset(&self) -> usize;
178
179    /// Offset the iterator by `count` full blocks forward.
180    ///
181    /// The `count` parameter must be greater than 0.
182    fn offset(&mut self, count: isize);
183}
184
185/// A block of bytes of size `N` returned from [`InputBlockIterator`].
186pub trait InputBlock<'i, const N: usize>: Deref<Target = [u8]> {
187    /// Split the block in half, giving two slices of size `N`/2.
188    #[must_use]
189    fn halves(&self) -> (&[u8], &[u8]);
190
191    /// Split the block in four, giving four slices of size `N`/4.
192    #[inline]
193    #[must_use]
194    fn quarters(&self) -> (&[u8], &[u8], &[u8], &[u8]) {
195        assert_eq!(N % 4, 0);
196        let (half1, half2) = self.halves();
197        let (q1, q2) = (&half1[..N / 4], &half1[N / 4..]);
198        let (q3, q4) = (&half2[..N / 4], &half2[N / 4..]);
199
200        (q1, q2, q3, q4)
201    }
202}
203
204impl<'i, const N: usize> InputBlock<'i, N> for &'i [u8] {
205    #[inline(always)]
206    fn halves(&self) -> (&[u8], &[u8]) {
207        assert_eq!(N % 2, 0);
208        (&self[..N / 2], &self[N / 2..])
209    }
210}
211
212pub(super) trait SliceSeekable {
213    fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> bool;
214
215    fn seek_backward(&self, from: usize, needle: u8) -> Option<usize>;
216
217    fn seek_forward<const N: usize>(&self, from: usize, needles: [u8; N]) -> Option<(usize, u8)>;
218
219    fn seek_non_whitespace_forward(&self, from: usize) -> Option<(usize, u8)>;
220
221    fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)>;
222}
223
224// This is mostly adapted from [slice::align_to](https://doc.rust-lang.org/std/primitive.slice.html#method.align_to).
225fn align_to<const N: usize>(bytes: &[u8]) -> (&[u8], &[u8], &[u8]) {
226    let ptr = bytes.as_ptr();
227    let offset = ptr.align_offset(N);
228    if offset > bytes.len() {
229        (bytes, &[], &[])
230    } else {
231        let (left, rest) = bytes.split_at(offset);
232        let middle_len = (rest.len() / N) * N;
233        let (middle, right) = rest.split_at(middle_len);
234
235        (left, middle, right)
236    }
237}
238
239#[cfg(test)]
240mod tests {
241    use super::align_to;
242    use crate::input::MAX_BLOCK_SIZE;
243
244    // Run all tests for the actual alignment we use.
245    const N: usize = MAX_BLOCK_SIZE;
246    const SIZE: usize = 1024;
247
248    #[repr(align(128))]
249    struct Aligned {
250        bytes: [u8; SIZE],
251    }
252
253    #[test]
254    fn test_all_alignments() {
255        // We construct a byte array that is already aligned,
256        // and then take all suffixes for all possible misalignments
257        // and small sizes.
258        let aligned = Aligned { bytes: get_bytes() };
259        let slice = &aligned.bytes;
260
261        for i in 0..slice.len() {
262            let misalignment = i % N;
263            test_with_misalignment(misalignment, &slice[i..]);
264        }
265    }
266
267    fn get_bytes() -> [u8; SIZE] {
268        let mut bytes = [0; SIZE];
269
270        for (i, b) in bytes.iter_mut().enumerate() {
271            let x = i % (u8::MAX as usize);
272            *b = x as u8;
273        }
274
275        bytes
276    }
277
278    fn test_with_misalignment(misalignment: usize, slice: &[u8]) {
279        let expected_left_len = (N - misalignment) % N;
280        let expected_rem_len = slice.len() - expected_left_len;
281        let expected_middle_len = (expected_rem_len / N) * N;
282        let expected_right_len = expected_rem_len - expected_middle_len;
283
284        let (left, middle, right) = align_to::<N>(slice);
285        let glued_back: Vec<_> = [left, middle, right].into_iter().flatten().copied().collect();
286
287        assert_eq!(left.len(), expected_left_len, "misalignment = {misalignment}");
288        assert_eq!(middle.len(), expected_middle_len, "misalignment = {misalignment}");
289        assert_eq!(right.len(), expected_right_len, "misalignment = {misalignment}");
290        assert_eq!(glued_back, slice, "misalignment = {misalignment}");
291    }
292}