Skip to main content

rsonpath/engine/
head_skipping.rs

1//! Engine decorator that performs **head skipping** – an extremely optimized search for
2//! the first matching member name in a query starting with a self-looping state.
3//! This happens in queries starting with a descendant selector.
4
5use std::sync::Arc;
6
7use crate::{
8    automaton::{Automaton, State},
9    classification::{
10        mask::Mask as _,
11        memmem::Memmem as _,
12        quotes::{InnerIter as _, QuoteClassifiedIterator, ResumedQuoteClassifier},
13        simd::{dispatch_simd, Simd},
14        structural::{BracketType, Structural, StructuralIterator as _},
15        ResumeClassifierBlockState, ResumeClassifierState,
16    },
17    debug,
18    depth::Depth,
19    engine::EngineError,
20    input::{
21        error::{InputError, InputErrorConvertible as _},
22        Input, InputBlockIterator,
23    },
24    result::Recorder,
25    string_pattern::StringPattern,
26    FallibleIterator as _, MaskType, BLOCK_SIZE,
27};
28
29/// Trait that needs to be implemented by an [`Engine`](`super::Engine`) to use this submodule.
30pub(super) trait CanHeadSkip<'i, 'r, I, R, V>
31where
32    I: Input + 'i,
33    R: Recorder<I::Block<'i, BLOCK_SIZE>>,
34    V: Simd,
35{
36    /// Function called when head-skipping finds a member name at which normal query execution
37    /// should resume.
38    ///
39    /// The [`HeadSkip::run_head_skipping`] function will call this implementation
40    /// whenever it finds a member name matching the first transition in the query.
41    /// The structural `classifier` passed is guaranteed to have classified the
42    /// `next_event` and nothing past that. It is guaranteed that
43    /// `next_event` is [`Structural::Opening`].
44    ///
45    /// When called, the engine must start with in the automaton state as given in `state`
46    /// and execute the query until a matching [`Structural::Closing`] character is encountered,
47    /// using `classifier` for classification and `result` for reporting query results. The `classifier`
48    /// must *not* be used to classify anything past the matching [`Structural::Closing`] character.
49    fn run_on_subtree(
50        &mut self,
51        next_event: Structural,
52        state: State,
53        structural_classifier: V::StructuralClassifier<'i, I::BlockIterator<'i, 'r, R, BLOCK_SIZE>>,
54    ) -> Result<ResumeState<'i, I::BlockIterator<'i, 'r, R, BLOCK_SIZE>, V, MaskType>, EngineError>;
55
56    fn recorder(&mut self) -> &'r R;
57}
58
59pub(super) struct ResumeState<'i, I, V, M>(
60    pub(super) ResumeClassifierState<'i, I, V::QuotesClassifier<'i, I>, M, BLOCK_SIZE>,
61)
62where
63    I: InputBlockIterator<'i, BLOCK_SIZE>,
64    V: Simd;
65
66/// Configuration of the head-skipping decorator.
67pub(super) struct HeadSkip<'b, I, V, const N: usize> {
68    bytes: &'b I,
69    state: State,
70    is_accepting: bool,
71    member_name: Arc<StringPattern>,
72    simd: V,
73}
74
75impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> {
76    /// Create a new instance of the head-skipping decorator over a given input
77    /// and for a compiled query [`Automaton`].
78    ///
79    /// # Returns
80    /// If head-skipping is possible for the query represented by `automaton`,
81    /// returns [`Some`] with a configured instance of [`HeadSkip`].
82    /// If head-skipping is not possible, returns [`None`].
83    ///
84    /// ## Details
85    /// Head-skipping is possible if the query automaton starts
86    /// with a state with a wildcard self-loop and a single member-labelled transition forward.
87    /// Syntactically, if the [`fallback_state`](`crate::query::automaton::StateTable::fallback_state`)
88    /// of the [`initial_state`](`crate::query::automaton::StateTable::initial_state`) is the same as the
89    /// [`initial_state`](`crate::query::automaton::StateTable::initial_state`), and its
90    /// [`transitions`](`crate::query::automaton::StateTable::transitions`) are a single-element list.
91    ///
92    /// This means that we can search for the label of the forward transition in the entire document,
93    /// disregarding any additional structure &ndash; during execution we would always loop
94    /// around in the initial state until encountering the desired member name. This search can be done
95    /// extremely quickly with [`classification::memmem`](crate::classification::memmem).
96    ///
97    /// In all other cases, head-skipping is not supported.
98    pub(super) fn new(bytes: &'b I, automaton: &Automaton, simd: V) -> Option<Self> {
99        let initial_state = automaton.initial_state();
100        let fallback_state = automaton[initial_state].fallback_state();
101        let transitions = automaton[initial_state].member_transitions();
102
103        if fallback_state == initial_state
104            && transitions.len() == 1
105            && automaton[initial_state].array_transitions().is_empty()
106        {
107            let (member_name, target_state) = &transitions[0];
108            debug!("Automaton starts with a descendant search, using memmem heuristic.");
109            return Some(Self {
110                bytes,
111                state: *target_state,
112                is_accepting: automaton.is_accepting(*target_state),
113                member_name: member_name.clone(),
114                simd,
115            });
116        }
117
118        None
119    }
120
121    /// Run a preconfigured [`HeadSkip`] using the given `engine` and reporting
122    /// to the `result`.
123    pub(super) fn run_head_skipping<'r, E, R>(&self, engine: &mut E) -> Result<(), EngineError>
124    where
125        'b: 'r,
126        E: CanHeadSkip<'b, 'r, I, R, V>,
127        R: Recorder<I::Block<'b, BLOCK_SIZE>> + 'r,
128    {
129        dispatch_simd!(self.simd; self, engine =>
130        fn<'b, 'r, I, V, E, R>(head_skip: &HeadSkip<'b, I, V, BLOCK_SIZE>, engine: &mut E) -> Result<(), EngineError>
131        where
132            'b: 'r,
133            E: CanHeadSkip<'b, 'r, I, R, V>,
134            R: Recorder<I::Block<'b, BLOCK_SIZE>> + 'r,
135            I: Input,
136            V: Simd
137        {
138            let mut input_iter = head_skip.bytes.iter_blocks(engine.recorder());
139            let mut idx = 0;
140            let mut first_block = None;
141
142            loop {
143                let mut memmem = head_skip.simd.memmem(head_skip.bytes, &mut input_iter);
144                debug!("Starting memmem search from {idx}");
145
146                if let Some((starting_quote_idx, last_block)) = memmem.find_label(first_block, idx, head_skip.member_name.as_ref())? {
147                    drop(memmem);
148
149                    first_block = Some(last_block);
150                    idx = starting_quote_idx;
151                    debug!("Needle found at {idx}");
152                    let seek_start_idx = idx + head_skip.member_name.quoted().len();
153
154                match head_skip.bytes.seek_non_whitespace_forward(seek_start_idx).e()? {
155                    Some((colon_idx, b':')) => {
156                        let (next_idx, next_c) = head_skip
157                            .bytes
158                            .seek_non_whitespace_forward(colon_idx + 1).e()?
159                            .ok_or(EngineError::MissingItem())?;
160
161                            let ResumedQuoteClassifier {
162                                classifier: quote_classifier,
163                                first_block: quote_classified_first_block,
164                            } = head_skip.simd.resume_quote_classification(input_iter, first_block);
165
166                            // Temporarily set the index within the current block to zero.
167                            // This makes sense for the move below.
168                            let mut classifier_state = ResumeClassifierState {
169                                iter: quote_classifier,
170                                block: quote_classified_first_block
171                                    .map(|b| ResumeClassifierBlockState { block: b, idx: 0 }),
172                                are_colons_on: false,
173                                are_commas_on: head_skip.is_accepting,
174                            };
175
176                            debug!("Actual match with colon at {colon_idx}");
177                            debug!("Next significant character at {next_idx}");
178                            debug!("Classifier is at {}", classifier_state.get_idx());
179                            debug!("We will forward to {colon_idx} first, then to {next_idx}",);
180
181                            // Now we want to move the entire iterator state so that the current block is quote-classified,
182                            // and correctly points to the place the engine would expect had it found the matching key
183                            // in the regular loop. If the value is atomic, we handle it ourselves. If the value is complex,
184                            // the engine wants to start one byte *after* the opening character. However, the match report
185                            // has to happen before we advance one more byte, or else the opening character might be lost
186                            // in the output (if it happens at a block boundary).
187                            if next_c == b'{' || next_c == b'[' {
188                                forward_to(&mut classifier_state, next_idx)?;
189                                if head_skip.is_accepting {
190                                    engine.recorder().record_match(
191                                        next_idx,
192                                        Depth::ZERO,
193                                        crate::result::MatchedNodeType::Complex,
194                                    )?;
195                                }
196                                forward_to(&mut classifier_state, next_idx + 1)?;
197                            } else {
198                                forward_to(&mut classifier_state, next_idx)?;
199                            }
200
201                            // We now have the block where we want and we ran quote classification, but during the `forward_to`
202                            // call we lose all the flow-through quote information that usually is passed from one block to the next.
203                            // We need to manually verify the soundness of the classification. Fortunately:
204                            // 1. we know that resume_idx is either the start of a value, or one byte after an opening -
205                            //    in a valid JSON this character can be within quotes if and only if it is itself a quote;
206                            // 2. the only way the mask can be wrong is if it is flipped - marks chars within quotes
207                            //    as outside and vice versa - so it suffices to flip it if it is wrong.
208                            if let Some(block) = classifier_state.block.as_mut() {
209                                let should_be_quoted = block.block.block[block.idx] == b'"';
210                                if block.block.within_quotes_mask.is_lit(block.idx) != should_be_quoted {
211                                    debug!("Mask needs flipping!");
212                                    block.block.within_quotes_mask = !block.block.within_quotes_mask;
213                                    classifier_state.iter.flip_quotes_bit();
214                                }
215                            }
216
217                            classifier_state = match next_c {
218                                b'{' | b'[' => {
219                                    debug!("resuming");
220                                    let classifier = head_skip.simd.resume_structural_classification(classifier_state);
221                                    engine
222                                        .run_on_subtree(
223                                            Structural::Opening(
224                                                if next_c == b'{' {
225                                                    BracketType::Curly
226                                                } else {
227                                                    BracketType::Square
228                                                },
229                                                next_idx,
230                                            ),
231                                            head_skip.state,
232                                            classifier,
233                                        )?
234                                        .0
235                                }
236                                _ if head_skip.is_accepting => {
237                                    engine.recorder().record_match(
238                                        next_idx,
239                                        Depth::ZERO,
240                                        crate::result::MatchedNodeType::Atomic,
241                                    )?;
242                                    let mut classifier = head_skip.simd.resume_structural_classification(classifier_state);
243                                    let next_structural = classifier.next()?;
244
245                                    match next_structural {
246                                        Some(s) => engine.recorder().record_value_terminator(s.idx(), Depth::ZERO)?,
247                                        None => return Err(EngineError::MissingClosingCharacter()),
248                                    }
249                                    classifier.stop()
250                                }
251                                _ => classifier_state,
252                            };
253
254                            debug!("Quote classified up to {}", classifier_state.get_idx());
255                            idx = classifier_state.get_idx();
256
257                            first_block = classifier_state.block.map(|b| b.block.block);
258                            input_iter = classifier_state.iter.into_inner();
259                        }
260                        _ => idx += 1,
261                    }
262                } else {
263                    debug!("No memmem matches, exiting");
264                    break;
265                }
266            }
267
268            return Ok(());
269
270            /// Move the state forward to `index`.
271            ///
272            /// # Errors
273            /// If the offset crosses block boundaries, then a new block is read from the underlying
274            /// [`Input`](crate::input::Input) implementation, which can fail.
275            ///
276            /// # Panics
277            /// If the `index` is not ahead of the current position of the state ([`get_idx`](ResumeClassifierState::get_idx)).
278            #[inline(always)]
279            #[allow(clippy::panic_in_result_fn, reason = "Err here is only raised for input errors, assertions check programming bugs")]
280            fn forward_to<'i, I, Q, M, const N: usize>(state: &mut ResumeClassifierState<'i, I, Q, M, N>, index: usize) -> Result<(), InputError>
281            where
282                I: InputBlockIterator<'i, N>,
283                Q: QuoteClassifiedIterator<'i, I, M, N>,
284            {
285                let current_block_start = state.iter.get_offset();
286                let current_block_idx = state.block.as_ref().map_or(0, |b| b.idx);
287                let current_idx = current_block_start + current_block_idx;
288
289                debug!(
290                    "Calling forward_to({index}) when the inner iter offset is {current_block_start} and block idx is {current_block_idx:?}"
291                );
292
293                // We want to move by this much forward.
294                assert!(index > current_idx, "must forward by at least one byte");
295                let delta = index - current_idx;
296
297                // First we virtually pretend to move *backward*, setting the index of the current block to zero,
298                // and adjust the delta to cover that distance. This makes calculations simpler.
299                // Then we need to skip zero or more blocks and set our self.block to the last one we visit.
300                let remaining = delta + current_block_idx;
301                let blocks_to_skip = remaining / N;
302                let remainder = remaining % N;
303
304                match state.block.as_mut() {
305                    Some(b) if blocks_to_skip == 0 => {
306                        b.idx = remaining;
307                    }
308                    Some(_) => {
309                        state.block = state
310                            .iter
311                            .offset(blocks_to_skip as isize)?
312                            .map(|b| ResumeClassifierBlockState {
313                                block: b,
314                                idx: remainder,
315                            });
316                    }
317                    None => {
318                        state.block = state
319                            .iter
320                            .offset((blocks_to_skip + 1) as isize)?
321                            .map(|b| ResumeClassifierBlockState {
322                                block: b,
323                                idx: remainder,
324                            });
325                    }
326                }
327
328                debug!("forward_to({index}) results in idx moved to {}", state.get_idx());
329
330                Ok(())
331            }
332        })
333    }
334}