rsonpath/engine/head_skipping.rs
1//! Engine decorator that performs **head skipping** – an extremely optimized search for
2//! the first matching member name in a query starting with a self-looping state.
3//! This happens in queries starting with a descendant selector.
4
5use std::sync::Arc;
6
7use crate::{
8 automaton::{Automaton, State},
9 classification::{
10 mask::Mask,
11 memmem::Memmem,
12 quotes::{InnerIter, QuoteClassifiedIterator, ResumedQuoteClassifier},
13 simd::{dispatch_simd, Simd},
14 structural::{BracketType, Structural, StructuralIterator},
15 ResumeClassifierBlockState, ResumeClassifierState,
16 },
17 debug,
18 depth::Depth,
19 engine::EngineError,
20 input::{
21 error::{InputError, InputErrorConvertible},
22 Input, InputBlockIterator,
23 },
24 result::Recorder,
25 string_pattern::StringPattern,
26 FallibleIterator, MaskType, BLOCK_SIZE,
27};
28
29/// Trait that needs to be implemented by an [`Engine`](`super::Engine`) to use this submodule.
30pub(super) trait CanHeadSkip<'i, 'r, I, R, V>
31where
32 I: Input + 'i,
33 R: Recorder<I::Block<'i, BLOCK_SIZE>>,
34 V: Simd,
35{
36 /// Function called when head-skipping finds a member name at which normal query execution
37 /// should resume.
38 ///
39 /// The [`HeadSkip::run_head_skipping`] function will call this implementation
40 /// whenever it finds a member name matching the first transition in the query.
41 /// The structural `classifier` passed is guaranteed to have classified the
42 /// `next_event` and nothing past that. It is guaranteed that
43 /// `next_event` is [`Structural::Opening`].
44 ///
45 /// When called, the engine must start with in the automaton state as given in `state`
46 /// and execute the query until a matching [`Structural::Closing`] character is encountered,
47 /// using `classifier` for classification and `result` for reporting query results. The `classifier`
48 /// must *not* be used to classify anything past the matching [`Structural::Closing`] character.
49 fn run_on_subtree(
50 &mut self,
51 next_event: Structural,
52 state: State,
53 structural_classifier: V::StructuralClassifier<'i, I::BlockIterator<'i, 'r, R, BLOCK_SIZE>>,
54 ) -> Result<ResumeState<'i, I::BlockIterator<'i, 'r, R, BLOCK_SIZE>, V, MaskType>, EngineError>;
55
56 fn recorder(&mut self) -> &'r R;
57}
58
59pub(super) struct ResumeState<'i, I, V, M>(
60 pub(super) ResumeClassifierState<'i, I, V::QuotesClassifier<'i, I>, M, BLOCK_SIZE>,
61)
62where
63 I: InputBlockIterator<'i, BLOCK_SIZE>,
64 V: Simd;
65
66/// Configuration of the head-skipping decorator.
67pub(super) struct HeadSkip<'b, I, V, const N: usize> {
68 bytes: &'b I,
69 state: State,
70 is_accepting: bool,
71 member_name: Arc<StringPattern>,
72 simd: V,
73}
74
75impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> {
76 /// Create a new instance of the head-skipping decorator over a given input
77 /// and for a compiled query [`Automaton`].
78 ///
79 /// # Returns
80 /// If head-skipping is possible for the query represented by `automaton`,
81 /// returns [`Some`] with a configured instance of [`HeadSkip`].
82 /// If head-skipping is not possible, returns [`None`].
83 ///
84 /// ## Details
85 /// Head-skipping is possible if the query automaton starts
86 /// with a state with a wildcard self-loop and a single member-labelled transition forward.
87 /// Syntactically, if the [`fallback_state`](`crate::query::automaton::StateTable::fallback_state`)
88 /// of the [`initial_state`](`crate::query::automaton::StateTable::initial_state`) is the same as the
89 /// [`initial_state`](`crate::query::automaton::StateTable::initial_state`), and its
90 /// [`transitions`](`crate::query::automaton::StateTable::transitions`) are a single-element list.
91 ///
92 /// This means that we can search for the label of the forward transition in the entire document,
93 /// disregarding any additional structure – during execution we would always loop
94 /// around in the initial state until encountering the desired member name. This search can be done
95 /// extremely quickly with [`classification::memmem`](crate::classification::memmem).
96 ///
97 /// In all other cases, head-skipping is not supported.
98 pub(super) fn new(bytes: &'b I, automaton: &Automaton, simd: V) -> Option<Self> {
99 let initial_state = automaton.initial_state();
100 let fallback_state = automaton[initial_state].fallback_state();
101 let transitions = automaton[initial_state].member_transitions();
102
103 if fallback_state == initial_state
104 && transitions.len() == 1
105 && automaton[initial_state].array_transitions().is_empty()
106 {
107 let (member_name, target_state) = &transitions[0];
108 debug!("Automaton starts with a descendant search, using memmem heuristic.");
109 return Some(Self {
110 bytes,
111 state: *target_state,
112 is_accepting: automaton.is_accepting(*target_state),
113 member_name: member_name.clone(),
114 simd,
115 });
116 }
117
118 None
119 }
120
121 /// Run a preconfigured [`HeadSkip`] using the given `engine` and reporting
122 /// to the `result`.
123 pub(super) fn run_head_skipping<'r, E, R>(&self, engine: &mut E) -> Result<(), EngineError>
124 where
125 'b: 'r,
126 E: CanHeadSkip<'b, 'r, I, R, V>,
127 R: Recorder<I::Block<'b, BLOCK_SIZE>> + 'r,
128 {
129 dispatch_simd!(self.simd; self, engine =>
130 fn<'b, 'r, I, V, E, R>(head_skip: &HeadSkip<'b, I, V, BLOCK_SIZE>, engine: &mut E) -> Result<(), EngineError>
131 where
132 'b: 'r,
133 E: CanHeadSkip<'b, 'r, I, R, V>,
134 R: Recorder<I::Block<'b, BLOCK_SIZE>> + 'r,
135 I: Input,
136 V: Simd
137 {
138 let mut input_iter = head_skip.bytes.iter_blocks(engine.recorder());
139 let mut idx = 0;
140 let mut first_block = None;
141
142 loop {
143 let mut memmem = head_skip.simd.memmem(head_skip.bytes, &mut input_iter);
144 debug!("Starting memmem search from {idx}");
145
146 if let Some((starting_quote_idx, last_block)) = memmem.find_label(first_block, idx, head_skip.member_name.as_ref())? {
147 drop(memmem);
148
149 first_block = Some(last_block);
150 idx = starting_quote_idx;
151 debug!("Needle found at {idx}");
152 let seek_start_idx = idx + head_skip.member_name.quoted().len();
153
154 match head_skip.bytes.seek_non_whitespace_forward(seek_start_idx).e()? {
155 Some((colon_idx, b':')) => {
156 let (next_idx, next_c) = head_skip
157 .bytes
158 .seek_non_whitespace_forward(colon_idx + 1).e()?
159 .ok_or(EngineError::MissingItem())?;
160
161 let ResumedQuoteClassifier {
162 classifier: quote_classifier,
163 first_block: quote_classified_first_block,
164 } = head_skip.simd.resume_quote_classification(input_iter, first_block);
165
166 // Temporarily set the index within the current block to zero.
167 // This makes sense for the move below.
168 let mut classifier_state = ResumeClassifierState {
169 iter: quote_classifier,
170 block: quote_classified_first_block
171 .map(|b| ResumeClassifierBlockState { block: b, idx: 0 }),
172 are_colons_on: false,
173 are_commas_on: head_skip.is_accepting,
174 };
175
176 debug!("Actual match with colon at {colon_idx}");
177 debug!("Next significant character at {next_idx}");
178 debug!("Classifier is at {}", classifier_state.get_idx());
179 debug!("We will forward to {colon_idx} first, then to {next_idx}",);
180
181 // Now we want to move the entire iterator state so that the current block is quote-classified,
182 // and correctly points to the place the engine would expect had it found the matching key
183 // in the regular loop. If the value is atomic, we handle it ourselves. If the value is complex,
184 // the engine wants to start one byte *after* the opening character. However, the match report
185 // has to happen before we advance one more byte, or else the opening character might be lost
186 // in the output (if it happens at a block boundary).
187 if next_c == b'{' || next_c == b'[' {
188 forward_to(&mut classifier_state, next_idx)?;
189 if head_skip.is_accepting {
190 engine.recorder().record_match(
191 next_idx,
192 Depth::ZERO,
193 crate::result::MatchedNodeType::Complex,
194 )?;
195 }
196 forward_to(&mut classifier_state, next_idx + 1)?;
197 } else {
198 forward_to(&mut classifier_state, next_idx)?;
199 };
200
201 // We now have the block where we want and we ran quote classification, but during the `forward_to`
202 // call we lose all the flow-through quote information that usually is passed from one block to the next.
203 // We need to manually verify the soundness of the classification. Fortunately:
204 // 1. we know that resume_idx is either the start of a value, or one byte after an opening -
205 // in a valid JSON this character can be within quotes if and only if it is itself a quote;
206 // 2. the only way the mask can be wrong is if it is flipped - marks chars within quotes
207 // as outside and vice versa - so it suffices to flip it if it is wrong.
208 if let Some(block) = classifier_state.block.as_mut() {
209 let should_be_quoted = block.block.block[block.idx] == b'"';
210 if block.block.within_quotes_mask.is_lit(block.idx) != should_be_quoted {
211 debug!("Mask needs flipping!");
212 block.block.within_quotes_mask = !block.block.within_quotes_mask;
213 classifier_state.iter.flip_quotes_bit();
214 }
215 }
216
217 classifier_state = match next_c {
218 b'{' | b'[' => {
219 debug!("resuming");
220 let classifier = head_skip.simd.resume_structural_classification(classifier_state);
221 engine
222 .run_on_subtree(
223 Structural::Opening(
224 if next_c == b'{' {
225 BracketType::Curly
226 } else {
227 BracketType::Square
228 },
229 next_idx,
230 ),
231 head_skip.state,
232 classifier,
233 )?
234 .0
235 }
236 _ if head_skip.is_accepting => {
237 engine.recorder().record_match(
238 next_idx,
239 Depth::ZERO,
240 crate::result::MatchedNodeType::Atomic,
241 )?;
242 let mut classifier = head_skip.simd.resume_structural_classification(classifier_state);
243 let next_structural = classifier.next()?;
244
245 match next_structural {
246 Some(s) => engine.recorder().record_value_terminator(s.idx(), Depth::ZERO)?,
247 None => return Err(EngineError::MissingClosingCharacter()),
248 }
249 classifier.stop()
250 }
251 _ => classifier_state,
252 };
253
254 debug!("Quote classified up to {}", classifier_state.get_idx());
255 idx = classifier_state.get_idx();
256
257 first_block = classifier_state.block.map(|b| b.block.block);
258 input_iter = classifier_state.iter.into_inner();
259 }
260 _ => idx += 1,
261 }
262 } else {
263 debug!("No memmem matches, exiting");
264 break;
265 }
266 }
267
268 return Ok(());
269
270 /// Move the state forward to `index`.
271 ///
272 /// # Errors
273 /// If the offset crosses block boundaries, then a new block is read from the underlying
274 /// [`Input`](crate::input::Input) implementation, which can fail.
275 ///
276 /// # Panics
277 /// If the `index` is not ahead of the current position of the state ([`get_idx`](ResumeClassifierState::get_idx)).
278 #[inline(always)]
279 #[allow(clippy::panic_in_result_fn)]
280 fn forward_to<'i, I, Q, M, const N: usize>(state: &mut ResumeClassifierState<'i, I, Q, M, N>, index: usize) -> Result<(), InputError>
281 where
282 I: InputBlockIterator<'i, N>,
283 Q: QuoteClassifiedIterator<'i, I, M, N>,
284 {
285 let current_block_start = state.iter.get_offset();
286 let current_block_idx = state.block.as_ref().map_or(0, |b| b.idx);
287 let current_idx = current_block_start + current_block_idx;
288
289 debug!(
290 "Calling forward_to({index}) when the inner iter offset is {current_block_start} and block idx is {current_block_idx:?}"
291 );
292
293 // We want to move by this much forward, and delta > 0.
294 assert!(index > current_idx);
295 let delta = index - current_idx;
296
297 // First we virtually pretend to move *backward*, setting the index of the current block to zero,
298 // and adjust the delta to cover that distance. This makes calculations simpler.
299 // Then we need to skip zero or more blocks and set our self.block to the last one we visit.
300 let remaining = delta + current_block_idx;
301 let blocks_to_skip = remaining / N;
302 let remainder = remaining % N;
303
304 match state.block.as_mut() {
305 Some(b) if blocks_to_skip == 0 => {
306 b.idx = remaining;
307 }
308 Some(_) => {
309 state.block = state
310 .iter
311 .offset(blocks_to_skip as isize)?
312 .map(|b| ResumeClassifierBlockState {
313 block: b,
314 idx: remainder,
315 });
316 }
317 None => {
318 state.block = state
319 .iter
320 .offset((blocks_to_skip + 1) as isize)?
321 .map(|b| ResumeClassifierBlockState {
322 block: b,
323 idx: remainder,
324 });
325 }
326 }
327
328 debug!("forward_to({index}) results in idx moved to {}", state.get_idx());
329
330 Ok(())
331 }
332 })
333 }
334}