dcbor_pattern/pattern/
vm.rs

1//! Tiny Thompson-style VM for walking dCBOR trees.
2//!
3//! The VM runs byte-code produced by `Pattern::compile` methods.
4
5use std::collections::{HashMap, HashSet};
6
7use dcbor::prelude::*;
8
9use super::{Matcher, Path, Pattern};
10use crate::{Quantifier, Reluctance};
11
12/// Navigation axis for traversing dCBOR tree structures.
13#[derive(Debug, Clone, Copy, PartialEq, Eq)]
14pub enum Axis {
15    /// Navigate to array elements
16    ArrayElement,
17    /// Navigate to map keys
18    MapKey,
19    /// Navigate to map values
20    MapValue,
21    /// Navigate to tagged value content
22    TaggedContent,
23}
24
25impl Axis {
26    /// Return child CBOR values reachable from `cbor` via this axis.
27    pub fn children(&self, cbor: &CBOR) -> Vec<CBOR> {
28        match (self, cbor.as_case()) {
29            (Axis::ArrayElement, CBORCase::Array(arr)) => arr.clone(),
30            (Axis::MapKey, CBORCase::Map(map)) => {
31                map.iter().map(|(k, _v)| k.clone()).collect()
32            }
33            (Axis::MapValue, CBORCase::Map(map)) => {
34                map.iter().map(|(_k, v)| v.clone()).collect()
35            }
36            (Axis::TaggedContent, CBORCase::Tagged(_, content)) => {
37                vec![content.clone()]
38            }
39            _ => Vec::new(),
40        }
41    }
42}
43
44/// Bytecode instructions for the pattern VM.
45#[derive(Debug, Clone)]
46pub enum Instr {
47    /// Match predicate: `literals[idx].matches(cbor)`
48    MatchPredicate(usize),
49    /// Match structure: use `literals[idx].paths(cbor)` for structure patterns
50    MatchStructure(usize),
51    /// ε-split: fork execution to `a` and `b`
52    Split { a: usize, b: usize },
53    /// Unconditional jump to instruction at index
54    Jump(usize),
55    /// Descend to children via axis, one thread per child
56    PushAxis(Axis),
57    /// Pop one CBOR value from the path
58    Pop,
59    /// Emit current path
60    Save,
61    /// Final accept, emit current path and halt thread
62    Accept,
63    /// Recursively search for pattern at `pat_idx` and propagate captures
64    Search {
65        pat_idx: usize,
66        capture_map: Vec<(String, usize)>,
67    },
68    /// Save current path and start new sequence from last CBOR value
69    ExtendSequence,
70    /// Combine saved path with current path for final result
71    CombineSequence,
72    /// Match only if pattern at `pat_idx` does not match
73    NotMatch { pat_idx: usize },
74    /// Repeat a sub pattern according to range and greediness
75    Repeat {
76        pat_idx: usize,
77        quantifier: Quantifier,
78    },
79    /// Mark the start of a capture group
80    CaptureStart(usize),
81    /// Mark the end of a capture group
82    CaptureEnd(usize),
83}
84
85#[derive(Debug, Clone)]
86pub struct Program {
87    pub code: Vec<Instr>,
88    pub literals: Vec<Pattern>,
89    pub capture_names: Vec<String>,
90}
91
92/// Internal back-tracking state.
93#[derive(Clone)]
94struct Thread {
95    pc: usize,
96    cbor: CBOR,
97    path: Path,
98    /// Stack of saved paths for nested sequence patterns
99    saved_paths: Vec<Path>,
100    captures: Vec<Vec<Path>>,
101    capture_stack: Vec<Vec<usize>>,
102}
103
104/// Match atomic patterns without recursion into the VM.
105///
106/// This function handles only the patterns that are safe to use in
107/// MatchPredicate instructions - Value, Structure, Any, and None patterns. Meta
108/// patterns should never be passed to this function as they need to be compiled
109/// to bytecode.
110#[allow(clippy::panic)]
111pub(crate) fn atomic_paths(
112    p: &crate::pattern::Pattern,
113    cbor: &CBOR,
114) -> Vec<Path> {
115    use crate::pattern::Pattern::*;
116    match p {
117        Value(v) => v.paths(cbor),
118        Structure(s) => s.paths(cbor),
119        Meta(meta) => match meta {
120            crate::pattern::meta::MetaPattern::Any(_) => {
121                vec![vec![cbor.clone()]]
122            }
123            crate::pattern::meta::MetaPattern::Search(_) => {
124                panic!(
125                    "SearchPattern should be compiled to Search instruction, not MatchPredicate"
126                );
127            }
128            _ => panic!(
129                "non-atomic meta pattern used in MatchPredicate: {:?}",
130                meta
131            ),
132        },
133    }
134}
135
136fn repeat_paths(
137    pat: &Pattern,
138    cbor: &CBOR,
139    path: &Path,
140    quantifier: Quantifier,
141) -> Vec<(CBOR, Path)> {
142    // Build states for all possible repetition counts
143    let mut states: Vec<Vec<(CBOR, Path)>> =
144        vec![vec![(cbor.clone(), path.clone())]];
145    let bound = quantifier.max().unwrap_or(usize::MAX);
146
147    // Try matching the pattern repeatedly
148    for _ in 0..bound {
149        let mut next = Vec::new();
150        for (c, pth) in states.last().unwrap().iter() {
151            for sub_path in pat.paths(c) {
152                if let Some(last) = sub_path.last() {
153                    if last.to_cbor_data() == c.to_cbor_data() {
154                        continue; // Avoid infinite loops
155                    }
156                    let mut combined = pth.clone();
157                    if sub_path.first() == Some(c) {
158                        combined.extend(sub_path.iter().skip(1).cloned());
159                    } else {
160                        combined.extend(sub_path.iter().cloned());
161                    }
162                    next.push((last.clone(), combined));
163                }
164            }
165        }
166        if next.is_empty() {
167            break; // No more matches possible
168        }
169        states.push(next);
170    }
171
172    // Zero repetition case
173    let has_zero_rep = quantifier.min() == 0;
174    let zero_rep_result = if has_zero_rep {
175        vec![(cbor.clone(), path.clone())]
176    } else {
177        vec![]
178    };
179
180    // Calculate maximum allowed repetitions
181    let max_possible = states.len() - 1;
182    let max_allowed = bound.min(max_possible);
183
184    // Check if we can satisfy the minimum repetition requirement
185    if max_allowed < quantifier.min() && quantifier.min() > 0 {
186        return Vec::new();
187    }
188
189    // Calculate the range of repetition counts based on min and max
190    // Ensure we don't include zero here - it's handled separately
191    let min_count = if quantifier.min() == 0 {
192        1
193    } else {
194        quantifier.min()
195    };
196    let max_count = if max_allowed < min_count {
197        return zero_rep_result;
198    } else {
199        max_allowed
200    };
201
202    let count_range = min_count..=max_count;
203
204    // Generate list of counts to try based on reluctance
205    let counts: Vec<usize> = match quantifier.reluctance() {
206        Reluctance::Greedy => count_range.rev().collect(),
207        Reluctance::Lazy => count_range.collect(),
208        Reluctance::Possessive => {
209            if max_count >= min_count {
210                vec![max_count]
211            } else {
212                vec![]
213            }
214        }
215    };
216
217    // Collect results based on the counts determined above
218    let mut out = Vec::new();
219
220    // For greedy repetition, try higher counts first
221    if matches!(quantifier.reluctance(), Reluctance::Greedy) {
222        // Include results from counts determined by reluctance
223        for c in counts {
224            if let Some(list) = states.get(c) {
225                out.extend(list.clone());
226            }
227        }
228
229        // For greedy matching, add zero repetition case at the end if
230        // applicable
231        if has_zero_rep && out.is_empty() {
232            out.push((cbor.clone(), path.clone()));
233        }
234    } else {
235        // For lazy/possessive, include zero repetition first if applicable
236        if has_zero_rep {
237            out.push((cbor.clone(), path.clone()));
238        }
239
240        // Then include results from counts determined by reluctance
241        for c in counts {
242            if let Some(list) = states.get(c) {
243                out.extend(list.clone());
244            }
245        }
246    }
247
248    out
249}
250
251/// Execute `prog` starting at `root`. Every time `SAVE` or `ACCEPT` executes,
252/// current `path` is pushed into result.
253/// Execute a single thread until it halts. Returns true if any paths were
254/// produced.
255fn run_thread(
256    prog: &Program,
257    start: Thread,
258    out: &mut Vec<(Path, Vec<Vec<Path>>)>,
259) -> bool {
260    use Instr::*;
261    let mut produced = false;
262    let mut stack = vec![start];
263
264    while let Some(mut th) = stack.pop() {
265        loop {
266            match prog.code[th.pc] {
267                MatchPredicate(idx) => {
268                    if atomic_paths(&prog.literals[idx], &th.cbor).is_empty() {
269                        break;
270                    }
271                    th.pc += 1;
272                }
273                MatchStructure(idx) => {
274                    // Use the structure pattern's matcher, with captures if
275                    // present
276                    if let crate::pattern::Pattern::Structure(sp) =
277                        &prog.literals[idx]
278                    {
279                        let (structure_paths, structure_captures) =
280                            sp.paths_with_captures(&th.cbor);
281
282                        if structure_paths.is_empty() {
283                            break;
284                        }
285
286                        // Merge structure captures into thread captures
287                        for (i, name) in prog.capture_names.iter().enumerate() {
288                            if let Some(captured_paths) =
289                                structure_captures.get(name)
290                            {
291                                // Ensure capture storage is initialized
292                                while th.captures.len() <= i {
293                                    th.captures.push(Vec::new());
294                                }
295                                th.captures[i].extend(captured_paths.clone());
296                            }
297                        }
298
299                        // Handle structure paths
300                        if structure_paths.len() == 1
301                            && structure_paths[0].len() == 1
302                        {
303                            // Simple case: single path with single element
304                            th.pc += 1;
305                        } else {
306                            // Complex case: multiple paths or multi-element
307                            // paths
308                            for structure_path in structure_paths {
309                                if let Some(target) = structure_path.last() {
310                                    let mut new_thread = th.clone();
311                                    new_thread.cbor = target.clone();
312                                    new_thread.path.extend(
313                                        structure_path.iter().skip(1).cloned(),
314                                    );
315                                    new_thread.pc += 1;
316                                    stack.push(new_thread);
317                                }
318                            }
319                            break;
320                        }
321                    } else {
322                        panic!(
323                            "MatchStructure used with non-structure pattern"
324                        );
325                    }
326                }
327                Split { a, b } => {
328                    let mut th2 = th.clone();
329                    th2.pc = b;
330                    stack.push(th2);
331                    th.pc = a;
332                }
333                Jump(addr) => {
334                    th.pc = addr;
335                }
336                PushAxis(axis) => {
337                    let children = axis.children(&th.cbor);
338                    for child in children {
339                        let mut new_thread = th.clone();
340                        new_thread.cbor = child.clone();
341                        new_thread.path.push(child);
342                        new_thread.pc += 1;
343                        stack.push(new_thread);
344                    }
345                    break;
346                }
347                Pop => {
348                    if th.path.is_empty() {
349                        break;
350                    }
351                    th.path.pop();
352                    if let Some(parent) = th.path.last() {
353                        th.cbor = parent.clone();
354                    }
355                    th.pc += 1;
356                }
357                Save => {
358                    out.push((th.path.clone(), th.captures.clone()));
359                    produced = true;
360                    th.pc += 1;
361                }
362                Accept => {
363                    out.push((th.path.clone(), th.captures.clone()));
364                    produced = true;
365                    break;
366                }
367                Search { pat_idx, ref capture_map } => {
368                    // Implement recursive search pattern with capture support
369                    let (search_results, captures) =
370                        prog.literals[pat_idx].paths_with_captures(&th.cbor);
371
372                    for search_path in search_results {
373                        let mut new_thread = th.clone();
374                        new_thread.path = search_path.clone();
375
376                        // Apply capture mappings - map captured paths to thread
377                        // state
378                        for (name, capture_idx) in capture_map {
379                            if *capture_idx < new_thread.captures.len() {
380                                if let Some(capture_paths) = captures.get(name)
381                                {
382                                    for capture_path in capture_paths {
383                                        new_thread.captures[*capture_idx]
384                                            .push(capture_path.clone());
385                                    }
386                                }
387                            }
388                        }
389
390                        new_thread.pc += 1;
391                        stack.push(new_thread);
392                    }
393                    break;
394                }
395                ExtendSequence => {
396                    th.saved_paths.push(th.path.clone());
397                    if let Some(last) = th.path.last().cloned() {
398                        th.path = vec![last.clone()];
399                        th.cbor = last;
400                    }
401                    th.pc += 1;
402                }
403                CombineSequence => {
404                    if let Some(saved) = th.saved_paths.pop() {
405                        let mut combined = saved;
406                        if th.path.len() > 1 {
407                            combined.extend(th.path.iter().skip(1).cloned());
408                        }
409                        th.path = combined;
410                    }
411                    th.pc += 1;
412                }
413                NotMatch { pat_idx } => {
414                    if !prog.literals[pat_idx].paths(&th.cbor).is_empty() {
415                        break; // Pattern matched, so NOT pattern fails
416                    }
417                    th.pc += 1;
418                }
419                Repeat { pat_idx, quantifier } => {
420                    let repeat_results = repeat_paths(
421                        &prog.literals[pat_idx],
422                        &th.cbor,
423                        &th.path,
424                        quantifier,
425                    );
426                    for (result_cbor, result_path) in repeat_results {
427                        let mut new_thread = th.clone();
428                        new_thread.cbor = result_cbor;
429                        new_thread.path = result_path;
430                        new_thread.pc += 1;
431                        stack.push(new_thread);
432                    }
433                    break;
434                }
435                CaptureStart(idx) => {
436                    // Initialize capture group
437                    while th.captures.len() <= idx {
438                        th.captures.push(Vec::new());
439                    }
440                    while th.capture_stack.len() <= idx {
441                        th.capture_stack.push(Vec::new());
442                    }
443                    // Store the current path to capture it at CaptureEnd
444                    th.capture_stack[idx].push(th.path.len());
445                    th.pc += 1;
446                }
447                CaptureEnd(idx) => {
448                    // Finalize capture group
449                    if let Some(_start_len) = th
450                        .capture_stack
451                        .get_mut(idx)
452                        .and_then(|stack| stack.pop())
453                    {
454                        // For captures, we want to capture the full path to the
455                        // current CBOR value
456                        // not just the delta since CaptureStart
457                        let captured_path = th.path.clone();
458                        if let Some(captures) = th.captures.get_mut(idx) {
459                            captures.push(captured_path);
460                        }
461                    }
462                    th.pc += 1;
463                }
464            }
465        }
466    }
467
468    produced
469}
470
471/// Execute a program against a dCBOR value, returning all matching paths and
472/// captures.
473pub fn run(
474    prog: &Program,
475    root: &CBOR,
476) -> (Vec<Path>, HashMap<String, Vec<Path>>) {
477    let start = Thread {
478        pc: 0,
479        cbor: root.clone(),
480        path: vec![root.clone()],
481        saved_paths: Vec::new(),
482        captures: Vec::new(),
483        capture_stack: Vec::new(),
484    };
485
486    let mut results = Vec::new();
487    run_thread(prog, start, &mut results);
488
489    // Deduplicate paths while preserving original order
490    let mut seen_paths = HashSet::new();
491    let paths: Vec<Path> = results
492        .iter()
493        .filter_map(|(path, _)| {
494            if seen_paths.contains(path) {
495                None // Already seen, skip
496            } else {
497                seen_paths.insert(path.clone());
498                Some(path.clone()) // First occurrence, keep
499            }
500        })
501        .collect();
502
503    // Build capture map from capture names and results
504    // Collect all captured paths from all threads, then deduplicate per capture
505    // while preserving order
506    let mut captures = HashMap::new();
507    for (i, name) in prog.capture_names.iter().enumerate() {
508        let mut captured_paths = Vec::new();
509        for (_, thread_captures) in &results {
510            if let Some(capture_group) = thread_captures.get(i) {
511                captured_paths.extend(capture_group.clone());
512            }
513        }
514
515        // Deduplicate captured paths for this capture name while preserving
516        // order
517        if !captured_paths.is_empty() {
518            let mut seen_capture_paths = HashSet::new();
519            let deduplicated_captured_paths: Vec<Path> = captured_paths
520                .into_iter()
521                .filter(|path| {
522                    if seen_capture_paths.contains(path) {
523                        false // Already seen, skip
524                    } else {
525                        seen_capture_paths.insert(path.clone());
526                        true // First occurrence, keep
527                    }
528                })
529                .collect();
530            captures.insert(name.clone(), deduplicated_captured_paths);
531        }
532    }
533
534    (paths, captures)
535}
536
537/// VM for executing pattern programs against dCBOR values.
538pub struct Vm;
539
540impl Vm {
541    /// Execute a program against a dCBOR value.
542    pub fn run(
543        prog: &Program,
544        root: &CBOR,
545    ) -> (Vec<Path>, HashMap<String, Vec<Path>>) {
546        run(prog, root)
547    }
548}