dcbor_pattern/pattern/
vm.rs

1//! Tiny Thompson-style VM for walking dCBOR trees.
2//!
3//! The VM runs byte-code produced by `Pattern::compile` methods.
4
5use std::collections::{HashMap, HashSet};
6
7use dcbor::prelude::*;
8
9use super::{Matcher, Path, Pattern};
10use crate::{Quantifier, Reluctance};
11
12/// Navigation axis for traversing dCBOR tree structures.
13#[derive(Debug, Clone, Copy, PartialEq, Eq)]
14pub enum Axis {
15    /// Navigate to array elements
16    ArrayElement,
17    /// Navigate to map keys
18    MapKey,
19    /// Navigate to map values
20    MapValue,
21    /// Navigate to tagged value content
22    TaggedContent,
23}
24
25impl Axis {
26    /// Return child CBOR values reachable from `cbor` via this axis.
27    pub fn children(&self, cbor: &CBOR) -> Vec<CBOR> {
28        match (self, cbor.as_case()) {
29            (Axis::ArrayElement, CBORCase::Array(arr)) => arr.clone(),
30            (Axis::MapKey, CBORCase::Map(map)) => {
31                map.iter().map(|(k, _v)| k.clone()).collect()
32            }
33            (Axis::MapValue, CBORCase::Map(map)) => {
34                map.iter().map(|(_k, v)| v.clone()).collect()
35            }
36            (Axis::TaggedContent, CBORCase::Tagged(_, content)) => {
37                vec![content.clone()]
38            }
39            _ => Vec::new(),
40        }
41    }
42}
43
44/// Bytecode instructions for the pattern VM.
45#[derive(Debug, Clone)]
46pub enum Instr {
47    /// Match predicate: `literals[idx].matches(cbor)`
48    MatchPredicate(usize),
49    /// Match structure: use `literals[idx].paths(cbor)` for structure patterns
50    MatchStructure(usize),
51    /// ε-split: fork execution to `a` and `b`
52    Split { a: usize, b: usize },
53    /// Unconditional jump to instruction at index
54    Jump(usize),
55    /// Descend to children via axis, one thread per child
56    PushAxis(Axis),
57    /// Pop one CBOR value from the path
58    Pop,
59    /// Emit current path
60    Save,
61    /// Final accept, emit current path and halt thread
62    Accept,
63    /// Recursively search for pattern at `pat_idx` and propagate captures
64    Search {
65        pat_idx: usize,
66        capture_map: Vec<(String, usize)>,
67    },
68    /// Save current path and start new sequence from last CBOR value
69    ExtendSequence,
70    /// Combine saved path with current path for final result
71    CombineSequence,
72    /// Match only if pattern at `pat_idx` does not match
73    NotMatch { pat_idx: usize },
74    /// Repeat a sub pattern according to range and greediness
75    Repeat {
76        pat_idx: usize,
77        quantifier: Quantifier,
78    },
79    /// Mark the start of a capture group
80    CaptureStart(usize),
81    /// Mark the end of a capture group
82    CaptureEnd(usize),
83}
84
85#[derive(Debug, Clone)]
86pub struct Program {
87    pub code: Vec<Instr>,
88    pub literals: Vec<Pattern>,
89    pub capture_names: Vec<String>,
90}
91
92/// Internal back-tracking state.
93#[derive(Clone)]
94struct Thread {
95    pc: usize,
96    cbor: CBOR,
97    path: Path,
98    /// Stack of saved paths for nested sequence patterns
99    saved_paths: Vec<Path>,
100    captures: Vec<Vec<Path>>,
101    capture_stack: Vec<Vec<usize>>,
102}
103
104/// Match atomic patterns without recursion into the VM.
105///
106/// This function handles only the patterns that are safe to use in
107/// MatchPredicate instructions - Value, Structure, Any, and None patterns. Meta
108/// patterns should never be passed to this function as they need to be compiled
109/// to bytecode.
110#[allow(clippy::panic)]
111pub(crate) fn atomic_paths(
112    p: &crate::pattern::Pattern,
113    cbor: &CBOR,
114) -> Vec<Path> {
115    use crate::pattern::Pattern::*;
116    match p {
117        Value(v) => v.paths(cbor),
118        Structure(s) => s.paths(cbor),
119        Meta(meta) => match meta {
120            crate::pattern::meta::MetaPattern::Any(_) => {
121                vec![vec![cbor.clone()]]
122            }
123            crate::pattern::meta::MetaPattern::Search(_) => {
124                panic!(
125                    "SearchPattern should be compiled to Search instruction, not MatchPredicate"
126                );
127            }
128            _ => panic!(
129                "non-atomic meta pattern used in MatchPredicate: {:?}",
130                meta
131            ),
132        },
133    }
134}
135
136fn repeat_paths(
137    pat: &Pattern,
138    cbor: &CBOR,
139    path: &Path,
140    quantifier: Quantifier,
141) -> Vec<(CBOR, Path)> {
142    // Build states for all possible repetition counts
143    let mut states: Vec<Vec<(CBOR, Path)>> =
144        vec![vec![(cbor.clone(), path.clone())]];
145    let bound = quantifier.max().unwrap_or(usize::MAX);
146
147    // Try matching the pattern repeatedly
148    for _ in 0..bound {
149        let mut next = Vec::new();
150        for (c, pth) in states.last().unwrap().iter() {
151            for sub_path in pat.paths(c) {
152                if let Some(last) = sub_path.last() {
153                    if last.to_cbor_data() == c.to_cbor_data() {
154                        continue; // Avoid infinite loops
155                    }
156                    let mut combined = pth.clone();
157                    if sub_path.first() == Some(c) {
158                        combined.extend(sub_path.iter().skip(1).cloned());
159                    } else {
160                        combined.extend(sub_path.iter().cloned());
161                    }
162                    next.push((last.clone(), combined));
163                }
164            }
165        }
166        if next.is_empty() {
167            break; // No more matches possible
168        }
169        states.push(next);
170    }
171
172    // Zero repetition case
173    let has_zero_rep = quantifier.min() == 0;
174    let zero_rep_result = if has_zero_rep {
175        vec![(cbor.clone(), path.clone())]
176    } else {
177        vec![]
178    };
179
180    // Calculate maximum allowed repetitions
181    let max_possible = states.len() - 1;
182    let max_allowed = bound.min(max_possible);
183
184    // Check if we can satisfy the minimum repetition requirement
185    if max_allowed < quantifier.min() && quantifier.min() > 0 {
186        return Vec::new();
187    }
188
189    // Calculate the range of repetition counts based on min and max
190    // Ensure we don't include zero here - it's handled separately
191    let min_count = if quantifier.min() == 0 {
192        1
193    } else {
194        quantifier.min()
195    };
196    let max_count = if max_allowed < min_count {
197        return zero_rep_result;
198    } else {
199        max_allowed
200    };
201
202    let count_range = min_count..=max_count;
203
204    // Generate list of counts to try based on reluctance
205    let counts: Vec<usize> = match quantifier.reluctance() {
206        Reluctance::Greedy => count_range.rev().collect(),
207        Reluctance::Lazy => count_range.collect(),
208        Reluctance::Possessive => {
209            if max_count >= min_count {
210                vec![max_count]
211            } else {
212                vec![]
213            }
214        }
215    };
216
217    // Collect results based on the counts determined above
218    let mut out = Vec::new();
219
220    // For greedy repetition, try higher counts first
221    if matches!(quantifier.reluctance(), Reluctance::Greedy) {
222        // Include results from counts determined by reluctance
223        for c in counts {
224            if let Some(list) = states.get(c) {
225                out.extend(list.clone());
226            }
227        }
228
229        // For greedy matching, add zero repetition case at the end if
230        // applicable
231        if has_zero_rep && out.is_empty() {
232            out.push((cbor.clone(), path.clone()));
233        }
234    } else {
235        // For lazy/possessive, include zero repetition first if applicable
236        if has_zero_rep {
237            out.push((cbor.clone(), path.clone()));
238        }
239
240        // Then include results from counts determined by reluctance
241        for c in counts {
242            if let Some(list) = states.get(c) {
243                out.extend(list.clone());
244            }
245        }
246    }
247
248    out
249}
250
251/// Execute `prog` starting at `root`. Every time `SAVE` or `ACCEPT` executes,
252/// current `path` is pushed into result.
253/// Execute a single thread until it halts. Returns true if any paths were
254/// produced.
255fn run_thread(
256    prog: &Program,
257    start: Thread,
258    out: &mut Vec<(Path, Vec<Vec<Path>>)>,
259) -> bool {
260    use Instr::*;
261    let mut produced = false;
262    let mut stack = vec![start];
263
264    while let Some(mut th) = stack.pop() {
265        loop {
266            match prog.code[th.pc] {
267                MatchPredicate(idx) => {
268                    if atomic_paths(&prog.literals[idx], &th.cbor).is_empty() {
269                        break;
270                    }
271                    th.pc += 1;
272                }
273                MatchStructure(idx) => {
274                    // Use the structure pattern's matcher, with captures if
275                    // present
276                    if let crate::pattern::Pattern::Structure(sp) =
277                        &prog.literals[idx]
278                    {
279                        let (structure_paths, structure_captures) =
280                            sp.paths_with_captures(&th.cbor);
281
282                        if structure_paths.is_empty() {
283                            break;
284                        }
285
286                        // Merge structure captures into thread captures
287                        for (i, name) in prog.capture_names.iter().enumerate() {
288                            if let Some(captured_paths) =
289                                structure_captures.get(name)
290                            {
291                                // Ensure capture storage is initialized
292                                while th.captures.len() <= i {
293                                    th.captures.push(Vec::new());
294                                }
295                                th.captures[i].extend(captured_paths.clone());
296                            }
297                        }
298
299                        // Handle structure paths
300                        if structure_paths.len() == 1
301                            && structure_paths[0].len() == 1
302                        {
303                            // Simple case: single path with single element
304                            th.pc += 1;
305                        } else {
306                            // Complex case: multiple paths or multi-element
307                            // paths
308                            for structure_path in structure_paths {
309                                if let Some(target) = structure_path.last() {
310                                    let mut new_thread = th.clone();
311                                    new_thread.cbor = target.clone();
312                                    new_thread.path.extend(
313                                        structure_path.iter().skip(1).cloned(),
314                                    );
315                                    new_thread.pc += 1;
316                                    stack.push(new_thread);
317                                }
318                            }
319                            break;
320                        }
321                    } else {
322                        panic!(
323                            "MatchStructure used with non-structure pattern"
324                        );
325                    }
326                }
327                Split { a, b } => {
328                    let mut th2 = th.clone();
329                    th2.pc = b;
330                    stack.push(th2);
331                    th.pc = a;
332                }
333                Jump(addr) => {
334                    th.pc = addr;
335                }
336                PushAxis(axis) => {
337                    let children = axis.children(&th.cbor);
338                    for child in children {
339                        let mut new_thread = th.clone();
340                        new_thread.cbor = child.clone();
341                        new_thread.path.push(child);
342                        new_thread.pc += 1;
343                        stack.push(new_thread);
344                    }
345                    break;
346                }
347                Pop => {
348                    if th.path.is_empty() {
349                        break;
350                    }
351                    th.path.pop();
352                    if let Some(parent) = th.path.last() {
353                        th.cbor = parent.clone();
354                    }
355                    th.pc += 1;
356                }
357                Save => {
358                    out.push((th.path.clone(), th.captures.clone()));
359                    produced = true;
360                    th.pc += 1;
361                }
362                Accept => {
363                    out.push((th.path.clone(), th.captures.clone()));
364                    produced = true;
365                    break;
366                }
367                Search { pat_idx, ref capture_map } => {
368                    // Implement recursive search pattern with capture support
369                    let (search_results, captures) =
370                        prog.literals[pat_idx].paths_with_captures(&th.cbor);
371
372                    for search_path in search_results {
373                        let mut new_thread = th.clone();
374                        new_thread.path = search_path.clone();
375
376                        // Apply capture mappings - map captured paths to thread
377                        // state
378                        for (name, capture_idx) in capture_map {
379                            if *capture_idx < new_thread.captures.len()
380                                && let Some(capture_paths) = captures.get(name)
381                            {
382                                for capture_path in capture_paths {
383                                    new_thread.captures[*capture_idx]
384                                        .push(capture_path.clone());
385                                }
386                            }
387                        }
388
389                        new_thread.pc += 1;
390                        stack.push(new_thread);
391                    }
392                    break;
393                }
394                ExtendSequence => {
395                    th.saved_paths.push(th.path.clone());
396                    if let Some(last) = th.path.last().cloned() {
397                        th.path = vec![last.clone()];
398                        th.cbor = last;
399                    }
400                    th.pc += 1;
401                }
402                CombineSequence => {
403                    if let Some(saved) = th.saved_paths.pop() {
404                        let mut combined = saved;
405                        if th.path.len() > 1 {
406                            combined.extend(th.path.iter().skip(1).cloned());
407                        }
408                        th.path = combined;
409                    }
410                    th.pc += 1;
411                }
412                NotMatch { pat_idx } => {
413                    if !prog.literals[pat_idx].paths(&th.cbor).is_empty() {
414                        break; // Pattern matched, so NOT pattern fails
415                    }
416                    th.pc += 1;
417                }
418                Repeat { pat_idx, quantifier } => {
419                    let repeat_results = repeat_paths(
420                        &prog.literals[pat_idx],
421                        &th.cbor,
422                        &th.path,
423                        quantifier,
424                    );
425                    for (result_cbor, result_path) in repeat_results {
426                        let mut new_thread = th.clone();
427                        new_thread.cbor = result_cbor;
428                        new_thread.path = result_path;
429                        new_thread.pc += 1;
430                        stack.push(new_thread);
431                    }
432                    break;
433                }
434                CaptureStart(idx) => {
435                    // Initialize capture group
436                    while th.captures.len() <= idx {
437                        th.captures.push(Vec::new());
438                    }
439                    while th.capture_stack.len() <= idx {
440                        th.capture_stack.push(Vec::new());
441                    }
442                    // Store the current path to capture it at CaptureEnd
443                    th.capture_stack[idx].push(th.path.len());
444                    th.pc += 1;
445                }
446                CaptureEnd(idx) => {
447                    // Finalize capture group
448                    if let Some(_start_len) = th
449                        .capture_stack
450                        .get_mut(idx)
451                        .and_then(|stack| stack.pop())
452                    {
453                        // For captures, we want to capture the full path to the
454                        // current CBOR value
455                        // not just the delta since CaptureStart
456                        let captured_path = th.path.clone();
457                        if let Some(captures) = th.captures.get_mut(idx) {
458                            captures.push(captured_path);
459                        }
460                    }
461                    th.pc += 1;
462                }
463            }
464        }
465    }
466
467    produced
468}
469
470/// Execute a program against a dCBOR value, returning all matching paths and
471/// captures.
472pub fn run(
473    prog: &Program,
474    root: &CBOR,
475) -> (Vec<Path>, HashMap<String, Vec<Path>>) {
476    let start = Thread {
477        pc: 0,
478        cbor: root.clone(),
479        path: vec![root.clone()],
480        saved_paths: Vec::new(),
481        captures: Vec::new(),
482        capture_stack: Vec::new(),
483    };
484
485    let mut results = Vec::new();
486    run_thread(prog, start, &mut results);
487
488    // Deduplicate paths while preserving original order
489    let mut seen_paths = HashSet::new();
490    let paths: Vec<Path> = results
491        .iter()
492        .filter_map(|(path, _)| {
493            if seen_paths.contains(path) {
494                None // Already seen, skip
495            } else {
496                seen_paths.insert(path.clone());
497                Some(path.clone()) // First occurrence, keep
498            }
499        })
500        .collect();
501
502    // Build capture map from capture names and results
503    // Collect all captured paths from all threads, then deduplicate per capture
504    // while preserving order
505    let mut captures = HashMap::new();
506    for (i, name) in prog.capture_names.iter().enumerate() {
507        let mut captured_paths = Vec::new();
508        for (_, thread_captures) in &results {
509            if let Some(capture_group) = thread_captures.get(i) {
510                captured_paths.extend(capture_group.clone());
511            }
512        }
513
514        // Deduplicate captured paths for this capture name while preserving
515        // order
516        if !captured_paths.is_empty() {
517            let mut seen_capture_paths = HashSet::new();
518            let deduplicated_captured_paths: Vec<Path> = captured_paths
519                .into_iter()
520                .filter(|path| {
521                    if seen_capture_paths.contains(path) {
522                        false // Already seen, skip
523                    } else {
524                        seen_capture_paths.insert(path.clone());
525                        true // First occurrence, keep
526                    }
527                })
528                .collect();
529            captures.insert(name.clone(), deduplicated_captured_paths);
530        }
531    }
532
533    (paths, captures)
534}
535
536/// VM for executing pattern programs against dCBOR values.
537pub struct Vm;
538
539impl Vm {
540    /// Execute a program against a dCBOR value.
541    pub fn run(
542        prog: &Program,
543        root: &CBOR,
544    ) -> (Vec<Path>, HashMap<String, Vec<Path>>) {
545        run(prog, root)
546    }
547}