Skip to main content

gap_match/
gap_match.rs

1//! Gap-tolerant matching built on top of `eregex`.
2//!
3//! The engine matches patterns *contiguously*. Real-world inputs (OCR output,
4//! noisy logs) sometimes split an intended target across junk, e.g.
5//!
6//! ```text
7//! "Microcontroller STM32F dutyu7 8 407VGT6   "
8//! ```
9//!
10//! where the part number `STM32F407VGT6` is broken by the noise `" dutyu7 8 "`.
11//! A plain `find` (and even the end-anchored `find_partial`) returns `None`
12//! here, because the space after `F` is a hard mismatch and the match can
13//! never reach end-of-input through the noise.
14//!
15//! True fuzzy / gap matching is on the roadmap (`Fuzzy / approximate matching`
16//! in `README.md`). But you can get the **same observable behaviour today**
17//! with a small "shift-and-resume" search:
18//!
19//! 1. Decompose the target into ordered sub-patterns (segments).
20//! 2. Find each segment with [`Regex::find_at`], advancing a cursor.
21//! 3. Bound the noise between consecutive segments with `max_gap`.
22//! 4. Stitch the segment texts into the reconstructed target.
23//!
24//! [`Regex::find_partial`] is then used to (a) confirm the reconstruction is a
25//! genuine *full* match (groups and all) and (b) detect the related end-of-input
26//! *partial* cases (`STM32F`, `STM32F40`, `STM32F407VG`).
27//!
28//! Run with: `cargo run --example gap_match`
29//!
30//! # Caveats (read these before adopting the pattern)
31//!
32//! * Segment boundaries are **user-defined** — you decide where noise may
33//!   occur. Picking them at group boundaries is usually right, but the choice
34//!   is pattern-specific.
35//! * Gaps are allowed **between** segments, never inside one. A wrong
36//!   character *inside* a segment (e.g. `STM32FX07...`) makes that segment
37//!   unfindable, yielding `None` — which happens to match the desired NoMatch,
38//!   but for a slightly different reason than true fuzzy matching would give.
39//! * This is a **heuristic reconstruction**, not a proof of equivalence with
40//!   the original pattern. It can produce false positives at segment edges;
41//!   always re-validate the stitched result with the full pattern (we do).
42
43use eregex::{MatchStatus, Regex};
44
45/// A decomposed target: ordered sub-patterns searched left-to-right.
46struct Segment {
47    re: Regex,
48    label: &'static str,
49}
50
51/// The outcome of a successful gap-tolerant search.
52struct GapMatch {
53    /// The stitched-back target, e.g. `"STM32F407VGT6"`.
54    reconstructed: String,
55    /// `(label, start, end)` per segment, in haystack byte offsets.
56    segments: Vec<(&'static str, usize, usize)>,
57    /// `(start, end)` of each skipped noise gap.
58    skipped: Vec<(usize, usize)>,
59}
60
61/// Search `haystack` for `segments` in order.
62///
63/// * The first segment may start anywhere.
64/// * Each later segment must start within `max_gap` bytes of the previous
65///   segment's end. Exceeding the limit returns `None` (the "gap too long"
66///   case #9).
67/// * Any non-empty gap between two consecutive segments is recorded in
68///   `skipped`.
69fn gap_find(haystack: &str, segments: &[Segment], max_gap: usize) -> Option<GapMatch> {
70    let mut reconstructed = String::new();
71    let mut hits = Vec::with_capacity(segments.len());
72    let mut skipped = Vec::new();
73    let mut cursor = 0usize; // byte offset to search from
74
75    for (i, seg) in segments.iter().enumerate() {
76        // Shift-and-resume: find this segment at or after the cursor.
77        let m = seg.re.find_at(haystack, cursor)?;
78        let (s, e) = (m.start(), m.end());
79        if i > 0 {
80            let gap = s - cursor;
81            if gap > max_gap {
82                return None; // noise too long → NoMatch (#9)
83            }
84            if gap > 0 {
85                skipped.push((cursor, s));
86            }
87        }
88        hits.push((seg.label, s, e));
89        reconstructed.push_str(m.as_str());
90        cursor = e;
91    }
92
93    Some(GapMatch {
94        reconstructed,
95        segments: hits,
96        skipped,
97    })
98}
99
100/// The STM32 target, decomposed at the only place noise is allowed: right
101/// after the series letter `F`, before the three digits. This makes group 2
102/// (`F407`) the "split" group: `F` from segment 1, `407` from segment 2.
103fn stm32_segments() -> Vec<Segment> {
104    vec![
105        Segment {
106            re: Regex::new(r"STM32F").unwrap(),
107            label: "STM32F",
108        },
109        Segment {
110            re: Regex::new(r"[0-9]{3}[A-Z0-9]{4}").unwrap(),
111            label: "407VGT6",
112        },
113    ]
114}
115
116/// The full, contiguous pattern used to re-validate the reconstruction and to
117/// recover clean capture groups (STM32 / F407 / VGT6).
118const FULL: &str = r"(STM32)(F[0-9]{3})([A-Z0-9]{4})";
119
120fn report(name: &str, hay: &str, max_gap: usize) {
121    println!("\n=== {name} ===");
122    println!("haystack: {hay:?}");
123
124    // (1) Show what the bare engine does — both strict and end-anchored partial.
125    let full = Regex::new(FULL).unwrap();
126    println!(
127        "  find         : {:?}",
128        full.find(hay).map(|m| m.as_str().to_string())
129    );
130    println!(
131        "  find_partial : {:?}",
132        match full.find_partial(hay) {
133            None => "None".to_string(),
134            Some(p) => format!("{:?} matched={:?}", p.status, p.matched),
135        }
136    );
137
138    // (2) Gap-tolerant shift-and-resume search.
139    match gap_find(hay, &stm32_segments(), max_gap) {
140        Some(gm) => {
141            println!("  gap_find     : OK");
142            println!("    reconstructed = {:?}", gm.reconstructed);
143            for (label, s, e) in &gm.segments {
144                println!("    segment {label:<8} = [{s}..{e}] {:?}", &hay[*s..*e]);
145            }
146            for (s, e) in &gm.skipped {
147                println!("    skipped        = [{s}..{e}] {:?}", &hay[*s..*e]);
148            }
149
150            // (3) Re-validate the reconstruction with the FULL pattern and
151            //     report clean groups. This is also where `find_partial`
152            //     earns its keep: it confirms Full vs Partial.
153            let p = full
154                .find_partial(&gm.reconstructed)
155                .expect("reconstruction must re-match the full pattern");
156            assert_eq!(p.status, MatchStatus::Full, "reconstruction must be Full");
157            println!(
158                "    groups = [STM32={:?}, F407={:?}, VGT6={:?}]",
159                p.group(1),
160                p.group(2),
161                p.group(3),
162            );
163
164            // The caller asked specifically about #2: group 2 (`F407`) is
165            // assembled from TWO non-contiguous haystack ranges — the `F` at
166            // the tail of segment 1 and the `407` at the head of segment 2.
167            if name.contains("#2") {
168                let (_, _, seg1_end) = gm.segments[0];
169                let (_, seg2_start, _) = gm.segments[1];
170                println!(
171                    "    group F407 split = [{}..{}] + [{}..{}]  (\"F\" + \"407\")",
172                    seg1_end - 1,
173                    seg1_end,
174                    seg2_start,
175                    seg2_start + 3
176                );
177            }
178        }
179        None => println!("  gap_find     : None (NoMatch)"),
180    }
181}
182
183fn main() {
184    // #2 / #10 — the main case: real noise between `F` and `407`.
185    // Gap = the " dutyu7 8 " run (≤ 16 bytes), so it is accepted.
186    report(
187        "#2 split by noise (max_gap=16)",
188        "Microcontroller STM32F dutyu7 8 407VGT6   ",
189        16,
190    );
191
192    // #9 — same shape, but max_gap=8 rejects the run.
193    report(
194        "#9 gap too long (max_gap=8)",
195        "Microcontroller STM32F very very very long unrelated text 407VGT6",
196        8,
197    );
198
199    // #7 — a wrong char *inside* the digits segment (`X` where a digit is
200    // expected). No `[0-9]{3}` run exists, so the segment search fails.
201    report(
202        "#7 wrong char inside series",
203        "Microcontroller STM32FX07VGT6",
204        16,
205    );
206
207    // #8 — wrong family (`STM8` instead of `STM32`): the first segment is
208    // never found.
209    report("#8 wrong family", "Microcontroller STM8F407VGT6", 16);
210
211    // #1 — the contiguous happy path: no gap, reconstruction == input tail.
212    report(
213        "#1 contiguous full match",
214        "Microcontroller STM32F407VGT6",
215        16,
216    );
217
218    println!("\n--- end-of-input partials (find_partial alone, no gap mode) ---");
219    let full = Regex::new(FULL).unwrap();
220    for hay in [
221        "Microcontroller STM32F",
222        "Microcontroller STM32F40",
223        "Microcontroller STM32F407VG",
224    ] {
225        let p = full.find_partial(hay).unwrap();
226        println!(
227            "  {:<32} -> {:?} matched={:?} g1={:?} g2={:?} g3={:?}",
228            hay,
229            p.status,
230            p.matched,
231            p.group(1),
232            p.group(2),
233            p.group(3)
234        );
235    }
236
237    println!("\nAll reconstruction checks passed.");
238}