gap_match/gap_match.rs
1//! Gap-tolerant matching built on top of `eregex`.
2//!
3//! The engine matches patterns *contiguously*. Real-world inputs (OCR output,
4//! noisy logs) sometimes split an intended target across junk, e.g.
5//!
6//! ```text
7//! "Microcontroller STM32F dutyu7 8 407VGT6 "
8//! ```
9//!
10//! where the part number `STM32F407VGT6` is broken by the noise `" dutyu7 8 "`.
11//! A plain `find` (and even the end-anchored `find_partial`) returns `None`
12//! here, because the space after `F` is a hard mismatch and the match can
13//! never reach end-of-input through the noise.
14//!
15//! True fuzzy / gap matching is on the roadmap (`Fuzzy / approximate matching`
16//! in `README.md`). But you can get the **same observable behaviour today**
17//! with a small "shift-and-resume" search:
18//!
19//! 1. Decompose the target into ordered sub-patterns (segments).
20//! 2. Find each segment with [`Regex::find_at`], advancing a cursor.
21//! 3. Bound the noise between consecutive segments with `max_gap`.
22//! 4. Stitch the segment texts into the reconstructed target.
23//!
24//! [`Regex::find_partial`] is then used to (a) confirm the reconstruction is a
25//! genuine *full* match (groups and all) and (b) detect the related end-of-input
26//! *partial* cases (`STM32F`, `STM32F40`, `STM32F407VG`).
27//!
28//! Run with: `cargo run --example gap_match`
29//!
30//! # Caveats (read these before adopting the pattern)
31//!
32//! * Segment boundaries are **user-defined** — you decide where noise may
33//! occur. Picking them at group boundaries is usually right, but the choice
34//! is pattern-specific.
35//! * Gaps are allowed **between** segments, never inside one. A wrong
36//! character *inside* a segment (e.g. `STM32FX07...`) makes that segment
37//! unfindable, yielding `None` — which happens to match the desired NoMatch,
38//! but for a slightly different reason than true fuzzy matching would give.
39//! * This is a **heuristic reconstruction**, not a proof of equivalence with
40//! the original pattern. It can produce false positives at segment edges;
41//! always re-validate the stitched result with the full pattern (we do).
42
43use eregex::{MatchStatus, Regex};
44
45/// A decomposed target: ordered sub-patterns searched left-to-right.
46struct Segment {
47 re: Regex,
48 label: &'static str,
49}
50
51/// The outcome of a successful gap-tolerant search.
52struct GapMatch {
53 /// The stitched-back target, e.g. `"STM32F407VGT6"`.
54 reconstructed: String,
55 /// `(label, start, end)` per segment, in haystack byte offsets.
56 segments: Vec<(&'static str, usize, usize)>,
57 /// `(start, end)` of each skipped noise gap.
58 skipped: Vec<(usize, usize)>,
59}
60
61/// Search `haystack` for `segments` in order.
62///
63/// * The first segment may start anywhere.
64/// * Each later segment must start within `max_gap` bytes of the previous
65/// segment's end. Exceeding the limit returns `None` (the "gap too long"
66/// case #9).
67/// * Any non-empty gap between two consecutive segments is recorded in
68/// `skipped`.
69fn gap_find(haystack: &str, segments: &[Segment], max_gap: usize) -> Option<GapMatch> {
70 let mut reconstructed = String::new();
71 let mut hits = Vec::with_capacity(segments.len());
72 let mut skipped = Vec::new();
73 let mut cursor = 0usize; // byte offset to search from
74
75 for (i, seg) in segments.iter().enumerate() {
76 // Shift-and-resume: find this segment at or after the cursor.
77 let m = seg.re.find_at(haystack, cursor)?;
78 let (s, e) = (m.start(), m.end());
79 if i > 0 {
80 let gap = s - cursor;
81 if gap > max_gap {
82 return None; // noise too long → NoMatch (#9)
83 }
84 if gap > 0 {
85 skipped.push((cursor, s));
86 }
87 }
88 hits.push((seg.label, s, e));
89 reconstructed.push_str(m.as_str());
90 cursor = e;
91 }
92
93 Some(GapMatch {
94 reconstructed,
95 segments: hits,
96 skipped,
97 })
98}
99
100/// The STM32 target, decomposed at the only place noise is allowed: right
101/// after the series letter `F`, before the three digits. This makes group 2
102/// (`F407`) the "split" group: `F` from segment 1, `407` from segment 2.
103fn stm32_segments() -> Vec<Segment> {
104 vec![
105 Segment {
106 re: Regex::new(r"STM32F").unwrap(),
107 label: "STM32F",
108 },
109 Segment {
110 re: Regex::new(r"[0-9]{3}[A-Z0-9]{4}").unwrap(),
111 label: "407VGT6",
112 },
113 ]
114}
115
116/// The full, contiguous pattern used to re-validate the reconstruction and to
117/// recover clean capture groups (STM32 / F407 / VGT6).
118const FULL: &str = r"(STM32)(F[0-9]{3})([A-Z0-9]{4})";
119
120fn report(name: &str, hay: &str, max_gap: usize) {
121 println!("\n=== {name} ===");
122 println!("haystack: {hay:?}");
123
124 // (1) Show what the bare engine does — both strict and end-anchored partial.
125 let full = Regex::new(FULL).unwrap();
126 println!(
127 " find : {:?}",
128 full.find(hay).map(|m| m.as_str().to_string())
129 );
130 println!(
131 " find_partial : {:?}",
132 match full.find_partial(hay) {
133 None => "None".to_string(),
134 Some(p) => format!("{:?} matched={:?}", p.status, p.matched),
135 }
136 );
137
138 // (2) Gap-tolerant shift-and-resume search.
139 match gap_find(hay, &stm32_segments(), max_gap) {
140 Some(gm) => {
141 println!(" gap_find : OK");
142 println!(" reconstructed = {:?}", gm.reconstructed);
143 for (label, s, e) in &gm.segments {
144 println!(" segment {label:<8} = [{s}..{e}] {:?}", &hay[*s..*e]);
145 }
146 for (s, e) in &gm.skipped {
147 println!(" skipped = [{s}..{e}] {:?}", &hay[*s..*e]);
148 }
149
150 // (3) Re-validate the reconstruction with the FULL pattern and
151 // report clean groups. This is also where `find_partial`
152 // earns its keep: it confirms Full vs Partial.
153 let p = full
154 .find_partial(&gm.reconstructed)
155 .expect("reconstruction must re-match the full pattern");
156 assert_eq!(p.status, MatchStatus::Full, "reconstruction must be Full");
157 println!(
158 " groups = [STM32={:?}, F407={:?}, VGT6={:?}]",
159 p.group(1),
160 p.group(2),
161 p.group(3),
162 );
163
164 // The caller asked specifically about #2: group 2 (`F407`) is
165 // assembled from TWO non-contiguous haystack ranges — the `F` at
166 // the tail of segment 1 and the `407` at the head of segment 2.
167 if name.contains("#2") {
168 let (_, _, seg1_end) = gm.segments[0];
169 let (_, seg2_start, _) = gm.segments[1];
170 println!(
171 " group F407 split = [{}..{}] + [{}..{}] (\"F\" + \"407\")",
172 seg1_end - 1,
173 seg1_end,
174 seg2_start,
175 seg2_start + 3
176 );
177 }
178 }
179 None => println!(" gap_find : None (NoMatch)"),
180 }
181}
182
183fn main() {
184 // #2 / #10 — the main case: real noise between `F` and `407`.
185 // Gap = the " dutyu7 8 " run (≤ 16 bytes), so it is accepted.
186 report(
187 "#2 split by noise (max_gap=16)",
188 "Microcontroller STM32F dutyu7 8 407VGT6 ",
189 16,
190 );
191
192 // #9 — same shape, but max_gap=8 rejects the run.
193 report(
194 "#9 gap too long (max_gap=8)",
195 "Microcontroller STM32F very very very long unrelated text 407VGT6",
196 8,
197 );
198
199 // #7 — a wrong char *inside* the digits segment (`X` where a digit is
200 // expected). No `[0-9]{3}` run exists, so the segment search fails.
201 report(
202 "#7 wrong char inside series",
203 "Microcontroller STM32FX07VGT6",
204 16,
205 );
206
207 // #8 — wrong family (`STM8` instead of `STM32`): the first segment is
208 // never found.
209 report("#8 wrong family", "Microcontroller STM8F407VGT6", 16);
210
211 // #1 — the contiguous happy path: no gap, reconstruction == input tail.
212 report(
213 "#1 contiguous full match",
214 "Microcontroller STM32F407VGT6",
215 16,
216 );
217
218 println!("\n--- end-of-input partials (find_partial alone, no gap mode) ---");
219 let full = Regex::new(FULL).unwrap();
220 for hay in [
221 "Microcontroller STM32F",
222 "Microcontroller STM32F40",
223 "Microcontroller STM32F407VG",
224 ] {
225 let p = full.find_partial(hay).unwrap();
226 println!(
227 " {:<32} -> {:?} matched={:?} g1={:?} g2={:?} g3={:?}",
228 hay,
229 p.status,
230 p.matched,
231 p.group(1),
232 p.group(2),
233 p.group(3)
234 );
235 }
236
237 println!("\nAll reconstruction checks passed.");
238}