Skip to main content

serde_saphyr/
path_map.rs

1//! This module records YAML key paths (as seen during deserialization) and later tries to map
2//! validation paths back to those YAML locations.
3//!
4//! The key problem is that validation paths are derived from Rust field identifiers (typically
5//! `snake_case`), while YAML keys can use different spellings (`camelCase`, `kebab-case`, etc.).
6//! The parser has no direct access to Rust field names as reported by validation crates.
7//!
8//! We apply a small, ordered set of comparison strategies and only accept a match when it is
9//! **unique**.
10//!
11//! Matching rules:
12//! - Paths must have the same length and the same per-segment kind (key vs index).
13//! - Segment names are first normalized by stripping Rust raw-identifier prefixes (`r#type` →
14//!   `type`) to work around reserved-keyword field names.
15//! - `PathMap::search` runs multiple passes from most exact to most fuzzy:
16//!   1. Direct lookup (exact `Path` equality).
17//!   2. Whole-path ASCII case-insensitive match.
18//!   3. Token-sequence match: split on separators and common casing/digit boundaries
19//!      (`user_id`, `userId`, `user-id` → tokens `user`, `id`).
20//!   4. Collapsed match: drop all non-alphanumeric characters and compare ASCII-lowercased.
21//!
22//! Any non-direct pass succeeds only if it yields exactly one candidate; otherwise the result is
23//! considered ambiguous.
24
25use crate::location::Locations;
26
27use std::borrow::Cow;
28use std::collections::HashMap;
29use std::mem;
30
31#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
32pub(crate) enum PathKind {
33    Key,
34    Index,
35}
36
37#[derive(Clone, Debug, PartialEq, Eq, Hash)]
38pub(crate) struct PathSegment {
39    pub(crate) kind: PathKind,
40    pub(crate) name: String,
41}
42
43impl From<&str> for PathSegment {
44    fn from(value: &str) -> Self {
45        Self {
46            kind: PathKind::Key,
47            name: value.to_owned(),
48        }
49    }
50}
51
52impl From<String> for PathSegment {
53    fn from(value: String) -> Self {
54        Self {
55            kind: PathKind::Key,
56            name: value,
57        }
58    }
59}
60
61impl<'a> From<Cow<'a, str>> for PathSegment {
62    fn from(value: Cow<'a, str>) -> Self {
63        Self {
64            kind: PathKind::Key,
65            name: value.into_owned(),
66        }
67    }
68}
69
70impl From<usize> for PathSegment {
71    fn from(value: usize) -> Self {
72        Self {
73            kind: PathKind::Index,
74            name: value.to_string(),
75        }
76    }
77}
78
79#[derive(Clone, Debug, PartialEq, Eq, Hash, Default)]
80pub(crate) struct PathKey {
81    segments: Vec<PathSegment>,
82}
83
84impl PathKey {
85    pub(crate) fn empty() -> Self {
86        Self {
87            segments: Vec::new(),
88        }
89    }
90
91    pub(crate) fn take(&mut self) -> Self {
92        Self {
93            segments: mem::take(&mut self.segments),
94        }
95    }
96
97    pub(crate) fn join<T: Into<PathSegment>>(mut self, seg: T) -> Self {
98        self.segments.push(seg.into());
99        self
100    }
101
102    pub(crate) fn len(&self) -> usize {
103        self.segments.len()
104    }
105
106    pub(crate) fn is_empty(&self) -> bool {
107        self.segments.is_empty()
108    }
109
110    pub(crate) fn leaf_string(&self) -> Option<String> {
111        self.segments.last().map(|seg| seg.name.clone())
112    }
113
114    pub(crate) fn truncate(&self, len: usize) -> Self {
115        Self {
116            segments: self.segments.iter().take(len).cloned().collect(),
117        }
118    }
119
120    pub(crate) fn parent(&self) -> Option<Self> {
121        let len = self.segments.len();
122        if len == 0 {
123            None
124        } else {
125            Some(self.truncate(len - 1))
126        }
127    }
128
129    fn iter_segments(&self) -> impl Iterator<Item = (&PathKind, &str)> {
130        self.segments
131            .iter()
132            .map(|seg| (&seg.kind, seg.name.as_str()))
133    }
134}
135
136pub(crate) fn format_path_with_resolved_leaf(path: &PathKey, resolved_leaf: &str) -> String {
137    let mut out = String::new();
138    let last_index = path.segments.len().saturating_sub(1);
139
140    for (idx, seg) in path.segments.iter().enumerate() {
141        match seg.kind {
142            PathKind::Index => {
143                out.push('[');
144                out.push_str(&seg.name);
145                out.push(']');
146            }
147            PathKind::Key => {
148                if idx > 0 {
149                    out.push('.');
150                }
151                if idx == last_index {
152                    out.push_str(resolved_leaf);
153                } else {
154                    out.push_str(&seg.name);
155                }
156            }
157        }
158    }
159
160    if out.is_empty() {
161        "<root>".to_owned()
162    } else {
163        out
164    }
165}
166
167#[cfg(feature = "garde")]
168pub(crate) fn path_key_from_garde(path: &garde::error::Path) -> PathKey {
169    use garde::error::Kind;
170
171    let mut segs: Vec<PathSegment> = path
172        .__iter()
173        .map(|(k, s)| match k {
174            Kind::Index => PathSegment {
175                kind: PathKind::Index,
176                name: s.as_str().to_owned(),
177            },
178            _ => PathSegment {
179                kind: PathKind::Key,
180                name: s.as_str().to_owned(),
181            },
182        })
183        .collect();
184    segs.reverse();
185
186    PathKey { segments: segs }
187}
188
189#[derive(Debug)]
190pub struct PathMap {
191    pub(crate) map: HashMap<PathKey, Locations>,
192}
193
194impl PathMap {
195    pub(crate) fn new() -> Self {
196        Self {
197            map: HashMap::new(),
198        }
199    }
200
201    pub(crate) fn insert(&mut self, path: PathKey, locations: Locations) {
202        self.map.insert(path, locations);
203    }
204
205    pub(crate) fn search(&self, path: &PathKey) -> Option<(Locations, String)> {
206        // 1) Direct lookup.
207        if let Some(loc) = self.map.get(path) {
208            let leaf = path.leaf_string()?;
209            return Some((*loc, leaf));
210        }
211
212        // Multi-pass matching (more exact -> more fuzzy). Each pass succeeds only if it yields
213        // exactly one candidate.
214        //
215        // This is used to bridge Rust field paths (snake_case, etc.) to YAML key spellings
216        // recorded during deserialization, without attempting arbitrary rename mapping.
217
218        // 2) Whole-path case-insensitive match (only if unique).
219        self.find_unique_by(path, segments_equal_case_insensitive)
220            // 3) Token-sequence match (only if unique).
221            //
222            // Tokenization is stronger than “collapsed” matching: it treats separators and common
223            // casing boundaries as token boundaries, reducing false collisions like:
224            //   ab_c  vs a_bc  (both collapse to "abc", but tokenize to ["ab","c"] vs ["a","bc"]).
225            .or_else(|| self.find_unique_by(path, segments_equal_tokenized_case_insensitive))
226            // 4) Loose collapsed match (only if unique): remove all non-alphanumeric characters
227            // within each segment and compare case-insensitively.
228            .or_else(|| self.find_unique_by(path, segments_equal_collapsed_case_insensitive))
229            // 5) Key-to-index fallback (only if unique): allow garde paths that contain map keys
230            // to resolve to recorded sequence indices when custom deserialization transforms a
231            // YAML sequence into a map keyed by derived IDs.
232            .or_else(|| self.find_unique_by(path, segments_equal_key_to_index_fallback))
233    }
234
235    fn find_unique_by(
236        &self,
237        target: &PathKey,
238        mut matches: impl FnMut(&PathKey, &PathKey) -> bool,
239    ) -> Option<(Locations, String)> {
240        if target.is_empty() {
241            return None;
242        }
243
244        let mut found: Option<(Locations, String)> = None;
245        for (candidate, loc) in self.map.iter() {
246            if matches(target, candidate) {
247                if found.is_some() {
248                    return None; // ambiguous
249                }
250                found = Some((*loc, candidate.leaf_string()?));
251            }
252        }
253        found
254    }
255}
256
257fn segments_equal_key_to_index_fallback(target: &PathKey, candidate: &PathKey) -> bool {
258    if target.len() != candidate.len() {
259        return false;
260    }
261
262    target
263        .iter_segments()
264        .zip(candidate.iter_segments())
265        .all(|((tk, ts), (ck, cs))| match (tk, ck) {
266            (PathKind::Index, PathKind::Index) => ts == cs,
267            (PathKind::Key, PathKind::Key) => strip_raw_identifier_prefix(ts)
268                .eq_ignore_ascii_case(strip_raw_identifier_prefix(cs)),
269            // Fallback: treat a key segment in the target as matching an index segment in the
270            // candidate. This is intentionally loose; we rely on `find_unique_by` to reject
271            // ambiguous matches.
272            (PathKind::Key, PathKind::Index) => !ts.is_empty() && !cs.is_empty(),
273            (PathKind::Index, PathKind::Key) => false,
274        })
275}
276
277fn strip_raw_identifier_prefix(s: &str) -> &str {
278    // Rust raw identifiers are formatted like `r#type`.
279    s.strip_prefix("r#").unwrap_or(s)
280}
281
282fn segments_equal_case_insensitive(target: &PathKey, candidate: &PathKey) -> bool {
283    if target.len() != candidate.len() {
284        return false;
285    }
286
287    target
288        .iter_segments()
289        .zip(candidate.iter_segments())
290        .all(|((tk, ts), (ck, cs))| {
291            tk == ck
292                && match tk {
293                    PathKind::Index => ts == cs,
294                    PathKind::Key => strip_raw_identifier_prefix(ts)
295                        .eq_ignore_ascii_case(strip_raw_identifier_prefix(cs)),
296                }
297        })
298}
299
300fn collapse_non_alnum_ascii_lower(s: &str) -> String {
301    let mut out = String::with_capacity(s.len());
302    out.extend(
303        s.chars()
304            .filter(|c| c.is_ascii_alphanumeric())
305            .map(|c| c.to_ascii_lowercase()),
306    );
307    out
308}
309
310fn segments_equal_collapsed_case_insensitive(target: &PathKey, candidate: &PathKey) -> bool {
311    if target.len() != candidate.len() {
312        return false;
313    }
314
315    target
316        .iter_segments()
317        .zip(candidate.iter_segments())
318        .all(|((tk, ts), (ck, cs))| {
319            tk == ck
320                && match tk {
321                    PathKind::Index => ts == cs,
322                    PathKind::Key => {
323                        collapse_non_alnum_ascii_lower(strip_raw_identifier_prefix(ts))
324                            == collapse_non_alnum_ascii_lower(strip_raw_identifier_prefix(cs))
325                    }
326                }
327        })
328}
329
330#[derive(Copy, Clone, Debug, PartialEq, Eq)]
331enum CharClass {
332    Lower,
333    Upper,
334    Digit,
335    Other,
336}
337
338fn classify_ascii(c: char) -> CharClass {
339    if c.is_ascii_lowercase() {
340        CharClass::Lower
341    } else if c.is_ascii_uppercase() {
342        CharClass::Upper
343    } else if c.is_ascii_digit() {
344        CharClass::Digit
345    } else {
346        CharClass::Other
347    }
348}
349
350fn tokenize_segment(s: &str) -> Vec<String> {
351    // 1) Split on any non-alphanumeric separator.
352    // 2) Further split each piece on:
353    //    - camel/pascal case boundaries (userId -> user + id)
354    //    - digit boundaries (sha256Sum -> sha + 256 + sum)
355    //    - acronym boundary heuristic (HTTPServer -> http + server)
356    let mut tokens = Vec::new();
357
358    for piece in s
359        .split(|c: char| !c.is_ascii_alphanumeric())
360        .filter(|p| !p.is_empty())
361    {
362        let chars: Vec<char> = piece.chars().collect();
363        if chars.is_empty() {
364            continue;
365        }
366
367        let mut start = 0usize;
368        for i in 1..chars.len() {
369            let prev = classify_ascii(chars[i - 1]);
370            let curr = classify_ascii(chars[i]);
371            let next = chars.get(i + 1).copied().map(classify_ascii);
372
373            let boundary = match (prev, curr) {
374                // userId / userID / userID2
375                (CharClass::Lower, CharClass::Upper) => true,
376                // sha256 / foo2Bar
377                (CharClass::Digit, CharClass::Lower | CharClass::Upper) => true,
378                (CharClass::Lower | CharClass::Upper, CharClass::Digit) => true,
379                // HTTPServer: split before the S in Server (Acronym + Word)
380                (CharClass::Upper, CharClass::Upper) if matches!(next, Some(CharClass::Lower)) => {
381                    true
382                }
383                _ => false,
384            };
385
386            if boundary {
387                if start < i {
388                    let tok: String = chars[start..i]
389                        .iter()
390                        .map(|c| c.to_ascii_lowercase())
391                        .collect();
392                    if !tok.is_empty() {
393                        tokens.push(tok);
394                    }
395                }
396                start = i;
397            }
398        }
399
400        if start < chars.len() {
401            let tok: String = chars[start..]
402                .iter()
403                .map(|c| c.to_ascii_lowercase())
404                .collect();
405            if !tok.is_empty() {
406                tokens.push(tok);
407            }
408        }
409    }
410
411    tokens
412}
413
414fn segments_equal_tokenized_case_insensitive(target: &PathKey, candidate: &PathKey) -> bool {
415    if target.len() != candidate.len() {
416        return false;
417    }
418
419    target
420        .iter_segments()
421        .zip(candidate.iter_segments())
422        .all(|((tk, ts), (ck, cs))| {
423            tk == ck
424                && match tk {
425                    PathKind::Index => ts == cs,
426                    PathKind::Key => {
427                        tokenize_segment(strip_raw_identifier_prefix(ts))
428                            == tokenize_segment(strip_raw_identifier_prefix(cs))
429                    }
430                }
431        })
432}
433
434pub(crate) struct PathRecorder {
435    pub(crate) current: PathKey,
436    pub(crate) map: PathMap,
437}
438
439impl PathRecorder {
440    pub(crate) fn new() -> Self {
441        Self {
442            current: PathKey::empty(),
443            map: PathMap::new(),
444        }
445    }
446}
447
448#[cfg(test)]
449mod tests {
450    use super::*;
451    use crate::Location;
452
453    fn locs(line: usize, column: usize) -> Locations {
454        let l = Location::new(line, column);
455        Locations {
456            reference_location: l,
457            defined_location: l,
458        }
459    }
460
461    fn p2(a: &str, b: &str) -> PathKey {
462        PathKey::empty().join(a).join(b)
463    }
464
465    #[test]
466    fn search_direct_hit() {
467        let mut m = PathMap::new();
468        let k = p2("gp", "a1");
469        m.insert(k.clone(), locs(3, 7));
470
471        assert_eq!(m.search(&k), Some((locs(3, 7), "a1".to_string())));
472    }
473
474    #[test]
475    fn search_case_insensitive_unique() {
476        let mut m = PathMap::new();
477        m.insert(p2("opwKinematics", "a1"), locs(10, 2));
478
479        assert_eq!(
480            m.search(&p2("OPWKINEMATICS", "A1")),
481            Some((locs(10, 2), "a1".to_string()))
482        );
483    }
484
485    #[test]
486    fn search_case_insensitive_ambiguous() {
487        let mut m = PathMap::new();
488        m.insert(p2("FOO", "bar"), locs(1, 1));
489        m.insert(p2("foo", "bar"), locs(2, 2));
490
491        assert_eq!(m.search(&p2("Foo", "BAR")), None);
492    }
493
494    #[test]
495    fn search_tokenized_unique_snake_vs_camel() {
496        let mut m = PathMap::new();
497        m.insert(p2("userId", "a1"), locs(5, 9));
498
499        assert_eq!(
500            m.search(&p2("user_id", "a1")),
501            Some((locs(5, 9), "a1".to_string()))
502        );
503    }
504
505    #[test]
506    fn search_tokenized_unique_separators_equivalent() {
507        let mut m = PathMap::new();
508        // All of these represent the same token sequence ["user","id"].
509        m.insert(p2("user-id", "a1"), locs(7, 3));
510
511        assert_eq!(
512            m.search(&p2("user.id", "a1")),
513            Some((locs(7, 3), "a1".to_string()))
514        );
515        assert_eq!(
516            m.search(&p2("user id", "a1")),
517            Some((locs(7, 3), "a1".to_string()))
518        );
519        assert_eq!(
520            m.search(&p2("UserID", "a1")),
521            Some((locs(7, 3), "a1".to_string()))
522        );
523    }
524
525    #[test]
526    fn search_tokenized_unique_digit_boundaries() {
527        let mut m = PathMap::new();
528        m.insert(p2("sha_256_sum", "a1"), locs(9, 4));
529
530        assert_eq!(
531            m.search(&p2("sha256Sum", "a1")),
532            Some((locs(9, 4), "a1".to_string()))
533        );
534    }
535
536    #[test]
537    fn search_tokenized_unique_acronym_boundary() {
538        let mut m = PathMap::new();
539        m.insert(p2("http_server", "a1"), locs(11, 2));
540
541        assert_eq!(
542            m.search(&p2("HTTPServer", "a1")),
543            Some((locs(11, 2), "a1".to_string()))
544        );
545    }
546
547    #[test]
548    fn search_collapsed_fallback_avoids_token_collision() {
549        let mut m = PathMap::new();
550        // These collide under fully-collapsed matching ("abc"), but are distinct by tokens.
551        m.insert(p2("ab_c", "x"), locs(1, 1));
552        m.insert(p2("a_bc", "x"), locs(2, 2));
553
554        // Target tokenizes to ["ab","c"], so we should pick only the first.
555        assert_eq!(
556            m.search(&p2("abC", "x")),
557            Some((locs(1, 1), "x".to_string()))
558        );
559    }
560
561    #[test]
562    fn search_collapsed_match_unique_after_token_pass_fails() {
563        let mut m = PathMap::new();
564        m.insert(p2("userid", "a1"), locs(12, 6));
565
566        // Tokenization for "userId" yields ["user","id"], while "userid" yields ["userid"].
567        // So token pass does not match; collapsed pass should still bridge it.
568        assert_eq!(
569            m.search(&p2("userId", "a1")),
570            Some((locs(12, 6), "a1".to_string()))
571        );
572    }
573
574    #[test]
575    fn search_collapsed_match_ambiguous() {
576        let mut m = PathMap::new();
577        m.insert(p2("ab_c", "x"), locs(1, 1));
578        m.insert(p2("a_bc", "x"), locs(2, 2));
579
580        // Target tokenizes to ["abc"], so the token pass does not match either candidate.
581        // Collapsed("abc") == "abc" matches both candidates, so the result must be ambiguous.
582        assert_eq!(m.search(&p2("abc", "x")), None);
583    }
584
585    #[test]
586    fn search_returns_resolved_leaf_segment_when_leaf_is_renamed() {
587        let mut m = PathMap::new();
588        // YAML key spelling is camelCase, path might be snake_case.
589        m.insert(PathKey::empty().join("myField"), locs(1, 10));
590
591        assert_eq!(
592            m.search(&PathKey::empty().join("my_field")),
593            Some((locs(1, 10), "myField".to_string()))
594        );
595    }
596
597    #[test]
598    fn search_strips_raw_identifier_prefix() {
599        let mut m = PathMap::new();
600        // Rust reserved keywords use raw identifiers in paths (`r#type`), but YAML keys are plain.
601        m.insert(PathKey::empty().join("type"), locs(9, 3));
602
603        assert_eq!(
604            m.search(&PathKey::empty().join("r#type")),
605            Some((locs(9, 3), "type".to_string()))
606        );
607    }
608
609    #[test]
610    fn search_handles_index_segments() {
611        let mut m = PathMap::new();
612        let path = PathKey::empty().join("items").join(2usize).join("name");
613        m.insert(path.clone(), locs(5, 8));
614
615        assert_eq!(
616            m.search(&PathKey::empty().join("items").join(2usize).join("name")),
617            Some((locs(5, 8), "name".to_string()))
618        );
619    }
620}