config_disassembler/xml/parsers/
parse_unique_id.rs

1//! Parse unique ID from XML element for file naming.
2//!
3//! ## Configuration syntax
4//!
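//! `unique_id_elements` is a comma-separated list of *candidates*, tried in
//! the order listed. Every candidate is checked against an element's direct
//! fields before the search descends into its children, so the first
//! candidate that fully resolves at the shallowest level wins - even over an
//! earlier-listed candidate that would only match deeper down. Each
//! candidate is either: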
8//!
9//! * a single field name (e.g. `fullName`) - matches when that field is
10//!   present anywhere in the element's subtree, or
11//! * a `+`-joined **compound** of two or more field names (e.g.
12//!   `actionName+pageOrSobjectType+formFactor`) - matches only when *every*
13//!   sub-field resolves at the same level, in which case the resolved
14//!   values are joined with [`COMPOUND_VALUE_SEPARATOR`] (`__`).
15//!
16//! Compounds let metadata types like `<profileActionOverrides>` - whose
17//! natural unique key is `actionName + pageOrSobjectType + formFactor +
18//! profile [+ recordType]` - produce stable, readable filenames instead of
19//! collapsing every sibling into a SHA-256 fallback. Listing both the wide
20//! and narrow forms (`A+B+C+D, A+B+C, A`) gives a graceful fallback chain
21//! when an item only carries some of the keys.
22//!
23//! Backwards compatibility: any spec that contains no `+` is parsed as a
24//! list of single-field candidates and behaves identically to releases
25//! prior to compound-key support.
26
27use serde_json::Value;
28use sha2::{Digest, Sha256};
29use std::borrow::Cow;
30
31use crate::xml::types::XmlElement;
32
/// Separator inserted between resolved values when a compound candidate
/// matches. Picked because `__` is filesystem-safe everywhere and because a
/// single underscore is far too common inside Salesforce identifier names
/// (e.g. `Account_Name__c`) to serve as a visible boundary between values.
///
/// NOTE(review): values can themselves contain `__` (custom API names end
/// in `__c`), so a joined ID is meant to be a stable, readable filename - it
/// cannot always be split back into its component values unambiguously.
const COMPOUND_VALUE_SEPARATOR: &str = "__";
40
/// Replacement character substituted for any character that is illegal or
/// portability-unsafe in a path segment (see [`is_illegal_path_char`]). It
/// is also used as the entire segment when sanitization would otherwise
/// leave an empty string. Underscore matches the convention used by
/// `sanitize_filename` in the grouped-by-tag write path so behavior is
/// consistent across strategies.
const SANITIZED_REPLACEMENT: char = '_';
46
/// Reports whether `c` must not appear inside a single path segment because
/// it is illegal or portability-unsafe on at least one supported OS:
///
/// - `/` and `\`: path separators on Unix / Windows
/// - `:` `*` `?` `"` `<` `>` `|`: reserved on Windows
/// - ASCII control characters (0x00-0x1F, plus DEL 0x7F): break terminals
///   and zip readers
///
/// Salesforce identifier fields can legitimately contain any of these.
/// `EntitlementProcess.milestones[*].milestoneName`, for example, accepts
/// free-form text, and `TrustFile Transaction Sync/Import Complete` has been
/// seen in the wild - the embedded `/` was interpreted as a path separator
/// and silently dropped data on round-trip (see #25).
fn is_illegal_path_char(c: char) -> bool {
    // Separators first, then the Windows-reserved punctuation.
    const RESERVED: [char; 9] = ['/', '\\', ':', '*', '?', '"', '<', '>', '|'];
    c.is_ascii_control() || RESERVED.contains(&c)
}
62
/// Reports whether `c` is one of the trailing characters (`.` or space) that
/// Windows silently strips when creating a file. Leaving these in place
/// would let two distinct inputs (`Foo.` vs `Foo`, `Foo ` vs `Foo`) collide
/// on disk on Windows but not on Unix, breaking cross-platform stability of
/// the disassembled output.
///
/// Tab is deliberately *not* in this set: Windows accepts a trailing tab in
/// filenames, and replacing the (rare) tab with `_` via the control-char
/// path is preferable to silently losing the character.
fn is_trailing_strip_char(c: char) -> bool {
    c == '.' || c == ' '
}
73
74/// Sanitize a resolved unique-ID value into a portable path segment.
75///
76/// Borrows the input on the happy path - the vast majority of Salesforce
77/// identifiers (`fullName`, `name`, `developerName`, ...) only contain
78/// ASCII alphanumerics, underscores, hyphens, and dots, all of which are
79/// passed through verbatim. We only allocate when the input contains an
80/// illegal character or has a trailing `.`/space that Windows would
81/// silently strip on write.
82///
83/// Order of operations matters:
84///   1. Trim trailing `.`/space from the *input*. Windows would strip them
85///      on write anyway, so doing it deterministically here keeps Linux
86///      and Windows producing byte-identical filenames.
87///   2. Replace illegal chars in the trimmed input with `_`. Each illegal
88///      char becomes exactly one `_` so the resulting length, and the
89///      mapping between original and replacement positions, is stable.
90///
91/// The substitution is deterministic so the produced filename is stable
92/// across runs and across machines, which keeps source-control diffs
93/// meaningful. When two distinct un-sanitized values collapse to the same
94/// sanitized form (for example `Foo/Bar` and `Foo_Bar` both produce
95/// `Foo_Bar`), the upstream caller's collision detector catches it and
96/// falls back to per-element SHA-256 hashes for the colliding siblings.
97fn sanitize_path_segment(s: &str) -> Cow<'_, str> {
98    let trimmed = s.trim_end_matches(is_trailing_strip_char);
99    let needs_replacement = trimmed.chars().any(is_illegal_path_char);
100    let was_trimmed = trimmed.len() != s.len();
101    if !needs_replacement && !was_trimmed {
102        return Cow::Borrowed(s);
103    }
104    let mut out = String::with_capacity(trimmed.len());
105    for c in trimmed.chars() {
106        if is_illegal_path_char(c) {
107            out.push(SANITIZED_REPLACEMENT);
108        } else {
109            out.push(c);
110        }
111    }
112    if out.is_empty() {
113        // Edge case: input was entirely trimmed away (e.g. `". "`). Returning
114        // an empty string would produce a path like `.<tag>-meta.xml` which
115        // is also invalid. Use a single underscore so the file still writes;
116        // the upstream collision detector will hash any siblings that pile
117        // up here.
118        out.push(SANITIZED_REPLACEMENT);
119    }
120    Cow::Owned(out)
121}
122
123/// Hash the full canonicalized JSON form of an element to derive an 8-char
124/// filename. SHA-256 over distinct content yields distinct prefixes with
125/// vanishingly small collision probability for normal sibling counts.
126fn create_short_hash(element: &XmlElement) -> String {
127    let stringified = serde_json::to_string(element).unwrap_or_default();
128    let mut hasher = Sha256::new();
129    hasher.update(stringified.as_bytes());
130    let result = hasher.finalize();
131    const HEX: &[u8; 16] = b"0123456789abcdef";
132    let mut s = String::with_capacity(8);
133    for b in result.iter().take(4) {
134        s.push(HEX[(b >> 4) as usize] as char);
135        s.push(HEX[(b & 0xf) as usize] as char);
136    }
137    s
138}
139
140/// True only for objects that have at least one element-name child. quick-xml
141/// represents leaf scalars (and attribute-only nodes) as `{ "#text": "..." }` /
142/// `{ "@attr": "...", "#text": "..." }`; those are *not* recursable - if we
143/// recurse into them we end up hashing the same single text-leaf child for
144/// every sibling that happens to start with the same scalar element, which
145/// silently collapses distinct siblings into one filename.
146fn is_recursable_object(value: &Value) -> bool {
147    let Some(obj) = value.as_object() else {
148        return false;
149    };
150    obj.iter()
151        .any(|(k, _)| !k.starts_with('#') && !k.starts_with('@'))
152}
153
154/// Extract string from a value - handles both direct strings and objects with #text (XML leaf elements).
155fn value_as_string(value: &Value) -> Option<String> {
156    if let Some(s) = value.as_str() {
157        return Some(s.to_string());
158    }
159    value
160        .as_object()
161        .and_then(|obj| obj.get("#text"))
162        .and_then(|v| v.as_str())
163        .map(|s| s.to_string())
164}
165
/// Split the user-supplied spec into candidates, each of which is a list of
/// field names. Candidates are separated by `,`; the fields of a candidate
/// are separated by `+`. A one-field candidate is a plain single-field match
/// (legacy behaviour); two or more fields form a compound.
///
/// Field names are trimmed, and empty pieces (from leading/trailing commas,
/// doubled commas, or stray `+`) are dropped, so a copy-pasted spec such as
/// `, name ,, +foo+ ,` degrades to `[["name"], ["foo"]]` instead of
/// producing empty lookups.
fn parse_candidates(spec: &str) -> Vec<Vec<&str>> {
    let mut candidates = Vec::new();
    for raw_candidate in spec.split(',') {
        let names: Vec<&str> = raw_candidate
            .split('+')
            .map(|name| name.trim())
            .filter(|name| !name.is_empty())
            .collect();
        // A candidate made only of separators/whitespace carries no fields.
        if !names.is_empty() {
            candidates.push(names);
        }
    }
    candidates
}
185
186/// Match a single candidate against the element's *direct* fields. A
187/// single-field candidate succeeds when the field is present and resolves
188/// to a non-empty string; a compound candidate succeeds only when every
189/// sub-field is present and non-empty, in which case the resolved values
190/// are joined with [`COMPOUND_VALUE_SEPARATOR`].
191///
192/// Restricting compounds to the same level keeps the semantics intuitive:
193/// `actionName+profile+recordType` describes a single record's shape, not
194/// a search for those tokens scattered across the subtree.
195fn match_candidate_at_direct(element: &XmlElement, fields: &[&str]) -> Option<String> {
196    let obj = element.as_object()?;
197    let mut parts: Vec<String> = Vec::with_capacity(fields.len());
198    for field in fields {
199        let value = obj.get(*field).and_then(value_as_string)?;
200        if value.is_empty() {
201            return None;
202        }
203        parts.push(value);
204    }
205    if parts.is_empty() {
206        return None;
207    }
208    Some(parts.join(COMPOUND_VALUE_SEPARATOR))
209}
210
211/// Search for a configured unique-id candidate anywhere in the subtree
212/// rooted at `element`. Returns `Some(id)` only when a candidate fully
213/// resolves; returns `None` so the caller can fall back to hashing the
214/// *outer* element rather than a single inner child.
215///
216/// Order of evaluation:
217/// 1. Try every candidate against the direct fields of `element` (so a
218///    direct match always beats a deeper one - preserves the priority that
219///    callers configuring `fullName,name` historically relied on).
220/// 2. If nothing matched, recurse into recursable children and repeat.
221fn find_id_in_subtree(element: &XmlElement, unique_id_elements: &str) -> Option<String> {
222    let candidates = parse_candidates(unique_id_elements);
223    if candidates.is_empty() {
224        return None;
225    }
226    for candidate in &candidates {
227        if let Some(id) = match_candidate_at_direct(element, candidate) {
228            return Some(id);
229        }
230    }
231    let obj = element.as_object()?;
232    for (_, child) in obj {
233        if !is_recursable_object(child) {
234            continue;
235        }
236        if let Some(found) = find_id_in_subtree(child, unique_id_elements) {
237            return Some(found);
238        }
239    }
240    None
241}
242
243/// Get a unique ID for an element, using configured fields or a hash of the
244/// *outer* element when no configured field exists in the subtree.
245///
246/// Hashing must be performed on the outer element (not on whatever inner
247/// child the search happened to visit first) so siblings whose first nested
248/// child shares a value - e.g. a list of `<actionOverrides>` that all start
249/// with `<actionName>View</actionName>` - still produce distinct filenames
250/// reflecting their distinct content.
251///
252/// Resolved configured-field values are passed through [`sanitize_path_segment`]
253/// before being returned so any path-illegal characters in the source value
254/// (e.g. `/` in an `EntitlementProcess` `milestoneName`) are mapped to a
255/// safe placeholder. Hash-fallback values are pure hex and pass through the
256/// sanitizer as a no-op.
257pub fn parse_unique_id_element(element: &XmlElement, unique_id_elements: Option<&str>) -> String {
258    let raw = if let Some(ids) = unique_id_elements {
259        find_id_in_subtree(element, ids).unwrap_or_else(|| create_short_hash(element))
260    } else {
261        create_short_hash(element)
262    };
263    match sanitize_path_segment(&raw) {
264        Cow::Borrowed(_) => raw,
265        Cow::Owned(s) => s,
266    }
267}
268
/// Hash an arbitrary [`XmlElement`] to its 8-character short hash. Exposed so
/// the upstream collision detector can request a deterministic fallback for
/// individual siblings without re-deriving the hash logic.
///
/// This is a direct pass-through to `create_short_hash`: the result is the
/// same value [`parse_unique_id_element`] falls back to when no configured
/// field resolves for the element.
pub fn short_hash_for_element(element: &XmlElement) -> String {
    create_short_hash(element)
}
275
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn finds_direct_field() {
        let el = json!({ "name": "Get_Info", "label": "Get Info" });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "Get_Info");
    }

    #[test]
    fn finds_deeply_nested_field() {
        // value before connector so we find elementReference (matches TS iteration order).
        // Either iteration order resolves the same id here, because `connector`
        // contains no `elementReference` field.
        let el = json!({
            "value": { "elementReference": "accts.accounts" },
            "connector": { "targetReference": "X" }
        });
        assert_eq!(
            parse_unique_id_element(&el, Some("elementReference")),
            "accts.accounts"
        );
    }

    #[test]
    fn finds_id_in_grandchild() {
        let el = json!({
            "wrapper": {
                "inner": { "name": "NestedName" }
            }
        });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "NestedName");
    }

    #[test]
    fn value_as_string_returns_none_for_non_string_non_text_objects() {
        // Directly named field exists but value is neither a string nor an object with #text.
        // Exercises the None-return path inside value_as_string plus the "no match, move on"
        // path inside match_candidate_at_direct.
        let el = json!({ "name": { "other": "xxx" } });
        let id = parse_unique_id_element(&el, Some("name"));
        // Falls through to the 8-char short-hash fallback.
        assert_eq!(id.len(), 8);
    }

    #[test]
    fn falls_back_to_hash_when_no_match_and_no_nested_object() {
        // No direct match and no nested object match → hash fallback.
        let el = json!({ "a": "string", "b": "another" });
        let id = parse_unique_id_element(&el, Some("name"));
        assert_eq!(id.len(), 8);
    }

    #[test]
    fn hash_fallback_when_unique_id_elements_is_none() {
        let el = json!({ "a": "b" });
        let id = parse_unique_id_element(&el, None);
        assert_eq!(id.len(), 8);
    }

    #[test]
    fn non_object_element_returns_hash() {
        let el = json!("just-a-string");
        let id = parse_unique_id_element(&el, Some("name"));
        assert_eq!(id.len(), 8);
    }

    #[test]
    fn finds_name_from_text_object() {
        // XML parser stores leaf elements as { "#text": "value" }
        let el = json!({
            "name": { "#text": "Get_Info" },
            "label": { "#text": "Get Info" },
            "actionName": { "#text": "GetFirstFromCollection" }
        });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "Get_Info");
        assert_eq!(
            parse_unique_id_element(&el, Some("actionName")),
            "GetFirstFromCollection"
        );
    }

    // ---- regression: text-leaf siblings must NOT collapse to one hash ------

    /// Models a `<CustomApplication>`'s `<actionOverrides>`: every block has
    /// the same `<actionName>View</actionName>` first child but distinct
    /// `<content>` and `<pageOrSobjectType>` payloads. With the old
    /// implementation the recursion landed on `{"#text":"View"}` for every
    /// sibling and they all hashed to the same 8-char prefix, silently
    /// collapsing 100s of overrides into a single shard that contained only
    /// the last one written.
    #[test]
    fn distinct_siblings_with_shared_first_text_leaf_get_distinct_hashes() {
        let make_action_override = |i: u32| -> XmlElement {
            json!({
                "actionName": { "#text": "View" },
                "comment": { "#text": format!("Action override {i}") },
                "content": { "#text": format!("Sample_Page_{i:05}") },
                "formFactor": { "#text": "Large" },
                "skipRecordTypeSelect": { "#text": "false" },
                "type": { "#text": "Flexipage" },
                "pageOrSobjectType": { "#text": format!("Sample_Object_{i:03}__c") }
            })
        };

        // Default unique-id elements ("fullName,name") - none of these are
        // present on actionOverride children.
        let ids = Some("fullName,name");

        let mut seen = std::collections::HashSet::new();
        for i in 1..=128 {
            let id = parse_unique_id_element(&make_action_override(i), ids);
            assert_eq!(id.len(), 8, "expected an 8-char short hash, got {id}");
            assert!(
                seen.insert(id.clone()),
                "duplicate hash {id} for actionOverride {i} - distinct siblings collapsed"
            );
        }
    }

    /// Same shape but with no unique-id config at all: must also produce
    /// distinct hashes per sibling.
    #[test]
    fn distinct_siblings_get_distinct_hashes_with_no_unique_id_config() {
        let mut seen = std::collections::HashSet::new();
        for i in 1..=64 {
            let el = json!({
                "actionName": { "#text": "View" },
                "content": { "#text": format!("Page_{i}") }
            });
            let id = parse_unique_id_element(&el, None);
            assert!(
                seen.insert(id.clone()),
                "duplicate hash {id} at index {i} with no unique-id config"
            );
        }
    }

    /// `find_id_in_subtree` must skip text-leaf wrappers like
    /// `{"#text": "..."}` rather than treat them as recursable objects.
    /// Otherwise the search returns a hash of the inner wrapper rather than
    /// hashing the outer element.
    #[test]
    fn text_leaf_wrappers_are_not_recursable() {
        let leaf = json!({ "#text": "View" });
        assert!(!is_recursable_object(&leaf));

        let attrs_only = json!({ "@attr": "x", "#text": "y" });
        assert!(!is_recursable_object(&attrs_only));

        let real = json!({ "name": "x" });
        assert!(is_recursable_object(&real));

        let mixed = json!({ "@attr": "x", "name": "y" });
        assert!(is_recursable_object(&mixed));
    }

    // ---- compound-key support ----------------------------------------------

    /// A `<profileActionOverrides>` element with the full key set. The
    /// compound `actionName+pageOrSobjectType+formFactor+profile` must
    /// resolve to all four values joined with `__`.
    #[test]
    fn compound_resolves_when_all_fields_present() {
        let el = json!({
            "actionName": { "#text": "Tab" },
            "content": { "#text": "Home_Page_Default" },
            "formFactor": { "#text": "Large" },
            "pageOrSobjectType": { "#text": "standard-home" },
            "type": { "#text": "Flexipage" },
            "profile": { "#text": "Implementation_Lightning" }
        });
        let id =
            parse_unique_id_element(&el, Some("actionName+pageOrSobjectType+formFactor+profile"));
        assert_eq!(id, "Tab__standard-home__Large__Implementation_Lightning");
    }

    /// A compound that names a field the element doesn't have must NOT
    /// match - the next candidate (a narrower compound, then a single
    /// field) takes over.
    #[test]
    fn compound_falls_through_when_one_field_missing() {
        // `<actionOverrides>` (no profile, no recordType) - the wide compound
        // must fail, the narrow compound must succeed.
        let el = json!({
            "actionName": { "#text": "View" },
            "content": { "#text": "LUX_Case_Release_Candidate_Copy" },
            "formFactor": { "#text": "Large" },
            "pageOrSobjectType": { "#text": "Case" },
            "type": { "#text": "Flexipage" }
        });
        let spec = "actionName+pageOrSobjectType+formFactor+profile,actionName+pageOrSobjectType+formFactor,actionName";
        assert_eq!(
            parse_unique_id_element(&el, Some(spec)),
            "View__Case__Large"
        );
    }

    /// All compound candidates miss → the loop must fall back to the
    /// single-field candidate at the tail of the spec, and ultimately to
    /// the outer-element hash if even that misses.
    #[test]
    fn compound_then_single_then_hash_fallback() {
        let el = json!({
            "actionName": { "#text": "View" }
        });
        let spec_all_compound =
            "actionName+pageOrSobjectType+formFactor+profile,actionName+pageOrSobjectType";
        let id = parse_unique_id_element(&el, Some(spec_all_compound));
        assert_eq!(
            id.len(),
            8,
            "no candidate should match → hash fallback, got {id}"
        );

        let spec_with_single_tail = "actionName+pageOrSobjectType+formFactor,actionName";
        assert_eq!(
            parse_unique_id_element(&el, Some(spec_with_single_tail)),
            "View"
        );
    }

    /// Empty values (`<recordType></recordType>`) must be treated as
    /// missing for the purpose of compound matching - otherwise we would
    /// emit filenames like `View__Account__Large__` with a trailing
    /// separator and silently collide with siblings that genuinely lack
    /// the field.
    #[test]
    fn compound_treats_empty_values_as_missing() {
        let el = json!({
            "actionName": { "#text": "View" },
            "pageOrSobjectType": { "#text": "Account" },
            "recordType": { "#text": "" }  // explicitly empty
        });
        let spec = "actionName+pageOrSobjectType+recordType,actionName+pageOrSobjectType";
        assert_eq!(
            parse_unique_id_element(&el, Some(spec)),
            "View__Account",
            "empty <recordType> must be treated as missing"
        );
    }

    /// Distinct profileActionOverrides siblings sharing actionName +
    /// pageOrSobjectType + formFactor but differing in `profile` must
    /// produce distinct compound IDs (not collide).
    #[test]
    fn compound_disambiguates_siblings_that_share_outer_fields() {
        let make = |profile: &str| {
            json!({
                "actionName": { "#text": "Tab" },
                "content": { "#text": "Home_Page_Default" },
                "formFactor": { "#text": "Large" },
                "pageOrSobjectType": { "#text": "standard-home" },
                "type": { "#text": "Flexipage" },
                "profile": { "#text": profile }
            })
        };
        let spec = "actionName+pageOrSobjectType+formFactor+profile";
        let a = parse_unique_id_element(&make("Implementation_Lightning"), Some(spec));
        let b = parse_unique_id_element(&make("Sales_Lightning"), Some(spec));
        assert_ne!(a, b);
        assert!(a.ends_with("Implementation_Lightning"));
        assert!(b.ends_with("Sales_Lightning"));
    }

    /// A single-field spec must behave identically to releases prior to
    /// compound-key support: direct match works, nested match works, and
    /// no spurious `__` separators are introduced.
    #[test]
    fn single_field_behaviour_is_unchanged() {
        let el = json!({ "name": "Get_Info", "label": "Get Info" });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "Get_Info");

        // With no direct match, the nested field is still found.
        let nested = json!({
            "wrapper": { "name": "NestedName" }
        });
        assert_eq!(parse_unique_id_element(&nested, Some("name")), "NestedName");
    }

    /// Pathological/malformed specs - leading commas, stray `+`, all
    /// whitespace - must not panic and must degrade to hash fallback.
    #[test]
    fn malformed_spec_degrades_to_hash() {
        let el = json!({ "foo": "bar" });
        let id = parse_unique_id_element(&el, Some(",,+,, "));
        assert_eq!(id.len(), 8, "all-empty candidates → hash fallback");
    }

    // ---- path-segment sanitization (issue #25) ------------------------------

    /// Salesforce identifiers can legitimately contain characters that are
    /// illegal in a path segment. The most common offender is `/` (seen in
    /// the wild on `EntitlementProcess.milestones[*].milestoneName`). Without
    /// sanitization the resolved id `Foo/Bar` is interpreted by the OS as
    /// the path `Foo/Bar.tag-meta.xml`, silently writing into a non-existent
    /// `Foo/` directory and dropping data. Each forbidden char must collapse
    /// to a single `_`.
    #[test]
    fn sanitize_replaces_path_separators() {
        assert_eq!(sanitize_path_segment("Foo/Bar"), "Foo_Bar");
        assert_eq!(sanitize_path_segment("Foo\\Bar"), "Foo_Bar");
        assert_eq!(
            sanitize_path_segment("TrustFile Transaction Sync/Import Complete"),
            "TrustFile Transaction Sync_Import Complete"
        );
    }

    #[test]
    fn sanitize_replaces_windows_reserved_chars() {
        for c in [':', '*', '?', '"', '<', '>', '|'] {
            let input = format!("a{c}b");
            assert_eq!(sanitize_path_segment(&input), "a_b", "char={c}");
        }
    }

    #[test]
    fn sanitize_replaces_control_characters() {
        // 0x00 (NUL) and 0x1F (US) map to `_`. TAB (0x09) is exercised
        // separately in `sanitize_strips_trailing_dot_and_space`.
        assert_eq!(sanitize_path_segment("a\u{0}b"), "a_b");
        assert_eq!(sanitize_path_segment("a\u{1f}b"), "a_b");
    }

    #[test]
    fn sanitize_strips_trailing_dot_and_space() {
        // Windows write semantics drop trailing `.` and space silently;
        // leaving them in would let two distinct inputs collide on disk.
        // Tab and other control characters are NOT in this set - they're
        // replaced with `_` via the control-char path so the byte isn't
        // lost (`Foo\t` -> `Foo_` rather than `Foo`).
        assert_eq!(sanitize_path_segment("Foo."), "Foo");
        assert_eq!(sanitize_path_segment("Foo "), "Foo");
        assert_eq!(sanitize_path_segment("Foo. ."), "Foo");
        assert_eq!(sanitize_path_segment("Foo\t"), "Foo_");
    }

    #[test]
    fn sanitize_passes_safe_inputs_through_unchanged() {
        // Borrows on the happy path - exercise via the Cow variant.
        let cases = [
            "Account",
            "Account_Name__c",
            "Sample_Object_005__c",
            "Implementation - TrustFile Amazon",
            "View",
            "TrustFile Account Setup Complete",
            "View__Account__Large__SalesProfile",
            // Inner dots are fine; only TRAILING dots are stripped.
            "Account.LogACall",
            "Sample_Object_017__c.Sample_Record_Type_0123",
        ];
        for case in cases {
            match sanitize_path_segment(case) {
                Cow::Borrowed(s) => assert_eq!(s, case, "unexpected mutation for {case:?}"),
                Cow::Owned(s) => panic!("unexpected allocation for {case:?}: got {s:?}"),
            }
        }
    }

    #[test]
    fn sanitize_replaces_illegal_chars_one_for_one() {
        // Each illegal char becomes exactly one `_`, so the result has the
        // same number of characters as the (trimmed) input: inputs with a
        // different count of illegal chars stay distinct (`/` vs `///`).
        // Inputs that differ only in WHICH illegal char they use still
        // collapse to the same output (`a/b/c` and `a*b?c` both give
        // `a_b_c`); that case is left to the upstream collision detector.
        assert_eq!(sanitize_path_segment("///"), "___");
        assert_eq!(sanitize_path_segment("/"), "_");
        assert_eq!(sanitize_path_segment("a/b/c"), "a_b_c");
        assert_eq!(sanitize_path_segment("a*b?c"), "a_b_c");
    }

    #[test]
    fn sanitize_replacement_yields_underscore_when_input_collapses_to_empty() {
        // Edge case: input is entirely trailing-trim-able (e.g. `". ."` or `". "`).
        // After trim the string is empty, which would produce a degenerate
        // filename like `.<tag>-meta.xml`. Substitute a single `_` so the
        // file still writes; the upstream collision detector will hash any
        // siblings that pile up here.
        assert_eq!(sanitize_path_segment(". ."), "_");
        assert_eq!(sanitize_path_segment(". "), "_");
        assert_eq!(sanitize_path_segment("."), "_");
        assert_eq!(sanitize_path_segment(" "), "_");
    }

    #[test]
    fn sanitize_handles_empty_input() {
        // Empty in -> empty out. Caller is responsible for upgrading to a
        // hash if they need a non-empty filename; sanitize itself has no
        // useful work to do here.
        let out = sanitize_path_segment("");
        assert!(matches!(out, Cow::Borrowed(s) if s.is_empty()));
    }

    /// `parse_unique_id_element` MUST apply sanitization at the boundary so
    /// every caller (single-field, compound, multi-level) gets it for free.
    /// This is the regression test that pairs with issue #25.
    #[test]
    fn parse_unique_id_element_sanitizes_resolved_value() {
        let el = json!({
            "milestoneName": { "#text": "TrustFile Transaction Sync/Import Complete" }
        });
        let id = parse_unique_id_element(&el, Some("milestoneName"));
        assert!(!id.contains('/'), "resolved id must not contain `/`: {id}");
        assert_eq!(id, "TrustFile Transaction Sync_Import Complete");
    }

    #[test]
    fn parse_unique_id_element_sanitizes_compound_values() {
        // Compound values are joined with `__` and the joined string is then
        // sanitized in `parse_unique_id_element`. Because replacement is
        // one-for-one, an illegal char inside any component is replaced in
        // place and must not survive into the produced filename.
        let el = json!({
            "actionName": { "#text": "View" },
            "pageOrSobjectType": { "#text": "Sample/Object__c" },
            "formFactor": { "#text": "Large" }
        });
        let id = parse_unique_id_element(&el, Some("actionName+pageOrSobjectType+formFactor"));
        assert!(!id.contains('/'), "compound id must not contain `/`: {id}");
        assert_eq!(id, "View__Sample_Object__c__Large");
    }

    #[test]
    fn parse_unique_id_element_hash_fallback_is_unaffected_by_sanitizer() {
        // Hash fallback returns 8 hex chars, all of which are safe; the
        // sanitizer must be a no-op here.
        let el = json!({ "a": "b" });
        let id = parse_unique_id_element(&el, Some("name"));
        assert_eq!(id.len(), 8);
        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Recursion must only return when a configured unique-id field is
    /// *actually* found, not when a recursive call falls back to its own
    /// hash. The hash is computed exactly once, at the top level, on the
    /// outer element.
    #[test]
    fn nested_search_does_not_return_inner_hash() {
        // Two distinct outer elements whose first recursable child has the
        // same shape. With the old behavior the recursion would compute a
        // hash of that inner child for both - same hash for distinct outers.
        // With the fix, each outer is hashed in full and they differ.
        let a = json!({
            "wrapper": { "leafA": "shared", "extraA": "different-A" },
            "outerA": "A"
        });
        let b = json!({
            "wrapper": { "leafA": "shared", "extraA": "different-A" },
            "outerB": "B"
        });
        let id_a = parse_unique_id_element(&a, Some("name"));
        let id_b = parse_unique_id_element(&b, Some("name"));
        assert_ne!(id_a, id_b);
    }
}
config_disassembler/xml/parsers/parse_unique_id.rs

config_disassembler/xml/parsers/
parse_unique_id.rs