config_disassembler/xml/parsers/parse_unique_id.rs
1//! Parse unique ID from XML element for file naming.
2//!
3//! ## Configuration syntax
4//!
5//! `unique_id_elements` is a comma-separated list of *candidates*; the first
6//! candidate that fully resolves against an element wins. Each candidate is
7//! either:
8//!
9//! * a single field name (e.g. `fullName`) - matches when that field is
10//! present anywhere in the element's subtree, or
11//! * a `+`-joined **compound** of two or more field names (e.g.
12//! `actionName+pageOrSobjectType+formFactor`) - matches only when *every*
13//! sub-field resolves at the same level, in which case the resolved
14//! values are joined with [`COMPOUND_VALUE_SEPARATOR`] (`__`).
15//!
16//! Compounds let metadata types like `<profileActionOverrides>` - whose
17//! natural unique key is `actionName + pageOrSobjectType + formFactor +
18//! profile [+ recordType]` - produce stable, readable filenames instead of
19//! collapsing every sibling into a SHA-256 fallback. Listing both the wide
20//! and narrow forms (`A+B+C+D, A+B+C, A`) gives a graceful fallback chain
21//! when an item only carries some of the keys.
22//!
23//! Backwards compatibility: any spec that contains no `+` is parsed as a
24//! list of single-field candidates and behaves identically to releases
25//! prior to compound-key support.
26
27use serde_json::Value;
28use sha2::{Digest, Sha256};
29use std::borrow::Cow;
30
31use crate::xml::types::XmlElement;
32
/// Separator placed between the resolved values of a matching compound
/// candidate.
///
/// `__` was picked because it is filesystem-safe everywhere and because a
/// single `_` is far too common inside Salesforce identifiers (e.g.
/// `Account_Name__c`): joining on one underscore would make it impossible to
/// tell where one value ends and the next begins. Values may still contain
/// `__` themselves, so the joined form is not guaranteed to split back into
/// its parts unambiguously.
const COMPOUND_VALUE_SEPARATOR: &str = "__";
40
/// Character written in place of every character that is illegal or
/// portability-unsafe in a path segment.
///
/// Underscore matches the convention used by `sanitize_filename` in the
/// grouped-by-tag write path, so behavior is consistent across strategies.
const SANITIZED_REPLACEMENT: char = '_';
46
/// Reports whether `c` is illegal or portability-unsafe inside a single
/// path segment on at least one supported OS.
///
/// The rejected set is:
///
/// - `/` and `\` - path separators on Unix / Windows
/// - `:` `*` `?` `"` `<` `>` `|` - reserved on Windows
/// - ASCII control characters (0x00-0x1F and 0x7F) - break terminals and
///   zip readers
///
/// Salesforce identifier fields can legitimately contain any of these.
/// `EntitlementProcess.milestones[*].milestoneName`, for example, accepts
/// free-form text, and `TrustFile Transaction Sync/Import Complete` has been
/// seen in the wild - the embedded `/` was being interpreted as a path
/// separator and silently dropped data on round-trip (see #25).
fn is_illegal_path_char(c: char) -> bool {
    // Separators first, then the Windows-reserved punctuation.
    const RESERVED: [char; 9] = ['/', '\\', ':', '*', '?', '"', '<', '>', '|'];
    c.is_ascii_control() || RESERVED.contains(&c)
}
62
/// Reports whether `c` is a trailing character that Windows silently strips
/// when creating a file (`.` or space).
///
/// Leaving these in place would let two distinct inputs (`Foo.` vs `Foo`,
/// `Foo ` vs `Foo`) collide on disk on Windows but not on Unix, breaking
/// cross-platform stability of disassembled output. Tab is deliberately
/// *not* in this set: Windows accepts a trailing tab in filenames, and the
/// (rare) tab is better replaced with `_` via the control-character path
/// than silently lost.
fn is_trailing_strip_char(c: char) -> bool {
    c == '.' || c == ' '
}
73
74/// Sanitize a resolved unique-ID value into a portable path segment.
75///
76/// Borrows the input on the happy path - the vast majority of Salesforce
77/// identifiers (`fullName`, `name`, `developerName`, ...) only contain
78/// ASCII alphanumerics, underscores, hyphens, and dots, all of which are
79/// passed through verbatim. We only allocate when the input contains an
80/// illegal character or has a trailing `.`/space that Windows would
81/// silently strip on write.
82///
83/// Order of operations matters:
84/// 1. Trim trailing `.`/space from the *input*. Windows would strip them
85/// on write anyway, so doing it deterministically here keeps Linux
86/// and Windows producing byte-identical filenames.
87/// 2. Replace illegal chars in the trimmed input with `_`. Each illegal
88/// char becomes exactly one `_` so the resulting length, and the
89/// mapping between original and replacement positions, is stable.
90///
91/// The substitution is deterministic so the produced filename is stable
92/// across runs and across machines, which keeps source-control diffs
93/// meaningful. When two distinct un-sanitized values collapse to the same
94/// sanitized form (for example `Foo/Bar` and `Foo_Bar` both produce
95/// `Foo_Bar`), the upstream caller's collision detector catches it and
96/// falls back to per-element SHA-256 hashes for the colliding siblings.
97fn sanitize_path_segment(s: &str) -> Cow<'_, str> {
98 let trimmed = s.trim_end_matches(is_trailing_strip_char);
99 let needs_replacement = trimmed.chars().any(is_illegal_path_char);
100 let was_trimmed = trimmed.len() != s.len();
101 if !needs_replacement && !was_trimmed {
102 return Cow::Borrowed(s);
103 }
104 let mut out = String::with_capacity(trimmed.len());
105 for c in trimmed.chars() {
106 if is_illegal_path_char(c) {
107 out.push(SANITIZED_REPLACEMENT);
108 } else {
109 out.push(c);
110 }
111 }
112 if out.is_empty() {
113 // Edge case: input was entirely trimmed away (e.g. `". "`). Returning
114 // an empty string would produce a path like `.<tag>-meta.xml` which
115 // is also invalid. Use a single underscore so the file still writes;
116 // the upstream collision detector will hash any siblings that pile
117 // up here.
118 out.push(SANITIZED_REPLACEMENT);
119 }
120 Cow::Owned(out)
121}
122
123/// Hash the full canonicalized JSON form of an element to derive an 8-char
124/// filename. SHA-256 over distinct content yields distinct prefixes with
125/// vanishingly small collision probability for normal sibling counts.
126fn create_short_hash(element: &XmlElement) -> String {
127 let stringified = serde_json::to_string(element).unwrap_or_default();
128 let mut hasher = Sha256::new();
129 hasher.update(stringified.as_bytes());
130 let result = hasher.finalize();
131 const HEX: &[u8; 16] = b"0123456789abcdef";
132 let mut s = String::with_capacity(8);
133 for b in result.iter().take(4) {
134 s.push(HEX[(b >> 4) as usize] as char);
135 s.push(HEX[(b & 0xf) as usize] as char);
136 }
137 s
138}
139
140/// True only for objects that have at least one element-name child. quick-xml
141/// represents leaf scalars (and attribute-only nodes) as `{ "#text": "..." }` /
142/// `{ "@attr": "...", "#text": "..." }`; those are *not* recursable - if we
143/// recurse into them we end up hashing the same single text-leaf child for
144/// every sibling that happens to start with the same scalar element, which
145/// silently collapses distinct siblings into one filename.
146fn is_recursable_object(value: &Value) -> bool {
147 let Some(obj) = value.as_object() else {
148 return false;
149 };
150 obj.iter()
151 .any(|(k, _)| !k.starts_with('#') && !k.starts_with('@'))
152}
153
154/// Extract string from a value - handles both direct strings and objects with #text (XML leaf elements).
155fn value_as_string(value: &Value) -> Option<String> {
156 if let Some(s) = value.as_str() {
157 return Some(s.to_string());
158 }
159 value
160 .as_object()
161 .and_then(|obj| obj.get("#text"))
162 .and_then(|v| v.as_str())
163 .map(|s| s.to_string())
164}
165
/// Splits the user-supplied spec into candidates, each of which is a list
/// of field names.
///
/// Candidates are separated by `,`; the fields of one candidate by `+`. A
/// candidate with one field is a plain single-field match (legacy
/// behaviour); two or more fields form a compound.
///
/// Field names are trimmed, and empty entries (from leading/trailing commas,
/// double commas, or stray `+` separators) are dropped, so a copy-pasted
/// spec like `, name ,, +foo+ ,` degrades to `[["name"], ["foo"]]` instead
/// of producing empty lookups.
fn parse_candidates(spec: &str) -> Vec<Vec<&str>> {
    let mut candidates = Vec::new();
    for raw_candidate in spec.split(',') {
        let fields: Vec<&str> = raw_candidate
            .split('+')
            .map(str::trim)
            .filter(|name| !name.is_empty())
            .collect();
        // A candidate made up solely of separators/whitespace carries no
        // field names and is skipped entirely.
        if !fields.is_empty() {
            candidates.push(fields);
        }
    }
    candidates
}
185
186/// Match a single candidate against the element's *direct* fields. A
187/// single-field candidate succeeds when the field is present and resolves
188/// to a non-empty string; a compound candidate succeeds only when every
189/// sub-field is present and non-empty, in which case the resolved values
190/// are joined with [`COMPOUND_VALUE_SEPARATOR`].
191///
192/// Restricting compounds to the same level keeps the semantics intuitive:
193/// `actionName+profile+recordType` describes a single record's shape, not
194/// a search for those tokens scattered across the subtree.
195fn match_candidate_at_direct(element: &XmlElement, fields: &[&str]) -> Option<String> {
196 let obj = element.as_object()?;
197 let mut parts: Vec<String> = Vec::with_capacity(fields.len());
198 for field in fields {
199 let value = obj.get(*field).and_then(value_as_string)?;
200 if value.is_empty() {
201 return None;
202 }
203 parts.push(value);
204 }
205 if parts.is_empty() {
206 return None;
207 }
208 Some(parts.join(COMPOUND_VALUE_SEPARATOR))
209}
210
211/// Search for a configured unique-id candidate anywhere in the subtree
212/// rooted at `element`. Returns `Some(id)` only when a candidate fully
213/// resolves; returns `None` so the caller can fall back to hashing the
214/// *outer* element rather than a single inner child.
215///
216/// Order of evaluation:
217/// 1. Try every candidate against the direct fields of `element` (so a
218/// direct match always beats a deeper one - preserves the priority that
219/// callers configuring `fullName,name` historically relied on).
220/// 2. If nothing matched, recurse into recursable children and repeat.
221fn find_id_in_subtree(element: &XmlElement, unique_id_elements: &str) -> Option<String> {
222 let candidates = parse_candidates(unique_id_elements);
223 if candidates.is_empty() {
224 return None;
225 }
226 for candidate in &candidates {
227 if let Some(id) = match_candidate_at_direct(element, candidate) {
228 return Some(id);
229 }
230 }
231 let obj = element.as_object()?;
232 for (_, child) in obj {
233 if !is_recursable_object(child) {
234 continue;
235 }
236 if let Some(found) = find_id_in_subtree(child, unique_id_elements) {
237 return Some(found);
238 }
239 }
240 None
241}
242
243/// Get a unique ID for an element, using configured fields or a hash of the
244/// *outer* element when no configured field exists in the subtree.
245///
246/// Hashing must be performed on the outer element (not on whatever inner
247/// child the search happened to visit first) so siblings whose first nested
248/// child shares a value - e.g. a list of `<actionOverrides>` that all start
249/// with `<actionName>View</actionName>` - still produce distinct filenames
250/// reflecting their distinct content.
251///
252/// Resolved configured-field values are passed through [`sanitize_path_segment`]
253/// before being returned so any path-illegal characters in the source value
254/// (e.g. `/` in an `EntitlementProcess` `milestoneName`) are mapped to a
255/// safe placeholder. Hash-fallback values are pure hex and pass through the
256/// sanitizer as a no-op.
257pub fn parse_unique_id_element(element: &XmlElement, unique_id_elements: Option<&str>) -> String {
258 let raw = if let Some(ids) = unique_id_elements {
259 find_id_in_subtree(element, ids).unwrap_or_else(|| create_short_hash(element))
260 } else {
261 create_short_hash(element)
262 };
263 match sanitize_path_segment(&raw) {
264 Cow::Borrowed(_) => raw,
265 Cow::Owned(s) => s,
266 }
267}
268
269/// Hash an arbitrary [`XmlElement`] to its 8-character short hash. Exposed so
270/// the upstream collision detector can request a deterministic fallback for
271/// individual siblings without re-deriving the hash logic.
272pub fn short_hash_for_element(element: &XmlElement) -> String {
273 create_short_hash(element)
274}
275
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Asserts that `id` has the shape produced by the hash fallback:
    /// exactly eight lowercase hexadecimal characters.
    fn assert_is_short_hash(id: &str) {
        assert_eq!(id.len(), 8, "expected an 8-char short hash, got {id}");
        assert!(
            id.chars().all(|c| matches!(c, '0'..='9' | 'a'..='f')),
            "short hash must be lowercase hex, got {id}"
        );
    }

    #[test]
    fn finds_direct_field() {
        let el = json!({ "name": "Get_Info", "label": "Get Info" });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "Get_Info");
    }

    #[test]
    fn finds_deeply_nested_field() {
        // value before connector so we find elementReference (matches TS iteration order)
        let el = json!({
            "value": { "elementReference": "accts.accounts" },
            "connector": { "targetReference": "X" }
        });
        assert_eq!(
            parse_unique_id_element(&el, Some("elementReference")),
            "accts.accounts"
        );
    }

    #[test]
    fn finds_id_in_grandchild() {
        let el = json!({
            "wrapper": {
                "inner": { "name": "NestedName" }
            }
        });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "NestedName");
    }

    #[test]
    fn value_as_string_returns_none_for_non_string_non_text_objects() {
        // The named field exists directly on the element, but its value is
        // neither a string nor an object with `#text`. Exercises the
        // None-return path inside `value_as_string` plus the "no match, move
        // on" path inside `match_candidate_at_direct`.
        let el = json!({ "name": { "other": "xxx" } });
        let id = parse_unique_id_element(&el, Some("name"));
        // Falls through to the 8-char short-hash fallback.
        assert_is_short_hash(&id);
    }

    #[test]
    fn falls_back_to_hash_when_no_match_and_no_nested_object() {
        // No direct match and no nested object match -> hash fallback.
        let el = json!({ "a": "string", "b": "another" });
        let id = parse_unique_id_element(&el, Some("name"));
        assert_is_short_hash(&id);
    }

    #[test]
    fn hash_fallback_when_unique_id_elements_is_none() {
        let el = json!({ "a": "b" });
        let id = parse_unique_id_element(&el, None);
        assert_is_short_hash(&id);
    }

    #[test]
    fn non_object_element_returns_hash() {
        let el = json!("just-a-string");
        let id = parse_unique_id_element(&el, Some("name"));
        assert_is_short_hash(&id);
    }

    #[test]
    fn finds_name_from_text_object() {
        // XML parser stores leaf elements as { "#text": "value" }
        let el = json!({
            "name": { "#text": "Get_Info" },
            "label": { "#text": "Get Info" },
            "actionName": { "#text": "GetFirstFromCollection" }
        });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "Get_Info");
        assert_eq!(
            parse_unique_id_element(&el, Some("actionName")),
            "GetFirstFromCollection"
        );
    }

    // ---- regression: text-leaf siblings must NOT collapse to one hash ------

    /// Models a `<CustomApplication>`'s `<actionOverrides>`: every block has
    /// the same `<actionName>View</actionName>` first child but distinct
    /// `<content>` and `<pageOrSobjectType>` payloads. With the old
    /// implementation the recursion landed on `{"#text":"View"}` for every
    /// sibling and they all hashed to the same 8-char prefix, silently
    /// collapsing 100s of overrides into a single shard that contained only
    /// the last one written.
    #[test]
    fn distinct_siblings_with_shared_first_text_leaf_get_distinct_hashes() {
        let make_action_override = |i: u32| -> XmlElement {
            json!({
                "actionName": { "#text": "View" },
                "comment": { "#text": format!("Action override {i}") },
                "content": { "#text": format!("Sample_Page_{i:05}") },
                "formFactor": { "#text": "Large" },
                "skipRecordTypeSelect": { "#text": "false" },
                "type": { "#text": "Flexipage" },
                "pageOrSobjectType": { "#text": format!("Sample_Object_{i:03}__c") }
            })
        };

        // Default unique-id elements ("fullName,name") - none of these are
        // present on actionOverride children.
        let ids = Some("fullName,name");

        let mut seen = std::collections::HashSet::new();
        for i in 1..=128 {
            let id = parse_unique_id_element(&make_action_override(i), ids);
            assert_is_short_hash(&id);
            assert!(
                seen.insert(id.clone()),
                "duplicate hash {id} for actionOverride {i} - distinct siblings collapsed"
            );
        }
    }

    /// Same shape but with no unique-id config at all: must also produce
    /// distinct hashes per sibling.
    #[test]
    fn distinct_siblings_get_distinct_hashes_with_no_unique_id_config() {
        let mut seen = std::collections::HashSet::new();
        for i in 1..=64 {
            let el = json!({
                "actionName": { "#text": "View" },
                "content": { "#text": format!("Page_{i}") }
            });
            let id = parse_unique_id_element(&el, None);
            assert!(
                seen.insert(id.clone()),
                "duplicate hash {id} at index {i} with no unique-id config"
            );
        }
    }

    /// `find_id_in_subtree` must skip text-leaf wrappers like
    /// `{"#text": "..."}` rather than treat them as recursable objects.
    /// Otherwise the search returns a hash of the inner wrapper rather than
    /// hashing the outer element.
    #[test]
    fn text_leaf_wrappers_are_not_recursable() {
        let leaf = json!({ "#text": "View" });
        assert!(!is_recursable_object(&leaf));

        let attrs_only = json!({ "@attr": "x", "#text": "y" });
        assert!(!is_recursable_object(&attrs_only));

        let real = json!({ "name": "x" });
        assert!(is_recursable_object(&real));

        let mixed = json!({ "@attr": "x", "name": "y" });
        assert!(is_recursable_object(&mixed));
    }

    // ---- compound-key support ----------------------------------------------

    /// A `<profileActionOverrides>` element with the full key set. The
    /// compound `actionName+pageOrSobjectType+formFactor+profile` must
    /// resolve to all four values joined with `__`.
    #[test]
    fn compound_resolves_when_all_fields_present() {
        let el = json!({
            "actionName": { "#text": "Tab" },
            "content": { "#text": "Home_Page_Default" },
            "formFactor": { "#text": "Large" },
            "pageOrSobjectType": { "#text": "standard-home" },
            "type": { "#text": "Flexipage" },
            "profile": { "#text": "Implementation_Lightning" }
        });
        let id =
            parse_unique_id_element(&el, Some("actionName+pageOrSobjectType+formFactor+profile"));
        assert_eq!(id, "Tab__standard-home__Large__Implementation_Lightning");
    }

    /// A compound that names a field the element doesn't have must NOT
    /// match - the next candidate (a narrower compound, then a single
    /// field) takes over.
    #[test]
    fn compound_falls_through_when_one_field_missing() {
        // `<actionOverrides>` (no profile, no recordType) - the wide compound
        // must fail, the narrow compound must succeed.
        let el = json!({
            "actionName": { "#text": "View" },
            "content": { "#text": "LUX_Case_Release_Candidate_Copy" },
            "formFactor": { "#text": "Large" },
            "pageOrSobjectType": { "#text": "Case" },
            "type": { "#text": "Flexipage" }
        });
        let spec = "actionName+pageOrSobjectType+formFactor+profile,actionName+pageOrSobjectType+formFactor,actionName";
        assert_eq!(
            parse_unique_id_element(&el, Some(spec)),
            "View__Case__Large"
        );
    }

    /// All compound candidates miss -> the loop must fall back to the
    /// single-field candidate at the tail of the spec, and ultimately to
    /// the outer-element hash if even that misses.
    #[test]
    fn compound_then_single_then_hash_fallback() {
        let el = json!({
            "actionName": { "#text": "View" }
        });
        let spec_all_compound =
            "actionName+pageOrSobjectType+formFactor+profile,actionName+pageOrSobjectType";
        let id = parse_unique_id_element(&el, Some(spec_all_compound));
        // No candidate should match -> hash fallback.
        assert_is_short_hash(&id);

        let spec_with_single_tail = "actionName+pageOrSobjectType+formFactor,actionName";
        assert_eq!(
            parse_unique_id_element(&el, Some(spec_with_single_tail)),
            "View"
        );
    }

    /// Empty values (`<recordType></recordType>`) must be treated as
    /// missing for the purpose of compound matching - otherwise we would
    /// emit filenames like `View__Account__Large__` with a trailing
    /// separator and silently collide with siblings that genuinely lack
    /// the field.
    #[test]
    fn compound_treats_empty_values_as_missing() {
        let el = json!({
            "actionName": { "#text": "View" },
            "pageOrSobjectType": { "#text": "Account" },
            "recordType": { "#text": "" } // explicitly empty
        });
        let spec = "actionName+pageOrSobjectType+recordType,actionName+pageOrSobjectType";
        assert_eq!(
            parse_unique_id_element(&el, Some(spec)),
            "View__Account",
            "empty <recordType> must be treated as missing"
        );
    }

    /// Distinct profileActionOverrides siblings sharing actionName +
    /// pageOrSobjectType + formFactor but differing in `profile` must
    /// produce distinct compound IDs (not collide).
    #[test]
    fn compound_disambiguates_siblings_that_share_outer_fields() {
        let make = |profile: &str| {
            json!({
                "actionName": { "#text": "Tab" },
                "content": { "#text": "Home_Page_Default" },
                "formFactor": { "#text": "Large" },
                "pageOrSobjectType": { "#text": "standard-home" },
                "type": { "#text": "Flexipage" },
                "profile": { "#text": profile }
            })
        };
        let spec = "actionName+pageOrSobjectType+formFactor+profile";
        let a = parse_unique_id_element(&make("Implementation_Lightning"), Some(spec));
        let b = parse_unique_id_element(&make("Sales_Lightning"), Some(spec));
        assert_ne!(a, b);
        assert!(a.ends_with("Implementation_Lightning"));
        assert!(b.ends_with("Sales_Lightning"));
    }

    /// A single-field spec must behave identically to releases prior to
    /// compound-key support: same priority (direct first, then nested),
    /// same hash fallback, no spurious `__` separators.
    #[test]
    fn single_field_behaviour_is_unchanged() {
        let el = json!({ "name": "Get_Info", "label": "Get Info" });
        assert_eq!(parse_unique_id_element(&el, Some("name")), "Get_Info");

        // A nested field is still found when there is no direct one.
        let nested = json!({
            "wrapper": { "name": "NestedName" }
        });
        assert_eq!(parse_unique_id_element(&nested, Some("name")), "NestedName");

        // Direct vs nested priority preserved: a direct field wins over the
        // same field deeper in the subtree.
        let both = json!({
            "wrapper": { "name": "NestedName" },
            "name": "DirectName"
        });
        assert_eq!(parse_unique_id_element(&both, Some("name")), "DirectName");

        // Every candidate is tried at the direct level before descending,
        // so a later candidate present directly beats an earlier candidate
        // that only exists in a child.
        let mixed = json!({
            "wrapper": { "fullName": "NestedFullName" },
            "name": "DirectName"
        });
        assert_eq!(
            parse_unique_id_element(&mixed, Some("fullName,name")),
            "DirectName"
        );
    }

    /// Pathological/malformed specs - leading commas, stray `+`, all
    /// whitespace - must not panic and must degrade to hash fallback.
    #[test]
    fn malformed_spec_degrades_to_hash() {
        let el = json!({ "foo": "bar" });
        let id = parse_unique_id_element(&el, Some(",,+,, "));
        // All-empty candidates -> hash fallback.
        assert_is_short_hash(&id);
    }

    // ---- path-segment sanitization (issue #25) ------------------------------

    /// Salesforce identifiers can legitimately contain characters that are
    /// illegal in a path segment. The most common offender is `/` (seen in
    /// the wild on `EntitlementProcess.milestones[*].milestoneName`). Without
    /// sanitization the resolved id `Foo/Bar` is interpreted by the OS as
    /// the path `Foo/Bar.tag-meta.xml`, silently writing into a non-existent
    /// `Foo/` directory and dropping data. Each forbidden char must collapse
    /// to a single `_`.
    #[test]
    fn sanitize_replaces_path_separators() {
        assert_eq!(sanitize_path_segment("Foo/Bar"), "Foo_Bar");
        assert_eq!(sanitize_path_segment("Foo\\Bar"), "Foo_Bar");
        assert_eq!(
            sanitize_path_segment("TrustFile Transaction Sync/Import Complete"),
            "TrustFile Transaction Sync_Import Complete"
        );
    }

    #[test]
    fn sanitize_replaces_windows_reserved_chars() {
        for c in [':', '*', '?', '"', '<', '>', '|'] {
            let input = format!("a{c}b");
            assert_eq!(sanitize_path_segment(&input), "a_b", "char={c}");
        }
    }

    #[test]
    fn sanitize_replaces_control_characters() {
        // 0x00 (NUL), 0x09 (TAB), 0x1F (US) and 0x7F (DEL) all map to `_`.
        assert_eq!(sanitize_path_segment("a\u{0}b"), "a_b");
        assert_eq!(sanitize_path_segment("a\tb"), "a_b");
        assert_eq!(sanitize_path_segment("a\u{1f}b"), "a_b");
        assert_eq!(sanitize_path_segment("a\u{7f}b"), "a_b");
    }

    #[test]
    fn sanitize_strips_trailing_dot_and_space() {
        // Windows write semantics drop trailing `.` and space silently;
        // leaving them in would let two distinct inputs collide on disk.
        // Tab and other control characters are NOT in this set - they're
        // replaced with `_` via the control-char path so the byte isn't
        // lost (`Foo\t` -> `Foo_` rather than `Foo`).
        assert_eq!(sanitize_path_segment("Foo."), "Foo");
        assert_eq!(sanitize_path_segment("Foo "), "Foo");
        assert_eq!(sanitize_path_segment("Foo. ."), "Foo");
        assert_eq!(sanitize_path_segment("Foo\t"), "Foo_");
    }

    #[test]
    fn sanitize_passes_safe_inputs_through_unchanged() {
        // Borrows on the happy path - exercise via the Cow variant.
        let cases = [
            "Account",
            "Account_Name__c",
            "Sample_Object_005__c",
            "Implementation - TrustFile Amazon",
            "View",
            "TrustFile Account Setup Complete",
            "View__Account__Large__SalesProfile",
            // Inner dots are fine; only TRAILING dots are stripped.
            "Account.LogACall",
            "Sample_Object_017__c.Sample_Record_Type_0123",
        ];
        for case in cases {
            match sanitize_path_segment(case) {
                Cow::Borrowed(s) => assert_eq!(s, case, "unexpected mutation for {case:?}"),
                Cow::Owned(s) => panic!("unexpected allocation for {case:?}: got {s:?}"),
            }
        }
    }

    #[test]
    fn sanitize_replaces_illegal_chars_one_for_one() {
        // Each illegal char becomes exactly one `_` so the result length
        // and structure mirror the input - critical for collision-detection
        // signal: two distinct inputs differing only in the NUMBER of
        // illegal chars produce distinct sanitized outputs and the collision
        // detector does not need to fire.
        assert_eq!(sanitize_path_segment("///"), "___");
        assert_eq!(sanitize_path_segment("/"), "_");
        assert_eq!(sanitize_path_segment("a/b/c"), "a_b_c");
        assert_eq!(sanitize_path_segment("a*b?c"), "a_b_c");
    }

    #[test]
    fn sanitize_replacement_yields_underscore_when_input_collapses_to_empty() {
        // Edge case: input is entirely trailing-trim-able (e.g. `". ."` or `". "`).
        // After trim the string is empty, which would produce a degenerate
        // filename like `.<tag>-meta.xml`. Substitute a single `_` so the
        // file still writes; the upstream collision detector will hash any
        // siblings that pile up here.
        assert_eq!(sanitize_path_segment(". ."), "_");
        assert_eq!(sanitize_path_segment(". "), "_");
        assert_eq!(sanitize_path_segment("."), "_");
        assert_eq!(sanitize_path_segment(" "), "_");
    }

    #[test]
    fn sanitize_handles_empty_input() {
        // Empty in -> empty out. Caller is responsible for upgrading to a
        // hash if they need a non-empty filename; sanitize itself has no
        // useful work to do here.
        let out = sanitize_path_segment("");
        assert!(matches!(out, Cow::Borrowed(s) if s.is_empty()));
    }

    /// `parse_unique_id_element` MUST apply sanitization at the boundary so
    /// every caller (single-field, compound, multi-level) gets it for free.
    /// This is the regression test that pairs with issue #25.
    #[test]
    fn parse_unique_id_element_sanitizes_resolved_value() {
        let el = json!({
            "milestoneName": { "#text": "TrustFile Transaction Sync/Import Complete" }
        });
        let id = parse_unique_id_element(&el, Some("milestoneName"));
        assert!(!id.contains('/'), "resolved id must not contain `/`: {id}");
        assert_eq!(id, "TrustFile Transaction Sync_Import Complete");
    }

    #[test]
    fn parse_unique_id_element_sanitizes_compound_values() {
        // Compound values are joined with `__`; an illegal char inside any
        // component must not survive into the produced filename.
        let el = json!({
            "actionName": { "#text": "View" },
            "pageOrSobjectType": { "#text": "Sample/Object__c" },
            "formFactor": { "#text": "Large" }
        });
        let id = parse_unique_id_element(&el, Some("actionName+pageOrSobjectType+formFactor"));
        assert!(!id.contains('/'), "compound id must not contain `/`: {id}");
        assert_eq!(id, "View__Sample_Object__c__Large");
    }

    #[test]
    fn parse_unique_id_element_hash_fallback_is_unaffected_by_sanitizer() {
        // Hash fallback returns 8 hex chars, all of which are safe; the
        // sanitizer must be a no-op here.
        let el = json!({ "a": "b" });
        let id = parse_unique_id_element(&el, Some("name"));
        assert_is_short_hash(&id);
    }

    /// Recursion must only return when a configured unique-id field is
    /// *actually* found, not when a recursive call falls back to its own
    /// hash. The hash is computed exactly once, at the top level, on the
    /// outer element.
    #[test]
    fn nested_search_does_not_return_inner_hash() {
        // Two distinct outer elements whose first recursable child has the
        // same shape. With the old behavior the recursion would compute a
        // hash of that inner child for both - same hash for distinct outers.
        // With the fix, each outer is hashed in full and they differ.
        let a = json!({
            "wrapper": { "leafA": "shared", "extraA": "different-A" },
            "outerA": "A"
        });
        let b = json!({
            "wrapper": { "leafA": "shared", "extraA": "different-A" },
            "outerB": "B"
        });
        let id_a = parse_unique_id_element(&a, Some("name"));
        let id_b = parse_unique_id_element(&b, Some("name"));
        assert_ne!(id_a, id_b);
    }
}
730}