kache 0.5.0

Zero-copy, content-addressed Rust build cache. No copies, no wasted disk — just hardlinks locally and S3 for sharing.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
//! Path normalization for cache keys.
//!
//! Purpose: stop machine-local absolute paths from leaking into cache
//! keys. Without this, a build at `/path/A/foo` and a build at
//! `/path/B/foo` produce different cache keys for byte-identical
//! sources — cache hits don't transfer across machines or worktrees,
//! and even relocating a project on the same machine misses
//! everything.
//!
//! ## Why a struct, not just `replace_cwd`
//!
//! Before this module, the cache_key code shipped a one-off
//! `normalize_flags` that did `value.replace(cwd, ".")`. Two failure
//! modes the struct fixes:
//!
//! 1. **Symlink-induced miss** (the e422e55 class). On macOS, `/tmp`
//!    is a symlink to `/private/tmp`. CWD reports `/tmp/build` but
//!    cargo emits `OUT_DIR=/private/tmp/build/...`. Literal string
//!    replace finds no match → path leaks into the key. Canonicalizing
//!    the prefix once at construction time fixes this without paying
//!    a `canonicalize()` per-replacement.
//!
//! 2. **Single-prefix horizon**. CWD is one of several path prefixes
//!    a cache key wants stripped. The struct holds an ordered list of
//!    `(prefix, sentinel)` rules so adding a new one is a one-line
//!    edit, not a new ad-hoc helper.
//!
//! ## Cross-machine contract
//!
//! For shared / remote caches (CI runners, multiple devs hitting the
//! same S3 bucket), key portability requires every machine-local path
//! to map to the same sentinel. Today's rule set covers:
//!
//! | Prefix              | Sentinel        | Why it varies per host |
//! |---------------------|-----------------|------------------------|
//! | workspace root      | `<WORKSPACE>`   | per-checkout location  |
//! | `$CARGO_TARGET_DIR` | `<TARGET>`      | per-user / per-job dir |
//! | `$CARGO_HOME`       | `<CARGO_HOME>`  | usually `~/.cargo`     |
//! | `$RUSTUP_HOME`      | `<RUSTUP_HOME>` | usually `~/.rustup`    |
//! | `$HOME`             | `<HOME>`        | per-user               |
//! | system tempdir      | `<TMPDIR>`      | macOS uses `/var/folders/<random>/T` per user |
//!
//! ## Two consumers, same rule set
//!
//! - [`PathNormalizer::normalize`] applies the rules to cache-key INPUTS (env
//!   var values, RUSTFLAGS, etc.). Sequential `String::replace` in
//!   most-specific-first order — first match wins.
//! - [`PathNormalizer::remap_args`] renders the rules as
//!   `--remap-path-prefix=PREFIX=SENTINEL` flags for rustc. **Reverses
//!   the rule order** because rustc applies remappings with
//!   last-match-wins (verified empirically by inspecting
//!   `strings <binary>`). This pairs cache-key portability with
//!   artifact-byte portability: a cached `.rlib` from Dev A's
//!   machine has DWARF that points at `<CARGO_HOME>/registry/...`
//!   instead of `/Users/alice/.cargo/registry/...`, so Dev B can
//!   debug it via `lldb`/`gdb` `set substitute-path`.
//!
//! What this **still doesn't** cover (separate concerns):
//!
//! - **build.rs-generated paths inside `OUT_DIR`**:
//!   `--remap-path-prefix` only rewrites paths rustc knows about at
//!   compile time. A `private.rs` generated by serde's build.rs may
//!   contain paths internally (rare in practice). Future fixture +
//!   audit if a real-world case shows up.
//! - **Toolchain divergence**: different rustc / linker / SDK versions
//!   produce different output for the same source. Already captured
//!   by `rustc_version` + `linker_id` in the cache key — non-shareable
//!   by design when these differ.
//! - **Sentinel collision**: if a real path string contained
//!   `<WORKSPACE>` literally, replacement would corrupt it. POSIX
//!   paths don't use `<`/`>`; Windows uses them only inside `\\?\`
//!   device-path forms which aren't cache-key inputs. Acceptable risk.
//!
//! ## macOS Unicode normalization (NFC vs NFD)
//!
//! HFS+ historically stored filenames in NFD form (`é` as `e` +
//! combining accent — three UTF-8 bytes); APFS preserves whatever
//! form the API caller used. Env vars set by shells / build tools
//! typically use NFC (`é` as a single codepoint — two UTF-8 bytes).
//! Same character to the eye, **different bytes**.
//!
//! Without normalization, a user `José` would silently lose
//! cross-machine cache sharing while ASCII-named users got it for
//! free — the cache key for their workspace would diverge across
//! tools that disagree on the encoding. We normalize both sides
//! (rule prefixes via [`canonical_string`], input via
//! [`PathNormalizer::normalize`]) to NFC before matching. Same
//! source bytes after normalization → same key.
//!
//! ## Sentinel strategy
//!
//! Each prefix maps to a stable token. Sentinels are angle-bracketed
//! so they can't appear in real path strings. Order matters: longer /
//! more-specific prefixes match first so `<WORKSPACE>` shadows
//! `<HOME>` for users whose workspace lives inside their home dir.
//!
//! ## Leak detection
//!
//! After applying all rules, [`PathNormalizer::normalize`] scans the
//! result for substrings that still look like absolute paths
//! (`/Users/`, `/home/`, `/private/`, drive letters). When found,
//! emits a `tracing::warn!` naming the suspicious value. This is a
//! defense-in-depth check — the relocate e2e phase exists to surface
//! these by failing the build, but the warning gives a faster signal
//! during ad-hoc dev runs without needing to run the full harness.

use std::path::{Path, PathBuf};
use unicode_normalization::UnicodeNormalization;

/// One canonicalized prefix + its replacement sentinel.
///
/// Rules are created by `push_rule_with_variants` /
/// `push_slash_and_case_variants` and consumed by
/// `PathNormalizer::normalize` (substring replace) and
/// `PathNormalizer::remap_args` (`--remap-path-prefix` flags).
#[derive(Debug, Clone)]
struct Rule {
    /// Literal text searched for in an input string.
    prefix: String,
    /// Stable token substituted for `prefix`, e.g. `<WORKSPACE>`.
    sentinel: &'static str,
}

/// Replaces machine-local path prefixes with stable sentinels so
/// cache keys are reproducible across hosts and worktrees.
///
/// Construction is best-effort: prefixes that fail to canonicalize
/// (because the env var isn't set, or the path doesn't exist) are
/// silently dropped. This is intentional — a missing
/// `$CARGO_TARGET_DIR` shouldn't make every cache key generation
/// fail; the worst case is the rule doesn't apply for that build.
#[derive(Debug, Clone)]
pub struct PathNormalizer {
    /// Ordered rule list. `normalize` applies it front to back;
    /// `remap_args` renders it back to front.
    rules: Vec<Rule>,
}

impl PathNormalizer {
    /// Build a normalizer from the current process environment plus
    /// an optional explicit workspace root.
    ///
    /// Rule order is most-specific first, so when prefixes nest
    /// (workspace inside `$HOME`, target dir inside workspace) the
    /// inner prefix wins:
    ///
    /// 1. current working directory → `<WORKSPACE>`
    /// 2. `workspace_root` (if provided) → `<WORKSPACE>`
    /// 3. `$CARGO_TARGET_DIR` → `<TARGET>`
    /// 4. `$CARGO_HOME` (or default `~/.cargo`) → `<CARGO_HOME>`
    /// 5. `$RUSTUP_HOME` (or default `~/.rustup`) → `<RUSTUP_HOME>`
    /// 6. `$HOME` → `<HOME>`
    /// 7. system tempdir (`std::env::temp_dir()`) → `<TMPDIR>`
    ///
    /// Rule 1 is what rewrites rustc's DWARF `DW_AT_comp_dir` — rustc
    /// records its working directory there verbatim, and the other
    /// rules only rewrite it if one's prefix matches the CWD exactly.
    /// Without rule 1 a debug build leaks the build path through
    /// `comp_dir` (and any dSYM derived from it). kache runs as
    /// RUSTC_WRAPPER with no `chdir`, so the wrapper's CWD is the CWD
    /// rustc records. For a workspace member it is more specific than
    /// `workspace_root` (the repo root); for a single package they
    /// coincide and the dedup below collapses them.
    ///
    /// Tempdir lands last because it can be unrelated to user paths
    /// (`/tmp` on Linux, `/var/folders/.../T` on macOS) — it's
    /// purely for relocate-style scenarios where build artifacts
    /// flow through tempdirs.
    pub fn from_env(workspace_root: Option<&Path>) -> Self {
        let mut rules = Vec::new();

        // User-declared base dir (`KACHE_BASE_DIR`, the analog of ccache's
        // `CCACHE_BASEDIR`). Highest priority — an explicit prefix the
        // user knows should be stripped so the key is independent of the
        // checkout location. Covers paths the auto-derived rules below
        // can't see (container mounts, generated-file env-deps that point
        // outside the workspace, etc.).
        push_rule_with_variants(
            &mut rules,
            std::env::var_os("KACHE_BASE_DIR")
                .filter(|v| !v.is_empty())
                .and_then(|v| canonical_string(Path::new(&v))),
            "<BASE_DIR>",
        );

        push_rule_with_variants(
            &mut rules,
            std::env::current_dir()
                .ok()
                .and_then(|p| canonical_string(&p)),
            "<WORKSPACE>",
        );

        push_rule_with_variants(
            &mut rules,
            workspace_root.and_then(canonical_string),
            "<WORKSPACE>",
        );

        push_rule_with_variants(
            &mut rules,
            std::env::var_os("CARGO_TARGET_DIR").and_then(|p| canonical_string(Path::new(&p))),
            "<TARGET>",
        );

        // CARGO_HOME defaults to $HOME/.cargo if not set; honor that
        // so users with the default layout still get the substitution.
        let cargo_home = std::env::var_os("CARGO_HOME")
            .map(PathBuf::from)
            .or_else(|| dirs::home_dir().map(|h| h.join(".cargo")));
        push_rule_with_variants(
            &mut rules,
            cargo_home.and_then(|p| canonical_string(&p)),
            "<CARGO_HOME>",
        );

        // Same default-fallback story for RUSTUP_HOME. rustup paths
        // appear in -L flags and toolchain refs — without this rule,
        // Dev A's `~alice/.rustup` and Dev B's `~bob/.rustup` would
        // produce different keys for byte-identical sources.
        let rustup_home = std::env::var_os("RUSTUP_HOME")
            .map(PathBuf::from)
            .or_else(|| dirs::home_dir().map(|h| h.join(".rustup")));
        push_rule_with_variants(
            &mut rules,
            rustup_home.and_then(|p| canonical_string(&p)),
            "<RUSTUP_HOME>",
        );

        push_rule_with_variants(
            &mut rules,
            dirs::home_dir().and_then(|p| canonical_string(&p)),
            "<HOME>",
        );

        // Windows-specific rules: cargo / rustup paths often live
        // under %APPDATA% / %LOCALAPPDATA% (e.g. CARGO_HOME defaults
        // to %USERPROFILE%\.cargo but tools like rustup-init may
        // also stash state in %LOCALAPPDATA%\rustup). %PROGRAMFILES%
        // covers MSVC / SDK locations that flow into native-link
        // search paths. Each is added as its own sentinel so cross-
        // dev caches don't leak per-user paths.
        for (env_key, sentinel) in [
            ("APPDATA", "<APPDATA>"),
            ("LOCALAPPDATA", "<LOCALAPPDATA>"),
            ("PROGRAMFILES", "<PROGRAMFILES>"),
        ] {
            push_rule_with_variants(
                &mut rules,
                std::env::var_os(env_key).and_then(|v| canonical_string(Path::new(&v))),
                sentinel,
            );
        }

        // System tempdir — varies wildly across hosts (per-user random
        // segment on macOS like `/var/folders/1z/.../T`). Build
        // artifacts from build.rs scripts often pass through tempdirs
        // for intermediate output.
        push_rule_with_variants(
            &mut rules,
            canonical_string(&std::env::temp_dir()),
            "<TMPDIR>",
        );

        // De-dupe: if (e.g.) CARGO_HOME == workspace_root, the second
        // entry would never fire. Also keep order stable so the
        // first-listed sentinel wins for identical prefixes. Variants
        // pushed by `push_rule_with_variants` are already adjacent
        // per-base prefix, so a stable dedup catches duplicates
        // introduced when (e.g.) `~/.cargo` happens to canonicalize
        // identically to one of its forward-slash variants on a
        // unix host (no-op then).
        rules.dedup_by(|a, b| a.prefix == b.prefix);

        Self { rules }
    }

    /// Construct a rule-less normalizer: [`Self::normalize`] returns its
    /// input untouched (apart from NFC composition) and
    /// [`Self::remap_args`] yields no flags.
    ///
    /// Serves tests and any code path that wants an explicit "no
    /// normalization configured" value without reading the host's
    /// environment through [`Self::from_env`].
    ///
    /// Carries `#[allow(dead_code)]` because, today, only the test
    /// module calls it; production code always goes through
    /// `from_env`. It stays on the public surface so future callers
    /// (e.g. cache_key integration tests with deterministic inputs)
    /// have a constructor that doesn't depend on env state.
    #[allow(dead_code)]
    pub fn empty() -> Self {
        let rules = Vec::new();
        Self { rules }
    }

    /// Substitute each rule's sentinel for every occurrence of its
    /// prefix in `s` and return the rewritten string.
    ///
    /// Works on plain strings rather than [`Path`] values: cache-key
    /// inputs come from shell commands and env vars, where paths are
    /// often embedded in larger text (e.g.
    /// `-L /home/x/lib -L /home/x/build/deps`). Rules run in list
    /// order, each as a literal substring replace over the output of
    /// the previous one, so an earlier (more specific) rule claims a
    /// byte range before a later one can.
    ///
    /// **No input canonicalization beyond Unicode.** An earlier design
    /// rewrote separators / drive-letter case in the input itself,
    /// which also altered parts of the string that matched no rule.
    /// Instead, the rule list stores several spellings of each prefix
    /// (see [`push_rule_with_variants`]) and matching stays
    /// byte-literal.
    ///
    /// **Input is composed to NFC first**, because rule prefixes are
    /// stored in NFC (see [`canonical_string`]). macOS filesystem APIs
    /// may hand back NFD (`é` as `e` + combining accent) while env
    /// vars usually carry NFC (`é` as one codepoint); without this
    /// step a user named `José` would quietly miss cross-machine cache
    /// hits. Already-NFC input (the common case) passes through
    /// unchanged.
    ///
    /// After substitution the result is passed to
    /// [`warn_if_path_leaked`] for the residual-path check.
    pub fn normalize<S: AsRef<str>>(&self, s: S) -> String {
        let composed: String = s.as_ref().nfc().collect();
        let rewritten = self
            .rules
            .iter()
            // An empty needle would match between every character.
            .filter(|rule| !rule.prefix.is_empty())
            .fold(composed, |acc, rule| {
                acc.replace(&rule.prefix, rule.sentinel)
            });
        warn_if_path_leaked(&rewritten);
        rewritten
    }

    /// Produce one `--remap-path-prefix=PREFIX=SENTINEL` argument per
    /// rule, ready to append to a rustc command line.
    ///
    /// This is the same rule list [`Self::normalize`] uses, aimed at
    /// rustc instead of the cache-key hasher: rustc rewrites the paths
    /// it embeds (DWARF / PDB / `#[track_caller]` locations) so the
    /// produced artifact carries the stable sentinels rather than this
    /// machine's prefixes.
    ///
    /// **The list is emitted back to front.** rustc resolves
    /// `--remap-path-prefix` with last-match-wins, whereas the rule
    /// list is most-specific *first* for `normalize`'s sequential
    /// replace. Reversing puts the most specific mapping last so it
    /// overrides any outer mapping. This was verified empirically via
    /// `strings <binary>` on a debug build: without the reversal,
    /// paths under `~/.cargo` came out as `<HOME>/.cargo/...` rather
    /// than `<CARGO_HOME>/...`.
    ///
    /// Rules with an empty prefix are skipped.
    pub fn remap_args(&self) -> Vec<String> {
        let mut flags = Vec::with_capacity(self.rules.len());
        for rule in self.rules.iter().rev() {
            if rule.prefix.is_empty() {
                continue;
            }
            flags.push(format!(
                "--remap-path-prefix={}={}",
                rule.prefix, rule.sentinel
            ));
        }
        flags
    }
}

/// Defense-in-depth: scan a string for substrings that look like
/// machine-local absolute paths and emit a warning naming the
/// context (which hash field, which value).
///
/// The relocate e2e phase already catches leaks by failing the build
/// when keys diverge across paths, but this gives a faster signal
/// during ad-hoc dev runs (a `KACHE_LOG=warn` user sees the leak the
/// first time they hit it instead of waiting for the full harness).
///
/// Only matches well-known absolute-path prefixes — avoids false
/// positives on legitimate strings that happen to contain `/`. The
/// list is intentionally conservative: missing a leak is preferable
/// to spamming the log on innocuous values.
///
/// `value` is the string to inspect. `context` names where the value
/// came from (e.g. `"normalize"`, `"codegen:link-arg"`,
/// `"cargo_cfg:MOZ_OBJ_DIR"`). It surfaces in the warn so the
/// offending field is identifiable without re-running with
/// trace-level logging.
///
/// At most one warning is emitted per call: the scan stops at the
/// first entry of the prefix list that occurs in `value`. The excerpt
/// shown is roughly 40 bytes either side of the hit, widened as needed
/// so it never cuts a multi-byte UTF-8 character in half — slicing a
/// `str` at a non-boundary offset panics, and values here routinely
/// contain non-ASCII user or directory names.
pub(crate) fn check_for_path_leak(value: &str, context: &str) {
    const SUSPICIOUS_PREFIXES: &[&str] = &[
        "/Users/",       // macOS home
        "/home/",        // Linux home
        "/private/tmp/", // macOS canonicalized tempdir
        "/private/var/", // macOS canonicalized var
        "/var/folders/", // macOS per-user tempdir
        "C:\\Users\\",   // Windows home
    ];
    // Bytes of surrounding text shown on each side of the hit.
    const WINDOW: usize = 40;

    for prefix in SUSPICIOUS_PREFIXES {
        if let Some(idx) = value.find(prefix) {
            // Show a small window around the leak to make it easier
            // to identify (env var name etc.) without dumping the
            // whole value, which could be huge.
            //
            // The raw offsets are byte arithmetic and may land inside
            // a multi-byte character; snap outward to the nearest char
            // boundary. Offsets 0 and `value.len()` are always
            // boundaries, so both loops terminate in range.
            let mut start = idx.saturating_sub(WINDOW);
            while !value.is_char_boundary(start) {
                start -= 1;
            }
            let mut end = (idx + prefix.len() + WINDOW).min(value.len());
            while !value.is_char_boundary(end) {
                end += 1;
            }
            tracing::warn!(
                "residual absolute path detected in `{}` (prefix `{}`): ...{}...",
                context,
                prefix,
                &value[start..end]
            );
            return;
        }
    }
}

/// Thin forwarding shim kept for `PathNormalizer::normalize`: runs
/// [`check_for_path_leak`] on `s` with a fixed context label.
fn warn_if_path_leaked(s: &str) {
    const CONTEXT: &str = "PathNormalizer::normalize";
    check_for_path_leak(s, CONTEXT);
}

/// Resolve `path` via [`Path::canonicalize`] and render it as a string
/// usable as a rule prefix. Yields `None` when canonicalization fails
/// (path missing / unresolvable) or when the rendered string is empty
/// — an empty needle passed to `String::replace` would match between
/// every character and produce nonsense.
///
/// The result is the OS-native canonical form (on Windows: `\`
/// separators, original case) with any `\\?\` verbatim prefix removed
/// by [`strip_verbatim_prefix`], then **composed to Unicode NFC**.
/// Non-UTF-8 path bytes are replaced lossily. Alternate Windows
/// spellings (slash direction, drive-letter case) are not produced
/// here; [`push_rule_with_variants`] adds those as extra rules, which
/// keeps this string in a shape path-aware code can consume as-is.
///
/// Why NFC: macOS filesystem APIs may report NFD (HFS+ legacy,
/// sometimes APFS) while env vars usually carry NFC. Normalizing the
/// rule here and the input in [`PathNormalizer::normalize`] lets a
/// byte-literal substring match succeed whichever form the source
/// produced.
fn canonical_string(path: &Path) -> Option<String> {
    let resolved = path.canonicalize().ok()?;
    let rendered = resolved.to_string_lossy();
    let composed: String = strip_verbatim_prefix(&rendered).nfc().collect();
    Some(composed).filter(|text| !text.is_empty())
}

/// Remove the Windows extended-length ("verbatim") `\\?\` marker that
/// [`Path::canonicalize`] puts in front of paths on Windows.
///
/// * `\\?\C:\proj` becomes `C:\proj` (borrowed from the input).
/// * `\\?\UNC\server\share` becomes `\\server\share` (newly allocated).
/// * Anything else — which is every path on Unix — is returned as-is,
///   borrowed.
///
/// Rule prefixes are compared against what cargo / rustc / env vars
/// actually emit, and those are plain `C:\...` paths, never verbatim
/// ones. Left unstripped, a rule such as `\\?\C:\proj` could never
/// substring-match an input `C:\proj\...`; on Windows that meant no
/// normalization fired at all and machine-local paths (notably
/// `OUT_DIR`) leaked into the cache key, breaking cross-path cache
/// hits (kunobi-ninja/kache#201's relocate miss).
fn strip_verbatim_prefix(s: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    let Some(rest) = s.strip_prefix(r"\\?\") else {
        return Cow::Borrowed(s);
    };
    // `\\?\UNC\server\share`: after the marker the remainder starts
    // with `UNC\`; swap that for the ordinary `\\` UNC lead-in.
    match rest.strip_prefix(r"UNC\") {
        Some(share) => Cow::Owned(format!(r"\\{share}")),
        None => Cow::Borrowed(rest),
    }
}

/// Append a rule for `prefix` — and, on Windows, rules for its
/// alternate spellings — to `rules`, all mapping to `sentinel`.
///
/// A `None` or empty `prefix` adds nothing. Otherwise the prefix
/// itself is always pushed first. On non-Windows builds that is the
/// only rule added: the canonical form is the only spelling cargo /
/// rustc / env vars produce there, so variants would be dead rules.
///
/// On Windows the following spellings are added as well, each only
/// when it differs from what is already covered:
///
/// 1. **Forward-slash form** — cargo's dep-info often reports paths
///    with `/` even though the OS uses `\`, so a stored
///    `C:\Users\alice\.cargo` would miss
///    `C:/Users/alice/.cargo/registry/...`.
/// 2. **Lower-case drive letter** — NTFS treats `C:\` and `c:\` as the
///    same path, but `String::replace` is byte-literal and tools may
///    emit either.
/// 3. **Lower-case drive + forward slash** — the combination of 1
///    and 2.
/// 4. **8.3 short-name form** — some tools emit the legacy short name
///    (`C:\PROGRA~1\...`) while the canonical rule is the long form
///    (`C:\Program Files\...`). The short form is resolved once via
///    [`short_path_name`] and added together with its own slash/case
///    spellings (issue #126).
///
/// Why store variants rather than transform the input: an earlier
/// draft canonicalized the input string before matching, which also
/// flipped separators in parts of the input that matched no rule.
/// With variants, [`PathNormalizer::normalize`] remains a plain
/// byte-literal substring replace and leaves everything outside a
/// matched prefix untouched. [`PathNormalizer::from_env`] runs a
/// de-duplication pass over the finished list.
fn push_rule_with_variants(rules: &mut Vec<Rule>, prefix: Option<String>, sentinel: &'static str) {
    let canonical = match prefix {
        Some(p) if !p.is_empty() => p,
        _ => return,
    };

    // Canonical spelling always goes in first.
    rules.push(Rule {
        prefix: canonical.clone(),
        sentinel,
    });

    if cfg!(windows) {
        push_slash_and_case_variants(rules, &canonical, sentinel);

        // 8.3 short-name form (issue #126), looked up once while the
        // rule list is built. A `None` here (path missing, no short
        // name, or 8.3 generation disabled on the volume) means there
        // is nothing further to add.
        match short_path_name(&canonical) {
            Some(short) if short != canonical => {
                rules.push(Rule {
                    prefix: short.clone(),
                    sentinel,
                });
                push_slash_and_case_variants(rules, &short, sentinel);
            }
            _ => {}
        }
    }
}

/// Push the forward-slash, lower-case-drive, and combined variants of a
/// single Windows prefix. The canonical form is expected to have been
/// pushed already by the caller. Each variant is only added when it
/// actually differs from the forms already covered.
fn push_slash_and_case_variants(rules: &mut Vec<Rule>, prefix: &str, sentinel: &'static str) {
    // Forward-slash variant.
    let fs = prefix.replace('\\', "/");
    if fs != prefix {
        rules.push(Rule {
            prefix: fs.clone(),
            sentinel,
        });
    }

    // Lower-case drive variant.
    let lc = lowercase_drive_letter(prefix);
    if let Some(ref lc_str) = lc
        && lc_str != prefix
    {
        rules.push(Rule {
            prefix: lc_str.clone(),
            sentinel,
        });
    }

    // Both: forward-slash + lower-case drive.
    if let Some(fs_lc) = lowercase_drive_letter(&fs)
        && fs_lc != fs
        && Some(&fs_lc) != lc.as_ref()
    {
        rules.push(Rule {
            prefix: fs_lc,
            sentinel,
        });
    }
}

/// Resolve a path's Windows 8.3 short name (e.g. `C:\Program Files` →
/// `C:\PROGRA~1`) via `GetShortPathNameW`. Returns `None` when the path
/// doesn't exist, the volume has 8.3 generation disabled, or the call
/// otherwise fails — all of which simply mean "no short-name variant to
/// add". Off Windows this is always `None`.
///
/// The result is converted from UTF-16 lossily, so unpaired surrogates
/// (if any) come back as U+FFFD.
#[cfg(windows)]
fn short_path_name(path: &str) -> Option<String> {
    use std::ffi::{OsStr, OsString};
    use std::os::windows::ffi::{OsStrExt, OsStringExt};
    use windows_sys::Win32::Storage::FileSystem::GetShortPathNameW;

    // UTF-16 copy of `path` with the trailing NUL the API requires.
    let wide: Vec<u16> = OsStr::new(path)
        .encode_wide()
        .chain(std::iter::once(0))
        .collect();

    // First call with a zero-length buffer returns the required length
    // INCLUDING the terminating NUL (0 on error).
    // SAFETY: `wide` is a live, NUL-terminated UTF-16 buffer for the
    // whole call. The output pointer is null with a length of 0, which
    // is the size-query form of the call, so nothing is written.
    let needed = unsafe { GetShortPathNameW(wide.as_ptr(), std::ptr::null_mut(), 0) };
    if needed == 0 {
        return None;
    }

    let mut buf = vec![0u16; needed as usize];
    // On success the buffer is large enough, so this returns the length
    // EXCLUDING the NUL (i.e. < needed). 0 = error; >= needed = a race
    // where the path grew between calls — treat both as "no variant".
    // SAFETY: `wide` is as above; `buf` owns exactly `needed` u16
    // elements and that same count is passed as the buffer length, so
    // the API cannot write past the allocation.
    let written = unsafe { GetShortPathNameW(wide.as_ptr(), buf.as_mut_ptr(), needed) };
    if written == 0 || written >= needed {
        return None;
    }

    // `written < needed == buf.len()`, so this slice is in bounds and
    // excludes the terminating NUL.
    Some(
        OsString::from_wide(&buf[..written as usize])
            .to_string_lossy()
            .into_owned(),
    )
}

/// Non-Windows stand-in for the 8.3 short-name lookup: there is no
/// short-name form to resolve, so this ignores `_path` and always
/// returns `None`.
#[cfg(not(windows))]
fn short_path_name(_path: &str) -> Option<String> {
    None
}

/// Produce a copy of `s` with its drive letter lowercased, for inputs
/// shaped like a Windows drive path.
///
/// Returns `Some` only when `s` begins with an ASCII uppercase letter
/// immediately followed by `:`. Everything else — Unix paths, drives that
/// are already lowercase, strings shorter than two bytes, non-letter or
/// non-ASCII first characters — yields `None`, which callers treat as
/// "no variant".
///
/// Only the first byte changes; the rest of the path is copied verbatim
/// (Windows filesystems are typically case-preserving even when
/// case-insensitive).
///
/// The shape check looks at raw bytes instead of slicing the `&str`, so
/// an input starting with a multi-byte UTF-8 codepoint (e.g. `É`,
/// `0xC3 0x89`) returns `None` rather than panicking with "not a char
/// boundary" the way `&s[1..2]` would.
fn lowercase_drive_letter(s: &str) -> Option<String> {
    let [drive, b':', ..] = s.as_bytes() else {
        return None;
    };
    if !drive.is_ascii_uppercase() {
        return None;
    }
    let mut out = s.to_owned();
    // Byte 0 was just confirmed to be ASCII, i.e. a complete one-byte
    // character, so `..1` ends on a char boundary and the safe in-place
    // ASCII lowercasing needs no `unsafe` byte poking.
    out[..1].make_ascii_lowercase();
    Some(out)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    #[test]
    fn empty_normalizer_is_identity() {
        // With no rules there is nothing to substitute, so every input —
        // the empty string included — must come back exactly as given.
        let identity = PathNormalizer::empty();
        for sample in ["/anything/at/all", ""] {
            assert_eq!(identity.normalize(sample), sample);
        }
    }

    #[test]
    fn canonicalizes_workspace_prefix_and_replaces_with_sentinel() {
        // `canonicalize` only succeeds for a directory that really exists,
        // hence the on-disk temp dir acting as the workspace.
        let workspace = TempDir::new().unwrap();
        let normalizer = PathNormalizer::from_env(Some(workspace.path()));
        let resolved = workspace.path().canonicalize().unwrap();
        let input = format!("{}/src/main.rs", resolved.display());

        // A path under the canonical workspace root must pick up the
        // workspace sentinel somewhere in the output.
        let output = normalizer.normalize(&input);
        assert!(
            output.contains("<WORKSPACE>"),
            "got {} for input {input}",
            output
        );
    }

    #[test]
    fn workspace_rule_strips_macos_private_tmp_symlink() {
        // macOS-only scenario: `/tmp` is a symlink to `/private/tmp`. A
        // normalizer that compared paths literally, without
        // canonicalizing, would miss whenever one side used the symlink
        // spelling and the other the resolved one.
        if cfg!(not(target_os = "macos")) {
            return;
        }
        // The workspace is created via the symlink spelling (`/tmp/...`);
        // the string being normalized uses the resolved spelling
        // (`/private/tmp/...`), the way an OUT_DIR-style value would.
        let dir_name = format!("kache-pn-test-{}", std::process::id());
        let workspace = Path::new("/tmp").join(&dir_name);
        let _ = fs::create_dir_all(&workspace);
        let normalizer = PathNormalizer::from_env(Some(&workspace));
        let resolved_form = format!("/private/tmp/{dir_name}/target/release/build/foo/out");
        let normalized = normalizer.normalize(&resolved_form);
        // Best-effort cleanup happens before the assertion so a failure
        // does not leave the directory behind.
        let _ = fs::remove_dir_all(&workspace);
        assert!(
            normalized.starts_with("<WORKSPACE>"),
            "expected <WORKSPACE> sentinel, got {normalized:?}"
        );
    }

    #[test]
    fn home_rule_normalizes_paths_inside_home() {
        let n = PathNormalizer::from_env(None);
        // Nothing to check when the home directory is unknown or cannot
        // be canonicalized on this host.
        let Some(home) = dirs::home_dir().and_then(|p| p.canonicalize().ok()) else {
            return;
        };
        // On Windows `canonicalize` returns the verbatim `\\?\C:\...`
        // spelling. Left in place, that marker would sit in front of the
        // substituted sentinel and break the `starts_with('<')` check
        // below, so build the input from the plain spelling instead.
        // Off Windows the strip is a no-op.
        let home = home.display().to_string();
        let input = format!("{}/some/thing", strip_verbatim_prefix(home.as_str()));
        // Either <HOME> or <CARGO_HOME> may match (cargo home is inside
        // home by default); any sentinel counts as a valid normalization.
        let out = n.normalize(&input);
        assert!(
            out.starts_with('<'),
            "expected a sentinel prefix, got {out:?}"
        );
    }

    #[test]
    fn empty_prefix_does_not_corrupt_input() {
        // An empty pattern handed to `String::replace` matches at every
        // position and would shred the input. This builds a normalizer
        // holding exactly such a rule (bypassing the usual construction
        // path) and pins that `normalize` still leaves the text alone.
        let degenerate = Rule {
            prefix: String::new(),
            sentinel: "<NEVER>",
        };
        let normalizer = PathNormalizer {
            rules: vec![degenerate],
        };
        let text = "hello world";
        assert_eq!(normalizer.normalize(text), text);
    }

    #[test]
    fn unmatched_input_passes_through_unchanged() {
        // A relative path that no env-derived rule should match must be
        // returned verbatim.
        let normalizer = PathNormalizer::from_env(None);
        let unrelated = "this/path/does/not/match/anything/local";
        let result = normalizer.normalize(unrelated);
        assert_eq!(result, unrelated);
    }

    #[test]
    fn from_env_includes_rustup_home_rule_when_dir_exists() {
        // Probes `~/.rustup`, the default rustup location. The check is
        // environment-dependent, so when there is no home dir or no such
        // directory the test returns quietly instead of producing a
        // host-specific failure.
        let Some(rustup_root) = dirs::home_dir()
            .map(|home| home.join(".rustup"))
            .filter(|candidate| candidate.exists())
        else {
            return;
        };
        let normalizer = PathNormalizer::from_env(None);
        let resolved = rustup_root.canonicalize().unwrap();
        let rustc_path = format!("{}/toolchains/stable/bin/rustc", resolved.display());
        let out = normalizer.normalize(&rustc_path);
        // Either the rustup rule or the broader home rule is acceptable.
        assert!(
            out.contains("<RUSTUP_HOME>") || out.contains("<HOME>"),
            "expected rustup or home sentinel in {out:?}"
        );
    }

    #[test]
    fn from_env_includes_tempdir_rule() {
        let normalizer = PathNormalizer::from_env(None);
        let temp_root = std::env::temp_dir().canonicalize().unwrap();
        let artifact = format!("{}/some-build-artifact", temp_root.display());
        let out = normalizer.normalize(&artifact);
        // Which rule fires depends on the host layout (on macOS the temp
        // dir is `/var/folders/.../T`), so any sentinel is accepted here,
        // not just `<TMPDIR>`.
        let has_sentinel =
            out.contains("<TMPDIR>") || out.contains("<HOME>") || out.starts_with('<');
        assert!(
            has_sentinel,
            "expected a sentinel for tempdir-based path, got {out:?}"
        );
    }

    #[test]
    fn from_env_includes_cwd_rule_for_comp_dir() {
        // Why a CWD rule exists: rustc writes its working directory
        // verbatim into DWARF `DW_AT_comp_dir`, and the other rules only
        // rewrite it when one of their prefixes happens to equal the CWD.
        // Without a dedicated rule a debug build would leak the build
        // path. kache runs as RUSTC_WRAPPER without a `chdir`, so the
        // wrapper's CWD is the one rustc records.
        let cwd = std::env::current_dir().unwrap();
        let canonical = canonical_string(&cwd).expect("cwd must canonicalize");
        let n = PathNormalizer::from_env(None);

        // Part 1: a path under the CWD normalizes to some sentinel.
        // `<WORKSPACE>` is expected, but a more specific rule may shadow
        // it, hence the loose `starts_with('<')`.
        let probe = format!("{canonical}/src/main.rs");
        let normalized = n.normalize(probe);
        assert!(
            normalized.starts_with('<'),
            "CWD path should normalize to a sentinel, got {normalized:?}"
        );

        // Part 2: the same rule shows up as a `--remap-path-prefix` flag,
        // so rustc rewrites `comp_dir` at compile time.
        let expected_flag = format!("--remap-path-prefix={canonical}=<WORKSPACE>");
        let remaps = n.remap_args();
        assert!(
            remaps.iter().any(|flag| flag == &expected_flag),
            "from_env must emit a CWD remap arg; got {remaps:?}"
        );
    }

    #[test]
    fn remap_args_emits_one_flag_per_rule_in_declaration_order() {
        // Checks two properties of the flags `remap_args` produces for an
        // env-derived normalizer:
        //   1. every flag has the `--remap-path-prefix=FROM=TO` shape;
        //   2. when both are present, the `<HOME>` flag comes BEFORE the
        //      `<WORKSPACE>` flag.
        //
        // Ordering matters because rustc resolves overlapping remaps
        // last-match-wins: the more specific workspace mapping has to
        // appear later on the command line than the broader home mapping
        // to override it. That is the reverse of the most-specific-first
        // order `normalize` relies on for its sequential replace (see
        // `most_specific_prefix_wins_when_nested`).
        let dir = TempDir::new().unwrap();
        let n = PathNormalizer::from_env(Some(dir.path()));
        let args = n.remap_args();
        for arg in &args {
            let Some(body) = arg.strip_prefix("--remap-path-prefix=") else {
                panic!("unexpected arg shape: {arg:?}");
            };
            // Split at the LAST `=`: the sentinels are `<NAME>` tokens
            // with no `=` in them, whereas a prefix could contain one in
            // a pathological filename.
            let Some((from, to)) = body.rsplit_once('=') else {
                panic!("missing `=PREFIX=SENTINEL` shape: {arg:?}");
            };
            assert!(
                !from.is_empty() && !to.is_empty(),
                "empty prefix or sentinel in remap arg: {arg:?}"
            );
        }
        // Only assert the relative order when both flags were emitted;
        // which rules exist depends on the host environment.
        if let (Some(ws_idx), Some(home_idx)) = (
            args.iter().position(|a| a.contains("<WORKSPACE>")),
            args.iter().position(|a| a.contains("<HOME>")),
        ) {
            assert!(
                home_idx < ws_idx,
                "HOME must come before WORKSPACE so rustc's last-match-wins \
                 lets WORKSPACE override; got:\n{args:#?}"
            );
        }
    }

    #[test]
    fn remap_args_skips_empty_prefixes() {
        // An empty prefix would turn into `--remap-path-prefix==<SENTINEL>`,
        // which rustc would probably accept and apply to every path. The
        // flag list must therefore omit such a rule, mirroring the skip
        // that `normalize` performs for empty prefixes.
        let degenerate = Rule {
            prefix: String::new(),
            sentinel: "<NEVER>",
        };
        let normalizer = PathNormalizer {
            rules: vec![degenerate],
        };
        let flags = normalizer.remap_args();
        assert!(flags.is_empty());
    }

    // ── Windows-shape variants ──────────────────────────────────
    //
    // These tests exercise the "store multiple variants per rule"
    // strategy that handles Windows separator + drive-case
    // differences without transforming the input string. Each test
    // can run on any host; the cfg!(windows) gates select the
    // expected behavior (variants are added on Windows, skipped
    // elsewhere).

    /// Test helper: snapshot a normalizer's rules as owned
    /// `(prefix, sentinel)` pairs, in rule order.
    fn rules_for(n: &PathNormalizer) -> Vec<(String, &'static str)> {
        let mut pairs = Vec::with_capacity(n.rules.len());
        for rule in &n.rules {
            pairs.push((rule.prefix.clone(), rule.sentinel));
        }
        pairs
    }

    #[test]
    fn lowercase_drive_letter_only_touches_first_byte() {
        // Only the drive letter changes case; every later character —
        // including uppercase ones — is carried over untouched, for both
        // separator styles.
        let cases = [
            ("C:\\Users\\Alice", "c:\\Users\\Alice"),
            ("D:/Projects/Foo", "d:/Projects/Foo"),
        ];
        for (input, expected) in cases {
            assert_eq!(lowercase_drive_letter(input), Some(expected.to_string()));
        }
    }

    #[test]
    fn lowercase_drive_letter_returns_none_for_non_drive_paths() {
        // Inputs that are not an uppercase-drive path all give `None`,
        // which callers read as "no variant".
        let rejected = [
            "/unix/path",           // no drive at all
            "c:\\already\\lower",   // drive already lowercase
            "C",                    // too short
            "",                     // empty
            "CD",                   // missing `:`
            "1:\\foo",              // first char is not a letter
        ];
        for input in rejected {
            assert_eq!(lowercase_drive_letter(input), None);
        }
    }

    #[test]
    fn push_rule_with_variants_adds_only_canonical_form_on_unix() {
        // Off Windows, the canonical spelling is the only one cargo and
        // rustc emit, so extra variants would be dead rules. This pins
        // that exactly one rule is pushed; it would fail loudly if the
        // variant expansion were ever made unconditional.
        if cfg!(windows) {
            return;
        }
        let mut rules = Vec::new();
        let prefix = String::from("/Users/alice/.cargo");
        push_rule_with_variants(&mut rules, Some(prefix), "<CARGO_HOME>");

        assert_eq!(rules.len(), 1);
        let only = &rules[0];
        assert_eq!(only.prefix, "/Users/alice/.cargo");
        assert_eq!(only.sentinel, "<CARGO_HOME>");
    }

    #[test]
    fn push_rule_with_variants_expands_on_windows() {
        // On Windows one drive-letter prefix fans out into up to four
        // spellings — as given, forward-slash, lowercase drive, and both
        // combined — all tied to the SAME sentinel, so whichever spelling
        // an input uses gets normalized identically.
        if !cfg!(windows) {
            return;
        }
        let mut rules = Vec::new();
        push_rule_with_variants(
            &mut rules,
            Some(String::from("C:\\Users\\Alice\\.cargo")),
            "<CARGO_HOME>",
        );

        let expected_spellings = [
            "C:\\Users\\Alice\\.cargo",
            "C:/Users/Alice/.cargo",
            "c:\\Users\\Alice\\.cargo",
            "c:/Users/Alice/.cargo",
        ];
        for spelling in expected_spellings {
            assert!(rules.iter().any(|rule| rule.prefix == spelling));
        }
        assert!(rules.iter().all(|rule| rule.sentinel == "<CARGO_HOME>"));
    }

    /// Issue #126: the legacy 8.3 spelling of a path (`...\LONGPR~1`) has
    /// to normalize to the same sentinel as the long spelling. Windows
    /// only. Whether an alias exists depends on the volume, so the test
    /// asks the OS for the real short name of a freshly created directory
    /// and returns early (instead of flaking) when none is available.
    #[cfg(windows)]
    #[test]
    fn push_rule_with_variants_adds_8dot3_short_name() {
        // A directory name containing spaces and longer than 8 chars gets
        // an 8.3 alias where the volume supports it
        // (e.g. "Long Program Dir" -> "LONGPR~1").
        let scratch = TempDir::new().unwrap();
        let long_dir = scratch.path().join("Long Program Dir");
        fs::create_dir(&long_dir).unwrap();
        let canonical = canonical_string(&long_dir).expect("canonicalize long dir");

        // No alias, or an "alias" identical to the long spelling: there
        // is no distinct short form to exercise.
        let Some(short) = short_path_name(&canonical).filter(|alias| *alias != canonical) else {
            return;
        };

        let mut rules = Vec::new();
        push_rule_with_variants(&mut rules, Some(canonical), "<BASE_DIR>");
        let normalizer = PathNormalizer { rules };

        let input = format!("{short}\\src\\main.rs");
        let out = normalizer.normalize(&input);
        assert!(
            out.contains("<BASE_DIR>"),
            "8.3 short-name input {input:?} should normalize via the short variant, got {out:?}"
        );
    }

    #[test]
    fn push_rule_with_variants_skips_empty_and_none() {
        // Two inputs that must never produce a rule:
        //   - `None`: the rule's source is unavailable (env var unset,
        //     directory missing);
        //   - an empty string: as a `String::replace` pattern it would
        //     match everywhere and corrupt every input.
        let mut rules = Vec::new();
        for unusable in [None, Some(String::new())] {
            push_rule_with_variants(&mut rules, unusable, "<NEVER>");
        }
        assert!(rules.is_empty());
    }

    #[test]
    fn normalize_matches_any_variant_form() {
        // The motivating Windows case: one stored prefix, inputs arriving
        // in any of four spellings. All of them should land on the same
        // sentinel because every spelling was stored at construction.
        let mut rules = Vec::new();
        push_rule_with_variants(
            &mut rules,
            Some(String::from("C:\\Users\\Alice\\.cargo")),
            "<CARGO_HOME>",
        );
        let normalizer = PathNormalizer { rules };

        // Off Windows only the spelling passed in is stored, so only that
        // input matches; on Windows all four spellings do.
        let on_windows = cfg!(windows);
        let table: [(&str, bool); 5] = [
            ("C:\\Users\\Alice\\.cargo\\registry\\foo", true), // as given
            ("C:/Users/Alice/.cargo/registry/foo", on_windows),
            ("c:\\Users\\Alice\\.cargo\\registry\\foo", on_windows),
            ("c:/Users/Alice/.cargo/registry/foo", on_windows),
            ("/Users/alice/.cargo/registry/foo", false), // unix path, never matches
        ];
        for (input, should_match) in table {
            let out = normalizer.normalize(input);
            let matched = out.contains("<CARGO_HOME>");
            assert_eq!(
                matched, should_match,
                "input {input:?}: expected match={should_match}, got out={out:?}"
            );
        }
    }

    #[test]
    fn normalize_does_not_transform_unmatched_input() {
        // Contract: text that matches no rule comes out byte-identical.
        // A design that rewrote the input first (e.g. flipping every
        // backslash) would break this; storing variants on the rule side
        // means the input itself is never transformed.
        let normalizer = PathNormalizer::empty();
        let odd_inputs = [
            "C:\\Users\\foo",
            "/unix/with\\backslash/mixed",
            r"\\?\C:\extended\length",
            "\\//\\/", // deliberately scrambled separators
            "no path here at all",
        ];
        for input in odd_inputs {
            assert_eq!(
                normalizer.normalize(input),
                input,
                "unmatched input {input:?} must pass through unchanged"
            );
        }
    }

    #[test]
    fn rules_dedup_keeps_first_for_identical_canonicalization() {
        // Two different sources can resolve to the same prefix (say
        // CARGO_HOME == HOME — odd, but possible in tests). After
        // dedup the sentinel that was pushed first must be the survivor.
        //
        // NOTE(review): `Vec::dedup_by` only collapses ADJACENT equal
        // entries. That suffices where each push adds a single rule (the
        // unix case); confirm the behavior on hosts where each push also
        // adds variants between the two identical prefixes.
        let mut rules = Vec::new();
        for sentinel in ["<FIRST>", "<SECOND>"] {
            push_rule_with_variants(&mut rules, Some(String::from("/same/path")), sentinel);
        }
        rules.dedup_by(|later, earlier| later.prefix == earlier.prefix);

        // Exactly one rule remains, carrying the first sentinel.
        let survivors = rules_for(&PathNormalizer { rules });
        assert_eq!(survivors.len(), 1);
        assert_eq!(survivors[0].1, "<FIRST>");
    }

    #[test]
    fn leak_detector_does_not_panic_on_unsentineled_paths() {
        // Leak detection is meant to be an observability aid: it may log
        // about suspicious leftover paths but must never abort. An empty
        // normalizer leaves these home-directory-looking inputs
        // untouched, which is exactly what the detector would flag.
        let normalizer = PathNormalizer::empty();
        let leaky_inputs = [
            "/Users/alice/leaked/path",
            "/home/bob/leaked/path",
            "C:\\Users\\charlie\\leaked",
        ];
        for input in leaky_inputs {
            let _ = normalizer.normalize(input);
        }
        // Log output is not captured in unit tests; reaching this point
        // without a panic is the whole assertion.
    }

    // ── Extended edge-case coverage ─────────────────────────────
    //
    // The tests above cover the basic surface (empty/non-empty
    // normalizers, sentinel substitution, Windows variants). This
    // section exercises edge cases that aren't immediately obvious
    // from the basic tests but matter for real-world inputs:
    //
    //   - Multiple occurrences of the same prefix in one input
    //   - The boundary case where the input EQUALS the prefix
    //   - Multiple distinct prefixes chained in one input
    //   - Nested prefixes (workspace inside home; most-specific wins)
    //   - Idempotency (sentinels themselves don't get re-replaced)
    //   - The known limitation: substring vs. path-component matching
    //   - Realistic OUT_DIR / RUSTFLAGS-shape inputs end-to-end

    /// Test helper: assemble a normalizer from explicit
    /// `(prefix, sentinel)` pairs, kept in the order given. Skips the
    /// env-derived `from_env` so results do not vary across hosts.
    fn pn_with_rules(rules: Vec<(&str, &'static str)>) -> PathNormalizer {
        let mut built = Vec::with_capacity(rules.len());
        for (prefix, sentinel) in rules {
            built.push(Rule {
                prefix: prefix.to_owned(),
                sentinel,
            });
        }
        PathNormalizer { rules: built }
    }

    #[test]
    fn normalize_replaces_all_occurrences_of_same_prefix() {
        // Every occurrence of a prefix is rewritten, not just the first.
        // Modeled on a RUSTFLAGS value carrying several `-L` flags that
        // point into one workspace.
        let normalizer = pn_with_rules(vec![("/ws", "<W>")]);
        let flags = "-L /ws/lib -L /ws/build/deps -L /ws/extra";
        let expected = "-L <W>/lib -L <W>/build/deps -L <W>/extra";
        assert_eq!(normalizer.normalize(flags), expected);
    }

    #[test]
    fn normalize_handles_input_equal_to_prefix() {
        // Boundary case: the input IS the prefix, with nothing after it.
        // The result is the bare sentinel with no stray bytes.
        let normalizer = pn_with_rules(vec![("/ws", "<W>")]);
        let out = normalizer.normalize("/ws");
        assert_eq!(out, "<W>");
    }

    #[test]
    fn normalize_chains_multiple_distinct_prefixes_in_one_input() {
        // One RUSTFLAGS-style string that mentions both the workspace and
        // the cargo home: each rule rewrites its own portion.
        let rule_set = vec![("/ws", "<W>"), ("/home/u/.cargo", "<C>")];
        let normalizer = pn_with_rules(rule_set);
        let flags = "-L /ws/lib -L /home/u/.cargo/registry/src/foo";
        let expected = "-L <W>/lib -L <C>/registry/src/foo";
        assert_eq!(normalizer.normalize(flags), expected);
    }

    #[test]
    fn most_specific_prefix_wins_when_nested() {
        // The "workspace lives inside home" layout. Rules are listed
        // most-specific first, so the workspace prefix is replaced before
        // the home rule is tried and the output starts with `<W>`, not
        // `<H>/projects/...`.
        let normalizer = pn_with_rules(vec![("/home/u/projects/ws", "<W>"), ("/home/u", "<H>")]);
        let cases = [
            // Inside the workspace: the workspace rule wins.
            ("/home/u/projects/ws/src/lib.rs", "<W>/src/lib.rs"),
            // Sibling outside the workspace: falls through to the home rule.
            ("/home/u/other/foo.rs", "<H>/other/foo.rs"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalizer.normalize(input), expected);
        }
    }

    #[test]
    fn normalize_is_idempotent_on_already_sentinelized_input() {
        // A second pass over already-normalized text must change nothing.
        // This holds because sentinels are angle-bracketed tokens
        // (`<HOME>`, `<WORKSPACE>`) that do not look like the path
        // prefixes rules are built from. If sentinels were ever chosen to
        // resemble real paths, this test would catch it.
        let rule_set = vec![("/home/u", "<HOME>"), ("/workspace", "<WORKSPACE>")];
        let normalizer = pn_with_rules(rule_set);
        let first_pass = normalizer.normalize("/home/u/projects/foo /workspace/src/main.rs");
        let second_pass = normalizer.normalize(&first_pass);
        assert_eq!(first_pass, second_pass, "normalize is not idempotent");
        // Guard against a vacuous pass: both sentinels really were inserted.
        assert!(first_pass.contains("<HOME>"));
        assert!(first_pass.contains("<WORKSPACE>"));
    }

    #[test]
    fn normalize_substring_match_is_documented_limitation() {
        // ACCEPTED LIMITATION: matching is by byte substring, not by path
        // component. A rule for `/home/u` therefore also fires inside a
        // longer component such as `/home/usr`, turning `/home/usr/foo`
        // into `<H>sr/foo`.
        //
        // This is tolerated because rule prefixes are expected to come
        // from canonicalized directories, and the strings being
        // normalized almost always have a separator right after a
        // matched prefix. Should a caller ever supply prefixes that do
        // not end at a directory boundary, this test documents the
        // surprise that results.
        let normalizer = pn_with_rules(vec![("/home/u", "<H>")]);
        let cases = [
            // Pathological: the 7-byte prefix `/home/u` matches the start
            // of `/home/usr`, leaving `sr/foo` behind.
            ("/home/usr/foo", "<H>sr/foo"),
            // Realistic: a separator follows the prefix.
            ("/home/u/foo", "<H>/foo"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalizer.normalize(input), expected);
        }
    }

    #[test]
    fn normalize_handles_realistic_out_dir_value() {
        // End-to-end shape of an OUT_DIR value for a build-script crate:
        //   <workspace>/target/release/build/serde-XXX/out
        let workspace = "/Users/alice/projects/myrepo";
        let tail = "/target/release/build/serde-65d43fa14511931c/out";
        let normalizer = pn_with_rules(vec![(workspace, "<WORKSPACE>")]);

        let out_dir = format!("{workspace}{tail}");
        let expected = format!("<WORKSPACE>{tail}");
        assert_eq!(normalizer.normalize(&out_dir), expected);
    }

    #[test]
    fn normalize_handles_realistic_rustflags_value() {
        // A RUSTFLAGS-shaped value with several search dirs, two of them
        // machine-local and one a system path.
        let normalizer = pn_with_rules(vec![
            ("/Users/alice/.cargo", "<CARGO_HOME>"),
            ("/Users/alice/projects/myrepo", "<WORKSPACE>"),
        ]);
        let flags = [
            "-L /Users/alice/.cargo/registry/cache/foo",
            "-L /Users/alice/projects/myrepo/target/release/deps",
            "-C link-arg=-Wl,-rpath,/system/lib",
        ]
        .join(" ");
        let out = normalizer.normalize(&flags);

        // The two machine-local paths become sentinels; the system path,
        // which no rule covers, is left as written.
        for fragment in [
            "<CARGO_HOME>/registry/cache/foo",
            "<WORKSPACE>/target/release/deps",
            "/system/lib",
        ] {
            assert!(out.contains(fragment));
        }
    }

    #[test]
    fn lowercase_drive_letter_handles_drive_root_alone() {
        // Minimal inputs: a bare drive (`C:`) or drive root (`C:\`, `D:/`).
        // The letter is still lowercased and the remainder kept as-is.
        let cases = [("C:\\", "c:\\"), ("C:", "c:"), ("D:/", "d:/")];
        for (input, expected) in cases {
            assert_eq!(lowercase_drive_letter(input), Some(expected.to_string()));
        }
    }

    #[test]
    fn windows_variants_are_distinct_for_distinct_canonical_forms() {
        // Variant expansion only adds spellings that actually differ from
        // the one passed in. Here the input already has a lowercase drive
        // and forward slashes, so the backslash spelling is expected as
        // an extra rule while no drive-case variant is.
        if !cfg!(windows) {
            return;
        }
        let mut rules = Vec::new();
        push_rule_with_variants(
            &mut rules,
            Some(String::from("c:/users/alice/.cargo")),
            "<CARGO_HOME>",
        );

        let has_prefix = |wanted: &str| rules.iter().any(|rule| rule.prefix == wanted);
        assert!(has_prefix("c:/users/alice/.cargo"));
        assert!(has_prefix("c:\\users\\alice\\.cargo"));
        // Nothing was given with an uppercase drive, and the expansion
        // must not invent one.
        assert!(rules.iter().all(|rule| !rule.prefix.starts_with("C:")));
    }

    #[test]
    fn remap_args_emits_all_windows_variants_with_same_sentinel() {
        // Each stored spelling becomes its own `--remap-path-prefix`
        // flag, and all of them target one sentinel, so the outcome is
        // the same whichever spelling the real build path uses.
        if !cfg!(windows) {
            return;
        }
        let mut rules = Vec::new();
        push_rule_with_variants(
            &mut rules,
            Some(String::from("C:\\Users\\Alice\\.cargo")),
            "<CARGO_HOME>",
        );
        let flags = PathNormalizer { rules }.remap_args();

        // Four spellings of one prefix -> four flags.
        assert_eq!(flags.len(), 4);
        for arg in &flags {
            assert!(
                arg.ends_with("=<CARGO_HOME>"),
                "every variant maps to the same sentinel; got {arg:?}"
            );
        }
    }

    // ── Unicode safety ──────────────────────────────────────────
    //
    // The substring-match design uses byte-literal `String::replace`
    // (which is UTF-8 self-synchronizing — partial codepoint matches
    // can't happen). The drive-letter helper uses byte indexing
    // explicitly to stay panic-safe on inputs that start with a
    // multi-byte codepoint. These tests pin both contracts.

    #[test]
    fn lowercase_drive_letter_does_not_panic_on_multibyte_first_char() {
        // `É` (U+00C9) occupies two bytes in UTF-8 (0xC3 0x89). A
        // `&s[1..2]` slice would panic with "not a char boundary" because
        // byte 1 lies inside that codepoint; the byte-level check simply
        // answers `None`.
        let multibyte_starts = [
            "É:foo", // byte 1 is a continuation byte, not `:`
            "日本:",  // three-byte first char
            "é",     // exactly two bytes, neither of them `:`
        ];
        for input in multibyte_starts {
            assert_eq!(lowercase_drive_letter(input), None);
        }
        // Sanity: the plain ASCII drive case is unaffected.
        assert_eq!(lowercase_drive_letter("C:foo"), Some("c:foo".to_string()));
    }

    #[test]
    fn normalize_handles_unicode_paths_correctly() {
        // A home directory with a non-ASCII name (`/Users/José`). Rule
        // prefix and input carry the SAME UTF-8 bytes here, so the
        // substitution works like the ASCII case and the remainder of
        // the path is preserved.
        let cargo_home = "/Users/José/.cargo";
        let normalizer = pn_with_rules(vec![(cargo_home, "<CARGO_HOME>")]);
        let out = normalizer.normalize("/Users/José/.cargo/registry/src/foo");
        assert_eq!(out, "<CARGO_HOME>/registry/src/foo");
    }

    #[test]
    fn normalize_matches_across_nfc_nfd_unicode_normalization_forms() {
        // `é` can be encoded two ways:
        //   NFC: U+00E9         → 0xC3 0xA9       (2 bytes)
        //   NFD: U+0065 U+0301  → 0x65 0xCC 0x81  (3 bytes)
        // A byte-literal comparison treats these as different strings.
        // This test pins that a rule whose prefix is in NFC still matches
        // an input spelled in NFD (and, of course, one spelled in NFC).
        let nfc_prefix = "/Users/Jos\u{00E9}/.cargo"; // precomposed é
        let nfd_input = "/Users/Jos\u{0065}\u{0301}/.cargo/registry/foo"; // e + combining acute

        // Precondition: at the byte level the NFD input does NOT start
        // with the NFC prefix, so a match below cannot be accidental.
        assert!(!nfd_input.as_bytes().starts_with(nfc_prefix.as_bytes()));

        let normalizer = pn_with_rules(vec![(nfc_prefix, "<CARGO_HOME>")]);

        // NFD input against the NFC rule.
        let from_nfd = normalizer.normalize(nfd_input);
        assert_eq!(from_nfd, "<CARGO_HOME>/registry/foo");

        // NFC input against the same rule.
        let from_nfc = normalizer.normalize("/Users/Jos\u{00E9}/.cargo/registry/bar");
        assert_eq!(from_nfc, "<CARGO_HOME>/registry/bar");
    }

    #[test]
    fn strip_verbatim_prefix_removes_extended_length_marker() {
        // Background (#201): on Windows, canonicalize yields
        // `\\?\C:\...`, so a rule prefix built from it never
        // substring-matched the plain `C:\...` paths cargo emits and
        // OUT_DIR leaked into the cache key.
        //
        // Each row is (input, expected output).
        let cases = [
            // Verbatim drive path loses its `\\?\` marker.
            (r"\\?\C:\proj\out", r"C:\proj\out"),
            // Verbatim UNC collapses back to `\\server\share`.
            (r"\\?\UNC\server\share\x", r"\\server\share\x"),
            // Plain Windows paths pass through untouched...
            (r"C:\proj\out", r"C:\proj\out"),
            // ...and so do Unix paths.
            ("/home/u/proj", "/home/u/proj"),
        ];

        for (input, expected) in cases {
            assert_eq!(strip_verbatim_prefix(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn canonical_string_normalizes_to_nfc() {
        // Prefixes produced at rule-construction time have to be in
        // NFC, otherwise the NFC normalization applied in `normalize`
        // has nothing to match against. That must hold even when the
        // filesystem hands back non-NFC bytes (which canonicalize may
        // do on macOS).
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("Jos\u{00E9}");
        std::fs::create_dir(&target).unwrap();

        let result = canonical_string(&target).expect("canonicalize should succeed");

        // A string is in NFC exactly when NFC-normalizing it again is
        // a no-op, so compare the output with its own NFC form.
        let renormalized = result.nfc().collect::<String>();
        assert_eq!(
            result, renormalized,
            "canonical_string output must be in NFC form, got {result:?}"
        );
    }

    #[test]
    fn normalize_preserves_unicode_outside_matched_prefix() {
        // "Unmatched bytes are left alone" must also hold for
        // non-ASCII text. In UTF-8 every byte of a multi-byte
        // sequence has its high bit set (lead bytes 0xC2-0xF4,
        // continuation bytes 0x80-0xBF), while `/` (0x2F) and `\`
        // (0x5C) are plain ASCII. A separator byte therefore can
        // never show up in the middle of a codepoint, and separator
        // handling cannot split one. This test pins that by example
        // with Latin, CJK and Cyrillic path components.
        let normalizer = pn_with_rules(vec![("/ws", "<W>")]);

        let normalized = normalizer.normalize("/ws/José/中文/файл.rs");

        assert_eq!(normalized, "<W>/José/中文/файл.rs");
    }

    #[test]
    fn from_env_construction_is_deterministic() {
        // Building a normalizer twice from the same environment must
        // yield the same rules in the same order; cross-machine cache
        // key consistency depends on it. (A flake here means
        // `from_env` is reading some non-deterministic state.)
        let root = TempDir::new().unwrap();
        let first = PathNormalizer::from_env(Some(root.path()));
        let second = PathNormalizer::from_env(Some(root.path()));

        // Compare borrowed (prefix, sentinel) views of each rule list.
        let first_rules: Vec<(&str, &str)> = first
            .rules
            .iter()
            .map(|rule| (rule.prefix.as_str(), rule.sentinel))
            .collect();
        let second_rules: Vec<(&str, &str)> = second
            .rules
            .iter()
            .map(|rule| (rule.prefix.as_str(), rule.sentinel))
            .collect();

        assert_eq!(first_rules, second_rules);
    }
}