// kache 0.6.0-rc.1
//
// Zero-copy, content-addressed Rust build cache. No copies, no wasted disk — just hardlinks locally and S3 for sharing.

//! Compiler-agnostic flag classification.
//!
//! Each compiler (cc, future swift, future MSVC, …) declares a static
//! table of [`FlagSpec`] entries; the shared [`classify_against`] helper
//! resolves any argument to one of the [`FlagClass`] categories. The
//! result tells downstream code (refusal logic, cache-key composition,
//! probe-availability gating) **why** a given argument is safe to cache
//! past — not just whether it is.
//!
//! # Why a table, not a procedural classifier
//!
//! - **Auditable.** "What flags does kache support?" / "Did this PR
//!   add X correctly?" / "Are cc and clang-cl consistent?" are all
//!   answerable by reading or diffing the table. A procedural
//!   classifier scattered across `if`/`match` arms forces every such
//!   question to be answered by code reading.
//! - **Extensible.** Adding a flag is a single row, justified by its
//!   `source` field. A new compiler is its own table, same vocabulary
//!   — no wrapper / cache-key churn.
//! - **User-configurable.** A future config-layer (kunobi-ninja/kache#95)
//!   reads user-defined rows and overlays them onto the shipped table
//!   with explicit precedence rules.
//! - **Documentable.** `kache list-flags --class captured-by-probe`
//!   (future) is a one-line projection over the static data.
//!
//! # Matcher set
//!
//! Three variants, no carve-outs. Anything weirder than `Exact` /
//! `Prefix` lives in a [`Matcher::Regex`] row that justifies its
//! pattern in the row's `source`.
//!
//! # Regex safety
//!
//! [`Matcher::Regex`] uses the [`regex`] crate's default engine —
//! linear-time RE2-style, no backreferences, no lookarounds, no
//! catastrophic backtracking. Patterns are auto-anchored as
//! `^(?:...)$`, so row authors cannot accidentally match a substring,
//! even when the pattern contains a top-level alternation. All regex
//! patterns in every shipped table are compiled at CI time by
//! [`assert_table_regexes_compile`], so a malformed pattern is caught
//! before it can panic in production.

use regex::Regex;
use std::collections::HashMap;

/// The *dialect* of flag spellings a compiler driver speaks.
///
/// gcc and clang (default driver) share the `Gnu` dialect; clang in
/// MSVC driver mode (`clang-cl`) speaks `Cl` (`/`-spellings, and
/// gcc-looking spellings like `-MD` that mean something different).
/// Table rows tagged with a dialect apply only to that dialect; rows
/// with no tag apply to all.
///
/// The dialect also decides what [`classify_against`] treats as a
/// flag candidate: a leading `-` always qualifies, a leading `/` only
/// under [`Dialect::Cl`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Dialect {
    /// gcc-style spellings; spoken by gcc and by clang's default driver.
    Gnu,
    /// MSVC-style spellings; spoken by clang in MSVC driver mode
    /// (`clang-cl`).
    Cl,
}

/// How kache treats one compiler argument for caching purposes.
///
/// The classification tells the orchestration code *why* the
/// argument is safe to cache past, not merely that it is. Each
/// variant carries its own safety contract, documented on the
/// variant itself.
///
/// An argument that matches no table row has no `FlagClass` at all:
/// [`classify_against`] returns `None` for it and the caller is
/// expected to refuse to cache.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FlagClass {
    /// The argument's effect is captured by a typed field the
    /// per-compiler parser extracts (optimization, debug level,
    /// `-std=`, PIC, target arch). The cache-key recipe for that
    /// compiler hashes those fields directly — same flag → same
    /// field value → same key.
    ModeledInKey,

    /// The parser handles the argument structurally, but the value is
    /// not part of the object-content cache key. Examples: mode
    /// markers like `-c` / `-E` / `-S` that route the invocation to a
    /// non-cacheable compiler mode before key computation.
    ParserHandled,

    /// The argument's effect is captured by a probe that asks the
    /// compiler to print its resolved invocation (`cc -###`,
    /// `swiftc -print-target-info`, `cl /Bv`, …). The resolved
    /// token stream is hashed into the cache key, so different
    /// arguments produce different keys without kache modeling
    /// each flag individually.
    ///
    /// **Safety contract**: holds only when the probe resolves on
    /// the host compiler. If the probe returns `None` (e.g. gcc's
    /// `-###` parsing currently incomplete), arguments classified
    /// here MUST refuse to cache — otherwise the key falls back to
    /// modeled-only and silently under-keys.
    CapturedByProbe,

    /// The argument changes *what the compiler sees*, not *how it
    /// compiles*. The preprocessor expansion (or equivalent first
    /// pass) is hashed into the cache key, so the effect lives in
    /// the input bytes the key already covers. Examples: `-DFOO=1`,
    /// `-Iinclude`, `-include header.h`.
    PreprocessorCaptured,

    /// The argument has no effect on the resulting object bytes —
    /// only on diagnostics, the dep-info sidecar, build mechanics,
    /// or color settings. Safe to ignore for keying.
    ///
    /// This is also the class [`classify_against`] assigns to
    /// positional (non-flag) arguments.
    NoObjectEffect,
}

/// How a [`FlagSpec`] matches an argument.
///
/// Three variants. `Exact` for one literal string. `Prefix` for an
/// open-ended family with a shared head (`-D`, `-I`, …). `Regex` for
/// everything else; auto-anchored so the pattern matches the entire
/// argument or nothing.
///
/// All payloads are `&'static str`, so tables of matchers can live in
/// `static` data.
#[derive(Debug, Clone, Copy)]
pub enum Matcher {
    /// Single literal string. `arg == s`.
    Exact(&'static str),

    /// `arg.starts_with(s)`. Note that the bare prefix itself (e.g.
    /// `-D` with nothing after it) also matches.
    Prefix(&'static str),

    /// Anchored, linear-time regex. The matcher implicitly wraps the
    /// pattern as `^(?:pattern)$` so callers can't author a partial-
    /// match regex by accident, and a top-level alternation
    /// (`-foo|-bar`) inside the row pattern doesn't escape the anchors
    /// via operator-precedence quirks. Each `Regex(...)` row must
    /// justify in its owning [`FlagSpec::source`] why a simpler
    /// variant doesn't suffice.
    ///
    /// The raw (un-anchored) pattern string stored here is also the
    /// key under which [`build_regex_cache`] files the compiled regex
    /// and under which [`classify_against`] looks it up.
    Regex(&'static str),
}

/// One row of a compiler's flag classification table.
///
/// Rows are tried in table order by [`classify_against`] and the
/// first matching row decides the class, so row order matters
/// whenever two rows could match the same argument.
///
/// `source` is mandatory and serves two purposes:
/// - **Audit trail**: every row points at an issue / PR / spec that
///   introduced it. `grep` on the table tells you when each flag
///   was added and why.
/// - **Rationale**: for [`Matcher::Regex`] rows, the source string
///   also explains *why* the pattern can't be expressed with `Exact`
///   or `Prefix` — review checks this.
#[derive(Debug, Clone, Copy)]
pub struct FlagSpec {
    /// How an argument is tested against this row.
    pub matcher: Matcher,
    /// Classification returned when the row matches.
    pub class: FlagClass,
    /// Provenance / rationale string; also quoted in the panic
    /// messages raised for an invalid or uncached regex row.
    pub source: &'static str,
    /// Dialect this row applies to. `None` = any dialect.
    pub dialect: Option<Dialect>,
}

/// Compile every [`Matcher::Regex`] row of `table` and index the
/// compiled regexes by the row's raw, un-anchored pattern string.
///
/// Each pattern is compiled as `^(?:pattern)$`. The non-capturing
/// group matters: without it a top-level alternation such as
/// `-O[0-3sz]?|-Og` would anchor as `(^-O[0-3sz]?) | (-Og$)` and let
/// `-Ofast` through via the first alternative. With the group, both
/// alternatives are fully anchored.
///
/// Rows using `Exact` or `Prefix` contribute nothing to the map. The
/// map is pre-sized to `table.len()`, an upper bound on the number of
/// regex rows.
///
/// Intended to be built once per table and then shared (the tests in
/// this file keep it behind a `OnceLock`).
///
/// # Panics
///
/// Panics if any pattern fails to compile; the message names the
/// offending pattern and the row's `source`. The
/// [`assert_table_regexes_compile`] test helper exists so that this
/// is caught in CI rather than in production.
pub fn build_regex_cache(table: &'static [FlagSpec]) -> HashMap<&'static str, Regex> {
    let mut compiled = HashMap::with_capacity(table.len());

    // Only regex rows need compiling; pair each pattern with its row
    // so a failure can still be attributed to the row's source.
    let regex_rows = table.iter().filter_map(|spec| match spec.matcher {
        Matcher::Regex(pat) => Some((pat, spec)),
        Matcher::Exact(_) | Matcher::Prefix(_) => None,
    });

    for (pat, spec) in regex_rows {
        // Group first, then anchor — see the doc comment above.
        let full_pattern = format!("^(?:{pat})$");
        match Regex::new(&full_pattern) {
            Ok(re) => {
                compiled.insert(pat, re);
            }
            Err(e) => panic!(
                "compiler/flags: invalid regex `{pat}` from {}: {e}",
                spec.source
            ),
        }
    }

    compiled
}

/// Resolve `arg` to a [`FlagClass`] using `table`, or return `None`
/// when no row matches — which the caller treats as "unsupported
/// flag, refuse to cache".
///
/// # Flag candidates vs. positionals
///
/// An argument is a flag candidate when it starts with `-`, or —
/// only in the MSVC dialect — with `/`. Windows sources are never
/// `/`-rooted, so a leading `/` under `Cl` is always a flag
/// (unmatched ⇒ refuse). Everything else is a positional (source /
/// output / separated-flag value) and is reported as
/// [`FlagClass::NoObjectEffect`] without consulting the table.
///
/// # Row selection
///
/// Rows are tried in table order and the first match wins. Rows
/// tagged with a dialect are considered only when it equals
/// `dialect`; untagged rows are considered for every dialect.
///
/// # Panics
///
/// Panics if a [`Matcher::Regex`] row is reached whose pattern has no
/// entry in `regex_cache`. That means the cache was not built from
/// this table — a programming error, not a flag to accept silently.
pub fn classify_against(
    arg: &str,
    table: &'static [FlagSpec],
    regex_cache: &HashMap<&'static str, Regex>,
    dialect: Dialect,
) -> Option<FlagClass> {
    // `/x` only counts as a flag spelling under the MSVC dialect.
    let slash_is_flag = dialect == Dialect::Cl;
    let looks_like_flag = arg.starts_with('-') || (slash_is_flag && arg.starts_with('/'));
    if !looks_like_flag {
        // Positional argument: inert for keying purposes.
        return Some(FlagClass::NoObjectEffect);
    }

    table
        .iter()
        // Drop rows pinned to a different dialect; untagged rows stay.
        .filter(|row| match row.dialect {
            Some(row_dialect) => row_dialect == dialect,
            None => true,
        })
        // First row whose matcher accepts the argument decides.
        .find(|row| match row.matcher {
            Matcher::Exact(literal) => arg == literal,
            Matcher::Prefix(head) => arg.starts_with(head),
            Matcher::Regex(pat) => match regex_cache.get(pat) {
                Some(re) => re.is_match(arg),
                // The cache must come from `build_regex_cache` over
                // this same table; a miss is a bug, so fail loudly.
                None => panic!(
                    "compiler/flags: regex `{pat}` ({}) not in cache",
                    row.source
                ),
            },
        })
        .map(|row| row.class)
}

/// CI helper: validate that every [`Matcher::Regex`] pattern in
/// `table` compiles. Run from each compiler's unit-test module so
/// shipped tables can't carry malformed regexes.
///
/// The check delegates to [`build_regex_cache`] instead of repeating
/// its anchoring logic. That way the patterns are validated in
/// exactly the `^(?:pattern)$` form production compiles, and the two
/// code paths cannot drift apart.
///
/// # Panics
///
/// Panics (failing the calling test) if any pattern is malformed; the
/// message names the pattern and the row's `source`.
#[cfg(test)]
#[doc(hidden)]
pub fn assert_table_regexes_compile(table: &'static [FlagSpec]) {
    // Building the cache compiles every regex row; the compiled map
    // itself is not needed here, only the absence of a panic.
    drop(build_regex_cache(table));
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::OnceLock;

    /// Shared fixture covering every matcher kind: two `Exact` rows,
    /// one `Prefix` row and one `Regex` row, all dialect-agnostic.
    static TEST_TABLE: &[FlagSpec] = &[
        FlagSpec {
            matcher: Matcher::Exact("-fPIC"),
            class: FlagClass::ModeledInKey,
            source: "tests",
            dialect: None,
        },
        FlagSpec {
            matcher: Matcher::Prefix("-D"),
            class: FlagClass::PreprocessorCaptured,
            source: "tests",
            dialect: None,
        },
        FlagSpec {
            matcher: Matcher::Exact("-E"),
            class: FlagClass::ParserHandled,
            source: "tests",
            dialect: None,
        },
        FlagSpec {
            matcher: Matcher::Regex(r"-W[^,]*"),
            class: FlagClass::NoObjectEffect,
            source: "tests — warnings; excludes `-Wl,*`/`-Wa,*`/`-Wp,*` passthrough forms",
            dialect: None,
        },
    ];

    /// Regex cache for [`TEST_TABLE`], built on first use and shared
    /// by every test through a `OnceLock`.
    fn cache() -> &'static HashMap<&'static str, Regex> {
        static CACHE: OnceLock<HashMap<&'static str, Regex>> = OnceLock::new();
        CACHE.get_or_init(|| build_regex_cache(TEST_TABLE))
    }

    /// Arguments without a flag-like leading character are positionals
    /// and classify as `NoObjectEffect` in both dialects.
    #[test]
    fn positional_arg_classifies_as_no_effect() {
        // Source files, output paths, values consumed by separate-
        // argument flags — never flags, always safe.
        assert_eq!(
            classify_against("foo.c", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
        assert_eq!(
            classify_against("include", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
        // Under Cl the positional contract still holds for non-`/` tokens
        // (only a leading `/` becomes a flag candidate there).
        assert_eq!(
            classify_against("foo.c", TEST_TABLE, cache(), Dialect::Cl),
            Some(FlagClass::NoObjectEffect)
        );
        assert_eq!(
            classify_against("include", TEST_TABLE, cache(), Dialect::Cl),
            Some(FlagClass::NoObjectEffect)
        );
    }

    /// `Exact` rows match the literal only: neither a longer spelling
    /// nor a truncated one is accepted.
    #[test]
    fn exact_matcher_matches_only_the_literal() {
        assert_eq!(
            classify_against("-fPIC", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::ModeledInKey)
        );
        assert_eq!(
            classify_against("-fPIC=1", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("-fPI", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
    }

    /// The class returned is whatever the matching row declares —
    /// here `ParserHandled` for the `-E` row.
    #[test]
    fn exact_matcher_can_return_parser_handled() {
        assert_eq!(
            classify_against("-E", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::ParserHandled)
        );
    }

    /// `Prefix` rows match the whole family, including the bare
    /// prefix itself, and are case-sensitive.
    #[test]
    fn prefix_matcher_matches_the_family() {
        assert_eq!(
            classify_against("-DFOO=1", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::PreprocessorCaptured)
        );
        assert_eq!(
            classify_against("-D", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::PreprocessorCaptured)
        );
        assert_eq!(
            classify_against("-d", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
    }

    /// `Regex` rows must match the entire argument, so the `[^,]*`
    /// tail rejects any comma-carrying passthrough form.
    #[test]
    fn regex_matcher_is_anchored_and_excludes_by_pattern() {
        // -W* warnings should match; -Wl,*  / -Wa,* / -Wp,* must NOT
        // (they're passthrough forms with different semantics).
        assert_eq!(
            classify_against("-Wall", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
        assert_eq!(
            classify_against("-Wno-unused", TEST_TABLE, cache(), Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
        assert_eq!(
            classify_against("-Wl,-no_pie", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("-Wa,--64", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("-Wp,-MD,foo.d", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
    }

    /// Flag-shaped arguments that no row matches yield `None`.
    #[test]
    fn unknown_flag_classifies_as_none() {
        // An argument no row matches → caller refuses. This is the
        // safety property: the table is an allow-list of *known*
        // classifications.
        assert_eq!(
            classify_against("-fmadeup", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("--unknown", TEST_TABLE, cache(), Dialect::Gnu),
            None
        );
    }

    /// The CI validator must not panic on a table whose regexes are
    /// all well-formed.
    #[test]
    fn ci_validator_accepts_valid_table() {
        assert_table_regexes_compile(TEST_TABLE);
    }

    /// Sanity check on the derived `PartialEq` for `Dialect`.
    #[test]
    fn dialect_values_are_distinct() {
        assert_ne!(Dialect::Gnu, Dialect::Cl);
        let d = Dialect::Cl;
        assert_eq!(d, d);
    }

    /// Dialect-tagged rows apply only to their dialect, untagged rows
    /// apply to both, and a leading `/` is a flag candidate only
    /// under `Cl`.
    #[test]
    fn classify_against_filters_by_dialect_and_slash_flags() {
        let table: &'static [FlagSpec] = &[
            FlagSpec {
                matcher: Matcher::Exact("-MD"),
                class: FlagClass::NoObjectEffect,
                source: "test gnu-only",
                dialect: Some(Dialect::Gnu),
            },
            FlagSpec {
                matcher: Matcher::Exact("-c"),
                class: FlagClass::ParserHandled,
                source: "test any",
                dialect: None,
            },
        ];
        // No regex rows here, so the cache is empty; it is built only
        // to satisfy the `classify_against` signature.
        let cache = build_regex_cache(table);

        // Gnu-tagged row: visible under Gnu, invisible under Cl.
        assert_eq!(
            classify_against("-MD", table, &cache, Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
        assert_eq!(classify_against("-MD", table, &cache, Dialect::Cl), None);
        // Untagged row: visible under both dialects.
        assert_eq!(
            classify_against("-c", table, &cache, Dialect::Gnu),
            Some(FlagClass::ParserHandled)
        );
        assert_eq!(
            classify_against("-c", table, &cache, Dialect::Cl),
            Some(FlagClass::ParserHandled)
        );
        // under Cl, an unmatched leading-`/` token refuses (fail closed)
        assert_eq!(
            classify_against("/guard:cf", table, &cache, Dialect::Cl),
            None
        );
        // under Gnu, `/foo` is a positional → inert
        assert_eq!(
            classify_against("/foo", table, &cache, Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
    }

    /// Anchoring must survive top-level alternation in a row's regex.
    /// Without the `(?:...)` wrap in `build_regex_cache`, a pattern
    /// like `-foo|-bar` would parse as `(^-foo)|(-bar$)`: partial
    /// matches succeed, and an unrelated argument like `-foozilla`
    /// gets classified silently. This used to bite the cc table's
    /// `-O[0-3sz]?|-Og` row.
    #[test]
    fn anchoring_survives_top_level_alternation_in_pattern() {
        static ALT_TABLE: &[FlagSpec] = &[FlagSpec {
            matcher: Matcher::Regex(r"-foo|-bar"),
            class: FlagClass::NoObjectEffect,
            source: "tests — alternation anchoring",
            dialect: None,
        }];
        let cache = build_regex_cache(ALT_TABLE);

        // The legitimate matches.
        assert_eq!(
            classify_against("-foo", ALT_TABLE, &cache, Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );
        assert_eq!(
            classify_against("-bar", ALT_TABLE, &cache, Dialect::Gnu),
            Some(FlagClass::NoObjectEffect)
        );

        // The traps a naive `^pat$` would let through. All start with
        // `-` so the positional early-return doesn't short-circuit
        // the actual regex check — we are genuinely exercising
        // the matcher.
        assert_eq!(
            classify_against("-foobar", ALT_TABLE, &cache, Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("-foozilla", ALT_TABLE, &cache, Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("-x-bar", ALT_TABLE, &cache, Dialect::Gnu),
            None
        );
        assert_eq!(
            classify_against("--bar", ALT_TABLE, &cache, Dialect::Gnu),
            None
        );
    }
}