Skip to main content

dodot_lib/preprocessing/
divergence.rs

//! Drift detection for preprocessor outputs (the 4-state matrix).
//!
//! Walks the per-pack baseline cache and compares each cached record
//! against the current source file (in the pack) and the current
//! deployed file (in the datastore). Classifies each pair into one of
//! the four states defined in `docs/proposals/preprocessing-pipeline.lex`
//! §6.1:
//!
//! | source | deployed | state           |
//! |--------|----------|-----------------|
//! | same   | same     | `Synced`        |
//! | new    | same     | `InputChanged`  |
//! | same   | edited   | `OutputChanged` |
//! | new    | edited   | `BothChanged`   |
//!
//! Plus two special states for missing files: a baseline whose source
//! has been deleted (`MissingSource`) or whose deployed artifact is
//! gone (`MissingDeployed`).
//!
//! This module is **read-only**. It produces a [`DivergenceReport`] per
//! cached baseline; the action layer (`commands::transform::check`)
//! decides what to do with each report (apply a reverse-merge diff,
//! emit a conflict block, etc.).
24
25use std::path::PathBuf;
26
27use serde::Serialize;
28
29use crate::fs::Fs;
30use crate::paths::Pather;
31use crate::preprocessing::baseline::{hex_sha256, Baseline};
32use crate::Result;
33
34/// Where a single processed file sits in the 4-state matrix.
35#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
36pub enum DivergenceState {
37    /// Source unchanged, deployed file matches the cached render.
38    /// Nothing to do.
39    Synced,
40    /// Source has changed since the cached render, but the deployed
41    /// file is still the cached render. The next `dodot up` will
42    /// re-render — no action from `transform check`.
43    InputChanged,
44    /// Source unchanged, deployed file edited by the user. The
45    /// reverse-merge engine should propagate the edit back to the
46    /// source.
47    OutputChanged,
48    /// Both the source and the deployed file have changed since the
49    /// last `dodot up`. The reverse-merge engine still tries to
50    /// produce a diff, but the result is more likely to require a
51    /// conflict block.
52    BothChanged,
53    /// The cached source path no longer exists on disk. The pack file
54    /// was renamed or removed; the baseline is stale and should be
55    /// dropped on the next `up`.
56    MissingSource,
57    /// The cached deployed path is gone. The user (or some external
58    /// tool) deleted the rendered file. Unusual but worth surfacing.
59    MissingDeployed,
60}
61
62/// One row in `dodot transform check`'s report.
63#[derive(Debug, Clone, Serialize)]
64pub struct DivergenceReport {
65    pub pack: String,
66    pub handler: String,
67    /// Filename inside the cache (e.g. `"config.toml"`). Same as the
68    /// stripped virtual entry the preprocessor produced.
69    pub filename: String,
70    /// Absolute path of the source file in the pack.
71    pub source_path: PathBuf,
72    /// Absolute path of the deployed (rendered) file in the datastore.
73    pub deployed_path: PathBuf,
74    /// The classified state.
75    pub state: DivergenceState,
76}
77
78/// Walk the per-pack baseline cache directory and load every record.
79///
80/// Returns `(pack, handler, filename, baseline)` tuples. The cache
81/// layout is `<cache_dir>/preprocessor/<pack>/<handler>/<filename>.json`,
82/// so this function is a 3-level read_dir walk. Missing or unreadable
83/// subdirectories are skipped silently — the cache is rederivable, and
84/// we never want a transient permission glitch to crash a check run.
85pub fn collect_baselines(
86    fs: &dyn Fs,
87    paths: &dyn Pather,
88) -> Result<Vec<(String, String, String, Baseline)>> {
89    let root = paths.cache_dir().join("preprocessor");
90    if !fs.is_dir(&root) {
91        return Ok(Vec::new());
92    }
93
94    let mut out = Vec::new();
95    let mut packs = match fs.read_dir(&root) {
96        Ok(v) => v,
97        Err(_) => return Ok(Vec::new()),
98    };
99    packs.sort_by(|a, b| a.name.cmp(&b.name));
100
101    for pack in packs {
102        if !pack.is_dir {
103            continue;
104        }
105        let mut handlers = match fs.read_dir(&pack.path) {
106            Ok(v) => v,
107            Err(_) => continue,
108        };
109        handlers.sort_by(|a, b| a.name.cmp(&b.name));
110
111        for handler in handlers {
112            if !handler.is_dir {
113                continue;
114            }
115            let mut files = match fs.read_dir(&handler.path) {
116                Ok(v) => v,
117                Err(_) => continue,
118            };
119            files.sort_by(|a, b| a.name.cmp(&b.name));
120
121            for file in files {
122                if !file.is_file {
123                    continue;
124                }
125                // Filenames in the cache are `<logical>.json` for
126                // baselines and `<logical>.secret.json` for sidecars
127                // (`secrets.lex` §3.3). Skip sidecars so they don't
128                // get fed into Baseline::load — they have a separate
129                // schema. Their content is loaded on demand by
130                // SecretsSidecar::load(...) keyed off the baseline's
131                // logical name.
132                if file.name.ends_with(".secret.json") {
133                    continue;
134                }
135                let Some(filename) = file.name.strip_suffix(".json").map(str::to_string) else {
136                    continue;
137                };
138                match Baseline::load(fs, paths, &pack.name, &handler.name, &filename) {
139                    Ok(Some(baseline)) => {
140                        out.push((pack.name.clone(), handler.name.clone(), filename, baseline));
141                    }
142                    // A corrupt baseline gets surfaced as an error
143                    // here so the user knows to clear it; better than
144                    // silently dropping it from the report.
145                    Ok(None) => {} // unreachable when fs.is_file is true, but tolerate
146                    Err(e) => return Err(e),
147                }
148            }
149        }
150    }
151
152    Ok(out)
153}
154
155/// Classify a single baseline against the current state on disk.
156///
157/// The deployed-file path is derived from the datastore layout: a
158/// preprocessor-expanded file lives at
159/// `<data_dir>/packs/<pack>/<handler>/<filename>`. The user's
160/// home-side symlink dereferences to this path, so reading the bytes
161/// here is the same as reading what the user sees — the double-link
162/// model means the deployed file *is* the file in the datastore.
163pub fn classify_one(
164    fs: &dyn Fs,
165    paths: &dyn Pather,
166    pack: &str,
167    handler: &str,
168    filename: &str,
169    baseline: &Baseline,
170) -> DivergenceReport {
171    let source_path = baseline.source_path.clone();
172    let deployed_path = paths
173        .data_dir()
174        .join("packs")
175        .join(pack)
176        .join(handler)
177        .join(filename);
178
179    let source_exists = !source_path.as_os_str().is_empty() && fs.exists(&source_path);
180    let deployed_exists = fs.exists(&deployed_path);
181
182    let state = if !source_exists {
183        DivergenceState::MissingSource
184    } else if !deployed_exists {
185        DivergenceState::MissingDeployed
186    } else {
187        // Best-effort reads: if either side is unreadable mid-walk
188        // (rare; e.g. a permissions hiccup), we fall back to "Synced"
189        // rather than crashing the report. The caller can re-run.
190        let source_changed = match fs.read_file(&source_path) {
191            Ok(bytes) => hex_sha256(&bytes) != baseline.source_hash,
192            Err(_) => false,
193        };
194        let deployed_changed = match fs.read_file(&deployed_path) {
195            Ok(bytes) => hex_sha256(&bytes) != baseline.rendered_hash,
196            Err(_) => false,
197        };
198        match (source_changed, deployed_changed) {
199            (false, false) => DivergenceState::Synced,
200            (true, false) => DivergenceState::InputChanged,
201            (false, true) => DivergenceState::OutputChanged,
202            (true, true) => DivergenceState::BothChanged,
203        }
204    };
205
206    DivergenceReport {
207        pack: pack.to_string(),
208        handler: handler.to_string(),
209        filename: filename.to_string(),
210        source_path,
211        deployed_path,
212        state,
213    }
214}
215
216/// Walk every cached baseline and produce a divergence report.
217///
218/// The report is sorted by `(pack, handler, filename)` so consumers can
219/// rely on a stable display order without a second sort.
220pub fn collect_divergences(fs: &dyn Fs, paths: &dyn Pather) -> Result<Vec<DivergenceReport>> {
221    let baselines = collect_baselines(fs, paths)?;
222    let reports: Vec<DivergenceReport> = baselines
223        .iter()
224        .map(|(p, h, f, b)| classify_one(fs, paths, p, h, f, b))
225        .collect();
226    Ok(reports)
227}
228
229/// Look up the baseline whose `source_path` matches `target`, plus
230/// the `(pack, handler, filename)` triple that identifies it in the
231/// cache layout.
232///
233/// Used by the clean filter (R6): git invokes the filter with the
234/// source path of the file being processed, and the filter needs the
235/// matching baseline to find the deployed bytes and the cached
236/// tracked render. The lookup is a linear scan of the cache — fast
237/// enough for the realistic per-repo template count (tens to low
238/// hundreds), and avoids the on-disk index file the cache layout
239/// would otherwise need.
240///
241/// Returns `Ok(None)` when no baseline matches; the clean filter
242/// treats that as "echo stdin unchanged" rather than an error.
243pub fn find_baseline_for_source(
244    fs: &dyn Fs,
245    paths: &dyn Pather,
246    target: &std::path::Path,
247) -> Result<Option<(String, String, String, Baseline)>> {
248    for (pack, handler, filename, baseline) in collect_baselines(fs, paths)? {
249        if baseline.source_path == target {
250            return Ok(Some((pack, handler, filename, baseline)));
251        }
252    }
253    Ok(None)
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testing::TempEnvironment;

    /// Handler directory every fixture in this module is staged under.
    const HANDLER: &str = "preprocessed";

    /// Write a template source file at `<dotfiles_root>/<pack>/<name>`.
    fn write_pack_template(env: &TempEnvironment, pack: &str, name: &str, body: &str) {
        let path = env.dotfiles_root.join(pack).join(name);
        env.fs.mkdir_all(path.parent().unwrap()).unwrap();
        env.fs.write_file(&path, body.as_bytes()).unwrap();
    }

    /// Write a rendered file at `<data_dir>/packs/<pack>/<handler>/<name>`.
    fn write_deployed(env: &TempEnvironment, pack: &str, handler: &str, name: &str, body: &str) {
        let path = env
            .paths
            .data_dir()
            .join("packs")
            .join(pack)
            .join(handler)
            .join(name);
        env.fs.mkdir_all(path.parent().unwrap()).unwrap();
        env.fs.write_file(&path, body.as_bytes()).unwrap();
    }

    /// Build a baseline recording `rendered` / `source` for `source_path`.
    fn baseline_for(source_path: &std::path::Path, rendered: &[u8], source: &[u8]) -> Baseline {
        Baseline::build(source_path, rendered, source, Some(""), None)
    }

    /// Persist `baseline` into the cache under `<pack>/preprocessed/<filename>`.
    fn stage_baseline(env: &TempEnvironment, pack: &str, filename: &str, baseline: Baseline) {
        baseline
            .write(env.fs.as_ref(), env.paths.as_ref(), pack, HANDLER, filename)
            .unwrap();
    }

    /// Stage the canonical `app/config.toml` fixture.
    ///
    /// The baseline always records source `"src"` and render `"rendered"`
    /// for `app/config.toml.tmpl`. `source_body` / `deployed_body` are the
    /// bytes actually written to disk; `None` leaves that file absent.
    fn stage_app_config(
        env: &TempEnvironment,
        source_body: Option<&str>,
        deployed_body: Option<&str>,
    ) {
        if let Some(body) = source_body {
            write_pack_template(env, "app", "config.toml.tmpl", body);
        }
        if let Some(body) = deployed_body {
            write_deployed(env, "app", HANDLER, "config.toml", body);
        }
        let src_path = env.dotfiles_root.join("app/config.toml.tmpl");
        stage_baseline(
            env,
            "app",
            "config.toml",
            baseline_for(&src_path, b"rendered", b"src"),
        );
    }

    /// Run the full divergence walk against `env`.
    fn divergences(env: &TempEnvironment) -> Vec<DivergenceReport> {
        collect_divergences(env.fs.as_ref(), env.paths.as_ref()).unwrap()
    }

    /// Assert the walk produced exactly one report and return its state.
    ///
    /// Checking the length first turns an unexpectedly empty report into
    /// a readable assertion failure instead of an index panic.
    fn sole_state(env: &TempEnvironment) -> DivergenceState {
        let reports = divergences(env);
        assert_eq!(reports.len(), 1, "expected exactly one divergence report");
        reports[0].state
    }

    #[test]
    fn empty_cache_yields_empty_report() {
        let env = TempEnvironment::builder().build();
        assert!(divergences(&env).is_empty());
    }

    #[test]
    fn collect_baselines_skips_secret_sidecars() {
        // The Phase S2 sidecar (`<filename>.secret.json`) lives in
        // the same handler dir as baselines and shares the `.json`
        // suffix. Pin that the collector skips it instead of trying
        // to parse it as a baseline (which fails with a confusing
        // "missing field rendered_hash" error).
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, Some("src"), Some("rendered"));
        // Drop a sidecar next to it.
        let sidecar = crate::preprocessing::baseline::SecretsSidecar::new(vec![
            crate::preprocessing::SecretLineRange {
                start: 0,
                end: 1,
                reference: "pass:k".into(),
            },
        ]);
        sidecar
            .write(
                env.fs.as_ref(),
                env.paths.as_ref(),
                "app",
                HANDLER,
                "config.toml",
            )
            .unwrap();

        let baselines = collect_baselines(env.fs.as_ref(), env.paths.as_ref()).unwrap();
        // Exactly one entry — the baseline. The sidecar is skipped.
        assert_eq!(baselines.len(), 1);
        assert_eq!(baselines[0].2, "config.toml");
    }

    #[test]
    fn synced_state_when_nothing_changed() {
        // Baseline + source bytes + deployed bytes all match.
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, Some("src"), Some("rendered"));
        assert_eq!(sole_state(&env), DivergenceState::Synced);
    }

    #[test]
    fn input_changed_when_source_edited() {
        // Source bytes diverge from baseline; deployed bytes still
        // match. The next `up` will re-render — `transform check`
        // takes no action here.
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, Some("src EDITED"), Some("rendered"));
        assert_eq!(sole_state(&env), DivergenceState::InputChanged);
    }

    #[test]
    fn output_changed_when_deployed_edited() {
        // The auto-merge happy path: only the deployed file moved.
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, Some("src"), Some("rendered EDIT"));
        assert_eq!(sole_state(&env), DivergenceState::OutputChanged);
    }

    #[test]
    fn both_changed_when_both_edited() {
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, Some("src EDIT"), Some("rendered EDIT"));
        assert_eq!(sole_state(&env), DivergenceState::BothChanged);
    }

    #[test]
    fn missing_source_when_pack_file_deleted() {
        // Baseline points at a source path that's been removed (e.g.
        // the user renamed or deleted the template). Surfaced as a
        // distinct state so callers can offer to drop the stale
        // baseline.
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, None, Some("rendered"));
        assert_eq!(sole_state(&env), DivergenceState::MissingSource);
    }

    #[test]
    fn missing_deployed_when_datastore_file_gone() {
        // Deliberately do NOT write the deployed file.
        let env = TempEnvironment::builder().build();
        stage_app_config(&env, Some("src"), None);
        assert_eq!(sole_state(&env), DivergenceState::MissingDeployed);
    }

    #[test]
    fn report_is_sorted_by_pack_handler_filename() {
        // Two packs, three files, registered in non-sorted order. The
        // walker must surface them in (pack, handler, filename) order
        // so display layers don't need a second sort.
        let env = TempEnvironment::builder().build();
        for (pack, name, body) in [
            ("zebra", "z.toml.tmpl", "z-src"),
            ("alpha", "b.toml.tmpl", "b-src"),
            ("alpha", "a.toml.tmpl", "a-src"),
        ] {
            write_pack_template(&env, pack, name, body);
            let cache_name = name.strip_suffix(".tmpl").unwrap();
            write_deployed(&env, pack, HANDLER, cache_name, "rendered");
            let src_path = env.dotfiles_root.join(pack).join(name);
            stage_baseline(
                &env,
                pack,
                cache_name,
                baseline_for(&src_path, b"rendered", body.as_bytes()),
            );
        }

        let order: Vec<_> = divergences(&env)
            .iter()
            .map(|r| (r.pack.clone(), r.filename.clone()))
            .collect();
        assert_eq!(
            order,
            vec![
                ("alpha".into(), "a.toml".into()),
                ("alpha".into(), "b.toml".into()),
                ("zebra".into(), "z.toml".into()),
            ]
        );
    }

    #[test]
    fn baseline_with_empty_source_path_is_classified_missing_source() {
        // Forward-compat with v1 baselines written before source_path
        // existed: serde-default fills in an empty PathBuf, and the
        // classifier reports MissingSource so the user sees the issue
        // and re-runs `dodot up` to rebuild the cache.
        let env = TempEnvironment::builder().build();
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered");
        stage_baseline(
            &env,
            "app",
            "config.toml",
            baseline_for(std::path::Path::new(""), b"rendered", b"src"),
        );
        assert_eq!(sole_state(&env), DivergenceState::MissingSource);
    }

    // ── find_baseline_for_source ────────────────────────────────

    #[test]
    fn find_baseline_for_source_returns_match() {
        // Stage two baselines with distinct source paths; the lookup
        // must return only the one whose `source_path` matches.
        let env = TempEnvironment::builder().build();
        let src_a = env.dotfiles_root.join("app/a.toml.tmpl");
        write_pack_template(&env, "app", "a.toml.tmpl", "src-a");
        write_deployed(&env, "app", HANDLER, "a.toml", "rendered-a");
        stage_baseline(
            &env,
            "app",
            "a.toml",
            baseline_for(&src_a, b"rendered-a", b"src-a"),
        );

        let src_b = env.dotfiles_root.join("app/b.toml.tmpl");
        write_pack_template(&env, "app", "b.toml.tmpl", "src-b");
        write_deployed(&env, "app", HANDLER, "b.toml", "rendered-b");
        stage_baseline(
            &env,
            "app",
            "b.toml",
            baseline_for(&src_b, b"rendered-b", b"src-b"),
        );

        let hit = find_baseline_for_source(env.fs.as_ref(), env.paths.as_ref(), &src_a).unwrap();
        let (pack, handler, filename, baseline) = hit.expect("baseline must be found");
        assert_eq!(pack, "app");
        assert_eq!(handler, "preprocessed");
        assert_eq!(filename, "a.toml");
        assert_eq!(baseline.source_path, src_a);
        assert_eq!(baseline.rendered_content, "rendered-a");
    }

    #[test]
    fn find_baseline_for_source_returns_none_when_unknown() {
        // Path the cache has never seen → Ok(None). The clean
        // filter treats this as "echo stdin unchanged", so the
        // None case is part of the normal contract, not an error.
        let env = TempEnvironment::builder().build();
        let unknown = env.dotfiles_root.join("never-cached.tmpl");
        let result =
            find_baseline_for_source(env.fs.as_ref(), env.paths.as_ref(), &unknown).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn find_baseline_for_source_on_empty_cache_returns_none() {
        // No baselines on disk at all (e.g. user has never run
        // `dodot up`) → Ok(None), not an error.
        let env = TempEnvironment::builder().build();
        let any = env.dotfiles_root.join("anything.tmpl");
        let result = find_baseline_for_source(env.fs.as_ref(), env.paths.as_ref(), &any).unwrap();
        assert!(result.is_none());
    }
}