Skip to main content

dodot_lib/preprocessing/
divergence.rs

1//! Drift detection for preprocessor outputs (the 4-state matrix).
2//!
3//! Walks the per-pack baseline cache and compares each cached record
4//! against the current source file (in the pack) and the current
5//! deployed file (in the datastore). Classifies each pair into one of
6//! the four states defined in `docs/proposals/preprocessing-pipeline.lex`
7//! §6.1:
8//!
9//! | source | deployed | state           |
10//! |--------|----------|-----------------|
11//! | same   | same     | `Synced`        |
12//! | new    | same     | `InputChanged`  |
13//! | same   | edited   | `OutputChanged` |
14//! | new    | edited   | `BothChanged`   |
15//!
16//! Plus two special states for missing files: a baseline whose source
17//! has been deleted (`MissingSource`) or whose deployed artifact is
18//! gone (`MissingDeployed`).
19//!
20//! This module is **read-only**. It produces a [`DivergenceReport`] per
21//! cached baseline; the action layer (`commands::transform::check`)
22//! decides what to do with each report (apply a reverse-merge diff,
23//! emit a conflict block, etc).
24
25use std::path::PathBuf;
26
27use serde::Serialize;
28
29use crate::fs::Fs;
30use crate::paths::Pather;
31use crate::preprocessing::baseline::{hex_sha256, Baseline};
32use crate::Result;
33
34/// Where a single processed file sits in the 4-state matrix.
35#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
36pub enum DivergenceState {
37    /// Source unchanged, deployed file matches the cached render.
38    /// Nothing to do.
39    Synced,
40    /// Source has changed since the cached render, but the deployed
41    /// file is still the cached render. The next `dodot up` will
42    /// re-render — no action from `transform check`.
43    InputChanged,
44    /// Source unchanged, deployed file edited by the user. The
45    /// reverse-merge engine should propagate the edit back to the
46    /// source.
47    OutputChanged,
48    /// Both the source and the deployed file have changed since the
49    /// last `dodot up`. The reverse-merge engine still tries to
50    /// produce a diff, but the result is more likely to require a
51    /// conflict block.
52    BothChanged,
53    /// The cached source path no longer exists on disk. The pack file
54    /// was renamed or removed; the baseline is stale and should be
55    /// dropped on the next `up`.
56    MissingSource,
57    /// The cached deployed path is gone. The user (or some external
58    /// tool) deleted the rendered file. Unusual but worth surfacing.
59    MissingDeployed,
60}
61
62/// One row in `dodot transform check`'s report.
63#[derive(Debug, Clone, Serialize)]
64pub struct DivergenceReport {
65    pub pack: String,
66    pub handler: String,
67    /// Filename inside the cache (e.g. `"config.toml"`). Same as the
68    /// stripped virtual entry the preprocessor produced.
69    pub filename: String,
70    /// Absolute path of the source file in the pack.
71    pub source_path: PathBuf,
72    /// Absolute path of the deployed (rendered) file in the datastore.
73    pub deployed_path: PathBuf,
74    /// The classified state.
75    pub state: DivergenceState,
76}
77
78/// Walk the per-pack baseline cache directory and load every record.
79///
80/// Returns `(pack, handler, filename, baseline)` tuples. The cache
81/// layout is `<cache_dir>/preprocessor/<pack>/<handler>/<filename>.json`,
82/// so this function is a 3-level read_dir walk. Missing or unreadable
83/// subdirectories are skipped silently — the cache is rederivable, and
84/// we never want a transient permission glitch to crash a check run.
85pub fn collect_baselines(
86    fs: &dyn Fs,
87    paths: &dyn Pather,
88) -> Result<Vec<(String, String, String, Baseline)>> {
89    let root = paths.cache_dir().join("preprocessor");
90    if !fs.is_dir(&root) {
91        return Ok(Vec::new());
92    }
93
94    let mut out = Vec::new();
95    let mut packs = match fs.read_dir(&root) {
96        Ok(v) => v,
97        Err(_) => return Ok(Vec::new()),
98    };
99    packs.sort_by(|a, b| a.name.cmp(&b.name));
100
101    for pack in packs {
102        if !pack.is_dir {
103            continue;
104        }
105        let mut handlers = match fs.read_dir(&pack.path) {
106            Ok(v) => v,
107            Err(_) => continue,
108        };
109        handlers.sort_by(|a, b| a.name.cmp(&b.name));
110
111        for handler in handlers {
112            if !handler.is_dir {
113                continue;
114            }
115            let mut files = match fs.read_dir(&handler.path) {
116                Ok(v) => v,
117                Err(_) => continue,
118            };
119            files.sort_by(|a, b| a.name.cmp(&b.name));
120
121            for file in files {
122                if !file.is_file {
123                    continue;
124                }
125                // Filenames in the cache are `<logical>.json`; strip
126                // the suffix to recover the logical name.
127                let Some(filename) = file.name.strip_suffix(".json").map(str::to_string) else {
128                    continue;
129                };
130                match Baseline::load(fs, paths, &pack.name, &handler.name, &filename) {
131                    Ok(Some(baseline)) => {
132                        out.push((pack.name.clone(), handler.name.clone(), filename, baseline));
133                    }
134                    // A corrupt baseline gets surfaced as an error
135                    // here so the user knows to clear it; better than
136                    // silently dropping it from the report.
137                    Ok(None) => {} // unreachable when fs.is_file is true, but tolerate
138                    Err(e) => return Err(e),
139                }
140            }
141        }
142    }
143
144    Ok(out)
145}
146
147/// Classify a single baseline against the current state on disk.
148///
149/// The deployed-file path is derived from the datastore layout: a
150/// preprocessor-expanded file lives at
151/// `<data_dir>/packs/<pack>/<handler>/<filename>`. The user's
152/// home-side symlink dereferences to this path, so reading the bytes
153/// here is the same as reading what the user sees — the double-link
154/// model means the deployed file *is* the file in the datastore.
155pub fn classify_one(
156    fs: &dyn Fs,
157    paths: &dyn Pather,
158    pack: &str,
159    handler: &str,
160    filename: &str,
161    baseline: &Baseline,
162) -> DivergenceReport {
163    let source_path = baseline.source_path.clone();
164    let deployed_path = paths
165        .data_dir()
166        .join("packs")
167        .join(pack)
168        .join(handler)
169        .join(filename);
170
171    let source_exists = !source_path.as_os_str().is_empty() && fs.exists(&source_path);
172    let deployed_exists = fs.exists(&deployed_path);
173
174    let state = if !source_exists {
175        DivergenceState::MissingSource
176    } else if !deployed_exists {
177        DivergenceState::MissingDeployed
178    } else {
179        // Best-effort reads: if either side is unreadable mid-walk
180        // (rare; e.g. a permissions hiccup), we fall back to "Synced"
181        // rather than crashing the report. The caller can re-run.
182        let source_changed = match fs.read_file(&source_path) {
183            Ok(bytes) => hex_sha256(&bytes) != baseline.source_hash,
184            Err(_) => false,
185        };
186        let deployed_changed = match fs.read_file(&deployed_path) {
187            Ok(bytes) => hex_sha256(&bytes) != baseline.rendered_hash,
188            Err(_) => false,
189        };
190        match (source_changed, deployed_changed) {
191            (false, false) => DivergenceState::Synced,
192            (true, false) => DivergenceState::InputChanged,
193            (false, true) => DivergenceState::OutputChanged,
194            (true, true) => DivergenceState::BothChanged,
195        }
196    };
197
198    DivergenceReport {
199        pack: pack.to_string(),
200        handler: handler.to_string(),
201        filename: filename.to_string(),
202        source_path,
203        deployed_path,
204        state,
205    }
206}
207
208/// Walk every cached baseline and produce a divergence report.
209///
210/// The report is sorted by `(pack, handler, filename)` so consumers can
211/// rely on a stable display order without a second sort.
212pub fn collect_divergences(fs: &dyn Fs, paths: &dyn Pather) -> Result<Vec<DivergenceReport>> {
213    let baselines = collect_baselines(fs, paths)?;
214    let reports: Vec<DivergenceReport> = baselines
215        .iter()
216        .map(|(p, h, f, b)| classify_one(fs, paths, p, h, f, b))
217        .collect();
218    Ok(reports)
219}
220
221/// Look up the baseline whose `source_path` matches `target`, plus
222/// the `(pack, handler, filename)` triple that identifies it in the
223/// cache layout.
224///
225/// Used by the clean filter (R6): git invokes the filter with the
226/// source path of the file being processed, and the filter needs the
227/// matching baseline to find the deployed bytes and the cached
228/// tracked render. The lookup is a linear scan of the cache — fast
229/// enough for the realistic per-repo template count (tens to low
230/// hundreds), and avoids the on-disk index file the cache layout
231/// would otherwise need.
232///
233/// Returns `Ok(None)` when no baseline matches; the clean filter
234/// treats that as "echo stdin unchanged" rather than an error.
235pub fn find_baseline_for_source(
236    fs: &dyn Fs,
237    paths: &dyn Pather,
238    target: &std::path::Path,
239) -> Result<Option<(String, String, String, Baseline)>> {
240    for (pack, handler, filename, baseline) in collect_baselines(fs, paths)? {
241        if baseline.source_path == target {
242            return Ok(Some((pack, handler, filename, baseline)));
243        }
244    }
245    Ok(None)
246}
247
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::*;
    use crate::testing::TempEnvironment;

    /// Handler directory every fixture in this module is staged under.
    const HANDLER: &str = "preprocessed";

    /// Create `<dotfiles_root>/<pack>/<name>` containing `body`.
    fn write_pack_template(env: &TempEnvironment, pack: &str, name: &str, body: &str) {
        let target = env.dotfiles_root.join(pack).join(name);
        env.fs.mkdir_all(target.parent().unwrap()).unwrap();
        env.fs.write_file(&target, body.as_bytes()).unwrap();
    }

    /// Create `<data_dir>/packs/<pack>/<handler>/<name>` containing `body`.
    fn write_deployed(env: &TempEnvironment, pack: &str, handler: &str, name: &str, body: &str) {
        let pack_dir = env.paths.data_dir().join("packs").join(pack);
        let target = pack_dir.join(handler).join(name);
        env.fs.mkdir_all(target.parent().unwrap()).unwrap();
        env.fs.write_file(&target, body.as_bytes()).unwrap();
    }

    /// Build a baseline for `source_path` from the given render and
    /// source bytes.
    fn baseline_for(source_path: &Path, rendered: &[u8], source: &[u8]) -> Baseline {
        Baseline::build(source_path, rendered, source, Some(""), None)
    }

    /// Persist `baseline` into the cache under `<pack>/preprocessed/<filename>`.
    fn stage_baseline(env: &TempEnvironment, pack: &str, filename: &str, baseline: Baseline) {
        baseline
            .write(
                env.fs.as_ref(),
                env.paths.as_ref(),
                pack,
                HANDLER,
                filename,
            )
            .unwrap();
    }

    /// Run the full divergence walk against `env`.
    fn all_reports(env: &TempEnvironment) -> Vec<DivergenceReport> {
        collect_divergences(env.fs.as_ref(), env.paths.as_ref()).unwrap()
    }

    /// Stage the standard `app/config.toml` baseline, whose cached
    /// source is `"src"` and cached render is `"rendered"`.
    fn stage_app_config_baseline(env: &TempEnvironment) {
        let source = env.dotfiles_root.join("app/config.toml.tmpl");
        stage_baseline(
            env,
            "app",
            "config.toml",
            baseline_for(&source, b"rendered", b"src"),
        );
    }

    #[test]
    fn empty_cache_yields_empty_report() {
        let env = TempEnvironment::builder().build();
        assert!(all_reports(&env).is_empty());
    }

    #[test]
    fn synced_state_when_nothing_changed() {
        // Source bytes, deployed bytes and baseline all agree.
        let env = TempEnvironment::builder().build();
        write_pack_template(&env, "app", "config.toml.tmpl", "src");
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered");
        stage_app_config_baseline(&env);

        let reports = all_reports(&env);
        assert_eq!(reports.len(), 1);
        assert_eq!(reports[0].state, DivergenceState::Synced);
    }

    #[test]
    fn input_changed_when_source_edited() {
        // Only the source differs from the baseline. The next `up`
        // re-renders, so `transform check` has nothing to do.
        let env = TempEnvironment::builder().build();
        write_pack_template(&env, "app", "config.toml.tmpl", "src EDITED");
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered");
        stage_app_config_baseline(&env);

        let reports = all_reports(&env);
        assert_eq!(reports[0].state, DivergenceState::InputChanged);
    }

    #[test]
    fn output_changed_when_deployed_edited() {
        // Auto-merge happy path: only the deployed file moved.
        let env = TempEnvironment::builder().build();
        write_pack_template(&env, "app", "config.toml.tmpl", "src");
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered EDIT");
        stage_app_config_baseline(&env);

        let reports = all_reports(&env);
        assert_eq!(reports[0].state, DivergenceState::OutputChanged);
    }

    #[test]
    fn both_changed_when_both_edited() {
        let env = TempEnvironment::builder().build();
        write_pack_template(&env, "app", "config.toml.tmpl", "src EDIT");
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered EDIT");
        stage_app_config_baseline(&env);

        let reports = all_reports(&env);
        assert_eq!(reports[0].state, DivergenceState::BothChanged);
    }

    #[test]
    fn missing_source_when_pack_file_deleted() {
        // The baseline names a source that is no longer there (the
        // template was renamed or deleted). It gets its own state so
        // callers can offer to drop the stale baseline.
        let env = TempEnvironment::builder().build();
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered");
        stage_app_config_baseline(&env);

        let reports = all_reports(&env);
        assert_eq!(reports[0].state, DivergenceState::MissingSource);
    }

    #[test]
    fn missing_deployed_when_datastore_file_gone() {
        // The deployed file is intentionally never written.
        let env = TempEnvironment::builder().build();
        write_pack_template(&env, "app", "config.toml.tmpl", "src");
        stage_app_config_baseline(&env);

        let reports = all_reports(&env);
        assert_eq!(reports[0].state, DivergenceState::MissingDeployed);
    }

    #[test]
    fn report_is_sorted_by_pack_handler_filename() {
        // Three files across two packs, staged out of order. The
        // walker has to return them in (pack, handler, filename) order
        // so display layers need no second sort.
        let env = TempEnvironment::builder().build();
        let fixtures = [
            ("zebra", "z.toml.tmpl", "z-src"),
            ("alpha", "b.toml.tmpl", "b-src"),
            ("alpha", "a.toml.tmpl", "a-src"),
        ];
        for (pack, name, body) in fixtures {
            write_pack_template(&env, pack, name, body);
            let cache_name = name.strip_suffix(".tmpl").unwrap();
            write_deployed(&env, pack, HANDLER, cache_name, "rendered");
            let source = env.dotfiles_root.join(pack).join(name);
            stage_baseline(
                &env,
                pack,
                cache_name,
                baseline_for(&source, b"rendered", body.as_bytes()),
            );
        }

        let order: Vec<(String, String)> = all_reports(&env)
            .iter()
            .map(|report| (report.pack.clone(), report.filename.clone()))
            .collect();
        let expected: Vec<(String, String)> = vec![
            ("alpha".into(), "a.toml".into()),
            ("alpha".into(), "b.toml".into()),
            ("zebra".into(), "z.toml".into()),
        ];
        assert_eq!(order, expected);
    }

    #[test]
    fn baseline_with_empty_source_path_is_classified_missing_source() {
        // Forward-compat with v1 baselines that predate `source_path`:
        // serde-default yields an empty PathBuf, and the classifier
        // answers MissingSource so the user notices and re-runs
        // `dodot up` to rebuild the cache.
        let env = TempEnvironment::builder().build();
        write_deployed(&env, "app", HANDLER, "config.toml", "rendered");
        stage_baseline(
            &env,
            "app",
            "config.toml",
            baseline_for(Path::new(""), b"rendered", b"src"),
        );

        let reports = all_reports(&env);
        assert_eq!(reports[0].state, DivergenceState::MissingSource);
    }

    // ── find_baseline_for_source ────────────────────────────────

    #[test]
    fn find_baseline_for_source_returns_match() {
        // Two baselines with different source paths are staged; only
        // the one whose `source_path` equals the target may come back.
        let env = TempEnvironment::builder().build();

        let src_a = env.dotfiles_root.join("app/a.toml.tmpl");
        write_pack_template(&env, "app", "a.toml.tmpl", "src-a");
        write_deployed(&env, "app", HANDLER, "a.toml", "rendered-a");
        stage_baseline(
            &env,
            "app",
            "a.toml",
            baseline_for(&src_a, b"rendered-a", b"src-a"),
        );

        let src_b = env.dotfiles_root.join("app/b.toml.tmpl");
        write_pack_template(&env, "app", "b.toml.tmpl", "src-b");
        write_deployed(&env, "app", HANDLER, "b.toml", "rendered-b");
        stage_baseline(
            &env,
            "app",
            "b.toml",
            baseline_for(&src_b, b"rendered-b", b"src-b"),
        );

        let hit = find_baseline_for_source(env.fs.as_ref(), env.paths.as_ref(), &src_a).unwrap();
        let (pack, handler, filename, baseline) = hit.expect("baseline must be found");
        assert_eq!(pack, "app");
        assert_eq!(handler, "preprocessed");
        assert_eq!(filename, "a.toml");
        assert_eq!(baseline.source_path, src_a);
        assert_eq!(baseline.rendered_content, "rendered-a");
    }

    #[test]
    fn find_baseline_for_source_returns_none_when_unknown() {
        // A path the cache has never recorded gives Ok(None). The
        // clean filter maps that to "echo stdin unchanged", so None is
        // part of the normal contract rather than an error.
        let env = TempEnvironment::builder().build();
        let unknown = env.dotfiles_root.join("never-cached.tmpl");
        let result =
            find_baseline_for_source(env.fs.as_ref(), env.paths.as_ref(), &unknown).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn find_baseline_for_source_on_empty_cache_returns_none() {
        // Nothing cached at all (e.g. `dodot up` has never run) is
        // also Ok(None), not an error.
        let env = TempEnvironment::builder().build();
        let any = env.dotfiles_root.join("anything.tmpl");
        let result = find_baseline_for_source(env.fs.as_ref(), env.paths.as_ref(), &any).unwrap();
        assert!(result.is_none());
    }
}