Skip to main content

dodot_lib/preprocessing/
baseline.rs

1//! Per-file baseline cache for the preprocessing pipeline.
2//!
3//! Every successful expansion writes a JSON record at
4//! `<cache_dir>/preprocessor/<pack>/<handler>/<filename>.json` capturing
5//! enough state to (a) detect drift on the deployed file, (b) decide
6//! whether the source has changed, and (c) drive cache-backed
7//! reverse-merge without re-rendering the template.
8//!
9//! See `docs/proposals/preprocessing-pipeline.lex` §5.2 for the
10//! field-level contract and `docs/proposals/magic.lex` §"Cache That
11//! Makes It Cheap" for why the `tracked_render` field exists.
12//!
13//! # Lifecycle
14//!
15//! - **Write**: `preprocess_pack` calls [`Baseline::write`] after every
16//!   successful expansion. Re-running `dodot up` overwrites the file in
17//!   place.
18//! - **Read**: `dodot transform check` and the clean filter call
19//!   [`Baseline::load`] to drive divergence detection.
20//! - **Cleanup**: `dodot down` deletes the per-pack subdirectory; the
21//!   cache survives `dodot up` failures so partial deployments don't
22//!   strand baseline data for files that did succeed.
23//!
24//! # Schema versioning
25//!
26//! Records carry a `version` field. The current schema is `1`. Future
27//! changes that add fields can stay at `v1` (serde-default fills in the
28//! missing value); breaking changes bump the version, and load returns
29//! a clean error so the user can clear the cache and re-baseline.
30
31use std::path::{Path, PathBuf};
32use std::time::{SystemTime, UNIX_EPOCH};
33
34use serde::{Deserialize, Serialize};
35use sha2::{Digest, Sha256};
36
37use crate::fs::Fs;
38use crate::paths::Pather;
39use crate::{DodotError, Result};
40
/// Current baseline-cache schema version. Bump on incompatible changes.
///
/// [`Baseline::build`] stamps this value into every new record, and
/// [`Baseline::load`] returns an error for any stored record whose
/// `version` field differs from it.
pub const SCHEMA_VERSION: u32 = 1;
43
/// One baseline record — the cached state of a single processed file.
///
/// Serialised to JSON by [`Baseline::write`] and parsed back by
/// [`Baseline::load`]. Fields marked `#[serde(default)]` may be absent
/// from a stored record; they then deserialise to their empty value.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Baseline {
    /// Schema version — see [`SCHEMA_VERSION`].
    pub version: u32,
    /// Absolute path of the source file at expansion time. Captured so
    /// `dodot transform check` can re-find the template to patch
    /// without re-walking the pack tree, and so cache-only diagnostics
    /// can name the source even after pack reorganisation.
    ///
    /// `#[serde(default)]` for backward compatibility with any v1
    /// baseline written before this field existed (such records load
    /// with an empty path; transform check will skip those entries
    /// until they're rewritten by the next `dodot up`).
    #[serde(default)]
    pub source_path: PathBuf,
    /// SHA-256 of the rendered (visible, marker-free) output, hex-encoded.
    /// [`Baseline::build`] computes it over the raw rendered bytes,
    /// before any UTF-8 conversion.
    pub rendered_hash: String,
    /// The full rendered output verbatim. Stored so reverse-merge can
    /// diff the deployed file against the baseline byte-for-byte
    /// without re-rendering the template.
    ///
    /// [`Baseline::build`] converts the rendered bytes with lossy
    /// UTF-8 decoding, so invalid sequences are stored as U+FFFD. For
    /// such input, re-hashing this string will not reproduce
    /// `rendered_hash`.
    pub rendered_content: String,
    /// SHA-256 of the source file's bytes at the moment of expansion,
    /// hex-encoded. Used to distinguish "user edited the source" from
    /// "user edited the deployed file" (the 4-state matrix in the
    /// pipeline spec §6.1).
    pub source_hash: String,
    /// SHA-256 of the rendering context (variables, dodot.* values),
    /// hex-encoded. Provided by the preprocessor; for templates this is
    /// the deterministic projection computed by
    /// [`compute_context_hash`](crate::preprocessing::template). May be
    /// empty if the preprocessor has no meaningful context concept, or
    /// if the stored record omits the field.
    #[serde(default)]
    pub context_hash: String,
    /// Marker-annotated rendered output (burgertocow's "tracked"
    /// stream). Empty when the preprocessor doesn't produce one, or
    /// when the stored record omits the field.
    /// Persisted so the clean filter can rehydrate a `TrackedRender`
    /// via [`burgertocow::TrackedRender::from_tracked_string`] and
    /// drive the reverse-diff without re-rendering — re-rendering at
    /// clean-filter time would re-trigger any secret-provider auth
    /// prompts on every `git status`.
    #[serde(default)]
    pub tracked_render: String,
    /// Wall-clock unix timestamp (seconds) of when the baseline was
    /// built. Used by `dodot transform status` to show "deployed
    /// since …". Not load-bearing for divergence detection.
    pub timestamp: u64,
}
92
93impl Baseline {
94    /// Build a baseline from raw inputs. Hashes are computed here so
95    /// callers don't repeat the SHA setup; the optional `tracked_render`
96    /// and `context_hash` come straight off the preprocessor's
97    /// `ExpandedFile`.
98    ///
99    /// `source_path` is the absolute path of the source file inside
100    /// the pack — recorded so reverse-merge knows where to write the
101    /// patched template back to.
102    pub fn build(
103        source_path: &Path,
104        rendered_content: &[u8],
105        source_bytes: &[u8],
106        tracked_render: Option<&str>,
107        context_hash: Option<&[u8; 32]>,
108    ) -> Self {
109        Self {
110            version: SCHEMA_VERSION,
111            source_path: source_path.to_path_buf(),
112            rendered_hash: hex_sha256(rendered_content),
113            rendered_content: String::from_utf8_lossy(rendered_content).into_owned(),
114            source_hash: hex_sha256(source_bytes),
115            context_hash: context_hash.map(hex_encode_32).unwrap_or_default(),
116            tracked_render: tracked_render.unwrap_or("").to_string(),
117            timestamp: now_secs_unix(),
118        }
119    }
120
121    /// Persist this baseline to its JSON path under the cache dir.
122    /// Creates parent directories as needed. Overwrites any existing
123    /// file at the target path.
124    pub fn write(
125        &self,
126        fs: &dyn Fs,
127        paths: &dyn Pather,
128        pack: &str,
129        handler: &str,
130        filename: &str,
131    ) -> Result<PathBuf> {
132        let path = paths.preprocessor_baseline_path(pack, handler, filename);
133        if let Some(parent) = path.parent() {
134            fs.mkdir_all(parent)?;
135        }
136        let body = serde_json::to_string_pretty(self).map_err(|e| {
137            DodotError::Other(format!(
138                "failed to serialise baseline for {pack}/{handler}/{filename}: {e}"
139            ))
140        })?;
141        fs.write_file(&path, body.as_bytes())?;
142        Ok(path)
143    }
144
145    /// Load a baseline from its JSON path. Returns `Ok(None)` if the
146    /// file does not exist (a file with no baseline is a normal state
147    /// for a brand-new pack); returns an error for parse failures or
148    /// unsupported schema versions so the caller can suggest a manual
149    /// clear.
150    pub fn load(
151        fs: &dyn Fs,
152        paths: &dyn Pather,
153        pack: &str,
154        handler: &str,
155        filename: &str,
156    ) -> Result<Option<Self>> {
157        let path = paths.preprocessor_baseline_path(pack, handler, filename);
158        if !fs.exists(&path) {
159            return Ok(None);
160        }
161        let raw = fs.read_to_string(&path)?;
162        let baseline: Self = serde_json::from_str(&raw).map_err(|e| {
163            DodotError::Other(format!(
164                "failed to parse baseline at {}: {e}\n  \
165                 Try `dodot up --force` to re-baseline.",
166                path.display()
167            ))
168        })?;
169        if baseline.version != SCHEMA_VERSION {
170            return Err(DodotError::Other(format!(
171                "baseline at {} has unsupported schema version {} (expected {}). \
172                 Clear the file and run `dodot up` to rebuild.",
173                path.display(),
174                baseline.version,
175                SCHEMA_VERSION
176            )));
177        }
178        Ok(Some(baseline))
179    }
180}
181
182/// SHA-256 → 64-char lowercase hex. Used by the baseline cache for
183/// rendered/source content hashing and by the divergence walker for
184/// the same purpose against current on-disk state. `pub(crate)` so
185/// the divergence module reuses it instead of cloning a parallel
186/// implementation.
187pub(crate) fn hex_sha256(bytes: &[u8]) -> String {
188    let mut hasher = Sha256::new();
189    hasher.update(bytes);
190    hex_encode_32(&hasher.finalize().into())
191}
192
193fn hex_encode_32(bytes: &[u8; 32]) -> String {
194    let mut out = String::with_capacity(64);
195    for b in bytes {
196        out.push(hex_nibble(b >> 4));
197        out.push(hex_nibble(b & 0x0f));
198    }
199    out
200}
201
/// Map a 4-bit value (`0..=15`) to its lowercase hex digit.
///
/// # Panics
///
/// Panics if `n` is greater than 15; callers only pass masked or
/// shifted nibbles.
fn hex_nibble(n: u8) -> char {
    // `from_digit` with radix 16 yields '0'..='9' then 'a'..='f', and
    // `None` for anything above 15.
    char::from_digit(u32::from(n), 16).unwrap_or_else(|| unreachable!())
}
209
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to `0` if the system clock reports a time before the
/// epoch, rather than failing the caller.
fn now_secs_unix() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
216
/// Derive the cache filename for a logical (stripped) pack path.
///
/// Only the final path component is kept, matching the cache-path
/// convention specified in the pipeline doc. If the path has no final
/// component (for example `..` or an empty path), the whole path is
/// returned as a lossily-converted string instead.
///
/// Subdirectory-bearing virtual entries such as `subdir/config.toml`
/// are therefore flattened to `config.toml`. The pipeline
/// disambiguates on its side via the per-pack-and-handler directory
/// tree, but the cache layout intentionally keeps one slot per file.
/// Two files with the same basename in different subdirectories of the
/// same pack would share a slot — uncommon for the dotfile-sized
/// payloads preprocessors produce, and fixable later by extending the
/// filename encoding without touching callers.
pub fn cache_filename_for(virtual_relative: &Path) -> String {
    let name = match virtual_relative.file_name() {
        Some(basename) => basename.to_string_lossy(),
        None => virtual_relative.to_string_lossy(),
    };
    name.into_owned()
}
235
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testing::TempEnvironment;

    /// Handler directory used by every test that touches the cache.
    const HANDLER: &str = "preprocessed";

    /// Write `baseline` into the test cache and return the path written.
    #[track_caller]
    fn store(env: &TempEnvironment, baseline: &Baseline, pack: &str, filename: &str) -> PathBuf {
        baseline
            .write(env.fs.as_ref(), env.paths.as_ref(), pack, HANDLER, filename)
            .unwrap()
    }

    /// Load whatever the test cache holds for `pack`/`filename`.
    fn fetch(env: &TempEnvironment, pack: &str, filename: &str) -> Result<Option<Baseline>> {
        Baseline::load(env.fs.as_ref(), env.paths.as_ref(), pack, HANDLER, filename)
    }

    /// Place arbitrary bytes at the cache path, bypassing `Baseline::write`.
    #[track_caller]
    fn plant_raw(env: &TempEnvironment, pack: &str, filename: &str, raw: &[u8]) {
        let path = env
            .paths
            .preprocessor_baseline_path(pack, HANDLER, filename);
        env.fs.mkdir_all(path.parent().unwrap()).unwrap();
        env.fs.write_file(&path, raw).unwrap();
    }

    #[test]
    fn build_then_write_then_load_round_trips() {
        let env = TempEnvironment::builder().build();
        let original = Baseline::build(
            Path::new("/tmp/config.toml.tmpl"),
            b"name = Alice\n",
            b"name = {{ name }}\n",
            Some("name = \u{1e}Alice\u{1f}\n"),
            Some(&[0x42; 32]),
        );

        let written_to = store(&env, &original, "app", "config.toml");
        assert!(env.fs.exists(&written_to));

        let reloaded = fetch(&env, "app", "config.toml")
            .unwrap()
            .expect("baseline must exist after write");
        assert_eq!(reloaded, original);
    }

    #[test]
    fn load_returns_none_for_missing_file() {
        let env = TempEnvironment::builder().build();
        let outcome = fetch(&env, "app", "nope.toml").unwrap();
        assert!(outcome.is_none());
    }

    #[test]
    fn load_rejects_unsupported_schema_version() {
        let env = TempEnvironment::builder().build();
        // The record is otherwise loadable (the omitted fields all have
        // serde defaults), so only the version can trigger the error.
        plant_raw(
            &env,
            "app",
            "config.toml",
            br#"{"version": 999, "rendered_hash": "x", "rendered_content": "x", "source_hash": "x", "timestamp": 0}"#,
        );

        let err = fetch(&env, "app", "config.toml").unwrap_err();
        assert!(
            format!("{err}").contains("unsupported schema version"),
            "got: {err}"
        );
    }

    #[test]
    fn load_rejects_corrupted_json() {
        let env = TempEnvironment::builder().build();
        plant_raw(&env, "app", "config.toml", b"{not json");

        let err = fetch(&env, "app", "config.toml").unwrap_err();
        let msg = format!("{err}");
        assert!(msg.contains("failed to parse"), "got: {msg}");
        // The message must carry a recovery hint so users know how to
        // get unstuck.
        assert!(
            msg.contains("--force"),
            "expected recovery hint, got: {msg}"
        );
    }

    #[test]
    fn build_records_hashes_and_optional_fields() {
        let source = Path::new("/dummy/source");

        // Absent optionals become empty strings (the serde default),
        // not nulls.
        let bare = Baseline::build(source, b"hello", b"hello", None, None);
        assert_eq!(bare.version, SCHEMA_VERSION);
        assert_eq!(bare.source_path, source);
        assert_eq!(bare.rendered_hash.len(), 64); // SHA-256 hex
        assert_eq!(bare.source_hash, bare.rendered_hash); // same bytes
        assert!(bare.context_hash.is_empty());
        assert!(bare.tracked_render.is_empty());

        // Supplied optionals are stored, with the digest hex-encoded.
        let full = Baseline::build(source, b"x", b"y", Some("tracked"), Some(&[0xff; 32]));
        assert_eq!(full.context_hash.len(), 64);
        assert!(full.context_hash.chars().all(|c| c == 'f'));
        assert_eq!(full.tracked_render, "tracked");
    }

    #[test]
    fn rendered_content_preserves_lossy_utf8() {
        // rendered_content is stored as UTF-8 text (templates are
        // text). Pin what happens to non-UTF-8 bytes so any future
        // change is a deliberate decision.
        let invalid = b"fo\xffo";
        let baseline = Baseline::build(Path::new("/dummy"), invalid, b"src", None, None);
        // The invalid 0xff becomes the replacement character.
        assert_eq!(baseline.rendered_content, "fo\u{fffd}o");
    }

    #[test]
    fn write_creates_nested_directories() {
        // On first write the pack and handler directories may not
        // exist yet; `write` must create them itself.
        let env = TempEnvironment::builder().build();
        let baseline = Baseline::build(Path::new("/dummy"), b"x", b"y", None, None);

        let written_to = store(&env, &baseline, "deep", "x");
        assert!(env.fs.exists(&written_to));
        assert!(env.fs.is_dir(written_to.parent().unwrap()));
    }

    #[test]
    fn write_overwrites_existing_baseline() {
        // Writing twice to the same logical path keeps only the
        // second record.
        let env = TempEnvironment::builder().build();
        for content in [&b"first"[..], &b"second"[..]] {
            let baseline = Baseline::build(Path::new("/dummy"), content, b"src", None, None);
            store(&env, &baseline, "app", "f");
        }

        let survivor = fetch(&env, "app", "f").unwrap().unwrap();
        assert_eq!(survivor.rendered_content, "second");
    }

    #[test]
    fn cache_filename_for_drops_parent_directories() {
        let cases = [
            ("config.toml", "config.toml"),
            ("subdir/config.toml", "config.toml"),
            ("a/b/c/leaf.txt", "leaf.txt"),
        ];
        for (input, expected) in cases {
            assert_eq!(cache_filename_for(Path::new(input)), expected);
        }
    }

    #[test]
    fn hex_encoding_is_lowercase_and_padded() {
        let zeros = hex_encode_32(&[0; 32]);
        assert_eq!(zeros.len(), 64);
        assert!(zeros.chars().all(|c| c == '0'));

        let abs = hex_encode_32(&[0xab; 32]);
        assert_eq!(abs.len(), 64);
        // Lowercase by convention.
        assert!(abs.chars().all(|c| c == 'a' || c == 'b'));
    }
}
454}