dodot_lib/preprocessing/baseline.rs
1//! Per-file baseline cache for the preprocessing pipeline.
2//!
3//! Every successful expansion writes a JSON record at
4//! `<cache_dir>/preprocessor/<pack>/<handler>/<filename>.json` capturing
5//! enough state to (a) detect drift on the deployed file, (b) decide
6//! whether the source has changed, and (c) drive cache-backed
7//! reverse-merge without re-rendering the template.
8//!
9//! See `docs/proposals/preprocessing-pipeline.lex` §5.2 for the
10//! field-level contract and `docs/proposals/magic.lex` §"Cache That
11//! Makes It Cheap" for why the `tracked_render` field exists.
12//!
13//! # Lifecycle
14//!
15//! - **Write**: `preprocess_pack` calls [`Baseline::write`] after every
16//! successful expansion. Re-running `dodot up` overwrites the file in
17//! place.
18//! - **Read**: `dodot transform check` and the clean filter call
19//! [`Baseline::load`] to drive divergence detection.
20//! - **Cleanup**: `dodot down` deletes the per-pack subdirectory; the
21//! cache survives `dodot up` failures so partial deployments don't
22//! strand baseline data for files that did succeed.
23//!
24//! # Schema versioning
25//!
26//! Records carry a `version` field. The current schema is `1`. Future
27//! changes that add fields can stay at `v1` (serde-default fills in the
28//! missing value); breaking changes bump the version, and load returns
29//! a clean error so the user can clear the cache and re-baseline.
30
31use std::path::{Path, PathBuf};
32use std::time::{SystemTime, UNIX_EPOCH};
33
34use serde::{Deserialize, Serialize};
35use sha2::{Digest, Sha256};
36
37use crate::fs::Fs;
38use crate::paths::Pather;
39use crate::{DodotError, Result};
40
/// Current baseline-cache schema version. Bump on incompatible changes.
///
/// [`Baseline::build`] stamps this value into every new record, and
/// [`Baseline::load`] rejects any record whose `version` differs from it.
pub const SCHEMA_VERSION: u32 = 1;
43
/// One baseline record — the cached state of a single processed file.
///
/// Serialised to and from JSON via serde. Fields marked
/// `#[serde(default)]` may be absent from a stored record and are then
/// filled with their type's default (empty path / empty string); every
/// other field is required when loading.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Baseline {
    /// Schema version — see [`SCHEMA_VERSION`].
    pub version: u32,
    /// Absolute path of the source file at expansion time. Captured so
    /// `dodot transform check` can re-find the template to patch
    /// without re-walking the pack tree, and so cache-only diagnostics
    /// can name the source even after pack reorganisation.
    ///
    /// `#[serde(default)]` for backward compatibility with any v1
    /// baseline written before this field existed (treated as empty;
    /// transform check will skip such entries until they're rewritten
    /// by the next `dodot up`).
    #[serde(default)]
    pub source_path: PathBuf,
    /// SHA-256 of the rendered (visible, marker-free) output, hex-encoded.
    /// Computed over the raw rendered bytes, not over `rendered_content`.
    pub rendered_hash: String,
    /// The full rendered output verbatim. Stored so reverse-merge can
    /// diff the deployed file against the baseline byte-for-byte
    /// without re-rendering the template.
    ///
    /// [`Baseline::build`] converts the rendered bytes with
    /// `String::from_utf8_lossy`, so invalid UTF-8 sequences are stored
    /// as U+FFFD. For such output, hashing this string will not
    /// reproduce `rendered_hash`.
    pub rendered_content: String,
    /// SHA-256 of the source file's bytes at the moment of expansion,
    /// hex-encoded. Used to distinguish "user edited the source" from
    /// "user edited the deployed file" (the 4-state matrix in the
    /// pipeline spec §6.1).
    pub source_hash: String,
    /// SHA-256 of the rendering context (variables, dodot.* values),
    /// hex-encoded. Provided by the preprocessor; for templates this is
    /// the deterministic projection computed by
    /// [`compute_context_hash`](crate::preprocessing::template). May be
    /// empty if the preprocessor has no meaningful context concept.
    #[serde(default)]
    pub context_hash: String,
    /// Marker-annotated rendered output (burgertocow's "tracked"
    /// stream). Empty when the preprocessor doesn't produce one.
    /// Persisted so the clean filter can rehydrate a `TrackedRender`
    /// via [`burgertocow::TrackedRender::from_tracked_string`] and
    /// drive the reverse-diff without re-rendering — re-rendering at
    /// clean-filter time would re-trigger any secret-provider auth
    /// prompts on every `git status`.
    #[serde(default)]
    pub tracked_render: String,
    /// Wall-clock unix timestamp (seconds) of when the baseline was
    /// built. Used by `dodot transform status` to show "deployed
    /// since …". Not load-bearing for divergence detection. Recorded
    /// as `0` if the system clock reads earlier than the Unix epoch.
    pub timestamp: u64,
}
92
93impl Baseline {
94 /// Build a baseline from raw inputs. Hashes are computed here so
95 /// callers don't repeat the SHA setup; the optional `tracked_render`
96 /// and `context_hash` come straight off the preprocessor's
97 /// `ExpandedFile`.
98 ///
99 /// `source_path` is the absolute path of the source file inside
100 /// the pack — recorded so reverse-merge knows where to write the
101 /// patched template back to.
102 pub fn build(
103 source_path: &Path,
104 rendered_content: &[u8],
105 source_bytes: &[u8],
106 tracked_render: Option<&str>,
107 context_hash: Option<&[u8; 32]>,
108 ) -> Self {
109 Self {
110 version: SCHEMA_VERSION,
111 source_path: source_path.to_path_buf(),
112 rendered_hash: hex_sha256(rendered_content),
113 rendered_content: String::from_utf8_lossy(rendered_content).into_owned(),
114 source_hash: hex_sha256(source_bytes),
115 context_hash: context_hash.map(hex_encode_32).unwrap_or_default(),
116 tracked_render: tracked_render.unwrap_or("").to_string(),
117 timestamp: now_secs_unix(),
118 }
119 }
120
121 /// Persist this baseline to its JSON path under the cache dir.
122 /// Creates parent directories as needed. Overwrites any existing
123 /// file at the target path.
124 pub fn write(
125 &self,
126 fs: &dyn Fs,
127 paths: &dyn Pather,
128 pack: &str,
129 handler: &str,
130 filename: &str,
131 ) -> Result<PathBuf> {
132 let path = paths.preprocessor_baseline_path(pack, handler, filename);
133 if let Some(parent) = path.parent() {
134 fs.mkdir_all(parent)?;
135 }
136 let body = serde_json::to_string_pretty(self).map_err(|e| {
137 DodotError::Other(format!(
138 "failed to serialise baseline for {pack}/{handler}/{filename}: {e}"
139 ))
140 })?;
141 fs.write_file(&path, body.as_bytes())?;
142 Ok(path)
143 }
144
145 /// Load a baseline from its JSON path. Returns `Ok(None)` if the
146 /// file does not exist (a file with no baseline is a normal state
147 /// for a brand-new pack); returns an error for parse failures or
148 /// unsupported schema versions so the caller can suggest a manual
149 /// clear.
150 pub fn load(
151 fs: &dyn Fs,
152 paths: &dyn Pather,
153 pack: &str,
154 handler: &str,
155 filename: &str,
156 ) -> Result<Option<Self>> {
157 let path = paths.preprocessor_baseline_path(pack, handler, filename);
158 if !fs.exists(&path) {
159 return Ok(None);
160 }
161 let raw = fs.read_to_string(&path)?;
162 let baseline: Self = serde_json::from_str(&raw).map_err(|e| {
163 DodotError::Other(format!(
164 "failed to parse baseline at {}: {e}\n \
165 Try `dodot up --force` to re-baseline.",
166 path.display()
167 ))
168 })?;
169 if baseline.version != SCHEMA_VERSION {
170 return Err(DodotError::Other(format!(
171 "baseline at {} has unsupported schema version {} (expected {}). \
172 Clear the file and run `dodot up` to rebuild.",
173 path.display(),
174 baseline.version,
175 SCHEMA_VERSION
176 )));
177 }
178 Ok(Some(baseline))
179 }
180}
181
182/// SHA-256 → 64-char lowercase hex. Used by the baseline cache for
183/// rendered/source content hashing and by the divergence walker for
184/// the same purpose against current on-disk state. `pub(crate)` so
185/// the divergence module reuses it instead of cloning a parallel
186/// implementation.
187pub(crate) fn hex_sha256(bytes: &[u8]) -> String {
188 let mut hasher = Sha256::new();
189 hasher.update(bytes);
190 hex_encode_32(&hasher.finalize().into())
191}
192
193fn hex_encode_32(bytes: &[u8; 32]) -> String {
194 let mut out = String::with_capacity(64);
195 for b in bytes {
196 out.push(hex_nibble(b >> 4));
197 out.push(hex_nibble(b & 0x0f));
198 }
199 out
200}
201
/// Map a 4-bit value (`0..=15`) to its lowercase hex digit.
///
/// # Panics
///
/// Panics if `n` is greater than 15; callers only pass masked or
/// shifted nibbles.
fn hex_nibble(n: u8) -> char {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    match DIGITS.get(usize::from(n)) {
        Some(&digit) => char::from(digit),
        None => unreachable!(),
    }
}
209
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch,
/// rather than failing.
fn now_secs_unix() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
216
/// Derive the baseline cache filename from a logical (stripped) pack
/// path: the path's final component, with all parent directories
/// dropped. This matches the cache-path convention in the pipeline doc.
///
/// A path with no final component (for example an empty path, or one
/// ending in `..`) falls back to the whole path rendered as a string.
/// Non-UTF-8 names are converted lossily.
///
/// Because only the basename is kept, `subdir/config.toml` and
/// `config.toml` in the same pack map to the same cache slot. That is
/// accepted as uncommon for the dotfile-sized payloads preprocessors
/// produce; the encoding can be extended here later without touching
/// callers.
pub fn cache_filename_for(virtual_relative: &Path) -> String {
    let name = match virtual_relative.file_name() {
        Some(basename) => basename.to_string_lossy(),
        None => virtual_relative.to_string_lossy(),
    };
    name.into_owned()
}
235
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testing::TempEnvironment;

    /// Handler directory every test in this module stores records under.
    const HANDLER: &str = "preprocessed";

    /// Write `baseline` into the environment's cache and return its path.
    fn store(env: &TempEnvironment, baseline: &Baseline, pack: &str, filename: &str) -> PathBuf {
        baseline
            .write(env.fs.as_ref(), env.paths.as_ref(), pack, HANDLER, filename)
            .unwrap()
    }

    /// Load the record for `pack`/`filename` from the environment's cache.
    fn fetch(env: &TempEnvironment, pack: &str, filename: &str) -> Result<Option<Baseline>> {
        Baseline::load(env.fs.as_ref(), env.paths.as_ref(), pack, HANDLER, filename)
    }

    /// Place arbitrary bytes at the cache path for `pack`/`filename`,
    /// bypassing `Baseline::write`.
    fn plant_raw(env: &TempEnvironment, pack: &str, filename: &str, body: &[u8]) {
        let target = env
            .paths
            .preprocessor_baseline_path(pack, HANDLER, filename);
        env.fs.mkdir_all(target.parent().unwrap()).unwrap();
        env.fs.write_file(&target, body).unwrap();
    }

    #[test]
    fn build_then_write_then_load_round_trips() {
        let env = TempEnvironment::builder().build();
        let context = [0x42_u8; 32];
        let original = Baseline::build(
            Path::new("/tmp/config.toml.tmpl"),
            b"name = Alice\n",
            b"name = {{ name }}\n",
            Some("name = \u{1e}Alice\u{1f}\n"),
            Some(&context),
        );

        let written_to = store(&env, &original, "app", "config.toml");
        assert!(env.fs.exists(&written_to));

        let reloaded = fetch(&env, "app", "config.toml")
            .unwrap()
            .expect("baseline must exist after write");
        assert_eq!(reloaded, original);
    }

    #[test]
    fn load_returns_none_for_missing_file() {
        let env = TempEnvironment::builder().build();
        let outcome = fetch(&env, "app", "nope.toml").unwrap();
        assert!(outcome.is_none());
    }

    #[test]
    fn load_rejects_unsupported_schema_version() {
        let env = TempEnvironment::builder().build();
        plant_raw(
            &env,
            "app",
            "config.toml",
            br#"{"version": 999, "rendered_hash": "x", "rendered_content": "x", "source_hash": "x", "timestamp": 0}"#,
        );

        let err = fetch(&env, "app", "config.toml").unwrap_err();
        assert!(
            format!("{err}").contains("unsupported schema version"),
            "got: {err}"
        );
    }

    #[test]
    fn load_rejects_corrupted_json() {
        let env = TempEnvironment::builder().build();
        plant_raw(&env, "app", "config.toml", b"{not json");

        let msg = fetch(&env, "app", "config.toml").unwrap_err().to_string();
        assert!(msg.contains("failed to parse"), "got: {msg}");
        // The message must carry the `--force` recovery hint so users
        // are told how to get out of the broken state.
        assert!(
            msg.contains("--force"),
            "expected recovery hint, got: {msg}"
        );
    }

    #[test]
    fn build_records_hashes_and_optional_fields() {
        let source = Path::new("/dummy/source");

        // Absent optionals become empty strings (the serde default), not null.
        let bare = Baseline::build(source, b"hello", b"hello", None, None);
        assert_eq!(bare.version, SCHEMA_VERSION);
        assert_eq!(bare.source_path, source);
        assert_eq!(bare.rendered_hash.len(), 64); // SHA-256 as hex
        assert_eq!(bare.source_hash, bare.rendered_hash); // identical input bytes
        assert!(bare.context_hash.is_empty());
        assert!(bare.tracked_render.is_empty());

        // Supplied optionals are encoded / copied through.
        let context = [0xff_u8; 32];
        let full = Baseline::build(source, b"x", b"y", Some("tracked"), Some(&context));
        assert_eq!(full.context_hash.len(), 64);
        assert!(full.context_hash.chars().all(|c| c == 'f'));
        assert_eq!(full.tracked_render, "tracked");
    }

    #[test]
    fn rendered_content_preserves_lossy_utf8() {
        // `rendered_content` is stored as UTF-8 text. Pin what happens
        // to invalid bytes so changing it later is a conscious choice.
        let invalid_utf8 = [0x66, 0x6f, 0xff, 0x6f];
        let baseline = Baseline::build(Path::new("/dummy"), &invalid_utf8, b"src", None, None);
        // The stray 0xff turns into the replacement character.
        assert_eq!(baseline.rendered_content, "fo\u{fffd}o");
    }

    #[test]
    fn write_creates_nested_directories() {
        // On first write neither the pack nor the handler directory
        // exists yet; `write` has to create the whole chain itself.
        let env = TempEnvironment::builder().build();
        let baseline = Baseline::build(Path::new("/dummy"), b"x", b"y", None, None);

        let written_to = store(&env, &baseline, "deep", "x");
        assert!(env.fs.exists(&written_to));
        assert!(env.fs.is_dir(written_to.parent().unwrap()));
    }

    #[test]
    fn write_overwrites_existing_baseline() {
        // Writing twice to the same logical slot keeps only the latest record.
        let env = TempEnvironment::builder().build();
        let renders: [&[u8]; 2] = [b"first", b"second"];
        for rendered in renders {
            let baseline = Baseline::build(Path::new("/dummy"), rendered, b"src", None, None);
            store(&env, &baseline, "app", "f");
        }

        let survivor = fetch(&env, "app", "f").unwrap().unwrap();
        assert_eq!(survivor.rendered_content, "second");
    }

    #[test]
    fn cache_filename_for_drops_parent_directories() {
        let cases = [
            ("config.toml", "config.toml"),
            ("subdir/config.toml", "config.toml"),
            ("a/b/c/leaf.txt", "leaf.txt"),
        ];
        for (input, expected) in cases {
            assert_eq!(cache_filename_for(Path::new(input)), expected);
        }
    }

    #[test]
    fn hex_encoding_is_lowercase_and_padded() {
        let zeros = hex_encode_32(&[0; 32]);
        assert_eq!(zeros.len(), 64);
        assert!(zeros.chars().all(|c| c == '0'));

        let abab = hex_encode_32(&[0xab; 32]);
        assert_eq!(abab.len(), 64);
        // Hex digits are lowercase by convention.
        assert!(abab.chars().all(|c| c == 'a' || c == 'b'));
    }
}