Skip to main content

cordance_core/
lock.rs

//! `cordance-source-lock.v1` — the sha256 of every input source and every
//! emitted output.
//!
//! Drives `cordance check`: any source-anchor sha drift → fail.
//! Combined with fenced regions, this is how Cordance avoids
//! laundering hand-edits as authoritative output.
6
7use std::collections::{HashMap, HashSet};
8
9use camino::Utf8PathBuf;
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12
13use crate::pack::CordancePack;
14
15#[derive(Clone, Debug, Serialize, Deserialize)]
16pub struct SourceLock {
17    pub schema: String,
18    pub pack_id: String,
19    pub doctrine_commit: Option<String>,
20    pub axiom_algorithm_pin: Option<String>,
21    pub sources: Vec<SourceLockEntry>,
22    pub outputs: Vec<SourceLockEntry>,
23}
24
25impl SourceLock {
26    #[must_use]
27    pub fn empty() -> Self {
28        Self {
29            schema: crate::schema::CORDANCE_SOURCE_LOCK_V1.into(),
30            pack_id: String::new(),
31            doctrine_commit: None,
32            axiom_algorithm_pin: None,
33            sources: vec![],
34            outputs: vec![],
35        }
36    }
37
38    /// Compute a `SourceLock` from a `CordancePack`.
39    ///
40    /// `pack_id` is the sha256 of a deterministic byte stream that mixes:
41    /// 1. project identity (`name`, a fixed `repo:.` placeholder, optional
42    ///    `axiom_pin`), so that two unrelated empty projects can never
43    ///    collide on the same id, and
44    /// 2. every source's `id:sha256`, sorted by id, so the digest is stable
45    ///    across runs that produce the same logical pack.
46    ///
47    /// `axiom_algorithm_pin` is propagated from `pack.project.axiom_pin` so
48    /// that a downstream `cordance check` notices a doctrine-shaped drift in
49    /// the axiom algorithm version (the cellos drift problem cordance exists
50    /// to solve).
51    ///
52    /// # Cross-host determinism — Round-8 bughunt #4 (R8-bughunt-4)
53    ///
54    /// Earlier rounds (round-5 R5-bughunt-5 / round-5 R5-bughunt-8 / round-6
55    /// LOW / round-7 R7-bughunt-6) hashed `pack.project.repo_root.as_str()`
56    /// directly. That string was the operator's host-absolute path
57    /// (`C:\\Users\\0ryant\\prj\\cordance` on Windows,
58    /// `/home/op/projects/cordance` on Linux), so two clones of the same
59    /// git commit on different hosts produced different `pack_id`s — and
60    /// the cortex receipt that hashes `pack_id` then disagreed about pack
61    /// identity across CI runners and developer machines. The cellos drift
62    /// problem the receipt was supposed to detect was masked by a
63    /// path-shape drift cordance itself was introducing.
64    ///
65    /// The mix now uses a fixed placeholder (`b"."`). Project distinction
66    /// still comes from `name` and `axiom_pin`. `pack_cmd::run` writes the
67    /// matching `"."` into `pack.project.repo_root` for the on-disk shape;
68    /// the framing constant here is the second half of the same contract
69    /// so any future code that builds a pack with a non-`"."` `repo_root`
70    /// (test fixtures, MCP shims) still hashes identically.
71    #[must_use]
72    pub fn compute_from_pack(pack: &CordancePack) -> Self {
73        // Build the set of paths that should NOT appear in the lock's
74        // `sources` list:
75        //   1. Every path Cordance emitted as a target artifact (lives in
76        //      `pack.outputs`). On the first `cordance pack` the scanner
77        //      doesn't see these because they don't exist yet, but a
78        //      subsequent `cordance check` rescan picks them up — round-4
79        //      bughunt #2's new-file detection then flags every one of them
80        //      as an "ADDED" source. The output sha is already captured in
81        //      `lock.outputs`, so the source-side entry is redundant noise.
82        //   2. Cordance-internal metadata under `.cordance/` that no emitter
83        //      claims (sources.lock itself, evidence-map.json, the optional
84        //      llm-candidate.json). These never appear in `pack.outputs` but
85        //      do appear in scans of a directory where `cordance pack` has
86        //      already run.
87        let mut excluded_paths: HashSet<String> = pack
88            .outputs
89            .iter()
90            .map(|o| o.path.as_str().to_string())
91            .collect();
92        excluded_paths.insert(".cordance/sources.lock".into());
93        excluded_paths.insert(".cordance/evidence-map.json".into());
94        excluded_paths.insert(".cordance/llm-candidate.json".into());
95        excluded_paths.insert(".cordance/cortex-receipt.json".into());
96
97        let mut sorted_sources: Vec<_> = pack
98            .sources
99            .iter()
100            .filter(|r| !excluded_paths.contains(r.path.as_str()))
101            .cloned()
102            .collect();
103        sorted_sources.sort_by(|a, b| a.id.cmp(&b.id));
104
105        // Domain-separated framing keeps "project:name" from colliding with a
106        // source whose id happens to start with that prefix.
107        //
108        // Round-8 bughunt #4 (R8-bughunt-4): the `repo:` mix is now a fixed
109        // `b"."` placeholder rather than `pack.project.repo_root.as_str()`.
110        // See the doc-comment above for the cross-host-determinism rationale;
111        // we keep the framing line so the byte stream layout (and therefore
112        // any historical re-derivation) is shape-compatible — only the
113        // value-bytes changed from a host-absolute path to a single `.`.
114        let mut hasher = Sha256::new();
115        hasher.update(b"project:");
116        hasher.update(pack.project.name.as_bytes());
117        hasher.update(b"\nrepo:");
118        hasher.update(b".");
119        if let Some(pin) = &pack.project.axiom_pin {
120            hasher.update(b"\naxiom:");
121            hasher.update(pin.as_bytes());
122        }
123        for r in &sorted_sources {
124            hasher.update(b"\n");
125            hasher.update(r.id.as_bytes());
126            hasher.update(b":");
127            hasher.update(r.sha256.as_bytes());
128        }
129        let pack_id = hex::encode(hasher.finalize());
130
131        let sources = sorted_sources
132            .iter()
133            .map(|r| SourceLockEntry {
134                id: r.id.clone(),
135                path: r.path.clone(),
136                sha256: r.sha256.clone(),
137                bytes: r.size_bytes,
138            })
139            .collect();
140
141        let outputs = pack
142            .outputs
143            .iter()
144            .map(|o| SourceLockEntry {
145                id: o.path.to_string(),
146                path: o.path.as_str().into(),
147                sha256: o.sha256.clone(),
148                bytes: o.bytes,
149            })
150            .collect();
151
152        let doctrine_commit = pack.doctrine_pins.first().map(|p| p.commit.clone());
153
154        Self {
155            schema: crate::schema::CORDANCE_SOURCE_LOCK_V1.into(),
156            pack_id,
157            doctrine_commit,
158            axiom_algorithm_pin: pack.project.axiom_pin.clone(),
159            sources,
160            outputs,
161        }
162    }
163
164    /// Diff `self` (the current/new state) against `previous`.
165    ///
166    /// `fenced_outputs` is the set of output paths (as `entry.path.as_str()`)
167    /// that currently contain cordance fence markers on disk. The caller is
168    /// responsible for computing this set; keeping the read out of `diff`
169    /// preserves the modularity-and-ports-adapters boundary — `SourceLock`
170    /// is a pure domain type and must not touch the filesystem.
171    ///
172    /// Output paths absent from `fenced_outputs` are treated as **fenced**
173    /// when they are missing on disk (so a deleted managed region still
174    /// counts as drift). This matches the previous behaviour exactly.
175    #[must_use]
176    pub fn diff(&self, previous: &Self, fenced_outputs: &HashSet<String>) -> DriftReport {
177        let current_sources: HashMap<&str, &SourceLockEntry> =
178            self.sources.iter().map(|e| (e.id.as_str(), e)).collect();
179        let previous_sources: HashMap<&str, &SourceLockEntry> = previous
180            .sources
181            .iter()
182            .map(|e| (e.id.as_str(), e))
183            .collect();
184
185        let mut source_drifts = Vec::new();
186
187        // Pass 1: every previous entry that is gone or whose sha drifted.
188        for entry in &previous.sources {
189            match current_sources.get(entry.id.as_str()) {
190                None => {
191                    source_drifts.push(SourceDriftEntry {
192                        id: entry.id.clone(),
193                        path: entry.path.to_string(),
194                        old_sha256: entry.sha256.clone(),
195                        new_sha256: "DELETED".into(),
196                    });
197                }
198                Some(current) if current.sha256 != entry.sha256 => {
199                    source_drifts.push(SourceDriftEntry {
200                        id: entry.id.clone(),
201                        path: entry.path.to_string(),
202                        old_sha256: entry.sha256.clone(),
203                        new_sha256: current.sha256.clone(),
204                    });
205                }
206                _ => {}
207            }
208        }
209
210        // Pass 2: every CURRENT entry not in the previous lock — a newly-added
211        // source. Round-4 bughunt #2: the previous diff was blind to new
212        // files, so `cordance check` would report "clean" after `touch
213        // newfile.md`. We mark new entries with `old_sha256 == "ADDED"` so the
214        // formatter can surface them as additions rather than zero-byte
215        // changes. The same flag also catches "a previously-blocked file is
216        // now classified" and "a renamed file produces a fresh id" — both
217        // were silently missed under the old single-pass loop.
218        for entry in &self.sources {
219            if !previous_sources.contains_key(entry.id.as_str()) {
220                source_drifts.push(SourceDriftEntry {
221                    id: entry.id.clone(),
222                    path: entry.path.to_string(),
223                    old_sha256: "ADDED".into(),
224                    new_sha256: entry.sha256.clone(),
225                });
226            }
227        }
228
229        let current_outputs: HashMap<&str, &SourceLockEntry> =
230            self.outputs.iter().map(|e| (e.id.as_str(), e)).collect();
231
232        let mut fenced_output_drifts = Vec::new();
233        let mut unfenced_output_drifts = Vec::new();
234
235        for entry in &previous.outputs {
236            // Output gone from lock entirely → treat as fenced drift (managed region gone).
237            let drifted = current_outputs
238                .get(entry.id.as_str())
239                .is_none_or(|current| current.sha256 != entry.sha256);
240
241            if !drifted {
242                continue;
243            }
244
245            // A missing file is still a managed-region drift (fenced) — the
246            // caller signals that by leaving the path out of
247            // `fenced_outputs`. Only outputs that the caller explicitly
248            // observed without fence markers are classified as unfenced.
249            // Said another way: "fenced or missing" == fenced drift,
250            // "observed and unfenced" == user-owned drift.
251            let path_key = entry.path.as_str();
252            let observed_unfenced =
253                current_outputs.contains_key(path_key) && !fenced_outputs.contains(path_key);
254
255            let drift_entry = OutputDriftEntry {
256                path: entry.path.to_string(),
257                old_sha256: entry.sha256.clone(),
258                new_sha256: current_outputs
259                    .get(entry.id.as_str())
260                    .map_or_else(|| "DELETED".into(), |e| e.sha256.clone()),
261            };
262
263            if observed_unfenced {
264                unfenced_output_drifts.push(drift_entry);
265            } else {
266                fenced_output_drifts.push(drift_entry);
267            }
268        }
269
270        DriftReport {
271            source_drifts,
272            fenced_output_drifts,
273            unfenced_output_drifts,
274        }
275    }
276}
277
278#[derive(Clone, Debug, Serialize, Deserialize)]
279pub struct SourceLockEntry {
280    pub id: String,
281    pub path: Utf8PathBuf,
282    pub sha256: String,
283    pub bytes: u64,
284}
285
286#[derive(Clone, Debug, Default, Serialize, Deserialize)]
287pub struct DriftReport {
288    pub source_drifts: Vec<SourceDriftEntry>,
289    pub fenced_output_drifts: Vec<OutputDriftEntry>,
290    pub unfenced_output_drifts: Vec<OutputDriftEntry>,
291}
292
293impl DriftReport {
294    #[must_use]
295    pub const fn is_clean(&self) -> bool {
296        self.source_drifts.is_empty() && self.fenced_output_drifts.is_empty()
297    }
298
299    /// Exit code: 0=clean, 1=source drift, 2=fenced output drift, 3=both.
300    #[must_use]
301    pub const fn exit_code(&self) -> i32 {
302        match (
303            !self.source_drifts.is_empty(),
304            !self.fenced_output_drifts.is_empty(),
305        ) {
306            (false, false) => 0,
307            (true, false) => 1,
308            (false, true) => 2,
309            (true, true) => 3,
310        }
311    }
312}
313
314#[derive(Clone, Debug, Serialize, Deserialize)]
315pub struct SourceDriftEntry {
316    pub id: String,
317    pub path: String,
318    pub old_sha256: String,
319    pub new_sha256: String,
320}
321
322#[derive(Clone, Debug, Serialize, Deserialize)]
323pub struct OutputDriftEntry {
324    pub path: String,
325    pub old_sha256: String,
326    pub new_sha256: String,
327}
328
#[cfg(test)]
mod tests {
    use super::*;
    use crate::advise::AdviseReport;
    use crate::pack::{CordancePack, PackTargets, ProjectIdentity};
    use crate::schema;

    /// A pack with no sources, outputs or pins; `repo_root` is derived from
    /// `name` so fixtures with different names also differ in `repo_root`.
    fn minimal_empty_pack(name: &str) -> CordancePack {
        CordancePack {
            schema: schema::CORDANCE_PACK_V1.into(),
            project: ProjectIdentity {
                name: name.into(),
                repo_root: format!("/tmp/{name}").into(),
                kind: "test".into(),
                host_os: "linux".into(),
                axiom_pin: None,
            },
            sources: vec![],
            doctrine_pins: vec![],
            targets: PackTargets::default(),
            outputs: vec![],
            source_lock: SourceLock::empty(),
            advise: AdviseReport::empty(),
            residual_risk: vec!["test".into()],
        }
    }

    /// Lock entry fixture; the byte count is irrelevant to every diff test.
    fn lock_entry(id: &str, path: &str, sha256: &str) -> SourceLockEntry {
        SourceLockEntry {
            id: id.into(),
            path: path.into(),
            sha256: sha256.into(),
            bytes: 1,
        }
    }

    /// Source drift fixture for the exit-code tests.
    fn source_drift() -> SourceDriftEntry {
        SourceDriftEntry {
            id: "x".into(),
            path: "x.md".into(),
            old_sha256: "aa".into(),
            new_sha256: "bb".into(),
        }
    }

    /// Output drift fixture for the exit-code tests.
    fn output_drift() -> OutputDriftEntry {
        OutputDriftEntry {
            path: "out.md".into(),
            old_sha256: "cc".into(),
            new_sha256: "dd".into(),
        }
    }

    /// A lock whose only content is one output entry (id == path).
    fn lock_with_output(path: &str, sha256: &str) -> SourceLock {
        let mut lock = SourceLock::empty();
        lock.outputs.push(lock_entry(path, path, sha256));
        lock
    }

    #[test]
    fn empty_lock_has_v1_schema() {
        let l = SourceLock::empty();
        assert_eq!(l.schema, crate::schema::CORDANCE_SOURCE_LOCK_V1);
    }

    /// Two unrelated empty projects must never share a `pack_id`; the cellos
    /// drift problem (HIGH bughunt #5) is precisely the false-equality this
    /// guards against.
    #[test]
    fn empty_packs_with_different_names_have_different_pack_ids() {
        let p1 = minimal_empty_pack("project_a");
        let p2 = minimal_empty_pack("project_b");
        let l1 = SourceLock::compute_from_pack(&p1);
        let l2 = SourceLock::compute_from_pack(&p2);
        assert_ne!(l1.pack_id, l2.pack_id);
    }

    /// `pack_id` is stable across re-computation on the same logical pack.
    #[test]
    fn same_empty_pack_yields_same_pack_id() {
        let p = minimal_empty_pack("project_a");
        let l1 = SourceLock::compute_from_pack(&p);
        let l2 = SourceLock::compute_from_pack(&p);
        assert_eq!(l1.pack_id, l2.pack_id);
    }

    /// Empty pack must never produce the trivial `sha256("")` digest — that
    /// is the exact symptom of CRITICAL #5 from the bughunt review.
    #[test]
    fn empty_pack_id_is_not_sha256_of_empty_string() {
        let p = minimal_empty_pack("any");
        let lock = SourceLock::compute_from_pack(&p);
        let empty_sha = hex::encode(Sha256::digest(b""));
        assert_ne!(lock.pack_id, empty_sha);
    }

    /// `axiom_pin` on the pack must round-trip through to
    /// `lock.axiom_algorithm_pin`.
    #[test]
    fn axiom_pin_propagates_to_lock() {
        let mut pack = minimal_empty_pack("test");
        pack.project.axiom_pin = Some("v3.1.1-axiom".into());
        let lock = SourceLock::compute_from_pack(&pack);
        assert_eq!(lock.axiom_algorithm_pin.as_deref(), Some("v3.1.1-axiom"));
    }

    /// Two empty packs that differ only in `axiom_pin` must produce different
    /// `pack_id`s — otherwise an axiom version bump silently looks identical
    /// to the previous lock.
    #[test]
    fn axiom_pin_affects_pack_id() {
        let mut p1 = minimal_empty_pack("same_name");
        let mut p2 = minimal_empty_pack("same_name");
        p1.project.axiom_pin = Some("v3.1.0-axiom".into());
        p2.project.axiom_pin = Some("v3.1.1-axiom".into());
        let l1 = SourceLock::compute_from_pack(&p1);
        let l2 = SourceLock::compute_from_pack(&p2);
        assert_ne!(l1.pack_id, l2.pack_id);
    }

    /// Round-8 bughunt #4 (R8-bughunt-4): the cross-host-determinism pin.
    /// Two operators on different machines cloning the same git commit
    /// produce historical `repo_root` values like `/tmp/foo` and
    /// `C:\Users\op\foo`. Under the old shape, those host-absolute paths
    /// were mixed into the `pack_id` sha and the two operators' receipts
    /// disagreed about pack identity for the same logical commit — masking
    /// the cellos drift problem the receipt was meant to surface.
    ///
    /// With the placeholder mix, `compute_from_pack` ignores `repo_root`
    /// entirely. Identical name + identical `axiom_pin` + identical sources
    /// MUST yield identical `pack_id`, even if a caller passed two
    /// different `repo_root` values (legacy test fixtures, MCP shims that
    /// haven't been migrated yet). This test pins the contract on the
    /// hash side; the construction-site fix in `pack_cmd::run` writes
    /// `"."` for all real packs.
    #[test]
    fn pack_id_ignores_repo_root_across_hosts() {
        let mut p1 = minimal_empty_pack("cordance");
        let mut p2 = minimal_empty_pack("cordance");
        p1.project.axiom_pin = Some("v3.1.1-axiom".into());
        p2.project.axiom_pin = Some("v3.1.1-axiom".into());
        // Simulate the same logical project cloned to different hosts.
        p1.project.repo_root = "/tmp/foo".into();
        p2.project.repo_root = r"C:\Users\op\foo".into();
        let l1 = SourceLock::compute_from_pack(&p1);
        let l2 = SourceLock::compute_from_pack(&p2);
        assert_eq!(
            l1.pack_id, l2.pack_id,
            "pack_id must be host-independent: identical name+axiom_pin+sources \
             must yield identical pack_id regardless of repo_root \
             (l1={}, l2={})",
            l1.pack_id, l2.pack_id
        );
    }

    /// Round-8 bughunt #4 (R8-bughunt-4) companion: the placeholder mix
    /// must not collapse the existing name-distinction guarantee. Two
    /// packs with different names must still produce different `pack_id`s
    /// even when they share an (identical) `repo_root` value. This is the
    /// negative bound on the previous test: removing `repo_root` from the
    /// hash must not strengthen collisions, just align cross-host.
    #[test]
    fn pack_id_still_differs_by_name_when_repo_root_matches() {
        // `minimal_empty_pack` sets repo_root from the name, so overwrite
        // both with an identical placeholder to isolate the
        // name-distinction signal.
        let mut p1 = minimal_empty_pack("project_a");
        let mut p2 = minimal_empty_pack("project_b");
        p1.project.repo_root = ".".into();
        p2.project.repo_root = ".".into();
        let l1 = SourceLock::compute_from_pack(&p1);
        let l2 = SourceLock::compute_from_pack(&p2);
        assert_ne!(
            l1.pack_id, l2.pack_id,
            "different project names must still yield different pack_ids \
             even when repo_root is identical"
        );
    }

    #[test]
    fn drift_report_clean_exit_code() {
        let report = DriftReport::default();
        assert!(report.is_clean());
        assert_eq!(report.exit_code(), 0);
    }

    #[test]
    fn drift_report_source_only_exit_code() {
        let report = DriftReport {
            source_drifts: vec![source_drift()],
            fenced_output_drifts: vec![],
            unfenced_output_drifts: vec![],
        };
        assert!(!report.is_clean());
        assert_eq!(report.exit_code(), 1);
    }

    /// Fenced output drift on its own maps to exit code 2.
    #[test]
    fn drift_report_fenced_only_exit_code() {
        let report = DriftReport {
            source_drifts: vec![],
            fenced_output_drifts: vec![output_drift()],
            unfenced_output_drifts: vec![],
        };
        assert!(!report.is_clean());
        assert_eq!(report.exit_code(), 2);
    }

    #[test]
    fn drift_report_both_exit_code() {
        let report = DriftReport {
            source_drifts: vec![source_drift()],
            fenced_output_drifts: vec![output_drift()],
            unfenced_output_drifts: vec![],
        };
        assert!(!report.is_clean());
        assert_eq!(report.exit_code(), 3);
    }

    /// Unfenced output drift is user-owned: it is reported but must not
    /// fail the check.
    #[test]
    fn drift_report_unfenced_only_is_clean() {
        let report = DriftReport {
            source_drifts: vec![],
            fenced_output_drifts: vec![],
            unfenced_output_drifts: vec![output_drift()],
        };
        assert!(report.is_clean());
        assert_eq!(report.exit_code(), 0);
    }

    /// Round-4 bughunt #2: `cordance check` was blind to NEW source files
    /// because `SourceLock::diff` iterated only `previous.sources`. A `touch
    /// newfile.md` between two packs must show up as an ADDED entry.
    #[test]
    fn diff_reports_newly_added_sources() {
        let prev = SourceLock::empty();
        let mut current = SourceLock::empty();
        current
            .sources
            .push(lock_entry("project_readme:README.md", "README.md", "newhash"));
        let report = current.diff(&prev, &HashSet::new());
        assert_eq!(report.source_drifts.len(), 1);
        assert_eq!(report.source_drifts[0].old_sha256, "ADDED");
        assert_eq!(report.source_drifts[0].new_sha256, "newhash");
        assert_eq!(report.source_drifts[0].path, "README.md");
        assert!(!report.is_clean());
    }

    /// Symmetric: a previously-present source that is gone in the current
    /// lock is reported as DELETED — same as before the bughunt fix.
    #[test]
    fn diff_reports_deleted_sources() {
        let mut prev = SourceLock::empty();
        prev.sources
            .push(lock_entry("project_readme:README.md", "README.md", "oldhash"));
        let current = SourceLock::empty();
        let report = current.diff(&prev, &HashSet::new());
        assert_eq!(report.source_drifts.len(), 1);
        assert_eq!(report.source_drifts[0].old_sha256, "oldhash");
        assert_eq!(report.source_drifts[0].new_sha256, "DELETED");
    }

    /// Mixed: one entry drifts in place, one is added, one is deleted.
    /// All three must appear in `source_drifts`.
    #[test]
    fn diff_handles_mixed_drift_added_deleted() {
        let mut prev = SourceLock::empty();
        prev.sources.push(lock_entry("a", "a.md", "aaa"));
        prev.sources.push(lock_entry("b", "b.md", "bbb"));
        let mut current = SourceLock::empty();
        current.sources.push(lock_entry("a", "a.md", "aaa_drifted"));
        // 'b' removed, 'c' added.
        current.sources.push(lock_entry("c", "c.md", "ccc"));
        let report = current.diff(&prev, &HashSet::new());
        assert_eq!(report.source_drifts.len(), 3);
        let by_id: HashMap<&str, &SourceDriftEntry> = report
            .source_drifts
            .iter()
            .map(|d| (d.id.as_str(), d))
            .collect();
        assert_eq!(by_id["a"].new_sha256, "aaa_drifted");
        assert_eq!(by_id["b"].new_sha256, "DELETED");
        assert_eq!(by_id["c"].old_sha256, "ADDED");
    }

    /// An output whose sha is unchanged produces no drift of either kind.
    #[test]
    fn diff_ignores_unchanged_outputs() {
        let prev = lock_with_output("AGENTS.md", "same");
        let current = lock_with_output("AGENTS.md", "same");
        let report = current.diff(&prev, &HashSet::new());
        assert!(report.fenced_output_drifts.is_empty());
        assert!(report.unfenced_output_drifts.is_empty());
        assert!(report.is_clean());
    }

    /// A changed output whose path the caller reported as fenced is
    /// managed-region drift and fails the check.
    #[test]
    fn diff_classifies_changed_fenced_output_as_fenced_drift() {
        let prev = lock_with_output("AGENTS.md", "old");
        let current = lock_with_output("AGENTS.md", "new");
        let fenced: HashSet<String> = HashSet::from(["AGENTS.md".to_string()]);
        let report = current.diff(&prev, &fenced);
        assert!(report.unfenced_output_drifts.is_empty());
        assert_eq!(report.fenced_output_drifts.len(), 1);
        assert_eq!(report.fenced_output_drifts[0].path, "AGENTS.md");
        assert_eq!(report.fenced_output_drifts[0].old_sha256, "old");
        assert_eq!(report.fenced_output_drifts[0].new_sha256, "new");
        assert_eq!(report.exit_code(), 2);
    }

    /// A changed output that is still in the lock but was observed without
    /// fence markers is user-owned drift: reported, yet the check is clean.
    #[test]
    fn diff_classifies_changed_unfenced_output_as_unfenced_drift() {
        let prev = lock_with_output("AGENTS.md", "old");
        let current = lock_with_output("AGENTS.md", "new");
        let report = current.diff(&prev, &HashSet::new());
        assert!(report.fenced_output_drifts.is_empty());
        assert_eq!(report.unfenced_output_drifts.len(), 1);
        assert_eq!(report.unfenced_output_drifts[0].new_sha256, "new");
        assert!(report.is_clean());
    }

    /// An output that vanished from the current lock is fenced drift with
    /// the `DELETED` sentinel, even though its path is not in the fenced set.
    #[test]
    fn diff_classifies_missing_output_as_fenced_deleted() {
        let prev = lock_with_output("AGENTS.md", "old");
        let current = SourceLock::empty();
        let report = current.diff(&prev, &HashSet::new());
        assert!(report.unfenced_output_drifts.is_empty());
        assert_eq!(report.fenced_output_drifts.len(), 1);
        assert_eq!(report.fenced_output_drifts[0].old_sha256, "old");
        assert_eq!(report.fenced_output_drifts[0].new_sha256, "DELETED");
        assert!(!report.is_clean());
    }

    /// Round-4 bughunt #1: `SourceLock` must not embed a wall-clock so
    /// `sources.lock` is byte-deterministic across runs.
    #[test]
    fn source_lock_does_not_serialise_generated_at() {
        let lock = SourceLock::empty();
        let s = serde_json::to_string(&lock).expect("ser");
        assert!(
            !s.contains("generated_at"),
            "sources.lock must not embed a wall-clock timestamp: {s}"
        );
    }
}