// repo/repository_redaction.rs

// SPDX-License-Identifier: Apache-2.0
//! Repository helpers for the redaction primitive.
//!
//! Storage layout (one file per redacted blob):
//!
//! ```text
//! <heddle_dir>/redactions/<blob-hash-hex>.bin
//! ```
//!
//! The file is an rmp-serde-encoded [`RedactionsBlob`] — every redaction
//! that targets the same blob lives in the same redactions file. The
//! blob's own content hash provides a unique key that is independent of
//! how the blob was produced, so cross-state redactions ("redact every
//! occurrence of this blob") fold into a single record naturally.
//!
//! ## Redaction IDs
//!
//! A redaction's *id* is the BLAKE3 hash of its rmp-encoded bytes. We
//! compute it deterministically when writing so callers can correlate the
//! returned id with the oplog `OpRecord::Redact` entry's `redaction_id`
//! field. The id is content-addressed: identical redactions produce
//! identical ids, which preserves the "redact is idempotent" property
//! from the build brief.
24
25use std::{collections::HashSet, fs, path::PathBuf};
26
27use anyhow::{Context, Result};
28use chrono::Utc;
29use objects::{
30    fs_atomic::write_file_atomic,
31    object::{ChangeId, ContentHash, Principal, Redaction, RedactionsBlob, Tree},
32};
33
34use crate::repository::Repository;
35
36/// Outcome of a `purge` call. Useful for surfaces that report what
37/// actually changed (the JSON output of `heddle purge`).
38#[derive(Debug, Clone)]
39pub struct PurgeOutcome {
40    /// The id (content hash) of the latest redaction on the purged blob.
41    /// `None` when no redaction existed yet — purge refuses in that case;
42    /// the field is here so a future force-without-redaction surface can
43    /// fill it in without changing the public type.
44    pub redaction_id: Option<ContentHash>,
45    /// Number of redaction records that transitioned from
46    /// "declared-only" to "purged" as part of this call. Idempotent
47    /// retries report 0.
48    pub redactions_marked: usize,
49    /// Whether the loose blob bytes were physically removed from local
50    /// storage. `false` if no loose copy existed (already gone, or only
51    /// present in a pack).
52    pub blob_bytes_removed: bool,
53    /// `true` iff the blob is still present in a pack file. Initial
54    /// implementation can't repack to drop the bytes; surfaces this so
55    /// the CLI can warn operators rather than silently leave bytes on
56    /// disk.
57    pub blob_remains_in_pack: bool,
58}
59
60impl Repository {
61    /// Append a redaction. Returns the redaction's content-addressed id.
62    ///
63    /// Idempotent: if a redaction with the same canonical bytes already
64    /// exists on the blob, no second entry is written and the existing
65    /// id is returned.
66    pub fn put_redaction(&self, redaction: Redaction) -> Result<ContentHash> {
67        let blob = redaction.redacted_blob;
68        let mut existing = self.get_redactions_for_blob(&blob)?;
69
70        // Compute the id by canonical-encoding a single-redaction blob
71        // wrapper. The content-addressed id is stable across runs.
72        let id = redaction_content_hash(&redaction)?;
73
74        // Idempotency: if any existing redaction encodes to the same id,
75        // skip the write.
76        for existing_redaction in &existing.redactions {
77            let existing_id = redaction_content_hash(existing_redaction)?;
78            if existing_id == id {
79                return Ok(id);
80            }
81        }
82
83        existing.push(redaction);
84        let bytes = existing
85            .encode()
86            .with_context(|| "encoding redactions blob")?;
87        let path = self.redaction_path_for_blob(&blob);
88        if let Some(parent) = path.parent() {
89            fs::create_dir_all(parent).with_context(|| format!("create '{}'", parent.display()))?;
90        }
91        write_file_atomic(&path, &bytes).with_context(|| format!("write '{}'", path.display()))?;
92        Ok(id)
93    }
94
95    /// Load all redactions targeting `blob`. Returns an empty
96    /// `RedactionsBlob` (not an error) when none exist — callers can
97    /// treat the result uniformly.
98    pub fn get_redactions_for_blob(&self, blob: &ContentHash) -> Result<RedactionsBlob> {
99        let path = self.redaction_path_for_blob(blob);
100        if !path.exists() {
101            return Ok(RedactionsBlob::empty());
102        }
103        let bytes = fs::read(&path).with_context(|| format!("read '{}'", path.display()))?;
104        RedactionsBlob::decode(&bytes).with_context(|| format!("decode '{}'", path.display()))
105    }
106
107    /// Walk every redactions file in the repo. Used by `heddle redact list`
108    /// and the GC's "never collect a redaction" guard. Returns
109    /// `(blob_hash, blob)` pairs so callers can correlate.
110    pub fn list_all_redactions(&self) -> Result<Vec<(ContentHash, RedactionsBlob)>> {
111        let dir = self.redactions_dir();
112        if !dir.exists() {
113            return Ok(Vec::new());
114        }
115        let mut out = Vec::new();
116        for entry in fs::read_dir(&dir).with_context(|| format!("read '{}'", dir.display()))? {
117            let entry = entry.with_context(|| format!("entry in '{}'", dir.display()))?;
118            let path = entry.path();
119            let Some(stem) = path.file_stem().and_then(|s| s.to_str()) else {
120                continue;
121            };
122            // Skip non-`.bin` files (e.g. editor backups). The blob hash
123            // is hex-encoded; bad filenames just get skipped — we don't
124            // crash on partial state.
125            if path.extension().and_then(|e| e.to_str()) != Some("bin") {
126                continue;
127            }
128            let Ok(blob) = parse_blob_hash_hex(stem) else {
129                continue;
130            };
131            let bytes = fs::read(&path).with_context(|| format!("read '{}'", path.display()))?;
132            let blob_obj = RedactionsBlob::decode(&bytes)
133                .with_context(|| format!("decode '{}'", path.display()))?;
134            out.push((blob, blob_obj));
135        }
136        Ok(out)
137    }
138
139    /// Look up a single redaction by its id. Returns `Some((blob_hash,
140    /// redaction))` if found; `None` if no redaction by that id exists.
141    ///
142    /// Today this walks every redactions file — operators rarely have
143    /// more than a handful of redactions in a repo, and the operation
144    /// is interactive (`heddle redact show`). If listings become
145    /// frequent enough to matter, a flat `<heddle_dir>/redactions/index.bin`
146    /// can be added without changing the public signature.
147    pub fn get_redaction(
148        &self,
149        redaction_id: &ContentHash,
150    ) -> Result<Option<(ContentHash, Redaction)>> {
151        for (blob, redactions_blob) in self.list_all_redactions()? {
152            for redaction in &redactions_blob.redactions {
153                let id = redaction_content_hash(redaction)?;
154                if id == *redaction_id {
155                    return Ok(Some((blob, redaction.clone())));
156                }
157            }
158        }
159        Ok(None)
160    }
161
162    /// Mark every redaction on `blob` as purged and physically remove the
163    /// blob bytes from the local loose object store. The `Redaction`
164    /// records stay in place; only the bytes are gone.
165    ///
166    /// Refuses if no redaction exists on the blob — operators must
167    /// `redact` before `purge`. This is the contract from the build
168    /// brief: "Refuses unless a Redaction already exists on the blob."
169    ///
170    /// `_purger` is recorded by the caller in the oplog `Purge` entry;
171    /// it's accepted here so the helper can be extended (e.g. to embed
172    /// the purger in a purge record) without changing the signature.
173    pub fn purge_blob(&self, blob: &ContentHash, _purger: &Principal) -> Result<PurgeOutcome> {
174        let mut redactions_blob = self.get_redactions_for_blob(blob)?;
175        if redactions_blob.redactions.is_empty() {
176            anyhow::bail!(
177                "no redaction exists for blob {} — declare one with `heddle redact` first",
178                blob.short()
179            );
180        }
181        let now = Utc::now();
182        let redactions_marked = redactions_blob.mark_all_purged(now);
183        let latest_id = match redactions_blob.latest() {
184            Some(latest) => Some(redaction_content_hash(latest)?),
185            None => None,
186        };
187        // Persist the purged-at marker before touching the blob bytes —
188        // if the blob delete fails (filesystem error), the audit trail
189        // still records that purge was attempted.
190        let bytes = redactions_blob
191            .encode()
192            .with_context(|| "re-encode redactions blob after purge mark")?;
193        let path = self.redaction_path_for_blob(blob);
194        write_file_atomic(&path, &bytes).with_context(|| format!("write '{}'", path.display()))?;
195
196        // Delete the loose blob bytes if present. Packed blobs are
197        // flagged but not removed in this initial implementation —
198        // dropping packed bytes requires a repack pass we punt on
199        // here.
200        let (blob_bytes_removed, blob_remains_in_pack) = remove_loose_blob_bytes(self, blob)?;
201
202        Ok(PurgeOutcome {
203            redaction_id: latest_id,
204            redactions_marked,
205            blob_bytes_removed,
206            blob_remains_in_pack,
207        })
208    }
209
210    /// `<heddle_dir>/redactions/` — root of the redaction store.
211    pub(crate) fn redactions_dir(&self) -> PathBuf {
212        self.heddle_dir().join("redactions")
213    }
214
215    /// `<heddle_dir>/redactions/<blob-hash-hex>.bin` — the redactions
216    /// file for a specific blob.
217    pub(crate) fn redaction_path_for_blob(&self, blob: &ContentHash) -> PathBuf {
218        self.redactions_dir()
219            .join(format!("{}.bin", hex_encode_content_hash(blob)))
220    }
221
222    /// If `blob` has any active redaction, return the stub text the
223    /// materialize path should write to disk in place of the blob
224    /// content. Returns `None` when no redaction exists — callers
225    /// should then proceed with normal materialization.
226    ///
227    /// Picks the latest redaction (by `redacted_at`) to source the
228    /// stub. Multiple redactions on the same blob converge to the
229    /// most-recent message; the older ones remain in the audit trail.
230    pub fn redaction_stub_for_blob(&self, blob: &ContentHash) -> Result<Option<String>> {
231        let redactions = self.get_redactions_for_blob(blob)?;
232        if !redactions.has_active() {
233            return Ok(None);
234        }
235        let latest = redactions
236            .latest()
237            .expect("non-empty redactions blob has a latest entry");
238        let id = redaction_content_hash(latest)?;
239        Ok(Some(latest.stub_text(&id)))
240    }
241
242    /// Enumerate every state reachable from any thread tip or marker by
243    /// walking parent pointers transitively. Used by `--all-states`
244    /// redaction propagation so a leaked secret can be scrubbed from
245    /// every state in which its blob hash appears.
246    ///
247    /// The walk is breadth-first and dedups by `ChangeId`, so a state
248    /// reached from multiple tips appears once. Missing states (broken
249    /// parent links) are skipped silently — redaction propagation is
250    /// best-effort across the reachable graph, not a graph-integrity
251    /// check.
252    pub fn reachable_states(&self) -> Result<Vec<ChangeId>> {
253        let refs = self.refs();
254        let mut roots: Vec<ChangeId> = Vec::new();
255        for name in refs
256            .list_threads()
257            .with_context(|| "list threads for reachable_states")?
258        {
259            if let Some(tip) = refs
260                .get_thread(&name)
261                .with_context(|| format!("read thread '{name}'"))?
262            {
263                roots.push(tip);
264            }
265        }
266        for name in refs
267            .list_markers()
268            .with_context(|| "list markers for reachable_states")?
269        {
270            if let Some(tip) = refs
271                .get_marker(&name)
272                .with_context(|| format!("read marker '{name}'"))?
273            {
274                roots.push(tip);
275            }
276        }
277
278        let mut visited: HashSet<ChangeId> = HashSet::new();
279        let mut queue: Vec<ChangeId> = Vec::new();
280        for root in roots {
281            if visited.insert(root) {
282                queue.push(root);
283            }
284        }
285        let mut out: Vec<ChangeId> = Vec::new();
286        while let Some(id) = queue.pop() {
287            // Load the state; if missing (broken parent ref or shallow
288            // clone), skip — propagation tolerates gaps.
289            let Some(state) = self
290                .store()
291                .get_state(&id)
292                .with_context(|| format!("load state {} for reachable walk", id.short()))?
293            else {
294                continue;
295            };
296            out.push(id);
297            for parent in &state.parents {
298                if visited.insert(*parent) {
299                    queue.push(*parent);
300                }
301            }
302        }
303        Ok(out)
304    }
305
306    /// Find every path under `state` whose terminal blob hashes to
307    /// `target`. Used by `--all-states` propagation: a leaked secret
308    /// may live at different paths across history (renames, copies),
309    /// so we propagate by blob hash, not by path.
310    ///
311    /// Returns paths as forward-slash strings, lexicographically
312    /// stable thanks to `Tree` entry ordering.
313    pub fn paths_to_blob_in_state(
314        &self,
315        state: &ChangeId,
316        target: &ContentHash,
317    ) -> Result<Vec<String>> {
318        let Some(tree) = self
319            .get_tree_for_state(state)
320            .with_context(|| format!("load tree for state {}", state.short()))?
321        else {
322            return Ok(Vec::new());
323        };
324        let mut out: Vec<String> = Vec::new();
325        walk_tree_for_blob(self, &tree, "", target, &mut out)?;
326        Ok(out)
327    }
328}
329
330/// Recursive helper for [`Repository::paths_to_blob_in_state`]. Walks
331/// the tree depth-first; on a matching blob, records the full
332/// repo-relative path and continues (a blob can appear under multiple
333/// paths).
334fn walk_tree_for_blob(
335    repo: &Repository,
336    tree: &Tree,
337    prefix: &str,
338    target: &ContentHash,
339    out: &mut Vec<String>,
340) -> Result<()> {
341    for entry in tree.iter() {
342        let path = if prefix.is_empty() {
343            entry.name.clone()
344        } else {
345            format!("{prefix}/{}", entry.name)
346        };
347        if entry.is_blob() {
348            if entry.hash == *target {
349                out.push(path);
350            }
351            continue;
352        }
353        if entry.is_tree() {
354            let Some(subtree) = repo
355                .store()
356                .get_tree(&entry.hash)
357                .with_context(|| format!("load subtree {}", entry.hash.short()))?
358            else {
359                // Missing subtree object — treat as unreachable, don't fail.
360                continue;
361            };
362            walk_tree_for_blob(repo, &subtree, &path, target, out)?;
363        }
364    }
365    Ok(())
366}
367
368/// Compute the content hash of a single redaction. The hash covers the
369/// rmp-encoded bytes of a one-element `RedactionsBlob` containing the
370/// redaction, so the id format is stable across schema additions that
371/// extend `RedactionsBlob` (e.g. a future header field).
372fn redaction_content_hash(redaction: &Redaction) -> Result<ContentHash> {
373    // Wrap in a single-entry blob so the canonical bytes are independent
374    // of the surrounding container's existing siblings — two different
375    // redactions stored in the same blob file still produce distinct ids.
376    let single = RedactionsBlob::new(vec![redaction.clone()]);
377    let bytes = single
378        .encode()
379        .with_context(|| "encode single-redaction for content addressing")?;
380    let digest = blake3::hash(&bytes);
381    Ok(ContentHash::from_bytes(*digest.as_bytes()))
382}
383
384fn hex_encode_content_hash(hash: &ContentHash) -> String {
385    let bytes = hash.as_bytes();
386    let mut out = String::with_capacity(bytes.len() * 2);
387    for b in bytes {
388        use std::fmt::Write as _;
389        let _ = write!(out, "{:02x}", b);
390    }
391    out
392}
393
394fn parse_blob_hash_hex(hex: &str) -> Result<ContentHash> {
395    if hex.len() != 64 {
396        anyhow::bail!("invalid blob-hash hex length: {}", hex.len());
397    }
398    let mut bytes = [0u8; 32];
399    for i in 0..32 {
400        let slice = &hex[i * 2..i * 2 + 2];
401        bytes[i] = u8::from_str_radix(slice, 16)
402            .with_context(|| format!("invalid hex byte at offset {}", i * 2))?;
403    }
404    Ok(ContentHash::from_bytes(bytes))
405}
406
407/// Remove the loose blob bytes for `hash` if a loose copy exists.
408/// Returns `(removed, remains_in_pack)`. Both `false` when the blob is
409/// not in the store at all (already gone).
410fn remove_loose_blob_bytes(repo: &Repository, hash: &ContentHash) -> Result<(bool, bool)> {
411    let store = repo.store();
412    if let Some(path) = store.loose_blob_path(hash)
413        && path.exists()
414    {
415        fs::remove_file(&path)
416            .with_context(|| format!("remove loose blob '{}'", path.display()))?;
417        // Even after loose removal, the blob may still be in a pack.
418        // We don't have a non-disruptive way to check packs here
419        // without holding the pack index — leave the field
420        // conservatively `false` and let a future refinement set it
421        // when the pack-aware purge lands.
422        return Ok((true, false));
423    }
424    Ok((false, false))
425}
426
#[cfg(test)]
mod tests {
    use chrono::TimeZone;
    use objects::object::{ChangeId, Principal};
    use tempfile::TempDir;

    use super::*;

    /// Create a repository in a fresh temp dir. The `TempDir` is
    /// returned so the directory outlives the test body.
    fn fresh_repo() -> (TempDir, Repository) {
        let dir = TempDir::new().unwrap();
        let repo = Repository::init_default(dir.path()).unwrap();
        (dir, repo)
    }

    /// Fixed principal used as redactor and purger.
    fn sample_principal() -> Principal {
        Principal {
            name: "Anan".into(),
            email: "anan@heddle.sh".into(),
        }
    }

    /// Fixed blob hash that every sample redaction targets.
    fn sample_blob() -> ContentHash {
        ContentHash::from_bytes([7u8; 32])
    }

    /// Fully deterministic, not-yet-purged redaction of `sample_blob()`.
    fn sample_redaction() -> Redaction {
        Redaction {
            redacted_blob: sample_blob(),
            state: ChangeId::from_bytes([1u8; 16]),
            path: "config/secrets.toml".into(),
            reason: "leaked credential".into(),
            redactor: sample_principal(),
            redacted_at: Utc.with_ymd_and_hms(2026, 5, 10, 14, 33, 0).unwrap(),
            signature: None,
            purged_at: None,
            supersedes: None,
        }
    }

    #[test]
    fn put_redaction_writes_blob_and_returns_stable_id() {
        let (_dir, repo) = fresh_repo();
        let redaction = sample_redaction();

        let first_id = repo.put_redaction(redaction.clone()).expect("put redaction");
        // Second put of identical input: same id, no extra entry.
        let second_id = repo.put_redaction(redaction).expect("re-put redaction");
        assert_eq!(
            first_id, second_id,
            "put_redaction must be idempotent on identical input"
        );

        let stored = repo
            .get_redactions_for_blob(&sample_blob())
            .expect("get redactions");
        assert_eq!(
            stored.redactions.len(),
            1,
            "idempotent put must not duplicate entries"
        );
    }

    #[test]
    fn list_all_redactions_returns_every_blob_with_a_record() {
        let (_dir, repo) = fresh_repo();
        repo.put_redaction(sample_redaction()).unwrap();

        let listing = repo.list_all_redactions().expect("list all redactions");
        assert_eq!(listing.len(), 1);
        let (blob, redactions_blob) = &listing[0];
        assert_eq!(*blob, sample_blob());
        assert_eq!(redactions_blob.redactions.len(), 1);
    }

    #[test]
    fn get_redaction_finds_by_id_or_returns_none() {
        let (_dir, repo) = fresh_repo();
        let id = repo.put_redaction(sample_redaction()).unwrap();

        let (found_blob, _found_redaction) = repo
            .get_redaction(&id)
            .expect("lookup by id")
            .expect("present");
        assert_eq!(found_blob, sample_blob());

        let unknown = ContentHash::from_bytes([0u8; 32]);
        let missing = repo.get_redaction(&unknown).expect("lookup miss");
        assert!(
            missing.is_none(),
            "lookup of unknown id must return None, not error"
        );
    }

    #[test]
    fn purge_blob_refuses_when_no_redaction_exists() {
        let (_dir, repo) = fresh_repo();

        let msg = repo
            .purge_blob(&sample_blob(), &sample_principal())
            .expect_err("purge without redaction must refuse")
            .to_string();
        assert!(
            msg.contains("no redaction"),
            "error must name the missing-redaction precondition, got: {msg}"
        );
    }

    #[test]
    fn purge_blob_marks_redactions_purged_after_redact() {
        let (_dir, repo) = fresh_repo();
        let blob = sample_blob();
        let purger = sample_principal();
        repo.put_redaction(sample_redaction()).unwrap();

        let outcome = repo.purge_blob(&blob, &purger).expect("purge after redact");
        assert_eq!(outcome.redactions_marked, 1);
        assert!(outcome.redaction_id.is_some());

        // Every stored record must now carry a purged_at marker.
        let stored = repo.get_redactions_for_blob(&blob).expect("get redactions");
        assert!(
            stored.redactions.iter().all(|r| r.is_purged()),
            "every redaction on a purged blob must be marked purged"
        );

        // A retry marks nothing new, so operators can re-run a partial
        // purge without inflating the audit trail.
        let again = repo.purge_blob(&blob, &purger).expect("re-purge");
        assert_eq!(again.redactions_marked, 0);
    }
}