Skip to main content

harn_vm/redact/
manifest.rs

1//! Manifest-producing redaction for whole transcript/record structures.
2//!
3//! [`super::RedactionPolicy`] scrubs leaf strings, headers, URLs, and
4//! JSON fields. This submodule adds the *export/share* layer on top:
5//! a single canonical walk that redacts an arbitrary JSON structure —
6//! a transcript, a run record, a session bundle — while recording an
7//! auditable [`RedactionEntry`] for every value it touched, plus the
8//! symmetric [`RedactionPolicy::find_unredacted_secret`] gate that a
9//! share/ingest boundary uses to refuse a payload that still carries a
10//! high-confidence secret.
11//!
12//! These were previously private helpers inside `session_bundle`. They
13//! live here so every downstream host that exports a transcript (portal
14//! Markdown/JSON download, TUI export, harn-cloud tape ingest) calls
15//! one engine instead of reimplementing the walk and drifting from the
16//! leaf-scrubbing policy.
17
18use std::borrow::Cow;
19
20use serde::{Deserialize, Serialize};
21use serde_json::Value as JsonValue;
22
23use super::{RedactionPolicy, REDACTED_PLACEHOLDER};
24
25/// One line in a redaction manifest: the JSON path that was touched,
26/// the reason it was redacted, the action taken, and the replacement
27/// that now sits at that path. Consumers use it to show "what did we
28/// scrub before sharing this?" and to attribute a leak to a provider
29/// via the `<redacted:<pattern>:<len>>` replacement string.
30#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
31#[serde(default)]
32pub struct RedactionEntry {
33    pub path: String,
34    pub class: String,
35    pub action: String,
36    pub replacement: Option<String>,
37}
38
/// A string that the policy would still redact, as reported by
/// [`RedactionPolicy::find_unredacted_secret`].
///
/// The excerpt is length-bounded, but it is cut from the *start of the
/// raw value* and can therefore contain the secret itself. Treat it as
/// sensitive: it is meant for an immediate error message, and callers
/// must not log it verbatim into a durable sink without their own
/// redaction.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct UnredactedSecret {
    /// JSON path to the offending string.
    pub path: String,
    /// Leading characters of the offending value.
    pub excerpt: String,
}
50
51impl RedactionPolicy {
52    /// Redact an arbitrary JSON structure in place and return an
53    /// auditable manifest of every value that changed.
54    ///
55    /// This is the canonical whole-transcript / whole-record entry
56    /// point: it recursively walks objects and arrays, replaces
57    /// sensitive-named fields wholesale, and scans every leaf string
58    /// for secret patterns and credentialed URLs via
59    /// [`RedactionPolicy::redact_string`]. Message bodies, tool inputs,
60    /// and tool results are all just nested strings, so a transcript or
61    /// a serialized [`crate::orchestration::RunRecord`] is covered by a
62    /// single call.
63    ///
64    /// Idempotent on output: the named `<redacted:<pattern>:<len>>` and
65    /// `[redacted]` placeholders do not re-match, so running twice
66    /// yields byte-identical JSON (the returned manifest still re-lists
67    /// sensitive-named fields, which are re-stamped to the same
68    /// placeholder).
69    ///
70    /// Paths are JSON-path-ish (`$.a.b[0].c`), rooted at `$`.
71    pub fn redact_json_manifest(&self, value: &mut JsonValue) -> Vec<RedactionEntry> {
72        let mut entries = Vec::new();
73        self.redact_json_manifest_at(value, "$", &mut entries);
74        entries
75    }
76
77    fn redact_json_manifest_at(
78        &self,
79        value: &mut JsonValue,
80        path: &str,
81        entries: &mut Vec<RedactionEntry>,
82    ) {
83        match value {
84            JsonValue::Object(map) => {
85                let keys = map.keys().cloned().collect::<Vec<_>>();
86                for key in keys {
87                    let child_path = json_path_child(path, &key);
88                    if self.field_is_sensitive(&key) {
89                        map.insert(key, JsonValue::String(REDACTED_PLACEHOLDER.to_string()));
90                        entries.push(RedactionEntry {
91                            path: child_path,
92                            class: "sensitive_field".to_string(),
93                            action: "replaced".to_string(),
94                            replacement: Some(REDACTED_PLACEHOLDER.to_string()),
95                        });
96                    } else if let Some(child) = map.get_mut(&key) {
97                        self.redact_json_manifest_at(child, &child_path, entries);
98                    }
99                }
100            }
101            JsonValue::Array(items) => {
102                for (index, item) in items.iter_mut().enumerate() {
103                    self.redact_json_manifest_at(item, &format!("{path}[{index}]"), entries);
104                }
105            }
106            JsonValue::String(text) => {
107                let redacted = self.redact_string(text);
108                if let Cow::Owned(replacement) = redacted {
109                    // Record the actual replacement string (a named
110                    // `<redacted:<pattern>:<len>>` placeholder from the
111                    // OA-06 catalog) so audit consumers can attribute
112                    // the leak to a specific provider.
113                    let manifest_replacement = replacement.clone();
114                    *text = replacement;
115                    entries.push(RedactionEntry {
116                        path: path.to_string(),
117                        class: "secret_pattern_or_url".to_string(),
118                        action: "replaced".to_string(),
119                        replacement: Some(manifest_replacement),
120                    });
121                }
122            }
123            _ => {}
124        }
125    }
126
127    /// Locate the first high-confidence secret that would still be
128    /// redacted by this policy anywhere in `value`, without mutating it.
129    ///
130    /// This is the share/ingest gate: a caller runs
131    /// [`RedactionPolicy::redact_json_manifest`] to scrub, then calls
132    /// this on the result and refuses to publish if it returns `Some`.
133    /// It reuses the exact leaf predicate the redactor uses, so "was
134    /// scrubbed" and "would be rejected" can never disagree.
135    pub fn find_unredacted_secret(&self, value: &JsonValue) -> Option<UnredactedSecret> {
136        self.find_unredacted_secret_at(value, "$")
137    }
138
139    fn find_unredacted_secret_at(&self, value: &JsonValue, path: &str) -> Option<UnredactedSecret> {
140        match value {
141            JsonValue::Object(map) => {
142                for (key, child) in map {
143                    if let Some(found) =
144                        self.find_unredacted_secret_at(child, &json_path_child(path, key))
145                    {
146                        return Some(found);
147                    }
148                }
149                None
150            }
151            JsonValue::Array(items) => {
152                for (index, item) in items.iter().enumerate() {
153                    if let Some(found) =
154                        self.find_unredacted_secret_at(item, &format!("{path}[{index}]"))
155                    {
156                        return Some(found);
157                    }
158                }
159                None
160            }
161            JsonValue::String(text) => {
162                if matches!(self.redact_string(text), Cow::Owned(_)) {
163                    Some(UnredactedSecret {
164                        path: path.to_string(),
165                        excerpt: secret_excerpt(text),
166                    })
167                } else {
168                    None
169                }
170            }
171            _ => None,
172        }
173    }
174}
175
/// Length-bounded prefix of `text` for use in error messages.
///
/// Returns at most the first 80 characters of `text` (counted as
/// Unicode scalar values, so a multi-byte character is never split),
/// followed by `...` when the value was longer. A value of 80
/// characters or fewer is returned unchanged.
///
/// The prefix is copied verbatim from the raw value, so it may contain
/// the very secret that triggered the report; bounded does not mean
/// safe to persist.
fn secret_excerpt(text: &str) -> String {
    const MAX_EXCERPT_CHARS: usize = 80;

    // The byte offset of the character just past the limit is exactly
    // where the excerpt ends. Asking for it stops after 81 characters,
    // so a huge value is never scanned end to end just to decide
    // whether an ellipsis is needed.
    match text.char_indices().nth(MAX_EXCERPT_CHARS) {
        Some((cut, _)) => format!("{}...", &text[..cut]),
        None => text.to_owned(),
    }
}
185
186/// Append `key` to a JSON path, using dotted form for identifier-safe
187/// keys and bracketed/quoted form otherwise.
188pub(crate) fn json_path_child(parent: &str, key: &str) -> String {
189    if key
190        .chars()
191        .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
192    {
193        format!("{parent}.{key}")
194    } else {
195        format!(
196            "{parent}[{}]",
197            serde_json::to_string(key).unwrap_or_default()
198        )
199    }
200}
201
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Every fake credential below is stitched together at runtime so
    // that no complete secret literal appears in this source file and
    // trips push-protection or secret scanners. The assembled values
    // are still meant to match the redactor's catalog regexes.
    fn aws_key() -> String {
        ["AKIA", "ABCDEFGHIJKLMNOP"].concat()
    }
    fn github_pat() -> String {
        let mut pat = String::from("ghp_");
        pat.push_str(&"a".repeat(36));
        pat
    }
    fn stripe_key() -> String {
        ["sk", "live", "abcdefghijklmnopqrstuvwxyz"].join("_")
    }
    fn private_key_block() -> String {
        String::from("-----BEGIN OPENSSH PRIVATE KEY-----\nb3BlbnNzaC1rZXktdjEAAAAA\n-----END OPENSSH PRIVATE KEY-----")
    }

    /// Builds a realistic agent transcript (system, user, assistant,
    /// tool). The assistant's `tool_use` input carries a credential and
    /// the `tool_result` body leaks several provider secrets; these are
    /// the highest-risk carriers the export path has to scrub.
    fn dirty_transcript() -> JsonValue {
        let leaked_output = format!(
            "auth: Bearer abcDEF123_-longenoughtoken\ngithub token {}\nstripe {}\n{}\nrequest_id 550e8400-e29b-41d4-a716-446655440000",
            github_pat(), stripe_key(), private_key_block()
        );
        let tool_use = json!({
            "type": "tool_use",
            "id": "toolu_01",
            "name": "run_command",
            "input": {
                "command": "aws deploy",
                "api_key": aws_key(),
                "env": { "AWS_ACCESS_KEY_ID": aws_key() }
            }
        });
        let tool_result = json!({
            "type": "tool_result",
            "tool_use_id": "toolu_01",
            "content": leaked_output
        });

        json!({
            "_type": "transcript",
            "messages": [
                { "role": "system", "content": "You are a coding agent. Commit is 903e58f1b0a4c2d3e4f5061728394a5b6c7d8e9f." },
                { "role": "user", "content": "deploy with AWS creds" },
                {
                    "role": "assistant",
                    "content": [
                        { "type": "text", "text": "Running the deploy." },
                        tool_use
                    ]
                },
                {
                    "role": "tool",
                    "content": [tool_result]
                }
            ],
            "summary": "deployed ok"
        })
    }

    /// Raw secret material that must not appear anywhere in redacted
    /// output (the last item is the body of the private-key block).
    fn secrets() -> Vec<String> {
        let key_body = String::from("b3BlbnNzaC1rZXktdjEAAAAA");
        vec![aws_key(), github_pat(), stripe_key(), key_body]
    }

    #[test]
    fn redact_json_manifest_scrubs_every_secret_and_records_paths() {
        crate::reset_thread_local_state();
        let policy = RedactionPolicy::default();
        let mut doc = dirty_transcript();
        let manifest = policy.redact_json_manifest(&mut doc);
        let rendered = serde_json::to_string(&doc).unwrap();

        for secret in secrets() {
            assert!(
                !rendered.contains(&secret),
                "secret leaked into redacted transcript: {secret}\n{rendered}"
            );
        }
        assert!(!manifest.is_empty(), "expected a non-empty manifest");

        // `api_key` is sensitive by name: replaced wholesale and
        // attributed as a field-name redaction.
        let redacted_by_field_name = manifest
            .iter()
            .any(|entry| entry.class == "sensitive_field" && entry.path.ends_with(".api_key"));
        assert!(redacted_by_field_name);

        // The free-form tool_result body is scrubbed by pattern and the
        // entry carries the named replacement.
        let redacted_by_pattern = manifest.iter().any(|entry| {
            entry.class == "secret_pattern_or_url"
                && matches!(
                    entry.replacement.as_deref(),
                    Some(value) if value.contains("<redacted:")
                )
        });
        assert!(redacted_by_pattern);
    }

    #[test]
    fn redact_json_manifest_preserves_non_secret_content() {
        crate::reset_thread_local_state();
        let mut doc = dirty_transcript();
        let policy = RedactionPolicy::default();
        policy.redact_json_manifest(&mut doc);
        let rendered = serde_json::to_string(&doc).unwrap();

        // Plain prose survives: the system text and the summary.
        assert!(rendered.contains("You are a coding agent"));
        assert!(rendered.contains("deployed ok"));
        // Look-alikes that must not be mistaken for secrets: a 40-char
        // git SHA, a UUID request id, and the literal
        // `Running the deploy.` line.
        assert!(rendered.contains("903e58f1b0a4c2d3e4f5061728394a5b6c7d8e9f"));
        assert!(rendered.contains("550e8400-e29b-41d4-a716-446655440000"));
        assert!(rendered.contains("Running the deploy."));
    }

    #[test]
    fn redact_json_manifest_is_idempotent_on_output() {
        crate::reset_thread_local_state();
        let policy = RedactionPolicy::default();

        let mut doc = dirty_transcript();
        policy.redact_json_manifest(&mut doc);
        let after_first = serde_json::to_string(&doc).unwrap();

        // Redact the already-redacted document a second time.
        policy.redact_json_manifest(&mut doc);
        let after_second = serde_json::to_string(&doc).unwrap();

        assert_eq!(
            after_first, after_second,
            "second redaction pass must not further mangle already-redacted output"
        );
    }

    #[test]
    fn find_unredacted_secret_flags_raw_then_clears_after_redaction() {
        crate::reset_thread_local_state();
        let policy = RedactionPolicy::default();
        let mut doc = dirty_transcript();

        // Before scrubbing the gate must trip and say where.
        let hit = policy
            .find_unredacted_secret(&doc)
            .expect("raw transcript still carries a secret");
        assert!(hit.path.starts_with("$."));
        assert!(!hit.excerpt.is_empty());

        // After scrubbing the gate must stay quiet.
        policy.redact_json_manifest(&mut doc);
        let leftover = policy.find_unredacted_secret(&doc);
        assert!(
            leftover.is_none(),
            "no secret should remain after redaction"
        );
    }

    #[test]
    fn find_unredacted_secret_ignores_benign_ids() {
        crate::reset_thread_local_state();
        let harmless = json!({
            "git_sha": "903e58f1b0a4c2d3e4f5061728394a5b6c7d8e9f",
            "uuid": "550e8400-e29b-41d4-a716-446655440000",
            "note": "kept 12 messages, added 3, then replied in text",
            "max_tokens": "max_tokens=200",
        });
        let policy = RedactionPolicy::default();
        assert!(policy.find_unredacted_secret(&harmless).is_none());
    }
}
375}