Skip to main content

llm_wiki/ops/
redact.rs

1//! Opt-in secret redaction for page bodies. Enabled per-ingest via `redact: true`.
2//! Built-in patterns cover common API keys, tokens, and emails. Custom patterns
3//! are added via `[redact.patterns]` in config. Redaction is lossy by design.
4
5use regex::Regex;
6use serde::{Deserialize, Serialize};
7
8use crate::config::RedactConfig;
9
10// ── Types ─────────────────────────────────────────────────────────────────────
11
12/// A single secret match found during a redaction pass.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct RedactionMatch {
15    /// Name of the pattern that triggered the match.
16    pub pattern_name: String,
17    /// 1-based line number where the match was found.
18    pub line_number: usize,
19}
20
21/// Report of all redaction substitutions applied to a single page body.
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct RedactionReport {
24    /// Slug of the page that was redacted.
25    pub slug: String,
26    /// All matches found (and replaced) in this page.
27    pub matches: Vec<RedactionMatch>,
28}
29
30struct RedactPattern {
31    name: &'static str,
32    regex: Regex,
33    replacement: &'static str,
34}
35
36// ── Built-in patterns ─────────────────────────────────────────────────────────
37
38fn builtin_patterns() -> Vec<RedactPattern> {
39    let specs: &[(&'static str, &'static str, &'static str)] = &[
40        (
41            "github-pat",
42            r"ghp_[A-Za-z0-9]{36}",
43            "[REDACTED:github-pat]",
44        ),
45        ("openai-key", r"sk-[A-Za-z0-9]{48}", "[REDACTED:openai-key]"),
46        (
47            "anthropic-key",
48            r"sk-ant-[A-Za-z0-9\-]{90,}",
49            "[REDACTED:anthropic-key]",
50        ),
51        (
52            "aws-access-key",
53            r"AKIA[0-9A-Z]{16}",
54            "[REDACTED:aws-access-key]",
55        ),
56        (
57            "bearer-token",
58            r"Bearer [A-Za-z0-9\-._~+/]{20,}",
59            "[REDACTED:bearer-token]",
60        ),
61        (
62            "email",
63            r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
64            "[REDACTED:email]",
65        ),
66    ];
67    specs
68        .iter()
69        .map(|(name, pat, rep)| RedactPattern {
70            name,
71            regex: Regex::new(pat).expect("builtin regex is valid"),
72            replacement: rep,
73        })
74        .collect()
75}
76
77// ── Pattern builder ───────────────────────────────────────────────────────────
78
79struct CompiledPattern {
80    name: String,
81    regex: Regex,
82    replacement: String,
83}
84
85fn build_patterns(config: &RedactConfig) -> Vec<CompiledPattern> {
86    let disabled: std::collections::HashSet<&str> =
87        config.disable.iter().map(String::as_str).collect();
88
89    let mut patterns: Vec<CompiledPattern> = builtin_patterns()
90        .into_iter()
91        .filter(|p| !disabled.contains(p.name))
92        .map(|p| CompiledPattern {
93            name: p.name.to_string(),
94            regex: p.regex,
95            replacement: p.replacement.to_string(),
96        })
97        .collect();
98
99    for custom in &config.patterns {
100        match Regex::new(&custom.pattern) {
101            Ok(re) => patterns.push(CompiledPattern {
102                name: custom.name.clone(),
103                regex: re,
104                replacement: custom.replacement.clone(),
105            }),
106            Err(e) => {
107                tracing::warn!(
108                    pattern = %custom.name,
109                    error = %e,
110                    "skipping invalid custom redaction pattern"
111                );
112            }
113        }
114    }
115
116    patterns
117}
118
119// ── Core redaction ────────────────────────────────────────────────────────────
120
121/// Redact secrets from `body` (never frontmatter). Returns the redacted body
122/// and a list of matches (pattern name + 1-based line number). Lossy by design.
123pub fn redact_body(body: &str, config: &RedactConfig) -> (String, Vec<RedactionMatch>) {
124    let patterns = build_patterns(config);
125    let mut matches: Vec<RedactionMatch> = Vec::new();
126    let mut result = String::with_capacity(body.len());
127
128    for (line_idx, line) in body.lines().enumerate() {
129        let line_number = line_idx + 1;
130        let mut current = line.to_string();
131        for pat in &patterns {
132            if pat.regex.is_match(&current) {
133                matches.push(RedactionMatch {
134                    pattern_name: pat.name.clone(),
135                    line_number,
136                });
137                current = pat
138                    .regex
139                    .replace_all(&current, pat.replacement.as_str())
140                    .into_owned();
141            }
142        }
143        result.push_str(&current);
144        result.push('\n');
145    }
146
147    // Preserve original trailing newline behaviour
148    if !body.ends_with('\n') && result.ends_with('\n') {
149        result.pop();
150    }
151
152    (result, matches)
153}