//! Regex-driven redaction of secrets and personal data.
//!
//! The crate root offers [`redact`] for JSON values, [`scrub_text`] for plain
//! text, and [`detect_hard_secret`] / [`scan_hard_findings`] for checking
//! whether text still contains a high-risk secret.

// Every public item must carry documentation.
#![deny(missing_docs)]
// No `unsafe` anywhere in the crate.
#![forbid(unsafe_code)]
// `expect`/`unwrap` are used in this file on hard-coded regex patterns and in tests.
#![allow(clippy::unwrap_used, clippy::expect_used)]
use once_cell::sync::Lazy;
use regex::Regex;
use serde_json::Value;
/// Iteration tag for this crate, currently `"I01"`.
// NOTE(review): what bumps this value is not visible in this file — confirm with the release process.
pub const ITERATION: &str = "I01";
// Submodules and the names re-exported from them at the crate root.
// The modules' contents are defined in other files and are not visible here.
pub mod merge;
pub use merge::{merge_findings, Finding, FindingSource};
pub mod label;
pub mod scan;
pub use label::PrivacyLabel;
pub use scan::{scan_text, RedactionResult, RiskSignals, ScanError};
pub mod engine;
// `OrtEngine` is only re-exported when the `ort` feature is enabled.
#[cfg(feature = "ort")]
pub use engine::OrtEngine;
pub use engine::{EngineError, MockEngine, NoopEngine, RedactionEngine};
pub mod model_descriptor;
pub mod ensemble;
pub use ensemble::EnsembleEngine;
pub use ensemble::EngineAttribution;
// The `bootstrap` module and its re-exports exist only with the `ort` feature.
#[cfg(feature = "ort")]
pub mod bootstrap;
#[cfg(feature = "ort")]
pub use bootstrap::{ensure_model_available, BootstrapError, ModelPaths};
pub use scan::scan_text_with_engine;
pub use scan::scan_text_with_engine_with_lang;
pub mod lang_hint;
pub use lang_hint::{
detect_lang_heuristic, scan_text_with_engine_with_hint, LangHintSource, LanguageHint,
LANG_HINT_TRUSTED_CONFIDENCE,
};
pub use scan::{scan_text_with_engine_budgeted, BudgetedScanOutcome, EngineStatus};
/// Redacts a JSON value and builds a text summary of the result.
///
/// Returns the redacted copy of `value` together with a summary string. The
/// summary lists each distinct finding name (sorted, prefixed with
/// `finding:`) followed by every string value of the redacted document,
/// all separated by single spaces and trimmed at both ends.
pub fn redact(value: &Value) -> (Value, String) {
    let mut rule_hits: Vec<String> = Vec::new();
    let scrubbed = redact_value(value, &mut rule_hits);

    // Each finding name appears once, in sorted order.
    rule_hits.sort();
    rule_hits.dedup();

    let mut summary: String = rule_hits
        .iter()
        .map(|name| format!("finding:{} ", name))
        .collect();
    // Only strings from the *redacted* document go into the summary.
    summary.push_str(&collect_strings(&scrubbed));

    let summary = summary.trim().to_string();
    (scrubbed, summary)
}
/// Redacts a plain-text string and returns the scrubbed text.
///
/// The names of the rules that fired are collected internally and discarded.
pub fn scrub_text(text: &str) -> String {
    let mut discarded_findings = Vec::<String>::new();
    let scrubbed = redact_string(text, &mut discarded_findings);
    scrubbed
}
/// Lists the names of all hard rules that match `text`.
///
/// Known redaction placeholders are removed from the text before matching.
/// Names are returned in `HARD_RULES` order, each at most once.
pub fn scan_hard_findings(text: &str) -> Vec<&'static str> {
    let haystack = KNOWN_REDACTED_MARKER.replace_all(text, "");
    HARD_RULES
        .iter()
        .filter(|rule| rule.pattern.is_match(&haystack))
        .fold(Vec::<&'static str>::new(), |mut names, rule| {
            if !names.contains(&rule.name) {
                names.push(rule.name);
            }
            names
        })
}
/// Returns the name of the first hard rule that matches `text`, if any.
///
/// Known redaction placeholders are removed from the text before matching,
/// and rules are tried in `HARD_RULES` order.
pub fn detect_hard_secret(text: &str) -> Option<&'static str> {
    let haystack = KNOWN_REDACTED_MARKER.replace_all(text, "");
    HARD_RULES
        .iter()
        .find(|rule| rule.pattern.is_match(&haystack))
        .map(|rule| rule.name)
}
/// Regex character class (as pattern source text) spliced into
/// `KNOWN_REDACTED_MARKER` for the `by_key=` part of a placeholder.
// It lists the same characters that `normalize_key_for_placeholder` passes
// through unchanged: ASCII letters, digits, `_` and `-`.
const BY_KEY_SAFE_CHAR_CLASS: &str = r"[A-Za-z0-9_\-]";
/// Maps an object key onto the character set allowed inside a placeholder.
///
/// ASCII letters, digits, `_` and `-` are kept; every other character
/// (including each non-ASCII character) becomes a single `_`.
fn normalize_key_for_placeholder(k: &str) -> String {
    let mut safe = String::with_capacity(k.len());
    for ch in k.chars() {
        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-');
        safe.push(if keep { ch } else { '_' });
    }
    safe
}
/// Recursively redacts a JSON value, appending the names of fired rules to `findings`.
///
/// * Strings go through `redact_string`.
/// * Arrays and objects are rebuilt element by element.
/// * A non-empty string stored under a key matching `KEY_HINT` is replaced by
///   a `[REDACTED len=.. by_key=..]` placeholder without inspecting its
///   content, and `env_like_key` is recorded.
/// * All other values are cloned unchanged.
fn redact_value(v: &Value, findings: &mut Vec<String>) -> Value {
    match v {
        Value::String(text) => Value::String(redact_string(text, findings)),
        Value::Array(items) => {
            let mut scrubbed = Vec::with_capacity(items.len());
            for item in items {
                scrubbed.push(redact_value(item, findings));
            }
            Value::Array(scrubbed)
        }
        Value::Object(fields) => {
            let mut scrubbed = serde_json::Map::new();
            for (key, field) in fields {
                let replacement = match field {
                    // The key name alone marks the value as sensitive.
                    Value::String(secret) if !secret.is_empty() && KEY_HINT.is_match(key) => {
                        findings.push("env_like_key".to_string());
                        // `len()` is the byte length of the original value.
                        Value::String(format!(
                            "[REDACTED len={} by_key={}]",
                            secret.len(),
                            normalize_key_for_placeholder(key)
                        ))
                    }
                    // Everything else (including empty strings and nested
                    // containers under a sensitive key) is redacted by content.
                    other => redact_value(other, findings),
                };
                scrubbed.insert(key.clone(), replacement);
            }
            Value::Object(scrubbed)
        }
        _ => v.clone(),
    }
}
fn redact_string(s: &str, findings: &mut Vec<String>) -> String {
if PEM_RE.is_match(s) {
findings.push("pem_private_key".to_string());
return "[REDACTED pem_private_key]".to_string();
}
let mut out = s.to_string();
for rule in ALL_RULES.iter() {
if rule.pattern.is_match(&out) {
findings.push(rule.name.to_string());
out = rule
.pattern
.replace_all(&out, |_: ®ex::Captures<'_>| {
format!("[REDACTED {}]", rule.name)
})
.into_owned();
}
}
out
}
/// Concatenates every string value found in `v`, depth first.
///
/// Each string is followed by a single space; the final result is trimmed.
/// Object keys, numbers, booleans and nulls contribute nothing.
fn collect_strings(v: &Value) -> String {
    /// Appends the strings below `node` to `acc`, in traversal order.
    fn append_strings(node: &Value, acc: &mut String) {
        match node {
            Value::String(text) => {
                acc.push_str(text);
                acc.push(' ');
            }
            Value::Array(items) => {
                for item in items {
                    append_strings(item, acc);
                }
            }
            Value::Object(fields) => {
                for child in fields.values() {
                    append_strings(child, acc);
                }
            }
            _ => {}
        }
    }

    let mut corpus = String::new();
    append_strings(v, &mut corpus);
    corpus.trim().to_string()
}
/// A named detection rule: a label plus the compiled pattern that recognizes it.
pub(crate) struct Rule {
/// Rule label; reported as a finding and embedded in `[REDACTED <name>]` placeholders.
pub(crate) name: &'static str,
/// Compiled regular expression that matches the sensitive text.
pub(crate) pattern: Regex,
}
/// Redaction rules applied, in this order, by `redact_string`.
///
/// Order matters because each rule runs on the output of the previous one:
///
/// * `anthropic_api_key` precedes `openai_api_key`, whose broader `sk-`
///   pattern would otherwise claim Anthropic keys.
/// * `database_url` precedes `email` and `internal_ipv4`. A connection string
///   such as `postgres://user:pw@10.0.0.5/db` contains text those rules match;
///   if they ran first they would rewrite only the host part, the URL would
///   no longer match `database_url`, and the password would stay in the
///   output.
/// * `generic_url` is last so the more specific URL rules win.
pub(crate) static ALL_RULES: Lazy<Vec<Rule>> = Lazy::new(|| {
    vec![
        Rule {
            name: "aws_access_key_id",
            pattern: Regex::new(r"\b(AKIA|ASIA)[0-9A-Z]{16}\b").expect("regex"),
        },
        Rule {
            name: "github_token",
            pattern: Regex::new(r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b").expect("regex"),
        },
        Rule {
            name: "anthropic_api_key",
            pattern: Regex::new(r"\bsk-ant-[A-Za-z0-9_\-]{20,}\b").expect("regex"),
        },
        Rule {
            name: "openai_api_key",
            pattern: Regex::new(r"\bsk-[A-Za-z0-9_\-]{20,}\b").expect("regex"),
        },
        // NOTE(review): the pattern requires at least one character before the
        // keyword, so a bare `TOKEN=...` or `PASSWORD=...` does not match —
        // confirm this is intended.
        Rule {
            name: "env_assignment",
            pattern: Regex::new(
                r#"(?i)\b[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|PWD|APIKEY|API_KEY|AUTH)\b\s*[=:]\s*["']?[^\s"',;}\]]+"#,
            )
            .expect("regex"),
        },
        Rule {
            name: "jwt",
            pattern: Regex::new(
                r"\bey[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\b",
            )
            .expect("regex"),
        },
        // Must run before `email` and `internal_ipv4` (see the doc comment above).
        Rule {
            name: "database_url",
            pattern: Regex::new(
                r"\b(postgresql|postgres|mysql|mongodb\+srv|mongodb|rediss|redis|amqps|amqp)://[^:/\s@]+:[^@/\s]+@[A-Za-z0-9.\-]+(:\d+)?(/[^\s]*)?",
            )
            .expect("regex"),
        },
        Rule {
            name: "email",
            pattern: Regex::new(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b")
                .expect("regex"),
        },
        Rule {
            name: "internal_ipv4",
            pattern: Regex::new(
                r"\b(10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(1[6-9]|2\d|3[0-1])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3}|127\.\d{1,3}\.\d{1,3}\.\d{1,3})\b",
            )
            .expect("regex"),
        },
        Rule {
            name: "slack_webhook",
            pattern: Regex::new(
                r"\bhttps://hooks\.slack\.com/services/T[A-Z0-9]{8,12}/B[A-Z0-9]{8,12}/[A-Za-z0-9]{20,}\b",
            )
            .expect("regex"),
        },
        Rule {
            name: "stripe_secret_key",
            pattern: Regex::new(r"\bsk_(live|test)_[A-Za-z0-9]{24,}\b").expect("regex"),
        },
        Rule {
            name: "google_api_key",
            pattern: Regex::new(r"\bAIza[A-Za-z0-9_\-]{35}\b").expect("regex"),
        },
        Rule {
            name: "gitlab_pat",
            pattern: Regex::new(r"\bglpat-[A-Za-z0-9_\-]{20,}\b").expect("regex"),
        },
        Rule {
            name: "generic_url",
            pattern: Regex::new(r#"\bhttps?://[^\s<>"']+"#).expect("regex"),
        },
    ]
});
/// Rules used by `detect_hard_secret` and `scan_hard_findings`.
///
/// Rules are tried in table order, so `anthropic_api_key` is listed before the
/// broader `openai_api_key` pattern.
pub(crate) static HARD_RULES: Lazy<Vec<Rule>> = Lazy::new(|| {
    // (rule name, regex source) pairs, compiled in order below.
    const TABLE: &[(&str, &str)] = &[
        ("aws_access_key_id", r"\b(AKIA|ASIA)[0-9A-Z]{16}\b"),
        ("github_token", r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b"),
        ("anthropic_api_key", r"\bsk-ant-[A-Za-z0-9_\-]{20,}\b"),
        ("openai_api_key", r"\bsk-[A-Za-z0-9_\-]{20,}\b"),
        ("pem_private_key", r"-----BEGIN [A-Z ]*PRIVATE KEY-----"),
        (
            "jwt",
            r"\bey[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\b",
        ),
        (
            "env_assignment",
            r#"(?i)\b[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|PWD|APIKEY|API_KEY|AUTH)\b\s*[=:]\s*["']?[^\s"',;}\]]+"#,
        ),
        (
            "slack_webhook",
            r"\bhttps://hooks\.slack\.com/services/T[A-Z0-9]{8,12}/B[A-Z0-9]{8,12}/[A-Za-z0-9]{20,}\b",
        ),
        ("stripe_secret_key", r"\bsk_(live|test)_[A-Za-z0-9]{24,}\b"),
        ("google_api_key", r"\bAIza[A-Za-z0-9_\-]{35}\b"),
        ("gitlab_pat", r"\bglpat-[A-Za-z0-9_\-]{20,}\b"),
        (
            "database_url",
            r"\b(postgresql|postgres|mysql|mongodb\+srv|mongodb|rediss|redis|amqps|amqp)://[^:/\s@]+:[^@/\s]+@[A-Za-z0-9.\-]+(:\d+)?(/[^\s]*)?",
        ),
    ];

    TABLE
        .iter()
        .map(|&(name, source)| Rule {
            name,
            pattern: Regex::new(source).expect("regex"),
        })
        .collect()
});
/// Matches the header line of a PEM private key (`-----BEGIN ... PRIVATE KEY-----`).
// `redact_string` replaces the entire string when this matches.
static PEM_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"-----BEGIN [A-Z ]*PRIVATE KEY-----").expect("regex"));
/// Matches the two placeholder shapes this file emits:
/// `[REDACTED len=<digits> by_key=<safe chars>]` and `[REDACTED <lowercase_name>]`.
// The hard-secret scanners strip these before matching so that already
// redacted output is not flagged again. Anything else inside the brackets
// (uppercase letters, digits, `=`, `-`, ...) is not treated as a marker.
static KNOWN_REDACTED_MARKER: Lazy<Regex> = Lazy::new(|| {
let pattern = format!(
r"\[REDACTED (?:len=\d+ by_key={c}+|[a-z_]+)\]",
c = BY_KEY_SAFE_CHAR_CLASS
);
Regex::new(&pattern).expect("regex")
});
/// Case-insensitive pattern for object keys whose string values are redacted by key name alone.
// NOTE(review): the pattern is unanchored, so any key merely containing one of
// the words matches (e.g. `author` contains `auth`) — confirm this is intended.
static KEY_HINT: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)(secret|token|password|api[_\-]?key|auth)").expect("regex"));
// Unit tests for the redaction and hard-secret scanning functions in this file.
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
// Pins the value of the `ITERATION` constant.
#[test]
fn crate_iteration_is_i01() {
assert_eq!(ITERATION, "I01");
}
// A GitHub token embedded in free text is replaced and reported in the summary.
#[test]
fn redacts_github_token_in_string() {
let v = json!({"note": "my token is ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ"});
let (out, summary) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
assert!(!s.contains("ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ"));
assert!(s.contains("[REDACTED github_token]"));
assert!(summary.contains("finding:github_token"));
}
// An AWS access key id must not survive redaction.
#[test]
fn redacts_aws_key() {
let v = json!({"aws": "AKIAIOSFODNN7EXAMPLE"});
let (out, _) = redact(&v);
assert!(!serde_json::to_string(&out)
.unwrap()
.contains("AKIAIOSFODNN7EXAMPLE"));
}
// A string containing a PEM private-key header is replaced as a whole.
#[test]
fn redacts_pem_block() {
let v = json!({
"ssh": "-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKC...\n-----END RSA PRIVATE KEY-----"
});
let (out, summary) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
assert!(!s.contains("BEGIN RSA PRIVATE KEY"));
assert!(s.contains("[REDACTED pem_private_key]"));
assert!(summary.contains("pem_private_key"));
}
// A value is redacted purely because its key name looks sensitive; other keys are untouched.
#[test]
fn redacts_sensitive_key_by_name() {
let v = json!({"database_password": "hunter2", "ok": "hello"});
let (out, _) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
assert!(!s.contains("hunter2"));
assert!(s.contains("[REDACTED"));
assert!(s.contains("hello")); }
// Email addresses and private-range IPv4 addresses are both removed.
#[test]
fn redacts_email_and_internal_ip() {
let v = json!({"msg": "contact alice@example.com on 192.168.1.5"});
let (out, _) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
assert!(!s.contains("alice@example.com"));
assert!(!s.contains("192.168.1.5"));
}
// A document with nothing sensitive comes back unchanged and with no findings.
#[test]
fn leaves_non_sensitive_untouched() {
let v = json!({"n": 42, "flag": true, "list": [1,2,3], "msg": "hello world"});
let (out, summary) = redact(&v);
assert_eq!(out, v);
assert!(!summary.contains("finding:"));
}
// The hard scanner reports a GitHub token found in raw text.
#[test]
fn detect_hard_secret_catches_github_token() {
let text = r#"{"x": "ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ"}"#;
assert_eq!(detect_hard_secret(text), Some("github_token"));
}
// The hard scanner reports a PEM private-key header.
#[test]
fn detect_hard_secret_catches_pem() {
assert_eq!(
detect_hard_secret("...-----BEGIN RSA PRIVATE KEY-----..."),
Some("pem_private_key")
);
}
// Clean text yields no hard finding.
#[test]
fn detect_hard_secret_allows_clean_text() {
assert_eq!(detect_hard_secret(r#"{"msg":"hello world"}"#), None);
}
// The summary returned by `redact` must never contain the raw secret.
#[test]
fn fts_summary_never_contains_raw_secret() {
const MAGIC: &str = "ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
let v = json!({"note": format!("token = {}", MAGIC)});
let (_out, summary) = redact(&v);
// Failure message (Chinese): "summary leaked the secret".
assert!(
!summary.contains(MAGIC),
"summary 泄漏了 secret: {}",
summary
);
assert!(summary.contains("finding:github_token"));
}
// An Anthropic key must be reported as such, not as an OpenAI key.
#[test]
fn anthropic_key_not_misclassified_as_openai() {
let v = json!({"note": "value=sk-ant-api03_ABCDEFGHIJKLMNOPQRSTUVWX"});
let (out, summary) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
assert!(!s.contains("sk-ant-api03"));
// Failure message (Chinese): "summary should contain anthropic, actual: ...".
assert!(
summary.contains("anthropic_api_key"),
"summary 应含 anthropic,实际:{}",
summary
);
}
// Rule order in `HARD_RULES`: the Anthropic rule wins over the broader OpenAI rule.
#[test]
fn detect_hard_secret_catches_anthropic_before_openai() {
let text = r#"{"x": "sk-ant-api03_ABCDEFGHIJKLMNOPQRSTUVWX"}"#;
assert_eq!(detect_hard_secret(text), Some("anthropic_api_key"));
}
// `NAME=value` / `NAME: value` lines with sensitive names are redacted; `OK=yes` is kept.
#[test]
fn env_style_assignment_is_redacted() {
let v = json!({
"log": "OPENAI_API_KEY=some-unregulated-value-xyz123abc\nDATABASE_PASSWORD: hunter2\nOK=yes"
});
let (out, _) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
// Failure messages (Chinese): "... not redacted" / "OK=yes should not be redacted by mistake".
assert!(
!s.contains("some-unregulated-value-xyz123abc"),
"OPENAI_API_KEY=... 未脱敏:{}",
s
);
assert!(!s.contains("hunter2"), "DATABASE_PASSWORD 未脱敏:{}", s);
assert!(s.contains("OK=yes"), "OK=yes 不应被误脱敏:{}", s);
}
// The hard scanner reports an env-style assignment.
#[test]
fn detect_hard_secret_catches_env_assignment() {
assert_eq!(
detect_hard_secret("DATABASE_PASSWORD=hunter2"),
Some("env_assignment")
);
}
// Placeholders produced for keys with unusual characters must still be
// recognized (and stripped) by the marker pattern, so re-scanning the output
// finds nothing.
// NOTE(review): only the first two keys contain a `KEY_HINT` word; the other
// cases appear to produce no placeholder and skip the assertion — confirm.
#[test]
fn f1_special_chars_in_key_normalize_to_marker_safe_class() {
let cases = vec![
json!({"app.config.secret": "sensitive-value-12345"}),
json!({"path/to/token": "secret-data-abc123"}),
json!({"中文密钥": "chinese-secret-content"}),
json!({"key with space": "spaced-secret-value"}),
json!({"k@weird#chars!": "another-secret-string"}),
];
for v in cases {
let (out, _) = redact(&v);
let s = serde_json::to_string(&out).unwrap();
if s.contains("[REDACTED") {
// Failure message (Chinese): "placeholder shape drifted outside the marker set; output=...".
assert_eq!(
detect_hard_secret(&s),
None,
"placeholder 形态漂出 marker 集合;输出={}",
s
);
}
}
}
// A secret wrapped in something that merely looks like a placeholder must
// still be detected; genuine placeholders must not be.
#[test]
fn detect_hard_secret_not_bypassed_by_fake_placeholder() {
let fake1 = "[REDACTED ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ]";
// Failure message (Chinese): "a token inside a forged placeholder must be caught".
assert!(
detect_hard_secret(fake1).is_some(),
"伪装 placeholder 里的 token 必须被拦下"
);
let fake2 = "[REDACTED DATABASE_PASSWORD=hunter2]";
// Failure message (Chinese): "an env assignment inside a forged placeholder must be caught".
assert!(
detect_hard_secret(fake2).is_some(),
"伪装 placeholder 里的 env 赋值必须被拦下"
);
let fake3 = "[REDACTED sk-ant-api03_abcdefghijklmnopqrstuvwx]";
// Failure message (Chinese): "an anthropic key inside a forged placeholder must be caught".
assert!(
detect_hard_secret(fake3).is_some(),
"伪装 placeholder 里的 anthropic key 必须被拦下"
);
// Genuine placeholders emitted by this file are ignored by the scanner.
assert!(detect_hard_secret("[REDACTED github_token]").is_none());
assert!(detect_hard_secret("[REDACTED pem_private_key]").is_none());
assert!(detect_hard_secret("[REDACTED env_assignment]").is_none());
assert!(detect_hard_secret("[REDACTED len=40 by_key=auth]").is_none());
}
}