mod apps;
use apps::{
builtin_app_names, ensure_user_app_copy, load_app_bundle, run_apps, user_apps_dir, BUILTIN_APPS,
};
mod config;
mod guided;
use guided::{
build_guided_entries, build_guided_profiles, parse_template_preset, prompt_cloud_providers,
prompt_domains, prompt_formats, prompt_line, prompt_yes_no, template_body_aws,
template_body_database, template_body_generic, template_body_k8s, template_body_web,
GuidedOptions, GuidedPreset, TemplatePreset, PROFILE_HEADER, TEMPLATE_HEADER,
};
use config::{find_project_config, load_project_config, load_settings, run_init, run_show_config};
mod hooks;
use hooks::{global_default_secrets_path, run_install_hook};
mod entropy;
use entropy::{
entropy_configs_from_entries, entropy_histogram_bytes, entropy_scan_bytes, scanner_fallback,
EntropyBuckets, EntropyConfig, NullSeekWriter, HISTOGRAM_THRESHOLDS,
};
mod progress;
use progress::{
with_progress_scope, ProgressContext, ProgressMode, ProgressPolicy, ProgressReporter,
SharedProgressReporter,
};
use clap::{Parser, Subcommand, ValueEnum};
use rayon::prelude::*;
use sanitize_engine::secrets::{
decrypt_secrets, encrypt_secrets, entries_to_patterns, extract_allow_patterns, parse_category,
parse_secrets, serialize_secrets, SecretEntry, SecretsFormat,
};
use sanitize_engine::{
atomic_write, extract_context, extract_context_reader, format_llm_prompt,
format_llm_prompt_reference, strip_values_from_text, ArchiveFilter, ArchiveFormat,
ArchiveProcessor, ArchiveProgress, AtomicFileWriter, FieldNameSignal, FileReport,
FileTypeProfile, HmacGenerator, LlmEntry, LlmPathEntry, LogContextConfig, MappingStore,
ProcessorRegistry, RandomGenerator, ReplacementGenerator, ReportBuilder, ReportMetadata,
ScanConfig, ScanPattern, ScanStats, StreamScanner, DEFAULT_ARCHIVE_DEPTH,
DEFAULT_CONTEXT_LINES, DEFAULT_FIELD_SIGNAL_THRESHOLD, DEFAULT_MAX_MATCHES,
};
use std::collections::{HashMap, HashSet};
use std::env;
use std::ffi::OsString;
use std::fs;
use std::io::{self, BufReader, BufWriter, Cursor, IsTerminal, Read, Write};
use std::path::{Path, PathBuf};
use std::process;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};
use std::time::SystemTime;
use tracing::{info, warn};
use zeroize::Zeroizing;
/// 256 MiB. Used as the default for the hidden `--max-structured-size` flag.
const DEFAULT_MAX_STRUCTURED_FILE_SIZE: u64 = 256 * 1024 * 1024;
/// 256 MiB byte cap.
// NOTE(review): presumably bounds how much input is buffered for context
// extraction; the use site is not visible in this part of the file — confirm.
const MAX_CONTEXT_BUFFER_BYTES: u64 = 256 * 1024 * 1024;
/// Shared, mutex-guarded list of [`LlmEntry`] values.
type LlmCollector = Arc<Mutex<Vec<LlmEntry>>>;
/// Process-wide flag read by `is_interrupted`; starts out `false`.
// NOTE(review): the code that sets this flag is not visible here.
static INTERRUPTED: AtomicBool = AtomicBool::new(false);
/// Default for the hidden `--progress-interval-ms` flag, in milliseconds.
const DEFAULT_PROGRESS_INTERVAL_MS: u64 = 200;
/// Recognised format names (lower-case), including aliases such as
/// `jsonl`/`ndjson` and `yaml`/`yml`.
// NOTE(review): presumably used to validate the `--format` value; the
// validation site is not visible in this part of the file — confirm.
const VALID_FORMATS: &[&str] = &[
"text",
"json",
"jsonl",
"ndjson",
"yaml",
"yml",
"xml",
"csv",
"tsv",
"key-value",
"toml",
"env",
"ini",
"log",
];
/// Returns the current value of the process-wide `INTERRUPTED` flag.
///
/// The flag is read with a relaxed atomic load, so the call observes the flag
/// itself but establishes no ordering with other memory operations.
fn is_interrupted() -> bool {
INTERRUPTED.load(Ordering::Relaxed)
}
/// Bundle of borrowed handles grouped so they can be passed around as one
/// `Copy` value instead of four separate parameters.
// NOTE(review): the name suggests these are the dependencies of the archive
// processing path; the consumers are not visible in this part of the file.
#[derive(Copy, Clone)]
struct ArchiveDeps<'a> {
/// Borrowed reference to the shared stream scanner.
scanner: &'a Arc<StreamScanner>,
/// Borrowed reference to the shared processor registry.
registry: &'a Arc<ProcessorRegistry>,
/// Borrowed reference to the shared mapping store.
store: &'a Arc<MappingStore>,
/// Borrowed slice of file-type profiles.
profiles: &'a [sanitize_engine::processor::FileTypeProfile],
}
// Value accepted by `--report-format`. `Json` is the `Default`.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments on value-enum variants into user-visible help text.
#[derive(Debug, Clone, PartialEq, Default, clap::ValueEnum)]
enum ReportFormat {
#[default]
Json,
Sarif,
Html,
}
// Top-level command line of the `sanitize` binary.
//
// With no subcommand, the flags below drive the default sanitize run over the
// positional INPUT paths. Several subcommands (e.g. `scan`) build a `Cli` value
// by hand and reuse the same code path.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments on the struct and its fields into user-visible help text, so adding
// doc comments here would change the program's output.
#[derive(Parser, Debug)]
#[command(
name = "sanitize",
version,
about = "One-way data sanitization tool",
long_about = "Deterministic one-way data sanitization tool.\n\n\
Scans files and archives for sensitive data described in a secrets file \
(plaintext by default) and replaces every match with a category-aware substitute.\n\
Replacements are ONE-WAY — no mapping file is stored and there is no \
restore mode.\n\n\
Use `sanitize encrypt` / `sanitize decrypt` to manage encrypted secrets files.",
after_help = "\
EXAMPLES:\n \
# Plaintext secrets file (default — no password needed):\n \
sanitize data.log -s secrets.yaml\n \
sanitize data.log -s secrets.yaml -o clean.log\n \
grep \"error\" log.txt | sanitize -s secrets.yaml\n\n \
# Encrypted secrets file (requires --encrypted-secrets):\n \
sanitize data.log -s s.enc --encrypted-secrets -p\n \
sanitize data.log -s s.enc --encrypted-secrets -P /run/secrets/pw\n \
SANITIZE_PASSWORD=hunter2 sanitize data.log -s s.enc --encrypted-secrets\n\n \
# Encrypt / decrypt secrets files:\n \
sanitize encrypt secrets.json secrets.json.enc --password\n \
sanitize decrypt secrets.json.enc secrets.json --password\n\n \
# Deterministic replacements with encrypted secrets:\n \
sanitize data.csv -s s.enc --encrypted-secrets -p -d\n\n \
# Extract error/warning context into the JSON report (--report required):\n \
sanitize app.log -s s.enc --encrypted-secrets -p --report report.json --extract-context\n \
cat app.log | sanitize -s s.enc --encrypted-secrets -p --report - --extract-context\n\n \
# Custom keywords and wider context window:\n \
sanitize app.log -s s.enc --encrypted-secrets -p --report - \\\n \
--extract-context --context-keywords timeout,oomkilled --context-lines 20\n\n \
# Strip values to generate a profile template (no secrets file needed):\n \
sanitize gitlab.rb --strip-values -o gitlab.rb.template\n \
cat config.rb | sanitize --strip-values\n\n \
# Inline mode: embed sanitized content in the prompt (pipe to clipboard, LLM, etc.):\n \
sanitize app.log -s secrets.yaml --llm | pbcopy\n \
sanitize config.yaml -s s.enc --encrypted-secrets -p --llm review-config\n \
sanitize nginx.conf --app nginx --llm review-security\n \
sanitize app.log -s s.yaml --llm --extract-context --context-lines 15\n \
sanitize app.log -s s.yaml --llm /path/to/custom-template.txt\n\n \
# Reference mode: write sanitized files to disk, prompt lists absolute paths:\n \
sanitize app.log -s s.yaml --llm --output /tmp/sanitized/app.log\n \
sanitize logs/ -s s.yaml --llm review-security --output /tmp/sanitized/"
)]
struct Cli {
// Optional subcommand; `None` selects the default sanitize run.
#[command(subcommand)]
command: Option<SubCommand>,
// Positional INPUT paths (zero or more; the help examples pipe via stdin when none are given).
#[arg(value_name = "INPUT")]
input: Vec<PathBuf>,
// `-o` / `--output FILE`.
#[arg(short = 'o', long, value_name = "FILE")]
output: Option<PathBuf>,
// `-s` / `--secrets-file FILE`.
#[arg(short = 's', long = "secrets-file", value_name = "FILE")]
secrets_file: Option<PathBuf>,
// `--profile FILE`.
#[arg(long = "profile", value_name = "FILE")]
profile: Option<PathBuf>,
// Password sources for an encrypted secrets file: `-p` / `--password` is a
// boolean switch, `-P` / `--password-file FILE` names a file.
#[arg(short = 'p', long)]
password: bool,
#[arg(short = 'P', long = "password-file", value_name = "FILE")]
password_file: Option<PathBuf>,
// `--encrypted-secrets`: the help text states this is required when the
// secrets file is encrypted.
#[arg(long)]
encrypted_secrets: bool,
// `-f` / `--format FMT`, kept as a free-form string.
#[arg(short = 'f', long, value_name = "FMT")]
format: Option<String>,
// `-n` / `--dry-run` and `--fail-on-match` (both forced on by `scan`).
#[arg(short = 'n', long)]
dry_run: bool,
#[arg(long)]
fail_on_match: bool,
// `-r` / `--report [PATH]`. The nested `Option` distinguishes "flag absent"
// (`None`), "flag without a value" (`Some(None)`) and "flag with a path".
#[arg(short = 'r', long, value_name = "PATH")]
report: Option<Option<PathBuf>>,
// `--report-format FORMAT`, default `json`.
#[arg(long, value_name = "FORMAT", default_value = "json")]
report_format: ReportFormat,
#[arg(long)]
strict: bool,
// `-d` / `--deterministic`.
#[arg(short = 'd', long)]
deterministic: bool,
#[arg(long)]
no_structured_handoff: bool,
#[arg(long)]
no_field_signal: bool,
#[arg(long)]
include_binary: bool,
#[arg(long)]
hidden: bool,
// `--entropy-threshold THRESHOLD`.
#[arg(long, value_name = "THRESHOLD")]
entropy_threshold: Option<f64>,
// Repeatable path globs; `num_args = 1` means each occurrence takes exactly
// one value.
#[arg(long, value_name = "GLOB", num_args = 1)]
exclude_path: Vec<String>,
#[arg(long, value_name = "GLOB", num_args = 1)]
include_path: Vec<String>,
#[arg(long)]
force_text: bool,
#[arg(long)]
use_default: bool,
// `--threads N`.
#[arg(long, value_name = "N")]
threads: Option<usize>,
// Hidden tuning knobs (not shown in `--help`) with built-in defaults.
#[arg(long, value_name = "BYTES", default_value_t = 1_048_576, hide = true)]
chunk_size: usize,
#[arg(long, value_name = "N", default_value_t = 10_000_000, hide = true)]
max_mappings: usize,
#[arg(long, value_name = "BYTES", default_value_t = DEFAULT_MAX_STRUCTURED_FILE_SIZE, hide = true)]
max_structured_size: u64,
#[arg(long, value_name = "N", default_value_t = DEFAULT_ARCHIVE_DEPTH, hide = true)]
max_archive_depth: u32,
// Logging options; when unset, `effective_log_format` / `effective_log_level`
// fall back to "human" and "warn".
#[arg(long, value_name = "FMT")]
log_format: Option<String>,
#[arg(long, value_name = "LEVEL")]
log_level: Option<String>,
// Progress reporting. `effective_progress_mode` resolves the precedence:
// `--quiet`, then an explicit `--progress`, then the hidden `--no-progress`.
#[arg(long, value_enum, value_name = "MODE")]
progress: Option<ProgressMode>,
#[arg(long, hide = true)]
no_progress: bool,
#[arg(long)]
quiet: bool,
#[arg(long, value_name = "MS", default_value_t = DEFAULT_PROGRESS_INTERVAL_MS, hide = true)]
progress_interval_ms: u64,
// `--findings [PATH]`; a bare `--findings` yields the path "-".
#[arg(long, value_name = "PATH", num_args = 0..=1, default_missing_value = "-")]
findings: Option<PathBuf>,
// Context-extraction options (see the `--extract-context` help examples).
#[arg(long)]
extract_context: bool,
#[arg(long, value_name = "N", default_value_t = 10)]
context_lines: usize,
// Comma-separated list, e.g. `--context-keywords timeout,oomkilled`.
#[arg(long, value_name = "KEYWORDS", value_delimiter = ',')]
context_keywords: Vec<String>,
#[arg(long)]
context_keywords_replace: bool,
#[arg(long, value_name = "N", default_value_t = 50)]
max_context_matches: usize,
#[arg(long, value_name = "N", default_value_t = 500)]
max_match_locations: usize,
#[arg(long)]
context_case_sensitive: bool,
// `--strip-values` mode; the two options below are only accepted together
// with it (`requires = "strip_values"`).
#[arg(long)]
strip_values: bool,
#[arg(
long,
value_name = "DELIM",
default_value = "=",
requires = "strip_values"
)]
strip_delimiter: String,
#[arg(
long,
value_name = "PREFIX",
default_value = "#",
requires = "strip_values"
)]
strip_comment_prefix: String,
// `--llm [TEMPLATE]`; a bare `--llm` selects the "troubleshoot" template.
#[arg(long, value_name = "TEMPLATE", default_missing_value = "troubleshoot", num_args = 0..=1)]
llm: Option<String>,
// `--app APPS`, comma-separated.
#[arg(long, value_delimiter = ',', value_name = "APPS")]
app: Vec<String>,
// `--allow PATTERN`, repeatable.
#[arg(long = "allow", value_name = "PATTERN")]
allow: Vec<String>,
}
/// Baseline `Cli` value for code paths that build a `Cli` by hand instead of
/// parsing `argv`. Fields are listed in the same order as the struct
/// declaration so the two can be compared side by side.
impl Default for Cli {
    fn default() -> Self {
        Self {
            command: None,
            input: Vec::new(),
            output: None,
            secrets_file: None,
            profile: None,
            password: false,
            password_file: None,
            encrypted_secrets: false,
            format: None,
            dry_run: false,
            fail_on_match: false,
            report: None,
            report_format: ReportFormat::Json,
            strict: false,
            deterministic: false,
            no_structured_handoff: false,
            no_field_signal: false,
            include_binary: false,
            hidden: false,
            entropy_threshold: None,
            exclude_path: Vec::new(),
            include_path: Vec::new(),
            force_text: false,
            use_default: false,
            threads: None,
            // Hidden tuning knobs: same literals as the clap defaults on `Cli`.
            chunk_size: 1_048_576,
            max_mappings: 10_000_000,
            max_structured_size: DEFAULT_MAX_STRUCTURED_FILE_SIZE,
            max_archive_depth: DEFAULT_ARCHIVE_DEPTH,
            log_format: None,
            log_level: None,
            progress: None,
            no_progress: false,
            quiet: false,
            progress_interval_ms: DEFAULT_PROGRESS_INTERVAL_MS,
            findings: None,
            extract_context: false,
            context_lines: DEFAULT_CONTEXT_LINES,
            context_keywords: Vec::new(),
            context_keywords_replace: false,
            max_context_matches: DEFAULT_MAX_MATCHES,
            max_match_locations: 500,
            context_case_sensitive: false,
            strip_values: false,
            strip_delimiter: String::from("="),
            strip_comment_prefix: String::from("#"),
            llm: None,
            app: Vec::new(),
            allow: Vec::new(),
        }
    }
}
impl Cli {
    /// Resolves the progress mode from the three flags that influence it.
    ///
    /// Precedence: `--quiet` always wins and turns progress off; otherwise an
    /// explicit `--progress MODE` is used; otherwise `--no-progress` turns it
    /// off; with none of them set the mode is `Auto`.
    fn effective_progress_mode(&self) -> ProgressMode {
        if self.quiet {
            return ProgressMode::Off;
        }
        match self.progress {
            Some(mode) => mode,
            None if self.no_progress => ProgressMode::Off,
            None => ProgressMode::Auto,
        }
    }

    /// Returns the `--log-format` value, or `"human"` when the flag is absent.
    fn effective_log_format(&self) -> &str {
        match self.log_format.as_deref() {
            Some(format) => format,
            None => "human",
        }
    }

    /// Returns the `--log-level` value, or `"warn"` when the flag is absent.
    fn effective_log_level(&self) -> &str {
        match self.log_level.as_deref() {
            Some(level) => level,
            None => "warn",
        }
    }
}
// Subcommands of the `sanitize` binary. Each variant carries its own argument
// struct (where it has arguments) and an `after_help` block with examples.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments on subcommand variants into user-visible help text.
#[derive(Subcommand, Debug)]
enum SubCommand {
// `sanitize encrypt INPUT OUTPUT`.
#[command(after_help = "\
EXAMPLES:\n \
sanitize encrypt secrets.json secrets.json.enc --password \"my-password\"\n \
SANITIZE_PASSWORD=hunter2 sanitize encrypt secrets.yaml secrets.yaml.enc\n \
sanitize encrypt secrets.toml secrets.toml.enc # interactive prompt")]
Encrypt(EncryptArgs),
// `sanitize decrypt INPUT OUTPUT`.
#[command(after_help = "\
EXAMPLES:\n \
sanitize decrypt secrets.json.enc secrets.json --password \"my-password\"\n \
sanitize decrypt secrets.enc out.yaml --password-file /run/secrets/pw")]
Decrypt(DecryptArgs),
// `sanitize apps [add|remove|edit|dir]`.
#[command(name = "apps")]
Apps(AppsArgs),
// `sanitize allow-test`: check values against `--allow` patterns.
#[command(
name = "allow-test",
after_help = "\
EXAMPLES:\n \
sanitize allow-test --allow '*.internal' db.internal github.com\n \
sanitize allow-test --allow localhost --allow '*.internal' --allow '192.168.1.*' db.internal 192.168.1.5 8.8.8.8\n \
sanitize allow-test --allow 'regex:^10\\.[0-9]+\\.[0-9]+\\.[0-9]+$' 10.0.0.1 192.168.1.1\n \
echo -e 'db.internal\\ngithub.com\\n192.168.1.5' | sanitize allow-test --allow '*.internal' --allow '192.168.1.*'\n \
sanitize allow-test --allow '*.internal' db.internal --json"
)]
AllowTest(AllowTestArgs),
// `sanitize guided`: interactive wizard, takes no arguments.
#[command(after_help = "\
EXAMPLES:\n \
sanitize guided")]
Guided,
// `sanitize template`: write a starter secrets template for a preset.
#[command(after_help = "\
PRESETS\n \
generic Common secrets: tokens, emails, IPs, hostnames (default)\n \
web Web-app logs: JWTs, sessions, emails, URLs\n \
k8s Kubernetes configs: service-accounts, tokens, namespaces\n \
database Database configs: passwords, connection strings, usernames\n \
aws AWS: access keys, ARNs, account IDs\n\n\
EXAMPLES:\n \
sanitize template # generic → secrets.template.yaml\n \
sanitize template --preset web # web-app template\n \
sanitize template --preset k8s -o k8s-secrets.yaml")]
Template(TemplateArgs),
// `sanitize install-hook`: install or remove a git hook.
#[command(
name = "install-hook",
after_help = "\
EXAMPLES:\n \
sanitize install-hook # scan with auto-loaded default secrets\n \
sanitize install-hook --app gitlab,kubernetes # scan with app bundles\n \
sanitize install-hook -s secrets.yaml # scan with custom secrets file\n \
sanitize install-hook --mode sanitize # sanitize staged files in place\n \
sanitize install-hook --hook pre-push # install a pre-push hook\n \
sanitize install-hook --global # apply to all repos on this machine\n \
sanitize install-hook --remove # remove the installed hook\n \
sanitize install-hook --dry-run # preview without writing"
)]
InstallHook(InstallHookArgs),
// `sanitize show-config`: takes no arguments.
#[command(name = "show-config")]
ShowConfig,
// `sanitize init-hook`: create a settings file plus a git hook.
#[command(
name = "init-hook",
after_help = "\
EXAMPLES:\n \
sanitize init-hook # create settings file + pre-commit hook\n \
sanitize init-hook --mode sanitize # hook sanitizes files in place\n \
sanitize init-hook --hook pre-push # hook runs on push instead\n \
sanitize init-hook --global # apply hook to all repos on this machine"
)]
InitHook(InitArgs),
// `sanitize scan`: handled by `run_scan`, which reuses the sanitize path with
// dry-run and fail-on-match forced on.
#[command(after_help = "\
EXAMPLES:\n \
sanitize scan app.log -s secrets.yaml # scan a log file\n \
sanitize scan ./logs/ -s secrets.yaml # scan a directory\n \
sanitize scan app.log --app gitlab # scan using an app bundle\n \
sanitize scan . --exclude-path tests/fixtures/ # skip test fixtures\n \
git diff HEAD | sanitize scan # scan a patch from stdin\n \
sanitize scan app.log -s s.enc --encrypted-secrets -p # encrypted secrets")]
Scan(ScanArgs),
// `sanitize test-pattern`: handled by `run_test_pattern`.
#[command(
name = "test-pattern",
after_help = "\
EXAMPLES:\n \
sanitize test-pattern --pattern 'ghp_[A-Za-z0-9_]{36}' 'ghp_abc123'\n \
sanitize test-pattern -s secrets.yaml 'my-secret-value' 'safe-value'\n \
sanitize test-pattern --app gitlab 'glpat-abc123'\n \
echo 'AKIA1234567890ABCDEF' | sanitize test-pattern --app aws\n \
sanitize test-pattern -s secrets.yaml --json 'value1' 'value2'"
)]
TestPattern(TestPatternArgs),
}
// Arguments of `sanitize encrypt INPUT OUTPUT`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
struct EncryptArgs {
// Positional INPUT and OUTPUT paths, both required.
#[arg(value_name = "INPUT")]
input: PathBuf,
#[arg(value_name = "OUTPUT")]
output: PathBuf,
// Password sources: `--password` switch and `--password-file FILE`.
#[arg(long)]
password: bool,
#[arg(long = "password-file", value_name = "FILE")]
password_file: Option<PathBuf>,
// `--secrets-format`, parsed by `parse_format` (json / yaml / yml / toml).
#[arg(long, value_parser = parse_format)]
secrets_format: Option<SecretsFormat>,
// `--validate` defaults to true and is declared as overriding the hidden
// `--no-validate` switch below.
// NOTE(review): how the two flags are combined is decided by the consumer of
// these args, which is not visible here — confirm `--no-validate` is honoured.
#[arg(long, overrides_with = "_no_validate", default_value_t = true)]
validate: bool,
#[arg(long = "no-validate", hide = true)]
_no_validate: bool,
}
// Arguments of `sanitize decrypt INPUT OUTPUT`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
struct DecryptArgs {
// Positional INPUT and OUTPUT paths, both required.
#[arg(value_name = "INPUT")]
input: PathBuf,
#[arg(value_name = "OUTPUT")]
output: PathBuf,
// Password sources: `--password` switch and `--password-file FILE`.
#[arg(long)]
password: bool,
#[arg(long = "password-file", value_name = "FILE")]
password_file: Option<PathBuf>,
// `--secrets-format`, parsed by `parse_format` (json / yaml / yml / toml).
#[arg(long, value_parser = parse_format)]
secrets_format: Option<SecretsFormat>,
}
/// clap value parser for `--secrets-format`.
///
/// Accepts exactly `json`, `yaml`, `yml` or `toml` (case-sensitive).
///
/// # Errors
/// Returns a human-readable message naming the rejected value for any other
/// input.
fn parse_format(s: &str) -> Result<SecretsFormat, String> {
    let format = match s {
        "json" => SecretsFormat::Json,
        "yaml" | "yml" => SecretsFormat::Yaml,
        "toml" => SecretsFormat::Toml,
        _ => return Err(format!("unknown format '{s}' (use json, yaml, or toml)")),
    };
    Ok(format)
}
// Arguments of `sanitize template`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
struct TemplateArgs {
// `-p` / `--preset PRESET`, default "generic"; validated later by
// `parse_template_preset` in `run_template`.
#[arg(long, short = 'p', default_value = "generic", value_name = "PRESET")]
preset: String,
// `-o` / `--output FILE`; `run_template` picks a default when absent.
#[arg(long, short = 'o', value_name = "FILE")]
output: Option<PathBuf>,
// `--overwrite`: without it `run_template` refuses to replace an existing file.
#[arg(long)]
overwrite: bool,
}
// Arguments of `sanitize allow-test`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
struct AllowTestArgs {
// `--allow PATTERN`, repeatable; at least one is required.
#[arg(long = "allow", value_name = "PATTERN", required = true)]
allow: Vec<String>,
// Positional values to test; `run_allow_test` reads stdin lines when empty.
#[arg(value_name = "VALUE")]
values: Vec<String>,
// `--json`: emit machine-readable output instead of the text listing.
#[arg(long)]
json: bool,
}
// Arguments of `sanitize scan`. `run_scan` copies these into a `Cli` value and
// fixes the remaining `Cli` fields itself.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
struct ScanArgs {
// Positional INPUT paths (zero or more).
#[arg(value_name = "INPUT")]
input: Vec<PathBuf>,
// `-s` / `--secrets-file FILE`.
#[arg(short = 's', long = "secrets-file", value_name = "FILE")]
secrets_file: Option<PathBuf>,
// Encrypted-secrets handling: `--encrypted-secrets`, `-p` / `--password`,
// `-P` / `--password-file FILE`. See `run_scan` for the resolution order.
#[arg(long)]
encrypted_secrets: bool,
#[arg(short = 'p', long)]
password: bool,
#[arg(short = 'P', long = "password-file", value_name = "FILE")]
password_file: Option<PathBuf>,
// `--app APPS`, comma-separated.
#[arg(long, value_name = "APPS", value_delimiter = ',')]
app: Vec<String>,
// `--allow PATTERN`, repeatable.
#[arg(long, value_name = "PATTERN")]
allow: Vec<String>,
// `--profile FILE`.
#[arg(long = "profile", value_name = "FILE")]
profile: Option<PathBuf>,
#[arg(long)]
hidden: bool,
// Repeatable path globs; each occurrence takes exactly one value.
#[arg(long, value_name = "GLOB", num_args = 1)]
exclude_path: Vec<String>,
#[arg(long, value_name = "GLOB", num_args = 1)]
include_path: Vec<String>,
// `-r` / `--report [PATH]`; same nested-`Option` shape as `Cli::report`.
#[arg(short = 'r', long, value_name = "PATH")]
report: Option<Option<PathBuf>>,
// `--report-format FORMAT`, default `json`.
#[arg(long, value_name = "FORMAT", default_value = "json")]
report_format: ReportFormat,
// `--threads N`.
#[arg(long, value_name = "N")]
threads: Option<usize>,
#[arg(long, value_name = "FMT")]
log_format: Option<String>,
#[arg(long, value_name = "LEVEL")]
log_level: Option<String>,
// Hidden `--no-progress`; `run_scan` maps it to `ProgressMode::Off`.
#[arg(long, hide = true)]
no_progress: bool,
// `--findings` is a plain switch here (unlike `Cli::findings`); `run_scan`
// translates it to the path "-" and also turns progress off.
#[arg(long)]
findings: bool,
// `--entropy-threshold THRESHOLD`.
#[arg(long, value_name = "THRESHOLD")]
entropy_threshold: Option<f64>,
#[arg(long)]
use_default: bool,
}
// Arguments of `sanitize test-pattern`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
struct TestPatternArgs {
// `-P` / `--pattern REGEX`, repeatable. Note that `-P` means
// `--password-file` on the main command and on `scan`.
#[arg(long = "pattern", short = 'P', value_name = "REGEX")]
patterns: Vec<String>,
// `-s` / `--secrets-file FILE`: additional patterns loaded from a file.
#[arg(short = 's', long = "secrets-file", value_name = "FILE")]
secrets_file: Option<PathBuf>,
// `--app APPS`, comma-separated: additional patterns from app bundles.
#[arg(long, value_name = "APPS", value_delimiter = ',')]
app: Vec<String>,
// Positional values to test; `run_test_pattern` reads stdin lines when empty.
#[arg(value_name = "VALUE")]
values: Vec<String>,
// `--json`: emit machine-readable output instead of the text listing.
#[arg(long)]
json: bool,
}
// Arguments of `sanitize apps`; the nested subcommand is optional.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
pub(crate) struct AppsArgs {
// `None` when `sanitize apps` is invoked without a nested subcommand.
#[command(subcommand)]
command: Option<AppsSubCommand>,
}
// Nested subcommands of `sanitize apps`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Subcommand, Debug)]
pub(crate) enum AppsSubCommand {
// `sanitize apps add NAME [--profile FILE] [--secrets-file FILE] [--overwrite]`.
#[command(after_help = "\
EXAMPLES:\n \
sanitize apps add elastic --profile elastic.profile.yaml --secrets-file elastic.secrets.yaml\n \
sanitize apps add myapp --profile myapp.profile.yaml\n \
sanitize apps add myapp --secrets-file myapp.secrets.yaml --overwrite")]
Add(AppsAddArgs),
// `sanitize apps remove NAME [-y|--yes]`.
#[command(after_help = "\
EXAMPLES:\n \
sanitize apps remove elastic --yes\n \
sanitize apps remove myapp -y")]
Remove(AppsRemoveArgs),
// `sanitize apps edit NAME`.
#[command(after_help = "\
EXAMPLES:\n \
sanitize apps edit rails\n \
sanitize apps edit kubernetes\n \
sanitize apps edit gitlab")]
Edit(AppsEditArgs),
// `sanitize apps dir`: takes no arguments.
Dir,
}
// Arguments of `sanitize apps add`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
pub(crate) struct AppsAddArgs {
// Positional NAME of the app, required.
#[arg(value_name = "NAME")]
name: String,
// `--profile FILE` and `--secrets-file FILE`; both are optional at the
// parsing level.
#[arg(long, value_name = "FILE")]
profile: Option<PathBuf>,
#[arg(long, value_name = "FILE")]
secrets_file: Option<PathBuf>,
// `--overwrite`.
#[arg(long)]
overwrite: bool,
}
// Arguments of `sanitize apps remove`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
pub(crate) struct AppsRemoveArgs {
// Positional NAME of the app, required.
#[arg(value_name = "NAME")]
name: String,
// `-y` / `--yes`.
#[arg(long, short = 'y')]
yes: bool,
}
// Arguments of `sanitize apps edit`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
pub(crate) struct AppsEditArgs {
// Positional NAME of the app, required.
#[arg(value_name = "NAME")]
name: String,
}
// Git hook selected with `--hook`; the CLI spellings are "pre-commit" and
// "pre-push", matching what `HookType::hook_name` returns.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq)]
pub(crate) enum HookType {
#[value(name = "pre-commit")]
PreCommit,
#[value(name = "pre-push")]
PrePush,
}
impl HookType {
pub(crate) fn hook_name(&self) -> &'static str {
match self {
HookType::PreCommit => "pre-commit",
HookType::PrePush => "pre-push",
}
}
}
// Value of `--mode` for the hook subcommands. No explicit `#[value(name)]` is
// given, so clap derives the CLI spellings from the variant names (the help
// examples use `--mode sanitize`; `scan` is the declared default).
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq)]
pub(crate) enum HookMode {
Scan,
Sanitize,
}
// Arguments of `sanitize install-hook`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
#[command(after_help = "\
NOTE\n \
The hook calls `sanitize` from PATH at commit time — the binary must be\n \
installed on every machine that will run the hook. If `sanitize` is not\n \
found the hook silently passes rather than blocking the commit.")]
pub(crate) struct InstallHookArgs {
// `--hook HOOK`, default "pre-commit".
#[arg(long, value_enum, default_value = "pre-commit", value_name = "HOOK")]
pub(crate) hook: HookType,
// `--mode MODE`, default "scan".
#[arg(long, value_enum, default_value = "scan", value_name = "MODE")]
pub(crate) mode: HookMode,
#[arg(long)]
pub(crate) global: bool,
// `-f` / `--force`.
#[arg(long, short = 'f')]
pub(crate) force: bool,
#[arg(long)]
pub(crate) remove: bool,
// `--app NAMES`: kept as one raw string here (no `value_delimiter`), unlike
// the `Vec<String>` `--app` on the other commands; the help example passes a
// comma-separated list.
#[arg(long, value_name = "NAMES")]
pub(crate) app: Option<String>,
// `-s` / `--secrets-file FILE`.
#[arg(short = 's', long, value_name = "FILE")]
pub(crate) secrets_file: Option<PathBuf>,
#[arg(long)]
pub(crate) dry_run: bool,
}
// Arguments of `sanitize init-hook`; a subset of `InstallHookArgs`.
// (`//` rather than `///`: clap would surface doc comments as help text.)
#[derive(Parser, Debug)]
pub(crate) struct InitArgs {
// `--hook HOOK`, default "pre-commit".
#[arg(long, value_enum, default_value = "pre-commit", value_name = "HOOK")]
pub(crate) hook: HookType,
// `--mode MODE`, default "scan".
#[arg(long, value_enum, default_value = "scan", value_name = "MODE")]
pub(crate) mode: HookMode,
#[arg(long)]
pub(crate) global: bool,
// `-f` / `--force`.
#[arg(long, short = 'f')]
pub(crate) force: bool,
#[arg(long)]
pub(crate) dry_run: bool,
}
fn run_scan(args: &ScanArgs) -> Result<(), (String, i32)> {
let pre_resolved_password: Option<Zeroizing<String>> =
if args.encrypted_secrets && !args.password {
if let Some(ref pf) = args.password_file {
Some(read_password_file(pf).map_err(|e| (e, 1))?)
} else if let Ok(pw) = std::env::var("SANITIZE_PASSWORD") {
std::env::remove_var("SANITIZE_PASSWORD");
eprintln!("info: using password from SANITIZE_PASSWORD environment variable");
Some(Zeroizing::new(pw))
} else {
None
}
} else if args.encrypted_secrets && args.password {
Some(prompt_password("secrets file").map_err(|e| (e, 1))?)
} else {
None
};
let cli = Cli {
command: None,
input: args.input.clone(),
output: None,
secrets_file: args.secrets_file.clone(),
profile: args.profile.clone(),
password: args.password,
password_file: args.password_file.clone(),
encrypted_secrets: args.encrypted_secrets,
format: None,
dry_run: true,
fail_on_match: true,
report: args.report.clone(),
report_format: args.report_format.clone(),
strict: false,
deterministic: false,
no_structured_handoff: true,
no_field_signal: false,
include_binary: false,
hidden: args.hidden,
exclude_path: args.exclude_path.clone(),
include_path: args.include_path.clone(),
force_text: false,
threads: args.threads,
chunk_size: 1_048_576,
max_mappings: 10_000_000,
max_structured_size: DEFAULT_MAX_STRUCTURED_FILE_SIZE,
max_archive_depth: DEFAULT_ARCHIVE_DEPTH,
log_format: args.log_format.clone(),
log_level: args.log_level.clone(),
progress: if args.findings || args.no_progress {
Some(ProgressMode::Off)
} else {
None
},
no_progress: false,
quiet: false,
progress_interval_ms: DEFAULT_PROGRESS_INTERVAL_MS,
extract_context: false,
context_lines: DEFAULT_CONTEXT_LINES,
context_keywords: Vec::new(),
context_keywords_replace: false,
max_context_matches: DEFAULT_MAX_MATCHES,
context_case_sensitive: false,
max_match_locations: 0,
strip_values: false,
strip_delimiter: "=".to_string(),
strip_comment_prefix: "#".to_string(),
llm: None,
app: args.app.clone(),
allow: args.allow.clone(),
findings: if args.findings {
Some(PathBuf::from("-"))
} else {
None
},
entropy_threshold: args.entropy_threshold,
use_default: args.use_default,
};
run_sanitize(cli, pre_resolved_password, HashMap::new())
}
/// Runs the `test-pattern` subcommand: compiles the requested patterns and
/// reports, for every test value, which patterns match it and where.
///
/// Patterns come from three sources, in this order: inline `--pattern`
/// regexes, the `--secrets-file` entries, and the `--app` bundles. Entries of
/// kind `allow` from files and bundles are ignored. Entries of kind `literal`
/// are regex-escaped; everything else is compiled as a regex. Patterns that
/// fail to compile are reported on stderr and skipped.
///
/// Values come from the positional arguments, or from non-empty stdin lines
/// when no positional value is given. For each pattern only the first match in
/// a value is reported; when capture group 1 participated in the match its span
/// is reported and the hit is flagged `partial`.
///
/// Output goes to stdout, as pretty-printed JSON with `--json` or as a text
/// listing otherwise.
///
/// # Errors
/// Returns `(message, 1)` when a secrets file or bundle cannot be loaded, when
/// no pattern or no value is available, when every pattern fails to compile,
/// or when at least one value matched no pattern.
fn run_test_pattern(args: &TestPatternArgs) -> Result<(), (String, i32)> {
    // Inline `--pattern` values become ad-hoc regex entries.
    let mut entries: Vec<SecretEntry> = args
        .patterns
        .iter()
        .map(|p| SecretEntry {
            pattern: p.clone(),
            kind: "regex".to_string(),
            category: "auth_token".to_string(),
            label: None,
            values: vec![],
            min_length: None,
            max_length: None,
            threshold: None,
            charset: None,
        })
        .collect();

    if let Some(ref path) = args.secrets_file {
        let bytes =
            fs::read(path).map_err(|e| (format!("failed to read {}: {e}", path.display()), 1))?;
        let format = SecretsFormat::from_extension(path.to_string_lossy().as_ref());
        let mut file_entries = parse_secrets(&bytes, format)
            .map_err(|e| (format!("failed to parse {}: {e}", path.display()), 1))?;
        // Allow-list entries are not detection patterns.
        file_entries.retain(|e| e.kind != "allow");
        entries.extend(file_entries);
    }

    for app_name in &args.app {
        let bundle = load_app_bundle(app_name).map_err(|e| (e, 1))?;
        let mut bundle_entries = bundle.secrets;
        bundle_entries.retain(|e| e.kind != "allow");
        entries.extend(bundle_entries);
    }

    if entries.is_empty() {
        return Err((
            "no patterns to test — provide --pattern, --secrets-file, or --app".into(),
            1,
        ));
    }

    struct CompiledPattern {
        label: String,
        category: String,
        regex: regex::Regex,
    }

    let mut compiled: Vec<CompiledPattern> = Vec::new();
    let mut compile_errors: Vec<String> = Vec::new();
    for entry in &entries {
        if entry.pattern.is_empty() {
            continue;
        }
        // Without an explicit label, show the first 40 characters of the pattern.
        let label = entry
            .label
            .clone()
            .unwrap_or_else(|| entry.pattern.chars().take(40).collect());
        // Literal entries must match verbatim, so escape regex metacharacters.
        let regex_str = if entry.kind == "literal" {
            regex::escape(&entry.pattern)
        } else {
            entry.pattern.clone()
        };
        match regex::Regex::new(&regex_str) {
            Ok(re) => compiled.push(CompiledPattern {
                label,
                category: entry.category.clone(),
                regex: re,
            }),
            Err(e) => compile_errors.push(format!(" pattern '{}': {e}", entry.pattern)),
        }
    }

    for e in &compile_errors {
        eprintln!("warning: pattern failed to compile — {e}");
    }
    if compiled.is_empty() {
        return Err(("all patterns failed to compile".into(), 1));
    }

    // Positional values win; otherwise every non-empty stdin line is a value.
    let values: Vec<String> = if args.values.is_empty() {
        let mut buf = String::new();
        io::stdin()
            .read_to_string(&mut buf)
            .map_err(|e| (format!("failed to read stdin: {e}"), 1))?;
        buf.lines()
            .filter(|l| !l.is_empty())
            .map(|l| l.to_string())
            .collect()
    } else {
        args.values.clone()
    };
    if values.is_empty() {
        return Err((
            "no values to test — provide values as arguments or via stdin".into(),
            1,
        ));
    }

    struct MatchHit {
        label: String,
        category: String,
        matched_text: String,
        start: usize,
        end: usize,
        partial: bool,
    }
    struct ValueResult {
        value: String,
        hits: Vec<MatchHit>,
    }

    let results: Vec<ValueResult> = values
        .iter()
        .map(|value| {
            let mut hits = Vec::new();
            for cp in &compiled {
                let Some(caps) = cp.regex.captures(value) else {
                    continue;
                };
                // Prefer capture group 1 when it participated in the match;
                // otherwise report the whole match.
                let (span, partial) = match caps.get(1) {
                    Some(group) => (group, true),
                    None => (
                        caps.get(0)
                            .expect("capture group 0 is always present for a match"),
                        false,
                    ),
                };
                hits.push(MatchHit {
                    label: cp.label.clone(),
                    category: cp.category.clone(),
                    matched_text: span.as_str().to_string(),
                    start: span.start(),
                    end: span.end(),
                    partial,
                });
            }
            ValueResult {
                value: value.clone(),
                hits,
            }
        })
        .collect();

    let total_matched = results.iter().filter(|r| !r.hits.is_empty()).count();

    if args.json {
        #[derive(serde::Serialize)]
        struct JsonHit<'a> {
            label: &'a str,
            category: &'a str,
            matched_text: &'a str,
            start: usize,
            end: usize,
            partial: bool,
        }
        #[derive(serde::Serialize)]
        struct JsonResult<'a> {
            value: &'a str,
            matched: bool,
            hits: Vec<JsonHit<'a>>,
        }
        #[derive(serde::Serialize)]
        struct JsonOutput<'a> {
            patterns_loaded: usize,
            results: Vec<JsonResult<'a>>,
            summary: JsonSummary,
        }
        #[derive(serde::Serialize)]
        struct JsonSummary {
            total: usize,
            matched: usize,
            unmatched: usize,
        }

        let out = JsonOutput {
            patterns_loaded: compiled.len(),
            results: results
                .iter()
                .map(|r| JsonResult {
                    value: &r.value,
                    matched: !r.hits.is_empty(),
                    hits: r
                        .hits
                        .iter()
                        .map(|h| JsonHit {
                            label: &h.label,
                            category: &h.category,
                            matched_text: &h.matched_text,
                            start: h.start,
                            end: h.end,
                            partial: h.partial,
                        })
                        .collect(),
                })
                .collect(),
            summary: JsonSummary {
                total: results.len(),
                matched: total_matched,
                unmatched: results.len() - total_matched,
            },
        };
        println!(
            "{}",
            serde_json::to_string_pretty(&out)
                .unwrap_or_else(|e| format!("{{\"error\": \"{e}\"}}"))
        );
    } else {
        println!(
            "Testing {} pattern(s) against {} value(s)\n",
            compiled.len(),
            values.len()
        );
        for r in &results {
            if r.hits.is_empty() {
                println!("✗ {}", r.value);
                println!(" (no match)\n");
            } else {
                println!("✓ {}", r.value);
                for h in &r.hits {
                    let span_note = if h.partial {
                        format!(
                            "bytes {}..{} (partial — prefix/suffix preserved)",
                            h.start, h.end
                        )
                    } else {
                        format!("bytes {}..{} (full match)", h.start, h.end)
                    };
                    println!(
                        " {:<30} [{}] {:?} {}",
                        h.label, h.category, h.matched_text, span_note
                    );
                }
                println!();
            }
        }
        println!("{}/{} values matched", total_matched, results.len());
    }

    // A non-zero exit signals that at least one value slipped past every pattern.
    if total_matched < results.len() {
        Err(("some values did not match any pattern".into(), 1))
    } else {
        Ok(())
    }
}
/// Runs the `allow-test` subcommand: checks each value against the `--allow`
/// patterns and reports which pattern, if any, allows it.
///
/// Warnings produced while building the matcher are printed to stderr. Values
/// come from the positional arguments, or from non-empty stdin lines when none
/// are given. Output goes to stdout, as pretty-printed JSON with `--json` or as
/// a text listing otherwise; a JSON serialization failure is reported on stderr
/// without failing the command.
///
/// # Errors
/// Returns `(message, 1)` when stdin cannot be read or no value is available.
fn run_allow_test(args: &AllowTestArgs) -> Result<(), (String, i32)> {
    use sanitize_engine::allowlist::AllowlistMatcher;

    #[derive(serde::Serialize)]
    struct MatchResult<'a> {
        value: &'a str,
        allowed: bool,
        #[serde(skip_serializing_if = "Option::is_none")]
        pattern: Option<&'a str>,
    }
    #[derive(serde::Serialize)]
    struct Summary {
        total: usize,
        allowed: usize,
        blocked: usize,
    }
    #[derive(serde::Serialize)]
    struct Output<'a> {
        results: Vec<MatchResult<'a>>,
        summary: Summary,
    }

    let (matcher, warnings) = AllowlistMatcher::new(args.allow.clone());
    for warning in &warnings {
        eprintln!("warning: {warning}");
    }

    // Positional values win; otherwise every non-empty stdin line is a value.
    let values: Vec<String> = if !args.values.is_empty() {
        args.values.clone()
    } else {
        let mut input = String::new();
        io::stdin()
            .read_to_string(&mut input)
            .map_err(|e| (format!("failed to read stdin: {e}"), 1))?;
        input
            .lines()
            .filter(|line| !line.is_empty())
            .map(str::to_owned)
            .collect()
    };
    if values.is_empty() {
        return Err((
            "no values to test — provide values as arguments or via stdin".into(),
            1,
        ));
    }

    let mut results: Vec<MatchResult> = Vec::with_capacity(values.len());
    for value in &values {
        let pattern = matcher.match_pattern(value);
        results.push(MatchResult {
            value,
            allowed: pattern.is_some(),
            pattern,
        });
    }

    let total = results.len();
    let allowed_count = results.iter().filter(|r| r.allowed).count();

    if args.json {
        let out = Output {
            results,
            summary: Summary {
                total,
                allowed: allowed_count,
                blocked: total - allowed_count,
            },
        };
        match serde_json::to_string_pretty(&out) {
            Ok(json) => println!("{}", json),
            Err(e) => eprintln!("allow-test: failed to serialize JSON output: {e}"),
        }
        return Ok(());
    }

    for r in &results {
        // `allowed` is true exactly when a pattern matched.
        match r.pattern {
            Some(pattern) => println!("✓ {:<40} → {}", r.value, pattern),
            None => println!("✗ {:<40} (no match)", r.value),
        }
    }
    println!("\n{}/{} values allowed", allowed_count, total);
    Ok(())
}
/// Runs the `template` subcommand: writes a starter secrets template for the
/// selected preset and prints follow-up instructions to stderr.
///
/// The output path is `--output` when given, otherwise `secrets.template.yaml`
/// — the default documented in the subcommand's help text
/// (`sanitize template # generic → secrets.template.yaml`). The file is written
/// with `atomic_write` and consists of `TEMPLATE_HEADER`, a newline, and the
/// preset body.
///
/// # Errors
/// Returns `(message, 1)` when the preset name is rejected by
/// `parse_template_preset`, when the output file already exists and
/// `--overwrite` was not given, or when the write fails.
fn run_template(args: &TemplateArgs) -> Result<(), (String, i32)> {
    let preset = parse_template_preset(&args.preset).map_err(|e| (e, 1))?;

    // Keep the default file name in sync with the `template` help text.
    let output_path = args
        .output
        .clone()
        .unwrap_or_else(|| PathBuf::from("secrets.template.yaml"));
    if output_path.exists() && !args.overwrite {
        return Err((
            format!(
                "{} already exists — use --overwrite to replace it",
                output_path.display()
            ),
            1,
        ));
    }

    let body = match preset {
        TemplatePreset::Generic => template_body_generic(),
        TemplatePreset::Web => template_body_web(),
        TemplatePreset::K8s => template_body_k8s(),
        TemplatePreset::Database => template_body_database(),
        TemplatePreset::Aws => template_body_aws(),
    };

    // +1 for the newline that separates the header from the body.
    let mut content = String::with_capacity(TEMPLATE_HEADER.len() + 1 + body.len());
    content.push_str(TEMPLATE_HEADER);
    content.push('\n');
    content.push_str(body);

    atomic_write(&output_path, content.as_bytes())
        .map_err(|e| (format!("failed to write {}: {e}", output_path.display()), 1))?;

    eprintln!("Template written to {}", output_path.display());
    eprintln!();
    eprintln!("Next steps:");
    eprintln!(
        " 1. Edit {} to add your own patterns and remove irrelevant ones.",
        output_path.display()
    );
    eprintln!(
        " 2. Encrypt: sanitize encrypt {} {}.enc",
        output_path.display(),
        output_path.display()
    );
    eprintln!(
        " 3. Sanitize: sanitize <input> -s {}.enc -o <output>",
        output_path.display()
    );
    eprintln!();
    eprintln!("WARNING: always review sanitized output before sending to an LLM.");
    Ok(())
}
/// Ensures the guided-mode output path has a YAML extension.
///
/// A path whose extension is `yaml` or `yml` (compared ASCII
/// case-insensitively) is returned unchanged. Any other path — including one
/// with no extension or a non-UTF-8 extension — gets its extension replaced
/// with, or set to, `yaml`.
fn normalize_guided_output_path(path: PathBuf) -> PathBuf {
    let has_yaml_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("yaml") || ext.eq_ignore_ascii_case("yml"));

    if has_yaml_extension {
        path
    } else {
        path.with_extension("yaml")
    }
}
/// Prompts for an encryption password twice and returns it once both entries
/// agree.
///
/// On a mismatch a notice is printed to stderr and both prompts are repeated;
/// there is no retry limit.
///
/// # Errors
/// Propagates the error from `prompt_password` if either prompt fails.
fn prompt_confirm_password() -> Result<Zeroizing<String>, String> {
    loop {
        let first = prompt_password("encryption")?;
        let confirmation = prompt_password("encryption (confirm)")?;
        if first != confirmation {
            eprintln!("Passwords did not match. Try again.");
            continue;
        }
        return Ok(first);
    }
}
/// Interactive "guided" wizard: asks a series of questions on the terminal,
/// generates a secrets template (and optionally a profile file), optionally
/// encrypts the template, and optionally runs a sanitize pass right away.
///
/// Returns `Err((message, exit_code))` on any prompt, serialization, or write
/// failure, when the user declines an overwrite, or when the final
/// `run_sanitize` call fails. Every error produced directly in this function
/// uses exit code 1.
fn run_guided() -> Result<(), (String, i32)> {
// The wizard refuses to run unless both stdin and stdout are terminals.
if !io::stdin().is_terminal() || !io::stdout().is_terminal() {
return Err((
"guided mode requires an interactive terminal (TTY)".into(),
1,
));
}
eprintln!("Guided setup: logs-focused secrets template");
eprintln!("This wizard creates a starter file you can extend later.\n");
eprintln!("Workspace type (affects which patterns are included):");
eprintln!(" 1) Generic — tokens, emails, IPs, hostnames, UUIDs");
eprintln!(" 2) Web app — JWTs, session cookies, emails, URLs");
eprintln!(" 3) Kubernetes — service accounts, tokens, namespaces");
eprintln!(" 4) Database — passwords, connection strings, usernames");
eprintln!(" 5) AWS — access keys, ARNs, account IDs");
// Re-prompt until a valid menu choice is entered; an empty answer selects option 1.
// NOTE(review): menu option 5 ("AWS") maps to GuidedPreset::Aggressive — confirm this is intended.
let preset = loop {
let answer = prompt_line("Select [1-5] (default: 1): ").map_err(|e| (e, 1))?;
match answer.as_str() {
"" | "1" => break GuidedPreset::Balanced,
"2" => break GuidedPreset::WebApp,
"3" => break GuidedPreset::Kubernetes,
"4" => break GuidedPreset::Database,
"5" => break GuidedPreset::Aggressive,
_ => eprintln!("Please enter a number from 1 to 5."),
}
};
eprintln!("\nReplacement strictness:");
eprintln!(" 1) Balanced — replace clearly sensitive values only");
eprintln!(" 2) Aggressive — replace high-entropy tokens too (recommended for LLMs)");
// An empty answer selects the aggressive option (2).
let aggressive = loop {
let answer = prompt_line("Select [1/2] (default: 2): ").map_err(|e| (e, 1))?;
match answer.as_str() {
"" | "2" => break true,
"1" => break false,
_ => eprintln!("Please enter 1 or 2."),
}
};
let domains = prompt_domains().map_err(|e| (e, 1))?;
let providers = prompt_cloud_providers().map_err(|e| (e, 1))?;
eprintln!();
let formats = prompt_formats().map_err(|e| (e, 1))?;
let exclude_noise_ids = prompt_yes_no(
"\nExclude noisy IDs (trace_id/span_id-like high-entropy values)?",
true,
)
.map_err(|e| (e, 1))?;
let out_raw = prompt_line("\nOutput secrets file path (YAML; default: secrets.guided.yaml): ")
.map_err(|e| (e, 1))?;
let requested_output_path = if out_raw.trim().is_empty() {
PathBuf::from("secrets.guided.yaml")
} else {
PathBuf::from(out_raw)
};
// Force a YAML extension and tell the user when the requested path was changed.
let output_path = normalize_guided_output_path(requested_output_path.clone());
if output_path != requested_output_path {
eprintln!(
"Guided mode writes YAML templates; using {}",
output_path.display()
);
}
let options = GuidedOptions {
// The "aggressive" strictness only upgrades the Balanced preset; every
// other preset is passed through unchanged.
preset: if aggressive {
match preset {
GuidedPreset::Balanced => GuidedPreset::Aggressive,
other => other,
}
} else {
preset
},
domains,
providers,
exclude_noise_ids,
formats,
};
let entries = build_guided_entries(&options);
// Compile the generated entries purely as a validity check; the compiled
// patterns themselves are discarded.
let (_patterns, compile_warnings) = entries_to_patterns(&entries);
if !compile_warnings.is_empty() {
return Err((
format!(
"generated template had {} invalid pattern(s)",
compile_warnings.len()
),
1,
));
}
let plain = serialize_secrets(&entries, SecretsFormat::Yaml)
.map_err(|e| (format!("failed to serialize template: {e}"), 1))?;
// Ask before overwriting an existing file; the default answer is "no".
if output_path.exists()
&& !prompt_yes_no(
&format!("{} already exists. Overwrite?", output_path.display()),
false,
)
.map_err(|e| (e, 1))?
{
return Err(("aborted by user".into(), 1));
}
atomic_write(&output_path, &plain)
.map_err(|e| (format!("failed to write {}: {e}", output_path.display()), 1))?;
eprintln!(
"Generated {} entries at {}",
entries.len(),
output_path.display()
);
// A profile file is only produced when at least one format was selected.
let profile_path: Option<PathBuf> = if options.formats.is_empty() {
None
} else {
let profiles = build_guided_profiles(&options);
let profile_yaml = serde_yaml_ng::to_string(&profiles)
.map_err(|e| (format!("failed to serialize profile: {e}"), 1))?;
// Default profile name is derived from the secrets file stem.
let default_profile_name = output_path
.file_stem()
.and_then(|s| s.to_str())
.map(|stem| format!("{stem}.profile.yaml"))
.unwrap_or_else(|| "profile.guided.yaml".to_string());
let prof_raw = prompt_line(&format!(
"Output profile file path (default: {default_profile_name}): "
))
.map_err(|e| (e, 1))?;
let prof_path = if prof_raw.trim().is_empty() {
PathBuf::from(&default_profile_name)
} else {
PathBuf::from(prof_raw)
};
if prof_path.exists()
&& !prompt_yes_no(
&format!("{} already exists. Overwrite?", prof_path.display()),
false,
)
.map_err(|e| (e, 1))?
{
return Err(("aborted by user".into(), 1));
}
// The written profile is PROFILE_HEADER, a newline, then the serialized YAML.
let mut content = String::with_capacity(PROFILE_HEADER.len() + 1 + profile_yaml.len());
content.push_str(PROFILE_HEADER);
content.push('\n');
content.push_str(&profile_yaml);
atomic_write(&prof_path, content.as_bytes())
.map_err(|e| (format!("failed to write {}: {e}", prof_path.display()), 1))?;
eprintln!(
"Generated {} profile rule(s) at {} (safe to commit — no secrets inside)",
profiles.len(),
prof_path.display()
);
Some(prof_path)
};
let encrypt =
prompt_yes_no("Encrypt the generated secrets file now?", true).map_err(|e| (e, 1))?;
// Defaults describe the "not encrypted" case; the block below overrides them.
let mut secrets_for_run = output_path.clone();
let mut run_password: Option<Zeroizing<String>> = None;
let mut run_unencrypted = true;
if encrypt {
let pw = prompt_confirm_password().map_err(|e| (e, 1))?;
let encrypted = encrypt_secrets(&plain, &pw)
.map_err(|e| (format!("failed to encrypt guided secrets file: {e}"), 1))?;
// The encrypted copy is written next to the plaintext as "<output>.enc".
// NOTE(review): an existing .enc file is overwritten without a prompt.
let encrypted_path = PathBuf::from(format!("{}.enc", output_path.display()));
atomic_write(&encrypted_path, &encrypted).map_err(|e| {
(
format!("failed to write {}: {e}", encrypted_path.display()),
1,
)
})?;
eprintln!("Encrypted template written to {}", encrypted_path.display());
// Removing the plaintext is best-effort: a failure is reported as a
// warning and the wizard continues.
if let Err(e) = fs::remove_file(&output_path) {
eprintln!(
"Warning: could not remove plaintext file {}: {e}",
output_path.display()
);
} else {
eprintln!("Plaintext file {} removed.", output_path.display());
}
secrets_for_run = encrypted_path;
run_password = Some(pw);
run_unencrypted = false;
}
let run_now =
prompt_yes_no("Run sanitize now with this secrets file?", true).map_err(|e| (e, 1))?;
if !run_now {
// Print a ready-to-copy command line instead of running.
let profile_flag = profile_path
.as_ref()
.map(|p| format!(" --profile {}", p.display()))
.unwrap_or_default();
eprintln!(
"Next: sanitize <input> -s {}{}",
secrets_for_run.display(),
profile_flag
);
return Ok(());
}
let input_raw = prompt_line("Input file path (or '-' for stdin): ").map_err(|e| (e, 1))?;
let input = if input_raw.trim().is_empty() {
return Err(("input file path is required to run sanitize now".into(), 1));
} else {
PathBuf::from(input_raw)
};
let out_raw =
prompt_line("Output path (optional; blank = stdout/default): ").map_err(|e| (e, 1))?;
let output = if out_raw.trim().is_empty() {
None
} else {
Some(PathBuf::from(out_raw))
};
let dry_run = prompt_yes_no("Dry-run first?", true).map_err(|e| (e, 1))?;
let deterministic =
prompt_yes_no("Use deterministic replacements?", true).map_err(|e| (e, 1))?;
// Reuse the encryption password as the deterministic seed when there is
// one; otherwise prompt for a seed password only if deterministic mode was chosen.
let mut deterministic_password: Option<Zeroizing<String>> = run_password.clone();
if deterministic && deterministic_password.is_none() {
deterministic_password = Some(prompt_password("deterministic seed").map_err(|e| (e, 1))?);
}
// All Cli fields not listed here take their Default values.
let cli = Cli {
input: vec![input],
output,
secrets_file: Some(secrets_for_run),
profile: profile_path,
encrypted_secrets: !run_unencrypted,
dry_run,
deterministic,
..Cli::default()
};
run_sanitize(cli, deterministic_password.or(run_password), HashMap::new())
}
/// Obtains a password from the first available source.
///
/// Sources are tried in this order:
/// 1. `password_flag` set: interactive prompt (an error if stdin is not a
///    terminal).
/// 2. `cli_password_file` set: contents of that file via `read_password_file`.
/// 3. A non-empty `SANITIZE_PASSWORD` environment variable; the variable is
///    removed from the process environment once it has been read.
/// 4. Otherwise: interactive prompt.
///
/// `interactive_label` is passed to `prompt_password` for the prompt text.
fn resolve_password(
    password_flag: bool,
    cli_password_file: &Option<PathBuf>,
    interactive_label: &str,
) -> Result<Zeroizing<String>, String> {
    if password_flag {
        return if io::stdin().is_terminal() {
            prompt_password(interactive_label)
        } else {
            Err("--password requires an interactive terminal. \
                 For non-interactive use, supply the password via \
                 --password-file or the SANITIZE_PASSWORD environment variable."
                .into())
        };
    }
    if let Some(path) = cli_password_file {
        return read_password_file(path);
    }
    match std::env::var("SANITIZE_PASSWORD") {
        Ok(from_env) if !from_env.is_empty() => {
            // Drop the variable from this process's environment after reading it.
            std::env::remove_var("SANITIZE_PASSWORD");
            eprintln!("info: using password from SANITIZE_PASSWORD environment variable");
            Ok(Zeroizing::new(from_env))
        }
        // Unset, empty, or not valid Unicode: fall back to prompting.
        _ => prompt_password(interactive_label),
    }
}
/// Unix variant: reads a password file after checking its permission bits.
///
/// The file is opened and `fstat`-ed; the lower nine permission bits must be
/// exactly `0600` or `0400`, otherwise an error explaining the `chmod` fix is
/// returned. The contents are then loaded by `read_password_file_contents`.
///
/// NOTE(review): the permission check is done on the opened descriptor, but
/// the contents are subsequently read by path, so the two steps may observe
/// different files if the path is swapped in between — consider reading from
/// the already-open handle.
#[cfg(unix)]
fn read_password_file(path: &Path) -> Result<Zeroizing<String>, String> {
    use nix::sys::stat::fstat;
    use std::os::unix::io::AsRawFd;

    let handle = fs::File::open(path)
        .map_err(|e| format!("cannot open password file {}: {e}", path.display()))?;
    let mode = fstat(handle.as_raw_fd())
        .map_err(|e| format!("cannot stat password file {}: {e}", path.display()))?
        .st_mode
        & 0o777;
    if !matches!(mode, 0o600 | 0o400) {
        return Err(format!(
            "password file {} has permissions {:04o}; expected 0600 or 0400. \
             Fix with: chmod 600 {}",
            path.display(),
            mode,
            path.display(),
        ));
    }
    read_password_file_contents(path)
}
/// Non-Unix variant: no permission check is performed. A warning is printed
/// to stderr and the contents are loaded by `read_password_file_contents`.
#[cfg(not(unix))]
fn read_password_file(path: &Path) -> Result<Zeroizing<String>, String> {
    let shown = path.display();
    eprintln!(
        "warning: password-file permission checks are only available on Unix. \
         Ensure {} is not world-readable.",
        shown,
    );
    read_password_file_contents(path)
}
fn read_password_file_contents(path: &Path) -> Result<Zeroizing<String>, String> {
const MAX_PASSWORD_FILE_BYTES: u64 = 4096;
let size = fs::metadata(path)
.map_err(|e| format!("cannot stat password file {}: {e}", path.display()))?
.len();
if size > MAX_PASSWORD_FILE_BYTES {
return Err(format!(
"password file {} is too large ({size} bytes); expected ≤ {MAX_PASSWORD_FILE_BYTES} bytes",
path.display(),
));
}
let mut contents = Zeroizing::new(
fs::read_to_string(path)
.map_err(|e| format!("cannot read password file {}: {e}", path.display()))?,
);
if contents.ends_with('\n') {
contents.pop();
if contents.ends_with('\r') {
contents.pop();
}
}
if contents.is_empty() {
return Err(format!("password file {} is empty", path.display()));
}
Ok(contents)
}
/// Prompts on the terminal for a password without echoing it.
///
/// `label` is interpolated into the prompt text. An empty entry is rejected
/// with an error; a read failure from `rpassword` is reported as an error
/// string.
fn prompt_password(label: &str) -> Result<Zeroizing<String>, String> {
    let prompt = format!("Enter {label} password: ");
    match rpassword::prompt_password(prompt) {
        Err(e) => Err(format!("failed to read password: {e}")),
        Ok(entered) if entered.is_empty() => Err("password must not be empty".into()),
        Ok(entered) => Ok(Zeroizing::new(entered)),
    }
}
fn resolve_sanitize_password(cli: &Cli) -> Result<Zeroizing<String>, String> {
resolve_password(cli.password, &cli.password_file, "secrets decryption")
}
/// Heuristically decides whether `data` is binary, looking only at its first
/// 512 bytes.
///
/// Any NUL byte in that window means binary. Otherwise the data is binary
/// when more than 10% of the window consists of control bytes below `0x20`
/// other than `\n`, `\r` and `\t`. Empty input is not binary.
fn looks_binary(data: &[u8]) -> bool {
    let window = data.get(..512).unwrap_or(data);
    let mut control_bytes = 0usize;
    for &byte in window {
        match byte {
            0 => return true,
            b'\n' | b'\r' | b'\t' => {}
            low if low < 0x20 => control_bytes += 1,
            _ => {}
        }
    }
    // `max(1)` avoids dividing by zero for an empty window.
    let ratio = control_bytes as f64 / window.len().max(1) as f64;
    ratio > 0.10
}
/// Builds the shared `MappingStore` used to hand out replacements.
///
/// * `deterministic` — when true, replacements come from an `HmacGenerator`
///   keyed by a 32-byte value derived from `password` with PBKDF2-HMAC-SHA256
///   (fixed salt, 600 000 iterations); when false, a `RandomGenerator` is used.
/// * `password` — required when `deterministic` is true.
/// * `max_mappings` — `0` means no capacity limit is passed to the store.
/// * `allowlist` — optional matcher passed through to the store.
///
/// # Errors
/// Returns an error when `deterministic` is requested without a password.
fn build_store(
    deterministic: bool,
    password: Option<&str>,
    max_mappings: usize,
    allowlist: Option<Arc<sanitize_engine::allowlist::AllowlistMatcher>>,
) -> std::result::Result<Arc<MappingStore>, String> {
    let generator: Arc<dyn ReplacementGenerator> = if !deterministic {
        Arc::new(RandomGenerator::new())
    } else if let Some(k) = password {
        use hmac::Hmac;
        use sha2::Sha256;
        // The derived key lives in a zeroizing buffer until it is handed over.
        let mut seed = Zeroizing::new([0u8; 32]);
        let salt = b"sanitize-engine:deterministic-seed:v1";
        pbkdf2::pbkdf2::<Hmac<Sha256>>(k.as_bytes(), salt, 600_000, seed.as_mut())
            .expect("PBKDF2 output length is valid");
        Arc::new(HmacGenerator::new(*seed))
    } else {
        return Err(
            "--deterministic requires --password (or SANITIZE_PASSWORD). \
             A deterministic seed cannot be derived without a key."
                .into(),
        );
    };

    let capacity = (max_mappings != 0).then_some(max_mappings);
    let store = match allowlist {
        None => MappingStore::new(generator, capacity),
        Some(al) => MappingStore::new_with_allowlist(generator, capacity, al),
    };
    Ok(Arc::new(store))
}
/// Returns the built-in list of allow patterns: loopback and netmask
/// addresses, `localhost` and `example.*` hosts and URLs, placeholder UUIDs,
/// common placeholder words, and `${*}` / `{{*}}` template markers.
pub(crate) fn common_allow_patterns() -> Vec<String> {
    const DEFAULTS: [&str; 30] = [
        "127.0.0.1",
        "0.0.0.0",
        "255.255.255.255",
        "255.255.255.0",
        "255.255.0.0",
        "255.0.0.0",
        "::1",
        "localhost",
        "localhost.localdomain",
        "http://localhost*",
        "https://localhost*",
        "http://127.0.0.1*",
        "https://127.0.0.1*",
        "example.com",
        "example.org",
        "example.net",
        "http://example.com*",
        "https://example.com*",
        "https://example.org*",
        "https://example.net*",
        "00000000-0000-0000-0000-000000000000",
        "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
        "12345678-1234-1234-1234-123456789abc",
        "changeme",
        "example",
        "sample",
        "placeholder",
        "${*}",
        "{{*}}",
    ];
    DEFAULTS.iter().map(|pattern| (*pattern).to_owned()).collect()
}
/// Compiles the built-in default scan patterns.
///
/// The patterns come from the guided entries for the `Balanced` preset with
/// no domains, providers, or formats, and with noise-ID exclusion turned off.
/// Entries that fail to compile are logged as warnings; the successfully
/// compiled patterns are returned.
fn build_default_patterns() -> Vec<ScanPattern> {
    let entries = build_guided_entries(&GuidedOptions {
        preset: GuidedPreset::Balanced,
        domains: Vec::new(),
        providers: Vec::new(),
        exclude_noise_ids: false,
        formats: Vec::new(),
    });
    let (patterns, errors) = entries_to_patterns(&entries);
    for (i, e) in &errors {
        warn!(entry = i, error = %e, "built-in default pattern failed to compile");
    }
    patterns
}
/// Builds the two built-in field-name signals ("strong" with threshold 3.0
/// and "medium" with threshold 3.5), both in the `custom:credential`
/// category.
///
/// A signal whose pattern fails to compile is logged as a warning and left
/// out of the result.
fn builtin_field_name_signals() -> Vec<FieldNameSignal> {
    let specs: [(&str, &str, f64); 2] = [
        (
            r"password|passwd|secret|private_key|api_secret|client_secret",
            "field-signal:strong",
            3.0,
        ),
        (
            r"api_key|access_key|auth_token|token|signing_key|encryption_key|credential|cert",
            "field-signal:medium",
            3.5,
        ),
    ];
    let mut signals = Vec::with_capacity(specs.len());
    for (pattern, label, threshold) in specs {
        let built = FieldNameSignal::new(
            pattern,
            parse_category("custom:credential"),
            Some(label.to_string()),
            threshold,
        );
        match built {
            Ok(sig) => signals.push(sig),
            Err(e) => {
                warn!(error = %e, "built-in field-name signal failed to compile");
            }
        }
    }
    signals
}
/// Converts the `field-name` entries of a secrets file into field-name
/// signals.
///
/// Only entries whose `kind` is `"field-name"` and whose pattern is non-empty
/// are considered. A missing threshold falls back to
/// `DEFAULT_FIELD_SIGNAL_THRESHOLD`. Entries that fail to compile are logged
/// as warnings and skipped.
fn field_signals_from_entries(entries: &[SecretEntry]) -> Vec<FieldNameSignal> {
    let mut signals = Vec::new();
    for e in entries {
        if e.kind != "field-name" || e.pattern.is_empty() {
            continue;
        }
        let category = parse_category(&e.category);
        let threshold = e.threshold.unwrap_or(DEFAULT_FIELD_SIGNAL_THRESHOLD);
        match FieldNameSignal::new(&e.pattern, category, e.label.clone(), threshold) {
            Ok(sig) => signals.push(sig),
            Err(err) => {
                warn!(pattern = %e.pattern, error = %err, "field-name signal skipped");
            }
        }
    }
    signals
}
/// Builds a scanner from `base_patterns` plus one literal pattern for every
/// non-empty original value currently held in `store`.
///
/// Each store entry `(category, original, replacement)` contributes a literal
/// pattern for `original`. Values that cannot be compiled into a pattern are
/// skipped with a warning.
///
/// # Errors
/// Returns `(message, 1)` when the scanner itself cannot be constructed.
fn build_augmented_scanner(
    base_patterns: &[ScanPattern],
    store: &Arc<MappingStore>,
    scan_config: ScanConfig,
) -> std::result::Result<Arc<StreamScanner>, (String, i32)> {
    let mut patterns = base_patterns.to_vec();
    let mut discovered = 0usize;
    for (category, original, _replacement) in store.iter() {
        let s = original.as_str();
        if s.is_empty() {
            continue;
        }
        // NOTE(review): the pattern label embeds the raw original value —
        // confirm labels never reach logs or reports.
        match ScanPattern::from_literal(s, category, format!("profile-discovered:{s}")) {
            Ok(pat) => {
                patterns.push(pat);
                discovered += 1;
            }
            Err(e) => {
                // `s` is an original (pre-sanitization) value, so only its
                // length is logged, never the value itself.
                // NOTE(review): confirm the error text does not echo the literal.
                warn!(
                    value_len = s.len(),
                    error = %e,
                    "could not compile discovered literal pattern"
                );
            }
        }
    }
    if discovered > 0 {
        info!(
            count = discovered,
            "augmented scanner with profile-discovered literals"
        );
    }
    let scanner = StreamScanner::new(patterns, Arc::clone(store), scan_config)
        .map_err(|e| (format!("failed to create augmented scanner: {e}"), 1))?;
    Ok(Arc::new(scanner))
}
/// Builds and validates a `ScanConfig` for the given chunk size.
///
/// The overlap is a quarter of the chunk size, clamped to `1..=4096`.
///
/// # Errors
/// Returns an error when `chunk_size` is zero, when the computed overlap is
/// not strictly smaller than the chunk size, or when `ScanConfig::validate`
/// rejects the configuration.
fn build_scan_config(chunk_size: usize) -> Result<ScanConfig, String> {
    let overlap = match chunk_size {
        0 => return Err("--chunk-size must be greater than 0".into()),
        size => (size / 4).clamp(1, 4096),
    };
    if chunk_size <= overlap {
        return Err(format!(
            "--chunk-size ({chunk_size}) is too small; must be > {overlap} bytes"
        ));
    }
    let cfg = ScanConfig::new(chunk_size, overlap);
    cfg.validate().map_err(|e| e.to_string())?;
    Ok(cfg)
}
/// Derives the default output path for a sanitized archive, next to `input`.
///
/// The name is `<stem>.sanitized.<ext>`, where `<ext>` is `zip`, `tar`, or
/// `tar.gz` according to `fmt`. For `TarGz` a trailing `.tar` is removed from
/// the stem first. A missing or non-UTF-8 stem falls back to `output`.
fn default_archive_output(input: &Path, fmt: ArchiveFormat) -> PathBuf {
    let raw_stem = input
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("output");
    let (base, ext) = match fmt {
        ArchiveFormat::Zip => (raw_stem, "zip"),
        ArchiveFormat::Tar => (raw_stem, "tar"),
        ArchiveFormat::TarGz => (
            raw_stem.strip_suffix(".tar").unwrap_or(raw_stem),
            "tar.gz",
        ),
    };
    input.with_file_name(format!("{base}.sanitized.{ext}"))
}
fn init_logging(log_format: &str, log_level: &str) {
use tracing_subscriber::fmt;
use tracing_subscriber::EnvFilter;
let filter =
EnvFilter::try_from_env("SANITIZE_LOG").unwrap_or_else(|_| EnvFilter::new(log_level));
match log_format {
"json" => {
let _ = fmt()
.json()
.with_env_filter(filter)
.with_target(true)
.with_writer(io::stderr)
.try_init();
}
_ => {
let _ = fmt()
.compact()
.with_env_filter(filter)
.with_target(false)
.with_writer(io::stderr)
.try_init();
}
}
}
/// Reports whether stdin is an input: true when no inputs were given at all
/// or when one of them is the `-` marker.
fn has_stdin_input(cli: &Cli) -> bool {
    if cli.input.is_empty() {
        return true;
    }
    cli.input.iter().any(|candidate| candidate.as_os_str() == "-")
}
#[cfg(unix)]
fn stdin_is_pipe() -> bool {
use nix::sys::stat::fstat;
use std::os::unix::io::AsRawFd;
fstat(io::stdin().as_raw_fd())
.map(|s| {
nix::sys::stat::SFlag::from_bits_truncate(s.st_mode)
.contains(nix::sys::stat::SFlag::S_IFIFO)
})
.unwrap_or(false)
}
/// Windows variant: reports whether `GetFileType` classifies the stdin handle
/// as `FILE_TYPE_PIPE` (value 3).
#[cfg(windows)]
fn stdin_is_pipe() -> bool {
use std::os::windows::io::AsRawHandle;
// Declared locally so no extra Windows bindings crate is needed here.
extern "system" {
fn GetFileType(hFile: *mut std::ffi::c_void) -> u32;
}
const FILE_TYPE_PIPE: u32 = 3;
let handle = io::stdin().as_raw_handle();
// SAFETY: GetFileType only inspects the handle value passed to it; the
// handle is the process's stdin handle obtained just above.
// NOTE(review): assumes GetFileType tolerates a null/invalid stdin handle
// (e.g. a detached GUI process) — confirm against the Win32 documentation.
unsafe { GetFileType(handle as *mut _) == FILE_TYPE_PIPE }
}
/// Fallback for other platforms: treats any non-terminal stdin as a pipe.
#[cfg(not(any(unix, windows)))]
fn stdin_is_pipe() -> bool {
    let interactive = io::stdin().is_terminal();
    !interactive
}
/// Returns the input paths from the CLI, leaving out the `-` stdin marker.
fn file_inputs(cli: &Cli) -> Vec<&PathBuf> {
    let mut paths = Vec::new();
    for candidate in &cli.input {
        if candidate.as_os_str() != "-" {
            paths.push(candidate);
        }
    }
    paths
}
/// Maps a format name (or one of its aliases) to a file extension.
///
/// Matching is exact and case-sensitive; unknown names yield `None`.
fn format_to_ext(fmt: &str) -> Option<&str> {
    // Each row lists the accepted spellings and the extension they map to.
    const TABLE: &[(&[&str], &str)] = &[
        (&["json"], "json"),
        (&["jsonl", "ndjson"], "jsonl"),
        (&["yaml", "yml"], "yaml"),
        (&["xml"], "xml"),
        (&["csv"], "csv"),
        (&["tsv"], "tsv"),
        (&["key-value", "key_value", "kv"], "conf"),
        (&["toml"], "toml"),
        (&["env"], "env"),
        (&["ini"], "ini"),
        (&["log"], "log"),
    ];
    TABLE
        .iter()
        .find(|(aliases, _)| aliases.iter().any(|alias| *alias == fmt))
        .map(|(_, ext)| *ext)
}
/// Derives the default output path for a sanitized plain file, next to
/// `input`.
///
/// `name.ext` becomes `name-sanitized.ext`; a name without an extension gets
/// `-sanitized` appended. A leading dot is part of the name, not an extension
/// separator, so `.env` becomes `.env-sanitized` rather than a file called
/// `-sanitized.env`. A missing or non-UTF-8 file name falls back to `output`.
fn default_plain_output(input: &Path) -> PathBuf {
    let name = input
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("output");
    let output_name = match name.rsplit_once('.') {
        // An empty stem means the only dot is the leading one of a dotfile.
        Some((stem, ext)) if !stem.is_empty() => format!("{stem}-sanitized.{ext}"),
        _ => format!("{name}-sanitized"),
    };
    input.with_file_name(output_name)
}
/// Splits a file name into `(stem, extension)` so a numeric suffix can be
/// inserted between the two.
///
/// `.tar.gz` is kept together as one extension. The returned extension
/// includes its leading dot, or is empty when the name has none. The stem is
/// never empty: a leading dot belongs to the name, so `.env` yields
/// `(".env", "")` instead of `("", ".env")`.
fn split_name_for_suffix(name: &str) -> (String, String) {
    if let Some(stem) = name.strip_suffix(".tar.gz") {
        if !stem.is_empty() {
            return (stem.to_string(), ".tar.gz".to_string());
        }
    }
    match name.rfind('.') {
        // `dot > 0` excludes the leading dot of a dotfile.
        Some(dot) if dot > 0 => {
            let (stem, ext) = name.split_at(dot);
            (stem.to_string(), ext.to_string())
        }
        _ => (name.to_string(), String::new()),
    }
}
/// Returns an output path that neither exists on disk nor has already been
/// handed out during this run, recording the result in `used`.
///
/// `path` itself is returned when it is free. Otherwise `-1`, `-2`, … is
/// inserted between the stem and extension (as split by
/// `split_name_for_suffix`) until a free name is found.
fn uniquify_output_path(path: PathBuf, used: &mut HashSet<PathBuf>) -> PathBuf {
    // `insert` returns true only when the path was not reserved before.
    if !path.exists() && used.insert(path.clone()) {
        return path;
    }
    let dir = path.parent().map(Path::to_path_buf).unwrap_or_default();
    let file_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("output");
    let (stem, ext) = split_name_for_suffix(file_name);
    for idx in 1usize.. {
        let candidate = dir.join(format!("{stem}-{idx}{ext}"));
        if !candidate.exists() && used.insert(candidate.clone()) {
            return candidate;
        }
    }
    unreachable!("the numeric suffix space is exhausted before this point")
}
/// One planned unit of work: where input is read from and where the
/// sanitized result is written.
enum InputTarget {
/// Read from stdin; `output` is the destination path, if any.
Stdin { output: Option<PathBuf> },
/// Read from the file `input` and write to `output`.
File { input: PathBuf, output: PathBuf },
}
/// Version-control metadata directory names checked during directory walks.
const SKIP_VCS_DIRS: &[&str] = &[".git", ".hg", ".svn", ".bzr"];
/// A single input file after directory inputs have been expanded.
struct ExpandedInput {
/// Path of the file to process.
path: PathBuf,
/// The directory input this file was found under, or `None` when the file
/// was named directly on the command line.
dir_root: Option<PathBuf>,
}
/// Recursively collects the regular files under `dir`, in file-name order,
/// without following symlinks.
///
/// Directories named in `SKIP_VCS_DIRS` below the root are pruned: neither
/// they nor anything inside them is visited. Unless `include_hidden` is true,
/// entries below the root whose name starts with `.` are pruned the same way,
/// so the contents of a hidden directory are excluded along with it. The root
/// `dir` itself is always walked, whatever its name.
///
/// # Errors
/// Returns a message when any visited entry cannot be read.
fn walk_dir(dir: &Path, include_hidden: bool) -> Result<Vec<PathBuf>, String> {
    use walkdir::WalkDir;

    // `filter_entry` is needed to prune: skipping a directory entry inside a
    // plain `for` loop would still descend into it and yield its files.
    let walker = WalkDir::new(dir)
        .follow_links(false)
        .sort_by_file_name()
        .into_iter()
        .filter_entry(|entry| {
            // The root is always kept; its name may be "." or a hidden/VCS
            // directory the user asked for explicitly.
            if entry.depth() == 0 {
                return true;
            }
            let name = entry.file_name().to_str().unwrap_or("");
            if entry.file_type().is_dir() && SKIP_VCS_DIRS.contains(&name) {
                return false;
            }
            include_hidden || !name.starts_with('.')
        });

    let mut files = Vec::new();
    for entry in walker {
        let entry = entry.map_err(|e| format!("error walking {}: {e}", dir.display()))?;
        if entry.file_type().is_file() {
            files.push(entry.into_path());
        }
    }
    Ok(files)
}
/// Compiled exclude patterns. Each entry pairs a glob with a flag recording
/// whether the raw pattern ended in `/` (a "subtree" pattern).
struct IgnoreList {
    patterns: Vec<(glob::Pattern, bool)>,
}

impl IgnoreList {
    /// Compiles the raw exclude patterns.
    ///
    /// Trailing slashes are stripped before compilation; patterns that are
    /// empty afterwards are dropped silently, and patterns that fail to
    /// compile are dropped with a warning on stderr.
    fn new(raw: &[String]) -> Self {
        let patterns = raw
            .iter()
            .filter_map(|p| {
                let body = p.trim_end_matches('/');
                if body.is_empty() {
                    return None;
                }
                match glob::Pattern::new(body) {
                    Ok(compiled) => Some((compiled, p.ends_with('/'))),
                    Err(e) => {
                        eprintln!("warning: invalid exclude pattern '{p}': {e} — skipping");
                        None
                    }
                }
            })
            .collect();
        Self { patterns }
    }

    /// Reports whether `path` is matched by any exclude pattern.
    ///
    /// The path is compared relative to `root` (both canonicalized when
    /// possible). Subtree patterns match by literal prefix on a path
    /// component boundary. Other patterns are glob-matched against the
    /// relative path and, when the pattern contains no `/`, also against the
    /// bare file name.
    fn is_excluded(&self, path: &Path, root: &Path) -> bool {
        if self.patterns.is_empty() {
            return false;
        }
        let opts = glob::MatchOptions {
            case_sensitive: true,
            require_literal_separator: true,
            require_literal_leading_dot: false,
        };
        let abs_path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
        let abs_root = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
        let relative = abs_path
            .strip_prefix(&abs_root)
            .unwrap_or(&abs_path)
            .to_string_lossy();
        let base_name = path
            .file_name()
            .map(|n| n.to_string_lossy())
            .unwrap_or_default();

        self.patterns.iter().any(|(pat, is_subtree)| {
            if *is_subtree {
                let prefix = pat.as_str();
                relative.starts_with(prefix)
                    && (relative.len() == prefix.len()
                        || relative.as_bytes().get(prefix.len()) == Some(&b'/'))
            } else {
                pat.matches_with(&relative, opts)
                    || (!pat.as_str().contains('/') && pat.matches_with(&base_name, opts))
            }
        })
    }
}
/// Compiled include-path patterns. Each entry pairs a glob with a flag
/// recording whether the raw pattern ended in `/` (a "subtree" pattern).
struct IncludeList {
    patterns: Vec<(glob::Pattern, bool)>,
}

impl IncludeList {
    /// Compiles the raw include-path patterns.
    ///
    /// Trailing slashes are stripped before compilation; patterns that are
    /// empty afterwards are dropped silently, and patterns that fail to
    /// compile are dropped with a warning on stderr.
    fn new(raw: &[String]) -> Self {
        let patterns = raw
            .iter()
            .filter_map(|p| {
                let body = p.trim_end_matches('/');
                if body.is_empty() {
                    return None;
                }
                match glob::Pattern::new(body) {
                    Ok(compiled) => Some((compiled, p.ends_with('/'))),
                    Err(e) => {
                        eprintln!("warning: invalid include-path pattern '{p}': {e} — skipping");
                        None
                    }
                }
            })
            .collect();
        Self { patterns }
    }

    /// Reports whether `path` passes the include filter.
    ///
    /// An empty list includes everything. Otherwise the path is compared
    /// relative to `root` (both canonicalized when possible). Subtree
    /// patterns match by literal prefix on a path component boundary. Other
    /// patterns are glob-matched against the relative path and, when the
    /// pattern contains no `/`, also against the bare file name.
    fn is_included(&self, path: &Path, root: &Path) -> bool {
        if self.patterns.is_empty() {
            return true;
        }
        let opts = glob::MatchOptions {
            case_sensitive: true,
            require_literal_separator: true,
            require_literal_leading_dot: false,
        };
        let abs_path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
        let abs_root = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
        let relative = abs_path
            .strip_prefix(&abs_root)
            .unwrap_or(&abs_path)
            .to_string_lossy();
        let base_name = path
            .file_name()
            .map(|n| n.to_string_lossy())
            .unwrap_or_default();

        self.patterns.iter().any(|(pat, is_subtree)| {
            if *is_subtree {
                let prefix = pat.as_str();
                relative.starts_with(prefix)
                    && (relative.len() == prefix.len()
                        || relative.as_bytes().get(prefix.len()) == Some(&b'/'))
            } else {
                pat.matches_with(&relative, opts)
                    || (!pat.as_str().contains('/') && pat.matches_with(&base_name, opts))
            }
        })
    }
}
/// Turns the CLI inputs into a list of `InputTarget`s, each with its planned
/// output location.
///
/// Directory inputs are expanded into their files and filtered by the exclude
/// and include-path patterns. Output paths are chosen per target; parent
/// directories for planned outputs are created as a side effect. A stdin
/// target, when present, is appended after all file targets.
///
/// Returns an error when `-` is given more than once, when walking a
/// directory fails, when `--output` is an existing non-directory while
/// multiple inputs are present, or when an output directory cannot be created.
fn plan_input_targets(cli: &Cli) -> Result<Vec<InputTarget>, String> {
let explicit_stdin_count = cli.input.iter().filter(|p| p.as_os_str() == "-").count();
if explicit_stdin_count > 1 {
return Err("stdin marker '-' can be specified at most once".into());
}
// Piped stdin counts as an implicit extra input only when '-' was not given.
let has_piped_stdin = explicit_stdin_count == 0 && stdin_is_pipe();
// No inputs at all: the single target is stdin, written to --output if set.
if cli.input.is_empty() {
return Ok(vec![InputTarget::Stdin {
output: cli.output.clone(),
}]);
}
// Exclude patterns come from the project config (when one is found) followed
// by the CLI. The root for explicitly named files is the config's directory,
// or the current directory when there is no project config.
let (ignore_patterns, ignore_root): (Vec<String>, PathBuf) = {
let mut patterns: Vec<String> = Vec::new();
let root = if let Some(ref cfg_path) = find_project_config() {
let (pc, cfg_dir) = load_project_config(cfg_path);
patterns.extend(pc.exclude);
cfg_dir
} else {
std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."))
};
patterns.extend(cli.exclude_path.iter().cloned());
(patterns, root)
};
let ignore_list = IgnoreList::new(&ignore_patterns);
let include_list = IncludeList::new(&cli.include_path);
let mut expanded: Vec<ExpandedInput> = Vec::new();
for input in &cli.input {
// The stdin marker is handled after the file targets.
if input.as_os_str() == "-" {
continue; }
if input.is_dir() {
let files = walk_dir(input, cli.hidden)?;
if files.is_empty() {
warn!(dir = %input.display(), "directory contains no processable files");
continue;
}
let before = files.len();
// Inside a directory input, patterns are matched relative to that directory.
let walk_root = input.canonicalize().unwrap_or_else(|_| input.to_path_buf());
let files: Vec<PathBuf> = files
.into_iter()
.filter(|f| {
if ignore_list.is_excluded(f, &walk_root) {
info!(path = %f.display(), "excluded by ignore pattern");
return false;
}
if !include_list.is_included(f, &walk_root) {
info!(path = %f.display(), "excluded by include-path filter");
return false;
}
true
})
.collect();
if files.is_empty() {
warn!(dir = %input.display(), excluded = before, "all files in directory excluded by path filters");
continue;
}
let excluded = before - files.len();
info!(dir = %input.display(), files = files.len(), excluded, "expanding directory input");
// The human-readable summary is suppressed when logs are JSON.
if cli.effective_log_format() != "json" {
if excluded > 0 {
eprintln!(
" {} files in {} ({} excluded)",
files.len(),
input.display(),
excluded
);
} else {
eprintln!(" {} files in {}", files.len(), input.display());
}
}
for f in files {
expanded.push(ExpandedInput {
path: f,
dir_root: Some(input.clone()),
});
}
} else {
// Explicitly named files are checked against the exclude list only;
// the include-path filter is not applied to them.
if ignore_list.is_excluded(input, &ignore_root) {
warn!(path = %input.display(), "explicitly specified file matches an exclude pattern — skipping");
continue;
}
expanded.push(ExpandedInput {
path: input.clone(),
dir_root: None,
});
}
}
// More than one target in total (files plus explicit or piped stdin).
let multi_input = expanded.len() + explicit_stdin_count + (has_piped_stdin as usize) > 1;
let mut used_outputs = HashSet::new();
let mut units = Vec::new();
// With multiple inputs, --output (when given) must be a directory; it is
// created when it does not exist yet.
let output_dir: Option<PathBuf> = if multi_input {
if let Some(path) = &cli.output {
if path.exists() && !path.is_dir() {
return Err(format!(
"--output must be a directory when multiple inputs are provided: {}",
path.display()
));
}
if !path.exists() {
fs::create_dir_all(path).map_err(|e| {
format!("failed to create output directory {}: {e}", path.display())
})?;
}
Some(path.clone())
} else {
None
}
} else {
None
};
for ei in expanded {
let planned_out = if let Some(ref root) = ei.dir_root {
// File from a directory input: mirror its path relative to that directory.
let rel = ei.path.strip_prefix(root).unwrap_or(&ei.path);
if let Some(out_root) = &cli.output {
let dest = out_root.join(rel);
if let Some(parent) = dest.parent() {
fs::create_dir_all(parent)
.map_err(|e| format!("failed to create {}: {e}", parent.display()))?;
}
uniquify_output_path(dest, &mut used_outputs)
} else {
// No --output: write into a sibling "<dir>-sanitized" directory.
let dir_name = root
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("sanitized");
let peer_dir = root
.parent()
.unwrap_or_else(|| Path::new("."))
.join(format!("{dir_name}-sanitized"));
let dest = peer_dir.join(rel);
if let Some(parent) = dest.parent() {
fs::create_dir_all(parent)
.map_err(|e| format!("failed to create {}: {e}", parent.display()))?;
}
uniquify_output_path(dest, &mut used_outputs)
}
} else if multi_input {
// Explicit file among several inputs: default name, placed in the
// output directory when one was given, otherwise next to the input.
let default_out = match ArchiveFormat::from_path(&ei.path.to_string_lossy()) {
Some(fmt) => default_archive_output(&ei.path, fmt),
None => default_plain_output(&ei.path),
};
let out_name = default_out
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("output")
.to_string();
if let Some(dir) = &output_dir {
uniquify_output_path(dir.join(out_name), &mut used_outputs)
} else {
uniquify_output_path(default_out, &mut used_outputs)
}
} else {
// Single explicit file: --output may be a directory (default name
// inside it) or a file path (used exactly as given).
let default_out = match ArchiveFormat::from_path(&ei.path.to_string_lossy()) {
Some(fmt) => default_archive_output(&ei.path, fmt),
None => default_plain_output(&ei.path),
};
if let Some(out) = &cli.output {
if out.is_dir() {
let out_name = default_out
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("output")
.to_string();
uniquify_output_path(out.join(out_name), &mut used_outputs)
} else {
out.clone()
}
} else {
default_out
}
};
units.push(InputTarget::File {
input: ei.path,
output: planned_out,
});
}
// Stdin goes last. With multiple inputs it gets a fixed file name, inside
// the output directory when there is one; otherwise it uses --output.
if explicit_stdin_count > 0 || has_piped_stdin {
let stdin_out = if multi_input {
Some(
output_dir
.as_ref()
.map(|d| d.join("input-sanitized.txt"))
.unwrap_or_else(|| PathBuf::from("input-sanitized.txt")),
)
} else {
cli.output.clone()
};
units.push(InputTarget::Stdin { output: stdin_out });
}
Ok(units)
}
/// Extracts per-archive `--only` / `--exclude` filter groups from the raw
/// argument list.
///
/// Returns a map from archive path to `(only_patterns, exclude_patterns)`
/// together with the argument list with the filter flags and their patterns
/// removed. Every archive-looking argument gets a map entry, even when no
/// filters follow it.
///
/// Returns an error when a filter flag appears before any archive path, when
/// a pattern is not a valid glob, or when an existing non-archive file appears
/// where a pattern is expected.
// The return type is a nested tuple/map; the lint is silenced rather than
// introducing a type alias.
#[allow(clippy::type_complexity)]
fn parse_archive_filters(
args: &[OsString],
) -> Result<(HashMap<PathBuf, (Vec<String>, Vec<String>)>, Vec<OsString>), String> {
// Parser state: `Global` until the first archive path is seen, then
// `AfterArchive`; `Collecting*` while reading the patterns after a flag.
#[derive(PartialEq)]
enum State {
Global,
AfterArchive,
CollectingOnly,
CollectingExclude,
}
let mut state = State::Global;
let mut current_archive: Option<PathBuf> = None;
let mut filter_map: HashMap<PathBuf, (Vec<String>, Vec<String>)> = HashMap::new();
let mut cleaned: Vec<OsString> = Vec::with_capacity(args.len());
// Patterns ending in '/' are not validated as globs.
let validate_pattern = |p: &str| -> Result<(), String> {
if !p.ends_with('/') {
glob::Pattern::new(p).map_err(|e| format!("invalid glob pattern '{p}': {e}"))?;
}
Ok(())
};
for arg in args {
let s = arg.to_string_lossy();
match s.as_ref() {
// Filter flags are consumed here and never copied into `cleaned`.
"--only" => {
if state == State::Global {
return Err(
"--only must follow an archive path (e.g. archive.zip --only PATTERN)"
.into(),
);
}
state = State::CollectingOnly;
}
"--exclude" => {
if state == State::Global {
return Err(
"--exclude must follow an archive path (e.g. archive.zip --exclude PATTERN)"
.into(),
);
}
state = State::CollectingExclude;
}
// A non-flag argument while collecting: the next archive, a misplaced
// file, or a pattern for the current archive.
_ if (state == State::CollectingOnly || state == State::CollectingExclude)
&& !s.starts_with('-') =>
{
let candidate = PathBuf::from(s.as_ref());
// An archive-looking name that exists as a file starts a new filter group.
if ArchiveFormat::from_path(&s).is_some() && candidate.is_file() {
filter_map
.entry(candidate.clone())
.or_insert_with(|| (Vec::new(), Vec::new()));
current_archive = Some(candidate.clone());
state = State::AfterArchive;
cleaned.push(arg.clone());
} else if candidate.is_file() {
return Err(format!(
"non-archive path '{}' cannot appear between filter flags; \
move it before or after the archive+filter group",
candidate.display()
));
} else {
validate_pattern(&s)?;
// The collecting states are only entered after an archive was seen
// (both flags reject `State::Global`), so `current_archive` is set.
let key = current_archive.as_ref().unwrap();
let entry = filter_map.entry(key.clone()).or_default();
if state == State::CollectingOnly {
entry.0.push(s.into_owned());
} else {
entry.1.push(s.into_owned());
}
}
}
// Any other flag ends pattern collection and is passed through.
_ if (state == State::CollectingOnly || state == State::CollectingExclude)
&& s.starts_with('-') =>
{
state = State::AfterArchive;
cleaned.push(arg.clone());
}
// Outside pattern collection every argument is passed through; an
// archive-looking name also becomes the current archive. Unlike the
// collecting case above, no `is_file()` check is made here.
_ => {
let candidate = PathBuf::from(s.as_ref());
if ArchiveFormat::from_path(&s).is_some() {
filter_map
.entry(candidate.clone())
.or_insert_with(|| (Vec::new(), Vec::new()));
current_archive = Some(candidate.clone());
state = State::AfterArchive;
}
cleaned.push(arg.clone());
}
}
}
Ok((filter_map, cleaned))
}
/// Validates the parsed CLI arguments before any work starts.
///
/// Checks are run in order and the first failure is returned as an error
/// message. Returns `Ok(())` when every check passes.
fn validate_args(cli: &Cli) -> Result<(), String> {
// Stdin is an input (no inputs given, or '-') but it is a terminal.
if has_stdin_input(cli) && io::stdin().is_terminal() {
return Err("stdin was requested but stdin is a terminal.\n\
Provide file path(s) only, or pipe data into sanitize when using '-'.\n\n\
Usage: sanitize [OPTIONS] [INPUT]...\n \
command | sanitize -s secrets.yaml"
.into());
}
let explicit_stdin_count = cli.input.iter().filter(|p| p.as_os_str() == "-").count();
if explicit_stdin_count > 1 {
return Err("stdin marker '-' can be specified at most once".into());
}
// Every non-stdin input must exist and be a file or a directory.
for input in file_inputs(cli) {
if !input.exists() {
return Err(format!("input path not found: {}", input.display()));
}
if !input.is_file() && !input.is_dir() {
return Err(format!(
"input path is not a file or directory: {}",
input.display()
));
}
}
if let Some(ref fmt) = cli.format {
if !VALID_FORMATS.contains(&fmt.as_str()) {
return Err(format!(
"invalid --format '{}': must be one of: {}",
fmt,
VALID_FORMATS.join(", ")
));
}
}
if let Some(ref sf) = cli.secrets_file {
// NOTE(review): a missing secrets file is accepted when --deterministic
// is set — confirm this exemption is intended.
if !sf.exists() && !cli.deterministic {
return Err(format!("secrets file not found: {}", sf.display()));
}
if sf.exists() && !sf.is_file() {
return Err(format!(
"secrets path is not a regular file: {}",
sf.display()
));
}
}
// Built only for its validation; the resulting config is discarded.
build_scan_config(cli.chunk_size)?;
if let Some(t) = cli.threads {
if t == 0 {
return Err("--threads must be ≥ 1".into());
}
}
// Archive nesting depth must be within 1..=10.
if cli.max_archive_depth > 10 {
return Err(format!(
"--max-archive-depth {} exceeds maximum of 10 (each nesting level \
may buffer up to 256 MiB of archive data)",
cli.max_archive_depth
));
}
if cli.max_archive_depth == 0 {
return Err("--max-archive-depth must be ≥ 1".into());
}
if !matches!(cli.effective_log_format(), "human" | "json") {
return Err(format!(
"invalid --log-format '{}': must be 'human' or 'json'",
cli.effective_log_format()
));
}
if !matches!(
cli.effective_log_level(),
"off" | "error" | "warn" | "info" | "debug" | "trace"
) {
return Err(format!(
"invalid --log-level '{}': must be one of off, error, warn, info, debug, trace",
cli.effective_log_level()
));
}
if cli.progress_interval_ms == 0 {
return Err("--progress-interval-ms must be greater than 0".into());
}
// A password source is rejected unless something will use it: either
// encrypted secrets or deterministic mode.
let has_password_source = cli.password
|| cli.password_file.is_some()
|| std::env::var("SANITIZE_PASSWORD").is_ok_and(|v| !v.is_empty());
if has_password_source && !cli.encrypted_secrets && !cli.deterministic {
return Err(
"password input (--password, --password-file, or SANITIZE_PASSWORD) \
was provided but --encrypted-secrets is not set.\n\
Add --encrypted-secrets to decrypt the secrets file, or remove \
password inputs to use a plaintext file."
.into(),
);
}
// Each --app must be a built-in app or a directory under the user apps dir.
for app in &cli.app {
let is_builtin = BUILTIN_APPS.iter().any(|a| a.name == app.as_str());
let is_user = user_apps_dir()
.map(|d| d.join(app).is_dir())
.unwrap_or(false);
if !is_builtin && !is_user {
return Err(format!(
"unknown --app '{}'. Built-in apps: {}. \
Add a custom app at $SANITIZE_APPS_DIR/{} (secrets.yaml / profile.yaml).",
app,
builtin_app_names().join(", "),
app,
));
}
}
if let Some(ref template) = cli.llm {
if cli.dry_run {
return Err(
"--llm and --dry-run cannot be combined: dry-run does not produce \
sanitized output, so the generated prompt would have no content."
.into(),
);
}
// Anything that is not a built-in template name is treated as a path
// to a custom template file, which must exist and be a regular file.
let known = matches!(
template.as_str(),
"troubleshoot" | "review-config" | "review-security"
);
if !known {
let path = Path::new(template);
if !path.exists() {
return Err(format!(
"--llm template '{}' is not a known template name and the path \
does not exist.\n\
Built-in templates: troubleshoot, review-config, review-security\n\
To use a custom template, provide a path to an existing file.",
template
));
}
if !path.is_file() {
return Err(format!(
"--llm template '{}' exists but is not a regular file.",
template
));
}
}
}
Ok(())
}
/// Picks the number of worker threads to use.
///
/// An explicit request is honoured but capped at the parallelism reported by
/// the operating system; without a request the reported parallelism is used
/// as-is. When the parallelism cannot be queried it is treated as 1.
fn resolve_thread_count(requested: Option<usize>) -> usize {
    let detected = match std::thread::available_parallelism() {
        Ok(cores) => cores.get(),
        Err(_) => 1,
    };
    requested.map_or(detected, |wanted| std::cmp::min(wanted, detected))
}
/// Copy of selected `Cli` fields as they were when `CliConfigSnapshot::capture`
/// was called.
///
/// `print_run_header` compares the current `Cli` against this snapshot and
/// appends ` [config]` to every value that is active now but was not present
/// in the snapshot.
struct CliConfigSnapshot {
/// `true` when `cli.secrets_file` was `Some` at capture time.
had_secrets: bool,
/// `true` when `cli.profile` was `Some` at capture time.
had_profile: bool,
/// Clone of `cli.app` at capture time.
apps: Vec<String>,
/// Clone of `cli.allow` at capture time.
allow: Vec<String>,
/// Value of `cli.strict` at capture time.
strict: bool,
/// Value of `cli.fail_on_match` at capture time.
fail_on_match: bool,
/// Value of `cli.no_structured_handoff` at capture time.
no_structured_handoff: bool,
/// Value of `cli.deterministic` at capture time.
deterministic: bool,
/// Value of `cli.dry_run` at capture time.
dry_run: bool,
}
impl CliConfigSnapshot {
    /// Records the current state of the fields that `print_run_header`
    /// later compares against.
    fn capture(cli: &Cli) -> Self {
        // Presence flags and list copies are derived first; the plain
        // booleans are copied straight across below.
        let had_secrets = cli.secrets_file.is_some();
        let had_profile = cli.profile.is_some();
        let apps = cli.app.clone();
        let allow = cli.allow.clone();
        Self {
            had_secrets,
            had_profile,
            apps,
            allow,
            strict: cli.strict,
            fail_on_match: cli.fail_on_match,
            no_structured_handoff: cli.no_structured_handoff,
            deterministic: cli.deterministic,
            dry_run: cli.dry_run,
        }
    }
}
/// Emits a summary of the effective run configuration.
///
/// With `json_logs` set, a single structured `info!` event carrying the
/// secrets path, profile path and app list is logged and nothing is printed.
/// Otherwise a human-readable block is written to stderr; every value that is
/// active in `cli` but was not present in `snap` is suffixed with ` [config]`.
fn print_run_header(cli: &Cli, snap: &CliConfigSnapshot, json_logs: bool) {
    if json_logs {
        info!(
            secrets = cli.secrets_file.as_ref().map(|p| p.display().to_string()).unwrap_or_default(),
            profile = cli.profile.as_ref().map(|p| p.display().to_string()).unwrap_or_default(),
            apps = %cli.app.join(","),
            "run config"
        );
        return;
    }

    // Suffix for a value, given whether the snapshot already contained it.
    let origin = |in_snapshot: bool| if in_snapshot { "" } else { " [config]" };

    if let Some(path) = &cli.secrets_file {
        eprintln!(" secrets: {}{}", path.display(), origin(snap.had_secrets));
    } else {
        eprintln!(" secrets: (none — built-in patterns only)");
    }

    if let Some(path) = &cli.profile {
        eprintln!(" profile: {}{}", path.display(), origin(snap.had_profile));
    }

    if !cli.app.is_empty() {
        let mut rendered: Vec<String> = Vec::with_capacity(cli.app.len());
        for name in &cli.app {
            rendered.push(format!("{name}{}", origin(snap.apps.contains(name))));
        }
        eprintln!(" apps: {}", rendered.join(", "));
    }

    if !cli.allow.is_empty() {
        let mut rendered: Vec<String> = Vec::with_capacity(cli.allow.len());
        for value in &cli.allow {
            rendered.push(format!("{value:?}{}", origin(snap.allow.contains(value))));
        }
        eprintln!(" allow: {}", rendered.join(", "));
    }

    // (flag text, currently active, already set in the snapshot)
    let toggles = [
        ("--strict", cli.strict, snap.strict),
        ("--fail-on-match", cli.fail_on_match, snap.fail_on_match),
        (
            "--no-structured-handoff",
            cli.no_structured_handoff,
            snap.no_structured_handoff,
        ),
        ("--deterministic", cli.deterministic, snap.deterministic),
        ("--dry-run", cli.dry_run, snap.dry_run),
    ];
    let mut flags: Vec<String> = Vec::new();
    for (flag, active, in_snapshot) in toggles {
        if active {
            flags.push(format!("{flag}{}", origin(in_snapshot)));
        }
    }
    if !flags.is_empty() {
        eprintln!(" flags: {}", flags.join(", "));
    }
    eprintln!();
}
/// Folds per-label entropy hit counts into `stats`.
///
/// The sum of all counts is added to both `matches_found` and
/// `replacements_applied`, and each label's count is added to its entry in
/// `pattern_counts` (created at 0 when missing).
fn merge_entropy_counts(stats: &mut ScanStats, label_counts: HashMap<String, u64>) {
    let entropy_total = label_counts.values().sum::<u64>();
    stats.matches_found += entropy_total;
    stats.replacements_applied += entropy_total;
    label_counts.into_iter().for_each(|(label, hits)| {
        let slot = stats.pattern_counts.entry(label).or_insert(0);
        *slot += hits;
    });
}
/// Computes the entropy histogram of `buf` and merges it into the shared
/// accumulator.
///
/// The histogram is computed before the lock is taken. An empty accumulator
/// is replaced wholesale by the new histogram; otherwise buckets are merged
/// pairwise in order.
///
/// # Panics
/// Panics if the accumulator mutex is poisoned.
fn accumulate_entropy_histogram(
    acc: &Arc<Mutex<Vec<EntropyBuckets>>>,
    buf: &[u8],
    configs: &[EntropyConfig],
) {
    let fresh = entropy_histogram_bytes(buf, configs);
    let mut totals = acc.lock().expect("entropy histogram lock");
    if !totals.is_empty() {
        totals
            .iter_mut()
            .zip(fresh.iter())
            .for_each(|(dst, src)| dst.merge(src));
        return;
    }
    *totals = fresh;
}
/// Writes the entropy-calibration histogram for every bucket set to stderr.
///
/// Each bucket set gets a header line, one line per entry of
/// `HISTOGRAM_THRESHOLDS` (the entry equal to the configured threshold is
/// marked), a note when the configured threshold matches none of those
/// entries, and the number of candidates examined. Bucket sets without
/// candidates are reported on a single line.
fn print_entropy_histogram(buckets: &[EntropyBuckets]) {
    for bucket in buckets {
        // The default label is left implicit; any other label is shown.
        let label_suffix = if bucket.label == "high_entropy_token" {
            String::new()
        } else {
            format!(" [{}]", bucket.label)
        };

        if bucket.total_candidates == 0 {
            eprintln!(
                "Entropy calibration{} — {} ({}–{} chars): no candidates found",
                label_suffix, bucket.charset_desc, bucket.min_length, bucket.max_length
            );
            continue;
        }

        eprintln!(
            "Entropy calibration{} — {} ({}–{} chars):",
            label_suffix, bucket.charset_desc, bucket.min_length, bucket.max_length
        );

        // Remember whether any printed level coincides with the configured
        // threshold so the explanatory note can be emitted afterwards.
        let mut threshold_is_listed = false;
        for (idx, &level) in HISTOGRAM_THRESHOLDS.iter().enumerate() {
            let hits = bucket.counts[idx];
            let at_threshold = (level - bucket.configured_threshold).abs() < 1e-9;
            threshold_is_listed |= at_threshold;
            let marker = if at_threshold { " ← threshold" } else { "" };
            eprintln!(" ≥{:.1} bits {:>6}{}", level, hits, marker);
        }
        if !threshold_is_listed {
            eprintln!(
                " (configured threshold {:.2} bits falls between standard levels above)",
                bucket.configured_threshold
            );
        }
        eprintln!(" {} candidates examined", bucket.total_candidates);
    }
}
/// Builds the progress callback handed to the scanner.
///
/// The returned closure forwards every scan-progress update to the reporter
/// under `label`; without a reporter it does nothing.
///
/// # Panics
/// The closure panics if the reporter mutex is poisoned.
fn make_scan_callback(
    progress: Option<SharedProgressReporter>,
    label: impl Into<String>,
) -> impl FnMut(&sanitize_engine::ScanProgress) {
    let scope_label: String = label.into();
    move |scan_progress| {
        let Some(reporter) = progress.as_ref() else {
            return;
        };
        reporter
            .lock()
            .expect("progress reporter lock")
            .update_scan(&scope_label, scan_progress);
    }
}
/// Runs `scanner` over `reader`, writing to `writer`, and collects match
/// locations along the way.
///
/// At most `max_locations` locations are kept. The returned flag is `true`
/// when further matches were reported after the cap was reached. A cap of 0
/// disables collection entirely and never sets the flag.
///
/// # Errors
/// Scanner failures are returned as `"scanner error: …"`.
fn scan_with_locations<R, W>(
    scanner: &StreamScanner,
    reader: R,
    writer: W,
    total_bytes: Option<u64>,
    progress_cb: impl FnMut(&sanitize_engine::ScanProgress),
    max_locations: usize,
) -> Result<
    (
        ScanStats,
        Vec<sanitize_engine::scanner::MatchLocation>,
        bool,
    ),
    String,
>
where
    R: std::io::Read,
    W: std::io::Write,
{
    let mut collected: Vec<sanitize_engine::scanner::MatchLocation> = Vec::new();
    let mut overflowed = false;
    let scan_result = scanner.scan_reader_with_callbacks(
        reader,
        writer,
        total_bytes,
        progress_cb,
        |loc| match max_locations {
            0 => {}
            cap if collected.len() < cap => collected.push(loc),
            _ => overflowed = true,
        },
    );
    match scan_result {
        Ok(stats) => Ok((stats, collected, overflowed)),
        Err(e) => Err(format!("scanner error: {e}")),
    }
}
/// Sanitizes data read from stdin.
///
/// When `--format` maps to a structured extension (and `--force-text` is not
/// set), stdin is buffered up to `--max-structured-size` and run through
/// structured processing followed by a format-preserving scan and the entropy
/// pass. Input larger than the limit, and input without a structured format,
/// is handed to [`process_stdin_streaming`].
///
/// Returns `Ok(true)` when at least one match or replacement was recorded.
///
/// # Errors
/// Returns a message when stdin cannot be read, a scanner cannot be built or
/// fails, output cannot be written, or — under `--strict` — structured
/// processing fails.
#[allow(clippy::too_many_arguments)]
fn process_stdin(
    cli: &Cli,
    output_path: Option<&Path>,
    scanner: &Arc<StreamScanner>,
    registry: &Arc<ProcessorRegistry>,
    store: &Arc<MappingStore>,
    profiles: &[sanitize_engine::processor::FileTypeProfile],
    report_builder: Option<&ReportBuilder>,
    progress: Option<&SharedProgressReporter>,
    llm_collector: Option<&LlmCollector>,
    entropy_configs: &Arc<Vec<EntropyConfig>>,
    entropy_histogram_acc: Option<&Arc<Mutex<Vec<EntropyBuckets>>>>,
) -> Result<bool, String> {
    let structured_ext = if cli.force_text {
        None
    } else {
        cli.format.as_deref().and_then(format_to_ext)
    };
    let mut had_matches = false;
    if let Some(ext) = structured_ext {
        let mut input_bytes = Vec::new();
        let limit = cli.max_structured_size;
        // Read one byte past the limit so over-size input can be detected.
        // `saturating_add` keeps a limit of `u64::MAX` from overflowing, which
        // would otherwise panic (debug) or wrap to `take(0)` and silently
        // treat stdin as empty (release).
        io::stdin()
            .take(limit.saturating_add(1))
            .read_to_end(&mut input_bytes)
            .map_err(|e| format!("failed to read stdin: {e}"))?;
        if input_bytes.len() as u64 > limit {
            warn!(
                max = limit,
                "stdin exceeds --max-structured-size, falling back to streaming scanner"
            );
            // Replay what was already consumed, then continue with the rest
            // of stdin.
            let cursor = Cursor::new(input_bytes);
            let chained = cursor.chain(io::stdin().lock());
            let reader = BufReader::new(chained);
            return process_stdin_streaming(
                reader,
                output_path,
                cli,
                scanner,
                store,
                report_builder,
                progress,
                llm_collector,
                entropy_configs,
                cli.max_match_locations,
                entropy_histogram_acc,
            );
        }
        let store_len_before = store.len();
        let store_snapshot = store.snapshot();
        let label = format!("Processing structured stdin ({ext})");
        return with_progress_scope(progress, &label, |_| {
            let structured_result = try_structured_processing(
                &input_bytes,
                &format!("stdin.{ext}"),
                registry,
                store,
                profiles,
            );
            match structured_result {
                // The structured output itself is discarded; the original
                // input is re-scanned with a scanner built from the store.
                Some(Ok(_structured_bytes)) => {
                    let per_content_scanner =
                        build_format_preserving_scanner(scanner, store, store_snapshot)
                            .map_err(|e| format!("failed to build content scanner: {e}"))?;
                    let (mut output_bytes, scan_stats) =
                        scanner_fallback(&per_content_scanner, &input_bytes)?;
                    let (ent_out, ent_lc) =
                        entropy_scan_bytes(&output_bytes, entropy_configs, store);
                    output_bytes = ent_out;
                    let ent_total: u64 = ent_lc.values().sum();
                    let method = format!("structured+scan:{ext}");
                    // Growth of the store during structured processing is
                    // counted as structured replacements.
                    let structured_reps = store.len().saturating_sub(store_len_before) as u64;
                    let total_replacements =
                        structured_reps + scan_stats.replacements_applied + ent_total;
                    if total_replacements > 0 {
                        had_matches = true;
                    }
                    if let Some(rb) = report_builder {
                        let mut pattern_counts = scan_stats.pattern_counts.clone();
                        for (label, count) in &ent_lc {
                            *pattern_counts.entry(label.clone()).or_insert(0) += count;
                        }
                        let stats = ScanStats {
                            matches_found: total_replacements,
                            replacements_applied: total_replacements,
                            bytes_processed: input_bytes.len() as u64,
                            bytes_output: output_bytes.len() as u64,
                            pattern_counts,
                        };
                        rb.record_file(FileReport::from_scan_stats(
                            "<stdin>".to_string(),
                            &stats,
                            method,
                        ));
                    }
                    maybe_extract_context(&output_bytes, "<stdin>", cli, report_builder);
                    if !cli.dry_run {
                        write_or_collect(&output_bytes, "<stdin>", output_path, llm_collector)?;
                    }
                    return Ok(had_matches);
                }
                Some(Err(e)) => {
                    if cli.strict {
                        return Err(format!("structured processing failed: {e}"));
                    }
                    warn!(error = %e, "structured processing failed, falling back to scanner");
                }
                None => {}
            }
            // Structured processing was unavailable or failed: plain scan of
            // the buffered input.
            let (mut output_bytes, mut stats) = scanner_fallback(scanner, &input_bytes)?;
            let (ent_out, ent_lc) = entropy_scan_bytes(&output_bytes, entropy_configs, store);
            output_bytes = ent_out;
            merge_entropy_counts(&mut stats, ent_lc);
            if stats.matches_found > 0 {
                had_matches = true;
            }
            if let Some(rb) = report_builder {
                rb.record_file(FileReport::from_scan_stats(
                    "<stdin>".to_string(),
                    &stats,
                    "scanner",
                ));
            }
            maybe_extract_context(&output_bytes, "<stdin>", cli, report_builder);
            if !cli.dry_run {
                write_or_collect(&output_bytes, "<stdin>", output_path, llm_collector)?;
            }
            Ok(had_matches)
        });
    }
    let reader = BufReader::new(io::stdin().lock());
    process_stdin_streaming(
        reader,
        output_path,
        cli,
        scanner,
        store,
        report_builder,
        progress,
        llm_collector,
        entropy_configs,
        cli.max_match_locations,
        entropy_histogram_acc,
    )
}
/// Runs the streaming scanner over `reader` (stdin, possibly prefixed by
/// already-buffered bytes) and routes the sanitized output.
///
/// * `--dry-run`: output is discarded (or buffered only for the entropy pass
///   and histogram); the report is recorded and the function returns.
/// * With `output_path`: output is buffered and written atomically when
///   context extraction, LLM collection or entropy scanning needs the whole
///   result; otherwise it is streamed through an `AtomicFileWriter`.
/// * Without `output_path`: output goes to stdout, buffered first under the
///   same conditions.
///
/// Returns `Ok(true)` when at least one match was found.
///
/// # Errors
/// Returns a message on scanner failure, interruption while writing a file,
/// or any output write/flush failure.
#[allow(clippy::too_many_arguments)]
fn process_stdin_streaming<R: io::Read>(
    reader: BufReader<R>,
    output_path: Option<&Path>,
    cli: &Cli,
    scanner: &Arc<StreamScanner>,
    store: &Arc<MappingStore>,
    report_builder: Option<&ReportBuilder>,
    progress: Option<&SharedProgressReporter>,
    llm_collector: Option<&LlmCollector>,
    entropy_configs: &Arc<Vec<EntropyConfig>>,
    max_match_locations: usize,
    entropy_histogram_acc: Option<&Arc<Mutex<Vec<EntropyBuckets>>>>,
) -> Result<bool, String> {
    let label = if cli.dry_run {
        "Scanning stdin (dry-run)"
    } else {
        "Scanning stdin"
    };
    let entropy_active = !entropy_configs.is_empty();
    with_progress_scope(progress, label, |progress| {
        let mut had_matches = false;
        if cli.dry_run {
            let (stats, locs, locs_truncated) = if entropy_active {
                // The entropy pass needs the scanner output, so keep it.
                let mut buf: Vec<u8> = Vec::new();
                let (mut s, locs, tr) = scan_with_locations(
                    scanner,
                    reader,
                    &mut buf,
                    None,
                    make_scan_callback(progress.clone(), label),
                    max_match_locations,
                )?;
                let (_ent_out, ent_lc) = entropy_scan_bytes(&buf, entropy_configs, store);
                merge_entropy_counts(&mut s, ent_lc);
                if let Some(acc) = entropy_histogram_acc {
                    accumulate_entropy_histogram(acc, &buf, entropy_configs);
                }
                (s, locs, tr)
            } else {
                let (s, locs, tr) = scan_with_locations(
                    scanner,
                    reader,
                    io::sink(),
                    None,
                    make_scan_callback(progress.clone(), label),
                    max_match_locations,
                )?;
                (s, locs, tr)
            };
            if stats.matches_found > 0 {
                had_matches = true;
            }
            if let Some(rb) = report_builder {
                rb.record_file(
                    FileReport::from_scan_stats("<stdin>".to_string(), &stats, "scanner")
                        .with_match_locations(locs, locs_truncated),
                );
            }
            info!(
                matches = stats.matches_found,
                replacements = stats.replacements_applied,
                "dry-run complete"
            );
            return Ok(had_matches);
        }
        // These features need the complete sanitized output in memory.
        let needs_buffer = cli.extract_context || llm_collector.is_some() || entropy_active;
        if let Some(out_path) = output_path {
            if needs_buffer {
                let mut buf: Vec<u8> = Vec::new();
                let (mut stats, locs, locs_truncated) = scan_with_locations(
                    scanner,
                    reader,
                    &mut buf,
                    None,
                    make_scan_callback(progress.clone(), label),
                    max_match_locations,
                )?;
                if is_interrupted() {
                    return Err("interrupted — partial output discarded".into());
                }
                if entropy_active {
                    let (ent_out, ent_lc) = entropy_scan_bytes(&buf, entropy_configs, store);
                    buf = ent_out;
                    merge_entropy_counts(&mut stats, ent_lc);
                }
                if stats.matches_found > 0 {
                    had_matches = true;
                }
                if let Some(rb) = report_builder {
                    rb.record_file(
                        FileReport::from_scan_stats("<stdin>".to_string(), &stats, "scanner")
                            .with_match_locations(locs, locs_truncated),
                    );
                }
                maybe_extract_context(&buf, "<stdin>", cli, report_builder);
                if let Some(c) = llm_collector {
                    maybe_collect_for_llm(&buf, "<stdin>", Some(c));
                } else {
                    atomic_write(out_path, &buf)
                        .map_err(|e| format!("failed to write {}: {e}", out_path.display()))?;
                    info!(output = %out_path.display(), "output written");
                }
            } else {
                let mut atomic_writer = AtomicFileWriter::new(out_path)
                    .map_err(|e| format!("failed to create output: {e}"))?;
                let (stats, locs, locs_truncated) = scan_with_locations(
                    scanner,
                    reader,
                    &mut atomic_writer,
                    None,
                    make_scan_callback(progress.clone(), label),
                    max_match_locations,
                )?;
                // Checked before `finish()` so an interrupted run does not
                // finalize the output.
                if is_interrupted() {
                    return Err("interrupted — partial output discarded".into());
                }
                atomic_writer
                    .finish()
                    .map_err(|e| format!("failed to finalize output: {e}"))?;
                if stats.matches_found > 0 {
                    had_matches = true;
                }
                if let Some(rb) = report_builder {
                    rb.record_file(
                        FileReport::from_scan_stats("<stdin>".to_string(), &stats, "scanner")
                            .with_match_locations(locs, locs_truncated),
                    );
                }
            }
        } else if needs_buffer {
            let mut buf: Vec<u8> = Vec::new();
            let (mut stats, locs, locs_truncated) = scan_with_locations(
                scanner,
                reader,
                &mut buf,
                None,
                make_scan_callback(progress.clone(), label),
                max_match_locations,
            )?;
            if entropy_active {
                let (ent_out, ent_lc) = entropy_scan_bytes(&buf, entropy_configs, store);
                buf = ent_out;
                merge_entropy_counts(&mut stats, ent_lc);
            }
            if stats.matches_found > 0 {
                had_matches = true;
            }
            if let Some(rb) = report_builder {
                rb.record_file(
                    FileReport::from_scan_stats("<stdin>".to_string(), &stats, "scanner")
                        .with_match_locations(locs, locs_truncated),
                );
            }
            maybe_extract_context(&buf, "<stdin>", cli, report_builder);
            if let Some(c) = llm_collector {
                maybe_collect_for_llm(&buf, "<stdin>", Some(c));
            } else {
                // Clear the live progress line before payload bytes are
                // written to stdout.
                if let Some(p) = progress {
                    p.lock().expect("progress reporter lock").clear_live_line();
                }
                let stdout = io::stdout();
                stdout
                    .lock()
                    .write_all(&buf)
                    .map_err(|e| format!("failed to write to stdout: {e}"))?;
            }
        } else {
            if let Some(ref p) = progress {
                p.lock().expect("progress reporter lock").clear_live_line();
            }
            let stdout = io::stdout();
            let mut writer = BufWriter::new(stdout.lock());
            let (stats, locs, locs_truncated) = scan_with_locations(
                scanner,
                reader,
                &mut writer,
                None,
                make_scan_callback(progress.clone(), label),
                max_match_locations,
            )?;
            // `BufWriter` discards flush errors when dropped; flush explicitly
            // so a failed final write to stdout is reported instead of lost.
            writer
                .flush()
                .map_err(|e| format!("failed to write to stdout: {e}"))?;
            if stats.matches_found > 0 {
                had_matches = true;
            }
            if let Some(rb) = report_builder {
                rb.record_file(
                    FileReport::from_scan_stats("<stdin>".to_string(), &stats, "scanner")
                        .with_match_locations(locs, locs_truncated),
                );
            }
        }
        Ok(had_matches)
    })
}
#[allow(clippy::too_many_arguments)]
fn process_plain_file(
input: &Path,
cli: &Cli,
output_path: Option<&Path>,
scanner: &Arc<StreamScanner>,
registry: &Arc<ProcessorRegistry>,
store: &Arc<MappingStore>,
profiles: &[sanitize_engine::processor::FileTypeProfile],
report_builder: Option<&ReportBuilder>,
progress: Option<&SharedProgressReporter>,
llm_collector: Option<&LlmCollector>,
entropy_configs: &Arc<Vec<EntropyConfig>>,
max_match_locations: usize,
entropy_histogram_acc: Option<&Arc<Mutex<Vec<EntropyBuckets>>>>,
) -> Result<bool, String> {
let mut sample = [0u8; 512];
let sample_len = {
let mut f = fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?;
io::Read::read(&mut f, &mut sample)
.map_err(|e| format!("failed to read {}: {e}", input.display()))?
};
if !cli.include_binary && looks_binary(&sample[..sample_len]) {
let file_size = sample_len as u64;
warn!(
file = %input.display(),
bytes = file_size,
"skipping binary file — use --include-binary to process it"
);
return Ok(false);
}
let filename = if let Some(ref fmt) = cli.format {
format_to_ext(fmt)
.map(|ext| format!("override.{ext}"))
.unwrap_or_default()
} else {
input
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("")
.to_string()
};
let structured_ext = matches!(
filename.rsplit('.').next().unwrap_or(""),
"json"
| "jsonl"
| "ndjson"
| "yaml"
| "yml"
| "xml"
| "csv"
| "tsv"
| "rb"
| "conf"
| "cfg"
| "ini"
| "env"
| "properties"
| "toml"
) || {
filename
.rsplit('/')
.next()
.unwrap_or(&filename)
.starts_with(".env")
};
let mut had_matches = false;
if structured_ext && !cli.force_text {
let file_meta =
fs::metadata(input).map_err(|e| format!("failed to stat {}: {e}", input.display()))?;
let file_size = file_meta.len();
let maybe_streaming = profiles
.iter()
.find(|p| p.matches_filename(&filename))
.and_then(|p| {
registry
.get(&p.processor)
.filter(|proc| proc.supports_streaming())
.map(|proc| (p.clone(), Arc::clone(proc)))
});
if let Some((streaming_profile, streaming_proc)) = maybe_streaming {
let store_snapshot = store.snapshot();
{
let mut reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
streaming_proc
.process_stream(&mut reader, &mut io::sink(), &streaming_profile, store)
.map_err(|e| {
format!("structured pass 1 failed for {}: {e}", input.display())
})?;
}
let per_file_scanner = Arc::new(
build_format_preserving_scanner(scanner, store, store_snapshot)
.map_err(|e| format!("failed to build per-file scanner: {e}"))?,
);
let ext = filename.rsplit('.').next().unwrap_or("unknown");
let method = format!("structured+scan:{ext}");
let sz = file_size;
if cli.dry_run {
let label = format!("Scanning {} (dry-run)", input.display());
let progress_label = label.clone();
return with_progress_scope(progress, &label, move |progress| {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let (stats, locs, locs_truncated) = scan_with_locations(
&per_file_scanner,
reader,
io::sink(),
Some(sz),
make_scan_callback(progress.clone(), &progress_label),
max_match_locations,
)?;
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(
input.display().to_string(),
&stats,
&method,
)
.with_match_locations(locs, locs_truncated),
);
}
info!(
matches = stats.matches_found,
replacements = stats.replacements_applied,
"dry-run complete"
);
Ok(had_matches)
});
} else if let Some(out_path) = output_path {
let label = format!("Scanning {}", input.display());
let progress_label = label.clone();
let llm_opt = llm_collector.cloned();
return with_progress_scope(progress, &label, move |progress| {
if llm_opt.is_some() {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let mut buf: Vec<u8> = Vec::new();
let (stats, locs, locs_truncated) = scan_with_locations(
&per_file_scanner,
reader,
&mut buf,
Some(sz),
make_scan_callback(progress.clone(), &progress_label),
max_match_locations,
)?;
if is_interrupted() {
return Err("interrupted — partial output discarded".into());
}
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(
input.display().to_string(),
&stats,
&method,
)
.with_match_locations(locs, locs_truncated),
);
}
maybe_extract_context(
&buf,
&input.display().to_string(),
cli,
report_builder,
);
maybe_collect_for_llm(&buf, &abs_label(input), llm_opt.as_ref());
} else {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let mut atomic_writer = AtomicFileWriter::new(out_path)
.map_err(|e| format!("failed to create output: {e}"))?;
let (stats, locs, locs_truncated) = scan_with_locations(
&per_file_scanner,
reader,
&mut atomic_writer,
Some(sz),
make_scan_callback(progress.clone(), &progress_label),
max_match_locations,
)?;
if is_interrupted() {
return Err("interrupted — partial output discarded".into());
}
atomic_writer
.finish()
.map_err(|e| format!("failed to finalize output: {e}"))?;
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(
input.display().to_string(),
&stats,
&method,
)
.with_match_locations(locs, locs_truncated),
);
}
maybe_extract_context_reader(
out_path,
&input.display().to_string(),
cli,
report_builder,
);
}
Ok(had_matches)
});
} else {
let label = format!("Scanning {}", input.display());
let progress_label = label.clone();
let llm_opt = llm_collector.cloned();
return with_progress_scope(progress, &label, move |progress| {
let needs_buffer = (cli.extract_context || llm_opt.is_some())
&& sz <= MAX_CONTEXT_BUFFER_BYTES;
if needs_buffer {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let mut buf: Vec<u8> = Vec::new();
let (stats, locs, locs_truncated) = scan_with_locations(
&per_file_scanner,
reader,
&mut buf,
Some(sz),
make_scan_callback(progress.clone(), &progress_label),
max_match_locations,
)?;
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(
input.display().to_string(),
&stats,
&method,
)
.with_match_locations(locs, locs_truncated),
);
}
maybe_extract_context(
&buf,
&input.display().to_string(),
cli,
report_builder,
);
if llm_opt.is_some() {
maybe_collect_for_llm(&buf, &abs_label(input), llm_opt.as_ref());
} else {
let stdout = io::stdout();
stdout
.lock()
.write_all(&buf)
.map_err(|e| format!("failed to write to stdout: {e}"))?;
}
} else {
if cli.extract_context {
warn!(
file = %input.display(),
size = sz,
max = MAX_CONTEXT_BUFFER_BYTES,
"--extract-context: file too large to buffer for stdout; \
use -o/--output to write to a file for context extraction"
);
}
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let stdout = io::stdout();
let writer = BufWriter::new(stdout.lock());
let (stats, locs, locs_truncated) = scan_with_locations(
&per_file_scanner,
reader,
writer,
Some(sz),
make_scan_callback(progress.clone(), &progress_label),
max_match_locations,
)?;
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(
input.display().to_string(),
&stats,
&method,
)
.with_match_locations(locs, locs_truncated),
);
}
}
Ok(had_matches)
});
}
}
if file_size > cli.max_structured_size {
warn!(
file = %input.display(),
size = file_size,
max = cli.max_structured_size,
"structured file exceeds size limit, falling back to streaming scanner"
);
} else {
let input_bytes =
fs::read(input).map_err(|e| format!("failed to read {}: {e}", input.display()))?;
let store_len_before = store.len();
let store_snapshot = store.snapshot();
let label = format!("Processing structured {}", input.display());
return with_progress_scope(progress, &label, |_| {
let structured_result =
try_structured_processing(&input_bytes, &filename, registry, store, profiles);
let (output_bytes, method, _was_structured, fallback_stats) =
match structured_result {
Some(Ok(_structured_bytes)) => {
let ext = filename.rsplit('.').next().unwrap_or("unknown");
let per_file_scanner =
build_format_preserving_scanner(scanner, store, store_snapshot)
.map_err(|e| {
format!("failed to build per-file scanner: {e}")
})?;
let (scanned_bytes, scan_stats) =
scanner_fallback(&per_file_scanner, &input_bytes)?;
(
scanned_bytes,
format!("structured+scan:{ext}"),
true,
Some(scan_stats),
)
}
Some(Err(e)) => {
if cli.strict {
return Err(format!("structured processing failed: {e}"));
}
warn!(error = %e, "structured processing failed, falling back to scanner");
let (out, stats) = scanner_fallback(scanner, &input_bytes)?;
(out, "scanner".into(), false, Some(stats))
}
None => {
let (out, stats) = scanner_fallback(scanner, &input_bytes)?;
(out, "scanner".into(), false, Some(stats))
}
};
let (output_bytes, fallback_stats) = {
let (ent_out, ent_lc) =
entropy_scan_bytes(&output_bytes, entropy_configs, store);
let stats = fallback_stats.map(|mut s| {
merge_entropy_counts(&mut s, ent_lc);
s
});
(ent_out, stats)
};
if cli.dry_run || report_builder.is_some() || cli.fail_on_match {
let _ = store_len_before; let replacements = fallback_stats
.as_ref()
.map_or(0, |s| s.replacements_applied);
if replacements > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
let stats = fallback_stats
.map(|mut s| {
s.matches_found = replacements;
s.replacements_applied = replacements;
s.bytes_processed = input_bytes.len() as u64;
s.bytes_output = output_bytes.len() as u64;
s
})
.unwrap_or_else(|| ScanStats {
matches_found: replacements,
replacements_applied: replacements,
bytes_processed: input_bytes.len() as u64,
bytes_output: output_bytes.len() as u64,
..Default::default()
});
rb.record_file(FileReport::from_scan_stats(
input.display().to_string(),
&stats,
method,
));
}
if cli.dry_run {
info!(
matches = replacements,
replacements = replacements,
"dry-run complete"
);
return Ok(had_matches);
}
}
maybe_extract_context(
&output_bytes,
&input.display().to_string(),
cli,
report_builder,
);
write_or_collect(
&output_bytes,
&input.display().to_string(),
output_path,
llm_collector,
)?;
Ok(had_matches)
});
}
}
let method = "scanner";
let entropy_active = !entropy_configs.is_empty();
if cli.dry_run {
let label = format!("Scanning {} (dry-run)", input.display());
let progress_label = label.clone();
let ent_cfgs = Arc::clone(entropy_configs);
let store_arc = Arc::clone(store);
with_progress_scope(progress, &label, move |progress| {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let progress_for_scan = progress.clone();
let sz = file_size(input)?;
let (stats, locs, locs_truncated) = if entropy_active {
let mut buf: Vec<u8> = Vec::new();
let (mut s, locs, tr) = scan_with_locations(
scanner,
reader,
&mut buf,
Some(sz),
make_scan_callback(progress_for_scan, &progress_label),
max_match_locations,
)?;
let (_ent_out, ent_lc) = entropy_scan_bytes(&buf, &ent_cfgs, &store_arc);
merge_entropy_counts(&mut s, ent_lc);
if let Some(acc) = entropy_histogram_acc {
accumulate_entropy_histogram(acc, &buf, &ent_cfgs);
}
(s, locs, tr)
} else {
scan_with_locations(
scanner,
reader,
io::sink(),
Some(sz),
make_scan_callback(progress_for_scan, &progress_label),
max_match_locations,
)?
};
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(input.display().to_string(), &stats, method)
.with_match_locations(locs, locs_truncated),
);
}
info!(
matches = stats.matches_found,
replacements = stats.replacements_applied,
"dry-run complete"
);
Ok(had_matches)
})
} else if let Some(out_path) = output_path {
let label = format!("Scanning {}", input.display());
let progress_label = label.clone();
let llm_opt = llm_collector.cloned();
let ent_cfgs = Arc::clone(entropy_configs);
let store_arc = Arc::clone(store);
with_progress_scope(progress, &label, move |progress| {
if llm_opt.is_some() || entropy_active {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let mut buf: Vec<u8> = Vec::new();
let progress_for_scan = progress.clone();
let (mut stats, locs, locs_truncated) = scan_with_locations(
scanner,
reader,
&mut buf,
Some(file_size(input)?),
make_scan_callback(progress_for_scan, &progress_label),
max_match_locations,
)?;
if is_interrupted() {
return Err("interrupted — partial output discarded".into());
}
if entropy_active {
let (ent_out, ent_lc) = entropy_scan_bytes(&buf, &ent_cfgs, &store_arc);
buf = ent_out;
merge_entropy_counts(&mut stats, ent_lc);
}
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(input.display().to_string(), &stats, method)
.with_match_locations(locs, locs_truncated),
);
}
maybe_extract_context(&buf, &input.display().to_string(), cli, report_builder);
if llm_opt.is_some() {
maybe_collect_for_llm(&buf, &abs_label(input), llm_opt.as_ref());
} else {
atomic_write(out_path, &buf)
.map_err(|e| format!("failed to write output: {e}"))?;
}
} else {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let mut atomic_writer = AtomicFileWriter::new(out_path)
.map_err(|e| format!("failed to create output: {e}"))?;
let progress_for_scan = progress.clone();
let (stats, locs, locs_truncated) = scan_with_locations(
scanner,
reader,
&mut atomic_writer,
Some(file_size(input)?),
make_scan_callback(progress_for_scan, &progress_label),
max_match_locations,
)?;
if is_interrupted() {
return Err("interrupted — partial output discarded".into());
}
atomic_writer
.finish()
.map_err(|e| format!("failed to finalize output: {e}"))?;
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(input.display().to_string(), &stats, method)
.with_match_locations(locs, locs_truncated),
);
}
maybe_extract_context_reader(
out_path,
&input.display().to_string(),
cli,
report_builder,
);
}
Ok(had_matches)
})
} else {
let label = format!("Scanning {}", input.display());
let progress_label = label.clone();
let llm_opt = llm_collector.cloned();
let ent_cfgs = Arc::clone(entropy_configs);
let store_arc = Arc::clone(store);
with_progress_scope(progress, &label, move |progress| {
let sz = file_size(input)?;
let needs_buffer = (cli.extract_context || llm_opt.is_some() || entropy_active)
&& sz <= MAX_CONTEXT_BUFFER_BYTES;
if needs_buffer {
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let mut buf: Vec<u8> = Vec::new();
let progress_for_scan = progress.clone();
let (mut stats, locs, locs_truncated) = scan_with_locations(
scanner,
reader,
&mut buf,
Some(sz),
make_scan_callback(progress_for_scan, &progress_label),
max_match_locations,
)?;
if entropy_active {
let (ent_out, ent_lc) = entropy_scan_bytes(&buf, &ent_cfgs, &store_arc);
buf = ent_out;
merge_entropy_counts(&mut stats, ent_lc);
}
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(input.display().to_string(), &stats, method)
.with_match_locations(locs, locs_truncated),
);
}
maybe_extract_context(&buf, &input.display().to_string(), cli, report_builder);
if llm_opt.is_some() {
maybe_collect_for_llm(&buf, &abs_label(input), llm_opt.as_ref());
} else {
let stdout = io::stdout();
stdout
.lock()
.write_all(&buf)
.map_err(|e| format!("failed to write to stdout: {e}"))?;
}
} else {
if cli.extract_context {
warn!(
file = %input.display(),
size = sz,
max = MAX_CONTEXT_BUFFER_BYTES,
"--extract-context: file too large to buffer for stdout; \
use -o/--output to write to a file for context extraction"
);
}
let reader = BufReader::new(
fs::File::open(input)
.map_err(|e| format!("failed to open {}: {e}", input.display()))?,
);
let stdout = io::stdout();
let writer = BufWriter::new(stdout.lock());
let progress_for_scan = progress.clone();
let (stats, locs, locs_truncated) = scan_with_locations(
scanner,
reader,
writer,
Some(sz),
make_scan_callback(progress_for_scan, &progress_label),
max_match_locations,
)?;
if stats.matches_found > 0 {
had_matches = true;
}
if let Some(rb) = report_builder {
rb.record_file(
FileReport::from_scan_stats(input.display().to_string(), &stats, method)
.with_match_locations(locs, locs_truncated),
);
}
}
Ok(had_matches)
})
}
}
/// Appends every non-empty original value recorded in `store` to the YAML
/// secrets file at `path`, as a `literal` entry labelled `discovered`.
///
/// Entries whose pattern already appears in the existing file are skipped.
/// Returns the number of entries added; when that is 0 the file is left
/// untouched.
///
/// # Errors
/// Returns a message when the existing file cannot be read, is not valid
/// UTF-8, or has content that does not parse as a list of secret entries (the
/// file is never overwritten in that case), and when serialization or the
/// final atomic write fails.
fn save_discovered_secrets(
    store: &Arc<MappingStore>,
    path: &Path,
) -> std::result::Result<usize, String> {
    let mut new_entries: Vec<SecretEntry> = store
        .iter()
        .filter(|(_, original, _)| !original.is_empty())
        .map(|(category, original, _)| SecretEntry {
            pattern: original.to_string(),
            kind: "literal".into(),
            category: category.to_string(),
            label: Some("discovered".into()),
            values: vec![],
            min_length: None,
            max_length: None,
            threshold: None,
            charset: None,
        })
        .collect();
    if new_entries.is_empty() {
        return Ok(0);
    }
    let existing: Vec<SecretEntry> = if path.exists() {
        let raw = fs::read(path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
        let text = std::str::from_utf8(&raw)
            .map_err(|_| format!("{} is not valid UTF-8", path.display()))?;
        // A file holding only blank lines, comments or document markers has
        // no entries to preserve.
        let has_content = text
            .lines()
            .map(str::trim)
            .any(|line| !line.is_empty() && !line.starts_with('#') && line != "---");
        if has_content {
            // Treating an unparsable file as empty would overwrite the
            // user's existing secrets below, so fail instead.
            serde_yaml_ng::from_str::<Vec<SecretEntry>>(text).map_err(|e| {
                format!(
                    "refusing to overwrite {}: existing content is not a valid secrets list: {e}",
                    path.display()
                )
            })?
        } else {
            vec![]
        }
    } else {
        vec![]
    };
    let existing_patterns: std::collections::HashSet<&str> =
        existing.iter().map(|e| e.pattern.as_str()).collect();
    new_entries.retain(|e| !existing_patterns.contains(e.pattern.as_str()));
    let added = new_entries.len();
    if added == 0 {
        return Ok(0);
    }
    // Existing entries are written back first, followed by the new ones.
    let mut all_entries: Vec<&SecretEntry> = existing.iter().collect();
    all_entries.extend(new_entries.iter());
    let yaml = serde_yaml_ng::to_string(&all_entries)
        .map_err(|e| format!("failed to serialize discovered secrets: {e}"))?;
    atomic_write(path, yaml.as_bytes())
        .map_err(|e| format!("failed to write {}: {e}", path.display()))?;
    Ok(added)
}
/// Loads file-type profiles from `path` and validates their glob patterns.
///
/// The parser is picked from the file extension: `json` is read as JSON,
/// `yaml` / `yml` as YAML, and anything else is tried as JSON first and as
/// YAML second. After parsing, every `include` and `exclude` pattern of every
/// profile is compiled with `glob::Pattern::new` so malformed globs are
/// rejected before any processing starts.
///
/// # Errors
///
/// Returns a human-readable message when the file cannot be read, is not
/// valid UTF-8, fails to parse, or contains an invalid glob.
fn load_profiles(path: &Path) -> Result<Vec<sanitize_engine::processor::FileTypeProfile>, String> {
    let bytes = match fs::read(path) {
        Ok(bytes) => bytes,
        Err(e) => return Err(format!("failed to read profile '{}': {e}", path.display())),
    };
    let Ok(text) = std::str::from_utf8(&bytes) else {
        return Err(format!("profile '{}' is not valid UTF-8", path.display()));
    };
    let extension = path.extension().and_then(|e| e.to_str()).unwrap_or("");
    let parsed: Result<Vec<sanitize_engine::processor::FileTypeProfile>, String> =
        if extension == "json" {
            serde_json::from_str(text)
                .map_err(|e| format!("profile '{}': invalid JSON: {e}", path.display()))
        } else if extension == "yaml" || extension == "yml" {
            serde_yaml_ng::from_str(text)
                .map_err(|e| format!("profile '{}': invalid YAML: {e}", path.display()))
        } else {
            // Unknown extension: JSON first, YAML as the fallback. Only the
            // YAML error is reported when both attempts fail.
            match serde_json::from_str(text) {
                Ok(list) => Ok(list),
                Err(_) => serde_yaml_ng::from_str(text).map_err(|e| {
                    format!(
                        "profile '{}': could not parse as JSON or YAML: {e}",
                        path.display()
                    )
                }),
            }
        };
    let profiles = parsed?;
    for (i, profile) in profiles.iter().enumerate() {
        let globs = profile.include.iter().chain(profile.exclude.iter());
        for pat in globs {
            if let Err(e) = glob::Pattern::new(pat) {
                return Err(format!(
                    "profile '{}' entry {i}: invalid glob '{pat}': {e}",
                    path.display()
                ));
            }
        }
    }
    Ok(profiles)
}
/// Runs the structured processor for the first profile that matches `filename`.
///
/// Returns `None` when no profile matches, or when the registry reports
/// `Ok(None)` for the content. Otherwise returns `Some` holding either the
/// processed bytes or the processor's error rendered as a string.
fn try_structured_processing(
    content: &[u8],
    filename: &str,
    registry: &Arc<ProcessorRegistry>,
    store: &Arc<MappingStore>,
    profiles: &[sanitize_engine::processor::FileTypeProfile],
) -> Option<Result<Vec<u8>, String>> {
    let matching = profiles
        .iter()
        .find(|candidate| candidate.matches_filename(filename))?;
    // `transpose` maps Ok(Some(x)) -> Some(Ok(x)), Ok(None) -> None and
    // Err(e) -> Some(Err(e)), which is exactly the shape callers expect.
    registry
        .process(content, matching, store)
        .map_err(|e| e.to_string())
        .transpose()
}
/// Derives a scanner for the pass that follows structured processing.
///
/// Every value the store recorded since `snapshot` whose length is at least 4
/// is turned into a literal pattern labelled `field:<value>`; the resulting
/// list is handed to `for_structured_pass` on the base scanner. Values that
/// cannot be compiled into a literal pattern are logged and skipped.
fn build_format_preserving_scanner(
    base_scanner: &Arc<StreamScanner>,
    store: &Arc<MappingStore>,
    snapshot: usize,
) -> Result<StreamScanner, sanitize_engine::error::SanitizeError> {
    let mut extra: Vec<ScanPattern> = Vec::new();
    for (category, original, _) in store.iter_since(snapshot) {
        // Very short values are not promoted to scan patterns.
        if original.len() < 4 {
            continue;
        }
        let s = original.as_str();
        match ScanPattern::from_literal(s, category, format!("field:{s}")) {
            Ok(pat) => extra.push(pat),
            Err(e) => {
                warn!(value = %s, error = %e, "could not compile field literal pattern");
            }
        }
    }
    base_scanner.for_structured_pass(extra)
}
/// Sanitizes one archive (`tar`, `tar.gz`, or `zip`) and reports whether any
/// file inside it was processed.
///
/// An `ArchiveProcessor` is configured from `deps`, the CLI's archive-depth
/// and force-text settings, and `filter`. With `suppress_inner_parallelism`
/// the processor's parallel threshold is raised to `usize::MAX`. When a
/// progress reporter is available, archive progress is forwarded to it under
/// this archive's label.
///
/// In dry-run mode the archive is processed into a discarding writer and no
/// output file is produced. Otherwise output goes through an
/// `AtomicFileWriter` for `output_path`, which is finalised only when no
/// interrupt has been observed after processing.
///
/// Returns `Ok(true)` when `files_processed` is greater than zero.
/// `_max_match_locations` is accepted but not used by this function.
#[allow(clippy::too_many_arguments)]
fn process_archive(
    input: &Path,
    cli: &Cli,
    output_path: &Path,
    deps: ArchiveDeps<'_>,
    format: ArchiveFormat,
    filter: ArchiveFilter,
    report_builder: Option<&ReportBuilder>,
    progress: Option<&SharedProgressReporter>,
    suppress_inner_parallelism: bool,
    _max_match_locations: usize,
) -> Result<bool, String> {
    let label = format!("Processing archive {}", input.display());
    with_progress_scope(progress, &label, |progress| {
        // Assemble the processor step by step; each optional setting is
        // layered on top of the base configuration.
        let mut archive_proc = ArchiveProcessor::new(
            Arc::clone(deps.registry),
            Arc::clone(deps.scanner),
            Arc::clone(deps.store),
            deps.profiles.to_vec(),
        )
        .with_max_depth(cli.max_archive_depth)
        .with_force_text(cli.force_text)
        .with_filter(filter);
        if suppress_inner_parallelism {
            archive_proc = archive_proc.with_parallel_threshold(usize::MAX);
        }
        if let Some(progress) = &progress {
            let label = label.clone();
            let progress = Arc::clone(progress);
            archive_proc = archive_proc.with_progress_callback(Arc::new(
                move |archive_progress: &ArchiveProgress| {
                    progress
                        .lock()
                        .unwrap()
                        .update_archive(&label, archive_progress);
                },
            ));
        }

        if cli.dry_run {
            // Every format reads the same input; only the sink differs.
            let file =
                fs::File::open(input).map_err(|e| format!("failed to open archive: {e}"))?;
            let mut reader = BufReader::new(file);
            let stats = match format {
                ArchiveFormat::Tar => archive_proc
                    .process_tar(reader, io::sink())
                    .map_err(|e| format!("archive error: {e}"))?,
                ArchiveFormat::TarGz => archive_proc
                    .process_tar_gz(reader, io::sink())
                    .map_err(|e| format!("archive error: {e}"))?,
                ArchiveFormat::Zip => {
                    let mut null_out = NullSeekWriter { pos: 0, len: 0 };
                    archive_proc
                        .process_zip(&mut reader, &mut null_out)
                        .map_err(|e| format!("archive error: {e}"))?
                }
            };
            if let Some(rb) = report_builder {
                record_archive_stats(rb, &stats);
            }
            info!(
                files = stats.files_processed,
                structured = stats.structured_hits,
                scanner = stats.scanner_fallback,
                "dry-run archive processing complete"
            );
            return Ok(stats.files_processed > 0);
        }

        // Checked between processing and finalising: bailing out here leaves
        // the atomic writer unfinished.
        let ensure_not_interrupted = || -> Result<(), String> {
            if is_interrupted() {
                Err("interrupted — partial output discarded".into())
            } else {
                Ok(())
            }
        };

        let stats = match format {
            ArchiveFormat::Tar => {
                let file =
                    fs::File::open(input).map_err(|e| format!("failed to open input: {e}"))?;
                let mut atomic_writer = AtomicFileWriter::new(output_path)
                    .map_err(|e| format!("failed to create output: {e}"))?;
                let stats = archive_proc
                    .process_tar(BufReader::new(file), &mut atomic_writer)
                    .map_err(|e| format!("archive processing error: {e}"))?;
                ensure_not_interrupted()?;
                atomic_writer
                    .finish()
                    .map_err(|e| format!("failed to finalize output: {e}"))?;
                stats
            }
            ArchiveFormat::TarGz => {
                let file =
                    fs::File::open(input).map_err(|e| format!("failed to open input: {e}"))?;
                let mut atomic_writer = AtomicFileWriter::new(output_path)
                    .map_err(|e| format!("failed to create output: {e}"))?;
                let stats = archive_proc
                    .process_tar_gz(BufReader::new(file), &mut atomic_writer)
                    .map_err(|e| format!("archive processing error: {e}"))?;
                ensure_not_interrupted()?;
                atomic_writer
                    .finish()
                    .map_err(|e| format!("failed to finalize output: {e}"))?;
                stats
            }
            ArchiveFormat::Zip => {
                let file =
                    fs::File::open(input).map_err(|e| format!("failed to open archive: {e}"))?;
                let mut reader = BufReader::new(file);
                let mut atomic_writer = AtomicFileWriter::new(output_path)
                    .map_err(|e| format!("failed to create output: {e}"))?;
                let stats = archive_proc
                    .process_zip(&mut reader, &mut atomic_writer)
                    .map_err(|e| format!("archive processing error: {e}"))?;
                ensure_not_interrupted()?;
                atomic_writer
                    .finish()
                    .map_err(|e| format!("failed to finalize output: {e}"))?;
                stats
            }
        };
        if let Some(rb) = report_builder {
            record_archive_stats(rb, &stats);
        }
        print_archive_stats(output_path, &stats);
        Ok(stats.files_processed > 0)
    })
}
/// Returns the length in bytes reported by the metadata of `path`.
///
/// # Errors
///
/// Returns `failed to stat <path>: <reason>` when the metadata cannot be read.
fn file_size(path: &Path) -> Result<u64, String> {
    match fs::metadata(path) {
        Ok(metadata) => Ok(metadata.len()),
        Err(e) => Err(format!("failed to stat {}: {e}", path.display())),
    }
}
/// Records the outcome of an archive run in the report.
///
/// Each entry of `stats.file_methods` yields one `FileReport`: built from the
/// file's scan statistics when `stats.file_scan_stats` has them, otherwise a
/// zeroed report that carries only the path and method. When the archive
/// listed no per-file methods at all, a single `(archive)` summary entry is
/// recorded with the archive's total input and output byte counts.
fn record_archive_stats(rb: &ReportBuilder, stats: &sanitize_engine::ArchiveStats) {
    for (path, method) in &stats.file_methods {
        match stats.file_scan_stats.get(path) {
            Some(scan_stats) => rb.record_file(FileReport::from_scan_stats(
                path.clone(),
                scan_stats,
                method.clone(),
            )),
            // No scan statistics for this member: record it with zero counts.
            None => rb.record_file(FileReport {
                path: path.clone(),
                matches: 0,
                replacements: 0,
                bytes_processed: 0,
                bytes_output: 0,
                pattern_counts: std::collections::HashMap::new(),
                method: method.clone(),
                log_context: None,
                match_locations: None,
            }),
        }
    }
    if !stats.file_methods.is_empty() {
        return;
    }
    // Nothing was recorded per file; fall back to one archive-level summary.
    let summary = format!(
        "archive({} files, {} structured, {} scanner)",
        stats.files_processed, stats.structured_hits, stats.scanner_fallback
    );
    rb.record_file(FileReport {
        path: "(archive)".into(),
        matches: 0,
        replacements: 0,
        bytes_processed: stats.total_input_bytes,
        bytes_output: stats.total_output_bytes,
        pattern_counts: std::collections::HashMap::new(),
        method: summary,
        log_context: None,
        match_locations: None,
    });
}
/// Emits an info-level log entry summarising a completed archive run.
///
/// The entry carries the processed-file count, the structured and scanner
/// counters from `stats`, and the path the archive was written to.
fn print_archive_stats(output: &Path, stats: &sanitize_engine::ArchiveStats) {
    let destination = output.display();
    info!(
        files = stats.files_processed,
        structured = stats.structured_hits,
        scanner = stats.scanner_fallback,
        output = %destination,
        "archive processing complete"
    );
}
/// Writes `data` to `output_path`, or to standard output when no path is given.
///
/// File output goes through `atomic_write` and is followed by an info log
/// entry naming the path. Stdout output is written through a locked handle.
///
/// # Errors
///
/// Returns a message describing the failed write.
fn write_output(output_path: Option<&Path>, data: &[u8]) -> Result<(), String> {
    let Some(path) = output_path else {
        return io::stdout()
            .lock()
            .write_all(data)
            .map_err(|e| format!("failed to write to stdout: {e}"));
    };
    atomic_write(path, data).map_err(|e| format!("failed to write {}: {e}", path.display()))?;
    info!(output = %path.display(), "output written");
    Ok(())
}
/// Builds the log-context configuration from the CLI options.
///
/// Context line count, maximum match count and case sensitivity are always
/// applied. When keywords were supplied they either replace the configured
/// keyword set (`context_keywords_replace`) or are added to it.
fn build_log_context_config(cli: &Cli) -> LogContextConfig {
    let base = LogContextConfig::new()
        .with_context_lines(cli.context_lines)
        .with_max_matches(cli.max_context_matches)
        .case_sensitive(cli.context_case_sensitive);
    if cli.context_keywords.is_empty() {
        return base;
    }
    let keywords = cli.context_keywords.iter().cloned();
    if cli.context_keywords_replace {
        base.with_keywords(keywords)
    } else {
        base.with_extra_keywords(keywords)
    }
}
/// Attaches extracted log context for an in-memory buffer to the report.
///
/// Does nothing unless `--extract-context` is enabled and a report builder is
/// present. The bytes are decoded lossily as UTF-8 before extraction, and the
/// result is stored under `report_path`.
fn maybe_extract_context(
    bytes: &[u8],
    report_path: &str,
    cli: &Cli,
    report_builder: Option<&ReportBuilder>,
) {
    if let (true, Some(rb)) = (cli.extract_context, report_builder) {
        let text = String::from_utf8_lossy(bytes);
        let config = build_log_context_config(cli);
        let context = extract_context(&text, &config);
        rb.set_file_log_context(report_path, context);
    }
}
/// Attaches extracted log context to the report by reading a file from disk.
///
/// Does nothing unless `--extract-context` is enabled and a report builder is
/// present. The file at `out_path` is opened and scanned through a buffered
/// reader; failures to open or read it are logged as warnings and otherwise
/// ignored. The result is stored under `report_path`.
fn maybe_extract_context_reader(
    out_path: &Path,
    report_path: &str,
    cli: &Cli,
    report_builder: Option<&ReportBuilder>,
) {
    let (true, Some(rb)) = (cli.extract_context, report_builder) else {
        return;
    };
    let config = build_log_context_config(cli);
    let reader = match fs::File::open(out_path) {
        Ok(file) => BufReader::new(file),
        Err(e) => {
            warn!(error = %e, path = %out_path.display(), "--extract-context: failed to open output file for context scan");
            return;
        }
    };
    match extract_context_reader(reader, &config) {
        Err(e) => warn!(error = %e, "--extract-context: failed to read output for log context"),
        Ok(result) => rb.set_file_log_context(report_path, result),
    }
}
/// Returns an absolute, display-ready form of `path`.
///
/// The canonical path is used when it can be resolved. Otherwise `path` is
/// joined onto the current working directory, or onto an empty path when the
/// working directory cannot be determined either.
fn abs_label(path: &Path) -> String {
    let absolute = match fs::canonicalize(path) {
        Ok(resolved) => resolved,
        Err(_) => std::env::current_dir().unwrap_or_default().join(path),
    };
    absolute.display().to_string()
}
/// Appends `(label, bytes)` to the LLM collector, when one is supplied.
///
/// Does nothing if `collector` is `None` or if its lock cannot be acquired.
fn maybe_collect_for_llm(bytes: &[u8], label: &str, collector: Option<&LlmCollector>) {
    let Some(c) = collector else {
        return;
    };
    let Ok(mut entries) = c.lock() else {
        return;
    };
    entries.push((label.to_string(), bytes.to_vec()));
}
/// Routes sanitized output either to the LLM collector or to its destination.
///
/// With a collector present, `data` is stored under `label` and nothing is
/// written. Without one, `data` is passed to `write_output` for `output_path`.
fn write_or_collect(
    data: &[u8],
    label: &str,
    output_path: Option<&Path>,
    collector: Option<&LlmCollector>,
) -> Result<(), String> {
    match collector {
        None => write_output(output_path, data),
        Some(c) => {
            maybe_collect_for_llm(data, label, Some(c));
            Ok(())
        }
    }
}
/// Implements the `encrypt` subcommand.
///
/// Resolves the password, reads the plaintext secrets file, optionally
/// validates it by parsing (format taken from `--secrets-format` or, failing
/// that, from the input's extension), encrypts it, and writes the result to
/// `args.output` via `atomic_write`. Progress and a usage hint are printed to
/// standard error.
///
/// # Errors
///
/// Returns `(message, 1)` when the password cannot be resolved, the input
/// cannot be read, validation or encryption fails, or the output cannot be
/// written.
fn run_encrypt(args: &EncryptArgs) -> Result<(), (String, i32)> {
    let validate = args.validate && !args._no_validate;
    let password = match resolve_password(args.password, &args.password_file, "encryption") {
        Ok(pw) => pw,
        Err(e) => return Err((e, 1)),
    };
    let plaintext = fs::read(&args.input)
        .map_err(|e| (format!("cannot read '{}': {e}", args.input.display()), 1))?;
    // An explicit format wins; otherwise fall back to the input's extension.
    let format = match args.secrets_format {
        Some(explicit) => Some(explicit),
        None => SecretsFormat::from_extension(args.input.to_string_lossy().as_ref()),
    };

    if validate {
        eprint!("Validating secrets file... ");
        let entries = parse_secrets(&plaintext, format).map_err(|e| {
            eprintln!("FAILED");
            (format!("validation error: {e}"), 1)
        })?;
        eprintln!("OK ({} entries)", entries.len());
    }

    eprint!("Encrypting... ");
    let encrypted = match encrypt_secrets(&plaintext, &password) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("FAILED");
            return Err((format!("encryption failed: {e}"), 1));
        }
    };
    if let Err(e) = atomic_write(&args.output, &encrypted) {
        return Err((format!("cannot write '{}': {e}", args.output.display()), 1));
    }
    eprintln!("done");

    let destination = args.output.display();
    eprintln!("Wrote {} bytes to '{}'", encrypted.len(), destination);
    eprintln!();
    eprintln!("To use with the sanitizer:");
    eprintln!(" sanitize data.log -s {} --password", destination);
    Ok(())
}
/// Implements the `decrypt` subcommand.
///
/// Resolves the password, reads and decrypts the input, optionally validates
/// the plaintext by parsing it when `--secrets-format` was given, and writes
/// it to `args.output` via `atomic_write`. Progress and a re-encryption
/// reminder are printed to standard error.
///
/// # Errors
///
/// Returns `(message, 1)` when the password cannot be resolved, the input
/// cannot be read, decryption or validation fails, or the output cannot be
/// written.
fn run_decrypt(args: &DecryptArgs) -> Result<(), (String, i32)> {
    let password = match resolve_password(args.password, &args.password_file, "decryption") {
        Ok(pw) => pw,
        Err(e) => return Err((e, 1)),
    };
    let encrypted = fs::read(&args.input)
        .map_err(|e| (format!("cannot read '{}': {e}", args.input.display()), 1))?;

    eprint!("Decrypting... ");
    let plaintext = match decrypt_secrets(&encrypted, &password) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("FAILED");
            return Err((format!("decryption failed: {e}"), 1));
        }
    };

    // Validation only runs when the caller named a format explicitly.
    if let Some(fmt) = args.secrets_format {
        eprint!("Validating... ");
        let entries = parse_secrets(&plaintext, Some(fmt)).map_err(|e| {
            eprintln!("FAILED");
            (format!("decrypted content is not valid {:?}: {e}", fmt), 1)
        })?;
        eprintln!("OK ({} entries)", entries.len());
    }

    if let Err(e) = atomic_write(&args.output, &plaintext) {
        return Err((format!("cannot write '{}': {e}", args.output.display()), 1));
    }
    eprintln!("done");

    let destination = args.output.display();
    eprintln!("Wrote {} bytes to '{}'", plaintext.len(), destination);
    eprintln!();
    eprintln!("Remember to re-encrypt after editing:");
    eprintln!(" sanitize encrypt {} {}.enc", destination, destination);
    Ok(())
}
/// Command-line entry point: parses arguments and dispatches to a subcommand.
///
/// Archive filter arguments are split out of the raw argument list first and
/// compiled into one `ArchiveFilter` per path. The remaining arguments are
/// parsed by `Cli` under the program name `sanitize`, and logging is
/// initialised from the effective log format and level. A recognised
/// subcommand is handed to its runner; with no subcommand the default
/// sanitize flow runs with the collected filters.
///
/// # Errors
///
/// Returns `(message, exit_code)`; argument and filter problems use code 1.
fn run() -> Result<(), (String, i32)> {
    let raw_args: Vec<OsString> = std::env::args_os().skip(1).collect();
    let (raw_filter_map, cleaned_args) = parse_archive_filters(&raw_args).map_err(|e| (e, 1))?;

    // Compile each (only, exclude) pair, stopping at the first invalid one.
    let mut filter_map: HashMap<PathBuf, ArchiveFilter> = HashMap::new();
    for (path, (only, exclude)) in raw_filter_map {
        let compiled = ArchiveFilter::new(only, exclude).map_err(|e| (e, 1))?;
        filter_map.insert(path, compiled);
    }

    let argv = std::iter::once(OsString::from("sanitize")).chain(cleaned_args);
    let cli = Cli::parse_from(argv);
    init_logging(cli.effective_log_format(), cli.effective_log_level());

    let Some(command) = &cli.command else {
        return run_sanitize(cli, None, filter_map);
    };
    match command {
        SubCommand::Encrypt(args) => run_encrypt(args),
        SubCommand::Decrypt(args) => run_decrypt(args),
        SubCommand::Apps(args) => run_apps(args),
        SubCommand::Guided => run_guided(),
        SubCommand::Template(args) => run_template(args),
        SubCommand::AllowTest(args) => run_allow_test(args),
        SubCommand::InstallHook(args) => run_install_hook(args),
        SubCommand::InitHook(args) => run_init(args),
        SubCommand::ShowConfig => run_show_config(),
        SubCommand::Scan(args) => run_scan(args),
        SubCommand::TestPattern(args) => run_test_pattern(args),
    }
}
fn run_sanitize(
mut cli: Cli,
pre_resolved_password: Option<Zeroizing<String>>,
filter_map: HashMap<PathBuf, ArchiveFilter>,
) -> Result<(), (String, i32)> {
if let Err(e) = ctrlc::set_handler(move || {
INTERRUPTED.store(true, Ordering::SeqCst);
}) {
eprintln!("warning: failed to install signal handler: {e}");
}
let cli_snapshot = CliConfigSnapshot::capture(&cli);
let settings = load_settings();
if cli.app.is_empty() && !settings.app.is_empty() {
cli.app = settings.app;
}
if cli.allow.is_empty() && !settings.allow.is_empty() {
cli.allow = settings.allow;
}
if !cli.fail_on_match {
if let Some(v) = settings.fail_on_match {
cli.fail_on_match = v;
}
}
if !cli.strict {
if let Some(v) = settings.strict {
cli.strict = v;
}
}
if !cli.no_structured_handoff {
if let Some(v) = settings.no_structured_handoff {
cli.no_structured_handoff = v;
}
}
if !cli.no_field_signal {
if let Some(v) = settings.no_field_signal {
cli.no_field_signal = v;
}
}
if cli.threads.is_none() {
cli.threads = settings.threads;
}
if cli.log_format.is_none() {
cli.log_format = settings.log_format;
}
if cli.log_level.is_none() {
cli.log_level = settings.log_level;
}
if !cli.no_progress {
if let Some(v) = settings.no_progress {
cli.no_progress = v;
}
}
if let Some(project_config_path) = find_project_config() {
let (pc, config_dir) = load_project_config(&project_config_path);
for bundle in &pc.app {
if !cli.app.contains(bundle) {
cli.app.push(bundle.clone());
}
}
for val in &pc.allow {
if !cli.allow.contains(val) {
cli.allow.push(val.clone());
}
}
if cli.secrets_file.is_none() {
if let Some(rel) = pc.secrets_file {
cli.secrets_file = Some(config_dir.join(rel));
}
}
if !cli.encrypted_secrets {
if let Some(v) = pc.encrypted_secrets {
cli.encrypted_secrets = v;
}
}
if cli.profile.is_none() {
if let Some(rel) = pc.profile {
cli.profile = Some(config_dir.join(rel));
}
}
if !cli.fail_on_match {
if let Some(v) = pc.fail_on_match {
cli.fail_on_match = v;
}
}
if !cli.strict {
if let Some(v) = pc.strict {
cli.strict = v;
}
}
if !cli.no_structured_handoff {
if let Some(v) = pc.no_structured_handoff {
cli.no_structured_handoff = v;
}
}
if !cli.no_field_signal {
if let Some(v) = pc.no_field_signal {
cli.no_field_signal = v;
}
}
}
validate_args(&cli).map_err(|e| (e, 1))?;
let progress_mode = cli.effective_progress_mode();
let progress_context = ProgressContext::detect(cli.effective_log_format());
let progress_policy = ProgressPolicy::from_mode(progress_mode, progress_context);
let progress_reporter = if progress_policy.live_updates || progress_policy.milestone_updates {
Some(Arc::new(Mutex::new(ProgressReporter::new(
progress_policy,
progress_context.json_logs,
cli.progress_interval_ms,
))))
} else {
None
};
let thread_count = resolve_thread_count(cli.threads);
let _ = rayon::ThreadPoolBuilder::new()
.num_threads(thread_count)
.build_global();
if cli.secrets_file.is_none() && cli.app.is_empty() {
let default_path = global_default_secrets_path();
if !default_path.exists() {
if let Some(parent) = default_path.parent() {
let _ = fs::create_dir_all(parent);
}
let allow_entry = SecretEntry {
kind: "allow".into(),
pattern: String::new(),
category: String::new(),
label: None,
values: common_allow_patterns(),
min_length: None,
max_length: None,
threshold: None,
charset: None,
};
if let Ok(yaml) = serde_yaml_ng::to_string(&[allow_entry]) {
let header = "# Global sanitize allowlist — add patterns or kind:regex entries here.\n# Auto-loaded on every plain run. Edit freely; deleted values take effect immediately.\n\n";
let _ = fs::write(&default_path, format!("{header}{yaml}"));
}
}
if default_path.exists() {
cli.secrets_file = Some(default_path);
}
}
if !cli.app.is_empty() && !cli.no_structured_handoff {
for app_name in &cli.app {
if let Some(secrets_path) = ensure_user_app_copy(app_name) {
if cli.app.len() == 1 && cli.secrets_file.is_none() {
info!(
app = %app_name,
path = %secrets_path.display(),
"using local app secrets as write-back target"
);
cli.secrets_file = Some(secrets_path);
}
}
}
}
if cli.profile.is_some() && cli.secrets_file.is_none() && !cli.no_structured_handoff {
return Err((
"a secrets file is required when using --profile\n\
\n\
Without one, discovered values from the profile pass have nowhere to go\n\
and the scanner pass runs blind — sensitive data in logs that the profile\n\
would catch from config will be missed.\n\
\n\
The file can be empty on the first run; sanitize will populate it with\n\
discovered literals automatically:\n\
\n\
touch secrets.yaml\n\
sanitize --profile my.profile.yaml --secrets-file secrets.yaml [paths...]"
.into(),
1,
));
}
if progress_policy.live_updates
|| progress_policy.milestone_updates
|| progress_context.json_logs
{
print_run_header(&cli, &cli_snapshot, progress_context.json_logs);
}
info!(
threads = thread_count,
deterministic = cli.deterministic,
chunk_size = cli.chunk_size,
progress_mode = ?progress_mode,
live_progress = progress_policy.live_updates,
milestone_progress = progress_policy.milestone_updates,
progress_interval_ms = cli.progress_interval_ms,
"starting sanitization"
);
let effective_password: Option<Zeroizing<String>> =
if cli.encrypted_secrets || cli.deterministic {
if let Some(pw) = pre_resolved_password {
Some(pw)
} else {
Some(resolve_sanitize_password(&cli).map_err(|e| (e, 1))?)
}
} else {
None
};
let scan_config = build_scan_config(cli.chunk_size).map_err(|e| (e, 1))?;
let registry = Arc::new(ProcessorRegistry::with_builtins());
let file_profiles: Vec<sanitize_engine::processor::FileTypeProfile> =
if let Some(ref profile_path) = cli.profile {
load_profiles(profile_path).map_err(|e| (e, 1))?
} else {
vec![]
};
let mut base_patterns: Vec<ScanPattern> = vec![];
let mut all_allow_patterns: Vec<String> = cli.allow.clone();
let mut entropy_configs: Vec<EntropyConfig> = vec![];
for app_name in &cli.app {
if let Ok(bundle) = load_app_bundle(app_name) {
all_allow_patterns.extend(extract_allow_patterns(&bundle.secrets));
}
}
let secrets_raw_bytes: Option<Vec<u8>> = if let Some(ref secrets_path) = cli.secrets_file {
if secrets_path.exists() {
Some(fs::read(secrets_path).map_err(|e| {
(
format!(
"failed to read secrets file {}: {e}",
secrets_path.display()
),
1,
)
})?)
} else if cli.deterministic {
None
} else {
return Err((
format!("secrets file not found: {}", secrets_path.display()),
1,
));
}
} else {
None
};
let mut was_encrypted_secrets = false;
if let Some(ref raw_bytes) = secrets_raw_bytes {
let secrets_format = cli
.secrets_file
.as_ref()
.and_then(|p| SecretsFormat::from_extension(p.to_string_lossy().as_ref()));
let (((patterns, warnings), allow_from_secrets), was_encrypted) =
sanitize_engine::secrets::load_secrets_auto(
raw_bytes,
effective_password.as_ref().map(|s| s.as_str()),
secrets_format,
!cli.encrypted_secrets,
)
.map_err(|e| (format!("failed to load secrets: {e}"), 1))?;
let secrets_display = cli
.secrets_file
.as_ref()
.map(|p| p.display().to_string())
.unwrap_or_default();
was_encrypted_secrets = was_encrypted;
if was_encrypted {
info!(secrets_file = %secrets_display, "loaded encrypted secrets");
} else {
info!(secrets_file = %secrets_display, "loaded plaintext secrets (unencrypted)");
}
if !warnings.is_empty() {
for (idx, err) in &warnings {
warn!(entry = idx, error = %err, "secret entry warning");
}
if cli.strict {
return Err((
format!(
"{} secret entries had errors (use without --strict to continue)",
warnings.len()
),
1,
));
}
}
base_patterns.extend(patterns);
all_allow_patterns.extend(allow_from_secrets);
let entropy_plaintext: Option<Zeroizing<Vec<u8>>> = if was_encrypted {
effective_password
.as_ref()
.and_then(|pw| decrypt_secrets(raw_bytes, pw.as_str()).ok())
} else {
None
};
let bytes_for_entropy: &[u8] = entropy_plaintext
.as_deref()
.map_or(raw_bytes.as_slice(), |v| v);
if let Ok(ent_entries) = parse_secrets(bytes_for_entropy, None) {
entropy_configs.extend(entropy_configs_from_entries(&ent_entries));
}
}
if let Some(threshold) = cli.entropy_threshold {
if !entropy_configs
.iter()
.any(|c| c.label == "high_entropy_token")
{
entropy_configs.push(EntropyConfig {
threshold,
..Default::default()
});
}
}
let entropy_configs = Arc::new(entropy_configs);
let entropy_histogram_acc: Option<Arc<Mutex<Vec<EntropyBuckets>>>> =
if cli.dry_run && !entropy_configs.is_empty() {
Some(Arc::new(Mutex::new(Vec::new())))
} else {
None
};
let nothing_specified = cli.secrets_file.is_none()
&& cli.app.is_empty()
&& cli.profile.is_none()
&& !cli.use_default;
let load_defaults =
cli.use_default || nothing_specified || (!cli.app.is_empty() && cli.secrets_file.is_none());
if load_defaults {
all_allow_patterns.extend(common_allow_patterns());
}
let allowlist: Option<Arc<sanitize_engine::allowlist::AllowlistMatcher>> =
if all_allow_patterns.is_empty() {
None
} else {
let (matcher, al_warnings) =
sanitize_engine::allowlist::AllowlistMatcher::new(all_allow_patterns);
for w in &al_warnings {
warn!(warning = %w, "allowlist pattern warning");
}
let matcher = Arc::new(matcher);
info!(patterns = matcher.pattern_count(), "allowlist loaded");
Some(matcher)
};
let store = build_store(
cli.deterministic,
effective_password.as_ref().map(|s| s.as_str()),
cli.max_mappings,
allowlist,
)
.map_err(|e| (e, 1))?;
if load_defaults {
let default_patterns = build_default_patterns();
info!(
patterns = default_patterns.len(),
"loaded built-in balanced patterns (auto, via --app)"
);
base_patterns.extend(default_patterns);
}
let mut app_profiles: Vec<FileTypeProfile> = vec![];
for app_name in &cli.app {
let bundle = load_app_bundle(app_name).map_err(|e| (e, 1))?;
let (app_patterns, app_errors) = entries_to_patterns(&bundle.secrets);
if !app_errors.is_empty() {
for (i, e) in &app_errors {
warn!(app = %app_name, entry = i, error = %e, "app bundle pattern warning");
}
}
info!(
app = %app_name,
patterns = app_patterns.len(),
profiles = bundle.profiles.len(),
"loaded app bundle"
);
base_patterns.extend(app_patterns);
app_profiles.extend(bundle.profiles);
}
if base_patterns.is_empty() && app_profiles.is_empty() {
warn!("no secrets file or --app provided; pass --secrets-file, --app, or --profile explicitly, or run without flags to auto-create the default secrets file");
}
let scanner = StreamScanner::new(
base_patterns.clone(),
Arc::clone(&store),
scan_config.clone(),
)
.map_err(|e| (format!("failed to create scanner: {e}"), 1))?;
if !base_patterns.is_empty() {
info!(patterns = scanner.pattern_count(), "scanner ready");
}
let scanner = Arc::new(scanner);
let mut profiles: Vec<sanitize_engine::processor::FileTypeProfile> = {
let mut merged = app_profiles;
merged.extend(file_profiles);
merged
};
if !cli.no_field_signal && !profiles.is_empty() {
let mut active_signals = builtin_field_name_signals();
if let Some(ref raw_bytes) = secrets_raw_bytes {
let plaintext_for_signals: Option<Zeroizing<Vec<u8>>> = if was_encrypted_secrets {
effective_password
.as_ref()
.and_then(|pw| decrypt_secrets(raw_bytes, pw.as_str()).ok())
} else {
None
};
let bytes = plaintext_for_signals
.as_deref()
.map_or(raw_bytes.as_slice(), |v| v);
if let Ok(entries) = parse_secrets(bytes, None) {
let user_signals = field_signals_from_entries(&entries);
if !user_signals.is_empty() {
info!(
count = user_signals.len(),
"loaded user-defined field-name signals"
);
}
active_signals.extend(user_signals);
}
}
let signal_count = active_signals.len();
for profile in &mut profiles {
profile.field_name_signals = active_signals.clone();
}
info!(
signals = signal_count,
"field-name signals active (disable with --no-field-signal)"
);
}
if !profiles.is_empty() {
info!(count = profiles.len(), "loaded field-path profiles");
for p in &profiles {
if registry.get(&p.processor).is_none() {
eprintln!(
"Warning: profile processor '{}' is not registered. \
Known processors: {}",
p.processor,
registry.names().join(", ")
);
}
}
}
let report_enabled =
cli.report.is_some() || cli.llm.is_some() || cli.findings.is_some() || !cli.quiet;
let report_builder = if report_enabled {
let timestamp = SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.map(|d| {
let secs = d.as_secs();
let (s, m, h) = (secs % 60, (secs / 60) % 60, (secs / 3600) % 24);
let days = secs / 86400;
format!("epoch+{days}d {:02}:{:02}:{:02}Z", h, m, s)
})
.unwrap_or_else(|_| "unknown".into());
Some(ReportBuilder::new(ReportMetadata {
version: env!("CARGO_PKG_VERSION").into(),
timestamp,
deterministic: cli.deterministic,
dry_run: cli.dry_run,
strict: cli.strict,
chunk_size: cli.chunk_size,
threads: cli.threads,
secrets_file: cli.secrets_file.as_ref().map(|p| p.display().to_string()),
}))
} else {
None
};
let reference_mode = cli.llm.is_some() && cli.output.is_some();
let llm_collector: Option<LlmCollector> = if cli.llm.is_some() && !reference_mode {
Some(Arc::new(Mutex::new(Vec::new())))
} else {
None
};
let input_targets = plan_input_targets(&cli).map_err(|e| (e, 1))?;
if cli.strip_values {
let delimiter = "#strip-values-delimiter#"; let _ = delimiter; for target in &input_targets {
let (content, output_path) = match target {
InputTarget::Stdin { output } => {
let mut buf = Vec::new();
io::stdin()
.read_to_end(&mut buf)
.map_err(|e| (format!("failed to read stdin: {e}"), 1))?;
(
String::from_utf8_lossy(&buf).into_owned(),
output.as_deref(),
)
}
InputTarget::File { input, output } => {
let text = fs::read_to_string(input)
.map_err(|e| (format!("failed to read {}: {e}", input.display()), 1))?;
(text, Some(output.as_path()))
}
};
let stripped =
strip_values_from_text(&content, &cli.strip_delimiter, &cli.strip_comment_prefix);
write_output(output_path, stripped.as_bytes()).map_err(|e| (e, 1))?;
}
return Ok(());
}
let (stdin_targets, file_targets): (Vec<_>, Vec<_>) = input_targets
.into_iter()
.partition(|t| matches!(t, InputTarget::Stdin { .. }));
let llm_ref_entries: Vec<LlmPathEntry> = if reference_mode {
stdin_targets
.iter()
.filter_map(|t| {
if let InputTarget::Stdin { output: Some(out) } = t {
Some(("<stdin>".to_string(), abs_label(out)))
} else {
None
}
})
.chain(file_targets.iter().filter_map(|t| {
if let InputTarget::File { input, output } = t {
Some((abs_label(input), abs_label(output)))
} else {
None
}
}))
.map(|(label, abs_out)| (label, PathBuf::from(abs_out)))
.collect()
} else {
vec![]
};
let mut had_matches = false;
if profiles.is_empty() {
for target in &stdin_targets {
let InputTarget::Stdin { ref output } = target else {
unreachable!()
};
let result = process_stdin(
&cli,
output.as_deref(),
&scanner,
®istry,
&store,
&profiles,
report_builder.as_ref(),
progress_reporter.as_ref(),
llm_collector.as_ref(),
&entropy_configs,
entropy_histogram_acc.as_ref(),
)
.map_err(|e| (e, 1))?;
had_matches |= result;
}
}
let (phase1_targets, phase2_targets): (Vec<_>, Vec<_>) = if profiles.is_empty() {
(vec![], file_targets)
} else {
file_targets.into_iter().partition(|t| {
let InputTarget::File { ref input, .. } = t else {
return false;
};
let name = input.to_string_lossy();
ArchiveFormat::from_path(&name).is_none()
&& profiles.iter().any(|p| p.matches_filename(&name))
})
};
for target in phase1_targets {
if is_interrupted() {
break;
}
let InputTarget::File { input, output } = target else {
unreachable!()
};
let result = process_plain_file(
&input,
&cli,
Some(output.as_path()),
&scanner,
®istry,
&store,
&profiles,
report_builder.as_ref(),
progress_reporter.as_ref(),
llm_collector.as_ref(),
&entropy_configs,
cli.max_match_locations,
entropy_histogram_acc.as_ref(),
)
.map_err(|e| (e, 1))?;
had_matches |= result;
}
if !profiles.is_empty() {
let discovery = ArchiveProcessor::new(
Arc::clone(®istry),
Arc::clone(&scanner), Arc::clone(&store),
profiles.to_vec(),
);
for target in &phase2_targets {
if is_interrupted() {
break;
}
let InputTarget::File { ref input, .. } = target else {
continue;
};
let input_str = input.to_string_lossy();
let Some(fmt) = ArchiveFormat::from_path(&input_str) else {
continue;
};
let file = fs::File::open(input).map_err(|e| {
(
format!(
"failed to open {} for profile discovery: {e}",
input.display()
),
1,
)
})?;
match fmt {
ArchiveFormat::Tar => discovery.discover_profiles_tar(file),
ArchiveFormat::TarGz => discovery.discover_profiles_tar_gz(file),
ArchiveFormat::Zip => discovery.discover_profiles_zip(file),
}
.map_err(|e| {
(
format!("profile discovery failed for {}: {e}", input.display()),
1,
)
})?;
}
}
let augmented_scanner = build_augmented_scanner(&base_patterns, &store, scan_config)?;
if !profiles.is_empty() {
for target in stdin_targets {
let InputTarget::Stdin { output } = target else {
unreachable!()
};
let result = process_stdin(
&cli,
output.as_deref(),
&augmented_scanner,
®istry,
&store,
&profiles,
report_builder.as_ref(),
progress_reporter.as_ref(),
llm_collector.as_ref(),
&entropy_configs,
entropy_histogram_acc.as_ref(),
)
.map_err(|e| (e, 1))?;
had_matches |= result;
}
}
let file_results: Vec<Result<bool, (String, i32)>> = if phase2_targets.len() > 1 {
phase2_targets
.into_par_iter()
.map(|target| {
if is_interrupted() {
return Ok(false);
}
let InputTarget::File { input, output } = target else {
unreachable!()
};
let input_str = input.to_string_lossy();
if let Some(fmt) = ArchiveFormat::from_path(&input_str) {
let filter = filter_map.get(&input).cloned().unwrap_or_default();
process_archive(
&input,
&cli,
&output,
ArchiveDeps {
scanner: &augmented_scanner,
registry: ®istry,
store: &store,
profiles: &profiles,
},
fmt,
filter,
report_builder.as_ref(),
progress_reporter.as_ref(),
true,
cli.max_match_locations,
)
.map_err(|e| (e, 1))
} else {
process_plain_file(
&input,
&cli,
Some(output.as_path()),
&augmented_scanner,
®istry,
&store,
&profiles,
report_builder.as_ref(),
progress_reporter.as_ref(),
llm_collector.as_ref(),
&entropy_configs,
cli.max_match_locations,
entropy_histogram_acc.as_ref(),
)
.map_err(|e| (e, 1))
}
})
.collect()
} else {
phase2_targets
.into_iter()
.map(|target| {
let InputTarget::File { input, output } = target else {
unreachable!()
};
let input_str = input.to_string_lossy();
if let Some(fmt) = ArchiveFormat::from_path(&input_str) {
let filter = filter_map.get(&input).cloned().unwrap_or_default();
process_archive(
&input,
&cli,
&output,
ArchiveDeps {
scanner: &augmented_scanner,
registry: ®istry,
store: &store,
profiles: &profiles,
},
fmt,
filter,
report_builder.as_ref(),
progress_reporter.as_ref(),
false,
cli.max_match_locations,
)
.map_err(|e| (e, 1))
} else {
process_plain_file(
&input,
&cli,
Some(output.as_path()),
&augmented_scanner,
®istry,
&store,
&profiles,
report_builder.as_ref(),
progress_reporter.as_ref(),
llm_collector.as_ref(),
&entropy_configs,
cli.max_match_locations,
entropy_histogram_acc.as_ref(),
)
.map_err(|e| (e, 1))
}
})
.collect()
};
for result in file_results {
had_matches |= result?;
}
if is_interrupted() {
return Err(("interrupted by signal".into(), 130));
}
if !cli.no_structured_handoff && !profiles.is_empty() {
if let Some(save_path) = &cli.secrets_file {
match save_discovered_secrets(&store, save_path) {
Ok(0) => {}
Ok(n) => info!(
path = %save_path.display(),
added = n,
"saved discovered literals to secrets file"
),
Err(e) => warn!("could not save discovered secrets: {e}"),
}
}
}
if let Some(builder) = report_builder {
let report = builder.finish();
if let Some(ref template_name) = cli.llm {
let prompt = if reference_mode {
format_llm_prompt_reference(template_name, &llm_ref_entries, Some(&report))
.map_err(|e| (e, 1))?
} else {
let entries = llm_collector
.as_ref()
.and_then(|c| c.lock().ok())
.map(|g| g.clone())
.unwrap_or_default();
format_llm_prompt(template_name, &entries, Some(&report)).map_err(|e| (e, 1))?
};
let stdout = io::stdout();
stdout
.lock()
.write_all(prompt.as_bytes())
.map_err(|e| (format!("failed to write LLM prompt: {e}"), 1))?;
}
if let Some(report_opt) = &cli.report {
let content = match cli.report_format {
ReportFormat::Sarif => report
.to_sarif()
.map_err(|e| (format!("failed to serialize SARIF report: {e}"), 1))?,
ReportFormat::Html => report.to_html(),
ReportFormat::Json => report
.to_json_pretty()
.map_err(|e| (format!("failed to serialize report: {e}"), 1))?,
};
match report_opt {
Some(path) if path.to_string_lossy() == "-" => {
println!("{content}");
}
Some(path) => {
atomic_write(path, content.as_bytes()).map_err(|e| {
(
format!("failed to write report to {}: {e}", path.display()),
1,
)
})?;
info!(report = %path.display(), format = ?cli.report_format, "report written");
}
None => {
eprintln!("{content}");
}
}
}
if let Some(ref findings_path) = cli.findings {
let mut lines: Vec<String> = Vec::with_capacity(report.files.len() + 1);
#[derive(serde::Serialize)]
struct FileFinding<'a> {
#[serde(rename = "type")]
kind: &'static str,
file: &'a str,
matches: u64,
clean: bool,
#[serde(skip_serializing_if = "HashMap::is_empty")]
patterns: &'a HashMap<String, u64>,
bytes_processed: u64,
}
#[derive(serde::Serialize)]
struct SummaryFinding {
#[serde(rename = "type")]
kind: &'static str,
files: u64,
matches: u64,
clean: bool,
}
for f in &report.files {
let line = serde_json::to_string(&FileFinding {
kind: "file",
file: &f.path,
matches: f.matches,
clean: f.matches == 0,
patterns: &f.pattern_counts,
bytes_processed: f.bytes_processed,
})
.map_err(|e| (format!("failed to serialize finding: {e}"), 1))?;
lines.push(line);
}
lines.push(
serde_json::to_string(&SummaryFinding {
kind: "summary",
files: report.summary.total_files,
matches: report.summary.total_matches,
clean: report.summary.total_matches == 0,
})
.map_err(|e| (format!("failed to serialize findings summary: {e}"), 1))?,
);
let ndjson = lines.join("\n") + "\n";
if findings_path.to_string_lossy() == "-" {
io::stdout()
.lock()
.write_all(ndjson.as_bytes())
.map_err(|e| (format!("failed to write findings to stdout: {e}"), 1))?;
} else {
atomic_write(findings_path, ndjson.as_bytes()).map_err(|e| {
(
format!(
"failed to write findings to {}: {e}",
findings_path.display()
),
1,
)
})?;
info!(findings = %findings_path.display(), files = report.files.len(), "findings written");
}
}
if !cli.quiet {
let verb = if cli.dry_run { "Matched" } else { "Redacted" };
if report.summary.total_matches == 0 {
eprintln!("{verb}: nothing");
} else {
let mut parts: Vec<(u64, &str)> = report
.summary
.pattern_counts
.iter()
.map(|(k, &v)| (v, k.as_str()))
.collect();
parts.sort_by(|a, b| b.0.cmp(&a.0).then(a.1.cmp(b.1)));
let line = parts
.iter()
.map(|(count, name)| format!("{count} {name}"))
.collect::<Vec<_>>()
.join(", ");
eprintln!("{verb}: {line}");
}
}
}
if let Some(acc) = entropy_histogram_acc {
if let Ok(buckets) = acc.lock() {
if !buckets.is_empty() {
print_entropy_histogram(&buckets);
}
}
}
#[cfg(feature = "bench")]
{
let mappings = store.len();
info!(unique_mappings = mappings, "performance summary");
}
if cli.fail_on_match && had_matches {
return Err(("matches found (--fail-on-match)".into(), 2));
}
Ok(())
}
/// Process entry point.
///
/// Delegates all work to [`run`]. On failure the error message is printed to
/// stderr with an `error: ` prefix and the process exits with the code that
/// accompanied the message; on success the function simply returns.
fn main() {
    if let Err((msg, code)) = run() {
        eprintln!("error: {msg}");
        process::exit(code);
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::guided::{CloudProvider, GuidedFormat};
use crate::hooks::{
build_hook_flags, build_hook_script, hook_script_pre_commit_scan, remove_hook, sh_quote,
HOOK_MARKER,
};
use clap::Parser;
use tempfile::tempdir;
/// Test helper that assembles a [`ProgressContext`] from its four flags so
/// each progress-policy test can spell out its scenario inline.
fn make_progress_context(
    stderr_is_terminal: bool,
    is_ci: bool,
    term_is_dumb: bool,
    json_logs: bool,
) -> ProgressContext {
    ProgressContext {
        json_logs,
        term_is_dumb,
        is_ci,
        stderr_is_terminal,
    }
}
/// Verifies that the clap definition behind [`Cli`] is internally consistent.
///
/// `Command::debug_assert` is clap's dedicated entry point for running its
/// definition-time assertions across the whole command tree, subcommands
/// included. A plain parse only builds the parts of the tree it visits, so it
/// cannot catch a broken subcommand definition on its own.
#[test]
fn cli_debug_assert_does_not_panic() {
    use clap::CommandFactory;

    Cli::command().debug_assert();

    // Original smoke check: parsing a minimal command line must not panic.
    // The parse result itself is irrelevant here.
    let _ = Cli::try_parse_from(["sanitize", "input.txt"]);
}
/// A lone positional argument becomes the only input and no subcommand is
/// selected.
#[test]
fn cli_parses_basic_input() {
    let parsed = Cli::try_parse_from(["sanitize", "input.txt"]).unwrap();
    let expected_inputs = vec![PathBuf::from("input.txt")];
    assert_eq!(parsed.input, expected_inputs);
    assert!(parsed.command.is_none());
}
/// The short `-o` flag populates `output` alongside the positional input.
#[test]
fn cli_parses_input_with_output() {
    let argv = ["sanitize", "input.txt", "-o", "output.txt"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert_eq!(parsed.input, vec![PathBuf::from("input.txt")]);
    let output = parsed.output.unwrap();
    assert_eq!(output, PathBuf::from("output.txt"));
}
/// Several positional arguments are collected, in order, into `input`.
#[test]
fn cli_parses_multiple_inputs() {
    let names = ["test.txt", "a.json", "b.zip"];
    let argv = std::iter::once("sanitize").chain(names);
    let parsed = Cli::try_parse_from(argv).unwrap();
    let expected: Vec<PathBuf> = names.iter().map(PathBuf::from).collect();
    assert_eq!(parsed.input, expected);
}
/// The long `--output` spelling is accepted and stored in `output`.
#[test]
fn cli_parses_output_long_flag() {
    let argv = ["sanitize", "input.txt", "--output", "out.txt"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    let output = parsed.output.unwrap();
    assert_eq!(output, PathBuf::from("out.txt"));
}
/// `--secrets-file` stores the given path in `secrets_file`.
#[test]
fn cli_parses_secrets_file_flag() {
    let argv = ["sanitize", "input.txt", "--secrets-file", "secrets.json"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    let secrets_path = parsed.secrets_file.unwrap();
    assert_eq!(secrets_path, PathBuf::from("secrets.json"));
}
/// Every single-letter flag used here (`-s -p -P -o -n -d -f`) maps onto the
/// expected field of [`Cli`].
#[test]
fn cli_parses_short_flags() {
    let argv = [
        "sanitize",
        "input.txt",
        "-s",
        "secrets.json",
        "-p",
        "-P",
        "/run/secrets/pw",
        "-o",
        "out.txt",
        "-n",
        "-d",
        "-f",
        "json",
    ];
    let parsed = Cli::try_parse_from(argv).unwrap();

    let secrets_path = parsed.secrets_file.unwrap();
    assert_eq!(secrets_path, PathBuf::from("secrets.json"));
    assert!(parsed.password);
    let password_path = parsed.password_file.unwrap();
    assert_eq!(password_path, PathBuf::from("/run/secrets/pw"));
    let output_path = parsed.output.unwrap();
    assert_eq!(output_path, PathBuf::from("out.txt"));
    assert!(parsed.dry_run);
    assert!(parsed.deterministic);
    let format = parsed.format.unwrap();
    assert_eq!(format, "json");
}
/// `--dry-run` switches the `dry_run` flag on.
#[test]
fn cli_parses_dry_run() {
    let argv = ["sanitize", "input.txt", "--dry-run"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(parsed.dry_run);
}
/// `--progress on` is stored verbatim and is also the effective mode.
#[test]
fn cli_parses_progress_mode() {
    let argv = ["sanitize", "input.txt", "--progress", "on"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert_eq!(parsed.progress, Some(ProgressMode::On));
    let effective = parsed.effective_progress_mode();
    assert_eq!(effective, ProgressMode::On);
}
/// `--no-progress` on its own yields an effective mode of `Off`.
#[test]
fn cli_no_progress_maps_to_off() {
    let argv = ["sanitize", "input.txt", "--no-progress"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(parsed.no_progress);
    let effective = parsed.effective_progress_mode();
    assert_eq!(effective, ProgressMode::Off);
}
/// When both `--no-progress` and `--progress on` are given, the explicit
/// `--progress` value decides the effective mode.
#[test]
fn cli_explicit_progress_takes_precedence_over_no_progress() {
    let argv = ["sanitize", "input.txt", "--no-progress", "--progress", "on"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(parsed.no_progress);
    assert_eq!(parsed.progress, Some(ProgressMode::On));
    let effective = parsed.effective_progress_mode();
    assert_eq!(effective, ProgressMode::On);
}
/// `--progress-interval-ms` is parsed as a number into `progress_interval_ms`.
#[test]
fn cli_parses_progress_interval() {
    let argv = ["sanitize", "input.txt", "--progress-interval-ms", "500"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert_eq!(parsed.progress_interval_ms, 500);
}
/// A progress interval of zero must be rejected by `validate_args`.
///
/// Uses the shared `real_file_cli` fixture (an existing input file) so the
/// only invalid setting is the interval, and reports the actual error text
/// on failure like the other `validate_args_*` tests.
#[test]
fn validate_args_rejects_zero_progress_interval() {
    let mut cli = real_file_cli();
    cli.progress_interval_ms = 0;
    let err = validate_args(&cli).unwrap_err();
    assert!(
        err.contains("--progress-interval-ms must be greater than 0"),
        "got: {err}"
    );
}
/// In `Auto` mode with JSON logging enabled, neither live nor milestone
/// updates are produced, even on a terminal.
#[test]
fn progress_policy_auto_disables_live_updates_for_json_logs() {
    let context = make_progress_context(true, false, false, true);
    let policy = ProgressPolicy::from_mode(ProgressMode::Auto, context);
    assert!(!policy.live_updates);
    assert!(!policy.milestone_updates);
}
/// In `Auto` mode under CI, neither live nor milestone updates are produced,
/// even on a terminal.
#[test]
fn progress_policy_auto_disables_live_updates_in_ci() {
    let context = make_progress_context(true, true, false, false);
    let policy = ProgressPolicy::from_mode(ProgressMode::Auto, context);
    assert!(!policy.live_updates);
    assert!(!policy.milestone_updates);
}
/// With progress forced `On` but stderr not a terminal, live updates stay
/// off while milestone updates remain enabled.
#[test]
fn progress_policy_on_keeps_milestones_when_live_updates_are_unavailable() {
    let context = make_progress_context(false, false, false, false);
    let policy = ProgressPolicy::from_mode(ProgressMode::On, context);
    assert!(!policy.live_updates);
    assert!(policy.milestone_updates);
}
/// In `Auto` mode on a terminal, outside CI, with a capable terminal and
/// human-readable logs, both live and milestone updates are enabled.
#[test]
fn progress_policy_auto_enables_live_updates_in_interactive_human_mode() {
    let context = make_progress_context(true, false, false, false);
    let policy = ProgressPolicy::from_mode(ProgressMode::Auto, context);
    assert!(policy.live_updates);
    assert!(policy.milestone_updates);
}
/// `encrypt <in> <out> --password` selects a subcommand and leaves the
/// top-level input list empty.
#[test]
fn cli_parses_encrypt_subcommand() {
    let argv = [
        "sanitize",
        "encrypt",
        "secrets.json",
        "secrets.enc",
        "--password",
    ];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(parsed.command.is_some());
    assert!(parsed.input.is_empty());
}
/// `decrypt <in> <out> --password` selects a subcommand and leaves the
/// top-level input list empty.
#[test]
fn cli_parses_decrypt_subcommand() {
    let argv = [
        "sanitize",
        "decrypt",
        "secrets.enc",
        "secrets.json",
        "--password",
    ];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(parsed.command.is_some());
    assert!(parsed.input.is_empty());
}
/// The bare `guided` word resolves to `SubCommand::Guided` with no inputs.
#[test]
fn cli_parses_guided_subcommand() {
    let parsed = Cli::try_parse_from(["sanitize", "guided"]).unwrap();
    let is_guided = matches!(parsed.command, Some(SubCommand::Guided));
    assert!(is_guided);
    assert!(parsed.input.is_empty());
}
/// Parsing succeeds with neither inputs nor a subcommand; both stay empty.
#[test]
fn cli_no_input_no_subcommand_is_ok_at_parse_time() {
    let parsed = Cli::try_parse_from(["sanitize", "--dry-run"]).unwrap();
    assert!(parsed.input.is_empty());
    assert!(parsed.command.is_none());
}
/// Parses a command line that exercises the long form of many flags at once
/// and checks that every value passed on the command line lands in the
/// matching field, including the positional input, `--password`,
/// `--secrets-file` and `--log-format`.
#[test]
fn cli_parses_all_flags() {
    let cli = Cli::try_parse_from([
        "sanitize",
        "input.log",
        "--output",
        "output.log",
        "--secrets-file",
        "s.enc",
        "--password",
        "--dry-run",
        "--fail-on-match",
        "--deterministic",
        "--strict",
        "--include-binary",
        "--encrypted-secrets",
        "--chunk-size",
        "4096",
        "--threads",
        "4",
        "--max-mappings",
        "500",
        "--log-format",
        "json",
        "--format",
        "yaml",
    ])
    .unwrap();

    // Positional input.
    assert_eq!(cli.input, vec![PathBuf::from("input.log")]);

    // Boolean switches.
    assert!(cli.password);
    assert!(cli.dry_run);
    assert!(cli.fail_on_match);
    assert!(cli.deterministic);
    assert!(cli.strict);
    assert!(cli.include_binary);
    assert!(cli.encrypted_secrets);

    // Numeric options.
    assert_eq!(cli.chunk_size, 4096);
    assert_eq!(cli.threads, Some(4));
    assert_eq!(cli.max_mappings, 500);

    // String and path options.
    assert_eq!(cli.log_format.as_deref(), Some("json"));
    assert_eq!(cli.format.unwrap(), "yaml");
    assert_eq!(cli.secrets_file.unwrap(), PathBuf::from("s.enc"));
    assert_eq!(cli.output.unwrap(), PathBuf::from("output.log"));
}
/// A `-` positional argument is reported as stdin input.
#[test]
fn cli_stdin_dash_input() {
    let argv = ["sanitize", "-", "-s", "s.json"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(has_stdin_input(&parsed));
}
/// With no positional argument at all, stdin input is reported.
#[test]
fn cli_stdin_no_input() {
    let argv = ["sanitize", "-s", "s.json"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(has_stdin_input(&parsed));
}
/// A regular file argument is not reported as stdin input.
#[test]
fn cli_file_input_not_stdin() {
    let parsed = Cli::try_parse_from(["sanitize", "data.log"]).unwrap();
    let reads_stdin = has_stdin_input(&parsed);
    assert!(!reads_stdin);
}
/// A file and `-` may be combined: stdin is detected and exactly one file
/// input is reported.
#[test]
fn cli_file_and_stdin_mix_is_supported() {
    let argv = ["sanitize", "test.txt", "-", "-s", "s.json"];
    let parsed = Cli::try_parse_from(argv).unwrap();
    assert!(has_stdin_input(&parsed));
    let file_count = file_inputs(&parsed).len();
    assert_eq!(file_count, 1);
}
/// Pins the format-name to file-extension table, including the names that
/// have no extension.
#[test]
fn format_to_ext_mapping() {
    let cases = [
        ("json", Some("json")),
        ("yaml", Some("yaml")),
        ("xml", Some("xml")),
        ("csv", Some("csv")),
        ("key-value", Some("conf")),
        ("text", None),
        ("unknown", None),
    ];
    for (format, expected_ext) in cases {
        assert_eq!(format_to_ext(format), expected_ext);
    }
}
/// Planning three inputs (text, JSON, zip) into one output directory yields
/// one output name per input that keeps the original extension.
#[test]
fn plan_multi_input_outputs_preserve_types() {
    let workspace = tempdir().unwrap();
    let input_dir = workspace.path().join("in");
    let out_dir = workspace.path().join("out");
    fs::create_dir_all(&input_dir).unwrap();

    // Create the three inputs on disk before planning.
    let txt = input_dir.join("test.txt");
    let json = input_dir.join("a.json");
    let zip = input_dir.join("b.zip");
    fs::write(&txt, "x").unwrap();
    fs::write(&json, "{}\n").unwrap();
    fs::write(&zip, "PK\x03\x04").unwrap();

    let argv = [
        "sanitize",
        txt.to_str().unwrap(),
        json.to_str().unwrap(),
        zip.to_str().unwrap(),
        "--output",
        out_dir.to_str().unwrap(),
    ];
    let cli = Cli::try_parse_from(argv).unwrap();

    // Only file targets carry an output path; stdin targets are ignored.
    let mut output_names: Vec<String> = Vec::new();
    for target in plan_input_targets(&cli).unwrap() {
        if let InputTarget::File { output, .. } = target {
            output_names.push(output.file_name().unwrap().to_string_lossy().to_string());
        }
    }
    // Sort so the comparison does not depend on planning order.
    output_names.sort();

    assert_eq!(
        output_names,
        ["a-sanitized.json", "b.sanitized.zip", "test-sanitized.txt"]
    );
}
/// Two inputs with the same file name from different directories must not
/// collide in the output directory: the second gets a numeric suffix.
#[test]
fn plan_multi_input_collision_adds_numeric_suffix() {
    let workspace = tempdir().unwrap();
    let first_dir = workspace.path().join("dir1");
    let second_dir = workspace.path().join("dir2");
    let out_dir = workspace.path().join("out");
    fs::create_dir_all(&first_dir).unwrap();
    fs::create_dir_all(&second_dir).unwrap();

    let first = first_dir.join("same.txt");
    let second = second_dir.join("same.txt");
    fs::write(&first, "x").unwrap();
    fs::write(&second, "y").unwrap();

    let argv = [
        "sanitize",
        first.to_str().unwrap(),
        second.to_str().unwrap(),
        "--output",
        out_dir.to_str().unwrap(),
    ];
    let cli = Cli::try_parse_from(argv).unwrap();

    // Only file targets carry an output path; stdin targets are ignored.
    let mut output_names: Vec<String> = Vec::new();
    for target in plan_input_targets(&cli).unwrap() {
        if let InputTarget::File { output, .. } = target {
            output_names.push(output.file_name().unwrap().to_string_lossy().to_string());
        }
    }

    assert!(output_names.iter().any(|name| name == "same-sanitized.txt"));
    assert!(output_names
        .iter()
        .any(|name| name == "same-sanitized-1.txt"));
}
/// Entries generated for a balanced preset (one domain, AWS, two formats)
/// convert to patterns without producing any warnings.
#[test]
fn guided_entries_compile_balanced() {
    let options = GuidedOptions {
        preset: GuidedPreset::Balanced,
        domains: vec!["corp.internal".into()],
        providers: vec![CloudProvider::Aws],
        exclude_noise_ids: true,
        formats: vec![GuidedFormat::YamlJson, GuidedFormat::Env],
    };
    let generated = build_guided_entries(&options);
    let (_patterns, warnings) = entries_to_patterns(&generated);
    assert!(warnings.is_empty());
}
/// Selecting the GCP provider adds both GCP-specific custom categories.
#[test]
fn guided_entries_include_gcp_custom_when_selected() {
    let options = GuidedOptions {
        preset: GuidedPreset::Aggressive,
        domains: Vec::new(),
        providers: vec![CloudProvider::Gcp],
        exclude_noise_ids: false,
        formats: Vec::new(),
    };
    let generated = build_guided_entries(&options);
    let has_category = |wanted: &str| generated.iter().any(|entry| entry.category == wanted);
    assert!(has_category("custom:gcp_service_account"));
    assert!(has_category("custom:gcp_resource"));
}
/// For every preset, each generated profile must name a processor that the
/// built-in registry can resolve.
#[test]
fn guided_profiles_use_known_processor_names() {
    use sanitize_engine::processor::ProcessorRegistry;

    let builtins = ProcessorRegistry::with_builtins();
    let presets = [
        GuidedPreset::Balanced,
        GuidedPreset::Aggressive,
        GuidedPreset::WebApp,
        GuidedPreset::Kubernetes,
        GuidedPreset::Database,
    ];
    for preset in presets {
        // Request every format so all profile kinds are generated.
        let options = GuidedOptions {
            preset,
            domains: Vec::new(),
            providers: Vec::new(),
            exclude_noise_ids: false,
            formats: vec![
                GuidedFormat::YamlJson,
                GuidedFormat::JsonLines,
                GuidedFormat::Env,
                GuidedFormat::Toml,
                GuidedFormat::IniConf,
            ],
        };
        for profile in &build_guided_profiles(&options) {
            assert!(
                builtins.get(&profile.processor).is_some(),
                "preset {:?}: unknown processor '{}'",
                preset,
                profile.processor
            );
        }
    }
}
/// Requesting all five guided formats yields six profiles, and none of them
/// may come back without field rules.
#[test]
fn guided_profiles_all_formats_produce_non_empty_field_rules() {
    let all_formats = vec![
        GuidedFormat::YamlJson,
        GuidedFormat::JsonLines,
        GuidedFormat::Env,
        GuidedFormat::Toml,
        GuidedFormat::IniConf,
    ];
    let options = GuidedOptions {
        preset: GuidedPreset::Balanced,
        domains: Vec::new(),
        providers: Vec::new(),
        exclude_noise_ids: false,
        formats: all_formats,
    };
    let generated = build_guided_profiles(&options);
    assert_eq!(
        generated.len(),
        6,
        "expected 6 profiles (yaml, json, jsonl, env, toml, ini)"
    );
    for profile in &generated {
        assert!(
            !profile.fields.is_empty(),
            "profile '{}' has no field rules",
            profile.processor
        );
    }
}
/// The Kubernetes preset's YAML profile must contain the `data.*` and
/// `stringData.*` field patterns.
#[test]
fn guided_profiles_k8s_adds_secret_data_fields() {
    let options = GuidedOptions {
        preset: GuidedPreset::Kubernetes,
        domains: Vec::new(),
        providers: Vec::new(),
        exclude_noise_ids: false,
        formats: vec![GuidedFormat::YamlJson],
    };
    let generated = build_guided_profiles(&options);
    let yaml = generated
        .iter()
        .find(|profile| profile.processor == "yaml")
        .unwrap();
    let field_patterns: Vec<&str> = yaml
        .fields
        .iter()
        .map(|rule| rule.pattern.as_str())
        .collect();
    assert!(
        field_patterns.contains(&"data.*"),
        "k8s yaml profile missing data.*"
    );
    assert!(
        field_patterns.contains(&"stringData.*"),
        "k8s yaml profile missing stringData.*"
    );
}
/// The JSON Lines profile carries the option `skip_invalid=true`.
#[test]
fn guided_profiles_jsonl_has_skip_invalid_option() {
    let options = GuidedOptions {
        preset: GuidedPreset::Balanced,
        domains: Vec::new(),
        providers: Vec::new(),
        exclude_noise_ids: false,
        formats: vec![GuidedFormat::JsonLines],
    };
    let generated = build_guided_profiles(&options);
    let jsonl_profile = generated
        .iter()
        .find(|profile| profile.processor == "jsonl")
        .unwrap();
    let skip_invalid = jsonl_profile
        .options
        .get("skip_invalid")
        .map(|value| value.as_str());
    assert_eq!(
        skip_invalid,
        Some("true"),
        "jsonl profile should have skip_invalid=true for mixed log files"
    );
}
/// The Kubernetes preset emits an entry labelled `container_id_short`.
#[test]
fn guided_entries_k8s_includes_container_id_short() {
    let options = GuidedOptions {
        preset: GuidedPreset::Kubernetes,
        domains: Vec::new(),
        providers: Vec::new(),
        exclude_noise_ids: false,
        formats: Vec::new(),
    };
    let generated = build_guided_entries(&options);
    let has_short_id = generated
        .iter()
        .any(|entry| entry.label.as_deref() == Some("container_id_short"));
    assert!(
        has_short_id,
        "k8s preset should include container_id_short"
    );
}
/// The balanced preset does not emit an entry labelled `container_id_short`.
#[test]
fn guided_entries_balanced_excludes_container_id_short() {
    let options = GuidedOptions {
        preset: GuidedPreset::Balanced,
        domains: Vec::new(),
        providers: Vec::new(),
        exclude_noise_ids: false,
        formats: Vec::new(),
    };
    let generated = build_guided_entries(&options);
    let has_short_id = generated
        .iter()
        .any(|entry| entry.label.as_deref() == Some("container_id_short"));
    assert!(
        !has_short_id,
        "balanced preset should not include container_id_short"
    );
}
/// Test fixture: a parsed [`Cli`] whose single input is `Cargo.toml` in the
/// current working directory, replacing the placeholder argument used for
/// parsing.
fn real_file_cli() -> Cli {
    let manifest = std::env::current_dir().unwrap().join("Cargo.toml");
    let mut cli = Cli::try_parse_from(["sanitize", "placeholder"]).unwrap();
    cli.input = vec![manifest];
    cli
}
/// An unknown `--format` value is rejected with an "invalid --format" error.
#[test]
fn validate_args_rejects_invalid_format() {
    let mut args = real_file_cli();
    args.format = Some("notaformat".into());
    let err = validate_args(&args).unwrap_err();
    let mentions_format = err.contains("invalid --format");
    assert!(mentions_format, "got: {err}");
}
/// An unknown `--log-format` value is rejected with an
/// "invalid --log-format" error.
#[test]
fn validate_args_rejects_invalid_log_format() {
    let mut args = real_file_cli();
    args.log_format = Some("xml".into());
    let err = validate_args(&args).unwrap_err();
    let mentions_log_format = err.contains("invalid --log-format");
    assert!(mentions_log_format, "got: {err}");
}
/// A thread count of zero is rejected with a "--threads must be" error.
#[test]
fn validate_args_rejects_zero_threads() {
    let mut args = real_file_cli();
    args.threads = Some(0);
    let err = validate_args(&args).unwrap_err();
    let mentions_threads = err.contains("--threads must be");
    assert!(mentions_threads, "got: {err}");
}
/// Setting `password` without `encrypted_secrets` is rejected, and the error
/// names the missing `--encrypted-secrets` flag.
#[test]
fn validate_args_rejects_password_without_encrypted_secrets() {
    let mut args = real_file_cli();
    args.password = true;
    let err = validate_args(&args).unwrap_err();
    let mentions_flag = err.contains("--encrypted-secrets is not set");
    assert!(mentions_flag, "got: {err}");
}
/// `--llm` combined with `--output` must pass validation.
///
/// The output path lives inside a per-test temporary directory rather than a
/// hard-coded `/tmp` location, so the test does not depend on a Unix-style
/// filesystem layout or on files left behind by other processes.
#[test]
fn validate_args_allows_llm_with_output() {
    // The guard must outlive validation; the directory is removed on drop.
    let scratch = tempdir().unwrap();
    let mut cli = real_file_cli();
    cli.llm = Some("troubleshoot".into());
    cli.output = Some(scratch.path().join("out.txt"));
    assert!(
        validate_args(&cli).is_ok(),
        "--llm + --output should be allowed for reference mode"
    );
}
/// `--llm` together with `--dry-run` is rejected with a dedicated message.
#[test]
fn validate_args_rejects_llm_with_dry_run() {
    let mut args = real_file_cli();
    args.dry_run = true;
    args.llm = Some("troubleshoot".into());
    let err = validate_args(&args).unwrap_err();
    let mentions_conflict = err.contains("--llm and --dry-run cannot be combined");
    assert!(mentions_conflict, "got: {err}");
}
/// An `--llm` value that is a path to a missing file is rejected with a
/// "does not exist" error.
#[test]
fn validate_args_rejects_llm_with_nonexistent_template_path() {
    let mut args = real_file_cli();
    args.llm = Some("/nonexistent/template.txt".into());
    let err = validate_args(&args).unwrap_err();
    let mentions_missing = err.contains("does not exist");
    assert!(mentions_missing, "got: {err}");
}
/// Each of the three built-in template names passes validation.
#[test]
fn validate_args_accepts_known_llm_templates() {
    let builtin_templates = ["troubleshoot", "review-config", "review-security"];
    for name in builtin_templates {
        let mut args = real_file_cli();
        args.llm = Some(name.into());
        let outcome = validate_args(&args);
        assert!(
            outcome.is_ok(),
            "built-in template '{}' should be accepted",
            name
        );
    }
}
/// The default pattern set is non-empty and contains the `email`,
/// `github_token` and `stripe_key` labels.
#[test]
fn build_default_patterns_returns_nonempty_set() {
    let defaults = build_default_patterns();
    assert!(
        !defaults.is_empty(),
        "built-in balanced patterns should not be empty"
    );
    let known_labels: Vec<_> = defaults.iter().map(|pattern| pattern.label()).collect();
    assert!(known_labels.contains(&"email"), "expected email pattern");
    assert!(
        known_labels.contains(&"github_token"),
        "expected github_token pattern"
    );
    assert!(
        known_labels.contains(&"stripe_key"),
        "expected stripe_key pattern"
    );
}
/// A pre-commit hook in scan mode carries the hook marker, the
/// `--dry-run --fail-on-match` flags, the `SANITIZE_SKIP` escape hatch, and a
/// `#!/bin/sh` shebang on the first line.
#[test]
fn hook_script_pre_commit_scan_contains_marker_and_fail_on_match() {
    let request = InstallHookArgs {
        hook: HookType::PreCommit,
        mode: HookMode::Scan,
        app: None,
        secrets_file: None,
        global: false,
        force: false,
        remove: false,
        dry_run: false,
    };
    let generated = build_hook_script(&request);

    let has_marker = generated.contains(HOOK_MARKER);
    assert!(has_marker, "marker must be present");
    let has_scan_flags = generated.contains("--dry-run --fail-on-match");
    assert!(
        has_scan_flags,
        "scan mode must use --dry-run --fail-on-match"
    );
    let has_escape_hatch = generated.contains("SANITIZE_SKIP");
    assert!(has_escape_hatch, "escape hatch must be present");
    let has_shebang = generated.starts_with("#!/bin/sh");
    assert!(has_shebang, "must start with POSIX shebang");
}
/// A pre-commit hook in sanitize mode passes `--output .`, re-stages files
/// with `git add`, and never passes `--dry-run`.
#[test]
fn hook_script_pre_commit_sanitize_uses_output_dot() {
    let request = InstallHookArgs {
        hook: HookType::PreCommit,
        mode: HookMode::Sanitize,
        app: None,
        secrets_file: None,
        global: false,
        force: false,
        remove: false,
        dry_run: false,
    };
    let generated = build_hook_script(&request);

    let writes_in_place = generated.contains("--output .");
    assert!(
        writes_in_place,
        "sanitize mode must write output in place"
    );
    let restages = generated.contains("git add");
    assert!(restages, "sanitize mode must re-stage files");
    let uses_dry_run = generated.contains("--dry-run");
    assert!(!uses_dry_run, "sanitize mode must not pass --dry-run");
}
/// A pre-push hook contains a `while ... read` loop and forwards the
/// selected app bundle as a single-quoted `--app` argument.
#[test]
fn hook_script_pre_push_contains_while_read_loop() {
    let request = InstallHookArgs {
        hook: HookType::PrePush,
        mode: HookMode::Scan,
        app: Some("gitlab".into()),
        secrets_file: None,
        global: false,
        force: false,
        remove: false,
        dry_run: false,
    };
    let generated = build_hook_script(&request);

    let reads_stdin = generated.contains("while IFS=' ' read -r");
    assert!(reads_stdin, "pre-push must iterate stdin");
    let forwards_app = generated.contains("--app 'gitlab'");
    assert!(forwards_app, "app bundle must be quoted and forwarded");
}
/// A secrets-file path containing a space is emitted single-quoted after
/// `-s` in the generated hook flags.
#[test]
fn hook_flags_shell_quotes_paths_with_spaces() {
    let spaced_path = PathBuf::from("my secrets/file.yaml");
    let request = InstallHookArgs {
        hook: HookType::PreCommit,
        mode: HookMode::Scan,
        app: None,
        secrets_file: Some(spaced_path),
        global: false,
        force: false,
        remove: false,
        dry_run: false,
    };
    let flags = build_hook_flags(&request);
    let is_quoted = flags.contains("-s 'my secrets/file.yaml'");
    assert!(
        is_quoted,
        "space in path must be single-quoted: got {flags}"
    );
}
/// `sh_quote` wraps its input in single quotes and rewrites an embedded
/// single quote as `'\''`.
#[test]
fn sh_quote_escapes_embedded_single_quotes() {
    let cases = [
        ("it's a test", "'it'\\''s a test'"),
        ("normal", "'normal'"),
        ("a b c", "'a b c'"),
    ];
    for (raw, quoted) in cases {
        assert_eq!(sh_quote(raw), quoted);
    }
}
/// When the hook file contains nothing but the generated script, removing
/// the hook deletes the file.
#[test]
fn remove_hook_deletes_file_when_entirely_ours() {
    let workspace = tempdir().unwrap();
    let hook_file = workspace.path().join("pre-commit");
    let generated = hook_script_pre_commit_scan("--use-default");
    fs::write(&hook_file, &generated).unwrap();

    // Mark the hook file executable (mode 0o755) on Unix.
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let executable = fs::Permissions::from_mode(0o755);
        fs::set_permissions(&hook_file, executable).unwrap();
    }

    remove_hook(&hook_file, "pre-commit").expect("remove should succeed");

    let still_there = hook_file.exists();
    assert!(
        !still_there,
        "file should be deleted when it is entirely our hook"
    );
}
/// When the generated script is appended to someone else's hook, removal
/// strips only the generated block and keeps the foreign content.
#[test]
fn remove_hook_strips_block_from_composite_hook() {
    let workspace = tempdir().unwrap();
    let hook_file = workspace.path().join("pre-commit");

    // Foreign hook content first, generated block appended after it.
    let foreign = "#!/bin/sh\n# other team's linter\nnpm run lint\n";
    let generated = hook_script_pre_commit_scan("--use-default");
    let composite = format!("{foreign}{generated}");
    fs::write(&hook_file, composite).unwrap();

    remove_hook(&hook_file, "pre-commit").expect("remove should succeed");

    assert!(
        hook_file.exists(),
        "file should remain when other content is present"
    );
    let leftover = fs::read_to_string(&hook_file).unwrap();
    let keeps_foreign = leftover.contains("npm run lint");
    assert!(keeps_foreign, "other hook content must be preserved");
    let keeps_marker = leftover.contains(HOOK_MARKER);
    assert!(!keeps_marker, "our marker must be gone");
}
/// A hook file that lacks the generated block must not be removed; the call
/// returns an error instead.
#[test]
fn remove_hook_rejects_unrecognised_hook() {
    let workspace = tempdir().unwrap();
    let hook_file = workspace.path().join("pre-commit");
    fs::write(&hook_file, "#!/bin/sh\necho hello\n").unwrap();

    let outcome = remove_hook(&hook_file, "pre-commit");
    assert!(
        outcome.is_err(),
        "should refuse to remove a hook we didn't install"
    );
}
}