use crate::scanner_builder::common_allow_patterns;
use sanitize_engine::processor::FileTypeProfile;
use sanitize_engine::secrets::SecretEntry;
use sanitize_engine::{Category, FieldRule};
use std::collections::HashSet;
use std::io::{self, Write};
pub(crate) fn parse_template_preset(s: &str) -> Result<TemplatePreset, String> {
match s {
"generic" => Ok(TemplatePreset::Generic),
"web" => Ok(TemplatePreset::Web),
"k8s" | "kubernetes" => Ok(TemplatePreset::K8s),
"database" | "db" => Ok(TemplatePreset::Database),
"aws" => Ok(TemplatePreset::Aws),
other => Err(format!(
"unknown preset '{}' (choices: generic, web, k8s, database, aws)",
other
)),
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum TemplatePreset {
Generic,
Web,
K8s,
Database,
Aws,
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum GuidedPreset {
Balanced,
Aggressive,
WebApp,
Kubernetes,
Database,
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) enum CloudProvider {
Aws,
Azure,
Gcp,
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) enum GuidedFormat {
YamlJson,
JsonLines,
Env,
Toml,
IniConf,
}
#[derive(Clone, Debug)]
pub(crate) struct GuidedOptions {
pub(crate) preset: GuidedPreset,
pub(crate) domains: Vec<String>,
pub(crate) providers: Vec<CloudProvider>,
pub(crate) exclude_noise_ids: bool,
pub(crate) formats: Vec<GuidedFormat>,
}
pub(crate) fn prompt_line(prompt: &str) -> Result<String, String> {
let mut stdout = io::stdout();
write!(stdout, "{}", prompt).map_err(|e| format!("failed to write prompt: {e}"))?;
stdout
.flush()
.map_err(|e| format!("failed to flush prompt: {e}"))?;
let mut input = String::new();
io::stdin()
.read_line(&mut input)
.map_err(|e| format!("failed to read input: {e}"))?;
Ok(input.trim().to_string())
}
pub(crate) fn prompt_yes_no(prompt: &str, default_yes: bool) -> Result<bool, String> {
let suffix = if default_yes { "[Y/n]" } else { "[y/N]" };
loop {
let answer = prompt_line(&format!("{} {} ", prompt, suffix))?;
if answer.is_empty() {
return Ok(default_yes);
}
match answer.to_ascii_lowercase().as_str() {
"y" | "yes" => return Ok(true),
"n" | "no" => return Ok(false),
_ => eprintln!("Please answer 'y' or 'n'."),
}
}
}
fn sanitize_domain(input: &str) -> Option<String> {
let trimmed = input.trim().trim_matches('.').to_ascii_lowercase();
if trimmed.is_empty() {
return None;
}
if !trimmed
.bytes()
.all(|b| b.is_ascii_alphanumeric() || b == b'.' || b == b'-')
{
return None;
}
Some(trimmed)
}
pub(crate) fn prompt_domains() -> Result<Vec<String>, String> {
let raw = prompt_line(
"Company domains (comma-separated, up to 3, optional; e.g. corp.internal,example.com): ",
)?;
if raw.trim().is_empty() {
return Ok(vec![]);
}
let mut seen = HashSet::new();
let mut out = Vec::new();
for item in raw.split(',') {
let Some(domain) = sanitize_domain(item) else {
return Err(format!("invalid domain value: '{}'", item.trim()));
};
if seen.insert(domain.clone()) {
out.push(domain);
}
}
if out.len() > 3 {
return Err("please provide at most 3 domains".into());
}
Ok(out)
}
pub(crate) fn prompt_cloud_providers() -> Result<Vec<CloudProvider>, String> {
eprintln!("Cloud providers in scope:");
eprintln!(" 1) AWS");
eprintln!(" 2) Azure");
eprintln!(" 3) GCP");
eprintln!(" 4) None");
let raw = prompt_line("Select one or more (comma-separated numbers, default: 4): ")?;
if raw.trim().is_empty() || raw.trim() == "4" {
return Ok(vec![]);
}
let mut selected = Vec::new();
let mut seen = HashSet::new();
for token in raw.split(',').map(|s| s.trim()) {
let provider = match token {
"1" => CloudProvider::Aws,
"2" => CloudProvider::Azure,
"3" => CloudProvider::Gcp,
"4" => continue,
_ => return Err(format!("invalid selection: '{token}'")),
};
if seen.insert(provider) {
selected.push(provider);
}
}
Ok(selected)
}
pub(crate) const ALL_FORMATS: &[GuidedFormat] = &[
GuidedFormat::YamlJson,
GuidedFormat::JsonLines,
GuidedFormat::Env,
GuidedFormat::Toml,
GuidedFormat::IniConf,
];
pub(crate) fn prompt_formats() -> Result<Vec<GuidedFormat>, String> {
eprintln!("Structured file formats to include in profile (controls field-level redaction):");
eprintln!(" 1) YAML / JSON — k8s manifests, docker-compose, app configs");
eprintln!(" 2) JSON Lines — NDJSON structured logs (.jsonl, .ndjson)");
eprintln!(" 3) .env files — twelve-factor app secrets, CI variables");
eprintln!(" 4) TOML — Rust, Hugo, and other TOML configs");
eprintln!(" 5) INI / conf — system services, databases, legacy apps");
eprintln!(" 6) All of the above (default)");
eprintln!(" 7) None — secrets file only, no profile");
let raw = prompt_line("Select one or more (comma-separated, default: 6): ")?;
if raw.trim().is_empty() || raw.trim() == "6" {
return Ok(ALL_FORMATS.to_vec());
}
if raw.trim() == "7" {
return Ok(vec![]);
}
let mut selected = Vec::new();
let mut seen = HashSet::new();
for token in raw.split(',').map(|s| s.trim()) {
let fmt = match token {
"1" => GuidedFormat::YamlJson,
"2" => GuidedFormat::JsonLines,
"3" => GuidedFormat::Env,
"4" => GuidedFormat::Toml,
"5" => GuidedFormat::IniConf,
"6" => return Ok(ALL_FORMATS.to_vec()),
"7" => return Ok(vec![]),
_ => return Err(format!("invalid selection: '{token}'")),
};
if seen.insert(fmt) {
selected.push(fmt);
}
}
Ok(selected)
}
fn make_regex_entry(pattern: &str, category: &str, label: &str) -> SecretEntry {
SecretEntry {
pattern: pattern.to_string(),
kind: "regex".to_string(),
category: category.to_string(),
label: Some(label.to_string()),
values: vec![],
min_length: None,
max_length: None,
threshold: None,
charset: None,
}
}
pub(crate) fn build_guided_entries(opts: &GuidedOptions) -> Vec<SecretEntry> {
let mut entries = vec![
make_regex_entry(
r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
"email",
"email",
),
make_regex_entry(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "ipv4", "ipv4"),
make_regex_entry(
r"\b(?:[0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}\b",
"ipv6",
"ipv6_full",
),
make_regex_entry(
r"\b(?:[0-9A-Fa-f]{1,4}:){1,6}:[0-9A-Fa-f]{0,4}\b|\b::(?:[0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4}\b",
"ipv6",
"ipv6_compressed",
),
make_regex_entry(
r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b",
"uuid",
"uuid",
),
make_regex_entry(
r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b",
"jwt",
"jwt",
),
make_regex_entry(r#"https?://[^\s"'<>;]+"#, "url", "url"),
make_regex_entry(
r#"[a-z][a-z0-9+.-]+://[^:@\s]{1,128}:[^@\s]{1,128}@[^\s"'<>]+"#,
"url",
"credential_url",
),
make_regex_entry(
r"-----BEGIN (?:RSA |EC |OPENSSH |)PRIVATE KEY-----",
"auth_token",
"private_key_header",
),
make_regex_entry(
r#"(?i)(?:api_key|api_secret|access_token|client_secret|private_key|secret_key|auth_key|signing_key|jwt_secret|jwt_key)[\s:="']+[A-Za-z0-9._~+/=-]{16,}"#,
"auth_token",
"secret_kv",
),
make_regex_entry(
r#"(?i)(?:password|passwd|pwd)[\s:="']+[^\s"']{6,}"#,
"custom:password",
"password_kv",
),
make_regex_entry(
r"/(?:home|Users)/[A-Za-z0-9_.-]+",
"file_path",
"user_home_path",
),
make_regex_entry(r"\bsha256:[a-f0-9]{64}\b", "container_id", "image_digest"),
make_regex_entry(
r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b",
"mac_address",
"mac_address",
),
make_regex_entry(
r"\b(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9]{36}\b",
"auth_token",
"github_token",
),
make_regex_entry(
r"\bgithub_pat_[A-Za-z0-9_]{82}\b",
"auth_token",
"github_pat_fine_grained",
),
make_regex_entry(r"\bAIza[A-Za-z0-9_-]{35}\b", "auth_token", "gcp_api_key"),
make_regex_entry(
r"\b(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}\b",
"auth_token",
"aws_access_key_id",
),
make_regex_entry(
r"\bsk-(?:proj-|svcacct-)?[A-Za-z0-9_-]{40,}\b",
"auth_token",
"openai_api_key",
),
make_regex_entry(
r"\bsk-ant-[A-Za-z0-9_-]{93,}\b",
"auth_token",
"anthropic_api_key",
),
make_regex_entry(
r"\bxox[bpars]-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9-]*\b",
"auth_token",
"slack_token",
),
make_regex_entry(r"\bnpm_[A-Za-z0-9]{36}\b", "auth_token", "npm_token"),
make_regex_entry(r"\bhf_[A-Za-z0-9]{34}\b", "auth_token", "huggingface_token"),
make_regex_entry(
r"\b(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{24,}\b",
"auth_token",
"stripe_key",
),
make_regex_entry(r"\bglpat-[A-Za-z0-9_-]{20}\b", "auth_token", "gitlab_token"),
make_regex_entry(
r"\bSG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}\b",
"auth_token",
"sendgrid_api_key",
),
make_regex_entry(r"\bAC[a-f0-9]{32}\b", "auth_token", "twilio_account_sid"),
];
if matches!(opts.preset, GuidedPreset::Aggressive) {
entries.push(make_regex_entry(
r"\b(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)\.){2,}(?:[a-zA-Z]{2,63})\b",
"hostname",
"hostname",
));
entries.push(make_regex_entry(
r"\b[a-f0-9]{12}\b",
"container_id",
"container_id_short",
));
}
if matches!(
opts.preset,
GuidedPreset::Aggressive
| GuidedPreset::WebApp
| GuidedPreset::Kubernetes
| GuidedPreset::Database
) {
entries.push(make_regex_entry(
r"(?i)\bBearer\s+[A-Za-z0-9._~+/=-]{16,}\b",
"auth_token",
"bearer_token",
));
entries.push(make_regex_entry(
r#"(?i)\bauthorization[\s:="']+[A-Za-z0-9._~+/=-]{16,}\b"#,
"auth_token",
"authorization_kv",
));
entries.push(make_regex_entry(
r"\b[A-Za-z0-9_\-]{32,}\b",
"custom:high_entropy_token",
"high_entropy_token",
));
}
if matches!(opts.preset, GuidedPreset::WebApp) {
entries.push(make_regex_entry(
r"(?i)\bsess(?:ion)?[_-]?(?:id|token|key)[\s:=]+[A-Za-z0-9._~+/=-]{8,}\b",
"auth_token",
"session_id",
));
entries.push(make_regex_entry(
r"(?i)(?:refresh|access)[_-]?token[\s:=]+[A-Za-z0-9._~+/=-]{16,}",
"auth_token",
"oauth_token",
));
}
if matches!(opts.preset, GuidedPreset::Kubernetes) {
entries.push(make_regex_entry(
r"(?i)token[\s:]+[A-Za-z0-9._~+/=-]{20,}",
"auth_token",
"k8s_token",
));
entries.push(make_regex_entry(
r"\bnamespace[:\s]+[a-z][a-z0-9-]{2,62}\b",
"custom:k8s_namespace",
"k8s_namespace",
));
entries.push(make_regex_entry(
r"\b[a-f0-9]{64}\b",
"container_id",
"k8s_image_sha",
));
entries.push(make_regex_entry(
r"\b[a-f0-9]{12}\b",
"container_id",
"container_id_short",
));
}
if matches!(opts.preset, GuidedPreset::Database) {
entries.push(make_regex_entry(
r#"(?i)(?:postgres|mysql|mongodb|redis|amqp|jdbc:[^:]+)://[^\s"'>]+"#,
"url",
"db_connection_string",
));
entries.push(make_regex_entry(
r#"(?i)(?:user|username|login)[\s:="']+[^\s"']{3,}"#,
"name",
"db_username",
));
}
for domain in &opts.domains {
let escaped = regex::escape(domain);
entries.push(make_regex_entry(
&format!(r"[A-Za-z0-9._%+-]+@{}", escaped),
"email",
&format!("email_{}", domain.replace('.', "_")),
));
entries.push(make_regex_entry(
&format!(r"\b(?:[A-Za-z0-9-]+\.)*{}\b", escaped),
"hostname",
&format!("host_{}", domain.replace('.', "_")),
));
}
let has_aws = opts.providers.contains(&CloudProvider::Aws);
let has_azure = opts.providers.contains(&CloudProvider::Azure);
let has_gcp = opts.providers.contains(&CloudProvider::Gcp);
if has_aws {
entries.push(make_regex_entry(
r"\barn:aws:[^\s]+\b",
"aws_arn",
"aws_arn",
));
entries.push(make_regex_entry(
r#"(?i)(?:aws_secret_access_key|aws_secret_key|aws_secret)[\s:="']+[A-Za-z0-9/+=]{40}\b"#,
"auth_token",
"aws_secret_access_key",
));
entries.push(make_regex_entry(
r"\bi-[0-9a-f]{8,17}\b",
"container_id",
"ec2_instance_id",
));
}
if has_azure {
entries.push(make_regex_entry(
r"/subscriptions/[0-9a-fA-F-]{8,}/resourceGroups/[^\s/]+(?:/providers/[^\s]+)?",
"azure_resource_id",
"azure_resource_id",
));
}
if has_gcp {
entries.push(make_regex_entry(
r"\b[a-z0-9-]+@[a-z0-9-]+\.iam\.gserviceaccount\.com\b",
"custom:gcp_service_account",
"gcp_service_account",
));
entries.push(make_regex_entry(
r"\bprojects/[a-z][a-z0-9-]{4,30}/[A-Za-z0-9/_-]+\b",
"custom:gcp_resource",
"gcp_resource",
));
}
if opts.exclude_noise_ids {
entries.retain(|entry| entry.label.as_deref() != Some("high_entropy_token"));
}
entries.push(SecretEntry {
pattern: String::new(),
kind: "allow".into(),
category: String::new(),
label: Some("common_safe_values".into()),
values: common_allow_patterns(),
min_length: None,
max_length: None,
threshold: None,
charset: None,
});
entries
}
pub(crate) const PROFILE_HEADER: &str = "\
# =============================================================================
# sanitize profile — structured field rules
# =============================================================================
#
# PURPOSE
# This file tells sanitize which fields to redact inside structured files
# (YAML, JSON, .env, TOML, INI, NDJSON) before sending to an LLM or
# external service. It works alongside the secrets file: the secrets file
# covers free-text patterns; this file covers key=value fields.
#
# HOW TO USE
# sanitize input/ -s secrets.yaml --profile profile.yaml -o output/
#
# SAFE TO COMMIT
# This file contains no secrets — only field name patterns. Commit it
# alongside your sanitize secrets file (which you should encrypt).
#
# FIELD REFERENCE
# processor string Required. Processor name: yaml, json, jsonl, env, toml, ini.
# extensions list File extensions this profile applies to.
# fields list Field rules: pattern (glob) + category.
# options map Processor-specific options (e.g. compact, skip_invalid).
#
# WARNING: REVIEW OUTPUT BEFORE SENDING TO AN LLM.
# Field rules redact exact keys — add regex patterns in secrets.yaml
# to catch values that appear outside structured fields.
# =============================================================================
";
pub(crate) fn build_guided_profiles(opts: &GuidedOptions) -> Vec<FileTypeProfile> {
let credential_fields = || -> Vec<FieldRule> {
vec![
FieldRule::new("*.password").with_category(Category::Custom("password".into())),
FieldRule::new("*.passwd").with_category(Category::Custom("password".into())),
FieldRule::new("*.secret").with_category(Category::AuthToken),
FieldRule::new("*.secret_key").with_category(Category::AuthToken),
FieldRule::new("*.api_key").with_category(Category::AuthToken),
FieldRule::new("*.api_token").with_category(Category::AuthToken),
FieldRule::new("*.access_token").with_category(Category::AuthToken),
FieldRule::new("*.auth_token").with_category(Category::AuthToken),
FieldRule::new("*.token").with_category(Category::AuthToken),
FieldRule::new("*.private_key").with_category(Category::AuthToken),
FieldRule::new("*.connection_string").with_category(Category::Url),
FieldRule::new("*.database_url").with_category(Category::Url),
FieldRule::new("*.dsn").with_category(Category::Url),
]
};
let mut profiles = Vec::new();
for fmt in &opts.formats {
match fmt {
GuidedFormat::YamlJson => {
let mut yaml_fields = credential_fields();
yaml_fields.push(FieldRule::new("*.email").with_category(Category::Email));
yaml_fields.push(FieldRule::new("*.username").with_category(Category::Name));
if matches!(opts.preset, GuidedPreset::Kubernetes) {
yaml_fields.push(FieldRule::new("data.*").with_category(Category::AuthToken));
yaml_fields
.push(FieldRule::new("stringData.*").with_category(Category::AuthToken));
}
profiles.push(
FileTypeProfile::new("yaml", yaml_fields)
.with_extension(".yaml")
.with_extension(".yml"),
);
let mut json_fields = credential_fields();
json_fields.push(FieldRule::new("*.email").with_category(Category::Email));
json_fields.push(FieldRule::new("*.username").with_category(Category::Name));
json_fields.push(FieldRule::new("*.ip").with_category(Category::IpV4));
profiles.push(
FileTypeProfile::new("json", json_fields)
.with_extension(".json")
.with_option("compact", "true"),
);
}
GuidedFormat::JsonLines => {
let mut fields = credential_fields();
fields.push(FieldRule::new("*.email").with_category(Category::Email));
fields.push(FieldRule::new("*.user").with_category(Category::Name));
fields.push(FieldRule::new("*.username").with_category(Category::Name));
fields.push(FieldRule::new("*.ip").with_category(Category::IpV4));
fields.push(FieldRule::new("*.client_ip").with_category(Category::IpV4));
fields.push(FieldRule::new("*.remote_addr").with_category(Category::IpV4));
fields.push(FieldRule::new("*.host").with_category(Category::Hostname));
profiles.push(
FileTypeProfile::new("jsonl", fields)
.with_extension(".jsonl")
.with_extension(".ndjson")
.with_option("skip_invalid", "true"),
);
}
GuidedFormat::Env => {
let fields = vec![
FieldRule::new("*_PASSWORD").with_category(Category::Custom("password".into())),
FieldRule::new("*_PASSWD").with_category(Category::Custom("password".into())),
FieldRule::new("*_SECRET").with_category(Category::AuthToken),
FieldRule::new("*_KEY").with_category(Category::AuthToken),
FieldRule::new("*_TOKEN").with_category(Category::AuthToken),
FieldRule::new("*_DSN").with_category(Category::Url),
FieldRule::new("*_URL").with_category(Category::Url),
FieldRule::new("DATABASE_URL").with_category(Category::Url),
FieldRule::new("REDIS_URL").with_category(Category::Url),
FieldRule::new("*_EMAIL").with_category(Category::Email),
FieldRule::new("*_USER").with_category(Category::Name),
FieldRule::new("*_USERNAME").with_category(Category::Name),
];
profiles.push(FileTypeProfile::new("env", fields).with_extension(".env"));
}
GuidedFormat::Toml => {
let mut fields = credential_fields();
fields.push(FieldRule::new("*.email").with_category(Category::Email));
fields.push(FieldRule::new("*.username").with_category(Category::Name));
profiles.push(FileTypeProfile::new("toml", fields).with_extension(".toml"));
}
GuidedFormat::IniConf => {
let fields = vec![
FieldRule::new("*.password").with_category(Category::Custom("password".into())),
FieldRule::new("*.passwd").with_category(Category::Custom("password".into())),
FieldRule::new("*.secret").with_category(Category::AuthToken),
FieldRule::new("*.token").with_category(Category::AuthToken),
FieldRule::new("*.api_key").with_category(Category::AuthToken),
FieldRule::new("*.email").with_category(Category::Email),
FieldRule::new("*.username").with_category(Category::Name),
FieldRule::new("*.user").with_category(Category::Name),
];
profiles.push(
FileTypeProfile::new("ini", fields)
.with_extension(".ini")
.with_extension(".conf")
.with_extension(".cfg"),
);
}
}
}
profiles
}
pub(crate) const TEMPLATE_HEADER: &str = "\
# =============================================================================
# sanitize secrets template
# =============================================================================
#
# PURPOSE
# This file tells sanitize which patterns to detect and replace before
# you send logs, configs, or other data to an LLM or external service.
#
# RELIABILITY FIRST
# Every replacement preserves the original byte length so structured
# formats (JSON, YAML, TOML, …) remain parseable after sanitization.
# Run `sanitize --force-text` to bypass structured processing entirely.
#
# HOW TO USE
# 1. Edit this file to add your own patterns and literals.
# 2. Encrypt: sanitize encrypt this-file.yaml this-file.yaml.enc
# 3. Sanitize: sanitize input.log -s this-file.yaml.enc -o output.log
#
# FIELD REFERENCE
# pattern string Required. Regex or literal to match.
# kind string \"regex\" (default) or \"literal\".
# category string Controls the replacement style. See docs/categories.md.
# label string Optional. Human-readable name shown in reports.
#
# WARNING: REVIEW OUTPUT BEFORE SENDING TO AN LLM.
# No automated tool catches everything — always spot-check.
# =============================================================================
";
pub(crate) fn template_body_generic() -> &'static str {
r#"secrets:
# --- Tokens & credentials ---
- pattern: '(?i)\b(?:bearer|token|api[_-]?key|secret)[\s:=]+[A-Za-z0-9._~+/=-]{16,}\b'
kind: regex
category: auth_token
label: auth_token_context
- pattern: '\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b'
kind: regex
category: jwt
label: jwt
# --- Network identifiers ---
- pattern: '\b(?:\d{1,3}\.){3}\d{1,3}\b'
kind: regex
category: ipv4
label: ipv4
- pattern: '\b(?:[0-9A-Fa-f]{1,4}:){2,7}[0-9A-Fa-f]{1,4}\b'
kind: regex
category: ipv6
label: ipv6
- pattern: '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
kind: regex
category: email
label: email
- pattern: '\b(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)\.)+(?:[a-zA-Z]{2,63})\b'
kind: regex
category: hostname
label: hostname
- pattern: 'https?://[^\s"''<>]+'
kind: regex
category: url
label: url
# --- Identifiers ---
- pattern: '\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b'
kind: regex
category: uuid
label: uuid
- pattern: '\b[a-f0-9]{12,64}\b'
kind: regex
category: container_id
label: container_id
# --- Add your own literals below ---
# - pattern: 'my-internal-hostname.corp.example.com'
# kind: literal
# category: hostname
# label: corp_hostname
"#
}
pub(crate) fn template_body_web() -> &'static str {
r#"secrets:
# --- JWTs and session tokens ---
- pattern: '\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b'
kind: regex
category: jwt
label: jwt
- pattern: '(?i)\bsess(?:ion)?[_-]?(?:id|token|key)[\s:=]+[A-Za-z0-9._~+/=-]{8,}\b'
kind: regex
category: auth_token
label: session_id
- pattern: '(?i)(?:refresh|access)[_-]?token[\s:=]+[A-Za-z0-9._~+/=-]{16,}'
kind: regex
category: auth_token
label: oauth_token
- pattern: '(?i)\b(?:bearer|authorization)[\s:]+[A-Za-z0-9._~+/=-]{16,}\b'
kind: regex
category: auth_token
label: bearer_token
# --- User identifiers ---
- pattern: '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
kind: regex
category: email
label: email
- pattern: '\b(?:\d{1,3}\.){3}\d{1,3}\b'
kind: regex
category: ipv4
label: client_ip
# --- URLs (may contain query params with tokens) ---
- pattern: 'https?://[^\s"''<>]+'
kind: regex
category: url
label: url
# --- Add domain-specific literals ---
# - pattern: 'users.myapp.com'
# kind: literal
# category: hostname
# label: app_domain
"#
}
pub(crate) fn template_body_k8s() -> &'static str {
r#"secrets:
# --- Service account tokens (base64, JWT) ---
- pattern: '\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b'
kind: regex
category: jwt
label: k8s_service_account_jwt
- pattern: '(?i)token[\s:]+[A-Za-z0-9._~+/=-]{20,}'
kind: regex
category: auth_token
label: k8s_token
# --- Namespace and pod names ---
- pattern: '\bnamespace[\s:]+[a-z][a-z0-9-]{2,62}\b'
kind: regex
category: custom:k8s_namespace
label: k8s_namespace
# --- IPs assigned to pods and services ---
- pattern: '\b(?:\d{1,3}\.){3}\d{1,3}\b'
kind: regex
category: ipv4
label: pod_or_svc_ip
# --- Cluster hostnames / DNS names ---
- pattern: '\b(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)\.)+(?:[a-zA-Z]{2,63})\b'
kind: regex
category: hostname
label: k8s_dns
# --- UUIDs (pod IDs, request IDs, etc.) ---
- pattern: '\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b'
kind: regex
category: uuid
label: uid
# --- Docker / container image digests ---
- pattern: '\b[a-f0-9]{64}\b'
kind: regex
category: container_id
label: image_digest
# --- Add your cluster name as a literal ---
# - pattern: 'prod-cluster-1'
# kind: literal
# category: hostname
# label: cluster_name
"#
}
pub(crate) fn template_body_database() -> &'static str {
r#"secrets:
# --- Connection strings (contain embedded credentials) ---
- pattern: '(?i)(?:postgres|mysql|mongodb|redis|amqp|jdbc:[^:]+)://[^\s"''>]+'
kind: regex
category: url
label: db_connection_string
# --- Inline passwords / secrets ---
- pattern: '(?i)(?:password|passwd|pwd)[\s:=]+[^\s"'']{6,}'
kind: regex
category: custom:db_password
label: db_password
- pattern: '(?i)(?:user|username|login)[\s:=]+[^\s"'']{3,}'
kind: regex
category: name
label: db_username
# --- Host / IP for database servers ---
- pattern: '\b(?:\d{1,3}\.){3}\d{1,3}\b'
kind: regex
category: ipv4
label: db_host_ip
- pattern: '\b(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)\.)+(?:[a-zA-Z]{2,63})\b'
kind: regex
category: hostname
label: db_hostname
# --- TLS certificate fingerprints / hashes ---
- pattern: '\b[a-f0-9]{40}\b'
kind: regex
category: container_id
label: cert_fingerprint
# --- Add database-specific literals ---
# - pattern: 'prod-db.internal.example.com'
# kind: literal
# category: hostname
# label: prod_db_host
"#
}
pub(crate) fn template_body_aws() -> &'static str {
r#"secrets:
# --- AWS access key IDs ---
- pattern: '\b(?:AKIA|ASIA)[A-Z0-9]{16}\b'
kind: regex
category: auth_token
label: aws_access_key_id
# --- ARNs (may reveal account IDs, resource names) ---
- pattern: '\barn:aws:[^\s]+'
kind: regex
category: aws_arn
label: aws_arn
# --- AWS account IDs (12-digit numbers in ARNs or standalone) ---
- pattern: '\b\d{12}\b'
kind: regex
category: custom:aws_account_id
label: aws_account_id
# --- S3 bucket names and keys in URLs ---
- pattern: 'https://s3(?:[.-][a-z0-9-]+)?\.amazonaws\.com/[^\s"''<>]+'
kind: regex
category: url
label: s3_url
# --- EC2 / ECS instance IDs ---
- pattern: '\bi-[0-9a-f]{8,17}\b'
kind: regex
category: container_id
label: ec2_instance_id
# --- IPs for EC2 instances ---
- pattern: '\b(?:\d{1,3}\.){3}\d{1,3}\b'
kind: regex
category: ipv4
label: ec2_ip
# --- Emails in IAM roles, SES, etc. ---
- pattern: '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
kind: regex
category: email
label: email
# --- Add your AWS account ID as a literal for exact matching ---
# - pattern: '123456789012'
# kind: literal
# category: custom:aws_account_id
# label: my_account_id
"#
}