use std::borrow::Cow;
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::warn;
use crate::builtin;
use crate::{PatternMetadata, SecretPattern, Severity};
/// Directory name (`patterns.d`) associated with user pattern files.
///
/// NOTE(review): this constant is not referenced elsewhere in this file; presumably
/// callers join it onto a configuration directory to build the path handed to
/// `Catalogue::load` — confirm against the call sites.
pub const USER_PATTERNS_SUBDIR: &str = "patterns.d";
/// Deserialized contents of one user patterns TOML file.
///
/// Every `[[pattern]]` table in the file becomes one [`UserPatternEntry`]. A file
/// without any `pattern` key deserializes to an empty list.
#[derive(Debug, Default, Clone, Deserialize, Serialize)]
pub struct UserPatternFile {
/// Declared entries, stored under the TOML key `pattern` (not `patterns`).
#[serde(default, rename = "pattern")]
pub patterns: Vec<UserPatternEntry>,
}
/// One raw, not-yet-validated `[[pattern]]` table from a user patterns file.
///
/// Unknown keys are rejected at deserialization time (`deny_unknown_fields`).
/// The four optional fields at the end are copied into a `PatternMetadata` by
/// `Catalogue::load` when at least one of them is set.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(deny_unknown_fields)]
pub struct UserPatternEntry {
/// Pattern identifier; `Catalogue::load` requires it to match `[a-z][a-z0-9-]*`.
pub id: String,
/// Human-readable name of the pattern.
pub display_name: String,
/// Regular expression source; compiled by `Catalogue::load`.
pub format_regex: String,
/// Severity assigned to the pattern.
pub severity: Severity,
/// Optional provider identifier; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub provider_id: Option<String>,
/// Optional retrieval URL template; omitted from serialized output when `None`.
/// NOTE(review): the template syntax is not visible in this file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub retrieval_url_template: Option<String>,
/// Optional default expiry, in days; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub default_expiry_days: Option<u32>,
/// Optional list of scope hints; omitted from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub scopes_hint: Vec<String>,
}
/// A validated user pattern, as produced by `Catalogue::load`.
///
/// Unlike [`UserPatternEntry`], the regular expression is already compiled and
/// the id has passed the kebab-case check.
#[derive(Debug)]
pub struct UserPattern {
// Identifier, unique among the user patterns of one catalogue.
id: String,
// Human-readable name.
display_name: String,
// Severity taken verbatim from the entry.
severity: Severity,
// Compiled form of the entry's `format_regex`.
regex: Regex,
// `None` when the entry set none of the optional metadata fields.
metadata: Option<PatternMetadata>,
}
impl UserPattern {
    /// Returns the identifier this pattern was declared with.
    ///
    /// Inherent twin of `SecretPattern::id`, usable without importing the trait.
    pub fn id(&self) -> &str {
        self.id.as_str()
    }
}
/// Exposes a user pattern through the same trait the built-in patterns are
/// reached by, so both kinds can be handled as `&dyn SecretPattern`.
impl SecretPattern for UserPattern {
    /// Identifier of the pattern.
    fn id(&self) -> &str {
        self.id.as_str()
    }

    /// Human-readable name of the pattern.
    fn display_name(&self) -> &str {
        self.display_name.as_str()
    }

    /// Compiled regular expression for the pattern.
    fn format_regex(&self) -> &Regex {
        let compiled = &self.regex;
        compiled
    }

    /// Severity assigned to the pattern.
    fn severity(&self) -> Severity {
        let level = self.severity;
        level
    }

    /// Optional metadata; `None` when the pattern carries none.
    fn metadata(&self) -> Option<&PatternMetadata> {
        Option::as_ref(&self.metadata)
    }
}
/// Fatal problems that make `Catalogue::load` return `Err`.
#[derive(Debug, Error)]
pub enum LoadError {
/// The patterns directory could not be listed, or a file in it could not be read.
#[error("failed to read user patterns at {path}: {source}")]
Read {
/// Directory or file whose read failed.
path: PathBuf,
/// Underlying I/O error.
#[source]
source: std::io::Error,
},
/// A `.toml` file is not valid TOML or does not match the expected schema
/// (for example, it contains an unknown field).
#[error("failed to parse user patterns file at {path}: {source}")]
Parse {
/// File that failed to parse.
path: PathBuf,
/// Underlying TOML deserialization error.
#[source]
source: toml::de::Error,
},
/// An entry's `format_regex` could not be compiled.
#[error("invalid regex for user pattern '{id}' in {path}: {source}")]
BadRegex {
/// File that declares the entry.
path: PathBuf,
/// Id of the offending entry.
id: String,
/// Underlying regex compilation error.
#[source]
source: regex::Error,
},
/// An entry's id does not match `[a-z][a-z0-9-]*`.
#[error("user pattern id '{id}' in {path} must be lowercase kebab-case (`[a-z][a-z0-9-]*`)")]
BadId {
/// File that declares the entry.
path: PathBuf,
/// The rejected id, verbatim.
id: String,
},
/// The same id is declared by two user entries, in the same file or in
/// different files.
#[error("user pattern id '{id}' is declared twice (first in {first}, then in {second})")]
DuplicateUserId {
/// The id that was declared twice.
id: String,
/// File containing the first declaration.
first: PathBuf,
/// File containing the second declaration (equal to `first` for an
/// in-file duplicate).
second: PathBuf,
},
}
/// A non-fatal observation recorded while loading user patterns.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LoadWarning {
/// What kind of situation was observed.
pub kind: LoadWarningKind,
/// What the warning is about: the pattern id for `ShadowsBuiltin`, the
/// displayed path of the skipped entry for `SkippedNonToml`.
pub subject: String,
}
/// Categories of [`LoadWarning`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LoadWarningKind {
/// A user pattern uses the id of a built-in pattern and therefore replaces it.
ShadowsBuiltin,
/// A directory entry was ignored because its extension is not `toml`.
SkippedNonToml,
}
/// The set of patterns visible to callers: user patterns plus every built-in
/// pattern that no user pattern shadows.
///
/// The default value holds no user patterns, no shadowed ids and no warnings.
#[derive(Debug, Default)]
pub struct Catalogue {
// User patterns in load order (files in sorted path order, entries in file order).
user_patterns: Vec<UserPattern>,
// Ids of built-in patterns replaced by a user pattern; `iter` skips these built-ins.
shadowed: HashSet<String>,
// Non-fatal observations collected by `load`.
warnings: Vec<LoadWarning>,
}
impl Catalogue {
pub fn builtins_only() -> Self {
Self::default()
}
pub fn load(dir: &Path) -> Result<Self, LoadError> {
let mut cat = Self::default();
if !dir.exists() {
return Ok(cat);
}
let read = fs::read_dir(dir).map_err(|e| LoadError::Read {
path: dir.to_path_buf(),
source: e,
})?;
let mut paths: Vec<PathBuf> = read.filter_map(|res| res.ok().map(|e| e.path())).collect();
paths.sort();
let mut id_origin: std::collections::HashMap<String, PathBuf> =
std::collections::HashMap::new();
for path in paths {
if path.extension().and_then(|s| s.to_str()) != Some("toml") {
cat.warnings.push(LoadWarning {
kind: LoadWarningKind::SkippedNonToml,
subject: path.display().to_string(),
});
continue;
}
let body = fs::read_to_string(&path).map_err(|e| LoadError::Read {
path: path.clone(),
source: e,
})?;
let parsed: UserPatternFile = toml::from_str(&body).map_err(|e| LoadError::Parse {
path: path.clone(),
source: e,
})?;
for entry in parsed.patterns {
if !is_kebab_id(&entry.id) {
return Err(LoadError::BadId {
path: path.clone(),
id: entry.id,
});
}
if let Some(prev) = id_origin.get(&entry.id) {
return Err(LoadError::DuplicateUserId {
id: entry.id,
first: prev.clone(),
second: path.clone(),
});
}
let regex = RegexBuilder::new(&entry.format_regex)
.size_limit(64 * 1024)
.build()
.map_err(|e| LoadError::BadRegex {
path: path.clone(),
id: entry.id.clone(),
source: e,
})?;
let metadata = if entry.provider_id.is_some()
|| entry.retrieval_url_template.is_some()
|| entry.default_expiry_days.is_some()
|| !entry.scopes_hint.is_empty()
{
Some(PatternMetadata {
provider_id: Cow::Owned(entry.provider_id.unwrap_or_default()),
retrieval_url_template: Cow::Owned(
entry.retrieval_url_template.unwrap_or_default(),
),
default_expiry_days: entry.default_expiry_days,
scopes_hint: entry.scopes_hint.into_iter().map(Cow::Owned).collect(),
})
} else {
None
};
if builtin::find(&entry.id).is_some() {
cat.shadowed.insert(entry.id.clone());
cat.warnings.push(LoadWarning {
kind: LoadWarningKind::ShadowsBuiltin,
subject: entry.id.clone(),
});
warn!(
id = %entry.id,
path = %path.display(),
"user pattern shadows built-in entry"
);
}
id_origin.insert(entry.id.clone(), path.clone());
cat.user_patterns.push(UserPattern {
id: entry.id,
display_name: entry.display_name,
severity: entry.severity,
regex,
metadata,
});
}
}
Ok(cat)
}
pub fn iter(&self) -> Vec<&dyn SecretPattern> {
let mut out: Vec<&dyn SecretPattern> = self
.user_patterns
.iter()
.map(|p| p as &dyn SecretPattern)
.collect();
for b in builtin::builtins() {
if !self.shadowed.contains(b.id()) {
out.push(b);
}
}
out
}
pub fn find(&self, id: &str) -> Option<&dyn SecretPattern> {
if let Some(p) = self.user_patterns.iter().find(|p| p.id() == id) {
return Some(p as &dyn SecretPattern);
}
builtin::find(id)
}
pub fn warnings(&self) -> &[LoadWarning] {
&self.warnings
}
pub fn has_user_patterns(&self) -> bool {
!self.user_patterns.is_empty()
}
}
/// Reports whether `id` matches `[a-z][a-z0-9-]*`.
///
/// The first byte must be an ASCII lowercase letter; every following byte must
/// be an ASCII lowercase letter, an ASCII digit, or `-`. The empty string is
/// rejected. Trailing or repeated dashes are accepted, and any non-ASCII byte
/// fails the check.
fn is_kebab_id(id: &str) -> bool {
    match id.as_bytes().split_first() {
        None => false,
        Some((head, tail)) => {
            head.is_ascii_lowercase()
                && tail
                    .iter()
                    .all(|c| matches!(c, b'a'..=b'z' | b'0'..=b'9' | b'-'))
        }
    }
}
// Unit tests for `Catalogue::load`, `Catalogue::iter`, `Catalogue::find` and
// `is_kebab_id`. Every loading test writes TOML fixtures into a fresh temporary
// directory and loads that directory.
#[cfg(test)]
mod tests {
use super::*;
// Writes `body` to `dir/name` and returns the full path; panics if the write fails.
fn write_toml(dir: &Path, name: &str, body: &str) -> PathBuf {
let p = dir.join(name);
std::fs::write(&p, body).expect("write fixture");
p
}
// A directory that does not exist loads as an empty catalogue, not an error.
#[test]
fn missing_directory_yields_empty_catalogue() {
let dir = tempfile::tempdir().unwrap();
let cat = Catalogue::load(&dir.path().join("nonexistent")).unwrap();
assert!(!cat.has_user_patterns());
assert!(cat.warnings.is_empty());
}
// An existing but empty directory also loads as an empty catalogue.
#[test]
fn empty_directory_yields_empty_catalogue() {
let dir = tempfile::tempdir().unwrap();
let cat = Catalogue::load(dir.path()).unwrap();
assert!(!cat.has_user_patterns());
assert!(cat.warnings.is_empty());
}
// A fully specified entry round-trips: every field, including all metadata,
// is reachable through the `SecretPattern` trait object returned by `find`.
#[test]
fn loads_single_user_pattern() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"internal.toml",
r#"
[[pattern]]
id = "internal-mfa-token"
display_name = "Internal MFA Token"
format_regex = "^mfa_[A-Z0-9]{40}$"
severity = "high"
provider_id = "internal"
retrieval_url_template = "https://mfa.example.internal/tokens"
default_expiry_days = 180
scopes_hint = ["read", "write"]
"#,
);
let cat = Catalogue::load(dir.path()).unwrap();
assert!(cat.has_user_patterns());
let p = cat.find("internal-mfa-token").expect("found");
assert_eq!(p.id(), "internal-mfa-token");
assert_eq!(p.display_name(), "Internal MFA Token");
assert_eq!(p.severity(), Severity::High);
assert!(
p.format_regex()
.is_match("mfa_ABCDEFGHIJ0123456789ABCDEFGHIJ0123456789")
);
let m = p.metadata().expect("metadata");
assert_eq!(m.provider_id.as_ref(), "internal");
assert_eq!(
m.retrieval_url_template.as_ref(),
"https://mfa.example.internal/tokens"
);
assert_eq!(m.default_expiry_days, Some(180));
assert_eq!(m.scopes_hint.len(), 2);
}
// An entry that sets none of the optional fields gets no metadata at all.
#[test]
fn user_pattern_without_metadata_fields_has_no_metadata() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"minimal.toml",
r#"
[[pattern]]
id = "minimal-x"
display_name = "Minimal X"
format_regex = "^x_[a-z]{8}$"
severity = "low"
"#,
);
let cat = Catalogue::load(dir.path()).unwrap();
let p = cat.find("minimal-x").unwrap();
assert!(p.metadata().is_none());
}
// Files are processed in sorted path order regardless of creation order:
// `b.toml` is written first, yet the pattern from `a.toml` comes first.
#[test]
fn loads_multiple_files_in_sorted_order() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"b.toml",
r#"
[[pattern]]
id = "second"
display_name = "Second"
format_regex = "^.+$"
severity = "low"
"#,
);
write_toml(
dir.path(),
"a.toml",
r#"
[[pattern]]
id = "first"
display_name = "First"
format_regex = "^.+$"
severity = "low"
"#,
);
let cat = Catalogue::load(dir.path()).unwrap();
assert_eq!(cat.user_patterns[0].id(), "first");
assert_eq!(cat.user_patterns[1].id(), "second");
}
// A non-`.toml` file is skipped with a warning and does not prevent the
// `.toml` file beside it from loading.
#[test]
fn skips_non_toml_files_with_warning() {
let dir = tempfile::tempdir().unwrap();
std::fs::write(dir.path().join("readme.md"), "not toml").unwrap();
write_toml(
dir.path(),
"real.toml",
r#"
[[pattern]]
id = "real-x"
display_name = "Real"
format_regex = "^.+$"
severity = "low"
"#,
);
let cat = Catalogue::load(dir.path()).unwrap();
assert_eq!(cat.user_patterns.len(), 1);
assert!(
cat.warnings
.iter()
.any(|w| matches!(w.kind, LoadWarningKind::SkippedNonToml))
);
}
// A user pattern reusing the built-in id `github-pat` wins in `find`, appears
// exactly once in `iter`, and produces a `ShadowsBuiltin` warning.
#[test]
fn user_pattern_shadows_builtin_with_warning() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"shadow.toml",
r#"
[[pattern]]
id = "github-pat"
display_name = "Custom GitHub PAT"
format_regex = "^my-custom-gh-.+$"
severity = "high"
"#,
);
let cat = Catalogue::load(dir.path()).unwrap();
let p = cat.find("github-pat").expect("found via user");
assert_eq!(p.display_name(), "Custom GitHub PAT");
assert!(p.format_regex().is_match("my-custom-gh-anything"));
assert!(!p.format_regex().is_match("ghp_someValidLookingTokenString"));
let visible_ids: Vec<&str> = cat.iter().iter().map(|p| p.id()).collect();
let count = visible_ids.iter().filter(|id| **id == "github-pat").count();
assert_eq!(count, 1);
assert!(cat.warnings.iter().any(
|w| matches!(w.kind, LoadWarningKind::ShadowsBuiltin) && w.subject == "github-pat"
));
}
// A user pattern with a fresh id is added alongside the built-ins. The
// expected 32 is presumably the 31 built-ins asserted in
// `builtins_only_returns_thirty_one_patterns` plus this one user pattern.
#[test]
fn non_shadowing_user_patterns_coexist_with_builtins() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"x.toml",
r#"
[[pattern]]
id = "internal-x"
display_name = "Internal X"
format_regex = "^x_.+$"
severity = "low"
"#,
);
let cat = Catalogue::load(dir.path()).unwrap();
let visible: Vec<&dyn SecretPattern> = cat.iter();
assert_eq!(visible.len(), 32);
assert!(
!cat.warnings
.iter()
.any(|w| matches!(w.kind, LoadWarningKind::ShadowsBuiltin))
);
}
// A regex that does not compile aborts the load with `BadRegex` naming the entry.
#[test]
fn rejects_bad_regex() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"bad.toml",
r#"
[[pattern]]
id = "bad-regex"
display_name = "Bad Regex"
format_regex = "^[unclosed$"
severity = "low"
"#,
);
let err = Catalogue::load(dir.path()).unwrap_err();
match err {
LoadError::BadRegex { id, .. } => assert_eq!(id, "bad-regex"),
other => panic!("expected BadRegex, got {other:?}"),
}
}
// An id containing uppercase letters is rejected with `BadId`.
#[test]
fn rejects_bad_id_uppercase() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"bad.toml",
r#"
[[pattern]]
id = "BadId"
display_name = "x"
format_regex = "^.+$"
severity = "low"
"#,
);
let err = Catalogue::load(dir.path()).unwrap_err();
assert!(matches!(err, LoadError::BadId { .. }));
}
// An id whose first character is a digit is rejected with `BadId`.
#[test]
fn rejects_bad_id_starts_with_digit() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"bad.toml",
r#"
[[pattern]]
id = "9bad"
display_name = "x"
format_regex = "^.+$"
severity = "low"
"#,
);
assert!(matches!(
Catalogue::load(dir.path()).unwrap_err(),
LoadError::BadId { .. }
));
}
// The same id declared in two different files is a `DuplicateUserId` error.
#[test]
fn rejects_duplicate_user_id_across_files() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"a.toml",
r#"
[[pattern]]
id = "dup"
display_name = "First"
format_regex = "^a$"
severity = "low"
"#,
);
write_toml(
dir.path(),
"b.toml",
r#"
[[pattern]]
id = "dup"
display_name = "Second"
format_regex = "^b$"
severity = "low"
"#,
);
let err = Catalogue::load(dir.path()).unwrap_err();
match err {
LoadError::DuplicateUserId { id, .. } => assert_eq!(id, "dup"),
other => panic!("expected DuplicateUserId, got {other:?}"),
}
}
// The same id declared twice inside one file is also a `DuplicateUserId` error.
#[test]
fn rejects_duplicate_user_id_within_file() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"a.toml",
r#"
[[pattern]]
id = "dup"
display_name = "First"
format_regex = "^a$"
severity = "low"
[[pattern]]
id = "dup"
display_name = "Second"
format_regex = "^b$"
severity = "low"
"#,
);
let err = Catalogue::load(dir.path()).unwrap_err();
assert!(matches!(err, LoadError::DuplicateUserId { .. }));
}
// A key that `UserPatternEntry` does not define fails deserialization (`Parse`).
#[test]
fn rejects_unknown_field() {
let dir = tempfile::tempdir().unwrap();
write_toml(
dir.path(),
"bad.toml",
r#"
[[pattern]]
id = "x"
display_name = "x"
format_regex = "^.+$"
severity = "low"
unknown_field = "wrong"
"#,
);
assert!(matches!(
Catalogue::load(dir.path()).unwrap_err(),
LoadError::Parse { .. }
));
}
// Pins the number of built-in patterns visible through an empty catalogue at 31.
#[test]
fn builtins_only_returns_thirty_one_patterns() {
let cat = Catalogue::builtins_only();
let visible = cat.iter();
assert_eq!(visible.len(), 31);
assert!(cat.warnings.is_empty());
}
// With no user patterns, `find` falls through to the built-in lookup.
#[test]
fn builtins_only_find_works() {
let cat = Catalogue::builtins_only();
let p = cat.find("github-pat").unwrap();
assert_eq!(p.id(), "github-pat");
assert!(cat.find("no-such-id").is_none());
}
// Representative ids that satisfy `[a-z][a-z0-9-]*`.
#[test]
fn is_kebab_id_accepts_valid_ids() {
assert!(is_kebab_id("github-pat"));
assert!(is_kebab_id("a"));
assert!(is_kebab_id("a1"));
assert!(is_kebab_id("a-b-c"));
}
// Representative ids that violate `[a-z][a-z0-9-]*`: empty, uppercase,
// leading digit, underscore, dot, leading dash.
#[test]
fn is_kebab_id_rejects_invalid_ids() {
assert!(!is_kebab_id(""));
assert!(!is_kebab_id("Github"));
assert!(!is_kebab_id("9start"));
assert!(!is_kebab_id("with_underscore"));
assert!(!is_kebab_id("with.dot"));
assert!(!is_kebab_id("-leading-dash"));
}
}