use crate::category::Category;
use glob::Pattern;
use regex::Regex;
use serde::{Deserialize, Serialize};
/// Default threshold value for field-name signals.
///
/// NOTE(review): presumably the fallback for `FieldNameSignal::threshold`
/// when a caller does not configure one. Nothing in the visible code reads
/// this constant, so the scale and meaning of `3.5` should be confirmed
/// against the consumer.
pub const DEFAULT_FIELD_SIGNAL_THRESHOLD: f64 = 3.5;
/// A signal keyed on a field *name*: a regex over keys bundled with a
/// category, a label and a threshold.
///
/// Build instances with `FieldNameSignal::new`, which compiles
/// `key_pattern` into `key_regex`.
#[derive(Debug, Clone)]
pub struct FieldNameSignal {
/// Regex source text exactly as supplied to `FieldNameSignal::new`.
pub key_pattern: String,
/// Compiled form of `key_pattern` (`new` builds it case-insensitively);
/// this is what `matches_key` evaluates.
pub(crate) key_regex: Regex,
/// Category associated with this signal.
pub category: Category,
/// Label for this signal; `new` falls back to `field-signal:<key_pattern>`
/// when no label is supplied.
pub label: String,
/// Threshold stored verbatim from `new`.
/// NOTE(review): how this value is compared is not visible in this file —
/// confirm the semantics with the code that consumes signals.
pub threshold: f64,
}
impl FieldNameSignal {
    /// Builds a signal from a regex source string.
    ///
    /// `key_pattern` is compiled as a case-insensitive regular expression.
    /// When `label` is `None` the label defaults to
    /// `field-signal:<key_pattern>`. `category` and `threshold` are stored
    /// unchanged.
    ///
    /// # Errors
    ///
    /// Returns a message that quotes the offending pattern when it is not a
    /// valid regular expression.
    pub fn new(
        key_pattern: impl Into<String>,
        category: Category,
        label: Option<String>,
        threshold: f64,
    ) -> Result<Self, String> {
        let source: String = key_pattern.into();

        // Configure the builder step by step, then compile.
        let mut builder = regex::RegexBuilder::new(&source);
        builder.case_insensitive(true);
        let compiled = match builder.build() {
            Ok(re) => re,
            Err(e) => return Err(format!("field-name signal pattern {:?}: {e}", source)),
        };

        // An explicit label wins; otherwise derive one from the pattern text.
        let label = match label {
            Some(explicit) => explicit,
            None => format!("field-signal:{}", source),
        };

        Ok(Self {
            key_pattern: source,
            key_regex: compiled,
            category,
            label,
            threshold,
        })
    }

    /// Reports whether the compiled key regex finds a match in `key`.
    #[inline]
    #[must_use]
    pub fn matches_key(&self, key: &str) -> bool {
        self.key_regex.is_match(key)
    }
}
/// Serializable rule for a field selected by `pattern`, with optional
/// per-field settings and nested rules.
///
/// Every `Option` setting deserializes to `None` when absent and is omitted
/// from serialized output when `None`; `sub_fields` likewise defaults to
/// empty and is omitted when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldRule {
/// Pattern selecting the field(s) this rule applies to (required).
/// NOTE(review): the pattern syntax is defined by whatever consumes the
/// rule; the tests in this file use forms such as `*.email` — confirm.
pub pattern: String,
/// Optional category attached to the rule.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub category: Option<Category>,
/// Optional label attached to the rule.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub label: Option<String>,
/// Optional minimum length.
/// NOTE(review): presumably values shorter than this are left alone —
/// confirm against the consumer.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub min_length: Option<usize>,
/// Optional sub-processor name.
/// NOTE(review): presumably names a processor applied to the field's
/// value — confirm against the consumer.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub sub_processor: Option<String>,
/// Nested rules (the type is recursive).
/// NOTE(review): presumably applied to the sub-processor's output — confirm.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sub_fields: Vec<FieldRule>,
}
impl FieldRule {
    /// Creates a rule for `pattern` with every optional setting unset and no
    /// nested rules.
    #[must_use]
    pub fn new(pattern: impl Into<String>) -> Self {
        let pattern = pattern.into();
        Self {
            pattern,
            category: None,
            label: None,
            min_length: None,
            sub_processor: None,
            sub_fields: Vec::new(),
        }
    }

    /// Returns the rule with `min_length` set to `min`.
    #[must_use]
    pub fn with_min_length(self, min: usize) -> Self {
        Self {
            min_length: Some(min),
            ..self
        }
    }

    /// Returns the rule with `category` set.
    #[must_use]
    pub fn with_category(self, category: Category) -> Self {
        Self {
            category: Some(category),
            ..self
        }
    }

    /// Returns the rule with `label` set.
    #[must_use]
    pub fn with_label(self, label: impl Into<String>) -> Self {
        Self {
            label: Some(label.into()),
            ..self
        }
    }

    /// Returns the rule with `sub_processor` set to `name`.
    #[must_use]
    pub fn with_sub_processor(self, name: impl Into<String>) -> Self {
        Self {
            sub_processor: Some(name.into()),
            ..self
        }
    }

    /// Returns the rule with its nested rules replaced by `fields`.
    #[must_use]
    pub fn with_sub_fields(self, fields: Vec<FieldRule>) -> Self {
        Self {
            sub_fields: fields,
            ..self
        }
    }
}
/// Serializable profile naming a processor together with filename-matching
/// criteria, field rules and free-form options.
///
/// File selection is performed by `FileTypeProfile::matches_filename` using
/// `extensions`, `include` and `exclude`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileTypeProfile {
/// Name of the processor for this profile (required; the tests in this
/// file use `"json"`).
pub processor: String,
/// Filename suffixes accepted by `matches_filename`, compared with
/// `str::ends_with` (the tests use `.json`). Defaults to empty when absent.
#[serde(default)]
pub extensions: Vec<String>,
/// Glob patterns; when non-empty, at least one must match for
/// `matches_filename` to accept a file. Defaults to empty when absent.
#[serde(default)]
pub include: Vec<String>,
/// Glob patterns; any match makes `matches_filename` reject the file.
/// Defaults to empty when absent.
#[serde(default)]
pub exclude: Vec<String>,
/// Field rules for this profile (required: there is no serde default).
pub fields: Vec<FieldRule>,
/// String key/value options. Defaults to empty when absent.
/// NOTE(review): the recognised keys are defined by the consumer — confirm.
#[serde(default)]
pub options: std::collections::HashMap<String, String>,
/// Field-name signals. Skipped by serde: never serialized, and always
/// empty after deserialization, so it has to be populated in code.
#[serde(skip)]
pub field_name_signals: Vec<FieldNameSignal>,
}
impl FileTypeProfile {
    /// Creates a profile for `processor` with the given field rules and no
    /// extensions, include/exclude patterns, options or field-name signals.
    ///
    /// A profile without extensions matches no filename; add at least one
    /// with [`FileTypeProfile::with_extension`].
    #[must_use]
    pub fn new(processor: impl Into<String>, fields: Vec<FieldRule>) -> Self {
        Self {
            processor: processor.into(),
            fields,
            extensions: Vec::new(),
            include: Vec::new(),
            exclude: Vec::new(),
            options: std::collections::HashMap::new(),
            field_name_signals: Vec::new(),
        }
    }

    /// Appends `ext` to the accepted filename suffixes.
    #[must_use]
    pub fn with_extension(mut self, ext: impl Into<String>) -> Self {
        self.extensions.push(ext.into());
        self
    }

    /// Sets option `key` to `value`, replacing any previous value for `key`.
    #[must_use]
    pub fn with_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        self.options.insert(key.into(), value.into());
        self
    }

    /// Appends the glob `pat` to the include list.
    #[must_use]
    pub fn with_include(mut self, pat: impl Into<String>) -> Self {
        self.include.push(pat.into());
        self
    }

    /// Appends the glob `pat` to the exclude list.
    #[must_use]
    pub fn with_exclude(mut self, pat: impl Into<String>) -> Self {
        self.exclude.push(pat.into());
        self
    }

    /// Decides whether this profile applies to `filename`.
    ///
    /// All of the following must hold:
    ///
    /// 1. `filename` ends with one of `extensions` (a plain, case-sensitive
    ///    suffix comparison). A profile with no extensions matches nothing.
    /// 2. If `include` is non-empty, at least one include glob matches.
    /// 3. No `exclude` glob matches.
    ///
    /// Each glob is tried against both the full `filename` and its final
    /// path component. A pattern that fails to parse as a glob never
    /// matches: an invalid include pattern cannot admit a file and an
    /// invalid exclude pattern cannot reject one.
    #[must_use]
    pub fn matches_filename(&self, filename: &str) -> bool {
        // `any` over an empty list is `false`, so this single test also
        // covers the "no extensions configured" case.
        let has_accepted_extension = self
            .extensions
            .iter()
            .any(|ext| filename.ends_with(ext.as_str()));
        if !has_accepted_extension {
            return false;
        }

        // Fall back to the whole string when there is no final component
        // (or it is not valid UTF-8 after the `Path` round-trip).
        let basename: &str = std::path::Path::new(filename)
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or(filename);

        let glob_matches =
            |pat: &str| Pattern::new(pat).is_ok_and(|p| p.matches(filename) || p.matches(basename));

        // An empty include list places no restriction on the filename.
        let included =
            self.include.is_empty() || self.include.iter().any(|pat| glob_matches(pat));

        included && !self.exclude.iter().any(|pat| glob_matches(pat))
    }
}
impl Serialize for Category {
    /// Serializes the category as a single string: the text produced by
    /// calling `to_string()` on it.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let rendered = self.to_string();
        serializer.serialize_str(rendered.as_str())
    }
}
impl<'de> Deserialize<'de> for Category {
    /// Deserializes a category from its string form.
    ///
    /// The names listed below map to the built-in variants. Any other
    /// string becomes `Category::Custom`, with a leading `custom:` prefix
    /// removed when present, so string input is never rejected here.
    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let raw = String::deserialize(deserializer)?;

        let category = match raw.as_str() {
            "email" => Self::Email,
            "name" => Self::Name,
            "phone" => Self::Phone,
            "ipv4" => Self::IpV4,
            "ipv6" => Self::IpV6,
            "credit_card" => Self::CreditCard,
            "ssn" => Self::Ssn,
            "hostname" => Self::Hostname,
            "mac_address" => Self::MacAddress,
            "container_id" => Self::ContainerId,
            "uuid" => Self::Uuid,
            "jwt" => Self::Jwt,
            "auth_token" => Self::AuthToken,
            "file_path" => Self::FilePath,
            "windows_sid" => Self::WindowsSid,
            "url" => Self::Url,
            "aws_arn" => Self::AwsArn,
            "azure_resource_id" => Self::AzureResourceId,
            // Everything else is a custom tag; the `custom:` prefix is optional.
            unrecognised => Self::Custom(
                unrecognised
                    .strip_prefix("custom:")
                    .unwrap_or(unrecognised)
                    .into(),
            ),
        };

        Ok(category)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Profile for the `json` processor that accepts only `.json` files;
    /// individual tests layer include/exclude globs on top.
    fn json_profile() -> FileTypeProfile {
        FileTypeProfile::new("json", vec![]).with_extension(".json")
    }

    #[test]
    fn field_rule_with_min_length() {
        let built = FieldRule::new("*.password").with_min_length(8);
        assert_eq!(built.min_length, Some(8));
    }

    #[test]
    fn field_rule_with_category() {
        let built = FieldRule::new("*.email").with_category(Category::Email);
        assert_eq!(built.category, Some(Category::Email));
    }

    #[test]
    fn field_rule_with_label() {
        let built = FieldRule::new("*.token").with_label("my-token");
        assert_eq!(built.label.as_deref(), Some("my-token"));
    }

    #[test]
    fn profile_with_include_narrows_match() {
        let narrowed = json_profile().with_include("config*.json");
        assert!(narrowed.matches_filename("config.json"));
        assert!(narrowed.matches_filename("config-prod.json"));
        assert!(!narrowed.matches_filename("events.json"));
    }

    #[test]
    fn profile_with_exclude_blocks_match() {
        let filtered = json_profile().with_exclude("*.log.json");
        assert!(filtered.matches_filename("config.json"));
        assert!(!filtered.matches_filename("server.log.json"));
    }

    #[test]
    fn profile_include_and_exclude_combined() {
        let combined = json_profile()
            .with_include("config*.json")
            .with_exclude("config-secret.json");
        assert!(combined.matches_filename("config-prod.json"));
        assert!(!combined.matches_filename("config-secret.json"));
        assert!(!combined.matches_filename("events.json"));
    }

    #[test]
    fn profile_no_extensions_matches_nothing() {
        // Deliberately built without `with_extension`.
        let bare = FileTypeProfile::new("json", vec![]);
        assert!(!bare.matches_filename("anything.json"));
    }

    #[test]
    fn category_serialize_deserialize_roundtrip() {
        let cases = [
            ("email", Category::Email),
            ("ipv4", Category::IpV4),
            ("custom:my_key", Category::Custom("my_key".into())),
        ];
        for (s, expected) in &cases {
            // Wrap the bare name in quotes to form a JSON string literal.
            let json = format!("\"{}\"", s);

            let parsed: Category = serde_json::from_str(&json).unwrap();
            assert_eq!(parsed, *expected, "deserializing {s}");

            let encoded = serde_json::to_string(&parsed).unwrap();
            assert_eq!(encoded, json, "serializing {s}");
        }
    }
}