pub mod archive;
pub mod csv_proc;
pub mod env_proc;
pub mod ini_proc;
pub mod json_proc;
pub mod jsonl_proc;
pub mod key_value;
pub(crate) mod limits;
pub mod log_line;
pub mod profile;
pub mod registry;
pub mod toml_proc;
pub mod xml_proc;
pub mod yaml_proc;
pub use profile::{FieldNameSignal, FieldRule, FileTypeProfile, DEFAULT_FIELD_SIGNAL_THRESHOLD};
pub use registry::ProcessorRegistry;
use crate::category::Category;
use crate::error::{Result, SanitizeError};
use crate::store::MappingStore;
use std::io;
pub trait Processor: Send + Sync {
fn name(&self) -> &'static str;
fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;
fn process(
&self,
content: &[u8],
profile: &FileTypeProfile,
store: &MappingStore,
) -> Result<Vec<u8>>;
fn supports_streaming(&self) -> bool {
false
}
fn process_stream(
&self,
reader: &mut dyn io::Read,
writer: &mut dyn io::Write,
profile: &FileTypeProfile,
store: &MappingStore,
) -> Result<()> {
let mut buf = Vec::new();
io::Read::read_to_end(reader, &mut buf)?;
let out = self.process(&buf, profile, store)?;
io::Write::write_all(writer, &out)?;
Ok(())
}
}
/// Replaces `value` with its sanitized stand-in according to `rule`.
///
/// When `rule.min_length` is set and `value` is shorter than it (measured in
/// bytes, as `str::len` does), the value is returned unchanged. Otherwise the
/// value is mapped through `store` under the rule's category, falling back to
/// `Category::Custom("field")` when the rule does not name one.
///
/// # Errors
///
/// Propagates any error returned by `store.get_or_insert`.
pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
    if let Some(min) = rule.min_length {
        if value.len() < min {
            return Ok(value.to_string());
        }
    }

    // Borrow the rule's category instead of cloning it, and only build the
    // fallback category when the rule really has none.
    let fallback;
    let category = match rule.category.as_ref() {
        Some(category) => category,
        None => {
            fallback = Category::Custom("field".into());
            &fallback
        }
    };

    let sanitized = store.get_or_insert(category, value)?;
    Ok(sanitized.to_string())
}
/// Builds the dotted path of `key` beneath `prefix`.
///
/// An empty `prefix` denotes the root, in which case the path is just `key`;
/// otherwise the result is `prefix`, a `.`, and `key`.
#[must_use]
pub(crate) fn build_path(prefix: &str, key: &str) -> String {
    match prefix {
        "" => key.to_owned(),
        parent => [parent, key].join("."),
    }
}
/// Tests whether a field-rule `pattern` selects the dotted `key_path`.
///
/// Evaluation order:
///
/// 1. `*` selects every path, and a pattern equal to the path selects it.
/// 2. A pattern without any `*` selects nothing else.
/// 3. `*.suffix` (no further `*`) selects `suffix` itself and every path
///    ending in `.suffix`.
/// 4. `prefix.*` (no further `*`) selects every path starting with `prefix.`.
/// 5. Anything not decided above is delegated to `glob_matches`.
#[must_use]
pub(crate) fn pattern_matches(pattern: &str, key_path: &str) -> bool {
    if pattern == "*" || pattern == key_path {
        return true;
    }
    if !pattern.contains('*') {
        return false;
    }

    // `*.suffix`: the suffix must be a whole trailing path segment sequence.
    let suffix_hit = pattern
        .strip_prefix("*.")
        .filter(|tail| !tail.contains('*'))
        .is_some_and(|tail| {
            key_path == tail
                || key_path
                    .strip_suffix(tail)
                    .is_some_and(|head| head.ends_with('.'))
        });
    if suffix_hit {
        return true;
    }

    // `prefix.*`: the prefix must be followed by a path separator.
    let prefix_hit = pattern
        .strip_suffix(".*")
        .filter(|head| !head.contains('*'))
        .is_some_and(|head| {
            key_path
                .strip_prefix(head)
                .is_some_and(|tail| tail.starts_with('.'))
        });

    prefix_hit || glob_matches(pattern, key_path)
}
use crate::allowlist::glob_matches;
/// Computes the Shannon entropy of `data` in bits per byte.
///
/// The result is `0.0` for empty input and for input made of a single
/// repeated byte value, and reaches `8.0` when all 256 byte values occur
/// equally often.
#[inline]
#[allow(clippy::cast_precision_loss)]
pub(crate) fn shannon_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    // `usize` buckets cannot overflow: no bucket can ever exceed `data.len()`.
    // (Narrower counters would wrap or panic on inputs of 2^32 bytes or more.)
    let mut counts = [0usize; 256];
    for &byte in data {
        counts[usize::from(byte)] += 1;
    }

    let len = data.len() as f64;
    counts
        .iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = count as f64 / len;
            -p * p.log2()
        })
        .sum()
}
/// Returns the first signal in `signals` whose `matches_key` accepts `key`.
///
/// Signals are tried in slice order; `None` is returned when none of them
/// matches (including when `signals` is empty).
#[must_use]
pub(crate) fn find_field_signal<'a>(
    key: &str,
    signals: &'a [FieldNameSignal],
) -> Option<&'a FieldNameSignal> {
    for candidate in signals {
        if candidate.matches_key(key) {
            return Some(candidate);
        }
    }
    None
}
/// Replaces `value` via `store` when it passes the entropy gate of `sig`.
///
/// Returns `Ok(None)`, leaving the value to the caller untouched, when
/// `value` is empty or when the Shannon entropy of its bytes is below
/// `sig.threshold`. Otherwise the value is mapped through `store` under
/// `sig.category` and the replacement is returned as `Ok(Some(_))`.
///
/// # Errors
///
/// Propagates any error returned by `store.get_or_insert`.
pub(crate) fn replace_by_signal(
    value: &str,
    sig: &FieldNameSignal,
    store: &MappingStore,
) -> Result<Option<String>> {
    // The emptiness check comes first so entropy is never computed for "".
    let below_gate = value.is_empty() || shannon_entropy(value.as_bytes()) < sig.threshold;
    if below_gate {
        return Ok(None);
    }

    let replacement = store.get_or_insert(&sig.category, value)?;
    Ok(Some(replacement.to_string()))
}
/// Returns the first rule in `profile.fields` whose pattern selects `key_path`.
///
/// Rules are tried in their stored order using [`pattern_matches`]; `None`
/// means no rule applies to this path.
#[must_use]
pub(crate) fn find_matching_rule<'a>(
    key_path: &str,
    profile: &'a FileTypeProfile,
) -> Option<&'a FieldRule> {
    profile
        .fields
        .iter()
        .find_map(|candidate| pattern_matches(&candidate.pattern, key_path).then_some(candidate))
}
/// Abstraction over a mutable node of a parsed document tree, used by
/// `walk_tree` to traverse and rewrite values without knowing the concrete
/// format.
///
/// `walk_tree` calls both `for_each_map_entry` and `for_each_seq_item` on
/// every node it descends into, whatever the node's kind.
// NOTE(review): implementations are not visible here; presumably each
// traversal method is a no-op returning `Ok(())` for nodes of another kind.
// Confirm against the implementors.
pub(crate) trait TreeNode {
/// Calls `f` with a key and a mutable reference to the associated child
/// node. `walk_tree` uses the key to extend the dotted path.
///
/// An `Err` is expected to come from `f`; `walk_tree` propagates it with `?`.
fn for_each_map_entry<F>(&mut self, f: F) -> Result<()>
where
F: FnMut(&str, &mut Self) -> Result<()>;
/// Calls `f` with a mutable reference to a child item. `walk_tree` recurses
/// into each item with the path prefix unchanged.
fn for_each_seq_item<F>(&mut self, f: F) -> Result<()>
where
F: FnMut(&mut Self) -> Result<()>;
/// Returns mutable access to the node's string, or `None`.
/// `walk_tree` rewrites the string in place when this returns `Some`.
fn as_str_mut(&mut self) -> Option<&mut String>;
/// Reports whether the node is a scalar. `walk_tree` only consults this
/// after `as_str_mut` has returned `None`, and recurses when it is `false`.
fn is_scalar(&self) -> bool;
/// Renders the node as a `String`; `walk_tree` feeds the result to the
/// replacement helpers for scalars that are not strings.
fn scalar_to_string(&self) -> String;
/// Stores `s` in the node; `walk_tree` uses this to write back the
/// replacement for a scalar that was rendered via `scalar_to_string`.
fn set_string(&mut self, s: String);
}
/// Walks a document tree and rewrites its sensitive values in place.
///
/// For every map entry of `value` the dotted path is built from `prefix` and
/// the entry key, then:
///
/// * a string child is replaced through the first matching field rule of
///   `profile`; failing that, through the first field-name signal matching
///   the bare key (subject to that signal's entropy gate);
/// * a non-string scalar child is handled the same way on its
///   `scalar_to_string` rendering and written back with `set_string`;
/// * any other child is walked recursively with the extended path.
///
/// Afterwards every sequence item of `value` is walked recursively with the
/// unchanged `prefix`. Each level of recursion increases `depth` by one.
///
/// # Errors
///
/// Returns `SanitizeError::RecursionDepthExceeded` (mentioning `format_name`)
/// when `depth` exceeds `limits::DEFAULT_DEPTH`, and propagates errors from
/// the replacement helpers and the node traversal callbacks.
pub(crate) fn walk_tree<V: TreeNode>(
    value: &mut V,
    prefix: &str,
    profile: &FileTypeProfile,
    store: &MappingStore,
    depth: usize,
    format_name: &str,
) -> Result<()> {
    if depth > limits::DEFAULT_DEPTH {
        return Err(SanitizeError::RecursionDepthExceeded(format!(
            "{format_name} recursion depth exceeds limit of {}",
            limits::DEFAULT_DEPTH
        )));
    }

    let signals = &profile.field_name_signals;

    value.for_each_map_entry(|key, child| {
        let path = build_path(prefix, key);

        // String leaf: rewrite the text in place.
        if let Some(text) = child.as_str_mut() {
            match find_matching_rule(&path, profile) {
                Some(rule) => *text = replace_value(text, rule, store)?,
                None => {
                    if let Some(sig) = find_field_signal(key, signals) {
                        if let Some(masked) = replace_by_signal(text, sig, store)? {
                            *text = masked;
                        }
                    }
                }
            }
            return Ok(());
        }

        // Container: descend with the extended path.
        if !child.is_scalar() {
            return walk_tree(child, &path, profile, store, depth + 1, format_name);
        }

        // Non-string scalar: operate on its rendering and store the result
        // back as a string. Explicit rules take precedence over signals.
        match find_matching_rule(&path, profile) {
            Some(rule) => {
                let rendered = child.scalar_to_string();
                let masked = replace_value(&rendered, rule, store)?;
                child.set_string(masked);
            }
            None => {
                if let Some(sig) = find_field_signal(key, signals) {
                    let rendered = child.scalar_to_string();
                    if let Some(masked) = replace_by_signal(&rendered, sig, store)? {
                        child.set_string(masked);
                    }
                }
            }
        }
        Ok(())
    })?;

    // Sequence items do not contribute a path segment of their own.
    value.for_each_seq_item(|item| walk_tree(item, prefix, profile, store, depth + 1, format_name))
}
/// Unit tests for the entropy helper and for field-name signal lookup.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;

    /// Empty input has no entropy.
    #[test]
    #[allow(clippy::float_cmp)]
    fn entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(b""), 0.0);
    }

    /// A single repeated symbol carries no information.
    #[test]
    #[allow(clippy::float_cmp)]
    fn entropy_single_byte_is_zero() {
        assert_eq!(shannon_entropy(b"aaaa"), 0.0);
    }

    /// Two equally frequent symbols yield exactly one bit per byte.
    #[test]
    fn entropy_two_equal_symbols_is_one_bit() {
        assert!((shannon_entropy(b"abababab") - 1.0).abs() < 1e-10);
    }

    /// Hex-looking secrets score above the usual 3.5 threshold.
    #[test]
    fn entropy_high_for_random_hex() {
        let h = shannon_entropy(b"a3f8c2d1e9b7f4a2c8d3e1b9f7a4c2d1");
        assert!(h > 3.5, "expected entropy > 3.5, got {h}");
    }

    /// A short dictionary word stays below 3.0.
    #[test]
    fn entropy_low_for_word() {
        let h = shannon_entropy(b"Bearer");
        assert!(h < 3.0, "expected entropy < 3.0, got {h}");
    }

    /// An anchored pattern matches only the exact key.
    #[test]
    fn signal_matches_exact_key() {
        let sig = FieldNameSignal::new("^password$", Category::AuthToken, None, 3.5).unwrap();
        assert!(sig.matches_key("password"));
        assert!(!sig.matches_key("db_password"));
        assert!(!sig.matches_key("PASSWORD_HASH"));
    }

    /// Key matching ignores ASCII case.
    #[test]
    fn signal_match_is_case_insensitive() {
        let sig = FieldNameSignal::new("^password$", Category::AuthToken, None, 3.5).unwrap();
        assert!(sig.matches_key("PASSWORD"));
        assert!(sig.matches_key("Password"));
    }

    /// Alternations match each listed key but not longer keys.
    #[test]
    fn signal_alternation_pattern() {
        let sig =
            FieldNameSignal::new(r"^(password|secret|token)$", Category::AuthToken, None, 3.5)
                .unwrap();
        assert!(sig.matches_key("password"));
        assert!(sig.matches_key("secret"));
        assert!(sig.matches_key("token"));
        assert!(!sig.matches_key("token_type"));
    }

    /// A malformed regex is rejected at construction time.
    #[test]
    fn signal_invalid_regex_returns_error() {
        let result = FieldNameSignal::new("[invalid(", Category::AuthToken, None, 3.5);
        assert!(result.is_err());
    }

    /// Without an explicit label, one is derived from the pattern.
    #[test]
    fn signal_default_label_derived_from_pattern() {
        let sig = FieldNameSignal::new("^secret$", Category::AuthToken, None, 3.5).unwrap();
        assert_eq!(sig.label, "field-signal:^secret$");
    }

    /// An explicit label is kept verbatim.
    #[test]
    fn signal_custom_label_preserved() {
        let sig = FieldNameSignal::new(
            "^secret$",
            Category::AuthToken,
            Some("my-label".into()),
            3.5,
        )
        .unwrap();
        assert_eq!(sig.label, "my-label");
    }

    /// Looking up in an empty signal list finds nothing.
    #[test]
    fn find_returns_none_for_empty_signals() {
        assert!(find_field_signal("password", &[]).is_none());
    }

    /// Lookup skips non-matching signals and, when several signals match the
    /// same key, returns the one that comes first in the list.
    #[test]
    fn find_returns_first_matching_signal() {
        let s1 = FieldNameSignal::new("^password$", Category::AuthToken, Some("s1".into()), 3.0)
            .unwrap();
        let s2 =
            FieldNameSignal::new("^token$", Category::AuthToken, Some("s2".into()), 3.5).unwrap();
        // Overlaps with `s2` on "token" so that ordering is actually exercised.
        let s3 = FieldNameSignal::new(
            r"^(token|secret)$",
            Category::AuthToken,
            Some("s3".into()),
            3.5,
        )
        .unwrap();
        let signals = vec![s1, s2, s3];

        // "token" matches both `s2` and `s3`; the earlier one wins.
        let found = find_field_signal("token", &signals).unwrap();
        assert_eq!(found.label, "s2");

        // "secret" is only matched by the last signal.
        let found = find_field_signal("secret", &signals).unwrap();
        assert_eq!(found.label, "s3");
    }

    /// A key that no signal matches yields `None`.
    #[test]
    fn find_returns_none_when_no_match() {
        let sig = FieldNameSignal::new("^password$", Category::AuthToken, None, 3.5).unwrap();
        assert!(find_field_signal("hostname", &[sig]).is_none());
    }
}