use std::collections::{BTreeSet, HashMap};
use std::path::PathBuf;
use serde::Deserialize;
use thiserror::Error;
use crate::{LocaleTag, PiiClass};
/// Prefix a rulepack's `schema_version` must start with to be accepted by the
/// loader; any other value is rejected with `RulepackError::SchemaVersion`.
const SUPPORTED_SCHEMA_MAJOR_MINOR: &str = "0.1.";
/// A fully parsed and validated rulepack.
///
/// Built from TOML text via `Rulepack::parse` / `Rulepack::load`.
#[derive(Debug, Clone, PartialEq)]
pub struct Rulepack {
/// Schema version string as written in the rulepack (checked against the supported prefix).
pub schema_version: String,
/// Identifier of the rulepack, copied verbatim from the TOML.
pub rulepack_id: String,
/// Version of the rulepack content, copied verbatim from the TOML.
pub rulepack_version: String,
/// Locales applied to every recognizer that does not list its own `locales`.
pub default_locales: Vec<LocaleTag>,
/// Optional `[locale.*]` data buckets declared by the rulepack.
pub locale: Option<LocaleData>,
/// Recognizers in declaration order.
pub recognizers: Vec<RecognizerSpec>,
}
/// One validated recognizer entry of a rulepack.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct RecognizerSpec {
/// Recognizer identifier, as written in the rulepack.
pub id: String,
/// PII class this recognizer emits.
pub class: PiiClass,
/// Ids of same-class recognizers this one explicitly cooperates with.
pub cooperates_with: Vec<String>,
/// Whether the recognizer is active; defaults to `true` when omitted in TOML.
pub enabled: bool,
/// Locales of the recognizer; the pack's `default_locales` when none were listed.
pub locales: Vec<LocaleTag>,
/// Matcher definition (the TOML `match` table).
pub matcher: RawMatch,
/// Optional context settings.
pub context: Option<ContextSpec>,
/// Optional validator reference.
pub validator: Option<ValidatorSpec>,
/// Optional normalizer reference.
pub normalizer: Option<NormalizerSpec>,
/// Scoring; base `0.70` and priority `0` when the TOML table is omitted.
pub scoring: ScoringSpec,
/// Token settings.
pub token: TokenSpec,
/// Optional provenance information.
pub source: Option<SourceSpec>,
}
/// Matcher definition of a recognizer, deserialized from the TOML `match`
/// table and discriminated by its `kind` key (`regex`, `dictionary`, `ner`,
/// `anchored_match`). Unknown keys inside the table are rejected.
#[derive(Debug, Clone, PartialEq, Deserialize)]
#[serde(tag = "kind", deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum RawMatch {
/// Regular-expression matcher. The loader requires exactly one of
/// `pattern` or `pattern_template` to be present.
Regex {
#[serde(default)]
pattern: Option<String>,
#[serde(default)]
pattern_template: Option<String>,
#[serde(default)]
capture_groups: Option<Vec<u32>>,
},
/// Dictionary matcher. No extra validation is applied by the loader in this file.
Dictionary {
#[serde(default)]
terms: Vec<String>,
#[serde(default)]
terms_file: Option<String>,
#[serde(default)]
terms_from_context: Option<String>,
#[serde(default)]
case_sensitive: bool,
},
/// NER matcher referencing a model. No extra validation is applied by the loader in this file.
Ner {
model_ref: String,
},
/// Cue-anchored matcher. The loader accepts only: a non-blank `cues_bucket`,
/// `right_window_chars` in `1..=512`, `boundary` of `punctuation` /
/// `whitespace` / `line_end`, `name_shape` of `person_name`, and
/// `cue_position` of `before` / `after`.
AnchoredMatch {
cues_bucket: String,
boundary: String,
right_window_chars: u16,
name_shape: String,
cue_position: String,
},
}
/// Boundary kinds for anchored matching, serialized as `punctuation`,
/// `whitespace` and `line_end`.
///
/// NOTE(review): not referenced by the loader code visible in this file, which
/// validates `RawMatch::AnchoredMatch::boundary` as a plain string — confirm usage elsewhere.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum AnchoredBoundary {
Punctuation,
Whitespace,
LineEnd,
}
/// Name shapes for anchored matching; the only variant serializes as `person_name`.
///
/// NOTE(review): not referenced by the loader code visible in this file — confirm usage elsewhere.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum NameShape {
PersonName,
}
/// Cue positions for anchored matching, serialized as `before` and `after`.
///
/// NOTE(review): not referenced by the loader code visible in this file — confirm usage elsewhere.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum CuePosition {
Before,
After,
}
/// Context settings of a recognizer.
///
/// The loader in this file currently rejects rulepacks that set `hotwords`,
/// `window` or `boost`; only `exclusions` can be populated.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct ContextSpec {
/// Reserved: a non-empty list is rejected at load time.
pub hotwords: Vec<String>,
/// Reserved: any value is rejected at load time.
pub window: Option<u16>,
/// Reserved: any value is rejected at load time.
pub boost: Option<f32>,
/// Exclusion strings, copied verbatim from the rulepack.
pub exclusions: Vec<String>,
}
/// Reference to a validator by kind name (copied verbatim; not checked in this file).
#[derive(Debug, Clone, PartialEq)]
pub struct ValidatorSpec {
/// Validator kind, e.g. `email_rfc` in the test fixture.
pub kind: String,
}
/// Reference to a normalizer by kind name (copied verbatim; not checked in this file).
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizerSpec {
/// Normalizer kind, e.g. `email_canonical` in the test fixture.
pub kind: String,
}
/// Scoring parameters of a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct ScoringSpec {
/// Base score; `0.70` when not given in the rulepack.
pub base: f32,
/// Priority; `0` when not given in the rulepack.
pub priority: i32,
}
/// Token settings of a recognizer.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct TokenSpec {
/// Optional token family name, copied verbatim from the rulepack.
pub family: Option<String>,
/// Reserved: a non-empty value is rejected at load time.
pub format: Option<String>,
}
/// Provenance information of a recognizer, copied verbatim from the
/// `[recognizers.source]` table.
#[derive(Debug, Clone, PartialEq)]
pub struct SourceSpec {
/// Required origin label (e.g. `ported` in the test fixture).
pub origin: String,
/// Optional upstream the recognizer came from.
pub from: Option<String>,
/// Optional license identifier.
pub license: Option<String>,
}
/// Locale data of a rulepack: one bucket per `[locale.<bucket>]` table.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct LocaleData {
/// Buckets keyed by bucket name.
pub buckets: HashMap<String, LocaleBucket>,
}
/// A named list of strings inside the rulepack's locale data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleBucket {
/// Entries of the bucket; anchored-match recognizers look these up as cues.
pub names: Vec<String>,
}
/// Where `Rulepack::load` reads the rulepack text from.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum RulepackSource {
/// Rulepack TOML text compiled into the binary.
Embedded(&'static str),
/// Path of a rulepack TOML file, read as UTF-8 text.
Path(PathBuf),
}
/// Errors produced while loading, parsing or validating a rulepack.
///
/// Variants without a per-variant note are not constructed in the part of the
/// file visible here.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum RulepackError {
/// Reading a rulepack file failed.
#[error("failed to read rulepack: {0}")]
Io(#[source] std::io::Error),
/// The rulepack text is not valid TOML or does not match the schema.
#[error("failed to parse rulepack TOML: {0}")]
Toml(#[source] toml::de::Error),
/// `schema_version` does not start with the supported prefix.
#[error("unsupported rulepack schema_version {found}; supported {supported}")]
SchemaVersion { found: String, supported: String },
/// A recognizer `class` is neither a built-in class nor a non-empty `custom:<name>`.
#[error("unknown pii class: {0}")]
UnknownClass(String),
/// A locale string was rejected by `LocaleTag::parse`.
#[error("unknown locale: {0}")]
UnknownLocale(String),
#[error("unsupported matcher kind: {0}")]
UnsupportedMatcher(String),
/// An `anchored_match` field (or one of its cue values) has an unsupported value.
#[error("unsupported anchored_match field '{field}' value '{value}'")]
UnsupportedAnchoredMatch { field: String, value: String },
/// A reserved rulepack field was set.
#[error("unsupported rulepack field '{field}' in B1; planned for {planned_version}")]
UnsupportedField {
field: String,
planned_version: &'static str,
},
#[error("unsupported validator kind: {kind}")]
UnsupportedValidator { kind: String },
#[error("unsupported normalizer kind: {kind}")]
UnsupportedNormalizer { kind: String },
#[error("unsupported rule spec variant: {variant}")]
UnsupportedRuleSpec { variant: String },
#[error("duplicate recognizer id '{id}' in rulepacks '{first_pack}' and '{second_pack}'")]
DuplicateId {
id: String,
first_pack: String,
second_pack: String,
},
/// A regex matcher defines both or neither of `pattern` / `pattern_template`.
#[error("regex recognizer '{id}' must define exactly one of pattern or pattern_template")]
RegexPatternChoice { id: String },
#[error("unknown pattern_template placeholder '{placeholder}' in recognizer '{id}'")]
UnknownPatternTemplatePlaceholder { id: String, placeholder: String },
#[error(
"context class_map override for dictionary '{dict}' changes {old_class:?} to {new_class:?}, but {uncovered_rule}"
)]
ClassMapOverrideClash {
dict: String,
old_class: PiiClass,
new_class: PiiClass,
uncovered_rule: String,
},
/// Two recognizers share a class and neither lists the other in `cooperates_with`.
#[error(
"same-class recognizers '{recognizer_a}' and '{recognizer_b}' both emit {class:?} but neither declares cooperates_with"
)]
SameClassWithoutCooperation {
class: PiiClass,
recognizer_a: String,
recognizer_b: String,
},
/// Raised by the locale-projection lint when `strict_locale_overlap` is enabled.
#[error(
"recognizers {recognizer_ids:?} share class {class:?} with equivalent regex shape and overlapping locale projection {locale_overlap:?}"
)]
ConflictingLocaleProjection {
class: PiiClass,
recognizer_ids: Vec<String>,
locale_overlap: Vec<LocaleTag>,
},
}
impl Rulepack {
pub fn load(source: RulepackSource) -> Result<Rulepack, RulepackError> {
let raw = match source {
RulepackSource::Embedded(contents) => contents.to_string(),
RulepackSource::Path(path) => {
std::fs::read_to_string(path).map_err(RulepackError::Io)?
}
};
Self::parse(&raw)
}
pub fn parse(raw: &str) -> Result<Rulepack, RulepackError> {
let (raw, lint) = extract_recognizer_lint_config(raw);
let raw: RawRulepack = toml::from_str(&raw).map_err(RulepackError::Toml)?;
RawRulepackWithLint { raw, lint }.try_into()
}
pub fn activated_classes(&self) -> BTreeSet<PiiClass> {
self.recognizers
.iter()
.filter(|recognizer| recognizer.enabled)
.map(|recognizer| recognizer.class.clone())
.collect()
}
}
/// Serde mirror of the rulepack TOML document before validation.
/// Unknown top-level keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRulepack {
schema_version: String,
rulepack_id: String,
rulepack_version: String,
// Locale strings; converted to `LocaleTag`s during validation.
#[serde(default)]
default_locales: Vec<String>,
#[serde(default)]
locale: Option<RawLocaleData>,
#[serde(default)]
recognizers: Vec<RawRecognizerSpec>,
}
/// Lint switches read from the `[recognizers.lint]` section of a rulepack.
#[derive(Debug, Default)]
struct RawRecognizerLintConfig {
// When true, a locale-projection collision is an error instead of a warning.
strict_locale_overlap: bool,
}
/// A deserialized rulepack paired with the lint configuration extracted from
/// the same text; the input of the validating `TryFrom` conversion.
#[derive(Debug)]
struct RawRulepackWithLint {
raw: RawRulepack,
lint: RawRecognizerLintConfig,
}
/// Serde mirror of the `[locale]` table: every sub-table becomes one bucket
/// keyed by its name (via `flatten`).
#[derive(Debug, Deserialize)]
struct RawLocaleData {
#[serde(flatten)]
buckets: HashMap<String, RawLocaleBucket>,
}
/// Serde mirror of one `[locale.<bucket>]` table; `names` is required and
/// unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawLocaleBucket {
names: Vec<String>,
}
/// Serde mirror of one `[[recognizers]]` entry before validation.
/// Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRecognizerSpec {
id: String,
// Class spelling as written; resolved by `parse_class`.
class: String,
#[serde(default)]
cooperates_with: Vec<String>,
// Recognizers are enabled unless the rulepack says otherwise.
#[serde(default = "default_true")]
enabled: bool,
// Empty means "use the pack's default locales".
#[serde(default)]
locales: Vec<String>,
// The TOML key is `match`, which is a Rust keyword.
#[serde(rename = "match")]
matcher: RawMatch,
#[serde(default)]
context: Option<RawContextSpec>,
#[serde(default)]
validator: Option<RawValidatorSpec>,
#[serde(default)]
normalizer: Option<RawNormalizerSpec>,
#[serde(default)]
scoring: Option<RawScoringSpec>,
#[serde(default)]
token: RawTokenSpec,
#[serde(default)]
source: Option<RawSourceSpec>,
}
/// Serde mirror of `[recognizers.context]`; every key is optional and unknown
/// keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawContextSpec {
#[serde(default)]
hotwords: Vec<String>,
#[serde(default)]
window: Option<u16>,
#[serde(default)]
boost: Option<f32>,
#[serde(default)]
exclusions: Vec<String>,
}
/// Serde mirror of `[recognizers.validator]`; `kind` is required.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawValidatorSpec {
kind: String,
}
/// Serde mirror of `[recognizers.normalizer]`; `kind` is required.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawNormalizerSpec {
kind: String,
}
/// Serde mirror of `[recognizers.scoring]`.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawScoringSpec {
// Falls back to `default_base_score()` when the key is absent.
#[serde(default = "default_base_score")]
base: f32,
// Falls back to 0 when the key is absent.
#[serde(default)]
priority: i32,
}
/// Serde mirror of `[recognizers.token]`; both keys are optional and the
/// whole table defaults to "nothing set".
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawTokenSpec {
#[serde(default)]
family: Option<String>,
#[serde(default)]
format: Option<String>,
}
/// Serde mirror of `[recognizers.source]`; `origin` is required, the rest optional.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawSourceSpec {
origin: String,
#[serde(default)]
from: Option<String>,
#[serde(default)]
license: Option<String>,
}
impl TryFrom<RawRulepack> for Rulepack {
type Error = RulepackError;
fn try_from(raw: RawRulepack) -> Result<Self, Self::Error> {
RawRulepackWithLint {
raw,
lint: RawRecognizerLintConfig::default(),
}
.try_into()
}
}
impl TryFrom<RawRulepackWithLint> for Rulepack {
type Error = RulepackError;
fn try_from(raw_with_lint: RawRulepackWithLint) -> Result<Self, Self::Error> {
let raw = raw_with_lint.raw;
if !raw.schema_version.starts_with(SUPPORTED_SCHEMA_MAJOR_MINOR) {
return Err(RulepackError::SchemaVersion {
found: raw.schema_version,
supported: "~0.1.x".to_string(),
});
}
let default_locales = parse_locales(raw.default_locales)?;
let recognizers = raw
.recognizers
.into_iter()
.map(|recognizer| parse_recognizer(recognizer, &default_locales))
.collect::<Result<Vec<_>, _>>()?;
validate_rulepack_recognizers(&recognizers, &default_locales, &raw_with_lint.lint)?;
let locale = raw.locale.map(LocaleData::from);
reject_anchored_match_ellipsis_cues(&recognizers, locale.as_ref())?;
Ok(Self {
schema_version: raw.schema_version,
rulepack_id: raw.rulepack_id,
rulepack_version: raw.rulepack_version,
default_locales,
locale,
recognizers,
})
}
}
/// Splits the `[recognizers.lint]` section out of rulepack TOML text.
///
/// Returns the text with that section removed (each kept line terminated by
/// `\n`) together with the lint settings found in it. The section starts at a
/// `[recognizers.lint]` header line and ends at the next line that starts
/// with `[`. Inside it only `strict_locale_overlap` is read; the value is
/// `true` when it equals `true` ignoring ASCII case, `false` otherwise.
///
/// Trailing `# ...` comments on the header line and on key/value lines of
/// the section are ignored, so `strict_locale_overlap = true # note` enables
/// the lint instead of silently evaluating to `false`.
///
/// This is a line-based scan, not a TOML parse: header spellings with inner
/// whitespace or header-looking lines inside multi-line strings are not
/// handled specially.
fn extract_recognizer_lint_config(raw: &str) -> (String, RawRecognizerLintConfig) {
    /// Returns `line` without a trailing `# ...` comment or surrounding whitespace.
    fn without_comment(line: &str) -> &str {
        line.split('#').next().unwrap_or(line).trim()
    }

    let mut sanitized = String::with_capacity(raw.len());
    let mut lint = RawRecognizerLintConfig::default();
    let mut in_lint = false;
    for line in raw.lines() {
        let trimmed = line.trim();
        // Comment stripping is applied only for the header comparison; lines
        // outside the lint section are forwarded untouched.
        if without_comment(trimmed) == "[recognizers.lint]" {
            in_lint = true;
            continue;
        }
        // Any other table header closes the lint section.
        if in_lint && trimmed.starts_with('[') {
            in_lint = false;
        }
        if in_lint {
            if let Some((key, value)) = without_comment(trimmed).split_once('=') {
                if key.trim() == "strict_locale_overlap" {
                    lint.strict_locale_overlap = value.trim().eq_ignore_ascii_case("true");
                }
            }
            // Lint lines never reach the TOML parser.
            continue;
        }
        sanitized.push_str(line);
        sanitized.push('\n');
    }
    (sanitized, lint)
}
impl From<RawLocaleData> for LocaleData {
fn from(raw: RawLocaleData) -> Self {
Self {
buckets: raw
.buckets
.into_iter()
.map(|(name, bucket)| {
(
name,
LocaleBucket {
names: bucket.names,
},
)
})
.collect(),
}
}
}
fn parse_recognizer(
raw: RawRecognizerSpec,
default_locales: &[LocaleTag],
) -> Result<RecognizerSpec, RulepackError> {
reject_unshipped_fields(&raw)?;
validate_matcher(&raw)?;
let locales = if raw.locales.is_empty() {
default_locales.to_vec()
} else {
parse_locales(raw.locales)?
};
Ok(RecognizerSpec {
id: raw.id,
class: parse_class(&raw.class)?,
cooperates_with: raw.cooperates_with,
enabled: raw.enabled,
locales,
matcher: raw.matcher,
context: raw.context.map(|context| ContextSpec {
hotwords: context.hotwords,
window: context.window,
boost: context.boost,
exclusions: context.exclusions,
}),
validator: raw.validator.map(|validator| ValidatorSpec {
kind: validator.kind,
}),
normalizer: raw.normalizer.map(|normalizer| NormalizerSpec {
kind: normalizer.kind,
}),
scoring: raw.scoring.map_or_else(
|| ScoringSpec {
base: default_base_score(),
priority: 0,
},
|scoring| ScoringSpec {
base: scoring.base,
priority: scoring.priority,
},
),
token: TokenSpec {
family: raw.token.family,
format: raw.token.format,
},
source: raw.source.map(|source| SourceSpec {
origin: source.origin,
from: source.from,
license: source.license,
}),
})
}
/// Checks the matcher of a raw recognizer.
///
/// * `regex`: exactly one of `pattern` / `pattern_template` must be set.
/// * `anchored_match`: `cues_bucket` must not be blank, `right_window_chars`
///   must lie in `1..=512`, and `boundary`, `name_shape` and `cue_position`
///   must use one of the supported spellings. The first offending field, in
///   that order, is reported.
/// * `dictionary` and `ner`: accepted as-is.
///
/// # Errors
/// Returns [`RulepackError::RegexPatternChoice`] or
/// [`RulepackError::UnsupportedAnchoredMatch`].
fn validate_matcher(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
    /// Builds the error for an unsupported `anchored_match` field value.
    fn unsupported(field: &str, value: String) -> Result<(), RulepackError> {
        Err(RulepackError::UnsupportedAnchoredMatch {
            field: field.to_string(),
            value,
        })
    }

    match &raw.matcher {
        RawMatch::Dictionary { .. } | RawMatch::Ner { .. } => Ok(()),
        RawMatch::Regex {
            pattern,
            pattern_template,
            ..
        } => {
            if pattern.is_some() != pattern_template.is_some() {
                Ok(())
            } else {
                Err(RulepackError::RegexPatternChoice { id: raw.id.clone() })
            }
        }
        RawMatch::AnchoredMatch {
            cues_bucket,
            boundary,
            right_window_chars,
            name_shape,
            cue_position,
            ..
        } => {
            let window = *right_window_chars;
            if cues_bucket.trim().is_empty() {
                unsupported("cues_bucket", cues_bucket.clone())
            } else if window == 0 || window > 512 {
                unsupported("right_window_chars", window.to_string())
            } else if !["punctuation", "whitespace", "line_end"].contains(&boundary.as_str()) {
                unsupported("boundary", boundary.clone())
            } else if name_shape != "person_name" {
                unsupported("name_shape", name_shape.clone())
            } else if !["before", "after"].contains(&cue_position.as_str()) {
                unsupported("cue_position", cue_position.clone())
            } else {
                Ok(())
            }
        }
    }
}
/// Rejects anchored-match recognizers whose cue bucket contains a cue with
/// `...` in it.
///
/// Recognizers are checked in order; those that are not anchored matches, or
/// whose `cues_bucket` is not present in `locale`, are skipped. Without
/// locale data nothing is checked.
///
/// # Errors
/// Returns [`RulepackError::UnsupportedAnchoredMatch`] naming
/// `locale.<bucket>.names` and the first offending cue.
fn reject_anchored_match_ellipsis_cues(
    recognizers: &[RecognizerSpec],
    locale: Option<&LocaleData>,
) -> Result<(), RulepackError> {
    let Some(locale_data) = locale else {
        return Ok(());
    };

    let offending = recognizers.iter().find_map(|spec| {
        let RawMatch::AnchoredMatch { cues_bucket, .. } = &spec.matcher else {
            return None;
        };
        let bucket = locale_data.buckets.get(cues_bucket)?;
        bucket
            .names
            .iter()
            .find(|cue| cue.contains("..."))
            .map(|cue| (cues_bucket, cue))
    });

    match offending {
        Some((cues_bucket, cue)) => Err(RulepackError::UnsupportedAnchoredMatch {
            field: format!("locale.{cues_bucket}.names"),
            value: cue.clone(),
        }),
        None => Ok(()),
    }
}
/// Rejects recognizers that set a reserved field.
///
/// Checked in this order, reporting the first hit: a non-empty
/// `token.format`, a non-empty `context.hotwords`, any `context.boost`, any
/// `context.window`.
///
/// # Errors
/// Returns [`RulepackError::UnsupportedField`] with the dotted field name.
fn reject_unshipped_fields(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
    const PLANNED_VERSION: &str = "v0.4.1";

    let format_is_set = matches!(raw.token.format.as_deref(), Some(value) if !value.is_empty());
    let reserved_field = if format_is_set {
        Some("token.format")
    } else {
        raw.context.as_ref().and_then(|context| {
            if !context.hotwords.is_empty() {
                Some("context.hotwords")
            } else if context.boost.is_some() {
                Some("context.boost")
            } else if context.window.is_some() {
                Some("context.window")
            } else {
                None
            }
        })
    };

    match reserved_field {
        Some(field) => Err(RulepackError::UnsupportedField {
            field: field.to_string(),
            planned_version: PLANNED_VERSION,
        }),
        None => Ok(()),
    }
}
/// Requires every pair of same-class recognizers to cooperate explicitly.
///
/// A pair is accepted when either recognizer lists the other's id in its
/// `cooperates_with`. Pairs are visited in declaration order and the first
/// offending pair is reported. The `enabled` flag is not consulted.
///
/// # Errors
/// Returns [`RulepackError::SameClassWithoutCooperation`] for the first pair
/// that shares a class without cooperating.
pub fn recognizer_composition_validator(
    recognizers: &[RecognizerSpec],
) -> Result<(), RulepackError> {
    let lists = |spec: &RecognizerSpec, peer: &RecognizerSpec| {
        spec.cooperates_with.iter().any(|id| id == &peer.id)
    };

    for (position, earlier) in recognizers.iter().enumerate() {
        let clash = recognizers[position + 1..].iter().find(|later| {
            earlier.class == later.class && !lists(earlier, later) && !lists(later, earlier)
        });
        if let Some(later) = clash {
            return Err(RulepackError::SameClassWithoutCooperation {
                class: earlier.class.clone(),
                recognizer_a: earlier.id.clone(),
                recognizer_b: later.id.clone(),
            });
        }
    }
    Ok(())
}
/// Runs the cross-recognizer checks of a rulepack: the cooperation
/// validator, then the locale-projection lint, then the global naked-pattern
/// lint (which only logs). Stops at the first error.
fn validate_rulepack_recognizers(
    recognizers: &[RecognizerSpec],
    active_locales: &[LocaleTag],
    lint: &RawRecognizerLintConfig,
) -> Result<(), RulepackError> {
    recognizer_composition_validator(recognizers)
        .and_then(|()| lint_locale_projection_collisions(recognizers, active_locales, lint))
        .map(|()| lint_global_naked_patterns(recognizers))
}
/// Flags pairs of enabled same-class recognizers whose regexes have the same
/// structural digit shape and that both project onto at least one locale.
///
/// Only patterns accepted by `is_truly_naked_numeric` and
/// `regex_structural_shape` take part. For each such pair a warning is
/// logged; with `lint.strict_locale_overlap` the first pair is returned as an
/// error instead.
///
/// NOTE(review): the reported `locale_overlap` is the merged (union) list of
/// both projections, and no intersection test is performed — confirm that
/// this is the intended meaning of "overlap".
///
/// # Errors
/// Returns [`RulepackError::ConflictingLocaleProjection`] in strict mode.
fn lint_locale_projection_collisions(
    recognizers: &[RecognizerSpec],
    active_locales: &[LocaleTag],
    lint: &RawRecognizerLintConfig,
) -> Result<(), RulepackError> {
    for (position, anchor) in recognizers.iter().enumerate() {
        if !anchor.enabled {
            continue;
        }
        let anchor_shape = match regex_structural_shape(&anchor.matcher) {
            Some(shape) if is_truly_naked_numeric(&anchor.matcher) => shape,
            _ => continue,
        };
        let anchor_locales = locale_projection(&anchor.locales, active_locales);
        if anchor_locales.is_empty() {
            continue;
        }

        // Only later recognizers are compared, so each pair is seen once.
        let candidates = recognizers[position + 1..].iter().filter(|peer| {
            peer.enabled
                && anchor.class == peer.class
                && is_truly_naked_numeric(&peer.matcher)
                && regex_structural_shape(&peer.matcher).as_ref() == Some(&anchor_shape)
        });
        for peer in candidates {
            let peer_locales = locale_projection(&peer.locales, active_locales);
            if peer_locales.is_empty() {
                continue;
            }

            let recognizer_ids = vec![anchor.id.clone(), peer.id.clone()];
            let locale_overlap = merged_locale_projection(&anchor_locales, &peer_locales);
            if lint.strict_locale_overlap {
                return Err(RulepackError::ConflictingLocaleProjection {
                    class: anchor.class.clone(),
                    recognizer_ids,
                    locale_overlap,
                });
            }
            tracing::warn!(
                class = %anchor.class.class_name(),
                recognizer_ids = ?recognizer_ids,
                locale_overlap = ?locale_overlap,
                "recognizers share class with naked-shape regex and non-disjoint locale projection"
            );
        }
    }
    Ok(())
}
/// Logs a warning for every enabled recognizer whose locales are exactly
/// `[Global]` and whose regex has a structural digit shape shorter than six
/// with no separator in the pattern. Never fails.
fn lint_global_naked_patterns(recognizers: &[RecognizerSpec]) {
    let global_only = recognizers
        .iter()
        .filter(|spec| spec.enabled && spec.locales == [LocaleTag::Global]);

    for spec in global_only {
        let RawMatch::Regex {
            pattern: Some(pattern),
            ..
        } = &spec.matcher
        else {
            continue;
        };
        let Some(shape) = regex_structural_shape(&spec.matcher) else {
            continue;
        };

        let is_short = shape.minimum_match_len < 6;
        if is_short && !has_regex_separator(pattern) {
            tracing::warn!(
                recognizer_id = %spec.id,
                class = %spec.class.class_name(),
                minimum_match_len = shape.minimum_match_len,
                "global recognizer uses short naked regex shape"
            );
        }
    }
}
/// Coarse structural summary of a regex used by the lints to compare patterns.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RegexStructuralShape {
// The number read from the first digit quantifier found in the pattern.
minimum_match_len: usize,
// The character class the quantifier applies to.
character_class: RegexCharacterClass,
}
/// Character classes a `RegexStructuralShape` can describe; only digits are
/// recognised at present.
#[derive(Debug, Clone, PartialEq, Eq)]
enum RegexCharacterClass {
Digit,
}
/// Derives the structural digit shape of a regex matcher.
///
/// Returns `None` unless the matcher is a regex with a literal `pattern` and
/// no `pattern_template`, the pattern has no unescaped `^` / `$` outside a
/// character class, and a digit quantifier with a leading number is found.
fn regex_structural_shape(matcher: &RawMatch) -> Option<RegexStructuralShape> {
    match matcher {
        RawMatch::Regex {
            pattern: Some(literal),
            pattern_template: None,
            ..
        } if !has_unescaped_line_anchor(literal) => {
            let minimum_match_len = digit_quantifier_minimum(literal)?;
            Some(RegexStructuralShape {
                minimum_match_len,
                character_class: RegexCharacterClass::Digit,
            })
        }
        _ => None,
    }
}
fn is_truly_naked_numeric(matcher: &RawMatch) -> bool {
let RawMatch::Regex {
pattern: Some(pattern),
..
} = matcher
else {
return false;
};
let mut chars = pattern.chars();
while let Some(ch) = chars.next() {
if ch == '\\' {
chars.next();
continue;
}
if ch.is_ascii_alphabetic() {
return false;
}
}
true
}
/// Reports whether `pattern` contains a `^` or `$` that is neither escaped
/// with a backslash nor located inside a `[...]` character class.
///
/// Class tracking is a simple flag: `[` sets it and `]` clears it.
fn has_unescaped_line_anchor(pattern: &str) -> bool {
    let mut inside_class = false;
    let mut symbols = pattern.chars();
    while let Some(symbol) = symbols.next() {
        if symbol == '\\' {
            // Consume the escaped character without inspecting it.
            symbols.next();
            continue;
        }
        if symbol == '[' {
            inside_class = true;
        } else if symbol == ']' {
            inside_class = false;
        } else if (symbol == '^' || symbol == '$') && !inside_class {
            return true;
        }
    }
    false
}
fn digit_quantifier_minimum(pattern: &str) -> Option<usize> {
find_digit_quantifier(pattern, r"\d{")
.or_else(|| find_digit_quantifier(pattern, "[0-9]{"))
.or_else(|| find_digit_quantifier(pattern, "[[:digit:]]{"))
}
/// Parses the run of ASCII digits that directly follows the first occurrence
/// of `needle` in `pattern`.
///
/// Returns `None` when `needle` is absent, when no digit follows it, or when
/// the digits do not fit in a `usize`. Only the first occurrence is examined.
fn find_digit_quantifier(pattern: &str, needle: &str) -> Option<usize> {
    let (_, tail) = pattern.split_once(needle)?;
    // ASCII digits are one byte each, so the count is also a valid slice end.
    let digit_count = tail.bytes().take_while(u8::is_ascii_digit).count();
    if digit_count == 0 {
        return None;
    }
    tail[..digit_count].parse().ok()
}
/// Projects a recognizer's locales onto the active ones.
///
/// Keeps, in order, every `Global` entry and every entry that also appears
/// in `active_locales`; everything else is dropped.
fn locale_projection(locales: &[LocaleTag], active_locales: &[LocaleTag]) -> Vec<LocaleTag> {
    locales
        .iter()
        .filter_map(|tag| {
            if *tag == LocaleTag::Global {
                Some(LocaleTag::Global)
            } else if active_locales.contains(tag) {
                Some(tag.clone())
            } else {
                None
            }
        })
        .collect()
}
/// Concatenates `left` and `right`, dropping entries already seen, while
/// keeping first-occurrence order.
fn merged_locale_projection(left: &[LocaleTag], right: &[LocaleTag]) -> Vec<LocaleTag> {
    left.iter()
        .chain(right)
        .fold(Vec::new(), |mut unique, tag| {
            if !unique.contains(tag) {
                unique.push(tag.clone());
            }
            unique
        })
}
/// Reports whether `pattern` contains any of the characters `-`, `/`, `.`,
/// `+`, or the substrings `\s` or `[:space:]`.
///
/// This is a plain textual test: a `-` inside a range such as `[0-9]` counts too.
fn has_regex_separator(pattern: &str) -> bool {
    let has_separator_char = pattern
        .chars()
        .any(|symbol| matches!(symbol, '-' | '/' | '.' | '+'));
    has_separator_char
        || ["\\s", "[:space:]"]
            .iter()
            .any(|token| pattern.contains(*token))
}
/// Resolves a class spelling from a rulepack into a [`PiiClass`].
///
/// Surrounding whitespace is ignored and matching is ASCII
/// case-insensitive. Accepted spellings are `email`, `name`, `location`,
/// `organization`, and `custom:<name>` where `<name>` is not blank; the
/// custom name is handed to `PiiClass::custom` in its original casing.
///
/// # Errors
/// Returns [`RulepackError::UnknownClass`] carrying the original input for
/// any other spelling.
pub fn parse_class(input: &str) -> Result<PiiClass, RulepackError> {
    let unknown = || RulepackError::UnknownClass(input.to_string());
    let spelled = input.trim();
    let folded = spelled.to_ascii_lowercase();

    if folded.starts_with("custom:") {
        // Take the name from the unfolded text so its casing is preserved.
        let custom_name = spelled.split_once(':').map_or("", |(_, rest)| rest);
        return if custom_name.trim().is_empty() {
            Err(unknown())
        } else {
            Ok(PiiClass::custom(custom_name))
        };
    }

    match folded.as_str() {
        "email" => Ok(PiiClass::Email),
        "name" => Ok(PiiClass::Name),
        "location" => Ok(PiiClass::Location),
        "organization" => Ok(PiiClass::Organization),
        _ => Err(unknown()),
    }
}
/// Parses locale strings into [`LocaleTag`]s, preserving order.
///
/// # Errors
/// Returns [`RulepackError::UnknownLocale`] with the first string that
/// `LocaleTag::parse` rejects.
fn parse_locales(locales: Vec<String>) -> Result<Vec<LocaleTag>, RulepackError> {
    let mut parsed = Vec::with_capacity(locales.len());
    for raw_tag in locales {
        match LocaleTag::parse(&raw_tag) {
            Ok(tag) => parsed.push(tag),
            Err(_) => return Err(RulepackError::UnknownLocale(raw_tag.clone())),
        }
    }
    Ok(parsed)
}
/// Serde default for `RawRecognizerSpec::enabled`: a recognizer is enabled
/// unless the rulepack says otherwise.
fn default_true() -> bool {
true
}
/// Base score used when a recognizer gives no `scoring.base` (or no scoring
/// table at all).
fn default_base_score() -> f32 {
0.70
}
// Unit tests for rulepack parsing and validation. Fixtures are inline TOML
// strings; tests gated on `bundled-recognizers` load the embedded rulepacks.
#[cfg(test)]
mod tests {
use super::*;
// Minimal valid rulepack: one locale bucket and one regex email recognizer
// with every optional table present.
const CORE: &str = r#"
schema_version = "0.1.0"
rulepack_id = "gaze-core"
rulepack_version = "0.4.0"
default_locales = ["global"]
[locale.email_headers]
names = ["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
[[recognizers]]
id = "email.global"
class = "Email"
enabled = true
locales = ["global"]
[recognizers.match]
kind = "regex"
pattern = '''(?i)\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b'''
[recognizers.context]
exclusions = ["example.com"]
[recognizers.validator]
kind = "email_rfc"
[recognizers.normalizer]
kind = "email_canonical"
[recognizers.scoring]
base = 0.70
priority = 90
[recognizers.token]
[recognizers.source]
origin = "ported"
from = "presidio"
license = "Apache-2.0"
"#;
// The CORE fixture parses and its main fields come through unchanged.
#[test]
fn parses_core_rulepack_end_to_end() {
let rulepack = Rulepack::parse(CORE).expect("core rulepack");
assert_eq!(rulepack.rulepack_id, "gaze-core");
assert_eq!(rulepack.default_locales, vec![LocaleTag::Global]);
let header_names = &rulepack
.locale
.as_ref()
.and_then(|locale| locale.buckets.get("email_headers"))
.expect("email headers")
.names;
assert_eq!(
header_names,
&vec!["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
);
assert_eq!(rulepack.recognizers.len(), 1);
let recognizer = &rulepack.recognizers[0];
assert_eq!(recognizer.id, "email.global");
assert_eq!(recognizer.class, PiiClass::Email);
assert_eq!(recognizer.scoring.priority, 90);
assert!(matches!(recognizer.matcher, RawMatch::Regex { .. }));
}
// The embedded "core" rulepack activates exactly the Email and Name classes.
#[cfg(feature = "bundled-recognizers")]
#[test]
fn embedded_core_activated_classes_match_rulepack_classes() {
let rulepack = Rulepack::load(RulepackSource::Embedded(
gaze_recognizers::embedded("core").expect("core rulepack"),
))
.expect("embedded core rulepack");
assert_eq!(
rulepack.activated_classes(),
BTreeSet::from([PiiClass::Email, PiiClass::Name])
);
}
// Every Name recognizer of the embedded "core" rulepack lists every other
// Name recognizer in `cooperates_with`.
#[cfg(feature = "bundled-recognizers")]
#[test]
fn embedded_core_loads_full_name_recognizer_cooperation_matrix() {
let rulepack = Rulepack::load(RulepackSource::Embedded(
gaze_recognizers::embedded("core").expect("core rulepack"),
))
.expect("embedded core rulepack");
let name_recognizers = rulepack
.recognizers
.iter()
.filter(|recognizer| recognizer.class == PiiClass::Name)
.collect::<Vec<_>>();
assert_eq!(name_recognizers.len(), 5);
for recognizer in &name_recognizers {
for peer in &name_recognizers {
if recognizer.id == peer.id {
continue;
}
assert!(
recognizer.cooperates_with.contains(&peer.id),
"{} missing cooperates_with {}",
recognizer.id,
peer.id
);
}
}
}
// The embedded "core-extended" rulepack activates the listed custom classes.
#[cfg(feature = "bundled-recognizers")]
#[test]
fn embedded_core_extended_activated_classes_match_rulepack_classes() {
let rulepack = Rulepack::load(RulepackSource::Embedded(
gaze_recognizers::embedded("core-extended").expect("core-extended rulepack"),
))
.expect("embedded core-extended rulepack");
assert_eq!(
rulepack.activated_classes(),
BTreeSet::from([
PiiClass::custom("phone"),
PiiClass::custom("iban"),
PiiClass::custom("credit_card"),
PiiClass::custom("ip_address"),
PiiClass::custom("eth_address"),
PiiClass::custom("postal_code"),
])
);
}
// Appending a synthetic recognizer to the embedded pack makes its class
// show up in `activated_classes`.
#[cfg(feature = "bundled-recognizers")]
#[test]
fn activated_classes_include_new_rulepack_recognizer_class() {
let raw = format!(
r#"{}
[[recognizers]]
id = "test.only"
class = "custom:test_only"
enabled = true
locales = ["global"]
[recognizers.match]
kind = "regex"
pattern = "TEST_ONLY"
[recognizers.scoring]
base = 0.70
priority = 1
"#,
gaze_recognizers::embedded("core-extended").expect("core-extended rulepack")
);
let rulepack = Rulepack::parse(&raw).expect("core-extended with synthetic recognizer");
assert!(
rulepack
.activated_classes()
.contains(&PiiClass::custom("test_only")),
"new recognizer class must be derived from rulepack data"
);
}
// `token.family` is accepted and preserved.
#[test]
fn rulepack_accepts_token_family() {
let rulepack = Rulepack::parse(&unsupported_field_rulepack(
"[recognizers.token]\nfamily = \"email.formatpreserve\"\n",
))
.expect("token family is active in v0.4.1");
assert_eq!(
rulepack.recognizers[0].token.family.as_deref(),
Some("email.formatpreserve")
);
}
// A non-empty `token.format` is rejected as a reserved field.
#[test]
fn rulepack_rejects_unsupported_token_format() {
let err = Rulepack::parse(&unsupported_field_rulepack(
"[recognizers.token]\nformat = \"Customer_{n}\"\n",
))
.expect_err("token format is reserved for v0.4.1");
assert_unsupported_field(err, "token.format");
}
// A non-empty `context.hotwords` is rejected as a reserved field.
#[test]
fn rulepack_rejects_unsupported_context_hotwords() {
let err = Rulepack::parse(&unsupported_field_rulepack(
"[recognizers.context]\nhotwords = [\"foo\"]\n",
))
.expect_err("context hotwords are reserved for v0.4.1");
assert_unsupported_field(err, "context.hotwords");
}
// Any `context.boost` is rejected as a reserved field.
#[test]
fn rulepack_rejects_unsupported_context_boost() {
let err = Rulepack::parse(&unsupported_field_rulepack(
"[recognizers.context]\nboost = 0.10\n",
))
.expect_err("context boost is reserved for v0.4.1");
assert_unsupported_field(err, "context.boost");
}
// Any `context.window` is rejected as a reserved field.
#[test]
fn rulepack_rejects_unsupported_context_window() {
let err = Rulepack::parse(&unsupported_field_rulepack(
"[recognizers.context]\nwindow = 12\n",
))
.expect_err("context window is reserved for v0.4.1");
assert_unsupported_field(err, "context.window");
}
// With the reserved keys absent, the corresponding fields stay unset.
#[test]
fn rulepack_accepts_default_token_fields() {
let rulepack = Rulepack::parse(CORE).expect("reserved token/context fields are unset");
let recognizer = &rulepack.recognizers[0];
assert_eq!(recognizer.token.family, None);
assert_eq!(recognizer.token.format, None);
assert!(recognizer.context.as_ref().unwrap().hotwords.is_empty());
assert_eq!(recognizer.context.as_ref().unwrap().boost, None);
assert_eq!(recognizer.context.as_ref().unwrap().window, None);
}
// The helper fixture already sets `pattern`; adding `pattern_template`
// to the same match table must fail.
#[test]
fn pattern_template_with_pattern_both_present_fails_closed() {
let err = Rulepack::parse(&unsupported_field_rulepack(
"pattern_template = \"{locale_email_headers}: (.+)\"\n",
))
.expect_err("pattern and pattern_template are mutually exclusive");
assert!(matches!(
err,
RulepackError::RegexPatternChoice { id } if id == "bad.email"
));
}
// A regex matcher with neither `pattern` nor `pattern_template` must fail.
#[test]
fn regex_pattern_or_template_is_required() {
let raw = r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]
[[recognizers]]
id = "bad.email"
class = "Email"
enabled = true
[recognizers.match]
kind = "regex"
"#;
let err = Rulepack::parse(raw).expect_err("regex pattern is required");
assert!(matches!(
err,
RulepackError::RegexPatternChoice { id } if id == "bad.email"
));
}
// The default anchored_match fixture is accepted.
#[test]
fn anchored_match_accepts_valid_schema() {
let rulepack = Rulepack::parse(&anchored_match_rulepack("")).expect("anchored_match");
assert!(matches!(
rulepack.recognizers[0].matcher,
RawMatch::AnchoredMatch { .. }
));
}
// An unsupported `boundary` spelling is rejected.
#[test]
fn anchored_match_rejects_unknown_boundary() {
let err = Rulepack::parse(&anchored_match_rulepack("boundary = \"paragraph\"\n"))
.expect_err("unknown boundary fails closed");
assert_unsupported_anchored_match(err, "boundary", "paragraph");
}
// An unsupported `name_shape` spelling is rejected.
#[test]
fn anchored_match_rejects_unknown_name_shape() {
let err = Rulepack::parse(&anchored_match_rulepack("name_shape = \"organization\"\n"))
.expect_err("unknown name_shape fails closed");
assert_unsupported_anchored_match(err, "name_shape", "organization");
}
// An unsupported `cue_position` spelling is rejected.
#[test]
fn anchored_match_rejects_unknown_cue_position() {
let err = Rulepack::parse(&anchored_match_rulepack("cue_position = \"around\"\n"))
.expect_err("unknown cue_position fails closed");
assert_unsupported_anchored_match(err, "cue_position", "around");
}
// An empty `cues_bucket` is rejected.
#[test]
fn anchored_match_rejects_missing_cues_bucket() {
let err = Rulepack::parse(&anchored_match_rulepack("cues_bucket = \"\"\n"))
.expect_err("missing cues_bucket fails closed");
assert_unsupported_anchored_match(err, "cues_bucket", "");
}
// A cue containing "..." in the referenced locale bucket is rejected.
#[test]
fn anchored_match_rejects_ellipsis_in_cue_values() {
let err = Rulepack::parse(
r#"
schema_version = "0.1.0"
rulepack_id = "anchored"
rulepack_version = "0.6.0"
default_locales = ["global"]
[locale.forward_markers]
names = ["Forwarded ... message"]
[[recognizers]]
id = "name.forward_marker"
class = "Name"
enabled = true
[recognizers.match]
kind = "anchored_match"
cues_bucket = "forward_markers"
boundary = "punctuation"
right_window_chars = 64
name_shape = "person_name"
cue_position = "before"
"#,
)
.expect_err("ellipsis cue fails closed");
assert_unsupported_anchored_match(
err,
"locale.forward_markers.names",
"Forwarded ... message",
);
}
// `right_window_chars` just outside 1..=512 on either side is rejected.
#[test]
fn anchored_match_rejects_invalid_window_bounds() {
for (value, expected) in [("0", "0"), ("513", "513")] {
let err = Rulepack::parse(&anchored_match_rulepack(&format!(
"right_window_chars = {value}\n"
)))
.expect_err("invalid right_window_chars fails closed");
assert_unsupported_anchored_match(err, "right_window_chars", expected);
}
}
// Two Name recognizers without `cooperates_with` fail, reported in
// declaration order.
#[test]
fn rulepack_load_fails_when_two_name_recognizers_omit_cooperates_with() {
let err = Rulepack::parse(
r#"
schema_version = "0.1.0"
rulepack_id = "bad-composition"
rulepack_version = "0.4.1"
default_locales = ["global"]
[[recognizers]]
id = "email.header.name"
class = "Name"
enabled = true
[recognizers.match]
kind = "regex"
pattern = "From: ([A-Z][a-z]+)"
[[recognizers]]
id = "salutation.name"
class = "Name"
enabled = true
[recognizers.match]
kind = "regex"
pattern = "Dear ([A-Z][a-z]+)"
"#,
)
.expect_err("same-class recognizers must explicitly cooperate");
assert!(matches!(
err,
RulepackError::SameClassWithoutCooperation {
class: PiiClass::Name,
recognizer_a,
recognizer_b,
} if recognizer_a == "email.header.name" && recognizer_b == "salutation.name"
));
}
// A one-sided `cooperates_with` declaration is enough for a same-class pair.
#[test]
fn rulepack_load_accepts_same_class_pair_with_cooperates_with() {
let rulepack = Rulepack::parse(
r#"
schema_version = "0.1.0"
rulepack_id = "cooperating-composition"
rulepack_version = "0.4.1"
default_locales = ["global"]
[[recognizers]]
id = "email.header.name"
class = "Name"
cooperates_with = ["salutation.name"]
enabled = true
[recognizers.match]
kind = "regex"
pattern = "From: ([A-Z][a-z]+)"
[[recognizers]]
id = "salutation.name"
class = "Name"
enabled = true
[recognizers.match]
kind = "regex"
pattern = "Dear ([A-Z][a-z]+)"
"#,
)
.expect("cooperates_with unblocks same-class recognizers");
assert_eq!(rulepack.recognizers.len(), 2);
assert_eq!(
rulepack.recognizers[0].cooperates_with,
vec!["salutation.name"]
);
}
// An unknown top-level key yields a TOML error that names the key.
#[test]
fn rejects_unknown_fields_with_parent_table_context() {
let err = Rulepack::parse(
r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]
bogus = true
"#,
)
.expect_err("unknown field must fail");
assert!(matches!(err, RulepackError::Toml(_)));
assert!(err.to_string().contains("bogus"));
}
// A schema version outside the supported prefix is rejected.
#[test]
fn rejects_unsupported_schema_version() {
let err = Rulepack::parse(
r#"
schema_version = "0.2.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
"#,
)
.expect_err("unsupported schema");
assert!(matches!(err, RulepackError::SchemaVersion { .. }));
}
// Built-in classes match case-insensitively; custom names are accepted.
#[test]
fn class_spelling_accepts_pascal_case_and_custom_names() {
assert_eq!(parse_class("Email").unwrap(), PiiClass::Email);
assert_eq!(
parse_class("custom:Class_Alpha").unwrap(),
PiiClass::Custom("class_alpha".to_string())
);
}
// Builds a one-recognizer rulepack whose TOML ends with `extra`. Because
// `extra` follows the `[recognizers.match]` table, bare keys in it land in
// that table unless `extra` opens a new table first.
fn unsupported_field_rulepack(extra: &str) -> String {
format!(
r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]
[[recognizers]]
id = "bad.email"
class = "Email"
enabled = true
[recognizers.match]
kind = "regex"
pattern = ".+"
{extra}
"#
)
}
// Builds a one-recognizer anchored_match rulepack. `override_line` replaces
// the default line of the key it starts with; every other key keeps its
// valid default. An empty string yields the all-default fixture.
fn anchored_match_rulepack(override_line: &str) -> String {
let cues_bucket = if override_line.starts_with("cues_bucket") {
override_line.to_string()
} else {
"cues_bucket = \"forward_markers\"\n".to_string()
};
let boundary = if override_line.starts_with("boundary") {
override_line.to_string()
} else {
"boundary = \"punctuation\"\n".to_string()
};
let right_window_chars = if override_line.starts_with("right_window_chars") {
override_line.to_string()
} else {
"right_window_chars = 64\n".to_string()
};
let name_shape = if override_line.starts_with("name_shape") {
override_line.to_string()
} else {
"name_shape = \"person_name\"\n".to_string()
};
let cue_position = if override_line.starts_with("cue_position") {
override_line.to_string()
} else {
"cue_position = \"before\"\n".to_string()
};
format!(
r#"
schema_version = "0.1.0"
rulepack_id = "anchored"
rulepack_version = "0.6.0"
default_locales = ["global"]
[[recognizers]]
id = "name.forward_marker"
class = "Name"
enabled = true
[recognizers.match]
kind = "anchored_match"
{cues_bucket}{boundary}{right_window_chars}{name_shape}{cue_position}
"#
)
}
// Asserts `err` is `UnsupportedField` for `field` with planned version "v0.4.1".
fn assert_unsupported_field(err: RulepackError, field: &str) {
assert!(matches!(
err,
RulepackError::UnsupportedField {
field: ref actual,
planned_version: "v0.4.1",
} if actual == field
));
}
// Asserts `err` is `UnsupportedAnchoredMatch` with exactly this field and value.
fn assert_unsupported_anchored_match(err: RulepackError, field: &str, value: &str) {
assert!(matches!(
err,
RulepackError::UnsupportedAnchoredMatch {
field: ref actual_field,
value: ref actual_value,
} if actual_field == field && actual_value == value
));
}
}