Skip to main content

gaze/
rulepack.rs

1use std::collections::{BTreeSet, HashMap};
2use std::path::PathBuf;
3
4use serde::Deserialize;
5use thiserror::Error;
6
7use crate::{LocaleTag, PiiClass};
8
/// Prefix a rulepack's `schema_version` must start with to be accepted,
/// i.e. any `0.1.x` release of the schema.
const SUPPORTED_SCHEMA_MAJOR_MINOR: &str = "0.1.";
10
11#[derive(Debug, Clone, PartialEq)]
12pub struct Rulepack {
13    pub schema_version: String,
14    pub rulepack_id: String,
15    pub rulepack_version: String,
16    pub default_locales: Vec<LocaleTag>,
17    pub locale: Option<LocaleData>,
18    pub recognizers: Vec<RecognizerSpec>,
19}
20
21#[derive(Debug, Clone, PartialEq)]
22#[non_exhaustive]
23pub struct RecognizerSpec {
24    pub id: String,
25    pub class: PiiClass,
26    pub cooperates_with: Vec<String>,
27    pub enabled: bool,
28    pub locales: Vec<LocaleTag>,
29    pub matcher: RawMatch,
30    pub context: Option<ContextSpec>,
31    pub validator: Option<ValidatorSpec>,
32    pub normalizer: Option<NormalizerSpec>,
33    pub scoring: ScoringSpec,
34    pub token: TokenSpec,
35    pub source: Option<SourceSpec>,
36}
37
38#[derive(Debug, Clone, PartialEq, Deserialize)]
39#[serde(tag = "kind", deny_unknown_fields, rename_all = "snake_case")]
40#[non_exhaustive]
41pub enum RawMatch {
42    Regex {
43        #[serde(default)]
44        pattern: Option<String>,
45        #[serde(default)]
46        pattern_template: Option<String>,
47        #[serde(default)]
48        capture_groups: Option<Vec<u32>>,
49    },
50    Dictionary {
51        #[serde(default)]
52        terms: Vec<String>,
53        #[serde(default)]
54        terms_file: Option<String>,
55        #[serde(default)]
56        terms_from_context: Option<String>,
57        #[serde(default)]
58        case_sensitive: bool,
59    },
60    Ner {
61        model_ref: String,
62    },
63    AnchoredMatch {
64        cues_bucket: String,
65        boundary: String,
66        right_window_chars: u16,
67        name_shape: String,
68        cue_position: String,
69    },
70}
71
72#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
73#[serde(deny_unknown_fields, rename_all = "snake_case")]
74#[non_exhaustive]
75pub enum AnchoredBoundary {
76    Punctuation,
77    Whitespace,
78    LineEnd,
79}
80
81/// Closed enum for the shape of token sequences `anchored_match` extracts.
82///
83/// v0.6 ships a single `PersonName` variant. Future variants
84/// (e.g. `Organization`, `Address`, `LegalEntity`) must justify
85/// why they aren't a locale-bucket lookup before being added here.
86/// Adding variants without that justification regresses the
87/// principle drawer `session-2026-04-25-no-codified-domain-concerns`
88/// (see also `lower_email_header_pattern_template` in pre-v0.4.1 history).
89#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
90#[serde(deny_unknown_fields, rename_all = "snake_case")]
91#[non_exhaustive]
92pub enum NameShape {
93    PersonName,
94}
95
96#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
97#[serde(deny_unknown_fields, rename_all = "snake_case")]
98#[non_exhaustive]
99pub enum CuePosition {
100    Before,
101    After,
102}
103
/// Context settings of a recognizer.
///
/// Recognizer parsing currently rejects non-empty `hotwords` and any `window`
/// or `boost` value as not yet shipped, so only `exclusions` can carry data
/// in a pack that loads successfully.
#[derive(Clone, Debug, PartialEq)]
#[non_exhaustive]
pub struct ContextSpec {
    /// Hotword list (must be empty for now).
    pub hotwords: Vec<String>,
    /// Context window (must be absent for now).
    pub window: Option<u16>,
    /// Score boost (must be absent for now).
    pub boost: Option<f32>,
    /// Exclusion strings.
    pub exclusions: Vec<String>,
}
112
/// Validator selection of a recognizer.
#[derive(Clone, Debug, PartialEq)]
pub struct ValidatorSpec {
    /// Validator kind name, copied verbatim from the TOML.
    pub kind: String,
}
117
/// Normalizer selection of a recognizer.
#[derive(Clone, Debug, PartialEq)]
pub struct NormalizerSpec {
    /// Normalizer kind name, copied verbatim from the TOML.
    pub kind: String,
}
122
/// Scoring parameters of a recognizer.
#[derive(Clone, Debug, PartialEq)]
pub struct ScoringSpec {
    /// Base score; `0.70` when the pack does not set one.
    pub base: f32,
    /// Priority; `0` when the pack does not set one.
    pub priority: i32,
}
128
/// Token settings of a recognizer.
#[derive(Clone, Debug, PartialEq)]
#[non_exhaustive]
pub struct TokenSpec {
    /// Optional token family.
    pub family: Option<String>,
    /// Optional token format. Recognizer parsing currently rejects any
    /// non-empty value as not yet shipped.
    pub format: Option<String>,
}
135
/// Provenance information of a recognizer.
#[derive(Clone, Debug, PartialEq)]
pub struct SourceSpec {
    /// Origin label (the bundled sample uses `"ported"`).
    pub origin: String,
    /// Optional upstream the recognizer was taken from.
    pub from: Option<String>,
    /// Optional license identifier.
    pub license: Option<String>,
}
142
143#[derive(Debug, Clone, PartialEq, Eq, Default)]
144pub struct LocaleData {
145    pub buckets: HashMap<String, LocaleBucket>,
146}
147
/// A single named locale bucket.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LocaleBucket {
    /// Entries of the bucket, in pack order.
    pub names: Vec<String>,
}
152
/// Where the text of a rulepack comes from.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum RulepackSource {
    /// Rulepack text that is already in memory.
    Embedded(&'static str),
    /// File to read the rulepack text from.
    Path(PathBuf),
}
159
160#[derive(Debug, Error)]
161#[non_exhaustive]
162pub enum RulepackError {
163    #[error("failed to read rulepack: {0}")]
164    Io(#[source] std::io::Error),
165    #[error("failed to parse rulepack TOML: {0}")]
166    Toml(#[source] toml::de::Error),
167    #[error("unsupported rulepack schema_version {found}; supported {supported}")]
168    SchemaVersion { found: String, supported: String },
169    #[error("unknown pii class: {0}")]
170    UnknownClass(String),
171    #[error("unknown locale: {0}")]
172    UnknownLocale(String),
173    #[error("unsupported matcher kind: {0}")]
174    UnsupportedMatcher(String),
175    #[error("unsupported anchored_match field '{field}' value '{value}'")]
176    UnsupportedAnchoredMatch { field: String, value: String },
177    #[error("unsupported rulepack field '{field}' in B1; planned for {planned_version}")]
178    UnsupportedField {
179        field: String,
180        planned_version: &'static str,
181    },
182    #[error("unsupported validator kind: {kind}")]
183    UnsupportedValidator { kind: String },
184    #[error("unsupported normalizer kind: {kind}")]
185    UnsupportedNormalizer { kind: String },
186    #[error("unsupported rule spec variant: {variant}")]
187    UnsupportedRuleSpec { variant: String },
188    #[error("duplicate recognizer id '{id}' in rulepacks '{first_pack}' and '{second_pack}'")]
189    DuplicateId {
190        id: String,
191        first_pack: String,
192        second_pack: String,
193    },
194    #[error("regex recognizer '{id}' must define exactly one of pattern or pattern_template")]
195    RegexPatternChoice { id: String },
196    #[error("unknown pattern_template placeholder '{placeholder}' in recognizer '{id}'")]
197    UnknownPatternTemplatePlaceholder { id: String, placeholder: String },
198    #[error(
199        "context class_map override for dictionary '{dict}' changes {old_class:?} to {new_class:?}, but {uncovered_rule}"
200    )]
201    ClassMapOverrideClash {
202        dict: String,
203        old_class: PiiClass,
204        new_class: PiiClass,
205        uncovered_rule: String,
206    },
207    #[error(
208        "same-class recognizers '{recognizer_a}' and '{recognizer_b}' both emit {class:?} but neither declares cooperates_with"
209    )]
210    SameClassWithoutCooperation {
211        class: PiiClass,
212        recognizer_a: String,
213        recognizer_b: String,
214    },
215    #[error(
216        "recognizers {recognizer_ids:?} share class {class:?} with equivalent regex shape and overlapping locale projection {locale_overlap:?}"
217    )]
218    ConflictingLocaleProjection {
219        class: PiiClass,
220        recognizer_ids: Vec<String>,
221        locale_overlap: Vec<LocaleTag>,
222    },
223}
224
225impl Rulepack {
226    pub fn load(source: RulepackSource) -> Result<Rulepack, RulepackError> {
227        let raw = match source {
228            RulepackSource::Embedded(contents) => contents.to_string(),
229            RulepackSource::Path(path) => {
230                std::fs::read_to_string(path).map_err(RulepackError::Io)?
231            }
232        };
233        Self::parse(&raw)
234    }
235
236    pub fn parse(raw: &str) -> Result<Rulepack, RulepackError> {
237        let (raw, lint) = extract_recognizer_lint_config(raw);
238        let raw: RawRulepack = toml::from_str(&raw).map_err(RulepackError::Toml)?;
239        RawRulepackWithLint { raw, lint }.try_into()
240    }
241
242    pub fn activated_classes(&self) -> BTreeSet<PiiClass> {
243        self.recognizers
244            .iter()
245            .filter(|recognizer| recognizer.enabled)
246            .map(|recognizer| recognizer.class.clone())
247            .collect()
248    }
249}
250
251#[derive(Debug, Deserialize)]
252#[serde(deny_unknown_fields)]
253struct RawRulepack {
254    schema_version: String,
255    rulepack_id: String,
256    rulepack_version: String,
257    #[serde(default)]
258    default_locales: Vec<String>,
259    #[serde(default)]
260    locale: Option<RawLocaleData>,
261    #[serde(default)]
262    recognizers: Vec<RawRecognizerSpec>,
263}
264
/// Lint settings read from the `[recognizers.lint]` table.
#[derive(Debug, Default)]
struct RawRecognizerLintConfig {
    /// When `true`, locale projection collisions are errors instead of
    /// warnings. Defaults to `false`.
    strict_locale_overlap: bool,
}
269
270#[derive(Debug)]
271struct RawRulepackWithLint {
272    raw: RawRulepack,
273    lint: RawRecognizerLintConfig,
274}
275
276#[derive(Debug, Deserialize)]
277struct RawLocaleData {
278    #[serde(flatten)]
279    buckets: HashMap<String, RawLocaleBucket>,
280}
281
282#[derive(Debug, Deserialize)]
283#[serde(deny_unknown_fields)]
284struct RawLocaleBucket {
285    names: Vec<String>,
286}
287
288#[derive(Debug, Deserialize)]
289#[serde(deny_unknown_fields)]
290struct RawRecognizerSpec {
291    id: String,
292    class: String,
293    #[serde(default)]
294    cooperates_with: Vec<String>,
295    #[serde(default = "default_true")]
296    enabled: bool,
297    #[serde(default)]
298    locales: Vec<String>,
299    #[serde(rename = "match")]
300    matcher: RawMatch,
301    #[serde(default)]
302    context: Option<RawContextSpec>,
303    #[serde(default)]
304    validator: Option<RawValidatorSpec>,
305    #[serde(default)]
306    normalizer: Option<RawNormalizerSpec>,
307    #[serde(default)]
308    scoring: Option<RawScoringSpec>,
309    #[serde(default)]
310    token: RawTokenSpec,
311    #[serde(default)]
312    source: Option<RawSourceSpec>,
313}
314
315#[derive(Debug, Deserialize)]
316#[serde(deny_unknown_fields)]
317struct RawContextSpec {
318    #[serde(default)]
319    hotwords: Vec<String>,
320    #[serde(default)]
321    window: Option<u16>,
322    #[serde(default)]
323    boost: Option<f32>,
324    #[serde(default)]
325    exclusions: Vec<String>,
326}
327
328#[derive(Debug, Deserialize)]
329#[serde(deny_unknown_fields)]
330struct RawValidatorSpec {
331    kind: String,
332}
333
334#[derive(Debug, Deserialize)]
335#[serde(deny_unknown_fields)]
336struct RawNormalizerSpec {
337    kind: String,
338}
339
340#[derive(Debug, Deserialize)]
341#[serde(deny_unknown_fields)]
342struct RawScoringSpec {
343    #[serde(default = "default_base_score")]
344    base: f32,
345    #[serde(default)]
346    priority: i32,
347}
348
349#[derive(Debug, Default, Deserialize)]
350#[serde(deny_unknown_fields)]
351struct RawTokenSpec {
352    #[serde(default)]
353    family: Option<String>,
354    #[serde(default)]
355    format: Option<String>,
356}
357
358#[derive(Debug, Deserialize)]
359#[serde(deny_unknown_fields)]
360struct RawSourceSpec {
361    origin: String,
362    #[serde(default)]
363    from: Option<String>,
364    #[serde(default)]
365    license: Option<String>,
366}
367
368impl TryFrom<RawRulepack> for Rulepack {
369    type Error = RulepackError;
370
371    fn try_from(raw: RawRulepack) -> Result<Self, Self::Error> {
372        RawRulepackWithLint {
373            raw,
374            lint: RawRecognizerLintConfig::default(),
375        }
376        .try_into()
377    }
378}
379
380impl TryFrom<RawRulepackWithLint> for Rulepack {
381    type Error = RulepackError;
382
383    fn try_from(raw_with_lint: RawRulepackWithLint) -> Result<Self, Self::Error> {
384        let raw = raw_with_lint.raw;
385        if !raw.schema_version.starts_with(SUPPORTED_SCHEMA_MAJOR_MINOR) {
386            return Err(RulepackError::SchemaVersion {
387                found: raw.schema_version,
388                supported: "~0.1.x".to_string(),
389            });
390        }
391
392        let default_locales = parse_locales(raw.default_locales)?;
393        let recognizers = raw
394            .recognizers
395            .into_iter()
396            .map(|recognizer| parse_recognizer(recognizer, &default_locales))
397            .collect::<Result<Vec<_>, _>>()?;
398        validate_rulepack_recognizers(&recognizers, &default_locales, &raw_with_lint.lint)?;
399        let locale = raw.locale.map(LocaleData::from);
400        reject_anchored_match_ellipsis_cues(&recognizers, locale.as_ref())?;
401
402        Ok(Self {
403            schema_version: raw.schema_version,
404            rulepack_id: raw.rulepack_id,
405            rulepack_version: raw.rulepack_version,
406            default_locales,
407            locale,
408            recognizers,
409        })
410    }
411}
412
413fn extract_recognizer_lint_config(raw: &str) -> (String, RawRecognizerLintConfig) {
414    let mut sanitized = String::with_capacity(raw.len());
415    let mut lint = RawRecognizerLintConfig::default();
416    let mut in_lint = false;
417
418    for line in raw.lines() {
419        let trimmed = line.trim();
420        if trimmed == "[recognizers.lint]" {
421            in_lint = true;
422            continue;
423        }
424        if in_lint && trimmed.starts_with('[') {
425            in_lint = false;
426        }
427        if in_lint {
428            if let Some((key, value)) = trimmed.split_once('=') {
429                if key.trim() == "strict_locale_overlap" {
430                    lint.strict_locale_overlap = value.trim().eq_ignore_ascii_case("true");
431                }
432            }
433            continue;
434        }
435        sanitized.push_str(line);
436        sanitized.push('\n');
437    }
438
439    (sanitized, lint)
440}
441
442impl From<RawLocaleData> for LocaleData {
443    fn from(raw: RawLocaleData) -> Self {
444        Self {
445            buckets: raw
446                .buckets
447                .into_iter()
448                .map(|(name, bucket)| {
449                    (
450                        name,
451                        LocaleBucket {
452                            names: bucket.names,
453                        },
454                    )
455                })
456                .collect(),
457        }
458    }
459}
460
461fn parse_recognizer(
462    raw: RawRecognizerSpec,
463    default_locales: &[LocaleTag],
464) -> Result<RecognizerSpec, RulepackError> {
465    reject_unshipped_fields(&raw)?;
466    validate_matcher(&raw)?;
467    let locales = if raw.locales.is_empty() {
468        default_locales.to_vec()
469    } else {
470        parse_locales(raw.locales)?
471    };
472
473    Ok(RecognizerSpec {
474        id: raw.id,
475        class: parse_class(&raw.class)?,
476        cooperates_with: raw.cooperates_with,
477        enabled: raw.enabled,
478        locales,
479        matcher: raw.matcher,
480        context: raw.context.map(|context| ContextSpec {
481            hotwords: context.hotwords,
482            window: context.window,
483            boost: context.boost,
484            exclusions: context.exclusions,
485        }),
486        validator: raw.validator.map(|validator| ValidatorSpec {
487            kind: validator.kind,
488        }),
489        normalizer: raw.normalizer.map(|normalizer| NormalizerSpec {
490            kind: normalizer.kind,
491        }),
492        scoring: raw.scoring.map_or_else(
493            || ScoringSpec {
494                base: default_base_score(),
495                priority: 0,
496            },
497            |scoring| ScoringSpec {
498                base: scoring.base,
499                priority: scoring.priority,
500            },
501        ),
502        token: TokenSpec {
503            family: raw.token.family,
504            format: raw.token.format,
505        },
506        source: raw.source.map(|source| SourceSpec {
507            origin: source.origin,
508            from: source.from,
509            license: source.license,
510        }),
511    })
512}
513
514fn validate_matcher(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
515    match &raw.matcher {
516        RawMatch::Regex {
517            pattern,
518            pattern_template,
519            ..
520        } => {
521            if pattern.is_some() == pattern_template.is_some() {
522                return Err(RulepackError::RegexPatternChoice { id: raw.id.clone() });
523            }
524        }
525        RawMatch::AnchoredMatch {
526            cues_bucket,
527            boundary,
528            right_window_chars,
529            name_shape,
530            cue_position,
531            ..
532        } => {
533            if cues_bucket.trim().is_empty() {
534                return Err(RulepackError::UnsupportedAnchoredMatch {
535                    field: "cues_bucket".to_string(),
536                    value: cues_bucket.clone(),
537                });
538            }
539            if !(1..=512).contains(right_window_chars) {
540                return Err(RulepackError::UnsupportedAnchoredMatch {
541                    field: "right_window_chars".to_string(),
542                    value: right_window_chars.to_string(),
543                });
544            }
545            if !matches!(boundary.as_str(), "punctuation" | "whitespace" | "line_end") {
546                return Err(RulepackError::UnsupportedAnchoredMatch {
547                    field: "boundary".to_string(),
548                    value: boundary.clone(),
549                });
550            }
551            if name_shape != "person_name" {
552                return Err(RulepackError::UnsupportedAnchoredMatch {
553                    field: "name_shape".to_string(),
554                    value: name_shape.clone(),
555                });
556            }
557            if !matches!(cue_position.as_str(), "before" | "after") {
558                return Err(RulepackError::UnsupportedAnchoredMatch {
559                    field: "cue_position".to_string(),
560                    value: cue_position.clone(),
561                });
562            }
563        }
564        RawMatch::Dictionary { .. } | RawMatch::Ner { .. } => {}
565    }
566    Ok(())
567}
568
569fn reject_anchored_match_ellipsis_cues(
570    recognizers: &[RecognizerSpec],
571    locale: Option<&LocaleData>,
572) -> Result<(), RulepackError> {
573    let Some(locale) = locale else {
574        return Ok(());
575    };
576    for recognizer in recognizers {
577        let RawMatch::AnchoredMatch { cues_bucket, .. } = &recognizer.matcher else {
578            continue;
579        };
580        let Some(bucket) = locale.buckets.get(cues_bucket) else {
581            continue;
582        };
583        if let Some(cue) = bucket.names.iter().find(|cue| cue.contains("...")) {
584            return Err(RulepackError::UnsupportedAnchoredMatch {
585                field: format!("locale.{cues_bucket}.names"),
586                value: cue.clone(),
587            });
588        }
589    }
590    Ok(())
591}
592
593fn reject_unshipped_fields(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
594    const PLANNED_VERSION: &str = "v0.4.1";
595
596    if raw
597        .token
598        .format
599        .as_deref()
600        .is_some_and(|value| !value.is_empty())
601    {
602        return Err(RulepackError::UnsupportedField {
603            field: "token.format".to_string(),
604            planned_version: PLANNED_VERSION,
605        });
606    }
607    if let Some(context) = &raw.context {
608        if !context.hotwords.is_empty() {
609            return Err(RulepackError::UnsupportedField {
610                field: "context.hotwords".to_string(),
611                planned_version: PLANNED_VERSION,
612            });
613        }
614        if context.boost.is_some() {
615            return Err(RulepackError::UnsupportedField {
616                field: "context.boost".to_string(),
617                planned_version: PLANNED_VERSION,
618            });
619        }
620        if context.window.is_some() {
621            return Err(RulepackError::UnsupportedField {
622                field: "context.window".to_string(),
623                planned_version: PLANNED_VERSION,
624            });
625        }
626    }
627    Ok(())
628}
629
630pub fn recognizer_composition_validator(
631    recognizers: &[RecognizerSpec],
632) -> Result<(), RulepackError> {
633    for (index, first) in recognizers.iter().enumerate() {
634        for second in recognizers.iter().skip(index + 1) {
635            if first.class != second.class {
636                continue;
637            }
638            if first.cooperates_with.iter().any(|id| id == &second.id)
639                || second.cooperates_with.iter().any(|id| id == &first.id)
640            {
641                continue;
642            }
643            return Err(RulepackError::SameClassWithoutCooperation {
644                class: first.class.clone(),
645                recognizer_a: first.id.clone(),
646                recognizer_b: second.id.clone(),
647            });
648        }
649    }
650    Ok(())
651}
652
653fn validate_rulepack_recognizers(
654    recognizers: &[RecognizerSpec],
655    active_locales: &[LocaleTag],
656    lint: &RawRecognizerLintConfig,
657) -> Result<(), RulepackError> {
658    recognizer_composition_validator(recognizers)?;
659    lint_locale_projection_collisions(recognizers, active_locales, lint)?;
660    lint_global_naked_patterns(recognizers);
661    Ok(())
662}
663
664fn lint_locale_projection_collisions(
665    recognizers: &[RecognizerSpec],
666    active_locales: &[LocaleTag],
667    lint: &RawRecognizerLintConfig,
668) -> Result<(), RulepackError> {
669    for (index, first) in recognizers.iter().enumerate() {
670        if !first.enabled {
671            continue;
672        }
673        let Some(first_shape) = regex_structural_shape(&first.matcher) else {
674            continue;
675        };
676        if !is_truly_naked_numeric(&first.matcher) {
677            continue;
678        }
679        let first_projection = locale_projection(&first.locales, active_locales);
680        if first_projection.is_empty() {
681            continue;
682        }
683
684        for second in recognizers.iter().skip(index + 1) {
685            if !second.enabled || first.class != second.class {
686                continue;
687            }
688            if !is_truly_naked_numeric(&second.matcher) {
689                continue;
690            }
691            if regex_structural_shape(&second.matcher).as_ref() != Some(&first_shape) {
692                continue;
693            }
694            let second_projection = locale_projection(&second.locales, active_locales);
695            if second_projection.is_empty() {
696                continue;
697            }
698
699            let recognizer_ids = vec![first.id.clone(), second.id.clone()];
700            let locale_overlap = merged_locale_projection(&first_projection, &second_projection);
701            if lint.strict_locale_overlap {
702                return Err(RulepackError::ConflictingLocaleProjection {
703                    class: first.class.clone(),
704                    recognizer_ids,
705                    locale_overlap,
706                });
707            }
708            tracing::warn!(
709                class = %first.class.class_name(),
710                recognizer_ids = ?recognizer_ids,
711                locale_overlap = ?locale_overlap,
712                "recognizers share class with naked-shape regex and non-disjoint locale projection"
713            );
714        }
715    }
716    Ok(())
717}
718
719fn lint_global_naked_patterns(recognizers: &[RecognizerSpec]) {
720    for recognizer in recognizers {
721        if !recognizer.enabled || recognizer.locales != [LocaleTag::Global] {
722            continue;
723        }
724        let Some(shape) = regex_structural_shape(&recognizer.matcher) else {
725            continue;
726        };
727        let RawMatch::Regex {
728            pattern: Some(pattern),
729            ..
730        } = &recognizer.matcher
731        else {
732            continue;
733        };
734        if shape.minimum_match_len < 6 && !has_regex_separator(pattern) {
735            tracing::warn!(
736                recognizer_id = %recognizer.id,
737                class = %recognizer.class.class_name(),
738                minimum_match_len = shape.minimum_match_len,
739                "global recognizer uses short naked regex shape"
740            );
741        }
742    }
743}
744
745#[derive(Debug, Clone, PartialEq, Eq)]
746struct RegexStructuralShape {
747    minimum_match_len: usize,
748    character_class: RegexCharacterClass,
749}
750
/// Character classes a [`RegexStructuralShape`] can describe.
#[derive(Clone, Debug, Eq, PartialEq)]
enum RegexCharacterClass {
    /// Decimal digits.
    Digit,
}
755
756fn regex_structural_shape(matcher: &RawMatch) -> Option<RegexStructuralShape> {
757    let RawMatch::Regex {
758        pattern: Some(pattern),
759        pattern_template: None,
760        ..
761    } = matcher
762    else {
763        return None;
764    };
765    if has_unescaped_line_anchor(pattern) {
766        return None;
767    }
768    digit_quantifier_minimum(pattern).map(|minimum_match_len| RegexStructuralShape {
769        minimum_match_len,
770        character_class: RegexCharacterClass::Digit,
771    })
772}
773
774fn is_truly_naked_numeric(matcher: &RawMatch) -> bool {
775    let RawMatch::Regex {
776        pattern: Some(pattern),
777        ..
778    } = matcher
779    else {
780        return false;
781    };
782
783    let mut chars = pattern.chars();
784    while let Some(ch) = chars.next() {
785        if ch == '\\' {
786            chars.next();
787            continue;
788        }
789        if ch.is_ascii_alphabetic() {
790            return false;
791        }
792    }
793    true
794}
795
/// Reports whether `pattern` contains a `^` or `$` that is neither escaped
/// with a backslash nor located inside a `[...]` character class.
///
/// Class tracking is a simple toggle: `[` enters a class and `]` leaves it.
fn has_unescaped_line_anchor(pattern: &str) -> bool {
    let mut in_class = false;
    let mut chars = pattern.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => {
                // Skip the escaped character, whatever it is.
                chars.next();
            }
            '[' => in_class = true,
            ']' => in_class = false,
            '^' | '$' => {
                if !in_class {
                    return true;
                }
            }
            _ => {}
        }
    }
    false
}
814
815fn digit_quantifier_minimum(pattern: &str) -> Option<usize> {
816    find_digit_quantifier(pattern, r"\d{")
817        .or_else(|| find_digit_quantifier(pattern, "[0-9]{"))
818        .or_else(|| find_digit_quantifier(pattern, "[[:digit:]]{"))
819}
820
/// Parses the number that directly follows the first occurrence of `needle`
/// in `pattern`.
///
/// Returns `None` when `needle` does not occur, when no ASCII digit follows
/// it, or when the digits do not fit into a `usize`. Only the first
/// occurrence of `needle` is inspected.
fn find_digit_quantifier(pattern: &str, needle: &str) -> Option<usize> {
    let (_, after) = pattern.split_once(needle)?;
    let digits_end = after
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(after.len());
    // An empty digit run fails to parse and therefore yields `None` as well.
    after[..digits_end].parse().ok()
}
833
834fn locale_projection(locales: &[LocaleTag], active_locales: &[LocaleTag]) -> Vec<LocaleTag> {
835    let mut projection = Vec::new();
836    for locale in locales {
837        if *locale == LocaleTag::Global {
838            projection.push(LocaleTag::Global);
839        } else if active_locales.iter().any(|active| active == locale) {
840            projection.push(locale.clone());
841        }
842    }
843    projection
844}
845
846fn merged_locale_projection(left: &[LocaleTag], right: &[LocaleTag]) -> Vec<LocaleTag> {
847    let mut merged = Vec::new();
848    for locale in left.iter().chain(right) {
849        if !merged.iter().any(|existing| existing == locale) {
850            merged.push(locale.clone());
851        }
852    }
853    merged
854}
855
/// Reports whether `pattern` contains anything this lint treats as a separator:
/// one of the characters `-`, `/`, `.`, `+`, or one of the whitespace classes
/// `\s` and `[:space:]`.
///
/// This is a plain substring test on the pattern text.
///
/// NOTE(review): regex syntax is not interpreted, so the range hyphen in
/// `[0-9]`, a `+` quantifier or a `.` wildcard also count as separators.
/// Confirm that this is acceptable for the lint.
fn has_regex_separator(pattern: &str) -> bool {
    const WHITESPACE_CLASSES: [&str; 2] = ["\\s", "[:space:]"];

    let has_separator_char = pattern
        .chars()
        .any(|ch| matches!(ch, '-' | '/' | '.' | '+'));

    has_separator_char
        || WHITESPACE_CLASSES
            .iter()
            .any(|class| pattern.contains(*class))
}
864
865pub fn parse_class(input: &str) -> Result<PiiClass, RulepackError> {
866    let trimmed = input.trim();
867    let lower = trimmed.to_ascii_lowercase();
868    match lower.as_str() {
869        "email" => Ok(PiiClass::Email),
870        "name" => Ok(PiiClass::Name),
871        "location" => Ok(PiiClass::Location),
872        "organization" => Ok(PiiClass::Organization),
873        custom if custom.starts_with("custom:") => {
874            let name = trimmed
875                .split_once(':')
876                .map(|(_, name)| name)
877                .unwrap_or_default();
878            if name.trim().is_empty() {
879                return Err(RulepackError::UnknownClass(input.to_string()));
880            }
881            Ok(PiiClass::custom(name))
882        }
883        _ => Err(RulepackError::UnknownClass(input.to_string())),
884    }
885}
886
887fn parse_locales(locales: Vec<String>) -> Result<Vec<LocaleTag>, RulepackError> {
888    locales
889        .into_iter()
890        .map(|locale| {
891            LocaleTag::parse(&locale).map_err(|_| RulepackError::UnknownLocale(locale.clone()))
892        })
893        .collect()
894}
895
/// Serde default for `enabled`: recognizers are active unless a pack says otherwise.
fn default_true() -> bool {
    true
}
899
/// Base score used when a recognizer does not set `scoring.base`.
fn default_base_score() -> f32 {
    const DEFAULT_BASE_SCORE: f32 = 0.70;
    DEFAULT_BASE_SCORE
}
903
904#[cfg(test)]
905mod tests {
906    use super::*;
907
908    const CORE: &str = r#"
909schema_version = "0.1.0"
910rulepack_id = "gaze-core"
911rulepack_version = "0.4.0"
912default_locales = ["global"]
913
914[locale.email_headers]
915names = ["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
916
917[[recognizers]]
918id = "email.global"
919class = "Email"
920enabled = true
921locales = ["global"]
922
923[recognizers.match]
924kind = "regex"
925pattern = '''(?i)\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b'''
926
927[recognizers.context]
928exclusions = ["example.com"]
929
930[recognizers.validator]
931kind = "email_rfc"
932
933[recognizers.normalizer]
934kind = "email_canonical"
935
936[recognizers.scoring]
937base = 0.70
938priority = 90
939
940[recognizers.token]
941
942[recognizers.source]
943origin = "ported"
944from = "presidio"
945license = "Apache-2.0"
946"#;
947
948    #[test]
949    fn parses_core_rulepack_end_to_end() {
950        let rulepack = Rulepack::parse(CORE).expect("core rulepack");
951
952        assert_eq!(rulepack.rulepack_id, "gaze-core");
953        assert_eq!(rulepack.default_locales, vec![LocaleTag::Global]);
954        let header_names = &rulepack
955            .locale
956            .as_ref()
957            .and_then(|locale| locale.buckets.get("email_headers"))
958            .expect("email headers")
959            .names;
960        assert_eq!(
961            header_names,
962            &vec!["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
963        );
964        assert_eq!(rulepack.recognizers.len(), 1);
965        let recognizer = &rulepack.recognizers[0];
966        assert_eq!(recognizer.id, "email.global");
967        assert_eq!(recognizer.class, PiiClass::Email);
968        assert_eq!(recognizer.scoring.priority, 90);
969        assert!(matches!(recognizer.matcher, RawMatch::Regex { .. }));
970    }
971
972    #[cfg(feature = "bundled-recognizers")]
973    #[test]
974    fn embedded_core_activated_classes_match_rulepack_classes() {
975        let rulepack = Rulepack::load(RulepackSource::Embedded(
976            gaze_recognizers::embedded("core").expect("core rulepack"),
977        ))
978        .expect("embedded core rulepack");
979
980        assert_eq!(
981            rulepack.activated_classes(),
982            BTreeSet::from([PiiClass::Email, PiiClass::Name])
983        );
984    }
985
986    #[cfg(feature = "bundled-recognizers")]
987    #[test]
988    fn embedded_core_loads_full_name_recognizer_cooperation_matrix() {
989        let rulepack = Rulepack::load(RulepackSource::Embedded(
990            gaze_recognizers::embedded("core").expect("core rulepack"),
991        ))
992        .expect("embedded core rulepack");
993        let name_recognizers = rulepack
994            .recognizers
995            .iter()
996            .filter(|recognizer| recognizer.class == PiiClass::Name)
997            .collect::<Vec<_>>();
998
999        assert_eq!(name_recognizers.len(), 5);
1000        for recognizer in &name_recognizers {
1001            for peer in &name_recognizers {
1002                if recognizer.id == peer.id {
1003                    continue;
1004                }
1005                assert!(
1006                    recognizer.cooperates_with.contains(&peer.id),
1007                    "{} missing cooperates_with {}",
1008                    recognizer.id,
1009                    peer.id
1010                );
1011            }
1012        }
1013    }
1014
1015    #[cfg(feature = "bundled-recognizers")]
1016    #[test]
1017    fn embedded_core_extended_activated_classes_match_rulepack_classes() {
1018        let rulepack = Rulepack::load(RulepackSource::Embedded(
1019            gaze_recognizers::embedded("core-extended").expect("core-extended rulepack"),
1020        ))
1021        .expect("embedded core-extended rulepack");
1022
1023        assert_eq!(
1024            rulepack.activated_classes(),
1025            BTreeSet::from([
1026                PiiClass::custom("phone"),
1027                PiiClass::custom("iban"),
1028                PiiClass::custom("credit_card"),
1029                PiiClass::custom("ip_address"),
1030                PiiClass::custom("eth_address"),
1031                PiiClass::custom("postal_code"),
1032            ])
1033        );
1034    }
1035
1036    #[cfg(feature = "bundled-recognizers")]
1037    #[test]
1038    fn activated_classes_include_new_rulepack_recognizer_class() {
1039        let raw = format!(
1040            r#"{}
1041
1042[[recognizers]]
1043id = "test.only"
1044class = "custom:test_only"
1045enabled = true
1046locales = ["global"]
1047
1048[recognizers.match]
1049kind = "regex"
1050pattern = "TEST_ONLY"
1051
1052[recognizers.scoring]
1053base = 0.70
1054priority = 1
1055"#,
1056            gaze_recognizers::embedded("core-extended").expect("core-extended rulepack")
1057        );
1058        let rulepack = Rulepack::parse(&raw).expect("core-extended with synthetic recognizer");
1059
1060        assert!(
1061            rulepack
1062                .activated_classes()
1063                .contains(&PiiClass::custom("test_only")),
1064            "new recognizer class must be derived from rulepack data"
1065        );
1066    }
1067
1068    #[test]
1069    fn rulepack_accepts_token_family() {
1070        let rulepack = Rulepack::parse(&unsupported_field_rulepack(
1071            "[recognizers.token]\nfamily = \"email.formatpreserve\"\n",
1072        ))
1073        .expect("token family is active in v0.4.1");
1074
1075        assert_eq!(
1076            rulepack.recognizers[0].token.family.as_deref(),
1077            Some("email.formatpreserve")
1078        );
1079    }
1080
1081    #[test]
1082    fn rulepack_rejects_unsupported_token_format() {
1083        let err = Rulepack::parse(&unsupported_field_rulepack(
1084            "[recognizers.token]\nformat = \"Customer_{n}\"\n",
1085        ))
1086        .expect_err("token format is reserved for v0.4.1");
1087
1088        assert_unsupported_field(err, "token.format");
1089    }
1090
1091    #[test]
1092    fn rulepack_rejects_unsupported_context_hotwords() {
1093        let err = Rulepack::parse(&unsupported_field_rulepack(
1094            "[recognizers.context]\nhotwords = [\"foo\"]\n",
1095        ))
1096        .expect_err("context hotwords are reserved for v0.4.1");
1097
1098        assert_unsupported_field(err, "context.hotwords");
1099    }
1100
1101    #[test]
1102    fn rulepack_rejects_unsupported_context_boost() {
1103        let err = Rulepack::parse(&unsupported_field_rulepack(
1104            "[recognizers.context]\nboost = 0.10\n",
1105        ))
1106        .expect_err("context boost is reserved for v0.4.1");
1107
1108        assert_unsupported_field(err, "context.boost");
1109    }
1110
1111    #[test]
1112    fn rulepack_rejects_unsupported_context_window() {
1113        let err = Rulepack::parse(&unsupported_field_rulepack(
1114            "[recognizers.context]\nwindow = 12\n",
1115        ))
1116        .expect_err("context window is reserved for v0.4.1");
1117
1118        assert_unsupported_field(err, "context.window");
1119    }
1120
1121    #[test]
1122    fn rulepack_accepts_default_token_fields() {
1123        let rulepack = Rulepack::parse(CORE).expect("reserved token/context fields are unset");
1124        let recognizer = &rulepack.recognizers[0];
1125
1126        assert_eq!(recognizer.token.family, None);
1127        assert_eq!(recognizer.token.format, None);
1128        assert!(recognizer.context.as_ref().unwrap().hotwords.is_empty());
1129        assert_eq!(recognizer.context.as_ref().unwrap().boost, None);
1130        assert_eq!(recognizer.context.as_ref().unwrap().window, None);
1131    }
1132
1133    #[test]
1134    fn pattern_template_with_pattern_both_present_fails_closed() {
1135        let err = Rulepack::parse(&unsupported_field_rulepack(
1136            "pattern_template = \"{locale_email_headers}: (.+)\"\n",
1137        ))
1138        .expect_err("pattern and pattern_template are mutually exclusive");
1139
1140        assert!(matches!(
1141            err,
1142            RulepackError::RegexPatternChoice { id } if id == "bad.email"
1143        ));
1144    }
1145
1146    #[test]
1147    fn regex_pattern_or_template_is_required() {
1148        let raw = r#"
1149schema_version = "0.1.0"
1150rulepack_id = "bad"
1151rulepack_version = "0.4.0"
1152default_locales = ["global"]
1153
1154[[recognizers]]
1155id = "bad.email"
1156class = "Email"
1157enabled = true
1158
1159[recognizers.match]
1160kind = "regex"
1161"#;
1162        let err = Rulepack::parse(raw).expect_err("regex pattern is required");
1163
1164        assert!(matches!(
1165            err,
1166            RulepackError::RegexPatternChoice { id } if id == "bad.email"
1167        ));
1168    }
1169
1170    #[test]
1171    fn anchored_match_accepts_valid_schema() {
1172        let rulepack = Rulepack::parse(&anchored_match_rulepack("")).expect("anchored_match");
1173        assert!(matches!(
1174            rulepack.recognizers[0].matcher,
1175            RawMatch::AnchoredMatch { .. }
1176        ));
1177    }
1178
1179    #[test]
1180    fn anchored_match_rejects_unknown_boundary() {
1181        let err = Rulepack::parse(&anchored_match_rulepack("boundary = \"paragraph\"\n"))
1182            .expect_err("unknown boundary fails closed");
1183
1184        assert_unsupported_anchored_match(err, "boundary", "paragraph");
1185    }
1186
1187    #[test]
1188    fn anchored_match_rejects_unknown_name_shape() {
1189        let err = Rulepack::parse(&anchored_match_rulepack("name_shape = \"organization\"\n"))
1190            .expect_err("unknown name_shape fails closed");
1191
1192        assert_unsupported_anchored_match(err, "name_shape", "organization");
1193    }
1194
1195    #[test]
1196    fn anchored_match_rejects_unknown_cue_position() {
1197        let err = Rulepack::parse(&anchored_match_rulepack("cue_position = \"around\"\n"))
1198            .expect_err("unknown cue_position fails closed");
1199
1200        assert_unsupported_anchored_match(err, "cue_position", "around");
1201    }
1202
1203    #[test]
1204    fn anchored_match_rejects_missing_cues_bucket() {
1205        let err = Rulepack::parse(&anchored_match_rulepack("cues_bucket = \"\"\n"))
1206            .expect_err("missing cues_bucket fails closed");
1207
1208        assert_unsupported_anchored_match(err, "cues_bucket", "");
1209    }
1210
1211    #[test]
1212    fn anchored_match_rejects_ellipsis_in_cue_values() {
1213        let err = Rulepack::parse(
1214            r#"
1215schema_version = "0.1.0"
1216rulepack_id = "anchored"
1217rulepack_version = "0.6.0"
1218default_locales = ["global"]
1219
1220[locale.forward_markers]
1221names = ["Forwarded ... message"]
1222
1223[[recognizers]]
1224id = "name.forward_marker"
1225class = "Name"
1226enabled = true
1227
1228[recognizers.match]
1229kind = "anchored_match"
1230cues_bucket = "forward_markers"
1231boundary = "punctuation"
1232right_window_chars = 64
1233name_shape = "person_name"
1234cue_position = "before"
1235"#,
1236        )
1237        .expect_err("ellipsis cue fails closed");
1238
1239        assert_unsupported_anchored_match(
1240            err,
1241            "locale.forward_markers.names",
1242            "Forwarded ... message",
1243        );
1244    }
1245
1246    #[test]
1247    fn anchored_match_rejects_invalid_window_bounds() {
1248        for (value, expected) in [("0", "0"), ("513", "513")] {
1249            let err = Rulepack::parse(&anchored_match_rulepack(&format!(
1250                "right_window_chars = {value}\n"
1251            )))
1252            .expect_err("invalid right_window_chars fails closed");
1253
1254            assert_unsupported_anchored_match(err, "right_window_chars", expected);
1255        }
1256    }
1257
1258    #[test]
1259    fn rulepack_load_fails_when_two_name_recognizers_omit_cooperates_with() {
1260        let err = Rulepack::parse(
1261            r#"
1262schema_version = "0.1.0"
1263rulepack_id = "bad-composition"
1264rulepack_version = "0.4.1"
1265default_locales = ["global"]
1266
1267[[recognizers]]
1268id = "email.header.name"
1269class = "Name"
1270enabled = true
1271
1272[recognizers.match]
1273kind = "regex"
1274pattern = "From: ([A-Z][a-z]+)"
1275
1276[[recognizers]]
1277id = "salutation.name"
1278class = "Name"
1279enabled = true
1280
1281[recognizers.match]
1282kind = "regex"
1283pattern = "Dear ([A-Z][a-z]+)"
1284"#,
1285        )
1286        .expect_err("same-class recognizers must explicitly cooperate");
1287
1288        assert!(matches!(
1289            err,
1290            RulepackError::SameClassWithoutCooperation {
1291                class: PiiClass::Name,
1292                recognizer_a,
1293                recognizer_b,
1294            } if recognizer_a == "email.header.name" && recognizer_b == "salutation.name"
1295        ));
1296    }
1297
1298    #[test]
1299    fn rulepack_load_accepts_same_class_pair_with_cooperates_with() {
1300        let rulepack = Rulepack::parse(
1301            r#"
1302schema_version = "0.1.0"
1303rulepack_id = "cooperating-composition"
1304rulepack_version = "0.4.1"
1305default_locales = ["global"]
1306
1307[[recognizers]]
1308id = "email.header.name"
1309class = "Name"
1310cooperates_with = ["salutation.name"]
1311enabled = true
1312
1313[recognizers.match]
1314kind = "regex"
1315pattern = "From: ([A-Z][a-z]+)"
1316
1317[[recognizers]]
1318id = "salutation.name"
1319class = "Name"
1320enabled = true
1321
1322[recognizers.match]
1323kind = "regex"
1324pattern = "Dear ([A-Z][a-z]+)"
1325"#,
1326        )
1327        .expect("cooperates_with unblocks same-class recognizers");
1328
1329        assert_eq!(rulepack.recognizers.len(), 2);
1330        assert_eq!(
1331            rulepack.recognizers[0].cooperates_with,
1332            vec!["salutation.name"]
1333        );
1334    }
1335
1336    #[test]
1337    fn rejects_unknown_fields_with_parent_table_context() {
1338        let err = Rulepack::parse(
1339            r#"
1340schema_version = "0.1.0"
1341rulepack_id = "bad"
1342rulepack_version = "0.4.0"
1343default_locales = ["global"]
1344bogus = true
1345"#,
1346        )
1347        .expect_err("unknown field must fail");
1348
1349        assert!(matches!(err, RulepackError::Toml(_)));
1350        assert!(err.to_string().contains("bogus"));
1351    }
1352
1353    #[test]
1354    fn rejects_unsupported_schema_version() {
1355        let err = Rulepack::parse(
1356            r#"
1357schema_version = "0.2.0"
1358rulepack_id = "bad"
1359rulepack_version = "0.4.0"
1360"#,
1361        )
1362        .expect_err("unsupported schema");
1363
1364        assert!(matches!(err, RulepackError::SchemaVersion { .. }));
1365    }
1366
1367    #[test]
1368    fn class_spelling_accepts_pascal_case_and_custom_names() {
1369        assert_eq!(parse_class("Email").unwrap(), PiiClass::Email);
1370        assert_eq!(
1371            parse_class("custom:Class_Alpha").unwrap(),
1372            PiiClass::Custom("class_alpha".to_string())
1373        );
1374    }
1375
1376    fn unsupported_field_rulepack(extra: &str) -> String {
1377        format!(
1378            r#"
1379schema_version = "0.1.0"
1380rulepack_id = "bad"
1381rulepack_version = "0.4.0"
1382default_locales = ["global"]
1383
1384[[recognizers]]
1385id = "bad.email"
1386class = "Email"
1387enabled = true
1388
1389[recognizers.match]
1390kind = "regex"
1391pattern = ".+"
1392
1393{extra}
1394"#
1395        )
1396    }
1397
1398    fn anchored_match_rulepack(override_line: &str) -> String {
1399        let cues_bucket = if override_line.starts_with("cues_bucket") {
1400            override_line.to_string()
1401        } else {
1402            "cues_bucket = \"forward_markers\"\n".to_string()
1403        };
1404        let boundary = if override_line.starts_with("boundary") {
1405            override_line.to_string()
1406        } else {
1407            "boundary = \"punctuation\"\n".to_string()
1408        };
1409        let right_window_chars = if override_line.starts_with("right_window_chars") {
1410            override_line.to_string()
1411        } else {
1412            "right_window_chars = 64\n".to_string()
1413        };
1414        let name_shape = if override_line.starts_with("name_shape") {
1415            override_line.to_string()
1416        } else {
1417            "name_shape = \"person_name\"\n".to_string()
1418        };
1419        let cue_position = if override_line.starts_with("cue_position") {
1420            override_line.to_string()
1421        } else {
1422            "cue_position = \"before\"\n".to_string()
1423        };
1424        format!(
1425            r#"
1426schema_version = "0.1.0"
1427rulepack_id = "anchored"
1428rulepack_version = "0.6.0"
1429default_locales = ["global"]
1430
1431[[recognizers]]
1432id = "name.forward_marker"
1433class = "Name"
1434enabled = true
1435
1436[recognizers.match]
1437kind = "anchored_match"
1438{cues_bucket}{boundary}{right_window_chars}{name_shape}{cue_position}
1439"#
1440        )
1441    }
1442
1443    fn assert_unsupported_field(err: RulepackError, field: &str) {
1444        assert!(matches!(
1445            err,
1446            RulepackError::UnsupportedField {
1447                field: ref actual,
1448                planned_version: "v0.4.1",
1449            } if actual == field
1450        ));
1451    }
1452
1453    fn assert_unsupported_anchored_match(err: RulepackError, field: &str, value: &str) {
1454        assert!(matches!(
1455            err,
1456            RulepackError::UnsupportedAnchoredMatch {
1457                field: ref actual_field,
1458                value: ref actual_value,
1459            } if actual_field == field && actual_value == value
1460        ));
1461    }
1462}