Skip to main content

gaze/
rulepack.rs

1use std::collections::{BTreeSet, HashMap};
2use std::path::PathBuf;
3
4use regex::Regex;
5use serde::Deserialize;
6use thiserror::Error;
7
8use crate::{LocaleTag, PiiClass};
9
10const SUPPORTED_SCHEMA_MAJOR_MINOR: &str = "0.1.";
11
/// A parsed and validated rulepack.
///
/// Instances are produced from TOML text by [`Rulepack::parse`] or
/// [`Rulepack::load`].
#[derive(Debug, Clone, PartialEq)]
pub struct Rulepack {
    /// Schema version exactly as written in the rulepack. Conversion rejects
    /// values that do not start with `SUPPORTED_SCHEMA_MAJOR_MINOR`.
    pub schema_version: String,
    /// Rulepack identifier, copied verbatim from the TOML.
    pub rulepack_id: String,
    /// Rulepack version string, copied verbatim from the TOML.
    pub rulepack_version: String,
    /// Locales applied to every recognizer that lists no locales of its own.
    pub default_locales: Vec<LocaleTag>,
    /// Named locale buckets from the optional `locale` table.
    pub locale: Option<LocaleData>,
    /// Recognizers in the order they appear in the rulepack.
    pub recognizers: Vec<RecognizerSpec>,
}
21
/// One recognizer entry of a rulepack after parsing and validation.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct RecognizerSpec {
    /// Recognizer identifier; peers refer to it from `cooperates_with`.
    pub id: String,
    /// PII class this recognizer emits (parsed by `parse_class`).
    pub class: PiiClass,
    /// Ids of recognizers this one declares cooperation with. Two recognizers
    /// of the same class must be linked by at least one such declaration.
    pub cooperates_with: Vec<String>,
    /// Whether the recognizer is active; defaults to `true` when omitted.
    pub enabled: bool,
    /// Locales the recognizer applies to; falls back to the rulepack's
    /// `default_locales` when the recognizer lists none.
    pub locales: Vec<LocaleTag>,
    /// Matcher configuration taken from the recognizer's `match` table.
    pub matcher: RawMatch,
    /// Optional context settings.
    pub context: Option<ContextSpec>,
    /// Optional validator reference.
    pub validator: Option<ValidatorSpec>,
    /// Optional normalizer reference.
    pub normalizer: Option<NormalizerSpec>,
    /// Scoring settings; base `0.70` and priority `0` when the table is omitted.
    pub scoring: ScoringSpec,
    /// Token settings.
    pub token: TokenSpec,
    /// Optional provenance information.
    pub source: Option<SourceSpec>,
}
38
/// Matcher configuration of a recognizer.
///
/// Deserialized as an internally tagged enum: the `kind` field selects the
/// variant using its snake_case name (`regex`, `dictionary`, `ner`,
/// `anchored_match`). Unknown fields are rejected.
#[derive(Debug, Clone, PartialEq, Deserialize)]
#[serde(tag = "kind", deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum RawMatch {
    /// Regular-expression matcher. `validate_matcher` requires exactly one of
    /// `pattern` and `pattern_template` to be present.
    Regex {
        /// Literal regex source; compiled during validation.
        #[serde(default)]
        pattern: Option<String>,
        /// Template alternative to `pattern`.
        #[serde(default)]
        pattern_template: Option<String>,
        /// Optional list of capture group indices.
        #[serde(default)]
        capture_groups: Option<Vec<u32>>,
    },
    /// Dictionary matcher; no additional validation is applied in this file.
    Dictionary {
        /// Inline terms; empty when omitted.
        #[serde(default)]
        terms: Vec<String>,
        /// Optional reference to a terms file.
        #[serde(default)]
        terms_file: Option<String>,
        /// Optional reference to context-provided terms.
        #[serde(default)]
        terms_from_context: Option<String>,
        /// Case sensitivity flag; `false` when omitted.
        #[serde(default)]
        case_sensitive: bool,
    },
    /// NER matcher; no additional validation is applied in this file.
    Ner {
        /// Reference to the model to use.
        model_ref: String,
    },
    /// Cue-anchored matcher; all fields are checked by `validate_matcher`.
    AnchoredMatch {
        /// Name of a locale bucket holding the cues; must not be blank.
        cues_bucket: String,
        /// One of `punctuation`, `whitespace`, `line_end`.
        boundary: String,
        /// Window size; must lie in `1..=512`.
        right_window_chars: u16,
        /// Must be `person_name`.
        name_shape: String,
        /// One of `before`, `after`.
        cue_position: String,
    },
}
72
/// Boundary kinds for `anchored_match`.
///
/// The snake_case names (`punctuation`, `whitespace`, `line_end`) are the same
/// strings `validate_matcher` accepts for `RawMatch::AnchoredMatch::boundary`.
// NOTE(review): `RawMatch::AnchoredMatch` stores the boundary as a plain
// `String`; this enum is not referenced in the visible part of the file.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum AnchoredBoundary {
    Punctuation,
    Whitespace,
    LineEnd,
}
81
/// Closed enum for the shape of token sequences `anchored_match` extracts.
///
/// v0.6 ships a single `PersonName` variant. Future variants
/// (e.g. `Organization`, `Address`, `LegalEntity`) must justify
/// why they aren't a locale-bucket lookup before being added here.
/// Adding variants without that justification regresses the
/// principle drawer `session-2026-04-25-no-codified-domain-concerns`
/// (see also `lower_email_header_pattern_template` in pre-v0.4.1 history).
///
/// The snake_case name `person_name` is the only value `validate_matcher`
/// accepts for `RawMatch::AnchoredMatch::name_shape`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum NameShape {
    PersonName,
}
96
/// Cue positions for `anchored_match`.
///
/// The snake_case names (`before`, `after`) are the same strings
/// `validate_matcher` accepts for `RawMatch::AnchoredMatch::cue_position`.
// NOTE(review): `RawMatch::AnchoredMatch` stores the position as a plain
// `String`; this enum is not referenced in the visible part of the file.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum CuePosition {
    Before,
    After,
}
104
/// Context settings of a recognizer.
///
/// `hotwords`, `window` and `boost` are currently rejected at parse time with
/// `RulepackError::UnsupportedField` when set, so in a successfully loaded
/// rulepack only `exclusions` can carry data.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct ContextSpec {
    /// Hotword list; must be empty for now.
    pub hotwords: Vec<String>,
    /// Context window; must be unset for now.
    pub window: Option<u16>,
    /// Score boost; must be unset for now.
    pub boost: Option<f32>,
    /// Exclusion strings, copied verbatim from the rulepack.
    pub exclusions: Vec<String>,
}
113
/// Reference to a validator by kind name (for example `email_rfc`).
#[derive(Debug, Clone, PartialEq)]
pub struct ValidatorSpec {
    /// Validator kind, copied verbatim from the rulepack.
    pub kind: String,
}
118
/// Reference to a normalizer by kind name (for example `email_canonical`).
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizerSpec {
    /// Normalizer kind, copied verbatim from the rulepack.
    pub kind: String,
}
123
/// Scoring settings of a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct ScoringSpec {
    /// Base score; `0.70` when not given in the rulepack.
    pub base: f32,
    /// Priority; `0` when not given in the rulepack.
    pub priority: i32,
}
129
/// Token settings of a recognizer.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct TokenSpec {
    /// Optional token family, copied verbatim from the rulepack.
    pub family: Option<String>,
    /// Optional token format. A non-empty value is currently rejected at parse
    /// time with `RulepackError::UnsupportedField`.
    pub format: Option<String>,
}
136
/// Provenance information attached to a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct SourceSpec {
    /// Origin label (for example `ported`).
    pub origin: String,
    /// Optional upstream the recognizer was taken from.
    pub from: Option<String>,
    /// Optional license identifier of the upstream.
    pub license: Option<String>,
}
143
/// Locale data of a rulepack: named buckets of strings.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct LocaleData {
    /// Buckets keyed by the name used under the rulepack's `locale` table.
    pub buckets: HashMap<String, LocaleBucket>,
}
148
/// A single locale bucket.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleBucket {
    /// Entries of the bucket, in rulepack order.
    pub names: Vec<String>,
}
153
/// Where [`Rulepack::load`] takes the rulepack TOML from.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum RulepackSource {
    /// TOML text that is already in memory.
    Embedded(&'static str),
    /// Path of a file whose contents are read as UTF-8 text.
    Path(PathBuf),
}
160
/// Errors produced while loading, parsing, or validating a rulepack.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum RulepackError {
    /// Reading the rulepack file failed.
    #[error("failed to read rulepack: {0}")]
    Io(#[source] std::io::Error),
    /// The rulepack text is not valid TOML for the expected schema.
    #[error("failed to parse rulepack TOML: {0}")]
    Toml(#[source] toml::de::Error),
    /// `schema_version` does not start with the supported major.minor prefix.
    #[error("unsupported rulepack schema_version {found}; supported {supported}")]
    SchemaVersion { found: String, supported: String },
    /// The class string was not recognised by `parse_class`.
    #[error("unknown pii class: {0}")]
    UnknownClass(String),
    /// A locale string could not be parsed into a `LocaleTag`.
    #[error("unknown locale: {0}")]
    UnknownLocale(String),
    #[error("unsupported matcher kind: {0}")]
    UnsupportedMatcher(String),
    /// An `anchored_match` field (or a cue in its locale bucket) has a value
    /// that is not accepted.
    #[error("unsupported anchored_match field '{field}' value '{value}'")]
    UnsupportedAnchoredMatch { field: String, value: String },
    /// A field that is part of the schema but not yet supported was set.
    #[error("unsupported rulepack field '{field}' in B1; planned for {planned_version}")]
    UnsupportedField {
        field: String,
        planned_version: &'static str,
    },
    #[error("unsupported validator kind: {kind}")]
    UnsupportedValidator { kind: String },
    #[error("unsupported normalizer kind: {kind}")]
    UnsupportedNormalizer { kind: String },
    #[error("unsupported rule spec variant: {variant}")]
    UnsupportedRuleSpec { variant: String },
    #[error("duplicate recognizer id '{id}' in rulepacks '{first_pack}' and '{second_pack}'")]
    DuplicateId {
        id: String,
        first_pack: String,
        second_pack: String,
    },
    /// A regex matcher set both or neither of `pattern` / `pattern_template`.
    #[error("regex recognizer '{id}' must define exactly one of pattern or pattern_template")]
    RegexPatternChoice { id: String },
    /// The regex `pattern` failed to compile.
    #[error("invalid regex for recognizer '{id}': {source}")]
    RegexCompile {
        id: String,
        #[source]
        source: regex::Error,
    },
    /// The compiled regex was rejected by the token-shape shadow check.
    #[error(
        "regex recognizer '{id}' shadows Gaze token shape sample '{shadowed_shape}' with pattern '{pattern}'"
    )]
    TokenShapeShadow {
        id: String,
        pattern: String,
        shadowed_shape: String,
    },
    #[error("unknown pattern_template placeholder '{placeholder}' in recognizer '{id}'")]
    UnknownPatternTemplatePlaceholder { id: String, placeholder: String },
    #[error(
        "context class_map override for dictionary '{dict}' changes {old_class:?} to {new_class:?}, but {uncovered_rule}"
    )]
    ClassMapOverrideClash {
        dict: String,
        old_class: PiiClass,
        new_class: PiiClass,
        uncovered_rule: String,
    },
    /// Two recognizers emit the same class and neither lists the other in
    /// `cooperates_with`.
    #[error(
        "same-class recognizers '{recognizer_a}' and '{recognizer_b}' both emit {class:?} but neither declares cooperates_with"
    )]
    SameClassWithoutCooperation {
        class: PiiClass,
        recognizer_a: String,
        recognizer_b: String,
    },
    /// Strict locale-overlap lint failure; only raised when
    /// `strict_locale_overlap` is enabled.
    #[error(
        "recognizers {recognizer_ids:?} share class {class:?} with equivalent regex shape and overlapping locale projection {locale_overlap:?}"
    )]
    ConflictingLocaleProjection {
        class: PiiClass,
        recognizer_ids: Vec<String>,
        locale_overlap: Vec<LocaleTag>,
    },
}
239
240impl Rulepack {
241    pub fn load(source: RulepackSource) -> Result<Rulepack, RulepackError> {
242        let raw = match source {
243            RulepackSource::Embedded(contents) => contents.to_string(),
244            RulepackSource::Path(path) => {
245                std::fs::read_to_string(path).map_err(RulepackError::Io)?
246            }
247        };
248        Self::parse(&raw)
249    }
250
251    pub fn parse(raw: &str) -> Result<Rulepack, RulepackError> {
252        let (raw, lint) = extract_recognizer_lint_config(raw);
253        let raw: RawRulepack = toml::from_str(&raw).map_err(RulepackError::Toml)?;
254        RawRulepackWithLint { raw, lint }.try_into()
255    }
256
257    pub fn activated_classes(&self) -> BTreeSet<PiiClass> {
258        self.recognizers
259            .iter()
260            .filter(|recognizer| recognizer.enabled)
261            .map(|recognizer| recognizer.class.clone())
262            .collect()
263    }
264}
265
/// Top-level rulepack document as deserialized from TOML, before validation.
///
/// Unknown top-level keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRulepack {
    schema_version: String,
    rulepack_id: String,
    rulepack_version: String,
    /// Unparsed locale strings; empty when omitted.
    #[serde(default)]
    default_locales: Vec<String>,
    /// Optional `locale` table.
    #[serde(default)]
    locale: Option<RawLocaleData>,
    /// `[[recognizers]]` entries; empty when omitted.
    #[serde(default)]
    recognizers: Vec<RawRecognizerSpec>,
}
279
/// Lint settings read from the `[recognizers.lint]` section by
/// `extract_recognizer_lint_config`.
#[derive(Debug, Default)]
struct RawRecognizerLintConfig {
    /// When `true`, a locale projection collision is returned as an error;
    /// otherwise it is only logged as a warning. Defaults to `false`.
    strict_locale_overlap: bool,
}
284
/// A deserialized rulepack paired with the lint settings that apply while it
/// is converted into a [`Rulepack`].
#[derive(Debug)]
struct RawRulepackWithLint {
    /// The deserialized rulepack document.
    raw: RawRulepack,
    /// Lint settings used during recognizer validation.
    lint: RawRecognizerLintConfig,
}
290
/// Raw `locale` table.
///
/// Because of `#[serde(flatten)]`, every key directly under `locale`
/// (for example `[locale.email_headers]`) becomes one bucket.
#[derive(Debug, Deserialize)]
struct RawLocaleData {
    #[serde(flatten)]
    buckets: HashMap<String, RawLocaleBucket>,
}
296
/// Raw locale bucket; `names` is required and unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawLocaleBucket {
    names: Vec<String>,
}
302
/// One `[[recognizers]]` entry as deserialized from TOML, before validation.
///
/// Unknown keys are rejected. `parse_recognizer` turns this into a
/// [`RecognizerSpec`].
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRecognizerSpec {
    id: String,
    /// Unparsed class string.
    class: String,
    #[serde(default)]
    cooperates_with: Vec<String>,
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    enabled: bool,
    /// Unparsed locale strings; empty means "use the rulepack defaults".
    #[serde(default)]
    locales: Vec<String>,
    /// Read from the TOML key `match` (a Rust keyword, hence the rename).
    #[serde(rename = "match")]
    matcher: RawMatch,
    #[serde(default)]
    context: Option<RawContextSpec>,
    #[serde(default)]
    validator: Option<RawValidatorSpec>,
    #[serde(default)]
    normalizer: Option<RawNormalizerSpec>,
    /// When absent, default scoring is filled in by `parse_recognizer`.
    #[serde(default)]
    scoring: Option<RawScoringSpec>,
    #[serde(default)]
    token: RawTokenSpec,
    #[serde(default)]
    source: Option<RawSourceSpec>,
}
329
/// Raw `[recognizers.context]` table; every field is optional and unknown
/// keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawContextSpec {
    #[serde(default)]
    hotwords: Vec<String>,
    #[serde(default)]
    window: Option<u16>,
    #[serde(default)]
    boost: Option<f32>,
    #[serde(default)]
    exclusions: Vec<String>,
}
342
/// Raw `[recognizers.validator]` table; `kind` is required.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawValidatorSpec {
    kind: String,
}
348
/// Raw `[recognizers.normalizer]` table; `kind` is required.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawNormalizerSpec {
    kind: String,
}
354
/// Raw `[recognizers.scoring]` table.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawScoringSpec {
    /// Defaults to `default_base_score()` when omitted.
    #[serde(default = "default_base_score")]
    base: f32,
    /// Defaults to `0` when omitted.
    #[serde(default)]
    priority: i32,
}
363
/// Raw `[recognizers.token]` table; both fields are optional and the whole
/// table may be omitted.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawTokenSpec {
    #[serde(default)]
    family: Option<String>,
    #[serde(default)]
    format: Option<String>,
}
372
/// Raw `[recognizers.source]` table; `origin` is required.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawSourceSpec {
    origin: String,
    #[serde(default)]
    from: Option<String>,
    #[serde(default)]
    license: Option<String>,
}
382
383impl TryFrom<RawRulepack> for Rulepack {
384    type Error = RulepackError;
385
386    fn try_from(raw: RawRulepack) -> Result<Self, Self::Error> {
387        RawRulepackWithLint {
388            raw,
389            lint: RawRecognizerLintConfig::default(),
390        }
391        .try_into()
392    }
393}
394
395impl TryFrom<RawRulepackWithLint> for Rulepack {
396    type Error = RulepackError;
397
398    fn try_from(raw_with_lint: RawRulepackWithLint) -> Result<Self, Self::Error> {
399        let raw = raw_with_lint.raw;
400        if !raw.schema_version.starts_with(SUPPORTED_SCHEMA_MAJOR_MINOR) {
401            return Err(RulepackError::SchemaVersion {
402                found: raw.schema_version,
403                supported: "~0.1.x".to_string(),
404            });
405        }
406
407        let default_locales = parse_locales(raw.default_locales)?;
408        let recognizers = raw
409            .recognizers
410            .into_iter()
411            .map(|recognizer| parse_recognizer(recognizer, &default_locales))
412            .collect::<Result<Vec<_>, _>>()?;
413        validate_rulepack_recognizers(&recognizers, &default_locales, &raw_with_lint.lint)?;
414        let locale = raw.locale.map(LocaleData::from);
415        reject_anchored_match_ellipsis_cues(&recognizers, locale.as_ref())?;
416
417        Ok(Self {
418            schema_version: raw.schema_version,
419            rulepack_id: raw.rulepack_id,
420            rulepack_version: raw.rulepack_version,
421            default_locales,
422            locale,
423            recognizers,
424        })
425    }
426}
427
428fn extract_recognizer_lint_config(raw: &str) -> (String, RawRecognizerLintConfig) {
429    let mut sanitized = String::with_capacity(raw.len());
430    let mut lint = RawRecognizerLintConfig::default();
431    let mut in_lint = false;
432
433    for line in raw.lines() {
434        let trimmed = line.trim();
435        if trimmed == "[recognizers.lint]" {
436            in_lint = true;
437            continue;
438        }
439        if in_lint && trimmed.starts_with('[') {
440            in_lint = false;
441        }
442        if in_lint {
443            if let Some((key, value)) = trimmed.split_once('=') {
444                if key.trim() == "strict_locale_overlap" {
445                    lint.strict_locale_overlap = value.trim().eq_ignore_ascii_case("true");
446                }
447            }
448            continue;
449        }
450        sanitized.push_str(line);
451        sanitized.push('\n');
452    }
453
454    (sanitized, lint)
455}
456
457impl From<RawLocaleData> for LocaleData {
458    fn from(raw: RawLocaleData) -> Self {
459        Self {
460            buckets: raw
461                .buckets
462                .into_iter()
463                .map(|(name, bucket)| {
464                    (
465                        name,
466                        LocaleBucket {
467                            names: bucket.names,
468                        },
469                    )
470                })
471                .collect(),
472        }
473    }
474}
475
476fn parse_recognizer(
477    raw: RawRecognizerSpec,
478    default_locales: &[LocaleTag],
479) -> Result<RecognizerSpec, RulepackError> {
480    reject_unshipped_fields(&raw)?;
481    validate_matcher(&raw)?;
482    let locales = if raw.locales.is_empty() {
483        default_locales.to_vec()
484    } else {
485        parse_locales(raw.locales)?
486    };
487
488    Ok(RecognizerSpec {
489        id: raw.id,
490        class: parse_class(&raw.class)?,
491        cooperates_with: raw.cooperates_with,
492        enabled: raw.enabled,
493        locales,
494        matcher: raw.matcher,
495        context: raw.context.map(|context| ContextSpec {
496            hotwords: context.hotwords,
497            window: context.window,
498            boost: context.boost,
499            exclusions: context.exclusions,
500        }),
501        validator: raw.validator.map(|validator| ValidatorSpec {
502            kind: validator.kind,
503        }),
504        normalizer: raw.normalizer.map(|normalizer| NormalizerSpec {
505            kind: normalizer.kind,
506        }),
507        scoring: raw.scoring.map_or_else(
508            || ScoringSpec {
509                base: default_base_score(),
510                priority: 0,
511            },
512            |scoring| ScoringSpec {
513                base: scoring.base,
514                priority: scoring.priority,
515            },
516        ),
517        token: TokenSpec {
518            family: raw.token.family,
519            format: raw.token.format,
520        },
521        source: raw.source.map(|source| SourceSpec {
522            origin: source.origin,
523            from: source.from,
524            license: source.license,
525        }),
526    })
527}
528
529fn validate_matcher(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
530    match &raw.matcher {
531        RawMatch::Regex {
532            pattern,
533            pattern_template,
534            ..
535        } => {
536            if pattern.is_some() == pattern_template.is_some() {
537                return Err(RulepackError::RegexPatternChoice { id: raw.id.clone() });
538            }
539            if let Some(pattern) = pattern {
540                let compiled =
541                    Regex::new(pattern).map_err(|source| RulepackError::RegexCompile {
542                        id: raw.id.clone(),
543                        source,
544                    })?;
545                crate::token_shape::reject_if_shadows_token_shape(&compiled, &raw.id).map_err(
546                    |shadow| RulepackError::TokenShapeShadow {
547                        id: shadow.recognizer_id,
548                        pattern: shadow.offending_pattern,
549                        shadowed_shape: shadow.shadowed_shape,
550                    },
551                )?;
552            }
553        }
554        RawMatch::AnchoredMatch {
555            cues_bucket,
556            boundary,
557            right_window_chars,
558            name_shape,
559            cue_position,
560            ..
561        } => {
562            if cues_bucket.trim().is_empty() {
563                return Err(RulepackError::UnsupportedAnchoredMatch {
564                    field: "cues_bucket".to_string(),
565                    value: cues_bucket.clone(),
566                });
567            }
568            if !(1..=512).contains(right_window_chars) {
569                return Err(RulepackError::UnsupportedAnchoredMatch {
570                    field: "right_window_chars".to_string(),
571                    value: right_window_chars.to_string(),
572                });
573            }
574            if !matches!(boundary.as_str(), "punctuation" | "whitespace" | "line_end") {
575                return Err(RulepackError::UnsupportedAnchoredMatch {
576                    field: "boundary".to_string(),
577                    value: boundary.clone(),
578                });
579            }
580            if name_shape != "person_name" {
581                return Err(RulepackError::UnsupportedAnchoredMatch {
582                    field: "name_shape".to_string(),
583                    value: name_shape.clone(),
584                });
585            }
586            if !matches!(cue_position.as_str(), "before" | "after") {
587                return Err(RulepackError::UnsupportedAnchoredMatch {
588                    field: "cue_position".to_string(),
589                    value: cue_position.clone(),
590                });
591            }
592        }
593        RawMatch::Dictionary { .. } | RawMatch::Ner { .. } => {}
594    }
595    Ok(())
596}
597
598fn reject_anchored_match_ellipsis_cues(
599    recognizers: &[RecognizerSpec],
600    locale: Option<&LocaleData>,
601) -> Result<(), RulepackError> {
602    let Some(locale) = locale else {
603        return Ok(());
604    };
605    for recognizer in recognizers {
606        let RawMatch::AnchoredMatch { cues_bucket, .. } = &recognizer.matcher else {
607            continue;
608        };
609        let Some(bucket) = locale.buckets.get(cues_bucket) else {
610            continue;
611        };
612        if let Some(cue) = bucket.names.iter().find(|cue| cue.contains("...")) {
613            return Err(RulepackError::UnsupportedAnchoredMatch {
614                field: format!("locale.{cues_bucket}.names"),
615                value: cue.clone(),
616            });
617        }
618    }
619    Ok(())
620}
621
622fn reject_unshipped_fields(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
623    const PLANNED_VERSION: &str = "v0.4.1";
624
625    if raw
626        .token
627        .format
628        .as_deref()
629        .is_some_and(|value| !value.is_empty())
630    {
631        return Err(RulepackError::UnsupportedField {
632            field: "token.format".to_string(),
633            planned_version: PLANNED_VERSION,
634        });
635    }
636    if let Some(context) = &raw.context {
637        if !context.hotwords.is_empty() {
638            return Err(RulepackError::UnsupportedField {
639                field: "context.hotwords".to_string(),
640                planned_version: PLANNED_VERSION,
641            });
642        }
643        if context.boost.is_some() {
644            return Err(RulepackError::UnsupportedField {
645                field: "context.boost".to_string(),
646                planned_version: PLANNED_VERSION,
647            });
648        }
649        if context.window.is_some() {
650            return Err(RulepackError::UnsupportedField {
651                field: "context.window".to_string(),
652                planned_version: PLANNED_VERSION,
653            });
654        }
655    }
656    Ok(())
657}
658
659pub fn recognizer_composition_validator(
660    recognizers: &[RecognizerSpec],
661) -> Result<(), RulepackError> {
662    for (index, first) in recognizers.iter().enumerate() {
663        for second in recognizers.iter().skip(index + 1) {
664            if first.class != second.class {
665                continue;
666            }
667            if first.cooperates_with.iter().any(|id| id == &second.id)
668                || second.cooperates_with.iter().any(|id| id == &first.id)
669            {
670                continue;
671            }
672            return Err(RulepackError::SameClassWithoutCooperation {
673                class: first.class.clone(),
674                recognizer_a: first.id.clone(),
675                recognizer_b: second.id.clone(),
676            });
677        }
678    }
679    Ok(())
680}
681
682fn validate_rulepack_recognizers(
683    recognizers: &[RecognizerSpec],
684    active_locales: &[LocaleTag],
685    lint: &RawRecognizerLintConfig,
686) -> Result<(), RulepackError> {
687    recognizer_composition_validator(recognizers)?;
688    lint_locale_projection_collisions(recognizers, active_locales, lint)?;
689    lint_global_naked_patterns(recognizers);
690    Ok(())
691}
692
693fn lint_locale_projection_collisions(
694    recognizers: &[RecognizerSpec],
695    active_locales: &[LocaleTag],
696    lint: &RawRecognizerLintConfig,
697) -> Result<(), RulepackError> {
698    for (index, first) in recognizers.iter().enumerate() {
699        if !first.enabled {
700            continue;
701        }
702        let Some(first_shape) = regex_structural_shape(&first.matcher) else {
703            continue;
704        };
705        if !is_truly_naked_numeric(&first.matcher) {
706            continue;
707        }
708        let first_projection = locale_projection(&first.locales, active_locales);
709        if first_projection.is_empty() {
710            continue;
711        }
712
713        for second in recognizers.iter().skip(index + 1) {
714            if !second.enabled || first.class != second.class {
715                continue;
716            }
717            if !is_truly_naked_numeric(&second.matcher) {
718                continue;
719            }
720            if regex_structural_shape(&second.matcher).as_ref() != Some(&first_shape) {
721                continue;
722            }
723            let second_projection = locale_projection(&second.locales, active_locales);
724            if second_projection.is_empty() {
725                continue;
726            }
727
728            let recognizer_ids = vec![first.id.clone(), second.id.clone()];
729            let locale_overlap = merged_locale_projection(&first_projection, &second_projection);
730            if lint.strict_locale_overlap {
731                return Err(RulepackError::ConflictingLocaleProjection {
732                    class: first.class.clone(),
733                    recognizer_ids,
734                    locale_overlap,
735                });
736            }
737            tracing::warn!(
738                class = %first.class.class_name(),
739                recognizer_ids = ?recognizer_ids,
740                locale_overlap = ?locale_overlap,
741                "recognizers share class with naked-shape regex and non-disjoint locale projection"
742            );
743        }
744    }
745    Ok(())
746}
747
748fn lint_global_naked_patterns(recognizers: &[RecognizerSpec]) {
749    for recognizer in recognizers {
750        if !recognizer.enabled || recognizer.locales != [LocaleTag::Global] {
751            continue;
752        }
753        let Some(shape) = regex_structural_shape(&recognizer.matcher) else {
754            continue;
755        };
756        let RawMatch::Regex {
757            pattern: Some(pattern),
758            ..
759        } = &recognizer.matcher
760        else {
761            continue;
762        };
763        if shape.minimum_match_len < 6 && !has_regex_separator(pattern) {
764            tracing::warn!(
765                recognizer_id = %recognizer.id,
766                class = %recognizer.class.class_name(),
767                minimum_match_len = shape.minimum_match_len,
768                "global recognizer uses short naked regex shape"
769            );
770        }
771    }
772}
773
/// Coarse structural summary of a regex pattern, used by the recognizer
/// lints to compare patterns.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RegexStructuralShape {
    /// Lower bound taken from the first digit quantifier found in the pattern
    /// (see `digit_quantifier_minimum`).
    minimum_match_len: usize,
    /// Character class the quantifier applies to.
    character_class: RegexCharacterClass,
}
779
/// Character classes recognised by `regex_structural_shape`; only digits are
/// detected at present.
#[derive(Debug, Clone, PartialEq, Eq)]
enum RegexCharacterClass {
    Digit,
}
784
785fn regex_structural_shape(matcher: &RawMatch) -> Option<RegexStructuralShape> {
786    let RawMatch::Regex {
787        pattern: Some(pattern),
788        pattern_template: None,
789        ..
790    } = matcher
791    else {
792        return None;
793    };
794    if has_unescaped_line_anchor(pattern) {
795        return None;
796    }
797    digit_quantifier_minimum(pattern).map(|minimum_match_len| RegexStructuralShape {
798        minimum_match_len,
799        character_class: RegexCharacterClass::Digit,
800    })
801}
802
803fn is_truly_naked_numeric(matcher: &RawMatch) -> bool {
804    let RawMatch::Regex {
805        pattern: Some(pattern),
806        ..
807    } = matcher
808    else {
809        return false;
810    };
811
812    let mut chars = pattern.chars();
813    while let Some(ch) = chars.next() {
814        if ch == '\\' {
815            chars.next();
816            continue;
817        }
818        if ch.is_ascii_alphabetic() {
819            return false;
820        }
821    }
822    true
823}
824
/// Reports whether `pattern` contains a `^` or `$` that is neither escaped
/// with a backslash nor located inside a `[...]` character class.
///
/// Class tracking is a simple toggle: `[` enters a class and `]` leaves it.
fn has_unescaped_line_anchor(pattern: &str) -> bool {
    let mut inside_class = false;
    let mut chars = pattern.chars();
    while let Some(current) = chars.next() {
        if current == '\\' {
            // The escaped character is consumed without being inspected.
            chars.next();
        } else if current == '[' {
            inside_class = true;
        } else if current == ']' {
            inside_class = false;
        } else if !inside_class && (current == '^' || current == '$') {
            return true;
        }
    }
    false
}
843
844fn digit_quantifier_minimum(pattern: &str) -> Option<usize> {
845    find_digit_quantifier(pattern, r"\d{")
846        .or_else(|| find_digit_quantifier(pattern, "[0-9]{"))
847        .or_else(|| find_digit_quantifier(pattern, "[[:digit:]]{"))
848}
849
/// Parses the run of ASCII digits that directly follows the first occurrence
/// of `needle` in `pattern`.
///
/// Returns `None` when `needle` does not occur, when no digit follows its
/// first occurrence, or when the digits do not fit into `usize`. Only the
/// first occurrence of `needle` is considered.
fn find_digit_quantifier(pattern: &str, needle: &str) -> Option<usize> {
    let (_, after_needle) = pattern.split_once(needle)?;
    let digits_end = after_needle
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(after_needle.len());
    match &after_needle[..digits_end] {
        "" => None,
        digits => digits.parse().ok(),
    }
}
862
863fn locale_projection(locales: &[LocaleTag], active_locales: &[LocaleTag]) -> Vec<LocaleTag> {
864    let mut projection = Vec::new();
865    for locale in locales {
866        if *locale == LocaleTag::Global {
867            projection.push(LocaleTag::Global);
868        } else if active_locales.iter().any(|active| active == locale) {
869            projection.push(locale.clone());
870        }
871    }
872    projection
873}
874
875fn merged_locale_projection(left: &[LocaleTag], right: &[LocaleTag]) -> Vec<LocaleTag> {
876    let mut merged = Vec::new();
877    for locale in left.iter().chain(right) {
878        if !merged.iter().any(|existing| existing == locale) {
879            merged.push(locale.clone());
880        }
881    }
882    merged
883}
884
/// Heuristic check for a separator in a regex pattern.
///
/// Returns `true` when the pattern text contains any of the characters
/// `-`, `/`, `.`, `+`, or either of the substrings `\s` and `[:space:]`.
/// The check is purely textual and does not interpret regex syntax.
fn has_regex_separator(pattern: &str) -> bool {
    const SEPARATOR_CHARS: [char; 4] = ['-', '/', '.', '+'];
    const SEPARATOR_CLASSES: [&str; 2] = ["\\s", "[:space:]"];

    pattern.contains(&SEPARATOR_CHARS[..])
        || SEPARATOR_CLASSES
            .iter()
            .any(|class| pattern.contains(*class))
}
893
894pub fn parse_class(input: &str) -> Result<PiiClass, RulepackError> {
895    let trimmed = input.trim();
896    let lower = trimmed.to_ascii_lowercase();
897    match lower.as_str() {
898        "email" => Ok(PiiClass::Email),
899        "name" => Ok(PiiClass::Name),
900        "location" => Ok(PiiClass::Location),
901        "organization" => Ok(PiiClass::Organization),
902        custom if custom.starts_with("custom:") => {
903            let name = trimmed
904                .split_once(':')
905                .map(|(_, name)| name)
906                .unwrap_or_default();
907            if name.trim().is_empty() {
908                return Err(RulepackError::UnknownClass(input.to_string()));
909            }
910            Ok(PiiClass::custom(name))
911        }
912        _ => Err(RulepackError::UnknownClass(input.to_string())),
913    }
914}
915
916fn parse_locales(locales: Vec<String>) -> Result<Vec<LocaleTag>, RulepackError> {
917    locales
918        .into_iter()
919        .map(|locale| {
920            LocaleTag::parse(&locale).map_err(|_| RulepackError::UnknownLocale(locale.clone()))
921        })
922        .collect()
923}
924
/// Serde default for `RawRecognizerSpec::enabled`: recognizers are enabled
/// unless the rulepack says otherwise.
fn default_true() -> bool {
    true
}
928
/// Default base score, used both as the serde default for
/// `RawScoringSpec::base` and by `parse_recognizer` when a recognizer has no
/// scoring table at all.
fn default_base_score() -> f32 {
    0.70
}
932
#[cfg(test)]
mod tests {
    use super::*;

    /// Happy-path fixture: one locale bucket and a single regex-based email
    /// recognizer with every optional sub-table present.
    const CORE: &str = r#"
schema_version = "0.1.0"
rulepack_id = "gaze-core"
rulepack_version = "0.4.0"
default_locales = ["global"]

[locale.email_headers]
names = ["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]

[[recognizers]]
id = "email.global"
class = "Email"
enabled = true
locales = ["global"]

[recognizers.match]
kind = "regex"
pattern = '''(?i)\b[a-z0-9._%+\-]+@(?:(?:[a-z0-9\-]+\.)*example\.invalid|test\.local|[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?\.(?:com|org|net|edu|gov|de|uk|fr|nl|io|ai|co))\b'''

[recognizers.context]
exclusions = ["example.com"]

[recognizers.validator]
kind = "email_rfc"

[recognizers.normalizer]
kind = "email_canonical"

[recognizers.scoring]
base = 0.70
priority = 90

[recognizers.token]

[recognizers.source]
origin = "ported"
from = "presidio"
license = "Apache-2.0"
"#;

    /// Parses `CORE` and spot-checks the top-level metadata, the locale
    /// bucket, and the single recognizer it declares.
    #[test]
    fn parses_core_rulepack_end_to_end() {
        let rulepack = Rulepack::parse(CORE).expect("core rulepack");

        assert_eq!(rulepack.rulepack_id, "gaze-core");
        assert_eq!(rulepack.default_locales, vec![LocaleTag::Global]);
        let header_names = &rulepack
            .locale
            .as_ref()
            .and_then(|locale| locale.buckets.get("email_headers"))
            .expect("email headers")
            .names;
        assert_eq!(
            header_names,
            &vec!["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
        );
        assert_eq!(rulepack.recognizers.len(), 1);
        let recognizer = &rulepack.recognizers[0];
        assert_eq!(recognizer.id, "email.global");
        assert_eq!(recognizer.class, PiiClass::Email);
        assert_eq!(recognizer.scoring.priority, 90);
        assert!(matches!(recognizer.matcher, RawMatch::Regex { .. }));
    }

    /// The embedded "core" rulepack activates exactly the Email and Name classes.
    #[cfg(feature = "bundled-recognizers")]
    #[test]
    fn embedded_core_activated_classes_match_rulepack_classes() {
        let rulepack = Rulepack::load(RulepackSource::Embedded(
            gaze_recognizers::embedded("core").expect("core rulepack"),
        ))
        .expect("embedded core rulepack");

        assert_eq!(
            rulepack.activated_classes(),
            BTreeSet::from([PiiClass::Email, PiiClass::Name])
        );
    }

    /// Every Name recognizer in the embedded "core" rulepack lists each of its
    /// same-class peers in `cooperates_with`.
    #[cfg(feature = "bundled-recognizers")]
    #[test]
    fn embedded_core_loads_full_name_recognizer_cooperation_matrix() {
        let rulepack = Rulepack::load(RulepackSource::Embedded(
            gaze_recognizers::embedded("core").expect("core rulepack"),
        ))
        .expect("embedded core rulepack");
        let name_recognizers = rulepack
            .recognizers
            .iter()
            .filter(|recognizer| recognizer.class == PiiClass::Name)
            .collect::<Vec<_>>();

        assert_eq!(name_recognizers.len(), 5);
        for recognizer in &name_recognizers {
            for peer in &name_recognizers {
                // A recognizer is not expected to list itself.
                if recognizer.id == peer.id {
                    continue;
                }
                assert!(
                    recognizer.cooperates_with.contains(&peer.id),
                    "{} missing cooperates_with {}",
                    recognizer.id,
                    peer.id
                );
            }
        }
    }

    /// The embedded "core-extended" rulepack activates exactly the six custom
    /// classes listed below.
    #[cfg(feature = "bundled-recognizers")]
    #[test]
    fn embedded_core_extended_activated_classes_match_rulepack_classes() {
        let rulepack = Rulepack::load(RulepackSource::Embedded(
            gaze_recognizers::embedded("core-extended").expect("core-extended rulepack"),
        ))
        .expect("embedded core-extended rulepack");

        assert_eq!(
            rulepack.activated_classes(),
            BTreeSet::from([
                PiiClass::custom("phone"),
                PiiClass::custom("iban"),
                PiiClass::custom("credit_card"),
                PiiClass::custom("ip_address"),
                PiiClass::custom("eth_address"),
                PiiClass::custom("postal_code"),
            ])
        );
    }

    /// Appending a synthetic recognizer to the embedded "core-extended" text
    /// makes its class show up in `activated_classes()`.
    #[cfg(feature = "bundled-recognizers")]
    #[test]
    fn activated_classes_include_new_rulepack_recognizer_class() {
        let raw = format!(
            r#"{}

[[recognizers]]
id = "test.only"
class = "custom:test_only"
enabled = true
locales = ["global"]

[recognizers.match]
kind = "regex"
pattern = "TEST_ONLY"

[recognizers.scoring]
base = 0.70
priority = 1
"#,
            gaze_recognizers::embedded("core-extended").expect("core-extended rulepack")
        );
        let rulepack = Rulepack::parse(&raw).expect("core-extended with synthetic recognizer");

        assert!(
            rulepack
                .activated_classes()
                .contains(&PiiClass::custom("test_only")),
            "new recognizer class must be derived from rulepack data"
        );
    }

    /// `token.family` is accepted and surfaced on the parsed recognizer.
    #[test]
    fn rulepack_accepts_token_family() {
        let rulepack = Rulepack::parse(&unsupported_field_rulepack(
            "[recognizers.token]\nfamily = \"email.formatpreserve\"\n",
        ))
        .expect("token family is active in v0.4.1");

        assert_eq!(
            rulepack.recognizers[0].token.family.as_deref(),
            Some("email.formatpreserve")
        );
    }

    /// `token.format` is rejected as an unsupported field.
    #[test]
    fn rulepack_rejects_unsupported_token_format() {
        let err = Rulepack::parse(&unsupported_field_rulepack(
            "[recognizers.token]\nformat = \"Customer_{n}\"\n",
        ))
        .expect_err("token format is reserved for v0.4.1");

        assert_unsupported_field(err, "token.format");
    }

    /// `context.hotwords` is rejected as an unsupported field.
    #[test]
    fn rulepack_rejects_unsupported_context_hotwords() {
        let err = Rulepack::parse(&unsupported_field_rulepack(
            "[recognizers.context]\nhotwords = [\"foo\"]\n",
        ))
        .expect_err("context hotwords are reserved for v0.4.1");

        assert_unsupported_field(err, "context.hotwords");
    }

    /// `context.boost` is rejected as an unsupported field.
    #[test]
    fn rulepack_rejects_unsupported_context_boost() {
        let err = Rulepack::parse(&unsupported_field_rulepack(
            "[recognizers.context]\nboost = 0.10\n",
        ))
        .expect_err("context boost is reserved for v0.4.1");

        assert_unsupported_field(err, "context.boost");
    }

    /// `context.window` is rejected as an unsupported field.
    #[test]
    fn rulepack_rejects_unsupported_context_window() {
        let err = Rulepack::parse(&unsupported_field_rulepack(
            "[recognizers.context]\nwindow = 12\n",
        ))
        .expect_err("context window is reserved for v0.4.1");

        assert_unsupported_field(err, "context.window");
    }

    /// With the reserved token/context keys absent (as in `CORE`), parsing
    /// succeeds and those fields come back empty / `None`.
    #[test]
    fn rulepack_accepts_default_token_fields() {
        let rulepack = Rulepack::parse(CORE).expect("reserved token/context fields are unset");
        let recognizer = &rulepack.recognizers[0];

        assert_eq!(recognizer.token.family, None);
        assert_eq!(recognizer.token.format, None);
        assert!(recognizer.context.as_ref().unwrap().hotwords.is_empty());
        assert_eq!(recognizer.context.as_ref().unwrap().boost, None);
        assert_eq!(recognizer.context.as_ref().unwrap().window, None);
    }

    /// Supplying both `pattern` and `pattern_template` is an error. The extra
    /// key lands in `[recognizers.match]` because the helper appends it right
    /// after that table's `pattern` key.
    #[test]
    fn pattern_template_with_pattern_both_present_fails_closed() {
        let err = Rulepack::parse(&unsupported_field_rulepack(
            "pattern_template = \"{locale_email_headers}: (.+)\"\n",
        ))
        .expect_err("pattern and pattern_template are mutually exclusive");

        assert!(matches!(
            err,
            RulepackError::RegexPatternChoice { id } if id == "bad.email"
        ));
    }

    /// A regex matcher with neither `pattern` nor `pattern_template` is an error.
    #[test]
    fn regex_pattern_or_template_is_required() {
        let raw = r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]

[[recognizers]]
id = "bad.email"
class = "Email"
enabled = true

[recognizers.match]
kind = "regex"
"#;
        let err = Rulepack::parse(raw).expect_err("regex pattern is required");

        assert!(matches!(
            err,
            RulepackError::RegexPatternChoice { id } if id == "bad.email"
        ));
    }

    /// A minimal custom rulepack with a literal email regex parses successfully.
    #[test]
    fn rulepack_load_accepts_fixture_email_regex() {
        let raw = r#"
schema_version = "0.1.0"
rulepack_id = "custom-email"
rulepack_version = "0.7.0"
default_locales = ["global"]

[[recognizers]]
id = "custom.email"
class = "Email"
enabled = true

[recognizers.match]
kind = "regex"
pattern = '''alice@example\.invalid'''
"#;

        let rulepack = Rulepack::parse(raw).expect("standard email regex should load");

        assert_eq!(rulepack.recognizers.len(), 1);
        assert_eq!(rulepack.recognizers[0].id, "custom.email");
    }

    /// The all-defaults anchored-match fixture parses into `RawMatch::AnchoredMatch`.
    #[test]
    fn anchored_match_accepts_valid_schema() {
        let rulepack = Rulepack::parse(&anchored_match_rulepack("")).expect("anchored_match");
        assert!(matches!(
            rulepack.recognizers[0].matcher,
            RawMatch::AnchoredMatch { .. }
        ));
    }

    /// An unrecognized `boundary` value is rejected.
    #[test]
    fn anchored_match_rejects_unknown_boundary() {
        let err = Rulepack::parse(&anchored_match_rulepack("boundary = \"paragraph\"\n"))
            .expect_err("unknown boundary fails closed");

        assert_unsupported_anchored_match(err, "boundary", "paragraph");
    }

    /// An unrecognized `name_shape` value is rejected.
    #[test]
    fn anchored_match_rejects_unknown_name_shape() {
        let err = Rulepack::parse(&anchored_match_rulepack("name_shape = \"organization\"\n"))
            .expect_err("unknown name_shape fails closed");

        assert_unsupported_anchored_match(err, "name_shape", "organization");
    }

    /// An unrecognized `cue_position` value is rejected.
    #[test]
    fn anchored_match_rejects_unknown_cue_position() {
        let err = Rulepack::parse(&anchored_match_rulepack("cue_position = \"around\"\n"))
            .expect_err("unknown cue_position fails closed");

        assert_unsupported_anchored_match(err, "cue_position", "around");
    }

    /// An empty `cues_bucket` is rejected.
    #[test]
    fn anchored_match_rejects_missing_cues_bucket() {
        let err = Rulepack::parse(&anchored_match_rulepack("cues_bucket = \"\"\n"))
            .expect_err("missing cues_bucket fails closed");

        assert_unsupported_anchored_match(err, "cues_bucket", "");
    }

    /// A cue name containing "..." in the referenced locale bucket is rejected,
    /// and the error points at the bucket's `names` key.
    #[test]
    fn anchored_match_rejects_ellipsis_in_cue_values() {
        let err = Rulepack::parse(
            r#"
schema_version = "0.1.0"
rulepack_id = "anchored"
rulepack_version = "0.6.0"
default_locales = ["global"]

[locale.forward_markers]
names = ["Forwarded ... message"]

[[recognizers]]
id = "name.forward_marker"
class = "Name"
enabled = true

[recognizers.match]
kind = "anchored_match"
cues_bucket = "forward_markers"
boundary = "punctuation"
right_window_chars = 64
name_shape = "person_name"
cue_position = "before"
"#,
        )
        .expect_err("ellipsis cue fails closed");

        assert_unsupported_anchored_match(
            err,
            "locale.forward_markers.names",
            "Forwarded ... message",
        );
    }

    /// `right_window_chars` values of 0 and 513 are both rejected, and the
    /// error echoes the offending value verbatim.
    #[test]
    fn anchored_match_rejects_invalid_window_bounds() {
        for value in ["0", "513"] {
            let err = Rulepack::parse(&anchored_match_rulepack(&format!(
                "right_window_chars = {value}\n"
            )))
            .expect_err("invalid right_window_chars fails closed");

            assert_unsupported_anchored_match(err, "right_window_chars", value);
        }
    }

    /// Two enabled recognizers of the same class without any `cooperates_with`
    /// entry are rejected, naming both recognizers in declaration order.
    #[test]
    fn rulepack_load_fails_when_two_name_recognizers_omit_cooperates_with() {
        let err = Rulepack::parse(
            r#"
schema_version = "0.1.0"
rulepack_id = "bad-composition"
rulepack_version = "0.4.1"
default_locales = ["global"]

[[recognizers]]
id = "email.header.name"
class = "Name"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "From: ([A-Z][a-z]+)"

[[recognizers]]
id = "salutation.name"
class = "Name"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "Dear ([A-Z][a-z]+)"
"#,
        )
        .expect_err("same-class recognizers must explicitly cooperate");

        assert!(matches!(
            err,
            RulepackError::SameClassWithoutCooperation {
                class: PiiClass::Name,
                recognizer_a,
                recognizer_b,
            } if recognizer_a == "email.header.name" && recognizer_b == "salutation.name"
        ));
    }

    /// A `cooperates_with` entry on just one side of a same-class pair is
    /// enough for the rulepack to load.
    #[test]
    fn rulepack_load_accepts_same_class_pair_with_cooperates_with() {
        let rulepack = Rulepack::parse(
            r#"
schema_version = "0.1.0"
rulepack_id = "cooperating-composition"
rulepack_version = "0.4.1"
default_locales = ["global"]

[[recognizers]]
id = "email.header.name"
class = "Name"
cooperates_with = ["salutation.name"]
enabled = true

[recognizers.match]
kind = "regex"
pattern = "From: ([A-Z][a-z]+)"

[[recognizers]]
id = "salutation.name"
class = "Name"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "Dear ([A-Z][a-z]+)"
"#,
        )
        .expect("cooperates_with unblocks same-class recognizers");

        assert_eq!(rulepack.recognizers.len(), 2);
        assert_eq!(
            rulepack.recognizers[0].cooperates_with,
            vec!["salutation.name"]
        );
    }

    /// An unknown top-level key surfaces as a TOML error whose message names
    /// the offending key.
    #[test]
    fn rejects_unknown_fields_with_parent_table_context() {
        let err = Rulepack::parse(
            r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]
bogus = true
"#,
        )
        .expect_err("unknown field must fail");

        assert!(matches!(err, RulepackError::Toml(_)));
        assert!(err.to_string().contains("bogus"));
    }

    /// A `schema_version` of "0.2.0" is rejected with `SchemaVersion`.
    #[test]
    fn rejects_unsupported_schema_version() {
        let err = Rulepack::parse(
            r#"
schema_version = "0.2.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
"#,
        )
        .expect_err("unsupported schema");

        assert!(matches!(err, RulepackError::SchemaVersion { .. }));
    }

    /// `parse_class` accepts the built-in PascalCase spelling and lower-cases
    /// the name of a `custom:` class.
    #[test]
    fn class_spelling_accepts_pascal_case_and_custom_names() {
        assert_eq!(parse_class("Email").unwrap(), PiiClass::Email);
        assert_eq!(
            parse_class("custom:Class_Alpha").unwrap(),
            PiiClass::Custom("class_alpha".to_string())
        );
    }

    /// Builds a rulepack with one regex recognizer (`bad.email`) and appends
    /// `extra` verbatim after the `[recognizers.match]` table.
    ///
    /// `extra` may therefore either add keys to `[recognizers.match]` or open
    /// a new sub-table such as `[recognizers.token]`.
    fn unsupported_field_rulepack(extra: &str) -> String {
        format!(
            r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]

[[recognizers]]
id = "bad.email"
class = "Email"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "BAD_EMAIL_FIXTURE"

{extra}
"#
        )
    }

    /// Builds an `anchored_match` rulepack from known-good defaults, replacing
    /// the one key that `override_line` starts with (if any).
    ///
    /// `override_line` must be a complete `key = value\n` TOML line; pass `""`
    /// to get the all-defaults fixture.
    fn anchored_match_rulepack(override_line: &str) -> String {
        // Yields the override when it targets `key`, otherwise the default line.
        let line_for = |key: &str, default_value: &str| -> String {
            if override_line.starts_with(key) {
                override_line.to_string()
            } else {
                format!("{key} = {default_value}\n")
            }
        };
        let cues_bucket = line_for("cues_bucket", "\"forward_markers\"");
        let boundary = line_for("boundary", "\"punctuation\"");
        let right_window_chars = line_for("right_window_chars", "64");
        let name_shape = line_for("name_shape", "\"person_name\"");
        let cue_position = line_for("cue_position", "\"before\"");
        format!(
            r#"
schema_version = "0.1.0"
rulepack_id = "anchored"
rulepack_version = "0.6.0"
default_locales = ["global"]

[[recognizers]]
id = "name.forward_marker"
class = "Name"
enabled = true

[recognizers.match]
kind = "anchored_match"
{cues_bucket}{boundary}{right_window_chars}{name_shape}{cue_position}
"#
        )
    }

    /// Asserts that `err` is `UnsupportedField` for `field` with a planned
    /// version of "v0.4.1"; on mismatch the panic message shows the actual error.
    fn assert_unsupported_field(err: RulepackError, field: &str) {
        assert!(
            matches!(
                err,
                RulepackError::UnsupportedField {
                    field: ref actual,
                    planned_version: "v0.4.1",
                } if actual == field
            ),
            "expected UnsupportedField for {:?} planned for v0.4.1, got {:?}",
            field,
            err
        );
    }

    /// Asserts that `err` is `UnsupportedAnchoredMatch` with the given field
    /// and value; on mismatch the panic message shows the actual error.
    fn assert_unsupported_anchored_match(err: RulepackError, field: &str, value: &str) {
        assert!(
            matches!(
                err,
                RulepackError::UnsupportedAnchoredMatch {
                    field: ref actual_field,
                    value: ref actual_value,
                } if actual_field == field && actual_value == value
            ),
            "expected UnsupportedAnchoredMatch {{ field: {:?}, value: {:?} }}, got {:?}",
            field,
            value,
            err
        );
    }
}