1use std::collections::{BTreeSet, HashMap};
2use std::path::PathBuf;
3
4use regex::Regex;
5use serde::Deserialize;
6use thiserror::Error;
7
8use crate::{LocaleTag, PiiClass};
9
/// Prefix a rulepack's `schema_version` must start with to be accepted,
/// i.e. any `0.1.x` version.
const SUPPORTED_SCHEMA_MAJOR_MINOR: &str = "0.1.";
11
/// A fully parsed and validated rulepack.
///
/// Produced from TOML text by [`Rulepack::parse`] / [`Rulepack::load`].
#[derive(Debug, Clone, PartialEq)]
pub struct Rulepack {
    /// Schema version declared by the rulepack; loading rejects anything that is not `0.1.x`.
    pub schema_version: String,
    /// Identifier of the rulepack as declared in the TOML.
    pub rulepack_id: String,
    /// Version of the rulepack as declared in the TOML.
    pub rulepack_version: String,
    /// Locales applied to every recognizer that declares no `locales` of its own.
    pub default_locales: Vec<LocaleTag>,
    /// Named locale buckets from the optional `[locale]` table.
    pub locale: Option<LocaleData>,
    /// Recognizers in declaration order.
    pub recognizers: Vec<RecognizerSpec>,
}
21
/// Validated description of a single recognizer inside a [`Rulepack`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct RecognizerSpec {
    /// Recognizer identifier as declared in the rulepack.
    pub id: String,
    /// PII class this recognizer emits.
    pub class: PiiClass,
    /// Ids of recognizers of the same class this one is declared to cooperate with.
    /// Two same-class recognizers must reference each other in at least one direction.
    pub cooperates_with: Vec<String>,
    /// Whether the recognizer is enabled; defaults to `true` when omitted in the TOML.
    pub enabled: bool,
    /// Locales of the recognizer; falls back to the rulepack's default locales when
    /// the TOML lists none.
    pub locales: Vec<LocaleTag>,
    /// Matcher definition taken from the TOML `match` table.
    pub matcher: RawMatch,
    /// Optional context settings.
    pub context: Option<ContextSpec>,
    /// Optional validator reference.
    pub validator: Option<ValidatorSpec>,
    /// Optional normalizer reference.
    pub normalizer: Option<NormalizerSpec>,
    /// Scoring settings; base `0.70` and priority `0` when the TOML omits the table.
    pub scoring: ScoringSpec,
    /// Token settings.
    pub token: TokenSpec,
    /// Optional provenance information.
    pub source: Option<SourceSpec>,
}
38
/// Matcher definition of a recognizer, deserialized from the TOML `match` table.
///
/// The variant is selected by the `kind` key (snake_case variant name); unknown
/// keys are rejected by serde.
#[derive(Debug, Clone, PartialEq, Deserialize)]
#[serde(tag = "kind", deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum RawMatch {
    /// `kind = "regex"`. Exactly one of `pattern` and `pattern_template` must be set;
    /// this is enforced when the recognizer is validated, not by serde.
    Regex {
        /// Literal regex pattern; compiled during validation.
        #[serde(default)]
        pattern: Option<String>,
        /// Pattern template, as an alternative to `pattern`.
        #[serde(default)]
        pattern_template: Option<String>,
        /// Optional list of capture group indices.
        #[serde(default)]
        capture_groups: Option<Vec<u32>>,
    },
    /// `kind = "dictionary"`.
    Dictionary {
        /// Inline terms; empty when omitted.
        #[serde(default)]
        terms: Vec<String>,
        /// Optional reference to a terms file.
        #[serde(default)]
        terms_file: Option<String>,
        /// Optional reference to a context-provided term source.
        #[serde(default)]
        terms_from_context: Option<String>,
        /// Case sensitivity flag; `false` when omitted.
        #[serde(default)]
        case_sensitive: bool,
    },
    /// `kind = "ner"`.
    Ner {
        /// Reference to the model to use.
        model_ref: String,
    },
    /// `kind = "anchored_match"`. All fields are required and validated after parsing.
    AnchoredMatch {
        /// Name of the locale bucket holding the cues; must not be blank.
        cues_bucket: String,
        /// One of `punctuation`, `whitespace`, `line_end`.
        boundary: String,
        /// Must lie in `1..=512`.
        right_window_chars: u16,
        /// Only `person_name` is accepted.
        name_shape: String,
        /// One of `before`, `after`.
        cue_position: String,
    },
}
72
/// Boundary kinds for anchored matching; deserialized from the snake_case names
/// `punctuation`, `whitespace` and `line_end`.
///
/// NOTE(review): `RawMatch::AnchoredMatch` carries its boundary as a plain string in
/// the code visible here; where this enum is consumed should be confirmed.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum AnchoredBoundary {
    Punctuation,
    Whitespace,
    LineEnd,
}
81
/// Name shapes for anchored matching; deserialized from the snake_case name
/// `person_name`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum NameShape {
    PersonName,
}
96
/// Position of a cue for anchored matching; deserialized from the snake_case names
/// `before` and `after`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum CuePosition {
    Before,
    After,
}
104
/// Context settings of a recognizer.
///
/// The loader currently rejects rulepacks that set `hotwords`, `window` or `boost`,
/// so in a successfully loaded rulepack only `exclusions` can be populated.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct ContextSpec {
    /// Hotword list (must be empty for the rulepack to load).
    pub hotwords: Vec<String>,
    /// Context window (must be unset for the rulepack to load).
    pub window: Option<u16>,
    /// Score boost (must be unset for the rulepack to load).
    pub boost: Option<f32>,
    /// Exclusion entries as declared in the rulepack.
    pub exclusions: Vec<String>,
}
113
/// Reference to a validator by kind name, copied verbatim from the rulepack.
#[derive(Debug, Clone, PartialEq)]
pub struct ValidatorSpec {
    /// Validator kind name (not checked by the loader code shown here).
    pub kind: String,
}
118
/// Reference to a normalizer by kind name, copied verbatim from the rulepack.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizerSpec {
    /// Normalizer kind name (not checked by the loader code shown here).
    pub kind: String,
}
123
/// Scoring settings of a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct ScoringSpec {
    /// Base score; `0.70` when not given in the rulepack.
    pub base: f32,
    /// Priority; `0` when not given in the rulepack.
    pub priority: i32,
}
129
/// Token settings of a recognizer.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct TokenSpec {
    /// Optional token family name.
    pub family: Option<String>,
    /// Optional token format; a non-empty value is currently rejected at load time.
    pub format: Option<String>,
}
136
/// Provenance information of a recognizer, copied verbatim from the rulepack.
#[derive(Debug, Clone, PartialEq)]
pub struct SourceSpec {
    /// Origin label (required in the TOML `source` table).
    pub origin: String,
    /// Optional name of what the recognizer was derived from.
    pub from: Option<String>,
    /// Optional license identifier.
    pub license: Option<String>,
}
143
/// Locale data of a rulepack: named buckets taken from the TOML `[locale]` table.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct LocaleData {
    /// Buckets keyed by their name under `[locale]` (for example `email_headers`).
    pub buckets: HashMap<String, LocaleBucket>,
}
148
/// A single named locale bucket.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleBucket {
    /// Entries of the bucket, in the order given in the rulepack.
    pub names: Vec<String>,
}
153
/// Where [`Rulepack::load`] obtains the rulepack TOML from.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum RulepackSource {
    /// TOML text that is already in memory.
    Embedded(&'static str),
    /// Path of a TOML file that is read as UTF-8 text.
    Path(PathBuf),
}
160
/// Errors produced while loading, parsing or validating a rulepack.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum RulepackError {
    /// Reading the rulepack file failed.
    #[error("failed to read rulepack: {0}")]
    Io(#[source] std::io::Error),
    /// The rulepack text is not valid TOML or does not match the expected structure.
    #[error("failed to parse rulepack TOML: {0}")]
    Toml(#[source] toml::de::Error),
    /// The declared `schema_version` is outside the supported range.
    #[error("unsupported rulepack schema_version {found}; supported {supported}")]
    SchemaVersion { found: String, supported: String },
    /// The `class` string of a recognizer is not a known or well-formed class.
    #[error("unknown pii class: {0}")]
    UnknownClass(String),
    /// A locale string could not be parsed.
    #[error("unknown locale: {0}")]
    UnknownLocale(String),
    #[error("unsupported matcher kind: {0}")]
    UnsupportedMatcher(String),
    /// An `anchored_match` field (or a cue of its locale bucket) has an unsupported value.
    #[error("unsupported anchored_match field '{field}' value '{value}'")]
    UnsupportedAnchoredMatch { field: String, value: String },
    /// A field that exists in the schema but is not yet supported was set.
    #[error("unsupported rulepack field '{field}' in B1; planned for {planned_version}")]
    UnsupportedField {
        field: String,
        planned_version: &'static str,
    },
    #[error("unsupported validator kind: {kind}")]
    UnsupportedValidator { kind: String },
    #[error("unsupported normalizer kind: {kind}")]
    UnsupportedNormalizer { kind: String },
    #[error("unsupported rule spec variant: {variant}")]
    UnsupportedRuleSpec { variant: String },
    #[error("duplicate recognizer id '{id}' in rulepacks '{first_pack}' and '{second_pack}'")]
    DuplicateId {
        id: String,
        first_pack: String,
        second_pack: String,
    },
    /// A regex matcher set both or neither of `pattern` and `pattern_template`.
    #[error("regex recognizer '{id}' must define exactly one of pattern or pattern_template")]
    RegexPatternChoice { id: String },
    /// The `pattern` of a regex matcher failed to compile.
    #[error("invalid regex for recognizer '{id}': {source}")]
    RegexCompile {
        id: String,
        #[source]
        source: regex::Error,
    },
    /// The compiled pattern was rejected by the token-shape shadow check.
    #[error(
        "regex recognizer '{id}' shadows Gaze token shape sample '{shadowed_shape}' with pattern '{pattern}'"
    )]
    TokenShapeShadow {
        id: String,
        pattern: String,
        shadowed_shape: String,
    },
    #[error("unknown pattern_template placeholder '{placeholder}' in recognizer '{id}'")]
    UnknownPatternTemplatePlaceholder { id: String, placeholder: String },
    #[error(
        "context class_map override for dictionary '{dict}' changes {old_class:?} to {new_class:?}, but {uncovered_rule}"
    )]
    ClassMapOverrideClash {
        dict: String,
        old_class: PiiClass,
        new_class: PiiClass,
        uncovered_rule: String,
    },
    /// Two recognizers emit the same class and neither lists the other in `cooperates_with`.
    #[error(
        "same-class recognizers '{recognizer_a}' and '{recognizer_b}' both emit {class:?} but neither declares cooperates_with"
    )]
    SameClassWithoutCooperation {
        class: PiiClass,
        recognizer_a: String,
        recognizer_b: String,
    },
    /// Raised by the locale-projection lint when `strict_locale_overlap` is enabled.
    #[error(
        "recognizers {recognizer_ids:?} share class {class:?} with equivalent regex shape and overlapping locale projection {locale_overlap:?}"
    )]
    ConflictingLocaleProjection {
        class: PiiClass,
        recognizer_ids: Vec<String>,
        locale_overlap: Vec<LocaleTag>,
    },
}
239
240impl Rulepack {
241 pub fn load(source: RulepackSource) -> Result<Rulepack, RulepackError> {
242 let raw = match source {
243 RulepackSource::Embedded(contents) => contents.to_string(),
244 RulepackSource::Path(path) => {
245 std::fs::read_to_string(path).map_err(RulepackError::Io)?
246 }
247 };
248 Self::parse(&raw)
249 }
250
251 pub fn parse(raw: &str) -> Result<Rulepack, RulepackError> {
252 let (raw, lint) = extract_recognizer_lint_config(raw);
253 let raw: RawRulepack = toml::from_str(&raw).map_err(RulepackError::Toml)?;
254 RawRulepackWithLint { raw, lint }.try_into()
255 }
256
257 pub fn activated_classes(&self) -> BTreeSet<PiiClass> {
258 self.recognizers
259 .iter()
260 .filter(|recognizer| recognizer.enabled)
261 .map(|recognizer| recognizer.class.clone())
262 .collect()
263 }
264}
265
/// Serde mirror of the rulepack TOML document; unknown top-level keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRulepack {
    schema_version: String,
    rulepack_id: String,
    rulepack_version: String,
    /// Unparsed locale strings; empty when omitted.
    #[serde(default)]
    default_locales: Vec<String>,
    /// Optional `[locale]` table.
    #[serde(default)]
    locale: Option<RawLocaleData>,
    /// `[[recognizers]]` entries; empty when omitted.
    #[serde(default)]
    recognizers: Vec<RawRecognizerSpec>,
}
279
/// Lint options read from the `[recognizers.lint]` section of a rulepack.
#[derive(Debug, Default)]
struct RawRecognizerLintConfig {
    /// When set, a locale-projection collision is an error instead of a warning.
    strict_locale_overlap: bool,
}
284
/// A deserialized rulepack paired with the lint options to validate it with.
#[derive(Debug)]
struct RawRulepackWithLint {
    raw: RawRulepack,
    lint: RawRecognizerLintConfig,
}
290
/// Serde mirror of the `[locale]` table; every key in it becomes a bucket.
#[derive(Debug, Deserialize)]
struct RawLocaleData {
    #[serde(flatten)]
    buckets: HashMap<String, RawLocaleBucket>,
}
296
/// Serde mirror of one locale bucket; `names` is required and no other key is allowed.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawLocaleBucket {
    names: Vec<String>,
}
302
/// Serde mirror of one `[[recognizers]]` entry; unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRecognizerSpec {
    id: String,
    /// Unparsed class string.
    class: String,
    #[serde(default)]
    cooperates_with: Vec<String>,
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    enabled: bool,
    /// Unparsed locale strings; empty means "use the rulepack's default locales".
    #[serde(default)]
    locales: Vec<String>,
    /// The TOML key is `match`, which is a Rust keyword, hence the rename.
    #[serde(rename = "match")]
    matcher: RawMatch,
    #[serde(default)]
    context: Option<RawContextSpec>,
    #[serde(default)]
    validator: Option<RawValidatorSpec>,
    #[serde(default)]
    normalizer: Option<RawNormalizerSpec>,
    #[serde(default)]
    scoring: Option<RawScoringSpec>,
    /// Defaults to a token spec with both fields unset.
    #[serde(default)]
    token: RawTokenSpec,
    #[serde(default)]
    source: Option<RawSourceSpec>,
}
329
/// Serde mirror of `[recognizers.context]`; every key is optional.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawContextSpec {
    #[serde(default)]
    hotwords: Vec<String>,
    #[serde(default)]
    window: Option<u16>,
    #[serde(default)]
    boost: Option<f32>,
    #[serde(default)]
    exclusions: Vec<String>,
}
342
/// Serde mirror of `[recognizers.validator]`.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawValidatorSpec {
    kind: String,
}
348
/// Serde mirror of `[recognizers.normalizer]`.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawNormalizerSpec {
    kind: String,
}
354
/// Serde mirror of `[recognizers.scoring]`.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawScoringSpec {
    /// Defaults to `default_base_score()` when omitted.
    #[serde(default = "default_base_score")]
    base: f32,
    /// Defaults to `0` when omitted.
    #[serde(default)]
    priority: i32,
}
363
/// Serde mirror of `[recognizers.token]`; both keys are optional.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawTokenSpec {
    #[serde(default)]
    family: Option<String>,
    #[serde(default)]
    format: Option<String>,
}
372
/// Serde mirror of `[recognizers.source]`; only `origin` is required.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawSourceSpec {
    origin: String,
    #[serde(default)]
    from: Option<String>,
    #[serde(default)]
    license: Option<String>,
}
382
383impl TryFrom<RawRulepack> for Rulepack {
384 type Error = RulepackError;
385
386 fn try_from(raw: RawRulepack) -> Result<Self, Self::Error> {
387 RawRulepackWithLint {
388 raw,
389 lint: RawRecognizerLintConfig::default(),
390 }
391 .try_into()
392 }
393}
394
395impl TryFrom<RawRulepackWithLint> for Rulepack {
396 type Error = RulepackError;
397
398 fn try_from(raw_with_lint: RawRulepackWithLint) -> Result<Self, Self::Error> {
399 let raw = raw_with_lint.raw;
400 if !raw.schema_version.starts_with(SUPPORTED_SCHEMA_MAJOR_MINOR) {
401 return Err(RulepackError::SchemaVersion {
402 found: raw.schema_version,
403 supported: "~0.1.x".to_string(),
404 });
405 }
406
407 let default_locales = parse_locales(raw.default_locales)?;
408 let recognizers = raw
409 .recognizers
410 .into_iter()
411 .map(|recognizer| parse_recognizer(recognizer, &default_locales))
412 .collect::<Result<Vec<_>, _>>()?;
413 validate_rulepack_recognizers(&recognizers, &default_locales, &raw_with_lint.lint)?;
414 let locale = raw.locale.map(LocaleData::from);
415 reject_anchored_match_ellipsis_cues(&recognizers, locale.as_ref())?;
416
417 Ok(Self {
418 schema_version: raw.schema_version,
419 rulepack_id: raw.rulepack_id,
420 rulepack_version: raw.rulepack_version,
421 default_locales,
422 locale,
423 recognizers,
424 })
425 }
426}
427
428fn extract_recognizer_lint_config(raw: &str) -> (String, RawRecognizerLintConfig) {
429 let mut sanitized = String::with_capacity(raw.len());
430 let mut lint = RawRecognizerLintConfig::default();
431 let mut in_lint = false;
432
433 for line in raw.lines() {
434 let trimmed = line.trim();
435 if trimmed == "[recognizers.lint]" {
436 in_lint = true;
437 continue;
438 }
439 if in_lint && trimmed.starts_with('[') {
440 in_lint = false;
441 }
442 if in_lint {
443 if let Some((key, value)) = trimmed.split_once('=') {
444 if key.trim() == "strict_locale_overlap" {
445 lint.strict_locale_overlap = value.trim().eq_ignore_ascii_case("true");
446 }
447 }
448 continue;
449 }
450 sanitized.push_str(line);
451 sanitized.push('\n');
452 }
453
454 (sanitized, lint)
455}
456
457impl From<RawLocaleData> for LocaleData {
458 fn from(raw: RawLocaleData) -> Self {
459 Self {
460 buckets: raw
461 .buckets
462 .into_iter()
463 .map(|(name, bucket)| {
464 (
465 name,
466 LocaleBucket {
467 names: bucket.names,
468 },
469 )
470 })
471 .collect(),
472 }
473 }
474}
475
476fn parse_recognizer(
477 raw: RawRecognizerSpec,
478 default_locales: &[LocaleTag],
479) -> Result<RecognizerSpec, RulepackError> {
480 reject_unshipped_fields(&raw)?;
481 validate_matcher(&raw)?;
482 let locales = if raw.locales.is_empty() {
483 default_locales.to_vec()
484 } else {
485 parse_locales(raw.locales)?
486 };
487
488 Ok(RecognizerSpec {
489 id: raw.id,
490 class: parse_class(&raw.class)?,
491 cooperates_with: raw.cooperates_with,
492 enabled: raw.enabled,
493 locales,
494 matcher: raw.matcher,
495 context: raw.context.map(|context| ContextSpec {
496 hotwords: context.hotwords,
497 window: context.window,
498 boost: context.boost,
499 exclusions: context.exclusions,
500 }),
501 validator: raw.validator.map(|validator| ValidatorSpec {
502 kind: validator.kind,
503 }),
504 normalizer: raw.normalizer.map(|normalizer| NormalizerSpec {
505 kind: normalizer.kind,
506 }),
507 scoring: raw.scoring.map_or_else(
508 || ScoringSpec {
509 base: default_base_score(),
510 priority: 0,
511 },
512 |scoring| ScoringSpec {
513 base: scoring.base,
514 priority: scoring.priority,
515 },
516 ),
517 token: TokenSpec {
518 family: raw.token.family,
519 format: raw.token.format,
520 },
521 source: raw.source.map(|source| SourceSpec {
522 origin: source.origin,
523 from: source.from,
524 license: source.license,
525 }),
526 })
527}
528
529fn validate_matcher(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
530 match &raw.matcher {
531 RawMatch::Regex {
532 pattern,
533 pattern_template,
534 ..
535 } => {
536 if pattern.is_some() == pattern_template.is_some() {
537 return Err(RulepackError::RegexPatternChoice { id: raw.id.clone() });
538 }
539 if let Some(pattern) = pattern {
540 let compiled =
541 Regex::new(pattern).map_err(|source| RulepackError::RegexCompile {
542 id: raw.id.clone(),
543 source,
544 })?;
545 crate::token_shape::reject_if_shadows_token_shape(&compiled, &raw.id).map_err(
546 |shadow| RulepackError::TokenShapeShadow {
547 id: shadow.recognizer_id,
548 pattern: shadow.offending_pattern,
549 shadowed_shape: shadow.shadowed_shape,
550 },
551 )?;
552 }
553 }
554 RawMatch::AnchoredMatch {
555 cues_bucket,
556 boundary,
557 right_window_chars,
558 name_shape,
559 cue_position,
560 ..
561 } => {
562 if cues_bucket.trim().is_empty() {
563 return Err(RulepackError::UnsupportedAnchoredMatch {
564 field: "cues_bucket".to_string(),
565 value: cues_bucket.clone(),
566 });
567 }
568 if !(1..=512).contains(right_window_chars) {
569 return Err(RulepackError::UnsupportedAnchoredMatch {
570 field: "right_window_chars".to_string(),
571 value: right_window_chars.to_string(),
572 });
573 }
574 if !matches!(boundary.as_str(), "punctuation" | "whitespace" | "line_end") {
575 return Err(RulepackError::UnsupportedAnchoredMatch {
576 field: "boundary".to_string(),
577 value: boundary.clone(),
578 });
579 }
580 if name_shape != "person_name" {
581 return Err(RulepackError::UnsupportedAnchoredMatch {
582 field: "name_shape".to_string(),
583 value: name_shape.clone(),
584 });
585 }
586 if !matches!(cue_position.as_str(), "before" | "after") {
587 return Err(RulepackError::UnsupportedAnchoredMatch {
588 field: "cue_position".to_string(),
589 value: cue_position.clone(),
590 });
591 }
592 }
593 RawMatch::Dictionary { .. } | RawMatch::Ner { .. } => {}
594 }
595 Ok(())
596}
597
598fn reject_anchored_match_ellipsis_cues(
599 recognizers: &[RecognizerSpec],
600 locale: Option<&LocaleData>,
601) -> Result<(), RulepackError> {
602 let Some(locale) = locale else {
603 return Ok(());
604 };
605 for recognizer in recognizers {
606 let RawMatch::AnchoredMatch { cues_bucket, .. } = &recognizer.matcher else {
607 continue;
608 };
609 let Some(bucket) = locale.buckets.get(cues_bucket) else {
610 continue;
611 };
612 if let Some(cue) = bucket.names.iter().find(|cue| cue.contains("...")) {
613 return Err(RulepackError::UnsupportedAnchoredMatch {
614 field: format!("locale.{cues_bucket}.names"),
615 value: cue.clone(),
616 });
617 }
618 }
619 Ok(())
620}
621
622fn reject_unshipped_fields(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
623 const PLANNED_VERSION: &str = "v0.4.1";
624
625 if raw
626 .token
627 .format
628 .as_deref()
629 .is_some_and(|value| !value.is_empty())
630 {
631 return Err(RulepackError::UnsupportedField {
632 field: "token.format".to_string(),
633 planned_version: PLANNED_VERSION,
634 });
635 }
636 if let Some(context) = &raw.context {
637 if !context.hotwords.is_empty() {
638 return Err(RulepackError::UnsupportedField {
639 field: "context.hotwords".to_string(),
640 planned_version: PLANNED_VERSION,
641 });
642 }
643 if context.boost.is_some() {
644 return Err(RulepackError::UnsupportedField {
645 field: "context.boost".to_string(),
646 planned_version: PLANNED_VERSION,
647 });
648 }
649 if context.window.is_some() {
650 return Err(RulepackError::UnsupportedField {
651 field: "context.window".to_string(),
652 planned_version: PLANNED_VERSION,
653 });
654 }
655 }
656 Ok(())
657}
658
659pub fn recognizer_composition_validator(
660 recognizers: &[RecognizerSpec],
661) -> Result<(), RulepackError> {
662 for (index, first) in recognizers.iter().enumerate() {
663 for second in recognizers.iter().skip(index + 1) {
664 if first.class != second.class {
665 continue;
666 }
667 if first.cooperates_with.iter().any(|id| id == &second.id)
668 || second.cooperates_with.iter().any(|id| id == &first.id)
669 {
670 continue;
671 }
672 return Err(RulepackError::SameClassWithoutCooperation {
673 class: first.class.clone(),
674 recognizer_a: first.id.clone(),
675 recognizer_b: second.id.clone(),
676 });
677 }
678 }
679 Ok(())
680}
681
682fn validate_rulepack_recognizers(
683 recognizers: &[RecognizerSpec],
684 active_locales: &[LocaleTag],
685 lint: &RawRecognizerLintConfig,
686) -> Result<(), RulepackError> {
687 recognizer_composition_validator(recognizers)?;
688 lint_locale_projection_collisions(recognizers, active_locales, lint)?;
689 lint_global_naked_patterns(recognizers);
690 Ok(())
691}
692
693fn lint_locale_projection_collisions(
694 recognizers: &[RecognizerSpec],
695 active_locales: &[LocaleTag],
696 lint: &RawRecognizerLintConfig,
697) -> Result<(), RulepackError> {
698 for (index, first) in recognizers.iter().enumerate() {
699 if !first.enabled {
700 continue;
701 }
702 let Some(first_shape) = regex_structural_shape(&first.matcher) else {
703 continue;
704 };
705 if !is_truly_naked_numeric(&first.matcher) {
706 continue;
707 }
708 let first_projection = locale_projection(&first.locales, active_locales);
709 if first_projection.is_empty() {
710 continue;
711 }
712
713 for second in recognizers.iter().skip(index + 1) {
714 if !second.enabled || first.class != second.class {
715 continue;
716 }
717 if !is_truly_naked_numeric(&second.matcher) {
718 continue;
719 }
720 if regex_structural_shape(&second.matcher).as_ref() != Some(&first_shape) {
721 continue;
722 }
723 let second_projection = locale_projection(&second.locales, active_locales);
724 if second_projection.is_empty() {
725 continue;
726 }
727
728 let recognizer_ids = vec![first.id.clone(), second.id.clone()];
729 let locale_overlap = merged_locale_projection(&first_projection, &second_projection);
730 if lint.strict_locale_overlap {
731 return Err(RulepackError::ConflictingLocaleProjection {
732 class: first.class.clone(),
733 recognizer_ids,
734 locale_overlap,
735 });
736 }
737 tracing::warn!(
738 class = %first.class.class_name(),
739 recognizer_ids = ?recognizer_ids,
740 locale_overlap = ?locale_overlap,
741 "recognizers share class with naked-shape regex and non-disjoint locale projection"
742 );
743 }
744 }
745 Ok(())
746}
747
748fn lint_global_naked_patterns(recognizers: &[RecognizerSpec]) {
749 for recognizer in recognizers {
750 if !recognizer.enabled || recognizer.locales != [LocaleTag::Global] {
751 continue;
752 }
753 let Some(shape) = regex_structural_shape(&recognizer.matcher) else {
754 continue;
755 };
756 let RawMatch::Regex {
757 pattern: Some(pattern),
758 ..
759 } = &recognizer.matcher
760 else {
761 continue;
762 };
763 if shape.minimum_match_len < 6 && !has_regex_separator(pattern) {
764 tracing::warn!(
765 recognizer_id = %recognizer.id,
766 class = %recognizer.class.class_name(),
767 minimum_match_len = shape.minimum_match_len,
768 "global recognizer uses short naked regex shape"
769 );
770 }
771 }
772}
773
/// Coarse structural summary of a regex pattern, used to compare recognizers in lints.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RegexStructuralShape {
    /// Lower bound of the first digit quantifier found in the pattern.
    minimum_match_len: usize,
    /// Character class the quantifier applies to.
    character_class: RegexCharacterClass,
}
779
/// Character classes distinguished by [`RegexStructuralShape`]; only digits so far.
#[derive(Debug, Clone, PartialEq, Eq)]
enum RegexCharacterClass {
    Digit,
}
784
785fn regex_structural_shape(matcher: &RawMatch) -> Option<RegexStructuralShape> {
786 let RawMatch::Regex {
787 pattern: Some(pattern),
788 pattern_template: None,
789 ..
790 } = matcher
791 else {
792 return None;
793 };
794 if has_unescaped_line_anchor(pattern) {
795 return None;
796 }
797 digit_quantifier_minimum(pattern).map(|minimum_match_len| RegexStructuralShape {
798 minimum_match_len,
799 character_class: RegexCharacterClass::Digit,
800 })
801}
802
803fn is_truly_naked_numeric(matcher: &RawMatch) -> bool {
804 let RawMatch::Regex {
805 pattern: Some(pattern),
806 ..
807 } = matcher
808 else {
809 return false;
810 };
811
812 let mut chars = pattern.chars();
813 while let Some(ch) = chars.next() {
814 if ch == '\\' {
815 chars.next();
816 continue;
817 }
818 if ch.is_ascii_alphabetic() {
819 return false;
820 }
821 }
822 true
823}
824
/// Returns `true` when `pattern` contains a `^` or `$` that is neither escaped
/// with a backslash nor located inside a `[...]` character class.
fn has_unescaped_line_anchor(pattern: &str) -> bool {
    let mut inside_class = false;
    let mut chars = pattern.chars();
    while let Some(current) = chars.next() {
        match current {
            // Consume the escaped character so it is never inspected.
            '\\' => {
                chars.next();
            }
            '[' => inside_class = true,
            ']' => inside_class = false,
            '^' | '$' if !inside_class => return true,
            _ => {}
        }
    }
    false
}
843
844fn digit_quantifier_minimum(pattern: &str) -> Option<usize> {
845 find_digit_quantifier(pattern, r"\d{")
846 .or_else(|| find_digit_quantifier(pattern, "[0-9]{"))
847 .or_else(|| find_digit_quantifier(pattern, "[[:digit:]]{"))
848}
849
/// Reads the decimal number that directly follows the first occurrence of
/// `needle` in `pattern`.
///
/// Returns `None` when `needle` does not occur, when no digit follows it, or
/// when the number does not fit into `usize`. Later occurrences of `needle`
/// are not considered.
fn find_digit_quantifier(pattern: &str, needle: &str) -> Option<usize> {
    let (_, tail) = pattern.split_once(needle)?;
    let digits_end = tail
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(tail.len());
    // Parsing an empty string fails, which covers the "no digits" case.
    tail[..digits_end].parse().ok()
}
862
863fn locale_projection(locales: &[LocaleTag], active_locales: &[LocaleTag]) -> Vec<LocaleTag> {
864 let mut projection = Vec::new();
865 for locale in locales {
866 if *locale == LocaleTag::Global {
867 projection.push(LocaleTag::Global);
868 } else if active_locales.iter().any(|active| active == locale) {
869 projection.push(locale.clone());
870 }
871 }
872 projection
873}
874
875fn merged_locale_projection(left: &[LocaleTag], right: &[LocaleTag]) -> Vec<LocaleTag> {
876 let mut merged = Vec::new();
877 for locale in left.iter().chain(right) {
878 if !merged.iter().any(|existing| existing == locale) {
879 merged.push(locale.clone());
880 }
881 }
882 merged
883}
884
/// Returns `true` when `pattern` mentions a separator: a literal hyphen, `/`,
/// `.`, `+`, `\s` or `[:space:]`.
///
/// A hyphen that acts as a range operator inside a character class (as in
/// `[0-9]`) is not a separator. Counting it would mean that a pattern such as
/// `[0-9]{4}` is always reported as "separated", which would hide it from the
/// short-naked-pattern lint. Escaped hyphens, hyphens outside a class, and
/// hyphens at the start or end of a class are literal and still count.
fn has_regex_separator(pattern: &str) -> bool {
    /// Scans `pattern` for a hyphen that is matched literally.
    fn has_literal_hyphen(pattern: &str) -> bool {
        let chars: Vec<char> = pattern.chars().collect();
        // Index of the first member position of the class being scanned, if any.
        let mut first_member: Option<usize> = None;
        let mut index = 0;
        while index < chars.len() {
            match chars[index] {
                '\\' => {
                    if chars.get(index + 1) == Some(&'-') {
                        return true;
                    }
                    // Skip the escaped character as well.
                    index += 2;
                    continue;
                }
                '[' if first_member.is_none() => {
                    let negated = chars.get(index + 1) == Some(&'^');
                    first_member = Some(index + 1 + usize::from(negated));
                }
                // A `]` in first member position is a literal, not the class end.
                ']' if first_member.is_some_and(|first| index > first) => first_member = None,
                '-' => {
                    let is_range_operator = first_member.is_some_and(|first| {
                        index > first && chars.get(index + 1) != Some(&']')
                    });
                    if !is_range_operator {
                        return true;
                    }
                }
                _ => {}
            }
            index += 1;
        }
        false
    }

    has_literal_hyphen(pattern)
        || pattern.contains('/')
        || pattern.contains('.')
        || pattern.contains('+')
        || pattern.contains("\\s")
        || pattern.contains("[:space:]")
}
893
894pub fn parse_class(input: &str) -> Result<PiiClass, RulepackError> {
895 let trimmed = input.trim();
896 let lower = trimmed.to_ascii_lowercase();
897 match lower.as_str() {
898 "email" => Ok(PiiClass::Email),
899 "name" => Ok(PiiClass::Name),
900 "location" => Ok(PiiClass::Location),
901 "organization" => Ok(PiiClass::Organization),
902 custom if custom.starts_with("custom:") => {
903 let name = trimmed
904 .split_once(':')
905 .map(|(_, name)| name)
906 .unwrap_or_default();
907 if name.trim().is_empty() {
908 return Err(RulepackError::UnknownClass(input.to_string()));
909 }
910 Ok(PiiClass::custom(name))
911 }
912 _ => Err(RulepackError::UnknownClass(input.to_string())),
913 }
914}
915
916fn parse_locales(locales: Vec<String>) -> Result<Vec<LocaleTag>, RulepackError> {
917 locales
918 .into_iter()
919 .map(|locale| {
920 LocaleTag::parse(&locale).map_err(|_| RulepackError::UnknownLocale(locale.clone()))
921 })
922 .collect()
923}
924
/// Serde default for boolean fields that are on unless the rulepack says otherwise.
fn default_true() -> bool {
    true
}
928
/// Base score used when a recognizer does not specify one.
fn default_base_score() -> f32 {
    const DEFAULT_BASE_SCORE: f32 = 0.70;
    DEFAULT_BASE_SCORE
}
932
933#[cfg(test)]
934mod tests {
935 use super::*;
936
937 const CORE: &str = r#"
938schema_version = "0.1.0"
939rulepack_id = "gaze-core"
940rulepack_version = "0.4.0"
941default_locales = ["global"]
942
943[locale.email_headers]
944names = ["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
945
946[[recognizers]]
947id = "email.global"
948class = "Email"
949enabled = true
950locales = ["global"]
951
952[recognizers.match]
953kind = "regex"
954pattern = '''(?i)\b[a-z0-9._%+\-]+@(?:(?:[a-z0-9\-]+\.)*example\.invalid|test\.local|[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?\.(?:com|org|net|edu|gov|de|uk|fr|nl|io|ai|co))\b'''
955
956[recognizers.context]
957exclusions = ["example.com"]
958
959[recognizers.validator]
960kind = "email_rfc"
961
962[recognizers.normalizer]
963kind = "email_canonical"
964
965[recognizers.scoring]
966base = 0.70
967priority = 90
968
969[recognizers.token]
970
971[recognizers.source]
972origin = "ported"
973from = "presidio"
974license = "Apache-2.0"
975"#;
976
977 #[test]
978 fn parses_core_rulepack_end_to_end() {
979 let rulepack = Rulepack::parse(CORE).expect("core rulepack");
980
981 assert_eq!(rulepack.rulepack_id, "gaze-core");
982 assert_eq!(rulepack.default_locales, vec![LocaleTag::Global]);
983 let header_names = &rulepack
984 .locale
985 .as_ref()
986 .and_then(|locale| locale.buckets.get("email_headers"))
987 .expect("email headers")
988 .names;
989 assert_eq!(
990 header_names,
991 &vec!["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
992 );
993 assert_eq!(rulepack.recognizers.len(), 1);
994 let recognizer = &rulepack.recognizers[0];
995 assert_eq!(recognizer.id, "email.global");
996 assert_eq!(recognizer.class, PiiClass::Email);
997 assert_eq!(recognizer.scoring.priority, 90);
998 assert!(matches!(recognizer.matcher, RawMatch::Regex { .. }));
999 }
1000
1001 #[cfg(feature = "bundled-recognizers")]
1002 #[test]
1003 fn embedded_core_activated_classes_match_rulepack_classes() {
1004 let rulepack = Rulepack::load(RulepackSource::Embedded(
1005 gaze_recognizers::embedded("core").expect("core rulepack"),
1006 ))
1007 .expect("embedded core rulepack");
1008
1009 assert_eq!(
1010 rulepack.activated_classes(),
1011 BTreeSet::from([PiiClass::Email, PiiClass::Name])
1012 );
1013 }
1014
1015 #[cfg(feature = "bundled-recognizers")]
1016 #[test]
1017 fn embedded_core_loads_full_name_recognizer_cooperation_matrix() {
1018 let rulepack = Rulepack::load(RulepackSource::Embedded(
1019 gaze_recognizers::embedded("core").expect("core rulepack"),
1020 ))
1021 .expect("embedded core rulepack");
1022 let name_recognizers = rulepack
1023 .recognizers
1024 .iter()
1025 .filter(|recognizer| recognizer.class == PiiClass::Name)
1026 .collect::<Vec<_>>();
1027
1028 assert_eq!(name_recognizers.len(), 5);
1029 for recognizer in &name_recognizers {
1030 for peer in &name_recognizers {
1031 if recognizer.id == peer.id {
1032 continue;
1033 }
1034 assert!(
1035 recognizer.cooperates_with.contains(&peer.id),
1036 "{} missing cooperates_with {}",
1037 recognizer.id,
1038 peer.id
1039 );
1040 }
1041 }
1042 }
1043
1044 #[cfg(feature = "bundled-recognizers")]
1045 #[test]
1046 fn embedded_core_extended_activated_classes_match_rulepack_classes() {
1047 let rulepack = Rulepack::load(RulepackSource::Embedded(
1048 gaze_recognizers::embedded("core-extended").expect("core-extended rulepack"),
1049 ))
1050 .expect("embedded core-extended rulepack");
1051
1052 assert_eq!(
1053 rulepack.activated_classes(),
1054 BTreeSet::from([
1055 PiiClass::custom("phone"),
1056 PiiClass::custom("iban"),
1057 PiiClass::custom("credit_card"),
1058 PiiClass::custom("ip_address"),
1059 PiiClass::custom("eth_address"),
1060 PiiClass::custom("postal_code"),
1061 ])
1062 );
1063 }
1064
1065 #[cfg(feature = "bundled-recognizers")]
1066 #[test]
1067 fn activated_classes_include_new_rulepack_recognizer_class() {
1068 let raw = format!(
1069 r#"{}
1070
1071[[recognizers]]
1072id = "test.only"
1073class = "custom:test_only"
1074enabled = true
1075locales = ["global"]
1076
1077[recognizers.match]
1078kind = "regex"
1079pattern = "TEST_ONLY"
1080
1081[recognizers.scoring]
1082base = 0.70
1083priority = 1
1084"#,
1085 gaze_recognizers::embedded("core-extended").expect("core-extended rulepack")
1086 );
1087 let rulepack = Rulepack::parse(&raw).expect("core-extended with synthetic recognizer");
1088
1089 assert!(
1090 rulepack
1091 .activated_classes()
1092 .contains(&PiiClass::custom("test_only")),
1093 "new recognizer class must be derived from rulepack data"
1094 );
1095 }
1096
1097 #[test]
1098 fn rulepack_accepts_token_family() {
1099 let rulepack = Rulepack::parse(&unsupported_field_rulepack(
1100 "[recognizers.token]\nfamily = \"email.formatpreserve\"\n",
1101 ))
1102 .expect("token family is active in v0.4.1");
1103
1104 assert_eq!(
1105 rulepack.recognizers[0].token.family.as_deref(),
1106 Some("email.formatpreserve")
1107 );
1108 }
1109
1110 #[test]
1111 fn rulepack_rejects_unsupported_token_format() {
1112 let err = Rulepack::parse(&unsupported_field_rulepack(
1113 "[recognizers.token]\nformat = \"Customer_{n}\"\n",
1114 ))
1115 .expect_err("token format is reserved for v0.4.1");
1116
1117 assert_unsupported_field(err, "token.format");
1118 }
1119
1120 #[test]
1121 fn rulepack_rejects_unsupported_context_hotwords() {
1122 let err = Rulepack::parse(&unsupported_field_rulepack(
1123 "[recognizers.context]\nhotwords = [\"foo\"]\n",
1124 ))
1125 .expect_err("context hotwords are reserved for v0.4.1");
1126
1127 assert_unsupported_field(err, "context.hotwords");
1128 }
1129
1130 #[test]
1131 fn rulepack_rejects_unsupported_context_boost() {
1132 let err = Rulepack::parse(&unsupported_field_rulepack(
1133 "[recognizers.context]\nboost = 0.10\n",
1134 ))
1135 .expect_err("context boost is reserved for v0.4.1");
1136
1137 assert_unsupported_field(err, "context.boost");
1138 }
1139
1140 #[test]
1141 fn rulepack_rejects_unsupported_context_window() {
1142 let err = Rulepack::parse(&unsupported_field_rulepack(
1143 "[recognizers.context]\nwindow = 12\n",
1144 ))
1145 .expect_err("context window is reserved for v0.4.1");
1146
1147 assert_unsupported_field(err, "context.window");
1148 }
1149
1150 #[test]
1151 fn rulepack_accepts_default_token_fields() {
1152 let rulepack = Rulepack::parse(CORE).expect("reserved token/context fields are unset");
1153 let recognizer = &rulepack.recognizers[0];
1154
1155 assert_eq!(recognizer.token.family, None);
1156 assert_eq!(recognizer.token.format, None);
1157 assert!(recognizer.context.as_ref().unwrap().hotwords.is_empty());
1158 assert_eq!(recognizer.context.as_ref().unwrap().boost, None);
1159 assert_eq!(recognizer.context.as_ref().unwrap().window, None);
1160 }
1161
1162 #[test]
1163 fn pattern_template_with_pattern_both_present_fails_closed() {
1164 let err = Rulepack::parse(&unsupported_field_rulepack(
1165 "pattern_template = \"{locale_email_headers}: (.+)\"\n",
1166 ))
1167 .expect_err("pattern and pattern_template are mutually exclusive");
1168
1169 assert!(matches!(
1170 err,
1171 RulepackError::RegexPatternChoice { id } if id == "bad.email"
1172 ));
1173 }
1174
1175 #[test]
1176 fn regex_pattern_or_template_is_required() {
1177 let raw = r#"
1178schema_version = "0.1.0"
1179rulepack_id = "bad"
1180rulepack_version = "0.4.0"
1181default_locales = ["global"]
1182
1183[[recognizers]]
1184id = "bad.email"
1185class = "Email"
1186enabled = true
1187
1188[recognizers.match]
1189kind = "regex"
1190"#;
1191 let err = Rulepack::parse(raw).expect_err("regex pattern is required");
1192
1193 assert!(matches!(
1194 err,
1195 RulepackError::RegexPatternChoice { id } if id == "bad.email"
1196 ));
1197 }
1198
1199 #[test]
1200 fn rulepack_load_accepts_fixture_email_regex() {
1201 let raw = r#"
1202schema_version = "0.1.0"
1203rulepack_id = "custom-email"
1204rulepack_version = "0.7.0"
1205default_locales = ["global"]
1206
1207[[recognizers]]
1208id = "custom.email"
1209class = "Email"
1210enabled = true
1211
1212[recognizers.match]
1213kind = "regex"
1214pattern = '''alice@example\.invalid'''
1215"#;
1216
1217 let rulepack = Rulepack::parse(raw).expect("standard email regex should load");
1218
1219 assert_eq!(rulepack.recognizers.len(), 1);
1220 assert_eq!(rulepack.recognizers[0].id, "custom.email");
1221 }
1222
1223 #[test]
1224 fn anchored_match_accepts_valid_schema() {
1225 let rulepack = Rulepack::parse(&anchored_match_rulepack("")).expect("anchored_match");
1226 assert!(matches!(
1227 rulepack.recognizers[0].matcher,
1228 RawMatch::AnchoredMatch { .. }
1229 ));
1230 }
1231
1232 #[test]
1233 fn anchored_match_rejects_unknown_boundary() {
1234 let err = Rulepack::parse(&anchored_match_rulepack("boundary = \"paragraph\"\n"))
1235 .expect_err("unknown boundary fails closed");
1236
1237 assert_unsupported_anchored_match(err, "boundary", "paragraph");
1238 }
1239
1240 #[test]
1241 fn anchored_match_rejects_unknown_name_shape() {
1242 let err = Rulepack::parse(&anchored_match_rulepack("name_shape = \"organization\"\n"))
1243 .expect_err("unknown name_shape fails closed");
1244
1245 assert_unsupported_anchored_match(err, "name_shape", "organization");
1246 }
1247
1248 #[test]
1249 fn anchored_match_rejects_unknown_cue_position() {
1250 let err = Rulepack::parse(&anchored_match_rulepack("cue_position = \"around\"\n"))
1251 .expect_err("unknown cue_position fails closed");
1252
1253 assert_unsupported_anchored_match(err, "cue_position", "around");
1254 }
1255
1256 #[test]
1257 fn anchored_match_rejects_missing_cues_bucket() {
1258 let err = Rulepack::parse(&anchored_match_rulepack("cues_bucket = \"\"\n"))
1259 .expect_err("missing cues_bucket fails closed");
1260
1261 assert_unsupported_anchored_match(err, "cues_bucket", "");
1262 }
1263
1264 #[test]
1265 fn anchored_match_rejects_ellipsis_in_cue_values() {
1266 let err = Rulepack::parse(
1267 r#"
1268schema_version = "0.1.0"
1269rulepack_id = "anchored"
1270rulepack_version = "0.6.0"
1271default_locales = ["global"]
1272
1273[locale.forward_markers]
1274names = ["Forwarded ... message"]
1275
1276[[recognizers]]
1277id = "name.forward_marker"
1278class = "Name"
1279enabled = true
1280
1281[recognizers.match]
1282kind = "anchored_match"
1283cues_bucket = "forward_markers"
1284boundary = "punctuation"
1285right_window_chars = 64
1286name_shape = "person_name"
1287cue_position = "before"
1288"#,
1289 )
1290 .expect_err("ellipsis cue fails closed");
1291
1292 assert_unsupported_anchored_match(
1293 err,
1294 "locale.forward_markers.names",
1295 "Forwarded ... message",
1296 );
1297 }
1298
1299 #[test]
1300 fn anchored_match_rejects_invalid_window_bounds() {
1301 for (value, expected) in [("0", "0"), ("513", "513")] {
1302 let err = Rulepack::parse(&anchored_match_rulepack(&format!(
1303 "right_window_chars = {value}\n"
1304 )))
1305 .expect_err("invalid right_window_chars fails closed");
1306
1307 assert_unsupported_anchored_match(err, "right_window_chars", expected);
1308 }
1309 }
1310
1311 #[test]
1312 fn rulepack_load_fails_when_two_name_recognizers_omit_cooperates_with() {
1313 let err = Rulepack::parse(
1314 r#"
1315schema_version = "0.1.0"
1316rulepack_id = "bad-composition"
1317rulepack_version = "0.4.1"
1318default_locales = ["global"]
1319
1320[[recognizers]]
1321id = "email.header.name"
1322class = "Name"
1323enabled = true
1324
1325[recognizers.match]
1326kind = "regex"
1327pattern = "From: ([A-Z][a-z]+)"
1328
1329[[recognizers]]
1330id = "salutation.name"
1331class = "Name"
1332enabled = true
1333
1334[recognizers.match]
1335kind = "regex"
1336pattern = "Dear ([A-Z][a-z]+)"
1337"#,
1338 )
1339 .expect_err("same-class recognizers must explicitly cooperate");
1340
1341 assert!(matches!(
1342 err,
1343 RulepackError::SameClassWithoutCooperation {
1344 class: PiiClass::Name,
1345 recognizer_a,
1346 recognizer_b,
1347 } if recognizer_a == "email.header.name" && recognizer_b == "salutation.name"
1348 ));
1349 }
1350
1351 #[test]
1352 fn rulepack_load_accepts_same_class_pair_with_cooperates_with() {
1353 let rulepack = Rulepack::parse(
1354 r#"
1355schema_version = "0.1.0"
1356rulepack_id = "cooperating-composition"
1357rulepack_version = "0.4.1"
1358default_locales = ["global"]
1359
1360[[recognizers]]
1361id = "email.header.name"
1362class = "Name"
1363cooperates_with = ["salutation.name"]
1364enabled = true
1365
1366[recognizers.match]
1367kind = "regex"
1368pattern = "From: ([A-Z][a-z]+)"
1369
1370[[recognizers]]
1371id = "salutation.name"
1372class = "Name"
1373enabled = true
1374
1375[recognizers.match]
1376kind = "regex"
1377pattern = "Dear ([A-Z][a-z]+)"
1378"#,
1379 )
1380 .expect("cooperates_with unblocks same-class recognizers");
1381
1382 assert_eq!(rulepack.recognizers.len(), 2);
1383 assert_eq!(
1384 rulepack.recognizers[0].cooperates_with,
1385 vec!["salutation.name"]
1386 );
1387 }
1388
1389 #[test]
1390 fn rejects_unknown_fields_with_parent_table_context() {
1391 let err = Rulepack::parse(
1392 r#"
1393schema_version = "0.1.0"
1394rulepack_id = "bad"
1395rulepack_version = "0.4.0"
1396default_locales = ["global"]
1397bogus = true
1398"#,
1399 )
1400 .expect_err("unknown field must fail");
1401
1402 assert!(matches!(err, RulepackError::Toml(_)));
1403 assert!(err.to_string().contains("bogus"));
1404 }
1405
1406 #[test]
1407 fn rejects_unsupported_schema_version() {
1408 let err = Rulepack::parse(
1409 r#"
1410schema_version = "0.2.0"
1411rulepack_id = "bad"
1412rulepack_version = "0.4.0"
1413"#,
1414 )
1415 .expect_err("unsupported schema");
1416
1417 assert!(matches!(err, RulepackError::SchemaVersion { .. }));
1418 }
1419
1420 #[test]
1421 fn class_spelling_accepts_pascal_case_and_custom_names() {
1422 assert_eq!(parse_class("Email").unwrap(), PiiClass::Email);
1423 assert_eq!(
1424 parse_class("custom:Class_Alpha").unwrap(),
1425 PiiClass::Custom("class_alpha".to_string())
1426 );
1427 }
1428
1429 fn unsupported_field_rulepack(extra: &str) -> String {
1430 format!(
1431 r#"
1432schema_version = "0.1.0"
1433rulepack_id = "bad"
1434rulepack_version = "0.4.0"
1435default_locales = ["global"]
1436
1437[[recognizers]]
1438id = "bad.email"
1439class = "Email"
1440enabled = true
1441
1442[recognizers.match]
1443kind = "regex"
1444pattern = "BAD_EMAIL_FIXTURE"
1445
1446{extra}
1447"#
1448 )
1449 }
1450
1451 fn anchored_match_rulepack(override_line: &str) -> String {
1452 let cues_bucket = if override_line.starts_with("cues_bucket") {
1453 override_line.to_string()
1454 } else {
1455 "cues_bucket = \"forward_markers\"\n".to_string()
1456 };
1457 let boundary = if override_line.starts_with("boundary") {
1458 override_line.to_string()
1459 } else {
1460 "boundary = \"punctuation\"\n".to_string()
1461 };
1462 let right_window_chars = if override_line.starts_with("right_window_chars") {
1463 override_line.to_string()
1464 } else {
1465 "right_window_chars = 64\n".to_string()
1466 };
1467 let name_shape = if override_line.starts_with("name_shape") {
1468 override_line.to_string()
1469 } else {
1470 "name_shape = \"person_name\"\n".to_string()
1471 };
1472 let cue_position = if override_line.starts_with("cue_position") {
1473 override_line.to_string()
1474 } else {
1475 "cue_position = \"before\"\n".to_string()
1476 };
1477 format!(
1478 r#"
1479schema_version = "0.1.0"
1480rulepack_id = "anchored"
1481rulepack_version = "0.6.0"
1482default_locales = ["global"]
1483
1484[[recognizers]]
1485id = "name.forward_marker"
1486class = "Name"
1487enabled = true
1488
1489[recognizers.match]
1490kind = "anchored_match"
1491{cues_bucket}{boundary}{right_window_chars}{name_shape}{cue_position}
1492"#
1493 )
1494 }
1495
1496 fn assert_unsupported_field(err: RulepackError, field: &str) {
1497 assert!(matches!(
1498 err,
1499 RulepackError::UnsupportedField {
1500 field: ref actual,
1501 planned_version: "v0.4.1",
1502 } if actual == field
1503 ));
1504 }
1505
1506 fn assert_unsupported_anchored_match(err: RulepackError, field: &str, value: &str) {
1507 assert!(matches!(
1508 err,
1509 RulepackError::UnsupportedAnchoredMatch {
1510 field: ref actual_field,
1511 value: ref actual_value,
1512 } if actual_field == field && actual_value == value
1513 ));
1514 }
1515}