1use std::collections::{BTreeSet, HashMap};
2use std::path::PathBuf;
3
4use serde::Deserialize;
5use thiserror::Error;
6
7use crate::{LocaleTag, PiiClass};
8
/// Prefix a rulepack's `schema_version` must start with to be accepted by the
/// loader (compared with `str::starts_with` during conversion).
const SUPPORTED_SCHEMA_MAJOR_MINOR: &str = "0.1.";
10
/// A validated rulepack, as produced by [`Rulepack::parse`] or
/// [`Rulepack::load`].
#[derive(Debug, Clone, PartialEq)]
pub struct Rulepack {
    /// Schema version string exactly as written in the rulepack.
    pub schema_version: String,
    /// Identifier of the rulepack.
    pub rulepack_id: String,
    /// Version of the rulepack contents.
    pub rulepack_version: String,
    /// Locales given to every recognizer that declares no `locales` itself.
    pub default_locales: Vec<LocaleTag>,
    /// Named locale buckets from the `locale` table, when one was present.
    pub locale: Option<LocaleData>,
    /// Recognizers in the order they were declared.
    pub recognizers: Vec<RecognizerSpec>,
}
20
/// One recognizer entry of a rulepack after parsing and validation.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct RecognizerSpec {
    /// Recognizer identifier, as written in the rulepack.
    pub id: String,
    /// PII class emitted by this recognizer (see [`parse_class`]).
    pub class: PiiClass,
    /// Ids of same-class recognizers this one declares cooperation with.
    pub cooperates_with: Vec<String>,
    /// Whether the recognizer is active; defaults to `true` when omitted.
    pub enabled: bool,
    /// Locales this recognizer applies to; the rulepack's default locales
    /// when the entry listed none.
    pub locales: Vec<LocaleTag>,
    /// Matcher configuration from the `match` table.
    pub matcher: RawMatch,
    /// Optional `context` table.
    pub context: Option<ContextSpec>,
    /// Optional `validator` table.
    pub validator: Option<ValidatorSpec>,
    /// Optional `normalizer` table.
    pub normalizer: Option<NormalizerSpec>,
    /// Scoring parameters; defaults are filled in when the table is omitted.
    pub scoring: ScoringSpec,
    /// Token settings from the `token` table.
    pub token: TokenSpec,
    /// Optional provenance information from the `source` table.
    pub source: Option<SourceSpec>,
}
37
/// Matcher configuration of a recognizer, deserialized from its `match`
/// table and selected by the snake_case `kind` key.
#[derive(Debug, Clone, PartialEq, Deserialize)]
#[serde(tag = "kind", deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum RawMatch {
    /// `kind = "regex"`. Validation requires exactly one of `pattern` and
    /// `pattern_template` to be set.
    Regex {
        /// Literal regular expression.
        #[serde(default)]
        pattern: Option<String>,
        /// Regular expression template containing placeholders.
        #[serde(default)]
        pattern_template: Option<String>,
        /// Optional list of capture group indices.
        #[serde(default)]
        capture_groups: Option<Vec<u32>>,
    },
    /// `kind = "dictionary"`.
    Dictionary {
        /// Inline terms; empty when omitted.
        #[serde(default)]
        terms: Vec<String>,
        /// Optional reference to a terms file.
        #[serde(default)]
        terms_file: Option<String>,
        /// Optional name of a context-provided term source.
        #[serde(default)]
        terms_from_context: Option<String>,
        /// Whether matching is case sensitive; `false` when omitted.
        #[serde(default)]
        case_sensitive: bool,
    },
    /// `kind = "ner"`.
    Ner {
        /// Reference to the model to use.
        model_ref: String,
    },
    /// `kind = "anchored_match"`. All fields are required and are checked by
    /// matcher validation before a recognizer is accepted.
    AnchoredMatch {
        /// Name of the locale bucket holding the cue strings; must not be blank.
        cues_bucket: String,
        /// One of `punctuation`, `whitespace` or `line_end`.
        boundary: String,
        /// Window size in characters; accepted range is 1..=512.
        right_window_chars: u16,
        /// Only `person_name` is accepted.
        name_shape: String,
        /// Either `before` or `after`.
        cue_position: String,
    },
}
71
/// Boundary kinds for anchored matching. The snake_case names of the variants
/// (`punctuation`, `whitespace`, `line_end`) are the same strings matcher
/// validation accepts for the `boundary` field.
// NOTE(review): no use of this enum is visible in this part of the file.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum AnchoredBoundary {
    /// Serialized as `punctuation`.
    Punctuation,
    /// Serialized as `whitespace`.
    Whitespace,
    /// Serialized as `line_end`.
    LineEnd,
}
80
/// Name shapes for anchored matching. The only variant serializes as
/// `person_name`, the single value matcher validation accepts for `name_shape`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum NameShape {
    /// Serialized as `person_name`.
    PersonName,
}
95
/// Cue positions for anchored matching; the variants serialize as `before`
/// and `after`, the two values matcher validation accepts for `cue_position`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
#[non_exhaustive]
pub enum CuePosition {
    /// Serialized as `before`.
    Before,
    /// Serialized as `after`.
    After,
}
103
/// Parsed `context` table of a recognizer.
///
/// The loader currently rejects rulepacks that set `hotwords`, `window` or
/// `boost`, so in a successfully loaded rulepack only `exclusions` can carry
/// data.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct ContextSpec {
    /// Hotword list (must be empty to pass loading).
    pub hotwords: Vec<String>,
    /// Context window (must be unset to pass loading).
    pub window: Option<u16>,
    /// Score boost (must be unset to pass loading).
    pub boost: Option<f32>,
    /// Exclusion strings.
    pub exclusions: Vec<String>,
}
112
/// Parsed `validator` table of a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct ValidatorSpec {
    /// Validator kind name, as written in the rulepack.
    pub kind: String,
}
117
/// Parsed `normalizer` table of a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizerSpec {
    /// Normalizer kind name, as written in the rulepack.
    pub kind: String,
}
122
/// Scoring parameters of a recognizer.
#[derive(Debug, Clone, PartialEq)]
pub struct ScoringSpec {
    /// Base score; 0.70 when not specified.
    pub base: f32,
    /// Priority; 0 when not specified.
    pub priority: i32,
}
128
/// Parsed `token` table of a recognizer.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct TokenSpec {
    /// Optional token family name.
    pub family: Option<String>,
    /// Optional token format; the loader rejects any non-empty value.
    pub format: Option<String>,
}
135
/// Parsed `source` table of a recognizer (provenance metadata).
#[derive(Debug, Clone, PartialEq)]
pub struct SourceSpec {
    /// Required `origin` value.
    pub origin: String,
    /// Optional `from` value.
    pub from: Option<String>,
    /// Optional `license` value.
    pub license: Option<String>,
}
142
/// Locale buckets of a rulepack, keyed by bucket name.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct LocaleData {
    /// Bucket name → bucket contents.
    pub buckets: HashMap<String, LocaleBucket>,
}
147
/// A single named locale bucket.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleBucket {
    /// The strings listed under the bucket's `names` key.
    pub names: Vec<String>,
}
152
/// Where [`Rulepack::load`] reads the rulepack text from.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum RulepackSource {
    /// Rulepack text that is already in memory.
    Embedded(&'static str),
    /// Path of a file to read the rulepack text from.
    Path(PathBuf),
}
159
/// Errors reported while loading, parsing or validating a rulepack.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum RulepackError {
    /// The rulepack file could not be read.
    #[error("failed to read rulepack: {0}")]
    Io(#[source] std::io::Error),
    /// The rulepack text is not valid TOML for the expected schema.
    #[error("failed to parse rulepack TOML: {0}")]
    Toml(#[source] toml::de::Error),
    /// `schema_version` does not start with the supported prefix.
    #[error("unsupported rulepack schema_version {found}; supported {supported}")]
    SchemaVersion { found: String, supported: String },
    /// A `class` value was not recognized by [`parse_class`].
    #[error("unknown pii class: {0}")]
    UnknownClass(String),
    /// A locale string could not be parsed into a `LocaleTag`.
    #[error("unknown locale: {0}")]
    UnknownLocale(String),
    /// A matcher kind is not supported.
    #[error("unsupported matcher kind: {0}")]
    UnsupportedMatcher(String),
    /// An `anchored_match` field (or one of its cue strings) has a value the
    /// loader does not accept.
    #[error("unsupported anchored_match field '{field}' value '{value}'")]
    UnsupportedAnchoredMatch { field: String, value: String },
    /// A field that is reserved for a later release was set.
    #[error("unsupported rulepack field '{field}' in B1; planned for {planned_version}")]
    UnsupportedField {
        field: String,
        planned_version: &'static str,
    },
    /// A validator kind is not supported.
    #[error("unsupported validator kind: {kind}")]
    UnsupportedValidator { kind: String },
    /// A normalizer kind is not supported.
    #[error("unsupported normalizer kind: {kind}")]
    UnsupportedNormalizer { kind: String },
    /// A rule spec variant is not supported.
    #[error("unsupported rule spec variant: {variant}")]
    UnsupportedRuleSpec { variant: String },
    /// The same recognizer id appears in two rulepacks.
    #[error("duplicate recognizer id '{id}' in rulepacks '{first_pack}' and '{second_pack}'")]
    DuplicateId {
        id: String,
        first_pack: String,
        second_pack: String,
    },
    /// A regex matcher set both or neither of `pattern` / `pattern_template`.
    #[error("regex recognizer '{id}' must define exactly one of pattern or pattern_template")]
    RegexPatternChoice { id: String },
    /// A `pattern_template` uses a placeholder that is not known.
    #[error("unknown pattern_template placeholder '{placeholder}' in recognizer '{id}'")]
    UnknownPatternTemplatePlaceholder { id: String, placeholder: String },
    /// A context class_map override conflicts with an existing rule.
    #[error(
        "context class_map override for dictionary '{dict}' changes {old_class:?} to {new_class:?}, but {uncovered_rule}"
    )]
    ClassMapOverrideClash {
        dict: String,
        old_class: PiiClass,
        new_class: PiiClass,
        uncovered_rule: String,
    },
    /// Two recognizers emit the same class and neither lists the other in
    /// `cooperates_with`.
    #[error(
        "same-class recognizers '{recognizer_a}' and '{recognizer_b}' both emit {class:?} but neither declares cooperates_with"
    )]
    SameClassWithoutCooperation {
        class: PiiClass,
        recognizer_a: String,
        recognizer_b: String,
    },
    /// Strict locale-overlap linting found two same-class recognizers with an
    /// equivalent regex shape that are both active for the current locales.
    #[error(
        "recognizers {recognizer_ids:?} share class {class:?} with equivalent regex shape and overlapping locale projection {locale_overlap:?}"
    )]
    ConflictingLocaleProjection {
        class: PiiClass,
        recognizer_ids: Vec<String>,
        locale_overlap: Vec<LocaleTag>,
    },
}
224
225impl Rulepack {
226 pub fn load(source: RulepackSource) -> Result<Rulepack, RulepackError> {
227 let raw = match source {
228 RulepackSource::Embedded(contents) => contents.to_string(),
229 RulepackSource::Path(path) => {
230 std::fs::read_to_string(path).map_err(RulepackError::Io)?
231 }
232 };
233 Self::parse(&raw)
234 }
235
236 pub fn parse(raw: &str) -> Result<Rulepack, RulepackError> {
237 let (raw, lint) = extract_recognizer_lint_config(raw);
238 let raw: RawRulepack = toml::from_str(&raw).map_err(RulepackError::Toml)?;
239 RawRulepackWithLint { raw, lint }.try_into()
240 }
241
242 pub fn activated_classes(&self) -> BTreeSet<PiiClass> {
243 self.recognizers
244 .iter()
245 .filter(|recognizer| recognizer.enabled)
246 .map(|recognizer| recognizer.class.clone())
247 .collect()
248 }
249}
250
/// Serde view of the top level of a rulepack TOML document. Unknown keys are
/// rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRulepack {
    /// Required schema version string.
    schema_version: String,
    /// Required rulepack identifier.
    rulepack_id: String,
    /// Required rulepack version.
    rulepack_version: String,
    /// Unparsed default locale strings; empty when omitted.
    #[serde(default)]
    default_locales: Vec<String>,
    /// Optional `locale` table of named buckets.
    #[serde(default)]
    locale: Option<RawLocaleData>,
    /// Recognizer entries; empty when omitted.
    #[serde(default)]
    recognizers: Vec<RawRecognizerSpec>,
}
264
/// Lint switches read from the `[recognizers.lint]` section.
#[derive(Debug, Default)]
struct RawRecognizerLintConfig {
    /// When `true`, a locale-projection collision is an error instead of a
    /// logged warning. Defaults to `false`.
    strict_locale_overlap: bool,
}
269
/// A deserialized rulepack paired with the lint configuration that was split
/// off from the same text.
#[derive(Debug)]
struct RawRulepackWithLint {
    /// The deserialized rulepack.
    raw: RawRulepack,
    /// Lint switches to apply while validating `raw`.
    lint: RawRecognizerLintConfig,
}
275
/// Serde view of the `locale` table: every key is a bucket name.
#[derive(Debug, Deserialize)]
struct RawLocaleData {
    /// All keys of the table, flattened into a name → bucket map.
    #[serde(flatten)]
    buckets: HashMap<String, RawLocaleBucket>,
}
281
/// Serde view of one locale bucket. Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawLocaleBucket {
    /// Required list of strings for the bucket.
    names: Vec<String>,
}
287
/// Serde view of one `[[recognizers]]` entry before validation. Unknown keys
/// are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawRecognizerSpec {
    /// Required recognizer id.
    id: String,
    /// Required class name, still unparsed.
    class: String,
    /// Ids of cooperating recognizers; empty when omitted.
    #[serde(default)]
    cooperates_with: Vec<String>,
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    enabled: bool,
    /// Unparsed locale strings; empty means "use the rulepack defaults".
    #[serde(default)]
    locales: Vec<String>,
    /// Required matcher table, spelled `match` in the TOML.
    #[serde(rename = "match")]
    matcher: RawMatch,
    /// Optional `context` table.
    #[serde(default)]
    context: Option<RawContextSpec>,
    /// Optional `validator` table.
    #[serde(default)]
    validator: Option<RawValidatorSpec>,
    /// Optional `normalizer` table.
    #[serde(default)]
    normalizer: Option<RawNormalizerSpec>,
    /// Optional `scoring` table.
    #[serde(default)]
    scoring: Option<RawScoringSpec>,
    /// `token` table; all fields unset when omitted.
    #[serde(default)]
    token: RawTokenSpec,
    /// Optional `source` table.
    #[serde(default)]
    source: Option<RawSourceSpec>,
}
314
/// Serde view of a recognizer's `context` table. Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawContextSpec {
    /// Empty when omitted.
    #[serde(default)]
    hotwords: Vec<String>,
    /// Unset when omitted.
    #[serde(default)]
    window: Option<u16>,
    /// Unset when omitted.
    #[serde(default)]
    boost: Option<f32>,
    /// Empty when omitted.
    #[serde(default)]
    exclusions: Vec<String>,
}
327
/// Serde view of a recognizer's `validator` table. Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawValidatorSpec {
    /// Required validator kind name.
    kind: String,
}
333
/// Serde view of a recognizer's `normalizer` table. Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawNormalizerSpec {
    /// Required normalizer kind name.
    kind: String,
}
339
/// Serde view of a recognizer's `scoring` table. Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawScoringSpec {
    /// Falls back to `default_base_score()` when omitted.
    #[serde(default = "default_base_score")]
    base: f32,
    /// Falls back to 0 when omitted.
    #[serde(default)]
    priority: i32,
}
348
/// Serde view of a recognizer's `token` table. Unknown keys are rejected.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawTokenSpec {
    /// Unset when omitted.
    #[serde(default)]
    family: Option<String>,
    /// Unset when omitted.
    #[serde(default)]
    format: Option<String>,
}
357
/// Serde view of a recognizer's `source` table. Unknown keys are rejected.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct RawSourceSpec {
    /// Required.
    origin: String,
    /// Unset when omitted.
    #[serde(default)]
    from: Option<String>,
    /// Unset when omitted.
    #[serde(default)]
    license: Option<String>,
}
367
368impl TryFrom<RawRulepack> for Rulepack {
369 type Error = RulepackError;
370
371 fn try_from(raw: RawRulepack) -> Result<Self, Self::Error> {
372 RawRulepackWithLint {
373 raw,
374 lint: RawRecognizerLintConfig::default(),
375 }
376 .try_into()
377 }
378}
379
380impl TryFrom<RawRulepackWithLint> for Rulepack {
381 type Error = RulepackError;
382
383 fn try_from(raw_with_lint: RawRulepackWithLint) -> Result<Self, Self::Error> {
384 let raw = raw_with_lint.raw;
385 if !raw.schema_version.starts_with(SUPPORTED_SCHEMA_MAJOR_MINOR) {
386 return Err(RulepackError::SchemaVersion {
387 found: raw.schema_version,
388 supported: "~0.1.x".to_string(),
389 });
390 }
391
392 let default_locales = parse_locales(raw.default_locales)?;
393 let recognizers = raw
394 .recognizers
395 .into_iter()
396 .map(|recognizer| parse_recognizer(recognizer, &default_locales))
397 .collect::<Result<Vec<_>, _>>()?;
398 validate_rulepack_recognizers(&recognizers, &default_locales, &raw_with_lint.lint)?;
399 let locale = raw.locale.map(LocaleData::from);
400 reject_anchored_match_ellipsis_cues(&recognizers, locale.as_ref())?;
401
402 Ok(Self {
403 schema_version: raw.schema_version,
404 rulepack_id: raw.rulepack_id,
405 rulepack_version: raw.rulepack_version,
406 default_locales,
407 locale,
408 recognizers,
409 })
410 }
411}
412
413fn extract_recognizer_lint_config(raw: &str) -> (String, RawRecognizerLintConfig) {
414 let mut sanitized = String::with_capacity(raw.len());
415 let mut lint = RawRecognizerLintConfig::default();
416 let mut in_lint = false;
417
418 for line in raw.lines() {
419 let trimmed = line.trim();
420 if trimmed == "[recognizers.lint]" {
421 in_lint = true;
422 continue;
423 }
424 if in_lint && trimmed.starts_with('[') {
425 in_lint = false;
426 }
427 if in_lint {
428 if let Some((key, value)) = trimmed.split_once('=') {
429 if key.trim() == "strict_locale_overlap" {
430 lint.strict_locale_overlap = value.trim().eq_ignore_ascii_case("true");
431 }
432 }
433 continue;
434 }
435 sanitized.push_str(line);
436 sanitized.push('\n');
437 }
438
439 (sanitized, lint)
440}
441
442impl From<RawLocaleData> for LocaleData {
443 fn from(raw: RawLocaleData) -> Self {
444 Self {
445 buckets: raw
446 .buckets
447 .into_iter()
448 .map(|(name, bucket)| {
449 (
450 name,
451 LocaleBucket {
452 names: bucket.names,
453 },
454 )
455 })
456 .collect(),
457 }
458 }
459}
460
461fn parse_recognizer(
462 raw: RawRecognizerSpec,
463 default_locales: &[LocaleTag],
464) -> Result<RecognizerSpec, RulepackError> {
465 reject_unshipped_fields(&raw)?;
466 validate_matcher(&raw)?;
467 let locales = if raw.locales.is_empty() {
468 default_locales.to_vec()
469 } else {
470 parse_locales(raw.locales)?
471 };
472
473 Ok(RecognizerSpec {
474 id: raw.id,
475 class: parse_class(&raw.class)?,
476 cooperates_with: raw.cooperates_with,
477 enabled: raw.enabled,
478 locales,
479 matcher: raw.matcher,
480 context: raw.context.map(|context| ContextSpec {
481 hotwords: context.hotwords,
482 window: context.window,
483 boost: context.boost,
484 exclusions: context.exclusions,
485 }),
486 validator: raw.validator.map(|validator| ValidatorSpec {
487 kind: validator.kind,
488 }),
489 normalizer: raw.normalizer.map(|normalizer| NormalizerSpec {
490 kind: normalizer.kind,
491 }),
492 scoring: raw.scoring.map_or_else(
493 || ScoringSpec {
494 base: default_base_score(),
495 priority: 0,
496 },
497 |scoring| ScoringSpec {
498 base: scoring.base,
499 priority: scoring.priority,
500 },
501 ),
502 token: TokenSpec {
503 family: raw.token.family,
504 format: raw.token.format,
505 },
506 source: raw.source.map(|source| SourceSpec {
507 origin: source.origin,
508 from: source.from,
509 license: source.license,
510 }),
511 })
512}
513
514fn validate_matcher(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
515 match &raw.matcher {
516 RawMatch::Regex {
517 pattern,
518 pattern_template,
519 ..
520 } => {
521 if pattern.is_some() == pattern_template.is_some() {
522 return Err(RulepackError::RegexPatternChoice { id: raw.id.clone() });
523 }
524 }
525 RawMatch::AnchoredMatch {
526 cues_bucket,
527 boundary,
528 right_window_chars,
529 name_shape,
530 cue_position,
531 ..
532 } => {
533 if cues_bucket.trim().is_empty() {
534 return Err(RulepackError::UnsupportedAnchoredMatch {
535 field: "cues_bucket".to_string(),
536 value: cues_bucket.clone(),
537 });
538 }
539 if !(1..=512).contains(right_window_chars) {
540 return Err(RulepackError::UnsupportedAnchoredMatch {
541 field: "right_window_chars".to_string(),
542 value: right_window_chars.to_string(),
543 });
544 }
545 if !matches!(boundary.as_str(), "punctuation" | "whitespace" | "line_end") {
546 return Err(RulepackError::UnsupportedAnchoredMatch {
547 field: "boundary".to_string(),
548 value: boundary.clone(),
549 });
550 }
551 if name_shape != "person_name" {
552 return Err(RulepackError::UnsupportedAnchoredMatch {
553 field: "name_shape".to_string(),
554 value: name_shape.clone(),
555 });
556 }
557 if !matches!(cue_position.as_str(), "before" | "after") {
558 return Err(RulepackError::UnsupportedAnchoredMatch {
559 field: "cue_position".to_string(),
560 value: cue_position.clone(),
561 });
562 }
563 }
564 RawMatch::Dictionary { .. } | RawMatch::Ner { .. } => {}
565 }
566 Ok(())
567}
568
569fn reject_anchored_match_ellipsis_cues(
570 recognizers: &[RecognizerSpec],
571 locale: Option<&LocaleData>,
572) -> Result<(), RulepackError> {
573 let Some(locale) = locale else {
574 return Ok(());
575 };
576 for recognizer in recognizers {
577 let RawMatch::AnchoredMatch { cues_bucket, .. } = &recognizer.matcher else {
578 continue;
579 };
580 let Some(bucket) = locale.buckets.get(cues_bucket) else {
581 continue;
582 };
583 if let Some(cue) = bucket.names.iter().find(|cue| cue.contains("...")) {
584 return Err(RulepackError::UnsupportedAnchoredMatch {
585 field: format!("locale.{cues_bucket}.names"),
586 value: cue.clone(),
587 });
588 }
589 }
590 Ok(())
591}
592
593fn reject_unshipped_fields(raw: &RawRecognizerSpec) -> Result<(), RulepackError> {
594 const PLANNED_VERSION: &str = "v0.4.1";
595
596 if raw
597 .token
598 .format
599 .as_deref()
600 .is_some_and(|value| !value.is_empty())
601 {
602 return Err(RulepackError::UnsupportedField {
603 field: "token.format".to_string(),
604 planned_version: PLANNED_VERSION,
605 });
606 }
607 if let Some(context) = &raw.context {
608 if !context.hotwords.is_empty() {
609 return Err(RulepackError::UnsupportedField {
610 field: "context.hotwords".to_string(),
611 planned_version: PLANNED_VERSION,
612 });
613 }
614 if context.boost.is_some() {
615 return Err(RulepackError::UnsupportedField {
616 field: "context.boost".to_string(),
617 planned_version: PLANNED_VERSION,
618 });
619 }
620 if context.window.is_some() {
621 return Err(RulepackError::UnsupportedField {
622 field: "context.window".to_string(),
623 planned_version: PLANNED_VERSION,
624 });
625 }
626 }
627 Ok(())
628}
629
630pub fn recognizer_composition_validator(
631 recognizers: &[RecognizerSpec],
632) -> Result<(), RulepackError> {
633 for (index, first) in recognizers.iter().enumerate() {
634 for second in recognizers.iter().skip(index + 1) {
635 if first.class != second.class {
636 continue;
637 }
638 if first.cooperates_with.iter().any(|id| id == &second.id)
639 || second.cooperates_with.iter().any(|id| id == &first.id)
640 {
641 continue;
642 }
643 return Err(RulepackError::SameClassWithoutCooperation {
644 class: first.class.clone(),
645 recognizer_a: first.id.clone(),
646 recognizer_b: second.id.clone(),
647 });
648 }
649 }
650 Ok(())
651}
652
653fn validate_rulepack_recognizers(
654 recognizers: &[RecognizerSpec],
655 active_locales: &[LocaleTag],
656 lint: &RawRecognizerLintConfig,
657) -> Result<(), RulepackError> {
658 recognizer_composition_validator(recognizers)?;
659 lint_locale_projection_collisions(recognizers, active_locales, lint)?;
660 lint_global_naked_patterns(recognizers);
661 Ok(())
662}
663
664fn lint_locale_projection_collisions(
665 recognizers: &[RecognizerSpec],
666 active_locales: &[LocaleTag],
667 lint: &RawRecognizerLintConfig,
668) -> Result<(), RulepackError> {
669 for (index, first) in recognizers.iter().enumerate() {
670 if !first.enabled {
671 continue;
672 }
673 let Some(first_shape) = regex_structural_shape(&first.matcher) else {
674 continue;
675 };
676 if !is_truly_naked_numeric(&first.matcher) {
677 continue;
678 }
679 let first_projection = locale_projection(&first.locales, active_locales);
680 if first_projection.is_empty() {
681 continue;
682 }
683
684 for second in recognizers.iter().skip(index + 1) {
685 if !second.enabled || first.class != second.class {
686 continue;
687 }
688 if !is_truly_naked_numeric(&second.matcher) {
689 continue;
690 }
691 if regex_structural_shape(&second.matcher).as_ref() != Some(&first_shape) {
692 continue;
693 }
694 let second_projection = locale_projection(&second.locales, active_locales);
695 if second_projection.is_empty() {
696 continue;
697 }
698
699 let recognizer_ids = vec![first.id.clone(), second.id.clone()];
700 let locale_overlap = merged_locale_projection(&first_projection, &second_projection);
701 if lint.strict_locale_overlap {
702 return Err(RulepackError::ConflictingLocaleProjection {
703 class: first.class.clone(),
704 recognizer_ids,
705 locale_overlap,
706 });
707 }
708 tracing::warn!(
709 class = %first.class.class_name(),
710 recognizer_ids = ?recognizer_ids,
711 locale_overlap = ?locale_overlap,
712 "recognizers share class with naked-shape regex and non-disjoint locale projection"
713 );
714 }
715 }
716 Ok(())
717}
718
719fn lint_global_naked_patterns(recognizers: &[RecognizerSpec]) {
720 for recognizer in recognizers {
721 if !recognizer.enabled || recognizer.locales != [LocaleTag::Global] {
722 continue;
723 }
724 let Some(shape) = regex_structural_shape(&recognizer.matcher) else {
725 continue;
726 };
727 let RawMatch::Regex {
728 pattern: Some(pattern),
729 ..
730 } = &recognizer.matcher
731 else {
732 continue;
733 };
734 if shape.minimum_match_len < 6 && !has_regex_separator(pattern) {
735 tracing::warn!(
736 recognizer_id = %recognizer.id,
737 class = %recognizer.class.class_name(),
738 minimum_match_len = shape.minimum_match_len,
739 "global recognizer uses short naked regex shape"
740 );
741 }
742 }
743}
744
/// Coarse description of a regex pattern used by the lints to decide whether
/// two patterns are "the same shape".
#[derive(Debug, Clone, PartialEq, Eq)]
struct RegexStructuralShape {
    /// The number that opens the first digit quantifier found in the pattern.
    minimum_match_len: usize,
    /// Kind of characters the quantifier applies to.
    character_class: RegexCharacterClass,
}
750
/// Character classes a [`RegexStructuralShape`] can describe.
#[derive(Debug, Clone, PartialEq, Eq)]
enum RegexCharacterClass {
    /// A digit class (`\d`, `[0-9]` or `[[:digit:]]`).
    Digit,
}
755
756fn regex_structural_shape(matcher: &RawMatch) -> Option<RegexStructuralShape> {
757 let RawMatch::Regex {
758 pattern: Some(pattern),
759 pattern_template: None,
760 ..
761 } = matcher
762 else {
763 return None;
764 };
765 if has_unescaped_line_anchor(pattern) {
766 return None;
767 }
768 digit_quantifier_minimum(pattern).map(|minimum_match_len| RegexStructuralShape {
769 minimum_match_len,
770 character_class: RegexCharacterClass::Digit,
771 })
772}
773
774fn is_truly_naked_numeric(matcher: &RawMatch) -> bool {
775 let RawMatch::Regex {
776 pattern: Some(pattern),
777 ..
778 } = matcher
779 else {
780 return false;
781 };
782
783 let mut chars = pattern.chars();
784 while let Some(ch) = chars.next() {
785 if ch == '\\' {
786 chars.next();
787 continue;
788 }
789 if ch.is_ascii_alphabetic() {
790 return false;
791 }
792 }
793 true
794}
795
/// Returns `true` when `pattern` contains a `^` or `$` that acts as an anchor,
/// i.e. one that is neither backslash-escaped nor inside a bracketed
/// character class.
///
/// Bracket nesting is tracked with a depth counter rather than a flag, so
/// that the inner `]` of a POSIX or nested class (for example
/// `[[:digit:]$]`) does not end the enclosing class early and cause a literal
/// `$`/`^` inside the brackets to be reported as an anchor.
fn has_unescaped_line_anchor(pattern: &str) -> bool {
    let mut escaped = false;
    let mut class_depth = 0usize;
    for ch in pattern.chars() {
        if escaped {
            // Character after a backslash: never an anchor or a bracket.
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '[' => class_depth += 1,
            // Saturate so a stray `]` cannot underflow the counter.
            ']' => class_depth = class_depth.saturating_sub(1),
            '^' | '$' if class_depth == 0 => return true,
            _ => {}
        }
    }
    false
}
814
815fn digit_quantifier_minimum(pattern: &str) -> Option<usize> {
816 find_digit_quantifier(pattern, r"\d{")
817 .or_else(|| find_digit_quantifier(pattern, "[0-9]{"))
818 .or_else(|| find_digit_quantifier(pattern, "[[:digit:]]{"))
819}
820
/// Parses the run of ASCII digits that directly follows the first occurrence
/// of `needle` in `pattern`.
///
/// Returns `None` when `needle` does not occur, when no digit follows its
/// first occurrence, or when the digits do not fit in a `usize`. Later
/// occurrences of `needle` are not examined.
fn find_digit_quantifier(pattern: &str, needle: &str) -> Option<usize> {
    let (_, after_needle) = pattern.split_once(needle)?;
    // ASCII digits are single bytes, so the byte count is a valid slice end.
    let digit_len = after_needle
        .bytes()
        .take_while(u8::is_ascii_digit)
        .count();
    // An empty slice fails to parse, which covers the "no digits" case.
    after_needle[..digit_len].parse().ok()
}
833
834fn locale_projection(locales: &[LocaleTag], active_locales: &[LocaleTag]) -> Vec<LocaleTag> {
835 let mut projection = Vec::new();
836 for locale in locales {
837 if *locale == LocaleTag::Global {
838 projection.push(LocaleTag::Global);
839 } else if active_locales.iter().any(|active| active == locale) {
840 projection.push(locale.clone());
841 }
842 }
843 projection
844}
845
846fn merged_locale_projection(left: &[LocaleTag], right: &[LocaleTag]) -> Vec<LocaleTag> {
847 let mut merged = Vec::new();
848 for locale in left.iter().chain(right) {
849 if !merged.iter().any(|existing| existing == locale) {
850 merged.push(locale.clone());
851 }
852 }
853 merged
854}
855
856fn has_regex_separator(pattern: &str) -> bool {
857 pattern.contains('-')
858 || pattern.contains('/')
859 || pattern.contains('.')
860 || pattern.contains('+')
861 || pattern.contains("\\s")
862 || pattern.contains("[:space:]")
863}
864
865pub fn parse_class(input: &str) -> Result<PiiClass, RulepackError> {
866 let trimmed = input.trim();
867 let lower = trimmed.to_ascii_lowercase();
868 match lower.as_str() {
869 "email" => Ok(PiiClass::Email),
870 "name" => Ok(PiiClass::Name),
871 "location" => Ok(PiiClass::Location),
872 "organization" => Ok(PiiClass::Organization),
873 custom if custom.starts_with("custom:") => {
874 let name = trimmed
875 .split_once(':')
876 .map(|(_, name)| name)
877 .unwrap_or_default();
878 if name.trim().is_empty() {
879 return Err(RulepackError::UnknownClass(input.to_string()));
880 }
881 Ok(PiiClass::custom(name))
882 }
883 _ => Err(RulepackError::UnknownClass(input.to_string())),
884 }
885}
886
887fn parse_locales(locales: Vec<String>) -> Result<Vec<LocaleTag>, RulepackError> {
888 locales
889 .into_iter()
890 .map(|locale| {
891 LocaleTag::parse(&locale).map_err(|_| RulepackError::UnknownLocale(locale.clone()))
892 })
893 .collect()
894}
895
/// Serde default for `RawRecognizerSpec::enabled`: recognizers are enabled
/// unless the rulepack says otherwise.
fn default_true() -> bool {
    true
}
899
/// Base score used when a recognizer gives no `scoring.base` (or no `scoring`
/// table at all).
fn default_base_score() -> f32 {
    0.70
}
903
904#[cfg(test)]
905mod tests {
906 use super::*;
907
    // Minimal rulepack fixture: one `email_headers` locale bucket and a single
    // global email regex recognizer with every optional table present.
    const CORE: &str = r#"
schema_version = "0.1.0"
rulepack_id = "gaze-core"
rulepack_version = "0.4.0"
default_locales = ["global"]

[locale.email_headers]
names = ["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]

[[recognizers]]
id = "email.global"
class = "Email"
enabled = true
locales = ["global"]

[recognizers.match]
kind = "regex"
pattern = '''(?i)\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b'''

[recognizers.context]
exclusions = ["example.com"]

[recognizers.validator]
kind = "email_rfc"

[recognizers.normalizer]
kind = "email_canonical"

[recognizers.scoring]
base = 0.70
priority = 90

[recognizers.token]

[recognizers.source]
origin = "ported"
from = "presidio"
license = "Apache-2.0"
"#;
947
948 #[test]
949 fn parses_core_rulepack_end_to_end() {
950 let rulepack = Rulepack::parse(CORE).expect("core rulepack");
951
952 assert_eq!(rulepack.rulepack_id, "gaze-core");
953 assert_eq!(rulepack.default_locales, vec![LocaleTag::Global]);
954 let header_names = &rulepack
955 .locale
956 .as_ref()
957 .and_then(|locale| locale.buckets.get("email_headers"))
958 .expect("email headers")
959 .names;
960 assert_eq!(
961 header_names,
962 &vec!["From", "To", "Cc", "Bcc", "Reply-To", "Sender"]
963 );
964 assert_eq!(rulepack.recognizers.len(), 1);
965 let recognizer = &rulepack.recognizers[0];
966 assert_eq!(recognizer.id, "email.global");
967 assert_eq!(recognizer.class, PiiClass::Email);
968 assert_eq!(recognizer.scoring.priority, 90);
969 assert!(matches!(recognizer.matcher, RawMatch::Regex { .. }));
970 }
971
972 #[cfg(feature = "bundled-recognizers")]
973 #[test]
974 fn embedded_core_activated_classes_match_rulepack_classes() {
975 let rulepack = Rulepack::load(RulepackSource::Embedded(
976 gaze_recognizers::embedded("core").expect("core rulepack"),
977 ))
978 .expect("embedded core rulepack");
979
980 assert_eq!(
981 rulepack.activated_classes(),
982 BTreeSet::from([PiiClass::Email, PiiClass::Name])
983 );
984 }
985
986 #[cfg(feature = "bundled-recognizers")]
987 #[test]
988 fn embedded_core_loads_full_name_recognizer_cooperation_matrix() {
989 let rulepack = Rulepack::load(RulepackSource::Embedded(
990 gaze_recognizers::embedded("core").expect("core rulepack"),
991 ))
992 .expect("embedded core rulepack");
993 let name_recognizers = rulepack
994 .recognizers
995 .iter()
996 .filter(|recognizer| recognizer.class == PiiClass::Name)
997 .collect::<Vec<_>>();
998
999 assert_eq!(name_recognizers.len(), 5);
1000 for recognizer in &name_recognizers {
1001 for peer in &name_recognizers {
1002 if recognizer.id == peer.id {
1003 continue;
1004 }
1005 assert!(
1006 recognizer.cooperates_with.contains(&peer.id),
1007 "{} missing cooperates_with {}",
1008 recognizer.id,
1009 peer.id
1010 );
1011 }
1012 }
1013 }
1014
1015 #[cfg(feature = "bundled-recognizers")]
1016 #[test]
1017 fn embedded_core_extended_activated_classes_match_rulepack_classes() {
1018 let rulepack = Rulepack::load(RulepackSource::Embedded(
1019 gaze_recognizers::embedded("core-extended").expect("core-extended rulepack"),
1020 ))
1021 .expect("embedded core-extended rulepack");
1022
1023 assert_eq!(
1024 rulepack.activated_classes(),
1025 BTreeSet::from([
1026 PiiClass::custom("phone"),
1027 PiiClass::custom("iban"),
1028 PiiClass::custom("credit_card"),
1029 PiiClass::custom("ip_address"),
1030 PiiClass::custom("eth_address"),
1031 PiiClass::custom("postal_code"),
1032 ])
1033 );
1034 }
1035
1036 #[cfg(feature = "bundled-recognizers")]
1037 #[test]
1038 fn activated_classes_include_new_rulepack_recognizer_class() {
1039 let raw = format!(
1040 r#"{}
1041
1042[[recognizers]]
1043id = "test.only"
1044class = "custom:test_only"
1045enabled = true
1046locales = ["global"]
1047
1048[recognizers.match]
1049kind = "regex"
1050pattern = "TEST_ONLY"
1051
1052[recognizers.scoring]
1053base = 0.70
1054priority = 1
1055"#,
1056 gaze_recognizers::embedded("core-extended").expect("core-extended rulepack")
1057 );
1058 let rulepack = Rulepack::parse(&raw).expect("core-extended with synthetic recognizer");
1059
1060 assert!(
1061 rulepack
1062 .activated_classes()
1063 .contains(&PiiClass::custom("test_only")),
1064 "new recognizer class must be derived from rulepack data"
1065 );
1066 }
1067
1068 #[test]
1069 fn rulepack_accepts_token_family() {
1070 let rulepack = Rulepack::parse(&unsupported_field_rulepack(
1071 "[recognizers.token]\nfamily = \"email.formatpreserve\"\n",
1072 ))
1073 .expect("token family is active in v0.4.1");
1074
1075 assert_eq!(
1076 rulepack.recognizers[0].token.family.as_deref(),
1077 Some("email.formatpreserve")
1078 );
1079 }
1080
1081 #[test]
1082 fn rulepack_rejects_unsupported_token_format() {
1083 let err = Rulepack::parse(&unsupported_field_rulepack(
1084 "[recognizers.token]\nformat = \"Customer_{n}\"\n",
1085 ))
1086 .expect_err("token format is reserved for v0.4.1");
1087
1088 assert_unsupported_field(err, "token.format");
1089 }
1090
1091 #[test]
1092 fn rulepack_rejects_unsupported_context_hotwords() {
1093 let err = Rulepack::parse(&unsupported_field_rulepack(
1094 "[recognizers.context]\nhotwords = [\"foo\"]\n",
1095 ))
1096 .expect_err("context hotwords are reserved for v0.4.1");
1097
1098 assert_unsupported_field(err, "context.hotwords");
1099 }
1100
1101 #[test]
1102 fn rulepack_rejects_unsupported_context_boost() {
1103 let err = Rulepack::parse(&unsupported_field_rulepack(
1104 "[recognizers.context]\nboost = 0.10\n",
1105 ))
1106 .expect_err("context boost is reserved for v0.4.1");
1107
1108 assert_unsupported_field(err, "context.boost");
1109 }
1110
1111 #[test]
1112 fn rulepack_rejects_unsupported_context_window() {
1113 let err = Rulepack::parse(&unsupported_field_rulepack(
1114 "[recognizers.context]\nwindow = 12\n",
1115 ))
1116 .expect_err("context window is reserved for v0.4.1");
1117
1118 assert_unsupported_field(err, "context.window");
1119 }
1120
1121 #[test]
1122 fn rulepack_accepts_default_token_fields() {
1123 let rulepack = Rulepack::parse(CORE).expect("reserved token/context fields are unset");
1124 let recognizer = &rulepack.recognizers[0];
1125
1126 assert_eq!(recognizer.token.family, None);
1127 assert_eq!(recognizer.token.format, None);
1128 assert!(recognizer.context.as_ref().unwrap().hotwords.is_empty());
1129 assert_eq!(recognizer.context.as_ref().unwrap().boost, None);
1130 assert_eq!(recognizer.context.as_ref().unwrap().window, None);
1131 }
1132
1133 #[test]
1134 fn pattern_template_with_pattern_both_present_fails_closed() {
1135 let err = Rulepack::parse(&unsupported_field_rulepack(
1136 "pattern_template = \"{locale_email_headers}: (.+)\"\n",
1137 ))
1138 .expect_err("pattern and pattern_template are mutually exclusive");
1139
1140 assert!(matches!(
1141 err,
1142 RulepackError::RegexPatternChoice { id } if id == "bad.email"
1143 ));
1144 }
1145
1146 #[test]
1147 fn regex_pattern_or_template_is_required() {
1148 let raw = r#"
1149schema_version = "0.1.0"
1150rulepack_id = "bad"
1151rulepack_version = "0.4.0"
1152default_locales = ["global"]
1153
1154[[recognizers]]
1155id = "bad.email"
1156class = "Email"
1157enabled = true
1158
1159[recognizers.match]
1160kind = "regex"
1161"#;
1162 let err = Rulepack::parse(raw).expect_err("regex pattern is required");
1163
1164 assert!(matches!(
1165 err,
1166 RulepackError::RegexPatternChoice { id } if id == "bad.email"
1167 ));
1168 }
1169
1170 #[test]
1171 fn anchored_match_accepts_valid_schema() {
1172 let rulepack = Rulepack::parse(&anchored_match_rulepack("")).expect("anchored_match");
1173 assert!(matches!(
1174 rulepack.recognizers[0].matcher,
1175 RawMatch::AnchoredMatch { .. }
1176 ));
1177 }
1178
1179 #[test]
1180 fn anchored_match_rejects_unknown_boundary() {
1181 let err = Rulepack::parse(&anchored_match_rulepack("boundary = \"paragraph\"\n"))
1182 .expect_err("unknown boundary fails closed");
1183
1184 assert_unsupported_anchored_match(err, "boundary", "paragraph");
1185 }
1186
/// Expects a `name_shape` value of `"organization"` to be reported as an
/// unsupported anchored-match setting.
#[test]
fn anchored_match_rejects_unknown_name_shape() {
    let toml = anchored_match_rulepack("name_shape = \"organization\"\n");
    let outcome = Rulepack::parse(&toml);
    let err = outcome.expect_err("unknown name_shape fails closed");

    assert_unsupported_anchored_match(err, "name_shape", "organization");
}
1194
/// Expects a `cue_position` value of `"around"` to be reported as an
/// unsupported anchored-match setting.
#[test]
fn anchored_match_rejects_unknown_cue_position() {
    let toml = anchored_match_rulepack("cue_position = \"around\"\n");
    let outcome = Rulepack::parse(&toml);
    let err = outcome.expect_err("unknown cue_position fails closed");

    assert_unsupported_anchored_match(err, "cue_position", "around");
}
1202
/// Expects an empty `cues_bucket` string to be reported as an unsupported
/// anchored-match setting.
///
/// Note that the key is present but empty here; it is not omitted.
#[test]
fn anchored_match_rejects_missing_cues_bucket() {
    let toml = anchored_match_rulepack("cues_bucket = \"\"\n");
    let outcome = Rulepack::parse(&toml);
    let err = outcome.expect_err("missing cues_bucket fails closed");

    assert_unsupported_anchored_match(err, "cues_bucket", "");
}
1210
/// Expects a locale cue containing `...` to be rejected, with the error
/// pointing at `locale.forward_markers.names` and quoting the cue text.
#[test]
fn anchored_match_rejects_ellipsis_in_cue_values() {
    // The recognizer itself uses the usual fixture values; only the locale
    // bucket it refers to carries the offending cue.
    let toml = r#"
schema_version = "0.1.0"
rulepack_id = "anchored"
rulepack_version = "0.6.0"
default_locales = ["global"]

[locale.forward_markers]
names = ["Forwarded ... message"]

[[recognizers]]
id = "name.forward_marker"
class = "Name"
enabled = true

[recognizers.match]
kind = "anchored_match"
cues_bucket = "forward_markers"
boundary = "punctuation"
right_window_chars = 64
name_shape = "person_name"
cue_position = "before"
"#;
    let err = Rulepack::parse(toml).expect_err("ellipsis cue fails closed");

    assert_unsupported_anchored_match(
        err,
        "locale.forward_markers.names",
        "Forwarded ... message",
    );
}
1245
/// Expects `right_window_chars` values of `0` and `513` to be rejected, with
/// the error echoing the offending value.
///
/// NOTE(review): these are presumably the first values outside the accepted
/// range on each side; confirm against the validator.
#[test]
fn anchored_match_rejects_invalid_window_bounds() {
    // The reported value is expected to be the same text that was written
    // into the TOML, so one string serves as both input and expectation.
    for value in ["0", "513"] {
        let override_line = format!("right_window_chars = {value}\n");
        let toml = anchored_match_rulepack(&override_line);
        let err = Rulepack::parse(&toml).expect_err("invalid right_window_chars fails closed");

        assert_unsupported_anchored_match(err, "right_window_chars", value);
    }
}
1257
/// Expects two enabled `Name` recognizers with no `cooperates_with` entry to
/// be rejected with `SameClassWithoutCooperation`, naming both recognizers
/// in declaration order.
#[test]
fn rulepack_load_fails_when_two_name_recognizers_omit_cooperates_with() {
    let toml = r#"
schema_version = "0.1.0"
rulepack_id = "bad-composition"
rulepack_version = "0.4.1"
default_locales = ["global"]

[[recognizers]]
id = "email.header.name"
class = "Name"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "From: ([A-Z][a-z]+)"

[[recognizers]]
id = "salutation.name"
class = "Name"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "Dear ([A-Z][a-z]+)"
"#;
    let err = Rulepack::parse(toml).expect_err("same-class recognizers must explicitly cooperate");

    assert!(matches!(
        err,
        RulepackError::SameClassWithoutCooperation {
            class: PiiClass::Name,
            recognizer_a,
            recognizer_b,
        } if recognizer_a == "email.header.name" && recognizer_b == "salutation.name"
    ));
}
1297
/// Expects two `Name` recognizers to load when the first one lists the
/// second in `cooperates_with`, and that list to be preserved on the parsed
/// recognizer.
#[test]
fn rulepack_load_accepts_same_class_pair_with_cooperates_with() {
    let toml = r#"
schema_version = "0.1.0"
rulepack_id = "cooperating-composition"
rulepack_version = "0.4.1"
default_locales = ["global"]

[[recognizers]]
id = "email.header.name"
class = "Name"
cooperates_with = ["salutation.name"]
enabled = true

[recognizers.match]
kind = "regex"
pattern = "From: ([A-Z][a-z]+)"

[[recognizers]]
id = "salutation.name"
class = "Name"
enabled = true

[recognizers.match]
kind = "regex"
pattern = "Dear ([A-Z][a-z]+)"
"#;
    let rulepack = Rulepack::parse(toml).expect("cooperates_with unblocks same-class recognizers");

    // Check the count before indexing so a short list fails on the clearer assertion.
    let recognizers = &rulepack.recognizers;
    assert_eq!(recognizers.len(), 2);
    assert_eq!(recognizers[0].cooperates_with, vec!["salutation.name"]);
}
1335
/// Expects an unknown top-level key to surface as a `Toml` error whose
/// rendered message mentions the key name.
#[test]
fn rejects_unknown_fields_with_parent_table_context() {
    let toml = r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]
bogus = true
"#;
    let err = Rulepack::parse(toml).expect_err("unknown field must fail");

    assert!(matches!(err, RulepackError::Toml(_)));
    let rendered = err.to_string();
    assert!(rendered.contains("bogus"));
}
1352
/// Expects a rulepack declaring `schema_version = "0.2.0"` to be rejected
/// with a `SchemaVersion` error.
#[test]
fn rejects_unsupported_schema_version() {
    let toml = r#"
schema_version = "0.2.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
"#;
    let outcome = Rulepack::parse(toml);
    let err = outcome.expect_err("unsupported schema");

    assert!(matches!(err, RulepackError::SchemaVersion { .. }));
}
1366
/// Expects `parse_class` to accept the built-in `Email` spelling and a
/// `custom:` name, the latter producing a lower-cased `Custom` class.
#[test]
fn class_spelling_accepts_pascal_case_and_custom_names() {
    let builtin = parse_class("Email").unwrap();
    assert_eq!(builtin, PiiClass::Email);

    let custom = parse_class("custom:Class_Alpha").unwrap();
    let expected_custom = PiiClass::Custom("class_alpha".to_string());
    assert_eq!(custom, expected_custom);
}
1375
/// Builds a rulepack TOML document with one regex recognizer (`bad.email`)
/// and appends `extra` after the `[recognizers.match]` table body.
///
/// `extra` is inserted verbatim after a blank line and followed by a single
/// newline. No table header comes between, so any keys in it are written
/// directly beneath the `[recognizers.match]` entries.
fn unsupported_field_rulepack(extra: &str) -> String {
    // Everything up to and including the blank line that precedes `extra`.
    const PREAMBLE: &str = r#"
schema_version = "0.1.0"
rulepack_id = "bad"
rulepack_version = "0.4.0"
default_locales = ["global"]

[[recognizers]]
id = "bad.email"
class = "Email"
enabled = true

[recognizers.match]
kind = "regex"
pattern = ".+"

"#;
    [PREAMBLE, extra, "\n"].concat()
}
1397
/// Builds a rulepack TOML document with one `anchored_match` recognizer,
/// optionally replacing one of its five matcher lines.
///
/// `override_line` is a complete `key = value\n` line. If it starts with one
/// of the matcher keys, it replaces that key's default line verbatim; an
/// empty string or a line starting with anything else leaves all defaults
/// untouched.
fn anchored_match_rulepack(override_line: &str) -> String {
    // Matcher keys in emission order, each paired with its default line.
    const DEFAULT_LINES: [(&str, &str); 5] = [
        ("cues_bucket", "cues_bucket = \"forward_markers\"\n"),
        ("boundary", "boundary = \"punctuation\"\n"),
        ("right_window_chars", "right_window_chars = 64\n"),
        ("name_shape", "name_shape = \"person_name\"\n"),
        ("cue_position", "cue_position = \"before\"\n"),
    ];

    let match_fields: String = DEFAULT_LINES
        .iter()
        .map(|&(key, default_line)| {
            if override_line.starts_with(key) {
                override_line
            } else {
                default_line
            }
        })
        .collect();

    format!(
        r#"
schema_version = "0.1.0"
rulepack_id = "anchored"
rulepack_version = "0.6.0"
default_locales = ["global"]

[[recognizers]]
id = "name.forward_marker"
class = "Name"
enabled = true

[recognizers.match]
kind = "anchored_match"
{match_fields}
"#
    )
}
1442
1443 fn assert_unsupported_field(err: RulepackError, field: &str) {
1444 assert!(matches!(
1445 err,
1446 RulepackError::UnsupportedField {
1447 field: ref actual,
1448 planned_version: "v0.4.1",
1449 } if actual == field
1450 ));
1451 }
1452
1453 fn assert_unsupported_anchored_match(err: RulepackError, field: &str, value: &str) {
1454 assert!(matches!(
1455 err,
1456 RulepackError::UnsupportedAnchoredMatch {
1457 field: ref actual_field,
1458 value: ref actual_value,
1459 } if actual_field == field && actual_value == value
1460 ));
1461 }
1462}