1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use sha3::{Digest, Keccak256};
10use thiserror::Error;
11
/// A pluggable detector that finds PII in text.
///
/// Implementors must be `Send + Sync`.
pub trait Detector: Send + Sync {
    /// Scans `input` and returns every [`Detection`] found in it.
    fn detect(&self, input: &str) -> Vec<Detection>;
}
17
/// Category of personally identifiable information.
///
/// The derived `Ord`/`PartialOrd` follow the declaration order of the variants.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Built-in e-mail class.
    Email,
    /// Built-in name class.
    Name,
    /// Built-in location class.
    Location,
    /// Built-in organization class.
    Organization,
    /// Caller-defined class identified by its name (see [`PiiClass::custom`]
    /// for the normalizing constructor).
    Custom(String),
}
58
/// Display names of the built-in classes.
///
/// [`PiiClass::class_name`] indexes this slice positionally
/// (`Email`, `Name`, `Location`, `Organization`), so the order must not change.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
61
/// Reserved family identifiers.
///
/// NOTE(review): presumably these are the collision families claimed by the
/// bundled recognizers (compare [`CollisionMembership::family`]) — confirm
/// against the code that consumes this list.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
80
/// Describes membership in a collision family.
///
/// NOTE(review): the exact resolution semantics of these fields are defined by
/// the consumer of this type, which is not visible here.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Family identifier.
    pub family: String,
    /// Variant identifier within the family.
    pub variant: String,
    /// Precedence value (presumably used to rank competing members — confirm
    /// the ordering direction with the resolver).
    pub precedence: u32,
    /// Optional anchor; `None` when no anchor is mandated.
    pub mandatory_anchor: Option<String>,
}
94
95impl CollisionMembership {
96 pub fn new(
98 family: impl Into<String>,
99 variant: impl Into<String>,
100 precedence: u32,
101 mandatory_anchor: Option<String>,
102 ) -> Self {
103 Self {
104 family: family.into(),
105 variant: variant.into(),
106 precedence,
107 mandatory_anchor,
108 }
109 }
110}
111
112impl PiiClass {
113 pub fn from_policy_name(input: &str) -> Option<Self> {
115 match input {
116 "email" => Some(Self::Email),
117 "name" => Some(Self::Name),
118 "location" => Some(Self::Location),
119 "organization" => Some(Self::Organization),
120 custom if custom.starts_with("custom:") => {
121 let name = custom.trim_start_matches("custom:");
122 (!name.trim().is_empty()).then(|| Self::custom(name))
123 }
124 _ => None,
125 }
126 }
127
128 pub fn builtin_variants() -> &'static [PiiClass] {
130 &[
131 PiiClass::Email,
132 PiiClass::Name,
133 PiiClass::Location,
134 PiiClass::Organization,
135 ]
136 }
137
138 pub fn custom(name: &str) -> Self {
140 let mut normalized = String::new();
141 let mut pending_underscore = false;
142 for ch in name.trim().chars() {
143 if ch.is_ascii_alphanumeric() {
144 if pending_underscore && !normalized.is_empty() {
145 normalized.push('_');
146 }
147 normalized.push(ch.to_ascii_lowercase());
148 pending_underscore = false;
149 } else {
150 pending_underscore = true;
151 }
152 }
153
154 Self::Custom(normalized)
155 }
156
157 pub fn as_custom_name(&self) -> Option<&str> {
159 match self {
160 Self::Custom(name) => Some(name.as_str()),
161 Self::Email | Self::Name | Self::Location | Self::Organization => None,
162 }
163 }
164
165 pub fn class_name(&self) -> String {
167 match self {
168 Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
169 Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
170 Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
171 Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
172 Self::Custom(name) => format!("Custom:{name}"),
173 }
174 }
175
176 pub fn to_canonical_str(&self) -> String {
178 match self {
179 Self::Email => "email".to_string(),
180 Self::Name => "name".to_string(),
181 Self::Location => "location".to_string(),
182 Self::Organization => "organization".to_string(),
183 Self::Custom(name) => format!("custom:{name}"),
184 }
185 }
186
187 pub fn from_canonical_str(value: &str) -> Option<Self> {
189 match value {
190 "email" | "Email" => Some(Self::Email),
191 "name" | "Name" => Some(Self::Name),
192 "location" | "Location" => Some(Self::Location),
193 "organization" | "Organization" => Some(Self::Organization),
194 custom if custom.starts_with("custom:") => {
195 let name = &custom["custom:".len()..];
196 (!name.is_empty()).then(|| Self::Custom(name.to_string()))
197 }
198 _ => None,
199 }
200 }
201}
202
/// Wrapper around [`PiiClass`] that serializes as the canonical string form
/// (see the `Serialize`/`Deserialize` implementations for this type).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct PiiClassAudit(pub PiiClass);
211
212impl PiiClassAudit {
213 pub fn new(class: PiiClass) -> Self {
215 Self(class)
216 }
217
218 pub fn into_inner(self) -> PiiClass {
220 self.0
221 }
222}
223
224impl Serialize for PiiClassAudit {
225 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
226 where
227 S: serde::Serializer,
228 {
229 serializer.serialize_str(&self.0.to_canonical_str())
230 }
231}
232
233impl<'de> Deserialize<'de> for PiiClassAudit {
234 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
235 where
236 D: serde::Deserializer<'de>,
237 {
238 let value = String::deserialize(deserializer)?;
239 PiiClass::from_canonical_str(&value)
240 .map(Self)
241 .ok_or_else(|| {
242 serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
243 })
244 }
245}
246
247mod pii_class_audit_serde {
248 use super::{PiiClass, PiiClassAudit};
249 use serde::{Deserialize, Deserializer, Serialize, Serializer};
250
251 pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
252 where
253 S: Serializer,
254 {
255 PiiClassAudit::new(class.clone()).serialize(serializer)
256 }
257
258 pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
259 where
260 D: Deserializer<'de>,
261 {
262 Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
263 }
264}
265
/// A candidate classification that was not selected.
///
/// NOTE(review): "losing" is inferred from the type name and from its use in
/// [`AmbiguityRecord::losing_candidates`] — confirm with the resolver.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LosingCandidate {
    /// Class of the candidate; serialized in canonical string form.
    #[serde(with = "pii_class_audit_serde")]
    pub class: PiiClass,
    /// Identifier of the recognizer that produced the candidate.
    pub recognizer_id: String,
}
276
277impl LosingCandidate {
278 pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
280 Self {
281 class,
282 recognizer_id: recognizer_id.into(),
283 }
284 }
285}
286
/// Record of an ambiguous classification.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct AmbiguityRecord {
    /// Class associated with the ambiguity; serialized in canonical string form.
    #[serde(with = "pii_class_audit_serde")]
    pub ambiguity_class: PiiClass,
    /// Candidates recorded alongside the ambiguity.
    pub losing_candidates: Vec<LosingCandidate>,
    /// Why the classification was ambiguous.
    pub reason: AmbiguityReason,
}
301
302impl AmbiguityRecord {
303 pub fn new(
305 ambiguity_class: PiiClass,
306 losing_candidates: Vec<LosingCandidate>,
307 reason: AmbiguityReason,
308 ) -> Self {
309 Self {
310 ambiguity_class,
311 losing_candidates,
312 reason,
313 }
314 }
315}
316
/// Reason attached to an [`AmbiguityRecord`].
///
/// Serialized in `snake_case` (for example `no_anchor`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// No anchor was available.
    NoAnchor,
    /// A validator could not decide.
    ValidatorIndeterminate,
    /// More than one family matched.
    MultiFamilyMatch,
    /// Candidates tied on precedence.
    PrecedenceTie,
}
331
/// Why a validator rejected its input; one variant per [`ValidatorKind`]
/// failure mode (see [`ValidatorKind::fail_reason`]).
///
/// Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    /// Luhn checksum failed.
    LuhnFailed,
    /// IBAN mod-97 checksum failed.
    IbanMod97Failed,
    /// E-mail check rejected the input. Also deserializes from the alias
    /// `email_rfc_failed`.
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    /// E.164 phone check rejected the input. Also deserializes from the alias
    /// `e164_phone_failed`.
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    /// Region-specific phone validation failed.
    PhoneNationalRegionMismatch,
    /// Input did not parse as an IPv4 address.
    Ipv4ParseFailed,
    /// Input did not parse as an IPv6 address.
    Ipv6ParseFailed,
    /// Ethereum EIP-55 checksum failed.
    EthEip55ChecksumFailed,
    /// Aadhaar Verhoeff checksum failed.
    AadhaarVerhoeffFailed,
    /// French NIR mod-97 key check failed.
    FrNirMod97Failed,
    /// German Steuer-ID mod-11,10 check failed.
    DeSteuerIdMod1110Failed,
    /// BSN mod-11 check failed.
    BsnMod11Failed,
    /// CPF mod-11 check failed.
    CpfMod11Failed,
    /// CNPJ mod-11 check failed.
    CnpjMod11Failed,
    /// UK NHS number mod-11 check failed.
    UkNhsMod11Failed,
}
370
/// Result of running a validator.
///
/// Serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// The input was accepted; `canonical_form` optionally carries a
    /// canonical rendering of it.
    Pass { canonical_form: Option<String> },
    /// The input was rejected for `reason`.
    Fail { reason: ValidatorFailReason },
    /// The validator does not apply to the input.
    NotApplicable,
}
383
/// Error returned by [`ValidatorKind::parse`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    /// The given validator name is not recognized in this build.
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        /// The name that failed to parse.
        kind: String,
    },
}
395
/// The validators that can be applied to a candidate string.
///
/// The phone variants exist only when the `phone-parser` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    /// Basic e-mail shape check.
    EmailRfc,
    /// International phone number check (`phone-parser` feature).
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// Phone number check relative to a specific [`Region`]
    /// (`phone-parser` feature).
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    /// Luhn checksum.
    Luhn,
    /// IBAN mod-97 checksum.
    IbanMod97,
    /// IPv4 address parse.
    Ipv4Parse,
    /// IPv6 address parse.
    Ipv6Parse,
    /// Ethereum address with EIP-55 checksum.
    EthEip55,
    /// 12-digit Aadhaar number with Verhoeff checksum.
    AadhaarVerhoeff,
    /// 15-digit French NIR with mod-97 key.
    FrNirMod97,
    /// 11-digit German Steuer-ID with mod-11,10 check digit.
    DeSteuerIdMod1110,
    /// 9-digit BSN with mod-11 check.
    BsnMod11,
    /// 11-digit CPF with mod-11 check digits.
    CpfMod11,
    /// 14-digit CNPJ with mod-11 check digits.
    CnpjMod11,
    /// 10-digit UK NHS number with mod-11 check digit.
    UkNhsMod11,
}
433
/// Regions supported by [`ValidatorKind::E164PhoneNational`].
///
/// Only available with the `phone-parser` feature.
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    /// Germany (calling code 49 in `validate_phone_national`).
    De,
    /// United States (calling code 1 in `validate_phone_national`).
    Us,
}
444
impl ValidatorKind {
    /// Parses a validator name (for example `"luhn"` or `"iban_mod97"`).
    ///
    /// The phone names are recognized only when the `phone-parser` feature is
    /// enabled; without it they fall through to the error arm.
    ///
    /// # Errors
    ///
    /// Returns [`ValidatorKindParseError::UnsupportedValidator`] carrying the
    /// unrecognized name.
    pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
        match s {
            "email_rfc" => Ok(Self::EmailRfc),
            #[cfg(feature = "phone-parser")]
            "e164_phone" => Ok(Self::E164Phone),
            #[cfg(feature = "phone-parser")]
            "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
            #[cfg(feature = "phone-parser")]
            "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
            "luhn" => Ok(Self::Luhn),
            "iban_mod97" => Ok(Self::IbanMod97),
            "ipv4_parse" => Ok(Self::Ipv4Parse),
            "ipv6_parse" => Ok(Self::Ipv6Parse),
            "eth_eip55" => Ok(Self::EthEip55),
            "aadhaar_verhoeff" => Ok(Self::AadhaarVerhoeff),
            "fr_nir_mod97" => Ok(Self::FrNirMod97),
            "de_steuer_id_mod1110" => Ok(Self::DeSteuerIdMod1110),
            "bsn_mod11" => Ok(Self::BsnMod11),
            "cpf_mod11" => Ok(Self::CpfMod11),
            "cnpj_mod11" => Ok(Self::CnpjMod11),
            "uk_nhs_mod11" => Ok(Self::UkNhsMod11),
            other => Err(ValidatorKindParseError::UnsupportedValidator {
                kind: other.to_string(),
            }),
        }
    }

    /// Returns `true` when `input` passes this validator.
    ///
    /// The digit-checksum validators call their check function directly, which
    /// avoids building the canonical string; every other kind delegates to
    /// [`ValidatorKind::canonical_form`].
    pub fn validates(self, input: &str) -> bool {
        match self {
            Self::AadhaarVerhoeff => aadhaar_verhoeff_check(input),
            Self::FrNirMod97 => fr_nir_mod97_check(input),
            Self::DeSteuerIdMod1110 => de_steuer_id_mod1110_check(input),
            Self::BsnMod11 => bsn_mod11_check(input),
            Self::CpfMod11 => cpf_mod11_check(input),
            Self::CnpjMod11 => cnpj_mod11_check(input),
            Self::UkNhsMod11 => uk_nhs_mod11_check(input),
            _ => self.canonical_form(input).is_some(),
        }
    }

    /// Validates `input` and reports the outcome.
    ///
    /// Returns `Pass` with the canonical form when
    /// [`ValidatorKind::canonical_form`] succeeds, otherwise `Fail` with
    /// [`ValidatorKind::fail_reason`]. This method never returns
    /// `NotApplicable`.
    pub fn validate(self, input: &str) -> ValidatorOutcome {
        match self.canonical_form(input) {
            Some(canonical_form) => ValidatorOutcome::Pass {
                canonical_form: Some(canonical_form),
            },
            None => ValidatorOutcome::Fail {
                reason: self.fail_reason(),
            },
        }
    }

    /// Returns the canonical form of `input`, or `None` when validation fails.
    ///
    /// For e-mail, E.164 phone, Luhn, IBAN, IP and Ethereum validators the
    /// canonical form is the input itself, unchanged. National phone numbers
    /// are produced by `validate_phone_national`. The digit-checksum
    /// validators return the bare digit string with separators removed.
    pub fn canonical_form(self, input: &str) -> Option<String> {
        match self {
            Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
            #[cfg(feature = "phone-parser")]
            Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
            #[cfg(feature = "phone-parser")]
            Self::E164PhoneNational(region) => validate_phone_national(region, input),
            Self::Luhn => luhn_check(input).then(|| input.to_string()),
            Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
            Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
            Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
            Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
            Self::AadhaarVerhoeff => {
                canonical_ascii_digits::<12>(input).filter(|_| aadhaar_verhoeff_check(input))
            }
            Self::FrNirMod97 => {
                canonical_ascii_digits::<15>(input).filter(|_| fr_nir_mod97_check(input))
            }
            Self::DeSteuerIdMod1110 => {
                canonical_ascii_digits::<11>(input).filter(|_| de_steuer_id_mod1110_check(input))
            }
            Self::BsnMod11 => canonical_ascii_digits::<9>(input).filter(|_| bsn_mod11_check(input)),
            Self::CpfMod11 => {
                canonical_ascii_digits::<11>(input).filter(|_| cpf_mod11_check(input))
            }
            Self::CnpjMod11 => {
                canonical_ascii_digits::<14>(input).filter(|_| cnpj_mod11_check(input))
            }
            Self::UkNhsMod11 => {
                canonical_ascii_digits::<10>(input).filter(|_| uk_nhs_mod11_check(input))
            }
        }
    }

    /// Returns the [`ValidatorFailReason`] reported when this validator
    /// rejects an input.
    pub fn fail_reason(self) -> ValidatorFailReason {
        match self {
            Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
            #[cfg(feature = "phone-parser")]
            Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
            #[cfg(feature = "phone-parser")]
            Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
            Self::Luhn => ValidatorFailReason::LuhnFailed,
            Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
            Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
            Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
            Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
            Self::AadhaarVerhoeff => ValidatorFailReason::AadhaarVerhoeffFailed,
            Self::FrNirMod97 => ValidatorFailReason::FrNirMod97Failed,
            Self::DeSteuerIdMod1110 => ValidatorFailReason::DeSteuerIdMod1110Failed,
            Self::BsnMod11 => ValidatorFailReason::BsnMod11Failed,
            Self::CpfMod11 => ValidatorFailReason::CpfMod11Failed,
            Self::CnpjMod11 => ValidatorFailReason::CnpjMod11Failed,
            Self::UkNhsMod11 => ValidatorFailReason::UkNhsMod11Failed,
        }
    }
}
558
/// Minimal e-mail shape check.
///
/// Splits at the first `@` and requires a non-empty local part and a domain
/// that contains a `.` but neither starts nor ends with one.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let dot_at_edge = domain.starts_with('.') || domain.ends_with('.');
            !local.is_empty() && domain.contains('.') && !dot_at_edge
        }
    }
}
565
/// Returns `true` when `input` parses without a default region and the parsed
/// number is reported valid by `phonenumber`.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    match phonenumber::parse(None, input) {
        Ok(phone) => phonenumber::is_valid(&phone),
        Err(_) => false,
    }
}
570
/// Validates `input` as a phone number of `region` and returns it in E.164
/// format.
///
/// The number is parsed with the region as default country and must carry the
/// region's calling code. It is accepted when `phonenumber` considers it valid
/// or when it is one of the known fixture numbers
/// (see `is_safe_fixture_phone`).
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let (country, expected_code) = match region {
        Region::De => (phonenumber::country::DE, 49),
        Region::Us => (phonenumber::country::US, 1),
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    if number.country().code() != expected_code {
        return None;
    }
    // `||` keeps the fixture lookup lazy, exactly like the validity check order.
    let accepted = number.is_valid() || is_safe_fixture_phone(region, input);
    accepted.then(|| number.format().mode(phonenumber::Mode::E164).to_string())
}
590
/// Returns `true` when the digits of `input` form one of the allow-listed
/// fixture phone numbers for `region`.
///
/// For the US this is `15550100`, or `1` followed by ten digits whose last
/// seven start with `55501`. For Germany it is a fixed list of numbers.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    let digits: String = input.chars().filter(|ch| ch.is_ascii_digit()).collect();
    match region {
        Region::Us => {
            if digits == "15550100" {
                return true;
            }
            match digits.strip_prefix('1') {
                // `rest` holds only ASCII digits, so byte slicing is safe.
                Some(rest) => rest.len() == 10 && rest[3..].starts_with("55501"),
                None => false,
            }
        }
        Region::De => {
            const FIXTURES: [&str; 6] = [
                "493000000000",
                "4915100000000",
                "4915550112233",
                "015550112233",
                "491710000000",
                "01710000000",
            ];
            FIXTURES.contains(&digits.as_str())
        }
    }
}
613
/// Luhn checksum over 13 to 19 digits.
///
/// ASCII whitespace and `-` are ignored as separators; any other non-digit
/// byte makes the check fail.
fn luhn_check(input: &str) -> bool {
    let mut digits: Vec<u8> = Vec::new();
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(byte - b'0'),
            b'-' => {}
            other if other.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    if digits.len() < 13 || digits.len() > 19 {
        return false;
    }

    // Walk from the rightmost digit, doubling every second one.
    let mut total = 0u32;
    let mut double = false;
    for digit in digits.iter().rev() {
        let mut value = u32::from(*digit);
        if double {
            value *= 2;
            if value > 9 {
                value -= 9;
            }
        }
        total += value;
        double = !double;
    }
    total % 10 == 0
}
646
647fn iban_mod97_check(input: &str) -> bool {
648 let canonical = iban_canonicalize(input);
649 if !(15..=34).contains(&canonical.len()) {
650 return false;
651 }
652 if !canonical.chars().all(|ch| ch.is_ascii_alphanumeric()) {
653 return false;
654 }
655
656 let mut remainder = 0u32;
657 for ch in canonical[4..].chars().chain(canonical[..4].chars()) {
658 match ch {
659 '0'..='9' => {
660 remainder = (remainder * 10 + ch.to_digit(10).expect("digit")) % 97;
661 }
662 'A'..='Z' => {
663 let value = u32::from(ch) - u32::from('A') + 10;
664 remainder = (remainder * 10 + value / 10) % 97;
665 remainder = (remainder * 10 + value % 10) % 97;
666 }
667 _ => return false,
668 }
669 }
670 remainder == 1
671}
672
/// Canonicalizes an IBAN candidate: removes ASCII whitespace and uppercases
/// ASCII letters.
///
/// Only ASCII case mapping is applied. Unicode uppercasing would turn
/// non-ASCII characters into ASCII letters (`ſ` into `S`, `ı` into `I`, `ß`
/// into `SS`), letting text that is not an IBAN pass the later
/// ASCII-alphanumeric check. Non-ASCII characters are therefore kept as they
/// are so that the caller rejects them.
fn iban_canonicalize(input: &str) -> String {
    input
        .chars()
        .filter(|ch| !ch.is_ascii_whitespace())
        .map(|ch| ch.to_ascii_uppercase())
        .collect()
}
680
/// Returns `true` when `input` parses as a `std::net::Ipv4Addr`.
fn ipv4_parse_check(input: &str) -> bool {
    matches!(input.parse::<std::net::Ipv4Addr>(), Ok(_))
}
684
/// Returns `true` when `input` parses as a `std::net::Ipv6Addr`.
fn ipv6_parse_check(input: &str) -> bool {
    matches!(input.parse::<std::net::Ipv6Addr>(), Ok(_))
}
688
689fn eth_eip55_check(input: &str) -> bool {
690 let Some(address) = input.strip_prefix("0x") else {
691 return false;
692 };
693 if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
694 return false;
695 }
696 if address
697 .bytes()
698 .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
699 || address
700 .bytes()
701 .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
702 {
703 return true;
704 }
705
706 let lowercase = address.to_ascii_lowercase();
707 let hash = Keccak256::digest(lowercase.as_bytes());
708 for (index, byte) in address.bytes().enumerate() {
709 if byte.is_ascii_digit() {
710 continue;
711 }
712 let hash_nibble = if index % 2 == 0 {
713 hash[index / 2] >> 4
714 } else {
715 hash[index / 2] & 0x0f
716 };
717 if (hash_nibble > 7) != byte.is_ascii_uppercase() {
718 return false;
719 }
720 }
721 true
722}
723
/// Extracts exactly `N` ASCII digits from `input` as numeric values.
///
/// Space, tab, newline, carriage return, `-`, `.` and `/` are skipped as
/// separators. Returns `None` when any other byte occurs or when the number
/// of digits is not exactly `N`.
fn collect_ascii_digits<const N: usize>(input: &str) -> Option<[u8; N]> {
    let mut out = [0u8; N];
    let mut filled = 0usize;
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => {
                // `get_mut` fails once more than `N` digits have been seen.
                *out.get_mut(filled)? = byte - b'0';
                filled += 1;
            }
            b' ' | b'\t' | b'\n' | b'\r' | b'-' | b'.' | b'/' => {}
            _ => return None,
        }
    }
    (filled == N).then_some(out)
}
742
743fn canonical_ascii_digits<const N: usize>(input: &str) -> Option<String> {
744 let digits = collect_ascii_digits::<N>(input)?;
745 let mut canonical = String::with_capacity(N);
746 for digit in digits {
747 canonical.push(char::from(b'0' + digit));
748 }
749 Some(canonical)
750}
751
/// Returns `true` when `digits` contains at least two different values.
///
/// Arrays with zero or one element have no differing pair and yield `false`.
/// Comparing neighbours with `windows(2)` avoids indexing, which would panic
/// for an empty array.
fn not_all_same<const N: usize>(digits: &[u8; N]) -> bool {
    digits.windows(2).any(|pair| pair[0] != pair[1])
}
755
756fn aadhaar_verhoeff_check(input: &str) -> bool {
757 const D: [[u8; 10]; 10] = [
758 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
759 [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
760 [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
761 [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
762 [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
763 [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
764 [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
765 [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
766 [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
767 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
768 ];
769 const P: [[u8; 10]; 8] = [
770 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
771 [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
772 [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
773 [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
774 [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
775 [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
776 [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
777 [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
778 ];
779 let Some(digits) = collect_ascii_digits::<12>(input) else {
780 return false;
781 };
782 if digits[0] < 2 || !not_all_same(&digits) {
783 return false;
784 }
785 let mut checksum = 0u8;
786 for (index, digit) in digits.iter().rev().enumerate() {
787 checksum = D[checksum as usize][P[index % 8][*digit as usize] as usize];
788 }
789 checksum == 0
790}
791
792fn fr_nir_mod97_check(input: &str) -> bool {
793 let Some(digits) = collect_ascii_digits::<15>(input) else {
794 return false;
795 };
796 if !matches!(digits[0], 1 | 2 | 3 | 4 | 7 | 8) {
797 return false;
798 }
799 let month = digits[3] * 10 + digits[4];
800 if !(1..=12).contains(&month) && !(20..=42).contains(&month) && !(50..=99).contains(&month) {
801 return false;
802 }
803 let mut number = 0u32;
804 for digit in &digits[..13] {
805 number = (number * 10 + u32::from(*digit)) % 97;
806 }
807 let key = u32::from(digits[13]) * 10 + u32::from(digits[14]);
808 97 - number == key
809}
810
811fn de_steuer_id_mod1110_check(input: &str) -> bool {
812 let Some(digits) = collect_ascii_digits::<11>(input) else {
813 return false;
814 };
815 if !steuer_id_first_ten_digits_valid(&digits) {
816 return false;
817 }
818 let mut product = 10u8;
819 for digit in &digits[..10] {
820 let mut sum = (*digit + product) % 10;
821 if sum == 0 {
822 sum = 10;
823 }
824 product = (2 * sum) % 11;
825 }
826 let check = (11 - product) % 10;
827 check == digits[10]
828}
829
/// Structural check on the first ten digits of a Steuer-ID.
///
/// The leading digit must not be zero, exactly one digit value may occur more
/// than once, that value must occur two or three times, and one or two digit
/// values must be absent.
fn steuer_id_first_ten_digits_valid(digits: &[u8; 11]) -> bool {
    if digits[0] == 0 {
        return false;
    }
    let mut histogram = [0u8; 10];
    for digit in digits.iter().take(10) {
        histogram[*digit as usize] += 1;
    }

    let mut repeated = 0usize;
    let mut absent = 0usize;
    let mut double_or_triple = false;
    for occurrences in histogram {
        match occurrences {
            0 => absent += 1,
            1 => {}
            2 | 3 => {
                repeated += 1;
                double_or_triple = true;
            }
            _ => repeated += 1,
        }
    }
    repeated == 1 && double_or_triple && (absent == 1 || absent == 2)
}
843
844fn bsn_mod11_check(input: &str) -> bool {
845 let Some(digits) = collect_ascii_digits::<9>(input) else {
846 return false;
847 };
848 if !not_all_same(&digits) {
849 return false;
850 }
851 let sum: i32 = digits[..8]
852 .iter()
853 .enumerate()
854 .map(|(index, digit)| i32::from(*digit) * (9 - index as i32))
855 .sum::<i32>()
856 - i32::from(digits[8]);
857 sum.rem_euclid(11) == 0
858}
859
860fn cpf_mod11_check(input: &str) -> bool {
861 let Some(digits) = collect_ascii_digits::<11>(input) else {
862 return false;
863 };
864 if !not_all_same(&digits) {
865 return false;
866 }
867 mod11_check_digit(&digits[..9], 10) == digits[9]
868 && mod11_check_digit(&digits[..10], 11) == digits[10]
869}
870
871fn cnpj_mod11_check(input: &str) -> bool {
872 let Some(digits) = collect_ascii_digits::<14>(input) else {
873 return false;
874 };
875 if !not_all_same(&digits) {
876 return false;
877 }
878 const FIRST: [u8; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
879 const SECOND: [u8; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
880 weighted_mod11_check_digit(&digits[..12], &FIRST) == digits[12]
881 && weighted_mod11_check_digit(&digits[..13], &SECOND) == digits[13]
882}
883
884fn uk_nhs_mod11_check(input: &str) -> bool {
885 let Some(digits) = collect_ascii_digits::<10>(input) else {
886 return false;
887 };
888 if !not_all_same(&digits) {
889 return false;
890 }
891 let sum: u32 = digits[..9]
892 .iter()
893 .enumerate()
894 .map(|(index, digit)| u32::from(*digit) * (10 - index as u32))
895 .sum();
896 let check = 11 - (sum % 11);
897 let check = if check == 11 { 0 } else { check };
898 check != 10 && check == u32::from(digits[9])
899}
900
/// Computes a mod-11 check digit with descending weights.
///
/// The digits are weighted `start_weight`, `start_weight - 1`, ... down to 2;
/// digits beyond the available weights are ignored. A remainder below 2 gives
/// check digit 0, otherwise `11 - remainder`.
fn mod11_check_digit(digits: &[u8], start_weight: u8) -> u8 {
    let mut weight = u32::from(start_weight);
    let mut total = 0u32;
    for digit in digits {
        if weight < 2 {
            break;
        }
        total += u32::from(*digit) * weight;
        weight -= 1;
    }
    match total % 11 {
        0 | 1 => 0,
        remainder => (11 - remainder) as u8,
    }
}
915
/// Computes a mod-11 check digit with an explicit weight table.
///
/// Digits and weights are paired positionally; surplus entries on either side
/// are ignored. A remainder below 2 gives check digit 0, otherwise
/// `11 - remainder`.
fn weighted_mod11_check_digit(digits: &[u8], weights: &[u8]) -> u8 {
    let mut total = 0u32;
    for (digit, weight) in digits.iter().zip(weights.iter()) {
        total += u32::from(*digit) * u32::from(*weight);
    }
    match total % 11 {
        0 | 1 => 0,
        remainder => (11 - remainder) as u8,
    }
}
929
/// A single detection reported by a [`Detector`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
    /// Range of the detection within the scanned input
    /// (presumably byte offsets — confirm with the detectors).
    pub span: Range<usize>,
    /// Detected class.
    pub class: PiiClass,
    /// Free-form source label (presumably the detector or recognizer that
    /// produced the detection — confirm).
    pub source: String,
}
941
942impl Detection {
943 pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
945 Self {
946 span,
947 class,
948 source: source.into(),
949 }
950 }
951}
952
/// A secondary checker that inspects already-cleaned text for suspected
/// leaks.
///
/// Implementors must be `Send + Sync`.
pub trait SafetyNet: Send + Sync {
    /// Identifier of this safety net.
    fn id(&self) -> &str;

    /// Locales this safety net supports.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Checks `clean_text` using the supplied `context`.
    ///
    /// # Errors
    ///
    /// Returns a [`SafetyNetError`] when the check cannot be carried out.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
980
/// Borrowed context passed to [`SafetyNet::check`].
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
    /// Manifest of the spans emitted for the text being checked.
    pub manifest: &'a Manifest,
    /// Locale tags applicable to the text (presumably in preference order —
    /// confirm with the caller).
    pub locale_chain: &'a [LocaleTag],
    /// Kind of document the text belongs to.
    pub document_kind: DocumentKind,
    /// Optional session identifier.
    pub session_id: Option<&'a str>,
    /// Optional path of the field the text was taken from.
    pub field_path: Option<&'a str>,
}
998
999impl<'a> SafetyNetContext<'a> {
1000 pub fn new(
1002 manifest: &'a Manifest,
1003 locale_chain: &'a [LocaleTag],
1004 document_kind: DocumentKind,
1005 session_id: Option<&'a str>,
1006 field_path: Option<&'a str>,
1007 ) -> Self {
1008 Self {
1009 manifest,
1010 locale_chain,
1011 document_kind,
1012 session_id,
1013 field_path,
1014 }
1015 }
1016}
1017
/// A span emitted by the pipeline, located in both the clean and the raw
/// text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
    /// Range within the clean text; [`Manifest`] sorts and compares on it.
    pub clean_span: Range<usize>,
    /// Corresponding range within the raw text.
    pub raw_span: Range<usize>,
    /// Class assigned to the span.
    pub class: PiiClass,
}
1029
1030impl EmittedTokenSpan {
1031 pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
1033 Self {
1034 clean_span,
1035 raw_span,
1036 class,
1037 }
1038 }
1039}
1040
/// Collection of the spans emitted for one piece of clean text.
///
/// [`Manifest::diff_against`] expects `spans` to be ordered by
/// `clean_span.start`, which [`Manifest::from_spans`] guarantees.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
    /// Emitted spans, ordered by clean-text position when built through
    /// [`Manifest::from_spans`].
    pub spans: Vec<EmittedTokenSpan>,
}
1048
1049impl Manifest {
1050 pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
1052 spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
1053 Self { spans }
1054 }
1055
1056 pub fn diff_against(
1064 &self,
1065 suspect_span: &Range<usize>,
1066 suspect_class: &PiiClass,
1067 ) -> Option<LeakKind> {
1068 if suspect_span.is_empty() {
1069 return None;
1070 }
1071
1072 let start_idx = self
1073 .spans
1074 .partition_point(|span| span.clean_span.end <= suspect_span.start);
1075 let overlapping = self.spans[start_idx..]
1076 .iter()
1077 .take_while(|span| span.clean_span.start < suspect_span.end)
1078 .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
1079 .collect::<Vec<_>>();
1080
1081 if overlapping.is_empty() {
1082 return Some(LeakKind::Uncovered);
1083 }
1084
1085 let mut cursor = suspect_span.start;
1086 let mut first_mismatch = None::<&EmittedTokenSpan>;
1087 for span in overlapping {
1088 if span.clean_span.start > cursor {
1089 return Some(LeakKind::PartialBleed {
1090 uncovered: cursor..span.clean_span.start.min(suspect_span.end),
1091 });
1092 }
1093
1094 if span.clean_span.end > cursor {
1095 if first_mismatch.is_none() && &span.class != suspect_class {
1096 first_mismatch = Some(span);
1097 }
1098 cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
1099 if cursor >= suspect_span.end {
1100 break;
1101 }
1102 }
1103 }
1104
1105 if cursor < suspect_span.end {
1106 return Some(LeakKind::PartialBleed {
1107 uncovered: cursor..suspect_span.end,
1108 });
1109 }
1110
1111 first_mismatch.map(|span| LeakKind::ClassMismatch {
1112 pipeline_class: span.class.clone(),
1113 safety_net_class: suspect_class.clone(),
1114 })
1115 }
1116}
1117
/// Returns `true` when each range starts strictly before the other one ends.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    let left_is_before = left.start >= right.end;
    let right_is_before = right.start >= left.end;
    !(left_is_before || right_is_before)
}
1121
/// A suspected leak reported by a [`SafetyNet`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
    /// Range of the suspect (presumably within the clean text — confirm).
    pub span: Range<usize>,
    /// Class the safety net assigned to the suspect.
    pub class: PiiClass,
    /// Identifier of the reporting safety net.
    pub safety_net_id: String,
    /// Optional score attached by the safety net.
    pub score: Option<f32>,
    /// How the suspect relates to the manifest.
    pub kind: LeakKind,
    /// Raw label as reported by the safety net.
    pub raw_label: String,
    /// Optional path of the field the suspect was found in.
    pub field_path: Option<String>,
}
1141
1142impl LeakSuspect {
1143 pub fn new(
1145 span: Range<usize>,
1146 class: PiiClass,
1147 safety_net_id: impl Into<String>,
1148 score: Option<f32>,
1149 kind: LeakKind,
1150 raw_label: impl Into<String>,
1151 field_path: Option<String>,
1152 ) -> Self {
1153 Self {
1154 span,
1155 class,
1156 safety_net_id: safety_net_id.into(),
1157 score,
1158 kind,
1159 raw_label: raw_label.into(),
1160 field_path,
1161 }
1162 }
1163}
1164
/// How a suspect span relates to the manifest
/// (see [`Manifest::diff_against`]).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No manifest span overlaps the suspect.
    Uncovered,
    /// The suspect is only partly covered by manifest spans.
    PartialBleed {
        /// First sub-range of the suspect that is not covered.
        uncovered: Range<usize>,
    },
    /// The suspect is fully covered, but by a span of a different class.
    ClassMismatch {
        /// Class recorded in the manifest.
        pipeline_class: PiiClass,
        /// Class reported by the safety net.
        safety_net_class: PiiClass,
    },
}
1186
/// Telemetry events attached to a leak report.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// A safety net was skipped (presumably because none of its supported
    /// locales applied — confirm with the emitting code).
    LocaleSkipped {
        /// Identifier of the skipped safety net.
        safety_net_id: String,
        /// Kind of document being checked.
        document_kind: DocumentKind,
        /// Optional path of the field being checked.
        field_path: Option<String>,
    },
}
1201
/// Counters summarizing a leak report. All counters default to zero.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Number of suspects.
    pub suspect_count: usize,
    /// Number of suspects of kind [`LeakKind::Uncovered`].
    pub uncovered_count: usize,
    /// Number of suspects of kind [`LeakKind::PartialBleed`].
    pub partial_bleed_count: usize,
    /// Number of suspects of kind [`LeakKind::ClassMismatch`].
    pub class_mismatch_count: usize,
    /// Number of [`LeakReportTelemetry::LocaleSkipped`] events.
    pub locale_skipped_count: usize,
}
1217
/// Document-level extension record carrying artifact digests and audit data.
///
/// Build instances with [`DocumentExtension::builder`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
    /// Schema version of this record.
    pub schema_version: u16,
    /// 32-byte digest of the clean Markdown artifact.
    pub clean_md_sha256: [u8; 32],
    /// 32-byte digest of the layout JSON artifact.
    pub layout_json_sha256: [u8; 32],
    /// 32-byte digest of the report JSON artifact.
    pub report_json_sha256: [u8; 32],
    /// Optional 32-byte digest of the preview PNG; omitted from the
    /// serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub preview_png_sha256: Option<[u8; 32]>,
    /// Number of pages in the document.
    pub page_count: u32,
    /// Identifier of the audit session.
    pub audit_session_id: String,
    /// Emitted clean-text spans; omitted from the serialized form when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean_spans: Vec<EmittedTokenSpan>,
    /// Codec audit rows; omitted from the serialized form when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub codec_audit: Vec<CodecAuditRow>,
}
1249
1250impl DocumentExtension {
1251 pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
1253 DocumentExtensionBuilder {
1254 schema_version,
1255 clean_md_sha256: None,
1256 layout_json_sha256: None,
1257 report_json_sha256: None,
1258 preview_png_sha256: None,
1259 page_count: None,
1260 audit_session_id: None,
1261 clean_spans: Vec::new(),
1262 codec_audit: Vec::new(),
1263 }
1264 }
1265}
1266
/// Builder for [`DocumentExtension`], created by
/// [`DocumentExtension::builder`].
///
/// Fields stored as `Option` start unset; `build` reports the required ones
/// that are still missing.
#[derive(Debug, Clone)]
#[must_use]
pub struct DocumentExtensionBuilder {
    // Fixed at construction time.
    schema_version: u16,
    // Required by `build`.
    clean_md_sha256: Option<[u8; 32]>,
    // Required by `build`.
    layout_json_sha256: Option<[u8; 32]>,
    // Required by `build`.
    report_json_sha256: Option<[u8; 32]>,
    // Optional; passed through to the built value as-is.
    preview_png_sha256: Option<[u8; 32]>,
    // Required by `build`.
    page_count: Option<u32>,
    // Required by `build`.
    audit_session_id: Option<String>,
    // Optional lists; empty unless set.
    clean_spans: Vec<EmittedTokenSpan>,
    codec_audit: Vec<CodecAuditRow>,
}
1281
1282impl DocumentExtensionBuilder {
1283 pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
1284 self.clean_md_sha256 = Some(hash);
1285 self
1286 }
1287
1288 pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
1289 self.layout_json_sha256 = Some(hash);
1290 self
1291 }
1292
1293 pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
1294 self.report_json_sha256 = Some(hash);
1295 self
1296 }
1297
1298 pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
1299 self.preview_png_sha256 = Some(hash);
1300 self
1301 }
1302
1303 pub fn page_count(mut self, page_count: u32) -> Self {
1304 self.page_count = Some(page_count);
1305 self
1306 }
1307
1308 pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
1309 self.audit_session_id = Some(audit_session_id.into());
1310 self
1311 }
1312
1313 pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
1314 self.clean_spans = clean_spans;
1315 self
1316 }
1317
1318 pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
1319 self.codec_audit = codec_audit;
1320 self
1321 }
1322
1323 pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
1324 Ok(DocumentExtension {
1325 schema_version: self.schema_version,
1326 clean_md_sha256: self
1327 .clean_md_sha256
1328 .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
1329 layout_json_sha256: self
1330 .layout_json_sha256
1331 .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
1332 report_json_sha256: self
1333 .report_json_sha256
1334 .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
1335 preview_png_sha256: self.preview_png_sha256,
1336 page_count: self
1337 .page_count
1338 .ok_or(DocumentExtensionError::MissingField("page_count"))?,
1339 audit_session_id: self
1340 .audit_session_id
1341 .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
1342 clean_spans: self.clean_spans,
1343 codec_audit: self.codec_audit,
1344 })
1345 }
1346}
1347
/// Error returned by [`DocumentExtensionBuilder::build`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required field was never set; the payload is the field name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
1355
/// Where extracted text came from.
///
/// Serialized in `snake_case` (for example `embedded_text`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Text produced by OCR.
    Ocr,
    /// Text embedded in the source document.
    EmbeddedText,
    /// Text taken from a transcript.
    Transcript,
    /// Text from a mix of origins.
    Hybrid,
}
1370
/// Four independent boolean capability flags for a codec.
///
/// The derived `Default` yields every flag `false`. `CodecAuditRow` stores one
/// set for what was advertised and one for what was delivered.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecCapabilitySet {
    /// Text capability flag.
    pub text: bool,
    /// Layout capability flag.
    pub layout: bool,
    /// Confidence capability flag.
    pub confidence: bool,
    /// Timestamps capability flag.
    pub timestamps: bool,
}
1384
1385impl CodecCapabilitySet {
1386 pub const TEXT_ONLY: Self = Self {
1388 text: true,
1389 layout: false,
1390 confidence: false,
1391 timestamps: false,
1392 };
1393
1394 pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
1396 Self {
1397 text,
1398 layout,
1399 confidence,
1400 timestamps,
1401 }
1402 }
1403
1404 pub fn contains(self, requested: Self) -> bool {
1406 (!requested.text || self.text)
1407 && (!requested.layout || self.layout)
1408 && (!requested.confidence || self.confidence)
1409 && (!requested.timestamps || self.timestamps)
1410 }
1411}
1412
/// Extraction-density policy recorded on a `CodecAuditRow`.
///
/// Variant names are serialized in `snake_case`. The `Default` impl in this
/// file yields `Exempt` with the reason `"calibration_pending"`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
    /// A density requirement carrying an `f32` value.
    // NOTE(review): the unit and valid range of the value are not visible here — confirm.
    Required(f32),
    /// No requirement applies; `reason` records why.
    Exempt { reason: String },
}
1423
1424impl Default for ExtractionDensityPolicy {
1425 fn default() -> Self {
1426 Self::Exempt {
1427 reason: "calibration_pending".to_string(),
1428 }
1429 }
1430}
1431
/// One audit record describing a codec and what it advertised and delivered.
///
/// `options_hash_hex` and `engine_provenance` are left out of the serialized
/// form when `None` and default to `None` when absent on deserialization.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecAuditRow {
    /// Identifier of the codec.
    pub codec_id: String,
    /// Version string of the codec.
    pub codec_version: String,
    /// MIME type string recorded for this row.
    // NOTE(review): presumably the input MIME type the codec accepted — confirm.
    pub accepted_mime: String,
    /// Capability flags recorded as advertised.
    pub advertised: CodecCapabilitySet,
    /// Capability flags recorded as delivered.
    pub delivered: CodecCapabilitySet,
    /// Origin label of the text.
    pub text_origin: TextOrigin,
    /// Schema version of the codec output; `CodecAuditRow::new` sets it to `1`.
    pub codec_output_schema_version: u16,
    /// Optional hash string; omitted from serialized output when `None`.
    // NOTE(review): presumably a hex digest of the codec options — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub options_hash_hex: Option<String>,
    /// Optional engine provenance string; omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine_provenance: Option<String>,
    /// Extraction-density policy attached to this row.
    pub extraction_density_policy: ExtractionDensityPolicy,
}
1459
1460impl CodecAuditRow {
1461 pub fn new(
1463 codec_id: impl Into<String>,
1464 codec_version: impl Into<String>,
1465 accepted_mime: impl Into<String>,
1466 text_origin: TextOrigin,
1467 ) -> Self {
1468 Self {
1469 codec_id: codec_id.into(),
1470 codec_version: codec_version.into(),
1471 accepted_mime: accepted_mime.into(),
1472 advertised: CodecCapabilitySet::default(),
1473 delivered: CodecCapabilitySet::default(),
1474 text_origin,
1475 codec_output_schema_version: 1,
1476 options_hash_hex: None,
1477 engine_provenance: None,
1478 extraction_density_policy: ExtractionDensityPolicy::default(),
1479 }
1480 }
1481}
1482
/// Collected leak suspects together with telemetry events and summary counters.
///
/// `LeakReport::from_parts` derives `stats` from the two collections and leaves
/// `replay_hash` as `None`.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
    /// Suspected leaks.
    pub suspects: Vec<LeakSuspect>,
    /// Telemetry events recorded alongside the suspects.
    pub telemetry: Vec<LeakReportTelemetry>,
    /// Counters summarizing `suspects` and `telemetry`.
    pub stats: LeakReportStats,
    /// Optional replay hash; never populated by the constructors in this impl.
    // NOTE(review): who sets this value and what it covers is not visible here — confirm.
    pub replay_hash: Option<String>,
}
1503
1504impl LeakReport {
1505 pub fn from_parts(
1507 suspects: Vec<LeakSuspect>,
1508 telemetry: Vec<LeakReportTelemetry>,
1509 ) -> LeakReport {
1510 let mut stats = LeakReportStats {
1511 suspect_count: suspects.len(),
1512 locale_skipped_count: telemetry
1513 .iter()
1514 .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
1515 .count(),
1516 ..LeakReportStats::default()
1517 };
1518 for suspect in &suspects {
1519 match suspect.kind {
1520 LeakKind::Uncovered => stats.uncovered_count += 1,
1521 LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
1522 LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
1523 }
1524 }
1525 LeakReport {
1526 suspects,
1527 telemetry,
1528 stats,
1529 replay_hash: None,
1530 }
1531 }
1532
1533 pub fn extend(&mut self, other: LeakReport) {
1535 self.suspects.extend(other.suspects);
1536 self.telemetry.extend(other.telemetry);
1537 *self = LeakReport::from_parts(
1538 std::mem::take(&mut self.suspects),
1539 std::mem::take(&mut self.telemetry),
1540 );
1541 }
1542}
1543
/// Private-data label; `as_str` returns its snake_case name.
///
/// The derived ordering follows the declaration order of the variants.
// NOTE(review): judging by the type name these mirror an external OpenAI label
// taxonomy — confirm against the producer of these labels.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// Named `"private_person"`.
    PrivatePerson,
    /// Named `"private_address"`.
    PrivateAddress,
    /// Named `"private_email"`.
    PrivateEmail,
    /// Named `"private_phone"`.
    PrivatePhone,
    /// Named `"private_url"`.
    PrivateUrl,
    /// Named `"private_date"`.
    PrivateDate,
    /// Named `"account_number"`.
    AccountNumber,
    /// Named `"secret"`.
    Secret,
}
1565
1566impl OpenAiPrivateLabel {
1567 pub fn as_str(self) -> &'static str {
1569 match self {
1570 Self::PrivatePerson => "private_person",
1571 Self::PrivateAddress => "private_address",
1572 Self::PrivateEmail => "private_email",
1573 Self::PrivatePhone => "private_phone",
1574 Self::PrivateUrl => "private_url",
1575 Self::PrivateDate => "private_date",
1576 Self::AccountNumber => "account_number",
1577 Self::Secret => "secret",
1578 }
1579 }
1580}
1581
/// PII class vocabulary of the safety net; convertible to `PiiClass` through
/// `to_pii_class`.
///
/// `Email`, `Name` and `Location` map to the built-in `PiiClass` variants; the
/// remaining variants map to custom classes. The derived ordering follows the
/// declaration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Maps to `PiiClass::Email`.
    Email,
    /// Maps to `PiiClass::Name`.
    Name,
    /// Maps to `PiiClass::Location`.
    Location,
    /// Maps to the custom class `"phone"`.
    Phone,
    /// Maps to the custom class `"url"`.
    Url,
    /// Maps to the custom class `"date"`.
    Date,
    /// Maps to the custom class `"account_number"`.
    AccountNumber,
    /// Maps to the custom class `"secret"`.
    Secret,
}
1603
1604impl SafetyNetPiiClass {
1605 pub fn to_pii_class(self) -> PiiClass {
1607 match self {
1608 Self::Email => PiiClass::Email,
1609 Self::Name => PiiClass::Name,
1610 Self::Location => PiiClass::Location,
1611 Self::Phone => PiiClass::custom("phone"),
1612 Self::Url => PiiClass::custom("url"),
1613 Self::Date => PiiClass::custom("date"),
1614 Self::AccountNumber => PiiClass::custom("account_number"),
1615 Self::Secret => PiiClass::custom("secret"),
1616 }
1617 }
1618}
1619
/// Failure modes reported by the safety net.
///
/// Each variant's `Display` text is defined by its `#[error]` attribute.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// The safety net is unavailable; `reason` explains why.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Free-form explanation included in the message.
        reason: String,
    },
    /// The weights are missing; `path` is included in the message.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Path string reported in the message.
        path: String,
    },
    /// The model is unavailable; `reason` explains why.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Free-form explanation included in the message.
        reason: String,
    },
    /// The model failed an integrity check: `expected` and `actual` differ.
    #[error("safety net model integrity mismatch: expected={expected}, actual={actual}")]
    ModelIntegrityMismatch {
        /// Expected integrity value.
        expected: String,
        /// Integrity value actually observed.
        actual: String,
    },
    /// The input exceeded the size limit.
    // NOTE(review): the unit of `limit`/`actual` (bytes, chars, tokens) is not visible here — confirm.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Maximum accepted size.
        limit: usize,
        /// Size of the rejected input.
        actual: usize,
    },
    /// A runtime failure occurred; `message` carries the detail.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Detail text included in the message.
        message: String,
    },
    /// The output was invalid; `message` carries the detail.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// Detail text included in the message.
        message: String,
    },
}
1671
/// Action recorded for a redaction decision (see `RedactionEntry::action`).
///
/// `redaction_action_as_str` maps each variant to the name used when a
/// `RedactionEntry` is serialized.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Serialized as `"tokenize"`.
    Tokenize,
    /// Serialized as `"redact"`.
    Redact,
    /// Serialized as `"format_preserve"`.
    FormatPreserve,
    /// Serialized as `"generalize"`.
    Generalize,
    /// Serialized as `"preserve"`.
    Preserve,
}
1699
/// Tier that decided a redaction outcome (see `RedactionEntry::decided_by`).
///
/// `redaction_conflict_tier_as_str` maps each variant to the snake_case name
/// used when a `RedactionEntry` is serialized.
// NOTE(review): the precise meaning and precedence of the tiers is defined by
// the conflict resolver, which is outside this view — confirm before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// Serialized as `"none"`.
    None,
    /// Serialized as `"class_priority"`.
    ClassPriority,
    /// Serialized as `"rule_priority"`.
    RulePriority,
    /// Serialized as `"score"`.
    Score,
    /// Serialized as `"span_length"`.
    SpanLength,
    /// Serialized as `"validator"`.
    Validator,
    /// Serialized as `"validator_veto"`.
    ValidatorVeto,
    /// Serialized as `"collision_policy"`.
    CollisionPolicy,
    /// Serialized as `"anchored_context"`.
    AnchoredContext,
    /// Serialized as `"recognizer_id"`.
    RecognizerId,
    /// Serialized as `"merged"`.
    Merged,
    /// Serialized as `"redact"`.
    Redact,
    /// Serialized as `"resolve"`.
    Resolve,
    /// Serialized as `"fallback"`.
    Fallback,
}
1733
/// Reason attached to `RedactionEntry::fallback_triggered`.
///
/// Serialized by serde under the variant names as written (no renaming
/// attribute is applied).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum FallbackReason {
    // NOTE(review): presumably raised when overlapping candidates conflict — confirm.
    /// Overlap-conflict reason.
    OverlapConflict,
    // NOTE(review): presumably raised when a validator vetoes a match — confirm.
    /// Validator-veto reason.
    ValidatorVeto,
    // NOTE(review): presumably raised when a mandatory anchor is absent — confirm.
    /// Anchor-missing reason.
    AnchorMissing,
    // NOTE(review): presumably raised for residual suspects — confirm.
    /// Residual-suspect reason.
    ResidualSuspect,
}
1747
/// Kind of document a redaction entry refers to.
///
/// `redaction_document_kind_as_str` maps the variants to `"structured"` and
/// `"text"` for serialization.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Serialized as `"structured"`.
    Structured,
    /// Serialized as `"text"`.
    Text,
}
1757
/// One record of a redaction decision, as handed to a `RedactionLogger`.
///
/// Serialization is hand-written (see the `Serialize` impl): the recognizer
/// lineage, every `provenance_*` field and `backend_silently_dropped` are left
/// out when `None`; all other fields are always emitted.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Source label of the entry.
    pub source: String,
    /// Recognizer identifier; omitted from serialized output when `None`.
    pub recognizer_id: Option<String>,
    /// Versioned recognizer identifier; omitted from serialized output when `None`.
    pub recognizer_version_id: Option<String>,
    /// PII class; serialized through `PiiClass::to_canonical_str`.
    pub class: PiiClass,
    /// Action taken.
    pub action: Action,
    /// Field name, if any; serialized as `null` when `None`.
    pub field_name: Option<String>,
    /// Kind of document the entry belongs to.
    pub document_kind: DocumentKind,
    /// Whether this entry is flagged as a conflict loser.
    pub conflict_loser: bool,
    /// Tier that decided the outcome.
    pub decided_by: ConflictTier,
    /// Creation timestamp.
    // NOTE(review): the unit/epoch of this integer is not visible here — confirm.
    pub created_at: i64,
    /// Session identifier, if any; serialized as `null` when `None`.
    pub session_id: Option<String>,
    /// Validator failure reason, if any; serialized as `null` when `None`.
    pub validator_fail_reason: Option<ValidatorFailReason>,
    /// Ambiguity record, if any; serialized as `null` when `None`.
    pub ambiguity_record: Option<AmbiguityRecord>,
    /// Collision family, if any; serialized as `null` when `None`.
    pub collision_family: Option<String>,
    /// Collision variant, if any; serialized as `null` when `None`.
    pub collision_variant: Option<String>,
    /// Fallback reason, if any; serialized as `null` when `None`.
    pub fallback_triggered: Option<FallbackReason>,
    /// Provenance: stage. Omitted from serialized output when `None`
    /// (this applies to every `provenance_*` field below).
    pub provenance_stage: Option<String>,
    /// Provenance: model identifier.
    pub provenance_model_id: Option<String>,
    /// Provenance: model version.
    pub provenance_model_version: Option<String>,
    /// Provenance: artifact SHA-256 string.
    pub provenance_artifact_sha256: Option<String>,
    /// Provenance: tokenizer SHA-256 string.
    pub provenance_tokenizer_sha256: Option<String>,
    /// Provenance: resolved locale.
    pub provenance_locale_resolved: Option<String>,
    /// Provenance: locale match kind.
    pub provenance_locale_match_kind: Option<String>,
    /// Provenance: canonical class.
    pub provenance_canonical_class: Option<String>,
    /// Provenance: native class.
    pub provenance_native_class: Option<String>,
    /// Provenance: confidence, stored as text (`with_provenance_metadata`
    /// fills it with `f64::to_string`).
    pub provenance_confidence: Option<String>,
    /// Provenance: merged-from description.
    pub provenance_merged_from: Option<String>,
    /// Items a backend dropped silently; omitted from serialized output when `None`.
    // NOTE(review): what the strings identify (field names, columns, ...) is not visible here — confirm.
    pub backend_silently_dropped: Option<Vec<String>>,
}
1817
1818impl Serialize for RedactionEntry {
1819 fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
1820 where
1821 S: serde::Serializer,
1822 {
1823 use serde::ser::SerializeStruct;
1824
1825 let mut len = 14;
1826 if self.recognizer_id.is_some() {
1827 len += 1;
1828 }
1829 if self.recognizer_version_id.is_some() {
1830 len += 1;
1831 }
1832 len += [
1833 self.provenance_stage.as_ref(),
1834 self.provenance_model_id.as_ref(),
1835 self.provenance_model_version.as_ref(),
1836 self.provenance_artifact_sha256.as_ref(),
1837 self.provenance_tokenizer_sha256.as_ref(),
1838 self.provenance_locale_resolved.as_ref(),
1839 self.provenance_locale_match_kind.as_ref(),
1840 self.provenance_canonical_class.as_ref(),
1841 self.provenance_native_class.as_ref(),
1842 self.provenance_confidence.as_ref(),
1843 self.provenance_merged_from.as_ref(),
1844 ]
1845 .into_iter()
1846 .filter(|value| value.is_some())
1847 .count();
1848 if self.backend_silently_dropped.is_some() {
1849 len += 1;
1850 }
1851 let mut state = serializer.serialize_struct("RedactionEntry", len)?;
1852 state.serialize_field("source", &self.source)?;
1853 if let Some(recognizer_id) = &self.recognizer_id {
1854 state.serialize_field("recognizer_id", recognizer_id)?;
1855 }
1856 if let Some(recognizer_version_id) = &self.recognizer_version_id {
1857 state.serialize_field("recognizer_version_id", recognizer_version_id)?;
1858 }
1859 state.serialize_field("class", &self.class.to_canonical_str())?;
1860 state.serialize_field("action", redaction_action_as_str(self.action))?;
1861 state.serialize_field("field_name", &self.field_name)?;
1862 state.serialize_field(
1863 "document_kind",
1864 redaction_document_kind_as_str(self.document_kind),
1865 )?;
1866 state.serialize_field("conflict_loser", &self.conflict_loser)?;
1867 state.serialize_field(
1868 "decided_by",
1869 redaction_conflict_tier_as_str(self.decided_by),
1870 )?;
1871 state.serialize_field("created_at", &self.created_at)?;
1872 state.serialize_field("session_id", &self.session_id)?;
1873 state.serialize_field("validator_fail_reason", &self.validator_fail_reason)?;
1874 state.serialize_field("ambiguity_record", &self.ambiguity_record)?;
1875 state.serialize_field("collision_family", &self.collision_family)?;
1876 state.serialize_field("collision_variant", &self.collision_variant)?;
1877 state.serialize_field("fallback_triggered", &self.fallback_triggered)?;
1878 if let Some(value) = &self.provenance_stage {
1879 state.serialize_field("provenance_stage", value)?;
1880 }
1881 if let Some(value) = &self.provenance_model_id {
1882 state.serialize_field("provenance_model_id", value)?;
1883 }
1884 if let Some(value) = &self.provenance_model_version {
1885 state.serialize_field("provenance_model_version", value)?;
1886 }
1887 if let Some(value) = &self.provenance_artifact_sha256 {
1888 state.serialize_field("provenance_artifact_sha256", value)?;
1889 }
1890 if let Some(value) = &self.provenance_tokenizer_sha256 {
1891 state.serialize_field("provenance_tokenizer_sha256", value)?;
1892 }
1893 if let Some(value) = &self.provenance_locale_resolved {
1894 state.serialize_field("provenance_locale_resolved", value)?;
1895 }
1896 if let Some(value) = &self.provenance_locale_match_kind {
1897 state.serialize_field("provenance_locale_match_kind", value)?;
1898 }
1899 if let Some(value) = &self.provenance_canonical_class {
1900 state.serialize_field("provenance_canonical_class", value)?;
1901 }
1902 if let Some(value) = &self.provenance_native_class {
1903 state.serialize_field("provenance_native_class", value)?;
1904 }
1905 if let Some(value) = &self.provenance_confidence {
1906 state.serialize_field("provenance_confidence", value)?;
1907 }
1908 if let Some(value) = &self.provenance_merged_from {
1909 state.serialize_field("provenance_merged_from", value)?;
1910 }
1911 if let Some(dropped) = &self.backend_silently_dropped {
1912 state.serialize_field("backend_silently_dropped", dropped)?;
1913 }
1914 state.end()
1915 }
1916}
1917
1918fn redaction_action_as_str(action: Action) -> &'static str {
1919 match action {
1920 Action::Tokenize => "tokenize",
1921 Action::Redact => "redact",
1922 Action::FormatPreserve => "format_preserve",
1923 Action::Generalize => "generalize",
1924 Action::Preserve => "preserve",
1925 }
1926}
1927
1928fn redaction_document_kind_as_str(kind: DocumentKind) -> &'static str {
1929 match kind {
1930 DocumentKind::Structured => "structured",
1931 DocumentKind::Text => "text",
1932 }
1933}
1934
1935fn redaction_conflict_tier_as_str(tier: ConflictTier) -> &'static str {
1936 match tier {
1937 ConflictTier::None => "none",
1938 ConflictTier::ClassPriority => "class_priority",
1939 ConflictTier::RulePriority => "rule_priority",
1940 ConflictTier::Score => "score",
1941 ConflictTier::SpanLength => "span_length",
1942 ConflictTier::Validator => "validator",
1943 ConflictTier::ValidatorVeto => "validator_veto",
1944 ConflictTier::CollisionPolicy => "collision_policy",
1945 ConflictTier::AnchoredContext => "anchored_context",
1946 ConflictTier::RecognizerId => "recognizer_id",
1947 ConflictTier::Merged => "merged",
1948 ConflictTier::Redact => "redact",
1949 ConflictTier::Resolve => "resolve",
1950 ConflictTier::Fallback => "fallback",
1951 }
1952}
1953
1954impl RedactionEntry {
1955 #[allow(clippy::too_many_arguments)]
1957 pub fn new(
1958 source: impl Into<String>,
1959 class: PiiClass,
1960 action: Action,
1961 field_name: Option<String>,
1962 document_kind: DocumentKind,
1963 conflict_loser: bool,
1964 decided_by: ConflictTier,
1965 created_at: i64,
1966 session_id: Option<String>,
1967 ) -> Self {
1968 Self {
1969 source: source.into(),
1970 class,
1971 action,
1972 field_name,
1973 document_kind,
1974 conflict_loser,
1975 decided_by,
1976 created_at,
1977 session_id,
1978 recognizer_id: None,
1979 recognizer_version_id: None,
1980 validator_fail_reason: None,
1981 ambiguity_record: None,
1982 collision_family: None,
1983 collision_variant: None,
1984 fallback_triggered: None,
1985 provenance_stage: None,
1986 provenance_model_id: None,
1987 provenance_model_version: None,
1988 provenance_artifact_sha256: None,
1989 provenance_tokenizer_sha256: None,
1990 provenance_locale_resolved: None,
1991 provenance_locale_match_kind: None,
1992 provenance_canonical_class: None,
1993 provenance_native_class: None,
1994 provenance_confidence: None,
1995 provenance_merged_from: None,
1996 backend_silently_dropped: None,
1997 }
1998 }
1999
2000 pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
2002 self.validator_fail_reason = Some(reason);
2003 self
2004 }
2005
2006 pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
2008 self.ambiguity_record = Some(record);
2009 self
2010 }
2011
2012 pub fn with_collision_metadata(
2014 mut self,
2015 family: Option<String>,
2016 variant: Option<String>,
2017 ) -> Self {
2018 self.collision_family = family;
2019 self.collision_variant = variant;
2020 self
2021 }
2022
2023 pub fn with_fallback_triggered(mut self, reason: FallbackReason) -> Self {
2025 self.fallback_triggered = Some(reason);
2026 self
2027 }
2028
2029 pub fn with_backend_silently_dropped(mut self, dropped: Vec<String>) -> Self {
2031 self.backend_silently_dropped = Some(dropped);
2032 self
2033 }
2034
2035 pub fn with_recognizer_metadata(
2037 mut self,
2038 recognizer_id: Option<String>,
2039 recognizer_version_id: Option<String>,
2040 ) -> Self {
2041 self.recognizer_id = recognizer_id;
2042 self.recognizer_version_id = recognizer_version_id;
2043 self
2044 }
2045
2046 #[allow(clippy::too_many_arguments)]
2047 pub fn with_provenance_metadata(
2048 mut self,
2049 stage: Option<String>,
2050 model_id: Option<String>,
2051 model_version: Option<String>,
2052 artifact_sha256: Option<String>,
2053 tokenizer_sha256: Option<String>,
2054 locale_resolved: Option<String>,
2055 locale_match_kind: Option<String>,
2056 canonical_class: Option<String>,
2057 native_class: Option<String>,
2058 confidence: Option<f64>,
2059 merged_from: Option<String>,
2060 ) -> Self {
2061 self.provenance_stage = stage;
2062 self.provenance_model_id = model_id;
2063 self.provenance_model_version = model_version;
2064 self.provenance_artifact_sha256 = artifact_sha256;
2065 self.provenance_tokenizer_sha256 = tokenizer_sha256;
2066 self.provenance_locale_resolved = locale_resolved;
2067 self.provenance_locale_match_kind = locale_match_kind;
2068 self.provenance_canonical_class = canonical_class;
2069 self.provenance_native_class = native_class;
2070 self.provenance_confidence = confidence.map(|value| value.to_string());
2071 self.provenance_merged_from = merged_from;
2072 self
2073 }
2074}
2075
/// Error returned by a `RedactionLogger` when an entry cannot be recorded.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// Failure reported with the `sqlite redaction log error: ` prefix; the
    /// payload is the detail text.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Failure reported with the `backend redaction log error: ` prefix; the
    /// payload is the detail text.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
2087
/// Sink for redaction entries.
///
/// Implementations must be `Send + Sync`; `log` takes `&self`, so any mutable
/// state has to be managed by the implementation itself.
pub trait RedactionLogger: Send + Sync {
    /// Records one entry, returning a `RedactionLogError` on failure.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
2121
/// Safety tier; `SafeDefault` is the `Default` value.
///
/// `SafetyTier::parse` and `SafetyTier::as_str` convert to and from the
/// snake_case names `safe_default`, `locale_gated` and `opt_in`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum SafetyTier {
    /// Named `"safe_default"`; the default tier.
    #[default]
    SafeDefault,
    /// Named `"locale_gated"`.
    // NOTE(review): presumably active only when a matching locale is configured — confirm.
    LocaleGated,
    /// Named `"opt_in"`.
    // NOTE(review): presumably active only when explicitly enabled — confirm.
    OptIn,
}
2134
/// Error returned by `SafetyTier::parse` for an unrecognized tier name.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct SafetyTierParseError {
    // The rejected input, kept verbatim.
    value: String,
}
2141
2142impl SafetyTier {
2143 pub fn parse(value: &str) -> Result<Self, SafetyTierParseError> {
2145 match value {
2146 "safe_default" => Ok(Self::SafeDefault),
2147 "locale_gated" => Ok(Self::LocaleGated),
2148 "opt_in" => Ok(Self::OptIn),
2149 other => Err(SafetyTierParseError {
2150 value: other.to_string(),
2151 }),
2152 }
2153 }
2154
2155 pub fn as_str(self) -> &'static str {
2157 match self {
2158 Self::SafeDefault => "safe_default",
2159 Self::LocaleGated => "locale_gated",
2160 Self::OptIn => "opt_in",
2161 }
2162 }
2163}
2164
2165impl SafetyTierParseError {
2166 pub fn value(&self) -> &str {
2168 &self.value
2169 }
2170}
2171
2172impl fmt::Display for SafetyTierParseError {
2173 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2174 write!(f, "unsupported safety_tier '{}'", self.value)
2175 }
2176}
2177
// Marker impl: `Debug` and `Display` above provide everything `Error` needs;
// no source error is exposed.
impl std::error::Error for SafetyTierParseError {}
2179
/// Locale identifier: a fixed set of known tags plus a free-form escape hatch.
///
/// `LocaleTag::parse` builds a tag from text and `LocaleTag::as_str` returns
/// its canonical spelling.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// The global pseudo-locale, spelled `"global"` (`"*"` is also parsed to it).
    Global,
    /// `de-DE`.
    DeDe,
    /// `de-AT`.
    DeAt,
    /// `de-CH`.
    DeCh,
    /// `en-US`.
    EnUs,
    /// `en-GB`.
    EnGb,
    /// `en-IE`.
    EnIe,
    /// `en-AU`.
    EnAu,
    /// `en-CA`.
    EnCa,
    /// Any other tag accepted by `LocaleTag::parse`; holds the string produced
    /// by `canonical_other`.
    Other(String),
}
2205
/// Error returned when locale text cannot be turned into a `LocaleTag`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input was empty or not an accepted locale; displayed as
    /// `unsupported locale`.
    Unsupported,
}
2213
2214impl fmt::Display for LocaleError {
2215 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2216 match self {
2217 LocaleError::Unsupported => f.write_str("unsupported locale"),
2218 }
2219 }
2220}
2221
// Marker impl: `Debug` and `Display` above provide everything `Error` needs;
// no source error is exposed.
impl std::error::Error for LocaleError {}
2223
/// Ordered list of active locale tags.
///
/// The inner vector is private; every constructor in this file passes the tags
/// through `ensure_global` before wrapping them.
// NOTE(review): `ensure_global` is defined outside this view; presumably it
// guarantees that `LocaleTag::Global` is part of the chain — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
2227
2228impl LocaleTag {
2229 pub const GLOBAL: LocaleTag = LocaleTag::Global;
2231
2232 pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
2234 let raw = s.trim().replace('_', "-");
2235 let normalized = raw.to_ascii_lowercase();
2236 match normalized.as_str() {
2237 "global" | "*" => Ok(LocaleTag::Global),
2238 "de-de" => Ok(LocaleTag::DeDe),
2239 "de-at" => Ok(LocaleTag::DeAt),
2240 "de-ch" => Ok(LocaleTag::DeCh),
2241 "en-us" => Ok(LocaleTag::EnUs),
2242 "en-gb" => Ok(LocaleTag::EnGb),
2243 "en-ie" => Ok(LocaleTag::EnIe),
2244 "en-au" => Ok(LocaleTag::EnAu),
2245 "en-ca" => Ok(LocaleTag::EnCa),
2246 "" => Err(LocaleError::Unsupported),
2247 _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
2248 _ => Err(LocaleError::Unsupported),
2249 }
2250 }
2251
2252 pub fn as_str(&self) -> &str {
2254 match self {
2255 LocaleTag::Global => "global",
2256 LocaleTag::DeDe => "de-DE",
2257 LocaleTag::DeAt => "de-AT",
2258 LocaleTag::DeCh => "de-CH",
2259 LocaleTag::EnUs => "en-US",
2260 LocaleTag::EnGb => "en-GB",
2261 LocaleTag::EnIe => "en-IE",
2262 LocaleTag::EnAu => "en-AU",
2263 LocaleTag::EnCa => "en-CA",
2264 LocaleTag::Other(tag) => tag.as_str(),
2265 }
2266 }
2267}
2268
2269impl LocaleChain {
2270 pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
2272 ensure_global(&mut tags);
2273 LocaleChain(tags)
2274 }
2275
2276 pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
2278 let tags = raw
2279 .split(',')
2280 .map(LocaleTag::parse)
2281 .collect::<Result<Vec<_>, _>>()?;
2282 Ok(LocaleChain::from_tags(tags))
2283 }
2284
2285 pub fn merge_policy_and_cli(
2287 policy: Option<&[LocaleTag]>,
2288 cli: Option<&[LocaleTag]>,
2289 ) -> LocaleChain {
2290 Self::merge_cli_policy_rulepack_default(cli, policy, None)
2291 }
2292
2293 pub fn merge_cli_policy_rulepack_default(
2295 cli: Option<&[LocaleTag]>,
2296 policy: Option<&[LocaleTag]>,
2297 rulepack_defaults: Option<&[LocaleTag]>,
2298 ) -> LocaleChain {
2299 let tags = cli
2300 .filter(|tags| !tags.is_empty())
2301 .or_else(|| policy.filter(|tags| !tags.is_empty()))
2302 .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
2303 .map(|tags| tags.to_vec())
2304 .unwrap_or_else(|| vec![LocaleTag::Global]);
2305 LocaleChain::from_tags(tags)
2306 }
2307
2308 pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
2310 if recognizer_locales.is_empty() {
2311 return true;
2312 }
2313 recognizer_locales.iter().any(|recognizer_locale| {
2314 *recognizer_locale == LocaleTag::Global
2315 || self.0.iter().any(|active| active == recognizer_locale)
2316 })
2317 }
2318
2319 pub fn as_slice(&self) -> &[LocaleTag] {
2321 &self.0
2322 }
2323
2324 pub fn to_strings(&self) -> Vec<String> {
2326 self.0.iter().map(ToString::to_string).collect()
2327 }
2328}
2329
2330impl From<&[LocaleTag]> for LocaleChain {
2331 fn from(tags: &[LocaleTag]) -> Self {
2332 let mut owned = tags.to_vec();
2333 ensure_global(&mut owned);
2334 LocaleChain(owned)
2335 }
2336}
2337
2338impl fmt::Display for LocaleTag {
2339 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2340 f.write_str(self.as_str())
2341 }
2342}
2343
/// A document in one of two shapes: a keyed map of `Value`s or plain text.
// NOTE(review): judging by the name this is the input side (before cleaning) — confirm.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Structured document: field name to value, ordered by key.
    Structured(BTreeMap<String, Value>),
    /// Unstructured text document.
    Text(String),
}
2361
/// A document in one of two shapes: a keyed map of `Value`s or plain text.
///
/// Serialized untagged, so the output is either the bare map or the bare
/// string with no variant wrapper.
// NOTE(review): judging by the name this is the output side (after cleaning) — confirm.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Structured document: field name to value, ordered by key.
    Structured(BTreeMap<String, Value>),
    /// Unstructured text document.
    Text(String),
}
2389
/// Value stored in a structured document.
///
/// Serialized untagged, so each variant is emitted as its payload without a
/// variant wrapper.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Absence of a value.
    Null,
    /// Boolean value.
    Bool(bool),
    /// String value.
    String(String),
    /// Signed 64-bit integer value.
    I64(i64),
    /// Ordered list of values.
    Array(Vec<Value>),
    /// Nested map from key to value, ordered by key.
    Object(BTreeMap<String, Value>),
}
2408
2409impl Value {
2410 pub fn as_str(&self) -> Option<&str> {
2412 match self {
2413 Self::String(value) => Some(value.as_str()),
2414 Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
2415 }
2416 }
2417
2418 pub fn scalar_to_safety_net_string(&self) -> Option<String> {
2420 match self {
2421 Self::String(value) if !value.is_empty() => Some(value.clone()),
2422 Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
2423 Self::Bool(value) => Some(value.to_string()),
2424 Self::I64(value) => Some(value.to_string()),
2425 }
2426 }
2427}
2428
2429impl PartialEq<&str> for Value {
2430 fn eq(&self, other: &&str) -> bool {
2431 self.as_str() == Some(*other)
2432 }
2433}
2434
/// Named collection of dictionaries.
///
/// The derived `Default` is an empty bundle.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    // Dictionary name -> entry; names are unique keys.
    entries: HashMap<String, DictionaryEntry>,
}
2440
/// One dictionary: its terms, case-sensitivity flag and origin.
///
/// Values are created through `DictionaryEntry::new`, which rejects empty term
/// lists and case-insensitive dictionaries containing non-ASCII terms.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Never empty for entries built by `new`.
    terms: Vec<String>,
    // `false` is only accepted by `new` when every term is ASCII.
    case_sensitive: bool,
    // Where the dictionary came from.
    source: DictionarySource,
}
2448
/// Origin of a dictionary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Dictionary supplied via the CLI.
    Cli,
    /// Dictionary supplied by a rulepack (used by
    /// `DictionaryBundle::from_rulepack_terms`).
    Rulepack,
}
2458
/// Summary of one dictionary, as reported by `DictionaryBundle::stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of terms in the dictionary.
    pub term_count: usize,
    /// Origin of the dictionary.
    pub source: DictionarySource,
}
2470
2471impl DictionaryStats {
2472 pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
2474 Self {
2475 name: name.into(),
2476 term_count,
2477 source,
2478 }
2479 }
2480}
2481
/// Dictionary definition as supplied by a rulepack; input to
/// `DictionaryBundle::from_rulepack_terms`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name; used as the key in the bundle.
    pub name: String,
    /// Terms of the dictionary.
    pub terms: Vec<String>,
    /// Whether terms are matched case-sensitively.
    pub case_sensitive: bool,
}
2493
2494impl RulepackDict {
2495 pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
2497 Self {
2498 name: name.into(),
2499 terms,
2500 case_sensitive,
2501 }
2502 }
2503}
2504
/// Reasons `DictionaryEntry::new` refuses a dictionary.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary named `name` has no terms.
    Empty { name: String },
    /// The dictionary named `name` is case-insensitive but contains a
    /// non-ASCII term.
    UnicodeInsensitiveUnsupported { name: String },
}
2514
2515impl fmt::Display for DictionaryLoadError {
2516 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2517 match self {
2518 Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
2519 Self::UnicodeInsensitiveUnsupported { name } => write!(
2520 f,
2521 "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
2522 ),
2523 }
2524 }
2525}
2526
// Marker impl: `Debug` and `Display` above provide everything `Error` needs;
// no source error is exposed.
impl std::error::Error for DictionaryLoadError {}
2528
2529impl DictionaryBundle {
2530 pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
2532 let mut entries = HashMap::with_capacity(terms.len());
2533 for dictionary in terms {
2534 let entry = DictionaryEntry::new(
2535 &dictionary.name,
2536 dictionary.terms.clone(),
2537 dictionary.case_sensitive,
2538 DictionarySource::Rulepack,
2539 )
2540 .expect("Policy validates dictionary terms before bundle construction");
2541 entries.insert(dictionary.name.clone(), entry);
2542 }
2543 Self { entries }
2544 }
2545
2546 pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
2548 Self {
2549 entries: entries.into_iter().collect(),
2550 }
2551 }
2552
2553 pub fn merge(a: Self, b: Self) -> Self {
2555 let mut entries = a.entries;
2556 entries.extend(b.entries);
2557 Self { entries }
2558 }
2559
2560 pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
2562 self.entries.get(name)
2563 }
2564
2565 pub fn stats(&self) -> Vec<DictionaryStats> {
2567 let mut stats = self
2568 .entries
2569 .iter()
2570 .map(|(name, entry)| DictionaryStats {
2571 name: name.clone(),
2572 term_count: entry.terms.len(),
2573 source: entry.source,
2574 })
2575 .collect::<Vec<_>>();
2576 stats.sort_by(|a, b| a.name.cmp(&b.name));
2577 stats
2578 }
2579}
2580
2581impl DictionaryEntry {
2582 pub fn new(
2584 name: &str,
2585 terms: Vec<String>,
2586 case_sensitive: bool,
2587 source: DictionarySource,
2588 ) -> Result<Self, DictionaryLoadError> {
2589 if terms.is_empty() {
2590 return Err(DictionaryLoadError::Empty {
2591 name: name.to_string(),
2592 });
2593 }
2594 if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
2595 return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
2596 name: name.to_string(),
2597 });
2598 }
2599 Ok(Self {
2600 terms,
2601 case_sensitive,
2602 source,
2603 })
2604 }
2605
2606 pub fn case_sensitive(&self) -> bool {
2608 self.case_sensitive
2609 }
2610
2611 pub fn terms(&self) -> &[String] {
2613 &self.terms
2614 }
2615}
2616
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    /// A dictionary without terms is rejected and the error names it.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let outcome = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli);

        let err = outcome.expect_err("empty dictionaries must fail closed");
        assert_eq!(
            err,
            DictionaryLoadError::Empty {
                name: "empty".to_string()
            }
        );
    }

    /// A case-insensitive dictionary containing a non-ASCII term is rejected
    /// and the error names it.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms: Vec<String> = ["Beyonce", "Caf\u{00e9}"]
            .iter()
            .map(|term| term.to_string())
            .collect();

        let outcome = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli);

        let err = outcome.expect_err("unicode case-insensitive dictionaries must fail closed");
        assert_eq!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported {
                name: "songs".to_string()
            }
        );
    }
}
2645
2646#[cfg(test)]
2647mod redaction_logger_tests {
2648 use super::*;
2649
2650 struct CapturingLogger;
2651
2652 impl RedactionLogger for CapturingLogger {
2653 fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
2654 Ok(())
2655 }
2656 }
2657
2658 fn assert_send_sync<T: Send + Sync + ?Sized>() {}
2659
2660 #[test]
2661 fn redaction_log_error_display_is_stable() {
2662 assert_eq!(
2663 RedactionLogError::Sqlite("write failed".to_string()).to_string(),
2664 "sqlite redaction log error: write failed"
2665 );
2666 assert_eq!(
2667 RedactionLogError::Backend("sink failed".to_string()).to_string(),
2668 "backend redaction log error: sink failed"
2669 );
2670 }
2671
2672 #[test]
2673 fn redaction_logger_trait_object_is_send_sync() {
2674 assert_send_sync::<dyn RedactionLogger>();
2675 }
2676
2677 #[test]
2678 fn local_logger_can_implement_redaction_logger() {
2679 let logger = CapturingLogger;
2680 let entry = RedactionEntry {
2681 source: "unit-test".to_string(),
2682 recognizer_id: None,
2683 recognizer_version_id: None,
2684 class: PiiClass::Email,
2685 action: Action::Tokenize,
2686 field_name: None,
2687 document_kind: DocumentKind::Text,
2688 conflict_loser: false,
2689 decided_by: ConflictTier::None,
2690 created_at: 0,
2691 session_id: None,
2692 validator_fail_reason: None,
2693 ambiguity_record: None,
2694 collision_family: None,
2695 collision_variant: None,
2696 fallback_triggered: None,
2697 provenance_stage: None,
2698 provenance_model_id: None,
2699 provenance_model_version: None,
2700 provenance_artifact_sha256: None,
2701 provenance_tokenizer_sha256: None,
2702 provenance_locale_resolved: None,
2703 provenance_locale_match_kind: None,
2704 provenance_canonical_class: None,
2705 provenance_native_class: None,
2706 provenance_confidence: None,
2707 provenance_merged_from: None,
2708 backend_silently_dropped: None,
2709 };
2710
2711 let trait_object: &dyn RedactionLogger = &logger;
2712 trait_object.log(&entry).expect("log entry");
2713 }
2714
2715 #[test]
2716 fn redaction_entry_json_shape_omits_absent_recognizer_lineage() {
2717 let entry = RedactionEntry::new(
2718 "email.global",
2719 PiiClass::Email,
2720 Action::Tokenize,
2721 None,
2722 DocumentKind::Text,
2723 false,
2724 ConflictTier::None,
2725 0,
2726 None,
2727 );
2728
2729 let rendered = serde_json::to_string(&entry).expect("serialize redaction entry");
2730
2731 assert_eq!(
2732 rendered,
2733 r#"{"source":"email.global","class":"email","action":"tokenize","field_name":null,"document_kind":"text","conflict_loser":false,"decided_by":"none","created_at":0,"session_id":null,"validator_fail_reason":null,"ambiguity_record":null,"collision_family":null,"collision_variant":null,"fallback_triggered":null}"#
2734 );
2735 }
2736
/// Once recognizer metadata is attached, both lineage keys must appear in the
/// serialized JSON with the values that were supplied.
#[test]
fn redaction_entry_json_shape_includes_recognizer_lineage_when_present() {
    let base = RedactionEntry::new(
        "ner/ort",
        PiiClass::Name,
        Action::Tokenize,
        None,
        DocumentKind::Text,
        false,
        ConflictTier::None,
        0,
        None,
    );
    let with_lineage = base.with_recognizer_metadata(
        Some(String::from("ner")),
        Some(String::from("ner.davlan-mbert.v1")),
    );

    let json: serde_json::Value =
        serde_json::to_value(&with_lineage).expect("serialize redaction entry");

    // (JSON key, expected string value) pairs for the lineage fields.
    let expectations = [
        ("recognizer_id", "ner"),
        ("recognizer_version_id", "ner.davlan-mbert.v1"),
    ];
    for (key, expected) in expectations {
        assert_eq!(json[key], expected);
    }
}
2762 #[test]
2763 fn candidate_keeps_versioned_and_unversioned_recognizer_ids() {
2764 let unversioned = Candidate::new(
2765 0..5,
2766 PiiClass::Email,
2767 "email.global",
2768 0.9,
2769 10,
2770 None,
2771 "email",
2772 "email.global",
2773 ConflictTier::None,
2774 Vec::new(),
2775 );
2776 assert_eq!(unversioned.recognizer_id, "email.global");
2777 assert_eq!(unversioned.recognizer_version_id, None);
2778
2779 let versioned = unversioned
2780 .clone()
2781 .with_recognizer_version_id("email.global.v1");
2782 assert_eq!(versioned.recognizer_id, "email.global");
2783 assert_eq!(
2784 versioned.recognizer_version_id.as_deref(),
2785 Some("email.global.v1")
2786 );
2787 }
2788}
2789
#[cfg(test)]
mod safety_net_manifest_tests {
    //! Tests for `Manifest::diff_against` leak classification and for the
    //! `Display` rendering of `SafetyNetError`.

    use super::*;

    /// Builds an emitted-token span whose clean and raw ranges are both `start..end`.
    fn token(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        let range = start..end;
        EmittedTokenSpan {
            clean_span: range.clone(),
            raw_span: range,
            class,
        }
    }

    /// Builds a manifest of `Email` tokens from `(start, end)` pairs.
    fn email_manifest(ranges: &[(usize, usize)]) -> Manifest {
        Manifest::from_spans(
            ranges
                .iter()
                .map(|&(start, end)| token(start, end, PiiClass::Email))
                .collect::<Vec<_>>(),
        )
    }

    /// Asks `manifest` how it classifies `suspect` when tagged as `class`.
    fn leak_for(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    /// A suspect range exactly matching a same-class token is fully covered.
    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let leak = leak_for(email_manifest(&[(0, 8)]), 0..8, PiiClass::Email);

        assert_eq!(leak, None);
    }

    /// A suspect range that touches no token at all is reported as uncovered.
    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let leak = leak_for(email_manifest(&[(20, 30)]), 0..10, PiiClass::Email);

        assert_eq!(leak, Some(LeakKind::Uncovered));
    }

    /// One hole between two tokens is reported as a partial bleed over that hole.
    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let leak = leak_for(email_manifest(&[(0, 5), (10, 15)]), 0..15, PiiClass::Email);

        assert_eq!(leak, Some(LeakKind::PartialBleed { uncovered: 5..10 }));
    }

    /// With several holes, the first one (lowest offsets) is the one reported.
    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let leak = leak_for(
            email_manifest(&[(0, 3), (5, 7), (9, 12)]),
            0..12,
            PiiClass::Email,
        );

        assert_eq!(leak, Some(LeakKind::PartialBleed { uncovered: 3..5 }));
    }

    /// When tokens of several other classes cover the suspect, the first
    /// mismatching token's class is the one reported.
    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let mixed = Manifest::from_spans(vec![
            token(0, 4, PiiClass::Name),
            token(4, 8, PiiClass::Location),
        ]);

        let expected = LeakKind::ClassMismatch {
            pipeline_class: PiiClass::Name,
            safety_net_class: PiiClass::Email,
        };
        assert_eq!(leak_for(mixed, 0..8, PiiClass::Email), Some(expected));
    }

    /// Two tokens that touch end-to-start leave no gap between them.
    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let leak = leak_for(email_manifest(&[(0, 5), (5, 10)]), 0..10, PiiClass::Email);

        assert_eq!(leak, None);
    }

    /// Partial bleeds are detected before a token, after a token, and between tokens.
    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let single = email_manifest(&[(3, 8)]);
        let gapped = email_manifest(&[(0, 3), (6, 10)]);

        // (manifest, suspect range, uncovered range expected in the report)
        let cases = [
            (single.clone(), 0..8, 0..3),
            (single, 3..10, 8..10),
            (gapped, 0..10, 3..6),
        ];
        for (manifest, suspect, uncovered) in cases {
            assert_eq!(
                leak_for(manifest, suspect, PiiClass::Email),
                Some(LeakKind::PartialBleed { uncovered })
            );
        }
    }

    /// Spans are byte offsets: a multi-byte character before the token shifts
    /// the token start by its encoded length, not by one.
    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");

        let token_end = text.len();
        let leak = leak_for(
            email_manifest(&[(token_start, token_end)]),
            token_start..token_end,
            PiiClass::Email,
        );

        assert_eq!(leak, None);
    }

    /// An empty suspect range has nothing to leak, even against an empty manifest.
    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let leak = leak_for(Manifest::default(), 3..3, PiiClass::Email);

        assert_eq!(leak, None);
    }

    /// Rendering each error variant must never surface the sample PII string.
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let errors = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            },
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            },
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            },
            SafetyNetError::ModelIntegrityMismatch {
                expected: "e3b0c44298fc1c149afbf4c8996fb924".to_string(),
                actual: "4e07408562bedb8b60ce05c1decfe3ad".to_string(),
            },
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            },
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            },
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            },
        ];

        for error in &errors {
            let rendered = error.to_string();
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
2961
2962pub trait Recognizer: Send + Sync {
2964 fn id(&self) -> &str;
2966 fn supported_class(&self) -> &PiiClass;
2968 fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
2970 fn token_family(&self) -> &str;
2972 fn validator_kind(&self) -> Option<ValidatorKind> {
2974 None
2975 }
2976 fn locales(&self) -> &[LocaleTag] {
2978 &[LocaleTag::Global]
2979 }
2980}
2981
2982#[derive(Debug, Clone, PartialEq)]
2984#[non_exhaustive]
2985pub struct Candidate {
2986 pub span: Range<usize>,
2988 pub class: PiiClass,
2990 pub recognizer_id: String,
2992 pub recognizer_version_id: Option<String>,
2994 pub score: f32,
2996 pub priority: i32,
2998 pub canonical_form: Option<String>,
3000 pub token_family: String,
3002 pub source: String,
3004 pub decided_by: ConflictTier,
3006 pub merged_sources: Vec<String>,
3008}
3009
3010impl Candidate {
3011 #[allow(clippy::too_many_arguments)]
3013 pub fn new(
3014 span: Range<usize>,
3015 class: PiiClass,
3016 recognizer_id: impl Into<String>,
3017 score: f32,
3018 priority: i32,
3019 canonical_form: Option<String>,
3020 token_family: impl Into<String>,
3021 source: impl Into<String>,
3022 decided_by: ConflictTier,
3023 merged_sources: Vec<String>,
3024 ) -> Self {
3025 Self {
3026 span,
3027 class,
3028 recognizer_id: recognizer_id.into(),
3029 recognizer_version_id: None,
3030 score,
3031 priority,
3032 canonical_form,
3033 token_family: token_family.into(),
3034 source: source.into(),
3035 decided_by,
3036 merged_sources,
3037 }
3038 }
3039
3040 pub fn with_span(mut self, span: Range<usize>) -> Self {
3042 self.span = span;
3043 self
3044 }
3045
3046 pub fn with_recognizer_version_id(mut self, recognizer_version_id: impl Into<String>) -> Self {
3048 self.recognizer_version_id = Some(recognizer_version_id.into());
3049 self
3050 }
3051}
3052
3053#[non_exhaustive]
3055pub struct DetectContext<'a> {
3056 pub locale_chain: &'a [LocaleTag],
3058 pub dictionaries: &'a DictionaryBundle,
3060 pub fields: &'a (),
3062 pub degraded: Cell<bool>,
3064}
3065
3066impl<'a> DetectContext<'a> {
3067 pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
3069 Self {
3070 locale_chain,
3071 dictionaries,
3072 fields: &(),
3073 degraded: Cell::new(false),
3074 }
3075 }
3076}
3077
3078fn ensure_global(tags: &mut Vec<LocaleTag>) {
3079 if !tags.contains(&LocaleTag::Global) {
3080 tags.push(LocaleTag::Global);
3081 }
3082}
3083
/// Loosely checks whether `raw` has the hyphen-separated shape of a language tag.
///
/// The first subtag must be 2–8 ASCII letters; every following subtag must be
/// 2–8 ASCII letters or digits. Single-character subtags are therefore
/// rejected. This is a shape check only; no subtag registry is consulted.
fn is_bcp47_parseable(raw: &str) -> bool {
    let has_valid_len = |subtag: &str| (2..=8).contains(&subtag.len());

    let mut subtags = raw.split('-');
    // `split` always yields at least one piece; fail closed if it ever does not.
    let Some(primary) = subtags.next() else {
        return false;
    };

    // Byte-wise ASCII checks: any non-ASCII character fails them.
    let primary_ok =
        has_valid_len(primary) && primary.bytes().all(|byte| byte.is_ascii_alphabetic());

    primary_ok
        && subtags.all(|subtag| {
            has_valid_len(subtag) && subtag.bytes().all(|byte| byte.is_ascii_alphanumeric())
        })
}
3096
/// Normalizes the letter case of a hyphen-separated locale string.
///
/// The first subtag is lowercased. Each later subtag is uppercased when it is
/// exactly two ASCII letters and lowercased otherwise. Only ASCII letters
/// change case; subtags are re-joined with `-`.
fn canonical_other(raw: &str) -> String {
    // ASCII case mapping never changes byte length, so this capacity is exact.
    let mut canonical = String::with_capacity(raw.len());

    for (position, subtag) in raw.split('-').enumerate() {
        let is_first = position == 0;
        if !is_first {
            canonical.push('-');
        }

        let two_letter = subtag.len() == 2 && subtag.bytes().all(|byte| byte.is_ascii_alphabetic());
        if two_letter && !is_first {
            canonical.extend(subtag.chars().map(|ch| ch.to_ascii_uppercase()));
        } else {
            canonical.extend(subtag.chars().map(|ch| ch.to_ascii_lowercase()));
        }
    }

    canonical
}