1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use sha3::{Digest, Keccak256};
10use thiserror::Error;
11
/// A text scanner that reports [`Detection`]s.
///
/// Implementations must be `Send + Sync`.
pub trait Detector: Send + Sync {
    /// Scans `input` and returns the detections found in it.
    fn detect(&self, input: &str) -> Vec<Detection>;

    /// Fallible counterpart of [`Detector::detect`].
    ///
    /// The default implementation simply wraps the result of `detect` in `Ok`,
    /// so it never fails. Implementations may override it to surface a
    /// [`RecognizerRuntimeError`].
    fn try_detect(&self, input: &str) -> Result<Vec<Detection>, RecognizerRuntimeError> {
        Ok(self.detect(input))
    }
}
22
/// Error describing a recognizer failure at runtime.
///
/// Displayed as `recognizer '<id>' backend failed: <message>`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RecognizerRuntimeError {
    /// Identifier of the recognizer the error is attributed to.
    pub recognizer_id: String,
    /// Free-form description of the failure.
    pub message: String,
}

impl RecognizerRuntimeError {
    /// Builds an error from anything convertible into owned strings.
    pub fn new(recognizer_id: impl Into<String>, message: impl Into<String>) -> Self {
        let recognizer_id = recognizer_id.into();
        let message = message.into();
        Self {
            recognizer_id,
            message,
        }
    }
}

impl fmt::Display for RecognizerRuntimeError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            recognizer_id,
            message,
        } = self;
        write!(f, "recognizer '{recognizer_id}' backend failed: {message}")
    }
}

impl std::error::Error for RecognizerRuntimeError {}
51
/// Category assigned to a piece of detected PII.
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so reordering
/// variants changes comparison results.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Built-in class; policy name `email`.
    Email,
    /// Built-in class; policy name `name`.
    Name,
    /// Built-in class; policy name `location`.
    Location,
    /// Built-in class; policy name `organization`.
    Organization,
    /// Caller-defined class carrying its name.
    ///
    /// `PiiClass::custom` builds this variant with a normalized name.
    Custom(String),
}
92
/// Display names of the built-in [`PiiClass`] variants.
///
/// `PiiClass::class_name` indexes this slice by position (Email = 0, Name = 1,
/// Location = 2, Organization = 3), so the order must stay in sync with it.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
95
/// Reserved family names.
///
/// NOTE(review): presumably these are the collision-family identifiers (see
/// `CollisionMembership::family`) reserved for bundled recognizers — confirm
/// against the code that consumes this list.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
114
// Restore-phase bit flags; each constant occupies a distinct bit of a `u32`.
// NOTE(review): presumably OR-ed into `RestoreTelemetry::phase_execution_mask` —
// confirm against the restore implementation.

/// Bit 0: manifest lookup phase.
pub const RESTORE_PHASE_MANIFEST_LOOKUP: u32 = 1 << 0;
/// Bit 1: unknown-token scan phase.
pub const RESTORE_PHASE_UNKNOWN_TOKEN_SCAN: u32 = 1 << 1;
/// Bit 2: manifest-bypass scan phase.
pub const RESTORE_PHASE_MANIFEST_BYPASS_SCAN: u32 = 1 << 2;
/// Bit 3: fresh-PII scan phase.
pub const RESTORE_PHASE_FRESH_PII_SCAN: u32 = 1 << 3;
119
120#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
121#[non_exhaustive]
122pub struct RestoredText {
123 pub text: String,
124}
125
126impl RestoredText {
127 pub fn new(text: impl Into<String>) -> Self {
128 Self { text: text.into() }
129 }
130}
131
132#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
133#[serde(rename_all = "snake_case")]
134#[non_exhaustive]
135pub enum RestorePolicy {
136 Strict,
137 Lenient,
138}
139
140impl RestorePolicy {
141 pub fn as_str(self) -> &'static str {
142 match self {
143 Self::Strict => "strict",
144 Self::Lenient => "lenient",
145 }
146 }
147}
148
149#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
150#[serde(rename_all = "snake_case")]
151#[non_exhaustive]
152pub enum RestoreDecision {
153 Success,
154 Partial,
155 Failed,
156}
157
158impl RestoreDecision {
159 pub fn as_str(self) -> &'static str {
160 match self {
161 Self::Success => "success",
162 Self::Partial => "partial",
163 Self::Failed => "failed",
164 }
165 }
166}
167
168#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
169#[non_exhaustive]
170pub struct RestoreTelemetry {
171 pub unknown_token_count: u64,
172 pub manifest_bypass_count: u64,
173 pub fresh_pii_detected_count: u64,
174 pub restore_policy: RestorePolicy,
175 pub restore_decision: RestoreDecision,
176 pub phase_execution_mask: u32,
177}
178
179impl RestoreTelemetry {
180 pub fn new(restore_policy: RestorePolicy) -> Self {
181 Self {
182 unknown_token_count: 0,
183 manifest_bypass_count: 0,
184 fresh_pii_detected_count: 0,
185 restore_policy,
186 restore_decision: RestoreDecision::Success,
187 phase_execution_mask: 0,
188 }
189 }
190
191 pub fn restore_policy_str(&self) -> &'static str {
192 self.restore_policy.as_str()
193 }
194
195 pub fn restore_decision_str(&self) -> &'static str {
196 self.restore_decision.as_str()
197 }
198}
199
/// Membership of a recognizer in a collision family.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Name of the family.
    pub family: String,
    /// Name of the variant within the family.
    pub variant: String,
    /// Precedence value.
    /// NOTE(review): ordering direction is not visible here — confirm.
    pub precedence: u32,
    /// Optional mandatory anchor.
    pub mandatory_anchor: Option<String>,
}

impl CollisionMembership {
    /// Builds a membership, converting `family` and `variant` into owned strings.
    pub fn new(
        family: impl Into<String>,
        variant: impl Into<String>,
        precedence: u32,
        mandatory_anchor: Option<String>,
    ) -> Self {
        let family = family.into();
        let variant = variant.into();
        Self {
            family,
            variant,
            precedence,
            mandatory_anchor,
        }
    }
}
230
231impl PiiClass {
232 pub fn from_policy_name(input: &str) -> Option<Self> {
234 match input {
235 "email" => Some(Self::Email),
236 "name" => Some(Self::Name),
237 "location" => Some(Self::Location),
238 "organization" => Some(Self::Organization),
239 custom if custom.starts_with("custom:") => {
240 let name = custom.trim_start_matches("custom:");
241 (!name.trim().is_empty()).then(|| Self::custom(name))
242 }
243 _ => None,
244 }
245 }
246
247 pub fn builtin_variants() -> &'static [PiiClass] {
249 &[
250 PiiClass::Email,
251 PiiClass::Name,
252 PiiClass::Location,
253 PiiClass::Organization,
254 ]
255 }
256
257 pub fn custom(name: &str) -> Self {
259 let mut normalized = String::new();
260 let mut pending_underscore = false;
261 for ch in name.trim().chars() {
262 if ch.is_ascii_alphanumeric() {
263 if pending_underscore && !normalized.is_empty() {
264 normalized.push('_');
265 }
266 normalized.push(ch.to_ascii_lowercase());
267 pending_underscore = false;
268 } else {
269 pending_underscore = true;
270 }
271 }
272
273 Self::Custom(normalized)
274 }
275
276 pub fn as_custom_name(&self) -> Option<&str> {
278 match self {
279 Self::Custom(name) => Some(name.as_str()),
280 Self::Email | Self::Name | Self::Location | Self::Organization => None,
281 }
282 }
283
284 pub fn class_name(&self) -> String {
286 match self {
287 Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
288 Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
289 Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
290 Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
291 Self::Custom(name) => format!("Custom:{name}"),
292 }
293 }
294
295 pub fn to_canonical_str(&self) -> String {
297 match self {
298 Self::Email => "email".to_string(),
299 Self::Name => "name".to_string(),
300 Self::Location => "location".to_string(),
301 Self::Organization => "organization".to_string(),
302 Self::Custom(name) => format!("custom:{name}"),
303 }
304 }
305
306 pub fn from_canonical_str(value: &str) -> Option<Self> {
308 match value {
309 "email" | "Email" => Some(Self::Email),
310 "name" | "Name" => Some(Self::Name),
311 "location" | "Location" => Some(Self::Location),
312 "organization" | "Organization" => Some(Self::Organization),
313 custom if custom.starts_with("custom:") => {
314 let name = &custom["custom:".len()..];
315 (!name.is_empty()).then(|| Self::Custom(name.to_string()))
316 }
317 _ => None,
318 }
319 }
320}
321
322#[derive(Debug, Clone, PartialEq, Eq)]
328#[non_exhaustive]
329pub struct PiiClassAudit(pub PiiClass);
330
331impl PiiClassAudit {
332 pub fn new(class: PiiClass) -> Self {
334 Self(class)
335 }
336
337 pub fn into_inner(self) -> PiiClass {
339 self.0
340 }
341}
342
343impl Serialize for PiiClassAudit {
344 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
345 where
346 S: serde::Serializer,
347 {
348 serializer.serialize_str(&self.0.to_canonical_str())
349 }
350}
351
352impl<'de> Deserialize<'de> for PiiClassAudit {
353 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
354 where
355 D: serde::Deserializer<'de>,
356 {
357 let value = String::deserialize(deserializer)?;
358 PiiClass::from_canonical_str(&value)
359 .map(Self)
360 .ok_or_else(|| {
361 serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
362 })
363 }
364}
365
366mod pii_class_audit_serde {
367 use super::{PiiClass, PiiClassAudit};
368 use serde::{Deserialize, Deserializer, Serialize, Serializer};
369
370 pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
371 where
372 S: Serializer,
373 {
374 PiiClassAudit::new(class.clone()).serialize(serializer)
375 }
376
377 pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
378 where
379 D: Deserializer<'de>,
380 {
381 Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
382 }
383}
384
385#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
387#[non_exhaustive]
388pub struct LosingCandidate {
389 #[serde(with = "pii_class_audit_serde")]
391 pub class: PiiClass,
392 pub recognizer_id: String,
394}
395
396impl LosingCandidate {
397 pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
399 Self {
400 class,
401 recognizer_id: recognizer_id.into(),
402 }
403 }
404}
405
406#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
408#[non_exhaustive]
409pub struct AmbiguityRecord {
410 #[serde(with = "pii_class_audit_serde")]
412 pub ambiguity_class: PiiClass,
413 pub losing_candidates: Vec<LosingCandidate>,
417 pub reason: AmbiguityReason,
419}
420
421impl AmbiguityRecord {
422 pub fn new(
424 ambiguity_class: PiiClass,
425 losing_candidates: Vec<LosingCandidate>,
426 reason: AmbiguityReason,
427 ) -> Self {
428 Self {
429 ambiguity_class,
430 losing_candidates,
431 reason,
432 }
433 }
434}
435
/// Reason attached to an [`AmbiguityRecord`]; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// Serialized as `no_anchor`.
    NoAnchor,
    /// Serialized as `validator_indeterminate`.
    ValidatorIndeterminate,
    /// Serialized as `multi_family_match`.
    MultiFamilyMatch,
    /// Serialized as `precedence_tie`.
    PrecedenceTie,
}
450
/// Reason a validator rejected its input; serialized in `snake_case`.
///
/// Each variant is the value returned by `ValidatorKind::fail_reason` for the
/// corresponding validator kind.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    /// Luhn validator failed.
    LuhnFailed,
    /// IBAN mod-97 validator failed.
    IbanMod97Failed,
    /// E-mail validator rejected the input.
    /// Also deserializes from the alias `email_rfc_failed`.
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    /// E.164 phone validator rejected the input.
    /// Also deserializes from the alias `e164_phone_failed`.
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    /// Region-specific phone validator rejected the input.
    PhoneNationalRegionMismatch,
    /// IPv4 parsing failed.
    Ipv4ParseFailed,
    /// IPv6 parsing failed.
    Ipv6ParseFailed,
    /// Ethereum EIP-55 validator failed.
    EthEip55ChecksumFailed,
    /// Aadhaar Verhoeff validator failed.
    AadhaarVerhoeffFailed,
    /// French NIR mod-97 validator failed.
    FrNirMod97Failed,
    /// German Steuer-ID mod-11,10 validator failed.
    DeSteuerIdMod1110Failed,
    /// BSN mod-11 validator failed.
    BsnMod11Failed,
    /// CPF mod-11 validator failed.
    CpfMod11Failed,
    /// CNPJ mod-11 validator failed.
    CnpjMod11Failed,
    /// UK NHS mod-11 validator failed.
    UkNhsMod11Failed,
}
489
/// Result of running a validator; serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// The input was accepted; `canonical_form` optionally carries its
    /// canonical spelling.
    Pass { canonical_form: Option<String> },
    /// The input was rejected for `reason`.
    Fail { reason: ValidatorFailReason },
    /// No validator applies.
    /// Not produced by `ValidatorKind::validate`, which only returns `Pass` or `Fail`.
    NotApplicable,
}
502
/// Error returned by `ValidatorKind::parse`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    /// The given name does not match any validator compiled into this build.
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        /// The name that failed to parse, verbatim.
        kind: String,
    },
}
514
/// Validators selectable by name via `ValidatorKind::parse`.
///
/// The phone variants exist only when the `phone-parser` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    /// Parsed from `email_rfc`.
    EmailRfc,
    /// Parsed from `e164_phone`.
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// Parsed from `e164_phone_national_de` / `e164_phone_national_us`.
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    /// Parsed from `luhn`.
    Luhn,
    /// Parsed from `iban_mod97`.
    IbanMod97,
    /// Parsed from `ipv4_parse`.
    Ipv4Parse,
    /// Parsed from `ipv6_parse`.
    Ipv6Parse,
    /// Parsed from `eth_eip55`.
    EthEip55,
    /// Parsed from `aadhaar_verhoeff`.
    AadhaarVerhoeff,
    /// Parsed from `fr_nir_mod97`.
    FrNirMod97,
    /// Parsed from `de_steuer_id_mod1110`.
    DeSteuerIdMod1110,
    /// Parsed from `bsn_mod11`.
    BsnMod11,
    /// Parsed from `cpf_mod11`.
    CpfMod11,
    /// Parsed from `cnpj_mod11`.
    CnpjMod11,
    /// Parsed from `uk_nhs_mod11`.
    UkNhsMod11,
}
552
/// Region used by the national phone validator (`phone-parser` feature only).
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    /// Germany (calling code 49 in `validate_phone_national`).
    De,
    /// United States (calling code 1 in `validate_phone_national`).
    Us,
}
563
564impl ValidatorKind {
565 pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
567 match s {
568 "email_rfc" => Ok(Self::EmailRfc),
569 #[cfg(feature = "phone-parser")]
570 "e164_phone" => Ok(Self::E164Phone),
571 #[cfg(feature = "phone-parser")]
572 "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
573 #[cfg(feature = "phone-parser")]
574 "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
575 "luhn" => Ok(Self::Luhn),
576 "iban_mod97" => Ok(Self::IbanMod97),
577 "ipv4_parse" => Ok(Self::Ipv4Parse),
578 "ipv6_parse" => Ok(Self::Ipv6Parse),
579 "eth_eip55" => Ok(Self::EthEip55),
580 "aadhaar_verhoeff" => Ok(Self::AadhaarVerhoeff),
581 "fr_nir_mod97" => Ok(Self::FrNirMod97),
582 "de_steuer_id_mod1110" => Ok(Self::DeSteuerIdMod1110),
583 "bsn_mod11" => Ok(Self::BsnMod11),
584 "cpf_mod11" => Ok(Self::CpfMod11),
585 "cnpj_mod11" => Ok(Self::CnpjMod11),
586 "uk_nhs_mod11" => Ok(Self::UkNhsMod11),
587 other => Err(ValidatorKindParseError::UnsupportedValidator {
588 kind: other.to_string(),
589 }),
590 }
591 }
592
593 pub fn validates(self, input: &str) -> bool {
595 match self {
596 Self::AadhaarVerhoeff => aadhaar_verhoeff_check(input),
597 Self::FrNirMod97 => fr_nir_mod97_check(input),
598 Self::DeSteuerIdMod1110 => de_steuer_id_mod1110_check(input),
599 Self::BsnMod11 => bsn_mod11_check(input),
600 Self::CpfMod11 => cpf_mod11_check(input),
601 Self::CnpjMod11 => cnpj_mod11_check(input),
602 Self::UkNhsMod11 => uk_nhs_mod11_check(input),
603 _ => self.canonical_form(input).is_some(),
604 }
605 }
606
607 pub fn validate(self, input: &str) -> ValidatorOutcome {
609 match self.canonical_form(input) {
610 Some(canonical_form) => ValidatorOutcome::Pass {
611 canonical_form: Some(canonical_form),
612 },
613 None => ValidatorOutcome::Fail {
614 reason: self.fail_reason(),
615 },
616 }
617 }
618
619 pub fn canonical_form(self, input: &str) -> Option<String> {
621 match self {
622 Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
623 #[cfg(feature = "phone-parser")]
624 Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
625 #[cfg(feature = "phone-parser")]
626 Self::E164PhoneNational(region) => validate_phone_national(region, input),
627 Self::Luhn => luhn_check(input).then(|| input.to_string()),
628 Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
629 Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
630 Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
631 Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
632 Self::AadhaarVerhoeff => {
633 canonical_ascii_digits::<12>(input).filter(|_| aadhaar_verhoeff_check(input))
634 }
635 Self::FrNirMod97 => {
636 canonical_ascii_digits::<15>(input).filter(|_| fr_nir_mod97_check(input))
637 }
638 Self::DeSteuerIdMod1110 => {
639 canonical_ascii_digits::<11>(input).filter(|_| de_steuer_id_mod1110_check(input))
640 }
641 Self::BsnMod11 => canonical_ascii_digits::<9>(input).filter(|_| bsn_mod11_check(input)),
642 Self::CpfMod11 => {
643 canonical_ascii_digits::<11>(input).filter(|_| cpf_mod11_check(input))
644 }
645 Self::CnpjMod11 => {
646 canonical_ascii_digits::<14>(input).filter(|_| cnpj_mod11_check(input))
647 }
648 Self::UkNhsMod11 => {
649 canonical_ascii_digits::<10>(input).filter(|_| uk_nhs_mod11_check(input))
650 }
651 }
652 }
653
654 pub fn fail_reason(self) -> ValidatorFailReason {
656 match self {
657 Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
658 #[cfg(feature = "phone-parser")]
659 Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
660 #[cfg(feature = "phone-parser")]
661 Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
662 Self::Luhn => ValidatorFailReason::LuhnFailed,
663 Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
664 Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
665 Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
666 Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
667 Self::AadhaarVerhoeff => ValidatorFailReason::AadhaarVerhoeffFailed,
668 Self::FrNirMod97 => ValidatorFailReason::FrNirMod97Failed,
669 Self::DeSteuerIdMod1110 => ValidatorFailReason::DeSteuerIdMod1110Failed,
670 Self::BsnMod11 => ValidatorFailReason::BsnMod11Failed,
671 Self::CpfMod11 => ValidatorFailReason::CpfMod11Failed,
672 Self::CnpjMod11 => ValidatorFailReason::CnpjMod11Failed,
673 Self::UkNhsMod11 => ValidatorFailReason::UkNhsMod11Failed,
674 }
675 }
676}
677
/// Shallow e-mail shape check.
///
/// Splits at the first `@`; requires a non-empty local part and a domain that
/// contains a `.` but neither starts nor ends with one. Nothing else is
/// validated.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let dot_at_edge = domain.starts_with('.') || domain.ends_with('.');
            let dot_inside = domain.contains('.') && !dot_at_edge;
            !local.is_empty() && dot_inside
        }
    }
}
684
/// Returns `true` when `input` parses without a default country and the
/// `phonenumber` crate considers the parsed number valid.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    match phonenumber::parse(None, input) {
        Ok(phone) => phonenumber::is_valid(&phone),
        Err(_) => false,
    }
}
689
/// Validates `input` as a phone number of `region` and returns it in E.164 form.
///
/// The number is parsed with the region as default country and must carry the
/// region's calling code (49 for `De`, 1 for `Us`). It is accepted when the
/// `phonenumber` crate considers it valid or when it is one of the fixture
/// numbers recognized by `is_safe_fixture_phone`.
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let (country, expected_code) = match region {
        Region::De => (phonenumber::country::DE, 49),
        Region::Us => (phonenumber::country::US, 1),
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    if number.country().code() != expected_code {
        return None;
    }
    let accepted = number.is_valid() || is_safe_fixture_phone(region, input);
    accepted.then(|| number.format().mode(phonenumber::Mode::E164).to_string())
}
709
/// Recognizes a fixed set of fixture phone numbers, compared on digits only.
///
/// * `Us`: exactly `15550100`, or `1` followed by ten digits whose fourth to
///   eighth digits are `55501`.
/// * `De`: one of six hard-coded digit strings.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    let digits: String = input.chars().filter(|ch| ch.is_ascii_digit()).collect();
    match region {
        Region::Us => {
            if digits == "15550100" {
                return true;
            }
            digits
                .strip_prefix('1')
                .is_some_and(|rest| rest.len() == 10 && rest[3..].starts_with("55501"))
        }
        Region::De => {
            const DE_FIXTURES: [&str; 6] = [
                "493000000000",
                "4915100000000",
                "4915550112233",
                "015550112233",
                "491710000000",
                "01710000000",
            ];
            DE_FIXTURES.contains(&digits.as_str())
        }
    }
}
732
/// Luhn checksum over 13 to 19 digits.
///
/// ASCII whitespace and `-` are ignored; any other non-digit byte rejects the
/// input.
fn luhn_check(input: &str) -> bool {
    let mut seen = 0usize;
    let mut total = 0u32;
    // Walk right-to-left so the doubling parity can be tracked while counting.
    for byte in input.bytes().rev() {
        match byte {
            b'-' => {}
            b if b.is_ascii_whitespace() => {}
            b'0'..=b'9' => {
                // A 20th digit can never be valid; stop early.
                if seen == 19 {
                    return false;
                }
                let digit = u32::from(byte - b'0');
                total += if seen % 2 == 1 {
                    let doubled = digit * 2;
                    if doubled > 9 {
                        doubled - 9
                    } else {
                        doubled
                    }
                } else {
                    digit
                };
                seen += 1;
            }
            _ => return false,
        }
    }
    (13..=19).contains(&seen) && total % 10 == 0
}
765
766fn iban_mod97_check(input: &str) -> bool {
767 let canonical = iban_canonicalize(input);
768 if !(15..=34).contains(&canonical.len()) {
769 return false;
770 }
771 if !canonical.chars().all(|ch| ch.is_ascii_alphanumeric()) {
772 return false;
773 }
774
775 let mut remainder = 0u32;
776 for ch in canonical[4..].chars().chain(canonical[..4].chars()) {
777 match ch {
778 '0'..='9' => {
779 remainder = (remainder * 10 + ch.to_digit(10).expect("digit")) % 97;
780 }
781 'A'..='Z' => {
782 let value = u32::from(ch) - u32::from('A') + 10;
783 remainder = (remainder * 10 + value / 10) % 97;
784 remainder = (remainder * 10 + value % 10) % 97;
785 }
786 _ => return false,
787 }
788 }
789 remainder == 1
790}
791
/// Canonicalizes an IBAN candidate: drops ASCII whitespace and uppercases
/// ASCII letters.
///
/// Only ASCII case mapping is applied. Unicode uppercasing would turn some
/// non-ASCII characters into ASCII letters (for example `ſ` into `S` and `ß`
/// into `SS`), letting non-ASCII input masquerade as a well-formed IBAN in the
/// later ASCII-alphanumeric check. Non-ASCII characters are therefore passed
/// through unchanged so that check can reject them.
fn iban_canonicalize(input: &str) -> String {
    input
        .chars()
        .filter(|ch| !ch.is_ascii_whitespace())
        .map(|ch| ch.to_ascii_uppercase())
        .collect()
}
799
/// Returns `true` when `input` parses as a `std::net::Ipv4Addr`.
fn ipv4_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv4Addr, _> = input.parse();
    matches!(parsed, Ok(_))
}
803
/// Returns `true` when `input` parses as a `std::net::Ipv6Addr`.
fn ipv6_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv6Addr, _> = input.parse();
    matches!(parsed, Ok(_))
}
807
808fn eth_eip55_check(input: &str) -> bool {
809 let Some(address) = input.strip_prefix("0x") else {
810 return false;
811 };
812 if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
813 return false;
814 }
815 if address
816 .bytes()
817 .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
818 || address
819 .bytes()
820 .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
821 {
822 return true;
823 }
824
825 let lowercase = address.to_ascii_lowercase();
826 let hash = Keccak256::digest(lowercase.as_bytes());
827 for (index, byte) in address.bytes().enumerate() {
828 if byte.is_ascii_digit() {
829 continue;
830 }
831 let hash_nibble = if index % 2 == 0 {
832 hash[index / 2] >> 4
833 } else {
834 hash[index / 2] & 0x0f
835 };
836 if (hash_nibble > 7) != byte.is_ascii_uppercase() {
837 return false;
838 }
839 }
840 true
841}
842
/// Extracts exactly `N` ASCII digits from `input` as numeric values.
///
/// Space, tab, newline, carriage return, `-`, `.` and `/` are skipped. Any
/// other byte, or a digit count different from `N`, yields `None`.
fn collect_ascii_digits<const N: usize>(input: &str) -> Option<[u8; N]> {
    let mut values = [0u8; N];
    let mut filled = 0usize;
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => {
                // `get_mut` fails once all N slots are taken: too many digits.
                let slot = values.get_mut(filled)?;
                *slot = byte - b'0';
                filled += 1;
            }
            b' ' | b'\t' | b'\n' | b'\r' | b'-' | b'.' | b'/' => {}
            _ => return None,
        }
    }
    if filled == N {
        Some(values)
    } else {
        None
    }
}
861
862fn canonical_ascii_digits<const N: usize>(input: &str) -> Option<String> {
863 let digits = collect_ascii_digits::<N>(input)?;
864 let mut canonical = String::with_capacity(N);
865 for digit in digits {
866 canonical.push(char::from(b'0' + digit));
867 }
868 Some(canonical)
869}
870
/// Returns `true` when at least one digit differs from the first one.
///
/// Arrays of length zero or one have no differing digit, so the result is
/// `false` for them. `split_first` is used instead of indexing so that
/// `N == 0` returns `false` rather than panicking.
fn not_all_same<const N: usize>(digits: &[u8; N]) -> bool {
    match digits.split_first() {
        Some((first, rest)) => rest.iter().any(|digit| digit != first),
        None => false,
    }
}
874
875fn aadhaar_verhoeff_check(input: &str) -> bool {
876 const D: [[u8; 10]; 10] = [
877 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
878 [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
879 [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
880 [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
881 [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
882 [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
883 [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
884 [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
885 [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
886 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
887 ];
888 const P: [[u8; 10]; 8] = [
889 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
890 [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
891 [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
892 [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
893 [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
894 [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
895 [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
896 [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
897 ];
898 let Some(digits) = collect_ascii_digits::<12>(input) else {
899 return false;
900 };
901 if digits[0] < 2 || !not_all_same(&digits) {
902 return false;
903 }
904 let mut checksum = 0u8;
905 for (index, digit) in digits.iter().rev().enumerate() {
906 checksum = D[checksum as usize][P[index % 8][*digit as usize] as usize];
907 }
908 checksum == 0
909}
910
911fn fr_nir_mod97_check(input: &str) -> bool {
912 let Some(digits) = collect_ascii_digits::<15>(input) else {
913 return false;
914 };
915 if !matches!(digits[0], 1 | 2 | 3 | 4 | 7 | 8) {
916 return false;
917 }
918 let month = digits[3] * 10 + digits[4];
919 if !(1..=12).contains(&month) && !(20..=42).contains(&month) && !(50..=99).contains(&month) {
920 return false;
921 }
922 let mut number = 0u32;
923 for digit in &digits[..13] {
924 number = (number * 10 + u32::from(*digit)) % 97;
925 }
926 let key = u32::from(digits[13]) * 10 + u32::from(digits[14]);
927 97 - number == key
928}
929
930fn de_steuer_id_mod1110_check(input: &str) -> bool {
931 let Some(digits) = collect_ascii_digits::<11>(input) else {
932 return false;
933 };
934 if !steuer_id_first_ten_digits_valid(&digits) {
935 return false;
936 }
937 let mut product = 10u8;
938 for digit in &digits[..10] {
939 let mut sum = (*digit + product) % 10;
940 if sum == 0 {
941 sum = 10;
942 }
943 product = (2 * sum) % 11;
944 }
945 let check = (11 - product) % 10;
946 check == digits[10]
947}
948
/// Structural rule for the first ten digits of a German tax ID.
///
/// The first digit must not be 0, exactly one digit value may occur more than
/// once, that value must occur two or three times, and one or two digit values
/// must be absent. Only `digits[..10]` is inspected.
fn steuer_id_first_ten_digits_valid(digits: &[u8; 11]) -> bool {
    if digits[0] == 0 {
        return false;
    }

    let mut histogram = [0u8; 10];
    for digit in &digits[..10] {
        histogram[usize::from(*digit)] += 1;
    }

    let mut repeated = 0usize;
    let mut missing = 0usize;
    let mut pair_or_triple = false;
    for occurrences in histogram {
        match occurrences {
            0 => missing += 1,
            1 => {}
            2 | 3 => {
                repeated += 1;
                pair_or_triple = true;
            }
            _ => repeated += 1,
        }
    }
    repeated == 1 && pair_or_triple && matches!(missing, 1 | 2)
}
962
963fn bsn_mod11_check(input: &str) -> bool {
964 let Some(digits) = collect_ascii_digits::<9>(input) else {
965 return false;
966 };
967 if !not_all_same(&digits) {
968 return false;
969 }
970 let sum: i32 = digits[..8]
971 .iter()
972 .enumerate()
973 .map(|(index, digit)| i32::from(*digit) * (9 - index as i32))
974 .sum::<i32>()
975 - i32::from(digits[8]);
976 sum.rem_euclid(11) == 0
977}
978
979fn cpf_mod11_check(input: &str) -> bool {
980 let Some(digits) = collect_ascii_digits::<11>(input) else {
981 return false;
982 };
983 if !not_all_same(&digits) {
984 return false;
985 }
986 mod11_check_digit(&digits[..9], 10) == digits[9]
987 && mod11_check_digit(&digits[..10], 11) == digits[10]
988}
989
990fn cnpj_mod11_check(input: &str) -> bool {
991 let Some(digits) = collect_ascii_digits::<14>(input) else {
992 return false;
993 };
994 if !not_all_same(&digits) {
995 return false;
996 }
997 const FIRST: [u8; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
998 const SECOND: [u8; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
999 weighted_mod11_check_digit(&digits[..12], &FIRST) == digits[12]
1000 && weighted_mod11_check_digit(&digits[..13], &SECOND) == digits[13]
1001}
1002
1003fn uk_nhs_mod11_check(input: &str) -> bool {
1004 let Some(digits) = collect_ascii_digits::<10>(input) else {
1005 return false;
1006 };
1007 if !not_all_same(&digits) {
1008 return false;
1009 }
1010 let sum: u32 = digits[..9]
1011 .iter()
1012 .enumerate()
1013 .map(|(index, digit)| u32::from(*digit) * (10 - index as u32))
1014 .sum();
1015 let check = 11 - (sum % 11);
1016 let check = if check == 11 { 0 } else { check };
1017 check != 10 && check == u32::from(digits[9])
1018}
1019
/// Computes a mod-11 check digit with weights descending from `start_weight`
/// down to 2.
///
/// Digits and weights are paired positionally; pairing stops at the shorter of
/// the two. A remainder of 0 or 1 yields 0, otherwise `11 - remainder`.
fn mod11_check_digit(digits: &[u8], start_weight: u8) -> u8 {
    let mut total = 0u32;
    for (digit, weight) in digits.iter().zip((2..=start_weight).rev()) {
        total += u32::from(*digit) * u32::from(weight);
    }
    match total % 11 {
        0 | 1 => 0,
        remainder => (11 - remainder) as u8,
    }
}
1034
/// Computes a mod-11 check digit using an explicit weight table.
///
/// Digits and weights are paired positionally; pairing stops at the shorter of
/// the two. A remainder of 0 or 1 yields 0, otherwise `11 - remainder`.
fn weighted_mod11_check_digit(digits: &[u8], weights: &[u8]) -> u8 {
    let mut total = 0u32;
    for (digit, weight) in digits.iter().zip(weights.iter()) {
        total += u32::from(*digit) * u32::from(*weight);
    }
    match total % 11 {
        0 | 1 => 0,
        remainder => (11 - remainder) as u8,
    }
}
1048
1049#[derive(Debug, Clone, PartialEq, Eq)]
1051#[non_exhaustive]
1052pub struct Detection {
1053 pub span: Range<usize>,
1055 pub class: PiiClass,
1057 pub source: String,
1059}
1060
1061impl Detection {
1062 pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
1064 Self {
1065 span,
1066 class,
1067 source: source.into(),
1068 }
1069 }
1070}
1071
/// A secondary checker that inspects already-cleaned text for suspected leaks.
///
/// Implementations must be `Send + Sync`.
pub trait SafetyNet: Send + Sync {
    /// Identifier of this safety net.
    fn id(&self) -> &str;

    /// Locales this safety net supports.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Checks `clean_text` using the supplied `context`.
    ///
    /// Returns the suspected leaks, or a [`SafetyNetError`] on failure.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
1099
1100#[derive(Debug, Clone, Copy)]
1102#[non_exhaustive]
1103pub struct SafetyNetContext<'a> {
1104 pub manifest: &'a Manifest,
1106 pub locale_chain: &'a [LocaleTag],
1110 pub document_kind: DocumentKind,
1112 pub session_id: Option<&'a str>,
1114 pub field_path: Option<&'a str>,
1116}
1117
1118impl<'a> SafetyNetContext<'a> {
1119 pub fn new(
1121 manifest: &'a Manifest,
1122 locale_chain: &'a [LocaleTag],
1123 document_kind: DocumentKind,
1124 session_id: Option<&'a str>,
1125 field_path: Option<&'a str>,
1126 ) -> Self {
1127 Self {
1128 manifest,
1129 locale_chain,
1130 document_kind,
1131 session_id,
1132 field_path,
1133 }
1134 }
1135}
1136
1137#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1139#[non_exhaustive]
1140pub struct EmittedTokenSpan {
1141 pub clean_span: Range<usize>,
1143 pub raw_span: Range<usize>,
1145 pub class: PiiClass,
1147}
1148
1149impl EmittedTokenSpan {
1150 pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
1152 Self {
1153 clean_span,
1154 raw_span,
1155 class,
1156 }
1157 }
1158}
1159
1160#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
1162#[non_exhaustive]
1163pub struct Manifest {
1164 pub spans: Vec<EmittedTokenSpan>,
1166}
1167
1168impl Manifest {
1169 pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
1171 spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
1172 Self { spans }
1173 }
1174
1175 pub fn diff_against(
1183 &self,
1184 suspect_span: &Range<usize>,
1185 suspect_class: &PiiClass,
1186 ) -> Option<LeakKind> {
1187 if suspect_span.is_empty() {
1188 return None;
1189 }
1190
1191 let start_idx = self
1192 .spans
1193 .partition_point(|span| span.clean_span.end <= suspect_span.start);
1194 let overlapping = self.spans[start_idx..]
1195 .iter()
1196 .take_while(|span| span.clean_span.start < suspect_span.end)
1197 .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
1198 .collect::<Vec<_>>();
1199
1200 if overlapping.is_empty() {
1201 return Some(LeakKind::Uncovered);
1202 }
1203
1204 let mut cursor = suspect_span.start;
1205 let mut first_mismatch = None::<&EmittedTokenSpan>;
1206 for span in overlapping {
1207 if span.clean_span.start > cursor {
1208 return Some(LeakKind::PartialBleed {
1209 uncovered: cursor..span.clean_span.start.min(suspect_span.end),
1210 });
1211 }
1212
1213 if span.clean_span.end > cursor {
1214 if first_mismatch.is_none() && &span.class != suspect_class {
1215 first_mismatch = Some(span);
1216 }
1217 cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
1218 if cursor >= suspect_span.end {
1219 break;
1220 }
1221 }
1222 }
1223
1224 if cursor < suspect_span.end {
1225 return Some(LeakKind::PartialBleed {
1226 uncovered: cursor..suspect_span.end,
1227 });
1228 }
1229
1230 first_mismatch.map(|span| LeakKind::ClassMismatch {
1231 pipeline_class: span.class.clone(),
1232 safety_net_class: suspect_class.clone(),
1233 })
1234 }
1235}
1236
/// Returns `true` when each range starts before the other one ends.
///
/// Touching ranges (`a.end == b.start`) do not overlap. An empty range lying
/// strictly inside the other range does count as overlapping.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    let left_is_after = left.start >= right.end;
    let right_is_after = right.start >= left.end;
    !(left_is_after || right_is_after)
}
1240
1241#[derive(Debug, Clone, PartialEq)]
1243#[non_exhaustive]
1244pub struct LeakSuspect {
1245 pub span: Range<usize>,
1247 pub class: PiiClass,
1249 pub safety_net_id: String,
1251 pub score: Option<f32>,
1253 pub kind: LeakKind,
1255 pub raw_label: String,
1257 pub field_path: Option<String>,
1259}
1260
1261impl LeakSuspect {
1262 pub fn new(
1264 span: Range<usize>,
1265 class: PiiClass,
1266 safety_net_id: impl Into<String>,
1267 score: Option<f32>,
1268 kind: LeakKind,
1269 raw_label: impl Into<String>,
1270 field_path: Option<String>,
1271 ) -> Self {
1272 Self {
1273 span,
1274 class,
1275 safety_net_id: safety_net_id.into(),
1276 score,
1277 kind,
1278 raw_label: raw_label.into(),
1279 field_path,
1280 }
1281 }
1282}
1283
/// Classification produced by `Manifest::diff_against`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No manifest span overlaps the suspect.
    Uncovered,
    /// Manifest spans overlap the suspect but leave part of it uncovered.
    PartialBleed {
        /// The first uncovered sub-range of the suspect.
        uncovered: Range<usize>,
    },
    /// The suspect is fully covered, but a covering span has a different class.
    ClassMismatch {
        /// Class recorded in the manifest.
        pipeline_class: PiiClass,
        /// Class reported for the suspect.
        safety_net_class: PiiClass,
    },
}
1305
/// Telemetry events attached to a leak report.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// A safety net was skipped for locale reasons.
    /// NOTE(review): presumably emitted when no locale in the chain is
    /// supported by the safety net — confirm with the emitting code.
    LocaleSkipped {
        /// Identifier of the skipped safety net.
        safety_net_id: String,
        /// Kind of document being checked.
        document_kind: DocumentKind,
        /// Optional path of the field being checked.
        field_path: Option<String>,
    },
}
1320
/// Aggregate counters for a leak report; all zero by default.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Number of suspects.
    pub suspect_count: usize,
    /// Number of `Uncovered` suspects.
    pub uncovered_count: usize,
    /// Number of `PartialBleed` suspects.
    pub partial_bleed_count: usize,
    /// Number of `ClassMismatch` suspects.
    pub class_mismatch_count: usize,
    /// Number of locale-skipped events.
    pub locale_skipped_count: usize,
}
1336
/// Serializable document extension record.
///
/// Construct it through `DocumentExtension::builder`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
    /// Schema version of this record.
    pub schema_version: u16,
    /// 32-byte digest for the clean Markdown artifact.
    pub clean_md_sha256: [u8; 32],
    /// 32-byte digest for the layout JSON artifact.
    pub layout_json_sha256: [u8; 32],
    /// 32-byte digest for the report JSON artifact.
    pub report_json_sha256: [u8; 32],
    /// Optional 32-byte digest for the preview PNG; omitted from the serialized
    /// form when `None` and defaulted to `None` when absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub preview_png_sha256: Option<[u8; 32]>,
    /// Number of pages.
    pub page_count: u32,
    /// Identifier of the audit session.
    pub audit_session_id: String,
    /// Emitted clean-text spans; omitted from the serialized form when empty
    /// and defaulted to empty when absent.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean_spans: Vec<EmittedTokenSpan>,
    /// Codec audit rows; omitted from the serialized form when empty and
    /// defaulted to empty when absent.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub codec_audit: Vec<CodecAuditRow>,
}
1368
1369impl DocumentExtension {
1370 pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
1372 DocumentExtensionBuilder {
1373 schema_version,
1374 clean_md_sha256: None,
1375 layout_json_sha256: None,
1376 report_json_sha256: None,
1377 preview_png_sha256: None,
1378 page_count: None,
1379 audit_session_id: None,
1380 clean_spans: Vec::new(),
1381 codec_audit: Vec::new(),
1382 }
1383 }
1384}
1385
#[derive(Debug, Clone)]
/// Builder for `DocumentExtension`.
///
/// `build` fails with `DocumentExtensionError::MissingField` unless
/// `clean_md_sha256`, `layout_json_sha256`, `report_json_sha256`,
/// `page_count` and `audit_session_id` have all been set.
#[must_use]
pub struct DocumentExtensionBuilder {
    // Fixed at construction time by `DocumentExtension::builder`.
    schema_version: u16,
    // Required: `build` reports these as missing while they are `None`.
    clean_md_sha256: Option<[u8; 32]>,
    layout_json_sha256: Option<[u8; 32]>,
    report_json_sha256: Option<[u8; 32]>,
    // Optional: passed through to the built value unchanged.
    preview_png_sha256: Option<[u8; 32]>,
    // Required.
    page_count: Option<u32>,
    audit_session_id: Option<String>,
    // Optional lists; empty unless a setter replaces them.
    clean_spans: Vec<EmittedTokenSpan>,
    codec_audit: Vec<CodecAuditRow>,
}
1400
1401impl DocumentExtensionBuilder {
1402 pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
1403 self.clean_md_sha256 = Some(hash);
1404 self
1405 }
1406
1407 pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
1408 self.layout_json_sha256 = Some(hash);
1409 self
1410 }
1411
1412 pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
1413 self.report_json_sha256 = Some(hash);
1414 self
1415 }
1416
1417 pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
1418 self.preview_png_sha256 = Some(hash);
1419 self
1420 }
1421
1422 pub fn page_count(mut self, page_count: u32) -> Self {
1423 self.page_count = Some(page_count);
1424 self
1425 }
1426
1427 pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
1428 self.audit_session_id = Some(audit_session_id.into());
1429 self
1430 }
1431
1432 pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
1433 self.clean_spans = clean_spans;
1434 self
1435 }
1436
1437 pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
1438 self.codec_audit = codec_audit;
1439 self
1440 }
1441
1442 pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
1443 Ok(DocumentExtension {
1444 schema_version: self.schema_version,
1445 clean_md_sha256: self
1446 .clean_md_sha256
1447 .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
1448 layout_json_sha256: self
1449 .layout_json_sha256
1450 .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
1451 report_json_sha256: self
1452 .report_json_sha256
1453 .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
1454 preview_png_sha256: self.preview_png_sha256,
1455 page_count: self
1456 .page_count
1457 .ok_or(DocumentExtensionError::MissingField("page_count"))?,
1458 audit_session_id: self
1459 .audit_session_id
1460 .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
1461 clean_spans: self.clean_spans,
1462 codec_audit: self.codec_audit,
1463 })
1464 }
1465}
1466
#[derive(Debug, Clone, PartialEq, Eq, Error)]
/// Error returned by `DocumentExtensionBuilder::build`.
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required builder field was never set; the payload is the field name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
1474
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
/// Origin of a codec's text, recorded in `CodecAuditRow::text_origin`.
///
/// Serialized with snake_case variant names (`ocr`, `embedded_text`,
/// `transcript`, `hybrid`).
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Serialized as `ocr`.
    Ocr,
    /// Serialized as `embedded_text`.
    EmbeddedText,
    /// Serialized as `transcript`.
    Transcript,
    /// Serialized as `hybrid`.
    // NOTE(review): presumably a mix of the other origins — confirm.
    Hybrid,
}
1489
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
/// Set of four independent capability flags; the derived `Default` has every
/// flag cleared.
#[non_exhaustive]
pub struct CodecCapabilitySet {
    /// Text capability flag.
    pub text: bool,
    /// Layout capability flag.
    pub layout: bool,
    /// Confidence capability flag.
    pub confidence: bool,
    /// Timestamps capability flag.
    pub timestamps: bool,
}
1503
1504impl CodecCapabilitySet {
1505 pub const TEXT_ONLY: Self = Self {
1507 text: true,
1508 layout: false,
1509 confidence: false,
1510 timestamps: false,
1511 };
1512
1513 pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
1515 Self {
1516 text,
1517 layout,
1518 confidence,
1519 timestamps,
1520 }
1521 }
1522
1523 pub fn contains(self, requested: Self) -> bool {
1525 (!requested.text || self.text)
1526 && (!requested.layout || self.layout)
1527 && (!requested.confidence || self.confidence)
1528 && (!requested.timestamps || self.timestamps)
1529 }
1530}
1531
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
/// Extraction-density policy stored in `CodecAuditRow::extraction_density_policy`.
///
/// Serialized with snake_case variant names. The `Default` impl yields
/// `Exempt` with reason `"calibration_pending"`.
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
    /// Density is required; carries an `f32`.
    // NOTE(review): presumably the minimum density threshold — confirm units/range.
    Required(f32),
    /// Density is not enforced; `reason` records why.
    Exempt { reason: String },
}
1542
1543impl Default for ExtractionDensityPolicy {
1544 fn default() -> Self {
1545 Self::Exempt {
1546 reason: "calibration_pending".to_string(),
1547 }
1548 }
1549}
1550
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
/// One codec audit row, as stored in `DocumentExtension::codec_audit`.
#[non_exhaustive]
pub struct CodecAuditRow {
    /// Codec identifier.
    pub codec_id: String,
    /// Codec version string.
    pub codec_version: String,
    /// MIME type string recorded for the row.
    pub accepted_mime: String,
    /// Capabilities the codec advertised.
    pub advertised: CodecCapabilitySet,
    /// Capabilities the codec delivered.
    pub delivered: CodecCapabilitySet,
    /// Origin of the codec's text.
    pub text_origin: TextOrigin,
    /// Schema version of the codec output; `CodecAuditRow::new` sets it to `1`.
    pub codec_output_schema_version: u16,
    /// Optional options hash; omitted from serialized output when `None`.
    // NOTE(review): name suggests a hex-encoded digest — confirm encoding.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub options_hash_hex: Option<String>,
    /// Optional engine provenance; omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine_provenance: Option<String>,
    /// Extraction-density policy recorded for the row.
    pub extraction_density_policy: ExtractionDensityPolicy,
}
1578
1579impl CodecAuditRow {
1580 pub fn new(
1582 codec_id: impl Into<String>,
1583 codec_version: impl Into<String>,
1584 accepted_mime: impl Into<String>,
1585 text_origin: TextOrigin,
1586 ) -> Self {
1587 Self {
1588 codec_id: codec_id.into(),
1589 codec_version: codec_version.into(),
1590 accepted_mime: accepted_mime.into(),
1591 advertised: CodecCapabilitySet::default(),
1592 delivered: CodecCapabilitySet::default(),
1593 text_origin,
1594 codec_output_schema_version: 1,
1595 options_hash_hex: None,
1596 engine_provenance: None,
1597 extraction_density_policy: ExtractionDensityPolicy::default(),
1598 }
1599 }
1600}
1601
#[derive(Debug, Clone, Default, PartialEq)]
/// Collection of leak suspects and telemetry with derived counters.
///
/// `LeakReport::from_parts` computes `stats` from the other two lists and
/// leaves `replay_hash` as `None`.
#[non_exhaustive]
pub struct LeakReport {
    /// Suspects in the report.
    pub suspects: Vec<LeakSuspect>,
    /// Telemetry events in the report.
    pub telemetry: Vec<LeakReportTelemetry>,
    /// Counters derived from `suspects` and `telemetry`.
    pub stats: LeakReportStats,
    /// Optional replay hash; `from_parts` and `extend` both leave it `None`.
    // NOTE(review): who populates this and over which inputs is not visible here — confirm.
    pub replay_hash: Option<String>,
}
1622
1623impl LeakReport {
1624 pub fn from_parts(
1626 suspects: Vec<LeakSuspect>,
1627 telemetry: Vec<LeakReportTelemetry>,
1628 ) -> LeakReport {
1629 let mut stats = LeakReportStats {
1630 suspect_count: suspects.len(),
1631 locale_skipped_count: telemetry
1632 .iter()
1633 .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
1634 .count(),
1635 ..LeakReportStats::default()
1636 };
1637 for suspect in &suspects {
1638 match suspect.kind {
1639 LeakKind::Uncovered => stats.uncovered_count += 1,
1640 LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
1641 LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
1642 }
1643 }
1644 LeakReport {
1645 suspects,
1646 telemetry,
1647 stats,
1648 replay_hash: None,
1649 }
1650 }
1651
1652 pub fn extend(&mut self, other: LeakReport) {
1654 self.suspects.extend(other.suspects);
1655 self.telemetry.extend(other.telemetry);
1656 *self = LeakReport::from_parts(
1657 std::mem::take(&mut self.suspects),
1658 std::mem::take(&mut self.telemetry),
1659 );
1660 }
1661}
1662
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
/// Label set whose string forms are given by `OpenAiPrivateLabel::as_str`.
///
/// The derived ordering follows declaration order.
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// `"private_person"`.
    PrivatePerson,
    /// `"private_address"`.
    PrivateAddress,
    /// `"private_email"`.
    PrivateEmail,
    /// `"private_phone"`.
    PrivatePhone,
    /// `"private_url"`.
    PrivateUrl,
    /// `"private_date"`.
    PrivateDate,
    /// `"account_number"`.
    AccountNumber,
    /// `"secret"`.
    Secret,
}
1684
1685impl OpenAiPrivateLabel {
1686 pub fn as_str(self) -> &'static str {
1688 match self {
1689 Self::PrivatePerson => "private_person",
1690 Self::PrivateAddress => "private_address",
1691 Self::PrivateEmail => "private_email",
1692 Self::PrivatePhone => "private_phone",
1693 Self::PrivateUrl => "private_url",
1694 Self::PrivateDate => "private_date",
1695 Self::AccountNumber => "account_number",
1696 Self::Secret => "secret",
1697 }
1698 }
1699}
1700
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
/// Safety-net class set; convert to `PiiClass` with
/// `SafetyNetPiiClass::to_pii_class`.
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Maps to `PiiClass::Email`.
    Email,
    /// Maps to `PiiClass::Name`.
    Name,
    /// Maps to `PiiClass::Location`.
    Location,
    /// Maps to `PiiClass::custom("phone")`.
    Phone,
    /// Maps to `PiiClass::custom("url")`.
    Url,
    /// Maps to `PiiClass::custom("date")`.
    Date,
    /// Maps to `PiiClass::custom("account_number")`.
    AccountNumber,
    /// Maps to `PiiClass::custom("secret")`.
    Secret,
}
1722
1723impl SafetyNetPiiClass {
1724 pub fn to_pii_class(self) -> PiiClass {
1726 match self {
1727 Self::Email => PiiClass::Email,
1728 Self::Name => PiiClass::Name,
1729 Self::Location => PiiClass::Location,
1730 Self::Phone => PiiClass::custom("phone"),
1731 Self::Url => PiiClass::custom("url"),
1732 Self::Date => PiiClass::custom("date"),
1733 Self::AccountNumber => PiiClass::custom("account_number"),
1734 Self::Secret => PiiClass::custom("secret"),
1735 }
1736 }
1737}
1738
#[derive(Debug, Clone, PartialEq, Eq, Error)]
/// Errors reported by a safety net; each variant's `Display` text is given by
/// its `#[error]` attribute.
#[non_exhaustive]
pub enum SafetyNetError {
    /// The safety net is unavailable.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Human-readable reason.
        reason: String,
    },
    /// Weights are missing.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Path reported in the message.
        path: String,
    },
    /// The model is unavailable.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Human-readable reason.
        reason: String,
    },
    /// Model integrity check produced a mismatch.
    #[error("safety net model integrity mismatch: expected={expected}, actual={actual}")]
    ModelIntegrityMismatch {
        /// Expected value, as text.
        expected: String,
        /// Actual value, as text.
        actual: String,
    },
    /// Input exceeded a size limit.
    // NOTE(review): unit of `limit`/`actual` (bytes vs. chars) is not visible here — confirm.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Maximum accepted size.
        limit: usize,
        /// Size that was supplied.
        actual: usize,
    },
    /// Runtime failure.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Failure description.
        message: String,
    },
    /// The safety net produced invalid output.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// Description of what was invalid.
        message: String,
    },
}
1790
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// Action recorded on a `RedactionEntry`.
///
/// `redaction_action_as_str` gives the string used when an entry is
/// serialized.
#[non_exhaustive]
pub enum Action {
    /// Serialized as `"tokenize"`.
    Tokenize,
    /// Serialized as `"redact"`.
    Redact,
    /// Serialized as `"format_preserve"`.
    FormatPreserve,
    /// Serialized as `"generalize"`.
    Generalize,
    /// Serialized as `"preserve"`.
    Preserve,
}
1818
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// Tier stored in `RedactionEntry::decided_by`.
///
/// `redaction_conflict_tier_as_str` gives the string used when an entry is
/// serialized.
// NOTE(review): the precise meaning of each tier is defined by the resolver, which is not visible here.
#[non_exhaustive]
pub enum ConflictTier {
    /// Serialized as `"none"`.
    None,
    /// Serialized as `"class_priority"`.
    ClassPriority,
    /// Serialized as `"rule_priority"`.
    RulePriority,
    /// Serialized as `"score"`.
    Score,
    /// Serialized as `"span_length"`.
    SpanLength,
    /// Serialized as `"validator"`.
    Validator,
    /// Serialized as `"validator_veto"`.
    ValidatorVeto,
    /// Serialized as `"collision_policy"`.
    CollisionPolicy,
    /// Serialized as `"anchored_context"`.
    AnchoredContext,
    /// Serialized as `"recognizer_id"`.
    RecognizerId,
    /// Serialized as `"merged"`.
    Merged,
    /// Serialized as `"redact"`.
    Redact,
    /// Serialized as `"resolve"`.
    Resolve,
    /// Serialized as `"fallback"`.
    Fallback,
}
1852
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
/// Reason stored in `RedactionEntry::fallback_triggered`.
#[non_exhaustive]
pub enum FallbackReason {
    /// Fallback attributed to an overlap conflict.
    OverlapConflict,
    /// Fallback attributed to a validator veto.
    ValidatorVeto,
    /// Fallback attributed to a missing anchor.
    AnchorMissing,
    /// Fallback attributed to a residual suspect.
    ResidualSuspect,
}
1866
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// Kind of document an entry or telemetry event refers to.
///
/// `redaction_document_kind_as_str` gives the string used when a
/// `RedactionEntry` is serialized.
#[non_exhaustive]
pub enum DocumentKind {
    /// Serialized as `"structured"`.
    Structured,
    /// Serialized as `"text"`.
    Text,
}
1876
#[derive(Debug, Clone, PartialEq, Eq)]
/// One redaction log record.
///
/// Created with `RedactionEntry::new` and enriched through the `with_*`
/// methods. The hand-written `Serialize` impl always writes the core fields
/// and skips the recognizer, provenance, backend and restore fields when they
/// are `None`.
#[non_exhaustive]
pub struct RedactionEntry {
    /// Source string supplied to `RedactionEntry::new`.
    pub source: String,
    /// Recognizer identifier; omitted from serialized output when `None`.
    pub recognizer_id: Option<String>,
    /// Recognizer version identifier; omitted from serialized output when `None`.
    pub recognizer_version_id: Option<String>,
    /// PII class; serialized through `PiiClass::to_canonical_str`.
    pub class: PiiClass,
    /// Action taken; serialized through `redaction_action_as_str`.
    pub action: Action,
    /// Optional field name; always serialized (as null when `None`).
    pub field_name: Option<String>,
    /// Document kind; serialized through `redaction_document_kind_as_str`.
    pub document_kind: DocumentKind,
    /// Conflict-loser flag.
    pub conflict_loser: bool,
    /// Deciding tier; serialized through `redaction_conflict_tier_as_str`.
    pub decided_by: ConflictTier,
    /// Creation timestamp.
    // NOTE(review): epoch and unit (seconds vs. milliseconds) are not visible here — confirm.
    pub created_at: i64,
    /// Optional session identifier; always serialized.
    pub session_id: Option<String>,
    /// Optional validator failure reason; always serialized.
    pub validator_fail_reason: Option<ValidatorFailReason>,
    /// Optional ambiguity record; always serialized.
    pub ambiguity_record: Option<AmbiguityRecord>,
    /// Optional collision family; always serialized.
    pub collision_family: Option<String>,
    /// Optional collision variant; always serialized.
    pub collision_variant: Option<String>,
    /// Optional fallback reason; always serialized.
    pub fallback_triggered: Option<FallbackReason>,
    // Provenance fields: set together by `with_provenance_metadata`; each is
    // omitted from serialized output when `None`.
    /// Provenance stage.
    pub provenance_stage: Option<String>,
    pub provenance_model_id: Option<String>,
    pub provenance_model_version: Option<String>,
    pub provenance_artifact_sha256: Option<String>,
    pub provenance_tokenizer_sha256: Option<String>,
    pub provenance_locale_resolved: Option<String>,
    pub provenance_locale_match_kind: Option<String>,
    pub provenance_canonical_class: Option<String>,
    pub provenance_native_class: Option<String>,
    // Stored as the `to_string()` rendering of the `f64` passed to
    // `with_provenance_metadata`.
    pub provenance_confidence: Option<String>,
    pub provenance_merged_from: Option<String>,
    /// Optional list set by `with_backend_silently_dropped`; omitted from
    /// serialized output when `None`.
    pub backend_silently_dropped: Option<Vec<String>>,
    // Restore fields: set together by `with_restore_telemetry`; each is
    // omitted from serialized output when `None`.
    /// Restore policy string.
    pub restore_policy: Option<String>,
    pub restore_decision: Option<String>,
    pub restore_unknown_token_count: Option<u64>,
    pub restore_manifest_bypass_count: Option<u64>,
    pub restore_fresh_pii_count: Option<u64>,
    pub restore_phase_mask: Option<u32>,
}
1942
1943impl Serialize for RedactionEntry {
1944 fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
1945 where
1946 S: serde::Serializer,
1947 {
1948 use serde::ser::SerializeStruct;
1949
1950 let mut len = 14;
1951 if self.recognizer_id.is_some() {
1952 len += 1;
1953 }
1954 if self.recognizer_version_id.is_some() {
1955 len += 1;
1956 }
1957 len += [
1958 self.provenance_stage.as_ref(),
1959 self.provenance_model_id.as_ref(),
1960 self.provenance_model_version.as_ref(),
1961 self.provenance_artifact_sha256.as_ref(),
1962 self.provenance_tokenizer_sha256.as_ref(),
1963 self.provenance_locale_resolved.as_ref(),
1964 self.provenance_locale_match_kind.as_ref(),
1965 self.provenance_canonical_class.as_ref(),
1966 self.provenance_native_class.as_ref(),
1967 self.provenance_confidence.as_ref(),
1968 self.provenance_merged_from.as_ref(),
1969 ]
1970 .into_iter()
1971 .filter(|value| value.is_some())
1972 .count();
1973 if self.backend_silently_dropped.is_some() {
1974 len += 1;
1975 }
1976 len += [self.restore_policy.as_ref(), self.restore_decision.as_ref()]
1977 .into_iter()
1978 .filter(|value| value.is_some())
1979 .count();
1980 len += [
1981 self.restore_unknown_token_count.is_some(),
1982 self.restore_manifest_bypass_count.is_some(),
1983 self.restore_fresh_pii_count.is_some(),
1984 self.restore_phase_mask.is_some(),
1985 ]
1986 .into_iter()
1987 .filter(|value| *value)
1988 .count();
1989 let mut state = serializer.serialize_struct("RedactionEntry", len)?;
1990 state.serialize_field("source", &self.source)?;
1991 if let Some(recognizer_id) = &self.recognizer_id {
1992 state.serialize_field("recognizer_id", recognizer_id)?;
1993 }
1994 if let Some(recognizer_version_id) = &self.recognizer_version_id {
1995 state.serialize_field("recognizer_version_id", recognizer_version_id)?;
1996 }
1997 state.serialize_field("class", &self.class.to_canonical_str())?;
1998 state.serialize_field("action", redaction_action_as_str(self.action))?;
1999 state.serialize_field("field_name", &self.field_name)?;
2000 state.serialize_field(
2001 "document_kind",
2002 redaction_document_kind_as_str(self.document_kind),
2003 )?;
2004 state.serialize_field("conflict_loser", &self.conflict_loser)?;
2005 state.serialize_field(
2006 "decided_by",
2007 redaction_conflict_tier_as_str(self.decided_by),
2008 )?;
2009 state.serialize_field("created_at", &self.created_at)?;
2010 state.serialize_field("session_id", &self.session_id)?;
2011 state.serialize_field("validator_fail_reason", &self.validator_fail_reason)?;
2012 state.serialize_field("ambiguity_record", &self.ambiguity_record)?;
2013 state.serialize_field("collision_family", &self.collision_family)?;
2014 state.serialize_field("collision_variant", &self.collision_variant)?;
2015 state.serialize_field("fallback_triggered", &self.fallback_triggered)?;
2016 if let Some(value) = &self.provenance_stage {
2017 state.serialize_field("provenance_stage", value)?;
2018 }
2019 if let Some(value) = &self.provenance_model_id {
2020 state.serialize_field("provenance_model_id", value)?;
2021 }
2022 if let Some(value) = &self.provenance_model_version {
2023 state.serialize_field("provenance_model_version", value)?;
2024 }
2025 if let Some(value) = &self.provenance_artifact_sha256 {
2026 state.serialize_field("provenance_artifact_sha256", value)?;
2027 }
2028 if let Some(value) = &self.provenance_tokenizer_sha256 {
2029 state.serialize_field("provenance_tokenizer_sha256", value)?;
2030 }
2031 if let Some(value) = &self.provenance_locale_resolved {
2032 state.serialize_field("provenance_locale_resolved", value)?;
2033 }
2034 if let Some(value) = &self.provenance_locale_match_kind {
2035 state.serialize_field("provenance_locale_match_kind", value)?;
2036 }
2037 if let Some(value) = &self.provenance_canonical_class {
2038 state.serialize_field("provenance_canonical_class", value)?;
2039 }
2040 if let Some(value) = &self.provenance_native_class {
2041 state.serialize_field("provenance_native_class", value)?;
2042 }
2043 if let Some(value) = &self.provenance_confidence {
2044 state.serialize_field("provenance_confidence", value)?;
2045 }
2046 if let Some(value) = &self.provenance_merged_from {
2047 state.serialize_field("provenance_merged_from", value)?;
2048 }
2049 if let Some(dropped) = &self.backend_silently_dropped {
2050 state.serialize_field("backend_silently_dropped", dropped)?;
2051 }
2052 if let Some(value) = &self.restore_policy {
2053 state.serialize_field("restore_policy", value)?;
2054 }
2055 if let Some(value) = &self.restore_decision {
2056 state.serialize_field("restore_decision", value)?;
2057 }
2058 if let Some(value) = self.restore_unknown_token_count {
2059 state.serialize_field("restore_unknown_token_count", &value)?;
2060 }
2061 if let Some(value) = self.restore_manifest_bypass_count {
2062 state.serialize_field("restore_manifest_bypass_count", &value)?;
2063 }
2064 if let Some(value) = self.restore_fresh_pii_count {
2065 state.serialize_field("restore_fresh_pii_count", &value)?;
2066 }
2067 if let Some(value) = self.restore_phase_mask {
2068 state.serialize_field("restore_phase_mask", &value)?;
2069 }
2070 state.end()
2071 }
2072}
2073
2074fn redaction_action_as_str(action: Action) -> &'static str {
2075 match action {
2076 Action::Tokenize => "tokenize",
2077 Action::Redact => "redact",
2078 Action::FormatPreserve => "format_preserve",
2079 Action::Generalize => "generalize",
2080 Action::Preserve => "preserve",
2081 }
2082}
2083
2084fn redaction_document_kind_as_str(kind: DocumentKind) -> &'static str {
2085 match kind {
2086 DocumentKind::Structured => "structured",
2087 DocumentKind::Text => "text",
2088 }
2089}
2090
2091fn redaction_conflict_tier_as_str(tier: ConflictTier) -> &'static str {
2092 match tier {
2093 ConflictTier::None => "none",
2094 ConflictTier::ClassPriority => "class_priority",
2095 ConflictTier::RulePriority => "rule_priority",
2096 ConflictTier::Score => "score",
2097 ConflictTier::SpanLength => "span_length",
2098 ConflictTier::Validator => "validator",
2099 ConflictTier::ValidatorVeto => "validator_veto",
2100 ConflictTier::CollisionPolicy => "collision_policy",
2101 ConflictTier::AnchoredContext => "anchored_context",
2102 ConflictTier::RecognizerId => "recognizer_id",
2103 ConflictTier::Merged => "merged",
2104 ConflictTier::Redact => "redact",
2105 ConflictTier::Resolve => "resolve",
2106 ConflictTier::Fallback => "fallback",
2107 }
2108}
2109
2110impl RedactionEntry {
2111 #[allow(clippy::too_many_arguments)]
2113 pub fn new(
2114 source: impl Into<String>,
2115 class: PiiClass,
2116 action: Action,
2117 field_name: Option<String>,
2118 document_kind: DocumentKind,
2119 conflict_loser: bool,
2120 decided_by: ConflictTier,
2121 created_at: i64,
2122 session_id: Option<String>,
2123 ) -> Self {
2124 Self {
2125 source: source.into(),
2126 class,
2127 action,
2128 field_name,
2129 document_kind,
2130 conflict_loser,
2131 decided_by,
2132 created_at,
2133 session_id,
2134 recognizer_id: None,
2135 recognizer_version_id: None,
2136 validator_fail_reason: None,
2137 ambiguity_record: None,
2138 collision_family: None,
2139 collision_variant: None,
2140 fallback_triggered: None,
2141 provenance_stage: None,
2142 provenance_model_id: None,
2143 provenance_model_version: None,
2144 provenance_artifact_sha256: None,
2145 provenance_tokenizer_sha256: None,
2146 provenance_locale_resolved: None,
2147 provenance_locale_match_kind: None,
2148 provenance_canonical_class: None,
2149 provenance_native_class: None,
2150 provenance_confidence: None,
2151 provenance_merged_from: None,
2152 backend_silently_dropped: None,
2153 restore_policy: None,
2154 restore_decision: None,
2155 restore_unknown_token_count: None,
2156 restore_manifest_bypass_count: None,
2157 restore_fresh_pii_count: None,
2158 restore_phase_mask: None,
2159 }
2160 }
2161
2162 pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
2164 self.validator_fail_reason = Some(reason);
2165 self
2166 }
2167
2168 pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
2170 self.ambiguity_record = Some(record);
2171 self
2172 }
2173
2174 pub fn with_collision_metadata(
2176 mut self,
2177 family: Option<String>,
2178 variant: Option<String>,
2179 ) -> Self {
2180 self.collision_family = family;
2181 self.collision_variant = variant;
2182 self
2183 }
2184
2185 pub fn with_fallback_triggered(mut self, reason: FallbackReason) -> Self {
2187 self.fallback_triggered = Some(reason);
2188 self
2189 }
2190
2191 pub fn with_backend_silently_dropped(mut self, dropped: Vec<String>) -> Self {
2193 self.backend_silently_dropped = Some(dropped);
2194 self
2195 }
2196
2197 pub fn with_restore_telemetry(mut self, telemetry: RestoreTelemetry) -> Self {
2198 self.restore_policy = Some(telemetry.restore_policy_str().to_string());
2199 self.restore_decision = Some(telemetry.restore_decision_str().to_string());
2200 self.restore_unknown_token_count = Some(telemetry.unknown_token_count);
2201 self.restore_manifest_bypass_count = Some(telemetry.manifest_bypass_count);
2202 self.restore_fresh_pii_count = Some(telemetry.fresh_pii_detected_count);
2203 self.restore_phase_mask = Some(telemetry.phase_execution_mask);
2204 self
2205 }
2206
2207 pub fn with_recognizer_metadata(
2209 mut self,
2210 recognizer_id: Option<String>,
2211 recognizer_version_id: Option<String>,
2212 ) -> Self {
2213 self.recognizer_id = recognizer_id;
2214 self.recognizer_version_id = recognizer_version_id;
2215 self
2216 }
2217
2218 #[allow(clippy::too_many_arguments)]
2219 pub fn with_provenance_metadata(
2220 mut self,
2221 stage: Option<String>,
2222 model_id: Option<String>,
2223 model_version: Option<String>,
2224 artifact_sha256: Option<String>,
2225 tokenizer_sha256: Option<String>,
2226 locale_resolved: Option<String>,
2227 locale_match_kind: Option<String>,
2228 canonical_class: Option<String>,
2229 native_class: Option<String>,
2230 confidence: Option<f64>,
2231 merged_from: Option<String>,
2232 ) -> Self {
2233 self.provenance_stage = stage;
2234 self.provenance_model_id = model_id;
2235 self.provenance_model_version = model_version;
2236 self.provenance_artifact_sha256 = artifact_sha256;
2237 self.provenance_tokenizer_sha256 = tokenizer_sha256;
2238 self.provenance_locale_resolved = locale_resolved;
2239 self.provenance_locale_match_kind = locale_match_kind;
2240 self.provenance_canonical_class = canonical_class;
2241 self.provenance_native_class = native_class;
2242 self.provenance_confidence = confidence.map(|value| value.to_string());
2243 self.provenance_merged_from = merged_from;
2244 self
2245 }
2246}
2247
#[derive(Debug, Clone, PartialEq, Eq, Error)]
/// Error returned by `RedactionLogger::log`; the payload is the error text.
#[non_exhaustive]
pub enum RedactionLogError {
    /// Failure reported as a SQLite redaction log error.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Failure reported as a backend redaction log error.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
2259
/// Sink for `RedactionEntry` records. Implementors must be `Send + Sync`.
pub trait RedactionLogger: Send + Sync {
    /// Records one entry.
    ///
    /// # Errors
    ///
    /// Returns a `RedactionLogError` when the entry could not be recorded.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
2293
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
/// Safety tier; string forms are defined by `SafetyTier::parse` and
/// `SafetyTier::as_str`.
#[non_exhaustive]
pub enum SafetyTier {
    /// `"safe_default"`; this is the `Default` value.
    #[default]
    SafeDefault,
    /// `"locale_gated"`.
    LocaleGated,
    /// `"opt_in"`.
    OptIn,
}
2306
#[derive(Debug, Clone, PartialEq, Eq)]
/// Error returned by `SafetyTier::parse` for an unrecognized string.
#[non_exhaustive]
pub struct SafetyTierParseError {
    // The rejected input, exactly as it was passed to `SafetyTier::parse`.
    value: String,
}
2313
2314impl SafetyTier {
2315 pub fn parse(value: &str) -> Result<Self, SafetyTierParseError> {
2317 match value {
2318 "safe_default" => Ok(Self::SafeDefault),
2319 "locale_gated" => Ok(Self::LocaleGated),
2320 "opt_in" => Ok(Self::OptIn),
2321 other => Err(SafetyTierParseError {
2322 value: other.to_string(),
2323 }),
2324 }
2325 }
2326
2327 pub fn as_str(self) -> &'static str {
2329 match self {
2330 Self::SafeDefault => "safe_default",
2331 Self::LocaleGated => "locale_gated",
2332 Self::OptIn => "opt_in",
2333 }
2334 }
2335}
2336
2337impl SafetyTierParseError {
2338 pub fn value(&self) -> &str {
2340 &self.value
2341 }
2342}
2343
2344impl fmt::Display for SafetyTierParseError {
2345 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2346 write!(f, "unsupported safety_tier '{}'", self.value)
2347 }
2348}
2349
// Marker impl with no overridden methods, so the trait defaults apply
// (for example, `source()` returns `None`).
impl std::error::Error for SafetyTierParseError {}
2351
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
/// Locale tag; `LocaleTag::parse` and `LocaleTag::as_str` define the string
/// forms.
#[non_exhaustive]
pub enum LocaleTag {
    /// `"global"` (also parsed from `"*"`).
    Global,
    /// `"de-DE"`.
    DeDe,
    /// `"de-AT"`.
    DeAt,
    /// `"de-CH"`.
    DeCh,
    /// `"en-US"`.
    EnUs,
    /// `"en-GB"`.
    EnGb,
    /// `"en-IE"`.
    EnIe,
    /// `"en-AU"`.
    EnAu,
    /// `"en-CA"`.
    EnCa,
    /// Any other tag accepted by `LocaleTag::parse`; holds the string returned
    /// by `canonical_other`.
    Other(String),
}
2377
#[derive(Debug, Clone, PartialEq, Eq)]
/// Error returned when a locale string cannot be turned into a `LocaleTag`.
#[non_exhaustive]
pub enum LocaleError {
    /// The input was empty or not an accepted locale tag.
    Unsupported,
}
2385
2386impl fmt::Display for LocaleError {
2387 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2388 match self {
2389 LocaleError::Unsupported => f.write_str("unsupported locale"),
2390 }
2391 }
2392}
2393
// Marker impl with no overridden methods, so the trait defaults apply
// (for example, `source()` returns `None`).
impl std::error::Error for LocaleError {}
2395
#[derive(Debug, Clone, PartialEq, Eq)]
/// Ordered list of `LocaleTag`s.
///
/// Every constructor in this file passes the tags through `ensure_global`
/// before wrapping them.
// NOTE(review): `ensure_global` presumably guarantees a `Global` entry in the list — confirm.
pub struct LocaleChain(Vec<LocaleTag>);
2399
2400impl LocaleTag {
2401 pub const GLOBAL: LocaleTag = LocaleTag::Global;
2403
2404 pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
2406 let raw = s.trim().replace('_', "-");
2407 let normalized = raw.to_ascii_lowercase();
2408 match normalized.as_str() {
2409 "global" | "*" => Ok(LocaleTag::Global),
2410 "de-de" => Ok(LocaleTag::DeDe),
2411 "de-at" => Ok(LocaleTag::DeAt),
2412 "de-ch" => Ok(LocaleTag::DeCh),
2413 "en-us" => Ok(LocaleTag::EnUs),
2414 "en-gb" => Ok(LocaleTag::EnGb),
2415 "en-ie" => Ok(LocaleTag::EnIe),
2416 "en-au" => Ok(LocaleTag::EnAu),
2417 "en-ca" => Ok(LocaleTag::EnCa),
2418 "" => Err(LocaleError::Unsupported),
2419 _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
2420 _ => Err(LocaleError::Unsupported),
2421 }
2422 }
2423
2424 pub fn as_str(&self) -> &str {
2426 match self {
2427 LocaleTag::Global => "global",
2428 LocaleTag::DeDe => "de-DE",
2429 LocaleTag::DeAt => "de-AT",
2430 LocaleTag::DeCh => "de-CH",
2431 LocaleTag::EnUs => "en-US",
2432 LocaleTag::EnGb => "en-GB",
2433 LocaleTag::EnIe => "en-IE",
2434 LocaleTag::EnAu => "en-AU",
2435 LocaleTag::EnCa => "en-CA",
2436 LocaleTag::Other(tag) => tag.as_str(),
2437 }
2438 }
2439}
2440
2441impl LocaleChain {
2442 pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
2444 ensure_global(&mut tags);
2445 LocaleChain(tags)
2446 }
2447
2448 pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
2450 let tags = raw
2451 .split(',')
2452 .map(LocaleTag::parse)
2453 .collect::<Result<Vec<_>, _>>()?;
2454 Ok(LocaleChain::from_tags(tags))
2455 }
2456
2457 pub fn merge_policy_and_cli(
2459 policy: Option<&[LocaleTag]>,
2460 cli: Option<&[LocaleTag]>,
2461 ) -> LocaleChain {
2462 Self::merge_cli_policy_rulepack_default(cli, policy, None)
2463 }
2464
2465 pub fn merge_cli_policy_rulepack_default(
2467 cli: Option<&[LocaleTag]>,
2468 policy: Option<&[LocaleTag]>,
2469 rulepack_defaults: Option<&[LocaleTag]>,
2470 ) -> LocaleChain {
2471 let tags = cli
2472 .filter(|tags| !tags.is_empty())
2473 .or_else(|| policy.filter(|tags| !tags.is_empty()))
2474 .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
2475 .map(|tags| tags.to_vec())
2476 .unwrap_or_else(|| vec![LocaleTag::Global]);
2477 LocaleChain::from_tags(tags)
2478 }
2479
2480 pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
2482 if recognizer_locales.is_empty() {
2483 return true;
2484 }
2485 recognizer_locales.iter().any(|recognizer_locale| {
2486 *recognizer_locale == LocaleTag::Global
2487 || self.0.iter().any(|active| active == recognizer_locale)
2488 })
2489 }
2490
2491 pub fn as_slice(&self) -> &[LocaleTag] {
2493 &self.0
2494 }
2495
2496 pub fn to_strings(&self) -> Vec<String> {
2498 self.0.iter().map(ToString::to_string).collect()
2499 }
2500}
2501
2502impl From<&[LocaleTag]> for LocaleChain {
2503 fn from(tags: &[LocaleTag]) -> Self {
2504 let mut owned = tags.to_vec();
2505 ensure_global(&mut owned);
2506 LocaleChain(owned)
2507 }
2508}
2509
2510impl fmt::Display for LocaleTag {
2511 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2512 f.write_str(self.as_str())
2513 }
2514}
2515
#[derive(Debug, Clone)]
/// Input document, either a structured map or plain text.
#[non_exhaustive]
pub enum RawDocument {
    /// Map from string keys to `Value`s, ordered by key (`BTreeMap`).
    Structured(BTreeMap<String, Value>),
    /// Plain text.
    Text(String),
}
2533
#[derive(Debug, Clone, Serialize)]
/// Output document with the same two shapes as `RawDocument`.
///
/// Serialized untagged: the payload is written directly with no variant
/// wrapper.
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Map from string keys to `Value`s, ordered by key (`BTreeMap`).
    Structured(BTreeMap<String, Value>),
    /// Plain text.
    Text(String),
}
2561
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
/// Value stored inside structured documents.
///
/// Serialized untagged: the payload is written directly with no variant
/// wrapper. There is no floating-point variant.
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Null value.
    Null,
    /// Boolean.
    Bool(bool),
    /// String.
    String(String),
    /// Signed 64-bit integer.
    I64(i64),
    /// Ordered list of values.
    Array(Vec<Value>),
    /// Nested map, ordered by key (`BTreeMap`).
    Object(BTreeMap<String, Value>),
}
2580
2581impl Value {
2582 pub fn as_str(&self) -> Option<&str> {
2584 match self {
2585 Self::String(value) => Some(value.as_str()),
2586 Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
2587 }
2588 }
2589
2590 pub fn scalar_to_safety_net_string(&self) -> Option<String> {
2592 match self {
2593 Self::String(value) if !value.is_empty() => Some(value.clone()),
2594 Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
2595 Self::Bool(value) => Some(value.to_string()),
2596 Self::I64(value) => Some(value.to_string()),
2597 }
2598 }
2599}
2600
2601impl PartialEq<&str> for Value {
2602 fn eq(&self, other: &&str) -> bool {
2603 self.as_str() == Some(*other)
2604 }
2605}
2606
#[derive(Debug, Clone, Default)]
/// Collection of dictionaries, looked up by name.
pub struct DictionaryBundle {
    // Dictionary name -> entry; names are unique because this is a map.
    entries: HashMap<String, DictionaryEntry>,
}
2612
#[derive(Debug, Clone)]
/// One validated dictionary; construct it with `DictionaryEntry::new`.
pub struct DictionaryEntry {
    // Never empty: `DictionaryEntry::new` rejects an empty list.
    terms: Vec<String>,
    // When `false`, `DictionaryEntry::new` has verified every term is ASCII.
    case_sensitive: bool,
    // Where the dictionary came from.
    source: DictionarySource,
}
2620
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// Origin of a dictionary.
#[non_exhaustive]
pub enum DictionarySource {
    /// Dictionary attributed to the CLI.
    Cli,
    /// Dictionary attributed to a rulepack; used by
    /// `DictionaryBundle::from_rulepack_terms`.
    Rulepack,
}
2630
#[derive(Debug, Clone, PartialEq, Eq)]
/// Summary of one dictionary, as produced by `DictionaryBundle::stats`.
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of terms in the dictionary.
    pub term_count: usize,
    /// Origin of the dictionary.
    pub source: DictionarySource,
}
2642
2643impl DictionaryStats {
2644 pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
2646 Self {
2647 name: name.into(),
2648 term_count,
2649 source,
2650 }
2651 }
2652}
2653
#[derive(Debug, Clone, PartialEq, Eq)]
/// Dictionary definition consumed by `DictionaryBundle::from_rulepack_terms`.
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name; used as the bundle key.
    pub name: String,
    /// Dictionary terms.
    pub terms: Vec<String>,
    /// Whether matching is case-sensitive.
    pub case_sensitive: bool,
}
2665
2666impl RulepackDict {
2667 pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
2669 Self {
2670 name: name.into(),
2671 terms,
2672 case_sensitive,
2673 }
2674 }
2675}
2676
#[derive(Debug, Clone, PartialEq, Eq)]
/// Error returned by `DictionaryEntry::new`; `name` is the dictionary name.
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary has no terms.
    Empty { name: String },
    /// The dictionary is case-insensitive but contains a non-ASCII term.
    UnicodeInsensitiveUnsupported { name: String },
}
2686
2687impl fmt::Display for DictionaryLoadError {
2688 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2689 match self {
2690 Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
2691 Self::UnicodeInsensitiveUnsupported { name } => write!(
2692 f,
2693 "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
2694 ),
2695 }
2696 }
2697}
2698
// Marker impl with no overridden methods, so the trait defaults apply
// (for example, `source()` returns `None`).
impl std::error::Error for DictionaryLoadError {}
2700
2701impl DictionaryBundle {
2702 pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
2704 let mut entries = HashMap::with_capacity(terms.len());
2705 for dictionary in terms {
2706 let entry = DictionaryEntry::new(
2707 &dictionary.name,
2708 dictionary.terms.clone(),
2709 dictionary.case_sensitive,
2710 DictionarySource::Rulepack,
2711 )
2712 .expect("Policy validates dictionary terms before bundle construction");
2713 entries.insert(dictionary.name.clone(), entry);
2714 }
2715 Self { entries }
2716 }
2717
2718 pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
2720 Self {
2721 entries: entries.into_iter().collect(),
2722 }
2723 }
2724
2725 pub fn merge(a: Self, b: Self) -> Self {
2727 let mut entries = a.entries;
2728 entries.extend(b.entries);
2729 Self { entries }
2730 }
2731
2732 pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
2734 self.entries.get(name)
2735 }
2736
2737 pub fn stats(&self) -> Vec<DictionaryStats> {
2739 let mut stats = self
2740 .entries
2741 .iter()
2742 .map(|(name, entry)| DictionaryStats {
2743 name: name.clone(),
2744 term_count: entry.terms.len(),
2745 source: entry.source,
2746 })
2747 .collect::<Vec<_>>();
2748 stats.sort_by(|a, b| a.name.cmp(&b.name));
2749 stats
2750 }
2751}
2752
2753impl DictionaryEntry {
2754 pub fn new(
2756 name: &str,
2757 terms: Vec<String>,
2758 case_sensitive: bool,
2759 source: DictionarySource,
2760 ) -> Result<Self, DictionaryLoadError> {
2761 if terms.is_empty() {
2762 return Err(DictionaryLoadError::Empty {
2763 name: name.to_string(),
2764 });
2765 }
2766 if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
2767 return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
2768 name: name.to_string(),
2769 });
2770 }
2771 Ok(Self {
2772 terms,
2773 case_sensitive,
2774 source,
2775 })
2776 }
2777
2778 pub fn case_sensitive(&self) -> bool {
2780 self.case_sensitive
2781 }
2782
2783 pub fn terms(&self) -> &[String] {
2785 &self.terms
2786 }
2787}
2788
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    /// An empty term list must be rejected with `Empty`, carrying the name.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let outcome = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli);
        let err = outcome.expect_err("empty dictionaries must fail closed");

        assert_eq!(
            err,
            DictionaryLoadError::Empty {
                name: "empty".to_string(),
            }
        );
    }

    /// A non-ASCII term combined with case-insensitive matching must be
    /// rejected with `UnicodeInsensitiveUnsupported`, carrying the name.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let outcome = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli);
        let err = outcome.expect_err("unicode case-insensitive dictionaries must fail closed");

        assert_eq!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported {
                name: "songs".to_string(),
            }
        );
    }
}
2817
// Tests for the redaction logging surface: the `RedactionLogger` trait, the
// `RedactionLogError` display text, the serialized shape of `RedactionEntry`,
// and recognizer id/version handling on `Candidate`.
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    /// Minimal `RedactionLogger` used by these tests: it accepts every entry
    /// and discards it.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    /// Compile-time probe: instantiating this for `T` only type-checks when
    /// `T` is `Send + Sync`. `?Sized` allows trait-object types.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    /// Pins the exact `Display` text of the `Sqlite` and `Backend` variants.
    #[test]
    fn redaction_log_error_display_is_stable() {
        assert_eq!(
            RedactionLogError::Sqlite("write failed".to_string()).to_string(),
            "sqlite redaction log error: write failed"
        );
        assert_eq!(
            RedactionLogError::Backend("sink failed".to_string()).to_string(),
            "backend redaction log error: sink failed"
        );
    }

    /// `dyn RedactionLogger` must be `Send + Sync`; this fails to compile
    /// otherwise.
    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    /// A type defined outside the logger's own module can implement the trait
    /// and be driven through a trait object.
    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        // Spelled out as a full struct literal (no constructor), so adding a
        // field to `RedactionEntry` makes this test fail to compile.
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            recognizer_id: None,
            recognizer_version_id: None,
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
            fallback_triggered: None,
            provenance_stage: None,
            provenance_model_id: None,
            provenance_model_version: None,
            provenance_artifact_sha256: None,
            provenance_tokenizer_sha256: None,
            provenance_locale_resolved: None,
            provenance_locale_match_kind: None,
            provenance_canonical_class: None,
            provenance_native_class: None,
            provenance_confidence: None,
            provenance_merged_from: None,
            backend_silently_dropped: None,
            restore_policy: None,
            restore_decision: None,
            restore_unknown_token_count: None,
            restore_manifest_bypass_count: None,
            restore_fresh_pii_count: None,
            restore_phase_mask: None,
        };

        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }

    /// Pins the exact JSON produced for an entry built with
    /// `RedactionEntry::new` and no recognizer metadata: the expected string
    /// has no `recognizer_id` / `recognizer_version_id` keys.
    #[test]
    fn redaction_entry_json_shape_omits_absent_recognizer_lineage() {
        let entry = RedactionEntry::new(
            "email.global",
            PiiClass::Email,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        );

        let rendered = serde_json::to_string(&entry).expect("serialize redaction entry");

        // Byte-for-byte comparison: key order and `null` fields are part of
        // the pinned shape.
        assert_eq!(
            rendered,
            r#"{"source":"email.global","class":"email","action":"tokenize","field_name":null,"document_kind":"text","conflict_loser":false,"decided_by":"none","created_at":0,"session_id":null,"validator_fail_reason":null,"ambiguity_record":null,"collision_family":null,"collision_variant":null,"fallback_triggered":null}"#
        );
    }

    /// When recognizer metadata is attached, both ids appear in the JSON
    /// under `recognizer_id` and `recognizer_version_id`.
    #[test]
    fn redaction_entry_json_shape_includes_recognizer_lineage_when_present() {
        let entry = RedactionEntry::new(
            "ner/ort",
            PiiClass::Name,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        )
        .with_recognizer_metadata(
            Some("ner".to_string()),
            Some("ner.davlan-mbert.v1".to_string()),
        );

        let value: serde_json::Value =
            serde_json::to_value(&entry).expect("serialize redaction entry");

        assert_eq!(value["recognizer_id"], "ner");
        assert_eq!(value["recognizer_version_id"], "ner.davlan-mbert.v1");
    }

    /// `Candidate::new` leaves the version id unset, and
    /// `with_recognizer_version_id` sets it without touching `recognizer_id`.
    #[test]
    fn candidate_keeps_versioned_and_unversioned_recognizer_ids() {
        let unversioned = Candidate::new(
            0..5,
            PiiClass::Email,
            "email.global",
            0.9,
            10,
            None,
            "email",
            "email.global",
            ConflictTier::None,
            Vec::new(),
        );
        assert_eq!(unversioned.recognizer_id, "email.global");
        assert_eq!(unversioned.recognizer_version_id, None);

        let versioned = unversioned
            .clone()
            .with_recognizer_version_id("email.global.v1");
        assert_eq!(versioned.recognizer_id, "email.global");
        assert_eq!(
            versioned.recognizer_version_id.as_deref(),
            Some("email.global.v1")
        );
    }
}
2967
// Tests for `Manifest::diff_against` (coverage of a suspect range by emitted
// token spans) and for `SafetyNetError` display output.
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    /// Builds an `EmittedTokenSpan` whose clean and raw spans are both
    /// `start..end`.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    /// Shorthand for `manifest.diff_against(&suspect, &class)`; takes the
    /// manifest by value, so callers that reuse it must clone.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    /// A suspect range exactly covered by a same-class token yields `None`.
    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    /// A suspect range that overlaps no token is reported as `Uncovered`.
    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    /// One hole between two tokens is reported as `PartialBleed` with the
    /// hole's range.
    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    /// With several holes (3..5 and 7..9), the earliest one is reported.
    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);

        assert_eq!(
            // Only the first gap is expected, not the later 7..9 gap.
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    /// When the suspect overlaps tokens of two other classes, the mismatch
    /// names the class of the earliest token (`Name` at 0..4).
    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);

        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    /// Tokens that touch (end of one == start of next) leave no gap.
    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);

        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    /// Partial coverage is reported whether the uncovered part is at the
    /// start, at the end, or in the middle of the suspect range.
    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        // Uncovered prefix.
        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        // Uncovered suffix.
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        // Uncovered middle.
        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    /// Spans are byte offsets: the four-byte emoji pushes the token start to
    /// byte 9, and coverage computed with those byte offsets is exact.
    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);

        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    /// An empty suspect range is never a leak, even against an empty manifest.
    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    /// Renders one instance of each listed `SafetyNetError` variant and checks
    /// that a sample PII string does not appear in the output.
    ///
    /// NOTE(review): none of the inputs contain the sample string, and the
    /// per-variant text is not asserted, so this is weaker than the test name
    /// suggests — consider pinning the expected messages.
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelIntegrityMismatch {
                expected: "e3b0c44298fc1c149afbf4c8996fb924".to_string(),
                actual: "4e07408562bedb8b60ce05c1decfe3ad".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
3139
/// A detector that scans input text and proposes [`Candidate`] spans.
///
/// Implementors must be `Send + Sync`.
pub trait Recognizer: Send + Sync {
    /// Identifier of this recognizer.
    fn id(&self) -> &str;
    /// The [`PiiClass`] this recognizer reports as supported.
    fn supported_class(&self) -> &PiiClass;
    /// Scans `input` with the given context and returns the candidates found.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Fallible counterpart of [`Recognizer::detect`].
    ///
    /// The default implementation never fails: it wraps `detect` in `Ok`.
    /// NOTE(review): presumably overridden by recognizers whose backend can
    /// fail at runtime — confirm against implementors.
    fn try_detect(
        &self,
        input: &str,
        ctx: &DetectContext<'_>,
    ) -> Result<Vec<Candidate>, RecognizerRuntimeError> {
        Ok(self.detect(input, ctx))
    }
    /// Token family name associated with this recognizer.
    fn token_family(&self) -> &str;
    /// Validator kind associated with this recognizer; `None` by default.
    fn validator_kind(&self) -> Option<ValidatorKind> {
        None
    }
    /// Locales this recognizer applies to; `[LocaleTag::Global]` by default.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
3167
3168#[derive(Debug, Clone, PartialEq)]
3170#[non_exhaustive]
3171pub struct Candidate {
3172 pub span: Range<usize>,
3174 pub class: PiiClass,
3176 pub recognizer_id: String,
3178 pub recognizer_version_id: Option<String>,
3180 pub score: f32,
3182 pub priority: i32,
3184 pub canonical_form: Option<String>,
3186 pub token_family: String,
3188 pub source: String,
3190 pub decided_by: ConflictTier,
3192 pub merged_sources: Vec<String>,
3194}
3195
3196impl Candidate {
3197 #[allow(clippy::too_many_arguments)]
3199 pub fn new(
3200 span: Range<usize>,
3201 class: PiiClass,
3202 recognizer_id: impl Into<String>,
3203 score: f32,
3204 priority: i32,
3205 canonical_form: Option<String>,
3206 token_family: impl Into<String>,
3207 source: impl Into<String>,
3208 decided_by: ConflictTier,
3209 merged_sources: Vec<String>,
3210 ) -> Self {
3211 Self {
3212 span,
3213 class,
3214 recognizer_id: recognizer_id.into(),
3215 recognizer_version_id: None,
3216 score,
3217 priority,
3218 canonical_form,
3219 token_family: token_family.into(),
3220 source: source.into(),
3221 decided_by,
3222 merged_sources,
3223 }
3224 }
3225
3226 pub fn with_span(mut self, span: Range<usize>) -> Self {
3228 self.span = span;
3229 self
3230 }
3231
3232 pub fn with_recognizer_version_id(mut self, recognizer_version_id: impl Into<String>) -> Self {
3234 self.recognizer_version_id = Some(recognizer_version_id.into());
3235 self
3236 }
3237}
3238
3239#[non_exhaustive]
3241pub struct DetectContext<'a> {
3242 pub locale_chain: &'a [LocaleTag],
3244 pub dictionaries: &'a DictionaryBundle,
3246 pub fields: &'a (),
3248 pub degraded: Cell<bool>,
3250}
3251
3252impl<'a> DetectContext<'a> {
3253 pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
3255 Self {
3256 locale_chain,
3257 dictionaries,
3258 fields: &(),
3259 degraded: Cell::new(false),
3260 }
3261 }
3262}
3263
3264fn ensure_global(tags: &mut Vec<LocaleTag>) {
3265 if !tags.contains(&LocaleTag::Global) {
3266 tags.push(LocaleTag::Global);
3267 }
3268}
3269
/// Cheap structural check for a hyphen-separated language tag.
///
/// Accepts `raw` when the first subtag is 2 to 8 ASCII letters and every
/// following subtag is 2 to 8 ASCII letters or digits. This is deliberately
/// stricter than full BCP 47: single-character subtags (for example the `x`
/// in `en-x-private`) are rejected, as are empty subtags.
fn is_bcp47_parseable(raw: &str) -> bool {
    fn has_subtag_length(subtag: &str) -> bool {
        (2..=8).contains(&subtag.len())
    }

    let mut subtags = raw.split('-');

    // `split` always yields at least one piece, even for an empty string.
    let language_ok = match subtags.next() {
        Some(language) => {
            has_subtag_length(language)
                && language.bytes().all(|byte| byte.is_ascii_alphabetic())
        }
        None => false,
    };

    language_ok
        && subtags.all(|subtag| {
            has_subtag_length(subtag) && subtag.bytes().all(|byte| byte.is_ascii_alphanumeric())
        })
}
3282
/// Normalizes the ASCII casing of a hyphen-separated language tag.
///
/// The first subtag is lowercased. Any later subtag made of exactly two
/// ASCII letters is uppercased (region style, e.g. `US`); every other
/// subtag is lowercased, so a script subtag such as `Hant` becomes `hant`.
/// Non-ASCII characters are left untouched.
fn canonical_other(raw: &str) -> String {
    // ASCII case mapping never changes byte length, so this capacity is exact.
    let mut canonical = String::with_capacity(raw.len());

    for (position, subtag) in raw.split('-').enumerate() {
        if position > 0 {
            canonical.push('-');
        }

        let region_like = position > 0
            && subtag.len() == 2
            && subtag.bytes().all(|byte| byte.is_ascii_alphabetic());

        if region_like {
            canonical.extend(subtag.chars().map(|ch| ch.to_ascii_uppercase()));
        } else {
            canonical.extend(subtag.chars().map(|ch| ch.to_ascii_lowercase()));
        }
    }

    canonical
}