1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use sha3::{Digest, Keccak256};
10use thiserror::Error;
11
/// A recognizer that scans raw text and reports PII hits.
///
/// Implementations must be `Send + Sync` so they can be shared across workers.
pub trait Detector: Send + Sync {
    /// Scans `input` and returns every detection found (possibly empty).
    fn detect(&self, input: &str) -> Vec<Detection>;
}
17
/// The classes of personally identifiable information the pipeline tracks.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Email address.
    Email,
    /// Personal name.
    Name,
    /// Geographic location.
    Location,
    /// Organization name.
    Organization,
    /// User-defined class; payload is the class name (see [`PiiClass::custom`]
    /// for the normalized constructor — direct construction is not normalized).
    Custom(String),
}
58
/// Display names of the built-in classes; order must stay in sync with the
/// positional indexing in `PiiClass::class_name`.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
61
/// Collision-family identifiers used by the bundled recognizer set.
/// NOTE(review): presumably reserved so user configs cannot redefine them —
/// confirm against the config loader.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
80
/// One variant's slot inside a recognizer collision family.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Collision family this membership belongs to.
    pub family: String,
    /// Variant name within the family.
    pub variant: String,
    /// Tie-breaking precedence used during conflict resolution.
    pub precedence: u32,
    /// Anchor token required for this variant to win, if any.
    pub mandatory_anchor: Option<String>,
}

impl CollisionMembership {
    /// Assembles a membership record from its four components.
    pub fn new(
        family: impl Into<String>,
        variant: impl Into<String>,
        precedence: u32,
        mandatory_anchor: Option<String>,
    ) -> Self {
        let family = family.into();
        let variant = variant.into();
        CollisionMembership {
            family,
            variant,
            precedence,
            mandatory_anchor,
        }
    }
}
111
112impl PiiClass {
113 pub fn from_policy_name(input: &str) -> Option<Self> {
115 match input {
116 "email" => Some(Self::Email),
117 "name" => Some(Self::Name),
118 "location" => Some(Self::Location),
119 "organization" => Some(Self::Organization),
120 custom if custom.starts_with("custom:") => {
121 let name = custom.trim_start_matches("custom:");
122 (!name.trim().is_empty()).then(|| Self::custom(name))
123 }
124 _ => None,
125 }
126 }
127
128 pub fn builtin_variants() -> &'static [PiiClass] {
130 &[
131 PiiClass::Email,
132 PiiClass::Name,
133 PiiClass::Location,
134 PiiClass::Organization,
135 ]
136 }
137
138 pub fn custom(name: &str) -> Self {
140 let mut normalized = String::new();
141 let mut pending_underscore = false;
142 for ch in name.trim().chars() {
143 if ch.is_ascii_alphanumeric() {
144 if pending_underscore && !normalized.is_empty() {
145 normalized.push('_');
146 }
147 normalized.push(ch.to_ascii_lowercase());
148 pending_underscore = false;
149 } else {
150 pending_underscore = true;
151 }
152 }
153
154 Self::Custom(normalized)
155 }
156
157 pub fn as_custom_name(&self) -> Option<&str> {
159 match self {
160 Self::Custom(name) => Some(name.as_str()),
161 Self::Email | Self::Name | Self::Location | Self::Organization => None,
162 }
163 }
164
165 pub fn class_name(&self) -> String {
167 match self {
168 Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
169 Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
170 Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
171 Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
172 Self::Custom(name) => format!("Custom:{name}"),
173 }
174 }
175
176 pub fn to_canonical_str(&self) -> String {
178 match self {
179 Self::Email => "email".to_string(),
180 Self::Name => "name".to_string(),
181 Self::Location => "location".to_string(),
182 Self::Organization => "organization".to_string(),
183 Self::Custom(name) => format!("custom:{name}"),
184 }
185 }
186
187 pub fn from_canonical_str(value: &str) -> Option<Self> {
189 match value {
190 "email" | "Email" => Some(Self::Email),
191 "name" | "Name" => Some(Self::Name),
192 "location" | "Location" => Some(Self::Location),
193 "organization" | "Organization" => Some(Self::Organization),
194 custom if custom.starts_with("custom:") => {
195 let name = &custom["custom:".len()..];
196 (!name.is_empty()).then(|| Self::Custom(name.to_string()))
197 }
198 _ => None,
199 }
200 }
201}
202
203#[derive(Debug, Clone, PartialEq, Eq)]
209#[non_exhaustive]
210pub struct PiiClassAudit(pub PiiClass);
211
212impl PiiClassAudit {
213 pub fn new(class: PiiClass) -> Self {
215 Self(class)
216 }
217
218 pub fn into_inner(self) -> PiiClass {
220 self.0
221 }
222}
223
// Serializes as the canonical string form (e.g. "email", "custom:foo").
impl Serialize for PiiClassAudit {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(&self.0.to_canonical_str())
    }
}
232
// Deserializes from the canonical string form; rejects unknown names with a
// descriptive error rather than defaulting.
impl<'de> Deserialize<'de> for PiiClassAudit {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let value = String::deserialize(deserializer)?;
        PiiClass::from_canonical_str(&value)
            .map(Self)
            .ok_or_else(|| {
                serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
            })
    }
}
246
/// Adapter for `#[serde(with = "...")]` fields: (de)serializes a plain
/// [`PiiClass`] through the canonical-string [`PiiClassAudit`] wrapper.
mod pii_class_audit_serde {
    use super::{PiiClass, PiiClassAudit};
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        PiiClassAudit::new(class.clone()).serialize(serializer)
    }

    pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
    where
        D: Deserializer<'de>,
    {
        Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
    }
}
265
266#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
268#[non_exhaustive]
269pub struct LosingCandidate {
270 #[serde(with = "pii_class_audit_serde")]
272 pub class: PiiClass,
273 pub recognizer_id: String,
275}
276
277impl LosingCandidate {
278 pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
280 Self {
281 class,
282 recognizer_id: recognizer_id.into(),
283 }
284 }
285}
286
287#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
289#[non_exhaustive]
290pub struct AmbiguityRecord {
291 #[serde(with = "pii_class_audit_serde")]
293 pub ambiguity_class: PiiClass,
294 pub losing_candidates: Vec<LosingCandidate>,
298 pub reason: AmbiguityReason,
300}
301
302impl AmbiguityRecord {
303 pub fn new(
305 ambiguity_class: PiiClass,
306 losing_candidates: Vec<LosingCandidate>,
307 reason: AmbiguityReason,
308 ) -> Self {
309 Self {
310 ambiguity_class,
311 losing_candidates,
312 reason,
313 }
314 }
315}
316
/// Why a classification ambiguity could not be resolved.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// No anchor token was found.
    NoAnchor,
    /// A validator returned neither pass nor fail.
    ValidatorIndeterminate,
    /// The match belonged to more than one collision family.
    MultiFamilyMatch,
    /// Candidates tied on precedence.
    PrecedenceTie,
}
331
/// Typed reason a [`ValidatorKind`] rejected its input.
///
/// The serde aliases keep older serialized names readable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    /// Luhn checksum did not verify.
    LuhnFailed,
    /// IBAN mod-97 check did not verify.
    IbanMod97Failed,
    /// Input failed the basic email shape check.
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    /// Input failed E.164 phone validation.
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    /// National phone number did not match the expected region.
    PhoneNationalRegionMismatch,
    /// Input did not parse as an IPv4 address.
    Ipv4ParseFailed,
    /// Input did not parse as an IPv6 address.
    Ipv6ParseFailed,
    /// EIP-55 mixed-case checksum did not verify.
    EthEip55ChecksumFailed,
}
356
/// Result of running a validator over a candidate string.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// Input validated; `canonical_form` carries the normalized value if one
    /// was produced.
    Pass { canonical_form: Option<String> },
    /// Input rejected for the given reason.
    Fail { reason: ValidatorFailReason },
    /// The validator did not apply to this input.
    NotApplicable,
}
369
/// Error returned by [`ValidatorKind::parse`] for unknown validator names.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    /// The given validator name is not supported (possibly because the
    /// feature that provides it is disabled).
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        /// The name that failed to parse.
        kind: String,
    },
}
381
/// The checksum/shape validators the pipeline can run on a candidate match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    /// Basic email shape check ("local@domain.tld").
    EmailRfc,
    /// E.164 phone validation (requires the `phone-parser` feature).
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// National phone validation for a specific region.
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    /// Luhn checksum (payment cards, IMEI, …).
    Luhn,
    /// IBAN mod-97 checksum.
    IbanMod97,
    /// IPv4 address parse.
    Ipv4Parse,
    /// IPv6 address parse.
    Ipv6Parse,
    /// Ethereum EIP-55 mixed-case checksum.
    EthEip55,
}
405
/// Phone-number regions supported by the national phone validator.
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    /// Germany (+49).
    De,
    /// United States (+1).
    Us,
}
416
impl ValidatorKind {
    /// Parses a policy-file validator name into a kind.
    ///
    /// # Errors
    /// Returns [`ValidatorKindParseError::UnsupportedValidator`] for unknown
    /// names — including phone validators when the `phone-parser` feature is
    /// disabled, since those arms are compiled out.
    pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
        match s {
            "email_rfc" => Ok(Self::EmailRfc),
            #[cfg(feature = "phone-parser")]
            "e164_phone" => Ok(Self::E164Phone),
            #[cfg(feature = "phone-parser")]
            "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
            #[cfg(feature = "phone-parser")]
            "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
            "luhn" => Ok(Self::Luhn),
            "iban_mod97" => Ok(Self::IbanMod97),
            "ipv4_parse" => Ok(Self::Ipv4Parse),
            "ipv6_parse" => Ok(Self::Ipv6Parse),
            "eth_eip55" => Ok(Self::EthEip55),
            other => Err(ValidatorKindParseError::UnsupportedValidator {
                kind: other.to_string(),
            }),
        }
    }

    /// Returns `true` when `input` passes this validator.
    pub fn validates(self, input: &str) -> bool {
        self.canonical_form(input).is_some()
    }

    /// Runs the validator, reporting a pass (with its canonical form) or a
    /// typed failure reason. This path never yields
    /// [`ValidatorOutcome::NotApplicable`].
    pub fn validate(self, input: &str) -> ValidatorOutcome {
        match self.canonical_form(input) {
            Some(canonical_form) => ValidatorOutcome::Pass {
                canonical_form: Some(canonical_form),
            },
            None => ValidatorOutcome::Fail {
                reason: self.fail_reason(),
            },
        }
    }

    /// Returns the canonical form of `input` if it passes validation.
    ///
    /// Most validators echo the input unchanged; the national phone validator
    /// reformats to E.164.
    pub fn canonical_form(self, input: &str) -> Option<String> {
        match self {
            Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
            #[cfg(feature = "phone-parser")]
            Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
            #[cfg(feature = "phone-parser")]
            Self::E164PhoneNational(region) => validate_phone_national(region, input),
            Self::Luhn => luhn_check(input).then(|| input.to_string()),
            Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
            Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
            Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
            Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
        }
    }

    /// Maps this validator to the failure reason it reports on rejection.
    pub fn fail_reason(self) -> ValidatorFailReason {
        match self {
            Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
            #[cfg(feature = "phone-parser")]
            Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
            #[cfg(feature = "phone-parser")]
            Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
            Self::Luhn => ValidatorFailReason::LuhnFailed,
            Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
            Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
            Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
            Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
        }
    }
}
488
/// Minimal email shape check: non-empty local part, and a domain that
/// contains a dot but neither starts nor ends with one.
///
/// Splits at the *first* `@`, so extra `@`s end up in the domain part —
/// this is intentionally loose, not full RFC 5322 validation.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        Some((local, domain)) if !local.is_empty() => {
            domain.contains('.') && !domain.starts_with('.') && !domain.ends_with('.')
        }
        _ => false,
    }
}
495
/// Checks that `input` parses as a valid phone number with no region hint
/// (so it must carry its own country code), via the `phonenumber` crate.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    phonenumber::parse(None, input).is_ok_and(|phone| phonenumber::is_valid(&phone))
}
500
/// Validates `input` as a national number for `region` and returns it
/// reformatted to E.164.
///
/// The parsed number must resolve to the region's own country code. Numbers
/// the library considers invalid are still accepted when they appear on the
/// fixture allowlist ([`is_safe_fixture_phone`]).
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let country = match region {
        Region::De => phonenumber::country::DE,
        Region::Us => phonenumber::country::US,
    };
    let expected_code = match region {
        Region::De => 49,
        Region::Us => 1,
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    if number.country().code() != expected_code {
        return None;
    }
    if number.is_valid() || is_safe_fixture_phone(region, input) {
        return Some(number.format().mode(phonenumber::Mode::E164).to_string());
    }
    None
}
520
/// Allowlist of fixture/documentation phone numbers that should pass even
/// though the phone library rejects them.
///
/// US: the reserved 555-01xx exchange (a bare "15550100" or a 1-prefixed
/// 10-digit number whose exchange block is 555-01). DE: a fixed set of
/// fixture numbers, with and without the country code.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    // Compare on digits only so formatting (spaces, dashes, '+') is ignored.
    let digits = input
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>();
    match region {
        Region::Us => {
            // `rest.len() == 10` is checked before slicing, so `rest[3..]`
            // cannot panic.
            digits == "15550100"
                || matches!(digits.strip_prefix('1'), Some(rest) if rest.len() == 10 && rest[3..].starts_with("55501"))
        }
        Region::De => matches!(
            digits.as_str(),
            "493000000000"
                | "4915100000000"
                | "4915550112233"
                | "015550112233"
                | "491710000000"
                | "01710000000"
        ),
    }
}
543
/// Luhn checksum over 13–19 digits.
///
/// Whitespace and dashes are ignored; any other non-digit byte rejects the
/// input outright. Fewer than 13 or more than 19 digits also rejects.
fn luhn_check(input: &str) -> bool {
    let mut digits: Vec<u8> = Vec::with_capacity(input.len());
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(byte - b'0'),
            b'-' => {}
            b if b.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    if digits.len() < 13 || digits.len() > 19 {
        return false;
    }

    // Standard Luhn: walking from the rightmost digit, double every second
    // digit and subtract 9 when the doubled value exceeds 9.
    let mut sum = 0u32;
    let mut double = false;
    for &digit in digits.iter().rev() {
        let mut value = u32::from(digit);
        if double {
            value *= 2;
            if value > 9 {
                value -= 9;
            }
        }
        sum += value;
        double = !double;
    }
    sum % 10 == 0
}
576
/// IBAN mod-97 check (ISO 13616): canonicalize, move the first four
/// characters to the end, expand letters to two decimal digits
/// (A=10 … Z=35), and require the running remainder mod 97 to equal 1.
fn iban_mod97_check(input: &str) -> bool {
    let canonical = iban_canonicalize(input);
    let len = canonical.len();
    if len < 15 || len > 34 {
        return false;
    }
    if canonical.bytes().any(|b| !b.is_ascii_alphanumeric()) {
        return false;
    }

    // All bytes are ASCII alphanumerics here, so byte slicing is safe, and
    // canonicalization has already uppercased any letters.
    let rotated = canonical[4..].bytes().chain(canonical[..4].bytes());
    let mut remainder = 0u32;
    for byte in rotated {
        if byte.is_ascii_digit() {
            remainder = (remainder * 10 + u32::from(byte - b'0')) % 97;
        } else if byte.is_ascii_uppercase() {
            let value = u32::from(byte - b'A') + 10;
            remainder = (remainder * 10 + value / 10) % 97;
            remainder = (remainder * 10 + value % 10) % 97;
        } else {
            return false;
        }
    }
    remainder == 1
}

/// Strips ASCII whitespace and uppercases every character of a raw IBAN.
fn iban_canonicalize(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        if !ch.is_ascii_whitespace() {
            out.extend(ch.to_uppercase());
        }
    }
    out
}
610
/// Returns `true` when `input` parses as an IPv4 address.
fn ipv4_parse_check(input: &str) -> bool {
    matches!(input.parse::<std::net::Ipv4Addr>(), Ok(_))
}
614
/// Returns `true` when `input` parses as an IPv6 address.
fn ipv6_parse_check(input: &str) -> bool {
    matches!(input.parse::<std::net::Ipv6Addr>(), Ok(_))
}
618
/// Validates an Ethereum address against the EIP-55 mixed-case checksum.
///
/// Requires a "0x" prefix and exactly 40 hex digits. Per EIP-55, an address
/// with no mixed case (all letters lowercase, or all uppercase) carries no
/// checksum and is accepted as-is.
fn eth_eip55_check(input: &str) -> bool {
    let Some(address) = input.strip_prefix("0x") else {
        return false;
    };
    if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return false;
    }
    // Checksum-less forms: every letter lowercase, or every letter uppercase.
    if address
        .bytes()
        .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
        || address
            .bytes()
            .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
    {
        return true;
    }

    // Mixed case: each hex letter must be uppercase iff the corresponding
    // nibble of keccak256(lowercase(address)) is greater than 7.
    let lowercase = address.to_ascii_lowercase();
    let hash = Keccak256::digest(lowercase.as_bytes());
    for (index, byte) in address.bytes().enumerate() {
        if byte.is_ascii_digit() {
            continue;
        }
        // Even positions map to the high nibble, odd to the low nibble.
        let hash_nibble = if index % 2 == 0 {
            hash[index / 2] >> 4
        } else {
            hash[index / 2] & 0x0f
        };
        if (hash_nibble > 7) != byte.is_ascii_uppercase() {
            return false;
        }
    }
    true
}
653
654#[derive(Debug, Clone, PartialEq, Eq)]
656#[non_exhaustive]
657pub struct Detection {
658 pub span: Range<usize>,
660 pub class: PiiClass,
662 pub source: String,
664}
665
666impl Detection {
667 pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
669 Self {
670 span,
671 class,
672 source: source.into(),
673 }
674 }
675}
676
/// A post-redaction leak detector that re-scans the cleaned text.
pub trait SafetyNet: Send + Sync {
    /// Stable identifier for this safety net.
    fn id(&self) -> &str;

    /// Locales this safety net can meaningfully analyze.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Scans `clean_text` against the manifest in `context` and returns any
    /// suspected leaks.
    ///
    /// # Errors
    /// Returns a [`SafetyNetError`] when the net cannot run (missing model,
    /// oversized input, runtime failure, …).
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
704
705#[derive(Debug, Clone, Copy)]
707#[non_exhaustive]
708pub struct SafetyNetContext<'a> {
709 pub manifest: &'a Manifest,
711 pub locale_chain: &'a [LocaleTag],
715 pub document_kind: DocumentKind,
717 pub session_id: Option<&'a str>,
719 pub field_path: Option<&'a str>,
721}
722
723impl<'a> SafetyNetContext<'a> {
724 pub fn new(
726 manifest: &'a Manifest,
727 locale_chain: &'a [LocaleTag],
728 document_kind: DocumentKind,
729 session_id: Option<&'a str>,
730 field_path: Option<&'a str>,
731 ) -> Self {
732 Self {
733 manifest,
734 locale_chain,
735 document_kind,
736 session_id,
737 field_path,
738 }
739 }
740}
741
742#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
744#[non_exhaustive]
745pub struct EmittedTokenSpan {
746 pub clean_span: Range<usize>,
748 pub raw_span: Range<usize>,
750 pub class: PiiClass,
752}
753
754impl EmittedTokenSpan {
755 pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
757 Self {
758 clean_span,
759 raw_span,
760 class,
761 }
762 }
763}
764
/// The set of spans the pipeline emitted, used to audit safety-net findings.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
    /// Spans sorted by `(clean_span.start, clean_span.end)`; `diff_against`
    /// relies on this ordering.
    pub spans: Vec<EmittedTokenSpan>,
}

impl Manifest {
    /// Builds a manifest, sorting spans into the order `diff_against` needs.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Compares a safety-net suspect against the emitted spans.
    ///
    /// Returns `None` when the suspect is empty or fully covered by spans of
    /// the same class. Otherwise classifies the leak: `Uncovered` (no
    /// overlap at all), `PartialBleed` (a gap in coverage, reporting the
    /// first uncovered sub-range), or `ClassMismatch` (covered, but the
    /// first overlapping span disagrees on class).
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        if suspect_span.is_empty() {
            return None;
        }

        // Binary-search past spans that end at or before the suspect starts.
        // NOTE(review): `partition_point` needs `clean_span.end` to be
        // non-decreasing in the (start, end)-sorted list — true when spans
        // never overlap/nest. Confirm the pipeline guarantees that.
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // Sweep a cursor across the suspect span; any gap before the next
        // overlapping span is a partial bleed.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            if span.clean_span.start > cursor {
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            if span.clean_span.end > cursor {
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Tail of the suspect left uncovered after the last span.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered: report a class mismatch if one was seen, else clean.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
841
/// Half-open interval overlap test. Note an empty range positioned strictly
/// inside a non-empty one still reports `true` under this formula.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    !(left.end <= right.start || right.end <= left.start)
}
845
846#[derive(Debug, Clone, PartialEq)]
848#[non_exhaustive]
849pub struct LeakSuspect {
850 pub span: Range<usize>,
852 pub class: PiiClass,
854 pub safety_net_id: String,
856 pub score: Option<f32>,
858 pub kind: LeakKind,
860 pub raw_label: String,
862 pub field_path: Option<String>,
864}
865
866impl LeakSuspect {
867 pub fn new(
869 span: Range<usize>,
870 class: PiiClass,
871 safety_net_id: impl Into<String>,
872 score: Option<f32>,
873 kind: LeakKind,
874 raw_label: impl Into<String>,
875 field_path: Option<String>,
876 ) -> Self {
877 Self {
878 span,
879 class,
880 safety_net_id: safety_net_id.into(),
881 score,
882 kind,
883 raw_label: raw_label.into(),
884 field_path,
885 }
886 }
887}
888
/// How a leak suspect relates to the spans the pipeline emitted.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// The suspect overlaps no emitted span at all.
    Uncovered,
    /// The suspect is only partially covered by emitted spans.
    PartialBleed {
        /// First uncovered sub-range found inside the suspect.
        uncovered: Range<usize>,
    },
    /// The suspect is covered, but the classes disagree.
    ClassMismatch {
        /// Class the pipeline redacted the span as.
        pipeline_class: PiiClass,
        /// Class the safety net assigned.
        safety_net_class: PiiClass,
    },
}
910
/// Non-suspect events recorded while running safety nets.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// A safety net was skipped because it does not support the locale chain.
    LocaleSkipped {
        /// Identifier of the skipped safety net.
        safety_net_id: String,
        /// Kind of document that was being scanned.
        document_kind: DocumentKind,
        /// Field being scanned at the time, for structured documents.
        field_path: Option<String>,
    },
}
925
/// Aggregate counters derived from a [`LeakReport`]'s contents.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Total number of suspects.
    pub suspect_count: usize,
    /// Suspects with no manifest coverage at all.
    pub uncovered_count: usize,
    /// Suspects only partially covered.
    pub partial_bleed_count: usize,
    /// Suspects covered under a different class.
    pub class_mismatch_count: usize,
    /// Safety nets skipped for locale reasons.
    pub locale_skipped_count: usize,
}
941
942#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
949#[non_exhaustive]
950pub struct DocumentExtension {
951 pub schema_version: u16,
953 pub clean_md_sha256: [u8; 32],
955 pub layout_json_sha256: [u8; 32],
957 pub report_json_sha256: [u8; 32],
959 #[serde(default, skip_serializing_if = "Option::is_none")]
961 pub preview_png_sha256: Option<[u8; 32]>,
962 pub page_count: u32,
964 pub audit_session_id: String,
966 #[serde(default, skip_serializing_if = "Vec::is_empty")]
968 pub clean_spans: Vec<EmittedTokenSpan>,
969 #[serde(default, skip_serializing_if = "Vec::is_empty")]
971 pub codec_audit: Vec<CodecAuditRow>,
972}
973
974impl DocumentExtension {
975 pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
977 DocumentExtensionBuilder {
978 schema_version,
979 clean_md_sha256: None,
980 layout_json_sha256: None,
981 report_json_sha256: None,
982 preview_png_sha256: None,
983 page_count: None,
984 audit_session_id: None,
985 clean_spans: Vec::new(),
986 codec_audit: Vec::new(),
987 }
988 }
989}
990
991#[derive(Debug, Clone)]
993#[must_use]
994pub struct DocumentExtensionBuilder {
995 schema_version: u16,
996 clean_md_sha256: Option<[u8; 32]>,
997 layout_json_sha256: Option<[u8; 32]>,
998 report_json_sha256: Option<[u8; 32]>,
999 preview_png_sha256: Option<[u8; 32]>,
1000 page_count: Option<u32>,
1001 audit_session_id: Option<String>,
1002 clean_spans: Vec<EmittedTokenSpan>,
1003 codec_audit: Vec<CodecAuditRow>,
1004}
1005
1006impl DocumentExtensionBuilder {
1007 pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
1008 self.clean_md_sha256 = Some(hash);
1009 self
1010 }
1011
1012 pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
1013 self.layout_json_sha256 = Some(hash);
1014 self
1015 }
1016
1017 pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
1018 self.report_json_sha256 = Some(hash);
1019 self
1020 }
1021
1022 pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
1023 self.preview_png_sha256 = Some(hash);
1024 self
1025 }
1026
1027 pub fn page_count(mut self, page_count: u32) -> Self {
1028 self.page_count = Some(page_count);
1029 self
1030 }
1031
1032 pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
1033 self.audit_session_id = Some(audit_session_id.into());
1034 self
1035 }
1036
1037 pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
1038 self.clean_spans = clean_spans;
1039 self
1040 }
1041
1042 pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
1043 self.codec_audit = codec_audit;
1044 self
1045 }
1046
1047 pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
1048 Ok(DocumentExtension {
1049 schema_version: self.schema_version,
1050 clean_md_sha256: self
1051 .clean_md_sha256
1052 .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
1053 layout_json_sha256: self
1054 .layout_json_sha256
1055 .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
1056 report_json_sha256: self
1057 .report_json_sha256
1058 .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
1059 preview_png_sha256: self.preview_png_sha256,
1060 page_count: self
1061 .page_count
1062 .ok_or(DocumentExtensionError::MissingField("page_count"))?,
1063 audit_session_id: self
1064 .audit_session_id
1065 .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
1066 clean_spans: self.clean_spans,
1067 codec_audit: self.codec_audit,
1068 })
1069 }
1070}
1071
/// Errors from [`DocumentExtensionBuilder::build`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required builder field was never set; payload names the field.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
1079
/// How the text content of a document was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Optical character recognition.
    Ocr,
    /// Text embedded in the source format.
    EmbeddedText,
    /// Audio/video transcript.
    Transcript,
    /// A mix of the above.
    Hybrid,
}
1094
1095#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
1097#[non_exhaustive]
1098pub struct CodecCapabilitySet {
1099 pub text: bool,
1101 pub layout: bool,
1103 pub confidence: bool,
1105 pub timestamps: bool,
1107}
1108
1109impl CodecCapabilitySet {
1110 pub const TEXT_ONLY: Self = Self {
1112 text: true,
1113 layout: false,
1114 confidence: false,
1115 timestamps: false,
1116 };
1117
1118 pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
1120 Self {
1121 text,
1122 layout,
1123 confidence,
1124 timestamps,
1125 }
1126 }
1127
1128 pub fn contains(self, requested: Self) -> bool {
1130 (!requested.text || self.text)
1131 && (!requested.layout || self.layout)
1132 && (!requested.confidence || self.confidence)
1133 && (!requested.timestamps || self.timestamps)
1134 }
1135}
1136
1137#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1139#[serde(rename_all = "snake_case")]
1140#[non_exhaustive]
1141pub enum ExtractionDensityPolicy {
1142 Required(f32),
1144 Exempt { reason: String },
1146}
1147
1148impl Default for ExtractionDensityPolicy {
1149 fn default() -> Self {
1150 Self::Exempt {
1151 reason: "calibration_pending".to_string(),
1152 }
1153 }
1154}
1155
1156#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1158#[non_exhaustive]
1159pub struct CodecAuditRow {
1160 pub codec_id: String,
1162 pub codec_version: String,
1164 pub accepted_mime: String,
1166 pub advertised: CodecCapabilitySet,
1168 pub delivered: CodecCapabilitySet,
1170 pub text_origin: TextOrigin,
1172 pub codec_output_schema_version: u16,
1174 #[serde(default, skip_serializing_if = "Option::is_none")]
1176 pub options_hash_hex: Option<String>,
1177 #[serde(default, skip_serializing_if = "Option::is_none")]
1179 pub engine_provenance: Option<String>,
1180 pub extraction_density_policy: ExtractionDensityPolicy,
1182}
1183
1184impl CodecAuditRow {
1185 pub fn new(
1187 codec_id: impl Into<String>,
1188 codec_version: impl Into<String>,
1189 accepted_mime: impl Into<String>,
1190 text_origin: TextOrigin,
1191 ) -> Self {
1192 Self {
1193 codec_id: codec_id.into(),
1194 codec_version: codec_version.into(),
1195 accepted_mime: accepted_mime.into(),
1196 advertised: CodecCapabilitySet::default(),
1197 delivered: CodecCapabilitySet::default(),
1198 text_origin,
1199 codec_output_schema_version: 1,
1200 options_hash_hex: None,
1201 engine_provenance: None,
1202 extraction_density_policy: ExtractionDensityPolicy::default(),
1203 }
1204 }
1205}
1206
1207#[derive(Debug, Clone, Default, PartialEq)]
1213#[non_exhaustive]
1214pub struct LeakReport {
1215 pub suspects: Vec<LeakSuspect>,
1217 pub telemetry: Vec<LeakReportTelemetry>,
1219 pub stats: LeakReportStats,
1221 pub replay_hash: Option<String>,
1226}
1227
1228impl LeakReport {
1229 pub fn from_parts(
1231 suspects: Vec<LeakSuspect>,
1232 telemetry: Vec<LeakReportTelemetry>,
1233 ) -> LeakReport {
1234 let mut stats = LeakReportStats {
1235 suspect_count: suspects.len(),
1236 locale_skipped_count: telemetry
1237 .iter()
1238 .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
1239 .count(),
1240 ..LeakReportStats::default()
1241 };
1242 for suspect in &suspects {
1243 match suspect.kind {
1244 LeakKind::Uncovered => stats.uncovered_count += 1,
1245 LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
1246 LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
1247 }
1248 }
1249 LeakReport {
1250 suspects,
1251 telemetry,
1252 stats,
1253 replay_hash: None,
1254 }
1255 }
1256
1257 pub fn extend(&mut self, other: LeakReport) {
1259 self.suspects.extend(other.suspects);
1260 self.telemetry.extend(other.telemetry);
1261 *self = LeakReport::from_parts(
1262 std::mem::take(&mut self.suspects),
1263 std::mem::take(&mut self.telemetry),
1264 );
1265 }
1266}
1267
/// Label vocabulary used by the OpenAI-style private-data classifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// A private person's name.
    PrivatePerson,
    /// A private address.
    PrivateAddress,
    /// A private email address.
    PrivateEmail,
    /// A private phone number.
    PrivatePhone,
    /// A private URL.
    PrivateUrl,
    /// A private date.
    PrivateDate,
    /// An account number.
    AccountNumber,
    /// A secret/credential.
    Secret,
}

impl OpenAiPrivateLabel {
    /// Returns the wire-format label string for this variant.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::PrivatePerson => "private_person",
            Self::PrivateAddress => "private_address",
            Self::PrivateEmail => "private_email",
            Self::PrivatePhone => "private_phone",
            Self::PrivateUrl => "private_url",
            Self::PrivateDate => "private_date",
            Self::AccountNumber => "account_number",
            Self::Secret => "secret",
        }
    }
}
1305
/// PII classes a safety net can report, mapped into pipeline classes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Email address.
    Email,
    /// Personal name.
    Name,
    /// Geographic location.
    Location,
    /// Phone number.
    Phone,
    /// URL.
    Url,
    /// Date.
    Date,
    /// Account number.
    AccountNumber,
    /// Secret/credential.
    Secret,
}

impl SafetyNetPiiClass {
    /// Maps into a [`PiiClass`]; classes without a built-in counterpart
    /// become normalized custom classes ("phone", "url", …).
    pub fn to_pii_class(self) -> PiiClass {
        match self {
            Self::Email => PiiClass::Email,
            Self::Name => PiiClass::Name,
            Self::Location => PiiClass::Location,
            Self::Phone => PiiClass::custom("phone"),
            Self::Url => PiiClass::custom("url"),
            Self::Date => PiiClass::custom("date"),
            Self::AccountNumber => PiiClass::custom("account_number"),
            Self::Secret => PiiClass::custom("secret"),
        }
    }
}
1343
/// Errors a [`SafetyNet`] can raise instead of producing suspects.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// The net is not available in this build/environment.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Why the net is unavailable.
        reason: String,
    },
    /// Model weights could not be found on disk.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Path that was probed for the weights.
        path: String,
    },
    /// The model could not be loaded or initialized.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Why the model is unavailable.
        reason: String,
    },
    /// The input exceeded the net's size limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Maximum accepted size.
        limit: usize,
        /// Actual input size.
        actual: usize,
    },
    /// The net failed while running.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Runtime failure detail.
        message: String,
    },
    /// The net produced output the caller could not interpret.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// What was wrong with the output.
        message: String,
    },
}
1387
/// What the pipeline does to a detected span.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace with a reversible token.
    Tokenize,
    /// Remove/blank out irreversibly.
    Redact,
    /// Replace with a format-preserving substitute.
    FormatPreserve,
    /// Replace with a less specific value.
    Generalize,
    /// Leave the span untouched.
    Preserve,
}
1415
/// Which tie-breaking tier decided a conflict between overlapping candidates
/// (recorded in [`RedactionEntry::decided_by`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict occurred.
    None,
    /// Decided by class priority.
    ClassPriority,
    /// Decided by rule priority.
    RulePriority,
    /// Decided by score.
    Score,
    /// Decided by span length.
    SpanLength,
    /// Decided by a validator result.
    Validator,
    /// A validator vetoed a candidate.
    ValidatorVeto,
    /// Decided by a collision-family policy.
    CollisionPolicy,
    /// Decided by anchored context.
    AnchoredContext,
    /// Decided by recognizer id (final deterministic tie-break).
    RecognizerId,
    /// Candidates were merged rather than one winning.
    Merged,
}
1443
/// The shape of the document being scanned.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Structured document with addressable fields.
    Structured,
    /// Free-form text.
    Text,
}
1453
1454#[derive(Debug, Clone, PartialEq, Eq)]
1464#[non_exhaustive]
1465pub struct RedactionEntry {
1466 pub source: String,
1468 pub class: PiiClass,
1470 pub action: Action,
1472 pub field_name: Option<String>,
1474 pub document_kind: DocumentKind,
1476 pub conflict_loser: bool,
1478 pub decided_by: ConflictTier,
1480 pub created_at: i64,
1482 pub session_id: Option<String>,
1484 pub validator_fail_reason: Option<ValidatorFailReason>,
1486 pub ambiguity_record: Option<AmbiguityRecord>,
1488 pub collision_family: Option<String>,
1490 pub collision_variant: Option<String>,
1492}
1493
1494impl RedactionEntry {
1495 #[allow(clippy::too_many_arguments)]
1497 pub fn new(
1498 source: impl Into<String>,
1499 class: PiiClass,
1500 action: Action,
1501 field_name: Option<String>,
1502 document_kind: DocumentKind,
1503 conflict_loser: bool,
1504 decided_by: ConflictTier,
1505 created_at: i64,
1506 session_id: Option<String>,
1507 ) -> Self {
1508 Self {
1509 source: source.into(),
1510 class,
1511 action,
1512 field_name,
1513 document_kind,
1514 conflict_loser,
1515 decided_by,
1516 created_at,
1517 session_id,
1518 validator_fail_reason: None,
1519 ambiguity_record: None,
1520 collision_family: None,
1521 collision_variant: None,
1522 }
1523 }
1524
1525 pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
1527 self.validator_fail_reason = Some(reason);
1528 self
1529 }
1530
1531 pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
1533 self.ambiguity_record = Some(record);
1534 self
1535 }
1536
1537 pub fn with_collision_metadata(
1539 mut self,
1540 family: Option<String>,
1541 variant: Option<String>,
1542 ) -> Self {
1543 self.collision_family = family;
1544 self.collision_variant = variant;
1545 self
1546 }
1547}
1548
/// Errors surfaced by [`RedactionLogger`] implementations.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// Failure in a SQLite-backed log.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Failure in some other logging backend.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
1560
/// Sink for redaction audit entries. The `Send + Sync` bound keeps trait
/// objects shareable across threads.
pub trait RedactionLogger: Send + Sync {
    /// Records one entry, returning an error if the backend rejects it.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
1594
/// A locale selector. Known German/English locales get dedicated variants;
/// any other tag accepted by `parse` is kept as [`LocaleTag::Other`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Matches every locale.
    Global,
    /// German (Germany).
    DeDe,
    /// German (Austria).
    DeAt,
    /// German (Switzerland).
    DeCh,
    /// English (United States).
    EnUs,
    /// English (United Kingdom).
    EnGb,
    /// English (Ireland).
    EnIe,
    /// English (Australia).
    EnAu,
    /// English (Canada).
    EnCa,
    /// Any other accepted tag, stored in canonicalized form.
    Other(String),
}
1620
/// Error returned when a locale string cannot be parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input was empty or not a recognizable locale tag.
    Unsupported,
}
1628
1629impl fmt::Display for LocaleError {
1630 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1631 match self {
1632 LocaleError::Unsupported => f.write_str("unsupported locale"),
1633 }
1634 }
1635}
1636
// `LocaleError` has no underlying cause, so the default `Error` impl suffices.
impl std::error::Error for LocaleError {}
1638
/// Ordered list of active locales. Constructors append [`LocaleTag::Global`]
/// when absent (via `ensure_global`), so the chain always matches global rules.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
1642
1643impl LocaleTag {
1644 pub const GLOBAL: LocaleTag = LocaleTag::Global;
1646
1647 pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
1649 let raw = s.trim().replace('_', "-");
1650 let normalized = raw.to_ascii_lowercase();
1651 match normalized.as_str() {
1652 "global" | "*" => Ok(LocaleTag::Global),
1653 "de-de" => Ok(LocaleTag::DeDe),
1654 "de-at" => Ok(LocaleTag::DeAt),
1655 "de-ch" => Ok(LocaleTag::DeCh),
1656 "en-us" => Ok(LocaleTag::EnUs),
1657 "en-gb" => Ok(LocaleTag::EnGb),
1658 "en-ie" => Ok(LocaleTag::EnIe),
1659 "en-au" => Ok(LocaleTag::EnAu),
1660 "en-ca" => Ok(LocaleTag::EnCa),
1661 "" => Err(LocaleError::Unsupported),
1662 _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
1663 _ => Err(LocaleError::Unsupported),
1664 }
1665 }
1666
1667 pub fn as_str(&self) -> &str {
1669 match self {
1670 LocaleTag::Global => "global",
1671 LocaleTag::DeDe => "de-DE",
1672 LocaleTag::DeAt => "de-AT",
1673 LocaleTag::DeCh => "de-CH",
1674 LocaleTag::EnUs => "en-US",
1675 LocaleTag::EnGb => "en-GB",
1676 LocaleTag::EnIe => "en-IE",
1677 LocaleTag::EnAu => "en-AU",
1678 LocaleTag::EnCa => "en-CA",
1679 LocaleTag::Other(tag) => tag.as_str(),
1680 }
1681 }
1682}
1683
1684impl LocaleChain {
1685 pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
1687 ensure_global(&mut tags);
1688 LocaleChain(tags)
1689 }
1690
1691 pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
1693 let tags = raw
1694 .split(',')
1695 .map(LocaleTag::parse)
1696 .collect::<Result<Vec<_>, _>>()?;
1697 Ok(LocaleChain::from_tags(tags))
1698 }
1699
1700 pub fn merge_policy_and_cli(
1702 policy: Option<&[LocaleTag]>,
1703 cli: Option<&[LocaleTag]>,
1704 ) -> LocaleChain {
1705 Self::merge_cli_policy_rulepack_default(cli, policy, None)
1706 }
1707
1708 pub fn merge_cli_policy_rulepack_default(
1710 cli: Option<&[LocaleTag]>,
1711 policy: Option<&[LocaleTag]>,
1712 rulepack_defaults: Option<&[LocaleTag]>,
1713 ) -> LocaleChain {
1714 let tags = cli
1715 .filter(|tags| !tags.is_empty())
1716 .or_else(|| policy.filter(|tags| !tags.is_empty()))
1717 .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
1718 .map(|tags| tags.to_vec())
1719 .unwrap_or_else(|| vec![LocaleTag::Global]);
1720 LocaleChain::from_tags(tags)
1721 }
1722
1723 pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
1725 if recognizer_locales.is_empty() {
1726 return true;
1727 }
1728 recognizer_locales.iter().any(|recognizer_locale| {
1729 *recognizer_locale == LocaleTag::Global
1730 || self.0.iter().any(|active| active == recognizer_locale)
1731 })
1732 }
1733
1734 pub fn as_slice(&self) -> &[LocaleTag] {
1736 &self.0
1737 }
1738
1739 pub fn to_strings(&self) -> Vec<String> {
1741 self.0.iter().map(ToString::to_string).collect()
1742 }
1743}
1744
1745impl From<&[LocaleTag]> for LocaleChain {
1746 fn from(tags: &[LocaleTag]) -> Self {
1747 let mut owned = tags.to_vec();
1748 ensure_global(&mut owned);
1749 LocaleChain(owned)
1750 }
1751}
1752
1753impl fmt::Display for LocaleTag {
1754 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1755 f.write_str(self.as_str())
1756 }
1757}
1758
/// An input document prior to redaction.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Key/value document with deterministically ordered keys.
    Structured(BTreeMap<String, Value>),
    /// Free-form text.
    Text(String),
}
1776
/// A document after redaction. Serializes `untagged`, so the output is the
/// bare map or string with no enum wrapper.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Redacted structured document.
    Structured(BTreeMap<String, Value>),
    /// Redacted plain text.
    Text(String),
}
1804
/// Minimal JSON-like value model for structured documents. Serializes
/// `untagged`, so scalars and containers appear as plain JSON values.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Null value.
    Null,
    /// Boolean scalar.
    Bool(bool),
    /// String scalar.
    String(String),
    /// Signed 64-bit integer scalar.
    I64(i64),
    /// Ordered list of values.
    Array(Vec<Value>),
    /// Nested object with deterministically ordered keys.
    Object(BTreeMap<String, Value>),
}
1823
1824impl Value {
1825 pub fn as_str(&self) -> Option<&str> {
1827 match self {
1828 Self::String(value) => Some(value.as_str()),
1829 Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
1830 }
1831 }
1832
1833 pub fn scalar_to_safety_net_string(&self) -> Option<String> {
1835 match self {
1836 Self::String(value) if !value.is_empty() => Some(value.clone()),
1837 Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
1838 Self::Bool(value) => Some(value.to_string()),
1839 Self::I64(value) => Some(value.to_string()),
1840 }
1841 }
1842}
1843
1844impl PartialEq<&str> for Value {
1845 fn eq(&self, other: &&str) -> bool {
1846 self.as_str() == Some(*other)
1847 }
1848}
1849
/// Named dictionaries available to recognizers, keyed by dictionary name.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    entries: HashMap<String, DictionaryEntry>,
}
1855
/// One validated dictionary: a term list plus matching options.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Invariant: non-empty (enforced by `DictionaryEntry::new`).
    terms: Vec<String>,
    // When false, all terms must be ASCII (enforced by `DictionaryEntry::new`).
    case_sensitive: bool,
    // Where the dictionary came from (CLI vs rulepack).
    source: DictionarySource,
}
1863
/// Origin of a dictionary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Supplied on the command line.
    Cli,
    /// Bundled with a rulepack.
    Rulepack,
}
1873
/// Summary of one loaded dictionary, for reporting.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of terms it contains.
    pub term_count: usize,
    /// Where it was loaded from.
    pub source: DictionarySource,
}
1885
1886impl DictionaryStats {
1887 pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
1889 Self {
1890 name: name.into(),
1891 term_count,
1892 source,
1893 }
1894 }
1895}
1896
/// Raw dictionary data as declared in a rulepack, prior to validation.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name.
    pub name: String,
    /// Term list (validated later by `DictionaryEntry::new`).
    pub terms: Vec<String>,
    /// Whether matching should be case-sensitive.
    pub case_sensitive: bool,
}
1908
1909impl RulepackDict {
1910 pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
1912 Self {
1913 name: name.into(),
1914 terms,
1915 case_sensitive,
1916 }
1917 }
1918}
1919
/// Validation failures when turning raw dictionary data into an entry.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary contained no terms.
    Empty { name: String },
    /// Case-insensitive matching was requested for non-ASCII terms,
    /// which this version does not support.
    UnicodeInsensitiveUnsupported { name: String },
}
1929
1930impl fmt::Display for DictionaryLoadError {
1931 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1932 match self {
1933 Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
1934 Self::UnicodeInsensitiveUnsupported { name } => write!(
1935 f,
1936 "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
1937 ),
1938 }
1939 }
1940}
1941
// `DictionaryLoadError` has no underlying cause, so the default `Error` impl suffices.
impl std::error::Error for DictionaryLoadError {}
1943
1944impl DictionaryBundle {
1945 pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
1947 let mut entries = HashMap::with_capacity(terms.len());
1948 for dictionary in terms {
1949 let entry = DictionaryEntry::new(
1950 &dictionary.name,
1951 dictionary.terms.clone(),
1952 dictionary.case_sensitive,
1953 DictionarySource::Rulepack,
1954 )
1955 .expect("Policy validates dictionary terms before bundle construction");
1956 entries.insert(dictionary.name.clone(), entry);
1957 }
1958 Self { entries }
1959 }
1960
1961 pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
1963 Self {
1964 entries: entries.into_iter().collect(),
1965 }
1966 }
1967
1968 pub fn merge(a: Self, b: Self) -> Self {
1970 let mut entries = a.entries;
1971 entries.extend(b.entries);
1972 Self { entries }
1973 }
1974
1975 pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
1977 self.entries.get(name)
1978 }
1979
1980 pub fn stats(&self) -> Vec<DictionaryStats> {
1982 let mut stats = self
1983 .entries
1984 .iter()
1985 .map(|(name, entry)| DictionaryStats {
1986 name: name.clone(),
1987 term_count: entry.terms.len(),
1988 source: entry.source,
1989 })
1990 .collect::<Vec<_>>();
1991 stats.sort_by(|a, b| a.name.cmp(&b.name));
1992 stats
1993 }
1994}
1995
1996impl DictionaryEntry {
1997 pub fn new(
1999 name: &str,
2000 terms: Vec<String>,
2001 case_sensitive: bool,
2002 source: DictionarySource,
2003 ) -> Result<Self, DictionaryLoadError> {
2004 if terms.is_empty() {
2005 return Err(DictionaryLoadError::Empty {
2006 name: name.to_string(),
2007 });
2008 }
2009 if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
2010 return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
2011 name: name.to_string(),
2012 });
2013 }
2014 Ok(Self {
2015 terms,
2016 case_sensitive,
2017 source,
2018 })
2019 }
2020
2021 pub fn case_sensitive(&self) -> bool {
2023 self.case_sensitive
2024 }
2025
2026 pub fn terms(&self) -> &[String] {
2028 &self.terms
2029 }
2030}
2031
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    // Validation must reject dictionaries without terms.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let error = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli)
            .expect_err("empty dictionaries must fail closed");

        assert!(matches!(error, DictionaryLoadError::Empty { name } if name == "empty"));
    }

    // Non-ASCII terms plus case-insensitive matching is unsupported.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let error = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli)
            .expect_err("unicode case-insensitive dictionaries must fail closed");

        assert!(matches!(
            error,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
        ));
    }
}
2060
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    // Minimal logger that accepts every entry.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    // Downstream consumers rely on these exact messages.
    #[test]
    fn redaction_log_error_display_is_stable() {
        let sqlite = RedactionLogError::Sqlite("write failed".to_string());
        assert_eq!(sqlite.to_string(), "sqlite redaction log error: write failed");

        let backend = RedactionLogError::Backend("sink failed".to_string());
        assert_eq!(backend.to_string(), "backend redaction log error: sink failed");
    }

    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    #[test]
    fn local_logger_can_implement_redaction_logger() {
        // `new` leaves every optional diagnostic field as `None`, matching
        // the minimal entry this test needs.
        let entry = RedactionEntry::new(
            "unit-test",
            PiiClass::Email,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        );

        let logger = CapturingLogger;
        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }
}
2115
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    // Helper: builds a token span whose clean and raw ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    // Helper: runs the manifest's leak diff for a suspect range and class.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    // A suspect range with no overlapping token at all is a full leak.
    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    // Coverage with a hole inside the suspect range reports the hole.
    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    // With several holes, the earliest one is reported (determinism check).
    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    // The first class disagreement wins over later ones (determinism check).
    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);

        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    // Back-to-back tokens of one class count as continuous coverage.
    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);

        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    // Leading, trailing, and interior gaps are all reported as bleeds.
    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    // Manifest offsets are byte offsets; multibyte chars must not shift them.
    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);

        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    // A zero-length suspect range can never leak anything.
    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    // Rendered errors must never echo scanned content (here: a sentinel email).
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
2282
/// A single-class PII detector plugged into the pipeline. Implementations
/// must be `Send + Sync` so they can be shared as trait objects.
pub trait Recognizer: Send + Sync {
    /// Stable identifier for this recognizer.
    fn id(&self) -> &str;
    /// The one PII class this recognizer produces candidates for.
    fn supported_class(&self) -> &PiiClass;
    /// Scans `input` and returns candidate spans; `ctx` supplies the active
    /// locale chain, dictionaries, and the degraded flag.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family this recognizer's candidates belong to.
    fn token_family(&self) -> &str;
    /// Optional validator for candidates; defaults to no validation.
    fn validator_kind(&self) -> Option<ValidatorKind> {
        None
    }
    /// Locales this recognizer applies to; defaults to all (`Global`).
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
2302
/// A candidate PII detection produced by a recognizer, before conflict
/// resolution. (`PartialEq` only — `score: f32` rules out `Eq`.)
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Byte range of the match in the scanned input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Id of the recognizer that produced this candidate.
    pub recognizer_id: String,
    /// Confidence score.
    pub score: f32,
    /// Rule priority.
    pub priority: i32,
    /// Normalized form of the matched text, when available.
    pub canonical_form: Option<String>,
    /// Token family for grouping related token formats.
    pub token_family: String,
    /// Label of the producing source/component.
    pub source: String,
    /// Conflict tier that last decided this candidate.
    pub decided_by: ConflictTier,
    /// Sources folded into this candidate when candidates were merged.
    pub merged_sources: Vec<String>,
}
2328
2329impl Candidate {
2330 #[allow(clippy::too_many_arguments)]
2332 pub fn new(
2333 span: Range<usize>,
2334 class: PiiClass,
2335 recognizer_id: impl Into<String>,
2336 score: f32,
2337 priority: i32,
2338 canonical_form: Option<String>,
2339 token_family: impl Into<String>,
2340 source: impl Into<String>,
2341 decided_by: ConflictTier,
2342 merged_sources: Vec<String>,
2343 ) -> Self {
2344 Self {
2345 span,
2346 class,
2347 recognizer_id: recognizer_id.into(),
2348 score,
2349 priority,
2350 canonical_form,
2351 token_family: token_family.into(),
2352 source: source.into(),
2353 decided_by,
2354 merged_sources,
2355 }
2356 }
2357
2358 pub fn with_span(mut self, span: Range<usize>) -> Self {
2360 self.span = span;
2361 self
2362 }
2363}
2364
/// Shared context handed to recognizers during detection.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Active locale chain for this run.
    pub locale_chain: &'a [LocaleTag],
    /// Dictionaries available for lookups.
    pub dictionaries: &'a DictionaryBundle,
    /// Placeholder (unit); presumably reserved for field metadata — confirm.
    pub fields: &'a (),
    /// Degraded-mode flag; `Cell` allows flipping it through a shared
    /// reference (single-threaded interior mutability).
    pub degraded: Cell<bool>,
}
2377
2378impl<'a> DetectContext<'a> {
2379 pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
2381 Self {
2382 locale_chain,
2383 dictionaries,
2384 fields: &(),
2385 degraded: Cell::new(false),
2386 }
2387 }
2388}
2389
2390fn ensure_global(tags: &mut Vec<LocaleTag>) {
2391 if !tags.contains(&LocaleTag::Global) {
2392 tags.push(LocaleTag::Global);
2393 }
2394}
2395
/// Loose structural check that `raw` looks like a BCP 47 language tag:
/// a 2-8 letter ASCII-alphabetic primary subtag followed by zero or more
/// 2-8 character ASCII-alphanumeric subtags, separated by `-`. This is not
/// full RFC 5646 validation; it only gates what `LocaleTag::Other` accepts.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut subtags = raw.split('-');
    // `split` always yields at least one item; an empty input fails the
    // length check below.
    let language_ok = subtags.next().map_or(false, |language| {
        (2..=8).contains(&language.len()) && language.chars().all(|ch| ch.is_ascii_alphabetic())
    });
    language_ok
        && subtags.all(|subtag| {
            (2..=8).contains(&subtag.len()) && subtag.chars().all(|ch| ch.is_ascii_alphanumeric())
        })
}
2408
/// Canonicalizes an `Other` tag's casing: the primary subtag is lowercased,
/// two-letter alphabetic subtags (region-like) are uppercased, and every
/// remaining subtag is lowercased. Subtag order and count are preserved.
fn canonical_other(raw: &str) -> String {
    let mut subtags = raw.split('-');
    let mut canonical = subtags.next().unwrap_or_default().to_ascii_lowercase();
    for subtag in subtags {
        canonical.push('-');
        if subtag.len() == 2 && subtag.chars().all(|ch| ch.is_ascii_alphabetic()) {
            canonical.push_str(&subtag.to_ascii_uppercase());
        } else {
            canonical.push_str(&subtag.to_ascii_lowercase());
        }
    }
    canonical
}