1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use thiserror::Error;
10
/// A PII detector that scans raw text and reports every match it finds.
pub trait Detector: Send + Sync {
    /// Scans `input` and returns all detections; ordering is implementation-defined.
    fn detect(&self, input: &str) -> Vec<Detection>;
}
16
/// The class of PII a span was recognized as.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    Email,
    Name,
    Location,
    Organization,
    /// User-defined class; the payload is a normalized name (see [`PiiClass::custom`]).
    Custom(String),
}
57
/// Display names of the built-in classes; indices match the declaration order
/// of `PiiClass` (relied on by `PiiClass::class_name`).
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
60
61impl PiiClass {
62 pub fn from_policy_name(input: &str) -> Option<Self> {
64 match input {
65 "email" => Some(Self::Email),
66 "name" => Some(Self::Name),
67 "location" => Some(Self::Location),
68 "organization" => Some(Self::Organization),
69 custom if custom.starts_with("custom:") => {
70 let name = custom.trim_start_matches("custom:");
71 (!name.trim().is_empty()).then(|| Self::custom(name))
72 }
73 _ => None,
74 }
75 }
76
77 pub fn builtin_variants() -> &'static [PiiClass] {
79 &[
80 PiiClass::Email,
81 PiiClass::Name,
82 PiiClass::Location,
83 PiiClass::Organization,
84 ]
85 }
86
87 pub fn custom(name: &str) -> Self {
89 let mut normalized = String::new();
90 let mut pending_underscore = false;
91 for ch in name.trim().chars() {
92 if ch.is_ascii_alphanumeric() {
93 if pending_underscore && !normalized.is_empty() {
94 normalized.push('_');
95 }
96 normalized.push(ch.to_ascii_lowercase());
97 pending_underscore = false;
98 } else {
99 pending_underscore = true;
100 }
101 }
102
103 Self::Custom(normalized)
104 }
105
106 pub fn as_custom_name(&self) -> Option<&str> {
108 match self {
109 Self::Custom(name) => Some(name.as_str()),
110 Self::Email | Self::Name | Self::Location | Self::Organization => None,
111 }
112 }
113
114 pub fn class_name(&self) -> String {
116 match self {
117 Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
118 Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
119 Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
120 Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
121 Self::Custom(name) => format!("Custom:{name}"),
122 }
123 }
124}
125
/// A single PII match produced by a [`Detector`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
    // Half-open range of the match in the scanned input.
    pub span: Range<usize>,
    // Class the span was recognized as.
    pub class: PiiClass,
    // Identifier of the detector that produced this match.
    pub source: String,
}
137
138impl Detection {
139 pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
141 Self {
142 span,
143 class,
144 source: source.into(),
145 }
146 }
147}
148
/// A secondary, independent checker that re-scans already-cleaned text for
/// PII that slipped past the main pipeline.
pub trait SafetyNet: Send + Sync {
    /// Stable identifier for this net (recorded in `LeakSuspect::safety_net_id`).
    fn id(&self) -> &str;

    /// Locales this net can meaningfully check.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Scans `clean_text` and returns suspected leaks, or a [`SafetyNetError`]
    /// when the net cannot run.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
176
/// Per-check inputs handed to a [`SafetyNet`] alongside the clean text.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
    // Token spans the pipeline emitted for this document.
    pub manifest: &'a Manifest,
    // Active locales, in priority order.
    pub locale_chain: &'a [LocaleTag],
    // Whether the scanned value came from structured data or free text.
    pub document_kind: DocumentKind,
    // Optional session correlation id.
    pub session_id: Option<&'a str>,
    // Path of the field being checked, for structured documents.
    pub field_path: Option<&'a str>,
}
194
195impl<'a> SafetyNetContext<'a> {
196 pub fn new(
198 manifest: &'a Manifest,
199 locale_chain: &'a [LocaleTag],
200 document_kind: DocumentKind,
201 session_id: Option<&'a str>,
202 field_path: Option<&'a str>,
203 ) -> Self {
204 Self {
205 manifest,
206 locale_chain,
207 document_kind,
208 session_id,
209 field_path,
210 }
211 }
212}
213
/// One token the pipeline emitted, mapping a range in the clean output back
/// to the range it replaced in the raw input.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
    // Range of the token in the clean (redacted) text.
    pub clean_span: Range<usize>,
    // Range of the original value in the raw text.
    pub raw_span: Range<usize>,
    // Class the pipeline assigned to the value.
    pub class: PiiClass,
}
225
226impl EmittedTokenSpan {
227 pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
229 Self {
230 clean_span,
231 raw_span,
232 class,
233 }
234 }
235}
236
/// All token spans emitted for one document, kept sorted by clean-text
/// position (see [`Manifest::from_spans`]).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct Manifest {
    pub spans: Vec<EmittedTokenSpan>,
}
244
impl Manifest {
    /// Builds a manifest, sorting spans by clean-text position so that
    /// `diff_against` can binary-search and sweep them in order.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Compares one safety-net suspect against the emitted token spans.
    ///
    /// Returns `None` when the suspect range is empty or fully covered by
    /// tokens of the same class. Otherwise:
    /// - `Uncovered` when no span overlaps the suspect at all;
    /// - `PartialBleed` with the *first* uncovered gap inside the suspect;
    /// - `ClassMismatch` when coverage is complete but the first overlapping
    ///   span of a different class disagrees with `suspect_class`.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        if suspect_span.is_empty() {
            return None;
        }

        // Skip spans that end at or before the suspect; scan the rest only
        // while they can still overlap (spans are sorted by start).
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // Left-to-right sweep; `cursor` is the first byte not yet proven covered.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            if span.clean_span.start > cursor {
                // Gap before this span: report the first uncovered stretch,
                // clamped to the suspect range.
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            if span.clean_span.end > cursor {
                // Only the first class disagreement is remembered (deterministic).
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Tail of the suspect extends past the last covering span.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered: a leak only if some overlapping span disagreed on class.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
313
/// True when the two half-open ranges share at least one index.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    // Disjoint iff one range ends before the other begins; negated via De Morgan.
    !(left.end <= right.start || right.end <= left.start)
}
317
/// One potential leak reported by a [`SafetyNet`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
    // Range of the suspect in the clean text.
    pub span: Range<usize>,
    // Class the safety net assigned.
    pub class: PiiClass,
    // Id of the net that reported it (see `SafetyNet::id`).
    pub safety_net_id: String,
    // Optional model confidence; `None` for nets without scores.
    pub score: Option<f32>,
    // How the suspect differs from the manifest (see `LeakKind`).
    pub kind: LeakKind,
    // The net's raw label before mapping to `PiiClass`.
    pub raw_label: String,
    // Field path for structured documents, if applicable.
    pub field_path: Option<String>,
}
337
338impl LeakSuspect {
339 pub fn new(
341 span: Range<usize>,
342 class: PiiClass,
343 safety_net_id: impl Into<String>,
344 score: Option<f32>,
345 kind: LeakKind,
346 raw_label: impl Into<String>,
347 field_path: Option<String>,
348 ) -> Self {
349 Self {
350 span,
351 class,
352 safety_net_id: safety_net_id.into(),
353 score,
354 kind,
355 raw_label: raw_label.into(),
356 field_path,
357 }
358 }
359}
360
/// How a suspect differs from what the pipeline emitted.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No emitted token overlaps the suspect at all.
    Uncovered,
    /// Tokens cover the suspect only partially.
    PartialBleed {
        /// The first uncovered gap within the suspect range.
        uncovered: Range<usize>,
    },
    /// The suspect is fully covered, but by a different class.
    ClassMismatch {
        /// Class the pipeline assigned to the overlapping token.
        pipeline_class: PiiClass,
        /// Class the safety net assigned.
        safety_net_class: PiiClass,
    },
}
382
/// Non-suspect events recorded while running safety nets.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// A net was skipped because the active locales were outside its support.
    LocaleSkipped {
        safety_net_id: String,
        document_kind: DocumentKind,
        field_path: Option<String>,
    },
}
397
/// Counters derived from a report's suspects and telemetry
/// (see `LeakReport::from_parts`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct LeakReportStats {
    // Total suspects, regardless of kind.
    pub suspect_count: usize,
    // Suspects with `LeakKind::Uncovered`.
    pub uncovered_count: usize,
    // Suspects with `LeakKind::PartialBleed`.
    pub partial_bleed_count: usize,
    // Suspects with `LeakKind::ClassMismatch`.
    pub class_mismatch_count: usize,
    // `LocaleSkipped` telemetry events.
    pub locale_skipped_count: usize,
}
413
/// Aggregate result of running safety nets over a document.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
    pub suspects: Vec<LeakSuspect>,
    pub telemetry: Vec<LeakReportTelemetry>,
    // Always derived from `suspects`/`telemetry` by `from_parts`.
    pub stats: LeakReportStats,
    // Optional replay hash; `from_parts` leaves it `None` — presumably set by
    // a later stage; confirm with callers.
    pub replay_hash: Option<String>,
}
434
435impl LeakReport {
436 pub fn from_parts(
438 suspects: Vec<LeakSuspect>,
439 telemetry: Vec<LeakReportTelemetry>,
440 ) -> LeakReport {
441 let mut stats = LeakReportStats {
442 suspect_count: suspects.len(),
443 locale_skipped_count: telemetry
444 .iter()
445 .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
446 .count(),
447 ..LeakReportStats::default()
448 };
449 for suspect in &suspects {
450 match suspect.kind {
451 LeakKind::Uncovered => stats.uncovered_count += 1,
452 LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
453 LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
454 }
455 }
456 LeakReport {
457 suspects,
458 telemetry,
459 stats,
460 replay_hash: None,
461 }
462 }
463
464 pub fn extend(&mut self, other: LeakReport) {
466 self.suspects.extend(other.suspects);
467 self.telemetry.extend(other.telemetry);
468 *self = LeakReport::from_parts(
469 std::mem::take(&mut self.suspects),
470 std::mem::take(&mut self.telemetry),
471 );
472 }
473}
474
/// Label set for the OpenAI-style private-data tagger; wire names are
/// produced by `as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    PrivatePerson,
    PrivateAddress,
    PrivateEmail,
    PrivatePhone,
    PrivateUrl,
    PrivateDate,
    AccountNumber,
    Secret,
}
496
497impl OpenAiPrivateLabel {
498 pub fn as_str(self) -> &'static str {
500 match self {
501 Self::PrivatePerson => "private_person",
502 Self::PrivateAddress => "private_address",
503 Self::PrivateEmail => "private_email",
504 Self::PrivatePhone => "private_phone",
505 Self::PrivateUrl => "private_url",
506 Self::PrivateDate => "private_date",
507 Self::AccountNumber => "account_number",
508 Self::Secret => "secret",
509 }
510 }
511}
512
/// PII classes a safety net can report; a superset of the pipeline's
/// built-in classes, mapped via `to_pii_class`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    Email,
    Name,
    Location,
    Phone,
    Url,
    Date,
    AccountNumber,
    Secret,
}
534
535impl SafetyNetPiiClass {
536 pub fn to_pii_class(self) -> PiiClass {
538 match self {
539 Self::Email => PiiClass::Email,
540 Self::Name => PiiClass::Name,
541 Self::Location => PiiClass::Location,
542 Self::Phone => PiiClass::custom("phone"),
543 Self::Url => PiiClass::custom("url"),
544 Self::Date => PiiClass::custom("date"),
545 Self::AccountNumber => PiiClass::custom("account_number"),
546 Self::Secret => PiiClass::custom("secret"),
547 }
548 }
549}
550
/// Reasons a [`SafetyNet`] could not produce a result.
///
/// Display strings deliberately carry no scanned text, so they never leak PII
/// (see the `safety_net_error_display_is_variant_specific_and_bytes_free` test).
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// The net cannot run at all.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        reason: String,
    },
    /// Model weights were not found at the given path.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        path: String,
    },
    /// Weights exist but the model could not be loaded or used.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        reason: String,
    },
    /// Input exceeded the net's size limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        limit: usize,
        actual: usize,
    },
    /// The underlying runtime failed mid-check.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        message: String,
    },
    /// The net ran but its output could not be interpreted.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        message: String,
    },
}
594
/// What the pipeline does with a detected span.
// NOTE(review): variant semantics inferred from names only — confirm against
// the pipeline implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    Tokenize,
    Redact,
    FormatPreserve,
    Generalize,
    Preserve,
}
622
/// Which tie-breaking stage decided a candidate during conflict resolution.
// NOTE(review): presumably listed in precedence order — confirm with the
// conflict-resolution code, which is not in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    None,
    ClassPriority,
    RulePriority,
    Score,
    SpanLength,
    Validator,
    RecognizerId,
    Merged,
}
644
/// Whether a document is structured (field/value) or free text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    Structured,
    Text,
}
654
/// One audit-log record describing a redaction decision.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    // Identifier of the component that produced the detection.
    pub source: String,
    // Class assigned to the span.
    pub class: PiiClass,
    // Action applied to the span.
    pub action: Action,
    // Field name for structured documents, if applicable.
    pub field_name: Option<String>,
    // Structured vs. free-text document.
    pub document_kind: DocumentKind,
    // True when this candidate lost a conflict-resolution round.
    pub conflict_loser: bool,
    // Tier that decided the conflict.
    pub decided_by: ConflictTier,
    // Creation timestamp; unit/epoch not established here — confirm with writers.
    pub created_at: i64,
    // Optional session correlation id.
    pub session_id: Option<String>,
}
686
687impl RedactionEntry {
688 #[allow(clippy::too_many_arguments)]
690 pub fn new(
691 source: impl Into<String>,
692 class: PiiClass,
693 action: Action,
694 field_name: Option<String>,
695 document_kind: DocumentKind,
696 conflict_loser: bool,
697 decided_by: ConflictTier,
698 created_at: i64,
699 session_id: Option<String>,
700 ) -> Self {
701 Self {
702 source: source.into(),
703 class,
704 action,
705 field_name,
706 document_kind,
707 conflict_loser,
708 decided_by,
709 created_at,
710 session_id,
711 }
712 }
713}
714
/// Failures from a [`RedactionLogger`] backend.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// SQLite-backed logger failure; payload is the backend message.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Any other backend failure; payload is the backend message.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
726
/// Sink for redaction audit entries; implementations must be thread-safe.
pub trait RedactionLogger: Send + Sync {
    /// Persists one entry, or reports why the backend could not.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
760
/// A locale the pipeline knows about.
///
/// Well-known locales get dedicated variants; any other well-formed BCP 47
/// tag is kept as `Other` in canonicalized form (see `LocaleTag::parse`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-agnostic; matches everything (see `LocaleChain::intersects`).
    Global,
    DeDe,
    DeAt,
    DeCh,
    EnUs,
    EnGb,
    EnIe,
    EnAu,
    EnCa,
    /// Canonicalized BCP 47 tag outside the well-known set.
    Other(String),
}
786
/// Error from [`LocaleTag::parse`] / [`LocaleChain::from_cli`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input is empty or not a well-formed locale tag.
    Unsupported,
}
794
795impl fmt::Display for LocaleError {
796 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
797 match self {
798 LocaleError::Unsupported => f.write_str("unsupported locale"),
799 }
800 }
801}
802
// Marker impl: `LocaleError` carries no underlying source error.
impl std::error::Error for LocaleError {}
804
/// Ordered list of active locales; construction guarantees `Global` is present
/// (see `ensure_global`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
808
809impl LocaleTag {
810 pub const GLOBAL: LocaleTag = LocaleTag::Global;
812
813 pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
815 let raw = s.trim().replace('_', "-");
816 let normalized = raw.to_ascii_lowercase();
817 match normalized.as_str() {
818 "global" | "*" => Ok(LocaleTag::Global),
819 "de-de" => Ok(LocaleTag::DeDe),
820 "de-at" => Ok(LocaleTag::DeAt),
821 "de-ch" => Ok(LocaleTag::DeCh),
822 "en-us" => Ok(LocaleTag::EnUs),
823 "en-gb" => Ok(LocaleTag::EnGb),
824 "en-ie" => Ok(LocaleTag::EnIe),
825 "en-au" => Ok(LocaleTag::EnAu),
826 "en-ca" => Ok(LocaleTag::EnCa),
827 "" => Err(LocaleError::Unsupported),
828 _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
829 _ => Err(LocaleError::Unsupported),
830 }
831 }
832
833 pub fn as_str(&self) -> &str {
835 match self {
836 LocaleTag::Global => "global",
837 LocaleTag::DeDe => "de-DE",
838 LocaleTag::DeAt => "de-AT",
839 LocaleTag::DeCh => "de-CH",
840 LocaleTag::EnUs => "en-US",
841 LocaleTag::EnGb => "en-GB",
842 LocaleTag::EnIe => "en-IE",
843 LocaleTag::EnAu => "en-AU",
844 LocaleTag::EnCa => "en-CA",
845 LocaleTag::Other(tag) => tag.as_str(),
846 }
847 }
848}
849
850impl LocaleChain {
851 pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
853 ensure_global(&mut tags);
854 LocaleChain(tags)
855 }
856
857 pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
859 let tags = raw
860 .split(',')
861 .map(LocaleTag::parse)
862 .collect::<Result<Vec<_>, _>>()?;
863 Ok(LocaleChain::from_tags(tags))
864 }
865
866 pub fn merge_policy_and_cli(
868 policy: Option<&[LocaleTag]>,
869 cli: Option<&[LocaleTag]>,
870 ) -> LocaleChain {
871 Self::merge_cli_policy_rulepack_default(cli, policy, None)
872 }
873
874 pub fn merge_cli_policy_rulepack_default(
876 cli: Option<&[LocaleTag]>,
877 policy: Option<&[LocaleTag]>,
878 rulepack_defaults: Option<&[LocaleTag]>,
879 ) -> LocaleChain {
880 let tags = cli
881 .filter(|tags| !tags.is_empty())
882 .or_else(|| policy.filter(|tags| !tags.is_empty()))
883 .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
884 .map(|tags| tags.to_vec())
885 .unwrap_or_else(|| vec![LocaleTag::Global]);
886 LocaleChain::from_tags(tags)
887 }
888
889 pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
891 if recognizer_locales.is_empty() {
892 return true;
893 }
894 recognizer_locales.iter().any(|recognizer_locale| {
895 *recognizer_locale == LocaleTag::Global
896 || self.0.iter().any(|active| active == recognizer_locale)
897 })
898 }
899
900 pub fn as_slice(&self) -> &[LocaleTag] {
902 &self.0
903 }
904
905 pub fn to_strings(&self) -> Vec<String> {
907 self.0.iter().map(ToString::to_string).collect()
908 }
909}
910
911impl From<&[LocaleTag]> for LocaleChain {
912 fn from(tags: &[LocaleTag]) -> Self {
913 let mut owned = tags.to_vec();
914 ensure_global(&mut owned);
915 LocaleChain(owned)
916 }
917}
918
919impl fmt::Display for LocaleTag {
920 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
921 f.write_str(self.as_str())
922 }
923}
924
/// A document before cleaning: either structured field/value data or free text.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    Structured(BTreeMap<String, Value>),
    Text(String),
}
942
/// A document after cleaning; serializes untagged, so it round-trips as the
/// bare object or string.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    Structured(BTreeMap<String, Value>),
    Text(String),
}
970
/// A JSON-like value used in structured documents (untagged serialization).
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    Null,
    Bool(bool),
    String(String),
    I64(i64),
    Array(Vec<Value>),
    Object(BTreeMap<String, Value>),
}
989
990impl Value {
991 pub fn as_str(&self) -> Option<&str> {
993 match self {
994 Self::String(value) => Some(value.as_str()),
995 Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
996 }
997 }
998
999 pub fn scalar_to_safety_net_string(&self) -> Option<String> {
1001 match self {
1002 Self::String(value) if !value.is_empty() => Some(value.clone()),
1003 Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
1004 Self::Bool(value) => Some(value.to_string()),
1005 Self::I64(value) => Some(value.to_string()),
1006 }
1007 }
1008}
1009
1010impl PartialEq<&str> for Value {
1011 fn eq(&self, other: &&str) -> bool {
1012 self.as_str() == Some(*other)
1013 }
1014}
1015
/// Named dictionaries available to recognizers, keyed by dictionary name.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    entries: HashMap<String, DictionaryEntry>,
}
1021
/// One validated dictionary (see `DictionaryEntry::new` for the invariants:
/// non-empty, and ASCII-only when case-insensitive).
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    terms: Vec<String>,
    case_sensitive: bool,
    source: DictionarySource,
}
1029
/// Where a dictionary was supplied from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    Cli,
    Rulepack,
}
1039
/// Summary of one dictionary, as reported by `DictionaryBundle::stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    pub name: String,
    pub term_count: usize,
    pub source: DictionarySource,
}
1051
1052impl DictionaryStats {
1053 pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
1055 Self {
1056 name: name.into(),
1057 term_count,
1058 source,
1059 }
1060 }
1061}
1062
/// Raw dictionary definition as declared in a rulepack, before validation
/// into a [`DictionaryEntry`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    pub name: String,
    pub terms: Vec<String>,
    pub case_sensitive: bool,
}
1074
1075impl RulepackDict {
1076 pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
1078 Self {
1079 name: name.into(),
1080 terms,
1081 case_sensitive,
1082 }
1083 }
1084}
1085
/// Validation failures from [`DictionaryEntry::new`] (fail-closed).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary declared no terms.
    Empty { name: String },
    /// Case-insensitive matching requested for non-ASCII terms.
    UnicodeInsensitiveUnsupported { name: String },
}
1095
impl fmt::Display for DictionaryLoadError {
    /// Renders a user-facing message naming the offending dictionary.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
            Self::UnicodeInsensitiveUnsupported { name } => write!(
                f,
                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
            ),
        }
    }
}
1107
// Marker impl: `DictionaryLoadError` carries no underlying source error.
impl std::error::Error for DictionaryLoadError {}
1109
1110impl DictionaryBundle {
1111 pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
1113 let mut entries = HashMap::with_capacity(terms.len());
1114 for dictionary in terms {
1115 let entry = DictionaryEntry::new(
1116 &dictionary.name,
1117 dictionary.terms.clone(),
1118 dictionary.case_sensitive,
1119 DictionarySource::Rulepack,
1120 )
1121 .expect("Policy validates dictionary terms before bundle construction");
1122 entries.insert(dictionary.name.clone(), entry);
1123 }
1124 Self { entries }
1125 }
1126
1127 pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
1129 Self {
1130 entries: entries.into_iter().collect(),
1131 }
1132 }
1133
1134 pub fn merge(a: Self, b: Self) -> Self {
1136 let mut entries = a.entries;
1137 entries.extend(b.entries);
1138 Self { entries }
1139 }
1140
1141 pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
1143 self.entries.get(name)
1144 }
1145
1146 pub fn stats(&self) -> Vec<DictionaryStats> {
1148 let mut stats = self
1149 .entries
1150 .iter()
1151 .map(|(name, entry)| DictionaryStats {
1152 name: name.clone(),
1153 term_count: entry.terms.len(),
1154 source: entry.source,
1155 })
1156 .collect::<Vec<_>>();
1157 stats.sort_by(|a, b| a.name.cmp(&b.name));
1158 stats
1159 }
1160}
1161
1162impl DictionaryEntry {
1163 pub fn new(
1165 name: &str,
1166 terms: Vec<String>,
1167 case_sensitive: bool,
1168 source: DictionarySource,
1169 ) -> Result<Self, DictionaryLoadError> {
1170 if terms.is_empty() {
1171 return Err(DictionaryLoadError::Empty {
1172 name: name.to_string(),
1173 });
1174 }
1175 if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
1176 return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
1177 name: name.to_string(),
1178 });
1179 }
1180 Ok(Self {
1181 terms,
1182 case_sensitive,
1183 source,
1184 })
1185 }
1186
1187 pub fn case_sensitive(&self) -> bool {
1189 self.case_sensitive
1190 }
1191
1192 pub fn terms(&self) -> &[String] {
1194 &self.terms
1195 }
1196}
1197
// Unit tests for `DictionaryEntry` validation (fail-closed behavior).
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        // An empty term list must be rejected, not silently accepted.
        let err = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli)
            .expect_err("empty dictionaries must fail closed");

        assert!(matches!(err, DictionaryLoadError::Empty { name } if name == "empty"));
    }

    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        // "Caf\u{00e9}" is non-ASCII; case-insensitive matching is unsupported for it.
        let err = DictionaryEntry::new(
            "songs",
            vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()],
            false,
            DictionarySource::Cli,
        )
        .expect_err("unicode case-insensitive dictionaries must fail closed");

        assert!(matches!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
        ));
    }
}
1226
// Tests for the `RedactionLogger` trait contract and error rendering.
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    // Minimal no-op logger used to exercise the trait object path.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    // Compile-time probe: only instantiable for Send + Sync types.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    #[test]
    fn redaction_log_error_display_is_stable() {
        // These messages are part of the observable contract; keep them stable.
        assert_eq!(
            RedactionLogError::Sqlite("write failed".to_string()).to_string(),
            "sqlite redaction log error: write failed"
        );
        assert_eq!(
            RedactionLogError::Backend("sink failed".to_string()).to_string(),
            "backend redaction log error: sink failed"
        );
    }

    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
        };

        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }
}
1277
// Tests for `Manifest::diff_against` and `SafetyNetError` rendering.
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    // Shorthand for a token whose clean and raw spans coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    // Shorthand for diffing a suspect against a manifest.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        // With two gaps (3..5 and 7..9) only the first is reported.
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        // Both spans mismatch Email; the leftmost one wins.
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);

        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);

        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        // Spans are byte offsets: the emoji occupies four bytes.
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);

        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        // Render every variant and check none of them can carry scanned PII.
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
1444
/// A single-class recognizer run by the detection pipeline.
pub trait Recognizer: Send + Sync {
    /// Stable identifier (recorded in `Candidate::recognizer_id`).
    fn id(&self) -> &str;
    /// The one class this recognizer produces.
    fn supported_class(&self) -> &PiiClass;
    /// Scans `input` and returns candidate spans.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family used when emitting placeholders for this recognizer.
    fn token_family(&self) -> &str;
    /// Locales this recognizer applies to; defaults to locale-agnostic.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
1460
/// A candidate PII span emitted by a [`Recognizer`], before conflict resolution.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    // Range of the candidate in the scanned input.
    pub span: Range<usize>,
    // Class assigned by the recognizer.
    pub class: PiiClass,
    // Id of the recognizer that produced it.
    pub recognizer_id: String,
    // Recognizer confidence; used as a conflict tie-breaker.
    pub score: f32,
    // Rule priority; used as a conflict tie-breaker.
    pub priority: i32,
    // Optional canonicalized form of the matched value.
    pub canonical_form: Option<String>,
    // Token family for placeholder emission.
    pub token_family: String,
    // Producer label for logging/attribution.
    pub source: String,
    // Tier that decided this candidate during conflict resolution.
    pub decided_by: ConflictTier,
    // Sources merged into this candidate, when `decided_by` is `Merged`
    // — presumably; confirm with the conflict-resolution code.
    pub merged_sources: Vec<String>,
}
1486
1487impl Candidate {
1488 #[allow(clippy::too_many_arguments)]
1490 pub fn new(
1491 span: Range<usize>,
1492 class: PiiClass,
1493 recognizer_id: impl Into<String>,
1494 score: f32,
1495 priority: i32,
1496 canonical_form: Option<String>,
1497 token_family: impl Into<String>,
1498 source: impl Into<String>,
1499 decided_by: ConflictTier,
1500 merged_sources: Vec<String>,
1501 ) -> Self {
1502 Self {
1503 span,
1504 class,
1505 recognizer_id: recognizer_id.into(),
1506 score,
1507 priority,
1508 canonical_form,
1509 token_family: token_family.into(),
1510 source: source.into(),
1511 decided_by,
1512 merged_sources,
1513 }
1514 }
1515
1516 pub fn with_span(mut self, span: Range<usize>) -> Self {
1518 self.span = span;
1519 self
1520 }
1521}
1522
#[non_exhaustive]
pub struct DetectContext<'a> {
    // Active locales, in priority order.
    pub locale_chain: &'a [LocaleTag],
    // Dictionaries available to dictionary-based recognizers.
    pub dictionaries: &'a DictionaryBundle,
    // Placeholder for future field metadata — currently always `&()`;
    // confirm intent before relying on it.
    pub fields: &'a (),
    // Interior-mutable flag; `new` initializes it to `false` — presumably set
    // by recognizers to signal degraded operation; confirm with callers.
    pub degraded: Cell<bool>,
}
1535
1536impl<'a> DetectContext<'a> {
1537 pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
1539 Self {
1540 locale_chain,
1541 dictionaries,
1542 fields: &(),
1543 degraded: Cell::new(false),
1544 }
1545 }
1546}
1547
1548fn ensure_global(tags: &mut Vec<LocaleTag>) {
1549 if !tags.contains(&LocaleTag::Global) {
1550 tags.push(LocaleTag::Global);
1551 }
1552}
1553
/// Cheap structural check that `raw` looks like a BCP 47 tag: a 2–8 letter
/// primary language subtag followed by zero or more 2–8 character
/// alphanumeric subtags, separated by `-`.
///
/// This is a shape check only: subtags are not validated against any
/// registry, and 1-character subtags (singletons/extensions such as `x-…`)
/// are rejected.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut parts = raw.split('-');
    // `split` always yields at least one item, so the previous
    // `let Some(..) else { return false }` branch was unreachable; an empty
    // first item simply fails the length check below.
    let language = parts.next().unwrap_or("");
    if !(2..=8).contains(&language.len()) || !language.chars().all(|ch| ch.is_ascii_alphabetic()) {
        return false;
    }
    parts.all(|part| {
        (2..=8).contains(&part.len()) && part.chars().all(|ch| ch.is_ascii_alphanumeric())
    })
}
1566
/// Canonicalizes the casing of an unknown locale tag: the first subtag is
/// lowercased, two-letter alphabetic subtags after it are uppercased (region
/// convention), and everything else is lowercased.
fn canonical_other(raw: &str) -> String {
    let mut out = String::with_capacity(raw.len());
    for (idx, part) in raw.split('-').enumerate() {
        if idx > 0 {
            out.push('-');
        }
        let looks_like_region =
            idx > 0 && part.len() == 2 && part.chars().all(|ch| ch.is_ascii_alphabetic());
        if looks_like_region {
            out.push_str(&part.to_ascii_uppercase());
        } else {
            out.push_str(&part.to_ascii_lowercase());
        }
    }
    out
}