1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use thiserror::Error;
10
/// A recognizer that scans raw input text for PII candidates.
///
/// `Send + Sync` so a single detector instance can be shared across
/// scanning workers.
pub trait Detector: Send + Sync {
    /// Scans `input` and returns every detected PII span.
    fn detect(&self, input: &str) -> Vec<Detection>;
}
16
17#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
45pub enum PiiClass {
46 Email,
48 Name,
50 Location,
52 Organization,
54 Custom(String),
56}
57
58pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
60
61impl PiiClass {
62 pub fn from_policy_name(input: &str) -> Option<Self> {
64 match input {
65 "email" => Some(Self::Email),
66 "name" => Some(Self::Name),
67 "location" => Some(Self::Location),
68 "organization" => Some(Self::Organization),
69 custom if custom.starts_with("custom:") => {
70 let name = custom.trim_start_matches("custom:");
71 (!name.trim().is_empty()).then(|| Self::custom(name))
72 }
73 _ => None,
74 }
75 }
76
77 pub fn builtin_variants() -> &'static [PiiClass] {
79 &[
80 PiiClass::Email,
81 PiiClass::Name,
82 PiiClass::Location,
83 PiiClass::Organization,
84 ]
85 }
86
87 pub fn custom(name: &str) -> Self {
89 let mut normalized = String::new();
90 let mut pending_underscore = false;
91 for ch in name.trim().chars() {
92 if ch.is_ascii_alphanumeric() {
93 if pending_underscore && !normalized.is_empty() {
94 normalized.push('_');
95 }
96 normalized.push(ch.to_ascii_lowercase());
97 pending_underscore = false;
98 } else {
99 pending_underscore = true;
100 }
101 }
102
103 Self::Custom(normalized)
104 }
105
106 pub fn as_custom_name(&self) -> Option<&str> {
108 match self {
109 Self::Custom(name) => Some(name.as_str()),
110 Self::Email | Self::Name | Self::Location | Self::Organization => None,
111 }
112 }
113
114 pub fn class_name(&self) -> String {
116 match self {
117 Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
118 Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
119 Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
120 Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
121 Self::Custom(name) => format!("Custom:{name}"),
122 }
123 }
124}
125
126#[derive(Debug, Clone, PartialEq, Eq)]
128#[non_exhaustive]
129pub struct Detection {
130 pub span: Range<usize>,
132 pub class: PiiClass,
134 pub source: String,
136}
137
138impl Detection {
139 pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
141 Self {
142 span,
143 class,
144 source: source.into(),
145 }
146 }
147}
148
/// A secondary detector that re-scans *clean* (already redacted) output
/// for PII that escaped the primary pipeline.
pub trait SafetyNet: Send + Sync {
    /// Stable identifier of this net (recorded on each `LeakSuspect`).
    fn id(&self) -> &str;

    /// Locales this net can meaningfully check.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Scans `clean_text` and returns suspected leaks. `context` carries
    /// the emitted-span manifest, locale chain and provenance for the run.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
176
177#[derive(Debug, Clone, Copy)]
179#[non_exhaustive]
180pub struct SafetyNetContext<'a> {
181 pub manifest: &'a Manifest,
183 pub locale_chain: &'a [LocaleTag],
187 pub document_kind: DocumentKind,
189 pub session_id: Option<&'a str>,
191 pub field_path: Option<&'a str>,
193}
194
195impl<'a> SafetyNetContext<'a> {
196 pub fn new(
198 manifest: &'a Manifest,
199 locale_chain: &'a [LocaleTag],
200 document_kind: DocumentKind,
201 session_id: Option<&'a str>,
202 field_path: Option<&'a str>,
203 ) -> Self {
204 Self {
205 manifest,
206 locale_chain,
207 document_kind,
208 session_id,
209 field_path,
210 }
211 }
212}
213
214#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
216#[non_exhaustive]
217pub struct EmittedTokenSpan {
218 pub clean_span: Range<usize>,
220 pub raw_span: Range<usize>,
222 pub class: PiiClass,
224}
225
226impl EmittedTokenSpan {
227 pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
229 Self {
230 clean_span,
231 raw_span,
232 class,
233 }
234 }
235}
236
/// Sorted list of emitted token spans for one clean document — the ground
/// truth that safety-net suspects are diffed against.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
    /// Sorted by `(clean_span.start, clean_span.end)`; see `from_spans`.
    pub spans: Vec<EmittedTokenSpan>,
}

impl Manifest {
    /// Builds a manifest, sorting spans by clean-text position so
    /// `diff_against` can binary-search and sweep them in order.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Compares one suspect span against the emitted spans and classifies
    /// any leak.
    ///
    /// Returns `None` when the suspect is empty or fully covered by spans of
    /// the same class; otherwise:
    /// - `LeakKind::Uncovered` — no emitted span overlaps the suspect;
    /// - `LeakKind::PartialBleed` — a gap inside the suspect is uncovered;
    /// - `LeakKind::ClassMismatch` — fully covered, but the first
    ///   overlapping span with a differing class wins.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        if suspect_span.is_empty() {
            return None;
        }

        // Spans are sorted by start (see `from_spans`): skip spans ending at
        // or before the suspect, then take spans until one starts past its
        // end; `ranges_overlap` filters out empty spans.
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // Sweep a cursor across the suspect; any gap before the next
        // overlapping span is an uncovered bleed.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            if span.clean_span.start > cursor {
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            if span.clean_span.end > cursor {
                // Remember only the first class disagreement.
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Ran out of spans before reaching the suspect's end.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered: report a mismatch if any overlapping span
        // disagreed with the safety net's class.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
313
314fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
315 left.start < right.end && right.start < left.end
316}
317
318#[derive(Debug, Clone, PartialEq)]
320#[non_exhaustive]
321pub struct LeakSuspect {
322 pub span: Range<usize>,
324 pub class: PiiClass,
326 pub safety_net_id: String,
328 pub score: Option<f32>,
330 pub kind: LeakKind,
332 pub raw_label: String,
334 pub field_path: Option<String>,
336}
337
338impl LeakSuspect {
339 pub fn new(
341 span: Range<usize>,
342 class: PiiClass,
343 safety_net_id: impl Into<String>,
344 score: Option<f32>,
345 kind: LeakKind,
346 raw_label: impl Into<String>,
347 field_path: Option<String>,
348 ) -> Self {
349 Self {
350 span,
351 class,
352 safety_net_id: safety_net_id.into(),
353 score,
354 kind,
355 raw_label: raw_label.into(),
356 field_path,
357 }
358 }
359}
360
/// The way a suspect span disagrees with the emitted manifest
/// (see `Manifest::diff_against`).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No emitted span overlaps the suspect at all.
    Uncovered,
    /// Part of the suspect falls outside every emitted span.
    PartialBleed {
        /// The first uncovered sub-range of the suspect.
        uncovered: Range<usize>,
    },
    /// Fully covered, but pipeline and safety net disagree on the class.
    ClassMismatch {
        /// Class recorded in the pipeline's manifest.
        pipeline_class: PiiClass,
        /// Class the safety net assigned.
        safety_net_class: PiiClass,
    },
}
382
/// Non-suspect events recorded while safety nets run.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// A safety-net run was skipped on locale grounds; the exact trigger
    /// lives with the caller that emits this event.
    LocaleSkipped {
        /// Identifier of the skipped net.
        safety_net_id: String,
        /// Kind of document being checked at the time.
        document_kind: DocumentKind,
        /// Structured-document field path, when applicable.
        field_path: Option<String>,
    },
}
397
/// Aggregate counters derived from a `LeakReport`'s suspects and telemetry
/// (see `LeakReport::from_parts`).
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Total number of suspects.
    pub suspect_count: usize,
    /// Suspects with `LeakKind::Uncovered`.
    pub uncovered_count: usize,
    /// Suspects with `LeakKind::PartialBleed`.
    pub partial_bleed_count: usize,
    /// Suspects with `LeakKind::ClassMismatch`.
    pub class_mismatch_count: usize,
    /// `LocaleSkipped` telemetry events.
    pub locale_skipped_count: usize,
}
413
414#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
421#[non_exhaustive]
422pub struct DocumentExtension {
423 pub schema_version: u16,
425 pub clean_md_sha256: [u8; 32],
427 pub layout_json_sha256: [u8; 32],
429 pub report_json_sha256: [u8; 32],
431 #[serde(default, skip_serializing_if = "Option::is_none")]
433 pub preview_png_sha256: Option<[u8; 32]>,
434 pub page_count: u32,
436 pub audit_session_id: String,
438 #[serde(default, skip_serializing_if = "Vec::is_empty")]
440 pub clean_spans: Vec<EmittedTokenSpan>,
441 #[serde(default, skip_serializing_if = "Vec::is_empty")]
443 pub codec_audit: Vec<CodecAuditRow>,
444}
445
446impl DocumentExtension {
447 pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
449 DocumentExtensionBuilder {
450 schema_version,
451 clean_md_sha256: None,
452 layout_json_sha256: None,
453 report_json_sha256: None,
454 preview_png_sha256: None,
455 page_count: None,
456 audit_session_id: None,
457 clean_spans: Vec::new(),
458 codec_audit: Vec::new(),
459 }
460 }
461}
462
463#[derive(Debug, Clone)]
465#[must_use]
466pub struct DocumentExtensionBuilder {
467 schema_version: u16,
468 clean_md_sha256: Option<[u8; 32]>,
469 layout_json_sha256: Option<[u8; 32]>,
470 report_json_sha256: Option<[u8; 32]>,
471 preview_png_sha256: Option<[u8; 32]>,
472 page_count: Option<u32>,
473 audit_session_id: Option<String>,
474 clean_spans: Vec<EmittedTokenSpan>,
475 codec_audit: Vec<CodecAuditRow>,
476}
477
478impl DocumentExtensionBuilder {
479 pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
480 self.clean_md_sha256 = Some(hash);
481 self
482 }
483
484 pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
485 self.layout_json_sha256 = Some(hash);
486 self
487 }
488
489 pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
490 self.report_json_sha256 = Some(hash);
491 self
492 }
493
494 pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
495 self.preview_png_sha256 = Some(hash);
496 self
497 }
498
499 pub fn page_count(mut self, page_count: u32) -> Self {
500 self.page_count = Some(page_count);
501 self
502 }
503
504 pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
505 self.audit_session_id = Some(audit_session_id.into());
506 self
507 }
508
509 pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
510 self.clean_spans = clean_spans;
511 self
512 }
513
514 pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
515 self.codec_audit = codec_audit;
516 self
517 }
518
519 pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
520 Ok(DocumentExtension {
521 schema_version: self.schema_version,
522 clean_md_sha256: self
523 .clean_md_sha256
524 .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
525 layout_json_sha256: self
526 .layout_json_sha256
527 .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
528 report_json_sha256: self
529 .report_json_sha256
530 .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
531 preview_png_sha256: self.preview_png_sha256,
532 page_count: self
533 .page_count
534 .ok_or(DocumentExtensionError::MissingField("page_count"))?,
535 audit_session_id: self
536 .audit_session_id
537 .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
538 clean_spans: self.clean_spans,
539 codec_audit: self.codec_audit,
540 })
541 }
542}
543
/// Errors from `DocumentExtensionBuilder::build`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required builder field was never set; payload is the field name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
551
/// How the analyzed text was obtained from the source document.
/// Serialized in snake_case for the codec audit trail.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Optical character recognition.
    Ocr,
    /// Text embedded in the source file itself.
    EmbeddedText,
    /// Transcribed content.
    Transcript,
    /// A mix of the above.
    Hybrid,
}
566
567#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
569#[non_exhaustive]
570pub struct CodecCapabilitySet {
571 pub text: bool,
573 pub layout: bool,
575 pub confidence: bool,
577 pub timestamps: bool,
579}
580
581impl CodecCapabilitySet {
582 pub const TEXT_ONLY: Self = Self {
584 text: true,
585 layout: false,
586 confidence: false,
587 timestamps: false,
588 };
589
590 pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
592 Self {
593 text,
594 layout,
595 confidence,
596 timestamps,
597 }
598 }
599
600 pub fn contains(self, requested: Self) -> bool {
602 (!requested.text || self.text)
603 && (!requested.layout || self.layout)
604 && (!requested.confidence || self.confidence)
605 && (!requested.timestamps || self.timestamps)
606 }
607}
608
609#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
611#[serde(rename_all = "snake_case")]
612#[non_exhaustive]
613pub enum ExtractionDensityPolicy {
614 Required(f32),
616 Exempt { reason: String },
618}
619
620impl Default for ExtractionDensityPolicy {
621 fn default() -> Self {
622 Self::Exempt {
623 reason: "calibration_pending".to_string(),
624 }
625 }
626}
627
628#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
630#[non_exhaustive]
631pub struct CodecAuditRow {
632 pub codec_id: String,
634 pub codec_version: String,
636 pub accepted_mime: String,
638 pub advertised: CodecCapabilitySet,
640 pub delivered: CodecCapabilitySet,
642 pub text_origin: TextOrigin,
644 pub codec_output_schema_version: u16,
646 #[serde(default, skip_serializing_if = "Option::is_none")]
648 pub options_hash_hex: Option<String>,
649 #[serde(default, skip_serializing_if = "Option::is_none")]
651 pub engine_provenance: Option<String>,
652 pub extraction_density_policy: ExtractionDensityPolicy,
654}
655
656impl CodecAuditRow {
657 pub fn new(
659 codec_id: impl Into<String>,
660 codec_version: impl Into<String>,
661 accepted_mime: impl Into<String>,
662 text_origin: TextOrigin,
663 ) -> Self {
664 Self {
665 codec_id: codec_id.into(),
666 codec_version: codec_version.into(),
667 accepted_mime: accepted_mime.into(),
668 advertised: CodecCapabilitySet::default(),
669 delivered: CodecCapabilitySet::default(),
670 text_origin,
671 codec_output_schema_version: 1,
672 options_hash_hex: None,
673 engine_provenance: None,
674 extraction_density_policy: ExtractionDensityPolicy::default(),
675 }
676 }
677}
678
679#[derive(Debug, Clone, Default, PartialEq)]
685#[non_exhaustive]
686pub struct LeakReport {
687 pub suspects: Vec<LeakSuspect>,
689 pub telemetry: Vec<LeakReportTelemetry>,
691 pub stats: LeakReportStats,
693 pub replay_hash: Option<String>,
698}
699
700impl LeakReport {
701 pub fn from_parts(
703 suspects: Vec<LeakSuspect>,
704 telemetry: Vec<LeakReportTelemetry>,
705 ) -> LeakReport {
706 let mut stats = LeakReportStats {
707 suspect_count: suspects.len(),
708 locale_skipped_count: telemetry
709 .iter()
710 .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
711 .count(),
712 ..LeakReportStats::default()
713 };
714 for suspect in &suspects {
715 match suspect.kind {
716 LeakKind::Uncovered => stats.uncovered_count += 1,
717 LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
718 LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
719 }
720 }
721 LeakReport {
722 suspects,
723 telemetry,
724 stats,
725 replay_hash: None,
726 }
727 }
728
729 pub fn extend(&mut self, other: LeakReport) {
731 self.suspects.extend(other.suspects);
732 self.telemetry.extend(other.telemetry);
733 *self = LeakReport::from_parts(
734 std::mem::take(&mut self.suspects),
735 std::mem::take(&mut self.telemetry),
736 );
737 }
738}
739
/// Raw label vocabulary of the OpenAI-style private-data tagger.
///
/// NOTE(review): these mirror an external model's label set; the strings
/// in `as_str` are interchange values and must stay stable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    PrivatePerson,
    PrivateAddress,
    PrivateEmail,
    PrivatePhone,
    PrivateUrl,
    PrivateDate,
    AccountNumber,
    Secret,
}

impl OpenAiPrivateLabel {
    /// snake_case string form of the label.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::PrivatePerson => "private_person",
            Self::PrivateAddress => "private_address",
            Self::PrivateEmail => "private_email",
            Self::PrivatePhone => "private_phone",
            Self::PrivateUrl => "private_url",
            Self::PrivateDate => "private_date",
            Self::AccountNumber => "account_number",
            Self::Secret => "secret",
        }
    }
}
777
/// Closed label set that safety nets report, mapped onto the pipeline's
/// `PiiClass` via `to_pii_class`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    Email,
    Name,
    Location,
    Phone,
    Url,
    Date,
    AccountNumber,
    Secret,
}

impl SafetyNetPiiClass {
    /// Maps onto the pipeline's class model: the first three hit built-in
    /// variants, the rest become normalized `Custom` classes.
    pub fn to_pii_class(self) -> PiiClass {
        match self {
            Self::Email => PiiClass::Email,
            Self::Name => PiiClass::Name,
            Self::Location => PiiClass::Location,
            Self::Phone => PiiClass::custom("phone"),
            Self::Url => PiiClass::custom("url"),
            Self::Date => PiiClass::custom("date"),
            Self::AccountNumber => PiiClass::custom("account_number"),
            Self::Secret => PiiClass::custom("secret"),
        }
    }
}
815
/// Failures a safety net can report from `SafetyNet::check`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// The net cannot run in this environment.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        reason: String,
    },
    /// Model weights were not found at the given path.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        path: String,
    },
    /// The model itself could not be loaded or reached.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        reason: String,
    },
    /// Input exceeded the net's size limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        limit: usize,
        actual: usize,
    },
    /// The net started but failed while running.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        message: String,
    },
    /// The net produced output the pipeline could not interpret.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        message: String,
    },
}
859
/// Redaction action applied to a detected span.
///
/// NOTE(review): variant semantics are implemented by the redaction engine
/// elsewhere; names are taken at face value here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    Tokenize,
    Redact,
    FormatPreserve,
    Generalize,
    Preserve,
}
887
/// Which tie-breaking rule decided the winner when detections conflicted.
///
/// NOTE(review): tier semantics live in the conflict resolver elsewhere;
/// names are taken at face value here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict occurred.
    None,
    ClassPriority,
    RulePriority,
    Score,
    SpanLength,
    Validator,
    RecognizerId,
    Merged,
}
909
/// Shape of the document being processed (cf. `RawDocument`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Field-structured document (key/value map).
    Structured,
    /// Free-form text.
    Text,
}
919
920#[derive(Debug, Clone, PartialEq, Eq)]
930#[non_exhaustive]
931pub struct RedactionEntry {
932 pub source: String,
934 pub class: PiiClass,
936 pub action: Action,
938 pub field_name: Option<String>,
940 pub document_kind: DocumentKind,
942 pub conflict_loser: bool,
944 pub decided_by: ConflictTier,
946 pub created_at: i64,
948 pub session_id: Option<String>,
950}
951
952impl RedactionEntry {
953 #[allow(clippy::too_many_arguments)]
955 pub fn new(
956 source: impl Into<String>,
957 class: PiiClass,
958 action: Action,
959 field_name: Option<String>,
960 document_kind: DocumentKind,
961 conflict_loser: bool,
962 decided_by: ConflictTier,
963 created_at: i64,
964 session_id: Option<String>,
965 ) -> Self {
966 Self {
967 source: source.into(),
968 class,
969 action,
970 field_name,
971 document_kind,
972 conflict_loser,
973 decided_by,
974 created_at,
975 session_id,
976 }
977 }
978}
979
/// Errors surfaced by `RedactionLogger` backends.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// Failure from the SQLite-backed logger; payload is the message.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Failure from some other logging backend.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
991
/// Sink for redaction audit entries.
///
/// `Send + Sync` so one logger can serve concurrent pipeline workers.
pub trait RedactionLogger: Send + Sync {
    /// Records a single redaction decision.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
1025
/// A locale the pipeline can target: the `Global` fallback, a closed set
/// of known region tags, or `Other` for any further parseable tag.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-independent fallback; matches every chain (see
    /// `LocaleChain::intersects`).
    Global,
    DeDe,
    DeAt,
    DeCh,
    EnUs,
    EnGb,
    EnIe,
    EnAu,
    EnCa,
    /// Any other tag accepted by the BCP-47 check in `LocaleTag::parse`;
    /// stored in canonicalized form.
    Other(String),
}
1051
/// Error returned by `LocaleTag::parse` for unrecognized input.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input is empty or not a parseable locale tag.
    Unsupported,
}

impl fmt::Display for LocaleError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            LocaleError::Unsupported => f.write_str("unsupported locale"),
        }
    }
}

// Marker impl: `LocaleError` carries no underlying source error.
impl std::error::Error for LocaleError {}
1069
/// Ordered list of active locales. Construction always runs the tags
/// through `ensure_global` (defined elsewhere in this file).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
1073
impl LocaleTag {
    /// Convenience alias for the locale-independent fallback tag.
    pub const GLOBAL: LocaleTag = LocaleTag::Global;

    /// Parses a user-supplied locale string.
    ///
    /// Normalization: trims whitespace, maps `_` to `-`, and matches the
    /// known tags case-insensitively. Unknown non-empty tags fall through
    /// to the BCP-47 check (helpers defined elsewhere in this file) and
    /// become `Other`; empty input is rejected.
    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
        let raw = s.trim().replace('_', "-");
        let normalized = raw.to_ascii_lowercase();
        match normalized.as_str() {
            "global" | "*" => Ok(LocaleTag::Global),
            "de-de" => Ok(LocaleTag::DeDe),
            "de-at" => Ok(LocaleTag::DeAt),
            "de-ch" => Ok(LocaleTag::DeCh),
            "en-us" => Ok(LocaleTag::EnUs),
            "en-gb" => Ok(LocaleTag::EnGb),
            "en-ie" => Ok(LocaleTag::EnIe),
            "en-au" => Ok(LocaleTag::EnAu),
            "en-ca" => Ok(LocaleTag::EnCa),
            // Must precede the guarded catch-all so "" never reaches the
            // BCP-47 check.
            "" => Err(LocaleError::Unsupported),
            // `raw` (not `normalized`) is passed on so the original casing
            // is available to `canonical_other`.
            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
            _ => Err(LocaleError::Unsupported),
        }
    }

    /// Canonical display form of the tag (e.g. `"de-DE"`, `"global"`).
    pub fn as_str(&self) -> &str {
        match self {
            LocaleTag::Global => "global",
            LocaleTag::DeDe => "de-DE",
            LocaleTag::DeAt => "de-AT",
            LocaleTag::DeCh => "de-CH",
            LocaleTag::EnUs => "en-US",
            LocaleTag::EnGb => "en-GB",
            LocaleTag::EnIe => "en-IE",
            LocaleTag::EnAu => "en-AU",
            LocaleTag::EnCa => "en-CA",
            LocaleTag::Other(tag) => tag.as_str(),
        }
    }
}
1114
1115impl LocaleChain {
1116 pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
1118 ensure_global(&mut tags);
1119 LocaleChain(tags)
1120 }
1121
1122 pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
1124 let tags = raw
1125 .split(',')
1126 .map(LocaleTag::parse)
1127 .collect::<Result<Vec<_>, _>>()?;
1128 Ok(LocaleChain::from_tags(tags))
1129 }
1130
1131 pub fn merge_policy_and_cli(
1133 policy: Option<&[LocaleTag]>,
1134 cli: Option<&[LocaleTag]>,
1135 ) -> LocaleChain {
1136 Self::merge_cli_policy_rulepack_default(cli, policy, None)
1137 }
1138
1139 pub fn merge_cli_policy_rulepack_default(
1141 cli: Option<&[LocaleTag]>,
1142 policy: Option<&[LocaleTag]>,
1143 rulepack_defaults: Option<&[LocaleTag]>,
1144 ) -> LocaleChain {
1145 let tags = cli
1146 .filter(|tags| !tags.is_empty())
1147 .or_else(|| policy.filter(|tags| !tags.is_empty()))
1148 .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
1149 .map(|tags| tags.to_vec())
1150 .unwrap_or_else(|| vec![LocaleTag::Global]);
1151 LocaleChain::from_tags(tags)
1152 }
1153
1154 pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
1156 if recognizer_locales.is_empty() {
1157 return true;
1158 }
1159 recognizer_locales.iter().any(|recognizer_locale| {
1160 *recognizer_locale == LocaleTag::Global
1161 || self.0.iter().any(|active| active == recognizer_locale)
1162 })
1163 }
1164
1165 pub fn as_slice(&self) -> &[LocaleTag] {
1167 &self.0
1168 }
1169
1170 pub fn to_strings(&self) -> Vec<String> {
1172 self.0.iter().map(ToString::to_string).collect()
1173 }
1174}
1175
1176impl From<&[LocaleTag]> for LocaleChain {
1177 fn from(tags: &[LocaleTag]) -> Self {
1178 let mut owned = tags.to_vec();
1179 ensure_global(&mut owned);
1180 LocaleChain(owned)
1181 }
1182}
1183
1184impl fmt::Display for LocaleTag {
1185 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1186 f.write_str(self.as_str())
1187 }
1188}
1189
/// An input document before cleaning: either a structured field map or
/// free-form text.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Structured document as an ordered field map.
    Structured(BTreeMap<String, Value>),
    /// Unstructured plain text.
    Text(String),
}
1207
/// A document after cleaning, mirroring `RawDocument`'s two shapes.
/// Serializes untagged, i.e. as the plain JSON value itself.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Structured document as an ordered field map.
    Structured(BTreeMap<String, Value>),
    /// Unstructured plain text.
    Text(String),
}
1235
/// Minimal JSON-like value model for structured documents.
/// Serializes untagged (plain JSON shapes); ordered maps via `BTreeMap`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    Null,
    Bool(bool),
    String(String),
    I64(i64),
    Array(Vec<Value>),
    Object(BTreeMap<String, Value>),
}
1254
1255impl Value {
1256 pub fn as_str(&self) -> Option<&str> {
1258 match self {
1259 Self::String(value) => Some(value.as_str()),
1260 Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
1261 }
1262 }
1263
1264 pub fn scalar_to_safety_net_string(&self) -> Option<String> {
1266 match self {
1267 Self::String(value) if !value.is_empty() => Some(value.clone()),
1268 Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
1269 Self::Bool(value) => Some(value.to_string()),
1270 Self::I64(value) => Some(value.to_string()),
1271 }
1272 }
1273}
1274
1275impl PartialEq<&str> for Value {
1276 fn eq(&self, other: &&str) -> bool {
1277 self.as_str() == Some(*other)
1278 }
1279}
1280
/// Named dictionaries available to dictionary-based recognizers.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    // Keyed by dictionary name.
    entries: HashMap<String, DictionaryEntry>,
}
1286
/// A validated term list (see `DictionaryEntry::new` for the rules).
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Literal terms to match.
    terms: Vec<String>,
    // Whether matching is case-sensitive.
    case_sensitive: bool,
    // Where this dictionary was loaded from.
    source: DictionarySource,
}
1294
/// Origin of a loaded dictionary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Supplied on the command line.
    Cli,
    /// Shipped inside a rulepack.
    Rulepack,
}
1304
1305#[derive(Debug, Clone, PartialEq, Eq)]
1307#[non_exhaustive]
1308pub struct DictionaryStats {
1309 pub name: String,
1311 pub term_count: usize,
1313 pub source: DictionarySource,
1315}
1316
1317impl DictionaryStats {
1318 pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
1320 Self {
1321 name: name.into(),
1322 term_count,
1323 source,
1324 }
1325 }
1326}
1327
/// A dictionary as shipped inside a rulepack, before validation.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name.
    pub name: String,
    /// Literal terms to match.
    pub terms: Vec<String>,
    /// Whether terms match case-sensitively.
    pub case_sensitive: bool,
}

impl RulepackDict {
    /// Builds a rulepack dictionary, converting `name` into an owned string.
    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
        let name = name.into();
        RulepackDict {
            name,
            terms,
            case_sensitive,
        }
    }
}
1350
/// Validation failures when loading a dictionary.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary contains no terms.
    Empty { name: String },
    /// Case-insensitive matching was requested for non-ASCII terms,
    /// which is unsupported (see `DictionaryEntry::new`).
    UnicodeInsensitiveUnsupported { name: String },
}

impl fmt::Display for DictionaryLoadError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
            Self::UnicodeInsensitiveUnsupported { name } => write!(
                f,
                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
            ),
        }
    }
}

// Marker impl: no underlying source error to expose.
impl std::error::Error for DictionaryLoadError {}
1374
1375impl DictionaryBundle {
1376 pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
1378 let mut entries = HashMap::with_capacity(terms.len());
1379 for dictionary in terms {
1380 let entry = DictionaryEntry::new(
1381 &dictionary.name,
1382 dictionary.terms.clone(),
1383 dictionary.case_sensitive,
1384 DictionarySource::Rulepack,
1385 )
1386 .expect("Policy validates dictionary terms before bundle construction");
1387 entries.insert(dictionary.name.clone(), entry);
1388 }
1389 Self { entries }
1390 }
1391
1392 pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
1394 Self {
1395 entries: entries.into_iter().collect(),
1396 }
1397 }
1398
1399 pub fn merge(a: Self, b: Self) -> Self {
1401 let mut entries = a.entries;
1402 entries.extend(b.entries);
1403 Self { entries }
1404 }
1405
1406 pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
1408 self.entries.get(name)
1409 }
1410
1411 pub fn stats(&self) -> Vec<DictionaryStats> {
1413 let mut stats = self
1414 .entries
1415 .iter()
1416 .map(|(name, entry)| DictionaryStats {
1417 name: name.clone(),
1418 term_count: entry.terms.len(),
1419 source: entry.source,
1420 })
1421 .collect::<Vec<_>>();
1422 stats.sort_by(|a, b| a.name.cmp(&b.name));
1423 stats
1424 }
1425}
1426
1427impl DictionaryEntry {
1428 pub fn new(
1430 name: &str,
1431 terms: Vec<String>,
1432 case_sensitive: bool,
1433 source: DictionarySource,
1434 ) -> Result<Self, DictionaryLoadError> {
1435 if terms.is_empty() {
1436 return Err(DictionaryLoadError::Empty {
1437 name: name.to_string(),
1438 });
1439 }
1440 if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
1441 return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
1442 name: name.to_string(),
1443 });
1444 }
1445 Ok(Self {
1446 terms,
1447 case_sensitive,
1448 source,
1449 })
1450 }
1451
1452 pub fn case_sensitive(&self) -> bool {
1454 self.case_sensitive
1455 }
1456
1457 pub fn terms(&self) -> &[String] {
1459 &self.terms
1460 }
1461}
1462
1463#[cfg(test)]
1464mod document_extension_tests {
1465 use super::*;
1466
1467 fn audit_row() -> CodecAuditRow {
1468 let mut row = CodecAuditRow::new(
1469 "gaze.codec.tesseract",
1470 "gaze-codec-tesseract@0.7.1",
1471 "image/png",
1472 TextOrigin::Ocr,
1473 );
1474 row.advertised = CodecCapabilitySet::new(true, true, true, false);
1475 row.delivered = CodecCapabilitySet::new(true, true, false, false);
1476 row.extraction_density_policy = ExtractionDensityPolicy::Required(1.0);
1477 row
1478 }
1479
1480 fn extension_builder() -> DocumentExtensionBuilder {
1481 DocumentExtension::builder(1)
1482 .clean_md_sha256([1; 32])
1483 .layout_json_sha256([2; 32])
1484 .report_json_sha256([3; 32])
1485 .page_count(2)
1486 .audit_session_id("018f0000-0000-7000-8000-000000000000")
1487 }
1488
1489 #[test]
1490 fn document_extension_round_trips_with_bundle_root_schema_version() {
1491 let mut row = audit_row();
1492 row.options_hash_hex = Some("00".repeat(32));
1493 row.engine_provenance = Some("tesseract@5.3.4".to_string());
1494 let extension = extension_builder()
1495 .preview_png_sha256([4; 32])
1496 .clean_spans(vec![EmittedTokenSpan::new(0..8, 0..12, PiiClass::Email)])
1497 .codec_audit(vec![row])
1498 .build()
1499 .expect("document extension");
1500
1501 let json = serde_json::to_value(&extension).expect("serialize document extension");
1502
1503 assert_eq!(json["schema_version"], 1);
1504 assert_eq!(json["clean_md_sha256"].as_array().expect("hash").len(), 32);
1505 assert_eq!(
1506 json["layout_json_sha256"].as_array().expect("hash").len(),
1507 32
1508 );
1509 assert_eq!(
1510 json["report_json_sha256"].as_array().expect("hash").len(),
1511 32
1512 );
1513 assert_eq!(
1514 json["preview_png_sha256"].as_array().expect("hash").len(),
1515 32
1516 );
1517 assert_eq!(json["page_count"], 2);
1518 assert_eq!(
1519 json["audit_session_id"],
1520 "018f0000-0000-7000-8000-000000000000"
1521 );
1522 assert_eq!(json["clean_spans"].as_array().expect("spans").len(), 1);
1523 assert!(json.get("clean_schema_version").is_none());
1524 assert!(json.get("layout_schema_version").is_none());
1525 assert!(json.get("report_schema_version").is_none());
1526 assert!(json.get("manifest_schema_version").is_none());
1527
1528 let decoded: DocumentExtension =
1529 serde_json::from_value(json).expect("deserialize document extension");
1530 assert_eq!(decoded, extension);
1531 }
1532
1533 #[test]
1534 fn document_extension_carries_full_integrity_set() {
1535 let extension = DocumentExtension::builder(1)
1536 .clean_md_sha256([10; 32])
1537 .layout_json_sha256([11; 32])
1538 .report_json_sha256([12; 32])
1539 .preview_png_sha256([13; 32])
1540 .page_count(7)
1541 .audit_session_id("018f0000-0000-7000-8000-000000000001")
1542 .clean_spans(vec![EmittedTokenSpan::new(5..14, 20..34, PiiClass::Name)])
1543 .codec_audit(vec![audit_row()])
1544 .build()
1545 .expect("document extension");
1546
1547 let json = serde_json::to_string(&extension).expect("serialize document extension");
1548 let decoded: DocumentExtension =
1549 serde_json::from_str(&json).expect("deserialize document extension");
1550
1551 assert_eq!(decoded, extension);
1552 assert_eq!(decoded.clean_md_sha256, [10; 32]);
1553 assert_eq!(decoded.layout_json_sha256, [11; 32]);
1554 assert_eq!(decoded.report_json_sha256, [12; 32]);
1555 assert_eq!(decoded.preview_png_sha256, Some([13; 32]));
1556 assert_eq!(decoded.page_count, 7);
1557 assert_eq!(
1558 decoded.audit_session_id,
1559 "018f0000-0000-7000-8000-000000000001"
1560 );
1561 assert_eq!(decoded.clean_spans.len(), 1);
1562 assert_eq!(decoded.codec_audit.len(), 1);
1563 }
1564
1565 #[test]
1566 fn document_extension_builder_requires_integrity_fields() {
1567 assert_eq!(
1568 DocumentExtension::builder(1).build(),
1569 Err(DocumentExtensionError::MissingField("clean_md_sha256"))
1570 );
1571 assert_eq!(
1572 DocumentExtension::builder(1)
1573 .clean_md_sha256([1; 32])
1574 .layout_json_sha256([2; 32])
1575 .report_json_sha256([3; 32])
1576 .page_count(1)
1577 .build(),
1578 Err(DocumentExtensionError::MissingField("audit_session_id"))
1579 );
1580 }
1581
1582 #[test]
1583 fn codec_audit_row_round_trips_without_raw_pii_fields() {
1584 let row = audit_row();
1585 let json = serde_json::to_string(&row).expect("serialize codec audit row");
1586
1587 assert!(json.contains("\"codec_id\""));
1588 assert!(!json.contains("alice@example.invalid"));
1589 assert!(!json.contains("\"raw\""));
1590 assert_eq!(
1591 serde_json::from_str::<CodecAuditRow>(&json).expect("deserialize codec audit row"),
1592 row
1593 );
1594 }
1595
1596 #[test]
1597 fn text_origin_round_trips() {
1598 for origin in [
1599 TextOrigin::Ocr,
1600 TextOrigin::EmbeddedText,
1601 TextOrigin::Transcript,
1602 TextOrigin::Hybrid,
1603 ] {
1604 let json = serde_json::to_string(&origin).expect("serialize text origin");
1605 let decoded: TextOrigin = serde_json::from_str(&json).expect("deserialize text origin");
1606 assert_eq!(decoded, origin);
1607 }
1608 }
1609
1610 #[test]
1611 fn codec_capability_set_round_trips_and_contains_requested_bits() {
1612 let delivered = CodecCapabilitySet::new(true, true, false, false);
1613
1614 let json = serde_json::to_string(&delivered).expect("serialize capabilities");
1615 let decoded: CodecCapabilitySet =
1616 serde_json::from_str(&json).expect("deserialize capabilities");
1617
1618 assert_eq!(decoded, delivered);
1619 assert!(decoded.contains(CodecCapabilitySet::TEXT_ONLY));
1620 assert!(!decoded.contains(CodecCapabilitySet::new(true, true, true, false)));
1621 }
1622
1623 #[test]
1624 fn extraction_density_policy_round_trips_closed_variants() {
1625 for policy in [
1626 ExtractionDensityPolicy::Required(1.25),
1627 ExtractionDensityPolicy::Exempt {
1628 reason: "text_only".to_string(),
1629 },
1630 ] {
1631 let json = serde_json::to_string(&policy).expect("serialize density policy");
1632 let decoded: ExtractionDensityPolicy =
1633 serde_json::from_str(&json).expect("deserialize density policy");
1634 assert_eq!(decoded, policy);
1635 }
1636 }
1637
1638 #[test]
1639 fn manifest_stats_round_trip_for_document_report_mirrors() {
1640 let manifest =
1641 Manifest::from_spans(vec![EmittedTokenSpan::new(0..15, 0..19, PiiClass::Email)]);
1642 let stats = LeakReportStats {
1643 suspect_count: 1,
1644 uncovered_count: 0,
1645 partial_bleed_count: 0,
1646 class_mismatch_count: 0,
1647 locale_skipped_count: 0,
1648 };
1649
1650 let manifest_json = serde_json::to_string(&manifest).expect("serialize manifest");
1651 let stats_json = serde_json::to_string(&stats).expect("serialize stats");
1652
1653 assert_eq!(
1654 serde_json::from_str::<Manifest>(&manifest_json).expect("deserialize manifest"),
1655 manifest
1656 );
1657 assert_eq!(
1658 serde_json::from_str::<LeakReportStats>(&stats_json).expect("deserialize stats"),
1659 stats
1660 );
1661 }
1662}
1663
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    /// A dictionary with zero terms must fail closed with `Empty`.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let err = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli)
            .expect_err("empty dictionaries must fail closed");

        assert!(matches!(err, DictionaryLoadError::Empty { name } if name == "empty"));
    }

    /// Case-insensitive matching is ASCII-only: a non-ASCII term in a
    /// case-insensitive dictionary must fail closed rather than silently
    /// mismatch.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let err = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli)
            .expect_err("unicode case-insensitive dictionaries must fail closed");

        assert!(matches!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
        ));
    }
}
1692
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    /// Minimal local implementor used to prove the trait is implementable.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    /// Compile-time probe: instantiating this requires `T: Send + Sync`.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    /// The `Display` strings of log errors are part of the public contract.
    #[test]
    fn redaction_log_error_display_is_stable() {
        let sqlite = RedactionLogError::Sqlite("write failed".to_string());
        assert_eq!(
            sqlite.to_string(),
            "sqlite redaction log error: write failed"
        );

        let backend = RedactionLogError::Backend("sink failed".to_string());
        assert_eq!(
            backend.to_string(),
            "backend redaction log error: sink failed"
        );
    }

    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    /// A local implementor is usable through a `&dyn RedactionLogger`.
    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
        };

        let sink: &dyn RedactionLogger = &CapturingLogger;
        sink.log(&entry).expect("log entry");
    }
}
1743
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    // Shorthand: a token span whose clean and raw ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    // Shorthand for diffing a safety-net suspect range against the manifest.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    // A suspect exactly covered by one same-class token: no leak.
    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    // A suspect disjoint from every token is fully uncovered.
    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    // Partial coverage with one interior gap reports that gap.
    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    // With several gaps, the FIRST uncovered gap is reported deterministically.
    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    // When several different-class tokens overlap the suspect, the first
    // overlapping token's class is reported as the pipeline class.
    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);

        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    // Back-to-back same-class tokens count as continuous coverage (no gap at
    // the shared boundary).
    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);

        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    // Leading, trailing, and middle gaps are each reported as PartialBleed
    // with the exact uncovered range.
    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    // Manifest spans are BYTE offsets: a multi-byte char before the token
    // shifts the byte index past its char index.
    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);

        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    // A zero-length suspect range cannot leak anything.
    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    // Rendered SafetyNetError messages must never echo raw PII content.
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
1910
/// A detector for one PII class: scans text and yields [`Candidate`]s.
///
/// Implementations must be thread-safe (`Send + Sync`) so they can be shared
/// across a detection pipeline.
pub trait Recognizer: Send + Sync {
    /// Stable identifier for this recognizer.
    fn id(&self) -> &str;
    /// The single [`PiiClass`] this recognizer produces candidates for.
    fn supported_class(&self) -> &PiiClass;
    /// Scans `input` and returns all candidate matches, using `ctx` for the
    /// locale chain and dictionary bundle.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Name of the token family this recognizer's detections belong to.
    fn token_family(&self) -> &str;
    /// Locales this recognizer applies to; defaults to the global locale only.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
1926
/// A single PII match produced by a [`Recognizer`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Byte range of the match within the scanned input.
    pub span: Range<usize>,
    /// PII class assigned to the match.
    pub class: PiiClass,
    /// Identifier of the recognizer that produced this candidate.
    pub recognizer_id: String,
    /// Confidence score reported by the recognizer.
    pub score: f32,
    /// Recognizer priority; presumably used to break conflicts between
    /// overlapping candidates — confirm at the conflict-resolution site.
    pub priority: i32,
    /// Normalized form of the matched text, when the recognizer provides one.
    pub canonical_form: Option<String>,
    /// Token family the candidate's replacement token belongs to.
    pub token_family: String,
    /// Provenance label for the candidate.
    pub source: String,
    /// Conflict tier that decided this candidate.
    pub decided_by: ConflictTier,
    /// Source labels merged into this candidate, if any.
    pub merged_sources: Vec<String>,
}
1952
1953impl Candidate {
1954 #[allow(clippy::too_many_arguments)]
1956 pub fn new(
1957 span: Range<usize>,
1958 class: PiiClass,
1959 recognizer_id: impl Into<String>,
1960 score: f32,
1961 priority: i32,
1962 canonical_form: Option<String>,
1963 token_family: impl Into<String>,
1964 source: impl Into<String>,
1965 decided_by: ConflictTier,
1966 merged_sources: Vec<String>,
1967 ) -> Self {
1968 Self {
1969 span,
1970 class,
1971 recognizer_id: recognizer_id.into(),
1972 score,
1973 priority,
1974 canonical_form,
1975 token_family: token_family.into(),
1976 source: source.into(),
1977 decided_by,
1978 merged_sources,
1979 }
1980 }
1981
1982 pub fn with_span(mut self, span: Range<usize>) -> Self {
1984 self.span = span;
1985 self
1986 }
1987}
1988
/// Shared context passed to every [`Recognizer::detect`] call.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Ordered locale chain for this detection run.
    pub locale_chain: &'a [LocaleTag],
    /// Dictionary bundle available to dictionary-backed recognizers.
    pub dictionaries: &'a DictionaryBundle,
    /// Placeholder for structured-field context; currently always the unit
    /// value (see `DetectContext::new`).
    pub fields: &'a (),
    /// Interior-mutable flag recognizers can set through a shared reference;
    /// presumably signals degraded detection — confirm at the consumer.
    pub degraded: Cell<bool>,
}
2001
2002impl<'a> DetectContext<'a> {
2003 pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
2005 Self {
2006 locale_chain,
2007 dictionaries,
2008 fields: &(),
2009 degraded: Cell::new(false),
2010 }
2011 }
2012}
2013
2014fn ensure_global(tags: &mut Vec<LocaleTag>) {
2015 if !tags.contains(&LocaleTag::Global) {
2016 tags.push(LocaleTag::Global);
2017 }
2018}
2019
/// Loose well-formedness check for a BCP 47-style locale tag.
///
/// Accepts `language("-" subtag)*` where the language is 2-8 ASCII letters
/// and every following subtag is 2-8 ASCII alphanumerics. This is
/// deliberately a simplified superset check, not a full RFC 5646 parser
/// (e.g. single-letter singletons such as `x`/`u` are rejected).
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut subtags = raw.split('-');
    let language = match subtags.next() {
        Some(first) => first,
        None => return false,
    };
    let language_ok =
        (2..=8).contains(&language.len()) && language.bytes().all(|b| b.is_ascii_alphabetic());
    language_ok
        && subtags.all(|tag| {
            (2..=8).contains(&tag.len()) && tag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
2032
/// Canonicalizes the case of a non-builtin locale tag.
///
/// The language (first) subtag is lowercased; any later two-letter subtag is
/// treated as a region and uppercased; every other subtag is lowercased.
/// (Script subtags such as `Hans` are intentionally lowercased too — this is
/// the crate's own normal form, not full BCP 47 title-casing.)
fn canonical_other(raw: &str) -> String {
    let mut canonical = String::with_capacity(raw.len());
    for (index, part) in raw.split('-').enumerate() {
        if index > 0 {
            canonical.push('-');
        }
        let looks_like_region =
            index > 0 && part.len() == 2 && part.bytes().all(|b| b.is_ascii_alphabetic());
        if looks_like_region {
            canonical.push_str(&part.to_ascii_uppercase());
        } else {
            canonical.push_str(&part.to_ascii_lowercase());
        }
    }
    canonical
}