// gaze_types — crate root (lib.rs)
1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use sha3::{Digest, Keccak256};
10use thiserror::Error;
11
/// Shared detector contract for text-only PII detection.
///
/// Implementations must be `Send + Sync` so a detector can be shared across
/// pipeline workers.
pub trait Detector: Send + Sync {
    /// Detect PII spans in the supplied input string.
    ///
    /// Returned [`Detection`]s carry byte ranges into `input`.
    fn detect(&self, input: &str) -> Vec<Detection>;
}
17
/// The category of a detected PII span.
///
/// Built-in variants: `Email`, `Name`, `Location`, `Organization`. Tenant-specific PII
/// (case references, titles, internal codes) is carried as `PiiClass::Custom(String)`.
/// **There is no `Phone` variant** -- phone detection is provided by recognizers in
/// `gaze-recognizers` and surfaces as either a `Custom("phone")` class or a class
/// defined by a rulepack.
///
/// `PiiClass` is exhaustive (deliberately not `#[non_exhaustive]`). Match every
/// variant explicitly so new built-in classes force call sites to review their
/// handling at compile time:
///
/// ```rust
/// use gaze_types::PiiClass;
///
/// fn label(class: &PiiClass) -> &'static str {
///     match class {
///         PiiClass::Email        => "email",
///         PiiClass::Name         => "name",
///         PiiClass::Location     => "location",
///         PiiClass::Organization => "org",
///         PiiClass::Custom(_)    => "pii",
///     }
/// }
/// ```
///
/// Policy TOML uses the lowercase forms `email` / `name` / `location` / `organization`,
/// and tenant classes are spelled like `custom:case_ref` (lowercase, snake_case).
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Email address class.
    Email,
    /// Person name class.
    Name,
    /// Location class.
    Location,
    /// Organization class.
    Organization,
    /// Tenant- or policy-defined class.
    ///
    /// Build via [`PiiClass::custom`] to get the normalized lowercase,
    /// snake_case spelling.
    Custom(String),
}
58
/// Built-in class labels in stable display order.
///
/// The index order mirrors [`PiiClass::builtin_variants`] and is relied upon
/// by `PiiClass::class_name`; do not reorder entries.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
61
/// Family names reserved for bundled collision-policy rulepacks.
///
/// Adopter policy-level custom recognizers cannot claim these names because bundled
/// families are part of the core disambiguation contract. Compare against
/// [`CollisionMembership::family`] when validating recognizer registrations.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
80
/// Collision-family membership metadata for one recognizer.
///
/// Family names listed in [`RESERVED_BUNDLED_FAMILIES`] are reserved for
/// bundled rulepacks.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Cross-class family name.
    pub family: String,
    /// Variant name within the family.
    pub variant: String,
    /// Lower values win when two variants in the same family overlap.
    pub precedence: u32,
    /// Optional anchor variant required by later ambiguity handling.
    pub mandatory_anchor: Option<String>,
}
94
95impl CollisionMembership {
96    /// Builds collision-family membership metadata.
97    pub fn new(
98        family: impl Into<String>,
99        variant: impl Into<String>,
100        precedence: u32,
101        mandatory_anchor: Option<String>,
102    ) -> Self {
103        Self {
104            family: family.into(),
105            variant: variant.into(),
106            precedence,
107            mandatory_anchor,
108        }
109    }
110}
111
112impl PiiClass {
113    /// Parses a policy class name into the shared class vocabulary.
114    pub fn from_policy_name(input: &str) -> Option<Self> {
115        match input {
116            "email" => Some(Self::Email),
117            "name" => Some(Self::Name),
118            "location" => Some(Self::Location),
119            "organization" => Some(Self::Organization),
120            custom if custom.starts_with("custom:") => {
121                let name = custom.trim_start_matches("custom:");
122                (!name.trim().is_empty()).then(|| Self::custom(name))
123            }
124            _ => None,
125        }
126    }
127
128    /// Returns the built-in class variants.
129    pub fn builtin_variants() -> &'static [PiiClass] {
130        &[
131            PiiClass::Email,
132            PiiClass::Name,
133            PiiClass::Location,
134            PiiClass::Organization,
135        ]
136    }
137
138    /// Builds a normalized custom class name.
139    pub fn custom(name: &str) -> Self {
140        let mut normalized = String::new();
141        let mut pending_underscore = false;
142        for ch in name.trim().chars() {
143            if ch.is_ascii_alphanumeric() {
144                if pending_underscore && !normalized.is_empty() {
145                    normalized.push('_');
146                }
147                normalized.push(ch.to_ascii_lowercase());
148                pending_underscore = false;
149            } else {
150                pending_underscore = true;
151            }
152        }
153
154        Self::Custom(normalized)
155    }
156
157    /// Returns the normalized custom class name for custom classes.
158    pub fn as_custom_name(&self) -> Option<&str> {
159        match self {
160            Self::Custom(name) => Some(name.as_str()),
161            Self::Email | Self::Name | Self::Location | Self::Organization => None,
162        }
163    }
164
165    /// Returns the audit/token display label for this class.
166    pub fn class_name(&self) -> String {
167        match self {
168            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
169            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
170            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
171            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
172            Self::Custom(name) => format!("Custom:{name}"),
173        }
174    }
175
176    /// Returns the canonical audit/serde label for this class.
177    pub fn to_canonical_str(&self) -> String {
178        match self {
179            Self::Email => "email".to_string(),
180            Self::Name => "name".to_string(),
181            Self::Location => "location".to_string(),
182            Self::Organization => "organization".to_string(),
183            Self::Custom(name) => format!("custom:{name}"),
184        }
185    }
186
187    /// Parses the canonical audit/serde label for a PII class.
188    pub fn from_canonical_str(value: &str) -> Option<Self> {
189        match value {
190            "email" | "Email" => Some(Self::Email),
191            "name" | "Name" => Some(Self::Name),
192            "location" | "Location" => Some(Self::Location),
193            "organization" | "Organization" => Some(Self::Organization),
194            custom if custom.starts_with("custom:") => {
195                let name = &custom["custom:".len()..];
196                (!name.is_empty()).then(|| Self::Custom(name.to_string()))
197            }
198            _ => None,
199        }
200    }
201}
202
/// Audit-canonical form of [`PiiClass`].
///
/// Serializes as `"email"`, `"name"`, `"custom:foo"`, and similar canonical
/// strings. Use this wrapper for audit-row JSON only. Session snapshots use
/// bare [`PiiClass`] serde so their byte shape stays stable.
///
/// The struct is `#[non_exhaustive]`, so external crates construct it via
/// [`PiiClassAudit::new`] rather than a struct literal.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct PiiClassAudit(pub PiiClass);
211
212impl PiiClassAudit {
213    /// Builds an audit-canonical class wrapper.
214    pub fn new(class: PiiClass) -> Self {
215        Self(class)
216    }
217
218    /// Unwraps the underlying class.
219    pub fn into_inner(self) -> PiiClass {
220        self.0
221    }
222}
223
impl Serialize for PiiClassAudit {
    // Emits the canonical string form ("email", "custom:foo", ...) so audit
    // rows are decoupled from the derived serde shape of `PiiClass`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(&self.0.to_canonical_str())
    }
}
232
233impl<'de> Deserialize<'de> for PiiClassAudit {
234    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
235    where
236        D: serde::Deserializer<'de>,
237    {
238        let value = String::deserialize(deserializer)?;
239        PiiClass::from_canonical_str(&value)
240            .map(Self)
241            .ok_or_else(|| {
242                serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
243            })
244    }
245}
246
/// Serde shim for `#[serde(with = "pii_class_audit_serde")]` fields: routes a
/// bare [`PiiClass`] through the audit-canonical [`PiiClassAudit`] string form.
mod pii_class_audit_serde {
    use super::{PiiClass, PiiClassAudit};
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    /// Serializes `class` as its canonical audit string (e.g. `"custom:foo"`).
    pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        PiiClassAudit::new(class.clone()).serialize(serializer)
    }

    /// Deserializes a canonical audit string back into a bare [`PiiClass`].
    pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
    where
        D: Deserializer<'de>,
    {
        Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
    }
}
265
/// A candidate recognizer/class pair that lost ambiguity resolution.
///
/// Serialized with the audit-canonical class form (see [`PiiClassAudit`]).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LosingCandidate {
    /// PII class proposed by the losing recognizer.
    #[serde(with = "pii_class_audit_serde")]
    pub class: PiiClass,
    /// Stable recognizer identifier for traceability.
    pub recognizer_id: String,
}
276
277impl LosingCandidate {
278    /// Builds a losing ambiguity candidate.
279    pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
280        Self {
281            class,
282            recognizer_id: recognizer_id.into(),
283        }
284    }
285}
286
/// Structured metadata describing an ambiguity outcome.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct AmbiguityRecord {
    /// The family-level class assigned when disambiguation failed.
    #[serde(with = "pii_class_audit_serde")]
    pub ambiguity_class: PiiClass,
    /// Variants that could not be disambiguated.
    ///
    /// Producers must keep this list stable by sorting `recognizer_id` ascending.
    pub losing_candidates: Vec<LosingCandidate>,
    /// Why disambiguation failed.
    pub reason: AmbiguityReason,
}
301
302impl AmbiguityRecord {
303    /// Builds a structured ambiguity record.
304    pub fn new(
305        ambiguity_class: PiiClass,
306        losing_candidates: Vec<LosingCandidate>,
307        reason: AmbiguityReason,
308    ) -> Self {
309        Self {
310            ambiguity_class,
311            losing_candidates,
312            reason,
313        }
314    }
315}
316
/// Closed set of ambiguity outcomes recorded by the audit side-channel.
///
/// Serialized in `snake_case` (e.g. `no_anchor`) for audit rows.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// Span matched a multi-recognizer family and no anchor cue resolved it.
    NoAnchor,
    /// Multiple validator-stage recognizers remained viable for the same span.
    ValidatorIndeterminate,
    /// Span matched recognizers across two or more distinct PII class families.
    MultiFamilyMatch,
    /// Multiple variants had the same precedence and no discriminator resolved them.
    PrecedenceTie,
}
331
/// Closed validator failure reasons recorded by audit metadata.
///
/// Serialized in `snake_case`; the `alias` attributes keep previously emitted
/// audit rows (older reason spellings) deserializable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    /// Luhn checksum validation failed.
    LuhnFailed,
    /// IBAN MOD-97 validation failed.
    IbanMod97Failed,
    /// Email RFC-style validation failed.
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    /// E.164 phone validation failed.
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    /// National phone parser accepted the number but region validation failed.
    PhoneNationalRegionMismatch,
    /// IPv4 parser rejected the candidate.
    Ipv4ParseFailed,
    /// IPv6 parser rejected the candidate.
    Ipv6ParseFailed,
    /// EIP-55 Ethereum checksum validation failed.
    EthEip55ChecksumFailed,
}
356
/// Typed validator outcome used by the pre-resolver validator-veto phase.
///
/// Produced by [`ValidatorKind::validate`]; serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// Candidate passed validation; canonical form may be supplied by the validator.
    Pass { canonical_form: Option<String> },
    /// Candidate failed validation with a closed, auditable reason.
    Fail { reason: ValidatorFailReason },
    /// Recognizer has no validator for this candidate.
    NotApplicable,
}
369
/// Error returned when a rulepack names a validator unsupported by this build.
///
/// "Unsupported" covers both unknown names and validators compiled out by
/// disabled cargo features (see [`ValidatorKind::parse`]).
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    /// Validator kind is not known or is gated behind a disabled feature.
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        /// Unsupported validator kind.
        kind: String,
    },
}
381
/// Closed set of validator implementations used by validator-backed recognizers.
///
/// Policy names map to variants via [`ValidatorKind::parse`]; the phone
/// variants exist only when the `phone-parser` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    /// Basic email shape validator.
    EmailRfc,
    /// Parser-backed E.164 phone validator.
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// Parser-backed national phone validator for a fixed region.
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    /// Luhn checksum validator.
    Luhn,
    /// IBAN MOD-97 validator.
    IbanMod97,
    /// Strict decimal dotted-quad IPv4 parser.
    Ipv4Parse,
    /// RFC 4291 / RFC 5952 IPv6 textual parser.
    Ipv6Parse,
    /// EIP-55 Ethereum address checksum validator.
    EthEip55,
}
405
/// Regions supported by national phone validators.
///
/// Only compiled when the `phone-parser` feature is enabled.
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    /// Germany.
    De,
    /// United States.
    Us,
}
416
417impl ValidatorKind {
418    /// Parses a policy validator kind.
419    pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
420        match s {
421            "email_rfc" => Ok(Self::EmailRfc),
422            #[cfg(feature = "phone-parser")]
423            "e164_phone" => Ok(Self::E164Phone),
424            #[cfg(feature = "phone-parser")]
425            "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
426            #[cfg(feature = "phone-parser")]
427            "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
428            "luhn" => Ok(Self::Luhn),
429            "iban_mod97" => Ok(Self::IbanMod97),
430            "ipv4_parse" => Ok(Self::Ipv4Parse),
431            "ipv6_parse" => Ok(Self::Ipv6Parse),
432            "eth_eip55" => Ok(Self::EthEip55),
433            other => Err(ValidatorKindParseError::UnsupportedValidator {
434                kind: other.to_string(),
435            }),
436        }
437    }
438
439    /// Returns whether the validator accepts the input.
440    pub fn validates(self, input: &str) -> bool {
441        self.canonical_form(input).is_some()
442    }
443
444    /// Applies validation and returns a typed outcome for audit.
445    pub fn validate(self, input: &str) -> ValidatorOutcome {
446        match self.canonical_form(input) {
447            Some(canonical_form) => ValidatorOutcome::Pass {
448                canonical_form: Some(canonical_form),
449            },
450            None => ValidatorOutcome::Fail {
451                reason: self.fail_reason(),
452            },
453        }
454    }
455
456    /// Returns the canonical form for accepted input.
457    pub fn canonical_form(self, input: &str) -> Option<String> {
458        match self {
459            Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
460            #[cfg(feature = "phone-parser")]
461            Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
462            #[cfg(feature = "phone-parser")]
463            Self::E164PhoneNational(region) => validate_phone_national(region, input),
464            Self::Luhn => luhn_check(input).then(|| input.to_string()),
465            Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
466            Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
467            Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
468            Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
469        }
470    }
471
472    /// Returns the audit reason emitted when validation fails.
473    pub fn fail_reason(self) -> ValidatorFailReason {
474        match self {
475            Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
476            #[cfg(feature = "phone-parser")]
477            Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
478            #[cfg(feature = "phone-parser")]
479            Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
480            Self::Luhn => ValidatorFailReason::LuhnFailed,
481            Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
482            Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
483            Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
484            Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
485        }
486    }
487}
488
/// Minimal email shape check: non-empty local part, a dot somewhere in the
/// domain, and no leading/trailing dot on the domain.
///
/// Splits at the *first* `@`, so any later `@` characters land in the domain.
fn is_basic_email(input: &str) -> bool {
    input.split_once('@').is_some_and(|(local, domain)| {
        !local.is_empty()
            && domain.contains('.')
            && !domain.starts_with('.')
            && !domain.ends_with('.')
    })
}
495
/// Returns whether `input` parses as a valid phone number with no region hint.
///
/// Delegates entirely to the `phonenumber` crate; only compiled with the
/// `phone-parser` feature.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    phonenumber::parse(None, input).is_ok_and(|phone| phonenumber::is_valid(&phone))
}
500
/// Validates `input` as a national phone number for `region`, returning the
/// E.164-formatted canonical string on success.
///
/// Accepts numbers the `phonenumber` crate deems valid, plus the
/// documentation-safe fixtures in `is_safe_fixture_phone`. Rejects numbers
/// whose parsed country code disagrees with the requested region.
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let country = match region {
        Region::De => phonenumber::country::DE,
        Region::Us => phonenumber::country::US,
    };
    let expected_code = match region {
        Region::De => 49,
        Region::Us => 1,
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    // The parser may resolve to a country other than the hint; enforce it.
    if number.country().code() != expected_code {
        return None;
    }
    if number.is_valid() || is_safe_fixture_phone(region, input) {
        return Some(number.format().mode(phonenumber::Mode::E164).to_string());
    }
    None
}
520
/// Returns whether `input` is one of the documentation-safe fixture numbers
/// accepted even when the parser considers them invalid.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    // Keep only ASCII digits so formatting (spaces, dashes, "+") is ignored.
    let digits: String = input.chars().filter(char::is_ascii_digit).collect();
    match region {
        Region::De => matches!(
            digits.as_str(),
            "493000000000"
                | "4915100000000"
                | "4915550112233"
                | "015550112233"
                | "491710000000"
                | "01710000000"
        ),
        Region::Us => {
            // Either the short fixture, or `1` + 10 digits whose digits 4..=8
            // are the reserved "55501" fictional-number block.
            digits == "15550100"
                || digits
                    .strip_prefix('1')
                    .is_some_and(|rest| rest.len() == 10 && rest[3..].starts_with("55501"))
        }
    }
}
543
/// Returns whether `input` is a 13-19 digit string passing the Luhn checksum.
///
/// ASCII whitespace and `-` separators are ignored; any other non-digit byte
/// disqualifies the candidate.
fn luhn_check(input: &str) -> bool {
    let mut digits: Vec<u8> = Vec::with_capacity(input.len());
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(byte - b'0'),
            b'-' => {}
            _ if byte.is_ascii_whitespace() => {}
            _ => return false,
        }
    }

    // Payment-card style length window.
    if digits.len() < 13 || digits.len() > 19 {
        return false;
    }

    // Classic Luhn: from the rightmost digit, double every second digit and
    // subtract 9 when doubling produces a two-digit value.
    let mut sum: u32 = 0;
    for (index, &digit) in digits.iter().rev().enumerate() {
        let mut value = u32::from(digit);
        if index % 2 == 1 {
            value *= 2;
            if value > 9 {
                value -= 9;
            }
        }
        sum += value;
    }
    sum % 10 == 0
}
576
/// Returns whether `input` passes the ISO 13616 IBAN MOD-97 check.
///
/// Input is canonicalized first (whitespace removed, uppercased), so spaced
/// and lowercase spellings are accepted.
fn iban_mod97_check(input: &str) -> bool {
    let canonical = iban_canonicalize(input);
    // IBANs are 15..=34 characters and strictly ASCII alphanumeric after
    // canonicalization; the ASCII screen also makes the byte slicing below safe.
    if !(15..=34).contains(&canonical.len())
        || !canonical.chars().all(|ch| ch.is_ascii_alphanumeric())
    {
        return false;
    }

    // MOD-97: move the first four characters to the end, expand letters to
    // two-digit values (A=10 .. Z=35), and reduce modulo 97 incrementally so
    // the accumulator never overflows.
    let rearranged = canonical[4..].chars().chain(canonical[..4].chars());
    let mut remainder: u32 = 0;
    for ch in rearranged {
        if let Some(digit) = ch.to_digit(10) {
            remainder = (remainder * 10 + digit) % 97;
        } else if ch.is_ascii_uppercase() {
            let value = u32::from(ch) - u32::from('A') + 10;
            remainder = (remainder * 10 + value / 10) % 97;
            remainder = (remainder * 10 + value % 10) % 97;
        } else {
            // Unreachable after the alphanumeric screen (canonicalization
            // uppercases); kept as defense in depth.
            return false;
        }
    }
    remainder == 1
}

/// Removes ASCII whitespace and uppercases the input — the canonical IBAN
/// comparison form.
fn iban_canonicalize(input: &str) -> String {
    let mut canonical = String::with_capacity(input.len());
    for ch in input.chars() {
        if !ch.is_ascii_whitespace() {
            canonical.extend(ch.to_uppercase());
        }
    }
    canonical
}
610
/// Returns whether `input` is a strict dotted-quad IPv4 address per the
/// standard library parser.
fn ipv4_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv4Addr, _> = input.parse();
    parsed.is_ok()
}
614
/// Returns whether `input` is a textual IPv6 address per the standard library
/// parser (RFC 4291 / RFC 5952 forms).
fn ipv6_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv6Addr, _> = input.parse();
    parsed.is_ok()
}
618
/// Validates an `0x`-prefixed Ethereum address against the EIP-55 mixed-case checksum.
///
/// All-lowercase and all-uppercase hex addresses carry no case information and
/// are accepted without a checksum check; mixed-case addresses must match the
/// Keccak-256-derived casing exactly.
fn eth_eip55_check(input: &str) -> bool {
    let Some(address) = input.strip_prefix("0x") else {
        return false;
    };
    // Exactly 40 hex digits after the prefix.
    if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return false;
    }
    // Uniform casing (no letters mixed between cases) -> accept as unchecksummed.
    if address
        .bytes()
        .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
        || address
            .bytes()
            .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
    {
        return true;
    }

    // EIP-55: hash the lowercase hex address; each letter must be uppercase
    // iff the corresponding hash nibble is > 7.
    let lowercase = address.to_ascii_lowercase();
    let hash = Keccak256::digest(lowercase.as_bytes());
    for (index, byte) in address.bytes().enumerate() {
        if byte.is_ascii_digit() {
            continue;
        }
        // Two hex characters per hash byte: high nibble for even indices.
        let hash_nibble = if index % 2 == 0 {
            hash[index / 2] >> 4
        } else {
            hash[index / 2] & 0x0f
        };
        if (hash_nibble > 7) != byte.is_ascii_uppercase() {
            return false;
        }
    }
    true
}
653
/// A detected span and its class/source metadata.
///
/// Produced by [`Detector::detect`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
    /// Byte span in the original input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Detector source identifier.
    pub source: String,
}
665
666impl Detection {
667    /// Builds a detected PII span.
668    pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
669        Self {
670            span,
671            class,
672            source: source.into(),
673        }
674    }
675}
676
/// Observer-only post-clean check (Pass 3 in the detection pipeline).
///
/// Runs against already-tokenized output. May report suspected missed PII via
/// [`LeakReport`] but **must not** mutate the token manifest, the `CleanDocument`,
/// or the restore path. Safety nets are additive defense-in-depth, not a replacement
/// for Pass 1/2 detection.
///
/// Activate at runtime with `Pipeline::with_safety_net` (post-build) or
/// `PipelineBuilder::register_safety_net` (during build), or via the CLI
/// `--safety-net=<name>` flag.
///
/// If a safety net reports a suspected miss, the caller decides the response; the
/// pipeline never silently re-cleans based on safety net output.
///
/// Implementations must be `Send + Sync` so they can be shared by pipeline workers.
pub trait SafetyNet: Send + Sync {
    /// Stable backend identifier used in telemetry and audit rows.
    fn id(&self) -> &str;

    /// Locale tags supported by this safety net. Empty means global.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Checks clean text for possible PII that the manifest did not cover.
    ///
    /// Returns suspected leaks (possibly empty); backend failures surface as
    /// [`SafetyNetError`].
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
704
/// Context passed to a privacy safety net.
///
/// Borrows everything (`Copy`), so constructing one per segment is free.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
    /// Tokens emitted by the pseudonymization pipeline for this text segment.
    pub manifest: &'a Manifest,
    /// Active session-level locale chain. For `RawDocument::Structured`, locale
    /// gating uses this same session-level chain across all fields; structured
    /// fields do not carry per-field locale annotations.
    pub locale_chain: &'a [LocaleTag],
    /// Source document kind being checked.
    pub document_kind: DocumentKind,
    /// Optional audit session identifier.
    pub session_id: Option<&'a str>,
    /// Structured-document field path, such as `$.user.email`.
    pub field_path: Option<&'a str>,
}
722
723impl<'a> SafetyNetContext<'a> {
724    /// Builds safety-net context for one clean text segment.
725    pub fn new(
726        manifest: &'a Manifest,
727        locale_chain: &'a [LocaleTag],
728        document_kind: DocumentKind,
729        session_id: Option<&'a str>,
730        field_path: Option<&'a str>,
731    ) -> Self {
732        Self {
733            manifest,
734            locale_chain,
735            document_kind,
736            session_id,
737            field_path,
738        }
739    }
740}
741
/// A replacement emitted by the pseudonymization pipeline.
///
/// Maps a span of clean (tokenized) text back to the raw span it replaced.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
    /// Byte span in the clean text.
    pub clean_span: Range<usize>,
    /// Byte span in the raw text that produced the token.
    pub raw_span: Range<usize>,
    /// PII class represented by the emitted token.
    pub class: PiiClass,
}
753
754impl EmittedTokenSpan {
755    /// Builds an emitted token span.
756    pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
757        Self {
758            clean_span,
759            raw_span,
760            class,
761        }
762    }
763}
764
/// Set of emitted token spans for one clean text segment.
///
/// Build with [`Manifest::from_spans`] to establish the sort invariant that
/// `diff_against` depends on.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
    /// Spans sorted by `clean_span.start`.
    pub spans: Vec<EmittedTokenSpan>,
}
772
impl Manifest {
    /// Builds a manifest from spans and sorts them by clean byte start.
    ///
    /// Sort key is `(clean_span.start, clean_span.end)`, the ordering that
    /// `diff_against`'s `partition_point` scan relies on.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Diffs one safety-net suspect span against emitted token coverage.
    ///
    /// Returns `None` when the suspect span is continuously covered by emitted
    /// token spans of the same class. Internal gaps return
    /// `LeakKind::PartialBleed`. When multiple uncovered gaps exist, this method
    /// deterministically returns the first gap by byte offset; full gap
    /// enumeration is intentionally deferred to a future report format.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        // An empty suspect span covers no bytes and can never leak.
        if suspect_span.is_empty() {
            return None;
        }

        // Spans are sorted by clean start, so skip everything ending at or
        // before the suspect start, then keep only truly overlapping spans.
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // Sweep a cursor across the suspect span; any gap before the next
        // overlapping span is the first uncovered range.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            if span.clean_span.start > cursor {
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            if span.clean_span.end > cursor {
                // Remember only the first class disagreement; it is reported
                // only if coverage turns out to be continuous.
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Trailing gap after the last overlapping span.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered: report a class mismatch if one was seen, else clean.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
841
842fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
843    left.start < right.end && right.start < left.end
844}
845
/// Suspected leak reported by an observer-only safety net.
///
/// Derives only `PartialEq` (not `Eq`) because `score` is an `Option<f32>`.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
    /// Byte span in clean text.
    pub span: Range<usize>,
    /// Mapped PII class for the suspect.
    pub class: PiiClass,
    /// Safety-net backend identifier.
    pub safety_net_id: String,
    /// Optional backend confidence score.
    pub score: Option<f32>,
    /// Leak classification after manifest correlation.
    pub kind: LeakKind,
    /// Raw backend label after validation/mapping, never source text.
    pub raw_label: String,
    /// Optional structured field path.
    pub field_path: Option<String>,
}
865
866impl LeakSuspect {
867    /// Builds a safety-net leak suspect.
868    pub fn new(
869        span: Range<usize>,
870        class: PiiClass,
871        safety_net_id: impl Into<String>,
872        score: Option<f32>,
873        kind: LeakKind,
874        raw_label: impl Into<String>,
875        field_path: Option<String>,
876    ) -> Self {
877        Self {
878            span,
879            class,
880            safety_net_id: safety_net_id.into(),
881            score,
882            kind,
883            raw_label: raw_label.into(),
884            field_path,
885        }
886    }
887}
888
/// The category of a suspected missed PII span.
///
/// `LeakKind` is `#[non_exhaustive]`. Match with a wildcard for forward compatibility.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No same-class emitted token overlaps the suspect span.
    Uncovered,
    /// The suspect is only partly covered; `uncovered` is the first gap.
    ///
    /// `Manifest::diff_against` reports only the first gap by byte offset.
    PartialBleed {
        /// First uncovered byte range in the suspect span.
        uncovered: Range<usize>,
    },
    /// The suspect is continuously covered, but by a different class.
    ClassMismatch {
        /// Class emitted by the pipeline.
        pipeline_class: PiiClass,
        /// Class reported by the safety net.
        safety_net_class: PiiClass,
    },
}
910
/// Bytes-free telemetry emitted by safety-net orchestration.
///
/// `#[non_exhaustive]`: match with a wildcard arm so new event kinds do not
/// break downstream consumers.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// Safety net skipped because the session-level locale chain did not match.
    LocaleSkipped {
        /// Safety-net backend identifier.
        safety_net_id: String,
        /// Document kind checked.
        document_kind: DocumentKind,
        /// Optional structured field path when skip was recorded per field.
        field_path: Option<String>,
    },
}
925
/// Aggregate leak report statistics.
///
/// Populated by [`LeakReport::from_parts`], which counts suspects by
/// [`LeakKind`] and locale-skip telemetry events.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Number of suspects reported.
    pub suspect_count: usize,
    /// Number of uncovered suspects.
    pub uncovered_count: usize,
    /// Number of partial-bleed suspects.
    pub partial_bleed_count: usize,
    /// Number of class-mismatch suspects.
    pub class_mismatch_count: usize,
    /// Number of locale-skip telemetry events.
    pub locale_skipped_count: usize,
}
941
/// Signed document-context metadata carried inside a session snapshot envelope.
///
/// This extension is the v0.7 bridge for `gaze-document`: it is safe to serialize
/// inside the owner-only snapshot envelope, while agent-facing files keep using
/// non-sensitive mirrors. The single `schema_version` is bundle-level; sub-files
/// do not carry independent schema versions.
///
/// Construct via [`DocumentExtension::builder`], which enforces the required
/// integrity-binding fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
    /// Bundle-level schema version shared by clean, layout, preview, report, and manifest files.
    pub schema_version: u16,
    /// SHA-256 of `clean.md` NFC-normalized bytes.
    pub clean_md_sha256: [u8; 32],
    /// SHA-256 of canonical `layout.json` bytes.
    pub layout_json_sha256: [u8; 32],
    /// SHA-256 of canonical `report.json` bytes.
    pub report_json_sha256: [u8; 32],
    /// SHA-256 of `preview-redacted.png` bytes when a preview is present.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub preview_png_sha256: Option<[u8; 32]>,
    /// Page count reported for the source document.
    pub page_count: u32,
    /// Audit session id mirrored from the writing session for cross-pane correlation.
    pub audit_session_id: String,
    /// Signed clean.md byte spans for every emitted token.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean_spans: Vec<EmittedTokenSpan>,
    /// Codec audit rows for the decode path that produced this document extension.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub codec_audit: Vec<CodecAuditRow>,
}
973
974impl DocumentExtension {
975    /// Starts a document extension builder for one bundle schema version.
976    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
977        DocumentExtensionBuilder {
978            schema_version,
979            clean_md_sha256: None,
980            layout_json_sha256: None,
981            report_json_sha256: None,
982            preview_png_sha256: None,
983            page_count: None,
984            audit_session_id: None,
985            clean_spans: Vec::new(),
986            codec_audit: Vec::new(),
987        }
988    }
989}
990
/// Builder for [`DocumentExtension`] that requires signed integrity-binding fields.
///
/// Created by [`DocumentExtension::builder`]. `preview_png_sha256` is optional
/// and the two vectors may stay empty; every other field must be set before
/// `build` succeeds.
#[derive(Debug, Clone)]
#[must_use]
pub struct DocumentExtensionBuilder {
    schema_version: u16,
    clean_md_sha256: Option<[u8; 32]>,
    layout_json_sha256: Option<[u8; 32]>,
    report_json_sha256: Option<[u8; 32]>,
    preview_png_sha256: Option<[u8; 32]>,
    page_count: Option<u32>,
    audit_session_id: Option<String>,
    clean_spans: Vec<EmittedTokenSpan>,
    codec_audit: Vec<CodecAuditRow>,
}
1005
1006impl DocumentExtensionBuilder {
1007    pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
1008        self.clean_md_sha256 = Some(hash);
1009        self
1010    }
1011
1012    pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
1013        self.layout_json_sha256 = Some(hash);
1014        self
1015    }
1016
1017    pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
1018        self.report_json_sha256 = Some(hash);
1019        self
1020    }
1021
1022    pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
1023        self.preview_png_sha256 = Some(hash);
1024        self
1025    }
1026
1027    pub fn page_count(mut self, page_count: u32) -> Self {
1028        self.page_count = Some(page_count);
1029        self
1030    }
1031
1032    pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
1033        self.audit_session_id = Some(audit_session_id.into());
1034        self
1035    }
1036
1037    pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
1038        self.clean_spans = clean_spans;
1039        self
1040    }
1041
1042    pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
1043        self.codec_audit = codec_audit;
1044        self
1045    }
1046
1047    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
1048        Ok(DocumentExtension {
1049            schema_version: self.schema_version,
1050            clean_md_sha256: self
1051                .clean_md_sha256
1052                .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
1053            layout_json_sha256: self
1054                .layout_json_sha256
1055                .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
1056            report_json_sha256: self
1057                .report_json_sha256
1058                .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
1059            preview_png_sha256: self.preview_png_sha256,
1060            page_count: self
1061                .page_count
1062                .ok_or(DocumentExtensionError::MissingField("page_count"))?,
1063            audit_session_id: self
1064                .audit_session_id
1065                .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
1066            clean_spans: self.clean_spans,
1067            codec_audit: self.codec_audit,
1068        })
1069    }
1070}
1071
/// Errors returned while building a [`DocumentExtension`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required builder field was never set; the payload names the field.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
1079
/// Provenance of text extracted from a document or transcript source.
///
/// Serialized in `snake_case` (e.g. `embedded_text`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Text came from OCR over pixels.
    Ocr,
    /// Text came from an embedded text layer.
    EmbeddedText,
    /// Text came from an audio/video transcript.
    Transcript,
    /// Text came from multiple extraction paths.
    Hybrid,
}
1094
/// Orthogonal document codec capabilities delivered or advertised by a codec.
///
/// `Default` is the empty set: every flag `false`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecCapabilitySet {
    /// Codec can emit text.
    pub text: bool,
    /// Codec can emit layout geometry.
    pub layout: bool,
    /// Codec can emit confidence buckets.
    pub confidence: bool,
    /// Codec can emit timestamps.
    pub timestamps: bool,
}
1108
1109impl CodecCapabilitySet {
1110    /// Text-only capability set.
1111    pub const TEXT_ONLY: Self = Self {
1112        text: true,
1113        layout: false,
1114        confidence: false,
1115        timestamps: false,
1116    };
1117
1118    /// Builds a codec capability bitset.
1119    pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
1120        Self {
1121            text,
1122            layout,
1123            confidence,
1124            timestamps,
1125        }
1126    }
1127
1128    /// Returns true when this set contains every requested capability bit.
1129    pub fn contains(self, requested: Self) -> bool {
1130        (!requested.text || self.text)
1131            && (!requested.layout || self.layout)
1132            && (!requested.confidence || self.confidence)
1133            && (!requested.timestamps || self.timestamps)
1134    }
1135}
1136
/// Per-codec declaration for text extraction density checks.
///
/// Serialized in `snake_case`. The `Default` impl yields an `Exempt` policy
/// with reason `"calibration_pending"`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
    /// Require at least this many extracted text bytes per source KiB.
    Required(f32),
    /// Explicit exemption with an audit-visible reason.
    Exempt { reason: String },
}
1147
1148impl Default for ExtractionDensityPolicy {
1149    fn default() -> Self {
1150        Self::Exempt {
1151            reason: "calibration_pending".to_string(),
1152        }
1153    }
1154}
1155
/// Metadata-only audit row emitted by a document codec.
///
/// Construct with [`CodecAuditRow::new`]; the remaining public fields can be
/// filled in afterwards.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecAuditRow {
    /// Stable codec id, such as `gaze.codec.tesseract`.
    pub codec_id: String,
    /// Adapter crate version, distinct from engine provenance.
    pub codec_version: String,
    /// Accepted MIME type for the decode.
    pub accepted_mime: String,
    /// Capabilities advertised by the codec.
    pub advertised: CodecCapabilitySet,
    /// Capabilities delivered for this decode.
    pub delivered: CodecCapabilitySet,
    /// Text provenance reported by the codec.
    pub text_origin: TextOrigin,
    /// Codec-output schema version, decoupled from bundle schema version.
    pub codec_output_schema_version: u16,
    /// Hash of canonical codec options, never the options themselves.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub options_hash_hex: Option<String>,
    /// Engine provenance string, without paths or raw source text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine_provenance: Option<String>,
    /// Extraction density policy declared by the codec for this MIME.
    pub extraction_density_policy: ExtractionDensityPolicy,
}
1183
1184impl CodecAuditRow {
1185    /// Builds a metadata-only codec audit row.
1186    pub fn new(
1187        codec_id: impl Into<String>,
1188        codec_version: impl Into<String>,
1189        accepted_mime: impl Into<String>,
1190        text_origin: TextOrigin,
1191    ) -> Self {
1192        Self {
1193            codec_id: codec_id.into(),
1194            codec_version: codec_version.into(),
1195            accepted_mime: accepted_mime.into(),
1196            advertised: CodecCapabilitySet::default(),
1197            delivered: CodecCapabilitySet::default(),
1198            text_origin,
1199            codec_output_schema_version: 1,
1200            options_hash_hex: None,
1201            engine_provenance: None,
1202            extraction_density_policy: ExtractionDensityPolicy::default(),
1203        }
1204    }
1205}
1206
/// A suspected missed PII span reported by a [`SafetyNet`].
///
/// The safety net is not authoritative; a `LeakReport` is a signal, not a confirmed
/// leak. False positives are expected. Review reports and adjust policy or recognizer
/// thresholds.
///
/// Build via [`LeakReport::from_parts`] so `stats` stays consistent with
/// `suspects` and `telemetry`.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
    /// Suspected leaks, containing metadata only.
    pub suspects: Vec<LeakSuspect>,
    /// Bytes-free telemetry events.
    pub telemetry: Vec<LeakReportTelemetry>,
    /// Aggregated counts for callers that do not need full suspect metadata.
    pub stats: LeakReportStats,
    /// Optional replay hash.
    ///
    /// Replay determinism is guaranteed only when command path, checkpoint,
    /// operating point, min score, and decode parameters are fixed externally.
    pub replay_hash: Option<String>,
}
1227
1228impl LeakReport {
1229    /// Builds a report from suspects and telemetry.
1230    pub fn from_parts(
1231        suspects: Vec<LeakSuspect>,
1232        telemetry: Vec<LeakReportTelemetry>,
1233    ) -> LeakReport {
1234        let mut stats = LeakReportStats {
1235            suspect_count: suspects.len(),
1236            locale_skipped_count: telemetry
1237                .iter()
1238                .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
1239                .count(),
1240            ..LeakReportStats::default()
1241        };
1242        for suspect in &suspects {
1243            match suspect.kind {
1244                LeakKind::Uncovered => stats.uncovered_count += 1,
1245                LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
1246                LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
1247            }
1248        }
1249        LeakReport {
1250            suspects,
1251            telemetry,
1252            stats,
1253            replay_hash: None,
1254        }
1255    }
1256
1257    /// Merges another report into this report.
1258    pub fn extend(&mut self, other: LeakReport) {
1259        self.suspects.extend(other.suspects);
1260        self.telemetry.extend(other.telemetry);
1261        *self = LeakReport::from_parts(
1262            std::mem::take(&mut self.suspects),
1263            std::mem::take(&mut self.telemetry),
1264        );
1265    }
1266}
1267
/// Closed set of upstream OpenAI Privacy Filter labels accepted by Gaze.
///
/// The raw upstream spelling of each label is available via
/// [`OpenAiPrivateLabel::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// `private_person`.
    PrivatePerson,
    /// `private_address`.
    PrivateAddress,
    /// `private_email`.
    PrivateEmail,
    /// `private_phone`.
    PrivatePhone,
    /// `private_url`.
    PrivateUrl,
    /// `private_date`.
    PrivateDate,
    /// `account_number`.
    AccountNumber,
    /// `secret`.
    Secret,
}
1289
1290impl OpenAiPrivateLabel {
1291    /// Returns the raw upstream label.
1292    pub fn as_str(self) -> &'static str {
1293        match self {
1294            Self::PrivatePerson => "private_person",
1295            Self::PrivateAddress => "private_address",
1296            Self::PrivateEmail => "private_email",
1297            Self::PrivatePhone => "private_phone",
1298            Self::PrivateUrl => "private_url",
1299            Self::PrivateDate => "private_date",
1300            Self::AccountNumber => "account_number",
1301            Self::Secret => "secret",
1302        }
1303    }
1304}
1305
/// Closed safety-net PII vocabulary before mapping into `PiiClass`.
///
/// Map into the shared vocabulary with [`SafetyNetPiiClass::to_pii_class`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Email address.
    Email,
    /// Person name.
    Name,
    /// Location or address.
    Location,
    /// Phone number.
    Phone,
    /// URL.
    Url,
    /// Date.
    Date,
    /// Account number.
    AccountNumber,
    /// Secret.
    Secret,
}
1327
1328impl SafetyNetPiiClass {
1329    /// Maps the safety-net class into the shared pipeline class vocabulary.
1330    pub fn to_pii_class(self) -> PiiClass {
1331        match self {
1332            Self::Email => PiiClass::Email,
1333            Self::Name => PiiClass::Name,
1334            Self::Location => PiiClass::Location,
1335            Self::Phone => PiiClass::custom("phone"),
1336            Self::Url => PiiClass::custom("url"),
1337            Self::Date => PiiClass::custom("date"),
1338            Self::AccountNumber => PiiClass::custom("account_number"),
1339            Self::Secret => PiiClass::custom("secret"),
1340        }
1341    }
1342}
1343
/// Exhaustive, closed error set for safety-net execution.
///
/// All string payloads are sanitized diagnostics; they carry no raw document
/// text.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// Safety net was explicitly requested but is unavailable.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Required model weights or checkpoint are missing.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Sanitized path or identifier.
        path: String,
    },
    /// Backend model could not be loaded or reached.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Input exceeded configured backend limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Configured byte limit.
        limit: usize,
        /// Actual byte length.
        actual: usize,
    },
    /// Backend runtime failed.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Sanitized diagnostic message.
        message: String,
    },
    /// Backend returned invalid output.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// Sanitized diagnostic message.
        message: String,
    },
}
1387
/// Disposition applied to a detected PII span.
///
/// | Variant | Restorable | Output shape |
/// |---------|------------|--------------|
/// | `Tokenize` | Yes | Opaque token: `<hex:Class_N>` |
/// | `FormatPreserve` | Yes | Realistic-looking pseudonym (e.g., `email1.hex@gaze-fake.invalid`) |
/// | `Redact` | No | Literal `[REDACTED]` -- original value is gone |
/// | `Generalize` | No | Class label (e.g., `[Email]`) -- original value is gone |
/// | `Preserve` | - | Passes through unchanged |
///
/// `Action` is `#[non_exhaustive]`. Use a wildcard arm in exhaustive matches.
/// When restore is required, use `Tokenize` or `FormatPreserve` -- `Redact` and
/// `Generalize` are irreversible, so the restorable variants are the only ones
/// usable when round-tripping is needed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace PII with a reversible token.
    Tokenize,
    /// Replace PII with a non-restorable redaction marker.
    Redact,
    /// Replace PII with a reversible format-preserving token.
    FormatPreserve,
    /// Replace PII with a broader category.
    Generalize,
    /// Preserve the original value.
    Preserve,
}
1415
/// Conflict resolution tier that selected or rejected a candidate.
///
/// NOTE(review): variants appear to be listed in resolver precedence order --
/// confirm against the resolver implementation before relying on ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict resolution was needed.
    None,
    /// Class priority decided the conflict.
    ClassPriority,
    /// Rule priority decided the conflict.
    RulePriority,
    /// Candidate score decided the conflict.
    Score,
    /// Span length decided the conflict.
    SpanLength,
    /// Same-class containment validator result decided the conflict.
    Validator,
    /// Pre-resolver validator veto rejected the candidate.
    ValidatorVeto,
    /// Cross-class collision-family policy decided the conflict.
    CollisionPolicy,
    /// Mandatory-anchor context was missing, so family-level fallback was emitted.
    AnchoredContext,
    /// Recognizer identifier decided the conflict.
    RecognizerId,
    /// Candidate was merged with another candidate.
    Merged,
}
1443
/// Source document kind for metadata-only audit logging.
///
/// Mirrors the two shapes of [`RawDocument`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Structured key/value document.
    Structured,
    /// Plain text document.
    Text,
}
1453
/// One row of redaction metadata emitted to a [`RedactionLogger`].
///
/// Fields identify the PII class, action taken, session ID, source document kind,
/// conflict-resolution metadata, and timestamp. Does **not** contain the original PII
/// value, the token string, or any identifiable content beyond what a compliance audit
/// requires.
///
/// `RedactionEntry` is `#[non_exhaustive]`; adopters must construct via the public
/// constructor or destructure with a wildcard pattern. Optional validator,
/// ambiguity, and collision metadata is attached with the `with_*` methods.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Detector or recognizer source identifier.
    pub source: String,
    /// PII class affected by the decision.
    pub class: PiiClass,
    /// Policy action applied to the span.
    pub action: Action,
    /// Optional structured field name.
    pub field_name: Option<String>,
    /// Source document kind.
    pub document_kind: DocumentKind,
    /// Whether this entry records a loser in conflict resolution.
    pub conflict_loser: bool,
    /// Conflict tier that decided the outcome.
    pub decided_by: ConflictTier,
    /// Creation timestamp in epoch milliseconds.
    pub created_at: i64,
    /// Optional session identifier.
    pub session_id: Option<String>,
    /// Optional validator failure reason for a vetoed candidate.
    pub validator_fail_reason: Option<ValidatorFailReason>,
    /// Optional ambiguity metadata for a family-level fallback.
    pub ambiguity_record: Option<AmbiguityRecord>,
    /// Collision family that influenced this decision.
    pub collision_family: Option<String>,
    /// Collision variant that influenced this decision.
    pub collision_variant: Option<String>,
}
1493
1494impl RedactionEntry {
1495    /// Builds a metadata-only redaction log entry.
1496    #[allow(clippy::too_many_arguments)]
1497    pub fn new(
1498        source: impl Into<String>,
1499        class: PiiClass,
1500        action: Action,
1501        field_name: Option<String>,
1502        document_kind: DocumentKind,
1503        conflict_loser: bool,
1504        decided_by: ConflictTier,
1505        created_at: i64,
1506        session_id: Option<String>,
1507    ) -> Self {
1508        Self {
1509            source: source.into(),
1510            class,
1511            action,
1512            field_name,
1513            document_kind,
1514            conflict_loser,
1515            decided_by,
1516            created_at,
1517            session_id,
1518            validator_fail_reason: None,
1519            ambiguity_record: None,
1520            collision_family: None,
1521            collision_variant: None,
1522        }
1523    }
1524
1525    /// Attaches a validator failure reason to this metadata row.
1526    pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
1527        self.validator_fail_reason = Some(reason);
1528        self
1529    }
1530
1531    /// Attaches an ambiguity record to this metadata row.
1532    pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
1533        self.ambiguity_record = Some(record);
1534        self
1535    }
1536
1537    /// Attaches collision-family metadata to this row.
1538    pub fn with_collision_metadata(
1539        mut self,
1540        family: Option<String>,
1541        variant: Option<String>,
1542    ) -> Self {
1543        self.collision_family = family;
1544        self.collision_variant = variant;
1545        self
1546    }
1547}
1548
/// Closed error set for redaction log sinks.
///
/// `#[non_exhaustive]`: match with a wildcard arm for forward compatibility.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// SQLite-backed redaction log sink failed.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Non-SQLite redaction log sink failed.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
1560
/// Trait for audit sinks that receive redaction metadata.
///
/// Implement this for custom audit backends (remote telemetry, structured JSON logs).
/// For SQLite-backed persistence, use `gaze_audit::SqliteLogger`.
///
/// # Contract
///
/// The logger receives **metadata only**: class, action, session ID, timestamp, and
/// other bytes-free audit labels. It never receives the original PII value or the token
/// value. A custom impl that augments entries with raw document text violates the audit
/// isolation contract and will be flagged by the `gaze_module_isolation` Dylint lint
/// when it lives in the wrong crate.
///
/// # Example
///
/// ```rust
/// use std::sync::atomic::{AtomicUsize, Ordering};
/// use gaze_types::{RedactionEntry, RedactionLogError, RedactionLogger};
///
/// #[derive(Default)]
/// struct CountLogger(AtomicUsize);
///
/// impl RedactionLogger for CountLogger {
///     fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
///         self.0.fetch_add(1, Ordering::Relaxed);
///         Ok(())
///     }
/// }
/// ```
pub trait RedactionLogger: Send + Sync {
    /// Records a metadata-only redaction entry.
    ///
    /// # Errors
    ///
    /// Returns a [`RedactionLogError`] when the sink fails to record the entry.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
1594
/// Locale tag recognized by policy and recognizers.
///
/// Parse with [`LocaleTag::parse`]; canonical string forms come from
/// `as_str`/`Display` (e.g. `de-DE`, `global`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-independent recognizer or policy.
    Global,
    /// German as used in Germany.
    DeDe,
    /// German as used in Austria.
    DeAt,
    /// German as used in Switzerland.
    DeCh,
    /// English as used in the United States.
    EnUs,
    /// English as used in Great Britain.
    EnGb,
    /// English as used in Ireland.
    EnIe,
    /// English as used in Australia.
    EnAu,
    /// English as used in Canada.
    EnCa,
    /// Any other canonical BCP-47-like tag.
    Other(String),
}
1620
/// Locale parsing error.
///
/// Returned by [`LocaleTag::parse`] and [`LocaleChain::from_cli`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// Locale tag is unsupported or invalid.
    Unsupported,
}
1628
1629impl fmt::Display for LocaleError {
1630    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1631        match self {
1632            LocaleError::Unsupported => f.write_str("unsupported locale"),
1633        }
1634    }
1635}
1636
1637impl std::error::Error for LocaleError {}
1638
/// Ordered locale fallback chain.
///
/// Constructors append [`LocaleTag::Global`] as a final fallback when it is
/// absent (see [`LocaleChain::from_tags`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
1642
1643impl LocaleTag {
1644    /// Global locale constant.
1645    pub const GLOBAL: LocaleTag = LocaleTag::Global;
1646
1647    /// Parses a locale tag from policy or CLI input.
1648    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
1649        let raw = s.trim().replace('_', "-");
1650        let normalized = raw.to_ascii_lowercase();
1651        match normalized.as_str() {
1652            "global" | "*" => Ok(LocaleTag::Global),
1653            "de-de" => Ok(LocaleTag::DeDe),
1654            "de-at" => Ok(LocaleTag::DeAt),
1655            "de-ch" => Ok(LocaleTag::DeCh),
1656            "en-us" => Ok(LocaleTag::EnUs),
1657            "en-gb" => Ok(LocaleTag::EnGb),
1658            "en-ie" => Ok(LocaleTag::EnIe),
1659            "en-au" => Ok(LocaleTag::EnAu),
1660            "en-ca" => Ok(LocaleTag::EnCa),
1661            "" => Err(LocaleError::Unsupported),
1662            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
1663            _ => Err(LocaleError::Unsupported),
1664        }
1665    }
1666
1667    /// Returns the canonical string form of the locale tag.
1668    pub fn as_str(&self) -> &str {
1669        match self {
1670            LocaleTag::Global => "global",
1671            LocaleTag::DeDe => "de-DE",
1672            LocaleTag::DeAt => "de-AT",
1673            LocaleTag::DeCh => "de-CH",
1674            LocaleTag::EnUs => "en-US",
1675            LocaleTag::EnGb => "en-GB",
1676            LocaleTag::EnIe => "en-IE",
1677            LocaleTag::EnAu => "en-AU",
1678            LocaleTag::EnCa => "en-CA",
1679            LocaleTag::Other(tag) => tag.as_str(),
1680        }
1681    }
1682}
1683
1684impl LocaleChain {
1685    /// Builds a locale chain and appends global fallback when absent.
1686    pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
1687        ensure_global(&mut tags);
1688        LocaleChain(tags)
1689    }
1690
1691    /// Parses a comma-separated CLI locale chain.
1692    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
1693        let tags = raw
1694            .split(',')
1695            .map(LocaleTag::parse)
1696            .collect::<Result<Vec<_>, _>>()?;
1697        Ok(LocaleChain::from_tags(tags))
1698    }
1699
1700    /// Merges policy and CLI locale preferences.
1701    pub fn merge_policy_and_cli(
1702        policy: Option<&[LocaleTag]>,
1703        cli: Option<&[LocaleTag]>,
1704    ) -> LocaleChain {
1705        Self::merge_cli_policy_rulepack_default(cli, policy, None)
1706    }
1707
1708    /// Merges CLI, policy, rulepack, and default locale preferences.
1709    pub fn merge_cli_policy_rulepack_default(
1710        cli: Option<&[LocaleTag]>,
1711        policy: Option<&[LocaleTag]>,
1712        rulepack_defaults: Option<&[LocaleTag]>,
1713    ) -> LocaleChain {
1714        let tags = cli
1715            .filter(|tags| !tags.is_empty())
1716            .or_else(|| policy.filter(|tags| !tags.is_empty()))
1717            .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
1718            .map(|tags| tags.to_vec())
1719            .unwrap_or_else(|| vec![LocaleTag::Global]);
1720        LocaleChain::from_tags(tags)
1721    }
1722
1723    /// Returns true when a recognizer can run under this locale chain.
1724    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
1725        if recognizer_locales.is_empty() {
1726            return true;
1727        }
1728        recognizer_locales.iter().any(|recognizer_locale| {
1729            *recognizer_locale == LocaleTag::Global
1730                || self.0.iter().any(|active| active == recognizer_locale)
1731        })
1732    }
1733
1734    /// Returns the locale tags in chain order.
1735    pub fn as_slice(&self) -> &[LocaleTag] {
1736        &self.0
1737    }
1738
1739    /// Returns the locale chain as canonical strings.
1740    pub fn to_strings(&self) -> Vec<String> {
1741        self.0.iter().map(ToString::to_string).collect()
1742    }
1743}
1744
1745impl From<&[LocaleTag]> for LocaleChain {
1746    fn from(tags: &[LocaleTag]) -> Self {
1747        let mut owned = tags.to_vec();
1748        ensure_global(&mut owned);
1749        LocaleChain(owned)
1750    }
1751}
1752
1753impl fmt::Display for LocaleTag {
1754    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1755        f.write_str(self.as_str())
1756    }
1757}
1758
/// The input document submitted for pseudonymization.
///
/// `RawDocument::Text(String)` for plain or semi-structured text (most LLM workflows).
/// `RawDocument::Structured(BTreeMap<String, Value>)` for JSON-shaped data where
/// column-aware rules apply -- `ColumnRule`s only take effect on structured input.
/// The `BTreeMap` keeps keys in sorted order.
///
/// `Detection::span` and recognizer candidate spans use **byte** ranges, not char indices.
///
/// `RawDocument` is `#[non_exhaustive]`. Match with a wildcard arm.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// JSON-shaped document: field name -> value.
    Structured(BTreeMap<String, Value>),
    /// Plain or semi-structured text document.
    Text(String),
}
1776
/// The pseudonymized output from `Pipeline::redact`.
///
/// Mirrors the shape of `RawDocument`: `CleanDocument::Text(String)` or
/// `CleanDocument::Structured(BTreeMap<String, Value>)`. Destructure with a `let`-else
/// or `match`; **there is no `.text()` accessor**.
///
/// ```rust
/// use gaze_types::CleanDocument;
///
/// fn unwrap_text(doc: CleanDocument) -> Option<String> {
///     if let CleanDocument::Text(t) = doc { Some(t) } else { None }
/// }
/// ```
///
/// Contains only tokens or redacted placeholders -- no original PII values.
/// Send this (or its inner string) to the LLM; never send the original `RawDocument`.
///
/// Serializes `#[serde(untagged)]`: the inner value is emitted directly,
/// with no variant-name wrapper.
///
/// `CleanDocument` is `#[non_exhaustive]`.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// JSON-shaped document: field name -> pseudonymized value.
    Structured(BTreeMap<String, Value>),
    /// Pseudonymized plain text document.
    Text(String),
}
1804
/// Minimal structured value representation that avoids a serde_json dependency.
///
/// Only the shapes the pipeline needs are modeled: there is no float variant,
/// and integers are `i64`. Serializes `#[serde(untagged)]`, so output looks
/// like ordinary JSON values.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Null value.
    Null,
    /// Boolean value.
    Bool(bool),
    /// String value.
    String(String),
    /// Signed 64-bit integer value.
    I64(i64),
    /// Array value.
    Array(Vec<Value>),
    /// Object value (sorted keys via `BTreeMap`).
    Object(BTreeMap<String, Value>),
}
1823
1824impl Value {
1825    /// Returns the inner string for string values.
1826    pub fn as_str(&self) -> Option<&str> {
1827        match self {
1828            Self::String(value) => Some(value.as_str()),
1829            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
1830        }
1831    }
1832
1833    /// Returns a scalar string representation used for structured safety-net checks.
1834    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
1835        match self {
1836            Self::String(value) if !value.is_empty() => Some(value.clone()),
1837            Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
1838            Self::Bool(value) => Some(value.to_string()),
1839            Self::I64(value) => Some(value.to_string()),
1840        }
1841    }
1842}
1843
1844impl PartialEq<&str> for Value {
1845    fn eq(&self, other: &&str) -> bool {
1846        self.as_str() == Some(*other)
1847    }
1848}
1849
/// Value-only dictionary bundle shared with recognizers.
///
/// Maps dictionary name to its entry; lookups go through
/// `DictionaryBundle::get`.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    // Keyed by dictionary name.
    entries: HashMap<String, DictionaryEntry>,
}
1855
/// Value-only dictionary entry; compiled automatons live outside `gaze-types`.
///
/// Invariants enforced by `DictionaryEntry::new`: `terms` is non-empty, and
/// case-insensitive entries contain only ASCII terms.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Non-empty list of terms to match.
    terms: Vec<String>,
    // When false, terms are guaranteed ASCII (see `new`).
    case_sensitive: bool,
    // Where this dictionary came from (CLI request vs rulepack).
    source: DictionarySource,
}
1863
/// Source of a dictionary entry.
///
/// `#[non_exhaustive]`: downstream matches need a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Dictionary supplied by request context.
    Cli,
    /// Dictionary supplied by a rulepack.
    Rulepack,
}
1873
/// Dictionary metadata used for diagnostics and tests.
///
/// Produced in name-sorted order by `DictionaryBundle::stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of configured terms.
    pub term_count: usize,
    /// Dictionary source.
    pub source: DictionarySource,
}
1885
1886impl DictionaryStats {
1887    /// Builds dictionary diagnostics metadata.
1888    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
1889        Self {
1890            name: name.into(),
1891            term_count,
1892            source,
1893        }
1894    }
1895}
1896
/// Dictionary declared by a rulepack.
///
/// Raw declaration only; validation happens when it is turned into a
/// `DictionaryEntry` (see `DictionaryBundle::from_rulepack_terms`).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name.
    pub name: String,
    /// Dictionary terms.
    pub terms: Vec<String>,
    /// Whether matching is case-sensitive.
    pub case_sensitive: bool,
}
1908
1909impl RulepackDict {
1910    /// Builds a rulepack dictionary declaration.
1911    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
1912        Self {
1913            name: name.into(),
1914            terms,
1915            case_sensitive,
1916        }
1917    }
1918}
1919
/// Error raised when constructing invalid dictionary entries.
///
/// Returned by `DictionaryEntry::new`; both variants carry the offending
/// dictionary's name.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// Dictionary has no terms.
    Empty { name: String },
    /// ASCII-only case-insensitive matching cannot safely cover this entry.
    UnicodeInsensitiveUnsupported { name: String },
}
1929
1930impl fmt::Display for DictionaryLoadError {
1931    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1932        match self {
1933            Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
1934            Self::UnicodeInsensitiveUnsupported { name } => write!(
1935                f,
1936                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
1937            ),
1938        }
1939    }
1940}
1941
// `Display` and `Debug` exist above; the default `Error` methods suffice.
impl std::error::Error for DictionaryLoadError {}
1943
1944impl DictionaryBundle {
1945    /// Builds a bundle from rulepack dictionaries.
1946    pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
1947        let mut entries = HashMap::with_capacity(terms.len());
1948        for dictionary in terms {
1949            let entry = DictionaryEntry::new(
1950                &dictionary.name,
1951                dictionary.terms.clone(),
1952                dictionary.case_sensitive,
1953                DictionarySource::Rulepack,
1954            )
1955            .expect("Policy validates dictionary terms before bundle construction");
1956            entries.insert(dictionary.name.clone(), entry);
1957        }
1958        Self { entries }
1959    }
1960
1961    /// Builds a bundle from pre-built dictionary entries.
1962    pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
1963        Self {
1964            entries: entries.into_iter().collect(),
1965        }
1966    }
1967
1968    /// Merges two bundles, preferring entries from the second bundle on name conflicts.
1969    pub fn merge(a: Self, b: Self) -> Self {
1970        let mut entries = a.entries;
1971        entries.extend(b.entries);
1972        Self { entries }
1973    }
1974
1975    /// Returns a dictionary by name.
1976    pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
1977        self.entries.get(name)
1978    }
1979
1980    /// Returns sorted dictionary stats.
1981    pub fn stats(&self) -> Vec<DictionaryStats> {
1982        let mut stats = self
1983            .entries
1984            .iter()
1985            .map(|(name, entry)| DictionaryStats {
1986                name: name.clone(),
1987                term_count: entry.terms.len(),
1988                source: entry.source,
1989            })
1990            .collect::<Vec<_>>();
1991        stats.sort_by(|a, b| a.name.cmp(&b.name));
1992        stats
1993    }
1994}
1995
1996impl DictionaryEntry {
1997    /// Creates a validated value-only dictionary entry.
1998    pub fn new(
1999        name: &str,
2000        terms: Vec<String>,
2001        case_sensitive: bool,
2002        source: DictionarySource,
2003    ) -> Result<Self, DictionaryLoadError> {
2004        if terms.is_empty() {
2005            return Err(DictionaryLoadError::Empty {
2006                name: name.to_string(),
2007            });
2008        }
2009        if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
2010            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
2011                name: name.to_string(),
2012            });
2013        }
2014        Ok(Self {
2015            terms,
2016            case_sensitive,
2017            source,
2018        })
2019    }
2020
2021    /// Returns whether matching is case-sensitive.
2022    pub fn case_sensitive(&self) -> bool {
2023        self.case_sensitive
2024    }
2025
2026    /// Returns configured dictionary terms.
2027    pub fn terms(&self) -> &[String] {
2028        &self.terms
2029    }
2030}
2031
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let result = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli);

        let err = result.expect_err("empty dictionaries must fail closed");
        assert!(matches!(err, DictionaryLoadError::Empty { name } if name == "empty"));
    }

    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let result = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli);

        let err = result.expect_err("unicode case-insensitive dictionaries must fail closed");
        assert!(matches!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
        ));
    }
}
2060
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    // Minimal no-op logger proving the trait can be implemented locally.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    // Compile-time probe: instantiates only when `T: Send + Sync`.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    #[test]
    fn redaction_log_error_display_is_stable() {
        // Display strings are a contract; pin them exactly.
        assert_eq!(
            RedactionLogError::Sqlite("write failed".to_string()).to_string(),
            "sqlite redaction log error: write failed"
        );
        assert_eq!(
            RedactionLogError::Backend("sink failed".to_string()).to_string(),
            "backend redaction log error: sink failed"
        );
    }

    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        // A representative entry with every optional field absent.
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
        };

        // Exercise the trait through a `dyn` reference, as the pipeline would.
        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }
}
2115
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    // Helper: a token span whose clean and raw ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    // Helper: run the safety-net diff for one suspect span.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);

        // Bytes 5..10 fall between the two tokens.
        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);

        // The first-gap-only rule is intentional for v0.6.1; full gap
        // enumeration is deferred until the report format can carry it.
        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);

        // Both overlapping tokens mismatch; the first (Name) is reported.
        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);

        // Back-to-back spans leave no gap at the shared boundary.
        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        // Leading bleed: suspect starts before the token.
        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        // Trailing bleed: suspect extends past the token.
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        // Middle bleed: gap between two covering tokens.
        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);

        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        // A zero-width span covers no bytes, so nothing can leak.
        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        // No rendered error message may echo scanned document content.
        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
2282
/// Shared recognizer contract for locale-aware PII candidates.
pub trait Recognizer: Send + Sync {
    /// Stable recognizer identifier.
    fn id(&self) -> &str;
    /// PII class supported by this recognizer.
    fn supported_class(&self) -> &PiiClass;
    /// Detects PII candidates in the supplied input and context.
    ///
    /// Candidate spans are **byte** ranges into `input`.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family used for candidate token emission.
    fn token_family(&self) -> &str;
    /// Optional validator kind used by pre-resolver validator-veto.
    ///
    /// Defaults to `None`: no validator veto applies.
    fn validator_kind(&self) -> Option<ValidatorKind> {
        None
    }
    /// Locales where this recognizer is active.
    ///
    /// Defaults to `[LocaleTag::Global]`, i.e. active under every chain.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
2302
/// Candidate PII span emitted by a recognizer before final conflict resolution.
///
/// Spans are **byte** ranges into the original input, consistent with
/// `Detection::span`.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Byte span in the original input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Recognizer identifier.
    pub recognizer_id: String,
    /// Recognizer confidence score.
    pub score: f32,
    /// Rule or recognizer priority.
    pub priority: i32,
    /// Optional canonical representation for validation/merge logic.
    pub canonical_form: Option<String>,
    /// Token family used for output token shape.
    pub token_family: String,
    /// Candidate source label.
    pub source: String,
    /// Conflict tier that decided this candidate.
    pub decided_by: ConflictTier,
    /// Sources merged into this candidate.
    pub merged_sources: Vec<String>,
}
2328
2329impl Candidate {
2330    /// Builds a recognizer candidate.
2331    #[allow(clippy::too_many_arguments)]
2332    pub fn new(
2333        span: Range<usize>,
2334        class: PiiClass,
2335        recognizer_id: impl Into<String>,
2336        score: f32,
2337        priority: i32,
2338        canonical_form: Option<String>,
2339        token_family: impl Into<String>,
2340        source: impl Into<String>,
2341        decided_by: ConflictTier,
2342        merged_sources: Vec<String>,
2343    ) -> Self {
2344        Self {
2345            span,
2346            class,
2347            recognizer_id: recognizer_id.into(),
2348            score,
2349            priority,
2350            canonical_form,
2351            token_family: token_family.into(),
2352            source: source.into(),
2353            decided_by,
2354            merged_sources,
2355        }
2356    }
2357
2358    /// Returns this candidate with a translated span.
2359    pub fn with_span(mut self, span: Range<usize>) -> Self {
2360        self.span = span;
2361        self
2362    }
2363}
2364
/// Context supplied to recognizers during detection.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Active locale chain.
    pub locale_chain: &'a [LocaleTag],
    /// Active dictionary bundle.
    pub dictionaries: &'a DictionaryBundle,
    /// Reserved field-aware matching slot; intentionally unit in v0.5 Phase B.
    pub fields: &'a (),
    /// Whether a recognizer degraded due to unavailable optional capability.
    ///
    /// `Cell` gives single-threaded interior mutability, so recognizers can
    /// set the flag through a shared `&DetectContext`.
    pub degraded: Cell<bool>,
}
2377
2378impl<'a> DetectContext<'a> {
2379    /// Builds detection context for a recognizer pass.
2380    pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
2381        Self {
2382            locale_chain,
2383            dictionaries,
2384            fields: &(),
2385            degraded: Cell::new(false),
2386        }
2387    }
2388}
2389
2390fn ensure_global(tags: &mut Vec<LocaleTag>) {
2391    if !tags.contains(&LocaleTag::Global) {
2392        tags.push(LocaleTag::Global);
2393    }
2394}
2395
/// Loose structural check that `raw` looks like a BCP-47 language tag:
/// a 2-8 letter ASCII primary language subtag, followed by any number of
/// 2-8 character ASCII alphanumeric subtags.
///
/// NOTE(review): singleton subtags (e.g. the `x` in `en-x-private`) are
/// rejected by the 2..=8 length rule -- confirm that extension/private-use
/// tags are intentionally out of scope for this check.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut subtags = raw.split('-');
    let language_ok = subtags.next().map_or(false, |language| {
        (2..=8).contains(&language.len()) && language.chars().all(|ch| ch.is_ascii_alphabetic())
    });
    language_ok
        && subtags.all(|subtag| {
            (2..=8).contains(&subtag.len()) && subtag.chars().all(|ch| ch.is_ascii_alphanumeric())
        })
}
2408
/// Normalizes a raw locale tag: the language subtag is lowercased,
/// two-letter alphabetic subtags (regions) are uppercased, and every other
/// subtag is lowercased.
fn canonical_other(raw: &str) -> String {
    let mut subtags = raw.split('-');
    // `split` always yields at least one part, so the default never fires
    // for non-empty input; for "" it produces "" just like the join would.
    let mut canonical = subtags.next().unwrap_or_default().to_ascii_lowercase();
    for subtag in subtags {
        canonical.push('-');
        let looks_like_region =
            subtag.len() == 2 && subtag.chars().all(|ch| ch.is_ascii_alphabetic());
        if looks_like_region {
            canonical.push_str(&subtag.to_ascii_uppercase());
        } else {
            canonical.push_str(&subtag.to_ascii_lowercase());
        }
    }
    canonical
}