Skip to main content

gaze_types/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use sha3::{Digest, Keccak256};
10use thiserror::Error;
11
/// Shared detector contract for text-only PII detection.
///
/// The supertrait bounds require every implementor to be `Send + Sync`.
pub trait Detector: Send + Sync {
    /// Detect PII spans in the supplied input string.
    ///
    /// Each reported span is returned as a [`Detection`].
    fn detect(&self, input: &str) -> Vec<Detection>;
}
17
/// The category of a detected PII span.
///
/// Built-in variants: `Email`, `Name`, `Location`, `Organization`. Tenant-specific PII
/// (case references, titles, internal codes) is carried as `PiiClass::Custom(String)`.
/// **There is no `Phone` variant** -- phone detection is provided by recognizers in
/// `gaze-recognizers` and surfaces as either a `Custom("phone")` class or a class
/// defined by a rulepack.
///
/// `PiiClass` is exhaustive. Match every variant explicitly so new built-in classes
/// force call sites to review their handling at compile time:
///
/// ```rust
/// use gaze_types::PiiClass;
///
/// fn label(class: &PiiClass) -> &'static str {
///     match class {
///         PiiClass::Email        => "email",
///         PiiClass::Name         => "name",
///         PiiClass::Location     => "location",
///         PiiClass::Organization => "org",
///         PiiClass::Custom(_)    => "pii",
///     }
/// }
/// ```
///
/// Policy TOML uses the lowercase forms `email` / `name` / `location` / `organization`,
/// and tenant classes are spelled like `custom:case_ref` (lowercase, snake_case).
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so the built-in classes
/// sort before any `Custom` class.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Email address class.
    Email,
    /// Person name class.
    Name,
    /// Location class.
    Location,
    /// Organization class.
    Organization,
    /// Tenant- or policy-defined class, identified by the wrapped name.
    Custom(String),
}
58
/// Built-in class labels in stable display order.
///
/// The order is `Email`, `Name`, `Location`, `Organization`; `PiiClass::class_name`
/// indexes into this slice by position, so entries must not be reordered.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
61
/// Family names reserved for bundled collision-policy rulepacks.
///
/// Adopter policy-level custom recognizers cannot claim these names because bundled
/// families are part of the core disambiguation contract.
///
/// Names are lowercase and hyphen-separated.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
80
/// Describes where a single recognizer sits inside a collision family.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Name of the cross-class family the recognizer belongs to.
    pub family: String,
    /// Name of this recognizer's variant inside the family.
    pub variant: String,
    /// Overlap priority within a family: the lower value wins.
    pub precedence: u32,
    /// Anchor variant, if any, that later ambiguity handling requires.
    pub mandatory_anchor: Option<String>,
}

impl CollisionMembership {
    /// Creates membership metadata from its four components.
    ///
    /// `family` and `variant` accept anything convertible into a `String`;
    /// `precedence` and `mandatory_anchor` are stored unchanged.
    pub fn new(
        family: impl Into<String>,
        variant: impl Into<String>,
        precedence: u32,
        mandatory_anchor: Option<String>,
    ) -> Self {
        let family: String = family.into();
        let variant: String = variant.into();
        CollisionMembership {
            family,
            variant,
            precedence,
            mandatory_anchor,
        }
    }
}
111
112impl PiiClass {
113    /// Parses a policy class name into the shared class vocabulary.
114    pub fn from_policy_name(input: &str) -> Option<Self> {
115        match input {
116            "email" => Some(Self::Email),
117            "name" => Some(Self::Name),
118            "location" => Some(Self::Location),
119            "organization" => Some(Self::Organization),
120            custom if custom.starts_with("custom:") => {
121                let name = custom.trim_start_matches("custom:");
122                (!name.trim().is_empty()).then(|| Self::custom(name))
123            }
124            _ => None,
125        }
126    }
127
128    /// Returns the built-in class variants.
129    pub fn builtin_variants() -> &'static [PiiClass] {
130        &[
131            PiiClass::Email,
132            PiiClass::Name,
133            PiiClass::Location,
134            PiiClass::Organization,
135        ]
136    }
137
138    /// Builds a normalized custom class name.
139    pub fn custom(name: &str) -> Self {
140        let mut normalized = String::new();
141        let mut pending_underscore = false;
142        for ch in name.trim().chars() {
143            if ch.is_ascii_alphanumeric() {
144                if pending_underscore && !normalized.is_empty() {
145                    normalized.push('_');
146                }
147                normalized.push(ch.to_ascii_lowercase());
148                pending_underscore = false;
149            } else {
150                pending_underscore = true;
151            }
152        }
153
154        Self::Custom(normalized)
155    }
156
157    /// Returns the normalized custom class name for custom classes.
158    pub fn as_custom_name(&self) -> Option<&str> {
159        match self {
160            Self::Custom(name) => Some(name.as_str()),
161            Self::Email | Self::Name | Self::Location | Self::Organization => None,
162        }
163    }
164
165    /// Returns the audit/token display label for this class.
166    pub fn class_name(&self) -> String {
167        match self {
168            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
169            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
170            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
171            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
172            Self::Custom(name) => format!("Custom:{name}"),
173        }
174    }
175
176    /// Returns the canonical audit/serde label for this class.
177    pub fn to_canonical_str(&self) -> String {
178        match self {
179            Self::Email => "email".to_string(),
180            Self::Name => "name".to_string(),
181            Self::Location => "location".to_string(),
182            Self::Organization => "organization".to_string(),
183            Self::Custom(name) => format!("custom:{name}"),
184        }
185    }
186
187    /// Parses the canonical audit/serde label for a PII class.
188    pub fn from_canonical_str(value: &str) -> Option<Self> {
189        match value {
190            "email" | "Email" => Some(Self::Email),
191            "name" | "Name" => Some(Self::Name),
192            "location" | "Location" => Some(Self::Location),
193            "organization" | "Organization" => Some(Self::Organization),
194            custom if custom.starts_with("custom:") => {
195                let name = &custom["custom:".len()..];
196                (!name.is_empty()).then(|| Self::Custom(name.to_string()))
197            }
198            _ => None,
199        }
200    }
201}
202
203/// Audit-canonical form of [`PiiClass`].
204///
205/// Serializes as `"email"`, `"name"`, `"custom:foo"`, and similar canonical
206/// strings. Use this wrapper for audit-row JSON only. Session snapshots use
207/// bare [`PiiClass`] serde so their byte shape stays stable.
208#[derive(Debug, Clone, PartialEq, Eq)]
209#[non_exhaustive]
210pub struct PiiClassAudit(pub PiiClass);
211
212impl PiiClassAudit {
213    /// Builds an audit-canonical class wrapper.
214    pub fn new(class: PiiClass) -> Self {
215        Self(class)
216    }
217
218    /// Unwraps the underlying class.
219    pub fn into_inner(self) -> PiiClass {
220        self.0
221    }
222}
223
224impl Serialize for PiiClassAudit {
225    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
226    where
227        S: serde::Serializer,
228    {
229        serializer.serialize_str(&self.0.to_canonical_str())
230    }
231}
232
233impl<'de> Deserialize<'de> for PiiClassAudit {
234    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
235    where
236        D: serde::Deserializer<'de>,
237    {
238        let value = String::deserialize(deserializer)?;
239        PiiClass::from_canonical_str(&value)
240            .map(Self)
241            .ok_or_else(|| {
242                serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
243            })
244    }
245}
246
247mod pii_class_audit_serde {
248    use super::{PiiClass, PiiClassAudit};
249    use serde::{Deserialize, Deserializer, Serialize, Serializer};
250
251    pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
252    where
253        S: Serializer,
254    {
255        PiiClassAudit::new(class.clone()).serialize(serializer)
256    }
257
258    pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
259    where
260        D: Deserializer<'de>,
261    {
262        Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
263    }
264}
265
266/// A candidate recognizer/class pair that lost ambiguity resolution.
267#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
268#[non_exhaustive]
269pub struct LosingCandidate {
270    /// PII class proposed by the losing recognizer.
271    #[serde(with = "pii_class_audit_serde")]
272    pub class: PiiClass,
273    /// Stable recognizer identifier for traceability.
274    pub recognizer_id: String,
275}
276
277impl LosingCandidate {
278    /// Builds a losing ambiguity candidate.
279    pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
280        Self {
281            class,
282            recognizer_id: recognizer_id.into(),
283        }
284    }
285}
286
287/// Structured metadata describing an ambiguity outcome.
288#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
289#[non_exhaustive]
290pub struct AmbiguityRecord {
291    /// The family-level class assigned when disambiguation failed.
292    #[serde(with = "pii_class_audit_serde")]
293    pub ambiguity_class: PiiClass,
294    /// Variants that could not be disambiguated.
295    ///
296    /// Producers must keep this list stable by sorting `recognizer_id` ascending.
297    pub losing_candidates: Vec<LosingCandidate>,
298    /// Why disambiguation failed.
299    pub reason: AmbiguityReason,
300}
301
302impl AmbiguityRecord {
303    /// Builds a structured ambiguity record.
304    pub fn new(
305        ambiguity_class: PiiClass,
306        losing_candidates: Vec<LosingCandidate>,
307        reason: AmbiguityReason,
308    ) -> Self {
309        Self {
310            ambiguity_class,
311            losing_candidates,
312            reason,
313        }
314    }
315}
316
/// Closed set of ambiguity outcomes recorded by the audit side-channel.
///
/// Variants serialize under their `snake_case` names (for example `no_anchor`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// Span matched a multi-recognizer family and no anchor cue resolved it.
    NoAnchor,
    /// Multiple validator-stage recognizers remained viable for the same span.
    ValidatorIndeterminate,
    /// Span matched recognizers across two or more distinct PII class families.
    MultiFamilyMatch,
    /// Multiple variants had the same precedence and no discriminator resolved them.
    PrecedenceTie,
}
331
/// Closed validator failure reasons recorded by audit metadata.
///
/// Variants serialize under their `snake_case` names; two variants additionally
/// accept an alias spelling on deserialization (see the individual variants).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    /// Luhn checksum validation failed.
    LuhnFailed,
    /// IBAN MOD-97 validation failed.
    IbanMod97Failed,
    /// Email RFC-style validation failed.
    ///
    /// Also deserializes from the alias `email_rfc_failed`.
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    /// E.164 phone validation failed.
    ///
    /// Also deserializes from the alias `e164_phone_failed`.
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    /// National phone parser accepted the number but region validation failed.
    PhoneNationalRegionMismatch,
    /// IPv4 parser rejected the candidate.
    Ipv4ParseFailed,
    /// IPv6 parser rejected the candidate.
    Ipv6ParseFailed,
    /// EIP-55 Ethereum checksum validation failed.
    EthEip55ChecksumFailed,
    /// Aadhaar Verhoeff checksum validation failed.
    AadhaarVerhoeffFailed,
    /// French NIR MOD-97 key validation failed.
    FrNirMod97Failed,
    /// German Steuer-ID MOD 11,10 checksum validation failed.
    DeSteuerIdMod1110Failed,
    /// Dutch BSN MOD-11 checksum validation failed.
    BsnMod11Failed,
    /// Brazilian CPF MOD-11 checksum validation failed.
    CpfMod11Failed,
    /// Brazilian CNPJ MOD-11 checksum validation failed.
    CnpjMod11Failed,
    /// UK NHS number MOD-11 checksum validation failed.
    UkNhsMod11Failed,
}
370
/// Typed validator outcome used by the pre-resolver validator-veto phase.
///
/// Variants serialize under their `snake_case` names.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// Candidate passed validation; canonical form may be supplied by the validator.
    Pass { canonical_form: Option<String> },
    /// Candidate failed validation with a closed, auditable reason.
    Fail { reason: ValidatorFailReason },
    /// Recognizer has no validator for this candidate.
    NotApplicable,
}
383
/// Error returned when a rulepack names a validator unsupported by this build.
///
/// Its `Display` text is `unsupported validator: <kind>`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    /// Validator kind is not known or is gated behind a disabled feature.
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        /// Unsupported validator kind, exactly as it was supplied.
        kind: String,
    },
}
395
/// Closed set of validator implementations used by validator-backed recognizers.
///
/// The phone validators exist only when the `phone-parser` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    /// Basic email shape validator.
    EmailRfc,
    /// Parser-backed E.164 phone validator.
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// Parser-backed national phone validator for a fixed region.
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    /// Luhn checksum validator.
    Luhn,
    /// IBAN MOD-97 validator.
    IbanMod97,
    /// Strict decimal dotted-quad IPv4 parser.
    Ipv4Parse,
    /// RFC 4291 / RFC 5952 IPv6 textual parser.
    Ipv6Parse,
    /// EIP-55 Ethereum address checksum validator.
    EthEip55,
    /// Indian Aadhaar Verhoeff checksum validator.
    AadhaarVerhoeff,
    /// French NIR MOD-97 key validator.
    FrNirMod97,
    /// German Steuer-ID MOD 11,10 checksum validator.
    DeSteuerIdMod1110,
    /// Dutch BSN MOD-11 checksum validator.
    BsnMod11,
    /// Brazilian CPF MOD-11 checksum validator.
    CpfMod11,
    /// Brazilian CNPJ MOD-11 checksum validator.
    CnpjMod11,
    /// UK NHS number MOD-11 checksum validator.
    UkNhsMod11,
}
433
/// Regions supported by national phone validators.
///
/// Only compiled when the `phone-parser` feature is enabled.
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    /// Germany.
    De,
    /// United States.
    Us,
}
444
445impl ValidatorKind {
446    /// Parses a policy validator kind.
447    pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
448        match s {
449            "email_rfc" => Ok(Self::EmailRfc),
450            #[cfg(feature = "phone-parser")]
451            "e164_phone" => Ok(Self::E164Phone),
452            #[cfg(feature = "phone-parser")]
453            "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
454            #[cfg(feature = "phone-parser")]
455            "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
456            "luhn" => Ok(Self::Luhn),
457            "iban_mod97" => Ok(Self::IbanMod97),
458            "ipv4_parse" => Ok(Self::Ipv4Parse),
459            "ipv6_parse" => Ok(Self::Ipv6Parse),
460            "eth_eip55" => Ok(Self::EthEip55),
461            "aadhaar_verhoeff" => Ok(Self::AadhaarVerhoeff),
462            "fr_nir_mod97" => Ok(Self::FrNirMod97),
463            "de_steuer_id_mod1110" => Ok(Self::DeSteuerIdMod1110),
464            "bsn_mod11" => Ok(Self::BsnMod11),
465            "cpf_mod11" => Ok(Self::CpfMod11),
466            "cnpj_mod11" => Ok(Self::CnpjMod11),
467            "uk_nhs_mod11" => Ok(Self::UkNhsMod11),
468            other => Err(ValidatorKindParseError::UnsupportedValidator {
469                kind: other.to_string(),
470            }),
471        }
472    }
473
474    /// Returns whether the validator accepts the input.
475    pub fn validates(self, input: &str) -> bool {
476        match self {
477            Self::AadhaarVerhoeff => aadhaar_verhoeff_check(input),
478            Self::FrNirMod97 => fr_nir_mod97_check(input),
479            Self::DeSteuerIdMod1110 => de_steuer_id_mod1110_check(input),
480            Self::BsnMod11 => bsn_mod11_check(input),
481            Self::CpfMod11 => cpf_mod11_check(input),
482            Self::CnpjMod11 => cnpj_mod11_check(input),
483            Self::UkNhsMod11 => uk_nhs_mod11_check(input),
484            _ => self.canonical_form(input).is_some(),
485        }
486    }
487
488    /// Applies validation and returns a typed outcome for audit.
489    pub fn validate(self, input: &str) -> ValidatorOutcome {
490        match self.canonical_form(input) {
491            Some(canonical_form) => ValidatorOutcome::Pass {
492                canonical_form: Some(canonical_form),
493            },
494            None => ValidatorOutcome::Fail {
495                reason: self.fail_reason(),
496            },
497        }
498    }
499
500    /// Returns the canonical form for accepted input.
501    pub fn canonical_form(self, input: &str) -> Option<String> {
502        match self {
503            Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
504            #[cfg(feature = "phone-parser")]
505            Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
506            #[cfg(feature = "phone-parser")]
507            Self::E164PhoneNational(region) => validate_phone_national(region, input),
508            Self::Luhn => luhn_check(input).then(|| input.to_string()),
509            Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
510            Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
511            Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
512            Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
513            Self::AadhaarVerhoeff => {
514                canonical_ascii_digits::<12>(input).filter(|_| aadhaar_verhoeff_check(input))
515            }
516            Self::FrNirMod97 => {
517                canonical_ascii_digits::<15>(input).filter(|_| fr_nir_mod97_check(input))
518            }
519            Self::DeSteuerIdMod1110 => {
520                canonical_ascii_digits::<11>(input).filter(|_| de_steuer_id_mod1110_check(input))
521            }
522            Self::BsnMod11 => canonical_ascii_digits::<9>(input).filter(|_| bsn_mod11_check(input)),
523            Self::CpfMod11 => {
524                canonical_ascii_digits::<11>(input).filter(|_| cpf_mod11_check(input))
525            }
526            Self::CnpjMod11 => {
527                canonical_ascii_digits::<14>(input).filter(|_| cnpj_mod11_check(input))
528            }
529            Self::UkNhsMod11 => {
530                canonical_ascii_digits::<10>(input).filter(|_| uk_nhs_mod11_check(input))
531            }
532        }
533    }
534
535    /// Returns the audit reason emitted when validation fails.
536    pub fn fail_reason(self) -> ValidatorFailReason {
537        match self {
538            Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
539            #[cfg(feature = "phone-parser")]
540            Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
541            #[cfg(feature = "phone-parser")]
542            Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
543            Self::Luhn => ValidatorFailReason::LuhnFailed,
544            Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
545            Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
546            Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
547            Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
548            Self::AadhaarVerhoeff => ValidatorFailReason::AadhaarVerhoeffFailed,
549            Self::FrNirMod97 => ValidatorFailReason::FrNirMod97Failed,
550            Self::DeSteuerIdMod1110 => ValidatorFailReason::DeSteuerIdMod1110Failed,
551            Self::BsnMod11 => ValidatorFailReason::BsnMod11Failed,
552            Self::CpfMod11 => ValidatorFailReason::CpfMod11Failed,
553            Self::CnpjMod11 => ValidatorFailReason::CnpjMod11Failed,
554            Self::UkNhsMod11 => ValidatorFailReason::UkNhsMod11Failed,
555        }
556    }
557}
558
/// Shape-only email check: splits at the first `@` and requires a non-empty
/// local part plus a domain that contains a `.` but neither starts nor ends
/// with one.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let has_local = !local.is_empty();
            let has_dot = domain.contains('.');
            let dot_at_edge = domain.starts_with('.') || domain.ends_with('.');
            has_local && has_dot && !dot_at_edge
        }
    }
}
565
/// Accepts `input` when it parses without a default country and the parsed
/// number is reported valid by `phonenumber`.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    match phonenumber::parse(None, input) {
        Ok(parsed) => phonenumber::is_valid(&parsed),
        Err(_) => false,
    }
}
570
/// Parses `input` with `region` as the default country and returns its E.164
/// rendering when accepted.
///
/// Returns `None` when parsing fails, when the parsed country calling code is
/// not the one expected for `region`, or when the number is neither valid nor
/// one of the recognized fixture numbers.
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let (default_country, calling_code) = match region {
        Region::De => (phonenumber::country::DE, 49),
        Region::Us => (phonenumber::country::US, 1),
    };

    let parsed = phonenumber::parse(Some(default_country), input).ok()?;
    if parsed.country().code() != calling_code {
        return None;
    }

    let accepted = parsed.is_valid() || is_safe_fixture_phone(region, input);
    accepted.then(|| parsed.format().mode(phonenumber::Mode::E164).to_string())
}
590
/// Reports whether the ASCII digits of `input` match one of the hard-coded
/// fixture phone numbers for `region`.
///
/// For `Us` that is `15550100`, or a leading `1` followed by exactly ten digits
/// whose fourth to eighth digits are `55501`. For `De` it is a fixed list.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    let mut digits = String::with_capacity(input.len());
    digits.extend(input.chars().filter(|ch| ch.is_ascii_digit()));

    match region {
        Region::De => {
            const DE_FIXTURES: [&str; 6] = [
                "493000000000",
                "4915100000000",
                "4915550112233",
                "015550112233",
                "491710000000",
                "01710000000",
            ];
            DE_FIXTURES.contains(&digits.as_str())
        }
        Region::Us => {
            if digits == "15550100" {
                return true;
            }
            match digits.strip_prefix('1') {
                Some(national) => national.len() == 10 && national[3..].starts_with("55501"),
                None => false,
            }
        }
    }
}
613
/// Luhn checksum over the digits of `input`.
///
/// ASCII whitespace and `-` are skipped; any other non-digit byte rejects the
/// input. The digit count must be between 13 and 19 inclusive.
fn luhn_check(input: &str) -> bool {
    let mut digits = Vec::new();
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(byte - b'0'),
            b'-' => {}
            separator if separator.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    if digits.len() < 13 || digits.len() > 19 {
        return false;
    }

    // Walk right-to-left, doubling every second digit (starting with the one
    // next to the rightmost) and folding two-digit products back into one digit.
    let mut total = 0u32;
    let mut double = false;
    for digit in digits.iter().rev() {
        let mut value = u32::from(*digit);
        if double {
            value *= 2;
            if value > 9 {
                value -= 9;
            }
        }
        total += value;
        double = !double;
    }
    total % 10 == 0
}
646
/// Validates an IBAN with the MOD-97 check.
///
/// The input is canonicalized with [`iban_canonicalize`] (ASCII whitespace
/// removed, ASCII letters upper-cased). It is accepted when the canonical form
/// is 15 to 34 ASCII alphanumerics and, after moving the first four characters
/// to the end and expanding letters to `A = 10 ..= Z = 35`, the resulting
/// number is congruent to 1 modulo 97.
///
/// Any non-ASCII character rejects the input.
fn iban_mod97_check(input: &str) -> bool {
    let canonical = iban_canonicalize(input);
    if !(15..=34).contains(&canonical.len()) {
        return false;
    }
    // Must hold before the byte-offset split below: it guarantees one byte per
    // character, so offset 4 is always a character boundary.
    if !canonical.bytes().all(|byte| byte.is_ascii_alphanumeric()) {
        return false;
    }

    let (head, tail) = canonical.split_at(4);
    let mut remainder = 0u32;
    for byte in tail.bytes().chain(head.bytes()) {
        match byte {
            b'0'..=b'9' => {
                remainder = (remainder * 10 + u32::from(byte - b'0')) % 97;
            }
            b'A'..=b'Z' => {
                // Letters contribute two decimal digits each (10..=35).
                let value = u32::from(byte - b'A') + 10;
                remainder = (remainder * 10 + value / 10) % 97;
                remainder = (remainder * 10 + value % 10) % 97;
            }
            _ => return false,
        }
    }
    remainder == 1
}

/// Removes ASCII whitespace from `input` and upper-cases ASCII letters.
///
/// Only ASCII letters are changed. Unicode-aware upper-casing would map
/// characters such as `ſ` to `S` or `ß` to `SS`, turning non-ASCII text into
/// something that looks like IBAN characters; here such characters are kept
/// as-is so that callers can reject them.
fn iban_canonicalize(input: &str) -> String {
    input
        .chars()
        .filter(|ch| !ch.is_ascii_whitespace())
        .map(|ch| ch.to_ascii_uppercase())
        .collect()
}
680
/// Accepts `input` when the standard library parses it as an IPv4 address.
fn ipv4_parse_check(input: &str) -> bool {
    use std::net::Ipv4Addr;
    use std::str::FromStr;

    Ipv4Addr::from_str(input).is_ok()
}
684
/// Accepts `input` when the standard library parses it as an IPv6 address.
fn ipv6_parse_check(input: &str) -> bool {
    use std::net::Ipv6Addr;
    use std::str::FromStr;

    Ipv6Addr::from_str(input).is_ok()
}
688
689fn eth_eip55_check(input: &str) -> bool {
690    let Some(address) = input.strip_prefix("0x") else {
691        return false;
692    };
693    if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
694        return false;
695    }
696    if address
697        .bytes()
698        .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
699        || address
700            .bytes()
701            .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
702    {
703        return true;
704    }
705
706    let lowercase = address.to_ascii_lowercase();
707    let hash = Keccak256::digest(lowercase.as_bytes());
708    for (index, byte) in address.bytes().enumerate() {
709        if byte.is_ascii_digit() {
710            continue;
711        }
712        let hash_nibble = if index % 2 == 0 {
713            hash[index / 2] >> 4
714        } else {
715            hash[index / 2] & 0x0f
716        };
717        if (hash_nibble > 7) != byte.is_ascii_uppercase() {
718            return false;
719        }
720    }
721    true
722}
723
/// Extracts exactly `N` ASCII digits from `input` as numeric values `0..=9`.
///
/// Space, tab, newline, carriage return, `-`, `.` and `/` are skipped. Any other
/// byte, or a digit count different from `N`, yields `None`.
fn collect_ascii_digits<const N: usize>(input: &str) -> Option<[u8; N]> {
    let mut values = [0u8; N];
    let mut filled = 0usize;

    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => {
                // `get_mut` fails exactly when an (N+1)-th digit shows up.
                let slot = values.get_mut(filled)?;
                *slot = byte - b'0';
                filled += 1;
            }
            b' ' | b'\t' | b'\n' | b'\r' | b'-' | b'.' | b'/' => {}
            _ => return None,
        }
    }

    if filled == N {
        Some(values)
    } else {
        None
    }
}
742
743fn canonical_ascii_digits<const N: usize>(input: &str) -> Option<String> {
744    let digits = collect_ascii_digits::<N>(input)?;
745    let mut canonical = String::with_capacity(N);
746    for digit in digits {
747        canonical.push(char::from(b'0' + digit));
748    }
749    Some(canonical)
750}
751
/// Returns `true` when at least one element differs from the first one.
///
/// Panics for `N == 0`, because the first element is indexed unconditionally.
fn not_all_same<const N: usize>(digits: &[u8; N]) -> bool {
    let first = digits[0];
    digits.iter().skip(1).any(|digit| *digit != first)
}
755
756fn aadhaar_verhoeff_check(input: &str) -> bool {
757    const D: [[u8; 10]; 10] = [
758        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
759        [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
760        [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
761        [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
762        [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
763        [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
764        [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
765        [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
766        [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
767        [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
768    ];
769    const P: [[u8; 10]; 8] = [
770        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
771        [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
772        [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
773        [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
774        [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
775        [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
776        [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
777        [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
778    ];
779    let Some(digits) = collect_ascii_digits::<12>(input) else {
780        return false;
781    };
782    if digits[0] < 2 || !not_all_same(&digits) {
783        return false;
784    }
785    let mut checksum = 0u8;
786    for (index, digit) in digits.iter().rev().enumerate() {
787        checksum = D[checksum as usize][P[index % 8][*digit as usize] as usize];
788    }
789    checksum == 0
790}
791
792fn fr_nir_mod97_check(input: &str) -> bool {
793    let Some(digits) = collect_ascii_digits::<15>(input) else {
794        return false;
795    };
796    if !matches!(digits[0], 1 | 2 | 3 | 4 | 7 | 8) {
797        return false;
798    }
799    let month = digits[3] * 10 + digits[4];
800    if !(1..=12).contains(&month) && !(20..=42).contains(&month) && !(50..=99).contains(&month) {
801        return false;
802    }
803    let mut number = 0u32;
804    for digit in &digits[..13] {
805        number = (number * 10 + u32::from(*digit)) % 97;
806    }
807    let key = u32::from(digits[13]) * 10 + u32::from(digits[14]);
808    97 - number == key
809}
810
811fn de_steuer_id_mod1110_check(input: &str) -> bool {
812    let Some(digits) = collect_ascii_digits::<11>(input) else {
813        return false;
814    };
815    if !steuer_id_first_ten_digits_valid(&digits) {
816        return false;
817    }
818    let mut product = 10u8;
819    for digit in &digits[..10] {
820        let mut sum = (*digit + product) % 10;
821        if sum == 0 {
822            sum = 10;
823        }
824        product = (2 * sum) % 11;
825    }
826    let check = (11 - product) % 10;
827    check == digits[10]
828}
829
/// Structural check on the first ten digits of an 11-digit array.
///
/// Passes when the first digit is non-zero, exactly one digit value occurs more
/// than once, that value occurs two or three times, and one or two digit values
/// do not occur at all. Each element must be in `0..=9`.
fn steuer_id_first_ten_digits_valid(digits: &[u8; 11]) -> bool {
    if digits[0] == 0 {
        return false;
    }

    let mut counts = [0u8; 10];
    for digit in &digits[..10] {
        counts[*digit as usize] += 1;
    }

    let mut repeated = 0usize;
    let mut missing = 0usize;
    let mut has_pair_or_triple = false;
    for &count in &counts {
        match count {
            0 => missing += 1,
            1 => {}
            2 | 3 => {
                repeated += 1;
                has_pair_or_triple = true;
            }
            _ => repeated += 1,
        }
    }

    repeated == 1 && has_pair_or_triple && (missing == 1 || missing == 2)
}
843
844fn bsn_mod11_check(input: &str) -> bool {
845    let Some(digits) = collect_ascii_digits::<9>(input) else {
846        return false;
847    };
848    if !not_all_same(&digits) {
849        return false;
850    }
851    let sum: i32 = digits[..8]
852        .iter()
853        .enumerate()
854        .map(|(index, digit)| i32::from(*digit) * (9 - index as i32))
855        .sum::<i32>()
856        - i32::from(digits[8]);
857    sum.rem_euclid(11) == 0
858}
859
860fn cpf_mod11_check(input: &str) -> bool {
861    let Some(digits) = collect_ascii_digits::<11>(input) else {
862        return false;
863    };
864    if !not_all_same(&digits) {
865        return false;
866    }
867    mod11_check_digit(&digits[..9], 10) == digits[9]
868        && mod11_check_digit(&digits[..10], 11) == digits[10]
869}
870
871fn cnpj_mod11_check(input: &str) -> bool {
872    let Some(digits) = collect_ascii_digits::<14>(input) else {
873        return false;
874    };
875    if !not_all_same(&digits) {
876        return false;
877    }
878    const FIRST: [u8; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
879    const SECOND: [u8; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
880    weighted_mod11_check_digit(&digits[..12], &FIRST) == digits[12]
881        && weighted_mod11_check_digit(&digits[..13], &SECOND) == digits[13]
882}
883
884fn uk_nhs_mod11_check(input: &str) -> bool {
885    let Some(digits) = collect_ascii_digits::<10>(input) else {
886        return false;
887    };
888    if !not_all_same(&digits) {
889        return false;
890    }
891    let sum: u32 = digits[..9]
892        .iter()
893        .enumerate()
894        .map(|(index, digit)| u32::from(*digit) * (10 - index as u32))
895        .sum();
896    let check = 11 - (sum % 11);
897    let check = if check == 11 { 0 } else { check };
898    check != 10 && check == u32::from(digits[9])
899}
900
/// Computes a modulo-11 check digit using weights that count down from
/// `start_weight` to 2.
///
/// Digits are paired with weights position by position; pairing stops at the
/// shorter of the two sequences. A weighted sum whose remainder modulo 11 is
/// 0 or 1 yields 0, otherwise the result is `11 - remainder`.
fn mod11_check_digit(digits: &[u8], start_weight: u8) -> u8 {
    let mut total = 0u32;
    for (&digit, weight) in digits.iter().zip((2..=start_weight).rev()) {
        total += u32::from(digit) * u32::from(weight);
    }

    match total % 11 {
        0 | 1 => 0,
        // remainder is 2..=10 here, so the result fits in 1..=9.
        remainder => (11 - remainder) as u8,
    }
}
915
/// Computes a modulo-11 check digit from an explicit weight table.
///
/// Digits and weights are paired position by position; pairing stops at the
/// shorter slice. A weighted sum whose remainder modulo 11 is 0 or 1 yields
/// 0, otherwise the result is `11 - remainder`.
fn weighted_mod11_check_digit(digits: &[u8], weights: &[u8]) -> u8 {
    let total = digits
        .iter()
        .zip(weights)
        .fold(0u32, |acc, (&digit, &weight)| {
            acc + u32::from(digit) * u32::from(weight)
        });

    match total % 11 {
        0 | 1 => 0,
        // remainder is 2..=10 here, so the result fits in 1..=9.
        remainder => (11 - remainder) as u8,
    }
}
929
/// A detected span and its class/source metadata.
///
/// `Detection` is `#[non_exhaustive]`, so code outside this crate cannot use
/// a struct literal; build values with [`Detection::new`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
    /// Byte span in the original input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Detector source identifier.
    pub source: String,
}
941
942impl Detection {
943    /// Builds a detected PII span.
944    pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
945        Self {
946            span,
947            class,
948            source: source.into(),
949        }
950    }
951}
952
/// Observer-only post-clean check (Pass 3 in the detection pipeline).
///
/// Runs against already-tokenized output. May report suspected missed PII via
/// [`LeakReport`] but **must not** mutate the token manifest, the `CleanDocument`,
/// or the restore path. Safety nets are additive defense-in-depth, not a replacement
/// for Pass 1/2 detection.
///
/// Activate at runtime with `Pipeline::with_safety_net` (post-build) or
/// `PipelineBuilder::register_safety_net` (during build), or via the CLI
/// `--safety-net=<name>` flag.
///
/// If a safety net reports a suspected miss, the caller decides the response; the
/// pipeline never silently re-cleans based on safety net output.
///
/// Implementations must be `Send + Sync`.
pub trait SafetyNet: Send + Sync {
    /// Stable backend identifier used in telemetry and audit rows.
    fn id(&self) -> &str;

    /// Locale tags supported by this safety net. Empty means global.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Checks clean text for possible PII that the manifest did not cover.
    ///
    /// `clean_text` is the text to inspect and `context` carries the emitted
    /// token manifest plus session metadata for that text.
    ///
    /// # Errors
    ///
    /// Returns a [`SafetyNetError`] when the check cannot produce a result.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
980
/// Context passed to a privacy safety net.
///
/// The context only borrows its data and is `Copy`, so it can be handed to
/// several safety nets by value. It is `#[non_exhaustive]`; build it with
/// [`SafetyNetContext::new`].
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
    /// Tokens emitted by the pseudonymization pipeline for this text segment.
    pub manifest: &'a Manifest,
    /// Active session-level locale chain. For `RawDocument::Structured`, locale
    /// gating uses this same session-level chain across all fields; structured
    /// fields do not carry per-field locale annotations.
    pub locale_chain: &'a [LocaleTag],
    /// Source document kind being checked.
    pub document_kind: DocumentKind,
    /// Optional audit session identifier.
    pub session_id: Option<&'a str>,
    /// Structured-document field path, such as `$.user.email`.
    pub field_path: Option<&'a str>,
}
998
999impl<'a> SafetyNetContext<'a> {
1000    /// Builds safety-net context for one clean text segment.
1001    pub fn new(
1002        manifest: &'a Manifest,
1003        locale_chain: &'a [LocaleTag],
1004        document_kind: DocumentKind,
1005        session_id: Option<&'a str>,
1006        field_path: Option<&'a str>,
1007    ) -> Self {
1008        Self {
1009            manifest,
1010            locale_chain,
1011            document_kind,
1012            session_id,
1013            field_path,
1014        }
1015    }
1016}
1017
/// A replacement emitted by the pseudonymization pipeline.
///
/// Records where the replacement sits in the clean text, which raw bytes it
/// stands for, and the PII class it represents. The type is
/// `#[non_exhaustive]`; build it with [`EmittedTokenSpan::new`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
    /// Byte span in the clean text.
    pub clean_span: Range<usize>,
    /// Byte span in the raw text that produced the token.
    pub raw_span: Range<usize>,
    /// PII class represented by the emitted token.
    pub class: PiiClass,
}
1029
1030impl EmittedTokenSpan {
1031    /// Builds an emitted token span.
1032    pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
1033        Self {
1034            clean_span,
1035            raw_span,
1036            class,
1037        }
1038    }
1039}
1040
/// Set of emitted token spans for one clean text segment.
///
/// [`Manifest::from_spans`] establishes the sort order documented on `spans`;
/// [`Manifest::diff_against`] binary-searches the vector and therefore relies
/// on it. The derived `Default` is an empty manifest.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
    /// Spans sorted by `clean_span.start`.
    ///
    /// Code that fills or mutates this field directly is responsible for
    /// keeping the order.
    pub spans: Vec<EmittedTokenSpan>,
}
1048
impl Manifest {
    /// Builds a manifest from spans, sorting them by clean byte start and then
    /// by clean byte end.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Diffs one safety-net suspect span against emitted token coverage.
    ///
    /// Returns `None` when the suspect span is empty, or when it is
    /// continuously covered by emitted token spans of the same class.
    ///
    /// Otherwise:
    ///
    /// - `LeakKind::Uncovered` when no emitted span, of any class, overlaps
    ///   the suspect.
    /// - `LeakKind::PartialBleed` when coverage has a gap. When multiple
    ///   uncovered gaps exist, this method deterministically returns the first
    ///   gap by byte offset; full gap enumeration is intentionally deferred to
    ///   a future report format.
    /// - `LeakKind::ClassMismatch` when coverage is continuous but at least
    ///   one covering span has a different class; the first such span is
    ///   reported.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        if suspect_span.is_empty() {
            return None;
        }

        // Skip every span that ends at or before the suspect starts.
        // NOTE(review): `partition_point` needs the predicate to be
        // partitioned, i.e. `clean_span.end` non-decreasing in sort order.
        // That holds for non-overlapping spans sorted by start; confirm that
        // nested or overlapping emitted spans cannot occur.
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        // Collect candidates that start before the suspect ends and overlap it.
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // `cursor` is the first suspect byte not yet known to be covered.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            // The next span starts after the cursor: the bytes between are a gap.
            if span.clean_span.start > cursor {
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            // Only spans that extend coverage past the cursor are considered.
            if span.clean_span.end > cursor {
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                // Advance coverage, clamped to the end of the suspect span.
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Coverage stopped before the end of the suspect: trailing gap.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered: report a class mismatch if one was seen, else `None`.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
1117
/// Returns `true` when each range starts strictly before the other one ends.
///
/// Ranges that merely touch (`a.end == b.start`) do not overlap.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    let left_entirely_before = left.end <= right.start;
    let right_entirely_before = right.end <= left.start;
    !(left_entirely_before || right_entirely_before)
}
1121
/// Suspected leak reported by an observer-only safety net.
///
/// Carries metadata only. The type derives `PartialEq` but not `Eq` because
/// `score` is a float. It is `#[non_exhaustive]`; build it with
/// [`LeakSuspect::new`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
    /// Byte span in clean text.
    pub span: Range<usize>,
    /// Mapped PII class for the suspect.
    pub class: PiiClass,
    /// Safety-net backend identifier.
    pub safety_net_id: String,
    /// Optional backend confidence score.
    pub score: Option<f32>,
    /// Leak classification after manifest correlation.
    pub kind: LeakKind,
    /// Raw backend label after validation/mapping, never source text.
    pub raw_label: String,
    /// Optional structured field path.
    pub field_path: Option<String>,
}
1141
1142impl LeakSuspect {
1143    /// Builds a safety-net leak suspect.
1144    pub fn new(
1145        span: Range<usize>,
1146        class: PiiClass,
1147        safety_net_id: impl Into<String>,
1148        score: Option<f32>,
1149        kind: LeakKind,
1150        raw_label: impl Into<String>,
1151        field_path: Option<String>,
1152    ) -> Self {
1153        Self {
1154            span,
1155            class,
1156            safety_net_id: safety_net_id.into(),
1157            score,
1158            kind,
1159            raw_label: raw_label.into(),
1160            field_path,
1161        }
1162    }
1163}
1164
/// The category of a suspected missed PII span.
///
/// `LeakKind` is `#[non_exhaustive]`. Match with a wildcard for forward compatibility.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No emitted token span overlaps the suspect span.
    ///
    /// `Manifest::diff_against` reports this regardless of class: an overlap
    /// by a token of a different class yields `PartialBleed` or
    /// `ClassMismatch` instead.
    Uncovered,
    /// The suspect is only partly covered; `uncovered` is the first gap.
    PartialBleed {
        /// First uncovered byte range in the suspect span.
        uncovered: Range<usize>,
    },
    /// The suspect is continuously covered, but by a different class.
    ClassMismatch {
        /// Class emitted by the pipeline.
        pipeline_class: PiiClass,
        /// Class reported by the safety net.
        safety_net_class: PiiClass,
    },
}
1186
/// Bytes-free telemetry emitted by safety-net orchestration.
///
/// `LeakReportTelemetry` is `#[non_exhaustive]`; code outside this crate must
/// match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// Safety net skipped because the session-level locale chain did not match.
    LocaleSkipped {
        /// Safety-net backend identifier.
        safety_net_id: String,
        /// Document kind checked.
        document_kind: DocumentKind,
        /// Optional structured field path when skip was recorded per field.
        field_path: Option<String>,
    },
}
1201
/// Aggregate leak report statistics.
///
/// [`LeakReport::from_parts`] fills these counters from a report's suspects
/// and telemetry. The derived `Default` has every counter at zero.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Number of suspects reported.
    pub suspect_count: usize,
    /// Number of uncovered suspects.
    pub uncovered_count: usize,
    /// Number of partial-bleed suspects.
    pub partial_bleed_count: usize,
    /// Number of class-mismatch suspects.
    pub class_mismatch_count: usize,
    /// Number of locale-skip telemetry events.
    pub locale_skipped_count: usize,
}
1217
/// Signed document-context metadata carried inside a session snapshot envelope.
///
/// This extension is the v0.7 bridge for `gaze-document`: it is safe to serialize
/// inside the owner-only snapshot envelope, while agent-facing files keep using
/// non-sensitive mirrors. The single `schema_version` is bundle-level; sub-files
/// do not carry independent schema versions.
///
/// Build values with [`DocumentExtension::builder`]. When serialized,
/// `preview_png_sha256` is omitted if `None`, and `clean_spans` / `codec_audit`
/// are omitted if empty; all three fall back to their defaults when absent on
/// deserialization.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
    /// Bundle-level schema version shared by clean, layout, preview, report, and manifest files.
    pub schema_version: u16,
    /// SHA-256 of `clean.md` NFC-normalized bytes.
    pub clean_md_sha256: [u8; 32],
    /// SHA-256 of canonical `layout.json` bytes.
    pub layout_json_sha256: [u8; 32],
    /// SHA-256 of canonical `report.json` bytes.
    pub report_json_sha256: [u8; 32],
    /// SHA-256 of `preview-redacted.png` bytes when a preview is present.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub preview_png_sha256: Option<[u8; 32]>,
    /// Page count reported for the source document.
    pub page_count: u32,
    /// Audit session id mirrored from the writing session for cross-pane correlation.
    pub audit_session_id: String,
    /// Signed clean.md byte spans for every emitted token.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean_spans: Vec<EmittedTokenSpan>,
    /// Codec audit rows for the decode path that produced this document extension.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub codec_audit: Vec<CodecAuditRow>,
}
1249
1250impl DocumentExtension {
1251    /// Starts a document extension builder for one bundle schema version.
1252    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
1253        DocumentExtensionBuilder {
1254            schema_version,
1255            clean_md_sha256: None,
1256            layout_json_sha256: None,
1257            report_json_sha256: None,
1258            preview_png_sha256: None,
1259            page_count: None,
1260            audit_session_id: None,
1261            clean_spans: Vec::new(),
1262            codec_audit: Vec::new(),
1263        }
1264    }
1265}
1266
/// Builder for [`DocumentExtension`] that requires signed integrity-binding fields.
///
/// Obtain one from [`DocumentExtension::builder`]. Fields held as `Option`
/// start out unset; `build` reports a
/// [`DocumentExtensionError::MissingField`] for every one of them except
/// `preview_png_sha256`, which is passed through as-is.
#[derive(Debug, Clone)]
#[must_use]
pub struct DocumentExtensionBuilder {
    /// Bundle-level schema version, fixed when the builder is created.
    schema_version: u16,
    /// Required digest for `clean.md`.
    clean_md_sha256: Option<[u8; 32]>,
    /// Required digest for `layout.json`.
    layout_json_sha256: Option<[u8; 32]>,
    /// Required digest for `report.json`.
    report_json_sha256: Option<[u8; 32]>,
    /// Optional digest for the redacted preview image.
    preview_png_sha256: Option<[u8; 32]>,
    /// Required page count.
    page_count: Option<u32>,
    /// Required audit session id.
    audit_session_id: Option<String>,
    /// Emitted token spans; empty unless set.
    clean_spans: Vec<EmittedTokenSpan>,
    /// Codec audit rows; empty unless set.
    codec_audit: Vec<CodecAuditRow>,
}
1281
1282impl DocumentExtensionBuilder {
1283    pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
1284        self.clean_md_sha256 = Some(hash);
1285        self
1286    }
1287
1288    pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
1289        self.layout_json_sha256 = Some(hash);
1290        self
1291    }
1292
1293    pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
1294        self.report_json_sha256 = Some(hash);
1295        self
1296    }
1297
1298    pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
1299        self.preview_png_sha256 = Some(hash);
1300        self
1301    }
1302
1303    pub fn page_count(mut self, page_count: u32) -> Self {
1304        self.page_count = Some(page_count);
1305        self
1306    }
1307
1308    pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
1309        self.audit_session_id = Some(audit_session_id.into());
1310        self
1311    }
1312
1313    pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
1314        self.clean_spans = clean_spans;
1315        self
1316    }
1317
1318    pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
1319        self.codec_audit = codec_audit;
1320        self
1321    }
1322
1323    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
1324        Ok(DocumentExtension {
1325            schema_version: self.schema_version,
1326            clean_md_sha256: self
1327                .clean_md_sha256
1328                .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
1329            layout_json_sha256: self
1330                .layout_json_sha256
1331                .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
1332            report_json_sha256: self
1333                .report_json_sha256
1334                .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
1335            preview_png_sha256: self.preview_png_sha256,
1336            page_count: self
1337                .page_count
1338                .ok_or(DocumentExtensionError::MissingField("page_count"))?,
1339            audit_session_id: self
1340                .audit_session_id
1341                .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
1342            clean_spans: self.clean_spans,
1343            codec_audit: self.codec_audit,
1344        })
1345    }
1346}
1347
/// Errors returned while building a [`DocumentExtension`].
///
/// `DocumentExtensionError` is `#[non_exhaustive]`; code outside this crate
/// must match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required field was never set on the builder; the payload is the
    /// field's name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
1355
/// Provenance of text extracted from a document or transcript source.
///
/// Serialized in snake_case (`ocr`, `embedded_text`, `transcript`, `hybrid`).
/// `TextOrigin` is `#[non_exhaustive]`; code outside this crate must match
/// with a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Text came from OCR over pixels.
    Ocr,
    /// Text came from an embedded text layer.
    EmbeddedText,
    /// Text came from an audio/video transcript.
    Transcript,
    /// Text came from multiple extraction paths.
    Hybrid,
}
1370
/// Orthogonal document codec capabilities delivered or advertised by a codec.
///
/// Each capability is an independent flag. The derived `Default` has every
/// flag cleared. The type is `#[non_exhaustive]`; build it with
/// [`CodecCapabilitySet::new`] or start from a constant such as
/// [`CodecCapabilitySet::TEXT_ONLY`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecCapabilitySet {
    /// Codec can emit text.
    pub text: bool,
    /// Codec can emit layout geometry.
    pub layout: bool,
    /// Codec can emit confidence buckets.
    pub confidence: bool,
    /// Codec can emit timestamps.
    pub timestamps: bool,
}
1384
1385impl CodecCapabilitySet {
1386    /// Text-only capability set.
1387    pub const TEXT_ONLY: Self = Self {
1388        text: true,
1389        layout: false,
1390        confidence: false,
1391        timestamps: false,
1392    };
1393
1394    /// Builds a codec capability bitset.
1395    pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
1396        Self {
1397            text,
1398            layout,
1399            confidence,
1400            timestamps,
1401        }
1402    }
1403
1404    /// Returns true when this set contains every requested capability bit.
1405    pub fn contains(self, requested: Self) -> bool {
1406        (!requested.text || self.text)
1407            && (!requested.layout || self.layout)
1408            && (!requested.confidence || self.confidence)
1409            && (!requested.timestamps || self.timestamps)
1410    }
1411}
1412
/// Per-codec declaration for text extraction density checks.
///
/// Variant names are serialized in snake_case (`required`, `exempt`). The
/// type derives `PartialEq` but not `Eq` because `Required` holds a float.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
    /// Require at least this many extracted text bytes per source KiB.
    Required(f32),
    /// Explicit exemption with an audit-visible reason.
    Exempt {
        /// Audit-visible reason for the exemption.
        reason: String,
    },
}
1423
1424impl Default for ExtractionDensityPolicy {
1425    fn default() -> Self {
1426        Self::Exempt {
1427            reason: "calibration_pending".to_string(),
1428        }
1429    }
1430}
1431
/// Metadata-only audit row emitted by a document codec.
///
/// The type is `#[non_exhaustive]`; create rows with [`CodecAuditRow::new`]
/// and then adjust the public fields. When serialized, `options_hash_hex` and
/// `engine_provenance` are omitted if `None` and default to `None` when
/// absent on deserialization.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecAuditRow {
    /// Stable codec id, such as `gaze.codec.tesseract`.
    pub codec_id: String,
    /// Adapter crate version, distinct from engine provenance.
    pub codec_version: String,
    /// Accepted MIME type for the decode.
    pub accepted_mime: String,
    /// Capabilities advertised by the codec.
    pub advertised: CodecCapabilitySet,
    /// Capabilities delivered for this decode.
    pub delivered: CodecCapabilitySet,
    /// Text provenance reported by the codec.
    pub text_origin: TextOrigin,
    /// Codec-output schema version, decoupled from bundle schema version.
    pub codec_output_schema_version: u16,
    /// Hash of canonical codec options, never the options themselves.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub options_hash_hex: Option<String>,
    /// Engine provenance string, without paths or raw source text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine_provenance: Option<String>,
    /// Extraction density policy declared by the codec for this MIME.
    pub extraction_density_policy: ExtractionDensityPolicy,
}
1459
1460impl CodecAuditRow {
1461    /// Builds a metadata-only codec audit row.
1462    pub fn new(
1463        codec_id: impl Into<String>,
1464        codec_version: impl Into<String>,
1465        accepted_mime: impl Into<String>,
1466        text_origin: TextOrigin,
1467    ) -> Self {
1468        Self {
1469            codec_id: codec_id.into(),
1470            codec_version: codec_version.into(),
1471            accepted_mime: accepted_mime.into(),
1472            advertised: CodecCapabilitySet::default(),
1473            delivered: CodecCapabilitySet::default(),
1474            text_origin,
1475            codec_output_schema_version: 1,
1476            options_hash_hex: None,
1477            engine_provenance: None,
1478            extraction_density_policy: ExtractionDensityPolicy::default(),
1479        }
1480    }
1481}
1482
/// Aggregated findings of one or more [`SafetyNet`] runs: suspected missed
/// PII spans, telemetry events, and summary counts.
///
/// The safety net is not authoritative; a `LeakReport` is a signal, not a confirmed
/// leak. False positives are expected. Review reports and adjust policy or recognizer
/// thresholds.
///
/// The derived `Default` is an empty report.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
    /// Suspected leaks, containing metadata only.
    pub suspects: Vec<LeakSuspect>,
    /// Bytes-free telemetry events.
    pub telemetry: Vec<LeakReportTelemetry>,
    /// Aggregated counts for callers that do not need full suspect metadata.
    pub stats: LeakReportStats,
    /// Optional replay hash.
    ///
    /// Replay determinism is guaranteed only when command path, checkpoint,
    /// operating point, min score, and decode parameters are fixed externally.
    pub replay_hash: Option<String>,
}
1503
1504impl LeakReport {
1505    /// Builds a report from suspects and telemetry.
1506    pub fn from_parts(
1507        suspects: Vec<LeakSuspect>,
1508        telemetry: Vec<LeakReportTelemetry>,
1509    ) -> LeakReport {
1510        let mut stats = LeakReportStats {
1511            suspect_count: suspects.len(),
1512            locale_skipped_count: telemetry
1513                .iter()
1514                .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
1515                .count(),
1516            ..LeakReportStats::default()
1517        };
1518        for suspect in &suspects {
1519            match suspect.kind {
1520                LeakKind::Uncovered => stats.uncovered_count += 1,
1521                LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
1522                LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
1523            }
1524        }
1525        LeakReport {
1526            suspects,
1527            telemetry,
1528            stats,
1529            replay_hash: None,
1530        }
1531    }
1532
1533    /// Merges another report into this report.
1534    pub fn extend(&mut self, other: LeakReport) {
1535        self.suspects.extend(other.suspects);
1536        self.telemetry.extend(other.telemetry);
1537        *self = LeakReport::from_parts(
1538            std::mem::take(&mut self.suspects),
1539            std::mem::take(&mut self.telemetry),
1540        );
1541    }
1542}
1543
/// Closed set of upstream OpenAI Privacy Filter labels accepted by Gaze.
///
/// [`OpenAiPrivateLabel::as_str`] returns the raw upstream spelling shown on
/// each variant. The enum is `#[non_exhaustive]`; code outside this crate
/// must match with a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// `private_person`.
    PrivatePerson,
    /// `private_address`.
    PrivateAddress,
    /// `private_email`.
    PrivateEmail,
    /// `private_phone`.
    PrivatePhone,
    /// `private_url`.
    PrivateUrl,
    /// `private_date`.
    PrivateDate,
    /// `account_number`.
    AccountNumber,
    /// `secret`.
    Secret,
}
1565
1566impl OpenAiPrivateLabel {
1567    /// Returns the raw upstream label.
1568    pub fn as_str(self) -> &'static str {
1569        match self {
1570            Self::PrivatePerson => "private_person",
1571            Self::PrivateAddress => "private_address",
1572            Self::PrivateEmail => "private_email",
1573            Self::PrivatePhone => "private_phone",
1574            Self::PrivateUrl => "private_url",
1575            Self::PrivateDate => "private_date",
1576            Self::AccountNumber => "account_number",
1577            Self::Secret => "secret",
1578        }
1579    }
1580}
1581
/// Closed safety-net PII vocabulary before mapping into `PiiClass`.
///
/// Use [`SafetyNetPiiClass::to_pii_class`] to convert a value into the shared
/// pipeline vocabulary. The enum is `#[non_exhaustive]`; code outside this
/// crate must match with a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Email address.
    Email,
    /// Person name.
    Name,
    /// Location or address.
    Location,
    /// Phone number.
    Phone,
    /// URL.
    Url,
    /// Date.
    Date,
    /// Account number.
    AccountNumber,
    /// Secret.
    Secret,
}
1603
1604impl SafetyNetPiiClass {
1605    /// Maps the safety-net class into the shared pipeline class vocabulary.
1606    pub fn to_pii_class(self) -> PiiClass {
1607        match self {
1608            Self::Email => PiiClass::Email,
1609            Self::Name => PiiClass::Name,
1610            Self::Location => PiiClass::Location,
1611            Self::Phone => PiiClass::custom("phone"),
1612            Self::Url => PiiClass::custom("url"),
1613            Self::Date => PiiClass::custom("date"),
1614            Self::AccountNumber => PiiClass::custom("account_number"),
1615            Self::Secret => PiiClass::custom("secret"),
1616        }
1617    }
1618}
1619
/// Error set for safety-net execution.
///
/// `SafetyNetError` is `#[non_exhaustive]`: variants may be added, so code
/// outside this crate must match with a wildcard arm. String payloads are
/// documented as sanitized and are embedded verbatim in the `Display` output.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// Safety net was explicitly requested but is unavailable.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Required model weights or checkpoint are missing.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Sanitized path or identifier.
        path: String,
    },
    /// Backend model could not be loaded or reached.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Backend model artifacts failed integrity verification.
    #[error("safety net model integrity mismatch: expected={expected}, actual={actual}")]
    ModelIntegrityMismatch {
        /// Expected SHA256 digest.
        expected: String,
        /// Actual SHA256 digest.
        actual: String,
    },
    /// Input exceeded configured backend limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Configured byte limit.
        limit: usize,
        /// Actual byte length.
        actual: usize,
    },
    /// Backend runtime failed.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Sanitized diagnostic message.
        message: String,
    },
    /// Backend returned invalid output.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// Sanitized diagnostic message.
        message: String,
    },
}
1671
/// Disposition applied to a detected PII span.
///
/// | Variant | Restorable | Output shape |
/// |---------|------------|--------------|
/// | `Tokenize` | Yes | Opaque token: `<hex:Class_N>` |
/// | `FormatPreserve` | Yes | Realistic-looking pseudonym (e.g., `email1.hex@gaze-fake.invalid`) |
/// | `Redact` | No | Literal `[REDACTED]` -- original value is gone |
/// | `Generalize` | No | Class label (e.g., `[Email]`) -- original value is gone |
/// | `Preserve` | - | Passes through unchanged |
///
/// `Action` is `#[non_exhaustive]`. Code outside this crate must include a
/// wildcard arm when matching on it.
/// When restore is required, use `Tokenize` or `FormatPreserve` -- `Redact` and
/// `Generalize` are irreversible.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace PII with a reversible token.
    Tokenize,
    /// Replace PII with a non-restorable redaction marker.
    Redact,
    /// Replace PII with a reversible format-preserving token.
    FormatPreserve,
    /// Replace PII with a broader category. Not restorable.
    Generalize,
    /// Preserve the original value.
    Preserve,
}
1699
/// Conflict resolution tier that selected or rejected a candidate.
///
/// `ConflictTier` is `#[non_exhaustive]`; code outside this crate must match
/// with a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict resolution was needed.
    None,
    /// Class priority decided the conflict.
    ClassPriority,
    /// Rule priority decided the conflict.
    RulePriority,
    /// Candidate score decided the conflict.
    Score,
    /// Span length decided the conflict.
    SpanLength,
    /// Same-class containment validator result decided the conflict.
    Validator,
    /// Pre-resolver validator veto rejected the candidate.
    ValidatorVeto,
    /// Cross-class collision-family policy decided the conflict.
    CollisionPolicy,
    /// Mandatory-anchor context was missing, so family-level fallback was emitted.
    AnchoredContext,
    /// Recognizer identifier decided the conflict.
    RecognizerId,
    /// Candidate was merged with another candidate.
    Merged,
    /// Safety-net redact mode stripped a suspect span.
    Redact,
    /// Safety-net resolve mode promoted a suspect span into a reversible token.
    Resolve,
    /// Safety-net fallback policy decided the outcome.
    Fallback,
}
1733
/// Safety-net fallback reason recorded in metadata-only audit rows.
///
/// `FallbackReason` is `#[non_exhaustive]`; code outside this crate must
/// match with a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum FallbackReason {
    /// The suspect overlapped an existing emitted token in a way that could not be promoted.
    OverlapConflict,
    /// A validator rejected the promoted candidate.
    ValidatorVeto,
    /// A mandatory anchor was missing for the promoted candidate.
    AnchorMissing,
    /// A follow-up safety-net pass still observed a suspect.
    ResidualSuspect,
}
1747
/// Source document kind for metadata-only audit logging.
///
/// Written into serialized [`RedactionEntry`] rows as `"structured"` or `"text"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Structured key/value document.
    Structured,
    /// Plain text document.
    Text,
}
1757
/// One row of redaction metadata emitted to a [`RedactionLogger`].
///
/// Fields identify the PII class, action taken, session ID, source document kind,
/// conflict-resolution metadata, and timestamp. Does **not** contain the original PII
/// value, the token string, or any identifiable content beyond what a compliance audit
/// requires.
///
/// `RedactionEntry` is `#[non_exhaustive]`; adopters must construct via the public
/// constructor or destructure with a wildcard pattern.
///
/// # Serialization
///
/// `Serialize` is implemented by hand. The `recognizer_*` fields, every
/// `provenance_*` field, and `backend_silently_dropped` are left out of the output
/// when they are `None`; the remaining optional fields are always written.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Detector or recognizer source identifier.
    pub source: String,
    /// Stable semantic recognizer identifier, when available.
    pub recognizer_id: Option<String>,
    /// Versioned recognizer artifact/rule identifier, when available.
    pub recognizer_version_id: Option<String>,
    /// PII class affected by the decision.
    pub class: PiiClass,
    /// Policy action applied to the span.
    pub action: Action,
    /// Optional structured field name.
    pub field_name: Option<String>,
    /// Source document kind.
    pub document_kind: DocumentKind,
    /// Whether this entry records a loser in conflict resolution.
    pub conflict_loser: bool,
    /// Conflict tier that decided the outcome.
    pub decided_by: ConflictTier,
    /// Creation timestamp in epoch milliseconds.
    pub created_at: i64,
    /// Optional session identifier.
    pub session_id: Option<String>,
    /// Optional validator failure reason for a vetoed candidate.
    pub validator_fail_reason: Option<ValidatorFailReason>,
    /// Optional ambiguity metadata for a family-level fallback.
    pub ambiguity_record: Option<AmbiguityRecord>,
    /// Collision family that influenced this decision.
    pub collision_family: Option<String>,
    /// Collision variant that influenced this decision.
    pub collision_variant: Option<String>,
    /// Safety-net fallback reason, when fallback policy handled the row.
    pub fallback_triggered: Option<FallbackReason>,
    /// NER/pipeline provenance stage for audit-only producer attribution.
    pub provenance_stage: Option<String>,
    /// Provenance model identifier, as passed to `with_provenance_metadata`.
    pub provenance_model_id: Option<String>,
    /// Provenance model version, as passed to `with_provenance_metadata`.
    pub provenance_model_version: Option<String>,
    /// Provenance artifact digest string.
    // NOTE(review): presumably a SHA-256 hex digest of the model artifact -- confirm with the producer.
    pub provenance_artifact_sha256: Option<String>,
    /// Provenance tokenizer digest string.
    // NOTE(review): presumably a SHA-256 hex digest of the tokenizer -- confirm with the producer.
    pub provenance_tokenizer_sha256: Option<String>,
    /// Resolved locale reported by the producer, as passed to `with_provenance_metadata`.
    pub provenance_locale_resolved: Option<String>,
    /// Locale match kind reported by the producer, as passed to `with_provenance_metadata`.
    pub provenance_locale_match_kind: Option<String>,
    /// Canonical class label reported by the producer.
    pub provenance_canonical_class: Option<String>,
    /// Native (producer-specific) class label reported by the producer.
    pub provenance_native_class: Option<String>,
    /// Producer confidence, stored as the `to_string()` rendering of the `f64`
    /// passed to `with_provenance_metadata`.
    pub provenance_confidence: Option<String>,
    /// Merge lineage reported by the producer, as passed to `with_provenance_metadata`.
    pub provenance_merged_from: Option<String>,
    /// Locale-aware safety-net backend ids dropped by first-match-wins routing.
    pub backend_silently_dropped: Option<Vec<String>>,
}
1817
1818impl Serialize for RedactionEntry {
1819    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
1820    where
1821        S: serde::Serializer,
1822    {
1823        use serde::ser::SerializeStruct;
1824
1825        let mut len = 14;
1826        if self.recognizer_id.is_some() {
1827            len += 1;
1828        }
1829        if self.recognizer_version_id.is_some() {
1830            len += 1;
1831        }
1832        len += [
1833            self.provenance_stage.as_ref(),
1834            self.provenance_model_id.as_ref(),
1835            self.provenance_model_version.as_ref(),
1836            self.provenance_artifact_sha256.as_ref(),
1837            self.provenance_tokenizer_sha256.as_ref(),
1838            self.provenance_locale_resolved.as_ref(),
1839            self.provenance_locale_match_kind.as_ref(),
1840            self.provenance_canonical_class.as_ref(),
1841            self.provenance_native_class.as_ref(),
1842            self.provenance_confidence.as_ref(),
1843            self.provenance_merged_from.as_ref(),
1844        ]
1845        .into_iter()
1846        .filter(|value| value.is_some())
1847        .count();
1848        if self.backend_silently_dropped.is_some() {
1849            len += 1;
1850        }
1851        let mut state = serializer.serialize_struct("RedactionEntry", len)?;
1852        state.serialize_field("source", &self.source)?;
1853        if let Some(recognizer_id) = &self.recognizer_id {
1854            state.serialize_field("recognizer_id", recognizer_id)?;
1855        }
1856        if let Some(recognizer_version_id) = &self.recognizer_version_id {
1857            state.serialize_field("recognizer_version_id", recognizer_version_id)?;
1858        }
1859        state.serialize_field("class", &self.class.to_canonical_str())?;
1860        state.serialize_field("action", redaction_action_as_str(self.action))?;
1861        state.serialize_field("field_name", &self.field_name)?;
1862        state.serialize_field(
1863            "document_kind",
1864            redaction_document_kind_as_str(self.document_kind),
1865        )?;
1866        state.serialize_field("conflict_loser", &self.conflict_loser)?;
1867        state.serialize_field(
1868            "decided_by",
1869            redaction_conflict_tier_as_str(self.decided_by),
1870        )?;
1871        state.serialize_field("created_at", &self.created_at)?;
1872        state.serialize_field("session_id", &self.session_id)?;
1873        state.serialize_field("validator_fail_reason", &self.validator_fail_reason)?;
1874        state.serialize_field("ambiguity_record", &self.ambiguity_record)?;
1875        state.serialize_field("collision_family", &self.collision_family)?;
1876        state.serialize_field("collision_variant", &self.collision_variant)?;
1877        state.serialize_field("fallback_triggered", &self.fallback_triggered)?;
1878        if let Some(value) = &self.provenance_stage {
1879            state.serialize_field("provenance_stage", value)?;
1880        }
1881        if let Some(value) = &self.provenance_model_id {
1882            state.serialize_field("provenance_model_id", value)?;
1883        }
1884        if let Some(value) = &self.provenance_model_version {
1885            state.serialize_field("provenance_model_version", value)?;
1886        }
1887        if let Some(value) = &self.provenance_artifact_sha256 {
1888            state.serialize_field("provenance_artifact_sha256", value)?;
1889        }
1890        if let Some(value) = &self.provenance_tokenizer_sha256 {
1891            state.serialize_field("provenance_tokenizer_sha256", value)?;
1892        }
1893        if let Some(value) = &self.provenance_locale_resolved {
1894            state.serialize_field("provenance_locale_resolved", value)?;
1895        }
1896        if let Some(value) = &self.provenance_locale_match_kind {
1897            state.serialize_field("provenance_locale_match_kind", value)?;
1898        }
1899        if let Some(value) = &self.provenance_canonical_class {
1900            state.serialize_field("provenance_canonical_class", value)?;
1901        }
1902        if let Some(value) = &self.provenance_native_class {
1903            state.serialize_field("provenance_native_class", value)?;
1904        }
1905        if let Some(value) = &self.provenance_confidence {
1906            state.serialize_field("provenance_confidence", value)?;
1907        }
1908        if let Some(value) = &self.provenance_merged_from {
1909            state.serialize_field("provenance_merged_from", value)?;
1910        }
1911        if let Some(dropped) = &self.backend_silently_dropped {
1912            state.serialize_field("backend_silently_dropped", dropped)?;
1913        }
1914        state.end()
1915    }
1916}
1917
1918fn redaction_action_as_str(action: Action) -> &'static str {
1919    match action {
1920        Action::Tokenize => "tokenize",
1921        Action::Redact => "redact",
1922        Action::FormatPreserve => "format_preserve",
1923        Action::Generalize => "generalize",
1924        Action::Preserve => "preserve",
1925    }
1926}
1927
1928fn redaction_document_kind_as_str(kind: DocumentKind) -> &'static str {
1929    match kind {
1930        DocumentKind::Structured => "structured",
1931        DocumentKind::Text => "text",
1932    }
1933}
1934
1935fn redaction_conflict_tier_as_str(tier: ConflictTier) -> &'static str {
1936    match tier {
1937        ConflictTier::None => "none",
1938        ConflictTier::ClassPriority => "class_priority",
1939        ConflictTier::RulePriority => "rule_priority",
1940        ConflictTier::Score => "score",
1941        ConflictTier::SpanLength => "span_length",
1942        ConflictTier::Validator => "validator",
1943        ConflictTier::ValidatorVeto => "validator_veto",
1944        ConflictTier::CollisionPolicy => "collision_policy",
1945        ConflictTier::AnchoredContext => "anchored_context",
1946        ConflictTier::RecognizerId => "recognizer_id",
1947        ConflictTier::Merged => "merged",
1948        ConflictTier::Redact => "redact",
1949        ConflictTier::Resolve => "resolve",
1950        ConflictTier::Fallback => "fallback",
1951    }
1952}
1953
1954impl RedactionEntry {
1955    /// Builds a metadata-only redaction log entry.
1956    #[allow(clippy::too_many_arguments)]
1957    pub fn new(
1958        source: impl Into<String>,
1959        class: PiiClass,
1960        action: Action,
1961        field_name: Option<String>,
1962        document_kind: DocumentKind,
1963        conflict_loser: bool,
1964        decided_by: ConflictTier,
1965        created_at: i64,
1966        session_id: Option<String>,
1967    ) -> Self {
1968        Self {
1969            source: source.into(),
1970            class,
1971            action,
1972            field_name,
1973            document_kind,
1974            conflict_loser,
1975            decided_by,
1976            created_at,
1977            session_id,
1978            recognizer_id: None,
1979            recognizer_version_id: None,
1980            validator_fail_reason: None,
1981            ambiguity_record: None,
1982            collision_family: None,
1983            collision_variant: None,
1984            fallback_triggered: None,
1985            provenance_stage: None,
1986            provenance_model_id: None,
1987            provenance_model_version: None,
1988            provenance_artifact_sha256: None,
1989            provenance_tokenizer_sha256: None,
1990            provenance_locale_resolved: None,
1991            provenance_locale_match_kind: None,
1992            provenance_canonical_class: None,
1993            provenance_native_class: None,
1994            provenance_confidence: None,
1995            provenance_merged_from: None,
1996            backend_silently_dropped: None,
1997        }
1998    }
1999
2000    /// Attaches a validator failure reason to this metadata row.
2001    pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
2002        self.validator_fail_reason = Some(reason);
2003        self
2004    }
2005
2006    /// Attaches an ambiguity record to this metadata row.
2007    pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
2008        self.ambiguity_record = Some(record);
2009        self
2010    }
2011
2012    /// Attaches collision-family metadata to this row.
2013    pub fn with_collision_metadata(
2014        mut self,
2015        family: Option<String>,
2016        variant: Option<String>,
2017    ) -> Self {
2018        self.collision_family = family;
2019        self.collision_variant = variant;
2020        self
2021    }
2022
2023    /// Attaches safety-net fallback metadata to this row.
2024    pub fn with_fallback_triggered(mut self, reason: FallbackReason) -> Self {
2025        self.fallback_triggered = Some(reason);
2026        self
2027    }
2028
2029    /// Attaches locale-aware backend ids dropped by first-match-wins routing.
2030    pub fn with_backend_silently_dropped(mut self, dropped: Vec<String>) -> Self {
2031        self.backend_silently_dropped = Some(dropped);
2032        self
2033    }
2034
2035    /// Attaches recognizer lineage metadata to this row.
2036    pub fn with_recognizer_metadata(
2037        mut self,
2038        recognizer_id: Option<String>,
2039        recognizer_version_id: Option<String>,
2040    ) -> Self {
2041        self.recognizer_id = recognizer_id;
2042        self.recognizer_version_id = recognizer_version_id;
2043        self
2044    }
2045
2046    #[allow(clippy::too_many_arguments)]
2047    pub fn with_provenance_metadata(
2048        mut self,
2049        stage: Option<String>,
2050        model_id: Option<String>,
2051        model_version: Option<String>,
2052        artifact_sha256: Option<String>,
2053        tokenizer_sha256: Option<String>,
2054        locale_resolved: Option<String>,
2055        locale_match_kind: Option<String>,
2056        canonical_class: Option<String>,
2057        native_class: Option<String>,
2058        confidence: Option<f64>,
2059        merged_from: Option<String>,
2060    ) -> Self {
2061        self.provenance_stage = stage;
2062        self.provenance_model_id = model_id;
2063        self.provenance_model_version = model_version;
2064        self.provenance_artifact_sha256 = artifact_sha256;
2065        self.provenance_tokenizer_sha256 = tokenizer_sha256;
2066        self.provenance_locale_resolved = locale_resolved;
2067        self.provenance_locale_match_kind = locale_match_kind;
2068        self.provenance_canonical_class = canonical_class;
2069        self.provenance_native_class = native_class;
2070        self.provenance_confidence = confidence.map(|value| value.to_string());
2071        self.provenance_merged_from = merged_from;
2072        self
2073    }
2074}
2075
/// Closed error set for redaction log sinks.
///
/// Each variant carries the sink's failure description as a plain `String`, which
/// is interpolated into the `Display` message. The enum is `#[non_exhaustive]`, so
/// matches outside this crate need a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// SQLite-backed redaction log sink failed.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Non-SQLite redaction log sink failed.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
2087
/// Trait for audit sinks that receive redaction metadata.
///
/// Implement this for custom audit backends (remote telemetry, structured JSON logs).
/// For SQLite-backed persistence, use `gaze_audit::SqliteLogger`.
///
/// Implementors must be `Send + Sync`, and `log` takes `&self`, so any mutable
/// state inside a logger needs interior mutability (the example below uses an atomic).
///
/// # Contract
///
/// The logger receives **metadata only**: class, action, session ID, timestamp, and
/// other bytes-free audit labels. It never receives the original PII value or the token
/// value. A custom impl that augments entries with raw document text violates the audit
/// isolation contract and will be flagged by the `gaze_module_isolation` Dylint lint
/// when it lives in the wrong crate.
///
/// # Example
///
/// ```rust
/// use std::sync::atomic::{AtomicUsize, Ordering};
/// use gaze_types::{RedactionEntry, RedactionLogError, RedactionLogger};
///
/// #[derive(Default)]
/// struct CountLogger(AtomicUsize);
///
/// impl RedactionLogger for CountLogger {
///     fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
///         self.0.fetch_add(1, Ordering::Relaxed);
///         Ok(())
///     }
/// }
/// ```
pub trait RedactionLogger: Send + Sync {
    /// Records a metadata-only redaction entry.
    ///
    /// # Errors
    ///
    /// Returns a [`RedactionLogError`] when the sink fails to record the entry.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
2121
/// Rulepack recognizer activation tier.
///
/// The TOML spellings are `safe_default`, `locale_gated`, and `opt_in`; see
/// [`SafetyTier::parse`] and [`SafetyTier::as_str`]. `Default` yields
/// [`SafetyTier::SafeDefault`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum SafetyTier {
    /// Activates whenever the recognizer's locale projection intersects the active locale chain.
    #[default]
    SafeDefault,
    /// Activates only when an explicit locale or compatibility alias enables locale-shaped rules.
    LocaleGated,
    /// Activates only through adopter opt-in surfaces such as policy-defined custom recognizers.
    OptIn,
}
2134
/// Safety-tier parsing error.
///
/// Returned by [`SafetyTier::parse`] for any string that is not one of the known
/// tier spellings. The rejected string is available through
/// [`SafetyTierParseError::value`] and is included in the `Display` message.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct SafetyTierParseError {
    /// The rejected input, stored exactly as it was passed to the parser.
    value: String,
}
2141
2142impl SafetyTier {
2143    /// Parses the TOML `safety_tier` string.
2144    pub fn parse(value: &str) -> Result<Self, SafetyTierParseError> {
2145        match value {
2146            "safe_default" => Ok(Self::SafeDefault),
2147            "locale_gated" => Ok(Self::LocaleGated),
2148            "opt_in" => Ok(Self::OptIn),
2149            other => Err(SafetyTierParseError {
2150                value: other.to_string(),
2151            }),
2152        }
2153    }
2154
2155    /// Returns the TOML string for this tier.
2156    pub fn as_str(self) -> &'static str {
2157        match self {
2158            Self::SafeDefault => "safe_default",
2159            Self::LocaleGated => "locale_gated",
2160            Self::OptIn => "opt_in",
2161        }
2162    }
2163}
2164
2165impl SafetyTierParseError {
2166    /// Returns the rejected safety-tier string.
2167    pub fn value(&self) -> &str {
2168        &self.value
2169    }
2170}
2171
2172impl fmt::Display for SafetyTierParseError {
2173    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2174        write!(f, "unsupported safety_tier '{}'", self.value)
2175    }
2176}
2177
2178impl std::error::Error for SafetyTierParseError {}
2179
/// Locale tag recognized by policy and recognizers.
///
/// Build one with [`LocaleTag::parse`], which trims the input, treats `_` like `-`,
/// and matches the built-in tags case-insensitively. [`LocaleTag::as_str`] returns
/// the canonical spelling noted on each variant.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-independent recognizer or policy. Canonical form `global`; `*` is also parsed as this tag.
    Global,
    /// German as used in Germany. Canonical form `de-DE`.
    DeDe,
    /// German as used in Austria. Canonical form `de-AT`.
    DeAt,
    /// German as used in Switzerland. Canonical form `de-CH`.
    DeCh,
    /// English as used in the United States. Canonical form `en-US`.
    EnUs,
    /// English as used in Great Britain. Canonical form `en-GB`.
    EnGb,
    /// English as used in Ireland. Canonical form `en-IE`.
    EnIe,
    /// English as used in Australia. Canonical form `en-AU`.
    EnAu,
    /// English as used in Canada. Canonical form `en-CA`.
    EnCa,
    /// Any other canonical BCP-47-like tag; the stored string is returned as-is by `as_str`.
    Other(String),
}
2205
/// Locale parsing error.
///
/// Returned by [`LocaleTag::parse`] (and therefore by [`LocaleChain::from_cli`])
/// when the input is empty after trimming or is not accepted as a locale tag.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// Locale tag is unsupported or invalid. Displayed as `unsupported locale`.
    Unsupported,
}
2213
2214impl fmt::Display for LocaleError {
2215    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2216        match self {
2217            LocaleError::Unsupported => f.write_str("unsupported locale"),
2218        }
2219    }
2220}
2221
2222impl std::error::Error for LocaleError {}
2223
/// Ordered locale fallback chain.
///
/// Wraps the tags in preference order. The constructors in this file pass the tags
/// through `ensure_global` before storing them, which per [`LocaleChain::from_tags`]
/// appends the global fallback when it is absent.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
2227
2228impl LocaleTag {
2229    /// Global locale constant.
2230    pub const GLOBAL: LocaleTag = LocaleTag::Global;
2231
2232    /// Parses a locale tag from policy or CLI input.
2233    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
2234        let raw = s.trim().replace('_', "-");
2235        let normalized = raw.to_ascii_lowercase();
2236        match normalized.as_str() {
2237            "global" | "*" => Ok(LocaleTag::Global),
2238            "de-de" => Ok(LocaleTag::DeDe),
2239            "de-at" => Ok(LocaleTag::DeAt),
2240            "de-ch" => Ok(LocaleTag::DeCh),
2241            "en-us" => Ok(LocaleTag::EnUs),
2242            "en-gb" => Ok(LocaleTag::EnGb),
2243            "en-ie" => Ok(LocaleTag::EnIe),
2244            "en-au" => Ok(LocaleTag::EnAu),
2245            "en-ca" => Ok(LocaleTag::EnCa),
2246            "" => Err(LocaleError::Unsupported),
2247            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
2248            _ => Err(LocaleError::Unsupported),
2249        }
2250    }
2251
2252    /// Returns the canonical string form of the locale tag.
2253    pub fn as_str(&self) -> &str {
2254        match self {
2255            LocaleTag::Global => "global",
2256            LocaleTag::DeDe => "de-DE",
2257            LocaleTag::DeAt => "de-AT",
2258            LocaleTag::DeCh => "de-CH",
2259            LocaleTag::EnUs => "en-US",
2260            LocaleTag::EnGb => "en-GB",
2261            LocaleTag::EnIe => "en-IE",
2262            LocaleTag::EnAu => "en-AU",
2263            LocaleTag::EnCa => "en-CA",
2264            LocaleTag::Other(tag) => tag.as_str(),
2265        }
2266    }
2267}
2268
2269impl LocaleChain {
2270    /// Builds a locale chain and appends global fallback when absent.
2271    pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
2272        ensure_global(&mut tags);
2273        LocaleChain(tags)
2274    }
2275
2276    /// Parses a comma-separated CLI locale chain.
2277    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
2278        let tags = raw
2279            .split(',')
2280            .map(LocaleTag::parse)
2281            .collect::<Result<Vec<_>, _>>()?;
2282        Ok(LocaleChain::from_tags(tags))
2283    }
2284
2285    /// Merges policy and CLI locale preferences.
2286    pub fn merge_policy_and_cli(
2287        policy: Option<&[LocaleTag]>,
2288        cli: Option<&[LocaleTag]>,
2289    ) -> LocaleChain {
2290        Self::merge_cli_policy_rulepack_default(cli, policy, None)
2291    }
2292
2293    /// Merges CLI, policy, rulepack, and default locale preferences.
2294    pub fn merge_cli_policy_rulepack_default(
2295        cli: Option<&[LocaleTag]>,
2296        policy: Option<&[LocaleTag]>,
2297        rulepack_defaults: Option<&[LocaleTag]>,
2298    ) -> LocaleChain {
2299        let tags = cli
2300            .filter(|tags| !tags.is_empty())
2301            .or_else(|| policy.filter(|tags| !tags.is_empty()))
2302            .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
2303            .map(|tags| tags.to_vec())
2304            .unwrap_or_else(|| vec![LocaleTag::Global]);
2305        LocaleChain::from_tags(tags)
2306    }
2307
2308    /// Returns true when a recognizer can run under this locale chain.
2309    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
2310        if recognizer_locales.is_empty() {
2311            return true;
2312        }
2313        recognizer_locales.iter().any(|recognizer_locale| {
2314            *recognizer_locale == LocaleTag::Global
2315                || self.0.iter().any(|active| active == recognizer_locale)
2316        })
2317    }
2318
2319    /// Returns the locale tags in chain order.
2320    pub fn as_slice(&self) -> &[LocaleTag] {
2321        &self.0
2322    }
2323
2324    /// Returns the locale chain as canonical strings.
2325    pub fn to_strings(&self) -> Vec<String> {
2326        self.0.iter().map(ToString::to_string).collect()
2327    }
2328}
2329
2330impl From<&[LocaleTag]> for LocaleChain {
2331    fn from(tags: &[LocaleTag]) -> Self {
2332        let mut owned = tags.to_vec();
2333        ensure_global(&mut owned);
2334        LocaleChain(owned)
2335    }
2336}
2337
2338impl fmt::Display for LocaleTag {
2339    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2340        f.write_str(self.as_str())
2341    }
2342}
2343
/// The input document submitted for pseudonymization.
///
/// `RawDocument::Text(String)` for plain or semi-structured text (most LLM workflows).
/// `RawDocument::Structured(BTreeMap<String, Value>)` for JSON-shaped data where
/// column-aware rules apply -- `ColumnRule`s only take effect on structured input.
///
/// `Detection::span` and recognizer candidate spans use **byte** ranges, not char indices.
///
/// `RawDocument` is `#[non_exhaustive]`. Match with a wildcard arm.
///
/// Unlike [`CleanDocument`], this type does not derive `Serialize`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Structured document values, keyed by `String` in a sorted `BTreeMap`.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
2361
/// The pseudonymized output from `Pipeline::redact`.
///
/// Mirrors the shape of `RawDocument`: `CleanDocument::Text(String)` or
/// `CleanDocument::Structured(BTreeMap<String, Value>)`. Destructure with a `let`-else
/// or `match`; **there is no `.text()` accessor**.
///
/// ```rust
/// use gaze_types::CleanDocument;
///
/// fn unwrap_text(doc: CleanDocument) -> Option<String> {
///     if let CleanDocument::Text(t) = doc { Some(t) } else { None }
/// }
/// ```
///
/// Contains only tokens or redacted placeholders -- no original PII values.
/// Send this (or its inner string) to the LLM; never send the original `RawDocument`.
///
/// `CleanDocument` is `#[non_exhaustive]`.
///
/// Serialization is `#[serde(untagged)]`: the output is the inner map or string
/// itself, with no wrapper naming the variant.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Structured document values, keyed by `String` in a sorted `BTreeMap`.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
2389
/// Minimal structured value representation that avoids a serde_json dependency.
///
/// Only `Serialize` is derived, and it is `#[serde(untagged)]`, so each variant
/// serializes as its bare payload with no wrapper naming the variant. There is no
/// floating-point variant; the only numeric form is [`Value::I64`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Null value.
    Null,
    /// Boolean value.
    Bool(bool),
    /// String value.
    String(String),
    /// Signed 64-bit integer value.
    I64(i64),
    /// Array value.
    Array(Vec<Value>),
    /// Object value, keyed by `String` in a sorted `BTreeMap`.
    Object(BTreeMap<String, Value>),
}
2408
2409impl Value {
2410    /// Returns the inner string for string values.
2411    pub fn as_str(&self) -> Option<&str> {
2412        match self {
2413            Self::String(value) => Some(value.as_str()),
2414            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
2415        }
2416    }
2417
2418    /// Returns a scalar string representation used for structured safety-net checks.
2419    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
2420        match self {
2421            Self::String(value) if !value.is_empty() => Some(value.clone()),
2422            Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
2423            Self::Bool(value) => Some(value.to_string()),
2424            Self::I64(value) => Some(value.to_string()),
2425        }
2426    }
2427}
2428
2429impl PartialEq<&str> for Value {
2430    fn eq(&self, other: &&str) -> bool {
2431        self.as_str() == Some(*other)
2432    }
2433}
2434
/// Value-only dictionary bundle shared with recognizers.
///
/// The `Default` bundle is empty.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    /// Dictionary entries keyed by dictionary name.
    entries: HashMap<String, DictionaryEntry>,
}
2440
/// Value-only dictionary entry; compiled automatons live outside `gaze-types`.
///
/// Fields are private; build entries with `DictionaryEntry::new`, which rejects
/// empty term lists and non-ASCII terms combined with case-insensitive matching.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    /// Configured terms; never empty for an entry built through `new`.
    terms: Vec<String>,
    /// Whether matching is case-sensitive.
    case_sensitive: bool,
    /// Where this dictionary came from.
    source: DictionarySource,
}
2448
/// Source of a dictionary entry.
///
/// Reported through [`DictionaryStats`] for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Dictionary supplied by request context.
    Cli,
    /// Dictionary supplied by a rulepack; used by `DictionaryBundle::from_rulepack_terms`.
    Rulepack,
}
2458
/// Dictionary metadata used for diagnostics and tests.
///
/// Carries only the name, term count, and source -- never the terms themselves.
/// `#[non_exhaustive]`: construct with `DictionaryStats::new` outside this crate.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of configured terms.
    pub term_count: usize,
    /// Dictionary source.
    pub source: DictionarySource,
}
2470
2471impl DictionaryStats {
2472    /// Builds dictionary diagnostics metadata.
2473    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
2474        Self {
2475            name: name.into(),
2476            term_count,
2477            source,
2478        }
2479    }
2480}
2481
/// Dictionary declared by a rulepack.
///
/// This is the unvalidated declaration; `DictionaryBundle::from_rulepack_terms`
/// turns each one into a validated `DictionaryEntry`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name.
    pub name: String,
    /// Dictionary terms.
    pub terms: Vec<String>,
    /// Whether matching is case-sensitive.
    pub case_sensitive: bool,
}
2493
2494impl RulepackDict {
2495    /// Builds a rulepack dictionary declaration.
2496    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
2497        Self {
2498            name: name.into(),
2499            terms,
2500            case_sensitive,
2501        }
2502    }
2503}
2504
/// Error raised when constructing invalid dictionary entries.
///
/// Returned by `DictionaryEntry::new`. Each variant records the name of the
/// offending dictionary, which appears in the `Display` message.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// Dictionary has no terms.
    Empty {
        /// Name of the dictionary that was rejected.
        name: String,
    },
    /// ASCII-only case-insensitive matching cannot safely cover this entry.
    UnicodeInsensitiveUnsupported {
        /// Name of the dictionary that was rejected.
        name: String,
    },
}
2514
2515impl fmt::Display for DictionaryLoadError {
2516    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2517        match self {
2518            Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
2519            Self::UnicodeInsensitiveUnsupported { name } => write!(
2520                f,
2521                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
2522            ),
2523        }
2524    }
2525}
2526
2527impl std::error::Error for DictionaryLoadError {}
2528
2529impl DictionaryBundle {
2530    /// Builds a bundle from rulepack dictionaries.
2531    pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
2532        let mut entries = HashMap::with_capacity(terms.len());
2533        for dictionary in terms {
2534            let entry = DictionaryEntry::new(
2535                &dictionary.name,
2536                dictionary.terms.clone(),
2537                dictionary.case_sensitive,
2538                DictionarySource::Rulepack,
2539            )
2540            .expect("Policy validates dictionary terms before bundle construction");
2541            entries.insert(dictionary.name.clone(), entry);
2542        }
2543        Self { entries }
2544    }
2545
2546    /// Builds a bundle from pre-built dictionary entries.
2547    pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
2548        Self {
2549            entries: entries.into_iter().collect(),
2550        }
2551    }
2552
2553    /// Merges two bundles, preferring entries from the second bundle on name conflicts.
2554    pub fn merge(a: Self, b: Self) -> Self {
2555        let mut entries = a.entries;
2556        entries.extend(b.entries);
2557        Self { entries }
2558    }
2559
2560    /// Returns a dictionary by name.
2561    pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
2562        self.entries.get(name)
2563    }
2564
2565    /// Returns sorted dictionary stats.
2566    pub fn stats(&self) -> Vec<DictionaryStats> {
2567        let mut stats = self
2568            .entries
2569            .iter()
2570            .map(|(name, entry)| DictionaryStats {
2571                name: name.clone(),
2572                term_count: entry.terms.len(),
2573                source: entry.source,
2574            })
2575            .collect::<Vec<_>>();
2576        stats.sort_by(|a, b| a.name.cmp(&b.name));
2577        stats
2578    }
2579}
2580
2581impl DictionaryEntry {
2582    /// Creates a validated value-only dictionary entry.
2583    pub fn new(
2584        name: &str,
2585        terms: Vec<String>,
2586        case_sensitive: bool,
2587        source: DictionarySource,
2588    ) -> Result<Self, DictionaryLoadError> {
2589        if terms.is_empty() {
2590            return Err(DictionaryLoadError::Empty {
2591                name: name.to_string(),
2592            });
2593        }
2594        if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
2595            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
2596                name: name.to_string(),
2597            });
2598        }
2599        Ok(Self {
2600            terms,
2601            case_sensitive,
2602            source,
2603        })
2604    }
2605
2606    /// Returns whether matching is case-sensitive.
2607    pub fn case_sensitive(&self) -> bool {
2608        self.case_sensitive
2609    }
2610
2611    /// Returns configured dictionary terms.
2612    pub fn terms(&self) -> &[String] {
2613        &self.terms
2614    }
2615}
2616
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    /// A dictionary with no terms is rejected, and the error names the dictionary.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let no_terms: Vec<String> = Vec::new();
        let outcome = DictionaryEntry::new("empty", no_terms, true, DictionarySource::Cli);
        let err = outcome.expect_err("empty dictionaries must fail closed");

        let rejected_as_empty =
            matches!(&err, DictionaryLoadError::Empty { name } if name.as_str() == "empty");
        assert!(rejected_as_empty);
    }

    /// A case-insensitive dictionary containing a non-ASCII term is rejected,
    /// and the error names the dictionary.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms: Vec<String> = ["Beyonce", "Caf\u{00e9}"]
            .iter()
            .map(|term| term.to_string())
            .collect();
        let outcome = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli);
        let err = outcome.expect_err("unicode case-insensitive dictionaries must fail closed");

        let rejected_as_unicode = matches!(
            &err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name.as_str() == "songs"
        );
        assert!(rejected_as_unicode);
    }
}
2645
// Tests for the redaction-log surface: `RedactionLogError` display text, the
// `RedactionLogger` trait-object bounds, the serialized shape of
// `RedactionEntry`, and recognizer lineage on `Candidate`.
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    // Minimal logger used to prove the trait can be implemented locally.
    // Despite the name it stores nothing: every entry is accepted and ignored.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    // Compile-time check: instantiating this with `T` only type-checks when
    // `T: Send + Sync`. `?Sized` lets it be used with trait objects.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    // Pins the exact `Display` text of the `Sqlite` and `Backend` variants.
    #[test]
    fn redaction_log_error_display_is_stable() {
        assert_eq!(
            RedactionLogError::Sqlite("write failed".to_string()).to_string(),
            "sqlite redaction log error: write failed"
        );
        assert_eq!(
            RedactionLogError::Backend("sink failed".to_string()).to_string(),
            "backend redaction log error: sink failed"
        );
    }

    // Fails to compile (rather than at runtime) if `dyn RedactionLogger`
    // stops being `Send + Sync`.
    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    // Logs through a `&dyn RedactionLogger` to confirm dynamic dispatch works
    // for a logger defined outside the trait's own module.
    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        // Every field is spelled out (no `..` update syntax), so adding a field
        // to `RedactionEntry` forces this literal to be revisited.
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            recognizer_id: None,
            recognizer_version_id: None,
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
            fallback_triggered: None,
            provenance_stage: None,
            provenance_model_id: None,
            provenance_model_version: None,
            provenance_artifact_sha256: None,
            provenance_tokenizer_sha256: None,
            provenance_locale_resolved: None,
            provenance_locale_match_kind: None,
            provenance_canonical_class: None,
            provenance_native_class: None,
            provenance_confidence: None,
            provenance_merged_from: None,
            backend_silently_dropped: None,
        };

        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }

    // Pins the exact JSON for an entry built by `RedactionEntry::new` with no
    // recognizer metadata attached. The expected string contains no
    // `recognizer_id`, `recognizer_version_id`, `provenance_*`, or
    // `backend_silently_dropped` keys, while the other optional fields appear
    // as explicit `null`s.
    #[test]
    fn redaction_entry_json_shape_omits_absent_recognizer_lineage() {
        let entry = RedactionEntry::new(
            "email.global",
            PiiClass::Email,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        );

        let rendered = serde_json::to_string(&entry).expect("serialize redaction entry");

        assert_eq!(
            rendered,
            r#"{"source":"email.global","class":"email","action":"tokenize","field_name":null,"document_kind":"text","conflict_loser":false,"decided_by":"none","created_at":0,"session_id":null,"validator_fail_reason":null,"ambiguity_record":null,"collision_family":null,"collision_variant":null,"fallback_triggered":null}"#
        );
    }

    // Counterpart to the test above: once `with_recognizer_metadata` supplies
    // both ids, they must show up as JSON keys with the supplied values.
    #[test]
    fn redaction_entry_json_shape_includes_recognizer_lineage_when_present() {
        let entry = RedactionEntry::new(
            "ner/ort",
            PiiClass::Name,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        )
        .with_recognizer_metadata(
            Some("ner".to_string()),
            Some("ner.davlan-mbert.v1".to_string()),
        );

        // Compared via `serde_json::Value` so the assertion does not depend on
        // key order or on the other fields.
        let value: serde_json::Value =
            serde_json::to_value(&entry).expect("serialize redaction entry");

        assert_eq!(value["recognizer_id"], "ner");
        assert_eq!(value["recognizer_version_id"], "ner.davlan-mbert.v1");
    }

    // `Candidate::new` leaves `recognizer_version_id` unset; attaching a
    // version must not change the plain `recognizer_id`.
    #[test]
    fn candidate_keeps_versioned_and_unversioned_recognizer_ids() {
        let unversioned = Candidate::new(
            0..5,
            PiiClass::Email,
            "email.global",
            0.9,
            10,
            None,
            "email",
            "email.global",
            ConflictTier::None,
            Vec::new(),
        );
        assert_eq!(unversioned.recognizer_id, "email.global");
        assert_eq!(unversioned.recognizer_version_id, None);

        let versioned = unversioned
            .clone()
            .with_recognizer_version_id("email.global.v1");
        assert_eq!(versioned.recognizer_id, "email.global");
        assert_eq!(
            versioned.recognizer_version_id.as_deref(),
            Some("email.global.v1")
        );
    }
}
2789
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    /// Builds an emitted token span whose clean and raw ranges are both `start..end`.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        let range = start..end;
        EmittedTokenSpan {
            clean_span: range.clone(),
            raw_span: range,
            class,
        }
    }

    /// Builds a manifest of `Email` spans from `(start, end)` pairs.
    fn email_manifest(bounds: &[(usize, usize)]) -> Manifest {
        let spans = bounds
            .iter()
            .map(|&(start, end)| span(start, end, PiiClass::Email))
            .collect::<Vec<_>>();
        Manifest::from_spans(spans)
    }

    /// Lets tests hand the suspect range and class over by value.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let covered = email_manifest(&[(0, 8)]);

        let outcome = diff(covered, 0..8, PiiClass::Email);

        assert_eq!(outcome, None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let elsewhere = email_manifest(&[(20, 30)]);

        let outcome = diff(elsewhere, 0..10, PiiClass::Email);

        assert_eq!(outcome, Some(LeakKind::Uncovered));
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let gapped = email_manifest(&[(0, 5), (10, 15)]);

        let outcome = diff(gapped, 0..15, PiiClass::Email);

        assert_eq!(outcome, Some(LeakKind::PartialBleed { uncovered: 5..10 }));
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let two_gaps = email_manifest(&[(0, 3), (5, 7), (9, 12)]);

        let outcome = diff(two_gaps, 0..12, PiiClass::Email);

        // Reporting only the first gap is intentional for v0.6.1; enumerating
        // every gap is deferred until the report format can carry it.
        assert_eq!(outcome, Some(LeakKind::PartialBleed { uncovered: 3..5 }));
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let mixed = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);
        let expected = LeakKind::ClassMismatch {
            pipeline_class: PiiClass::Name,
            safety_net_class: PiiClass::Email,
        };

        let outcome = diff(mixed, 0..8, PiiClass::Email);

        assert_eq!(outcome, Some(expected));
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let touching = email_manifest(&[(0, 5), (5, 10)]);

        let outcome = diff(touching, 0..10, PiiClass::Email);

        assert_eq!(outcome, None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let single = email_manifest(&[(3, 8)]);

        // Suspect begins before the token: the leading bytes are uncovered.
        let leading = diff(single.clone(), 0..8, PiiClass::Email);
        assert_eq!(leading, Some(LeakKind::PartialBleed { uncovered: 0..3 }));

        // Suspect runs past the token: the trailing bytes are uncovered.
        let trailing = diff(single, 3..10, PiiClass::Email);
        assert_eq!(trailing, Some(LeakKind::PartialBleed { uncovered: 8..10 }));

        // Two tokens with a hole between them: the hole is uncovered.
        let with_gap = email_manifest(&[(0, 3), (6, 10)]);
        let middle = diff(with_gap, 0..10, PiiClass::Email);
        assert_eq!(middle, Some(LeakKind::PartialBleed { uncovered: 3..6 }));
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");

        let token_end = text.len();
        let manifest = Manifest::from_spans(vec![span(token_start, token_end, PiiClass::Email)]);

        let outcome = diff(manifest, token_start..token_end, PiiClass::Email);

        assert_eq!(outcome, None);
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let empty_manifest = Manifest::default();

        let outcome = diff(empty_manifest, 3..3, PiiClass::Email);

        assert_eq!(outcome, None);
    }

    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let errors = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            },
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            },
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            },
            SafetyNetError::ModelIntegrityMismatch {
                expected: "e3b0c44298fc1c149afbf4c8996fb924".to_string(),
                actual: "4e07408562bedb8b60ce05c1decfe3ad".to_string(),
            },
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            },
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            },
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            },
        ];

        // Render everything first, then check that no message carries the sample address.
        let rendered_messages: Vec<String> = errors.iter().map(|error| error.to_string()).collect();
        for rendered in &rendered_messages {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
2961
/// Shared recognizer contract for locale-aware PII candidates.
///
/// Implementors must be `Send + Sync`. Only [`id`](Recognizer::id),
/// [`supported_class`](Recognizer::supported_class),
/// [`detect`](Recognizer::detect), and
/// [`token_family`](Recognizer::token_family) are required;
/// [`validator_kind`](Recognizer::validator_kind) and
/// [`locales`](Recognizer::locales) have default implementations.
pub trait Recognizer: Send + Sync {
    /// Stable recognizer identifier.
    fn id(&self) -> &str;
    /// PII class supported by this recognizer.
    fn supported_class(&self) -> &PiiClass;
    /// Detects PII candidates in the supplied input and context.
    ///
    /// Returns zero or more [`Candidate`]s. `ctx` is borrowed immutably.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family used for candidate token emission.
    fn token_family(&self) -> &str;
    /// Optional validator kind used by pre-resolver validator-veto.
    ///
    /// The default implementation returns `None`.
    fn validator_kind(&self) -> Option<ValidatorKind> {
        None
    }
    /// Locales where this recognizer is active.
    ///
    /// The default implementation returns a one-element slice containing
    /// [`LocaleTag::Global`].
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
2981
/// Candidate PII span emitted by a recognizer before final conflict resolution.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot build
/// it with a struct literal; use [`Candidate::new`] instead. Only `PartialEq`
/// is derived (not `Eq`) because `score` is an `f32`.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Byte span in the original input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Recognizer identifier.
    pub recognizer_id: String,
    /// Optional versioned recognizer identifier for audit lineage.
    ///
    /// [`Candidate::new`] leaves this as `None`; set it with
    /// [`Candidate::with_recognizer_version_id`].
    pub recognizer_version_id: Option<String>,
    /// Recognizer confidence score.
    pub score: f32,
    /// Rule or recognizer priority.
    pub priority: i32,
    /// Optional canonical representation for validation/merge logic.
    pub canonical_form: Option<String>,
    /// Token family used for output token shape.
    pub token_family: String,
    /// Candidate source label.
    pub source: String,
    /// Conflict tier that decided this candidate.
    pub decided_by: ConflictTier,
    /// Sources merged into this candidate.
    pub merged_sources: Vec<String>,
}
3009
3010impl Candidate {
3011    /// Builds a recognizer candidate.
3012    #[allow(clippy::too_many_arguments)]
3013    pub fn new(
3014        span: Range<usize>,
3015        class: PiiClass,
3016        recognizer_id: impl Into<String>,
3017        score: f32,
3018        priority: i32,
3019        canonical_form: Option<String>,
3020        token_family: impl Into<String>,
3021        source: impl Into<String>,
3022        decided_by: ConflictTier,
3023        merged_sources: Vec<String>,
3024    ) -> Self {
3025        Self {
3026            span,
3027            class,
3028            recognizer_id: recognizer_id.into(),
3029            recognizer_version_id: None,
3030            score,
3031            priority,
3032            canonical_form,
3033            token_family: token_family.into(),
3034            source: source.into(),
3035            decided_by,
3036            merged_sources,
3037        }
3038    }
3039
3040    /// Returns this candidate with a translated span.
3041    pub fn with_span(mut self, span: Range<usize>) -> Self {
3042        self.span = span;
3043        self
3044    }
3045
3046    /// Returns this candidate with versioned recognizer lineage attached.
3047    pub fn with_recognizer_version_id(mut self, recognizer_version_id: impl Into<String>) -> Self {
3048        self.recognizer_version_id = Some(recognizer_version_id.into());
3049        self
3050    }
3051}
3052
/// Context supplied to recognizers during detection.
///
/// All inputs are borrowed for `'a`. The struct is `#[non_exhaustive]`, so
/// code outside this crate cannot build it with a struct literal; use
/// [`DetectContext::new`] instead.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Active locale chain.
    pub locale_chain: &'a [LocaleTag],
    /// Active dictionary bundle.
    pub dictionaries: &'a DictionaryBundle,
    /// Reserved field-aware matching slot; intentionally unit in v0.5 Phase B.
    pub fields: &'a (),
    /// Whether a recognizer degraded due to unavailable optional capability.
    ///
    /// Stored in a `Cell` so the flag can be set through the shared
    /// `&DetectContext` that [`Recognizer::detect`] receives. As a consequence
    /// of holding a `Cell`, `DetectContext` is not `Sync`.
    pub degraded: Cell<bool>,
}
3065
3066impl<'a> DetectContext<'a> {
3067    /// Builds detection context for a recognizer pass.
3068    pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
3069        Self {
3070            locale_chain,
3071            dictionaries,
3072            fields: &(),
3073            degraded: Cell::new(false),
3074        }
3075    }
3076}
3077
3078fn ensure_global(tags: &mut Vec<LocaleTag>) {
3079    if !tags.contains(&LocaleTag::Global) {
3080        tags.push(LocaleTag::Global);
3081    }
3082}
3083
/// Loose structural check for a BCP 47-style language tag.
///
/// The tag is split on `-`. The first subtag must be 2 to 8 ASCII letters and
/// every following subtag must be 2 to 8 ASCII letters or digits. Subtags are
/// not checked against any registry, and one-character subtags (such as
/// extension singletons) are rejected.
fn is_bcp47_parseable(raw: &str) -> bool {
    /// Subtag length rule shared by the language and the trailing subtags.
    fn has_subtag_len(subtag: &str) -> bool {
        matches!(subtag.len(), 2..=8)
    }

    let mut subtags = raw.split('-');
    match subtags.next() {
        Some(language)
            if has_subtag_len(language)
                && language.bytes().all(|byte| byte.is_ascii_alphabetic()) =>
        {
            subtags.all(|subtag| {
                has_subtag_len(subtag) && subtag.bytes().all(|byte| byte.is_ascii_alphanumeric())
            })
        }
        _ => false,
    }
}
3096
/// Normalizes the ASCII casing of a hyphen-separated locale tag.
///
/// The first subtag is lowercased. Each later subtag is uppercased when it is
/// exactly two ASCII letters (region-like, e.g. `us` becomes `US`) and
/// lowercased otherwise. Non-ASCII characters are left untouched, and the
/// subtags are re-joined with `-`.
fn canonical_other(raw: &str) -> String {
    // ASCII case mapping never changes byte length, so this capacity is exact.
    let mut canonical = String::with_capacity(raw.len());
    for (position, subtag) in raw.split('-').enumerate() {
        if position > 0 {
            canonical.push('-');
        }
        let region_like = position > 0
            && subtag.len() == 2
            && subtag.bytes().all(|byte| byte.is_ascii_alphabetic());
        let recased = if region_like {
            subtag.to_ascii_uppercase()
        } else {
            subtag.to_ascii_lowercase()
        };
        canonical.push_str(&recased);
    }
    canonical
}