Skip to main content

gaze_types/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3use std::cell::Cell;
4use std::collections::{BTreeMap, HashMap};
5use std::fmt;
6use std::ops::Range;
7
8use serde::{Deserialize, Serialize};
9use sha3::{Digest, Keccak256};
10use thiserror::Error;
11
/// Shared detector contract for text-only PII detection.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
pub trait Detector: Send + Sync {
    /// Detect PII spans in the supplied input string.
    fn detect(&self, input: &str) -> Vec<Detection>;

    /// Fallible detection entrypoint for detectors backed by runtime systems.
    ///
    /// The default implementation forwards to [`Detector::detect`] and wraps the
    /// result in `Ok`, so it never yields a [`RecognizerRuntimeError`] unless an
    /// implementor overrides it.
    fn try_detect(&self, input: &str) -> Result<Vec<Detection>, RecognizerRuntimeError> {
        Ok(self.detect(input))
    }
}
22
/// Runtime failure raised by a recognizer or detector backend during detection.
///
/// Both fields are included in the `Display` output of this error.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RecognizerRuntimeError {
    /// Identifier of the recognizer that failed.
    pub recognizer_id: String,
    /// Free-form description of the failure.
    pub message: String,
}
30
31impl RecognizerRuntimeError {
32    pub fn new(recognizer_id: impl Into<String>, message: impl Into<String>) -> Self {
33        Self {
34            recognizer_id: recognizer_id.into(),
35            message: message.into(),
36        }
37    }
38}
39
40impl fmt::Display for RecognizerRuntimeError {
41    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42        write!(
43            f,
44            "recognizer '{}' backend failed: {}",
45            self.recognizer_id, self.message
46        )
47    }
48}
49
// Marker impl: every `Error` method keeps its default, so no `source` is exposed.
impl std::error::Error for RecognizerRuntimeError {}
51
/// The category of a detected PII span.
///
/// Built-in variants: `Email`, `Name`, `Location`, `Organization`. Tenant-specific PII
/// (case references, titles, internal codes) is carried as `PiiClass::Custom(String)`.
/// **There is no `Phone` variant** -- phone detection is provided by recognizers in
/// `gaze-recognizers` and surfaces as either a `Custom("phone")` class or a class
/// defined by a rulepack.
///
/// `PiiClass` is exhaustive. Match every variant explicitly so new built-in classes
/// force call sites to review their handling at compile time:
///
/// ```rust
/// use gaze_types::PiiClass;
///
/// fn label(class: &PiiClass) -> &'static str {
///     match class {
///         PiiClass::Email        => "email",
///         PiiClass::Name         => "name",
///         PiiClass::Location     => "location",
///         PiiClass::Organization => "org",
///         PiiClass::Custom(_)    => "pii",
///     }
/// }
/// ```
///
/// Policy TOML uses the lowercase forms `email` / `name` / `location` / `organization`,
/// and tenant classes are spelled like `custom:case_ref` (lowercase, snake_case).
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so built-in variants
/// sort before `Custom`, and `Custom` values compare by their inner string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Email address class.
    Email,
    /// Person name class.
    Name,
    /// Location class.
    Location,
    /// Organization class.
    Organization,
    /// Tenant- or policy-defined class.
    ///
    /// Use [`PiiClass::custom`] to build a value with a normalized name.
    Custom(String),
}
92
/// Built-in class labels in stable display order.
///
/// `PiiClass::class_name` indexes this slice positionally (`Email` = 0, `Name` = 1,
/// `Location` = 2, `Organization` = 3), so entries must not be reordered or removed.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
95
/// Family names reserved for bundled collision-policy rulepacks.
///
/// Adopter policy-level custom recognizers cannot claim these names because bundled
/// families are part of the core disambiguation contract.
///
/// Every entry is a lowercase, hyphen-separated identifier.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
114
/// Restore-phase bit flag for the manifest lookup phase (bit 0).
///
/// NOTE(review): the `RESTORE_PHASE_*` flags are presumably OR-ed into
/// `RestoreTelemetry::phase_execution_mask` — confirm against the restore pipeline.
pub const RESTORE_PHASE_MANIFEST_LOOKUP: u32 = 1 << 0;
/// Restore-phase bit flag for the unknown-token scan phase (bit 1).
pub const RESTORE_PHASE_UNKNOWN_TOKEN_SCAN: u32 = 1 << 1;
/// Restore-phase bit flag for the manifest-bypass scan phase (bit 2).
pub const RESTORE_PHASE_MANIFEST_BYPASS_SCAN: u32 = 1 << 2;
/// Restore-phase bit flag for the fresh-PII scan phase (bit 3).
pub const RESTORE_PHASE_FRESH_PII_SCAN: u32 = 1 << 3;
119
/// Serializable wrapper around a single owned text string.
///
/// NOTE(review): presumably the output of a restore pass — confirm with callers.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct RestoredText {
    /// The wrapped text.
    pub text: String,
}
125
126impl RestoredText {
127    pub fn new(text: impl Into<String>) -> Self {
128        Self { text: text.into() }
129    }
130}
131
/// Restore policy selector.
///
/// Serialized by serde in snake_case (`"strict"` / `"lenient"`), the same
/// labels returned by `RestorePolicy::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum RestorePolicy {
    /// Strict policy. NOTE(review): exact enforcement semantics live outside this file.
    Strict,
    /// Lenient policy. NOTE(review): exact enforcement semantics live outside this file.
    Lenient,
}
139
140impl RestorePolicy {
141    pub fn as_str(self) -> &'static str {
142        match self {
143            Self::Strict => "strict",
144            Self::Lenient => "lenient",
145        }
146    }
147}
148
/// Outcome label for a restore operation.
///
/// Serialized by serde in snake_case (`"success"` / `"partial"` / `"failed"`),
/// the same labels returned by `RestoreDecision::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum RestoreDecision {
    /// Restore succeeded; this is the initial value set by `RestoreTelemetry::new`.
    Success,
    /// Restore partially succeeded.
    Partial,
    /// Restore failed.
    Failed,
}
157
158impl RestoreDecision {
159    pub fn as_str(self) -> &'static str {
160        match self {
161            Self::Success => "success",
162            Self::Partial => "partial",
163            Self::Failed => "failed",
164        }
165    }
166}
167
/// Counters and outcome metadata recorded for one restore operation.
///
/// `RestoreTelemetry::new` starts every counter and the phase mask at zero and
/// the decision at `RestoreDecision::Success`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct RestoreTelemetry {
    /// Unknown-token counter.
    pub unknown_token_count: u64,
    /// Manifest-bypass counter.
    pub manifest_bypass_count: u64,
    /// Fresh-PII detection counter.
    pub fresh_pii_detected_count: u64,
    /// Policy the restore ran under.
    pub restore_policy: RestorePolicy,
    /// Outcome of the restore.
    pub restore_decision: RestoreDecision,
    /// Bit mask of executed phases.
    ///
    /// NOTE(review): presumably built from the `RESTORE_PHASE_*` flags — confirm.
    pub phase_execution_mask: u32,
}
178
179impl RestoreTelemetry {
180    pub fn new(restore_policy: RestorePolicy) -> Self {
181        Self {
182            unknown_token_count: 0,
183            manifest_bypass_count: 0,
184            fresh_pii_detected_count: 0,
185            restore_policy,
186            restore_decision: RestoreDecision::Success,
187            phase_execution_mask: 0,
188        }
189    }
190
191    pub fn restore_policy_str(&self) -> &'static str {
192        self.restore_policy.as_str()
193    }
194
195    pub fn restore_decision_str(&self) -> &'static str {
196        self.restore_decision.as_str()
197    }
198}
199
/// Collision-family membership metadata for one recognizer.
///
/// This type derives only `Debug`, `Clone`, `PartialEq` and `Eq`; it has no
/// serde implementation in this file.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Cross-class family name.
    pub family: String,
    /// Variant name within the family.
    pub variant: String,
    /// Lower values win when two variants in the same family overlap.
    pub precedence: u32,
    /// Optional anchor variant required by later ambiguity handling.
    pub mandatory_anchor: Option<String>,
}
213
214impl CollisionMembership {
215    /// Builds collision-family membership metadata.
216    pub fn new(
217        family: impl Into<String>,
218        variant: impl Into<String>,
219        precedence: u32,
220        mandatory_anchor: Option<String>,
221    ) -> Self {
222        Self {
223            family: family.into(),
224            variant: variant.into(),
225            precedence,
226            mandatory_anchor,
227        }
228    }
229}
230
231impl PiiClass {
232    /// Parses a policy class name into the shared class vocabulary.
233    pub fn from_policy_name(input: &str) -> Option<Self> {
234        match input {
235            "email" => Some(Self::Email),
236            "name" => Some(Self::Name),
237            "location" => Some(Self::Location),
238            "organization" => Some(Self::Organization),
239            custom if custom.starts_with("custom:") => {
240                let name = custom.trim_start_matches("custom:");
241                (!name.trim().is_empty()).then(|| Self::custom(name))
242            }
243            _ => None,
244        }
245    }
246
247    /// Returns the built-in class variants.
248    pub fn builtin_variants() -> &'static [PiiClass] {
249        &[
250            PiiClass::Email,
251            PiiClass::Name,
252            PiiClass::Location,
253            PiiClass::Organization,
254        ]
255    }
256
257    /// Builds a normalized custom class name.
258    pub fn custom(name: &str) -> Self {
259        let mut normalized = String::new();
260        let mut pending_underscore = false;
261        for ch in name.trim().chars() {
262            if ch.is_ascii_alphanumeric() {
263                if pending_underscore && !normalized.is_empty() {
264                    normalized.push('_');
265                }
266                normalized.push(ch.to_ascii_lowercase());
267                pending_underscore = false;
268            } else {
269                pending_underscore = true;
270            }
271        }
272
273        Self::Custom(normalized)
274    }
275
276    /// Returns the normalized custom class name for custom classes.
277    pub fn as_custom_name(&self) -> Option<&str> {
278        match self {
279            Self::Custom(name) => Some(name.as_str()),
280            Self::Email | Self::Name | Self::Location | Self::Organization => None,
281        }
282    }
283
284    /// Returns the audit/token display label for this class.
285    pub fn class_name(&self) -> String {
286        match self {
287            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
288            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
289            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
290            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
291            Self::Custom(name) => format!("Custom:{name}"),
292        }
293    }
294
295    /// Returns the canonical audit/serde label for this class.
296    pub fn to_canonical_str(&self) -> String {
297        match self {
298            Self::Email => "email".to_string(),
299            Self::Name => "name".to_string(),
300            Self::Location => "location".to_string(),
301            Self::Organization => "organization".to_string(),
302            Self::Custom(name) => format!("custom:{name}"),
303        }
304    }
305
306    /// Parses the canonical audit/serde label for a PII class.
307    pub fn from_canonical_str(value: &str) -> Option<Self> {
308        match value {
309            "email" | "Email" => Some(Self::Email),
310            "name" | "Name" => Some(Self::Name),
311            "location" | "Location" => Some(Self::Location),
312            "organization" | "Organization" => Some(Self::Organization),
313            custom if custom.starts_with("custom:") => {
314                let name = &custom["custom:".len()..];
315                (!name.is_empty()).then(|| Self::Custom(name.to_string()))
316            }
317            _ => None,
318        }
319    }
320}
321
/// Audit-canonical form of [`PiiClass`].
///
/// Serializes as `"email"`, `"name"`, `"custom:foo"`, and similar canonical
/// strings. Use this wrapper for audit-row JSON only. Session snapshots use
/// bare [`PiiClass`] serde so their byte shape stays stable.
///
/// The string form is produced by `PiiClass::to_canonical_str` and parsed back
/// with `PiiClass::from_canonical_str`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct PiiClassAudit(pub PiiClass);
330
331impl PiiClassAudit {
332    /// Builds an audit-canonical class wrapper.
333    pub fn new(class: PiiClass) -> Self {
334        Self(class)
335    }
336
337    /// Unwraps the underlying class.
338    pub fn into_inner(self) -> PiiClass {
339        self.0
340    }
341}
342
343impl Serialize for PiiClassAudit {
344    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
345    where
346        S: serde::Serializer,
347    {
348        serializer.serialize_str(&self.0.to_canonical_str())
349    }
350}
351
352impl<'de> Deserialize<'de> for PiiClassAudit {
353    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
354    where
355        D: serde::Deserializer<'de>,
356    {
357        let value = String::deserialize(deserializer)?;
358        PiiClass::from_canonical_str(&value)
359            .map(Self)
360            .ok_or_else(|| {
361                serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
362            })
363    }
364}
365
366mod pii_class_audit_serde {
367    use super::{PiiClass, PiiClassAudit};
368    use serde::{Deserialize, Deserializer, Serialize, Serializer};
369
370    pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
371    where
372        S: Serializer,
373    {
374        PiiClassAudit::new(class.clone()).serialize(serializer)
375    }
376
377    pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
378    where
379        D: Deserializer<'de>,
380    {
381        Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
382    }
383}
384
/// A candidate recognizer/class pair that lost ambiguity resolution.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LosingCandidate {
    /// PII class proposed by the losing recognizer.
    ///
    /// Serialized in canonical string form via the `pii_class_audit_serde` adapter.
    #[serde(with = "pii_class_audit_serde")]
    pub class: PiiClass,
    /// Stable recognizer identifier for traceability.
    pub recognizer_id: String,
}
395
396impl LosingCandidate {
397    /// Builds a losing ambiguity candidate.
398    pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
399        Self {
400            class,
401            recognizer_id: recognizer_id.into(),
402        }
403    }
404}
405
/// Structured metadata describing an ambiguity outcome.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct AmbiguityRecord {
    /// The family-level class assigned when disambiguation failed.
    ///
    /// Serialized in canonical string form via the `pii_class_audit_serde` adapter.
    #[serde(with = "pii_class_audit_serde")]
    pub ambiguity_class: PiiClass,
    /// Variants that could not be disambiguated.
    ///
    /// Producers must keep this list stable by sorting `recognizer_id` ascending.
    /// `AmbiguityRecord::new` stores the list as given and does not sort it.
    pub losing_candidates: Vec<LosingCandidate>,
    /// Why disambiguation failed.
    pub reason: AmbiguityReason,
}
420
421impl AmbiguityRecord {
422    /// Builds a structured ambiguity record.
423    pub fn new(
424        ambiguity_class: PiiClass,
425        losing_candidates: Vec<LosingCandidate>,
426        reason: AmbiguityReason,
427    ) -> Self {
428        Self {
429            ambiguity_class,
430            losing_candidates,
431            reason,
432        }
433    }
434}
435
/// Closed set of ambiguity outcomes recorded by the audit side-channel.
///
/// Serialized by serde in snake_case (for example `no_anchor`, `precedence_tie`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// Span matched a multi-recognizer family and no anchor cue resolved it.
    NoAnchor,
    /// Multiple validator-stage recognizers remained viable for the same span.
    ValidatorIndeterminate,
    /// Span matched recognizers across two or more distinct PII class families.
    MultiFamilyMatch,
    /// Multiple variants had the same precedence and no discriminator resolved them.
    PrecedenceTie,
}
450
/// Closed validator failure reasons recorded by audit metadata.
///
/// Serialized by serde in snake_case. Variants carrying a `#[serde(alias = ...)]`
/// additionally accept that alternate spelling when deserializing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    /// Luhn checksum validation failed.
    LuhnFailed,
    /// IBAN MOD-97 validation failed.
    IbanMod97Failed,
    /// Email RFC-style validation failed.
    ///
    /// Also deserializes from `email_rfc_failed`.
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    /// E.164 phone validation failed.
    ///
    /// Also deserializes from `e164_phone_failed`.
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    /// National phone parser accepted the number but region validation failed.
    PhoneNationalRegionMismatch,
    /// IPv4 parser rejected the candidate.
    Ipv4ParseFailed,
    /// IPv6 parser rejected the candidate.
    Ipv6ParseFailed,
    /// EIP-55 Ethereum checksum validation failed.
    EthEip55ChecksumFailed,
    /// Aadhaar Verhoeff checksum validation failed.
    AadhaarVerhoeffFailed,
    /// French NIR MOD-97 key validation failed.
    FrNirMod97Failed,
    /// German Steuer-ID MOD 11,10 checksum validation failed.
    DeSteuerIdMod1110Failed,
    /// Dutch BSN MOD-11 checksum validation failed.
    BsnMod11Failed,
    /// Brazilian CPF MOD-11 checksum validation failed.
    CpfMod11Failed,
    /// Brazilian CNPJ MOD-11 checksum validation failed.
    CnpjMod11Failed,
    /// UK NHS number MOD-11 checksum validation failed.
    UkNhsMod11Failed,
}
489
/// Typed validator outcome used by the pre-resolver validator-veto phase.
///
/// Serialized by serde with snake_case variant names.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// Candidate passed validation; canonical form may be supplied by the validator.
    ///
    /// `ValidatorKind::validate` always fills `canonical_form` with `Some` on a pass.
    Pass { canonical_form: Option<String> },
    /// Candidate failed validation with a closed, auditable reason.
    Fail { reason: ValidatorFailReason },
    /// Recognizer has no validator for this candidate.
    ///
    /// `ValidatorKind::validate` never returns this variant itself.
    NotApplicable,
}
502
/// Error returned when a rulepack names a validator unsupported by this build.
///
/// Produced by `ValidatorKind::parse`; displays as `unsupported validator: <kind>`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    /// Validator kind is not known or is gated behind a disabled feature.
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        /// Unsupported validator kind.
        kind: String,
    },
}
514
/// Closed set of validator implementations used by validator-backed recognizers.
///
/// Policy strings are mapped onto these variants by `ValidatorKind::parse`. The
/// phone variants exist only when the `phone-parser` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    /// Basic email shape validator.
    EmailRfc,
    /// Parser-backed E.164 phone validator.
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// Parser-backed national phone validator for a fixed region.
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    /// Luhn checksum validator.
    Luhn,
    /// IBAN MOD-97 validator.
    IbanMod97,
    /// Strict decimal dotted-quad IPv4 parser.
    Ipv4Parse,
    /// RFC 4291 / RFC 5952 IPv6 textual parser.
    Ipv6Parse,
    /// EIP-55 Ethereum address checksum validator.
    EthEip55,
    /// Indian Aadhaar Verhoeff checksum validator.
    AadhaarVerhoeff,
    /// French NIR MOD-97 key validator.
    FrNirMod97,
    /// German Steuer-ID MOD 11,10 checksum validator.
    DeSteuerIdMod1110,
    /// Dutch BSN MOD-11 checksum validator.
    BsnMod11,
    /// Brazilian CPF MOD-11 checksum validator.
    CpfMod11,
    /// Brazilian CNPJ MOD-11 checksum validator.
    CnpjMod11,
    /// UK NHS number MOD-11 checksum validator.
    UkNhsMod11,
}
552
/// Regions supported by national phone validators.
///
/// Only compiled when the `phone-parser` feature is enabled.
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    /// Germany.
    De,
    /// United States.
    Us,
}
563
564impl ValidatorKind {
565    /// Parses a policy validator kind.
566    pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
567        match s {
568            "email_rfc" => Ok(Self::EmailRfc),
569            #[cfg(feature = "phone-parser")]
570            "e164_phone" => Ok(Self::E164Phone),
571            #[cfg(feature = "phone-parser")]
572            "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
573            #[cfg(feature = "phone-parser")]
574            "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
575            "luhn" => Ok(Self::Luhn),
576            "iban_mod97" => Ok(Self::IbanMod97),
577            "ipv4_parse" => Ok(Self::Ipv4Parse),
578            "ipv6_parse" => Ok(Self::Ipv6Parse),
579            "eth_eip55" => Ok(Self::EthEip55),
580            "aadhaar_verhoeff" => Ok(Self::AadhaarVerhoeff),
581            "fr_nir_mod97" => Ok(Self::FrNirMod97),
582            "de_steuer_id_mod1110" => Ok(Self::DeSteuerIdMod1110),
583            "bsn_mod11" => Ok(Self::BsnMod11),
584            "cpf_mod11" => Ok(Self::CpfMod11),
585            "cnpj_mod11" => Ok(Self::CnpjMod11),
586            "uk_nhs_mod11" => Ok(Self::UkNhsMod11),
587            other => Err(ValidatorKindParseError::UnsupportedValidator {
588                kind: other.to_string(),
589            }),
590        }
591    }
592
593    /// Returns whether the validator accepts the input.
594    pub fn validates(self, input: &str) -> bool {
595        match self {
596            Self::AadhaarVerhoeff => aadhaar_verhoeff_check(input),
597            Self::FrNirMod97 => fr_nir_mod97_check(input),
598            Self::DeSteuerIdMod1110 => de_steuer_id_mod1110_check(input),
599            Self::BsnMod11 => bsn_mod11_check(input),
600            Self::CpfMod11 => cpf_mod11_check(input),
601            Self::CnpjMod11 => cnpj_mod11_check(input),
602            Self::UkNhsMod11 => uk_nhs_mod11_check(input),
603            _ => self.canonical_form(input).is_some(),
604        }
605    }
606
607    /// Applies validation and returns a typed outcome for audit.
608    pub fn validate(self, input: &str) -> ValidatorOutcome {
609        match self.canonical_form(input) {
610            Some(canonical_form) => ValidatorOutcome::Pass {
611                canonical_form: Some(canonical_form),
612            },
613            None => ValidatorOutcome::Fail {
614                reason: self.fail_reason(),
615            },
616        }
617    }
618
619    /// Returns the canonical form for accepted input.
620    pub fn canonical_form(self, input: &str) -> Option<String> {
621        match self {
622            Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
623            #[cfg(feature = "phone-parser")]
624            Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
625            #[cfg(feature = "phone-parser")]
626            Self::E164PhoneNational(region) => validate_phone_national(region, input),
627            Self::Luhn => luhn_check(input).then(|| input.to_string()),
628            Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
629            Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
630            Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
631            Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
632            Self::AadhaarVerhoeff => {
633                canonical_ascii_digits::<12>(input).filter(|_| aadhaar_verhoeff_check(input))
634            }
635            Self::FrNirMod97 => {
636                canonical_ascii_digits::<15>(input).filter(|_| fr_nir_mod97_check(input))
637            }
638            Self::DeSteuerIdMod1110 => {
639                canonical_ascii_digits::<11>(input).filter(|_| de_steuer_id_mod1110_check(input))
640            }
641            Self::BsnMod11 => canonical_ascii_digits::<9>(input).filter(|_| bsn_mod11_check(input)),
642            Self::CpfMod11 => {
643                canonical_ascii_digits::<11>(input).filter(|_| cpf_mod11_check(input))
644            }
645            Self::CnpjMod11 => {
646                canonical_ascii_digits::<14>(input).filter(|_| cnpj_mod11_check(input))
647            }
648            Self::UkNhsMod11 => {
649                canonical_ascii_digits::<10>(input).filter(|_| uk_nhs_mod11_check(input))
650            }
651        }
652    }
653
654    /// Returns the audit reason emitted when validation fails.
655    pub fn fail_reason(self) -> ValidatorFailReason {
656        match self {
657            Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
658            #[cfg(feature = "phone-parser")]
659            Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
660            #[cfg(feature = "phone-parser")]
661            Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
662            Self::Luhn => ValidatorFailReason::LuhnFailed,
663            Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
664            Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
665            Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
666            Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
667            Self::AadhaarVerhoeff => ValidatorFailReason::AadhaarVerhoeffFailed,
668            Self::FrNirMod97 => ValidatorFailReason::FrNirMod97Failed,
669            Self::DeSteuerIdMod1110 => ValidatorFailReason::DeSteuerIdMod1110Failed,
670            Self::BsnMod11 => ValidatorFailReason::BsnMod11Failed,
671            Self::CpfMod11 => ValidatorFailReason::CpfMod11Failed,
672            Self::CnpjMod11 => ValidatorFailReason::CnpjMod11Failed,
673            Self::UkNhsMod11 => ValidatorFailReason::UkNhsMod11Failed,
674        }
675    }
676}
677
/// Checks the basic shape of an email address.
///
/// The input is split at the first `@`. The part before it must be non-empty,
/// and the part after it must contain a `.` without starting or ending in one.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let has_dot = domain.contains('.');
            let dot_at_edge = domain.starts_with('.') || domain.ends_with('.');
            !local.is_empty() && has_dot && !dot_at_edge
        }
    }
}
684
/// Returns `true` when `input` parses without a default region and the parsed
/// number is reported valid by the `phonenumber` crate.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    match phonenumber::parse(None, input) {
        Ok(phone) => phonenumber::is_valid(&phone),
        Err(_) => false,
    }
}
689
/// Validates `input` as a national number for `region` and returns it in E.164 form.
///
/// The number is parsed with the region's country as default. It is accepted
/// only when its country code matches the region (49 for `De`, 1 for `Us`) and
/// it is either valid according to the parser or one of the fixture numbers
/// recognized by `is_safe_fixture_phone`.
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let (country, expected_code) = match region {
        Region::De => (phonenumber::country::DE, 49),
        Region::Us => (phonenumber::country::US, 1),
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    let accepted = number.country().code() == expected_code
        && (number.is_valid() || is_safe_fixture_phone(region, input));
    accepted.then(|| number.format().mode(phonenumber::Mode::E164).to_string())
}
709
/// Recognizes a fixed set of fixture phone numbers by their digits alone.
///
/// All non-digit characters are ignored. For `Us` the digits must be
/// `15550100`, or a leading `1` followed by ten digits whose last seven start
/// with `55501`. For `De` the digits must equal one of six listed numbers.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    let digits: String = input.chars().filter(|ch| ch.is_ascii_digit()).collect();
    match region {
        Region::Us => {
            if digits == "15550100" {
                return true;
            }
            match digits.strip_prefix('1') {
                // `rest` is ASCII digits only, so slicing by byte offset is safe.
                Some(rest) => rest.len() == 10 && rest[3..].starts_with("55501"),
                None => false,
            }
        }
        Region::De => {
            const FIXTURES: [&str; 6] = [
                "493000000000",
                "4915100000000",
                "4915550112233",
                "015550112233",
                "491710000000",
                "01710000000",
            ];
            FIXTURES.contains(&digits.as_str())
        }
    }
}
732
/// Validates a Luhn checksum over the digits of `input`.
///
/// ASCII whitespace and `-` are skipped; any other non-digit byte rejects the
/// input. The digit count must be between 13 and 19 inclusive.
fn luhn_check(input: &str) -> bool {
    let mut digits: Vec<u8> = Vec::new();
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(byte - b'0'),
            b'-' => {}
            other if other.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    if !matches!(digits.len(), 13..=19) {
        return false;
    }

    // Walk from the rightmost digit, doubling every second one.
    let mut sum = 0u32;
    for (index, &digit) in digits.iter().rev().enumerate() {
        let plain = u32::from(digit);
        sum += if index % 2 == 0 {
            plain
        } else if plain * 2 > 9 {
            plain * 2 - 9
        } else {
            plain * 2
        };
    }
    sum % 10 == 0
}
765
766fn iban_mod97_check(input: &str) -> bool {
767    let canonical = iban_canonicalize(input);
768    if !(15..=34).contains(&canonical.len()) {
769        return false;
770    }
771    if !canonical.chars().all(|ch| ch.is_ascii_alphanumeric()) {
772        return false;
773    }
774
775    let mut remainder = 0u32;
776    for ch in canonical[4..].chars().chain(canonical[..4].chars()) {
777        match ch {
778            '0'..='9' => {
779                remainder = (remainder * 10 + ch.to_digit(10).expect("digit")) % 97;
780            }
781            'A'..='Z' => {
782                let value = u32::from(ch) - u32::from('A') + 10;
783                remainder = (remainder * 10 + value / 10) % 97;
784                remainder = (remainder * 10 + value % 10) % 97;
785            }
786            _ => return false,
787        }
788    }
789    remainder == 1
790}
791
/// Canonicalizes an IBAN candidate: drops ASCII whitespace and uppercases
/// ASCII letters.
///
/// Only ASCII letters are case-folded. Full Unicode uppercasing would turn
/// characters such as `ſ` or `ß` into ASCII `S` / `SS`, letting non-ASCII input
/// pass as a well-formed IBAN; here such characters are left unchanged so the
/// ASCII-alphanumeric check in `iban_mod97_check` rejects them.
fn iban_canonicalize(input: &str) -> String {
    input
        .chars()
        .filter(|ch| !ch.is_ascii_whitespace())
        .map(|ch| ch.to_ascii_uppercase())
        .collect()
}
799
/// Returns `true` when `input` parses as a `std::net::Ipv4Addr`.
fn ipv4_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv4Addr, _> = input.parse();
    parsed.is_ok()
}
803
/// Returns `true` when `input` parses as a `std::net::Ipv6Addr`.
fn ipv6_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv6Addr, _> = input.parse();
    parsed.is_ok()
}
807
808fn eth_eip55_check(input: &str) -> bool {
809    let Some(address) = input.strip_prefix("0x") else {
810        return false;
811    };
812    if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
813        return false;
814    }
815    if address
816        .bytes()
817        .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
818        || address
819            .bytes()
820            .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
821    {
822        return true;
823    }
824
825    let lowercase = address.to_ascii_lowercase();
826    let hash = Keccak256::digest(lowercase.as_bytes());
827    for (index, byte) in address.bytes().enumerate() {
828        if byte.is_ascii_digit() {
829            continue;
830        }
831        let hash_nibble = if index % 2 == 0 {
832            hash[index / 2] >> 4
833        } else {
834            hash[index / 2] & 0x0f
835        };
836        if (hash_nibble > 7) != byte.is_ascii_uppercase() {
837            return false;
838        }
839    }
840    true
841}
842
/// Extracts exactly `N` ASCII digits from `input` as numeric values.
///
/// Space, tab, newline, carriage return, `-`, `.` and `/` are skipped. Any
/// other non-digit byte, or a digit count different from `N`, yields `None`.
fn collect_ascii_digits<const N: usize>(input: &str) -> Option<[u8; N]> {
    let mut digits = [0u8; N];
    let mut filled = 0usize;
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => {
                // `get_mut` fails once all `N` slots are taken: too many digits.
                let slot = digits.get_mut(filled)?;
                *slot = byte - b'0';
                filled += 1;
            }
            b' ' | b'\t' | b'\n' | b'\r' | b'-' | b'.' | b'/' => {}
            _ => return None,
        }
    }
    if filled == N {
        Some(digits)
    } else {
        None
    }
}
861
862fn canonical_ascii_digits<const N: usize>(input: &str) -> Option<String> {
863    let digits = collect_ascii_digits::<N>(input)?;
864    let mut canonical = String::with_capacity(N);
865    for digit in digits {
866        canonical.push(char::from(b'0' + digit));
867    }
868    Some(canonical)
869}
870
/// Returns `true` when `digits` contains at least two different values.
///
/// Empty and single-element arrays return `false`. Adjacent pairs are compared
/// rather than slicing from index 1, so `N == 0` does not panic.
fn not_all_same<const N: usize>(digits: &[u8; N]) -> bool {
    digits.windows(2).any(|pair| pair[0] != pair[1])
}
874
875fn aadhaar_verhoeff_check(input: &str) -> bool {
876    const D: [[u8; 10]; 10] = [
877        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
878        [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
879        [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
880        [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
881        [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
882        [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
883        [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
884        [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
885        [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
886        [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
887    ];
888    const P: [[u8; 10]; 8] = [
889        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
890        [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
891        [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
892        [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
893        [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
894        [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
895        [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
896        [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
897    ];
898    let Some(digits) = collect_ascii_digits::<12>(input) else {
899        return false;
900    };
901    if digits[0] < 2 || !not_all_same(&digits) {
902        return false;
903    }
904    let mut checksum = 0u8;
905    for (index, digit) in digits.iter().rev().enumerate() {
906        checksum = D[checksum as usize][P[index % 8][*digit as usize] as usize];
907    }
908    checksum == 0
909}
910
911fn fr_nir_mod97_check(input: &str) -> bool {
912    let Some(digits) = collect_ascii_digits::<15>(input) else {
913        return false;
914    };
915    if !matches!(digits[0], 1 | 2 | 3 | 4 | 7 | 8) {
916        return false;
917    }
918    let month = digits[3] * 10 + digits[4];
919    if !(1..=12).contains(&month) && !(20..=42).contains(&month) && !(50..=99).contains(&month) {
920        return false;
921    }
922    let mut number = 0u32;
923    for digit in &digits[..13] {
924        number = (number * 10 + u32::from(*digit)) % 97;
925    }
926    let key = u32::from(digits[13]) * 10 + u32::from(digits[14]);
927    97 - number == key
928}
929
930fn de_steuer_id_mod1110_check(input: &str) -> bool {
931    let Some(digits) = collect_ascii_digits::<11>(input) else {
932        return false;
933    };
934    if !steuer_id_first_ten_digits_valid(&digits) {
935        return false;
936    }
937    let mut product = 10u8;
938    for digit in &digits[..10] {
939        let mut sum = (*digit + product) % 10;
940        if sum == 0 {
941            sum = 10;
942        }
943        product = (2 * sum) % 11;
944    }
945    let check = (11 - product) % 10;
946    check == digits[10]
947}
948
/// Checks the digit-distribution rule on the first ten digits.
///
/// Returns `true` only when the leading digit is non-zero, exactly one digit
/// value occurs more than once, that value occurs two or three times, and
/// one or two digit values are absent. The eleventh digit is not inspected.
fn steuer_id_first_ten_digits_valid(digits: &[u8; 11]) -> bool {
    if digits[0] == 0 {
        return false;
    }

    let mut histogram = [0u8; 10];
    for &digit in digits.iter().take(10) {
        histogram[usize::from(digit)] += 1;
    }

    let mut absent = 0usize;
    let mut repeated = 0usize;
    let mut has_pair_or_triple = false;
    for &occurrences in &histogram {
        match occurrences {
            0 => absent += 1,
            1 => {}
            2 | 3 => {
                repeated += 1;
                has_pair_or_triple = true;
            }
            _ => repeated += 1,
        }
    }

    repeated == 1 && has_pair_or_triple && (absent == 1 || absent == 2)
}
962
963fn bsn_mod11_check(input: &str) -> bool {
964    let Some(digits) = collect_ascii_digits::<9>(input) else {
965        return false;
966    };
967    if !not_all_same(&digits) {
968        return false;
969    }
970    let sum: i32 = digits[..8]
971        .iter()
972        .enumerate()
973        .map(|(index, digit)| i32::from(*digit) * (9 - index as i32))
974        .sum::<i32>()
975        - i32::from(digits[8]);
976    sum.rem_euclid(11) == 0
977}
978
979fn cpf_mod11_check(input: &str) -> bool {
980    let Some(digits) = collect_ascii_digits::<11>(input) else {
981        return false;
982    };
983    if !not_all_same(&digits) {
984        return false;
985    }
986    mod11_check_digit(&digits[..9], 10) == digits[9]
987        && mod11_check_digit(&digits[..10], 11) == digits[10]
988}
989
990fn cnpj_mod11_check(input: &str) -> bool {
991    let Some(digits) = collect_ascii_digits::<14>(input) else {
992        return false;
993    };
994    if !not_all_same(&digits) {
995        return false;
996    }
997    const FIRST: [u8; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
998    const SECOND: [u8; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
999    weighted_mod11_check_digit(&digits[..12], &FIRST) == digits[12]
1000        && weighted_mod11_check_digit(&digits[..13], &SECOND) == digits[13]
1001}
1002
1003fn uk_nhs_mod11_check(input: &str) -> bool {
1004    let Some(digits) = collect_ascii_digits::<10>(input) else {
1005        return false;
1006    };
1007    if !not_all_same(&digits) {
1008        return false;
1009    }
1010    let sum: u32 = digits[..9]
1011        .iter()
1012        .enumerate()
1013        .map(|(index, digit)| u32::from(*digit) * (10 - index as u32))
1014        .sum();
1015    let check = 11 - (sum % 11);
1016    let check = if check == 11 { 0 } else { check };
1017    check != 10 && check == u32::from(digits[9])
1018}
1019
/// Computes a mod-11 check digit using weights descending from `start_weight` to 2.
///
/// Digits are paired with weights in order; pairing stops at the shorter of
/// the two sequences. A remainder of 0 or 1 yields 0, otherwise the result is
/// `11 - remainder`.
fn mod11_check_digit(digits: &[u8], start_weight: u8) -> u8 {
    let mut weighted_sum = 0u32;
    for (&digit, weight) in digits.iter().zip((2..=start_weight).rev()) {
        weighted_sum += u32::from(digit) * u32::from(weight);
    }
    match weighted_sum % 11 {
        0 | 1 => 0,
        // The remainder is in 2..=10 here, so the result fits in a u8.
        remainder => (11 - remainder) as u8,
    }
}
1034
/// Computes a mod-11 check digit from digits paired with explicit weights.
///
/// Pairing stops at the shorter of the two slices. A remainder of 0 or 1
/// yields 0, otherwise the result is `11 - remainder`.
fn weighted_mod11_check_digit(digits: &[u8], weights: &[u8]) -> u8 {
    let total = digits
        .iter()
        .zip(weights)
        .fold(0u32, |acc, (&digit, &weight)| {
            acc + u32::from(digit) * u32::from(weight)
        });
    let remainder = total % 11;
    if remainder >= 2 {
        // The remainder is in 2..=10 here, so the result fits in a u8.
        (11 - remainder) as u8
    } else {
        0
    }
}
1048
1049/// A detected span and its class/source metadata.
1050#[derive(Debug, Clone, PartialEq, Eq)]
1051#[non_exhaustive]
1052pub struct Detection {
1053    /// Byte span in the original input.
1054    pub span: Range<usize>,
1055    /// PII class assigned to the span.
1056    pub class: PiiClass,
1057    /// Detector source identifier.
1058    pub source: String,
1059}
1060
1061impl Detection {
1062    /// Builds a detected PII span.
1063    pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
1064        Self {
1065            span,
1066            class,
1067            source: source.into(),
1068        }
1069    }
1070}
1071
/// Observer-only post-clean check (Pass 3 in the detection pipeline).
///
/// Runs against already-tokenized output. May report suspected missed PII as
/// [`LeakSuspect`] values (which callers can aggregate in a [`LeakReport`]) but
/// **must not** mutate the token manifest, the `CleanDocument`, or the restore
/// path. Safety nets are additive defense-in-depth, not a replacement for
/// Pass 1/2 detection.
///
/// Activate at runtime with `Pipeline::with_safety_net` (post-build) or
/// `PipelineBuilder::register_safety_net` (during build), or via the CLI
/// `--safety-net=<name>` flag.
///
/// If a safety net reports a suspected miss, the caller decides the response; the
/// pipeline never silently re-cleans based on safety net output.
///
/// Implementations must be `Send + Sync`.
pub trait SafetyNet: Send + Sync {
    /// Stable backend identifier used in telemetry and audit rows.
    fn id(&self) -> &str;

    /// Locale tags supported by this safety net. Empty means global.
    fn supported_locales(&self) -> &[LocaleTag];

    /// Checks clean text for possible PII that the manifest did not cover.
    ///
    /// `clean_text` is the text segment to inspect. `context` carries the
    /// emitted-token manifest, locale chain, document kind, and the optional
    /// session id and field path for that segment.
    ///
    /// # Errors
    ///
    /// Returns a [`SafetyNetError`] when the check cannot produce a result.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
1099
1100/// Context passed to a privacy safety net.
1101#[derive(Debug, Clone, Copy)]
1102#[non_exhaustive]
1103pub struct SafetyNetContext<'a> {
1104    /// Tokens emitted by the pseudonymization pipeline for this text segment.
1105    pub manifest: &'a Manifest,
1106    /// Active session-level locale chain. For `RawDocument::Structured`, locale
1107    /// gating uses this same session-level chain across all fields; structured
1108    /// fields do not carry per-field locale annotations.
1109    pub locale_chain: &'a [LocaleTag],
1110    /// Source document kind being checked.
1111    pub document_kind: DocumentKind,
1112    /// Optional audit session identifier.
1113    pub session_id: Option<&'a str>,
1114    /// Structured-document field path, such as `$.user.email`.
1115    pub field_path: Option<&'a str>,
1116}
1117
1118impl<'a> SafetyNetContext<'a> {
1119    /// Builds safety-net context for one clean text segment.
1120    pub fn new(
1121        manifest: &'a Manifest,
1122        locale_chain: &'a [LocaleTag],
1123        document_kind: DocumentKind,
1124        session_id: Option<&'a str>,
1125        field_path: Option<&'a str>,
1126    ) -> Self {
1127        Self {
1128            manifest,
1129            locale_chain,
1130            document_kind,
1131            session_id,
1132            field_path,
1133        }
1134    }
1135}
1136
1137/// A replacement emitted by the pseudonymization pipeline.
1138#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1139#[non_exhaustive]
1140pub struct EmittedTokenSpan {
1141    /// Byte span in the clean text.
1142    pub clean_span: Range<usize>,
1143    /// Byte span in the raw text that produced the token.
1144    pub raw_span: Range<usize>,
1145    /// PII class represented by the emitted token.
1146    pub class: PiiClass,
1147}
1148
1149impl EmittedTokenSpan {
1150    /// Builds an emitted token span.
1151    pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
1152        Self {
1153            clean_span,
1154            raw_span,
1155            class,
1156        }
1157    }
1158}
1159
1160/// Set of emitted token spans for one clean text segment.
1161#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
1162#[non_exhaustive]
1163pub struct Manifest {
1164    /// Spans sorted by `clean_span.start`.
1165    pub spans: Vec<EmittedTokenSpan>,
1166}
1167
1168impl Manifest {
1169    /// Builds a manifest from spans and sorts them by clean byte start.
1170    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
1171        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
1172        Self { spans }
1173    }
1174
1175    /// Diffs one safety-net suspect span against emitted token coverage.
1176    ///
1177    /// Returns `None` when the suspect span is continuously covered by emitted
1178    /// token spans of the same class. Internal gaps return
1179    /// `LeakKind::PartialBleed`. When multiple uncovered gaps exist, this method
1180    /// deterministically returns the first gap by byte offset; full gap
1181    /// enumeration is intentionally deferred to a future report format.
1182    pub fn diff_against(
1183        &self,
1184        suspect_span: &Range<usize>,
1185        suspect_class: &PiiClass,
1186    ) -> Option<LeakKind> {
1187        if suspect_span.is_empty() {
1188            return None;
1189        }
1190
1191        let start_idx = self
1192            .spans
1193            .partition_point(|span| span.clean_span.end <= suspect_span.start);
1194        let overlapping = self.spans[start_idx..]
1195            .iter()
1196            .take_while(|span| span.clean_span.start < suspect_span.end)
1197            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
1198            .collect::<Vec<_>>();
1199
1200        if overlapping.is_empty() {
1201            return Some(LeakKind::Uncovered);
1202        }
1203
1204        let mut cursor = suspect_span.start;
1205        let mut first_mismatch = None::<&EmittedTokenSpan>;
1206        for span in overlapping {
1207            if span.clean_span.start > cursor {
1208                return Some(LeakKind::PartialBleed {
1209                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
1210                });
1211            }
1212
1213            if span.clean_span.end > cursor {
1214                if first_mismatch.is_none() && &span.class != suspect_class {
1215                    first_mismatch = Some(span);
1216                }
1217                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
1218                if cursor >= suspect_span.end {
1219                    break;
1220                }
1221            }
1222        }
1223
1224        if cursor < suspect_span.end {
1225            return Some(LeakKind::PartialBleed {
1226                uncovered: cursor..suspect_span.end,
1227            });
1228        }
1229
1230        first_mismatch.map(|span| LeakKind::ClassMismatch {
1231            pipeline_class: span.class.clone(),
1232            safety_net_class: suspect_class.clone(),
1233        })
1234    }
1235}
1236
/// Returns `true` when the two half-open ranges are not strictly separated,
/// i.e. neither range ends at or before the start of the other.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    let left_ends_first = left.end <= right.start;
    let right_ends_first = right.end <= left.start;
    !(left_ends_first || right_ends_first)
}
1240
1241/// Suspected leak reported by an observer-only safety net.
1242#[derive(Debug, Clone, PartialEq)]
1243#[non_exhaustive]
1244pub struct LeakSuspect {
1245    /// Byte span in clean text.
1246    pub span: Range<usize>,
1247    /// Mapped PII class for the suspect.
1248    pub class: PiiClass,
1249    /// Safety-net backend identifier.
1250    pub safety_net_id: String,
1251    /// Optional backend confidence score.
1252    pub score: Option<f32>,
1253    /// Leak classification after manifest correlation.
1254    pub kind: LeakKind,
1255    /// Raw backend label after validation/mapping, never source text.
1256    pub raw_label: String,
1257    /// Optional structured field path.
1258    pub field_path: Option<String>,
1259}
1260
1261impl LeakSuspect {
1262    /// Builds a safety-net leak suspect.
1263    pub fn new(
1264        span: Range<usize>,
1265        class: PiiClass,
1266        safety_net_id: impl Into<String>,
1267        score: Option<f32>,
1268        kind: LeakKind,
1269        raw_label: impl Into<String>,
1270        field_path: Option<String>,
1271    ) -> Self {
1272        Self {
1273            span,
1274            class,
1275            safety_net_id: safety_net_id.into(),
1276            score,
1277            kind,
1278            raw_label: raw_label.into(),
1279            field_path,
1280        }
1281    }
1282}
1283
/// The category of a suspected missed PII span.
///
/// `LeakKind` is `#[non_exhaustive]`. Match with a wildcard for forward compatibility.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No emitted token span overlaps the suspect span.
    ///
    /// `Manifest::diff_against` returns this when its overlap search finds
    /// nothing, regardless of class.
    Uncovered,
    /// The suspect is only partly covered; `uncovered` is the first gap.
    PartialBleed {
        /// First uncovered byte range in the suspect span.
        uncovered: Range<usize>,
    },
    /// The suspect is continuously covered, but by a different class.
    ClassMismatch {
        /// Class emitted by the pipeline.
        pipeline_class: PiiClass,
        /// Class reported by the safety net.
        safety_net_class: PiiClass,
    },
}
1305
/// Bytes-free telemetry emitted by safety-net orchestration.
///
/// Events carry identifiers and metadata only. The enum is
/// `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// Safety net skipped because the session-level locale chain did not match.
    LocaleSkipped {
        /// Safety-net backend identifier.
        safety_net_id: String,
        /// Document kind checked.
        document_kind: DocumentKind,
        /// Optional structured field path when skip was recorded per field.
        field_path: Option<String>,
    },
}
1320
/// Aggregate leak report statistics.
///
/// `LeakReport::from_parts` derives these counts from the suspects and
/// telemetry it is given. The default value has every count at zero.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Number of suspects reported.
    pub suspect_count: usize,
    /// Number of uncovered suspects (`LeakKind::Uncovered`).
    pub uncovered_count: usize,
    /// Number of partial-bleed suspects (`LeakKind::PartialBleed`).
    pub partial_bleed_count: usize,
    /// Number of class-mismatch suspects (`LeakKind::ClassMismatch`).
    pub class_mismatch_count: usize,
    /// Number of locale-skip telemetry events (`LeakReportTelemetry::LocaleSkipped`).
    pub locale_skipped_count: usize,
}
1336
1337/// Signed document-context metadata carried inside a session snapshot envelope.
1338///
1339/// This extension is the v0.7 bridge for `gaze-document`: it is safe to serialize
1340/// inside the owner-only snapshot envelope, while agent-facing files keep using
1341/// non-sensitive mirrors. The single `schema_version` is bundle-level; sub-files
1342/// do not carry independent schema versions.
1343#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1344#[non_exhaustive]
1345pub struct DocumentExtension {
1346    /// Bundle-level schema version shared by clean, layout, preview, report, and manifest files.
1347    pub schema_version: u16,
1348    /// SHA-256 of `clean.md` NFC-normalized bytes.
1349    pub clean_md_sha256: [u8; 32],
1350    /// SHA-256 of canonical `layout.json` bytes.
1351    pub layout_json_sha256: [u8; 32],
1352    /// SHA-256 of canonical `report.json` bytes.
1353    pub report_json_sha256: [u8; 32],
1354    /// SHA-256 of `preview-redacted.png` bytes when a preview is present.
1355    #[serde(default, skip_serializing_if = "Option::is_none")]
1356    pub preview_png_sha256: Option<[u8; 32]>,
1357    /// Page count reported for the source document.
1358    pub page_count: u32,
1359    /// Audit session id mirrored from the writing session for cross-pane correlation.
1360    pub audit_session_id: String,
1361    /// Signed clean.md byte spans for every emitted token.
1362    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1363    pub clean_spans: Vec<EmittedTokenSpan>,
1364    /// Codec audit rows for the decode path that produced this document extension.
1365    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1366    pub codec_audit: Vec<CodecAuditRow>,
1367}
1368
1369impl DocumentExtension {
1370    /// Starts a document extension builder for one bundle schema version.
1371    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
1372        DocumentExtensionBuilder {
1373            schema_version,
1374            clean_md_sha256: None,
1375            layout_json_sha256: None,
1376            report_json_sha256: None,
1377            preview_png_sha256: None,
1378            page_count: None,
1379            audit_session_id: None,
1380            clean_spans: Vec::new(),
1381            codec_audit: Vec::new(),
1382        }
1383    }
1384}
1385
1386/// Builder for [`DocumentExtension`] that requires signed integrity-binding fields.
1387#[derive(Debug, Clone)]
1388#[must_use]
1389pub struct DocumentExtensionBuilder {
1390    schema_version: u16,
1391    clean_md_sha256: Option<[u8; 32]>,
1392    layout_json_sha256: Option<[u8; 32]>,
1393    report_json_sha256: Option<[u8; 32]>,
1394    preview_png_sha256: Option<[u8; 32]>,
1395    page_count: Option<u32>,
1396    audit_session_id: Option<String>,
1397    clean_spans: Vec<EmittedTokenSpan>,
1398    codec_audit: Vec<CodecAuditRow>,
1399}
1400
1401impl DocumentExtensionBuilder {
1402    pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
1403        self.clean_md_sha256 = Some(hash);
1404        self
1405    }
1406
1407    pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
1408        self.layout_json_sha256 = Some(hash);
1409        self
1410    }
1411
1412    pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
1413        self.report_json_sha256 = Some(hash);
1414        self
1415    }
1416
1417    pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
1418        self.preview_png_sha256 = Some(hash);
1419        self
1420    }
1421
1422    pub fn page_count(mut self, page_count: u32) -> Self {
1423        self.page_count = Some(page_count);
1424        self
1425    }
1426
1427    pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
1428        self.audit_session_id = Some(audit_session_id.into());
1429        self
1430    }
1431
1432    pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
1433        self.clean_spans = clean_spans;
1434        self
1435    }
1436
1437    pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
1438        self.codec_audit = codec_audit;
1439        self
1440    }
1441
1442    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
1443        Ok(DocumentExtension {
1444            schema_version: self.schema_version,
1445            clean_md_sha256: self
1446                .clean_md_sha256
1447                .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
1448            layout_json_sha256: self
1449                .layout_json_sha256
1450                .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
1451            report_json_sha256: self
1452                .report_json_sha256
1453                .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
1454            preview_png_sha256: self.preview_png_sha256,
1455            page_count: self
1456                .page_count
1457                .ok_or(DocumentExtensionError::MissingField("page_count"))?,
1458            audit_session_id: self
1459                .audit_session_id
1460                .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
1461            clean_spans: self.clean_spans,
1462            codec_audit: self.codec_audit,
1463        })
1464    }
1465}
1466
/// Errors returned while building a [`DocumentExtension`].
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required builder field was never set; the payload is the field name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
1474
/// Provenance of text extracted from a document or transcript source.
///
/// Variants serialize with `snake_case` names (for example `embedded_text`).
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    /// Text came from OCR over pixels.
    Ocr,
    /// Text came from an embedded text layer.
    EmbeddedText,
    /// Text came from an audio/video transcript.
    Transcript,
    /// Text came from multiple extraction paths.
    Hybrid,
}
1489
1490/// Orthogonal document codec capabilities delivered or advertised by a codec.
1491#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
1492#[non_exhaustive]
1493pub struct CodecCapabilitySet {
1494    /// Codec can emit text.
1495    pub text: bool,
1496    /// Codec can emit layout geometry.
1497    pub layout: bool,
1498    /// Codec can emit confidence buckets.
1499    pub confidence: bool,
1500    /// Codec can emit timestamps.
1501    pub timestamps: bool,
1502}
1503
1504impl CodecCapabilitySet {
1505    /// Text-only capability set.
1506    pub const TEXT_ONLY: Self = Self {
1507        text: true,
1508        layout: false,
1509        confidence: false,
1510        timestamps: false,
1511    };
1512
1513    /// Builds a codec capability bitset.
1514    pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
1515        Self {
1516            text,
1517            layout,
1518            confidence,
1519            timestamps,
1520        }
1521    }
1522
1523    /// Returns true when this set contains every requested capability bit.
1524    pub fn contains(self, requested: Self) -> bool {
1525        (!requested.text || self.text)
1526            && (!requested.layout || self.layout)
1527            && (!requested.confidence || self.confidence)
1528            && (!requested.timestamps || self.timestamps)
1529    }
1530}
1531
1532/// Per-codec declaration for text extraction density checks.
1533#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1534#[serde(rename_all = "snake_case")]
1535#[non_exhaustive]
1536pub enum ExtractionDensityPolicy {
1537    /// Require at least this many extracted text bytes per source KiB.
1538    Required(f32),
1539    /// Explicit exemption with an audit-visible reason.
1540    Exempt { reason: String },
1541}
1542
1543impl Default for ExtractionDensityPolicy {
1544    fn default() -> Self {
1545        Self::Exempt {
1546            reason: "calibration_pending".to_string(),
1547        }
1548    }
1549}
1550
1551/// Metadata-only audit row emitted by a document codec.
1552#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1553#[non_exhaustive]
1554pub struct CodecAuditRow {
1555    /// Stable codec id, such as `gaze.codec.tesseract`.
1556    pub codec_id: String,
1557    /// Adapter crate version, distinct from engine provenance.
1558    pub codec_version: String,
1559    /// Accepted MIME type for the decode.
1560    pub accepted_mime: String,
1561    /// Capabilities advertised by the codec.
1562    pub advertised: CodecCapabilitySet,
1563    /// Capabilities delivered for this decode.
1564    pub delivered: CodecCapabilitySet,
1565    /// Text provenance reported by the codec.
1566    pub text_origin: TextOrigin,
1567    /// Codec-output schema version, decoupled from bundle schema version.
1568    pub codec_output_schema_version: u16,
1569    /// Hash of canonical codec options, never the options themselves.
1570    #[serde(default, skip_serializing_if = "Option::is_none")]
1571    pub options_hash_hex: Option<String>,
1572    /// Engine provenance string, without paths or raw source text.
1573    #[serde(default, skip_serializing_if = "Option::is_none")]
1574    pub engine_provenance: Option<String>,
1575    /// Extraction density policy declared by the codec for this MIME.
1576    pub extraction_density_policy: ExtractionDensityPolicy,
1577}
1578
1579impl CodecAuditRow {
1580    /// Builds a metadata-only codec audit row.
1581    pub fn new(
1582        codec_id: impl Into<String>,
1583        codec_version: impl Into<String>,
1584        accepted_mime: impl Into<String>,
1585        text_origin: TextOrigin,
1586    ) -> Self {
1587        Self {
1588            codec_id: codec_id.into(),
1589            codec_version: codec_version.into(),
1590            accepted_mime: accepted_mime.into(),
1591            advertised: CodecCapabilitySet::default(),
1592            delivered: CodecCapabilitySet::default(),
1593            text_origin,
1594            codec_output_schema_version: 1,
1595            options_hash_hex: None,
1596            engine_provenance: None,
1597            extraction_density_policy: ExtractionDensityPolicy::default(),
1598        }
1599    }
1600}
1601
1602/// A suspected missed PII span reported by a [`SafetyNet`].
1603///
1604/// The safety net is not authoritative; a `LeakReport` is a signal, not a confirmed
1605/// leak. False positives are expected. Review reports and adjust policy or recognizer
1606/// thresholds.
1607#[derive(Debug, Clone, Default, PartialEq)]
1608#[non_exhaustive]
1609pub struct LeakReport {
1610    /// Suspected leaks, containing metadata only.
1611    pub suspects: Vec<LeakSuspect>,
1612    /// Bytes-free telemetry events.
1613    pub telemetry: Vec<LeakReportTelemetry>,
1614    /// Aggregated counts for callers that do not need full suspect metadata.
1615    pub stats: LeakReportStats,
1616    /// Optional replay hash.
1617    ///
1618    /// Replay determinism is guaranteed only when command path, checkpoint,
1619    /// operating point, min score, and decode parameters are fixed externally.
1620    pub replay_hash: Option<String>,
1621}
1622
1623impl LeakReport {
1624    /// Builds a report from suspects and telemetry.
1625    pub fn from_parts(
1626        suspects: Vec<LeakSuspect>,
1627        telemetry: Vec<LeakReportTelemetry>,
1628    ) -> LeakReport {
1629        let mut stats = LeakReportStats {
1630            suspect_count: suspects.len(),
1631            locale_skipped_count: telemetry
1632                .iter()
1633                .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
1634                .count(),
1635            ..LeakReportStats::default()
1636        };
1637        for suspect in &suspects {
1638            match suspect.kind {
1639                LeakKind::Uncovered => stats.uncovered_count += 1,
1640                LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
1641                LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
1642            }
1643        }
1644        LeakReport {
1645            suspects,
1646            telemetry,
1647            stats,
1648            replay_hash: None,
1649        }
1650    }
1651
1652    /// Merges another report into this report.
1653    pub fn extend(&mut self, other: LeakReport) {
1654        self.suspects.extend(other.suspects);
1655        self.telemetry.extend(other.telemetry);
1656        *self = LeakReport::from_parts(
1657            std::mem::take(&mut self.suspects),
1658            std::mem::take(&mut self.telemetry),
1659        );
1660    }
1661}
1662
/// Upstream OpenAI Privacy Filter labels accepted by Gaze.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// `private_person`.
    PrivatePerson,
    /// `private_address`.
    PrivateAddress,
    /// `private_email`.
    PrivateEmail,
    /// `private_phone`.
    PrivatePhone,
    /// `private_url`.
    PrivateUrl,
    /// `private_date`.
    PrivateDate,
    /// `account_number`.
    AccountNumber,
    /// `secret`.
    Secret,
}

impl OpenAiPrivateLabel {
    /// Returns the label exactly as the upstream filter spells it.
    pub fn as_str(self) -> &'static str {
        match self {
            OpenAiPrivateLabel::PrivatePerson => "private_person",
            OpenAiPrivateLabel::PrivateAddress => "private_address",
            OpenAiPrivateLabel::PrivateEmail => "private_email",
            OpenAiPrivateLabel::PrivatePhone => "private_phone",
            OpenAiPrivateLabel::PrivateUrl => "private_url",
            OpenAiPrivateLabel::PrivateDate => "private_date",
            OpenAiPrivateLabel::AccountNumber => "account_number",
            OpenAiPrivateLabel::Secret => "secret",
        }
    }
}
1700
1701/// Closed safety-net PII vocabulary before mapping into `PiiClass`.
1702#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
1703#[non_exhaustive]
1704pub enum SafetyNetPiiClass {
1705    /// Email address.
1706    Email,
1707    /// Person name.
1708    Name,
1709    /// Location or address.
1710    Location,
1711    /// Phone number.
1712    Phone,
1713    /// URL.
1714    Url,
1715    /// Date.
1716    Date,
1717    /// Account number.
1718    AccountNumber,
1719    /// Secret.
1720    Secret,
1721}
1722
1723impl SafetyNetPiiClass {
1724    /// Maps the safety-net class into the shared pipeline class vocabulary.
1725    pub fn to_pii_class(self) -> PiiClass {
1726        match self {
1727            Self::Email => PiiClass::Email,
1728            Self::Name => PiiClass::Name,
1729            Self::Location => PiiClass::Location,
1730            Self::Phone => PiiClass::custom("phone"),
1731            Self::Url => PiiClass::custom("url"),
1732            Self::Date => PiiClass::custom("date"),
1733            Self::AccountNumber => PiiClass::custom("account_number"),
1734            Self::Secret => PiiClass::custom("secret"),
1735        }
1736    }
1737}
1738
/// Error set for safety-net execution.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm even though the variants below cover every failure currently
/// defined. String payloads are documented as sanitized by the producer.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// Safety net was explicitly requested but is unavailable.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Required model weights or checkpoint are missing.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Sanitized path or identifier.
        path: String,
    },
    /// Backend model could not be loaded or reached.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Backend model artifacts failed integrity verification.
    #[error("safety net model integrity mismatch: expected={expected}, actual={actual}")]
    ModelIntegrityMismatch {
        /// Expected SHA256 digest.
        expected: String,
        /// Actual SHA256 digest.
        actual: String,
    },
    /// Input exceeded configured backend limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Configured byte limit.
        limit: usize,
        /// Actual byte length.
        actual: usize,
    },
    /// Backend runtime failed.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Sanitized diagnostic message.
        message: String,
    },
    /// Backend returned invalid output.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// Sanitized diagnostic message.
        message: String,
    },
}
1790
/// Disposition applied to a detected PII span.
///
/// | Variant | Restorable | Output shape |
/// |---------|------------|--------------|
/// | `Tokenize` | Yes | Opaque token: `<hex:Class_N>` |
/// | `FormatPreserve` | Yes | Realistic-looking pseudonym (e.g., `email1.hex@gaze-fake.invalid`) |
/// | `Redact` | No | Literal `[REDACTED]` -- original value is gone |
/// | `Generalize` | No | Class label (e.g., `[Email]`) -- original value is gone |
/// | `Preserve` | - | Passes through unchanged |
///
/// `Action` is `#[non_exhaustive]`. Use a wildcard arm in exhaustive matches.
/// When restore is required, use `Tokenize` or `FormatPreserve` -- `Redact` and
/// `Generalize` are irreversible.
///
/// Audit rows record the action as a snake_case label (`"tokenize"`, `"redact"`,
/// `"format_preserve"`, `"generalize"`, `"preserve"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace PII with a reversible token.
    Tokenize,
    /// Replace PII with a non-restorable redaction marker.
    Redact,
    /// Replace PII with a reversible format-preserving token.
    FormatPreserve,
    /// Replace PII with a broader category; not restorable.
    Generalize,
    /// Preserve the original value.
    Preserve,
}
1818
/// Conflict resolution tier that selected or rejected a candidate.
///
/// Stored on `RedactionEntry::decided_by` and written to audit rows as the
/// snake_case form of the variant name (for example `ClassPriority` becomes
/// `"class_priority"`). The enum is `#[non_exhaustive]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict resolution was needed.
    None,
    /// Class priority decided the conflict.
    ClassPriority,
    /// Rule priority decided the conflict.
    RulePriority,
    /// Candidate score decided the conflict.
    Score,
    /// Span length decided the conflict.
    SpanLength,
    /// Same-class containment validator result decided the conflict.
    Validator,
    /// Pre-resolver validator veto rejected the candidate.
    ValidatorVeto,
    /// Cross-class collision-family policy decided the conflict.
    CollisionPolicy,
    /// Mandatory-anchor context was missing, so family-level fallback was emitted.
    AnchoredContext,
    /// Recognizer identifier decided the conflict.
    RecognizerId,
    /// Candidate was merged with another candidate.
    Merged,
    /// Safety-net redact mode stripped a suspect span.
    Redact,
    /// Safety-net resolve mode promoted a suspect span into a reversible token.
    Resolve,
    /// Safety-net fallback policy decided the outcome.
    Fallback,
}
1852
/// Safety-net fallback reason recorded in metadata-only audit rows.
///
/// Stored on `RedactionEntry::fallback_triggered`. Unlike the other audit enums
/// in this file it derives `Serialize`/`Deserialize` directly. The enum is
/// `#[non_exhaustive]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum FallbackReason {
    /// The suspect overlapped an existing emitted token in a way that could not be promoted.
    OverlapConflict,
    /// A validator rejected the promoted candidate.
    ValidatorVeto,
    /// A mandatory anchor was missing for the promoted candidate.
    AnchorMissing,
    /// A follow-up safety-net pass still observed a suspect.
    ResidualSuspect,
}
1866
/// Source document kind for metadata-only audit logging.
///
/// Written to audit rows as `"structured"` or `"text"`. The enum is
/// `#[non_exhaustive]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Structured key/value document.
    Structured,
    /// Plain text document.
    Text,
}
1876
/// One row of redaction metadata emitted to a [`RedactionLogger`].
///
/// Fields identify the PII class, action taken, session ID, source document kind,
/// conflict-resolution metadata, and timestamp. Does **not** contain the original PII
/// value, the token string, or any identifiable content beyond what a compliance audit
/// requires.
///
/// `RedactionEntry` is `#[non_exhaustive]`; adopters must construct via the public
/// constructor or destructure with a wildcard pattern.
///
/// Serialization is hand-written: the recognizer, provenance, dropped-backend, and
/// restore fields are omitted from the output when they are `None`, while the other
/// optional fields are always written.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Detector or recognizer source identifier.
    pub source: String,
    /// Stable semantic recognizer identifier, when available.
    pub recognizer_id: Option<String>,
    /// Versioned recognizer artifact/rule identifier, when available.
    pub recognizer_version_id: Option<String>,
    /// PII class affected by the decision.
    pub class: PiiClass,
    /// Policy action applied to the span.
    pub action: Action,
    /// Optional structured field name.
    pub field_name: Option<String>,
    /// Source document kind.
    pub document_kind: DocumentKind,
    /// Whether this entry records a loser in conflict resolution.
    pub conflict_loser: bool,
    /// Conflict tier that decided the outcome.
    pub decided_by: ConflictTier,
    /// Creation timestamp in epoch milliseconds.
    pub created_at: i64,
    /// Optional session identifier.
    pub session_id: Option<String>,
    /// Optional validator failure reason for a vetoed candidate.
    pub validator_fail_reason: Option<ValidatorFailReason>,
    /// Optional ambiguity metadata for a family-level fallback.
    pub ambiguity_record: Option<AmbiguityRecord>,
    /// Collision family that influenced this decision.
    pub collision_family: Option<String>,
    /// Collision variant that influenced this decision.
    pub collision_variant: Option<String>,
    /// Safety-net fallback reason, when fallback policy handled the row.
    pub fallback_triggered: Option<FallbackReason>,
    /// NER/pipeline provenance stage for audit-only producer attribution.
    pub provenance_stage: Option<String>,
    /// Provenance: model identifier, as passed to `with_provenance_metadata`.
    pub provenance_model_id: Option<String>,
    /// Provenance: model version, as passed to `with_provenance_metadata`.
    pub provenance_model_version: Option<String>,
    /// Provenance: artifact digest string (presumably SHA-256 given the name -- confirm
    /// with the producer).
    pub provenance_artifact_sha256: Option<String>,
    /// Provenance: tokenizer digest string (presumably SHA-256 given the name -- confirm
    /// with the producer).
    pub provenance_tokenizer_sha256: Option<String>,
    /// Provenance: resolved locale, as passed to `with_provenance_metadata`.
    pub provenance_locale_resolved: Option<String>,
    /// Provenance: locale match kind, as passed to `with_provenance_metadata`.
    pub provenance_locale_match_kind: Option<String>,
    /// Provenance: canonical class label, as passed to `with_provenance_metadata`.
    pub provenance_canonical_class: Option<String>,
    /// Provenance: native class label, as passed to `with_provenance_metadata`.
    pub provenance_native_class: Option<String>,
    /// Provenance: confidence, stored as the `to_string()` rendering of the `f64`
    /// passed to `with_provenance_metadata`.
    pub provenance_confidence: Option<String>,
    /// Provenance: merged-from descriptor, as passed to `with_provenance_metadata`.
    pub provenance_merged_from: Option<String>,
    /// Locale-aware safety-net backend ids dropped by first-match-wins routing.
    pub backend_silently_dropped: Option<Vec<String>>,
    /// Restore policy label, copied from `RestoreTelemetry::restore_policy_str()`.
    pub restore_policy: Option<String>,
    /// Restore decision label, copied from `RestoreTelemetry::restore_decision_str()`.
    pub restore_decision: Option<String>,
    /// Copied from `RestoreTelemetry::unknown_token_count`.
    pub restore_unknown_token_count: Option<u64>,
    /// Copied from `RestoreTelemetry::manifest_bypass_count`.
    pub restore_manifest_bypass_count: Option<u64>,
    /// Copied from `RestoreTelemetry::fresh_pii_detected_count`.
    pub restore_fresh_pii_count: Option<u64>,
    /// Copied from `RestoreTelemetry::phase_execution_mask`.
    pub restore_phase_mask: Option<u32>,
}
1942
1943impl Serialize for RedactionEntry {
1944    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
1945    where
1946        S: serde::Serializer,
1947    {
1948        use serde::ser::SerializeStruct;
1949
1950        let mut len = 14;
1951        if self.recognizer_id.is_some() {
1952            len += 1;
1953        }
1954        if self.recognizer_version_id.is_some() {
1955            len += 1;
1956        }
1957        len += [
1958            self.provenance_stage.as_ref(),
1959            self.provenance_model_id.as_ref(),
1960            self.provenance_model_version.as_ref(),
1961            self.provenance_artifact_sha256.as_ref(),
1962            self.provenance_tokenizer_sha256.as_ref(),
1963            self.provenance_locale_resolved.as_ref(),
1964            self.provenance_locale_match_kind.as_ref(),
1965            self.provenance_canonical_class.as_ref(),
1966            self.provenance_native_class.as_ref(),
1967            self.provenance_confidence.as_ref(),
1968            self.provenance_merged_from.as_ref(),
1969        ]
1970        .into_iter()
1971        .filter(|value| value.is_some())
1972        .count();
1973        if self.backend_silently_dropped.is_some() {
1974            len += 1;
1975        }
1976        len += [self.restore_policy.as_ref(), self.restore_decision.as_ref()]
1977            .into_iter()
1978            .filter(|value| value.is_some())
1979            .count();
1980        len += [
1981            self.restore_unknown_token_count.is_some(),
1982            self.restore_manifest_bypass_count.is_some(),
1983            self.restore_fresh_pii_count.is_some(),
1984            self.restore_phase_mask.is_some(),
1985        ]
1986        .into_iter()
1987        .filter(|value| *value)
1988        .count();
1989        let mut state = serializer.serialize_struct("RedactionEntry", len)?;
1990        state.serialize_field("source", &self.source)?;
1991        if let Some(recognizer_id) = &self.recognizer_id {
1992            state.serialize_field("recognizer_id", recognizer_id)?;
1993        }
1994        if let Some(recognizer_version_id) = &self.recognizer_version_id {
1995            state.serialize_field("recognizer_version_id", recognizer_version_id)?;
1996        }
1997        state.serialize_field("class", &self.class.to_canonical_str())?;
1998        state.serialize_field("action", redaction_action_as_str(self.action))?;
1999        state.serialize_field("field_name", &self.field_name)?;
2000        state.serialize_field(
2001            "document_kind",
2002            redaction_document_kind_as_str(self.document_kind),
2003        )?;
2004        state.serialize_field("conflict_loser", &self.conflict_loser)?;
2005        state.serialize_field(
2006            "decided_by",
2007            redaction_conflict_tier_as_str(self.decided_by),
2008        )?;
2009        state.serialize_field("created_at", &self.created_at)?;
2010        state.serialize_field("session_id", &self.session_id)?;
2011        state.serialize_field("validator_fail_reason", &self.validator_fail_reason)?;
2012        state.serialize_field("ambiguity_record", &self.ambiguity_record)?;
2013        state.serialize_field("collision_family", &self.collision_family)?;
2014        state.serialize_field("collision_variant", &self.collision_variant)?;
2015        state.serialize_field("fallback_triggered", &self.fallback_triggered)?;
2016        if let Some(value) = &self.provenance_stage {
2017            state.serialize_field("provenance_stage", value)?;
2018        }
2019        if let Some(value) = &self.provenance_model_id {
2020            state.serialize_field("provenance_model_id", value)?;
2021        }
2022        if let Some(value) = &self.provenance_model_version {
2023            state.serialize_field("provenance_model_version", value)?;
2024        }
2025        if let Some(value) = &self.provenance_artifact_sha256 {
2026            state.serialize_field("provenance_artifact_sha256", value)?;
2027        }
2028        if let Some(value) = &self.provenance_tokenizer_sha256 {
2029            state.serialize_field("provenance_tokenizer_sha256", value)?;
2030        }
2031        if let Some(value) = &self.provenance_locale_resolved {
2032            state.serialize_field("provenance_locale_resolved", value)?;
2033        }
2034        if let Some(value) = &self.provenance_locale_match_kind {
2035            state.serialize_field("provenance_locale_match_kind", value)?;
2036        }
2037        if let Some(value) = &self.provenance_canonical_class {
2038            state.serialize_field("provenance_canonical_class", value)?;
2039        }
2040        if let Some(value) = &self.provenance_native_class {
2041            state.serialize_field("provenance_native_class", value)?;
2042        }
2043        if let Some(value) = &self.provenance_confidence {
2044            state.serialize_field("provenance_confidence", value)?;
2045        }
2046        if let Some(value) = &self.provenance_merged_from {
2047            state.serialize_field("provenance_merged_from", value)?;
2048        }
2049        if let Some(dropped) = &self.backend_silently_dropped {
2050            state.serialize_field("backend_silently_dropped", dropped)?;
2051        }
2052        if let Some(value) = &self.restore_policy {
2053            state.serialize_field("restore_policy", value)?;
2054        }
2055        if let Some(value) = &self.restore_decision {
2056            state.serialize_field("restore_decision", value)?;
2057        }
2058        if let Some(value) = self.restore_unknown_token_count {
2059            state.serialize_field("restore_unknown_token_count", &value)?;
2060        }
2061        if let Some(value) = self.restore_manifest_bypass_count {
2062            state.serialize_field("restore_manifest_bypass_count", &value)?;
2063        }
2064        if let Some(value) = self.restore_fresh_pii_count {
2065            state.serialize_field("restore_fresh_pii_count", &value)?;
2066        }
2067        if let Some(value) = self.restore_phase_mask {
2068            state.serialize_field("restore_phase_mask", &value)?;
2069        }
2070        state.end()
2071    }
2072}
2073
/// Returns the snake_case audit label for `action`.
///
/// Used by the `Serialize` impl of `RedactionEntry` for the `action` field. The match
/// has no wildcard arm, so adding an `Action` variant fails to compile here until a
/// label is chosen for it.
fn redaction_action_as_str(action: Action) -> &'static str {
    match action {
        Action::Tokenize => "tokenize",
        Action::Redact => "redact",
        Action::FormatPreserve => "format_preserve",
        Action::Generalize => "generalize",
        Action::Preserve => "preserve",
    }
}
2083
/// Returns the audit label for `kind`: `"structured"` or `"text"`.
///
/// Used by the `Serialize` impl of `RedactionEntry` for the `document_kind` field.
fn redaction_document_kind_as_str(kind: DocumentKind) -> &'static str {
    match kind {
        DocumentKind::Structured => "structured",
        DocumentKind::Text => "text",
    }
}
2090
/// Returns the snake_case audit label for `tier`.
///
/// Used by the `Serialize` impl of `RedactionEntry` for the `decided_by` field. Each
/// label is the snake_case form of the variant name. The match has no wildcard arm,
/// so adding a `ConflictTier` variant fails to compile here until a label is chosen.
fn redaction_conflict_tier_as_str(tier: ConflictTier) -> &'static str {
    match tier {
        ConflictTier::None => "none",
        ConflictTier::ClassPriority => "class_priority",
        ConflictTier::RulePriority => "rule_priority",
        ConflictTier::Score => "score",
        ConflictTier::SpanLength => "span_length",
        ConflictTier::Validator => "validator",
        ConflictTier::ValidatorVeto => "validator_veto",
        ConflictTier::CollisionPolicy => "collision_policy",
        ConflictTier::AnchoredContext => "anchored_context",
        ConflictTier::RecognizerId => "recognizer_id",
        ConflictTier::Merged => "merged",
        ConflictTier::Redact => "redact",
        ConflictTier::Resolve => "resolve",
        ConflictTier::Fallback => "fallback",
    }
}
2109
2110impl RedactionEntry {
2111    /// Builds a metadata-only redaction log entry.
2112    #[allow(clippy::too_many_arguments)]
2113    pub fn new(
2114        source: impl Into<String>,
2115        class: PiiClass,
2116        action: Action,
2117        field_name: Option<String>,
2118        document_kind: DocumentKind,
2119        conflict_loser: bool,
2120        decided_by: ConflictTier,
2121        created_at: i64,
2122        session_id: Option<String>,
2123    ) -> Self {
2124        Self {
2125            source: source.into(),
2126            class,
2127            action,
2128            field_name,
2129            document_kind,
2130            conflict_loser,
2131            decided_by,
2132            created_at,
2133            session_id,
2134            recognizer_id: None,
2135            recognizer_version_id: None,
2136            validator_fail_reason: None,
2137            ambiguity_record: None,
2138            collision_family: None,
2139            collision_variant: None,
2140            fallback_triggered: None,
2141            provenance_stage: None,
2142            provenance_model_id: None,
2143            provenance_model_version: None,
2144            provenance_artifact_sha256: None,
2145            provenance_tokenizer_sha256: None,
2146            provenance_locale_resolved: None,
2147            provenance_locale_match_kind: None,
2148            provenance_canonical_class: None,
2149            provenance_native_class: None,
2150            provenance_confidence: None,
2151            provenance_merged_from: None,
2152            backend_silently_dropped: None,
2153            restore_policy: None,
2154            restore_decision: None,
2155            restore_unknown_token_count: None,
2156            restore_manifest_bypass_count: None,
2157            restore_fresh_pii_count: None,
2158            restore_phase_mask: None,
2159        }
2160    }
2161
2162    /// Attaches a validator failure reason to this metadata row.
2163    pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
2164        self.validator_fail_reason = Some(reason);
2165        self
2166    }
2167
2168    /// Attaches an ambiguity record to this metadata row.
2169    pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
2170        self.ambiguity_record = Some(record);
2171        self
2172    }
2173
2174    /// Attaches collision-family metadata to this row.
2175    pub fn with_collision_metadata(
2176        mut self,
2177        family: Option<String>,
2178        variant: Option<String>,
2179    ) -> Self {
2180        self.collision_family = family;
2181        self.collision_variant = variant;
2182        self
2183    }
2184
2185    /// Attaches safety-net fallback metadata to this row.
2186    pub fn with_fallback_triggered(mut self, reason: FallbackReason) -> Self {
2187        self.fallback_triggered = Some(reason);
2188        self
2189    }
2190
2191    /// Attaches locale-aware backend ids dropped by first-match-wins routing.
2192    pub fn with_backend_silently_dropped(mut self, dropped: Vec<String>) -> Self {
2193        self.backend_silently_dropped = Some(dropped);
2194        self
2195    }
2196
2197    pub fn with_restore_telemetry(mut self, telemetry: RestoreTelemetry) -> Self {
2198        self.restore_policy = Some(telemetry.restore_policy_str().to_string());
2199        self.restore_decision = Some(telemetry.restore_decision_str().to_string());
2200        self.restore_unknown_token_count = Some(telemetry.unknown_token_count);
2201        self.restore_manifest_bypass_count = Some(telemetry.manifest_bypass_count);
2202        self.restore_fresh_pii_count = Some(telemetry.fresh_pii_detected_count);
2203        self.restore_phase_mask = Some(telemetry.phase_execution_mask);
2204        self
2205    }
2206
2207    /// Attaches recognizer lineage metadata to this row.
2208    pub fn with_recognizer_metadata(
2209        mut self,
2210        recognizer_id: Option<String>,
2211        recognizer_version_id: Option<String>,
2212    ) -> Self {
2213        self.recognizer_id = recognizer_id;
2214        self.recognizer_version_id = recognizer_version_id;
2215        self
2216    }
2217
2218    #[allow(clippy::too_many_arguments)]
2219    pub fn with_provenance_metadata(
2220        mut self,
2221        stage: Option<String>,
2222        model_id: Option<String>,
2223        model_version: Option<String>,
2224        artifact_sha256: Option<String>,
2225        tokenizer_sha256: Option<String>,
2226        locale_resolved: Option<String>,
2227        locale_match_kind: Option<String>,
2228        canonical_class: Option<String>,
2229        native_class: Option<String>,
2230        confidence: Option<f64>,
2231        merged_from: Option<String>,
2232    ) -> Self {
2233        self.provenance_stage = stage;
2234        self.provenance_model_id = model_id;
2235        self.provenance_model_version = model_version;
2236        self.provenance_artifact_sha256 = artifact_sha256;
2237        self.provenance_tokenizer_sha256 = tokenizer_sha256;
2238        self.provenance_locale_resolved = locale_resolved;
2239        self.provenance_locale_match_kind = locale_match_kind;
2240        self.provenance_canonical_class = canonical_class;
2241        self.provenance_native_class = native_class;
2242        self.provenance_confidence = confidence.map(|value| value.to_string());
2243        self.provenance_merged_from = merged_from;
2244        self
2245    }
2246}
2247
/// Closed error set for redaction log sinks.
///
/// Each variant wraps a message string that is appended to the `#[error(...)]`
/// prefix in the `Display` output. The enum is `#[non_exhaustive]`, so matches
/// outside the defining crate need a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// SQLite-backed redaction log sink failed.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Non-SQLite redaction log sink failed.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
2259
/// Trait for audit sinks that receive redaction metadata.
///
/// Implement this for custom audit backends (remote telemetry, structured JSON logs).
/// For SQLite-backed persistence, use `gaze_audit::SqliteLogger`.
///
/// Implementors must be `Send + Sync`, and `log` takes `&self`, so any mutable state
/// needs interior mutability (the example below uses an atomic counter).
///
/// # Contract
///
/// The logger receives **metadata only**: class, action, session ID, timestamp, and
/// other bytes-free audit labels. It never receives the original PII value or the token
/// value. A custom impl that augments entries with raw document text violates the audit
/// isolation contract and will be flagged by the `gaze_module_isolation` Dylint lint
/// when it lives in the wrong crate.
///
/// # Example
///
/// ```rust
/// use std::sync::atomic::{AtomicUsize, Ordering};
/// use gaze_types::{RedactionEntry, RedactionLogError, RedactionLogger};
///
/// #[derive(Default)]
/// struct CountLogger(AtomicUsize);
///
/// impl RedactionLogger for CountLogger {
///     fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
///         self.0.fetch_add(1, Ordering::Relaxed);
///         Ok(())
///     }
/// }
/// ```
pub trait RedactionLogger: Send + Sync {
    /// Records a metadata-only redaction entry.
    ///
    /// # Errors
    ///
    /// Implementations return a [`RedactionLogError`] to report that the sink could
    /// not record the entry.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
2293
/// Rulepack recognizer activation tier.
///
/// The default is `SafeDefault`. The TOML spellings handled by `SafetyTier::parse`
/// and `SafetyTier::as_str` are `"safe_default"`, `"locale_gated"`, and `"opt_in"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum SafetyTier {
    /// Activates whenever the recognizer's locale projection intersects the active locale chain.
    #[default]
    SafeDefault,
    /// Activates only when an explicit locale or compatibility alias enables locale-shaped rules.
    LocaleGated,
    /// Activates only through adopter opt-in surfaces such as policy-defined custom recognizers.
    OptIn,
}
2306
/// Safety-tier parsing error.
///
/// Produced by `SafetyTier::parse` for any string that is not one of the supported
/// tier names.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct SafetyTierParseError {
    /// The rejected input, kept verbatim.
    value: String,
}
2313
2314impl SafetyTier {
2315    /// Parses the TOML `safety_tier` string.
2316    pub fn parse(value: &str) -> Result<Self, SafetyTierParseError> {
2317        match value {
2318            "safe_default" => Ok(Self::SafeDefault),
2319            "locale_gated" => Ok(Self::LocaleGated),
2320            "opt_in" => Ok(Self::OptIn),
2321            other => Err(SafetyTierParseError {
2322                value: other.to_string(),
2323            }),
2324        }
2325    }
2326
2327    /// Returns the TOML string for this tier.
2328    pub fn as_str(self) -> &'static str {
2329        match self {
2330            Self::SafeDefault => "safe_default",
2331            Self::LocaleGated => "locale_gated",
2332            Self::OptIn => "opt_in",
2333        }
2334    }
2335}
2336
impl SafetyTierParseError {
    /// Returns the rejected safety-tier string.
    pub fn value(&self) -> &str {
        &self.value
    }
}

/// Formats the error as `unsupported safety_tier '<value>'`.
impl fmt::Display for SafetyTierParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "unsupported safety_tier '{}'", self.value)
    }
}

// No source error is wrapped, so the default `Error` methods are sufficient.
impl std::error::Error for SafetyTierParseError {}
2351
/// Locale tag recognized by policy and recognizers.
///
/// The named variants have fixed canonical strings (see `LocaleTag::as_str`);
/// `Other` carries any remaining tag accepted by `LocaleTag::parse`. The enum is
/// `#[non_exhaustive]`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-independent recognizer or policy (`"global"`; `"*"` is also parsed).
    Global,
    /// German as used in Germany (`"de-DE"`).
    DeDe,
    /// German as used in Austria (`"de-AT"`).
    DeAt,
    /// German as used in Switzerland (`"de-CH"`).
    DeCh,
    /// English as used in the United States (`"en-US"`).
    EnUs,
    /// English as used in Great Britain (`"en-GB"`).
    EnGb,
    /// English as used in Ireland (`"en-IE"`).
    EnIe,
    /// English as used in Australia (`"en-AU"`).
    EnAu,
    /// English as used in Canada (`"en-CA"`).
    EnCa,
    /// Any other canonical BCP-47-like tag.
    Other(String),
}
2377
/// Locale parsing error.
///
/// Returned by `LocaleTag::parse` and `LocaleChain::from_cli`. The enum is
/// `#[non_exhaustive]`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// Locale tag is unsupported or invalid.
    Unsupported,
}

/// Formats the error as a short lowercase message (`unsupported locale`).
impl fmt::Display for LocaleError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            LocaleError::Unsupported => f.write_str("unsupported locale"),
        }
    }
}

// No source error is wrapped, so the default `Error` methods are sufficient.
impl std::error::Error for LocaleError {}
2395
/// Ordered locale fallback chain.
///
/// Wraps the tags in preference order. `LocaleChain::from_tags` and
/// `From<&[LocaleTag]>` both pass the tags through `ensure_global` before wrapping
/// them, which is documented as appending the global fallback when absent.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
2399
2400impl LocaleTag {
2401    /// Global locale constant.
2402    pub const GLOBAL: LocaleTag = LocaleTag::Global;
2403
2404    /// Parses a locale tag from policy or CLI input.
2405    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
2406        let raw = s.trim().replace('_', "-");
2407        let normalized = raw.to_ascii_lowercase();
2408        match normalized.as_str() {
2409            "global" | "*" => Ok(LocaleTag::Global),
2410            "de-de" => Ok(LocaleTag::DeDe),
2411            "de-at" => Ok(LocaleTag::DeAt),
2412            "de-ch" => Ok(LocaleTag::DeCh),
2413            "en-us" => Ok(LocaleTag::EnUs),
2414            "en-gb" => Ok(LocaleTag::EnGb),
2415            "en-ie" => Ok(LocaleTag::EnIe),
2416            "en-au" => Ok(LocaleTag::EnAu),
2417            "en-ca" => Ok(LocaleTag::EnCa),
2418            "" => Err(LocaleError::Unsupported),
2419            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
2420            _ => Err(LocaleError::Unsupported),
2421        }
2422    }
2423
2424    /// Returns the canonical string form of the locale tag.
2425    pub fn as_str(&self) -> &str {
2426        match self {
2427            LocaleTag::Global => "global",
2428            LocaleTag::DeDe => "de-DE",
2429            LocaleTag::DeAt => "de-AT",
2430            LocaleTag::DeCh => "de-CH",
2431            LocaleTag::EnUs => "en-US",
2432            LocaleTag::EnGb => "en-GB",
2433            LocaleTag::EnIe => "en-IE",
2434            LocaleTag::EnAu => "en-AU",
2435            LocaleTag::EnCa => "en-CA",
2436            LocaleTag::Other(tag) => tag.as_str(),
2437        }
2438    }
2439}
2440
2441impl LocaleChain {
2442    /// Builds a locale chain and appends global fallback when absent.
2443    pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
2444        ensure_global(&mut tags);
2445        LocaleChain(tags)
2446    }
2447
2448    /// Parses a comma-separated CLI locale chain.
2449    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
2450        let tags = raw
2451            .split(',')
2452            .map(LocaleTag::parse)
2453            .collect::<Result<Vec<_>, _>>()?;
2454        Ok(LocaleChain::from_tags(tags))
2455    }
2456
2457    /// Merges policy and CLI locale preferences.
2458    pub fn merge_policy_and_cli(
2459        policy: Option<&[LocaleTag]>,
2460        cli: Option<&[LocaleTag]>,
2461    ) -> LocaleChain {
2462        Self::merge_cli_policy_rulepack_default(cli, policy, None)
2463    }
2464
2465    /// Merges CLI, policy, rulepack, and default locale preferences.
2466    pub fn merge_cli_policy_rulepack_default(
2467        cli: Option<&[LocaleTag]>,
2468        policy: Option<&[LocaleTag]>,
2469        rulepack_defaults: Option<&[LocaleTag]>,
2470    ) -> LocaleChain {
2471        let tags = cli
2472            .filter(|tags| !tags.is_empty())
2473            .or_else(|| policy.filter(|tags| !tags.is_empty()))
2474            .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
2475            .map(|tags| tags.to_vec())
2476            .unwrap_or_else(|| vec![LocaleTag::Global]);
2477        LocaleChain::from_tags(tags)
2478    }
2479
2480    /// Returns true when a recognizer can run under this locale chain.
2481    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
2482        if recognizer_locales.is_empty() {
2483            return true;
2484        }
2485        recognizer_locales.iter().any(|recognizer_locale| {
2486            *recognizer_locale == LocaleTag::Global
2487                || self.0.iter().any(|active| active == recognizer_locale)
2488        })
2489    }
2490
2491    /// Returns the locale tags in chain order.
2492    pub fn as_slice(&self) -> &[LocaleTag] {
2493        &self.0
2494    }
2495
2496    /// Returns the locale chain as canonical strings.
2497    pub fn to_strings(&self) -> Vec<String> {
2498        self.0.iter().map(ToString::to_string).collect()
2499    }
2500}
2501
2502impl From<&[LocaleTag]> for LocaleChain {
2503    fn from(tags: &[LocaleTag]) -> Self {
2504        let mut owned = tags.to_vec();
2505        ensure_global(&mut owned);
2506        LocaleChain(owned)
2507    }
2508}
2509
/// Displays the tag as its canonical string (the value of `LocaleTag::as_str`).
impl fmt::Display for LocaleTag {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `write_str` emits the text as-is, without applying width or padding flags.
        f.write_str(self.as_str())
    }
}
2515
/// The input document submitted for pseudonymization.
///
/// `RawDocument::Text(String)` for plain or semi-structured text (most LLM workflows).
/// `RawDocument::Structured(BTreeMap<String, Value>)` for JSON-shaped data where
/// column-aware rules apply -- `ColumnRule`s only take effect on structured input.
///
/// `Detection::span` and recognizer candidate spans use **byte** ranges, not char indices.
///
/// `RawDocument` is `#[non_exhaustive]`. Match with a wildcard arm.
///
/// Unlike [`CleanDocument`], this type does not derive `Serialize`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Structured document values, keyed by field name.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
2533
/// The pseudonymized output from `Pipeline::redact`.
///
/// Mirrors the shape of `RawDocument`: `CleanDocument::Text(String)` or
/// `CleanDocument::Structured(BTreeMap<String, Value>)`. Destructure with a `let`-else
/// or `match`; **there is no `.text()` accessor**.
///
/// ```rust
/// use gaze_types::CleanDocument;
///
/// fn unwrap_text(doc: CleanDocument) -> Option<String> {
///     if let CleanDocument::Text(t) = doc { Some(t) } else { None }
/// }
/// ```
///
/// Contains only tokens or redacted placeholders -- no original PII values.
/// Send this (or its inner string) to the LLM; never send the original `RawDocument`.
///
/// Serialization is `#[serde(untagged)]`: the inner map or string is written directly,
/// without a variant wrapper.
///
/// `CleanDocument` is `#[non_exhaustive]`.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Structured document values, keyed by field name.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
2561
2562/// Minimal structured value representation that avoids a serde_json dependency.
2563#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
2564#[serde(untagged)]
2565#[non_exhaustive]
2566pub enum Value {
2567    /// Null value.
2568    Null,
2569    /// Boolean value.
2570    Bool(bool),
2571    /// String value.
2572    String(String),
2573    /// Signed 64-bit integer value.
2574    I64(i64),
2575    /// Array value.
2576    Array(Vec<Value>),
2577    /// Object value.
2578    Object(BTreeMap<String, Value>),
2579}
2580
2581impl Value {
2582    /// Returns the inner string for string values.
2583    pub fn as_str(&self) -> Option<&str> {
2584        match self {
2585            Self::String(value) => Some(value.as_str()),
2586            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
2587        }
2588    }
2589
2590    /// Returns a scalar string representation used for structured safety-net checks.
2591    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
2592        match self {
2593            Self::String(value) if !value.is_empty() => Some(value.clone()),
2594            Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
2595            Self::Bool(value) => Some(value.to_string()),
2596            Self::I64(value) => Some(value.to_string()),
2597        }
2598    }
2599}
2600
2601impl PartialEq<&str> for Value {
2602    fn eq(&self, other: &&str) -> bool {
2603        self.as_str() == Some(*other)
2604    }
2605}
2606
2607/// Value-only dictionary bundle shared with recognizers.
2608#[derive(Debug, Clone, Default)]
2609pub struct DictionaryBundle {
2610    entries: HashMap<String, DictionaryEntry>,
2611}
2612
2613/// Value-only dictionary entry; compiled automatons live outside `gaze-types`.
2614#[derive(Debug, Clone)]
2615pub struct DictionaryEntry {
2616    terms: Vec<String>,
2617    case_sensitive: bool,
2618    source: DictionarySource,
2619}
2620
/// Origin of a dictionary entry.
///
/// `DictionarySource` is `#[non_exhaustive]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// The dictionary was supplied by request context.
    Cli,
    /// The dictionary was supplied by a rulepack.
    Rulepack,
}
2630
2631/// Dictionary metadata used for diagnostics and tests.
2632#[derive(Debug, Clone, PartialEq, Eq)]
2633#[non_exhaustive]
2634pub struct DictionaryStats {
2635    /// Dictionary name.
2636    pub name: String,
2637    /// Number of configured terms.
2638    pub term_count: usize,
2639    /// Dictionary source.
2640    pub source: DictionarySource,
2641}
2642
2643impl DictionaryStats {
2644    /// Builds dictionary diagnostics metadata.
2645    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
2646        Self {
2647            name: name.into(),
2648            term_count,
2649            source,
2650        }
2651    }
2652}
2653
/// A dictionary as declared by a rulepack.
///
/// `RulepackDict` is `#[non_exhaustive]`; build it with [`RulepackDict::new`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Name of the dictionary.
    pub name: String,
    /// Terms the dictionary contains.
    pub terms: Vec<String>,
    /// Whether matching is case-sensitive.
    pub case_sensitive: bool,
}

impl RulepackDict {
    /// Creates a rulepack dictionary declaration.
    ///
    /// No validation happens here; `name` accepts anything convertible into a `String`.
    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
        let name = name.into();
        Self {
            name,
            terms,
            case_sensitive,
        }
    }
}
2676
/// Reasons a dictionary entry can be rejected at construction time.
///
/// `DictionaryLoadError` is `#[non_exhaustive]`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary has no terms.
    Empty {
        /// Name of the rejected dictionary.
        name: String,
    },
    /// ASCII-only case-insensitive matching cannot safely cover this entry.
    UnicodeInsensitiveUnsupported {
        /// Name of the rejected dictionary.
        name: String,
    },
}

/// Human-readable messages; each one names the offending dictionary.
impl fmt::Display for DictionaryLoadError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The binding must stay `name`: the format strings capture it inline.
        match self {
            Self::Empty { name } => {
                write!(f, "dictionary '{name}' has no terms")
            }
            Self::UnicodeInsensitiveUnsupported { name } => {
                write!(
                    f,
                    "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
                )
            }
        }
    }
}

/// Marks the type as a standard error; no underlying source is exposed.
impl std::error::Error for DictionaryLoadError {}
2700
2701impl DictionaryBundle {
2702    /// Builds a bundle from rulepack dictionaries.
2703    pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
2704        let mut entries = HashMap::with_capacity(terms.len());
2705        for dictionary in terms {
2706            let entry = DictionaryEntry::new(
2707                &dictionary.name,
2708                dictionary.terms.clone(),
2709                dictionary.case_sensitive,
2710                DictionarySource::Rulepack,
2711            )
2712            .expect("Policy validates dictionary terms before bundle construction");
2713            entries.insert(dictionary.name.clone(), entry);
2714        }
2715        Self { entries }
2716    }
2717
2718    /// Builds a bundle from pre-built dictionary entries.
2719    pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
2720        Self {
2721            entries: entries.into_iter().collect(),
2722        }
2723    }
2724
2725    /// Merges two bundles, preferring entries from the second bundle on name conflicts.
2726    pub fn merge(a: Self, b: Self) -> Self {
2727        let mut entries = a.entries;
2728        entries.extend(b.entries);
2729        Self { entries }
2730    }
2731
2732    /// Returns a dictionary by name.
2733    pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
2734        self.entries.get(name)
2735    }
2736
2737    /// Returns sorted dictionary stats.
2738    pub fn stats(&self) -> Vec<DictionaryStats> {
2739        let mut stats = self
2740            .entries
2741            .iter()
2742            .map(|(name, entry)| DictionaryStats {
2743                name: name.clone(),
2744                term_count: entry.terms.len(),
2745                source: entry.source,
2746            })
2747            .collect::<Vec<_>>();
2748        stats.sort_by(|a, b| a.name.cmp(&b.name));
2749        stats
2750    }
2751}
2752
2753impl DictionaryEntry {
2754    /// Creates a validated value-only dictionary entry.
2755    pub fn new(
2756        name: &str,
2757        terms: Vec<String>,
2758        case_sensitive: bool,
2759        source: DictionarySource,
2760    ) -> Result<Self, DictionaryLoadError> {
2761        if terms.is_empty() {
2762            return Err(DictionaryLoadError::Empty {
2763                name: name.to_string(),
2764            });
2765        }
2766        if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
2767            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
2768                name: name.to_string(),
2769            });
2770        }
2771        Ok(Self {
2772            terms,
2773            case_sensitive,
2774            source,
2775        })
2776    }
2777
2778    /// Returns whether matching is case-sensitive.
2779    pub fn case_sensitive(&self) -> bool {
2780        self.case_sensitive
2781    }
2782
2783    /// Returns configured dictionary terms.
2784    pub fn terms(&self) -> &[String] {
2785        &self.terms
2786    }
2787}
2788
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    /// An empty term list is rejected, and the error names the dictionary.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let no_terms: Vec<String> = Vec::new();

        let failure = DictionaryEntry::new("empty", no_terms, true, DictionarySource::Cli)
            .expect_err("empty dictionaries must fail closed");

        let expected = DictionaryLoadError::Empty {
            name: "empty".to_string(),
        };
        assert_eq!(failure, expected);
    }

    /// Case-insensitive matching is refused as soon as one term is non-ASCII.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let mixed_terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];

        let failure = DictionaryEntry::new("songs", mixed_terms, false, DictionarySource::Cli)
            .expect_err("unicode case-insensitive dictionaries must fail closed");

        let expected = DictionaryLoadError::UnicodeInsensitiveUnsupported {
            name: "songs".to_string(),
        };
        assert_eq!(failure, expected);
    }
}
2817
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    /// Logger stub that accepts every entry and records nothing.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    /// Compiles only when `T` is both `Send` and `Sync`.
    fn assert_send_sync<T>()
    where
        T: Send + Sync + ?Sized,
    {
    }

    /// Pins the exact `Display` text of the log error variants.
    #[test]
    fn redaction_log_error_display_is_stable() {
        let expectations = [
            (
                RedactionLogError::Sqlite("write failed".to_string()),
                "sqlite redaction log error: write failed",
            ),
            (
                RedactionLogError::Backend("sink failed".to_string()),
                "backend redaction log error: sink failed",
            ),
        ];

        for (error, expected) in expectations {
            assert_eq!(error.to_string(), expected);
        }
    }

    /// The logger trait object must be shareable across threads.
    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    /// A locally defined type can implement the trait and be used through `dyn`.
    #[test]
    fn local_logger_can_implement_redaction_logger() {
        // Spelled out field by field so the test breaks when the entry shape changes.
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            recognizer_id: None,
            recognizer_version_id: None,
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
            fallback_triggered: None,
            provenance_stage: None,
            provenance_model_id: None,
            provenance_model_version: None,
            provenance_artifact_sha256: None,
            provenance_tokenizer_sha256: None,
            provenance_locale_resolved: None,
            provenance_locale_match_kind: None,
            provenance_canonical_class: None,
            provenance_native_class: None,
            provenance_confidence: None,
            provenance_merged_from: None,
            backend_silently_dropped: None,
            restore_policy: None,
            restore_decision: None,
            restore_unknown_token_count: None,
            restore_manifest_bypass_count: None,
            restore_fresh_pii_count: None,
            restore_phase_mask: None,
        };
        let logger = CapturingLogger;

        let sink: &dyn RedactionLogger = &logger;
        sink.log(&entry).expect("log entry");
    }

    /// Without recognizer lineage, the lineage keys are absent from the JSON.
    #[test]
    fn redaction_entry_json_shape_omits_absent_recognizer_lineage() {
        let expected = r#"{"source":"email.global","class":"email","action":"tokenize","field_name":null,"document_kind":"text","conflict_loser":false,"decided_by":"none","created_at":0,"session_id":null,"validator_fail_reason":null,"ambiguity_record":null,"collision_family":null,"collision_variant":null,"fallback_triggered":null}"#;

        let entry = RedactionEntry::new(
            "email.global",
            PiiClass::Email,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        );
        let rendered = serde_json::to_string(&entry).expect("serialize redaction entry");

        assert_eq!(rendered, expected);
    }

    /// With recognizer lineage attached, both lineage keys appear in the JSON.
    #[test]
    fn redaction_entry_json_shape_includes_recognizer_lineage_when_present() {
        let recognizer_id = Some("ner".to_string());
        let recognizer_version_id = Some("ner.davlan-mbert.v1".to_string());
        let entry = RedactionEntry::new(
            "ner/ort",
            PiiClass::Name,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        )
        .with_recognizer_metadata(recognizer_id, recognizer_version_id);

        let value: serde_json::Value =
            serde_json::to_value(&entry).expect("serialize redaction entry");

        let expected_lineage = [
            ("recognizer_id", "ner"),
            ("recognizer_version_id", "ner.davlan-mbert.v1"),
        ];
        for (key, expected) in expected_lineage {
            assert_eq!(value[key], expected);
        }
    }

    /// Attaching a version id leaves the plain recognizer id untouched.
    #[test]
    fn candidate_keeps_versioned_and_unversioned_recognizer_ids() {
        let unversioned = Candidate::new(
            0..5,
            PiiClass::Email,
            "email.global",
            0.9,
            10,
            None,
            "email",
            "email.global",
            ConflictTier::None,
            Vec::new(),
        );
        assert_eq!(unversioned.recognizer_id, "email.global");
        assert!(unversioned.recognizer_version_id.is_none());

        let versioned = unversioned
            .clone()
            .with_recognizer_version_id("email.global.v1");
        assert_eq!(versioned.recognizer_id, "email.global");
        let attached_version = versioned.recognizer_version_id.as_deref();
        assert_eq!(attached_version, Some("email.global.v1"));
    }
}
2967
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    /// Builds an emitted token span whose clean and raw ranges are both `start..end`.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        let range = start..end;
        EmittedTokenSpan {
            clean_span: range.clone(),
            raw_span: range,
            class,
        }
    }

    /// Runs `Manifest::diff_against` for one suspect range and class.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        let verdict = diff(manifest, 0..8, PiiClass::Email);

        assert_eq!(verdict, None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        let verdict = diff(manifest, 0..10, PiiClass::Email);

        assert_eq!(verdict, Some(LeakKind::Uncovered));
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let tokens = vec![span(0, 5, PiiClass::Email), span(10, 15, PiiClass::Email)];
        let manifest = Manifest::from_spans(tokens);

        let verdict = diff(manifest, 0..15, PiiClass::Email);

        assert_eq!(verdict, Some(LeakKind::PartialBleed { uncovered: 5..10 }));
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let tokens = vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ];
        let manifest = Manifest::from_spans(tokens);

        let verdict = diff(manifest, 0..12, PiiClass::Email);

        // Reporting only the first gap is intentional for v0.6.1; enumerating
        // every gap is deferred until the report format can carry it.
        assert_eq!(verdict, Some(LeakKind::PartialBleed { uncovered: 3..5 }));
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let tokens = vec![span(0, 4, PiiClass::Name), span(4, 8, PiiClass::Location)];
        let manifest = Manifest::from_spans(tokens);

        let verdict = diff(manifest, 0..8, PiiClass::Email);

        let expected = LeakKind::ClassMismatch {
            pipeline_class: PiiClass::Name,
            safety_net_class: PiiClass::Email,
        };
        assert_eq!(verdict, Some(expected));
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let tokens = vec![span(0, 5, PiiClass::Email), span(5, 10, PiiClass::Email)];
        let manifest = Manifest::from_spans(tokens);

        let verdict = diff(manifest, 0..10, PiiClass::Email);

        assert_eq!(verdict, None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let single_token = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        // Suspect starts before the token.
        let leading = diff(single_token.clone(), 0..8, PiiClass::Email);
        assert_eq!(leading, Some(LeakKind::PartialBleed { uncovered: 0..3 }));

        // Suspect runs past the token.
        let trailing = diff(single_token.clone(), 3..10, PiiClass::Email);
        assert_eq!(trailing, Some(LeakKind::PartialBleed { uncovered: 8..10 }));

        // Suspect spans a hole between two tokens.
        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        let middle = diff(with_gap, 0..10, PiiClass::Email);
        assert_eq!(middle, Some(LeakKind::PartialBleed { uncovered: 3..6 }));
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let token_end = text.len();
        let manifest = Manifest::from_spans(vec![span(token_start, token_end, PiiClass::Email)]);

        let verdict = diff(manifest, token_start..token_end, PiiClass::Email);

        assert_eq!(verdict, None);
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        let verdict = diff(manifest, 3..3, PiiClass::Email);

        assert_eq!(verdict, None);
    }

    // NOTE(review): none of the payloads below contain the probe string, so this
    // only fails if a `Display` impl introduces it; consider a stronger check.
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let errors = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            },
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            },
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            },
            SafetyNetError::ModelIntegrityMismatch {
                expected: "e3b0c44298fc1c149afbf4c8996fb924".to_string(),
                actual: "4e07408562bedb8b60ce05c1decfe3ad".to_string(),
            },
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            },
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            },
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            },
        ];

        for error in errors {
            let rendered = error.to_string();
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
3139
3140/// Shared recognizer contract for locale-aware PII candidates.
3141pub trait Recognizer: Send + Sync {
3142    /// Stable recognizer identifier.
3143    fn id(&self) -> &str;
3144    /// PII class supported by this recognizer.
3145    fn supported_class(&self) -> &PiiClass;
3146    /// Detects PII candidates in the supplied input and context.
3147    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
3148    /// Fallible detection entrypoint for recognizers backed by runtime systems.
3149    fn try_detect(
3150        &self,
3151        input: &str,
3152        ctx: &DetectContext<'_>,
3153    ) -> Result<Vec<Candidate>, RecognizerRuntimeError> {
3154        Ok(self.detect(input, ctx))
3155    }
3156    /// Token family used for candidate token emission.
3157    fn token_family(&self) -> &str;
3158    /// Optional validator kind used by pre-resolver validator-veto.
3159    fn validator_kind(&self) -> Option<ValidatorKind> {
3160        None
3161    }
3162    /// Locales where this recognizer is active.
3163    fn locales(&self) -> &[LocaleTag] {
3164        &[LocaleTag::Global]
3165    }
3166}
3167
3168/// Candidate PII span emitted by a recognizer before final conflict resolution.
3169#[derive(Debug, Clone, PartialEq)]
3170#[non_exhaustive]
3171pub struct Candidate {
3172    /// Byte span in the original input.
3173    pub span: Range<usize>,
3174    /// PII class assigned to the span.
3175    pub class: PiiClass,
3176    /// Recognizer identifier.
3177    pub recognizer_id: String,
3178    /// Optional versioned recognizer identifier for audit lineage.
3179    pub recognizer_version_id: Option<String>,
3180    /// Recognizer confidence score.
3181    pub score: f32,
3182    /// Rule or recognizer priority.
3183    pub priority: i32,
3184    /// Optional canonical representation for validation/merge logic.
3185    pub canonical_form: Option<String>,
3186    /// Token family used for output token shape.
3187    pub token_family: String,
3188    /// Candidate source label.
3189    pub source: String,
3190    /// Conflict tier that decided this candidate.
3191    pub decided_by: ConflictTier,
3192    /// Sources merged into this candidate.
3193    pub merged_sources: Vec<String>,
3194}
3195
3196impl Candidate {
3197    /// Builds a recognizer candidate.
3198    #[allow(clippy::too_many_arguments)]
3199    pub fn new(
3200        span: Range<usize>,
3201        class: PiiClass,
3202        recognizer_id: impl Into<String>,
3203        score: f32,
3204        priority: i32,
3205        canonical_form: Option<String>,
3206        token_family: impl Into<String>,
3207        source: impl Into<String>,
3208        decided_by: ConflictTier,
3209        merged_sources: Vec<String>,
3210    ) -> Self {
3211        Self {
3212            span,
3213            class,
3214            recognizer_id: recognizer_id.into(),
3215            recognizer_version_id: None,
3216            score,
3217            priority,
3218            canonical_form,
3219            token_family: token_family.into(),
3220            source: source.into(),
3221            decided_by,
3222            merged_sources,
3223        }
3224    }
3225
3226    /// Returns this candidate with a translated span.
3227    pub fn with_span(mut self, span: Range<usize>) -> Self {
3228        self.span = span;
3229        self
3230    }
3231
3232    /// Returns this candidate with versioned recognizer lineage attached.
3233    pub fn with_recognizer_version_id(mut self, recognizer_version_id: impl Into<String>) -> Self {
3234        self.recognizer_version_id = Some(recognizer_version_id.into());
3235        self
3236    }
3237}
3238
3239/// Context supplied to recognizers during detection.
3240#[non_exhaustive]
3241pub struct DetectContext<'a> {
3242    /// Active locale chain.
3243    pub locale_chain: &'a [LocaleTag],
3244    /// Active dictionary bundle.
3245    pub dictionaries: &'a DictionaryBundle,
3246    /// Reserved field-aware matching slot; intentionally unit in v0.5 Phase B.
3247    pub fields: &'a (),
3248    /// Whether a recognizer degraded due to unavailable optional capability.
3249    pub degraded: Cell<bool>,
3250}
3251
3252impl<'a> DetectContext<'a> {
3253    /// Builds detection context for a recognizer pass.
3254    pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
3255        Self {
3256            locale_chain,
3257            dictionaries,
3258            fields: &(),
3259            degraded: Cell::new(false),
3260        }
3261    }
3262}
3263
3264fn ensure_global(tags: &mut Vec<LocaleTag>) {
3265    if !tags.contains(&LocaleTag::Global) {
3266        tags.push(LocaleTag::Global);
3267    }
3268}
3269
/// Loose structural check that `raw` looks like a `-`-separated language tag.
///
/// The first subtag must be 2 to 8 ASCII letters; each following subtag must be
/// 2 to 8 ASCII letters or digits. Nothing is checked against a subtag registry.
///
/// NOTE(review): the 2..=8 bound rejects single-character subtags, which BCP 47
/// uses for extension and private-use singletons (`u`, `x`) -- confirm intended.
fn is_bcp47_parseable(raw: &str) -> bool {
    /// Every subtag, including the first, must be 2 to 8 bytes long.
    fn length_ok(subtag: &str) -> bool {
        (2..=8).contains(&subtag.len())
    }

    let mut subtags = raw.split('-');
    match subtags.next() {
        Some(language)
            if length_ok(language) && language.bytes().all(|byte| byte.is_ascii_alphabetic()) =>
        {
            subtags.all(|subtag| {
                length_ok(subtag) && subtag.bytes().all(|byte| byte.is_ascii_alphanumeric())
            })
        }
        _ => false,
    }
}
3282
/// Normalizes the ASCII letter case of a `-`-separated locale tag.
///
/// The first subtag is lowercased. A later subtag made of exactly two ASCII
/// letters is uppercased (region style); every other subtag is lowercased.
/// Separators and non-ASCII characters pass through unchanged.
///
/// NOTE(review): four-letter script subtags come out lowercase (`hant`), not
/// title-cased (`Hant`) as RFC 5646 recommends -- confirm intended.
fn canonical_other(raw: &str) -> String {
    // ASCII case conversion never changes byte length, so this capacity is exact.
    let mut canonical = String::with_capacity(raw.len());
    for (position, subtag) in raw.split('-').enumerate() {
        let is_first = position == 0;
        if !is_first {
            canonical.push('-');
        }
        let region_like =
            !is_first && subtag.len() == 2 && subtag.bytes().all(|byte| byte.is_ascii_alphabetic());
        let cased = if region_like {
            subtag.to_ascii_uppercase()
        } else {
            subtag.to_ascii_lowercase()
        };
        canonical.push_str(&cased);
    }
    canonical
}