// gaze_types/lib.rs
#![cfg_attr(docsrs, feature(doc_cfg))]

use std::cell::Cell;
use std::collections::{BTreeMap, HashMap};
use std::fmt;
use std::ops::Range;

use serde::{Deserialize, Serialize};
use thiserror::Error;

/// Shared detector contract for text-only PII detection.
///
/// Implementations must be `Send + Sync` so a single detector instance can be
/// shared across pipeline threads.
pub trait Detector: Send + Sync {
    /// Detect PII spans in the supplied input string.
    ///
    /// Returned [`Detection`] spans are **byte** ranges into `input`, not
    /// char indices.
    fn detect(&self, input: &str) -> Vec<Detection>;
}

/// The category of a detected PII span.
///
/// Built-in variants: `Email`, `Name`, `Location`, `Organization`. Tenant-specific PII
/// (case references, titles, internal codes) is carried as `PiiClass::Custom(String)`.
/// **There is no `Phone` variant** -- phone detection is provided by recognizers in
/// `gaze-recognizers` and surfaces as either a `Custom("phone")` class or a class
/// defined by a rulepack.
///
/// `PiiClass` is exhaustive. Match every variant explicitly so new built-in classes
/// force call sites to review their handling at compile time:
///
/// ```rust
/// use gaze_types::PiiClass;
///
/// fn label(class: &PiiClass) -> &'static str {
///     match class {
///         PiiClass::Email        => "email",
///         PiiClass::Name         => "name",
///         PiiClass::Location     => "location",
///         PiiClass::Organization => "org",
///         PiiClass::Custom(_)    => "pii",
///     }
/// }
/// ```
///
/// Policy TOML uses the lowercase forms `email` / `name` / `location` / `organization`,
/// and tenant classes are spelled like `custom:case_ref` (lowercase, snake_case).
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    /// Email address class.
    Email,
    /// Person name class.
    Name,
    /// Location class.
    Location,
    /// Organization class.
    Organization,
    /// Tenant- or policy-defined class. Normally built via [`PiiClass::custom`],
    /// which normalizes the name to lowercase snake_case.
    Custom(String),
}

/// Built-in class labels in stable display order.
///
/// Index positions match the declaration order of the built-in [`PiiClass`]
/// variants; `PiiClass::class_name` indexes into this slice by position.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];

61impl PiiClass {
62    /// Parses a policy class name into the shared class vocabulary.
63    pub fn from_policy_name(input: &str) -> Option<Self> {
64        match input {
65            "email" => Some(Self::Email),
66            "name" => Some(Self::Name),
67            "location" => Some(Self::Location),
68            "organization" => Some(Self::Organization),
69            custom if custom.starts_with("custom:") => {
70                let name = custom.trim_start_matches("custom:");
71                (!name.trim().is_empty()).then(|| Self::custom(name))
72            }
73            _ => None,
74        }
75    }
76
77    /// Returns the built-in class variants.
78    pub fn builtin_variants() -> &'static [PiiClass] {
79        &[
80            PiiClass::Email,
81            PiiClass::Name,
82            PiiClass::Location,
83            PiiClass::Organization,
84        ]
85    }
86
87    /// Builds a normalized custom class name.
88    pub fn custom(name: &str) -> Self {
89        let mut normalized = String::new();
90        let mut pending_underscore = false;
91        for ch in name.trim().chars() {
92            if ch.is_ascii_alphanumeric() {
93                if pending_underscore && !normalized.is_empty() {
94                    normalized.push('_');
95                }
96                normalized.push(ch.to_ascii_lowercase());
97                pending_underscore = false;
98            } else {
99                pending_underscore = true;
100            }
101        }
102
103        Self::Custom(normalized)
104    }
105
106    /// Returns the normalized custom class name for custom classes.
107    pub fn as_custom_name(&self) -> Option<&str> {
108        match self {
109            Self::Custom(name) => Some(name.as_str()),
110            Self::Email | Self::Name | Self::Location | Self::Organization => None,
111        }
112    }
113
114    /// Returns the audit/token display label for this class.
115    pub fn class_name(&self) -> String {
116        match self {
117            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
118            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
119            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
120            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
121            Self::Custom(name) => format!("Custom:{name}"),
122        }
123    }
124}
125
/// A detected span and its class/source metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
    /// Byte span in the original input (not a char-index range).
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Detector source identifier (e.g. the recognizer id that produced it).
    pub source: String,
}

138impl Detection {
139    /// Builds a detected PII span.
140    pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
141        Self {
142            span,
143            class,
144            source: source.into(),
145        }
146    }
147}
148
/// Observer-only post-clean check (Pass 3 in the detection pipeline).
///
/// Runs against already-tokenized output. May report suspected missed PII via
/// [`LeakReport`] but **must not** mutate the token manifest, the `CleanDocument`,
/// or the restore path. Safety nets are additive defense-in-depth, not a replacement
/// for Pass 1/2 detection.
///
/// Activate at runtime with `Pipeline::with_safety_net` (post-build) or
/// `PipelineBuilder::register_safety_net` (during build), or via the CLI
/// `--safety-net=<name>` flag.
///
/// If a safety net reports a suspected miss, the caller decides the response; the
/// pipeline never silently re-cleans based on safety net output.
pub trait SafetyNet: Send + Sync {
    /// Stable backend identifier used in telemetry and audit rows.
    fn id(&self) -> &str;

    /// Locale tags supported by this safety net. Empty means global
    /// (applicable under any locale chain).
    fn supported_locales(&self) -> &[LocaleTag];

    /// Checks clean text for possible PII that the manifest did not cover.
    ///
    /// `clean_text` is the post-tokenization output; returned suspect spans
    /// are byte ranges into it.
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}

/// Context passed to a privacy safety net.
///
/// Borrows session state for one clean text segment; `Copy` so backends can
/// pass it around freely.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
    /// Tokens emitted by the pseudonymization pipeline for this text segment.
    pub manifest: &'a Manifest,
    /// Active session-level locale chain. For `RawDocument::Structured`, locale
    /// gating uses this same session-level chain across all fields; structured
    /// fields do not carry per-field locale annotations.
    pub locale_chain: &'a [LocaleTag],
    /// Source document kind being checked.
    pub document_kind: DocumentKind,
    /// Optional audit session identifier.
    pub session_id: Option<&'a str>,
    /// Structured-document field path, such as `$.user.email`.
    pub field_path: Option<&'a str>,
}

195impl<'a> SafetyNetContext<'a> {
196    /// Builds safety-net context for one clean text segment.
197    pub fn new(
198        manifest: &'a Manifest,
199        locale_chain: &'a [LocaleTag],
200        document_kind: DocumentKind,
201        session_id: Option<&'a str>,
202        field_path: Option<&'a str>,
203    ) -> Self {
204        Self {
205            manifest,
206            locale_chain,
207            document_kind,
208            session_id,
209            field_path,
210        }
211    }
212}
213
/// A replacement emitted by the pseudonymization pipeline.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
    /// Byte span in the clean text where the token now sits.
    pub clean_span: Range<usize>,
    /// Byte span in the raw text that produced the token.
    pub raw_span: Range<usize>,
    /// PII class represented by the emitted token.
    pub class: PiiClass,
}

226impl EmittedTokenSpan {
227    /// Builds an emitted token span.
228    pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
229        Self {
230            clean_span,
231            raw_span,
232            class,
233        }
234    }
235}
236
/// Set of emitted token spans for one clean text segment.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct Manifest {
    /// Spans sorted by `clean_span.start` (invariant maintained by
    /// [`Manifest::from_spans`]; `diff_against` relies on it).
    pub spans: Vec<EmittedTokenSpan>,
}

impl Manifest {
    /// Builds a manifest from spans and sorts them by clean byte start.
    ///
    /// Ties on `clean_span.start` are broken by `clean_span.end`, keeping the
    /// ordering deterministic for identical starts.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Diffs one safety-net suspect span against emitted token coverage.
    ///
    /// Returns `None` when the suspect span is continuously covered by emitted
    /// token spans of the same class. Internal gaps return
    /// `LeakKind::PartialBleed`. When multiple uncovered gaps exist, this method
    /// deterministically returns the first gap by byte offset; full gap
    /// enumeration is intentionally deferred to a future report format.
    ///
    /// NOTE(review): the `partition_point` shortcut below assumes span ends are
    /// non-decreasing in the sorted order (i.e. emitted spans do not nest);
    /// confirm the pipeline never emits nested token spans.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        // An empty suspect span cannot leak any bytes.
        if suspect_span.is_empty() {
            return None;
        }

        // Skip every span that ends at or before the suspect start, then walk
        // forward only while spans can still overlap the suspect.
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // Sweep a cursor across the suspect span; any byte the cursor cannot
        // reach through emitted spans is an uncovered gap.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            if span.clean_span.start > cursor {
                // First gap found by byte offset -- report it and stop.
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            if span.clean_span.end > cursor {
                // Record only the first differently-classed covering span;
                // coverage continuity is judged independently of class.
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Trailing bytes past the last covering span are also a gap.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered: report a class mismatch if any covering span
        // disagreed with the suspect class, otherwise no leak.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}

/// Returns true when the two half-open byte ranges share at least one position.
///
/// Note: an empty range strictly inside a non-empty range still counts as
/// overlapping under this formulation.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    // De Morgan form of `left.start < right.end && right.start < left.end`:
    // the ranges overlap unless one ends at or before the other begins.
    !(right.end <= left.start || left.end <= right.start)
}

/// Suspected leak reported by an observer-only safety net.
///
/// Carries metadata only; no field ever contains raw source text.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
    /// Byte span in clean text.
    pub span: Range<usize>,
    /// Mapped PII class for the suspect.
    pub class: PiiClass,
    /// Safety-net backend identifier.
    pub safety_net_id: String,
    /// Optional backend confidence score.
    pub score: Option<f32>,
    /// Leak classification after manifest correlation.
    pub kind: LeakKind,
    /// Raw backend label after validation/mapping, never source text.
    pub raw_label: String,
    /// Optional structured field path.
    pub field_path: Option<String>,
}

338impl LeakSuspect {
339    /// Builds a safety-net leak suspect.
340    pub fn new(
341        span: Range<usize>,
342        class: PiiClass,
343        safety_net_id: impl Into<String>,
344        score: Option<f32>,
345        kind: LeakKind,
346        raw_label: impl Into<String>,
347        field_path: Option<String>,
348    ) -> Self {
349        Self {
350            span,
351            class,
352            safety_net_id: safety_net_id.into(),
353            score,
354            kind,
355            raw_label: raw_label.into(),
356            field_path,
357        }
358    }
359}
360
/// The category of a suspected missed PII span.
///
/// `LeakKind` is `#[non_exhaustive]`. Match with a wildcard for forward compatibility.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No emitted token span (of any class) overlaps the suspect span.
    Uncovered,
    /// The suspect is only partly covered; `uncovered` is the first gap.
    PartialBleed {
        /// First uncovered byte range in the suspect span (by byte offset).
        uncovered: Range<usize>,
    },
    /// The suspect is continuously covered, but by a different class.
    ClassMismatch {
        /// Class emitted by the pipeline.
        pipeline_class: PiiClass,
        /// Class reported by the safety net.
        safety_net_class: PiiClass,
    },
}

/// Bytes-free telemetry emitted by safety-net orchestration.
///
/// Events carry identifiers and kinds only -- never document bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// Safety net skipped because the session-level locale chain did not match.
    LocaleSkipped {
        /// Safety-net backend identifier.
        safety_net_id: String,
        /// Document kind checked.
        document_kind: DocumentKind,
        /// Optional structured field path when skip was recorded per field.
        field_path: Option<String>,
    },
}

/// Aggregate leak report statistics.
///
/// Computed from suspects and telemetry by [`LeakReport::from_parts`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct LeakReportStats {
    /// Number of suspects reported.
    pub suspect_count: usize,
    /// Number of uncovered suspects.
    pub uncovered_count: usize,
    /// Number of partial-bleed suspects.
    pub partial_bleed_count: usize,
    /// Number of class-mismatch suspects.
    pub class_mismatch_count: usize,
    /// Number of locale-skip telemetry events.
    pub locale_skipped_count: usize,
}

/// A suspected missed PII span reported by a [`SafetyNet`].
///
/// The safety net is not authoritative; a `LeakReport` is a signal, not a confirmed
/// leak. False positives are expected. Review reports and adjust policy or recognizer
/// thresholds.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
    /// Suspected leaks, containing metadata only.
    pub suspects: Vec<LeakSuspect>,
    /// Bytes-free telemetry events.
    pub telemetry: Vec<LeakReportTelemetry>,
    /// Aggregated counts for callers that do not need full suspect metadata.
    /// Kept consistent with `suspects`/`telemetry` by [`LeakReport::from_parts`].
    pub stats: LeakReportStats,
    /// Optional replay hash.
    ///
    /// Replay determinism is guaranteed only when command path, checkpoint,
    /// operating point, min score, and decode parameters are fixed externally.
    pub replay_hash: Option<String>,
}

435impl LeakReport {
436    /// Builds a report from suspects and telemetry.
437    pub fn from_parts(
438        suspects: Vec<LeakSuspect>,
439        telemetry: Vec<LeakReportTelemetry>,
440    ) -> LeakReport {
441        let mut stats = LeakReportStats {
442            suspect_count: suspects.len(),
443            locale_skipped_count: telemetry
444                .iter()
445                .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
446                .count(),
447            ..LeakReportStats::default()
448        };
449        for suspect in &suspects {
450            match suspect.kind {
451                LeakKind::Uncovered => stats.uncovered_count += 1,
452                LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
453                LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
454            }
455        }
456        LeakReport {
457            suspects,
458            telemetry,
459            stats,
460            replay_hash: None,
461        }
462    }
463
464    /// Merges another report into this report.
465    pub fn extend(&mut self, other: LeakReport) {
466        self.suspects.extend(other.suspects);
467        self.telemetry.extend(other.telemetry);
468        *self = LeakReport::from_parts(
469            std::mem::take(&mut self.suspects),
470            std::mem::take(&mut self.telemetry),
471        );
472    }
473}
474
/// Closed set of upstream OpenAI Privacy Filter labels accepted by Gaze.
///
/// Use [`OpenAiPrivateLabel::as_str`] for the raw upstream spelling.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// `private_person`.
    PrivatePerson,
    /// `private_address`.
    PrivateAddress,
    /// `private_email`.
    PrivateEmail,
    /// `private_phone`.
    PrivatePhone,
    /// `private_url`.
    PrivateUrl,
    /// `private_date`.
    PrivateDate,
    /// `account_number`.
    AccountNumber,
    /// `secret`.
    Secret,
}

impl OpenAiPrivateLabel {
    /// Returns the raw upstream label, spelled exactly as the filter emits it
    /// (lowercase snake_case).
    pub fn as_str(self) -> &'static str {
        match self {
            Self::PrivatePerson => "private_person",
            Self::PrivateAddress => "private_address",
            Self::PrivateEmail => "private_email",
            Self::PrivatePhone => "private_phone",
            Self::PrivateUrl => "private_url",
            Self::PrivateDate => "private_date",
            Self::AccountNumber => "account_number",
            Self::Secret => "secret",
        }
    }
}

/// Closed safety-net PII vocabulary before mapping into `PiiClass`.
///
/// `Email`/`Name`/`Location` map 1:1 onto built-in [`PiiClass`] variants;
/// every other variant becomes a normalized `Custom` class (see
/// [`SafetyNetPiiClass::to_pii_class`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Email address.
    Email,
    /// Person name.
    Name,
    /// Location or address.
    Location,
    /// Phone number.
    Phone,
    /// URL.
    Url,
    /// Date.
    Date,
    /// Account number.
    AccountNumber,
    /// Secret.
    Secret,
}

impl SafetyNetPiiClass {
    /// Maps the safety-net class into the shared pipeline class vocabulary.
    ///
    /// Classes without a built-in `PiiClass` counterpart (phone, URL, date,
    /// account number, secret) become `PiiClass::Custom` with a fixed
    /// snake_case name.
    pub fn to_pii_class(self) -> PiiClass {
        match self {
            Self::Email => PiiClass::Email,
            Self::Name => PiiClass::Name,
            Self::Location => PiiClass::Location,
            Self::Phone => PiiClass::custom("phone"),
            Self::Url => PiiClass::custom("url"),
            Self::Date => PiiClass::custom("date"),
            Self::AccountNumber => PiiClass::custom("account_number"),
            Self::Secret => PiiClass::custom("secret"),
        }
    }
}

/// Exhaustive, closed error set for safety-net execution.
///
/// All variant payloads carry sanitized, bytes-free diagnostics only -- never
/// document content.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// Safety net was explicitly requested but is unavailable.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Required model weights or checkpoint are missing.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        /// Sanitized path or identifier.
        path: String,
    },
    /// Backend model could not be loaded or reached.
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        /// Sanitized reason.
        reason: String,
    },
    /// Input exceeded configured backend limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        /// Configured byte limit.
        limit: usize,
        /// Actual byte length.
        actual: usize,
    },
    /// Backend runtime failed.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        /// Sanitized diagnostic message.
        message: String,
    },
    /// Backend returned invalid output.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        /// Sanitized diagnostic message.
        message: String,
    },
}

/// Disposition applied to a detected PII span.
///
/// | Variant | Restorable | Output shape |
/// |---------|------------|--------------|
/// | `Tokenize` | Yes | Opaque token: `<hex:Class_N>` |
/// | `FormatPreserve` | Yes | Realistic-looking pseudonym (e.g., `email1.hex@gaze-fake.invalid`) |
/// | `Redact` | No | Literal `[REDACTED]` -- original value is gone |
/// | `Generalize` | No | Class label (e.g., `[Email]`) -- original value is gone |
/// | `Preserve` | - | Passes through unchanged |
///
/// `Action` is `#[non_exhaustive]`. Use a wildcard arm in exhaustive matches.
/// When restore is required, use `Tokenize` or `FormatPreserve` -- `Redact` and
/// `Generalize` are irreversible.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace PII with a reversible token.
    Tokenize,
    /// Replace PII with a non-restorable redaction marker.
    Redact,
    /// Replace PII with a reversible format-preserving token.
    FormatPreserve,
    /// Replace PII with a broader category label (non-restorable).
    Generalize,
    /// Preserve the original value unchanged (nothing to restore).
    Preserve,
}

/// Conflict resolution tier that selected or rejected a candidate.
///
/// Recorded per decision in [`RedactionEntry::decided_by`] for audit purposes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict resolution was needed.
    None,
    /// Class priority decided the conflict.
    ClassPriority,
    /// Rule priority decided the conflict.
    RulePriority,
    /// Candidate score decided the conflict.
    Score,
    /// Span length decided the conflict.
    SpanLength,
    /// Validator result decided the conflict.
    Validator,
    /// Recognizer identifier decided the conflict.
    RecognizerId,
    /// Candidate was merged with another candidate.
    Merged,
}

/// Source document kind for metadata-only audit logging.
///
/// Carried in audit rows ([`RedactionEntry::document_kind`]) and in
/// [`SafetyNetContext::document_kind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Structured key/value document.
    Structured,
    /// Plain text document.
    Text,
}

/// One row of redaction metadata emitted to a [`RedactionLogger`].
///
/// Fields identify the PII class, action taken, session ID, source document kind,
/// conflict-resolution metadata, and timestamp. Does **not** contain the original PII
/// value, the token string, or any identifiable content beyond what a compliance audit
/// requires.
///
/// `RedactionEntry` is `#[non_exhaustive]`; adopters must construct via the public
/// constructor or destructure with a wildcard pattern.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Detector or recognizer source identifier.
    pub source: String,
    /// PII class affected by the decision.
    pub class: PiiClass,
    /// Policy action applied to the span.
    pub action: Action,
    /// Optional structured field name (structured documents only).
    pub field_name: Option<String>,
    /// Source document kind.
    pub document_kind: DocumentKind,
    /// Whether this entry records a loser in conflict resolution.
    pub conflict_loser: bool,
    /// Conflict tier that decided the outcome.
    pub decided_by: ConflictTier,
    /// Creation timestamp in epoch milliseconds.
    pub created_at: i64,
    /// Optional session identifier.
    pub session_id: Option<String>,
}

687impl RedactionEntry {
688    /// Builds a metadata-only redaction log entry.
689    #[allow(clippy::too_many_arguments)]
690    pub fn new(
691        source: impl Into<String>,
692        class: PiiClass,
693        action: Action,
694        field_name: Option<String>,
695        document_kind: DocumentKind,
696        conflict_loser: bool,
697        decided_by: ConflictTier,
698        created_at: i64,
699        session_id: Option<String>,
700    ) -> Self {
701        Self {
702            source: source.into(),
703            class,
704            action,
705            field_name,
706            document_kind,
707            conflict_loser,
708            decided_by,
709            created_at,
710            session_id,
711        }
712    }
713}
714
/// Closed error set for redaction log sinks.
///
/// The payload string is the sink's diagnostic message, rendered via the
/// `#[error]` format.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// SQLite-backed redaction log sink failed.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Non-SQLite redaction log sink failed.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}

/// Trait for audit sinks that receive redaction metadata.
///
/// Implement this for custom audit backends (remote telemetry, structured JSON logs).
/// For SQLite-backed persistence, use `gaze_audit::SqliteLogger`.
///
/// # Contract
///
/// The logger receives **metadata only**: class, action, session ID, timestamp, and
/// other bytes-free audit labels. It never receives the original PII value or the token
/// value. A custom impl that augments entries with raw document text violates the audit
/// isolation contract and will be flagged by the `gaze_module_isolation` Dylint lint
/// when it lives in the wrong crate.
///
/// # Example
///
/// ```rust
/// use std::sync::atomic::{AtomicUsize, Ordering};
/// use gaze_types::{RedactionEntry, RedactionLogError, RedactionLogger};
///
/// #[derive(Default)]
/// struct CountLogger(AtomicUsize);
///
/// impl RedactionLogger for CountLogger {
///     fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
///         self.0.fetch_add(1, Ordering::Relaxed);
///         Ok(())
///     }
/// }
/// ```
pub trait RedactionLogger: Send + Sync {
    /// Records a metadata-only redaction entry.
    ///
    /// # Errors
    ///
    /// Returns a [`RedactionLogError`] when the sink fails to persist the entry.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}

/// Locale tag recognized by policy and recognizers.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-independent recognizer or policy.
    Global,
    /// German as used in Germany.
    DeDe,
    /// German as used in Austria.
    DeAt,
    /// German as used in Switzerland.
    DeCh,
    /// English as used in the United States.
    EnUs,
    /// English as used in Great Britain.
    EnGb,
    /// English as used in Ireland.
    EnIe,
    /// English as used in Australia.
    EnAu,
    /// English as used in Canada.
    EnCa,
    /// Any other canonical BCP-47-like tag, as produced by [`LocaleTag::parse`]
    /// for tags outside the built-in set.
    Other(String),
}

/// Locale parsing error returned by [`LocaleTag::parse`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// Locale tag is unsupported or invalid (including the empty string).
    Unsupported,
}

795impl fmt::Display for LocaleError {
796    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
797        match self {
798            LocaleError::Unsupported => f.write_str("unsupported locale"),
799        }
800    }
801}
802
803impl std::error::Error for LocaleError {}
804
/// Ordered locale fallback chain.
///
/// Constructors such as [`LocaleChain::from_tags`] append a global fallback
/// when absent (via `ensure_global`, defined elsewhere in this crate).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);

impl LocaleTag {
    /// Global locale constant.
    pub const GLOBAL: LocaleTag = LocaleTag::Global;

    /// Parses a locale tag from policy or CLI input.
    ///
    /// Input is trimmed and underscores are normalized to hyphens; matching
    /// is case-insensitive. `"global"` and `"*"` both parse to
    /// [`LocaleTag::Global`]. Tags outside the built-in set are accepted as
    /// [`LocaleTag::Other`] only when `is_bcp47_parseable` approves them
    /// (helper defined elsewhere in this crate).
    ///
    /// # Errors
    ///
    /// Returns [`LocaleError::Unsupported`] for empty or unparseable tags.
    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
        // `raw` keeps the original casing for `Other` canonicalization;
        // `normalized` is only used for case-insensitive matching below.
        let raw = s.trim().replace('_', "-");
        let normalized = raw.to_ascii_lowercase();
        match normalized.as_str() {
            "global" | "*" => Ok(LocaleTag::Global),
            "de-de" => Ok(LocaleTag::DeDe),
            "de-at" => Ok(LocaleTag::DeAt),
            "de-ch" => Ok(LocaleTag::DeCh),
            "en-us" => Ok(LocaleTag::EnUs),
            "en-gb" => Ok(LocaleTag::EnGb),
            "en-ie" => Ok(LocaleTag::EnIe),
            "en-au" => Ok(LocaleTag::EnAu),
            "en-ca" => Ok(LocaleTag::EnCa),
            "" => Err(LocaleError::Unsupported),
            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
            _ => Err(LocaleError::Unsupported),
        }
    }

    /// Returns the canonical string form of the locale tag
    /// (`"global"`, `"de-DE"`, `"en-US"`, ...).
    pub fn as_str(&self) -> &str {
        match self {
            LocaleTag::Global => "global",
            LocaleTag::DeDe => "de-DE",
            LocaleTag::DeAt => "de-AT",
            LocaleTag::DeCh => "de-CH",
            LocaleTag::EnUs => "en-US",
            LocaleTag::EnGb => "en-GB",
            LocaleTag::EnIe => "en-IE",
            LocaleTag::EnAu => "en-AU",
            LocaleTag::EnCa => "en-CA",
            LocaleTag::Other(tag) => tag.as_str(),
        }
    }
}

850impl LocaleChain {
851    /// Builds a locale chain and appends global fallback when absent.
852    pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
853        ensure_global(&mut tags);
854        LocaleChain(tags)
855    }
856
857    /// Parses a comma-separated CLI locale chain.
858    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
859        let tags = raw
860            .split(',')
861            .map(LocaleTag::parse)
862            .collect::<Result<Vec<_>, _>>()?;
863        Ok(LocaleChain::from_tags(tags))
864    }
865
866    /// Merges policy and CLI locale preferences.
867    pub fn merge_policy_and_cli(
868        policy: Option<&[LocaleTag]>,
869        cli: Option<&[LocaleTag]>,
870    ) -> LocaleChain {
871        Self::merge_cli_policy_rulepack_default(cli, policy, None)
872    }
873
874    /// Merges CLI, policy, rulepack, and default locale preferences.
875    pub fn merge_cli_policy_rulepack_default(
876        cli: Option<&[LocaleTag]>,
877        policy: Option<&[LocaleTag]>,
878        rulepack_defaults: Option<&[LocaleTag]>,
879    ) -> LocaleChain {
880        let tags = cli
881            .filter(|tags| !tags.is_empty())
882            .or_else(|| policy.filter(|tags| !tags.is_empty()))
883            .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
884            .map(|tags| tags.to_vec())
885            .unwrap_or_else(|| vec![LocaleTag::Global]);
886        LocaleChain::from_tags(tags)
887    }
888
889    /// Returns true when a recognizer can run under this locale chain.
890    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
891        if recognizer_locales.is_empty() {
892            return true;
893        }
894        recognizer_locales.iter().any(|recognizer_locale| {
895            *recognizer_locale == LocaleTag::Global
896                || self.0.iter().any(|active| active == recognizer_locale)
897        })
898    }
899
900    /// Returns the locale tags in chain order.
901    pub fn as_slice(&self) -> &[LocaleTag] {
902        &self.0
903    }
904
905    /// Returns the locale chain as canonical strings.
906    pub fn to_strings(&self) -> Vec<String> {
907        self.0.iter().map(ToString::to_string).collect()
908    }
909}
910
911impl From<&[LocaleTag]> for LocaleChain {
912    fn from(tags: &[LocaleTag]) -> Self {
913        let mut owned = tags.to_vec();
914        ensure_global(&mut owned);
915        LocaleChain(owned)
916    }
917}
918
919impl fmt::Display for LocaleTag {
920    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
921        f.write_str(self.as_str())
922    }
923}
924
/// The input document submitted for pseudonymization.
///
/// `RawDocument::Text(String)` for plain or semi-structured text (most LLM workflows).
/// `RawDocument::Structured(BTreeMap<String, Value>)` for JSON-shaped data where
/// column-aware rules apply -- `ColumnRule`s only take effect on structured input.
///
/// `Detection::span` and recognizer candidate spans use **byte** ranges, not char indices.
///
/// Mirrored on the output side by `CleanDocument`, which has the same two shapes.
/// Derives only `Debug` and `Clone`; no `Serialize`/`Deserialize` is derived here.
///
/// `RawDocument` is `#[non_exhaustive]`. Match with a wildcard arm.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Structured document values.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
942
/// The pseudonymized output from `Pipeline::redact`.
///
/// Mirrors the shape of `RawDocument`: `CleanDocument::Text(String)` or
/// `CleanDocument::Structured(BTreeMap<String, Value>)`. Destructure with a `let`-else
/// or `match`; **there is no `.text()` accessor**.
///
/// ```rust
/// use gaze_types::CleanDocument;
///
/// fn unwrap_text(doc: CleanDocument) -> Option<String> {
///     if let CleanDocument::Text(t) = doc { Some(t) } else { None }
/// }
/// ```
///
/// Contains only tokens or redacted placeholders -- no original PII values.
/// Send this (or its inner string) to the LLM; never send the original `RawDocument`.
///
/// Serialization is `#[serde(untagged)]`, so `Text` serializes as a bare string
/// and `Structured` as a plain object, with no variant tag.
///
/// `CleanDocument` is `#[non_exhaustive]`.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Structured document values.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
970
/// Minimal structured value representation that avoids a serde_json dependency.
///
/// Only `Serialize` is derived (untagged), so values render like plain JSON.
/// Note there is no float variant: numbers are `I64` only.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Null value.
    Null,
    /// Boolean value.
    Bool(bool),
    /// String value.
    String(String),
    /// Signed 64-bit integer value.
    I64(i64),
    /// Array value.
    Array(Vec<Value>),
    /// Object value.
    Object(BTreeMap<String, Value>),
}
989
990impl Value {
991    /// Returns the inner string for string values.
992    pub fn as_str(&self) -> Option<&str> {
993        match self {
994            Self::String(value) => Some(value.as_str()),
995            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
996        }
997    }
998
999    /// Returns a scalar string representation used for structured safety-net checks.
1000    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
1001        match self {
1002            Self::String(value) if !value.is_empty() => Some(value.clone()),
1003            Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
1004            Self::Bool(value) => Some(value.to_string()),
1005            Self::I64(value) => Some(value.to_string()),
1006        }
1007    }
1008}
1009
1010impl PartialEq<&str> for Value {
1011    fn eq(&self, other: &&str) -> bool {
1012        self.as_str() == Some(*other)
1013    }
1014}
1015
/// Value-only dictionary bundle shared with recognizers.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    // Entries keyed by dictionary name; on merge, the second bundle's entry
    // wins for a duplicated name (see `DictionaryBundle::merge`).
    entries: HashMap<String, DictionaryEntry>,
}
1021
/// Value-only dictionary entry; compiled automatons live outside `gaze-types`.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Raw match terms; `DictionaryEntry::new` guarantees this is non-empty.
    terms: Vec<String>,
    // When false, `new` additionally guarantees every term is ASCII.
    case_sensitive: bool,
    // Origin of the dictionary (CLI request context vs rulepack).
    source: DictionarySource,
}
1029
/// Source of a dictionary entry.
///
/// Recorded on each entry and surfaced through `DictionaryStats` for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Dictionary supplied by request context.
    Cli,
    /// Dictionary supplied by a rulepack.
    Rulepack,
}
1039
/// Dictionary metadata used for diagnostics and tests.
///
/// Produced sorted by `name` when obtained via `DictionaryBundle::stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of configured terms.
    pub term_count: usize,
    /// Dictionary source.
    pub source: DictionarySource,
}
1051
1052impl DictionaryStats {
1053    /// Builds dictionary diagnostics metadata.
1054    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
1055        Self {
1056            name: name.into(),
1057            term_count,
1058            source,
1059        }
1060    }
1061}
1062
/// Dictionary declared by a rulepack.
///
/// Converted into validated `DictionaryEntry`s by
/// `DictionaryBundle::from_rulepack_terms`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name.
    pub name: String,
    /// Dictionary terms.
    pub terms: Vec<String>,
    /// Whether matching is case-sensitive.
    pub case_sensitive: bool,
}
1074
1075impl RulepackDict {
1076    /// Builds a rulepack dictionary declaration.
1077    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
1078        Self {
1079            name: name.into(),
1080            terms,
1081            case_sensitive,
1082        }
1083    }
1084}
1085
/// Error raised when constructing invalid dictionary entries.
///
/// `Display` and `std::error::Error` are implemented manually for this type.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// Dictionary has no terms.
    Empty { name: String },
    /// ASCII-only case-insensitive matching cannot safely cover this entry.
    UnicodeInsensitiveUnsupported { name: String },
}
1095
1096impl fmt::Display for DictionaryLoadError {
1097    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1098        match self {
1099            Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
1100            Self::UnicodeInsensitiveUnsupported { name } => write!(
1101                f,
1102                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
1103            ),
1104        }
1105    }
1106}
1107
// Marker impl: the `Debug` derive and manual `Display` satisfy the `Error`
// supertraits, and the default trait methods suffice.
impl std::error::Error for DictionaryLoadError {}
1109
1110impl DictionaryBundle {
1111    /// Builds a bundle from rulepack dictionaries.
1112    pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
1113        let mut entries = HashMap::with_capacity(terms.len());
1114        for dictionary in terms {
1115            let entry = DictionaryEntry::new(
1116                &dictionary.name,
1117                dictionary.terms.clone(),
1118                dictionary.case_sensitive,
1119                DictionarySource::Rulepack,
1120            )
1121            .expect("Policy validates dictionary terms before bundle construction");
1122            entries.insert(dictionary.name.clone(), entry);
1123        }
1124        Self { entries }
1125    }
1126
1127    /// Builds a bundle from pre-built dictionary entries.
1128    pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
1129        Self {
1130            entries: entries.into_iter().collect(),
1131        }
1132    }
1133
1134    /// Merges two bundles, preferring entries from the second bundle on name conflicts.
1135    pub fn merge(a: Self, b: Self) -> Self {
1136        let mut entries = a.entries;
1137        entries.extend(b.entries);
1138        Self { entries }
1139    }
1140
1141    /// Returns a dictionary by name.
1142    pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
1143        self.entries.get(name)
1144    }
1145
1146    /// Returns sorted dictionary stats.
1147    pub fn stats(&self) -> Vec<DictionaryStats> {
1148        let mut stats = self
1149            .entries
1150            .iter()
1151            .map(|(name, entry)| DictionaryStats {
1152                name: name.clone(),
1153                term_count: entry.terms.len(),
1154                source: entry.source,
1155            })
1156            .collect::<Vec<_>>();
1157        stats.sort_by(|a, b| a.name.cmp(&b.name));
1158        stats
1159    }
1160}
1161
1162impl DictionaryEntry {
1163    /// Creates a validated value-only dictionary entry.
1164    pub fn new(
1165        name: &str,
1166        terms: Vec<String>,
1167        case_sensitive: bool,
1168        source: DictionarySource,
1169    ) -> Result<Self, DictionaryLoadError> {
1170        if terms.is_empty() {
1171            return Err(DictionaryLoadError::Empty {
1172                name: name.to_string(),
1173            });
1174        }
1175        if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
1176            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
1177                name: name.to_string(),
1178            });
1179        }
1180        Ok(Self {
1181            terms,
1182            case_sensitive,
1183            source,
1184        })
1185    }
1186
1187    /// Returns whether matching is case-sensitive.
1188    pub fn case_sensitive(&self) -> bool {
1189        self.case_sensitive
1190    }
1191
1192    /// Returns configured dictionary terms.
1193    pub fn terms(&self) -> &[String] {
1194        &self.terms
1195    }
1196}
1197
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    // An entry with zero terms is useless and must be rejected up front
    // rather than silently matching nothing.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let err = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli)
            .expect_err("empty dictionaries must fail closed");

        assert!(matches!(err, DictionaryLoadError::Empty { name } if name == "empty"));
    }

    // ASCII-only case folding cannot cover "Café"-style terms, so a
    // case-insensitive dictionary containing them must fail closed.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let err = DictionaryEntry::new(
            "songs",
            vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()],
            false,
            DictionarySource::Cli,
        )
        .expect_err("unicode case-insensitive dictionaries must fail closed");

        assert!(matches!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
        ));
    }
}
1226
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    // Minimal logger: accepts every entry and records nothing.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    // Compile-time probe: only instantiable when `T: Send + Sync`.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    // Display strings are part of the public contract; pin their exact shape.
    #[test]
    fn redaction_log_error_display_is_stable() {
        assert_eq!(
            RedactionLogError::Sqlite("write failed".to_string()).to_string(),
            "sqlite redaction log error: write failed"
        );
        assert_eq!(
            RedactionLogError::Backend("sink failed".to_string()).to_string(),
            "backend redaction log error: sink failed"
        );
    }

    // Trait objects of `RedactionLogger` must be shareable across threads.
    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    // A downstream crate-style local type can implement the trait and be
    // used through a trait object.
    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
        };

        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }
}
1277
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    // Test helper: a token span whose clean and raw byte ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    // Test helper: runs the safety-net diff for one suspect range and class.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);

        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);

        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);

        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);

        // The first-gap-only rule is intentional for v0.6.1; full gap
        // enumeration is deferred until the report format can carry it.
        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    // When the suspect range overlaps several differently-classed spans, the
    // first mismatching pipeline class is reported, deterministically.
    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);

        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    // Back-to-back spans (end of one == start of next) leave no gap.
    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);

        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    // Bleeds can appear before the first span, after the last, or between spans.
    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);

        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    // Spans are byte offsets into UTF-8 text, never char counts.
    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);

        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();

        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    // Error text must never echo suspect PII content back to the caller.
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
1444
/// Shared recognizer contract for locale-aware PII candidates.
pub trait Recognizer: Send + Sync {
    /// Stable recognizer identifier.
    fn id(&self) -> &str;
    /// PII class supported by this recognizer.
    fn supported_class(&self) -> &PiiClass;
    /// Detects PII candidates in the supplied input and context.
    ///
    /// Candidate spans are byte ranges into `input`.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family used for candidate token emission.
    fn token_family(&self) -> &str;
    /// Locales where this recognizer is active.
    ///
    /// Defaults to `[LocaleTag::Global]`, i.e. active under every locale chain.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
1460
/// Candidate PII span emitted by a recognizer before final conflict resolution.
///
/// Derives `PartialEq` but not `Eq` because `score` is an `f32`.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Byte span in the original input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Recognizer identifier.
    pub recognizer_id: String,
    /// Recognizer confidence score.
    pub score: f32,
    /// Rule or recognizer priority.
    pub priority: i32,
    /// Optional canonical representation for validation/merge logic.
    pub canonical_form: Option<String>,
    /// Token family used for output token shape.
    pub token_family: String,
    /// Candidate source label.
    pub source: String,
    /// Conflict tier that decided this candidate.
    pub decided_by: ConflictTier,
    /// Sources merged into this candidate.
    pub merged_sources: Vec<String>,
}
1486
1487impl Candidate {
1488    /// Builds a recognizer candidate.
1489    #[allow(clippy::too_many_arguments)]
1490    pub fn new(
1491        span: Range<usize>,
1492        class: PiiClass,
1493        recognizer_id: impl Into<String>,
1494        score: f32,
1495        priority: i32,
1496        canonical_form: Option<String>,
1497        token_family: impl Into<String>,
1498        source: impl Into<String>,
1499        decided_by: ConflictTier,
1500        merged_sources: Vec<String>,
1501    ) -> Self {
1502        Self {
1503            span,
1504            class,
1505            recognizer_id: recognizer_id.into(),
1506            score,
1507            priority,
1508            canonical_form,
1509            token_family: token_family.into(),
1510            source: source.into(),
1511            decided_by,
1512            merged_sources,
1513        }
1514    }
1515
1516    /// Returns this candidate with a translated span.
1517    pub fn with_span(mut self, span: Range<usize>) -> Self {
1518        self.span = span;
1519        self
1520    }
1521}
1522
/// Context supplied to recognizers during detection.
///
/// Recognizers receive this by shared reference (`&DetectContext`), so
/// `degraded` is a `Cell` to allow flagging through that shared reference.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Active locale chain.
    pub locale_chain: &'a [LocaleTag],
    /// Active dictionary bundle.
    pub dictionaries: &'a DictionaryBundle,
    /// Reserved field-aware matching slot; intentionally unit in v0.5 Phase B.
    pub fields: &'a (),
    /// Whether a recognizer degraded due to unavailable optional capability.
    pub degraded: Cell<bool>,
}
1535
1536impl<'a> DetectContext<'a> {
1537    /// Builds detection context for a recognizer pass.
1538    pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
1539        Self {
1540            locale_chain,
1541            dictionaries,
1542            fields: &(),
1543            degraded: Cell::new(false),
1544        }
1545    }
1546}
1547
1548fn ensure_global(tags: &mut Vec<LocaleTag>) {
1549    if !tags.contains(&LocaleTag::Global) {
1550        tags.push(LocaleTag::Global);
1551    }
1552}
1553
/// Lightweight BCP 47 shape check: a 2-8 letter primary language subtag,
/// optionally followed by `-`-separated 2-8 character alphanumeric subtags.
/// This is a parseability heuristic, not a full RFC 5646 validator.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut parts = raw.split('-');
    let language = match parts.next() {
        Some(first) => first,
        None => return false,
    };
    let language_ok =
        (2..=8).contains(&language.len()) && language.bytes().all(|b| b.is_ascii_alphabetic());
    language_ok
        && parts.all(|part| {
            (2..=8).contains(&part.len()) && part.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
1566
/// Canonicalizes an `Other` locale tag: the language subtag is lowercased and
/// two-letter alphabetic subtags (region codes) are uppercased; every other
/// subtag is lowercased. Note this is the crate's own canonical form -- script
/// subtags are not title-cased as full BCP 47 canonicalization would do.
fn canonical_other(raw: &str) -> String {
    let mut pieces = raw.split('-');
    let mut canonical = pieces.next().unwrap_or_default().to_ascii_lowercase();
    for piece in pieces {
        canonical.push('-');
        let looks_like_region = piece.len() == 2 && piece.bytes().all(|b| b.is_ascii_alphabetic());
        if looks_like_region {
            canonical.push_str(&piece.to_ascii_uppercase());
        } else {
            canonical.push_str(&piece.to_ascii_lowercase());
        }
    }
    canonical
}