// gaze_types/lib.rs

#![cfg_attr(docsrs, feature(doc_cfg))]

use std::cell::Cell;
use std::collections::{BTreeMap, HashMap};
use std::fmt;
use std::ops::Range;

use serde::{Deserialize, Serialize};
use thiserror::Error;
10
11/// Shared detector contract for text-only PII detection.
12pub trait Detector: Send + Sync {
13    /// Detect PII spans in the supplied input string.
14    fn detect(&self, input: &str) -> Vec<Detection>;
15}
16
17/// The category of a detected PII span.
18///
19/// Built-in variants: `Email`, `Name`, `Location`, `Organization`. Tenant-specific PII
20/// (case references, titles, internal codes) is carried as `PiiClass::Custom(String)`.
21/// **There is no `Phone` variant** -- phone detection is provided by recognizers in
22/// `gaze-recognizers` and surfaces as either a `Custom("phone")` class or a class
23/// defined by a rulepack.
24///
25/// `PiiClass` is exhaustive. Match every variant explicitly so new built-in classes
26/// force call sites to review their handling at compile time:
27///
28/// ```rust
29/// use gaze_types::PiiClass;
30///
31/// fn label(class: &PiiClass) -> &'static str {
32///     match class {
33///         PiiClass::Email        => "email",
34///         PiiClass::Name         => "name",
35///         PiiClass::Location     => "location",
36///         PiiClass::Organization => "org",
37///         PiiClass::Custom(_)    => "pii",
38///     }
39/// }
40/// ```
41///
42/// Policy TOML uses the lowercase forms `email` / `name` / `location` / `organization`,
43/// and tenant classes are spelled like `custom:case_ref` (lowercase, snake_case).
44#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
45pub enum PiiClass {
46    /// Email address class.
47    Email,
48    /// Person name class.
49    Name,
50    /// Location class.
51    Location,
52    /// Organization class.
53    Organization,
54    /// Tenant- or policy-defined class.
55    Custom(String),
56}
57
/// Display labels for the built-in classes, in stable order.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
60
61impl PiiClass {
62    /// Parses a policy class name into the shared class vocabulary.
63    pub fn from_policy_name(input: &str) -> Option<Self> {
64        match input {
65            "email" => Some(Self::Email),
66            "name" => Some(Self::Name),
67            "location" => Some(Self::Location),
68            "organization" => Some(Self::Organization),
69            custom if custom.starts_with("custom:") => {
70                let name = custom.trim_start_matches("custom:");
71                (!name.trim().is_empty()).then(|| Self::custom(name))
72            }
73            _ => None,
74        }
75    }
76
77    /// Returns the built-in class variants.
78    pub fn builtin_variants() -> &'static [PiiClass] {
79        &[
80            PiiClass::Email,
81            PiiClass::Name,
82            PiiClass::Location,
83            PiiClass::Organization,
84        ]
85    }
86
87    /// Builds a normalized custom class name.
88    pub fn custom(name: &str) -> Self {
89        let mut normalized = String::new();
90        let mut pending_underscore = false;
91        for ch in name.trim().chars() {
92            if ch.is_ascii_alphanumeric() {
93                if pending_underscore && !normalized.is_empty() {
94                    normalized.push('_');
95                }
96                normalized.push(ch.to_ascii_lowercase());
97                pending_underscore = false;
98            } else {
99                pending_underscore = true;
100            }
101        }
102
103        Self::Custom(normalized)
104    }
105
106    /// Returns the normalized custom class name for custom classes.
107    pub fn as_custom_name(&self) -> Option<&str> {
108        match self {
109            Self::Custom(name) => Some(name.as_str()),
110            Self::Email | Self::Name | Self::Location | Self::Organization => None,
111        }
112    }
113
114    /// Returns the audit/token display label for this class.
115    pub fn class_name(&self) -> String {
116        match self {
117            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
118            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
119            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
120            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
121            Self::Custom(name) => format!("Custom:{name}"),
122        }
123    }
124}
125
126/// A detected span and its class/source metadata.
127#[derive(Debug, Clone, PartialEq, Eq)]
128#[non_exhaustive]
129pub struct Detection {
130    /// Byte span in the original input.
131    pub span: Range<usize>,
132    /// PII class assigned to the span.
133    pub class: PiiClass,
134    /// Detector source identifier.
135    pub source: String,
136}
137
138impl Detection {
139    /// Builds a detected PII span.
140    pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
141        Self {
142            span,
143            class,
144            source: source.into(),
145        }
146    }
147}
148
149/// Observer-only post-clean check (Pass 3 in the detection pipeline).
150///
151/// Runs against already-tokenized output. May report suspected missed PII via
152/// [`LeakReport`] but **must not** mutate the token manifest, the `CleanDocument`,
153/// or the restore path. Safety nets are additive defense-in-depth, not a replacement
154/// for Pass 1/2 detection.
155///
156/// Activate at runtime with `Pipeline::with_safety_net` (post-build) or
157/// `PipelineBuilder::register_safety_net` (during build), or via the CLI
158/// `--safety-net=<name>` flag.
159///
160/// If a safety net reports a suspected miss, the caller decides the response; the
161/// pipeline never silently re-cleans based on safety net output.
162pub trait SafetyNet: Send + Sync {
163    /// Stable backend identifier used in telemetry and audit rows.
164    fn id(&self) -> &str;
165
166    /// Locale tags supported by this safety net. Empty means global.
167    fn supported_locales(&self) -> &[LocaleTag];
168
169    /// Checks clean text for possible PII that the manifest did not cover.
170    fn check(
171        &self,
172        clean_text: &str,
173        context: SafetyNetContext<'_>,
174    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
175}
176
177/// Context passed to a privacy safety net.
178#[derive(Debug, Clone, Copy)]
179#[non_exhaustive]
180pub struct SafetyNetContext<'a> {
181    /// Tokens emitted by the pseudonymization pipeline for this text segment.
182    pub manifest: &'a Manifest,
183    /// Active session-level locale chain. For `RawDocument::Structured`, locale
184    /// gating uses this same session-level chain across all fields; structured
185    /// fields do not carry per-field locale annotations.
186    pub locale_chain: &'a [LocaleTag],
187    /// Source document kind being checked.
188    pub document_kind: DocumentKind,
189    /// Optional audit session identifier.
190    pub session_id: Option<&'a str>,
191    /// Structured-document field path, such as `$.user.email`.
192    pub field_path: Option<&'a str>,
193}
194
195impl<'a> SafetyNetContext<'a> {
196    /// Builds safety-net context for one clean text segment.
197    pub fn new(
198        manifest: &'a Manifest,
199        locale_chain: &'a [LocaleTag],
200        document_kind: DocumentKind,
201        session_id: Option<&'a str>,
202        field_path: Option<&'a str>,
203    ) -> Self {
204        Self {
205            manifest,
206            locale_chain,
207            document_kind,
208            session_id,
209            field_path,
210        }
211    }
212}
213
214/// A replacement emitted by the pseudonymization pipeline.
215#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
216#[non_exhaustive]
217pub struct EmittedTokenSpan {
218    /// Byte span in the clean text.
219    pub clean_span: Range<usize>,
220    /// Byte span in the raw text that produced the token.
221    pub raw_span: Range<usize>,
222    /// PII class represented by the emitted token.
223    pub class: PiiClass,
224}
225
226impl EmittedTokenSpan {
227    /// Builds an emitted token span.
228    pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
229        Self {
230            clean_span,
231            raw_span,
232            class,
233        }
234    }
235}
236
237/// Set of emitted token spans for one clean text segment.
238#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
239#[non_exhaustive]
240pub struct Manifest {
241    /// Spans sorted by `clean_span.start`.
242    pub spans: Vec<EmittedTokenSpan>,
243}
244
impl Manifest {
    /// Builds a manifest from spans and sorts them by clean byte start.
    ///
    /// Sorting key is `(start, end)`, so spans sharing a start are ordered
    /// shortest-first; `diff_against` relies on this ordering.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }

    /// Diffs one safety-net suspect span against emitted token coverage.
    ///
    /// Returns `None` when the suspect span is continuously covered by emitted
    /// token spans of the same class. Internal gaps return
    /// `LeakKind::PartialBleed`. When multiple uncovered gaps exist, this method
    /// deterministically returns the first gap by byte offset; full gap
    /// enumeration is intentionally deferred to a future report format.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        // An empty suspect span cannot leak anything.
        if suspect_span.is_empty() {
            return None;
        }

        // Spans are sorted by clean start (see `from_spans`), so binary-search
        // past every span that ends at or before the suspect's start...
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        // ...then take spans until one starts at/after the suspect's end,
        // keeping only those that genuinely overlap (drops empty spans).
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();

        // No overlap at all: the suspect is entirely uncovered.
        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }

        // Sweep left-to-right; `cursor` marks how far coverage has reached.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            // A span starting past the cursor means bytes in between are
            // uncovered -- report the first such gap (clamped to the suspect).
            if span.clean_span.start > cursor {
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }

            // Only spans extending past the cursor advance coverage.
            if span.clean_span.end > cursor {
                // Remember the first covering span whose class differs; only
                // reported if coverage turns out to be complete.
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }

        // Coverage stopped short of the suspect's end: trailing gap.
        if cursor < suspect_span.end {
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }

        // Fully covered; surface a class mismatch if one was seen, else None.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
313
314fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
315    left.start < right.end && right.start < left.end
316}
317
318/// Suspected leak reported by an observer-only safety net.
319#[derive(Debug, Clone, PartialEq)]
320#[non_exhaustive]
321pub struct LeakSuspect {
322    /// Byte span in clean text.
323    pub span: Range<usize>,
324    /// Mapped PII class for the suspect.
325    pub class: PiiClass,
326    /// Safety-net backend identifier.
327    pub safety_net_id: String,
328    /// Optional backend confidence score.
329    pub score: Option<f32>,
330    /// Leak classification after manifest correlation.
331    pub kind: LeakKind,
332    /// Raw backend label after validation/mapping, never source text.
333    pub raw_label: String,
334    /// Optional structured field path.
335    pub field_path: Option<String>,
336}
337
338impl LeakSuspect {
339    /// Builds a safety-net leak suspect.
340    pub fn new(
341        span: Range<usize>,
342        class: PiiClass,
343        safety_net_id: impl Into<String>,
344        score: Option<f32>,
345        kind: LeakKind,
346        raw_label: impl Into<String>,
347        field_path: Option<String>,
348    ) -> Self {
349        Self {
350            span,
351            class,
352            safety_net_id: safety_net_id.into(),
353            score,
354            kind,
355            raw_label: raw_label.into(),
356            field_path,
357        }
358    }
359}
360
361/// The category of a suspected missed PII span.
362///
363/// `LeakKind` is `#[non_exhaustive]`. Match with a wildcard for forward compatibility.
364#[derive(Debug, Clone, PartialEq, Eq)]
365#[non_exhaustive]
366pub enum LeakKind {
367    /// No same-class emitted token overlaps the suspect span.
368    Uncovered,
369    /// The suspect is only partly covered; `uncovered` is the first gap.
370    PartialBleed {
371        /// First uncovered byte range in the suspect span.
372        uncovered: Range<usize>,
373    },
374    /// The suspect is continuously covered, but by a different class.
375    ClassMismatch {
376        /// Class emitted by the pipeline.
377        pipeline_class: PiiClass,
378        /// Class reported by the safety net.
379        safety_net_class: PiiClass,
380    },
381}
382
383/// Bytes-free telemetry emitted by safety-net orchestration.
384#[derive(Debug, Clone, PartialEq, Eq)]
385#[non_exhaustive]
386pub enum LeakReportTelemetry {
387    /// Safety net skipped because the session-level locale chain did not match.
388    LocaleSkipped {
389        /// Safety-net backend identifier.
390        safety_net_id: String,
391        /// Document kind checked.
392        document_kind: DocumentKind,
393        /// Optional structured field path when skip was recorded per field.
394        field_path: Option<String>,
395    },
396}
397
398/// Aggregate leak report statistics.
399#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
400#[non_exhaustive]
401pub struct LeakReportStats {
402    /// Number of suspects reported.
403    pub suspect_count: usize,
404    /// Number of uncovered suspects.
405    pub uncovered_count: usize,
406    /// Number of partial-bleed suspects.
407    pub partial_bleed_count: usize,
408    /// Number of class-mismatch suspects.
409    pub class_mismatch_count: usize,
410    /// Number of locale-skip telemetry events.
411    pub locale_skipped_count: usize,
412}
413
414/// Signed document-context metadata carried inside a session snapshot envelope.
415///
416/// This extension is the v0.7 bridge for `gaze-document`: it is safe to serialize
417/// inside the owner-only snapshot envelope, while agent-facing files keep using
418/// non-sensitive mirrors. The single `schema_version` is bundle-level; sub-files
419/// do not carry independent schema versions.
420#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
421#[non_exhaustive]
422pub struct DocumentExtension {
423    /// Bundle-level schema version shared by clean, layout, preview, report, and manifest files.
424    pub schema_version: u16,
425    /// SHA-256 of `clean.md` NFC-normalized bytes.
426    pub clean_md_sha256: [u8; 32],
427    /// SHA-256 of canonical `layout.json` bytes.
428    pub layout_json_sha256: [u8; 32],
429    /// SHA-256 of canonical `report.json` bytes.
430    pub report_json_sha256: [u8; 32],
431    /// SHA-256 of `preview-redacted.png` bytes when a preview is present.
432    #[serde(default, skip_serializing_if = "Option::is_none")]
433    pub preview_png_sha256: Option<[u8; 32]>,
434    /// Page count reported for the source document.
435    pub page_count: u32,
436    /// Audit session id mirrored from the writing session for cross-pane correlation.
437    pub audit_session_id: String,
438    /// Signed clean.md byte spans for every emitted token.
439    #[serde(default, skip_serializing_if = "Vec::is_empty")]
440    pub clean_spans: Vec<EmittedTokenSpan>,
441    /// Codec audit rows for the decode path that produced this document extension.
442    #[serde(default, skip_serializing_if = "Vec::is_empty")]
443    pub codec_audit: Vec<CodecAuditRow>,
444}
445
446impl DocumentExtension {
447    /// Starts a document extension builder for one bundle schema version.
448    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
449        DocumentExtensionBuilder {
450            schema_version,
451            clean_md_sha256: None,
452            layout_json_sha256: None,
453            report_json_sha256: None,
454            preview_png_sha256: None,
455            page_count: None,
456            audit_session_id: None,
457            clean_spans: Vec::new(),
458            codec_audit: Vec::new(),
459        }
460    }
461}
462
463/// Builder for [`DocumentExtension`] that requires signed integrity-binding fields.
464#[derive(Debug, Clone)]
465#[must_use]
466pub struct DocumentExtensionBuilder {
467    schema_version: u16,
468    clean_md_sha256: Option<[u8; 32]>,
469    layout_json_sha256: Option<[u8; 32]>,
470    report_json_sha256: Option<[u8; 32]>,
471    preview_png_sha256: Option<[u8; 32]>,
472    page_count: Option<u32>,
473    audit_session_id: Option<String>,
474    clean_spans: Vec<EmittedTokenSpan>,
475    codec_audit: Vec<CodecAuditRow>,
476}
477
impl DocumentExtensionBuilder {
    /// Sets the SHA-256 of `clean.md` NFC-normalized bytes (required).
    pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
        self.clean_md_sha256 = Some(hash);
        self
    }

    /// Sets the SHA-256 of canonical `layout.json` bytes (required).
    pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
        self.layout_json_sha256 = Some(hash);
        self
    }

    /// Sets the SHA-256 of canonical `report.json` bytes (required).
    pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
        self.report_json_sha256 = Some(hash);
        self
    }

    /// Sets the SHA-256 of `preview-redacted.png` bytes (optional).
    pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
        self.preview_png_sha256 = Some(hash);
        self
    }

    /// Sets the source document's page count (required).
    pub fn page_count(mut self, page_count: u32) -> Self {
        self.page_count = Some(page_count);
        self
    }

    /// Sets the mirrored audit session id (required).
    pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
        self.audit_session_id = Some(audit_session_id.into());
        self
    }

    /// Replaces the signed clean.md token spans (optional; defaults to empty).
    pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
        self.clean_spans = clean_spans;
        self
    }

    /// Replaces the codec audit rows (optional; defaults to empty).
    pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
        self.codec_audit = codec_audit;
        self
    }

    /// Finalizes the extension.
    ///
    /// Returns [`DocumentExtensionError::MissingField`] naming the first
    /// required field that was never set.
    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
        Ok(DocumentExtension {
            schema_version: self.schema_version,
            clean_md_sha256: self
                .clean_md_sha256
                .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
            layout_json_sha256: self
                .layout_json_sha256
                .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
            report_json_sha256: self
                .report_json_sha256
                .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
            preview_png_sha256: self.preview_png_sha256,
            page_count: self
                .page_count
                .ok_or(DocumentExtensionError::MissingField("page_count"))?,
            audit_session_id: self
                .audit_session_id
                .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
            clean_spans: self.clean_spans,
            codec_audit: self.codec_audit,
        })
    }
}
543
/// Errors returned while building a [`DocumentExtension`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A required builder field was never set; carries the field name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
551
552/// Provenance of text extracted from a document or transcript source.
553#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
554#[serde(rename_all = "snake_case")]
555#[non_exhaustive]
556pub enum TextOrigin {
557    /// Text came from OCR over pixels.
558    Ocr,
559    /// Text came from an embedded text layer.
560    EmbeddedText,
561    /// Text came from an audio/video transcript.
562    Transcript,
563    /// Text came from multiple extraction paths.
564    Hybrid,
565}
566
567/// Orthogonal document codec capabilities delivered or advertised by a codec.
568#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
569#[non_exhaustive]
570pub struct CodecCapabilitySet {
571    /// Codec can emit text.
572    pub text: bool,
573    /// Codec can emit layout geometry.
574    pub layout: bool,
575    /// Codec can emit confidence buckets.
576    pub confidence: bool,
577    /// Codec can emit timestamps.
578    pub timestamps: bool,
579}
580
581impl CodecCapabilitySet {
582    /// Text-only capability set.
583    pub const TEXT_ONLY: Self = Self {
584        text: true,
585        layout: false,
586        confidence: false,
587        timestamps: false,
588    };
589
590    /// Builds a codec capability bitset.
591    pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
592        Self {
593            text,
594            layout,
595            confidence,
596            timestamps,
597        }
598    }
599
600    /// Returns true when this set contains every requested capability bit.
601    pub fn contains(self, requested: Self) -> bool {
602        (!requested.text || self.text)
603            && (!requested.layout || self.layout)
604            && (!requested.confidence || self.confidence)
605            && (!requested.timestamps || self.timestamps)
606    }
607}
608
609/// Per-codec declaration for text extraction density checks.
610#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
611#[serde(rename_all = "snake_case")]
612#[non_exhaustive]
613pub enum ExtractionDensityPolicy {
614    /// Require at least this many extracted text bytes per source KiB.
615    Required(f32),
616    /// Explicit exemption with an audit-visible reason.
617    Exempt { reason: String },
618}
619
620impl Default for ExtractionDensityPolicy {
621    fn default() -> Self {
622        Self::Exempt {
623            reason: "calibration_pending".to_string(),
624        }
625    }
626}
627
628/// Metadata-only audit row emitted by a document codec.
629#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
630#[non_exhaustive]
631pub struct CodecAuditRow {
632    /// Stable codec id, such as `gaze.codec.tesseract`.
633    pub codec_id: String,
634    /// Adapter crate version, distinct from engine provenance.
635    pub codec_version: String,
636    /// Accepted MIME type for the decode.
637    pub accepted_mime: String,
638    /// Capabilities advertised by the codec.
639    pub advertised: CodecCapabilitySet,
640    /// Capabilities delivered for this decode.
641    pub delivered: CodecCapabilitySet,
642    /// Text provenance reported by the codec.
643    pub text_origin: TextOrigin,
644    /// Codec-output schema version, decoupled from bundle schema version.
645    pub codec_output_schema_version: u16,
646    /// Hash of canonical codec options, never the options themselves.
647    #[serde(default, skip_serializing_if = "Option::is_none")]
648    pub options_hash_hex: Option<String>,
649    /// Engine provenance string, without paths or raw source text.
650    #[serde(default, skip_serializing_if = "Option::is_none")]
651    pub engine_provenance: Option<String>,
652    /// Extraction density policy declared by the codec for this MIME.
653    pub extraction_density_policy: ExtractionDensityPolicy,
654}
655
656impl CodecAuditRow {
657    /// Builds a metadata-only codec audit row.
658    pub fn new(
659        codec_id: impl Into<String>,
660        codec_version: impl Into<String>,
661        accepted_mime: impl Into<String>,
662        text_origin: TextOrigin,
663    ) -> Self {
664        Self {
665            codec_id: codec_id.into(),
666            codec_version: codec_version.into(),
667            accepted_mime: accepted_mime.into(),
668            advertised: CodecCapabilitySet::default(),
669            delivered: CodecCapabilitySet::default(),
670            text_origin,
671            codec_output_schema_version: 1,
672            options_hash_hex: None,
673            engine_provenance: None,
674            extraction_density_policy: ExtractionDensityPolicy::default(),
675        }
676    }
677}
678
679/// A suspected missed PII span reported by a [`SafetyNet`].
680///
681/// The safety net is not authoritative; a `LeakReport` is a signal, not a confirmed
682/// leak. False positives are expected. Review reports and adjust policy or recognizer
683/// thresholds.
684#[derive(Debug, Clone, Default, PartialEq)]
685#[non_exhaustive]
686pub struct LeakReport {
687    /// Suspected leaks, containing metadata only.
688    pub suspects: Vec<LeakSuspect>,
689    /// Bytes-free telemetry events.
690    pub telemetry: Vec<LeakReportTelemetry>,
691    /// Aggregated counts for callers that do not need full suspect metadata.
692    pub stats: LeakReportStats,
693    /// Optional replay hash.
694    ///
695    /// Replay determinism is guaranteed only when command path, checkpoint,
696    /// operating point, min score, and decode parameters are fixed externally.
697    pub replay_hash: Option<String>,
698}
699
700impl LeakReport {
701    /// Builds a report from suspects and telemetry.
702    pub fn from_parts(
703        suspects: Vec<LeakSuspect>,
704        telemetry: Vec<LeakReportTelemetry>,
705    ) -> LeakReport {
706        let mut stats = LeakReportStats {
707            suspect_count: suspects.len(),
708            locale_skipped_count: telemetry
709                .iter()
710                .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
711                .count(),
712            ..LeakReportStats::default()
713        };
714        for suspect in &suspects {
715            match suspect.kind {
716                LeakKind::Uncovered => stats.uncovered_count += 1,
717                LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
718                LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
719            }
720        }
721        LeakReport {
722            suspects,
723            telemetry,
724            stats,
725            replay_hash: None,
726        }
727    }
728
729    /// Merges another report into this report.
730    pub fn extend(&mut self, other: LeakReport) {
731        self.suspects.extend(other.suspects);
732        self.telemetry.extend(other.telemetry);
733        *self = LeakReport::from_parts(
734            std::mem::take(&mut self.suspects),
735            std::mem::take(&mut self.telemetry),
736        );
737    }
738}
739
/// Closed set of upstream OpenAI Privacy Filter labels that Gaze accepts.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    /// `private_person`.
    PrivatePerson,
    /// `private_address`.
    PrivateAddress,
    /// `private_email`.
    PrivateEmail,
    /// `private_phone`.
    PrivatePhone,
    /// `private_url`.
    PrivateUrl,
    /// `private_date`.
    PrivateDate,
    /// `account_number`.
    AccountNumber,
    /// `secret`.
    Secret,
}

impl OpenAiPrivateLabel {
    /// Returns the raw upstream label string.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::PrivatePerson => "private_person",
            Self::PrivateAddress => "private_address",
            Self::PrivateEmail => "private_email",
            Self::PrivatePhone => "private_phone",
            Self::PrivateUrl => "private_url",
            Self::PrivateDate => "private_date",
            Self::AccountNumber => "account_number",
            Self::Secret => "secret",
        }
    }
}
777
/// Closed safety-net PII vocabulary, prior to mapping into `PiiClass`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    /// Email address.
    Email,
    /// Person name.
    Name,
    /// Location or address.
    Location,
    /// Phone number.
    Phone,
    /// URL.
    Url,
    /// Date.
    Date,
    /// Account number.
    AccountNumber,
    /// Secret.
    Secret,
}
799
800impl SafetyNetPiiClass {
801    /// Maps the safety-net class into the shared pipeline class vocabulary.
802    pub fn to_pii_class(self) -> PiiClass {
803        match self {
804            Self::Email => PiiClass::Email,
805            Self::Name => PiiClass::Name,
806            Self::Location => PiiClass::Location,
807            Self::Phone => PiiClass::custom("phone"),
808            Self::Url => PiiClass::custom("url"),
809            Self::Date => PiiClass::custom("date"),
810            Self::AccountNumber => PiiClass::custom("account_number"),
811            Self::Secret => PiiClass::custom("secret"),
812        }
813    }
814}
815
816/// Exhaustive, closed error set for safety-net execution.
817#[derive(Debug, Clone, PartialEq, Eq, Error)]
818#[non_exhaustive]
819pub enum SafetyNetError {
820    /// Safety net was explicitly requested but is unavailable.
821    #[error("safety net unavailable: {reason}")]
822    Unavailable {
823        /// Sanitized reason.
824        reason: String,
825    },
826    /// Required model weights or checkpoint are missing.
827    #[error("safety net weights missing: {path}")]
828    WeightsMissing {
829        /// Sanitized path or identifier.
830        path: String,
831    },
832    /// Backend model could not be loaded or reached.
833    #[error("safety net model unavailable: {reason}")]
834    ModelUnavailable {
835        /// Sanitized reason.
836        reason: String,
837    },
838    /// Input exceeded configured backend limit.
839    #[error("safety net input too large: limit={limit}, actual={actual}")]
840    InputTooLarge {
841        /// Configured byte limit.
842        limit: usize,
843        /// Actual byte length.
844        actual: usize,
845    },
846    /// Backend runtime failed.
847    #[error("safety net runtime failed: {message}")]
848    Runtime {
849        /// Sanitized diagnostic message.
850        message: String,
851    },
852    /// Backend returned invalid output.
853    #[error("safety net invalid output: {message}")]
854    InvalidOutput {
855        /// Sanitized diagnostic message.
856        message: String,
857    },
858}
859
/// Disposition applied to a detected PII span.
///
/// | Variant | Restorable | Output shape |
/// |---------|------------|--------------|
/// | `Tokenize` | Yes | Opaque token: `<hex:Class_N>` |
/// | `FormatPreserve` | Yes | Realistic-looking pseudonym (e.g., `email1.hex@gaze-fake.invalid`) |
/// | `Redact` | No | Literal `[REDACTED]` -- original value is gone |
/// | `Generalize` | No | Class label (e.g., `[Email]`) -- original value is gone |
/// | `Preserve` | - | Passes through unchanged |
///
/// `Action` is `#[non_exhaustive]`, so include a wildcard arm when matching.
/// When restore is required, choose `Tokenize` or `FormatPreserve`; `Redact`
/// and `Generalize` are irreversible.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace PII with a reversible token.
    Tokenize,
    /// Replace PII with a non-restorable redaction marker.
    Redact,
    /// Replace PII with a reversible format-preserving token.
    FormatPreserve,
    /// Replace PII with a broader category.
    Generalize,
    /// Preserve the original value.
    Preserve,
}
887
/// Conflict resolution tier that selected or rejected a candidate.
///
/// NOTE(review): the variant order appears to mirror the resolution cascade
/// (class priority -> rule priority -> score -> span length -> validator ->
/// recognizer id) -- confirm against the conflict resolver before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict resolution was needed.
    None,
    /// Class priority decided the conflict.
    ClassPriority,
    /// Rule priority decided the conflict.
    RulePriority,
    /// Candidate score decided the conflict.
    Score,
    /// Span length decided the conflict.
    SpanLength,
    /// Validator result decided the conflict.
    Validator,
    /// Recognizer identifier decided the conflict.
    RecognizerId,
    /// Candidate was merged with another candidate.
    Merged,
}
909
/// Source document kind for metadata-only audit logging.
///
/// Mirrors the two shapes of [`RawDocument`] without carrying any content.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    /// Structured key/value document.
    Structured,
    /// Plain text document.
    Text,
}
919
/// One row of redaction metadata emitted to a [`RedactionLogger`].
///
/// Fields identify the PII class, action taken, session ID, source document kind,
/// conflict-resolution metadata, and timestamp. Does **not** contain the original PII
/// value, the token string, or any identifiable content beyond what a compliance audit
/// requires.
///
/// `RedactionEntry` is `#[non_exhaustive]`; adopters must construct via the public
/// constructor or destructure with a wildcard pattern.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Detector or recognizer source identifier.
    pub source: String,
    /// PII class affected by the decision.
    pub class: PiiClass,
    /// Policy action applied to the span.
    pub action: Action,
    /// Optional structured field name.
    /// NOTE(review): presumably only set for structured documents -- confirm.
    pub field_name: Option<String>,
    /// Source document kind.
    pub document_kind: DocumentKind,
    /// Whether this entry records a loser in conflict resolution.
    pub conflict_loser: bool,
    /// Conflict tier that decided the outcome.
    pub decided_by: ConflictTier,
    /// Creation timestamp in epoch milliseconds.
    pub created_at: i64,
    /// Optional session identifier.
    pub session_id: Option<String>,
}
951
952impl RedactionEntry {
953    /// Builds a metadata-only redaction log entry.
954    #[allow(clippy::too_many_arguments)]
955    pub fn new(
956        source: impl Into<String>,
957        class: PiiClass,
958        action: Action,
959        field_name: Option<String>,
960        document_kind: DocumentKind,
961        conflict_loser: bool,
962        decided_by: ConflictTier,
963        created_at: i64,
964        session_id: Option<String>,
965    ) -> Self {
966        Self {
967            source: source.into(),
968            class,
969            action,
970            field_name,
971            document_kind,
972            conflict_loser,
973            decided_by,
974            created_at,
975            session_id,
976        }
977    }
978}
979
/// Closed error set for redaction log sinks.
///
/// NOTE(review): payload strings are expected to be pre-sanitized diagnostic
/// messages (no PII) -- confirm against sink implementations.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    /// SQLite-backed redaction log sink failed. Carries a diagnostic message.
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    /// Non-SQLite redaction log sink failed. Carries a diagnostic message.
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
991
/// Trait for audit sinks that receive redaction metadata.
///
/// Implement this for custom audit backends (remote telemetry, structured JSON logs).
/// For SQLite-backed persistence, use `gaze_audit::SqliteLogger`.
///
/// # Contract
///
/// The logger receives **metadata only**: class, action, session ID, timestamp, and
/// other bytes-free audit labels. It never receives the original PII value or the token
/// value. A custom impl that augments entries with raw document text violates the audit
/// isolation contract and will be flagged by the `gaze_module_isolation` Dylint lint
/// when it lives in the wrong crate.
///
/// # Example
///
/// ```rust
/// use std::sync::atomic::{AtomicUsize, Ordering};
/// use gaze_types::{RedactionEntry, RedactionLogError, RedactionLogger};
///
/// #[derive(Default)]
/// struct CountLogger(AtomicUsize);
///
/// impl RedactionLogger for CountLogger {
///     fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
///         self.0.fetch_add(1, Ordering::Relaxed);
///         Ok(())
///     }
/// }
/// ```
pub trait RedactionLogger: Send + Sync {
    /// Records a metadata-only redaction entry.
    ///
    /// # Errors
    ///
    /// Returns a [`RedactionLogError`] when the underlying sink fails.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
1025
/// Locale tag recognized by policy and recognizers.
///
/// Built-in tags round-trip through [`LocaleTag::parse`] and
/// [`LocaleTag::as_str`] (e.g. `"de-DE"` -> `DeDe` -> `"de-DE"`).
/// `Other` carries a canonicalized tag -- NOTE(review): canonicalization is
/// performed by `canonical_other`, defined elsewhere in this file.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Locale-independent recognizer or policy.
    Global,
    /// German as used in Germany.
    DeDe,
    /// German as used in Austria.
    DeAt,
    /// German as used in Switzerland.
    DeCh,
    /// English as used in the United States.
    EnUs,
    /// English as used in Great Britain.
    EnGb,
    /// English as used in Ireland.
    EnIe,
    /// English as used in Australia.
    EnAu,
    /// English as used in Canada.
    EnCa,
    /// Any other canonical BCP-47-like tag.
    Other(String),
}
1051
/// Locale parsing error.
///
/// Returned by [`LocaleTag::parse`] and [`LocaleChain::from_cli`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// Locale tag is unsupported or invalid (including the empty string).
    Unsupported,
}
1059
1060impl fmt::Display for LocaleError {
1061    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1062        match self {
1063            LocaleError::Unsupported => f.write_str("unsupported locale"),
1064        }
1065    }
1066}
1067
1068impl std::error::Error for LocaleError {}
1069
/// Ordered locale fallback chain.
///
/// Constructors guarantee a global fallback is present (appended when absent;
/// see [`LocaleChain::from_tags`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
1073
1074impl LocaleTag {
1075    /// Global locale constant.
1076    pub const GLOBAL: LocaleTag = LocaleTag::Global;
1077
1078    /// Parses a locale tag from policy or CLI input.
1079    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
1080        let raw = s.trim().replace('_', "-");
1081        let normalized = raw.to_ascii_lowercase();
1082        match normalized.as_str() {
1083            "global" | "*" => Ok(LocaleTag::Global),
1084            "de-de" => Ok(LocaleTag::DeDe),
1085            "de-at" => Ok(LocaleTag::DeAt),
1086            "de-ch" => Ok(LocaleTag::DeCh),
1087            "en-us" => Ok(LocaleTag::EnUs),
1088            "en-gb" => Ok(LocaleTag::EnGb),
1089            "en-ie" => Ok(LocaleTag::EnIe),
1090            "en-au" => Ok(LocaleTag::EnAu),
1091            "en-ca" => Ok(LocaleTag::EnCa),
1092            "" => Err(LocaleError::Unsupported),
1093            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
1094            _ => Err(LocaleError::Unsupported),
1095        }
1096    }
1097
1098    /// Returns the canonical string form of the locale tag.
1099    pub fn as_str(&self) -> &str {
1100        match self {
1101            LocaleTag::Global => "global",
1102            LocaleTag::DeDe => "de-DE",
1103            LocaleTag::DeAt => "de-AT",
1104            LocaleTag::DeCh => "de-CH",
1105            LocaleTag::EnUs => "en-US",
1106            LocaleTag::EnGb => "en-GB",
1107            LocaleTag::EnIe => "en-IE",
1108            LocaleTag::EnAu => "en-AU",
1109            LocaleTag::EnCa => "en-CA",
1110            LocaleTag::Other(tag) => tag.as_str(),
1111        }
1112    }
1113}
1114
1115impl LocaleChain {
1116    /// Builds a locale chain and appends global fallback when absent.
1117    pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
1118        ensure_global(&mut tags);
1119        LocaleChain(tags)
1120    }
1121
1122    /// Parses a comma-separated CLI locale chain.
1123    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
1124        let tags = raw
1125            .split(',')
1126            .map(LocaleTag::parse)
1127            .collect::<Result<Vec<_>, _>>()?;
1128        Ok(LocaleChain::from_tags(tags))
1129    }
1130
1131    /// Merges policy and CLI locale preferences.
1132    pub fn merge_policy_and_cli(
1133        policy: Option<&[LocaleTag]>,
1134        cli: Option<&[LocaleTag]>,
1135    ) -> LocaleChain {
1136        Self::merge_cli_policy_rulepack_default(cli, policy, None)
1137    }
1138
1139    /// Merges CLI, policy, rulepack, and default locale preferences.
1140    pub fn merge_cli_policy_rulepack_default(
1141        cli: Option<&[LocaleTag]>,
1142        policy: Option<&[LocaleTag]>,
1143        rulepack_defaults: Option<&[LocaleTag]>,
1144    ) -> LocaleChain {
1145        let tags = cli
1146            .filter(|tags| !tags.is_empty())
1147            .or_else(|| policy.filter(|tags| !tags.is_empty()))
1148            .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
1149            .map(|tags| tags.to_vec())
1150            .unwrap_or_else(|| vec![LocaleTag::Global]);
1151        LocaleChain::from_tags(tags)
1152    }
1153
1154    /// Returns true when a recognizer can run under this locale chain.
1155    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
1156        if recognizer_locales.is_empty() {
1157            return true;
1158        }
1159        recognizer_locales.iter().any(|recognizer_locale| {
1160            *recognizer_locale == LocaleTag::Global
1161                || self.0.iter().any(|active| active == recognizer_locale)
1162        })
1163    }
1164
1165    /// Returns the locale tags in chain order.
1166    pub fn as_slice(&self) -> &[LocaleTag] {
1167        &self.0
1168    }
1169
1170    /// Returns the locale chain as canonical strings.
1171    pub fn to_strings(&self) -> Vec<String> {
1172        self.0.iter().map(ToString::to_string).collect()
1173    }
1174}
1175
1176impl From<&[LocaleTag]> for LocaleChain {
1177    fn from(tags: &[LocaleTag]) -> Self {
1178        let mut owned = tags.to_vec();
1179        ensure_global(&mut owned);
1180        LocaleChain(owned)
1181    }
1182}
1183
1184impl fmt::Display for LocaleTag {
1185    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1186        f.write_str(self.as_str())
1187    }
1188}
1189
/// The input document submitted for pseudonymization.
///
/// `RawDocument::Text(String)` for plain or semi-structured text (most LLM workflows).
/// `RawDocument::Structured(BTreeMap<String, Value>)` for JSON-shaped data where
/// column-aware rules apply -- `ColumnRule`s only take effect on structured input.
///
/// `Detection::span` and recognizer candidate spans use **byte** ranges, not char indices.
///
/// `RawDocument` is `#[non_exhaustive]`. Match with a wildcard arm.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    /// Structured document values, keyed by field name.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
1207
/// The pseudonymized output from `Pipeline::redact`.
///
/// Mirrors the shape of `RawDocument`: `CleanDocument::Text(String)` or
/// `CleanDocument::Structured(BTreeMap<String, Value>)`. Destructure with a `let`-else
/// or `match`; **there is no `.text()` accessor**.
///
/// ```rust
/// use gaze_types::CleanDocument;
///
/// fn unwrap_text(doc: CleanDocument) -> Option<String> {
///     if let CleanDocument::Text(t) = doc { Some(t) } else { None }
/// }
/// ```
///
/// Contains only tokens or redacted placeholders -- no original PII values.
/// Send this (or its inner string) to the LLM; never send the original `RawDocument`.
///
/// Serializes untagged: `Text` becomes a bare JSON string, `Structured` a JSON object.
///
/// `CleanDocument` is `#[non_exhaustive]`.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    /// Structured document values, keyed by field name.
    Structured(BTreeMap<String, Value>),
    /// Plain text document.
    Text(String),
}
1235
/// Minimal structured value representation that avoids a serde_json dependency.
///
/// Serializes untagged (each variant as its natural JSON shape).
/// NOTE(review): only `Serialize` is derived here; deserialization presumably
/// lives elsewhere -- confirm before adding a `Deserialize` derive.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    /// Null value.
    Null,
    /// Boolean value.
    Bool(bool),
    /// String value.
    String(String),
    /// Signed 64-bit integer value.
    I64(i64),
    /// Array value.
    Array(Vec<Value>),
    /// Object value, keyed by field name.
    Object(BTreeMap<String, Value>),
}
1254
1255impl Value {
1256    /// Returns the inner string for string values.
1257    pub fn as_str(&self) -> Option<&str> {
1258        match self {
1259            Self::String(value) => Some(value.as_str()),
1260            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
1261        }
1262    }
1263
1264    /// Returns a scalar string representation used for structured safety-net checks.
1265    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
1266        match self {
1267            Self::String(value) if !value.is_empty() => Some(value.clone()),
1268            Self::String(_) | Self::Null | Self::Array(_) | Self::Object(_) => None,
1269            Self::Bool(value) => Some(value.to_string()),
1270            Self::I64(value) => Some(value.to_string()),
1271        }
1272    }
1273}
1274
1275impl PartialEq<&str> for Value {
1276    fn eq(&self, other: &&str) -> bool {
1277        self.as_str() == Some(*other)
1278    }
1279}
1280
/// Value-only dictionary bundle shared with recognizers.
///
/// Merging two bundles prefers entries from the second bundle on name
/// conflicts (see [`DictionaryBundle::merge`]).
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    // Keyed by dictionary name.
    entries: HashMap<String, DictionaryEntry>,
}
1286
/// Value-only dictionary entry; compiled automatons live outside `gaze-types`.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Non-empty term list, validated by `DictionaryEntry::new`.
    terms: Vec<String>,
    // When false, terms must be ASCII-only (enforced at construction).
    case_sensitive: bool,
    // Where the dictionary came from (request context or rulepack).
    source: DictionarySource,
}
1294
/// Source of a dictionary entry.
///
/// Recorded in [`DictionaryStats`] for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    /// Dictionary supplied by request context.
    Cli,
    /// Dictionary supplied by a rulepack.
    Rulepack,
}
1304
/// Dictionary metadata used for diagnostics and tests.
///
/// Produced by [`DictionaryBundle::stats`], sorted by name.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    /// Dictionary name.
    pub name: String,
    /// Number of configured terms.
    pub term_count: usize,
    /// Dictionary source.
    pub source: DictionarySource,
}
1316
1317impl DictionaryStats {
1318    /// Builds dictionary diagnostics metadata.
1319    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
1320        Self {
1321            name: name.into(),
1322            term_count,
1323            source,
1324        }
1325    }
1326}
1327
/// Dictionary declared by a rulepack.
///
/// Raw declaration only; validation happens when the terms are turned into a
/// [`DictionaryEntry`] (see `DictionaryBundle::from_rulepack_terms`).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    /// Dictionary name.
    pub name: String,
    /// Dictionary terms.
    pub terms: Vec<String>,
    /// Whether matching is case-sensitive.
    pub case_sensitive: bool,
}
1339
1340impl RulepackDict {
1341    /// Builds a rulepack dictionary declaration.
1342    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
1343        Self {
1344            name: name.into(),
1345            terms,
1346            case_sensitive,
1347        }
1348    }
1349}
1350
/// Error raised when constructing invalid dictionary entries.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// Dictionary has no terms.
    Empty {
        /// Name of the offending dictionary.
        name: String,
    },
    /// ASCII-only case-insensitive matching cannot safely cover this entry.
    UnicodeInsensitiveUnsupported {
        /// Name of the offending dictionary.
        name: String,
    },
}
1360
1361impl fmt::Display for DictionaryLoadError {
1362    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1363        match self {
1364            Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
1365            Self::UnicodeInsensitiveUnsupported { name } => write!(
1366                f,
1367                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
1368            ),
1369        }
1370    }
1371}
1372
1373impl std::error::Error for DictionaryLoadError {}
1374
1375impl DictionaryBundle {
1376    /// Builds a bundle from rulepack dictionaries.
1377    pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
1378        let mut entries = HashMap::with_capacity(terms.len());
1379        for dictionary in terms {
1380            let entry = DictionaryEntry::new(
1381                &dictionary.name,
1382                dictionary.terms.clone(),
1383                dictionary.case_sensitive,
1384                DictionarySource::Rulepack,
1385            )
1386            .expect("Policy validates dictionary terms before bundle construction");
1387            entries.insert(dictionary.name.clone(), entry);
1388        }
1389        Self { entries }
1390    }
1391
1392    /// Builds a bundle from pre-built dictionary entries.
1393    pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
1394        Self {
1395            entries: entries.into_iter().collect(),
1396        }
1397    }
1398
1399    /// Merges two bundles, preferring entries from the second bundle on name conflicts.
1400    pub fn merge(a: Self, b: Self) -> Self {
1401        let mut entries = a.entries;
1402        entries.extend(b.entries);
1403        Self { entries }
1404    }
1405
1406    /// Returns a dictionary by name.
1407    pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
1408        self.entries.get(name)
1409    }
1410
1411    /// Returns sorted dictionary stats.
1412    pub fn stats(&self) -> Vec<DictionaryStats> {
1413        let mut stats = self
1414            .entries
1415            .iter()
1416            .map(|(name, entry)| DictionaryStats {
1417                name: name.clone(),
1418                term_count: entry.terms.len(),
1419                source: entry.source,
1420            })
1421            .collect::<Vec<_>>();
1422        stats.sort_by(|a, b| a.name.cmp(&b.name));
1423        stats
1424    }
1425}
1426
1427impl DictionaryEntry {
1428    /// Creates a validated value-only dictionary entry.
1429    pub fn new(
1430        name: &str,
1431        terms: Vec<String>,
1432        case_sensitive: bool,
1433        source: DictionarySource,
1434    ) -> Result<Self, DictionaryLoadError> {
1435        if terms.is_empty() {
1436            return Err(DictionaryLoadError::Empty {
1437                name: name.to_string(),
1438            });
1439        }
1440        if !case_sensitive && terms.iter().any(|term| !term.is_ascii()) {
1441            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
1442                name: name.to_string(),
1443            });
1444        }
1445        Ok(Self {
1446            terms,
1447            case_sensitive,
1448            source,
1449        })
1450    }
1451
1452    /// Returns whether matching is case-sensitive.
1453    pub fn case_sensitive(&self) -> bool {
1454        self.case_sensitive
1455    }
1456
1457    /// Returns configured dictionary terms.
1458    pub fn terms(&self) -> &[String] {
1459        &self.terms
1460    }
1461}
1462
#[cfg(test)]
mod document_extension_tests {
    // Round-trip and schema-shape tests for the document-extension types
    // (DocumentExtension, CodecAuditRow, CodecCapabilitySet, and friends).
    use super::*;

    // Representative codec audit row: OCR origin, capability set that
    // delivers less than it advertised, and a required density policy.
    fn audit_row() -> CodecAuditRow {
        let mut row = CodecAuditRow::new(
            "gaze.codec.tesseract",
            "gaze-codec-tesseract@0.7.1",
            "image/png",
            TextOrigin::Ocr,
        );
        row.advertised = CodecCapabilitySet::new(true, true, true, false);
        row.delivered = CodecCapabilitySet::new(true, true, false, false);
        row.extraction_density_policy = ExtractionDensityPolicy::Required(1.0);
        row
    }

    // Builder pre-filled with the integrity fields the build step requires.
    fn extension_builder() -> DocumentExtensionBuilder {
        DocumentExtension::builder(1)
            .clean_md_sha256([1; 32])
            .layout_json_sha256([2; 32])
            .report_json_sha256([3; 32])
            .page_count(2)
            .audit_session_id("018f0000-0000-7000-8000-000000000000")
    }

    #[test]
    fn document_extension_round_trips_with_bundle_root_schema_version() {
        let mut row = audit_row();
        row.options_hash_hex = Some("00".repeat(32));
        row.engine_provenance = Some("tesseract@5.3.4".to_string());
        let extension = extension_builder()
            .preview_png_sha256([4; 32])
            .clean_spans(vec![EmittedTokenSpan::new(0..8, 0..12, PiiClass::Email)])
            .codec_audit(vec![row])
            .build()
            .expect("document extension");

        let json = serde_json::to_value(&extension).expect("serialize document extension");

        // Schema version lives only at the bundle root.
        assert_eq!(json["schema_version"], 1);
        assert_eq!(json["clean_md_sha256"].as_array().expect("hash").len(), 32);
        assert_eq!(
            json["layout_json_sha256"].as_array().expect("hash").len(),
            32
        );
        assert_eq!(
            json["report_json_sha256"].as_array().expect("hash").len(),
            32
        );
        assert_eq!(
            json["preview_png_sha256"].as_array().expect("hash").len(),
            32
        );
        assert_eq!(json["page_count"], 2);
        assert_eq!(
            json["audit_session_id"],
            "018f0000-0000-7000-8000-000000000000"
        );
        assert_eq!(json["clean_spans"].as_array().expect("spans").len(), 1);
        // No per-artifact schema versions should leak into the JSON.
        assert!(json.get("clean_schema_version").is_none());
        assert!(json.get("layout_schema_version").is_none());
        assert!(json.get("report_schema_version").is_none());
        assert!(json.get("manifest_schema_version").is_none());

        let decoded: DocumentExtension =
            serde_json::from_value(json).expect("deserialize document extension");
        assert_eq!(decoded, extension);
    }

    #[test]
    fn document_extension_carries_full_integrity_set() {
        let extension = DocumentExtension::builder(1)
            .clean_md_sha256([10; 32])
            .layout_json_sha256([11; 32])
            .report_json_sha256([12; 32])
            .preview_png_sha256([13; 32])
            .page_count(7)
            .audit_session_id("018f0000-0000-7000-8000-000000000001")
            .clean_spans(vec![EmittedTokenSpan::new(5..14, 20..34, PiiClass::Name)])
            .codec_audit(vec![audit_row()])
            .build()
            .expect("document extension");

        let json = serde_json::to_string(&extension).expect("serialize document extension");
        let decoded: DocumentExtension =
            serde_json::from_str(&json).expect("deserialize document extension");

        assert_eq!(decoded, extension);
        assert_eq!(decoded.clean_md_sha256, [10; 32]);
        assert_eq!(decoded.layout_json_sha256, [11; 32]);
        assert_eq!(decoded.report_json_sha256, [12; 32]);
        assert_eq!(decoded.preview_png_sha256, Some([13; 32]));
        assert_eq!(decoded.page_count, 7);
        assert_eq!(
            decoded.audit_session_id,
            "018f0000-0000-7000-8000-000000000001"
        );
        assert_eq!(decoded.clean_spans.len(), 1);
        assert_eq!(decoded.codec_audit.len(), 1);
    }

    #[test]
    fn document_extension_builder_requires_integrity_fields() {
        // Missing everything: the first missing field reported is clean_md_sha256.
        assert_eq!(
            DocumentExtension::builder(1).build(),
            Err(DocumentExtensionError::MissingField("clean_md_sha256"))
        );
        // All hashes present but no session id: audit_session_id is reported.
        assert_eq!(
            DocumentExtension::builder(1)
                .clean_md_sha256([1; 32])
                .layout_json_sha256([2; 32])
                .report_json_sha256([3; 32])
                .page_count(1)
                .build(),
            Err(DocumentExtensionError::MissingField("audit_session_id"))
        );
    }

    #[test]
    fn codec_audit_row_round_trips_without_raw_pii_fields() {
        let row = audit_row();
        let json = serde_json::to_string(&row).expect("serialize codec audit row");

        // Metadata fields are present, but no raw document content may appear.
        assert!(json.contains("\"codec_id\""));
        assert!(!json.contains("alice@example.invalid"));
        assert!(!json.contains("\"raw\""));
        assert_eq!(
            serde_json::from_str::<CodecAuditRow>(&json).expect("deserialize codec audit row"),
            row
        );
    }

    #[test]
    fn text_origin_round_trips() {
        for origin in [
            TextOrigin::Ocr,
            TextOrigin::EmbeddedText,
            TextOrigin::Transcript,
            TextOrigin::Hybrid,
        ] {
            let json = serde_json::to_string(&origin).expect("serialize text origin");
            let decoded: TextOrigin = serde_json::from_str(&json).expect("deserialize text origin");
            assert_eq!(decoded, origin);
        }
    }

    #[test]
    fn codec_capability_set_round_trips_and_contains_requested_bits() {
        let delivered = CodecCapabilitySet::new(true, true, false, false);

        let json = serde_json::to_string(&delivered).expect("serialize capabilities");
        let decoded: CodecCapabilitySet =
            serde_json::from_str(&json).expect("deserialize capabilities");

        assert_eq!(decoded, delivered);
        // contains() is a subset check: a superset request must not match.
        assert!(decoded.contains(CodecCapabilitySet::TEXT_ONLY));
        assert!(!decoded.contains(CodecCapabilitySet::new(true, true, true, false)));
    }

    #[test]
    fn extraction_density_policy_round_trips_closed_variants() {
        for policy in [
            ExtractionDensityPolicy::Required(1.25),
            ExtractionDensityPolicy::Exempt {
                reason: "text_only".to_string(),
            },
        ] {
            let json = serde_json::to_string(&policy).expect("serialize density policy");
            let decoded: ExtractionDensityPolicy =
                serde_json::from_str(&json).expect("deserialize density policy");
            assert_eq!(decoded, policy);
        }
    }

    #[test]
    fn manifest_stats_round_trip_for_document_report_mirrors() {
        let manifest =
            Manifest::from_spans(vec![EmittedTokenSpan::new(0..15, 0..19, PiiClass::Email)]);
        let stats = LeakReportStats {
            suspect_count: 1,
            uncovered_count: 0,
            partial_bleed_count: 0,
            class_mismatch_count: 0,
            locale_skipped_count: 0,
        };

        let manifest_json = serde_json::to_string(&manifest).expect("serialize manifest");
        let stats_json = serde_json::to_string(&stats).expect("serialize stats");

        assert_eq!(
            serde_json::from_str::<Manifest>(&manifest_json).expect("deserialize manifest"),
            manifest
        );
        assert_eq!(
            serde_json::from_str::<LeakReportStats>(&stats_json).expect("deserialize stats"),
            stats
        );
    }
}
1663
#[cfg(test)]
mod dictionary_tests {
    // Fail-closed validation tests for `DictionaryEntry::new`.
    use super::*;

    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let result = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli);
        let err = result.expect_err("empty dictionaries must fail closed");
        match err {
            DictionaryLoadError::Empty { name } => assert_eq!(name, "empty"),
            other => panic!("unexpected error variant: {other:?}"),
        }
    }

    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let err = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli)
            .expect_err("unicode case-insensitive dictionaries must fail closed");
        match err {
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } => {
                assert_eq!(name, "songs");
            }
            other => panic!("unexpected error variant: {other:?}"),
        }
    }
}
1692
#[cfg(test)]
mod redaction_logger_tests {
    // Trait-surface tests: Display stability, object safety, Send + Sync.
    use super::*;

    // No-op sink used only to exercise the trait surface.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    #[test]
    fn redaction_log_error_display_is_stable() {
        let cases = [
            (
                RedactionLogError::Sqlite("write failed".to_string()),
                "sqlite redaction log error: write failed",
            ),
            (
                RedactionLogError::Backend("sink failed".to_string()),
                "backend redaction log error: sink failed",
            ),
        ];
        for (error, expected) in cases {
            assert_eq!(error.to_string(), expected);
        }
    }

    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let entry = RedactionEntry::new(
            "unit-test",
            PiiClass::Email,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        );

        let logger = CapturingLogger;
        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }
}
1743
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    /// Builds an emitted token whose clean and raw spans coincide.
    fn tok(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    /// Runs the safety-net diff for one suspect range/class pair.
    fn check(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let covered = Manifest::from_spans(vec![tok(0, 8, PiiClass::Email)]);
        assert_eq!(check(covered, 0..8, PiiClass::Email), None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let elsewhere = Manifest::from_spans(vec![tok(20, 30, PiiClass::Email)]);
        let leak = check(elsewhere, 0..10, PiiClass::Email);
        assert_eq!(leak, Some(LeakKind::Uncovered));
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let gapped = Manifest::from_spans(vec![
            tok(0, 5, PiiClass::Email),
            tok(10, 15, PiiClass::Email),
        ]);
        let leak = check(gapped, 0..15, PiiClass::Email);
        assert_eq!(leak, Some(LeakKind::PartialBleed { uncovered: 5..10 }));
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let sparse = Manifest::from_spans(vec![
            tok(0, 3, PiiClass::Email),
            tok(5, 7, PiiClass::Email),
            tok(9, 12, PiiClass::Email),
        ]);

        // Reporting only the first gap is intentional for v0.6.1; full gap
        // enumeration is deferred until the report format can carry it.
        let leak = check(sparse, 0..12, PiiClass::Email);
        assert_eq!(leak, Some(LeakKind::PartialBleed { uncovered: 3..5 }));
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let mixed = Manifest::from_spans(vec![
            tok(0, 4, PiiClass::Name),
            tok(4, 8, PiiClass::Location),
        ]);

        let leak = check(mixed, 0..8, PiiClass::Email);
        assert_eq!(
            leak,
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let adjacent = Manifest::from_spans(vec![
            tok(0, 5, PiiClass::Email),
            tok(5, 10, PiiClass::Email),
        ]);
        assert_eq!(check(adjacent, 0..10, PiiClass::Email), None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        // Leading gap before the only token.
        let tail_only = Manifest::from_spans(vec![tok(3, 8, PiiClass::Email)]);
        assert_eq!(
            check(tail_only.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        // Trailing gap after the only token.
        assert_eq!(
            check(tail_only, 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );

        // Internal gap between two tokens.
        let split = Manifest::from_spans(vec![
            tok(0, 3, PiiClass::Email),
            tok(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            check(split, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");

        let manifest = Manifest::from_spans(vec![tok(token_start, text.len(), PiiClass::Email)]);
        assert_eq!(
            check(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        assert_eq!(check(Manifest::default(), 3..3, PiiClass::Email), None);
    }

    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let errors = vec![
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            },
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            },
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            },
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            },
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            },
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            },
        ];

        // No rendered error may ever carry raw PII bytes.
        for err in errors {
            assert!(!err.to_string().contains("alice@example.invalid"));
        }
    }
}
1910
/// Shared recognizer contract for locale-aware PII candidates.
///
/// Implementors must be thread-safe (`Send + Sync`) so one recognizer set can
/// be shared across concurrent detection passes.
pub trait Recognizer: Send + Sync {
    /// Stable recognizer identifier.
    fn id(&self) -> &str;
    /// PII class supported by this recognizer.
    fn supported_class(&self) -> &PiiClass;
    /// Detects PII candidates in the supplied input and context.
    ///
    /// Candidate spans are byte ranges into `input` (see [`Candidate::span`]).
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family used for candidate token emission.
    fn token_family(&self) -> &str;
    /// Locales where this recognizer is active.
    ///
    /// Defaults to `[LocaleTag::Global]`, i.e. active everywhere.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
1926
/// Candidate PII span emitted by a recognizer before final conflict resolution.
///
/// Marked `#[non_exhaustive]`: downstream crates must construct candidates via
/// [`Candidate::new`] so fields can be added without a breaking change.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Byte span in the original input.
    pub span: Range<usize>,
    /// PII class assigned to the span.
    pub class: PiiClass,
    /// Recognizer identifier.
    pub recognizer_id: String,
    /// Recognizer confidence score.
    pub score: f32,
    /// Rule or recognizer priority.
    pub priority: i32,
    /// Optional canonical representation for validation/merge logic.
    pub canonical_form: Option<String>,
    /// Token family used for output token shape.
    pub token_family: String,
    /// Candidate source label.
    pub source: String,
    /// Conflict tier that decided this candidate.
    pub decided_by: ConflictTier,
    /// Sources merged into this candidate.
    pub merged_sources: Vec<String>,
}
1952
1953impl Candidate {
1954    /// Builds a recognizer candidate.
1955    #[allow(clippy::too_many_arguments)]
1956    pub fn new(
1957        span: Range<usize>,
1958        class: PiiClass,
1959        recognizer_id: impl Into<String>,
1960        score: f32,
1961        priority: i32,
1962        canonical_form: Option<String>,
1963        token_family: impl Into<String>,
1964        source: impl Into<String>,
1965        decided_by: ConflictTier,
1966        merged_sources: Vec<String>,
1967    ) -> Self {
1968        Self {
1969            span,
1970            class,
1971            recognizer_id: recognizer_id.into(),
1972            score,
1973            priority,
1974            canonical_form,
1975            token_family: token_family.into(),
1976            source: source.into(),
1977            decided_by,
1978            merged_sources,
1979        }
1980    }
1981
1982    /// Returns this candidate with a translated span.
1983    pub fn with_span(mut self, span: Range<usize>) -> Self {
1984        self.span = span;
1985        self
1986    }
1987}
1988
/// Context supplied to recognizers during detection.
///
/// Recognizers receive this by shared reference; `degraded` is a [`Cell`] so
/// they can flag degradation through `&DetectContext` without `&mut` access.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Active locale chain.
    pub locale_chain: &'a [LocaleTag],
    /// Active dictionary bundle.
    pub dictionaries: &'a DictionaryBundle,
    /// Reserved field-aware matching slot; intentionally unit in v0.5 Phase B.
    pub fields: &'a (),
    /// Whether a recognizer degraded due to unavailable optional capability.
    pub degraded: Cell<bool>,
}
2001
2002impl<'a> DetectContext<'a> {
2003    /// Builds detection context for a recognizer pass.
2004    pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
2005        Self {
2006            locale_chain,
2007            dictionaries,
2008            fields: &(),
2009            degraded: Cell::new(false),
2010        }
2011    }
2012}
2013
2014fn ensure_global(tags: &mut Vec<LocaleTag>) {
2015    if !tags.contains(&LocaleTag::Global) {
2016        tags.push(LocaleTag::Global);
2017    }
2018}
2019
/// Loosely checks that `raw` has the shape of a BCP-47 tag: a 2-8 letter
/// language subtag followed by 2-8 character alphanumeric subtags.
///
/// NOTE(review): single-character singleton subtags (e.g. the `x` in
/// `en-x-private`) are rejected here — confirm that is intended.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut subtags = raw.split('-');
    let language = match subtags.next() {
        Some(first) => first,
        None => return false,
    };
    let language_ok = (2..=8).contains(&language.len())
        && language.bytes().all(|b| b.is_ascii_alphabetic());
    language_ok
        && subtags.all(|subtag| {
            (2..=8).contains(&subtag.len())
                && subtag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
2032
/// Normalizes the case of a hyphen-separated tag: language subtag lowercased,
/// two-letter alphabetic subtags (regions) uppercased, everything else
/// lowercased.
///
/// NOTE(review): four-letter script subtags (e.g. `Hant`) come out lowercase
/// rather than BCP-47 titlecase — confirm this is the intended canonical form.
fn canonical_other(raw: &str) -> String {
    let mut subtags = raw.split('-');
    let mut out = subtags.next().unwrap_or_default().to_ascii_lowercase();
    for subtag in subtags {
        out.push('-');
        let looks_like_region =
            subtag.len() == 2 && subtag.bytes().all(|b| b.is_ascii_alphabetic());
        if looks_like_region {
            out.push_str(&subtag.to_ascii_uppercase());
        } else {
            out.push_str(&subtag.to_ascii_lowercase());
        }
    }
    out
}