#![cfg_attr(docsrs, feature(doc_cfg))]
use std::cell::Cell;
use std::collections::{BTreeMap, HashMap};
use std::fmt;
use std::ops::Range;
use serde::{Deserialize, Serialize};
use thiserror::Error;
/// A PII detector that scans raw input text and reports candidate findings.
///
/// Implementations must be `Send + Sync` so a single detector can be shared
/// across pipeline workers.
pub trait Detector: Send + Sync {
/// Scans `input` and returns every [`Detection`] found; the spans in the
/// results are byte offsets into `input`.
fn detect(&self, input: &str) -> Vec<Detection>;
}
/// The category of personally identifiable information a detection belongs to.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
Email,
Name,
Location,
Organization,
/// A user-defined class; the payload is a normalized lowercase name
/// (see [`PiiClass::custom`] for the normalization rules).
Custom(String),
}
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
impl PiiClass {
    /// Parses a lowercase policy class name into a [`PiiClass`].
    ///
    /// Accepts the four built-in names (`"email"`, `"name"`, `"location"`,
    /// `"organization"`) and `"custom:<name>"` for custom classes, where
    /// `<name>` is normalized by [`PiiClass::custom`].
    ///
    /// Returns `None` for unknown names and for a blank custom name.
    pub fn from_policy_name(input: &str) -> Option<Self> {
        match input {
            "email" => Some(Self::Email),
            "name" => Some(Self::Name),
            "location" => Some(Self::Location),
            "organization" => Some(Self::Organization),
            // Fix: `strip_prefix` removes the "custom:" marker exactly once.
            // The previous `trim_start_matches` stripped *repeated* prefixes,
            // so "custom:custom:id" silently became Custom("id") instead of
            // Custom("custom_id").
            other => other
                .strip_prefix("custom:")
                .filter(|name| !name.trim().is_empty())
                .map(Self::custom),
        }
    }
    /// The built-in (non-custom) classes, in declaration order.
    pub fn builtin_variants() -> &'static [PiiClass] {
        &[
            PiiClass::Email,
            PiiClass::Name,
            PiiClass::Location,
            PiiClass::Organization,
        ]
    }
    /// Builds a custom class with a normalized name: trimmed, ASCII
    /// alphanumerics lowercased, and every run of other characters collapsed
    /// to a single `_` (never leading or trailing).
    pub fn custom(name: &str) -> Self {
        let mut normalized = String::new();
        let mut pending_underscore = false;
        for ch in name.trim().chars() {
            if ch.is_ascii_alphanumeric() {
                // Flush at most one separator, and never at the start.
                if pending_underscore && !normalized.is_empty() {
                    normalized.push('_');
                }
                normalized.push(ch.to_ascii_lowercase());
                pending_underscore = false;
            } else {
                pending_underscore = true;
            }
        }
        Self::Custom(normalized)
    }
    /// Returns the normalized custom name, or `None` for built-in classes.
    pub fn as_custom_name(&self) -> Option<&str> {
        match self {
            Self::Custom(name) => Some(name.as_str()),
            Self::Email | Self::Name | Self::Location | Self::Organization => None,
        }
    }
    /// Human-readable class name. Built-ins mirror [`BUILTIN_CLASS_NAMES`];
    /// custom classes render as `Custom:<normalized name>`.
    pub fn class_name(&self) -> String {
        match self {
            // Indices follow the declaration order of BUILTIN_CLASS_NAMES.
            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
            Self::Custom(name) => format!("Custom:{name}"),
        }
    }
}
/// A single PII finding reported by a [`Detector`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
/// Byte range of the finding within the scanned input.
pub span: Range<usize>,
/// Classification of the finding.
pub class: PiiClass,
/// Free-form origin label supplied by the producing detector.
pub source: String,
}
impl Detection {
pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
Self {
span,
class,
source: source.into(),
}
}
}
/// A secondary screening pass that re-scans cleaned text for residual PII.
pub trait SafetyNet: Send + Sync {
/// Stable identifier for this safety net.
fn id(&self) -> &str;
/// Locales this net is able to check.
fn supported_locales(&self) -> &[LocaleTag];
/// Scans `clean_text` and returns suspects to be diffed against the
/// manifest carried in `context`.
fn check(
&self,
clean_text: &str,
context: SafetyNetContext<'_>,
) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
/// Read-only context handed to a [`SafetyNet::check`] call.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
/// Token spans emitted by the main pipeline for this document.
pub manifest: &'a Manifest,
/// Active locales, in precedence order.
pub locale_chain: &'a [LocaleTag],
/// Whether the checked text came from a structured or plain-text document.
pub document_kind: DocumentKind,
/// Optional session identifier for auditing.
pub session_id: Option<&'a str>,
/// Optional path of the structured field being checked.
pub field_path: Option<&'a str>,
}
impl<'a> SafetyNetContext<'a> {
pub fn new(
manifest: &'a Manifest,
locale_chain: &'a [LocaleTag],
document_kind: DocumentKind,
session_id: Option<&'a str>,
field_path: Option<&'a str>,
) -> Self {
Self {
manifest,
locale_chain,
document_kind,
session_id,
field_path,
}
}
}
/// Mapping between a replacement token's position in the clean output
/// (`clean_span`) and the original text it replaced (`raw_span`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
/// Byte range of the token in the clean (redacted) text.
pub clean_span: Range<usize>,
/// Byte range of the replaced text in the raw input.
pub raw_span: Range<usize>,
/// Class of the PII that was replaced.
pub class: PiiClass,
}
impl EmittedTokenSpan {
pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
Self {
clean_span,
raw_span,
class,
}
}
}
/// The full set of token spans emitted for one document.
///
/// Spans are kept sorted by `(clean start, clean end)` when built via
/// [`Manifest::from_spans`]; [`Manifest::diff_against`] relies on that order.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
pub spans: Vec<EmittedTokenSpan>,
}
impl Manifest {
/// Builds a manifest, sorting spans by `(clean start, clean end)` so that
/// `diff_against` can sweep them in order.
pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
Self { spans }
}
/// Compares a safety-net suspect span against the emitted token spans and
/// classifies any discrepancy.
///
/// Returns:
/// - `None` when the suspect is empty, or fully covered by spans of the
///   matching class (gap detection takes precedence over class checks);
/// - `Some(LeakKind::Uncovered)` when no manifest span overlaps it;
/// - `Some(LeakKind::PartialBleed { .. })` carrying the *first* uncovered
///   gap when coverage is incomplete;
/// - `Some(LeakKind::ClassMismatch { .. })` when coverage is complete but
///   the first span that extended coverage had a different class.
///
/// NOTE(review): assumes `spans` is sorted as produced by `from_spans`;
/// a manifest built by hand with unsorted spans may misreport gaps.
pub fn diff_against(
&self,
suspect_span: &Range<usize>,
suspect_class: &PiiClass,
) -> Option<LeakKind> {
// An empty suspect cannot leak anything.
if suspect_span.is_empty() {
return None;
}
// First span whose end lies beyond the suspect's start (sorted scan).
let start_idx = self
.spans
.partition_point(|span| span.clean_span.end <= suspect_span.start);
let overlapping = self.spans[start_idx..]
.iter()
.take_while(|span| span.clean_span.start < suspect_span.end)
.filter(|span| ranges_overlap(&span.clean_span, suspect_span))
.collect::<Vec<_>>();
if overlapping.is_empty() {
return Some(LeakKind::Uncovered);
}
// Sweep left-to-right; `cursor` is the end of the covered prefix so far.
let mut cursor = suspect_span.start;
let mut first_mismatch = None::<&EmittedTokenSpan>;
for span in overlapping {
// A gap before this span: report the first uncovered stretch.
if span.clean_span.start > cursor {
return Some(LeakKind::PartialBleed {
uncovered: cursor..span.clean_span.start.min(suspect_span.end),
});
}
// Only spans that extend coverage participate in class checking.
if span.clean_span.end > cursor {
if first_mismatch.is_none() && &span.class != suspect_class {
first_mismatch = Some(span);
}
cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
if cursor >= suspect_span.end {
break;
}
}
}
// Trailing gap after the last overlapping span.
if cursor < suspect_span.end {
return Some(LeakKind::PartialBleed {
uncovered: cursor..suspect_span.end,
});
}
// Fully covered: surface a class mismatch if one was seen, else clean.
first_mismatch.map(|span| LeakKind::ClassMismatch {
pipeline_class: span.class.clone(),
safety_net_class: suspect_class.clone(),
})
}
}
/// Reports whether two half-open byte ranges intersect.
///
/// Note that under this formula an empty range strictly inside the other
/// (e.g. `3..3` vs `0..10`) still counts as overlapping; callers that need
/// to exclude empty ranges check emptiness themselves.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    !(left.end <= right.start || right.end <= left.start)
}
/// A possible PII leak found by a safety net in the cleaned text.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
/// Byte range of the suspect in the clean text.
pub span: Range<usize>,
/// Class the safety net assigned to the suspect.
pub class: PiiClass,
/// Identifier of the safety net that raised the suspect.
pub safety_net_id: String,
// Optional confidence score from the safety net; scale is not
// specified here — defined by the producing net.
pub score: Option<f32>,
/// How the suspect disagrees with the manifest.
pub kind: LeakKind,
/// Raw label string emitted by the underlying model/rule.
pub raw_label: String,
/// Structured-field path the suspect was found in, if any.
pub field_path: Option<String>,
}
impl LeakSuspect {
pub fn new(
span: Range<usize>,
class: PiiClass,
safety_net_id: impl Into<String>,
score: Option<f32>,
kind: LeakKind,
raw_label: impl Into<String>,
field_path: Option<String>,
) -> Self {
Self {
span,
class,
safety_net_id: safety_net_id.into(),
score,
kind,
raw_label: raw_label.into(),
field_path,
}
}
}
/// Classification of how a suspect span disagrees with the manifest.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
/// No emitted token span overlaps the suspect at all.
Uncovered,
/// Coverage exists but leaves a gap; `uncovered` is the first gap found.
PartialBleed {
uncovered: Range<usize>,
},
/// The suspect is fully covered, but the first covering span had a
/// different class than the safety net reported.
ClassMismatch {
pipeline_class: PiiClass,
safety_net_class: PiiClass,
},
}
/// Non-suspect events recorded while running safety nets.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
/// A safety net was skipped because it does not support the active locale.
LocaleSkipped {
safety_net_id: String,
document_kind: DocumentKind,
field_path: Option<String>,
},
}
/// Aggregate counters derived from a [`LeakReport`]'s suspects and telemetry.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
/// Total number of suspects in the report.
pub suspect_count: usize,
/// Suspects whose kind is [`LeakKind::Uncovered`].
pub uncovered_count: usize,
/// Suspects whose kind is [`LeakKind::PartialBleed`].
pub partial_bleed_count: usize,
/// Suspects whose kind is [`LeakKind::ClassMismatch`].
pub class_mismatch_count: usize,
/// Number of `LocaleSkipped` telemetry events.
pub locale_skipped_count: usize,
}
/// Integrity and provenance metadata attached to a processed document bundle.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
/// Version of this extension's schema.
pub schema_version: u16,
/// SHA-256 of the clean markdown artifact.
pub clean_md_sha256: [u8; 32],
/// SHA-256 of the layout JSON artifact.
pub layout_json_sha256: [u8; 32],
/// SHA-256 of the report JSON artifact.
pub report_json_sha256: [u8; 32],
/// SHA-256 of the optional preview PNG; omitted from JSON when absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub preview_png_sha256: Option<[u8; 32]>,
/// Number of pages in the source document.
pub page_count: u32,
/// Identifier of the audit session that produced the bundle.
pub audit_session_id: String,
/// Token spans emitted for the clean text; omitted from JSON when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub clean_spans: Vec<EmittedTokenSpan>,
/// Per-codec audit trail; omitted from JSON when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub codec_audit: Vec<CodecAuditRow>,
}
impl DocumentExtension {
    /// Starts a [`DocumentExtensionBuilder`] for the given schema version.
    ///
    /// The three integrity hashes, the page count, and the audit session id
    /// are mandatory before [`DocumentExtensionBuilder::build`] succeeds;
    /// everything else starts out empty/`None`.
    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
        DocumentExtensionBuilder {
            schema_version,
            audit_session_id: None,
            page_count: None,
            preview_png_sha256: None,
            report_json_sha256: None,
            layout_json_sha256: None,
            clean_md_sha256: None,
            clean_spans: Vec::new(),
            codec_audit: Vec::new(),
        }
    }
}
/// Builder for [`DocumentExtension`]; obtained via [`DocumentExtension::builder`].
///
/// Required fields are held as `Option`s and validated by `build`.
#[derive(Debug, Clone)]
#[must_use]
pub struct DocumentExtensionBuilder {
schema_version: u16,
clean_md_sha256: Option<[u8; 32]>,
layout_json_sha256: Option<[u8; 32]>,
report_json_sha256: Option<[u8; 32]>,
preview_png_sha256: Option<[u8; 32]>,
page_count: Option<u32>,
audit_session_id: Option<String>,
clean_spans: Vec<EmittedTokenSpan>,
codec_audit: Vec<CodecAuditRow>,
}
impl DocumentExtensionBuilder {
    /// Sets the SHA-256 of the clean markdown artifact (required).
    pub fn clean_md_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            clean_md_sha256: Some(hash),
            ..self
        }
    }
    /// Sets the SHA-256 of the layout JSON artifact (required).
    pub fn layout_json_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            layout_json_sha256: Some(hash),
            ..self
        }
    }
    /// Sets the SHA-256 of the report JSON artifact (required).
    pub fn report_json_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            report_json_sha256: Some(hash),
            ..self
        }
    }
    /// Sets the SHA-256 of the optional preview PNG.
    pub fn preview_png_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            preview_png_sha256: Some(hash),
            ..self
        }
    }
    /// Sets the source document's page count (required).
    pub fn page_count(self, page_count: u32) -> Self {
        Self {
            page_count: Some(page_count),
            ..self
        }
    }
    /// Sets the audit session identifier (required).
    pub fn audit_session_id(self, audit_session_id: impl Into<String>) -> Self {
        Self {
            audit_session_id: Some(audit_session_id.into()),
            ..self
        }
    }
    /// Replaces the emitted clean-text token spans.
    pub fn clean_spans(self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
        Self { clean_spans, ..self }
    }
    /// Replaces the codec audit trail.
    pub fn codec_audit(self, codec_audit: Vec<CodecAuditRow>) -> Self {
        Self { codec_audit, ..self }
    }
    /// Finalizes the extension.
    ///
    /// # Errors
    /// Returns [`DocumentExtensionError::MissingField`] naming the first
    /// required field that was never set, checked in this order:
    /// `clean_md_sha256`, `layout_json_sha256`, `report_json_sha256`,
    /// `page_count`, `audit_session_id`.
    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
        // Local helper keeps each required-field check on one line.
        fn require<T>(
            value: Option<T>,
            field: &'static str,
        ) -> Result<T, DocumentExtensionError> {
            value.ok_or(DocumentExtensionError::MissingField(field))
        }
        Ok(DocumentExtension {
            schema_version: self.schema_version,
            clean_md_sha256: require(self.clean_md_sha256, "clean_md_sha256")?,
            layout_json_sha256: require(self.layout_json_sha256, "layout_json_sha256")?,
            report_json_sha256: require(self.report_json_sha256, "report_json_sha256")?,
            preview_png_sha256: self.preview_png_sha256,
            page_count: require(self.page_count, "page_count")?,
            audit_session_id: require(self.audit_session_id, "audit_session_id")?,
            clean_spans: self.clean_spans,
            codec_audit: self.codec_audit,
        })
    }
}
/// Errors produced by [`DocumentExtensionBuilder::build`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
/// A required builder field was never set; the payload is the field name.
#[error("missing document extension field: {0}")]
MissingField(&'static str),
}
/// How a codec obtained the text it delivered; serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
/// Text recognized from an image/scan.
Ocr,
/// Text already embedded in the source document.
EmbeddedText,
/// Text transcribed from audio/video.
Transcript,
/// A mix of the above origins.
Hybrid,
}
/// Bit-flags describing what kinds of output a codec can produce.
///
/// `Default` yields all flags `false` (no capabilities).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecCapabilitySet {
pub text: bool,
pub layout: bool,
pub confidence: bool,
pub timestamps: bool,
}
impl CodecCapabilitySet {
    /// Capability set advertising plain text and nothing else.
    pub const TEXT_ONLY: Self = Self {
        text: true,
        layout: false,
        confidence: false,
        timestamps: false,
    };
    /// Builds a capability set from individual flags.
    pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
        Self {
            text,
            layout,
            confidence,
            timestamps,
        }
    }
    /// True when every capability `requested` asks for is present in `self`,
    /// i.e. `requested` is a subset of `self`.
    pub fn contains(self, requested: Self) -> bool {
        let satisfies = |have: bool, need: bool| have || !need;
        satisfies(self.text, requested.text)
            && satisfies(self.layout, requested.layout)
            && satisfies(self.confidence, requested.confidence)
            && satisfies(self.timestamps, requested.timestamps)
    }
}
/// Policy governing whether a minimum text-extraction density is enforced.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
/// Enforce the given minimum density threshold.
Required(f32),
/// Density checking is waived; `reason` explains why.
Exempt { reason: String },
}
impl Default for ExtractionDensityPolicy {
fn default() -> Self {
Self::Exempt {
reason: "calibration_pending".to_string(),
}
}
}
/// Audit record for one codec invocation within a document bundle.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecAuditRow {
/// Stable identifier of the codec.
pub codec_id: String,
/// Version string of the codec.
pub codec_version: String,
/// MIME type the codec accepted for this input.
pub accepted_mime: String,
/// Capabilities the codec claimed before running.
pub advertised: CodecCapabilitySet,
/// Capabilities the codec actually delivered.
pub delivered: CodecCapabilitySet,
/// How the delivered text was obtained.
pub text_origin: TextOrigin,
/// Schema version of the codec's output format.
pub codec_output_schema_version: u16,
/// Optional hex hash of the codec options; omitted from JSON when absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub options_hash_hex: Option<String>,
/// Optional engine provenance string; omitted from JSON when absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub engine_provenance: Option<String>,
/// Density policy applied to this codec's extraction.
pub extraction_density_policy: ExtractionDensityPolicy,
}
impl CodecAuditRow {
pub fn new(
codec_id: impl Into<String>,
codec_version: impl Into<String>,
accepted_mime: impl Into<String>,
text_origin: TextOrigin,
) -> Self {
Self {
codec_id: codec_id.into(),
codec_version: codec_version.into(),
accepted_mime: accepted_mime.into(),
advertised: CodecCapabilitySet::default(),
delivered: CodecCapabilitySet::default(),
text_origin,
codec_output_schema_version: 1,
options_hash_hex: None,
engine_provenance: None,
extraction_density_policy: ExtractionDensityPolicy::default(),
}
}
}
/// Aggregated outcome of running safety nets over a document.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
/// All suspects raised by the safety nets.
pub suspects: Vec<LeakSuspect>,
/// Non-suspect events (e.g. locale skips).
pub telemetry: Vec<LeakReportTelemetry>,
/// Counters derived from `suspects` and `telemetry`.
pub stats: LeakReportStats,
/// Optional replay hash; `None` when built by `from_parts`.
pub replay_hash: Option<String>,
}
impl LeakReport {
    /// Builds a report from raw suspects and telemetry, deriving every
    /// counter in [`LeakReportStats`]; `replay_hash` starts out `None`.
    pub fn from_parts(
        suspects: Vec<LeakSuspect>,
        telemetry: Vec<LeakReportTelemetry>,
    ) -> LeakReport {
        let locale_skipped_count = telemetry
            .iter()
            .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
            .count();
        let mut uncovered_count = 0;
        let mut partial_bleed_count = 0;
        let mut class_mismatch_count = 0;
        for suspect in &suspects {
            match suspect.kind {
                LeakKind::Uncovered => uncovered_count += 1,
                LeakKind::PartialBleed { .. } => partial_bleed_count += 1,
                LeakKind::ClassMismatch { .. } => class_mismatch_count += 1,
            }
        }
        let stats = LeakReportStats {
            suspect_count: suspects.len(),
            uncovered_count,
            partial_bleed_count,
            class_mismatch_count,
            locale_skipped_count,
        };
        LeakReport {
            suspects,
            telemetry,
            stats,
            replay_hash: None,
        }
    }
    /// Merges `other` into `self` and recomputes all stats from scratch.
    ///
    /// NOTE(review): rebuilding through `from_parts` resets `replay_hash`
    /// to `None` (both sides' hashes are discarded) — presumably intentional
    /// because a merged report invalidates any previous replay hash; confirm
    /// with the replay machinery.
    pub fn extend(&mut self, other: LeakReport) {
        self.suspects.extend(other.suspects);
        self.telemetry.extend(other.telemetry);
        let suspects = std::mem::take(&mut self.suspects);
        let telemetry = std::mem::take(&mut self.telemetry);
        *self = LeakReport::from_parts(suspects, telemetry);
    }
}
/// Label vocabulary used by the OpenAI-style private-data tagger.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    PrivatePerson,
    PrivateAddress,
    PrivateEmail,
    PrivatePhone,
    PrivateUrl,
    PrivateDate,
    AccountNumber,
    Secret,
}
impl OpenAiPrivateLabel {
    /// The wire-format string for this label (snake_case, stable).
    pub fn as_str(self) -> &'static str {
        match self {
            OpenAiPrivateLabel::AccountNumber => "account_number",
            OpenAiPrivateLabel::Secret => "secret",
            OpenAiPrivateLabel::PrivatePerson => "private_person",
            OpenAiPrivateLabel::PrivateAddress => "private_address",
            OpenAiPrivateLabel::PrivateEmail => "private_email",
            OpenAiPrivateLabel::PrivatePhone => "private_phone",
            OpenAiPrivateLabel::PrivateUrl => "private_url",
            OpenAiPrivateLabel::PrivateDate => "private_date",
        }
    }
}
/// PII taxonomy used internally by the safety nets; wider than the
/// pipeline's built-in [`PiiClass`] variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
Email,
Name,
Location,
Phone,
Url,
Date,
AccountNumber,
Secret,
}
impl SafetyNetPiiClass {
pub fn to_pii_class(self) -> PiiClass {
match self {
Self::Email => PiiClass::Email,
Self::Name => PiiClass::Name,
Self::Location => PiiClass::Location,
Self::Phone => PiiClass::custom("phone"),
Self::Url => PiiClass::custom("url"),
Self::Date => PiiClass::custom("date"),
Self::AccountNumber => PiiClass::custom("account_number"),
Self::Secret => PiiClass::custom("secret"),
}
}
}
/// Failure modes a [`SafetyNet::check`] call can report.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
/// The net cannot run at all in this environment.
#[error("safety net unavailable: {reason}")]
Unavailable {
reason: String,
},
/// Model weights were not found at the expected path.
#[error("safety net weights missing: {path}")]
WeightsMissing {
path: String,
},
/// The underlying model could not be loaded.
#[error("safety net model unavailable: {reason}")]
ModelUnavailable {
reason: String,
},
/// The input exceeded the net's size limit.
#[error("safety net input too large: limit={limit}, actual={actual}")]
InputTooLarge {
limit: usize,
actual: usize,
},
/// The net started but failed while running.
#[error("safety net runtime failed: {message}")]
Runtime {
message: String,
},
/// The net produced output that could not be interpreted.
#[error("safety net invalid output: {message}")]
InvalidOutput {
message: String,
},
}
/// The redaction action applied to a detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
/// Replace with a reversible token.
Tokenize,
/// Remove/blank the value.
Redact,
/// Replace with a synthetic value of the same format.
FormatPreserve,
/// Replace with a less specific value.
Generalize,
/// Leave the value untouched.
Preserve,
}
/// Which tie-breaking rule decided the winner when detections conflicted.
///
/// Variant order mirrors the resolution cascade, from "no conflict" through
/// successive tie-breakers to "spans were merged".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
None,
ClassPriority,
RulePriority,
Score,
SpanLength,
Validator,
RecognizerId,
Merged,
}
/// Whether a document is structured (field/value) or free-form text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
Structured,
Text,
}
/// One row of the redaction audit log.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
/// Free-form origin label for the redaction.
pub source: String,
/// Class of the redacted PII.
pub class: PiiClass,
/// Action that was applied.
pub action: Action,
/// Structured-field name, when applicable.
pub field_name: Option<String>,
/// Kind of document the redaction occurred in.
pub document_kind: DocumentKind,
/// True when this detection lost a conflict-resolution round.
pub conflict_loser: bool,
/// Tie-breaking rule that decided the conflict.
pub decided_by: ConflictTier,
// Creation time as an i64 — presumably a Unix timestamp; units
// (seconds vs millis) are set by callers. TODO confirm.
pub created_at: i64,
/// Optional session identifier.
pub session_id: Option<String>,
}
impl RedactionEntry {
#[allow(clippy::too_many_arguments)]
pub fn new(
source: impl Into<String>,
class: PiiClass,
action: Action,
field_name: Option<String>,
document_kind: DocumentKind,
conflict_loser: bool,
decided_by: ConflictTier,
created_at: i64,
session_id: Option<String>,
) -> Self {
Self {
source: source.into(),
class,
action,
field_name,
document_kind,
conflict_loser,
decided_by,
created_at,
session_id,
}
}
}
/// Errors a [`RedactionLogger`] may report.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
/// SQLite-backed logger failed; payload is the underlying message.
#[error("sqlite redaction log error: {0}")]
Sqlite(String),
/// Any other backend failed; payload is the underlying message.
#[error("backend redaction log error: {0}")]
Backend(String),
}
/// Sink for redaction audit entries; must be shareable across threads.
pub trait RedactionLogger: Send + Sync {
/// Persists one entry, or reports why it could not be written.
fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
/// A locale the pipeline understands, with dedicated variants for the
/// commonly used tags and `Other` for any additional BCP 47-style tag.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
/// Locale-independent; matches everywhere (see [`LocaleChain::intersects`]).
Global,
DeDe,
DeAt,
DeCh,
EnUs,
EnGb,
EnIe,
EnAu,
EnCa,
/// Any other parseable tag, stored in canonical form.
Other(String),
}
/// Error returned when a locale string cannot be parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input named no known locale and was not a well-formed tag.
    Unsupported,
}
impl fmt::Display for LocaleError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Unsupported => write!(f, "unsupported locale"),
        }
    }
}
impl std::error::Error for LocaleError {}
/// An ordered set of active locales; constructors guarantee `Global` is
/// present (via `ensure_global`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
impl LocaleTag {
    /// Convenience alias for the catch-all [`LocaleTag::Global`] tag.
    pub const GLOBAL: LocaleTag = LocaleTag::Global;
    /// Parses a locale string into a tag.
    ///
    /// The input is trimmed, `_` is treated as `-`, and matching is
    /// case-insensitive. `"global"` and `"*"` map to [`LocaleTag::Global`];
    /// the known `de-*`/`en-*` tags map to dedicated variants; any other
    /// string that passes `is_bcp47_parseable` becomes [`LocaleTag::Other`]
    /// in canonical form.
    ///
    /// # Errors
    /// Returns [`LocaleError::Unsupported`] for empty or unparseable input.
    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
        let raw = s.trim().replace('_', "-");
        let tag = match raw.to_ascii_lowercase().as_str() {
            "global" | "*" => LocaleTag::Global,
            "de-de" => LocaleTag::DeDe,
            "de-at" => LocaleTag::DeAt,
            "de-ch" => LocaleTag::DeCh,
            "en-us" => LocaleTag::EnUs,
            "en-gb" => LocaleTag::EnGb,
            "en-ie" => LocaleTag::EnIe,
            "en-au" => LocaleTag::EnAu,
            "en-ca" => LocaleTag::EnCa,
            "" => return Err(LocaleError::Unsupported),
            // Canonicalization works on `raw` so the original casing is
            // available to `canonical_other`.
            _ if is_bcp47_parseable(&raw) => LocaleTag::Other(canonical_other(&raw)),
            _ => return Err(LocaleError::Unsupported),
        };
        Ok(tag)
    }
    /// Canonical string form of the tag (e.g. `"de-DE"`, `"global"`).
    pub fn as_str(&self) -> &str {
        match self {
            LocaleTag::Global => "global",
            LocaleTag::DeDe => "de-DE",
            LocaleTag::DeAt => "de-AT",
            LocaleTag::DeCh => "de-CH",
            LocaleTag::EnUs => "en-US",
            LocaleTag::EnGb => "en-GB",
            LocaleTag::EnIe => "en-IE",
            LocaleTag::EnAu => "en-AU",
            LocaleTag::EnCa => "en-CA",
            LocaleTag::Other(tag) => tag.as_str(),
        }
    }
}
impl LocaleChain {
    /// Wraps the given tags into a chain, appending `Global` when it is
    /// missing (delegated to `ensure_global`).
    pub fn from_tags(tags: Vec<LocaleTag>) -> LocaleChain {
        let mut tags = tags;
        ensure_global(&mut tags);
        LocaleChain(tags)
    }
    /// Parses a comma-separated CLI locale list (e.g. `"de-DE,en-US"`).
    ///
    /// # Errors
    /// Fails on the first segment [`LocaleTag::parse`] rejects, including
    /// empty segments produced by stray commas.
    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
        let mut tags = Vec::new();
        for part in raw.split(',') {
            tags.push(LocaleTag::parse(part)?);
        }
        Ok(LocaleChain::from_tags(tags))
    }
    /// Merges policy- and CLI-provided locales; despite the parameter
    /// order, CLI takes precedence over policy.
    pub fn merge_policy_and_cli(
        policy: Option<&[LocaleTag]>,
        cli: Option<&[LocaleTag]>,
    ) -> LocaleChain {
        Self::merge_cli_policy_rulepack_default(cli, policy, None)
    }
    /// Picks the first non-empty source in precedence order
    /// (CLI > policy > rulepack defaults), falling back to `[Global]`.
    pub fn merge_cli_policy_rulepack_default(
        cli: Option<&[LocaleTag]>,
        policy: Option<&[LocaleTag]>,
        rulepack_defaults: Option<&[LocaleTag]>,
    ) -> LocaleChain {
        let tags = [cli, policy, rulepack_defaults]
            .iter()
            .copied()
            .flatten()
            .find(|tags| !tags.is_empty())
            .map(<[LocaleTag]>::to_vec)
            .unwrap_or_else(|| vec![LocaleTag::Global]);
        LocaleChain::from_tags(tags)
    }
    /// True when a recognizer restricted to `recognizer_locales` applies
    /// under this chain.
    ///
    /// An empty restriction list means "applies everywhere", and a `Global`
    /// recognizer locale matches any chain. Note the asymmetry: `Global`
    /// inside the *chain* does not match a locale-specific recognizer.
    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
        recognizer_locales.is_empty()
            || recognizer_locales.iter().any(|recognizer_locale| {
                matches!(recognizer_locale, LocaleTag::Global)
                    || self.0.contains(recognizer_locale)
            })
    }
    /// Borrows the chain's tags in order.
    pub fn as_slice(&self) -> &[LocaleTag] {
        &self.0
    }
    /// Renders every tag via its `Display` form.
    pub fn to_strings(&self) -> Vec<String> {
        self.0.iter().map(|tag| tag.to_string()).collect()
    }
}
impl From<&[LocaleTag]> for LocaleChain {
fn from(tags: &[LocaleTag]) -> Self {
let mut owned = tags.to_vec();
ensure_global(&mut owned);
LocaleChain(owned)
}
}
impl fmt::Display for LocaleTag {
    /// Formats the tag exactly as [`LocaleTag::as_str`] renders it.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
/// Input document before redaction: either a structured field map or text.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
Structured(BTreeMap<String, Value>),
Text(String),
}
/// Output document after redaction; serialized untagged, so it appears as a
/// plain JSON object or string.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
Structured(BTreeMap<String, Value>),
Text(String),
}
/// Minimal JSON-like value model for structured documents; serialized
/// untagged so values render as native JSON types.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
Null,
Bool(bool),
String(String),
I64(i64),
Array(Vec<Value>),
Object(BTreeMap<String, Value>),
}
impl Value {
    /// Borrows the inner string when the value is a `String`; `None` for
    /// every other variant.
    pub fn as_str(&self) -> Option<&str> {
        match self {
            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
            Self::String(value) => Some(value.as_str()),
        }
    }
    /// Renders a scalar value as text for safety-net scanning.
    ///
    /// Booleans and integers are stringified; non-empty strings are cloned;
    /// empty strings, `Null`, arrays, and objects yield `None`.
    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
        match self {
            Self::Bool(flag) => Some(flag.to_string()),
            Self::I64(number) => Some(number.to_string()),
            Self::String(text) => (!text.is_empty()).then(|| text.clone()),
            Self::Null | Self::Array(_) | Self::Object(_) => None,
        }
    }
}
impl PartialEq<&str> for Value {
fn eq(&self, other: &&str) -> bool {
self.as_str() == Some(*other)
}
}
/// Named collection of dictionaries available to dictionary recognizers.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
// Keyed by dictionary name; later inserts replace earlier ones.
entries: HashMap<String, DictionaryEntry>,
}
/// One validated dictionary: its terms, matching mode, and provenance.
/// Construct via [`DictionaryEntry::new`], which enforces the invariants.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
// Non-empty; ASCII-only when `case_sensitive` is false.
terms: Vec<String>,
case_sensitive: bool,
source: DictionarySource,
}
/// Provenance of a dictionary: supplied on the command line or bundled with
/// a rulepack.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    Cli,
    Rulepack,
}
/// Summary row describing one loaded dictionary.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    pub name: String,
    pub term_count: usize,
    pub source: DictionarySource,
}
impl DictionaryStats {
    /// Creates a stats row for the dictionary `name` holding `term_count`
    /// terms obtained from `source`.
    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
        let name = name.into();
        DictionaryStats {
            name,
            term_count,
            source,
        }
    }
}
/// Raw dictionary definition as declared by a rulepack, prior to validation
/// into a [`DictionaryEntry`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    pub name: String,
    pub terms: Vec<String>,
    pub case_sensitive: bool,
}
impl RulepackDict {
    /// Bundles a rulepack dictionary declaration; no validation happens here.
    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
        let name = name.into();
        RulepackDict {
            case_sensitive,
            terms,
            name,
        }
    }
}
/// Validation failures raised by [`DictionaryEntry::new`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary contained no terms at all.
    Empty { name: String },
    /// Case-insensitive matching was requested for non-ASCII terms, which
    /// is unsupported in v0.4.0.
    UnicodeInsensitiveUnsupported { name: String },
}
impl fmt::Display for DictionaryLoadError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::Empty { name } => format!("dictionary '{name}' has no terms"),
            Self::UnicodeInsensitiveUnsupported { name } => format!(
                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
            ),
        };
        f.write_str(&message)
    }
}
impl std::error::Error for DictionaryLoadError {}
impl DictionaryBundle {
pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
let mut entries = HashMap::with_capacity(terms.len());
for dictionary in terms {
let entry = DictionaryEntry::new(
&dictionary.name,
dictionary.terms.clone(),
dictionary.case_sensitive,
DictionarySource::Rulepack,
)
.expect("Policy validates dictionary terms before bundle construction");
entries.insert(dictionary.name.clone(), entry);
}
Self { entries }
}
pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
Self {
entries: entries.into_iter().collect(),
}
}
pub fn merge(a: Self, b: Self) -> Self {
let mut entries = a.entries;
entries.extend(b.entries);
Self { entries }
}
pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
self.entries.get(name)
}
pub fn stats(&self) -> Vec<DictionaryStats> {
let mut stats = self
.entries
.iter()
.map(|(name, entry)| DictionaryStats {
name: name.clone(),
term_count: entry.terms.len(),
source: entry.source,
})
.collect::<Vec<_>>();
stats.sort_by(|a, b| a.name.cmp(&b.name));
stats
}
}
impl DictionaryEntry {
    /// Validates and constructs a dictionary entry.
    ///
    /// # Errors
    /// - [`DictionaryLoadError::Empty`] when `terms` is empty.
    /// - [`DictionaryLoadError::UnicodeInsensitiveUnsupported`] when a
    ///   case-insensitive dictionary contains any non-ASCII term
    ///   (unsupported in v0.4.0 per the error message).
    pub fn new(
        name: &str,
        terms: Vec<String>,
        case_sensitive: bool,
        source: DictionarySource,
    ) -> Result<Self, DictionaryLoadError> {
        if terms.is_empty() {
            return Err(DictionaryLoadError::Empty {
                name: name.to_owned(),
            });
        }
        // Only scan for non-ASCII terms when matching case-insensitively.
        if !case_sensitive && !terms.iter().all(|term| term.is_ascii()) {
            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
                name: name.to_owned(),
            });
        }
        Ok(Self {
            source,
            case_sensitive,
            terms,
        })
    }
    /// Whether terms must match with exact case.
    pub fn case_sensitive(&self) -> bool {
        self.case_sensitive
    }
    /// The dictionary's terms, in load order.
    pub fn terms(&self) -> &[String] {
        self.terms.as_slice()
    }
}
// Unit tests for the document-extension / codec-audit serialization contract:
// stable field names, 32-byte hash arrays, round-trips, and required fields.
#[cfg(test)]
mod document_extension_tests {
use super::*;
// Shared fixture: a realistic audit row where delivered capabilities are a
// downgrade of what was advertised (confidence dropped).
fn audit_row() -> CodecAuditRow {
let mut row = CodecAuditRow::new(
"gaze.codec.tesseract",
"gaze-codec-tesseract@0.7.1",
"image/png",
TextOrigin::Ocr,
);
row.advertised = CodecCapabilitySet::new(true, true, true, false);
row.delivered = CodecCapabilitySet::new(true, true, false, false);
row.extraction_density_policy = ExtractionDensityPolicy::Required(1.0);
row
}
// Shared fixture: a builder with every required field set (preview,
// spans, and codec audit left to individual tests).
fn extension_builder() -> DocumentExtensionBuilder {
DocumentExtension::builder(1)
.clean_md_sha256([1; 32])
.layout_json_sha256([2; 32])
.report_json_sha256([3; 32])
.page_count(2)
.audit_session_id("018f0000-0000-7000-8000-000000000000")
}
// JSON shape check: a single root-level schema_version, 32-byte hashes,
// and none of the legacy per-artifact *_schema_version fields.
#[test]
fn document_extension_round_trips_with_bundle_root_schema_version() {
let mut row = audit_row();
row.options_hash_hex = Some("00".repeat(32));
row.engine_provenance = Some("tesseract@5.3.4".to_string());
let extension = extension_builder()
.preview_png_sha256([4; 32])
.clean_spans(vec![EmittedTokenSpan::new(0..8, 0..12, PiiClass::Email)])
.codec_audit(vec![row])
.build()
.expect("document extension");
let json = serde_json::to_value(&extension).expect("serialize document extension");
assert_eq!(json["schema_version"], 1);
assert_eq!(json["clean_md_sha256"].as_array().expect("hash").len(), 32);
assert_eq!(
json["layout_json_sha256"].as_array().expect("hash").len(),
32
);
assert_eq!(
json["report_json_sha256"].as_array().expect("hash").len(),
32
);
assert_eq!(
json["preview_png_sha256"].as_array().expect("hash").len(),
32
);
assert_eq!(json["page_count"], 2);
assert_eq!(
json["audit_session_id"],
"018f0000-0000-7000-8000-000000000000"
);
assert_eq!(json["clean_spans"].as_array().expect("spans").len(), 1);
// Legacy per-artifact version fields must not reappear.
assert!(json.get("clean_schema_version").is_none());
assert!(json.get("layout_schema_version").is_none());
assert!(json.get("report_schema_version").is_none());
assert!(json.get("manifest_schema_version").is_none());
let decoded: DocumentExtension =
serde_json::from_value(json).expect("deserialize document extension");
assert_eq!(decoded, extension);
}
// Field-by-field round-trip through a JSON string with every optional
// field populated.
#[test]
fn document_extension_carries_full_integrity_set() {
let extension = DocumentExtension::builder(1)
.clean_md_sha256([10; 32])
.layout_json_sha256([11; 32])
.report_json_sha256([12; 32])
.preview_png_sha256([13; 32])
.page_count(7)
.audit_session_id("018f0000-0000-7000-8000-000000000001")
.clean_spans(vec![EmittedTokenSpan::new(5..14, 20..34, PiiClass::Name)])
.codec_audit(vec![audit_row()])
.build()
.expect("document extension");
let json = serde_json::to_string(&extension).expect("serialize document extension");
let decoded: DocumentExtension =
serde_json::from_str(&json).expect("deserialize document extension");
assert_eq!(decoded, extension);
assert_eq!(decoded.clean_md_sha256, [10; 32]);
assert_eq!(decoded.layout_json_sha256, [11; 32]);
assert_eq!(decoded.report_json_sha256, [12; 32]);
assert_eq!(decoded.preview_png_sha256, Some([13; 32]));
assert_eq!(decoded.page_count, 7);
assert_eq!(
decoded.audit_session_id,
"018f0000-0000-7000-8000-000000000001"
);
assert_eq!(decoded.clean_spans.len(), 1);
assert_eq!(decoded.codec_audit.len(), 1);
}
// Builder error precedence: first missing field wins, in declared order.
#[test]
fn document_extension_builder_requires_integrity_fields() {
assert_eq!(
DocumentExtension::builder(1).build(),
Err(DocumentExtensionError::MissingField("clean_md_sha256"))
);
assert_eq!(
DocumentExtension::builder(1)
.clean_md_sha256([1; 32])
.layout_json_sha256([2; 32])
.report_json_sha256([3; 32])
.page_count(1)
.build(),
Err(DocumentExtensionError::MissingField("audit_session_id"))
);
}
// The audit row must never carry raw PII or a "raw" field in its JSON.
#[test]
fn codec_audit_row_round_trips_without_raw_pii_fields() {
let row = audit_row();
let json = serde_json::to_string(&row).expect("serialize codec audit row");
assert!(json.contains("\"codec_id\""));
assert!(!json.contains("alice@example.invalid"));
assert!(!json.contains("\"raw\""));
assert_eq!(
serde_json::from_str::<CodecAuditRow>(&json).expect("deserialize codec audit row"),
row
);
}
// snake_case serde round-trip for every TextOrigin variant.
#[test]
fn text_origin_round_trips() {
for origin in [
TextOrigin::Ocr,
TextOrigin::EmbeddedText,
TextOrigin::Transcript,
TextOrigin::Hybrid,
] {
let json = serde_json::to_string(&origin).expect("serialize text origin");
let decoded: TextOrigin = serde_json::from_str(&json).expect("deserialize text origin");
assert_eq!(decoded, origin);
}
}
// Capability round-trip plus subset semantics of `contains`.
#[test]
fn codec_capability_set_round_trips_and_contains_requested_bits() {
let delivered = CodecCapabilitySet::new(true, true, false, false);
let json = serde_json::to_string(&delivered).expect("serialize capabilities");
let decoded: CodecCapabilitySet =
serde_json::from_str(&json).expect("deserialize capabilities");
assert_eq!(decoded, delivered);
assert!(decoded.contains(CodecCapabilitySet::TEXT_ONLY));
assert!(!decoded.contains(CodecCapabilitySet::new(true, true, true, false)));
}
// Both density-policy variants survive a serde round-trip.
#[test]
fn extraction_density_policy_round_trips_closed_variants() {
for policy in [
ExtractionDensityPolicy::Required(1.25),
ExtractionDensityPolicy::Exempt {
reason: "text_only".to_string(),
},
] {
let json = serde_json::to_string(&policy).expect("serialize density policy");
let decoded: ExtractionDensityPolicy =
serde_json::from_str(&json).expect("deserialize density policy");
assert_eq!(decoded, policy);
}
}
// Manifest and stats mirrors in the document report must round-trip.
#[test]
fn manifest_stats_round_trip_for_document_report_mirrors() {
let manifest =
Manifest::from_spans(vec![EmittedTokenSpan::new(0..15, 0..19, PiiClass::Email)]);
let stats = LeakReportStats {
suspect_count: 1,
uncovered_count: 0,
partial_bleed_count: 0,
class_mismatch_count: 0,
locale_skipped_count: 0,
};
let manifest_json = serde_json::to_string(&manifest).expect("serialize manifest");
let stats_json = serde_json::to_string(&stats).expect("serialize stats");
assert_eq!(
serde_json::from_str::<Manifest>(&manifest_json).expect("deserialize manifest"),
manifest
);
assert_eq!(
serde_json::from_str::<LeakReportStats>(&stats_json).expect("deserialize stats"),
stats
);
}
}
// Unit tests for DictionaryEntry validation: both load errors must fail
// closed with the offending dictionary's name.
#[cfg(test)]
mod dictionary_tests {
use super::*;
// An empty term list is rejected with DictionaryLoadError::Empty.
#[test]
fn dictionary_entry_rejects_empty_terms() {
let err = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli)
.expect_err("empty dictionaries must fail closed");
assert!(matches!(err, DictionaryLoadError::Empty { name } if name == "empty"));
}
// Non-ASCII terms combined with case-insensitive matching are rejected.
#[test]
fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
let err = DictionaryEntry::new(
"songs",
vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()],
false,
DictionarySource::Cli,
)
.expect_err("unicode case-insensitive dictionaries must fail closed");
assert!(matches!(
err,
DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
));
}
}
// Unit tests for the RedactionLogger trait contract: stable error display,
// Send + Sync trait objects, and implementability by downstream types.
#[cfg(test)]
mod redaction_logger_tests {
use super::*;
// Minimal no-op logger used to prove the trait can be implemented locally.
struct CapturingLogger;
impl RedactionLogger for CapturingLogger {
fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
Ok(())
}
}
// Compile-time probe: instantiating this requires T: Send + Sync.
fn assert_send_sync<T: Send + Sync + ?Sized>() {}
// Error messages are part of the log format; pin them exactly.
#[test]
fn redaction_log_error_display_is_stable() {
assert_eq!(
RedactionLogError::Sqlite("write failed".to_string()).to_string(),
"sqlite redaction log error: write failed"
);
assert_eq!(
RedactionLogError::Backend("sink failed".to_string()).to_string(),
"backend redaction log error: sink failed"
);
}
// dyn RedactionLogger must be shareable across threads.
#[test]
fn redaction_logger_trait_object_is_send_sync() {
assert_send_sync::<dyn RedactionLogger>();
}
// End-to-end: build an entry and log it through a trait object.
#[test]
fn local_logger_can_implement_redaction_logger() {
let logger = CapturingLogger;
let entry = RedactionEntry {
source: "unit-test".to_string(),
class: PiiClass::Email,
action: Action::Tokenize,
field_name: None,
document_kind: DocumentKind::Text,
conflict_loser: false,
decided_by: ConflictTier::None,
created_at: 0,
session_id: None,
};
let trait_object: &dyn RedactionLogger = &logger;
trait_object.log(&entry).expect("log entry");
}
}
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    /// Builds a token span whose clean and raw ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    /// Runs the safety-net diff against a manifest. Borrows the manifest so
    /// multi-assertion tests no longer have to clone it between calls
    /// (the by-value version forced `manifest.clone()` on reuse).
    fn diff(manifest: &Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);
        assert_eq!(diff(&manifest, 0..8, PiiClass::Email), None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);
        assert_eq!(
            diff(&manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);
        assert_eq!(
            diff(&manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);
        assert_eq!(
            diff(&manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);
        assert_eq!(
            diff(&manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);
        assert_eq!(diff(&manifest, 0..10, PiiClass::Email), None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        // One manifest reused across assertions; `diff` borrows, so no clones.
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);
        assert_eq!(
            diff(&manifest, 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(&manifest, 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );
        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(&with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);
        assert_eq!(
            diff(&manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();
        assert_eq!(diff(&manifest, 3..3, PiiClass::Email), None);
    }

    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];
        // Rendered messages must never echo raw PII-looking payloads.
        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
/// A source of PII [`Candidate`]s. Implementations must be `Send + Sync`
/// so a single recognizer can be shared across detection workers.
pub trait Recognizer: Send + Sync {
/// Stable identifier for this recognizer. Presumably the value recorded
/// in [`Candidate::recognizer_id`] — confirm against implementors.
fn id(&self) -> &str;
/// The single [`PiiClass`] this recognizer is able to detect.
fn supported_class(&self) -> &PiiClass;
/// Scans `input` and returns every candidate match found, consulting
/// `ctx` for the locale chain and dictionary bundle.
fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
/// Token-family label for candidates this recognizer emits (mirrors
/// [`Candidate::token_family`]).
fn token_family(&self) -> &str;
/// Locales this recognizer applies to; defaults to the global locale.
fn locales(&self) -> &[LocaleTag] {
&[LocaleTag::Global]
}
}
/// A single PII match proposed by a [`Recognizer`], before overlapping
/// matches have been resolved.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
// Matched range within the scanned input (indices are bytes, not chars —
// consistent with span handling elsewhere in this file).
pub span: Range<usize>,
// PII class the recognizer assigned to this span.
pub class: PiiClass,
// Identifier of the recognizer that produced this candidate.
pub recognizer_id: String,
// Recognizer-assigned score; NOTE(review): presumably a confidence in
// [0, 1] — confirm against recognizer implementations.
pub score: f32,
// Priority used when competing candidates overlap; NOTE(review):
// higher-wins assumed — confirm against the conflict resolver.
pub priority: i32,
// Optional canonicalized form of the matched text.
pub canonical_form: Option<String>,
// Token-family label (mirrors [`Recognizer::token_family`]).
pub token_family: String,
// Free-form provenance label for this candidate.
pub source: String,
// Conflict-resolution tier that decided this candidate.
pub decided_by: ConflictTier,
// Sources folded into this candidate when matches were merged.
pub merged_sources: Vec<String>,
}
impl Candidate {
    /// Builds a candidate from its constituent parts. The stringly-typed
    /// arguments accept anything convertible into `String`.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        span: Range<usize>,
        class: PiiClass,
        recognizer_id: impl Into<String>,
        score: f32,
        priority: i32,
        canonical_form: Option<String>,
        token_family: impl Into<String>,
        source: impl Into<String>,
        decided_by: ConflictTier,
        merged_sources: Vec<String>,
    ) -> Self {
        // Materialize the owned strings up front, then assemble the struct.
        let recognizer_id = recognizer_id.into();
        let token_family = token_family.into();
        let source = source.into();
        Self {
            span,
            class,
            recognizer_id,
            score,
            priority,
            canonical_form,
            token_family,
            source,
            decided_by,
            merged_sources,
        }
    }

    /// Returns this candidate with its span replaced; all other fields
    /// are carried over unchanged.
    pub fn with_span(self, span: Range<usize>) -> Self {
        Self { span, ..self }
    }
}
/// Context handed to [`Recognizer::detect`] for a single detection run.
#[non_exhaustive]
pub struct DetectContext<'a> {
// Ordered locale preference chain for this run.
pub locale_chain: &'a [LocaleTag],
// Dictionaries available to lookup-based recognizers.
pub dictionaries: &'a DictionaryBundle,
// Placeholder for structured-field context; currently always the unit
// value (see `DetectContext::new`).
pub fields: &'a (),
// Interior-mutable flag (starts `false`, see `DetectContext::new`) that
// can be raised through a shared reference; NOTE(review): presumably set
// when detection runs degraded — confirm against recognizers.
pub degraded: Cell<bool>,
}
impl<'a> DetectContext<'a> {
pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
Self {
locale_chain,
dictionaries,
fields: &(),
degraded: Cell::new(false),
}
}
}
/// Appends `LocaleTag::Global` to the chain as a terminal fallback unless
/// it is already present somewhere in the chain.
fn ensure_global(tags: &mut Vec<LocaleTag>) {
    let has_global = tags.iter().any(|tag| *tag == LocaleTag::Global);
    if !has_global {
        tags.push(LocaleTag::Global);
    }
}
/// Cheap shape check for a BCP-47-style tag: an ASCII-alphabetic primary
/// language subtag of 2-8 characters, followed by zero or more ASCII
/// alphanumeric subtags of 2-8 characters. Validates shape only — it does
/// not consult the subtag registry, and (deliberately) rejects 1-character
/// singleton/private-use subtags such as `x`.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut subtags = raw.split('-');
    let language_ok = subtags.next().map_or(false, |language| {
        (2..=8).contains(&language.len()) && language.bytes().all(|b| b.is_ascii_alphabetic())
    });
    language_ok
        && subtags.all(|subtag| {
            (2..=8).contains(&subtag.len()) && subtag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
/// Normalizes the case of a locale tag: the primary language subtag is
/// lowercased, two-letter alphabetic subtags after it (region codes) are
/// uppercased, and every other subtag is lowercased. Subtags are re-joined
/// with `-`.
fn canonical_other(raw: &str) -> String {
    raw.split('-')
        .enumerate()
        .map(|(position, subtag)| {
            let looks_like_region =
                subtag.len() == 2 && subtag.bytes().all(|b| b.is_ascii_alphabetic());
            if position > 0 && looks_like_region {
                subtag.to_ascii_uppercase()
            } else {
                // Covers the language subtag itself plus scripts/variants.
                subtag.to_ascii_lowercase()
            }
        })
        .collect::<Vec<_>>()
        .join("-")
}