#![cfg_attr(docsrs, feature(doc_cfg))]
use std::cell::Cell;
use std::collections::{BTreeMap, HashMap};
use std::fmt;
use std::ops::Range;
use serde::{Deserialize, Serialize};
use sha3::{Digest, Keccak256};
use thiserror::Error;
/// A source of PII detections over raw text.
///
/// Implementations scan `input` and return every span they recognize;
/// conflict resolution between detectors happens elsewhere.
pub trait Detector: Send + Sync {
    /// Scans `input` and returns all detections found (possibly overlapping).
    fn detect(&self, input: &str) -> Vec<Detection>;
}
/// The class of personally identifiable information a detection belongs to.
///
/// The first four variants are the built-ins listed in [`BUILTIN_CLASS_NAMES`];
/// `Custom` carries a caller-defined class name (see `PiiClass::custom` for
/// the normalization applied when built through that constructor).
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
    Email,
    Name,
    Location,
    Organization,
    /// Caller-defined class name.
    Custom(String),
}
/// Display names of the built-in classes, indexed in declaration order
/// (Email, Name, Location, Organization) — see `PiiClass::class_name`.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
/// Collision-family names reserved for bundled recognizers.
/// NOTE(review): the reservation itself is presumably enforced during
/// rulepack/policy validation elsewhere — confirm at that site.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
    "us-9-digit-id",
    "iberian-id",
    "payment-card-or-iban",
    "phone-or-imei",
    "vin-or-serial",
    "mac-or-hex",
    "passport-or-doc-support",
    "national-13-digit",
    "italian-cf-or-serial",
    "german-personalausweis",
    "swedish-personnummer",
    "finnish-hetu",
];
/// A recognizer's membership in a collision family: the family and variant it
/// belongs to, its precedence among competing variants, and an optional
/// anchor token that must be present for the variant to win.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    pub family: String,
    pub variant: String,
    // NOTE(review): whether higher or lower values win is decided at the
    // resolution site, not visible here — confirm before relying on it.
    pub precedence: u32,
    pub mandatory_anchor: Option<String>,
}
impl CollisionMembership {
    /// Builds a membership record for one variant of a collision family.
    pub fn new(
        family: impl Into<String>,
        variant: impl Into<String>,
        precedence: u32,
        mandatory_anchor: Option<String>,
    ) -> Self {
        let family = family.into();
        let variant = variant.into();
        Self {
            family,
            variant,
            precedence,
            mandatory_anchor,
        }
    }
}
impl PiiClass {
    /// Parses a policy-file class name: the lowercase builtin names, or
    /// `custom:<name>` (normalized via [`PiiClass::custom`]).
    ///
    /// Returns `None` for unknown names and for `custom:` whose payload is
    /// empty or whitespace-only.
    pub fn from_policy_name(input: &str) -> Option<Self> {
        match input {
            "email" => Some(Self::Email),
            "name" => Some(Self::Name),
            "location" => Some(Self::Location),
            "organization" => Some(Self::Organization),
            other => {
                // Fix: `strip_prefix` removes the marker exactly once. The
                // previous `trim_start_matches("custom:")` stripped *repeated*
                // prefixes, so "custom:custom:x" collapsed to class "x"
                // instead of the custom name "custom:x".
                let name = other.strip_prefix("custom:")?;
                (!name.trim().is_empty()).then(|| Self::custom(name))
            }
        }
    }
    /// The four built-in classes, in canonical declaration order.
    pub fn builtin_variants() -> &'static [PiiClass] {
        &[
            PiiClass::Email,
            PiiClass::Name,
            PiiClass::Location,
            PiiClass::Organization,
        ]
    }
    /// Builds a `Custom` class with a normalized name: trimmed, ASCII
    /// alphanumerics lowercased, and every run of other characters collapsed
    /// to a single `_` (never leading or trailing).
    pub fn custom(name: &str) -> Self {
        // Pre-size: the normalized name can never exceed the input length.
        let mut normalized = String::with_capacity(name.len());
        let mut pending_underscore = false;
        for ch in name.trim().chars() {
            if ch.is_ascii_alphanumeric() {
                if pending_underscore && !normalized.is_empty() {
                    normalized.push('_');
                }
                normalized.push(ch.to_ascii_lowercase());
                pending_underscore = false;
            } else {
                pending_underscore = true;
            }
        }
        Self::Custom(normalized)
    }
    /// Returns the custom name for `Custom` variants, `None` for builtins.
    pub fn as_custom_name(&self) -> Option<&str> {
        match self {
            Self::Custom(name) => Some(name.as_str()),
            Self::Email | Self::Name | Self::Location | Self::Organization => None,
        }
    }
    /// Human-readable class name (`"Email"`, …, or `"Custom:<name>"`).
    pub fn class_name(&self) -> String {
        match self {
            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
            Self::Custom(name) => format!("Custom:{name}"),
        }
    }
    /// Canonical serialized form (`"email"`, …, or `"custom:<name>"`); the
    /// inverse of [`PiiClass::from_canonical_str`].
    pub fn to_canonical_str(&self) -> String {
        match self {
            Self::Email => "email".to_string(),
            Self::Name => "name".to_string(),
            Self::Location => "location".to_string(),
            Self::Organization => "organization".to_string(),
            Self::Custom(name) => format!("custom:{name}"),
        }
    }
    /// Parses the canonical form produced by [`PiiClass::to_canonical_str`];
    /// the capitalized builtin spellings are also accepted. Unlike
    /// `from_policy_name`, custom names are taken verbatim (no normalization)
    /// so that serialize → deserialize round-trips exactly.
    pub fn from_canonical_str(value: &str) -> Option<Self> {
        match value {
            "email" | "Email" => Some(Self::Email),
            "name" | "Name" => Some(Self::Name),
            "location" | "Location" => Some(Self::Location),
            "organization" | "Organization" => Some(Self::Organization),
            other => {
                let name = other.strip_prefix("custom:")?;
                (!name.is_empty()).then(|| Self::Custom(name.to_string()))
            }
        }
    }
}
/// Newtype around [`PiiClass`] whose serde representation is the canonical
/// string form (`to_canonical_str` / `from_canonical_str`) rather than the
/// derived enum encoding.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct PiiClassAudit(pub PiiClass);
impl PiiClassAudit {
pub fn new(class: PiiClass) -> Self {
Self(class)
}
pub fn into_inner(self) -> PiiClass {
self.0
}
}
impl Serialize for PiiClassAudit {
    /// Serializes as the canonical string form (e.g. `"email"`, `"custom:x"`).
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(&self.0.to_canonical_str())
    }
}
impl<'de> Deserialize<'de> for PiiClassAudit {
    /// Deserializes from the canonical string form; rejects unknown forms
    /// with a descriptive error instead of silently defaulting.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let value = String::deserialize(deserializer)?;
        PiiClass::from_canonical_str(&value)
            .map(Self)
            .ok_or_else(|| {
                serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
            })
    }
}
/// `#[serde(with = …)]` adapter that routes a plain [`PiiClass`] field
/// through [`PiiClassAudit`]'s canonical string representation.
mod pii_class_audit_serde {
    use super::{PiiClass, PiiClassAudit};
    use serde::{Deserialize, Deserializer, Serialize, Serializer};
    /// Serializes `class` as its canonical string form.
    pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        PiiClassAudit::new(class.clone()).serialize(serializer)
    }
    /// Deserializes a canonical string form back into a [`PiiClass`].
    pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
    where
        D: Deserializer<'de>,
    {
        Ok(PiiClassAudit::deserialize(deserializer)?.into_inner())
    }
}
/// A candidate detection that lost conflict resolution, recorded for audit.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LosingCandidate {
    /// Serialized via the canonical string form (see `pii_class_audit_serde`).
    #[serde(with = "pii_class_audit_serde")]
    pub class: PiiClass,
    pub recognizer_id: String,
}
impl LosingCandidate {
    /// Records a candidate that lost conflict resolution.
    pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
        let recognizer_id = recognizer_id.into();
        Self { recognizer_id, class }
    }
}
/// Audit record describing an ambiguous detection: the class the ambiguity
/// was resolved to, the candidates that lost, and why the ambiguity arose.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct AmbiguityRecord {
    /// Serialized via the canonical string form (see `pii_class_audit_serde`).
    #[serde(with = "pii_class_audit_serde")]
    pub ambiguity_class: PiiClass,
    pub losing_candidates: Vec<LosingCandidate>,
    pub reason: AmbiguityReason,
}
impl AmbiguityRecord {
    /// Bundles the resolved class with the candidates it beat and the reason.
    pub fn new(
        ambiguity_class: PiiClass,
        losing_candidates: Vec<LosingCandidate>,
        reason: AmbiguityReason,
    ) -> Self {
        Self {
            reason,
            losing_candidates,
            ambiguity_class,
        }
    }
}
/// Why a detection was considered ambiguous. Serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
    /// No mandatory anchor token was found near the match.
    NoAnchor,
    /// A validator could neither confirm nor reject the match.
    ValidatorIndeterminate,
    /// The span matched recognizers from more than one collision family.
    MultiFamilyMatch,
    /// Competing variants had equal precedence.
    PrecedenceTie,
}
/// Why a [`ValidatorKind`] rejected a candidate. Serialized in snake_case;
/// the `alias` attributes keep older wire names deserializable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
    LuhnFailed,
    IbanMod97Failed,
    // Old wire name: "email_rfc_failed".
    #[serde(alias = "email_rfc_failed")]
    EmailRfcRejected,
    // Old wire name: "e164_phone_failed".
    #[serde(alias = "e164_phone_failed")]
    PhoneE164Rejected,
    PhoneNationalRegionMismatch,
    Ipv4ParseFailed,
    Ipv6ParseFailed,
    EthEip55ChecksumFailed,
}
/// Result of running a validator over a candidate span.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
    /// Validation succeeded; may carry a canonicalized rendering of the input.
    Pass { canonical_form: Option<String> },
    /// Validation failed for the stated reason.
    Fail { reason: ValidatorFailReason },
    /// The validator does not apply to this candidate.
    NotApplicable,
}
/// Error returned by [`ValidatorKind::parse`] for unknown validator names
/// (including feature-gated names when the feature is disabled).
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
    #[error("unsupported validator: {kind}")]
    UnsupportedValidator {
        kind: String,
    },
}
/// The checksum/format validators the pipeline can run on a candidate span.
/// Phone validators exist only with the `phone-parser` feature enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
    EmailRfc,
    #[cfg(feature = "phone-parser")]
    E164Phone,
    /// Phone number validated against a specific national region.
    #[cfg(feature = "phone-parser")]
    E164PhoneNational(Region),
    Luhn,
    IbanMod97,
    Ipv4Parse,
    Ipv6Parse,
    EthEip55,
}
/// National region for phone-number validation (only with `phone-parser`).
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
    De,
    Us,
}
impl ValidatorKind {
    /// Parses a validator name from its snake_case identifier.
    ///
    /// Feature-gated names (`e164_phone*`) are only recognized when the
    /// `phone-parser` feature is enabled; otherwise they fall through to
    /// `UnsupportedValidator`.
    pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
        match s {
            "email_rfc" => Ok(Self::EmailRfc),
            #[cfg(feature = "phone-parser")]
            "e164_phone" => Ok(Self::E164Phone),
            #[cfg(feature = "phone-parser")]
            "e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
            #[cfg(feature = "phone-parser")]
            "e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
            "luhn" => Ok(Self::Luhn),
            "iban_mod97" => Ok(Self::IbanMod97),
            "ipv4_parse" => Ok(Self::Ipv4Parse),
            "ipv6_parse" => Ok(Self::Ipv6Parse),
            "eth_eip55" => Ok(Self::EthEip55),
            other => Err(ValidatorKindParseError::UnsupportedValidator {
                kind: other.to_string(),
            }),
        }
    }
    /// Convenience: true when `input` passes this validator.
    pub fn validates(self, input: &str) -> bool {
        self.canonical_form(input).is_some()
    }
    /// Runs the validator, mapping success to `Pass` (with the canonical
    /// rendering) and failure to `Fail` with this kind's fail reason.
    /// Note: never returns `NotApplicable` from this path.
    pub fn validate(self, input: &str) -> ValidatorOutcome {
        match self.canonical_form(input) {
            Some(canonical_form) => ValidatorOutcome::Pass {
                canonical_form: Some(canonical_form),
            },
            None => ValidatorOutcome::Fail {
                reason: self.fail_reason(),
            },
        }
    }
    /// Returns the canonical rendering of `input` when valid, `None` when
    /// invalid. Most validators return the input unchanged; national phone
    /// validation reformats to E.164.
    pub fn canonical_form(self, input: &str) -> Option<String> {
        match self {
            Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
            #[cfg(feature = "phone-parser")]
            Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
            #[cfg(feature = "phone-parser")]
            Self::E164PhoneNational(region) => validate_phone_national(region, input),
            Self::Luhn => luhn_check(input).then(|| input.to_string()),
            Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
            Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
            Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
            Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
        }
    }
    /// The [`ValidatorFailReason`] reported when this validator rejects input.
    pub fn fail_reason(self) -> ValidatorFailReason {
        match self {
            Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
            #[cfg(feature = "phone-parser")]
            Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
            #[cfg(feature = "phone-parser")]
            Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
            Self::Luhn => ValidatorFailReason::LuhnFailed,
            Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
            Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
            Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
            Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
        }
    }
}
/// Lightweight structural email check: non-empty local part before the first
/// '@', and a domain containing a '.' neither at its start nor at its end.
/// Intentionally far looser than full RFC 5321/5322 validation.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        None => false,
        Some((local, _)) if local.is_empty() => false,
        Some((_, domain)) => {
            domain.contains('.') && !domain.starts_with('.') && !domain.ends_with('.')
        }
    }
}
/// True when `input` parses (without a region hint) and is a valid phone
/// number according to the `phonenumber` crate.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    phonenumber::parse(None, input).is_ok_and(|phone| phonenumber::is_valid(&phone))
}
/// Validates `input` as a phone number for the given national `region` and,
/// on success, returns it reformatted in E.164.
///
/// The parsed number must carry the region's country code (49 for DE, 1 for
/// US). Besides genuinely valid numbers, known-safe fixture numbers (see
/// `is_safe_fixture_phone`) are accepted so test data passes validation.
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let country = match region {
        Region::De => phonenumber::country::DE,
        Region::Us => phonenumber::country::US,
    };
    let expected_code = match region {
        Region::De => 49,
        Region::Us => 1,
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    // Reject numbers whose parsed country code does not match the region.
    if number.country().code() != expected_code {
        return None;
    }
    if number.is_valid() || is_safe_fixture_phone(region, input) {
        return Some(number.format().mode(phonenumber::Mode::E164).to_string());
    }
    None
}
/// Recognizes reserved/fictional fixture phone numbers by digit sequence.
///
/// US: 555-01XX numbers (the digits after the area code start with "55501"),
/// with or without the leading 1 country code. DE: a fixed allowlist.
/// NOTE(review): the bare "15550100" arm is 8 digits (no area code) —
/// presumably a short fixture form; confirm against the fixture corpus.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    // Compare on digits only, ignoring formatting characters.
    let digits = input
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>();
    match region {
        Region::Us => {
            digits == "15550100"
                || matches!(digits.strip_prefix('1'), Some(rest) if rest.len() == 10 && rest[3..].starts_with("55501"))
        }
        Region::De => matches!(
            digits.as_str(),
            "493000000000"
                | "4915100000000"
                | "4915550112233"
                | "015550112233"
                | "491710000000"
                | "01710000000"
        ),
    }
}
/// Luhn (mod-10) checksum over 13–19 digits; ASCII whitespace and hyphens
/// are ignored as separators, any other non-digit fails immediately.
fn luhn_check(input: &str) -> bool {
    let mut digits: Vec<u32> = Vec::with_capacity(input.len());
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(u32::from(byte - b'0')),
            b'-' => {}
            _ if byte.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    // Payment-card lengths only.
    if digits.len() < 13 || digits.len() > 19 {
        return false;
    }
    // Double every second digit from the right; 10..18 fold to 1..9.
    let mut sum = 0u32;
    for (position, &digit) in digits.iter().rev().enumerate() {
        let contribution = if position % 2 == 1 {
            let doubled = digit * 2;
            if doubled > 9 { doubled - 9 } else { doubled }
        } else {
            digit
        };
        sum += contribution;
    }
    sum % 10 == 0
}
/// IBAN mod-97 check (ISO 13616): move the first four characters to the end,
/// map letters A–Z to 10–35, and require the resulting number ≡ 1 (mod 97).
/// Length must be 15–34 after canonicalization; only ASCII alphanumerics pass.
fn iban_mod97_check(input: &str) -> bool {
    let canonical = iban_canonicalize(input);
    let length_ok = (15..=34).contains(&canonical.len());
    if !length_ok || !canonical.chars().all(|ch| ch.is_ascii_alphanumeric()) {
        return false;
    }
    // Safe to slice by byte: the alphanumeric check guarantees ASCII.
    let rearranged = canonical[4..].chars().chain(canonical[..4].chars());
    let mut remainder = 0u32;
    for ch in rearranged {
        if let Some(digit) = ch.to_digit(10) {
            remainder = (remainder * 10 + digit) % 97;
        } else if ch.is_ascii_uppercase() {
            // Letters expand to two decimal digits (A=10 … Z=35).
            let value = u32::from(ch) - u32::from('A') + 10;
            remainder = (remainder * 10 + value / 10) % 97;
            remainder = (remainder * 10 + value % 10) % 97;
        } else {
            return false;
        }
    }
    remainder == 1
}
/// Canonicalizes IBAN input: drops ASCII whitespace and uppercases the rest.
/// No validation happens here — see `iban_mod97_check`.
fn iban_canonicalize(input: &str) -> String {
    let mut canonical = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_ascii_whitespace() {
            continue;
        }
        canonical.extend(ch.to_uppercase());
    }
    canonical
}
/// True when `input` parses as a strict dotted-quad IPv4 address.
fn ipv4_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv4Addr, _> = input.parse();
    parsed.is_ok()
}
/// True when `input` parses as an IPv6 address (including `::` shorthand).
fn ipv6_parse_check(input: &str) -> bool {
    let parsed: Result<std::net::Ipv6Addr, _> = input.parse();
    parsed.is_ok()
}
/// EIP-55 checksum validation for Ethereum addresses.
///
/// Requires the "0x" prefix and exactly 40 hex digits. All-lowercase and
/// all-uppercase addresses carry no checksum information and are accepted
/// as-is (standard EIP-55 tolerance). Mixed-case addresses must match the
/// checksum: each letter is uppercase iff the corresponding nibble of
/// Keccak-256(lowercased address) is > 7.
fn eth_eip55_check(input: &str) -> bool {
    let Some(address) = input.strip_prefix("0x") else {
        return false;
    };
    if address.len() != 40 || !address.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return false;
    }
    // Uniform case: no checksum encoded, accept.
    if address
        .bytes()
        .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_lowercase())
        || address
            .bytes()
            .all(|byte| !byte.is_ascii_alphabetic() || byte.is_ascii_uppercase())
    {
        return true;
    }
    let lowercase = address.to_ascii_lowercase();
    let hash = Keccak256::digest(lowercase.as_bytes());
    for (index, byte) in address.bytes().enumerate() {
        // Digits carry no case; only letters are checksum-bearing.
        if byte.is_ascii_digit() {
            continue;
        }
        // Hex char `index` maps to the high nibble of hash byte index/2 when
        // even, the low nibble when odd.
        let hash_nibble = if index % 2 == 0 {
            hash[index / 2] >> 4
        } else {
            hash[index / 2] & 0x0f
        };
        if (hash_nibble > 7) != byte.is_ascii_uppercase() {
            return false;
        }
    }
    true
}
/// A single PII hit in the scanned input: byte span, class, and the
/// identifier of the detector/recognizer that produced it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
    pub span: Range<usize>,
    pub class: PiiClass,
    pub source: String,
}
impl Detection {
    /// Creates a detection covering `span`, attributed to `source`.
    pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
        let source = source.into();
        Self { source, class, span }
    }
}
/// A post-sanitization checker that re-scans the *clean* text for residual
/// PII the main pipeline may have missed.
pub trait SafetyNet: Send + Sync {
    /// Stable identifier, recorded in [`LeakSuspect::safety_net_id`].
    fn id(&self) -> &str;
    /// Locales this net supports.
    /// NOTE(review): matching semantics live at the call site — confirm
    /// whether an empty slice means "all", as in `LocaleChain::intersects`.
    fn supported_locales(&self) -> &[LocaleTag];
    /// Scans `clean_text` and returns leak suspects, or an error when the net
    /// cannot run (missing weights, oversized input, runtime failure, …).
    fn check(
        &self,
        clean_text: &str,
        context: SafetyNetContext<'_>,
    ) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
/// Per-document inputs handed to a [`SafetyNet::check`] call: the emitted-span
/// manifest to diff against, the active locales, and optional provenance.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
    pub manifest: &'a Manifest,
    pub locale_chain: &'a [LocaleTag],
    pub document_kind: DocumentKind,
    pub session_id: Option<&'a str>,
    /// Path of the field being checked, for structured documents.
    pub field_path: Option<&'a str>,
}
impl<'a> SafetyNetContext<'a> {
pub fn new(
manifest: &'a Manifest,
locale_chain: &'a [LocaleTag],
document_kind: DocumentKind,
session_id: Option<&'a str>,
field_path: Option<&'a str>,
) -> Self {
Self {
manifest,
locale_chain,
document_kind,
session_id,
field_path,
}
}
}
/// A redaction token emitted by the pipeline, with its position in both the
/// clean (sanitized) output and the raw input.
/// NOTE(review): clean/raw mapping inferred from field names and from
/// `Manifest::diff_against` operating on `clean_span` — confirm at emit site.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
    pub clean_span: Range<usize>,
    pub raw_span: Range<usize>,
    pub class: PiiClass,
}
impl EmittedTokenSpan {
    /// Pairs a clean-text span with its raw-text counterpart and class.
    pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
        Self {
            class,
            raw_span,
            clean_span,
        }
    }
}
/// All token spans emitted for one document. `from_spans` keeps them sorted
/// by `clean_span` (start, end), which `diff_against` relies on.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
    pub spans: Vec<EmittedTokenSpan>,
}
impl Manifest {
    /// Builds a manifest with spans sorted by (start, end) of `clean_span`,
    /// which the `partition_point` in `diff_against` requires.
    pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
        spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
        Self { spans }
    }
    /// Classifies how a safety-net suspect relates to the emitted spans.
    ///
    /// Returns `None` when the suspect is empty or fully covered by spans of
    /// the same class; `Uncovered` when nothing overlaps; `PartialBleed` with
    /// the first gap when coverage is incomplete; `ClassMismatch` when fully
    /// covered but the first overlapping span of a different class disagrees.
    pub fn diff_against(
        &self,
        suspect_span: &Range<usize>,
        suspect_class: &PiiClass,
    ) -> Option<LeakKind> {
        if suspect_span.is_empty() {
            return None;
        }
        // Spans are sorted by start; skip all spans ending at/before the
        // suspect's start, then walk forward while spans can still overlap.
        let start_idx = self
            .spans
            .partition_point(|span| span.clean_span.end <= suspect_span.start);
        let overlapping = self.spans[start_idx..]
            .iter()
            .take_while(|span| span.clean_span.start < suspect_span.end)
            .filter(|span| ranges_overlap(&span.clean_span, suspect_span))
            .collect::<Vec<_>>();
        if overlapping.is_empty() {
            return Some(LeakKind::Uncovered);
        }
        // Sweep left-to-right, advancing `cursor` through covered territory.
        let mut cursor = suspect_span.start;
        let mut first_mismatch = None::<&EmittedTokenSpan>;
        for span in overlapping {
            if span.clean_span.start > cursor {
                // Gap before this span: report the first uncovered sub-range.
                return Some(LeakKind::PartialBleed {
                    uncovered: cursor..span.clean_span.start.min(suspect_span.end),
                });
            }
            if span.clean_span.end > cursor {
                if first_mismatch.is_none() && &span.class != suspect_class {
                    first_mismatch = Some(span);
                }
                cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
                if cursor >= suspect_span.end {
                    break;
                }
            }
        }
        if cursor < suspect_span.end {
            // Ran out of spans before reaching the suspect's tail.
            return Some(LeakKind::PartialBleed {
                uncovered: cursor..suspect_span.end,
            });
        }
        // Fully covered: report a mismatch only if some overlapping span
        // disagreed with the suspect's class.
        first_mismatch.map(|span| LeakKind::ClassMismatch {
            pipeline_class: span.class.clone(),
            safety_net_class: suspect_class.clone(),
        })
    }
}
/// Half-open interval overlap test: true iff the ranges share an index.
/// (An empty range never satisfies both strict inequalities against itself,
/// but can still "overlap" a surrounding non-empty range per this predicate.)
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    let left_reaches_right = left.start < right.end;
    let right_reaches_left = right.start < left.end;
    left_reaches_right && right_reaches_left
}
/// A potential PII leak reported by a safety net over the clean text.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
    /// Byte span in the clean text.
    pub span: Range<usize>,
    pub class: PiiClass,
    /// Id of the reporting net (see [`SafetyNet::id`]).
    pub safety_net_id: String,
    /// Model confidence, when the net provides one.
    pub score: Option<f32>,
    pub kind: LeakKind,
    /// The net's own label before mapping onto [`PiiClass`].
    pub raw_label: String,
    /// Field path for structured documents.
    pub field_path: Option<String>,
}
impl LeakSuspect {
    /// Assembles a suspect record reported by the net `safety_net_id`.
    pub fn new(
        span: Range<usize>,
        class: PiiClass,
        safety_net_id: impl Into<String>,
        score: Option<f32>,
        kind: LeakKind,
        raw_label: impl Into<String>,
        field_path: Option<String>,
    ) -> Self {
        let safety_net_id = safety_net_id.into();
        let raw_label = raw_label.into();
        Self {
            span,
            class,
            safety_net_id,
            score,
            kind,
            raw_label,
            field_path,
        }
    }
}
/// How a suspect span relates to the pipeline's emitted token spans
/// (computed by [`Manifest::diff_against`]).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
    /// No emitted span overlaps the suspect at all.
    Uncovered,
    /// Coverage is incomplete; `uncovered` is the first gap found.
    PartialBleed {
        uncovered: Range<usize>,
    },
    /// Fully covered, but the pipeline classified the span differently.
    ClassMismatch {
        pipeline_class: PiiClass,
        safety_net_class: PiiClass,
    },
}
/// Non-suspect events recorded while running safety nets.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
    /// A net was skipped because it does not support the active locales.
    LocaleSkipped {
        safety_net_id: String,
        document_kind: DocumentKind,
        field_path: Option<String>,
    },
}
/// Aggregate counters derived from a [`LeakReport`]'s suspects and telemetry
/// (see `LeakReport::from_parts`).
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
    pub suspect_count: usize,
    pub uncovered_count: usize,
    pub partial_bleed_count: usize,
    pub class_mismatch_count: usize,
    pub locale_skipped_count: usize,
}
/// Integrity metadata attached to a processed document: SHA-256 digests of
/// the produced artifacts, page count, audit session, and optional span and
/// codec audit trails. Construct via [`DocumentExtension::builder`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
    pub schema_version: u16,
    pub clean_md_sha256: [u8; 32],
    pub layout_json_sha256: [u8; 32],
    pub report_json_sha256: [u8; 32],
    /// Only present when a preview image was rendered.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub preview_png_sha256: Option<[u8; 32]>,
    pub page_count: u32,
    pub audit_session_id: String,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean_spans: Vec<EmittedTokenSpan>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub codec_audit: Vec<CodecAuditRow>,
}
impl DocumentExtension {
    /// Starts a builder; the three artifact hashes, the page count, and the
    /// audit session id are mandatory before `build()` succeeds.
    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
        DocumentExtensionBuilder {
            schema_version,
            clean_md_sha256: None,
            layout_json_sha256: None,
            report_json_sha256: None,
            preview_png_sha256: None,
            page_count: None,
            audit_session_id: None,
            clean_spans: Vec::new(),
            codec_audit: Vec::new(),
        }
    }
}
/// Builder for [`DocumentExtension`]; `build()` reports the first missing
/// mandatory field via [`DocumentExtensionError::MissingField`].
#[derive(Debug, Clone)]
#[must_use]
pub struct DocumentExtensionBuilder {
    schema_version: u16,
    clean_md_sha256: Option<[u8; 32]>,
    layout_json_sha256: Option<[u8; 32]>,
    report_json_sha256: Option<[u8; 32]>,
    preview_png_sha256: Option<[u8; 32]>,
    page_count: Option<u32>,
    audit_session_id: Option<String>,
    clean_spans: Vec<EmittedTokenSpan>,
    codec_audit: Vec<CodecAuditRow>,
}
impl DocumentExtensionBuilder {
    /// Sets the SHA-256 of the clean markdown artifact (mandatory).
    pub fn clean_md_sha256(mut self, hash: [u8; 32]) -> Self {
        self.clean_md_sha256 = Some(hash);
        self
    }
    /// Sets the SHA-256 of the layout JSON artifact (mandatory).
    pub fn layout_json_sha256(mut self, hash: [u8; 32]) -> Self {
        self.layout_json_sha256 = Some(hash);
        self
    }
    /// Sets the SHA-256 of the report JSON artifact (mandatory).
    pub fn report_json_sha256(mut self, hash: [u8; 32]) -> Self {
        self.report_json_sha256 = Some(hash);
        self
    }
    /// Sets the SHA-256 of the preview PNG (optional).
    pub fn preview_png_sha256(mut self, hash: [u8; 32]) -> Self {
        self.preview_png_sha256 = Some(hash);
        self
    }
    /// Sets the page count (mandatory).
    pub fn page_count(mut self, page_count: u32) -> Self {
        self.page_count = Some(page_count);
        self
    }
    /// Sets the audit session id (mandatory).
    pub fn audit_session_id(mut self, audit_session_id: impl Into<String>) -> Self {
        self.audit_session_id = Some(audit_session_id.into());
        self
    }
    /// Replaces the clean-span trail (optional; defaults to empty).
    pub fn clean_spans(mut self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
        self.clean_spans = clean_spans;
        self
    }
    /// Replaces the codec audit trail (optional; defaults to empty).
    pub fn codec_audit(mut self, codec_audit: Vec<CodecAuditRow>) -> Self {
        self.codec_audit = codec_audit;
        self
    }
    /// Finalizes the extension.
    ///
    /// # Errors
    /// Returns `MissingField` naming the first unset mandatory field.
    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
        Ok(DocumentExtension {
            schema_version: self.schema_version,
            clean_md_sha256: self
                .clean_md_sha256
                .ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?,
            layout_json_sha256: self
                .layout_json_sha256
                .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?,
            report_json_sha256: self
                .report_json_sha256
                .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?,
            preview_png_sha256: self.preview_png_sha256,
            page_count: self
                .page_count
                .ok_or(DocumentExtensionError::MissingField("page_count"))?,
            audit_session_id: self
                .audit_session_id
                .ok_or(DocumentExtensionError::MissingField("audit_session_id"))?,
            clean_spans: self.clean_spans,
            codec_audit: self.codec_audit,
        })
    }
}
/// Errors from [`DocumentExtensionBuilder::build`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
    /// A mandatory builder field was never set; carries the field name.
    #[error("missing document extension field: {0}")]
    MissingField(&'static str),
}
/// How the text content of a document was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
    Ocr,
    EmbeddedText,
    Transcript,
    /// Mixture of the other origins within one document.
    Hybrid,
}
/// Capability flags a codec can advertise or deliver; `Default` is all-false.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecCapabilitySet {
    pub text: bool,
    pub layout: bool,
    pub confidence: bool,
    pub timestamps: bool,
}
impl CodecCapabilitySet {
    /// A set advertising text extraction only.
    pub const TEXT_ONLY: Self = Self {
        text: true,
        layout: false,
        confidence: false,
        timestamps: false,
    };
    /// Const constructor naming each capability flag explicitly.
    pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
        Self {
            timestamps,
            confidence,
            layout,
            text,
        }
    }
    /// True when every capability set in `requested` is also set in `self`
    /// (i.e. `requested` is a subset of `self`).
    pub fn contains(self, requested: Self) -> bool {
        let text_ok = self.text || !requested.text;
        let layout_ok = self.layout || !requested.layout;
        let confidence_ok = self.confidence || !requested.confidence;
        let timestamps_ok = self.timestamps || !requested.timestamps;
        text_ok && layout_ok && confidence_ok && timestamps_ok
    }
}
/// Whether a codec must meet a minimum extraction density.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
    /// Minimum required density threshold.
    /// NOTE(review): units/scale of the f32 not visible here — confirm.
    Required(f32),
    /// Density check waived, with a machine-readable reason.
    Exempt { reason: String },
}
impl Default for ExtractionDensityPolicy {
fn default() -> Self {
Self::Exempt {
reason: "calibration_pending".to_string(),
}
}
}
/// Audit record for one codec invocation: identity, accepted MIME type,
/// advertised vs. actually delivered capabilities, and provenance.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecAuditRow {
    pub codec_id: String,
    pub codec_version: String,
    pub accepted_mime: String,
    /// What the codec claimed it could produce.
    pub advertised: CodecCapabilitySet,
    /// What it actually produced for this document.
    pub delivered: CodecCapabilitySet,
    pub text_origin: TextOrigin,
    pub codec_output_schema_version: u16,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub options_hash_hex: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine_provenance: Option<String>,
    pub extraction_density_policy: ExtractionDensityPolicy,
}
impl CodecAuditRow {
    /// Creates a row with empty capability sets, output schema version 1, and
    /// no options hash / provenance; density policy defaults to exempt.
    pub fn new(
        codec_id: impl Into<String>,
        codec_version: impl Into<String>,
        accepted_mime: impl Into<String>,
        text_origin: TextOrigin,
    ) -> Self {
        let codec_id = codec_id.into();
        let codec_version = codec_version.into();
        let accepted_mime = accepted_mime.into();
        Self {
            codec_id,
            codec_version,
            accepted_mime,
            advertised: CodecCapabilitySet::default(),
            delivered: CodecCapabilitySet::default(),
            text_origin,
            codec_output_schema_version: 1,
            options_hash_hex: None,
            engine_provenance: None,
            extraction_density_policy: ExtractionDensityPolicy::default(),
        }
    }
}
/// Combined output of a safety-net pass: suspects, telemetry, derived stats,
/// and an optional replay hash.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
    pub suspects: Vec<LeakSuspect>,
    pub telemetry: Vec<LeakReportTelemetry>,
    pub stats: LeakReportStats,
    /// NOTE(review): semantics not visible here; `from_parts` always leaves
    /// it `None` and `extend` resets it — confirm who sets this.
    pub replay_hash: Option<String>,
}
impl LeakReport {
    /// Builds a report, deriving `stats` by counting suspects per
    /// [`LeakKind`] and `LocaleSkipped` telemetry events. `replay_hash`
    /// starts unset.
    pub fn from_parts(
        suspects: Vec<LeakSuspect>,
        telemetry: Vec<LeakReportTelemetry>,
    ) -> LeakReport {
        let mut stats = LeakReportStats {
            suspect_count: suspects.len(),
            locale_skipped_count: telemetry
                .iter()
                .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
                .count(),
            ..LeakReportStats::default()
        };
        for suspect in &suspects {
            match suspect.kind {
                LeakKind::Uncovered => stats.uncovered_count += 1,
                LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
                LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
            }
        }
        LeakReport {
            suspects,
            telemetry,
            stats,
            replay_hash: None,
        }
    }
    /// Merges `other` into `self`, recomputing stats from scratch.
    ///
    /// NOTE(review): rebuilding through `from_parts` resets `self.replay_hash`
    /// to `None` and drops `other.replay_hash` — presumably intentional since
    /// a merged report no longer corresponds to one replay; confirm.
    pub fn extend(&mut self, other: LeakReport) {
        self.suspects.extend(other.suspects);
        self.telemetry.extend(other.telemetry);
        *self = LeakReport::from_parts(
            std::mem::take(&mut self.suspects),
            std::mem::take(&mut self.telemetry),
        );
    }
}
/// Label taxonomy with snake_case wire strings (see `as_str`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
    PrivatePerson,
    PrivateAddress,
    PrivateEmail,
    PrivatePhone,
    PrivateUrl,
    PrivateDate,
    AccountNumber,
    Secret,
}
impl OpenAiPrivateLabel {
pub fn as_str(self) -> &'static str {
match self {
Self::PrivatePerson => "private_person",
Self::PrivateAddress => "private_address",
Self::PrivateEmail => "private_email",
Self::PrivatePhone => "private_phone",
Self::PrivateUrl => "private_url",
Self::PrivateDate => "private_date",
Self::AccountNumber => "account_number",
Self::Secret => "secret",
}
}
}
/// PII taxonomy used by safety nets; maps onto the pipeline's [`PiiClass`]
/// via `to_pii_class`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
    Email,
    Name,
    Location,
    Phone,
    Url,
    Date,
    AccountNumber,
    Secret,
}
impl SafetyNetPiiClass {
pub fn to_pii_class(self) -> PiiClass {
match self {
Self::Email => PiiClass::Email,
Self::Name => PiiClass::Name,
Self::Location => PiiClass::Location,
Self::Phone => PiiClass::custom("phone"),
Self::Url => PiiClass::custom("url"),
Self::Date => PiiClass::custom("date"),
Self::AccountNumber => PiiClass::custom("account_number"),
Self::Secret => PiiClass::custom("secret"),
}
}
}
/// Failure modes of [`SafetyNet::check`].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
    /// The net cannot run at all in this environment.
    #[error("safety net unavailable: {reason}")]
    Unavailable {
        reason: String,
    },
    /// Model weight file not found at the expected path.
    #[error("safety net weights missing: {path}")]
    WeightsMissing {
        path: String,
    },
    #[error("safety net model unavailable: {reason}")]
    ModelUnavailable {
        reason: String,
    },
    /// Input exceeded the net's size limit.
    #[error("safety net input too large: limit={limit}, actual={actual}")]
    InputTooLarge {
        limit: usize,
        actual: usize,
    },
    /// The underlying runtime (inference engine) failed.
    #[error("safety net runtime failed: {message}")]
    Runtime {
        message: String,
    },
    /// The net produced output the caller could not interpret.
    #[error("safety net invalid output: {message}")]
    InvalidOutput {
        message: String,
    },
}
/// What the pipeline does with a detected span.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
    /// Replace with a reversible token.
    Tokenize,
    /// Remove/blank the content irreversibly.
    Redact,
    /// Replace while preserving the original format/shape.
    FormatPreserve,
    /// Replace with a less specific value.
    Generalize,
    /// Leave the span untouched.
    Preserve,
}
/// Which rule decided the winner when detections conflicted.
/// NOTE(review): variant order presumably mirrors the resolution cascade —
/// confirm against the resolver before relying on ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
    /// No conflict occurred.
    None,
    ClassPriority,
    RulePriority,
    Score,
    SpanLength,
    Validator,
    ValidatorVeto,
    CollisionPolicy,
    AnchoredContext,
    RecognizerId,
    /// Overlapping detections were merged rather than one winning.
    Merged,
}
/// Shape of the processed document: keyed fields vs. free text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
    Structured,
    Text,
}
/// One audit-log row describing a redaction decision, including conflict
/// resolution outcome and optional validator/ambiguity/collision metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
    /// Recognizer/detector that produced the winning detection.
    pub source: String,
    pub class: PiiClass,
    pub action: Action,
    /// Field name for structured documents.
    pub field_name: Option<String>,
    pub document_kind: DocumentKind,
    /// True when this entry records a conflict loser rather than the winner.
    pub conflict_loser: bool,
    pub decided_by: ConflictTier,
    /// Creation timestamp.
    /// NOTE(review): epoch/resolution not visible here — confirm (likely Unix seconds or millis).
    pub created_at: i64,
    pub session_id: Option<String>,
    pub validator_fail_reason: Option<ValidatorFailReason>,
    pub ambiguity_record: Option<AmbiguityRecord>,
    pub collision_family: Option<String>,
    pub collision_variant: Option<String>,
}
impl RedactionEntry {
    /// Creates an entry with all optional audit metadata unset; attach it via
    /// the `with_*` builder methods below.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        source: impl Into<String>,
        class: PiiClass,
        action: Action,
        field_name: Option<String>,
        document_kind: DocumentKind,
        conflict_loser: bool,
        decided_by: ConflictTier,
        created_at: i64,
        session_id: Option<String>,
    ) -> Self {
        Self {
            source: source.into(),
            class,
            action,
            field_name,
            document_kind,
            conflict_loser,
            decided_by,
            created_at,
            session_id,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
        }
    }
    /// Attaches the reason a validator rejected this candidate.
    pub fn with_validator_fail_reason(mut self, reason: ValidatorFailReason) -> Self {
        self.validator_fail_reason = Some(reason);
        self
    }
    /// Attaches the ambiguity audit record.
    pub fn with_ambiguity_record(mut self, record: AmbiguityRecord) -> Self {
        self.ambiguity_record = Some(record);
        self
    }
    /// Attaches collision family/variant metadata (either may be `None`).
    pub fn with_collision_metadata(
        mut self,
        family: Option<String>,
        variant: Option<String>,
    ) -> Self {
        self.collision_family = family;
        self.collision_variant = variant;
        self
    }
}
/// Errors from writing to a [`RedactionLogger`] backend.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
    #[error("sqlite redaction log error: {0}")]
    Sqlite(String),
    #[error("backend redaction log error: {0}")]
    Backend(String),
}
/// Sink for redaction audit entries.
pub trait RedactionLogger: Send + Sync {
    /// Persists one entry; implementations decide durability semantics.
    fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
/// A locale identifier: the locale-independent `Global`, a known builtin
/// tag, or `Other` carrying a canonicalized BCP-47 tag (see `parse`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
    /// Matches every locale.
    Global,
    DeDe,
    DeAt,
    DeCh,
    EnUs,
    EnGb,
    EnIe,
    EnAu,
    EnCa,
    /// Any other BCP-47-parseable tag, stored canonicalized.
    Other(String),
}
/// Error from [`LocaleTag::parse`] / [`LocaleChain::from_cli`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input was empty or not a parseable locale tag.
    Unsupported,
}
impl fmt::Display for LocaleError {
    /// User-facing message; kept short for CLI error output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            LocaleError::Unsupported => f.write_str("unsupported locale"),
        }
    }
}
// Marker impl: LocaleError is a leaf error with no source.
impl std::error::Error for LocaleError {}
/// Ordered list of active locales. Constructors run `ensure_global` on the
/// tags.
/// NOTE(review): `ensure_global` is defined elsewhere in this file —
/// presumably it guarantees `Global` is present; confirm its contract there.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
impl LocaleTag {
    /// Convenience alias for [`LocaleTag::Global`].
    pub const GLOBAL: LocaleTag = LocaleTag::Global;
    /// Parses a locale tag: accepts `"global"`/`"*"`, the builtin tags
    /// case-insensitively with `_` or `-` separators, and otherwise any
    /// BCP-47-parseable tag (stored canonicalized via `canonical_other`).
    ///
    /// # Errors
    /// `Unsupported` for empty or unparseable input.
    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
        // Normalize separators first, then lowercase only for matching so
        // `raw` keeps original casing for the canonicalization helpers.
        let raw = s.trim().replace('_', "-");
        let normalized = raw.to_ascii_lowercase();
        match normalized.as_str() {
            "global" | "*" => Ok(LocaleTag::Global),
            "de-de" => Ok(LocaleTag::DeDe),
            "de-at" => Ok(LocaleTag::DeAt),
            "de-ch" => Ok(LocaleTag::DeCh),
            "en-us" => Ok(LocaleTag::EnUs),
            "en-gb" => Ok(LocaleTag::EnGb),
            "en-ie" => Ok(LocaleTag::EnIe),
            "en-au" => Ok(LocaleTag::EnAu),
            "en-ca" => Ok(LocaleTag::EnCa),
            "" => Err(LocaleError::Unsupported),
            // NOTE(review): is_bcp47_parseable / canonical_other are defined
            // elsewhere in this file; behavior documented there.
            _ if is_bcp47_parseable(&raw) => Ok(LocaleTag::Other(canonical_other(&raw))),
            _ => Err(LocaleError::Unsupported),
        }
    }
    /// The canonical string form (e.g. `"de-DE"`, `"global"`).
    pub fn as_str(&self) -> &str {
        match self {
            LocaleTag::Global => "global",
            LocaleTag::DeDe => "de-DE",
            LocaleTag::DeAt => "de-AT",
            LocaleTag::DeCh => "de-CH",
            LocaleTag::EnUs => "en-US",
            LocaleTag::EnGb => "en-GB",
            LocaleTag::EnIe => "en-IE",
            LocaleTag::EnAu => "en-AU",
            LocaleTag::EnCa => "en-CA",
            LocaleTag::Other(tag) => tag.as_str(),
        }
    }
}
impl LocaleChain {
    /// Builds a chain from tags, applying `ensure_global` (defined elsewhere
    /// in this file).
    pub fn from_tags(mut tags: Vec<LocaleTag>) -> LocaleChain {
        ensure_global(&mut tags);
        LocaleChain(tags)
    }
    /// Parses a comma-separated CLI locale list.
    ///
    /// # Errors
    /// Fails on the first unparseable segment; note a trailing comma yields
    /// an empty segment, which `LocaleTag::parse` rejects.
    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
        let tags = raw
            .split(',')
            .map(LocaleTag::parse)
            .collect::<Result<Vec<_>, _>>()?;
        Ok(LocaleChain::from_tags(tags))
    }
    /// Two-source merge; kept as a thin wrapper over the three-source rule
    /// (CLI wins over policy).
    pub fn merge_policy_and_cli(
        policy: Option<&[LocaleTag]>,
        cli: Option<&[LocaleTag]>,
    ) -> LocaleChain {
        Self::merge_cli_policy_rulepack_default(cli, policy, None)
    }
    /// Picks the first non-empty source in priority order
    /// CLI > policy > rulepack defaults, falling back to `[Global]`.
    pub fn merge_cli_policy_rulepack_default(
        cli: Option<&[LocaleTag]>,
        policy: Option<&[LocaleTag]>,
        rulepack_defaults: Option<&[LocaleTag]>,
    ) -> LocaleChain {
        let tags = cli
            .filter(|tags| !tags.is_empty())
            .or_else(|| policy.filter(|tags| !tags.is_empty()))
            .or_else(|| rulepack_defaults.filter(|tags| !tags.is_empty()))
            .map(|tags| tags.to_vec())
            .unwrap_or_else(|| vec![LocaleTag::Global]);
        LocaleChain::from_tags(tags)
    }
    /// True when a recognizer applies under this chain: an empty recognizer
    /// list means "all locales", `Global` matches everything, otherwise the
    /// recognizer must share at least one tag with the chain.
    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
        if recognizer_locales.is_empty() {
            return true;
        }
        recognizer_locales.iter().any(|recognizer_locale| {
            *recognizer_locale == LocaleTag::Global
                || self.0.iter().any(|active| active == recognizer_locale)
        })
    }
    /// Borrows the ordered tags.
    pub fn as_slice(&self) -> &[LocaleTag] {
        &self.0
    }
    /// Renders each tag via its `Display` impl.
    pub fn to_strings(&self) -> Vec<String> {
        self.0.iter().map(ToString::to_string).collect()
    }
}
impl From<&[LocaleTag]> for LocaleChain {
fn from(tags: &[LocaleTag]) -> Self {
let mut owned = tags.to_vec();
ensure_global(&mut owned);
LocaleChain(owned)
}
}
impl fmt::Display for LocaleTag {
    /// Displays the same canonical tag returned by [`LocaleTag::as_str`].
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
/// Input document before sanitization: keyed values or plain text.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
    Structured(BTreeMap<String, Value>),
    Text(String),
}
/// Sanitized output document; serializes untagged, so it renders as a plain
/// JSON object or string depending on the variant.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
    Structured(BTreeMap<String, Value>),
    Text(String),
}
/// Minimal JSON-like value for structured documents. Serializes untagged.
/// Note: deliberately no float variant; integers only via `I64`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
    Null,
    Bool(bool),
    String(String),
    I64(i64),
    Array(Vec<Value>),
    Object(BTreeMap<String, Value>),
}
impl Value {
    /// Borrows the inner string for `String` values; `None` otherwise.
    pub fn as_str(&self) -> Option<&str> {
        match self {
            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
            Self::String(value) => Some(value.as_str()),
        }
    }
    /// Converts a scalar into the text handed to safety nets: non-empty
    /// strings pass through, bools and integers are stringified; empty
    /// strings, nulls, and containers yield `None`.
    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
        match self {
            Self::Bool(value) => Some(value.to_string()),
            Self::I64(value) => Some(value.to_string()),
            Self::String(value) => {
                if value.is_empty() {
                    None
                } else {
                    Some(value.clone())
                }
            }
            Self::Null | Self::Array(_) | Self::Object(_) => None,
        }
    }
}
impl PartialEq<&str> for Value {
    /// String-variant comparison convenience: only `Value::String` can equal
    /// a `&str`; all other variants compare unequal.
    fn eq(&self, other: &&str) -> bool {
        self.as_str() == Some(*other)
    }
}
/// Collection of validated dictionaries, keyed by dictionary name.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
    // Maps dictionary name -> its validated entry.
    entries: HashMap<String, DictionaryEntry>,
}
/// A single validated term list; construct via `DictionaryEntry::new`, which
/// enforces the non-empty and ASCII-if-case-insensitive invariants.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
    // Raw match terms; guaranteed non-empty by `new`.
    terms: Vec<String>,
    // When false, all terms are guaranteed ASCII (see `new`).
    case_sensitive: bool,
    // Where this dictionary was supplied from (CLI or rulepack).
    source: DictionarySource,
}
/// Provenance of a dictionary: supplied on the command line or bundled in a
/// rulepack.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
    Cli,
    Rulepack,
}
/// Summary row for one dictionary, as reported by `DictionaryBundle::stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
    pub name: String,
    pub term_count: usize,
    pub source: DictionarySource,
}
impl DictionaryStats {
pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
Self {
name: name.into(),
term_count,
source,
}
}
}
/// Raw dictionary definition as declared by a rulepack, prior to the
/// validation performed by `DictionaryEntry::new`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    pub name: String,
    pub terms: Vec<String>,
    pub case_sensitive: bool,
}
impl RulepackDict {
pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
Self {
name: name.into(),
terms,
case_sensitive,
}
}
}
/// Reasons a dictionary is rejected at load time.
// NOTE(review): Display/Error are hand-implemented below even though the
// crate imports `thiserror`; consider unifying — confirm with crate owners.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary declared zero terms.
    Empty { name: String },
    /// Case-insensitive matching was requested together with non-ASCII terms,
    /// which this version does not support.
    UnicodeInsensitiveUnsupported { name: String },
}
impl fmt::Display for DictionaryLoadError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Empty { name } => write!(f, "dictionary '{name}' has no terms"),
Self::UnicodeInsensitiveUnsupported { name } => write!(
f,
"dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
),
}
}
}
// Marker impl so the error can flow through `Box<dyn Error>` / `?` chains.
impl std::error::Error for DictionaryLoadError {}
impl DictionaryBundle {
pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
let mut entries = HashMap::with_capacity(terms.len());
for dictionary in terms {
let entry = DictionaryEntry::new(
&dictionary.name,
dictionary.terms.clone(),
dictionary.case_sensitive,
DictionarySource::Rulepack,
)
.expect("Policy validates dictionary terms before bundle construction");
entries.insert(dictionary.name.clone(), entry);
}
Self { entries }
}
pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
Self {
entries: entries.into_iter().collect(),
}
}
pub fn merge(a: Self, b: Self) -> Self {
let mut entries = a.entries;
entries.extend(b.entries);
Self { entries }
}
pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
self.entries.get(name)
}
pub fn stats(&self) -> Vec<DictionaryStats> {
let mut stats = self
.entries
.iter()
.map(|(name, entry)| DictionaryStats {
name: name.clone(),
term_count: entry.terms.len(),
source: entry.source,
})
.collect::<Vec<_>>();
stats.sort_by(|a, b| a.name.cmp(&b.name));
stats
}
}
impl DictionaryEntry {
    /// Validate and construct a dictionary entry.
    ///
    /// # Errors
    /// - [`DictionaryLoadError::Empty`] when `terms` is empty.
    /// - [`DictionaryLoadError::UnicodeInsensitiveUnsupported`] when
    ///   case-insensitive matching is requested and any term is non-ASCII.
    pub fn new(
        name: &str,
        terms: Vec<String>,
        case_sensitive: bool,
        source: DictionarySource,
    ) -> Result<Self, DictionaryLoadError> {
        if terms.is_empty() {
            return Err(DictionaryLoadError::Empty {
                name: name.to_string(),
            });
        }
        // The ASCII scan only runs for case-insensitive dictionaries.
        if !case_sensitive && !terms.iter().all(|term| term.is_ascii()) {
            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported {
                name: name.to_string(),
            });
        }
        Ok(Self {
            terms,
            case_sensitive,
            source,
        })
    }
    /// Whether term matching respects letter case.
    pub fn case_sensitive(&self) -> bool {
        self.case_sensitive
    }
    /// Borrow the validated term list.
    pub fn terms(&self) -> &[String] {
        self.terms.as_slice()
    }
}
#[cfg(test)]
mod dictionary_tests {
    use super::*;
    // Empty term lists must be rejected rather than silently matching nothing.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let result = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli);
        let err = result.expect_err("empty dictionaries must fail closed");
        assert!(matches!(err, DictionaryLoadError::Empty { name } if name == "empty"));
    }
    // Case-insensitive matching is ASCII-only in this version; unicode terms
    // must fail closed instead of matching inconsistently.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let err = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli)
            .expect_err("unicode case-insensitive dictionaries must fail closed");
        assert!(matches!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported { name } if name == "songs"
        ));
    }
}
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;
    // Minimal logger that accepts every entry; exists to prove the trait is
    // implementable outside the crate's own backends.
    struct CapturingLogger;
    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }
    // Compile-time probe: only builds when T is Send + Sync.
    fn assert_send_sync<T: Send + Sync + ?Sized>() {}
    // Error messages are part of the stable surface; pin their exact text.
    #[test]
    fn redaction_log_error_display_is_stable() {
        assert_eq!(
            RedactionLogError::Sqlite("write failed".to_string()).to_string(),
            "sqlite redaction log error: write failed"
        );
        assert_eq!(
            RedactionLogError::Backend("sink failed".to_string()).to_string(),
            "backend redaction log error: sink failed"
        );
    }
    // `dyn RedactionLogger` must be shareable across threads.
    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }
    // End-to-end: a locally defined logger can receive a fully-populated entry
    // through a trait object.
    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        // All optional provenance fields are deliberately left None here.
        let entry = RedactionEntry {
            source: "unit-test".to_string(),
            class: PiiClass::Email,
            action: Action::Tokenize,
            field_name: None,
            document_kind: DocumentKind::Text,
            conflict_loser: false,
            decided_by: ConflictTier::None,
            created_at: 0,
            session_id: None,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
        };
        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }
}
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;
    // Helper: token span whose clean and raw ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }
    // Helper: compare a suspect range/class against the manifest's coverage.
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }
    // A suspect range fully covered by a token of the same class is no leak.
    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);
        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }
    // A suspect range with no overlapping token at all is fully uncovered.
    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);
        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }
    // Partial coverage with one hole reports that hole as a partial bleed.
    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);
        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }
    // With several holes, the FIRST gap (3..5, not 7..9) must be reported,
    // deterministically.
    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);
        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }
    // When the suspect class disagrees with overlapping tokens, the first
    // mismatching pipeline class (Name, at offset 0) is reported.
    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);
        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }
    // Two back-to-back tokens (no gap at offset 5) count as continuous cover.
    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);
        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }
    // Bleeds before the first token, after the last token, and between tokens
    // are all reported with the exact uncovered range.
    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);
        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );
        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }
    // Manifest offsets are BYTE offsets: the 4-byte emoji shifts the token to
    // byte 9 even though it is character 5.
    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);
        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }
    // A zero-length suspect range can never leak, even with no tokens at all.
    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();
        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }
    // Rendered error messages must never echo scanned input bytes.
    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];
        for rendered in cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
/// A detector for a single PII class; implementations must be shareable
/// across threads (`Send + Sync`).
pub trait Recognizer: Send + Sync {
    /// Stable identifier for this recognizer.
    fn id(&self) -> &str;
    /// The one PII class this recognizer produces candidates for.
    fn supported_class(&self) -> &PiiClass;
    /// Scan `input` and return zero or more candidate spans.
    fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
    /// Token family this recognizer belongs to — presumably used for
    /// collision grouping (cf. `CollisionMembership.family`); TODO confirm.
    fn token_family(&self) -> &str;
    /// Optional post-detection validator; defaults to none.
    fn validator_kind(&self) -> Option<ValidatorKind> {
        None
    }
    /// Locales this recognizer applies to; defaults to global-only.
    fn locales(&self) -> &[LocaleTag] {
        &[LocaleTag::Global]
    }
}
/// A single detection produced by a [`Recognizer`], before conflict
/// resolution.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
    /// Offsets into the scanned input — presumably byte offsets, matching the
    /// manifest convention; TODO confirm at the producer.
    pub span: Range<usize>,
    pub class: PiiClass,
    /// `Recognizer::id()` of the producer.
    pub recognizer_id: String,
    pub score: f32,
    pub priority: i32,
    /// Normalized form of the matched text, when the recognizer computes one.
    pub canonical_form: Option<String>,
    pub token_family: String,
    pub source: String,
    /// Which conflict tier settled this candidate.
    pub decided_by: ConflictTier,
    /// Sources folded into this candidate when detections were merged.
    pub merged_sources: Vec<String>,
}
impl Candidate {
    /// Construct a candidate; the argument list mirrors the struct fields
    /// one-to-one, hence the lint allowance.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        span: Range<usize>,
        class: PiiClass,
        recognizer_id: impl Into<String>,
        score: f32,
        priority: i32,
        canonical_form: Option<String>,
        token_family: impl Into<String>,
        source: impl Into<String>,
        decided_by: ConflictTier,
        merged_sources: Vec<String>,
    ) -> Self {
        let recognizer_id = recognizer_id.into();
        let token_family = token_family.into();
        let source = source.into();
        Self {
            span,
            class,
            recognizer_id,
            score,
            priority,
            canonical_form,
            token_family,
            source,
            decided_by,
            merged_sources,
        }
    }
    /// Builder-style helper: return this candidate with `span` replaced.
    pub fn with_span(self, span: Range<usize>) -> Self {
        Self { span, ..self }
    }
}
/// Shared, read-mostly context handed to every recognizer during a detection
/// pass.
#[non_exhaustive]
pub struct DetectContext<'a> {
    /// Active locale chain for this run.
    pub locale_chain: &'a [LocaleTag],
    /// Dictionaries available for term lookup.
    pub dictionaries: &'a DictionaryBundle,
    /// Currently a unit placeholder — presumably reserved for field metadata
    /// in structured documents; TODO confirm before relying on it.
    pub fields: &'a (),
    /// Flag recognizers can set through a shared `&DetectContext` (hence
    /// `Cell`). NOTE(review): semantics inferred from the name "degraded" —
    /// confirm with callers.
    pub degraded: Cell<bool>,
}
impl<'a> DetectContext<'a> {
    /// Build a context over the given locale chain and dictionaries, with the
    /// degraded flag cleared.
    pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
        Self {
            locale_chain,
            dictionaries,
            fields: &(),
            // Cell<bool>::default() starts at false.
            degraded: Cell::default(),
        }
    }
}
/// Append the global fallback tag unless the chain already carries it.
fn ensure_global(tags: &mut Vec<LocaleTag>) {
    let has_global = tags.iter().any(|tag| *tag == LocaleTag::Global);
    if !has_global {
        tags.push(LocaleTag::Global);
    }
}
/// Cheap structural check that `raw` is shaped like a BCP 47 language tag:
/// a 2-8 letter primary subtag followed by dash-separated 2-8 character
/// alphanumeric subtags. This validates shape only, not the IANA registry.
fn is_bcp47_parseable(raw: &str) -> bool {
    let mut subtags = raw.split('-');
    // `split` always yields at least one piece, so this cannot fail.
    let language = match subtags.next() {
        Some(first) => first,
        None => return false,
    };
    let language_ok =
        (2..=8).contains(&language.len()) && language.chars().all(|ch| ch.is_ascii_alphabetic());
    if !language_ok {
        return false;
    }
    subtags.all(|subtag| {
        (2..=8).contains(&subtag.len()) && subtag.chars().all(|ch| ch.is_ascii_alphanumeric())
    })
}
/// Normalize the casing of a locale tag with no dedicated enum variant:
/// the primary subtag is lowercased, two-letter alphabetic subtags (region
/// codes) are uppercased, and every other subtag is lowercased.
fn canonical_other(raw: &str) -> String {
    let mut pieces = raw.split('-');
    // `split` always yields a first piece, so unwrap_or_default never fires.
    let mut canonical = pieces.next().unwrap_or_default().to_ascii_lowercase();
    for piece in pieces {
        canonical.push('-');
        let looks_like_region =
            piece.len() == 2 && piece.chars().all(|ch| ch.is_ascii_alphabetic());
        if looks_like_region {
            canonical.push_str(&piece.to_ascii_uppercase());
        } else {
            canonical.push_str(&piece.to_ascii_lowercase());
        }
    }
    canonical
}