#![cfg_attr(docsrs, feature(doc_cfg))]
use std::cell::Cell;
use std::collections::{BTreeMap, HashMap};
use std::fmt;
use std::ops::Range;
use serde::{Deserialize, Serialize};
use sha3::{Digest, Keccak256};
use thiserror::Error;
/// Source of [`Detection`]s for a piece of input text.
///
/// Implementors must be `Send + Sync`.
pub trait Detector: Send + Sync {
/// Returns every detection found in `input`.
fn detect(&self, input: &str) -> Vec<Detection>;
/// Fallible counterpart of [`Detector::detect`].
///
/// The default implementation wraps the result of `detect` in `Ok`, so it
/// only reports an error when an implementor overrides it.
fn try_detect(&self, input: &str) -> Result<Vec<Detection>, RecognizerRuntimeError> {
Ok(self.detect(input))
}
}
/// Error describing a failure reported for a specific recognizer.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RecognizerRuntimeError {
    /// Identifier of the recognizer the failure is attributed to.
    pub recognizer_id: String,
    /// Free-form failure description.
    pub message: String,
}

impl RecognizerRuntimeError {
    /// Builds an error from anything convertible into the two owned strings.
    pub fn new(recognizer_id: impl Into<String>, message: impl Into<String>) -> Self {
        let recognizer_id = recognizer_id.into();
        let message = message.into();
        Self {
            recognizer_id,
            message,
        }
    }
}

impl fmt::Display for RecognizerRuntimeError {
    /// Renders the error as `recognizer '<id>' backend failed: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            recognizer_id,
            message,
        } = self;
        write!(
            f,
            "recognizer '{}' backend failed: {}",
            recognizer_id, message
        )
    }
}

impl std::error::Error for RecognizerRuntimeError {}
/// Class label carried by detections, emitted token spans and leak suspects.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum PiiClass {
/// Built-in class; canonical string `email`.
Email,
/// Built-in class; canonical string `name`.
Name,
/// Built-in class; canonical string `location`.
Location,
/// Built-in class; canonical string `organization`.
Organization,
/// Caller-defined class identified by the contained name
/// (canonical string `custom:<name>`).
Custom(String),
}
/// Display names of the built-in [`PiiClass`] variants.
///
/// `PiiClass::class_name` indexes this slice positionally (0 = Email, 1 = Name,
/// 2 = Location, 3 = Organization), so the order must not change.
pub const BUILTIN_CLASS_NAMES: &[&str] = &["Email", "Name", "Location", "Organization"];
/// Family identifiers reserved for bundled recognizers.
///
/// NOTE(review): presumably these match `CollisionMembership::family` values
/// that user-supplied recognizers may not claim — confirm against the code
/// that consumes this list.
pub const RESERVED_BUNDLED_FAMILIES: &[&str] = &[
"us-9-digit-id",
"iberian-id",
"payment-card-or-iban",
"phone-or-imei",
"vin-or-serial",
"mac-or-hex",
"passport-or-doc-support",
"national-13-digit",
"italian-cf-or-serial",
"german-personalausweis",
"swedish-personnummer",
"finnish-hetu",
];
// Single-bit flags, one per restore phase. Each constant occupies a distinct
// bit so several can be combined in one `u32`.
// NOTE(review): presumably OR-ed into `RestoreTelemetry::phase_execution_mask` — confirm.
/// Bit 0: manifest lookup phase.
pub const RESTORE_PHASE_MANIFEST_LOOKUP: u32 = 1 << 0;
/// Bit 1: unknown-token scan phase.
pub const RESTORE_PHASE_UNKNOWN_TOKEN_SCAN: u32 = 1 << 1;
/// Bit 2: manifest-bypass scan phase.
pub const RESTORE_PHASE_MANIFEST_BYPASS_SCAN: u32 = 1 << 2;
/// Bit 3: fresh-PII scan phase.
pub const RESTORE_PHASE_FRESH_PII_SCAN: u32 = 1 << 3;
/// Owned text wrapper used as the result of a restore operation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct RestoredText {
/// The restored text.
pub text: String,
}
impl RestoredText {
pub fn new(text: impl Into<String>) -> Self {
Self { text: text.into() }
}
}
/// Policy selected for a restore run; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum RestorePolicy {
/// Rendered as `"strict"`.
Strict,
/// Rendered as `"lenient"`.
Lenient,
}
impl RestorePolicy {
pub fn as_str(self) -> &'static str {
match self {
Self::Strict => "strict",
Self::Lenient => "lenient",
}
}
}
/// Outcome recorded for a restore run; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum RestoreDecision {
/// Rendered as `"success"`.
Success,
/// Rendered as `"partial"`.
Partial,
/// Rendered as `"failed"`.
Failed,
}
impl RestoreDecision {
pub fn as_str(self) -> &'static str {
match self {
Self::Success => "success",
Self::Partial => "partial",
Self::Failed => "failed",
}
}
}
/// Counters and outcome information recorded for a restore run.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct RestoreTelemetry {
/// Number of unknown tokens counted.
pub unknown_token_count: u64,
/// Number of manifest bypasses counted.
pub manifest_bypass_count: u64,
/// Number of fresh PII detections counted.
pub fresh_pii_detected_count: u64,
/// Policy the run was created with.
pub restore_policy: RestorePolicy,
/// Decision recorded for the run; starts as `Success` in [`RestoreTelemetry::new`].
pub restore_decision: RestoreDecision,
/// Bit mask of executed phases.
/// NOTE(review): presumably built from the `RESTORE_PHASE_*` flags — confirm.
pub phase_execution_mask: u32,
}
impl RestoreTelemetry {
pub fn new(restore_policy: RestorePolicy) -> Self {
Self {
unknown_token_count: 0,
manifest_bypass_count: 0,
fresh_pii_detected_count: 0,
restore_policy,
restore_decision: RestoreDecision::Success,
phase_execution_mask: 0,
}
}
pub fn restore_policy_str(&self) -> &'static str {
self.restore_policy.as_str()
}
pub fn restore_decision_str(&self) -> &'static str {
self.restore_decision.as_str()
}
}
/// Membership of a recognizer variant in a collision family.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct CollisionMembership {
    /// Family identifier.
    pub family: String,
    /// Variant identifier within the family.
    pub variant: String,
    /// Precedence value stored for this member.
    pub precedence: u32,
    /// Optional anchor string stored for this member.
    pub mandatory_anchor: Option<String>,
}

impl CollisionMembership {
    /// Builds a membership record, converting `family` and `variant` into
    /// owned strings.
    pub fn new(
        family: impl Into<String>,
        variant: impl Into<String>,
        precedence: u32,
        mandatory_anchor: Option<String>,
    ) -> Self {
        let family: String = family.into();
        let variant: String = variant.into();
        CollisionMembership {
            family,
            variant,
            precedence,
            mandatory_anchor,
        }
    }
}
impl PiiClass {
    /// Parses a policy class name.
    ///
    /// Accepts the lowercase built-in names and `custom:<name>`. Exactly one
    /// `custom:` prefix is removed and the remainder is normalized through
    /// [`PiiClass::custom`]. Returns `None` for unknown names and for custom
    /// names that normalize to an empty string, because `custom:` with an
    /// empty name is not accepted by [`PiiClass::from_canonical_str`] either.
    pub fn from_policy_name(input: &str) -> Option<Self> {
        match input {
            "email" => Some(Self::Email),
            "name" => Some(Self::Name),
            "location" => Some(Self::Location),
            "organization" => Some(Self::Organization),
            other => {
                // `strip_prefix` removes the prefix once; `trim_start_matches`
                // would also swallow a name that itself starts with "custom:".
                let name = other.strip_prefix("custom:")?;
                match Self::custom(name) {
                    // Blank or punctuation-only names carry no identifier.
                    Self::Custom(normalized) if normalized.is_empty() => None,
                    class => Some(class),
                }
            }
        }
    }

    /// Returns the four built-in variants in declaration order.
    pub fn builtin_variants() -> &'static [PiiClass] {
        &[
            PiiClass::Email,
            PiiClass::Name,
            PiiClass::Location,
            PiiClass::Organization,
        ]
    }

    /// Builds a `Custom` class from a free-form name.
    ///
    /// The name is trimmed, ASCII alphanumerics are lowercased, and every run
    /// of other characters between two alphanumerics collapses into a single
    /// `_`. Leading and trailing separators are dropped. A name without any
    /// ASCII alphanumeric yields `Custom("")`.
    pub fn custom(name: &str) -> Self {
        let mut normalized = String::new();
        let mut pending_underscore = false;
        for ch in name.trim().chars() {
            if ch.is_ascii_alphanumeric() {
                // Emit the separator lazily so trailing runs never produce one.
                if pending_underscore && !normalized.is_empty() {
                    normalized.push('_');
                }
                normalized.push(ch.to_ascii_lowercase());
                pending_underscore = false;
            } else {
                pending_underscore = true;
            }
        }
        Self::Custom(normalized)
    }

    /// Returns the custom name, or `None` for built-in variants.
    pub fn as_custom_name(&self) -> Option<&str> {
        match self {
            Self::Custom(name) => Some(name.as_str()),
            Self::Email | Self::Name | Self::Location | Self::Organization => None,
        }
    }

    /// Returns the display name: an entry of `BUILTIN_CLASS_NAMES` for
    /// built-ins, `Custom:<name>` otherwise.
    pub fn class_name(&self) -> String {
        match self {
            Self::Email => BUILTIN_CLASS_NAMES[0].to_string(),
            Self::Name => BUILTIN_CLASS_NAMES[1].to_string(),
            Self::Location => BUILTIN_CLASS_NAMES[2].to_string(),
            Self::Organization => BUILTIN_CLASS_NAMES[3].to_string(),
            Self::Custom(name) => format!("Custom:{name}"),
        }
    }

    /// Returns the canonical lowercase string (`email`, ..., `custom:<name>`).
    pub fn to_canonical_str(&self) -> String {
        match self {
            Self::Email => "email".to_string(),
            Self::Name => "name".to_string(),
            Self::Location => "location".to_string(),
            Self::Organization => "organization".to_string(),
            Self::Custom(name) => format!("custom:{name}"),
        }
    }

    /// Parses a canonical string.
    ///
    /// Built-ins are accepted in lowercase or capitalized form. `custom:<name>`
    /// keeps `<name>` verbatim (no normalization) and requires it to be
    /// non-empty. Anything else yields `None`.
    pub fn from_canonical_str(value: &str) -> Option<Self> {
        match value {
            "email" | "Email" => Some(Self::Email),
            "name" | "Name" => Some(Self::Name),
            "location" | "Location" => Some(Self::Location),
            "organization" | "Organization" => Some(Self::Organization),
            other => other
                .strip_prefix("custom:")
                .filter(|name| !name.is_empty())
                .map(|name| Self::Custom(name.to_string())),
        }
    }
}
/// Newtype around [`PiiClass`] whose serde representation is the canonical
/// string produced by `PiiClass::to_canonical_str`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct PiiClassAudit(pub PiiClass);
impl PiiClassAudit {
pub fn new(class: PiiClass) -> Self {
Self(class)
}
pub fn into_inner(self) -> PiiClass {
self.0
}
}
impl Serialize for PiiClassAudit {
    /// Serializes the wrapped class as its canonical string.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let canonical = self.0.to_canonical_str();
        serializer.serialize_str(canonical.as_str())
    }
}
impl<'de> Deserialize<'de> for PiiClassAudit {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let value = String::deserialize(deserializer)?;
PiiClass::from_canonical_str(&value)
.map(Self)
.ok_or_else(|| {
serde::de::Error::custom(format!("unknown PiiClass canonical form: {value}"))
})
}
}
/// `#[serde(with = ...)]` adapter that (de)serializes a plain [`PiiClass`]
/// field through [`PiiClassAudit`], i.e. as its canonical string.
mod pii_class_audit_serde {
    use super::{PiiClass, PiiClassAudit};
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    /// Serializes `class` via a temporary `PiiClassAudit` wrapper.
    pub fn serialize<S>(class: &PiiClass, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let audit = PiiClassAudit::new(class.clone());
        audit.serialize(serializer)
    }

    /// Deserializes a `PiiClassAudit` and unwraps it.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<PiiClass, D::Error>
    where
        D: Deserializer<'de>,
    {
        PiiClassAudit::deserialize(deserializer).map(PiiClassAudit::into_inner)
    }
}
/// A candidate classification that lost, as listed in an [`AmbiguityRecord`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LosingCandidate {
/// Class of the candidate; serialized as its canonical string.
#[serde(with = "pii_class_audit_serde")]
pub class: PiiClass,
/// Identifier of the recognizer that proposed the candidate.
pub recognizer_id: String,
}
impl LosingCandidate {
pub fn new(class: PiiClass, recognizer_id: impl Into<String>) -> Self {
Self {
class,
recognizer_id: recognizer_id.into(),
}
}
}
/// Record of an ambiguous classification together with the losing candidates.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct AmbiguityRecord {
/// Class the ambiguity is recorded under; serialized as its canonical string.
#[serde(with = "pii_class_audit_serde")]
pub ambiguity_class: PiiClass,
/// Candidates that were not selected.
pub losing_candidates: Vec<LosingCandidate>,
/// Why the ambiguity was recorded.
pub reason: AmbiguityReason,
}
impl AmbiguityRecord {
pub fn new(
ambiguity_class: PiiClass,
losing_candidates: Vec<LosingCandidate>,
reason: AmbiguityReason,
) -> Self {
Self {
ambiguity_class,
losing_candidates,
reason,
}
}
}
/// Reason attached to an [`AmbiguityRecord`]; serialized in `snake_case`
/// (for example `no_anchor`, `precedence_tie`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum AmbiguityReason {
/// Serialized as `no_anchor`.
NoAnchor,
/// Serialized as `validator_indeterminate`.
ValidatorIndeterminate,
/// Serialized as `multi_family_match`.
MultiFamilyMatch,
/// Serialized as `precedence_tie`.
PrecedenceTie,
}
/// Reason reported in [`ValidatorOutcome::Fail`]; one variant per validator
/// family, as mapped by `ValidatorKind::fail_reason`. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorFailReason {
/// Reported by the `Luhn` validator.
LuhnFailed,
/// Reported by the `IbanMod97` validator.
IbanMod97Failed,
/// Reported by the `EmailRfc` validator; also deserializes from `email_rfc_failed`.
#[serde(alias = "email_rfc_failed")]
EmailRfcRejected,
/// Reported by the `E164Phone` validator; also deserializes from `e164_phone_failed`.
#[serde(alias = "e164_phone_failed")]
PhoneE164Rejected,
/// Reported by the `E164PhoneNational` validator.
PhoneNationalRegionMismatch,
/// Reported by the `Ipv4Parse` validator.
Ipv4ParseFailed,
/// Reported by the `Ipv6Parse` validator.
Ipv6ParseFailed,
/// Reported by the `EthEip55` validator.
EthEip55ChecksumFailed,
/// Reported by the `AadhaarVerhoeff` validator.
AadhaarVerhoeffFailed,
/// Reported by the `FrNirMod97` validator.
FrNirMod97Failed,
/// Reported by the `DeSteuerIdMod1110` validator.
DeSteuerIdMod1110Failed,
/// Reported by the `BsnMod11` validator.
BsnMod11Failed,
/// Reported by the `CpfMod11` validator.
CpfMod11Failed,
/// Reported by the `CnpjMod11` validator.
CnpjMod11Failed,
/// Reported by the `UkNhsMod11` validator.
UkNhsMod11Failed,
}
/// Result of running a validator; serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum ValidatorOutcome {
/// The input was accepted, optionally with a canonical rendering.
Pass { canonical_form: Option<String> },
/// The input was rejected for `reason`.
Fail { reason: ValidatorFailReason },
/// The validator does not apply.
/// (`ValidatorKind::validate` in this file never produces this variant.)
NotApplicable,
}
/// Error returned by `ValidatorKind::parse`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum ValidatorKindParseError {
/// The given name does not match any validator compiled into this build.
#[error("unsupported validator: {kind}")]
UnsupportedValidator {
/// The name that was rejected, verbatim.
kind: String,
},
}
/// Validators selectable by name through `ValidatorKind::parse`.
///
/// The two phone variants exist only when the `phone-parser` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ValidatorKind {
/// Parsed from `email_rfc`.
EmailRfc,
/// Parsed from `e164_phone`.
#[cfg(feature = "phone-parser")]
E164Phone,
/// Parsed from `e164_phone_national_de` / `e164_phone_national_us`.
#[cfg(feature = "phone-parser")]
E164PhoneNational(Region),
/// Parsed from `luhn`.
Luhn,
/// Parsed from `iban_mod97`.
IbanMod97,
/// Parsed from `ipv4_parse`.
Ipv4Parse,
/// Parsed from `ipv6_parse`.
Ipv6Parse,
/// Parsed from `eth_eip55`.
EthEip55,
/// Parsed from `aadhaar_verhoeff`.
AadhaarVerhoeff,
/// Parsed from `fr_nir_mod97`.
FrNirMod97,
/// Parsed from `de_steuer_id_mod1110`.
DeSteuerIdMod1110,
/// Parsed from `bsn_mod11`.
BsnMod11,
/// Parsed from `cpf_mod11`.
CpfMod11,
/// Parsed from `cnpj_mod11`.
CnpjMod11,
/// Parsed from `uk_nhs_mod11`.
UkNhsMod11,
}
/// Region used by the national phone validator (`phone-parser` feature only).
#[cfg(feature = "phone-parser")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Region {
/// Validated against country `DE`, calling code 49.
De,
/// Validated against country `US`, calling code 1.
Us,
}
impl ValidatorKind {
pub fn parse(s: &str) -> Result<Self, ValidatorKindParseError> {
match s {
"email_rfc" => Ok(Self::EmailRfc),
#[cfg(feature = "phone-parser")]
"e164_phone" => Ok(Self::E164Phone),
#[cfg(feature = "phone-parser")]
"e164_phone_national_de" => Ok(Self::E164PhoneNational(Region::De)),
#[cfg(feature = "phone-parser")]
"e164_phone_national_us" => Ok(Self::E164PhoneNational(Region::Us)),
"luhn" => Ok(Self::Luhn),
"iban_mod97" => Ok(Self::IbanMod97),
"ipv4_parse" => Ok(Self::Ipv4Parse),
"ipv6_parse" => Ok(Self::Ipv6Parse),
"eth_eip55" => Ok(Self::EthEip55),
"aadhaar_verhoeff" => Ok(Self::AadhaarVerhoeff),
"fr_nir_mod97" => Ok(Self::FrNirMod97),
"de_steuer_id_mod1110" => Ok(Self::DeSteuerIdMod1110),
"bsn_mod11" => Ok(Self::BsnMod11),
"cpf_mod11" => Ok(Self::CpfMod11),
"cnpj_mod11" => Ok(Self::CnpjMod11),
"uk_nhs_mod11" => Ok(Self::UkNhsMod11),
other => Err(ValidatorKindParseError::UnsupportedValidator {
kind: other.to_string(),
}),
}
}
pub fn validates(self, input: &str) -> bool {
match self {
Self::AadhaarVerhoeff => aadhaar_verhoeff_check(input),
Self::FrNirMod97 => fr_nir_mod97_check(input),
Self::DeSteuerIdMod1110 => de_steuer_id_mod1110_check(input),
Self::BsnMod11 => bsn_mod11_check(input),
Self::CpfMod11 => cpf_mod11_check(input),
Self::CnpjMod11 => cnpj_mod11_check(input),
Self::UkNhsMod11 => uk_nhs_mod11_check(input),
_ => self.canonical_form(input).is_some(),
}
}
pub fn validate(self, input: &str) -> ValidatorOutcome {
match self.canonical_form(input) {
Some(canonical_form) => ValidatorOutcome::Pass {
canonical_form: Some(canonical_form),
},
None => ValidatorOutcome::Fail {
reason: self.fail_reason(),
},
}
}
pub fn canonical_form(self, input: &str) -> Option<String> {
match self {
Self::EmailRfc => is_basic_email(input).then(|| input.to_string()),
#[cfg(feature = "phone-parser")]
Self::E164Phone => e164_phone_check(input).then(|| input.to_string()),
#[cfg(feature = "phone-parser")]
Self::E164PhoneNational(region) => validate_phone_national(region, input),
Self::Luhn => luhn_check(input).then(|| input.to_string()),
Self::IbanMod97 => iban_mod97_check(input).then(|| input.to_string()),
Self::Ipv4Parse => ipv4_parse_check(input).then(|| input.to_string()),
Self::Ipv6Parse => ipv6_parse_check(input).then(|| input.to_string()),
Self::EthEip55 => eth_eip55_check(input).then(|| input.to_string()),
Self::AadhaarVerhoeff => {
canonical_ascii_digits::<12>(input).filter(|_| aadhaar_verhoeff_check(input))
}
Self::FrNirMod97 => {
canonical_ascii_digits::<15>(input).filter(|_| fr_nir_mod97_check(input))
}
Self::DeSteuerIdMod1110 => {
canonical_ascii_digits::<11>(input).filter(|_| de_steuer_id_mod1110_check(input))
}
Self::BsnMod11 => canonical_ascii_digits::<9>(input).filter(|_| bsn_mod11_check(input)),
Self::CpfMod11 => {
canonical_ascii_digits::<11>(input).filter(|_| cpf_mod11_check(input))
}
Self::CnpjMod11 => {
canonical_ascii_digits::<14>(input).filter(|_| cnpj_mod11_check(input))
}
Self::UkNhsMod11 => {
canonical_ascii_digits::<10>(input).filter(|_| uk_nhs_mod11_check(input))
}
}
}
pub fn fail_reason(self) -> ValidatorFailReason {
match self {
Self::EmailRfc => ValidatorFailReason::EmailRfcRejected,
#[cfg(feature = "phone-parser")]
Self::E164Phone => ValidatorFailReason::PhoneE164Rejected,
#[cfg(feature = "phone-parser")]
Self::E164PhoneNational(_) => ValidatorFailReason::PhoneNationalRegionMismatch,
Self::Luhn => ValidatorFailReason::LuhnFailed,
Self::IbanMod97 => ValidatorFailReason::IbanMod97Failed,
Self::Ipv4Parse => ValidatorFailReason::Ipv4ParseFailed,
Self::Ipv6Parse => ValidatorFailReason::Ipv6ParseFailed,
Self::EthEip55 => ValidatorFailReason::EthEip55ChecksumFailed,
Self::AadhaarVerhoeff => ValidatorFailReason::AadhaarVerhoeffFailed,
Self::FrNirMod97 => ValidatorFailReason::FrNirMod97Failed,
Self::DeSteuerIdMod1110 => ValidatorFailReason::DeSteuerIdMod1110Failed,
Self::BsnMod11 => ValidatorFailReason::BsnMod11Failed,
Self::CpfMod11 => ValidatorFailReason::CpfMod11Failed,
Self::CnpjMod11 => ValidatorFailReason::CnpjMod11Failed,
Self::UkNhsMod11 => ValidatorFailReason::UkNhsMod11Failed,
}
}
}
/// Minimal structural email check.
///
/// Splits at the first `@`; requires a non-empty local part and a domain that
/// contains a `.` but neither starts nor ends with one. Nothing else is
/// validated.
fn is_basic_email(input: &str) -> bool {
    match input.split_once('@') {
        None => false,
        Some((local, domain)) => {
            if local.is_empty() {
                return false;
            }
            let has_dot = domain.contains('.');
            let dot_on_edge = domain.starts_with('.') || domain.ends_with('.');
            has_dot && !dot_on_edge
        }
    }
}
/// Returns whether `input` parses without a default country and the parsed
/// number is reported valid by the `phonenumber` crate.
#[cfg(feature = "phone-parser")]
fn e164_phone_check(input: &str) -> bool {
    match phonenumber::parse(None, input) {
        Ok(phone) => phonenumber::is_valid(&phone),
        Err(_) => false,
    }
}
/// Validates `input` as a phone number of `region` and returns its E.164
/// rendering.
///
/// The number is parsed with the region's country as default. It is rejected
/// when its calling code differs from the region's (49 for `De`, 1 for `Us`),
/// or when it is neither valid nor one of the allow-listed fixture numbers.
#[cfg(feature = "phone-parser")]
fn validate_phone_national(region: Region, input: &str) -> Option<String> {
    let (country, expected_code) = match region {
        Region::De => (phonenumber::country::DE, 49),
        Region::Us => (phonenumber::country::US, 1),
    };
    let number = phonenumber::parse(Some(country), input).ok()?;
    if number.country().code() != expected_code {
        return None;
    }
    let accepted = number.is_valid() || is_safe_fixture_phone(region, input);
    accepted.then(|| number.format().mode(phonenumber::Mode::E164).to_string())
}
/// Returns whether the ASCII digits of `input` match an allow-listed fixture
/// number for `region`.
///
/// * `Us`: exactly `15550100`, or a leading `1` followed by ten digits whose
///   4th to 8th digits are `55501`.
/// * `De`: one of six fixed digit strings.
#[cfg(feature = "phone-parser")]
fn is_safe_fixture_phone(region: Region, input: &str) -> bool {
    let digits: String = input.chars().filter(|ch| ch.is_ascii_digit()).collect();
    match region {
        Region::Us => {
            if digits == "15550100" {
                return true;
            }
            match digits.strip_prefix('1') {
                // `rest` is ASCII-only, so byte slicing is safe here.
                Some(rest) => rest.len() == 10 && rest[3..].starts_with("55501"),
                None => false,
            }
        }
        Region::De => {
            const FIXTURES: [&str; 6] = [
                "493000000000",
                "4915100000000",
                "4915550112233",
                "015550112233",
                "491710000000",
                "01710000000",
            ];
            FIXTURES.contains(&digits.as_str())
        }
    }
}
/// Luhn checksum over the digits of `input`.
///
/// ASCII whitespace and `-` are skipped; any other non-digit byte rejects the
/// input. The digit count must be between 13 and 19 inclusive.
fn luhn_check(input: &str) -> bool {
    let mut digits: Vec<u8> = Vec::new();
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => digits.push(byte - b'0'),
            b'-' => {}
            separator if separator.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    if digits.len() < 13 || digits.len() > 19 {
        return false;
    }

    // Walk from the rightmost digit, doubling every second one.
    let mut total = 0u32;
    let mut double_next = false;
    for &digit in digits.iter().rev() {
        let mut value = u32::from(digit);
        if double_next {
            value *= 2;
            if value > 9 {
                value -= 9;
            }
        }
        total += value;
        double_next = !double_next;
    }
    total.is_multiple_of(10)
}
/// IBAN mod-97 check.
///
/// The input is canonicalized with `iban_canonicalize`, must be 15 to 34 ASCII
/// alphanumeric characters long, and is accepted when the rearranged number
/// (first four characters moved to the end, letters mapped to 10..=35) leaves
/// remainder 1 modulo 97.
fn iban_mod97_check(input: &str) -> bool {
    let iban = iban_canonicalize(input);
    let well_formed =
        (15..=34).contains(&iban.len()) && iban.chars().all(|ch| ch.is_ascii_alphanumeric());
    if !well_formed {
        return false;
    }

    // All characters are ASCII at this point, so byte offset 4 is a char boundary.
    let (head, tail) = iban.split_at(4);
    let mut remainder = 0u32;
    for ch in tail.chars().chain(head.chars()) {
        if let Some(digit) = ch.to_digit(10) {
            remainder = (remainder * 10 + digit) % 97;
        } else if ch.is_ascii_uppercase() {
            // Letters expand to two decimal digits: A = 10, ..., Z = 35.
            let value = u32::from(ch) - u32::from('A') + 10;
            remainder = (remainder * 10 + value / 10) % 97;
            remainder = (remainder * 10 + value % 10) % 97;
        } else {
            return false;
        }
    }
    remainder == 1
}
/// Removes ASCII whitespace and uppercases ASCII letters.
///
/// Only ASCII case mapping is applied. Unicode uppercasing would turn
/// non-ASCII characters into ASCII letters (`ß` into `SS`, `ſ` into `S`,
/// `ı` into `I`), letting text that is not an IBAN pass the ASCII check
/// performed afterwards. Non-ASCII characters are therefore kept unchanged.
fn iban_canonicalize(input: &str) -> String {
    input
        .chars()
        .filter(|ch| !ch.is_ascii_whitespace())
        .map(|ch| ch.to_ascii_uppercase())
        .collect()
}
/// Returns whether `input` parses as a `std::net::Ipv4Addr`.
fn ipv4_parse_check(input: &str) -> bool {
    matches!(input.parse::<std::net::Ipv4Addr>(), Ok(_))
}
/// Returns whether `input` parses as a `std::net::Ipv6Addr`.
fn ipv6_parse_check(input: &str) -> bool {
    matches!(input.parse::<std::net::Ipv6Addr>(), Ok(_))
}
/// Checks an Ethereum address of the form `0x` + 40 hex digits.
///
/// Addresses whose letters are all lowercase or all uppercase (or that have no
/// letters) are accepted without a checksum. Mixed-case addresses must match
/// the case pattern derived from the Keccak-256 hash of the lowercase hex
/// string: a letter is uppercase exactly when its hash nibble is above 7.
fn eth_eip55_check(input: &str) -> bool {
    let Some(hex) = input.strip_prefix("0x") else {
        return false;
    };
    let well_formed = hex.len() == 40 && hex.bytes().all(|byte| byte.is_ascii_hexdigit());
    if !well_formed {
        return false;
    }

    // Only mixed-case addresses carry a checksum to verify.
    let has_upper = hex.bytes().any(|byte| byte.is_ascii_uppercase());
    let has_lower = hex.bytes().any(|byte| byte.is_ascii_lowercase());
    if !(has_upper && has_lower) {
        return true;
    }

    let hash = Keccak256::digest(hex.to_ascii_lowercase().as_bytes());
    hex.bytes()
        .enumerate()
        .filter(|(_, byte)| !byte.is_ascii_digit())
        .all(|(position, byte)| {
            // Two hex characters share one hash byte: high nibble first.
            let hash_byte = hash[position / 2];
            let nibble = if position % 2 == 0 {
                hash_byte >> 4
            } else {
                hash_byte & 0x0f
            };
            (nibble > 7) == byte.is_ascii_uppercase()
        })
}
/// Extracts exactly `N` ASCII digits from `input` as numeric values.
///
/// Space, tab, newline, carriage return, `-`, `.` and `/` are skipped. Any
/// other non-digit byte, or a digit count different from `N`, yields `None`.
fn collect_ascii_digits<const N: usize>(input: &str) -> Option<[u8; N]> {
    let mut values = [0u8; N];
    let mut filled = 0usize;
    for byte in input.bytes() {
        match byte {
            b'0'..=b'9' => {
                // `get_mut` fails exactly when an (N + 1)-th digit shows up.
                *values.get_mut(filled)? = byte - b'0';
                filled += 1;
            }
            b' ' | b'\t' | b'\n' | b'\r' | b'-' | b'.' | b'/' => {}
            _ => return None,
        }
    }
    if filled == N {
        Some(values)
    } else {
        None
    }
}
/// Returns the `N` digits of `input` as a separator-free string, or `None`
/// when `collect_ascii_digits::<N>` rejects the input.
fn canonical_ascii_digits<const N: usize>(input: &str) -> Option<String> {
    collect_ascii_digits::<N>(input).map(|digits| {
        digits
            .iter()
            .map(|&digit| char::from(b'0' + digit))
            .collect::<String>()
    })
}
/// Returns `true` when `digits` contains at least two different values.
///
/// Arrays with zero or one element have no differing pair and yield `false`.
/// Comparing adjacent pairs avoids indexing, so a zero-length array does not
/// panic.
fn not_all_same<const N: usize>(digits: &[u8; N]) -> bool {
    digits.windows(2).any(|pair| pair[0] != pair[1])
}
/// Verhoeff checksum over a 12-digit number.
///
/// Rejects inputs that are not exactly 12 digits (after separator removal),
/// whose first digit is 0 or 1, or whose digits are all identical.
fn aadhaar_verhoeff_check(input: &str) -> bool {
    // Multiplication table used by the Verhoeff scheme.
    const D: [[u8; 10]; 10] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
        [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
        [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
        [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
        [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
        [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
        [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
        [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
        [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
    ];
    // Position-dependent permutation table (period 8).
    const P: [[u8; 10]; 8] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
        [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
        [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
        [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
        [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
        [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
        [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
    ];

    let Some(digits) = collect_ascii_digits::<12>(input) else {
        return false;
    };
    let plausible = digits[0] >= 2 && not_all_same(&digits);
    if !plausible {
        return false;
    }

    // Fold from the rightmost digit; a valid number ends at 0.
    let checksum = digits
        .iter()
        .rev()
        .enumerate()
        .fold(0u8, |acc, (position, &digit)| {
            let permuted = P[position % 8][usize::from(digit)];
            D[usize::from(acc)][usize::from(permuted)]
        });
    checksum == 0
}
/// Mod-97 key check over a 15-digit number.
///
/// Requires the first digit to be one of 1, 2, 3, 4, 7, 8 and the two-digit
/// value at positions 3 and 4 to lie in 1..=12, 20..=42 or 50..=99. The last
/// two digits must equal `97 - (first 13 digits mod 97)`.
fn fr_nir_mod97_check(input: &str) -> bool {
    let Some(digits) = collect_ascii_digits::<15>(input) else {
        return false;
    };
    let lead_ok = matches!(digits[0], 1 | 2 | 3 | 4 | 7 | 8);
    let month = digits[3] * 10 + digits[4];
    let month_ok = matches!(month, 1..=12 | 20..=42 | 50..=99);
    if !(lead_ok && month_ok) {
        return false;
    }

    let body_mod97 = digits[..13]
        .iter()
        .fold(0u32, |rem, &digit| (rem * 10 + u32::from(digit)) % 97);
    let key = u32::from(digits[13]) * 10 + u32::from(digits[14]);
    key == 97 - body_mod97
}
/// Iterative mod-11,10 check-digit test over an 11-digit number.
///
/// The first ten digits must also satisfy `steuer_id_first_ten_digits_valid`.
fn de_steuer_id_mod1110_check(input: &str) -> bool {
    let Some(digits) = collect_ascii_digits::<11>(input) else {
        return false;
    };
    if !steuer_id_first_ten_digits_valid(&digits) {
        return false;
    }

    let (body, check_digit) = (&digits[..10], digits[10]);
    let product = body.iter().fold(10u8, |product, &digit| {
        // A zero sum is replaced by 10 before doubling.
        let sum = match (digit + product) % 10 {
            0 => 10,
            nonzero => nonzero,
        };
        (2 * sum) % 11
    });
    (11 - product) % 10 == check_digit
}
/// Structural rule for the first ten digits of an 11-digit number.
///
/// The first digit must not be 0. Among the first ten digits exactly one digit
/// value may occur more than once, it must occur two or three times, and one
/// or two digit values must be absent.
fn steuer_id_first_ten_digits_valid(digits: &[u8; 11]) -> bool {
    if digits[0] == 0 {
        return false;
    }

    let mut tally = [0u8; 10];
    for &digit in &digits[..10] {
        tally[usize::from(digit)] += 1;
    }

    let mut repeated = 0usize;
    let mut absent = 0usize;
    let mut has_pair_or_triple = false;
    for &count in &tally {
        match count {
            0 => absent += 1,
            1 => {}
            2 | 3 => {
                repeated += 1;
                has_pair_or_triple = true;
            }
            _ => repeated += 1,
        }
    }
    repeated == 1 && has_pair_or_triple && matches!(absent, 1 | 2)
}
/// Weighted mod-11 test over a 9-digit number.
///
/// The first eight digits are weighted 9 down to 2 and the ninth is
/// subtracted. The result must be divisible by 11. All-identical digits are
/// rejected.
fn bsn_mod11_check(input: &str) -> bool {
    let Some(digits) = collect_ascii_digits::<9>(input) else {
        return false;
    };
    if !not_all_same(&digits) {
        return false;
    }

    let weighted: i32 = digits[..8]
        .iter()
        .zip((2..=9).rev())
        .map(|(&digit, weight)| i32::from(digit) * weight)
        .sum();
    (weighted - i32::from(digits[8])).rem_euclid(11) == 0
}
/// Two-check-digit mod-11 test over an 11-digit number.
///
/// Digit 10 is computed from the first nine digits (weights 10 down to 2) and
/// digit 11 from the first ten (weights 11 down to 2). All-identical digits
/// are rejected.
fn cpf_mod11_check(input: &str) -> bool {
    let Some(digits) = collect_ascii_digits::<11>(input) else {
        return false;
    };
    if !not_all_same(&digits) {
        return false;
    }

    let first_ok = mod11_check_digit(&digits[..9], 10) == digits[9];
    first_ok && mod11_check_digit(&digits[..10], 11) == digits[10]
}
/// Two-check-digit weighted mod-11 test over a 14-digit number.
///
/// Digit 13 is computed from the first twelve digits and digit 14 from the
/// first thirteen, each with its fixed weight table. All-identical digits are
/// rejected.
fn cnpj_mod11_check(input: &str) -> bool {
    const FIRST: [u8; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
    const SECOND: [u8; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];

    let Some(digits) = collect_ascii_digits::<14>(input) else {
        return false;
    };
    if !not_all_same(&digits) {
        return false;
    }

    let first_ok = weighted_mod11_check_digit(&digits[..12], &FIRST) == digits[12];
    first_ok && weighted_mod11_check_digit(&digits[..13], &SECOND) == digits[13]
}
/// Weighted mod-11 test over a 10-digit number.
///
/// The first nine digits are weighted 10 down to 2. The check value is
/// `11 - (sum mod 11)`, with 11 mapped to 0 and 10 treated as invalid, and must
/// equal the tenth digit. All-identical digits are rejected.
fn uk_nhs_mod11_check(input: &str) -> bool {
    let Some(digits) = collect_ascii_digits::<10>(input) else {
        return false;
    };
    if !not_all_same(&digits) {
        return false;
    }

    let weighted: u32 = digits[..9]
        .iter()
        .zip((2..=10u32).rev())
        .map(|(&digit, weight)| u32::from(digit) * weight)
        .sum();
    let check = match 11 - weighted % 11 {
        11 => 0,
        other => other,
    };
    check != 10 && check == u32::from(digits[9])
}
/// Computes a mod-11 check digit with descending weights.
///
/// Digits are paired with weights `start_weight, start_weight - 1, ..., 2`;
/// pairing stops when either runs out. A remainder below 2 yields 0, otherwise
/// `11 - remainder`.
fn mod11_check_digit(digits: &[u8], start_weight: u8) -> u8 {
    let mut total = 0u32;
    for (&digit, weight) in digits.iter().zip((2..=start_weight).rev()) {
        total += u32::from(digit) * u32::from(weight);
    }
    match total % 11 {
        0 | 1 => 0,
        remainder => (11 - remainder) as u8,
    }
}
/// Computes a mod-11 check digit with an explicit weight table.
///
/// Digits and weights are paired positionally; pairing stops at the shorter
/// slice. A remainder below 2 yields 0, otherwise `11 - remainder`.
fn weighted_mod11_check_digit(digits: &[u8], weights: &[u8]) -> u8 {
    let mut total = 0u32;
    for (&digit, &weight) in digits.iter().zip(weights) {
        total += u32::from(digit) * u32::from(weight);
    }
    match total % 11 {
        0 | 1 => 0,
        remainder => (11 - remainder) as u8,
    }
}
/// A single detection returned by a [`Detector`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Detection {
/// Range within the scanned input that was detected.
/// NOTE(review): presumably byte offsets — confirm with the detectors.
pub span: Range<usize>,
/// Class assigned to the detected range.
pub class: PiiClass,
/// Free-form label identifying where the detection came from.
pub source: String,
}
impl Detection {
pub fn new(span: Range<usize>, class: PiiClass, source: impl Into<String>) -> Self {
Self {
span,
class,
source: source.into(),
}
}
}
/// Secondary checker that inspects already-cleaned text and reports
/// [`LeakSuspect`]s. Implementors must be `Send + Sync`.
pub trait SafetyNet: Send + Sync {
/// Identifier of this safety net.
fn id(&self) -> &str;
/// Locales this safety net declares support for.
fn supported_locales(&self) -> &[LocaleTag];
/// Inspects `clean_text` using the supplied `context`.
///
/// Returns the suspects found, or a `SafetyNetError` when the check itself fails.
fn check(
&self,
clean_text: &str,
context: SafetyNetContext<'_>,
) -> Result<Vec<LeakSuspect>, SafetyNetError>;
}
/// Borrowed inputs handed to [`SafetyNet::check`].
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct SafetyNetContext<'a> {
/// Manifest of token spans emitted for the text being checked.
pub manifest: &'a Manifest,
/// Locale tags supplied for this check.
pub locale_chain: &'a [LocaleTag],
/// Kind of document being checked.
pub document_kind: DocumentKind,
/// Optional session identifier.
pub session_id: Option<&'a str>,
/// Optional path of the field the text came from.
pub field_path: Option<&'a str>,
}
impl<'a> SafetyNetContext<'a> {
pub fn new(
manifest: &'a Manifest,
locale_chain: &'a [LocaleTag],
document_kind: DocumentKind,
session_id: Option<&'a str>,
field_path: Option<&'a str>,
) -> Self {
Self {
manifest,
locale_chain,
document_kind,
session_id,
field_path,
}
}
}
/// One emitted token, located in both the clean and the raw text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EmittedTokenSpan {
/// Range of the token in the clean text; `Manifest` sorts and searches by this.
pub clean_span: Range<usize>,
/// Corresponding range in the raw text.
pub raw_span: Range<usize>,
/// Class of the token.
pub class: PiiClass,
}
impl EmittedTokenSpan {
pub fn new(clean_span: Range<usize>, raw_span: Range<usize>, class: PiiClass) -> Self {
Self {
clean_span,
raw_span,
class,
}
}
}
/// Collection of emitted token spans for one clean text.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Manifest {
/// Token spans. `Manifest::from_spans` stores them sorted by
/// `(clean_span.start, clean_span.end)`; `diff_against` binary-searches this
/// list, so it relies on that order.
pub spans: Vec<EmittedTokenSpan>,
}
impl Manifest {
pub fn from_spans(mut spans: Vec<EmittedTokenSpan>) -> Self {
spans.sort_by_key(|span| (span.clean_span.start, span.clean_span.end));
Self { spans }
}
pub fn diff_against(
&self,
suspect_span: &Range<usize>,
suspect_class: &PiiClass,
) -> Option<LeakKind> {
if suspect_span.is_empty() {
return None;
}
let start_idx = self
.spans
.partition_point(|span| span.clean_span.end <= suspect_span.start);
let overlapping = self.spans[start_idx..]
.iter()
.take_while(|span| span.clean_span.start < suspect_span.end)
.filter(|span| ranges_overlap(&span.clean_span, suspect_span))
.collect::<Vec<_>>();
if overlapping.is_empty() {
return Some(LeakKind::Uncovered);
}
let mut cursor = suspect_span.start;
let mut first_mismatch = None::<&EmittedTokenSpan>;
for span in overlapping {
if span.clean_span.start > cursor {
return Some(LeakKind::PartialBleed {
uncovered: cursor..span.clean_span.start.min(suspect_span.end),
});
}
if span.clean_span.end > cursor {
if first_mismatch.is_none() && &span.class != suspect_class {
first_mismatch = Some(span);
}
cursor = cursor.max(span.clean_span.end.min(suspect_span.end));
if cursor >= suspect_span.end {
break;
}
}
}
if cursor < suspect_span.end {
return Some(LeakKind::PartialBleed {
uncovered: cursor..suspect_span.end,
});
}
first_mismatch.map(|span| LeakKind::ClassMismatch {
pipeline_class: span.class.clone(),
safety_net_class: suspect_class.clone(),
})
}
}
/// Returns whether two half-open ranges share at least one index.
fn ranges_overlap(left: &Range<usize>, right: &Range<usize>) -> bool {
    let left_starts_before_right_ends = left.start < right.end;
    let right_starts_before_left_ends = right.start < left.end;
    left_starts_before_right_ends && right_starts_before_left_ends
}
/// A possible leak reported by a [`SafetyNet`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct LeakSuspect {
/// Range of the suspect text.
pub span: Range<usize>,
/// Class the safety net assigned to the suspect.
pub class: PiiClass,
/// Identifier of the reporting safety net.
pub safety_net_id: String,
/// Optional score attached by the safety net.
pub score: Option<f32>,
/// How the suspect relates to the manifest.
pub kind: LeakKind,
/// Label as produced by the safety net.
pub raw_label: String,
/// Optional path of the field the suspect was found in.
pub field_path: Option<String>,
}
impl LeakSuspect {
pub fn new(
span: Range<usize>,
class: PiiClass,
safety_net_id: impl Into<String>,
score: Option<f32>,
kind: LeakKind,
raw_label: impl Into<String>,
field_path: Option<String>,
) -> Self {
Self {
span,
class,
safety_net_id: safety_net_id.into(),
score,
kind,
raw_label: raw_label.into(),
field_path,
}
}
}
/// Relationship between a suspect range and the manifest, as computed by
/// `Manifest::diff_against`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakKind {
/// No manifest span overlaps the suspect range.
Uncovered,
/// Manifest spans cover only part of the suspect range.
PartialBleed {
/// First gap of the suspect range that no span covers.
uncovered: Range<usize>,
},
/// The suspect range is fully covered, but by a span of a different class.
ClassMismatch {
/// Class recorded in the manifest.
pipeline_class: PiiClass,
/// Class reported for the suspect.
safety_net_class: PiiClass,
},
}
/// Non-suspect events recorded in a [`LeakReport`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LeakReportTelemetry {
/// A safety net was skipped for locale reasons; counted in
/// `LeakReportStats::locale_skipped_count`.
LocaleSkipped {
/// Identifier of the skipped safety net.
safety_net_id: String,
/// Kind of document that was being checked.
document_kind: DocumentKind,
/// Optional path of the field that was being checked.
field_path: Option<String>,
},
}
/// Aggregate counts derived from a report's suspects and telemetry by
/// `LeakReport::from_parts`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct LeakReportStats {
/// Total number of suspects.
pub suspect_count: usize,
/// Suspects of kind `LeakKind::Uncovered`.
pub uncovered_count: usize,
/// Suspects of kind `LeakKind::PartialBleed`.
pub partial_bleed_count: usize,
/// Suspects of kind `LeakKind::ClassMismatch`.
pub class_mismatch_count: usize,
/// Telemetry events of kind `LeakReportTelemetry::LocaleSkipped`.
pub locale_skipped_count: usize,
}
/// Document-level metadata record; built through [`DocumentExtension::builder`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DocumentExtension {
/// Schema version supplied to the builder.
pub schema_version: u16,
/// 32-byte digest stored for the clean markdown.
pub clean_md_sha256: [u8; 32],
/// 32-byte digest stored for the layout JSON.
pub layout_json_sha256: [u8; 32],
/// 32-byte digest stored for the report JSON.
pub report_json_sha256: [u8; 32],
/// Optional 32-byte digest for the preview PNG; omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub preview_png_sha256: Option<[u8; 32]>,
/// Number of pages.
pub page_count: u32,
/// Audit session identifier.
pub audit_session_id: String,
/// Emitted token spans; omitted from output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub clean_spans: Vec<EmittedTokenSpan>,
/// Codec audit rows; omitted from output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub codec_audit: Vec<CodecAuditRow>,
}
impl DocumentExtension {
    /// Starts a builder for the given schema version with every other field
    /// unset (options `None`, lists empty).
    pub fn builder(schema_version: u16) -> DocumentExtensionBuilder {
        DocumentExtensionBuilder {
            schema_version,
            // Required fields; `build` reports the first one left unset.
            clean_md_sha256: None,
            layout_json_sha256: None,
            report_json_sha256: None,
            page_count: None,
            audit_session_id: None,
            // Optional fields.
            preview_png_sha256: None,
            clean_spans: Vec::new(),
            codec_audit: Vec::new(),
        }
    }
}
/// Builder for [`DocumentExtension`].
///
/// Fields held as `Option` are unset until the matching setter is called;
/// `build` fails if a required one is still unset.
#[derive(Debug, Clone)]
#[must_use]
pub struct DocumentExtensionBuilder {
// Fixed at construction time.
schema_version: u16,
// Required by `build`.
clean_md_sha256: Option<[u8; 32]>,
// Required by `build`.
layout_json_sha256: Option<[u8; 32]>,
// Required by `build`.
report_json_sha256: Option<[u8; 32]>,
// Optional; passed through unchanged.
preview_png_sha256: Option<[u8; 32]>,
// Required by `build`.
page_count: Option<u32>,
// Required by `build`.
audit_session_id: Option<String>,
// Optional; defaults to empty.
clean_spans: Vec<EmittedTokenSpan>,
// Optional; defaults to empty.
codec_audit: Vec<CodecAuditRow>,
}
impl DocumentExtensionBuilder {
    /// Sets the clean-markdown digest (required).
    pub fn clean_md_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            clean_md_sha256: Some(hash),
            ..self
        }
    }

    /// Sets the layout-JSON digest (required).
    pub fn layout_json_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            layout_json_sha256: Some(hash),
            ..self
        }
    }

    /// Sets the report-JSON digest (required).
    pub fn report_json_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            report_json_sha256: Some(hash),
            ..self
        }
    }

    /// Sets the preview-PNG digest (optional).
    pub fn preview_png_sha256(self, hash: [u8; 32]) -> Self {
        Self {
            preview_png_sha256: Some(hash),
            ..self
        }
    }

    /// Sets the page count (required).
    pub fn page_count(self, page_count: u32) -> Self {
        Self {
            page_count: Some(page_count),
            ..self
        }
    }

    /// Sets the audit session id (required).
    pub fn audit_session_id(self, audit_session_id: impl Into<String>) -> Self {
        Self {
            audit_session_id: Some(audit_session_id.into()),
            ..self
        }
    }

    /// Replaces the list of clean spans.
    pub fn clean_spans(self, clean_spans: Vec<EmittedTokenSpan>) -> Self {
        Self {
            clean_spans,
            ..self
        }
    }

    /// Replaces the list of codec audit rows.
    pub fn codec_audit(self, codec_audit: Vec<CodecAuditRow>) -> Self {
        Self {
            codec_audit,
            ..self
        }
    }

    /// Finishes the builder.
    ///
    /// # Errors
    /// Returns [`DocumentExtensionError::MissingField`] naming the first unset
    /// required field, checked in this order: `clean_md_sha256`,
    /// `layout_json_sha256`, `report_json_sha256`, `page_count`,
    /// `audit_session_id`.
    pub fn build(self) -> Result<DocumentExtension, DocumentExtensionError> {
        let Self {
            schema_version,
            clean_md_sha256,
            layout_json_sha256,
            report_json_sha256,
            preview_png_sha256,
            page_count,
            audit_session_id,
            clean_spans,
            codec_audit,
        } = self;

        let clean_md_sha256 =
            clean_md_sha256.ok_or(DocumentExtensionError::MissingField("clean_md_sha256"))?;
        let layout_json_sha256 = layout_json_sha256
            .ok_or(DocumentExtensionError::MissingField("layout_json_sha256"))?;
        let report_json_sha256 = report_json_sha256
            .ok_or(DocumentExtensionError::MissingField("report_json_sha256"))?;
        let page_count = page_count.ok_or(DocumentExtensionError::MissingField("page_count"))?;
        let audit_session_id =
            audit_session_id.ok_or(DocumentExtensionError::MissingField("audit_session_id"))?;

        Ok(DocumentExtension {
            schema_version,
            clean_md_sha256,
            layout_json_sha256,
            report_json_sha256,
            preview_png_sha256,
            page_count,
            audit_session_id,
            clean_spans,
            codec_audit,
        })
    }
}
/// Error returned by `DocumentExtensionBuilder::build`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum DocumentExtensionError {
/// A required builder field was never set; carries the field name.
#[error("missing document extension field: {0}")]
MissingField(&'static str),
}
/// Origin of the text recorded in a [`CodecAuditRow`]; serialized in
/// `snake_case` (`ocr`, `embedded_text`, `transcript`, `hybrid`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum TextOrigin {
/// Serialized as `ocr`.
Ocr,
/// Serialized as `embedded_text`.
EmbeddedText,
/// Serialized as `transcript`.
Transcript,
/// Serialized as `hybrid`.
Hybrid,
}
/// Set of four independent capability flags; all `false` by default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecCapabilitySet {
/// Text capability flag.
pub text: bool,
/// Layout capability flag.
pub layout: bool,
/// Confidence capability flag.
pub confidence: bool,
/// Timestamps capability flag.
pub timestamps: bool,
}
impl CodecCapabilitySet {
pub const TEXT_ONLY: Self = Self {
text: true,
layout: false,
confidence: false,
timestamps: false,
};
pub const fn new(text: bool, layout: bool, confidence: bool, timestamps: bool) -> Self {
Self {
text,
layout,
confidence,
timestamps,
}
}
pub fn contains(self, requested: Self) -> bool {
(!requested.text || self.text)
&& (!requested.layout || self.layout)
&& (!requested.confidence || self.confidence)
&& (!requested.timestamps || self.timestamps)
}
}
/// Extraction-density policy recorded in a [`CodecAuditRow`]; serialized in
/// `snake_case`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionDensityPolicy {
/// A density value is required.
/// NOTE(review): presumably a minimum threshold — confirm units with the codec code.
Required(f32),
/// Density checking does not apply, for the stated reason.
Exempt { reason: String },
}
impl Default for ExtractionDensityPolicy {
fn default() -> Self {
Self::Exempt {
reason: "calibration_pending".to_string(),
}
}
}
/// Audit entry describing one codec.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CodecAuditRow {
/// Codec identifier.
pub codec_id: String,
/// Codec version string.
pub codec_version: String,
/// MIME type recorded as accepted.
pub accepted_mime: String,
/// Capabilities recorded as advertised.
pub advertised: CodecCapabilitySet,
/// Capabilities recorded as delivered.
pub delivered: CodecCapabilitySet,
/// Origin of the text.
pub text_origin: TextOrigin,
/// Schema version of the codec output; `CodecAuditRow::new` sets it to 1.
pub codec_output_schema_version: u16,
/// Optional hex string of an options hash; omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub options_hash_hex: Option<String>,
/// Optional engine provenance string; omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub engine_provenance: Option<String>,
/// Extraction-density policy recorded for this codec.
pub extraction_density_policy: ExtractionDensityPolicy,
}
impl CodecAuditRow {
    /// Creates a row from the codec identity, accepted MIME type and text
    /// origin.
    ///
    /// Both capability sets start at their default (all flags `false`), the
    /// output schema version is `1`, the optional fields are `None`, and the
    /// density policy is `ExtractionDensityPolicy::default()`.
    pub fn new(
        codec_id: impl Into<String>,
        codec_version: impl Into<String>,
        accepted_mime: impl Into<String>,
        text_origin: TextOrigin,
    ) -> Self {
        // `CodecCapabilitySet` is `Copy`, so one default value seeds both fields.
        let no_capabilities = CodecCapabilitySet::default();
        let codec_id = codec_id.into();
        let codec_version = codec_version.into();
        let accepted_mime = accepted_mime.into();
        Self {
            codec_id,
            codec_version,
            accepted_mime,
            advertised: no_capabilities,
            delivered: no_capabilities,
            text_origin,
            codec_output_schema_version: 1,
            options_hash_hex: None,
            engine_provenance: None,
            extraction_density_policy: ExtractionDensityPolicy::default(),
        }
    }
}
/// Collected leak suspects, telemetry events, derived counters and an
/// optional replay hash.
///
/// `LeakReport::from_parts` derives `stats` from `suspects` and `telemetry`
/// and leaves `replay_hash` as `None`.
#[derive(Debug, Clone, Default, PartialEq)]
#[non_exhaustive]
pub struct LeakReport {
pub suspects: Vec<LeakSuspect>,
pub telemetry: Vec<LeakReportTelemetry>,
pub stats: LeakReportStats,
pub replay_hash: Option<String>,
}
impl LeakReport {
    /// Builds a report from its suspects and telemetry, deriving the counters.
    ///
    /// `suspect_count` is the number of suspects, `locale_skipped_count` the
    /// number of `LocaleSkipped` telemetry events, and the per-kind counters
    /// tally each suspect's `kind`. All other stats keep their default value
    /// and `replay_hash` is `None`.
    pub fn from_parts(
        suspects: Vec<LeakSuspect>,
        telemetry: Vec<LeakReportTelemetry>,
    ) -> LeakReport {
        let locale_skipped_count = telemetry
            .iter()
            .filter(|event| matches!(event, LeakReportTelemetry::LocaleSkipped { .. }))
            .count();
        let mut stats = LeakReportStats {
            suspect_count: suspects.len(),
            locale_skipped_count,
            ..LeakReportStats::default()
        };
        suspects.iter().for_each(|suspect| match suspect.kind {
            LeakKind::Uncovered => stats.uncovered_count += 1,
            LeakKind::PartialBleed { .. } => stats.partial_bleed_count += 1,
            LeakKind::ClassMismatch { .. } => stats.class_mismatch_count += 1,
        });
        LeakReport {
            suspects,
            telemetry,
            stats,
            replay_hash: None,
        }
    }

    /// Appends `other`'s suspects and telemetry to this report and rebuilds
    /// it through [`LeakReport::from_parts`].
    ///
    /// Because the report is rebuilt, `stats` is recomputed and `replay_hash`
    /// becomes `None` regardless of what either report held before.
    pub fn extend(&mut self, other: LeakReport) {
        let LeakReport {
            suspects: extra_suspects,
            telemetry: extra_telemetry,
            ..
        } = other;
        let mut suspects = std::mem::take(&mut self.suspects);
        suspects.extend(extra_suspects);
        let mut telemetry = std::mem::take(&mut self.telemetry);
        telemetry.extend(extra_telemetry);
        *self = LeakReport::from_parts(suspects, telemetry);
    }
}
/// Label set whose string forms are given by `OpenAiPrivateLabel::as_str`
/// (`private_person`, `private_address`, …, `secret`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum OpenAiPrivateLabel {
PrivatePerson,
PrivateAddress,
PrivateEmail,
PrivatePhone,
PrivateUrl,
PrivateDate,
AccountNumber,
Secret,
}
impl OpenAiPrivateLabel {
/// Returns the snake_case string form of the label.
///
/// The match is exhaustive on purpose: adding a variant forces this table
/// to be updated.
pub fn as_str(self) -> &'static str {
match self {
Self::PrivatePerson => "private_person",
Self::PrivateAddress => "private_address",
Self::PrivateEmail => "private_email",
Self::PrivatePhone => "private_phone",
Self::PrivateUrl => "private_url",
Self::PrivateDate => "private_date",
Self::AccountNumber => "account_number",
Self::Secret => "secret",
}
}
}
/// Closed set of classes that can be converted to a [`PiiClass`] through
/// `SafetyNetPiiClass::to_pii_class`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum SafetyNetPiiClass {
Email,
Name,
Location,
Phone,
Url,
Date,
AccountNumber,
Secret,
}
impl SafetyNetPiiClass {
/// Converts to the pipeline's [`PiiClass`].
///
/// `Email`, `Name` and `Location` map to the built-in variants of the same
/// name. The remaining classes are built with `PiiClass::custom` using the
/// names `phone`, `url`, `date`, `account_number` and `secret`.
pub fn to_pii_class(self) -> PiiClass {
match self {
Self::Email => PiiClass::Email,
Self::Name => PiiClass::Name,
Self::Location => PiiClass::Location,
Self::Phone => PiiClass::custom("phone"),
Self::Url => PiiClass::custom("url"),
Self::Date => PiiClass::custom("date"),
Self::AccountNumber => PiiClass::custom("account_number"),
Self::Secret => PiiClass::custom("secret"),
}
}
}
/// Failures reported by the safety net.
///
/// Every `Display` message starts with `safety net` and interpolates only the
/// fields carried by the variant.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum SafetyNetError {
/// The safety net is unavailable; `reason` says why.
#[error("safety net unavailable: {reason}")]
Unavailable {
reason: String,
},
/// Weights were not found at `path`.
#[error("safety net weights missing: {path}")]
WeightsMissing {
path: String,
},
/// The model is unavailable; `reason` says why.
#[error("safety net model unavailable: {reason}")]
ModelUnavailable {
reason: String,
},
/// The model's integrity value did not match; both values are reported.
#[error("safety net model integrity mismatch: expected={expected}, actual={actual}")]
ModelIntegrityMismatch {
expected: String,
actual: String,
},
/// The input exceeded `limit`; `actual` is the observed size
/// (unit not stated here — presumably bytes, confirm with the producer).
#[error("safety net input too large: limit={limit}, actual={actual}")]
InputTooLarge {
limit: usize,
actual: usize,
},
/// The runtime failed; `message` carries the detail.
#[error("safety net runtime failed: {message}")]
Runtime {
message: String,
},
/// The output was invalid; `message` carries the detail.
#[error("safety net invalid output: {message}")]
InvalidOutput {
message: String,
},
}
/// Action recorded on a [`RedactionEntry`].
///
/// `redaction_action_as_str` maps each variant to the snake_case string used
/// when an entry is serialized.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Action {
Tokenize,
Redact,
FormatPreserve,
Generalize,
Preserve,
}
/// Value stored in the `decided_by` field of [`RedactionEntry`] and
/// [`Candidate`].
///
/// `redaction_conflict_tier_as_str` maps each variant to the snake_case string
/// used when an entry is serialized.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ConflictTier {
None,
ClassPriority,
RulePriority,
Score,
SpanLength,
Validator,
ValidatorVeto,
CollisionPolicy,
AnchoredContext,
RecognizerId,
Merged,
Redact,
Resolve,
Fallback,
}
/// Reason stored in [`RedactionEntry::fallback_triggered`].
///
/// Serialized by serde's derived implementation, so variant names appear
/// unchanged (no `rename_all` is applied).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum FallbackReason {
OverlapConflict,
ValidatorVeto,
AnchorMissing,
ResidualSuspect,
}
/// Kind of document a [`RedactionEntry`] refers to.
///
/// `redaction_document_kind_as_str` renders the variants as `structured` and
/// `text`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DocumentKind {
Structured,
Text,
}
/// One record handed to a [`RedactionLogger`].
///
/// Serialization is hand-written (see the `Serialize` impl): the core fields
/// are always emitted, with `None` rendered as `null`, while the recognizer
/// lineage, `provenance_*`, `backend_silently_dropped` and `restore_*` fields
/// are emitted only when they are `Some`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RedactionEntry {
pub source: String,
// Recognizer lineage; omitted from serialized output when `None`.
pub recognizer_id: Option<String>,
pub recognizer_version_id: Option<String>,
pub class: PiiClass,
pub action: Action,
pub field_name: Option<String>,
pub document_kind: DocumentKind,
pub conflict_loser: bool,
pub decided_by: ConflictTier,
// NOTE(review): unit and epoch of this timestamp are not visible here — confirm with producers.
pub created_at: i64,
pub session_id: Option<String>,
pub validator_fail_reason: Option<ValidatorFailReason>,
pub ambiguity_record: Option<AmbiguityRecord>,
pub collision_family: Option<String>,
pub collision_variant: Option<String>,
pub fallback_triggered: Option<FallbackReason>,
// Provenance metadata; each field is omitted from serialized output when `None`.
pub provenance_stage: Option<String>,
pub provenance_model_id: Option<String>,
pub provenance_model_version: Option<String>,
pub provenance_artifact_sha256: Option<String>,
pub provenance_tokenizer_sha256: Option<String>,
pub provenance_locale_resolved: Option<String>,
pub provenance_locale_match_kind: Option<String>,
pub provenance_canonical_class: Option<String>,
pub provenance_native_class: Option<String>,
// Stored as text; `with_provenance_metadata` fills it from an `f64` via `to_string`.
pub provenance_confidence: Option<String>,
pub provenance_merged_from: Option<String>,
pub backend_silently_dropped: Option<Vec<String>>,
// Restore telemetry; populated together by `with_restore_telemetry`.
pub restore_policy: Option<String>,
pub restore_decision: Option<String>,
pub restore_unknown_token_count: Option<u64>,
pub restore_manifest_bypass_count: Option<u64>,
pub restore_fresh_pii_count: Option<u64>,
pub restore_phase_mask: Option<u32>,
}
impl Serialize for RedactionEntry {
    /// Serializes the entry as a struct named `RedactionEntry`.
    ///
    /// Fourteen core fields are always written, with `None` rendered by the
    /// serializer's own `Option` handling. The recognizer lineage,
    /// `provenance_*`, `backend_silently_dropped` and `restore_*` fields are
    /// written only when they are `Some`. Field order is fixed, and the length
    /// hint passed to the serializer counts exactly the fields that will be
    /// written.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeStruct;

        // Number of fields that are emitted unconditionally.
        const ALWAYS_PRESENT: usize = 14;

        // Optional text fields, grouped by where they appear in the output.
        let recognizer_lineage = [
            ("recognizer_id", &self.recognizer_id),
            ("recognizer_version_id", &self.recognizer_version_id),
        ];
        let provenance = [
            ("provenance_stage", &self.provenance_stage),
            ("provenance_model_id", &self.provenance_model_id),
            ("provenance_model_version", &self.provenance_model_version),
            ("provenance_artifact_sha256", &self.provenance_artifact_sha256),
            ("provenance_tokenizer_sha256", &self.provenance_tokenizer_sha256),
            ("provenance_locale_resolved", &self.provenance_locale_resolved),
            ("provenance_locale_match_kind", &self.provenance_locale_match_kind),
            ("provenance_canonical_class", &self.provenance_canonical_class),
            ("provenance_native_class", &self.provenance_native_class),
            ("provenance_confidence", &self.provenance_confidence),
            ("provenance_merged_from", &self.provenance_merged_from),
        ];
        let restore_text = [
            ("restore_policy", &self.restore_policy),
            ("restore_decision", &self.restore_decision),
        ];
        let restore_counts = [
            ("restore_unknown_token_count", self.restore_unknown_token_count),
            ("restore_manifest_bypass_count", self.restore_manifest_bypass_count),
            ("restore_fresh_pii_count", self.restore_fresh_pii_count),
        ];

        let present_text = recognizer_lineage
            .iter()
            .chain(&provenance)
            .chain(&restore_text)
            .filter(|(_, value)| value.is_some())
            .count();
        let present_counts = restore_counts
            .iter()
            .filter(|(_, value)| value.is_some())
            .count();
        let len = ALWAYS_PRESENT
            + present_text
            + present_counts
            + usize::from(self.backend_silently_dropped.is_some())
            + usize::from(self.restore_phase_mask.is_some());

        let mut state = serializer.serialize_struct("RedactionEntry", len)?;
        state.serialize_field("source", &self.source)?;
        for (key, value) in recognizer_lineage {
            if let Some(text) = value {
                state.serialize_field(key, text)?;
            }
        }
        state.serialize_field("class", &self.class.to_canonical_str())?;
        state.serialize_field("action", redaction_action_as_str(self.action))?;
        state.serialize_field("field_name", &self.field_name)?;
        state.serialize_field(
            "document_kind",
            redaction_document_kind_as_str(self.document_kind),
        )?;
        state.serialize_field("conflict_loser", &self.conflict_loser)?;
        state.serialize_field(
            "decided_by",
            redaction_conflict_tier_as_str(self.decided_by),
        )?;
        state.serialize_field("created_at", &self.created_at)?;
        state.serialize_field("session_id", &self.session_id)?;
        state.serialize_field("validator_fail_reason", &self.validator_fail_reason)?;
        state.serialize_field("ambiguity_record", &self.ambiguity_record)?;
        state.serialize_field("collision_family", &self.collision_family)?;
        state.serialize_field("collision_variant", &self.collision_variant)?;
        state.serialize_field("fallback_triggered", &self.fallback_triggered)?;
        for (key, value) in provenance {
            if let Some(text) = value {
                state.serialize_field(key, text)?;
            }
        }
        if let Some(dropped) = &self.backend_silently_dropped {
            state.serialize_field("backend_silently_dropped", dropped)?;
        }
        for (key, value) in restore_text {
            if let Some(text) = value {
                state.serialize_field(key, text)?;
            }
        }
        for (key, value) in restore_counts {
            if let Some(count) = value {
                state.serialize_field(key, &count)?;
            }
        }
        if let Some(mask) = self.restore_phase_mask {
            state.serialize_field("restore_phase_mask", &mask)?;
        }
        state.end()
    }
}
/// Returns the snake_case string written for `action` when a
/// [`RedactionEntry`] is serialized.
///
/// The match is exhaustive on purpose so a new [`Action`] variant cannot be
/// added without choosing its string form.
fn redaction_action_as_str(action: Action) -> &'static str {
match action {
Action::Tokenize => "tokenize",
Action::Redact => "redact",
Action::FormatPreserve => "format_preserve",
Action::Generalize => "generalize",
Action::Preserve => "preserve",
}
}
/// Returns the lowercase string written for `kind` when a [`RedactionEntry`]
/// is serialized. The match is exhaustive over [`DocumentKind`].
fn redaction_document_kind_as_str(kind: DocumentKind) -> &'static str {
match kind {
DocumentKind::Structured => "structured",
DocumentKind::Text => "text",
}
}
/// Returns the snake_case string written for `tier` when a [`RedactionEntry`]
/// is serialized.
///
/// The match is exhaustive on purpose so a new [`ConflictTier`] variant cannot
/// be added without choosing its string form.
fn redaction_conflict_tier_as_str(tier: ConflictTier) -> &'static str {
match tier {
ConflictTier::None => "none",
ConflictTier::ClassPriority => "class_priority",
ConflictTier::RulePriority => "rule_priority",
ConflictTier::Score => "score",
ConflictTier::SpanLength => "span_length",
ConflictTier::Validator => "validator",
ConflictTier::ValidatorVeto => "validator_veto",
ConflictTier::CollisionPolicy => "collision_policy",
ConflictTier::AnchoredContext => "anchored_context",
ConflictTier::RecognizerId => "recognizer_id",
ConflictTier::Merged => "merged",
ConflictTier::Redact => "redact",
ConflictTier::Resolve => "resolve",
ConflictTier::Fallback => "fallback",
}
}
impl RedactionEntry {
    /// Creates an entry from the core fields; every optional metadata field
    /// (recognizer lineage, validator/ambiguity/collision/fallback data,
    /// provenance, backend drops and restore telemetry) starts as `None`.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        source: impl Into<String>,
        class: PiiClass,
        action: Action,
        field_name: Option<String>,
        document_kind: DocumentKind,
        conflict_loser: bool,
        decided_by: ConflictTier,
        created_at: i64,
        session_id: Option<String>,
    ) -> Self {
        Self {
            // Caller-supplied core fields.
            source: source.into(),
            class,
            action,
            field_name,
            document_kind,
            conflict_loser,
            decided_by,
            created_at,
            session_id,
            // Everything below is filled in later through the `with_*` builders.
            recognizer_id: None,
            recognizer_version_id: None,
            validator_fail_reason: None,
            ambiguity_record: None,
            collision_family: None,
            collision_variant: None,
            fallback_triggered: None,
            provenance_stage: None,
            provenance_model_id: None,
            provenance_model_version: None,
            provenance_artifact_sha256: None,
            provenance_tokenizer_sha256: None,
            provenance_locale_resolved: None,
            provenance_locale_match_kind: None,
            provenance_canonical_class: None,
            provenance_native_class: None,
            provenance_confidence: None,
            provenance_merged_from: None,
            backend_silently_dropped: None,
            restore_policy: None,
            restore_decision: None,
            restore_unknown_token_count: None,
            restore_manifest_bypass_count: None,
            restore_fresh_pii_count: None,
            restore_phase_mask: None,
        }
    }

    /// Sets `validator_fail_reason` to `Some(reason)`.
    pub fn with_validator_fail_reason(self, reason: ValidatorFailReason) -> Self {
        Self {
            validator_fail_reason: Some(reason),
            ..self
        }
    }

    /// Sets `ambiguity_record` to `Some(record)`.
    pub fn with_ambiguity_record(self, record: AmbiguityRecord) -> Self {
        Self {
            ambiguity_record: Some(record),
            ..self
        }
    }

    /// Replaces both collision fields; passing `None` clears a field.
    pub fn with_collision_metadata(
        self,
        family: Option<String>,
        variant: Option<String>,
    ) -> Self {
        Self {
            collision_family: family,
            collision_variant: variant,
            ..self
        }
    }

    /// Sets `fallback_triggered` to `Some(reason)`.
    pub fn with_fallback_triggered(self, reason: FallbackReason) -> Self {
        Self {
            fallback_triggered: Some(reason),
            ..self
        }
    }

    /// Sets `backend_silently_dropped` to `Some(dropped)`.
    pub fn with_backend_silently_dropped(self, dropped: Vec<String>) -> Self {
        Self {
            backend_silently_dropped: Some(dropped),
            ..self
        }
    }

    /// Copies all six restore fields out of `telemetry`, each wrapped in `Some`.
    pub fn with_restore_telemetry(self, telemetry: RestoreTelemetry) -> Self {
        Self {
            restore_policy: Some(telemetry.restore_policy_str().to_string()),
            restore_decision: Some(telemetry.restore_decision_str().to_string()),
            restore_unknown_token_count: Some(telemetry.unknown_token_count),
            restore_manifest_bypass_count: Some(telemetry.manifest_bypass_count),
            restore_fresh_pii_count: Some(telemetry.fresh_pii_detected_count),
            restore_phase_mask: Some(telemetry.phase_execution_mask),
            ..self
        }
    }

    /// Replaces both recognizer lineage fields; passing `None` clears a field.
    pub fn with_recognizer_metadata(
        self,
        recognizer_id: Option<String>,
        recognizer_version_id: Option<String>,
    ) -> Self {
        Self {
            recognizer_id,
            recognizer_version_id,
            ..self
        }
    }

    /// Replaces every `provenance_*` field; passing `None` clears a field.
    ///
    /// `confidence` is stored as text, produced by `f64::to_string`.
    #[allow(clippy::too_many_arguments)]
    pub fn with_provenance_metadata(
        self,
        stage: Option<String>,
        model_id: Option<String>,
        model_version: Option<String>,
        artifact_sha256: Option<String>,
        tokenizer_sha256: Option<String>,
        locale_resolved: Option<String>,
        locale_match_kind: Option<String>,
        canonical_class: Option<String>,
        native_class: Option<String>,
        confidence: Option<f64>,
        merged_from: Option<String>,
    ) -> Self {
        Self {
            provenance_stage: stage,
            provenance_model_id: model_id,
            provenance_model_version: model_version,
            provenance_artifact_sha256: artifact_sha256,
            provenance_tokenizer_sha256: tokenizer_sha256,
            provenance_locale_resolved: locale_resolved,
            provenance_locale_match_kind: locale_match_kind,
            provenance_canonical_class: canonical_class,
            provenance_native_class: native_class,
            provenance_confidence: confidence.map(|value| value.to_string()),
            provenance_merged_from: merged_from,
            ..self
        }
    }
}
/// Error returned by [`RedactionLogger::log`]; each variant carries a
/// free-form message that is appended to a fixed prefix.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[non_exhaustive]
pub enum RedactionLogError {
/// Rendered as `sqlite redaction log error: <message>`.
#[error("sqlite redaction log error: {0}")]
Sqlite(String),
/// Rendered as `backend redaction log error: <message>`.
#[error("backend redaction log error: {0}")]
Backend(String),
}
/// Sink for [`RedactionEntry`] records. Implementations must be `Send + Sync`.
pub trait RedactionLogger: Send + Sync {
/// Records one entry, reporting failure as a [`RedactionLogError`].
fn log(&self, entry: &RedactionEntry) -> Result<(), RedactionLogError>;
}
/// Safety tier with three levels; the default is [`SafetyTier::SafeDefault`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum SafetyTier {
    #[default]
    SafeDefault,
    LocaleGated,
    OptIn,
}

/// Error returned by [`SafetyTier::parse`]; it keeps the rejected input.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct SafetyTierParseError {
    value: String,
}

impl SafetyTier {
    /// Parses the exact strings produced by [`SafetyTier::as_str`].
    ///
    /// Matching is case-sensitive and does not trim whitespace.
    ///
    /// # Errors
    /// Returns [`SafetyTierParseError`] holding `value` when it names no tier.
    pub fn parse(value: &str) -> Result<Self, SafetyTierParseError> {
        [Self::SafeDefault, Self::LocaleGated, Self::OptIn]
            .into_iter()
            .find(|tier| tier.as_str() == value)
            .ok_or_else(|| SafetyTierParseError {
                value: value.to_owned(),
            })
    }

    /// Returns the snake_case name of the tier.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::SafeDefault => "safe_default",
            Self::LocaleGated => "locale_gated",
            Self::OptIn => "opt_in",
        }
    }
}

impl SafetyTierParseError {
    /// The input string that failed to parse.
    pub fn value(&self) -> &str {
        self.value.as_str()
    }
}

impl fmt::Display for SafetyTierParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "unsupported safety_tier '{}'", self.value())
    }
}

impl std::error::Error for SafetyTierParseError {}
/// Locale identifier: a fixed set of known locales plus a catch-all.
///
/// `LocaleTag::parse` produces these values and `LocaleTag::as_str` renders
/// them (`global`, `de-DE`, …).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LocaleTag {
/// Parsed from `global` or `*`.
Global,
DeDe,
DeAt,
DeCh,
EnUs,
EnGb,
EnIe,
EnAu,
EnCa,
/// Any other tag accepted by `LocaleTag::parse`; holds the tag text.
Other(String),
}
/// Error produced when a locale string cannot be turned into a locale tag.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LocaleError {
    /// The input was rejected as a locale.
    Unsupported,
}

impl fmt::Display for LocaleError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Exhaustive on purpose: a new variant must pick its own message.
        let message = match self {
            LocaleError::Unsupported => "unsupported locale",
        };
        f.write_str(message)
    }
}

impl std::error::Error for LocaleError {}
/// Ordered list of locale tags.
///
/// The public constructors route through `ensure_global`, which appends
/// [`LocaleTag::Global`] when the list does not already contain it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocaleChain(Vec<LocaleTag>);
impl LocaleTag {
    /// Constant alias for [`LocaleTag::Global`].
    pub const GLOBAL: LocaleTag = LocaleTag::Global;

    /// Parses a locale string.
    ///
    /// The input is trimmed and `_` is treated as `-`. Known locales are
    /// matched without regard to ASCII case, and `global` or `*` yield
    /// [`LocaleTag::Global`]. Anything else becomes [`LocaleTag::Other`] when
    /// `is_bcp47_parseable` accepts it, with the stored text produced by
    /// `canonical_other`.
    ///
    /// # Errors
    /// Returns [`LocaleError::Unsupported`] for empty input and for input that
    /// `is_bcp47_parseable` rejects.
    pub fn parse(s: &str) -> Result<LocaleTag, LocaleError> {
        let hyphenated = s.trim().replace('_', "-");
        let tag = match hyphenated.to_ascii_lowercase().as_str() {
            "" => return Err(LocaleError::Unsupported),
            "global" | "*" => LocaleTag::Global,
            "de-de" => LocaleTag::DeDe,
            "de-at" => LocaleTag::DeAt,
            "de-ch" => LocaleTag::DeCh,
            "en-us" => LocaleTag::EnUs,
            "en-gb" => LocaleTag::EnGb,
            "en-ie" => LocaleTag::EnIe,
            "en-au" => LocaleTag::EnAu,
            "en-ca" => LocaleTag::EnCa,
            // Unknown tags keep their original casing for canonicalization.
            _ if is_bcp47_parseable(&hyphenated) => {
                LocaleTag::Other(canonical_other(&hyphenated))
            }
            _ => return Err(LocaleError::Unsupported),
        };
        Ok(tag)
    }

    /// Returns the display form of the tag: `global`, a `ll-CC` code for the
    /// known locales, or the stored text of an `Other` tag.
    pub fn as_str(&self) -> &str {
        match self {
            Self::Global => "global",
            Self::DeDe => "de-DE",
            Self::DeAt => "de-AT",
            Self::DeCh => "de-CH",
            Self::EnUs => "en-US",
            Self::EnGb => "en-GB",
            Self::EnIe => "en-IE",
            Self::EnAu => "en-AU",
            Self::EnCa => "en-CA",
            Self::Other(custom) => custom.as_str(),
        }
    }
}
impl LocaleChain {
    /// Wraps `tags`, appending [`LocaleTag::Global`] if it is not already
    /// present.
    pub fn from_tags(tags: Vec<LocaleTag>) -> LocaleChain {
        let mut chain = tags;
        ensure_global(&mut chain);
        LocaleChain(chain)
    }

    /// Parses a comma-separated list of locale strings.
    ///
    /// # Errors
    /// Returns the first [`LocaleError`] produced by [`LocaleTag::parse`]; an
    /// empty segment (for example from a trailing comma) is such an error.
    pub fn from_cli(raw: &str) -> Result<LocaleChain, LocaleError> {
        let mut tags = Vec::new();
        for segment in raw.split(',') {
            tags.push(LocaleTag::parse(segment)?);
        }
        Ok(LocaleChain::from_tags(tags))
    }

    /// Same as [`LocaleChain::merge_cli_policy_rulepack_default`] without
    /// rulepack defaults. Note the argument order: policy first, CLI second.
    pub fn merge_policy_and_cli(
        policy: Option<&[LocaleTag]>,
        cli: Option<&[LocaleTag]>,
    ) -> LocaleChain {
        Self::merge_cli_policy_rulepack_default(cli, policy, None)
    }

    /// Picks the first non-empty list in the order CLI, policy, rulepack
    /// defaults, falling back to `[Global]` when none qualifies.
    pub fn merge_cli_policy_rulepack_default(
        cli: Option<&[LocaleTag]>,
        policy: Option<&[LocaleTag]>,
        rulepack_defaults: Option<&[LocaleTag]>,
    ) -> LocaleChain {
        let tags = [cli, policy, rulepack_defaults]
            .into_iter()
            .flatten()
            .find(|candidate| !candidate.is_empty())
            .map_or_else(|| vec![LocaleTag::Global], <[LocaleTag]>::to_vec);
        LocaleChain::from_tags(tags)
    }

    /// Returns `true` when `recognizer_locales` is empty, contains
    /// [`LocaleTag::Global`], or shares at least one tag with this chain.
    pub fn intersects(&self, recognizer_locales: &[LocaleTag]) -> bool {
        recognizer_locales.is_empty()
            || recognizer_locales
                .iter()
                .any(|wanted| matches!(wanted, LocaleTag::Global) || self.0.contains(wanted))
    }

    /// Borrows the tags in chain order.
    pub fn as_slice(&self) -> &[LocaleTag] {
        self.0.as_slice()
    }

    /// Renders every tag through its `Display` implementation.
    pub fn to_strings(&self) -> Vec<String> {
        let mut rendered = Vec::with_capacity(self.0.len());
        rendered.extend(self.0.iter().map(|tag| tag.to_string()));
        rendered
    }
}
impl From<&[LocaleTag]> for LocaleChain {
fn from(tags: &[LocaleTag]) -> Self {
let mut owned = tags.to_vec();
ensure_global(&mut owned);
LocaleChain(owned)
}
}
/// Displays the tag exactly as [`LocaleTag::as_str`] renders it.
impl fmt::Display for LocaleTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// `write_str` emits the text verbatim and ignores width/padding flags.
f.write_str(self.as_str())
}
}
/// Input document: either a map of named [`Value`]s or a single text string.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum RawDocument {
Structured(BTreeMap<String, Value>),
Text(String),
}
/// Output document with the same two shapes as [`RawDocument`].
///
/// Serialized untagged, so only the inner map or string appears in the output.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum CleanDocument {
Structured(BTreeMap<String, Value>),
Text(String),
}
/// JSON-like value used inside structured documents.
///
/// Serialized untagged, so each variant is written as its inner value. Only
/// `i64` integers are representable; there is no floating-point variant.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum Value {
Null,
Bool(bool),
String(String),
I64(i64),
Array(Vec<Value>),
Object(BTreeMap<String, Value>),
}
impl Value {
    /// Borrows the text of a `String` value; every other variant yields `None`.
    pub fn as_str(&self) -> Option<&str> {
        match self {
            Self::String(text) => Some(text.as_str()),
            Self::Null | Self::Bool(_) | Self::I64(_) | Self::Array(_) | Self::Object(_) => None,
        }
    }

    /// Renders a scalar as an owned string.
    ///
    /// Non-empty strings are cloned and booleans and integers are formatted
    /// with `to_string`. Empty strings, `Null`, arrays and objects yield `None`.
    pub fn scalar_to_safety_net_string(&self) -> Option<String> {
        match self {
            Self::Bool(flag) => Some(flag.to_string()),
            Self::I64(number) => Some(number.to_string()),
            Self::String(text) => (!text.is_empty()).then(|| text.clone()),
            Self::Null | Self::Array(_) | Self::Object(_) => None,
        }
    }
}
impl PartialEq<&str> for Value {
fn eq(&self, other: &&str) -> bool {
self.as_str() == Some(*other)
}
}
/// Dictionaries keyed by name. The derived `Default` is an empty bundle.
#[derive(Debug, Clone, Default)]
pub struct DictionaryBundle {
entries: HashMap<String, DictionaryEntry>,
}
/// One dictionary: its terms, a case-sensitivity flag and where it came from.
///
/// Fields are private; `DictionaryEntry::new` validates them on construction.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
terms: Vec<String>,
case_sensitive: bool,
source: DictionarySource,
}
/// Where a dictionary was loaded from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionarySource {
Cli,
/// Used by `DictionaryBundle::from_rulepack_terms`.
Rulepack,
}
/// Summary of one dictionary, as returned by `DictionaryBundle::stats`:
/// its name, number of terms and source.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct DictionaryStats {
pub name: String,
pub term_count: usize,
pub source: DictionarySource,
}
impl DictionaryStats {
    /// Builds a stats record from its three fields.
    pub fn new(name: impl Into<String>, term_count: usize, source: DictionarySource) -> Self {
        let name = name.into();
        Self {
            name,
            term_count,
            source,
        }
    }
}
/// Dictionary definition as supplied by a rulepack: a name, its terms and
/// whether matching is case-sensitive.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct RulepackDict {
    pub name: String,
    pub terms: Vec<String>,
    pub case_sensitive: bool,
}

impl RulepackDict {
    /// Builds a rulepack dictionary; no validation is performed here.
    pub fn new(name: impl Into<String>, terms: Vec<String>, case_sensitive: bool) -> Self {
        let name = name.into();
        Self {
            name,
            terms,
            case_sensitive,
        }
    }
}
/// Reasons a dictionary is rejected at construction time.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DictionaryLoadError {
    /// The dictionary named `name` has no terms.
    Empty { name: String },
    /// The dictionary named `name` combines non-ASCII terms with
    /// case-insensitive matching.
    UnicodeInsensitiveUnsupported { name: String },
}

impl fmt::Display for DictionaryLoadError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Empty { name } => {
                f.write_fmt(format_args!("dictionary '{name}' has no terms"))
            }
            Self::UnicodeInsensitiveUnsupported { name } => f.write_fmt(format_args!(
                "dictionary '{name}' uses unicode terms with case-insensitive matching, unsupported in v0.4.0; use case_sensitive = true"
            )),
        }
    }
}

impl std::error::Error for DictionaryLoadError {}
impl DictionaryBundle {
    /// Builds a bundle from rulepack dictionaries, tagging each entry with
    /// `DictionarySource::Rulepack`. A later dictionary with the same name
    /// replaces an earlier one.
    ///
    /// # Panics
    /// Panics if any dictionary is rejected by `DictionaryEntry::new`; the
    /// caller is expected to have validated the terms beforehand.
    pub fn from_rulepack_terms(terms: &[RulepackDict]) -> Self {
        let entries = terms
            .iter()
            .map(|dictionary| {
                let entry = DictionaryEntry::new(
                    &dictionary.name,
                    dictionary.terms.clone(),
                    dictionary.case_sensitive,
                    DictionarySource::Rulepack,
                )
                .expect("Policy validates dictionary terms before bundle construction");
                (dictionary.name.clone(), entry)
            })
            .collect();
        Self { entries }
    }

    /// Builds a bundle from `(name, entry)` pairs; later pairs replace earlier
    /// ones that share a name.
    pub fn from_entries(entries: impl IntoIterator<Item = (String, DictionaryEntry)>) -> Self {
        let entries: HashMap<String, DictionaryEntry> = entries.into_iter().collect();
        Self { entries }
    }

    /// Combines two bundles; on a name clash the entry from `b` wins.
    pub fn merge(a: Self, b: Self) -> Self {
        let Self {
            entries: mut merged,
        } = a;
        merged.extend(b.entries);
        Self { entries: merged }
    }

    /// Looks up a dictionary by name.
    pub fn get(&self, name: &str) -> Option<&DictionaryEntry> {
        self.entries.get(name)
    }

    /// Returns one stats record per dictionary, sorted by name.
    pub fn stats(&self) -> Vec<DictionaryStats> {
        let mut summaries: Vec<DictionaryStats> = self
            .entries
            .iter()
            .map(|(name, entry)| DictionaryStats {
                name: name.clone(),
                term_count: entry.terms.len(),
                source: entry.source,
            })
            .collect();
        summaries.sort_by(|left, right| left.name.cmp(&right.name));
        summaries
    }
}
impl DictionaryEntry {
    /// Validates and builds a dictionary entry.
    ///
    /// # Errors
    /// * [`DictionaryLoadError::Empty`] when `terms` is empty.
    /// * [`DictionaryLoadError::UnicodeInsensitiveUnsupported`] when matching
    ///   is case-insensitive and any term contains a non-ASCII character.
    ///
    /// The emptiness check runs first. `name` is only used to label the error.
    pub fn new(
        name: &str,
        terms: Vec<String>,
        case_sensitive: bool,
        source: DictionarySource,
    ) -> Result<Self, DictionaryLoadError> {
        if terms.is_empty() {
            let name = name.to_string();
            return Err(DictionaryLoadError::Empty { name });
        }
        // Case-sensitive dictionaries skip the ASCII scan entirely.
        if !(case_sensitive || terms.iter().all(|term| term.is_ascii())) {
            let name = name.to_string();
            return Err(DictionaryLoadError::UnicodeInsensitiveUnsupported { name });
        }
        Ok(Self {
            terms,
            case_sensitive,
            source,
        })
    }

    /// Whether matching against this dictionary is case-sensitive.
    pub fn case_sensitive(&self) -> bool {
        self.case_sensitive
    }

    /// The dictionary's terms, in the order they were supplied.
    pub fn terms(&self) -> &[String] {
        self.terms.as_slice()
    }
}
#[cfg(test)]
mod dictionary_tests {
    use super::*;

    // An empty term list is rejected and the error names the dictionary.
    #[test]
    fn dictionary_entry_rejects_empty_terms() {
        let outcome = DictionaryEntry::new("empty", Vec::new(), true, DictionarySource::Cli);
        let err = outcome.expect_err("empty dictionaries must fail closed");
        assert_eq!(
            err,
            DictionaryLoadError::Empty {
                name: "empty".to_string()
            }
        );
    }

    // A single non-ASCII term is enough to reject a case-insensitive dictionary.
    #[test]
    fn dictionary_entry_rejects_non_ascii_case_insensitive_terms() {
        let terms = vec!["Beyonce".to_string(), "Caf\u{00e9}".to_string()];
        let outcome = DictionaryEntry::new("songs", terms, false, DictionarySource::Cli);
        let err = outcome.expect_err("unicode case-insensitive dictionaries must fail closed");
        assert_eq!(
            err,
            DictionaryLoadError::UnicodeInsensitiveUnsupported {
                name: "songs".to_string()
            }
        );
    }
}
#[cfg(test)]
mod redaction_logger_tests {
    use super::*;

    // Logger that accepts every entry; proves the trait can be implemented locally.
    struct CapturingLogger;

    impl RedactionLogger for CapturingLogger {
        fn log(&self, _entry: &RedactionEntry) -> Result<(), RedactionLogError> {
            Ok(())
        }
    }

    fn assert_send_sync<T: Send + Sync + ?Sized>() {}

    // Minimal tokenized text entry with every optional field left as `None`.
    fn sample_entry(source: &str, class: PiiClass) -> RedactionEntry {
        RedactionEntry::new(
            source,
            class,
            Action::Tokenize,
            None,
            DocumentKind::Text,
            false,
            ConflictTier::None,
            0,
            None,
        )
    }

    #[test]
    fn redaction_log_error_display_is_stable() {
        let sqlite = RedactionLogError::Sqlite("write failed".to_string());
        assert_eq!(
            sqlite.to_string(),
            "sqlite redaction log error: write failed"
        );
        let backend = RedactionLogError::Backend("sink failed".to_string());
        assert_eq!(
            backend.to_string(),
            "backend redaction log error: sink failed"
        );
    }

    #[test]
    fn redaction_logger_trait_object_is_send_sync() {
        assert_send_sync::<dyn RedactionLogger>();
    }

    #[test]
    fn local_logger_can_implement_redaction_logger() {
        let logger = CapturingLogger;
        let entry = sample_entry("unit-test", PiiClass::Email);
        let trait_object: &dyn RedactionLogger = &logger;
        trait_object.log(&entry).expect("log entry");
    }

    #[test]
    fn redaction_entry_json_shape_omits_absent_recognizer_lineage() {
        let entry = sample_entry("email.global", PiiClass::Email);
        let rendered = serde_json::to_string(&entry).expect("serialize redaction entry");
        // Exact output pins both the field order and the omission of optional fields.
        assert_eq!(
            rendered,
            r#"{"source":"email.global","class":"email","action":"tokenize","field_name":null,"document_kind":"text","conflict_loser":false,"decided_by":"none","created_at":0,"session_id":null,"validator_fail_reason":null,"ambiguity_record":null,"collision_family":null,"collision_variant":null,"fallback_triggered":null}"#
        );
    }

    #[test]
    fn redaction_entry_json_shape_includes_recognizer_lineage_when_present() {
        let entry = sample_entry("ner/ort", PiiClass::Name).with_recognizer_metadata(
            Some("ner".to_string()),
            Some("ner.davlan-mbert.v1".to_string()),
        );
        let value: serde_json::Value =
            serde_json::to_value(&entry).expect("serialize redaction entry");
        assert_eq!(value["recognizer_id"], "ner");
        assert_eq!(value["recognizer_version_id"], "ner.davlan-mbert.v1");
    }

    #[test]
    fn candidate_keeps_versioned_and_unversioned_recognizer_ids() {
        let unversioned = Candidate::new(
            0..5,
            PiiClass::Email,
            "email.global",
            0.9,
            10,
            None,
            "email",
            "email.global",
            ConflictTier::None,
            Vec::new(),
        );
        assert_eq!(unversioned.recognizer_id, "email.global");
        assert_eq!(unversioned.recognizer_version_id, None);

        // Adding a version id must not disturb the unversioned recognizer id.
        let versioned = unversioned
            .clone()
            .with_recognizer_version_id("email.global.v1");
        assert_eq!(versioned.recognizer_id, "email.global");
        assert_eq!(
            versioned.recognizer_version_id.as_deref(),
            Some("email.global.v1")
        );
    }
}
#[cfg(test)]
mod safety_net_manifest_tests {
    use super::*;

    // Token span whose clean and raw ranges coincide.
    fn span(start: usize, end: usize, class: PiiClass) -> EmittedTokenSpan {
        EmittedTokenSpan {
            clean_span: start..end,
            raw_span: start..end,
            class,
        }
    }

    // Thin wrapper so each assertion reads as (manifest, suspect range, class).
    fn diff(manifest: Manifest, suspect: Range<usize>, class: PiiClass) -> Option<LeakKind> {
        manifest.diff_against(&suspect, &class)
    }

    #[test]
    fn exact_same_class_coverage_is_not_a_leak() {
        let manifest = Manifest::from_spans(vec![span(0, 8, PiiClass::Email)]);
        assert_eq!(diff(manifest, 0..8, PiiClass::Email), None);
    }

    #[test]
    fn uncovered_outside_all_tokens_is_uncovered() {
        let manifest = Manifest::from_spans(vec![span(20, 30, PiiClass::Email)]);
        assert_eq!(
            diff(manifest, 0..10, PiiClass::Email),
            Some(LeakKind::Uncovered)
        );
    }

    #[test]
    fn single_internal_gap_returns_partial_bleed() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(10, 15, PiiClass::Email),
        ]);
        assert_eq!(
            diff(manifest, 0..15, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 5..10 })
        );
    }

    #[test]
    fn multi_gap_returns_deterministic_first_uncovered_gap() {
        let manifest = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(5, 7, PiiClass::Email),
            span(9, 12, PiiClass::Email),
        ]);
        assert_eq!(
            diff(manifest, 0..12, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..5 })
        );
    }

    #[test]
    fn multi_class_overlap_reports_first_mismatch_deterministically() {
        let manifest = Manifest::from_spans(vec![
            span(0, 4, PiiClass::Name),
            span(4, 8, PiiClass::Location),
        ]);
        assert_eq!(
            diff(manifest, 0..8, PiiClass::Email),
            Some(LeakKind::ClassMismatch {
                pipeline_class: PiiClass::Name,
                safety_net_class: PiiClass::Email,
            })
        );
    }

    #[test]
    fn adjacent_same_class_tokens_cover_continuously() {
        let manifest = Manifest::from_spans(vec![
            span(0, 5, PiiClass::Email),
            span(5, 10, PiiClass::Email),
        ]);
        assert_eq!(diff(manifest, 0..10, PiiClass::Email), None);
    }

    #[test]
    fn partial_bleed_at_start_end_and_middle() {
        let manifest = Manifest::from_spans(vec![span(3, 8, PiiClass::Email)]);
        assert_eq!(
            diff(manifest.clone(), 0..8, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 0..3 })
        );
        assert_eq!(
            diff(manifest.clone(), 3..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 8..10 })
        );
        let with_gap = Manifest::from_spans(vec![
            span(0, 3, PiiClass::Email),
            span(6, 10, PiiClass::Email),
        ]);
        assert_eq!(
            diff(with_gap, 0..10, PiiClass::Email),
            Some(LeakKind::PartialBleed { uncovered: 3..6 })
        );
    }

    #[test]
    fn byte_indices_are_not_character_indices() {
        let text = "ID: 😀 <Email_1>";
        let token_start = text.find("<Email_1>").expect("token start");
        assert_eq!(token_start, 9, "emoji is four bytes, not one char");
        let manifest = Manifest::from_spans(vec![span(token_start, text.len(), PiiClass::Email)]);
        assert_eq!(
            diff(manifest, token_start..text.len(), PiiClass::Email),
            None
        );
    }

    #[test]
    fn empty_suspect_range_is_not_a_leak() {
        let manifest = Manifest::default();
        assert_eq!(diff(manifest, 3..3, PiiClass::Email), None);
    }

    #[test]
    fn safety_net_error_display_is_variant_specific_and_bytes_free() {
        let cases = [
            SafetyNetError::Unavailable {
                reason: "not configured".to_string(),
            }
            .to_string(),
            SafetyNetError::WeightsMissing {
                path: "/models/opf".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelUnavailable {
                reason: "load failed".to_string(),
            }
            .to_string(),
            SafetyNetError::ModelIntegrityMismatch {
                expected: "e3b0c44298fc1c149afbf4c8996fb924".to_string(),
                actual: "4e07408562bedb8b60ce05c1decfe3ad".to_string(),
            }
            .to_string(),
            SafetyNetError::InputTooLarge {
                limit: 1024,
                actual: 2048,
            }
            .to_string(),
            SafetyNetError::Runtime {
                message: "timeout".to_string(),
            }
            .to_string(),
            SafetyNetError::InvalidOutput {
                message: "bad json".to_string(),
            }
            .to_string(),
        ];

        // Variant-specific: no two variants may render to the same message.
        // Without this check the test could not fail on a copy-pasted
        // `#[error]` format string.
        let mut unique = cases.to_vec();
        unique.sort();
        unique.dedup();
        assert_eq!(
            unique.len(),
            cases.len(),
            "every SafetyNetError variant must render a distinct message"
        );

        // Bytes-free: messages contain only the fields supplied above.
        for rendered in &cases {
            assert!(!rendered.contains("alice@example.invalid"));
        }
    }
}
/// A detector that turns input text into [`Candidate`]s.
/// Implementations must be `Send + Sync`.
pub trait Recognizer: Send + Sync {
/// Identifier of this recognizer.
fn id(&self) -> &str;
/// The [`PiiClass`] this recognizer reports.
fn supported_class(&self) -> &PiiClass;
/// Runs detection over `input` with the given context.
fn detect(&self, input: &str, ctx: &DetectContext<'_>) -> Vec<Candidate>;
/// Fallible variant of [`Recognizer::detect`].
///
/// The default implementation never fails: it wraps `detect` in `Ok`.
/// Override it to surface a [`RecognizerRuntimeError`].
fn try_detect(
&self,
input: &str,
ctx: &DetectContext<'_>,
) -> Result<Vec<Candidate>, RecognizerRuntimeError> {
Ok(self.detect(input, ctx))
}
/// Token family name for this recognizer.
fn token_family(&self) -> &str;
/// Validator associated with this recognizer; `None` by default.
fn validator_kind(&self) -> Option<ValidatorKind> {
None
}
/// Locales this recognizer applies to; `[LocaleTag::Global]` by default.
fn locales(&self) -> &[LocaleTag] {
&[LocaleTag::Global]
}
}
/// A single detection candidate produced by a recognizer.
///
/// Derives `PartialEq` but not `Eq` because `score` is an `f32`.
/// The struct is `#[non_exhaustive]`, so code outside this crate must build
/// it through `Candidate::new` rather than a struct literal.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Candidate {
/// Range of the input covered by this candidate.
// NOTE(review): presumably byte offsets into the scanned input — confirm.
pub span: Range<usize>,
/// PII class assigned to the candidate.
pub class: PiiClass,
/// Identifier of the recognizer recorded on this candidate.
pub recognizer_id: String,
/// Optional recognizer version identifier. `Candidate::new` leaves this as
/// `None`; `with_recognizer_version_id` sets it.
pub recognizer_version_id: Option<String>,
/// Score attached to the candidate.
// NOTE(review): scale and range are not visible here — confirm.
pub score: f32,
/// Priority attached to the candidate.
// NOTE(review): ordering direction is not visible here — confirm.
pub priority: i32,
/// Optional canonical form of the matched value.
pub canonical_form: Option<String>,
/// Token family name recorded on the candidate.
pub token_family: String,
/// Source label recorded on the candidate.
pub source: String,
/// Conflict tier recorded on the candidate.
// NOTE(review): presumably the tier that settled overlap resolution — confirm.
pub decided_by: ConflictTier,
/// Source labels merged into this candidate.
pub merged_sources: Vec<String>,
}
impl Candidate {
    /// Builds a candidate from its individual parts.
    ///
    /// `recognizer_version_id` always starts as `None`; attach one afterwards
    /// with [`Candidate::with_recognizer_version_id`].
    // One argument per stored field, hence the lint exemption.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        span: Range<usize>,
        class: PiiClass,
        recognizer_id: impl Into<String>,
        score: f32,
        priority: i32,
        canonical_form: Option<String>,
        token_family: impl Into<String>,
        source: impl Into<String>,
        decided_by: ConflictTier,
        merged_sources: Vec<String>,
    ) -> Self {
        // Convert the string-like arguments up front so the literal below
        // can use field-init shorthand throughout.
        let recognizer_id: String = recognizer_id.into();
        let token_family: String = token_family.into();
        let source: String = source.into();
        Self {
            span,
            class,
            recognizer_id,
            recognizer_version_id: None,
            score,
            priority,
            canonical_form,
            token_family,
            source,
            decided_by,
            merged_sources,
        }
    }

    /// Returns the candidate with its span replaced by `span`.
    pub fn with_span(self, span: Range<usize>) -> Self {
        Self { span, ..self }
    }

    /// Returns the candidate with its recognizer version id set to the given value.
    pub fn with_recognizer_version_id(self, recognizer_version_id: impl Into<String>) -> Self {
        Self {
            recognizer_version_id: Some(recognizer_version_id.into()),
            ..self
        }
    }
}
/// Context handed to `Recognizer::detect` / `Recognizer::try_detect`.
///
/// All data is borrowed for `'a`. The struct is `#[non_exhaustive]`, so code
/// outside this crate must build it through `DetectContext::new`.
#[non_exhaustive]
pub struct DetectContext<'a> {
/// Borrowed list of locale tags.
// NOTE(review): the name suggests an ordered fallback chain — confirm.
pub locale_chain: &'a [LocaleTag],
/// Borrowed dictionary bundle available to the recognizer.
pub dictionaries: &'a DictionaryBundle,
/// Reference to the unit type; `DetectContext::new` always sets it to `&()`.
// NOTE(review): looks like a placeholder for future data — confirm.
pub fields: &'a (),
/// Flag that starts as `false` in `DetectContext::new`. It is a `Cell` so
/// it can be updated through the shared `&DetectContext` that recognizers
/// receive.
pub degraded: Cell<bool>,
}
impl<'a> DetectContext<'a> {
pub fn new(locale_chain: &'a [LocaleTag], dictionaries: &'a DictionaryBundle) -> Self {
Self {
locale_chain,
dictionaries,
fields: &(),
degraded: Cell::new(false),
}
}
}
/// Appends `LocaleTag::Global` to `tags` unless it is already present.
///
/// Existing entries keep their order; when added, the global tag goes last.
fn ensure_global(tags: &mut Vec<LocaleTag>) {
    let already_present = tags.iter().any(|tag| *tag == LocaleTag::Global);
    if already_present {
        return;
    }
    tags.push(LocaleTag::Global);
}
/// Loose structural check for a `-`-separated locale tag.
///
/// Returns `true` when every subtag is 2 to 8 bytes long, the first
/// (language) subtag is purely ASCII letters, and each later subtag is
/// purely ASCII letters or digits. This is a shape check only, not full
/// BCP 47 validation: empty and single-character subtags are rejected.
fn is_bcp47_parseable(raw: &str) -> bool {
    let well_sized = |subtag: &str| matches!(subtag.len(), 2..=8);
    raw.split('-').enumerate().all(|(position, subtag)| {
        if !well_sized(subtag) {
            return false;
        }
        if position == 0 {
            // Language subtag: letters only.
            subtag.bytes().all(|byte| byte.is_ascii_alphabetic())
        } else {
            // Any later subtag: letters or digits.
            subtag.bytes().all(|byte| byte.is_ascii_alphanumeric())
        }
    })
}
/// Normalizes the ASCII casing of a `-`-separated locale tag.
///
/// The first subtag is lowercased. Every later subtag made of exactly two
/// ASCII letters is uppercased; all other subtags are lowercased. Separators
/// and empty subtags are kept as they are.
fn canonical_other(raw: &str) -> String {
    // Only ASCII case changes, so the output is exactly as long as the input.
    let mut canonical = String::with_capacity(raw.len());
    for (position, subtag) in raw.split('-').enumerate() {
        if position > 0 {
            canonical.push('-');
        }
        let two_letter_code = position > 0
            && subtag.len() == 2
            && subtag.bytes().all(|byte| byte.is_ascii_alphabetic());
        let cased = if two_letter_code {
            subtag.to_ascii_uppercase()
        } else {
            subtag.to_ascii_lowercase()
        };
        canonical.push_str(&cased);
    }
    canonical
}