// pii 0.1.0
//
// PII detection and anonymization with deterministic, capability-aware NLP pipelines.
// Documentation
//! Core data types for PII detection and anonymization.
//!
//! These types represent the stable contract of the library. Offsets are
//! byte offsets into the original UTF-8 input and must never split codepoints.
//! All recognizers and anonymizers operate on these types.

use crate::capabilities::Capabilities;
use serde::{Deserialize, Serialize};
use std::fmt;

/// Language tag wrapper used across the pipeline.
///
/// A newtype over `String`. The tag is stored exactly as supplied: this type
/// performs no validation or normalization, and the derived equality and
/// hashing compare the stored string as-is (so they are case-sensitive).
#[derive(Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct Language(String);

impl Language {
    /// Creates a new language tag.
    pub fn new(code: impl Into<String>) -> Self {
        Self(code.into())
    }

    /// Returns the underlying language tag string.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for Language {
    /// Writes the stored tag by delegating to `String`'s `Display`
    /// implementation, so formatter flags are handled exactly as they would be
    /// for the inner string.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Fully-qualified call on purpose: only the `fmt` module is imported,
        // so no formatting trait is in scope for method-call syntax, and
        // `.fmt(..)` would be ambiguous if several of them ever were.
        fmt::Display::fmt(&self.0, f)
    }
}

impl From<&str> for Language {
    fn from(value: &str) -> Self {
        Self::new(value)
    }
}

impl From<String> for Language {
    fn from(value: String) -> Self {
        Self::new(value)
    }
}

/// English language tag (`"en"`).
pub const LANGUAGE_EN: &str = "en";
/// German language tag (`"de"`).
pub const LANGUAGE_DE: &str = "de";
/// Spanish language tag (`"es"`).
pub const LANGUAGE_ES: &str = "es";

/// Built-in and custom entity types.
///
/// The derived `Eq` and `Hash` allow values to be used as map or set keys.
/// Anything not covered by a built-in variant can be expressed with
/// [`EntityType::Custom`], which carries a caller-chosen name.
#[derive(Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum EntityType {
    /// Email address.
    Email,
    /// Phone number.
    Phone,
    /// IPv4 address (IPv6 has its own variant, [`EntityType::Ipv6`]).
    IpAddress,
    /// IPv6 address.
    Ipv6,
    /// Credit card number.
    CreditCard,
    /// IBAN.
    Iban,
    /// US Social Security Number.
    Ssn,
    /// Individual Taxpayer Identification Number.
    Itin,
    /// Tax identification number.
    TaxId,
    /// Passport identifier.
    Passport,
    /// Driver's license identifier.
    DriverLicense,
    /// Bank account number.
    BankAccount,
    /// Routing number.
    RoutingNumber,
    /// Cryptocurrency address.
    CryptoAddress,
    /// MAC address.
    MacAddress,
    /// UUID.
    Uuid,
    /// Vehicle identification number.
    Vin,
    /// IMEI.
    Imei,
    /// URL.
    Url,
    /// Domain name.
    Domain,
    /// Hostname.
    Hostname,
    /// Person entity (NER).
    Person,
    /// Location entity (NER).
    Location,
    /// Organization entity (NER).
    Organization,
    /// Custom entity type, identified by the contained name.
    Custom(String),
}

impl EntityType {
    /// Returns a stable string identifier.
    pub fn as_str(&self) -> String {
        match self {
            EntityType::Email => "Email".to_string(),
            EntityType::Phone => "Phone".to_string(),
            EntityType::IpAddress => "IpAddress".to_string(),
            EntityType::Ipv6 => "Ipv6".to_string(),
            EntityType::CreditCard => "CreditCard".to_string(),
            EntityType::Iban => "Iban".to_string(),
            EntityType::Ssn => "Ssn".to_string(),
            EntityType::Itin => "Itin".to_string(),
            EntityType::TaxId => "TaxId".to_string(),
            EntityType::Passport => "Passport".to_string(),
            EntityType::DriverLicense => "DriverLicense".to_string(),
            EntityType::BankAccount => "BankAccount".to_string(),
            EntityType::RoutingNumber => "RoutingNumber".to_string(),
            EntityType::CryptoAddress => "CryptoAddress".to_string(),
            EntityType::MacAddress => "MacAddress".to_string(),
            EntityType::Uuid => "Uuid".to_string(),
            EntityType::Vin => "Vin".to_string(),
            EntityType::Imei => "Imei".to_string(),
            EntityType::Url => "Url".to_string(),
            EntityType::Domain => "Domain".to_string(),
            EntityType::Hostname => "Hostname".to_string(),
            EntityType::Person => "Person".to_string(),
            EntityType::Location => "Location".to_string(),
            EntityType::Organization => "Organization".to_string(),
            EntityType::Custom(name) => name.clone(),
        }
    }
}

/// Token with stable byte offsets and optional linguistic tags.
///
/// Per the module contract, `start` and `end` are byte offsets into the
/// original UTF-8 input and must not split a codepoint.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Token {
    /// Token text.
    pub text: String,
    /// Start byte offset.
    pub start: usize,
    /// End byte offset.
    // NOTE(review): presumably exclusive (half-open `start..end`) — confirm against the tokenizer.
    pub end: usize,
    /// Lemma form, if available; `None` otherwise.
    pub lemma: Option<String>,
    /// Part-of-speech tag, if available; `None` otherwise.
    pub pos: Option<String>,
}

/// NLP artifacts returned by an `NlpEngine`.
///
/// Bundles everything an engine derived from one input text: tokens,
/// sentence boundaries, NER spans, and the capability flags describing the
/// analysis.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NlpArtifacts {
    /// Language tag.
    pub language: Language,
    /// Input text length in bytes.
    pub text_len: usize,
    /// Tokens with offsets.
    pub tokens: Vec<Token>,
    /// Sentence boundary offsets.
    // NOTE(review): presumably `(start, end)` byte-offset pairs, one per sentence — confirm.
    pub sentences: Vec<(usize, usize)>,
    /// NER spans, if available.
    // NOTE(review): presumably left empty when the engine provides no NER — confirm.
    pub ner: Vec<NerSpan>,
    /// Capability flags for this analysis.
    pub capabilities: Capabilities,
}

/// NER span produced by an NLP model.
///
/// Per the module contract, `start` and `end` are byte offsets into the
/// original UTF-8 input and must not split a codepoint.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NerSpan {
    /// Entity type emitted by the model.
    pub entity_type: EntityType,
    /// Start byte offset.
    pub start: usize,
    /// End byte offset.
    // NOTE(review): presumably exclusive (half-open `start..end`) — confirm against the engine.
    pub end: usize,
    /// Model confidence score.
    // NOTE(review): value range is not defined here (presumably 0.0..=1.0) — confirm.
    pub score: f32,
    /// Model identifier.
    pub model: String,
}

/// Detected entity span produced by a recognizer.
///
/// Per the module contract, `start` and `end` are byte offsets into the
/// original UTF-8 input and must not split a codepoint.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Detection {
    /// Entity type for this detection.
    pub entity_type: EntityType,
    /// Start byte offset.
    pub start: usize,
    /// End byte offset.
    // NOTE(review): presumably exclusive (half-open `start..end`) — confirm against the recognizers.
    pub end: usize,
    /// Confidence score.
    // NOTE(review): value range is not defined here (presumably 0.0..=1.0) — confirm.
    pub score: f32,
    /// Recognizer name.
    pub recognizer: String,
    /// Explanation of how the detection was produced.
    pub explanation: DetectionExplanation,
}

/// Explanation of how a detection was produced.
///
/// Each variant corresponds to one detection mechanism and carries the data
/// describing that mechanism's result.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum DetectionExplanation {
    /// Regex-based match.
    ///
    /// `pattern_name` names the pattern involved.
    // NOTE(review): presumably the name of the pattern that matched — confirm against the regex recognizers.
    Regex { pattern_name: String },
    /// Validator-based match.
    ///
    /// `validator` names the validator and `passed` records its boolean outcome.
    // NOTE(review): presumably `passed == true` means the candidate validated successfully — confirm.
    Validator { validator: String, passed: bool },
    /// Dictionary term match.
    ///
    /// `source` identifies where the term came from.
    // NOTE(review): presumably a dictionary or word-list name — confirm.
    Dictionary { source: String },
    /// Model-driven span.
    ///
    /// `model` identifies the model and `raw_score` is a score attributed to it.
    // NOTE(review): presumably the model's score before any adjustment — confirm.
    Ner { model: String, raw_score: f32 },
    /// Context-based score boost.
    ContextBoost {
        // NOTE(review): presumably the score before the boost was applied — confirm.
        base: f32,
        // NOTE(review): presumably the boost amount; additive vs. multiplicative is not visible here — confirm.
        boost: f32,
        // NOTE(review): presumably the context terms that triggered the boost — confirm.
        matched_terms: Vec<String>,
    },
}

/// Output of the analyzer.
///
/// Carries the language of the analyzed text, the detections that were kept,
/// and the capability flags describing the analysis.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AnalyzeResult {
    /// Language tag.
    pub language: Language,
    /// Final, resolved detections.
    // NOTE(review): ordering of this list is not specified here — confirm whether callers may rely on it.
    pub entities: Vec<Detection>,
    /// Capability flags for this analysis.
    pub capabilities: Capabilities,
}

/// An individual anonymization operation applied to a span.
///
/// Pairs the detection that was acted on with the text that replaced it.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AnonymizedItem {
    /// Entity detection that was anonymized.
    // NOTE(review): presumably its offsets refer to the original input, not the anonymized text — confirm.
    pub entity: Detection,
    /// Replacement text.
    pub replacement: String,
}

/// Anonymization output with transformed text and itemized changes.
///
/// `text` is the full output after replacements; `items` lists the individual
/// replacements that were made.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AnonymizeResult {
    /// Redacted/anonymized text.
    pub text: String,
    /// Itemized replacements.
    // NOTE(review): ordering of this list is not specified here — confirm whether callers may rely on it.
    pub items: Vec<AnonymizedItem>,
}