pii 0.1.0

PII detection and anonymization with deterministic, capability-aware NLP pipelines.
Documentation
//! Built-in recognizer presets for common PII types.
//!
//! This module defines a curated default set of recognizers that provide
//! broad coverage for identifiers like emails, IPs, payment numbers, and
//! account-related patterns. These are meant as a safe baseline; you can
//! remove or replace recognizers for stricter or domain-specific policies.

use crate::recognizers::regex::RegexRecognizer;
use crate::recognizers::validator::{
    iban_check, imei_check, itin_check, luhn_check, routing_check, ssn_check, tax_id_check,
    ValidatorRecognizer,
};
use crate::recognizers::Recognizer;
use crate::types::EntityType;
use std::sync::Arc;

/// Returns the default set of built-in recognizers.
///
/// The list is assembled in a fixed order: the network/contact regex
/// recognizers first, then the validator-backed recognizers (each constructed
/// with a candidate pattern plus a check function such as `luhn_check`), and
/// finally the keyword- and format-based regex recognizers.
///
/// Every recognizer is constructed with a name, an [`EntityType`], a pattern,
/// a numeric score, and a short label. A recognizer whose constructor returns
/// `Err` is skipped without being reported, so the returned vector may hold
/// fewer entries than are listed here.
pub fn default_recognizers() -> Vec<Box<dyn Recognizer>> {
    /// Boxes `built` and appends it to `out` when construction succeeded;
    /// a failed construction is dropped, keeping the preset best-effort.
    fn push_if_ok<R, E>(out: &mut Vec<Box<dyn Recognizer>>, built: Result<R, E>)
    where
        R: Recognizer + 'static,
    {
        if let Ok(recognizer) = built {
            out.push(Box::new(recognizer));
        }
    }

    let mut recognizers: Vec<Box<dyn Recognizer>> = Vec::new();

    // --- Contact and network identifiers (pattern only) ---

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_email",
            EntityType::Email,
            r"(?i)[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}",
            0.8,
            "email",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_url",
            EntityType::Url,
            r"(?i)\bhttps?://[A-Z0-9.-]+\.[A-Z]{2,}(?:/[^\s]*)?",
            0.7,
            "url",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_domain",
            EntityType::Domain,
            r"(?i)\b(?:[A-Z0-9-]+\.)+[A-Z]{2,}\b",
            0.5,
            "domain",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_hostname",
            EntityType::Hostname,
            r"(?i)\b[a-z0-9][a-z0-9-]*(?:\.[a-z0-9][a-z0-9-]*)+\b",
            0.5,
            "hostname",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_phone",
            EntityType::Phone,
            r"\+?[0-9][0-9\s\-()]{7,}[0-9]",
            0.6,
            "phone",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_ip",
            EntityType::IpAddress,
            r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
            0.7,
            "ipv4",
        ),
    );

    // Only the full, uncompressed eight-group form is covered by this pattern.
    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_ipv6",
            EntityType::Ipv6,
            r"\b(?:[0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}\b",
            0.7,
            "ipv6",
        ),
    );

    // --- Identifiers built with a pattern plus a check function ---

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_credit_card",
            EntityType::CreditCard,
            r"(?:\d[ -]*?){13,19}",
            0.9,
            "luhn",
            Arc::new(luhn_check),
        ),
    );

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_iban",
            EntityType::Iban,
            r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b",
            0.8,
            "iban",
            Arc::new(iban_check),
        ),
    );

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_ssn",
            EntityType::Ssn,
            r"\b\d{3}[- ]?\d{2}[- ]?\d{4}\b",
            0.8,
            "ssn",
            Arc::new(ssn_check),
        ),
    );

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_itin",
            EntityType::Itin,
            r"\b9\d{2}[- ]?\d{2}[- ]?\d{4}\b",
            0.8,
            "itin",
            Arc::new(itin_check),
        ),
    );

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_tax_id",
            EntityType::TaxId,
            r"\b\d{2}-\d{7}\b",
            0.75,
            "tax_id",
            Arc::new(tax_id_check),
        ),
    );

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_routing_number",
            EntityType::RoutingNumber,
            r"\b\d{9}\b",
            0.8,
            "routing_number",
            Arc::new(routing_check),
        ),
    );

    push_if_ok(
        &mut recognizers,
        ValidatorRecognizer::new(
            "validator_imei",
            EntityType::Imei,
            r"\b\d{15}\b",
            0.75,
            "imei",
            Arc::new(imei_check),
        ),
    );

    // --- Keyword-anchored and fixed-format identifiers (pattern only) ---

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_passport",
            EntityType::Passport,
            r"(?i)\b(?:passport|pass)\s*(?:no\.?|number)?\s*[:#-]?\s*[A-Z0-9]{6,9}\b",
            0.6,
            "passport",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_driver_license",
            EntityType::DriverLicense,
            r"(?i)\b(?:driver(?:'s)?\s*license|dl)\s*[:#-]?\s*[A-Z0-9]{4,12}\b",
            0.6,
            "driver_license",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_bank_account",
            EntityType::BankAccount,
            r"(?i)\b(?:account|acct)\s*(?:number|no\.?)?\s*[:#-]?\s*\d{6,17}\b",
            0.5,
            "bank_account",
        ),
    );

    // Case-insensitivity is scoped to the `0x…` and `bc1…` alternatives only.
    // Applying `(?i)` to the whole pattern would case-fold the third
    // alternative's character class and re-admit `I`, `O` and `l`, the very
    // characters that class leaves out, so nearly any `1`/`3`-prefixed
    // alphanumeric run would match.
    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_crypto_address",
            EntityType::CryptoAddress,
            r"\b(?:(?i:0x[a-f0-9]{40}|bc1[0-9a-z]{25,71})|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\b",
            0.7,
            "crypto_address",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_mac_address",
            EntityType::MacAddress,
            r"\b(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b",
            0.7,
            "mac_address",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_uuid",
            EntityType::Uuid,
            r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b",
            0.7,
            "uuid",
        ),
    );

    push_if_ok(
        &mut recognizers,
        RegexRecognizer::new(
            "regex_vin",
            EntityType::Vin,
            r"\b[A-HJ-NPR-Z0-9]{17}\b",
            0.6,
            "vin",
        ),
    );

    recognizers
}