pii 0.1.0

PII detection and anonymization with deterministic, capability-aware NLP pipelines.
Documentation
//! Regex + validator-based recognizers for structured identifiers.
//!
//! Validator recognizers are appropriate when a regex match is necessary
//! but not sufficient (e.g., credit card numbers, routing numbers, IBAN).
//! The validator predicate confirms correctness and ensures deterministic
//! acceptance rules across environments.

use crate::recognizers::Recognizer;
use crate::types::{Detection, DetectionExplanation, EntityType, NlpArtifacts};
use regex::Regex;
use std::sync::Arc;

/// Regex-based recognizer that validates matches with a custom predicate.
///
/// A regex proposes candidate spans; each candidate's text is then passed to
/// the validator, and only candidates the validator accepts are reported.
#[derive(Clone)]
pub struct ValidatorRecognizer {
    /// Recognizer name; returned by `name()` and copied into each detection.
    name: String,
    /// The single entity type this recognizer reports.
    entity: EntityType,
    /// Compiled pattern used to find candidate spans in the input text.
    regex: Regex,
    /// Score assigned to every detection that passes validation.
    score: f32,
    /// Predicate run on the matched text; `true` means the match is accepted.
    /// Held behind `Arc` so cloning the recognizer shares the same closure.
    validator: Arc<dyn Fn(&str) -> bool + Send + Sync>,
    /// Label for the validator, recorded in each detection's explanation.
    validator_name: String,
}

impl ValidatorRecognizer {
    /// Builds a recognizer that pairs a regex with a validation predicate.
    ///
    /// * `name` - recognizer name reported on detections.
    /// * `entity` - entity type emitted for accepted matches.
    /// * `pattern` - regex source used to find candidate spans.
    /// * `score` - score attached to every accepted match.
    /// * `validator_name` - label stored in the detection explanation.
    /// * `validator` - predicate invoked with the matched text.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` produced when `pattern` fails to compile.
    pub fn new(
        name: impl Into<String>,
        entity: EntityType,
        pattern: &str,
        score: f32,
        validator_name: impl Into<String>,
        validator: Arc<dyn Fn(&str) -> bool + Send + Sync>,
    ) -> Result<Self, regex::Error> {
        let name = name.into();
        // Compilation is the only fallible step; bail out before building.
        let regex = Regex::new(pattern)?;
        let validator_name = validator_name.into();

        let recognizer = Self {
            name,
            entity,
            regex,
            score,
            validator,
            validator_name,
        };
        Ok(recognizer)
    }
}

impl Recognizer for ValidatorRecognizer {
    /// Returns the name this recognizer was constructed with.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Returns a one-element slice holding the configured entity type.
    fn supported_entities(&self) -> &[EntityType] {
        std::slice::from_ref(&self.entity)
    }

    /// Scans `text` with the regex and keeps only matches the validator accepts.
    ///
    /// The NLP artifacts are not consulted. Detections are returned in match
    /// order, with byte offsets taken from the regex match.
    fn analyze(&self, text: &str, _artifacts: &NlpArtifacts) -> Vec<Detection> {
        let mut detections = Vec::new();
        for candidate in self.regex.find_iter(text) {
            if !(self.validator)(candidate.as_str()) {
                continue;
            }
            detections.push(Detection {
                entity_type: self.entity.clone(),
                start: candidate.start(),
                end: candidate.end(),
                score: self.score,
                recognizer: self.name.clone(),
                explanation: DetectionExplanation::Validator {
                    validator: self.validator_name.clone(),
                    // Rejected candidates were skipped above, so this is always true.
                    passed: true,
                },
            });
        }
        detections
    }
}

/// Checks the digits of `value` against the Luhn (mod-10) checksum.
///
/// Non-digit characters are ignored. Inputs with fewer than 12 digits are
/// rejected without computing the checksum.
pub fn luhn_check(value: &str) -> bool {
    let digits = strip_digits(value);
    if digits.len() < 12 {
        return false;
    }
    // Counting from the rightmost digit (offset 0), every odd offset is
    // doubled, and a two-digit product is reduced by 9.
    let total: u32 = digits
        .iter()
        .rev()
        .enumerate()
        .map(|(offset, &digit)| {
            if offset % 2 == 0 {
                digit
            } else if digit * 2 > 9 {
                digit * 2 - 9
            } else {
                digit * 2
            }
        })
        .sum();
    total % 10 == 0
}

/// Validates an IBAN by structure and by the mod-97 checksum.
///
/// Everything except ASCII letters and digits is ignored, and letters are
/// compared case-insensitively, so grouped forms such as
/// `"GB82 WEST 1234 5698 7654 32"` are accepted.
///
/// Returns `true` only when all of the following hold:
/// * the cleaned value is 15 to 34 characters long,
/// * it starts with a two-letter country code followed by two check digits,
/// * moving those four characters to the end, mapping letters `A..=Z` to
///   `10..=35`, and reading the result as a decimal number leaves a
///   remainder of 1 when divided by 97.
pub fn iban_check(value: &str) -> bool {
    // Non-ASCII bytes are never alphanumeric, so filtering bytes is
    // equivalent to filtering chars here.
    let cleaned: Vec<u8> = value
        .bytes()
        .filter(u8::is_ascii_alphanumeric)
        .map(|byte| byte.to_ascii_uppercase())
        .collect();
    if !(15..=34).contains(&cleaned.len()) {
        return false;
    }

    let (prefix, bban) = cleaned.split_at(4);
    let (country, check_digits) = prefix.split_at(2);
    // Without this shape check any digit string whose rearranged value is
    // congruent to 1 mod 97 would be reported as an IBAN.
    if !country.iter().all(u8::is_ascii_uppercase)
        || !check_digits.iter().all(u8::is_ascii_digit)
    {
        return false;
    }

    // Reduce modulo 97 as we go so the running value stays below 97 and the
    // arithmetic cannot overflow a u32.
    let remainder = bban.iter().chain(prefix).fold(0u32, |acc, &byte| {
        if byte.is_ascii_digit() {
            (acc * 10 + u32::from(byte - b'0')) % 97
        } else {
            // A letter expands to two decimal digits (10..=35).
            (acc * 100 + u32::from(byte - b'A') + 10) % 97
        }
    });
    remainder == 1
}

/// Checks a US routing number against its weighted mod-10 checksum.
///
/// Non-digit characters are ignored; exactly nine digits must remain. The
/// digits are weighted 3, 7, 1 repeating, and the weighted sum must be a
/// multiple of ten.
pub fn routing_check(value: &str) -> bool {
    const WEIGHT_CYCLE: [u32; 3] = [3, 7, 1];

    let digits = strip_digits(value);
    if digits.len() != 9 {
        return false;
    }
    let mut checksum = 0u32;
    for (digit, weight) in digits.iter().zip(WEIGHT_CYCLE.iter().cycle()) {
        checksum += digit * weight;
    }
    checksum % 10 == 0
}

/// Checks the area/group/serial structure of a US Social Security number.
///
/// Non-digit characters are ignored; exactly nine digits must remain. The
/// value is rejected when the area (first three digits) is 000, 666 or
/// 900 and above, when the group (next two digits) is 00, or when the
/// serial (last four digits) is 0000.
pub fn ssn_check(value: &str) -> bool {
    let digits = strip_digits(value);
    if digits.len() != 9 {
        return false;
    }
    // Reads a run of decimal digits as a single number.
    let as_number = |part: &[u32]| part.iter().fold(0u32, |acc, &digit| acc * 10 + digit);

    let area = as_number(&digits[..3]);
    let group = as_number(&digits[3..5]);
    let serial = as_number(&digits[5..]);

    let area_ok = area != 0 && area != 666 && area < 900;
    area_ok && group != 0 && serial != 0
}

/// Validates ITIN structure (`9xx-xx-xxxx` with an allowed middle group).
///
/// Non-digit characters are ignored; exactly nine digits must remain. The
/// first digit must be 9, and the fourth and fifth digits taken together
/// must fall in one of the ranges the IRS assigns to ITINs: 50–65, 70–88,
/// 90–92 or 94–99.
pub fn itin_check(value: &str) -> bool {
    let digits = strip_digits(value);
    if digits.len() != 9 || digits[0] != 9 {
        return false;
    }
    let group = digits[3] * 10 + digits[4];
    // 50..=65 is part of the IRS-published range and must not be rejected.
    matches!(group, 50..=65 | 70..=88 | 90..=92 | 94..=99)
}

/// Checks the basic shape of an EIN/tax ID.
///
/// Non-digit characters are ignored; exactly nine digits must remain and
/// the two-digit prefix must not be `00`.
pub fn tax_id_check(value: &str) -> bool {
    let digits = strip_digits(value);
    // The prefix is zero only when both leading digits are zero.
    digits.len() == 9 && (digits[0] != 0 || digits[1] != 0)
}

/// Checks an IMEI: exactly 15 digits that also satisfy the Luhn checksum.
///
/// Non-digit characters are ignored when counting digits, and the Luhn
/// check is run on the original `value`.
pub fn imei_check(value: &str) -> bool {
    strip_digits(value).len() == 15 && luhn_check(value)
}

/// Collects the ASCII decimal digits of `value`, in order, as numbers.
///
/// Every other character (separators, letters, non-ASCII digits) is dropped.
fn strip_digits(value: &str) -> Vec<u32> {
    let mut digits = Vec::new();
    for ch in value.chars() {
        if let '0'..='9' = ch {
            digits.push(ch as u32 - '0' as u32);
        }
    }
    digits
}