// pii 0.1.0
//
// PII detection and anonymization with deterministic, capability-aware NLP pipelines.
// Documentation
use pii::anonymize::{AnonymizeConfig, Anonymizer, Operator};
use pii::decision::resolve;
use pii::nlp::{NlpEngine, SimpleNlpEngine};
use pii::presets::default_recognizers;
use pii::recognizers::dictionary::DictionaryRecognizer;
use pii::recognizers::validator::{imei_check, itin_check, luhn_check, routing_check, ssn_check, tax_id_check};
use pii::{Analyzer, PolicyConfig};
use pii::types::{Detection, DetectionExplanation, EntityType, Language};
use std::collections::HashMap;

/// Checks that each token produced by `SimpleNlpEngine` carries `start`/`end`
/// offsets that slice the original input back to the token's own text.
#[test]
fn test_token_offsets_roundtrip() {
    let text = "Hello Jose.";
    let language = Language::from("en");
    let engine = SimpleNlpEngine::default();
    let artifacts = engine.analyze(text, &language).unwrap();
    artifacts.tokens.into_iter().for_each(|token| {
        // Offsets index straight into `text`, so the slice must equal the token text.
        assert_eq!(token.text, &text[token.start..token.end]);
    });
}

/// Exercises `luhn_check` with a space-separated 16-digit number it must accept
/// and a copy whose final digit is changed, which it must reject.
#[test]
fn test_luhn_check() {
    for (candidate, expected) in [
        ("4539 1488 0343 6467", true),
        ("4539 1488 0343 6468", false),
    ] {
        assert_eq!(luhn_check(candidate), expected);
    }
}

/// Exercises `routing_check` with a nine-digit number it must accept and a
/// copy whose final digit is changed, which it must reject.
#[test]
fn test_routing_check() {
    for (candidate, expected) in [("021000021", true), ("021000022", false)] {
        assert_eq!(routing_check(candidate), expected);
    }
}

/// Exercises `ssn_check`: a dash-formatted value must be accepted, while one
/// whose leading group is `000` must be rejected.
#[test]
fn test_ssn_check() {
    for (candidate, expected) in [("123-45-6789", true), ("000-12-3456", false)] {
        assert_eq!(ssn_check(candidate), expected);
    }
}

/// Exercises `itin_check` with two values that differ only in the middle
/// group: `70` must be accepted and `69` must be rejected.
#[test]
fn test_itin_check() {
    for (candidate, expected) in [("912-70-1234", true), ("912-69-1234", false)] {
        assert_eq!(itin_check(candidate), expected);
    }
}

/// Exercises `tax_id_check`: a `NN-NNNNNNN` value must be accepted, while one
/// with a `00` prefix must be rejected.
#[test]
fn test_tax_id_check() {
    for (candidate, expected) in [("12-3456789", true), ("00-1234567", false)] {
        assert_eq!(tax_id_check(candidate), expected);
    }
}

/// Exercises `imei_check` with a 15-digit number it must accept and a copy
/// whose final digit is changed, which it must reject.
#[test]
fn test_imei_check() {
    for (candidate, expected) in [
        ("490154203237518", true),
        ("490154203237517", false),
    ] {
        assert_eq!(imei_check(candidate), expected);
    }
}

/// Two overlapping email detections (spans 0..10 and 5..12) with different
/// scores must resolve to a single detection: the higher-scoring one.
#[test]
fn test_overlap_resolution_prefers_score() {
    // Both fixtures share everything except span, score and recognizer name.
    let regex_email = |start, end, score, recognizer: &str| Detection {
        entity_type: EntityType::Email,
        start,
        end,
        score,
        recognizer: String::from(recognizer),
        explanation: DetectionExplanation::Regex {
            pattern_name: String::from("email"),
        },
    };
    let weaker = regex_email(0, 10, 0.6, "a");
    let stronger = regex_email(5, 12, 0.9, "b");

    // The callback returns 0.0 for every input (presumably a per-entity
    // priority — confirm against `resolve`), so only the scores can decide.
    let resolved = resolve(vec![weaker, stronger], &|_| 0.0);

    assert_eq!(resolved.len(), 1);
    let winner = &resolved[0];
    assert_eq!(winner.recognizer, "b");
}

/// Two overlapping email detections (spans 0..10 and 2..12) with the same
/// score must resolve to a single detection: the one explained by a passed
/// validator rather than the regex-only one.
#[test]
fn test_overlap_resolution_prefers_validator_on_tie() {
    // Both fixtures share entity type and score; only span, recognizer name
    // and explanation differ.
    let tied_email = |start, end, recognizer: &str, explanation| Detection {
        entity_type: EntityType::Email,
        start,
        end,
        score: 0.8,
        recognizer: String::from(recognizer),
        explanation,
    };
    let from_regex = tied_email(
        0,
        10,
        "regex",
        DetectionExplanation::Regex {
            pattern_name: String::from("email"),
        },
    );
    let from_validator = tied_email(
        2,
        12,
        "validator",
        DetectionExplanation::Validator {
            validator: String::from("email"),
            passed: true,
        },
    );

    // The callback returns 0.0 for every input (presumably a per-entity
    // priority — confirm against `resolve`), so it cannot break the tie.
    let resolved = resolve(vec![from_regex, from_validator], &|_| 0.0);

    assert_eq!(resolved.len(), 1);
    let winner = &resolved[0];
    assert_eq!(winner.recognizer, "validator");
}

/// Applies `Operator::Mask { ch: '*', from_end: 4 }` to a phone detection that
/// covers the whole ten-character input and expects the first six characters
/// to become `*` while the last four are left as they were.
#[test]
fn test_anonymizer_mask() {
    let input = "1234567890";
    let whole_input = Detection {
        entity_type: EntityType::Phone,
        start: 0,
        end: input.len(),
        score: 0.9,
        recognizer: String::from("r"),
        explanation: DetectionExplanation::Regex {
            pattern_name: String::from("phone"),
        },
    };

    // Replace the per-entity operator table wholesale so that only the phone
    // mask configured here is present.
    let mut config = AnonymizeConfig::default();
    config.per_entity = HashMap::from([(
        EntityType::Phone.as_str(),
        Operator::Mask { ch: '*', from_end: 4 },
    )]);

    let anonymized = Anonymizer::anonymize(input, &[whole_input], &config).unwrap();
    assert_eq!(anonymized.text, "******7890");
}

/// Verifies that when `PolicyConfig::enabled_entities` contains only `Email`,
/// the analyzer built from `default_recognizers()` reports nothing but `Email`
/// detections, even though the sample text also contains a phone number.
#[test]
fn test_policy_filters_entities() {
    let mut policy = PolicyConfig::default();
    policy.enabled_entities.insert(EntityType::Email);
    let analyzer = Analyzer::new(
        Box::new(SimpleNlpEngine::default()),
        default_recognizers(),
        Vec::new(),
        policy,
    );
    let text = "Email me at jane@example.com or call +1 212-555-0909.";
    let result = analyzer.analyze(text, &Language::from("en")).unwrap();

    // Guard against a vacuous pass: `all` is true for an empty iterator, so an
    // analyzer that detected nothing would otherwise satisfy the check below.
    assert!(
        !result.entities.is_empty(),
        "expected at least one detection for the email address in the sample text"
    );
    assert!(result
        .entities
        .iter()
        .all(|det| det.entity_type == EntityType::Email));
}

/// Runs an analyzer whose only recognizer is a `DictionaryRecognizer` for the
/// terms "Alice" and "Bob", and expects exactly two detections whose spans
/// slice out "Alice" and "Bob" from the input, in that order.
#[test]
fn test_dictionary_recognizer_detects_terms() {
    let terms = ["Alice".to_string(), "Bob".to_string()];
    let recognizer = DictionaryRecognizer::new(
        "dict_names",
        EntityType::Person,
        &terms,
        0.75,
        "names",
        true,
    );
    let analyzer = Analyzer::new(
        Box::new(SimpleNlpEngine::default()),
        vec![Box::new(recognizer)],
        Vec::new(),
        PolicyConfig::default(),
    );

    let text = "Alice and Bob met.";
    let language = Language::from("en");
    let result = analyzer.analyze(text, &language).unwrap();

    // Map each detection back to the text it covers; comparing the whole list
    // checks both the number of detections and their order at once.
    let matched: Vec<&str> = result
        .entities
        .iter()
        .map(|det| &text[det.start..det.end])
        .collect();
    assert_eq!(matched, ["Alice", "Bob"]);
}