// pii 0.1.0
//
// PII detection and anonymization with deterministic, capability-aware NLP pipelines.
// Documentation
use pii::analyzer::Analyzer;
use pii::config::PolicyConfig;
use pii::nlp::SimpleNlpEngine;
use pii::presets::default_recognizers;
use pii::types::{EntityType, Language};
use std::collections::HashSet;

/// Minimal linear congruential generator that gives the test a reproducible
/// stream of pseudo-random numbers without pulling in an external crate.
struct Lcg {
    /// Current 64-bit generator state; overwritten on every draw.
    state: u64,
}

impl Lcg {
    /// Creates a generator whose initial state is `seed`.
    fn new(seed: u64) -> Self {
        Lcg { state: seed }
    }

    /// Steps the generator once and returns the upper 32 bits of the new state.
    fn next_u32(&mut self) -> u32 {
        const MULTIPLIER: u64 = 6364136223846793005;
        const INCREMENT: u64 = 1;

        let advanced = self.state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
        self.state = advanced;
        // Only the high half of the state is handed out; the low half is dropped.
        (advanced >> 32) as u32
    }

    /// Returns the next draw reduced modulo `max`, i.e. a value in `0..max`.
    ///
    /// A `max` of zero yields `0` and leaves the generator state untouched.
    fn next_usize(&mut self, max: usize) -> usize {
        match max {
            0 => 0,
            bound => (self.next_u32() as usize) % bound,
        }
    }
}

/// A detection the test requires the analyzer to report: an entity type
/// together with the span of the literal that was planted in the text.
#[derive(Clone, Debug)]
struct Expected {
    /// Entity type the matching detection must carry.
    entity: EntityType,
    /// Byte offset in the text at which the planted literal begins.
    start: usize,
    /// Byte offset in the text just past the end of the planted literal.
    end: usize,
}

/// Builds eight pseudo-random texts, each with one email address, IP address,
/// credit card number and phone number planted at fixed word positions, and
/// checks that the analyzer reports every planted literal with its exact span.
#[test]
fn test_injection_generator() {
    let enabled = [
        EntityType::Email,
        EntityType::IpAddress,
        EntityType::CreditCard,
        EntityType::Phone,
    ];
    let analyzer = Analyzer::new(
        Box::new(SimpleNlpEngine::default()),
        default_recognizers(),
        Vec::new(),
        policy_for(&enabled),
    );
    // Fixed seed, so the filler words are the same on every run.
    let mut rng = Lcg::new(0xC0FFEE);

    for _ in 0..8 {
        let mut text = String::new();
        let mut expected: Vec<Expected> = Vec::new();

        for slot in 0..30 {
            // Plant a literal in front of selected filler words.
            match slot {
                5 => inject(&mut text, &mut expected, EntityType::Email, "user@example.com"),
                12 => inject(&mut text, &mut expected, EntityType::IpAddress, "10.0.0.5"),
                18 => inject(
                    &mut text,
                    &mut expected,
                    EntityType::CreditCard,
                    "4539 1488 0343 6467",
                ),
                24 => inject(&mut text, &mut expected, EntityType::Phone, "+1 415-555-1212"),
                _ => {}
            }
            text.push_str(&random_word(&mut rng));
            text.push(' ');
        }

        let report = analyzer.analyze(&text, &Language::from("en")).unwrap();
        for want in &expected {
            // A planted literal counts as found only if type and both offsets agree.
            let reported = report.entities.iter().any(|det| {
                det.entity_type == want.entity && det.start == want.start && det.end == want.end
            });
            assert!(
                reported,
                "missing {:?} at {}..{} in text {:?}",
                want.entity, want.start, want.end, text
            );
        }
    }
}

/// Returns the default policy with its enabled-entity set replaced by exactly
/// the given `entities`; repeated entries collapse into one.
fn policy_for(entities: &[EntityType]) -> PolicyConfig {
    let mut config = PolicyConfig::default();
    let enabled: HashSet<_> = entities.iter().cloned().collect();
    // Overwrite rather than extend: whatever the default enabled is discarded.
    config.enabled_entities = enabled;
    config
}

/// Appends `literal` followed by a single space to `text`, and records in
/// `expected` the byte range the literal occupies, tagged with `entity`.
fn inject(text: &mut String, expected: &mut Vec<Expected>, entity: EntityType, literal: &str) {
    // The literal begins wherever the text currently ends.
    let start = text.len();
    let end = start + literal.len();
    expected.push(Expected { entity, start, end });

    text.push_str(literal);
    // Separator is added after the span is fixed, so it is not part of the range.
    text.push(' ');
}

/// Produces a filler word of 3 to 8 lowercase ASCII letters drawn from `rng`.
///
/// The length is drawn first and then one letter per position, so a call
/// consumes one more draw than the length of the word it returns.
fn random_word(rng: &mut Lcg) -> String {
    let len = rng.next_usize(6) + 3;
    let mut word = String::with_capacity(len);
    word.extend((0..len).map(|_| {
        let offset = rng.next_usize(26) as u8;
        char::from(b'a' + offset)
    }));
    word
}