pii-vault 0.2.0

Presidio-compatible PII detection, anonymization, and reversible tokenization
Documentation
use crate::entity::RecognizerResult;
use crate::vault::Vault;
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum Operator {
    Replace { new_value: String },
    Mask { masking_char: char, chars_to_mask: usize, from_end: bool },
    Hash { hash_type: HashType },
    Redact,
    Vault,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HashType {
    Fnv,
}

impl Default for Operator {
    fn default() -> Self {
        Operator::Replace {
            new_value: String::new(),
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnonymizedItem {
    pub entity_type: String,
    pub start: usize,
    pub end: usize,
    pub original: String,
    pub replacement: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnonymizedResult {
    pub text: String,
    pub items: Vec<AnonymizedItem>,
}

pub struct Anonymizer;

impl Anonymizer {
    pub fn anonymize(
        text: &str,
        entities: &[RecognizerResult],
        operators: &std::collections::HashMap<String, Operator>,
        default_operator: &Operator,
        mut vault: Option<&mut Vault>,
    ) -> AnonymizedResult {
        let mut sorted = entities.to_vec();
        sorted.sort_by(|a, b| b.start.cmp(&a.start));

        let mut result = text.to_string();
        let mut items = Vec::new();

        for entity in &sorted {
            let original = &text[entity.start..entity.end];
            let op = operators
                .get(entity.entity_type.as_str())
                .unwrap_or(default_operator);

            let replacement = match op {
                Operator::Replace { new_value } => {
                    if new_value.is_empty() {
                        format!("<{}>", entity.entity_type)
                    } else {
                        new_value.clone()
                    }
                }
                Operator::Mask { masking_char, chars_to_mask, from_end } => {
                    mask_text(original, *masking_char, *chars_to_mask, *from_end)
                }
                Operator::Hash { hash_type } => {
                    hash_text(original, hash_type)
                }
                Operator::Redact => String::new(),
                Operator::Vault => {
                    if let Some(ref mut v) = { vault.as_deref_mut() } {
                        v.tokenize(entity.entity_type.as_str(), original)
                    } else {
                        format!("<{}>", entity.entity_type)
                    }
                }
            };

            items.push(AnonymizedItem {
                entity_type: entity.entity_type.to_string(),
                start: entity.start,
                end: entity.end,
                original: original.to_string(),
                replacement: replacement.clone(),
            });

            result.replace_range(entity.start..entity.end, &replacement);
        }

        items.reverse();

        AnonymizedResult { text: result, items }
    }
}

fn mask_text(text: &str, mask_char: char, chars_to_mask: usize, from_end: bool) -> String {
    let chars: Vec<char> = text.chars().collect();
    let n = chars.len();
    let to_mask = chars_to_mask.min(n);
    let mut result = chars.clone();

    if from_end {
        for c in result.iter_mut().rev().take(to_mask) {
            *c = mask_char;
        }
    } else {
        for c in result.iter_mut().take(to_mask) {
            *c = mask_char;
        }
    }
    result.into_iter().collect()
}

fn hash_text(text: &str, _hash_type: &HashType) -> String {
    let hash = text.bytes().fold(2166136261u32, |h, b| {
        (h ^ b as u32).wrapping_mul(16777619)
    });
    format!("{:08x}", hash)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::entity::EntityType;
    use std::collections::HashMap;

    fn make_entity(entity_type: &str, start: usize, end: usize) -> RecognizerResult {
        RecognizerResult {
            entity_type: EntityType::new(entity_type),
            start,
            end,
            score: 0.9,
            recognizer_name: None,
        }
    }

    #[test]
    fn test_anonymize_replace_default() {
        let text = "Email me at test@example.com";
        let entities = vec![make_entity("EMAIL_ADDRESS", 12, 28)];
        let result = Anonymizer::anonymize(text, &entities, &HashMap::new(), &Operator::default(), None);
        assert_eq!(result.text, "Email me at <EMAIL_ADDRESS>");
        assert_eq!(result.items.len(), 1);
    }

    #[test]
    fn test_anonymize_replace_custom() {
        let text = "SSN: 123-45-6789";
        let entities = vec![make_entity("US_SSN", 5, 16)];
        let mut ops = HashMap::new();
        ops.insert("US_SSN".to_string(), Operator::Replace { new_value: "[REDACTED]".to_string() });
        let result = Anonymizer::anonymize(text, &entities, &ops, &Operator::default(), None);
        assert_eq!(result.text, "SSN: [REDACTED]");
    }

    #[test]
    fn test_anonymize_mask() {
        let text = "Card: 4111111111111111";
        let entities = vec![make_entity("CREDIT_CARD", 6, 22)];
        let mut ops = HashMap::new();
        ops.insert("CREDIT_CARD".to_string(), Operator::Mask {
            masking_char: '*',
            chars_to_mask: 12,
            from_end: false,
        });
        let result = Anonymizer::anonymize(text, &entities, &ops, &Operator::default(), None);
        assert_eq!(result.text, "Card: ************1111");
    }

    #[test]
    fn test_anonymize_hash() {
        let text = "Email: test@example.com";
        let entities = vec![make_entity("EMAIL_ADDRESS", 7, 23)];
        let mut ops = HashMap::new();
        ops.insert("EMAIL_ADDRESS".to_string(), Operator::Hash { hash_type: HashType::Fnv });
        let result = Anonymizer::anonymize(text, &entities, &ops, &Operator::default(), None);
        assert!(result.text.starts_with("Email: "));
        assert_ne!(result.text, text);
    }

    #[test]
    fn test_anonymize_redact() {
        let text = "My phone is 555-123-4567";
        let entities = vec![make_entity("PHONE_NUMBER", 12, 24)];
        let mut ops = HashMap::new();
        ops.insert("PHONE_NUMBER".to_string(), Operator::Redact);
        let result = Anonymizer::anonymize(text, &entities, &ops, &Operator::default(), None);
        assert_eq!(result.text, "My phone is ");
    }

    #[test]
    fn test_anonymize_vault() {
        let text = "Email: test@example.com";
        let entities = vec![make_entity("EMAIL_ADDRESS", 7, 23)];
        let mut ops = HashMap::new();
        ops.insert("EMAIL_ADDRESS".to_string(), Operator::Vault);
        let mut vault = crate::vault::Vault::new();
        let result = Anonymizer::anonymize(text, &entities, &ops, &Operator::default(), Some(&mut vault));
        assert!(result.text.contains("[EMAIL_ADDRESS:"));
        assert_eq!(vault.entry_count(), 1);

        let restored = vault.detokenize(&result.text);
        assert_eq!(restored, text);
    }

    #[test]
    fn test_anonymize_multiple_entities() {
        let text = "alice@test.com called 555-123-4567";
        let entities = vec![
            make_entity("EMAIL_ADDRESS", 0, 14),
            make_entity("PHONE_NUMBER", 22, 34),
        ];
        let result = Anonymizer::anonymize(text, &entities, &HashMap::new(), &Operator::default(), None);
        assert_eq!(result.text, "<EMAIL_ADDRESS> called <PHONE_NUMBER>");
    }
}