pii 0.1.0

PII detection and anonymization with deterministic, capability-aware NLP pipelines.
Documentation
//! Anonymization operators and helpers.
//!
//! This module turns detections into transformed output text. It does not
//! attempt to detect PII on its own; it only applies operators to spans
//! produced by the analyzer. Operators are deterministic, and replacements
//! are applied from the end of the string back to the beginning so that the
//! byte offsets of spans not yet processed remain valid.
//!
//! Use this module when you need:
//! - full redaction of detected spans
//! - masking with suffix preservation (e.g., last 4 digits)
//! - replacement with domain-specific placeholders
//! - hashing with a caller-provided salt for auditability
//!
//! You can configure per-entity behavior through `AnonymizeConfig`.

use crate::error::{PiiError, PiiResult};
use crate::types::{AnonymizeResult, AnonymizedItem, Detection};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::HashMap;

#[derive(Clone, Debug, Serialize, Deserialize)]
/// Transformation applied to a detected span to produce its replacement text.
///
/// Every variant maps the original span text to the string that is
/// substituted for it in the anonymized output.
pub enum Operator {
    /// Replace the span with the fixed token `<REDACTED>`.
    Redact,
    /// Mask the span with `ch`, keeping the last `from_end` characters as-is.
    ///
    /// `from_end` counts `char`s (not bytes). When it is greater than or
    /// equal to the span length, the span is left entirely unmasked.
    Mask { ch: char, from_end: usize },
    /// Replace the span with the caller-provided string `with`.
    Replace { with: String },
    /// Replace the span with the hex-encoded SHA-256 digest of `salt`
    /// followed by the span text.
    HashSha256 { salt: String },
}

#[derive(Clone, Debug, Serialize, Deserialize)]
/// Per-entity anonymization configuration.
///
/// For each detection the anonymizer looks up the entity type in
/// `per_entity` and falls back to `default` when no entry is present.
pub struct AnonymizeConfig {
    /// Operator used when no per-entity override is present.
    pub default: Operator,
    /// Per-entity overrides keyed by `EntityType::as_str()`.
    pub per_entity: HashMap<String, Operator>,
}

impl Default for AnonymizeConfig {
    fn default() -> Self {
        Self {
            default: Operator::Redact,
            per_entity: HashMap::new(),
        }
    }
}

/// Applies anonymization operators to detected spans.
///
/// This is a unit struct that holds no state; its functionality is exposed
/// through the associated function `Anonymizer::anonymize`.
pub struct Anonymizer;

impl Anonymizer {
    /// Applies anonymization operators to the detected spans of `text`.
    ///
    /// Each detection is replaced by the output of the operator registered
    /// for its entity type in `config.per_entity`, falling back to
    /// `config.default`. Spans are rewritten from the end of the text towards
    /// the beginning so the byte offsets of spans still to be processed stay
    /// valid.
    ///
    /// The returned items are ordered by ascending start offset; each one
    /// carries the original detection (with offsets into the input `text`)
    /// together with the replacement that was written for it.
    ///
    /// # Errors
    ///
    /// Returns `PiiError::Anonymizer` when:
    /// - a span has `start > end`, reaches past the end of `text`, or does
    ///   not start and end on UTF-8 character boundaries;
    /// - two spans overlap (including exact duplicates), because rewriting
    ///   one would invalidate the offsets of the other.
    pub fn anonymize(
        text: &str,
        detections: &[Detection],
        config: &AnonymizeConfig,
    ) -> PiiResult<AnonymizeResult> {
        let mut ordered = detections.to_vec();
        // Latest span first. For equal starts the longer span goes first so
        // that an empty span sharing its start is handled deterministically
        // regardless of the caller's input order.
        ordered.sort_by(|a, b| b.start.cmp(&a.start).then_with(|| b.end.cmp(&a.end)));

        let mut output = text.to_string();
        let mut items = Vec::with_capacity(ordered.len());
        // Start offset of the most recently rewritten span. Everything before
        // this offset in `output` is still byte-identical to `text`, so the
        // original offsets can be used on `output` directly.
        let mut untouched_end = text.len();

        for detection in ordered {
            let start = detection.start;
            let end = detection.end;
            // `str::get` rejects reversed ranges, out-of-bounds ends and
            // offsets inside a multi-byte character instead of panicking.
            let span = text
                .get(start..end)
                .ok_or_else(|| PiiError::Anonymizer("invalid offsets".to_string()))?;
            if end > untouched_end {
                return Err(PiiError::Anonymizer("overlapping detections".to_string()));
            }
            let op = config
                .per_entity
                .get(&detection.entity_type.as_str())
                .unwrap_or(&config.default);
            let replacement = apply_operator(op, span);
            output.replace_range(start..end, &replacement);
            untouched_end = start;
            items.push(AnonymizedItem {
                // `ordered` is already an owned copy, so move instead of cloning.
                entity: detection,
                replacement,
            });
        }

        // Report items in reading order rather than processing order.
        items.sort_by(|a, b| a.entity.start.cmp(&b.entity.start));
        Ok(AnonymizeResult { text: output, items })
    }
}

/// Computes the replacement text for `span` under the given operator.
fn apply_operator(op: &Operator, span: &str) -> String {
    match *op {
        Operator::Redact => String::from("<REDACTED>"),
        Operator::Mask { ch, from_end } => mask(span, ch, from_end),
        Operator::Replace { ref with } => with.to_owned(),
        Operator::HashSha256 { ref salt } => hash(span, salt),
    }
}

/// Replaces every character of `value` with `ch` except the trailing
/// `from_end` characters, which are copied through unchanged.
///
/// Lengths are measured in `char`s, not bytes. If `from_end` is at least the
/// length of `value`, the input is returned unmasked; an empty input yields
/// an empty string.
fn mask(value: &str, ch: char, from_end: usize) -> String {
    let total = value.chars().count();
    // Number of leading characters to hide; zero when the kept suffix
    // covers the whole value.
    let hidden = total.saturating_sub(from_end);
    let mut masked: String = std::iter::repeat(ch).take(hidden).collect();
    masked.extend(value.chars().skip(hidden));
    masked
}

/// Returns the hex-encoded SHA-256 digest of `salt` followed by `value`.
///
/// The salt bytes are fed to the hasher first, then the value bytes, with no
/// separator between them.
fn hash(value: &str, salt: &str) -> String {
    let mut state = Sha256::new();
    for part in [salt, value] {
        state.update(part.as_bytes());
    }
    hex::encode(state.finalize())
}