// pii 0.1.0
//
// PII detection and anonymization with deterministic, capability-aware NLP pipelines.
// Documentation
//! Context-aware score boosting for ambiguous detections.
//!
//! Context enhancers look at nearby tokens to increase confidence for matches
//! that are ambiguous on their own (e.g., "May" as a name vs a month). The
//! `LemmaContextEnhancer` uses lemma forms when available and falls back to
//! surface tokens when lemma data is missing.
//!
//! Context is optional. You can omit enhancers entirely for strict pattern-
//! only detection, or add your own enhancer to implement domain-specific
//! heuristics.

use crate::profile::ContextTerms;
use crate::types::{Detection, DetectionExplanation, EntityType, NlpArtifacts};
use std::collections::{HashMap, HashSet};

/// Enhances detections by inspecting nearby tokens.
///
/// Implementations adjust detection scores in place based on surrounding
/// context (e.g., boosting an ambiguous match when a related keyword appears
/// close by). Enhancers must be `Send + Sync` so a pipeline can be shared
/// across threads.
pub trait ContextEnhancer: Send + Sync {
    /// Applies in-place score adjustments to detections.
    ///
    /// `text` is the original input the detections refer to, and `artifacts`
    /// carries the tokenization (and optional lemma) data produced by the NLP
    /// pipeline. Implementations mutate `detections` directly and return
    /// nothing.
    fn enhance(&self, detections: &mut [Detection], text: &str, artifacts: &NlpArtifacts);
}

/// Lemma-aware context enhancer with surface-form fallback.
///
/// Holds per-entity [`ContextTerms`]; when the NLP pipeline reports lemma
/// capability, tokens are compared by lemma, otherwise by their lowercased
/// surface text. Entity types with no entry in the map are left untouched.
#[derive(Clone, Debug)]
pub struct LemmaContextEnhancer {
    // Per-entity context configuration: terms to look for, the token window
    // to search, and the score boost to apply on a match.
    context: HashMap<EntityType, ContextTerms>,
}

impl LemmaContextEnhancer {
    /// Creates a new enhancer with entity-specific context terms.
    pub fn new(context: HashMap<EntityType, ContextTerms>) -> Self {
        Self { context }
    }
}

impl ContextEnhancer for LemmaContextEnhancer {
    /// Boosts the score of each detection whose surrounding tokens contain
    /// configured context terms; detections for unconfigured entity types,
    /// or with no nearby terms, are left unchanged.
    fn enhance(&self, detections: &mut [Detection], _text: &str, artifacts: &NlpArtifacts) {
        let lemmas_available = artifacts.capabilities.lemma;
        for det in detections.iter_mut() {
            // Entity types without configured context terms are skipped.
            if let Some(terms) = self.context.get(&det.entity_type) {
                let matched =
                    find_context_terms(artifacts, det.start, det.end, terms, lemmas_available);
                if !matched.is_empty() {
                    let base = det.score;
                    let boost = terms.boost;
                    // Keep the boosted score inside [0, 1] no matter how
                    // large the configured boost is.
                    det.score = (base + boost).clamp(0.0, 1.0);
                    // Record how the score was derived so downstream
                    // consumers can explain the final confidence.
                    det.explanation = DetectionExplanation::ContextBoost {
                        base,
                        boost,
                        matched_terms: matched,
                    };
                }
            }
        }
    }
}

/// Finds matching context terms within a token window.
///
/// The window is anchored on the first token that overlaps the detection
/// span `[start, end)` and extends `terms.window_tokens` tokens to either
/// side. Comparison is case-insensitive; when `use_lemma` is set, a token's
/// lemma is preferred and its surface text is the fallback. Each configured
/// term is reported at most once, in window order. Returns an empty vector
/// when no token overlaps the span.
fn find_context_terms(
    artifacts: &NlpArtifacts,
    start: usize,
    end: usize,
    terms: &ContextTerms,
    use_lemma: bool,
) -> Vec<String> {
    let tokens = &artifacts.tokens;

    // Anchor on the first token overlapping the detection span.
    let anchor = match tokens.iter().position(|t| t.start < end && t.end > start) {
        Some(idx) => idx,
        None => return Vec::new(),
    };

    let lo = anchor.saturating_sub(terms.window_tokens);
    let hi = tokens.len().min(anchor + terms.window_tokens + 1);

    // Terms not yet matched; removing on match dedups repeated hits.
    // NOTE(review): the detection's own tokens fall inside this window and
    // can match a context term — confirm this self-match is intended.
    let mut remaining: HashSet<String> = terms.terms.iter().map(|t| t.to_lowercase()).collect();

    tokens[lo..hi]
        .iter()
        .filter_map(|token| {
            let candidate = if use_lemma {
                match token.lemma.as_ref() {
                    Some(lemma) => lemma.to_lowercase(),
                    None => token.text.to_lowercase(),
                }
            } else {
                token.text.to_lowercase()
            };
            if remaining.remove(&candidate) {
                Some(candidate)
            } else {
                None
            }
        })
        .collect()
}