pii 0.1.0

PII detection and anonymization with deterministic, capability-aware NLP pipelines.
Documentation
//! Dictionary-based recognizer using Aho-Corasick.
//!
//! Dictionary recognizers are useful for names, codewords, or any curated
//! list of terms. Terms are matched literally wherever they occur in the text
//! (optionally ignoring ASCII case; no word-boundary check is applied), and
//! the emitted detections include the dictionary source label for audit.

use crate::recognizers::Recognizer;
use crate::types::{Detection, DetectionExplanation, EntityType, NlpArtifacts};
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

/// Emits detections for exact dictionary terms.
///
/// Each instance is bound to a single entity type and a single term list; the
/// term list is compiled into an Aho-Corasick automaton once, at construction.
#[derive(Clone)]
pub struct DictionaryRecognizer {
    /// Recognizer name; returned by `name()` and copied into every detection's
    /// `recognizer` field.
    name: String,
    /// The one entity type this recognizer reports.
    entity: EntityType,
    /// Automaton compiled from the dictionary terms.
    matcher: AhoCorasick,
    /// Score assigned to every detection emitted by this recognizer.
    score: f32,
    /// Dictionary source label, carried in each detection's explanation.
    source: String,
}

impl DictionaryRecognizer {
    /// Creates a dictionary recognizer.
    ///
    /// * `name` - recognizer name reported on every detection.
    /// * `entity` - entity type assigned to every detection.
    /// * `terms` - dictionary terms to search for; empty strings are ignored.
    /// * `score` - score assigned to every detection.
    /// * `source` - dictionary source label stored in each detection's explanation.
    /// * `case_insensitive` - when `true`, ASCII letters are matched without
    ///   regard to case (non-ASCII characters are still compared exactly).
    ///
    /// When several terms match at the same position, the longest one wins
    /// (leftmost-longest semantics).
    ///
    /// # Panics
    ///
    /// Panics if the Aho-Corasick automaton cannot be built from `terms`.
    pub fn new(
        name: impl Into<String>,
        entity: EntityType,
        terms: &[String],
        score: f32,
        source: impl Into<String>,
        case_insensitive: bool,
    ) -> Self {
        // An empty pattern matches at every position not already covered by a
        // longer term, which would flood the output with zero-length
        // detections. Such entries carry no information, so drop them before
        // building the automaton.
        let usable_terms = terms.iter().filter(|term| !term.is_empty());

        let matcher = AhoCorasickBuilder::new()
            .ascii_case_insensitive(case_insensitive)
            .match_kind(MatchKind::LeftmostLongest)
            .build(usable_terms)
            .expect("failed to build dictionary matcher");

        Self {
            name: name.into(),
            entity,
            matcher,
            score,
            source: source.into(),
        }
    }
}

impl Recognizer for DictionaryRecognizer {
    /// Returns the name this recognizer was constructed with.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Returns a one-element slice containing the single entity type this
    /// recognizer emits.
    fn supported_entities(&self) -> &[EntityType] {
        let only_entity = &self.entity;
        std::slice::from_ref(only_entity)
    }

    /// Scans `text` for dictionary terms and returns one detection per match.
    ///
    /// Every detection carries the configured entity type, score, recognizer
    /// name and dictionary source label; `start`/`end` are the offsets reported
    /// by the matcher for that match. The NLP artifacts are not consulted.
    fn analyze(&self, text: &str, _artifacts: &NlpArtifacts) -> Vec<Detection> {
        let mut detections = Vec::new();
        for hit in self.matcher.find_iter(text) {
            let explanation = DetectionExplanation::Dictionary {
                source: self.source.clone(),
            };
            detections.push(Detection {
                entity_type: self.entity.clone(),
                start: hit.start(),
                end: hit.end(),
                score: self.score,
                recognizer: self.name.clone(),
                explanation,
            });
        }
        detections
    }
}