// pii 0.1.0 - Docs.rs

//! Analyzer pipeline wiring for PII detection.
//!
//! This module defines the `Analyzer`, the main entry point for running PII
//! detection. The analyzer is responsible for:
//! - producing NLP artifacts (tokens, offsets, optional lemma/POS/NER)
//! - running recognizers that emit candidate detections
//! - optionally applying context-based score boosts
//! - resolving overlaps and applying thresholds deterministically
//!
//! The analyzer is intentionally modular. You can:
//! - plug in a custom `NlpEngine` to control tokenization and capabilities
//! - provide your own recognizers or replace the defaults
//! - add context enhancers for ambiguous patterns
//! - configure thresholds and allowlists via `PolicyConfig`
//!
//! This structure keeps detection logic explicit and testable while allowing
//! callers to tailor the pipeline to their domain.

use crate::config::PolicyConfig;
use crate::context::ContextEnhancer;
use crate::decision::resolve;
use crate::error::PiiResult;
use crate::nlp::NlpEngine;
use crate::recognizers::Recognizer;
use crate::types::{AnalyzeResult, Detection, EntityType, Language};

/// Orchestrates NLP analysis, recognizers, and context enhancers.
///
/// Every pipeline component is held as a boxed trait object, so any
/// `NlpEngine`, `Recognizer`, or `ContextEnhancer` implementation can be
/// supplied when the analyzer is constructed.
pub struct Analyzer {
    /// Engine invoked first on each input; its artifacts are shared with every
    /// recognizer and enhancer.
    nlp: Box<dyn NlpEngine>,
    /// Recognizers run in vector order; each contributes candidate detections.
    recognizers: Vec<Box<dyn Recognizer>>,
    /// Enhancers applied in vector order to the combined candidate list.
    enhancers: Vec<Box<dyn ContextEnhancer>>,
    /// Policy consulted for which entity types are enabled and for the
    /// per-entity threshold passed to overlap resolution.
    policy: PolicyConfig,
}

impl Analyzer {
    /// Creates a new analyzer with the provided pipeline components.
    pub fn new(
        nlp: Box<dyn NlpEngine>,
        recognizers: Vec<Box<dyn Recognizer>>,
        enhancers: Vec<Box<dyn ContextEnhancer>>,
        policy: PolicyConfig,
    ) -> Self {
        Self {
            nlp,
            recognizers,
            enhancers,
            policy,
        }
    }

    /// Runs detection over the supplied text and language.
    pub fn analyze(&self, text: &str, language: &Language) -> PiiResult<AnalyzeResult> {
        let artifacts = self.nlp.analyze(text, language)?;
        let mut candidates = Vec::new();
        for recognizer in &self.recognizers {
            let mut detected = recognizer.analyze(text, &artifacts);
            detected.retain(|det| self.policy.is_enabled(&det.entity_type));
            candidates.extend(detected);
        }

        for enhancer in &self.enhancers {
            enhancer.enhance(&mut candidates, text, &artifacts);
        }

        let resolved = resolve(candidates, &|det: &Detection| {
            self.policy.threshold_for(&det.entity_type)
        });

        Ok(AnalyzeResult {
            language: artifacts.language.clone(),
            entities: resolved,
            capabilities: artifacts.capabilities.clone(),
        })
    }
}

/// Returns the default score threshold for built-in entity types.
///
/// Thresholds fall into three tiers:
/// - `0.7` for financial and government identifiers, IMEI, and the
///   person/location/organization entities;
/// - `0.6` for contact, network, document, and device identifiers;
/// - `0.5` for URLs, domains, hostnames, and any `Custom` entity.
///
/// The match is intentionally exhaustive (no wildcard arm) so that adding a
/// new `EntityType` variant forces a decision about its threshold.
pub fn default_threshold(entity: &EntityType) -> f32 {
    match entity {
        EntityType::CreditCard
        | EntityType::Iban
        | EntityType::Ssn
        | EntityType::Itin
        | EntityType::TaxId
        | EntityType::RoutingNumber
        | EntityType::Imei
        | EntityType::Person
        | EntityType::Location
        | EntityType::Organization => 0.7,

        EntityType::Email
        | EntityType::Phone
        | EntityType::IpAddress
        | EntityType::Ipv6
        | EntityType::Passport
        | EntityType::DriverLicense
        | EntityType::BankAccount
        | EntityType::CryptoAddress
        | EntityType::MacAddress
        | EntityType::Uuid
        | EntityType::Vin => 0.6,

        EntityType::Url
        | EntityType::Domain
        | EntityType::Hostname
        | EntityType::Custom(_) => 0.5,
    }
}