// pii 0.1.0 - Docs.rs

//! Regex-based recognizer implementation.
//!
//! Regex recognizers are best for well-structured patterns like emails,
//! URLs, or hostnames. They emit detections with a fixed score and a
//! human-readable pattern name for audit logs.

use crate::recognizers::Recognizer;
use crate::types::{Detection, DetectionExplanation, EntityType, NlpArtifacts};
use regex::Regex;

/// Emits detections from a regex pattern.
///
/// Matches of the compiled pattern are reported as [`Detection`]s that carry
/// the configured entity type, the same fixed score, and the pattern's
/// human-readable name.
#[derive(Clone, Debug)]
pub struct RegexRecognizer {
    /// Recognizer name; returned by `name()` and copied into the `recognizer`
    /// field of each detection.
    name: String,
    /// The single entity type this recognizer reports.
    entity: EntityType,
    /// Pattern compiled once at construction time and reused for every call
    /// to `analyze`.
    regex: Regex,
    /// Score attached unchanged to every detection.
    score: f32,
    /// Label stored in `DetectionExplanation::Regex` for each detection.
    pattern_name: String,
}

impl RegexRecognizer {
    /// Builds a recognizer that reports matches of `pattern` as `entity`.
    ///
    /// `name` identifies the recognizer, `score` is attached unchanged to each
    /// detection, and `pattern_name` is the label stored in the detection's
    /// explanation.
    ///
    /// # Errors
    ///
    /// Returns the [`regex::Error`] produced by [`Regex::new`] when `pattern`
    /// fails to compile.
    pub fn new(
        name: impl Into<String>,
        entity: EntityType,
        pattern: &str,
        score: f32,
        pattern_name: impl Into<String>,
    ) -> Result<Self, regex::Error> {
        // Bindings are evaluated in field-declaration order, so an invalid
        // pattern is reported before `pattern_name` is converted.
        let name = name.into();
        let regex = Regex::new(pattern)?;
        let pattern_name = pattern_name.into();

        Ok(Self {
            name,
            entity,
            regex,
            score,
            pattern_name,
        })
    }
}

impl Recognizer for RegexRecognizer {
    /// Returns the name this recognizer was constructed with.
    fn name(&self) -> &str {
        &self.name
    }

    /// Returns a one-element slice holding the configured entity type.
    fn supported_entities(&self) -> &[EntityType] {
        std::slice::from_ref(&self.entity)
    }

    /// Scans `text` and returns one detection per non-empty regex match.
    ///
    /// `start` and `end` are the byte offsets of the match within `text`, as
    /// reported by the regex engine. Every detection carries the configured
    /// entity type, score, recognizer name, and pattern name. The NLP
    /// artifacts are not consulted.
    ///
    /// Zero-width matches are skipped: a pattern that can match the empty
    /// string (for example `\d*`) would otherwise yield detections that cover
    /// no text at all.
    fn analyze(&self, text: &str, _artifacts: &NlpArtifacts) -> Vec<Detection> {
        self.regex
            .find_iter(text)
            // An empty span identifies nothing, so it is never a useful detection.
            .filter(|m| m.start() < m.end())
            .map(|m| Detection {
                entity_type: self.entity.clone(),
                start: m.start(),
                end: m.end(),
                score: self.score,
                recognizer: self.name.clone(),
                explanation: DetectionExplanation::Regex {
                    pattern_name: self.pattern_name.clone(),
                },
            })
            .collect()
    }
}