Skip to main content

cloakpipe_core/detector/
mod.rs

1//! Multi-layer entity detection engine.
2//!
3//! Layers (applied in order, results merged and deduplicated):
4//! 1. Pattern matching (regex) — secrets, emails, IPs, URLs
5//! 2. Financial intelligence — amounts, percentages, fiscal dates
6//! 3. Named Entity Recognition (ONNX) — persons, organizations, locations
7//! 4. Custom rules (TOML config) — project codenames, client tiers, etc.
8
9pub mod patterns;
10pub mod financial;
11pub mod custom;
12
13#[cfg(feature = "ner")]
14pub mod ner;
15
16use crate::{DetectedEntity, config::DetectionConfig};
17use anyhow::Result;
18
19/// The combined detection engine that runs all layers.
20pub struct Detector {
21    pattern_detector: patterns::PatternDetector,
22    financial_detector: financial::FinancialDetector,
23    custom_detector: custom::CustomDetector,
24    #[cfg(feature = "ner")]
25    ner_detector: Option<ner::NerDetector>,
26    /// Entities to never anonymize (e.g., public companies).
27    preserve_list: Vec<String>,
28    /// Entities to always anonymize regardless of detection.
29    force_list: Vec<String>,
30}
31
32impl Detector {
33    /// Create a new detector from configuration.
34    pub fn from_config(config: &DetectionConfig) -> Result<Self> {
35        Ok(Self {
36            pattern_detector: patterns::PatternDetector::new(config)?,
37            financial_detector: financial::FinancialDetector::new(config)?,
38            custom_detector: custom::CustomDetector::new(config)?,
39            #[cfg(feature = "ner")]
40            ner_detector: if config.ner.enabled {
41                Some(ner::NerDetector::new(&config.ner)?)
42            } else {
43                None
44            },
45            preserve_list: config.overrides.preserve.clone(),
46            force_list: config.overrides.force.clone(),
47        })
48    }
49
50    /// Run all detection layers on the input text.
51    /// Returns a list of detected entities, sorted by position, deduplicated.
52    pub fn detect(&self, text: &str) -> Result<Vec<DetectedEntity>> {
53        let mut entities = Vec::new();
54
55        // Layer 1: Pattern matching
56        entities.extend(self.pattern_detector.detect(text)?);
57
58        // Layer 2: Financial intelligence
59        entities.extend(self.financial_detector.detect(text)?);
60
61        // Layer 3: NER (optional)
62        #[cfg(feature = "ner")]
63        if let Some(ref ner) = self.ner_detector {
64            entities.extend(ner.detect(text)?);
65        }
66
67        // Layer 4: Custom TOML rules
68        entities.extend(self.custom_detector.detect(text)?);
69
70        // Filter: remove preserved entities
71        entities.retain(|e| !self.preserve_list.contains(&e.original));
72
73        // Add: force-anonymize entities
74        for forced in &self.force_list {
75            if let Some(start) = text.find(forced.as_str()) {
76                entities.push(DetectedEntity {
77                    original: forced.clone(),
78                    start,
79                    end: start + forced.len(),
80                    category: crate::EntityCategory::Custom("FORCED".into()),
81                    confidence: 1.0,
82                    source: crate::DetectionSource::Custom,
83                });
84            }
85        }
86
87        // Sort by position and deduplicate overlapping spans
88        entities.sort_by_key(|e| e.start);
89        entities = Self::deduplicate_spans(entities);
90
91        Ok(entities)
92    }
93
94    /// Remove overlapping entity spans, keeping highest confidence.
95    fn deduplicate_spans(entities: Vec<DetectedEntity>) -> Vec<DetectedEntity> {
96        let mut result: Vec<DetectedEntity> = Vec::new();
97        for entity in entities {
98            if let Some(last) = result.last() {
99                if entity.start < last.end {
100                    // Overlap: keep the one with higher confidence
101                    if entity.confidence > last.confidence {
102                        result.pop();
103                        result.push(entity);
104                    }
105                    continue;
106                }
107            }
108            result.push(entity);
109        }
110        result
111    }
112}