Skip to main content

cloakpipe_core/detector/
patterns.rs

1//! Layer 1: Regex-based pattern detection for secrets, emails, IPs, etc.
2
3use crate::{DetectedEntity, EntityCategory, DetectionSource, config::DetectionConfig};
4use anyhow::Result;
5use regex::Regex;
6
7pub struct PatternDetector {
8    rules: Vec<PatternRule>,
9}
10
11struct PatternRule {
12    regex: Regex,
13    category: EntityCategory,
14    _name: String,
15}
16
17impl PatternDetector {
18    pub fn new(config: &DetectionConfig) -> Result<Self> {
19        let mut rules = Vec::new();
20
21        if config.secrets {
22            // AWS keys
23            rules.push(PatternRule {
24                regex: Regex::new(r"(?i)(AKIA[0-9A-Z]{16})")?,
25                category: EntityCategory::Secret,
26                _name: "aws_access_key".into(),
27            });
28            // Generic API keys / tokens
29            rules.push(PatternRule {
30                regex: Regex::new(r"(?i)(sk-[a-zA-Z0-9]{32,}|ghp_[a-zA-Z0-9]{36}|gho_[a-zA-Z0-9]{36})")?,
31                category: EntityCategory::Secret,
32                _name: "api_token".into(),
33            });
34            // Connection strings
35            rules.push(PatternRule {
36                regex: Regex::new(r"(?i)(postgres(?:ql)?://[^\s]+|mysql://[^\s]+|mongodb(?:\+srv)?://[^\s]+)")?,
37                category: EntityCategory::Secret,
38                _name: "connection_string".into(),
39            });
40            // JWT tokens
41            rules.push(PatternRule {
42                regex: Regex::new(r"eyJ[a-zA-Z0-9_-]+\.eyJ[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+")?,
43                category: EntityCategory::Secret,
44                _name: "jwt_token".into(),
45            });
46        }
47
48        if config.emails {
49            rules.push(PatternRule {
50                regex: Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")?,
51                category: EntityCategory::Email,
52                _name: "email".into(),
53            });
54        }
55
56        if config.phone_numbers {
57            rules.push(PatternRule {
58                regex: Regex::new(r"\+?[1-9]\d{0,2}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}")?,
59                category: EntityCategory::PhoneNumber,
60                _name: "phone".into(),
61            });
62        }
63
64        if config.ip_addresses {
65            rules.push(PatternRule {
66                regex: Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")?,
67                category: EntityCategory::IpAddress,
68                _name: "ipv4".into(),
69            });
70        }
71
72        if config.urls_internal {
73            rules.push(PatternRule {
74                regex: Regex::new(r"https?://(?:internal|staging|dev|admin)\.[a-zA-Z0-9.-]+(?:/[^\s]*)?")?,
75                category: EntityCategory::Url,
76                _name: "internal_url".into(),
77            });
78        }
79
80        Ok(Self { rules })
81    }
82
83    pub fn detect(&self, text: &str) -> Result<Vec<DetectedEntity>> {
84        let mut entities = Vec::new();
85        for rule in &self.rules {
86            for mat in rule.regex.find_iter(text) {
87                entities.push(DetectedEntity {
88                    original: mat.as_str().to_string(),
89                    start: mat.start(),
90                    end: mat.end(),
91                    category: rule.category.clone(),
92                    confidence: 1.0,
93                    source: DetectionSource::Pattern,
94                });
95            }
96        }
97        Ok(entities)
98    }
99}