Skip to main content

rustyclaw_core/security/
leak_detector.rs

1//! Enhanced leak detection (inspired by IronClaw)
2//!
3//! Scans data at sandbox boundaries to prevent secret exfiltration.
4//! Uses Aho-Corasick for fast O(n) multi-pattern matching plus regex for
5//! complex patterns.
6//!
7//! # Security Model
8//!
9//! Leak detection happens at TWO points:
10//!
11//! 1. **Before outbound requests** - Prevents exfiltrating secrets via URLs,
12//!    headers, or request bodies
13//! 2. **After responses/outputs** - Prevents accidental exposure in logs,
14//!    tool outputs, or data returned to the model
15//!
16//! # Architecture
17//!
18//! ```text
19//! ┌───────────────────────────────────────────────────────────────────┐
20//! │                     HTTP Request Flow                             │
21//! │                                                                   │
22//! │  Request ──► Allowlist ──► Leak Scan ──► Execute ──► Response   │
23//! │              Validator     (request)                     │       │
24//! │                                                          ▼       │
25//! │                               Output ◀── Leak Scan ◀── Response │
26//! │                                          (response)              │
27//! └───────────────────────────────────────────────────────────────────┘
28//!
29//! ┌───────────────────────────────────────────────────────────────────┐
30//! │                       Scan Result Actions                         │
31//! │                                                                   │
32//! │   LeakDetector.scan() ──► LeakScanResult                         │
33//! │                               │                                   │
34//! │                               ├─► clean: pass through             │
35//! │                               ├─► warn: log, pass                 │
36//! │                               ├─► redact: mask secret             │
37//! │                               └─► block: reject entirely          │
38//! └───────────────────────────────────────────────────────────────────┘
39//! ```
40//!
41//! # Attribution
42//!
43//! HTTP request scanning and Aho-Corasick optimization inspired by
44//! [IronClaw](https://github.com/nearai/ironclaw) (Apache-2.0).
45
46use std::ops::Range;
47
48use aho_corasick::AhoCorasick;
49use regex::Regex;
50
/// What the detector should do with content that matches a leak pattern.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LeakAction {
    /// Reject the content outright (used for critical secrets).
    Block,
    /// Replace the matched secret with `[REDACTED]` and pass the rest through.
    Redact,
    /// Emit a warning but leave the content untouched.
    Warn,
}

impl std::fmt::Display for LeakAction {
    /// Writes the lowercase action name: `block`, `redact` or `warn`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Block => "block",
            Self::Redact => "redact",
            Self::Warn => "warn",
        };
        f.write_str(label)
    }
}
71
/// How serious a detected leak is.
///
/// Variants are declared from least to most severe, so the derived ordering
/// satisfies `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum LeakSeverity {
    Low,
    Medium,
    High,
    Critical,
}

impl std::fmt::Display for LeakSeverity {
    /// Writes the lowercase severity name: `low`, `medium`, `high` or `critical`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Low => "low",
            Self::Medium => "medium",
            Self::High => "high",
            Self::Critical => "critical",
        };
        f.write_str(label)
    }
}
91
92/// A pattern for detecting secret leaks.
93#[derive(Debug, Clone)]
94pub struct LeakPattern {
95    pub name: String,
96    pub regex: Regex,
97    pub severity: LeakSeverity,
98    pub action: LeakAction,
99}
100
101/// A detected potential secret leak.
102#[derive(Debug, Clone)]
103pub struct LeakMatch {
104    pub pattern_name: String,
105    pub severity: LeakSeverity,
106    pub action: LeakAction,
107    /// Location in the scanned content.
108    pub location: Range<usize>,
109    /// A preview of the match with the secret partially masked.
110    pub masked_preview: String,
111}
112
113/// Result of scanning content for leaks.
114#[derive(Debug)]
115pub struct LeakScanResult {
116    /// All detected potential leaks.
117    pub matches: Vec<LeakMatch>,
118    /// Whether any match requires blocking.
119    pub should_block: bool,
120    /// Content with secrets redacted (if redaction was applied).
121    pub redacted_content: Option<String>,
122}
123
124impl LeakScanResult {
125    /// Check if content is clean (no leaks detected).
126    pub fn is_clean(&self) -> bool {
127        self.matches.is_empty()
128    }
129
130    /// Get the highest severity found.
131    pub fn max_severity(&self) -> Option<LeakSeverity> {
132        self.matches.iter().map(|m| m.severity).max()
133    }
134}
135
136/// Error from leak detection.
137#[derive(Debug, Clone, thiserror::Error)]
138pub enum LeakDetectionError {
139    #[error("Secret leak blocked: pattern '{pattern}' matched '{preview}'")]
140    SecretLeakBlocked { pattern: String, preview: String },
141}
142
143/// Detector for secret leaks in output data.
144///
145/// Uses Aho-Corasick for fast prefix matching combined with regex for
146/// accurate pattern validation.
147pub struct LeakDetector {
148    patterns: Vec<LeakPattern>,
149    /// For fast prefix matching of known patterns
150    prefix_matcher: Option<AhoCorasick>,
151    known_prefixes: Vec<(String, usize)>, // (prefix, pattern_index)
152}
153
154impl LeakDetector {
155    /// Create a new detector with default patterns.
156    pub fn new() -> Self {
157        Self::with_patterns(default_patterns())
158    }
159
160    /// Create a detector with custom patterns.
161    pub fn with_patterns(patterns: Vec<LeakPattern>) -> Self {
162        // Build prefix matcher for patterns that start with a known prefix
163        let mut prefixes = Vec::new();
164        for (idx, pattern) in patterns.iter().enumerate() {
165            if let Some(prefix) = extract_literal_prefix(pattern.regex.as_str()) {
166                if prefix.len() >= 3 {
167                    prefixes.push((prefix, idx));
168                }
169            }
170        }
171
172        let prefix_matcher = if !prefixes.is_empty() {
173            let prefix_strings: Vec<&str> = prefixes.iter().map(|(s, _)| s.as_str()).collect();
174            AhoCorasick::builder()
175                .ascii_case_insensitive(false)
176                .build(&prefix_strings)
177                .ok()
178        } else {
179            None
180        };
181
182        Self {
183            patterns,
184            prefix_matcher,
185            known_prefixes: prefixes,
186        }
187    }
188
189    /// Scan content for potential secret leaks.
190    pub fn scan(&self, content: &str) -> LeakScanResult {
191        let mut matches = Vec::new();
192        let mut should_block = false;
193        let mut redact_ranges = Vec::new();
194
195        // Use prefix matcher for quick elimination
196        let candidate_indices: Vec<usize> = if let Some(ref matcher) = self.prefix_matcher {
197            let mut indices = Vec::new();
198            for mat in matcher.find_iter(content) {
199                let pattern_idx = self.known_prefixes[mat.pattern().as_usize()].1;
200                if !indices.contains(&pattern_idx) {
201                    indices.push(pattern_idx);
202                }
203            }
204            // Also include patterns without prefixes
205            for (idx, _) in self.patterns.iter().enumerate() {
206                if !self.known_prefixes.iter().any(|(_, i)| *i == idx) && !indices.contains(&idx) {
207                    indices.push(idx);
208                }
209            }
210            indices
211        } else {
212            (0..self.patterns.len()).collect()
213        };
214
215        // Check candidate patterns
216        for idx in candidate_indices {
217            let pattern = &self.patterns[idx];
218            for mat in pattern.regex.find_iter(content) {
219                let matched_text = mat.as_str();
220                let location = mat.start()..mat.end();
221
222                let leak_match = LeakMatch {
223                    pattern_name: pattern.name.clone(),
224                    severity: pattern.severity,
225                    action: pattern.action,
226                    location: location.clone(),
227                    masked_preview: mask_secret(matched_text),
228                };
229
230                if pattern.action == LeakAction::Block {
231                    should_block = true;
232                }
233
234                if pattern.action == LeakAction::Redact {
235                    redact_ranges.push(location);
236                }
237
238                matches.push(leak_match);
239            }
240        }
241
242        // Sort by location for proper redaction
243        matches.sort_by_key(|m| m.location.start);
244        redact_ranges.sort_by_key(|r| r.start);
245
246        // Build redacted content if needed
247        let redacted_content = if !redact_ranges.is_empty() {
248            Some(apply_redactions(content, &redact_ranges))
249        } else {
250            None
251        };
252
253        LeakScanResult {
254            matches,
255            should_block,
256            redacted_content,
257        }
258    }
259
260    /// Scan content and return cleaned version based on action.
261    ///
262    /// Returns `Err` if content should be blocked, `Ok(content)` otherwise.
263    pub fn scan_and_clean(&self, content: &str) -> Result<String, LeakDetectionError> {
264        let result = self.scan(content);
265
266        if result.should_block {
267            let blocking_match = result
268                .matches
269                .iter()
270                .find(|m| m.action == LeakAction::Block);
271            return Err(LeakDetectionError::SecretLeakBlocked {
272                pattern: blocking_match
273                    .map(|m| m.pattern_name.clone())
274                    .unwrap_or_default(),
275                preview: blocking_match
276                    .map(|m| m.masked_preview.clone())
277                    .unwrap_or_default(),
278            });
279        }
280
281        // Log warnings
282        for m in &result.matches {
283            if m.action == LeakAction::Warn {
284                tracing::warn!(
285                    pattern = %m.pattern_name,
286                    severity = %m.severity,
287                    preview = %m.masked_preview,
288                    "Potential secret leak detected (warning only)"
289                );
290            }
291        }
292
293        // Return redacted content if any, otherwise original
294        Ok(result
295            .redacted_content
296            .unwrap_or_else(|| content.to_string()))
297    }
298
299    /// Scan an outbound HTTP request for potential secret leakage.
300    ///
301    /// This MUST be called before executing any HTTP request to prevent
302    /// exfiltration of secrets via URL, headers, or body.
303    ///
304    /// Returns `Err` if any part contains a blocked secret pattern.
305    pub fn scan_http_request(
306        &self,
307        url: &str,
308        headers: &[(String, String)],
309        body: Option<&[u8]>,
310    ) -> Result<(), LeakDetectionError> {
311        // Scan URL (most common exfiltration vector)
312        self.scan_and_clean(url)?;
313
314        // Scan each header value
315        for (name, value) in headers {
316            self.scan_and_clean(value).map_err(|e| {
317                LeakDetectionError::SecretLeakBlocked {
318                    pattern: format!("header:{}", name),
319                    preview: e.to_string(),
320                }
321            })?;
322        }
323
324        // Scan body if present. Use lossy UTF-8 conversion so a leading
325        // non-UTF8 byte can't be used to skip scanning entirely.
326        if let Some(body_bytes) = body {
327            let body_str = String::from_utf8_lossy(body_bytes);
328            self.scan_and_clean(&body_str)?;
329        }
330
331        Ok(())
332    }
333
334    /// Add a custom pattern at runtime.
335    pub fn add_pattern(&mut self, pattern: LeakPattern) {
336        self.patterns.push(pattern);
337        // Note: prefix_matcher won't be updated; rebuild if needed
338    }
339
340    /// Get the number of patterns.
341    pub fn pattern_count(&self) -> usize {
342        self.patterns.len()
343    }
344}
345
346impl Default for LeakDetector {
347    fn default() -> Self {
348        Self::new()
349    }
350}
351
/// Mask a secret for safe display.
///
/// Secrets of eight characters or fewer are replaced entirely by asterisks.
/// Longer secrets keep their first four and last four characters, with the
/// middle replaced by at most eight asterisks so the preview does not give
/// away the full length.
///
/// All lengths are measured in characters, not bytes, so text containing
/// multi-byte characters (for example Unicode whitespace matched by `\s`) is
/// masked correctly instead of exposing more than intended or losing its
/// suffix.
fn mask_secret(secret: &str) -> String {
    let chars: Vec<char> = secret.chars().collect();
    let len = chars.len();
    if len <= 8 {
        return "*".repeat(len);
    }

    let prefix: String = chars[..4].iter().collect();
    let suffix: String = chars[len - 4..].iter().collect();
    let middle_len = (len - 8).min(8);
    format!("{}{}{}", prefix, "*".repeat(middle_len), suffix)
}
366
/// Replace each of the given byte ranges of `content` with `[REDACTED]`.
///
/// Ranges may arrive in any order and may overlap or nest: they are ordered
/// by start offset here, and a range that begins inside an already redacted
/// region is merged into it. Merging matters for safety — without it a range
/// nested inside a longer one would move the cursor backwards and copy the
/// tail of the longer secret into the output. Ranges that merely touch are
/// kept separate and produce one marker each.
///
/// # Panics
///
/// Panics if a range start lies beyond the content or a boundary does not
/// fall on a UTF-8 character boundary.
fn apply_redactions(content: &str, ranges: &[Range<usize>]) -> String {
    if ranges.is_empty() {
        return content.to_string();
    }

    let mut ordered: Vec<&Range<usize>> = ranges.iter().collect();
    ordered.sort_by_key(|r| r.start);

    let mut result = String::with_capacity(content.len());
    let mut last_end = 0;

    for range in ordered {
        if range.start < last_end {
            // Starts inside the previous redaction: extend it, never shrink.
            last_end = last_end.max(range.end);
            continue;
        }
        result.push_str(&content[last_end..range.start]);
        result.push_str("[REDACTED]");
        last_end = range.end;
    }

    if last_end < content.len() {
        result.push_str(&content[last_end..]);
    }

    result
}
390
/// Extract a literal prefix that every match of `pattern` must start with.
///
/// The result is used as a prefilter: a pattern is skipped when its prefix is
/// absent from the content, so the prefix must be a true requirement of the
/// regex. Two cases are therefore handled conservatively:
///
/// * a literal followed by `?`, `*` or `{` may be optional, so that literal
///   is dropped from the prefix;
/// * a top-level alternation (`abc|def`) can match without the first branch,
///   so no prefix is reported at all.
///
/// Returns `None` when no prefix of at least three bytes can be guaranteed.
fn extract_literal_prefix(pattern: &str) -> Option<String> {
    if has_top_level_alternation(pattern) {
        return None;
    }

    let mut prefix = String::new();

    for ch in pattern.chars() {
        match ch {
            // The quantifier applies to the literal just collected, which may
            // therefore occur zero times.
            '?' | '*' | '{' => {
                prefix.pop();
                break;
            }
            // Start of a regex construct or escape sequence; `+` keeps the
            // preceding literal because it requires at least one occurrence.
            '[' | '(' | '.' | '+' | '|' | '^' | '$' | '\\' => break,
            // Regular character
            _ => prefix.push(ch),
        }
    }

    if prefix.len() >= 3 {
        Some(prefix)
    } else {
        None
    }
}

/// Report whether `pattern` contains a `|` outside every group and character
/// class. Also returns `true` when brackets do not balance, so that callers
/// fall back to the safe choice when the structure cannot be determined.
fn has_top_level_alternation(pattern: &str) -> bool {
    let mut group_depth = 0usize;
    let mut class_depth = 0usize;
    let mut chars = pattern.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            // Skip whatever is escaped.
            '\\' => {
                chars.next();
            }
            '[' => {
                class_depth += 1;
                // A `]` directly after `[` or `[^` is a literal member.
                if chars.peek() == Some(&'^') {
                    chars.next();
                }
                if chars.peek() == Some(&']') {
                    chars.next();
                }
            }
            ']' if class_depth > 0 => class_depth -= 1,
            _ if class_depth > 0 => {}
            '(' => group_depth += 1,
            ')' => match group_depth.checked_sub(1) {
                Some(depth) => group_depth = depth,
                None => return true,
            },
            '|' if group_depth == 0 => return true,
            _ => {}
        }
    }

    group_depth != 0 || class_depth != 0
}
412
413/// Default leak detection patterns.
414fn default_patterns() -> Vec<LeakPattern> {
415    vec![
416        // OpenAI API keys
417        LeakPattern {
418            name: "openai_api_key".to_string(),
419            regex: Regex::new(r"sk-(?:proj-)?[a-zA-Z0-9]{20,}(?:T3BlbkFJ[a-zA-Z0-9_-]*)?").unwrap(),
420            severity: LeakSeverity::Critical,
421            action: LeakAction::Block,
422        },
423        // Anthropic API keys
424        LeakPattern {
425            name: "anthropic_api_key".to_string(),
426            regex: Regex::new(r"sk-ant-api[a-zA-Z0-9_-]{90,}").unwrap(),
427            severity: LeakSeverity::Critical,
428            action: LeakAction::Block,
429        },
430        // AWS Access Key ID
431        LeakPattern {
432            name: "aws_access_key".to_string(),
433            regex: Regex::new(r"AKIA[0-9A-Z]{16}").unwrap(),
434            severity: LeakSeverity::Critical,
435            action: LeakAction::Block,
436        },
437        // GitHub tokens
438        LeakPattern {
439            name: "github_token".to_string(),
440            regex: Regex::new(r"gh[pousr]_[A-Za-z0-9_]{36,}").unwrap(),
441            severity: LeakSeverity::Critical,
442            action: LeakAction::Block,
443        },
444        // GitHub fine-grained PAT
445        LeakPattern {
446            name: "github_fine_grained_pat".to_string(),
447            regex: Regex::new(r"github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}").unwrap(),
448            severity: LeakSeverity::Critical,
449            action: LeakAction::Block,
450        },
451        // Stripe keys
452        LeakPattern {
453            name: "stripe_api_key".to_string(),
454            regex: Regex::new(r"sk_(?:live|test)_[a-zA-Z0-9]{24,}").unwrap(),
455            severity: LeakSeverity::Critical,
456            action: LeakAction::Block,
457        },
458        // PEM private keys
459        LeakPattern {
460            name: "pem_private_key".to_string(),
461            regex: Regex::new(r"-----BEGIN\s+(?:RSA\s+)?PRIVATE\s+KEY-----").unwrap(),
462            severity: LeakSeverity::Critical,
463            action: LeakAction::Block,
464        },
465        // SSH private keys
466        LeakPattern {
467            name: "ssh_private_key".to_string(),
468            regex: Regex::new(r"-----BEGIN\s+(?:OPENSSH|EC|DSA)\s+PRIVATE\s+KEY-----").unwrap(),
469            severity: LeakSeverity::Critical,
470            action: LeakAction::Block,
471        },
472        // Google API keys
473        LeakPattern {
474            name: "google_api_key".to_string(),
475            regex: Regex::new(r"AIza[0-9A-Za-z_-]{35}").unwrap(),
476            severity: LeakSeverity::High,
477            action: LeakAction::Block,
478        },
479        // Slack tokens
480        LeakPattern {
481            name: "slack_token".to_string(),
482            regex: Regex::new(r"xox[baprs]-[0-9a-zA-Z-]{10,}").unwrap(),
483            severity: LeakSeverity::High,
484            action: LeakAction::Block,
485        },
486        // Twilio API keys
487        LeakPattern {
488            name: "twilio_api_key".to_string(),
489            regex: Regex::new(r"SK[a-fA-F0-9]{32}").unwrap(),
490            severity: LeakSeverity::High,
491            action: LeakAction::Block,
492        },
493        // SendGrid API keys
494        LeakPattern {
495            name: "sendgrid_api_key".to_string(),
496            regex: Regex::new(r"SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43}").unwrap(),
497            severity: LeakSeverity::High,
498            action: LeakAction::Block,
499        },
500        // Bearer tokens (redact instead of block, might be intentional)
501        LeakPattern {
502            name: "bearer_token".to_string(),
503            regex: Regex::new(r"Bearer\s+[a-zA-Z0-9_-]{20,}").unwrap(),
504            severity: LeakSeverity::High,
505            action: LeakAction::Redact,
506        },
507        // Authorization header with key
508        LeakPattern {
509            name: "auth_header".to_string(),
510            regex: Regex::new(r"(?i)authorization:\s*[a-zA-Z]+\s+[a-zA-Z0-9_-]{20,}").unwrap(),
511            severity: LeakSeverity::High,
512            action: LeakAction::Redact,
513        },
514        // High entropy hex (potential secrets, warn only)
515        LeakPattern {
516            name: "high_entropy_hex".to_string(),
517            regex: Regex::new(r"\b[a-fA-F0-9]{64}\b").unwrap(),
518            severity: LeakSeverity::Medium,
519            action: LeakAction::Warn,
520        },
521    ]
522}
523
#[cfg(test)]
mod tests {
    use super::*;

    // Fixtures are obviously fake (all X's, or the example key from the AWS
    // docs) to avoid GitHub push protection.
    const FAKE_OPENAI_KEY: &str = "sk-proj-XXXXXXXXXXXXXXXXXXXXXXXX";
    const FAKE_GITHUB_TOKEN: &str = "ghp_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
    const AWS_EXAMPLE_KEY: &str = "AKIAIOSFODNN7EXAMPLE";

    /// Scan `content` with a detector using the default patterns.
    fn scan_default(content: &str) -> LeakScanResult {
        LeakDetector::new().scan(content)
    }

    /// Whether `result` contains a match produced by `pattern_name`.
    fn reports(result: &LeakScanResult, pattern_name: &str) -> bool {
        result
            .matches
            .iter()
            .any(|m| m.pattern_name == pattern_name)
    }

    #[test]
    fn test_detect_openai_key() {
        let result = scan_default("API key: sk-proj-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");

        assert!(!result.is_clean());
        assert!(result.should_block);
        assert!(reports(&result, "openai_api_key"));
    }

    #[test]
    fn test_detect_github_token() {
        let content = format!("token: {}", FAKE_GITHUB_TOKEN);
        let result = scan_default(&content);

        assert!(!result.is_clean());
        assert!(reports(&result, "github_token"));
    }

    #[test]
    fn test_detect_aws_key() {
        let content = format!("AWS_ACCESS_KEY_ID={}", AWS_EXAMPLE_KEY);
        let result = scan_default(&content);

        assert!(!result.is_clean());
        assert!(reports(&result, "aws_access_key"));
    }

    #[test]
    fn test_detect_pem_key() {
        let result = scan_default("-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA...");

        assert!(!result.is_clean());
        assert!(reports(&result, "pem_private_key"));
    }

    #[test]
    fn test_clean_content() {
        let result = scan_default("Hello world! This is just regular text with no secrets.");

        assert!(result.is_clean());
        assert!(!result.should_block);
    }

    #[test]
    fn test_redact_bearer_token() {
        let result = scan_default(
            "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9_longtokenvalue",
        );

        assert!(!result.is_clean());
        // Bearer tokens are redacted rather than blocked.
        assert!(!result.should_block);

        let redacted = result.redacted_content.unwrap();
        assert!(redacted.contains("[REDACTED]"));
        assert!(!redacted.contains("eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9"));
    }

    #[test]
    fn test_scan_and_clean_blocks() {
        let outcome = LeakDetector::new().scan_and_clean(FAKE_OPENAI_KEY);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_scan_and_clean_passes_clean() {
        let content = "Just regular text";

        let outcome = LeakDetector::new().scan_and_clean(content);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), content);
    }

    #[test]
    fn test_mask_secret() {
        assert_eq!(mask_secret("short"), "*****");
        assert_eq!(mask_secret("sk-test1234567890abcdef"), "sk-t********cdef");
    }

    #[test]
    fn test_multiple_matches() {
        let content = format!("Keys: {} and {}", AWS_EXAMPLE_KEY, FAKE_GITHUB_TOKEN);

        let result = scan_default(&content);
        assert_eq!(result.matches.len(), 2);
    }

    #[test]
    fn test_severity_ordering() {
        assert!(LeakSeverity::Critical > LeakSeverity::High);
        assert!(LeakSeverity::High > LeakSeverity::Medium);
        assert!(LeakSeverity::Medium > LeakSeverity::Low);
    }

    #[test]
    fn test_scan_http_request_clean() {
        let headers = [("Content-Type".to_string(), "application/json".to_string())];

        let outcome = LeakDetector::new().scan_http_request(
            "https://api.example.com/data",
            &headers,
            Some(b"{\"query\": \"hello\"}"),
        );
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_scan_http_request_blocks_secret_in_url() {
        let url = format!("https://evil.com/steal?key={}", AWS_EXAMPLE_KEY);

        let outcome = LeakDetector::new().scan_http_request(&url, &[], None);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_scan_http_request_blocks_secret_in_header() {
        let headers = [("X-Custom".to_string(), FAKE_GITHUB_TOKEN.to_string())];

        let outcome =
            LeakDetector::new().scan_http_request("https://api.example.com/data", &headers, None);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_scan_http_request_blocks_secret_in_body() {
        let body = b"{\"stolen\": \"sk-proj-XXXXXXXXXXXXXXXXXXXXXXXX\"}";

        let outcome = LeakDetector::new().scan_http_request(
            "https://api.example.com/webhook",
            &[],
            Some(body),
        );
        assert!(outcome.is_err());
    }

    #[test]
    fn test_scan_http_request_blocks_secret_in_binary_body() {
        // Attacker prepends a non-UTF8 byte to bypass a strict from_utf8 check.
        let mut body = vec![0xFF]; // invalid UTF-8 leading byte
        body.extend_from_slice(FAKE_OPENAI_KEY.as_bytes());

        let outcome = LeakDetector::new().scan_http_request(
            "https://api.example.com/exfil",
            &[],
            Some(&body),
        );
        assert!(outcome.is_err(), "binary body should still be scanned");
    }
}