Skip to main content

synapse_pingora/profiler/
patterns.rs

//! Pattern detection for string values.
//!
//! Uses lazily-initialized (`once_cell::sync::Lazy`) regexes for efficient pattern
//! matching of common string formats like UUIDs, emails, dates, URLs, and IP addresses.
//!
//! ## Performance
//! - Pattern matching: ~100-500ns per string
//! - Regex compilation: Once at first use (lazy)
9
10use once_cell::sync::Lazy;
11use regex::Regex;
12
13use crate::profiler::schema_types::PatternType;
14
15// ============================================================================
16// Pattern Regexes (compiled once, reused)
17// ============================================================================
18
19/// UUID pattern: 8-4-4-4-12 hexadecimal format
20/// Matches: 550e8400-e29b-41d4-a716-446655440000
21static UUID_PATTERN: Lazy<Regex> = Lazy::new(|| {
22    Regex::new(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
23        .expect("UUID regex compilation failed")
24});
25
26/// Email pattern: basic email format
27/// Matches: user@example.com, name.last@sub.domain.org
28static EMAIL_PATTERN: Lazy<Regex> = Lazy::new(|| {
29    Regex::new(r"^[^\s@]+@[^\s@]+\.[^\s@]+$").expect("Email regex compilation failed")
30});
31
32/// ISO 8601 datetime pattern
33/// Matches: 2024-01-15T10:30:00, 2024-01-15T10:30:00Z, 2024-01-15T10:30:00+05:00
34static ISO_DATE_PATTERN: Lazy<Regex> = Lazy::new(|| {
35    Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").expect("ISO date regex compilation failed")
36});
37
38/// URL pattern: HTTP/HTTPS URLs
39/// Matches: http://example.com, https://api.example.com/path?query=1
40static URL_PATTERN: Lazy<Regex> =
41    Lazy::new(|| Regex::new(r"^https?://[^\s]+$").expect("URL regex compilation failed"));
42
43/// IPv4 address pattern
44/// Matches: 192.168.1.1, 10.0.0.255
45static IPV4_PATTERN: Lazy<Regex> = Lazy::new(|| {
46    Regex::new(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").expect("IPv4 regex compilation failed")
47});
48
49/// IPv6 address pattern (simplified)
50/// Matches: 2001:0db8:85a3:0000:0000:8a2e:0370:7334, ::1
51static IPV6_PATTERN: Lazy<Regex> = Lazy::new(|| {
52    Regex::new(r"^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$")
53        .expect("IPv6 regex compilation failed")
54});
55
56/// JWT pattern: three base64url segments separated by dots
57/// Matches: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U
58static JWT_PATTERN: Lazy<Regex> = Lazy::new(|| {
59    Regex::new(r"^[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$")
60        .expect("JWT regex compilation failed")
61});
62
63/// MongoDB ObjectId pattern: 24 hexadecimal characters
64/// Matches: 507f1f77bcf86cd799439011
65static OBJECT_ID_PATTERN: Lazy<Regex> =
66    Lazy::new(|| Regex::new(r"^[0-9a-fA-F]{24}$").expect("ObjectId regex compilation failed"));
67
68/// Generic hex string pattern: 16+ hexadecimal characters
69/// Matches: abcdef1234567890abcdef, 0123456789abcdef0123456789abcdef
70static HEX_STRING_PATTERN: Lazy<Regex> =
71    Lazy::new(|| Regex::new(r"^[0-9a-fA-F]{16,}$").expect("Hex string regex compilation failed"));
72
73/// Phone number pattern (various formats)
74/// Matches: +1-555-123-4567, (555) 123-4567, 555.123.4567
75static PHONE_PATTERN: Lazy<Regex> = Lazy::new(|| {
76    Regex::new(
77        r"^[\+]?[(]?[0-9]{1,3}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,4}[-\s\.]?[0-9]{1,9}$",
78    )
79    .expect("Phone regex compilation failed")
80});
81
82/// Credit card pattern (basic format, 13-19 digits with optional separators)
83/// Note: This is for pattern detection only, not validation
84static CREDIT_CARD_PATTERN: Lazy<Regex> = Lazy::new(|| {
85    Regex::new(r"^[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{1,7}$")
86        .expect("Credit card regex compilation failed")
87});
88
89// ============================================================================
90// Pattern Detection
91// ============================================================================
92
93/// Detect pattern type from a string value.
94///
95/// Checks patterns in order of specificity:
96/// 1. UUID (most specific format)
97/// 2. ObjectId (24 hex chars)
98/// 3. JWT (three base64 segments)
99/// 4. Email
100/// 5. ISO Date
101/// 6. URL
102/// 7. IPv4
103/// 8. IPv6
104/// 9. Phone
105/// 10. Credit Card
106/// 11. Hex String (generic fallback for hex)
107///
108/// Returns None if no pattern matches.
109///
110/// ## Performance
111/// Average: ~200ns for non-matching strings
112/// Worst case: ~1us when checking all patterns
113#[inline]
114pub fn detect_pattern(value: &str) -> Option<PatternType> {
115    // Short-circuit for empty or very short strings
116    if value.len() < 3 {
117        return None;
118    }
119
120    // Check length-specific patterns first (faster rejection)
121    let len = value.len();
122
123    // UUID is exactly 36 chars
124    if len == 36 && UUID_PATTERN.is_match(value) {
125        return Some(PatternType::Uuid);
126    }
127
128    // ObjectId is exactly 24 hex chars
129    if len == 24 && OBJECT_ID_PATTERN.is_match(value) {
130        return Some(PatternType::ObjectId);
131    }
132
133    // JWT typically > 50 chars and contains dots
134    if len > 50 && value.contains('.') && JWT_PATTERN.is_match(value) {
135        return Some(PatternType::Jwt);
136    }
137
138    // Email detection (contains @)
139    if value.contains('@') && EMAIL_PATTERN.is_match(value) {
140        return Some(PatternType::Email);
141    }
142
143    // ISO date detection (starts with digit, contains T)
144    if value.starts_with(|c: char| c.is_ascii_digit()) {
145        if value.contains('T') && ISO_DATE_PATTERN.is_match(value) {
146            return Some(PatternType::IsoDate);
147        }
148
149        // IPv4 detection (4 dot-separated octets)
150        if value.contains('.') && !value.contains(':') && IPV4_PATTERN.is_match(value) {
151            return Some(PatternType::Ipv4);
152        }
153
154        // Credit card detection
155        if (13..=19).contains(&len) && CREDIT_CARD_PATTERN.is_match(value) {
156            return Some(PatternType::CreditCard);
157        }
158    }
159
160    // Phone number detection (can start with + or digit)
161    if (7..=20).contains(&len) {
162        let first_char = value.chars().next();
163        if (matches!(first_char, Some('+') | Some('('))
164            || value.starts_with(|c: char| c.is_ascii_digit()))
165            && PHONE_PATTERN.is_match(value)
166        {
167            return Some(PatternType::Phone);
168        }
169    }
170
171    // URL detection (starts with http)
172    if value.starts_with("http") && URL_PATTERN.is_match(value) {
173        return Some(PatternType::Url);
174    }
175
176    // IPv6 detection (contains colons, hex digits)
177    if value.contains(':') && IPV6_PATTERN.is_match(value) {
178        return Some(PatternType::Ipv6);
179    }
180
181    // Generic hex string (16+ hex chars, no separators)
182    if len >= 16 && HEX_STRING_PATTERN.is_match(value) {
183        return Some(PatternType::HexString);
184    }
185
186    None
187}
188
189/// Check if a value matches a specific pattern.
190#[inline]
191pub fn matches_pattern(value: &str, pattern: PatternType) -> bool {
192    match pattern {
193        PatternType::Uuid => UUID_PATTERN.is_match(value),
194        PatternType::Email => EMAIL_PATTERN.is_match(value),
195        PatternType::IsoDate => ISO_DATE_PATTERN.is_match(value),
196        PatternType::Url => URL_PATTERN.is_match(value),
197        PatternType::Ipv4 => IPV4_PATTERN.is_match(value),
198        PatternType::Ipv6 => IPV6_PATTERN.is_match(value),
199        PatternType::Jwt => JWT_PATTERN.is_match(value),
200        PatternType::ObjectId => OBJECT_ID_PATTERN.is_match(value),
201        PatternType::HexString => HEX_STRING_PATTERN.is_match(value),
202        PatternType::Phone => PHONE_PATTERN.is_match(value),
203        PatternType::CreditCard => CREDIT_CARD_PATTERN.is_match(value),
204    }
205}
206
207// ============================================================================
208// Tests
209// ============================================================================
210
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `detect_pattern` returns `expected` for every input.
    fn assert_detects(inputs: &[&str], expected: Option<PatternType>) {
        for input in inputs {
            assert_eq!(detect_pattern(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_uuid_detection() {
        assert_detects(
            &[
                "550e8400-e29b-41d4-a716-446655440000",
                "550E8400-E29B-41D4-A716-446655440000",
            ],
            Some(PatternType::Uuid),
        );
        assert_detects(&["not-a-uuid"], None);
        // 32 hex chars without dashes are not a UUID; they match HexString (16+ hex chars)
        assert_detects(
            &["550e8400e29b41d4a716446655440000"],
            Some(PatternType::HexString),
        );
    }

    #[test]
    fn test_email_detection() {
        assert_detects(
            &["user@example.com", "name.last@sub.domain.org"],
            Some(PatternType::Email),
        );
        assert_detects(&["invalid-email", "@nodomain"], None);
    }

    #[test]
    fn test_iso_date_detection() {
        assert_detects(
            &[
                "2024-01-15T10:30:00",
                "2024-01-15T10:30:00Z",
                "2024-01-15T10:30:00+05:00",
            ],
            Some(PatternType::IsoDate),
        );
        // Date only, no time part: the ISO date pattern requires the T separator
        let date_only = matches_pattern("2024-01-15", PatternType::IsoDate);
        assert!(!date_only);
    }

    #[test]
    fn test_url_detection() {
        assert_detects(
            &["http://example.com", "https://api.example.com/path?query=1"],
            Some(PatternType::Url),
        );
        // Not HTTP(S), and no protocol at all
        assert_detects(&["ftp://example.com", "example.com"], None);
    }

    #[test]
    fn test_ipv4_detection() {
        // "256.1.1.1" is not a valid address but still has the dotted-quad format
        assert_detects(
            &["192.168.1.1", "10.0.0.255", "256.1.1.1"],
            Some(PatternType::Ipv4),
        );
        // Missing octet - doesn't match IPv4 pattern
        let three_octets = matches_pattern("192.168.1", PatternType::Ipv4);
        assert!(!three_octets);
    }

    #[test]
    fn test_ipv6_detection() {
        assert_detects(
            &["2001:0db8:85a3:0000:0000:8a2e:0370:7334", "::1", "fe80::1"],
            Some(PatternType::Ipv6),
        );
    }

    #[test]
    fn test_jwt_detection() {
        let token = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U";
        assert_detects(&[token], Some(PatternType::Jwt));
        // Too short
        assert_detects(&["not.a.jwt"], None);
    }

    #[test]
    fn test_object_id_detection() {
        assert_detects(
            &["507f1f77bcf86cd799439011", "507F1F77BCF86CD799439011"],
            Some(PatternType::ObjectId),
        );
        // 23 hex chars - too short for ObjectId (24), but matches HexString (16+)
        assert_detects(
            &["507f1f77bcf86cd79943901"],
            Some(PatternType::HexString),
        );
    }

    #[test]
    fn test_hex_string_detection() {
        assert_detects(
            &["abcdef1234567890", "0123456789abcdef0123456789abcdef"],
            Some(PatternType::HexString),
        );
        // 14 chars (too short), then non-hex chars
        assert_detects(&["abcdef12345678", "ghijkl1234567890"], None);
    }

    #[test]
    fn test_phone_detection() {
        // Phone numbers have separators that prevent them from matching other patterns
        assert_detects(&["+1-555-1234567"], Some(PatternType::Phone));
        // These formats may conflict with other pattern checks, test the specific matcher
        for sample in ["+1-555-123-4567", "(555) 123-4567", "555.123.4567"].iter() {
            assert!(
                matches_pattern(sample, PatternType::Phone),
                "input: {:?}",
                sample
            );
        }
    }

    #[test]
    fn test_matches_pattern() {
        let uuid_ok = matches_pattern("550e8400-e29b-41d4-a716-446655440000", PatternType::Uuid);
        let uuid_bad = matches_pattern("not-a-uuid", PatternType::Uuid);
        assert!(uuid_ok);
        assert!(!uuid_bad);

        let email_ok = matches_pattern("user@example.com", PatternType::Email);
        let email_bad = matches_pattern("invalid", PatternType::Email);
        assert!(email_ok);
        assert!(!email_bad);
    }

    #[test]
    fn test_empty_and_short_strings() {
        assert_detects(&["", "ab", "abc"], None);
    }

    #[test]
    fn test_pattern_priority() {
        // UUID should take priority over hex string
        assert_detects(
            &["550e8400-e29b-41d4-a716-446655440000"],
            Some(PatternType::Uuid),
        );

        // ObjectId should take priority over generic hex
        assert_detects(&["507f1f77bcf86cd799439011"], Some(PatternType::ObjectId));
    }

    #[test]
    fn test_credit_card_detection() {
        // Note: These are formatted patterns for detection, not real card numbers.
        // The last two entries are the short (13 digit) and long (19 digit) variants.
        let accepted = [
            "4111-1111-1111-1111",
            "4111111111111111",
            "4111 1111 1111 1111",
            "1234567890123",
            "1234567890123456789",
        ];
        for sample in accepted.iter() {
            assert!(
                matches_pattern(sample, PatternType::CreditCard),
                "input: {:?}",
                sample
            );
        }

        // detect_pattern end-to-end
        assert_detects(
            &["4111-2222-3333-4444", "4111222233334444"],
            Some(PatternType::CreditCard),
        );

        // Negative cases: too short, then too long
        for sample in ["12345", "1234-5678-9012-3456-7890"].iter() {
            assert_ne!(
                detect_pattern(sample),
                Some(PatternType::CreditCard),
                "input: {:?}",
                sample
            );
        }
    }
}