1use super::{validation::validate_entity, Recognizer, RecognizerResult};
6use crate::types::EntityType;
7use anyhow::Result;
8use lazy_static::lazy_static;
9use regex::Regex;
10use std::collections::HashMap;
11
12#[derive(Debug, Clone)]
14pub struct PatternRecognizer {
15 name: String,
16 patterns: HashMap<EntityType, Vec<CompiledPattern>>,
17 min_score: f32,
18}
19
20#[derive(Debug, Clone)]
21struct CompiledPattern {
22 regex: Regex,
23 score: f32,
24 context_words: Vec<String>,
25}
26
27impl PatternRecognizer {
28 pub fn new() -> Self {
30 let mut recognizer = Self {
31 name: "PatternRecognizer".to_string(),
32 patterns: HashMap::new(),
33 min_score: 0.5,
34 };
35 recognizer.load_default_patterns();
36 recognizer
37 }
38
39 pub fn with_name(name: impl Into<String>) -> Self {
41 let mut recognizer = Self::new();
42 recognizer.name = name.into();
43 recognizer
44 }
45
46 pub fn with_min_score(mut self, min_score: f32) -> Self {
48 self.min_score = min_score;
49 self
50 }
51
52 pub fn add_pattern(
54 &mut self,
55 entity_type: EntityType,
56 pattern: &str,
57 score: f32,
58 ) -> Result<()> {
59 let regex = Regex::new(pattern)?;
60 let compiled = CompiledPattern {
61 regex,
62 score,
63 context_words: vec![],
64 };
65 self.patterns.entry(entity_type).or_default().push(compiled);
66 Ok(())
67 }
68
69 pub fn add_pattern_with_context(
71 &mut self,
72 entity_type: EntityType,
73 pattern: &str,
74 score: f32,
75 context_words: Vec<String>,
76 ) -> Result<()> {
77 let regex = Regex::new(pattern)?;
78 let compiled = CompiledPattern {
79 regex,
80 score,
81 context_words,
82 };
83 self.patterns.entry(entity_type).or_default().push(compiled);
84 Ok(())
85 }
86
87 fn load_default_patterns(&mut self) {
89 let _ = self.add_pattern(
91 EntityType::EmailAddress,
92 r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
93 0.8,
94 );
95
96 let _ = self.add_pattern(
102 EntityType::PhoneNumber,
103 r"\(\d{3}\)[-.\s]?\d{3}[-.\s]?\d{4}\b|\b\d{3}[-.\s]\d{3}[-.\s]?\d{4}\b",
104 0.7,
105 );
106
107 let _ = self.add_pattern(
109 EntityType::CreditCard,
110 r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
111 0.9,
112 );
113
114 let _ = self.add_pattern(EntityType::UsSsn, r"\b\d{3}-\d{2}-\d{4}\b", 0.9);
117
118 let _ = self.add_pattern(
120 EntityType::IpAddress,
121 r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
122 0.8,
123 );
124
125 let _ = self.add_pattern(
127 EntityType::Url,
128 r"\b(?:https?://|www\.)[a-zA-Z0-9][-a-zA-Z0-9]*(?:\.[a-zA-Z0-9][-a-zA-Z0-9]*)+(?:/[^\s]*)?\b",
129 0.7,
130 );
131
132 let _ = self.add_pattern(
134 EntityType::DomainName,
135 r"\b(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,}\b",
136 0.7,
137 );
138
139 let _ = self.add_pattern(
141 EntityType::Guid,
142 r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b",
143 0.9,
144 );
145
146 let _ = self.add_pattern(
148 EntityType::MacAddress,
149 r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b",
150 0.9,
151 );
152
153 let _ = self.add_pattern_with_context(
155 EntityType::UkNhs,
156 r"\b(?:\d{3}\s?\d{3}\s?\d{4}|\d{10})\b",
157 0.6,
158 vec![
159 "NHS".to_string(),
160 "patient".to_string(),
161 "health".to_string(),
162 ],
163 );
164
165 let _ = self.add_pattern(
167 EntityType::UkNino,
168 r"\b[A-CEGHJ-PR-TW-Z]{1}[A-CEGHJ-NPR-TW-Z]{1}\d{6}[A-D]{1}\b",
169 0.85,
170 );
171
172 let _ = self.add_pattern(
174 EntityType::UkPostcode,
175 r"\b[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}\b",
176 0.75,
177 );
178
179 let _ = self.add_pattern(EntityType::UkSortCode, r"\b\d{2}-\d{2}-\d{2}\b", 0.7);
181
182 let _ = self.add_pattern(
184 EntityType::IbanCode,
185 r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b",
186 0.75,
187 );
188
189 let _ = self.add_pattern(
191 EntityType::BtcAddress,
192 r"\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,62}\b",
193 0.85,
194 );
195
196 let _ = self.add_pattern(EntityType::EthAddress, r"\b0x[a-fA-F0-9]{40}\b", 0.9);
198
199 let _ = self.add_pattern(EntityType::Md5Hash, r"\b[a-fA-F0-9]{32}\b", 0.6);
201
202 let _ = self.add_pattern(EntityType::Sha1Hash, r"\b[a-fA-F0-9]{40}\b", 0.6);
204
205 let _ = self.add_pattern(EntityType::Sha256Hash, r"\b[a-fA-F0-9]{64}\b", 0.6);
207
208 let _ = self.add_pattern(
210 EntityType::UsZipCode,
211 r"\b\d{5}(?:-\d{4})?\b",
212 0.6, );
214
215 let _ = self.add_pattern_with_context(
217 EntityType::PoBox,
218 r"\b(?:P\.?\s?O\.?|POST\s+OFFICE)\s*BOX\s+\d+\b",
219 0.85,
220 vec![
221 "address".to_string(),
222 "mail".to_string(),
223 "ship".to_string(),
224 ],
225 );
226
227 let _ = self.add_pattern(
229 EntityType::Isbn,
230 r"\b(?:ISBN(?:-1[03])?:?\s*)?(?:\d{9}[\dX]|\d{13})\b",
231 0.8,
232 );
233
234 let _ = self.add_pattern_with_context(
236 EntityType::PassportNumber,
237 r"\b[A-Z]{1,2}\d{6,9}\b",
238 0.7,
239 vec!["passport".to_string(), "travel".to_string()],
240 );
241
242 let _ = self.add_pattern_with_context(
244 EntityType::MedicalRecordNumber,
245 r"\b(?:MRN|Medical\s*Record|Patient\s*ID):?\s*[A-Z0-9]{6,12}\b",
246 0.85,
247 vec![
248 "patient".to_string(),
249 "medical".to_string(),
250 "hospital".to_string(),
251 ],
252 );
253
254 let _ = self.add_pattern_with_context(
256 EntityType::Age,
257 r"\b(?:age|aged|years old):?\s*(\d{1,3})\b",
258 0.8,
259 vec!["years".to_string(), "old".to_string(), "age".to_string()],
260 );
261
262 let _ = self.add_pattern(
264 EntityType::DateTime,
265 r"\b\d{4}-\d{2}-\d{2}(?:[T\s]\d{2}:\d{2}(?::\d{2})?)?\b",
266 0.5,
267 );
268
269 let _ = self.add_pattern_with_context(
275 EntityType::UsDriverLicense,
276 r"\b[A-Z]\d{6,8}\b|\b[A-Z]\d{3}-\d{4}-\d{4}\b",
277 0.4,
278 vec![
279 "driver".to_string(),
280 "license".to_string(),
281 "DL".to_string(),
282 "DMV".to_string(),
283 ],
284 );
285
286 let _ = self.add_pattern_with_context(
289 EntityType::UsPassport,
290 r"\b[A-Z]?\d{9}\b",
291 0.4,
292 vec![
293 "passport".to_string(),
294 "travel".to_string(),
295 "state department".to_string(),
296 ],
297 );
298
299 let _ = self.add_pattern_with_context(
302 EntityType::UsBankNumber,
303 r"\b\d{8,17}\b",
304 0.3,
305 vec![
306 "account".to_string(),
307 "bank".to_string(),
308 "routing".to_string(),
309 "checking".to_string(),
310 "savings".to_string(),
311 ],
312 );
313
314 let _ = self.add_pattern(
317 EntityType::UkDriverLicense,
318 r"\b[A-Z]{5}\d{6}[A-Z0-9]{2}\d[A-Z]{2}\s?\d{2}\b",
319 0.85,
320 );
321
322 let _ = self.add_pattern_with_context(
325 EntityType::UkPassportNumber,
326 r"\b\d{9}\b",
327 0.3,
328 vec![
329 "passport".to_string(),
330 "travel".to_string(),
331 "HMPO".to_string(),
332 ],
333 );
334
335 let _ = self.add_pattern(
337 EntityType::UkPhoneNumber,
338 r"\b(?:0[1-3]\d{2,3}\s?\d{3}\s?\d{4}|0[1-3]\d{2,3}\s?\d{6,7})\b",
339 0.75,
340 );
341
342 let _ = self.add_pattern(
344 EntityType::UkMobileNumber,
345 r"\b07\d{3}\s?\d{3}\s?\d{3}\b",
346 0.8,
347 );
348
349 let _ = self.add_pattern_with_context(
352 EntityType::UkCompanyNumber,
353 r"\b(?:\d{8}|[A-Z]{2}\d{6})\b",
354 0.3,
355 vec![
356 "company".to_string(),
357 "companies house".to_string(),
358 "registration".to_string(),
359 "CRN".to_string(),
360 ],
361 );
362
363 let _ = self.add_pattern_with_context(
365 EntityType::MedicalLicense,
366 r"\b(?:MD|DO|NP|PA|RN|LPN)[-\s]?\d{5,10}\b",
367 0.8,
368 vec![
369 "license".to_string(),
370 "medical".to_string(),
371 "physician".to_string(),
372 "doctor".to_string(),
373 "nurse".to_string(),
374 ],
375 );
376
377 let _ = self.add_pattern_with_context(
380 EntityType::CryptoWallet,
381 r"\b[LMr3][a-km-zA-HJ-NP-Z1-9]{25,34}\b",
382 0.75,
383 vec![
384 "wallet".to_string(),
385 "crypto".to_string(),
386 "address".to_string(),
387 "coin".to_string(),
388 ],
389 );
390 }
391
392 fn check_context(&self, text: &str, start: usize, end: usize, context_words: &[String]) -> f32 {
394 if context_words.is_empty() {
395 return 0.0;
396 }
397
398 let context_start = start.saturating_sub(50);
400 let context_end = (end + 50).min(text.len());
401 let context = &text[context_start..context_end].to_lowercase();
402
403 let matches = context_words
405 .iter()
406 .filter(|word| context.contains(&word.to_lowercase()))
407 .count();
408
409 (matches as f32 / context_words.len() as f32) * 0.3
411 }
412}
413
414impl Default for PatternRecognizer {
415 fn default() -> Self {
416 Self::new()
417 }
418}
419
420impl Recognizer for PatternRecognizer {
421 fn name(&self) -> &str {
422 &self.name
423 }
424
425 fn supported_entities(&self) -> &[EntityType] {
426 lazy_static! {
427 static ref SUPPORTED: Vec<EntityType> = vec![
428 EntityType::EmailAddress,
430 EntityType::PhoneNumber,
431 EntityType::IpAddress,
432 EntityType::Url,
433 EntityType::DomainName,
434 EntityType::CreditCard,
436 EntityType::IbanCode,
437 EntityType::UsBankNumber,
438 EntityType::UsSsn,
440 EntityType::UsDriverLicense,
441 EntityType::UsPassport,
442 EntityType::UsZipCode,
443 EntityType::UkNhs,
445 EntityType::UkNino,
446 EntityType::UkPostcode,
447 EntityType::UkSortCode,
448 EntityType::UkDriverLicense,
449 EntityType::UkPassportNumber,
450 EntityType::UkPhoneNumber,
451 EntityType::UkMobileNumber,
452 EntityType::UkCompanyNumber,
453 EntityType::MedicalLicense,
455 EntityType::MedicalRecordNumber,
456 EntityType::PassportNumber,
458 EntityType::Age,
459 EntityType::Isbn,
460 EntityType::PoBox,
461 EntityType::DateTime,
462 EntityType::CryptoWallet,
464 EntityType::BtcAddress,
465 EntityType::EthAddress,
466 EntityType::Guid,
468 EntityType::MacAddress,
469 EntityType::Md5Hash,
470 EntityType::Sha1Hash,
471 EntityType::Sha256Hash,
472 ];
473 }
474 &SUPPORTED
475 }
476
477 fn analyze(&self, text: &str, _language: &str) -> Result<Vec<RecognizerResult>> {
478 let mut results = Vec::new();
479
480 for (entity_type, patterns) in &self.patterns {
481 for pattern in patterns {
482 for capture in pattern.regex.captures_iter(text) {
483 if let Some(matched) = capture.get(0) {
484 let start = matched.start();
485 let end = matched.end();
486 let matched_text = matched.as_str();
487
488 let mut score = pattern.score;
490
491 if !pattern.context_words.is_empty() {
493 score += self.check_context(text, start, end, &pattern.context_words);
494 score = score.min(1.0); }
496
497 let validation_factor = validate_entity(entity_type, matched_text);
500 score *= validation_factor;
501
502 if score >= self.min_score {
503 results.push(
504 RecognizerResult::new(
505 entity_type.clone(),
506 start,
507 end,
508 score,
509 self.name(),
510 )
511 .with_text(text),
512 );
513 }
514 }
515 }
516 }
517 }
518
519 Ok(results)
520 }
521
522 fn min_score(&self) -> f32 {
523 self.min_score
524 }
525}
526
#[cfg(test)]
mod tests {
    use super::*;

    /// Analyzes `text` as English with a freshly built default recognizer.
    fn scan(text: &str) -> Vec<RecognizerResult> {
        PatternRecognizer::new().analyze(text, "en").unwrap()
    }

    /// Reports whether any entry of `results` has the entity type `wanted`.
    fn contains_entity(results: &[RecognizerResult], wanted: EntityType) -> bool {
        results.iter().any(|hit| hit.entity_type == wanted)
    }

    #[test]
    fn test_email_detection() {
        let results = scan("Contact me at john.doe@example.com for details");

        let emails: Vec<_> = results
            .iter()
            .filter(|hit| hit.entity_type == EntityType::EmailAddress)
            .collect();

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].text, Some("john.doe@example.com".to_string()));
        assert!(emails[0].score >= 0.8);
    }

    #[test]
    fn test_phone_detection() {
        let results = scan("Call me at (555) 123-4567");

        assert!(!results.is_empty());
        assert!(contains_entity(&results, EntityType::PhoneNumber));
    }

    #[test]
    fn test_credit_card_detection() {
        let results = scan("Card number: 4532015112830366");

        assert!(!results.is_empty());
        assert!(contains_entity(&results, EntityType::CreditCard));
    }

    #[test]
    fn test_ssn_detection() {
        let results = scan("SSN: 123-45-6789");

        assert!(!results.is_empty());
        assert!(contains_entity(&results, EntityType::UsSsn));
    }

    #[test]
    fn test_uk_nhs_with_context() {
        let results = scan("NHS patient number is 401 023 2137");

        assert!(!results.is_empty());
        let nhs_hit = results
            .iter()
            .find(|hit| hit.entity_type == EntityType::UkNhs);
        assert!(nhs_hit.is_some(), "Should detect NHS number with context");

        // Surrounding context words must lift the score above the base 0.6.
        if let Some(hit) = nhs_hit {
            assert!(hit.score > 0.6);
        }
    }

    #[test]
    fn test_uk_nino_detection() {
        let results = scan("NINO: AB123456C");

        assert!(!results.is_empty());
        assert!(contains_entity(&results, EntityType::UkNino));
    }

    #[test]
    fn test_multiple_entities() {
        let results = scan("Email john@example.com, phone (555) 123-4567, SSN 123-45-6789");

        assert!(results.len() >= 3);
        assert!(contains_entity(&results, EntityType::EmailAddress));
        assert!(contains_entity(&results, EntityType::PhoneNumber));
        assert!(contains_entity(&results, EntityType::UsSsn));
    }

    #[test]
    fn test_custom_pattern() {
        let mut recognizer = PatternRecognizer::new();
        recognizer
            .add_pattern(
                EntityType::Custom("CUSTOM_ID".to_string()),
                r"\bCID-\d{6}\b",
                0.9,
            )
            .unwrap();

        let results = recognizer
            .analyze("Your customer ID is CID-123456", "en")
            .unwrap();

        let found_custom = results
            .iter()
            .any(|hit| matches!(hit.entity_type, EntityType::Custom(_)));
        assert!(found_custom);
    }

    #[test]
    fn test_min_score_filtering() {
        let recognizer = PatternRecognizer::new().with_min_score(0.9);
        let results = recognizer.analyze("Date: 2024-01-15", "en").unwrap();

        // With the threshold raised to 0.9, no date results may be reported.
        let date_hits = results
            .iter()
            .filter(|hit| hit.entity_type == EntityType::DateTime)
            .count();
        assert_eq!(date_hits, 0);
    }

    #[test]
    fn test_uk_driver_license_detection() {
        let results = scan("UK DL: MORGA753116SM9IJ 35");

        assert!(
            contains_entity(&results, EntityType::UkDriverLicense),
            "Should detect UK driver's license"
        );
    }

    #[test]
    fn test_uk_mobile_detection() {
        let results = scan("Call me on 07700 900123");

        assert!(
            contains_entity(&results, EntityType::UkMobileNumber),
            "Should detect UK mobile number"
        );
    }

    #[test]
    fn test_uk_phone_detection() {
        let results = scan("Office: 0207 123 4567");

        assert!(
            contains_entity(&results, EntityType::UkPhoneNumber),
            "Should detect UK phone number"
        );
    }

    #[test]
    fn test_medical_license_detection() {
        let results = scan("Medical license: MD-123456789");

        assert!(
            contains_entity(&results, EntityType::MedicalLicense),
            "Should detect medical license"
        );
    }

    #[test]
    fn test_supported_entities_count() {
        let recognizer = PatternRecognizer::new();
        let supported_count = recognizer.supported_entities().len();

        assert_eq!(
            supported_count,
            36,
            "Should support 36 pattern-based entity types, got {}",
            supported_count
        );
    }
}