provenant/finder/
emails.rs1use regex::Regex;
5use std::sync::LazyLock;
6
7use crate::models::LineNumber;
8
9use super::DetectionConfig;
10use super::host::is_good_email_domain;
11use super::junk_data::classify_email;
12
13#[derive(Debug, Clone, PartialEq)]
14pub struct EmailDetection {
15 pub email: String,
16 pub start_line: LineNumber,
17 pub end_line: LineNumber,
18}
19
20static EMAILS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
21 Regex::new(r"(?i)\b[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}\b").expect("valid email regex")
22});
23
24pub fn find_emails(text: &str, config: &DetectionConfig) -> Vec<EmailDetection> {
25 let mut detections = Vec::new();
26
27 for (line_index, line) in text.lines().enumerate() {
28 let line_number = LineNumber::from_0_indexed(line_index);
29 let normalized_line = line.replace("\\r\\n", "\\n").replace("\\r", "\\n");
30 for segment in normalized_line.split("\\n") {
31 for matched in EMAILS_REGEX.find_iter(segment) {
32 let email = matched.as_str().to_lowercase();
33 if !is_good_email_domain(&email) {
34 continue;
35 }
36 if !classify_email(&email) {
37 continue;
38 }
39
40 detections.push(EmailDetection {
41 email,
42 start_line: line_number,
43 end_line: line_number,
44 });
45 }
46 }
47 }
48
49 let mut detections = if config.unique {
50 let mut seen = std::collections::HashSet::<String>::new();
51 detections
52 .into_iter()
53 .filter(|d| seen.insert(d.email.clone()))
54 .collect::<Vec<_>>()
55 } else {
56 detections
57 };
58
59 if config.max_emails > 0 && detections.len() > config.max_emails {
60 let mut seen = std::collections::HashSet::<String>::new();
61 detections.retain(|d| seen.insert(d.email.clone()));
62 detections.truncate(config.max_emails);
63 }
64
65 detections
66}