provenant/finder/
emails.rs1use regex::Regex;
2use std::sync::LazyLock;
3
4use crate::models::LineNumber;
5
6use super::DetectionConfig;
7use super::host::is_good_email_domain;
8use super::junk_data::classify_email;
9
10#[derive(Debug, Clone, PartialEq)]
11pub struct EmailDetection {
12 pub email: String,
13 pub start_line: LineNumber,
14 pub end_line: LineNumber,
15}
16
17static EMAILS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
18 Regex::new(r"(?i)\b[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}\b").expect("valid email regex")
19});
20
21pub fn find_emails(text: &str, config: &DetectionConfig) -> Vec<EmailDetection> {
22 let mut detections = Vec::new();
23
24 for (line_index, line) in text.lines().enumerate() {
25 let line_number = LineNumber::from_0_indexed(line_index);
26 let normalized_line = line.replace("\\r\\n", "\\n").replace("\\r", "\\n");
27 for segment in normalized_line.split("\\n") {
28 for matched in EMAILS_REGEX.find_iter(segment) {
29 let email = matched.as_str().to_lowercase();
30 if !is_good_email_domain(&email) {
31 continue;
32 }
33 if !classify_email(&email) {
34 continue;
35 }
36
37 detections.push(EmailDetection {
38 email,
39 start_line: line_number,
40 end_line: line_number,
41 });
42 }
43 }
44 }
45
46 let mut detections = if config.unique {
47 let mut seen = std::collections::HashSet::<String>::new();
48 detections
49 .into_iter()
50 .filter(|d| seen.insert(d.email.clone()))
51 .collect::<Vec<_>>()
52 } else {
53 detections
54 };
55
56 if config.max_emails > 0 && detections.len() > config.max_emails {
57 detections.truncate(config.max_emails);
58 }
59
60 detections
61}