Skip to main content

provenant/finder/
emails.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4use crate::models::LineNumber;
5
6use super::DetectionConfig;
7use super::host::is_good_email_domain;
8use super::junk_data::classify_email;
9
10#[derive(Debug, Clone, PartialEq)]
11pub struct EmailDetection {
12    pub email: String,
13    pub start_line: LineNumber,
14    pub end_line: LineNumber,
15}
16
17static EMAILS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
18    Regex::new(r"(?i)\b[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}\b").expect("valid email regex")
19});
20
21pub fn find_emails(text: &str, config: &DetectionConfig) -> Vec<EmailDetection> {
22    let mut detections = Vec::new();
23
24    for (line_index, line) in text.lines().enumerate() {
25        let line_number = LineNumber::from_0_indexed(line_index);
26        let normalized_line = line.replace("\\r\\n", "\\n").replace("\\r", "\\n");
27        for segment in normalized_line.split("\\n") {
28            for matched in EMAILS_REGEX.find_iter(segment) {
29                let email = matched.as_str().to_lowercase();
30                if !is_good_email_domain(&email) {
31                    continue;
32                }
33                if !classify_email(&email) {
34                    continue;
35                }
36
37                detections.push(EmailDetection {
38                    email,
39                    start_line: line_number,
40                    end_line: line_number,
41                });
42            }
43        }
44    }
45
46    let mut detections = if config.unique {
47        let mut seen = std::collections::HashSet::<String>::new();
48        detections
49            .into_iter()
50            .filter(|d| seen.insert(d.email.clone()))
51            .collect::<Vec<_>>()
52    } else {
53        detections
54    };
55
56    if config.max_emails > 0 && detections.len() > config.max_emails {
57        detections.truncate(config.max_emails);
58    }
59
60    detections
61}