Skip to main content

provenant/finder/
emails.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use regex::Regex;
5use std::sync::LazyLock;
6
7use crate::models::LineNumber;
8
9use super::DetectionConfig;
10use super::host::is_good_email_domain;
11use super::junk_data::classify_email;
12
13#[derive(Debug, Clone, PartialEq)]
14pub struct EmailDetection {
15    pub email: String,
16    pub start_line: LineNumber,
17    pub end_line: LineNumber,
18}
19
20static EMAILS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
21    Regex::new(r"(?i)\b[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}\b").expect("valid email regex")
22});
23
24pub fn find_emails(text: &str, config: &DetectionConfig) -> Vec<EmailDetection> {
25    let mut detections = Vec::new();
26
27    for (line_index, line) in text.lines().enumerate() {
28        let line_number = LineNumber::from_0_indexed(line_index);
29        let normalized_line = line.replace("\\r\\n", "\\n").replace("\\r", "\\n");
30        for segment in normalized_line.split("\\n") {
31            for matched in EMAILS_REGEX.find_iter(segment) {
32                let email = matched.as_str().to_lowercase();
33                if !is_good_email_domain(&email) {
34                    continue;
35                }
36                if !classify_email(&email) {
37                    continue;
38                }
39
40                detections.push(EmailDetection {
41                    email,
42                    start_line: line_number,
43                    end_line: line_number,
44                });
45            }
46        }
47    }
48
49    let mut detections = if config.unique {
50        let mut seen = std::collections::HashSet::<String>::new();
51        detections
52            .into_iter()
53            .filter(|d| seen.insert(d.email.clone()))
54            .collect::<Vec<_>>()
55    } else {
56        detections
57    };
58
59    if config.max_emails > 0 && detections.len() > config.max_emails {
60        let mut seen = std::collections::HashSet::<String>::new();
61        detections.retain(|d| seen.insert(d.email.clone()));
62        detections.truncate(config.max_emails);
63    }
64
65    detections
66}