// provenant/finder/mod.rs

1mod emails;
2#[cfg(all(test, feature = "golden-tests"))]
3mod golden_test;
4mod host;
5mod junk_data;
6mod urls;
7
8pub use emails::find_emails;
9pub use urls::find_urls;
10
/// Settings passed by reference to `find_emails` and `find_urls`.
#[derive(Debug, Clone)]
pub struct DetectionConfig {
    /// Cap on the number of detections returned by `find_emails`
    /// (with `max_emails: 2`, three candidate addresses yield two results).
    pub max_emails: usize,
    /// Cap on the number of detections returned by `find_urls`
    /// (with `max_urls: 2`, three candidate URLs yield two results).
    pub max_urls: usize,
    /// NOTE(review): presumably suppresses duplicate detections when `true`;
    /// the handling lives in the finder submodules, so confirm it there.
    pub unique: bool,
}
17
18impl Default for DetectionConfig {
19    fn default() -> Self {
20        Self {
21            max_emails: 50,
22            max_urls: 50,
23            unique: true,
24        }
25    }
26}
27
// Unit tests for the re-exported `find_emails` / `find_urls` entry points.
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};

    /// `max_emails` caps the result: three candidate addresses, two returned.
    /// The first result is the first address in the text, reported on line 1.
    #[test]
    fn test_find_emails_threshold() {
        let text = "a@b.com\nc@d.com\ne@f.com\n";
        let config = DetectionConfig {
            max_emails: 2,
            ..Default::default()
        };
        let emails = find_emails(text, &config);
        assert_eq!(emails.len(), 2);
        assert_eq!(emails[0].email, "a@b.com");
        assert_eq!(emails[0].start_line, 1);
    }

    /// `max_urls` caps the result: three candidate URLs, two returned, in
    /// input order. The expected values carry a trailing `/` that the input
    /// text does not have.
    #[test]
    fn test_find_urls_threshold() {
        let text = "http://a.com\nhttp://b.com\nhttp://c.com\n";
        let config = DetectionConfig {
            max_urls: 2,
            ..Default::default()
        };
        let urls = find_urls(text, &config);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address whose domain ends in `.local` is not reported; the ordinary
    /// address on the other line is.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let text = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// An email address whose local part starts with `ftp.` must not be
    /// reported as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let text = "See ftp.mtuci@gmail.com for details.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname with a path is still detected; the
    /// expected value has `http://` in front of it.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let text = "Mirror: ftp.gnu.org/gnu/tar/";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs joined by a literal backslash-`n` (two characters, not a real
    /// newline) are reported as two separate URLs.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let text = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        let values: Vec<_> = urls.into_iter().map(|url| url.url).collect();
        assert_eq!(
            values,
            vec![
                "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency".to_string(),
                "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"
                    .to_string(),
            ]
        );
    }

    /// A `user:{ACCESS_TOKEN}@` credential template is removed from the
    /// reported URL, leaving only the repository address.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let text = "Repo: https://user:{ACCESS_TOKEN}@github.com/apache/airflow.git";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/apache/airflow.git");
    }
}