Skip to main content

provenant/finder/
mod.rs

1mod emails;
2#[cfg(all(test, feature = "golden-tests"))]
3mod golden_test;
4mod host;
5mod junk_data;
6mod urls;
7
8pub use emails::find_emails;
9pub use urls::find_urls;
10
/// Settings passed by reference to [`find_emails`] and [`find_urls`].
///
/// Use [`DetectionConfig::default`] together with struct-update syntax to
/// override individual fields.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Upper bound on the number of emails reported by `find_emails`
    /// (the threshold test in this module expects exactly this many results
    /// when more candidates are present).
    pub max_emails: usize,
    /// Upper bound on the number of URLs reported by `find_urls`
    /// (same expectation as `max_emails`, per the URL threshold test).
    pub max_urls: usize,
    /// NOTE(review): presumably enables de-duplication of detected values;
    /// the logic lives in the `emails` / `urls` submodules — confirm there.
    pub unique: bool,
}
17
18impl Default for DetectionConfig {
19    fn default() -> Self {
20        Self {
21            max_emails: 50,
22            max_urls: 50,
23            unique: true,
24        }
25    }
26}
27
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// With `max_emails` set to 2, three candidate addresses yield two results,
    /// and the first one is reported on line one.
    #[test]
    fn test_find_emails_threshold() {
        let capped = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let found = find_emails("a@b.com\nc@d.com\ne@f.com\n", &capped);

        assert_eq!(found.len(), 2);
        let first = &found[0];
        assert_eq!(first.email, "a@b.com");
        assert_eq!(first.start_line, LineNumber::ONE);
    }

    /// With `max_urls` set to 2, three candidate URLs yield the first two.
    #[test]
    fn test_find_urls_threshold() {
        let capped = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let hits = find_urls("http://a.com\nhttp://b.com\nhttp://c.com\n", &capped);

        assert_eq!(hits.len(), 2);
        assert_eq!(hits[0].url, "http://a.com/");
        assert_eq!(hits[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine host is dropped; the public one stays.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let defaults = DetectionConfig::default();
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let found = find_emails(input, &defaults);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].email, "admin@rust-lang.org");
    }

    /// An email whose local part starts with `ftp.` must not surface as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let defaults = DetectionConfig::default();
        let urls = find_urls("See ftp.mtuci@gmail.com for details.", &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A bare `ftp.` hostname with a path is still detected and normalised.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let defaults = DetectionConfig::default();
        let urls = find_urls("Mirror: ftp.gnu.org/gnu/tar/", &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs joined by a literal backslash-n sequence come back separately.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let defaults = DetectionConfig::default();
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";

        let extracted: Vec<_> = find_urls(input, &defaults)
            .into_iter()
            .map(|hit| hit.url)
            .collect();
        let expected = vec![
            String::from("https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency"),
            String::from("https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"),
        ];

        assert_eq!(extracted, expected);
    }

    /// `user:{TOKEN}@` credentials are removed from the reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let urls = find_urls(
            "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git",
            &defaults,
        );

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as above, with the braces percent-encoded (`%7B` / `%7D`).
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let urls = find_urls(
            "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git",
            &defaults,
        );

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A plain URL and its credentialed twin collapse into a single result.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let defaults = DetectionConfig::default();
        let input = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Trailing backticks after a URL are not part of the reported value.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let defaults = DetectionConfig::default();
        let urls = find_urls("Docs: https://github.com/example/project.git``", &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }
}