Skip to main content

provenant/finder/
mod.rs

1mod emails;
2#[cfg(all(test, feature = "golden-tests"))]
3mod golden_test;
4mod host;
5mod junk_data;
6mod urls;
7
8pub use emails::find_emails;
9pub use urls::find_urls;
10
/// Options controlling how many email and URL detections a scan reports.
///
/// A reference to this struct is passed to [`find_emails`] and [`find_urls`].
/// `PartialEq`/`Eq` are derived so configurations can be compared directly
/// (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Cap on the number of email detections returned by one scan.
    ///
    /// The tests in this module show that `2` truncates three candidate
    /// addresses to the first two. The meaning of `0` is not visible here.
    pub max_emails: usize,
    /// Cap on the number of URL detections returned by one scan.
    ///
    /// The tests in this module show that `2` truncates three candidate URLs
    /// to the first two. The meaning of `0` is not visible here.
    pub max_urls: usize,
    /// NOTE(review): presumably asks the finders to drop duplicate detections;
    /// confirm against the `emails` and `urls` submodules.
    pub unique: bool,
}
17
18impl Default for DetectionConfig {
19    fn default() -> Self {
20        Self {
21            max_emails: 50,
22            max_urls: 50,
23            unique: true,
24        }
25    }
26}
27
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// With `max_emails` set to 2, only two of the three addresses are
    /// reported, and the first one is located on line one.
    #[test]
    fn test_find_emails_threshold() {
        let limited = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let emails = find_emails("a@b.com\nc@d.com\ne@f.com\n", &limited);

        assert_eq!(emails.len(), 2);
        let first = &emails[0];
        assert_eq!(first.email, "a@b.com");
        assert_eq!(first.start_line, LineNumber::ONE);
    }

    /// With `max_urls` set to 2, only the first two of the three URLs are
    /// reported, each with a trailing slash in the expected value.
    #[test]
    fn test_find_urls_threshold() {
        let limited = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let urls = find_urls("http://a.com\nhttp://b.com\nhttp://c.com\n", &limited);

        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine domain must not be reported.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let defaults = DetectionConfig::default();
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let emails = find_emails(input, &defaults);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// Literal backslash-`n` sequences in source-like text must not glue
    /// decorator names onto the neighbouring email addresses.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        // Same bytes as the raw form: every `\\n` is a backslash followed by `n`.
        let input =
            "email\": \"global_writer@email.com\\n@app.route\\n@csrf.exempt\\nuser5@email.com";
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_emails(input, &defaults)
            .into_iter()
            .map(|hit| hit.email)
            .collect();

        assert_eq!(values, ["global_writer@email.com", "user5@email.com"]);
    }

    /// A token that looks like an email but starts with `ftp.` is not a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let defaults = DetectionConfig::default();
        let urls = find_urls("See ftp.mtuci@gmail.com for details.", &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname with a path is still reported; the
    /// expected value carries an `http://` prefix.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let defaults = DetectionConfig::default();
        let urls = find_urls("Mirror: ftp.gnu.org/gnu/tar/", &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs separated by a literal backslash-`n` are reported separately.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        // Raw string: the `\n` in the middle is two characters, not a newline.
        let input = r"https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_urls(input, &defaults)
            .into_iter()
            .map(|hit| hit.url)
            .collect();

        assert_eq!(
            values,
            [
                "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency",
                "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html",
            ]
        );
    }

    /// A `user:{PLACEHOLDER}@` credential section is removed from the URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let input = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as above, with the braces percent-encoded as `%7B` / `%7D`.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let input = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A plain URL and its templated-credential twin collapse into a single
    /// detection under the default configuration.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        // The trailing `\` continues the literal and skips the next line's indent.
        let input = "https://github.com/example/project.git\n\
                     https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Backticks following a URL are not part of the reported value.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let defaults = DetectionConfig::default();
        let urls = find_urls("Docs: https://github.com/example/project.git``", &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A hostname containing markdown `**emphasis**` markers yields no URL.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let defaults = DetectionConfig::default();
        let input = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A "host" that is really a code expression (`os.environ[...]`) yields
    /// no URL.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let defaults = DetectionConfig::default();
        let input = r#"loginUrl = "http://os.environ['DD_BASE_URL']/login""#;
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }
}