// provenant/finder/mod.rs

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0

4mod emails;
5mod host;
6mod junk_data;
7mod urls;
8
9pub use emails::find_emails;
10pub use urls::find_urls;
11
/// Options passed by reference to [`find_emails`] and [`find_urls`].
///
/// Build one with [`Default::default`] and override individual fields with
/// struct-update syntax, e.g. `DetectionConfig { max_urls: 2, ..Default::default() }`.
#[derive(Debug, Clone)]
pub struct DetectionConfig {
    /// Cap on the number of emails returned by `find_emails`
    /// (the tests in this file expect 2 results from 3 candidates when set to 2).
    pub max_emails: usize,
    /// Cap on the number of URLs returned by `find_urls`
    /// (the tests in this file expect 2 results from 3 candidates when set to 2).
    pub max_urls: usize,
    /// NOTE(review): presumably requests de-duplication of detections; the
    /// code that reads this flag lives in the submodules — confirm there.
    pub unique: bool,
}

19impl Default for DetectionConfig {
20    fn default() -> Self {
21        Self {
22            max_emails: 50,
23            max_urls: 50,
24            unique: true,
25        }
26    }
27}
28
/// Behavioural tests for the re-exported `find_emails` / `find_urls` entry
/// points. Each test feeds a small text sample and pins the exact detections
/// (or the absence of detections) expected for it.
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// Three candidate addresses with `max_emails: 2` yield two results.
    #[test]
    fn test_find_emails_threshold() {
        let text = "a@b.com\nc@d.com\ne@f.com\n";
        let config = DetectionConfig {
            max_emails: 2,
            ..Default::default()
        };
        let emails = find_emails(text, &config);
        assert_eq!(emails.len(), 2);
        assert_eq!(emails[0].email, "a@b.com");
        assert_eq!(emails[0].start_line, LineNumber::ONE);
    }

    /// Three candidate URLs with `max_urls: 2` yield the first two, in order.
    #[test]
    fn test_find_urls_threshold() {
        let text = "http://a.com\nhttp://b.com\nhttp://c.com\n";
        let config = DetectionConfig {
            max_urls: 2,
            ..Default::default()
        };
        let urls = find_urls(text, &config);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine hostname is dropped.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let text = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// The raw string contains literal `\n` sequences (backslash + `n`), not
    /// newlines; decorator-like tokens between them must not be reported.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let text = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        let values: Vec<_> = emails.into_iter().map(|email| email.email).collect();
        assert_eq!(
            values,
            vec![
                "global_writer@email.com".to_string(),
                "user5@email.com".to_string(),
            ]
        );
    }

    /// R slot-access expressions (`object@slot.name`) are not emails.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let text = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    /// An email whose local part starts with `ftp.` is not reported as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let text = "See ftp.mtuci@gmail.com for details.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname is reported with an `http://` prefix.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let text = "Mirror: ftp.gnu.org/gnu/tar/";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs joined by a literal backslash-`n` are reported separately.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let text = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        let values: Vec<_> = urls.into_iter().map(|url| url.url).collect();
        assert_eq!(
            values,
            vec![
                "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency".to_string(),
                "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"
                    .to_string(),
            ]
        );
    }

    /// `user:{PLACEHOLDER}@` userinfo is removed from the reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let text = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as above with the braces percent-encoded (`%7B` / `%7D`).
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let text = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A plain URL and its credential-templated twin collapse to one result
    /// under the default configuration.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let text = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Trailing backticks are not part of the reported URL.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let text = "Docs: https://github.com/example/project.git``";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// The closing brace of a `url{...}` wrapper is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let text = r#"\\url{https://dplyr.tidyverse.org}"#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    /// The `}{label}` tail of an `href{...}{...}` wrapper is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let text = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    /// Two consecutive closing braces are both stripped.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let text = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    /// A closing brace followed by sentence punctuation (`}.`) is stripped.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let text = r#"\\url{https://fred.stlouisfed.org/}."#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    /// A balanced `${name}` placeholder in the query string is kept verbatim.
    #[test]
    fn test_find_urls_keeps_closed_template_placeholders() {
        let text =
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(
            urls[0].url,
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}"
        );
    }

    /// A `${{ ... }}` expression trailing the path is cut off, together with
    /// the slash that preceded it.
    #[test]
    fn test_find_urls_trims_open_template_suffixes() {
        let text =
            "https://github.com/flutter/flutter/pull/${{ github.event.pull_request.number }}";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/flutter/flutter/pull");
    }

    /// Markdown `**emphasis**` inside the host makes the candidate invalid.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let text = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// An asterisk in the query string does not truncate or reject the URL.
    #[test]
    fn test_find_urls_keeps_query_asterisk_in_url() {
        let text = "http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(
            urls[0].url,
            "http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22"
        );
    }

    /// A real URL whose path ends in `.fill` is kept; the markdown link title
    /// after the space is not part of it.
    #[test]
    fn test_find_urls_keeps_npm_package_url_ending_with_fill() {
        let text = "[npm](https://www.npmjs.com/package/lodash.fill \"See the npm package\")";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://www.npmjs.com/package/lodash.fill");
    }

    /// A `chrome://` scheme embedded in the path marks the whole candidate as
    /// an artifact; nothing is reported.
    #[test]
    fn test_find_urls_drops_nested_scheme_artifact_in_path() {
        let text =
            "https://getfirebug.com/releases/lite/latest/skin/xp/chrome://firebug/skin/group.gif";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A web.archive.org capture URL with an embedded `http://` target is
    /// kept whole as a single URL.
    #[test]
    fn test_find_urls_keeps_wayback_archive_capture_with_embedded_http_target() {
        let text = "http://web.archive.org/web/20160201063255/http://download.microsoft.com/download/foo.exe";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(
            urls[0].url,
            "http://web.archive.org/web/20160201063255/http://download.microsoft.com/download/foo.exe"
        );
    }

    /// Same as above for an embedded `https://` target.
    #[test]
    fn test_find_urls_keeps_wayback_archive_capture_with_embedded_https_target() {
        let text = "https://web.archive.org/web/20231103044404/https://raphlinus.github.io/graphics/2020/04/21/blurred-rounded-rects.html";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(
            urls[0].url,
            "https://web.archive.org/web/20231103044404/https://raphlinus.github.io/graphics/2020/04/21/blurred-rounded-rects.html"
        );
    }

    /// A host position occupied by a code expression (`os.environ[...]`) is
    /// not reported.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let text = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// Tokens whose "domain" is a file name (`index.html`, `.tar.gz`) are
    /// dropped; the real address survives.
    #[test]
    fn test_find_emails_ignores_file_like_domains() {
        let text = "s@index.html version@.tar.gz real@rust-lang.org";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        let values: Vec<_> = emails.into_iter().map(|email| email.email).collect();
        assert_eq!(values, vec!["real@rust-lang.org".to_string()]);
    }

    /// Asset names like `@2x.png.img.tmpl` and `this@mockk.projectdir` are
    /// dropped; `git@github.com` is kept.
    #[test]
    fn test_find_emails_ignores_generated_template_asset_tokens() {
        let text = "icon-app-20x20@2x.png.img.tmpl git@github.com this@mockk.projectdir";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        let values: Vec<_> = emails.into_iter().map(|email| email.email).collect();
        assert_eq!(values, vec!["git@github.com".to_string()]);
    }

    /// Go identifiers that start with `sftp.` are not URLs.
    #[test]
    fn test_find_urls_ignores_sftp_go_identifiers() {
        let text = "Use sftp.Client to connect via sftp.Config with sftp.c.Stat()";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);
        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// Scheme-less `ftp.` hostnames are found after `: ` and inside
    /// parentheses; the closing parenthesis is not part of the URL.
    #[test]
    fn test_find_urls_keeps_ftp_hostname_after_punctuation() {
        let text = "Download: ftp.gnu.org/gnu/tar/ and also (ftp.mozilla.org/pub/)";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        let values: Vec<_> = urls.into_iter().map(|url| url.url).collect();
        assert_eq!(
            values,
            vec![
                "http://ftp.gnu.org/gnu/tar/".to_string(),
                "http://ftp.mozilla.org/pub/".to_string(),
            ]
        );
    }

    /// Hosts that look like file names (`ftp.sftp`, `www.classes.hint`,
    /// `www.conf.default`) are dropped; the real URL survives.
    #[test]
    fn test_find_urls_ignores_file_like_fake_hosts() {
        let text = "http://ftp.sftp/ http://www.classes.hint/ http://www.conf.default/ https://rust-lang.org/real";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        let values: Vec<_> = urls.into_iter().map(|url| url.url).collect();
        assert_eq!(values, vec!["https://rust-lang.org/real".to_string()]);
    }

    /// A `...` placeholder in the host position is not a URL.
    #[test]
    fn test_find_urls_ignores_ellipsis_placeholder_hosts() {
        let text = "Fetch https://.../script.py after download.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A host that begins with an unclosed `{` placeholder is not a URL.
    #[test]
    fn test_find_urls_ignores_braced_placeholder_hosts() {
        let text = "Download from http://{httpserver.host/ when tests boot.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }
}