Skip to main content

provenant/finder/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod emails;
5mod host;
6mod junk_data;
7mod urls;
8
9pub use emails::find_emails;
10pub use urls::find_urls;
11
/// Settings handed by reference to [`find_emails`] and [`find_urls`].
///
/// `PartialEq`/`Eq` are derived so that two configurations can be compared
/// directly (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Cap on how many email detections [`find_emails`] returns.
    pub max_emails: usize,
    /// Cap on how many URL detections [`find_urls`] returns.
    pub max_urls: usize,
    /// NOTE(review): presumably asks the finders to drop duplicate
    /// detections; the code that reads this flag is not in this file, so
    /// confirm against the `emails` / `urls` modules.
    pub unique: bool,
}
18
19impl Default for DetectionConfig {
20    fn default() -> Self {
21        Self {
22            max_emails: 50,
23            max_urls: 50,
24            unique: true,
25        }
26    }
27}
28
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// Three addresses with `max_emails: 2` must yield two hits, the first
    /// being `a@b.com` on line one.
    #[test]
    fn test_find_emails_threshold() {
        let capped = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let input = "a@b.com\nc@d.com\ne@f.com\n";
        let emails = find_emails(input, &capped);
        assert_eq!(emails.len(), 2);
        let first = &emails[0];
        assert_eq!(first.email, "a@b.com");
        assert_eq!(first.start_line, LineNumber::ONE);
    }

    /// Three URLs with `max_urls: 2` must yield the first two, each reported
    /// with a trailing slash.
    #[test]
    fn test_find_urls_threshold() {
        let capped = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let input = "http://a.com\nhttp://b.com\nhttp://c.com\n";
        let urls = find_urls(input, &capped);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine host is dropped while the public
    /// address on the other line is kept.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let defaults = DetectionConfig::default();
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let emails = find_emails(input, &defaults);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// The input is a raw string, so each `\n` is a literal backslash-n; the
    /// decorator-like tokens between the two real addresses must not be
    /// reported.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let defaults = DetectionConfig::default();
        let input = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;
        let emails = find_emails(input, &defaults);

        let expected = vec![
            String::from("global_writer@email.com"),
            String::from("user5@email.com"),
        ];
        let values: Vec<_> = emails.into_iter().map(|hit| hit.email).collect();
        assert_eq!(values, expected);
    }

    /// R slot-access expressions such as `element@arrow.fill` must produce no
    /// email detections at all.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let defaults = DetectionConfig::default();
        let input = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let emails = find_emails(input, &defaults);

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    /// An email address whose local part starts with `ftp.` must not be
    /// reported as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let defaults = DetectionConfig::default();
        let input = "See ftp.mtuci@gmail.com for details.";
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname with a path is reported once, with an
    /// `http://` prefix.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let defaults = DetectionConfig::default();
        let input = "Mirror: ftp.gnu.org/gnu/tar/";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs joined by a literal backslash-n sequence are reported as two
    /// separate URLs, in input order.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let defaults = DetectionConfig::default();
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let urls = find_urls(input, &defaults);

        let expected = vec![
            String::from("https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency"),
            String::from("https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"),
        ];
        let values: Vec<_> = urls.into_iter().map(|hit| hit.url).collect();
        assert_eq!(values, expected);
    }

    /// A `user:{ACCESS_TOKEN}@` credential template is removed from the
    /// reported git URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let input = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as the plain-brace case, but with the braces percent-encoded as
    /// `%7B` / `%7D`.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let input = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A plain git URL and its credential-templated twin are reported as a
    /// single URL under the default configuration.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let defaults = DetectionConfig::default();
        let input = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Two backticks glued to the end of a URL are not part of the reported
    /// URL.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let defaults = DetectionConfig::default();
        let input = "Docs: https://github.com/example/project.git``";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A URL wrapped in a `url{...}` macro is reported without the closing
    /// brace.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let defaults = DetectionConfig::default();
        let input = r#"\\url{https://dplyr.tidyverse.org}"#;
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    /// For an `href{URL}{label}` macro, the `}{ORCID}` tail is not part of
    /// the reported URL.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let defaults = DetectionConfig::default();
        let input = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    /// Two closing braces after the URL are both removed.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let defaults = DetectionConfig::default();
        let input = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    /// A closing brace followed by a full stop is removed together with the
    /// punctuation.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let defaults = DetectionConfig::default();
        let input = r#"\\url{https://fred.stlouisfed.org/}."#;
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    /// A balanced `${branchName}` placeholder at the end of the URL is kept
    /// as written.
    #[test]
    fn test_find_urls_keeps_closed_template_placeholders() {
        let defaults = DetectionConfig::default();
        let input =
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(
            urls[0].url,
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}"
        );
    }

    /// A `${{ ... }}` expression containing spaces is cut off, leaving the
    /// URL up to (and excluding) the slash before it.
    #[test]
    fn test_find_urls_trims_open_template_suffixes() {
        let defaults = DetectionConfig::default();
        let input =
            "https://github.com/flutter/flutter/pull/${{ github.event.pull_request.number }}";
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/flutter/flutter/pull");
    }

    /// A hostname containing Markdown `**emphasis**` markers yields no URL.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let defaults = DetectionConfig::default();
        let input = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A "host" that is really a code expression (`os.environ[...]`) yields
    /// no URL.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let defaults = DetectionConfig::default();
        let input = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// Tokens whose "domain" is a file name (`index.html`, `.tar.gz`) are
    /// skipped; only the real address is reported.
    #[test]
    fn test_find_emails_ignores_file_like_domains() {
        let defaults = DetectionConfig::default();
        let input = "s@index.html version@.tar.gz real@rust-lang.org";
        let emails = find_emails(input, &defaults);

        let expected = vec![String::from("real@rust-lang.org")];
        let values: Vec<_> = emails.into_iter().map(|hit| hit.email).collect();
        assert_eq!(values, expected);
    }

    /// Asset names like `...@2x.png.img.tmpl` and `this@mockk.projectdir` are
    /// skipped; `git@github.com` is still reported.
    #[test]
    fn test_find_emails_ignores_generated_template_asset_tokens() {
        let defaults = DetectionConfig::default();
        let input = "icon-app-20x20@2x.png.img.tmpl git@github.com this@mockk.projectdir";
        let emails = find_emails(input, &defaults);

        let expected = vec![String::from("git@github.com")];
        let values: Vec<_> = emails.into_iter().map(|hit| hit.email).collect();
        assert_eq!(values, expected);
    }

    /// Hosts that look like file names (`ftp.sftp`, `www.classes.hint`,
    /// `www.conf.default`) are skipped; only the real URL is reported.
    #[test]
    fn test_find_urls_ignores_file_like_fake_hosts() {
        let defaults = DetectionConfig::default();
        let input = "http://ftp.sftp/ http://www.classes.hint/ http://www.conf.default/ https://rust-lang.org/real";
        let urls = find_urls(input, &defaults);

        let expected = vec![String::from("https://rust-lang.org/real")];
        let values: Vec<_> = urls.into_iter().map(|hit| hit.url).collect();
        assert_eq!(values, expected);
    }
}