Skip to main content

provenant/finder/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod emails;
5#[cfg(all(test, feature = "golden-tests"))]
6mod golden_test;
7mod host;
8mod junk_data;
9mod urls;
10
11pub use emails::find_emails;
12pub use urls::find_urls;
13
/// Settings passed by reference to [`find_emails`] and [`find_urls`].
///
/// Derives `PartialEq`/`Eq` in addition to `Debug`/`Clone` so that two
/// configurations can be compared directly (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Upper bound on the number of email detections returned by
    /// [`find_emails`]; with a value of `2`, three addresses in the input
    /// yield two results.
    // NOTE(review): the meaning of `0` is not visible from this module — confirm in `emails`.
    pub max_emails: usize,
    /// Upper bound on the number of URL detections returned by [`find_urls`];
    /// with a value of `2`, three URLs in the input yield two results.
    // NOTE(review): the meaning of `0` is not visible from this module — confirm in `urls`.
    pub max_urls: usize,
    /// De-duplication switch.
    // NOTE(review): presumably `true` collapses repeated detections into one
    // entry; the consuming logic lives in the `emails`/`urls` modules — confirm there.
    pub unique: bool,
}
20
21impl Default for DetectionConfig {
22    fn default() -> Self {
23        Self {
24            max_emails: 50,
25            max_urls: 50,
26            unique: true,
27        }
28    }
29}
30
#[cfg(test)]
mod tests {
    //! Behavioural tests for the re-exported `find_emails` and `find_urls`
    //! entry points. Email tests come first, URL tests second.

    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    // ---------------------------------------------------------------- emails

    /// With `max_emails` set to 2, three addresses in the input produce two
    /// detections, the first being `a@b.com` on line one.
    #[test]
    fn test_find_emails_threshold() {
        let limited = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let emails = find_emails("a@b.com\nc@d.com\ne@f.com\n", &limited);

        assert_eq!(emails.len(), 2);
        let first = &emails[0];
        assert_eq!(first.email, "a@b.com");
        assert_eq!(first.start_line, LineNumber::ONE);
    }

    /// An address on a `.local` machine hostname is dropped while the
    /// regular address is kept.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let emails = find_emails(input, &DetectionConfig::default());

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// Literal `\n` sequences followed by `@decorator`-like tokens must not
    /// leak into or add to the detected addresses.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let input = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;

        let values: Vec<_> = find_emails(input, &DetectionConfig::default())
            .into_iter()
            .map(|found| found.email)
            .collect();

        assert_eq!(values, ["global_writer@email.com", "user5@email.com"]);
    }

    /// R slot-access expressions (`object@slot.name`) are not reported as
    /// email addresses.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let input = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let emails = find_emails(input, &DetectionConfig::default());

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    /// Tokens whose "domain" looks like a file name are skipped; only the
    /// real address survives.
    #[test]
    fn test_find_emails_ignores_file_like_domains() {
        let input = "s@index.html version@.tar.gz real@rust-lang.org";

        let values: Vec<_> = find_emails(input, &DetectionConfig::default())
            .into_iter()
            .map(|found| found.email)
            .collect();

        assert_eq!(values, ["real@rust-lang.org"]);
    }

    // ------------------------------------------------------------------ urls

    /// With `max_urls` set to 2, three URLs in the input produce two
    /// detections, in input order and with a trailing slash added.
    #[test]
    fn test_find_urls_threshold() {
        let limited = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let urls = find_urls("http://a.com\nhttp://b.com\nhttp://c.com\n", &limited);

        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An email address that happens to start with `ftp.` is not reported
    /// as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let input = "See ftp.mtuci@gmail.com for details.";
        let urls = find_urls(input, &DetectionConfig::default());

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname with a path is reported, prefixed with
    /// `http://`.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let input = "Mirror: ftp.gnu.org/gnu/tar/";
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs glued together by a literal backslash-`n` are reported as
    /// two separate URLs.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";

        let values: Vec<_> = find_urls(input, &DetectionConfig::default())
            .into_iter()
            .map(|found| found.url)
            .collect();

        assert_eq!(
            values,
            [
                "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency",
                "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html",
            ]
        );
    }

    /// A `user:{TOKEN}@` credential placeholder is removed from the
    /// reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let input = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as the template-credential case, but with the braces
    /// percent-encoded (`%7B…%7D`).
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let input = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A plain URL and its credential-templated twin yield a single result
    /// under the default configuration.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let input = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Backticks trailing a URL are not part of the reported URL.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let input = "Docs: https://github.com/example/project.git``";
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// The closing brace of a `\\url{…}` wrapper is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let input = r#"\\url{https://dplyr.tidyverse.org}"#;
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    /// For `\\href{url}{label}`, the `}{label}` tail is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let input = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    /// Two consecutive closing braces after the URL are both removed.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let input = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    /// A closing brace followed by sentence punctuation (`}.`) is removed
    /// from the URL.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let input = r#"\\url{https://fred.stlouisfed.org/}."#;
        let urls = find_urls(input, &DetectionConfig::default());

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    /// A hostname containing Markdown emphasis markers (`**…**`) is not
    /// reported at all.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let input = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let urls = find_urls(input, &DetectionConfig::default());

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A "host" that is really a code expression (`os.environ[...]`) is not
    /// reported.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let input = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";
        let urls = find_urls(input, &DetectionConfig::default());

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// Hosts that look like file names are skipped; only the real URL
    /// survives.
    #[test]
    fn test_find_urls_ignores_file_like_fake_hosts() {
        let input = "http://ftp.sftp/ http://www.classes.hint/ http://www.conf.default/ https://rust-lang.org/real";

        let values: Vec<_> = find_urls(input, &DetectionConfig::default())
            .into_iter()
            .map(|found| found.url)
            .collect();

        assert_eq!(values, ["https://rust-lang.org/real"]);
    }
}