Skip to main content

provenant/finder/
mod.rs

1mod emails;
2#[cfg(all(test, feature = "golden-tests"))]
3mod golden_test;
4mod host;
5mod junk_data;
6mod urls;
7
8pub use emails::find_emails;
9pub use urls::find_urls;
10
/// Tuning knobs passed by reference to the finder entry points
/// (`find_emails` and `find_urls`).
///
/// `PartialEq`/`Eq` are derived so configurations can be compared directly,
/// e.g. with `assert_eq!` in tests or to detect a changed configuration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Upper bound on the number of email detections returned by one scan.
    pub max_emails: usize,
    /// Upper bound on the number of URL detections returned by one scan.
    pub max_urls: usize,
    /// NOTE(review): presumably suppresses duplicate detections when `true`;
    /// the flag is only stored here, so confirm against the finder code.
    pub unique: bool,
}
17
18impl Default for DetectionConfig {
19    fn default() -> Self {
20        Self {
21            max_emails: 50,
22            max_urls: 50,
23            unique: true,
24        }
25    }
26}
27
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// Scans `input` for URLs with the default config and asserts that exactly
    /// one URL is reported and that it equals `expected`.
    ///
    /// `#[track_caller]` keeps the reported failure location on the calling test.
    #[track_caller]
    fn assert_single_url(input: &str, expected: &str) {
        let urls = find_urls(input, &DetectionConfig::default());
        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, expected);
    }

    /// Scans `input` for URLs with the default config and asserts that nothing
    /// is reported.
    #[track_caller]
    fn assert_no_urls(input: &str) {
        let urls = find_urls(input, &DetectionConfig::default());
        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// With `max_emails: 2`, three candidate addresses yield two detections,
    /// the first being `a@b.com` on line one.
    #[test]
    fn test_find_emails_threshold() {
        let limited = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let emails = find_emails("a@b.com\nc@d.com\ne@f.com\n", &limited);

        assert_eq!(emails.len(), 2);
        let first = &emails[0];
        assert_eq!(first.email, "a@b.com");
        assert_eq!(first.start_line, LineNumber::ONE);
    }

    /// With `max_urls: 2`, three candidate URLs yield the first two, in order.
    #[test]
    fn test_find_urls_threshold() {
        let limited = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let urls = find_urls("http://a.com\nhttp://b.com\nhttp://c.com\n", &limited);

        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine hostname is dropped; the public one stays.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let emails = find_emails(input, &DetectionConfig::default());

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// Literal `\n` sequences followed by decorator-like tokens must not produce
    /// extra detections; only the two real addresses are reported.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let input = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;
        let detected: Vec<_> = find_emails(input, &DetectionConfig::default())
            .into_iter()
            .map(|hit| hit.email)
            .collect();

        assert_eq!(detected, ["global_writer@email.com", "user5@email.com"]);
    }

    /// `object@slot.name` expressions from R code are not reported as emails.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let input = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let emails = find_emails(input, &DetectionConfig::default());

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    /// An email address whose local part starts with `ftp.` is not reported as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        assert_no_urls("See ftp.mtuci@gmail.com for details.");
    }

    /// A bare `ftp.` hostname with a path is reported with an `http://` scheme.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        assert_single_url("Mirror: ftp.gnu.org/gnu/tar/", "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs joined by a literal backslash-`n` are reported separately.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let detected: Vec<_> = find_urls(input, &DetectionConfig::default())
            .into_iter()
            .map(|hit| hit.url)
            .collect();

        assert_eq!(
            detected,
            [
                "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency",
                "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html",
            ]
        );
    }

    /// A `{PLACEHOLDER}` credential section is removed from the reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        assert_single_url(
            "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git",
            "https://github.com/example/project.git",
        );
    }

    /// Same as above, with the braces percent-encoded as `%7B` / `%7D`.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        assert_single_url(
            "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git",
            "https://github.com/example/project.git",
        );
    }

    /// The plain and the templated-credential form of one URL collapse into a
    /// single detection under the default config.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        assert_single_url(
            concat!(
                "https://github.com/example/project.git\n",
                "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
            ),
            "https://github.com/example/project.git",
        );
    }

    /// Trailing backticks are not part of the reported URL.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        assert_single_url(
            "Docs: https://github.com/example/project.git``",
            "https://github.com/example/project.git",
        );
    }

    /// The braces of an Rd-style `url{...}` wrapper are not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        assert_single_url(
            r#"\\url{https://dplyr.tidyverse.org}"#,
            "https://dplyr.tidyverse.org/",
        );
    }

    /// The `}{label}` tail of an Rd-style `href{...}{...}` is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        assert_single_url(
            r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#,
            "https://orcid.org/0000-0003-4757-117X",
        );
    }

    /// Two closing braces after the URL are both stripped.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        assert_single_url(
            r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#,
            "https://fred.stlouisfed.org/series/PCE",
        );
    }

    /// A closing brace followed by sentence punctuation is stripped.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        assert_single_url(
            r#"\\url{https://fred.stlouisfed.org/}."#,
            "https://fred.stlouisfed.org/",
        );
    }

    /// A hostname containing markdown emphasis markers yields no URL.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        assert_no_urls("Use https://**yourcompany**.atlassian.net for Jira Cloud.");
    }

    /// A "host" that is really a code expression yields no URL.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        assert_no_urls("loginUrl = \"http://os.environ['DD_BASE_URL']/login\"");
    }
}