1mod emails;
5mod host;
6mod junk_data;
7mod urls;
8
9pub use emails::find_emails;
10pub use urls::find_urls;
11
/// Settings passed by reference to [`find_emails`] and [`find_urls`].
///
/// `PartialEq`/`Eq` are derived so configurations can be compared directly
/// (for example with `assert_eq!`); every field is a plain `usize` or `bool`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Upper bound on the number of emails returned by [`find_emails`]; the
    /// threshold test in this module gets two results for three candidates
    /// when this is `2`.
    pub max_emails: usize,
    /// Upper bound on the number of URLs returned by [`find_urls`]; the
    /// threshold test in this module gets the first two of three URLs when
    /// this is `2`.
    pub max_urls: usize,
    /// NOTE(review): presumably enables de-duplication of detected values;
    /// the field is not read in this file, so confirm against the detectors.
    pub unique: bool,
}
18
19impl Default for DetectionConfig {
20 fn default() -> Self {
21 Self {
22 max_emails: 50,
23 max_urls: 50,
24 unique: true,
25 }
26 }
27}
28
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    // With `max_emails` set to 2, three candidate addresses yield two results;
    // the first one is the address found on line one.
    #[test]
    fn test_find_emails_threshold() {
        let input = "a@b.com\nc@d.com\ne@f.com\n";
        let capped = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let emails = find_emails(input, &capped);

        assert_eq!(emails.len(), 2);
        let first = &emails[0];
        assert_eq!(first.email, "a@b.com");
        assert_eq!(first.start_line, LineNumber::ONE);
    }

    // With `max_urls` set to 2, only the first two of three URLs come back,
    // each with a trailing slash.
    #[test]
    fn test_find_urls_threshold() {
        let input = "http://a.com\nhttp://b.com\nhttp://c.com\n";
        let capped = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let urls = find_urls(input, &capped);

        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    // An address on a `.local` machine domain is dropped; the public one stays.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let defaults = DetectionConfig::default();
        let emails = find_emails(input, &defaults);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    // The raw string contains literal `\n` sequences (backslash + `n`), not
    // real newlines; decorator-like tokens between the addresses are ignored.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let input =
            r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_emails(input, &defaults)
            .into_iter()
            .map(|hit| hit.email)
            .collect();
        assert_eq!(
            values,
            vec![
                String::from("global_writer@email.com"),
                String::from("user5@email.com"),
            ]
        );
    }

    // R slot access such as `element@arrow.fill` must not be reported as email.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let input = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let defaults = DetectionConfig::default();
        let emails = find_emails(input, &defaults);

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    // An email whose local part starts with `ftp.` is not a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let input = "See ftp.mtuci@gmail.com for details.";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // A scheme-less `ftp.` hostname is reported with an `http://` prefix.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let input = "Mirror: ftp.gnu.org/gnu/tar/";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    // Two URLs joined by a literal backslash + `n` are reported separately.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_urls(input, &defaults)
            .into_iter()
            .map(|hit| hit.url)
            .collect();
        assert_eq!(
            values,
            vec![
                String::from(
                    "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency"
                ),
                String::from(
                    "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"
                ),
            ]
        );
    }

    // `user:{ACCESS_TOKEN}@` credentials are removed from the reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let input = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // Same as above, with the braces percent-encoded as `%7B` / `%7D`.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let input = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // A plain URL and its credentialed twin collapse into a single result.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let input = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // Backticks trailing the URL are not part of the result.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let input = "Docs: https://github.com/example/project.git``";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // The closing brace of a `url{...}` wrapper is stripped.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let input = r#"\\url{https://dplyr.tidyverse.org}"#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    // The `}{ORCID}` tail of an `href{...}{...}` wrapper is stripped.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let input = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    // Two closing braces in a row are both stripped.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let input = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    // A closing brace followed by sentence punctuation is stripped with it.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let input = r#"\\url{https://fred.stlouisfed.org/}."#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    // A complete `${name}` placeholder is reported verbatim with the URL.
    #[test]
    fn test_find_urls_keeps_closed_template_placeholders() {
        let input =
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        // The expected URL is the input itself.
        assert_eq!(urls[0].url, input);
    }

    // A `${{ ... }}` expression containing spaces is cut off, together with the
    // slash that precedes it.
    #[test]
    fn test_find_urls_trims_open_template_suffixes() {
        let input =
            "https://github.com/flutter/flutter/pull/${{ github.event.pull_request.number }}";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/flutter/flutter/pull");
    }

    // Markdown `**` emphasis inside the hostname disqualifies the URL.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let input = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // An asterisk inside the query string is kept as part of the URL.
    #[test]
    fn test_find_urls_keeps_query_asterisk_in_url() {
        let input = "http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        // The expected URL is the input itself.
        assert_eq!(urls[0].url, input);
    }

    // A path ending in `.fill` is kept; the markdown link title is excluded.
    #[test]
    fn test_find_urls_keeps_npm_package_url_ending_with_fill() {
        let input = "[npm](https://www.npmjs.com/package/lodash.fill \"See the npm package\")";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://www.npmjs.com/package/lodash.fill");
    }

    // A `chrome://` scheme embedded in the path makes the whole URL be dropped.
    #[test]
    fn test_find_urls_drops_nested_scheme_artifact_in_path() {
        let input =
            "https://getfirebug.com/releases/lite/latest/skin/xp/chrome://firebug/skin/group.gif";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // A web.archive.org capture with an embedded `http://` target is reported
    // as one URL.
    #[test]
    fn test_find_urls_keeps_wayback_archive_capture_with_embedded_http_target() {
        let input = "http://web.archive.org/web/20160201063255/http://download.microsoft.com/download/foo.exe";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        // The expected URL is the input itself.
        assert_eq!(urls[0].url, input);
    }

    // Same as above for an embedded `https://` target.
    #[test]
    fn test_find_urls_keeps_wayback_archive_capture_with_embedded_https_target() {
        let input = "https://web.archive.org/web/20231103044404/https://raphlinus.github.io/graphics/2020/04/21/blurred-rounded-rects.html";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        // The expected URL is the input itself.
        assert_eq!(urls[0].url, input);
    }

    // A "host" that is really a code expression (`os.environ[...]`) is rejected.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let input = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // Tokens whose "domain" is a file name (`index.html`, `.tar.gz`) are skipped.
    #[test]
    fn test_find_emails_ignores_file_like_domains() {
        let input = "s@index.html version@.tar.gz real@rust-lang.org";
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_emails(input, &defaults)
            .into_iter()
            .map(|hit| hit.email)
            .collect();
        assert_eq!(values, vec![String::from("real@rust-lang.org")]);
    }

    // Asset names like `...@2x.png.img.tmpl` and `this@mockk.projectdir` are
    // skipped, while `git@github.com` is reported.
    #[test]
    fn test_find_emails_ignores_generated_template_asset_tokens() {
        let input = "icon-app-20x20@2x.png.img.tmpl git@github.com this@mockk.projectdir";
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_emails(input, &defaults)
            .into_iter()
            .map(|hit| hit.email)
            .collect();
        assert_eq!(values, vec![String::from("git@github.com")]);
    }

    // Hosts that look like file names (`ftp.sftp`, `www.classes.hint`,
    // `www.conf.default`) are skipped; only the real URL is reported.
    #[test]
    fn test_find_urls_ignores_file_like_fake_hosts() {
        let input = "http://ftp.sftp/ http://www.classes.hint/ http://www.conf.default/ https://rust-lang.org/real";
        let defaults = DetectionConfig::default();

        let values: Vec<_> = find_urls(input, &defaults)
            .into_iter()
            .map(|hit| hit.url)
            .collect();
        assert_eq!(values, vec![String::from("https://rust-lang.org/real")]);
    }

    // An ellipsis used as a placeholder host does not produce a URL.
    #[test]
    fn test_find_urls_ignores_ellipsis_placeholder_hosts() {
        let input = "Fetch https://.../script.py after download.";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // A host starting with an unclosed `{` placeholder does not produce a URL.
    #[test]
    fn test_find_urls_ignores_braced_placeholder_hosts() {
        let input = "Download from http://{httpserver.host/ when tests boot.";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }
}