1mod emails;
5mod host;
6mod junk_data;
7mod urls;
8
9pub use emails::find_emails;
10pub use urls::find_urls;
11
/// Settings that bound a detection pass over a piece of text.
///
/// A reference to this struct is handed to `find_emails` and `find_urls`.
/// Use [`Default`] for the stock limits and struct-update syntax to override
/// individual fields, e.g. `DetectionConfig { max_urls: 2, ..Default::default() }`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Upper bound on how many email matches a single `find_emails` call returns.
    pub max_emails: usize,
    /// Upper bound on how many URL matches a single `find_urls` call returns.
    pub max_urls: usize,
    /// NOTE(review): presumably enables de-duplication of reported matches;
    /// the consumers live in the `emails` / `urls` modules — confirm there.
    pub unique: bool,
}
18
19impl Default for DetectionConfig {
20 fn default() -> Self {
21 Self {
22 max_emails: 50,
23 max_urls: 50,
24 unique: true,
25 }
26 }
27}
28
#[cfg(test)]
mod tests {
    //! Regression tests for the re-exported `find_emails` / `find_urls` entry points.
    //!
    //! Each test feeds a small text snippet through a detector and pins either the
    //! exact list of reported values or the fact that nothing is reported.

    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// With `max_emails: 2`, three addresses on separate lines yield two results;
    /// the first is `a@b.com`, starting on line one.
    #[test]
    fn test_find_emails_threshold() {
        let text = "a@b.com\nc@d.com\ne@f.com\n";
        let config = DetectionConfig {
            max_emails: 2,
            ..Default::default()
        };
        let emails = find_emails(text, &config);
        assert_eq!(emails.len(), 2);
        assert_eq!(emails[0].email, "a@b.com");
        assert_eq!(emails[0].start_line, LineNumber::ONE);
    }

    /// With `max_urls: 2`, three URLs yield the first two, each reported with a
    /// trailing slash.
    #[test]
    fn test_find_urls_threshold() {
        let text = "http://a.com\nhttp://b.com\nhttp://c.com\n";
        let config = DetectionConfig {
            max_urls: 2,
            ..Default::default()
        };
        let urls = find_urls(text, &config);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine hostname is dropped; the regular address stays.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let text = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// The input is a raw string, so every `\n` is a literal backslash + `n`.
    /// Only the two real addresses may be reported; the `@app.route` /
    /// `@csrf.exempt` tokens in between must not show up.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let text = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        let values: Vec<_> = emails.into_iter().map(|email| email.email).collect();
        assert_eq!(
            values,
            vec![
                "global_writer@email.com".to_string(),
                "user5@email.com".to_string(),
            ]
        );
    }

    /// `name@slot.name` expressions such as `element@arrow.fill` must not be
    /// reported as emails.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let text = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    /// An email-shaped token that happens to start with `ftp.` is not a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let text = "See ftp.mtuci@gmail.com for details.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname with a path is reported, carrying an
    /// `http://` scheme in the result.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let text = "Mirror: ftp.gnu.org/gnu/tar/";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs glued together by a literal backslash + `n` come out as two
    /// separate results.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let text = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        let values: Vec<_> = urls.into_iter().map(|url| url.url).collect();
        assert_eq!(
            values,
            vec![
                "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency".to_string(),
                "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"
                    .to_string(),
            ]
        );
    }

    /// A `user:{ACCESS_TOKEN}@` userinfo section is removed from the reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let text = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as above, with the braces percent-encoded as `%7B` / `%7D`.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let text = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Under the default config, the plain URL and its templated-credential
    /// variant collapse into a single result.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let text = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Trailing backticks are not part of the reported URL.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let text = "Docs: https://github.com/example/project.git``";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// The closing brace of a `\\url{...}` wrapper is not part of the reported URL.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let text = r#"\\url{https://dplyr.tidyverse.org}"#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    /// In `\\href{url}{label}`, everything from the first closing brace on is dropped.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let text = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    /// Two consecutive closing braces after the URL are both dropped.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let text = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    /// A closing brace followed by sentence punctuation (`}.`) is dropped as a whole.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let text = r#"\\url{https://fred.stlouisfed.org/}."#;
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    /// A balanced `${name}` placeholder inside the URL is kept verbatim.
    #[test]
    fn test_find_urls_keeps_closed_template_placeholders() {
        let text =
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(
            urls[0].url,
            "https://flutter-dashboard.appspot.com/#/build?repo=flutter&branch=${branchName}"
        );
    }

    /// A trailing `${{ ... }}` expression is trimmed off, leaving the URL up to `/pull`.
    #[test]
    fn test_find_urls_trims_open_template_suffixes() {
        let text =
            "https://github.com/flutter/flutter/pull/${{ github.event.pull_request.number }}";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/flutter/flutter/pull");
    }

    /// A hostname containing `**emphasis**` markers is not reported at all.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let text = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A "host" that is really a code expression (`os.environ[...]`) is not reported.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let text = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// Tokens whose "domain" looks like a file name (`index.html`, `.tar.gz`) are
    /// skipped; only the real address is reported.
    #[test]
    fn test_find_emails_ignores_file_like_domains() {
        let text = "s@index.html version@.tar.gz real@rust-lang.org";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        let values: Vec<_> = emails.into_iter().map(|email| email.email).collect();
        assert_eq!(values, vec!["real@rust-lang.org".to_string()]);
    }

    /// Asset names like `...@2x.png.img.tmpl` and `this@mockk.projectdir` are
    /// skipped; `git@github.com` is kept.
    #[test]
    fn test_find_emails_ignores_generated_template_asset_tokens() {
        let text = "icon-app-20x20@2x.png.img.tmpl git@github.com this@mockk.projectdir";
        let config = DetectionConfig::default();
        let emails = find_emails(text, &config);

        let values: Vec<_> = emails.into_iter().map(|email| email.email).collect();
        assert_eq!(values, vec!["git@github.com".to_string()]);
    }

    /// Hosts such as `ftp.sftp`, `www.classes.hint` and `www.conf.default` are
    /// rejected; only the real URL is reported.
    #[test]
    fn test_find_urls_ignores_file_like_fake_hosts() {
        let text = "http://ftp.sftp/ http://www.classes.hint/ http://www.conf.default/ https://rust-lang.org/real";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        let values: Vec<_> = urls.into_iter().map(|url| url.url).collect();
        assert_eq!(values, vec!["https://rust-lang.org/real".to_string()]);
    }

    /// A `...` placeholder in the host position yields no URL.
    #[test]
    fn test_find_urls_ignores_ellipsis_placeholder_hosts() {
        let text = "Fetch https://.../script.py after download.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A host that starts with an unclosed `{` placeholder yields no URL.
    #[test]
    fn test_find_urls_ignores_braced_placeholder_hosts() {
        let text = "Download from http://{httpserver.host/ when tests boot.";
        let config = DetectionConfig::default();
        let urls = find_urls(text, &config);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }
}