1mod emails;
5#[cfg(all(test, feature = "golden-tests"))]
6mod golden_test;
7mod host;
8mod junk_data;
9mod urls;
10
11pub use emails::find_emails;
12pub use urls::find_urls;
13
/// Settings handed to [`find_emails`] and [`find_urls`] to bound their output.
///
/// `PartialEq`/`Eq` are derived so that configurations can be compared
/// directly (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionConfig {
    /// Cap on how many email detections are reported; the unit tests in this
    /// module expect three candidates with a cap of 2 to yield two results.
    pub max_emails: usize,
    /// Cap on how many URL detections are reported; the unit tests in this
    /// module expect three candidates with a cap of 2 to yield two results.
    pub max_urls: usize,
    /// NOTE(review): presumably drops repeated detections when `true`;
    /// confirm against the detector implementations.
    pub unique: bool,
}

impl Default for DetectionConfig {
    /// Returns the stock configuration: `max_emails` and `max_urls` are both
    /// 50 and `unique` is `true`.
    fn default() -> Self {
        Self {
            max_emails: 50,
            max_urls: 50,
            unique: true,
        }
    }
}
30
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    // Three addresses with `max_emails` set to 2: only two come back, the
    // first being the one on line one.
    #[test]
    fn test_find_emails_threshold() {
        let input = "a@b.com\nc@d.com\ne@f.com\n";
        let capped = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let emails = find_emails(input, &capped);
        assert_eq!(emails.len(), 2);
        assert_eq!(emails[0].email, "a@b.com");
        assert_eq!(emails[0].start_line, LineNumber::ONE);
    }

    // Three URLs with `max_urls` set to 2: the first two are returned, each
    // with a trailing slash.
    #[test]
    fn test_find_urls_threshold() {
        let input = "http://a.com\nhttp://b.com\nhttp://c.com\n";
        let capped = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let urls = find_urls(input, &capped);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    // An address on a `.local` machine hostname is dropped; the public one stays.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";
        let defaults = DetectionConfig::default();
        let emails = find_emails(input, &defaults);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    // The raw string holds literal backslash-n sequences between decorator-like
    // tokens; only the two real addresses are expected.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let input = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;
        let defaults = DetectionConfig::default();
        let emails = find_emails(input, &defaults);

        let values = emails
            .into_iter()
            .map(|hit| hit.email)
            .collect::<Vec<_>>();
        assert_eq!(
            values,
            vec![
                String::from("global_writer@email.com"),
                String::from("user5@email.com"),
            ]
        );
    }

    // R slot access (`object@slot.name`) must not be reported as an email.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let input = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";
        let defaults = DetectionConfig::default();
        let emails = find_emails(input, &defaults);

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    // An email whose local part starts with `ftp.` yields no URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let input = "See ftp.mtuci@gmail.com for details.";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // A scheme-less `ftp.` hostname is kept and reported with an `http://` prefix.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let input = "Mirror: ftp.gnu.org/gnu/tar/";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    // Two URLs joined by a literal backslash-n are reported separately.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        let values = urls.into_iter().map(|hit| hit.url).collect::<Vec<_>>();
        assert_eq!(
            values,
            vec![
                String::from(
                    "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency"
                ),
                String::from(
                    "https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"
                ),
            ]
        );
    }

    // `user:{ACCESS_TOKEN}@` placeholder credentials are removed from the URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let input = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // Same as above, with the placeholder braces percent-encoded.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let input = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // The plain URL and its credentialed variant are expected to collapse
    // into a single result under the default configuration.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let input = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // Trailing backticks are not part of the reported URL.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let input = "Docs: https://github.com/example/project.git``";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    // The closing brace of a `url{...}` wrapper is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let input = r#"\\url{https://dplyr.tidyverse.org}"#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    // The `}{ORCID}` tail of an `href{...}{...}` wrapper is not part of the URL.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let input = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    // Two closing braces after the URL are both removed.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let input = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    // A closing brace followed by a full stop is removed together with the stop.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let input = r#"\\url{https://fred.stlouisfed.org/}."#;
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    // A hostname containing markdown `**emphasis**` yields no URL.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let input = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // A "host" that is really a code expression yields no URL.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let input = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    // Tokens whose "domain" looks like a file name are skipped; the real
    // address is the only result.
    #[test]
    fn test_find_emails_ignores_file_like_domains() {
        let input = "s@index.html version@.tar.gz real@rust-lang.org";
        let defaults = DetectionConfig::default();
        let emails = find_emails(input, &defaults);

        let values = emails
            .into_iter()
            .map(|hit| hit.email)
            .collect::<Vec<_>>();
        assert_eq!(values, vec![String::from("real@rust-lang.org")]);
    }

    // Hosts that look like file names are skipped; the real URL is the only result.
    #[test]
    fn test_find_urls_ignores_file_like_fake_hosts() {
        let input = "http://ftp.sftp/ http://www.classes.hint/ http://www.conf.default/ https://rust-lang.org/real";
        let defaults = DetectionConfig::default();
        let urls = find_urls(input, &defaults);

        let values = urls.into_iter().map(|hit| hit.url).collect::<Vec<_>>();
        assert_eq!(values, vec![String::from("https://rust-lang.org/real")]);
    }
}