1mod emails;
2#[cfg(all(test, feature = "golden-tests"))]
3mod golden_test;
4mod host;
5mod junk_data;
6mod urls;
7
8pub use emails::find_emails;
9pub use urls::find_urls;
10
/// Settings handed by reference to [`find_emails`] and [`find_urls`].
///
/// See the [`Default`] implementation for the stock values.
#[derive(Clone, Debug)]
pub struct DetectionConfig {
    /// Upper bound on the number of email results; this module's tests expect
    /// three addresses to be cut down to two when this is set to `2`.
    pub max_emails: usize,
    /// Upper bound on the number of URL results; this module's tests expect
    /// three URLs to be cut down to two when this is set to `2`.
    pub max_urls: usize,
    /// NOTE(review): presumably toggles de-duplication of results — the
    /// consuming code is not visible from this module, confirm in
    /// `emails` / `urls`.
    pub unique: bool,
}
17
18impl Default for DetectionConfig {
19 fn default() -> Self {
20 Self {
21 max_emails: 50,
22 max_urls: 50,
23 unique: true,
24 }
25 }
26}
27
/// Behavioural checks for `find_emails` and `find_urls`, driven through
/// `DetectionConfig`.
#[cfg(test)]
mod tests {
    use super::{DetectionConfig, find_emails, find_urls};
    use crate::models::LineNumber;

    /// With `max_emails` set to 2, an input holding three addresses yields two
    /// results, the first being `a@b.com` on line one.
    #[test]
    fn test_find_emails_threshold() {
        let capped = DetectionConfig {
            max_emails: 2,
            ..DetectionConfig::default()
        };
        let input = "a@b.com\nc@d.com\ne@f.com\n";

        let emails = find_emails(input, &capped);

        assert_eq!(emails.len(), 2);
        assert_eq!(emails[0].email, "a@b.com");
        assert_eq!(emails[0].start_line, LineNumber::ONE);
    }

    /// With `max_urls` set to 2, an input holding three URLs yields the first
    /// two; the expected values carry a trailing `/` the input lacks.
    #[test]
    fn test_find_urls_threshold() {
        let capped = DetectionConfig {
            max_urls: 2,
            ..DetectionConfig::default()
        };
        let input = "http://a.com\nhttp://b.com\nhttp://c.com\n";

        let urls = find_urls(input, &capped);

        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0].url, "http://a.com/");
        assert_eq!(urls[1].url, "http://b.com/");
    }

    /// An address on a `.local` machine host is dropped while the
    /// `rust-lang.org` address is kept.
    #[test]
    fn test_find_emails_filters_local_machine_domains() {
        let defaults = DetectionConfig::default();
        let input = "admin@rust-lang.org\ngeisse@shopgates-mac-mini-3.local\n";

        let emails = find_emails(input, &defaults);

        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0].email, "admin@rust-lang.org");
    }

    /// The raw input contains literal backslash-`n` sequences followed by
    /// decorator-like tokens; only the two real addresses are expected.
    #[test]
    fn test_find_emails_ignores_literal_escaped_newline_code_artifacts() {
        let defaults = DetectionConfig::default();
        let input = r#"email": "global_writer@email.com\n@app.route\n@csrf.exempt\nuser5@email.com"#;

        let values: Vec<_> = find_emails(input, &defaults)
            .into_iter()
            .map(|hit| hit.email)
            .collect();

        let expected = vec![
            String::from("global_writer@email.com"),
            String::from("user5@email.com"),
        ];
        assert_eq!(values, expected);
    }

    /// `name@slot.member` expressions (R slot access, per the test name) must
    /// not be reported as emails.
    #[test]
    fn test_find_emails_ignores_r_slot_access_false_positives() {
        let defaults = DetectionConfig::default();
        let input = "element@arrow.fill <- element@colour\ntt@inherit.blank <- FALSE\n";

        let emails = find_emails(input, &defaults);

        assert!(emails.is_empty(), "emails: {emails:#?}");
    }

    /// An email address whose local part starts with `ftp.` must not be
    /// reported as a URL.
    #[test]
    fn test_find_urls_ignores_email_like_ftp_token() {
        let defaults = DetectionConfig::default();
        let input = "See ftp.mtuci@gmail.com for details.";

        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A scheme-less `ftp.` hostname with a path is still reported; the
    /// expected value is prefixed with `http://`.
    #[test]
    fn test_find_urls_keeps_plain_ftp_hostname() {
        let defaults = DetectionConfig::default();
        let input = "Mirror: ftp.gnu.org/gnu/tar/";

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "http://ftp.gnu.org/gnu/tar/");
    }

    /// Two URLs joined by a literal backslash-`n` (not a real newline) are
    /// expected back as two separate results.
    #[test]
    fn test_find_urls_splits_literal_escaped_newline_separated_urls() {
        let defaults = DetectionConfig::default();
        let input = "https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency\\nhttps://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html";

        let values: Vec<_> = find_urls(input, &defaults)
            .into_iter()
            .map(|hit| hit.url)
            .collect();

        let expected = vec![
            String::from("https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency"),
            String::from("https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html"),
        ];
        assert_eq!(values, expected);
    }

    /// A `user:{PLACEHOLDER}@` credential section is removed from the
    /// reported URL.
    #[test]
    fn test_find_urls_strips_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let input = "Repo: https://user:{ACCESS_TOKEN}@github.com/example/project.git";

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Same as the template-credential case, but with the braces
    /// percent-encoded as `%7B` / `%7D`.
    #[test]
    fn test_find_urls_strips_percent_encoded_template_credentials_from_git_urls() {
        let defaults = DetectionConfig::default();
        let input = "Repo: https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git";

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A plain URL and its credential-templated twin are expected to collapse
    /// into a single result under the default configuration.
    #[test]
    fn test_find_urls_dedupes_plain_and_templated_git_urls_after_sanitization() {
        let defaults = DetectionConfig::default();
        let input = concat!(
            "https://github.com/example/project.git\n",
            "https://user:%7BACCESS_TOKEN%7D@github.com/example/project.git\n",
        );

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// Backticks trailing a URL are not part of the reported value.
    #[test]
    fn test_find_urls_strips_trailing_backticks() {
        let defaults = DetectionConfig::default();
        let input = "Docs: https://github.com/example/project.git``";

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://github.com/example/project.git");
    }

    /// A URL wrapped in `url{...}` markup is reported without the closing
    /// brace.
    #[test]
    fn test_find_urls_strips_rd_url_braces() {
        let defaults = DetectionConfig::default();
        let input = r#"\\url{https://dplyr.tidyverse.org}"#;

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://dplyr.tidyverse.org/");
    }

    /// In `href{URL}{LABEL}` markup only the URL is reported; the `}{ORCID}`
    /// tail is dropped.
    #[test]
    fn test_find_urls_strips_rd_href_trailing_braces() {
        let defaults = DetectionConfig::default();
        let input = r#"\\href{https://orcid.org/0000-0003-4757-117X}{ORCID}"#;

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://orcid.org/0000-0003-4757-117X");
    }

    /// Two consecutive closing braces after the URL are both removed.
    #[test]
    fn test_find_urls_strips_rd_url_double_closing_braces() {
        let defaults = DetectionConfig::default();
        let input = r#"\\url{https://fred.stlouisfed.org/series/PCE}}"#;

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/series/PCE");
    }

    /// A closing brace followed by sentence punctuation (`}.`) is removed as
    /// a whole.
    #[test]
    fn test_find_urls_strips_rd_closing_brace_before_punctuation() {
        let defaults = DetectionConfig::default();
        let input = r#"\\url{https://fred.stlouisfed.org/}."#;

        let urls = find_urls(input, &defaults);

        assert_eq!(urls.len(), 1, "urls: {urls:#?}");
        assert_eq!(urls[0].url, "https://fred.stlouisfed.org/");
    }

    /// A hostname containing `**emphasis**` markers is a placeholder, so no
    /// URL is expected.
    #[test]
    fn test_find_urls_ignores_markdown_emphasis_inside_hostname() {
        let defaults = DetectionConfig::default();
        let input = "Use https://**yourcompany**.atlassian.net for Jira Cloud.";

        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }

    /// A "host" that is really a code expression (`os.environ[...]`) must not
    /// be reported as a URL.
    #[test]
    fn test_find_urls_filters_code_variable_host_artifacts() {
        let defaults = DetectionConfig::default();
        let input = "loginUrl = \"http://os.environ['DD_BASE_URL']/login\"";

        let urls = find_urls(input, &defaults);

        assert!(urls.is_empty(), "urls: {urls:#?}");
    }
}