1use crate::common::{compile_regex, confidence, context_boost};
2use crate::{EmailRecognizer, SsnRecognizer};
3use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashSet;
7use std::net::IpAddr;
8
/// Lazily compiled, case-insensitive pattern for URL candidates.
///
/// A candidate starts at a word boundary with `http://`, `https://`, `ftp://`
/// or `www.`, followed by one or more characters that are neither whitespace
/// nor any of `<`, `>`, `"`, `'`, a backtick, `{`, `}`, `|`, `\`, `^`, `[`, `]`.
/// The raw match is only a candidate: callers trim and validate it afterwards.
static URL_REGEX: Lazy<Regex> =
    Lazy::new(|| compile_regex(r##"(?i)\b(?:https?://|ftp://|www\.)[^\s<>"'`{}|\\^\[\]]+"##));
11
/// Locale list returned by `UrlQuerySsnRecognizer::supported_locales`.
static US_LOCALES: &[Locale] = &[Locale::US];
13
/// Words handed to `context_boost` when scoring a URL match.
// NOTE(review): presumably a nearby occurrence of one of these words raises the
// confidence; the exact window and boost are defined by `context_boost` — confirm.
const CONTEXT_WORDS: &[&str] = &[
    "url", "uri", "link", "website", "endpoint", "callback", "redirect",
];
17
/// Recognizer that reports URLs found in free text.
///
/// Candidates come from a regular expression that requires an `http://`,
/// `https://`, `ftp://` or `www.` prefix; each candidate then has trailing
/// punctuation trimmed and its host validated before it is reported as an
/// [`EntityType::Url`] finding.
#[derive(Debug, Clone, Copy, Default)]
pub struct UrlRecognizer;
32
33impl Recognizer for UrlRecognizer {
34 fn id(&self) -> &str {
35 "url_regex_v1"
36 }
37
38 fn entity_type(&self) -> EntityType {
39 EntityType::Url
40 }
41
42 fn supported_locales(&self) -> &[Locale] {
43 &[]
44 }
45
46 fn scan(&self, text: &str) -> Vec<PiiEntity> {
47 find_url_spans(text)
48 .into_iter()
49 .map(|span| {
50 let candidate = &text[span.start..span.end];
51 PiiEntity {
52 entity_type: self.entity_type(),
53 span,
54 text: candidate.to_string(),
55 confidence: self.compute_confidence(text, span.start, candidate),
56 recognizer_id: self.id().to_string(),
57 }
58 })
59 .collect()
60 }
61
62 fn validate(&self, candidate: &str) -> bool {
63 validate_url(candidate)
64 }
65}
66
67impl UrlRecognizer {
68 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
69 let base = if has_explicit_scheme(candidate) {
70 0.90
71 } else {
72 0.80
73 };
74 confidence(base + context_boost(text, start, CONTEXT_WORDS))
75 }
76}
77
/// Recognizer that looks for e-mail addresses inside URL query-string values,
/// both as written and after percent-decoding.
pub(crate) struct UrlQueryEmailRecognizer;
79
impl Recognizer for UrlQueryEmailRecognizer {
    /// Identifier attached to every finding produced by this recognizer.
    fn id(&self) -> &str {
        "url_query_email_v1"
    }

    /// Findings are reported as [`EntityType::Email`].
    fn entity_type(&self) -> EntityType {
        EntityType::Email
    }

    /// Returns an empty locale list.
    // NOTE(review): presumably an empty list means "not tied to any locale" —
    // confirm against the `Recognizer` trait contract.
    fn supported_locales(&self) -> &[Locale] {
        &[]
    }

    /// Runs `EmailRecognizer` over the query-string values of every URL found
    /// in `text` and reports the hits under this recognizer's id.
    fn scan(&self, text: &str) -> Vec<PiiEntity> {
        scan_query_values(text, &EmailRecognizer, self.id())
    }
}
97
/// Recognizer that looks for SSNs inside URL query-string values, both as
/// written and after percent-decoding. Declares `Locale::US` as its locale.
pub(crate) struct UrlQuerySsnRecognizer;
99
impl Recognizer for UrlQuerySsnRecognizer {
    /// Identifier attached to every finding produced by this recognizer.
    fn id(&self) -> &str {
        "url_query_ssn_v1"
    }

    /// Findings are reported as [`EntityType::Ssn`].
    fn entity_type(&self) -> EntityType {
        EntityType::Ssn
    }

    /// Restricts this recognizer to `Locale::US`.
    fn supported_locales(&self) -> &[Locale] {
        US_LOCALES
    }

    /// Runs `SsnRecognizer` over the query-string values of every URL found
    /// in `text` and reports the hits under this recognizer's id.
    fn scan(&self, text: &str) -> Vec<PiiEntity> {
        scan_query_values(text, &SsnRecognizer, self.id())
    }
}
117
118fn find_url_spans(text: &str) -> Vec<Span> {
119 URL_REGEX
120 .find_iter(text)
121 .filter_map(|matched| {
122 let end = trim_url_end(matched.as_str(), matched.start());
123 (matched.start() < end && validate_url(&text[matched.start()..end]))
124 .then(|| Span::new(matched.start(), end))
125 })
126 .filter(|span| is_url_boundary(text, span.start, span.end))
127 .collect()
128}
129
130fn trim_url_end(candidate: &str, start: usize) -> usize {
131 let mut end = start + candidate.len();
132 let mut value = candidate;
133 while let Some(c) = value.chars().next_back() {
134 let should_trim = matches!(c, '.' | ',' | ';' | ':' | '!' | '?')
135 || (matches!(c, ')' | ']' | '}') && !has_matching_opener(value, c));
136 if !should_trim {
137 break;
138 }
139 end -= c.len_utf8();
140 value = &candidate[..end - start];
141 }
142 end
143}
144
/// Reports whether `value` contains at least as many opening brackets as
/// closing brackets of the kind given by `closer`.
///
/// `closer` is expected to be `)`, `]` or `}`; for any other character the
/// function returns `true`.
fn has_matching_opener(value: &str, closer: char) -> bool {
    let opener = match closer {
        ')' => '(',
        ']' => '[',
        '}' => '{',
        _ => return true,
    };
    // Count both bracket kinds in a single pass over the string.
    let mut openers = 0usize;
    let mut closers = 0usize;
    for c in value.chars() {
        if c == opener {
            openers += 1;
        } else if c == closer {
            closers += 1;
        }
    }
    openers >= closers
}
154
155fn validate_url(candidate: &str) -> bool {
156 let Some(authority) = authority(candidate) else {
157 return false;
158 };
159 let host = host_from_authority(authority);
160 validate_host(host)
161}
162
/// Returns `true` when `candidate` begins with `http://`, `https://` or
/// `ftp://`, compared ASCII case-insensitively.
fn has_explicit_scheme(candidate: &str) -> bool {
    const SCHEME_PREFIXES: [&str; 3] = ["http://", "https://", "ftp://"];
    SCHEME_PREFIXES.iter().any(|scheme| {
        // `get` returns `None` when the candidate is too short or the cut would
        // fall inside a multi-byte character.
        candidate
            .get(..scheme.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(scheme))
    })
}
174
/// Extracts the authority component (`[userinfo@]host[:port]`) of a URL
/// candidate.
///
/// The candidate must either carry a `scheme://` prefix or start with `www.`
/// (ASCII case-insensitive); anything else yields `None`. The authority runs up
/// to the first `/`, `?` or `#`, and an empty authority also yields `None`.
fn authority(candidate: &str) -> Option<&str> {
    // Only treat "://" as the scheme separator when it comes before any path,
    // query or fragment delimiter. Otherwise a scheme-less candidate such as
    // `www.example.com/?next=https://other` would be split at the embedded URL
    // and validated against the wrong host.
    let after_prefix = match candidate.split_once("://") {
        Some((scheme, rest)) if !scheme.contains(['/', '?', '#']) => rest,
        _ => candidate
            .get(..4)
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case("www."))
            .then_some(candidate)?,
    };
    let end = after_prefix
        .find(['/', '?', '#'])
        .unwrap_or(after_prefix.len());
    let authority = &after_prefix[..end];
    (!authority.is_empty()).then_some(authority)
}
190
/// Strips userinfo and port from an authority, leaving only the host.
///
/// Everything up to and including the last `@` is discarded. A host that
/// starts with `[` is returned without the brackets (text up to the first
/// `]`); if the closing bracket is missing, the bracketed text is returned
/// unchanged. Otherwise the host is the text before the first `:`.
fn host_from_authority(authority: &str) -> &str {
    let host_port = match authority.rsplit_once('@') {
        Some((_, after_at)) => after_at,
        None => authority,
    };
    match host_port.strip_prefix('[') {
        Some(bracketed) => match bracketed.split_once(']') {
            Some((literal, _)) => literal,
            None => host_port,
        },
        None => match host_port.split_once(':') {
            Some((name, _)) => name,
            None => host_port,
        },
    }
}
204
205fn validate_host(host: &str) -> bool {
206 if host.eq_ignore_ascii_case("localhost") || host.parse::<IpAddr>().is_ok() {
207 return true;
208 }
209 if host.is_empty() || !host.contains('.') {
210 return false;
211 }
212 host.split('.').all(validate_host_label)
213}
214
/// Checks a single dot-separated host label: it must be non-empty, consist
/// only of ASCII letters, digits and `-`, and must not start or end with `-`.
fn validate_host_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    let (Some(&first), Some(&last)) = (bytes.first(), bytes.last()) else {
        return false;
    };
    // Working on bytes is safe here: every byte of a non-ASCII character is
    // >= 0x80 and therefore fails the alphanumeric test.
    first != b'-'
        && last != b'-'
        && bytes
            .iter()
            .all(|byte| byte.is_ascii_alphanumeric() || *byte == b'-')
}
221
/// Returns `true` when the byte range `start..end` of `text` is not directly
/// adjacent to an ASCII letter, digit, `_` or `-` on either side.
///
/// The start and end of `text` count as valid boundaries.
fn is_url_boundary(text: &str, start: usize, end: usize) -> bool {
    fn joins_word(c: char) -> bool {
        c.is_ascii_alphanumeric() || matches!(c, '_' | '-')
    }

    let preceding = text[..start].chars().next_back();
    let following = text[end..].chars().next();
    !(preceding.is_some_and(joins_word) || following.is_some_and(joins_word))
}
228
229fn scan_query_values<R>(text: &str, recognizer: &R, recognizer_id: &str) -> Vec<PiiEntity>
230where
231 R: Recognizer,
232{
233 let mut findings = Vec::new();
234 let mut seen = HashSet::new();
235
236 for url_span in find_url_spans(text) {
237 let Some(query_span) = query_span(text, url_span) else {
238 continue;
239 };
240 for value_span in query_value_spans(text, query_span) {
241 scan_query_value(
242 text,
243 value_span,
244 recognizer,
245 recognizer_id,
246 &mut seen,
247 &mut findings,
248 );
249 }
250 }
251
252 findings.sort_by_key(|finding| finding.span.start);
253 findings
254}
255
256fn scan_query_value<R>(
257 text: &str,
258 value_span: Span,
259 recognizer: &R,
260 recognizer_id: &str,
261 seen: &mut HashSet<(EntityType, usize, usize)>,
262 findings: &mut Vec<PiiEntity>,
263) where
264 R: Recognizer,
265{
266 add_query_findings(
267 text,
268 value_span,
269 recognizer.scan(&text[value_span.start..value_span.end]),
270 recognizer_id,
271 seen,
272 findings,
273 );
274
275 let Some(decoded) = percent_decode_with_mapping(&text[value_span.start..value_span.end]) else {
276 return;
277 };
278 if decoded.value == text[value_span.start..value_span.end] {
279 return;
280 }
281
282 let decoded_findings = recognizer.scan(&decoded.value);
283 for finding in decoded_findings {
284 if finding.span.is_empty() || finding.span.end > decoded.mapping.len() {
285 continue;
286 }
287 let original_start = value_span.start + decoded.mapping[finding.span.start].0;
288 let original_end = value_span.start + decoded.mapping[finding.span.end - 1].1;
289 add_query_findings(
290 text,
291 Span::new(original_start, original_end),
292 vec![PiiEntity {
293 entity_type: finding.entity_type,
294 span: Span::new(0, original_end - original_start),
295 text: text[original_start..original_end].to_string(),
296 confidence: finding.confidence,
297 recognizer_id: finding.recognizer_id,
298 }],
299 recognizer_id,
300 seen,
301 findings,
302 );
303 }
304}
305
306fn add_query_findings(
307 text: &str,
308 offset: Span,
309 local_findings: Vec<PiiEntity>,
310 recognizer_id: &str,
311 seen: &mut HashSet<(EntityType, usize, usize)>,
312 findings: &mut Vec<PiiEntity>,
313) {
314 for finding in local_findings {
315 let span = Span::new(
316 offset.start + finding.span.start,
317 offset.start + finding.span.end,
318 );
319 if span.end > text.len()
320 || !seen.insert((finding.entity_type.clone(), span.start, span.end))
321 {
322 continue;
323 }
324 findings.push(PiiEntity {
325 entity_type: finding.entity_type,
326 span,
327 text: text[span.start..span.end].to_string(),
328 confidence: finding.confidence,
329 recognizer_id: recognizer_id.to_string(),
330 });
331 }
332}
333
334fn query_span(text: &str, url_span: Span) -> Option<Span> {
335 let url = &text[url_span.start..url_span.end];
336 let query_start = url.find('?')? + 1;
337 let query_end = url[query_start..]
338 .find('#')
339 .map_or(url.len(), |fragment| query_start + fragment);
340 (query_start < query_end)
341 .then(|| Span::new(url_span.start + query_start, url_span.start + query_end))
342}
343
344fn query_value_spans(text: &str, query_span: Span) -> Vec<Span> {
345 let mut spans = Vec::new();
346 let mut parameter_start = query_span.start;
347 let query = &text[query_span.start..query_span.end];
348
349 for (offset, c) in query.char_indices() {
350 if matches!(c, '&' | ';') {
351 push_query_value_span(text, parameter_start, query_span.start + offset, &mut spans);
352 parameter_start = query_span.start + offset + c.len_utf8();
353 }
354 }
355 push_query_value_span(text, parameter_start, query_span.end, &mut spans);
356
357 spans
358}
359
360fn push_query_value_span(text: &str, start: usize, end: usize, spans: &mut Vec<Span>) {
361 if start >= end {
362 return;
363 }
364 let parameter = &text[start..end];
365 if let Some(eq) = parameter.find('=') {
366 let value_start = start + eq + 1;
367 if value_start < end {
368 spans.push(Span::new(value_start, end));
369 }
370 }
371}
372
/// A percent-decoded query value together with the information needed to map
/// positions in the decoded text back to the encoded input.
struct DecodedValue {
    /// The decoded text.
    value: String,
    /// One `(start, end)` pair per byte of `value`: the half-open byte range
    /// of the encoded input that produced that byte (three bytes wide for a
    /// `%XX` escape, one byte otherwise).
    mapping: Vec<(usize, usize)>,
}
377
378fn percent_decode_with_mapping(value: &str) -> Option<DecodedValue> {
379 let bytes = value.as_bytes();
380 let mut decoded = Vec::with_capacity(bytes.len());
381 let mut mapping = Vec::with_capacity(bytes.len());
382 let mut index = 0;
383
384 while index < bytes.len() {
385 if bytes[index] == b'%' && index + 2 < bytes.len() {
386 if let (Some(high), Some(low)) =
387 (hex_value(bytes[index + 1]), hex_value(bytes[index + 2]))
388 {
389 decoded.push(high * 16 + low);
390 mapping.push((index, index + 3));
391 index += 3;
392 continue;
393 }
394 }
395
396 decoded.push(if bytes[index] == b'+' {
397 b' '
398 } else {
399 bytes[index]
400 });
401 mapping.push((index, index + 1));
402 index += 1;
403 }
404
405 String::from_utf8(decoded)
406 .ok()
407 .map(|value| DecodedValue { value, mapping })
408}
409
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value,
/// or returns `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    // `to_digit(16)` yields values below 16, so the narrowing cast is lossless.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
418
/// Unit tests for URL detection and for the query-value recognizers, plus
/// integration checks through the default registry.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;
    use cloakrs_core::MaskStrategy;

    /// Texts of all URL findings in `input`.
    fn url_texts(input: &str) -> Vec<String> {
        UrlRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    /// Texts of all e-mail findings inside URL query values in `input`.
    fn query_email_texts(input: &str) -> Vec<String> {
        UrlQueryEmailRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    /// Texts of all SSN findings inside URL query values in `input`.
    fn query_ssn_texts(input: &str) -> Vec<String> {
        UrlQuerySsnRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    // --- Plain URL detection -------------------------------------------------

    #[test]
    fn test_url_http_detected() {
        assert_eq!(url_texts("link http://example.com"), ["http://example.com"]);
    }

    #[test]
    fn test_url_https_path_query_fragment_detected() {
        assert_eq!(
            url_texts("visit https://example.com/a/b?x=1#top"),
            ["https://example.com/a/b?x=1#top"]
        );
    }

    #[test]
    fn test_url_www_detected() {
        assert_eq!(
            url_texts("open www.example.com/docs"),
            ["www.example.com/docs"]
        );
    }

    #[test]
    fn test_url_localhost_port_detected() {
        assert_eq!(
            url_texts("endpoint http://localhost:8080/health"),
            ["http://localhost:8080/health"]
        );
    }

    #[test]
    fn test_url_ip_host_detected() {
        assert_eq!(
            url_texts("endpoint http://203.0.113.42/api"),
            ["http://203.0.113.42/api"]
        );
    }

    // The sentence-ending period must not become part of the URL.
    #[test]
    fn test_url_trailing_punctuation_excluded() {
        assert_eq!(
            url_texts("see https://example.com/path."),
            ["https://example.com/path"]
        );
    }

    // A closing parenthesis with a matching opener inside the URL is kept.
    #[test]
    fn test_url_balanced_parentheses_preserved() {
        assert_eq!(
            url_texts("see https://example.com/a_(b)"),
            ["https://example.com/a_(b)"]
        );
    }

    // A host without a dot is only accepted when it is `localhost` or an IP.
    #[test]
    fn test_url_invalid_host_without_dot_rejected() {
        assert!(url_texts("go https://example/path").is_empty());
    }

    #[test]
    fn test_url_embedded_in_word_rejected() {
        assert!(url_texts("abchttps://example.com").is_empty());
    }

    #[test]
    fn test_url_multiple_values_detected() {
        assert_eq!(
            url_texts("a https://example.com b www.example.org"),
            ["https://example.com", "www.example.org"]
        );
    }

    // --- Confidence scoring --------------------------------------------------

    #[test]
    fn test_url_context_boosts_confidence() {
        let with_context = UrlRecognizer.scan("url https://example.com");
        let without_context = UrlRecognizer.scan("value https://example.com");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_url_www_confidence_lower_than_scheme() {
        let scheme = UrlRecognizer.scan("https://example.com");
        let www = UrlRecognizer.scan("www.example.com");
        assert!(www[0].confidence < scheme[0].confidence);
    }

    // --- PII inside query-string values --------------------------------------

    #[test]
    fn test_url_query_email_unencoded_detected() {
        assert_eq!(
            query_email_texts("https://example.com/callback?email=jane@example.com"),
            ["jane@example.com"]
        );
    }

    // The reported text is the still-encoded slice of the input, not the
    // decoded address.
    #[test]
    fn test_url_query_email_percent_encoded_detected() {
        assert_eq!(
            query_email_texts("https://example.com/callback?email=jane%40example.com"),
            ["jane%40example.com"]
        );
    }

    #[test]
    fn test_url_query_ssn_detected() {
        assert_eq!(
            query_ssn_texts("https://example.com/callback?ssn=123-45-6789"),
            ["123-45-6789"]
        );
    }

    #[test]
    fn test_url_query_ssn_supported_locale_is_us() {
        assert_eq!(UrlQuerySsnRecognizer.supported_locales(), &[Locale::US]);
    }

    // --- Integration through the default registry ----------------------------

    // Both the outer URL and the e-mail nested in its query must be reported.
    #[test]
    fn test_url_default_registry_preserves_url_and_query_email() {
        let scanner = default_registry()
            .into_scanner_builder()
            .without_masking()
            .build()
            .unwrap();

        let result = scanner
            .scan("go https://example.com/callback?email=jane@example.com")
            .unwrap();

        assert!(result
            .findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Url));
        assert!(result
            .findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Email
                && finding.recognizer_id == "url_query_email_v1"));
    }

    // With the US locale selected, the query SSN recognizer takes part.
    #[test]
    fn test_url_us_scanner_preserves_url_and_query_ssn() {
        let scanner = default_registry()
            .into_scanner_builder()
            .locale(Locale::US)
            .without_masking()
            .build()
            .unwrap();

        let result = scanner
            .scan("go https://example.com/callback?ssn=123-45-6789")
            .unwrap();

        assert!(result
            .findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Url));
        assert!(result
            .findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Ssn
                && finding.recognizer_id == "url_query_ssn_v1"));
    }

    // With the universal locale selected, no SSN finding may appear.
    #[test]
    fn test_url_universal_scanner_excludes_query_ssn() {
        let scanner = default_registry()
            .into_scanner_builder()
            .locale(Locale::Universal)
            .without_masking()
            .build()
            .unwrap();

        let result = scanner
            .scan("go https://example.com/callback?ssn=123-45-6789")
            .unwrap();

        assert!(result
            .findings
            .iter()
            .all(|finding| finding.entity_type != EntityType::Ssn));
    }

    // The nested e-mail finding must not produce a second replacement inside
    // the already-redacted URL.
    #[test]
    fn test_url_masking_redacts_outer_url_once() {
        let scanner = default_registry()
            .into_scanner_builder()
            .strategy(MaskStrategy::Redact)
            .build()
            .unwrap();

        let result = scanner
            .scan("go https://example.com/callback?email=jane@example.com")
            .unwrap();

        assert_eq!(result.masked_text.as_deref(), Some("go [URL]"));
    }
}