Skip to main content

cloakrs_patterns/
url.rs

1use crate::common::{compile_regex, confidence, context_boost};
2use crate::{EmailRecognizer, SsnRecognizer};
3use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashSet;
7use std::net::IpAddr;
8
9static URL_REGEX: Lazy<Regex> =
10    Lazy::new(|| compile_regex(r##"(?i)\b(?:https?://|ftp://|www\.)[^\s<>"'`{}|\\^\[\]]+"##));
11
12static US_LOCALES: &[Locale] = &[Locale::US];
13
14const CONTEXT_WORDS: &[&str] = &[
15    "url", "uri", "link", "website", "endpoint", "callback", "redirect",
16];
17
/// Recognizer for URLs that start with `http://`, `https://`, `ftp://`, or `www.`.
///
/// The type is a stateless unit struct, so it can be used directly as a value.
///
/// # Examples
///
/// ```
/// use cloakrs_core::{EntityType, Recognizer};
/// use cloakrs_patterns::UrlRecognizer;
///
/// let findings = UrlRecognizer.scan("link: https://example.com/path");
/// assert_eq!(findings[0].entity_type, EntityType::Url);
/// assert_eq!(findings[0].text, "https://example.com/path");
/// ```
#[derive(Clone, Copy, Debug, Default)]
pub struct UrlRecognizer;
32
33impl Recognizer for UrlRecognizer {
34    fn id(&self) -> &str {
35        "url_regex_v1"
36    }
37
38    fn entity_type(&self) -> EntityType {
39        EntityType::Url
40    }
41
42    fn supported_locales(&self) -> &[Locale] {
43        &[]
44    }
45
46    fn scan(&self, text: &str) -> Vec<PiiEntity> {
47        find_url_spans(text)
48            .into_iter()
49            .map(|span| {
50                let candidate = &text[span.start..span.end];
51                PiiEntity {
52                    entity_type: self.entity_type(),
53                    span,
54                    text: candidate.to_string(),
55                    confidence: self.compute_confidence(text, span.start, candidate),
56                    recognizer_id: self.id().to_string(),
57                }
58            })
59            .collect()
60    }
61
62    fn validate(&self, candidate: &str) -> bool {
63        validate_url(candidate)
64    }
65}
66
67impl UrlRecognizer {
68    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
69        let base = if has_explicit_scheme(candidate) {
70            0.90
71        } else {
72            0.80
73        };
74        confidence(base + context_boost(text, start, CONTEXT_WORDS))
75    }
76}
77
78pub(crate) struct UrlQueryEmailRecognizer;
79
80impl Recognizer for UrlQueryEmailRecognizer {
81    fn id(&self) -> &str {
82        "url_query_email_v1"
83    }
84
85    fn entity_type(&self) -> EntityType {
86        EntityType::Email
87    }
88
89    fn supported_locales(&self) -> &[Locale] {
90        &[]
91    }
92
93    fn scan(&self, text: &str) -> Vec<PiiEntity> {
94        scan_query_values(text, &EmailRecognizer, self.id())
95    }
96}
97
98pub(crate) struct UrlQuerySsnRecognizer;
99
100impl Recognizer for UrlQuerySsnRecognizer {
101    fn id(&self) -> &str {
102        "url_query_ssn_v1"
103    }
104
105    fn entity_type(&self) -> EntityType {
106        EntityType::Ssn
107    }
108
109    fn supported_locales(&self) -> &[Locale] {
110        US_LOCALES
111    }
112
113    fn scan(&self, text: &str) -> Vec<PiiEntity> {
114        scan_query_values(text, &SsnRecognizer, self.id())
115    }
116}
117
118fn find_url_spans(text: &str) -> Vec<Span> {
119    URL_REGEX
120        .find_iter(text)
121        .filter_map(|matched| {
122            let end = trim_url_end(matched.as_str(), matched.start());
123            (matched.start() < end && validate_url(&text[matched.start()..end]))
124                .then(|| Span::new(matched.start(), end))
125        })
126        .filter(|span| is_url_boundary(text, span.start, span.end))
127        .collect()
128}
129
130fn trim_url_end(candidate: &str, start: usize) -> usize {
131    let mut end = start + candidate.len();
132    let mut value = candidate;
133    while let Some(c) = value.chars().next_back() {
134        let should_trim = matches!(c, '.' | ',' | ';' | ':' | '!' | '?')
135            || (matches!(c, ')' | ']' | '}') && !has_matching_opener(value, c));
136        if !should_trim {
137            break;
138        }
139        end -= c.len_utf8();
140        value = &candidate[..end - start];
141    }
142    end
143}
144
/// Reports whether `value` contains at least as many opening brackets as
/// closing brackets of the kind given by `closer`.
///
/// Any `closer` other than `)`, `]`, or `}` is treated as always matched.
fn has_matching_opener(value: &str, closer: char) -> bool {
    let opener = match closer {
        ')' => '(',
        ']' => '[',
        '}' => '{',
        _ => return true,
    };
    let mut opened = 0usize;
    let mut closed = 0usize;
    for c in value.chars() {
        if c == opener {
            opened += 1;
        } else if c == closer {
            closed += 1;
        }
    }
    opened >= closed
}
154
155fn validate_url(candidate: &str) -> bool {
156    let Some(authority) = authority(candidate) else {
157        return false;
158    };
159    let host = host_from_authority(authority);
160    validate_host(host)
161}
162
/// Returns `true` when `candidate` begins with `http://`, `https://`, or
/// `ftp://`, compared ASCII case-insensitively.
fn has_explicit_scheme(candidate: &str) -> bool {
    ["http://", "https://", "ftp://"].iter().any(|scheme| {
        candidate
            .get(..scheme.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(scheme))
    })
}
174
/// Extracts the authority component (`userinfo@host:port`) of a URL candidate.
///
/// The candidate must either start with `scheme://` or with a case-insensitive
/// `www.` prefix. The authority runs up to the first `/`, `?`, or `#`.
/// Returns `None` when neither prefix applies or the authority is empty.
fn authority(candidate: &str) -> Option<&str> {
    let after_prefix = match candidate.split_once("://") {
        // Only a leading, syntactically valid scheme counts. A `://` that shows
        // up later (e.g. `www.example.com/login?next=app://home`) belongs to
        // the path or query and must not be mistaken for the scheme separator.
        Some((scheme, rest)) if is_url_scheme(scheme) => rest,
        _ => candidate
            .get(..4)
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case("www."))
            .then_some(candidate)?,
    };
    let end = after_prefix
        .find(['/', '?', '#'])
        .unwrap_or(after_prefix.len());
    let authority = &after_prefix[..end];
    (!authority.is_empty()).then_some(authority)
}

/// Checks the RFC 3986 scheme grammar: an ASCII letter followed by ASCII
/// letters, digits, `+`, `-`, or `.`.
fn is_url_scheme(scheme: &str) -> bool {
    let mut chars = scheme.chars();
    chars.next().is_some_and(|c| c.is_ascii_alphabetic())
        && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
}
190
/// Extracts the host from an authority string.
///
/// Everything up to the last `@` is discarded as userinfo. A bracketed host
/// (`[...]`) yields the text between the brackets; if the closing bracket is
/// missing, the remainder is returned unchanged. Otherwise the host is the
/// text before the first `:`.
fn host_from_authority(authority: &str) -> &str {
    let host_port = match authority.rsplit_once('@') {
        Some((_, tail)) => tail,
        None => authority,
    };
    match host_port.strip_prefix('[') {
        Some(bracketed) => match bracketed.split_once(']') {
            Some((literal, _)) => literal,
            None => host_port,
        },
        None => match host_port.split_once(':') {
            Some((name, _)) => name,
            None => host_port,
        },
    }
}
204
205fn validate_host(host: &str) -> bool {
206    if host.eq_ignore_ascii_case("localhost") || host.parse::<IpAddr>().is_ok() {
207        return true;
208    }
209    if host.is_empty() || !host.contains('.') {
210        return false;
211    }
212    host.split('.').all(validate_host_label)
213}
214
/// Checks a single dot-separated host label: non-empty, no leading or trailing
/// hyphen, and only ASCII letters, digits, or hyphens.
fn validate_host_label(label: &str) -> bool {
    if label.is_empty() || label.starts_with('-') || label.ends_with('-') {
        return false;
    }
    // Any non-ASCII character contributes bytes >= 0x80, which fail this test.
    label
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || b == b'-')
}
221
/// Returns `true` when the span `start..end` of `text` is not directly
/// adjacent to an ASCII letter, digit, `_`, or `-` on either side.
///
/// The start or end of `text` counts as a valid boundary.
fn is_url_boundary(text: &str, start: usize, end: usize) -> bool {
    let is_word_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '-');
    let prev = text[..start].chars().next_back();
    let next = text[end..].chars().next();
    !(prev.is_some_and(is_word_char) || next.is_some_and(is_word_char))
}
228
229fn scan_query_values<R>(text: &str, recognizer: &R, recognizer_id: &str) -> Vec<PiiEntity>
230where
231    R: Recognizer,
232{
233    let mut findings = Vec::new();
234    let mut seen = HashSet::new();
235
236    for url_span in find_url_spans(text) {
237        let Some(query_span) = query_span(text, url_span) else {
238            continue;
239        };
240        for value_span in query_value_spans(text, query_span) {
241            scan_query_value(
242                text,
243                value_span,
244                recognizer,
245                recognizer_id,
246                &mut seen,
247                &mut findings,
248            );
249        }
250    }
251
252    findings.sort_by_key(|finding| finding.span.start);
253    findings
254}
255
256fn scan_query_value<R>(
257    text: &str,
258    value_span: Span,
259    recognizer: &R,
260    recognizer_id: &str,
261    seen: &mut HashSet<(EntityType, usize, usize)>,
262    findings: &mut Vec<PiiEntity>,
263) where
264    R: Recognizer,
265{
266    add_query_findings(
267        text,
268        value_span,
269        recognizer.scan(&text[value_span.start..value_span.end]),
270        recognizer_id,
271        seen,
272        findings,
273    );
274
275    let Some(decoded) = percent_decode_with_mapping(&text[value_span.start..value_span.end]) else {
276        return;
277    };
278    if decoded.value == text[value_span.start..value_span.end] {
279        return;
280    }
281
282    let decoded_findings = recognizer.scan(&decoded.value);
283    for finding in decoded_findings {
284        if finding.span.is_empty() || finding.span.end > decoded.mapping.len() {
285            continue;
286        }
287        let original_start = value_span.start + decoded.mapping[finding.span.start].0;
288        let original_end = value_span.start + decoded.mapping[finding.span.end - 1].1;
289        add_query_findings(
290            text,
291            Span::new(original_start, original_end),
292            vec![PiiEntity {
293                entity_type: finding.entity_type,
294                span: Span::new(0, original_end - original_start),
295                text: text[original_start..original_end].to_string(),
296                confidence: finding.confidence,
297                recognizer_id: finding.recognizer_id,
298            }],
299            recognizer_id,
300            seen,
301            findings,
302        );
303    }
304}
305
306fn add_query_findings(
307    text: &str,
308    offset: Span,
309    local_findings: Vec<PiiEntity>,
310    recognizer_id: &str,
311    seen: &mut HashSet<(EntityType, usize, usize)>,
312    findings: &mut Vec<PiiEntity>,
313) {
314    for finding in local_findings {
315        let span = Span::new(
316            offset.start + finding.span.start,
317            offset.start + finding.span.end,
318        );
319        if span.end > text.len()
320            || !seen.insert((finding.entity_type.clone(), span.start, span.end))
321        {
322            continue;
323        }
324        findings.push(PiiEntity {
325            entity_type: finding.entity_type,
326            span,
327            text: text[span.start..span.end].to_string(),
328            confidence: finding.confidence,
329            recognizer_id: recognizer_id.to_string(),
330        });
331    }
332}
333
334fn query_span(text: &str, url_span: Span) -> Option<Span> {
335    let url = &text[url_span.start..url_span.end];
336    let query_start = url.find('?')? + 1;
337    let query_end = url[query_start..]
338        .find('#')
339        .map_or(url.len(), |fragment| query_start + fragment);
340    (query_start < query_end)
341        .then(|| Span::new(url_span.start + query_start, url_span.start + query_end))
342}
343
344fn query_value_spans(text: &str, query_span: Span) -> Vec<Span> {
345    let mut spans = Vec::new();
346    let mut parameter_start = query_span.start;
347    let query = &text[query_span.start..query_span.end];
348
349    for (offset, c) in query.char_indices() {
350        if matches!(c, '&' | ';') {
351            push_query_value_span(text, parameter_start, query_span.start + offset, &mut spans);
352            parameter_start = query_span.start + offset + c.len_utf8();
353        }
354    }
355    push_query_value_span(text, parameter_start, query_span.end, &mut spans);
356
357    spans
358}
359
360fn push_query_value_span(text: &str, start: usize, end: usize, spans: &mut Vec<Span>) {
361    if start >= end {
362        return;
363    }
364    let parameter = &text[start..end];
365    if let Some(eq) = parameter.find('=') {
366        let value_start = start + eq + 1;
367        if value_start < end {
368            spans.push(Span::new(value_start, end));
369        }
370    }
371}
372
/// Result of percent-decoding a query value, with enough bookkeeping to map
/// positions in the decoded text back to the encoded source.
struct DecodedValue {
    /// The decoded text.
    value: String,
    /// One `(start, end)` entry per decoded byte: the byte range of the
    /// encoded input that produced it.
    mapping: Vec<(usize, usize)>,
}
377
378fn percent_decode_with_mapping(value: &str) -> Option<DecodedValue> {
379    let bytes = value.as_bytes();
380    let mut decoded = Vec::with_capacity(bytes.len());
381    let mut mapping = Vec::with_capacity(bytes.len());
382    let mut index = 0;
383
384    while index < bytes.len() {
385        if bytes[index] == b'%' && index + 2 < bytes.len() {
386            if let (Some(high), Some(low)) =
387                (hex_value(bytes[index + 1]), hex_value(bytes[index + 2]))
388            {
389                decoded.push(high * 16 + low);
390                mapping.push((index, index + 3));
391                index += 3;
392                continue;
393            }
394        }
395
396        decoded.push(if bytes[index] == b'+' {
397            b' '
398        } else {
399            bytes[index]
400        });
401        mapping.push((index, index + 1));
402        index += 1;
403    }
404
405    String::from_utf8(decoded)
406        .ok()
407        .map(|value| DecodedValue { value, mapping })
408}
409
/// Converts an ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value,
/// or returns `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    char::from(byte)
        .to_digit(16)
        // A base-16 digit is always below 16, so the cast cannot truncate.
        .map(|digit| digit as u8)
}
418
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;
    use cloakrs_core::MaskStrategy;

    /// Input shared by the registry tests that look for an e-mail in a query.
    const QUERY_EMAIL_INPUT: &str = "go https://example.com/callback?email=jane@example.com";
    /// Input shared by the registry tests that look for an SSN in a query.
    const QUERY_SSN_INPUT: &str = "go https://example.com/callback?ssn=123-45-6789";

    /// Scans `input` with `recognizer` and keeps only the matched texts.
    fn matched_texts<R: Recognizer>(recognizer: &R, input: &str) -> Vec<String> {
        let mut texts = Vec::new();
        for finding in recognizer.scan(input) {
            texts.push(finding.text);
        }
        texts
    }

    fn url_texts(input: &str) -> Vec<String> {
        matched_texts(&UrlRecognizer, input)
    }

    fn query_email_texts(input: &str) -> Vec<String> {
        matched_texts(&UrlQueryEmailRecognizer, input)
    }

    fn query_ssn_texts(input: &str) -> Vec<String> {
        matched_texts(&UrlQuerySsnRecognizer, input)
    }

    #[test]
    fn test_url_http_detected() {
        let found = url_texts("link http://example.com");
        assert_eq!(found, ["http://example.com"]);
    }

    #[test]
    fn test_url_https_path_query_fragment_detected() {
        let found = url_texts("visit https://example.com/a/b?x=1#top");
        assert_eq!(found, ["https://example.com/a/b?x=1#top"]);
    }

    #[test]
    fn test_url_www_detected() {
        let found = url_texts("open www.example.com/docs");
        assert_eq!(found, ["www.example.com/docs"]);
    }

    #[test]
    fn test_url_localhost_port_detected() {
        let found = url_texts("endpoint http://localhost:8080/health");
        assert_eq!(found, ["http://localhost:8080/health"]);
    }

    #[test]
    fn test_url_ip_host_detected() {
        let found = url_texts("endpoint http://203.0.113.42/api");
        assert_eq!(found, ["http://203.0.113.42/api"]);
    }

    #[test]
    fn test_url_trailing_punctuation_excluded() {
        let found = url_texts("see https://example.com/path.");
        assert_eq!(found, ["https://example.com/path"]);
    }

    #[test]
    fn test_url_balanced_parentheses_preserved() {
        let found = url_texts("see https://example.com/a_(b)");
        assert_eq!(found, ["https://example.com/a_(b)"]);
    }

    #[test]
    fn test_url_invalid_host_without_dot_rejected() {
        let found = url_texts("go https://example/path");
        assert!(found.is_empty());
    }

    #[test]
    fn test_url_embedded_in_word_rejected() {
        let found = url_texts("abchttps://example.com");
        assert!(found.is_empty());
    }

    #[test]
    fn test_url_multiple_values_detected() {
        let found = url_texts("a https://example.com b www.example.org");
        assert_eq!(found, ["https://example.com", "www.example.org"]);
    }

    #[test]
    fn test_url_context_boosts_confidence() {
        let boosted = UrlRecognizer.scan("url https://example.com");
        let plain = UrlRecognizer.scan("value https://example.com");
        assert!(boosted[0].confidence > plain[0].confidence);
    }

    #[test]
    fn test_url_www_confidence_lower_than_scheme() {
        let with_scheme = UrlRecognizer.scan("https://example.com");
        let bare_www = UrlRecognizer.scan("www.example.com");
        assert!(bare_www[0].confidence < with_scheme[0].confidence);
    }

    #[test]
    fn test_url_query_email_unencoded_detected() {
        let found = query_email_texts("https://example.com/callback?email=jane@example.com");
        assert_eq!(found, ["jane@example.com"]);
    }

    #[test]
    fn test_url_query_email_percent_encoded_detected() {
        // The reported text is the encoded form as it appears in the input.
        let found = query_email_texts("https://example.com/callback?email=jane%40example.com");
        assert_eq!(found, ["jane%40example.com"]);
    }

    #[test]
    fn test_url_query_ssn_detected() {
        let found = query_ssn_texts("https://example.com/callback?ssn=123-45-6789");
        assert_eq!(found, ["123-45-6789"]);
    }

    #[test]
    fn test_url_query_ssn_supported_locale_is_us() {
        assert_eq!(UrlQuerySsnRecognizer.supported_locales(), &[Locale::US]);
    }

    #[test]
    fn test_url_default_registry_preserves_url_and_query_email() {
        let scanner = default_registry()
            .into_scanner_builder()
            .without_masking()
            .build()
            .unwrap();
        let result = scanner.scan(QUERY_EMAIL_INPUT).unwrap();

        let has_url = result
            .findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Url);
        let has_query_email = result.findings.iter().any(|finding| {
            finding.entity_type == EntityType::Email
                && finding.recognizer_id == "url_query_email_v1"
        });

        assert!(has_url);
        assert!(has_query_email);
    }

    #[test]
    fn test_url_us_scanner_preserves_url_and_query_ssn() {
        let scanner = default_registry()
            .into_scanner_builder()
            .locale(Locale::US)
            .without_masking()
            .build()
            .unwrap();
        let result = scanner.scan(QUERY_SSN_INPUT).unwrap();

        let has_url = result
            .findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Url);
        let has_query_ssn = result.findings.iter().any(|finding| {
            finding.entity_type == EntityType::Ssn && finding.recognizer_id == "url_query_ssn_v1"
        });

        assert!(has_url);
        assert!(has_query_ssn);
    }

    #[test]
    fn test_url_universal_scanner_excludes_query_ssn() {
        let scanner = default_registry()
            .into_scanner_builder()
            .locale(Locale::Universal)
            .without_masking()
            .build()
            .unwrap();
        let result = scanner.scan(QUERY_SSN_INPUT).unwrap();

        let no_ssn = result
            .findings
            .iter()
            .all(|finding| finding.entity_type != EntityType::Ssn);
        assert!(no_ssn);
    }

    #[test]
    fn test_url_masking_redacts_outer_url_once() {
        let scanner = default_registry()
            .into_scanner_builder()
            .strategy(MaskStrategy::Redact)
            .build()
            .unwrap();
        let result = scanner.scan(QUERY_EMAIL_INPUT).unwrap();

        assert_eq!(result.masked_text.as_deref(), Some("go [URL]"));
    }
}