Skip to main content

cloakrs_patterns/
ip_address.rs

1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6use std::net::IpAddr;
7
8static IPV4_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r"(?:\d{1,3}\.){3}\d{1,3}"));
9static IPV6_REGEX: Lazy<Regex> =
10    Lazy::new(|| compile_regex(r"[0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,7}"));
11
/// Keywords handed to `context_boost` when scoring a match (see
/// `compute_confidence`). The boost is added on top of the per-family base
/// confidence.
// NOTE(review): how `context_boost` matches these (case sensitivity, search
// window around the match) is defined in `crate::common` — confirm there.
const CONTEXT_WORDS: &[&str] = &[
    "ip",
    "ip address",
    "remote_addr",
    "client_ip",
    "source ip",
    "destination ip",
    "host",
    "server",
];
22
/// Recognizer for IPv4 and IPv6 addresses embedded in free text.
///
/// The type is a stateless unit struct. Candidates are located with two
/// loose regular expressions and then confirmed by parsing them as
/// [`std::net::IpAddr`], so out-of-range octets and malformed IPv6 shapes are
/// discarded.
///
/// # Examples
///
/// ```
/// use cloakrs_core::{EntityType, Recognizer};
/// use cloakrs_patterns::IpAddressRecognizer;
///
/// let findings = IpAddressRecognizer.scan("client_ip=203.0.113.42");
/// assert_eq!(findings[0].entity_type, EntityType::IpAddress);
/// assert_eq!(findings[0].text, "203.0.113.42");
/// ```
#[derive(Clone, Copy, Debug, Default)]
pub struct IpAddressRecognizer;
37
38impl Recognizer for IpAddressRecognizer {
39    fn id(&self) -> &str {
40        "ip_address_std_v1"
41    }
42
43    fn entity_type(&self) -> EntityType {
44        EntityType::IpAddress
45    }
46
47    fn supported_locales(&self) -> &[Locale] {
48        &[]
49    }
50
51    fn scan(&self, text: &str) -> Vec<PiiEntity> {
52        let mut seen = HashSet::new();
53        let mut findings = Vec::new();
54
55        for regex in [&*IPV4_REGEX, &*IPV6_REGEX] {
56            for matched in regex.find_iter(text) {
57                if seen.insert((matched.start(), matched.end()))
58                    && self.is_valid_match(text, matched.start(), matched.end())
59                {
60                    findings.push(PiiEntity {
61                        entity_type: self.entity_type(),
62                        span: Span::new(matched.start(), matched.end()),
63                        text: matched.as_str().to_string(),
64                        confidence: self.compute_confidence(
65                            text,
66                            matched.start(),
67                            matched.as_str(),
68                        ),
69                        recognizer_id: self.id().to_string(),
70                    });
71                }
72            }
73        }
74
75        findings.sort_by_key(|finding| finding.span.start);
76        findings
77    }
78
79    fn validate(&self, candidate: &str) -> bool {
80        parse_ip(candidate).is_some()
81    }
82}
83
84impl IpAddressRecognizer {
85    fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
86        self.validate(&text[start..end]) && is_ip_boundary(text, start, end)
87    }
88
89    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
90        let base = match parse_ip(candidate) {
91            Some(IpAddr::V4(_)) => 0.85,
92            Some(IpAddr::V6(_)) => 0.90,
93            None => 0.0,
94        };
95        confidence(base + context_boost(text, start, CONTEXT_WORDS))
96    }
97}
98
/// Parses `candidate` as an IPv4 or IPv6 address.
///
/// The bare string `"::"` is rejected even though the standard library
/// parses it (as the unspecified IPv6 address): it is pure punctuation and
/// is not reported as a finding.
fn parse_ip(candidate: &str) -> Option<IpAddr> {
    match candidate {
        "::" => None,
        other => other.parse::<IpAddr>().ok(),
    }
}
105
/// Reports whether the candidate at `text[start..end]` stands on its own
/// instead of being a fragment of a longer token.
///
/// * The character before the candidate must not be a continuation
///   character (see [`is_ip_continuation`]).
/// * The character after the candidate must not be one either, with one
///   exception: a single trailing `.` or `:` is treated as ordinary
///   punctuation when it is followed by the end of the text or by a
///   non-continuation character. This keeps addresses that end a sentence or
///   clause (`"reachable at 192.168.1.1."`, `"10.0.0.1: refused"`) from being
///   dropped, while `192.168.1.1.5`, `10.0.0.1:8080` and `10.0.0.1...` are
///   still rejected because more token text follows the separator.
///
/// `start` and `end` must lie on UTF-8 character boundaries of `text`;
/// slicing panics otherwise.
fn is_ip_boundary(text: &str, start: usize, end: usize) -> bool {
    let before = text[..start].chars().next_back();
    if before.is_some_and(is_ip_continuation) {
        return false;
    }

    let mut following = text[end..].chars();
    match following.next() {
        None => true,
        // A separator only extends the token if something token-like follows it.
        Some('.' | ':') => !following.next().is_some_and(is_ip_continuation),
        Some(c) => !is_ip_continuation(c),
    }
}

/// Returns `true` for characters that, when adjacent to a candidate, indicate
/// it is part of a longer token: ASCII letters and digits, plus `_`, `-`,
/// `.`, `:` and `%`.
fn is_ip_continuation(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '.' | ':' | '%')
}
115
// Unit tests for `IpAddressRecognizer`: detection of valid addresses,
// rejection of malformed or embedded candidates, confidence boosting from
// context words, and registration in the default recognizer set.
#[cfg(test)]
mod tests {
    use super::*;
    use cloakrs_core::RecognizerRegistry;

    /// Runs the recognizer over `input` and returns only the matched strings,
    /// in the order `scan` reports them.
    fn texts(input: &str) -> Vec<String> {
        IpAddressRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    // --- IPv4: accepted forms ---

    #[test]
    fn test_ip_address_ipv4_public_detected() {
        assert_eq!(texts("client_ip=203.0.113.42"), ["203.0.113.42"]);
    }

    #[test]
    fn test_ip_address_ipv4_private_detected() {
        assert_eq!(texts("host 192.168.1.105"), ["192.168.1.105"]);
    }

    #[test]
    fn test_ip_address_ipv4_loopback_detected() {
        assert_eq!(texts("server 127.0.0.1"), ["127.0.0.1"]);
    }

    #[test]
    fn test_ip_address_ipv4_zero_octets_detected() {
        assert_eq!(texts("remote 0.0.0.0"), ["0.0.0.0"]);
    }

    // --- IPv6: accepted forms ---

    #[test]
    fn test_ip_address_ipv6_full_detected() {
        assert_eq!(
            texts("addr 2001:0db8:85a3:0000:0000:8a2e:0370:7334"),
            ["2001:0db8:85a3:0000:0000:8a2e:0370:7334"]
        );
    }

    #[test]
    fn test_ip_address_ipv6_compressed_detected() {
        assert_eq!(texts("source ip 2001:db8::1"), ["2001:db8::1"]);
    }

    #[test]
    fn test_ip_address_ipv6_loopback_detected() {
        assert_eq!(texts("host ::1"), ["::1"]);
    }

    // --- Candidates that match the loose regex but fail validation ---

    #[test]
    fn test_ip_address_ipv4_octet_above_range_rejected() {
        assert!(texts("client_ip=256.168.1.1").is_empty());
    }

    #[test]
    fn test_ip_address_ipv4_partial_octet_range_rejected() {
        assert!(texts("client_ip=999.10.10.10").is_empty());
    }

    #[test]
    fn test_ip_address_ipv6_invalid_shape_rejected() {
        assert!(texts("addr 2001:::1").is_empty());
    }

    // A bare "::" is explicitly refused by `parse_ip`.
    #[test]
    fn test_ip_address_ipv6_punctuation_only_rejected() {
        assert!(texts("addr ::").is_empty());
    }

    // --- Candidates rejected by the boundary check ---

    #[test]
    fn test_ip_address_ipv4_embedded_in_larger_dot_sequence_rejected() {
        assert!(texts("version 192.168.1.1.5").is_empty());
    }

    #[test]
    fn test_ip_address_ipv4_embedded_in_word_rejected() {
        assert!(texts("id192.168.1.1").is_empty());
    }

    #[test]
    fn test_ip_address_ipv6_embedded_in_larger_colon_sequence_rejected() {
        assert!(texts("addr x2001:db8::1").is_empty());
    }

    // --- Multiple findings and ordering ---

    // Findings from both patterns come back ordered by position in the input.
    #[test]
    fn test_ip_address_multiple_findings_detected() {
        assert_eq!(
            texts("client_ip=203.0.113.42 server 2001:db8::5"),
            ["203.0.113.42", "2001:db8::5"]
        );
    }

    // --- Confidence scoring ---

    #[test]
    fn test_ip_address_context_boosts_ipv4_confidence() {
        let with_context = IpAddressRecognizer.scan("client_ip=203.0.113.42");
        let without_context = IpAddressRecognizer.scan("value 203.0.113.42");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_ip_address_context_boosts_ipv6_confidence() {
        let with_context = IpAddressRecognizer.scan("destination ip 2001:db8::1");
        let without_context = IpAddressRecognizer.scan("value 2001:db8::1");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    // --- Trait surface ---

    #[test]
    fn test_ip_address_supported_locales_are_universal() {
        assert!(IpAddressRecognizer.supported_locales().is_empty());
    }

    #[test]
    fn test_ip_address_validate_accepts_valid_ipv4() {
        assert!(IpAddressRecognizer.validate("203.0.113.42"));
    }

    #[test]
    fn test_ip_address_validate_rejects_invalid_ipv6() {
        assert!(!IpAddressRecognizer.validate("2001:::1"));
    }

    // --- Registry integration ---

    // Checks that the crate's default registration includes a recognizer
    // that reports `EntityType::IpAddress` for a plain IPv4 address.
    #[test]
    fn test_ip_address_registry_integration_detects_default_recognizer() {
        let mut registry = RecognizerRegistry::new();
        crate::register_default_recognizers(&mut registry);

        let findings = registry.scan_all("remote_addr=203.0.113.42");

        assert!(findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::IpAddress));
    }
}
250}