Skip to main content

cloakrs_patterns/
hostname.rs

1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
/// Pattern for dotted hostname candidates, matched case-insensitively (`(?i)`).
///
/// A match is two or more dot-separated labels delimited by `\b` on both
/// sides. Each label is 1–63 characters drawn from `[a-z0-9-]` and neither
/// starts nor ends with a hyphen. The pattern only proposes candidates; they
/// are filtered further before being reported.
static DOMAIN_HOSTNAME_REGEX: Lazy<Regex> = Lazy::new(|| {
    compile_regex(
        r"(?i)\b[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?){1,}\b",
    )
});
12
/// Pattern for Windows-style machine names: one of the fixed prefixes
/// `DESKTOP`, `LAPTOP`, `WIN`, `PC`, or `WORKSTATION`, a hyphen, then 6–10
/// uppercase ASCII letters or digits. Case-sensitive (no `(?i)` flag) and
/// delimited by `\b` on both sides.
static WINDOWS_HOSTNAME_REGEX: Lazy<Regex> =
    Lazy::new(|| compile_regex(r"\b(?:DESKTOP|LAPTOP|WIN|PC|WORKSTATION)-[A-Z0-9]{6,10}\b"));
15
/// Words and phrases handed to `context_boost` when scoring a finding.
///
/// The unit tests show that text such as `connecting to` or `host:` ahead of
/// a match raises its confidence; how the surrounding text is searched is
/// decided by `context_boost`, which is defined outside this file.
const CONTEXT_WORDS: &[&str] = &[
    "host",
    "hostname",
    "server",
    "node",
    "instance",
    "machine",
    "connecting to",
    "connected to",
    "resolved",
    "dns",
    "nslookup",
    "ping",
    "ssh",
    "host:",
    "server_name",
    "server_addr",
    "remote_host",
    "upstream",
];
36
/// Labels that mark a name as internal when any single dot-separated label of
/// the candidate equals one of them (compared after ASCII lowercasing).
const INTERNAL_LABELS: &[&str] = &[
    "internal", "local", "lan", "corp", "private", "intranet", "k8s", "svc", "cluster",
];
40
/// Infrastructure role names. A label counts as an infra label when it equals
/// one of these, or starts with one followed directly by `-` or an ASCII digit
/// (for example `db-replica-3` or `web01`); see `has_infra_label`.
const INFRA_LABEL_PREFIXES: &[&str] = &[
    "server", "node", "worker", "db", "redis", "cache", "web", "app", "api", "proxy", "lb",
    "queue", "staging", "prod", "dev", "test", "ip",
];
45
/// Label sequences that are compared against the trailing labels of a
/// candidate (after ASCII lowercasing) by `has_cloud_suffix`. Each inner slice
/// is one suffix, listed left to right as it appears in the hostname.
const CLOUD_SUFFIXES: &[&[&str]] = &[
    &["ec2", "internal"],
    &["compute", "internal"],
    &["k8s", "local"],
    &["svc", "cluster", "local"],
    &["rds", "amazonaws", "com"],
    &["cloudfront", "net"],
    &["elasticbeanstalk", "com"],
];
55
/// Labels that, when they appear as the FIRST label of a candidate, cause it
/// to be rejected as a reversed domain (e.g. `com.google.android.app`).
const PUBLIC_FIRST_LABELS: &[&str] = &["com", "org", "net", "io", "edu", "gov"];
57
/// Recognizes internal hostnames and machine names that can leak infrastructure.
///
/// Candidates come from two patterns: dotted hostnames and Windows-style
/// machine names such as `DESKTOP-A1B2C3D`. A dotted hostname is only reported
/// when it carries an internal label, a known cloud suffix, or an
/// infrastructure-style label, so plain public domains like `google.com` are
/// not flagged.
///
/// # Examples
///
/// ```
/// use cloakrs_core::{EntityType, Recognizer};
/// use cloakrs_patterns::HostnameRecognizer;
///
/// let findings = HostnameRecognizer.scan("connecting to db-prod-01.internal.company.com");
/// assert_eq!(findings[0].entity_type, EntityType::Hostname);
/// assert_eq!(findings[0].text, "db-prod-01.internal.company.com");
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct HostnameRecognizer;
72
73impl Recognizer for HostnameRecognizer {
74    fn id(&self) -> &str {
75        "hostname_infra_v1"
76    }
77
78    fn entity_type(&self) -> EntityType {
79        EntityType::Hostname
80    }
81
82    fn supported_locales(&self) -> &[Locale] {
83        &[]
84    }
85
86    fn scan(&self, text: &str) -> Vec<PiiEntity> {
87        let mut seen = HashSet::new();
88        let mut findings = Vec::new();
89
90        for matched in DOMAIN_HOSTNAME_REGEX.find_iter(text) {
91            let start = matched.start();
92            let end = matched.end();
93            if seen.insert((start, end)) && self.is_valid_domain_match(text, start, end) {
94                findings.push(self.finding(text, Span::new(start, end)));
95            }
96        }
97
98        for matched in WINDOWS_HOSTNAME_REGEX.find_iter(text) {
99            let start = matched.start();
100            let end = matched.end();
101            if seen.insert((start, end)) && is_boundary(text, start, end) {
102                findings.push(self.finding(text, Span::new(start, end)));
103            }
104        }
105
106        findings.sort_by_key(|finding| finding.span.start);
107        findings
108    }
109
110    fn validate(&self, candidate: &str) -> bool {
111        is_windows_hostname(candidate) || validate_domain_hostname(candidate)
112    }
113}
114
115impl HostnameRecognizer {
116    fn finding(&self, text: &str, span: Span) -> PiiEntity {
117        let candidate = &text[span.start..span.end];
118        PiiEntity {
119            entity_type: self.entity_type(),
120            span,
121            text: candidate.to_string(),
122            confidence: self.compute_confidence(text, span.start, candidate),
123            recognizer_id: self.id().to_string(),
124        }
125    }
126
127    fn is_valid_domain_match(&self, text: &str, start: usize, end: usize) -> bool {
128        let candidate = &text[start..end];
129        validate_domain_hostname(candidate)
130            && is_boundary(text, start, end)
131            && !is_email_domain(text, start)
132            && !is_url_host(text, start, candidate)
133    }
134
135    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
136        let base = if is_windows_hostname(candidate) || has_internal_marker(candidate) {
137            0.85
138        } else if has_cloud_suffix(candidate) || has_infra_label(candidate) {
139            0.80
140        } else {
141            0.50
142        };
143        confidence(base + context_boost(text, start, CONTEXT_WORDS))
144    }
145}
146
147fn validate_domain_hostname(candidate: &str) -> bool {
148    let labels: Vec<&str> = candidate.split('.').collect();
149    labels.len() >= 2
150        && labels.iter().all(|label| validate_label(label))
151        && !is_reversed_domain(&labels)
152        && (has_internal_marker(candidate)
153            || has_cloud_suffix(candidate)
154            || has_infra_label(candidate))
155}
156
/// Returns `true` for a single hostname label: 1–63 bytes long, made only of
/// ASCII letters, digits, and hyphens, with no hyphen at either end.
fn validate_label(label: &str) -> bool {
    if label.is_empty() || label.len() > 63 {
        return false;
    }
    if label.starts_with('-') || label.ends_with('-') {
        return false;
    }
    label
        .chars()
        .all(|ch| ch == '-' || ch.is_ascii_alphanumeric())
}
164
165fn is_windows_hostname(candidate: &str) -> bool {
166    WINDOWS_HOSTNAME_REGEX
167        .find(candidate)
168        .is_some_and(|matched| matched.start() == 0 && matched.end() == candidate.len())
169}
170
171fn has_internal_marker(candidate: &str) -> bool {
172    candidate
173        .split('.')
174        .any(|label| INTERNAL_LABELS.contains(&label.to_ascii_lowercase().as_str()))
175}
176
177fn has_cloud_suffix(candidate: &str) -> bool {
178    let labels: Vec<String> = candidate.split('.').map(str::to_ascii_lowercase).collect();
179    CLOUD_SUFFIXES.iter().any(|suffix| {
180        labels.len() >= suffix.len()
181            && labels[labels.len() - suffix.len()..]
182                .iter()
183                .zip(suffix.iter())
184                .all(|(left, right)| left == right)
185    })
186}
187
188fn has_infra_label(candidate: &str) -> bool {
189    candidate.split('.').any(|label| {
190        let lower = label.to_ascii_lowercase();
191        INFRA_LABEL_PREFIXES.iter().any(|prefix| {
192            lower == *prefix
193                || lower.strip_prefix(prefix).is_some_and(|rest| {
194                    rest.starts_with('-') || rest.chars().next().is_some_and(|c| c.is_ascii_digit())
195                })
196        })
197    })
198}
199
200fn is_reversed_domain(labels: &[&str]) -> bool {
201    labels
202        .first()
203        .is_some_and(|label| PUBLIC_FIRST_LABELS.contains(&label.to_ascii_lowercase().as_str()))
204}
205
/// Returns `true` when the character immediately before byte offset `start`
/// in `text` is `@`, i.e. the match sits in the domain position of an email.
fn is_email_domain(text: &str, start: usize) -> bool {
    let preceding = text[..start].chars().next_back();
    preceding == Some('@')
}
209
/// Returns `true` when `candidate` begins with `www.` (any ASCII case) or the
/// text directly before byte offset `start` ends with `://`.
fn is_url_host(text: &str, start: usize, candidate: &str) -> bool {
    let starts_with_www = match candidate.get(..4) {
        Some(head) => head.eq_ignore_ascii_case("www."),
        None => false,
    };
    if starts_with_www {
        return true;
    }
    text[..start].ends_with("://")
}
216
/// Returns `true` when `text[start..end]` is not embedded in a longer
/// hostname-like token.
///
/// The character before the match must not be a hostname continuation
/// character. The character after the match must not be one either, with one
/// exception: a `.` only counts as a continuation when another label
/// character follows it. A dot followed by whitespace, end of input, or more
/// punctuation is treated as sentence punctuation (or the trailing root dot
/// of a fully qualified name), so `"ssh db-prod.internal."` still yields
/// `db-prod.internal`.
fn is_boundary(text: &str, start: usize, end: usize) -> bool {
    let before = text[..start].chars().next_back();
    if before.is_some_and(is_hostname_continuation) {
        return false;
    }

    let mut following = text[end..].chars();
    match following.next() {
        // Look one character past the dot to tell `host.next-label` apart
        // from a hostname that merely ends a sentence.
        Some('.') => !following
            .next()
            .is_some_and(|c| c != '.' && is_hostname_continuation(c)),
        Some(c) => !is_hostname_continuation(c),
        None => true,
    }
}

/// Returns `true` for characters that can extend a hostname-like token:
/// ASCII letters and digits, `_`, `-`, and `.`.
fn is_hostname_continuation(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '.')
}
226
// Unit tests for `HostnameRecognizer`: positive detections, rejections of
// look-alike text, confidence boosting from context, and registry wiring.
#[cfg(test)]
mod tests {
    use super::*;
    use cloakrs_core::RecognizerRegistry;

    /// Runs the recognizer over `input` and returns just the matched strings,
    /// in the order `scan` reports them.
    fn texts(input: &str) -> Vec<String> {
        HostnameRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    // --- Hostnames that must be detected ---

    #[test]
    fn test_hostname_internal_company_host_detected() {
        assert_eq!(
            texts("ssh server-prod-01.internal.company.com"),
            ["server-prod-01.internal.company.com"]
        );
    }

    #[test]
    fn test_hostname_rds_cloud_host_detected() {
        assert_eq!(
            texts("server db-replica-3.eu-west-1.rds.amazonaws.com"),
            ["db-replica-3.eu-west-1.rds.amazonaws.com"]
        );
    }

    #[test]
    fn test_hostname_ec2_internal_host_detected() {
        assert_eq!(
            texts("resolved ip-172-31-16-58.ec2.internal"),
            ["ip-172-31-16-58.ec2.internal"]
        );
    }

    #[test]
    fn test_hostname_k8s_local_host_detected() {
        assert_eq!(
            texts("node worker-node-7.k8s.local"),
            ["worker-node-7.k8s.local"]
        );
    }

    #[test]
    fn test_hostname_infra_context_public_tld_detected() {
        assert_eq!(
            texts("host redis-cache-01.staging.myapp.io"),
            ["redis-cache-01.staging.myapp.io"]
        );
    }

    #[test]
    fn test_hostname_windows_machine_detected() {
        assert_eq!(texts("machine DESKTOP-A1B2C3D"), ["DESKTOP-A1B2C3D"]);
    }

    #[test]
    fn test_hostname_mdns_machine_detected() {
        assert_eq!(
            texts("ping macbook-pro-kadir.local"),
            ["macbook-pro-kadir.local"]
        );
    }

    #[test]
    fn test_hostname_corp_short_host_detected() {
        assert_eq!(texts("upstream api-gateway.corp"), ["api-gateway.corp"]);
    }

    // --- Look-alike text that must NOT be reported ---

    #[test]
    fn test_hostname_public_google_domain_rejected() {
        assert!(texts("open google.com").is_empty());
    }

    #[test]
    fn test_hostname_public_github_domain_rejected() {
        // A context word alone ("host") is not enough without a marker label.
        assert!(texts("host github.com").is_empty());
    }

    #[test]
    fn test_hostname_email_domain_rejected() {
        assert!(texts("mail user@company.com").is_empty());
    }

    #[test]
    fn test_hostname_url_host_rejected() {
        assert!(texts("url https://example.com/path").is_empty());
    }

    #[test]
    fn test_hostname_localhost_rejected() {
        // Single-label names never match the dotted pattern.
        assert!(texts("host localhost").is_empty());
    }

    #[test]
    fn test_hostname_reversed_domain_rejected() {
        assert!(texts("package com.google.android.app").is_empty());
    }

    #[test]
    fn test_hostname_single_project_name_rejected() {
        assert!(texts("repo my-cool-project").is_empty());
    }

    // --- Context words raise confidence ---

    #[test]
    fn test_hostname_connecting_context_boosts_confidence() {
        let with_context = HostnameRecognizer.scan("connecting to db-prod.internal.myco.com");
        let without_context = HostnameRecognizer.scan("value db-prod.internal.myco.com");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_hostname_host_prefix_context_boosts_confidence() {
        let with_context = HostnameRecognizer.scan("host: web-01.corp");
        let without_context = HostnameRecognizer.scan("value web-01.corp");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    // --- Trait surface and registry integration ---

    #[test]
    fn test_hostname_supported_locales_are_universal() {
        assert!(HostnameRecognizer.supported_locales().is_empty());
    }

    #[test]
    fn test_hostname_registry_integration_detects_default_recognizer() {
        let mut registry = RecognizerRegistry::new();
        crate::register_default_recognizers(&mut registry);

        let findings = registry.scan_all("remote_host=db-prod.internal.myco.com");

        assert!(findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Hostname));
    }
}