1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7static DOMAIN_HOSTNAME_REGEX: Lazy<Regex> = Lazy::new(|| {
8 compile_regex(
9 r"(?i)\b[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?){1,}\b",
10 )
11});
12
13static WINDOWS_HOSTNAME_REGEX: Lazy<Regex> =
14 Lazy::new(|| compile_regex(r"\b(?:DESKTOP|LAPTOP|WIN|PC|WORKSTATION)-[A-Z0-9]{6,10}\b"));
15
/// Words and phrases handed to `context_boost` when scoring a finding.
///
/// The entries are kept in their original order; only the layout groups them.
const CONTEXT_WORDS: &[&str] = &[
    // Nouns naming a machine or service endpoint.
    "host", "hostname", "server", "node", "instance", "machine",
    // Connection phrases.
    "connecting to", "connected to",
    // Name-resolution and remote-access vocabulary.
    "resolved", "dns", "nslookup", "ping", "ssh",
    // Key-like tokens (they look like header / configuration field names).
    "host:", "server_name", "server_addr", "remote_host", "upstream",
];
36
/// Labels that mark a hostname as internal when they appear anywhere in it.
///
/// Compared against lowercased labels, so every entry must stay lowercase.
const INTERNAL_LABELS: &[&str] = &[
    "internal",
    "local",
    "lan",
    "corp",
    "private",
    "intranet",
    "k8s",
    "svc",
    "cluster",
];
40
/// Label prefixes that suggest an infrastructure machine.
///
/// A label qualifies when it equals one of these entries, or starts with one
/// followed directly by `-` or a digit (see `has_infra_label`). Entries must
/// stay lowercase because labels are lowercased before comparison.
const INFRA_LABEL_PREFIXES: &[&str] = &[
    "server",
    "node",
    "worker",
    "db",
    "redis",
    "cache",
    "web",
    "app",
    "api",
    "proxy",
    "lb",
    "queue",
    "staging",
    "prod",
    "dev",
    "test",
    "ip",
];
45
/// Trailing label sequences recognised as cloud / cluster hostname suffixes.
///
/// Each entry lists the final labels of a hostname in left-to-right order,
/// e.g. `["svc", "cluster", "local"]` stands for `*.svc.cluster.local`.
/// Entries must stay lowercase because labels are lowercased before comparison.
const CLOUD_SUFFIXES: &[&[&str]] = &[
    // *.internal
    &["ec2", "internal"],
    &["compute", "internal"],
    // *.local
    &["k8s", "local"],
    &["svc", "cluster", "local"],
    // Public-TLD suffixes
    &["rds", "amazonaws", "com"],
    &["cloudfront", "net"],
    &["elasticbeanstalk", "com"],
];
55
/// Top-level-domain names that, when they appear as the FIRST label, mark the
/// candidate as a reversed-domain identifier (e.g. `com.google.android.app`)
/// rather than a hostname.
const PUBLIC_FIRST_LABELS: &[&str] = &[
    "com",
    "org",
    "net",
    "io",
    "edu",
    "gov",
];
57
/// Recognizer that reports infrastructure hostnames as `EntityType::Hostname`.
///
/// It looks for dotted hostnames carrying an internal, cloud or infrastructure
/// marker, and for Windows-style machine names such as `DESKTOP-A1B2C3D`.
/// The type holds no state.
#[derive(Debug, Clone, Copy, Default)]
pub struct HostnameRecognizer;
72
73impl Recognizer for HostnameRecognizer {
74 fn id(&self) -> &str {
75 "hostname_infra_v1"
76 }
77
78 fn entity_type(&self) -> EntityType {
79 EntityType::Hostname
80 }
81
82 fn supported_locales(&self) -> &[Locale] {
83 &[]
84 }
85
86 fn scan(&self, text: &str) -> Vec<PiiEntity> {
87 let mut seen = HashSet::new();
88 let mut findings = Vec::new();
89
90 for matched in DOMAIN_HOSTNAME_REGEX.find_iter(text) {
91 let start = matched.start();
92 let end = matched.end();
93 if seen.insert((start, end)) && self.is_valid_domain_match(text, start, end) {
94 findings.push(self.finding(text, Span::new(start, end)));
95 }
96 }
97
98 for matched in WINDOWS_HOSTNAME_REGEX.find_iter(text) {
99 let start = matched.start();
100 let end = matched.end();
101 if seen.insert((start, end)) && is_boundary(text, start, end) {
102 findings.push(self.finding(text, Span::new(start, end)));
103 }
104 }
105
106 findings.sort_by_key(|finding| finding.span.start);
107 findings
108 }
109
110 fn validate(&self, candidate: &str) -> bool {
111 is_windows_hostname(candidate) || validate_domain_hostname(candidate)
112 }
113}
114
115impl HostnameRecognizer {
116 fn finding(&self, text: &str, span: Span) -> PiiEntity {
117 let candidate = &text[span.start..span.end];
118 PiiEntity {
119 entity_type: self.entity_type(),
120 span,
121 text: candidate.to_string(),
122 confidence: self.compute_confidence(text, span.start, candidate),
123 recognizer_id: self.id().to_string(),
124 }
125 }
126
127 fn is_valid_domain_match(&self, text: &str, start: usize, end: usize) -> bool {
128 let candidate = &text[start..end];
129 validate_domain_hostname(candidate)
130 && is_boundary(text, start, end)
131 && !is_email_domain(text, start)
132 && !is_url_host(text, start, candidate)
133 }
134
135 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
136 let base = if is_windows_hostname(candidate) || has_internal_marker(candidate) {
137 0.85
138 } else if has_cloud_suffix(candidate) || has_infra_label(candidate) {
139 0.80
140 } else {
141 0.50
142 };
143 confidence(base + context_boost(text, start, CONTEXT_WORDS))
144 }
145}
146
147fn validate_domain_hostname(candidate: &str) -> bool {
148 let labels: Vec<&str> = candidate.split('.').collect();
149 labels.len() >= 2
150 && labels.iter().all(|label| validate_label(label))
151 && !is_reversed_domain(&labels)
152 && (has_internal_marker(candidate)
153 || has_cloud_suffix(candidate)
154 || has_infra_label(candidate))
155}
156
/// Checks a single hostname label: 1 to 63 bytes, only ASCII letters, digits
/// and hyphens, and no hyphen in first or last position.
fn validate_label(label: &str) -> bool {
    let bytes = label.as_bytes();

    // `first`/`last` are `None` only for the empty label.
    let (Some(&head), Some(&tail)) = (bytes.first(), bytes.last()) else {
        return false;
    };
    if bytes.len() > 63 || head == b'-' || tail == b'-' {
        return false;
    }

    // Any non-ASCII character contributes bytes >= 0x80, which fail this test.
    bytes
        .iter()
        .all(|byte| byte.is_ascii_alphanumeric() || *byte == b'-')
}
164
165fn is_windows_hostname(candidate: &str) -> bool {
166 WINDOWS_HOSTNAME_REGEX
167 .find(candidate)
168 .is_some_and(|matched| matched.start() == 0 && matched.end() == candidate.len())
169}
170
171fn has_internal_marker(candidate: &str) -> bool {
172 candidate
173 .split('.')
174 .any(|label| INTERNAL_LABELS.contains(&label.to_ascii_lowercase().as_str()))
175}
176
177fn has_cloud_suffix(candidate: &str) -> bool {
178 let labels: Vec<String> = candidate.split('.').map(str::to_ascii_lowercase).collect();
179 CLOUD_SUFFIXES.iter().any(|suffix| {
180 labels.len() >= suffix.len()
181 && labels[labels.len() - suffix.len()..]
182 .iter()
183 .zip(suffix.iter())
184 .all(|(left, right)| left == right)
185 })
186}
187
188fn has_infra_label(candidate: &str) -> bool {
189 candidate.split('.').any(|label| {
190 let lower = label.to_ascii_lowercase();
191 INFRA_LABEL_PREFIXES.iter().any(|prefix| {
192 lower == *prefix
193 || lower.strip_prefix(prefix).is_some_and(|rest| {
194 rest.starts_with('-') || rest.chars().next().is_some_and(|c| c.is_ascii_digit())
195 })
196 })
197 })
198}
199
200fn is_reversed_domain(labels: &[&str]) -> bool {
201 labels
202 .first()
203 .is_some_and(|label| PUBLIC_FIRST_LABELS.contains(&label.to_ascii_lowercase().as_str()))
204}
205
/// Returns `true` when the character right before byte offset `start` is `@`,
/// i.e. the match is the domain part of an email address.
///
/// `start` must be a character boundary of `text`.
fn is_email_domain(text: &str, start: usize) -> bool {
    let preceding = &text[..start];
    matches!(preceding.chars().next_back(), Some('@'))
}
209
/// Returns `true` when the candidate looks like the host of a URL: it either
/// begins with `www.` (any ASCII case) or is directly preceded by `://`.
///
/// `start` is the byte offset of `candidate` within `text`.
fn is_url_host(text: &str, start: usize, candidate: &str) -> bool {
    let begins_with_www = match candidate.get(..4) {
        Some(head) => head.eq_ignore_ascii_case("www."),
        None => false,
    };
    if begins_with_www {
        return true;
    }

    let preceding = &text[..start];
    preceding.ends_with("://")
}
216
/// Returns `true` when `text[start..end]` is not glued to more hostname
/// characters on either side.
///
/// The character before the match must not be a hostname continuation. The
/// character after it must not be one either, with one exception: a `.` only
/// counts as a continuation when a further label character (letter, digit,
/// `_` or `-`) follows it. A lone trailing dot is sentence punctuation or the
/// DNS root dot, so `connected to db-prod.internal.` still yields a match.
///
/// `start` and `end` must be character boundaries of `text`.
fn is_boundary(text: &str, start: usize, end: usize) -> bool {
    let before = text[..start].chars().next_back();
    let mut following = text[end..].chars();

    let continues_after = match following.next() {
        // Look one character past the dot to tell "another label" from punctuation.
        Some('.') => following
            .next()
            .is_some_and(|next| next != '.' && is_hostname_continuation(next)),
        Some(other) => is_hostname_continuation(other),
        None => false,
    };

    !before.is_some_and(is_hostname_continuation) && !continues_after
}

/// Returns `true` for characters that may extend a hostname-like token:
/// ASCII letters and digits, `_`, `-` and `.`.
fn is_hostname_continuation(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '.')
}
226
#[cfg(test)]
mod tests {
    use super::*;
    use cloakrs_core::RecognizerRegistry;

    /// Runs the recognizer over `input` and returns only the matched substrings.
    fn texts(input: &str) -> Vec<String> {
        let mut matched = Vec::new();
        for finding in HostnameRecognizer.scan(input) {
            matched.push(finding.text);
        }
        matched
    }

    /// Asserts that scanning `input` yields exactly one finding, `expected`.
    #[track_caller]
    fn assert_detects(input: &str, expected: &str) {
        assert_eq!(texts(input), [expected]);
    }

    /// Asserts that scanning `input` yields no findings at all.
    #[track_caller]
    fn assert_rejected(input: &str) {
        assert!(texts(input).is_empty());
    }

    /// Asserts that the first finding in `boosted` scores strictly higher than
    /// the first finding in `plain`.
    #[track_caller]
    fn assert_context_boost(boosted: &str, plain: &str) {
        let with_context = HostnameRecognizer.scan(boosted);
        let without_context = HostnameRecognizer.scan(plain);
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    // --- Positive detections -------------------------------------------------

    #[test]
    fn test_hostname_internal_company_host_detected() {
        assert_detects(
            "ssh server-prod-01.internal.company.com",
            "server-prod-01.internal.company.com",
        );
    }

    #[test]
    fn test_hostname_rds_cloud_host_detected() {
        assert_detects(
            "server db-replica-3.eu-west-1.rds.amazonaws.com",
            "db-replica-3.eu-west-1.rds.amazonaws.com",
        );
    }

    #[test]
    fn test_hostname_ec2_internal_host_detected() {
        assert_detects(
            "resolved ip-172-31-16-58.ec2.internal",
            "ip-172-31-16-58.ec2.internal",
        );
    }

    #[test]
    fn test_hostname_k8s_local_host_detected() {
        assert_detects("node worker-node-7.k8s.local", "worker-node-7.k8s.local");
    }

    #[test]
    fn test_hostname_infra_context_public_tld_detected() {
        assert_detects(
            "host redis-cache-01.staging.myapp.io",
            "redis-cache-01.staging.myapp.io",
        );
    }

    #[test]
    fn test_hostname_windows_machine_detected() {
        assert_detects("machine DESKTOP-A1B2C3D", "DESKTOP-A1B2C3D");
    }

    #[test]
    fn test_hostname_mdns_machine_detected() {
        assert_detects("ping macbook-pro-kadir.local", "macbook-pro-kadir.local");
    }

    #[test]
    fn test_hostname_corp_short_host_detected() {
        assert_detects("upstream api-gateway.corp", "api-gateway.corp");
    }

    // --- Rejections ----------------------------------------------------------

    #[test]
    fn test_hostname_public_google_domain_rejected() {
        assert_rejected("open google.com");
    }

    #[test]
    fn test_hostname_public_github_domain_rejected() {
        assert_rejected("host github.com");
    }

    #[test]
    fn test_hostname_email_domain_rejected() {
        assert_rejected("mail user@company.com");
    }

    #[test]
    fn test_hostname_url_host_rejected() {
        assert_rejected("url https://example.com/path");
    }

    #[test]
    fn test_hostname_localhost_rejected() {
        assert_rejected("host localhost");
    }

    #[test]
    fn test_hostname_reversed_domain_rejected() {
        assert_rejected("package com.google.android.app");
    }

    #[test]
    fn test_hostname_single_project_name_rejected() {
        assert_rejected("repo my-cool-project");
    }

    // --- Confidence ----------------------------------------------------------

    #[test]
    fn test_hostname_connecting_context_boosts_confidence() {
        assert_context_boost(
            "connecting to db-prod.internal.myco.com",
            "value db-prod.internal.myco.com",
        );
    }

    #[test]
    fn test_hostname_host_prefix_context_boosts_confidence() {
        assert_context_boost("host: web-01.corp", "value web-01.corp");
    }

    // --- Trait surface and registry wiring -----------------------------------

    #[test]
    fn test_hostname_supported_locales_are_universal() {
        assert!(HostnameRecognizer.supported_locales().is_empty());
    }

    #[test]
    fn test_hostname_registry_integration_detects_default_recognizer() {
        let mut registry = RecognizerRegistry::new();
        crate::register_default_recognizers(&mut registry);

        let findings = registry.scan_all("remote_host=db-prod.internal.myco.com");
        let found_hostname = findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::Hostname);

        assert!(found_hostname);
    }
}