1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
/// Pattern for Unix-style home-directory paths.
///
/// It matches a literal `/home/` or `/Users/` prefix, then a user name (capture group 1:
/// one ASCII letter followed by up to 31 letters, digits, `.`, `_` or `-`), then any number
/// of `/`-prefixed segments. A segment ends at whitespace or at one of
/// `:`, `"`, `'`, a backtick, `,`, `;`, `)`, `}` or `]`.
///
/// The pattern is not anchored; `scan` applies a separate boundary check to each match.
static UNIX_HOME_REGEX: Lazy<Regex> = Lazy::new(|| {
    compile_regex(r#"(?:/home/|/Users/)([a-zA-Z][a-zA-Z0-9._-]{0,31})(/[^\s:"'`,;)}\]]*)*"#)
});
10
/// Pattern for Windows profile paths such as `C:\Users\<name>\...`.
///
/// The whole pattern is case-insensitive (`(?i)`). It matches a drive letter, `:\Users\`,
/// then a user name (capture group 1: one letter followed by up to 31 letters, digits,
/// `.`, `_`, `-` or spaces), then any number of `\`-prefixed segments. A segment ends at
/// whitespace or at one of `:`, `"`, `'`, a backtick, `,`, `;`, `)`, `}` or `]`.
static WINDOWS_HOME_REGEX: Lazy<Regex> = Lazy::new(|| {
    compile_regex(r#"(?i)[A-Z]:\\Users\\([a-zA-Z][a-zA-Z0-9._\- ]{0,31})(\\[^\s:"'`,;)}\]]*)*"#)
});
14
/// Pattern for the superuser home directory: a literal `/root` followed by any number of
/// `/`-prefixed segments (same segment terminators as the other home-path patterns).
///
/// The pattern alone also matches the `/root` prefix of longer names (e.g. `/rootfs`);
/// `scan` relies on its boundary check to discard those.
static ROOT_PATH_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r#"/root(/[^\s:"'`,;)}\]]*)*"#));
16
/// User names treated as documentation placeholders rather than real accounts.
///
/// `is_ignored_username` lowercases the candidate (ASCII only) before comparing it against
/// this list, so every entry here must be written in lowercase to be matchable.
const IGNORE_USERNAMES: &[&str] = &[
    "user",
    "username",
    "example",
    "your_user",
    "your_username",
    "yourusername",
    "myuser",
    "testuser",
    "$user",
    "{user}",
    "xxx",
    "placeholder",
];
31
/// Words and phrases handed to `context_boost` when scoring a match.
///
/// NOTE(review): how `context_boost` searches for these (window size, case handling) is
/// defined in `crate::common` and not visible here; the all-lowercase spelling suggests a
/// case-insensitive lookup, but confirm before adding mixed-case entries.
const CONTEXT_WORDS: &[&str] = &[
    "file",
    "path",
    "directory",
    "folder",
    "config",
    "open",
    "read",
    "write",
    "permission",
    "denied",
    "not found",
    "no such file",
    "filenotfounderror",
    "enoent",
    "stack trace",
    "at /",
    "from /",
    "in /",
];
52
/// Fragments that mark a path as pointing at credentials or key material.
///
/// `contains_sensitive_subpath` looks for these as plain substrings of the ASCII-lowercased
/// path, so entries must be lowercase and will also match inside longer names.
const SENSITIVE_SUBPATHS: &[&str] = &[
    ".ssh",
    ".aws",
    ".env",
    "credentials",
    ".gnupg",
    "id_rsa",
    "authorized_keys",
];
62
/// Recognizer for filesystem paths that point into a user's home directory.
///
/// It covers `/home/<user>` and `/Users/<user>` paths, Windows `<drive>:\Users\<user>`
/// paths, and the `/root` directory. The type is a stateless unit struct.
#[derive(Debug, Clone, Copy, Default)]
pub struct UserPathRecognizer;
77
78impl Recognizer for UserPathRecognizer {
79 fn id(&self) -> &str {
80 "user_path_home_v1"
81 }
82
83 fn entity_type(&self) -> EntityType {
84 EntityType::UserPath
85 }
86
87 fn supported_locales(&self) -> &[Locale] {
88 &[]
89 }
90
91 fn scan(&self, text: &str) -> Vec<PiiEntity> {
92 let mut seen = HashSet::new();
93 let mut findings = Vec::new();
94
95 for regex in [&*UNIX_HOME_REGEX, &*WINDOWS_HOME_REGEX, &*ROOT_PATH_REGEX] {
96 for matched in regex.find_iter(text) {
97 let span = trim_path_span(text, Span::new(matched.start(), matched.end()));
98 if span.is_empty() || !seen.insert((span.start, span.end)) {
99 continue;
100 }
101 let candidate = &text[span.start..span.end];
102 if self.is_valid_path_match(text, span.start, span.end, candidate) {
103 findings.push(PiiEntity {
104 entity_type: self.entity_type(),
105 span,
106 text: candidate.to_string(),
107 confidence: self.compute_confidence(text, span.start, candidate),
108 recognizer_id: self.id().to_string(),
109 });
110 }
111 }
112 }
113
114 findings.sort_by_key(|finding| finding.span.start);
115 findings
116 }
117
118 fn validate(&self, candidate: &str) -> bool {
119 Self::extract_username(candidate).is_some_and(|username| !is_ignored_username(&username))
120 }
121}
122
123impl UserPathRecognizer {
124 #[must_use]
135 pub fn extract_username(path: &str) -> Option<String> {
136 if let Some(rest) = path.strip_prefix("/home/") {
137 return first_unix_segment(rest);
138 }
139 if let Some(rest) = path.strip_prefix("/Users/") {
140 return first_unix_segment(rest);
141 }
142 let lower = path.to_ascii_lowercase();
143 if let Some(index) = lower.find(r"\users\") {
144 let after = &path[index + r"\Users\".len()..];
145 return after.split('\\').next().map(str::to_string);
146 }
147 if path == "/root" || path.starts_with("/root/") {
148 return Some("root".to_string());
149 }
150 None
151 }
152
153 fn is_valid_path_match(&self, text: &str, start: usize, end: usize, candidate: &str) -> bool {
154 self.validate(candidate) && is_path_boundary(text, start, end)
155 }
156
157 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
158 let base = if candidate == "/root" || candidate.starts_with("/root/") {
159 0.75
160 } else {
161 0.85
162 };
163 let sensitive_boost = if contains_sensitive_subpath(candidate) {
164 0.05
165 } else {
166 0.0
167 };
168 confidence(base + sensitive_boost + context_boost(text, start, CONTEXT_WORDS))
169 }
170}
171
/// Returns the first `/`-separated segment of `rest` as an owned string.
///
/// Returns `None` when that segment is empty (empty input, or input starting with `/`),
/// so callers never receive an empty user name.
fn first_unix_segment(rest: &str) -> Option<String> {
    rest.split('/')
        .next()
        .filter(|segment| !segment.is_empty())
        .map(str::to_string)
}
175
176fn is_ignored_username(username: &str) -> bool {
177 let lower = username.to_ascii_lowercase();
178 IGNORE_USERNAMES.contains(&lower.as_str())
179}
180
181fn contains_sensitive_subpath(path: &str) -> bool {
182 let lower = path.to_ascii_lowercase();
183 SENSITIVE_SUBPATHS
184 .iter()
185 .any(|subpath| lower.contains(subpath))
186}
187
188fn trim_path_span(text: &str, span: Span) -> Span {
189 let mut end = span.end;
190 while end > span.start {
191 let value = &text[span.start..end];
192 let Some(c) = value.chars().next_back() else {
193 break;
194 };
195 if !matches!(c, '.' | ',' | ':' | ';' | '!' | '?' | ')' | ']' | '}') {
196 break;
197 }
198 end -= c.len_utf8();
199 }
200 Span::new(span.start, end)
201}
202
203fn is_path_boundary(text: &str, start: usize, end: usize) -> bool {
204 let before = text[..start].chars().next_back();
205 let after = text[end..].chars().next();
206 !before.is_some_and(is_path_continuation) && !after.is_some_and(is_path_continuation)
207}
208
/// Reports whether `c` could be part of a path: an ASCII letter or digit, or one of
/// `_`, `-`, `.`, `/`, `\`.
fn is_path_continuation(c: char) -> bool {
    match c {
        '_' | '-' | '.' | '/' | '\\' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
212
// Unit tests for `UserPathRecognizer`: positive detections per platform, rejections of
// system/relative/placeholder paths, confidence boosts, user-name extraction, and
// registration in the default recognizer set.
#[cfg(test)]
mod tests {
    use super::*;
    use cloakrs_core::RecognizerRegistry;

    /// Scans `input` and returns only the matched text of each finding, in scan order.
    fn texts(input: &str) -> Vec<String> {
        UserPathRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    // --- Positive detections: Linux, macOS, Windows and /root paths ---

    #[test]
    fn test_user_path_home_project_detected() {
        assert_eq!(
            texts("open /home/kadir/projects/myapp/config.yml"),
            ["/home/kadir/projects/myapp/config.yml"]
        );
    }

    #[test]
    fn test_user_path_linux_ssh_key_detected() {
        assert_eq!(
            texts("read /home/ubuntu/.ssh/id_rsa"),
            ["/home/ubuntu/.ssh/id_rsa"]
        );
    }

    #[test]
    fn test_user_path_linux_env_file_detected() {
        assert_eq!(texts("config /home/deploy/.env"), ["/home/deploy/.env"]);
    }

    #[test]
    fn test_user_path_macos_document_detected() {
        assert_eq!(
            texts("file /Users/john/Documents/report.pdf"),
            ["/Users/john/Documents/report.pdf"]
        );
    }

    #[test]
    fn test_user_path_macos_aws_credentials_detected() {
        assert_eq!(
            texts("path /Users/admin/.aws/credentials"),
            ["/Users/admin/.aws/credentials"]
        );
    }

    #[test]
    fn test_user_path_windows_administrator_detected() {
        assert_eq!(
            texts(r"open C:\Users\Administrator\Desktop\secrets.txt"),
            [r"C:\Users\Administrator\Desktop\secrets.txt"]
        );
    }

    #[test]
    fn test_user_path_windows_dotted_username_detected() {
        assert_eq!(
            texts(r"temp C:\Users\john.doe\AppData\Local\Temp"),
            [r"C:\Users\john.doe\AppData\Local\Temp"]
        );
    }

    #[test]
    fn test_user_path_root_bashrc_detected() {
        assert_eq!(texts("file /root/.bashrc"), ["/root/.bashrc"]);
    }

    #[test]
    fn test_user_path_root_authorized_keys_detected() {
        assert_eq!(
            texts("read /root/.ssh/authorized_keys"),
            ["/root/.ssh/authorized_keys"]
        );
    }

    // --- Rejections: paths outside any home directory, and placeholder user names ---

    #[test]
    fn test_user_path_usr_local_rejected() {
        assert!(texts("path /usr/local/bin/python").is_empty());
    }

    #[test]
    fn test_user_path_var_log_rejected() {
        assert!(texts("path /var/log/syslog").is_empty());
    }

    #[test]
    fn test_user_path_etc_nginx_rejected() {
        assert!(texts("path /etc/nginx/nginx.conf").is_empty());
    }

    #[test]
    fn test_user_path_relative_config_rejected() {
        assert!(texts("path ./config/settings.yml").is_empty());
    }

    #[test]
    fn test_user_path_placeholder_user_rejected() {
        // "user" is listed in IGNORE_USERNAMES, so the path must not be reported.
        assert!(texts("example /home/user/example").is_empty());
    }

    #[test]
    fn test_user_path_container_app_rejected() {
        assert!(texts("path /app/src/main.rs").is_empty());
    }

    // --- Confidence: the same path scores higher next to a context word ---

    #[test]
    fn test_user_path_filenotfound_context_boosts_confidence() {
        let with_context =
            UserPathRecognizer.scan("FileNotFoundError: /home/kadir/.config/app.yml");
        let without_context = UserPathRecognizer.scan("value /home/kadir/.config/app.yml");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_user_path_permission_denied_context_boosts_confidence() {
        let with_context = UserPathRecognizer.scan("permission denied: /Users/admin/private");
        let without_context = UserPathRecognizer.scan("value /Users/admin/private");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    // --- User-name extraction ---

    #[test]
    fn test_user_path_extract_username_from_linux_home() {
        assert_eq!(
            UserPathRecognizer::extract_username("/home/kadir/stuff").as_deref(),
            Some("kadir")
        );
    }

    #[test]
    fn test_user_path_extract_username_from_windows_home() {
        assert_eq!(
            UserPathRecognizer::extract_username(r"C:\Users\john.doe\Desktop").as_deref(),
            Some("john.doe")
        );
    }

    #[test]
    fn test_user_path_extract_username_from_root_home() {
        assert_eq!(
            UserPathRecognizer::extract_username("/root/.ssh").as_deref(),
            Some("root")
        );
    }

    #[test]
    fn test_user_path_sensitive_subpath_boosts_confidence() {
        // Both inputs share the same prefix word, so only the sensitive sub-path differs.
        let sensitive = UserPathRecognizer.scan("value /home/kadir/.ssh/id_rsa");
        let ordinary = UserPathRecognizer.scan("value /home/kadir/projects/app");
        assert!(sensitive[0].confidence > ordinary[0].confidence);
    }

    #[test]
    fn test_user_path_supported_locales_are_universal() {
        assert!(UserPathRecognizer.supported_locales().is_empty());
    }

    // --- Integration with the default recognizer registry ---

    #[test]
    fn test_user_path_registry_integration_detects_default_recognizer() {
        let mut registry = RecognizerRegistry::new();
        crate::register_default_recognizers(&mut registry);

        let findings = registry.scan_all("open /home/kadir/projects/app");

        assert!(findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::UserPath));
    }
}