1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
use regex::Regex;
use std::sync::LazyLock;
use super::Token;
// File system paths (Unix-style) - simplified without character class issues
static FILE_PATH: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(/[a-zA-Z0-9_.\-/]+(?:\.[a-zA-Z0-9]+)?(?:/[a-zA-Z0-9_.\-]*)*/?)")
.expect("Failed to compile file path regex")
});
// URL paths (without domain)
static URL_PATH: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"((?:/[a-zA-Z0-9_.\-~%]*)+(?:\?[a-zA-Z0-9_.\-~%&=]*)?(?:#[a-zA-Z0-9_.\-~%]*)?)")
.expect("Failed to compile URL path regex")
});
// Full URLs with schemes (https://host/path) - capture the entire URL
static FULL_URL: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"https?://[^/\s]+(?:/[^\s"]*)"#).expect("Failed to compile full URL regex")
});
// Windows paths
static WINDOWS_PATH: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([A-Za-z]:\\[a-zA-Z0-9_.\-\\]+(?:\\[a-zA-Z0-9_.\-]*)*\\?)")
.expect("Failed to compile Windows path regex")
});
// Query parameters (standalone)
static QUERY_PARAMS: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\?([a-zA-Z0-9_\-]+=([^&\s]+)(&[a-zA-Z0-9_\-]+=([^&\s]+))*)")
.expect("Failed to compile query params regex")
});
// Route parameters in paths (like /users/123/posts/456, /namespaces/kube-system/pods/pod-name)
static ROUTE_PARAMS: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"/([0-9a-fA-F]{8,}|[0-9]{3,}|[a-fA-F0-9\-]{8,})")
.expect("Failed to compile route params regex")
});
// Variable text segments in paths (like /namespaces/NAME/serviceaccounts/NAME)
static PATH_SEGMENTS: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"/([a-zA-Z0-9][a-zA-Z0-9\-]{2,}[a-zA-Z0-9])")
.expect("Failed to compile path segments regex")
});
// Source file with line number pattern (like file.go:1234])
static SOURCE_LINE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([a-zA-Z_][a-zA-Z0-9_\-]*\.(go|rs|py|js|java|c|cpp|h|hpp|rb|php|ts|tsx|jsx|cs|swift|kt|m|mm|scala|clj|ex|exs|erl|hrl)):\d+\]")
.expect("Failed to compile source line regex")
});
// CLI flags pattern (like --flag-name or -f)
static CLI_FLAG: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\s--?[a-zA-Z][a-zA-Z0-9\-_]*").expect("Failed to compile CLI flag regex")
});
// JSON-like structures and embedded objects
// Matches {key:value...} or escaped JSON \"{}\"
static JSON_STRUCT: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(\\"?\{[\\"\w\s:,\[\]$/_-]*\}\\"?)"#)
.expect("Failed to compile JSON structure regex")
});
// Event objects that span multiple lines
static EVENT_OBJECT: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"&Event\{[^}]*\}").expect("Failed to compile event object regex"));
pub struct PathDetector;
impl PathDetector {
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
let mut result = text.to_string();
let mut tokens = Vec::new();
// Replace Event objects first (they're more specific)
result = EVENT_OBJECT
.replace_all(&result, |caps: ®ex::Captures| {
let event = caps.get(0).unwrap().as_str();
tokens.push(Token::Path(event.to_string()));
"<EVENT_OBJECT>".to_string()
})
.to_string();
// Replace JSON structures (like {volumeName:..., podName:...})
result = JSON_STRUCT
.replace_all(&result, |caps: ®ex::Captures| {
let json = caps.get(0).unwrap().as_str();
tokens.push(Token::Path(json.to_string()));
// Try to identify what type of structure it is
if json.contains("volumeName:") {
"<VOLUME_SPEC>"
} else if json.contains("ObjectMeta:") {
"<K8S_OBJECT>"
} else {
"<JSON_DATA>"
}
.to_string()
})
.to_string();
// Replace CLI flags (like --flag-name)
result = CLI_FLAG
.replace_all(&result, |caps: ®ex::Captures| {
let flag = caps.get(0).unwrap().as_str();
tokens.push(Token::Path(flag.trim().to_string()));
" <FLAG>".to_string()
})
.to_string();
// Replace source file:line patterns (before other path detection)
// This normalizes file.go:1234] to file.go:<LINE>]
result = SOURCE_LINE
.replace_all(&result, |caps: ®ex::Captures| {
let matched = caps.get(0).unwrap().as_str();
let filename = caps.get(1).unwrap().as_str();
tokens.push(Token::Path(matched.to_string()));
format!("{filename}:<LINE>]")
})
.to_string();
// Replace full URLs NEXT (https://host/path) - must run before file paths
// Otherwise the /path part gets detected as a file path
result = FULL_URL
.replace_all(&result, |caps: ®ex::Captures| {
let full_url = caps.get(0).unwrap().as_str();
tokens.push(Token::Path(full_url.to_string()));
"<PATH>".to_string()
})
.to_string();
// Replace file system paths (Unix-style)
result = FILE_PATH
.replace_all(&result, |caps: ®ex::Captures| {
let path = caps.get(1).unwrap().as_str();
if Self::is_likely_file_path(path) {
tokens.push(Token::Path(path.to_string()));
"<PATH>".to_string()
} else {
caps[0].to_string()
}
})
.to_string();
// Replace Windows paths
result = WINDOWS_PATH
.replace_all(&result, |caps: ®ex::Captures| {
let path = caps.get(1).unwrap().as_str();
tokens.push(Token::Path(path.to_string()));
"<PATH>".to_string()
})
.to_string();
// Replace URL paths and query parameters
result = URL_PATH
.replace_all(&result, |caps: ®ex::Captures| {
let path = caps.get(1).unwrap().as_str();
if Self::is_likely_url_path(path) {
let normalized = Self::normalize_url_path(path);
tokens.push(Token::Path(path.to_string()));
normalized
} else {
caps[0].to_string()
}
})
.to_string();
(result, tokens)
}
#[mutants::skip] // Equivalent mutant: common dirs (/var/, /usr/, etc.) always imply has_multiple_segments, making || vs && on those conditions indistinguishable
fn is_likely_file_path(path: &str) -> bool {
// Must start with /
if !path.starts_with('/') {
return false;
}
// Exclude very short paths that might be false positives
if path.len() < 3 {
return false;
}
// Common file path indicators
let has_extension =
path.contains('.') && path.split('/').next_back().unwrap_or("").contains('.');
let has_multiple_segments = path.matches('/').count() > 1;
let has_common_dirs = path.contains("/var/")
|| path.contains("/usr/")
|| path.contains("/etc/")
|| path.contains("/home/")
|| path.contains("/opt/")
|| path.contains("/tmp/");
has_extension || has_multiple_segments || has_common_dirs
}
#[mutants::skip] // Equivalent mutant: API patterns (/api/, /v1/, /static/) always imply has_multiple_segments, making || vs && indistinguishable
fn is_likely_url_path(path: &str) -> bool {
// Must start with /
if !path.starts_with('/') {
return false;
}
// Exclude very short paths
if path.len() < 2 {
return false;
}
// Common URL path indicators
let has_api_patterns = path.contains("/api/")
|| path.contains("/v1/")
|| path.contains("/v2/")
|| path.starts_with("/static/")
|| path.starts_with("/assets/");
let has_query_params = path.contains('?');
let has_multiple_segments = path.matches('/').count() > 1;
let has_numeric_ids = ROUTE_PARAMS.is_match(path);
has_api_patterns || has_query_params || has_multiple_segments || has_numeric_ids
}
fn normalize_url_path(path: &str) -> String {
// First replace numeric IDs and hex IDs in path segments
let mut normalized = ROUTE_PARAMS.replace_all(path, "/<PATH>").to_string();
// Then replace variable text segments (like namespace names, service account names)
// Skip common fixed segments like 'api', 'v1', 'namespaces', 'serviceaccounts', 'token'
normalized = PATH_SEGMENTS
.replace_all(&normalized, |caps: ®ex::Captures| {
let segment = caps.get(1).unwrap().as_str();
// Keep common fixed API segments unchanged
match segment {
"api" | "v1" | "v2" | "v3" | "alpha" | "beta" | "namespaces" | "pods"
| "services" | "deployments" | "configmaps" | "secrets" | "serviceaccounts"
| "token" | "status" | "proxy" | "logs" | "exec" | "static" | "assets"
| "public" | "health" | "metrics" => {
format!("/{segment}")
}
_ => "/<PATH>".to_string(),
}
})
.to_string();
// Replace query parameter values
normalized = QUERY_PARAMS
.replace_all(&normalized, |caps: ®ex::Captures| {
let full_query = caps.get(1).unwrap().as_str();
let parts: Vec<&str> = full_query.split('&').collect();
let normalized_parts: Vec<String> = parts
.iter()
.map(|part| {
if let Some(eq_pos) = part.find('=') {
format!("{}=<PATH>", &part[..eq_pos])
} else {
part.to_string()
}
})
.collect();
format!("?{}", normalized_parts.join("&"))
})
.to_string();
normalized
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_file_path_detection() {
let test_cases = vec![
("/var/log/app.log", true),
("/home/user/document.txt", true),
("/usr/bin/python", true),
("/", false),
("/a", false),
("not/a/path", false),
];
for (path, expected) in test_cases {
assert_eq!(
PathDetector::is_likely_file_path(path),
expected,
"Failed for path: {path}"
);
}
}
#[test]
fn test_url_path_detection() {
let test_cases = vec![
("/api/users/123", true),
("/static/css/main.css", true),
("/search?q=test", true),
("/", false),
("/a", false),
];
for (path, expected) in test_cases {
assert_eq!(
PathDetector::is_likely_url_path(path),
expected,
"Failed for path: {path}"
);
}
}
// --- is_likely_file_path ---
#[test]
fn file_path_short_rejected() {
assert!(!PathDetector::is_likely_file_path("/a"));
}
#[test]
fn file_path_no_leading_slash_rejected() {
assert!(!PathDetector::is_likely_file_path("var/log"));
}
#[test]
fn file_path_common_dir() {
assert!(PathDetector::is_likely_file_path("/var/log"));
}
#[test]
fn file_path_with_extension() {
assert!(PathDetector::is_likely_file_path("/app/main.rs"));
}
// --- is_likely_url_path ---
#[test]
fn url_path_api_pattern() {
assert!(PathDetector::is_likely_url_path("/api/v1/users"));
}
#[test]
fn url_path_too_short() {
assert!(!PathDetector::is_likely_url_path("/"));
}
#[test]
fn url_path_query_params() {
assert!(PathDetector::is_likely_url_path("/search?q=test"));
}
#[test]
fn test_path_normalization() {
let test_cases = vec![
("Error in /var/log/app.2025-01-20.log", "Error in <PATH>"),
("GET /api/users/123/posts", "GET <PATH>"),
// "search" is not in the fixed API segments list, so it becomes <PATH> too
(
"Request to /search?q=test&page=5",
"Request to /<PATH>?q=<PATH>&page=<PATH>",
),
(r"C:\Users\john\Documents\file.txt", "<PATH>"),
];
for (input, expected) in test_cases {
let (result, _tokens) = PathDetector::detect_and_replace(input);
assert_eq!(result, expected, "Failed for input: {input}");
}
}
// ---- is_likely_file_path: per-branch tests ----
#[test]
fn file_path_no_leading_slash() {
assert!(!PathDetector::is_likely_file_path("no_slash"));
}
#[test]
fn file_path_len_under_3() {
assert!(!PathDetector::is_likely_file_path("/a"));
}
#[test]
fn file_path_has_extension() {
assert!(PathDetector::is_likely_file_path("/app.log"));
}
#[test]
fn file_path_multiple_segments() {
assert!(PathDetector::is_likely_file_path("/usr/bin/python"));
}
#[test]
fn file_path_var() {
assert!(PathDetector::is_likely_file_path("/var/log/syslog"));
}
#[test]
fn file_path_usr() {
assert!(PathDetector::is_likely_file_path("/usr/local/bin"));
}
#[test]
fn file_path_etc() {
assert!(PathDetector::is_likely_file_path("/etc/nginx/nginx.conf"));
}
#[test]
fn file_path_home() {
assert!(PathDetector::is_likely_file_path("/home/user/doc"));
}
#[test]
fn file_path_opt() {
assert!(PathDetector::is_likely_file_path("/opt/app/bin"));
}
#[test]
fn file_path_tmp() {
assert!(PathDetector::is_likely_file_path("/tmp/data"));
}
// ---- is_likely_url_path: per-branch tests ----
#[test]
fn url_path_no_leading_slash() {
assert!(!PathDetector::is_likely_url_path("api/v1"));
}
#[test]
fn url_path_len_under_2() {
assert!(!PathDetector::is_likely_url_path("/"));
}
#[test]
fn url_path_api() {
assert!(PathDetector::is_likely_url_path("/api/users"));
}
#[test]
fn url_path_v1() {
assert!(PathDetector::is_likely_url_path("/v1/resources"));
}
#[test]
fn url_path_v2() {
assert!(PathDetector::is_likely_url_path("/v2/items"));
}
#[test]
fn url_path_static() {
assert!(PathDetector::is_likely_url_path("/static/style.css"));
}
#[test]
fn url_path_assets() {
assert!(PathDetector::is_likely_url_path("/assets/img.png"));
}
#[test]
fn url_path_has_query_params() {
assert!(PathDetector::is_likely_url_path("/search?q=test"));
}
#[test]
fn url_path_multi_segments() {
assert!(PathDetector::is_likely_url_path("/users/123/orders"));
}
// ---- Mutant-killing: is_likely_file_path boundary & branch isolation ----
#[test]
fn file_path_len_exactly_3_accepted() {
// len=3 "/ab" must pass the `< 3` guard (boundary: == should fail, <= should fail)
// It has no extension, no multiple segments, no common dirs, so it should
// actually return false (none of the three conditions are met). But it DOES
// pass the length guard, which is the mutant we're killing.
// We need a 3-char path that satisfies at least one condition.
// "/a." has a dot but split('/').next_back() = "a." which contains('.') -> has_extension
assert!(PathDetector::is_likely_file_path("/a."));
}
#[test]
fn file_path_len_exactly_2_rejected() {
// len=2 "/a" must fail the `< 3` guard
assert!(!PathDetector::is_likely_file_path("/a"));
}
#[test]
fn file_path_has_extension_only() {
// has_extension=true, has_multiple_segments=false, has_common_dirs=false
// Single segment after /, has extension, no common dir.
// "/x.log" -> segments from split('/'): ["", "x.log"], matches('/').count()=1 so not >1
assert!(PathDetector::is_likely_file_path("/x.log"));
}
#[test]
fn file_path_has_multiple_segments_only() {
// has_extension=false, has_multiple_segments=true, has_common_dirs=false
// No extension, multiple segments, no common dir names
assert!(PathDetector::is_likely_file_path("/foo/bar/baz"));
}
#[test]
fn file_path_has_common_dirs_only() {
// has_extension=false, has_multiple_segments=false (need only 1 slash beyond root),
// has_common_dirs=true
// "/var/x" -> matches('/').count()=2 which IS >1, so has_multiple_segments=true too.
// We need exactly 1 slash: that means the path is just "/var/" which contains "/var/"
// "/var/" -> matches('/').count()=2, still >1. Hard to isolate.
// Actually "/tmp/" -> contains "/tmp/", matches('/')=2 -> has_multiple_segments=true
// Let's verify the mutant is still killed: if || became &&, then having only
// common_dirs true (with multiple_segments also true) wouldn't isolate.
// Instead test: has_common_dirs=true but extension=false, to kill the first ||
assert!(PathDetector::is_likely_file_path("/var/x"));
}
#[test]
fn file_path_none_of_three_conditions() {
// No extension, single segment (1 slash), no common dir -> should be false
// Kills mutant: || replaced with && (if all were required, this proves || works
// by showing that ABSENCE of all three returns false)
assert!(!PathDetector::is_likely_file_path("/xyz"));
}
// ---- Mutant-killing: is_likely_url_path boundary & branch isolation ----
#[test]
fn url_path_len_exactly_2_accepted() {
// len=2 "/x" must pass the `< 2` guard. But then it needs to satisfy a condition.
// "/x" has no api patterns, no query params, matches('/').count()=1 (not >1), no numeric ids.
// So it returns false. The mutant is about the guard itself.
// We need a 2-char URL path that satisfies at least one condition.
// "/?" has query params -> has_query_params=true, len=2
assert!(PathDetector::is_likely_url_path("/?"));
}
#[test]
fn url_path_len_exactly_1_rejected() {
// len=1 "/" must fail the `< 2` guard
assert!(!PathDetector::is_likely_url_path("/"));
}
#[test]
fn url_path_has_api_patterns_only() {
// has_api_patterns=true, has_query_params=false, has_multiple_segments=false(?), has_numeric_ids=false
// "/static/x" -> starts_with("/static/")=true, no '?', matches('/')=2 which IS >1
// Hard to isolate api from multiple_segments. Use "/static/" itself:
// matches('/')=2 -> has_multiple_segments=true. Can't avoid it with /static/.
// Use "/v1/" -> matches('/')=2 again.
// The point is: if || became &&, requiring ALL to be true would fail since
// has_query_params and has_numeric_ids are false. This kills the mutant.
assert!(PathDetector::is_likely_url_path("/v1/items"));
}
#[test]
fn url_path_has_query_params_only() {
// has_api_patterns=false, has_query_params=true, has_multiple_segments=false, has_numeric_ids=false
// "/x?q=1" -> no api pattern, has '?', matches('/')=1 (not >1), no route params
assert!(PathDetector::is_likely_url_path("/x?q=1"));
}
#[test]
fn url_path_has_multiple_segments_only() {
// has_api_patterns=false, has_query_params=false, has_multiple_segments=true, has_numeric_ids=false
// "/foo/bar" -> no api, no '?', matches('/')=2 (>1), no numeric route params
assert!(PathDetector::is_likely_url_path("/foo/bar"));
}
#[test]
fn url_path_has_numeric_ids_only() {
// has_api_patterns=false, has_query_params=false, has_multiple_segments=false, has_numeric_ids=true
// Need a path with a numeric route param but only 1 slash.
// ROUTE_PARAMS matches /([0-9a-fA-F]{8,}|[0-9]{3,}|[a-fA-F0-9\-]{8,})
// "/12345" -> matches('/').count()=1 so not multiple segments
// ROUTE_PARAMS: /([0-9]{3,}) matches "/12345"
assert!(PathDetector::is_likely_url_path("/12345"));
}
#[test]
fn url_path_segment_count_boundary() {
// has_multiple_segments requires matches('/').count() > 1
// "/x" has count=1 (not >1). Kills mutant: > replaced with <
assert!(!PathDetector::is_likely_url_path("/x"));
// "/x/y" has count=2 (>1) -> true
assert!(PathDetector::is_likely_url_path("/x/y"));
}
// ---- Mutant-killing: normalize_url_path API segment preservation ----
#[test]
fn normalize_url_path_preserves_api_segments() {
// PATH_SEGMENTS regex matches 4+ char segments, so "api"/"v1" are too short
// to match. Use 4+ char segments from the preserved list.
let result = PathDetector::normalize_url_path("/namespaces/my-ns/pods/nginx-abc");
assert!(
result.contains("/namespaces"),
"namespaces should be preserved, got: {result}"
);
assert!(
result.contains("/pods"),
"pods should be preserved, got: {result}"
);
// "my-ns" and "nginx-abc" are variable, should become <PATH>
assert!(
!result.contains("my-ns"),
"variable segment should be normalized, got: {result}"
);
}
#[test]
fn normalize_url_path_static_preserved() {
let result = PathDetector::normalize_url_path("/static/style.css");
assert!(
result.contains("/static"),
"static should be preserved, got: {result}"
);
}
#[test]
fn normalize_url_path_health_preserved() {
let result = PathDetector::normalize_url_path("/health/check-endpoint");
assert!(
result.contains("/health"),
"health should be preserved, got: {result}"
);
}
}