Skip to main content

manasight_parser/
sanitize.rs

//! Privacy scrubber for raw MTGA log text.
//!
//! Strips sensitive data (auth tokens, bearer tokens, account identifiers,
//! OS-specific user paths, session identifiers, display names, and hardware
//! fingerprint lines) from unstructured `Player.log` text. This is a
//! best-effort filter; novel token formats may slip through.
//!
//! Regex patterns are compiled once via [`std::sync::LazyLock`] and reused
//! across all calls.
10
11use std::sync::LazyLock;
12
13use regex::Regex;
14
15/// A compiled regex pattern paired with its replacement string.
16struct ScrubPattern {
17    regex: Regex,
18    replacement: &'static str,
19}
20
21/// Compiled privacy-scrubbing patterns, initialized once on first use.
22///
23/// Each entry strips a class of sensitive data from raw log lines:
24/// - Auth tokens (`Token: <value>`)
25/// - Bearer tokens (`Bearer <value>`, word-boundary guarded to avoid game
26///   cosmetic false positives like `Title_StandardBearer`)
27/// - `WotC` account IDs in log prefixes (`Match to <id>:`)
28/// - JSON `"clientId"` and `"userId"` values
29/// - Windows user paths (`C:\Users\<username>\`)
30/// - macOS user paths (`/Users/<username>/`)
31/// - Linux user paths (`/home/<username>/`)
32/// - Session identifiers (JSON `"token"` and `"sessionId"` values)
33/// - Display names (JSON `"screenName"` and `"playerName"` values)
34/// - Hardware fingerprint lines (Renderer, Vendor, VRAM, Driver)
35static SCRUB_PATTERNS: LazyLock<Vec<ScrubPattern>> = LazyLock::new(|| {
36    // Patterns and replacements. Each regex is compiled exactly once.
37    // Order matters: more specific patterns should come before general ones
38    // if there is overlap. Currently there is no overlap between categories.
39    let definitions: &[(&str, &str)] = &[
40        // Auth tokens: "Token: <base64-or-hex-value>"
41        // Matches "Token:" followed by optional whitespace and a non-whitespace token value.
42        (r"Token:\s*\S+", "Token: <redacted>"),
43        // Bearer tokens in HTTP Authorization headers.
44        // Uses word boundary to avoid matching game cosmetics like
45        // "Title_StandardBearer" where "Bearer" appears as a substring
46        // of a larger word. The \b anchor matches at the start of the
47        // string or after a non-word character, so "Bearer" following
48        // a letter (as in "StandardBearer") does not match.
49        (r"\bBearer\s+\S+", "Bearer <redacted>"),
50        // WotC account IDs in log line prefixes.
51        // Arena logs game messages prefixed with the player's account ID:
52        //   "Match to CR4QJUQPDBCVVMGCGNZLWGDFJE: AuthenticateResponse"
53        (r"Match to [A-Z0-9_]+:", "Match to <redacted>:"),
54        // JSON "clientId" values from authenticateResponse blocks.
55        (
56            r#""[Cc]lient[Ii]d"\s*:\s*"[^"]+""#,
57            r#""clientId": "<redacted>""#,
58        ),
59        // JSON "userId" values from matchGameRoomStateChangedEvent blocks.
60        (
61            r#""[Uu]ser[Ii]d"\s*:\s*"[^"]+""#,
62            r#""userId": "<redacted>""#,
63        ),
64        // Windows paths: C:\Users\<username>\ (any drive letter)
65        (r"[A-Z]:\\Users\\[^\\]+\\", r"<user-path>\"),
66        // macOS paths: /Users/<username>/
67        (r"/Users/[^/]+/", "<user-path>/"),
68        // Linux paths: /home/<username>/
69        (r"/home/[^/]+/", "<user-path>/"),
70        // Session identifiers: JSON "token" values from authenticateResponse
71        // and similar auth payloads.
72        (r#""[Tt]oken"\s*:\s*"[^"]+""#, r#""token": "<redacted>""#),
73        // Session identifiers: JSON "sessionId" values from auth responses.
74        (
75            r#""[Ss]ession[Ii]d"\s*:\s*"[^"]+""#,
76            r#""sessionId": "<redacted>""#,
77        ),
78        // Display names: JSON "screenName" values from authenticateResponse.
79        (
80            r#""[Ss]creen[Nn]ame"\s*:\s*"[^"]+""#,
81            r#""screenName": "<redacted>""#,
82        ),
83        // Display names: JSON "playerName" values from match state.
84        // Contains BOTH players' display names, meaning opponent PII
85        // is leaked without this pattern.
86        (
87            r#""[Pp]layer[Nn]ame"\s*:\s*"[^"]+""#,
88            r#""playerName": "<redacted>""#,
89        ),
90        // Hardware fingerprint: GPU renderer line in log header.
91        // (?m) enables per-line ^ matching since we scrub the full text buffer.
92        // Leading whitespace (^\s+) is required to avoid false positives.
93        (r"(?m)^\s+Renderer:\s+.+", "  Renderer: <redacted>"),
94        // Hardware fingerprint: GPU vendor.
95        (r"(?m)^\s+Vendor:\s+.+", "  Vendor: <redacted>"),
96        // Hardware fingerprint: VRAM size in MB.
97        (r"(?m)^\s+VRAM:\s+.+", "  VRAM: <redacted>"),
98        // Hardware fingerprint: GPU driver version.
99        (r"(?m)^\s+Driver:\s+.+", "  Driver: <redacted>"),
100    ];
101
102    definitions
103        .iter()
104        .filter_map(|(pattern, replacement)| {
105            // These patterns are static string literals validated by tests.
106            // A compilation failure here indicates a programmer error in the
107            // pattern definitions above, not a runtime data issue.
108            match Regex::new(pattern) {
109                Ok(regex) => Some(ScrubPattern { regex, replacement }),
110                Err(e) => {
111                    ::log::error!("BUG: failed to compile privacy pattern {pattern:?}: {e}");
112                    None
113                }
114            }
115        })
116        .collect()
117});
118
119/// Redact PII and credentials from raw MTGA `Player.log` text.
120///
121/// Applies each compiled privacy regex pattern to the full input text,
122/// replacing all matches with redaction placeholders. Handles empty input,
123/// single-line input, and multi-megabyte files without panicking.
124///
125/// # Examples
126///
127/// ```
128/// use manasight_parser::sanitize::scrub_raw_log;
129///
130/// let raw = r#"Token: secret123 and "screenName": "Player#999""#;
131/// let clean = scrub_raw_log(raw);
132/// assert!(clean.contains("Token: <redacted>"));
133/// assert!(!clean.contains("secret123"));
134/// ```
135pub fn scrub_raw_log(input: &str) -> String {
136    if input.is_empty() {
137        return String::new();
138    }
139
140    let mut result = input.to_owned();
141    for pattern in SCRUB_PATTERNS.iter() {
142        result = pattern
143            .regex
144            .replace_all(&result, pattern.replacement)
145            .into_owned();
146    }
147    result
148}
149
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
153
#[cfg(test)]
mod tests {
    use super::*;

    // --- Shared assertion helpers ---

    /// Asserts that scrubbing `input` yields exactly `expected`.
    #[track_caller]
    fn assert_scrubs_to(input: &str, expected: &str) {
        assert_eq!(scrub_raw_log(input), expected);
    }

    /// Asserts that the scrubber returns `input` unchanged.
    #[track_caller]
    fn assert_untouched(input: &str) {
        assert_eq!(scrub_raw_log(input), input);
    }

    /// Scrubs `input`, asserts that every `present` needle occurs in the
    /// output and that no `absent` needle does, and returns the output so the
    /// caller can make further assertions on it.
    #[track_caller]
    fn scrub_and_check(input: &str, present: &[&str], absent: &[&str]) -> String {
        let result = scrub_raw_log(input);
        for needle in present {
            assert!(
                result.contains(needle),
                "expected {needle:?} in scrubbed output: {result:?}"
            );
        }
        for needle in absent {
            assert!(
                !result.contains(needle),
                "{needle:?} survived scrubbing: {result:?}"
            );
        }
        result
    }

    // --- Empty and trivial input ---

    #[test]
    fn test_scrub_raw_log_empty_input_returns_empty() {
        assert_scrubs_to("", "");
    }

    #[test]
    fn test_scrub_raw_log_single_line_no_sensitive_data_unchanged() {
        assert_untouched("[UnityCrossThreadLogger] Game started");
    }

    #[test]
    fn test_scrub_raw_log_multiline_no_sensitive_data_unchanged() {
        assert_untouched("Line 1\nLine 2\nLine 3\n");
    }

    // --- Auth token patterns ---

    #[test]
    fn test_scrub_raw_log_token_value_redacted() {
        assert_scrubs_to(
            "Token: eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.signature",
            "Token: <redacted>",
        );
    }

    #[test]
    fn test_scrub_raw_log_token_no_space_after_colon_redacted() {
        assert_scrubs_to("Token:abc123def456", "Token: <redacted>");
    }

    #[test]
    fn test_scrub_raw_log_token_with_surrounding_text() {
        assert_scrubs_to(
            "[Auth] Login response Token: eyJhbGciOiJSUzI1NiJ9.payload.sig -- done",
            "[Auth] Login response Token: <redacted> -- done",
        );
    }

    #[test]
    fn test_scrub_raw_log_multiple_tokens_on_separate_lines() {
        scrub_and_check(
            "Token: first_token\nSome other line\nToken: second_token\n",
            &["Token: <redacted>"],
            &["first_token", "second_token"],
        );
    }

    // --- Bearer token patterns ---

    #[test]
    fn test_scrub_raw_log_bearer_token_redacted() {
        assert_scrubs_to(
            "Authorization: Bearer eyJhbGciOiJSUzI1NiJ9.payload.signature",
            "Authorization: Bearer <redacted>",
        );
    }

    #[test]
    fn test_scrub_raw_log_bearer_with_extra_whitespace() {
        assert_scrubs_to("Bearer   some_token_value", "Bearer <redacted>");
    }

    #[test]
    fn test_scrub_raw_log_bearer_false_positive_standard_bearer_not_redacted() {
        assert_untouched(r#""Title_StandardBearer""#);
    }

    #[test]
    fn test_scrub_raw_log_bearer_jwt_still_redacted() {
        let result = scrub_and_check(
            "Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.payload.signature",
            &[],
            &["eyJhbGciOiJIUzI1NiJ9"],
        );
        assert_eq!(result, "Authorization: Bearer <redacted>");
    }

    // --- Windows path patterns ---

    #[test]
    fn test_scrub_raw_log_windows_path_redacted() {
        scrub_and_check(
            r"Loading from C:\Users\JohnDoe\AppData\LocalLow\Wizards Of The Coast\MTGA\Player.log",
            &[r"<user-path>\AppData\LocalLow"],
            &["JohnDoe"],
        );
    }

    #[test]
    fn test_scrub_raw_log_windows_path_different_drive_letter() {
        scrub_and_check(
            r"D:\Users\Alice\Documents\game.log",
            &[r"<user-path>\Documents"],
            &["Alice"],
        );
    }

    // --- macOS path patterns ---

    #[test]
    fn test_scrub_raw_log_macos_path_redacted() {
        scrub_and_check(
            "/Users/johndoe/Library/Logs/com.wizards.mtga/Player.log",
            &["<user-path>/Library/Logs"],
            &["johndoe"],
        );
    }

    #[test]
    fn test_scrub_raw_log_macos_path_with_spaces_in_context() {
        scrub_and_check(
            "Reading file at /Users/jane_doe/Library/Logs/app.log successfully",
            &["<user-path>/Library/Logs"],
            &["jane_doe"],
        );
    }

    // --- Linux path patterns ---

    #[test]
    fn test_scrub_raw_log_linux_path_redacted() {
        scrub_and_check(
            "/home/gamer/.local/share/Steam/steamapps/common/MTGA/Player.log",
            &["<user-path>/.local/share"],
            &["gamer"],
        );
    }

    #[test]
    fn test_scrub_raw_log_linux_path_different_username() {
        scrub_and_check(
            "Config at /home/mtg_player/.config/manasight/settings.toml",
            &["<user-path>/.config/manasight"],
            &["mtg_player"],
        );
    }

    // --- Session identifier patterns ---

    #[test]
    fn test_scrub_raw_log_json_token_value_redacted() {
        scrub_and_check(
            r#"{"screenName": "Player#1", "token": "abc123secret"}"#,
            &[r#""token": "<redacted>""#],
            &["abc123secret"],
        );
    }

    #[test]
    fn test_scrub_raw_log_json_token_uppercase_key_redacted() {
        scrub_and_check(
            r#"{"Token": "eyJhbGci.payload.sig"}"#,
            &[r#""token": "<redacted>""#],
            &["eyJhbGci"],
        );
    }

    #[test]
    fn test_scrub_raw_log_json_session_id_redacted() {
        scrub_and_check(
            r#"{"sessionId": "sess_abc123def456", "status": "connected"}"#,
            &[r#""sessionId": "<redacted>""#],
            &["sess_abc123def456"],
        );
    }

    #[test]
    fn test_scrub_raw_log_authenticate_response_block() {
        let input = concat!(
            "[UnityCrossThreadLogger]authenticateResponse\n",
            r#"{"screenName": "TestPlayer#12345", "token": "secret_jwt_value"}"#,
        );
        scrub_and_check(
            input,
            &[r#""token": "<redacted>""#, r#""screenName": "<redacted>""#],
            &["secret_jwt_value", "TestPlayer#12345"],
        );
    }

    #[test]
    fn test_scrub_raw_log_session_id_with_spaces_in_json() {
        scrub_and_check(
            r#"{ "SessionId" : "long-session-id-value-here" }"#,
            &[r#""sessionId": "<redacted>""#],
            &["long-session-id-value-here"],
        );
    }

    // --- WotC account ID in log prefix ---

    #[test]
    fn test_scrub_raw_log_match_to_account_id_redacted() {
        assert_scrubs_to(
            "Match to CR4QJUQPDBCVVMGCGNZLWGDFJE: AuthenticateResponse",
            "Match to <redacted>: AuthenticateResponse",
        );
    }

    #[test]
    fn test_scrub_raw_log_match_to_with_underscore_in_id() {
        assert_scrubs_to(
            "Match to SOME_ACCOUNT_ID_123: MatchCreated",
            "Match to <redacted>: MatchCreated",
        );
    }

    #[test]
    fn test_scrub_raw_log_match_to_with_log_timestamp_prefix() {
        let input = "[UnityCrossThreadLogger]3/22/2026 12:00:31 PM: Match to CR4QJUQPDBCVVMGCGNZLWGDFJE: AuthenticateResponse";
        scrub_and_check(
            input,
            &["Match to <redacted>:"],
            &["CR4QJUQPDBCVVMGCGNZLWGDFJE"],
        );
    }

    // --- JSON clientId pattern ---

    #[test]
    fn test_scrub_raw_log_json_client_id_redacted() {
        assert_scrubs_to(
            r#""clientId": "CR4QJUQPDBCVVMGCGNZLWGDFJE""#,
            r#""clientId": "<redacted>""#,
        );
    }

    #[test]
    fn test_scrub_raw_log_json_client_id_with_spaces() {
        scrub_and_check(
            r#"{ "ClientId" : "ABCDEF123456" }"#,
            &[r#""clientId": "<redacted>""#],
            &["ABCDEF123456"],
        );
    }

    // --- JSON userId pattern ---

    #[test]
    fn test_scrub_raw_log_json_user_id_redacted() {
        assert_scrubs_to(
            r#""userId": "CR4QJUQPDBCVVMGCGNZLWGDFJE""#,
            r#""userId": "<redacted>""#,
        );
    }

    #[test]
    fn test_scrub_raw_log_json_user_id_uppercase_key() {
        scrub_and_check(
            r#"{"UserId": "OPPONENT_ACCOUNT_ID_XYZ"}"#,
            &[r#""userId": "<redacted>""#],
            &["OPPONENT_ACCOUNT_ID_XYZ"],
        );
    }

    #[test]
    fn test_scrub_raw_log_json_user_id_in_match_event() {
        let result = scrub_and_check(
            r#"{"players": [{"userId": "PLAYER_ABC"}, {"userId": "OPPONENT_XYZ"}]}"#,
            &[],
            &["PLAYER_ABC", "OPPONENT_XYZ"],
        );
        assert_eq!(result.matches(r#""userId": "<redacted>""#).count(), 2);
    }

    // --- screenName pattern ---

    #[test]
    fn test_scrub_raw_log_screen_name_redacted() {
        assert_scrubs_to(
            r#""screenName": "PlayerDisplayName#12345""#,
            r#""screenName": "<redacted>""#,
        );
    }

    #[test]
    fn test_scrub_raw_log_screen_name_uppercase_key() {
        scrub_and_check(
            r#"{"ScreenName": "SomePlayer#99999"}"#,
            &[r#""screenName": "<redacted>""#],
            &["SomePlayer"],
        );
    }

    #[test]
    fn test_scrub_raw_log_screen_name_no_space_after_colon() {
        scrub_and_check(
            r#""screenName":"Truffie#12345""#,
            &[r#""screenName": "<redacted>""#],
            &["Truffie"],
        );
    }

    // --- playerName pattern ---

    #[test]
    fn test_scrub_raw_log_player_name_redacted() {
        assert_scrubs_to(
            r#""playerName": "OpponentName#67890""#,
            r#""playerName": "<redacted>""#,
        );
    }

    #[test]
    fn test_scrub_raw_log_player_name_both_players_redacted() {
        let result = scrub_and_check(
            r#"{"players": [{"playerName": "LocalPlayer#111"}, {"playerName": "Opponent#222"}]}"#,
            &[],
            &["LocalPlayer", "Opponent"],
        );
        assert_eq!(result.matches(r#""playerName": "<redacted>""#).count(), 2);
    }

    #[test]
    fn test_scrub_raw_log_player_name_uppercase_key() {
        scrub_and_check(
            r#"{"PlayerName": "SomeUser#42"}"#,
            &[r#""playerName": "<redacted>""#],
            &["SomeUser"],
        );
    }

    // --- Hardware fingerprint patterns ---

    #[test]
    fn test_scrub_raw_log_hardware_fingerprint_all_lines_redacted() {
        scrub_and_check(
            "  Renderer: NVIDIA GeForce RTX 3080\n  Vendor: NVIDIA\n  VRAM: 10240\n  Driver: 537.58",
            &[
                "Renderer: <redacted>",
                "Vendor: <redacted>",
                "VRAM: <redacted>",
                "Driver: <redacted>",
            ],
            &["NVIDIA GeForce RTX 3080", "NVIDIA", "10240", "537.58"],
        );
    }

    #[test]
    fn test_scrub_raw_log_hardware_fingerprint_in_full_log_header() {
        // Built line by line so the two-space indentation of the header
        // fields does not depend on how this source file is indented.
        let input = concat!(
            "[UnityCrossThreadLogger] Version: 1.2.3.4\n",
            "  SystemInfo:\n",
            "  Renderer: AMD Radeon RX 6800 XT\n",
            "  Vendor: AMD\n",
            "  VRAM: 16384\n",
            "  Driver: 23.12.1\n",
            "[UnityCrossThreadLogger] Game starting",
        );
        scrub_and_check(
            input,
            &["Version: 1.2.3.4", "Game starting"],
            &["AMD Radeon RX 6800 XT", "16384", "23.12.1"],
        );
    }

    #[test]
    fn test_scrub_raw_log_hardware_renderer_not_matched_without_leading_whitespace() {
        assert_untouched("Renderer: some game object reference");
    }

    #[test]
    fn test_scrub_raw_log_hardware_vendor_not_matched_without_leading_whitespace() {
        assert_untouched("Vendor: some vendor string in game data");
    }

    // --- Multiple patterns in one block ---

    #[test]
    fn test_scrub_raw_log_mixed_sensitive_data_all_redacted() {
        let input = concat!(
            "[Auth] Token: eyJhbGciOiJSUzI1NiJ9.payload.sig\n",
            "[HTTP] Authorization: Bearer eyToken123.payload.sig\n",
            "[Init] Loading config from C:\\Users\\JaneDoe\\AppData\\Local\\manasight\\config.toml\n",
            "[Init] Log path: /Users/johndoe/Library/Logs/manasight.log\n",
            "[Init] Linux path: /home/linuxuser/.local/share/manasight/data.db\n",
            "[Game] Match started: event=PlayQueue",
        );

        scrub_and_check(
            input,
            &[
                "Token: <redacted>",
                "Bearer <redacted>",
                r"<user-path>\AppData",
                "<user-path>/Library/Logs",
                "<user-path>/.local/share",
                // Non-sensitive lines must pass through untouched.
                "[Game] Match started: event=PlayQueue",
            ],
            &[
                "eyJhbGciOiJSUzI1NiJ9",
                "eyToken123",
                "JaneDoe",
                "johndoe",
                "linuxuser",
            ],
        );
    }

    // --- Edge cases ---

    #[test]
    fn test_scrub_raw_log_preserves_line_endings() {
        // Exact comparison: every CRLF must survive, not just one of them.
        assert_scrubs_to(
            "Line 1\r\nToken: secret_value\r\nLine 3\r\n",
            "Line 1\r\nToken: <redacted>\r\nLine 3\r\n",
        );
    }

    #[test]
    fn test_scrub_raw_log_large_input_does_not_panic() {
        let large_input = "Normal log line without sensitive data\n".repeat(25_000);
        assert_eq!(scrub_raw_log(&large_input).len(), large_input.len());
    }

    #[test]
    fn test_scrub_raw_log_token_at_end_of_line_no_trailing_space() {
        assert_scrubs_to("Token: abc123", "Token: <redacted>");
    }

    #[test]
    fn test_scrub_raw_log_bearer_at_end_of_line_no_trailing_space() {
        assert_scrubs_to("Bearer abc123", "Bearer <redacted>");
    }

    #[test]
    fn test_scrub_raw_log_path_only_line() {
        assert_scrubs_to(r"C:\Users\SomeUser\", r"<user-path>\");
    }

    #[test]
    fn test_scrub_raw_log_multiple_paths_on_same_line() {
        assert_scrubs_to(
            "Copied /Users/alice/source.txt to /Users/bob/dest.txt",
            "Copied <user-path>/source.txt to <user-path>/dest.txt",
        );
    }

    #[test]
    fn test_scrub_raw_log_idempotent() {
        let first_pass = scrub_raw_log("Token: secret123\n/home/user/.config/app.toml");
        let second_pass = scrub_raw_log(&first_pass);
        assert_eq!(first_pass, second_pass, "Scrubbing should be idempotent");
    }

    // --- Patterns that should NOT be redacted ---

    #[test]
    fn test_scrub_raw_log_lowercase_token_not_redacted() {
        assert_untouched("token: not_a_real_token");
    }

    #[test]
    fn test_scrub_raw_log_lowercase_bearer_not_redacted() {
        assert_untouched("bearer not_a_real_token");
    }

    #[test]
    fn test_scrub_raw_log_non_user_paths_not_redacted() {
        assert_untouched("/usr/local/bin/mtga\n/etc/config.toml\n/var/log/syslog");
    }

    // --- Corpus validation (env-gated, not run in CI) ---

    /// Run `scrub_raw_log` against every `.log` file in the corpus directory
    /// and verify that none of the probed PII patterns (display names and
    /// hardware fingerprint lines) survive scrubbing.
    ///
    /// Files that cannot be read as UTF-8 text are reported on stderr and
    /// skipped. An unreadable corpus directory fails the test.
    ///
    /// Skipped unless `SCRUBBER_CORPUS_DIR` is set:
    /// ```sh
    /// SCRUBBER_CORPUS_DIR=/tmp/smoke-corpus cargo test corpus_scrub -- --nocapture
    /// ```
    #[test]
    fn test_corpus_scrub_no_pii_survives() {
        let Ok(dir) = std::env::var("SCRUBBER_CORPUS_DIR") else {
            return;
        };
        let corpus_dir = std::path::PathBuf::from(dir);

        // The probes are static literals, so a compile failure is a bug in
        // this test; the message still carries the regex error to make that
        // bug easy to find.
        let compile = |pattern: &str| {
            Regex::new(pattern).unwrap_or_else(|e| {
                unreachable!("static probe pattern {pattern:?} failed to compile: {e}")
            })
        };
        // Capture group 1 is the value that must read `<redacted>` after
        // scrubbing. The hardware probes use `[ \t]` rather than `\s` so that
        // one probe match always corresponds to exactly one log line.
        let pii_patterns = [
            (
                "screenName",
                compile(r#""[Ss]creen[Nn]ame"\s*:\s*"([^"]+)""#),
            ),
            (
                "playerName",
                compile(r#""[Pp]layer[Nn]ame"\s*:\s*"([^"]+)""#),
            ),
            ("Renderer", compile(r"(?m)^[ \t]+Renderer:[ \t]+(.+)")),
            ("Vendor", compile(r"(?m)^[ \t]+Vendor:[ \t]+(.+)")),
            ("VRAM", compile(r"(?m)^[ \t]+VRAM:[ \t]+(.+)")),
            ("Driver", compile(r"(?m)^[ \t]+Driver:[ \t]+(.+)")),
        ];

        // A missing or unreadable directory is an ordinary environment
        // problem, so report the path and the I/O error instead of claiming
        // the branch is unreachable.
        let listing = std::fs::read_dir(&corpus_dir);
        assert!(
            listing.is_ok(),
            "cannot read SCRUBBER_CORPUS_DIR {}: {:?}",
            corpus_dir.display(),
            listing.as_ref().err()
        );
        let log_paths: Vec<_> = listing
            .into_iter()
            .flatten()
            .filter_map(Result::ok)
            .map(|entry| entry.path())
            .filter(|path| path.extension().is_some_and(|ext| ext == "log"))
            .collect();

        let mut total_before = 0usize;
        let mut failures: Vec<String> = Vec::new();

        for path in &log_paths {
            let filename = path.display();
            let raw = match std::fs::read_to_string(path) {
                Ok(text) => text,
                Err(e) => {
                    // Keep going (the corpus may hold non-UTF-8 files), but
                    // make the skip visible under `--nocapture`.
                    eprintln!("skipping {filename}: {e}");
                    continue;
                }
            };

            let scrubbed = scrub_raw_log(&raw);

            for (name, re) in &pii_patterns {
                total_before += re.find_iter(&raw).count();

                failures.extend(
                    re.captures_iter(&scrubbed)
                        .map(|cap| cap.get(1).map_or("", |m| m.as_str()))
                        .filter(|val| *val != "<redacted>")
                        .map(|val| format!("{filename}: {name} leaked: {val:?}")),
                );
            }
        }

        assert!(
            total_before > 0,
            "corpus should contain at least one PII match to be a meaningful test"
        );
        assert!(
            failures.is_empty(),
            "PII survived scrubbing in {} location(s) (of {total_before} raw matches):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}