Skip to main content

manasight_parser/
sanitize.rs

1//! Privacy scrubber for raw MTGA log text.
2//!
3//! Strips sensitive data (auth tokens, bearer tokens, OS-specific user paths,
4//! session identifiers, display names, email addresses, IP addresses, and
5//! hardware fingerprint lines) from unstructured `Player.log` text. This is a
6//! best-effort filter; novel token formats may slip through.
7//!
8//! Regex patterns are compiled once via [`std::sync::LazyLock`] and reused
9//! across all calls.
10
11use std::sync::LazyLock;
12
13use regex::Regex;
14
15/// A compiled regex pattern paired with its replacement string.
16struct ScrubPattern {
17    regex: Regex,
18    replacement: &'static str,
19    /// When `true`, this pattern redacts a player display name field
20    /// (`screenName` or `playerName`). Used by [`scrub_raw_log_with`] to
21    /// conditionally skip name redaction when [`ScrubOptions::keep_player_names`]
22    /// is set.
23    is_player_name: bool,
24}
25
/// Switches that select which classes of data [`scrub_raw_log_with`] redacts.
///
/// Every field defaults to `false`; the default value therefore gives the
/// same (maximum) redaction as [`scrub_raw_log`].
///
/// # Examples
///
/// ```
/// use manasight_parser::{ScrubOptions, scrub_raw_log_with};
///
/// // Preserve player names while still redacting everything else.
/// let opts = ScrubOptions { keep_player_names: true };
/// let raw = r#"Token: secret123 and "screenName": "Player#999""#;
/// let clean = scrub_raw_log_with(raw, &opts);
/// assert!(clean.contains("Token: <redacted>"));
/// assert!(clean.contains(r#""Player#999""#));
/// ```
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct ScrubOptions {
    /// Leave the `screenName` and `playerName` JSON fields untouched when
    /// `true`. Every other rule (tokens, bearer tokens, user paths,
    /// `clientId`, `userId`, `sessionId`, email addresses, IP addresses,
    /// hardware fingerprints) is still applied.
    ///
    /// Intended for upload destinations that need both players' handles for
    /// replay or analytics attribution (AC-OPP-1).
    pub keep_player_names: bool,
}
54
55/// Compiled privacy-scrubbing patterns, initialized once on first use.
56///
57/// Each entry strips a class of sensitive data from raw log lines:
58/// - Auth tokens (`Token: <value>`)
59/// - Bearer tokens (`Bearer <value>`, word-boundary guarded to avoid game
60///   cosmetic false positives like `Title_StandardBearer`)
61/// - `WotC` account IDs in log prefixes (`Match to <id>:`)
62/// - JSON `"clientId"` and `"userId"` values
63/// - Windows user paths (`C:\Users\<username>\`)
64/// - macOS user paths (`/Users/<username>/`)
65/// - Linux user paths (`/home/<username>/`)
66/// - Session identifiers (JSON `"token"` and `"sessionId"` values)
67/// - Display names (JSON `"screenName"` and `"playerName"` values)
68/// - Hardware fingerprint lines (Renderer, Vendor, VRAM, Driver)
69/// - Email addresses
70/// - IPv4 dotted-quad addresses
71/// - IPv6 addresses (compressed, full, `::1`, `fe80::` link-local)
72static SCRUB_PATTERNS: LazyLock<Vec<ScrubPattern>> = LazyLock::new(|| {
73    // Patterns, replacements, and per-pattern flags.
74    // Each regex is compiled exactly once.
75    // Order matters: more specific patterns should come before general ones
76    // if there is overlap. Currently there is no overlap between categories.
77    //
78    // Tuple fields: (pattern, replacement, is_player_name)
79    let definitions: &[(&str, &str, bool)] = &[
80        // Auth tokens: "Token: <base64-or-hex-value>"
81        // Matches "Token:" followed by optional whitespace and a non-whitespace token value.
82        (r"Token:\s*\S+", "Token: <redacted>", false),
83        // Bearer tokens in HTTP Authorization headers.
84        // Uses word boundary to avoid matching game cosmetics like
85        // "Title_StandardBearer" where "Bearer" appears as a substring
86        // of a larger word. The \b anchor matches at the start of the
87        // string or after a non-word character, so "Bearer" following
88        // a letter (as in "StandardBearer") does not match.
89        (r"\bBearer\s+\S+", "Bearer <redacted>", false),
90        // WotC account IDs in log line prefixes.
91        // Arena logs game messages prefixed with the player's account ID:
92        //   "Match to CR4QJUQPDBCVVMGCGNZLWGDFJE: AuthenticateResponse"
93        (r"Match to [A-Z0-9_]+:", "Match to <redacted>:", false),
94        // JSON "clientId" values from authenticateResponse blocks.
95        (
96            r#""[Cc]lient[Ii]d"\s*:\s*"[^"]+""#,
97            r#""clientId": "<redacted>""#,
98            false,
99        ),
100        // JSON "userId" values from matchGameRoomStateChangedEvent blocks.
101        (
102            r#""[Uu]ser[Ii]d"\s*:\s*"[^"]+""#,
103            r#""userId": "<redacted>""#,
104            false,
105        ),
106        // Windows paths: C:\Users\<username>\ (any drive letter)
107        (r"[A-Z]:\\Users\\[^\\]+\\", r"<user-path>\", false),
108        // macOS paths: /Users/<username>/
109        (r"/Users/[^/]+/", "<user-path>/", false),
110        // Linux paths: /home/<username>/
111        (r"/home/[^/]+/", "<user-path>/", false),
112        // Session identifiers: JSON "token" values from authenticateResponse
113        // and similar auth payloads.
114        (
115            r#""[Tt]oken"\s*:\s*"[^"]+""#,
116            r#""token": "<redacted>""#,
117            false,
118        ),
119        // Session identifiers: JSON "sessionId" values from auth responses.
120        (
121            r#""[Ss]ession[Ii]d"\s*:\s*"[^"]+""#,
122            r#""sessionId": "<redacted>""#,
123            false,
124        ),
125        // Display names: JSON "screenName" values from authenticateResponse.
126        // is_player_name = true so scrub_raw_log_with can skip this when
127        // keep_player_names is set.
128        (
129            r#""[Ss]creen[Nn]ame"\s*:\s*"[^"]+""#,
130            r#""screenName": "<redacted>""#,
131            true,
132        ),
133        // Display names: JSON "playerName" values from match state.
134        // Contains BOTH players' display names, meaning opponent PII
135        // is leaked without this pattern.
136        // is_player_name = true — skipped when keep_player_names is set.
137        (
138            r#""[Pp]layer[Nn]ame"\s*:\s*"[^"]+""#,
139            r#""playerName": "<redacted>""#,
140            true,
141        ),
142        // Hardware fingerprint: GPU renderer line in log header.
143        // (?m) enables per-line ^ matching since we scrub the full text buffer.
144        // Leading whitespace (^\s+) is required to avoid false positives.
145        (r"(?m)^\s+Renderer:\s+.+", "  Renderer: <redacted>", false),
146        // Hardware fingerprint: GPU vendor.
147        (r"(?m)^\s+Vendor:\s+.+", "  Vendor: <redacted>", false),
148        // Hardware fingerprint: VRAM size in MB.
149        (r"(?m)^\s+VRAM:\s+.+", "  VRAM: <redacted>", false),
150        // Hardware fingerprint: GPU driver version.
151        (r"(?m)^\s+Driver:\s+.+", "  Driver: <redacted>", false),
152        // Email addresses (defense-in-depth; MTGA logs carry no known third-party
153        // emails empirically, but this closes a latent gap for future client changes).
154        (
155            r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
156            "<email-redacted>",
157            false,
158        ),
159        // IPv6 addresses — matched BEFORE IPv4 to avoid the embedded IPv4 portion
160        // of IPv4-mapped IPv6 addresses being double-substituted.
161        //
162        // Covers: full 8-group addresses, compressed addresses (::), loopback (::1),
163        // link-local (fe80::...), and IPv4-mapped (::ffff:a.b.c.d).
164        //
165        // Three alternations (leftmost wins):
166        //   1. `::` optionally followed by hex groups — covers `::1`, `::`, `::ffff:...`
167        //   2. One or more hex groups followed by `::` and optional hex tail — covers
168        //      `fe80::1`, `2001:db8::1`
169        //   3. Three or more colon-separated hex groups without `::` — covers full
170        //      8-group addresses like `2001:0db8:85a3:0000:0000:8a2e:0370:7334`
171        //
172        // Alternation 1 uses no leading \b because `::` starts with a non-word
173        // character. Alternations 2 and 3 use \b to avoid partial matches inside
174        // larger tokens.
175        (
176            concat!(
177                r"::(?:[0-9a-fA-F]{1,4}(?::[0-9a-fA-F]{1,4})*)?",
178                r"|\b[0-9a-fA-F]{1,4}(?::[0-9a-fA-F]{1,4})*::[0-9a-fA-F]{0,4}(?::[0-9a-fA-F]{1,4})*",
179                r"|\b(?:[0-9a-fA-F]{1,4}:){3,7}[0-9a-fA-F]{1,4}\b",
180            ),
181            "<ip-redacted>",
182            false,
183        ),
184        // IPv4 dotted-quad addresses (defense-in-depth).
185        //
186        // NOTE: A straightforward dotted-quad regex also matches version strings
187        // of the form "N.N.N.N" (e.g. "Version: 1.2.3.4" in the MTGA log header).
188        // Because the `regex` crate is DFA-based and does not support lookbehind,
189        // there is no way to exclude the version-line context without substantial
190        // added complexity. The deliberate tradeoff here is that a 4-segment version
191        // string is syntactically indistinguishable from an IPv4 address; redacting
192        // it is acceptable as defense-in-depth (AC-PRIV-8). The test fixture
193        // `test_scrub_raw_log_hardware_fingerprint_in_full_log_header` has been
194        // updated accordingly.
195        (
196            r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b",
197            "<ip-redacted>",
198            false,
199        ),
200    ];
201
202    definitions
203        .iter()
204        .filter_map(|(pattern, replacement, is_player_name)| {
205            // These patterns are static string literals validated by tests.
206            // A compilation failure here indicates a programmer error in the
207            // pattern definitions above, not a runtime data issue.
208            match Regex::new(pattern) {
209                Ok(regex) => Some(ScrubPattern {
210                    regex,
211                    replacement,
212                    is_player_name: *is_player_name,
213                }),
214                Err(e) => {
215                    ::log::error!("BUG: failed to compile privacy pattern {pattern:?}: {e}");
216                    None
217                }
218            }
219        })
220        .collect()
221});
222
223/// Redact PII and credentials from raw MTGA `Player.log` text.
224///
225/// Applies each compiled privacy regex pattern to the full input text,
226/// replacing all matches with redaction placeholders. Handles empty input,
227/// single-line input, and multi-megabyte files without panicking.
228///
229/// This is equivalent to `scrub_raw_log_with(input, &ScrubOptions::default())`.
230///
231/// # Examples
232///
233/// ```
234/// use manasight_parser::sanitize::scrub_raw_log;
235///
236/// let raw = r#"Token: secret123 and "screenName": "Player#999""#;
237/// let clean = scrub_raw_log(raw);
238/// assert!(clean.contains("Token: <redacted>"));
239/// assert!(!clean.contains("secret123"));
240/// ```
241pub fn scrub_raw_log(input: &str) -> String {
242    scrub_raw_log_with(input, &ScrubOptions::default())
243}
244
245/// Redact PII and credentials from raw MTGA `Player.log` text with configurable options.
246///
247/// Like [`scrub_raw_log`], but accepts a [`ScrubOptions`] value to control which
248/// data classes are redacted. See [`ScrubOptions`] for available flags.
249///
250/// # Examples
251///
252/// ```
253/// use manasight_parser::{ScrubOptions, scrub_raw_log_with};
254///
255/// // Keep player handles for server-side replay attribution.
256/// let opts = ScrubOptions { keep_player_names: true };
257/// let raw = r#""screenName": "TimCahill#1234", "token": "secret""#;
258/// let clean = scrub_raw_log_with(raw, &opts);
259/// assert!(clean.contains("TimCahill#1234"));
260/// assert!(clean.contains(r#""token": "<redacted>""#));
261/// ```
262pub fn scrub_raw_log_with(input: &str, opts: &ScrubOptions) -> String {
263    if input.is_empty() {
264        return String::new();
265    }
266
267    let mut result = input.to_owned();
268    for pattern in SCRUB_PATTERNS.iter() {
269        if opts.keep_player_names && pattern.is_player_name {
270            continue;
271        }
272        result = pattern
273            .regex
274            .replace_all(&result, pattern.replacement)
275            .into_owned();
276    }
277    result
278}
279
280// ---------------------------------------------------------------------------
281// Tests
282// ---------------------------------------------------------------------------
283
284#[cfg(test)]
285mod tests {
286    use super::*;
287
288    // --- Empty and trivial input ---
289
290    #[test]
291    fn test_scrub_raw_log_empty_input_returns_empty() {
292        assert_eq!(scrub_raw_log(""), "");
293    }
294
295    #[test]
296    fn test_scrub_raw_log_single_line_no_sensitive_data_unchanged() {
297        let input = "[UnityCrossThreadLogger] Game started";
298        assert_eq!(scrub_raw_log(input), input);
299    }
300
301    #[test]
302    fn test_scrub_raw_log_multiline_no_sensitive_data_unchanged() {
303        let input = "Line 1\nLine 2\nLine 3\n";
304        assert_eq!(scrub_raw_log(input), input);
305    }
306
307    // --- Auth token patterns ---
308
309    #[test]
310    fn test_scrub_raw_log_token_value_redacted() {
311        let input =
312            "Token: eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.signature";
313        let result = scrub_raw_log(input);
314        assert_eq!(result, "Token: <redacted>");
315    }
316
317    #[test]
318    fn test_scrub_raw_log_token_no_space_after_colon_redacted() {
319        let input = "Token:abc123def456";
320        let result = scrub_raw_log(input);
321        assert_eq!(result, "Token: <redacted>");
322    }
323
324    #[test]
325    fn test_scrub_raw_log_token_with_surrounding_text() {
326        let input = "[Auth] Login response Token: eyJhbGciOiJSUzI1NiJ9.payload.sig -- done";
327        let result = scrub_raw_log(input);
328        assert_eq!(result, "[Auth] Login response Token: <redacted> -- done");
329    }
330
331    #[test]
332    fn test_scrub_raw_log_multiple_tokens_on_separate_lines() {
333        let input = "Token: first_token\nSome other line\nToken: second_token\n";
334        let result = scrub_raw_log(input);
335        assert!(result.contains("Token: <redacted>"));
336        assert!(!result.contains("first_token"));
337        assert!(!result.contains("second_token"));
338    }
339
340    // --- Bearer token patterns ---
341
342    #[test]
343    fn test_scrub_raw_log_bearer_token_redacted() {
344        let input = "Authorization: Bearer eyJhbGciOiJSUzI1NiJ9.payload.signature";
345        let result = scrub_raw_log(input);
346        assert_eq!(result, "Authorization: Bearer <redacted>");
347    }
348
349    #[test]
350    fn test_scrub_raw_log_bearer_with_extra_whitespace() {
351        let input = "Bearer   some_token_value";
352        let result = scrub_raw_log(input);
353        assert_eq!(result, "Bearer <redacted>");
354    }
355
356    #[test]
357    fn test_scrub_raw_log_bearer_false_positive_standard_bearer_not_redacted() {
358        let input = r#""Title_StandardBearer""#;
359        assert_eq!(scrub_raw_log(input), input);
360    }
361
362    #[test]
363    fn test_scrub_raw_log_bearer_jwt_still_redacted() {
364        let input = "Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.payload.signature";
365        let result = scrub_raw_log(input);
366        assert_eq!(result, "Authorization: Bearer <redacted>");
367        assert!(!result.contains("eyJhbGciOiJIUzI1NiJ9"));
368    }
369
370    // --- Windows path patterns ---
371
372    #[test]
373    fn test_scrub_raw_log_windows_path_redacted() {
374        let input =
375            r"Loading from C:\Users\JohnDoe\AppData\LocalLow\Wizards Of The Coast\MTGA\Player.log";
376        let result = scrub_raw_log(input);
377        assert!(result.contains(r"<user-path>\AppData\LocalLow"));
378        assert!(!result.contains("JohnDoe"));
379    }
380
381    #[test]
382    fn test_scrub_raw_log_windows_path_different_drive_letter() {
383        let input = r"D:\Users\Alice\Documents\game.log";
384        let result = scrub_raw_log(input);
385        assert!(result.contains(r"<user-path>\Documents"));
386        assert!(!result.contains("Alice"));
387    }
388
389    // --- macOS path patterns ---
390
391    #[test]
392    fn test_scrub_raw_log_macos_path_redacted() {
393        let input = "/Users/johndoe/Library/Logs/com.wizards.mtga/Player.log";
394        let result = scrub_raw_log(input);
395        assert!(result.contains("<user-path>/Library/Logs"));
396        assert!(!result.contains("johndoe"));
397    }
398
399    #[test]
400    fn test_scrub_raw_log_macos_path_with_spaces_in_context() {
401        let input = "Reading file at /Users/jane_doe/Library/Logs/app.log successfully";
402        let result = scrub_raw_log(input);
403        assert!(result.contains("<user-path>/Library/Logs"));
404        assert!(!result.contains("jane_doe"));
405    }
406
407    // --- Linux path patterns ---
408
409    #[test]
410    fn test_scrub_raw_log_linux_path_redacted() {
411        let input = "/home/gamer/.local/share/Steam/steamapps/common/MTGA/Player.log";
412        let result = scrub_raw_log(input);
413        assert!(result.contains("<user-path>/.local/share"));
414        assert!(!result.contains("gamer"));
415    }
416
417    #[test]
418    fn test_scrub_raw_log_linux_path_different_username() {
419        let input = "Config at /home/mtg_player/.config/manasight/settings.toml";
420        let result = scrub_raw_log(input);
421        assert!(result.contains("<user-path>/.config/manasight"));
422        assert!(!result.contains("mtg_player"));
423    }
424
425    // --- Session identifier patterns ---
426
427    #[test]
428    fn test_scrub_raw_log_json_token_value_redacted() {
429        let input = r#"{"screenName": "Player#1", "token": "abc123secret"}"#;
430        let result = scrub_raw_log(input);
431        assert!(result.contains(r#""token": "<redacted>""#));
432        assert!(!result.contains("abc123secret"));
433    }
434
435    #[test]
436    fn test_scrub_raw_log_json_token_uppercase_key_redacted() {
437        let input = r#"{"Token": "eyJhbGci.payload.sig"}"#;
438        let result = scrub_raw_log(input);
439        assert!(result.contains(r#""token": "<redacted>""#));
440        assert!(!result.contains("eyJhbGci"));
441    }
442
443    #[test]
444    fn test_scrub_raw_log_json_session_id_redacted() {
445        let input = r#"{"sessionId": "sess_abc123def456", "status": "connected"}"#;
446        let result = scrub_raw_log(input);
447        assert!(result.contains(r#""sessionId": "<redacted>""#));
448        assert!(!result.contains("sess_abc123def456"));
449    }
450
451    #[test]
452    fn test_scrub_raw_log_authenticate_response_block() {
453        let input = "[UnityCrossThreadLogger]authenticateResponse\n\
454                     {\"screenName\": \"TestPlayer#12345\", \"token\": \"secret_jwt_value\"}";
455        let result = scrub_raw_log(input);
456        assert!(!result.contains("secret_jwt_value"));
457        assert!(result.contains(r#""token": "<redacted>""#));
458        assert!(!result.contains("TestPlayer#12345"));
459        assert!(result.contains(r#""screenName": "<redacted>""#));
460    }
461
462    #[test]
463    fn test_scrub_raw_log_session_id_with_spaces_in_json() {
464        let input = r#"{ "SessionId" : "long-session-id-value-here" }"#;
465        let result = scrub_raw_log(input);
466        assert!(result.contains(r#""sessionId": "<redacted>""#));
467        assert!(!result.contains("long-session-id-value-here"));
468    }
469
470    // --- WotC account ID in log prefix ---
471
472    #[test]
473    fn test_scrub_raw_log_match_to_account_id_redacted() {
474        let input = "Match to CR4QJUQPDBCVVMGCGNZLWGDFJE: AuthenticateResponse";
475        let result = scrub_raw_log(input);
476        assert_eq!(result, "Match to <redacted>: AuthenticateResponse");
477        assert!(!result.contains("CR4QJUQPDBCVVMGCGNZLWGDFJE"));
478    }
479
480    #[test]
481    fn test_scrub_raw_log_match_to_with_underscore_in_id() {
482        let input = "Match to SOME_ACCOUNT_ID_123: MatchCreated";
483        let result = scrub_raw_log(input);
484        assert_eq!(result, "Match to <redacted>: MatchCreated");
485        assert!(!result.contains("SOME_ACCOUNT_ID_123"));
486    }
487
488    #[test]
489    fn test_scrub_raw_log_match_to_with_log_timestamp_prefix() {
490        let input = "[UnityCrossThreadLogger]3/22/2026 12:00:31 PM: Match to CR4QJUQPDBCVVMGCGNZLWGDFJE: AuthenticateResponse";
491        let result = scrub_raw_log(input);
492        assert!(result.contains("Match to <redacted>:"));
493        assert!(!result.contains("CR4QJUQPDBCVVMGCGNZLWGDFJE"));
494    }
495
496    // --- JSON clientId pattern ---
497
498    #[test]
499    fn test_scrub_raw_log_json_client_id_redacted() {
500        let input = r#""clientId": "CR4QJUQPDBCVVMGCGNZLWGDFJE""#;
501        let result = scrub_raw_log(input);
502        assert_eq!(result, r#""clientId": "<redacted>""#);
503        assert!(!result.contains("CR4QJUQPDBCVVMGCGNZLWGDFJE"));
504    }
505
506    #[test]
507    fn test_scrub_raw_log_json_client_id_with_spaces() {
508        let input = r#"{ "ClientId" : "ABCDEF123456" }"#;
509        let result = scrub_raw_log(input);
510        assert!(result.contains(r#""clientId": "<redacted>""#));
511        assert!(!result.contains("ABCDEF123456"));
512    }
513
514    // --- JSON userId pattern ---
515
516    #[test]
517    fn test_scrub_raw_log_json_user_id_redacted() {
518        let input = r#""userId": "CR4QJUQPDBCVVMGCGNZLWGDFJE""#;
519        let result = scrub_raw_log(input);
520        assert_eq!(result, r#""userId": "<redacted>""#);
521        assert!(!result.contains("CR4QJUQPDBCVVMGCGNZLWGDFJE"));
522    }
523
524    #[test]
525    fn test_scrub_raw_log_json_user_id_uppercase_key() {
526        let input = r#"{"UserId": "OPPONENT_ACCOUNT_ID_XYZ"}"#;
527        let result = scrub_raw_log(input);
528        assert!(result.contains(r#""userId": "<redacted>""#));
529        assert!(!result.contains("OPPONENT_ACCOUNT_ID_XYZ"));
530    }
531
532    #[test]
533    fn test_scrub_raw_log_json_user_id_in_match_event() {
534        let input = r#"{"players": [{"userId": "PLAYER_ABC"}, {"userId": "OPPONENT_XYZ"}]}"#;
535        let result = scrub_raw_log(input);
536        assert!(!result.contains("PLAYER_ABC"));
537        assert!(!result.contains("OPPONENT_XYZ"));
538        assert_eq!(result.matches(r#""userId": "<redacted>""#).count(), 2);
539    }
540
541    // --- screenName pattern ---
542
543    #[test]
544    fn test_scrub_raw_log_screen_name_redacted() {
545        let input = r#""screenName": "PlayerDisplayName#12345""#;
546        let result = scrub_raw_log(input);
547        assert_eq!(result, r#""screenName": "<redacted>""#);
548        assert!(!result.contains("PlayerDisplayName"));
549    }
550
551    #[test]
552    fn test_scrub_raw_log_screen_name_uppercase_key() {
553        let input = r#"{"ScreenName": "SomePlayer#99999"}"#;
554        let result = scrub_raw_log(input);
555        assert!(result.contains(r#""screenName": "<redacted>""#));
556        assert!(!result.contains("SomePlayer"));
557    }
558
559    #[test]
560    fn test_scrub_raw_log_screen_name_no_space_after_colon() {
561        let input = r#""screenName":"Truffie#12345""#;
562        let result = scrub_raw_log(input);
563        assert!(result.contains(r#""screenName": "<redacted>""#));
564        assert!(!result.contains("Truffie"));
565    }
566
567    // --- playerName pattern ---
568
569    #[test]
570    fn test_scrub_raw_log_player_name_redacted() {
571        let input = r#""playerName": "OpponentName#67890""#;
572        let result = scrub_raw_log(input);
573        assert_eq!(result, r#""playerName": "<redacted>""#);
574        assert!(!result.contains("OpponentName"));
575    }
576
577    #[test]
578    fn test_scrub_raw_log_player_name_both_players_redacted() {
579        let input =
580            r#"{"players": [{"playerName": "LocalPlayer#111"}, {"playerName": "Opponent#222"}]}"#;
581        let result = scrub_raw_log(input);
582        assert!(!result.contains("LocalPlayer"));
583        assert!(!result.contains("Opponent"));
584        assert_eq!(result.matches(r#""playerName": "<redacted>""#).count(), 2);
585    }
586
587    #[test]
588    fn test_scrub_raw_log_player_name_uppercase_key() {
589        let input = r#"{"PlayerName": "SomeUser#42"}"#;
590        let result = scrub_raw_log(input);
591        assert!(result.contains(r#""playerName": "<redacted>""#));
592        assert!(!result.contains("SomeUser"));
593    }
594
595    // --- Hardware fingerprint patterns ---
596
597    #[test]
598    fn test_scrub_raw_log_hardware_fingerprint_all_lines_redacted() {
599        let input =
600            "  Renderer: NVIDIA GeForce RTX 3080\n  Vendor: NVIDIA\n  VRAM: 10240\n  Driver: 537.58";
601        let result = scrub_raw_log(input);
602        assert!(!result.contains("NVIDIA GeForce RTX 3080"));
603        assert!(!result.contains("NVIDIA"));
604        assert!(!result.contains("10240"));
605        assert!(!result.contains("537.58"));
606        assert!(result.contains("Renderer: <redacted>"));
607        assert!(result.contains("Vendor: <redacted>"));
608        assert!(result.contains("VRAM: <redacted>"));
609        assert!(result.contains("Driver: <redacted>"));
610    }
611
612    #[test]
613    fn test_scrub_raw_log_hardware_fingerprint_in_full_log_header() {
614        // NOTE: "Version: 1.2.3.4" is intentionally redacted to "<ip-redacted>"
615        // by the IPv4 pattern. A 4-segment numeric string is syntactically
616        // indistinguishable from an IPv4 address without semantic context, and
617        // the `regex` crate (DFA-based) provides no lookbehind to exclude the
618        // version-line context. Redacting it is acceptable as defense-in-depth
619        // (AC-PRIV-8): the version number is not PII and its loss in the
620        // scrubbed upload blob does not affect replay correctness or analytics.
621        let input = "\
622[UnityCrossThreadLogger] Version: 1.2.3.4
623  SystemInfo:
624  Renderer: AMD Radeon RX 6800 XT
625  Vendor: AMD
626  VRAM: 16384
627  Driver: 23.12.1
628[UnityCrossThreadLogger] Game starting";
629        let result = scrub_raw_log(input);
630        assert!(!result.contains("AMD Radeon RX 6800 XT"));
631        assert!(!result.contains("16384"));
632        assert!(!result.contains("23.12.1"));
633        // Version string is redacted by the IPv4 pattern (deliberate tradeoff —
634        // see comment above).
635        assert!(!result.contains("1.2.3.4"));
636        assert!(result.contains("Version: <ip-redacted>"));
637        assert!(result.contains("Game starting"));
638    }
639
640    #[test]
641    fn test_scrub_raw_log_hardware_renderer_not_matched_without_leading_whitespace() {
642        let input = "Renderer: some game object reference";
643        assert_eq!(scrub_raw_log(input), input);
644    }
645
646    #[test]
647    fn test_scrub_raw_log_hardware_vendor_not_matched_without_leading_whitespace() {
648        let input = "Vendor: some vendor string in game data";
649        assert_eq!(scrub_raw_log(input), input);
650    }
651
652    // --- Multiple patterns in one block ---
653
654    #[test]
655    fn test_scrub_raw_log_mixed_sensitive_data_all_redacted() {
656        let input = "\
657[Auth] Token: eyJhbGciOiJSUzI1NiJ9.payload.sig
658[HTTP] Authorization: Bearer eyToken123.payload.sig
659[Init] Loading config from C:\\Users\\JaneDoe\\AppData\\Local\\manasight\\config.toml
660[Init] Log path: /Users/johndoe/Library/Logs/manasight.log
661[Init] Linux path: /home/linuxuser/.local/share/manasight/data.db
662[Game] Match started: event=PlayQueue";
663
664        let result = scrub_raw_log(input);
665
666        assert!(!result.contains("eyJhbGciOiJSUzI1NiJ9"));
667        assert!(!result.contains("eyToken123"));
668        assert!(!result.contains("JaneDoe"));
669        assert!(!result.contains("johndoe"));
670        assert!(!result.contains("linuxuser"));
671
672        assert!(result.contains("Token: <redacted>"));
673        assert!(result.contains("Bearer <redacted>"));
674        assert!(result.contains(r"<user-path>\AppData"));
675        assert!(result.contains("<user-path>/Library/Logs"));
676        assert!(result.contains("<user-path>/.local/share"));
677
678        assert!(result.contains("[Game] Match started: event=PlayQueue"));
679    }
680
681    // --- Edge cases ---
682
683    #[test]
684    fn test_scrub_raw_log_preserves_line_endings() {
685        let input = "Line 1\r\nToken: secret_value\r\nLine 3\r\n";
686        let result = scrub_raw_log(input);
687        assert!(result.contains("\r\n"));
688        assert!(result.contains("Token: <redacted>"));
689    }
690
691    #[test]
692    fn test_scrub_raw_log_large_input_does_not_panic() {
693        let line = "Normal log line without sensitive data\n";
694        let large_input: String = line.repeat(25_000);
695        let result = scrub_raw_log(&large_input);
696        assert_eq!(result.len(), large_input.len());
697    }
698
699    #[test]
700    fn test_scrub_raw_log_token_at_end_of_line_no_trailing_space() {
701        let input = "Token: abc123";
702        let result = scrub_raw_log(input);
703        assert_eq!(result, "Token: <redacted>");
704    }
705
706    #[test]
707    fn test_scrub_raw_log_bearer_at_end_of_line_no_trailing_space() {
708        let input = "Bearer abc123";
709        let result = scrub_raw_log(input);
710        assert_eq!(result, "Bearer <redacted>");
711    }
712
713    #[test]
714    fn test_scrub_raw_log_path_only_line() {
715        let input = r"C:\Users\SomeUser\";
716        let result = scrub_raw_log(input);
717        assert_eq!(result, r"<user-path>\");
718    }
719
720    #[test]
721    fn test_scrub_raw_log_multiple_paths_on_same_line() {
722        let input = "Copied /Users/alice/source.txt to /Users/bob/dest.txt";
723        let result = scrub_raw_log(input);
724        assert!(!result.contains("alice"));
725        assert!(!result.contains("bob"));
726        assert_eq!(
727            result,
728            "Copied <user-path>/source.txt to <user-path>/dest.txt"
729        );
730    }
731
732    #[test]
733    fn test_scrub_raw_log_idempotent() {
734        let input = "Token: secret123\n/home/user/.config/app.toml";
735        let first_pass = scrub_raw_log(input);
736        let second_pass = scrub_raw_log(&first_pass);
737        assert_eq!(first_pass, second_pass, "Scrubbing should be idempotent");
738    }
739
740    // --- Patterns that should NOT be redacted ---
741
742    #[test]
743    fn test_scrub_raw_log_lowercase_token_not_redacted() {
744        let input = "token: not_a_real_token";
745        assert_eq!(scrub_raw_log(input), input);
746    }
747
748    #[test]
749    fn test_scrub_raw_log_lowercase_bearer_not_redacted() {
750        let input = "bearer not_a_real_token";
751        assert_eq!(scrub_raw_log(input), input);
752    }
753
754    #[test]
755    fn test_scrub_raw_log_non_user_paths_not_redacted() {
756        let input = "/usr/local/bin/mtga\n/etc/config.toml\n/var/log/syslog";
757        assert_eq!(scrub_raw_log(input), input);
758    }
759
760    // --- ScrubOptions / keep_player_names ---
761
762    #[test]
763    fn test_scrub_raw_log_with_keep_player_names_false_redacts_names() {
764        let opts = ScrubOptions {
765            keep_player_names: false,
766        };
767        let input = r#""screenName": "Alice#123", "playerName": "Bob#456""#;
768        let result = scrub_raw_log_with(input, &opts);
769        assert!(!result.contains("Alice"));
770        assert!(!result.contains("Bob"));
771        assert!(result.contains(r#""screenName": "<redacted>""#));
772        assert!(result.contains(r#""playerName": "<redacted>""#));
773    }
774
775    #[test]
776    fn test_scrub_raw_log_with_keep_player_names_true_preserves_names() {
777        let opts = ScrubOptions {
778            keep_player_names: true,
779        };
780        let input = r#""screenName": "Alice#123", "playerName": "Bob#456""#;
781        let result = scrub_raw_log_with(input, &opts);
782        assert!(result.contains("Alice#123"));
783        assert!(result.contains("Bob#456"));
784    }
785
786    #[test]
787    fn test_scrub_raw_log_with_keep_player_names_true_still_redacts_tokens() {
788        let opts = ScrubOptions {
789            keep_player_names: true,
790        };
791        let input = r#"Token: secret123 and "screenName": "Alice#123""#;
792        let result = scrub_raw_log_with(input, &opts);
793        assert!(result.contains("Token: <redacted>"));
794        assert!(!result.contains("secret123"));
795        assert!(result.contains("Alice#123"));
796    }
797
798    #[test]
799    fn test_scrub_raw_log_with_keep_player_names_true_still_redacts_session_ids() {
800        let opts = ScrubOptions {
801            keep_player_names: true,
802        };
803        let input = r#"{"sessionId": "sess_xyz789", "screenName": "Alice#123"}"#;
804        let result = scrub_raw_log_with(input, &opts);
805        assert!(result.contains(r#""sessionId": "<redacted>""#));
806        assert!(!result.contains("sess_xyz789"));
807        assert!(result.contains("Alice#123"));
808    }
809
810    #[test]
811    fn test_scrub_raw_log_with_keep_player_names_true_still_redacts_paths() {
812        let opts = ScrubOptions {
813            keep_player_names: true,
814        };
815        let input = r#""playerName": "Alice#123" at /home/alice/.config/app"#;
816        let result = scrub_raw_log_with(input, &opts);
817        assert!(result.contains("Alice#123"));
818        assert!(!result.contains("/home/alice/"));
819        assert!(result.contains("<user-path>/"));
820    }
821
822    #[test]
823    fn test_scrub_raw_log_with_keep_player_names_true_still_redacts_client_id() {
824        let opts = ScrubOptions {
825            keep_player_names: true,
826        };
827        let input = r#"{"clientId": "CR4QJUQP", "screenName": "Alice#123"}"#;
828        let result = scrub_raw_log_with(input, &opts);
829        assert!(result.contains(r#""clientId": "<redacted>""#));
830        assert!(!result.contains("CR4QJUQP"));
831        assert!(result.contains("Alice#123"));
832    }
833
834    #[test]
835    fn test_scrub_raw_log_with_keep_player_names_true_still_redacts_hardware_fingerprints() {
836        let opts = ScrubOptions {
837            keep_player_names: true,
838        };
839        let input = "\"playerName\": \"Alice#123\"\n  Renderer: NVIDIA GeForce RTX 3080";
840        let result = scrub_raw_log_with(input, &opts);
841        assert!(result.contains("Alice#123"));
842        assert!(!result.contains("NVIDIA GeForce RTX 3080"));
843        assert!(result.contains("Renderer: <redacted>"));
844    }
845
846    #[test]
847    fn test_scrub_raw_log_with_default_opts_equals_scrub_raw_log() {
848        // scrub_raw_log_with(.., &ScrubOptions::default()) must produce
849        // identical output to scrub_raw_log(..) for the same input.
850        let inputs = [
851            r#""screenName": "Alice#123", Token: secret"#,
852            "Token: abc Bearer tok123",
853            r#"{"sessionId": "s1", "playerName": "Bob#99"}"#,
854            "[UnityCrossThreadLogger] Game started",
855            "",
856        ];
857        for input in &inputs {
858            assert_eq!(
859                scrub_raw_log(input),
860                scrub_raw_log_with(input, &ScrubOptions::default()),
861                "scrub_raw_log and scrub_raw_log_with(default) differ for input: {input:?}"
862            );
863        }
864    }
865
866    // --- Email redaction ---
867
868    #[test]
869    fn test_scrub_raw_log_email_address_redacted() {
870        let input = "Contact: user@example.com for support";
871        let result = scrub_raw_log(input);
872        assert!(!result.contains("user@example.com"));
873        assert!(result.contains("<email-redacted>"));
874    }
875
876    #[test]
877    fn test_scrub_raw_log_email_in_json_value_redacted() {
878        let input = r#"{"email": "player.one+mtga@arena.wizards.com"}"#;
879        let result = scrub_raw_log(input);
880        assert!(!result.contains("player.one+mtga@arena.wizards.com"));
881        assert!(result.contains("<email-redacted>"));
882    }
883
884    #[test]
885    fn test_scrub_raw_log_multiple_emails_on_same_line_redacted() {
886        let input = "From: alice@example.com To: bob@example.org";
887        let result = scrub_raw_log(input);
888        assert!(!result.contains("alice@example.com"));
889        assert!(!result.contains("bob@example.org"));
890        assert_eq!(result.matches("<email-redacted>").count(), 2);
891    }
892
893    // --- IPv4 redaction ---
894
895    #[test]
896    fn test_scrub_raw_log_ipv4_address_redacted() {
897        let input = "Server address: 192.168.1.100 port 443";
898        let result = scrub_raw_log(input);
899        assert!(!result.contains("192.168.1.100"));
900        assert!(result.contains("<ip-redacted>"));
901    }
902
903    #[test]
904    fn test_scrub_raw_log_ipv4_loopback_redacted() {
905        let input = "Connecting to 127.0.0.1:8080";
906        let result = scrub_raw_log(input);
907        assert!(!result.contains("127.0.0.1"));
908        assert!(result.contains("<ip-redacted>"));
909    }
910
911    #[test]
912    fn test_scrub_raw_log_ipv4_public_address_redacted() {
913        let input = "WotC endpoint: 52.23.1.200";
914        let result = scrub_raw_log(input);
915        assert!(!result.contains("52.23.1.200"));
916        assert!(result.contains("<ip-redacted>"));
917    }
918
919    #[test]
920    fn test_scrub_raw_log_version_string_redacted_as_ipv4_deliberate_tradeoff() {
921        // A 4-segment version string is syntactically indistinguishable from
922        // an IPv4 address without semantic context. The regex crate (DFA-based)
923        // provides no lookbehind to exclude version-line context, so version
924        // strings like "1.2.3.4" are redacted. This is an acceptable
925        // defense-in-depth tradeoff (AC-PRIV-8).
926        let input = "Version: 1.2.3.4";
927        let result = scrub_raw_log(input);
928        assert!(!result.contains("1.2.3.4"));
929        assert!(result.contains("<ip-redacted>"));
930    }
931
932    // --- IPv6 redaction ---
933
934    #[test]
935    fn test_scrub_raw_log_ipv6_loopback_redacted() {
936        let input = "Listening on ::1 port 3000";
937        let result = scrub_raw_log(input);
938        assert!(!result.contains("::1"));
939        assert!(result.contains("<ip-redacted>"));
940    }
941
942    #[test]
943    fn test_scrub_raw_log_ipv6_link_local_redacted() {
944        let input = "Interface address: fe80::1%eth0";
945        let result = scrub_raw_log(input);
946        assert!(!result.contains("fe80::1"));
947        assert!(result.contains("<ip-redacted>"));
948    }
949
950    #[test]
951    fn test_scrub_raw_log_ipv6_full_address_redacted() {
952        let input = "IPv6: 2001:0db8:85a3:0000:0000:8a2e:0370:7334";
953        let result = scrub_raw_log(input);
954        assert!(!result.contains("2001:0db8:85a3:0000:0000:8a2e:0370:7334"));
955        assert!(result.contains("<ip-redacted>"));
956    }
957
958    #[test]
959    fn test_scrub_raw_log_ipv6_compressed_redacted() {
960        let input = "Remote: 2001:db8::1";
961        let result = scrub_raw_log(input);
962        assert!(!result.contains("2001:db8::1"));
963        assert!(result.contains("<ip-redacted>"));
964    }
965
966    // --- Corpus validation (env-gated, not run in CI) ---
967
968    /// Run `scrub_raw_log` against every `.log` file in the corpus directory
969    /// and verify that none of the PII patterns survive scrubbing.
970    ///
971    /// Skipped unless `SCRUBBER_CORPUS_DIR` is set:
972    /// ```sh
973    /// SCRUBBER_CORPUS_DIR=/tmp/smoke-corpus cargo test corpus_scrub -- --nocapture
974    /// ```
975    #[test]
976    fn test_corpus_scrub_no_pii_survives() {
977        let Ok(dir) = std::env::var("SCRUBBER_CORPUS_DIR") else {
978            return;
979        };
980        let corpus_dir = std::path::PathBuf::from(dir);
981
982        let pii_patterns: Vec<(&str, Regex)> = vec![
983            (
984                "screenName",
985                Regex::new(r#""[Ss]creen[Nn]ame"\s*:\s*"([^"]+)""#)
986                    .unwrap_or_else(|_| unreachable!()),
987            ),
988            (
989                "playerName",
990                Regex::new(r#""[Pp]layer[Nn]ame"\s*:\s*"([^"]+)""#)
991                    .unwrap_or_else(|_| unreachable!()),
992            ),
993            (
994                "Renderer",
995                Regex::new(r"(?m)^\s+Renderer:\s+(.+)").unwrap_or_else(|_| unreachable!()),
996            ),
997            (
998                "Vendor",
999                Regex::new(r"(?m)^\s+Vendor:\s+(.+)").unwrap_or_else(|_| unreachable!()),
1000            ),
1001            (
1002                "VRAM",
1003                Regex::new(r"(?m)^\s+VRAM:\s+(.+)").unwrap_or_else(|_| unreachable!()),
1004            ),
1005            (
1006                "Driver",
1007                Regex::new(r"(?m)^\s+Driver:\s+(.+)").unwrap_or_else(|_| unreachable!()),
1008            ),
1009        ];
1010
1011        let mut total_before = 0u32;
1012        let mut failures: Vec<String> = Vec::new();
1013
1014        let entries: Vec<_> = std::fs::read_dir(&corpus_dir)
1015            .unwrap_or_else(|_| unreachable!())
1016            .filter_map(Result::ok)
1017            .filter(|e| e.path().extension().is_some_and(|ext| ext == "log"))
1018            .collect();
1019
1020        for entry in &entries {
1021            let path = entry.path();
1022            let filename = path
1023                .file_name()
1024                .unwrap_or_else(|| unreachable!())
1025                .to_string_lossy();
1026            let Ok(raw) = std::fs::read_to_string(&path) else {
1027                continue;
1028            };
1029
1030            let scrubbed = scrub_raw_log(&raw);
1031
1032            for (name, re) in &pii_patterns {
1033                let before = u32::try_from(re.find_iter(&raw).count()).unwrap_or(u32::MAX);
1034                total_before += before;
1035
1036                let leaked: Vec<String> = re
1037                    .captures_iter(&scrubbed)
1038                    .filter_map(|cap| {
1039                        let val = cap.get(1).map_or("", |m| m.as_str());
1040                        if val == "<redacted>" {
1041                            None
1042                        } else {
1043                            Some(val.to_owned())
1044                        }
1045                    })
1046                    .collect();
1047
1048                for val in &leaked {
1049                    failures.push(format!("{filename}: {name} leaked: {val:?}"));
1050                }
1051            }
1052        }
1053
1054        assert!(
1055            total_before > 0,
1056            "corpus should contain at least one PII match to be a meaningful test"
1057        );
1058        assert!(
1059            failures.is_empty(),
1060            "PII survived scrubbing in {} location(s) (of {total_before} raw matches):\n{}",
1061            failures.len(),
1062            failures.join("\n")
1063        );
1064    }
1065}