// sqz_engine/verifier.rs

//! Two-pass compression verifier.
//!
//! After compression, the verifier checks that critical information was
//! preserved. If confidence is below the threshold, it signals the caller
//! to fall back to a safer (less aggressive) compression mode.
//!
//! Checks performed:
//! 1. Required JSON keys present (if original was JSON)
//! 2. Numeric fields unchanged (no value corruption)
//! 3. Error/warning lines retained (critical signal preservation)
//! 4. Diff hunk headers present (if input was a git diff)
//! 5. File paths preserved (no path truncation)
//! 6. Minimum content retention (output not too short vs input)
//! 7. Identifier/path/URL preservation — deterministic token scan covering
//!    filesystem paths, URLs, backtick-quoted code identifiers, environment
//!    variable names, and version numbers. Added after the sessions that
//!    produced the `packages → pkgs` / `configuration/` / `repository/` bug
//!    class — the idea (post-compression preservation check) was prompted
//!    by caveman-compress's validate.py, but the scan mechanism, inputs,
//!    and integration point are sqz-specific.

use crate::types::VerifyResult;

/// Confidence threshold below which fallback is triggered.
/// Confidence is the fraction of checks passed (see `Verifier::verify`),
/// so 0.6 means "fewer than ~4 of the 7 checks passed" triggers fallback.
const FALLBACK_THRESHOLD: f64 = 0.6;

/// Stateless verifier: carries no data, and both methods are associated
/// functions, so no instance is ever constructed.
pub struct Verifier;

impl Verifier {
    /// Run all invariant checks on `original` → `compressed`.
    ///
    /// Returns a `VerifyResult` with confidence score and check details.
    /// If `result.confidence < FALLBACK_THRESHOLD`, the caller should
    /// re-compress with a safer preset.
    ///
    /// All retention checks below are substring checks against `compressed`
    /// (`str::contains`), so a token that survives anywhere in the output —
    /// even relocated — counts as preserved.
    pub fn verify(original: &str, compressed: &str) -> VerifyResult {
        let mut passed = Vec::new();
        let mut failed = Vec::new();

        // Check 1: Minimum content retention (output must be ≥ 10% of input length)
        // NOTE: ratio of byte lengths, not tokens; empty input trivially passes.
        let retention = if original.is_empty() {
            1.0
        } else {
            compressed.len() as f64 / original.len() as f64
        };
        if retention >= 0.10 {
            passed.push("min_retention".to_string());
        } else {
            failed.push((
                "min_retention".to_string(),
                format!("output is only {:.1}% of input length", retention * 100.0),
            ));
        }

        // Check 2: Error/warning lines retained
        // Case-insensitive detection of error-ish markers; each detected line
        // must appear (trimmed) somewhere in the compressed output.
        let error_lines: Vec<&str> = original
            .lines()
            .filter(|l| {
                let lower = l.to_lowercase();
                lower.contains("error:") || lower.contains("warning:") || lower.contains("fatal:")
                    || lower.contains("panic:") || lower.contains("exception:")
            })
            .collect();
        if error_lines.is_empty() {
            passed.push("error_lines".to_string());
        } else {
            let missing: Vec<&str> = error_lines
                .iter()
                .filter(|&&line| !compressed.contains(line.trim()))
                .copied()
                .collect();
            if missing.is_empty() {
                passed.push("error_lines".to_string());
            } else {
                failed.push((
                    "error_lines".to_string(),
                    format!("{} error/warning line(s) missing from output", missing.len()),
                ));
            }
        }

        // Check 3: File paths preserved (lines containing / or \ with extension)
        let path_lines: Vec<&str> = original
            .lines()
            .filter(|l| {
                (l.contains('/') || l.contains('\\'))
                    && l.chars().any(|c| c == '.')
                    && l.len() < 200 // skip very long lines
            })
            .take(20) // only check first 20 path-like lines
            .collect();
        if path_lines.is_empty() {
            passed.push("file_paths".to_string());
        } else {
            let missing_paths = path_lines
                .iter()
                .filter(|&&line| {
                    // Extract the path-like token and check it's in the output
                    let token = line.split_whitespace()
                        .find(|t| t.contains('/') || t.contains('\\'))
                        .unwrap_or("");
                    !token.is_empty() && !compressed.contains(token)
                })
                .count();
            if missing_paths == 0 {
                passed.push("file_paths".to_string());
            } else {
                failed.push((
                    "file_paths".to_string(),
                    format!("{missing_paths} file path(s) missing from output"),
                ));
            }
        }

        // Check 4: JSON key preservation (if original is JSON)
        // We check that the compressed output contains at least 50% of the
        // original top-level keys. Intentionally stripped keys are expected to be absent.
        let orig_trimmed = original.trim();
        if orig_trimmed.starts_with('{') || orig_trimmed.starts_with('[') {
            if let Ok(orig_val) = serde_json::from_str::<serde_json::Value>(orig_trimmed) {
                // Top-level arrays yield no keys and pass trivially (see
                // `collect_top_level_keys`).
                let orig_keys = collect_top_level_keys(&orig_val);
                if orig_keys.is_empty() {
                    passed.push("json_keys".to_string());
                } else {
                    let present: usize = orig_keys
                        .iter()
                        .filter(|&&k| compressed.contains(k))
                        .count();
                    let retention_ratio = present as f64 / orig_keys.len() as f64;
                    // Pass if at least 50% of original keys are present
                    if retention_ratio >= 0.5 {
                        passed.push("json_keys".to_string());
                    } else {
                        let missing: Vec<&str> = orig_keys
                            .iter()
                            .filter(|&&k| !compressed.contains(k))
                            .copied()
                            .collect();
                        // Report at most 5 missing keys to keep the message bounded.
                        failed.push((
                            "json_keys".to_string(),
                            format!("only {:.0}% of JSON keys retained; missing: {:?}",
                                retention_ratio * 100.0,
                                &missing[..missing.len().min(5)]),
                        ));
                    }
                }
            } else {
                passed.push("json_keys".to_string()); // not valid JSON, skip
            }
        } else {
            passed.push("json_keys".to_string()); // not JSON, skip
        }

        // Check 5: Diff hunk headers preserved (if input is a git diff)
        let hunk_headers: Vec<&str> = original
            .lines()
            .filter(|l| l.starts_with("@@"))
            .collect();
        if hunk_headers.is_empty() {
            passed.push("diff_hunks".to_string());
        } else {
            let missing_hunks = hunk_headers
                .iter()
                .filter(|&&h| !compressed.contains(h))
                .count();
            if missing_hunks == 0 {
                passed.push("diff_hunks".to_string());
            } else {
                failed.push((
                    "diff_hunks".to_string(),
                    format!("{missing_hunks} diff hunk header(s) missing"),
                ));
            }
        }

        // Check 6: Numeric values preserved (spot-check first 10 numbers)
        // Tokens of length ≥ 2 that parse as f64; single digits are too noisy
        // to be meaningful. Multi-dot strings like "1.2.3" fail the parse and
        // are handled by the preservation scan instead.
        let numbers: Vec<&str> = original
            .split(|c: char| !c.is_ascii_digit() && c != '.' && c != '-')
            .filter(|s| !s.is_empty() && s.len() >= 2 && s.parse::<f64>().is_ok())
            .take(10)
            .collect();
        if numbers.is_empty() {
            passed.push("numeric_values".to_string());
        } else {
            let missing_nums = numbers
                .iter()
                .filter(|&&n| !compressed.contains(n))
                .count();
            if missing_nums == 0 {
                passed.push("numeric_values".to_string());
            } else {
                failed.push((
                    "numeric_values".to_string(),
                    format!("{missing_nums} numeric value(s) missing from output"),
                ));
            }
        }

        // Check 7: Preservation tokens — identifier-shaped substrings the
        // model may dereference (paths, URLs, backticked code, env vars,
        // version numbers). See the session fixes for `packages/`,
        // `configuration/`, `repository/` — this check catches that bug
        // class deterministically rather than waiting for user reports.
        //
        // We require at least 85% of preservation tokens to survive. Lower
        // than 100% because: (1) dedup may have collapsed a repeated path
        // into a `§ref:…§` marker intentionally, and (2) the scanner is
        // heuristic and may flag a token that the pipeline legitimately
        // rewrote (e.g. long base64 truncated by entropy_truncate).
        let preservation_tokens = extract_preservation_tokens(original);
        if preservation_tokens.is_empty() {
            passed.push("preservation".to_string());
        } else {
            let present = preservation_tokens
                .iter()
                .filter(|t| compressed.contains(t.as_str()))
                .count();
            let total = preservation_tokens.len();
            let ratio = present as f64 / total as f64;
            if ratio >= 0.85 {
                passed.push("preservation".to_string());
            } else {
                let missing: Vec<&str> = preservation_tokens
                    .iter()
                    .filter(|t| !compressed.contains(t.as_str()))
                    .take(5)
                    .map(|t| t.as_str())
                    .collect();
                failed.push((
                    "preservation".to_string(),
                    format!(
                        "only {}/{} preservation tokens retained ({:.0}%); missing: {:?}",
                        present, total, ratio * 100.0, missing,
                    ),
                ));
            }
        }

        // Compute confidence: ratio of passed checks to total checks.
        //
        // Special case: preservation is a "sentinel" check. If it fails, the
        // LLM will likely try to dereference a token (filename, URL, identifier)
        // that no longer exists in the compressed output, which causes
        // cascading failures in agent sessions (the exact bug class that
        // produced fixes fd4603d, f6dc86c, b8bd0d7). Cap confidence at 0.5
        // when preservation fails so that the fallback kicks in even when
        // the other 6 checks pass.
        let total = passed.len() + failed.len();
        let mut confidence = if total == 0 {
            1.0
        } else {
            passed.len() as f64 / total as f64
        };
        let preservation_failed = failed.iter().any(|(k, _)| k == "preservation");
        if preservation_failed {
            // 0.5 < FALLBACK_THRESHOLD (0.6), so this alone forces fallback.
            confidence = confidence.min(0.5);
        }

        let fallback_triggered = confidence < FALLBACK_THRESHOLD;

        VerifyResult {
            passed: failed.is_empty(),
            confidence,
            checks_passed: passed,
            checks_failed: failed,
            fallback_triggered,
        }
    }

    /// Check if a verify result warrants fallback to safer compression.
    pub fn should_fallback(result: &VerifyResult) -> bool {
        result.fallback_triggered
    }
}

275fn collect_top_level_keys(value: &serde_json::Value) -> Vec<&str> {
276    match value {
277        serde_json::Value::Object(map) => map.keys().map(|k| k.as_str()).collect(),
278        _ => vec![],
279    }
280}
281
// ---------------------------------------------------------------------------
// Preservation-token extractor
// ---------------------------------------------------------------------------
//
// Pulls out identifier-shaped substrings from input that the LLM may try to
// dereference. The verifier requires these tokens to appear somewhere in the
// compressed output — if too many go missing, the compression is rejected
// and the caller falls back to Safe mode.
//
// Design notes:
// - Byte-level ASCII scan. No regex crate dependency; avoids ReDoS.
// - Conservative by design: false positives (over-preservation) cause at
//   worst a missed compression opportunity, whereas false negatives cause
//   silent data loss. We err toward flagging more.
// - Deduplicates tokens (each unique token counted once).
// - Caps the scan at 1 MB of input to bound worst-case runtime. Larger
//   inputs are truncated for scanning only; the actual compression still
//   processes the full input.
/// Hard cap on how many input bytes the preservation scan examines (1 MB).
const MAX_SCAN_BYTES: usize = 1024 * 1024;
/// Upper bound on distinct preservation tokens collected per scan.
const MAX_TOKENS: usize = 500;

304/// Scan input for preservation tokens: filesystem paths, URLs, backtick-quoted
305/// code identifiers, environment variable names, and version numbers.
306fn extract_preservation_tokens(input: &str) -> Vec<String> {
307    let scan = &input[..input.len().min(MAX_SCAN_BYTES)];
308    let bytes = scan.as_bytes();
309    let mut tokens: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
310
311    let mut i = 0;
312    while i < bytes.len() && tokens.len() < MAX_TOKENS {
313        let b = bytes[i];
314
315        // Backtick-quoted identifier: `foo_bar`, `Type::method`, `api::v1`
316        if b == b'`' {
317            if let Some(end) = find_closing(bytes, i + 1, b'`') {
318                let slice = &scan[i + 1..end];
319                // Must look like an identifier (not prose): contain at least
320                // one non-space char and no spaces unless it's path-like.
321                if !slice.is_empty()
322                    && slice.len() <= 200
323                    && is_identifier_or_path_content(slice)
324                {
325                    tokens.insert(slice.to_string());
326                }
327                i = end + 1;
328                continue;
329            }
330        }
331
332        // Environment variable: $HOME, $PATH, ${FOO}
333        if b == b'$' && i + 1 < bytes.len() {
334            let next = bytes[i + 1];
335            if next == b'{' {
336                if let Some(end) = find_closing(bytes, i + 2, b'}') {
337                    let slice = &scan[i + 2..end];
338                    if is_env_name(slice) {
339                        tokens.insert(format!("${{{}}}", slice));
340                    }
341                    i = end + 1;
342                    continue;
343                }
344            } else if next.is_ascii_uppercase() || next == b'_' {
345                // Bare $VARNAME — consume uppercase/underscore/digit run
346                let start = i + 1;
347                let mut j = start;
348                while j < bytes.len()
349                    && (bytes[j].is_ascii_uppercase()
350                        || bytes[j] == b'_'
351                        || bytes[j].is_ascii_digit())
352                {
353                    j += 1;
354                }
355                if j > start {
356                    tokens.insert(format!("${}", &scan[start..j]));
357                    i = j;
358                    continue;
359                }
360            }
361        }
362
363        // URL: detect common protocol prefixes
364        if is_url_start(bytes, i) {
365            let end = scan_url_end(bytes, i);
366            if end > i + 8 {
367                // min "http://x"
368                tokens.insert(scan[i..end].to_string());
369                i = end;
370                continue;
371            }
372        }
373
374        // Path or path-like token. Must have at least one '/' with alphanum
375        // on both sides. Starts with '/', '.', or alphanum; ends at whitespace
376        // or unambiguous terminator.
377        if is_path_start(bytes, i) {
378            let end = scan_path_end(bytes, i);
379            if end > i {
380                let slice = &scan[i..end];
381                // Require at least one '/' for path-ness.
382                if slice.contains('/') && is_plausible_path(slice) {
383                    tokens.insert(slice.to_string());
384                    i = end;
385                    continue;
386                }
387            }
388        }
389
390        // Version number: digit.digit(.digit)+, optional 'v' prefix, optional suffix
391        if b.is_ascii_digit() || (b == b'v' && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit()) {
392            let end = scan_version_end(bytes, i);
393            if end > i {
394                let slice = &scan[i..end];
395                if is_version(slice) {
396                    tokens.insert(slice.to_string());
397                    i = end;
398                    continue;
399                }
400            }
401        }
402
403        i += 1;
404    }
405
406    tokens.into_iter().collect()
407}
408
/// Find the next occurrence of `target` at or after `start`, looking at most
/// 256 bytes ahead — anything farther is almost certainly not a single token.
///
/// Returns `None` when `target` is absent in the window. Iterating a range
/// (instead of slicing `bytes[start..end]`) also makes `start` past the end
/// of `bytes` return `None`; the old slice form panicked there because
/// `end = (start + 256).min(len)` could be smaller than `start`.
fn find_closing(bytes: &[u8], start: usize, target: u8) -> Option<usize> {
    let limit = bytes.len().min(start + 256);
    (start..limit).find(|&idx| bytes[idx] == target)
}

/// Decide whether backtick-quoted text looks like a preservable identifier
/// rather than prose.
///
/// Up to three spaces are tolerated (e.g. `cargo test --release` is a
/// command worth keeping intact); more reads as a quoted sentence. The text
/// must also contain at least one identifier-ish byte (alphanumeric, `_`,
/// `.`, `/`, or `:`) to be worth preserving.
fn is_identifier_or_path_content(s: &str) -> bool {
    let spaces = s.bytes().filter(|&b| b == b' ').count();
    if spaces > 3 {
        return false;
    }
    s.bytes()
        .any(|b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'.' | b'/' | b':'))
}

/// True when `s` is shaped like an environment-variable name: 1–64 bytes of
/// uppercase ASCII letters, digits, and underscores only.
fn is_env_name(s: &str) -> bool {
    if s.is_empty() || s.len() > 64 {
        return false;
    }
    s.bytes()
        .all(|b| b.is_ascii_uppercase() || b.is_ascii_digit() || b == b'_')
}

/// True when the bytes at position `i` begin with a recognized URL-ish
/// protocol prefix (including the `git@host` SSH shorthand).
fn is_url_start(bytes: &[u8], i: usize) -> bool {
    const PREFIXES: &[&[u8]] = &[
        b"https://", b"http://", b"git://", b"ssh://", b"ftp://",
        b"file://", b"git@", b"ws://", b"wss://",
    ];
    let rest = &bytes[i..];
    PREFIXES.iter().any(|prefix| rest.starts_with(prefix))
}

/// Scan forward from `start` to the end of a URL token.
///
/// The URL runs until whitespace, a quote, an angle bracket, or a backtick,
/// capped at 2 KB. Trailing sentence punctuation (`.`, `,`, `;`, `:`, `)`,
/// `]`, `!`, `?`) is then trimmed off, since it usually belongs to the
/// surrounding prose rather than the URL itself.
fn scan_url_end(bytes: &[u8], start: usize) -> usize {
    let cap = bytes.len().min(start + 2048);
    let mut end = start;
    while end < cap
        && !matches!(
            bytes[end],
            b' ' | b'\t' | b'\n' | b'\r' | b'"' | b'\'' | b'<' | b'>' | b'`'
        )
    {
        end += 1;
    }
    while end > start
        && matches!(
            bytes[end - 1],
            b'.' | b',' | b';' | b':' | b')' | b']' | b'!' | b'?'
        )
    {
        end -= 1;
    }
    end
}

/// True when position `i` is a plausible first byte of a path token.
///
/// A path may begin with '/', '.', '_', '-', or an alphanumeric byte — but
/// never immediately after an alphanumeric byte, which would split an
/// identifier in half.
fn is_path_start(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        None => false,
        Some(&b) => {
            let mid_word = i > 0 && bytes[i - 1].is_ascii_alphanumeric();
            !mid_word
                && (b.is_ascii_alphanumeric() || matches!(b, b'/' | b'.' | b'_' | b'-'))
        }
    }
}

/// Scan forward from `start` past the run of path bytes (alphanumeric, '_',
/// '-', '.', '/'), capped at 512 bytes, then trim trailing '.' bytes that
/// are probably sentence punctuation rather than part of the path.
fn scan_path_end(bytes: &[u8], start: usize) -> usize {
    let is_path_byte =
        |b: u8| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.' | b'/');
    let cap = bytes.len().min(start + 512);
    let mut end = start;
    while end < cap && is_path_byte(bytes[end]) {
        end += 1;
    }
    while end > start && bytes[end - 1] == b'.' {
        end -= 1;
    }
    end
}

/// Heuristic filter for path-shaped tokens.
///
/// Requires a '/' somewhere, rejects the bare "/" (nothing on either side),
/// and requires at least one alphabetic byte so numeric fractions like
/// "3/4" are not mistaken for paths.
fn is_plausible_path(s: &str) -> bool {
    let slash = match s.find('/') {
        Some(pos) => pos,
        None => return false,
    };
    // "/" alone: no content before or after the slash.
    if slash == 0 && s.len() == 1 {
        return false;
    }
    s.bytes().any(|b| b.is_ascii_alphabetic())
}

/// Scan forward from `start` past the run of version-ish bytes
/// (alphanumeric, '.', '-'), capped at 64 bytes. The resulting slice is
/// validated separately by `is_version`.
fn scan_version_end(bytes: &[u8], start: usize) -> usize {
    let cap = bytes.len().min(start + 64);
    let mut end = start;
    while end < cap && (bytes[end].is_ascii_alphanumeric() || matches!(bytes[end], b'.' | b'-')) {
        end += 1;
    }
    end
}

/// Semver-ish test: optional leading 'v', at least two '.' separators, and
/// the first two segments must start with digits. Pre-release suffixes such
/// as "-beta.1" are tolerated because only segment starts are inspected.
fn is_version(s: &str) -> bool {
    let body = s.strip_prefix('v').unwrap_or(s);
    if body.bytes().filter(|&b| b == b'.').count() < 2 {
        return false;
    }
    // Leading run of ASCII digits = the major segment; it must be non-empty.
    let major_len = body.bytes().take_while(|b| b.is_ascii_digit()).count();
    if major_len == 0 {
        return false;
    }
    // The byte just past the separator must start the next segment with a
    // digit. Byte-indexed `get` replaces the old `&trimmed[major_len + 1..]`
    // slice, which panicked when that offset fell inside a multi-byte
    // character (e.g. "1é.2.3"); for ASCII input the behavior is identical.
    body.as_bytes()
        .get(major_len + 1)
        .map_or(false, |b| b.is_ascii_digit())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

558#[cfg(test)]
559mod tests {
560    use super::*;
561
562    #[test]
563    fn verify_identical_passes_all() {
564        let text = "error: something went wrong\nfile: src/main.rs\n";
565        let result = Verifier::verify(text, text);
566        assert!(result.passed);
567        assert!((result.confidence - 1.0).abs() < f64::EPSILON);
568        assert!(!result.fallback_triggered);
569    }
570
571    #[test]
572    fn verify_empty_input_passes() {
573        let result = Verifier::verify("", "");
574        assert!(result.passed);
575    }
576
577    #[test]
578    fn verify_detects_missing_error_line() {
579        let original = "error: connection refused\nsome other content here\n";
580        let compressed = "some other content here\n"; // error line stripped
581        let result = Verifier::verify(original, compressed);
582        assert!(!result.passed);
583        assert!(result.checks_failed.iter().any(|(k, _)| k == "error_lines"));
584    }
585
586    #[test]
587    fn verify_detects_over_compression() {
588        // Use content with multiple checkable markers so more checks fail
589        let original = "error: critical failure at line 42\n@@ -1,5 +1,5 @@\n/path/to/file.rs\nvalue: 12345\n".repeat(20);
590        let compressed = "x"; // almost nothing retained
591        let result = Verifier::verify(&original, compressed);
592        assert!(!result.passed);
593        assert!(result.checks_failed.iter().any(|(k, _)| k == "min_retention"));
594        assert!(result.fallback_triggered, "should trigger fallback: confidence={:.2}", result.confidence);
595    }
596
597    #[test]
598    fn verify_json_keys_preserved() {
599        let original = r#"{"id":1,"name":"Alice","status":"active"}"#;
600        let compressed = r#"TOON:{id:1,name:"Alice",status:"active"}"#;
601        let result = Verifier::verify(original, compressed);
602        assert!(result.checks_passed.contains(&"json_keys".to_string()));
603    }
604
605    #[test]
606    fn verify_detects_missing_json_keys() {
607        let original = r#"{"id":1,"name":"Alice","status":"active","role":"admin","email":"a@b.com","created":"2024-01-01"}"#;
608        let compressed = r#"TOON:{id:1}"#; // only 1 of 6 keys retained (17%)
609        let result = Verifier::verify(original, compressed);
610        assert!(result.checks_failed.iter().any(|(k, _)| k == "json_keys"),
611            "should fail json_keys when <50% of keys retained");
612    }
613
614    #[test]
615    fn verify_diff_hunks_preserved() {
616        let original = "@@ -1,5 +1,5 @@\n-old\n+new\n context\n";
617        let compressed = "@@ -1,5 +1,5 @@\n-old\n+new\n";
618        let result = Verifier::verify(original, compressed);
619        assert!(result.checks_passed.contains(&"diff_hunks".to_string()));
620    }
621
622    #[test]
623    fn verify_detects_missing_diff_hunks() {
624        let original = "@@ -1,5 +1,5 @@\n-old\n+new\n";
625        let compressed = "-old\n+new\n"; // hunk header stripped
626        let result = Verifier::verify(original, compressed);
627        assert!(result.checks_failed.iter().any(|(k, _)| k == "diff_hunks"));
628    }
629
630    #[test]
631    fn fallback_threshold_triggers_correctly() {
632        // Create a result that fails most checks
633        let original = "error: critical failure\n@@ -1,5 +1,5 @@\n/path/to/file.rs:42\n";
634        let compressed = "x"; // almost nothing retained
635        let result = Verifier::verify(original, compressed);
636        assert!(result.fallback_triggered, "should trigger fallback on low confidence");
637    }
638
639    // ── Real-world coding session patterns ────────────────────────────────
640
641    #[test]
642    fn verify_cargo_test_output_preserved() {
643        let original = "running 47 tests\ntest engine::tests::test_compress ... ok\ntest pipeline::tests::compress_json ... ok\ntest result: ok. 47 passed; 0 failed; 0 ignored; finished in 2.34s\n";
644        let compressed = "47 tests\ntest result: ok. 47 passed; 0 failed; finished in 2.34s\n";
645        let result = Verifier::verify(original, compressed);
646        // Should pass — key info retained, no error lines, no JSON
647        assert!(result.confidence >= 0.7, "cargo test output should verify well: {:.2}", result.confidence);
648    }
649
650    #[test]
651    fn verify_rust_compile_error_preserved() {
652        let original = "error[E0308]: mismatched types\n --> src/main.rs:42:5\n  |\n42 |     let x: i32 = \"hello\";\n  |                  ^^^^^^^ expected `i32`, found `&str`\n\nerror: aborting due to previous error\n";
653        let compressed = "error[E0308]: mismatched types\n --> src/main.rs:42:5\nerror: aborting due to previous error\n";
654        let result = Verifier::verify(original, compressed);
655        // Error lines must be retained
656        assert!(result.checks_passed.contains(&"error_lines".to_string()),
657            "error lines should be preserved");
658    }
659
660    #[test]
661    fn verify_git_log_output() {
662        let original = "commit a1b2c3d4\nAuthor: Ojus Chugh <ojuschugh@gmail.com>\nDate:   Sun Apr 12 10:00:00 2026\n\n    feat: Add compression engine\n\ncommit b2c3d4e5\nAuthor: Ojus Chugh <ojuschugh@gmail.com>\nDate:   Sat Apr 11 15:30:00 2026\n\n    fix: Handle edge case\n";
663        let compressed = "commit a1b2c3d4\n    feat: Add compression engine\ncommit b2c3d4e5\n    fix: Handle edge case\n";
664        let result = Verifier::verify(original, compressed);
665        assert!(result.confidence >= 0.7, "git log should verify well: {:.2}", result.confidence);
666    }
667
668    #[test]
669    fn verify_json_api_with_stripped_nulls() {
670        // Simulates what the pipeline does: strip null fields, TOON encode
671        let original = r#"{"id":1,"name":"Alice","debug_info":null,"trace_id":null,"status":"active"}"#;
672        let compressed = r#"TOON:{id:1,name:"Alice",status:"active"}"#;
673        let result = Verifier::verify(original, compressed);
674        // 3 of 5 keys retained (60%) — should pass the 50% threshold
675        assert!(result.checks_passed.contains(&"json_keys".to_string()),
676            "60% key retention should pass: {:?}", result.checks_failed);
677    }
678
679    // ── Preservation-token extractor tests ──────────────────────────────
680
681    #[test]
682    fn extract_detects_absolute_paths() {
683        let tokens = extract_preservation_tokens("see /etc/myapp/config.yml for details");
684        assert!(tokens.contains(&"/etc/myapp/config.yml".to_string()),
685            "absolute path should be extracted: {:?}", tokens);
686    }
687
688    #[test]
689    fn extract_detects_relative_paths() {
690        let tokens = extract_preservation_tokens("edit src/main.rs and tests/util.rs");
691        assert!(tokens.contains(&"src/main.rs".to_string()), "{:?}", tokens);
692        assert!(tokens.contains(&"tests/util.rs".to_string()), "{:?}", tokens);
693    }
694
695    #[test]
696    fn extract_detects_directory_listing_entries() {
697        // The Reddit case: `packages/` directory listing
698        let input = "drwxr-xr-x  user staff  192 Apr 18 packages/\n\
699                     drwxr-xr-x  user staff   96 Apr 18 configuration/\n";
700        let tokens = extract_preservation_tokens(input);
701        // These should appear as path-like tokens (they have `/`)
702        assert!(tokens.iter().any(|t| t.contains("packages")),
703            "should extract packages: {:?}", tokens);
704        assert!(tokens.iter().any(|t| t.contains("configuration")),
705            "should extract configuration: {:?}", tokens);
706    }
707
708    #[test]
709    fn extract_detects_urls() {
710        let input = "clone from https://github.com/example/repository and \
711                     read https://docs.example.com/guide.";
712        let tokens = extract_preservation_tokens(input);
713        assert!(tokens.contains(&"https://github.com/example/repository".to_string()),
714            "{:?}", tokens);
715        assert!(tokens.iter().any(|t| t.starts_with("https://docs.example.com")),
716            "{:?}", tokens);
717    }
718
719    #[test]
720    fn extract_detects_backtick_identifiers() {
721        let tokens = extract_preservation_tokens(
722            "use `SqzEngine::new` and `CompressionPipeline::compress`"
723        );
724        assert!(tokens.contains(&"SqzEngine::new".to_string()), "{:?}", tokens);
725        assert!(tokens.contains(&"CompressionPipeline::compress".to_string()), "{:?}", tokens);
726    }
727
728    #[test]
729    fn extract_detects_env_vars() {
730        let tokens = extract_preservation_tokens("set $HOME and ${FOO_BAR} and $PATH");
731        assert!(tokens.contains(&"$HOME".to_string()), "{:?}", tokens);
732        assert!(tokens.contains(&"${FOO_BAR}".to_string()), "{:?}", tokens);
733        assert!(tokens.contains(&"$PATH".to_string()), "{:?}", tokens);
734    }
735
736    #[test]
737    fn extract_detects_version_numbers() {
738        let tokens = extract_preservation_tokens(
739            "upgrade to 1.2.3 from v0.7.0 and pin 2.0.0-beta.1"
740        );
741        assert!(tokens.iter().any(|t| t.starts_with("1.2.3")), "{:?}", tokens);
742        assert!(tokens.iter().any(|t| t.starts_with("v0.7.0")), "{:?}", tokens);
743    }
744
745    #[test]
746    fn extract_ignores_prose() {
747        // Plain prose without paths/URLs/identifiers should produce no tokens
748        let tokens = extract_preservation_tokens(
749            "The quick brown fox jumps over the lazy dog. Lorem ipsum dolor sit amet."
750        );
751        assert!(tokens.is_empty(), "prose should yield no preservation tokens: {:?}", tokens);
752    }
753
754    #[test]
755    fn extract_ignores_fractions_in_prose() {
756        // "3/4 of the way" is not a path
757        let tokens = extract_preservation_tokens("We completed 3/4 of the tasks");
758        assert!(tokens.iter().all(|t| !t.contains("3/4")),
759            "fraction should not be extracted as path: {:?}", tokens);
760    }
761
762    #[test]
763    fn extract_caps_at_max_tokens() {
764        // Generate input with far more than MAX_TOKENS paths
765        let mut input = String::new();
766        for i in 0..1000 {
767            input.push_str(&format!("file_{}/sub_{}.txt ", i, i));
768        }
769        let tokens = extract_preservation_tokens(&input);
770        assert!(tokens.len() <= MAX_TOKENS, "should cap at {MAX_TOKENS}, got {}", tokens.len());
771    }
772
773    // ── Preservation check integration tests (regression for session bugs) ─
774
775    #[test]
776    fn verify_rejects_packages_to_pkgs_rewrite() {
777        // Reddit bug: `packages` directory renamed to `pkgs` in output
778        let original = "drwxr-xr-x  user staff  192 Apr 18 packages/\n\
779                        drwxr-xr-x  user staff  128 Apr 18 documentation/\n";
780        let compressed = "drwxr-xr-x  user staff  192 Apr 18 pkgs/\n\
781                          drwxr-xr-x  user staff  128 Apr 18 docs/\n";
782        let result = Verifier::verify(original, compressed);
783        assert!(
784            result.checks_failed.iter().any(|(k, _)| k == "preservation"),
785            "should fail preservation when packages→pkgs: {:?}", result.checks_failed
786        );
787    }
788
789    #[test]
790    fn verify_rejects_config_path_rewrite() {
791        // /etc/myapp/configuration/ → /etc/myapp/config/
792        let original = "check /etc/myapp/configuration/default.yml for errors";
793        let compressed = "check /etc/myapp/config/default.yml for errors";
794        let result = Verifier::verify(original, compressed);
795        assert!(
796            result.checks_failed.iter().any(|(k, _)| k == "preservation"),
797            "should fail preservation when path segment rewritten: {:?}", result.checks_failed
798        );
799    }
800
801    #[test]
802    fn verify_rejects_github_repo_rewrite() {
803        // github.com/.../repository → github.com/.../repo
804        let original = "origin  https://github.com/example/repository (fetch)";
805        let compressed = "origin  https://github.com/example/repo (fetch)";
806        let result = Verifier::verify(original, compressed);
807        assert!(
808            result.checks_failed.iter().any(|(k, _)| k == "preservation"),
809            "should fail preservation when URL path rewritten: {:?}", result.checks_failed
810        );
811    }
812
813    #[test]
814    fn verify_rejects_drops_filenames_entirely() {
815        // RLE pattern-run bug: 4 directory lines collapsed to a summary
816        let original = "drwxr-xr-x packages/\n\
817                        drwxr-xr-x configuration/\n\
818                        drwxr-xr-x documentation/\n\
819                        drwxr-xr-x environment/\n";
820        let compressed = "drwxr-xr-x ... [×4, varying: 4 unique values]\n";
821        let result = Verifier::verify(original, compressed);
822        assert!(
823            result.checks_failed.iter().any(|(k, _)| k == "preservation"),
824            "should fail preservation when filenames dropped: {:?}", result.checks_failed
825        );
826    }
827
828    #[test]
829    fn verify_accepts_lossless_dedup_output() {
830        // Legitimate dedup: original content is fully represented in output
831        let original = "see /etc/myapp/default.yml and src/main.rs";
832        let compressed = "see /etc/myapp/default.yml and src/main.rs";
833        let result = Verifier::verify(original, compressed);
834        assert!(
835            result.checks_passed.contains(&"preservation".to_string()),
836            "identical content must pass preservation: {:?}", result.checks_failed
837        );
838    }
839
840    #[test]
841    fn verify_accepts_json_null_stripped() {
842        // stripping null fields from JSON should leave paths in values intact
843        let original = r#"{"path":"/etc/foo.yml","debug":null,"log":"/var/log/app.log"}"#;
844        let compressed = r#"TOON:{path:"/etc/foo.yml",log:"/var/log/app.log"}"#;
845        let result = Verifier::verify(original, compressed);
846        assert!(
847            result.checks_passed.contains(&"preservation".to_string()),
848            "null-stripping must not trip preservation: {:?}", result.checks_failed
849        );
850    }
851
852    #[test]
853    fn verify_accepts_empty_input() {
854        // No preservation tokens in empty input → check passes vacuously
855        let result = Verifier::verify("", "");
856        assert!(result.checks_passed.contains(&"preservation".to_string()));
857    }
858
859    #[test]
860    fn preservation_failure_triggers_fallback() {
861        // Even if all other checks pass, a preservation failure must drop
862        // confidence below the fallback threshold. This is the defense we
863        // set up against the bug class from the April 18 audit session
864        // (fd4603d / f6dc86c / b8bd0d7).
865        let original = "commit: check /etc/myapp/configuration/default.yml\n\
866                        file: src/main.rs line 42\n";
867        let compressed = "commit: check /etc/myapp/config/default.yml\n\
868                          file: src/main.rs line 42\n";
869        let result = Verifier::verify(original, compressed);
870        assert!(
871            result.checks_failed.iter().any(|(k, _)| k == "preservation"),
872            "preservation should fail"
873        );
874        assert!(
875            result.fallback_triggered,
876            "preservation failure alone must trigger fallback (confidence={:.2})",
877            result.confidence
878        );
879    }
880}