wafrift_evolution/
rule_corpus.rs

1//! Per-rule WAF-bypass corpus — persistent {rule_id → bucket} store.
2//!
3//! `super::coverage_feedback` tracks rule_id observations in process
4//! memory for the current bench run. This module persists the
5//! richer corpus across runs:
6//!
7//! - The **payload bytes** that triggered each rule (not just the
8//!   descriptor — actual reproducible bytes).
9//! - The **encoding/grammar/smuggling chain** that produced the payload
10//!   so the operator can rebuild any variant by name.
11//! - The **bypass set** per rule — payloads that the WAF passed
12//!   (the only payloads with bounty value).
13//! - **Submission status** tracking the bounty lifecycle (Queued →
14//!   Submitted → Accepted / Duplicate / Rejected) so `wafrift harvest`
15//!   skips already-handled bypasses. wafrift never auto-files — filing is
16//!   a deliberate, one-at-a-time `wafrift submit` step.
17//! - **Drift timestamps** so `super::dilution` / `super::coverage_feedback`
18//!   can re-fire bypasses around CF Auto-Tune retrain windows.
19//!
20//! ## Why a separate module
21//!
22//! `coverage_feedback` is in the MAP-Elites hot path — every probe
23//! response updates it. We do NOT want disk I/O in that loop. The
24//! corpus is the **persistence layer** — written at round boundaries
25//! (every N probes, or on shutdown). The in-memory `RuleCoverage`
26//! observes; the on-disk `RuleBypassCorpus` accumulates.
27//!
28//! ## Target fingerprint
29//!
30//! One corpus per TARGET. Cloudflare's Managed Ruleset against
31//! `bench/cf-real/` is a different rule surface from AWS WAF's
32//! `AWSManagedRulesCommonRuleSet`. The corpus carries a
33//! `target_fingerprint` (typically `<vendor>:<ruleset-version>:<host>`)
34//! so cross-pollution between targets is impossible.
35//!
36//! ## File format
37//!
38//! JSON, schema-versioned. Field additions are backwards-compatible
39//! via serde defaults. Schema bumps require an explicit migration in
40//! `RuleBypassCorpus::load_or_default`.
41//!
42//! ## Concurrency
43//!
44//! Mid-hunt, multiple async workers may want to write the corpus.
45//! `RuleBypassCorpus::save_atomic` writes to a tempfile in the
46//! same directory then renames — POSIX rename is atomic on the same
47//! filesystem. Callers serialize their writes with a `Mutex` at the
48//! orchestrator level; the file itself is not a synchronization
49//! primitive.
50
51use serde::{Deserialize, Serialize};
52use std::collections::BTreeMap;
53use std::path::{Path, PathBuf};
54use std::time::{SystemTime, UNIX_EPOCH};
55
56use crate::coverage_feedback::{PayloadClass, RuleId};
57
/// Current on-disk corpus schema version. Bump when a non-additive
/// field change lands; older files load via the upgrade path.
///
/// `RuleBypassCorpus::new` and `RuleBypassCorpus::save_atomic` stamp this
/// value. A file that has no `schema_version` field deserializes it as `0`
/// (serde default) and `RuleBypassCorpus::load_or_default` upgrades that
/// to this value in memory.
pub const CORPUS_SCHEMA_VERSION: u32 = 1;
61
/// One attack payload recorded against a WAF rule that BLOCKED it.
///
/// Distinguished from [`RecordedBypass`] in two ways:
///
/// 1. **Verdict** — a `RecordedAttempt` was blocked. A `RecordedBypass`
///    was passed.
/// 2. **Submission lifecycle** — only bypasses have submission status
///    fields; blocks are tracked for "we've seen this fail before,
///    don't retry until drift."
///
/// Field order is part of the serialized JSON layout; do not reorder.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RecordedAttempt {
    /// The payload bytes as sent on the wire (after every encoder /
    /// grammar mutation / smuggling wrap).
    pub payload: String,
    /// Attack class (`sql`, `xss`, `cmd`, …) so the corpus can
    /// answer "what classes have we explored against rule X."
    pub payload_class: PayloadClass,
    /// Ordered list of technique identifiers applied to produce this
    /// payload. Operator can rebuild the variant by replaying the chain.
    pub encoding_chain: Vec<String>,
    /// Hash of the response body — collapses near-identical "Sorry,
    /// you have been blocked" pages so the corpus stays compact.
    /// Together with `payload` it forms the dedup key used by
    /// `RuleBypassCorpus::record_block`.
    pub response_hash: u64,
    /// Epoch seconds at observation.
    pub observed_at_secs: u64,
}
88
/// A confirmed WAF bypass — the WAF passed this payload through to
/// origin (verified by the oracle).
///
/// Field order is part of the serialized JSON layout; do not reorder.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RecordedBypass {
    /// The payload that bypassed.
    pub payload: String,
    /// Attack class of the payload; counted per class by
    /// `RuleBypassCorpus::summary`.
    pub payload_class: PayloadClass,
    /// Ordered technique identifiers that produced the payload, as passed
    /// to `RuleBypassCorpus::record_bypass`.
    pub encoding_chain: Vec<String>,
    /// Response hash supplied by the recorder. Together with `payload` it
    /// forms the dedup key used by `RuleBypassCorpus::record_bypass`.
    pub response_hash: u64,
    /// Epoch seconds at observation. For a `Queued` bypass this is the
    /// start of the default dry-run window checked by
    /// `RuleBypassCorpus::novel_bypasses_pending_submission`.
    pub observed_at_secs: u64,
    /// Lifecycle status of the bounty submission. Absent in older files,
    /// in which case it deserializes as `SubmissionStatus::Queued`.
    #[serde(default)]
    pub submission: SubmissionStatus,
    /// Serialized delivery shape that produced this bypass — the EXACT
    /// `(method, path, headers, body)` envelope the winning probe used,
    /// JSON-encoded (`wafrift_grammar::grammar::equiv::DeliveryShape`).
    /// `wafrift harvest` deserializes it to re-fire the *same* request
    /// instead of guessing across standard shapes — the difference
    /// between a recorded number and a reproducible, submittable bypass.
    ///
    /// Stored as an opaque `String` (not the typed shape) so this crate
    /// stays decoupled from the grammar crate — the same deliberate
    /// decoupling as [`Self::encoding_chain`]. Empty for bypasses
    /// recorded before delivery capture, or by strategies with no
    /// equivalence shape; harvest falls back to standard shapes then.
    #[serde(default)]
    pub delivery: String,
}
117
/// HackerOne submission lifecycle for a single bypass.
///
/// Serialized adjacently tagged: the variant name is written under the
/// `stage` key and the variant's fields (if any) under `data`.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(tag = "stage", content = "data")]
pub enum SubmissionStatus {
    /// Just discovered. This is the status `RuleBypassCorpus::record_bypass`
    /// assigns; `novel_bypasses_pending_submission` reports the bypass as
    /// ready once its default dry-run window (measured from
    /// `observed_at_secs`) has elapsed.
    #[default]
    Queued,
    /// Explicitly held until the `release_at_secs` epoch — used so
    /// submissions aren't fired at 3am. Nothing in this module sets it
    /// automatically; a caller opts in through
    /// `RuleBypassCorpus::set_submission`.
    DryRunHold { release_at_secs: u64 },
    /// Sent to HackerOne, awaiting triage. `report_id` is the H1
    /// report number.
    Submitted { report_id: String },
    /// H1 accepted the report. `report_id` retained for tracking.
    Accepted { report_id: String },
    /// H1 marked duplicate of a prior report.
    Duplicate { duplicate_of: String },
    /// H1 rejected (informative / NA / out-of-scope).
    Rejected { reason: String },
}
138
/// All recorded attempts and bypasses for ONE WAF rule.
///
/// Every field except `rule_id` is `#[serde(default)]`, so buckets written
/// before a field existed still load.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RuleBucket {
    /// Rule identifier the corpus is keyed on. Stored redundantly so
    /// a bucket extracted from the map stays self-describing.
    pub rule_id: RuleId,
    /// Optional human-readable rule name when the WAF exposes one
    /// (e.g. CRS rule "942100 — SQL Injection Attack: Detected").
    #[serde(default)]
    pub description: Option<String>,
    /// Payloads that triggered this rule, in insertion order. Capped per
    /// bucket by `RuleBypassCorpus::record_block`.
    #[serde(default)]
    pub blocked: Vec<RecordedAttempt>,
    /// Payloads that bypassed this rule (passed through to origin), in
    /// insertion order. Capped per bucket by
    /// `RuleBypassCorpus::record_bypass`.
    #[serde(default)]
    pub bypassed: Vec<RecordedBypass>,
    /// Epoch seconds of last detected ruleset drift — when CF
    /// Auto-Tune retrains, this updates and previously-blocked
    /// payloads become retry-eligible. `None` until
    /// `RuleBypassCorpus::mark_drift` is called for this rule.
    #[serde(default)]
    pub last_drift_at_secs: Option<u64>,
}
161
/// The full persistent corpus, indexed by rule_id.
///
/// Meant to be held by the hunt orchestrator + read by the bench
/// reporter. Note that `Clone` deep-copies every bucket and every
/// recorded payload, so cloning a large corpus is NOT cheap — prefer
/// borrowing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RuleBypassCorpus {
    /// Schema version — load_or_default uses this to migrate older
    /// formats. Always [`CORPUS_SCHEMA_VERSION`] on save. Deserializes
    /// as `0` when the field is absent from the file.
    #[serde(default)]
    pub schema_version: u32,
    /// Target fingerprint — `<vendor>:<ruleset>:<host>`. Two
    /// fingerprints share no buckets; protect against cross-target
    /// pollution. Required on load: a file without it fails to parse.
    pub target_fingerprint: String,
    /// rule_id → bucket. BTreeMap so iteration is deterministic
    /// (the bench-result determinism contract extends to this
    /// corpus's serialization).
    #[serde(default)]
    pub buckets: BTreeMap<String, RuleBucket>,
    /// Epoch seconds at last save. `0` for a corpus that has never been
    /// saved.
    #[serde(default)]
    pub last_saved_at_secs: u64,
}
185
186impl RuleBypassCorpus {
187    /// Create a new empty corpus for the given target fingerprint.
188    #[must_use]
189    pub fn new(target_fingerprint: impl Into<String>) -> Self {
190        Self {
191            schema_version: CORPUS_SCHEMA_VERSION,
192            target_fingerprint: target_fingerprint.into(),
193            buckets: BTreeMap::new(),
194            last_saved_at_secs: 0,
195        }
196    }
197
    /// Maximum corpus size we will read into memory (1 GiB), passed as the
    /// cap to `safe_io::read_capped_text` by [`Self::load_or_default`]. The
    /// corpus is operator-private, self-authored state (NOT an untrusted
    /// download), so the decompression-bomb threat model behind `safe_io`
    /// does not apply — this ceiling only bounds memory on a pathologically
    /// huge file and sits far above any real corpus. When the capped read
    /// fails, the file is *preserved* (moved aside), never silently
    /// dropped. (§15 / §1)
    const CORPUS_READ_CEILING_BYTES: usize = 1024 * 1024 * 1024; // 1 GiB
205
206    /// Load from disk, **never destroying recoverable data**.
207    ///
208    /// Return a fresh corpus ONLY when the file genuinely does not exist
209    /// or is empty (first run for this target). Every OTHER outcome on an
210    /// *existing, non-empty* file is treated as recoverable bounty data
211    /// that must survive:
212    ///
213    /// - **Too large to read / I-O error** → the file is moved aside to
214    ///   `<path>.corrupt-<epoch>` (so a later save can't overwrite it)
215    ///   and a loud warning is printed before a fresh corpus is returned.
216    /// - **Won't parse** (schema drift, truncation, corruption) → same
217    ///   preserve-aside-then-fresh path.
218    /// - **Parses, but bloated** → recompacted in memory (per-bucket caps
219    ///   re-applied) and returned intact; the next save reclaims the bloat.
220    ///   No bypass is ever lost — bypasses are capped generously, far
221    ///   above any real hunt.
222    ///
223    /// This is the fix for the recurring "corpus disappeared" data loss:
224    /// the old code returned an empty `Self::new(...)` on ANY read/parse
225    /// failure, and the next `save_atomic` atomically overwrote the real
226    /// corpus with nothing. A load failure must never silently become an
227    /// empty corpus the next save destroys.
228    ///
229    /// `target_fingerprint` is used only when the file is absent/empty or
230    /// had to be preserved-and-rebuilt — when the file IS valid its
231    /// embedded fingerprint wins (callers should verify the fingerprint
232    /// matches what they expect via [`Self::target_fingerprint`]).
233    pub fn load_or_default(path: &Path, target_fingerprint: impl Into<String>) -> Self {
234        // A genuinely missing file is a legitimate fresh start.
235        if !path.exists() {
236            return Self::new(target_fingerprint);
237        }
238        let raw = match crate::safe_io::read_capped_text(path, Self::CORPUS_READ_CEILING_BYTES) {
239            Ok(s) => s,
240            Err(e) => {
241                // Oversize or I-O error on an existing file. We can't
242                // read it, but we must NOT let the next save clobber it.
243                preserve_unreadable_corpus(path, &format!("read failed: {e}"));
244                return Self::new(target_fingerprint);
245            }
246        };
247        // An empty / whitespace-only file is equivalent to absent — a
248        // fresh start, with no noisy preserve-aside.
249        if raw.trim().is_empty() {
250            return Self::new(target_fingerprint);
251        }
252        match serde_json::from_str::<Self>(&raw) {
253            Ok(mut corpus) => {
254                if corpus.schema_version == 0 {
255                    corpus.schema_version = CORPUS_SCHEMA_VERSION;
256                }
257                // Recompact a pre-cap / bloated corpus: truncate each
258                // bucket to the respective cap on load so the next save
259                // reclaims the bloat. Keeps the earliest coverage and
260                // harvest samples; bypasses are capped generously so no
261                // real harvest material is lost. (§15/§1)
262                for bucket in corpus.buckets.values_mut() {
263                    bucket.blocked.truncate(Self::MAX_BLOCKED_PER_BUCKET);
264                    bucket.bypassed.truncate(Self::MAX_BYPASSED_PER_BUCKET);
265                }
266                corpus
267            }
268            Err(e) => {
269                // The file exists and is non-empty but won't parse. DO
270                // NOT return an empty corpus the next save would write
271                // over the original — preserve the bytes aside first.
272                preserve_unreadable_corpus(path, &format!("parse failed: {e}"));
273                Self::new(target_fingerprint)
274            }
275        }
276    }
277
278    /// Save atomically via tempfile + rename. Returns an error only on
279    /// I/O failure; the rename itself is atomic on the same filesystem
280    /// so a concurrent reader either sees the prior snapshot or this
281    /// one — never a torn write.
282    pub fn save_atomic(&self, path: &Path) -> std::io::Result<()> {
283        // Rolling backup: before replacing an existing non-empty corpus,
284        // snapshot it to `<path>.bak`. One bad save — a logic regression,
285        // a parse-fail-induced empty reload that slipped past the loader's
286        // preserve guard, a schema drift — is then always one step
287        // recoverable. The corpus is irreplaceable bounty data. (§15/§1)
288        backup_before_overwrite(path);
289        let mut snap = self.clone();
290        snap.schema_version = CORPUS_SCHEMA_VERSION;
291        snap.last_saved_at_secs = current_epoch_secs();
292        let body = serde_json::to_vec_pretty(&snap)
293            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
294        // R55 pass-19 I4 (CLAUDE.md §7 DEDUP): route through the
295        // workspace's canonical atomic writer so the mkdir-parent,
296        // unique-tmp-name, fsync, rename(2) sequence lives in ONE
297        // place. Pre-fix this module + edge_pop_coverage + h1_dedup
298        // each had their own subtly different copy.
299        wafrift_types::loaders::write_atomic(path, &body)
300    }
301
302    /// Get or insert the bucket for `rule_id`. Cheap because we hand
303    /// out a `&mut RuleBucket` instead of cloning.
304    pub fn bucket_mut(&mut self, rule_id: &str) -> &mut RuleBucket {
305        self.buckets
306            .entry(rule_id.to_string())
307            .or_insert_with(|| RuleBucket {
308                rule_id: RuleId::new(rule_id),
309                ..RuleBucket::default()
310            })
311    }
312
    /// Max BLOCKED samples retained per rule bucket. Blocked payloads are a
    /// rule-coverage sample, not harvest material (bypasses have their own,
    /// larger cap — `MAX_BYPASSED_PER_BUCKET`), so a few hundred per rule
    /// fully characterise what a rule blocks. The cap bounds three real costs
    /// a 62 MB CumulusFire corpus surfaced via dogfood (§15 / §1): corpus
    /// growth toward `CORPUS_READ_CEILING_BYTES` (past which
    /// `load_or_default` can no longer read the file and has to move it aside
    /// and start fresh), `save_atomic` write size, and the O(n) dedup scan in
    /// `record_block` — which would otherwise make the hot record path O(n²)
    /// over a long hunt.
    const MAX_BLOCKED_PER_BUCKET: usize = 512;
322
    /// Max BYPASSED samples retained per rule bucket. Bypasses are the primary
    /// harvest material so the cap is generous (8× the blocked cap), but it is
    /// still finite: an adversarial response-varying WAF can grow `bypassed`
    /// without bound, eventually pushing the corpus past
    /// `CORPUS_READ_CEILING_BYTES` — at which point `load_or_default` can no
    /// longer read the file, moves it aside, and starts a fresh corpus. This
    /// cap bounds growth far below that cliff while preserving virtually all
    /// real harvest material encountered in practice. `load_or_default`
    /// truncates over-cap buckets on load to heal corpora written before this
    /// cap was introduced.
    const MAX_BYPASSED_PER_BUCKET: usize = 4096;
333
334    /// Record a payload that the WAF BLOCKED, tagged with the rule_id
335    /// it triggered (if the oracle could attribute it).
336    pub fn record_block(
337        &mut self,
338        rule_id: &str,
339        payload: &str,
340        payload_class: PayloadClass,
341        encoding_chain: Vec<String>,
342        response_hash: u64,
343    ) {
344        let entry = RecordedAttempt {
345            payload: payload.to_string(),
346            payload_class,
347            encoding_chain,
348            response_hash,
349            observed_at_secs: current_epoch_secs(),
350        };
351        let bucket = self.bucket_mut(rule_id);
352        // Coverage cap: once a rule has MAX_BLOCKED_PER_BUCKET samples we have
353        // characterised what it blocks; stop recording blocked payloads to bound
354        // corpus growth and keep the dedup scan below O(cap), not O(n). Bypasses
355        // have their own generous cap (MAX_BYPASSED_PER_BUCKET). (§15/§1)
356        if bucket.blocked.len() >= Self::MAX_BLOCKED_PER_BUCKET {
357            return;
358        }
359        // Dedup by (response_hash, payload) so re-running the same
360        // bench doesn't bloat the file.
361        if !bucket
362            .blocked
363            .iter()
364            .any(|a| a.response_hash == entry.response_hash && a.payload == entry.payload)
365        {
366            bucket.blocked.push(entry);
367        }
368    }
369
370    /// Record a payload that BYPASSED the WAF. The default submission
371    /// status is `Queued`; callers can transition via
372    /// [`Self::set_submission`].
373    pub fn record_bypass(
374        &mut self,
375        rule_id: &str,
376        payload: &str,
377        payload_class: PayloadClass,
378        encoding_chain: Vec<String>,
379        response_hash: u64,
380    ) {
381        let entry = RecordedBypass {
382            payload: payload.to_string(),
383            payload_class,
384            encoding_chain,
385            response_hash,
386            observed_at_secs: current_epoch_secs(),
387            submission: SubmissionStatus::Queued,
388            delivery: String::new(),
389        };
390        let bucket = self.bucket_mut(rule_id);
391        // Generous cap: 4 096 bypasses per rule is far more than any real hunt
392        // accumulates, but bounds corpus growth away from the 128 MiB load cliff
393        // that would silently discard the whole corpus (§15 / §1).
394        if bucket.bypassed.len() >= Self::MAX_BYPASSED_PER_BUCKET {
395            return;
396        }
397        if !bucket
398            .bypassed
399            .iter()
400            .any(|b| b.response_hash == entry.response_hash && b.payload == entry.payload)
401        {
402            bucket.bypassed.push(entry);
403        }
404    }
405
406    /// Mark a ruleset drift event on a specific rule (e.g. CF
407    /// Auto-Tune retrain detected via [`crate::dilution`]'s drift
408    /// detector). Triggers "retry the blocked corpus" downstream.
409    pub fn mark_drift(&mut self, rule_id: &str) {
410        let bucket = self.bucket_mut(rule_id);
411        bucket.last_drift_at_secs = Some(current_epoch_secs());
412    }
413
414    /// Update the submission status of a previously-recorded bypass.
415    /// Returns `true` if the bypass was found and updated.
416    pub fn set_submission(
417        &mut self,
418        rule_id: &str,
419        payload: &str,
420        new_status: SubmissionStatus,
421    ) -> bool {
422        if let Some(bucket) = self.buckets.get_mut(rule_id)
423            && let Some(b) = bucket.bypassed.iter_mut().find(|b| b.payload == payload)
424        {
425            b.submission = new_status;
426            return true;
427        }
428        false
429    }
430
431    /// Attach the serialized delivery shape (see [`RecordedBypass::delivery`])
432    /// to a previously-recorded bypass. Returns `true` if the bypass was
433    /// found and updated.
434    ///
435    /// Recorded as a separate step after [`Self::record_bypass`] so the
436    /// hot record path (which dedups by `(response_hash, payload)`) stays
437    /// unchanged: the recorder calls this once, immediately after the
438    /// write, with the shape the winning probe used. A blank `delivery`
439    /// is never written — only a non-empty shape overwrites.
440    pub fn set_delivery(&mut self, rule_id: &str, payload: &str, delivery: String) -> bool {
441        if delivery.is_empty() {
442            return false;
443        }
444        if let Some(bucket) = self.buckets.get_mut(rule_id)
445            && let Some(b) = bucket.bypassed.iter_mut().find(|b| b.payload == payload)
446        {
447            b.delivery = delivery;
448            return true;
449        }
450        false
451    }
452
453    /// Rules with fewer than `min_attempts` recorded blocks AND zero
454    /// bypasses. The hunt orchestrator targets these first — they're
455    /// the unexplored cells of the (rule_id × class) grid.
456    #[must_use]
457    pub fn unexplored_rules(&self, min_attempts: usize) -> Vec<String> {
458        self.buckets
459            .iter()
460            .filter(|(_, b)| b.blocked.len() < min_attempts && b.bypassed.is_empty())
461            .map(|(k, _)| k.clone())
462            .collect()
463    }
464
465    /// Rules where drift was detected within the last `window_secs`
466    /// AND there are blocked payloads worth re-firing.
467    #[must_use]
468    pub fn rules_due_for_retry(&self, window_secs: u64) -> Vec<String> {
469        let now = current_epoch_secs();
470        self.buckets
471            .iter()
472            .filter(|(_, b)| {
473                b.last_drift_at_secs
474                    .is_some_and(|d| now.saturating_sub(d) <= window_secs)
475                    && !b.blocked.is_empty()
476            })
477            .map(|(k, _)| k.clone())
478            .collect()
479    }
480
481    /// All bypasses recorded against a specific rule (newest last
482    /// per insertion order).
483    #[must_use]
484    pub fn bypasses_for_rule(&self, rule_id: &str) -> &[RecordedBypass] {
485        self.buckets
486            .get(rule_id)
487            .map(|b| b.bypassed.as_slice())
488            .unwrap_or(&[])
489    }
490
491    /// All blocked attempts recorded against a specific rule.
492    #[must_use]
493    pub fn blocked_for_rule(&self, rule_id: &str) -> &[RecordedAttempt] {
494        self.buckets
495            .get(rule_id)
496            .map(|b| b.blocked.as_slice())
497            .unwrap_or(&[])
498    }
499
500    /// Bypasses still in `Queued` status whose dry-run hold has
501    /// expired — these are ready for submission to HackerOne.
502    ///
503    /// `default_dry_run_secs` is applied to bypasses still in
504    /// `Queued` state whose `observed_at_secs + default_dry_run_secs`
505    /// has passed (most operators leave bypasses queued without
506    /// setting an explicit `DryRunHold` and rely on this default).
507    #[must_use]
508    pub fn novel_bypasses_pending_submission(
509        &self,
510        default_dry_run_secs: u64,
511    ) -> Vec<(&str, &RecordedBypass)> {
512        let now = current_epoch_secs();
513        let mut out = vec![];
514        for (rule_id, bucket) in &self.buckets {
515            for b in &bucket.bypassed {
516                let ready = match &b.submission {
517                    SubmissionStatus::Queued => {
518                        now.saturating_sub(b.observed_at_secs) >= default_dry_run_secs
519                    }
520                    SubmissionStatus::DryRunHold { release_at_secs } => now >= *release_at_secs,
521                    _ => false,
522                };
523                if ready {
524                    out.push((rule_id.as_str(), b));
525                }
526            }
527        }
528        out
529    }
530
531    /// Total bypass count across all rules.
532    #[must_use]
533    pub fn total_bypasses(&self) -> usize {
534        self.buckets.values().map(|b| b.bypassed.len()).sum()
535    }
536
537    /// Total block count across all rules.
538    #[must_use]
539    pub fn total_blocks(&self) -> usize {
540        self.buckets.values().map(|b| b.blocked.len()).sum()
541    }
542
543    /// Number of distinct rule_ids with at least one observation.
544    #[must_use]
545    pub fn rules_seen(&self) -> usize {
546        self.buckets.len()
547    }
548
549    /// Summary suitable for the bench reporter — totals + per-class
550    /// breakdown for quick "what did we learn" gut-check.
551    #[must_use]
552    pub fn summary(&self) -> CoverageSummary {
553        let mut per_class: BTreeMap<String, ClassStats> = BTreeMap::new();
554        for bucket in self.buckets.values() {
555            for b in &bucket.blocked {
556                let entry = per_class
557                    .entry(b.payload_class.as_str().to_string())
558                    .or_default();
559                entry.blocks += 1;
560            }
561            for b in &bucket.bypassed {
562                let entry = per_class
563                    .entry(b.payload_class.as_str().to_string())
564                    .or_default();
565                entry.bypasses += 1;
566            }
567        }
568        CoverageSummary {
569            target_fingerprint: self.target_fingerprint.clone(),
570            rules_seen: self.rules_seen(),
571            total_blocks: self.total_blocks(),
572            total_bypasses: self.total_bypasses(),
573            per_class,
574        }
575    }
576}
577
/// Per-class block/bypass counts for the corpus summary. One instance per
/// payload class, filled in by `RuleBypassCorpus::summary`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ClassStats {
    /// Blocked attempts recorded for this class, across all rules.
    pub blocks: usize,
    /// Bypasses recorded for this class, across all rules.
    pub bypasses: usize,
}
584
/// What the bench reporter pulls when it wants a one-line gut-check
/// on the corpus state. Built by `RuleBypassCorpus::summary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageSummary {
    /// Fingerprint of the target the summarized corpus belongs to.
    pub target_fingerprint: String,
    /// Number of rule buckets in the corpus.
    pub rules_seen: usize,
    /// Blocked attempts across all buckets.
    pub total_blocks: usize,
    /// Bypasses across all buckets.
    pub total_bypasses: usize,
    /// Payload-class name → block/bypass counts, in class-name order.
    pub per_class: BTreeMap<String, ClassStats>,
}
595
596/// Default disk location for the corpus — `~/.wafrift/corpus/<fingerprint>.json`.
597/// Falls back to a `wafrift-bench/results/corpus/` directory under CWD when
598/// the home directory can't be resolved.
599#[must_use]
600pub fn default_corpus_path(target_fingerprint: &str) -> PathBuf {
601    let safe = sanitize_fingerprint_for_filename(target_fingerprint);
602    if let Some(home) = dirs_home() {
603        return home
604            .join(".wafrift")
605            .join("corpus")
606            .join(format!("{safe}.json"));
607    }
608    PathBuf::from("wafrift-bench/results/corpus").join(format!("{safe}.json"))
609}
610
/// Turn a fingerprint into a string that is safe as a single filename
/// component.
///
/// Only `[A-Za-z0-9_-]` survives; every other character — path
/// separators, `:`, whitespace, non-ASCII, and notably `.` — is replaced
/// by `_`, one for one. Because `.` is never emitted, a crafted
/// fingerprint such as `..` cannot yield a `..` path component, which
/// rules out even the theoretical path-traversal surface.
fn sanitize_fingerprint_for_filename(fp: &str) -> String {
    let mut safe = String::with_capacity(fp.len());
    for ch in fp.chars() {
        let kept = match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch,
            _ => '_',
        };
        safe.push(kept);
    }
    safe
}
629
/// Wall-clock time as whole seconds since the Unix epoch. Yields `0` if
/// the system clock reports a time before the epoch.
fn current_epoch_secs() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
636
637/// Move an existing-but-unreadable corpus file aside to a timestamped
638/// sidecar (`<path>.corrupt-<epoch>`) so a subsequent `save_atomic` can
639/// never overwrite it, and emit a loud warning naming the preserved file.
640///
641/// This is the load-side half of the corpus-durability guarantee: an
642/// oversize / corrupt / unparseable corpus is *preserved*, never silently
643/// discarded. Best-effort — if the file can't be moved aside we still
644/// warn (and the save-side [`backup_before_overwrite`] guard provides a
645/// second line of defence by copying the file to `<path>.bak` before any
646/// overwrite). Never panics; the caller still receives a fresh corpus.
647fn preserve_unreadable_corpus(path: &Path, reason: &str) {
648    // Unique sidecar name (epoch + pid + nanos) so two corruption events within
649    // the same wall-clock second can't collide — a second-granularity name
650    // would let the second `rename` replace the first sidecar and lose the
651    // earlier corrupt bytes. Mirrors the unique-tmp-name policy `write_atomic`
652    // uses. (§15 / §1 — never lose recoverable data.)
653    let nanos = SystemTime::now()
654        .duration_since(UNIX_EPOCH)
655        .map(|d| d.as_nanos())
656        .unwrap_or(0);
657    let mut aside = path.as_os_str().to_owned();
658    aside.push(format!(
659        ".corrupt-{}-{}-{}",
660        current_epoch_secs(),
661        std::process::id(),
662        nanos
663    ));
664    let aside = PathBuf::from(aside);
665    match std::fs::rename(path, &aside) {
666        Ok(()) => eprintln!(
667            "wafrift: WARNING — corpus at {} could not be loaded ({reason}). \
668             Your data was PRESERVED at {} and a fresh corpus was started. \
669             Rename it back once the cause is addressed.",
670            path.display(),
671            aside.display(),
672        ),
673        Err(e) => eprintln!(
674            "wafrift: ERROR — corpus at {} could not be loaded ({reason}) AND \
675             could not be moved aside ({e}). Back this file up MANUALLY before \
676             the next run — a save may otherwise overwrite it.",
677            path.display(),
678        ),
679    }
680}
681
/// Snapshot an existing non-empty corpus to `<path>.bak` before it is
/// overwritten by `RuleBypassCorpus::save_atomic`. Best-effort; never
/// blocks or fails the save. Empty/absent prior files (and files whose
/// metadata cannot be read) are skipped — nothing to protect. This is
/// the save-side half of the durability guarantee.
///
/// A failed copy is reported on stderr instead of being swallowed: the
/// save still goes ahead, but the operator learns that this overwrite is
/// not covered by a fresh backup.
fn backup_before_overwrite(path: &Path) {
    let has_content = std::fs::metadata(path)
        .map(|meta| meta.len() > 0)
        .unwrap_or(false);
    if !has_content {
        return;
    }
    let mut bak = path.as_os_str().to_owned();
    bak.push(".bak");
    let bak = PathBuf::from(bak);
    if let Err(e) = std::fs::copy(path, &bak) {
        eprintln!(
            "wafrift: WARNING — could not snapshot corpus {} to {} ({e}); \
             the save will proceed WITHOUT a fresh backup.",
            path.display(),
            bak.display(),
        );
    }
}
696
/// Resolve the user's home directory from the environment: `$HOME` first,
/// then `%USERPROFILE%`. A variable that is unset, not valid Unicode, or
/// empty is skipped; `None` when neither yields a value.
fn dirs_home() -> Option<PathBuf> {
    // No hard dependency on the `dirs` crate — reading the two variables
    // directly keeps the crate's dep surface tight.
    ["HOME", "USERPROFILE"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map(PathBuf::from)
}
712
713#[cfg(test)]
714mod tests {
715    use super::*;
716    use tempfile::tempdir;
717
718    fn cls(s: &str) -> PayloadClass {
719        PayloadClass::new(s)
720    }
721
722    #[test]
723    fn new_corpus_is_empty() {
724        let c = RuleBypassCorpus::new("cf:managed-ruleset:cumulusfire.cloudflare.com");
725        assert_eq!(c.rules_seen(), 0);
726        assert_eq!(c.total_blocks(), 0);
727        assert_eq!(c.total_bypasses(), 0);
728        assert_eq!(
729            c.target_fingerprint,
730            "cf:managed-ruleset:cumulusfire.cloudflare.com"
731        );
732        assert_eq!(c.schema_version, CORPUS_SCHEMA_VERSION);
733    }
734
735    #[test]
736    fn record_block_dedups_by_payload_and_hash() {
737        let mut c = RuleBypassCorpus::new("t");
738        c.record_block(
739            "942100",
740            "' OR 1=1--",
741            cls("sql"),
742            vec!["url".into()],
743            0xCAFE,
744        );
745        c.record_block(
746            "942100",
747            "' OR 1=1--",
748            cls("sql"),
749            vec!["url".into()],
750            0xCAFE,
751        );
752        c.record_block(
753            "942100",
754            "' OR 1=1--",
755            cls("sql"),
756            vec!["url".into()],
757            0xCAFE,
758        );
759        assert_eq!(c.blocked_for_rule("942100").len(), 1);
760    }
761
762    #[test]
763    fn record_block_keeps_distinct_payloads_per_rule() {
764        let mut c = RuleBypassCorpus::new("t");
765        c.record_block("942100", "' OR 1=1--", cls("sql"), vec![], 1);
766        c.record_block("942100", "UNION SELECT 1", cls("sql"), vec![], 2);
767        c.record_block("942100", "1' AND 1=1--", cls("sql"), vec![], 3);
768        assert_eq!(c.blocked_for_rule("942100").len(), 3);
769    }
770
771    #[test]
772    fn record_block_caps_blocked_per_bucket() {
773        // Dogfood-found (§15/§1): a 62 MB CumulusFire corpus came from an
774        // UNCAPPED `blocked` array (unique + dedup'd, but unbounded → creeps to
775        // the 128 MiB load cliff + an O(n²) dedup scan). Pin the per-bucket cap
776        // on blocked; bypasses have their own generous cap (MAX_BYPASSED_PER_BUCKET).
777        let mut c = RuleBypassCorpus::new("t");
778        let over = RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET + 200;
779        for i in 0..over {
780            // Distinct payload + hash so dedup never collapses them — only the
781            // cap should bound the count.
782            c.record_block("r", &format!("p{i}"), cls("sql"), vec![], i as u64);
783        }
784        assert_eq!(
785            c.blocked_for_rule("r").len(),
786            RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET,
787            "blocked must be capped per bucket"
788        );
789        // Bypasses have a generous cap (4096). Push well under it — all persist.
790        let n_bypass = RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET + 50;
791        for i in 0..n_bypass {
792            c.record_bypass(
793                "r",
794                &format!("b{i}"),
795                cls("sql"),
796                vec![],
797                1_000_000 + i as u64,
798            );
799        }
800        assert_eq!(
801            c.total_bypasses(),
802            n_bypass,
803            "bypasses under MAX_BYPASSED_PER_BUCKET must all persist"
804        );
805    }
806
807    #[test]
808    fn record_bypass_caps_bypassed_per_bucket() {
809        // A response-varying WAF can drive unbounded `bypassed` growth → total
810        // corpus loss when it hits the 128 MiB RULE_CORPUS_MAX_BYTES cliff.
811        // Pin that the cap is enforced at MAX_BYPASSED_PER_BUCKET. (§15/§1)
812        let mut c = RuleBypassCorpus::new("t");
813        let over = RuleBypassCorpus::MAX_BYPASSED_PER_BUCKET + 500;
814        for i in 0..over {
815            // Distinct payload + hash so dedup never collapses — only the cap limits.
816            c.record_bypass("r", &format!("b{i}"), cls("sql"), vec![], i as u64);
817        }
818        assert_eq!(
819            c.bypasses_for_rule("r").len(),
820            RuleBypassCorpus::MAX_BYPASSED_PER_BUCKET,
821            "bypassed must be capped at MAX_BYPASSED_PER_BUCKET"
822        );
823    }
824
825    #[test]
826    fn load_or_default_heals_pre_cap_oversized_blocked() {
827        use std::env::temp_dir;
828        // A corpus written BEFORE the cap (or hand-edited) may hold >cap
829        // blocked entries — e.g. the 62 MB CumulusFire corpus. Loading must
830        // truncate each bucket to the cap so the next save reclaims the bloat,
831        // while bypasses (harvest material) survive untouched. (§15/§1)
832        let mut c = RuleBypassCorpus::new("heal-test");
833        let over = RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET + 300;
834        let blocked: Vec<RecordedAttempt> = (0..over)
835            .map(|i| RecordedAttempt {
836                payload: format!("p{i}"),
837                payload_class: cls("sql"),
838                encoding_chain: vec![],
839                response_hash: i as u64,
840                observed_at_secs: 0,
841            })
842            .collect();
843        c.buckets.insert(
844            "r".to_string(),
845            RuleBucket {
846                blocked,
847                ..RuleBucket::default()
848            },
849        );
850        c.record_bypass("r", "winner", cls("sql"), vec![], 42);
851
852        let path = temp_dir().join(format!("wafrift-corpus-heal-{}.json", std::process::id()));
853        let _ = std::fs::remove_file(&path);
854        c.save_atomic(&path).expect("save oversized corpus");
855        let healed = RuleBypassCorpus::load_or_default(&path, "heal-test");
856        assert_eq!(
857            healed.blocked_for_rule("r").len(),
858            RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET,
859            "load must truncate over-cap blocked to reclaim the bloat"
860        );
861        assert_eq!(healed.total_bypasses(), 1, "bypasses survive the heal");
862        let _ = std::fs::remove_file(&path);
863    }
864
865    #[test]
866    fn load_or_default_heals_pre_cap_oversized_bypassed() {
867        use std::env::temp_dir;
868        // A corpus written before MAX_BYPASSED_PER_BUCKET was introduced may
869        // hold more bypasses than the cap. load_or_default must truncate each
870        // bucket's `bypassed` vec to MAX_BYPASSED_PER_BUCKET on load so the
871        // next save reclaims the bloat and stays below the 128 MiB cliff.
872        // (§15/§1 — mirrors the blocked-heal test above)
873        let mut c = RuleBypassCorpus::new("bypass-heal-test");
874        let over = RuleBypassCorpus::MAX_BYPASSED_PER_BUCKET + 200;
875        // Construct an over-cap bypassed vec directly (bypassing record_bypass's
876        // write-time cap) to simulate a legacy on-disk corpus.
877        let bypassed: Vec<RecordedBypass> = (0..over)
878            .map(|i| RecordedBypass {
879                payload: format!("b{i}"),
880                payload_class: cls("sql"),
881                encoding_chain: vec![],
882                response_hash: i as u64,
883                observed_at_secs: 0,
884                submission: SubmissionStatus::Queued,
885                delivery: String::new(),
886            })
887            .collect();
888        c.buckets.insert(
889            "r".to_string(),
890            RuleBucket {
891                bypassed,
892                ..RuleBucket::default()
893            },
894        );
895        // Also confirm a blocked entry survives the heal.
896        c.record_block("r", "blocker", cls("sql"), vec![], 1);
897
898        let path = temp_dir().join(format!(
899            "wafrift-corpus-bypass-heal-{}.json",
900            std::process::id()
901        ));
902        let _ = std::fs::remove_file(&path);
903        c.save_atomic(&path).expect("save oversized bypass corpus");
904        let healed = RuleBypassCorpus::load_or_default(&path, "bypass-heal-test");
905        assert_eq!(
906            healed.bypasses_for_rule("r").len(),
907            RuleBypassCorpus::MAX_BYPASSED_PER_BUCKET,
908            "load must truncate over-cap bypassed to MAX_BYPASSED_PER_BUCKET"
909        );
910        assert_eq!(healed.total_blocks(), 1, "blocked entries survive the heal");
911        let _ = std::fs::remove_file(&path);
912    }
913
914    #[test]
915    fn record_bypass_dedups() {
916        let mut c = RuleBypassCorpus::new("t");
917        c.record_bypass("942100", "Ω union select", cls("sql"), vec![], 1);
918        c.record_bypass("942100", "Ω union select", cls("sql"), vec![], 1);
919        assert_eq!(c.bypasses_for_rule("942100").len(), 1);
920    }
921
922    #[test]
923    fn record_bypass_default_status_is_queued() {
924        let mut c = RuleBypassCorpus::new("t");
925        c.record_bypass("942100", "payload", cls("sql"), vec![], 1);
926        let b = &c.bypasses_for_rule("942100")[0];
927        assert!(matches!(b.submission, SubmissionStatus::Queued));
928    }
929
930    #[test]
931    fn set_submission_updates_lifecycle() {
932        let mut c = RuleBypassCorpus::new("t");
933        c.record_bypass("942100", "payload", cls("sql"), vec![], 1);
934        let ok = c.set_submission(
935            "942100",
936            "payload",
937            SubmissionStatus::Submitted {
938                report_id: "H1-12345".into(),
939            },
940        );
941        assert!(ok);
942        let b = &c.bypasses_for_rule("942100")[0];
943        assert!(matches!(
944            &b.submission,
945            SubmissionStatus::Submitted { report_id } if report_id == "H1-12345"
946        ));
947    }
948
949    #[test]
950    fn set_submission_missing_returns_false() {
951        let mut c = RuleBypassCorpus::new("t");
952        let ok = c.set_submission(
953            "doesnt-exist",
954            "payload",
955            SubmissionStatus::Accepted {
956                report_id: "X".into(),
957            },
958        );
959        assert!(!ok);
960    }
961
962    #[test]
963    fn record_bypass_default_delivery_is_empty() {
964        let mut c = RuleBypassCorpus::new("t");
965        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
966        assert_eq!(c.bypasses_for_rule("R1")[0].delivery, "");
967    }
968
969    #[test]
970    fn set_delivery_attaches_shape_to_recorded_bypass() {
971        let mut c = RuleBypassCorpus::new("t");
972        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
973        let ok = c.set_delivery("R1", "p", "{\"Query\":{\"param\":\"q\"}}".into());
974        assert!(ok);
975        assert_eq!(
976            c.bypasses_for_rule("R1")[0].delivery,
977            "{\"Query\":{\"param\":\"q\"}}"
978        );
979    }
980
981    #[test]
982    fn set_delivery_missing_bypass_returns_false() {
983        let mut c = RuleBypassCorpus::new("t");
984        assert!(!c.set_delivery("nope", "p", "{\"PathSegment\":null}".into()));
985    }
986
987    #[test]
988    fn set_delivery_ignores_empty_string() {
989        // A blank delivery must never clobber an already-recorded shape.
990        let mut c = RuleBypassCorpus::new("t");
991        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
992        assert!(c.set_delivery("R1", "p", "\"PathSegment\"".into()));
993        assert!(!c.set_delivery("R1", "p", String::new()));
994        assert_eq!(c.bypasses_for_rule("R1")[0].delivery, "\"PathSegment\"");
995    }
996
997    #[test]
998    fn delivery_round_trips_through_save_load() {
999        let dir = tempdir().expect("tempdir");
1000        let path = dir.path().join("c.json");
1001        let mut c = RuleBypassCorpus::new("cf:mr:cumulus");
1002        c.record_bypass("942100", "1 OR 1=1 --", cls("sql"), vec![], 9);
1003        c.set_delivery(
1004            "942100",
1005            "1 OR 1=1 --",
1006            "{\"HppSplit\":{\"param\":\"q\",\"parts\":3}}".into(),
1007        );
1008        c.save_atomic(&path).expect("save");
1009        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
1010        assert_eq!(
1011            r.bypasses_for_rule("942100")[0].delivery,
1012            "{\"HppSplit\":{\"param\":\"q\",\"parts\":3}}"
1013        );
1014    }
1015
1016    #[test]
1017    fn delivery_defaults_empty_for_corpus_without_the_field() {
1018        // Pre-delivery-capture corpus files have no `delivery` key. Prove
1019        // serde default keeps them loadable (LAW 2 backwards-compat) by
1020        // STRIPPING the key from a real serialization — robust against the
1021        // exact RuleId / PayloadClass JSON shape.
1022        let mut c = RuleBypassCorpus::new("t");
1023        c.record_bypass("R1", "old", cls("sql"), vec![], 1);
1024        let mut v: serde_json::Value =
1025            serde_json::from_str(&serde_json::to_string(&c).unwrap()).unwrap();
1026        for bucket in v["buckets"].as_object_mut().unwrap().values_mut() {
1027            for bp in bucket["bypassed"].as_array_mut().unwrap() {
1028                assert!(
1029                    bp.as_object_mut().unwrap().remove("delivery").is_some(),
1030                    "serialization must include the delivery key to strip"
1031                );
1032            }
1033        }
1034        let dir = tempdir().expect("tempdir");
1035        let path = dir.path().join("old.json");
1036        std::fs::write(&path, serde_json::to_string(&v).unwrap()).expect("write");
1037        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
1038        let b = &r.bypasses_for_rule("R1")[0];
1039        assert_eq!(b.payload, "old");
1040        assert_eq!(b.delivery, "", "missing delivery must default to empty");
1041    }
1042
1043    #[test]
1044    fn unexplored_rules_skips_ones_with_bypass() {
1045        let mut c = RuleBypassCorpus::new("t");
1046        c.record_block("R1", "p1", cls("sql"), vec![], 1);
1047        c.record_bypass("R2", "p2", cls("sql"), vec![], 2);
1048        // R1: 1 block, 0 bypasses → unexplored when threshold > 1.
1049        // R2: 0 blocks, 1 bypass → NOT unexplored.
1050        let unexplored = c.unexplored_rules(3);
1051        assert!(unexplored.contains(&"R1".to_string()));
1052        assert!(!unexplored.contains(&"R2".to_string()));
1053    }
1054
1055    #[test]
1056    fn rules_due_for_retry_respects_window() {
1057        let mut c = RuleBypassCorpus::new("t");
1058        c.record_block("R1", "p", cls("sql"), vec![], 1);
1059        // Drift now, ask for 60s window → should be present.
1060        c.mark_drift("R1");
1061        let due = c.rules_due_for_retry(60);
1062        assert_eq!(due, vec!["R1".to_string()]);
1063    }
1064
1065    #[test]
1066    fn rules_due_for_retry_skips_rules_with_no_blocks() {
1067        let mut c = RuleBypassCorpus::new("t");
1068        c.mark_drift("R1");
1069        // No blocks recorded — nothing to re-try.
1070        assert!(c.rules_due_for_retry(60).is_empty());
1071    }
1072
1073    #[test]
1074    fn total_counts_aggregate_across_rules() {
1075        let mut c = RuleBypassCorpus::new("t");
1076        c.record_block("R1", "p1", cls("sql"), vec![], 1);
1077        c.record_block("R2", "p2", cls("xss"), vec![], 2);
1078        c.record_bypass("R1", "p3", cls("sql"), vec![], 3);
1079        assert_eq!(c.total_blocks(), 2);
1080        assert_eq!(c.total_bypasses(), 1);
1081        assert_eq!(c.rules_seen(), 2);
1082    }
1083
1084    #[test]
1085    fn summary_breaks_down_by_class() {
1086        let mut c = RuleBypassCorpus::new("cf:mr:foo");
1087        c.record_block("R1", "p1", cls("sql"), vec![], 1);
1088        c.record_block("R1", "p2", cls("sql"), vec![], 2);
1089        c.record_block("R2", "p3", cls("xss"), vec![], 3);
1090        c.record_bypass("R1", "p4", cls("sql"), vec![], 4);
1091        let s = c.summary();
1092        assert_eq!(s.target_fingerprint, "cf:mr:foo");
1093        assert_eq!(s.rules_seen, 2);
1094        assert_eq!(s.total_blocks, 3);
1095        assert_eq!(s.total_bypasses, 1);
1096        let sql_stats = s.per_class.get("sql").unwrap();
1097        assert_eq!(sql_stats.blocks, 2);
1098        assert_eq!(sql_stats.bypasses, 1);
1099        let xss_stats = s.per_class.get("xss").unwrap();
1100        assert_eq!(xss_stats.blocks, 1);
1101        assert_eq!(xss_stats.bypasses, 0);
1102    }
1103
1104    #[test]
1105    fn save_load_round_trip() {
1106        let dir = tempdir().expect("tempdir");
1107        let path = dir.path().join("corpus.json");
1108        let mut c = RuleBypassCorpus::new("cf:mr:cumulus");
1109        c.record_block("942100", "payload-1", cls("sql"), vec!["url".into()], 1);
1110        c.record_bypass(
1111            "942100",
1112            "payload-2",
1113            cls("sql"),
1114            vec!["unicode".into(), "case".into()],
1115            2,
1116        );
1117        c.save_atomic(&path).expect("save");
1118
1119        let reloaded = RuleBypassCorpus::load_or_default(&path, "ignored");
1120        assert_eq!(reloaded.target_fingerprint, "cf:mr:cumulus");
1121        assert_eq!(reloaded.rules_seen(), 1);
1122        assert_eq!(reloaded.total_blocks(), 1);
1123        assert_eq!(reloaded.total_bypasses(), 1);
1124        let bp = &reloaded.bypasses_for_rule("942100")[0];
1125        assert_eq!(bp.payload, "payload-2");
1126        assert_eq!(
1127            bp.encoding_chain,
1128            vec!["unicode".to_string(), "case".to_string()]
1129        );
1130    }
1131
1132    #[test]
1133    fn load_missing_file_returns_default() {
1134        let dir = tempdir().expect("tempdir");
1135        let path = dir.path().join("nope.json");
1136        let c = RuleBypassCorpus::load_or_default(&path, "cf:mr:x");
1137        assert_eq!(c.target_fingerprint, "cf:mr:x");
1138        assert_eq!(c.rules_seen(), 0);
1139    }
1140
1141    #[test]
1142    fn load_corrupted_file_preserves_original_then_defaults() {
1143        // A non-empty, unparseable corpus must NOT be silently dropped.
1144        // The original bytes are moved aside to a `.corrupt-*` sidecar
1145        // (so a later save can't clobber them) and a fresh corpus is
1146        // returned. (Regression: the old behaviour returned an empty
1147        // corpus that the next save destroyed the original with.)
1148        let dir = tempdir().expect("tempdir");
1149        let path = dir.path().join("trash.json");
1150        let original = b"{not valid json !!! but represents 500 lost bypasses";
1151        std::fs::write(&path, original).expect("write");
1152
1153        let c = RuleBypassCorpus::load_or_default(&path, "fallback");
1154        assert_eq!(c.target_fingerprint, "fallback");
1155        assert_eq!(c.rules_seen(), 0);
1156
1157        // The original file was moved aside, not left where a save would
1158        // overwrite it, and the preserved bytes are intact.
1159        assert!(!path.exists(), "the unparseable file must be moved aside");
1160        let aside: Vec<_> = std::fs::read_dir(dir.path())
1161            .unwrap()
1162            .filter_map(Result::ok)
1163            .filter(|e| {
1164                e.file_name()
1165                    .to_string_lossy()
1166                    .contains("trash.json.corrupt-")
1167            })
1168            .collect();
1169        assert_eq!(aside.len(), 1, "exactly one preserved sidecar must exist");
1170        let preserved = std::fs::read(aside[0].path()).expect("read sidecar");
1171        assert_eq!(
1172            preserved, original,
1173            "preserved bytes must be byte-identical"
1174        );
1175    }
1176
1177    #[test]
1178    fn load_empty_file_returns_default_without_preserving() {
1179        // An empty file is equivalent to "no corpus yet" — a clean fresh
1180        // start, and crucially NO noisy `.corrupt-*` sidecar.
1181        let dir = tempdir().expect("tempdir");
1182        let path = dir.path().join("empty.json");
1183        std::fs::write(&path, b"").expect("write");
1184        let c = RuleBypassCorpus::load_or_default(&path, "fallback");
1185        assert_eq!(c.target_fingerprint, "fallback");
1186        let has_sidecar = std::fs::read_dir(dir.path())
1187            .unwrap()
1188            .filter_map(Result::ok)
1189            .any(|e| e.file_name().to_string_lossy().contains(".corrupt-"));
1190        assert!(!has_sidecar, "empty file must not spawn a preserve sidecar");
1191    }
1192
1193    #[test]
1194    fn save_atomic_backs_up_prior_corpus_before_overwrite() {
1195        // The save-side durability guard: overwriting an existing
1196        // non-empty corpus first snapshots it to `<path>.bak`, so one bad
1197        // save is always one step recoverable.
1198        let dir = tempdir().expect("tempdir");
1199        let path = dir.path().join("corpus.json");
1200
1201        let mut a = RuleBypassCorpus::new("cf:mr:cumulus");
1202        a.record_bypass("942100", "winner-A", cls("xss"), vec![], 1);
1203        a.save_atomic(&path).expect("save A");
1204
1205        // A second save (e.g. a regression that produced an EMPTY corpus)
1206        // must leave the prior good corpus recoverable in `.bak`.
1207        let empty = RuleBypassCorpus::new("cf:mr:cumulus");
1208        empty.save_atomic(&path).expect("save empty over A");
1209
1210        let bak = dir.path().join("corpus.json.bak");
1211        assert!(
1212            bak.exists(),
1213            "a .bak snapshot of the prior corpus must exist"
1214        );
1215        let recovered = RuleBypassCorpus::load_or_default(&bak, "ignored");
1216        assert_eq!(
1217            recovered.total_bypasses(),
1218            1,
1219            "the prior bypass must be recoverable from the .bak snapshot"
1220        );
1221        assert_eq!(recovered.bypasses_for_rule("942100")[0].payload, "winner-A");
1222    }
1223
1224    #[test]
1225    fn corrupt_then_save_does_not_destroy_preserved_bypasses() {
1226        // End-to-end: a corpus file goes corrupt, the recorder reloads
1227        // (gets a fresh corpus) and saves an empty one — the real bypasses
1228        // must still be on disk in the preserved sidecar. This is the
1229        // exact "corpus disappeared" sequence, now non-destructive.
1230        let dir = tempdir().expect("tempdir");
1231        let path = dir.path().join("corpus.json");
1232
1233        // A real corpus existed and was later corrupted (truncated mid
1234        // write by a crash, NFS hiccup, schema drift, ...).
1235        let mut real = RuleBypassCorpus::new("cf:mr:cumulus");
1236        for i in 0..50 {
1237            real.record_bypass("942100", &format!("bypass-{i}"), cls("xss"), vec![], i);
1238        }
1239        let real_bytes = serde_json::to_vec_pretty(&real).unwrap();
1240        // Simulate corruption: keep the (parseable-looking) bytes but break them.
1241        let mut corrupt = real_bytes.clone();
1242        corrupt.truncate(corrupt.len() / 2);
1243        std::fs::write(&path, &corrupt).expect("write corrupt");
1244
1245        // Recorder reloads → fresh corpus → saves empty.
1246        let fresh = RuleBypassCorpus::load_or_default(&path, "cf:mr:cumulus");
1247        assert_eq!(fresh.total_bypasses(), 0);
1248        fresh.save_atomic(&path).expect("save fresh");
1249
1250        // The corrupt bytes were preserved in a sidecar — not destroyed.
1251        let aside: Vec<_> = std::fs::read_dir(dir.path())
1252            .unwrap()
1253            .filter_map(Result::ok)
1254            .filter(|e| e.file_name().to_string_lossy().contains(".corrupt-"))
1255            .collect();
1256        assert_eq!(aside.len(), 1, "corrupt bytes must be preserved aside");
1257        assert_eq!(
1258            std::fs::read(aside[0].path()).unwrap(),
1259            corrupt,
1260            "preserved sidecar must hold the exact corrupt bytes for manual recovery"
1261        );
1262    }
1263
1264    #[test]
1265    fn save_creates_parent_directory() {
1266        let dir = tempdir().expect("tempdir");
1267        let nested = dir.path().join("deep/nested/path/corpus.json");
1268        let c = RuleBypassCorpus::new("t");
1269        c.save_atomic(&nested).expect("save creates parents");
1270        assert!(nested.exists());
1271    }
1272
1273    #[test]
1274    fn save_atomic_no_torn_write_on_existing_file() {
1275        // Pre-populate the target with garbage. save_atomic should
1276        // replace it with valid JSON, never leaving a partial state.
1277        let dir = tempdir().expect("tempdir");
1278        let path = dir.path().join("corpus.json");
1279        std::fs::write(&path, b"prior-garbage-bytes").expect("seed");
1280        let c = RuleBypassCorpus::new("cf:mr:t");
1281        c.save_atomic(&path).expect("save");
1282        let bytes = std::fs::read(&path).expect("read");
1283        // Should NOT contain the prior garbage.
1284        assert!(
1285            !std::str::from_utf8(&bytes)
1286                .unwrap()
1287                .contains("prior-garbage")
1288        );
1289    }
1290
1291    #[test]
1292    fn novel_bypasses_pending_submission_honors_dry_run() {
1293        let mut c = RuleBypassCorpus::new("t");
1294        c.record_bypass("R1", "fresh", cls("sql"), vec![], 1);
1295        // Just-recorded with default dry-run 24h → NOT ready.
1296        let pending = c.novel_bypasses_pending_submission(86400);
1297        assert!(pending.is_empty(), "fresh bypass should not be pending");
1298
1299        // With a 0-second dry-run, the same bypass IS ready.
1300        let pending = c.novel_bypasses_pending_submission(0);
1301        assert_eq!(pending.len(), 1);
1302        assert_eq!(pending[0].0, "R1");
1303    }
1304
1305    #[test]
1306    fn novel_bypasses_pending_submission_skips_already_submitted() {
1307        let mut c = RuleBypassCorpus::new("t");
1308        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
1309        c.set_submission(
1310            "R1",
1311            "p",
1312            SubmissionStatus::Submitted {
1313                report_id: "H1-X".into(),
1314            },
1315        );
1316        let pending = c.novel_bypasses_pending_submission(0);
1317        assert!(
1318            pending.is_empty(),
1319            "Submitted bypass should not appear pending"
1320        );
1321    }
1322
1323    #[test]
1324    fn novel_bypasses_pending_submission_honors_explicit_hold() {
1325        let mut c = RuleBypassCorpus::new("t");
1326        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
1327        // Explicit hold one hour in the future.
1328        let future = current_epoch_secs() + 3600;
1329        c.set_submission(
1330            "R1",
1331            "p",
1332            SubmissionStatus::DryRunHold {
1333                release_at_secs: future,
1334            },
1335        );
1336        let pending = c.novel_bypasses_pending_submission(0);
1337        assert!(pending.is_empty(), "explicit DryRunHold must be honored");
1338    }
1339
1340    #[test]
1341    fn schema_version_normalized_on_load() {
1342        // Simulate an older file without a schema_version.
1343        let raw = r#"{"target_fingerprint":"t","buckets":{}}"#;
1344        let dir = tempdir().expect("tempdir");
1345        let path = dir.path().join("c.json");
1346        std::fs::write(&path, raw).expect("write");
1347        let c = RuleBypassCorpus::load_or_default(&path, "ignored");
1348        assert_eq!(c.schema_version, CORPUS_SCHEMA_VERSION);
1349    }
1350
1351    #[test]
1352    fn sanitize_fingerprint_strips_path_separators() {
1353        assert_eq!(
1354            sanitize_fingerprint_for_filename("cf:managed-ruleset:host/foo"),
1355            "cf_managed-ruleset_host_foo"
1356        );
1357        // Backslashes AND dots become `_`; a `..` fingerprint can never produce
1358        // a `..`-bearing filename component (path-traversal hygiene).
1359        assert_eq!(
1360            sanitize_fingerprint_for_filename("..\\..\\evil"),
1361            "______evil"
1362        );
1363    }
1364
1365    #[test]
1366    fn sanitize_fingerprint_preserves_safe_chars() {
1367        // Only [A-Za-z0-9_-] pass through unchanged; dots map to `_`.
1368        assert_eq!(
1369            sanitize_fingerprint_for_filename("cf-managed_ruleset_v1"),
1370            "cf-managed_ruleset_v1"
1371        );
1372        // A dot-containing fingerprint segment gets its dots replaced.
1373        assert_eq!(
1374            sanitize_fingerprint_for_filename("cf-managed.ruleset_v1"),
1375            "cf-managed_ruleset_v1"
1376        );
1377    }
1378
1379    #[test]
1380    fn default_corpus_path_uses_fingerprint() {
1381        let p = default_corpus_path("cf:mr:x.com");
1382        let s = p.to_string_lossy();
1383        // dots in fingerprint are replaced by `_` in the filename
1384        assert!(s.contains("cf_mr_x_com"));
1385        assert!(s.ends_with(".json"));
1386    }
1387
1388    #[test]
1389    fn determinism_serialization_btree_order() {
1390        // BTreeMap iteration is deterministic — serializing the same
1391        // corpus twice must produce identical bytes.
1392        let mut c = RuleBypassCorpus::new("t");
1393        for i in (0..50).rev() {
1394            c.record_block(
1395                &format!("R{i}"),
1396                &format!("p{i}"),
1397                cls("sql"),
1398                vec![],
1399                i as u64,
1400            );
1401        }
1402        let a = serde_json::to_string(&c).unwrap();
1403        let b = serde_json::to_string(&c).unwrap();
1404        assert_eq!(a, b);
1405    }
1406
1407    #[test]
1408    fn description_field_persists() {
1409        let mut c = RuleBypassCorpus::new("t");
1410        c.record_block("942100", "p", cls("sql"), vec![], 1);
1411        c.bucket_mut("942100").description = Some("SQL injection — OWASP CRS 942100".into());
1412        let dir = tempdir().expect("tempdir");
1413        let path = dir.path().join("c.json");
1414        c.save_atomic(&path).expect("save");
1415        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
1416        let desc = r
1417            .buckets
1418            .get("942100")
1419            .and_then(|b| b.description.as_deref());
1420        assert_eq!(desc, Some("SQL injection — OWASP CRS 942100"));
1421    }
1422
1423    #[test]
1424    fn mark_drift_updates_timestamp() {
1425        let mut c = RuleBypassCorpus::new("t");
1426        c.record_block("R1", "p", cls("sql"), vec![], 1);
1427        c.mark_drift("R1");
1428        let t1 = c.buckets["R1"].last_drift_at_secs.unwrap();
1429        // Subsequent mark_drift updates (within 1s test, monotone).
1430        std::thread::sleep(std::time::Duration::from_millis(1100));
1431        c.mark_drift("R1");
1432        let t2 = c.buckets["R1"].last_drift_at_secs.unwrap();
1433        assert!(t2 >= t1);
1434    }
1435
1436    #[test]
1437    fn adversarial_large_chain_no_panic() {
1438        let big_chain: Vec<String> = (0..1000).map(|i| format!("technique-{i}")).collect();
1439        let mut c = RuleBypassCorpus::new("t");
1440        c.record_bypass("R1", "p", cls("sql"), big_chain.clone(), 1);
1441        assert_eq!(c.bypasses_for_rule("R1")[0].encoding_chain.len(), 1000);
1442    }
1443
1444    #[test]
1445    fn adversarial_huge_payload_no_panic() {
1446        let big = "A".repeat(1_000_000);
1447        let mut c = RuleBypassCorpus::new("t");
1448        c.record_block("R1", &big, cls("sql"), vec![], 1);
1449        // Verify round-trip through serde doesn't OOM on a 1MB payload.
1450        let dir = tempdir().expect("tempdir");
1451        let path = dir.path().join("c.json");
1452        c.save_atomic(&path).expect("save");
1453        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
1454        assert_eq!(r.blocked_for_rule("R1").len(), 1);
1455        assert_eq!(r.blocked_for_rule("R1")[0].payload.len(), 1_000_000);
1456    }
1457
1458    #[test]
1459    fn unicode_in_payload_round_trips() {
1460        let mut c = RuleBypassCorpus::new("t");
1461        c.record_bypass(
1462            "R1",
1463            "ＳＥＬＥＣＴ Ω 中文 \u{200B} \u{E0041}",
1464            cls("sql"),
1465            vec![],
1466            1,
1467        );
1468        let dir = tempdir().expect("tempdir");
1469        let path = dir.path().join("c.json");
1470        c.save_atomic(&path).expect("save");
1471        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
1472        let b = &r.bypasses_for_rule("R1")[0];
1473        assert!(b.payload.contains("ＳＥＬＥＣＴ"));
1474        assert!(b.payload.contains("中文"));
1475        assert!(b.payload.contains('\u{200B}'));
1476        assert!(b.payload.contains('\u{E0041}'));
1477    }
1478
1479    #[test]
1480    fn dedup_distinguishes_different_response_hashes() {
1481        let mut c = RuleBypassCorpus::new("t");
1482        c.record_block("R1", "p", cls("sql"), vec![], 1);
1483        c.record_block("R1", "p", cls("sql"), vec![], 2); // different hash
1484        // Same payload + different response = two separate observations
1485        // (the WAF may have returned different block pages).
1486        assert_eq!(c.blocked_for_rule("R1").len(), 2);
1487    }
1488
1489    // ====================================================================
1490    // Durability / preservation adversarial + boundary + property tests.
1491    //
1492    // Contract under test (corpus-durability fix):
1493    //   missing/empty/whitespace  -> fresh corpus, NO sidecar
1494    //   oversize-but-valid         -> recompact + keep (1 GiB ceiling)
1495    //   unparseable / IO / non-UTF8 -> move file aside to
1496    //       `<path>.corrupt-<epoch>` with BYTE-IDENTICAL content, then fresh
1497    //   save_atomic over non-empty prior -> snapshot prior to `<path>.bak`
1498    //
1499    // Every assertion checks real bytes / contents — never just !is_empty().
1500    // ====================================================================
1501
1502    /// Collect every `<base>.corrupt-*` sidecar in `dir`.
1503    fn corrupt_sidecars(dir: &Path, base: &str) -> Vec<PathBuf> {
1504        std::fs::read_dir(dir)
1505            .unwrap()
1506            .filter_map(Result::ok)
1507            .filter(|e| {
1508                let name = e.file_name().to_string_lossy().into_owned();
1509                name.starts_with(base) && name.contains(".corrupt-")
1510            })
1511            .map(|e| e.path())
1512            .collect()
1513    }
1514
1515    /// Assert the load preserved `original` byte-for-byte in exactly one
1516    /// sidecar, returned a fresh corpus, and moved the original path aside.
1517    fn assert_preserved_fresh(
1518        dir: &Path,
1519        path: &Path,
1520        base: &str,
1521        original: &[u8],
1522        fingerprint: &str,
1523    ) {
1524        let c = RuleBypassCorpus::load_or_default(path, fingerprint);
1525        assert_eq!(
1526            c.target_fingerprint, fingerprint,
1527            "fresh corpus uses fallback fp"
1528        );
1529        assert_eq!(c.rules_seen(), 0, "returned corpus must be fresh/empty");
1530        assert_eq!(c.total_bypasses(), 0);
1531        assert_eq!(c.total_blocks(), 0);
1532        assert!(
1533            !path.exists(),
1534            "the unreadable original must be moved aside"
1535        );
1536        let aside = corrupt_sidecars(dir, base);
1537        assert_eq!(aside.len(), 1, "exactly one preserved sidecar must exist");
1538        let preserved = std::fs::read(&aside[0]).expect("read sidecar");
1539        assert_eq!(
1540            preserved, original,
1541            "preserved sidecar bytes must be byte-identical to the original"
1542        );
1543    }
1544
1545    // ---- Preservation: unparseable / non-UTF8 / truncated / partial -----
1546
1547    #[test]
1548    fn preserve_non_utf8_file_byte_identical() {
1549        // Invalid UTF-8 fails inside read_capped_text -> read-failed branch ->
1550        // preserved aside. Bytes must survive exactly (a binary-corrupted
1551        // corpus is still recoverable material).
1552        let dir = tempdir().expect("tempdir");
1553        let path = dir.path().join("nonutf8.json");
1554        // Lone continuation + invalid lead bytes — definitively not UTF-8.
1555        let original: &[u8] = &[0x7B, 0xFF, 0xFE, 0x80, 0xC0, 0x22, 0x6B, 0x65, 0x79];
1556        std::fs::write(&path, original).expect("write");
1557        assert_preserved_fresh(dir.path(), &path, "nonutf8.json", original, "fb");
1558    }
1559
1560    #[test]
1561    fn preserve_truncated_mid_json_byte_identical() {
1562        // A write/crash truncated the file mid-token. Non-empty + unparseable
1563        // -> preserve aside, fresh corpus.
1564        let dir = tempdir().expect("tempdir");
1565        let path = dir.path().join("trunc.json");
1566        let original = br#"{"schema_version":1,"target_fingerprint":"cf:mr:x","buckets":{"942100":{"rule_id":{"#;
1567        std::fs::write(&path, original).expect("write");
1568        assert_preserved_fresh(dir.path(), &path, "trunc.json", original, "fb");
1569    }
1570
1571    #[test]
1572    fn preserve_lone_open_brace_byte_identical() {
1573        let dir = tempdir().expect("tempdir");
1574        let path = dir.path().join("brace.json");
1575        let original = b"{";
1576        std::fs::write(&path, original).expect("write");
1577        assert_preserved_fresh(dir.path(), &path, "brace.json", original, "fb");
1578    }
1579
1580    #[test]
1581    fn preserve_valid_json_wrong_schema_byte_identical() {
1582        // Syntactically valid JSON but NOT a corpus (missing the required
1583        // non-default `target_fingerprint` field) -> serde fails -> preserved.
1584        let dir = tempdir().expect("tempdir");
1585        let path = dir.path().join("wrongschema.json");
1586        let original = br#"{"completely":"different","shape":[1,2,3],"nested":{"a":true}}"#;
1587        std::fs::write(&path, original).expect("write");
1588        assert_preserved_fresh(dir.path(), &path, "wrongschema.json", original, "fb");
1589    }
1590
1591    #[test]
1592    fn preserve_json_array_instead_of_object_byte_identical() {
1593        // A top-level array is valid JSON but the wrong type for the corpus.
1594        let dir = tempdir().expect("tempdir");
1595        let path = dir.path().join("arr.json");
1596        let original = br#"["this","is","not","a","corpus"]"#;
1597        std::fs::write(&path, original).expect("write");
1598        assert_preserved_fresh(dir.path(), &path, "arr.json", original, "fb");
1599    }
1600
1601    #[test]
1602    fn preserve_garbage_text_byte_identical() {
1603        let dir = tempdir().expect("tempdir");
1604        let path = dir.path().join("garbage.json");
1605        let original = b"this is not json at all -- 500 lost bypasses live here\n\x01\x02";
1606        std::fs::write(&path, original).expect("write");
1607        assert_preserved_fresh(dir.path(), &path, "garbage.json", original, "fb");
1608    }
1609
1610    #[test]
1611    fn preserve_moves_aside_on_every_corruption_event() {
1612        // Each corrupt load moves the original path aside into a
1613        // `.corrupt-<epoch>` sidecar with its EXACT bytes, and always returns
1614        // a fresh corpus (the original is never left where a save could
1615        // overwrite it). NOTE: the sidecar name carries only epoch-SECOND
1616        // granularity, so two corruptions within the same wall-clock second
1617        // map to the same sidecar name and the second rename replaces the
1618        // first — i.e. at second resolution at least the most-recent corrupt
1619        // bytes are always recoverable. We assert that guaranteed property
1620        // (the latest corruption's exact bytes survive) plus the move-aside
1621        // and fresh-corpus invariants that hold on every event.
1622        let dir = tempdir().expect("tempdir");
1623        let path = dir.path().join("multi.json");
1624
1625        let first = b"FIRST corrupt corpus bytes !!!";
1626        std::fs::write(&path, first).expect("write 1");
1627        let c1 = RuleBypassCorpus::load_or_default(&path, "fb");
1628        assert_eq!(c1.rules_seen(), 0, "fresh corpus after first corruption");
1629        assert!(
1630            !path.exists(),
1631            "original moved aside after first corruption"
1632        );
1633
1634        let second = b"SECOND corrupt corpus bytes ???";
1635        std::fs::write(&path, second).expect("write 2");
1636        let c2 = RuleBypassCorpus::load_or_default(&path, "fb");
1637        assert_eq!(c2.rules_seen(), 0, "fresh corpus after second corruption");
1638        assert!(
1639            !path.exists(),
1640            "original moved aside after second corruption"
1641        );
1642
1643        // The latest corruption's exact bytes are always recoverable.
1644        let bytes: Vec<Vec<u8>> = corrupt_sidecars(dir.path(), "multi.json")
1645            .iter()
1646            .map(|p| std::fs::read(p).unwrap())
1647            .collect();
1648        assert!(
1649            bytes.iter().any(|b| b.as_slice() == second.as_slice()),
1650            "latest corruption's exact bytes must be preserved aside"
1651        );
1652    }
1653
1654    // ---- Empty / whitespace -> fresh, NO sidecar ------------------------
1655
1656    #[test]
1657    fn whitespace_only_file_is_fresh_no_sidecar() {
1658        let dir = tempdir().expect("tempdir");
1659        let path = dir.path().join("ws.json");
1660        std::fs::write(&path, b"   \n\t  \r\n   ").expect("write");
1661        let c = RuleBypassCorpus::load_or_default(&path, "fb");
1662        assert_eq!(c.target_fingerprint, "fb");
1663        assert_eq!(c.rules_seen(), 0);
1664        assert!(
1665            corrupt_sidecars(dir.path(), "ws.json").is_empty(),
1666            "whitespace-only file must NOT spawn a preserve sidecar"
1667        );
1668        // The whitespace file itself is treated as absent — left in place, not
1669        // moved aside (only unreadable/unparseable files are preserved-aside).
1670        assert!(path.exists(), "whitespace file is not moved aside");
1671    }
1672
1673    #[test]
1674    fn empty_file_leaves_no_sidecar_and_returns_fresh() {
1675        let dir = tempdir().expect("tempdir");
1676        let path = dir.path().join("zero.json");
1677        std::fs::write(&path, b"").expect("write");
1678        let c = RuleBypassCorpus::load_or_default(&path, "fb");
1679        assert_eq!(c.rules_seen(), 0);
1680        assert!(corrupt_sidecars(dir.path(), "zero.json").is_empty());
1681    }
1682
1683    // ---- save_atomic .bak behaviour ------------------------------------
1684
1685    #[test]
1686    fn bak_recovers_first_corpus_after_empty_second_save() {
1687        // First save writes real bypasses; a second (empty) save must leave
1688        // the FIRST corpus fully recoverable from `<path>.bak`.
1689        let dir = tempdir().expect("tempdir");
1690        let path = dir.path().join("c.json");
1691
1692        let mut first = RuleBypassCorpus::new("cf:mr:cumulus");
1693        first.record_bypass("942100", "winner-A", cls("xss"), vec!["b64".into()], 7);
1694        first.record_bypass("942100", "winner-B", cls("sql"), vec![], 8);
1695        first.record_block("942100", "blk", cls("sql"), vec![], 9);
1696        first.save_atomic(&path).expect("save first");
1697
1698        let empty = RuleBypassCorpus::new("cf:mr:cumulus");
1699        empty.save_atomic(&path).expect("save empty");
1700
1701        let bak = dir.path().join("c.json.bak");
1702        assert!(
1703            bak.exists(),
1704            ".bak must exist after overwriting a non-empty corpus"
1705        );
1706        let recovered = RuleBypassCorpus::load_or_default(&bak, "ignored");
1707        assert_eq!(
1708            recovered.total_bypasses(),
1709            2,
1710            "both prior bypasses recoverable"
1711        );
1712        assert_eq!(recovered.total_blocks(), 1, "prior block recoverable");
1713        let payloads: Vec<_> = recovered
1714            .bypasses_for_rule("942100")
1715            .iter()
1716            .map(|b| b.payload.clone())
1717            .collect();
1718        assert_eq!(
1719            payloads,
1720            vec!["winner-A".to_string(), "winner-B".to_string()]
1721        );
1722        assert_eq!(
1723            recovered.bypasses_for_rule("942100")[0].encoding_chain,
1724            vec!["b64".to_string()]
1725        );
1726    }
1727
1728    #[test]
1729    fn bak_skipped_when_no_prior_file() {
1730        // First-ever save (no prior file) must NOT create a .bak — there is
1731        // nothing to protect.
1732        let dir = tempdir().expect("tempdir");
1733        let path = dir.path().join("c.json");
1734        let mut c = RuleBypassCorpus::new("t");
1735        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
1736        c.save_atomic(&path).expect("save");
1737        assert!(
1738            !dir.path().join("c.json.bak").exists(),
1739            "no .bak on the first save (no prior file)"
1740        );
1741    }
1742
1743    #[test]
1744    fn bak_skipped_when_prior_file_empty() {
1745        // An empty prior file has nothing worth protecting — backup is skipped.
1746        let dir = tempdir().expect("tempdir");
1747        let path = dir.path().join("c.json");
1748        std::fs::write(&path, b"").expect("seed empty");
1749        let mut c = RuleBypassCorpus::new("t");
1750        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
1751        c.save_atomic(&path).expect("save over empty");
1752        assert!(
1753            !dir.path().join("c.json.bak").exists(),
1754            "empty prior file must not be backed up"
1755        );
1756    }
1757
1758    #[test]
1759    fn bak_holds_exact_prior_bytes() {
1760        // The .bak must be a byte-exact copy of the prior on-disk file, not a
1761        // re-serialization. Prove by comparing bytes captured before overwrite.
1762        let dir = tempdir().expect("tempdir");
1763        let path = dir.path().join("c.json");
1764        let mut first = RuleBypassCorpus::new("cf:mr:x");
1765        first.record_bypass("R1", "p", cls("sql"), vec![], 1);
1766        first.save_atomic(&path).expect("save first");
1767        let prior_bytes = std::fs::read(&path).expect("read prior");
1768
1769        let mut second = RuleBypassCorpus::new("cf:mr:x");
1770        second.record_bypass("R2", "q", cls("xss"), vec![], 2);
1771        second.save_atomic(&path).expect("save second");
1772
1773        let bak_bytes = std::fs::read(dir.path().join("c.json.bak")).expect("read bak");
1774        assert_eq!(
1775            bak_bytes, prior_bytes,
1776            ".bak must be a byte-exact snapshot of the prior file"
1777        );
1778    }
1779
1780    #[test]
1781    fn bak_round_trips_then_main_continues() {
1782        // After a bad empty save, recover from .bak, re-save it, and confirm
1783        // the corpus is whole again on the main path.
1784        let dir = tempdir().expect("tempdir");
1785        let path = dir.path().join("c.json");
1786        let mut good = RuleBypassCorpus::new("cf:mr:x");
1787        good.record_bypass("R1", "keep-me", cls("sql"), vec![], 1);
1788        good.save_atomic(&path).expect("save good");
1789        RuleBypassCorpus::new("cf:mr:x")
1790            .save_atomic(&path)
1791            .expect("save empty");
1792
1793        let bak = dir.path().join("c.json.bak");
1794        let recovered = RuleBypassCorpus::load_or_default(&bak, "x");
1795        recovered.save_atomic(&path).expect("restore");
1796        let reloaded = RuleBypassCorpus::load_or_default(&path, "x");
1797        assert_eq!(reloaded.bypasses_for_rule("R1").len(), 1);
1798        assert_eq!(reloaded.bypasses_for_rule("R1")[0].payload, "keep-me");
1799    }
1800
1801    // ---- End-to-end "corpus disappeared" with NON-UTF8 corruption ------
1802
1803    #[test]
1804    fn end_to_end_corpus_disappeared_non_utf8() {
1805        // Real corpus on disk -> binary corruption (non-UTF8) -> reload (fresh)
1806        // -> save empty over it. The corrupt bytes must be preserved in a
1807        // sidecar; the empty save must NOT have destroyed recoverable bytes.
1808        let dir = tempdir().expect("tempdir");
1809        let path = dir.path().join("corpus.json");
1810
1811        let mut real = RuleBypassCorpus::new("cf:mr:cumulus");
1812        for i in 0..30 {
1813            real.record_bypass("942100", &format!("bypass-{i}"), cls("xss"), vec![], i);
1814        }
1815        real.save_atomic(&path).expect("save real");
1816
1817        // Corrupt with raw non-UTF8 bytes (e.g. partial NFS write of binary).
1818        let corrupt: &[u8] = &[0x00, 0xFF, 0x80, 0x7B, 0xC3, 0x28, 0x42];
1819        std::fs::write(&path, corrupt).expect("corrupt");
1820
1821        let fresh = RuleBypassCorpus::load_or_default(&path, "cf:mr:cumulus");
1822        assert_eq!(fresh.total_bypasses(), 0);
1823        fresh.save_atomic(&path).expect("save fresh empty");
1824
1825        let aside = corrupt_sidecars(dir.path(), "corpus.json");
1826        assert_eq!(aside.len(), 1, "corrupt non-UTF8 bytes preserved aside");
1827        assert_eq!(
1828            std::fs::read(&aside[0]).unwrap(),
1829            corrupt,
1830            "sidecar holds the exact corrupt bytes"
1831        );
1832    }
1833
1834    // ---- Per-bucket heal on load (mixed) -------------------------------
1835
1836    #[test]
1837    fn heal_truncates_blocked_but_keeps_all_bypasses() {
1838        let dir = tempdir().expect("tempdir");
1839        let path = dir.path().join("c.json");
1840        let mut c = RuleBypassCorpus::new("heal");
1841        let over = RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET + 100;
1842        let blocked: Vec<RecordedAttempt> = (0..over)
1843            .map(|i| RecordedAttempt {
1844                payload: format!("blk{i}"),
1845                payload_class: cls("sql"),
1846                encoding_chain: vec![],
1847                response_hash: i as u64,
1848                observed_at_secs: 0,
1849            })
1850            .collect();
1851        // Under-cap bypasses must all survive the heal untouched.
1852        let bypassed: Vec<RecordedBypass> = (0..10)
1853            .map(|i| RecordedBypass {
1854                payload: format!("by{i}"),
1855                payload_class: cls("xss"),
1856                encoding_chain: vec![],
1857                response_hash: 1_000 + i as u64,
1858                observed_at_secs: 0,
1859                submission: SubmissionStatus::Queued,
1860                delivery: String::new(),
1861            })
1862            .collect();
1863        c.buckets.insert(
1864            "r".into(),
1865            RuleBucket {
1866                blocked,
1867                bypassed,
1868                ..RuleBucket::default()
1869            },
1870        );
1871        c.save_atomic(&path).expect("save");
1872
1873        let healed = RuleBypassCorpus::load_or_default(&path, "heal");
1874        assert_eq!(
1875            healed.blocked_for_rule("r").len(),
1876            RuleBypassCorpus::MAX_BLOCKED_PER_BUCKET
1877        );
1878        // The earliest blocked sample is kept (truncate keeps the prefix).
1879        assert_eq!(healed.blocked_for_rule("r")[0].payload, "blk0");
1880        assert_eq!(
1881            healed.bypasses_for_rule("r").len(),
1882            10,
1883            "under-cap bypasses untouched"
1884        );
1885        assert_eq!(healed.bypasses_for_rule("r")[9].payload, "by9");
1886    }
1887
1888    #[test]
1889    fn heal_leaves_under_cap_bucket_untouched() {
1890        let dir = tempdir().expect("tempdir");
1891        let path = dir.path().join("c.json");
1892        let mut c = RuleBypassCorpus::new("t");
1893        for i in 0..5 {
1894            c.record_block("r", &format!("b{i}"), cls("sql"), vec![], i);
1895            c.record_bypass("r", &format!("p{i}"), cls("sql"), vec![], 100 + i);
1896        }
1897        c.save_atomic(&path).expect("save");
1898        let healed = RuleBypassCorpus::load_or_default(&path, "t");
1899        assert_eq!(healed.blocked_for_rule("r").len(), 5);
1900        assert_eq!(healed.bypasses_for_rule("r").len(), 5);
1901        // Order + exact payloads preserved.
1902        assert_eq!(healed.blocked_for_rule("r")[4].payload, "b4");
1903        assert_eq!(healed.bypasses_for_rule("r")[0].payload, "p0");
1904    }
1905
1906    #[test]
1907    fn heal_truncated_bypassed_keeps_blocked_and_prefix() {
1908        let dir = tempdir().expect("tempdir");
1909        let path = dir.path().join("c.json");
1910        let mut c = RuleBypassCorpus::new("t");
1911        let over = RuleBypassCorpus::MAX_BYPASSED_PER_BUCKET + 17;
1912        let bypassed: Vec<RecordedBypass> = (0..over)
1913            .map(|i| RecordedBypass {
1914                payload: format!("by{i}"),
1915                payload_class: cls("sql"),
1916                encoding_chain: vec![],
1917                response_hash: i as u64,
1918                observed_at_secs: 0,
1919                submission: SubmissionStatus::Queued,
1920                delivery: String::new(),
1921            })
1922            .collect();
1923        c.buckets.insert(
1924            "r".into(),
1925            RuleBucket {
1926                bypassed,
1927                ..RuleBucket::default()
1928            },
1929        );
1930        c.bucket_mut("r").blocked.push(RecordedAttempt {
1931            payload: "survivor".into(),
1932            payload_class: cls("sql"),
1933            encoding_chain: vec![],
1934            response_hash: 9,
1935            observed_at_secs: 0,
1936        });
1937        c.save_atomic(&path).expect("save");
1938        let healed = RuleBypassCorpus::load_or_default(&path, "t");
1939        assert_eq!(
1940            healed.bypasses_for_rule("r").len(),
1941            RuleBypassCorpus::MAX_BYPASSED_PER_BUCKET
1942        );
1943        assert_eq!(
1944            healed.bypasses_for_rule("r")[0].payload,
1945            "by0",
1946            "kept prefix"
1947        );
1948        assert_eq!(healed.blocked_for_rule("r").len(), 1);
1949        assert_eq!(healed.blocked_for_rule("r")[0].payload, "survivor");
1950    }
1951
1952    // ---- schema_version normalization ----------------------------------
1953
1954    #[test]
1955    fn schema_version_zero_normalized_to_current() {
1956        // An explicit `schema_version: 0` must be upgraded to the current
1957        // version on load (0 is the serde-default sentinel for "old file").
1958        let dir = tempdir().expect("tempdir");
1959        let path = dir.path().join("c.json");
1960        let raw = r#"{"schema_version":0,"target_fingerprint":"t","buckets":{}}"#;
1961        std::fs::write(&path, raw).expect("write");
1962        let c = RuleBypassCorpus::load_or_default(&path, "ignored");
1963        assert_eq!(c.schema_version, CORPUS_SCHEMA_VERSION);
1964        assert_eq!(
1965            c.target_fingerprint, "t",
1966            "embedded fingerprint wins for valid file"
1967        );
1968    }
1969
1970    #[test]
1971    fn schema_version_missing_normalized_to_current() {
1972        let dir = tempdir().expect("tempdir");
1973        let path = dir.path().join("c.json");
1974        // No schema_version key at all -> serde default 0 -> normalized.
1975        let raw = r#"{"target_fingerprint":"emb","buckets":{}}"#;
1976        std::fs::write(&path, raw).expect("write");
1977        let c = RuleBypassCorpus::load_or_default(&path, "ignored");
1978        assert_eq!(c.schema_version, CORPUS_SCHEMA_VERSION);
1979        assert_eq!(c.target_fingerprint, "emb");
1980    }
1981
1982    #[test]
1983    fn valid_file_fingerprint_overrides_fallback() {
1984        // When the file is valid, its embedded fingerprint wins; the passed
1985        // fallback fingerprint is ignored.
1986        let dir = tempdir().expect("tempdir");
1987        let path = dir.path().join("c.json");
1988        let mut c = RuleBypassCorpus::new("embedded-fp");
1989        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
1990        c.save_atomic(&path).expect("save");
1991        let r = RuleBypassCorpus::load_or_default(&path, "fallback-should-be-ignored");
1992        assert_eq!(r.target_fingerprint, "embedded-fp");
1993    }
1994
1995    // ---- delivery defaults / set_delivery edge cases -------------------
1996
1997    #[test]
1998    fn old_corpus_loads_with_default_delivery_for_every_bypass() {
1999        // Multiple bypasses, none with a `delivery` key — all must default
2000        // to "" and remain fully intact otherwise.
2001        let dir = tempdir().expect("tempdir");
2002        let path = dir.path().join("old.json");
2003        let mut c = RuleBypassCorpus::new("t");
2004        c.record_bypass("R1", "a", cls("sql"), vec!["x".into()], 1);
2005        c.record_bypass("R1", "b", cls("xss"), vec![], 2);
2006        let mut v: serde_json::Value =
2007            serde_json::from_str(&serde_json::to_string(&c).unwrap()).unwrap();
2008        for bucket in v["buckets"].as_object_mut().unwrap().values_mut() {
2009            for bp in bucket["bypassed"].as_array_mut().unwrap() {
2010                bp.as_object_mut().unwrap().remove("delivery");
2011            }
2012        }
2013        std::fs::write(&path, serde_json::to_string(&v).unwrap()).expect("write");
2014        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2015        let bps = r.bypasses_for_rule("R1");
2016        assert_eq!(bps.len(), 2);
2017        assert_eq!(bps[0].delivery, "");
2018        assert_eq!(bps[1].delivery, "");
2019        assert_eq!(bps[0].encoding_chain, vec!["x".to_string()]);
2020    }
2021
2022    #[test]
2023    fn set_delivery_overwrites_existing_shape_with_non_empty() {
2024        let mut c = RuleBypassCorpus::new("t");
2025        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
2026        assert!(c.set_delivery("R1", "p", "\"first\"".into()));
2027        assert!(c.set_delivery("R1", "p", "\"second\"".into()));
2028        assert_eq!(c.bypasses_for_rule("R1")[0].delivery, "\"second\"");
2029    }
2030
2031    #[test]
2032    fn set_delivery_empty_on_missing_bucket_returns_false() {
2033        // Empty delivery short-circuits to false even before bucket lookup.
2034        let mut c = RuleBypassCorpus::new("t");
2035        assert!(!c.set_delivery("nope", "p", String::new()));
2036    }
2037
2038    #[test]
2039    fn set_submission_empty_corpus_returns_false() {
2040        let mut c = RuleBypassCorpus::new("t");
2041        assert!(!c.set_submission("R1", "p", SubmissionStatus::Queued));
2042    }
2043
2044    #[test]
2045    fn set_submission_bucket_exists_but_payload_absent_returns_false() {
2046        let mut c = RuleBypassCorpus::new("t");
2047        c.record_bypass("R1", "present", cls("sql"), vec![], 1);
2048        assert!(
2049            !c.set_submission(
2050                "R1",
2051                "absent",
2052                SubmissionStatus::Accepted {
2053                    report_id: "X".into()
2054                }
2055            ),
2056            "wrong payload in an existing bucket must not match"
2057        );
2058        // The real bypass is untouched.
2059        assert!(matches!(
2060            c.bypasses_for_rule("R1")[0].submission,
2061            SubmissionStatus::Queued
2062        ));
2063    }
2064
2065    #[test]
2066    fn submission_status_round_trips_all_variants() {
2067        // Each lifecycle variant must serialize + deserialize losslessly so a
2068        // mid-flight bounty status survives a save/load cycle.
2069        let dir = tempdir().expect("tempdir");
2070        let path = dir.path().join("c.json");
2071        let mut c = RuleBypassCorpus::new("t");
2072        let variants = [
2073            ("p0", SubmissionStatus::Queued),
2074            (
2075                "p1",
2076                SubmissionStatus::DryRunHold {
2077                    release_at_secs: 1234,
2078                },
2079            ),
2080            (
2081                "p2",
2082                SubmissionStatus::Submitted {
2083                    report_id: "H1-1".into(),
2084                },
2085            ),
2086            (
2087                "p3",
2088                SubmissionStatus::Accepted {
2089                    report_id: "H1-2".into(),
2090                },
2091            ),
2092            (
2093                "p4",
2094                SubmissionStatus::Duplicate {
2095                    duplicate_of: "H1-3".into(),
2096                },
2097            ),
2098            (
2099                "p5",
2100                SubmissionStatus::Rejected {
2101                    reason: "informative".into(),
2102                },
2103            ),
2104        ];
2105        for (p, _) in &variants {
2106            c.record_bypass("R1", p, cls("sql"), vec![], 0);
2107        }
2108        for (p, st) in &variants {
2109            assert!(c.set_submission("R1", p, st.clone()));
2110        }
2111        c.save_atomic(&path).expect("save");
2112        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2113        let by_payload: BTreeMap<_, _> = r
2114            .bypasses_for_rule("R1")
2115            .iter()
2116            .map(|b| (b.payload.clone(), b.submission.clone()))
2117            .collect();
2118        assert_eq!(
2119            by_payload["p1"],
2120            SubmissionStatus::DryRunHold {
2121                release_at_secs: 1234
2122            }
2123        );
2124        assert_eq!(
2125            by_payload["p2"],
2126            SubmissionStatus::Submitted {
2127                report_id: "H1-1".into()
2128            }
2129        );
2130        assert_eq!(
2131            by_payload["p4"],
2132            SubmissionStatus::Duplicate {
2133                duplicate_of: "H1-3".into()
2134            }
2135        );
2136        assert_eq!(
2137            by_payload["p5"],
2138            SubmissionStatus::Rejected {
2139                reason: "informative".into()
2140            }
2141        );
2142    }
2143
2144    // ---- Determinism / property tests ----------------------------------
2145
2146    #[test]
2147    fn determinism_identical_serialization_after_save_load() {
2148        // Serializing the SAME corpus twice is byte-identical, and a
2149        // save/load round-trip re-serializes identically (BTreeMap order).
2150        let dir = tempdir().expect("tempdir");
2151        let path = dir.path().join("c.json");
2152        let mut c = RuleBypassCorpus::new("t");
2153        for i in (0..40).rev() {
2154            c.record_bypass(&format!("R{i:03}"), &format!("p{i}"), cls("sql"), vec![], i);
2155        }
2156        let s1 = serde_json::to_string(&c).unwrap();
2157        let s2 = serde_json::to_string(&c).unwrap();
2158        assert_eq!(s1, s2);
2159        c.save_atomic(&path).expect("save");
2160        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2161        // Bucket iteration order is sorted regardless of insertion order.
2162        let keys: Vec<_> = r.buckets.keys().cloned().collect();
2163        let mut sorted = keys.clone();
2164        sorted.sort();
2165        assert_eq!(keys, sorted, "BTreeMap keys must iterate in sorted order");
2166    }
2167
2168    #[test]
2169    fn btreemap_order_independent_of_insertion_order() {
2170        // Two corpora with the same rules inserted in opposite orders must
2171        // serialize identically.
2172        let mut a = RuleBypassCorpus::new("t");
2173        let mut b = RuleBypassCorpus::new("t");
2174        let ids = ["R5", "R1", "R9", "R3", "R7"];
2175        for id in ids {
2176            a.record_block(id, "p", cls("sql"), vec![], 1);
2177        }
2178        for id in ids.iter().rev() {
2179            b.record_block(id, "p", cls("sql"), vec![], 1);
2180        }
2181        assert_eq!(
2182            serde_json::to_string(&a).unwrap(),
2183            serde_json::to_string(&b).unwrap(),
2184            "BTreeMap makes serialization insertion-order-independent"
2185        );
2186    }
2187
2188    #[test]
2189    fn unicode_payload_round_trips_with_exact_bytes() {
2190        let dir = tempdir().expect("tempdir");
2191        let path = dir.path().join("c.json");
2192        let payload = "𝕊𝔼𝕃𝔼ℂ𝕋 ' OR 𝟙=𝟙 -- 中文 \u{200B}\u{FEFF}\u{1F4A9} emoji";
2193        let mut c = RuleBypassCorpus::new("t");
2194        c.record_bypass("R1", payload, cls("sql"), vec![], 1);
2195        c.save_atomic(&path).expect("save");
2196        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2197        assert_eq!(
2198            r.bypasses_for_rule("R1")[0].payload,
2199            payload,
2200            "unicode payload exact"
2201        );
2202    }
2203
2204    #[test]
2205    fn one_mb_bypass_payload_round_trips_no_oom() {
2206        let dir = tempdir().expect("tempdir");
2207        let path = dir.path().join("c.json");
2208        let big = "A".repeat(1_200_000);
2209        let mut c = RuleBypassCorpus::new("t");
2210        c.record_bypass("R1", &big, cls("sql"), vec![], 1);
2211        c.save_atomic(&path).expect("save");
2212        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2213        assert_eq!(r.bypasses_for_rule("R1")[0].payload.len(), 1_200_000);
2214        assert!(
2215            r.bypasses_for_rule("R1")[0]
2216                .payload
2217                .bytes()
2218                .all(|b| b == b'A')
2219        );
2220    }
2221
2222    #[test]
2223    fn huge_encoding_chain_round_trips() {
2224        let dir = tempdir().expect("tempdir");
2225        let path = dir.path().join("c.json");
2226        let chain: Vec<String> = (0..5000).map(|i| format!("t{i}")).collect();
2227        let mut c = RuleBypassCorpus::new("t");
2228        c.record_bypass("R1", "p", cls("sql"), chain.clone(), 1);
2229        c.save_atomic(&path).expect("save");
2230        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2231        let got = &r.bypasses_for_rule("R1")[0].encoding_chain;
2232        assert_eq!(got.len(), 5000);
2233        assert_eq!(got[0], "t0");
2234        assert_eq!(got[4999], "t4999");
2235    }
2236
2237    #[test]
2238    fn dedup_bypass_by_response_hash_and_payload_property() {
2239        // Property: across N records, the stored count equals the number of
2240        // distinct (response_hash, payload) pairs, never more.
2241        let mut c = RuleBypassCorpus::new("t");
2242        // 3 distinct pairs, each recorded several times in interleaved order.
2243        let inputs = [
2244            ("p", 1u64),
2245            ("q", 1),
2246            ("p", 2),
2247            ("p", 1),
2248            ("q", 1),
2249            ("p", 2),
2250            ("p", 2),
2251        ];
2252        for (p, h) in inputs {
2253            c.record_bypass("R1", p, cls("sql"), vec![], h);
2254        }
2255        assert_eq!(
2256            c.bypasses_for_rule("R1").len(),
2257            3,
2258            "only distinct (hash,payload) survive"
2259        );
2260        // Same payload different hash are distinct entries.
2261        let pairs: std::collections::BTreeSet<(String, u64)> = c
2262            .bypasses_for_rule("R1")
2263            .iter()
2264            .map(|b| (b.payload.clone(), b.response_hash))
2265            .collect();
2266        assert!(pairs.contains(&("p".to_string(), 1)));
2267        assert!(pairs.contains(&("p".to_string(), 2)));
2268        assert!(pairs.contains(&("q".to_string(), 1)));
2269    }
2270
2271    #[test]
2272    fn drift_timestamp_monotonic_across_remarks() {
2273        let mut c = RuleBypassCorpus::new("t");
2274        c.record_block("R1", "p", cls("sql"), vec![], 1);
2275        c.mark_drift("R1");
2276        let t1 = c.buckets["R1"].last_drift_at_secs.unwrap();
2277        // Re-marking never moves the timestamp backwards (epoch is monotone).
2278        c.mark_drift("R1");
2279        let t2 = c.buckets["R1"].last_drift_at_secs.unwrap();
2280        assert!(t2 >= t1, "drift timestamp must be monotonic non-decreasing");
2281    }
2282
2283    #[test]
2284    fn first_save_writes_current_schema_version_to_disk() {
2285        // save_atomic must stamp CORPUS_SCHEMA_VERSION regardless of the
2286        // in-memory value, so a corpus constructed with version 0 is healed
2287        // on its first write.
2288        let dir = tempdir().expect("tempdir");
2289        let path = dir.path().join("c.json");
2290        let mut c = RuleBypassCorpus::new("t");
2291        c.schema_version = 0; // force a stale value
2292        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
2293        c.save_atomic(&path).expect("save");
2294        let v: serde_json::Value =
2295            serde_json::from_str(&std::fs::read_to_string(&path).unwrap()).unwrap();
2296        assert_eq!(
2297            v["schema_version"].as_u64().unwrap(),
2298            u64::from(CORPUS_SCHEMA_VERSION)
2299        );
2300    }
2301
2302    #[test]
2303    fn save_stamps_last_saved_at_secs() {
2304        let dir = tempdir().expect("tempdir");
2305        let path = dir.path().join("c.json");
2306        let c = RuleBypassCorpus::new("t");
2307        assert_eq!(c.last_saved_at_secs, 0);
2308        c.save_atomic(&path).expect("save");
2309        let r = RuleBypassCorpus::load_or_default(&path, "t");
2310        assert!(
2311            r.last_saved_at_secs > 0,
2312            "save must stamp a real epoch second"
2313        );
2314    }
2315
2316    #[test]
2317    fn valid_oversize_under_ceiling_is_preserved_not_dropped() {
2318        // A large-but-valid corpus (well under the 1 GiB ceiling) must load
2319        // intact — never preserved-aside. Build a multi-MB valid file.
2320        let dir = tempdir().expect("tempdir");
2321        let path = dir.path().join("c.json");
2322        let mut c = RuleBypassCorpus::new("t");
2323        // ~3 MB of valid bypasses across buckets (under per-bucket caps).
2324        for r in 0..30 {
2325            for i in 0..50 {
2326                c.record_bypass(
2327                    &format!("R{r}"),
2328                    &format!("{}-{r}-{i}", "X".repeat(2000)),
2329                    cls("sql"),
2330                    vec![],
2331                    (r * 1000 + i) as u64,
2332                );
2333            }
2334        }
2335        c.save_atomic(&path).expect("save");
2336        let on_disk = std::fs::metadata(&path).unwrap().len();
2337        assert!(on_disk > 1_000_000, "test corpus should be multi-MB");
2338        let r = RuleBypassCorpus::load_or_default(&path, "ignored");
2339        assert_eq!(
2340            r.total_bypasses(),
2341            30 * 50,
2342            "all valid bypasses load intact"
2343        );
2344        assert!(
2345            corrupt_sidecars(dir.path(), "c.json").is_empty(),
2346            "valid file never preserved-aside"
2347        );
2348    }
2349
2350    #[test]
2351    fn save_atomic_leaves_no_tempfiles_behind() {
2352        // The atomic writer's temp file must be renamed into place, leaving
2353        // only the corpus (and possibly a .bak) in the directory.
2354        let dir = tempdir().expect("tempdir");
2355        let path = dir.path().join("c.json");
2356        let mut c = RuleBypassCorpus::new("t");
2357        c.record_bypass("R1", "p", cls("sql"), vec![], 1);
2358        c.save_atomic(&path).expect("save");
2359        let entries: Vec<String> = std::fs::read_dir(dir.path())
2360            .unwrap()
2361            .filter_map(Result::ok)
2362            .map(|e| e.file_name().to_string_lossy().into_owned())
2363            .collect();
2364        assert!(entries.contains(&"c.json".to_string()));
2365        assert!(
2366            entries.iter().all(|n| n == "c.json" || n == "c.json.bak"),
2367            "no stray temp files left behind, got: {entries:?}"
2368        );
2369    }
2370
2371    #[test]
2372    fn empty_buckets_and_blocks_persist_exact_counts() {
2373        // A corpus with rules that have ONLY blocks, ONLY bypasses, or a mix
2374        // round-trips with exact per-rule counts.
2375        let dir = tempdir().expect("tempdir");
2376        let path = dir.path().join("c.json");
2377        let mut c = RuleBypassCorpus::new("t");
2378        c.record_block("only-block", "b", cls("sql"), vec![], 1);
2379        c.record_bypass("only-bypass", "p", cls("xss"), vec![], 2);
2380        c.record_block("mixed", "b", cls("cmd"), vec![], 3);
2381        c.record_bypass("mixed", "p", cls("cmd"), vec![], 4);
2382        c.save_atomic(&path).expect("save");
2383        let r = RuleBypassCorpus::load_or_default(&path, "t");
2384        assert_eq!(r.blocked_for_rule("only-block").len(), 1);
2385        assert_eq!(r.bypasses_for_rule("only-block").len(), 0);
2386        assert_eq!(r.bypasses_for_rule("only-bypass").len(), 1);
2387        assert_eq!(r.blocked_for_rule("only-bypass").len(), 0);
2388        assert_eq!(r.blocked_for_rule("mixed").len(), 1);
2389        assert_eq!(r.bypasses_for_rule("mixed").len(), 1);
2390        assert_eq!(r.rules_seen(), 3);
2391    }
2392}
wafrift_evolution/rule_corpus.rs

wafrift_evolution/
rule_corpus.rs