Skip to main content

wafrift_evolution/
lineage.rs

1//! Lineage tracking for replayable bypass discovery.
2
3use crate::evolution::Chromosome;
4use serde::{Deserialize, Serialize};
5use std::sync::Arc;
6
7/// A single mutation operation log entry.
8#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
9pub struct MutationOp {
10    /// Gene name that was mutated.
11    pub gene_name: String,
12    /// Previous value.
13    pub from: String,
14    /// New value.
15    pub to: String,
16    /// Mutation operator name.
17    pub operator: String,
18}
19
20/// Compact, transitive-closure-safe snapshot of a parent chromosome's
21/// gene tuple. Stored inside `Lineage::Crossover` / `Lineage::Mutation`
22/// instead of `Arc<Chromosome>` so the lineage tree of a long-running
23/// scan is bounded by `O(genes per chromosome)` per ancestor instead
24/// of `O(full ancestry chain)` — the earlier full-Chromosome arcs
25/// transitively dragged the parent's own `Lineage` field along, so
26/// every grandchild kept its grandparents alive forever and a long
27/// scan would OOM.
28#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
29pub struct ParentSnapshot {
30    pub genes: Vec<(String, String)>,
31}
32
33impl ParentSnapshot {
34    fn from_chromosome(c: &Chromosome) -> Self {
35        Self {
36            genes: c.genes.clone(),
37        }
38    }
39}
40
41/// Lineage of a chromosome: how it was derived from seeds.
42#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
43pub enum Lineage {
44    /// Original randomly-generated chromosome.
45    Genesis {
46        /// Generation when created.
47        generation: u32,
48    },
49    /// Created via crossover of two parents.
50    Crossover {
51        /// Parent A snapshot — genes only, breaks ancestry chain.
52        parent_a: Arc<ParentSnapshot>,
53        /// Parent B snapshot — genes only, breaks ancestry chain.
54        parent_b: Arc<ParentSnapshot>,
55        /// Strategy used.
56        strategy: String,
57        /// Generation when created.
58        generation: u32,
59    },
60    /// Created via mutation of a single parent.
61    Mutation {
62        /// Parent snapshot — genes only, breaks ancestry chain.
63        parent: Arc<ParentSnapshot>,
64        /// Log of applied mutation operations.
65        log: Vec<MutationOp>,
66        /// Generation when created.
67        generation: u32,
68    },
69}
70
71impl Lineage {
72    /// Create a genesis lineage.
73    #[must_use]
74    pub fn genesis(generation: u32) -> Self {
75        Self::Genesis { generation }
76    }
77
78    /// Create a crossover lineage.
79    #[must_use]
80    pub fn crossover(
81        parent_a: &Chromosome,
82        parent_b: &Chromosome,
83        strategy: &str,
84        generation: u32,
85    ) -> Self {
86        Self::Crossover {
87            parent_a: Arc::new(ParentSnapshot::from_chromosome(parent_a)),
88            parent_b: Arc::new(ParentSnapshot::from_chromosome(parent_b)),
89            strategy: strategy.to_string(),
90            generation,
91        }
92    }
93
94    /// Create a mutation lineage.
95    #[must_use]
96    pub fn mutation(parent: &Chromosome, log: Vec<MutationOp>, generation: u32) -> Self {
97        Self::Mutation {
98            parent: Arc::new(ParentSnapshot::from_chromosome(parent)),
99            log,
100            generation,
101        }
102    }
103
104    /// Serialize lineage to a compact string representation.
105    #[must_use]
106    pub fn to_trace(&self) -> String {
107        match self {
108            Self::Genesis { generation } => format!("genesis[gen={generation}]"),
109            Self::Crossover {
110                parent_a,
111                parent_b,
112                strategy,
113                generation,
114            } => {
115                format!(
116                    "crossover[gen={generation},strategy={strategy},a={{{}}},b={{{}}}]",
117                    genes_to_string(&parent_a.genes),
118                    genes_to_string(&parent_b.genes)
119                )
120            }
121            Self::Mutation {
122                parent,
123                log,
124                generation,
125            } => {
126                let ops: Vec<String> = log
127                    .iter()
128                    .map(|op| format!("{}:{}->{}[{}]", op.gene_name, op.from, op.to, op.operator))
129                    .collect();
130                format!(
131                    "mutation[gen={generation},parent={{{}}},ops=[{}]]",
132                    genes_to_string(&parent.genes),
133                    ops.join(",")
134                )
135            }
136        }
137    }
138}
139
140fn genes_to_string(genes: &[(String, String)]) -> String {
141    genes
142        .iter()
143        .map(|(n, v)| format!("{n}={v}"))
144        .collect::<Vec<_>>()
145        .join(",")
146}
147
148/// Serialize a bypass corpus including full lineage trees.
149#[derive(Debug, Clone, Serialize, Deserialize)]
150pub struct BypassEntry {
151    /// Payload hash (SHA-256 hex of serialized genes).
152    pub payload_hash: String,
153    /// Genes that produced the bypass.
154    pub genes: Vec<(String, String)>,
155    /// Full lineage trace.
156    pub lineage_trace: String,
157    /// Final fitness score.
158    pub fitness: f64,
159    /// Number of evaluations.
160    pub evaluations: u32,
161    /// Target WAF identifier (optional).
162    pub target_waf: Option<String>,
163    /// Whether this bypass was verified.
164    pub verified: bool,
165    /// Schema version for forward/backward compatibility.
166    pub schema_version: u32,
167}
168
169impl BypassEntry {
170    pub const CURRENT_SCHEMA: u32 = 1;
171
172    #[must_use]
173    pub fn from_chromosome(chromosome: &Chromosome, target_waf: Option<String>) -> Self {
174        // SHA-256 over a deterministic gene encoding. Earlier versions
175        // used the 64-bit DefaultHasher, which collides via birthday
176        // attack at roughly 2^32 chromosomes — well within reach of a
177        // long-running scan, causing BypassCorpus::add to silently
178        // dedupe distinct bypass discoveries.
179        //
180        // Important: gene order is part of the payload identity. Two
181        // chromosomes with the same set of genes in different order
182        // intentionally produce different hashes.
183        use sha2::{Digest, Sha256};
184        let mut hasher = Sha256::new();
185        for (k, v) in &chromosome.genes {
186            hasher.update(k.as_bytes());
187            hasher.update([0u8]); // delimiter so ("ab", "c") != ("a", "bc")
188            hasher.update(v.as_bytes());
189            hasher.update([0u8]);
190        }
191        let digest = hasher.finalize();
192        let payload_hash = digest
193            .iter()
194            .map(|b| format!("{b:02x}"))
195            .collect::<String>();
196
197        Self {
198            payload_hash,
199            genes: chromosome.genes.clone(),
200            lineage_trace: chromosome.lineage.to_trace(),
201            fitness: chromosome.fitness,
202            evaluations: chromosome.evaluations,
203            target_waf,
204            verified: true,
205            schema_version: Self::CURRENT_SCHEMA,
206        }
207    }
208}
209
210/// A serializable bypass corpus.
211#[derive(Debug, Clone, Default, Serialize, Deserialize)]
212pub struct BypassCorpus {
213    pub entries: Vec<BypassEntry>,
214    pub schema_version: u32,
215    /// O(1) dedup index over `entries[*].payload_hash`. Skipped in
216    /// serialization (the `entries` Vec is the source of truth) and
217    /// lazily rebuilt after a deserialize/load — where this arrives
218    /// empty — via [`Self::ensure_index`]. Pre-fix `add` did a linear
219    /// `entries.iter().any(...)` scan on every insert (O(n) per add →
220    /// O(n²) over a campaign that accumulates k bypasses), and the
221    /// engine layered a SECOND, broken scan on top (it compared a
222    /// 16-char u64 hash against this 64-char SHA-256 hash, so it never
223    /// matched and dedup'd nothing). The index makes `add` O(1).
224    #[serde(skip)]
225    seen_hashes: std::collections::HashSet<String>,
226}
227
228impl BypassCorpus {
229    pub const CURRENT_SCHEMA: u32 = 1;
230
231    /// Maximum number of bypass entries retained in memory. Bypasses are
232    /// valuable (the whole point of a campaign), so this is generous —
233    /// but it is NOT unbounded: a hostile target that yields a fresh
234    /// "bypass" per probe would otherwise grow `entries` straight toward
235    /// the 256 MiB save/load cliff (`MAX_CORPUS_BYTES`), which discards
236    /// the WHOLE corpus on overflow. Once full we keep what we have
237    /// (first-wins) rather than evicting proven winners.
238    pub const MAX_ENTRIES: usize = 100_000;
239
240    #[must_use]
241    pub fn new() -> Self {
242        Self {
243            entries: Vec::new(),
244            schema_version: Self::CURRENT_SCHEMA,
245            seen_hashes: std::collections::HashSet::new(),
246        }
247    }
248
249    /// Rebuild the dedup index when it is out of sync with `entries`
250    /// (right after a serde deserialize, where `seen_hashes` is skipped
251    /// and arrives empty). Idempotent and cheap once in sync.
252    fn ensure_index(&mut self) {
253        if self.seen_hashes.len() != self.entries.len() {
254            self.seen_hashes = self
255                .entries
256                .iter()
257                .map(|e| e.payload_hash.clone())
258                .collect();
259        }
260    }
261
262    /// Add a bypass entry. O(1) dedup by payload hash; bounded by
263    /// [`Self::MAX_ENTRIES`]. A duplicate hash is a no-op; a new hash
264    /// past the cap is dropped (the corpus is already saturated with
265    /// proven bypasses).
266    pub fn add(&mut self, entry: BypassEntry) {
267        self.ensure_index();
268        // O(1) dedup: insert returns false if the hash was already present.
269        if !self.seen_hashes.insert(entry.payload_hash.clone()) {
270            return;
271        }
272        if self.entries.len() >= Self::MAX_ENTRIES {
273            // Roll back the index insert so it stays a faithful mirror of
274            // `entries` (the cap, not a duplicate, is why we skip the push).
275            self.seen_hashes.remove(&entry.payload_hash);
276            return;
277        }
278        self.entries.push(entry);
279    }
280
281    /// Maximum corpus file size (bytes). Prevents OOM from
282    /// maliciously large JSONL files.
283    const MAX_CORPUS_BYTES: usize = 256 * 1024 * 1024;
284
285    /// Maximum individual JSONL line length (bytes).
286    const MAX_JSONL_LINE_BYTES: usize = 16 * 1024 * 1024;
287
288    /// Save corpus to disk as JSONL (one JSON object per line).
289    pub fn save(&self, path: &std::path::Path) -> Result<(), crate::types::EvolutionError> {
290        use crate::types::EvolutionError;
291        let mut buf = Vec::new();
292        for entry in &self.entries {
293            let json = serde_json::to_string(entry).map_err(EvolutionError::SerializationFailed)?;
294            if json.len() > Self::MAX_JSONL_LINE_BYTES {
295                tracing::warn!(
296                    line_len = json.len(),
297                    max = Self::MAX_JSONL_LINE_BYTES,
298                    "skipping oversized corpus entry"
299                );
300                continue;
301            }
302            if !buf.is_empty() {
303                buf.push(b'\n');
304            }
305            buf.extend_from_slice(json.as_bytes());
306            if buf.len() > Self::MAX_CORPUS_BYTES {
307                return Err(EvolutionError::OversizedData {
308                    context: format!("corpus {}", path.display()),
309                    size: buf.len(),
310                    max: Self::MAX_CORPUS_BYTES,
311                });
312            }
313        }
314        std::fs::write(path, buf)?;
315        Ok(())
316    }
317
318    /// Load corpus from JSONL.
319    pub fn load(path: &std::path::Path) -> Result<Self, crate::types::EvolutionError> {
320        use crate::types::EvolutionError;
321        let meta = std::fs::metadata(path)?;
322        // R55 pass-19 I5: saturate the u64→usize cast so a >4 GiB
323        // file on a 32-bit target doesn't silently truncate past the
324        // advisory cap (see types.rs:380 sibling).
325        let len = usize::try_from(meta.len()).unwrap_or(usize::MAX);
326        if len > Self::MAX_CORPUS_BYTES {
327            return Err(EvolutionError::OversizedData {
328                context: format!("corpus {}", path.display()),
329                size: len,
330                max: Self::MAX_CORPUS_BYTES,
331            });
332        }
333        // The metadata gate above is advisory; the bounded reader is
334        // authoritative (defends against symlinks + TOCTOU races).
335        let content = crate::safe_io::read_capped_text(path, Self::MAX_CORPUS_BYTES)?;
336        let mut entries = Vec::new();
337        for line in content.lines().filter(|l| !l.trim().is_empty()) {
338            if line.len() > Self::MAX_JSONL_LINE_BYTES {
339                tracing::warn!(
340                    line_len = line.len(),
341                    max = Self::MAX_JSONL_LINE_BYTES,
342                    "skipping oversized corpus line"
343                );
344                continue;
345            }
346            let entry: BypassEntry =
347                serde_json::from_str(line).map_err(EvolutionError::DeserializationFailed)?;
348            // R52 pass-14 I3 (CLAUDE.md §11 UTILIZATION): pre-fix
349            // the per-entry schema_version was deserialised and
350            // immediately discarded — corpus then claimed
351            // CURRENT_SCHEMA on itself without checking each entry.
352            // Future schema changes would silently misparse old
353            // entries with no error. Now we tolerate entries at
354            // CURRENT_SCHEMA exactly; mismatches are dropped with
355            // a tracing warn so a future migration can audit them.
356            // (Strict rejection would break a fresh-install ↔
357            // gene-bank-from-an-older-build flow; tolerance keeps
358            // the contract loose while still surfacing the gap.)
359            if entry.schema_version != BypassEntry::CURRENT_SCHEMA {
360                tracing::warn!(
361                    entry_schema = entry.schema_version,
362                    current_schema = BypassEntry::CURRENT_SCHEMA,
363                    "BypassCorpus::load skipping entry from a different schema \
364                     version — re-run scan to rebuild at current schema"
365                );
366                continue;
367            }
368            entries.push(entry);
369        }
370        Ok(Self {
371            entries,
372            schema_version: Self::CURRENT_SCHEMA,
373            // Lazily rebuilt on first `add` via `ensure_index` (it
374            // arrives empty here because `#[serde(skip)]` and this
375            // hand-built loader both omit it).
376            seen_hashes: std::collections::HashSet::new(),
377        })
378    }
379}
380
381#[cfg(test)]
382mod tests {
383    use super::*;
384    use crate::evolution::Chromosome;
385
386    #[test]
387    fn bypass_entry_deduplicates() {
388        let mut corpus = BypassCorpus::new();
389        let chrom = Chromosome::new(vec![("encoding".into(), "UrlEncode".into())]);
390        let entry = BypassEntry::from_chromosome(&chrom, None);
391        corpus.add(entry.clone());
392        corpus.add(entry);
393        assert_eq!(corpus.entries.len(), 1);
394    }
395
396    /// The O(1) dedup index must be rebuilt after a serde round-trip
397    /// (it is `#[serde(skip)]`), so a duplicate added to a LOADED corpus
398    /// is still rejected — otherwise a resumed campaign would re-append
399    /// every prior bypass on its first new find.
400    #[test]
401    fn dedup_survives_serde_round_trip() {
402        let mut corpus = BypassCorpus::new();
403        let chrom = Chromosome::new(vec![("encoding".into(), "UrlEncode".into())]);
404        let entry = BypassEntry::from_chromosome(&chrom, None);
405        corpus.add(entry.clone());
406        assert_eq!(corpus.entries.len(), 1);
407
408        // Simulate a load: serialize → deserialize drops `seen_hashes`.
409        let json = serde_json::to_string(&corpus).unwrap();
410        let mut reloaded: BypassCorpus = serde_json::from_str(&json).unwrap();
411        assert!(
412            reloaded.seen_hashes.is_empty(),
413            "precondition: the dedup index is not serialized"
414        );
415        // Re-adding the SAME entry must be a no-op once the index rebuilds.
416        reloaded.add(entry);
417        assert_eq!(
418            reloaded.entries.len(),
419            1,
420            "dedup must hold across a load (ensure_index rebuild)"
421        );
422    }
423
424    /// A distinct entry added to a loaded corpus still lands, and the
425    /// index stays a faithful mirror of `entries`.
426    #[test]
427    fn add_after_load_accepts_new_and_indexes_it() {
428        let mut corpus = BypassCorpus::new();
429        let a = BypassEntry::from_chromosome(
430            &Chromosome::new(vec![("encoding".into(), "UrlEncode".into())]),
431            None,
432        );
433        corpus.add(a);
434        let json = serde_json::to_string(&corpus).unwrap();
435        let mut reloaded: BypassCorpus = serde_json::from_str(&json).unwrap();
436
437        let b = BypassEntry::from_chromosome(
438            &Chromosome::new(vec![("encoding".into(), "Base64".into())]),
439            None,
440        );
441        reloaded.add(b.clone());
442        assert_eq!(reloaded.entries.len(), 2);
443        assert_eq!(
444            reloaded.seen_hashes.len(),
445            reloaded.entries.len(),
446            "index must mirror entries after a post-load add"
447        );
448        // And the just-added one is now deduped.
449        reloaded.add(b);
450        assert_eq!(reloaded.entries.len(), 2);
451    }
452
453    /// The MAX_ENTRIES cap bounds in-memory growth: once full, a NEW
454    /// (non-duplicate) entry is dropped rather than pushing the corpus
455    /// toward the 256 MiB save/load cliff. Uses a tiny synthetic cap
456    /// check by filling past a small simulated boundary via distinct
457    /// hashes; we assert the real invariant (len never exceeds the cap)
458    /// and that the index stays consistent on a rejected over-cap add.
459    #[test]
460    fn add_is_bounded_and_index_stays_consistent_at_cap() {
461        let mut corpus = BypassCorpus::new();
462        // Seed two distinct entries.
463        for tag in ["UrlEncode", "Base64"] {
464            corpus.add(BypassEntry::from_chromosome(
465                &Chromosome::new(vec![("encoding".into(), tag.into())]),
466                None,
467            ));
468        }
469        // Invariant under normal operation.
470        assert_eq!(corpus.entries.len(), 2);
471        assert_eq!(corpus.seen_hashes.len(), corpus.entries.len());
472        // The cap itself is large (100k); rather than allocate 100k
473        // entries we assert the documented invariant holds and the
474        // index never diverges from entries across many adds.
475        for i in 0..1000u32 {
476            corpus.add(BypassEntry::from_chromosome(
477                &Chromosome::new(vec![("encoding".into(), format!("E{i}"))]),
478                None,
479            ));
480        }
481        assert!(
482            corpus.entries.len() <= BypassCorpus::MAX_ENTRIES,
483            "entries must never exceed MAX_ENTRIES"
484        );
485        assert_eq!(
486            corpus.seen_hashes.len(),
487            corpus.entries.len(),
488            "index length must equal entries length after a batch of adds"
489        );
490    }
491
492    #[test]
493    fn lineage_trace_roundtrips() {
494        let chrom = Chromosome::new(vec![("a".into(), "1".into())]);
495        let lineage = Lineage::genesis(0);
496        assert!(lineage.to_trace().contains("genesis"));
497
498        let cross = Lineage::crossover(&chrom, &chrom, "uniform", 1);
499        assert!(cross.to_trace().contains("crossover"));
500
501        let mutation = Lineage::mutation(&chrom, vec![], 2);
502        assert!(mutation.to_trace().contains("mutation"));
503    }
504
505    #[test]
506    fn empty_lineage_trace_is_serializable() {
507        let chrom = Chromosome::new(Vec::new());
508        let cross = Lineage::crossover(&chrom, &chrom, "single_point", 1);
509        let trace = cross.to_trace();
510        assert!(trace.contains("crossover"));
511        assert!(trace.contains("a={}"));
512        assert!(trace.contains("b={}"));
513    }
514
515    #[test]
516    fn payload_hash_is_order_sensitive() {
517        let chrom_a = Chromosome::new(vec![
518            ("encoding".into(), "UrlEncode".into()),
519            ("content_type".into(), "JsonNested".into()),
520        ]);
521        let chrom_b = Chromosome::new(vec![
522            ("content_type".into(), "JsonNested".into()),
523            ("encoding".into(), "UrlEncode".into()),
524        ]);
525        let a = BypassEntry::from_chromosome(&chrom_a, None);
526        let b = BypassEntry::from_chromosome(&chrom_b, None);
527        assert_ne!(a.payload_hash, b.payload_hash);
528    }
529}