Skip to main content

wallfacer_core/
fuzz_corpus.rs

1//! Phase R — persistent fuzz corpus.
2//!
3//! The original `fuzz` plan was stateless: every iteration
4//! generates a payload from the schema seeded by the master seed,
5//! invokes the tool, records findings, throws the input away.
6//! That's good for reproducibility but it means `wallfacer fuzz`
7//! repeated against the same server, the same seed, the same
8//! pack, finds the same bugs over and over and never explores
9//! beyond what one fresh schema-driven generator can synthesise.
10//!
11//! Phase R adds a persistent input corpus on top, in the spirit
12//! of libFuzzer / AFL's coverage-feedback loop:
13//!
14//! 1. Inputs that triggered a finding are saved.
15//! 2. Inputs that produced a previously-unseen *response
16//!    fingerprint* are saved (proxy for "explored a new path").
17//! 3. Subsequent runs read the corpus, pick a random entry, and
18//!    mutate it — by default 90 % of the time. The remaining 10 %
19//!    is pure schema-driven random, so a fuzzer that has converged
20//!    on a small basin keeps exploring.
21//!
22//! The corpus lives at `<corpus_dir>/../fuzz_corpus/<tool>/`
23//! (sibling of the findings corpus), one JSON file per entry,
24//! filename keyed on a SHA-256 of the canonical input. Entries are
25//! deduplicated on the input itself so identical mutations don't
26//! grow the corpus indefinitely.
27
28use std::{
29    fs::{self, OpenOptions},
30    io::Write,
31    path::PathBuf,
32};
33
34use chrono::{DateTime, Utc};
35use serde::{Deserialize, Serialize};
36use serde_json::Value;
37use sha2::{Digest, Sha256};
38use thiserror::Error;
39
40use crate::corpus::sanitize_tool_name;
41
42/// One persisted entry in the fuzz corpus.
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct FuzzCorpusEntry {
45    /// Tool the entry targets. Mirrors the per-tool subdirectory
46    /// it lives in.
47    pub tool: String,
48    /// Concrete input payload that survived. Re-used as the seed
49    /// of future mutation rounds.
50    pub input: Value,
51    /// Why this entry was kept — either it triggered a finding
52    /// (the strongest signal) or it produced a never-before-seen
53    /// response fingerprint.
54    pub trigger: CorpusTrigger,
55    /// Response fingerprint at the time of capture; persisted so
56    /// the dedup tracker can be rebuilt by replaying the corpus
57    /// on startup.
58    pub fingerprint: String,
59    /// When the entry was captured. Older corpus entries are
60    /// preferred when picking a mutation seed (they've had more
61    /// chances to be useful), but the picker is still random.
62    pub timestamp: DateTime<Utc>,
63}
64
65/// Why an input is interesting enough to keep.
66#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
67#[serde(tag = "type", rename_all = "snake_case")]
68pub enum CorpusTrigger {
69    /// The input triggered a [`crate::finding::Finding`]. The
70    /// strongest signal — these are the entries that produced
71    /// real bugs and are worth mutating from again.
72    Finding {
73        /// Lower-snake-case identifier of the finding kind.
74        kind: String,
75    },
76    /// The input produced a previously-unseen response
77    /// fingerprint. Soft signal: useful for exploration, but the
78    /// fingerprint hashing scheme is conservative so most novelty
79    /// caught here is shallow ("server returned a slightly
80    /// different error message").
81    NewFingerprint,
82}
83
84#[derive(Debug, Error)]
85pub enum FuzzCorpusError {
86    #[error("create directory {path}: {source}")]
87    CreateDir {
88        path: PathBuf,
89        source: std::io::Error,
90    },
91    #[error("read {path}: {source}")]
92    Read {
93        path: PathBuf,
94        source: std::io::Error,
95    },
96    #[error("write {path}: {source}")]
97    Write {
98        path: PathBuf,
99        source: std::io::Error,
100    },
101    #[error("serialise corpus entry: {0}")]
102    Serialize(#[from] serde_json::Error),
103}
104
105pub type Result<T> = std::result::Result<T, FuzzCorpusError>;
106
/// Handle to the on-disk fuzz-input store.
///
/// The handle holds nothing but the root directory path, so cloning it is
/// cheap and every query goes back to the filesystem, which is the single
/// source of truth.
#[derive(Debug, Clone)]
pub struct FuzzCorpus {
    /// Directory under which the per-tool subdirectories live.
    root: PathBuf,
}
114
115impl FuzzCorpus {
116    /// Builds a corpus rooted at `root`. The directory is created
117    /// lazily on the first `save` — operators that don't enable
118    /// the corpus pay zero filesystem cost.
119    pub fn new(root: impl Into<PathBuf>) -> Self {
120        Self { root: root.into() }
121    }
122
123    /// Path on disk for `tool`'s sub-corpus. Sanitised against
124    /// untrusted tool names per the same rules as the findings
125    /// corpus (Phase v0.3.2).
126    pub fn tool_dir(&self, tool: &str) -> PathBuf {
127        self.root.join(sanitize_tool_name(tool))
128    }
129
130    /// Reads every persisted entry for `tool`. Returns an empty
131    /// vector when the directory doesn't exist yet (a fresh
132    /// run-against-new-server).
133    pub fn list(&self, tool: &str) -> Result<Vec<FuzzCorpusEntry>> {
134        let dir = self.tool_dir(tool);
135        if !dir.is_dir() {
136            return Ok(Vec::new());
137        }
138        let mut out = Vec::new();
139        for entry in fs::read_dir(&dir).map_err(|source| FuzzCorpusError::Read {
140            path: dir.clone(),
141            source,
142        })? {
143            let entry = entry.map_err(|source| FuzzCorpusError::Read {
144                path: dir.clone(),
145                source,
146            })?;
147            let path = entry.path();
148            if path.extension().is_some_and(|ext| ext == "json") {
149                let bytes = fs::read(&path).map_err(|source| FuzzCorpusError::Read {
150                    path: path.clone(),
151                    source,
152                })?;
153                if let Ok(parsed) = serde_json::from_slice::<FuzzCorpusEntry>(&bytes) {
154                    out.push(parsed);
155                }
156            }
157        }
158        out.sort_by_key(|e| e.timestamp);
159        Ok(out)
160    }
161
162    /// Saves `entry`. Writes are idempotent — keying on a SHA-256
163    /// of the input means identical inputs don't grow the corpus,
164    /// even when called concurrently.
165    pub fn save(&self, entry: &FuzzCorpusEntry) -> Result<PathBuf> {
166        let dir = self.tool_dir(&entry.tool);
167        fs::create_dir_all(&dir).map_err(|source| FuzzCorpusError::CreateDir {
168            path: dir.clone(),
169            source,
170        })?;
171        let key = input_key(&entry.input);
172        let path = dir.join(format!("{key}.json"));
173        let body = serde_json::to_vec_pretty(entry)?;
174        // Use create_new on first write but tolerate a pre-existing
175        // file from a concurrent or prior run — the key dedup means
176        // the body would be identical.
177        let mut options = OpenOptions::new();
178        options.write(true).create(true).truncate(true);
179        #[cfg(unix)]
180        {
181            use std::os::unix::fs::OpenOptionsExt;
182            options.mode(0o600);
183        }
184        let mut file = options
185            .open(&path)
186            .map_err(|source| FuzzCorpusError::Write {
187                path: path.clone(),
188                source,
189            })?;
190        file.write_all(&body)
191            .map_err(|source| FuzzCorpusError::Write {
192                path: path.clone(),
193                source,
194            })?;
195        Ok(path)
196    }
197
198    /// Total number of entries across every tool sub-corpus.
199    /// Used by tests + the human reporter ("corpus: N entries").
200    pub fn total(&self) -> Result<usize> {
201        if !self.root.is_dir() {
202            return Ok(0);
203        }
204        let mut total = 0;
205        for entry in fs::read_dir(&self.root).map_err(|source| FuzzCorpusError::Read {
206            path: self.root.clone(),
207            source,
208        })? {
209            let entry = entry.map_err(|source| FuzzCorpusError::Read {
210                path: self.root.clone(),
211                source,
212            })?;
213            if entry.path().is_dir() {
214                let count = fs::read_dir(entry.path())
215                    .map(|i| {
216                        i.flatten()
217                            .filter(|e| e.path().extension().is_some_and(|x| x == "json"))
218                            .count()
219                    })
220                    .unwrap_or(0);
221                total += count;
222            }
223        }
224        Ok(total)
225    }
226}
227
228/// SHA-256 of the canonical JSON of `input`, hex-encoded — used
229/// as the on-disk filename so identical mutations dedup naturally.
230pub fn input_key(input: &Value) -> String {
231    let canonical = crate::finding::canonical_json(input);
232    let hash = Sha256::digest(canonical.as_bytes());
233    hex::encode(hash)[..16].to_string()
234}
235
236/// Conservative response fingerprint. Hashes:
237/// - `isError` boolean (or `null` when missing),
238/// - the *type* of every entry in `content` (so a `text` →
239///   `image` swap registers as a different shape),
240/// - the first 64 bytes of `content[0].text` after Unicode
241///   normalisation (catches "different error message" novelty).
242///
243/// Conservative on purpose: a too-fine fingerprint would flag
244/// every mutation as novel and explode the corpus. The 64-byte
245/// prefix gives us "this is a different class of response"
246/// without "this is a different rendering of the same response".
247pub fn response_fingerprint(response: &Value) -> String {
248    let mut hasher = Sha256::new();
249    let is_error = response.get("isError").and_then(Value::as_bool);
250    hasher.update(format!("isError={is_error:?}|").as_bytes());
251    if let Some(arr) = response.get("content").and_then(Value::as_array) {
252        for item in arr {
253            let kind = item.get("type").and_then(Value::as_str).unwrap_or("?");
254            hasher.update(format!("type={kind}|").as_bytes());
255        }
256        if let Some(first_text) = arr
257            .first()
258            .and_then(|v| v.get("text"))
259            .and_then(Value::as_str)
260        {
261            let prefix: String = first_text.chars().take(64).collect();
262            hasher.update(prefix.as_bytes());
263        }
264    }
265    hex::encode(hasher.finalize())[..16].to_string()
266}
267
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use serde_json::json;
    use tempfile::tempdir;

    /// Builds an entry for `tool` stamped with the current time and the
    /// fingerprint of a fixed `"ok"` text response.
    fn entry(tool: &str, input: Value, trigger: CorpusTrigger) -> FuzzCorpusEntry {
        let response = json!({"content": [{"type": "text", "text": "ok"}]});
        FuzzCorpusEntry {
            tool: tool.to_string(),
            fingerprint: response_fingerprint(&response),
            input,
            trigger,
            timestamp: Utc::now(),
        }
    }

    /// Fingerprint of a response whose only content item is `text`.
    fn text_fingerprint(text: &str) -> String {
        response_fingerprint(&json!({"content": [{"type": "text", "text": text}]}))
    }

    #[test]
    fn save_then_list_round_trips() {
        let dir = tempdir().expect("tempdir");
        let corpus = FuzzCorpus::new(dir.path().to_path_buf());
        let trigger = CorpusTrigger::Finding {
            kind: "crash".to_string(),
        };
        let saved = entry("x", json!({"a": 1}), trigger);

        corpus.save(&saved).expect("save");

        let listed = corpus.list("x").expect("list");
        assert_eq!(listed.len(), 1);
        assert_eq!(listed[0].input, json!({"a": 1}));
    }

    #[test]
    fn identical_inputs_dedup_on_disk() {
        let dir = tempdir().expect("tempdir");
        let corpus = FuzzCorpus::new(dir.path().to_path_buf());

        // Same payload three times; only the timestamps differ.
        for _ in 0..3 {
            let repeated = entry(
                "x",
                json!({"a": 1, "b": "constant"}),
                CorpusTrigger::NewFingerprint,
            );
            corpus.save(&repeated).expect("save");
        }

        let listed = corpus.list("x").expect("list");
        assert_eq!(
            listed.len(),
            1,
            "identical inputs must dedup to a single file (key = SHA-256 of canonical JSON)"
        );
    }

    #[test]
    fn list_returns_empty_for_unknown_tool() {
        let dir = tempdir().expect("tempdir");
        let corpus = FuzzCorpus::new(dir.path().to_path_buf());
        let listed = corpus.list("never-saved").expect("list");
        assert!(listed.is_empty());
    }

    #[test]
    fn fingerprint_changes_when_is_error_flips() {
        let ok = response_fingerprint(&json!({"content": [], "isError": false}));
        let failed = response_fingerprint(&json!({"content": [], "isError": true}));
        assert_ne!(ok, failed);
    }

    #[test]
    fn fingerprint_changes_when_content_type_changes() {
        let text = response_fingerprint(&json!({"content": [{"type": "text", "text": "x"}]}));
        let image = response_fingerprint(&json!({"content": [{"type": "image", "data": "x"}]}));
        assert_ne!(text, image);
    }

    #[test]
    fn fingerprint_stable_for_same_response() {
        let response = json!({"content": [{"type": "text", "text": "foo"}], "isError": false});
        let first = response_fingerprint(&response);
        let second = response_fingerprint(&response);
        assert_eq!(first, second);
    }

    #[test]
    fn fingerprint_first_text_prefix_separates_distinct_messages() {
        assert_ne!(
            text_fingerprint("permission denied"),
            text_fingerprint("not found")
        );
    }

    #[test]
    fn total_counts_across_tool_subdirs() {
        let dir = tempdir().expect("tempdir");
        let corpus = FuzzCorpus::new(dir.path().to_path_buf());

        // One entry for tool "x", two distinct entries for tool "y".
        let fixtures = [
            ("x", json!({"a": 1})),
            ("y", json!({"b": 2})),
            ("y", json!({"b": 3})),
        ];
        for (tool, input) in fixtures {
            corpus
                .save(&entry(tool, input, CorpusTrigger::NewFingerprint))
                .unwrap();
        }

        assert_eq!(corpus.total().unwrap(), 3);
    }
}