Skip to main content

wallfacer_core/
corpus.rs

1use std::{
2    fs::{self, OpenOptions},
3    io::{self, Write},
4    path::{Path, PathBuf},
5    thread,
6    time::{Duration, Instant},
7};
8
9use thiserror::Error;
10
11use crate::{
12    finding::Finding,
13    redact::Redact,
14    target::{default_lock_timeout_ms, OutputConfig},
15};
16
17#[derive(Debug, Error)]
18pub enum CorpusError {
19    #[error("failed to create corpus directory {path}: {source}")]
20    CreateDir { path: PathBuf, source: io::Error },
21    #[error("failed to acquire corpus lock {path}: {source}")]
22    Lock { path: PathBuf, source: io::Error },
23    #[error("timed out acquiring corpus lock {0}")]
24    LockTimeout(PathBuf),
25    #[error("failed to serialize finding {id}: {source}")]
26    Serialize {
27        id: String,
28        source: serde_json::Error,
29    },
30    #[error("failed to write corpus file {path}: {source}")]
31    Write { path: PathBuf, source: io::Error },
32    #[error("failed to read corpus file {path}: {source}")]
33    Read { path: PathBuf, source: io::Error },
34    #[error("failed to parse corpus file {path}: {source}")]
35    Parse {
36        path: PathBuf,
37        source: serde_json::Error,
38    },
39    #[error("finding `{0}` not found in corpus")]
40    NotFound(String),
41}
42
43pub type Result<T> = std::result::Result<T, CorpusError>;
44
/// Handle to an on-disk corpus of findings.
///
/// Findings are stored as `<root>/<sanitized tool name>/<finding id>.json`.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Directory under which the per-tool sub-directories live.
    root: PathBuf,
    /// Longest time a write waits for the corpus lock before giving up.
    lock_timeout: Duration,
}
50
51impl Corpus {
52    /// Builds a corpus rooted at `root` with the default lock timeout.
53    pub fn new(root: impl Into<PathBuf>) -> Self {
54        Self {
55            root: root.into(),
56            lock_timeout: Duration::from_millis(default_lock_timeout_ms()),
57        }
58    }
59
60    /// Builds a corpus from a config, honoring `[output] lock_timeout_ms`.
61    pub fn from_config(config: &OutputConfig) -> Self {
62        Self {
63            root: config.corpus_dir.clone(),
64            lock_timeout: Duration::from_millis(config.lock_timeout_ms),
65        }
66    }
67
68    /// Override the lock timeout (used by tests and CLI overrides).
69    #[must_use]
70    pub fn with_lock_timeout(mut self, timeout: Duration) -> Self {
71        self.lock_timeout = timeout;
72        self
73    }
74
75    pub fn write_finding(&self, finding: &Finding) -> Result<PathBuf> {
76        let wallfacer_dir = self
77            .root
78            .parent()
79            .map(Path::to_path_buf)
80            .unwrap_or_else(|| PathBuf::from(".wallfacer"));
81        fs::create_dir_all(&wallfacer_dir).map_err(|source| CorpusError::CreateDir {
82            path: wallfacer_dir.clone(),
83            source,
84        })?;
85
86        let _lock = CorpusLock::acquire(wallfacer_dir.join(".lock"), self.lock_timeout)?;
87
88        // Tool names come from the MCP server, which we treat as
89        // semi-trusted: a misbehaving server could declare a name like
90        // `../../etc/x` and have us write outside `corpus_dir`. Sanitise
91        // before joining so the path stays inside `self.root`.
92        let safe_tool = sanitize_tool_name(&finding.tool);
93        let tool_dir = self.root.join(&safe_tool);
94        fs::create_dir_all(&tool_dir).map_err(|source| CorpusError::CreateDir {
95            path: tool_dir.clone(),
96            source,
97        })?;
98
99        // Persist the redacted form: corpus files may end up in CI artefacts,
100        // shared storage, or commit history. The in-memory `finding` is
101        // preserved untouched for callers that need the original payload (e.g.
102        // an in-process replay).
103        let redacted = finding.redacted();
104        let path = tool_dir.join(format!("{}.json", redacted.id));
105        let body =
106            serde_json::to_string_pretty(&redacted).map_err(|source| CorpusError::Serialize {
107                id: redacted.id.clone(),
108                source,
109            })?;
110        write_secure(&path, body.as_bytes())?;
111        Ok(path)
112    }
113
114    pub fn list_findings(&self) -> Result<Vec<Finding>> {
115        let mut findings = Vec::new();
116        if !self.root.is_dir() {
117            return Ok(findings);
118        }
119
120        visit_json_files(&self.root, &mut |path| {
121            findings.push(read_finding_file(path)?);
122            Ok(())
123        })?;
124        findings.sort_by(|left, right| left.id.cmp(&right.id));
125        Ok(findings)
126    }
127
128    pub fn find_by_id(&self, id: &str) -> Result<Finding> {
129        self.list_findings()?
130            .into_iter()
131            .find(|finding| finding.id == id || finding.id.starts_with(id))
132            .ok_or_else(|| CorpusError::NotFound(id.to_string()))
133    }
134}
135
136/// Writes a corpus file with restrictive permissions on Unix (mode `0o600`),
137/// so the file is readable only by the owning user. On Windows, we fall back
138/// to the default ACL inherited from the parent directory; operators sharing
139/// runners should rely on directory-level permissions there. This is
140/// documented in `docs/security.md` (Phase A).
141fn write_secure(path: &Path, body: &[u8]) -> Result<()> {
142    let mut options = OpenOptions::new();
143    options.write(true).create(true).truncate(true);
144    #[cfg(unix)]
145    {
146        use std::os::unix::fs::OpenOptionsExt;
147        options.mode(0o600);
148    }
149    let mut file = options.open(path).map_err(|source| CorpusError::Write {
150        path: path.to_path_buf(),
151        source,
152    })?;
153    file.write_all(body).map_err(|source| CorpusError::Write {
154        path: path.to_path_buf(),
155        source,
156    })?;
157    // If the file pre-existed with looser permissions, `mode(0o600)` above
158    // would not have applied. Tighten after the fact on Unix; best-effort.
159    #[cfg(unix)]
160    {
161        use std::os::unix::fs::PermissionsExt;
162        let _ = fs::set_permissions(path, fs::Permissions::from_mode(0o600));
163    }
164    Ok(())
165}
166
167fn visit_json_files(path: &Path, visitor: &mut impl FnMut(&Path) -> Result<()>) -> Result<()> {
168    for entry in fs::read_dir(path).map_err(|source| CorpusError::Read {
169        path: path.to_path_buf(),
170        source,
171    })? {
172        let entry = entry.map_err(|source| CorpusError::Read {
173            path: path.to_path_buf(),
174            source,
175        })?;
176        let path = entry.path();
177        if path.is_dir() {
178            visit_json_files(&path, visitor)?;
179        } else if path
180            .extension()
181            .is_some_and(|extension| extension == "json")
182        {
183            visitor(&path)?;
184        }
185    }
186    Ok(())
187}
188
/// Converts a tool name into a string that is safe to use as a single path
/// component: every character outside `[A-Za-z0-9_-]` becomes `_`, and an
/// empty name becomes `_`.
///
/// MCP server output is treated as semi-trusted (the server is the thing
/// being fuzzed), so a tool name that ends up in a filesystem path must never
/// contain `/`, `\`, `..`, or NUL bytes. This helper is the single point of
/// truth used by both the corpus and the inferred-schema directory.
pub fn sanitize_tool_name(tool_name: &str) -> String {
    if tool_name.is_empty() {
        return String::from("_");
    }

    let mut safe = String::with_capacity(tool_name.len());
    for ch in tool_name.chars() {
        let allowed = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_');
        safe.push(if allowed { ch } else { '_' });
    }
    safe
}
212
213fn read_finding_file(path: &Path) -> Result<Finding> {
214    let body = fs::read_to_string(path).map_err(|source| CorpusError::Read {
215        path: path.to_path_buf(),
216        source,
217    })?;
218    serde_json::from_str(&body).map_err(|source| CorpusError::Parse {
219        path: path.to_path_buf(),
220        source,
221    })
222}
223
/// Guard for the corpus lock file: the file at `path` is deleted again when
/// the guard is dropped.
struct CorpusLock {
    /// Location of the lock file owned by this guard.
    path: PathBuf,
}
227
/// Delay before the first retry when the corpus lock is already held.
const LOCK_BACKOFF_INITIAL: Duration = Duration::from_millis(25);
/// Upper bound for the delay between two lock acquisition attempts.
const LOCK_BACKOFF_CAP: Duration = Duration::from_millis(1_000);
230
231impl CorpusLock {
232    /// Tries to acquire the corpus lock, polling with exponential backoff
233    /// (capped at [`LOCK_BACKOFF_CAP`]) until either the lock is held or
234    /// `timeout` elapses. Phase E3 made the timeout configurable.
235    fn acquire(path: PathBuf, timeout: Duration) -> Result<Self> {
236        let deadline = Instant::now() + timeout;
237        let mut backoff = LOCK_BACKOFF_INITIAL;
238        loop {
239            match OpenOptions::new().write(true).create_new(true).open(&path) {
240                Ok(_) => return Ok(Self { path }),
241                Err(error) if error.kind() == io::ErrorKind::AlreadyExists => {
242                    if Instant::now() >= deadline {
243                        return Err(CorpusError::LockTimeout(path));
244                    }
245                    let remaining = deadline.saturating_duration_since(Instant::now());
246                    let wait = backoff.min(remaining);
247                    if wait.is_zero() {
248                        return Err(CorpusError::LockTimeout(path));
249                    }
250                    thread::sleep(wait);
251                    backoff = (backoff * 2).min(LOCK_BACKOFF_CAP);
252                }
253                Err(source) => {
254                    return Err(CorpusError::Lock {
255                        path: path.clone(),
256                        source,
257                    });
258                }
259            }
260        }
261    }
262}
263
264impl Drop for CorpusLock {
265    fn drop(&mut self) {
266        let _ = fs::remove_file(&self.path);
267    }
268}
269
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::finding::{FindingKind, ReproInfo};
    use serde_json::json;

    /// Path separators, traversal segments, whitespace and NUL bytes must all
    /// collapse to `_`, while already-safe names pass through unchanged.
    #[test]
    fn sanitize_strips_path_separators_and_traversal() {
        let cases = [
            // 2× ".." + 2× "/" = 6 sanitised chars before "etc".
            ("../../etc/passwd", "______etc_passwd"),
            ("..\\windows", "___windows"),
            ("ok_name-1", "ok_name-1"),
            ("", "_"),
            ("with space", "with_space"),
            ("nul\0byte", "nul_byte"),
        ];
        for (raw, expected) in cases {
            assert_eq!(sanitize_tool_name(raw), expected);
        }
    }

    /// A finding whose tool name attempts directory traversal must still be
    /// written below the corpus root.
    #[test]
    fn write_finding_keeps_output_inside_corpus_root() {
        let scratch = tempfile::tempdir().unwrap();
        let root = scratch.path().join("corpus");
        let corpus = Corpus::new(root.clone());

        let repro = ReproInfo {
            seed: 0,
            tool_call: json!({}),
            transport: "stdio".to_string(),
            composition_trail: Vec::new(),
        };
        let finding = Finding::new(FindingKind::Crash, "../../escape", "msg", "details", repro);

        let written = corpus.write_finding(&finding).unwrap();

        // Compare canonical absolute paths, so that any `..` segment in the
        // written path is resolved before the containment check.
        let canon_root = std::fs::canonicalize(&root).unwrap();
        let canon_path = std::fs::canonicalize(&written).unwrap();
        assert!(
            canon_path.starts_with(&canon_root),
            "finding written outside corpus root: {canon_path:?} not under {canon_root:?}"
        );
    }
}