1use std::{
2 fs::{self, OpenOptions},
3 io::{self, Write},
4 path::{Path, PathBuf},
5 thread,
6 time::{Duration, Instant},
7};
8
9use thiserror::Error;
10
11use crate::{
12 finding::Finding,
13 redact::Redact,
14 target::{default_lock_timeout_ms, OutputConfig},
15};
16
17#[derive(Debug, Error)]
18pub enum CorpusError {
19 #[error("failed to create corpus directory {path}: {source}")]
20 CreateDir { path: PathBuf, source: io::Error },
21 #[error("failed to acquire corpus lock {path}: {source}")]
22 Lock { path: PathBuf, source: io::Error },
23 #[error("timed out acquiring corpus lock {0}")]
24 LockTimeout(PathBuf),
25 #[error("failed to serialize finding {id}: {source}")]
26 Serialize {
27 id: String,
28 source: serde_json::Error,
29 },
30 #[error("failed to write corpus file {path}: {source}")]
31 Write { path: PathBuf, source: io::Error },
32 #[error("failed to read corpus file {path}: {source}")]
33 Read { path: PathBuf, source: io::Error },
34 #[error("failed to parse corpus file {path}: {source}")]
35 Parse {
36 path: PathBuf,
37 source: serde_json::Error,
38 },
39 #[error("finding `{0}` not found in corpus")]
40 NotFound(String),
41}
42
43pub type Result<T> = std::result::Result<T, CorpusError>;
44
/// Handle to a directory tree of findings stored as JSON files.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Directory under which one sub-directory per (sanitized) tool name is created.
    root: PathBuf,
    /// Longest time a write waits for the corpus lock before giving up.
    lock_timeout: Duration,
}
50
51impl Corpus {
52 pub fn new(root: impl Into<PathBuf>) -> Self {
54 Self {
55 root: root.into(),
56 lock_timeout: Duration::from_millis(default_lock_timeout_ms()),
57 }
58 }
59
60 pub fn from_config(config: &OutputConfig) -> Self {
62 Self {
63 root: config.corpus_dir.clone(),
64 lock_timeout: Duration::from_millis(config.lock_timeout_ms),
65 }
66 }
67
68 #[must_use]
70 pub fn with_lock_timeout(mut self, timeout: Duration) -> Self {
71 self.lock_timeout = timeout;
72 self
73 }
74
75 pub fn write_finding(&self, finding: &Finding) -> Result<PathBuf> {
76 let wallfacer_dir = self
77 .root
78 .parent()
79 .map(Path::to_path_buf)
80 .unwrap_or_else(|| PathBuf::from(".wallfacer"));
81 fs::create_dir_all(&wallfacer_dir).map_err(|source| CorpusError::CreateDir {
82 path: wallfacer_dir.clone(),
83 source,
84 })?;
85
86 let _lock = CorpusLock::acquire(wallfacer_dir.join(".lock"), self.lock_timeout)?;
87
88 let safe_tool = sanitize_tool_name(&finding.tool);
93 let tool_dir = self.root.join(&safe_tool);
94 fs::create_dir_all(&tool_dir).map_err(|source| CorpusError::CreateDir {
95 path: tool_dir.clone(),
96 source,
97 })?;
98
99 let redacted = finding.redacted();
104 let path = tool_dir.join(format!("{}.json", redacted.id));
105 let body =
106 serde_json::to_string_pretty(&redacted).map_err(|source| CorpusError::Serialize {
107 id: redacted.id.clone(),
108 source,
109 })?;
110 write_secure(&path, body.as_bytes())?;
111 Ok(path)
112 }
113
114 pub fn list_findings(&self) -> Result<Vec<Finding>> {
115 let mut findings = Vec::new();
116 if !self.root.is_dir() {
117 return Ok(findings);
118 }
119
120 visit_json_files(&self.root, &mut |path| {
121 findings.push(read_finding_file(path)?);
122 Ok(())
123 })?;
124 findings.sort_by(|left, right| left.id.cmp(&right.id));
125 Ok(findings)
126 }
127
128 pub fn find_by_id(&self, id: &str) -> Result<Finding> {
129 self.list_findings()?
130 .into_iter()
131 .find(|finding| finding.id == id || finding.id.starts_with(id))
132 .ok_or_else(|| CorpusError::NotFound(id.to_string()))
133 }
134}
135
136fn write_secure(path: &Path, body: &[u8]) -> Result<()> {
142 let mut options = OpenOptions::new();
143 options.write(true).create(true).truncate(true);
144 #[cfg(unix)]
145 {
146 use std::os::unix::fs::OpenOptionsExt;
147 options.mode(0o600);
148 }
149 let mut file = options.open(path).map_err(|source| CorpusError::Write {
150 path: path.to_path_buf(),
151 source,
152 })?;
153 file.write_all(body).map_err(|source| CorpusError::Write {
154 path: path.to_path_buf(),
155 source,
156 })?;
157 #[cfg(unix)]
160 {
161 use std::os::unix::fs::PermissionsExt;
162 let _ = fs::set_permissions(path, fs::Permissions::from_mode(0o600));
163 }
164 Ok(())
165}
166
167fn visit_json_files(path: &Path, visitor: &mut impl FnMut(&Path) -> Result<()>) -> Result<()> {
168 for entry in fs::read_dir(path).map_err(|source| CorpusError::Read {
169 path: path.to_path_buf(),
170 source,
171 })? {
172 let entry = entry.map_err(|source| CorpusError::Read {
173 path: path.to_path_buf(),
174 source,
175 })?;
176 let path = entry.path();
177 if path.is_dir() {
178 visit_json_files(&path, visitor)?;
179 } else if path
180 .extension()
181 .is_some_and(|extension| extension == "json")
182 {
183 visitor(&path)?;
184 }
185 }
186 Ok(())
187}
188
/// Turns an arbitrary tool name into a string that is safe to use as a single
/// path component.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character
/// (including `/`, `\`, `.`, whitespace, NUL and non-ASCII characters) is
/// replaced by one `_`. An empty name becomes `"_"`.
pub fn sanitize_tool_name(tool_name: &str) -> String {
    if tool_name.is_empty() {
        return String::from("_");
    }

    let mut sanitized = String::with_capacity(tool_name.len());
    for ch in tool_name.chars() {
        let safe = match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch,
            _ => '_',
        };
        sanitized.push(safe);
    }
    sanitized
}
212
213fn read_finding_file(path: &Path) -> Result<Finding> {
214 let body = fs::read_to_string(path).map_err(|source| CorpusError::Read {
215 path: path.to_path_buf(),
216 source,
217 })?;
218 serde_json::from_str(&body).map_err(|source| CorpusError::Parse {
219 path: path.to_path_buf(),
220 source,
221 })
222}
223
/// Guard for the corpus lock file.
///
/// The lock is represented by the existence of the file at `path`; dropping
/// the guard removes that file.
struct CorpusLock {
    /// Location of the lock file owned by this guard.
    path: PathBuf,
}
227
/// Delay before the first retry when the lock file already exists.
const LOCK_BACKOFF_INITIAL: Duration = Duration::from_millis(25);
/// Upper bound for the retry delay, which doubles after each failed attempt.
const LOCK_BACKOFF_CAP: Duration = Duration::from_secs(1);
230
231impl CorpusLock {
232 fn acquire(path: PathBuf, timeout: Duration) -> Result<Self> {
236 let deadline = Instant::now() + timeout;
237 let mut backoff = LOCK_BACKOFF_INITIAL;
238 loop {
239 match OpenOptions::new().write(true).create_new(true).open(&path) {
240 Ok(_) => return Ok(Self { path }),
241 Err(error) if error.kind() == io::ErrorKind::AlreadyExists => {
242 if Instant::now() >= deadline {
243 return Err(CorpusError::LockTimeout(path));
244 }
245 let remaining = deadline.saturating_duration_since(Instant::now());
246 let wait = backoff.min(remaining);
247 if wait.is_zero() {
248 return Err(CorpusError::LockTimeout(path));
249 }
250 thread::sleep(wait);
251 backoff = (backoff * 2).min(LOCK_BACKOFF_CAP);
252 }
253 Err(source) => {
254 return Err(CorpusError::Lock {
255 path: path.clone(),
256 source,
257 });
258 }
259 }
260 }
261 }
262}
263
264impl Drop for CorpusLock {
265 fn drop(&mut self) {
266 let _ = fs::remove_file(&self.path);
267 }
268}
269
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::finding::{FindingKind, ReproInfo};
    use serde_json::json;

    /// Path separators, traversal sequences, whitespace and NUL must all be
    /// replaced, while already-safe names pass through unchanged.
    #[test]
    fn sanitize_strips_path_separators_and_traversal() {
        let cases = [
            ("../../etc/passwd", "______etc_passwd"),
            ("..\\windows", "___windows"),
            ("ok_name-1", "ok_name-1"),
            ("", "_"),
            ("with space", "with_space"),
            ("nul\0byte", "nul_byte"),
        ];
        for (input, expected) in cases {
            assert_eq!(sanitize_tool_name(input), expected);
        }
    }

    /// A tool name containing `../` must not let the finding escape the
    /// corpus root on disk.
    #[test]
    fn write_finding_keeps_output_inside_corpus_root() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path().join("corpus");

        let repro = ReproInfo {
            seed: 0,
            tool_call: json!({}),
            transport: "stdio".to_string(),
            composition_trail: Vec::new(),
        };
        let finding = Finding::new(FindingKind::Crash, "../../escape", "msg", "details", repro);

        let corpus = Corpus::new(root.clone());
        let path = corpus.write_finding(&finding).unwrap();

        // Compare canonical paths so symlinked temp directories do not produce
        // a spurious mismatch.
        let canon_root = std::fs::canonicalize(&root).unwrap();
        let canon_path = std::fs::canonicalize(&path).unwrap();
        assert!(
            canon_path.starts_with(&canon_root),
            "finding written outside corpus root: {canon_path:?} not under {canon_root:?}"
        );
    }
}