use std::{
fs::{self, OpenOptions},
io::{self, Write},
path::{Path, PathBuf},
thread,
time::{Duration, Instant},
};
use thiserror::Error;
use crate::{
finding::Finding,
redact::Redact,
target::{default_lock_timeout_ms, OutputConfig},
};
/// Errors returned by the corpus read, write and locking operations in this module.
#[derive(Debug, Error)]
pub enum CorpusError {
/// A directory that must exist before a finding is written could not be created.
#[error("failed to create corpus directory {path}: {source}")]
CreateDir { path: PathBuf, source: io::Error },
/// Creating the lock file failed for a reason other than the file already existing.
#[error("failed to acquire corpus lock {path}: {source}")]
Lock { path: PathBuf, source: io::Error },
/// The lock file still existed once the lock timeout had elapsed.
#[error("timed out acquiring corpus lock {0}")]
LockTimeout(PathBuf),
/// The finding with identifier `id` could not be serialized to JSON.
#[error("failed to serialize finding {id}: {source}")]
Serialize {
id: String,
source: serde_json::Error,
},
/// A corpus file could not be opened for writing or written to.
#[error("failed to write corpus file {path}: {source}")]
Write { path: PathBuf, source: io::Error },
/// A corpus directory could not be listed or a corpus file could not be read.
#[error("failed to read corpus file {path}: {source}")]
Read { path: PathBuf, source: io::Error },
/// A corpus file was read but its contents did not deserialize into a finding.
#[error("failed to parse corpus file {path}: {source}")]
Parse {
path: PathBuf,
source: serde_json::Error,
},
/// No stored finding matched the requested identifier (carried in the variant).
#[error("finding `{0}` not found in corpus")]
NotFound(String),
}
/// Result alias used throughout this module, with [`CorpusError`] as the error type.
pub type Result<T> = std::result::Result<T, CorpusError>;
/// Handle to an on-disk collection of findings stored as JSON files.
#[derive(Debug, Clone)]
pub struct Corpus {
/// Directory under which one subdirectory per (sanitized) tool name is created.
root: PathBuf,
/// Maximum time a write waits for the lock file before giving up.
lock_timeout: Duration,
}
impl Corpus {
    /// Creates a corpus rooted at `root`.
    ///
    /// The lock timeout is taken from `default_lock_timeout_ms()`, interpreted as
    /// milliseconds.
    pub fn new(root: impl Into<PathBuf>) -> Self {
        let root = root.into();
        let lock_timeout = Duration::from_millis(default_lock_timeout_ms());
        Self { root, lock_timeout }
    }

    /// Creates a corpus from an [`OutputConfig`], using its `corpus_dir` as the root
    /// and its `lock_timeout_ms` (milliseconds) as the lock timeout.
    pub fn from_config(config: &OutputConfig) -> Self {
        let root = config.corpus_dir.clone();
        let lock_timeout = Duration::from_millis(config.lock_timeout_ms);
        Self { root, lock_timeout }
    }

    /// Returns the same corpus with its lock timeout replaced by `timeout`.
    #[must_use]
    pub fn with_lock_timeout(self, timeout: Duration) -> Self {
        Self {
            lock_timeout: timeout,
            ..self
        }
    }

    /// Writes the redacted form of `finding` to `<root>/<sanitized tool>/<id>.json`
    /// and returns the path of the written file.
    ///
    /// The write is performed while holding a lock file named `.lock` that lives in
    /// the parent directory of the corpus root (or in `.wallfacer` when the root has
    /// no parent). The lock is released when this function returns.
    ///
    /// # Errors
    ///
    /// * [`CorpusError::CreateDir`] if the lock directory or the tool directory
    ///   cannot be created.
    /// * [`CorpusError::Lock`] / [`CorpusError::LockTimeout`] if the lock cannot be
    ///   acquired.
    /// * [`CorpusError::Serialize`] if the redacted finding cannot be serialized.
    /// * [`CorpusError::Write`] if the file cannot be written.
    pub fn write_finding(&self, finding: &Finding) -> Result<PathBuf> {
        let wallfacer_dir = match self.root.parent() {
            Some(parent) => parent.to_path_buf(),
            None => PathBuf::from(".wallfacer"),
        };
        if let Err(source) = fs::create_dir_all(&wallfacer_dir) {
            return Err(CorpusError::CreateDir {
                path: wallfacer_dir,
                source,
            });
        }

        // Held until the end of the function so the whole write is serialized.
        let lock_path = wallfacer_dir.join(".lock");
        let _lock = CorpusLock::acquire(lock_path, self.lock_timeout)?;

        // The tool name is sanitized so it can only ever name a direct child of root.
        let tool_dir = self.root.join(sanitize_tool_name(&finding.tool));
        if let Err(source) = fs::create_dir_all(&tool_dir) {
            return Err(CorpusError::CreateDir {
                path: tool_dir,
                source,
            });
        }

        let redacted = finding.redacted();
        let path = tool_dir.join(format!("{}.json", redacted.id));
        let body = match serde_json::to_string_pretty(&redacted) {
            Ok(json) => json,
            Err(source) => {
                return Err(CorpusError::Serialize {
                    id: redacted.id.clone(),
                    source,
                });
            }
        };
        write_secure(&path, body.as_bytes())?;
        Ok(path)
    }

    /// Loads every `.json` file found (recursively) under the corpus root and returns
    /// the parsed findings sorted by ascending `id`.
    ///
    /// Returns an empty list when the root is not an existing directory.
    ///
    /// # Errors
    ///
    /// Fails with [`CorpusError::Read`] or [`CorpusError::Parse`] as soon as any
    /// directory or file cannot be read or parsed.
    pub fn list_findings(&self) -> Result<Vec<Finding>> {
        if !self.root.is_dir() {
            return Ok(Vec::new());
        }
        let mut findings = Vec::new();
        visit_json_files(&self.root, &mut |path| {
            let finding = read_finding_file(path)?;
            findings.push(finding);
            Ok(())
        })?;
        findings.sort_by(|a, b| a.id.cmp(&b.id));
        Ok(findings)
    }

    /// Returns the first finding, in ascending `id` order, whose identifier starts
    /// with `id`.
    ///
    /// An exact match is simply the special case of a full-length prefix. Note that
    /// an empty `id` is a prefix of every identifier and therefore selects the first
    /// finding in the corpus.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Corpus::list_findings`], and returns
    /// [`CorpusError::NotFound`] when nothing matches.
    pub fn find_by_id(&self, id: &str) -> Result<Finding> {
        for finding in self.list_findings()? {
            if finding.id.starts_with(id) {
                return Ok(finding);
            }
        }
        Err(CorpusError::NotFound(id.to_string()))
    }
}
fn write_secure(path: &Path, body: &[u8]) -> Result<()> {
let mut options = OpenOptions::new();
options.write(true).create(true).truncate(true);
#[cfg(unix)]
{
use std::os::unix::fs::OpenOptionsExt;
options.mode(0o600);
}
let mut file = options.open(path).map_err(|source| CorpusError::Write {
path: path.to_path_buf(),
source,
})?;
file.write_all(body).map_err(|source| CorpusError::Write {
path: path.to_path_buf(),
source,
})?;
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
let _ = fs::set_permissions(path, fs::Permissions::from_mode(0o600));
}
Ok(())
}
fn visit_json_files(path: &Path, visitor: &mut impl FnMut(&Path) -> Result<()>) -> Result<()> {
for entry in fs::read_dir(path).map_err(|source| CorpusError::Read {
path: path.to_path_buf(),
source,
})? {
let entry = entry.map_err(|source| CorpusError::Read {
path: path.to_path_buf(),
source,
})?;
let path = entry.path();
if path.is_dir() {
visit_json_files(&path, visitor)?;
} else if path
.extension()
.is_some_and(|extension| extension == "json")
{
visitor(&path)?;
}
}
Ok(())
}
/// Converts a tool name into a string that is safe to use as a single directory name.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character
/// (including path separators, dots, whitespace and non-ASCII characters) is replaced
/// by `_`, one underscore per character. An empty name becomes `"_"`.
pub fn sanitize_tool_name(tool_name: &str) -> String {
    let mut sanitized = String::with_capacity(tool_name.len().max(1));
    for symbol in tool_name.chars() {
        let allowed = matches!(symbol, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_');
        sanitized.push(if allowed { symbol } else { '_' });
    }
    // Only an empty input can leave the buffer empty at this point.
    if sanitized.is_empty() {
        sanitized.push('_');
    }
    sanitized
}
/// Reads the file at `path` as UTF-8 text and deserializes it from JSON into a
/// [`Finding`].
///
/// # Errors
///
/// Returns [`CorpusError::Read`] if the file cannot be read as a string and
/// [`CorpusError::Parse`] if its contents do not deserialize.
fn read_finding_file(path: &Path) -> Result<Finding> {
    let body = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(source) => {
            return Err(CorpusError::Read {
                path: path.to_path_buf(),
                source,
            });
        }
    };
    match serde_json::from_str::<Finding>(&body) {
        Ok(finding) => Ok(finding),
        Err(source) => Err(CorpusError::Parse {
            path: path.to_path_buf(),
            source,
        }),
    }
}
/// Guard representing a held corpus lock; dropping it removes the lock file.
struct CorpusLock {
/// Location of the lock file that was created when the lock was acquired.
path: PathBuf,
}
/// Delay before the first retry when the lock file already exists.
const LOCK_BACKOFF_INITIAL: Duration = Duration::from_millis(25);
/// Upper bound for the retry delay, which doubles after every failed attempt.
const LOCK_BACKOFF_CAP: Duration = Duration::from_millis(1_000);
impl CorpusLock {
    /// Acquires the lock by exclusively creating the file at `path`.
    ///
    /// While the file already exists the attempt is retried, sleeping between
    /// attempts for a delay that starts at `LOCK_BACKOFF_INITIAL`, doubles each time
    /// and is capped at `LOCK_BACKOFF_CAP` (and never sleeps past the deadline).
    ///
    /// A `timeout` so large that the deadline cannot be represented as an `Instant`
    /// is treated as "no deadline" instead of panicking on the overflow.
    ///
    /// # Errors
    ///
    /// * [`CorpusError::LockTimeout`] if the file still exists once `timeout` has
    ///   elapsed.
    /// * [`CorpusError::Lock`] if creating the file fails for any other reason.
    fn acquire(path: PathBuf, timeout: Duration) -> Result<Self> {
        // `Instant + Duration` panics on overflow; `checked_add` yields `None` instead.
        let deadline = Instant::now().checked_add(timeout);
        let mut backoff = LOCK_BACKOFF_INITIAL;
        loop {
            match OpenOptions::new().write(true).create_new(true).open(&path) {
                // The handle is not needed: the file's existence is the lock.
                Ok(_) => return Ok(Self { path }),
                Err(error) if error.kind() == io::ErrorKind::AlreadyExists => {
                    let wait = match deadline {
                        Some(deadline) => {
                            backoff.min(deadline.saturating_duration_since(Instant::now()))
                        }
                        None => backoff,
                    };
                    // The backoff is never zero, so a zero wait means the deadline
                    // has been reached.
                    if wait.is_zero() {
                        return Err(CorpusError::LockTimeout(path));
                    }
                    thread::sleep(wait);
                    backoff = (backoff * 2).min(LOCK_BACKOFF_CAP);
                }
                Err(source) => return Err(CorpusError::Lock { path, source }),
            }
        }
    }
}
impl Drop for CorpusLock {
    /// Releases the lock by deleting the lock file.
    fn drop(&mut self) {
        // Best effort: a destructor cannot report failure, so the result of the
        // removal is deliberately discarded.
        drop(fs::remove_file(&self.path));
    }
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::finding::{FindingKind, ReproInfo};
    use serde_json::json;

    /// Every character outside `[A-Za-z0-9_-]` must be replaced, so separators and
    /// `..` components cannot survive sanitization.
    #[test]
    fn sanitize_strips_path_separators_and_traversal() {
        let cases = [
            ("../../etc/passwd", "______etc_passwd"),
            ("..\\windows", "___windows"),
            ("ok_name-1", "ok_name-1"),
            ("", "_"),
            ("with space", "with_space"),
            ("nul\0byte", "nul_byte"),
        ];
        for (raw, expected) in cases {
            assert_eq!(sanitize_tool_name(raw), expected);
        }
    }

    /// A finding whose tool name attempts directory traversal must still be written
    /// somewhere underneath the corpus root.
    #[test]
    fn write_finding_keeps_output_inside_corpus_root() {
        let scratch = tempfile::tempdir().unwrap();
        let root = scratch.path().join("corpus");
        let corpus = Corpus::new(root.clone());

        let repro = ReproInfo {
            seed: 0,
            tool_call: json!({}),
            transport: "stdio".to_string(),
            composition_trail: Vec::new(),
        };
        let finding = Finding::new(FindingKind::Crash, "../../escape", "msg", "details", repro);

        let written = corpus.write_finding(&finding).unwrap();

        // Compare canonical paths so symlinks in the temp directory cannot skew the check.
        let canon_root = std::fs::canonicalize(&root).unwrap();
        let canon_path = std::fs::canonicalize(&written).unwrap();
        assert!(
            canon_path.starts_with(&canon_root),
            "finding written outside corpus root: {canon_path:?} not under {canon_root:?}"
        );
    }
}