Skip to main content

dci_tool/
sandbox.rs

1//! Sandboxing primitives: a read-only path-jail rooted at a single corpus
2//! directory, plus the resource limits that bound every operation.
3//!
4//! The security model is deliberately conservative:
5//!
6//! * Every caller-supplied path is canonicalized and must remain a descendant
7//!   of the canonicalized corpus root. This defeats `..` traversal and symlink
8//!   escapes (canonicalization resolves symlinks before the prefix check).
9//! * Operations are read-only; nothing in this crate creates, writes, or
10//!   deletes files.
11//! * Walks, matches, and outputs are bounded by [`Limits`] so a single tool
12//!   call cannot exhaust memory or run unbounded.
13
14use std::path::{Path, PathBuf};
15use std::sync::Arc;
16use std::time::Duration;
17
18use crate::error::{DciError, Result};
19
/// Resource bounds applied to every corpus operation.
///
/// Defaults (see the [`Default`] impl) are tuned to return useful-but-bounded
/// evidence to an LLM without flooding its context window.
///
/// The type is plain data: it can be cloned and compared by value, which
/// makes it easy to assert on configuration in tests or detect changes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Limits {
    /// Maximum number of matches (search) or paths (find) returned per call.
    pub max_results: usize,
    /// Maximum number of files a single walk will visit before stopping.
    pub max_files_walked: usize,
    /// Size threshold in bytes: larger files are skipped during search and
    /// truncated during read.
    pub max_file_bytes: u64,
    /// Maximum characters kept from any single line before truncation.
    pub max_line_len: usize,
    /// Maximum number of lines a single `read` call may return.
    pub max_read_lines: usize,
    /// Per-operation wall-clock budget.
    pub timeout: Duration,
    /// Whether `.gitignore`/`.ignore` rules are honored during walks.
    ///
    /// `true` suits source-code corpora; set `false` for forensic log corpora
    /// where ignored files may still be evidence.
    pub respect_gitignore: bool,
    /// Whether hidden (dot) files and directories are included in walks.
    pub include_hidden: bool,
}
47
48impl Default for Limits {
49    fn default() -> Self {
50        Self {
51            max_results: 200,
52            max_files_walked: 50_000,
53            max_file_bytes: 8 * 1024 * 1024,
54            max_line_len: 512,
55            max_read_lines: 400,
56            timeout: Duration::from_secs(15),
57            respect_gitignore: true,
58            include_hidden: true,
59        }
60    }
61}
62
63/// A canonicalized, read-only corpus root that all paths are jailed to.
64///
65/// Cheaply cloneable; clones share the same root and limits.
66#[derive(Debug, Clone)]
67pub struct CorpusRoot {
68    inner: Arc<CorpusRootInner>,
69}
70
71#[derive(Debug)]
72struct CorpusRootInner {
73    root: PathBuf,
74    limits: Limits,
75}
76
77impl CorpusRoot {
78    /// Establish a corpus root at `path` with [`Limits::default`].
79    ///
80    /// Fails if the path does not exist or is not a directory.
81    pub fn new(path: impl AsRef<Path>) -> Result<Self> {
82        Self::with_limits(path, Limits::default())
83    }
84
85    /// Establish a corpus root at `path` with explicit limits.
86    pub fn with_limits(path: impl AsRef<Path>, limits: Limits) -> Result<Self> {
87        let requested = path.as_ref();
88        let root = requested
89            .canonicalize()
90            .map_err(|e| DciError::InvalidRoot {
91                path: requested.to_path_buf(),
92                reason: e.to_string(),
93            })?;
94        if !root.is_dir() {
95            return Err(DciError::InvalidRoot {
96                path: root,
97                reason: "not a directory".to_string(),
98            });
99        }
100        Ok(Self {
101            inner: Arc::new(CorpusRootInner { root, limits }),
102        })
103    }
104
105    /// The canonicalized root directory.
106    pub fn root(&self) -> &Path {
107        &self.inner.root
108    }
109
110    /// The active resource limits.
111    pub fn limits(&self) -> &Limits {
112        &self.inner.limits
113    }
114
115    /// Resolve a caller-supplied path against the corpus root and verify it
116    /// stays within the jail.
117    ///
118    /// `requested` may be relative (resolved against the root) or absolute
119    /// (which must still land inside the root). The path must exist; symlinks
120    /// are resolved before the containment check, so a link pointing outside
121    /// the corpus is rejected.
122    pub fn resolve(&self, requested: &str) -> Result<PathBuf> {
123        let candidate = self.join_unchecked(requested);
124
125        let canonical = candidate.canonicalize().map_err(|e| {
126            if e.kind() == std::io::ErrorKind::NotFound {
127                DciError::NotFound {
128                    requested: requested.to_string(),
129                }
130            } else {
131                DciError::Io {
132                    path: candidate.clone(),
133                    source: e,
134                }
135            }
136        })?;
137
138        if !canonical.starts_with(&self.inner.root) {
139            return Err(DciError::PathEscape {
140                requested: requested.to_string(),
141            });
142        }
143        Ok(canonical)
144    }
145
146    /// Render a path (assumed inside the root) as a corpus-relative string for
147    /// display back to the agent. Falls back to the original on failure.
148    pub fn relativize<'a>(&self, path: &'a Path) -> std::borrow::Cow<'a, str> {
149        match path.strip_prefix(&self.inner.root) {
150            Ok(rel) if rel.as_os_str().is_empty() => std::borrow::Cow::Borrowed("."),
151            Ok(rel) => rel.to_string_lossy(),
152            Err(_) => path.to_string_lossy(),
153        }
154    }
155
156    /// Join a requested path to the root without performing the containment
157    /// check (used internally before canonicalization).
158    fn join_unchecked(&self, requested: &str) -> PathBuf {
159        let p = Path::new(requested);
160        if p.is_absolute() {
161            // Strip the leading separator and re-root so absolute-looking
162            // inputs are still interpreted relative to the corpus. The
163            // canonicalize + prefix check is the real guard.
164            let stripped = p.strip_prefix("/").unwrap_or(p);
165            self.inner.root.join(stripped)
166        } else {
167            self.inner.root.join(p)
168        }
169    }
170}
171
#[cfg(test)]
mod tests {
    // Test code is allowed to panic on setup failure; presumably the crate
    // denies these lints elsewhere, so they are relaxed for this module only.
    #![allow(
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing,
        clippy::panic
    )]
    use super::*;
    use std::fs;

    /// Build `<tmp>/corpus/sub/a.txt` plus a sibling `<tmp>/outside.txt`, and
    /// root the jail at `<tmp>/corpus`.
    ///
    /// The returned `TempDir` guard must be kept alive for the duration of the
    /// test; dropping it removes the directory tree.
    fn temp_corpus() -> (tempfile::TempDir, CorpusRoot) {
        let dir = tempfile::tempdir().expect("tempdir");
        let corpus = dir.path().join("corpus");
        fs::create_dir_all(corpus.join("sub")).expect("subdir");
        fs::write(corpus.join("sub/a.txt"), "hello").expect("write");
        fs::write(dir.path().join("outside.txt"), "secret").expect("write outside");
        let root = CorpusRoot::new(&corpus).expect("root");
        (dir, root)
    }

    #[test]
    fn resolves_paths_inside_root() {
        let (_dir, root) = temp_corpus();
        let resolved = root.resolve("sub/a.txt").expect("resolve");
        assert!(resolved.ends_with("sub/a.txt"));
        assert!(resolved.starts_with(root.root()));
    }

    #[test]
    fn rejects_parent_traversal() {
        let (_dir, root) = temp_corpus();
        let err = root.resolve("../../../etc/passwd").unwrap_err();
        // Either it canonicalizes outside (PathEscape) or doesn't exist
        // (NotFound); both deny the escape.
        assert!(matches!(
            err,
            DciError::PathEscape { .. } | DciError::NotFound { .. }
        ));
    }

    // Deterministic variant of the traversal test: the target is a file this
    // test created, so the outcome does not depend on the host filesystem.
    #[cfg(unix)]
    #[test]
    fn rejects_traversal_to_existing_sibling() {
        let (_dir, root) = temp_corpus();
        let err = root.resolve("../outside.txt").unwrap_err();
        assert!(matches!(err, DciError::PathEscape { .. }));
    }

    // Gated as a whole: on non-Unix targets there is no portable symlink call
    // here, and an empty body would report a misleading pass.
    #[cfg(unix)]
    #[test]
    fn rejects_symlink_escape() {
        let (dir, root) = temp_corpus();
        let link = root.root().join("escape");
        std::os::unix::fs::symlink(dir.path(), &link).expect("symlink");

        let err = root.resolve("escape").unwrap_err();
        assert!(matches!(err, DciError::PathEscape { .. }));

        // Traversing *through* the link must be rejected as well.
        let err = root.resolve("escape/outside.txt").unwrap_err();
        assert!(matches!(err, DciError::PathEscape { .. }));
    }

    #[test]
    fn absolute_input_is_rerooted() {
        let (_dir, root) = temp_corpus();
        // An absolute-looking path is reinterpreted relative to the root.
        let resolved = root.resolve("/sub/a.txt").expect("resolve");
        assert!(resolved.ends_with("sub/a.txt"));
        assert!(resolved.starts_with(root.root()));
    }

    #[test]
    fn missing_path_is_not_found() {
        let (_dir, root) = temp_corpus();
        let err = root.resolve("sub/missing.txt").unwrap_err();
        assert!(matches!(err, DciError::NotFound { .. }));
    }

    #[test]
    fn rejects_non_directory_root() {
        let (_dir, root) = temp_corpus();
        let file = root.root().join("sub/a.txt");
        let err = CorpusRoot::new(&file).unwrap_err();
        assert!(matches!(err, DciError::InvalidRoot { .. }));
    }

    #[test]
    fn relativize_strips_root() {
        let (_dir, root) = temp_corpus();
        let resolved = root.resolve("sub/a.txt").expect("resolve");
        // Compare as paths so the assertion is separator-agnostic.
        assert_eq!(
            Path::new(root.relativize(&resolved).as_ref()),
            Path::new("sub/a.txt")
        );
        assert_eq!(root.relativize(root.root()).as_ref(), ".");
    }
}
230}