dci-tool 0.1.0

Direct Corpus Interaction: a sandboxed, ripgrep-backed corpus-search toolset and agent for cyber-focused LLM agents, built on rig.
Documentation
//! Sandboxing primitives: a read-only path-jail rooted at a single corpus
//! directory, plus the resource limits that bound every operation.
//!
//! The security model is deliberately conservative:
//!
//! * Every caller-supplied path is canonicalized and must remain a descendant
//!   of the canonicalized corpus root. This defeats `..` traversal and symlink
//!   escapes (canonicalization resolves symlinks before the prefix check).
//! * Operations are read-only; nothing in this crate creates, writes, or
//!   deletes files.
//! * Walks, matches, and outputs are bounded by [`Limits`] so a single tool
//!   call cannot exhaust memory or run unbounded.

use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;

use crate::error::{DciError, Result};

/// Resource bounds applied to every corpus operation.
///
/// Defaults are tuned to return useful-but-bounded evidence to an LLM without
/// flooding its context window. Every field is public, so individual bounds
/// can be overridden with struct-update syntax, e.g.
/// `Limits { max_results: 50, ..Limits::default() }`.
///
/// This type only carries the values; nothing in this struct enforces them.
#[derive(Debug, Clone)]
pub struct Limits {
    /// Maximum number of matches (search) or paths (find) returned per call.
    ///
    /// Defaults to `200`.
    pub max_results: usize,
    /// Maximum number of files a single walk will visit before stopping.
    ///
    /// Defaults to `50_000`.
    pub max_files_walked: usize,
    /// Files larger than this (in bytes) are skipped during search and
    /// truncated during read.
    ///
    /// Defaults to 8 MiB (`8 * 1024 * 1024`).
    pub max_file_bytes: u64,
    /// Maximum characters kept from any single line before truncation.
    ///
    /// Defaults to `512`.
    pub max_line_len: usize,
    /// Maximum number of lines a single `read` call may return.
    ///
    /// Defaults to `400`.
    pub max_read_lines: usize,
    /// Per-operation wall-clock budget.
    ///
    /// Defaults to 15 seconds.
    pub timeout: Duration,
    /// Whether `.gitignore`/`.ignore` rules are honored during walks.
    ///
    /// `true` suits source-code corpora; set `false` for forensic log corpora
    /// where ignored files may still be evidence. Defaults to `true`.
    pub respect_gitignore: bool,
    /// Whether hidden (dot) files and directories are included in walks.
    ///
    /// Defaults to `true`.
    pub include_hidden: bool,
}

impl Default for Limits {
    fn default() -> Self {
        Self {
            max_results: 200,
            max_files_walked: 50_000,
            max_file_bytes: 8 * 1024 * 1024,
            max_line_len: 512,
            max_read_lines: 400,
            timeout: Duration::from_secs(15),
            respect_gitignore: true,
            include_hidden: true,
        }
    }
}

/// A canonicalized, read-only corpus root that all paths are jailed to.
///
/// Cheaply cloneable; clones share the same root and limits, because the
/// state lives behind a single [`Arc`] and cloning only copies that handle.
#[derive(Debug, Clone)]
pub struct CorpusRoot {
    /// Shared, immutable state (root directory plus limits).
    inner: Arc<CorpusRootInner>,
}

/// Shared state behind a [`CorpusRoot`] handle.
#[derive(Debug)]
struct CorpusRootInner {
    /// Corpus directory, stored in the canonicalized form produced by
    /// `CorpusRoot::with_limits`.
    root: PathBuf,
    /// Resource bounds supplied when the root was constructed.
    limits: Limits,
}

impl CorpusRoot {
    /// Establish a corpus root at `path` with [`Limits::default`].
    ///
    /// # Errors
    ///
    /// Returns [`DciError::InvalidRoot`] if `path` cannot be canonicalized
    /// (for example because it does not exist) or is not a directory.
    pub fn new(path: impl AsRef<Path>) -> Result<Self> {
        Self::with_limits(path, Limits::default())
    }

    /// Establish a corpus root at `path` with explicit limits.
    ///
    /// The path is canonicalized once here, so later containment checks can
    /// compare canonical paths by prefix.
    ///
    /// # Errors
    ///
    /// Returns [`DciError::InvalidRoot`] if `path` cannot be canonicalized
    /// (the I/O error text becomes the `reason`) or if the canonical path is
    /// not a directory.
    pub fn with_limits(path: impl AsRef<Path>, limits: Limits) -> Result<Self> {
        let requested = path.as_ref();
        let root = requested
            .canonicalize()
            .map_err(|e| DciError::InvalidRoot {
                path: requested.to_path_buf(),
                reason: e.to_string(),
            })?;
        if !root.is_dir() {
            return Err(DciError::InvalidRoot {
                path: root,
                reason: "not a directory".to_string(),
            });
        }
        Ok(Self {
            inner: Arc::new(CorpusRootInner { root, limits }),
        })
    }

    /// The canonicalized root directory.
    pub fn root(&self) -> &Path {
        &self.inner.root
    }

    /// The active resource limits.
    pub fn limits(&self) -> &Limits {
        &self.inner.limits
    }

    /// Resolve a caller-supplied path against the corpus root and verify it
    /// stays within the jail.
    ///
    /// `requested` is always interpreted relative to the corpus root. An
    /// absolute-looking input is re-rooted rather than treated as a
    /// filesystem-absolute path: its leading root separator (and, on Windows,
    /// any drive or UNC prefix) is dropped, so `/sub/a.txt` names
    /// `<root>/sub/a.txt`. The path must exist; symlinks are resolved before
    /// the containment check, so a link pointing outside the corpus is
    /// rejected.
    ///
    /// # Errors
    ///
    /// * [`DciError::NotFound`] if canonicalization reports that the path
    ///   does not exist.
    /// * [`DciError::Io`] for any other canonicalization failure.
    /// * [`DciError::PathEscape`] if the canonical path is not a descendant
    ///   of (or equal to) the corpus root.
    pub fn resolve(&self, requested: &str) -> Result<PathBuf> {
        let candidate = self.join_unchecked(requested);

        let canonical = match candidate.canonicalize() {
            Ok(path) => path,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                return Err(DciError::NotFound {
                    requested: requested.to_string(),
                });
            }
            // `candidate` is not needed past this point, so it is moved into
            // the error instead of cloned.
            Err(source) => {
                return Err(DciError::Io {
                    path: candidate,
                    source,
                });
            }
        };

        // Component-wise prefix check on canonical paths: this is the actual
        // jail. Everything before it only builds the candidate.
        if !canonical.starts_with(&self.inner.root) {
            return Err(DciError::PathEscape {
                requested: requested.to_string(),
            });
        }
        Ok(canonical)
    }

    /// Render a path (assumed inside the root) as a corpus-relative string for
    /// display back to the agent.
    ///
    /// The root itself renders as `"."`. A path that is not under the root is
    /// returned whole. Conversion to text is lossy for non-UTF-8 paths.
    pub fn relativize<'a>(&self, path: &'a Path) -> std::borrow::Cow<'a, str> {
        match path.strip_prefix(&self.inner.root) {
            Ok(rel) if rel.as_os_str().is_empty() => std::borrow::Cow::Borrowed("."),
            Ok(rel) => rel.to_string_lossy(),
            Err(_) => path.to_string_lossy(),
        }
    }

    /// Join a requested path to the root without performing the containment
    /// check (used internally before canonicalization).
    ///
    /// Leading prefix/root components are stripped so that absolute-looking
    /// inputs are re-rooted under the corpus on every platform. Inputs with
    /// no such leading component are joined verbatim.
    fn join_unchecked(&self, requested: &str) -> PathBuf {
        use std::path::Component;

        let p = Path::new(requested);

        // `Path::is_absolute` is false on Windows for `\foo` and `C:foo`, and
        // `PathBuf::join` would let either of those replace (part of) the
        // root. Inspecting the first component catches every anchored form.
        let anchored = matches!(
            p.components().next(),
            Some(Component::Prefix(_)) | Some(Component::RootDir)
        );
        if !anchored {
            return self.inner.root.join(p);
        }

        // Prefix and RootDir can only appear at the front of a path. `..`
        // components are deliberately kept: the canonicalize + prefix check
        // in `resolve` is the real guard.
        let relative: PathBuf = p
            .components()
            .skip_while(|c| matches!(c, Component::Prefix(_) | Component::RootDir))
            .collect();
        self.inner.root.join(relative)
    }
}

#[cfg(test)]
mod tests {
    // Tests may unwrap/expect/panic freely: a panic here is the failure signal.
    #![allow(
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing,
        clippy::panic
    )]
    use super::*;
    use std::fs;

    /// Build a throwaway corpus containing `sub/a.txt`. The `TempDir` guard is
    /// returned so the directory outlives the test body.
    fn temp_corpus() -> (tempfile::TempDir, CorpusRoot) {
        let dir = tempfile::tempdir().expect("tempdir");
        fs::create_dir(dir.path().join("sub")).expect("subdir");
        fs::write(dir.path().join("sub/a.txt"), "hello").expect("write");
        let root = CorpusRoot::new(dir.path()).expect("root");
        (dir, root)
    }

    #[test]
    fn resolves_paths_inside_root() {
        let (_dir, root) = temp_corpus();
        let resolved = root.resolve("sub/a.txt").expect("resolve");
        assert!(resolved.ends_with("sub/a.txt"));
    }

    #[test]
    fn rejects_parent_traversal() {
        let (_dir, root) = temp_corpus();
        let err = root.resolve("../../../etc/passwd").unwrap_err();
        // Either it canonicalizes outside (PathEscape) or doesn't exist
        // (NotFound); both deny the escape.
        assert!(matches!(
            err,
            DciError::PathEscape { .. } | DciError::NotFound { .. }
        ));
    }

    // Gated as a whole: with only the body gated, non-Unix builds compiled an
    // empty test that passed without checking anything and warned about
    // unused locals.
    #[cfg(unix)]
    #[test]
    fn rejects_symlink_escape() {
        let (dir, root) = temp_corpus();
        let outside = dir.path().parent().expect("parent");
        let link = dir.path().join("escape");
        std::os::unix::fs::symlink(outside, &link).expect("symlink");
        let err = root.resolve("escape").unwrap_err();
        assert!(matches!(err, DciError::PathEscape { .. }));
    }

    #[test]
    fn absolute_input_is_rerooted() {
        let (_dir, root) = temp_corpus();
        // An absolute-looking path is reinterpreted relative to the root.
        let resolved = root.resolve("/sub/a.txt").expect("resolve");
        assert!(resolved.ends_with("sub/a.txt"));
    }

    #[test]
    fn missing_path_is_not_found() {
        let (_dir, root) = temp_corpus();
        let err = root.resolve("sub/missing.txt").unwrap_err();
        assert!(matches!(err, DciError::NotFound { .. }));
    }

    #[test]
    fn relativize_renders_corpus_relative_paths() {
        let (_dir, root) = temp_corpus();
        // The root itself is shown as ".".
        assert_eq!(&*root.relativize(root.root()), ".");

        let resolved = root.resolve("sub/a.txt").expect("resolve");
        let shown = root.relativize(&resolved);
        // Compared as paths so the separator style does not matter.
        assert_eq!(Path::new(&*shown), Path::new("sub/a.txt"));
    }
}