dci-tool 0.1.0

Direct Corpus Interaction: a sandboxed, ripgrep-backed corpus-search toolset and agent for cyber-focused LLM agents, built on rig.
Documentation
//! Discrete, typed corpus-interaction tools exposed to a rig [`Agent`].
//!
//! Each tool is a thin, strongly-typed wrapper over [`crate::engine`] that
//! runs the blocking engine work on a worker thread under a wall-clock
//! timeout. The tools are intentionally narrow (search / find / read / list)
//! rather than a single "run a shell command" tool: bounded, structured
//! commands are what state-of-the-art agentic-search systems converged on, and
//! they remove the shell-injection surface entirely.
//!
//! [`Agent`]: rig_core::agent::Agent

use rig_core::completion::ToolDefinition;
use rig_core::tool::Tool;
use serde::Deserialize;
use serde_json::json;

use crate::engine::{
    self, FindQuery, FindResult, ListResult, ReadResult, SearchQuery, SearchResult,
};
use crate::error::DciError;
use crate::sandbox::CorpusRoot;

/// Run a blocking engine operation on a worker thread, bounded by the corpus
/// timeout.
///
/// `op` receives a reference to `corpus` and is executed through
/// [`tokio::task::spawn_blocking`], so the blocking work never runs on the
/// async executor itself.
///
/// The engine walks enforce the same `timeout` cooperatively (stopping between
/// files and returning partial, `truncated` results). This outer bound is a
/// backstop with a grace margin for the rare case where a single operation
/// cannot reach a cancellation point in time; it should seldom fire.
///
/// # Errors
///
/// * Any error returned by `op` is passed through unchanged.
/// * [`DciError::Worker`] when the worker task cannot be joined; the join
///   error is carried as text.
/// * [`DciError::Timeout`] when the backstop elapses first. `millis` reports
///   the backstop that was applied (timeout plus grace), saturated to
///   `u64::MAX` if it does not fit.
async fn run_blocking<T, F>(corpus: CorpusRoot, op: F) -> Result<T, DciError>
where
    F: FnOnce(&CorpusRoot) -> Result<T, DciError> + Send + 'static,
    T: Send + 'static,
{
    // Grace period added on top of the configured timeout.
    const BACKSTOP_GRACE: std::time::Duration = std::time::Duration::from_secs(5);

    let timeout = corpus.limits().timeout;
    // Backstop margin: let the engine's cooperative deadline win in the normal
    // case so callers get partial results instead of a hard timeout error.
    // Saturate rather than `+`: a very large configured timeout (for example
    // `Duration::MAX` used as "no limit") would otherwise panic on overflow.
    let backstop = timeout.saturating_add(BACKSTOP_GRACE);
    let handle = tokio::task::spawn_blocking(move || op(&corpus));
    match tokio::time::timeout(backstop, handle).await {
        Ok(Ok(result)) => result,
        Ok(Err(join_err)) => Err(DciError::Worker(join_err.to_string())),
        // NOTE: dropping the join handle here does not stop the worker; a
        // blocking task that has started keeps running until `op` returns.
        Err(_) => Err(DciError::Timeout {
            // `as_millis` yields a `u128`; convert checked instead of
            // truncating with `as`.
            millis: u64::try_from(backstop.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}

/// Corpus tool that runs regular-expression searches and reports `file:line`
/// evidence.
#[derive(Clone)]
pub struct SearchTool {
    // Root that every search issued through this tool runs against.
    corpus: CorpusRoot,
}

impl SearchTool {
    /// Build a search tool whose calls all operate on `corpus`.
    pub fn new(corpus: CorpusRoot) -> Self {
        SearchTool { corpus }
    }
}

/// Arguments for [`SearchTool`].
///
/// Deserialized from the tool-call JSON. Only `pattern` is mandatory; every
/// other field may be omitted and is then `None`. The struct is also
/// serializable because the tool's `call` renders the arguments to JSON for
/// its telemetry record.
#[derive(Debug, serde::Serialize, Deserialize)]
pub struct SearchArgs {
    /// Regular expression to match (ripgrep/Rust regex syntax).
    pub pattern: String,
    /// Optional glob restricting which files are searched (e.g. `**/*.log`).
    /// Forwarded to the engine query unchanged.
    #[serde(default)]
    pub path_glob: Option<String>,
    /// Case-insensitive matching. Defaults to `false` when omitted.
    #[serde(default)]
    pub case_insensitive: Option<bool>,
    /// Lines of surrounding context to include per match. Defaults to `0`
    /// when omitted.
    #[serde(default)]
    pub context_lines: Option<usize>,
    /// Cap on the number of matches returned. Forwarded to the engine query
    /// unchanged; what `None` means is decided there.
    #[serde(default)]
    pub max_results: Option<usize>,
}

impl Tool for SearchTool {
    const NAME: &'static str = "corpus_search";
    type Error = DciError;
    type Args = SearchArgs;
    type Output = SearchResult;

    async fn definition(&self, _prompt: String) -> ToolDefinition {
        ToolDefinition {
            name: Self::NAME.to_string(),
            description: "Search the corpus with a regular expression and return matching \
                          file paths, line numbers, and line text. Use this first to locate \
                          evidence, then narrow with path_glob or read the surrounding lines."
                .to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "pattern": {
                        "type": "string",
                        "description": "Regular expression to search for (Rust/ripgrep syntax)."
                    },
                    "path_glob": {
                        "type": "string",
                        "description": "Optional glob to restrict files, e.g. '**/*.log' or 'auth*'."
                    },
                    "case_insensitive": {
                        "type": "boolean",
                        "description": "Match case-insensitively. Default false."
                    },
                    "context_lines": {
                        "type": "integer",
                        "description": "Lines of context to include on each side of a match. Default 0."
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum number of matching lines to return."
                    }
                },
                "required": ["pattern"]
            }),
        }
    }

    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
        let args_str = serde_json::to_string(&args).unwrap_or_else(|_| "{}".to_string());
        crate::telemetry::record_tool_call(Self::NAME, &args_str, || async {
            let query = SearchQuery {
                pattern: args.pattern,
                path_glob: args.path_glob,
                case_insensitive: args.case_insensitive.unwrap_or(false),
                context_lines: args.context_lines.unwrap_or(0),
                max_results: args.max_results,
            };
            run_blocking(self.corpus.clone(), move |c| engine::search(c, &query)).await
        })
        .await
    }
}

/// Corpus tool that locates files by matching a glob against their
/// corpus-relative path.
#[derive(Clone)]
pub struct FindTool {
    // Root that every lookup issued through this tool runs against.
    corpus: CorpusRoot,
}

impl FindTool {
    /// Build a find tool whose calls all operate on `corpus`.
    pub fn new(corpus: CorpusRoot) -> Self {
        FindTool { corpus }
    }
}

/// Arguments for [`FindTool`].
///
/// Deserialized from the tool-call JSON. `glob` is mandatory; `max_results`
/// may be omitted and is then `None`. The struct is also serializable because
/// the tool's `call` renders the arguments to JSON for its telemetry record.
#[derive(Debug, serde::Serialize, Deserialize)]
pub struct FindArgs {
    /// Glob to match against corpus-relative paths (e.g. `**/*.rs`, `auth*`).
    pub glob: String,
    /// Cap on the number of paths returned. Forwarded to the engine query
    /// unchanged; what `None` means is decided there.
    #[serde(default)]
    pub max_results: Option<usize>,
}

impl Tool for FindTool {
    const NAME: &'static str = "corpus_find";
    type Error = DciError;
    type Args = FindArgs;
    type Output = FindResult;

    async fn definition(&self, _prompt: String) -> ToolDefinition {
        ToolDefinition {
            name: Self::NAME.to_string(),
            description: "Find files in the corpus whose path matches a glob. Use this to \
                          discover where relevant files live before searching or reading them."
                .to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "glob": {
                        "type": "string",
                        "description": "Glob over relative paths, e.g. '**/*.log', 'src/**/*.rs', or 'passwd'."
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum number of paths to return."
                    }
                },
                "required": ["glob"]
            }),
        }
    }

    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
        let args_str = serde_json::to_string(&args).unwrap_or_else(|_| "{}".to_string());
        crate::telemetry::record_tool_call(Self::NAME, &args_str, || async {
            let query = FindQuery {
                glob: args.glob,
                max_results: args.max_results,
            };
            run_blocking(self.corpus.clone(), move |c| engine::find(c, &query)).await
        })
        .await
    }
}

/// Corpus tool that returns a bounded, line-numbered window of a single file.
#[derive(Clone)]
pub struct ReadTool {
    // Root that every read issued through this tool runs against.
    corpus: CorpusRoot,
}

impl ReadTool {
    /// Build a read tool whose calls all operate on `corpus`.
    pub fn new(corpus: CorpusRoot) -> Self {
        ReadTool { corpus }
    }
}

/// Arguments for [`ReadTool`].
///
/// Deserialized from the tool-call JSON. `path` is mandatory; the window
/// fields may be omitted and are then `None`. The struct is also serializable
/// because the tool's `call` renders the arguments to JSON for its telemetry
/// record.
#[derive(Debug, serde::Serialize, Deserialize)]
pub struct ReadArgs {
    /// Corpus-relative path of the file to read.
    pub path: String,
    /// 1-based line to start at. Defaults to 1.
    /// Passed to the engine as an `Option`; the default is applied there.
    #[serde(default)]
    pub start_line: Option<usize>,
    /// Number of lines to return. Clamped to the configured read limit.
    /// Passed to the engine as an `Option`; the clamping happens there.
    #[serde(default)]
    pub line_count: Option<usize>,
}

impl Tool for ReadTool {
    const NAME: &'static str = "corpus_read";
    type Error = DciError;
    type Args = ReadArgs;
    type Output = ReadResult;

    async fn definition(&self, _prompt: String) -> ToolDefinition {
        ToolDefinition {
            name: Self::NAME.to_string(),
            description: "Read a bounded, line-numbered window from a single corpus file. Use \
                          this to inspect the exact lines around a search hit and quote evidence."
                .to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Corpus-relative path to read."
                    },
                    "start_line": {
                        "type": "integer",
                        "description": "1-based line to start at. Default 1."
                    },
                    "line_count": {
                        "type": "integer",
                        "description": "Number of lines to return (clamped to the read limit)."
                    }
                },
                "required": ["path"]
            }),
        }
    }

    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
        let args_str = serde_json::to_string(&args).unwrap_or_else(|_| "{}".to_string());
        crate::telemetry::record_tool_call(Self::NAME, &args_str, || async {
            run_blocking(self.corpus.clone(), move |c| {
                engine::read_range(c, &args.path, args.start_line, args.line_count)
            })
            .await
        })
        .await
    }
}

/// Corpus tool that lists the immediate entries of a corpus directory.
#[derive(Clone)]
pub struct ListTool {
    // Root that every listing issued through this tool runs against.
    corpus: CorpusRoot,
}

impl ListTool {
    /// Build a list tool whose calls all operate on `corpus`.
    pub fn new(corpus: CorpusRoot) -> Self {
        ListTool { corpus }
    }
}

/// Arguments for [`ListTool`].
///
/// Deserialized from the tool-call JSON; the single field is optional, so an
/// empty object is accepted. The struct is also serializable because the
/// tool's `call` renders the arguments to JSON for its telemetry record.
#[derive(Debug, serde::Serialize, Deserialize)]
pub struct ListArgs {
    /// Corpus-relative directory to list. Defaults to the corpus root.
    /// Passed to the engine as an optional `&str`; the default is applied
    /// there.
    #[serde(default)]
    pub path: Option<String>,
}

impl Tool for ListTool {
    const NAME: &'static str = "corpus_list";
    type Error = DciError;
    type Args = ListArgs;
    type Output = ListResult;

    async fn definition(&self, _prompt: String) -> ToolDefinition {
        ToolDefinition {
            name: Self::NAME.to_string(),
            description: "List the files and subdirectories of a corpus directory to orient \
                          yourself before searching."
                .to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Corpus-relative directory to list. Defaults to the root."
                    }
                }
            }),
        }
    }

    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
        let args_str = serde_json::to_string(&args).unwrap_or_else(|_| "{}".to_string());
        crate::telemetry::record_tool_call(Self::NAME, &args_str, || async {
            run_blocking(self.corpus.clone(), move |c| {
                engine::list_dir(c, args.path.as_deref())
            })
            .await
        })
        .await
    }
}

/// The four corpus tools grouped together, each bound to the same
/// [`CorpusRoot`].
///
/// [`crate::agent`] uses this bundle to register the whole toolset on an
/// agent in one call; the public fields let callers pick out individual
/// tools as well.
#[derive(Clone)]
pub struct CorpusTools {
    /// Regular-expression search.
    pub search: SearchTool,
    /// Glob-based file lookup.
    pub find: FindTool,
    /// Bounded file read.
    pub read: ReadTool,
    /// Directory listing.
    pub list: ListTool,
}

impl CorpusTools {
    /// Construct all four tools over one shared corpus root.
    ///
    /// The root is cloned for the first three tools and moved into the last.
    pub fn new(corpus: CorpusRoot) -> Self {
        let search = SearchTool::new(corpus.clone());
        let find = FindTool::new(corpus.clone());
        let read = ReadTool::new(corpus.clone());
        let list = ListTool::new(corpus);
        Self {
            search,
            find,
            read,
            list,
        }
    }
}