dci-tool 0.1.0

Direct Corpus Interaction: a sandboxed, ripgrep-backed corpus-search toolset and agent for cyber-focused LLM agents, built on rig.
Documentation
//! Phase 3 — evaluation and benchmarking.
//!
//! This module measures how well Direct Corpus Interaction *retrieves* evidence
//! versus a vector-embedding baseline, on the same queries and ground truth, so
//! the core thesis ("interrogating raw text can match or beat semantic
//! retrieval") can be tested rather than asserted.
//!
//! It deliberately scores at the **retrieval** layer — a query maps to a ranked
//! list of corpus files — which is deterministic, offline, and directly
//! comparable to a vector store. The scoring primitives (BEIR-style qrels, the
//! six IR metrics, the report with bootstrap confidence intervals) are reused
//! from [`rig_retrieval_evals`]; only the DCI and vector *retrievers* and the
//! synthetic benchmark are new here.
//!
//! Gated behind the `eval` feature.

pub mod agent;
pub mod beir;
pub mod synth;
mod vector;

use std::collections::{HashMap, HashSet};

use rig_retrieval_evals::dataset::{Qrels, RetrievedDoc};
use rig_retrieval_evals::report::MultiReport;
use rig_retrieval_evals::retrieval::{
    HitRateAtK, MapAtK, Mrr, NdcgAtK, PrecisionAtK, RecallAtK, RetrievalMetric,
};
use rig_retrieval_evals::retriever::RetrieveFuture;
use rig_retrieval_evals::score_retriever;

use crate::engine::{self, SearchQuery};
use crate::error::DciError;
use crate::sandbox::CorpusRoot;

/// Re-export of the upstream retriever seam so existing
/// `dci_tool::eval::Retriever` imports keep working. Implementations in this
/// crate (`DciRetriever`, `VectorRetriever`) plug straight into
/// [`rig_retrieval_evals::score_retriever`].
pub use rig_retrieval_evals::Retriever;
pub use vector::VectorRetriever;

/// Tunable parameters shared by every evaluation run.
#[derive(Debug, Clone)]
pub struct EvalConfig {
    /// Depth `k` used both as the retrieval cut-off and for the `@k` metrics.
    pub k: usize,
    /// Number of bootstrap resamples for confidence intervals; `0` turns
    /// confidence intervals off.
    pub bootstrap_iters: usize,
    /// Confidence level of the bootstrap interval (for example `0.95`).
    pub ci_level: f64,
    /// Seed that makes the bootstrap resampling reproducible.
    pub seed: u64,
    /// Label identifying the dataset in the resulting report.
    pub dataset_id: String,
}

impl Default for EvalConfig {
    /// Defaults: `k = 10`, 1000 bootstrap resamples at the 95% level, seed
    /// `42`, and the dataset label `"dci-eval"`.
    fn default() -> Self {
        let dataset_id = String::from("dci-eval");
        Self {
            k: 10,
            bootstrap_iters: 1000,
            ci_level: 0.95,
            seed: 42,
            dataset_id,
        }
    }
}

impl EvalConfig {
    /// Build the six-metric suite used for scoring, in report order:
    /// recall, precision, hit-rate, MRR, MAP and nDCG. Every `@k` metric is
    /// constructed with this config's `k`; MRR takes no cut-off.
    fn metrics(&self) -> Vec<Box<dyn RetrievalMetric>> {
        let depth = self.k;
        let mut suite: Vec<Box<dyn RetrievalMetric>> = Vec::with_capacity(6);
        suite.push(Box::new(RecallAtK::new(depth)));
        suite.push(Box::new(PrecisionAtK::new(depth)));
        suite.push(Box::new(HitRateAtK::new(depth)));
        suite.push(Box::new(Mrr));
        suite.push(Box::new(MapAtK::new(depth)));
        suite.push(Box::new(NdcgAtK::new(depth)));
        suite
    }
}

/// Run the full IR metric suite for `retriever` over `qrels` and return the
/// resulting [`MultiReport`] (recall, precision, hit-rate, MRR, MAP, nDCG).
///
/// This delegates the scoring itself to
/// [`rig_retrieval_evals::score_retriever`], then:
///
/// 1. attaches a bootstrap confidence interval (seeded from `cfg.seed`)
///    unless `cfg.bootstrap_iters` is `0`;
/// 2. tags the report with `cfg.dataset_id` and the retriever's
///    [`Retriever::name`] as the store label.
///
/// # Errors
///
/// Any error reported by the upstream scorer is stringified and returned as
/// [`DciError::Worker`].
pub async fn evaluate(
    retriever: &dyn Retriever,
    qrels: &Qrels,
    cfg: &EvalConfig,
) -> Result<MultiReport, DciError> {
    let suite = cfg.metrics();

    // NOTE(review): the trailing `1` is presumably a concurrency/parallelism
    // knob of `score_retriever` — confirm against the upstream signature.
    let scored = match score_retriever(retriever, qrels, cfg.k, &suite, 1).await {
        Ok(report) => report,
        Err(err) => return Err(DciError::Worker(err.to_string())),
    };

    // Zero resamples means "no confidence intervals": leave the report as-is.
    let with_ci = match cfg.bootstrap_iters {
        0 => scored,
        iters => scored.with_bootstrap(iters, cfg.ci_level, cfg.seed),
    };

    let tagged = with_ci
        .with_dataset(cfg.dataset_id.clone())
        .with_store(retriever.name().to_string());
    Ok(tagged)
}

/// Side-by-side results of scoring DCI and a baseline retriever against the
/// same qrels.
pub struct Comparison {
    /// Metrics for the Direct Corpus Interaction retriever.
    pub dci: MultiReport,
    /// Metrics for the baseline retriever it is compared against.
    pub baseline: MultiReport,
}

impl Comparison {
    /// Score `dci` first and then `baseline`, both on `qrels` and both with
    /// the same `cfg`.
    ///
    /// # Errors
    ///
    /// Returns the first error produced by [`evaluate`]; if scoring `dci`
    /// fails, `baseline` is not evaluated.
    pub async fn run(
        dci: &dyn Retriever,
        baseline: &dyn Retriever,
        qrels: &Qrels,
        cfg: &EvalConfig,
    ) -> Result<Self, DciError> {
        let dci_report = evaluate(dci, qrels, cfg).await?;
        let baseline_report = evaluate(baseline, qrels, cfg).await?;
        Ok(Self {
            dci: dci_report,
            baseline: baseline_report,
        })
    }

    /// Produce a Markdown document with three sections: the DCI report, the
    /// baseline report, and a per-metric delta table (DCI − baseline).
    ///
    /// The delta table comes from the upstream
    /// [`MultiReport::delta_markdown`](rig_retrieval_evals::report::MultiReport::delta_markdown);
    /// when that call yields no table, the delta section is left empty.
    pub fn to_markdown(&self) -> String {
        let dci_md = self.dci.to_markdown();
        let baseline_md = self.baseline.to_markdown();
        let delta_md = self.dci.delta_markdown(&self.baseline).unwrap_or_default();

        let dci_body: &str = &dci_md;
        let baseline_body: &str = &baseline_md;
        let delta_body: &str = &delta_md;

        // (heading, body) pairs, emitted in order.
        let sections = [
            ("## DCI\n\n", dci_body),
            ("\n\n## Baseline\n\n", baseline_body),
            ("\n\n## Delta (DCI − baseline)\n\n", delta_body),
        ];

        let mut doc = String::new();
        for (heading, body) in sections.iter() {
            doc.push_str(heading);
            doc.push_str(body);
        }
        doc
    }
}

/// Direct corpus interaction used *as* a retriever: corpus files are ranked
/// by lexical overlap with the query, using nothing but the in-process
/// ripgrep engine.
///
/// The query's distinct terms are searched case-insensitively; each file is
/// scored by the number of distinct query terms found in it, plus a small
/// bonus that grows with its total number of matching lines. This is the
/// honest "search-the-raw-text" baseline that a DCI agent's first move
/// approximates.
pub struct DciRetriever {
    corpus: CorpusRoot,
    name: String,
}

impl DciRetriever {
    /// Build a retriever that searches `corpus`, labelled `"dci-lexical"`.
    pub fn new(corpus: CorpusRoot) -> Self {
        let name = String::from("dci-lexical");
        Self { corpus, name }
    }

    /// Replace the store label recorded in reports (builder style).
    pub fn with_name(self, name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..self
        }
    }
}

impl Retriever for DciRetriever {
    fn name(&self) -> &str {
        &self.name
    }

    fn retrieve<'a>(&'a self, query: &'a str, k: usize) -> RetrieveFuture<'a> {
        let terms = tokenize(query);
        let corpus = self.corpus.clone();
        Box::pin(async move {
            if terms.is_empty() {
                return Ok(Vec::new());
            }
            let max_results = corpus.limits().max_results;

            // Run a single parallel corpus walk for the whole query: one alternation
            // regex over all terms, rather than one walk per term. Per-file term
            // attribution is recovered afterward from each returned line, keeping the
            // original "distinct terms matched + match-density" ranking semantics at
            // O(1) walks instead of O(terms) walks.
            let joined = tokio::task::spawn_blocking(move || {
                let pattern = terms
                    .iter()
                    .map(|t| regex_escape(t))
                    .collect::<Vec<_>>()
                    .join("|");
                let result = engine::search(
                    &corpus,
                    &SearchQuery {
                        pattern,
                        path_glob: None,
                        case_insensitive: true,
                        context_lines: 0,
                        max_results: Some(max_results),
                    },
                )?;

                let lc_terms: Vec<String> = terms.iter().map(|t| t.to_lowercase()).collect();
                // file -> (distinct term indices matched, total matched lines)
                let mut per_file: HashMap<String, (HashSet<usize>, usize)> = HashMap::new();
                for hit in &result.hits {
                    let entry = per_file.entry(hit.path.clone()).or_default();
                    entry.1 += 1;
                    let line_lc = hit.text.to_lowercase();
                    for (i, term) in lc_terms.iter().enumerate() {
                        if line_lc.contains(term) {
                            entry.0.insert(i);
                        }
                    }
                }

                let mut ranked: Vec<RetrievedDoc> = per_file
                    .into_iter()
                    .map(|(doc_id, (distinct, total))| {
                        let score = distinct.len() as f64 + 0.1 * (1.0 + total as f64).ln();
                        RetrievedDoc { doc_id, score }
                    })
                    .collect();
                // Sort best-first; break ties by path for determinism.
                ranked.sort_by(|a, b| {
                    b.score
                        .partial_cmp(&a.score)
                        .unwrap_or(std::cmp::Ordering::Equal)
                        .then_with(|| a.doc_id.cmp(&b.doc_id))
                });
                ranked.truncate(k);
                Ok::<_, DciError>(ranked)
            })
            .await
            .map_err(|e| rig_retrieval_evals::Error::Config(e.to_string()))?;
            joined.map_err(|e| rig_retrieval_evals::Error::Config(e.to_string()))
        })
    }
}

/// Minimal English stopword list: just enough to keep natural-language
/// queries focused on their content words, with no NLP dependency.
const STOPWORDS: &[&str] = &[
    "the", "a", "an", "of", "to", "in", "is", "are", "was", "were", "and", "or", "for", "on", "at",
    "by", "with", "as", "that", "this", "it", "be", "from", "who", "what", "when", "where",
    "which", "how", "did", "do", "does",
];

/// Turn a free-text query into an ordered list of unique, lowercase search
/// terms.
///
/// Each whitespace-separated token has non-alphanumeric characters trimmed
/// from both ends (interior characters are kept, so `10.0.0.5` and `IOC-..`
/// survive) and is lowercased. Tokens shorter than two bytes, stopwords, and
/// repeats of an earlier term are dropped; the remaining terms keep their
/// first-occurrence order.
fn tokenize(query: &str) -> Vec<String> {
    let mut emitted: HashSet<String> = HashSet::new();
    let mut terms = Vec::new();
    for raw in query.split_whitespace() {
        let term = raw
            .trim_matches(|c: char| !c.is_alphanumeric())
            .to_lowercase();
        if term.len() < 2 || STOPWORDS.contains(&term.as_str()) {
            continue;
        }
        // `insert` returns false for a term already emitted.
        if emitted.insert(term.clone()) {
            terms.push(term);
        }
    }
    terms
}

/// Escape `term` so the regex engine matches it as a literal string.
///
/// Only genuine regex metacharacters are backslash-escaped (the same set the
/// `regex` crate's own `escape` helper uses); every other character is
/// emitted unchanged. Blindly escaping *all* non-alphanumeric characters is
/// not safe under Rust regex syntax:
///
/// * `\<` and `\>` are word-boundary assertions, not literal angle brackets,
///   so a term such as `a<b` would silently never match;
/// * a backslash before a non-ASCII character (e.g. the `’` in `don’t`) is
///   an unrecognized escape and makes the whole pattern fail to compile.
///
/// NOTE(review): this assumes the search engine compiles patterns with
/// Rust `regex` syntax (as ripgrep does) — confirm against `engine::search`.
///
/// Returns a new `String`; an empty `term` yields an empty string.
fn regex_escape(term: &str) -> String {
    // Worst case every character needs a one-byte backslash prefix.
    let mut out = String::with_capacity(term.len() * 2);
    for ch in term.chars() {
        let is_meta = matches!(
            ch,
            '\\' | '.'
                | '+'
                | '*'
                | '?'
                | '('
                | ')'
                | '|'
                | '['
                | ']'
                | '{'
                | '}'
                | '^'
                | '$'
                | '#'
                | '&'
                | '-'
                | '~'
        );
        if is_meta {
            out.push('\\');
        }
        out.push(ch);
    }
    out
}

#[cfg(test)]
mod tests {
    // Test code may unwrap, index and panic freely; failures are the point.
    #![allow(
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing,
        clippy::panic
    )]
    use super::*;

    /// Stopwords ("who") and trailing punctuation ("?") are removed, and the
    /// surviving terms are lowercased and kept in query order.
    #[test]
    fn tokenize_drops_stopwords_and_punctuation() {
        let expected = vec!["wrote".to_string(), "1984".to_string()];
        assert_eq!(tokenize("Who wrote 1984?"), expected);
    }

    /// Punctuation inside a token (the dots of an IP address) is kept.
    #[test]
    fn tokenize_preserves_internal_punctuation() {
        let terms = tokenize("contact from 10.0.0.5 please");
        assert!(terms.iter().any(|term| term == "10.0.0.5"));
    }

    /// Regex metacharacters are backslash-escaped; plain text is untouched.
    #[test]
    fn regex_escape_neutralizes_metachars() {
        let cases = [
            ("10.0.0.5", "10\\.0\\.0\\.5"),
            ("a+b", "a\\+b"),
            ("abc", "abc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(regex_escape(input), expected);
        }
    }
}