dci-tool 0.1.0

Direct Corpus Interaction: a sandboxed, ripgrep-backed corpus-search toolset and agent for cyber-focused LLM agents, built on rig.
Documentation
//! Synthetic "needle in a haystack" log corpus generator.
//!
//! Builds a large, deterministic corpus of benign-looking log lines with a
//! handful of unique indicator-of-compromise (IOC) tokens injected at known
//! locations, plus the matching BEIR-style [`Qrels`]. This mirrors the
//! cyber/forensic use case — find the rare evidence buried in volume — and is
//! fully offline and reproducible from a seed, so it can drive CI benchmarks.

use std::fs;
use std::io::Write;
use std::path::Path;

use rig_retrieval_evals::dataset::{GoldQuery, Qrels};

use crate::error::{DciError, Result};

/// Configuration for [`generate`].
///
/// Every field is a plain integer, so two configurations compare equal exactly
/// when they describe the same corpus parameters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SyntheticLogConfig {
    /// Number of log files to create. [`generate`] rejects `0`.
    pub files: usize,
    /// Number of benign lines written to each file. Needle lines are injected
    /// on top of these, which is why the final line count is only approximate.
    pub lines_per_file: usize,
    /// Number of unique IOC needles to inject (one gold query each).
    pub needles: usize,
    /// Seed controlling all pseudo-random choices (corpus is reproducible).
    pub seed: u64,
}

impl Default for SyntheticLogConfig {
    /// A small corpus: 8 files of 500 lines each, 12 needles, seed 1.
    fn default() -> Self {
        Self {
            files: 8,
            lines_per_file: 500,
            needles: 12,
            seed: 1,
        }
    }
}

/// Minimal SplitMix64 generator. Implemented locally so the corpus needs no
/// external randomness crate and a given seed yields the same sequence on
/// every platform (only wrapping 64-bit integer arithmetic is used).
struct SplitMix64 {
    state: u64,
}

impl SplitMix64 {
    /// Added to the state on every step.
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
    /// Multiplier applied after the first xor-shift.
    const MIX_A: u64 = 0xBF58_476D_1CE4_E5B9;
    /// Multiplier applied after the second xor-shift.
    const MIX_B: u64 = 0x94D0_49BB_1331_11EB;

    /// Create a generator whose initial state is `seed`.
    fn new(seed: u64) -> Self {
        SplitMix64 { state: seed }
    }

    /// Advance the state and return the next 64-bit output.
    fn next_u64(&mut self) -> u64 {
        self.state = self.state.wrapping_add(Self::INCREMENT);
        let mixed = self.state;
        let mixed = (mixed ^ (mixed >> 30)).wrapping_mul(Self::MIX_A);
        let mixed = (mixed ^ (mixed >> 27)).wrapping_mul(Self::MIX_B);
        mixed ^ (mixed >> 31)
    }

    /// Return a value in `0..bound`, obtained by reducing the next output
    /// modulo `bound`. A `bound` of `0` yields `0` without advancing the
    /// generator.
    fn below(&mut self, bound: usize) -> usize {
        match bound {
            0 => 0,
            limit => (self.next_u64() % limit as u64) as usize,
        }
    }
}

// Value pools for the generated log lines. Entries are picked by PRNG index,
// so reordering or resizing any of these lists changes the corpus produced
// for a given seed.

/// User names sampled for the `user=` field.
const USERS: &[&str] = &["alice", "bob", "carol", "dave", "erin", "frank"];
/// Benign event descriptions sampled for the `event="…"` field.
const ACTIONS: &[&str] = &[
    "login succeeded",
    "session opened",
    "heartbeat ok",
    "config reloaded",
    "cache flushed",
    "job completed",
];
/// Host names sampled for the `host=` field.
const HOSTS: &[&str] = &["web01", "web02", "db01", "cache01", "edge01"];

/// Generate a synthetic log corpus under `dir` and return the gold qrels.
///
/// `dir` must already exist. Files are named `log-000.log`, `log-001.log`, …
/// Each needle injects a line containing a globally-unique IOC token into one
/// file; the corresponding [`GoldQuery`] uses that token as the query and marks
/// that file as the sole relevant document.
pub fn generate(dir: &Path, cfg: &SyntheticLogConfig) -> Result<Qrels> {
    if cfg.files == 0 {
        return Err(DciError::Worker(
            "synthetic generator requires at least one file".to_string(),
        ));
    }
    if !dir.is_dir() {
        return Err(DciError::InvalidRoot {
            path: dir.to_path_buf(),
            reason: "output directory does not exist".to_string(),
        });
    }

    let mut rng = SplitMix64::new(cfg.seed);

    // Decide, before writing, which (file, line) each needle lands on so we can
    // both inject it and record the gold label.
    struct Needle {
        token: String,
        file_idx: usize,
        line_idx: usize,
    }
    let mut needles = Vec::with_capacity(cfg.needles);
    for i in 0..cfg.needles {
        let file_idx = rng.below(cfg.files);
        let line_idx = if cfg.lines_per_file == 0 {
            0
        } else {
            rng.below(cfg.lines_per_file)
        };
        // Tokens are globally unique and lexically distinctive so an exact
        // search has unambiguous recall.
        let token = format!("IOC-{:08x}-{i:04}", cfg.seed);
        needles.push(Needle {
            token,
            file_idx,
            line_idx,
        });
    }

    let file_name = |idx: usize| format!("log-{idx:03}.log");

    for file_idx in 0..cfg.files {
        let path = dir.join(file_name(file_idx));
        let mut file = fs::File::create(&path).map_err(|e| DciError::Io {
            path: path.clone(),
            source: e,
        })?;

        let needles_here: Vec<&Needle> =
            needles.iter().filter(|n| n.file_idx == file_idx).collect();

        for line_idx in 0..cfg.lines_per_file {
            // Deterministic benign line.
            let ts = 1_700_000_000u64 + (file_idx as u64 * 100_000) + line_idx as u64;
            let user = *USERS.get(rng.below(USERS.len())).unwrap_or(&"");
            let host = *HOSTS.get(rng.below(HOSTS.len())).unwrap_or(&"");
            let action = *ACTIONS.get(rng.below(ACTIONS.len())).unwrap_or(&"");
            writeln!(file, "{ts} host={host} user={user} event=\"{action}\"").map_err(|e| {
                DciError::Io {
                    path: path.clone(),
                    source: e,
                }
            })?;

            // Inject any needles anchored to this line.
            for needle in needles_here.iter().filter(|n| n.line_idx == line_idx) {
                writeln!(
                    file,
                    "{ts} host={host} user={user} event=\"alert\" indicator={}",
                    needle.token
                )
                .map_err(|e| DciError::Io {
                    path: path.clone(),
                    source: e,
                })?;
            }
        }
    }

    let queries = needles
        .iter()
        .enumerate()
        .map(|(i, needle)| {
            let mut relevant = std::collections::HashMap::new();
            relevant.insert(file_name(needle.file_idx), 1u8);
            GoldQuery {
                query_id: format!("needle-{i:04}"),
                query: needle.token.clone(),
                relevant_docs: relevant,
                reference_answer: None,
            }
        })
        .collect();

    Ok(Qrels { queries })
}

#[cfg(test)]
mod tests {
    // Tests may panic freely; the crate-level lints forbidding these are
    // relaxed for this module only.
    #![allow(
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing,
        clippy::panic
    )]
    use super::*;

    /// Two runs with the same configuration must produce identical gold
    /// labels and byte-identical log files — all of them, not just the first.
    #[test]
    fn generation_is_deterministic_for_a_seed() {
        let dir_a = tempfile::tempdir().unwrap();
        let dir_b = tempfile::tempdir().unwrap();
        let cfg = SyntheticLogConfig {
            files: 3,
            lines_per_file: 50,
            needles: 5,
            seed: 7,
        };
        let qa = generate(dir_a.path(), &cfg).unwrap();
        let qb = generate(dir_b.path(), &cfg).unwrap();

        assert_eq!(qa.queries.len(), cfg.needles);
        assert_eq!(qb.queries.len(), cfg.needles);

        // Same seed -> same ids, same tokens, same relevant documents.
        for (a, b) in qa.queries.iter().zip(&qb.queries) {
            assert_eq!(a.query_id, b.query_id);
            assert_eq!(a.query, b.query);
            assert_eq!(a.relevant_docs, b.relevant_docs);
        }

        // Same seed -> same bytes in every generated file.
        for idx in 0..cfg.files {
            let name = format!("log-{idx:03}.log");
            let bytes_a = std::fs::read(dir_a.path().join(&name)).unwrap();
            let bytes_b = std::fs::read(dir_b.path().join(&name)).unwrap();
            assert_eq!(bytes_a, bytes_b, "{name} differs between runs");
        }
    }

    /// Every needle token must occur exactly once in its labelled file and
    /// nowhere else, matching the "sole relevant document" contract.
    #[test]
    fn each_needle_token_appears_in_its_file() {
        let dir = tempfile::tempdir().unwrap();
        let cfg = SyntheticLogConfig {
            files: 4,
            lines_per_file: 40,
            needles: 6,
            seed: 3,
        };
        let qrels = generate(dir.path(), &cfg).unwrap();
        assert_eq!(qrels.queries.len(), cfg.needles);

        for query in &qrels.queries {
            assert_eq!(
                query.relevant_docs.len(),
                1,
                "query {} should have exactly one relevant document",
                query.query_id
            );
            let (file, _grade) = query.relevant_docs.iter().next().unwrap();

            for idx in 0..cfg.files {
                let name = format!("log-{idx:03}.log");
                let content = std::fs::read_to_string(dir.path().join(&name)).unwrap();
                let hits = content.matches(query.query.as_str()).count();
                if &name == file {
                    assert_eq!(hits, 1, "token {} should be in {file}", query.query);
                } else {
                    assert_eq!(hits, 0, "token {} should not be in {name}", query.query);
                }
            }
        }
    }
}