// dci-tool 0.1.0 - Docs.rs

//! The in-process corpus engine.
//!
//! Search uses ripgrep's own crates ([`grep`], [`ignore`], [`globset`]) linked
//! directly into the process — the same matching/walking engine as the `rg`
//! binary, but with no subprocess and therefore no shell-injection surface.
//!
//! All functions here are synchronous and blocking; callers on an async runtime
//! should dispatch them via `spawn_blocking` (the tool layer does this).

use std::io;
use std::path::Path;
use std::sync::Mutex;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::time::Instant;

use globset::{Glob, GlobSet, GlobSetBuilder};
use grep::regex::RegexMatcherBuilder;
use grep::searcher::{BinaryDetection, Searcher, SearcherBuilder, Sink, SinkContext, SinkMatch};
use ignore::{WalkBuilder, WalkState};
use serde::{Deserialize, Serialize};

use crate::error::{DciError, Result};
use crate::sandbox::CorpusRoot;

/// Parameters for a [`search`] call.
///
/// `pattern` is the only field without a fallback; the remaining fields narrow
/// or widen what [`search`] reports.
#[derive(Debug, Clone)]
pub struct SearchQuery {
    /// The regular expression to match (ripgrep/Rust regex syntax). A pattern
    /// that fails to compile makes [`search`] return an error.
    pub pattern: String,
    /// Optional glob restricting which files are searched (e.g. `**/*.log`).
    /// It is tested against corpus-relative paths; a glob without `/` is
    /// matched at any depth.
    pub path_glob: Option<String>,
    /// Case-insensitive matching when `true`.
    pub case_insensitive: bool,
    /// Number of context lines to capture on each side of a match (used for
    /// both the before- and after-context of the searcher).
    pub context_lines: usize,
    /// Override for the maximum number of lines returned. `None` falls back to
    /// the corpus limit; the effective value is never lower than 1. Context
    /// lines count toward this cap just like matched lines.
    pub max_results: Option<usize>,
}

/// A single line emitted by [`search`], either a match or surrounding context.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SearchHit {
    /// Corpus-relative path of the file.
    pub path: String,
    /// 1-based line number as reported by the searcher (`0` only if the
    /// searcher supplied no line number).
    pub line: u64,
    /// The line text with trailing `\n`/`\r` removed. Lines longer than the
    /// configured maximum are cut and end in `…`.
    pub text: String,
    /// `true` for the matched line, `false` for surrounding context lines.
    pub is_match: bool,
}

/// Result of a [`search`] call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Collected hits, ordered by path, then line number, with a matched line
    /// ahead of a context line at the same position (not raw walk order).
    pub hits: Vec<SearchHit>,
    /// Number of files handed to the searcher, i.e. files that passed the glob
    /// and size filters. A file whose search failed is still counted.
    pub files_searched: usize,
    /// `true` if a limit may have cut the result short (result cap, walk
    /// limit, or timeout), so more matches may exist.
    pub truncated: bool,
}

/// Run a regular-expression search across the corpus.
///
/// The walk and per-file search run in parallel across ripgrep's worker
/// threads (the same engine as the `rg` binary). To keep results reproducible
/// regardless of thread scheduling, hits are collected per file (each file
/// contributes at most `cap` lines), merged, and then deterministically
/// ordered by `(path, line, match-before-context)` before the result cap is
/// applied. The only non-deterministic bounds are `max_files_walked` and the
/// wall-clock timeout, which act as safety valves.
///
/// # Errors
///
/// Returns [`DciError::InvalidPattern`] when `query.pattern` does not compile,
/// and whatever `build_globset` reports when `query.path_glob` is invalid.
/// Walk errors and per-file search errors are skipped, not reported.
///
/// # Truncation
///
/// `truncated` is set when the walk limit was exceeded, the timeout fired, or
/// at least `cap` lines were collected. The last condition is conservative: a
/// corpus with exactly `cap` hit lines is also flagged.
pub fn search(corpus: &CorpusRoot, query: &SearchQuery) -> Result<SearchResult> {
    let limits = corpus.limits();
    // A cap of zero would make every result empty; clamp it to at least one.
    let cap = query.max_results.unwrap_or(limits.max_results).max(1);

    let matcher = RegexMatcherBuilder::new()
        .case_insensitive(query.case_insensitive)
        .line_terminator(Some(b'\n'))
        .build(&query.pattern)
        .map_err(|e| DciError::InvalidPattern(e.to_string()))?;

    // `None` means "search every file"; an invalid glob aborts the call here.
    let glob = query.path_glob.as_deref().map(build_globset).transpose()?;

    // State shared by all worker threads for the duration of the walk.
    let hits: Mutex<Vec<SearchHit>> = Mutex::new(Vec::new());
    let files_searched = AtomicUsize::new(0);
    let files_walked = AtomicUsize::new(0);
    let timed_out = AtomicBool::new(false);
    let deadline = Instant::now() + limits.timeout;

    // Shared references captured by each per-thread worker closure. The inner
    // closure is `move`, so it must capture references rather than the values.
    let hits_ref = &hits;
    let files_searched_ref = &files_searched;
    let files_walked_ref = &files_walked;
    let timed_out_ref = &timed_out;
    let matcher_ref = &matcher;
    let glob_ref = &glob;

    walk(corpus).build_parallel().run(|| {
        // Each worker thread gets its own matcher clone and searcher: the
        // matcher is cheap to clone and `Searcher` holds non-shareable buffers.
        let matcher = matcher_ref.clone();
        let glob = glob_ref.clone();
        let context_lines = query.context_lines;
        let max_line_len = limits.max_line_len;
        let max_file_bytes = limits.max_file_bytes;
        let max_files_walked = limits.max_files_walked;
        let mut searcher = SearcherBuilder::new()
            .line_number(true)
            .before_context(context_lines)
            .after_context(context_lines)
            .binary_detection(BinaryDetection::quit(0))
            .build();

        Box::new(move |result| {
            // Unreadable entries are skipped rather than failing the search.
            let entry = match result {
                Ok(e) => e,
                Err(_) => return WalkState::Continue,
            };
            // Only regular files are counted and searched.
            if !entry.file_type().is_some_and(|t| t.is_file()) {
                return WalkState::Continue;
            }
            // Every regular file counts toward the walk limit, including ones
            // the glob or size filter rejects below.
            let walked = files_walked_ref.fetch_add(1, Ordering::Relaxed) + 1;
            if walked > max_files_walked {
                return WalkState::Quit;
            }
            // Cooperative wall-clock cancellation: stop the walk between files
            // once the budget is spent and flag the result as truncated, rather
            // than running to completion on a thread the caller has abandoned.
            if Instant::now() >= deadline {
                timed_out_ref.store(true, Ordering::Relaxed);
                return WalkState::Quit;
            }

            let path = entry.path();
            let rel = corpus.relativize(path).into_owned();

            // The glob is tested against the corpus-relative path.
            if let Some(set) = &glob {
                if !set.is_match(rel.as_str()) {
                    return WalkState::Continue;
                }
            }
            // Oversized files are skipped; if the metadata cannot be read the
            // file is searched anyway.
            if let Ok(meta) = entry.metadata() {
                if meta.len() > max_file_bytes {
                    return WalkState::Continue;
                }
            }

            // Bound each file's contribution to `cap` so a single pathological
            // file cannot exhaust memory; the global cap is applied after merge.
            let mut local: Vec<SearchHit> = Vec::new();
            let mut sink = CollectSink {
                rel: &rel,
                hits: &mut local,
                remaining: cap,
                max_line_len,
            };
            // Search errors on a single file (e.g. permissions) are non-fatal.
            let _ = searcher.search_path(&matcher, path, &mut sink);
            files_searched_ref.fetch_add(1, Ordering::Relaxed);

            // Take the shared lock only when this file produced something; a
            // poisoned lock is recovered instead of propagating the panic.
            if !local.is_empty() {
                let mut guard = hits_ref.lock().unwrap_or_else(|e| e.into_inner());
                guard.extend(local);
            }
            WalkState::Continue
        })
    });

    let mut hits = hits.into_inner().unwrap_or_else(|e| e.into_inner());
    let walked_total = files_walked.load(Ordering::Relaxed);
    // Measured before the cap is applied below.
    let collected = hits.len();
    let truncated = walked_total > limits.max_files_walked
        || collected >= cap
        || timed_out.load(Ordering::Relaxed);

    // Deterministic order: by path, then line, with matched lines ahead of
    // their surrounding context at the same line number.
    hits.sort_by(|a, b| {
        a.path
            .cmp(&b.path)
            .then(a.line.cmp(&b.line))
            .then(b.is_match.cmp(&a.is_match))
    });
    hits.truncate(cap);

    Ok(SearchResult {
        hits,
        files_searched: files_searched.load(Ordering::Relaxed),
        truncated,
    })
}

/// Parameters for a [`find`] call.
#[derive(Debug, Clone)]
pub struct FindQuery {
    /// Glob to match against corpus-relative paths (e.g. `**/*.rs`, `auth*`).
    /// A glob without `/` is matched at any depth in the tree.
    pub glob: String,
    /// Override for the maximum number of paths returned. `None` falls back to
    /// the corpus limit; the effective value is never lower than 1.
    pub max_results: Option<usize>,
}

/// Result of a [`find`] call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FindResult {
    /// Matching corpus-relative paths, sorted, and cut to the result cap.
    pub paths: Vec<String>,
    /// `true` if more paths matched than the cap allows, the walk limit was
    /// exceeded, or the timeout fired.
    pub truncated: bool,
}

/// Locate files by matching a glob against their corpus-relative paths.
///
/// Worker threads walk the corpus in parallel and push every matching path
/// into a shared list. The list is sorted before the result cap is applied, so
/// the returned subset does not depend on which thread saw which file first.
/// `max_files_walked` and the wall-clock timeout are the only bounds that can
/// cut the walk itself short; both mark the result as truncated.
///
/// # Errors
///
/// Fails only when `query.glob` cannot be compiled. Entries the walker cannot
/// read are skipped.
pub fn find(corpus: &CorpusRoot, query: &FindQuery) -> Result<FindResult> {
    let limits = corpus.limits();
    // Never allow a cap of zero.
    let cap = query.max_results.unwrap_or(limits.max_results).max(1);
    let set = build_globset(&query.glob)?;
    let walk_limit = limits.max_files_walked;

    let found = Mutex::new(Vec::new());
    let seen_files = AtomicUsize::new(0);
    let expired = AtomicBool::new(false);
    let deadline = Instant::now() + limits.timeout;

    // The per-thread closure is `move`, so hand it references to shared state.
    let (found_ref, seen_ref, expired_ref, set_ref) = (&found, &seen_files, &expired, &set);

    walk(corpus).build_parallel().run(|| {
        Box::new(move |result| {
            let Ok(entry) = result else {
                return WalkState::Continue;
            };
            let is_regular_file = entry.file_type().is_some_and(|t| t.is_file());
            if !is_regular_file {
                return WalkState::Continue;
            }

            // `fetch_add` returns the previous count: stop once the limit has
            // already been reached by earlier files.
            if seen_ref.fetch_add(1, Ordering::Relaxed) >= walk_limit {
                return WalkState::Quit;
            }
            if Instant::now() >= deadline {
                expired_ref.store(true, Ordering::Relaxed);
                return WalkState::Quit;
            }

            let rel = corpus.relativize(entry.path()).into_owned();
            if set_ref.is_match(rel.as_str()) {
                // A poisoned lock still holds usable data; keep collecting.
                found_ref
                    .lock()
                    .unwrap_or_else(|e| e.into_inner())
                    .push(rel);
            }
            WalkState::Continue
        })
    });

    let mut paths = found.into_inner().unwrap_or_else(|e| e.into_inner());
    let walk_limit_hit = seen_files.load(Ordering::Relaxed) > walk_limit;
    let over_cap = paths.len() > cap;
    let truncated = walk_limit_hit || over_cap || expired.load(Ordering::Relaxed);

    // Sort first, cap second: the surviving paths are the smallest `cap` ones.
    paths.sort();
    paths.truncate(cap);

    Ok(FindResult { paths, truncated })
}

/// A numbered line returned by [`read_range`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct NumberedLine {
    /// 1-based line number within the file.
    pub line: u64,
    /// The line text with trailing `\n`/`\r` removed. Lines longer than the
    /// configured maximum are cut and end in `…`.
    pub text: String,
}

/// Result of a [`read_range`] call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReadResult {
    /// Corpus-relative path that was read.
    pub path: String,
    /// The returned lines, in file order.
    pub lines: Vec<NumberedLine>,
    /// `true` if at least one more line was read after the returned window was
    /// already full.
    pub more_below: bool,
}

/// Read a bounded, line-numbered window from a single file.
///
/// `start_line` is 1-based (defaults to 1; `0` is treated as 1). `line_count`
/// defaults to, and is clamped to, the configured read limit. Each returned
/// line has its line ending removed and is shortened to the configured maximum
/// line length.
///
/// Only the first `max_file_bytes` bytes of the file are ever read, so
/// `more_below` describes that bounded view of the file.
///
/// # Errors
///
/// - Whatever `corpus.resolve` reports for a path it rejects.
/// - [`DciError::NotFound`] if the resolved path is not a regular file.
/// - [`DciError::Io`] if the file cannot be opened, or if reading fails part
///   way through. A mid-file read error is reported rather than returned as a
///   silently shortened window.
pub fn read_range(
    corpus: &CorpusRoot,
    path: &str,
    start_line: Option<usize>,
    line_count: Option<usize>,
) -> Result<ReadResult> {
    use std::io::{BufRead, BufReader, Read};

    let limits = corpus.limits();
    let resolved = corpus.resolve(path)?;

    if !resolved.is_file() {
        return Err(DciError::NotFound {
            requested: path.to_string(),
        });
    }

    let file = std::fs::File::open(&resolved).map_err(|e| DciError::Io {
        path: resolved.clone(),
        source: e,
    })?;
    // `take` caps the total bytes read, which also bounds a newline-free file.
    let mut reader = BufReader::new(file.take(limits.max_file_bytes));

    let start = start_line.unwrap_or(1).max(1);
    let count = line_count
        .unwrap_or(limits.max_read_lines)
        .min(limits.max_read_lines);

    let mut lines = Vec::new();
    let mut more_below = false;
    let mut current_idx: usize = 0;
    // One buffer reused for every line, including the skipped ones.
    let mut line_buf = Vec::new();

    loop {
        line_buf.clear();
        let bytes_read = reader
            .read_until(b'\n', &mut line_buf)
            .map_err(|e| DciError::Io {
                path: resolved.clone(),
                source: e,
            })?;
        if bytes_read == 0 {
            break; // end of file (or of the byte budget)
        }
        current_idx += 1;

        if current_idx < start {
            continue;
        }
        // The window is full and one more line was just read: report that and
        // stop without returning it.
        if lines.len() >= count {
            more_below = true;
            break;
        }

        let raw = String::from_utf8_lossy(&line_buf);
        lines.push(NumberedLine {
            line: current_idx as u64,
            text: truncate(&raw, limits.max_line_len),
        });
    }

    Ok(ReadResult {
        path: corpus.relativize(&resolved).into_owned(),
        lines,
        more_below,
    })
}

/// A directory entry returned by [`list_dir`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DirEntryInfo {
    /// Entry name (not a full path); non-UTF-8 names are converted lossily.
    pub name: String,
    /// `"file"`, `"dir"`, `"symlink"`, or `"other"`. `"other"` also covers
    /// entries whose file type could not be determined.
    pub kind: String,
    /// Size in bytes for files; `None` otherwise, or when the file's metadata
    /// could not be read.
    pub size_bytes: Option<u64>,
}

/// Result of a [`list_dir`] call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListResult {
    /// Corpus-relative path of the listed directory.
    pub path: String,
    /// Entries sorted directories-first, then by name. Symlinks, files and
    /// other entries share the second group.
    pub entries: Vec<DirEntryInfo>,
    /// `true` if the listing was capped by a limit, i.e. the directory holds
    /// more entries than were returned.
    pub truncated: bool,
}

/// List the immediate entries of a directory inside the corpus.
///
/// `path` of `None`, `""` or `"."` lists the corpus root; anything else is
/// resolved through `corpus.resolve`.
///
/// Entries are ordered directories-first, then by name, and only then cut to
/// `max_results`. Sorting before capping means the returned subset of a large
/// directory is the same on every call instead of depending on the order in
/// which the operating system enumerates it. The scan is bounded by
/// `max_files_walked` as a safety valve against runaway directories; hitting
/// either bound sets `truncated`.
///
/// File sizes are looked up only for entries that survive the cap. Entries
/// that cannot be read are skipped.
///
/// # Errors
///
/// - Whatever `corpus.resolve` reports for a path it rejects.
/// - [`DciError::NotFound`] if the resolved path is not a directory.
/// - [`DciError::Io`] if the directory cannot be opened.
pub fn list_dir(corpus: &CorpusRoot, path: Option<&str>) -> Result<ListResult> {
    let limits = corpus.limits();
    let resolved = match path {
        Some(p) if !p.is_empty() && p != "." => corpus.resolve(p)?,
        _ => corpus.root().to_path_buf(),
    };

    if !resolved.is_dir() {
        return Err(DciError::NotFound {
            requested: path.unwrap_or(".").to_string(),
        });
    }

    let read_dir = std::fs::read_dir(&resolved).map_err(|e| DciError::Io {
        path: resolved.clone(),
        source: e,
    })?;

    // Pass 1: classify every entry. The `DirEntry` is kept so the size lookup
    // can be deferred until after the cap has been applied.
    let mut scanned: Vec<(&'static str, String, std::fs::DirEntry)> = Vec::new();
    let mut truncated = false;
    for entry in read_dir {
        let entry = match entry {
            Ok(e) => e,
            Err(_) => continue,
        };
        if scanned.len() >= limits.max_files_walked {
            truncated = true;
            break;
        }
        // `DirEntry::file_type` does not follow symlinks, so a link to a
        // directory is reported as "symlink", not "dir".
        let kind = match entry.file_type() {
            Ok(t) if t.is_dir() => "dir",
            Ok(t) if t.is_symlink() => "symlink",
            Ok(t) if t.is_file() => "file",
            _ => "other",
        };
        let name = entry.file_name().to_string_lossy().into_owned();
        scanned.push((kind, name, entry));
    }

    // Pass 2: directories first (`false` sorts before `true`), then by name.
    scanned.sort_by(|a, b| {
        (a.0 != "dir")
            .cmp(&(b.0 != "dir"))
            .then_with(|| a.1.cmp(&b.1))
    });
    if scanned.len() > limits.max_results {
        truncated = true;
        scanned.truncate(limits.max_results);
    }

    // Pass 3: stat only the files that are actually returned.
    let entries = scanned
        .into_iter()
        .map(|(kind, name, entry)| {
            let size_bytes = if kind == "file" {
                entry.metadata().ok().map(|m| m.len())
            } else {
                None
            };
            DirEntryInfo {
                name,
                kind: kind.to_string(),
                size_bytes,
            }
        })
        .collect();

    Ok(ListResult {
        path: corpus.relativize(&resolved).into_owned(),
        entries,
        truncated,
    })
}

/// Collect the corpus-relative path of every regular file the walker yields.
///
/// The walker comes from `walk`, so its ignore and hidden-file settings apply.
/// At most `max_files_walked` paths are gathered; entries the walker cannot
/// read are skipped. The result is sorted.
///
/// Used by corpus-wide consumers such as the evaluation vector baseline, which
/// must materialize the whole corpus to embed it.
pub fn list_files(corpus: &CorpusRoot) -> Result<Vec<String>> {
    let max_files = corpus.limits().max_files_walked;

    let mut paths: Vec<String> = walk(corpus)
        .build()
        .filter_map(|entry| entry.ok())
        .filter(|entry| entry.file_type().is_some_and(|t| t.is_file()))
        // Stop pulling from the walker once the limit is reached.
        .take(max_files)
        .map(|entry| corpus.relativize(entry.path()).into_owned())
        .collect();

    paths.sort();
    Ok(paths)
}

/// Load a file's contents as a lossily decoded UTF-8 string.
///
/// The path is resolved through the corpus jail first. At most
/// `max_file_bytes` bytes are read; anything beyond that is dropped.
///
/// # Errors
///
/// Returns [`DciError::NotFound`] when the resolved path is not a regular
/// file, and passes through errors from path resolution and from reading.
pub fn read_document(corpus: &CorpusRoot, path: &str) -> Result<String> {
    let resolved = corpus.resolve(path)?;
    if resolved.is_file() {
        read_file_bounded(&resolved, corpus.limits().max_file_bytes)
    } else {
        Err(DciError::NotFound {
            requested: path.to_string(),
        })
    }
}

// --- internals ---------------------------------------------------------------

/// Configure a directory walker rooted at the corpus root.
///
/// Ignore-file handling follows `respect_gitignore`, hidden entries are
/// skipped unless `include_hidden` is set, and symlinks are never followed.
/// The caller chooses between the serial and parallel walker by calling
/// `build` or `build_parallel` on the returned builder.
fn walk(corpus: &CorpusRoot) -> WalkBuilder {
    let limits = corpus.limits();
    let honor_ignores = limits.respect_gitignore;
    let skip_hidden = !limits.include_hidden;

    let mut walker = WalkBuilder::new(corpus.root());

    // The blanket toggle goes first so the explicit settings below are the
    // ones that stick.
    walker.standard_filters(honor_ignores);
    walker.git_ignore(honor_ignores);
    walker.git_global(honor_ignores);
    walker.git_exclude(honor_ignores);
    walker.ignore(honor_ignores);
    walker.parents(honor_ignores);

    // Honor `.gitignore` files even when the corpus is not itself a git
    // repository; otherwise ignore rules would be silently skipped.
    walker.require_git(false);

    walker.hidden(skip_hidden);
    walker.follow_links(false);
    walker
}

/// Turn a user-supplied glob into a single-pattern [`GlobSet`].
///
/// A pattern containing `/` is compiled as written. A bare pattern is prefixed
/// with `**/` so that it can match at any depth (`auth*` becomes `**/auth*`).
///
/// # Errors
///
/// Returns [`DciError::InvalidGlob`] carrying the original (un-prefixed)
/// pattern and the compiler's message when the glob or the set fails to build.
fn build_globset(pattern: &str) -> Result<GlobSet> {
    // Both failure points report the pattern exactly as the caller wrote it.
    let invalid = |reason: String| DciError::InvalidGlob {
        glob: pattern.to_string(),
        reason,
    };

    let anchored;
    let normalized: &str = if pattern.contains('/') {
        pattern
    } else {
        anchored = format!("**/{pattern}");
        &anchored
    };

    let compiled = Glob::new(normalized).map_err(|e| invalid(e.to_string()))?;

    let mut set = GlobSetBuilder::new();
    set.add(compiled);
    set.build().map_err(|e| invalid(e.to_string()))
}

/// Read a file, capping the number of bytes read to `max_bytes`.
fn read_file_bounded(path: &Path, max_bytes: u64) -> Result<String> {
    use std::io::Read;
    let file = std::fs::File::open(path).map_err(|e| DciError::Io {
        path: path.to_path_buf(),
        source: e,
    })?;
    let mut handle = file.take(max_bytes);
    let mut buf = Vec::new();
    handle.read_to_end(&mut buf).map_err(|e| DciError::Io {
        path: path.to_path_buf(),
        source: e,
    })?;
    Ok(String::from_utf8_lossy(&buf).into_owned())
}

/// Strip trailing line endings from `text` and limit it to `max_len`
/// characters.
///
/// Trailing `\n` and `\r` characters are always removed. If more than
/// `max_len` characters remain, the first `max_len` are kept and a single `…`
/// is appended. The limit counts `char`s, so the cut never lands inside a
/// multi-byte character.
fn truncate(text: &str, max_len: usize) -> String {
    let body = text.trim_end_matches(['\n', '\r']);

    // Byte offset of the first character beyond the limit, if there is one.
    if let Some((cut, _)) = body.char_indices().nth(max_len) {
        let mut clipped = String::with_capacity(cut + '…'.len_utf8());
        clipped.push_str(&body[..cut]);
        clipped.push('…');
        clipped
    } else {
        body.to_owned()
    }
}

/// Sink that collects matched (and context) lines up to a per-call budget.
///
/// One sink is created per searched file; it borrows the output vector rather
/// than owning it.
struct CollectSink<'a> {
    /// Corpus-relative path copied into every hit this sink records.
    rel: &'a str,
    /// Destination for recorded hits, in the order the searcher reports them.
    hits: &'a mut Vec<SearchHit>,
    /// How many more lines (matches and context alike) may still be recorded.
    remaining: usize,
    /// Maximum number of characters kept from each line before it is cut.
    max_line_len: usize,
}

impl CollectSink<'_> {
    /// Store one line as a hit and charge it against the remaining budget.
    ///
    /// Callers must ensure `remaining` is non-zero before calling.
    fn record(&mut self, line: u64, bytes: &[u8], is_match: bool) {
        let text = truncate(&String::from_utf8_lossy(bytes), self.max_line_len);
        self.hits.push(SearchHit {
            path: self.rel.to_string(),
            line,
            text,
            is_match,
        });
        self.remaining -= 1;
    }
}

impl Sink for CollectSink<'_> {
    type Error = io::Error;

    /// Record the line(s) of a match; returns `false` to stop the search once
    /// the budget is used up.
    fn matched(&mut self, _searcher: &Searcher, m: &SinkMatch<'_>) -> io::Result<bool> {
        if self.remaining == 0 {
            return Ok(false);
        }
        // A SinkMatch may span multiple lines; number them consecutively from
        // the match's first line, taking no more than the budget allows.
        let first_line = m.line_number().unwrap_or(0);
        for (offset, bytes) in m.lines().enumerate().take(self.remaining) {
            self.record(first_line + offset as u64, bytes, true);
        }
        Ok(self.remaining > 0)
    }

    /// Record one context line; returns `false` to stop the search once the
    /// budget is used up.
    fn context(&mut self, _searcher: &Searcher, ctx: &SinkContext<'_>) -> io::Result<bool> {
        if self.remaining == 0 {
            return Ok(false);
        }
        self.record(ctx.line_number().unwrap_or(0), ctx.bytes(), false);
        Ok(self.remaining > 0)
    }
}