use std::io;
use std::path::Path;
use std::sync::Mutex;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::time::Instant;
use globset::{Glob, GlobSet, GlobSetBuilder};
use grep::regex::RegexMatcherBuilder;
use grep::searcher::{BinaryDetection, Searcher, SearcherBuilder, Sink, SinkContext, SinkMatch};
use ignore::{WalkBuilder, WalkState};
use serde::{Deserialize, Serialize};
use crate::error::{DciError, Result};
use crate::sandbox::CorpusRoot;
/// Parameters for a content search over the corpus (see `search`).
#[derive(Debug, Clone)]
pub struct SearchQuery {
/// Regular expression to search for; compiled by `search` before walking.
pub pattern: String,
/// Optional glob restricting which corpus-relative paths are searched.
/// A glob without a `/` is matched at any depth (it is prefixed with `**/`).
pub path_glob: Option<String>,
/// When `true`, the pattern is matched case-insensitively.
pub case_insensitive: bool,
/// Number of context lines reported both before and after each match.
pub context_lines: usize,
/// Upper bound on returned hits; falls back to the corpus limit when `None`
/// and is raised to at least 1.
pub max_results: Option<usize>,
}
/// One line reported by `search`: either a matching line or a context line.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SearchHit {
/// Corpus-relative path of the file containing the line.
pub path: String,
/// Line number reported by the searcher; 0 when none was reported.
pub line: u64,
/// Line content with trailing `\n`/`\r` removed, shortened with a trailing
/// `…` when it exceeds the configured maximum line length.
pub text: String,
/// `true` for a matching line, `false` for a surrounding context line.
pub is_match: bool,
}
/// Outcome of a `search` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
/// Hits sorted by path, then line, with matches ahead of context lines on
/// ties; at most the effective result cap.
pub hits: Vec<SearchHit>,
/// Number of files that were handed to the searcher.
pub files_searched: usize,
/// `true` when the walk budget was exceeded, the hit count reached the cap,
/// or the timeout elapsed — i.e. the hits may be incomplete.
pub truncated: bool,
}
pub fn search(corpus: &CorpusRoot, query: &SearchQuery) -> Result<SearchResult> {
let limits = corpus.limits();
let cap = query.max_results.unwrap_or(limits.max_results).max(1);
let matcher = RegexMatcherBuilder::new()
.case_insensitive(query.case_insensitive)
.line_terminator(Some(b'\n'))
.build(&query.pattern)
.map_err(|e| DciError::InvalidPattern(e.to_string()))?;
let glob = query.path_glob.as_deref().map(build_globset).transpose()?;
let hits: Mutex<Vec<SearchHit>> = Mutex::new(Vec::new());
let files_searched = AtomicUsize::new(0);
let files_walked = AtomicUsize::new(0);
let timed_out = AtomicBool::new(false);
let deadline = Instant::now() + limits.timeout;
let hits_ref = &hits;
let files_searched_ref = &files_searched;
let files_walked_ref = &files_walked;
let timed_out_ref = &timed_out;
let matcher_ref = &matcher;
let glob_ref = &glob;
walk(corpus).build_parallel().run(|| {
let matcher = matcher_ref.clone();
let glob = glob_ref.clone();
let context_lines = query.context_lines;
let max_line_len = limits.max_line_len;
let max_file_bytes = limits.max_file_bytes;
let max_files_walked = limits.max_files_walked;
let mut searcher = SearcherBuilder::new()
.line_number(true)
.before_context(context_lines)
.after_context(context_lines)
.binary_detection(BinaryDetection::quit(0))
.build();
Box::new(move |result| {
let entry = match result {
Ok(e) => e,
Err(_) => return WalkState::Continue,
};
if !entry.file_type().is_some_and(|t| t.is_file()) {
return WalkState::Continue;
}
let walked = files_walked_ref.fetch_add(1, Ordering::Relaxed) + 1;
if walked > max_files_walked {
return WalkState::Quit;
}
if Instant::now() >= deadline {
timed_out_ref.store(true, Ordering::Relaxed);
return WalkState::Quit;
}
let path = entry.path();
let rel = corpus.relativize(path).into_owned();
if let Some(set) = &glob {
if !set.is_match(rel.as_str()) {
return WalkState::Continue;
}
}
if let Ok(meta) = entry.metadata() {
if meta.len() > max_file_bytes {
return WalkState::Continue;
}
}
let mut local: Vec<SearchHit> = Vec::new();
let mut sink = CollectSink {
rel: &rel,
hits: &mut local,
remaining: cap,
max_line_len,
};
let _ = searcher.search_path(&matcher, path, &mut sink);
files_searched_ref.fetch_add(1, Ordering::Relaxed);
if !local.is_empty() {
let mut guard = hits_ref.lock().unwrap_or_else(|e| e.into_inner());
guard.extend(local);
}
WalkState::Continue
})
});
let mut hits = hits.into_inner().unwrap_or_else(|e| e.into_inner());
let walked_total = files_walked.load(Ordering::Relaxed);
let collected = hits.len();
let truncated = walked_total > limits.max_files_walked
|| collected >= cap
|| timed_out.load(Ordering::Relaxed);
hits.sort_by(|a, b| {
a.path
.cmp(&b.path)
.then(a.line.cmp(&b.line))
.then(b.is_match.cmp(&a.is_match))
});
hits.truncate(cap);
Ok(SearchResult {
hits,
files_searched: files_searched.load(Ordering::Relaxed),
truncated,
})
}
/// Parameters for a path lookup over the corpus (see `find`).
#[derive(Debug, Clone)]
pub struct FindQuery {
/// Glob matched against corpus-relative paths. A glob without a `/` is
/// matched at any depth (it is prefixed with `**/`).
pub glob: String,
/// Upper bound on returned paths; falls back to the corpus limit when `None`
/// and is raised to at least 1.
pub max_results: Option<usize>,
}
/// Outcome of a `find` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FindResult {
/// Matching corpus-relative paths, sorted and limited to the effective cap.
pub paths: Vec<String>,
/// `true` when the walk budget was exceeded, more paths matched than the cap
/// allows, or the timeout elapsed.
pub truncated: bool,
}
/// Lists the corpus files whose relative path matches `query.glob`.
///
/// Files are visited by a parallel walker; entries that fail to walk are
/// skipped. The walk stops early once more than `limits.max_files_walked`
/// files have been seen or `limits.timeout` has elapsed. The returned paths
/// are sorted and limited to the effective cap (`query.max_results`, or the
/// corpus limit, never below 1).
///
/// # Errors
///
/// Propagates the error from `build_globset` when `query.glob` is invalid.
pub fn find(corpus: &CorpusRoot, query: &FindQuery) -> Result<FindResult> {
    let limits = corpus.limits();
    let cap = query.max_results.unwrap_or(limits.max_results).max(1);
    let globs = build_globset(&query.glob)?;

    // State shared by all walker threads.
    let found: Mutex<Vec<String>> = Mutex::new(Vec::new());
    let walked = AtomicUsize::new(0);
    let expired = AtomicBool::new(false);
    let deadline = Instant::now() + limits.timeout;
    let walk_budget = limits.max_files_walked;

    // The visitor closure is `move`, so it captures references to the shared
    // state instead of the values themselves.
    let (found_ref, walked_ref) = (&found, &walked);
    let (expired_ref, globs_ref) = (&expired, &globs);

    walk(corpus).build_parallel().run(|| {
        Box::new(move |visit| {
            let Ok(entry) = visit else {
                return WalkState::Continue;
            };
            if !entry.file_type().is_some_and(|kind| kind.is_file()) {
                return WalkState::Continue;
            }
            // `fetch_add` yields the previous count, so `>=` here means this
            // file is the first one beyond the budget.
            if walked_ref.fetch_add(1, Ordering::Relaxed) >= walk_budget {
                return WalkState::Quit;
            }
            if Instant::now() >= deadline {
                expired_ref.store(true, Ordering::Relaxed);
                return WalkState::Quit;
            }

            let rel = corpus.relativize(entry.path()).into_owned();
            if globs_ref.is_match(rel.as_str()) {
                found_ref
                    .lock()
                    .unwrap_or_else(|poisoned| poisoned.into_inner())
                    .push(rel);
            }
            WalkState::Continue
        })
    });

    let mut paths = found
        .into_inner()
        .unwrap_or_else(|poisoned| poisoned.into_inner());

    // Evaluated before truncation so an over-full result is reported.
    let truncated = walked.load(Ordering::Relaxed) > walk_budget
        || paths.len() > cap
        || expired.load(Ordering::Relaxed);

    paths.sort();
    paths.truncate(cap);
    Ok(FindResult { paths, truncated })
}
/// A single line of a file together with its 1-based line number.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct NumberedLine {
/// 1-based line number within the file.
pub line: u64,
/// Line content with trailing `\n`/`\r` removed, shortened with a trailing
/// `…` when it exceeds the configured maximum line length.
pub text: String,
}
/// Outcome of a `read_range` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReadResult {
/// Corpus-relative path of the file that was read.
pub path: String,
/// The lines that were read, in file order.
pub lines: Vec<NumberedLine>,
/// `true` when reading stopped because the line quota was reached while at
/// least one further line was available.
pub more_below: bool,
}
/// Reads a window of lines from a file inside the corpus.
///
/// * `path` – path to resolve through `corpus.resolve`.
/// * `start_line` – 1-based first line to return; `None` and `0` both mean 1.
/// * `line_count` – number of lines to return; defaults to, and is clamped
///   by, `limits.max_read_lines`.
///
/// At most `limits.max_file_bytes` bytes of the file are read. Each returned
/// line is decoded lossily as UTF-8 and shortened to `limits.max_line_len`.
///
/// # Errors
///
/// * Propagates the error from `corpus.resolve`.
/// * [`DciError::NotFound`] when the resolved path is not a regular file.
/// * [`DciError::Io`] when the file cannot be opened **or when a read fails
///   part-way through**. Read failures were previously swallowed, producing a
///   silently shortened result with `more_below == false`.
pub fn read_range(
    corpus: &CorpusRoot,
    path: &str,
    start_line: Option<usize>,
    line_count: Option<usize>,
) -> Result<ReadResult> {
    use std::io::{BufRead, BufReader, Read};

    let limits = corpus.limits();
    let resolved = corpus.resolve(path)?;
    if !resolved.is_file() {
        return Err(DciError::NotFound {
            requested: path.to_string(),
        });
    }
    let file = std::fs::File::open(&resolved).map_err(|e| DciError::Io {
        path: resolved.clone(),
        source: e,
    })?;
    // `take` bounds the total number of bytes consumed from the file.
    let mut reader = BufReader::new(file.take(limits.max_file_bytes));

    let start = start_line.unwrap_or(1).max(1);
    let count = line_count
        .unwrap_or(limits.max_read_lines)
        .min(limits.max_read_lines);

    let mut lines = Vec::new();
    let mut more_below = false;
    let mut current_idx: usize = 0;
    // One buffer reused for every line to avoid a per-line allocation.
    let mut line_buf = Vec::new();
    loop {
        line_buf.clear();
        let bytes_read = reader
            .read_until(b'\n', &mut line_buf)
            .map_err(|e| DciError::Io {
                path: resolved.clone(),
                source: e,
            })?;
        if bytes_read == 0 {
            // End of file (or the byte cap was reached).
            break;
        }
        current_idx += 1;
        if current_idx < start {
            continue;
        }
        if lines.len() >= count {
            // A further line exists beyond the requested window.
            more_below = true;
            break;
        }
        let raw = String::from_utf8_lossy(&line_buf);
        lines.push(NumberedLine {
            line: current_idx as u64,
            text: truncate(&raw, limits.max_line_len),
        });
    }

    Ok(ReadResult {
        path: corpus.relativize(&resolved).into_owned(),
        lines,
        more_below,
    })
}
/// Description of one entry returned by `list_dir`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DirEntryInfo {
/// File name of the entry (lossily converted to UTF-8).
pub name: String,
/// One of `"dir"`, `"symlink"`, `"file"` or `"other"`.
pub kind: String,
/// Size in bytes; only populated for `"file"` entries whose metadata could
/// be read.
pub size_bytes: Option<u64>,
}
/// Outcome of a `list_dir` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListResult {
/// Corpus-relative path of the directory that was listed.
pub path: String,
/// Entries sorted with directories first, then by name.
pub entries: Vec<DirEntryInfo>,
/// `true` when the directory held more entries than `limits.max_results`.
pub truncated: bool,
}
/// Lists the immediate children of a directory inside the corpus.
///
/// `path` of `None`, `""` or `"."` refers to the corpus root; anything else is
/// resolved through `corpus.resolve`. At most `limits.max_results` entries are
/// collected — in directory iteration order, before sorting — and `truncated`
/// is set when a further entry was available. Entries that fail to read are
/// skipped. The collected entries are sorted with directories first, then by
/// name.
///
/// # Errors
///
/// * Propagates the error from `corpus.resolve`.
/// * [`DciError::NotFound`] when the resolved path is not a directory.
/// * [`DciError::Io`] when the directory cannot be opened.
pub fn list_dir(corpus: &CorpusRoot, path: Option<&str>) -> Result<ListResult> {
    let limits = corpus.limits();

    let dir = match path {
        None | Some("") | Some(".") => corpus.root().to_path_buf(),
        Some(requested) => corpus.resolve(requested)?,
    };
    if !dir.is_dir() {
        return Err(DciError::NotFound {
            requested: path.unwrap_or(".").to_string(),
        });
    }

    let listing = std::fs::read_dir(&dir).map_err(|source| DciError::Io {
        path: dir.clone(),
        source,
    })?;

    let mut entries = Vec::new();
    let mut truncated = false;
    // `flatten` drops entries that failed to read.
    for item in listing.flatten() {
        if entries.len() >= limits.max_results {
            truncated = true;
            break;
        }
        // `DirEntry::file_type` does not follow symlinks, so a link is
        // reported as "symlink" regardless of its target.
        let (kind, size_bytes) = match item.file_type() {
            Ok(t) if t.is_dir() => ("dir", None),
            Ok(t) if t.is_symlink() => ("symlink", None),
            Ok(t) if t.is_file() => ("file", item.metadata().ok().map(|meta| meta.len())),
            _ => ("other", None),
        };
        entries.push(DirEntryInfo {
            name: item.file_name().to_string_lossy().into_owned(),
            kind: kind.to_string(),
            size_bytes,
        });
    }

    // `false < true`, so directories sort ahead of everything else.
    entries.sort_by(|a, b| {
        let lhs = (a.kind != "dir", &a.name);
        let rhs = (b.kind != "dir", &b.name);
        lhs.cmp(&rhs)
    });

    Ok(ListResult {
        path: corpus.relativize(&dir).into_owned(),
        entries,
        truncated,
    })
}
/// Returns the sorted corpus-relative paths of the files found by walking the
/// corpus, stopping once `limits.max_files_walked` paths have been collected.
///
/// Entries that fail to walk and entries that are not regular files are
/// skipped. This function currently never returns `Err`.
pub fn list_files(corpus: &CorpusRoot) -> Result<Vec<String>> {
    let budget = corpus.limits().max_files_walked;

    let mut files: Vec<String> = walk(corpus)
        .build()
        .filter_map(|visit| visit.ok())
        .filter(|entry| entry.file_type().is_some_and(|kind| kind.is_file()))
        .map(|entry| corpus.relativize(entry.path()).into_owned())
        .take(budget)
        .collect();

    files.sort();
    Ok(files)
}
/// Reads a corpus file as text, up to `limits.max_file_bytes` bytes.
///
/// # Errors
///
/// * Propagates the error from `corpus.resolve`.
/// * [`DciError::NotFound`] when the resolved path is not a regular file.
/// * Propagates the error from `read_file_bounded` on I/O failure.
pub fn read_document(corpus: &CorpusRoot, path: &str) -> Result<String> {
    let target = corpus.resolve(path)?;
    if target.is_file() {
        read_file_bounded(&target, corpus.limits().max_file_bytes)
    } else {
        Err(DciError::NotFound {
            requested: path.to_string(),
        })
    }
}
/// Creates the walker configuration shared by `search`, `find` and
/// `list_files`, rooted at the corpus root.
///
/// All ignore-file handling is switched on or off together according to
/// `limits.respect_gitignore`; hidden entries are skipped unless
/// `limits.include_hidden` is set; symbolic links are never followed.
fn walk(corpus: &CorpusRoot) -> WalkBuilder {
    let limits = corpus.limits();
    let honor_ignores = limits.respect_gitignore;
    let skip_hidden = !limits.include_hidden;

    let mut walker = WalkBuilder::new(corpus.root());
    // `standard_filters` comes first so the explicit settings below have the
    // final say (notably `hidden`).
    walker.standard_filters(honor_ignores);
    walker.git_ignore(honor_ignores);
    walker.git_global(honor_ignores);
    walker.git_exclude(honor_ignores);
    walker.ignore(honor_ignores);
    walker.parents(honor_ignores);
    walker.require_git(false);
    walker.hidden(skip_hidden);
    walker.follow_links(false);
    walker
}
/// Compiles a single glob pattern into a [`GlobSet`].
///
/// A pattern that contains no `/` is prefixed with `**/` before compiling;
/// a pattern containing a `/` is used as written.
///
/// # Errors
///
/// Returns [`DciError::InvalidGlob`] — carrying the pattern as the caller
/// wrote it — when the glob or the set fails to build.
fn build_globset(pattern: &str) -> Result<GlobSet> {
    // Both failure points report the caller's original pattern.
    let invalid = |reason: String| DciError::InvalidGlob {
        glob: pattern.to_string(),
        reason,
    };

    let has_separator = pattern.contains('/');
    let effective = if has_separator {
        pattern.to_string()
    } else {
        format!("**/{pattern}")
    };

    let compiled = Glob::new(&effective).map_err(|err| invalid(err.to_string()))?;
    let mut set = GlobSetBuilder::new();
    set.add(compiled);
    set.build().map_err(|err| invalid(err.to_string()))
}
fn read_file_bounded(path: &Path, max_bytes: u64) -> Result<String> {
use std::io::Read;
let file = std::fs::File::open(path).map_err(|e| DciError::Io {
path: path.to_path_buf(),
source: e,
})?;
let mut handle = file.take(max_bytes);
let mut buf = Vec::new();
handle.read_to_end(&mut buf).map_err(|e| DciError::Io {
path: path.to_path_buf(),
source: e,
})?;
Ok(String::from_utf8_lossy(&buf).into_owned())
}
/// Strips trailing `\n`/`\r` characters from `text` and limits the remainder
/// to `max_len` characters.
///
/// When the stripped text has more than `max_len` characters, the first
/// `max_len` are kept and a single `…` is appended. The cut is made on a
/// character boundary, so multi-byte characters are never split.
fn truncate(text: &str, max_len: usize) -> String {
    let body = text.trim_end_matches(|c: char| c == '\n' || c == '\r');

    // The byte offset of the character at index `max_len`, if any, is where
    // the kept prefix ends.
    if let Some((cut, _)) = body.char_indices().nth(max_len) {
        let mut clipped = String::with_capacity(cut + '…'.len_utf8());
        clipped.push_str(&body[..cut]);
        clipped.push('…');
        clipped
    } else {
        body.to_owned()
    }
}
/// Sink that turns the searcher's match and context callbacks for one file
/// into `SearchHit`s, stopping once its budget is used up.
struct CollectSink<'a> {
/// Corpus-relative path copied onto every hit.
rel: &'a str,
/// Destination buffer for the hits of the file being searched.
hits: &'a mut Vec<SearchHit>,
/// Number of hits that may still be recorded; the search is stopped at 0.
remaining: usize,
/// Maximum number of characters kept from each line before clipping.
max_line_len: usize,
}
impl CollectSink<'_> {
    /// Appends one hit for the current file and uses up one unit of budget.
    ///
    /// Callers must ensure `remaining > 0` beforehand.
    fn record(&mut self, line: u64, bytes: &[u8], is_match: bool) {
        self.hits.push(SearchHit {
            path: self.rel.to_string(),
            line,
            text: truncate(&String::from_utf8_lossy(bytes), self.max_line_len),
            is_match,
        });
        self.remaining -= 1;
    }
}

impl Sink for CollectSink<'_> {
    type Error = io::Error;

    /// Records each line of a match as a hit, numbering lines consecutively
    /// from the match's reported line number (0 when none is reported).
    ///
    /// Returns `Ok(false)` — asking the searcher to stop — once the budget is
    /// exhausted.
    fn matched(&mut self, _searcher: &Searcher, m: &SinkMatch<'_>) -> io::Result<bool> {
        if self.remaining == 0 {
            return Ok(false);
        }
        let first_line = m.line_number().unwrap_or(0);
        // Each recorded line costs exactly one unit, so limiting the iterator
        // to the current budget stops at the same point as checking per line.
        let budget = self.remaining;
        for (offset, bytes) in m.lines().enumerate().take(budget) {
            self.record(first_line + offset as u64, bytes, true);
        }
        Ok(self.remaining > 0)
    }

    /// Records a context line as a non-match hit.
    ///
    /// Returns `Ok(false)` — asking the searcher to stop — once the budget is
    /// exhausted.
    fn context(&mut self, _searcher: &Searcher, ctx: &SinkContext<'_>) -> io::Result<bool> {
        if self.remaining == 0 {
            return Ok(false);
        }
        self.record(ctx.line_number().unwrap_or(0), ctx.bytes(), false);
        Ok(self.remaining > 0)
    }
}