use std::collections::VecDeque;
use std::path::PathBuf;
use std::sync::Arc;
use globset::{Glob, GlobSet, GlobSetBuilder};
use grep_matcher::Matcher;
use grep_regex::{RegexMatcher, RegexMatcherBuilder};
use grep_searcher::{Searcher, SearcherBuilder, Sink, SinkContext, SinkContextKind, SinkMatch};
use harn_vm::process_sandbox::FsAccess;
use harn_vm::VmValue;
use ignore::WalkBuilder;
use crate::error::HostlibError;
use crate::tools::args::{
build_dict, dict_arg, optional_bool, optional_int, optional_string, optional_string_list,
require_string, str_value,
};
use crate::tools::permissions::enforce_path_scope;
/// Builtin name attached to every error raised here and passed to the
/// argument-parsing helpers.
const BUILTIN: &str = "hostlib_tools_search";
/// Default for the `max_line_bytes` argument when the caller omits it.
const DEFAULT_MAX_LINE_BYTES: usize = 1024;
/// Smallest accepted `max_line_bytes` value (inclusive).
const MIN_MAX_LINE_BYTES: i64 = 64;
/// Largest accepted `max_line_bytes` value (inclusive).
const HARD_MAX_LINE_BYTES: i64 = 64 * 1024;
/// Marker prepended by `clip_text` when the start of a line was cut off.
const CLIP_PREFIX: &str = "[truncated] ... ";
/// Marker appended by `clip_text` when the end of a line was cut off.
const CLIP_SUFFIX: &str = " ... [truncated]";
pub(super) fn run(args: &[VmValue]) -> Result<VmValue, HostlibError> {
let raw = dict_arg(BUILTIN, args)?;
let dict = raw.as_ref();
let pattern = require_string(BUILTIN, dict, "pattern")?;
if pattern.is_empty() {
return Err(HostlibError::InvalidParameter {
builtin: BUILTIN,
param: "pattern",
message: "pattern must not be empty".to_string(),
});
}
let path = optional_string(BUILTIN, dict, "path")?
.map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("."));
enforce_path_scope(BUILTIN, &path, FsAccess::Read)?;
let glob = optional_string(BUILTIN, dict, "glob")?;
let exclude_globs = optional_string_list(BUILTIN, dict, "exclude_globs")?;
let case_insensitive = optional_bool(BUILTIN, dict, "case_insensitive", false)?;
let fixed_strings = optional_bool(BUILTIN, dict, "fixed_strings", false)?;
let include_hidden = optional_bool(BUILTIN, dict, "include_hidden", false)?;
let max_matches = optional_int(BUILTIN, dict, "max_matches", 1000)?;
let max_line_bytes = optional_int(
BUILTIN,
dict,
"max_line_bytes",
DEFAULT_MAX_LINE_BYTES as i64,
)?;
let context_before = optional_int(BUILTIN, dict, "context_before", 0)?;
let context_after = optional_int(BUILTIN, dict, "context_after", 0)?;
if max_matches < 1 {
return Err(HostlibError::InvalidParameter {
builtin: BUILTIN,
param: "max_matches",
message: "must be >= 1".to_string(),
});
}
if !(MIN_MAX_LINE_BYTES..=HARD_MAX_LINE_BYTES).contains(&max_line_bytes) {
return Err(HostlibError::InvalidParameter {
builtin: BUILTIN,
param: "max_line_bytes",
message: format!("must be between {MIN_MAX_LINE_BYTES} and {HARD_MAX_LINE_BYTES}"),
});
}
if context_before < 0 {
return Err(HostlibError::InvalidParameter {
builtin: BUILTIN,
param: "context_before",
message: "must be >= 0".to_string(),
});
}
if context_after < 0 {
return Err(HostlibError::InvalidParameter {
builtin: BUILTIN,
param: "context_after",
message: "must be >= 0".to_string(),
});
}
let max_matches = max_matches as usize;
let max_line_bytes = max_line_bytes as usize;
let context_before = context_before as usize;
let context_after = context_after as usize;
let matcher = build_matcher(&pattern, case_insensitive, fixed_strings)?;
let include_set = build_include_glob(glob)?;
let exclude_set = build_exclude_globs(exclude_globs)?;
let mut walker = WalkBuilder::new(&path);
walker
.hidden(!include_hidden)
.ignore(true)
.git_ignore(true)
.git_global(true)
.git_exclude(true)
.require_git(false)
.parents(true);
let mut all_rows: Vec<RowWithPath> = Vec::new();
let mut truncated = false;
'outer: for entry in walker.build() {
let entry = match entry {
Ok(e) => e,
Err(_) => continue,
};
if !entry.file_type().is_some_and(|ft| ft.is_file()) {
continue;
}
let file_path = entry.path().to_path_buf();
if !included_by_globs(&path, &file_path, include_set.as_ref()) {
continue;
}
if excluded_by_globs(&path, &file_path, exclude_set.as_ref()) {
continue;
}
let mut sink = CollectorSink {
matcher: &matcher,
rows: Vec::new(),
pending_before: VecDeque::new(),
context_before,
remaining: max_matches.saturating_sub(all_rows.len()),
max_line_bytes,
};
let mut searcher = SearcherBuilder::new()
.before_context(context_before)
.after_context(context_after)
.line_number(true)
.build();
if let Err(err) = searcher.search_path(&matcher, &file_path, &mut sink) {
let _ = err;
continue;
}
truncated |= sink.rows.iter().any(|row| row.truncated);
for row in sink.rows {
all_rows.push(RowWithPath {
path: file_path.clone(),
row,
});
if all_rows.len() >= max_matches {
truncated = true;
break 'outer;
}
}
if all_rows.len() >= max_matches {
truncated = true;
break 'outer;
}
}
let matches: Vec<VmValue> = all_rows.into_iter().map(row_to_value).collect();
Ok(build_dict([
("matches", VmValue::List(Arc::new(matches))),
("truncated", VmValue::Bool(truncated)),
]))
}
/// Compiles `pattern` into a [`RegexMatcher`].
///
/// `case_insensitive` and `fixed_strings` are forwarded to the matcher
/// builder unchanged. A pattern that fails to compile is reported as an
/// `InvalidParameter` error on the `pattern` argument.
fn build_matcher(
    pattern: &str,
    case_insensitive: bool,
    fixed_strings: bool,
) -> Result<RegexMatcher, HostlibError> {
    let compiled = RegexMatcherBuilder::new()
        .case_insensitive(case_insensitive)
        .fixed_strings(fixed_strings)
        .build(pattern);
    match compiled {
        Ok(matcher) => Ok(matcher),
        Err(err) => Err(HostlibError::InvalidParameter {
            builtin: BUILTIN,
            param: "pattern",
            message: format!("invalid regex: {err}"),
        }),
    }
}
/// Builds the include filter from the optional `glob` argument.
///
/// Returns `Ok(None)` when no glob was supplied, otherwise the set produced by
/// `build_glob_set` for that single pattern.
fn build_include_glob(pattern: Option<String>) -> Result<Option<GlobSet>, HostlibError> {
    match pattern {
        Some(single) => build_glob_set([single], "glob"),
        None => Ok(None),
    }
}
/// Builds the exclude filter from the `exclude_globs` argument.
///
/// An empty list yields `Ok(None)`; otherwise all patterns are compiled into
/// one set by `build_glob_set`.
fn build_exclude_globs(patterns: Vec<String>) -> Result<Option<GlobSet>, HostlibError> {
    if !patterns.is_empty() {
        build_glob_set(patterns, "exclude_globs")
    } else {
        Ok(None)
    }
}
/// Compiles `patterns` into a single [`GlobSet`].
///
/// Every pattern is expanded through `normalize_glob_variants` and each
/// variant is added to the set. `param` names the argument the patterns came
/// from and is used in the resulting `InvalidParameter` error; the error
/// message quotes the pattern as supplied by the caller, not the variant.
fn build_glob_set(
    patterns: impl IntoIterator<Item = String>,
    param: &'static str,
) -> Result<Option<GlobSet>, HostlibError> {
    let mut set_builder = GlobSetBuilder::new();
    for original in patterns {
        for variant in normalize_glob_variants(&original) {
            match Glob::new(&variant) {
                Ok(compiled) => {
                    set_builder.add(compiled);
                }
                Err(err) => {
                    return Err(HostlibError::InvalidParameter {
                        builtin: BUILTIN,
                        param,
                        message: format!("invalid glob `{original}`: {err}"),
                    });
                }
            }
        }
    }
    match set_builder.build() {
        Ok(set) => Ok(Some(set)),
        Err(err) => Err(HostlibError::InvalidParameter {
            builtin: BUILTIN,
            param,
            message: format!("invalid glob set: {err}"),
        }),
    }
}
/// Expands a user-supplied glob into the variants that should be compiled.
///
/// Backslashes are first rewritten to `/`. A bare `*` or a glob that already
/// starts with `**/` is returned as the only variant. Any other glob yields
/// two variants: the glob itself and the same glob prefixed with `**/`, so it
/// also matches below any directory.
fn normalize_glob_variants(glob: &str) -> Vec<String> {
    let glob = glob.replace('\\', "/");
    if glob == "*" || glob.starts_with("**/") {
        return vec![glob];
    }
    // The prefixed form is always strictly longer than `glob`, so the two
    // variants can never be equal and no de-duplication is needed.
    let anywhere = format!("**/{glob}");
    vec![glob, anywhere]
}
/// Reports whether `file_path` passes the include filter.
///
/// With no filter every file passes. Otherwise the path is matched relative
/// to `root`, falling back to the full path when it is not below `root`.
fn included_by_globs(
    root: &std::path::Path,
    file_path: &std::path::Path,
    set: Option<&GlobSet>,
) -> bool {
    match set {
        None => true,
        Some(globs) => {
            let relative = file_path.strip_prefix(root).unwrap_or(file_path);
            globs.is_match(relative)
        }
    }
}
/// Reports whether `file_path` is rejected by the exclude filter.
///
/// With no filter nothing is excluded. Otherwise the path is matched relative
/// to `root`, falling back to the full path when it is not below `root`.
fn excluded_by_globs(
    root: &std::path::Path,
    file_path: &std::path::Path,
    set: Option<&GlobSet>,
) -> bool {
    set.is_some_and(|globs| {
        let relative = file_path.strip_prefix(root).unwrap_or(file_path);
        globs.is_match(relative)
    })
}
/// One matching line collected by `CollectorSink`, before a path is attached.
#[derive(Debug, Clone)]
struct MatchRow {
// Line number reported by the searcher; 0 when the searcher gave none.
line: u64,
// 1-based byte offset of the match start within the line; stays 1 when the
// matcher could not locate the match again.
column: u64,
// The matching line without trailing `\n` / `\r`, clipped by `clip_text`.
text: String,
// Clipped context lines that preceded the match.
context_before: VecDeque<String>,
// Clipped context lines that followed the match.
context_after: VecDeque<String>,
// True when `text` or any context line of this row was clipped.
truncated: bool,
}
/// A `MatchRow` paired with the file it was found in.
struct RowWithPath {
// Path of the file as yielded by the directory walker.
path: PathBuf,
// The match found in that file.
row: MatchRow,
}
/// A buffered before-context line waiting for the match it belongs to.
struct ContextLine {
// Line content after clipping.
text: String,
// Whether clipping shortened the line.
truncated: bool,
}
/// `Sink` implementation that gathers matches of a single file into `rows`.
struct CollectorSink<'a> {
// Used to locate the match inside each matched line (column and clip anchor).
matcher: &'a RegexMatcher,
// Matches collected so far for the current file.
rows: Vec<MatchRow>,
// Before-context lines received since the last match; moved into the next row.
pending_before: VecDeque<ContextLine>,
// Upper bound on the number of lines kept in `pending_before`.
context_before: usize,
// Number of matches this sink may still record.
remaining: usize,
// Byte budget handed to `clip_text` for every line.
max_line_bytes: usize,
}
/// Collects matches and their context lines from a `grep_searcher` search.
///
/// The sink records at most `remaining` matches. Once the quota is used up it
/// keeps the search alive only long enough to receive the after-context of the
/// final match, then stops it.
impl Sink for CollectorSink<'_> {
    type Error = std::io::Error;

    /// Records one matching line as a new `MatchRow`.
    ///
    /// Returns `Ok(false)` (stop searching) when no quota is left, or when the
    /// quota has just been exhausted and no after-context was requested.
    fn matched(
        &mut self,
        searcher: &Searcher,
        sink_match: &SinkMatch<'_>,
    ) -> Result<bool, std::io::Error> {
        if self.remaining == 0 {
            return Ok(false);
        }

        let bytes = sink_match.bytes();
        let line_number = sink_match.line_number().unwrap_or(0);
        // Lines that are not valid UTF-8 are reported with an empty text.
        let trimmed = std::str::from_utf8(bytes)
            .unwrap_or("")
            .trim_end_matches(['\n', '\r']);

        // Re-run the matcher on the line to learn where the match starts. The
        // column is a 1-based byte offset; the anchor keeps the match inside
        // the window when the line has to be clipped.
        let (column, anchor) = match self.matcher.find(bytes) {
            Ok(Some(m)) => (
                (m.start() as u64) + 1,
                Some(m.start().min(trimmed.len())),
            ),
            _ => (1, None),
        };

        // Hand the buffered before-context over to this row.
        let mut context_clipped = false;
        let before: VecDeque<String> = std::mem::take(&mut self.pending_before)
            .into_iter()
            .map(|line| {
                context_clipped |= line.truncated;
                line.text
            })
            .collect();

        let (text, text_clipped) = clip_text(trimmed, self.max_line_bytes, anchor);
        self.rows.push(MatchRow {
            line: line_number,
            column,
            text,
            context_before: before,
            context_after: VecDeque::new(),
            truncated: context_clipped || text_clipped,
        });
        self.remaining -= 1;

        // Returning `false` here ends the search immediately, before any
        // after-context is reported. When the quota is exhausted but
        // after-context was requested, keep going so the last row still gets
        // its trailing lines; `context` (or the next `matched` call) stops the
        // search afterwards.
        Ok(self.remaining > 0 || searcher.after_context() > 0)
    }

    /// Receives a context line and attaches it to the right place.
    ///
    /// Before-context is buffered (bounded by `context_before`) until the next
    /// match; after-context is appended to the most recent row. Returns
    /// `Ok(false)` once the quota is exhausted and the final row has received
    /// all of its after-context.
    fn context(
        &mut self,
        searcher: &Searcher,
        ctx: &SinkContext<'_>,
    ) -> Result<bool, std::io::Error> {
        let line = std::str::from_utf8(ctx.bytes()).unwrap_or("");
        let trimmed = line.trim_end_matches(['\n', '\r']);
        let (text, truncated) = clip_text(trimmed, self.max_line_bytes, None);
        match ctx.kind() {
            SinkContextKind::Before => {
                self.pending_before
                    .push_back(ContextLine { text, truncated });
                while self.pending_before.len() > self.context_before {
                    self.pending_before.pop_front();
                }
            }
            SinkContextKind::After => {
                if let Some(last) = self.rows.last_mut() {
                    last.context_after.push_back(text);
                    last.truncated |= truncated;
                    // Quota used up and the final row is complete: nothing
                    // further in this file can be recorded.
                    if self.remaining == 0
                        && last.context_after.len() >= searcher.after_context()
                    {
                        return Ok(false);
                    }
                }
            }
            SinkContextKind::Other => {}
        }
        Ok(true)
    }
}
/// Renders `path` as a string that always uses `/` as the separator.
///
/// Non-UTF-8 components are converted lossily. On platforms whose separator
/// is already `/` the text is returned unchanged.
fn to_agent_path(path: &std::path::Path) -> String {
    let text = path.to_string_lossy();
    match std::path::MAIN_SEPARATOR {
        '/' => text.into_owned(),
        separator => text.replace(separator, "/"),
    }
}
fn row_to_value(rwp: RowWithPath) -> VmValue {
let RowWithPath { path, row } = rwp;
let MatchRow {
line,
column,
text,
context_before,
context_after,
truncated: _,
} = row;
let before: Vec<VmValue> = context_before.into_iter().map(str_value).collect();
let after: Vec<VmValue> = context_after.into_iter().map(str_value).collect();
build_dict([
("path", str_value(to_agent_path(&path))),
("line", VmValue::Int(line as i64)),
("column", VmValue::Int(column as i64)),
("text", str_value(text)),
("context_before", VmValue::List(Arc::new(before))),
("context_after", VmValue::List(Arc::new(after))),
])
}
/// Shortens `value` to roughly `max_bytes` bytes, marking what was cut.
///
/// Returns the (possibly shortened) text and whether clipping happened.
/// Without an anchor the head of the line is kept and `CLIP_SUFFIX` is
/// appended. With an anchor a window around that byte offset is kept, with
/// `CLIP_PREFIX` / `CLIP_SUFFIX` added on whichever sides lost content. All
/// cut points are moved onto UTF-8 character boundaries.
fn clip_text(value: &str, max_bytes: usize, anchor_byte: Option<usize>) -> (String, bool) {
    let total = value.len();
    if total <= max_bytes {
        return (value.to_owned(), false);
    }

    match anchor_byte {
        None => {
            // Keep the head; always leave room for at least one byte of content.
            let head_budget = max_bytes.saturating_sub(CLIP_SUFFIX.len()).max(1);
            let head_len = floor_char_boundary(value, head_budget);
            let mut clipped = String::with_capacity(head_len + CLIP_SUFFIX.len());
            clipped.push_str(&value[..head_len]);
            clipped.push_str(CLIP_SUFFIX);
            (clipped, true)
        }
        Some(anchor) => {
            let window = max_bytes
                .saturating_sub(CLIP_PREFIX.len() + CLIP_SUFFIX.len())
                .max(1);

            // Centre the window on the anchor, then pull it back if it would
            // run past the end of the line.
            let centred = anchor.min(total).saturating_sub(window / 2);
            let unaligned_from = if centred.saturating_add(window) > total {
                total.saturating_sub(window)
            } else {
                centred
            };
            let from = floor_char_boundary(value, unaligned_from);

            // Guarantee a non-empty slice even when flooring collapses the window.
            let to = match floor_char_boundary(value, (from + window).min(total)) {
                aligned if aligned > from => aligned,
                _ => next_char_boundary(value, from),
            };

            let mut clipped = String::with_capacity(max_bytes);
            if from > 0 {
                clipped.push_str(CLIP_PREFIX);
            }
            clipped.push_str(&value[from..to]);
            if to < total {
                clipped.push_str(CLIP_SUFFIX);
            }
            (clipped, true)
        }
    }
}
/// Returns the largest UTF-8 character boundary in `value` that is `<= index`.
///
/// `index` is clamped to `value.len()` first, so out-of-range input is safe.
fn floor_char_boundary(value: &str, index: usize) -> usize {
    let upper = index.min(value.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&candidate| value.is_char_boundary(candidate))
        .unwrap_or(0)
}
/// Returns the first UTF-8 character boundary in `value` strictly after
/// `index`, or `value.len()` when `index` is at or beyond the end.
///
/// `index` does not have to sit on a boundary itself: an offset inside a
/// multi-byte character resolves to the end of that character instead of
/// panicking on a string slice.
fn next_char_boundary(value: &str, index: usize) -> usize {
    let total = value.len();
    if index >= total {
        return total;
    }
    // `total` is always a boundary, so the search is guaranteed to succeed.
    (index + 1..=total)
        .find(|&candidate| value.is_char_boundary(candidate))
        .unwrap_or(total)
}