use std::collections::{BTreeMap, BTreeSet};
use std::path::{Path, PathBuf};
use std::rc::Rc;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use crate::llm::{execute_llm_call, extract_llm_options, vm_value_to_json};
use crate::stdlib::json_to_vm_value;
use crate::value::{ErrorCategory, VmError, VmValue};
use crate::vm::Vm;
use super::process::resolve_source_relative_path;
use super::project::project_scan_config_value;
use super::template::render_template_result;
const STANDARD_VENDOR_DIRS: &[&str] = &[
".git",
".hg",
".svn",
".venv",
"__pycache__",
"build",
"dist",
"node_modules",
"target",
"venv",
];
const MAX_CONTEXT_FILES: usize = 12;
const MAX_SOURCE_FILES: usize = 8;
const MAX_FILE_CHARS: usize = 4_000;
const MAX_TOTAL_CONTEXT_CHARS: usize = 24_000;
const DEFAULT_BUDGET_TOKENS: i64 = 4_000;
#[derive(Debug, Clone)]
struct ProjectEnrichOptions {
base_evidence: Option<VmValue>,
prompt: String,
schema: VmValue,
budget_tokens: i64,
model: String,
provider: String,
temperature: Option<f64>,
cache_key: String,
cache_dir: Option<String>,
schema_retries: usize,
}
#[derive(Debug, Clone)]
struct RelevantFile {
rel_path: String,
content: String,
truncated: bool,
digest: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CacheRecord {
result: serde_json::Value,
}
pub(crate) fn register_project_enrich_builtin(vm: &mut Vm) {
vm.register_async_builtin("project_enrich_native", |args| async move {
project_enrich_impl(args).await
});
}
async fn project_enrich_impl(args: Vec<VmValue>) -> Result<VmValue, VmError> {
let path = args
.first()
.map(VmValue::display)
.unwrap_or_else(|| ".".to_string());
let root = resolve_existing_directory(&path)?;
let options = parse_project_enrich_options(args.get(1))?;
let base_evidence = options
.base_evidence
.clone()
.unwrap_or_else(|| project_scan_config_value(&root));
let base_dict = base_evidence.as_dict().ok_or_else(|| {
VmError::Thrown(VmValue::String(Rc::from(
"project.enrich: base_evidence must be a dict",
)))
})?;
let relevant_files = collect_relevant_files(&root, &base_evidence);
let bindings = enrichment_bindings(&root, &base_evidence, &relevant_files);
let rendered_prompt = render_template_result(&options.prompt, Some(&bindings), None, None)
.map_err(VmError::from)?;
let schema_hash = sha256_hex(canonical_json(&vm_value_to_json(&options.schema)));
let prompt_hash = sha256_hex(rendered_prompt.as_bytes());
let content_hash = hash_relevant_files(&relevant_files);
let cache_path = cache_file_path(
&root,
options.cache_dir.as_deref(),
&options.cache_key,
&root,
&schema_hash,
&prompt_hash,
&content_hash,
);
if let Some(cached) = read_cached_result(&cache_path)? {
return Ok(result_with_cached_flag(
json_to_vm_value(&cached.result),
true,
));
}
let estimated_input_tokens = estimate_tokens(&rendered_prompt)
+ estimate_tokens(&canonical_json(&vm_value_to_json(&options.schema)));
if estimated_input_tokens > options.budget_tokens {
let mut budget_result = (*base_dict).clone();
budget_result.insert("budget_exceeded".to_string(), VmValue::Bool(true));
budget_result.insert(
"_provenance".to_string(),
provenance_value(None, estimated_input_tokens, 0, false),
);
return Ok(VmValue::Dict(Rc::new(budget_result)));
}
let llm_options_value = llm_options_value(&options, &rendered_prompt);
let extracted = extract_llm_options(&[
VmValue::String(Rc::from(rendered_prompt.as_str())),
VmValue::Nil,
llm_options_value.clone(),
])?;
match execute_llm_call(extracted, llm_options_value.as_dict().cloned()).await {
Ok(response) => {
let response_dict = response.as_dict().ok_or_else(|| {
VmError::Thrown(VmValue::String(Rc::from(
"project.enrich: expected llm response dict",
)))
})?;
let model = response_dict.get("model").map(VmValue::display);
let input_tokens = response_dict
.get("input_tokens")
.and_then(VmValue::as_int)
.unwrap_or(estimated_input_tokens);
let output_tokens = response_dict
.get("output_tokens")
.and_then(VmValue::as_int)
.unwrap_or(0);
let Some(data) = response_dict.get("data").cloned() else {
return Ok(validation_envelope(
&base_evidence,
"LLM response did not contain structured data".to_string(),
model,
input_tokens,
output_tokens,
));
};
let final_result = attach_provenance(data, model, input_tokens, output_tokens, false);
write_cached_result(&cache_path, &final_result)?;
Ok(final_result)
}
Err(VmError::CategorizedError {
message,
category: ErrorCategory::SchemaValidation,
}) => Ok(validation_envelope(
&base_evidence,
message,
None,
estimated_input_tokens,
0,
)),
Err(error) => Err(error),
}
}
fn parse_project_enrich_options(value: Option<&VmValue>) -> Result<ProjectEnrichOptions, VmError> {
let dict = value.and_then(VmValue::as_dict).ok_or_else(|| {
VmError::Thrown(VmValue::String(Rc::from(
"project.enrich: options dict is required",
)))
})?;
let prompt = dict
.get("prompt")
.and_then(value_as_string)
.filter(|value| !value.is_empty())
.ok_or_else(|| {
VmError::Thrown(VmValue::String(Rc::from(
"project.enrich: options.prompt must be a non-empty string",
)))
})?;
let schema = dict.get("schema").cloned().ok_or_else(|| {
VmError::Thrown(VmValue::String(Rc::from(
"project.enrich: options.schema is required",
)))
})?;
let budget_tokens = dict
.get("budget_tokens")
.and_then(VmValue::as_int)
.unwrap_or(DEFAULT_BUDGET_TOKENS)
.max(0);
let model = dict
.get("model")
.and_then(value_as_string)
.unwrap_or_else(|| "auto".to_string());
let provider = dict
.get("provider")
.and_then(value_as_string)
.unwrap_or_else(|| "auto".to_string());
let temperature = dict.get("temperature").and_then(value_as_float);
let cache_key = dict
.get("cache_key")
.and_then(value_as_string)
.unwrap_or_else(|| "default".to_string());
let cache_dir = dict.get("cache_dir").and_then(value_as_string);
let schema_retries = dict
.get("schema_retries")
.and_then(VmValue::as_int)
.unwrap_or(1)
.max(0) as usize;
let base_evidence = dict.get("base_evidence").cloned();
Ok(ProjectEnrichOptions {
base_evidence,
prompt,
schema,
budget_tokens,
model,
provider,
temperature,
cache_key,
cache_dir,
schema_retries,
})
}
fn resolve_existing_directory(path: &str) -> Result<PathBuf, VmError> {
let resolved = resolve_source_relative_path(path);
let target = if resolved.is_dir() {
resolved
} else {
resolved
.parent()
.map(Path::to_path_buf)
.unwrap_or_else(|| PathBuf::from("."))
};
if target.exists() {
target.canonicalize().map_err(|error| {
VmError::Thrown(VmValue::String(Rc::from(format!(
"project.enrich: failed to resolve path: {error}"
))))
})
} else {
Err(VmError::Thrown(VmValue::String(Rc::from(format!(
"project.enrich: path does not exist: {}",
target.display()
)))))
}
}
fn collect_relevant_files(root: &Path, base_evidence: &VmValue) -> Vec<RelevantFile> {
let mut selected = BTreeSet::new();
let mut files = Vec::new();
let Some(dict) = base_evidence.as_dict() else {
return files;
};
for key in ["anchors", "lockfiles"] {
if let Some(values) = dict.get(key).and_then(value_as_list) {
for entry in values {
let name = entry.display().trim_end_matches('/').to_string();
if name.is_empty() {
continue;
}
let path = root.join(&name);
if path.is_file() {
selected.insert(name);
}
}
}
}
for name in [
"README.md",
"README.MD",
"README",
"Readme.md",
"Dockerfile",
"GNUmakefile",
"Makefile",
"makefile",
"package.json",
"Cargo.toml",
"pyproject.toml",
"go.mod",
"tsconfig.json",
"next.config.js",
"next.config.mjs",
"next.config.ts",
"setup.py",
"requirements.txt",
"Gemfile",
] {
let path = root.join(name);
if path.is_file() {
selected.insert(name.to_string());
}
}
let languages = dict
.get("languages")
.and_then(value_as_list)
.map(|items| items.iter().map(VmValue::display).collect::<Vec<_>>())
.unwrap_or_default();
let source_files = collect_source_files(root, &languages);
selected.extend(source_files);
let mut total_chars = 0usize;
for rel_path in selected.into_iter().take(MAX_CONTEXT_FILES) {
let full_path = root.join(&rel_path);
let Ok(content) = std::fs::read_to_string(&full_path) else {
continue;
};
let truncated = content.chars().count() > MAX_FILE_CHARS;
let trimmed = truncate_chars(&content, MAX_FILE_CHARS);
if total_chars >= MAX_TOTAL_CONTEXT_CHARS {
break;
}
total_chars += trimmed.chars().count();
files.push(RelevantFile {
rel_path,
content: trimmed,
truncated,
digest: sha256_hex(content.as_bytes()),
});
}
files
}
fn collect_source_files(root: &Path, languages: &[String]) -> Vec<String> {
let exts = source_extensions(languages);
if exts.is_empty() {
return Vec::new();
}
let mut files = Vec::new();
collect_source_files_recursive(root, root, &exts, &mut files);
files.sort();
files.truncate(MAX_SOURCE_FILES);
files
}
fn collect_source_files_recursive(root: &Path, dir: &Path, exts: &[&str], files: &mut Vec<String>) {
if files.len() >= MAX_SOURCE_FILES {
return;
}
let Ok(entries) = std::fs::read_dir(dir) else {
return;
};
let mut children = entries.flatten().collect::<Vec<_>>();
children.sort_by_key(|entry| entry.file_name());
for child in children {
let Ok(file_type) = child.file_type() else {
continue;
};
let name = child.file_name().to_string_lossy().into_owned();
if file_type.is_dir() {
if name.starts_with('.') || STANDARD_VENDOR_DIRS.contains(&name.as_str()) {
continue;
}
collect_source_files_recursive(root, &child.path(), exts, files);
if files.len() >= MAX_SOURCE_FILES {
return;
}
continue;
}
let matches_ext = child
.path()
.extension()
.and_then(|ext| ext.to_str())
.is_some_and(|ext| exts.contains(&ext));
if !matches_ext {
continue;
}
files.push(relative_posix(root, &child.path()));
if files.len() >= MAX_SOURCE_FILES {
return;
}
}
}
fn source_extensions(languages: &[String]) -> Vec<&'static str> {
let mut exts = Vec::new();
for language in languages {
match language.as_str() {
"rust" => push_unique_str(&mut exts, "rs"),
"go" => push_unique_str(&mut exts, "go"),
"python" => push_unique_str(&mut exts, "py"),
"typescript" => {
push_unique_str(&mut exts, "ts");
push_unique_str(&mut exts, "tsx");
}
"javascript" => {
push_unique_str(&mut exts, "js");
push_unique_str(&mut exts, "jsx");
push_unique_str(&mut exts, "mjs");
push_unique_str(&mut exts, "cjs");
}
"ruby" => push_unique_str(&mut exts, "rb"),
_ => {}
}
}
exts
}
fn enrichment_bindings(
root: &Path,
base_evidence: &VmValue,
files: &[RelevantFile],
) -> BTreeMap<String, VmValue> {
let mut bindings = BTreeMap::new();
bindings.insert(
"path".to_string(),
VmValue::String(Rc::from(root.to_string_lossy().into_owned())),
);
bindings.insert("base_evidence".to_string(), base_evidence.clone());
bindings.insert("evidence".to_string(), base_evidence.clone());
let file_values = files
.iter()
.map(|file| {
let mut value = BTreeMap::new();
value.insert(
"path".to_string(),
VmValue::String(Rc::from(file.rel_path.clone())),
);
value.insert(
"content".to_string(),
VmValue::String(Rc::from(file.content.clone())),
);
value.insert("truncated".to_string(), VmValue::Bool(file.truncated));
VmValue::Dict(Rc::new(value))
})
.collect::<Vec<_>>();
bindings.insert("files".to_string(), VmValue::List(Rc::new(file_values)));
if let Some(dict) = base_evidence.as_dict() {
for (key, value) in dict.iter() {
bindings.entry(key.clone()).or_insert_with(|| value.clone());
}
}
bindings
}
fn llm_options_value(options: &ProjectEnrichOptions, rendered_prompt: &str) -> VmValue {
let mut llm_options = BTreeMap::new();
llm_options.insert(
"provider".to_string(),
VmValue::String(Rc::from(options.provider.clone())),
);
llm_options.insert(
"model".to_string(),
VmValue::String(Rc::from(options.model.clone())),
);
if let Some(temperature) = options.temperature {
llm_options.insert("temperature".to_string(), VmValue::Float(temperature));
}
llm_options.insert("output_schema".to_string(), options.schema.clone());
llm_options.insert(
"output_validation".to_string(),
VmValue::String(Rc::from("error")),
);
llm_options.insert(
"schema_retries".to_string(),
VmValue::Int(options.schema_retries as i64),
);
llm_options.insert(
"response_format".to_string(),
VmValue::String(Rc::from("json")),
);
llm_options.insert(
"messages".to_string(),
VmValue::List(Rc::new(vec![json_to_vm_value(&serde_json::json!({
"role": "user",
"content": rendered_prompt,
}))])),
);
VmValue::Dict(Rc::new(llm_options))
}
fn validation_envelope(
base_evidence: &VmValue,
message: String,
model: Option<String>,
input_tokens: i64,
output_tokens: i64,
) -> VmValue {
let mut dict = BTreeMap::new();
dict.insert("base_evidence".to_string(), base_evidence.clone());
dict.insert(
"validation_error".to_string(),
VmValue::String(Rc::from(message)),
);
dict.insert(
"_provenance".to_string(),
provenance_value(model, input_tokens, output_tokens, false),
);
VmValue::Dict(Rc::new(dict))
}
fn attach_provenance(
data: VmValue,
model: Option<String>,
input_tokens: i64,
output_tokens: i64,
cached: bool,
) -> VmValue {
match data {
VmValue::Dict(dict) => {
let mut merged = (*dict).clone();
merged.insert(
"_provenance".to_string(),
provenance_value(model, input_tokens, output_tokens, cached),
);
VmValue::Dict(Rc::new(merged))
}
other => {
let mut wrapped = BTreeMap::new();
wrapped.insert("data".to_string(), other);
wrapped.insert(
"_provenance".to_string(),
provenance_value(model, input_tokens, output_tokens, cached),
);
VmValue::Dict(Rc::new(wrapped))
}
}
}
fn provenance_value(
model: Option<String>,
input_tokens: i64,
output_tokens: i64,
cached: bool,
) -> VmValue {
let mut tokens = BTreeMap::new();
tokens.insert("in".to_string(), VmValue::Int(input_tokens));
tokens.insert("out".to_string(), VmValue::Int(output_tokens));
let mut provenance = BTreeMap::new();
provenance.insert(
"model".to_string(),
model
.map(|value| VmValue::String(Rc::from(value)))
.unwrap_or(VmValue::Nil),
);
provenance.insert("tokens".to_string(), VmValue::Dict(Rc::new(tokens)));
provenance.insert("cached".to_string(), VmValue::Bool(cached));
VmValue::Dict(Rc::new(provenance))
}
fn result_with_cached_flag(value: VmValue, cached: bool) -> VmValue {
let Some(dict) = value.as_dict() else {
return value;
};
let mut merged = (*dict).clone();
let mut provenance = merged
.get("_provenance")
.and_then(VmValue::as_dict)
.map(|value| (*value).clone())
.unwrap_or_default();
provenance.insert("cached".to_string(), VmValue::Bool(cached));
merged.insert(
"_provenance".to_string(),
VmValue::Dict(Rc::new(provenance)),
);
VmValue::Dict(Rc::new(merged))
}
fn hash_relevant_files(files: &[RelevantFile]) -> String {
let joined = files
.iter()
.map(|file| format!("{}:{}", file.rel_path, file.digest))
.collect::<Vec<_>>()
.join("|");
sha256_hex(joined.as_bytes())
}
fn cache_file_path(
root: &Path,
cache_dir: Option<&str>,
cache_key: &str,
path: &Path,
schema_hash: &str,
prompt_hash: &str,
content_hash: &str,
) -> PathBuf {
let cache_root = cache_dir
.map(resolve_source_relative_path)
.unwrap_or_else(|| root.join(".harn/cache/enrichment"));
let identity = serde_json::json!({
"cache_key": cache_key,
"path": path.to_string_lossy(),
"schema_hash": schema_hash,
"prompt_hash": prompt_hash,
"content_hash": content_hash,
});
cache_root.join(format!(
"{}.json",
sha256_hex(canonical_json(&identity).as_bytes())
))
}
fn read_cached_result(path: &Path) -> Result<Option<CacheRecord>, VmError> {
let Ok(content) = std::fs::read_to_string(path) else {
return Ok(None);
};
serde_json::from_str::<CacheRecord>(&content)
.map(Some)
.map_err(|error| {
VmError::Thrown(VmValue::String(Rc::from(format!(
"project.enrich: failed to parse cache {}: {error}",
path.display()
))))
})
}
fn write_cached_result(path: &Path, value: &VmValue) -> Result<(), VmError> {
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).map_err(|error| {
VmError::Thrown(VmValue::String(Rc::from(format!(
"project.enrich: failed to create cache dir {}: {error}",
parent.display()
))))
})?;
}
let record = CacheRecord {
result: vm_value_to_json(value),
};
let serialized = serde_json::to_string_pretty(&record).map_err(|error| {
VmError::Thrown(VmValue::String(Rc::from(format!(
"project.enrich: failed to serialize cache record: {error}"
))))
})?;
std::fs::write(path, serialized).map_err(|error| {
VmError::Thrown(VmValue::String(Rc::from(format!(
"project.enrich: failed to write cache {}: {error}",
path.display()
))))
})
}
fn relative_posix(base: &Path, path: &Path) -> String {
match path.strip_prefix(base) {
Ok(rel) => rel.to_string_lossy().replace('\\', "/"),
Err(_) => path.to_string_lossy().replace('\\', "/"),
}
}
fn estimate_tokens(text: &str) -> i64 {
((text.chars().count() as i64) + 3) / 4
}
fn truncate_chars(text: &str, max_chars: usize) -> String {
if text.chars().count() <= max_chars {
return text.to_string();
}
text.chars().take(max_chars).collect()
}
fn canonical_json(value: &serde_json::Value) -> String {
serde_json::to_string(value).unwrap_or_default()
}
fn sha256_hex(data: impl AsRef<[u8]>) -> String {
let mut hasher = Sha256::new();
hasher.update(data.as_ref());
let digest = hasher.finalize();
let mut out = String::with_capacity(digest.len() * 2);
for byte in digest {
use std::fmt::Write;
let _ = write!(out, "{byte:02x}");
}
out
}
fn push_unique_str<'a>(items: &mut Vec<&'a str>, value: &'a str) {
if !items.contains(&value) {
items.push(value);
}
}
fn value_as_string(value: &VmValue) -> Option<String> {
match value {
VmValue::String(text) => Some(text.to_string()),
_ => None,
}
}
fn value_as_list(value: &VmValue) -> Option<&[VmValue]> {
match value {
VmValue::List(items) => Some(items.as_slice()),
_ => None,
}
}
fn value_as_float(value: &VmValue) -> Option<f64> {
match value {
VmValue::Float(number) => Some(*number),
VmValue::Int(number) => Some(*number as f64),
_ => None,
}
}
#[cfg(test)]
fn value_as_bool(value: &VmValue) -> Option<bool> {
match value {
VmValue::Bool(flag) => Some(*flag),
_ => None,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn estimate_tokens_uses_simple_char_budget() {
assert_eq!(estimate_tokens(""), 0);
assert_eq!(estimate_tokens("abcd"), 1);
assert_eq!(estimate_tokens("abcde"), 2);
}
#[test]
fn attach_provenance_wraps_non_dict_results() {
let result = attach_provenance(
VmValue::String(Rc::from("hi")),
Some("mock-model".to_string()),
10,
4,
false,
);
let dict = result.as_dict().expect("dict");
assert_eq!(
dict.get("data").map(VmValue::display).as_deref(),
Some("hi")
);
assert_eq!(
dict.get("_provenance")
.and_then(VmValue::as_dict)
.and_then(|value| value.get("cached"))
.and_then(value_as_bool),
Some(false)
);
}
#[test]
fn llm_options_value_forwards_temperature() {
let options = ProjectEnrichOptions {
base_evidence: None,
prompt: "Return JSON.".to_string(),
schema: VmValue::Dict(Rc::new(BTreeMap::new())),
budget_tokens: 4000,
model: "mock-model".to_string(),
provider: "mock".to_string(),
temperature: Some(0.25),
cache_key: "cache-v1".to_string(),
cache_dir: None,
schema_retries: 1,
};
let llm_options = llm_options_value(&options, "rendered prompt");
let dict = llm_options.as_dict().expect("dict");
assert_eq!(dict.get("temperature").and_then(value_as_float), Some(0.25));
}
}