use crate::cli::memory::{analysis_cache_key, search_cache_key};
use crate::graph::pdg::{EdgeType, NodeType, ProgramDependenceGraph};
use crate::search::search::{NodeInfo, SearchEngine};
use crate::storage::{pdg_store, schema::Storage};
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet, VecDeque};
use std::io::Read as _;
use std::path::{Path, PathBuf};
use tracing::info;
#[cfg(feature = "onnx")]
use crate::search::onnx::QwenEmbeddingProvider;
#[cfg(feature = "remote-embeddings")]
use crate::search::onnx::{
GenericRemoteProvider, RemoteEmbeddingConfig, RemoteEmbeddingError,
};
#[cfg(feature = "remote-embeddings")]
use crate::search::onnx::remote::RemoteEmbeddingProvider;
use super::leindex::{
FileStats, IndexStats, ProjectFileScan, DEPENDENCY_MANIFEST_NAMES, SKIP_DIRS,
SOURCE_FILE_EXTENSIONS,
};
/// Splits source text into search tokens.
///
/// Boundaries are recognised at:
/// * whitespace and ASCII punctuation (which includes `_` and `-`),
/// * a lower-case letter or ASCII digit followed by an upper-case letter
///   (`getUser` → `get`, `user`),
/// * the end of an upper-case run followed by a lower-case letter
///   (`HTTPConnection` → `http`, `connection`),
/// * letter ↔ ASCII-digit transitions (`foo42bar` → `foo`, `42`, `bar`).
///
/// Fragments shorter than two bytes are dropped unless they consist solely of
/// ASCII digits. Emitted tokens are lower-cased.
pub(crate) fn tokenize_code(text: &str) -> Vec<String> {
    /// Emits `word` at a boundary: anything of two or more bytes is kept
    /// (lower-cased); a shorter fragment survives only if it is all digits.
    fn flush(word: &str, out: &mut Vec<String>) {
        if word.len() >= 2 {
            out.push(word.to_lowercase());
        } else if !word.is_empty() && word.chars().all(|c| c.is_ascii_digit()) {
            out.push(word.to_string());
        }
    }

    let mut out: Vec<String> = Vec::new();
    let mut word = String::new();

    for ch in text.chars() {
        if !ch.is_alphanumeric() {
            if ch.is_whitespace() || ch.is_ascii_punctuation() {
                // Hard separator: close the pending fragment.
                flush(&word, &mut out);
                word = String::new();
            } else {
                // Any other symbol simply stays part of the running fragment.
                word.push(ch);
            }
            continue;
        }

        // Last and second-to-last characters of the pending fragment.
        let (prev, before_prev) = {
            let mut tail = word.chars().rev();
            (tail.next(), tail.next())
        };

        match prev {
            // Upper-case letter after something: split only on a camelCase hump.
            Some(last) if ch.is_uppercase() => {
                if last.is_lowercase() || last.is_ascii_digit() {
                    flush(&word, &mut out);
                    word.clear();
                }
                word.push(ch);
            }
            // Lower-case letter ending an acronym run: the final capital
            // belongs to the new word ("HTTPCo" → "HTTP" + "Co").
            Some(last)
                if ch.is_lowercase()
                    && last.is_uppercase()
                    && before_prev.map_or(false, char::is_uppercase) =>
            {
                word.pop();
                if word.len() >= 2 {
                    out.push(word.to_lowercase());
                }
                word.clear();
                word.push(last);
                word.push(ch);
            }
            // Letter → digit transition.
            Some(last) if ch.is_ascii_digit() && last.is_alphabetic() => {
                if word.len() >= 2 {
                    out.push(word.to_lowercase());
                }
                word.clear();
                word.push(ch);
            }
            // Digit → letter transition: the numeric run is always emitted.
            Some(last) if ch.is_alphabetic() && last.is_ascii_digit() => {
                out.push(word.to_lowercase());
                word.clear();
                word.push(ch);
            }
            _ => word.push(ch),
        }
    }

    // Whatever is still pending at end of input is treated like a boundary.
    flush(&word, &mut out);
    out
}
/// Serializable snapshot of a [`TfIdfEmbedder`], used as the payload that
/// `persist_to_storage` writes and `load_from_storage` reads via `bincode`.
///
/// NOTE(review): bincode presumably encodes fields positionally, so the field
/// order here should be treated as part of the on-disk format — confirm before
/// reordering.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct TfIdfPersistedState {
// Vocabulary terms; entry `i` corresponds to embedding slot `i`.
vocab: Vec<String>,
// IDF weight for each vocabulary term (parallel to `vocab`).
idf: Vec<f32>,
// Length of the vectors produced by the embedder.
dimension: usize,
// PDG node count captured when the state was persisted.
pdg_nodes: usize,
// PDG edge count captured when the state was persisted.
pdg_edges: usize,
}
/// TF-IDF embedder: maps token streams onto fixed-length, L2-normalized
/// vectors using a selected vocabulary and per-term IDF weights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TfIdfEmbedder {
// Vocabulary terms; term `i` owns embedding slot `i`.
vocab: Vec<String>,
// IDF weight per vocabulary term (parallel to `vocab`).
idf: Vec<f32>,
// Length of every vector returned by `embed` / `embed_tokens`.
dimension: usize,
// PDG node count this embedder is associated with (compared by `is_fresh`).
pdg_nodes: usize,
// PDG edge count this embedder is associated with (compared by `is_fresh`).
pdg_edges: usize,
}
impl TfIdfEmbedder {
#[cfg_attr(not(test), allow(dead_code))]
pub fn build(documents: &[(String, String)]) -> Self {
let tokenized: Vec<(String, Vec<String>)> = documents
.iter()
.map(|(id, content)| (id.clone(), tokenize_code(content)))
.collect();
Self::build_from_tokens(&tokenized)
}
pub fn build_from_tokens(documents: &[(String, Vec<String>)]) -> Self {
const TARGET_DIM: usize = crate::search::search::DEFAULT_EMBEDDING_DIMENSION;
let n = documents.len();
if n == 0 {
return Self {
vocab: Vec::new(),
idf: Vec::new(),
dimension: TARGET_DIM,
pdg_nodes: 0,
pdg_edges: 0,
};
}
let mut df: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
for (_, tokens) in documents {
seen.clear();
for tok in tokens {
if seen.insert(tok.as_str()) {
*df.entry(tok.to_string()).or_insert(0) += 1;
}
}
}
let n_f = n as f32;
let min_df: usize = if n < 50 { 1 } else { (n / 1000).max(3) };
let max_df: usize = if n < 50 { n } else { (n / 4).max(min_df + 1) };
let mut idf_scores: Vec<(String, f32)> = df
.into_iter()
.filter(|(_, df_count)| *df_count >= min_df && *df_count <= max_df)
.map(|(tok, df_count)| {
let idf = (n_f / df_count as f32).ln();
(tok, idf)
})
.collect();
info!(
vocab_candidates = idf_scores.len(),
min_df,
max_df,
n_docs = n,
"TF-IDF vocabulary candidates (moderate-IDF filter)"
);
idf_scores.sort_by(|a, b| {
a.1.partial_cmp(&b.1)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.0.cmp(&b.0))
});
let final_scores: Vec<(String, f32)> = if idf_scores.len() <= TARGET_DIM {
idf_scores
} else {
let total = idf_scores.len();
let stride = total as f64 / TARGET_DIM as f64;
(0..TARGET_DIM)
.map(|i| {
let idx = ((i as f64 * stride) as usize).min(total - 1);
idf_scores[idx].clone()
})
.collect()
};
let idf_scores = final_scores;
let vocab: Vec<String> = idf_scores.iter().map(|(t, _)| t.clone()).collect();
let idf: Vec<f32> = idf_scores.iter().map(|(_, s)| *s).collect();
Self {
vocab,
idf,
dimension: TARGET_DIM,
pdg_nodes: 0,
pdg_edges: 0,
}
}
pub fn embed(&self, text: &str) -> Vec<f32> {
let mut vec = vec![0.0f32; self.dimension];
if self.vocab.is_empty() {
return vec;
}
let tokens = tokenize_code(text);
let total = tokens.len() as f32;
if total == 0.0 {
return vec;
}
let mut tf_map: std::collections::HashMap<&str, f32> = std::collections::HashMap::new();
for tok in &tokens {
*tf_map.entry(tok.as_str()).or_insert(0.0) += 1.0;
}
for (i, (word, idf_val)) in self.vocab.iter().zip(self.idf.iter()).enumerate() {
if let Some(&count) = tf_map.get(word.as_str()) {
vec[i] = (count / total) * idf_val;
}
}
let magnitude: f32 = vec.iter().map(|v| v * v).sum::<f32>().sqrt();
if magnitude > 1e-9 {
for v in &mut vec {
*v /= magnitude;
}
}
vec
}
pub fn embed_tokens(&self, tokens: &[String]) -> Vec<f32> {
let mut vec = vec![0.0f32; self.dimension];
if self.vocab.is_empty() {
return vec;
}
let total = tokens.len() as f32;
if total == 0.0 {
return vec;
}
let mut tf_map: std::collections::HashMap<&str, f32> = std::collections::HashMap::new();
for tok in tokens {
*tf_map.entry(tok.as_str()).or_insert(0.0) += 1.0;
}
for (i, (word, idf_val)) in self.vocab.iter().zip(self.idf.iter()).enumerate() {
if let Some(&count) = tf_map.get(word.as_str()) {
vec[i] = (count / total) * idf_val;
}
}
let magnitude: f32 = vec.iter().map(|v| v * v).sum::<f32>().sqrt();
if magnitude > 1e-9 {
for v in &mut vec {
*v /= magnitude;
}
}
vec
}
#[allow(dead_code)]
fn from_persisted_state(state: TfIdfPersistedState) -> Self {
Self {
vocab: state.vocab,
idf: state.idf,
dimension: state.dimension,
pdg_nodes: state.pdg_nodes,
pdg_edges: state.pdg_edges,
}
}
fn persisted_state(&self, pdg: &ProgramDependenceGraph) -> TfIdfPersistedState {
TfIdfPersistedState {
vocab: self.vocab.clone(),
idf: self.idf.clone(),
dimension: self.dimension,
pdg_nodes: pdg.node_count(),
pdg_edges: pdg.edge_count(),
}
}
pub fn is_fresh(&self, pdg_node_count: usize, pdg_edge_count: usize) -> bool {
self.pdg_nodes == pdg_node_count && self.pdg_edges == pdg_edge_count
}
pub fn dimension(&self) -> usize {
self.dimension
}
fn storage_path(project_path: &Path) -> PathBuf {
project_path.join(".leindex").join("tfidf_embedder.bin")
}
#[allow(dead_code)]
pub fn load_from_storage(project_path: &Path) -> Result<Option<Self>> {
let path = Self::storage_path(project_path);
if !path.exists() {
return Ok(None);
}
let bytes = std::fs::read(&path)
.with_context(|| format!("Failed to read persisted embedder: {}", path.display()))?;
let state: TfIdfPersistedState = bincode::deserialize(&bytes).with_context(|| {
format!(
"Failed to deserialize persisted embedder: {}",
path.display()
)
})?;
Ok(Some(Self::from_persisted_state(state)))
}
pub fn persist_to_storage(
&self,
project_path: &Path,
pdg: &ProgramDependenceGraph,
) -> Result<()> {
let path = Self::storage_path(project_path);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).with_context(|| {
format!("Failed to create embedder directory: {}", parent.display())
})?;
}
let payload = bincode::serialize(&self.persisted_state(pdg))
.context("Failed to serialize embedder")?;
std::fs::write(&path, payload)
.with_context(|| format!("Failed to persist embedder: {}", path.display()))
}
}
/// Embedding backend selection: TF-IDF alone, or TF-IDF paired with a neural
/// provider (local ONNX or remote), depending on enabled cargo features.
#[derive(Debug, Clone)]
pub enum HybridEmbedder {
// TF-IDF vectors only; no neural component.
TfIdfOnly(TfIdfEmbedder),
// TF-IDF plus a local provider; only compiled with the `onnx` feature.
#[cfg(feature = "onnx")]
HybridLocal {
tfidf: TfIdfEmbedder,
neural: QwenEmbeddingProvider,
// Value reported by `neural_weight()` for this variant.
neural_weight: f32,
},
// TF-IDF plus a remote provider; only compiled with `remote-embeddings`.
#[cfg(feature = "remote-embeddings")]
HybridRemote {
tfidf: TfIdfEmbedder,
remote: GenericRemoteProvider,
// Value reported by `neural_weight()` for this variant.
remote_weight: f32,
},
}
/// Relative weights of the four scoring signals. `normalize` rescales them so
/// they sum to one.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct HybridScoringWeights {
// Weight of the TF-IDF similarity signal.
pub tfidf: f32,
// Weight of the neural similarity signal (0.0 in `without_neural`).
pub neural: f32,
// Weight of the structural signal.
pub structural: f32,
// Weight of the text-match signal.
pub text_match: f32,
}
impl Default for HybridScoringWeights {
fn default() -> Self {
Self {
tfidf: 0.30,
neural: 0.40,
structural: 0.15,
text_match: 0.15,
}
}
}
impl HybridScoringWeights {
    /// Weight mix for when no neural signal exists: the neural share is zero
    /// and the remainder is split 0.60 / 0.20 / 0.20 between TF-IDF,
    /// structural, and text-match.
    pub fn without_neural() -> Self {
        Self {
            tfidf: 0.60,
            neural: 0.00,
            structural: 0.20,
            text_match: 0.20,
        }
    }

    /// Returns a copy rescaled so the four weights sum to one.
    ///
    /// Falls back to [`Self::default`] when the sum is zero or not finite
    /// (a NaN or infinite weight), because dividing by such a sum would turn
    /// every weight into NaN/inf and poison all downstream scores.
    pub fn normalize(&self) -> Self {
        let sum = self.tfidf + self.neural + self.structural + self.text_match;
        if sum == 0.0 || !sum.is_finite() {
            return Self::default();
        }
        Self {
            tfidf: self.tfidf / sum,
            neural: self.neural / sum,
            structural: self.structural / sum,
            text_match: self.text_match / sum,
        }
    }
}
impl HybridEmbedder {
pub fn tfidf_only(embedder: TfIdfEmbedder) -> Self {
Self::TfIdfOnly(embedder)
}
#[cfg(feature = "onnx")]
pub fn hybrid_local(
tfidf: TfIdfEmbedder,
neural_weight: Option<f32>,
) -> Result<Self, crate::search::onnx::QwenEmbeddingProviderError> {
Ok(Self::HybridLocal {
tfidf,
neural: QwenEmbeddingProvider::new()?,
neural_weight: neural_weight.unwrap_or(0.40),
})
}
#[cfg(feature = "remote-embeddings")]
pub fn hybrid_remote(
tfidf: TfIdfEmbedder,
remote_config: RemoteEmbeddingConfig,
remote_weight: Option<f32>,
) -> Result<Self, RemoteEmbeddingError> {
let remote = GenericRemoteProvider::from_config(remote_config)?;
Ok(Self::HybridRemote {
tfidf,
remote,
remote_weight: remote_weight.unwrap_or(0.40),
})
}
pub fn tfidf(&self) -> &TfIdfEmbedder {
match self {
Self::TfIdfOnly(embedder) => embedder,
#[cfg(feature = "onnx")]
Self::HybridLocal { tfidf, .. } => tfidf,
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { tfidf, .. } => tfidf,
}
}
pub fn tfidf_mut(&mut self) -> &mut TfIdfEmbedder {
match self {
Self::TfIdfOnly(embedder) => embedder,
#[cfg(feature = "onnx")]
Self::HybridLocal { tfidf, .. } => tfidf,
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { tfidf, .. } => tfidf,
}
}
pub fn tfidf_dimension(&self) -> usize {
self.tfidf().dimension()
}
pub fn neural_dimension(&self) -> Option<usize> {
match self {
Self::TfIdfOnly(_) => None,
#[cfg(feature = "onnx")]
Self::HybridLocal { neural, .. } => Some(neural.dimension()),
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { remote, .. } => Some(remote.dimension()),
}
}
pub fn has_neural(&self) -> bool {
match self {
Self::TfIdfOnly(_) => false,
#[cfg(feature = "onnx")]
Self::HybridLocal { .. } => true,
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { .. } => true,
}
}
pub fn neural_weight(&self) -> f32 {
match self {
Self::TfIdfOnly(_) => 0.0,
#[cfg(feature = "onnx")]
Self::HybridLocal { neural_weight, .. } => *neural_weight,
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { remote_weight, .. } => *remote_weight,
}
}
pub fn scoring_weights(&self) -> HybridScoringWeights {
if self.has_neural() {
HybridScoringWeights::default()
} else {
HybridScoringWeights::without_neural()
}
}
pub fn embed_tfidf(&self, tokens: &[String]) -> Vec<f32> {
self.tfidf().embed_tokens(tokens)
}
#[cfg(any(feature = "onnx", feature = "remote-embeddings"))]
pub async fn embed_neural_async(&self, text: &str) -> Option<Result<Vec<f32>, String>> {
match self {
Self::TfIdfOnly(_) => None,
#[cfg(feature = "onnx")]
Self::HybridLocal { neural, .. } => {
Some(
neural
.embed(text)
.map_err(|e| format!("ONNX embedding failed: {}", e)),
)
}
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { remote, .. } => {
Some(
remote
.embed(text)
.await
.map_err(|e| format!("Remote embedding failed: {}", e)),
)
}
}
}
#[cfg(any(feature = "onnx", feature = "remote-embeddings"))]
pub fn embed_neural_blocking(&self, text: &str) -> Option<Result<Vec<f32>, String>> {
match self {
Self::TfIdfOnly(_) => None,
#[cfg(feature = "onnx")]
Self::HybridLocal { neural, .. } => {
Some(
neural
.embed(text)
.map_err(|e| format!("ONNX embedding failed: {}", e)),
)
}
#[cfg(feature = "remote-embeddings")]
Self::HybridRemote { .. } => {
Some(Err("Remote embeddings require async runtime".to_string()))
}
}
}
pub fn persist_to_storage(&self, project_path: &Path, pdg: &ProgramDependenceGraph) -> Result<()> {
self.tfidf().persist_to_storage(project_path, pdg)
}
}
impl Default for HybridEmbedder {
    /// A TF-IDF-only embedder built from an empty corpus (empty vocabulary).
    fn default() -> Self {
        let empty_corpus: &[(String, Vec<String>)] = &[];
        Self::tfidf_only(TfIdfEmbedder::build_from_tokens(empty_corpus))
    }
}
/// Reads `path` once and returns `(blake3 hex digest, file bytes)`.
///
/// The bytes are wrapped in an `Arc` so callers can share them without
/// copying.
///
/// Uses `Read::read_to_end`, which transparently retries reads interrupted by
/// a signal (`ErrorKind::Interrupted`); a hand-rolled `read` loop without that
/// retry would turn such a spurious interruption into a hard failure.
///
/// # Errors
/// Fails when the file cannot be opened or read.
pub(crate) fn read_file_once(path: &Path) -> Result<(String, std::sync::Arc<Vec<u8>>)> {
    let mut file = std::fs::File::open(path)
        .with_context(|| format!("Failed to open file: {}", path.display()))?;

    let mut bytes = Vec::new();
    file.read_to_end(&mut bytes)
        .with_context(|| format!("Failed to read file: {}", path.display()))?;

    // Hashing the complete buffer yields the same digest as incremental updates.
    let hash = blake3::hash(&bytes).to_hex().to_string();
    Ok((hash, std::sync::Arc::new(bytes)))
}
/// Returns only the hex digest computed by [`read_file_once`]; the file
/// contents are discarded.
pub(crate) fn hash_file(path: &Path) -> Result<String> {
    let (digest, _contents) = read_file_once(path)?;
    Ok(digest)
}
/// Bounded cache of file contents keyed by path; when full, the entry at the
/// front of `order` (the least recently touched) is evicted.
#[derive(Debug)]
pub(crate) struct FileReadCache {
// Maximum number of cached files (`new` clamps this to at least 1).
capacity: usize,
// Cached file bytes, shared via `Arc`.
entries: HashMap<PathBuf, std::sync::Arc<Vec<u8>>>,
// Recency queue: front is the eviction candidate, back is the most recent.
order: VecDeque<PathBuf>,
}
impl FileReadCache {
    /// Creates an empty cache holding at most `capacity` files (minimum 1).
    pub(crate) fn new(capacity: usize) -> Self {
        let capacity = capacity.max(1);
        Self {
            capacity,
            entries: HashMap::new(),
            order: VecDeque::new(),
        }
    }

    /// Returns the bytes for `path`, reading the file on a miss.
    ///
    /// A hit refreshes the entry's recency. A miss reads the file through
    /// [`read_file_once`], stores it, and logs the path with its hash.
    ///
    /// # Errors
    /// Propagates the read failure on a miss.
    pub(crate) fn get_or_read(&mut self, path: &Path) -> Result<std::sync::Arc<Vec<u8>>> {
        match self.entries.get(path).cloned() {
            Some(cached) => {
                self.touch(path);
                Ok(cached)
            }
            None => {
                let (hash, bytes) = read_file_once(path)?;
                self.insert(path.to_path_buf(), std::sync::Arc::clone(&bytes));
                info!(file = %path.display(), hash = %hash, "Read file once for hash and content");
                Ok(bytes)
            }
        }
    }

    /// Moves `path` to the most-recently-used end of the recency queue.
    fn touch(&mut self, path: &Path) {
        let queued_at = self.order.iter().position(|queued| queued == path);
        if let Some(index) = queued_at {
            self.order.remove(index);
        }
        self.order.push_back(path.to_path_buf());
    }

    /// Stores `bytes` under `path`, evicting the oldest entry when a new key
    /// would exceed the capacity.
    fn insert(&mut self, path: PathBuf, bytes: std::sync::Arc<Vec<u8>>) {
        let already_cached = self.entries.contains_key(&path);

        if already_cached {
            // Replacing an existing entry only refreshes its recency.
            self.touch(&path);
        } else {
            if self.entries.len() >= self.capacity {
                if let Some(evicted) = self.order.pop_front() {
                    self.entries.remove(&evicted);
                }
            }
            self.order.push_back(path.clone());
        }

        self.entries.insert(path, bytes);
    }
}
/// Returns `true` when `name` is one of `DEPENDENCY_MANIFEST_NAMES`
/// (exact match).
pub(crate) fn is_dependency_manifest_name(name: &str) -> bool {
    DEPENDENCY_MANIFEST_NAMES
        .iter()
        .any(|known| *known == name)
}
/// Walks `project_path` and collects source files and dependency manifests.
///
/// Per entry, in this order:
/// 1. unreadable entries are skipped;
/// 2. dot-prefixed entries (other than the root itself) are skipped, and
///    hidden directories are not descended into;
/// 3. directories listed in `SKIP_DIRS` are not descended into;
/// 4. non-regular files are skipped;
/// 5. manifest files are recorded (lockfiles always, other manifests only if
///    not excluded by the project config) and never treated as sources;
/// 6. files excluded by the project config are skipped;
/// 7. files with a known source extension are recorded, subject to the
///    configured `max_file_size` (skip), `max_files` and `max_total_size`
///    (both stop the walk) limits; a limit of `0` disables that check.
///
/// A project config that fails to load is replaced by its default. Manifests
/// that cannot be read are left out of `manifest_hashes`.
pub(crate) fn scan_project_files(project_path: &Path) -> Result<ProjectFileScan> {
    let project_config = crate::cli::config::ProjectConfig::load(project_path).unwrap_or_default();
    let limits = &project_config.indexing;

    let mut source_paths = Vec::new();
    let mut manifest_paths = Vec::new();
    let mut total_source_size: u64 = 0;
    let mut oversized_count: usize = 0;

    // Driven manually because `skip_current_dir` needs the iterator itself.
    let mut walker = walkdir::WalkDir::new(project_path).into_iter();
    loop {
        let entry = match walker.next() {
            None => break,
            Some(Err(_)) => continue,
            Some(Ok(found)) => found,
        };
        let path = entry.path();
        let file_type = entry.file_type();
        let file_name = entry.file_name().to_string_lossy();

        let is_hidden = path != project_path && file_name.starts_with('.') && file_name != ".";
        if is_hidden {
            if file_type.is_dir() {
                walker.skip_current_dir();
            }
            continue;
        }

        if file_type.is_dir() {
            if SKIP_DIRS.contains(&file_name.as_ref()) {
                walker.skip_current_dir();
            }
            continue;
        }
        if !file_type.is_file() {
            continue;
        }

        let manifest_name = path
            .file_name()
            .and_then(|raw| raw.to_str())
            .filter(|candidate| is_dependency_manifest_name(candidate));
        if let Some(name) = manifest_name {
            let is_lockfile =
                name.contains("lock") || name.contains(".sum") || name == "npm-shrinkwrap.json";
            if is_lockfile || !project_config.should_exclude(path) {
                manifest_paths.push(path.to_path_buf());
            }
            continue;
        }

        if project_config.should_exclude(path) {
            continue;
        }

        let has_source_extension = path
            .extension()
            .and_then(|raw| raw.to_str())
            .map(|ext| {
                let lowered = ext.to_ascii_lowercase();
                SOURCE_FILE_EXTENSIONS.contains(&lowered.as_str())
            })
            .unwrap_or(false);
        if !has_source_extension {
            continue;
        }

        let file_size = entry.metadata().map(|m| m.len()).unwrap_or(0);

        if limits.max_file_size > 0 && file_size > limits.max_file_size {
            oversized_count += 1;
            // Only the first few oversized files are reported individually.
            if oversized_count <= 5 {
                tracing::warn!(
                    file = %path.display(),
                    size_bytes = file_size,
                    limit_bytes = limits.max_file_size,
                    "Skipping file exceeding max_file_size limit"
                );
            }
            continue;
        }

        if limits.max_files > 0 && source_paths.len() >= limits.max_files {
            tracing::warn!(
                count = source_paths.len(),
                limit = limits.max_files,
                "Reached max_files limit, stopping source file scan"
            );
            break;
        }

        let projected_total = total_source_size + file_size;
        if limits.max_total_size > 0 && projected_total > limits.max_total_size {
            tracing::warn!(
                total_bytes = projected_total,
                limit_bytes = limits.max_total_size,
                "Reached max_total_size limit, stopping source file scan"
            );
            break;
        }

        total_source_size = projected_total;
        source_paths.push(path.to_path_buf());
    }

    if oversized_count > 5 {
        tracing::warn!(
            total_oversized = oversized_count,
            "Additional oversized files skipped (showing first 5 warnings)"
        );
    }

    let source_directories = crate::cli::index_freshness::extract_unique_dirs(&source_paths);

    let manifest_hashes: std::collections::HashMap<String, String> = manifest_paths
        .iter()
        .filter_map(|mp| {
            std::fs::read(mp).ok().map(|bytes| {
                (
                    mp.display().to_string(),
                    blake3::hash(&bytes).to_hex().to_string(),
                )
            })
        })
        .collect();

    Ok(ProjectFileScan {
        source_paths,
        manifest_paths,
        source_directories,
        manifest_hashes,
    })
}
/// Pairs every source path in `scan` with the blake3 hex digest of its
/// contents.
///
/// With a cache, the bytes come from [`FileReadCache::get_or_read`] and are
/// hashed here; without one, the digest comes from [`read_file_once`].
///
/// # Errors
/// Stops at, and returns, the first read failure.
pub(crate) fn collect_source_files_with_hashes(
    scan: &ProjectFileScan,
    mut file_cache: Option<&mut FileReadCache>,
) -> Result<Vec<(PathBuf, String)>> {
    let mut hashed = Vec::with_capacity(scan.source_paths.len());
    for path in &scan.source_paths {
        let digest = match file_cache.as_deref_mut() {
            Some(cache) => {
                let bytes = cache.get_or_read(path)?;
                blake3::hash(bytes.as_slice()).to_hex().to_string()
            }
            None => read_file_once(path)?.0,
        };
        hashed.push((path.clone(), digest));
    }
    Ok(hashed)
}
/// Copies every node and edge of `source` into `target`.
///
/// Nodes are cloned first while recording old-index → new-index; edges are
/// then re-created between the remapped endpoints. An edge is skipped when
/// its data, its endpoints, or either remapped endpoint cannot be resolved.
pub(crate) fn merge_pdgs(target: &mut ProgramDependenceGraph, source: ProgramDependenceGraph) {
    use petgraph::graph::NodeIndex;

    let mut remapped: std::collections::HashMap<NodeIndex, NodeIndex> =
        std::collections::HashMap::with_capacity(source.node_count());

    for old_idx in source.node_indices() {
        let node = match source.get_node(old_idx) {
            Some(found) => found,
            None => continue,
        };
        let new_idx = target.add_node(node.clone());
        remapped.insert(old_idx, new_idx);
    }

    for edge_idx in source.edge_indices() {
        let edge = match source.get_edge(edge_idx) {
            Some(found) => found,
            None => continue,
        };
        let (from, to) = match source.edge_endpoints(edge_idx) {
            Some(ends) => ends,
            None => continue,
        };
        if let (Some(&new_from), Some(&new_to)) = (remapped.get(&from), remapped.get(&to)) {
            target.add_edge(new_from, new_to, edge.clone());
        }
    }
}
/// Delegates to `pdg.remove_file(file_path)` and always returns `Ok(())`.
///
/// NOTE(review): presumably this drops the graph entries originating from
/// `file_path`; the exact semantics live in `ProgramDependenceGraph` — confirm
/// there. The `Result` return type carries no error from this function itself.
pub(crate) fn remove_file_from_pdg(
pdg: &mut ProgramDependenceGraph,
file_path: &str,
) -> Result<()> {
pdg.remove_file(file_path);
Ok(())
}
/// Re-tags nodes whose language marks them as external.
///
/// A node whose `language` is `"external"` or starts with `"external:"` gets
/// `NodeType::External` if it does not already have it. The number of changed
/// nodes is logged when at least one node was updated.
pub(crate) fn normalize_external_nodes(pdg: &mut ProgramDependenceGraph) {
    let mut migrated = 0usize;

    for node in pdg.node_weights_mut() {
        if node.node_type == NodeType::External {
            continue;
        }
        let marked_external =
            node.language == "external" || node.language.starts_with("external:");
        if marked_external {
            node.node_type = NodeType::External;
            migrated += 1;
        }
    }

    if migrated == 0 {
        return;
    }
    info!(
        "Normalized {} external nodes to NodeType::External",
        migrated
    );
}
/// Saves `pdg` under `project_id` via `pdg_store::save_pdg` and logs success.
///
/// # Errors
/// Returns the store's error wrapped with the context
/// "Failed to save PDG to storage".
pub(crate) fn save_to_storage(
storage: &mut Storage,
project_id: &str,
pdg: &ProgramDependenceGraph,
) -> Result<()> {
pdg_store::save_pdg(storage, project_id, pdg).context("Failed to save PDG to storage")?;
info!("Saved PDG to storage for project: {}", project_id);
Ok(())
}
/// Indexes every PDG node into `search_engine`.
///
/// Shorthand for [`index_nodes_with_embedder`] with neither a pre-built
/// embedder nor a shared file cache (both passed as `None`).
pub(crate) fn index_nodes(
pdg: &ProgramDependenceGraph,
search_engine: &mut SearchEngine,
file_stats_cache: &mut Option<HashMap<String, FileStats>>,
batch_size: usize,
) -> Result<HybridEmbedder> {
index_nodes_with_embedder(pdg, search_engine, file_stats_cache, batch_size, None, None)
}
pub(crate) fn index_nodes_with_embedder(
pdg: &ProgramDependenceGraph,
search_engine: &mut SearchEngine,
file_stats_cache: &mut Option<HashMap<String, FileStats>>,
batch_size: usize,
embedder: Option<HybridEmbedder>,
shared_file_cache: Option<FileReadCache>,
) -> Result<HybridEmbedder> {
*file_stats_cache = None;
let batch_size = batch_size.max(1);
let mut file_cache = shared_file_cache.unwrap_or_else(|| FileReadCache::new(100));
let connectivity_config = crate::graph::pdg::TraversalConfig {
max_depth: Some(1),
max_nodes: Some(1000),
allowed_edge_types: Some(&[EdgeType::Call, EdgeType::DataDependency]),
excluded_node_types: Some(vec![NodeType::External]),
min_complexity: None,
min_edge_confidence: 0.0,
};
let node_indices: Vec<petgraph::graph::NodeIndex> = pdg.node_indices().collect();
let mut df: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
let mut seen_tokens: HashSet<String> = HashSet::new();
let mut total_docs = 0usize;
let extract_node_content = |node: &crate::graph::pdg::Node,
node_idx: petgraph::graph::NodeIndex,
file_bytes: &[u8]|
-> String {
let content = String::from_utf8_lossy(file_bytes);
let mut enrichment = format!(
"// type:{} lang:{}",
match node.node_type {
NodeType::Function => "function",
NodeType::Class => "class",
NodeType::Method => "method",
NodeType::Variable => "variable",
NodeType::Module => "module",
NodeType::External => "external",
},
node.language,
);
let callers = pdg.backward_impact(node_idx, &connectivity_config);
let callees = pdg.forward_impact(node_idx, &connectivity_config);
enrichment.push_str(&format!(
" callers:{} callees:{} complexity:{}",
callers.len().min(50),
callees.len().min(50),
node.complexity,
));
if !content.is_empty() && node.byte_range.1 > node.byte_range.0 {
let content_bytes = content.as_bytes();
let start = node.byte_range.0.min(content_bytes.len());
let end = node.byte_range.1.min(content_bytes.len());
if start < end {
let snippet = String::from_utf8_lossy(&content_bytes[start..end]);
format!(
"{}\n// {} in {}\n{}",
enrichment, node.name, node.file_path, snippet
)
} else {
format!(
"{}\n// {} in {}\n{}",
enrichment, node.name, node.file_path, "// [No source code available]"
)
}
} else {
format!(
"{}\n// {} in {}\n{}",
enrichment, node.name, node.file_path, "// [No source code available]"
)
}
};
for batch in node_indices.chunks(batch_size) {
for &node_idx in batch {
if let Some(node) = pdg.get_node(node_idx) {
let file_bytes = file_cache
.get_or_read(Path::new(&*node.file_path))
.unwrap_or_else(|_| std::sync::Arc::new(Vec::new()));
let node_content = extract_node_content(node, node_idx, &file_bytes);
let tokens = tokenize_code(&node_content);
seen_tokens.clear();
for tok in &tokens {
if seen_tokens.insert(tok.clone()) {
*df.entry(tok.clone()).or_insert(0) += 1;
}
}
total_docs += 1;
}
}
}
let embedder = if let Some(embedder) = embedder {
embedder
} else if total_docs == 0 {
HybridEmbedder::tfidf_only(TfIdfEmbedder::build_from_tokens(&[]))
} else {
let n_f = total_docs as f32;
let min_df: usize = if total_docs < 50 {
1
} else {
(total_docs / 1000).max(3)
};
let max_df: usize = if total_docs < 50 {
total_docs
} else {
(total_docs / 4).max(min_df + 1)
};
let mut idf_scores: Vec<(String, f32)> = df
.into_iter()
.filter(|(_, df_count)| *df_count >= min_df && *df_count <= max_df)
.map(|(tok, df_count)| (tok, (n_f / df_count as f32).ln()))
.collect();
info!(
vocab_candidates = idf_scores.len(),
min_df,
max_df,
n_docs = total_docs,
"TF-IDF vocabulary candidates (two-pass streaming)"
);
idf_scores.sort_by(|a, b| {
a.1.partial_cmp(&b.1)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.0.cmp(&b.0))
});
let final_scores = if idf_scores.len() <= crate::search::search::DEFAULT_EMBEDDING_DIMENSION
{
idf_scores
} else {
let total = idf_scores.len();
let target = crate::search::search::DEFAULT_EMBEDDING_DIMENSION;
let stride = total as f64 / target as f64;
(0..target)
.map(|i| {
let idx = ((i as f64 * stride) as usize).min(total - 1);
idf_scores[idx].clone()
})
.collect()
};
let tfidf_embedder = TfIdfEmbedder {
vocab: final_scores.iter().map(|(t, _)| t.clone()).collect(),
idf: final_scores.iter().map(|(_, s)| *s).collect(),
dimension: crate::search::search::DEFAULT_EMBEDDING_DIMENSION,
pdg_nodes: pdg.node_count(),
pdg_edges: pdg.edge_count(),
};
HybridEmbedder::tfidf_only(tfidf_embedder)
};
let mut nodes: Vec<NodeInfo> = Vec::with_capacity(batch_size);
#[cfg(feature = "remote-embeddings")]
let mut warned_remote_blocking = false;
for batch in node_indices.chunks(batch_size) {
nodes.clear();
for &node_idx in batch {
if let Some(node) = pdg.get_node(node_idx) {
let file_bytes = file_cache
.get_or_read(Path::new(&*node.file_path))
.unwrap_or_else(|_| std::sync::Arc::new(Vec::new()));
let node_content = extract_node_content(node, node_idx, &file_bytes);
let tokens = tokenize_code(&node_content);
let tfidf_embedding = embedder.embed_tfidf(&tokens);
let neural_embedding = match &embedder {
HybridEmbedder::TfIdfOnly(_) => None,
#[cfg(feature = "onnx")]
HybridEmbedder::HybridLocal { .. } => {
match embedder.embed_neural_blocking(&node_content) {
Some(Ok(v)) => Some(v),
Some(Err(e)) => {
tracing::warn!("Neural embedding failed for node {}: {}", node.id, e);
None
}
None => None,
}
}
#[cfg(feature = "remote-embeddings")]
HybridEmbedder::HybridRemote { .. } => {
if !warned_remote_blocking {
tracing::warn!(
"Remote embeddings require async runtime; falling back to TF-IDF-only for this indexing run"
);
warned_remote_blocking = true;
}
None
}
};
let signature = crate::search::search::SearchEngine::extract_signature_from_content(
&node_content,
);
let search_tokens: Vec<String> = node_content
.split(|c: char| !c.is_alphanumeric())
.map(|s| s.to_ascii_lowercase())
.filter(|s| s.len() >= 2)
.collect();
nodes.push(NodeInfo {
node_id: node.id.clone(),
file_path: node.file_path.to_string(),
symbol_name: node.name.clone(),
language: node.language.clone(),
content: node_content.clone(),
byte_range: node.byte_range,
tfidf_embedding,
neural_embedding,
embedding: None, complexity: node.complexity,
signature,
pre_tokenized: Some(search_tokens),
});
}
}
search_engine.index_nodes(std::mem::take(&mut nodes));
}
Ok(embedder)
}
/// Returns the manifests in `current_scan` whose hash differs from the hash
/// recorded in the previously cached scan for `project_id`.
///
/// The earlier scan is looked up in the in-memory cache first and on disk
/// second; when neither yields a decodable `ProjectFileScan`, every manifest
/// counts as changed. Manifests located under dependency or build-output
/// locations (`node_modules`, `build`, `dist`, `target`, `.cache`) are never
/// reported; the path is lower-cased before matching and both `/` and `\`
/// separators are recognised.
pub(crate) fn detect_changed_manifests(
    current_scan: &ProjectFileScan,
    project_id: &str,
    cache_spiller: &crate::cli::memory::CacheSpiller,
) -> Vec<PathBuf> {
    // Substrings (matched against the lower-cased path) that mark a manifest
    // as belonging to vendored or generated content.
    const IGNORED_PATH_MARKERS: [&str; 8] = [
        "node_modules",
        "/build/",
        "\\build\\",
        "/dist/",
        "\\dist\\",
        "/target/",
        "\\target\\",
        ".cache",
    ];

    let cache_key = crate::cli::memory::project_scan_cache_key(project_id);
    let old_hashes: std::collections::HashMap<String, String> = cache_spiller
        .store()
        .peek(&cache_key)
        .and_then(|entry| match entry {
            crate::cli::memory::CacheEntry::Binary {
                serialized_data, ..
            } => bincode::deserialize::<ProjectFileScan>(serialized_data).ok(),
            _ => None,
        })
        .or_else(|| {
            cache_spiller
                .store()
                .load_from_disk(&cache_key)
                .ok()
                .and_then(|entry| match entry {
                    crate::cli::memory::CacheEntry::Binary {
                        serialized_data, ..
                    } => bincode::deserialize::<ProjectFileScan>(&serialized_data).ok(),
                    _ => None,
                })
        })
        .map(|scan| scan.manifest_hashes)
        .unwrap_or_default();

    let mut changed = Vec::new();
    for mp in &current_scan.manifest_paths {
        let key = mp.display().to_string();
        // A missing hash on either side also counts as a difference.
        if current_scan.manifest_hashes.get(&key) == old_hashes.get(&key) {
            continue;
        }
        let path_str = key.to_lowercase();
        let skip = IGNORED_PATH_MARKERS
            .iter()
            .any(|marker| path_str.contains(*marker));
        if !skip {
            changed.push(mp.clone());
        }
    }
    changed
}
#[allow(dead_code)]
pub(crate) fn files_importing_from_manifests(
changed_manifests: &[PathBuf],
all_source_paths: &[PathBuf],
pdg: &ProgramDependenceGraph,
) -> Vec<PathBuf> {
if changed_manifests.is_empty() {
return Vec::new();
}
let changed_dirs: HashSet<PathBuf> = changed_manifests
.iter()
.filter_map(|p| p.parent().map(|d| d.to_path_buf()))
.collect();
let mut affected: Vec<PathBuf> = Vec::new();
for sp in all_source_paths {
if let Some(parent) = sp.parent() {
for dir in &changed_dirs {
if sp.starts_with(dir) || parent.starts_with(dir) {
affected.push(sp.clone());
break;
}
}
}
}
let affected_set: HashSet<String> = affected.iter().map(|p| p.display().to_string()).collect();
let source_set: HashSet<String> = all_source_paths
.iter()
.map(|p| p.display().to_string())
.collect();
let mut affected_set = affected_set;
for nid in pdg.node_indices() {
if let Some(node) = pdg.get_node(nid) {
if node.node_type == NodeType::External {
let fp = node.file_path.as_ref();
if !affected_set.contains(fp) && source_set.contains(fp) {
affected_set.insert(node.file_path.to_string());
affected.push(PathBuf::from(&*node.file_path));
}
}
}
}
affected
}
/// Fingerprint of an index: `"<pdg_nodes>:<pdg_edges>:<indexed_nodes>"`.
pub(crate) fn index_fingerprint(stats: &IndexStats) -> String {
    let parts = [
        stats.pdg_nodes.to_string(),
        stats.pdg_edges.to_string(),
        stats.indexed_nodes.to_string(),
    ];
    parts.join(":")
}
/// Cache identifier of the form `"<project_id>:<h>"`, where `<h>` is the first
/// 12 hex characters of the blake3 hash of the (lossily decoded) project path.
pub(crate) fn stable_project_cache_id(project_id: &str, project_path: &Path) -> String {
    let digest = blake3::hash(project_path.to_string_lossy().as_bytes()).to_hex();
    let short_digest = &digest[..12];
    format!("{}:{}", project_id, short_digest)
}
/// Cache key for a search request.
///
/// Combines the stable project id, the index fingerprint, `top_k`, the
/// trimmed and lower-cased query, and the `Debug` rendering of `query_type`,
/// then passes the result through `search_cache_key`.
pub(crate) fn search_cache_key_for(
    project_id: &str,
    project_path: &Path,
    stats: &IndexStats,
    query: &str,
    top_k: usize,
    query_type: Option<&crate::search::ranking::QueryType>,
) -> String {
    let project = stable_project_cache_id(project_id, project_path);
    let fingerprint = index_fingerprint(stats);
    let normalized_query = query.trim().to_lowercase();
    let raw_key = format!(
        "query:{}:{}:{}:{}:{:?}",
        project, fingerprint, top_k, normalized_query, query_type,
    );
    search_cache_key(&raw_key)
}
/// Cache key for an analysis request.
///
/// Combines the stable project id, the index fingerprint, `token_budget`, and
/// the trimmed and lower-cased query, then passes the result through
/// `analysis_cache_key`.
pub(crate) fn analysis_cache_key_for(
    project_id: &str,
    project_path: &Path,
    stats: &IndexStats,
    query: &str,
    token_budget: usize,
) -> String {
    let project = stable_project_cache_id(project_id, project_path);
    let fingerprint = index_fingerprint(stats);
    let normalized_query = query.trim().to_lowercase();
    let raw_key = format!(
        "analyze:{}:{}:{}:{}",
        project, fingerprint, token_budget, normalized_query
    );
    analysis_cache_key(&raw_key)
}
/// Writes the search engine's embeddings to the project's mmap file.
///
/// Nothing is written when the engine holds no embeddings.
///
/// # Errors
/// Returns an error when the mmap file cannot be written.
pub(crate) fn persist_embeddings_to_mmap(
    search_engine: &SearchEngine,
    project_path: &Path,
) -> Result<()> {
    let embeddings = search_engine.collect_embeddings();
    if !embeddings.is_empty() {
        let path = crate::search::vector::mmap_embeddings_path(project_path);
        if let Err(e) = crate::search::vector::write_mmap_embeddings(&path, &embeddings) {
            return Err(anyhow::anyhow!("Failed to write mmap embeddings: {e}"));
        }
        info!(
            count = embeddings.len(),
            path = %path.display(),
            "Persisted embeddings to mmap file"
        );
    }
    Ok(())
}
/// Opens the project's mmap embedding index if one exists.
///
/// Returns `None` when the file is absent or cannot be opened; an open
/// failure is logged as a warning rather than propagated.
#[allow(dead_code)]
pub(crate) fn try_load_mmap_embeddings(
    project_path: &Path,
) -> Option<crate::search::vector::MmapEmbeddingIndex> {
    let path = crate::search::vector::mmap_embeddings_path(project_path);
    if !path.exists() {
        return None;
    }

    crate::search::vector::MmapEmbeddingIndex::open(&path)
        .map(|index| {
            info!(
                nodes = index.len(),
                dim = index.dimension(),
                path = %path.display(),
                "Loaded mmap embedding index"
            );
            index
        })
        .map_err(|e| {
            tracing::warn!(
                path = %path.display(),
                error = %e,
                "Failed to load mmap embedding index"
            );
        })
        .ok()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_tokenize_code_camel_case() {
let toks = tokenize_code("getUserName");
assert!(
toks.contains(&"get".to_string()),
"expected 'get', got {:?}",
toks
);
assert!(
toks.contains(&"user".to_string()),
"expected 'user', got {:?}",
toks
);
assert!(
toks.contains(&"name".to_string()),
"expected 'name', got {:?}",
toks
);
}
#[test]
fn test_tokenize_code_acronyms_and_digits() {
let toks = tokenize_code("HTTPConnection");
assert!(
toks.contains(&"http".to_string()),
"expected 'http', got {:?}",
toks
);
assert!(
toks.contains(&"connection".to_string()),
"expected 'connection', got {:?}",
toks
);
let toks2 = tokenize_code("HTTP2Connection");
assert!(
toks2.contains(&"http".to_string()),
"expected 'http', got {:?}",
toks2
);
assert!(
toks2.contains(&"2".to_string()),
"expected '2', got {:?}",
toks2
);
assert!(
toks2.contains(&"connection".to_string()),
"expected 'connection', got {:?}",
toks2
);
}
/// A snake_case identifier must be split on underscores into its words.
#[test]
fn test_tokenize_code_snake_case() {
    let pieces = tokenize_code("get_user_name");
    ["get", "user", "name"].iter().for_each(|&expected| {
        let found = pieces.iter().any(|p| p == expected);
        assert!(found, "expected '{}', got {:?}", expected, pieces);
    });
}
/// Single-character words are dropped while longer words are kept.
#[test]
fn test_tokenize_code_filters_short_tokens() {
    let tokens = tokenize_code("a b c xyz");
    let has = |word: &str| tokens.iter().any(|t| t == word);

    assert!(!has("a"));
    assert!(!has("b"));
    assert!(!has("c"));
    assert!(has("xyz"));
}
/// Empty input produces no tokens.
#[test]
fn test_tokenize_code_empty() {
    assert!(tokenize_code("").is_empty());
}
/// An embedder built from an empty corpus still emits a 768-dim, all-zero vector.
#[test]
fn test_tfidf_embedder_empty_corpus() {
    let empty_embedder = TfIdfEmbedder::build(&[]);
    let embedding = empty_embedder.embed("test query");

    assert_eq!(
        embedding.len(),
        768,
        "must produce 768-dim vector even for empty corpus"
    );
    let all_zero = embedding.iter().all(|component| *component == 0.0);
    assert!(all_zero, "empty corpus → zero vector");
}
/// Embedding a query against a small synthetic corpus yields a 768-dim vector.
#[test]
fn test_tfidf_embedding_dimension() {
    let mut corpus: Vec<(String, String)> = Vec::with_capacity(10);
    for i in 0..10 {
        let id = format!("doc_{}", i);
        let body = format!(
            "fn handle_request_{} {{ let result = process(); result }}",
            i
        );
        corpus.push((id, body));
    }

    let embedding = TfIdfEmbedder::build(&corpus).embed("handle request process");
    assert_eq!(embedding.len(), 768, "embedding dimension must be 768");
}
/// A non-zero embedding must have unit L2 norm (within 1e-4).
///
/// A (near-)zero vector is tolerated and skips the norm check.
#[test]
fn test_tfidf_embedding_normalized() {
    let corpus: Vec<(String, String)> = [
        (
            "auth",
            "fn authenticate_user(token: &str) -> bool { verify_token(token) }",
        ),
        (
            "db",
            "fn connect_database(url: &str) -> Connection { open_connection(url) }",
        ),
        (
            "http",
            "fn send_request(endpoint: &str) -> Response { http_get(endpoint) }",
        ),
    ]
    .iter()
    .map(|(id, body)| (id.to_string(), body.to_string()))
    .collect();

    let embedder = TfIdfEmbedder::build(&corpus);
    let embedding = embedder.embed("authenticate token verify");
    let squared_sum = embedding.iter().map(|v| v * v).sum::<f32>();
    let norm: f32 = squared_sum.sqrt();

    if norm > 1e-9 {
        let deviation = (norm - 1.0).abs();
        assert!(
            deviation < 1e-4,
            "embedding should be L2-normalized, got magnitude {}",
            norm
        );
    }
}
/// Two auth-flavoured queries should not score much lower against each other
/// than an auth query scores against a database query (tolerance 0.1).
#[test]
fn test_tfidf_related_content_higher_similarity() {
    // Dot product of two embeddings, used here as the similarity measure.
    fn dot(a: &[f32], b: &[f32]) -> f32 {
        a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
    }

    let corpus: Vec<(String, String)> = [
        (
            "a1",
            "fn authenticate_user(token: &str) -> bool { verify_token(token) }",
        ),
        (
            "a2",
            "fn check_user_credentials(password: &str) -> bool { hash_check(password) }",
        ),
        (
            "b1",
            "fn connect_database(url: &str) -> Connection { open_connection(url) }",
        ),
        (
            "b2",
            "fn execute_sql_query(query: &str) -> Vec<Row> { db_execute(query) }",
        ),
        (
            "c1",
            "fn parse_json_payload(data: &str) -> Value { serde_parse(data) }",
        ),
    ]
    .iter()
    .map(|(id, body)| (id.to_string(), body.to_string()))
    .collect();

    let embedder = TfIdfEmbedder::build(&corpus);
    let auth_a = embedder.embed("fn authenticate_user token verify");
    let auth_b = embedder.embed("fn check_user credentials password hash");
    let database = embedder.embed("fn connect database execute sql query");

    let sim_related = dot(&auth_a, &auth_b);
    let sim_unrelated = dot(&auth_a, &database);
    assert!(
        sim_related >= sim_unrelated - 0.1,
        "related similarity ({}) should not be much lower than unrelated similarity ({})",
        sim_related,
        sim_unrelated
    );
}
/// Embedding a query made only of terms absent from the corpus must stay bounded.
///
/// NOTE(review): despite the test name, the assertion only checks that the
/// magnitude is below 1.1, not that the vector is exactly zero — confirm
/// whether a stricter check is intended.
#[test]
fn test_tfidf_zero_vector_for_unseen_terms() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("a"),
        String::from("fn foo_bar() -> bool { true }"),
    )];
    let embedding = TfIdfEmbedder::build(&corpus).embed("zzzzzz aaaaaaa bbbbbbb cccccccc");
    let squared_sum = embedding.iter().map(|v| v * v).sum::<f32>();
    let norm: f32 = squared_sum.sqrt();
    assert!(norm < 1.1, "magnitude out of range: {}", norm);
}
/// Checks that the vocabulary chosen by `TfIdfEmbedder::build` equals a reference
/// vocabulary computed here with a full sort followed by strided sampling.
///
/// The corpus is synthetic: 1200 three-letter tokens spread over 200 documents,
/// each token appearing in 3 to 8 documents.
// NOTE(review): the test name suggests the implementation uses a partition-based
// selection instead of a full sort — confirm against `TfIdfEmbedder::build`.
#[test]
fn test_tfidf_partition_matches_sort_selection() {
use std::collections::HashMap;
let mut docs: Vec<(String, String)> = Vec::with_capacity(200);
// Token names are the three lowest base-26 digits of the index, so they are unique.
let token_names: Vec<String> = (0usize..1200)
.map(|i| {
let first = (b'a' + (i % 26) as u8) as char;
let second = (b'a' + ((i / 26) % 26) as u8) as char;
let third = (b'a' + ((i / 676) % 26) as u8) as char;
format!("{}{}{}", first, second, third)
})
.collect();
// For every token, pick the documents that will contain it.
let mut token_doc_assignments: Vec<(String, Vec<usize>)> = Vec::new();
for (token_id, token) in token_names.iter().enumerate() {
// Document frequency cycles through 3..=8.
let df = 3 + (token_id % 6);
let docs_with_token: Vec<usize> = (0..df)
.map(|j| (token_id * 7 + j * 13) % 200) .collect();
token_doc_assignments.push((token.clone(), docs_with_token));
}
// Materialise each document as the space-joined list of its assigned tokens.
for doc_id in 0..200 {
let mut tokens = Vec::new();
for (token, doc_ids) in &token_doc_assignments {
if doc_ids.contains(&doc_id) {
tokens.push(token.clone());
}
}
let content = tokens.join(" ");
docs.push((format!("doc_{}", doc_id), content));
}
let embedder = TfIdfEmbedder::build(&docs);
// Reference computation starts here: tokenize, count document frequencies,
// filter by [min_df, max_df], score with ln(n / df).
let tokenized: Vec<(String, Vec<String>)> = docs
.iter()
.map(|(id, content)| (id.clone(), tokenize_code(content)))
.collect();
let n = tokenized.len();
let n_f = n as f32;
// presumably mirrors the thresholds used inside `TfIdfEmbedder::build` — verify.
let min_df: usize = if n < 50 { 1 } else { (n / 1000).max(3) };
let max_df: usize = if n < 50 { n } else { (n / 4).max(min_df + 1) };
let mut df: HashMap<String, usize> = HashMap::new();
// `seen` is reset per document so each token counts at most once per document.
let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
for (_, tokens) in &tokenized {
seen.clear();
for tok in tokens {
if seen.insert(tok.as_str()) {
*df.entry(tok.to_string()).or_insert(0) += 1;
}
}
}
let mut ref_scores: Vec<(String, f32)> = df
.into_iter()
.filter(|(_, c)| *c >= min_df && *c <= max_df)
.map(|(tok, c)| (tok, (n_f / c as f32).ln()))
.collect();
// Ascending by score, ties broken by token text so the order is total.
ref_scores.sort_by(|a, b| {
a.1.partial_cmp(&b.1)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.0.cmp(&b.0))
});
let target_dim = crate::search::search::DEFAULT_EMBEDDING_DIMENSION;
// Keep everything if it fits; otherwise sample `target_dim` entries at an even stride.
let expected_vocab: Vec<String> = if ref_scores.len() <= target_dim {
ref_scores.iter().map(|(t, _)| t.clone()).collect()
} else {
let total = ref_scores.len();
let stride = total as f64 / target_dim as f64;
(0..target_dim)
.map(|i| {
ref_scores[((i as f64 * stride) as usize).min(total - 1)]
.0
.clone()
})
.collect()
};
assert_eq!(
embedder.vocab.len(),
expected_vocab.len(),
"vocab length mismatch: got {} expected {}",
embedder.vocab.len(),
expected_vocab.len()
);
// Compare position by position so a mismatch reports the first differing index.
for (i, (got, expected)) in embedder.vocab.iter().zip(expected_vocab.iter()).enumerate() {
assert_eq!(
got, expected,
"vocab mismatch at position {i}: got '{got}' expected '{expected}'"
);
}
}
/// Cold-start scenario: the previous scan exists only in the persisted cache.
///
/// The old scan is inserted, written out with `persist_key`, and then removed
/// from the store so that `peek` returns `None`. A current scan carrying the
/// same manifest hash must then report no changed manifests.
#[test]
fn test_detect_changed_manifests_cold_start_no_false_positive() {
    use crate::cli::memory::{CacheSpiller, MemoryConfig};
    use std::path::PathBuf;

    let temp_dir = tempfile::tempdir().unwrap();
    let config = MemoryConfig {
        cache_dir: temp_dir.path().join("cache"),
        max_cache_bytes: 10_000_000,
        ..Default::default()
    };
    let mut spiller = CacheSpiller::new(config).unwrap();
    let manifest_path = PathBuf::from("/project/Cargo.toml");

    // Previously recorded scan, keyed by the manifest's display string.
    let mut old_hashes = std::collections::HashMap::new();
    old_hashes.insert(manifest_path.display().to_string(), "abc123".to_string());
    let old_scan = ProjectFileScan {
        source_paths: vec![PathBuf::from("/project/src/main.rs")],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![PathBuf::from("/project/src")],
        manifest_hashes: old_hashes,
    };

    // Persist the old scan, then drop it from the store to simulate a fresh process.
    let cache_key = crate::cli::memory::project_scan_cache_key("test_project");
    let serialized = bincode::serialize(&old_scan).unwrap();
    let entry = crate::cli::memory::CacheEntry::Binary {
        metadata: std::collections::HashMap::new(),
        serialized_data: serialized,
    };
    spiller
        .store_mut()
        .insert(cache_key.clone(), entry)
        .unwrap();
    spiller.store_mut().persist_key(&cache_key).unwrap();
    let _ = spiller.store_mut().remove(&cache_key);
    assert!(
        spiller.store().peek(&cache_key).is_none(),
        "peek should return None after removal"
    );

    // Current scan: identical hash, so nothing should be reported as changed.
    let mut current_hashes = std::collections::HashMap::new();
    current_hashes.insert(manifest_path.display().to_string(), "abc123".to_string());
    let current_scan = ProjectFileScan {
        source_paths: vec![PathBuf::from("/project/src/main.rs")],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![PathBuf::from("/project/src")],
        manifest_hashes: current_hashes,
    };

    let changed = detect_changed_manifests(&current_scan, "test_project", &spiller);
    assert!(
        changed.is_empty(),
        "cold start should NOT produce false-positive manifest changes, got: {:?}",
        changed
    );
}
/// A manifest whose hash differs from the persisted scan must be reported.
///
/// The old scan (hash `old_hash`) is persisted and removed from the store; the
/// current scan carries `new_hash` for the same manifest path, so exactly that
/// path is expected in the result.
#[test]
fn test_detect_changed_manifests_detects_real_change() {
    use crate::cli::memory::{CacheSpiller, MemoryConfig};
    use std::path::PathBuf;

    let temp_dir = tempfile::tempdir().unwrap();
    let config = MemoryConfig {
        cache_dir: temp_dir.path().join("cache"),
        max_cache_bytes: 10_000_000,
        ..Default::default()
    };
    let mut spiller = CacheSpiller::new(config).unwrap();
    let manifest_path = PathBuf::from("/project/Cargo.toml");

    // Previously recorded scan with the old manifest hash.
    let mut old_hashes = std::collections::HashMap::new();
    old_hashes.insert(manifest_path.display().to_string(), "old_hash".to_string());
    let old_scan = ProjectFileScan {
        source_paths: vec![PathBuf::from("/project/src/main.rs")],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![PathBuf::from("/project/src")],
        manifest_hashes: old_hashes,
    };

    // Persist the old scan, then drop it from the store.
    let cache_key = crate::cli::memory::project_scan_cache_key("test_project2");
    let serialized = bincode::serialize(&old_scan).unwrap();
    let entry = crate::cli::memory::CacheEntry::Binary {
        metadata: std::collections::HashMap::new(),
        serialized_data: serialized,
    };
    spiller
        .store_mut()
        .insert(cache_key.clone(), entry)
        .unwrap();
    spiller.store_mut().persist_key(&cache_key).unwrap();
    let _ = spiller.store_mut().remove(&cache_key);

    // Current scan: same manifest path, different hash.
    let mut current_hashes = std::collections::HashMap::new();
    current_hashes.insert(manifest_path.display().to_string(), "new_hash".to_string());
    let current_scan = ProjectFileScan {
        source_paths: vec![PathBuf::from("/project/src/main.rs")],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![PathBuf::from("/project/src")],
        manifest_hashes: current_hashes,
    };

    let changed = detect_changed_manifests(&current_scan, "test_project2", &spiller);
    assert_eq!(
        changed.len(),
        1,
        "should detect exactly one changed manifest"
    );
    assert_eq!(
        changed[0], manifest_path,
        "should detect the correct manifest as changed"
    );
}
/// When both a persisted and a newer in-store entry exist, the in-store one wins.
///
/// A scan with `stale_hash` is inserted and persisted; the same key is then
/// overwritten in the store with a scan carrying `current_hash` (not persisted
/// again). A current scan with `current_hash` must report no changes.
#[test]
fn test_detect_changed_manifests_uses_in_memory_cache_first() {
    use crate::cli::memory::{CacheSpiller, MemoryConfig};
    use std::path::PathBuf;

    let temp_dir = tempfile::tempdir().unwrap();
    let config = MemoryConfig {
        cache_dir: temp_dir.path().join("cache"),
        max_cache_bytes: 10_000_000,
        ..Default::default()
    };
    let mut spiller = CacheSpiller::new(config).unwrap();
    let manifest_path = PathBuf::from("/project/Cargo.toml");

    // Step 1: persist a scan carrying a stale hash.
    let mut disk_hashes = std::collections::HashMap::new();
    disk_hashes.insert(
        manifest_path.display().to_string(),
        "stale_hash".to_string(),
    );
    let disk_scan = ProjectFileScan {
        source_paths: vec![],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![],
        manifest_hashes: disk_hashes,
    };
    let cache_key = crate::cli::memory::project_scan_cache_key("test_project3");
    let serialized = bincode::serialize(&disk_scan).unwrap();
    let disk_entry = crate::cli::memory::CacheEntry::Binary {
        metadata: std::collections::HashMap::new(),
        serialized_data: serialized,
    };
    spiller
        .store_mut()
        .insert(cache_key.clone(), disk_entry)
        .unwrap();
    spiller.store_mut().persist_key(&cache_key).unwrap();

    // Step 2: overwrite the store entry with the up-to-date hash, without persisting.
    let mut mem_hashes = std::collections::HashMap::new();
    mem_hashes.insert(
        manifest_path.display().to_string(),
        "current_hash".to_string(),
    );
    let mem_scan = ProjectFileScan {
        source_paths: vec![],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![],
        manifest_hashes: mem_hashes,
    };
    let mem_serialized = bincode::serialize(&mem_scan).unwrap();
    let mem_entry = crate::cli::memory::CacheEntry::Binary {
        metadata: std::collections::HashMap::new(),
        serialized_data: mem_serialized,
    };
    spiller
        .store_mut()
        .insert(cache_key.clone(), mem_entry)
        .unwrap();

    // Step 3: the current scan matches the in-store entry, not the persisted one.
    let mut current_hashes = std::collections::HashMap::new();
    current_hashes.insert(
        manifest_path.display().to_string(),
        "current_hash".to_string(),
    );
    let current_scan = ProjectFileScan {
        source_paths: vec![],
        manifest_paths: vec![manifest_path.clone()],
        source_directories: vec![],
        manifest_hashes: current_hashes,
    };

    let changed = detect_changed_manifests(&current_scan, "test_project3", &spiller);
    assert!(
        changed.is_empty(),
        "in-memory cache should be preferred over disk; should see no changes"
    );
}
/// Cloning an embedder preserves its vocabulary, IDF values and dimension.
#[test]
fn test_tfidf_embedder_clone_roundtrip() {
    let corpus: Vec<(String, String)> = [
        ("a", "fn alpha beta gamma"),
        ("b", "fn delta epsilon zeta"),
    ]
    .iter()
    .map(|(id, body)| (id.to_string(), body.to_string()))
    .collect();

    let original = TfIdfEmbedder::build(&corpus);
    let copy = original.clone();

    assert_eq!(original.vocab, copy.vocab);
    assert_eq!(original.idf, copy.idf);
    assert_eq!(original.dimension, copy.dimension);
}
/// `build` (from raw text) and `build_from_tokens` (from pre-tokenized text)
/// must produce the same vocabulary, IDF values and dimension.
#[test]
fn test_tfidf_incremental_batches_match_full_build() {
    let mut corpus: Vec<(String, String)> = Vec::with_capacity(120);
    for i in 0..120 {
        let source = format!(
            "fn item_{}() {{ let value = {}; let shared = value + {}; }}",
            i,
            i,
            i % 7
        );
        corpus.push((format!("doc_{i}"), source));
    }

    let from_text = TfIdfEmbedder::build(&corpus);

    let mut pre_tokenized: Vec<(String, Vec<String>)> = Vec::with_capacity(corpus.len());
    for (id, text) in &corpus {
        pre_tokenized.push((id.clone(), tokenize_code(text)));
    }
    let from_tokens = TfIdfEmbedder::build_from_tokens(&pre_tokenized);

    assert_eq!(from_text.vocab, from_tokens.vocab);
    assert_eq!(from_text.idf, from_tokens.idf);
    assert_eq!(from_text.dimension, from_tokens.dimension);
}
/// Indexing the same graph with batch sizes 2 and 64 must yield embedders with
/// identical vocabulary, IDF values and dimension.
#[test]
fn test_index_nodes_respects_batch_size_and_matches_results() {
    use crate::graph::pdg::{Node, NodeType, ProgramDependenceGraph};
    use crate::search::search::SearchEngine;

    let mut graph = ProgramDependenceGraph::new();
    for i in 0..8 {
        let node = Node {
            id: format!("node_{i}"),
            name: format!("symbol_{i}"),
            file_path: format!("/tmp/file_{i}.rs").into(),
            language: format!("rust batch {i}"),
            node_type: NodeType::Function,
            byte_range: (0, 100),
            complexity: i as u32 + 1,
        };
        graph.add_node(node);
    }

    // Small batches.
    let mut small_stats = None;
    let mut small_engine = SearchEngine::new();
    let small_result = index_nodes(&graph, &mut small_engine, &mut small_stats, 2).unwrap();

    // One large batch.
    let mut large_stats = None;
    let mut large_engine = SearchEngine::new();
    let large_result = index_nodes(&graph, &mut large_engine, &mut large_stats, 64).unwrap();

    let HybridEmbedder::TfIdfOnly(tfidf_small) = small_result;
    let HybridEmbedder::TfIdfOnly(tfidf_large) = large_result;

    assert_eq!(tfidf_small.vocab, tfidf_large.vocab);
    assert_eq!(tfidf_small.idf, tfidf_large.idf);
    assert_eq!(tfidf_small.dimension, tfidf_large.dimension);
}
/// Indexes six nodes with a batch size of 3 and checks the resulting embedder
/// dimension and the engine's node count.
///
/// NOTE(review): the engine is expected to report 3 nodes although 6 were added
/// to the graph — confirm this is the intended behaviour of `index_nodes` with
/// a batch size of 3 rather than an accident of the fixture.
#[test]
fn test_index_nodes_accumulates_df_across_passes() {
    use crate::graph::pdg::{Node, NodeType, ProgramDependenceGraph};
    use crate::search::search::SearchEngine;

    let mut pdg = ProgramDependenceGraph::new();
    for i in 0..6 {
        // The former `content_tag` local was computed and immediately discarded;
        // `Node` has no field it could feed, so it has been removed.
        pdg.add_node(Node {
            id: format!("node_{i}"),
            name: format!("symbol_{i}"),
            file_path: format!("/tmp/file_{i}.rs").into(),
            language: "rust".to_string(),
            node_type: NodeType::Function,
            byte_range: (0, 100),
            complexity: 1,
        });
    }

    let mut cache = None;
    let mut engine = SearchEngine::new();
    let embedder = index_nodes(&pdg, &mut engine, &mut cache, 3).unwrap();
    let tfidf_embedder = match embedder {
        HybridEmbedder::TfIdfOnly(emb) => emb,
    };

    assert_eq!(
        tfidf_embedder.dimension,
        crate::search::search::DEFAULT_EMBEDDING_DIMENSION
    );
    assert_eq!(engine.node_count(), 3);
}
/// An embedder written with `persist_to_storage` must load back unchanged.
#[test]
fn test_tfidf_embedder_persist_roundtrip() {
    let storage_dir = tempfile::tempdir().unwrap();
    let corpus = vec![(String::from("a"), String::from("fn alpha beta gamma"))];
    let saved = TfIdfEmbedder::build(&corpus);
    let empty_graph = crate::graph::pdg::ProgramDependenceGraph::new();

    saved
        .persist_to_storage(storage_dir.path(), &empty_graph)
        .unwrap();

    let load_result = TfIdfEmbedder::load_from_storage(storage_dir.path());
    let restored = load_result.unwrap().unwrap();

    assert_eq!(saved.vocab, restored.vocab);
    assert_eq!(saved.idf, restored.idf);
    assert_eq!(saved.dimension, restored.dimension);
}
/// Loading from a directory with no persisted embedder succeeds with `None`.
#[test]
fn test_tfidf_embedder_missing_file_returns_none() {
    let empty_dir = tempfile::tempdir().unwrap();
    let loaded = TfIdfEmbedder::load_from_storage(empty_dir.path()).unwrap();
    assert!(loaded.is_none());
}
/// `is_fresh` is true only when both the node and the edge count match the
/// values recorded on the embedder.
#[test]
fn test_tfidf_embedder_freshness_checks_pdg_counts() {
    let mut embedder = TfIdfEmbedder::build_from_tokens(&[]);
    embedder.pdg_nodes = 3;
    embedder.pdg_edges = 7;

    // (nodes, edges, expected freshness)
    for (nodes, edges, expected) in [(3, 7, true), (4, 7, false), (3, 8, false)] {
        assert_eq!(embedder.is_fresh(nodes, edges), expected);
    }
}
/// Building twice from the same tokenized corpus must give identical
/// vocabulary and IDF values.
#[test]
fn test_tfidf_build_from_tokens_is_deterministic_across_batch_sizes() {
    let mut tokenized: Vec<(String, Vec<String>)> = Vec::with_capacity(90);
    for i in 0..90 {
        let source = format!(
            "pub fn symbol_{}() -> usize {{ {} + {} + {} }}",
            i,
            i,
            i % 5,
            i % 11
        );
        tokenized.push((format!("doc_{i}"), tokenize_code(&source)));
    }

    let first = TfIdfEmbedder::build_from_tokens(&tokenized);
    let second = TfIdfEmbedder::build_from_tokens(&tokenized);

    assert_eq!(first.vocab, second.vocab);
    assert_eq!(first.idf, second.idf);
}
/// `read_file_once` returns the file's BLAKE3 hex digest together with its bytes.
#[test]
fn test_read_file_once_hash_and_content() {
    let payload = b"Hello, streaming BLAKE3 world!";
    let dir = tempfile::tempdir().unwrap();
    let target = dir.path().join("test_file.txt");
    std::fs::write(&target, payload).unwrap();

    let (digest, data) = read_file_once(&target).unwrap();

    let reference_digest = blake3::hash(payload).to_hex().to_string();
    assert_eq!(
        digest, reference_digest,
        "streaming BLAKE3 hash must match independent computation"
    );
    assert_eq!(
        data.as_slice(),
        payload,
        "file bytes must match original content"
    );
}
/// An empty file yields the BLAKE3 digest of empty input and no bytes.
#[test]
fn test_read_file_once_empty_file() {
    let dir = tempfile::tempdir().unwrap();
    let target = dir.path().join("empty.txt");
    std::fs::write(&target, b"").unwrap();

    let (digest, data) = read_file_once(&target).unwrap();

    let reference_digest = blake3::hash(b"").to_hex().to_string();
    assert_eq!(
        digest, reference_digest,
        "empty file hash must match blake3 of empty input"
    );
    assert!(data.is_empty(), "empty file bytes should be empty");
}
/// Reading a path that does not exist must surface an error.
#[test]
fn test_read_file_once_error() {
    let missing = Path::new("/nonexistent/path/to/file.txt");
    let outcome = read_file_once(missing);
    assert!(
        outcome.is_err(),
        "reading a nonexistent file should return an error"
    );
}
/// Constructing a local hybrid embedder must not panic, and a successful
/// construction must report a neural backend.
///
/// The previous assertion (`is_ok() || is_err()`) was a tautology and could
/// never fail. An `Err` is still accepted here — presumably the local model may
/// be unavailable in the test environment; confirm.
#[test]
#[cfg(feature = "onnx")]
fn test_hybrid_embedder_local_creation() {
    let docs: Vec<(String, String)> = vec![
        ("test".to_string(), "fn test_function() -> bool".to_string()),
    ];
    let tfidf_embedder = TfIdfEmbedder::build(&docs);
    let result = HybridEmbedder::hybrid_local(tfidf_embedder, None);
    if let Ok(embedder) = result {
        assert!(
            embedder.has_neural(),
            "hybrid local embedder should have neural"
        );
    }
}
/// Without the `onnx` feature, a TF-IDF-only hybrid embedder can still be built.
#[test]
#[cfg(not(feature = "onnx"))]
fn test_hybrid_embedder_local_feature_not_enabled() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("test"),
        String::from("fn test_function() -> bool"),
    )];
    let tfidf = TfIdfEmbedder::build(&corpus);
    // Only construction is exercised; the resulting embedder is discarded.
    let _ = HybridEmbedder::tfidf_only(tfidf);
}
/// The default hybrid embedder is TF-IDF only: no neural backend, 768 dimensions.
#[test]
fn test_hybrid_embedder_tfidf_only_default() {
    let default_embedder = HybridEmbedder::default();

    let neural_present = default_embedder.has_neural();
    assert!(!neural_present, "default embedder should be TF-IDF only");

    let tfidf_dim = default_embedder.tfidf_dimension();
    assert_eq!(tfidf_dim, 768, "TF-IDF dimension should be 768");

    let neural_dim = default_embedder.neural_dimension();
    assert!(neural_dim.is_none(), "neural dimension should be None");
}
/// A TF-IDF-only embedder reports no neural backend and the no-neural weights.
#[test]
fn test_hybrid_embedder_tfidf_only() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("auth"),
        String::from("fn authenticate_user(token: &str) -> bool"),
    )];
    let hybrid = HybridEmbedder::tfidf_only(TfIdfEmbedder::build(&corpus));

    assert!(!hybrid.has_neural());
    assert_eq!(hybrid.tfidf_dimension(), 768);
    assert_eq!(hybrid.neural_weight(), 0.0);

    let scoring = hybrid.scoring_weights();
    assert_eq!(scoring.tfidf, 0.60, "TF-IDF weight should be 0.60 without neural");
    assert_eq!(scoring.neural, 0.00, "neural weight should be 0.00 without neural");
}
/// When a local hybrid embedder can be created, it exposes a neural backend
/// with the expected dimensions and scoring weights. If creation fails, the
/// test ends without asserting anything.
#[test]
#[cfg(feature = "onnx")]
fn test_hybrid_embedder_local_dimension() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("test"),
        String::from("fn test_function() -> bool"),
    )];
    let tfidf = TfIdfEmbedder::build(&corpus);
    let hybrid = match HybridEmbedder::hybrid_local(tfidf, None) {
        Ok(created) => created,
        Err(_) => return,
    };

    assert!(hybrid.has_neural(), "hybrid local embedder should have neural");
    assert_eq!(hybrid.tfidf_dimension(), 768, "TF-IDF dimension should be 768");
    assert!(hybrid.neural_dimension().is_some(), "neural dimension should be Some");
    assert_eq!(hybrid.neural_weight(), 0.40, "neural weight should be 0.40");

    let scoring = hybrid.scoring_weights();
    assert_eq!(scoring.tfidf, 0.30, "TF-IDF weight should be 0.30 with neural");
    assert_eq!(scoring.neural, 0.40, "neural weight should be 0.40 with neural");
}
/// `embed_tfidf` on a TF-IDF-only embedder returns a 768-dimensional vector.
#[test]
fn test_hybrid_embedder_embed_tfidf() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("auth"),
        String::from("fn authenticate_user(token: &str) -> bool"),
    )];
    let hybrid = HybridEmbedder::tfidf_only(TfIdfEmbedder::build(&corpus));

    let words: Vec<String> = ["authenticate", "user", "token"]
        .iter()
        .map(|w| w.to_string())
        .collect();
    let vector = hybrid.embed_tfidf(&words);

    assert_eq!(vector.len(), 768, "TF-IDF embedding dimension should be 768");
}
/// When a local hybrid embedder is available, both the TF-IDF and the neural
/// embedding paths produce usable vectors. Creation or neural-embedding
/// failures end the test without asserting.
#[test]
#[cfg(feature = "onnx")]
fn test_hybrid_embedder_embed_neural_local() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("test"),
        String::from("fn test_function() -> bool"),
    )];
    let tfidf = TfIdfEmbedder::build(&corpus);
    let hybrid = match HybridEmbedder::hybrid_local(tfidf, None) {
        Ok(created) => created,
        Err(_) => return,
    };

    let words: Vec<String> = ["test", "code", "embedding"]
        .iter()
        .map(|w| w.to_string())
        .collect();
    let lexical = hybrid.embed_tfidf(&words);
    assert_eq!(lexical.len(), 768, "TF-IDF embedding dimension should be 768");

    if let Some(Ok(neural)) = hybrid.embed_neural_blocking("test code embedding") {
        assert!(!neural.is_empty(), "neural embedding should have non-zero dimension");
        let any_nonzero = neural.iter().any(|component| *component != 0.0);
        assert!(any_nonzero, "neural embeddings should have non-zero values");
    }
}
/// Both weight presets carry the expected component values and sum to 1.
#[test]
fn test_hybrid_scoring_weights() {
    // Preset with a neural component.
    let neural_preset = HybridScoringWeights::default();
    assert_eq!(neural_preset.tfidf, 0.30);
    assert_eq!(neural_preset.neural, 0.40);
    assert_eq!(neural_preset.structural, 0.15);
    assert_eq!(neural_preset.text_match, 0.15);
    let neural_total = neural_preset.tfidf
        + neural_preset.neural
        + neural_preset.structural
        + neural_preset.text_match;
    assert!((neural_total - 1.0).abs() < 0.001);

    // Preset without a neural component.
    let lexical_preset = HybridScoringWeights::without_neural();
    assert_eq!(lexical_preset.tfidf, 0.60);
    assert_eq!(lexical_preset.neural, 0.00);
    assert_eq!(lexical_preset.structural, 0.20);
    assert_eq!(lexical_preset.text_match, 0.20);
    let lexical_total = lexical_preset.tfidf
        + lexical_preset.neural
        + lexical_preset.structural
        + lexical_preset.text_match;
    assert!((lexical_total - 1.0).abs() < 0.001);
}
/// After `normalize`, the four weight components sum to 1 (within 0.001).
#[test]
fn test_hybrid_scoring_weights_normalize() {
    let normalized = HybridScoringWeights {
        tfidf: 0.5,
        neural: 0.3,
        structural: 0.1,
        text_match: 0.1,
    }
    .normalize();

    let total =
        normalized.tfidf + normalized.neural + normalized.structural + normalized.text_match;
    assert!((total - 1.0).abs() < 0.001);
}
/// Compares the TF-IDF-only backend with the local hybrid backend (the latter
/// only when the `onnx` feature is enabled and construction succeeds).
#[test]
fn test_hybrid_embedder_compare_backends() {
    let corpus: Vec<(String, String)> = vec![(
        String::from("test"),
        String::from("fn test_function() -> bool"),
    )];
    let tfidf = TfIdfEmbedder::build(&corpus);

    // The clone keeps `tfidf` available for the feature-gated section below.
    let lexical_only = HybridEmbedder::tfidf_only(tfidf.clone());
    assert!(!lexical_only.has_neural());
    assert_eq!(lexical_only.tfidf_dimension(), 768);
    assert!(lexical_only.neural_dimension().is_none());

    #[cfg(feature = "onnx")]
    {
        if let Ok(with_neural) = HybridEmbedder::hybrid_local(tfidf, None) {
            assert!(with_neural.has_neural());
            assert_eq!(with_neural.tfidf_dimension(), 768);
            assert!(with_neural.neural_dimension().is_some());
        }
    }
}
}