pub(crate) mod kg;
pub(crate) mod lanes;
pub(crate) mod materialize;
use std::collections::{HashMap, HashSet};
use anyhow::Result;
use crate::core::classifier::{QueryClassifier, QueryIntent};
use crate::core::git::{normalize_path, resolve_branch_files};
use crate::core::search::rrf::{rrf_fuse, RRF_K};
use super::archive::{self, MarkerCache};
use super::docs_penalty;
use super::{
definition_boost_query_tokens, file_type_score_multiplier, is_function_definition_chunk_type,
is_struct_definition_chunk_type, CodeChunk, CodeIndexer, SearchQuery, HNSW_OVERSAMPLE,
STRUCT_DEFINITION_BOOST,
};
/// Fixed score associated with grep-fallback hits.
/// NOTE(review): nothing in this file's visible code reads this constant; it is
/// presumably consumed by a submodule such as `lanes` — confirm before changing.
pub(crate) const GREP_FALLBACK_SCORE: f32 = 0.001;
/// Threshold for knowledge-graph refinement.
/// NOTE(review): not referenced in the visible code; presumably a similarity
/// cutoff used by the `kg` submodule — confirm against its use site.
pub(crate) const KG_REFINE_THRESHOLD: f32 = 0.4;
/// Folds a ranked grep lane into an already-fused candidate list.
///
/// Every entry of `grep_lane` adds `weight * (1.0 / (RRF_K + rank))` to the
/// score of its id, where `rank` is the entry's 1-based position in the lane;
/// the scores stored in `grep_lane` itself are ignored. Ids that are not yet in
/// `fused` start from `0.0`.
///
/// The merged list is ordered by descending score, ties broken by ascending
/// id, and cut to at most `top_k` entries.
///
/// When `grep_lane` is empty, `fused` is handed back exactly as received: it
/// is neither re-sorted nor truncated.
pub(crate) fn merge_grep_lane(
    fused: Vec<(String, f32)>,
    grep_lane: &[(String, f32)],
    weight: f32,
    top_k: usize,
) -> Vec<(String, f32)> {
    if grep_lane.is_empty() {
        return fused;
    }

    let mut totals: HashMap<String, f32> =
        HashMap::with_capacity(fused.len() + grep_lane.len());
    totals.extend(fused);

    // Ranks are 1-based, so pair the lane with a counter starting at 1.
    for (rank, (id, _)) in (1usize..).zip(grep_lane) {
        let contribution = weight * (1.0 / (RRF_K + rank as f32));
        *totals.entry(id.clone()).or_default() += contribution;
    }

    let mut ranked: Vec<(String, f32)> = totals.into_iter().collect();
    // Highest score first; incomparable or equal scores fall back to id order.
    ranked.sort_by(|lhs, rhs| match rhs.1.partial_cmp(&lhs.1) {
        Some(std::cmp::Ordering::Equal) | None => lhs.0.cmp(&rhs.0),
        Some(order) => order,
    });
    ranked.truncate(top_k);
    ranked
}
/// Lower bound that `resolve_branch_set` clamps `SearchQuery::branch_boost` to.
/// A clamped boost of `1.0` is treated there as "no boost".
pub(crate) const BRANCH_BOOST_MIN: f32 = 1.0;
/// Upper bound that `resolve_branch_set` clamps `SearchQuery::branch_boost` to.
pub(crate) const BRANCH_BOOST_MAX: f32 = 3.0;
pub(crate) fn resolve_branch_set(
query: &SearchQuery,
root_path: &std::path::Path,
) -> (Option<HashSet<String>>, f32) {
let boost = query.branch_boost.clamp(BRANCH_BOOST_MIN, BRANCH_BOOST_MAX);
let files: Option<Vec<String>> = match &query.branch_files {
Some(v) if !v.is_empty() => Some(v.clone()),
_ => match &query.branch {
Some(name) => resolve_branch_files(root_path, name),
None => None,
},
};
let set = files.and_then(|v| {
let s: HashSet<String> = v.iter().map(|p| normalize_path(p).to_owned()).collect();
if s.is_empty() {
None
} else {
Some(s)
}
});
if (boost - 1.0).abs() < f32::EPSILON {
(None, boost)
} else {
(set, boost)
}
}
impl CodeIndexer {
/// Runs one search request end to end and returns the re-ranked chunks.
///
/// Steps, in the order they execute:
/// 1. Record activity, classify `query.text` into a [`QueryIntent`], and take
///    `(alpha, beta, use_kg_first)` from `intent.weights()`.
/// 2. Widen `SearchMode::Code` to `SearchMode::All` for `Conceptual` and
///    `Definition` intents; every other combination keeps `query.mode`.
/// 3. Embed the query (skipped for the `Lexical` stage) and collect vector and
///    BM25 candidates, each capped at `top_k * HNSW_OVERSAMPLE` (saturating,
///    never below `top_k`).
/// 4. Give `inject_entity_exact_match` the chance to put an exact entity hit
///    at the head of the BM25 list.
/// 5. Fuse both lanes with `rrf_fuse`; for `Definition` intent also fold in a
///    grep lane. If nothing survives fusion, use grep results as the
///    candidate list.
/// 6. MMR re-rank, then knowledge-graph expansion (skipped for the `Lexical`
///    and `Semantic` stages, forced for the `Graph` stage).
/// 7. Apply score multipliers, materialize chunks, and run the archive/docs
///    filtering and down-ranking pass.
///
/// # Errors
/// Propagates failures from `embed_query`, `vector_search`, and `bm25_search`.
pub async fn search(&self, query: &SearchQuery) -> Result<Vec<CodeChunk>> {
    self.touch_activity();
    let intent = QueryClassifier::classify_with_domain(&query.text, &self.domain_terms);
    let (alpha, beta, use_kg_first) = intent.weights();
    let is_definition = matches!(intent, QueryIntent::Definition);
    let effective_mode = match (&intent, query.mode) {
        (QueryIntent::Conceptual, super::SearchMode::Code) => super::SearchMode::All,
        (QueryIntent::Definition, super::SearchMode::Code) => super::SearchMode::All,
        _ => query.mode,
    };
    tracing::debug!(
        "search index={} query={:?} intent={:?} alpha={} beta={} \
         mode={:?} effective_mode={:?}",
        self.index_id,
        query.text,
        intent,
        alpha,
        beta,
        query.mode,
        effective_mode
    );

    // An explicit stage restricts which lanes participate.
    let lexical_only = matches!(query.stage, Some(super::SearchStage::Lexical));
    let semantic_lane = matches!(query.stage, Some(super::SearchStage::Semantic));
    let graph_lane = matches!(query.stage, Some(super::SearchStage::Graph));
    let force_kg = graph_lane;
    let skip_kg = lexical_only || semantic_lane;

    let embedding = if lexical_only {
        None
    } else {
        self.embed_query(&query.text).await?
    };

    // Oversample so later re-ranking and filtering have room to work with.
    let want = query.top_k.saturating_mul(HNSW_OVERSAMPLE).max(query.top_k);
    // NOTE(review): unless `bm25_search` spawns its own task, this future does
    // no work until it is awaited below, so the vector and BM25 lookups run
    // back-to-back rather than concurrently — confirm whether that is intended.
    let bm25_fut = self.bm25_search(&query.text, want);
    let hnsw_results = match &embedding {
        Some(v) => self.vector_search(v, want).await?,
        None => Vec::new(),
    };
    let mut bm25_results = bm25_fut.await?;
    self.inject_entity_exact_match(&intent, &query.text, beta, &mut bm25_results)
        .await;

    let grep_lane: Vec<(String, f32)> = if is_definition {
        self.grep_fallback_search(&query.text, want).await
    } else {
        Vec::new()
    };
    let fused_raw = rrf_fuse(&hnsw_results, &bm25_results, alpha, beta, RRF_K, want);
    let fused_raw = merge_grep_lane(fused_raw, &grep_lane, beta, want);
    let fused_raw = if !fused_raw.is_empty() {
        fused_raw
    } else if is_definition {
        // Grep already ran above with the same text and limit; reuse that
        // result instead of scanning a second time.
        grep_lane
    } else {
        self.grep_fallback_search(&query.text, want).await
    };
    let fused = self.apply_mmr_rerank(fused_raw, want).await;

    // The refine query is only embedded when the KG step will actually run.
    let refine_embedding: Option<Vec<f32>> = if skip_kg {
        None
    } else {
        match &query.refine_query {
            Some(rq) if !rq.is_empty() => self.embed_query(rq).await?,
            _ => None,
        }
    };
    let (all, kg_ids) = if skip_kg {
        (fused, HashSet::new())
    } else {
        let effective_use_kg = use_kg_first || force_kg;
        let effective_expand = query.expand_graph || force_kg;
        self.expand_with_kg(
            fused,
            &intent,
            effective_use_kg,
            effective_expand,
            refine_embedding.as_deref(),
        )
        .await
    };

    let (branch_set, branch_boost) = resolve_branch_set(query, &self.root_path);
    let all = self
        .apply_score_adjustments(
            all,
            &intent,
            &query.text,
            branch_set.as_ref(),
            branch_boost,
            effective_mode,
        )
        .await;
    let mut result = self
        .materialize_search_results(
            all,
            &hnsw_results,
            &bm25_results,
            &kg_ids,
            branch_set.as_ref(),
            query,
        )
        .await;
    self.apply_archive_downrank(&mut result, effective_mode, query.exclude_archived);
    Ok(result)
}
/// Final filtering and down-ranking pass over materialized results.
///
/// In order:
/// * in `SearchMode::Code`, `Docstring` chunks are removed;
/// * chunks whose file `docs_penalty::is_allowed_for_mode` rejects are removed;
/// * each remaining chunk is classified by `archive::classify`; when that
///   yields a reason, the chunk's score is multiplied by the returned factor;
/// * `archive_reason` is set to the archive reason if there is one, otherwise
///   to the docs-penalty reason if there is one, otherwise left untouched;
/// * with `exclude_archived`, chunks whose archive reason does not start with
///   `"stale:"` are removed (matched by chunk id);
/// * the survivors are sorted by descending score, ties by ascending id.
///
/// An empty `results` is returned untouched.
fn apply_archive_downrank(
    &self,
    results: &mut Vec<CodeChunk>,
    mode: super::SearchMode,
    exclude_archived: bool,
) {
    if results.is_empty() {
        return;
    }

    let code_only = matches!(mode, super::SearchMode::Code);
    results.retain(|candidate| {
        let is_docstring = matches!(
            candidate.chunk_type,
            crate::core::chunker::ChunkType::Docstring
        );
        !(code_only && is_docstring) && docs_penalty::is_allowed_for_mode(&candidate.file, mode)
    });

    let mut markers = MarkerCache::new();
    let mut excluded: HashSet<String> = HashSet::new();
    for chunk in results.iter_mut() {
        let (archive_factor, archive_reason) =
            archive::classify(&self.root_path, &chunk.file, &chunk.content, &mut markers);
        // Only the reason is needed here; the docs multiplier is not applied in this pass.
        let (_, docs_reason) = docs_penalty::doc_score_penalty(&chunk.file, mode);

        if let Some(reason) = archive_reason.as_ref() {
            chunk.score *= archive_factor;
            // "stale:" reasons are down-ranked but never dropped.
            if exclude_archived && !reason.starts_with("stale:") {
                excluded.insert(chunk.id.clone());
            }
        }

        // The archive reason takes precedence over the docs reason.
        if let Some(reason) = archive_reason.or(docs_reason) {
            chunk.archive_reason = Some(reason);
        }
    }

    // `excluded` is only ever filled when `exclude_archived` is set.
    if !excluded.is_empty() {
        results.retain(|candidate| !excluded.contains(&candidate.id));
    }

    results.sort_by(|lhs, rhs| match rhs.score.partial_cmp(&lhs.score) {
        Some(std::cmp::Ordering::Equal) | None => lhs.id.cmp(&rhs.id),
        Some(order) => order,
    });
}
/// Scales each candidate's score by a product of multipliers and re-sorts.
///
/// For a candidate whose chunk is returned by `fetch_chunks_for_ids`, the
/// multiplier starts at `1.0` and is multiplied, in this order, by:
/// * `file_type_score_multiplier(file)` — only for `Definition` intent;
/// * the multiplier from `docs_penalty::doc_score_penalty(file, effective_mode)`;
/// * `STRUCT_DEFINITION_BOOST` — only for `Definition` intent, when the chunk
///   is a struct or function definition and its lower-cased `function_name`
///   contains one of the tokens from `definition_boost_query_tokens`;
/// * `branch_boost` — when `branch_files` contains the normalized file path.
///
/// Candidates without a fetched chunk keep their score. The result is sorted
/// by descending score, ties by ascending id.
async fn apply_score_adjustments(
    &self,
    candidates: Vec<(String, f32)>,
    intent: &QueryIntent,
    query_text: &str,
    branch_files: Option<&HashSet<String>>,
    branch_boost: f32,
    effective_mode: super::SearchMode,
) -> Vec<(String, f32)> {
    let is_definition = matches!(intent, QueryIntent::Definition);
    let name_tokens: Vec<String> = if is_definition {
        definition_boost_query_tokens(query_text)
    } else {
        Vec::new()
    };

    let ids: Vec<String> = candidates.iter().map(|(id, _)| id.clone()).collect();
    let chunks = self.fetch_chunks_for_ids(&ids).await;

    let mut adjusted: Vec<(String, f32)> = Vec::with_capacity(candidates.len());
    for (id, score) in candidates {
        let mut multiplier = 1.0_f32;

        if let Some(chunk) = chunks.get(&id) {
            if is_definition {
                multiplier *= file_type_score_multiplier(&chunk.file);
            }

            let (docs_mult, _) = docs_penalty::doc_score_penalty(&chunk.file, effective_mode);
            multiplier *= docs_mult;

            let definition_like = !name_tokens.is_empty()
                && (is_struct_definition_chunk_type(&chunk.chunk_type)
                    || is_function_definition_chunk_type(&chunk.chunk_type));
            if definition_like {
                if let Some(name) = chunk.function_name.as_deref() {
                    let lowered = name.to_ascii_lowercase();
                    if name_tokens.iter().any(|tok| lowered.contains(tok.as_str())) {
                        multiplier *= STRUCT_DEFINITION_BOOST;
                    }
                }
            }

            if let Some(files) = branch_files {
                if files.contains(normalize_path(&chunk.file)) {
                    multiplier *= branch_boost;
                }
            }
        }

        adjusted.push((id, score * multiplier));
    }

    adjusted.sort_by(|lhs, rhs| match rhs.1.partial_cmp(&lhs.1) {
        Some(std::cmp::Ordering::Equal) | None => lhs.0.cmp(&rhs.0),
        Some(order) => order,
    });
    adjusted
}
/// Promotes an exact entity match to the head of the BM25 list.
///
/// Only acts for `Definition` and `Unknown` intents. When
/// `entity_exact_match(query_text)` yields an id, any existing entry with that
/// id is removed from `bm25_results` and the id is re-inserted at position 0
/// with score `beta * 1.5`. In every other case `bm25_results` is unchanged.
async fn inject_entity_exact_match(
    &self,
    intent: &QueryIntent,
    query_text: &str,
    beta: f32,
    bm25_results: &mut Vec<(String, f32)>,
) {
    if matches!(intent, QueryIntent::Definition | QueryIntent::Unknown) {
        if let Some(hit) = self.entity_exact_match(query_text).await {
            // Drop any existing entry first so the id appears exactly once.
            bm25_results.retain(|(existing, _)| *existing != hit);
            bm25_results.insert(0, (hit, beta * 1.5));
        }
    }
}
/// Re-ranks fused candidates with `mmr_rerank`, using cached chunk embeddings.
///
/// If the embedding cache is empty, `fused_raw` is returned unchanged.
/// Otherwise the embeddings of the candidates that are present in the cache
/// are copied out via `peek`, the read guard is released, and
/// `crate::core::mmr::mmr_rerank` is called with `DEFAULT_LAMBDA` and `top_k`.
async fn apply_mmr_rerank(
    &self,
    fused_raw: Vec<(String, f32)>,
    top_k: usize,
) -> Vec<(String, f32)> {
    // The guard lives only inside this block, so it is gone before re-ranking.
    let snapshot: HashMap<String, Vec<f32>> = {
        let cache = self.chunk_embeddings.read().await;
        if cache.is_empty() {
            return fused_raw;
        }
        let mut copied = HashMap::new();
        for (id, _) in &fused_raw {
            if let Some(vector) = cache.peek(id) {
                copied.insert(id.clone(), vector.clone());
            }
        }
        copied
    };

    crate::core::mmr::mmr_rerank(
        fused_raw,
        &snapshot,
        crate::core::mmr::DEFAULT_LAMBDA,
        top_k,
    )
}
}