use std::collections::HashMap;
use std::path::Path;
use crate::chunk::CodeChunk;
use crate::embed::SearchConfig;
use crate::encoder::VectorEncoder;
use crate::encoder::ripvec::bm25::{Bm25Index, search_bm25};
use crate::encoder::ripvec::dense::StaticEncoder;
use crate::encoder::ripvec::hybrid::{search_hybrid, search_semantic};
use crate::hybrid::SearchMode;
use crate::profile::Profiler;
/// In-memory search index over a set of code chunks.
///
/// Bundles the chunks, their dense embeddings, a BM25 index, the encoder
/// used for query text, and the lookup tables used to build search filters.
/// Constructed by `RipvecIndex::from_root`.
pub struct RipvecIndex {
/// Chunks returned by `StaticEncoder::embed_root`.
chunks: Vec<CodeChunk>,
/// Embedding vectors returned together with `chunks`
/// (presumably row `i` belongs to `chunks[i]` — confirm against `embed_root`).
embeddings: Vec<Vec<f32>>,
/// BM25 index built from `chunks`; used by keyword and hybrid search.
bm25: Bm25Index,
/// Encoder that produced the embeddings and encodes query text at search time.
encoder: StaticEncoder,
/// File path -> indices of the chunks whose `file_path` is exactly that path.
file_mapping: HashMap<String, Vec<usize>>,
/// File extension (as reported by `Path::extension`) -> indices of the chunks
/// in files with that extension. Chunks whose path has no UTF-8 extension
/// do not appear here.
language_mapping: HashMap<String, Vec<usize>>,
/// Optional string-keyed scores handed to `PageRankBoost`; `None` skips the
/// PageRank layer. (Keys are presumably file paths — confirm with the caller.)
pagerank_lookup: Option<HashMap<String, f32>>,
/// Second argument to `PageRankBoost::new`; values `<= 0.0` skip the layer.
pagerank_alpha: f32,
/// Corpus classification computed once when the index is built.
corpus_class: CorpusClass,
}
/// Coarse classification of a corpus by the share of its chunks that sit at
/// prose paths, as computed by `CorpusClass::classify`.
///
/// The serde attribute below renames the variants to lowercase when
/// (de)serializing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum CorpusClass {
/// Fewer than 30% of the chunks are at prose paths; also the result for an
/// empty corpus.
Code,
/// At least 30% but fewer than 70% of the chunks are at prose paths.
Mixed,
/// At least 70% of the chunks are at prose paths.
Docs,
}
impl CorpusClass {
    /// Classifies a corpus from the fraction of its chunks whose `file_path`
    /// is accepted by `ranking::is_prose_path`.
    ///
    /// * empty input, or a fraction below `0.3` -> [`CorpusClass::Code`]
    /// * a fraction of at least `0.3` but below `0.7` -> [`CorpusClass::Mixed`]
    /// * a fraction of at least `0.7` -> [`CorpusClass::Docs`]
    #[must_use]
    pub fn classify(chunks: &[CodeChunk]) -> Self {
        let total = chunks.len();
        if total == 0 {
            // Nothing to measure; an empty corpus is treated as code.
            return Self::Code;
        }

        let mut prose_chunks = 0_usize;
        for chunk in chunks {
            if crate::encoder::ripvec::ranking::is_prose_path(&chunk.file_path) {
                prose_chunks += 1;
            }
        }

        #[expect(
            clippy::cast_precision_loss,
            reason = "chunk count never exceeds f32 mantissa precision in practice"
        )]
        let prose_share = prose_chunks as f32 / total as f32;

        match prose_share {
            share if share >= 0.7 => Self::Docs,
            share if share >= 0.3 => Self::Mixed,
            _ => Self::Code,
        }
    }

    /// Returns `true` for [`CorpusClass::Mixed`] and [`CorpusClass::Docs`],
    /// `false` for [`CorpusClass::Code`].
    #[must_use]
    pub fn rerank_eligible(self) -> bool {
        match self {
            Self::Code => false,
            Self::Mixed | Self::Docs => true,
        }
    }
}
impl RipvecIndex {
    /// Builds an index for `root`.
    ///
    /// Chunking and embedding are delegated to `StaticEncoder::embed_root`.
    /// The BM25 index and the file/extension lookup tables are then derived
    /// from the returned chunks, inside the `"bm25_build"` and `"mappings"`
    /// profiler phases respectively. The corpus is classified once, here.
    ///
    /// `pagerank_lookup` and `pagerank_alpha` are stored as given and used
    /// only by the PageRank layer applied at the end of [`RipvecIndex::search`].
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `embed_root`.
    pub fn from_root(
        root: &Path,
        encoder: StaticEncoder,
        cfg: &SearchConfig,
        profiler: &Profiler,
        pagerank_lookup: Option<HashMap<String, f32>>,
        pagerank_alpha: f32,
    ) -> crate::Result<Self> {
        let (chunks, embeddings) = encoder.embed_root(root, cfg, profiler)?;

        // Each phase handle lives only for its own block, so it covers just
        // the work done inside that block.
        let bm25 = {
            let _phase = profiler.phase("bm25_build");
            Bm25Index::build(&chunks)
        };
        let (file_mapping, language_mapping) = {
            let _phase = profiler.phase("mappings");
            build_mappings(&chunks)
        };
        let corpus_class = CorpusClass::classify(&chunks);

        Ok(Self {
            chunks,
            embeddings,
            bm25,
            encoder,
            file_mapping,
            language_mapping,
            pagerank_lookup,
            pagerank_alpha,
            corpus_class,
        })
    }

    /// Corpus classification computed when the index was built.
    #[must_use]
    pub fn corpus_class(&self) -> CorpusClass {
        self.corpus_class
    }

    /// Number of chunks held by the index.
    #[must_use]
    pub fn len(&self) -> usize {
        self.chunks.len()
    }

    /// `true` when the index holds no chunks.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.chunks.is_empty()
    }

    /// All chunks held by the index.
    #[must_use]
    pub fn chunks(&self) -> &[CodeChunk] {
        self.chunks.as_slice()
    }

    /// All embedding vectors held by the index.
    #[must_use]
    pub fn embeddings(&self) -> &[Vec<f32>] {
        self.embeddings.as_slice()
    }

    /// Runs `query` against the index in the requested `mode`.
    ///
    /// * `Keyword` calls `search_bm25`.
    /// * `Semantic` encodes the query and calls `search_semantic`.
    /// * `Hybrid` encodes the query and calls `search_hybrid`; `alpha` is
    ///   forwarded only in this mode.
    ///
    /// `filter_languages` and `filter_paths` are turned into one combined
    /// selector: the union of every chunk index found for any listed
    /// language key or path. The raw hits are finally passed through the
    /// PageRank layer.
    ///
    /// Returns an empty vector when the index is empty or `query` contains
    /// only whitespace. Otherwise returns the `(index, score)` pairs produced
    /// by the selected routine (presumably indices into [`RipvecIndex::chunks`]
    /// — confirm against the search helpers).
    #[must_use]
    pub fn search(
        &self,
        query: &str,
        top_k: usize,
        mode: SearchMode,
        alpha: Option<f32>,
        filter_languages: Option<&[String]>,
        filter_paths: Option<&[String]>,
    ) -> Vec<(usize, f32)> {
        if self.chunks.is_empty() {
            return Vec::new();
        }
        if query.trim().is_empty() {
            return Vec::new();
        }

        let selector = self.build_selector(filter_languages, filter_paths);
        let allowed = selector.as_deref();

        let hits = match mode {
            SearchMode::Keyword => search_bm25(query, &self.bm25, top_k, allowed),
            SearchMode::Semantic => {
                let query_vec = self.encoder.encode_query(query);
                search_semantic(&query_vec, &self.embeddings, top_k, allowed)
            }
            SearchMode::Hybrid => {
                let query_vec = self.encoder.encode_query(query);
                search_hybrid(
                    query,
                    &query_vec,
                    &self.embeddings,
                    &self.chunks,
                    &self.bm25,
                    top_k,
                    alpha,
                    allowed,
                )
            }
        };

        self.apply_pagerank_layer(hits)
    }

    /// Collects the chunk indices selected by the language and path filters.
    ///
    /// Every entry of `filter_languages` is looked up in the extension table
    /// and every entry of `filter_paths` in the file table; entries with no
    /// match are skipped. The hits are merged, sorted ascending and
    /// de-duplicated.
    ///
    /// Returns `None` when nothing was collected, which covers both "no
    /// filters given" and "filters given but nothing matched".
    // NOTE(review): in the second case the caller receives `None`, which the
    // search helpers presumably treat as "no restriction" — confirm that
    // unmatched filters are meant to fall back to an unfiltered search.
    fn build_selector(
        &self,
        filter_languages: Option<&[String]>,
        filter_paths: Option<&[String]>,
    ) -> Option<Vec<usize>> {
        let language_hits = filter_languages
            .unwrap_or_default()
            .iter()
            .filter_map(|lang| self.language_mapping.get(lang));
        let path_hits = filter_paths
            .unwrap_or_default()
            .iter()
            .filter_map(|path| self.file_mapping.get(path));

        let mut ids: Vec<usize> = language_hits
            .chain(path_hits)
            .flatten()
            .copied()
            .collect();
        if ids.is_empty() {
            return None;
        }

        ids.sort_unstable();
        ids.dedup();
        Some(ids)
    }

    /// Runs the PageRank boost over `results`, when it is enabled.
    ///
    /// `results` is returned untouched if it is empty, if `pagerank_alpha`
    /// is `<= 0.0`, or if no lookup table was supplied. Otherwise a
    /// single-layer chain holding a `PageRankBoost` is applied in place via
    /// `ranking::apply_chain`.
    // NOTE(review): the lookup map is cloned on every call because an owned
    // value is handed to `PageRankBoost::new`; worth revisiting if that
    // constructor can borrow or share the map.
    fn apply_pagerank_layer(&self, mut results: Vec<(usize, f32)>) -> Vec<(usize, f32)> {
        if results.is_empty() || self.pagerank_alpha <= 0.0 {
            return results;
        }
        let Some(lookup) = self.pagerank_lookup.as_ref() else {
            return results;
        };

        let boost = crate::ranking::PageRankBoost::new(lookup.clone(), self.pagerank_alpha);
        let layers: Vec<Box<dyn crate::ranking::RankingLayer>> = vec![Box::new(boost)];
        crate::ranking::apply_chain(&mut results, &self.chunks, &layers);
        results
    }
}
impl crate::searchable::SearchableIndex for RipvecIndex {
    /// Forwards to the inherent [`RipvecIndex::chunks`].
    fn chunks(&self) -> &[CodeChunk] {
        RipvecIndex::chunks(self)
    }

    /// Forwards to the inherent [`RipvecIndex::search`] with no `alpha`
    /// override and no language or path filters.
    fn search(&self, query_text: &str, top_k: usize, mode: SearchMode) -> Vec<(usize, f32)> {
        RipvecIndex::search(self, query_text, top_k, mode, None, None, None)
    }

    /// Searches relative to the chunk at `chunk_idx`.
    ///
    /// * If `chunk_idx` has no embedding row, or `mode` is `Keyword`, this is
    ///   a plain keyword search for `query_text`.
    /// * For `Semantic` and `Hybrid`, `query_text` is not consulted: every
    ///   other embedding row is scored by its dot product with the row at
    ///   `chunk_idx`, and the `top_k` highest scores are returned in
    ///   descending order. The source chunk itself is excluded.
    fn search_from_chunk(
        &self,
        chunk_idx: usize,
        query_text: &str,
        top_k: usize,
        mode: SearchMode,
    ) -> Vec<(usize, f32)> {
        let keyword_search = || {
            RipvecIndex::search(
                self,
                query_text,
                top_k,
                SearchMode::Keyword,
                None,
                None,
                None,
            )
        };

        let Some(anchor) = self.embeddings().get(chunk_idx) else {
            return keyword_search();
        };

        match mode {
            SearchMode::Keyword => keyword_search(),
            SearchMode::Semantic | SearchMode::Hybrid => {
                let mut neighbours: Vec<(usize, f32)> = Vec::new();
                for (idx, row) in self.embeddings().iter().enumerate() {
                    if idx == chunk_idx {
                        continue;
                    }
                    let similarity: f32 =
                        anchor.iter().zip(row.iter()).map(|(a, b)| a * b).sum();
                    neighbours.push((idx, similarity));
                }
                // Highest score first, using the total order on f32.
                neighbours.sort_unstable_by(|lhs, rhs| lhs.1.total_cmp(&rhs.1).reverse());
                neighbours.truncate(top_k);
                neighbours
            }
        }
    }

    /// Exposes the index as `&dyn Any` so callers can downcast to the
    /// concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
/// Builds the two lookup tables used for search filtering.
///
/// Returns `(by_file, by_extension)`:
///
/// * `by_file` maps each distinct `file_path` to the indices of the chunks
///   carrying that path, in ascending order.
/// * `by_extension` maps each file extension (as reported by
///   [`Path::extension`], so without the leading dot) to the indices of the
///   chunks from files with that extension. Chunks whose path has no
///   extension, or an extension that is not valid UTF-8, are left out.
fn build_mappings(
    chunks: &[CodeChunk],
) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
    let mut by_file: HashMap<String, Vec<usize>> = HashMap::new();
    let mut by_extension: HashMap<String, Vec<usize>> = HashMap::new();

    for (idx, chunk) in chunks.iter().enumerate() {
        let path = &chunk.file_path;
        by_file.entry(path.clone()).or_default().push(idx);

        let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) else {
            continue;
        };
        by_extension.entry(ext.to_string()).or_default().push(idx);
    }

    (by_file, by_extension)
}
// Unit tests for this module. Neither test builds a `RipvecIndex`.
#[cfg(test)]
mod tests {
use super::*;
// Compile-time check only: `shape_check` must type-check against the
// six-argument inherent `RipvecIndex::search` signature and its
// `Vec<(usize, f32)>` return type. The helper is never called.
#[test]
fn semble_index_search_signature_compiles() {
fn shape_check(
idx: &RipvecIndex,
query: &str,
top_k: usize,
mode: SearchMode,
) -> Vec<(usize, f32)> {
idx.search(query, top_k, mode, None, None, None)
}
// Reference the helper so it counts as used without invoking it.
let _ = shape_check;
}
// Placeholder: this test contains no assertion and does not call
// `apply_pagerank_layer`.
// NOTE(review): the "no lookup -> results unchanged" path is not actually
// exercised here; consider replacing this with a real test.
#[test]
fn pagerank_layer_no_op_when_graph_absent() {
let _ = "see apply_pagerank_layer docs";
}
}