use std::collections::HashMap;
use std::path::{Path, PathBuf};
use crate::chunk::CodeChunk;
use crate::embed::SearchConfig;
use crate::encoder::VectorEncoder;
use crate::encoder::ripvec::bm25::{Bm25Index, search_bm25};
use crate::encoder::ripvec::dense::StaticEncoder;
use crate::encoder::ripvec::hybrid::{search_hybrid, search_semantic};
use crate::encoder::ripvec::manifest::{Diff, FileEntry, Manifest, diff_against_walk};
use crate::hybrid::SearchMode;
use crate::profile::Profiler;
use crate::walk::{WalkOptions, collect_files_with_options};
/// In-memory search index over the chunks of a source tree.
///
/// Bundles the chunk list with a dense embedding matrix and a BM25 index,
/// plus the bookkeeping (root, walk options, manifest) that
/// `diff_against_filesystem` and `apply_diff` use to refresh the index.
pub struct RipvecIndex {
/// Indexed chunks; position `i` pairs with row `i` of `embeddings`.
chunks: Vec<CodeChunk>,
/// Dense embeddings, one row per chunk.
embeddings: ndarray::Array2<f32>,
/// Keyword index built from `chunks` via `Bm25Index::build`.
bm25: Bm25Index,
/// Encoder used for query embedding and for re-embedding changed files.
encoder: std::sync::Arc<StaticEncoder>,
/// `chunk.file_path` -> indices into `chunks` (see `build_mappings`).
file_mapping: HashMap<String, Vec<usize>>,
/// File extension of `chunk.file_path` -> indices into `chunks`.
language_mapping: HashMap<String, Vec<usize>>,
/// Optional score table handed to `PageRankBoost`; `None` disables the boost.
/// NOTE(review): keys are presumably file paths — confirm against `PageRankBoost`.
pagerank_lookup: Option<std::sync::Arc<HashMap<String, f32>>>,
/// Boost strength; the PageRank layer is skipped when this is `<= 0.0`.
pagerank_alpha: f32,
/// Result of `CorpusClass::classify` over `chunks`.
corpus_class: CorpusClass,
/// Root directory the index was built from.
root: PathBuf,
/// Walk options used when (re)scanning `root`.
walk_options: WalkOptions,
/// Per-file manifest used to diff the index against the filesystem.
manifest: Manifest,
}
/// Coarse classification of a corpus by how much of it is prose.
///
/// Produced by `CorpusClass::classify`; serialized by serde in lowercase
/// (`"code"`, `"mixed"`, `"docs"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum CorpusClass {
/// Prose fraction below `prose_density::CORPUS_MIXED_FRAC` (also the
/// result for an empty corpus).
Code,
/// Prose fraction at least `CORPUS_MIXED_FRAC` but below `CORPUS_DOCS_FRAC`.
Mixed,
/// Prose fraction at least `prose_density::CORPUS_DOCS_FRAC`.
Docs,
}
impl CorpusClass {
    /// Classifies a corpus by the fraction of its chunks that count as prose.
    ///
    /// A chunk counts as prose when its path satisfies `is_prose_path` or its
    /// content is prose-dominated (`chunk_is_prose_dominated`). The fraction
    /// is compared against the thresholds in `prose_density`. An empty corpus
    /// classifies as [`CorpusClass::Code`].
    #[must_use]
    pub fn classify(chunks: &[CodeChunk]) -> Self {
        if chunks.is_empty() {
            return Self::Code;
        }

        let mut prose_chunks = 0usize;
        for chunk in chunks {
            let prose_by_path = crate::encoder::ripvec::ranking::is_prose_path(&chunk.file_path);
            // The content scan only runs when the path signal did not fire.
            if prose_by_path || chunk_is_prose_dominated(chunk) {
                prose_chunks += 1;
            }
        }

        #[expect(
            clippy::cast_precision_loss,
            reason = "chunk count never exceeds f32 mantissa precision in practice"
        )]
        let prose_fraction = prose_chunks as f32 / chunks.len() as f32;

        // Highest threshold first so that Docs wins over Mixed.
        match prose_fraction {
            f if f >= prose_density::CORPUS_DOCS_FRAC => Self::Docs,
            f if f >= prose_density::CORPUS_MIXED_FRAC => Self::Mixed,
            _ => Self::Code,
        }
    }

    /// Returns `true` for the classes that are eligible for reranking
    /// (`Mixed` and `Docs`), `false` for `Code`.
    #[must_use]
    pub fn rerank_eligible(self) -> bool {
        match self {
            Self::Mixed | Self::Docs => true,
            Self::Code => false,
        }
    }
}
/// Thresholds used by the prose-density heuristics in this file.
pub(super) mod prose_density {
/// A chunk is prose-dominated when strictly more than this fraction of its
/// bytes are counted as prose by `prose_byte_count`.
pub const CHUNK_DOMINANCE_FRAC: f32 = 0.5;
/// Minimum fraction of prose chunks for a corpus to classify as `Mixed`.
pub const CORPUS_MIXED_FRAC: f32 = 0.3;
/// Minimum fraction of prose chunks for a corpus to classify as `Docs`.
pub const CORPUS_DOCS_FRAC: f32 = 0.7;
}
/// Returns `true` when strictly more than
/// `prose_density::CHUNK_DOMINANCE_FRAC` of the chunk's content bytes are
/// counted as prose by `prose_byte_count`.
///
/// Empty content is never prose-dominated.
#[must_use]
fn chunk_is_prose_dominated(chunk: &CodeChunk) -> bool {
    let content: &str = &chunk.content;
    if content.is_empty() {
        return false;
    }

    #[expect(
        clippy::cast_precision_loss,
        reason = "chunk content length never exceeds f32 mantissa precision in practice"
    )]
    let prose_share = prose_byte_count(content) as f32 / content.len() as f32;

    prose_share > prose_density::CHUNK_DOMINANCE_FRAC
}
/// Counts the bytes of `source` that fall inside comment- or docstring-like
/// regions.
///
/// Recognised regions, checked in this order at each byte offset:
/// - triple-quoted strings (`"""…"""` or `'''…'''`), delimiters included;
/// - `/* … */` block comments, delimiters included;
/// - `//` line comments, up to but excluding the newline;
/// - `#` line comments, up to but excluding the newline.
///
/// An unterminated triple-quote or block comment extends to the end of the
/// input. The scan is purely lexical: it does not track ordinary string
/// literals, and any `#` byte outside an earlier region starts a line comment.
#[must_use]
fn prose_byte_count(source: &str) -> usize {
    /// Offset one past the first `closer` found at or after
    /// `open + closer.len()`, or `bytes.len()` when there is none.
    /// The opener is assumed to be as wide as the closer.
    fn delimited_end(bytes: &[u8], open: usize, closer: &[u8]) -> usize {
        let width = closer.len();
        let search_from = open + width;
        bytes[search_from..]
            .windows(width)
            .position(|window| window == closer)
            .map_or(bytes.len(), |offset| search_from + offset + width)
    }

    /// Offset of the first `\n` at or after `from`, or `bytes.len()`.
    fn line_end(bytes: &[u8], from: usize) -> usize {
        bytes[from..]
            .iter()
            .position(|&byte| byte == b'\n')
            .map_or(bytes.len(), |offset| from + offset)
    }

    let bytes = source.as_bytes();
    let mut prose = 0usize;
    let mut cursor = 0usize;

    while cursor < bytes.len() {
        let rest = &bytes[cursor..];

        let region_end = if rest.starts_with(b"\"\"\"") {
            Some(delimited_end(bytes, cursor, b"\"\"\""))
        } else if rest.starts_with(b"'''") {
            Some(delimited_end(bytes, cursor, b"'''"))
        } else if rest.starts_with(b"/*") {
            Some(delimited_end(bytes, cursor, b"*/"))
        } else if rest.starts_with(b"//") || rest[0] == b'#' {
            Some(line_end(bytes, cursor))
        } else {
            None
        };

        match region_end {
            Some(end) => {
                prose += end - cursor;
                cursor = end;
            }
            None => cursor += 1,
        }
    }

    prose
}
impl RipvecIndex {
/// Builds an index by embedding every file under `root`.
///
/// The encoder chunks and embeds the tree (`embed_root`); the per-chunk
/// embedding rows are flattened into one matrix, and the BM25 index, the
/// file/extension mappings, the corpus class and the manifest are derived
/// afterwards. The manifest comes from a separate walk of `root` using
/// `cfg.walk_options()`.
///
/// # Errors
///
/// Propagates any error from `embed_root`, and returns `Error::Other` when
/// the flattened rows cannot be reshaped into an `(n_chunks, hidden_dim)`
/// matrix.
pub fn from_root(
    root: &Path,
    encoder: StaticEncoder,
    cfg: &SearchConfig,
    profiler: &Profiler,
    pagerank_lookup: Option<HashMap<String, f32>>,
    pagerank_alpha: f32,
) -> crate::Result<Self> {
    let (chunks, rows) = encoder.embed_root(root, cfg, profiler)?;

    // The width of the first row defines the matrix width; an empty corpus
    // yields a 0 x 0 matrix.
    let n_chunks = rows.len();
    let hidden_dim = if n_chunks == 0 { 0 } else { rows[0].len() };

    for row in &rows {
        debug_assert_eq!(
            row.len(),
            hidden_dim,
            "ragged embeddings: row of {} vs expected {hidden_dim}",
            row.len()
        );
    }
    let mut flat = Vec::<f32>::with_capacity(n_chunks * hidden_dim);
    flat.extend(rows.into_iter().flatten());

    let embeddings = ndarray::Array2::from_shape_vec((n_chunks, hidden_dim), flat)
        .map_err(|e| crate::Error::Other(anyhow::anyhow!("embeddings reshape: {e}")))?;

    let bm25 = {
        let _phase = profiler.phase("bm25_build");
        Bm25Index::build(&chunks)
    };
    let (file_mapping, language_mapping) = {
        let _phase = profiler.phase("mappings");
        build_mappings(&chunks)
    };
    let corpus_class = CorpusClass::classify(&chunks);

    let walk_options = cfg.walk_options();
    let root = root.to_path_buf();
    let manifest = {
        let _phase = profiler.phase("manifest_build");
        build_manifest(&root, &walk_options)
    };

    Ok(Self {
        chunks,
        embeddings,
        bm25,
        encoder: std::sync::Arc::new(encoder),
        file_mapping,
        language_mapping,
        pagerank_lookup: pagerank_lookup.map(std::sync::Arc::new),
        pagerank_alpha,
        corpus_class,
        root,
        walk_options,
        manifest,
    })
}
pub fn apply_diff(&self, diff: &Diff, profiler: &Profiler) -> crate::Result<Self> {
use std::collections::HashSet;
let rel_path_for = |p: &Path| -> String {
p.strip_prefix(&self.root)
.unwrap_or(p)
.display()
.to_string()
};
let mut removed_indices: HashSet<usize> = HashSet::new();
for path in diff
.deleted
.iter()
.chain(diff.dirty.iter())
.chain(diff.new.iter())
{
let rel = rel_path_for(path);
if let Some(indices) = self.file_mapping.get(&rel) {
removed_indices.extend(indices.iter().copied());
}
}
let mut kept_chunks: Vec<CodeChunk> = Vec::with_capacity(self.chunks.len());
let mut kept_emb_rows: Vec<Vec<f32>> = Vec::with_capacity(self.chunks.len());
for (i, chunk) in self.chunks.iter().enumerate() {
if removed_indices.contains(&i) {
continue;
}
kept_chunks.push(chunk.clone());
kept_emb_rows.push(self.embeddings.row(i).to_vec());
}
let mut to_embed: Vec<std::path::PathBuf> = Vec::new();
to_embed.extend(diff.new.iter().cloned());
to_embed.extend(diff.dirty.iter().cloned());
let (new_chunks, new_embs) = if to_embed.is_empty() {
(Vec::new(), Vec::new())
} else {
let _g = profiler.phase("apply_diff_embed");
self.encoder.embed_paths(&self.root, &to_embed, profiler)?
};
kept_chunks.extend(new_chunks);
kept_emb_rows.extend(new_embs);
let n = kept_emb_rows.len();
let hidden_dim = kept_emb_rows
.first()
.map_or(self.embeddings.ncols(), Vec::len);
let mut flat: Vec<f32> = Vec::with_capacity(n * hidden_dim);
for row in kept_emb_rows {
flat.extend(row);
}
let embeddings = if n == 0 {
ndarray::Array2::<f32>::zeros((0, hidden_dim))
} else {
ndarray::Array2::from_shape_vec((n, hidden_dim), flat).map_err(|e| {
crate::Error::Other(anyhow::anyhow!("apply_diff embeddings reshape: {e}"))
})?
};
let bm25 = {
let _g = profiler.phase("apply_diff_bm25");
Bm25Index::build(&kept_chunks)
};
let (file_mapping, language_mapping) = {
let _g = profiler.phase("apply_diff_mappings");
build_mappings(&kept_chunks)
};
let corpus_class = CorpusClass::classify(&kept_chunks);
let mut manifest = self.manifest.clone();
for path in &diff.deleted {
manifest.files.remove(path);
}
for path in diff.new.iter().chain(diff.dirty.iter()) {
if let Ok(entry) = FileEntry::from_path(path) {
manifest.insert(path.clone(), entry);
}
}
for (path, refreshed_entry) in &diff.touched_clean {
if let Some(entry_mut) = manifest.files.get_mut(path) {
entry_mut.mtime = refreshed_entry.mtime;
entry_mut.size = refreshed_entry.size;
entry_mut.ino = refreshed_entry.ino;
entry_mut.blake3 = refreshed_entry.blake3;
}
}
Ok(Self {
chunks: kept_chunks,
embeddings,
bm25,
encoder: std::sync::Arc::clone(&self.encoder),
file_mapping,
language_mapping,
pagerank_lookup: self.pagerank_lookup.clone(),
pagerank_alpha: self.pagerank_alpha,
corpus_class,
root: self.root.clone(),
walk_options: self.walk_options.clone(),
manifest,
})
}
/// Walks `root` with the stored walk options and diffs the result against
/// the manifest.
///
/// The comparison runs on a clone of the manifest, so the manifest held by
/// this index is not modified.
#[must_use]
pub fn diff_against_filesystem(&self) -> Diff {
    let mut scratch = self.manifest.clone();
    let on_disk = collect_files_with_options(&self.root, &self.walk_options);
    diff_against_walk(&mut scratch, &on_disk)
}
/// Root directory this index was built from.
#[must_use]
pub fn root(&self) -> &Path {
&self.root
}
/// Walk options used when scanning `root`.
#[must_use]
pub fn walk_options(&self) -> &WalkOptions {
&self.walk_options
}
/// Manifest used to diff this index against the filesystem.
#[must_use]
pub fn manifest(&self) -> &Manifest {
&self.manifest
}
/// Corpus classification computed when the index was built.
#[must_use]
pub fn corpus_class(&self) -> CorpusClass {
self.corpus_class
}
/// Number of chunks in the index.
#[must_use]
pub fn len(&self) -> usize {
self.chunks.len()
}
/// Returns `true` when the index holds no chunks.
#[must_use]
pub fn is_empty(&self) -> bool {
self.chunks.is_empty()
}
/// All indexed chunks; search results refer to positions in this slice.
#[must_use]
pub fn chunks(&self) -> &[CodeChunk] {
&self.chunks
}
/// Dense embedding matrix, one row per chunk.
#[must_use]
pub fn embeddings(&self) -> &ndarray::Array2<f32> {
&self.embeddings
}
/// Runs a search and returns `(chunk index, score)` pairs.
///
/// Returns an empty list for an empty index or a blank query. `mode` picks
/// the backend: BM25 for `Keyword`, dense similarity for `Semantic`, both
/// for `Hybrid`. `alpha` is forwarded to the hybrid search only. The
/// optional filters are turned into a chunk selector by `build_selector`.
/// The raw results finally pass through `apply_pagerank_layer`.
#[must_use]
pub fn search(
    &self,
    query: &str,
    top_k: usize,
    mode: SearchMode,
    alpha: Option<f32>,
    filter_languages: Option<&[String]>,
    filter_paths: Option<&[String]>,
) -> Vec<(usize, f32)> {
    if query.trim().is_empty() || self.is_empty() {
        return Vec::new();
    }

    let selector = self.build_selector(filter_languages, filter_paths);
    let restrict_to = selector.as_deref();

    let ranked = match mode {
        SearchMode::Keyword => search_bm25(query, &self.bm25, top_k, restrict_to),
        SearchMode::Semantic => {
            let query_vec = self.encoder.encode_query(query);
            search_semantic(&query_vec, &self.embeddings, top_k, restrict_to)
        }
        SearchMode::Hybrid => {
            let query_vec = self.encoder.encode_query(query);
            search_hybrid(
                query,
                &query_vec,
                &self.embeddings,
                &self.chunks,
                &self.bm25,
                top_k,
                alpha,
                restrict_to,
            )
        }
    };

    self.apply_pagerank_layer(ranked)
}
/// Builds the sorted, de-duplicated list of chunk indices selected by the
/// language and path filters.
///
/// The two filters are combined as a union: a chunk is selected when it
/// matches any listed language (file extension key) or any listed path.
///
/// Returns `None` when nothing is selected. NOTE(review): this also covers
/// filters that were supplied but matched no chunk, so such a search runs
/// unfiltered — confirm that this is the intended contract.
fn build_selector(
    &self,
    filter_languages: Option<&[String]>,
    filter_paths: Option<&[String]>,
) -> Option<Vec<usize>> {
    let by_language = filter_languages
        .into_iter()
        .flatten()
        .filter_map(|lang| self.language_mapping.get(lang));
    let by_path = filter_paths
        .into_iter()
        .flatten()
        .filter_map(|path| self.file_mapping.get(path));

    let mut selector: Vec<usize> = by_language.chain(by_path).flatten().copied().collect();
    if selector.is_empty() {
        return None;
    }

    selector.sort_unstable();
    selector.dedup();
    Some(selector)
}
/// Applies the PageRank boost to `results` when it is enabled.
///
/// The results are returned unchanged when they are empty, when
/// `pagerank_alpha <= 0.0`, or when no PageRank lookup was supplied.
/// Otherwise a single `PageRankBoost` layer is run through
/// `crate::ranking::apply_chain`.
fn apply_pagerank_layer(&self, mut results: Vec<(usize, f32)>) -> Vec<(usize, f32)> {
    if results.is_empty() || self.pagerank_alpha <= 0.0 {
        return results;
    }
    let Some(lookup) = self.pagerank_lookup.as_ref() else {
        return results;
    };

    let boost =
        crate::ranking::PageRankBoost::new(std::sync::Arc::clone(lookup), self.pagerank_alpha);
    let layers: Vec<Box<dyn crate::ranking::RankingLayer>> = vec![Box::new(boost)];
    crate::ranking::apply_chain(&mut results, &self.chunks, &layers);
    results
}
}
impl crate::searchable::SearchableIndex for RipvecIndex {
    /// All indexed chunks (delegates to the inherent accessor).
    fn chunks(&self) -> &[CodeChunk] {
        RipvecIndex::chunks(self)
    }

    /// Unfiltered search with the default hybrid weighting (no `alpha`, no
    /// language or path filters).
    fn search(&self, query_text: &str, top_k: usize, mode: SearchMode) -> Vec<(usize, f32)> {
        RipvecIndex::search(self, query_text, top_k, mode, None, None, None)
    }

    /// Finds the chunks most similar to the chunk at `chunk_idx`.
    ///
    /// - If `chunk_idx` is out of range, or `mode` is `Keyword`, this falls
    ///   back to a keyword search for `query_text`.
    /// - Otherwise (`Semantic` / `Hybrid`) every other chunk is scored by the
    ///   product of the embedding matrix with the source chunk's row, and the
    ///   `top_k` best are returned, highest score first (ties broken by lower
    ///   chunk index). The source chunk itself is excluded.
    ///
    /// `top_k == 0` yields an empty result in the similarity branch.
    fn search_from_chunk(
        &self,
        chunk_idx: usize,
        query_text: &str,
        top_k: usize,
        mode: SearchMode,
    ) -> Vec<(usize, f32)> {
        // Descending score, then ascending chunk index for determinism.
        fn by_score_desc(a: &(usize, f32), b: &(usize, f32)) -> std::cmp::Ordering {
            b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0))
        }

        let keyword_search = || {
            RipvecIndex::search(
                self,
                query_text,
                top_k,
                SearchMode::Keyword,
                None,
                None,
                None,
            )
        };

        if chunk_idx >= self.embeddings().nrows() {
            return keyword_search();
        }

        match mode {
            SearchMode::Keyword => keyword_search(),
            SearchMode::Semantic | SearchMode::Hybrid => {
                // Guard against `top_k - 1` underflowing below: with
                // `top_k == 0` the partition index would wrap around and
                // `select_nth_unstable_by` would panic.
                if top_k == 0 {
                    return Vec::new();
                }

                let source = self.embeddings().row(chunk_idx);
                let scores =
                    crate::encoder::ripvec::hybrid::parallel_sgemv(self.embeddings(), &source);
                let mut scored: Vec<(usize, f32)> = scores
                    .iter()
                    .enumerate()
                    .filter(|(i, _)| *i != chunk_idx)
                    .map(|(i, &s)| (i, s))
                    .collect();

                // Partition so the best `top_k` come first, then sort only those.
                if scored.len() > top_k {
                    scored.select_nth_unstable_by(top_k - 1, by_score_desc);
                    scored.truncate(top_k);
                }
                scored.sort_unstable_by(by_score_desc);
                scored
            }
        }
    }

    /// Exposes the concrete index for downcasting.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
/// Finds the chunk of `file_path` that covers `target_line_1based`.
///
/// A chunk belongs to the file when its `file_path` equals `file_path`
/// exactly, or ends with `file_path` immediately preceded by a `/`
/// (so `src/a.rs` matches `/repo/src/a.rs` but not `/repo/xsrc/a.rs`).
///
/// The first matching chunk that *starts* on the target line wins; if there
/// is none, the first matching chunk whose `start_line..=end_line` range
/// contains the line is returned. Returns `None` when no chunk qualifies.
#[must_use]
pub fn find_chunk_containing_line(
    chunks: &[CodeChunk],
    file_path: &str,
    target_line_1based: usize,
) -> Option<usize> {
    let same_file = |chunk: &CodeChunk| -> bool {
        chunk.file_path == file_path
            || chunk
                .file_path
                .strip_suffix(file_path)
                .is_some_and(|parent| parent.ends_with('/'))
    };

    // Single pass: an exact start-line hit returns immediately, while the
    // first enclosing chunk is remembered as the fallback.
    let mut first_enclosing = None;
    for (idx, chunk) in chunks.iter().enumerate() {
        if !same_file(chunk) {
            continue;
        }
        if chunk.start_line == target_line_1based {
            return Some(idx);
        }
        let encloses =
            chunk.start_line <= target_line_1based && target_line_1based <= chunk.end_line;
        if encloses && first_enclosing.is_none() {
            first_enclosing = Some(idx);
        }
    }
    first_enclosing
}
/// Builds a manifest with one entry per file found by walking `root`.
///
/// Files whose metadata or contents cannot be read are skipped.
fn build_manifest(root: &Path, walk_options: &WalkOptions) -> Manifest {
    let mut manifest = Manifest::new();
    for path in collect_files_with_options(root, walk_options) {
        let metadata = std::fs::metadata(&path);
        let bytes = std::fs::read(&path);
        if let (Ok(metadata), Ok(bytes)) = (metadata, bytes) {
            manifest.insert(path, FileEntry::from_bytes(&metadata, &bytes));
        }
    }
    manifest
}
/// Builds the two lookup tables over `chunks`.
///
/// Returns `(by_file, by_extension)`: the first maps each `file_path` to the
/// indices of its chunks, the second maps each file extension (when the
/// path has one that is valid UTF-8) to the indices of chunks from such
/// files. Indices appear in ascending order.
fn build_mappings(
    chunks: &[CodeChunk],
) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
    let mut by_file = HashMap::<String, Vec<usize>>::new();
    let mut by_extension = HashMap::<String, Vec<usize>>::new();

    for (idx, chunk) in chunks.iter().enumerate() {
        by_file.entry(chunk.file_path.clone()).or_default().push(idx);

        let extension = Path::new(&chunk.file_path)
            .extension()
            .and_then(std::ffi::OsStr::to_str);
        if let Some(extension) = extension {
            by_extension
                .entry(extension.to_owned())
                .or_default()
                .push(idx);
        }
    }

    (by_file, by_extension)
}
#[cfg(test)]
mod tests {
use super::*;
/// Test fixture: a code chunk at `src/foo.py` (lines 1-10) whose `content`
/// and `enriched_content` are both `content`.
fn py_chunk(content: &str) -> crate::chunk::CodeChunk {
crate::chunk::CodeChunk {
file_path: "src/foo.py".to_string(),
name: "test".to_string(),
kind: "function_definition".to_string(),
content_kind: crate::chunk::ContentKind::Code,
start_line: 1,
end_line: 10,
symbol_line: 1,
content: content.to_string(),
enriched_content: content.to_string(),
qualified_name: None,
}
}
/// A triple-quoted docstring that outweighs the code must tip the chunk
/// over the dominance threshold.
#[test]
fn chunk_is_prose_dominated_python_docstring() {
let c = py_chunk(
"def handle_error(self, exc):\n \"\"\"This is the docstring \
that explains the error-handling contract. It dwarfs the body \
and so the chunk is dominated by prose.\"\"\"\n return None\n",
);
assert!(
chunk_is_prose_dominated(&c),
"Python triple-quoted docstring dominating the chunk must be \
recognised as prose-dominated"
);
}
/// Content with no comment or docstring markers has zero prose bytes.
#[test]
fn chunk_is_prose_dominated_pure_code_is_false() {
let c = py_chunk("def f(x, y):\n z = x * y + 2\n return z * z - (x + y)\n");
assert!(
!chunk_is_prose_dominated(&c),
"Pure code chunk (no docstring, no comments) must not be \
prose-dominated"
);
}
/// `//` line comments count as prose regardless of the fixture's `.py` path:
/// the byte scan looks only at content.
#[test]
fn chunk_is_prose_dominated_line_comments() {
let c = py_chunk(
"// This is a long-form explanation of why the function exists.\n\
// It spans multiple lines and dominates the chunk by byte count.\n\
// The actual code is a tiny one-liner.\n\
fn f() { 1 }\n",
);
assert!(
chunk_is_prose_dominated(&c),
"Chunk dominated by `//` line comments must be prose-dominated"
);
}
/// A `/* ... */` block comment counts as prose, delimiters included.
#[test]
fn chunk_is_prose_dominated_block_comment() {
let c = py_chunk(
"/* JS-doc style block comment describing the function in detail \
and taking up most of the chunk by byte volume. */\nfn g() {}\n",
);
assert!(
chunk_is_prose_dominated(&c),
"Chunk dominated by `/* ... */` block comment must be \
prose-dominated"
);
}
/// Empty content takes the early return and is never prose-dominated.
#[test]
fn chunk_is_prose_dominated_empty_is_false() {
let c = py_chunk("");
assert!(
!chunk_is_prose_dominated(&c),
"Empty chunk content must classify as not-prose (degenerate case)"
);
}
/// Ten prose-dominated chunks out of thirty (~33%) clear the Mixed
/// threshold but not the Docs one.
#[test]
fn corpus_class_classify_10_prose_20_code_is_mixed() {
    let prose = py_chunk(
        "def f():\n \"\"\"A substantial docstring whose byte count \
        dominates the chunk.\"\"\"\n pass\n",
    );
    let code = py_chunk("def g(x):\n return x + 1\n");

    // Sanity-check the fixtures before composing the corpus.
    assert!(chunk_is_prose_dominated(&prose));
    assert!(!chunk_is_prose_dominated(&code));

    let mut chunks = vec![prose; 10];
    chunks.extend(vec![code; 20]);

    let class = CorpusClass::classify(&chunks);
    assert_eq!(
        class,
        CorpusClass::Mixed,
        "10 prose : 20 code (~33% prose chunks) must classify as Mixed; \
        threshold is >= 30% prose"
    );
    assert!(
        class.rerank_eligible(),
        "Mixed must be rerank-eligible — the I#64 / B-0028 fire path"
    );
}
/// Twenty prose-dominated chunks out of thirty (~67%) must be
/// rerank-eligible; the exact class is deliberately not asserted.
#[test]
fn corpus_class_classify_20_prose_10_code_is_rerank_eligible() {
    let prose = py_chunk(
        "def f():\n \"\"\"Substantial docstring that dominates the \
        chunk's bytes — a Mnemosyne-class signature.\"\"\"\n pass\n",
    );
    let code = py_chunk("def g(x):\n return x + 1\n");

    let mut chunks = vec![prose; 20];
    chunks.extend(vec![code; 10]);

    assert!(
        CorpusClass::classify(&chunks).rerank_eligible(),
        "20 prose : 10 code (~67% prose chunks) must be rerank-eligible"
    );
}
/// Two prose-dominated chunks out of thirty (~7%) stay below the Mixed
/// threshold.
#[test]
fn corpus_class_classify_low_prose_is_code() {
    let prose = py_chunk("def f():\n \"\"\"Docstring dominating the chunk's bytes.\"\"\"\n");
    let code = py_chunk("def g(x):\n return x + 1\n");

    let mut chunks = vec![prose; 2];
    chunks.extend(vec![code; 28]);

    assert_eq!(
        CorpusClass::classify(&chunks),
        CorpusClass::Code,
        "2:28 prose:code (~7% prose chunks) must classify as Code"
    );
}
/// The path signal alone is enough: a `.md` chunk with code-like content
/// still counts as prose, so a corpus of just that chunk is Docs.
#[test]
fn corpus_class_classify_path_and_content_signals_compose() {
let md_chunk = crate::chunk::CodeChunk {
file_path: "README.md".to_string(),
name: "readme".to_string(),
kind: "paragraph".to_string(),
content_kind: crate::chunk::ContentKind::Docs,
start_line: 1,
end_line: 5,
symbol_line: 1,
content: "function foo() { return 1; }".to_string(), enriched_content: "function foo() { return 1; }".to_string(),
qualified_name: None,
};
let chunks = vec![md_chunk];
assert_eq!(
CorpusClass::classify(&chunks),
CorpusClass::Docs,
"A .md path counts as prose under is_prose_path even if its \
content is code-like — path and content signals OR together"
);
}
/// Test-only constructor that assembles a `RipvecIndex` from explicit parts.
///
/// The BM25 index and corpus class are derived from `chunks`; the mappings
/// and manifest are taken as given (so a test can make them disagree with
/// the chunks on purpose). PageRank is disabled.
// Many arguments are accepted deliberately: this mirrors the struct fields.
#[allow(clippy::too_many_arguments)]
fn new_for_test(
chunks: Vec<crate::chunk::CodeChunk>,
embeddings: ndarray::Array2<f32>,
encoder: std::sync::Arc<StaticEncoder>,
file_mapping: HashMap<String, Vec<usize>>,
language_mapping: HashMap<String, Vec<usize>>,
manifest: Manifest,
root: std::path::PathBuf,
walk_options: WalkOptions,
) -> RipvecIndex {
let bm25 = Bm25Index::build(&chunks);
let corpus_class = CorpusClass::classify(&chunks);
RipvecIndex {
chunks,
embeddings,
bm25,
encoder,
file_mapping,
language_mapping,
pagerank_lookup: None,
pagerank_alpha: 0.0,
corpus_class,
root,
walk_options,
manifest,
}
}
/// Compile-time check only: pins the argument list of `RipvecIndex::search`.
/// Nothing is executed; the helper is merely referenced.
#[test]
fn semble_index_search_signature_compiles() {
fn shape_check(
idx: &RipvecIndex,
query: &str,
top_k: usize,
mode: SearchMode,
) -> Vec<(usize, f32)> {
idx.search(query, top_k, mode, None, None, None)
}
let _ = shape_check;
}
/// Placeholder: this test makes no assertions.
/// NOTE(review): the no-op behaviour of `apply_pagerank_layer` without a
/// lookup is not actually exercised here — consider a real test.
#[test]
fn pagerank_layer_no_op_when_graph_absent() {
let _ = "see apply_pagerank_layer docs";
}
/// Regression test: a file reported as *new* by the diff (because the
/// manifest is empty) but already present in `file_mapping` must have its
/// old chunks dropped before re-embedding, so its chunks are not duplicated.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_idempotent_when_new_file_already_has_chunks() {
use crate::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
use crate::profile::Profiler;
use std::fs;
let encoder = StaticEncoder::from_pretrained(DEFAULT_MODEL_REPO).expect("encoder load");
let encoder_arc = std::sync::Arc::new(encoder);
let tmp = tempfile::TempDir::new().unwrap();
let file_a = tmp.path().join("file_a.rs");
fs::write(
&file_a,
"pub fn alpha() -> u32 { 1 }\npub fn beta() -> u32 { 2 }\n",
)
.unwrap();
// Embed the file once to obtain realistic chunks and rows for the index.
let (real_chunks, real_embs) = encoder_arc
.embed_paths(tmp.path(), std::slice::from_ref(&file_a), &Profiler::noop())
.expect("embed_paths");
let n_real = real_chunks.len();
assert!(n_real > 0, "file_a.rs must produce at least one chunk");
let hidden_dim = real_embs[0].len();
let mut flat: Vec<f32> = Vec::with_capacity(n_real * hidden_dim);
for row in &real_embs {
flat.extend(row);
}
let embeddings = ndarray::Array2::from_shape_vec((n_real, hidden_dim), flat).unwrap();
// The mapping knows the file under its root-relative key...
let rel_key = "file_a.rs".to_string();
let indices: Vec<usize> = (0..n_real).collect();
let file_mapping = HashMap::from([(rel_key, indices)]);
// ...while the empty manifest makes the diff report it as new.
let manifest = Manifest::new();
let index = new_for_test(
real_chunks,
embeddings,
std::sync::Arc::clone(&encoder_arc),
file_mapping,
HashMap::new(),
manifest,
tmp.path().to_path_buf(),
WalkOptions::default(),
);
let diff = index.diff_against_filesystem();
assert!(
diff.new.iter().any(|p| p.ends_with("file_a.rs")),
"file_a.rs must appear in diff.new when manifest is empty; got {:?}",
diff.new
);
assert!(diff.dirty.is_empty(), "no dirty expected");
assert!(diff.deleted.is_empty(), "no deleted expected");
let updated = index
.apply_diff(&diff, &Profiler::noop())
.expect("apply_diff");
let file_a_count = updated
.chunks()
.iter()
.filter(|c| c.file_path.ends_with("file_a.rs"))
.count();
assert_eq!(
file_a_count, n_real,
"file_a.rs chunk count must equal one fresh-embed pass ({n_real}); \
got {file_a_count} — stale chunks from file_mapping not cleared"
);
assert_eq!(
updated.embeddings().nrows(),
updated.chunks().len(),
"embeddings row count must match chunk count"
);
}
/// Applying an empty diff twice to a freshly built index must leave the
/// chunk count unchanged each time.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_no_duplicate_chunks_after_two_passes() {
use crate::embed::SearchConfig;
use crate::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
use crate::profile::Profiler;
use std::fs;
let tmp = tempfile::TempDir::new().unwrap();
fs::write(
tmp.path().join("main.rs"),
"fn main() { println!(\"hello\"); }\n",
)
.unwrap();
let encoder = StaticEncoder::from_pretrained(DEFAULT_MODEL_REPO).expect("encoder load");
let cfg = SearchConfig {
batch_size: 32,
max_tokens: 512,
chunk: crate::chunk::ChunkConfig {
max_chunk_bytes: 4096,
window_size: 2048,
window_overlap: 512,
},
text_mode: false,
cascade_dim: None,
file_type: None,
exclude_extensions: Vec::new(),
include_extensions: Vec::new(),
ignore_patterns: Vec::new(),
corpus: crate::embed::Scope::All,
mode: crate::hybrid::SearchMode::Hybrid,
};
let index = RipvecIndex::from_root(tmp.path(), encoder, &cfg, &Profiler::noop(), None, 0.0)
.expect("from_root");
let original_count = index.chunks().len();
// Pass 1: nothing changed on disk since the index was built.
let diff1 = index.diff_against_filesystem();
assert!(diff1.is_empty(), "fresh index must yield empty diff");
let pass1 = index
.apply_diff(&diff1, &Profiler::noop())
.expect("apply_diff pass 1");
assert_eq!(
pass1.chunks().len(),
original_count,
"chunk count must be unchanged after empty-diff pass 1"
);
// Pass 2: the index produced by pass 1 must be just as stable.
let diff2 = pass1.diff_against_filesystem();
assert!(
diff2.is_empty(),
"pass1 against unchanged FS must yield empty diff"
);
let pass2 = pass1
.apply_diff(&diff2, &Profiler::noop())
.expect("apply_diff pass 2");
assert_eq!(
pass2.chunks().len(),
original_count,
"chunk count must be unchanged after empty-diff pass 2"
);
}
}