use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use lru::LruCache;
use tokio::sync::RwLock;
use crate::core::bm25::Bm25Index;
use crate::core::chunker::RawChunk;
use crate::core::embed::Embedder;
use crate::core::entity::RawEntity;
use crate::core::store::VectorStore;
use crate::core::symbol_graph::SymbolGraph;
pub(crate) mod archive;
pub(crate) mod docs_penalty;
mod files;
pub(crate) mod helpers;
mod ingest;
pub(crate) mod migrations;
mod persist;
mod persist_hnsw;
mod search;
mod types;
pub(crate) use ingest::PROGRESS_CHUNK_INTERVAL;
#[cfg(test)]
pub(crate) use search::KG_REFINE_THRESHOLD;
#[cfg(test)]
mod tests;
#[cfg(test)]
mod tests_cursor;
pub(crate) use helpers::{
build_compact_snippet, definition_boost_query_tokens, embed_batch_size, embedding_cache_cap,
file_type_score_multiplier, hash_query, idle_evict_secs, is_function_definition_chunk_type,
is_struct_definition_chunk_type, max_chunks_per_index, populate_virtual_terms,
raw_to_code_chunk, STRUCT_DEFINITION_BOOST,
};
#[cfg(test)]
pub(crate) use helpers::{
compute_match_reason, resolve_chunk_file, DEFAULT_CHUNKS_IDLE_EVICT_SECS,
};
pub(crate) use types::ChunkSnapshot;
pub use types::{CodeChunk, CommitTimings, ParsedBatch, SearchMode, SearchQuery, SearchStage};
/// Number of entries held by each indexer's query LRU cache
/// (`CodeIndexer::query_cache`); must be non-zero because it is converted to a
/// `NonZeroUsize` in `CodeIndexer::new`.
const QUERY_CACHE_CAPACITY: usize = 256;
/// NOTE(review): presumably the over-fetch multiplier applied to HNSW candidate
/// retrieval; the use site is outside this part of the file — confirm.
pub(crate) const HNSW_OVERSAMPLE: usize = 4;
/// NOTE(review): presumably a score multiplier applied to results reached via
/// knowledge-graph expansion — confirm against the search module. No use is
/// visible in this part of the file; the `allow` below silences the
/// unused-item warning — confirm it is still needed.
#[allow(dead_code)]
pub(crate) const KG_EXPAND_SCORE_FACTOR: f32 = 0.7;
/// NOTE(review): presumably the number of graph hops followed during
/// knowledge-graph expansion — confirm against the search module.
pub(crate) const KG_EXPAND_HOPS: usize = 1;
/// NOTE(review): presumably the number of batches between HNSW snapshots
/// (likely compared against `PersistState::batch_counter`, which is also a
/// `u32`) — confirm against the persist modules.
pub(crate) const HNSW_SNAPSHOT_BATCH_INTERVAL: u32 = 16;
/// State of a single code index: optional embedding/vector-store/corpus
/// backends plus the in-memory chunk map, caches, and bookkeeping used for
/// idle eviction.
///
/// Most shared state is wrapped in `Arc` so it can be handed to other tasks;
/// `chunks`, `entities`, `chunk_embeddings`, `bm25` and `symbol_graph` sit
/// behind tokio `RwLock`s, while `query_cache` uses a std `Mutex`.
pub struct CodeIndexer {
/// Identifier of this index; included in the log messages emitted here.
pub index_id: String,
/// Path supplied at construction.
/// NOTE(review): presumably the root directory of the indexed tree — confirm.
pub root_path: std::path::PathBuf,
/// Embedding backend; `None` until `with_components` is called.
pub(super) embedder: Option<Arc<dyn Embedder>>,
/// Vector store backend; `None` until `with_components` is called.
pub(super) store: Option<Arc<dyn VectorStore>>,
/// In-memory chunks keyed by chunk id. Cleared by `evict_chunks_if_idle`
/// and refilled from the corpus store by `ensure_chunks_loaded`.
pub(super) chunks: Arc<RwLock<HashMap<String, RawChunk>>>,
/// Optional corpus store. Idle eviction only happens when this is set,
/// because it is the source used to reload evicted chunks.
pub(super) corpus: Option<Arc<crate::core::corpus::CorpusStore>>,
/// Extracted entities grouped under a string key.
/// NOTE(review): key semantics (chunk id vs. file path) are not visible
/// here — confirm.
pub(super) entities: Arc<RwLock<HashMap<String, Vec<RawEntity>>>>,
/// LRU cache of embedding vectors keyed by string, sized by
/// `embedding_cache_cap()`.
pub(super) chunk_embeddings: Arc<RwLock<LruCache<String, Vec<f32>>>>,
/// BM25 index, starts empty.
pub(super) bm25: Arc<RwLock<Bm25Index>>,
/// LRU cache of vectors keyed by a `u64`, sized by `QUERY_CACHE_CAPACITY`.
/// NOTE(review): the key is presumably the `hash_query` digest — confirm.
pub(super) query_cache: Arc<Mutex<LruCache<u64, Vec<f32>>>>,
/// Current symbol graph; `snapshot_symbol_graph` clones the inner `Arc`.
pub(super) symbol_graph: Arc<RwLock<Arc<SymbolGraph>>>,
/// NER extractor obtained from `NerExtractor::try_load()` at construction.
pub(super) ner: crate::core::ner::NerExtractor,
/// Shared persistence bookkeeping (see `PersistState`).
pub(super) persist_state: Arc<PersistState>,
/// Domain terms supplied via `with_domain_terms` / `set_domain_terms`;
/// empty by default.
pub(super) domain_terms: Vec<String>,
/// Construction time; the reference point for `last_activity_ms`.
pub(super) created_at: Instant,
/// Milliseconds since `created_at` at the most recent `touch_activity`
/// call (0 until the first call).
pub(super) last_activity_ms: Arc<AtomicU64>,
/// Set to `true` once chunks were evicted and reset to `false` after a
/// successful reload (or when no corpus store is attached).
pub(super) chunks_evicted: Arc<AtomicBool>,
/// Initialised to `false` by `new`.
/// NOTE(review): presumably set by the caller when opening the corpus
/// store failed — confirm.
pub corpus_open_failed: bool,
}
/// Lock-free persistence bookkeeping shared through `CodeIndexer::persist_state`.
///
/// All fields start at their `Default` values (`false` / `0`). Their readers
/// and writers live outside this part of the file, so the notes below are
/// assumptions to be confirmed.
#[derive(Debug, Default)]
pub(crate) struct PersistState {
/// NOTE(review): presumably `true` while a persist operation is running —
/// confirm against the persist modules.
pub(crate) in_flight: AtomicBool,
/// NOTE(review): presumably `true` when there are changes that have not
/// been persisted yet — confirm.
pub(crate) dirty: AtomicBool,
/// NOTE(review): presumably counts committed batches, likely compared with
/// `HNSW_SNAPSHOT_BATCH_INTERVAL` — confirm.
pub(crate) batch_counter: AtomicU32,
}
impl CodeIndexer {
/// Creates an indexer for `index_id` rooted at `root_path` with no embedder,
/// vector store, or corpus store attached.
///
/// All in-memory structures start empty, the activity clock starts at the
/// moment of construction, and the NER extractor is obtained via
/// `NerExtractor::try_load()`.
///
/// # Panics
///
/// Panics if `QUERY_CACHE_CAPACITY` or the value returned by
/// `embedding_cache_cap()` is zero.
pub fn new(index_id: impl Into<String>, root_path: impl Into<std::path::PathBuf>) -> Self {
    let query_cache_cap =
        NonZeroUsize::new(QUERY_CACHE_CAPACITY).expect("QUERY_CACHE_CAPACITY must be non-zero");
    let embedding_cap = NonZeroUsize::new(embedding_cache_cap())
        .expect("embedding_cache_cap must be non-zero (env var filtered)");

    // Build both LRU caches up front so the struct literal below stays flat.
    let chunk_embeddings = Arc::new(RwLock::new(LruCache::new(embedding_cap)));
    let query_cache = Arc::new(Mutex::new(LruCache::new(query_cache_cap)));

    Self {
        index_id: index_id.into(),
        root_path: root_path.into(),
        embedder: None,
        store: None,
        corpus: None,
        chunks: Arc::new(RwLock::new(HashMap::new())),
        entities: Arc::new(RwLock::new(HashMap::new())),
        chunk_embeddings,
        bm25: Arc::new(RwLock::new(Bm25Index::new())),
        query_cache,
        symbol_graph: Arc::new(RwLock::new(Arc::new(SymbolGraph::new()))),
        ner: crate::core::ner::NerExtractor::try_load(),
        persist_state: Arc::new(PersistState::default()),
        domain_terms: Vec::new(),
        created_at: Instant::now(),
        last_activity_ms: Arc::new(AtomicU64::new(0)),
        chunks_evicted: Arc::new(AtomicBool::new(false)),
        corpus_open_failed: false,
    }
}
/// Records "now" as the most recent activity, expressed in milliseconds since
/// `created_at` (saturating at `u64::MAX`).
pub(super) fn touch_activity(&self) {
    let elapsed_ms = self.created_at.elapsed().as_millis();
    // Clamp before narrowing so the cast below can never truncate.
    let clamped_ms = elapsed_ms.min(u128::from(u64::MAX)) as u64;
    self.last_activity_ms.store(clamped_ms, Ordering::Relaxed);
}
/// Returns how long it has been since the last `touch_activity` call (or since
/// construction if it was never called).
///
/// The subtraction saturates, so the result is never negative.
fn idle_duration(&self) -> std::time::Duration {
    let elapsed_ms = self.created_at.elapsed().as_millis();
    // Clamp before narrowing so the cast below can never truncate.
    let now_ms = elapsed_ms.min(u128::from(u64::MAX)) as u64;
    let last_seen_ms = self.last_activity_ms.load(Ordering::Relaxed);
    let idle_ms = now_ms.saturating_sub(last_seen_ms);
    std::time::Duration::from_millis(idle_ms)
}
/// Returns the number of chunks currently held in the in-memory map.
///
/// This does not trigger a reload, so it reports 0 after an idle eviction.
pub async fn in_memory_chunk_count(&self) -> usize {
    let resident = self.chunks.read().await;
    resident.len()
}
/// Drops every in-memory chunk when the index has been idle for at least
/// `idle_threshold`, returning how many chunks were released.
///
/// Nothing is evicted (and 0 is returned) when:
/// * `idle_threshold` is zero,
/// * no corpus store is attached (there would be nothing to reload from),
/// * the index was active more recently than `idle_threshold`, or
/// * the chunk map is already empty.
///
/// On eviction the map's allocation is released as well and `chunks_evicted`
/// is raised so that `ensure_chunks_loaded` reloads the chunks later. The
/// logged number of seconds is the threshold, not the measured idle time.
pub async fn evict_chunks_if_idle(&self, idle_threshold: std::time::Duration) -> usize {
    if idle_threshold.is_zero()
        || self.corpus.is_none()
        || self.idle_duration() < idle_threshold
    {
        return 0;
    }

    let evicted = {
        let mut chunks = self.chunks.write().await;
        if chunks.is_empty() {
            return 0;
        }
        let count = chunks.len();
        chunks.clear();
        chunks.shrink_to_fit();
        // Publish the flag while the write lock is still held: anyone who
        // acquires the lock after us must never observe "empty map, not
        // evicted". Raising it after the guard is dropped left exactly that
        // window open.
        self.chunks_evicted.store(true, Ordering::Relaxed);
        count
    };

    tracing::info!(
        "index '{}': evicted {} in-memory chunks after {}s idle \
         (durable corpus retained; lazily rehydrates on next access)",
        self.index_id,
        evicted,
        idle_threshold.as_secs(),
    );
    evicted
}
/// Reloads the in-memory chunk map from the corpus store if a previous idle
/// eviction emptied it; otherwise returns immediately.
///
/// * When no corpus store is attached the eviction flag is simply cleared.
/// * The load itself runs on the blocking thread pool via `spawn_blocking`.
/// * On success every loaded chunk is inserted under its `id` (replacing any
///   entry with the same key) and the eviction flag is cleared.
/// * On failure a warning is logged and the flag stays set, so the next call
///   tries again.
pub(super) async fn ensure_chunks_loaded(&self) {
    if !self.chunks_evicted.load(Ordering::Relaxed) {
        return;
    }
    let Some(corpus) = self.corpus.clone() else {
        self.chunks_evicted.store(false, Ordering::Relaxed);
        return;
    };
    let index_id = self.index_id.clone();

    match tokio::task::spawn_blocking(move || corpus.load_all_chunks()).await {
        Ok(Ok(chunks)) => {
            let n = chunks.len();
            {
                let mut map = self.chunks.write().await;
                map.extend(chunks.into_iter().map(|chunk| (chunk.id.clone(), chunk)));
                // Clear the flag before releasing the write lock. Clearing it
                // afterwards allowed an eviction to slip in between, set the
                // flag, and then have it overwritten with `false` here —
                // leaving an empty map that is never reloaded.
                self.chunks_evicted.store(false, Ordering::Relaxed);
            }
            tracing::info!(
                "index '{index_id}': rehydrated {n} chunks from redb after idle eviction"
            );
        }
        Ok(Err(e)) => tracing::warn!(
            "index '{index_id}': failed to rehydrate chunks from redb ({e}); \
             will retry on next access"
        ),
        Err(e) => tracing::warn!(
            "index '{index_id}': chunk rehydration task panicked ({e}); \
             will retry on next access"
        ),
    }
}
/// Builder-style variant of `set_domain_terms`: replaces the domain terms and
/// returns the indexer.
pub fn with_domain_terms(mut self, terms: Vec<String>) -> Self {
    self.set_domain_terms(terms);
    self
}
/// Replaces the stored domain terms with `terms`; the previous list is dropped.
pub fn set_domain_terms(&mut self, terms: Vec<String>) {
    drop(std::mem::replace(&mut self.domain_terms, terms));
}
/// Returns a shared handle to the symbol graph that is current at the time of
/// the call.
///
/// Only the `Arc` is cloned, and the read lock is released on return.
pub async fn snapshot_symbol_graph(&self) -> Arc<SymbolGraph> {
    let guard = self.symbol_graph.read().await;
    let current: &Arc<SymbolGraph> = &guard;
    Arc::clone(current)
}
/// Returns another handle to the attached corpus store, or `None` when no
/// store is attached.
pub fn corpus_store(&self) -> Option<Arc<crate::core::corpus::CorpusStore>> {
    // Cloning an `Option<Arc<_>>` only bumps the reference count.
    self.corpus.clone()
}
/// Builder-style setter that attaches both the embedder and the vector store,
/// replacing any previously attached ones, and returns the indexer.
pub fn with_components(
    self,
    embedder: Arc<dyn Embedder>,
    store: Arc<dyn VectorStore>,
) -> Self {
    let mut configured = self;
    configured.embedder = Some(embedder);
    configured.store = Some(store);
    configured
}
/// Attaches `corpus` as the corpus store, dropping this indexer's handle to
/// any previously attached store.
pub fn set_corpus_store(&mut self, corpus: Arc<crate::core::corpus::CorpusStore>) {
    let _previous = self.corpus.replace(corpus);
}
/// Attaches `corpus` as the corpus store and hands back the store that was
/// attached before, if any.
pub fn swap_corpus_store(
    &mut self,
    corpus: Arc<crate::core::corpus::CorpusStore>,
) -> Option<Arc<crate::core::corpus::CorpusStore>> {
    std::mem::replace(&mut self.corpus, Some(corpus))
}
/// Detaches and returns the corpus store, leaving the indexer without one.
///
/// Returns `None` when no store was attached.
pub fn take_corpus_store(&mut self) -> Option<Arc<crate::core::corpus::CorpusStore>> {
    std::mem::take(&mut self.corpus)
}
pub fn has_corpus_store(&self) -> bool {
self.corpus.is_some()
}
pub fn has_embedder(&self) -> bool {
self.embedder.is_some()
}
}