use crate::content_hash::compute_blob_sha;
use crate::db::traits::StoreChunks;
use crate::db::traits::StoreEmbeddings;
use crate::db::{ChunkRecord, SqliteStore};
use crate::metrics::CacheMetrics;
use anyhow::{Context, Result};
use tracing::{debug, info};
/// Returns whether an embedding for `blob_sha` is already present in the store.
///
/// Thin wrapper over `SqliteStore::has_embedding` that attaches error context
/// for callers.
pub async fn check_embedding_exists(store: &SqliteStore, blob_sha: &str) -> Result<bool> {
    let lookup = store.has_embedding(blob_sha).await;
    lookup.context("Failed to check embedding existence")
}
/// Inserts a single chunk row, recording an embedding-cache hit or miss.
///
/// Derives the content's blob SHA, probes the store for an existing embedding
/// (hit → `metrics.record_hit()`, miss → `metrics.record_miss()`), then
/// inserts the `ChunkRecord` and returns its row id.
///
/// # Errors
///
/// Fails if the embedding-cache probe or the chunk insert fails.
#[allow(clippy::too_many_arguments)] pub async fn upsert_chunk_with_cache(
    store: &SqliteStore,
    file_id: i64,
    content: &str,
    symbol_name: Option<&str>,
    kind: &str,
    signature: Option<&str>,
    docstring: Option<&str>,
    start_line: i32,
    end_line: i32,
    preview: &str,
    ts_doc_text: &str,
    recency_score: f32,
    churn_score: f32,
    metadata: Option<&serde_json::Value>,
    worktree_id: i64,
    metrics: &CacheMetrics,
) -> Result<i64> {
    let blob_sha = compute_blob_sha(content);

    // Probe the cache purely for metrics/logging; the insert below happens
    // either way (the embedding itself is produced elsewhere).
    let cached = check_embedding_exists(store, &blob_sha)
        .await
        .context("Failed to check embedding cache")?;
    if cached {
        metrics.record_hit();
        debug!(
            blob_sha = %blob_sha,
            symbol = ?symbol_name,
            worktree_id = worktree_id,
            "Cache hit: reusing existing embedding"
        );
    } else {
        metrics.record_miss();
        debug!(
            blob_sha = %blob_sha,
            symbol = ?symbol_name,
            worktree_id = worktree_id,
            "Cache miss: new embedding needed"
        );
    }

    let record = ChunkRecord {
        file_id,
        blob_sha,
        symbol_name: symbol_name.map(str::to_string),
        kind: kind.to_string(),
        signature: signature.map(str::to_string),
        docstring: docstring.map(str::to_string),
        start_line,
        end_line,
        preview: preview.to_string(),
        ts_doc_text: ts_doc_text.to_string(),
        recency_score,
        churn_score,
        metadata: metadata.cloned(),
        worktree_id,
    };

    store
        .insert_chunk(&record)
        .await
        .context("Failed to insert chunk")
}
/// Batch-inserts chunks, recording one embedding-cache hit/miss per chunk.
///
/// Each tuple describes one chunk:
/// `(file_id, content, symbol_name, kind, signature, docstring, start_line,
///   end_line, preview, ts_doc_text, recency_score, churn_score, metadata)`.
/// `content` is used only to derive the blob SHA; the stored record keeps the
/// preview/doc text instead.
///
/// Returns the inserted chunk row ids, in input order.
///
/// # Errors
///
/// Fails if any embedding-cache probe or the batch insert fails. Note that
/// metrics recorded before a failing probe are not rolled back.
#[allow(clippy::type_complexity)] pub async fn upsert_chunks_batch_with_cache(
    store: &SqliteStore,
    chunks: &[(
        i64, // file_id
        String, // content (for blob_sha)
        Option<String>, String, Option<String>, Option<String>, i32, i32, String, String, f32, f32, Option<serde_json::Value>, )],
    worktree_id: i64,
    metrics: &CacheMetrics,
) -> Result<Vec<i64>> {
    if chunks.is_empty() {
        return Ok(Vec::new());
    }
    let blob_shas: Vec<String> = chunks
        .iter()
        .map(|(_, content, ..)| compute_blob_sha(content))
        .collect();
    // Single pass: probe the store once per SHA and record the hit/miss metric
    // at the same time (previously blob_shas was walked twice — once to build
    // the set, once more just to count hits/misses).
    let mut existing_blob_shas = std::collections::HashSet::with_capacity(blob_shas.len());
    for blob_sha in &blob_shas {
        let exists = store
            .has_embedding(blob_sha)
            .await
            .context("Failed to check embedding existence")?;
        if exists {
            metrics.record_hit();
            existing_blob_shas.insert(blob_sha.clone());
        } else {
            metrics.record_miss();
        }
    }
    // cache_hits counts distinct existing SHAs; with duplicate content in the
    // batch this can differ from the per-chunk hit count recorded above.
    debug!(
        total_chunks = chunks.len(),
        cache_hits = existing_blob_shas.len(),
        cache_misses = blob_shas.len() - existing_blob_shas.len(),
        "Batch cache check complete"
    );
    // Pair every input tuple with its precomputed SHA and expand into records.
    let chunk_records: Vec<ChunkRecord> = chunks
        .iter()
        .zip(blob_shas.iter())
        .map(
            |(
                (
                    file_id,
                    _content,
                    symbol_name,
                    kind,
                    signature,
                    docstring,
                    start_line,
                    end_line,
                    preview,
                    ts_doc_text,
                    recency_score,
                    churn_score,
                    metadata,
                ),
                blob_sha,
            )| ChunkRecord {
                file_id: *file_id,
                blob_sha: blob_sha.clone(),
                symbol_name: symbol_name.clone(),
                kind: kind.clone(),
                signature: signature.clone(),
                docstring: docstring.clone(),
                start_line: *start_line,
                end_line: *end_line,
                preview: preview.clone(),
                ts_doc_text: ts_doc_text.clone(),
                recency_score: *recency_score,
                churn_score: *churn_score,
                metadata: metadata.clone(),
                worktree_id,
            },
        )
        .collect();
    let chunk_ids = store
        .insert_chunks_batch(&chunk_records)
        .await
        .context("Failed to batch insert chunks")?;
    Ok(chunk_ids)
}
/// A chunk of source text produced by parsing, prior to persistence.
#[derive(Debug, Clone)]
pub struct ParsedChunk {
    /// File path; presumably relative to the worktree root — confirm with caller.
    pub relpath: String,
    /// Name of the symbol this chunk covers, if it maps to one.
    pub symbol_name: Option<String>,
    /// Raw source text of the chunk; hashed to derive the blob SHA.
    pub content: String,
    /// First line of the chunk in the file. NOTE(review): assumed 1-based — confirm with parser.
    pub start_line: i32,
    /// Last line of the chunk in the file (same basis as `start_line`).
    pub end_line: i32,
    /// Chunk category, e.g. "function" or "module" (see tests below).
    pub kind: String,
}
/// Intended to insert a `ParsedChunk` scoped to a worktree.
///
/// # Errors
///
/// Always fails: the SQLite path for this entry point is not implemented.
/// Use `insert_chunk` with a `ChunkRecord` (which carries a `file_id`)
/// instead.
pub async fn upsert_chunk_with_worktree(
    store: &SqliteStore,
    chunk: &ParsedChunk,
    worktree_id: i64,
    metrics: &CacheMetrics,
) -> Result<i64> {
    // Fix: bail before doing any work. The previous implementation hashed the
    // content, queried the embedding cache, and recorded a hit/miss on
    // `metrics` before unconditionally failing — polluting cache statistics
    // with entries for chunks that were never inserted.
    let _ = (store, chunk, worktree_id, metrics); // silence unused-parameter lints
    anyhow::bail!(
        "upsert_chunk_with_worktree is not yet fully implemented for SQLite. \
        Use insert_chunk with a ChunkRecord that includes file_id instead."
    );
}
/// Logs an `info`-level summary of the indexing run's cache metrics.
///
/// `total_chunks` is the number of chunks processed in this run; the counters
/// and cost come from `metrics`.
pub fn log_cache_metrics(metrics: &CacheMetrics, total_chunks: usize) {
    let hits = metrics.hits();
    let misses = metrics.misses();
    let hit_rate = metrics.hit_rate() * 100.0;
    let cost = metrics.estimated_cost_usd();
    // Fix: the miss percentage was computed over `total_chunks`, while the hit
    // percentage comes from `hit_rate()` (hits / (hits + misses), see the unit
    // test asserting 8 hits + 2 misses => 0.8). Use the same denominator so
    // the two percentages are consistent and sum to 100%.
    let checked = hits + misses;
    let miss_rate = if checked > 0 {
        (misses as f64 / checked as f64) * 100.0
    } else {
        0.0
    };
    info!("Indexing complete:");
    info!("  - Chunks processed: {}", total_chunks);
    info!("  - Cache hits: {} ({:.1}%)", hits, hit_rate);
    info!("  - Cache misses: {} ({:.1}%)", misses, miss_rate);
    // A miss is exactly one newly generated embedding.
    info!("  - Embeddings generated: {}", misses);
    info!("  - Estimated cost: ${:.4}", cost);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing is deterministic for equal inputs and distinguishes different ones.
    #[test]
    fn test_compute_blob_sha_consistency() {
        let sha_a = compute_blob_sha("function foo() { return 1; }");
        let sha_b = compute_blob_sha("function foo() { return 1; }");
        let sha_c = compute_blob_sha("function bar() { return 2; }");

        assert_eq!(sha_a, sha_b, "identical content must hash identically");
        assert_ne!(sha_a, sha_c, "different content must hash differently");

        // A SHA-256 digest rendered as hex is 64 hex characters.
        assert_eq!(sha_a.len(), 64);
        assert!(sha_a.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Counters, hit rate, embeddings-generated, and cost all track recordings.
    #[test]
    fn test_metrics_tracking() {
        let metrics = CacheMetrics::new();

        // Freshly constructed: everything is zero.
        assert_eq!(metrics.hits(), 0);
        assert_eq!(metrics.misses(), 0);
        assert_eq!(metrics.hit_rate(), 0.0);

        // 8 hits + 2 misses => 80% hit rate.
        (0..8).for_each(|_| metrics.record_hit());
        (0..2).for_each(|_| metrics.record_miss());

        assert_eq!(metrics.hits(), 8);
        assert_eq!(metrics.misses(), 2);
        assert_eq!(metrics.hit_rate(), 0.8);
        assert_eq!(metrics.embeddings_generated(), 2);

        // 2 embeddings at the expected per-embedding price.
        assert!((metrics.estimated_cost_usd() - 0.00004).abs() < 0.000001);
    }

    /// Struct literal construction preserves every field value.
    #[test]
    fn test_parsed_chunk_creation() {
        let chunk = ParsedChunk {
            relpath: String::from("src/main.rs"),
            symbol_name: Some(String::from("main")),
            content: String::from("fn main() {}"),
            start_line: 1,
            end_line: 1,
            kind: String::from("function"),
        };

        assert_eq!(chunk.relpath, "src/main.rs");
        assert_eq!(chunk.symbol_name, Some("main".to_string()));
        assert_eq!(chunk.content, "fn main() {}");
        assert_eq!(chunk.start_line, 1);
        assert_eq!(chunk.end_line, 1);
        assert_eq!(chunk.kind, "function");
    }

    /// `Clone` yields a deep, field-for-field copy.
    #[test]
    fn test_parsed_chunk_clone() {
        let original = ParsedChunk {
            relpath: "src/lib.rs".into(),
            symbol_name: None,
            content: "mod test;".into(),
            start_line: 5,
            end_line: 5,
            kind: "module".into(),
        };
        let copy = original.clone();

        assert_eq!(copy.relpath, original.relpath);
        assert_eq!(copy.symbol_name, original.symbol_name);
        assert_eq!(copy.content, original.content);
        assert_eq!(copy.start_line, original.start_line);
        assert_eq!(copy.end_line, original.end_line);
        assert_eq!(copy.kind, original.kind);
    }

    /// The derived `Debug` output names the type and includes field values.
    #[test]
    fn test_parsed_chunk_debug() {
        let chunk = ParsedChunk {
            relpath: "test.rs".into(),
            symbol_name: Some("test_fn".into()),
            content: "test content".into(),
            start_line: 10,
            end_line: 20,
            kind: "function".into(),
        };
        let rendered = format!("{:?}", chunk);

        for needle in &["ParsedChunk", "test.rs", "test_fn"] {
            assert!(rendered.contains(needle));
        }
    }
}