use anyhow::Result;
use sha2::{Digest, Sha256};
use std::path::Path;
/// Options accepted by `RetrievalClient::sync_project`.
///
/// `Default` yields no language list and both flags disabled.
#[derive(Debug, Clone, Default)]
pub struct SyncOpts {
    /// Optional list of language names.
    ///
    /// NOTE(review): presumably a language filter, but nothing in this file
    /// reads the field — confirm where (or whether) it is applied.
    pub languages: Option<Vec<String>>,

    /// When `true`, every locally discovered chunk is upserted again, not just
    /// the ones the drift diff selects.
    pub force_reindex: bool,

    /// When `true`, `sync_project` calls `write_index_state` for the project
    /// root once the sync has finished; a failure there is only logged.
    pub record_index_state: bool,
}
/// Summary of one `sync_project` run.
#[derive(Debug, Default)]
pub struct SyncReport {
    /// Number of chunks that were embedded and upserted.
    pub added: usize,

    /// Reserved counter; `sync_project` currently always reports `0` here.
    pub updated: usize,

    /// Number of chunk ids removed from the store.
    pub deleted: usize,

    /// Wall-clock duration of the sync in milliseconds.
    pub elapsed_ms: u128,
}
impl std::fmt::Display for SyncReport {
    /// Renders the report as one line of space-separated `key=value` pairs,
    /// in the order `added`, `updated`, `deleted`, `elapsed_ms`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructuring keeps this in lock-step with the struct: adding a
        // field to `SyncReport` becomes a compile error here.
        let Self {
            added,
            updated,
            deleted,
            elapsed_ms,
        } = self;
        write!(
            f,
            "added={} updated={} deleted={} elapsed_ms={}",
            added, updated, deleted, elapsed_ms
        )
    }
}
/// Returns the SHA-256 digest of `text`'s UTF-8 bytes as a lowercase
/// hexadecimal string.
pub fn content_hash(text: &str) -> String {
    // One-shot form of new/update/finalize from the `Digest` trait.
    let digest = Sha256::digest(text.as_bytes());
    format!("{:x}", digest)
}
impl crate::retrieval::client::RetrievalClient {
    /// Brings the `code_chunks` collection for `project_id` in line with the
    /// source files found under `root`.
    ///
    /// The sync proceeds in these steps:
    /// 1. Ensure the collection exists.
    /// 2. Walk `root`, split every file whose extension maps to a known
    ///    language into chunks, and build one payload per non-blank chunk.
    /// 3. Fetch the chunk refs the store already holds and diff them against
    ///    the local ones (`opts.force_reindex` upserts every local chunk).
    /// 4. Embed and upsert the selected chunks, then delete the stale ones.
    /// 5. If `opts.record_index_state` is set, write the index-state sidecar.
    ///
    /// NOTE(review): `opts.languages` is not consulted anywhere in this
    /// function — confirm whether a language filter is expected here.
    /// NOTE(review): the directory walk and file reads are blocking calls made
    /// directly from this async fn.
    ///
    /// # Errors
    ///
    /// Returns an error if ensuring the collection, embedding, upserting or
    /// deleting fails, or if the embedder returns a different number of
    /// outputs than texts submitted. Files that cannot be read as UTF-8 text
    /// are skipped, and a failure to list the server's chunk refs or to write
    /// the sidecar is logged rather than returned.
    pub async fn sync_project(
        &self,
        project_id: &str,
        root: &Path,
        opts: SyncOpts,
    ) -> Result<SyncReport> {
        use crate::embed::ast_chunker::split_file;
        use crate::retrieval::drift::{diff_chunks, ChunkRef};
        use crate::retrieval::payload::CodePayload;

        // Default chunk target passed to `split_file`; an unset or unparsable
        // `CODESCOUT_CHUNK_TARGET` falls back to it.
        const STACK_CHUNK_TARGET: usize = 1200;
        let chunk_target: usize = std::env::var("CODESCOUT_CHUNK_TARGET")
            .ok()
            .and_then(|s| s.parse().ok())
            .unwrap_or(STACK_CHUNK_TARGET);
        tracing::info!(
            chunk_target,
            force_reindex = opts.force_reindex,
            "retrieval sync starting"
        );
        let started = std::time::Instant::now();

        // Resolved once; every store call below targets the same collection.
        let collection = self.config.collection("code_chunks");
        self.code_store
            .ensure_collection(&collection, self.config.model_dim as u64)
            .await?;

        // ---- 1. Collect local chunks -------------------------------------
        let mut local: Vec<CodePayload> = Vec::new();
        for entry in ignore::WalkBuilder::new(root)
            .hidden(false)
            .build()
            .filter_map(|e| e.ok())
        {
            let Some(ft) = entry.file_type() else {
                continue;
            };
            if !ft.is_file() {
                continue;
            }
            let path = entry.path();
            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
            let Some(lang) = crate::embed::lang_for_ext(ext) else {
                continue;
            };
            // Best effort: unreadable or non-UTF-8 files are skipped.
            let Ok(source) = std::fs::read_to_string(path) else {
                continue;
            };
            // Paths are stored relative to `root`; if the prefix cannot be
            // stripped the walked path is used unchanged.
            let file_path = path
                .strip_prefix(root)
                .unwrap_or(path)
                .display()
                .to_string();

            for c in split_file(&source, lang, path, chunk_target) {
                if c.content.trim().is_empty() {
                    continue;
                }
                let hash = content_hash(&c.content);
                // The id embeds the content hash, so edited content yields a
                // new id instead of an in-place update.
                let chunk_id = format!("{project_id}:{file_path}:{hash}");
                local.push(CodePayload {
                    project_id: project_id.into(),
                    file_path: file_path.clone(),
                    language: lang.into(),
                    start_line: c.start_line as i64,
                    end_line: c.end_line as i64,
                    ast_kind: String::new(),
                    ast_header: String::new(),
                    // The payload owns the only copy of the chunk text; the
                    // embedding input is taken from it later.
                    content: c.content,
                    content_hash: hash,
                    last_indexed_commit: String::new(),
                    chunk_id,
                });
            }
        }

        // ---- 2. Diff against the store -----------------------------------
        let server: Vec<ChunkRef> = match self.code_store.chunk_refs(&collection, project_id).await
        {
            Ok(refs) => refs,
            Err(e) => {
                // Keep the best-effort fallback (an empty server view means
                // everything is re-upserted and nothing deleted), but make
                // the failure visible instead of swallowing it.
                tracing::warn!(
                    error = %e,
                    "failed to list server chunk refs; treating server as empty"
                );
                Vec::new()
            }
        };
        let local_refs: Vec<ChunkRef> = local
            .iter()
            .map(|p| ChunkRef {
                chunk_id: p.chunk_id.clone(),
                content_hash: p.content_hash.clone(),
            })
            .collect();

        let mut action = diff_chunks(&server, &local_refs);
        if opts.force_reindex {
            // Forced re-index: upsert every local chunk, but still delete
            // only what the diff marked stale.
            action.to_upsert = local_refs.iter().map(|r| r.chunk_id.clone()).collect();
        }

        // ---- 3. Embed + upsert -------------------------------------------
        let upsert_set: std::collections::HashSet<&str> =
            action.to_upsert.iter().map(String::as_str).collect();
        // Blank chunks were already excluded while collecting, so only the id
        // check is needed. Payloads are moved, not cloned.
        let to_upsert: Vec<CodePayload> = local
            .into_iter()
            .filter(|p| upsert_set.contains(p.chunk_id.as_str()))
            .collect();
        let texts: Vec<String> = to_upsert.iter().map(|p| p.content.clone()).collect();
        let embeds = if texts.is_empty() {
            Vec::new()
        } else {
            self.embedder.embed_batch(&texts).await?
        };
        // `zip` below would silently drop chunks on a short result while
        // `added` still reported the full count, so fail loudly instead.
        if embeds.len() != to_upsert.len() {
            let msg = format!(
                "embedder returned {} outputs for {} chunks",
                embeds.len(),
                to_upsert.len()
            );
            return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, msg).into());
        }

        let added = to_upsert.len();
        if !to_upsert.is_empty() {
            let chunks: Vec<(CodePayload, crate::retrieval::embedder::EmbedOutput)> =
                to_upsert.into_iter().zip(embeds).collect();
            self.code_store.upsert_chunks(&collection, &chunks).await?;
        }

        // ---- 4. Delete stale chunks --------------------------------------
        let deleted = action.to_delete.len();
        if !action.to_delete.is_empty() {
            self.code_store
                .delete_chunks(&collection, project_id, &action.to_delete)
                .await?;
        }

        let elapsed_ms = started.elapsed().as_millis();
        tracing::info!(added, deleted, elapsed_ms, "retrieval sync finished");

        // ---- 5. Optional sidecar -----------------------------------------
        if opts.record_index_state {
            // A sidecar failure must not fail a sync that already succeeded.
            if let Err(e) = crate::retrieval::index_state::write_index_state(root) {
                tracing::warn!(error = %e, "failed to write index-state sidecar");
            }
        }

        Ok(SyncReport {
            added,
            deleted,
            // NOTE(review): every upsert is counted as `added`; `updated` is
            // never distinguished and stays 0.
            updated: 0,
            elapsed_ms,
        })
    }
}