use std::collections::{BTreeMap, HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result, bail};
use serde::{Deserialize, Serialize};
use crate::index::dense::DenseIndex;
use crate::index::sparse::Bm25Index;
use crate::symbols::Symbol;
use crate::types::Chunk;
use crate::walker;
/// Name of the per-repository directory that holds all index artifacts.
pub const INDEX_DIR_NAME: &str = ".veles";
/// On-disk layout version; `load` rejects indexes whose manifest records a
/// different value and asks the user to rebuild.
pub const FORMAT_VERSION: u32 = 2;
// File names inside the index directory.
const MANIFEST_FILE: &str = "manifest.json";
const CHUNKS_FILE: &str = "chunks.bin";
const BM25_FILE: &str = "bm25.bin";
const DENSE_FILE: &str = "dense.bin";
const SYMBOLS_FILE: &str = "symbols.bin";
/// Per-file identity record used to detect changes between index runs
/// without re-reading every file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileFingerprint {
    /// File size in bytes at index time.
    pub size: u64,
    /// Modification time as whole seconds since the Unix epoch (0 when the
    /// mtime is unavailable or predates the epoch).
    pub mtime_secs: i64,
    /// Number of chunks this file was split into.
    pub chunk_count: usize,
    /// BLAKE3 hex digest of the file contents (see `content_hash`).
    /// `None` in manifests written before hashing existed; kept optional
    /// (and skipped when absent) for backward compatibility.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub content_hash: Option<String>,
}
impl FileFingerprint {
    /// Build a fingerprint for `path`, recording its size, mtime, the given
    /// `chunk_count`, and a content hash of the file bytes.
    ///
    /// Mtimes that are unavailable or predate the Unix epoch collapse to 0,
    /// matching the convention used elsewhere in this module.
    pub fn from_path(path: &Path, chunk_count: usize) -> Result<Self> {
        let meta = fs::metadata(path).with_context(|| format!("stat {}", path.display()))?;
        let mtime_secs = meta
            .modified()
            .unwrap_or(UNIX_EPOCH)
            .duration_since(UNIX_EPOCH)
            .map_or(0, |d| d.as_secs() as i64);
        Ok(Self {
            size: meta.len(),
            mtime_secs,
            chunk_count,
            content_hash: Some(content_hash(path)?),
        })
    }
}
/// Hash the full contents of `path` with BLAKE3 and return the digest as a
/// lowercase hex string.
pub fn content_hash(path: &Path) -> Result<String> {
    let data = fs::read(path).with_context(|| format!("read {}", path.display()))?;
    let digest = blake3::hash(&data);
    Ok(digest.to_hex().to_string())
}
/// Top-level metadata for a persisted index: what produced it, which
/// embedding model/dimension it used, and one fingerprint per indexed file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Manifest {
    /// Crate version that wrote the index (CARGO_PKG_VERSION at build time).
    pub veles_version: String,
    /// On-disk layout version; must equal `FORMAT_VERSION` for `load` to succeed.
    pub format_version: u32,
    /// Identifier of the embedding model the dense index was built with.
    pub model_name: String,
    /// Dimensionality of the dense embedding vectors.
    pub embedding_dim: usize,
    /// Whether plain-text files were included when indexing — presumably a
    /// walker option; confirm against the indexing call site.
    pub include_text_files: bool,
    /// Unix timestamp of the last (re)index; refreshed via `touch`.
    pub indexed_at: i64,
    /// Repo-relative path -> fingerprint. BTreeMap keeps JSON output in a
    /// stable, sorted order.
    pub files: BTreeMap<String, FileFingerprint>,
    /// Total chunk count across all indexed files.
    pub total_chunks: usize,
}
impl Manifest {
pub fn new(model_name: &str, embedding_dim: usize, include_text_files: bool) -> Self {
Self {
veles_version: env!("CARGO_PKG_VERSION").to_string(),
format_version: FORMAT_VERSION,
model_name: model_name.to_string(),
embedding_dim,
include_text_files,
indexed_at: now_secs(),
files: BTreeMap::new(),
total_chunks: 0,
}
}
pub fn touch(&mut self) {
self.indexed_at = now_secs();
}
}
/// Current wall-clock time as whole seconds since the Unix epoch; returns 0
/// if the system clock reads earlier than the epoch.
fn now_secs() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(d) => d.as_secs() as i64,
        Err(_) => 0,
    }
}
/// Path of the on-disk index directory for `repo_root` (`<root>/.veles`).
pub fn index_dir_for(repo_root: &Path) -> PathBuf {
    let mut dir = repo_root.to_path_buf();
    dir.push(INDEX_DIR_NAME);
    dir
}
/// `true` when every required index artifact is present under `repo_root`.
///
/// The symbols file is deliberately not checked: `load` treats it as
/// optional for indexes written before symbols were persisted.
pub fn index_exists(repo_root: &Path) -> bool {
    let dir = index_dir_for(repo_root);
    [MANIFEST_FILE, CHUNKS_FILE, BM25_FILE, DENSE_FILE]
        .iter()
        .all(|name| dir.join(name).is_file())
}
/// A fully loaded index: manifest plus every artifact read back from disk.
pub struct PersistedIndex {
    pub manifest: Manifest,
    /// All chunks, in persisted order.
    pub chunks: Vec<Chunk>,
    /// Sparse lexical (BM25) retrieval index.
    pub bm25: Bm25Index,
    /// Dense embedding retrieval index.
    pub dense: DenseIndex,
    /// Extracted symbols; empty when the symbols file was absent on disk
    /// (older index formats).
    pub symbols: Vec<Symbol>,
}
/// Persist every index artifact under `repo_root`'s index directory,
/// creating the directory if needed.
///
/// The manifest is written first (synchronously); the four bincode
/// artifacts are then encoded and written in parallel via nested
/// `rayon::join` calls. The first error from any writer is returned.
pub fn save(
    repo_root: &Path,
    manifest: &Manifest,
    chunks: &[Chunk],
    bm25: &Bm25Index,
    dense: &DenseIndex,
    symbols: &[Symbol],
) -> Result<()> {
    let dir = index_dir_for(repo_root);
    fs::create_dir_all(&dir).with_context(|| format!("create index dir {}", dir.display()))?;
    write_json(&dir.join(MANIFEST_FILE), manifest)?;

    let chunks_path = dir.join(CHUNKS_FILE);
    let bm25_path = dir.join(BM25_FILE);
    let dense_path = dir.join(DENSE_FILE);
    let symbols_path = dir.join(SYMBOLS_FILE);

    // Fan the four encodes out across the rayon thread pool.
    let (left, right) = rayon::join(
        || {
            rayon::join(
                || write_bincode(&chunks_path, &chunks),
                || write_bincode(&bm25_path, bm25),
            )
        },
        || {
            rayon::join(
                || write_bincode(&dense_path, dense),
                || write_bincode(&symbols_path, &symbols),
            )
        },
    );
    left.0?;
    left.1?;
    right.0?;
    right.1?;
    Ok(())
}
/// Load a previously saved index from `repo_root`.
///
/// Fails if the index directory is missing or the manifest's format version
/// does not match `FORMAT_VERSION`. The symbols file is optional: older
/// indexes without one load with an empty symbol list.
pub fn load(repo_root: &Path) -> Result<PersistedIndex> {
    let dir = index_dir_for(repo_root);
    if !dir.is_dir() {
        bail!("No index found at {}", dir.display());
    }

    let manifest: Manifest = read_json(&dir.join(MANIFEST_FILE))?;
    if manifest.format_version != FORMAT_VERSION {
        bail!(
            "Index format version {} is incompatible (expected {}). Run `veles index --force` to rebuild.",
            manifest.format_version,
            FORMAT_VERSION
        );
    }

    let chunks: Vec<Chunk> = read_bincode(&dir.join(CHUNKS_FILE))?;
    let bm25: Bm25Index = read_bincode(&dir.join(BM25_FILE))?;
    let dense: DenseIndex = read_bincode(&dir.join(DENSE_FILE))?;
    let symbols_path = dir.join(SYMBOLS_FILE);
    let symbols: Vec<Symbol> = match symbols_path.is_file() {
        true => read_bincode(&symbols_path)?,
        false => Vec::new(),
    };

    Ok(PersistedIndex {
        manifest,
        chunks,
        bm25,
        dense,
        symbols,
    })
}
/// Read only the manifest, without touching the heavyweight bincode
/// artifacts. Note this does not validate `format_version`; `load` does.
pub fn load_manifest(repo_root: &Path) -> Result<Manifest> {
    read_json(&index_dir_for(repo_root).join(MANIFEST_FILE))
}
/// Delete the index directory if present.
///
/// Returns `Ok(true)` when something was removed, `Ok(false)` when there
/// was nothing to clean.
pub fn clean(repo_root: &Path) -> Result<bool> {
    let dir = index_dir_for(repo_root);
    if !dir.is_dir() {
        return Ok(false);
    }
    fs::remove_dir_all(&dir).with_context(|| format!("remove {}", dir.display()))?;
    Ok(true)
}
fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<()> {
let f = fs::File::create(path).with_context(|| format!("create {}", path.display()))?;
serde_json::to_writer_pretty(f, value).with_context(|| format!("write {}", path.display()))?;
Ok(())
}
/// Deserialize a JSON value of type `T` from the file at `path`.
fn read_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
    let file = fs::File::open(path).with_context(|| format!("open {}", path.display()))?;
    let reader = std::io::BufReader::new(file);
    serde_json::from_reader(reader).with_context(|| format!("parse {}", path.display()))
}
fn write_bincode<T: Serialize>(path: &Path, value: &T) -> Result<()> {
let f = fs::File::create(path).with_context(|| format!("create {}", path.display()))?;
let mut w = std::io::BufWriter::new(f);
bincode::serialize_into(&mut w, value).with_context(|| format!("encode {}", path.display()))?;
Ok(())
}
/// Deserialize a bincode value of type `T` from the file at `path`.
fn read_bincode<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
    let file = fs::File::open(path).with_context(|| format!("open {}", path.display()))?;
    bincode::deserialize_from(std::io::BufReader::new(file))
        .with_context(|| format!("decode {}", path.display()))
}
/// Snapshot of one file currently present on disk.
#[derive(Debug, Clone)]
pub struct DiskEntry {
    pub abs_path: PathBuf,
    pub size: u64,
    pub mtime_secs: i64,
}

/// How one on-disk file compares to its manifest fingerprint.
#[derive(Debug, Clone)]
pub enum Classification {
    /// Size and mtime both match the manifest entry.
    Unchanged,
    /// Bytes are identical but the mtime moved; `hash` is the fresh digest.
    MtimeOnly { hash: String },
    /// Content differs (or could not be hashed) from the manifest entry.
    Modified { hash: Option<String> },
    /// Present on disk but absent from the manifest.
    Added,
}

/// Result of comparing the working tree against a manifest.
#[derive(Debug)]
pub struct DiskState {
    pub on_disk: HashMap<String, DiskEntry>,
    pub classification: HashMap<String, Classification>,
    pub removed: Vec<String>,
}

impl DiskState {
    /// Number of candidate files currently present on disk.
    pub fn seen_now(&self) -> usize {
        self.on_disk.len()
    }

    /// Count classifications satisfying `pred` (shared by the counters below).
    fn count_where(&self, pred: impl Fn(&Classification) -> bool) -> usize {
        self.classification.values().filter(|c| pred(c)).count()
    }

    pub fn count_added(&self) -> usize {
        self.count_where(|c| matches!(c, Classification::Added))
    }

    pub fn count_modified(&self) -> usize {
        self.count_where(|c| matches!(c, Classification::Modified { .. }))
    }

    pub fn count_mtime_only(&self) -> usize {
        self.count_where(|c| matches!(c, Classification::MtimeOnly { .. }))
    }

    pub fn count_unchanged(&self) -> usize {
        self.count_where(|c| matches!(c, Classification::Unchanged))
    }

    pub fn count_removed(&self) -> usize {
        self.removed.len()
    }

    /// `true` when nothing changed: no removals and every classified file
    /// is `Unchanged`.
    pub fn is_clean(&self) -> bool {
        self.removed.is_empty()
            && self.count_where(|c| !matches!(c, Classification::Unchanged)) == 0
    }
}
/// Walk `repo_root` (filtered by `extensions` via the walker) and classify
/// every file against `manifest`'s fingerprints, also collecting manifest
/// entries whose files no longer exist.
pub fn classify_disk(
    repo_root: &Path,
    manifest: &Manifest,
    extensions: &HashSet<String>,
) -> DiskState {
    // Pass 1: snapshot every candidate file currently on disk, keyed by its
    // repo-relative path.
    let mut on_disk: HashMap<String, DiskEntry> = HashMap::new();
    for abs in walker::walk_files(repo_root, extensions) {
        // Skip anything the walker yields that isn't under the repo root.
        let Ok(rel_path) = abs.strip_prefix(repo_root) else {
            continue;
        };
        let rel = rel_path.to_string_lossy().into_owned();
        // The file may vanish between walk and stat; just skip it.
        let Ok(meta) = fs::metadata(&abs) else {
            continue;
        };
        // Unavailable or pre-epoch mtimes collapse to 0 — same convention
        // as FileFingerprint::from_path.
        let mtime_secs = meta
            .modified()
            .ok()
            .and_then(|m| m.duration_since(UNIX_EPOCH).ok())
            .map(|d| d.as_secs() as i64)
            .unwrap_or(0);
        on_disk.insert(
            rel,
            DiskEntry {
                abs_path: abs,
                size: meta.len(),
                mtime_secs,
            },
        );
    }
    // Pass 2: compare each on-disk file against its fingerprint. Arm order
    // matters: the cheap size+mtime check comes first so unchanged files are
    // never re-hashed.
    let mut classification: HashMap<String, Classification> = HashMap::new();
    for (rel, entry) in &on_disk {
        let cls = match manifest.files.get(rel) {
            // Same size and mtime: assume untouched without hashing.
            Some(prev) if prev.size == entry.size && prev.mtime_secs == entry.mtime_secs => {
                Classification::Unchanged
            }
            // Same size but mtime moved, and we have a stored hash: re-hash
            // to see whether the bytes actually changed.
            Some(prev) if prev.size == entry.size && prev.content_hash.is_some() => {
                match content_hash(&entry.abs_path) {
                    Ok(h) if Some(&h) == prev.content_hash.as_ref() => {
                        Classification::MtimeOnly { hash: h }
                    }
                    Ok(h) => Classification::Modified { hash: Some(h) },
                    // Unreadable now: treat as modified with unknown hash.
                    Err(_) => Classification::Modified { hash: None },
                }
            }
            // Size differs, or legacy manifest entry without a stored hash.
            Some(_) => Classification::Modified { hash: None },
            // Not in the manifest at all.
            None => Classification::Added,
        };
        classification.insert(rel.clone(), cls);
    }
    // Manifest entries with no matching file on disk have been removed.
    let removed: Vec<String> = manifest
        .files
        .keys()
        .filter(|k| !on_disk.contains_key(*k))
        .cloned()
        .collect();
    DiskState {
        on_disk,
        classification,
        removed,
    }
}
/// Summary of what one incremental index update did.
#[derive(Debug, Default, Clone)]
pub struct UpdateReport {
    pub added_files: usize,
    pub modified_files: usize,
    pub removed_files: usize,
    pub mtime_refreshed_files: usize,
    pub kept_chunks: usize,
    pub new_chunks: usize,
    pub total_chunks: usize,
}

impl UpdateReport {
    /// `true` when the update touched no files at all (nothing added,
    /// modified, removed, or mtime-refreshed). Chunk counters are not
    /// consulted here — they only move when some file changed.
    pub fn is_noop(&self) -> bool {
        [
            self.added_files,
            self.modified_files,
            self.removed_files,
            self.mtime_refreshed_files,
        ]
        .iter()
        .all(|&n| n == 0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // A manifest serialized to JSON and parsed back preserves all fields,
    // including the optional content hash.
    #[test]
    fn manifest_roundtrip_via_json() {
        let mut m = Manifest::new("test-model", 64, false);
        m.files.insert(
            "src/lib.rs".to_string(),
            FileFingerprint {
                size: 100,
                mtime_secs: 1_000_000,
                chunk_count: 2,
                content_hash: Some("deadbeef".to_string()),
            },
        );
        m.total_chunks = 2;
        let s = serde_json::to_string(&m).unwrap();
        let m2: Manifest = serde_json::from_str(&s).unwrap();
        assert_eq!(m2.model_name, "test-model");
        assert_eq!(m2.embedding_dim, 64);
        assert_eq!(m2.files.len(), 1);
        assert_eq!(m2.files["src/lib.rs"].size, 100);
        assert_eq!(
            m2.files["src/lib.rs"].content_hash.as_deref(),
            Some("deadbeef")
        );
    }

    // Manifests written before `content_hash` existed (no such key in the
    // JSON) must still parse, with the hash defaulting to None — this is
    // what `#[serde(default)]` on the field guarantees.
    #[test]
    fn legacy_manifest_without_content_hash_loads() {
        let json = r#"{
"veles_version": "0.2.3",
"format_version": 2,
"model_name": "test-model",
"embedding_dim": 64,
"include_text_files": false,
"indexed_at": 0,
"files": {
"src/lib.rs": {
"size": 100,
"mtime_secs": 1000000,
"chunk_count": 2
}
},
"total_chunks": 2
}"#;
        let m: Manifest = serde_json::from_str(json).unwrap();
        assert_eq!(m.files["src/lib.rs"].size, 100);
        assert!(m.files["src/lib.rs"].content_hash.is_none());
    }

    // The content hash must be stable for identical bytes and differ when
    // the bytes change — the property `classify_disk` relies on.
    #[test]
    fn content_hash_is_deterministic_and_discriminates() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("a.txt");
        std::fs::write(&p, b"hello").unwrap();
        let h1 = content_hash(&p).unwrap();
        let h2 = content_hash(&p).unwrap();
        assert_eq!(h1, h2, "same bytes must hash the same");
        std::fs::write(&p, b"hello world").unwrap();
        let h3 = content_hash(&p).unwrap();
        assert_ne!(h1, h3, "different bytes must hash differently");
    }
}