use std::collections::{BTreeMap, BTreeSet};
use std::fs;
use std::path::{Path, PathBuf};
use chrono::Utc;
use sha2::{Digest, Sha256};
use thiserror::Error;
use crate::cache::cache::ContextCache;
use crate::cache::versioning::{CacheBuildConfig, CacheIndex, CacheManifest, ManifestDocumentEntry};
use crate::document::Document;
/// Errors that can be returned while building a cache directory.
#[derive(Debug, Error)]
pub enum CacheBuildError {
    /// A filesystem operation failed.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// JSON serialization of the build config, a document, the index or the
    /// manifest failed.
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),

    /// The requested output directory is already present; carries its path.
    #[error("Output directory already exists: {0}")]
    OutputExists(PathBuf),

    /// Two documents would be written to the same file name; carries the
    /// offending hash fragment.
    #[error("Filename collision detected for hash fragment: {0}")]
    FilenameCollision(String),

    /// The same document id was supplied more than once; carries the id.
    #[error("Duplicate document ID: {0}")]
    DuplicateDocumentId(String),

    /// A document version string was not in the expected format; carries the
    /// version string.
    #[error("Invalid version format: {0}")]
    InvalidVersionFormat(String),
}
/// Builds an on-disk cache directory from a set of documents.
pub struct CacheBuilder {
    /// Build configuration; it is hashed into the cache version and copied
    /// into the manifest.
    config: CacheBuildConfig,
}
impl CacheBuilder {
    /// Number of leading digest characters used as a document's file stem.
    const FILE_STEM_LEN: usize = 12;

    /// Creates a builder that will use `config` for every build.
    pub fn new(config: CacheBuildConfig) -> Self {
        Self { config }
    }

    /// Builds a cache for `documents` and publishes it at `output_dir`.
    ///
    /// Documents are processed in ascending id order. The cache version is the
    /// SHA-256 of the serialized build config followed by one `id:version`
    /// record per document. All files are first written to a sibling staging
    /// directory which is then renamed to `output_dir`, so `output_dir` only
    /// appears once every file has been written and synced.
    ///
    /// # Errors
    ///
    /// * [`CacheBuildError::OutputExists`] if `output_dir` already exists.
    /// * [`CacheBuildError::DuplicateDocumentId`] if two documents share an id.
    /// * [`CacheBuildError::InvalidVersionFormat`] if a version lacks the
    ///   `sha256:` prefix or its file stem is not filename-safe.
    /// * [`CacheBuildError::FilenameCollision`] if a digest is shorter than the
    ///   file stem, or two documents map to the same file stem.
    /// * [`CacheBuildError::Io`] / [`CacheBuildError::Serialization`] if
    ///   writing fails; the staging directory is removed on a best-effort
    ///   basis in that case.
    pub fn build(
        &self,
        documents: Vec<Document>,
        output_dir: &Path,
    ) -> Result<ContextCache, CacheBuildError> {
        if output_dir.exists() {
            return Err(CacheBuildError::OutputExists(output_dir.to_path_buf()));
        }

        // Sorting makes the version hash and the on-disk layout independent of
        // the caller's ordering, and puts duplicate ids next to each other.
        let mut sorted_docs = documents;
        sorted_docs.sort_by(|a, b| a.id.cmp(&b.id));
        if let Some(pair) = sorted_docs
            .windows(2)
            .find(|pair| pair[0].id == pair[1].id)
        {
            return Err(CacheBuildError::DuplicateDocumentId(
                pair[0].id.as_str().to_string(),
            ));
        }

        let mut manifest_documents = Vec::with_capacity(sorted_docs.len());
        let mut index_entries = BTreeMap::new();
        let mut seen_stems = BTreeSet::new();
        let mut version_hasher = Sha256::new();
        let config_json = serde_json::to_vec(&self.config)?;
        version_hasher.update(&config_json);

        for doc in &sorted_docs {
            let line = format!("{}:{}", doc.id.as_str(), doc.version.as_str());
            version_hasher.update(line.as_bytes());

            let stem = Self::file_stem(doc.version.as_str())?;
            // `insert` returns false when the stem was already present.
            if !seen_stems.insert(stem) {
                return Err(CacheBuildError::FilenameCollision(stem.to_string()));
            }

            let relative_path = format!("documents/{}.json", stem);
            index_entries.insert(doc.id.clone(), relative_path.clone());
            // Pushed in id order, so the manifest needs no further sorting.
            manifest_documents.push(ManifestDocumentEntry {
                id: doc.id.clone(),
                version: doc.version.clone(),
                file: relative_path,
            });
        }

        let cache_version = format!("sha256:{}", hex::encode(version_hasher.finalize()));
        let temp_dir = Self::staging_dir(output_dir, &cache_version)?;

        let manifest = CacheManifest {
            cache_version,
            build_config: self.config.clone(),
            created_at: Utc::now(),
            document_count: sorted_docs.len(),
            documents: manifest_documents,
        };
        let index = CacheIndex::new(index_entries);

        // A leftover staging directory from an earlier failed build of the
        // same cache version is discarded before starting over.
        if temp_dir.exists() {
            fs::remove_dir_all(&temp_dir)?;
        }

        let populate = || -> Result<(), CacheBuildError> {
            fs::create_dir_all(&temp_dir)?;
            fs::create_dir(temp_dir.join("documents"))?;
            // `manifest.documents` was built from `sorted_docs` in the same
            // order, so zipping pairs every document with its own entry.
            for (doc, entry) in sorted_docs.iter().zip(&manifest.documents) {
                Self::write_json_synced(&temp_dir.join(&entry.file), |w| {
                    serde_json::to_writer(w, doc)
                })?;
            }
            Self::write_json_synced(&temp_dir.join("index.json"), |w| {
                serde_json::to_writer_pretty(w, &index)
            })?;
            Self::write_json_synced(&temp_dir.join("manifest.json"), |w| {
                serde_json::to_writer_pretty(w, &manifest)
            })?;
            Ok(())
        };

        let published = populate()
            .and_then(|()| fs::rename(&temp_dir, output_dir).map_err(CacheBuildError::from));
        if let Err(err) = published {
            // Best effort: do not leave a half-written staging directory
            // behind; the original error is the one worth reporting.
            let _ = fs::remove_dir_all(&temp_dir);
            return Err(err);
        }

        Ok(ContextCache {
            root: output_dir.to_path_buf(),
            manifest,
        })
    }

    /// Extracts the file stem (leading digest characters) from a
    /// `sha256:<digest>` version string.
    ///
    /// The stem must consist only of ASCII letters, digits, `-` or `_`, so it
    /// can neither split a multi-byte character nor contain path separators
    /// or `..` components.
    fn file_stem(version: &str) -> Result<&str, CacheBuildError> {
        let digest = version
            .strip_prefix("sha256:")
            .ok_or_else(|| CacheBuildError::InvalidVersionFormat(version.to_string()))?;
        if digest.len() < Self::FILE_STEM_LEN {
            // Reported as a collision for compatibility with existing callers.
            return Err(CacheBuildError::FilenameCollision(digest.to_string()));
        }
        digest
            .get(..Self::FILE_STEM_LEN)
            .filter(|stem| {
                stem.bytes()
                    .all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
            })
            .ok_or_else(|| CacheBuildError::InvalidVersionFormat(version.to_string()))
    }

    /// Returns the sibling staging directory for `output_dir`.
    ///
    /// The suffix is appended to the full final path component rather than
    /// replacing its extension, so `cache.v1` and `cache.v2` get distinct
    /// staging directories.
    fn staging_dir(output_dir: &Path, cache_version: &str) -> Result<PathBuf, CacheBuildError> {
        let final_name = output_dir.file_name().ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!(
                    "output directory has no final path component: {}",
                    output_dir.display()
                ),
            )
        })?;
        let mut staging_name = final_name.to_os_string();
        // `cache_version` is "sha256:" (7 bytes) followed by 64 hex digits;
        // the first 12 digits are enough to tell staging directories apart.
        staging_name.push(format!(".tmp.{}", &cache_version[7..19]));
        Ok(output_dir.with_file_name(staging_name))
    }

    /// Creates `path`, lets `serialize` write JSON through a buffered writer,
    /// then flushes the buffer and syncs the file to disk.
    fn write_json_synced<F>(path: &Path, serialize: F) -> Result<(), CacheBuildError>
    where
        F: FnOnce(&mut std::io::BufWriter<fs::File>) -> serde_json::Result<()>,
    {
        let mut writer = std::io::BufWriter::new(fs::File::create(path)?);
        serialize(&mut writer)?;
        // Flush explicitly: dropping a `BufWriter` swallows flush errors.
        std::io::Write::flush(&mut writer)?;
        writer.get_ref().sync_all()?;
        Ok(())
    }
}