1use std::collections::BTreeMap;
16use std::path::Path;
17
18use serde::{Deserialize, Serialize};
19
20use super::error::EmbedError;
21use super::hasher::IncrementalHasher;
22use super::types::{ChunkKind, EmbedChunk, EmbedSettings};
23use crate::bincode_safe::{deserialize_with_limit, serialize};
24
/// Manifest format version stamped into every manifest created by
/// [`EmbedManifest::new`].
///
/// [`EmbedManifest::load`] rejects a stored manifest whose `version` is greater
/// than this value.
pub const MANIFEST_VERSION: u32 = 3;
27
/// Persistent record of the chunks known for a repository, keyed by location.
///
/// Written to disk with [`EmbedManifest::save`], read back with
/// [`EmbedManifest::load`], and compared against a fresh chunk list with
/// [`EmbedManifest::diff`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedManifest {
    /// Format version; [`EmbedManifest::new`] sets it to [`MANIFEST_VERSION`].
    pub version: u32,

    /// Repository path supplied by the caller of [`EmbedManifest::new`].
    pub repo_path: String,

    /// Optional commit hash; `None` in a freshly constructed manifest.
    // NOTE(review): nothing in this file assigns this field — presumably the
    // caller records the commit the chunks were generated from; confirm.
    #[serde(default)]
    pub commit_hash: Option<String>,

    /// Seconds since the Unix epoch, set by [`EmbedManifest::save`]; `None` in
    /// a freshly constructed manifest.
    #[serde(default)]
    pub updated_at: Option<u64>,

    /// Settings supplied to [`EmbedManifest::new`]. They are hashed into the
    /// checksum, validated by [`EmbedManifest::load`] and compared by
    /// [`EmbedManifest::settings_match`].
    pub settings: EmbedSettings,

    /// Manifest entries keyed by [`EmbedManifest::location_key`].
    pub chunks: BTreeMap<String, ManifestEntry>,

    /// Integrity checksum written by [`EmbedManifest::save`].
    /// [`EmbedManifest::load`] takes it out of the struct (leaving `None`) and
    /// verifies it when present.
    #[serde(default)]
    pub checksum: Option<String>,
}
81
/// What the manifest remembers about one chunk at one location.
///
/// [`EmbedManifest::update`] fills every field from the corresponding
/// `EmbedChunk`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ManifestEntry {
    /// Copy of the chunk's `id`; [`EmbedManifest::diff`] compares this value to
    /// decide whether a location is unchanged or modified.
    pub chunk_id: String,

    /// Copy of the chunk's `full_hash`.
    pub full_hash: String,

    /// Copy of the chunk's `tokens` value.
    pub tokens: u32,

    /// Copy of the chunk's `source.lines` pair.
    // NOTE(review): presumably (start_line, end_line) — confirm against ChunkSource.
    pub lines: (u32, u32),
}
97
98impl EmbedManifest {
99 pub fn new(repo_path: String, settings: EmbedSettings) -> Self {
101 Self {
102 version: MANIFEST_VERSION,
103 repo_path,
104 commit_hash: None,
105 updated_at: None,
106 settings,
107 chunks: BTreeMap::new(),
108 checksum: None,
109 }
110 }
111
112 pub fn location_key(file: &str, symbol: &str, kind: ChunkKind) -> String {
117 format!("{}::{}::{}", file, symbol, kind.name())
118 }
119
120 fn compute_checksum(&self) -> String {
122 let mut hasher = IncrementalHasher::new();
123
124 hasher.update_u32(self.version);
126
127 let settings_json = serde_json::to_string(&self.settings).unwrap_or_default();
129 hasher.update_str(&settings_json);
130
131 let mut keys: Vec<_> = self.chunks.keys().collect();
133 keys.sort();
134
135 for key in keys {
136 if let Some(entry) = self.chunks.get(key) {
137 hasher.update_str(key);
138 hasher.update_str(&entry.chunk_id);
139 hasher.update_str(&entry.full_hash);
140 hasher.update_u32(entry.tokens);
141 hasher.update_u32(entry.lines.0);
142 hasher.update_u32(entry.lines.1);
143 }
144 }
145
146 hasher.finalize_hex()
147 }
148
149 pub fn save(&mut self, path: &Path) -> Result<(), EmbedError> {
171 if let Some(parent) = path.parent() {
173 std::fs::create_dir_all(parent)
174 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
175 }
176
177 self.updated_at = Some(
179 std::time::SystemTime::now()
180 .duration_since(std::time::UNIX_EPOCH)
181 .map(|d| d.as_secs())
182 .unwrap_or(0),
183 );
184
185 self.checksum = Some(self.compute_checksum());
187
188 let bytes = serialize(self)
190 .map_err(|e| EmbedError::SerializationError { reason: e.to_string() })?;
191
192 let tmp_path = path.with_extension(format!("tmp.{}", std::process::id()));
195 std::fs::write(&tmp_path, bytes)
196 .map_err(|e| EmbedError::IoError { path: tmp_path.clone(), source: e })?;
197
198 std::fs::rename(&tmp_path, path)
200 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
201
202 Ok(())
203 }
204
205 pub fn load(path: &Path) -> Result<Self, EmbedError> {
207 let bytes = std::fs::read(path)
208 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
209
210 let mut manifest: Self = deserialize_with_limit(&bytes)
211 .map_err(|e| EmbedError::DeserializationError { reason: format!("Failed to read embed manifest (it may have been created by an older version of infiniloom; delete {:?} and re-run): {}", path, e) })?;
212
213 if manifest.version > MANIFEST_VERSION {
215 return Err(EmbedError::ManifestVersionTooNew {
216 found: manifest.version,
217 max_supported: MANIFEST_VERSION,
218 });
219 }
220
221 if let Some(stored_checksum) = manifest.checksum.take() {
223 let computed = manifest.compute_checksum();
224 if !constant_time_eq(stored_checksum.as_bytes(), computed.as_bytes()) {
225 return Err(EmbedError::ManifestCorrupted {
226 path: path.to_path_buf(),
227 expected: stored_checksum,
228 actual: computed,
229 });
230 }
231 }
232
233 manifest.settings.validate()?;
235
236 Ok(manifest)
237 }
238
239 pub fn load_if_exists(path: &Path) -> Result<Option<Self>, EmbedError> {
241 if path.exists() {
242 Ok(Some(Self::load(path)?))
243 } else {
244 Ok(None)
245 }
246 }
247
248 pub fn update(&mut self, chunks: &[EmbedChunk]) -> Result<(), EmbedError> {
250 let mut id_to_hash: BTreeMap<&str, &str> = BTreeMap::new();
253
254 self.chunks.clear();
255
256 for chunk in chunks {
257 if let Some(&existing_hash) = id_to_hash.get(chunk.id.as_str()) {
259 if existing_hash != chunk.full_hash.as_str() {
260 return Err(EmbedError::HashCollision {
261 id: chunk.id.clone(),
262 hash1: existing_hash.to_owned(),
263 hash2: chunk.full_hash.clone(),
264 });
265 }
266 }
267 id_to_hash.insert(&chunk.id, &chunk.full_hash);
268
269 let key = Self::location_key(&chunk.source.file, &chunk.source.symbol, chunk.kind);
270
271 self.chunks.insert(
272 key,
273 ManifestEntry {
274 chunk_id: chunk.id.clone(),
275 full_hash: chunk.full_hash.clone(),
276 tokens: chunk.tokens,
277 lines: chunk.source.lines,
278 },
279 );
280 }
281
282 Ok(())
283 }
284
285 pub fn diff(&self, current_chunks: &[EmbedChunk]) -> EmbedDiff {
287 let mut added = Vec::new();
288 let mut modified = Vec::new();
289 let mut removed = Vec::new();
290 let mut unchanged = Vec::new();
291
292 let current_map: BTreeMap<String, &EmbedChunk> = current_chunks
295 .iter()
296 .map(|c| (Self::location_key(&c.source.file, &c.source.symbol, c.kind), c))
297 .collect();
298
299 for (key, entry) in &self.chunks {
301 if let Some(current) = current_map.get(key) {
302 if current.id == entry.chunk_id {
303 unchanged.push(current.id.clone());
304 } else {
305 modified.push(ModifiedChunk {
306 old_id: entry.chunk_id.clone(),
307 new_id: current.id.clone(),
308 chunk: (*current).clone(),
309 });
310 }
311 } else {
312 removed
314 .push(RemovedChunk { id: entry.chunk_id.clone(), location_key: key.clone() });
315 }
316 }
317
318 for (key, chunk) in ¤t_map {
320 if !self.chunks.contains_key(key) {
321 added.push((*chunk).clone());
322 }
323 }
324
325 let summary = DiffSummary {
326 added: added.len(),
327 modified: modified.len(),
328 removed: removed.len(),
329 unchanged: unchanged.len(),
330 total_chunks: current_chunks.len(),
331 };
332
333 EmbedDiff { summary, added, modified, removed, unchanged }
334 }
335
336 pub fn settings_match(&self, settings: &EmbedSettings) -> bool {
338 &self.settings == settings
339 }
340
341 pub fn chunk_count(&self) -> usize {
343 self.chunks.len()
344 }
345}
346
/// Result of [`EmbedManifest::diff`]: the current chunks classified against
/// the manifest by location key.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedDiff {
    /// Element counts for each category below, plus the input size.
    pub summary: DiffSummary,

    /// Current chunks whose location key is absent from the manifest.
    pub added: Vec<EmbedChunk>,

    /// Chunks whose location key exists in the manifest under a different
    /// chunk id.
    pub modified: Vec<ModifiedChunk>,

    /// Manifest entries whose location key is absent from the current chunks.
    pub removed: Vec<RemovedChunk>,

    /// Ids of chunks whose location key and chunk id both match the manifest.
    pub unchanged: Vec<String>,
}
365
366impl EmbedDiff {
367 pub fn has_changes(&self) -> bool {
369 self.summary.added > 0 || self.summary.modified > 0 || self.summary.removed > 0
370 }
371
372 pub fn chunks_to_upsert(&self) -> Vec<&EmbedChunk> {
374 let mut chunks: Vec<&EmbedChunk> = self.added.iter().collect();
375 chunks.extend(self.modified.iter().map(|m| &m.chunk));
376 chunks
377 }
378
379 pub fn ids_to_delete(&self) -> Vec<&str> {
381 let mut ids: Vec<&str> = self.removed.iter().map(|r| r.id.as_str()).collect();
382 ids.extend(self.modified.iter().map(|m| m.old_id.as_str()));
384 ids
385 }
386
387 pub fn batches(&self, batch_size: usize) -> Vec<DiffBatch> {
389 let mut batches = Vec::new();
390 let mut batch_num = 0;
391
392 for chunk in self.added.chunks(batch_size) {
394 batches.push(DiffBatch {
395 batch_number: batch_num,
396 operation: BatchOperation::Upsert,
397 chunks: chunk.to_vec(),
398 ids: Vec::new(),
399 });
400 batch_num += 1;
401 }
402
403 for chunk in self.modified.chunks(batch_size) {
405 batches.push(DiffBatch {
406 batch_number: batch_num,
407 operation: BatchOperation::Upsert,
408 chunks: chunk.iter().map(|m| m.chunk.clone()).collect(),
409 ids: chunk.iter().map(|m| m.old_id.clone()).collect(), });
411 batch_num += 1;
412 }
413
414 for ids in self.removed.chunks(batch_size) {
416 batches.push(DiffBatch {
417 batch_number: batch_num,
418 operation: BatchOperation::Delete,
419 chunks: Vec::new(),
420 ids: ids.iter().map(|r| r.id.clone()).collect(),
421 });
422 batch_num += 1;
423 }
424
425 batches
426 }
427}
428
/// Counts describing an [`EmbedDiff`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffSummary {
    /// Number of added chunks.
    pub added: usize,

    /// Number of modified chunks.
    pub modified: usize,

    /// Number of removed chunks.
    pub removed: usize,

    /// Number of unchanged chunks.
    pub unchanged: usize,

    /// Length of the chunk slice that was passed to [`EmbedManifest::diff`].
    pub total_chunks: usize,
}
447
/// A location whose chunk id differs between the manifest and the current
/// chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModifiedChunk {
    /// Chunk id recorded in the manifest for this location.
    pub old_id: String,

    /// Id of the current chunk at this location.
    pub new_id: String,

    /// Clone of the current chunk at this location.
    pub chunk: EmbedChunk,
}
460
/// A manifest entry that has no counterpart among the current chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemovedChunk {
    /// Chunk id recorded in the manifest.
    pub id: String,

    /// Manifest key under which the entry was stored.
    pub location_key: String,
}
470
/// One unit of work produced by [`EmbedDiff::batches`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffBatch {
    /// Zero-based position of this batch in the sequence returned by
    /// [`EmbedDiff::batches`].
    pub batch_number: usize,

    /// Whether the batch upserts chunks or deletes ids.
    pub operation: BatchOperation,

    /// Chunks to upsert; empty for `Delete` batches.
    pub chunks: Vec<EmbedChunk>,

    /// For `Delete` batches, the ids of removed chunks. For `Upsert` batches
    /// built from modified chunks, the old ids being replaced. Empty for
    /// `Upsert` batches of added chunks.
    pub ids: Vec<String>,
}
486
/// Kind of work a [`DiffBatch`] carries.
///
/// The serde attribute renames the variants to snake_case (`upsert`,
/// `delete`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BatchOperation {
    /// Used for batches of added and modified chunks.
    Upsert,
    /// Used for batches of removed chunk ids.
    Delete,
}
496
/// Compares two byte slices without stopping at the first differing byte.
///
/// Slices of different lengths are reported unequal straight away. For slices
/// of equal length every byte pair is XOR-ed and the results are OR-ed into a
/// single accumulator, so the whole input is always visited; the slices are
/// equal exactly when the accumulator ends up zero.
#[inline]
fn constant_time_eq(a: &[u8], b: &[u8]) -> bool {
    a.len() == b.len()
        && a.iter()
            .zip(b)
            .fold(0u8, |acc, (lhs, rhs)| acc | (lhs ^ rhs))
            == 0
}
515
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{ChunkContext, ChunkSource, RepoIdentifier, Visibility};
    use tempfile::TempDir;

    /// Builds a public, non-test Rust function chunk at `file`/`symbol` whose
    /// full hash is `"<id>_full"`.
    fn create_test_chunk(id: &str, file: &str, symbol: &str) -> EmbedChunk {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: file.to_owned(),
            lines: (1, 5),
            symbol: symbol.to_owned(),
            fqn: None,
            language: "rust".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };

        EmbedChunk {
            id: id.to_owned(),
            full_hash: format!("{}_full", id),
            content: "fn test() {}".to_owned(),
            tokens: 10,
            kind: ChunkKind::Function,
            source,
            children_ids: Vec::new(),
            context: ChunkContext::default(),
            repr: "code".to_string(),
            code_chunk_id: None,
            part: None,
        }
    }

    /// Empty manifest for `"my-repo"` with default settings.
    fn my_repo_manifest() -> EmbedManifest {
        EmbedManifest::new("my-repo".to_owned(), EmbedSettings::default())
    }

    #[test]
    fn test_new_manifest() {
        let fresh = my_repo_manifest();

        assert_eq!(fresh.version, MANIFEST_VERSION);
        assert_eq!(fresh.repo_path, "my-repo");
        assert!(fresh.chunks.is_empty());
    }

    #[test]
    fn test_location_key() {
        assert_eq!(
            EmbedManifest::location_key("src/auth.rs", "validate", ChunkKind::Function),
            "src/auth.rs::validate::function"
        );
    }

    #[test]
    fn test_save_and_load() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("test.bin");

        let mut original = my_repo_manifest();
        original
            .update(&[
                create_test_chunk("ec_123", "src/foo.rs", "foo"),
                create_test_chunk("ec_456", "src/bar.rs", "bar"),
            ])
            .unwrap();
        original.save(&file).unwrap();

        let reloaded = EmbedManifest::load(&file).unwrap();
        assert_eq!(reloaded.repo_path, "my-repo");
        assert_eq!(reloaded.chunks.len(), 2);
    }

    #[test]
    fn test_integrity_verification() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("test.bin");

        my_repo_manifest().save(&file).unwrap();

        // Flip every bit of the byte ten positions from the end, if the file
        // is long enough to have one.
        let mut raw = std::fs::read(&file).unwrap();
        if let Some(target) = raw.len().checked_sub(10) {
            raw[target] ^= 0xFF;
            std::fs::write(&file, raw).unwrap();
        }

        let outcome = EmbedManifest::load(&file);
        assert!(matches!(
            outcome,
            Err(EmbedError::ManifestCorrupted { .. } | EmbedError::DeserializationError { .. })
        ));
    }

    #[test]
    fn test_diff_added() {
        let empty = my_repo_manifest();
        let incoming = [create_test_chunk("ec_123", "src/foo.rs", "foo")];

        let summary = empty.diff(&incoming).summary;
        assert_eq!(summary.added, 1);
        assert_eq!(summary.modified, 0);
        assert_eq!(summary.removed, 0);
    }

    #[test]
    fn test_diff_modified() {
        let mut manifest = my_repo_manifest();
        manifest
            .update(&[create_test_chunk("ec_old", "src/foo.rs", "foo")])
            .unwrap();

        let incoming = [create_test_chunk("ec_new", "src/foo.rs", "foo")];
        let result = manifest.diff(&incoming);

        assert_eq!(result.summary.added, 0);
        assert_eq!(result.summary.modified, 1);
        assert_eq!(result.summary.removed, 0);

        let change = &result.modified[0];
        assert_eq!(change.old_id, "ec_old");
        assert_eq!(change.new_id, "ec_new");
    }

    #[test]
    fn test_diff_removed() {
        let mut manifest = my_repo_manifest();
        manifest
            .update(&[create_test_chunk("ec_123", "src/foo.rs", "foo")])
            .unwrap();

        let summary = manifest.diff(&[]).summary;
        assert_eq!(summary.added, 0);
        assert_eq!(summary.modified, 0);
        assert_eq!(summary.removed, 1);
    }

    #[test]
    fn test_diff_unchanged() {
        let mut manifest = my_repo_manifest();
        let same = [create_test_chunk("ec_123", "src/foo.rs", "foo")];
        manifest.update(&same).unwrap();

        let result = manifest.diff(&same);
        assert_eq!(result.summary.unchanged, 1);
        assert!(!result.has_changes());
    }

    #[test]
    fn test_batches() {
        let manifest = my_repo_manifest();

        let mut incoming = Vec::new();
        for i in 0..5 {
            incoming.push(create_test_chunk(
                &format!("ec_{i}"),
                &format!("src/f{i}.rs"),
                &format!("f{i}"),
            ));
        }

        let produced = manifest.diff(&incoming).batches(2);
        let sizes: Vec<usize> = produced.iter().map(|b| b.chunks.len()).collect();

        // Five added chunks in batches of two: 2 + 2 + 1.
        assert_eq!(sizes, vec![2, 2, 1]);
    }

    #[test]
    fn test_load_if_exists() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("nonexistent.bin");

        // Nothing has been written yet.
        assert!(EmbedManifest::load_if_exists(&file).unwrap().is_none());

        EmbedManifest::new("test".to_owned(), EmbedSettings::default())
            .save(&file)
            .unwrap();

        assert!(EmbedManifest::load_if_exists(&file).unwrap().is_some());
    }

    #[test]
    fn test_collision_detection() {
        let mut manifest = my_repo_manifest();

        // Same id at two locations, but with different full hashes.
        let mut first = create_test_chunk("ec_same", "src/foo.rs", "foo");
        first.full_hash = "hash1".to_owned();
        let mut second = create_test_chunk("ec_same", "src/bar.rs", "bar");
        second.full_hash = "hash2".to_owned();

        let outcome = manifest.update(&[first, second]);
        assert!(matches!(outcome, Err(EmbedError::HashCollision { .. })));
    }

    #[test]
    fn test_settings_match() {
        let manifest = my_repo_manifest();
        assert!(manifest.settings_match(&EmbedSettings::default()));

        let mut altered = EmbedSettings::default();
        altered.max_tokens = 2000;
        assert!(!manifest.settings_match(&altered));
    }
}