1use std::collections::BTreeMap;
16use std::path::Path;
17
18use bincode::Options;
19use serde::{Deserialize, Serialize};
20
21use super::error::EmbedError;
22use super::hasher::IncrementalHasher;
23use super::types::{ChunkKind, EmbedChunk, EmbedSettings};
24use crate::bincode_safe::deserialize_with_limit;
25
/// Manifest format version stamped into every manifest created by
/// [`EmbedManifest::new`].
///
/// [`EmbedManifest::load`] refuses any manifest whose stored `version` is
/// greater than this value.
pub const MANIFEST_VERSION: u32 = 2;
28
/// Persistent record of the chunks produced for a repository, used to work
/// out which chunks were added, modified or removed since the last run.
///
/// NOTE: `save` writes this struct with bincode, which encodes fields
/// positionally; reordering or inserting fields changes the on-disk layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedManifest {
    /// Format version; set to [`MANIFEST_VERSION`] by `new` and checked by `load`.
    pub version: u32,

    /// Repository identifier passed to `new`; stored verbatim.
    pub repo_path: String,

    /// Optional commit hash. Starts as `None`; nothing in this file sets it.
    // NOTE(review): presumably the VCS commit the manifest was built from — confirm with callers.
    #[serde(default)]
    pub commit_hash: Option<String>,

    /// Seconds since the Unix epoch at the time of the last `save`, or `None`
    /// if the manifest has never been saved.
    #[serde(default)]
    pub updated_at: Option<u64>,

    /// Settings the chunks were produced with; covered by the checksum and
    /// validated on `load`.
    pub settings: EmbedSettings,

    /// Tracked chunks keyed by [`EmbedManifest::location_key`].
    pub chunks: BTreeMap<String, ManifestEntry>,

    /// Hex digest over version, settings and chunk entries, written by `save`.
    /// `load` verifies it and then leaves this field as `None` in the value it
    /// returns.
    #[serde(default)]
    pub checksum: Option<String>,
}
82
/// What the manifest remembers about one chunk at one location key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ManifestEntry {
    /// Copy of `EmbedChunk::id`; `diff` compares this against the current
    /// chunk's id to decide between "unchanged" and "modified".
    pub chunk_id: String,

    /// Copy of `EmbedChunk::full_hash`.
    pub full_hash: String,

    /// Copy of `EmbedChunk::tokens`.
    pub tokens: u32,

    /// Copy of `EmbedChunk::source.lines`.
    // NOTE(review): presumably (start, end) source lines — confirm base and inclusivity.
    pub lines: (u32, u32),
}
98
99impl EmbedManifest {
100 pub fn new(repo_path: String, settings: EmbedSettings) -> Self {
102 Self {
103 version: MANIFEST_VERSION,
104 repo_path,
105 commit_hash: None,
106 updated_at: None,
107 settings,
108 chunks: BTreeMap::new(),
109 checksum: None,
110 }
111 }
112
113 pub fn location_key(file: &str, symbol: &str, kind: ChunkKind) -> String {
118 format!("{}::{}::{}", file, symbol, kind.name())
119 }
120
121 fn compute_checksum(&self) -> String {
123 let mut hasher = IncrementalHasher::new();
124
125 hasher.update_u32(self.version);
127
128 let settings_json = serde_json::to_string(&self.settings).unwrap_or_default();
130 hasher.update_str(&settings_json);
131
132 let mut keys: Vec<_> = self.chunks.keys().collect();
134 keys.sort();
135
136 for key in keys {
137 if let Some(entry) = self.chunks.get(key) {
138 hasher.update_str(key);
139 hasher.update_str(&entry.chunk_id);
140 hasher.update_str(&entry.full_hash);
141 hasher.update_u32(entry.tokens);
142 hasher.update_u32(entry.lines.0);
143 hasher.update_u32(entry.lines.1);
144 }
145 }
146
147 hasher.finalize_hex()
148 }
149
150 pub fn save(&mut self, path: &Path) -> Result<(), EmbedError> {
172 if let Some(parent) = path.parent() {
174 std::fs::create_dir_all(parent)
175 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
176 }
177
178 self.updated_at = Some(
180 std::time::SystemTime::now()
181 .duration_since(std::time::UNIX_EPOCH)
182 .map(|d| d.as_secs())
183 .unwrap_or(0)
184 );
185
186 self.checksum = Some(self.compute_checksum());
188
189 let bytes = bincode::options()
192 .serialize(self)
193 .map_err(|e| EmbedError::SerializationError { reason: e.to_string() })?;
194
195 std::fs::write(path, bytes)
196 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
197
198 Ok(())
199 }
200
201 pub fn load(path: &Path) -> Result<Self, EmbedError> {
203 let bytes = std::fs::read(path)
204 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
205
206 let mut manifest: Self = deserialize_with_limit(&bytes)
207 .map_err(|e| EmbedError::DeserializationError { reason: e.to_string() })?;
208
209 if manifest.version > MANIFEST_VERSION {
211 return Err(EmbedError::ManifestVersionTooNew {
212 found: manifest.version,
213 max_supported: MANIFEST_VERSION,
214 });
215 }
216
217 if let Some(stored_checksum) = manifest.checksum.take() {
219 let computed = manifest.compute_checksum();
220 if !constant_time_eq(stored_checksum.as_bytes(), computed.as_bytes()) {
221 return Err(EmbedError::ManifestCorrupted {
222 path: path.to_path_buf(),
223 expected: stored_checksum,
224 actual: computed,
225 });
226 }
227 }
228
229 manifest.settings.validate()?;
231
232 Ok(manifest)
233 }
234
235 pub fn load_if_exists(path: &Path) -> Result<Option<Self>, EmbedError> {
237 if path.exists() {
238 Ok(Some(Self::load(path)?))
239 } else {
240 Ok(None)
241 }
242 }
243
244 pub fn update(&mut self, chunks: &[EmbedChunk]) -> Result<(), EmbedError> {
246 let mut id_to_hash: BTreeMap<&str, &str> = BTreeMap::new();
249
250 self.chunks.clear();
251
252 for chunk in chunks {
253 if let Some(&existing_hash) = id_to_hash.get(chunk.id.as_str()) {
255 if existing_hash != chunk.full_hash.as_str() {
256 return Err(EmbedError::HashCollision {
257 id: chunk.id.clone(),
258 hash1: existing_hash.to_string(),
259 hash2: chunk.full_hash.clone(),
260 });
261 }
262 }
263 id_to_hash.insert(&chunk.id, &chunk.full_hash);
264
265 let key = Self::location_key(
266 &chunk.source.file,
267 &chunk.source.symbol,
268 chunk.kind,
269 );
270
271 self.chunks.insert(key, ManifestEntry {
272 chunk_id: chunk.id.clone(),
273 full_hash: chunk.full_hash.clone(),
274 tokens: chunk.tokens,
275 lines: chunk.source.lines,
276 });
277 }
278
279 Ok(())
280 }
281
282 pub fn diff(&self, current_chunks: &[EmbedChunk]) -> EmbedDiff {
284 let mut added = Vec::new();
285 let mut modified = Vec::new();
286 let mut removed = Vec::new();
287 let mut unchanged = Vec::new();
288
289 let current_map: BTreeMap<String, &EmbedChunk> = current_chunks
292 .iter()
293 .map(|c| (Self::location_key(&c.source.file, &c.source.symbol, c.kind), c))
294 .collect();
295
296 for (key, entry) in &self.chunks {
298 if let Some(current) = current_map.get(key) {
299 if current.id == entry.chunk_id {
300 unchanged.push(current.id.clone());
301 } else {
302 modified.push(ModifiedChunk {
303 old_id: entry.chunk_id.clone(),
304 new_id: current.id.clone(),
305 chunk: (*current).clone(),
306 });
307 }
308 } else {
309 removed.push(RemovedChunk {
311 id: entry.chunk_id.clone(),
312 location_key: key.clone(),
313 });
314 }
315 }
316
317 for (key, chunk) in ¤t_map {
319 if !self.chunks.contains_key(key) {
320 added.push((*chunk).clone());
321 }
322 }
323
324 let summary = DiffSummary {
325 added: added.len(),
326 modified: modified.len(),
327 removed: removed.len(),
328 unchanged: unchanged.len(),
329 total_chunks: current_chunks.len(),
330 };
331
332 EmbedDiff { summary, added, modified, removed, unchanged }
333 }
334
335 pub fn settings_match(&self, settings: &EmbedSettings) -> bool {
337 &self.settings == settings
338 }
339
340 pub fn chunk_count(&self) -> usize {
342 self.chunks.len()
343 }
344}
345
/// Result of comparing a manifest with the current set of chunks, as
/// produced by `EmbedManifest::diff`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedDiff {
    /// Counts for each category below plus the size of the current chunk set.
    pub summary: DiffSummary,

    /// Current chunks whose location key is not tracked by the manifest.
    pub added: Vec<EmbedChunk>,

    /// Chunks whose location key is tracked but whose id differs.
    pub modified: Vec<ModifiedChunk>,

    /// Tracked entries whose location key no longer appears in the current set.
    pub removed: Vec<RemovedChunk>,

    /// Ids of chunks whose location key and id both match the manifest.
    pub unchanged: Vec<String>,
}
364
365impl EmbedDiff {
366 pub fn has_changes(&self) -> bool {
368 self.summary.added > 0 || self.summary.modified > 0 || self.summary.removed > 0
369 }
370
371 pub fn chunks_to_upsert(&self) -> Vec<&EmbedChunk> {
373 let mut chunks: Vec<&EmbedChunk> = self.added.iter().collect();
374 chunks.extend(self.modified.iter().map(|m| &m.chunk));
375 chunks
376 }
377
378 pub fn ids_to_delete(&self) -> Vec<&str> {
380 let mut ids: Vec<&str> = self.removed.iter().map(|r| r.id.as_str()).collect();
381 ids.extend(self.modified.iter().map(|m| m.old_id.as_str()));
383 ids
384 }
385
386 pub fn batches(&self, batch_size: usize) -> Vec<DiffBatch> {
388 let mut batches = Vec::new();
389 let mut batch_num = 0;
390
391 for chunk in self.added.chunks(batch_size) {
393 batches.push(DiffBatch {
394 batch_number: batch_num,
395 operation: BatchOperation::Upsert,
396 chunks: chunk.to_vec(),
397 ids: Vec::new(),
398 });
399 batch_num += 1;
400 }
401
402 for chunk in self.modified.chunks(batch_size) {
404 batches.push(DiffBatch {
405 batch_number: batch_num,
406 operation: BatchOperation::Upsert,
407 chunks: chunk.iter().map(|m| m.chunk.clone()).collect(),
408 ids: chunk.iter().map(|m| m.old_id.clone()).collect(), });
410 batch_num += 1;
411 }
412
413 for ids in self.removed.chunks(batch_size) {
415 batches.push(DiffBatch {
416 batch_number: batch_num,
417 operation: BatchOperation::Delete,
418 chunks: Vec::new(),
419 ids: ids.iter().map(|r| r.id.clone()).collect(),
420 });
421 batch_num += 1;
422 }
423
424 batches
425 }
426}
427
/// Counts describing an [`EmbedDiff`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffSummary {
    /// Number of added chunks.
    pub added: usize,

    /// Number of modified chunks.
    pub modified: usize,

    /// Number of removed chunks.
    pub removed: usize,

    /// Number of unchanged chunks.
    pub unchanged: usize,

    /// Length of the current chunk slice that was passed to `diff`.
    pub total_chunks: usize,
}
446
/// A chunk whose location key is unchanged but whose id differs from the
/// one recorded in the manifest.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModifiedChunk {
    /// Chunk id recorded in the manifest.
    pub old_id: String,

    /// Id of the current chunk at the same location key.
    pub new_id: String,

    /// Clone of the current chunk.
    pub chunk: EmbedChunk,
}
459
/// A manifest entry whose location key is absent from the current chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemovedChunk {
    /// Chunk id recorded in the manifest.
    pub id: String,

    /// Location key the entry was tracked under.
    pub location_key: String,
}
469
/// One unit of work produced by `EmbedDiff::batches`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffBatch {
    /// Zero-based sequence number, consecutive across all batches of a diff.
    pub batch_number: usize,

    /// Whether this batch upserts chunks or deletes ids.
    pub operation: BatchOperation,

    /// Chunks to upsert; empty for `Delete` batches.
    pub chunks: Vec<EmbedChunk>,

    /// For `Delete` batches, the ids of removed chunks. For `Upsert` batches
    /// built from modified chunks, the old ids being superseded. Empty for
    /// `Upsert` batches built from added chunks.
    pub ids: Vec<String>,
}
485
/// Kind of work a [`DiffBatch`] carries. Serialized by serde in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BatchOperation {
    /// Insert or replace the chunks in the batch.
    Upsert,
    /// Delete the ids in the batch.
    Delete,
}
495
/// Returns `true` when `a` and `b` hold the same bytes.
///
/// Slices of different length compare unequal immediately. For equal
/// lengths, every byte pair is XOR-ed into one accumulator with no early
/// exit on the first mismatch, so the amount of work does not depend on
/// where the slices differ.
#[inline]
fn constant_time_eq(a: &[u8], b: &[u8]) -> bool {
    a.len() == b.len()
        && a.iter()
            .zip(b)
            .fold(0u8, |diff, (lhs, rhs)| diff | (lhs ^ rhs))
            == 0
}
513
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{ChunkContext, ChunkSource, RepoIdentifier, Visibility};
    use tempfile::TempDir;

    /// Builds a small public Rust function chunk with the given id, file and
    /// symbol; `full_hash` is derived from the id.
    fn create_test_chunk(id: &str, file: &str, symbol: &str) -> EmbedChunk {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: file.to_string(),
            lines: (1, 5),
            symbol: symbol.to_string(),
            fqn: None,
            language: "rust".to_string(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
        };

        EmbedChunk {
            id: id.to_string(),
            full_hash: format!("{}_full", id),
            content: "fn test() {}".to_string(),
            tokens: 10,
            kind: ChunkKind::Function,
            source,
            context: ChunkContext::default(),
            part: None,
        }
    }

    /// Empty manifest for "my-repo" with default settings.
    fn repo_manifest() -> EmbedManifest {
        EmbedManifest::new("my-repo".to_string(), EmbedSettings::default())
    }

    #[test]
    fn test_new_manifest() {
        let manifest = repo_manifest();

        assert_eq!(manifest.version, MANIFEST_VERSION);
        assert_eq!(manifest.repo_path, "my-repo");
        assert!(manifest.chunks.is_empty());
    }

    #[test]
    fn test_location_key() {
        let key = EmbedManifest::location_key("src/auth.rs", "validate", ChunkKind::Function);
        assert_eq!(key, "src/auth.rs::validate::function");
    }

    #[test]
    fn test_save_and_load() {
        let temp_dir = TempDir::new().unwrap();
        let manifest_path = temp_dir.path().join("test.bin");

        let mut manifest = repo_manifest();
        manifest
            .update(&[
                create_test_chunk("ec_123", "src/foo.rs", "foo"),
                create_test_chunk("ec_456", "src/bar.rs", "bar"),
            ])
            .unwrap();
        manifest.save(&manifest_path).unwrap();

        let loaded = EmbedManifest::load(&manifest_path).unwrap();
        assert_eq!(loaded.repo_path, "my-repo");
        assert_eq!(loaded.chunks.len(), 2);
    }

    #[test]
    fn test_integrity_verification() {
        let temp_dir = TempDir::new().unwrap();
        let manifest_path = temp_dir.path().join("test.bin");

        repo_manifest().save(&manifest_path).unwrap();

        // Flip every bit of the byte ten positions from the end of the file.
        let mut bytes = std::fs::read(&manifest_path).unwrap();
        if let Some(idx) = bytes.len().checked_sub(10) {
            bytes[idx] ^= 0xFF;
            std::fs::write(&manifest_path, bytes).unwrap();
        }

        // Depending on which byte was hit, the damage is caught either by
        // the decoder or by the checksum comparison.
        let result = EmbedManifest::load(&manifest_path);
        let rejected = matches!(
            result,
            Err(EmbedError::ManifestCorrupted { .. }) | Err(EmbedError::DeserializationError { .. })
        );
        assert!(rejected);
    }

    #[test]
    fn test_diff_added() {
        let manifest = repo_manifest();

        let diff = manifest.diff(&[create_test_chunk("ec_123", "src/foo.rs", "foo")]);

        assert_eq!(diff.summary.added, 1);
        assert_eq!(diff.summary.modified, 0);
        assert_eq!(diff.summary.removed, 0);
    }

    #[test]
    fn test_diff_modified() {
        let mut manifest = repo_manifest();
        manifest
            .update(&[create_test_chunk("ec_old", "src/foo.rs", "foo")])
            .unwrap();

        // Same location, different id.
        let diff = manifest.diff(&[create_test_chunk("ec_new", "src/foo.rs", "foo")]);

        assert_eq!(diff.summary.added, 0);
        assert_eq!(diff.summary.modified, 1);
        assert_eq!(diff.summary.removed, 0);

        let change = &diff.modified[0];
        assert_eq!(change.old_id, "ec_old");
        assert_eq!(change.new_id, "ec_new");
    }

    #[test]
    fn test_diff_removed() {
        let mut manifest = repo_manifest();
        manifest
            .update(&[create_test_chunk("ec_123", "src/foo.rs", "foo")])
            .unwrap();

        let diff = manifest.diff(&[]);

        assert_eq!(diff.summary.added, 0);
        assert_eq!(diff.summary.modified, 0);
        assert_eq!(diff.summary.removed, 1);
    }

    #[test]
    fn test_diff_unchanged() {
        let chunks = [create_test_chunk("ec_123", "src/foo.rs", "foo")];

        let mut manifest = repo_manifest();
        manifest.update(&chunks).unwrap();

        let diff = manifest.diff(&chunks);

        assert_eq!(diff.summary.unchanged, 1);
        assert!(!diff.has_changes());
    }

    #[test]
    fn test_batches() {
        let manifest = repo_manifest();

        let mut chunks = Vec::new();
        for i in 0..5 {
            chunks.push(create_test_chunk(
                &format!("ec_{i}"),
                &format!("src/f{i}.rs"),
                &format!("f{i}"),
            ));
        }

        let batches = manifest.diff(&chunks).batches(2);

        // Five added chunks in batches of two: 2 + 2 + 1.
        let sizes: Vec<usize> = batches.iter().map(|b| b.chunks.len()).collect();
        assert_eq!(batches.len(), 3);
        assert_eq!(sizes[0], 2);
        assert_eq!(sizes[1], 2);
        assert_eq!(sizes[2], 1);
    }

    #[test]
    fn test_load_if_exists() {
        let temp_dir = TempDir::new().unwrap();
        let manifest_path = temp_dir.path().join("nonexistent.bin");

        let before = EmbedManifest::load_if_exists(&manifest_path).unwrap();
        assert!(before.is_none());

        EmbedManifest::new("test".to_string(), EmbedSettings::default())
            .save(&manifest_path)
            .unwrap();

        let after = EmbedManifest::load_if_exists(&manifest_path).unwrap();
        assert!(after.is_some());
    }

    #[test]
    fn test_collision_detection() {
        let mut manifest = repo_manifest();

        // Same id, different full hashes.
        let first = EmbedChunk {
            full_hash: "hash1".to_string(),
            ..create_test_chunk("ec_same", "src/foo.rs", "foo")
        };
        let second = EmbedChunk {
            full_hash: "hash2".to_string(),
            ..create_test_chunk("ec_same", "src/bar.rs", "bar")
        };

        let result = manifest.update(&[first, second]);
        assert!(matches!(result, Err(EmbedError::HashCollision { .. })));
    }

    #[test]
    fn test_settings_match() {
        let manifest = repo_manifest();

        assert!(manifest.settings_match(&EmbedSettings::default()));

        let mut different = EmbedSettings::default();
        different.max_tokens = 2000;
        assert!(!manifest.settings_match(&different));
    }
}