1use std::collections::BTreeMap;
16use std::path::Path;
17
18use bincode::Options;
19use serde::{Deserialize, Serialize};
20
21use super::error::EmbedError;
22use super::hasher::IncrementalHasher;
23use super::types::{ChunkKind, EmbedChunk, EmbedSettings};
24use crate::bincode_safe::deserialize_with_limit;
25
/// Manifest format version stamped into manifests created by `EmbedManifest::new`.
///
/// `EmbedManifest::load` rejects any manifest whose stored version is greater
/// than this value.
pub const MANIFEST_VERSION: u32 = 2;
28
/// Persistent record of the chunks known for a repository.
///
/// Written to disk by [`EmbedManifest::save`], read back by
/// [`EmbedManifest::load`], and compared against a freshly computed chunk set by
/// [`EmbedManifest::diff`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedManifest {
    /// Format version. `new` sets it to [`MANIFEST_VERSION`]; `load` refuses
    /// manifests whose version is greater than that constant.
    pub version: u32,

    /// Repository path passed to `new`.
    pub repo_path: String,

    /// Optional commit hash. `new` leaves it `None` and nothing in this file
    /// assigns it.
    // NOTE(review): presumably the commit the manifest was generated from — confirm
    // against the caller that populates it.
    #[serde(default)]
    pub commit_hash: Option<String>,

    /// Time of the last `save`, in whole seconds since the Unix epoch.
    /// `None` until the manifest has been saved.
    #[serde(default)]
    pub updated_at: Option<u64>,

    /// Settings the chunks were produced with. Compared by `settings_match`,
    /// validated by `load`, and covered by the checksum.
    pub settings: EmbedSettings,

    /// Chunk entries keyed by the string built by `location_key`.
    pub chunks: BTreeMap<String, ManifestEntry>,

    /// Checksum computed by `save` just before serialization. `load` takes the
    /// value out of the manifest to verify it, so a loaded manifest carries
    /// `None` here.
    #[serde(default)]
    pub checksum: Option<String>,
}
82
/// Per-chunk data stored in the manifest, populated by `EmbedManifest::update`
/// from an `EmbedChunk`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ManifestEntry {
    /// Copy of the chunk's `id`. `EmbedManifest::diff` compares this against the
    /// current chunk's id to decide between "unchanged" and "modified".
    pub chunk_id: String,

    /// Copy of the chunk's `full_hash`.
    pub full_hash: String,

    /// Copy of the chunk's `tokens` value.
    // NOTE(review): presumably the chunk's token count — confirm in `EmbedChunk`.
    pub tokens: u32,

    /// Copy of the chunk's `source.lines` pair.
    // NOTE(review): presumably (start line, end line) — confirm in `ChunkSource`.
    pub lines: (u32, u32),
}
98
99impl EmbedManifest {
100 pub fn new(repo_path: String, settings: EmbedSettings) -> Self {
102 Self {
103 version: MANIFEST_VERSION,
104 repo_path,
105 commit_hash: None,
106 updated_at: None,
107 settings,
108 chunks: BTreeMap::new(),
109 checksum: None,
110 }
111 }
112
113 pub fn location_key(file: &str, symbol: &str, kind: ChunkKind) -> String {
118 format!("{}::{}::{}", file, symbol, kind.name())
119 }
120
121 fn compute_checksum(&self) -> String {
123 let mut hasher = IncrementalHasher::new();
124
125 hasher.update_u32(self.version);
127
128 let settings_json = serde_json::to_string(&self.settings).unwrap_or_default();
130 hasher.update_str(&settings_json);
131
132 let mut keys: Vec<_> = self.chunks.keys().collect();
134 keys.sort();
135
136 for key in keys {
137 if let Some(entry) = self.chunks.get(key) {
138 hasher.update_str(key);
139 hasher.update_str(&entry.chunk_id);
140 hasher.update_str(&entry.full_hash);
141 hasher.update_u32(entry.tokens);
142 hasher.update_u32(entry.lines.0);
143 hasher.update_u32(entry.lines.1);
144 }
145 }
146
147 hasher.finalize_hex()
148 }
149
150 pub fn save(&mut self, path: &Path) -> Result<(), EmbedError> {
172 if let Some(parent) = path.parent() {
174 std::fs::create_dir_all(parent)
175 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
176 }
177
178 self.updated_at = Some(
180 std::time::SystemTime::now()
181 .duration_since(std::time::UNIX_EPOCH)
182 .map(|d| d.as_secs())
183 .unwrap_or(0),
184 );
185
186 self.checksum = Some(self.compute_checksum());
188
189 let bytes = bincode::options()
192 .serialize(self)
193 .map_err(|e| EmbedError::SerializationError { reason: e.to_string() })?;
194
195 std::fs::write(path, bytes)
196 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
197
198 Ok(())
199 }
200
201 pub fn load(path: &Path) -> Result<Self, EmbedError> {
203 let bytes = std::fs::read(path)
204 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
205
206 let mut manifest: Self = deserialize_with_limit(&bytes)
207 .map_err(|e| EmbedError::DeserializationError { reason: e.to_string() })?;
208
209 if manifest.version > MANIFEST_VERSION {
211 return Err(EmbedError::ManifestVersionTooNew {
212 found: manifest.version,
213 max_supported: MANIFEST_VERSION,
214 });
215 }
216
217 if let Some(stored_checksum) = manifest.checksum.take() {
219 let computed = manifest.compute_checksum();
220 if !constant_time_eq(stored_checksum.as_bytes(), computed.as_bytes()) {
221 return Err(EmbedError::ManifestCorrupted {
222 path: path.to_path_buf(),
223 expected: stored_checksum,
224 actual: computed,
225 });
226 }
227 }
228
229 manifest.settings.validate()?;
231
232 Ok(manifest)
233 }
234
235 pub fn load_if_exists(path: &Path) -> Result<Option<Self>, EmbedError> {
237 if path.exists() {
238 Ok(Some(Self::load(path)?))
239 } else {
240 Ok(None)
241 }
242 }
243
244 pub fn update(&mut self, chunks: &[EmbedChunk]) -> Result<(), EmbedError> {
246 let mut id_to_hash: BTreeMap<&str, &str> = BTreeMap::new();
249
250 self.chunks.clear();
251
252 for chunk in chunks {
253 if let Some(&existing_hash) = id_to_hash.get(chunk.id.as_str()) {
255 if existing_hash != chunk.full_hash.as_str() {
256 return Err(EmbedError::HashCollision {
257 id: chunk.id.clone(),
258 hash1: existing_hash.to_string(),
259 hash2: chunk.full_hash.clone(),
260 });
261 }
262 }
263 id_to_hash.insert(&chunk.id, &chunk.full_hash);
264
265 let key = Self::location_key(&chunk.source.file, &chunk.source.symbol, chunk.kind);
266
267 self.chunks.insert(
268 key,
269 ManifestEntry {
270 chunk_id: chunk.id.clone(),
271 full_hash: chunk.full_hash.clone(),
272 tokens: chunk.tokens,
273 lines: chunk.source.lines,
274 },
275 );
276 }
277
278 Ok(())
279 }
280
281 pub fn diff(&self, current_chunks: &[EmbedChunk]) -> EmbedDiff {
283 let mut added = Vec::new();
284 let mut modified = Vec::new();
285 let mut removed = Vec::new();
286 let mut unchanged = Vec::new();
287
288 let current_map: BTreeMap<String, &EmbedChunk> = current_chunks
291 .iter()
292 .map(|c| (Self::location_key(&c.source.file, &c.source.symbol, c.kind), c))
293 .collect();
294
295 for (key, entry) in &self.chunks {
297 if let Some(current) = current_map.get(key) {
298 if current.id == entry.chunk_id {
299 unchanged.push(current.id.clone());
300 } else {
301 modified.push(ModifiedChunk {
302 old_id: entry.chunk_id.clone(),
303 new_id: current.id.clone(),
304 chunk: (*current).clone(),
305 });
306 }
307 } else {
308 removed
310 .push(RemovedChunk { id: entry.chunk_id.clone(), location_key: key.clone() });
311 }
312 }
313
314 for (key, chunk) in ¤t_map {
316 if !self.chunks.contains_key(key) {
317 added.push((*chunk).clone());
318 }
319 }
320
321 let summary = DiffSummary {
322 added: added.len(),
323 modified: modified.len(),
324 removed: removed.len(),
325 unchanged: unchanged.len(),
326 total_chunks: current_chunks.len(),
327 };
328
329 EmbedDiff { summary, added, modified, removed, unchanged }
330 }
331
332 pub fn settings_match(&self, settings: &EmbedSettings) -> bool {
334 &self.settings == settings
335 }
336
337 pub fn chunk_count(&self) -> usize {
339 self.chunks.len()
340 }
341}
342
/// Result of `EmbedManifest::diff`: how a current chunk set differs from the
/// stored manifest.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedDiff {
    /// Counts for each category below, plus the size of the compared input.
    pub summary: DiffSummary,

    /// Current chunks whose location key is not present in the manifest.
    pub added: Vec<EmbedChunk>,

    /// Chunks whose location key is present in the manifest but whose id differs.
    pub modified: Vec<ModifiedChunk>,

    /// Manifest entries whose location key has no current chunk.
    pub removed: Vec<RemovedChunk>,

    /// Ids of chunks whose location key and id both match the manifest.
    pub unchanged: Vec<String>,
}
361
362impl EmbedDiff {
363 pub fn has_changes(&self) -> bool {
365 self.summary.added > 0 || self.summary.modified > 0 || self.summary.removed > 0
366 }
367
368 pub fn chunks_to_upsert(&self) -> Vec<&EmbedChunk> {
370 let mut chunks: Vec<&EmbedChunk> = self.added.iter().collect();
371 chunks.extend(self.modified.iter().map(|m| &m.chunk));
372 chunks
373 }
374
375 pub fn ids_to_delete(&self) -> Vec<&str> {
377 let mut ids: Vec<&str> = self.removed.iter().map(|r| r.id.as_str()).collect();
378 ids.extend(self.modified.iter().map(|m| m.old_id.as_str()));
380 ids
381 }
382
383 pub fn batches(&self, batch_size: usize) -> Vec<DiffBatch> {
385 let mut batches = Vec::new();
386 let mut batch_num = 0;
387
388 for chunk in self.added.chunks(batch_size) {
390 batches.push(DiffBatch {
391 batch_number: batch_num,
392 operation: BatchOperation::Upsert,
393 chunks: chunk.to_vec(),
394 ids: Vec::new(),
395 });
396 batch_num += 1;
397 }
398
399 for chunk in self.modified.chunks(batch_size) {
401 batches.push(DiffBatch {
402 batch_number: batch_num,
403 operation: BatchOperation::Upsert,
404 chunks: chunk.iter().map(|m| m.chunk.clone()).collect(),
405 ids: chunk.iter().map(|m| m.old_id.clone()).collect(), });
407 batch_num += 1;
408 }
409
410 for ids in self.removed.chunks(batch_size) {
412 batches.push(DiffBatch {
413 batch_number: batch_num,
414 operation: BatchOperation::Delete,
415 chunks: Vec::new(),
416 ids: ids.iter().map(|r| r.id.clone()).collect(),
417 });
418 batch_num += 1;
419 }
420
421 batches
422 }
423}
424
/// Counts describing an `EmbedDiff`, filled in by `EmbedManifest::diff`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffSummary {
    /// Number of added chunks.
    pub added: usize,

    /// Number of modified chunks.
    pub modified: usize,

    /// Number of removed chunks.
    pub removed: usize,

    /// Number of unchanged chunks.
    pub unchanged: usize,

    /// Length of the chunk slice that was compared against the manifest.
    pub total_chunks: usize,
}
443
/// A chunk whose location key is already in the manifest under a different id.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModifiedChunk {
    /// Chunk id recorded in the manifest.
    pub old_id: String,

    /// Id of the current chunk at the same location key.
    pub new_id: String,

    /// Clone of the current chunk.
    pub chunk: EmbedChunk,
}
456
/// A manifest entry that has no matching chunk in the current set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemovedChunk {
    /// Chunk id recorded in the manifest.
    pub id: String,

    /// Manifest key under which the entry was stored.
    pub location_key: String,
}
466
/// One unit of work produced by `EmbedDiff::batches`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffBatch {
    /// Zero-based position of this batch in the sequence returned by `batches`.
    pub batch_number: usize,

    /// Whether this batch upserts chunks or deletes ids.
    pub operation: BatchOperation,

    /// Chunks to upsert; empty for delete batches.
    pub chunks: Vec<EmbedChunk>,

    /// For delete batches, the ids of removed chunks; for upsert batches built
    /// from modified chunks, their old ids; empty for upsert batches built from
    /// added chunks.
    pub ids: Vec<String>,
}
482
/// Kind of work a `DiffBatch` represents. Variant names are serialized in
/// snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BatchOperation {
    /// Write the batch's chunks.
    Upsert,
    /// Delete the batch's ids.
    Delete,
}
492
/// Compares two byte slices without stopping at the first differing byte.
///
/// Slices of different length are unequal and return immediately. For equal
/// lengths, the XOR of every byte pair is OR-ed into a single accumulator, and
/// the slices are equal exactly when that accumulator ends up zero.
#[inline]
fn constant_time_eq(a: &[u8], b: &[u8]) -> bool {
    if a.len() != b.len() {
        return false;
    }

    let accumulated = a
        .iter()
        .zip(b)
        .fold(0u8, |acc, (lhs, rhs)| acc | (lhs ^ rhs));
    accumulated == 0
}
510
// Unit tests for manifest creation, persistence, diffing, batching and
// collision detection.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{ChunkContext, ChunkSource, RepoIdentifier, Visibility};
    use tempfile::TempDir;

    /// Builds a function chunk with the given id, file and symbol. The full hash
    /// is derived from the id (`<id>_full`); every other field is a fixed value.
    fn create_test_chunk(id: &str, file: &str, symbol: &str) -> EmbedChunk {
        EmbedChunk {
            id: id.to_string(),
            full_hash: format!("{}_full", id),
            content: "fn test() {}".to_string(),
            tokens: 10,
            kind: ChunkKind::Function,
            source: ChunkSource {
                repo: RepoIdentifier::default(),
                file: file.to_string(),
                lines: (1, 5),
                symbol: symbol.to_string(),
                fqn: None,
                language: "rust".to_string(),
                parent: None,
                visibility: Visibility::Public,
                is_test: false,
            },
            context: ChunkContext::default(),
            part: None,
        }
    }

    /// A new manifest carries the current version, the given path and no chunks.
    #[test]
    fn test_new_manifest() {
        let manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        assert_eq!(manifest.version, MANIFEST_VERSION);
        assert_eq!(manifest.repo_path, "my-repo");
        assert!(manifest.chunks.is_empty());
    }

    /// The location key joins file, symbol and kind name with `::`.
    #[test]
    fn test_location_key() {
        let key = EmbedManifest::location_key("src/auth.rs", "validate", ChunkKind::Function);
        assert_eq!(key, "src/auth.rs::validate::function");
    }

    /// A saved manifest loads back with the same path and chunk count.
    #[test]
    fn test_save_and_load() {
        let temp_dir = TempDir::new().unwrap();
        let manifest_path = temp_dir.path().join("test.bin");

        let mut manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        let chunks = vec![
            create_test_chunk("ec_123", "src/foo.rs", "foo"),
            create_test_chunk("ec_456", "src/bar.rs", "bar"),
        ];
        manifest.update(&chunks).unwrap();
        manifest.save(&manifest_path).unwrap();

        let loaded = EmbedManifest::load(&manifest_path).unwrap();
        assert_eq!(loaded.repo_path, "my-repo");
        assert_eq!(loaded.chunks.len(), 2);
    }

    /// Loading a file with a flipped byte must fail.
    #[test]
    fn test_integrity_verification() {
        let temp_dir = TempDir::new().unwrap();
        let manifest_path = temp_dir.path().join("test.bin");

        let mut manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());
        manifest.save(&manifest_path).unwrap();

        // Invert every bit of the byte ten positions before the end of the file.
        let mut bytes = std::fs::read(&manifest_path).unwrap();
        if bytes.len() >= 10 {
            let idx = bytes.len() - 10;
            bytes[idx] ^= 0xFF;
            std::fs::write(&manifest_path, bytes).unwrap();
        }

        // The damage may surface either as a checksum mismatch or as a decode
        // failure; both outcomes are accepted.
        let result = EmbedManifest::load(&manifest_path);
        assert!(matches!(
            result,
            Err(EmbedError::ManifestCorrupted { .. })
                | Err(EmbedError::DeserializationError { .. })
        ));
    }

    /// A chunk not present in an empty manifest is reported as added.
    #[test]
    fn test_diff_added() {
        let manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        let chunks = vec![create_test_chunk("ec_123", "src/foo.rs", "foo")];

        let diff = manifest.diff(&chunks);
        assert_eq!(diff.summary.added, 1);
        assert_eq!(diff.summary.modified, 0);
        assert_eq!(diff.summary.removed, 0);
    }

    /// Same location with a different id is reported as modified, with both ids.
    #[test]
    fn test_diff_modified() {
        let mut manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        let old_chunks = vec![create_test_chunk("ec_old", "src/foo.rs", "foo")];
        manifest.update(&old_chunks).unwrap();

        // Same file and symbol as the stored chunk, but a new id.
        let new_chunks = vec![create_test_chunk("ec_new", "src/foo.rs", "foo")];

        let diff = manifest.diff(&new_chunks);
        assert_eq!(diff.summary.added, 0);
        assert_eq!(diff.summary.modified, 1);
        assert_eq!(diff.summary.removed, 0);
        assert_eq!(diff.modified[0].old_id, "ec_old");
        assert_eq!(diff.modified[0].new_id, "ec_new");
    }

    /// A stored chunk missing from the current set is reported as removed.
    #[test]
    fn test_diff_removed() {
        let mut manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        let old_chunks = vec![create_test_chunk("ec_123", "src/foo.rs", "foo")];
        manifest.update(&old_chunks).unwrap();

        let diff = manifest.diff(&[]);
        assert_eq!(diff.summary.added, 0);
        assert_eq!(diff.summary.modified, 0);
        assert_eq!(diff.summary.removed, 1);
    }

    /// Diffing against the same chunks yields only unchanged entries.
    #[test]
    fn test_diff_unchanged() {
        let mut manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        let chunks = vec![create_test_chunk("ec_123", "src/foo.rs", "foo")];
        manifest.update(&chunks).unwrap();

        let diff = manifest.diff(&chunks);
        assert_eq!(diff.summary.unchanged, 1);
        assert!(!diff.has_changes());
    }

    /// Five added chunks with a batch size of two split into batches of 2, 2, 1.
    #[test]
    fn test_batches() {
        let manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        let chunks: Vec<_> = (0..5)
            .map(|i| {
                create_test_chunk(&format!("ec_{i}"), &format!("src/f{i}.rs"), &format!("f{i}"))
            })
            .collect();

        let diff = manifest.diff(&chunks);
        let batches = diff.batches(2);

        assert_eq!(batches.len(), 3);
        assert_eq!(batches[0].chunks.len(), 2);
        assert_eq!(batches[1].chunks.len(), 2);
        assert_eq!(batches[2].chunks.len(), 1);
    }

    /// `load_if_exists` returns `None` for a missing file and `Some` once saved.
    #[test]
    fn test_load_if_exists() {
        let temp_dir = TempDir::new().unwrap();
        let manifest_path = temp_dir.path().join("nonexistent.bin");

        let result = EmbedManifest::load_if_exists(&manifest_path).unwrap();
        assert!(result.is_none());

        let mut manifest = EmbedManifest::new("test".to_string(), EmbedSettings::default());
        manifest.save(&manifest_path).unwrap();

        let result = EmbedManifest::load_if_exists(&manifest_path).unwrap();
        assert!(result.is_some());
    }

    /// Two chunks sharing an id but carrying different full hashes are rejected.
    #[test]
    fn test_collision_detection() {
        let mut manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        // Same id, different locations, different full hashes.
        let mut chunk1 = create_test_chunk("ec_same", "src/foo.rs", "foo");
        let mut chunk2 = create_test_chunk("ec_same", "src/bar.rs", "bar");
        chunk1.full_hash = "hash1".to_string();
        chunk2.full_hash = "hash2".to_string();

        let result = manifest.update(&[chunk1, chunk2]);
        assert!(matches!(result, Err(EmbedError::HashCollision { .. })));
    }

    /// `settings_match` is true for equal settings and false once a field differs.
    #[test]
    fn test_settings_match() {
        let manifest = EmbedManifest::new("my-repo".to_string(), EmbedSettings::default());

        assert!(manifest.settings_match(&EmbedSettings::default()));

        let mut different = EmbedSettings::default();
        different.max_tokens = 2000;
        assert!(!manifest.settings_match(&different));
    }
}
720}