Skip to main content

infiniloom_engine/embedding/
manifest.rs

1//! Manifest storage and diffing for incremental updates
2//!
3//! The manifest tracks all chunks generated for a repository, enabling:
4//! - Incremental updates (only re-embed changed chunks)
5//! - Change detection (added, modified, removed)
6//! - Integrity verification (detect tampering)
7//!
8//! # Storage Format
9//!
10//! Manifests are stored in bincode format (5-10x faster than JSON) with:
11//! - BLAKE3 integrity checksum
12//! - Version compatibility checking
13//! - Settings validation
14
15use std::collections::BTreeMap;
16use std::path::Path;
17
18use serde::{Deserialize, Serialize};
19
20use super::error::EmbedError;
21use super::hasher::IncrementalHasher;
22use super::types::{ChunkKind, EmbedChunk, EmbedSettings};
23use crate::bincode_safe::{deserialize_with_limit, serialize};
24
/// Format version stamped into every manifest created by this build.
///
/// Loading rejects any file whose stored version is greater than this value.
pub const MANIFEST_VERSION: u32 = 3;
27
28/// Manifest tracking all chunks for incremental updates
29///
30/// # Determinism Note
31///
32/// The manifest binary file is **not byte-deterministic** across saves due to the
33/// `updated_at` timestamp. However, the **checksum is deterministic** because it
34/// excludes the timestamp from its calculation.
35///
36/// For comparing manifests:
37/// - **Wrong**: Compare raw binary files (will differ due to timestamp)
38/// - **Right**: Compare checksums via `manifest.checksum` (deterministic)
39///
40/// This design allows incremental updates while still detecting actual content changes.
41///
42/// # CI/CD Integration
43///
44/// If you need byte-deterministic manifests (e.g., for Docker layer caching):
45/// - Compare checksums instead of file hashes
46/// - Or set `updated_at = None` before saving in test environments
47#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct EmbedManifest {
49    /// Manifest format version
50    pub version: u32,
51
52    /// Relative repository path (from git root or CWD)
53    pub repo_path: String,
54
55    /// Git commit hash when manifest was created (for reference only)
56    /// Note: We always serialize Option fields for bincode compatibility
57    #[serde(default)]
58    pub commit_hash: Option<String>,
59
60    /// Timestamp of last update (Unix seconds)
61    ///
62    /// **Important**: This field is excluded from the integrity checksum calculation
63    /// to allow the checksum to remain stable across re-saves of unchanged content.
64    /// The binary file will differ byte-for-byte on each save, but the checksum will
65    /// only change if actual chunk content changes.
66    #[serde(default)]
67    pub updated_at: Option<u64>,
68
69    /// Settings used to generate chunks (part of integrity)
70    pub settings: EmbedSettings,
71
72    /// All chunks indexed by location key
73    /// Using BTreeMap for deterministic iteration order (critical for cross-platform consistency)
74    pub chunks: BTreeMap<String, ManifestEntry>,
75
76    /// Integrity checksum (BLAKE3 of settings + sorted chunk entries)
77    /// Excluded from serialization, computed on save, verified on load
78    #[serde(default)]
79    pub checksum: Option<String>,
80}
81
82/// Entry in the manifest for a single chunk
83#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
84pub struct ManifestEntry {
85    /// Content-addressable chunk ID (128-bit)
86    pub chunk_id: String,
87
88    /// Full content hash for collision detection (256-bit)
89    pub full_hash: String,
90
91    /// Token count
92    pub tokens: u32,
93
94    /// Line range (1-indexed, inclusive)
95    pub lines: (u32, u32),
96}
97
98impl EmbedManifest {
99    /// Create a new empty manifest
100    pub fn new(repo_path: String, settings: EmbedSettings) -> Self {
101        Self {
102            version: MANIFEST_VERSION,
103            repo_path,
104            commit_hash: None,
105            updated_at: None,
106            settings,
107            chunks: BTreeMap::new(),
108            checksum: None,
109        }
110    }
111
112    /// Generate deterministic location key for a chunk
113    ///
114    /// Format: `file::symbol::kind`
115    /// Uses `::` as separator (unlikely in paths/symbols)
116    pub fn location_key(file: &str, symbol: &str, kind: ChunkKind) -> String {
117        format!("{}::{}::{}", file, symbol, kind.name())
118    }
119
120    /// Compute integrity checksum over settings and chunk entries
121    fn compute_checksum(&self) -> String {
122        let mut hasher = IncrementalHasher::new();
123
124        // Hash manifest version
125        hasher.update_u32(self.version);
126
127        // Hash settings (affects chunk generation)
128        let settings_json = serde_json::to_string(&self.settings).unwrap_or_default();
129        hasher.update_str(&settings_json);
130
131        // Hash chunks in deterministic order (sorted by key)
132        let mut keys: Vec<_> = self.chunks.keys().collect();
133        keys.sort();
134
135        for key in keys {
136            if let Some(entry) = self.chunks.get(key) {
137                hasher.update_str(key);
138                hasher.update_str(&entry.chunk_id);
139                hasher.update_str(&entry.full_hash);
140                hasher.update_u32(entry.tokens);
141                hasher.update_u32(entry.lines.0);
142                hasher.update_u32(entry.lines.1);
143            }
144        }
145
146        hasher.finalize_hex()
147    }
148
149    /// Save manifest to file with integrity checksum
150    ///
151    /// # Behavior
152    ///
153    /// This method:
154    /// 1. Updates `updated_at` to the current timestamp
155    /// 2. Computes a new checksum (excluding timestamp)
156    /// 3. Serializes to bincode format
157    ///
158    /// # Determinism
159    ///
160    /// The resulting binary file is **not byte-deterministic** because the timestamp
161    /// changes on every save. However, the checksum **is deterministic** - it only
162    /// changes when actual chunk content or settings change.
163    ///
164    /// For deterministic testing, set `self.updated_at = None` before saving.
165    ///
166    /// # Note
167    ///
168    /// This method mutates `self` to set checksum and timestamp.
169    /// This avoids cloning the entire manifest (which can be large).
170    pub fn save(&mut self, path: &Path) -> Result<(), EmbedError> {
171        // Create parent directories
172        if let Some(parent) = path.parent() {
173            std::fs::create_dir_all(parent)
174                .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
175        }
176
177        // Update timestamp
178        self.updated_at = Some(
179            std::time::SystemTime::now()
180                .duration_since(std::time::UNIX_EPOCH)
181                .map(|d| d.as_secs())
182                .unwrap_or(0),
183        );
184
185        // Compute checksum (excludes timestamp for deterministic checksums across saves)
186        self.checksum = Some(self.compute_checksum());
187
188        // Use bincode for faster I/O (5-10x faster than JSON for large manifests)
189        let bytes = serialize(self)
190            .map_err(|e| EmbedError::SerializationError { reason: e.to_string() })?;
191
192        // Atomic write: write to temp file first, then rename
193        // Use PID in temp name to prevent collisions from concurrent embed runs
194        let tmp_path = path.with_extension(format!("tmp.{}", std::process::id()));
195        std::fs::write(&tmp_path, bytes)
196            .map_err(|e| EmbedError::IoError { path: tmp_path.clone(), source: e })?;
197
198        // Atomic rename (protects against corruption on crash)
199        std::fs::rename(&tmp_path, path)
200            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
201
202        Ok(())
203    }
204
205    /// Load manifest from file with integrity verification
206    pub fn load(path: &Path) -> Result<Self, EmbedError> {
207        let bytes = std::fs::read(path)
208            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
209
210        let mut manifest: Self = deserialize_with_limit(&bytes)
211            .map_err(|e| EmbedError::DeserializationError { reason: format!("Failed to read embed manifest (it may have been created by an older version of infiniloom; delete {:?} and re-run): {}", path, e) })?;
212
213        // Version check
214        if manifest.version > MANIFEST_VERSION {
215            return Err(EmbedError::ManifestVersionTooNew {
216                found: manifest.version,
217                max_supported: MANIFEST_VERSION,
218            });
219        }
220
221        // Integrity verification using constant-time comparison to prevent timing attacks
222        if let Some(stored_checksum) = manifest.checksum.take() {
223            let computed = manifest.compute_checksum();
224            if !constant_time_eq(stored_checksum.as_bytes(), computed.as_bytes()) {
225                return Err(EmbedError::ManifestCorrupted {
226                    path: path.to_path_buf(),
227                    expected: stored_checksum,
228                    actual: computed,
229                });
230            }
231        }
232
233        // Validate settings
234        manifest.settings.validate()?;
235
236        Ok(manifest)
237    }
238
239    /// Load manifest if it exists, otherwise return None
240    pub fn load_if_exists(path: &Path) -> Result<Option<Self>, EmbedError> {
241        if path.exists() {
242            Ok(Some(Self::load(path)?))
243        } else {
244            Ok(None)
245        }
246    }
247
248    /// Update manifest with current chunks, detecting collisions
249    pub fn update(&mut self, chunks: &[EmbedChunk]) -> Result<(), EmbedError> {
250        // Collision detection: track id -> full_hash mappings
251        // Using BTreeMap for deterministic iteration (critical for cross-platform consistency)
252        let mut id_to_hash: BTreeMap<&str, &str> = BTreeMap::new();
253
254        self.chunks.clear();
255
256        for chunk in chunks {
257            // Check for hash collision
258            if let Some(&existing_hash) = id_to_hash.get(chunk.id.as_str()) {
259                if existing_hash != chunk.full_hash.as_str() {
260                    return Err(EmbedError::HashCollision {
261                        id: chunk.id.clone(),
262                        hash1: existing_hash.to_owned(),
263                        hash2: chunk.full_hash.clone(),
264                    });
265                }
266            }
267            id_to_hash.insert(&chunk.id, &chunk.full_hash);
268
269            let key = Self::location_key(&chunk.source.file, &chunk.source.symbol, chunk.kind);
270
271            self.chunks.insert(
272                key,
273                ManifestEntry {
274                    chunk_id: chunk.id.clone(),
275                    full_hash: chunk.full_hash.clone(),
276                    tokens: chunk.tokens,
277                    lines: chunk.source.lines,
278                },
279            );
280        }
281
282        Ok(())
283    }
284
285    /// Compute diff between current chunks and manifest
286    pub fn diff(&self, current_chunks: &[EmbedChunk]) -> EmbedDiff {
287        let mut added = Vec::new();
288        let mut modified = Vec::new();
289        let mut removed = Vec::new();
290        let mut unchanged = Vec::new();
291
292        // Build map of current chunks by location key
293        // Using BTreeMap for deterministic iteration in "added" detection
294        let current_map: BTreeMap<String, &EmbedChunk> = current_chunks
295            .iter()
296            .map(|c| (Self::location_key(&c.source.file, &c.source.symbol, c.kind), c))
297            .collect();
298
299        // Find modified and unchanged (iterate manifest)
300        for (key, entry) in &self.chunks {
301            if let Some(current) = current_map.get(key) {
302                if current.id == entry.chunk_id {
303                    unchanged.push(current.id.clone());
304                } else {
305                    modified.push(ModifiedChunk {
306                        old_id: entry.chunk_id.clone(),
307                        new_id: current.id.clone(),
308                        chunk: (*current).clone(),
309                    });
310                }
311            } else {
312                // In manifest but not in current = removed
313                removed
314                    .push(RemovedChunk { id: entry.chunk_id.clone(), location_key: key.clone() });
315            }
316        }
317
318        // Find added (in current but not in manifest)
319        for (key, chunk) in &current_map {
320            if !self.chunks.contains_key(key) {
321                added.push((*chunk).clone());
322            }
323        }
324
325        let summary = DiffSummary {
326            added: added.len(),
327            modified: modified.len(),
328            removed: removed.len(),
329            unchanged: unchanged.len(),
330            total_chunks: current_chunks.len(),
331        };
332
333        EmbedDiff { summary, added, modified, removed, unchanged }
334    }
335
336    /// Check if settings match the manifest settings
337    pub fn settings_match(&self, settings: &EmbedSettings) -> bool {
338        &self.settings == settings
339    }
340
341    /// Get the number of chunks in the manifest
342    pub fn chunk_count(&self) -> usize {
343        self.chunks.len()
344    }
345}
346
347/// Result of diffing current state against manifest
348#[derive(Debug, Clone, Serialize, Deserialize)]
349pub struct EmbedDiff {
350    /// Summary statistics
351    pub summary: DiffSummary,
352
353    /// New chunks (not in manifest)
354    pub added: Vec<EmbedChunk>,
355
356    /// Changed chunks (different content)
357    pub modified: Vec<ModifiedChunk>,
358
359    /// Deleted chunks (in manifest but not current)
360    pub removed: Vec<RemovedChunk>,
361
362    /// Unchanged chunk IDs (same content)
363    pub unchanged: Vec<String>,
364}
365
366impl EmbedDiff {
367    /// Check if there are any changes
368    pub fn has_changes(&self) -> bool {
369        self.summary.added > 0 || self.summary.modified > 0 || self.summary.removed > 0
370    }
371
372    /// Get all chunks that need to be upserted (added + modified)
373    pub fn chunks_to_upsert(&self) -> Vec<&EmbedChunk> {
374        let mut chunks: Vec<&EmbedChunk> = self.added.iter().collect();
375        chunks.extend(self.modified.iter().map(|m| &m.chunk));
376        chunks
377    }
378
379    /// Get all IDs that need to be deleted
380    pub fn ids_to_delete(&self) -> Vec<&str> {
381        let mut ids: Vec<&str> = self.removed.iter().map(|r| r.id.as_str()).collect();
382        // Also delete old IDs for modified chunks
383        ids.extend(self.modified.iter().map(|m| m.old_id.as_str()));
384        ids
385    }
386
387    /// Split diff into batches for vector DB operations
388    pub fn batches(&self, batch_size: usize) -> Vec<DiffBatch> {
389        let mut batches = Vec::new();
390        let mut batch_num = 0;
391
392        // Batch added chunks
393        for chunk in self.added.chunks(batch_size) {
394            batches.push(DiffBatch {
395                batch_number: batch_num,
396                operation: BatchOperation::Upsert,
397                chunks: chunk.to_vec(),
398                ids: Vec::new(),
399            });
400            batch_num += 1;
401        }
402
403        // Batch modified chunks
404        for chunk in self.modified.chunks(batch_size) {
405            batches.push(DiffBatch {
406                batch_number: batch_num,
407                operation: BatchOperation::Upsert,
408                chunks: chunk.iter().map(|m| m.chunk.clone()).collect(),
409                ids: chunk.iter().map(|m| m.old_id.clone()).collect(), // Old IDs to delete
410            });
411            batch_num += 1;
412        }
413
414        // Batch removed IDs
415        for ids in self.removed.chunks(batch_size) {
416            batches.push(DiffBatch {
417                batch_number: batch_num,
418                operation: BatchOperation::Delete,
419                chunks: Vec::new(),
420                ids: ids.iter().map(|r| r.id.clone()).collect(),
421            });
422            batch_num += 1;
423        }
424
425        batches
426    }
427}
428
429/// Summary of changes between manifest and current state
430#[derive(Debug, Clone, Serialize, Deserialize)]
431pub struct DiffSummary {
432    /// Number of new chunks
433    pub added: usize,
434
435    /// Number of modified chunks
436    pub modified: usize,
437
438    /// Number of removed chunks
439    pub removed: usize,
440
441    /// Number of unchanged chunks
442    pub unchanged: usize,
443
444    /// Total chunks in current state
445    pub total_chunks: usize,
446}
447
448/// A chunk that was modified (content changed)
449#[derive(Debug, Clone, Serialize, Deserialize)]
450pub struct ModifiedChunk {
451    /// Previous chunk ID
452    pub old_id: String,
453
454    /// New chunk ID
455    pub new_id: String,
456
457    /// The updated chunk
458    pub chunk: EmbedChunk,
459}
460
461/// A chunk that was removed
462#[derive(Debug, Clone, Serialize, Deserialize)]
463pub struct RemovedChunk {
464    /// Chunk ID that was removed
465    pub id: String,
466
467    /// Location key for reference
468    pub location_key: String,
469}
470
471/// Batch of operations for vector DB
472#[derive(Debug, Clone, Serialize, Deserialize)]
473pub struct DiffBatch {
474    /// Batch number (0-indexed)
475    pub batch_number: usize,
476
477    /// Operation type
478    pub operation: BatchOperation,
479
480    /// Chunks to upsert (for Upsert operation)
481    pub chunks: Vec<EmbedChunk>,
482
483    /// IDs to delete (for Delete operation, or old IDs for Upsert)
484    pub ids: Vec<String>,
485}
486
487/// Type of batch operation
488#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
489#[serde(rename_all = "snake_case")]
490pub enum BatchOperation {
491    /// Insert or update chunks
492    Upsert,
493    /// Delete chunks by ID
494    Delete,
495}
496
/// Byte-slice equality check that does not stop at the first mismatch.
///
/// Returns `true` only when both slices have the same length and identical
/// contents. Slices of different length are rejected up front; for equal
/// lengths every byte pair is XOR-ed and OR-ed into one accumulator, so the
/// loop has no data-dependent early exit at the source level.
#[inline]
fn constant_time_eq(a: &[u8], b: &[u8]) -> bool {
    if a.len() != b.len() {
        return false;
    }

    // Any differing pair leaves at least one bit set in the accumulator.
    let difference = a
        .iter()
        .zip(b.iter())
        .fold(0u8, |acc, (lhs, rhs)| acc | (lhs ^ rhs));

    difference == 0
}
515
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{ChunkContext, ChunkSource, RepoIdentifier, Visibility};
    use tempfile::TempDir;

    /// Build a minimal function chunk with the given ID, file and symbol.
    fn create_test_chunk(id: &str, file: &str, symbol: &str) -> EmbedChunk {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: file.to_owned(),
            lines: (1, 5),
            symbol: symbol.to_owned(),
            fqn: None,
            language: "rust".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };

        EmbedChunk {
            id: id.to_owned(),
            full_hash: format!("{}_full", id),
            content: "fn test() {}".to_owned(),
            tokens: 10,
            kind: ChunkKind::Function,
            source,
            children_ids: Vec::new(),
            context: ChunkContext::default(),
            repr: "code".to_string(),
            code_chunk_id: None,
            part: None,
        }
    }

    /// Build an empty manifest with default settings for `repo`.
    fn empty_manifest(repo: &str) -> EmbedManifest {
        EmbedManifest::new(repo.to_owned(), EmbedSettings::default())
    }

    #[test]
    fn test_new_manifest() {
        let fresh = empty_manifest("my-repo");

        assert_eq!(fresh.version, MANIFEST_VERSION);
        assert_eq!(fresh.repo_path, "my-repo");
        assert!(fresh.chunks.is_empty());
    }

    #[test]
    fn test_location_key() {
        let key = EmbedManifest::location_key("src/auth.rs", "validate", ChunkKind::Function);
        assert_eq!(key, "src/auth.rs::validate::function");
    }

    #[test]
    fn test_save_and_load() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("test.bin");

        // Populate a manifest with two chunks and write it out
        let mut original = empty_manifest("my-repo");
        let chunks = vec![
            create_test_chunk("ec_123", "src/foo.rs", "foo"),
            create_test_chunk("ec_456", "src/bar.rs", "bar"),
        ];
        original.update(&chunks).unwrap();
        original.save(&target).unwrap();

        // Read it back and check the round trip
        let restored = EmbedManifest::load(&target).unwrap();
        assert_eq!(restored.repo_path, "my-repo");
        assert_eq!(restored.chunks.len(), 2);
    }

    #[test]
    fn test_integrity_verification() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("test.bin");

        // Write a valid manifest
        let mut manifest = empty_manifest("my-repo");
        manifest.save(&target).unwrap();

        // Flip every bit of the byte ten positions before the end
        let mut raw = std::fs::read(&target).unwrap();
        if let Some(pos) = raw.len().checked_sub(10) {
            raw[pos] ^= 0xFF;
            std::fs::write(&target, raw).unwrap();
        }

        // Loading must fail, either at decode time or at checksum time
        let outcome = EmbedManifest::load(&target);
        assert!(matches!(
            outcome,
            Err(EmbedError::ManifestCorrupted { .. })
                | Err(EmbedError::DeserializationError { .. })
        ));
    }

    #[test]
    fn test_diff_added() {
        let manifest = empty_manifest("my-repo");
        let current = vec![create_test_chunk("ec_123", "src/foo.rs", "foo")];

        let diff = manifest.diff(&current);

        assert_eq!(diff.summary.added, 1);
        assert_eq!(diff.summary.modified, 0);
        assert_eq!(diff.summary.removed, 0);
    }

    #[test]
    fn test_diff_modified() {
        let mut manifest = empty_manifest("my-repo");
        let before = vec![create_test_chunk("ec_old", "src/foo.rs", "foo")];
        manifest.update(&before).unwrap();

        // Same location with a different ID counts as modified
        let after = vec![create_test_chunk("ec_new", "src/foo.rs", "foo")];
        let diff = manifest.diff(&after);

        assert_eq!(diff.summary.added, 0);
        assert_eq!(diff.summary.modified, 1);
        assert_eq!(diff.summary.removed, 0);
        assert_eq!(diff.modified[0].old_id, "ec_old");
        assert_eq!(diff.modified[0].new_id, "ec_new");
    }

    #[test]
    fn test_diff_removed() {
        let mut manifest = empty_manifest("my-repo");
        let before = vec![create_test_chunk("ec_123", "src/foo.rs", "foo")];
        manifest.update(&before).unwrap();

        // Nothing in the current state, so everything was removed
        let diff = manifest.diff(&[]);

        assert_eq!(diff.summary.added, 0);
        assert_eq!(diff.summary.modified, 0);
        assert_eq!(diff.summary.removed, 1);
    }

    #[test]
    fn test_diff_unchanged() {
        let mut manifest = empty_manifest("my-repo");
        let chunks = vec![create_test_chunk("ec_123", "src/foo.rs", "foo")];
        manifest.update(&chunks).unwrap();

        // Diffing against the very same chunks reports no changes
        let diff = manifest.diff(&chunks);

        assert_eq!(diff.summary.unchanged, 1);
        assert!(!diff.has_changes());
    }

    #[test]
    fn test_batches() {
        let manifest = empty_manifest("my-repo");

        let mut chunks = Vec::new();
        for i in 0..5 {
            chunks.push(create_test_chunk(
                &format!("ec_{i}"),
                &format!("src/f{i}.rs"),
                &format!("f{i}"),
            ));
        }

        let batches = manifest.diff(&chunks).batches(2);

        // 5 chunks / batch size 2 = 3 batches
        assert_eq!(batches.len(), 3);
        assert_eq!(batches[0].chunks.len(), 2);
        assert_eq!(batches[1].chunks.len(), 2);
        assert_eq!(batches[2].chunks.len(), 1);
    }

    #[test]
    fn test_load_if_exists() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("nonexistent.bin");

        // Missing file yields None
        let missing = EmbedManifest::load_if_exists(&target).unwrap();
        assert!(missing.is_none());

        // Once saved, the same path yields Some
        let mut manifest = empty_manifest("test");
        manifest.save(&target).unwrap();

        let present = EmbedManifest::load_if_exists(&target).unwrap();
        assert!(present.is_some());
    }

    #[test]
    fn test_collision_detection() {
        let mut manifest = empty_manifest("my-repo");

        // Two chunks sharing an ID while disagreeing on the full hash
        let mut first = create_test_chunk("ec_same", "src/foo.rs", "foo");
        let mut second = create_test_chunk("ec_same", "src/bar.rs", "bar");
        first.full_hash = "hash1".to_owned();
        second.full_hash = "hash2".to_owned();

        let outcome = manifest.update(&[first, second]);
        assert!(matches!(outcome, Err(EmbedError::HashCollision { .. })));
    }

    #[test]
    fn test_settings_match() {
        let manifest = empty_manifest("my-repo");

        assert!(manifest.settings_match(&EmbedSettings::default()));

        let mut altered = EmbedSettings::default();
        altered.max_tokens = 2000;
        assert!(!manifest.settings_match(&altered));
    }
}