memvid_core/memvid/
lifecycle.rs

1//! Lifecycle management for creating and opening `.mv2` memories.
2//!
3//! Responsibilities:
4//! - Enforce single-file invariant (no sidecars) and take OS locks.
5//! - Bootstrap headers, internal WAL, and TOC on create, and recover them on open.
6//! - Validate TOC/footer layout, recover the latest valid footer when needed.
7//! - Wire up index state (lex/vector/time) without mutating payload bytes.
8
9use std::convert::TryInto;
10use std::fs::{File, OpenOptions};
11use std::io::{Read, Seek, SeekFrom};
12use std::panic;
13use std::path::{Path, PathBuf};
14use std::sync::{Arc, RwLock};
15
16use crate::constants::{MAGIC, SPEC_VERSION, WAL_OFFSET, WAL_SIZE_TINY};
17use crate::error::{MemvidError, Result};
18use crate::footer::{FooterSlice, find_last_valid_footer};
19use crate::io::header::HeaderCodec;
20#[cfg(feature = "parallel_segments")]
21use crate::io::manifest_wal::ManifestWal;
22use crate::io::wal::EmbeddedWal;
23use crate::lock::{FileLock, LockMode};
24#[cfg(feature = "lex")]
25use crate::search::{EmbeddedLexStorage, TantivyEngine};
26#[cfg(feature = "temporal_track")]
27use crate::types::FrameId;
28#[cfg(feature = "parallel_segments")]
29use crate::types::IndexSegmentRef;
30use crate::types::{
31    FrameStatus, Header, IndexManifests, LogicMesh, MemoriesTrack, SchemaRegistry, SegmentCatalog,
32    SketchTrack, TicketRef, Tier, Toc, VectorCompression,
33};
34#[cfg(feature = "temporal_track")]
35use crate::{TemporalTrack, temporal_track_read};
36use crate::{lex::LexIndex, vec::VecIndex};
37use blake3::Hasher;
38use memmap2::Mmap;
39
/// Default for `LockSettings::timeout_ms`, in milliseconds.
const DEFAULT_LOCK_TIMEOUT_MS: u64 = 250;
/// Default for `LockSettings::heartbeat_ms`, in milliseconds.
const DEFAULT_HEARTBEAT_MS: u64 = 2_000;
/// Default for `LockSettings::stale_grace_ms`, in milliseconds.
const DEFAULT_STALE_GRACE_MS: u64 = 10_000;
43
44/// Primary handle for interacting with a `.mv2` memory file.
45///
46/// Holds the file descriptor, lock, header, TOC, and in-memory index state. Mutations
47/// append to the embedded WAL and are materialized at commit time to keep the layout deterministic.
48pub struct Memvid {
49    pub(crate) file: File,
50    pub(crate) path: PathBuf,
51    pub(crate) lock: FileLock,
52    pub(crate) read_only: bool,
53    pub(crate) header: Header,
54    pub(crate) toc: Toc,
55    pub(crate) wal: EmbeddedWal,
56    /// Number of frame inserts appended to WAL but not yet materialized into `toc.frames`.
57    ///
58    /// This lets frontends predict stable frame IDs before an explicit commit.
59    pub(crate) pending_frame_inserts: u64,
60    pub(crate) data_end: u64,
61    pub(crate) generation: u64,
62    pub(crate) lock_settings: LockSettings,
63    pub(crate) lex_enabled: bool,
64    pub(crate) lex_index: Option<LexIndex>,
65    #[cfg(feature = "lex")]
66    #[allow(dead_code)]
67    pub(crate) lex_storage: Arc<RwLock<EmbeddedLexStorage>>,
68    pub(crate) vec_enabled: bool,
69    pub(crate) vec_compression: VectorCompression,
70    pub(crate) vec_index: Option<VecIndex>,
71    /// CLIP visual embeddings index (separate from vec due to different dimensions)
72    pub(crate) clip_enabled: bool,
73    pub(crate) clip_index: Option<crate::clip::ClipIndex>,
74    pub(crate) dirty: bool,
75    #[cfg(feature = "lex")]
76    pub(crate) tantivy: Option<TantivyEngine>,
77    #[cfg(feature = "lex")]
78    pub(crate) tantivy_dirty: bool,
79    #[cfg(feature = "temporal_track")]
80    pub(crate) temporal_track: Option<TemporalTrack>,
81    #[cfg(feature = "parallel_segments")]
82    pub(crate) manifest_wal: Option<ManifestWal>,
83    /// In-memory track for structured memory cards.
84    pub(crate) memories_track: MemoriesTrack,
85    /// In-memory Logic-Mesh graph for entity-relationship traversal.
86    pub(crate) logic_mesh: LogicMesh,
87    /// In-memory sketch track for fast candidate generation.
88    pub(crate) sketch_track: SketchTrack,
89    /// Schema registry for predicate validation.
90    pub(crate) schema_registry: SchemaRegistry,
91    /// Whether to enforce strict schema validation on card insert.
92    pub(crate) schema_strict: bool,
93    /// Active replay session being recorded (if any).
94    #[cfg(feature = "replay")]
95    pub(crate) active_session: Option<crate::replay::ActiveSession>,
96    /// Completed sessions stored in memory (until persisted to file).
97    #[cfg(feature = "replay")]
98    pub(crate) completed_sessions: Vec<crate::replay::ReplaySession>,
99}
100
/// Options governing how a `.mv2` memory is opened for reading.
#[derive(Debug, Clone, Copy)]
pub struct OpenReadOptions {
    /// When `true`, `Memvid::open_read_only_with_options` goes through the regular
    /// `Memvid::open` path instead of the read-only snapshot path.
    pub allow_repair: bool,
}
106
/// Tunable parameters for file locking on a `.mv2` memory.
///
/// The `_ms` fields are expressed in milliseconds.
#[derive(Debug, Clone)]
pub struct LockSettings {
    /// Lock timeout, in milliseconds.
    pub timeout_ms: u64,
    /// Heartbeat interval, in milliseconds.
    pub heartbeat_ms: u64,
    /// Grace period related to stale locks, in milliseconds.
    pub stale_grace_ms: u64,
    /// Flag controlling forced handling of stale locks (off by default).
    pub force_stale: bool,
    /// Optional command string associated with the lock holder.
    pub command: Option<String>,
}
115
116impl Default for LockSettings {
117    fn default() -> Self {
118        Self {
119            timeout_ms: DEFAULT_LOCK_TIMEOUT_MS,
120            heartbeat_ms: DEFAULT_HEARTBEAT_MS,
121            stale_grace_ms: DEFAULT_STALE_GRACE_MS,
122            force_stale: false,
123            command: None,
124        }
125    }
126}
127
128impl Default for OpenReadOptions {
129    fn default() -> Self {
130        Self {
131            allow_repair: false,
132        }
133    }
134}
135
136impl Memvid {
137    /// Create a new, empty `.mv2` file with an embedded WAL and empty TOC.
138    /// The file is locked exclusively for the lifetime of the handle.
139    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
140        let path_ref = path.as_ref();
141        ensure_single_file(path_ref)?;
142
143        OpenOptions::new()
144            .read(true)
145            .write(true)
146            .create(true)
147            .truncate(true)
148            .open(path_ref)?;
149        let (mut file, lock) = FileLock::open_and_lock(path_ref)?;
150
151        let header = Header {
152            magic: MAGIC,
153            version: SPEC_VERSION,
154            footer_offset: WAL_OFFSET + WAL_SIZE_TINY,
155            wal_offset: WAL_OFFSET,
156            wal_size: WAL_SIZE_TINY,
157            wal_checkpoint_pos: 0,
158            wal_sequence: 0,
159            toc_checksum: [0u8; 32],
160        };
161
162        let mut toc = empty_toc();
163        // If lex feature is enabled, set the catalog flag immediately
164        #[cfg(feature = "lex")]
165        {
166            toc.segment_catalog.lex_enabled = true;
167        }
168        file.set_len(header.footer_offset)?;
169        HeaderCodec::write(&mut file, &header)?;
170
171        let wal = EmbeddedWal::open(&file, &header)?;
172        let data_end = header.footer_offset;
173        #[cfg(feature = "lex")]
174        let lex_storage = Arc::new(RwLock::new(EmbeddedLexStorage::new()));
175        #[cfg(feature = "parallel_segments")]
176        let manifest_wal = ManifestWal::open(manifest_wal_path(path_ref))?;
177        #[cfg(feature = "parallel_segments")]
178        let manifest_wal_entries = manifest_wal.replay()?;
179
180        let mut memvid = Self {
181            file,
182            path: path_ref.to_path_buf(),
183            lock,
184            read_only: false,
185            header,
186            toc,
187            wal,
188            pending_frame_inserts: 0,
189            data_end,
190            generation: 0,
191            lock_settings: LockSettings::default(),
192            lex_enabled: cfg!(feature = "lex"), // Enable by default if feature is enabled
193            lex_index: None,
194            #[cfg(feature = "lex")]
195            lex_storage,
196            vec_enabled: cfg!(feature = "vec"), // Enable by default if feature is enabled
197            vec_compression: VectorCompression::None,
198            vec_index: None,
199            clip_enabled: cfg!(feature = "clip"), // Enable by default if feature is enabled
200            clip_index: None,
201            dirty: false,
202            #[cfg(feature = "lex")]
203            tantivy: None,
204            #[cfg(feature = "lex")]
205            tantivy_dirty: false,
206            #[cfg(feature = "temporal_track")]
207            temporal_track: None,
208            #[cfg(feature = "parallel_segments")]
209            manifest_wal: Some(manifest_wal),
210            memories_track: MemoriesTrack::new(),
211            logic_mesh: LogicMesh::new(),
212            sketch_track: SketchTrack::default(),
213            schema_registry: SchemaRegistry::new(),
214            schema_strict: false,
215            #[cfg(feature = "replay")]
216            active_session: None,
217            #[cfg(feature = "replay")]
218            completed_sessions: Vec::new(),
219        };
220
221        #[cfg(feature = "lex")]
222        memvid.init_tantivy()?;
223
224        #[cfg(feature = "parallel_segments")]
225        memvid.load_manifest_segments(manifest_wal_entries);
226
227        memvid.bootstrap_segment_catalog();
228
229        // Create empty manifests for enabled indexes so they persist across open/close
230        let empty_offset = memvid.data_end;
231        let empty_checksum = *b"\xe3\xb0\xc4\x42\x98\xfc\x1c\x14\x9a\xfb\xf4\xc8\x99\x6f\xb9\x24\
232                                \x27\xae\x41\xe4\x64\x9b\x93\x4c\xa4\x95\x99\x1b\x78\x52\xb8\x55";
233
234        #[cfg(feature = "lex")]
235        if memvid.lex_enabled && memvid.toc.indexes.lex.is_none() {
236            memvid.toc.indexes.lex = Some(crate::types::LexIndexManifest {
237                doc_count: 0,
238                generation: 0,
239                bytes_offset: empty_offset,
240                bytes_length: 0,
241                checksum: empty_checksum,
242            });
243        }
244
245        #[cfg(feature = "vec")]
246        if memvid.vec_enabled && memvid.toc.indexes.vec.is_none() {
247            memvid.toc.indexes.vec = Some(crate::types::VecIndexManifest {
248                vector_count: 0,
249                dimension: 0,
250                bytes_offset: empty_offset,
251                bytes_length: 0,
252                checksum: empty_checksum,
253                compression_mode: memvid.vec_compression.clone(),
254            });
255        }
256
257        memvid.rewrite_toc_footer()?;
258        memvid.header.toc_checksum = memvid.toc.toc_checksum;
259        crate::persist_header(&mut memvid.file, &memvid.header)?;
260        memvid.file.sync_all()?;
261        Ok(memvid)
262    }
263
264    pub fn lock_settings(&self) -> &LockSettings {
265        &self.lock_settings
266    }
267
268    pub fn lock_settings_mut(&mut self) -> &mut LockSettings {
269        &mut self.lock_settings
270    }
271
272    /// Set the vector compression mode for this memory
273    /// Must be called before ingesting documents with embeddings
274    pub fn set_vector_compression(&mut self, compression: VectorCompression) {
275        self.vec_compression = compression;
276    }
277
278    /// Get the current vector compression mode
279    pub fn vector_compression(&self) -> &VectorCompression {
280        &self.vec_compression
281    }
282
283    /// Predict the next frame ID that would be assigned to a new insert.
284    ///
285    /// Frame IDs are dense indices into `toc.frames`. When a memory is mutable, inserts are first
286    /// appended to the embedded WAL and only materialized into `toc.frames` on commit. This helper
287    /// lets frontends allocate stable frame IDs before an explicit commit.
288    pub fn next_frame_id(&self) -> u64 {
289        (self.toc.frames.len() as u64).saturating_add(self.pending_frame_inserts)
290    }
291
292    /// Returns the total number of frames in the memory.
293    ///
294    /// This includes all frames regardless of status (active, deleted, etc.).
295    #[must_use]
296    pub fn frame_count(&self) -> usize {
297        self.toc.frames.len()
298    }
299
300    fn open_locked(mut file: File, lock: FileLock, path_ref: &Path) -> Result<Self> {
301        // Fast-path detection for encrypted capsules (.mv2e).
302        // This avoids confusing "invalid header" errors and provides an actionable hint.
303        let mut magic = [0u8; 4];
304        let is_mv2e = file.read_exact(&mut magic).is_ok() && magic == *b"MV2E";
305        file.seek(SeekFrom::Start(0))?;
306        if is_mv2e {
307            return Err(MemvidError::EncryptedFile {
308                path: path_ref.to_path_buf(),
309                hint: format!("Run: memvid unlock {}", path_ref.display()),
310            });
311        }
312
313        let mut header = HeaderCodec::read(&mut file)?;
314        let toc = match read_toc(&mut file, &header) {
315            Ok(toc) => toc,
316            Err(err @ MemvidError::Decode(_)) | Err(err @ MemvidError::InvalidToc { .. }) => {
317                tracing::info!("toc decode failed ({}); attempting recovery", err);
318                let (toc, recovered_offset) = recover_toc(&mut file, Some(header.footer_offset))?;
319                if recovered_offset != header.footer_offset
320                    || header.toc_checksum != toc.toc_checksum
321                {
322                    header.footer_offset = recovered_offset;
323                    header.toc_checksum = toc.toc_checksum;
324                    crate::persist_header(&mut file, &header)?;
325                }
326                toc
327            }
328            Err(err) => return Err(err),
329        };
330        let checksum_result = toc.verify_checksum();
331
332        // Validate segment integrity early to catch corruption before loading indexes
333        let file_len = file.metadata().map(|m| m.len()).unwrap_or(0);
334        if let Err(e) = validate_segment_integrity(&toc, &header, file_len) {
335            tracing::warn!("Segment integrity validation failed: {}", e);
336            // Don't fail file open - let doctor handle it
337            // This is just an early warning system
338        }
339        ensure_non_overlapping_frames(&toc, file_len)?;
340
341        let wal = EmbeddedWal::open(&file, &header)?;
342        #[cfg(feature = "lex")]
343        let lex_storage = Arc::new(RwLock::new(EmbeddedLexStorage::from_manifest(
344            toc.indexes.lex.as_ref(),
345            &toc.indexes.lex_segments,
346        )));
347        #[cfg(feature = "parallel_segments")]
348        let manifest_wal = ManifestWal::open(manifest_wal_path(path_ref))?;
349        #[cfg(feature = "parallel_segments")]
350        let manifest_wal_entries = manifest_wal.replay()?;
351
352        let generation = detect_generation(&file)?.unwrap_or(0);
353        let read_only = lock.mode() == LockMode::Shared;
354
355        let mut memvid = Self {
356            file,
357            path: path_ref.to_path_buf(),
358            lock,
359            read_only,
360            header,
361            toc,
362            wal,
363            pending_frame_inserts: 0,
364            data_end: 0,
365            generation,
366            lock_settings: LockSettings::default(),
367            lex_enabled: false,
368            lex_index: None,
369            #[cfg(feature = "lex")]
370            lex_storage,
371            vec_enabled: false,
372            vec_compression: VectorCompression::None,
373            vec_index: None,
374            clip_enabled: false,
375            clip_index: None,
376            dirty: false,
377            #[cfg(feature = "lex")]
378            tantivy: None,
379            #[cfg(feature = "lex")]
380            tantivy_dirty: false,
381            #[cfg(feature = "temporal_track")]
382            temporal_track: None,
383            #[cfg(feature = "parallel_segments")]
384            manifest_wal: Some(manifest_wal),
385            memories_track: MemoriesTrack::new(),
386            logic_mesh: LogicMesh::new(),
387            sketch_track: SketchTrack::default(),
388            schema_registry: SchemaRegistry::new(),
389            schema_strict: false,
390            #[cfg(feature = "replay")]
391            active_session: None,
392            #[cfg(feature = "replay")]
393            completed_sessions: Vec::new(),
394        };
395        memvid.data_end = compute_data_end(&memvid.toc, &memvid.header);
396        // Use consolidated helper for lex_enabled check
397        memvid.lex_enabled = has_lex_index(&memvid.toc);
398        if memvid.lex_enabled {
399            memvid.load_lex_index_from_manifest()?;
400        }
401        #[cfg(feature = "lex")]
402        {
403            memvid.init_tantivy()?;
404        }
405        memvid.vec_enabled =
406            memvid.toc.indexes.vec.is_some() || !memvid.toc.segment_catalog.vec_segments.is_empty();
407        if memvid.vec_enabled {
408            memvid.load_vec_index_from_manifest()?;
409        }
410        memvid.clip_enabled = memvid.toc.indexes.clip.is_some();
411        if memvid.clip_enabled {
412            memvid.load_clip_index_from_manifest()?;
413        }
414        memvid.recover_wal()?;
415        #[cfg(feature = "parallel_segments")]
416        memvid.load_manifest_segments(manifest_wal_entries);
417        memvid.bootstrap_segment_catalog();
418        #[cfg(feature = "temporal_track")]
419        memvid.ensure_temporal_track_loaded()?;
420        memvid.load_memories_track()?;
421        memvid.load_logic_mesh()?;
422        memvid.load_sketch_track()?;
423        if checksum_result.is_err() {
424            memvid.toc.verify_checksum()?;
425            if memvid.toc.toc_checksum != memvid.header.toc_checksum {
426                memvid.header.toc_checksum = memvid.toc.toc_checksum;
427                crate::persist_header(&mut memvid.file, &memvid.header)?;
428                memvid.file.sync_all()?;
429            }
430        }
431        Ok(memvid)
432    }
433
434    /// Open an existing `.mv2` with exclusive access, performing recovery if needed.
435    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
436        let path_ref = path.as_ref();
437        ensure_single_file(path_ref)?;
438
439        let (file, lock) = FileLock::open_and_lock(path_ref)?;
440        Self::open_locked(file, lock, path_ref)
441    }
442
443    pub fn open_read_only<P: AsRef<Path>>(path: P) -> Result<Self> {
444        Self::open_read_only_with_options(path, OpenReadOptions::default())
445    }
446
447    pub fn open_read_only_with_options<P: AsRef<Path>>(
448        path: P,
449        options: OpenReadOptions,
450    ) -> Result<Self> {
451        let path_ref = path.as_ref();
452        ensure_single_file(path_ref)?;
453
454        if options.allow_repair {
455            return Self::open(path_ref);
456        }
457
458        Self::open_read_only_snapshot(path_ref)
459    }
460
461    fn open_read_only_snapshot(path_ref: &Path) -> Result<Self> {
462        let mut file = OpenOptions::new().read(true).write(true).open(path_ref)?;
463        let TailSnapshot {
464            toc,
465            footer_offset,
466            data_end,
467            generation,
468        } = load_tail_snapshot(&file)?;
469
470        let mut header = HeaderCodec::read(&mut file)?;
471        header.footer_offset = footer_offset;
472        header.toc_checksum = toc.toc_checksum;
473
474        let lock = FileLock::acquire_with_mode(&file, LockMode::Shared)?;
475        let wal = EmbeddedWal::open_read_only(&file, &header)?;
476
477        #[cfg(feature = "lex")]
478        let lex_storage = Arc::new(RwLock::new(EmbeddedLexStorage::from_manifest(
479            toc.indexes.lex.as_ref(),
480            &toc.indexes.lex_segments,
481        )));
482
483        let mut memvid = Self {
484            file,
485            path: path_ref.to_path_buf(),
486            lock,
487            read_only: true,
488            header,
489            toc,
490            wal,
491            pending_frame_inserts: 0,
492            data_end,
493            generation,
494            lock_settings: LockSettings::default(),
495            lex_enabled: false,
496            lex_index: None,
497            #[cfg(feature = "lex")]
498            lex_storage,
499            vec_enabled: false,
500            vec_compression: VectorCompression::None,
501            vec_index: None,
502            clip_enabled: false,
503            clip_index: None,
504            dirty: false,
505            #[cfg(feature = "lex")]
506            tantivy: None,
507            #[cfg(feature = "lex")]
508            tantivy_dirty: false,
509            #[cfg(feature = "temporal_track")]
510            temporal_track: None,
511            #[cfg(feature = "parallel_segments")]
512            manifest_wal: None,
513            memories_track: MemoriesTrack::new(),
514            logic_mesh: LogicMesh::new(),
515            sketch_track: SketchTrack::default(),
516            schema_registry: SchemaRegistry::new(),
517            schema_strict: false,
518            #[cfg(feature = "replay")]
519            active_session: None,
520            #[cfg(feature = "replay")]
521            completed_sessions: Vec::new(),
522        };
523
524        // Use consolidated helper for lex_enabled check
525        memvid.lex_enabled = has_lex_index(&memvid.toc);
526        if memvid.lex_enabled {
527            memvid.load_lex_index_from_manifest()?;
528        }
529        #[cfg(feature = "lex")]
530        memvid.init_tantivy()?;
531
532        memvid.vec_enabled =
533            memvid.toc.indexes.vec.is_some() || !memvid.toc.segment_catalog.vec_segments.is_empty();
534        if memvid.vec_enabled {
535            memvid.load_vec_index_from_manifest()?;
536        }
537        memvid.clip_enabled = memvid.toc.indexes.clip.is_some();
538        if memvid.clip_enabled {
539            memvid.load_clip_index_from_manifest()?;
540        }
541        // Load memories track, Logic-Mesh, and sketch track if present
542        memvid.load_memories_track()?;
543        memvid.load_logic_mesh()?;
544        memvid.load_sketch_track()?;
545
546        memvid.bootstrap_segment_catalog();
547        #[cfg(feature = "temporal_track")]
548        memvid.ensure_temporal_track_loaded()?;
549
550        Ok(memvid)
551    }
552
553    pub(crate) fn try_open<P: AsRef<Path>>(path: P) -> Result<Self> {
554        let path_ref = path.as_ref();
555        ensure_single_file(path_ref)?;
556
557        let file = OpenOptions::new().read(true).write(true).open(path_ref)?;
558        let lock = match FileLock::try_acquire(&file, path_ref)? {
559            Some(lock) => lock,
560            None => {
561                return Err(MemvidError::Lock(
562                    "exclusive access unavailable for doctor".to_string(),
563                ));
564            }
565        };
566        Self::open_locked(file, lock, path_ref)
567    }
568
569    fn bootstrap_segment_catalog(&mut self) {
570        let catalog = &mut self.toc.segment_catalog;
571        if catalog.version == 0 {
572            catalog.version = 1;
573        }
574        if catalog.next_segment_id == 0 {
575            let mut max_id = 0u64;
576            for descriptor in &catalog.lex_segments {
577                max_id = max_id.max(descriptor.common.segment_id);
578            }
579            for descriptor in &catalog.vec_segments {
580                max_id = max_id.max(descriptor.common.segment_id);
581            }
582            for descriptor in &catalog.time_segments {
583                max_id = max_id.max(descriptor.common.segment_id);
584            }
585            #[cfg(feature = "temporal_track")]
586            for descriptor in &catalog.temporal_segments {
587                max_id = max_id.max(descriptor.common.segment_id);
588            }
589            #[cfg(feature = "parallel_segments")]
590            for descriptor in &catalog.index_segments {
591                max_id = max_id.max(descriptor.common.segment_id);
592            }
593            if max_id > 0 {
594                catalog.next_segment_id = max_id.saturating_add(1);
595            }
596        }
597    }
598
599    #[cfg(feature = "parallel_segments")]
600    fn load_manifest_segments(&mut self, entries: Vec<IndexSegmentRef>) {
601        if entries.is_empty() {
602            return;
603        }
604        for entry in entries {
605            let duplicate = self
606                .toc
607                .segment_catalog
608                .index_segments
609                .iter()
610                .any(|existing| existing.common.segment_id == entry.common.segment_id);
611            if !duplicate {
612                self.toc.segment_catalog.index_segments.push(entry);
613            }
614        }
615    }
616
617    /// Load the memories track from the manifest if present.
618    fn load_memories_track(&mut self) -> Result<()> {
619        let manifest = match &self.toc.memories_track {
620            Some(m) => m,
621            None => return Ok(()),
622        };
623
624        // Read the compressed data from the file
625        let mut buf = vec![0u8; manifest.bytes_length as usize];
626        self.file
627            .seek(std::io::SeekFrom::Start(manifest.bytes_offset))?;
628        self.file.read_exact(&mut buf)?;
629
630        // Verify checksum
631        let actual_checksum: [u8; 32] = blake3::hash(&buf).into();
632        if actual_checksum != manifest.checksum {
633            return Err(MemvidError::InvalidToc {
634                reason: "memories track checksum mismatch".into(),
635            });
636        }
637
638        // Deserialize the memories track
639        self.memories_track = MemoriesTrack::deserialize(&buf)?;
640
641        Ok(())
642    }
643
644    /// Load the Logic-Mesh from the manifest if present.
645    fn load_logic_mesh(&mut self) -> Result<()> {
646        let manifest = match &self.toc.logic_mesh {
647            Some(m) => m,
648            None => return Ok(()),
649        };
650
651        // Read the serialized data from the file
652        let mut buf = vec![0u8; manifest.bytes_length as usize];
653        self.file
654            .seek(std::io::SeekFrom::Start(manifest.bytes_offset))?;
655        self.file.read_exact(&mut buf)?;
656
657        // Verify checksum
658        let actual_checksum: [u8; 32] = blake3::hash(&buf).into();
659        if actual_checksum != manifest.checksum {
660            return Err(MemvidError::InvalidToc {
661                reason: "logic mesh checksum mismatch".into(),
662            });
663        }
664
665        // Deserialize the logic mesh
666        self.logic_mesh = LogicMesh::deserialize(&buf)?;
667
668        Ok(())
669    }
670
671    /// Load the sketch track from the manifest if present.
672    fn load_sketch_track(&mut self) -> Result<()> {
673        let manifest = match &self.toc.sketch_track {
674            Some(m) => m.clone(),
675            None => return Ok(()),
676        };
677
678        // Read and deserialize the sketch track (read_sketch_track handles seeking and checksum)
679        self.sketch_track = crate::types::read_sketch_track(
680            &mut self.file,
681            manifest.bytes_offset,
682            manifest.bytes_length,
683        )?;
684
685        Ok(())
686    }
687
688    #[cfg(feature = "temporal_track")]
689    pub(crate) fn ensure_temporal_track_loaded(&mut self) -> Result<()> {
690        if self.temporal_track.is_some() {
691            return Ok(());
692        }
693        let manifest = match &self.toc.temporal_track {
694            Some(manifest) => manifest.clone(),
695            None => return Ok(()),
696        };
697        if manifest.bytes_length == 0 {
698            return Ok(());
699        }
700        let file_len = self.file.metadata()?.len();
701        let Some(end) = manifest.bytes_offset.checked_add(manifest.bytes_length) else {
702            return Ok(());
703        };
704        if end > file_len {
705            return Ok(());
706        }
707        match temporal_track_read(&mut self.file, manifest.bytes_offset, manifest.bytes_length) {
708            Ok(track) => self.temporal_track = Some(track),
709            Err(MemvidError::InvalidTemporalTrack { .. }) => {
710                return Ok(());
711            }
712            Err(err) => return Err(err),
713        }
714        Ok(())
715    }
716
717    #[cfg(feature = "temporal_track")]
718    pub(crate) fn temporal_track_ref(&mut self) -> Result<Option<&TemporalTrack>> {
719        self.ensure_temporal_track_loaded()?;
720        Ok(self.temporal_track.as_ref())
721    }
722
723    #[cfg(feature = "temporal_track")]
724    pub(crate) fn temporal_anchor_timestamp(&mut self, frame_id: FrameId) -> Result<Option<i64>> {
725        self.ensure_temporal_track_loaded()?;
726        let Some(track) = self.temporal_track.as_ref() else {
727            return Ok(None);
728        };
729        if !track.capabilities().has_anchors {
730            return Ok(None);
731        }
732        Ok(track
733            .anchor_for_frame(frame_id)
734            .map(|anchor| anchor.anchor_ts))
735    }
736
737    #[cfg(feature = "temporal_track")]
738    pub(crate) fn clear_temporal_track_cache(&mut self) {
739        self.temporal_track = None;
740    }
741
742    #[cfg(feature = "temporal_track")]
743    pub(crate) fn effective_temporal_timestamp(
744        &mut self,
745        frame_id: FrameId,
746        fallback: i64,
747    ) -> Result<i64> {
748        Ok(self
749            .temporal_anchor_timestamp(frame_id)?
750            .unwrap_or(fallback))
751    }
752
753    #[cfg(not(feature = "temporal_track"))]
754    pub(crate) fn effective_temporal_timestamp(
755        &mut self,
756        _frame_id: crate::types::FrameId,
757        fallback: i64,
758    ) -> Result<i64> {
759        Ok(fallback)
760    }
761
762    /// Get current memory binding information.
763    ///
764    /// Returns the binding if this file is bound to a dashboard memory,
765    /// or None if unbound.
766    pub fn get_memory_binding(&self) -> Option<&crate::types::MemoryBinding> {
767        self.toc.memory_binding.as_ref()
768    }
769
770    /// Bind this file to a dashboard memory.
771    ///
772    /// This stores the binding in the TOC and applies a temporary ticket for initial binding.
773    /// The caller should follow up with `apply_signed_ticket` for cryptographic verification.
774    ///
775    /// # Errors
776    ///
777    /// Returns `MemoryAlreadyBound` if this file is already bound to a different memory.
778    #[allow(deprecated)]
779    pub fn bind_memory(
780        &mut self,
781        binding: crate::types::MemoryBinding,
782        ticket: crate::types::Ticket,
783    ) -> Result<()> {
784        // Check existing binding
785        if let Some(existing) = self.get_memory_binding() {
786            if existing.memory_id != binding.memory_id {
787                return Err(MemvidError::MemoryAlreadyBound {
788                    existing_memory_id: existing.memory_id,
789                    existing_memory_name: existing.memory_name.clone(),
790                    bound_at: existing.bound_at.to_rfc3339(),
791                });
792            }
793        }
794
795        // Apply ticket for capacity
796        self.apply_ticket(ticket)?;
797
798        // Store binding in TOC
799        self.toc.memory_binding = Some(binding);
800        self.dirty = true;
801
802        Ok(())
803    }
804
805    /// Set only the memory binding without applying a ticket.
806    ///
807    /// This is used when the caller will immediately follow up with `apply_signed_ticket`
808    /// to apply the cryptographically verified ticket. This avoids the sequence number
809    /// conflict that occurs when using `bind_memory` with a temporary ticket.
810    ///
811    /// # Errors
812    ///
813    /// Returns `MemoryAlreadyBound` if this file is already bound to a different memory.
814    pub fn set_memory_binding_only(&mut self, binding: crate::types::MemoryBinding) -> Result<()> {
815        self.ensure_writable()?;
816
817        // Check existing binding
818        if let Some(existing) = self.get_memory_binding() {
819            if existing.memory_id != binding.memory_id {
820                return Err(MemvidError::MemoryAlreadyBound {
821                    existing_memory_id: existing.memory_id,
822                    existing_memory_name: existing.memory_name.clone(),
823                    bound_at: existing.bound_at.to_rfc3339(),
824                });
825            }
826        }
827
828        // Store binding in TOC (without applying a ticket)
829        self.toc.memory_binding = Some(binding);
830        self.dirty = true;
831
832        Ok(())
833    }
834
835    /// Unbind this file from its dashboard memory.
836    ///
837    /// This clears the binding and reverts to free tier capacity (1 GB).
838    pub fn unbind_memory(&mut self) -> Result<()> {
839        self.toc.memory_binding = None;
840        // Revert to free tier
841        self.toc.ticket_ref = crate::types::TicketRef {
842            issuer: "free-tier".into(),
843            seq_no: 1,
844            expires_in_secs: 0,
845            capacity_bytes: crate::types::Tier::Free.capacity_bytes(),
846            verified: false,
847        };
848        self.dirty = true;
849        Ok(())
850    }
851}
852
853pub(crate) fn read_toc(file: &mut File, header: &Header) -> Result<Toc> {
854    use crate::footer::{CommitFooter, FOOTER_SIZE};
855
856    let len = file.metadata()?.len();
857    if len < header.footer_offset {
858        return Err(MemvidError::InvalidToc {
859            reason: "footer offset beyond file length".into(),
860        });
861    }
862
863    // Read the entire region from footer_offset to EOF (includes TOC + footer)
864    file.seek(SeekFrom::Start(header.footer_offset))?;
865    let total_size = (len - header.footer_offset) as usize;
866
867    if total_size < FOOTER_SIZE {
868        return Err(MemvidError::InvalidToc {
869            reason: "region too small to contain footer".into(),
870        });
871    }
872
873    let mut buf = Vec::with_capacity(total_size);
874    file.read_to_end(&mut buf)?;
875
876    // Parse the footer (last FOOTER_SIZE bytes)
877    let footer_start = buf.len() - FOOTER_SIZE;
878    let footer_bytes = &buf[footer_start..];
879    let footer = CommitFooter::decode(footer_bytes).ok_or(MemvidError::InvalidToc {
880        reason: "failed to decode commit footer".into(),
881    })?;
882
883    // Extract only the TOC bytes (excluding the footer)
884    let toc_bytes = &buf[..footer_start];
885    if toc_bytes.len() != footer.toc_len as usize {
886        return Err(MemvidError::InvalidToc {
887            reason: "toc length mismatch".into(),
888        });
889    }
890    if !footer.hash_matches(toc_bytes) {
891        return Err(MemvidError::InvalidToc {
892            reason: "commit footer toc hash mismatch".into(),
893        });
894    }
895
896    verify_toc_prefix(toc_bytes)?;
897    let toc = Toc::decode(toc_bytes)?;
898    Ok(toc)
899}
900
901fn verify_toc_prefix(bytes: &[u8]) -> Result<()> {
902    const MAX_SEGMENTS: u64 = 1_000_000;
903    const MAX_FRAMES: u64 = 1_000_000;
904    const MIN_SEGMENT_META_BYTES: u64 = 32;
905    const MIN_FRAME_BYTES: u64 = 64;
906    // TOC trailer layout (little-endian):
907    // [toc_version:u64][segments_len:u64][frames_len:u64]...
908    let read_u64 = |range: std::ops::Range<usize>, context: &str| -> Result<u64> {
909        let slice = bytes.get(range).ok_or_else(|| MemvidError::InvalidToc {
910            reason: context.to_string().into(),
911        })?;
912        let array: [u8; 8] = slice.try_into().map_err(|_| MemvidError::InvalidToc {
913            reason: context.to_string().into(),
914        })?;
915        Ok(u64::from_le_bytes(array))
916    };
917
918    if bytes.len() < 24 {
919        return Err(MemvidError::InvalidToc {
920            reason: "toc trailer too small".into(),
921        });
922    }
923    let toc_version = read_u64(0..8, "toc version missing or truncated")?;
924    if toc_version > 32 {
925        return Err(MemvidError::InvalidToc {
926            reason: "toc version unreasonable".into(),
927        });
928    }
929    let segments_len = read_u64(8..16, "segment count missing or truncated")?;
930    if segments_len > MAX_SEGMENTS {
931        return Err(MemvidError::InvalidToc {
932            reason: "segment count unreasonable".into(),
933        });
934    }
935    let frames_len = read_u64(16..24, "frame count missing or truncated")?;
936    if frames_len > MAX_FRAMES {
937        return Err(MemvidError::InvalidToc {
938            reason: "frame count unreasonable".into(),
939        });
940    }
941    let required = segments_len
942        .saturating_mul(MIN_SEGMENT_META_BYTES)
943        .saturating_add(frames_len.saturating_mul(MIN_FRAME_BYTES));
944    if required > bytes.len() as u64 {
945        return Err(MemvidError::InvalidToc {
946            reason: "toc payload inconsistent with counts".into(),
947        });
948    }
949    Ok(())
950}
951
952/// Ensure frame payloads do not overlap each other or exceed file boundary.
953///
954/// Frames in the TOC are ordered by frame_id, not by payload_offset, so we must
955/// sort by payload_offset before checking for overlaps.
956///
957/// Note: Frames with payload_length == 0 are "virtual" frames (e.g., document
958/// frames that reference chunks) and are skipped from this check.
959fn ensure_non_overlapping_frames(toc: &Toc, file_len: u64) -> Result<()> {
960    // Collect active frames with actual payloads and sort by payload_offset
961    let mut frames_by_offset: Vec<_> = toc
962        .frames
963        .iter()
964        .filter(|f| f.status == FrameStatus::Active && f.payload_length > 0)
965        .collect();
966    frames_by_offset.sort_by_key(|f| f.payload_offset);
967
968    let mut previous_end = 0u64;
969    for frame in frames_by_offset {
970        let end = frame
971            .payload_offset
972            .checked_add(frame.payload_length)
973            .ok_or_else(|| MemvidError::InvalidToc {
974                reason: "frame payload offsets overflow".into(),
975            })?;
976        if end > file_len {
977            return Err(MemvidError::InvalidToc {
978                reason: "frame payload exceeds file length".into(),
979            });
980        }
981        if frame.payload_offset < previous_end {
982            return Err(MemvidError::InvalidToc {
983                reason: format!(
984                    "frame {} payload overlaps with previous frame (offset {} < previous end {})",
985                    frame.id, frame.payload_offset, previous_end
986                )
987                .into(),
988            });
989        }
990        previous_end = end;
991    }
992    Ok(())
993}
994
995pub(crate) fn recover_toc(file: &mut File, hint: Option<u64>) -> Result<(Toc, u64)> {
996    let len = file.metadata()?.len();
997    // Safety: we only create a read-only mapping over stable file bytes.
998    let mmap = unsafe { Mmap::map(&*file)? };
999    tracing::debug!(file_len = len, "attempting toc recovery");
1000
1001    // First, try to find a valid footer which includes validated TOC bytes
1002    if let Some(footer_slice) = find_last_valid_footer(&mmap) {
1003        tracing::debug!(
1004            footer_offset = footer_slice.footer_offset,
1005            toc_offset = footer_slice.toc_offset,
1006            toc_len = footer_slice.toc_bytes.len(),
1007            "found valid footer during recovery"
1008        );
1009        // The footer has already validated the TOC hash, so we can directly decode it
1010        match Toc::decode(footer_slice.toc_bytes) {
1011            Ok(toc) => {
1012                return Ok((toc, footer_slice.toc_offset as u64));
1013            }
1014            Err(err) => {
1015                tracing::warn!(
1016                    error = %err,
1017                    "footer-validated TOC failed to decode, falling back to scan"
1018                );
1019            }
1020        }
1021    }
1022
1023    // If we have a header-provided hint (`footer_offset`) but the commit footer itself is corrupted,
1024    // we can often still recover because the TOC bytes are intact. In that case, assume the TOC
1025    // spans from `hint` up to the final fixed-size commit footer and decode it best-effort.
1026    if let Some(hint_offset) = hint {
1027        use crate::footer::FOOTER_SIZE;
1028
1029        let start = (hint_offset.min(len)) as usize;
1030        if mmap.len().saturating_sub(start) >= FOOTER_SIZE {
1031            let toc_end = mmap.len().saturating_sub(FOOTER_SIZE);
1032            if toc_end > start {
1033                let toc_bytes = &mmap[start..toc_end];
1034                if verify_toc_prefix(toc_bytes).is_ok() {
1035                    let attempt = panic::catch_unwind(|| Toc::decode(toc_bytes));
1036                    if let Ok(Ok(toc)) = attempt {
1037                        tracing::debug!(
1038                            recovered_offset = hint_offset,
1039                            recovered_frames = toc.frames.len(),
1040                            "recovered toc from hinted offset without validated footer"
1041                        );
1042                        return Ok((toc, hint_offset));
1043                    }
1044                }
1045            }
1046        }
1047    }
1048
1049    // Fallback to manual scan if footer-based recovery failed
1050    let mut ranges = Vec::new();
1051    if let Some(hint_offset) = hint {
1052        let hint_idx = hint_offset.min(len) as usize;
1053        ranges.push((hint_idx, mmap.len()));
1054        if hint_idx > 0 {
1055            ranges.push((0, hint_idx));
1056        }
1057    } else {
1058        ranges.push((0, mmap.len()));
1059    }
1060
1061    for (start, end) in ranges {
1062        if let Some(found) = scan_range_for_toc(&mmap, start, end) {
1063            return Ok(found);
1064        }
1065    }
1066
1067    Err(MemvidError::InvalidToc {
1068        reason: "unable to recover table of contents from file trailer".into(),
1069    })
1070}
1071
1072fn scan_range_for_toc(data: &[u8], start: usize, end: usize) -> Option<(Toc, u64)> {
1073    if start >= end || end > data.len() {
1074        return None;
1075    }
1076    const MAX_TOC_BYTES: usize = 64 * 1024 * 1024;
1077    const ZERO_CHECKSUM: [u8; 32] = [0u8; 32];
1078
1079    // We only ever consider offsets where the candidate TOC slice would be <= MAX_TOC_BYTES,
1080    // otherwise the loop devolves into iterating over the entire file for large memories.
1081    let min_offset = data.len().saturating_sub(MAX_TOC_BYTES);
1082    let scan_start = start.max(min_offset);
1083
1084    for offset in (scan_start..end).rev() {
1085        let slice = &data[offset..];
1086        if slice.len() < 16 {
1087            continue;
1088        }
1089        debug_assert!(slice.len() <= MAX_TOC_BYTES);
1090
1091        // No footer found - try old format with checksum
1092        if slice.len() < ZERO_CHECKSUM.len() {
1093            continue;
1094        }
1095        let (body, stored_checksum) = slice.split_at(slice.len() - ZERO_CHECKSUM.len());
1096        let mut hasher = Hasher::new();
1097        hasher.update(body);
1098        hasher.update(&ZERO_CHECKSUM);
1099        if hasher.finalize().as_bytes() != stored_checksum {
1100            continue;
1101        }
1102        if verify_toc_prefix(slice).is_err() {
1103            continue;
1104        }
1105        let attempt = panic::catch_unwind(|| Toc::decode(slice));
1106        if let Ok(Ok(toc)) = attempt {
1107            let recovered_offset = offset as u64;
1108            tracing::debug!(
1109                recovered_offset,
1110                recovered_frames = toc.frames.len(),
1111                "recovered toc via scan"
1112            );
1113            return Some((toc, recovered_offset));
1114        }
1115    }
1116    None
1117}
1118
1119pub(crate) fn prepare_toc_bytes(toc: &mut Toc) -> Result<Vec<u8>> {
1120    toc.toc_checksum = [0u8; 32];
1121    let bytes = toc.encode()?;
1122    let checksum = Toc::calculate_checksum(&bytes);
1123    toc.toc_checksum = checksum;
1124    toc.encode()
1125}
1126
1127pub(crate) fn empty_toc() -> Toc {
1128    Toc {
1129        toc_version: 0,
1130        segments: Vec::new(),
1131        frames: Vec::new(),
1132        indexes: IndexManifests::default(),
1133        time_index: None,
1134        temporal_track: None,
1135        memories_track: None,
1136        logic_mesh: None,
1137        sketch_track: None,
1138        segment_catalog: SegmentCatalog::default(),
1139        ticket_ref: TicketRef {
1140            issuer: "free-tier".into(),
1141            seq_no: 1,
1142            expires_in_secs: 0,
1143            capacity_bytes: Tier::Free.capacity_bytes(),
1144            verified: false,
1145        },
1146        memory_binding: None,
1147        replay_manifest: None,
1148        enrichment_queue: crate::types::EnrichmentQueueManifest::default(),
1149        merkle_root: [0u8; 32],
1150        toc_checksum: [0u8; 32],
1151    }
1152}
1153
1154pub(crate) fn compute_data_end(toc: &Toc, header: &Header) -> u64 {
1155    // `data_end` tracks the end of all data bytes that should not be overwritten by appends:
1156    // - frame payloads
1157    // - embedded indexes / metadata segments referenced by the TOC
1158    // - the current footer boundary (TOC offset), since callers may safely overwrite old TOCs
1159    //
1160    // Keeping this conservative prevents WAL replay / appends from corrupting embedded segments.
1161    let wal_region_end = header.wal_offset.saturating_add(header.wal_size);
1162    let mut max_end = wal_region_end.max(header.footer_offset);
1163
1164    // Frame payloads (active only).
1165    for frame in toc
1166        .frames
1167        .iter()
1168        .filter(|f| f.status == FrameStatus::Active && f.payload_length > 0)
1169    {
1170        if let Some(end) = frame.payload_offset.checked_add(frame.payload_length) {
1171            max_end = max_end.max(end);
1172        }
1173    }
1174
1175    // Segment catalog entries.
1176    let catalog = &toc.segment_catalog;
1177    for seg in &catalog.lex_segments {
1178        if let Some(end) = seg.common.bytes_offset.checked_add(seg.common.bytes_length) {
1179            max_end = max_end.max(end);
1180        }
1181    }
1182    for seg in &catalog.vec_segments {
1183        if let Some(end) = seg.common.bytes_offset.checked_add(seg.common.bytes_length) {
1184            max_end = max_end.max(end);
1185        }
1186    }
1187    for seg in &catalog.time_segments {
1188        if let Some(end) = seg.common.bytes_offset.checked_add(seg.common.bytes_length) {
1189            max_end = max_end.max(end);
1190        }
1191    }
1192    #[cfg(feature = "temporal_track")]
1193    for seg in &catalog.temporal_segments {
1194        if let Some(end) = seg.common.bytes_offset.checked_add(seg.common.bytes_length) {
1195            max_end = max_end.max(end);
1196        }
1197    }
1198    #[cfg(feature = "lex")]
1199    for seg in &catalog.tantivy_segments {
1200        if let Some(end) = seg.common.bytes_offset.checked_add(seg.common.bytes_length) {
1201            max_end = max_end.max(end);
1202        }
1203    }
1204    #[cfg(feature = "parallel_segments")]
1205    for seg in &catalog.index_segments {
1206        if let Some(end) = seg.common.bytes_offset.checked_add(seg.common.bytes_length) {
1207            max_end = max_end.max(end);
1208        }
1209    }
1210
1211    // Global manifests (non-segment storage paths).
1212    if let Some(manifest) = toc.indexes.lex.as_ref() {
1213        if let Some(end) = manifest.bytes_offset.checked_add(manifest.bytes_length) {
1214            max_end = max_end.max(end);
1215        }
1216    }
1217    if let Some(manifest) = toc.indexes.vec.as_ref() {
1218        if let Some(end) = manifest.bytes_offset.checked_add(manifest.bytes_length) {
1219            max_end = max_end.max(end);
1220        }
1221    }
1222    if let Some(manifest) = toc.indexes.clip.as_ref() {
1223        if let Some(end) = manifest.bytes_offset.checked_add(manifest.bytes_length) {
1224            max_end = max_end.max(end);
1225        }
1226    }
1227    if let Some(manifest) = toc.time_index.as_ref() {
1228        if let Some(end) = manifest.bytes_offset.checked_add(manifest.bytes_length) {
1229            max_end = max_end.max(end);
1230        }
1231    }
1232    #[cfg(feature = "temporal_track")]
1233    if let Some(track) = toc.temporal_track.as_ref() {
1234        if let Some(end) = track.bytes_offset.checked_add(track.bytes_length) {
1235            max_end = max_end.max(end);
1236        }
1237    }
1238    if let Some(track) = toc.memories_track.as_ref() {
1239        if let Some(end) = track.bytes_offset.checked_add(track.bytes_length) {
1240            max_end = max_end.max(end);
1241        }
1242    }
1243    if let Some(mesh) = toc.logic_mesh.as_ref() {
1244        if let Some(end) = mesh.bytes_offset.checked_add(mesh.bytes_length) {
1245            max_end = max_end.max(end);
1246        }
1247    }
1248    if let Some(track) = toc.sketch_track.as_ref() {
1249        if let Some(end) = track.bytes_offset.checked_add(track.bytes_length) {
1250            max_end = max_end.max(end);
1251        }
1252    }
1253    #[cfg(feature = "replay")]
1254    if let Some(manifest) = toc.replay_manifest.as_ref() {
1255        if let Some(end) = manifest.segment_offset.checked_add(manifest.segment_size) {
1256            max_end = max_end.max(end);
1257        }
1258    }
1259
1260    tracing::debug!(
1261        wal_region_end,
1262        footer_offset = header.footer_offset,
1263        computed_data_end = max_end,
1264        "compute_data_end"
1265    );
1266
1267    max_end
1268}
1269
1270struct TailSnapshot {
1271    toc: Toc,
1272    footer_offset: u64,
1273    data_end: u64,
1274    generation: u64,
1275}
1276
1277fn locate_footer_window(mmap: &[u8]) -> Option<(FooterSlice<'_>, usize)> {
1278    const MAX_SEARCH_SIZE: usize = 16 * 1024 * 1024;
1279    if mmap.is_empty() {
1280        return None;
1281    }
1282    let mut window = MAX_SEARCH_SIZE.min(mmap.len());
1283    loop {
1284        let start = mmap.len() - window;
1285        if let Some(slice) = find_last_valid_footer(&mmap[start..]) {
1286            return Some((slice, start));
1287        }
1288        if window == mmap.len() {
1289            break;
1290        }
1291        window = (window * 2).min(mmap.len());
1292    }
1293    None
1294}
1295
1296fn load_tail_snapshot(file: &File) -> Result<TailSnapshot> {
1297    // Safety: we only create a read-only mapping over the stable file bytes.
1298    let mmap = unsafe { Mmap::map(file)? };
1299
1300    let (slice, offset_adjustment) =
1301        locate_footer_window(&mmap).ok_or_else(|| MemvidError::InvalidToc {
1302            reason: "no valid commit footer found".into(),
1303        })?;
1304    let toc = Toc::decode(slice.toc_bytes)?;
1305    toc.verify_checksum()?;
1306
1307    Ok(TailSnapshot {
1308        toc,
1309        footer_offset: slice.footer_offset as u64 + offset_adjustment as u64,
1310        // Using toc_offset causes stale data_end that moves footer backwards on next commit
1311        data_end: slice.footer_offset as u64 + offset_adjustment as u64,
1312        generation: slice.footer.generation,
1313    })
1314}
1315
1316fn detect_generation(file: &File) -> Result<Option<u64>> {
1317    // Safety: read-only mapping for footer inspection.
1318    let mmap = unsafe { Mmap::map(file)? };
1319
1320    Ok(locate_footer_window(&mmap).map(|(slice, _)| slice.footer.generation))
1321}
1322
1323pub(crate) fn ensure_single_file(path: &Path) -> Result<()> {
1324    if let Some(parent) = path.parent() {
1325        let name = path
1326            .file_name()
1327            .and_then(|n| n.to_str())
1328            .unwrap_or_default();
1329        let forbidden = ["-wal", "-shm", "-lock", "-journal"];
1330        for suffix in forbidden {
1331            let candidate = parent.join(format!("{name}{suffix}"));
1332            if candidate.exists() {
1333                return Err(MemvidError::AuxiliaryFileDetected { path: candidate });
1334            }
1335        }
1336        let hidden_forbidden = [".wal", ".shm", ".lock", ".journal"];
1337        for suffix in hidden_forbidden {
1338            let candidate = parent.join(format!(".{name}{suffix}"));
1339            if candidate.exists() {
1340                return Err(MemvidError::AuxiliaryFileDetected { path: candidate });
1341            }
1342        }
1343    }
1344    Ok(())
1345}
1346
1347#[cfg(feature = "parallel_segments")]
/// Derive the manifest WAL path for `path` by replacing its extension with
/// `manifest.wal` (e.g. `mem.mv2` becomes `mem.manifest.wal`).
fn manifest_wal_path(path: &Path) -> PathBuf {
    path.with_extension("manifest.wal")
}
1353
1354#[cfg(feature = "parallel_segments")]
1355pub(crate) fn cleanup_manifest_wal_public(path: &Path) {
1356    let wal_path = manifest_wal_path(path);
1357    if wal_path.exists() {
1358        let _ = std::fs::remove_file(&wal_path);
1359    }
1360}
1361
1362/// Single source of truth: does this TOC have a lexical index?
1363/// Checks all possible locations: old manifest, lex_segments, and tantivy_segments.
1364pub(crate) fn has_lex_index(toc: &Toc) -> bool {
1365    toc.segment_catalog.lex_enabled
1366        || toc.indexes.lex.is_some()
1367        || !toc.indexes.lex_segments.is_empty()
1368        || !toc.segment_catalog.tantivy_segments.is_empty()
1369}
1370
1371/// Single source of truth: expected document count for lex index.
1372/// Returns None if we can't determine (e.g., Tantivy segments without manifest).
1373#[cfg(feature = "lex")]
1374pub(crate) fn lex_doc_count(
1375    toc: &Toc,
1376    lex_storage: &crate::search::EmbeddedLexStorage,
1377) -> Option<u64> {
1378    // First try old manifest
1379    if let Some(manifest) = &toc.indexes.lex {
1380        if manifest.doc_count > 0 {
1381            return Some(manifest.doc_count);
1382        }
1383    }
1384
1385    // Then try lex_storage (contains info from lex_segments)
1386    let storage_count = lex_storage.doc_count();
1387    if storage_count > 0 {
1388        return Some(storage_count);
1389    }
1390
1391    // For Tantivy files with segments but no manifest/storage doc_count,
1392    // we can't know doc count without loading the index.
1393    // Return None and let caller decide (init_tantivy should trust segments exist)
1394    None
1395}
1396
1397/// Validates segment integrity on file open to catch corruption early.
1398/// This helps doctor by detecting issues before they cause problems.
1399#[allow(dead_code)]
1400fn validate_segment_integrity(toc: &Toc, header: &Header, file_len: u64) -> Result<()> {
1401    let data_limit = header.footer_offset;
1402
1403    // Validate replay segment (if present). Replay is stored AT the footer boundary,
1404    // and footer_offset is moved forward after writing. So we only check against file_len,
1405    // not against footer_offset (which would be after the replay segment).
1406    #[cfg(feature = "replay")]
1407    if let Some(manifest) = toc.replay_manifest.as_ref() {
1408        if manifest.segment_size != 0 {
1409            let end = manifest
1410                .segment_offset
1411                .checked_add(manifest.segment_size)
1412                .ok_or_else(|| MemvidError::Doctor {
1413                    reason: format!(
1414                        "Replay segment offset overflow: {} + {}",
1415                        manifest.segment_offset, manifest.segment_size
1416                    ),
1417                })?;
1418
1419            // Only check against file_len - replay segments sit at the footer boundary
1420            // and footer_offset is updated to point after them
1421            if end > file_len {
1422                return Err(MemvidError::Doctor {
1423                    reason: format!(
1424                        "Replay segment out of bounds: offset={}, length={}, end={}, file_len={}",
1425                        manifest.segment_offset, manifest.segment_size, end, file_len
1426                    ),
1427                });
1428            }
1429        }
1430    }
1431
1432    // Validate Tantivy segments
1433    for (idx, seg) in toc.segment_catalog.tantivy_segments.iter().enumerate() {
1434        let offset = seg.common.bytes_offset;
1435        let length = seg.common.bytes_length;
1436
1437        if length == 0 {
1438            continue; // Empty segments are okay
1439        }
1440
1441        let end = offset
1442            .checked_add(length)
1443            .ok_or_else(|| MemvidError::Doctor {
1444                reason: format!(
1445                    "Tantivy segment {} offset overflow: {} + {}",
1446                    idx, offset, length
1447                ),
1448            })?;
1449
1450        if end > file_len || end > data_limit {
1451            return Err(MemvidError::Doctor {
1452                reason: format!(
1453                    "Tantivy segment {} out of bounds: offset={}, length={}, end={}, file_len={}, data_limit={}",
1454                    idx, offset, length, end, file_len, data_limit
1455                ),
1456            });
1457        }
1458    }
1459
1460    // Validate time index segments
1461    for (idx, seg) in toc.segment_catalog.time_segments.iter().enumerate() {
1462        let offset = seg.common.bytes_offset;
1463        let length = seg.common.bytes_length;
1464
1465        if length == 0 {
1466            continue;
1467        }
1468
1469        let end = offset
1470            .checked_add(length)
1471            .ok_or_else(|| MemvidError::Doctor {
1472                reason: format!(
1473                    "Time segment {} offset overflow: {} + {}",
1474                    idx, offset, length
1475                ),
1476            })?;
1477
1478        if end > file_len || end > data_limit {
1479            return Err(MemvidError::Doctor {
1480                reason: format!(
1481                    "Time segment {} out of bounds: offset={}, length={}, end={}, file_len={}, data_limit={}",
1482                    idx, offset, length, end, file_len, data_limit
1483                ),
1484            });
1485        }
1486    }
1487
1488    // Validate vec segments
1489    for (idx, seg) in toc.segment_catalog.vec_segments.iter().enumerate() {
1490        let offset = seg.common.bytes_offset;
1491        let length = seg.common.bytes_length;
1492
1493        if length == 0 {
1494            continue;
1495        }
1496
1497        let end = offset
1498            .checked_add(length)
1499            .ok_or_else(|| MemvidError::Doctor {
1500                reason: format!(
1501                    "Vec segment {} offset overflow: {} + {}",
1502                    idx, offset, length
1503                ),
1504            })?;
1505
1506        if end > file_len || end > data_limit {
1507            return Err(MemvidError::Doctor {
1508                reason: format!(
1509                    "Vec segment {} out of bounds: offset={}, length={}, end={}, file_len={}, data_limit={}",
1510                    idx, offset, length, end, file_len, data_limit
1511                ),
1512            });
1513        }
1514    }
1515
1516    log::debug!("✓ Segment integrity validation passed");
1517    Ok(())
1518}
1519
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// A slice shorter than the 24-byte TOC prefix must be rejected with a reason that
    /// names the problem.
    #[test]
    fn toc_prefix_underflow_surfaces_reason() {
        let short_prefix = [0u8; 8];
        let failure =
            verify_toc_prefix(&short_prefix).expect_err("should reject short toc prefix");
        match failure {
            MemvidError::InvalidToc { reason } => assert!(
                reason.contains("trailer too small"),
                "unexpected reason: {reason}"
            ),
            other => panic!("unexpected error: {other:?}"),
        }
    }

    /// Creating a memory next to a pre-existing `-wal` sidecar must fail.
    #[test]
    fn ensure_single_file_blocks_sidecars() {
        let scratch = tempdir().expect("tmp");
        let sidecar = scratch.path().join("mem.mv2-wal");
        std::fs::write(sidecar, b"junk").expect("sidecar");

        let target = scratch.path().join("mem.mv2");
        let outcome = Memvid::create(&target);
        assert!(matches!(
            outcome,
            Err(MemvidError::AuxiliaryFileDetected { .. })
        ));
    }
}