memvid_core/
lib.rs

1#![deny(clippy::all, clippy::pedantic)]
2#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
3#![allow(clippy::module_name_repetitions)]
4
5mod analysis;
6pub mod constants;
7pub mod enrich;
8pub mod error;
9pub mod extract;
10pub mod footer;
11pub mod io;
12pub mod lex;
13mod lock;
14pub mod lockfile;
15pub mod memvid;
16pub mod models;
17pub mod pii;
18pub mod reader;
19mod registry;
20mod search;
21pub mod signature;
22pub mod table;
23pub mod text;
24mod toc;
25pub mod types;
26pub mod vec;
27pub mod vec_pq;
28
29// CLIP module is always compiled (for ClipIndexManifest serde compatibility)
30// but ClipModel/inference requires the "clip" feature
31pub mod clip;
32
33#[cfg(test)]
34mod tests_lex_flag;
35
36#[cfg(feature = "temporal_track")]
37pub use analysis::temporal::{
38    TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
39    TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
40};
41// Temporal enrichment for resolving relative time references during ingestion
42#[cfg(feature = "temporal_enrich")]
43pub use analysis::temporal_enrich::{
44    AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
45    TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
46    enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
47};
48pub use constants::*;
49pub use error::{MemvidError, Result};
50pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
51pub use footer::{CommitFooter, find_last_valid_footer};
52#[cfg(feature = "temporal_track")]
53pub use io::temporal_index::{
54    append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
55    read_track as temporal_track_read, window as temporal_track_window,
56};
57pub use io::time_index::{
58    TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
59    read_track as time_index_read,
60};
61pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
62pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
63pub use lock::FileLock;
64pub use memvid::{
65    BlobReader, LockSettings, Memvid, OpenReadOptions,
66    mutation::{CommitMode, CommitOptions},
67};
68#[cfg(feature = "parallel_segments")]
69pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
70pub use models::{
71    ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
72    ModelVerifyOptions, verify_model_dir, verify_models,
73};
74pub use reader::{
75    DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics, ReaderHint,
76    ReaderOutput, ReaderRegistry,
77};
78pub use signature::{
79    parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
80};
81pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
82#[cfg(feature = "temporal_track")]
83pub use types::{
84    AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
85    TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
86    TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
87    TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
88};
89pub use types::{
90    AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
91    AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
92    DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
93    DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
94    DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
95    DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
96    Frame, FrameId, FrameRole, FrameStatus, Header, IndexManifests, LexIndexManifest,
97    LexSegmentDescriptor, MediaManifest, MemvidHandle, Open, PutOptions, PutOptionsBuilder, Sealed,
98    SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams, SearchRequest, SearchResponse,
99    SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta, SegmentSpan, SourceSpan, Stats,
100    TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier, TimeIndexManifest,
101    TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder, Toc, VecEmbedder,
102    VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
103    VerificationReport, VerificationStatus,
104};
105// Memory card types for structured memory extraction and storage
106pub use types::{
107    EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
108    MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
109    MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
110};
111// Logic-Mesh types for entity-relationship graph traversal
112pub use types::{
113    EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
114    LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
115};
116// NER types for entity extraction (always available, model requires logic_mesh feature)
117pub use analysis::ner::{
118    ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL,
119    NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
120    is_ner_model_installed, ner_model_path, ner_tokenizer_path, NER_MODELS,
121};
122#[cfg(feature = "logic_mesh")]
123pub use analysis::ner::NerModel;
124// Enrichment engine types for extracting memory cards from frames
125pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
126// Embedding provider types for vector embedding generation
127pub use types::{
128    BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
129    EmbeddingResult,
130};
131// Reranker types for second-stage ranking in RAG pipelines
132pub use types::reranker::{
133    Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
134};
135#[cfg(feature = "parallel_segments")]
136pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
137pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
138pub use vec_pq::{
139    CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
140    QuantizedVecIndexBuilder,
141};
142// CLIP visual embeddings - types always available for serde compatibility
143pub use clip::{
144    CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
145    ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
146    ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
147    get_model_info,
148};
149// CLIP model inference requires the "clip" feature
150#[cfg(feature = "clip")]
151pub use clip::{ClipModel, calculate_color_variance, get_image_info};
152
153#[cfg(test)]
154use once_cell::sync::Lazy;
155use std::fs::File;
156use std::io::Cursor;
157use std::path::Path;
158#[cfg(test)]
159use std::sync::Mutex;
160
161use bincode::config::{self, Config};
162use io::header::HeaderCodec;
163
/// Length limit applied by `truncate_preview` when building preview strings.
/// NOTE(review): despite the `_BYTES` suffix, `truncate_preview` counts `char`s,
/// so a preview of non-ASCII text can be longer than 120 bytes.
const TIMELINE_PREVIEW_BYTES: usize = 120;
/// 512 MiB. Not referenced in this part of the file; presumably the upper bound
/// accepted for a serialized index — confirm against the callers.
const MAX_INDEX_BYTES: u64 = 512 * 1024 * 1024; // Increased from 64MB to 512MB for large datasets
/// 512 MiB. Presumably the equivalent bound for the time index — confirm against the callers.
const MAX_TIME_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 256 MiB. Presumably the largest frame payload accepted — confirm against the callers.
const MAX_FRAME_BYTES: u64 = 256 * 1024 * 1024;
/// 32 768. Presumably the default cap on per-frame search text — confirm unit
/// (bytes vs. chars) against the callers.
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32_768;
169
/// Lazily initialised, test-only lock that `run_serial_test` holds while a test
/// body runs, so tests wrapped in it execute one at a time.
#[cfg(test)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
172
/// Runs `f` while holding the crate-wide serial test lock and returns its result.
///
/// The mutex protects no data (its payload is `()`), so a poisoned lock carries
/// no corrupted state. The guard is therefore recovered from a poisoned lock
/// instead of panicking: otherwise a single failing test would poison the
/// mutex and make every later serial test fail with an unrelated
/// "mutex poisoned" panic, hiding the real failure.
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
    let _guard = SERIAL_TEST_MUTEX
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    f()
}
180
181impl Memvid {
182    #[cfg(feature = "lex")]
183    fn tantivy_index_pending(&self) -> bool {
184        self.tantivy_dirty
185    }
186
187    #[cfg(not(feature = "lex"))]
188    fn tantivy_index_pending(&self) -> bool {
189        false
190    }
191
192    #[cfg(feature = "lex")]
193    fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
194        if !self.tantivy_dirty {
195            return Ok(());
196        }
197        if let Some(engine) = self.tantivy.as_mut() {
198            engine.commit()?;
199            if embed_snapshot {
200                let snapshot = engine.snapshot_segments()?;
201                self.update_embedded_lex_snapshot(snapshot)?;
202            }
203        }
204        self.tantivy_dirty = false;
205        Ok(())
206    }
207
208    #[cfg(feature = "lex")]
209    fn flush_tantivy(&mut self) -> Result<()> {
210        self.flush_tantivy_conditional(true)
211    }
212
213    #[cfg(feature = "lex")]
214    #[allow(dead_code)]
215    fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
216        self.flush_tantivy_conditional(false)
217    }
218
219    #[cfg(not(feature = "lex"))]
220    fn flush_tantivy(&mut self) -> Result<()> {
221        Ok(())
222    }
223
224    #[cfg(not(feature = "lex"))]
225    #[allow(dead_code)]
226    fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
227        Ok(())
228    }
229    pub fn path(&self) -> &Path {
230        &self.path
231    }
232
233    pub fn lock_handle(&self) -> &FileLock {
234        &self.lock
235    }
236
237    pub fn is_read_only(&self) -> bool {
238        self.read_only
239    }
240
241    pub(crate) fn ensure_writable(&mut self) -> Result<()> {
242        if self.read_only {
243            self.lock.upgrade_to_exclusive()?;
244            self.read_only = false;
245        }
246        Ok(())
247    }
248
249    pub fn downgrade_to_shared(&mut self) -> Result<()> {
250        if self.read_only {
251            return Ok(());
252        }
253        if self.dirty || self.tantivy_index_pending() {
254            return Ok(());
255        }
256        self.lock.downgrade_to_shared()?;
257        self.read_only = true;
258        Ok(())
259    }
260}
261
262impl Drop for Memvid {
263    fn drop(&mut self) {
264        if self.dirty {
265            let _ = self.commit();
266        }
267        // Clean up temporary manifest.wal file (parallel_segments feature)
268        #[cfg(feature = "parallel_segments")]
269        {
270            use crate::memvid::lifecycle::cleanup_manifest_wal_public;
271            cleanup_manifest_wal_public(self.path());
272        }
273    }
274}
275
/// Writes `header` to `file` by delegating to `HeaderCodec::write`.
///
/// # Errors
/// Returns whatever error `HeaderCodec::write` reports.
pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
    HeaderCodec::write(file, header)
}
279
280fn wal_config() -> impl Config {
281    config::standard()
282        .with_fixed_int_encoding()
283        .with_little_endian()
284}
285
286pub(crate) fn decode_canonical_bytes(
287    payload: &[u8],
288    encoding: CanonicalEncoding,
289    frame_id: FrameId,
290) -> Result<Vec<u8>> {
291    match encoding {
292        CanonicalEncoding::Plain => Ok(payload.to_vec()),
293        CanonicalEncoding::Zstd => {
294            zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
295                frame_id,
296                reason: "failed to decode canonical payload",
297            })
298        }
299    }
300}
301
/// Builds the default URI for a frame: `mv2://frames/` followed by `frame_id`
/// formatted with `Display`.
pub(crate) fn default_uri(frame_id: FrameId) -> String {
    format!("mv2://frames/{frame_id}")
}
305
/// Derives a human-readable title from the last path segment of `uri`.
///
/// The URI is processed as follows:
/// 1. Surrounding whitespace, the `#fragment` and the `?query` are removed.
/// 2. Everything up to and including the first `://` is removed.
/// 3. Trailing `/` are dropped and the last `/`-separated segment is taken.
/// 4. The text after the final `.` (the extension) is removed.
/// 5. The remainder is split on `-`, `_` and spaces; each word gets an ASCII
///    upper-case first letter and ASCII lower-case rest, and the words are
///    joined with single spaces.
///
/// For example `mv2://docs/my-great_doc.PDF?x=1` becomes `"My Great Doc"`.
/// Non-ASCII characters are passed through unchanged.
///
/// Returns `None` when no word remains: an empty or whitespace-only URI, a
/// segment made only of separators, or a name with nothing before its final
/// `.` (for example `.gitignore`).
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
    let trimmed = uri.trim();

    // Fragment and query are removed *before* the scheme so that a `://`
    // occurring inside them (e.g. `docs/a.md?next=http://x/y`) can never be
    // mistaken for the scheme separator of a scheme-less URI.
    let without_fragment = trimmed.split_once('#').map_or(trimmed, |(head, _)| head);
    let without_query = without_fragment
        .split_once('?')
        .map_or(without_fragment, |(head, _)| head);
    let without_scheme = without_query
        .split_once("://")
        .map_or(without_query, |(_, rest)| rest);

    let path = without_scheme.trim_end_matches('/');
    let segment = path.rsplit_once('/').map_or(path, |(_, last)| last).trim();
    let stem = segment
        .rsplit_once('.')
        .map_or(segment, |(stem, _)| stem)
        .trim();

    let words: Vec<String> = stem
        .split(['-', '_', ' '])
        .filter(|part| !part.is_empty())
        .map(|part| {
            // ASCII case mapping never changes the byte length.
            let mut word = String::with_capacity(part.len());
            let mut chars = part.chars();
            word.extend(chars.next().map(|first| first.to_ascii_uppercase()));
            word.extend(chars.map(|c| c.to_ascii_lowercase()));
            word
        })
        .collect();

    if words.is_empty() {
        None
    } else {
        Some(words.join(" "))
    }
}
359
360fn truncate_preview(text: &str) -> String {
361    text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
362}
363
364fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
365    let mime = meta.mime.as_deref()?;
366    if !mime.starts_with("image/") {
367        return None;
368    }
369
370    if let Some(caption) = meta.caption.as_ref() {
371        let trimmed = caption.trim();
372        if !trimmed.is_empty() {
373            return Some(truncate_preview(trimmed));
374        }
375    }
376
377    let mut segments: Vec<String> = Vec::new();
378    if let (Some(w), Some(h)) = (meta.width, meta.height) {
379        segments.push(format!("{}×{} px", w, h));
380    }
381    if let Some(exif) = meta.exif.as_ref() {
382        if let Some(model) = exif
383            .model
384            .as_ref()
385            .map(|s| s.trim())
386            .filter(|s| !s.is_empty())
387        {
388            segments.push(model.to_string());
389        } else if let Some(make) = exif
390            .make
391            .as_ref()
392            .map(|s| s.trim())
393            .filter(|s| !s.is_empty())
394        {
395            segments.push(make.to_string());
396        }
397
398        if let Some(datetime) = exif
399            .datetime
400            .as_ref()
401            .map(|s| s.trim())
402            .filter(|s| !s.is_empty())
403        {
404            segments.push(datetime.to_string());
405        }
406    }
407
408    if segments.is_empty() {
409        return Some("Image frame".to_string());
410    }
411
412    Some(truncate_preview(&segments.join(" · ")))
413}
414
415#[cfg(test)]
416mod tests {
417    use super::*;
418    use std::io::Read;
419    use std::num::NonZeroU64;
420    use tempfile::tempdir;
421
    /// Creates a file, stores one payload, commits, then reopens it and checks
    /// the frame count, time-index flag, timeline preview and WAL counters.
    #[test]
    fn create_put_commit_reopen() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("memory.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            let seq = mem.put_bytes(b"hello").expect("put");
            assert_eq!(seq, 1);
            mem.commit().expect("commit");

            // Drop the first handle before opening the file a second time.
            drop(mem);

            let mut reopened = Memvid::open(&path).expect("open");
            let stats = reopened.stats().expect("stats");
            assert_eq!(stats.frame_count, 1);
            assert!(stats.has_time_index);

            let timeline = reopened
                .timeline(TimelineQuery::default())
                .expect("timeline");
            assert_eq!(timeline.len(), 1);
            assert!(timeline[0].preview.contains("hello"));

            // After a commit nothing should be left pending in the WAL.
            let wal_stats = reopened.wal.stats();
            assert_eq!(wal_stats.pending_bytes, 0);
            // Sequence is 2: one from create() writing manifests, one from put()
            assert_eq!(wal_stats.sequence, 2);
        });
    }
452
    /// Stores two frames and checks that a timeline query limited to one entry
    /// returns the first frame in forward order and the last frame in reverse
    /// order.
    #[test]
    fn timeline_limit_and_reverse() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("timeline.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"alpha").expect("put alpha");
            mem.put_bytes(b"beta").expect("put beta");
            mem.commit().expect("commit");
            drop(mem);

            let mut reopened = Memvid::open(&path).expect("open");
            // Forward order, one entry: expect the first frame stored.
            let limited = reopened
                .timeline(TimelineQuery {
                    limit: NonZeroU64::new(1),
                    since: None,
                    until: None,
                    reverse: false,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                })
                .expect("timeline limit");
            assert_eq!(limited.len(), 1);
            assert!(limited[0].preview.contains("alpha"));

            // Reverse order, one entry: expect the last frame stored.
            let reversed = reopened
                .timeline(TimelineQuery {
                    limit: NonZeroU64::new(1),
                    since: None,
                    until: None,
                    reverse: true,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                })
                .expect("timeline reverse");
            assert_eq!(reversed.len(), 1);
            assert!(reversed[0].preview.contains("beta"));
        });
    }
493
    /// Enables lexical search, stores two documents and checks that a query
    /// finds exactly one hit both on the original handle and after reopening
    /// the file.
    #[test]
    fn lex_search_roundtrip() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("lex.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable");
            mem.put_bytes(b"Rust memory engine").expect("put");
            mem.put_bytes(b"Deterministic WAL").expect("put2");
            mem.commit().expect("commit");

            // Use modern search() API instead of deprecated search_lex()
            let request = SearchRequest {
                query: "memory".to_string(),
                top_k: 10,
                snippet_chars: 200,
                uri: None,
                scope: None,
                cursor: None,
                #[cfg(feature = "temporal_track")]
                temporal: None,
                as_of_frame: None,
                as_of_ts: None,
            };
            let response = mem.search(request).expect("search");
            assert_eq!(response.hits.len(), 1);

            drop(mem);

            // The second query uses a lower-case term ("wal") against text
            // stored as "WAL", on a freshly opened handle.
            let mut reopened = Memvid::open(&path).expect("open");
            let request = SearchRequest {
                query: "wal".to_string(),
                top_k: 10,
                snippet_chars: 200,
                uri: None,
                scope: None,
                cursor: None,
                #[cfg(feature = "temporal_track")]
                temporal: None,
                as_of_frame: None,
                as_of_ts: None,
            };
            let response = reopened.search(request).expect("search reopened");
            assert_eq!(response.hits.len(), 1);
        });
    }
541
    /// Enables vector search, stores two frames with orthogonal 2-D embeddings
    /// and checks that the nearest hit is the expected frame both before and
    /// after reopening the file.
    #[test]
    fn vec_search_roundtrip() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("vec.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_vec().expect("enable");
            mem.put_with_embedding(b"vector", vec![0.0, 1.0])
                .expect("put");
            mem.put_with_embedding(b"vector-two", vec![1.0, 0.0])
                .expect("put2");
            mem.commit().expect("commit");

            let stats = mem.stats().expect("stats");
            assert!(stats.has_vec_index, "vec index should exist after commit");

            // Querying with the first frame's embedding must rank frame 0 first.
            let hits = mem.search_vec(&[0.0, 1.0], 5).expect("search");
            assert_eq!(hits.first().map(|hit| hit.frame_id), Some(0));

            drop(mem);

            let mut reopened = Memvid::open(&path).expect("open");
            let reopened_stats = reopened.stats().expect("stats reopen");
            assert!(
                reopened_stats.has_vec_index,
                "vec index should exist after reopen: has_manifest={}, vec_enabled={}",
                reopened.toc.indexes.vec.is_some(),
                reopened.vec_enabled
            );
            // Querying with the second frame's embedding must rank frame 1 first.
            let hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
            assert_eq!(hits.first().map(|hit| hit.frame_id), Some(1));
        });
    }
576
    /// Checks that the byte range reported for a search hit addresses exactly
    /// the hit text inside the frame's canonical content, and that the chunk
    /// range encloses it and matches the reported chunk text.
    #[test]
    fn search_snippet_ranges_match_bytes() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("search.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");
            let options = PutOptions::builder()
                .uri("mv2://docs/pricing.md")
                .title("Pricing")
                .build();
            let text = "Capacity tickets are signed grants that raise per-file caps.";
            mem.put_bytes_with_options(text.as_bytes(), options)
                .expect("put doc");
            mem.commit().expect("commit");

            let response = mem
                .search(SearchRequest {
                    query: "capacity tickets".into(),
                    top_k: 5,
                    snippet_chars: 160,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                })
                .expect("search");

            assert_eq!(response.total_hits, 1);
            assert_eq!(response.engine, SearchEngineKind::Tantivy);
            let hit = response.hits.first().expect("hit");
            // Look the frame up by id and load its canonical content to
            // compare against the ranges reported in the hit.
            let frame = mem
                .toc
                .frames
                .get(hit.frame_id as usize)
                .cloned()
                .expect("frame");
            let canonical = mem.frame_content(&frame).expect("content");
            let bytes = canonical.as_bytes();
            let (start, end) = hit.range;
            assert!(end <= bytes.len());
            assert_eq!(hit.text.as_bytes(), &bytes[start..end]);
            // The chunk range must enclose the snippet range.
            let chunk = hit.chunk_range.expect("chunk range");
            assert!(chunk.0 <= start);
            assert!(chunk.1 >= end);
            let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
            let chunk_slice = &canonical[chunk.0..chunk.1];
            assert_eq!(chunk_text, chunk_slice);
        });
    }
631
    /// Stores a long document whose match sits after a large repeated prefix
    /// and checks that the reported chunk range starts past offset 0, encloses
    /// the hit range, and matches the corresponding slice of the content.
    #[test]
    fn search_chunk_range_reflects_chunk_offset() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("chunked.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            let options = PutOptions::builder()
                .uri("mv2://docs/manual.txt")
                .title("Manual")
                .build();
            // 200 repetitions of a 24-byte phrase put the target text well
            // away from the start of the document.
            let prefix = "alpha beta gamma delta. ".repeat(200);
            let content = format!(
                "{}target segment appears here. Trailing context for verification.",
                prefix
            );
            mem.put_bytes_with_options(content.as_bytes(), options)
                .expect("put doc");
            mem.commit().expect("commit");

            let response = mem
                .search(SearchRequest {
                    query: "target segment".into(),
                    top_k: 5,
                    snippet_chars: 160,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                })
                .expect("search");

            let hit = response.hits.first().expect("hit");
            assert_eq!(response.engine, SearchEngineKind::Tantivy);
            let chunk_range = hit.chunk_range.expect("chunk range");
            assert!(chunk_range.0 > 0);
            assert!(hit.range.0 >= chunk_range.0);
            assert!(hit.range.1 <= chunk_range.1);
            assert!(hit.text.contains("target segment"));
            let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
            assert_eq!(chunk_text, &content[chunk_range.0..chunk_range.1]);
        });
    }
680
    /// Stores a frame with `auto_tag` and `extract_dates` enabled and checks
    /// that the stored frame ends up with at least one tag and a content date
    /// containing "2024".
    #[test]
    fn auto_tag_populates_frame_metadata() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("autotag.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            // The date appears only in the search text, not in the payload bytes.
            let options = PutOptions::builder()
                .search_text("Neural networks planning session 2024-10-08")
                .auto_tag(true)
                .extract_dates(true)
                .build();
            mem.put_bytes_with_options(b"agenda", options)
                .expect("put bytes");
            mem.commit().expect("commit");

            let frame = mem.toc.frames.first().expect("frame present");
            assert!(!frame.tags.is_empty());
            assert!(frame.content_dates.iter().any(|date| date.contains("2024")));
        });
    }
704
    /// Stores three documents under different URIs and checks that every hit
    /// of a `uri`-filtered search has exactly that URI and every hit of a
    /// `scope`-filtered search has a URI starting with the scope prefix.
    #[test]
    fn search_filters_by_uri_and_scope() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("filters.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            // Two documents under mv2://docs/ and one under mv2://blog/; all
            // three contain the query term "tickets".
            let options_a = PutOptions::builder()
                .uri("mv2://docs/pricing.md")
                .title("Pricing")
                .build();
            mem.put_bytes_with_options(b"Capacity tickets add per-file allowances", options_a)
                .expect("put a");

            let options_b = PutOptions::builder()
                .uri("mv2://docs/faq.md")
                .title("FAQ")
                .build();
            mem.put_bytes_with_options(b"Tickets can be issued by admins", options_b)
                .expect("put b");

            let options_c = PutOptions::builder()
                .uri("mv2://blog/launch.md")
                .title("Launch")
                .build();
            mem.put_bytes_with_options(b"Launch day tickets boost visibility", options_c)
                .expect("put c");

            mem.commit().expect("commit");

            // Exact-URI filter.
            let uri_response = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 10,
                    snippet_chars: 120,
                    uri: Some("mv2://docs/pricing.md".into()),
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                })
                .expect("uri search");
            assert_eq!(uri_response.engine, SearchEngineKind::Tantivy);
            assert!(
                uri_response
                    .hits
                    .iter()
                    .all(|hit| hit.uri == "mv2://docs/pricing.md")
            );

            // URI-prefix (scope) filter.
            let scope_response = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 10,
                    snippet_chars: 120,
                    uri: None,
                    scope: Some("mv2://docs/".into()),
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                })
                .expect("scope search");
            assert_eq!(scope_response.engine, SearchEngineKind::Tantivy);
            assert!(
                scope_response
                    .hits
                    .iter()
                    .all(|hit| hit.uri.starts_with("mv2://docs/"))
            );
        });
    }
782
    /// Stores three matching documents and checks cursor-based paging with
    /// `top_k = 1`: the request parameters are echoed in the response, a
    /// cursor is returned, and the second page yields a different frame with
    /// the same total hit count.
    #[test]
    fn search_pagination_and_params() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("paging.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            for (idx, text) in [
                "tickets unlock tier upgrades",
                "tickets expire after 30 days",
                "tickets may be revoked",
            ]
            .iter()
            .enumerate()
            {
                let uri = format!("mv2://docs/doc{idx}.md");
                let options = PutOptions::builder()
                    .uri(&uri)
                    .title(format!("Doc {idx}"))
                    .build();
                mem.put_bytes_with_options(text.as_bytes(), options)
                    .expect("put doc");
            }

            mem.commit().expect("commit");

            let first_page = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 1,
                    snippet_chars: 90,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                })
                .expect("page one");
            assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
            assert_eq!(first_page.hits.len(), 1);
            assert_eq!(first_page.params.top_k, 1);
            assert_eq!(first_page.params.snippet_chars, 90);
            assert!(first_page.total_hits >= first_page.hits.len());
            let cursor = first_page.next_cursor.clone().expect("cursor");
            let first_id = first_page.hits[0].frame_id;

            // Same query, continued from the cursor returned by page one.
            let second_page = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 1,
                    snippet_chars: 90,
                    uri: None,
                    scope: None,
                    cursor: Some(cursor),
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                })
                .expect("page two");
            assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
            assert_eq!(second_page.hits.len(), 1);
            assert_ne!(second_page.hits[0].frame_id, first_id);
            assert_eq!(second_page.total_hits, first_page.total_hits);
        });
    }
853
854    #[cfg(feature = "lex")]
855    #[test]
856    fn search_falls_back_when_tantivy_missing() {
857        run_serial_test(|| {
858            let dir = tempdir().expect("tmp");
859            let path = dir.path().join("fallback.mv2");
860
861            let mut mem = Memvid::create(&path).expect("create");
862            mem.enable_lex().expect("enable lex");
863            mem.put_bytes(b"tickets fallback test").expect("put");
864            mem.commit().expect("commit");
865
866            // This test verifies that Tantivy is the primary search engine
867            // The LexFallback path is deprecated, so we'll just verify Tantivy works
868            assert!(
869                mem.tantivy.is_some(),
870                "Tantivy should be initialized after commit"
871            );
872
873            let response = mem
874                .search(SearchRequest {
875                    query: "tickets".into(),
876                    top_k: 5,
877                    snippet_chars: 120,
878                    uri: None,
879                    scope: None,
880                    cursor: None,
881                    #[cfg(feature = "temporal_track")]
882                    temporal: None,
883                    as_of_frame: None,
884                    as_of_ts: None,
885                })
886                .expect("search with tantivy");
887
888            assert_eq!(response.engine, SearchEngineKind::Tantivy);
889            assert!(!response.hits.is_empty());
890        });
891    }
892
893    #[test]
894    fn verify_reports_success() {
895        run_serial_test(|| {
896            let dir = tempdir().expect("tmp");
897            let path = dir.path().join("verify.mv2");
898
899            {
900                let mut mem = Memvid::create(&path).expect("create");
901                mem.enable_lex().expect("enable lex");
902                mem.enable_vec().expect("enable vec");
903                mem.put_with_embedding(b"check", vec![0.5, 0.1])
904                    .expect("put");
905                mem.commit().expect("commit");
906            }
907
908            let report = Memvid::verify(&path, true).expect("verify");
909            assert_eq!(report.overall_status, VerificationStatus::Passed);
910        });
911    }
912
913    #[test]
914    fn test_create_enables_indexes_by_default() {
915        run_serial_test(|| {
916            let dir = tempdir().expect("tmp");
917            let path = dir.path().join("default_indexes.mv2");
918
919            // Create without any special flags
920            let mem = Memvid::create(&path).expect("create");
921
922            // Check stats immediately (before drop)
923            let stats = mem.stats().expect("stats");
924            println!(
925                "After create (before drop): lex={}, vec={}",
926                stats.has_lex_index, stats.has_vec_index
927            );
928
929            drop(mem);
930
931            // Reopen and check again
932            let reopened = Memvid::open(&path).expect("reopen");
933            let stats2 = reopened.stats().expect("stats after reopen");
934            println!(
935                "After reopen: lex={}, vec={}",
936                stats2.has_lex_index, stats2.has_vec_index
937            );
938
939            #[cfg(feature = "lex")]
940            assert!(
941                stats2.has_lex_index,
942                "lex index should be enabled by default"
943            );
944
945            #[cfg(feature = "vec")]
946            assert!(
947                stats2.has_vec_index,
948                "vec index should be enabled by default"
949            );
950        });
951    }
952
953    #[test]
954    fn doctor_rebuilds_time_index() {
955        use std::fs::OpenOptions;
956        use std::io::{Seek, SeekFrom, Write};
957
958        run_serial_test(|| {
959            let dir = tempdir().expect("tmp");
960            let path = dir.path().join("doctor.mv2");
961
962            let manifest = {
963                let mut mem = Memvid::create(&path).expect("create");
964                mem.put_bytes(b"repair").expect("put");
965                mem.commit().expect("commit");
966                // Explicitly rebuild indexes to create time_index (new implementation requires this)
967                mem.rebuild_indexes(&[]).expect("rebuild");
968                mem.commit().expect("commit after rebuild");
969                println!(
970                    "test: post-commit header footer_offset={}",
971                    mem.header.footer_offset
972                );
973                println!(
974                    "test: post-commit manifest offset={} length={}",
975                    mem.toc
976                        .time_index
977                        .as_ref()
978                        .map(|m| m.bytes_offset)
979                        .unwrap_or(0),
980                    mem.toc
981                        .time_index
982                        .as_ref()
983                        .map(|m| m.bytes_length)
984                        .unwrap_or(0)
985                );
986                mem.toc.time_index.clone().expect("time index manifest")
987            };
988
989            {
990                let mut file = OpenOptions::new()
991                    .read(true)
992                    .write(true)
993                    .open(&path)
994                    .expect("open file");
995                file.seek(SeekFrom::Start(manifest.bytes_offset))
996                    .expect("seek");
997                let zeros = vec![0u8; manifest.bytes_length as usize];
998                file.write_all(&zeros).expect("corrupt time index");
999                file.flush().expect("flush");
1000                file.sync_all().expect("sync");
1001            }
1002
1003            println!(
1004                "test: footer scan: {:?}",
1005                crate::footer::find_last_valid_footer(&std::fs::read(&path).expect("read file"))
1006                    .as_ref()
1007                    .map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
1008            );
1009            println!("test: verifying corrupted memory");
1010            match Memvid::verify(&path, false) {
1011                Ok(report) => {
1012                    assert_eq!(report.overall_status, VerificationStatus::Failed);
1013                }
1014                Err(e) => {
1015                    println!("test: verify failed with error (expected): {}", e);
1016                }
1017            }
1018
1019            println!("test: running doctor");
1020            let report = Memvid::doctor(
1021                &path,
1022                DoctorOptions {
1023                    rebuild_time_index: true,
1024                    rebuild_lex_index: false,
1025                    ..DoctorOptions::default()
1026                },
1027            )
1028            .expect("doctor");
1029            println!("test: doctor completed with status: {:?}", report.status);
1030            // Doctor may report Failed due to strict verification, but the important thing
1031            // is that it rebuilt the index and the file is usable
1032            // assert!(matches!(report.status, DoctorStatus::Healed | DoctorStatus::Clean));
1033
1034            println!("test: verifying repaired memory");
1035            // Verify file is actually usable after doctor (even if status was Failed)
1036            let reopened = Memvid::open(&path).expect("reopen after doctor");
1037            assert!(
1038                reopened.toc.time_index.is_some(),
1039                "time index should exist after doctor"
1040            );
1041        });
1042    }
1043
1044    #[test]
1045    fn blob_reader_roundtrip_with_media_manifest() {
1046        run_serial_test(|| {
1047            let dir = tempdir().expect("tmp");
1048            let path = dir.path().join("blob.mv2");
1049            let payload = vec![0u8, 159, 1, 128, 42, 99, 200];
1050
1051            let manifest = MediaManifest {
1052                kind: "video".to_string(),
1053                mime: "video/mp4".to_string(),
1054                bytes: payload.len() as u64,
1055                filename: Some("clip.mp4".to_string()),
1056                duration_ms: Some(1234),
1057                width: Some(1920),
1058                height: Some(1080),
1059                codec: Some("h264".to_string()),
1060            };
1061
1062            let mut doc_meta = DocMetadata::default();
1063            doc_meta.media = Some(manifest.clone());
1064            doc_meta.mime = Some("video/mp4".to_string());
1065            doc_meta.bytes = Some(payload.len() as u64);
1066            assert!(
1067                !doc_meta.is_empty(),
1068                "media manifest must count as metadata"
1069            );
1070
1071            let options = PutOptions::builder()
1072                .metadata(doc_meta)
1073                .kind("video")
1074                .uri("mv2://video/clip.mp4")
1075                .build();
1076
1077            {
1078                let mut mem = Memvid::create(&path).expect("create");
1079                mem.put_bytes_with_options(&payload, options)
1080                    .expect("put bytes");
1081                mem.commit().expect("commit");
1082            }
1083
1084            let mut reopened = Memvid::open(&path).expect("open");
1085            let mut reader = reopened
1086                .blob_reader_by_uri("mv2://video/clip.mp4")
1087                .expect("blob reader");
1088            let mut buffered = Vec::new();
1089            reader.read_to_end(&mut buffered).expect("read payload");
1090            assert_eq!(buffered, payload);
1091
1092            let roundtrip = reopened
1093                .media_manifest_by_uri("mv2://video/clip.mp4")
1094                .expect("manifest lookup")
1095                .expect("manifest present");
1096            assert_eq!(roundtrip.mime, "video/mp4");
1097            assert_eq!(roundtrip.kind, "video");
1098            assert_eq!(roundtrip.bytes, payload.len() as u64);
1099            assert_eq!(roundtrip.filename.as_deref(), Some("clip.mp4"));
1100            assert_eq!(roundtrip.duration_ms, Some(1234));
1101            assert_eq!(roundtrip.width, Some(1920));
1102            assert_eq!(roundtrip.height, Some(1080));
1103            assert_eq!(roundtrip.codec.as_deref(), Some("h264"));
1104
1105            drop(dir);
1106        });
1107    }
1108
1109    #[test]
1110    fn video_frame_roundtrip_does_not_corrupt_toc() {
1111        use crate::types::MediaManifest;
1112
1113        run_serial_test(|| {
1114            let dir = tempdir().expect("tmp");
1115            let path = dir.path().join("video.mv2");
1116            let mut seed = 0xDEADBEEF_u64;
1117            let mut video_bytes = vec![0u8; 1_600_000];
1118            for byte in &mut video_bytes {
1119                seed = seed ^ (seed << 7);
1120                seed = seed ^ (seed >> 9);
1121                seed = seed ^ (seed << 8);
1122                *byte = (seed & 0xFF) as u8;
1123            }
1124
1125            let hash_hex = blake3::hash(&video_bytes).to_hex().to_string();
1126
1127            let manifest = MediaManifest {
1128                kind: "video".to_string(),
1129                mime: "video/mp4".to_string(),
1130                bytes: video_bytes.len() as u64,
1131                filename: Some("clip.mp4".to_string()),
1132                duration_ms: Some(1_000),
1133                width: Some(1920),
1134                height: Some(1080),
1135                codec: Some("h264".to_string()),
1136            };
1137
1138            let mut meta = DocMetadata::default();
1139            meta.mime = Some("video/mp4".to_string());
1140            meta.bytes = Some(video_bytes.len() as u64);
1141            meta.hash = Some(hash_hex);
1142            meta.caption = Some("Test clip".to_string());
1143            meta.media = Some(manifest);
1144
1145            let options = PutOptions::builder()
1146                .kind("video")
1147                .metadata(meta)
1148                .tag("kind", "video")
1149                .uri("mv2://video/test.mp4")
1150                .title("Test clip")
1151                .build();
1152
1153            {
1154                let mut mem = Memvid::create(&path).expect("create");
1155                mem.put_bytes_with_options(&video_bytes, options)
1156                    .expect("put video");
1157                mem.commit().expect("commit");
1158            }
1159
1160            let reopened = Memvid::open(&path).expect("reopen");
1161            let stats = reopened.stats().expect("stats");
1162            assert_eq!(stats.frame_count, 1);
1163        });
1164    }
1165
1166    #[test]
1167    fn ticket_sequence_enforced() {
1168        run_serial_test(|| {
1169            let dir = tempdir().expect("tmp");
1170            let path = dir.path().join("ticket.mv2");
1171
1172            let mut mem = Memvid::create(&path).expect("create");
1173            mem.apply_ticket(Ticket::new("issuer", 2))
1174                .expect("apply first");
1175
1176            let err = mem
1177                .apply_ticket(Ticket::new("issuer", 2))
1178                .expect_err("sequence must increase");
1179            assert!(matches!(err, MemvidError::TicketSequence { .. }));
1180        });
1181    }
1182
1183    #[test]
1184    fn capacity_limit_enforced() {
1185        run_serial_test(|| {
1186            let dir = tempdir().expect("tmp");
1187            let path = dir.path().join("capacity.mv2");
1188
1189            let mut mem = Memvid::create(&path).expect("create");
1190            let base = mem.data_end;
1191            mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
1192                .expect("apply ticket");
1193
1194            mem.put_bytes(&vec![0xFF; 32]).expect("first put");
1195            mem.commit().expect("commit");
1196
1197            let err = mem
1198                .put_bytes(&vec![0xFF; 40])
1199                .expect_err("capacity exceeded");
1200            assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
1201        });
1202    }
1203}