1#![deny(clippy::all, clippy::pedantic)]
2#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
3#![allow(clippy::module_name_repetitions)]
4
/// Version string of this crate, captured from the `CARGO_PKG_VERSION`
/// environment variable at compile time.
pub const MEMVID_CORE_VERSION: &str = env!("CARGO_PKG_VERSION");
7
8mod analysis;
9pub mod constants;
10pub mod enrich;
11pub mod error;
12pub mod extract;
13pub mod footer;
14pub mod io;
15pub mod lex;
16mod lock;
17pub mod lockfile;
18pub mod memvid;
19pub mod models;
20pub mod pii;
21pub mod reader;
22mod registry;
23mod search;
24pub mod signature;
25pub mod structure;
26pub mod table;
27pub mod text;
28mod toc;
29pub mod types;
30pub mod vec;
31pub mod vec_pq;
32
33pub mod triplet;
35
36pub mod graph_search;
38
39pub mod clip;
42
43pub mod whisper;
46
47pub mod replay;
51
52#[cfg(feature = "encryption")]
55pub mod encryption;
56
57#[cfg(test)]
58mod tests_lex_flag;
59
60#[cfg(feature = "temporal_track")]
61pub use analysis::temporal::{
62 TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
63 TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
64};
65#[cfg(feature = "temporal_enrich")]
67pub use analysis::temporal_enrich::{
68 AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
69 TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
70 enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
71};
72pub use constants::*;
73pub use error::{MemvidError, Result};
74pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
75pub use footer::{CommitFooter, find_last_valid_footer};
76#[cfg(feature = "temporal_track")]
77pub use io::temporal_index::{
78 append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
79 read_track as temporal_track_read, window as temporal_track_window,
80};
81pub use io::time_index::{
82 TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
83 read_track as time_index_read,
84};
85pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
86pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
87pub use lock::FileLock;
88pub use memvid::{
89 BlobReader, LockSettings, Memvid, OpenReadOptions,
90 mutation::{CommitMode, CommitOptions},
91};
92#[cfg(feature = "parallel_segments")]
93pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
94pub use models::{
95 ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
96 ModelVerifyOptions, verify_model_dir, verify_models,
97};
98pub use reader::{
99 DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics, ReaderHint,
100 ReaderOutput, ReaderRegistry,
101};
102pub use signature::{
103 parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
104};
105pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
106#[cfg(feature = "temporal_track")]
107pub use types::{
108 AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
109 TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
110 TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
111 TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
112};
113pub use types::{
114 AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
115 AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
116 DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
117 DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
118 DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
119 DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
120 EmbeddingIdentity, EmbeddingIdentityCount, EmbeddingIdentitySummary, Frame, FrameId, FrameRole,
121 FrameStatus, Header, IndexManifests, LexIndexManifest,
122 LexSegmentDescriptor, MediaManifest, MemvidHandle, Open, PutOptions, PutOptionsBuilder, Sealed,
123 SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams, SearchRequest, SearchResponse,
124 SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta, SegmentSpan, SourceSpan, Stats,
125 TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier, TimeIndexManifest,
126 TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder, Toc, VecEmbedder,
127 VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
128 VerificationReport, VerificationStatus,
129 MEMVID_EMBEDDING_DIMENSION_KEY, MEMVID_EMBEDDING_MODEL_KEY, MEMVID_EMBEDDING_NORMALIZED_KEY,
130 MEMVID_EMBEDDING_PROVIDER_KEY,
131};
132pub use types::{
134 EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
135 MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
136 MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
137};
138pub use types::{
140 EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
141 LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
142};
143pub use types::{
145 Cardinality, PredicateId, PredicateSchema, SchemaError, SchemaRegistry, ValueType,
146};
147pub use memvid::memory::SchemaSummaryEntry;
149pub use analysis::ner::{
151 ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL,
152 NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
153 is_ner_model_installed, ner_model_path, ner_tokenizer_path, NER_MODELS,
154};
155#[cfg(feature = "logic_mesh")]
156pub use analysis::ner::NerModel;
157pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
159pub use triplet::{ExtractionMode, ExtractionStats, TripletExtractor};
161pub use graph_search::{GraphMatcher, QueryPlanner, hybrid_search};
163pub use types::{
165 BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
166 EmbeddingResult,
167};
168pub use types::reranker::{
170 Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
171};
172#[cfg(feature = "parallel_segments")]
173pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
174pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
175pub use vec_pq::{
176 CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
177 QuantizedVecIndexBuilder,
178};
179pub use clip::{
181 CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
182 ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
183 ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
184 get_model_info,
185};
186#[cfg(feature = "clip")]
188pub use clip::{ClipModel, calculate_color_variance, get_image_info};
189pub use whisper::{
191 TranscriptionResult, TranscriptionSegment, WhisperConfig, WhisperError,
192 WhisperModelInfo, WHISPER_MODELS, default_whisper_model_info, get_whisper_model_info,
193};
194#[cfg(feature = "whisper")]
196pub use whisper::{WHISPER_SAMPLE_RATE, WhisperTranscriber, decode_audio_file};
197pub use structure::{
199 ChunkType, ChunkingOptions, ChunkingResult, StructuralChunker, StructuredChunk,
200 StructuredDocument, TableChunkingStrategy, chunk_structured, detect_structure,
201};
202pub use types::adaptive::{
204 AdaptiveConfig, AdaptiveResult, AdaptiveStats, CutoffStrategy, find_adaptive_cutoff,
205 normalize_scores,
206};
207pub use replay::{
209 ActionType, Checkpoint, ReplayAction, ReplayManifest, ReplaySession, SessionSummary,
210 StateSnapshot, REPLAY_SEGMENT_MAGIC, REPLAY_SEGMENT_VERSION,
211};
212#[cfg(feature = "replay")]
214pub use replay::{
215 ActiveSession, ComparisonReport, ComparisonSummary, Divergence, DivergenceType, ModelResult,
216 ReplayConfig, ReplayOptions, ReplayResult,
217};
218
219#[cfg(test)]
220use once_cell::sync::Lazy;
221use std::fs::File;
222use std::io::Cursor;
223use std::path::Path;
224#[cfg(test)]
225use std::sync::Mutex;
226
227use bincode::config::{self, Config};
228use io::header::HeaderCodec;
229
/// Upper bound on the length of a preview produced by `truncate_preview`.
///
/// NOTE(review): despite the `_BYTES` suffix, `truncate_preview` applies this
/// limit via `chars().take(..)`, so it is a count of `char`s rather than bytes.
const TIMELINE_PREVIEW_BYTES: usize = 120;
/// 512 MiB. NOTE(review): presumably the size cap for a serialized index; the
/// constant is not referenced in this part of the file — confirm at the use sites.
const MAX_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 512 MiB. NOTE(review): presumably the size cap for a serialized time index —
/// confirm at the use sites.
const MAX_TIME_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 256 MiB. NOTE(review): presumably the size cap for a single frame payload —
/// confirm at the use sites.
const MAX_FRAME_BYTES: u64 = 256 * 1024 * 1024;
/// 32,768. NOTE(review): presumably the default limit on search text kept per
/// frame (unit not visible here) — confirm at the use sites.
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32_768;
235
/// Lazily created mutex that `run_serial_test` locks for the duration of a test
/// body. It guards no data (`()`); it only exists to provide mutual exclusion.
#[cfg(test)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
238
/// Runs `f` while holding `SERIAL_TEST_MUTEX` and returns whatever `f` returns,
/// so test bodies wrapped in this helper never run concurrently with each other.
///
/// A test that panics while holding the guard poisons the mutex. The guarded
/// value is `()`, so no state can have been left inconsistent; the poison flag
/// is therefore ignored instead of turning one failing test into a cascade of
/// unrelated "mutex poisoned" failures in every later serial test.
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
    let _guard = SERIAL_TEST_MUTEX
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    f()
}
246
247impl Memvid {
248 #[cfg(feature = "lex")]
249 fn tantivy_index_pending(&self) -> bool {
250 self.tantivy_dirty
251 }
252
253 #[cfg(not(feature = "lex"))]
254 fn tantivy_index_pending(&self) -> bool {
255 false
256 }
257
258 #[cfg(feature = "lex")]
259 fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
260 if !self.tantivy_dirty {
261 return Ok(());
262 }
263 if let Some(engine) = self.tantivy.as_mut() {
264 engine.commit()?;
265 if embed_snapshot {
266 let snapshot = engine.snapshot_segments()?;
267 self.update_embedded_lex_snapshot(snapshot)?;
268 }
269 }
270 self.tantivy_dirty = false;
271 Ok(())
272 }
273
274 #[cfg(feature = "lex")]
275 fn flush_tantivy(&mut self) -> Result<()> {
276 self.flush_tantivy_conditional(true)
277 }
278
279 #[cfg(feature = "lex")]
280 #[allow(dead_code)]
281 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
282 self.flush_tantivy_conditional(false)
283 }
284
285 #[cfg(not(feature = "lex"))]
286 fn flush_tantivy(&mut self) -> Result<()> {
287 Ok(())
288 }
289
290 #[cfg(not(feature = "lex"))]
291 #[allow(dead_code)]
292 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
293 Ok(())
294 }
295 pub fn path(&self) -> &Path {
296 &self.path
297 }
298
299 pub fn lock_handle(&self) -> &FileLock {
300 &self.lock
301 }
302
303 pub fn is_read_only(&self) -> bool {
304 self.read_only
305 }
306
307 pub(crate) fn ensure_writable(&mut self) -> Result<()> {
308 if self.read_only {
309 self.lock.upgrade_to_exclusive()?;
310 self.read_only = false;
311 }
312 Ok(())
313 }
314
315 pub fn downgrade_to_shared(&mut self) -> Result<()> {
316 if self.read_only {
317 return Ok(());
318 }
319 if self.dirty || self.tantivy_index_pending() {
320 return Ok(());
321 }
322 self.lock.downgrade_to_shared()?;
323 self.read_only = true;
324 Ok(())
325 }
326}
327
328impl Drop for Memvid {
329 fn drop(&mut self) {
330 if self.dirty {
331 let _ = self.commit();
332 }
333 #[cfg(feature = "parallel_segments")]
335 {
336 use crate::memvid::lifecycle::cleanup_manifest_wal_public;
337 cleanup_manifest_wal_public(self.path());
338 }
339 }
340}
341
/// Writes `header` to `file` by delegating to `HeaderCodec::write`.
///
/// # Errors
///
/// Propagates whatever error `HeaderCodec::write` returns.
pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
    HeaderCodec::write(file, header)
}
345
346fn wal_config() -> impl Config {
347 config::standard()
348 .with_fixed_int_encoding()
349 .with_little_endian()
350}
351
352pub(crate) fn decode_canonical_bytes(
353 payload: &[u8],
354 encoding: CanonicalEncoding,
355 frame_id: FrameId,
356) -> Result<Vec<u8>> {
357 match encoding {
358 CanonicalEncoding::Plain => Ok(payload.to_vec()),
359 CanonicalEncoding::Zstd => {
360 zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
361 frame_id,
362 reason: "failed to decode canonical payload",
363 })
364 }
365 }
366}
367
368pub(crate) fn default_uri(frame_id: FrameId) -> String {
369 format!("mv2://frames/{frame_id}")
370}
371
/// Derives a human-readable title from the last path segment of `uri`.
///
/// Steps, in order:
/// 1. surrounding whitespace is trimmed;
/// 2. everything up to and including the first `://` is dropped;
/// 3. the fragment (from the first `#`) and then the query (from the first
///    `?`) are dropped;
/// 4. trailing `/` are removed and the last `/`-separated segment is taken;
/// 5. the extension (text after the last `.`) is removed;
/// 6. the remainder is split on `-`, `_` and spaces, and every word gets an
///    ASCII-uppercased first character and an ASCII-lowercased rest.
///
/// Non-ASCII characters are passed through unchanged.
///
/// Examples: `mv2://docs/pricing.md` gives `Pricing`;
/// `reports/q3_sales-summary.pdf` gives `Q3 Sales Summary`.
///
/// Returns `None` when nothing usable remains: blank input, no path segment,
/// a segment that is only an extension (such as `.gitignore`), or a stem made
/// up solely of separators.
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
    let trimmed = uri.trim();
    if trimmed.is_empty() {
        return None;
    }

    let without_scheme = trimmed
        .split_once("://")
        .map_or(trimmed, |(_, rest)| rest);
    let without_fragment = without_scheme
        .split_once('#')
        .map_or(without_scheme, |(head, _)| head);
    let path = without_fragment
        .split_once('?')
        .map_or(without_fragment, |(head, _)| head)
        .trim_end_matches('/');

    let segment = path.rsplit_once('/').map_or(path, |(_, last)| last).trim();
    if segment.is_empty() {
        return None;
    }

    // Only the last extension is removed: "archive.tar.gz" keeps "archive.tar".
    let stem = segment
        .rsplit_once('.')
        .map_or(segment, |(stem, _)| stem)
        .trim();
    if stem.is_empty() {
        return None;
    }

    let words: Vec<String> = stem
        .split(['-', '_', ' '])
        .filter_map(|part| {
            // Empty parts (from consecutive separators) yield no first char
            // and are skipped here.
            let mut chars = part.chars();
            let first = chars.next()?.to_ascii_uppercase();
            let rest = chars.as_str().to_ascii_lowercase();
            Some(format!("{first}{rest}"))
        })
        .collect();

    if words.is_empty() {
        None
    } else {
        Some(words.join(" "))
    }
}
425
426fn truncate_preview(text: &str) -> String {
427 text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
428}
429
430fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
431 let mime = meta.mime.as_deref()?;
432 if !mime.starts_with("image/") {
433 return None;
434 }
435
436 if let Some(caption) = meta.caption.as_ref() {
437 let trimmed = caption.trim();
438 if !trimmed.is_empty() {
439 return Some(truncate_preview(trimmed));
440 }
441 }
442
443 let mut segments: Vec<String> = Vec::new();
444 if let (Some(w), Some(h)) = (meta.width, meta.height) {
445 segments.push(format!("{}×{} px", w, h));
446 }
447 if let Some(exif) = meta.exif.as_ref() {
448 if let Some(model) = exif
449 .model
450 .as_ref()
451 .map(|s| s.trim())
452 .filter(|s| !s.is_empty())
453 {
454 segments.push(model.to_string());
455 } else if let Some(make) = exif
456 .make
457 .as_ref()
458 .map(|s| s.trim())
459 .filter(|s| !s.is_empty())
460 {
461 segments.push(make.to_string());
462 }
463
464 if let Some(datetime) = exif
465 .datetime
466 .as_ref()
467 .map(|s| s.trim())
468 .filter(|s| !s.is_empty())
469 {
470 segments.push(datetime.to_string());
471 }
472 }
473
474 if segments.is_empty() {
475 return Some("Image frame".to_string());
476 }
477
478 Some(truncate_preview(&segments.join(" · ")))
479}
480
481#[cfg(test)]
482mod tests {
483 use super::*;
484 use std::io::Read;
485 use std::num::NonZeroU64;
486 use tempfile::tempdir;
487
/// Writes one frame, commits, reopens the file, and checks the stats, the
/// timeline preview, and the WAL counters reported by the reopened handle.
#[test]
fn create_put_commit_reopen() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("memory.mv2");

        // The writer lives in its own scope so it is dropped before reopening.
        {
            let mut writer = Memvid::create(&file_path).expect("create");
            let first_seq = writer.put_bytes(b"hello").expect("put");
            assert_eq!(first_seq, 1);
            writer.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&file_path).expect("open");

        let stats = reopened.stats().expect("stats");
        assert_eq!(stats.frame_count, 1);
        assert!(stats.has_time_index);

        let timeline = reopened
            .timeline(TimelineQuery::default())
            .expect("timeline");
        assert_eq!(timeline.len(), 1);
        assert!(timeline[0].preview.contains("hello"));

        let wal_stats = reopened.wal.stats();
        assert_eq!(wal_stats.pending_bytes, 0);
        assert_eq!(wal_stats.sequence, 2);
    });
}
518
/// Stores "alpha" then "beta" and checks that a timeline limited to one entry
/// yields "alpha" in forward order and "beta" in reverse order.
#[test]
fn timeline_limit_and_reverse() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("timeline.mv2");

        {
            let mut writer = Memvid::create(&file_path).expect("create");
            writer.put_bytes(b"alpha").expect("put alpha");
            writer.put_bytes(b"beta").expect("put beta");
            writer.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&file_path).expect("open");

        let forward_query = TimelineQuery {
            limit: NonZeroU64::new(1),
            since: None,
            until: None,
            reverse: false,
            #[cfg(feature = "temporal_track")]
            temporal: None,
        };
        let limited = reopened.timeline(forward_query).expect("timeline limit");
        assert_eq!(limited.len(), 1);
        assert!(limited[0].preview.contains("alpha"));

        let backward_query = TimelineQuery {
            limit: NonZeroU64::new(1),
            since: None,
            until: None,
            reverse: true,
            #[cfg(feature = "temporal_track")]
            temporal: None,
        };
        let reversed = reopened
            .timeline(backward_query)
            .expect("timeline reverse");
        assert_eq!(reversed.len(), 1);
        assert!(reversed[0].preview.contains("beta"));
    });
}
559
/// Indexes two documents with the lexical index enabled and checks that each
/// query finds exactly one hit, both before and after reopening the file.
#[test]
fn lex_search_roundtrip() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("lex.mv2");

        // Both searches use the same request shape and differ only in the query.
        let request_for = |query: &str| SearchRequest {
            query: query.to_string(),
            top_k: 10,
            snippet_chars: 200,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable");
        mem.put_bytes(b"Rust memory engine").expect("put");
        mem.put_bytes(b"Deterministic WAL").expect("put2");
        mem.commit().expect("commit");

        let live_response = mem.search(request_for("memory")).expect("search");
        assert_eq!(live_response.hits.len(), 1);

        drop(mem);

        let mut reopened = Memvid::open(&file_path).expect("open");
        let reopened_response = reopened
            .search(request_for("wal"))
            .expect("search reopened");
        assert_eq!(reopened_response.hits.len(), 1);
    });
}
607
/// Stores two frames with orthogonal embeddings and checks that the nearest
/// neighbour of each embedding is its own frame, before and after a reopen.
#[test]
fn vec_search_roundtrip() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("vec.mv2");

        let mut writer = Memvid::create(&file_path).expect("create");
        writer.enable_vec().expect("enable");
        writer
            .put_with_embedding(b"vector", vec![0.0, 1.0])
            .expect("put");
        writer
            .put_with_embedding(b"vector-two", vec![1.0, 0.0])
            .expect("put2");
        writer.commit().expect("commit");

        let committed_stats = writer.stats().expect("stats");
        assert!(
            committed_stats.has_vec_index,
            "vec index should exist after commit"
        );

        let live_hits = writer.search_vec(&[0.0, 1.0], 5).expect("search");
        assert_eq!(live_hits.first().map(|hit| hit.frame_id), Some(0));

        drop(writer);

        let mut reopened = Memvid::open(&file_path).expect("open");
        let reopened_stats = reopened.stats().expect("stats reopen");
        assert!(
            reopened_stats.has_vec_index,
            "vec index should exist after reopen: has_manifest={}, vec_enabled={}",
            reopened.toc.indexes.vec.is_some(),
            reopened.vec_enabled
        );

        let reopened_hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
        assert_eq!(reopened_hits.first().map(|hit| hit.frame_id), Some(1));
    });
}
642
/// Checks that the byte ranges reported for a search hit (snippet range and
/// chunk range) index the frame's canonical content exactly.
#[test]
fn search_snippet_ranges_match_bytes() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("search.mv2");
        let text = "Capacity tickets are signed grants that raise per-file caps.";

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable lex");
        let put_options = PutOptions::builder()
            .uri("mv2://docs/pricing.md")
            .title("Pricing")
            .build();
        mem.put_bytes_with_options(text.as_bytes(), put_options)
            .expect("put doc");
        mem.commit().expect("commit");

        let request = SearchRequest {
            query: "capacity tickets".into(),
            top_k: 5,
            snippet_chars: 160,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };
        let response = mem.search(request).expect("search");
        assert_eq!(response.total_hits, 1);
        assert_eq!(response.engine, SearchEngineKind::Tantivy);

        let hit = response.hits.first().expect("hit");
        let stored_frame = mem
            .toc
            .frames
            .get(hit.frame_id as usize)
            .cloned()
            .expect("frame");
        let canonical = mem.frame_content(&stored_frame).expect("content");
        let bytes = canonical.as_bytes();

        // The snippet range must address the snippet text inside the content.
        let (start, end) = hit.range;
        assert!(end <= bytes.len());
        assert_eq!(hit.text.as_bytes(), &bytes[start..end]);

        // The chunk range must enclose the snippet and match the chunk text.
        let chunk = hit.chunk_range.expect("chunk range");
        assert!(chunk.0 <= start);
        assert!(chunk.1 >= end);
        let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
        let expected_chunk = &canonical[chunk.0..chunk.1];
        assert_eq!(chunk_text, expected_chunk);
    });
}
697
/// Puts the search target behind a long filler prefix and checks that the
/// reported chunk range starts past offset zero, encloses the snippet, and
/// matches the corresponding slice of the original content.
#[test]
fn search_chunk_range_reflects_chunk_offset() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("chunked.mv2");

        let prefix = "alpha beta gamma delta. ".repeat(200);
        let content =
            format!("{prefix}target segment appears here. Trailing context for verification.");

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable lex");
        let put_options = PutOptions::builder()
            .uri("mv2://docs/manual.txt")
            .title("Manual")
            .build();
        mem.put_bytes_with_options(content.as_bytes(), put_options)
            .expect("put doc");
        mem.commit().expect("commit");

        let request = SearchRequest {
            query: "target segment".into(),
            top_k: 5,
            snippet_chars: 160,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };
        let response = mem.search(request).expect("search");

        let hit = response.hits.first().expect("hit");
        assert_eq!(response.engine, SearchEngineKind::Tantivy);

        let chunk_range = hit.chunk_range.expect("chunk range");
        assert!(chunk_range.0 > 0);
        assert!(hit.range.0 >= chunk_range.0);
        assert!(hit.range.1 <= chunk_range.1);
        assert!(hit.text.contains("target segment"));

        let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
        let expected_chunk = &content[chunk_range.0..chunk_range.1];
        assert_eq!(chunk_text, expected_chunk);
    });
}
746
/// Stores a frame with auto-tagging and date extraction switched on and checks
/// that the committed frame carries tags and a content date containing "2024".
#[test]
fn auto_tag_populates_frame_metadata() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("autotag.mv2");

        let tagging_options = PutOptions::builder()
            .search_text("Neural networks planning session 2024-10-08")
            .auto_tag(true)
            .extract_dates(true)
            .build();

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable lex");
        mem.put_bytes_with_options(b"agenda", tagging_options)
            .expect("put bytes");
        mem.commit().expect("commit");

        let frame = mem.toc.frames.first().expect("frame present");
        assert!(!frame.tags.is_empty());
        assert!(frame.content_dates.iter().any(|date| date.contains("2024")));
    });
}
770
/// Stores three documents under different URIs and checks that the `uri`
/// filter only returns the exact URI and the `scope` filter only returns URIs
/// with the given prefix.
#[test]
fn search_filters_by_uri_and_scope() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("filters.mv2");

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable lex");

        // (uri, title, body, message used if the put fails)
        let documents = [
            (
                "mv2://docs/pricing.md",
                "Pricing",
                "Capacity tickets add per-file allowances",
                "put a",
            ),
            (
                "mv2://docs/faq.md",
                "FAQ",
                "Tickets can be issued by admins",
                "put b",
            ),
            (
                "mv2://blog/launch.md",
                "Launch",
                "Launch day tickets boost visibility",
                "put c",
            ),
        ];
        for (uri, title, body, failure_label) in documents {
            let put_options = PutOptions::builder().uri(uri).title(title).build();
            mem.put_bytes_with_options(body.as_bytes(), put_options)
                .expect(failure_label);
        }

        mem.commit().expect("commit");

        // Same query every time; only the URI and scope filters vary.
        let tickets_request = |uri, scope| SearchRequest {
            query: "tickets".into(),
            top_k: 10,
            snippet_chars: 120,
            uri,
            scope,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };

        let uri_response = mem
            .search(tickets_request(Some("mv2://docs/pricing.md".into()), None))
            .expect("uri search");
        assert_eq!(uri_response.engine, SearchEngineKind::Tantivy);
        assert!(
            uri_response
                .hits
                .iter()
                .all(|hit| hit.uri == "mv2://docs/pricing.md")
        );

        let scope_response = mem
            .search(tickets_request(None, Some("mv2://docs/".into())))
            .expect("scope search");
        assert_eq!(scope_response.engine, SearchEngineKind::Tantivy);
        assert!(
            scope_response
                .hits
                .iter()
                .all(|hit| hit.uri.starts_with("mv2://docs/"))
        );
    });
}
848
/// Fetches two pages of one hit each and checks the echoed parameters, that
/// the second page returns a different frame, and that the total hit count is
/// the same on both pages.
#[test]
fn search_pagination_and_params() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("paging.mv2");

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable lex");

        let bodies = [
            "tickets unlock tier upgrades",
            "tickets expire after 30 days",
            "tickets may be revoked",
        ];
        for (idx, text) in bodies.iter().enumerate() {
            let uri = format!("mv2://docs/doc{idx}.md");
            let put_options = PutOptions::builder()
                .uri(&uri)
                .title(format!("Doc {idx}"))
                .build();
            mem.put_bytes_with_options(text.as_bytes(), put_options)
                .expect("put doc");
        }

        mem.commit().expect("commit");

        // Both pages share every parameter except the cursor.
        let page_request = |cursor| SearchRequest {
            query: "tickets".into(),
            top_k: 1,
            snippet_chars: 90,
            uri: None,
            scope: None,
            cursor,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };

        let first_page = mem.search(page_request(None)).expect("page one");
        assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
        assert_eq!(first_page.hits.len(), 1);
        assert_eq!(first_page.params.top_k, 1);
        assert_eq!(first_page.params.snippet_chars, 90);
        assert!(first_page.total_hits >= first_page.hits.len());
        let cursor = first_page.next_cursor.clone().expect("cursor");
        let first_id = first_page.hits[0].frame_id;

        let second_page = mem.search(page_request(Some(cursor))).expect("page two");
        assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
        assert_eq!(second_page.hits.len(), 1);
        assert_ne!(second_page.hits[0].frame_id, first_id);
        assert_eq!(second_page.total_hits, first_page.total_hits);
    });
}
919
/// Checks that after a commit with the lexical index enabled the Tantivy
/// engine is present and is the engine that answers a search.
///
/// NOTE(review): despite its name, this test never removes Tantivy and does
/// not exercise a fallback path; it only asserts the Tantivy-backed result.
#[cfg(feature = "lex")]
#[test]
fn search_falls_back_when_tantivy_missing() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("fallback.mv2");

        let mut mem = Memvid::create(&file_path).expect("create");
        mem.enable_lex().expect("enable lex");
        mem.put_bytes(b"tickets fallback test").expect("put");
        mem.commit().expect("commit");

        assert!(
            mem.tantivy.is_some(),
            "Tantivy should be initialized after commit"
        );

        let request = SearchRequest {
            query: "tickets".into(),
            top_k: 5,
            snippet_chars: 120,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };
        let response = mem.search(request).expect("search with tantivy");

        assert_eq!(response.engine, SearchEngineKind::Tantivy);
        assert!(!response.hits.is_empty());
    });
}
958
/// Writes a file with both the lexical and the vector index enabled and checks
/// that `Memvid::verify` reports an overall `Passed` status for it.
#[test]
fn verify_reports_success() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("verify.mv2");

        let mut writer = Memvid::create(&file_path).expect("create");
        writer.enable_lex().expect("enable lex");
        writer.enable_vec().expect("enable vec");
        writer
            .put_with_embedding(b"check", vec![0.5, 0.1])
            .expect("put");
        writer.commit().expect("commit");
        // Drop the writer before verifying the file on disk.
        drop(writer);

        let report = Memvid::verify(&file_path, true).expect("verify");
        assert_eq!(report.overall_status, VerificationStatus::Passed);
    });
}
978
/// Creates a file without enabling any index explicitly, reopens it, and
/// checks that the lexical and vector indexes are reported as present for
/// whichever of the `lex` and `vec` features are compiled in.
#[test]
fn test_create_enables_indexes_by_default() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("default_indexes.mv2");

        {
            let fresh = Memvid::create(&file_path).expect("create");
            let fresh_stats = fresh.stats().expect("stats");
            println!(
                "After create (before drop): lex={}, vec={}",
                fresh_stats.has_lex_index, fresh_stats.has_vec_index
            );
        }

        let reopened = Memvid::open(&file_path).expect("reopen");
        let reopened_stats = reopened.stats().expect("stats after reopen");
        println!(
            "After reopen: lex={}, vec={}",
            reopened_stats.has_lex_index, reopened_stats.has_vec_index
        );

        #[cfg(feature = "lex")]
        assert!(
            reopened_stats.has_lex_index,
            "lex index should be enabled by default"
        );

        #[cfg(feature = "vec")]
        assert!(
            reopened_stats.has_vec_index,
            "vec index should be enabled by default"
        );
    });
}
1018
/// Overwrites the time index bytes of a committed file with zeros, then runs
/// `Memvid::doctor` with `rebuild_time_index` enabled and checks that the
/// reopened file has a time index manifest again.
#[test]
fn doctor_rebuilds_time_index() {
    use std::fs::OpenOptions;
    use std::io::{Seek, SeekFrom, Write};

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("doctor.mv2");

        // Phase 1: create a committed file and capture the time index manifest
        // (its offset and length are needed to corrupt the right bytes). The
        // handle is dropped at the end of this block.
        let manifest = {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"repair").expect("put");
            mem.commit().expect("commit");
            mem.rebuild_indexes(&[]).expect("rebuild");
            mem.commit().expect("commit after rebuild");
            println!(
                "test: post-commit header footer_offset={}",
                mem.header.footer_offset
            );
            println!(
                "test: post-commit manifest offset={} length={}",
                mem.toc
                    .time_index
                    .as_ref()
                    .map(|m| m.bytes_offset)
                    .unwrap_or(0),
                mem.toc
                    .time_index
                    .as_ref()
                    .map(|m| m.bytes_length)
                    .unwrap_or(0)
            );
            mem.toc.time_index.clone().expect("time index manifest")
        };

        // Phase 2: zero out exactly the byte range the manifest points at and
        // sync the change to disk before anything reads the file again.
        {
            let mut file = OpenOptions::new()
                .read(true)
                .write(true)
                .open(&path)
                .expect("open file");
            file.seek(SeekFrom::Start(manifest.bytes_offset))
                .expect("seek");
            let zeros = vec![0u8; manifest.bytes_length as usize];
            file.write_all(&zeros).expect("corrupt time index");
            file.flush().expect("flush");
            file.sync_all().expect("sync");
        }

        // Diagnostic output only: shows what the footer scan finds in the
        // corrupted file.
        println!(
            "test: footer scan: {:?}",
            crate::footer::find_last_valid_footer(&std::fs::read(&path).expect("read file"))
                .as_ref()
                .map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
        );
        println!("test: verifying corrupted memory");
        // Phase 3: verification may either return a failed report or an error;
        // both outcomes are accepted, only a non-failed report fails the test.
        match Memvid::verify(&path, false) {
            Ok(report) => {
                assert_eq!(report.overall_status, VerificationStatus::Failed);
            }
            Err(e) => {
                println!("test: verify failed with error (expected): {}", e);
            }
        }

        // Phase 4: repair. Only the time index rebuild is requested; the lex
        // index rebuild is explicitly switched off.
        println!("test: running doctor");
        let report = Memvid::doctor(
            &path,
            DoctorOptions {
                rebuild_time_index: true,
                rebuild_lex_index: false,
                ..DoctorOptions::default()
            },
        )
        .expect("doctor");
        // The doctor status is printed but not asserted on.
        println!("test: doctor completed with status: {:?}", report.status);
        println!("test: verifying repaired memory");
        let reopened = Memvid::open(&path).expect("reopen after doctor");
        assert!(
            reopened.toc.time_index.is_some(),
            "time index should exist after doctor"
        );
    });
}
1109
/// Stores a binary payload together with a media manifest and checks that,
/// after reopening, the blob reader returns the identical bytes and the
/// manifest looked up by URI carries every field that was written.
#[test]
fn blob_reader_roundtrip_with_media_manifest() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("blob.mv2");
        // Includes bytes >= 0x80 so the payload is not valid UTF-8 text.
        let payload = vec![0u8, 159, 1, 128, 42, 99, 200];

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: payload.len() as u64,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1234),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        // Built with struct-update syntax rather than by reassigning fields of
        // a `Default` value; `manifest` is moved because nothing reads it later.
        let doc_meta = DocMetadata {
            media: Some(manifest),
            mime: Some("video/mp4".to_string()),
            bytes: Some(payload.len() as u64),
            ..DocMetadata::default()
        };
        assert!(
            !doc_meta.is_empty(),
            "media manifest must count as metadata"
        );

        let options = PutOptions::builder()
            .metadata(doc_meta)
            .kind("video")
            .uri("mv2://video/clip.mp4")
            .build();

        // The writer is dropped at the end of this block, before reopening.
        {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes_with_options(&payload, options)
                .expect("put bytes");
            mem.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&path).expect("open");
        let mut reader = reopened
            .blob_reader_by_uri("mv2://video/clip.mp4")
            .expect("blob reader");
        let mut buffered = Vec::new();
        reader.read_to_end(&mut buffered).expect("read payload");
        assert_eq!(buffered, payload);

        let roundtrip = reopened
            .media_manifest_by_uri("mv2://video/clip.mp4")
            .expect("manifest lookup")
            .expect("manifest present");
        assert_eq!(roundtrip.mime, "video/mp4");
        assert_eq!(roundtrip.kind, "video");
        assert_eq!(roundtrip.bytes, payload.len() as u64);
        assert_eq!(roundtrip.filename.as_deref(), Some("clip.mp4"));
        assert_eq!(roundtrip.duration_ms, Some(1234));
        assert_eq!(roundtrip.width, Some(1920));
        assert_eq!(roundtrip.height, Some(1080));
        assert_eq!(roundtrip.codec.as_deref(), Some("h264"));

        // Keep the temporary directory alive until every handle is done with it.
        drop(dir);
    });
}
1174
/// Stores a 1.6 MB pseudo-random blob as a video frame with full metadata,
/// commits it, reopens the file, and checks that `stats()` reports exactly one
/// frame.
#[test]
fn video_frame_roundtrip_does_not_corrupt_toc() {
    use crate::types::MediaManifest;

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("video.mv2");

        // Deterministic filler: a fixed seed run through shift-and-xor steps,
        // keeping the low byte of the state for each output byte.
        let mut seed = 0xDEAD_BEEF_u64;
        let mut video_bytes = vec![0u8; 1_600_000];
        for byte in &mut video_bytes {
            seed ^= seed << 7;
            seed ^= seed >> 9;
            seed ^= seed << 8;
            *byte = (seed & 0xFF) as u8;
        }
        let video_len = video_bytes.len() as u64;

        let hash_hex = blake3::hash(&video_bytes).to_hex().to_string();

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: video_len,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1_000),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        // Populate only the fields this test cares about; everything else
        // keeps its default value.
        let meta = DocMetadata {
            mime: Some("video/mp4".to_string()),
            bytes: Some(video_len),
            hash: Some(hash_hex),
            caption: Some("Test clip".to_string()),
            media: Some(manifest),
            ..DocMetadata::default()
        };

        let options = PutOptions::builder()
            .kind("video")
            .metadata(meta)
            .tag("kind", "video")
            .uri("mv2://video/test.mp4")
            .title("Test clip")
            .build();

        // Write and commit inside a nested scope so the writer handle is
        // dropped before the file is opened again.
        {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes_with_options(&video_bytes, options)
                .expect("put video");
            mem.commit().expect("commit");
        }

        let reopened = Memvid::open(&path).expect("reopen");
        let stats = reopened.stats().expect("stats");
        assert_eq!(stats.frame_count, 1);
    });
}
1231
/// Applies `Ticket::new("issuer", 2)` twice to a freshly created memory: the
/// first application must succeed and the second must be rejected with
/// `MemvidError::TicketSequence`.
#[test]
fn ticket_sequence_enforced() {
    run_serial_test(|| {
        let tmp = tempdir().expect("tmp");
        let file_path = tmp.path().join("ticket.mv2");
        let mut memory = Memvid::create(&file_path).expect("create");

        let first = Ticket::new("issuer", 2);
        memory.apply_ticket(first).expect("apply first");

        // An identical ticket is offered again and has to be refused.
        let repeated = Ticket::new("issuer", 2);
        let outcome = memory.apply_ticket(repeated);
        let err = outcome.expect_err("sequence must increase");
        assert!(matches!(err, MemvidError::TicketSequence { .. }));
    });
}
1248
/// Applies a ticket whose capacity is 64 bytes above the fresh memory's
/// `data_end`, then checks that a 32-byte put succeeds and is committed while a
/// following 40-byte put fails with `MemvidError::CapacityExceeded`.
#[test]
fn capacity_limit_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("capacity.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        // Capacity is expressed relative to where the data region currently ends.
        let base = mem.data_end;
        mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
            .expect("apply ticket");

        // Fixed-size arrays borrow as slices directly; no heap allocation is
        // needed for these throwaway payloads.
        mem.put_bytes(&[0xFF_u8; 32]).expect("first put");
        mem.commit().expect("commit");

        let err = mem
            .put_bytes(&[0xFF_u8; 40])
            .expect_err("capacity exceeded");
        assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
    });
}
1269}