1#![deny(clippy::all, clippy::pedantic)]
2#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
3#![allow(clippy::module_name_repetitions)]
4
5pub const MEMVID_CORE_VERSION: &str = env!("CARGO_PKG_VERSION");
7
8mod analysis;
9pub mod constants;
10pub mod enrich;
11pub mod enrichment_worker;
12pub mod error;
13pub mod extract;
14pub mod extract_budgeted;
15pub mod footer;
16pub mod io;
17pub mod lex;
18mod lock;
19pub mod lockfile;
20pub mod memvid;
21pub mod models;
22pub mod pii;
23pub mod reader;
24mod registry;
25mod search;
26pub mod signature;
27pub mod structure;
28pub mod table;
29pub mod text;
30mod toc;
31pub mod types;
32pub mod vec;
33pub mod vec_pq;
34
35pub mod triplet;
37
38pub mod graph_search;
40
41pub mod clip;
44
45pub mod whisper;
48
49pub mod replay;
53
54#[cfg(feature = "encryption")]
57pub mod encryption;
58
59#[cfg(feature = "symspell_cleanup")]
61pub mod symspell_cleanup;
62
63#[cfg(test)]
64mod tests_lex_flag;
65
66#[cfg(feature = "temporal_track")]
67pub use analysis::temporal::{
68 TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
69 TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
70};
71#[cfg(feature = "temporal_enrich")]
73pub use analysis::temporal_enrich::{
74 AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
75 TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
76 enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
77};
78pub use constants::*;
79pub use enrichment_worker::{EnrichmentWorkerConfig, EnrichmentWorkerStats};
80pub use error::{MemvidError, Result};
81pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
82pub use footer::{CommitFooter, find_last_valid_footer};
83#[cfg(feature = "temporal_track")]
84pub use io::temporal_index::{
85 append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
86 read_track as temporal_track_read, window as temporal_track_window,
87};
88pub use io::time_index::{
89 TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
90 read_track as time_index_read,
91};
92pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
93pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
94pub use lock::FileLock;
95pub use memvid::{
96 BlobReader, EnrichmentHandle, EnrichmentStats, LockSettings, Memvid, OpenReadOptions,
97 SketchCandidate, SketchSearchOptions, SketchSearchStats,
98 mutation::{CommitMode, CommitOptions},
99 start_enrichment_worker, start_enrichment_worker_with_embeddings,
100};
101#[cfg(feature = "parallel_segments")]
102pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
103pub use models::{
104 ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
105 ModelVerifyOptions, verify_model_dir, verify_models,
106};
107pub use reader::{
108 DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics, ReaderHint,
109 ReaderOutput, ReaderRegistry,
110};
111pub use signature::{
112 parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
113};
114pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
115#[cfg(feature = "temporal_track")]
116pub use types::{
117 AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
118 TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
119 TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
120 TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
121};
122pub use types::{
123 AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
124 AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
125 DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
126 DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
127 DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
128 DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
129 EmbeddingIdentity, EmbeddingIdentityCount, EmbeddingIdentitySummary, Frame, FrameId, FrameRole,
130 FrameStatus, Header, IndexManifests, LexIndexManifest, LexSegmentDescriptor,
131 MEMVID_EMBEDDING_DIMENSION_KEY, MEMVID_EMBEDDING_MODEL_KEY, MEMVID_EMBEDDING_NORMALIZED_KEY,
132 MEMVID_EMBEDDING_PROVIDER_KEY, MediaManifest, MemvidHandle, Open, PutOptions,
133 PutOptionsBuilder, Sealed, SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams,
134 SearchRequest, SearchResponse, SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta,
135 SegmentSpan, SourceSpan, Stats, TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier,
136 TimeIndexManifest, TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder,
137 Toc, VecEmbedder, VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
138 VerificationReport, VerificationStatus,
139};
140pub use types::{
142 EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
143 MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
144 MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
145};
146pub use types::{
148 EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
149 LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
150};
151pub use types::{
153 DEFAULT_HAMMING_THRESHOLD, QuerySketch, SKETCH_TRACK_MAGIC, SKETCH_TRACK_VERSION, SketchEntry,
154 SketchFlags, SketchTrack, SketchTrackHeader, SketchTrackManifest, SketchTrackStats,
155 SketchVariant, build_term_filter, compute_simhash, compute_token_weights, generate_sketch,
156 hash_token, hash_token_u32, read_sketch_track, term_filter_maybe_contains, tokenize_for_sketch,
157 write_sketch_track,
158};
159pub use types::{
161 Cardinality, PredicateId, PredicateSchema, SchemaError, SchemaRegistry, ValueType,
162};
163pub use memvid::memory::SchemaSummaryEntry;
165#[cfg(feature = "logic_mesh")]
167pub use analysis::ner::NerModel;
168pub use analysis::ner::{
169 ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL, NER_MODELS,
170 NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
171 is_ner_model_installed, ner_model_path, ner_tokenizer_path,
172};
173pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
175pub use triplet::{ExtractionMode, ExtractionStats, TripletExtractor};
177pub use graph_search::{GraphMatcher, QueryPlanner, hybrid_search};
179pub use types::{
181 BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
182 EmbeddingResult,
183};
184pub use types::reranker::{
186 Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
187};
188#[cfg(feature = "parallel_segments")]
189pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
190pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
191pub use vec_pq::{
192 CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
193 QuantizedVecIndexBuilder,
194};
195pub use clip::{
197 CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
198 ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
199 ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
200 get_model_info,
201};
202#[cfg(feature = "clip")]
204pub use clip::{ClipModel, calculate_color_variance, get_image_info};
205pub use whisper::{
207 TranscriptionResult, TranscriptionSegment, WHISPER_MODELS, WhisperConfig, WhisperError,
208 WhisperModelInfo, default_whisper_model_info, get_whisper_model_info,
209};
210#[cfg(feature = "whisper")]
212pub use whisper::{WHISPER_SAMPLE_RATE, WhisperTranscriber, decode_audio_file};
213pub use structure::{
215 ChunkType, ChunkingOptions, ChunkingResult, StructuralChunker, StructuredChunk,
216 StructuredDocument, TableChunkingStrategy, chunk_structured, detect_structure,
217};
218pub use types::adaptive::{
220 AdaptiveConfig, AdaptiveResult, AdaptiveStats, CutoffStrategy, find_adaptive_cutoff,
221 normalize_scores,
222};
223pub use replay::{
225 ActionType, Checkpoint, REPLAY_SEGMENT_MAGIC, REPLAY_SEGMENT_VERSION, ReplayAction,
226 ReplayManifest, ReplaySession, SessionSummary, StateSnapshot,
227};
228#[cfg(feature = "replay")]
230pub use replay::{
231 ActiveSession, ComparisonReport, ComparisonSummary, Divergence, DivergenceType, ModelResult,
232 ReplayConfig, ReplayOptions, ReplayResult,
233};
234
235#[cfg(test)]
236use once_cell::sync::Lazy;
237use std::fs::File;
238use std::io::Cursor;
239use std::path::Path;
240#[cfg(test)]
241use std::sync::Mutex;
242
243use bincode::config::{self, Config};
244use io::header::HeaderCodec;
245
/// Cap applied by `truncate_preview` when shortening timeline preview text.
///
/// NOTE(review): despite the `_BYTES` suffix, `truncate_preview` counts `char`s, not bytes.
const TIMELINE_PREVIEW_BYTES: usize = 120;
/// 512 MiB. NOTE(review): presumably the largest index payload accepted; the
/// enforcement site is not visible in this part of the file.
const MAX_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 512 MiB. NOTE(review): presumably the largest time-index payload accepted; the
/// enforcement site is not visible in this part of the file.
const MAX_TIME_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 256 MiB. NOTE(review): presumably the largest single frame accepted; the
/// enforcement site is not visible in this part of the file.
const MAX_FRAME_BYTES: u64 = 256 * 1024 * 1024;
/// 32 KiB (32 768). NOTE(review): presumably the default amount of text kept for
/// search; the usage site is not visible in this part of the file.
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32_768;
251
/// Process-wide lock used by [`run_serial_test`] to keep tests from running concurrently.
#[cfg(test)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));

/// Runs `f` while holding [`SERIAL_TEST_MUTEX`] and returns whatever `f` returns.
///
/// The mutex protects no data (its payload is `()`), so a poisoned lock carries
/// no corrupted state. Poisoning is therefore ignored: otherwise a single
/// panicking test would make every later serial test fail with an unrelated
/// "poisoned" panic and hide the real failure.
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
    let _guard = SERIAL_TEST_MUTEX
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    f()
}
262
263impl Memvid {
264 #[cfg(feature = "lex")]
265 fn tantivy_index_pending(&self) -> bool {
266 self.tantivy_dirty
267 }
268
269 #[cfg(not(feature = "lex"))]
270 fn tantivy_index_pending(&self) -> bool {
271 false
272 }
273
274 #[cfg(feature = "lex")]
275 fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
276 if !self.tantivy_dirty {
277 return Ok(());
278 }
279 if let Some(engine) = self.tantivy.as_mut() {
280 engine.commit()?;
281 if embed_snapshot {
282 let snapshot = engine.snapshot_segments()?;
283 self.update_embedded_lex_snapshot(snapshot)?;
284 }
285 }
286 self.tantivy_dirty = false;
287 Ok(())
288 }
289
290 #[cfg(feature = "lex")]
291 fn flush_tantivy(&mut self) -> Result<()> {
292 self.flush_tantivy_conditional(true)
293 }
294
295 #[cfg(feature = "lex")]
296 #[allow(dead_code)]
297 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
298 self.flush_tantivy_conditional(false)
299 }
300
301 #[cfg(not(feature = "lex"))]
302 fn flush_tantivy(&mut self) -> Result<()> {
303 Ok(())
304 }
305
306 #[cfg(not(feature = "lex"))]
307 #[allow(dead_code)]
308 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
309 Ok(())
310 }
311 pub fn path(&self) -> &Path {
312 &self.path
313 }
314
315 pub fn lock_handle(&self) -> &FileLock {
316 &self.lock
317 }
318
319 pub fn is_read_only(&self) -> bool {
320 self.read_only
321 }
322
323 pub(crate) fn ensure_writable(&mut self) -> Result<()> {
324 if self.read_only {
325 self.lock.upgrade_to_exclusive()?;
326 self.read_only = false;
327 }
328 Ok(())
329 }
330
331 pub fn downgrade_to_shared(&mut self) -> Result<()> {
332 if self.read_only {
333 return Ok(());
334 }
335 if self.dirty || self.tantivy_index_pending() {
336 return Ok(());
337 }
338 self.lock.downgrade_to_shared()?;
339 self.read_only = true;
340 Ok(())
341 }
342}
343
344impl Drop for Memvid {
345 fn drop(&mut self) {
346 if self.dirty {
347 let _ = self.commit();
348 }
349 #[cfg(feature = "parallel_segments")]
351 {
352 use crate::memvid::lifecycle::cleanup_manifest_wal_public;
353 cleanup_manifest_wal_public(self.path());
354 }
355 }
356}
357
358pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
359 HeaderCodec::write(file, header)
360}
361
362fn wal_config() -> impl Config {
363 config::standard()
364 .with_fixed_int_encoding()
365 .with_little_endian()
366}
367
368pub(crate) fn decode_canonical_bytes(
369 payload: &[u8],
370 encoding: CanonicalEncoding,
371 frame_id: FrameId,
372) -> Result<Vec<u8>> {
373 match encoding {
374 CanonicalEncoding::Plain => Ok(payload.to_vec()),
375 CanonicalEncoding::Zstd => {
376 zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
377 frame_id,
378 reason: "failed to decode canonical payload",
379 })
380 }
381 }
382}
383
384pub(crate) fn default_uri(frame_id: FrameId) -> String {
385 format!("mv2://frames/{frame_id}")
386}
387
/// Derives a human-readable title from the last path segment of `uri`.
///
/// Steps, in order:
/// 1. Trim the input; an empty result yields `None`.
/// 2. Drop everything up to and including the first `://`, if present.
/// 3. Drop everything from the first `#` or `?` onwards.
/// 4. Ignore trailing `/` and take the text after the last remaining `/`.
/// 5. Drop the final `.extension`, if any.
/// 6. Split on `-`, `_` and spaces, then capitalise each word (first character
///    upper-cased, the rest lower-cased) and join the words with single spaces.
///
/// Returns `None` whenever a step leaves nothing to work with — for example
/// `"https://"`, a dot-file such as `".gitignore"` (empty stem), or a stem made
/// only of separators.
///
/// Case conversion is ASCII-only: non-ASCII characters are passed through
/// unchanged. Percent-escapes are not decoded.
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
    let trimmed = uri.trim();
    if trimmed.is_empty() {
        return None;
    }

    let without_scheme = trimmed
        .split_once("://")
        .map_or(trimmed, |(_scheme, rest)| rest);
    // Cutting at whichever of `#` / `?` comes first equals stripping the
    // fragment and then the query.
    let resource = without_scheme
        .split(['#', '?'])
        .next()
        .unwrap_or(without_scheme);
    let segment = resource
        .trim_end_matches('/')
        .rsplit('/')
        .next()
        .map(str::trim)?;
    if segment.is_empty() {
        return None;
    }

    let stem = segment
        .rsplit_once('.')
        .map_or(segment, |(name, _extension)| name)
        .trim();
    if stem.is_empty() {
        return None;
    }

    let words: Vec<String> = stem
        .split(['-', '_', ' '])
        .filter(|part| !part.is_empty())
        .map(|part| {
            let mut chars = part.chars();
            let mut word = String::with_capacity(part.len());
            if let Some(first) = chars.next() {
                word.push(first.to_ascii_uppercase());
            }
            word.extend(chars.map(|c| c.to_ascii_lowercase()));
            word
        })
        .collect();

    if words.is_empty() {
        None
    } else {
        Some(words.join(" "))
    }
}
441
442fn truncate_preview(text: &str) -> String {
443 text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
444}
445
446fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
447 let mime = meta.mime.as_deref()?;
448 if !mime.starts_with("image/") {
449 return None;
450 }
451
452 if let Some(caption) = meta.caption.as_ref() {
453 let trimmed = caption.trim();
454 if !trimmed.is_empty() {
455 return Some(truncate_preview(trimmed));
456 }
457 }
458
459 let mut segments: Vec<String> = Vec::new();
460 if let (Some(w), Some(h)) = (meta.width, meta.height) {
461 segments.push(format!("{}×{} px", w, h));
462 }
463 if let Some(exif) = meta.exif.as_ref() {
464 if let Some(model) = exif
465 .model
466 .as_ref()
467 .map(|s| s.trim())
468 .filter(|s| !s.is_empty())
469 {
470 segments.push(model.to_string());
471 } else if let Some(make) = exif
472 .make
473 .as_ref()
474 .map(|s| s.trim())
475 .filter(|s| !s.is_empty())
476 {
477 segments.push(make.to_string());
478 }
479
480 if let Some(datetime) = exif
481 .datetime
482 .as_ref()
483 .map(|s| s.trim())
484 .filter(|s| !s.is_empty())
485 {
486 segments.push(datetime.to_string());
487 }
488 }
489
490 if segments.is_empty() {
491 return Some("Image frame".to_string());
492 }
493
494 Some(truncate_preview(&segments.join(" · ")))
495}
496
497#[cfg(test)]
498mod tests {
499 use super::*;
500 use std::io::Read;
501 use std::num::NonZeroU64;
502 use tempfile::tempdir;
503
504 #[test]
505 fn create_put_commit_reopen() {
506 run_serial_test(|| {
507 let dir = tempdir().expect("tmp");
508 let path = dir.path().join("memory.mv2");
509
510 let mut mem = Memvid::create(&path).expect("create");
511 let seq = mem.put_bytes(b"hello").expect("put");
512 assert_eq!(seq, 1);
513 mem.commit().expect("commit");
514
515 drop(mem);
516
517 let mut reopened = Memvid::open(&path).expect("open");
518 let stats = reopened.stats().expect("stats");
519 assert_eq!(stats.frame_count, 1);
520 assert!(stats.has_time_index);
521
522 let timeline = reopened
523 .timeline(TimelineQuery::default())
524 .expect("timeline");
525 assert_eq!(timeline.len(), 1);
526 assert!(timeline[0].preview.contains("hello"));
527
528 let wal_stats = reopened.wal.stats();
529 assert_eq!(wal_stats.pending_bytes, 0);
530 assert_eq!(wal_stats.sequence, 2);
532 });
533 }
534
535 #[test]
536 fn timeline_limit_and_reverse() {
537 run_serial_test(|| {
538 let dir = tempdir().expect("tmp");
539 let path = dir.path().join("timeline.mv2");
540
541 let mut mem = Memvid::create(&path).expect("create");
542 mem.put_bytes(b"alpha").expect("put alpha");
543 mem.put_bytes(b"beta").expect("put beta");
544 mem.commit().expect("commit");
545 drop(mem);
546
547 let mut reopened = Memvid::open(&path).expect("open");
548 let limited = reopened
549 .timeline(TimelineQuery {
550 limit: NonZeroU64::new(1),
551 since: None,
552 until: None,
553 reverse: false,
554 #[cfg(feature = "temporal_track")]
555 temporal: None,
556 })
557 .expect("timeline limit");
558 assert_eq!(limited.len(), 1);
559 assert!(limited[0].preview.contains("alpha"));
560
561 let reversed = reopened
562 .timeline(TimelineQuery {
563 limit: NonZeroU64::new(1),
564 since: None,
565 until: None,
566 reverse: true,
567 #[cfg(feature = "temporal_track")]
568 temporal: None,
569 })
570 .expect("timeline reverse");
571 assert_eq!(reversed.len(), 1);
572 assert!(reversed[0].preview.contains("beta"));
573 });
574 }
575
576 #[test]
577 fn lex_search_roundtrip() {
578 run_serial_test(|| {
579 let dir = tempdir().expect("tmp");
580 let path = dir.path().join("lex.mv2");
581
582 let mut mem = Memvid::create(&path).expect("create");
583 mem.enable_lex().expect("enable");
584 let _seq1 = mem.put_bytes(b"Rust memory engine").expect("put");
585 let _seq2 = mem.put_bytes(b"Deterministic WAL").expect("put2");
586 mem.commit().expect("commit");
587
588 let request = SearchRequest {
590 query: "memory".to_string(),
591 top_k: 10,
592 snippet_chars: 200,
593 uri: None,
594 scope: None,
595 cursor: None,
596 #[cfg(feature = "temporal_track")]
597 temporal: None,
598 as_of_frame: None,
599 as_of_ts: None,
600 no_sketch: false,
601 };
602 let response = mem.search(request).expect("search");
603 assert_eq!(response.hits.len(), 1);
604
605 drop(mem);
606
607 let mut reopened = Memvid::open(&path).expect("open");
608 let request = SearchRequest {
609 query: "wal".to_string(),
610 top_k: 10,
611 snippet_chars: 200,
612 uri: None,
613 scope: None,
614 cursor: None,
615 #[cfg(feature = "temporal_track")]
616 temporal: None,
617 as_of_frame: None,
618 as_of_ts: None,
619 no_sketch: false,
620 };
621 let response = reopened.search(request).expect("search reopened");
622 assert_eq!(response.hits.len(), 1);
623 });
624 }
625
626 #[test]
627 fn vec_search_roundtrip() {
628 run_serial_test(|| {
629 let dir = tempdir().expect("tmp");
630 let path = dir.path().join("vec.mv2");
631
632 let mut mem = Memvid::create(&path).expect("create");
633 mem.enable_vec().expect("enable");
634 mem.put_with_embedding(b"vector", vec![0.0, 1.0])
635 .expect("put");
636 mem.put_with_embedding(b"vector-two", vec![1.0, 0.0])
637 .expect("put2");
638 mem.commit().expect("commit");
639
640 let stats = mem.stats().expect("stats");
641 assert!(stats.has_vec_index, "vec index should exist after commit");
642
643 let hits = mem.search_vec(&[0.0, 1.0], 5).expect("search");
644 assert_eq!(hits.first().map(|hit| hit.frame_id), Some(0));
645
646 drop(mem);
647
648 let mut reopened = Memvid::open(&path).expect("open");
649 let reopened_stats = reopened.stats().expect("stats reopen");
650 assert!(
651 reopened_stats.has_vec_index,
652 "vec index should exist after reopen: has_manifest={}, vec_enabled={}",
653 reopened.toc.indexes.vec.is_some(),
654 reopened.vec_enabled
655 );
656 let hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
657 assert_eq!(hits.first().map(|hit| hit.frame_id), Some(1));
658 });
659 }
660
661 #[test]
662 fn search_snippet_ranges_match_bytes() {
663 run_serial_test(|| {
664 let dir = tempdir().expect("tmp");
665 let path = dir.path().join("search.mv2");
666
667 let mut mem = Memvid::create(&path).expect("create");
668 mem.enable_lex().expect("enable lex");
669 let options = PutOptions::builder()
670 .uri("mv2://docs/pricing.md")
671 .title("Pricing")
672 .build();
673 let text = "Capacity tickets are signed grants that raise per-file caps.";
674 mem.put_bytes_with_options(text.as_bytes(), options)
675 .expect("put doc");
676 mem.commit().expect("commit");
677
678 let response = mem
679 .search(SearchRequest {
680 query: "capacity tickets".into(),
681 top_k: 5,
682 snippet_chars: 160,
683 uri: None,
684 scope: None,
685 cursor: None,
686 #[cfg(feature = "temporal_track")]
687 temporal: None,
688 as_of_frame: None,
689 as_of_ts: None,
690 no_sketch: false,
691 })
692 .expect("search");
693
694 assert_eq!(response.total_hits, 1);
695 assert_eq!(response.engine, SearchEngineKind::Tantivy);
696 let hit = response.hits.first().expect("hit");
697 let frame = mem
698 .toc
699 .frames
700 .get(hit.frame_id as usize)
701 .cloned()
702 .expect("frame");
703 let canonical = mem.frame_content(&frame).expect("content");
704 let bytes = canonical.as_bytes();
705 let (start, end) = hit.range;
706 assert!(end <= bytes.len());
707 assert_eq!(hit.text.as_bytes(), &bytes[start..end]);
708 let chunk = hit.chunk_range.expect("chunk range");
709 assert!(chunk.0 <= start);
710 assert!(chunk.1 >= end);
711 let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
712 let chunk_slice = &canonical[chunk.0..chunk.1];
713 assert_eq!(chunk_text, chunk_slice);
714 });
715 }
716
717 #[test]
718 fn search_chunk_range_reflects_chunk_offset() {
719 run_serial_test(|| {
720 let dir = tempdir().expect("tmp");
721 let path = dir.path().join("chunked.mv2");
722
723 let mut mem = Memvid::create(&path).expect("create");
724 mem.enable_lex().expect("enable lex");
725
726 let options = PutOptions::builder()
727 .uri("mv2://docs/manual.txt")
728 .title("Manual")
729 .build();
730 let prefix = "alpha beta gamma delta. ".repeat(200);
731 let content = format!(
732 "{}target segment appears here. Trailing context for verification.",
733 prefix
734 );
735 mem.put_bytes_with_options(content.as_bytes(), options)
736 .expect("put doc");
737 mem.commit().expect("commit");
738
739 let response = mem
740 .search(SearchRequest {
741 query: "target segment".into(),
742 top_k: 5,
743 snippet_chars: 160,
744 uri: None,
745 scope: None,
746 cursor: None,
747 #[cfg(feature = "temporal_track")]
748 temporal: None,
749 as_of_frame: None,
750 as_of_ts: None,
751 no_sketch: false,
752 })
753 .expect("search");
754
755 let hit = response.hits.first().expect("hit");
756 assert_eq!(response.engine, SearchEngineKind::Tantivy);
757 let chunk_range = hit.chunk_range.expect("chunk range");
758 assert!(chunk_range.0 > 0);
759 assert!(hit.range.0 >= chunk_range.0);
760 assert!(hit.range.1 <= chunk_range.1);
761 assert!(hit.text.contains("target segment"));
762 let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
763 assert_eq!(chunk_text, &content[chunk_range.0..chunk_range.1]);
764 });
765 }
766
767 #[test]
768 fn auto_tag_populates_frame_metadata() {
769 run_serial_test(|| {
770 let dir = tempdir().expect("tmp");
771 let path = dir.path().join("autotag.mv2");
772
773 let mut mem = Memvid::create(&path).expect("create");
774 mem.enable_lex().expect("enable lex");
775
776 let options = PutOptions::builder()
777 .search_text("Neural networks planning session 2024-10-08")
778 .auto_tag(true)
779 .extract_dates(true)
780 .build();
781 mem.put_bytes_with_options(b"agenda", options)
782 .expect("put bytes");
783 mem.commit().expect("commit");
784
785 let frame = mem.toc.frames.first().expect("frame present");
786 assert!(!frame.tags.is_empty());
787 assert!(frame.content_dates.iter().any(|date| date.contains("2024")));
788 });
789 }
790
791 #[test]
792 fn search_filters_by_uri_and_scope() {
793 run_serial_test(|| {
794 let dir = tempdir().expect("tmp");
795 let path = dir.path().join("filters.mv2");
796
797 let mut mem = Memvid::create(&path).expect("create");
798 mem.enable_lex().expect("enable lex");
799
800 let options_a = PutOptions::builder()
801 .uri("mv2://docs/pricing.md")
802 .title("Pricing")
803 .build();
804 mem.put_bytes_with_options(b"Capacity tickets add per-file allowances", options_a)
805 .expect("put a");
806
807 let options_b = PutOptions::builder()
808 .uri("mv2://docs/faq.md")
809 .title("FAQ")
810 .build();
811 mem.put_bytes_with_options(b"Tickets can be issued by admins", options_b)
812 .expect("put b");
813
814 let options_c = PutOptions::builder()
815 .uri("mv2://blog/launch.md")
816 .title("Launch")
817 .build();
818 mem.put_bytes_with_options(b"Launch day tickets boost visibility", options_c)
819 .expect("put c");
820
821 mem.commit().expect("commit");
822
823 let uri_response = mem
824 .search(SearchRequest {
825 query: "tickets".into(),
826 top_k: 10,
827 snippet_chars: 120,
828 uri: Some("mv2://docs/pricing.md".into()),
829 scope: None,
830 cursor: None,
831 #[cfg(feature = "temporal_track")]
832 temporal: None,
833 as_of_frame: None,
834 as_of_ts: None,
835 no_sketch: false,
836 })
837 .expect("uri search");
838 assert_eq!(uri_response.engine, SearchEngineKind::Tantivy);
839 assert!(
840 uri_response
841 .hits
842 .iter()
843 .all(|hit| hit.uri == "mv2://docs/pricing.md")
844 );
845
846 let scope_response = mem
847 .search(SearchRequest {
848 query: "tickets".into(),
849 top_k: 10,
850 snippet_chars: 120,
851 uri: None,
852 scope: Some("mv2://docs/".into()),
853 cursor: None,
854 #[cfg(feature = "temporal_track")]
855 temporal: None,
856 as_of_frame: None,
857 as_of_ts: None,
858 no_sketch: false,
859 })
860 .expect("scope search");
861 assert_eq!(scope_response.engine, SearchEngineKind::Tantivy);
862 assert!(
863 scope_response
864 .hits
865 .iter()
866 .all(|hit| hit.uri.starts_with("mv2://docs/"))
867 );
868 });
869 }
870
871 #[test]
872 fn search_pagination_and_params() {
873 run_serial_test(|| {
874 let dir = tempdir().expect("tmp");
875 let path = dir.path().join("paging.mv2");
876
877 let mut mem = Memvid::create(&path).expect("create");
878 mem.enable_lex().expect("enable lex");
879
880 for (idx, text) in [
881 "tickets unlock tier upgrades",
882 "tickets expire after 30 days",
883 "tickets may be revoked",
884 ]
885 .iter()
886 .enumerate()
887 {
888 let uri = format!("mv2://docs/doc{idx}.md");
889 let options = PutOptions::builder()
890 .uri(&uri)
891 .title(format!("Doc {idx}"))
892 .build();
893 mem.put_bytes_with_options(text.as_bytes(), options)
894 .expect("put doc");
895 }
896
897 mem.commit().expect("commit");
898
899 let first_page = mem
900 .search(SearchRequest {
901 query: "tickets".into(),
902 top_k: 1,
903 snippet_chars: 90,
904 uri: None,
905 scope: None,
906 cursor: None,
907 #[cfg(feature = "temporal_track")]
908 temporal: None,
909 as_of_frame: None,
910 as_of_ts: None,
911 no_sketch: false,
912 })
913 .expect("page one");
914 assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
915 assert_eq!(first_page.hits.len(), 1);
916 assert_eq!(first_page.params.top_k, 1);
917 assert_eq!(first_page.params.snippet_chars, 90);
918 assert!(first_page.total_hits >= first_page.hits.len());
919 let cursor = first_page.next_cursor.clone().expect("cursor");
920 let first_id = first_page.hits[0].frame_id;
921
922 let second_page = mem
923 .search(SearchRequest {
924 query: "tickets".into(),
925 top_k: 1,
926 snippet_chars: 90,
927 uri: None,
928 scope: None,
929 cursor: Some(cursor),
930 #[cfg(feature = "temporal_track")]
931 temporal: None,
932 as_of_frame: None,
933 as_of_ts: None,
934 no_sketch: false,
935 })
936 .expect("page two");
937 assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
938 assert_eq!(second_page.hits.len(), 1);
939 assert_ne!(second_page.hits[0].frame_id, first_id);
940 assert_eq!(second_page.total_hits, first_page.total_hits);
941 });
942 }
943
944 #[cfg(feature = "lex")]
945 #[test]
946 fn search_falls_back_when_tantivy_missing() {
947 run_serial_test(|| {
948 let dir = tempdir().expect("tmp");
949 let path = dir.path().join("fallback.mv2");
950
951 let mut mem = Memvid::create(&path).expect("create");
952 mem.enable_lex().expect("enable lex");
953 mem.put_bytes(b"tickets fallback test").expect("put");
954 mem.commit().expect("commit");
955
956 assert!(
959 mem.tantivy.is_some(),
960 "Tantivy should be initialized after commit"
961 );
962
963 let response = mem
964 .search(SearchRequest {
965 query: "tickets".into(),
966 top_k: 5,
967 snippet_chars: 120,
968 uri: None,
969 scope: None,
970 cursor: None,
971 #[cfg(feature = "temporal_track")]
972 temporal: None,
973 as_of_frame: None,
974 as_of_ts: None,
975 no_sketch: false,
976 })
977 .expect("search with tantivy");
978
979 assert_eq!(response.engine, SearchEngineKind::Tantivy);
980 assert!(!response.hits.is_empty());
981 });
982 }
983
984 #[test]
985 fn verify_reports_success() {
986 run_serial_test(|| {
987 let dir = tempdir().expect("tmp");
988 let path = dir.path().join("verify.mv2");
989
990 {
991 let mut mem = Memvid::create(&path).expect("create");
992 mem.enable_lex().expect("enable lex");
993 mem.enable_vec().expect("enable vec");
994 mem.put_with_embedding(b"check", vec![0.5, 0.1])
995 .expect("put");
996 mem.commit().expect("commit");
997 }
998
999 let report = Memvid::verify(&path, true).expect("verify");
1000 assert_eq!(report.overall_status, VerificationStatus::Passed);
1001 });
1002 }
1003
1004 #[test]
1005 fn test_create_enables_indexes_by_default() {
1006 run_serial_test(|| {
1007 let dir = tempdir().expect("tmp");
1008 let path = dir.path().join("default_indexes.mv2");
1009
1010 let mem = Memvid::create(&path).expect("create");
1012
1013 let stats = mem.stats().expect("stats");
1015 println!(
1016 "After create (before drop): lex={}, vec={}",
1017 stats.has_lex_index, stats.has_vec_index
1018 );
1019
1020 drop(mem);
1021
1022 let reopened = Memvid::open(&path).expect("reopen");
1024 let stats2 = reopened.stats().expect("stats after reopen");
1025 println!(
1026 "After reopen: lex={}, vec={}",
1027 stats2.has_lex_index, stats2.has_vec_index
1028 );
1029
1030 #[cfg(feature = "lex")]
1031 assert!(
1032 stats2.has_lex_index,
1033 "lex index should be enabled by default"
1034 );
1035
1036 #[cfg(feature = "vec")]
1037 assert!(
1038 stats2.has_vec_index,
1039 "vec index should be enabled by default"
1040 );
1041 });
1042 }
1043
/// Corrupts the on-disk time index and checks that `Memvid::doctor` restores it.
///
/// Steps:
/// 1. Create a memory, store one payload, rebuild indexes and commit, keeping a
///    copy of the time index manifest (offset and length of the index bytes).
/// 2. Overwrite exactly that byte range in the file with zeros.
/// 3. Run `Memvid::verify`; either an `Err` or a report whose overall status is
///    `VerificationStatus::Failed` is accepted.
/// 4. Run `Memvid::doctor` with `rebuild_time_index` enabled and assert that the
///    reopened memory has a time index manifest again.
///
/// The `println!` calls are diagnostics only; they do not affect the outcome.
#[test]
fn doctor_rebuilds_time_index() {
    use std::fs::OpenOptions;
    use std::io::{Seek, SeekFrom, Write};

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("doctor.mv2");

        // Build a committed memory and capture where its time index lives.
        let manifest = {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"repair").expect("put");
            mem.commit().expect("commit");
            mem.rebuild_indexes(&[]).expect("rebuild");
            mem.commit().expect("commit after rebuild");
            println!(
                "test: post-commit header footer_offset={}",
                mem.header.footer_offset
            );
            println!(
                "test: post-commit manifest offset={} length={}",
                mem.toc.time_index.as_ref().map_or(0, |m| m.bytes_offset),
                mem.toc.time_index.as_ref().map_or(0, |m| m.bytes_length)
            );
            mem.toc.time_index.clone().expect("time index manifest")
        };

        // Zero out the time index bytes in place. The file handle is scoped so
        // it is closed before the memory is verified and repaired.
        {
            let mut file = OpenOptions::new()
                .read(true)
                .write(true)
                .open(&path)
                .expect("open file");
            file.seek(SeekFrom::Start(manifest.bytes_offset))
                .expect("seek");
            // Checked conversion: a silent `as` truncation would zero the wrong
            // number of bytes on targets where `usize` is narrower.
            let zero_len = usize::try_from(manifest.bytes_length)
                .expect("time index length fits in usize");
            let zeros = vec![0u8; zero_len];
            file.write_all(&zeros).expect("corrupt time index");
            file.flush().expect("flush");
            file.sync_all().expect("sync");
        }

        println!(
            "test: footer scan: {:?}",
            crate::footer::find_last_valid_footer(&std::fs::read(&path).expect("read file"))
                .as_ref()
                .map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
        );
        println!("test: verifying corrupted memory");
        // Verification may either report a failed status or refuse outright
        // with an error; both outcomes are accepted here.
        match Memvid::verify(&path, false) {
            Ok(report) => {
                assert_eq!(report.overall_status, VerificationStatus::Failed);
            }
            Err(e) => {
                println!("test: verify failed with error (expected): {e}");
            }
        }

        println!("test: running doctor");
        let report = Memvid::doctor(
            &path,
            DoctorOptions {
                rebuild_time_index: true,
                rebuild_lex_index: false,
                ..DoctorOptions::default()
            },
        )
        .expect("doctor");
        println!("test: doctor completed with status: {:?}", report.status);
        println!("test: verifying repaired memory");
        let reopened = Memvid::open(&path).expect("reopen after doctor");
        assert!(
            reopened.toc.time_index.is_some(),
            "time index should exist after doctor"
        );
    });
}
1134
/// Stores a small binary payload with an attached `MediaManifest` and checks
/// that both the raw bytes and every manifest field survive a commit and reopen.
///
/// The payload is read back through `blob_reader_by_uri`; the manifest is read
/// back through `media_manifest_by_uri` and compared field by field.
#[test]
fn blob_reader_roundtrip_with_media_manifest() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("blob.mv2");
        let payload = vec![0u8, 159, 1, 128, 42, 99, 200];

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: payload.len() as u64,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1234),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        // Build the metadata in one expression instead of mutating a default
        // value; the manifest is moved in because it is not needed afterwards.
        let doc_meta = DocMetadata {
            media: Some(manifest),
            mime: Some("video/mp4".to_string()),
            bytes: Some(payload.len() as u64),
            ..DocMetadata::default()
        };
        assert!(
            !doc_meta.is_empty(),
            "media manifest must count as metadata"
        );

        let options = PutOptions::builder()
            .metadata(doc_meta)
            .kind("video")
            .uri("mv2://video/clip.mp4")
            .build();

        // Scope the writer so it is dropped before the file is reopened.
        {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes_with_options(&payload, options)
                .expect("put bytes");
            mem.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&path).expect("open");
        let mut reader = reopened
            .blob_reader_by_uri("mv2://video/clip.mp4")
            .expect("blob reader");
        let mut buffered = Vec::new();
        reader.read_to_end(&mut buffered).expect("read payload");
        assert_eq!(buffered, payload);

        let roundtrip = reopened
            .media_manifest_by_uri("mv2://video/clip.mp4")
            .expect("manifest lookup")
            .expect("manifest present");
        assert_eq!(roundtrip.mime, "video/mp4");
        assert_eq!(roundtrip.kind, "video");
        assert_eq!(roundtrip.bytes, payload.len() as u64);
        assert_eq!(roundtrip.filename.as_deref(), Some("clip.mp4"));
        assert_eq!(roundtrip.duration_ms, Some(1234));
        assert_eq!(roundtrip.width, Some(1920));
        assert_eq!(roundtrip.height, Some(1080));
        assert_eq!(roundtrip.codec.as_deref(), Some("h264"));

        // Release the temporary directory explicitly once all checks are done.
        drop(dir);
    });
}
1199
/// Stores a 1.6 MB pseudo-random "video" payload with full media metadata and
/// checks that the file can be reopened and reports exactly one frame.
///
/// The payload is produced by a small deterministic xor-shift generator, so the
/// bytes are identical on every run.
#[test]
fn video_frame_roundtrip_does_not_corrupt_toc() {
    use crate::types::MediaManifest;

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("video.mv2");

        // Deterministic filler: three xor-shift steps per byte, keeping the
        // lowest byte of the state as the output.
        let mut seed = 0xDEAD_BEEF_u64;
        let mut video_bytes = vec![0u8; 1_600_000];
        for byte in &mut video_bytes {
            seed ^= seed << 7;
            seed ^= seed >> 9;
            seed ^= seed << 8;
            *byte = seed.to_le_bytes()[0];
        }

        let hash_hex = blake3::hash(&video_bytes).to_hex().to_string();

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: video_bytes.len() as u64,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1_000),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        // Construct the metadata in a single expression rather than mutating a
        // default value field by field.
        let meta = DocMetadata {
            mime: Some("video/mp4".to_string()),
            bytes: Some(video_bytes.len() as u64),
            hash: Some(hash_hex),
            caption: Some("Test clip".to_string()),
            media: Some(manifest),
            ..DocMetadata::default()
        };

        let options = PutOptions::builder()
            .kind("video")
            .metadata(meta)
            .tag("kind", "video")
            .uri("mv2://video/test.mp4")
            .title("Test clip")
            .build();

        // Scope the writer so it is dropped before the file is reopened.
        {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes_with_options(&video_bytes, options)
                .expect("put video");
            mem.commit().expect("commit");
        }

        let reopened = Memvid::open(&path).expect("reopen");
        let stats = reopened.stats().expect("stats");
        assert_eq!(stats.frame_count, 1);
    });
}
1256
/// Applying a second ticket with the same sequence number as the first must be
/// rejected with `MemvidError::TicketSequence`.
#[test]
fn ticket_sequence_enforced() {
    run_serial_test(|| {
        let scratch = tempdir().expect("tmp");
        let file = scratch.path().join("ticket.mv2");
        let mut store = Memvid::create(&file).expect("create");

        // The first ticket at sequence 2 is accepted.
        let accepted = store.apply_ticket(Ticket::new("issuer", 2));
        accepted.expect("apply first");

        // Presenting sequence 2 again must be refused.
        let replayed = store.apply_ticket(Ticket::new("issuer", 2));
        let rejection = replayed.expect_err("sequence must increase");
        assert!(matches!(rejection, MemvidError::TicketSequence { .. }));
    });
}
1273
/// A ticket's `capacity_bytes` limit must be enforced on writes.
///
/// The ticket allows 64 bytes beyond the memory's initial `data_end`. A 32-byte
/// payload is stored and committed; a following 40-byte payload is expected to
/// fail with `MemvidError::CapacityExceeded`.
#[test]
fn capacity_limit_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("capacity.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        let base = mem.data_end;
        mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
            .expect("apply ticket");

        // Fixed-size arrays avoid a heap allocation for throwaway payloads.
        mem.put_bytes(&[0xFF_u8; 32]).expect("first put");
        mem.commit().expect("commit");

        let err = mem
            .put_bytes(&[0xFF_u8; 40])
            .expect_err("capacity exceeded");
        assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
    });
}
1294}