1#![deny(clippy::all, clippy::pedantic)]
2#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
3#![allow(clippy::module_name_repetitions)]
4
5pub const MEMVID_CORE_VERSION: &str = env!("CARGO_PKG_VERSION");
7
8mod analysis;
9pub mod constants;
10pub mod enrich;
11pub mod enrichment_worker;
12pub mod error;
13pub mod extract;
14pub mod extract_budgeted;
15pub mod footer;
16pub mod io;
17pub mod lex;
18mod lock;
19pub mod lockfile;
20pub mod memvid;
21pub mod models;
22pub mod pii;
23pub mod reader;
24mod registry;
25mod search;
26pub mod signature;
27pub mod structure;
28pub mod table;
29pub mod text;
30mod toc;
31pub mod types;
32pub mod vec;
33pub mod vec_pq;
34
35#[cfg(feature = "vec")]
36pub mod text_embed;
37
38pub mod triplet;
40
41pub mod graph_search;
43
44pub mod clip;
47
48pub mod whisper;
51
52pub mod replay;
56
57#[cfg(feature = "encryption")]
60pub mod encryption;
61
62#[cfg(feature = "symspell_cleanup")]
64pub mod symspell_cleanup;
65
66#[cfg(test)]
67mod tests_lex_flag;
68
69#[cfg(feature = "temporal_track")]
70pub use analysis::temporal::{
71 TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
72 TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
73};
74#[cfg(feature = "temporal_enrich")]
76pub use analysis::temporal_enrich::{
77 AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
78 TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
79 enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
80};
81pub use constants::*;
82pub use enrichment_worker::{EnrichmentWorkerConfig, EnrichmentWorkerStats};
83pub use error::{MemvidError, Result};
84pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
85pub use footer::{CommitFooter, find_last_valid_footer};
86#[cfg(feature = "temporal_track")]
87pub use io::temporal_index::{
88 append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
89 read_track as temporal_track_read, window as temporal_track_window,
90};
91pub use io::time_index::{
92 TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
93 read_track as time_index_read,
94};
95pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
96pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
97pub use lock::FileLock;
98pub use memvid::{
99 BlobReader, EnrichmentHandle, EnrichmentStats, LockSettings, Memvid, OpenReadOptions,
100 SketchCandidate, SketchSearchOptions, SketchSearchStats,
101 mutation::{CommitMode, CommitOptions},
102 start_enrichment_worker, start_enrichment_worker_with_embeddings,
103};
104#[cfg(feature = "parallel_segments")]
105pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
106pub use models::{
107 ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
108 ModelVerifyOptions, verify_model_dir, verify_models,
109};
110pub use reader::{
111 DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics, ReaderHint,
112 ReaderOutput, ReaderRegistry,
113};
114pub use signature::{
115 parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
116};
117pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
118#[cfg(feature = "temporal_track")]
119pub use types::{
120 AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
121 TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
122 TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
123 TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
124};
125pub use types::{
126 AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
127 AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
128 DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
129 DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
130 DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
131 DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
132 EmbeddingIdentity, EmbeddingIdentityCount, EmbeddingIdentitySummary, Frame, FrameId, FrameRole,
133 FrameStatus, Header, IndexManifests, LexIndexManifest, LexSegmentDescriptor,
134 MEMVID_EMBEDDING_DIMENSION_KEY, MEMVID_EMBEDDING_MODEL_KEY, MEMVID_EMBEDDING_NORMALIZED_KEY,
135 MEMVID_EMBEDDING_PROVIDER_KEY, MediaManifest, MemvidHandle, Open, PutOptions,
136 PutOptionsBuilder, Sealed, SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams,
137 SearchRequest, SearchResponse, SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta,
138 SegmentSpan, SourceSpan, Stats, TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier,
139 TimeIndexManifest, TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder,
140 Toc, VecEmbedder, VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
141 VerificationReport, VerificationStatus,
142};
143pub use types::{
145 EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
146 MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
147 MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
148};
149pub use types::{
151 EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
152 LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
153};
154pub use types::{
156 DEFAULT_HAMMING_THRESHOLD, QuerySketch, SKETCH_TRACK_MAGIC, SKETCH_TRACK_VERSION, SketchEntry,
157 SketchFlags, SketchTrack, SketchTrackHeader, SketchTrackManifest, SketchTrackStats,
158 SketchVariant, build_term_filter, compute_simhash, compute_token_weights, generate_sketch,
159 hash_token, hash_token_u32, read_sketch_track, term_filter_maybe_contains, tokenize_for_sketch,
160 write_sketch_track,
161};
162pub use types::{
164 Cardinality, PredicateId, PredicateSchema, SchemaError, SchemaRegistry, ValueType,
165};
166pub use memvid::memory::SchemaSummaryEntry;
168#[cfg(feature = "logic_mesh")]
170pub use analysis::ner::NerModel;
171pub use analysis::ner::{
172 ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL, NER_MODELS,
173 NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
174 is_ner_model_installed, ner_model_path, ner_tokenizer_path,
175};
176pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
178pub use triplet::{ExtractionMode, ExtractionStats, TripletExtractor};
180pub use graph_search::{GraphMatcher, QueryPlanner, hybrid_search};
182pub use types::{
184 BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
185 EmbeddingResult,
186};
187pub use types::reranker::{
189 Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
190};
191#[cfg(feature = "parallel_segments")]
192pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
193pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
194pub use vec_pq::{
195 CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
196 QuantizedVecIndexBuilder,
197};
198#[cfg(feature = "vec")]
200pub use text_embed::{
201 LocalTextEmbedder, TEXT_EMBED_MODELS, TextEmbedConfig, TextEmbedModelInfo,
202 default_text_model_info, get_text_model_info,
203};
204pub use clip::{
206 CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
207 ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
208 ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
209 get_model_info,
210};
211#[cfg(feature = "clip")]
213pub use clip::{ClipModel, calculate_color_variance, get_image_info};
214pub use whisper::{
216 TranscriptionResult, TranscriptionSegment, WHISPER_MODELS, WhisperConfig, WhisperError,
217 WhisperModelInfo, default_whisper_model_info, get_whisper_model_info,
218};
219#[cfg(feature = "whisper")]
221pub use whisper::{WHISPER_SAMPLE_RATE, WhisperTranscriber, decode_audio_file};
222pub use structure::{
224 ChunkType, ChunkingOptions, ChunkingResult, StructuralChunker, StructuredChunk,
225 StructuredDocument, TableChunkingStrategy, chunk_structured, detect_structure,
226};
227pub use types::adaptive::{
229 AdaptiveConfig, AdaptiveResult, AdaptiveStats, CutoffStrategy, find_adaptive_cutoff,
230 normalize_scores,
231};
232pub use replay::{
234 ActionType, Checkpoint, REPLAY_SEGMENT_MAGIC, REPLAY_SEGMENT_VERSION, ReplayAction,
235 ReplayManifest, ReplaySession, SessionSummary, StateSnapshot,
236};
237#[cfg(feature = "replay")]
239pub use replay::{
240 ActiveSession, ComparisonReport, ComparisonSummary, Divergence, DivergenceType, ModelResult,
241 ReplayConfig, ReplayOptions, ReplayResult,
242};
243
244#[cfg(test)]
245use once_cell::sync::Lazy;
246use std::fs::File;
247use std::io::Cursor;
248use std::path::Path;
249#[cfg(test)]
250use std::sync::Mutex;
251
252use bincode::config::{self, Config};
253use io::header::HeaderCodec;
254
/// Maximum number of characters `truncate_preview` keeps from a preview string.
///
/// NOTE(review): despite the `_BYTES` suffix the limit is applied per `char`,
/// not per byte.
const TIMELINE_PREVIEW_BYTES: usize = 120;

/// 512 MiB size cap. Presumably bounds serialized index payloads; the use site
/// is not visible in this part of the file (TODO confirm).
const MAX_INDEX_BYTES: u64 = 512 << 20;

/// 512 MiB size cap. Presumably bounds the serialized time index; the use site
/// is not visible in this part of the file (TODO confirm).
const MAX_TIME_INDEX_BYTES: u64 = 512 << 20;

/// 256 MiB size cap. Presumably bounds a single frame payload; the use site is
/// not visible in this part of the file (TODO confirm).
const MAX_FRAME_BYTES: u64 = 256 << 20;

/// Default limit of 32 768 for search text. Whether the unit is bytes or
/// characters is decided at the use site, which is not visible here.
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32_768;
260
/// Lock held by `run_serial_test` for the duration of each wrapped test body,
/// so those bodies never run concurrently. It guards no data.
#[cfg(test)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(Mutex::default);
263
/// Runs `f` while holding `SERIAL_TEST_MUTEX` and returns whatever `f` returns.
///
/// Tests wrapped in this helper execute one at a time, regardless of the test
/// harness' thread count.
///
/// A test that panics while holding the lock poisons the mutex. The mutex
/// protects no data (`()`), so there is no state that could have been left
/// inconsistent; the poisoned guard is therefore recovered instead of
/// panicking. Without this, a single failing test would make every later
/// serial test fail with an unrelated "mutex poisoned" panic and hide the
/// real failure.
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
    let _guard = SERIAL_TEST_MUTEX
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    f()
}
271
272impl Memvid {
273 #[cfg(feature = "lex")]
274 fn tantivy_index_pending(&self) -> bool {
275 self.tantivy_dirty
276 }
277
278 #[cfg(not(feature = "lex"))]
279 fn tantivy_index_pending(&self) -> bool {
280 false
281 }
282
283 #[cfg(feature = "lex")]
284 fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
285 if !self.tantivy_dirty {
286 return Ok(());
287 }
288 if let Some(engine) = self.tantivy.as_mut() {
289 engine.commit()?;
290 if embed_snapshot {
291 let snapshot = engine.snapshot_segments()?;
292 self.update_embedded_lex_snapshot(snapshot)?;
293 }
294 }
295 self.tantivy_dirty = false;
296 Ok(())
297 }
298
299 #[cfg(feature = "lex")]
300 fn flush_tantivy(&mut self) -> Result<()> {
301 self.flush_tantivy_conditional(true)
302 }
303
304 #[cfg(feature = "lex")]
305 #[allow(dead_code)]
306 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
307 self.flush_tantivy_conditional(false)
308 }
309
310 #[cfg(not(feature = "lex"))]
311 fn flush_tantivy(&mut self) -> Result<()> {
312 Ok(())
313 }
314
315 #[cfg(not(feature = "lex"))]
316 #[allow(dead_code)]
317 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
318 Ok(())
319 }
320 pub fn path(&self) -> &Path {
321 &self.path
322 }
323
324 pub fn lock_handle(&self) -> &FileLock {
325 &self.lock
326 }
327
328 pub fn is_read_only(&self) -> bool {
329 self.read_only
330 }
331
332 pub(crate) fn ensure_writable(&mut self) -> Result<()> {
333 if self.read_only {
334 self.lock.upgrade_to_exclusive()?;
335 self.read_only = false;
336 }
337 Ok(())
338 }
339
340 pub fn downgrade_to_shared(&mut self) -> Result<()> {
341 if self.read_only {
342 return Ok(());
343 }
344 if self.dirty || self.tantivy_index_pending() {
345 return Ok(());
346 }
347 self.lock.downgrade_to_shared()?;
348 self.read_only = true;
349 Ok(())
350 }
351}
352
353impl Drop for Memvid {
354 fn drop(&mut self) {
355 if self.dirty {
356 let _ = self.commit();
357 }
358 #[cfg(feature = "parallel_segments")]
360 {
361 use crate::memvid::lifecycle::cleanup_manifest_wal_public;
362 cleanup_manifest_wal_public(self.path());
363 }
364 }
365}
366
367pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
368 HeaderCodec::write(file, header)
369}
370
371fn wal_config() -> impl Config {
372 config::standard()
373 .with_fixed_int_encoding()
374 .with_little_endian()
375}
376
377pub(crate) fn decode_canonical_bytes(
378 payload: &[u8],
379 encoding: CanonicalEncoding,
380 frame_id: FrameId,
381) -> Result<Vec<u8>> {
382 match encoding {
383 CanonicalEncoding::Plain => Ok(payload.to_vec()),
384 CanonicalEncoding::Zstd => {
385 zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
386 frame_id,
387 reason: "failed to decode canonical payload",
388 })
389 }
390 }
391}
392
393pub(crate) fn default_uri(frame_id: FrameId) -> String {
394 format!("mv2://frames/{frame_id}")
395}
396
/// Derives a human-readable title from the last path segment of `uri`.
///
/// Processing steps:
/// 1. Trim surrounding whitespace.
/// 2. Drop the fragment (`#…`) and the query (`?…`).
/// 3. Drop the scheme, i.e. everything up to and including the first `://`.
/// 4. Take the last path segment, ignoring trailing `/`.
/// 5. Remove the final extension (text after the last `.`).
/// 6. Split on `-`, `_` and spaces, then capitalise each word: first character
///    ASCII-uppercased, the rest ASCII-lowercased. Non-ASCII characters are
///    left unchanged.
///
/// Fragment and query are removed *before* the scheme so that a `://` embedded
/// in them (for example `page.html?next=https://…`) is not mistaken for the
/// URI's own scheme separator.
///
/// Returns `None` when no word remains, e.g. for an empty or blank input, a
/// URI without a path segment, or a dot-file such as `.hidden` whose stem is
/// empty.
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
    let trimmed = uri.trim();
    let without_fragment = trimmed
        .split_once('#')
        .map_or(trimmed, |(head, _fragment)| head);
    let without_query = without_fragment
        .split_once('?')
        .map_or(without_fragment, |(head, _query)| head);
    let path = without_query
        .split_once("://")
        .map_or(without_query, |(_scheme, rest)| rest);

    let segment = path.trim_end_matches('/').rsplit('/').next()?.trim();
    let stem = segment
        .rsplit_once('.')
        .map_or(segment, |(stem, _extension)| stem)
        .trim();

    let words: Vec<String> = stem
        .split(['-', '_', ' '])
        .filter(|part| !part.is_empty())
        .map(|part| {
            let mut chars = part.chars();
            let mut word = String::with_capacity(part.len());
            word.extend(chars.next().map(|c| c.to_ascii_uppercase()));
            word.extend(chars.map(|c| c.to_ascii_lowercase()));
            word
        })
        .collect();

    if words.is_empty() {
        None
    } else {
        Some(words.join(" "))
    }
}
450
451fn truncate_preview(text: &str) -> String {
452 text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
453}
454
455fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
456 let mime = meta.mime.as_deref()?;
457 if !mime.starts_with("image/") {
458 return None;
459 }
460
461 if let Some(caption) = meta.caption.as_ref() {
462 let trimmed = caption.trim();
463 if !trimmed.is_empty() {
464 return Some(truncate_preview(trimmed));
465 }
466 }
467
468 let mut segments: Vec<String> = Vec::new();
469 if let (Some(w), Some(h)) = (meta.width, meta.height) {
470 segments.push(format!("{}×{} px", w, h));
471 }
472 if let Some(exif) = meta.exif.as_ref() {
473 if let Some(model) = exif
474 .model
475 .as_ref()
476 .map(|s| s.trim())
477 .filter(|s| !s.is_empty())
478 {
479 segments.push(model.to_string());
480 } else if let Some(make) = exif
481 .make
482 .as_ref()
483 .map(|s| s.trim())
484 .filter(|s| !s.is_empty())
485 {
486 segments.push(make.to_string());
487 }
488
489 if let Some(datetime) = exif
490 .datetime
491 .as_ref()
492 .map(|s| s.trim())
493 .filter(|s| !s.is_empty())
494 {
495 segments.push(datetime.to_string());
496 }
497 }
498
499 if segments.is_empty() {
500 return Some("Image frame".to_string());
501 }
502
503 Some(truncate_preview(&segments.join(" · ")))
504}
505
506#[cfg(test)]
507mod tests {
508 use super::*;
509 use std::io::Read;
510 use std::num::NonZeroU64;
511 use tempfile::tempdir;
512
513 #[test]
514 fn create_put_commit_reopen() {
515 run_serial_test(|| {
516 let dir = tempdir().expect("tmp");
517 let path = dir.path().join("memory.mv2");
518
519 let mut mem = Memvid::create(&path).expect("create");
520 let seq = mem.put_bytes(b"hello").expect("put");
521 assert_eq!(seq, 1);
522 mem.commit().expect("commit");
523
524 drop(mem);
525
526 let mut reopened = Memvid::open(&path).expect("open");
527 let stats = reopened.stats().expect("stats");
528 assert_eq!(stats.frame_count, 1);
529 assert!(stats.has_time_index);
530
531 let timeline = reopened
532 .timeline(TimelineQuery::default())
533 .expect("timeline");
534 assert_eq!(timeline.len(), 1);
535 assert!(timeline[0].preview.contains("hello"));
536
537 let wal_stats = reopened.wal.stats();
538 assert_eq!(wal_stats.pending_bytes, 0);
539 assert_eq!(wal_stats.sequence, 2);
541 });
542 }
543
544 #[test]
545 fn timeline_limit_and_reverse() {
546 run_serial_test(|| {
547 let dir = tempdir().expect("tmp");
548 let path = dir.path().join("timeline.mv2");
549
550 let mut mem = Memvid::create(&path).expect("create");
551 mem.put_bytes(b"alpha").expect("put alpha");
552 mem.put_bytes(b"beta").expect("put beta");
553 mem.commit().expect("commit");
554 drop(mem);
555
556 let mut reopened = Memvid::open(&path).expect("open");
557 let limited = reopened
558 .timeline(TimelineQuery {
559 limit: NonZeroU64::new(1),
560 since: None,
561 until: None,
562 reverse: false,
563 #[cfg(feature = "temporal_track")]
564 temporal: None,
565 })
566 .expect("timeline limit");
567 assert_eq!(limited.len(), 1);
568 assert!(limited[0].preview.contains("alpha"));
569
570 let reversed = reopened
571 .timeline(TimelineQuery {
572 limit: NonZeroU64::new(1),
573 since: None,
574 until: None,
575 reverse: true,
576 #[cfg(feature = "temporal_track")]
577 temporal: None,
578 })
579 .expect("timeline reverse");
580 assert_eq!(reversed.len(), 1);
581 assert!(reversed[0].preview.contains("beta"));
582 });
583 }
584
585 #[test]
586 fn lex_search_roundtrip() {
587 run_serial_test(|| {
588 let dir = tempdir().expect("tmp");
589 let path = dir.path().join("lex.mv2");
590
591 let mut mem = Memvid::create(&path).expect("create");
592 mem.enable_lex().expect("enable");
593 let _seq1 = mem.put_bytes(b"Rust memory engine").expect("put");
594 let _seq2 = mem.put_bytes(b"Deterministic WAL").expect("put2");
595 mem.commit().expect("commit");
596
597 let request = SearchRequest {
599 query: "memory".to_string(),
600 top_k: 10,
601 snippet_chars: 200,
602 uri: None,
603 scope: None,
604 cursor: None,
605 #[cfg(feature = "temporal_track")]
606 temporal: None,
607 as_of_frame: None,
608 as_of_ts: None,
609 no_sketch: false,
610 };
611 let response = mem.search(request).expect("search");
612 assert_eq!(response.hits.len(), 1);
613
614 drop(mem);
615
616 let mut reopened = Memvid::open(&path).expect("open");
617 let request = SearchRequest {
618 query: "wal".to_string(),
619 top_k: 10,
620 snippet_chars: 200,
621 uri: None,
622 scope: None,
623 cursor: None,
624 #[cfg(feature = "temporal_track")]
625 temporal: None,
626 as_of_frame: None,
627 as_of_ts: None,
628 no_sketch: false,
629 };
630 let response = reopened.search(request).expect("search reopened");
631 assert_eq!(response.hits.len(), 1);
632 });
633 }
634
635 #[test]
636 fn vec_search_roundtrip() {
637 run_serial_test(|| {
638 let dir = tempdir().expect("tmp");
639 let path = dir.path().join("vec.mv2");
640
641 let mut mem = Memvid::create(&path).expect("create");
642 mem.enable_vec().expect("enable");
643 mem.put_with_embedding(b"vector", vec![0.0, 1.0])
644 .expect("put");
645 mem.put_with_embedding(b"vector-two", vec![1.0, 0.0])
646 .expect("put2");
647 mem.commit().expect("commit");
648
649 let stats = mem.stats().expect("stats");
650 assert!(stats.has_vec_index, "vec index should exist after commit");
651
652 let hits = mem.search_vec(&[0.0, 1.0], 5).expect("search");
653 assert_eq!(hits.first().map(|hit| hit.frame_id), Some(0));
654
655 drop(mem);
656
657 let mut reopened = Memvid::open(&path).expect("open");
658 let reopened_stats = reopened.stats().expect("stats reopen");
659 assert!(
660 reopened_stats.has_vec_index,
661 "vec index should exist after reopen: has_manifest={}, vec_enabled={}",
662 reopened.toc.indexes.vec.is_some(),
663 reopened.vec_enabled
664 );
665 let hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
666 assert_eq!(hits.first().map(|hit| hit.frame_id), Some(1));
667 });
668 }
669
670 #[test]
671 fn search_snippet_ranges_match_bytes() {
672 run_serial_test(|| {
673 let dir = tempdir().expect("tmp");
674 let path = dir.path().join("search.mv2");
675
676 let mut mem = Memvid::create(&path).expect("create");
677 mem.enable_lex().expect("enable lex");
678 let options = PutOptions::builder()
679 .uri("mv2://docs/pricing.md")
680 .title("Pricing")
681 .build();
682 let text = "Capacity tickets are signed grants that raise per-file caps.";
683 mem.put_bytes_with_options(text.as_bytes(), options)
684 .expect("put doc");
685 mem.commit().expect("commit");
686
687 let response = mem
688 .search(SearchRequest {
689 query: "capacity tickets".into(),
690 top_k: 5,
691 snippet_chars: 160,
692 uri: None,
693 scope: None,
694 cursor: None,
695 #[cfg(feature = "temporal_track")]
696 temporal: None,
697 as_of_frame: None,
698 as_of_ts: None,
699 no_sketch: false,
700 })
701 .expect("search");
702
703 assert_eq!(response.total_hits, 1);
704 assert_eq!(response.engine, SearchEngineKind::Tantivy);
705 let hit = response.hits.first().expect("hit");
706 let frame = mem
707 .toc
708 .frames
709 .get(hit.frame_id as usize)
710 .cloned()
711 .expect("frame");
712 let canonical = mem.frame_content(&frame).expect("content");
713 let bytes = canonical.as_bytes();
714 let (start, end) = hit.range;
715 assert!(end <= bytes.len());
716 assert_eq!(hit.text.as_bytes(), &bytes[start..end]);
717 let chunk = hit.chunk_range.expect("chunk range");
718 assert!(chunk.0 <= start);
719 assert!(chunk.1 >= end);
720 let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
721 let chunk_slice = &canonical[chunk.0..chunk.1];
722 assert_eq!(chunk_text, chunk_slice);
723 });
724 }
725
726 #[test]
727 fn search_chunk_range_reflects_chunk_offset() {
728 run_serial_test(|| {
729 let dir = tempdir().expect("tmp");
730 let path = dir.path().join("chunked.mv2");
731
732 let mut mem = Memvid::create(&path).expect("create");
733 mem.enable_lex().expect("enable lex");
734
735 let options = PutOptions::builder()
736 .uri("mv2://docs/manual.txt")
737 .title("Manual")
738 .build();
739 let prefix = "alpha beta gamma delta. ".repeat(200);
740 let content = format!(
741 "{}target segment appears here. Trailing context for verification.",
742 prefix
743 );
744 mem.put_bytes_with_options(content.as_bytes(), options)
745 .expect("put doc");
746 mem.commit().expect("commit");
747
748 let response = mem
749 .search(SearchRequest {
750 query: "target segment".into(),
751 top_k: 5,
752 snippet_chars: 160,
753 uri: None,
754 scope: None,
755 cursor: None,
756 #[cfg(feature = "temporal_track")]
757 temporal: None,
758 as_of_frame: None,
759 as_of_ts: None,
760 no_sketch: false,
761 })
762 .expect("search");
763
764 let hit = response.hits.first().expect("hit");
765 assert_eq!(response.engine, SearchEngineKind::Tantivy);
766 let chunk_range = hit.chunk_range.expect("chunk range");
767 assert!(chunk_range.0 > 0);
768 assert!(hit.range.0 >= chunk_range.0);
769 assert!(hit.range.1 <= chunk_range.1);
770 assert!(hit.text.contains("target segment"));
771 let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
772 assert_eq!(chunk_text, &content[chunk_range.0..chunk_range.1]);
773 });
774 }
775
776 #[test]
777 fn auto_tag_populates_frame_metadata() {
778 run_serial_test(|| {
779 let dir = tempdir().expect("tmp");
780 let path = dir.path().join("autotag.mv2");
781
782 let mut mem = Memvid::create(&path).expect("create");
783 mem.enable_lex().expect("enable lex");
784
785 let options = PutOptions::builder()
786 .search_text("Neural networks planning session 2024-10-08")
787 .auto_tag(true)
788 .extract_dates(true)
789 .build();
790 mem.put_bytes_with_options(b"agenda", options)
791 .expect("put bytes");
792 mem.commit().expect("commit");
793
794 let frame = mem.toc.frames.first().expect("frame present");
795 assert!(!frame.tags.is_empty());
796 assert!(frame.content_dates.iter().any(|date| date.contains("2024")));
797 });
798 }
799
800 #[test]
801 fn search_filters_by_uri_and_scope() {
802 run_serial_test(|| {
803 let dir = tempdir().expect("tmp");
804 let path = dir.path().join("filters.mv2");
805
806 let mut mem = Memvid::create(&path).expect("create");
807 mem.enable_lex().expect("enable lex");
808
809 let options_a = PutOptions::builder()
810 .uri("mv2://docs/pricing.md")
811 .title("Pricing")
812 .build();
813 mem.put_bytes_with_options(b"Capacity tickets add per-file allowances", options_a)
814 .expect("put a");
815
816 let options_b = PutOptions::builder()
817 .uri("mv2://docs/faq.md")
818 .title("FAQ")
819 .build();
820 mem.put_bytes_with_options(b"Tickets can be issued by admins", options_b)
821 .expect("put b");
822
823 let options_c = PutOptions::builder()
824 .uri("mv2://blog/launch.md")
825 .title("Launch")
826 .build();
827 mem.put_bytes_with_options(b"Launch day tickets boost visibility", options_c)
828 .expect("put c");
829
830 mem.commit().expect("commit");
831
832 let uri_response = mem
833 .search(SearchRequest {
834 query: "tickets".into(),
835 top_k: 10,
836 snippet_chars: 120,
837 uri: Some("mv2://docs/pricing.md".into()),
838 scope: None,
839 cursor: None,
840 #[cfg(feature = "temporal_track")]
841 temporal: None,
842 as_of_frame: None,
843 as_of_ts: None,
844 no_sketch: false,
845 })
846 .expect("uri search");
847 assert_eq!(uri_response.engine, SearchEngineKind::Tantivy);
848 assert!(
849 uri_response
850 .hits
851 .iter()
852 .all(|hit| hit.uri == "mv2://docs/pricing.md")
853 );
854
855 let scope_response = mem
856 .search(SearchRequest {
857 query: "tickets".into(),
858 top_k: 10,
859 snippet_chars: 120,
860 uri: None,
861 scope: Some("mv2://docs/".into()),
862 cursor: None,
863 #[cfg(feature = "temporal_track")]
864 temporal: None,
865 as_of_frame: None,
866 as_of_ts: None,
867 no_sketch: false,
868 })
869 .expect("scope search");
870 assert_eq!(scope_response.engine, SearchEngineKind::Tantivy);
871 assert!(
872 scope_response
873 .hits
874 .iter()
875 .all(|hit| hit.uri.starts_with("mv2://docs/"))
876 );
877 });
878 }
879
880 #[test]
881 fn search_pagination_and_params() {
882 run_serial_test(|| {
883 let dir = tempdir().expect("tmp");
884 let path = dir.path().join("paging.mv2");
885
886 let mut mem = Memvid::create(&path).expect("create");
887 mem.enable_lex().expect("enable lex");
888
889 for (idx, text) in [
890 "tickets unlock tier upgrades",
891 "tickets expire after 30 days",
892 "tickets may be revoked",
893 ]
894 .iter()
895 .enumerate()
896 {
897 let uri = format!("mv2://docs/doc{idx}.md");
898 let options = PutOptions::builder()
899 .uri(&uri)
900 .title(format!("Doc {idx}"))
901 .build();
902 mem.put_bytes_with_options(text.as_bytes(), options)
903 .expect("put doc");
904 }
905
906 mem.commit().expect("commit");
907
908 let first_page = mem
909 .search(SearchRequest {
910 query: "tickets".into(),
911 top_k: 1,
912 snippet_chars: 90,
913 uri: None,
914 scope: None,
915 cursor: None,
916 #[cfg(feature = "temporal_track")]
917 temporal: None,
918 as_of_frame: None,
919 as_of_ts: None,
920 no_sketch: false,
921 })
922 .expect("page one");
923 assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
924 assert_eq!(first_page.hits.len(), 1);
925 assert_eq!(first_page.params.top_k, 1);
926 assert_eq!(first_page.params.snippet_chars, 90);
927 assert!(first_page.total_hits >= first_page.hits.len());
928 let cursor = first_page.next_cursor.clone().expect("cursor");
929 let first_id = first_page.hits[0].frame_id;
930
931 let second_page = mem
932 .search(SearchRequest {
933 query: "tickets".into(),
934 top_k: 1,
935 snippet_chars: 90,
936 uri: None,
937 scope: None,
938 cursor: Some(cursor),
939 #[cfg(feature = "temporal_track")]
940 temporal: None,
941 as_of_frame: None,
942 as_of_ts: None,
943 no_sketch: false,
944 })
945 .expect("page two");
946 assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
947 assert_eq!(second_page.hits.len(), 1);
948 assert_ne!(second_page.hits[0].frame_id, first_id);
949 assert_eq!(second_page.total_hits, first_page.total_hits);
950 });
951 }
952
953 #[cfg(feature = "lex")]
954 #[test]
955 fn search_falls_back_when_tantivy_missing() {
956 run_serial_test(|| {
957 let dir = tempdir().expect("tmp");
958 let path = dir.path().join("fallback.mv2");
959
960 let mut mem = Memvid::create(&path).expect("create");
961 mem.enable_lex().expect("enable lex");
962 mem.put_bytes(b"tickets fallback test").expect("put");
963 mem.commit().expect("commit");
964
965 assert!(
968 mem.tantivy.is_some(),
969 "Tantivy should be initialized after commit"
970 );
971
972 let response = mem
973 .search(SearchRequest {
974 query: "tickets".into(),
975 top_k: 5,
976 snippet_chars: 120,
977 uri: None,
978 scope: None,
979 cursor: None,
980 #[cfg(feature = "temporal_track")]
981 temporal: None,
982 as_of_frame: None,
983 as_of_ts: None,
984 no_sketch: false,
985 })
986 .expect("search with tantivy");
987
988 assert_eq!(response.engine, SearchEngineKind::Tantivy);
989 assert!(!response.hits.is_empty());
990 });
991 }
992
993 #[test]
994 fn verify_reports_success() {
995 run_serial_test(|| {
996 let dir = tempdir().expect("tmp");
997 let path = dir.path().join("verify.mv2");
998
999 {
1000 let mut mem = Memvid::create(&path).expect("create");
1001 mem.enable_lex().expect("enable lex");
1002 mem.enable_vec().expect("enable vec");
1003 mem.put_with_embedding(b"check", vec![0.5, 0.1])
1004 .expect("put");
1005 mem.commit().expect("commit");
1006 }
1007
1008 let report = Memvid::verify(&path, true).expect("verify");
1009 assert_eq!(report.overall_status, VerificationStatus::Passed);
1010 });
1011 }
1012
1013 #[test]
1014 fn test_create_enables_indexes_by_default() {
1015 run_serial_test(|| {
1016 let dir = tempdir().expect("tmp");
1017 let path = dir.path().join("default_indexes.mv2");
1018
1019 let mem = Memvid::create(&path).expect("create");
1021
1022 let stats = mem.stats().expect("stats");
1024 println!(
1025 "After create (before drop): lex={}, vec={}",
1026 stats.has_lex_index, stats.has_vec_index
1027 );
1028
1029 drop(mem);
1030
1031 let reopened = Memvid::open(&path).expect("reopen");
1033 let stats2 = reopened.stats().expect("stats after reopen");
1034 println!(
1035 "After reopen: lex={}, vec={}",
1036 stats2.has_lex_index, stats2.has_vec_index
1037 );
1038
1039 #[cfg(feature = "lex")]
1040 assert!(
1041 stats2.has_lex_index,
1042 "lex index should be enabled by default"
1043 );
1044
1045 #[cfg(feature = "vec")]
1046 assert!(
1047 stats2.has_vec_index,
1048 "vec index should be enabled by default"
1049 );
1050 });
1051 }
1052
/// Zeroes out the time-index region of a committed memory file and checks
/// that `Memvid::doctor`, run with `rebuild_time_index: true`, leaves a file
/// that reopens with a time index manifest present in its TOC.
///
/// `Memvid::verify` on the corrupted file may either return a report (which
/// must then be `VerificationStatus::Failed`) or return an error; the test
/// accepts both.
#[test]
fn doctor_rebuilds_time_index() {
    use std::fs::OpenOptions;
    use std::io::{Seek, SeekFrom, Write};

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("doctor.mv2");

        // Build a committed memory and capture the manifest that records where
        // the time index bytes live. `mem` goes out of scope at the end of
        // this block, before the file is opened for raw writes below.
        let manifest = {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"repair").expect("put");
            mem.commit().expect("commit");
            mem.rebuild_indexes(&[]).expect("rebuild");
            mem.commit().expect("commit after rebuild");
            println!(
                "test: post-commit header footer_offset={}",
                mem.header.footer_offset
            );
            println!(
                "test: post-commit manifest offset={} length={}",
                mem.toc.time_index.as_ref().map_or(0, |m| m.bytes_offset),
                mem.toc.time_index.as_ref().map_or(0, |m| m.bytes_length)
            );
            mem.toc.time_index.clone().expect("time index manifest")
        };

        // Overwrite exactly the byte range named by the manifest with zeros.
        {
            let mut file = OpenOptions::new()
                .read(true)
                .write(true)
                .open(&path)
                .expect("open file");
            file.seek(SeekFrom::Start(manifest.bytes_offset))
                .expect("seek");
            // Checked conversion: a plain `as usize` would silently truncate
            // on targets where `usize` is narrower than the stored length.
            let corrupt_len = usize::try_from(manifest.bytes_length)
                .expect("time index length fits in usize");
            let zeros = vec![0u8; corrupt_len];
            file.write_all(&zeros).expect("corrupt time index");
            file.flush().expect("flush");
            file.sync_all().expect("sync");
        }

        println!(
            "test: footer scan: {:?}",
            crate::footer::find_last_valid_footer(&std::fs::read(&path).expect("read file"))
                .as_ref()
                .map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
        );
        println!("test: verifying corrupted memory");
        match Memvid::verify(&path, false) {
            Ok(report) => {
                assert_eq!(report.overall_status, VerificationStatus::Failed);
            }
            Err(e) => {
                // An outright error is also an acceptable way to report corruption.
                println!("test: verify failed with error (expected): {e}");
            }
        }

        println!("test: running doctor");
        let report = Memvid::doctor(
            &path,
            DoctorOptions {
                rebuild_time_index: true,
                rebuild_lex_index: false,
                ..DoctorOptions::default()
            },
        )
        .expect("doctor");
        println!("test: doctor completed with status: {:?}", report.status);
        println!("test: verifying repaired memory");
        let reopened = Memvid::open(&path).expect("reopen after doctor");
        assert!(
            reopened.toc.time_index.is_some(),
            "time index should exist after doctor"
        );
    });
}
1143
/// Stores a small binary payload together with a `MediaManifest`, reopens the
/// file, and checks that both the raw bytes (via `blob_reader_by_uri`) and
/// every manifest field (via `media_manifest_by_uri`) come back unchanged.
#[test]
fn blob_reader_roundtrip_with_media_manifest() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("blob.mv2");
        let payload: Vec<u8> = vec![0, 159, 1, 128, 42, 99, 200];
        let payload_len = payload.len() as u64;
        let uri = "mv2://video/clip.mp4";

        // Metadata carrying the media manifest plus the matching mime/size.
        let doc_meta = DocMetadata {
            media: Some(MediaManifest {
                kind: "video".to_string(),
                mime: "video/mp4".to_string(),
                bytes: payload_len,
                filename: Some("clip.mp4".to_string()),
                duration_ms: Some(1234),
                width: Some(1920),
                height: Some(1080),
                codec: Some("h264".to_string()),
            }),
            mime: Some("video/mp4".to_string()),
            bytes: Some(payload_len),
            ..DocMetadata::default()
        };
        assert!(
            !doc_meta.is_empty(),
            "media manifest must count as metadata"
        );

        let options = PutOptions::builder()
            .metadata(doc_meta)
            .kind("video")
            .uri(uri)
            .build();

        // Write and commit in an inner scope so the writer is gone before reopening.
        {
            let mut writer = Memvid::create(&path).expect("create");
            writer
                .put_bytes_with_options(&payload, options)
                .expect("put bytes");
            writer.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&path).expect("open");

        // Raw payload must survive the roundtrip byte-for-byte.
        let mut reader = reopened.blob_reader_by_uri(uri).expect("blob reader");
        let mut buffered = Vec::new();
        reader.read_to_end(&mut buffered).expect("read payload");
        assert_eq!(buffered, payload);

        // Every manifest field must survive as well.
        let roundtrip = reopened
            .media_manifest_by_uri(uri)
            .expect("manifest lookup")
            .expect("manifest present");
        assert_eq!(roundtrip.mime, "video/mp4");
        assert_eq!(roundtrip.kind, "video");
        assert_eq!(roundtrip.bytes, payload_len);
        assert_eq!(roundtrip.filename.as_deref(), Some("clip.mp4"));
        assert_eq!(roundtrip.duration_ms, Some(1234));
        assert_eq!(roundtrip.width, Some(1920));
        assert_eq!(roundtrip.height, Some(1080));
        assert_eq!(roundtrip.codec.as_deref(), Some("h264"));

        drop(dir);
    });
}
1208
/// Writes a single 1.6 MB pseudo-random "video" frame with full media
/// metadata, commits, and checks that the reopened file reports exactly one
/// frame in its stats.
#[test]
fn video_frame_roundtrip_does_not_corrupt_toc() {
    use crate::types::MediaManifest;

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("video.mv2");

        // Deterministic shift/xor generator: each step mixes the state three
        // times and emits its low byte, so the payload is reproducible.
        let mut state = 0xDEAD_BEEF_u64;
        let video_bytes: Vec<u8> = std::iter::repeat_with(|| {
            state ^= state << 7;
            state ^= state >> 9;
            state ^= state << 8;
            (state & 0xFF) as u8
        })
        .take(1_600_000)
        .collect();
        let video_len = video_bytes.len() as u64;

        let hash_hex = blake3::hash(&video_bytes).to_hex().to_string();

        let meta = DocMetadata {
            mime: Some("video/mp4".to_string()),
            bytes: Some(video_len),
            hash: Some(hash_hex),
            caption: Some("Test clip".to_string()),
            media: Some(MediaManifest {
                kind: "video".to_string(),
                mime: "video/mp4".to_string(),
                bytes: video_len,
                filename: Some("clip.mp4".to_string()),
                duration_ms: Some(1_000),
                width: Some(1920),
                height: Some(1080),
                codec: Some("h264".to_string()),
            }),
            ..DocMetadata::default()
        };

        let options = PutOptions::builder()
            .kind("video")
            .metadata(meta)
            .tag("kind", "video")
            .uri("mv2://video/test.mp4")
            .title("Test clip")
            .build();

        // Write and commit in an inner scope so the writer is gone before reopening.
        {
            let mut writer = Memvid::create(&path).expect("create");
            writer
                .put_bytes_with_options(&video_bytes, options)
                .expect("put video");
            writer.commit().expect("commit");
        }

        let reopened = Memvid::open(&path).expect("reopen");
        let stats = reopened.stats().expect("stats");
        assert_eq!(stats.frame_count, 1);
    });
}
1265
/// Applies a ticket with sequence number 2, then applies a second ticket with
/// the same issuer and sequence, and expects the second call to fail with
/// `MemvidError::TicketSequence`.
#[test]
fn ticket_sequence_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("ticket.mv2");
        let mut mem = Memvid::create(&path).expect("create");

        // First application is expected to succeed.
        let first = mem.apply_ticket(Ticket::new("issuer", 2));
        first.expect("apply first");

        // Re-using the same sequence number must be rejected.
        let repeated = mem.apply_ticket(Ticket::new("issuer", 2));
        let err = repeated.expect_err("sequence must increase");
        assert!(matches!(err, MemvidError::TicketSequence { .. }));
    });
}
1282
/// Applies a ticket whose capacity is the freshly created file's `data_end`
/// plus 64 bytes, then expects a 32-byte put (followed by a commit) to
/// succeed and a subsequent 40-byte put to fail with
/// `MemvidError::CapacityExceeded`.
#[test]
fn capacity_limit_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("capacity.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        let base = mem.data_end;
        mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
            .expect("apply ticket");

        // Fixed-size arrays are borrowed directly; no heap allocation is
        // needed just to hand a byte slice to `put_bytes`.
        mem.put_bytes(&[0xFF_u8; 32]).expect("first put");
        mem.commit().expect("commit");

        let err = mem
            .put_bytes(&[0xFF_u8; 40])
            .expect_err("capacity exceeded");
        assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
    });
}
1303}