1#![deny(clippy::all, clippy::pedantic)]
2#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
3#![allow(clippy::module_name_repetitions)]
4
5mod analysis;
6pub mod constants;
7pub mod enrich;
8pub mod error;
9pub mod extract;
10pub mod footer;
11pub mod io;
12pub mod lex;
13mod lock;
14pub mod lockfile;
15pub mod memvid;
16pub mod models;
17pub mod pii;
18pub mod reader;
19mod registry;
20mod search;
21pub mod signature;
22pub mod table;
23pub mod text;
24mod toc;
25pub mod types;
26pub mod vec;
27pub mod vec_pq;
28
29pub mod clip;
32
33#[cfg(test)]
34mod tests_lex_flag;
35
36#[cfg(feature = "temporal_track")]
37pub use analysis::temporal::{
38 TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
39 TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
40};
41#[cfg(feature = "temporal_enrich")]
43pub use analysis::temporal_enrich::{
44 AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
45 TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
46 enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
47};
48pub use constants::*;
49pub use error::{MemvidError, Result};
50pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
51pub use footer::{CommitFooter, find_last_valid_footer};
52#[cfg(feature = "temporal_track")]
53pub use io::temporal_index::{
54 append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
55 read_track as temporal_track_read, window as temporal_track_window,
56};
57pub use io::time_index::{
58 TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
59 read_track as time_index_read,
60};
61pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
62pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
63pub use lock::FileLock;
64pub use memvid::{
65 BlobReader, LockSettings, Memvid, OpenReadOptions,
66 mutation::{CommitMode, CommitOptions},
67};
68#[cfg(feature = "parallel_segments")]
69pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
70pub use models::{
71 ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
72 ModelVerifyOptions, verify_model_dir, verify_models,
73};
74pub use reader::{
75 DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics, ReaderHint,
76 ReaderOutput, ReaderRegistry,
77};
78pub use signature::{
79 parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
80};
81pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
82#[cfg(feature = "temporal_track")]
83pub use types::{
84 AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
85 TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
86 TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
87 TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
88};
89pub use types::{
90 AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
91 AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
92 DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
93 DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
94 DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
95 DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
96 Frame, FrameId, FrameRole, FrameStatus, Header, IndexManifests, LexIndexManifest,
97 LexSegmentDescriptor, MediaManifest, MemvidHandle, Open, PutOptions, PutOptionsBuilder, Sealed,
98 SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams, SearchRequest, SearchResponse,
99 SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta, SegmentSpan, SourceSpan, Stats,
100 TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier, TimeIndexManifest,
101 TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder, Toc, VecEmbedder,
102 VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
103 VerificationReport, VerificationStatus,
104};
105pub use types::{
107 EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
108 MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
109 MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
110};
111pub use types::{
113 EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
114 LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
115};
116pub use analysis::ner::{
118 ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL,
119 NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
120 is_ner_model_installed, ner_model_path, ner_tokenizer_path, NER_MODELS,
121};
122#[cfg(feature = "logic_mesh")]
123pub use analysis::ner::NerModel;
124pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
126pub use types::{
128 BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
129 EmbeddingResult,
130};
131pub use types::reranker::{
133 Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
134};
135#[cfg(feature = "parallel_segments")]
136pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
137pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
138pub use vec_pq::{
139 CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
140 QuantizedVecIndexBuilder,
141};
142pub use clip::{
144 CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
145 ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
146 ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
147 get_model_info,
148};
149#[cfg(feature = "clip")]
151pub use clip::{ClipModel, calculate_color_variance, get_image_info};
152
153#[cfg(test)]
154use once_cell::sync::Lazy;
155use std::fs::File;
156use std::io::Cursor;
157use std::path::Path;
158#[cfg(test)]
159use std::sync::Mutex;
160
161use bincode::config::{self, Config};
162use io::header::HeaderCodec;
163
/// Upper bound applied by `truncate_preview` when shortening preview text.
///
/// Despite the `_BYTES` suffix, `truncate_preview` counts `char`s, not bytes.
const TIMELINE_PREVIEW_BYTES: usize = 120;
/// 512 MiB ceiling. NOTE(review): not referenced in this file; presumably
/// bounds index payloads read elsewhere in the crate — confirm at the call sites.
const MAX_INDEX_BYTES: u64 = 512 << 20;
/// 512 MiB ceiling. NOTE(review): presumably the time-index counterpart of
/// `MAX_INDEX_BYTES` — confirm at the call sites.
const MAX_TIME_INDEX_BYTES: u64 = 512 << 20;
/// 256 MiB ceiling. NOTE(review): presumably the largest accepted frame
/// payload — confirm at the call sites.
const MAX_FRAME_BYTES: u64 = 256 << 20;
/// 32 KiB (32 768). NOTE(review): presumably the default cap on indexed search
/// text — confirm at the call sites.
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32 * 1024;
169
/// Lazily created lock that `run_serial_test` holds while a test body runs.
#[cfg(test)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(Mutex::default);
172
/// Runs `f` while holding `SERIAL_TEST_MUTEX`, so test bodies wrapped in this
/// helper never overlap with one another, and returns whatever `f` returns.
///
/// A test that panics inside `f` poisons the mutex. The guarded value is `()`,
/// so there is no protected state that could have been left inconsistent; the
/// poison flag is therefore ignored. Panicking on it (as `expect` would) turns
/// a single failing test into a cascade of unrelated "mutex poisoned" failures
/// in every serialized test that runs afterwards.
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
    let _guard = SERIAL_TEST_MUTEX
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    f()
}
180
181impl Memvid {
182 #[cfg(feature = "lex")]
183 fn tantivy_index_pending(&self) -> bool {
184 self.tantivy_dirty
185 }
186
187 #[cfg(not(feature = "lex"))]
188 fn tantivy_index_pending(&self) -> bool {
189 false
190 }
191
192 #[cfg(feature = "lex")]
193 fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
194 if !self.tantivy_dirty {
195 return Ok(());
196 }
197 if let Some(engine) = self.tantivy.as_mut() {
198 engine.commit()?;
199 if embed_snapshot {
200 let snapshot = engine.snapshot_segments()?;
201 self.update_embedded_lex_snapshot(snapshot)?;
202 }
203 }
204 self.tantivy_dirty = false;
205 Ok(())
206 }
207
208 #[cfg(feature = "lex")]
209 fn flush_tantivy(&mut self) -> Result<()> {
210 self.flush_tantivy_conditional(true)
211 }
212
213 #[cfg(feature = "lex")]
214 #[allow(dead_code)]
215 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
216 self.flush_tantivy_conditional(false)
217 }
218
219 #[cfg(not(feature = "lex"))]
220 fn flush_tantivy(&mut self) -> Result<()> {
221 Ok(())
222 }
223
224 #[cfg(not(feature = "lex"))]
225 #[allow(dead_code)]
226 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
227 Ok(())
228 }
229 pub fn path(&self) -> &Path {
230 &self.path
231 }
232
233 pub fn lock_handle(&self) -> &FileLock {
234 &self.lock
235 }
236
237 pub fn is_read_only(&self) -> bool {
238 self.read_only
239 }
240
241 pub(crate) fn ensure_writable(&mut self) -> Result<()> {
242 if self.read_only {
243 self.lock.upgrade_to_exclusive()?;
244 self.read_only = false;
245 }
246 Ok(())
247 }
248
249 pub fn downgrade_to_shared(&mut self) -> Result<()> {
250 if self.read_only {
251 return Ok(());
252 }
253 if self.dirty || self.tantivy_index_pending() {
254 return Ok(());
255 }
256 self.lock.downgrade_to_shared()?;
257 self.read_only = true;
258 Ok(())
259 }
260}
261
262impl Drop for Memvid {
263 fn drop(&mut self) {
264 if self.dirty {
265 let _ = self.commit();
266 }
267 #[cfg(feature = "parallel_segments")]
269 {
270 use crate::memvid::lifecycle::cleanup_manifest_wal_public;
271 cleanup_manifest_wal_public(self.path());
272 }
273 }
274}
275
276pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
277 HeaderCodec::write(file, header)
278}
279
280fn wal_config() -> impl Config {
281 config::standard()
282 .with_fixed_int_encoding()
283 .with_little_endian()
284}
285
286pub(crate) fn decode_canonical_bytes(
287 payload: &[u8],
288 encoding: CanonicalEncoding,
289 frame_id: FrameId,
290) -> Result<Vec<u8>> {
291 match encoding {
292 CanonicalEncoding::Plain => Ok(payload.to_vec()),
293 CanonicalEncoding::Zstd => {
294 zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
295 frame_id,
296 reason: "failed to decode canonical payload",
297 })
298 }
299 }
300}
301
302pub(crate) fn default_uri(frame_id: FrameId) -> String {
303 format!("mv2://frames/{frame_id}")
304}
305
/// Derives a human-readable title from the last path segment of `uri`.
///
/// Steps, in order:
/// 1. trim the input and drop a leading `scheme://`;
/// 2. cut everything from the first `#` or `?` onwards;
/// 3. ignore trailing `/` and take the final `/`-separated segment;
/// 4. strip the last `.extension` (only the last one: `notes.tar.gz` keeps `notes.tar`);
/// 5. split the stem on `-`, `_` and spaces, then give every word an upper-case
///    first letter and lower-case remainder.
///
/// Case conversion is ASCII-only; non-ASCII characters pass through unchanged.
///
/// Returns `None` when no usable word remains — for example an empty URI, a
/// bare scheme (`mv2://`), a dot-file such as `.hidden`, or a stem consisting
/// solely of separators.
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
    /// Upper-cases the first character of `word` and lower-cases the rest (ASCII only).
    fn capitalize_ascii(word: &str) -> String {
        let mut chars = word.chars();
        let mut out = String::with_capacity(word.len());
        if let Some(first) = chars.next() {
            out.push(first.to_ascii_uppercase());
            out.extend(chars.map(|c| c.to_ascii_lowercase()));
        }
        out
    }

    let trimmed = uri.trim();
    if trimmed.is_empty() {
        return None;
    }

    let without_scheme = trimmed
        .split_once("://")
        .map_or(trimmed, |(_scheme, rest)| rest);
    // Cutting at the first `#` or `?` equals removing the fragment first and
    // the query second: either way the text before the earliest marker survives.
    let path = without_scheme
        .split(['#', '?'])
        .next()
        .unwrap_or(without_scheme)
        .trim_end_matches('/');

    let segment = path.rsplit_once('/').map_or(path, |(_dir, last)| last).trim();
    if segment.is_empty() {
        return None;
    }

    let stem = segment
        .rsplit_once('.')
        .map_or(segment, |(stem, _extension)| stem)
        .trim();
    if stem.is_empty() {
        return None;
    }

    let words: Vec<String> = stem
        .split(['-', '_', ' '])
        .filter(|part| !part.is_empty())
        .map(capitalize_ascii)
        .collect();

    // A stem made only of separators (e.g. `---`) yields no words.
    (!words.is_empty()).then(|| words.join(" "))
}
359
360fn truncate_preview(text: &str) -> String {
361 text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
362}
363
364fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
365 let mime = meta.mime.as_deref()?;
366 if !mime.starts_with("image/") {
367 return None;
368 }
369
370 if let Some(caption) = meta.caption.as_ref() {
371 let trimmed = caption.trim();
372 if !trimmed.is_empty() {
373 return Some(truncate_preview(trimmed));
374 }
375 }
376
377 let mut segments: Vec<String> = Vec::new();
378 if let (Some(w), Some(h)) = (meta.width, meta.height) {
379 segments.push(format!("{}×{} px", w, h));
380 }
381 if let Some(exif) = meta.exif.as_ref() {
382 if let Some(model) = exif
383 .model
384 .as_ref()
385 .map(|s| s.trim())
386 .filter(|s| !s.is_empty())
387 {
388 segments.push(model.to_string());
389 } else if let Some(make) = exif
390 .make
391 .as_ref()
392 .map(|s| s.trim())
393 .filter(|s| !s.is_empty())
394 {
395 segments.push(make.to_string());
396 }
397
398 if let Some(datetime) = exif
399 .datetime
400 .as_ref()
401 .map(|s| s.trim())
402 .filter(|s| !s.is_empty())
403 {
404 segments.push(datetime.to_string());
405 }
406 }
407
408 if segments.is_empty() {
409 return Some("Image frame".to_string());
410 }
411
412 Some(truncate_preview(&segments.join(" · ")))
413}
414
415#[cfg(test)]
416mod tests {
417 use super::*;
418 use std::io::Read;
419 use std::num::NonZeroU64;
420 use tempfile::tempdir;
421
/// A frame that was put and committed is visible (stats, timeline, WAL state)
/// after the file is reopened.
#[test]
fn create_put_commit_reopen() {
    run_serial_test(|| {
        let workdir = tempdir().expect("tmp");
        let file_path = workdir.path().join("memory.mv2");

        // The writer handle is released at the end of this scope, before reopening.
        {
            let mut writer = Memvid::create(&file_path).expect("create");
            let seq = writer.put_bytes(b"hello").expect("put");
            assert_eq!(seq, 1);
            writer.commit().expect("commit");
        }

        let mut reader = Memvid::open(&file_path).expect("open");

        let stats = reader.stats().expect("stats");
        assert_eq!(stats.frame_count, 1);
        assert!(stats.has_time_index);

        let entries = reader
            .timeline(TimelineQuery::default())
            .expect("timeline");
        assert_eq!(entries.len(), 1);
        assert!(entries[0].preview.contains("hello"));

        let wal = reader.wal.stats();
        assert_eq!(wal.pending_bytes, 0);
        assert_eq!(wal.sequence, 2);
    });
}
452
/// With a limit of one, the timeline returns the first frame in forward order
/// and the last frame in reverse order.
#[test]
fn timeline_limit_and_reverse() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("timeline.mv2");

        {
            let mut writer = Memvid::create(&path).expect("create");
            writer.put_bytes(b"alpha").expect("put alpha");
            writer.put_bytes(b"beta").expect("put beta");
            writer.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&path).expect("open");

        // Both queries are identical apart from the direction.
        let single_entry_query = |reverse: bool| TimelineQuery {
            limit: NonZeroU64::new(1),
            since: None,
            until: None,
            reverse,
            #[cfg(feature = "temporal_track")]
            temporal: None,
        };

        let cases = [
            (false, "alpha", "timeline limit"),
            (true, "beta", "timeline reverse"),
        ];
        for (reverse, expected_preview, context) in cases {
            let entries = reopened
                .timeline(single_entry_query(reverse))
                .expect(context);
            assert_eq!(entries.len(), 1);
            assert!(entries[0].preview.contains(expected_preview));
        }
    });
}
493
/// Lexical search finds exactly one hit both on the writing handle and after
/// reopening the file.
#[test]
fn lex_search_roundtrip() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("lex.mv2");

        // The two searches differ only in their query text.
        let request_for = |query: &str| SearchRequest {
            query: query.to_string(),
            top_k: 10,
            snippet_chars: 200,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };

        {
            let mut writer = Memvid::create(&path).expect("create");
            writer.enable_lex().expect("enable");
            writer.put_bytes(b"Rust memory engine").expect("put");
            writer.put_bytes(b"Deterministic WAL").expect("put2");
            writer.commit().expect("commit");

            let live = writer.search(request_for("memory")).expect("search");
            assert_eq!(live.hits.len(), 1);
        }

        let mut reopened = Memvid::open(&path).expect("open");
        let persisted = reopened
            .search(request_for("wal"))
            .expect("search reopened");
        assert_eq!(persisted.hits.len(), 1);
    });
}
541
/// Vector search returns the frame whose embedding equals the query, both
/// before and after reopening, and the vec index is reported in the stats.
#[test]
fn vec_search_roundtrip() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("vec.mv2");

        {
            let mut writer = Memvid::create(&path).expect("create");
            writer.enable_vec().expect("enable");
            writer
                .put_with_embedding(b"vector", vec![0.0, 1.0])
                .expect("put");
            writer
                .put_with_embedding(b"vector-two", vec![1.0, 0.0])
                .expect("put2");
            writer.commit().expect("commit");

            let stats = writer.stats().expect("stats");
            assert!(stats.has_vec_index, "vec index should exist after commit");

            let hits = writer.search_vec(&[0.0, 1.0], 5).expect("search");
            let best = hits.first().map(|hit| hit.frame_id);
            assert_eq!(best, Some(0));
        }

        let mut reopened = Memvid::open(&path).expect("open");
        let reopened_stats = reopened.stats().expect("stats reopen");
        assert!(
            reopened_stats.has_vec_index,
            "vec index should exist after reopen: has_manifest={}, vec_enabled={}",
            reopened.toc.indexes.vec.is_some(),
            reopened.vec_enabled
        );

        let hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
        let best = hits.first().map(|hit| hit.frame_id);
        assert_eq!(best, Some(1));
    });
}
576
/// The byte range and chunk range reported for a hit address exactly the hit
/// text and chunk text inside the frame's canonical content.
#[test]
fn search_snippet_ranges_match_bytes() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("search.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        mem.enable_lex().expect("enable lex");

        let body = "Capacity tickets are signed grants that raise per-file caps.";
        let options = PutOptions::builder()
            .uri("mv2://docs/pricing.md")
            .title("Pricing")
            .build();
        mem.put_bytes_with_options(body.as_bytes(), options)
            .expect("put doc");
        mem.commit().expect("commit");

        let request = SearchRequest {
            query: "capacity tickets".into(),
            top_k: 5,
            snippet_chars: 160,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };
        let response = mem.search(request).expect("search");

        assert_eq!(response.total_hits, 1);
        assert_eq!(response.engine, SearchEngineKind::Tantivy);
        let hit = response.hits.first().expect("hit");

        // The frame is cloned so the TOC borrow ends before the content is read.
        let frame = mem
            .toc
            .frames
            .get(hit.frame_id as usize)
            .cloned()
            .expect("frame");
        let canonical = mem.frame_content(&frame).expect("content");
        let raw = canonical.as_bytes();

        let (start, end) = hit.range;
        assert!(end <= raw.len());
        assert_eq!(hit.text.as_bytes(), &raw[start..end]);

        let (chunk_start, chunk_end) = hit.chunk_range.expect("chunk range");
        assert!(chunk_start <= start);
        assert!(chunk_end >= end);

        let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
        assert_eq!(chunk_text, &canonical[chunk_start..chunk_end]);
    });
}
631
/// For a match located deep inside a long document, the reported chunk range
/// starts past offset zero, encloses the hit range, and addresses the chunk
/// text within the original content.
#[test]
fn search_chunk_range_reflects_chunk_offset() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("chunked.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        mem.enable_lex().expect("enable lex");

        // Long filler first, so the interesting sentence sits far from offset 0.
        let mut content = "alpha beta gamma delta. ".repeat(200);
        content.push_str("target segment appears here. Trailing context for verification.");

        let options = PutOptions::builder()
            .uri("mv2://docs/manual.txt")
            .title("Manual")
            .build();
        mem.put_bytes_with_options(content.as_bytes(), options)
            .expect("put doc");
        mem.commit().expect("commit");

        let request = SearchRequest {
            query: "target segment".into(),
            top_k: 5,
            snippet_chars: 160,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };
        let response = mem.search(request).expect("search");

        let hit = response.hits.first().expect("hit");
        assert_eq!(response.engine, SearchEngineKind::Tantivy);

        let (chunk_start, chunk_end) = hit.chunk_range.expect("chunk range");
        assert!(chunk_start > 0);
        assert!(hit.range.0 >= chunk_start);
        assert!(hit.range.1 <= chunk_end);
        assert!(hit.text.contains("target segment"));

        let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
        assert_eq!(chunk_text, &content[chunk_start..chunk_end]);
    });
}
680
/// With `auto_tag` and `extract_dates` enabled, the stored frame carries tags
/// and a content date containing the year found in the search text.
#[test]
fn auto_tag_populates_frame_metadata() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("autotag.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        mem.enable_lex().expect("enable lex");

        let tagging_options = PutOptions::builder()
            .search_text("Neural networks planning session 2024-10-08")
            .auto_tag(true)
            .extract_dates(true)
            .build();
        mem.put_bytes_with_options(b"agenda", tagging_options)
            .expect("put bytes");
        mem.commit().expect("commit");

        let stored = mem.toc.frames.first().expect("frame present");
        assert!(!stored.tags.is_empty());

        let mentions_year = stored
            .content_dates
            .iter()
            .any(|date| date.contains("2024"));
        assert!(mentions_year);
    });
}
704
/// A `uri` filter restricts hits to that exact URI, and a `scope` filter
/// restricts hits to URIs starting with the scope prefix.
#[test]
fn search_filters_by_uri_and_scope() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("filters.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        mem.enable_lex().expect("enable lex");

        // (uri, title, body, message used if the put fails)
        let documents = [
            (
                "mv2://docs/pricing.md",
                "Pricing",
                "Capacity tickets add per-file allowances",
                "put a",
            ),
            (
                "mv2://docs/faq.md",
                "FAQ",
                "Tickets can be issued by admins",
                "put b",
            ),
            (
                "mv2://blog/launch.md",
                "Launch",
                "Launch day tickets boost visibility",
                "put c",
            ),
        ];
        for (uri, title, body, context) in documents {
            let options = PutOptions::builder().uri(uri).title(title).build();
            mem.put_bytes_with_options(body.as_bytes(), options)
                .expect(context);
        }

        mem.commit().expect("commit");

        // Both searches share everything except the filter that is set.
        let tickets_request = |uri: Option<&str>, scope: Option<&str>| SearchRequest {
            query: "tickets".into(),
            top_k: 10,
            snippet_chars: 120,
            uri: uri.map(Into::into),
            scope: scope.map(Into::into),
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };

        let by_uri = mem
            .search(tickets_request(Some("mv2://docs/pricing.md"), None))
            .expect("uri search");
        assert_eq!(by_uri.engine, SearchEngineKind::Tantivy);
        assert!(
            by_uri
                .hits
                .iter()
                .all(|hit| hit.uri == "mv2://docs/pricing.md")
        );

        let by_scope = mem
            .search(tickets_request(None, Some("mv2://docs/")))
            .expect("scope search");
        assert_eq!(by_scope.engine, SearchEngineKind::Tantivy);
        assert!(
            by_scope
                .hits
                .iter()
                .all(|hit| hit.uri.starts_with("mv2://docs/"))
        );
    });
}
782
/// Paging with `top_k = 1`: the response echoes the request parameters, hands
/// out a cursor, and the page fetched with that cursor holds a different frame
/// while reporting the same total.
#[test]
fn search_pagination_and_params() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("paging.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        mem.enable_lex().expect("enable lex");

        let bodies = [
            "tickets unlock tier upgrades",
            "tickets expire after 30 days",
            "tickets may be revoked",
        ];
        for (idx, body) in bodies.iter().enumerate() {
            let uri = format!("mv2://docs/doc{idx}.md");
            let options = PutOptions::builder()
                .uri(&uri)
                .title(format!("Doc {idx}"))
                .build();
            mem.put_bytes_with_options(body.as_bytes(), options)
                .expect("put doc");
        }

        mem.commit().expect("commit");

        // Both pages use the same request; only the cursor changes.
        let page_request = |cursor| SearchRequest {
            query: "tickets".into(),
            top_k: 1,
            snippet_chars: 90,
            uri: None,
            scope: None,
            cursor,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };

        let first_page = mem.search(page_request(None)).expect("page one");
        assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
        assert_eq!(first_page.hits.len(), 1);
        assert_eq!(first_page.params.top_k, 1);
        assert_eq!(first_page.params.snippet_chars, 90);
        assert!(first_page.total_hits >= first_page.hits.len());

        let first_id = first_page.hits[0].frame_id;
        let cursor = first_page.next_cursor.clone().expect("cursor");

        let second_page = mem.search(page_request(Some(cursor))).expect("page two");
        assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
        assert_eq!(second_page.hits.len(), 1);
        assert_ne!(second_page.hits[0].frame_id, first_id);
        assert_eq!(second_page.total_hits, first_page.total_hits);
    });
}
853
/// After a commit with lex enabled, the Tantivy engine is initialised and
/// serves the search.
// NOTE(review): despite the test name, no fallback path is exercised here; the
// assertions only cover the case where Tantivy is available.
#[cfg(feature = "lex")]
#[test]
fn search_falls_back_when_tantivy_missing() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("fallback.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        mem.enable_lex().expect("enable lex");
        mem.put_bytes(b"tickets fallback test").expect("put");
        mem.commit().expect("commit");

        let engine_ready = mem.tantivy.is_some();
        assert!(engine_ready, "Tantivy should be initialized after commit");

        let request = SearchRequest {
            query: "tickets".into(),
            top_k: 5,
            snippet_chars: 120,
            uri: None,
            scope: None,
            cursor: None,
            #[cfg(feature = "temporal_track")]
            temporal: None,
            as_of_frame: None,
            as_of_ts: None,
        };
        let response = mem.search(request).expect("search with tantivy");

        assert_eq!(response.engine, SearchEngineKind::Tantivy);
        assert!(!response.hits.is_empty());
    });
}
892
/// A freshly written file with lex and vec enabled passes verification.
#[test]
fn verify_reports_success() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("verify.mv2");

        let mut writer = Memvid::create(&path).expect("create");
        writer.enable_lex().expect("enable lex");
        writer.enable_vec().expect("enable vec");
        writer
            .put_with_embedding(b"check", vec![0.5, 0.1])
            .expect("put");
        writer.commit().expect("commit");
        // Release the handle before verifying the file on disk.
        drop(writer);

        let report = Memvid::verify(&path, true).expect("verify");
        assert_eq!(report.overall_status, VerificationStatus::Passed);
    });
}
912
/// A file created without any explicit `enable_*` call reports, after reopening,
/// a lex index (with the `lex` feature) and a vec index (with the `vec` feature).
#[test]
fn test_create_enables_indexes_by_default() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("default_indexes.mv2");

        // Inspect the fresh handle, then let it go out of scope before reopening.
        {
            let fresh = Memvid::create(&path).expect("create");
            let before = fresh.stats().expect("stats");
            println!(
                "After create (before drop): lex={}, vec={}",
                before.has_lex_index, before.has_vec_index
            );
        }

        let reopened = Memvid::open(&path).expect("reopen");
        let after = reopened.stats().expect("stats after reopen");
        println!(
            "After reopen: lex={}, vec={}",
            after.has_lex_index, after.has_vec_index
        );

        #[cfg(feature = "lex")]
        assert!(
            after.has_lex_index,
            "lex index should be enabled by default"
        );

        #[cfg(feature = "vec")]
        assert!(
            after.has_vec_index,
            "vec index should be enabled by default"
        );
    });
}
952
/// Zeroes out the time-index bytes on disk, checks that verification no longer
/// passes cleanly, runs the doctor with `rebuild_time_index`, and expects the
/// reopened file to have a time index again.
#[test]
fn doctor_rebuilds_time_index() {
    use std::fs::OpenOptions;
    use std::io::{Seek, SeekFrom, Write};

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("doctor.mv2");

        // Write one frame and remember where the time index lives in the file.
        let manifest = {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"repair").expect("put");
            mem.commit().expect("commit");
            mem.rebuild_indexes(&[]).expect("rebuild");
            mem.commit().expect("commit after rebuild");
            println!(
                "test: post-commit header footer_offset={}",
                mem.header.footer_offset
            );
            let (offset, length) = mem
                .toc
                .time_index
                .as_ref()
                .map_or((0, 0), |m| (m.bytes_offset, m.bytes_length));
            println!(
                "test: post-commit manifest offset={} length={}",
                offset, length
            );
            mem.toc.time_index.clone().expect("time index manifest")
        };

        // Corrupt the time index by overwriting it with zeros.
        {
            let mut raw_file = OpenOptions::new()
                .read(true)
                .write(true)
                .open(&path)
                .expect("open file");
            raw_file
                .seek(SeekFrom::Start(manifest.bytes_offset))
                .expect("seek");
            let zeros = vec![0u8; manifest.bytes_length as usize];
            raw_file.write_all(&zeros).expect("corrupt time index");
            raw_file.flush().expect("flush");
            raw_file.sync_all().expect("sync");
        }

        let on_disk = std::fs::read(&path).expect("read file");
        let footer_scan = crate::footer::find_last_valid_footer(&on_disk);
        println!(
            "test: footer scan: {:?}",
            footer_scan
                .as_ref()
                .map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
        );

        // Either outcome is accepted: a report flagged as failed, or an error.
        println!("test: verifying corrupted memory");
        match Memvid::verify(&path, false) {
            Ok(verification) => {
                assert_eq!(verification.overall_status, VerificationStatus::Failed);
            }
            Err(e) => {
                println!("test: verify failed with error (expected): {}", e);
            }
        }

        println!("test: running doctor");
        let doctor_options = DoctorOptions {
            rebuild_time_index: true,
            rebuild_lex_index: false,
            ..DoctorOptions::default()
        };
        let report = Memvid::doctor(&path, doctor_options).expect("doctor");
        println!("test: doctor completed with status: {:?}", report.status);

        println!("test: verifying repaired memory");
        let reopened = Memvid::open(&path).expect("reopen after doctor");
        assert!(
            reopened.toc.time_index.is_some(),
            "time index should exist after doctor"
        );
    });
}
1043
/// A binary payload stored with a media manifest can be streamed back
/// byte-for-byte by URI, and the manifest survives the round trip unchanged.
#[test]
fn blob_reader_roundtrip_with_media_manifest() {
    const URI: &str = "mv2://video/clip.mp4";

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("blob.mv2");
        let payload = vec![0u8, 159, 1, 128, 42, 99, 200];
        let payload_len = payload.len() as u64;

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: payload_len,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1234),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        let mut doc_meta = DocMetadata::default();
        doc_meta.media = Some(manifest.clone());
        doc_meta.mime = Some("video/mp4".to_string());
        doc_meta.bytes = Some(payload_len);
        assert!(
            !doc_meta.is_empty(),
            "media manifest must count as metadata"
        );

        let options = PutOptions::builder()
            .metadata(doc_meta)
            .kind("video")
            .uri(URI)
            .build();

        {
            let mut writer = Memvid::create(&path).expect("create");
            writer
                .put_bytes_with_options(&payload, options)
                .expect("put bytes");
            writer.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&path).expect("open");

        let mut blob = reopened.blob_reader_by_uri(URI).expect("blob reader");
        let mut streamed = Vec::new();
        blob.read_to_end(&mut streamed).expect("read payload");
        assert_eq!(streamed, payload);

        let stored = reopened
            .media_manifest_by_uri(URI)
            .expect("manifest lookup")
            .expect("manifest present");
        assert_eq!(stored.mime, "video/mp4");
        assert_eq!(stored.kind, "video");
        assert_eq!(stored.bytes, payload_len);
        assert_eq!(stored.filename.as_deref(), Some("clip.mp4"));
        assert_eq!(stored.duration_ms, Some(1234));
        assert_eq!(stored.width, Some(1920));
        assert_eq!(stored.height, Some(1080));
        assert_eq!(stored.codec.as_deref(), Some("h264"));

        drop(dir);
    });
}
1108
/// Stores a 1.6 MB pseudo-random "video" payload with full metadata and checks
/// that the file reopens and reports exactly one frame.
#[test]
fn video_frame_roundtrip_does_not_corrupt_toc() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("video.mv2");

        // Deterministic shift/xor generator, so the payload is the same on every run.
        let mut seed = 0xDEAD_BEEF_u64;
        let mut video_bytes = vec![0u8; 1_600_000];
        for byte in &mut video_bytes {
            seed ^= seed << 7;
            seed ^= seed >> 9;
            seed ^= seed << 8;
            // Only the low byte is kept; the mask makes the narrowing lossless.
            *byte = (seed & 0xFF) as u8;
        }

        let hash_hex = blake3::hash(&video_bytes).to_hex().to_string();

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: video_bytes.len() as u64,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1_000),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        let meta = DocMetadata {
            mime: Some("video/mp4".to_string()),
            bytes: Some(video_bytes.len() as u64),
            hash: Some(hash_hex),
            caption: Some("Test clip".to_string()),
            media: Some(manifest),
            ..DocMetadata::default()
        };

        let options = PutOptions::builder()
            .kind("video")
            .metadata(meta)
            .tag("kind", "video")
            .uri("mv2://video/test.mp4")
            .title("Test clip")
            .build();

        {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes_with_options(&video_bytes, options)
                .expect("put video");
            mem.commit().expect("commit");
        }

        let reopened = Memvid::open(&path).expect("reopen");
        let stats = reopened.stats().expect("stats");
        assert_eq!(stats.frame_count, 1);
    });
}
1165
/// Applying a ticket whose sequence number does not increase is rejected with
/// `MemvidError::TicketSequence`.
#[test]
fn ticket_sequence_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("ticket.mv2");
        let mut mem = Memvid::create(&path).expect("create");

        let first = Ticket::new("issuer", 2);
        mem.apply_ticket(first).expect("apply first");

        // Same sequence number again: must be refused.
        let replay = Ticket::new("issuer", 2);
        let rejection = mem
            .apply_ticket(replay)
            .expect_err("sequence must increase");
        assert!(matches!(rejection, MemvidError::TicketSequence { .. }));
    });
}
1182
/// A ticket that caps capacity at 64 bytes beyond the initial `data_end` admits
/// a 32-byte put, while a following 40-byte put fails with
/// `MemvidError::CapacityExceeded`.
#[test]
fn capacity_limit_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("capacity.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        let base = mem.data_end;
        mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
            .expect("apply ticket");

        // Fixed-size arrays: borrowing a payload does not need a heap-allocated `Vec`.
        mem.put_bytes(&[0xFF_u8; 32]).expect("first put");
        mem.commit().expect("commit");

        let err = mem
            .put_bytes(&[0xFF_u8; 40])
            .expect_err("capacity exceeded");
        assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
    });
}
1203}