1#![deny(clippy::all, clippy::pedantic)]
2#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
3#![cfg_attr(
4 test,
5 allow(
6 clippy::useless_vec,
7 clippy::uninlined_format_args,
8 clippy::cast_possible_truncation,
9 clippy::float_cmp,
10 clippy::cast_precision_loss
11 )
12)]
13#![allow(clippy::module_name_repetitions)]
14#![allow(clippy::missing_errors_doc)]
20#![allow(clippy::missing_panics_doc)]
21#![allow(clippy::doc_markdown)]
22#![allow(clippy::cast_precision_loss)]
27#![allow(clippy::cast_possible_wrap)]
28#![allow(clippy::cast_sign_loss)]
29#![allow(clippy::cast_lossless)]
30#![allow(clippy::too_many_lines)]
34#![allow(clippy::too_many_arguments)]
35#![allow(clippy::items_after_statements)]
36#![allow(clippy::similar_names)]
37#![allow(clippy::manual_let_else)]
41#![allow(clippy::match_same_arms)]
42#![allow(clippy::if_same_then_else)]
43#![allow(clippy::collapsible_match)]
44#![allow(clippy::needless_pass_by_value)] #![allow(clippy::return_self_not_must_use)] #![allow(clippy::format_push_string)] #![allow(clippy::assigning_clones)] #![allow(clippy::struct_excessive_bools)] #![allow(clippy::needless_continue)]
54#![allow(clippy::needless_range_loop)]
55#![allow(clippy::case_sensitive_file_extension_comparisons)]
56#![allow(clippy::default_trait_access)]
57#![allow(clippy::field_reassign_with_default)]
58#![allow(clippy::unreadable_literal)] #![allow(clippy::implicit_hasher)]
60#![allow(clippy::manual_clamp)]
61#![allow(clippy::len_without_is_empty)] #![allow(clippy::large_enum_variant)]
63#![allow(clippy::ptr_arg)]
64#![allow(clippy::map_unwrap_or)]
65#![allow(clippy::incompatible_msrv)]
66#![allow(clippy::should_implement_trait)] #![allow(clippy::duplicated_attributes)]
68#![allow(clippy::unnecessary_wraps)]
#![allow(clippy::unused_self)]

/// Version string of this crate's package, taken from the `CARGO_PKG_VERSION`
/// environment variable at compile time.
pub const MEMVID_CORE_VERSION: &str = env!("CARGO_PKG_VERSION");
76
77mod analysis;
78pub mod constants;
79pub mod enrich;
80pub mod enrichment_worker;
81pub mod error;
82pub mod extract;
83pub mod extract_budgeted;
84pub mod footer;
85pub mod io;
86pub mod lex;
87mod lock;
88pub mod lockfile;
89pub mod memvid;
90pub mod models;
91pub mod pii;
92pub mod reader;
93mod registry;
94mod search;
95pub mod signature;
96pub mod structure;
97pub mod table;
98pub mod text;
99mod toc;
100pub mod types;
101pub mod vec;
102pub mod vec_pq;
103
104pub mod simd;
106
107#[cfg(feature = "vec")]
108pub mod text_embed;
109
110pub mod triplet;
112
113pub mod graph_search;
115
116pub mod clip;
119
120pub mod whisper;
123
124pub mod replay;
128
129#[cfg(feature = "encryption")]
132pub mod encryption;
133
134#[cfg(feature = "symspell_cleanup")]
136pub mod symspell_cleanup;
137
138#[cfg(feature = "api_embed")]
140pub mod api_embed;
141
142#[cfg(test)]
143mod tests_lex_flag;
144
145#[cfg(feature = "temporal_track")]
146pub use analysis::temporal::{
147 TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
148 TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
149};
150#[cfg(feature = "temporal_enrich")]
152pub use analysis::temporal_enrich::{
153 AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
154 TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
155 enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
156};
157pub use constants::*;
158pub use enrichment_worker::{EnrichmentWorkerConfig, EnrichmentWorkerStats};
159pub use error::{MemvidError, Result};
160pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
161pub use footer::{CommitFooter, find_last_valid_footer};
162#[cfg(feature = "temporal_track")]
163pub use io::temporal_index::{
164 append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
165 read_track as temporal_track_read, window as temporal_track_window,
166};
167pub use io::time_index::{
168 TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
169 read_track as time_index_read,
170};
171pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
172pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
173pub use lock::FileLock;
174pub use memvid::{
175 BlobReader, EnrichmentHandle, EnrichmentStats, LockSettings, Memvid, OpenReadOptions,
176 SketchCandidate, SketchSearchOptions, SketchSearchStats,
177 mutation::{CommitMode, CommitOptions},
178 start_enrichment_worker, start_enrichment_worker_with_embeddings,
179};
180#[cfg(feature = "parallel_segments")]
181pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
182pub use models::{
183 ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
184 ModelVerifyOptions, verify_model_dir, verify_models,
185};
186pub use reader::{
187 DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics, ReaderHint,
188 ReaderOutput, ReaderRegistry,
189};
190pub use signature::{
191 parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
192};
193pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
194#[cfg(feature = "temporal_track")]
195pub use types::{
196 AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
197 TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
198 TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
199 TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
200};
201pub use types::{
202 AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
203 AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
204 DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
205 DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
206 DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
207 DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
208 EmbeddingIdentity, EmbeddingIdentityCount, EmbeddingIdentitySummary, Frame, FrameId, FrameRole,
209 FrameStatus, Header, IndexManifests, LexIndexManifest, LexSegmentDescriptor,
210 MEMVID_EMBEDDING_DIMENSION_KEY, MEMVID_EMBEDDING_MODEL_KEY, MEMVID_EMBEDDING_NORMALIZED_KEY,
211 MEMVID_EMBEDDING_PROVIDER_KEY, MediaManifest, MemvidHandle, Open, PutOptions,
212 PutOptionsBuilder, Sealed, SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams,
213 SearchRequest, SearchResponse, SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta,
214 SegmentSpan, SourceSpan, Stats, TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier,
215 TimeIndexManifest, TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder,
216 Toc, VecEmbedder, VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
217 VerificationReport, VerificationStatus,
218};
219pub use types::{
221 EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
222 MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
223 MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
224};
225pub use types::{
227 EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
228 LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
229};
230pub use types::{
232 DEFAULT_HAMMING_THRESHOLD, QuerySketch, SKETCH_TRACK_MAGIC, SKETCH_TRACK_VERSION, SketchEntry,
233 SketchFlags, SketchTrack, SketchTrackHeader, SketchTrackManifest, SketchTrackStats,
234 SketchVariant, build_term_filter, compute_simhash, compute_token_weights, generate_sketch,
235 hash_token, hash_token_u32, read_sketch_track, term_filter_maybe_contains, tokenize_for_sketch,
236 write_sketch_track,
237};
238pub use types::{
240 Cardinality, PredicateId, PredicateSchema, SchemaError, SchemaRegistry, ValueType,
241};
242pub use memvid::memory::SchemaSummaryEntry;
244#[cfg(feature = "logic_mesh")]
246pub use analysis::ner::NerModel;
247pub use analysis::ner::{
248 ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL, NER_MODELS,
249 NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
250 is_ner_model_installed, ner_model_path, ner_tokenizer_path,
251};
252pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
254pub use triplet::{ExtractionMode, ExtractionStats, TripletExtractor};
256pub use graph_search::{GraphMatcher, QueryPlanner, hybrid_search};
258pub use types::{
260 BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
261 EmbeddingResult,
262};
263pub use types::reranker::{
265 Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
266};
267#[cfg(feature = "parallel_segments")]
268pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
269pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
270pub use vec_pq::{
271 CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
272 QuantizedVecIndexBuilder,
273};
274#[cfg(feature = "vec")]
276pub use text_embed::{
277 LocalTextEmbedder, TEXT_EMBED_MODELS, TextEmbedConfig, TextEmbedModelInfo,
278 default_text_model_info, get_text_model_info,
279};
280#[cfg(feature = "api_embed")]
282pub use api_embed::{
283 OPENAI_MODELS, OpenAIConfig, OpenAIEmbedder, OpenAIModelInfo, default_openai_model_info,
284 get_openai_model_info,
285};
286pub use clip::{
288 CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
289 ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
290 ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
291 get_model_info,
292};
293#[cfg(feature = "clip")]
295pub use clip::{ClipModel, calculate_color_variance, get_image_info};
296pub use whisper::{
298 TranscriptionResult, TranscriptionSegment, WHISPER_MODELS, WhisperConfig, WhisperError,
299 WhisperModelInfo, default_whisper_model_info, get_whisper_model_info,
300};
301#[cfg(feature = "whisper")]
303pub use whisper::{WHISPER_SAMPLE_RATE, WhisperTranscriber, decode_audio_file};
304pub use structure::{
306 ChunkType, ChunkingOptions, ChunkingResult, StructuralChunker, StructuredChunk,
307 StructuredDocument, TableChunkingStrategy, chunk_structured, detect_structure,
308};
309pub use types::adaptive::{
311 AdaptiveConfig, AdaptiveResult, AdaptiveStats, CutoffStrategy, find_adaptive_cutoff,
312 normalize_scores,
313};
314pub use replay::{
316 ActionType, Checkpoint, REPLAY_SEGMENT_MAGIC, REPLAY_SEGMENT_VERSION, ReplayAction,
317 ReplayManifest, ReplaySession, SessionSummary, StateSnapshot,
318};
319#[cfg(feature = "replay")]
321pub use replay::{
322 ActiveSession, ComparisonReport, ComparisonSummary, Divergence, DivergenceType, ModelResult,
323 ReplayConfig, ReplayOptions, ReplayResult,
324};
325
326#[cfg(test)]
327use once_cell::sync::Lazy;
328use std::fs::File;
329use std::io::Cursor;
330use std::path::Path;
331#[cfg(test)]
332use std::sync::Mutex;
333
334use bincode::config::{self, Config};
335use io::header::HeaderCodec;
336
/// Maximum length of a preview produced by `truncate_preview`.
///
/// NOTE(review): despite the `_BYTES` suffix, `truncate_preview` applies this as a
/// count of `char`s, not bytes.
const TIMELINE_PREVIEW_BYTES: usize = 120;
/// 512 MiB. Presumably the upper bound accepted for an index payload; the code that
/// enforces it is not in this part of the file — confirm at the use site.
const MAX_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 512 MiB. Presumably the upper bound accepted for the time index; confirm at the use site.
const MAX_TIME_INDEX_BYTES: u64 = 512 * 1024 * 1024;
/// 256 MiB. Presumably the upper bound accepted for a single frame; confirm at the use site.
const MAX_FRAME_BYTES: u64 = 256 * 1024 * 1024;
/// 32 768. Presumably the default cap on per-frame search text (unit not visible
/// here); confirm at the use site.
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32_768;
342
/// Lazily initialised global mutex that `run_serial_test` locks around each test body,
/// so tests that go through that helper never overlap.
#[cfg(test)]
#[allow(clippy::non_std_lazy_statics)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
346
/// Runs `f` while holding the global `SERIAL_TEST_MUTEX`, so test bodies wrapped by
/// this helper execute one at a time, and returns whatever `f` returns.
///
/// A panic inside `f` poisons the mutex. The mutex protects no data (its payload is
/// `()`), so poisoning signals no broken invariant; the guard is recovered instead of
/// panicking. That keeps one failing test from cascading into "mutex poisoned"
/// failures in every later serial test and hiding the real culprit.
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
    let _guard = SERIAL_TEST_MUTEX
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    f()
}
354
355impl Memvid {
356 #[cfg(feature = "lex")]
357 fn tantivy_index_pending(&self) -> bool {
358 self.tantivy_dirty
359 }
360
361 #[cfg(not(feature = "lex"))]
362 fn tantivy_index_pending(&self) -> bool {
363 false
364 }
365
366 #[cfg(feature = "lex")]
367 fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
368 if !self.tantivy_dirty {
369 return Ok(());
370 }
371 if let Some(engine) = self.tantivy.as_mut() {
372 engine.commit()?;
373 if embed_snapshot {
374 let snapshot = engine.snapshot_segments()?;
375 self.update_embedded_lex_snapshot(snapshot)?;
376 }
377 }
378 self.tantivy_dirty = false;
379 Ok(())
380 }
381
382 #[cfg(feature = "lex")]
383 fn flush_tantivy(&mut self) -> Result<()> {
384 self.flush_tantivy_conditional(true)
385 }
386
387 #[cfg(feature = "lex")]
388 #[allow(dead_code)]
389 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
390 self.flush_tantivy_conditional(false)
391 }
392
393 #[cfg(not(feature = "lex"))]
394 fn flush_tantivy(&mut self) -> Result<()> {
395 Ok(())
396 }
397
398 #[cfg(not(feature = "lex"))]
399 #[allow(dead_code)]
400 fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
401 Ok(())
402 }
403 #[must_use]
404 pub fn path(&self) -> &Path {
405 &self.path
406 }
407
408 #[must_use]
409 pub fn lock_handle(&self) -> &FileLock {
410 &self.lock
411 }
412
413 #[must_use]
414 pub fn is_read_only(&self) -> bool {
415 self.read_only
416 }
417
418 pub(crate) fn ensure_writable(&mut self) -> Result<()> {
419 if self.read_only {
420 self.lock.upgrade_to_exclusive()?;
421 self.read_only = false;
422 }
423 Ok(())
424 }
425
426 pub fn downgrade_to_shared(&mut self) -> Result<()> {
427 if self.read_only {
428 return Ok(());
429 }
430 if self.dirty || self.tantivy_index_pending() {
431 return Ok(());
432 }
433 self.lock.downgrade_to_shared()?;
434 self.read_only = true;
435 Ok(())
436 }
437}
438
439impl Drop for Memvid {
440 fn drop(&mut self) {
441 if self.dirty {
442 let _ = self.commit();
443 }
444 #[cfg(feature = "parallel_segments")]
446 {
447 use crate::memvid::lifecycle::cleanup_manifest_wal_public;
448 cleanup_manifest_wal_public(self.path());
449 }
450 }
451}
452
/// Writes `header` to `file` by delegating to `HeaderCodec::write`.
///
/// # Errors
/// Returns whatever error `HeaderCodec::write` reports.
pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
    HeaderCodec::write(file, header)
}
456
/// Returns the bincode configuration built from the standard config with fixed-width
/// integer encoding and little-endian byte order.
///
/// NOTE(review): the name suggests this is the encoding for WAL records; the callers
/// are not visible in this part of the file — confirm.
fn wal_config() -> impl Config {
    config::standard()
        .with_fixed_int_encoding()
        .with_little_endian()
}
462
463pub(crate) fn decode_canonical_bytes(
464 payload: &[u8],
465 encoding: CanonicalEncoding,
466 frame_id: FrameId,
467) -> Result<Vec<u8>> {
468 match encoding {
469 CanonicalEncoding::Plain => Ok(payload.to_vec()),
470 CanonicalEncoding::Zstd => {
471 zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
472 frame_id,
473 reason: "failed to decode canonical payload",
474 })
475 }
476 }
477}
478
/// Builds the URI `mv2://frames/{frame_id}` for the given frame id.
pub(crate) fn default_uri(frame_id: FrameId) -> String {
    format!("mv2://frames/{frame_id}")
}
482
/// Derives a human-readable title from the last path segment of `uri`.
///
/// Steps:
/// 1. Trim whitespace; drop the fragment (`#…`) and the query (`?…`).
/// 2. Drop everything up to and including the first `://`, if present.
/// 3. Take the last non-empty path segment (trailing `/` ignored) and strip the text
///    after its final `.` (the extension).
/// 4. Split the stem on `-`, `_` and spaces, then capitalise each word: first
///    character ASCII-uppercased, the rest ASCII-lowercased. Non-ASCII characters are
///    left unchanged.
///
/// Returns `None` when no usable word remains: empty input, an empty segment, a
/// dot-file such as `.env`, or a stem made only of separators.
///
/// The fragment and query are removed *before* looking for `://`. A `://` inside
/// them (for example `?next=http://…`) is not a scheme separator and must not decide
/// which part of the URI is treated as the path.
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
    let trimmed = uri.trim();
    if trimmed.is_empty() {
        return None;
    }

    let without_fragment = trimmed.split_once('#').map_or(trimmed, |(head, _)| head);
    let without_query = without_fragment
        .split_once('?')
        .map_or(without_fragment, |(head, _)| head);
    let path = without_query
        .split_once("://")
        .map_or(without_query, |(_, rest)| rest);

    let segment = path
        .trim_end_matches('/')
        .rsplit('/')
        .next()
        .map(str::trim)?;
    if segment.is_empty() {
        return None;
    }

    let stem = segment
        .rsplit_once('.')
        .map_or(segment, |(head, _)| head)
        .trim();
    if stem.is_empty() {
        return None;
    }

    let words: Vec<String> = stem
        .split(['-', '_', ' '])
        .filter(|part| !part.is_empty())
        .map(|part| {
            let mut chars = part.chars();
            let mut word = String::with_capacity(part.len());
            if let Some(first) = chars.next() {
                word.push(first.to_ascii_uppercase());
                word.extend(chars.map(|c| c.to_ascii_lowercase()));
            }
            word
        })
        .collect();

    if words.is_empty() {
        None
    } else {
        Some(words.join(" "))
    }
}
536
537fn truncate_preview(text: &str) -> String {
538 text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
539}
540
541fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
542 let mime = meta.mime.as_deref()?;
543 if !mime.starts_with("image/") {
544 return None;
545 }
546
547 if let Some(caption) = meta.caption.as_ref() {
548 let trimmed = caption.trim();
549 if !trimmed.is_empty() {
550 return Some(truncate_preview(trimmed));
551 }
552 }
553
554 let mut segments: Vec<String> = Vec::new();
555 if let (Some(w), Some(h)) = (meta.width, meta.height) {
556 segments.push(format!("{w}×{h} px"));
557 }
558 if let Some(exif) = meta.exif.as_ref() {
559 if let Some(model) = exif
560 .model
561 .as_ref()
562 .map(|s| s.trim())
563 .filter(|s| !s.is_empty())
564 {
565 segments.push(model.to_string());
566 } else if let Some(make) = exif
567 .make
568 .as_ref()
569 .map(|s| s.trim())
570 .filter(|s| !s.is_empty())
571 {
572 segments.push(make.to_string());
573 }
574
575 if let Some(datetime) = exif
576 .datetime
577 .as_ref()
578 .map(|s| s.trim())
579 .filter(|s| !s.is_empty())
580 {
581 segments.push(datetime.to_string());
582 }
583 }
584
585 if segments.is_empty() {
586 return Some("Image frame".to_string());
587 }
588
589 Some(truncate_preview(&segments.join(" · ")))
590}
591
592#[cfg(test)]
593mod tests {
594 use super::*;
595 use std::io::Read;
596 use std::num::NonZeroU64;
597 use tempfile::tempdir;
598
    // Round trip: create a file, store one frame, commit, then reopen and check the
    // frame count, the time index flag, the timeline preview and the WAL counters.
    #[test]
    fn create_put_commit_reopen() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("memory.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            let seq = mem.put_bytes(b"hello").expect("put");
            assert_eq!(seq, 1);
            mem.commit().expect("commit");

            drop(mem);

            let mut reopened = Memvid::open(&path).expect("open");
            let stats = reopened.stats().expect("stats");
            assert_eq!(stats.frame_count, 1);
            assert!(stats.has_time_index);

            let timeline = reopened
                .timeline(TimelineQuery::default())
                .expect("timeline");
            assert_eq!(timeline.len(), 1);
            assert!(timeline[0].preview.contains("hello"));

            // After a clean commit nothing should be left pending in the WAL.
            let wal_stats = reopened.wal.stats();
            assert_eq!(wal_stats.pending_bytes, 0);
            assert_eq!(wal_stats.sequence, 2);
        });
    }
629
    // With two frames stored, a timeline query limited to one entry returns the first
    // frame in forward order and the last frame when `reverse` is set.
    #[test]
    fn timeline_limit_and_reverse() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("timeline.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"alpha").expect("put alpha");
            mem.put_bytes(b"beta").expect("put beta");
            mem.commit().expect("commit");
            drop(mem);

            let mut reopened = Memvid::open(&path).expect("open");
            let limited = reopened
                .timeline(TimelineQuery {
                    limit: NonZeroU64::new(1),
                    since: None,
                    until: None,
                    reverse: false,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                })
                .expect("timeline limit");
            assert_eq!(limited.len(), 1);
            assert!(limited[0].preview.contains("alpha"));

            let reversed = reopened
                .timeline(TimelineQuery {
                    limit: NonZeroU64::new(1),
                    since: None,
                    until: None,
                    reverse: true,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                })
                .expect("timeline reverse");
            assert_eq!(reversed.len(), 1);
            assert!(reversed[0].preview.contains("beta"));
        });
    }
670
    // Lexical search finds exactly one hit both on the live instance and after the
    // file has been closed and reopened.
    #[test]
    fn lex_search_roundtrip() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("lex.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable");
            let _seq1 = mem.put_bytes(b"Rust memory engine").expect("put");
            let _seq2 = mem.put_bytes(b"Deterministic WAL").expect("put2");
            mem.commit().expect("commit");

            let request = SearchRequest {
                query: "memory".to_string(),
                top_k: 10,
                snippet_chars: 200,
                uri: None,
                scope: None,
                cursor: None,
                #[cfg(feature = "temporal_track")]
                temporal: None,
                as_of_frame: None,
                as_of_ts: None,
                no_sketch: false,
            };
            let response = mem.search(request).expect("search");
            assert_eq!(response.hits.len(), 1);

            drop(mem);

            // The lowercase query "wal" is expected to match the stored text "WAL".
            let mut reopened = Memvid::open(&path).expect("open");
            let request = SearchRequest {
                query: "wal".to_string(),
                top_k: 10,
                snippet_chars: 200,
                uri: None,
                scope: None,
                cursor: None,
                #[cfg(feature = "temporal_track")]
                temporal: None,
                as_of_frame: None,
                as_of_ts: None,
                no_sketch: false,
            };
            let response = reopened.search(request).expect("search reopened");
            assert_eq!(response.hits.len(), 1);
        });
    }
720
    // Vector search returns the frame whose embedding equals the query as the top hit,
    // both before and after reopening the file.
    #[test]
    fn vec_search_roundtrip() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("vec.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_vec().expect("enable");
            mem.put_with_embedding(b"vector", vec![0.0, 1.0])
                .expect("put");
            mem.put_with_embedding(b"vector-two", vec![1.0, 0.0])
                .expect("put2");
            mem.commit().expect("commit");

            let stats = mem.stats().expect("stats");
            assert!(stats.has_vec_index, "vec index should exist after commit");

            let hits = mem.search_vec(&[0.0, 1.0], 5).expect("search");
            assert_eq!(hits.first().map(|hit| hit.frame_id), Some(0));

            drop(mem);

            let mut reopened = Memvid::open(&path).expect("open");
            let reopened_stats = reopened.stats().expect("stats reopen");
            assert!(
                reopened_stats.has_vec_index,
                "vec index should exist after reopen: has_manifest={}, vec_enabled={}",
                reopened.toc.indexes.vec.is_some(),
                reopened.vec_enabled
            );
            let hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
            assert_eq!(hits.first().map(|hit| hit.frame_id), Some(1));
        });
    }
755
    // The byte range reported for a hit must index into the frame's canonical content
    // and reproduce the hit text; the chunk range must enclose it and match `chunk_text`.
    #[test]
    fn search_snippet_ranges_match_bytes() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("search.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");
            let options = PutOptions::builder()
                .uri("mv2://docs/pricing.md")
                .title("Pricing")
                .build();
            let text = "Capacity tickets are signed grants that raise per-file caps.";
            mem.put_bytes_with_options(text.as_bytes(), options)
                .expect("put doc");
            mem.commit().expect("commit");

            let response = mem
                .search(SearchRequest {
                    query: "capacity tickets".into(),
                    top_k: 5,
                    snippet_chars: 160,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("search");

            assert_eq!(response.total_hits, 1);
            assert_eq!(response.engine, SearchEngineKind::Tantivy);
            let hit = response.hits.first().expect("hit");
            let frame = mem
                .toc
                .frames
                .get(hit.frame_id as usize)
                .cloned()
                .expect("frame");
            let canonical = mem.frame_content(&frame).expect("content");
            let bytes = canonical.as_bytes();
            let (start, end) = hit.range;
            assert!(end <= bytes.len());
            assert_eq!(hit.text.as_bytes(), &bytes[start..end]);
            let chunk = hit.chunk_range.expect("chunk range");
            assert!(chunk.0 <= start);
            assert!(chunk.1 >= end);
            let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
            let chunk_slice = &canonical[chunk.0..chunk.1];
            assert_eq!(chunk_text, chunk_slice);
        });
    }
811
    // For a match located far into a long document, the reported chunk range must
    // start past offset 0, enclose the hit range, and slice the original content.
    #[test]
    fn search_chunk_range_reflects_chunk_offset() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("chunked.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            let options = PutOptions::builder()
                .uri("mv2://docs/manual.txt")
                .title("Manual")
                .build();
            // 200 repetitions of filler push the target phrase deep into the document.
            let prefix = "alpha beta gamma delta. ".repeat(200);
            let content = format!(
                "{}target segment appears here. Trailing context for verification.",
                prefix
            );
            mem.put_bytes_with_options(content.as_bytes(), options)
                .expect("put doc");
            mem.commit().expect("commit");

            let response = mem
                .search(SearchRequest {
                    query: "target segment".into(),
                    top_k: 5,
                    snippet_chars: 160,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("search");

            let hit = response.hits.first().expect("hit");
            assert_eq!(response.engine, SearchEngineKind::Tantivy);
            let chunk_range = hit.chunk_range.expect("chunk range");
            assert!(chunk_range.0 > 0);
            assert!(hit.range.0 >= chunk_range.0);
            assert!(hit.range.1 <= chunk_range.1);
            assert!(hit.text.contains("target segment"));
            let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
            assert_eq!(chunk_text, &content[chunk_range.0..chunk_range.1]);
        });
    }
861
    // With `auto_tag` and `extract_dates` enabled, the stored frame must end up with
    // at least one tag and a content date containing "2024" (taken from the search text).
    #[test]
    fn auto_tag_populates_frame_metadata() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("autotag.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            let options = PutOptions::builder()
                .search_text("Neural networks planning session 2024-10-08")
                .auto_tag(true)
                .extract_dates(true)
                .build();
            mem.put_bytes_with_options(b"agenda", options)
                .expect("put bytes");
            mem.commit().expect("commit");

            let frame = mem.toc.frames.first().expect("frame present");
            assert!(!frame.tags.is_empty());
            assert!(frame.content_dates.iter().any(|date| date.contains("2024")));
        });
    }
885
    // Three documents share the term "tickets"; an exact `uri` filter must only return
    // hits with that URI, and a `scope` filter only hits whose URI starts with the prefix.
    #[test]
    fn search_filters_by_uri_and_scope() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("filters.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            let options_a = PutOptions::builder()
                .uri("mv2://docs/pricing.md")
                .title("Pricing")
                .build();
            mem.put_bytes_with_options(b"Capacity tickets add per-file allowances", options_a)
                .expect("put a");

            let options_b = PutOptions::builder()
                .uri("mv2://docs/faq.md")
                .title("FAQ")
                .build();
            mem.put_bytes_with_options(b"Tickets can be issued by admins", options_b)
                .expect("put b");

            let options_c = PutOptions::builder()
                .uri("mv2://blog/launch.md")
                .title("Launch")
                .build();
            mem.put_bytes_with_options(b"Launch day tickets boost visibility", options_c)
                .expect("put c");

            mem.commit().expect("commit");

            let uri_response = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 10,
                    snippet_chars: 120,
                    uri: Some("mv2://docs/pricing.md".into()),
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("uri search");
            assert_eq!(uri_response.engine, SearchEngineKind::Tantivy);
            assert!(
                uri_response
                    .hits
                    .iter()
                    .all(|hit| hit.uri == "mv2://docs/pricing.md")
            );

            let scope_response = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 10,
                    snippet_chars: 120,
                    uri: None,
                    scope: Some("mv2://docs/".into()),
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("scope search");
            assert_eq!(scope_response.engine, SearchEngineKind::Tantivy);
            assert!(
                scope_response
                    .hits
                    .iter()
                    .all(|hit| hit.uri.starts_with("mv2://docs/"))
            );
        });
    }
965
    // Paging with `top_k = 1`: the first page echoes the request parameters and yields
    // a cursor; the second page (using that cursor) returns a different frame while
    // reporting the same total hit count.
    #[test]
    fn search_pagination_and_params() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("paging.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");

            for (idx, text) in [
                "tickets unlock tier upgrades",
                "tickets expire after 30 days",
                "tickets may be revoked",
            ]
            .iter()
            .enumerate()
            {
                let uri = format!("mv2://docs/doc{idx}.md");
                let options = PutOptions::builder()
                    .uri(&uri)
                    .title(format!("Doc {idx}"))
                    .build();
                mem.put_bytes_with_options(text.as_bytes(), options)
                    .expect("put doc");
            }

            mem.commit().expect("commit");

            let first_page = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 1,
                    snippet_chars: 90,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("page one");
            assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
            assert_eq!(first_page.hits.len(), 1);
            assert_eq!(first_page.params.top_k, 1);
            assert_eq!(first_page.params.snippet_chars, 90);
            assert!(first_page.total_hits >= first_page.hits.len());
            let cursor = first_page.next_cursor.clone().expect("cursor");
            let first_id = first_page.hits[0].frame_id;

            let second_page = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 1,
                    snippet_chars: 90,
                    uri: None,
                    scope: None,
                    cursor: Some(cursor),
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("page two");
            assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
            assert_eq!(second_page.hits.len(), 1);
            assert_ne!(second_page.hits[0].frame_id, first_id);
            assert_eq!(second_page.total_hits, first_page.total_hits);
        });
    }
1038
    // NOTE(review): despite its name, this test does not exercise a fallback path. It
    // asserts that the Tantivy engine is present after commit and that the search is
    // served by `SearchEngineKind::Tantivy` with at least one hit.
    #[cfg(feature = "lex")]
    #[test]
    fn search_falls_back_when_tantivy_missing() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("fallback.mv2");

            let mut mem = Memvid::create(&path).expect("create");
            mem.enable_lex().expect("enable lex");
            mem.put_bytes(b"tickets fallback test").expect("put");
            mem.commit().expect("commit");

            assert!(
                mem.tantivy.is_some(),
                "Tantivy should be initialized after commit"
            );

            let response = mem
                .search(SearchRequest {
                    query: "tickets".into(),
                    top_k: 5,
                    snippet_chars: 120,
                    uri: None,
                    scope: None,
                    cursor: None,
                    #[cfg(feature = "temporal_track")]
                    temporal: None,
                    as_of_frame: None,
                    as_of_ts: None,
                    no_sketch: false,
                })
                .expect("search with tantivy");

            assert_eq!(response.engine, SearchEngineKind::Tantivy);
            assert!(!response.hits.is_empty());
        });
    }
1078
    // A freshly written file with lex and vec enabled must pass `Memvid::verify`
    // (called with its second argument set to `true`).
    #[test]
    fn verify_reports_success() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("verify.mv2");

            // Scope the writer so it is dropped before verification opens the file.
            {
                let mut mem = Memvid::create(&path).expect("create");
                mem.enable_lex().expect("enable lex");
                mem.enable_vec().expect("enable vec");
                mem.put_with_embedding(b"check", vec![0.5, 0.1])
                    .expect("put");
                mem.commit().expect("commit");
            }

            let report = Memvid::verify(&path, true).expect("verify");
            assert_eq!(report.overall_status, VerificationStatus::Passed);
        });
    }
1098
    // A file created with defaults and then reopened must report a lex index (with the
    // `lex` feature) and a vec index (with the `vec` feature). The stats are also
    // printed before drop and after reopen to aid debugging.
    #[test]
    fn test_create_enables_indexes_by_default() {
        run_serial_test(|| {
            let dir = tempdir().expect("tmp");
            let path = dir.path().join("default_indexes.mv2");

            let mem = Memvid::create(&path).expect("create");

            let stats = mem.stats().expect("stats");
            println!(
                "After create (before drop): lex={}, vec={}",
                stats.has_lex_index, stats.has_vec_index
            );

            drop(mem);

            let reopened = Memvid::open(&path).expect("reopen");
            let stats2 = reopened.stats().expect("stats after reopen");
            println!(
                "After reopen: lex={}, vec={}",
                stats2.has_lex_index, stats2.has_vec_index
            );

            #[cfg(feature = "lex")]
            assert!(
                stats2.has_lex_index,
                "lex index should be enabled by default"
            );

            #[cfg(feature = "vec")]
            assert!(
                stats2.has_vec_index,
                "vec index should be enabled by default"
            );
        });
    }
1138
/// Zeroes the on-disk bytes of the time index, checks that verification no
/// longer passes, runs `Memvid::doctor` with `rebuild_time_index` set, and
/// asserts that the reopened memory has a time index manifest again.
#[test]
fn doctor_rebuilds_time_index() {
    use std::fs::OpenOptions;
    use std::io::{Seek, SeekFrom, Write};

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("doctor.mv2");

        // Build a committed memory and capture the time index manifest, which
        // tells us where the index bytes live in the file.
        let manifest = {
            let mut mem = Memvid::create(&path).expect("create");
            mem.put_bytes(b"repair").expect("put");
            mem.commit().expect("commit");
            mem.rebuild_indexes(&[]).expect("rebuild");
            mem.commit().expect("commit after rebuild");
            println!(
                "test: post-commit header footer_offset={}",
                mem.header.footer_offset
            );
            println!(
                "test: post-commit manifest offset={} length={}",
                mem.toc.time_index.as_ref().map_or(0, |m| m.bytes_offset),
                mem.toc.time_index.as_ref().map_or(0, |m| m.bytes_length)
            );
            mem.toc.time_index.clone().expect("time index manifest")
        };

        // Overwrite the time index region with zeros to simulate corruption.
        {
            let mut file = OpenOptions::new()
                .read(true)
                .write(true)
                .open(&path)
                .expect("open file");
            file.seek(SeekFrom::Start(manifest.bytes_offset))
                .expect("seek");
            // Fail loudly if the length cannot be represented: falling back to
            // zero would write nothing and silently skip the corruption step.
            let corrupt_len = usize::try_from(manifest.bytes_length)
                .expect("time index length fits in usize");
            let zeros = vec![0u8; corrupt_len];
            file.write_all(&zeros).expect("corrupt time index");
            file.flush().expect("flush");
            file.sync_all().expect("sync");
        }

        println!(
            "test: footer scan: {:?}",
            crate::footer::find_last_valid_footer(&std::fs::read(&path).expect("read file"))
                .as_ref()
                .map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
        );
        println!("test: verifying corrupted memory");
        // Either outcome is accepted: a report flagged as failed, or an
        // outright verification error.
        match Memvid::verify(&path, false) {
            Ok(report) => {
                assert_eq!(report.overall_status, VerificationStatus::Failed);
            }
            Err(e) => {
                println!("test: verify failed with error (expected): {e}");
            }
        }

        println!("test: running doctor");
        let report = Memvid::doctor(
            &path,
            DoctorOptions {
                rebuild_time_index: true,
                rebuild_lex_index: false,
                ..DoctorOptions::default()
            },
        )
        .expect("doctor");
        println!("test: doctor completed with status: {:?}", report.status);
        println!("test: verifying repaired memory");
        let reopened = Memvid::open(&path).expect("reopen after doctor");
        assert!(
            reopened.toc.time_index.is_some(),
            "time index should exist after doctor"
        );
    });
}
1229
/// Stores a small binary payload together with a media manifest, reopens the
/// file, and checks that both the bytes and every manifest field read back
/// unchanged.
#[test]
fn blob_reader_roundtrip_with_media_manifest() {
    const CLIP_URI: &str = "mv2://video/clip.mp4";

    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let mv2_path = dir.path().join("blob.mv2");
        let payload = vec![0u8, 159, 1, 128, 42, 99, 200];
        let payload_len = payload.len() as u64;

        let manifest = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: payload_len,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1234),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        let doc_meta = DocMetadata {
            media: Some(manifest.clone()),
            mime: Some("video/mp4".to_string()),
            bytes: Some(payload_len),
            ..DocMetadata::default()
        };
        assert!(
            !doc_meta.is_empty(),
            "media manifest must count as metadata"
        );

        let options = PutOptions::builder()
            .metadata(doc_meta)
            .kind("video")
            .uri(CLIP_URI)
            .build();

        // Write and commit inside a scope so the writer is closed before reopening.
        {
            let mut writer = Memvid::create(&mv2_path).expect("create");
            writer
                .put_bytes_with_options(&payload, options)
                .expect("put bytes");
            writer.commit().expect("commit");
        }

        let mut reopened = Memvid::open(&mv2_path).expect("open");
        let mut blob = reopened.blob_reader_by_uri(CLIP_URI).expect("blob reader");
        let mut read_back = Vec::new();
        blob.read_to_end(&mut read_back).expect("read payload");
        assert_eq!(read_back, payload);

        let stored = reopened
            .media_manifest_by_uri(CLIP_URI)
            .expect("manifest lookup")
            .expect("manifest present");
        assert_eq!(stored.mime, "video/mp4");
        assert_eq!(stored.kind, "video");
        assert_eq!(stored.bytes, payload_len);
        assert_eq!(stored.filename.as_deref(), Some("clip.mp4"));
        assert_eq!(stored.duration_ms, Some(1234));
        assert_eq!(stored.width, Some(1920));
        assert_eq!(stored.height, Some(1080));
        assert_eq!(stored.codec.as_deref(), Some("h264"));

        drop(dir);
    });
}
1294
/// Stores a 1.6 MB pseudo-random "video" payload with full metadata, then
/// reopens the file and checks that exactly one frame is reported.
#[test]
fn video_frame_roundtrip_does_not_corrupt_toc() {
    use crate::types::MediaManifest;

    run_serial_test(|| {
        let workspace = tempdir().expect("tmp");
        let mv2_path = workspace.path().join("video.mv2");

        // Deterministic xorshift-style generator: each byte is the low 8 bits
        // of the state after three shift/xor steps.
        let mut state = 0xDEADBEEF_u64;
        let video_bytes: Vec<u8> = (0..1_600_000)
            .map(|_| {
                state ^= state << 7;
                state ^= state >> 9;
                state ^= state << 8;
                (state & 0xFF) as u8
            })
            .collect();
        let byte_len = video_bytes.len() as u64;

        let digest_hex = blake3::hash(&video_bytes).to_hex().to_string();

        let media = MediaManifest {
            kind: "video".to_string(),
            mime: "video/mp4".to_string(),
            bytes: byte_len,
            filename: Some("clip.mp4".to_string()),
            duration_ms: Some(1_000),
            width: Some(1920),
            height: Some(1080),
            codec: Some("h264".to_string()),
        };

        let mut doc_meta = DocMetadata::default();
        doc_meta.mime = Some("video/mp4".to_string());
        doc_meta.bytes = Some(byte_len);
        doc_meta.hash = Some(digest_hex);
        doc_meta.caption = Some("Test clip".to_string());
        doc_meta.media = Some(media);

        let put_options = PutOptions::builder()
            .kind("video")
            .metadata(doc_meta)
            .tag("kind", "video")
            .uri("mv2://video/test.mp4")
            .title("Test clip")
            .build();

        // Close the writer before reopening the file for inspection.
        {
            let mut writer = Memvid::create(&mv2_path).expect("create");
            writer
                .put_bytes_with_options(&video_bytes, put_options)
                .expect("put video");
            writer.commit().expect("commit");
        }

        let reopened = Memvid::open(&mv2_path).expect("reopen");
        let frame_total = reopened.stats().expect("stats").frame_count;
        assert_eq!(frame_total, 1);
    });
}
1351
/// Applies a ticket, then applies a second ticket with the same sequence
/// number and expects `MemvidError::TicketSequence`.
#[test]
#[allow(deprecated)]
fn ticket_sequence_enforced() {
    run_serial_test(|| {
        let workspace = tempdir().expect("tmp");
        let mv2_path = workspace.path().join("ticket.mv2");
        let mut memory = Memvid::create(&mv2_path).expect("create");

        let first = Ticket::new("issuer", 2);
        memory.apply_ticket(first).expect("apply first");

        // Same issuer and same sequence number as the ticket already applied.
        let replayed = Ticket::new("issuer", 2);
        let rejection = memory
            .apply_ticket(replayed)
            .expect_err("sequence must increase");
        assert!(matches!(rejection, MemvidError::TicketSequence { .. }));
    });
}
1369
/// Applies a ticket whose capacity is 64 bytes above the current `data_end`,
/// stores a 32-byte payload, and expects a following 40-byte payload to be
/// rejected with `MemvidError::CapacityExceeded`.
#[test]
#[allow(deprecated)]
fn capacity_limit_enforced() {
    run_serial_test(|| {
        let dir = tempdir().expect("tmp");
        let path = dir.path().join("capacity.mv2");

        let mut mem = Memvid::create(&path).expect("create");
        // Capacity is expressed relative to where the data region currently ends.
        let base = mem.data_end;
        mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
            .expect("apply ticket");

        // Array literals for both payloads: no heap allocation is needed and
        // the two calls now read the same way.
        mem.put_bytes(&[0xFF; 32]).expect("first put");
        mem.commit().expect("commit");

        let err = mem.put_bytes(&[0xFF; 40]).expect_err("capacity exceeded");
        assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
    });
}
1389}