#![deny(clippy::all, clippy::pedantic)]
#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
#![cfg_attr(
test,
allow(
clippy::useless_vec,
clippy::uninlined_format_args,
clippy::cast_possible_truncation,
clippy::float_cmp,
clippy::cast_precision_loss
)
)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_panics_doc)]
#![allow(clippy::doc_markdown)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_lossless)]
#![allow(clippy::too_many_lines)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::items_after_statements)]
#![allow(clippy::similar_names)]
#![allow(clippy::manual_let_else)]
#![allow(clippy::match_same_arms)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::collapsible_match)]
#![allow(clippy::needless_pass_by_value)] #![allow(clippy::return_self_not_must_use)] #![allow(clippy::format_push_string)] #![allow(clippy::assigning_clones)] #![allow(clippy::struct_excessive_bools)] #![allow(clippy::needless_continue)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::case_sensitive_file_extension_comparisons)]
#![allow(clippy::default_trait_access)]
#![allow(clippy::field_reassign_with_default)]
#![allow(clippy::unreadable_literal)] #![allow(clippy::implicit_hasher)]
#![allow(clippy::manual_clamp)]
#![allow(clippy::len_without_is_empty)] #![allow(clippy::large_enum_variant)]
#![allow(clippy::ptr_arg)]
#![allow(clippy::map_unwrap_or)]
#![allow(clippy::incompatible_msrv)]
#![allow(clippy::should_implement_trait)] #![allow(clippy::duplicated_attributes)]
#![allow(clippy::unnecessary_wraps)]
#![allow(clippy::unused_self)]
pub const MEMVID_CORE_VERSION: &str = env!("CARGO_PKG_VERSION");
mod analysis;
pub mod constants;
pub mod enrich;
pub mod enrichment_worker;
pub mod error;
pub mod extract;
pub mod extract_budgeted;
pub mod footer;
pub mod io;
pub mod lex;
mod lock;
pub mod lockfile;
pub mod memvid;
pub mod models;
pub mod pii;
pub mod reader;
mod registry;
mod search;
pub mod signature;
pub mod structure;
pub mod table;
pub mod text;
mod toc;
pub mod types;
pub mod vec;
pub mod vec_pq;
pub mod simd;
#[cfg(feature = "vec")]
pub mod text_embed;
pub mod triplet;
pub mod graph_search;
pub mod clip;
pub mod whisper;
pub mod replay;
#[cfg(feature = "encryption")]
pub mod encryption;
#[cfg(feature = "symspell_cleanup")]
pub mod symspell_cleanup;
#[cfg(feature = "api_embed")]
pub mod api_embed;
#[cfg(test)]
mod tests_lex_flag;
#[cfg(feature = "temporal_track")]
pub use analysis::temporal::{
TemporalContext, TemporalNormalizer, TemporalResolution, TemporalResolutionFlag,
TemporalResolutionValue, parse_clock_inheritance, parse_week_start,
};
#[cfg(feature = "temporal_enrich")]
pub use analysis::temporal_enrich::{
AnchorSource as TemporalEnrichAnchorSource, RelativePhrase, ResolvedTemporal,
TemporalAnchorInfo, TemporalAnchorTracker, TemporalEnrichment, detect_relative_phrases,
enrich_chunk, enrich_chunks, enrich_document, resolve_relative_phrase,
};
pub use constants::*;
pub use enrichment_worker::{EnrichmentWorkerConfig, EnrichmentWorkerStats};
pub use error::{MemvidError, Result};
pub use extract::{DocumentProcessor, ExtractedDocument, ProcessorConfig};
pub use footer::{CommitFooter, find_last_valid_footer};
#[cfg(feature = "temporal_track")]
pub use io::temporal_index::{
append_track as temporal_track_append, calculate_checksum as temporal_track_checksum,
read_track as temporal_track_read, window as temporal_track_window,
};
pub use io::time_index::{
TimeIndexEntry, append_track as time_index_append, calculate_checksum as time_index_checksum,
read_track as time_index_read,
};
pub use io::wal::{EmbeddedWal, WalRecord, WalStats};
pub use lex::{LexIndex, LexIndexArtifact, LexIndexBuilder, LexSearchHit};
pub use lock::FileLock;
pub use memvid::{
BlobReader, EnrichmentHandle, EnrichmentStats, LockSettings, Memvid, OpenReadOptions,
SketchCandidate, SketchSearchOptions, SketchSearchStats,
mutation::{CommitMode, CommitOptions},
start_enrichment_worker, start_enrichment_worker_with_embeddings,
};
#[cfg(feature = "parallel_segments")]
pub use memvid::{BuildOpts, ParallelInput, ParallelPayload};
pub use models::{
ModelManifest, ModelManifestEntry, ModelVerification, ModelVerificationStatus,
ModelVerifyOptions, verify_model_dir, verify_models,
};
pub use reader::{
DetectedTable, DocumentFormat, DocumentReader, PassthroughReader, PdfReader, ReaderDiagnostics,
ReaderHint, ReaderOutput, ReaderRegistry, XlsxChunkingOptions, XlsxReader,
};
pub use signature::{
parse_ed25519_public_key_base64, verify_model_manifest, verify_ticket_signature,
};
pub use text::{NormalizedText, normalize_text, truncate_at_grapheme_boundary};
pub use types::{
ACL_POLICY_VERSION_KEY, ACL_READ_GROUPS_KEY, ACL_READ_PRINCIPALS_KEY, ACL_READ_ROLES_KEY,
ACL_RESOURCE_ID_KEY, ACL_TENANT_ID_KEY, ACL_VISIBILITY_KEY, AclContext, AclEnforcementMode,
AskCitation, AskMode, AskRequest, AskResponse, AskRetriever, AskStats, AudioSegmentMetadata,
AuditOptions, AuditReport, CanonicalEncoding, DOCTOR_PLAN_VERSION, DocAudioMetadata,
DocExifMetadata, DocGpsMetadata, DocMetadata, DoctorActionDetail, DoctorActionKind,
DoctorActionPlan, DoctorActionReport, DoctorActionStatus, DoctorFinding, DoctorFindingCode,
DoctorMetrics, DoctorOptions, DoctorPhaseDuration, DoctorPhaseKind, DoctorPhasePlan,
DoctorPhaseReport, DoctorPhaseStatus, DoctorPlan, DoctorReport, DoctorSeverity, DoctorStatus,
EmbeddingIdentity, EmbeddingIdentityCount, EmbeddingIdentitySummary, Frame, FrameId, FrameRole,
FrameStatus, Header, IndexManifests, LexIndexManifest, LexSegmentDescriptor,
MEMVID_EMBEDDING_DIMENSION_KEY, MEMVID_EMBEDDING_MODEL_KEY, MEMVID_EMBEDDING_NORMALIZED_KEY,
MEMVID_EMBEDDING_PROVIDER_KEY, MediaManifest, MemvidHandle, Open, PutManyOpts, PutOptions,
PutOptionsBuilder, Sealed, SearchEngineKind, SearchHit, SearchHitMetadata, SearchParams,
SearchRequest, SearchResponse, SegmentCatalog, SegmentCommon, SegmentCompression, SegmentMeta,
SegmentSpan, SourceSpan, Stats, TextChunkManifest, TextChunkRange, Ticket, TicketRef, Tier,
TimeIndexManifest, TimeSegmentDescriptor, TimelineEntry, TimelineQuery, TimelineQueryBuilder,
Toc, VecEmbedder, VecIndexManifest, VecSegmentDescriptor, VectorCompression, VerificationCheck,
VerificationReport, VerificationStatus,
};
#[cfg(feature = "temporal_track")]
pub use types::{
AnchorSource, SearchHitTemporal, SearchHitTemporalAnchor, SearchHitTemporalMention,
TEMPORAL_TRACK_FLAG_HAS_ANCHORS, TEMPORAL_TRACK_FLAG_HAS_MENTIONS, TemporalAnchor,
TemporalCapabilities, TemporalFilter, TemporalMention, TemporalMentionFlags,
TemporalMentionKind, TemporalTrack, TemporalTrackManifest,
};
pub use types::{
EngineStamp, EnrichmentManifest, EnrichmentRecord, MEMORIES_TRACK_MAGIC,
MEMORIES_TRACK_VERSION, MemoriesStats, MemoriesTrack, MemoryCard, MemoryCardBuilder,
MemoryCardBuilderError, MemoryCardId, MemoryKind, Polarity, SlotIndex, VersionRelation,
};
pub use types::{
EdgeDirection, EntityKind, FollowResult, LOGIC_MESH_MAGIC, LOGIC_MESH_VERSION, LinkType,
LogicMesh, LogicMeshManifest, MeshEdge, MeshNode,
};
pub use types::{
DEFAULT_HAMMING_THRESHOLD, QuerySketch, SKETCH_TRACK_MAGIC, SKETCH_TRACK_VERSION, SketchEntry,
SketchFlags, SketchTrack, SketchTrackHeader, SketchTrackManifest, SketchTrackStats,
SketchVariant, build_term_filter, compute_simhash, compute_token_weights, generate_sketch,
hash_token, hash_token_u32, read_sketch_track, term_filter_maybe_contains, tokenize_for_sketch,
write_sketch_track,
};
pub use types::{
Cardinality, PredicateId, PredicateSchema, SchemaError, SchemaRegistry, ValueType,
};
pub use memvid::memory::SchemaSummaryEntry;
#[cfg(feature = "logic_mesh")]
pub use analysis::ner::NerModel;
pub use analysis::ner::{
ExtractedEntity, FrameEntities, NER_MODEL_NAME, NER_MODEL_SIZE_MB, NER_MODEL_URL, NER_MODELS,
NER_TOKENIZER_URL, NerModelInfo, default_ner_model_info, get_ner_model_info,
is_ner_model_installed, ner_model_path, ner_tokenizer_path,
};
pub use enrich::{EnrichmentContext, EnrichmentEngine, EnrichmentResult, RulesEngine};
pub use triplet::{ExtractionMode, ExtractionStats, TripletExtractor};
pub use graph_search::{GraphMatcher, QueryPlanner, hybrid_search};
pub use types::{
BatchEmbeddingResult, EmbeddingConfig, EmbeddingProvider, EmbeddingProviderKind,
EmbeddingResult,
};
pub use types::reranker::{
Reranker, RerankerConfig, RerankerDocument, RerankerKind, RerankerResult,
};
#[cfg(feature = "parallel_segments")]
pub use types::{IndexSegmentRef, SegmentKind, SegmentStats};
pub use vec::{VecIndex, VecIndexArtifact, VecSearchHit};
pub use vec_pq::{
CompressionStats, ProductQuantizer, QuantizedVecIndex, QuantizedVecIndexArtifact,
QuantizedVecIndexBuilder,
};
#[cfg(feature = "vec")]
pub use text_embed::{
LocalTextEmbedder, TEXT_EMBED_MODELS, TextEmbedConfig, TextEmbedModelInfo,
default_text_model_info, get_text_model_info,
};
#[cfg(feature = "api_embed")]
pub use api_embed::{
OPENAI_MODELS, OpenAIConfig, OpenAIEmbedder, OpenAIModelInfo, default_openai_model_info,
get_openai_model_info,
};
pub use clip::{
CLIP_MODELS, ClipConfig, ClipDocument, ClipEmbeddingProvider, ClipError, ClipIndex,
ClipIndexArtifact, ClipIndexBuilder, ClipIndexManifest, ClipModelInfo, ClipSearchHit,
ImageInfo, MOBILECLIP_DIMS, SIGLIP_DIMS, default_model_info, filter_junk_images,
get_model_info,
};
#[cfg(feature = "clip")]
pub use clip::{ClipModel, calculate_color_variance, get_image_info};
pub use whisper::{
TranscriptionResult, TranscriptionSegment, WHISPER_MODELS, WhisperConfig, WhisperError,
WhisperModelInfo, default_whisper_model_info, get_whisper_model_info,
};
#[cfg(feature = "whisper")]
pub use whisper::{WHISPER_SAMPLE_RATE, WhisperTranscriber, decode_audio_file};
pub use structure::{
ChunkType, ChunkingOptions, ChunkingResult, StructuralChunker, StructuredChunk,
StructuredDocument, TableChunkingStrategy, chunk_structured, detect_structure,
};
pub use types::adaptive::{
AdaptiveConfig, AdaptiveResult, AdaptiveStats, CutoffStrategy, find_adaptive_cutoff,
normalize_scores,
};
pub use replay::{
ActionType, Checkpoint, REPLAY_SEGMENT_MAGIC, REPLAY_SEGMENT_VERSION, ReplayAction,
ReplayManifest, ReplaySession, SessionSummary, StateSnapshot,
};
#[cfg(feature = "replay")]
pub use replay::{
ActiveSession, ComparisonReport, ComparisonSummary, Divergence, DivergenceType, ModelResult,
ReplayConfig, ReplayOptions, ReplayResult,
};
#[cfg(test)]
use once_cell::sync::Lazy;
use std::fs::File;
use std::io::Cursor;
use std::path::Path;
#[cfg(test)]
use std::sync::Mutex;
use bincode::config::{self, Config};
use io::header::HeaderCodec;
const TIMELINE_PREVIEW_BYTES: usize = 120;
const MAX_INDEX_BYTES: u64 = 512 * 1024 * 1024; const MAX_TIME_INDEX_BYTES: u64 = 512 * 1024 * 1024;
const MAX_FRAME_BYTES: u64 = 256 * 1024 * 1024;
const DEFAULT_SEARCH_TEXT_LIMIT: usize = 32_768;
#[cfg(test)]
#[allow(clippy::non_std_lazy_statics)]
static SERIAL_TEST_MUTEX: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
#[cfg(test)]
pub(crate) fn run_serial_test<T>(f: impl FnOnce() -> T) -> T {
let _guard = SERIAL_TEST_MUTEX
.lock()
.expect("memvid-core serial test mutex poisoned");
f()
}
impl Memvid {
#[cfg(feature = "lex")]
fn tantivy_index_pending(&self) -> bool {
self.tantivy_dirty
}
#[cfg(not(feature = "lex"))]
fn tantivy_index_pending(&self) -> bool {
false
}
#[cfg(feature = "lex")]
fn flush_tantivy_conditional(&mut self, embed_snapshot: bool) -> Result<()> {
if !self.tantivy_dirty {
return Ok(());
}
if let Some(engine) = self.tantivy.as_mut() {
engine.commit()?;
if embed_snapshot {
let snapshot = engine.snapshot_segments()?;
self.update_embedded_lex_snapshot(snapshot)?;
}
}
self.tantivy_dirty = false;
Ok(())
}
#[cfg(feature = "lex")]
fn flush_tantivy(&mut self) -> Result<()> {
self.flush_tantivy_conditional(true)
}
#[cfg(feature = "lex")]
#[allow(dead_code)]
fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
self.flush_tantivy_conditional(false)
}
#[cfg(not(feature = "lex"))]
fn flush_tantivy(&mut self) -> Result<()> {
Ok(())
}
#[cfg(not(feature = "lex"))]
#[allow(dead_code)]
fn flush_tantivy_skip_embed(&mut self) -> Result<()> {
Ok(())
}
#[must_use]
pub fn path(&self) -> &Path {
&self.path
}
#[must_use]
pub fn lock_handle(&self) -> &FileLock {
&self.lock
}
#[must_use]
pub fn is_read_only(&self) -> bool {
self.read_only
}
pub(crate) fn ensure_writable(&mut self) -> Result<()> {
if self.read_only {
self.lock.upgrade_to_exclusive()?;
self.read_only = false;
}
Ok(())
}
pub fn downgrade_to_shared(&mut self) -> Result<()> {
if self.read_only {
return Ok(());
}
if self.dirty || self.tantivy_index_pending() {
return Ok(());
}
self.lock.downgrade_to_shared()?;
self.read_only = true;
Ok(())
}
}
impl Drop for Memvid {
fn drop(&mut self) {
if self.dirty {
let _ = self.commit();
}
#[cfg(feature = "parallel_segments")]
{
use crate::memvid::lifecycle::cleanup_manifest_wal_public;
cleanup_manifest_wal_public(self.path());
}
}
}
pub(crate) fn persist_header(file: &mut File, header: &Header) -> Result<()> {
HeaderCodec::write(file, header)
}
fn wal_config() -> impl Config {
config::standard()
.with_fixed_int_encoding()
.with_little_endian()
}
pub(crate) fn decode_canonical_bytes(
payload: &[u8],
encoding: CanonicalEncoding,
frame_id: FrameId,
) -> Result<Vec<u8>> {
match encoding {
CanonicalEncoding::Plain => Ok(payload.to_vec()),
CanonicalEncoding::Zstd => {
zstd::decode_all(Cursor::new(payload)).map_err(|_| MemvidError::InvalidFrame {
frame_id,
reason: "failed to decode canonical payload",
})
}
}
}
pub(crate) fn default_uri(frame_id: FrameId) -> String {
format!("mv2://frames/{frame_id}")
}
pub(crate) fn infer_title_from_uri(uri: &str) -> Option<String> {
let trimmed = uri.trim();
if trimmed.is_empty() {
return None;
}
let without_scheme = trimmed.split_once("://").map_or(trimmed, |x| x.1);
let without_fragment = without_scheme.split('#').next().unwrap_or(without_scheme);
let without_query = without_fragment
.split('?')
.next()
.unwrap_or(without_fragment);
let segment = without_query
.trim_end_matches('/')
.rsplit('/')
.next()
.map(str::trim)?;
if segment.is_empty() {
return None;
}
let stem = segment.rsplit_once('.').map_or(segment, |x| x.0).trim();
if stem.is_empty() {
return None;
}
let words: Vec<String> = stem
.split(['-', '_', ' '])
.filter(|part| !part.is_empty())
.map(|part| {
let mut chars = part.chars();
match chars.next() {
Some(first) => {
let first = first.to_ascii_uppercase();
let rest: String = chars.map(|c| c.to_ascii_lowercase()).collect();
if rest.is_empty() {
first.to_string()
} else {
format!("{first}{rest}")
}
}
None => String::new(),
}
})
.filter(|word| !word.is_empty())
.collect();
if words.is_empty() {
None
} else {
Some(words.join(" "))
}
}
fn truncate_preview(text: &str) -> String {
text.chars().take(TIMELINE_PREVIEW_BYTES).collect()
}
fn image_preview_from_metadata(meta: &DocMetadata) -> Option<String> {
let mime = meta.mime.as_deref()?;
if !mime.starts_with("image/") {
return None;
}
if let Some(caption) = meta.caption.as_ref() {
let trimmed = caption.trim();
if !trimmed.is_empty() {
return Some(truncate_preview(trimmed));
}
}
let mut segments: Vec<String> = Vec::new();
if let (Some(w), Some(h)) = (meta.width, meta.height) {
segments.push(format!("{w}×{h} px"));
}
if let Some(exif) = meta.exif.as_ref() {
if let Some(model) = exif
.model
.as_ref()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
{
segments.push(model.to_string());
} else if let Some(make) = exif
.make
.as_ref()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
{
segments.push(make.to_string());
}
if let Some(datetime) = exif
.datetime
.as_ref()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
{
segments.push(datetime.to_string());
}
}
if segments.is_empty() {
return Some("Image frame".to_string());
}
Some(truncate_preview(&segments.join(" · ")))
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Read;
use std::num::NonZeroU64;
use tempfile::tempdir;
#[test]
fn create_put_commit_reopen() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("memory.mv2");
let mut mem = Memvid::create(&path).expect("create");
let seq = mem.put_bytes(b"hello").expect("put");
assert_eq!(seq, 1);
mem.commit().expect("commit");
drop(mem);
let mut reopened = Memvid::open(&path).expect("open");
let stats = reopened.stats().expect("stats");
assert_eq!(stats.frame_count, 1);
assert!(stats.has_time_index);
let timeline = reopened
.timeline(TimelineQuery::default())
.expect("timeline");
assert_eq!(timeline.len(), 1);
assert!(timeline[0].preview.contains("hello"));
let wal_stats = reopened.wal.stats();
assert_eq!(wal_stats.pending_bytes, 0);
assert_eq!(wal_stats.sequence, 2);
});
}
#[test]
fn timeline_limit_and_reverse() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("timeline.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.put_bytes(b"alpha").expect("put alpha");
mem.put_bytes(b"beta").expect("put beta");
mem.commit().expect("commit");
drop(mem);
let mut reopened = Memvid::open(&path).expect("open");
let limited = reopened
.timeline(TimelineQuery {
limit: NonZeroU64::new(1),
since: None,
until: None,
reverse: false,
#[cfg(feature = "temporal_track")]
temporal: None,
})
.expect("timeline limit");
assert_eq!(limited.len(), 1);
assert!(limited[0].preview.contains("alpha"));
let reversed = reopened
.timeline(TimelineQuery {
limit: NonZeroU64::new(1),
since: None,
until: None,
reverse: true,
#[cfg(feature = "temporal_track")]
temporal: None,
})
.expect("timeline reverse");
assert_eq!(reversed.len(), 1);
assert!(reversed[0].preview.contains("beta"));
});
}
#[test]
fn lex_search_roundtrip() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("lex.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable");
let _seq1 = mem.put_bytes(b"Rust memory engine").expect("put");
let _seq2 = mem.put_bytes(b"Deterministic WAL").expect("put2");
mem.commit().expect("commit");
let request = SearchRequest {
query: "memory".to_string(),
top_k: 10,
snippet_chars: 200,
uri: None,
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
};
let response = mem.search(request).expect("search");
assert_eq!(response.hits.len(), 1);
drop(mem);
let mut reopened = Memvid::open(&path).expect("open");
let request = SearchRequest {
query: "wal".to_string(),
top_k: 10,
snippet_chars: 200,
uri: None,
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
};
let response = reopened.search(request).expect("search reopened");
assert_eq!(response.hits.len(), 1);
});
}
#[test]
fn vec_search_roundtrip() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("vec.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_vec().expect("enable");
mem.put_with_embedding(b"vector", vec![0.0, 1.0])
.expect("put");
mem.put_with_embedding(b"vector-two", vec![1.0, 0.0])
.expect("put2");
mem.commit().expect("commit");
let stats = mem.stats().expect("stats");
assert!(stats.has_vec_index, "vec index should exist after commit");
let hits = mem.search_vec(&[0.0, 1.0], 5).expect("search");
assert_eq!(hits.first().map(|hit| hit.frame_id), Some(0));
drop(mem);
let mut reopened = Memvid::open(&path).expect("open");
let reopened_stats = reopened.stats().expect("stats reopen");
assert!(
reopened_stats.has_vec_index,
"vec index should exist after reopen: has_manifest={}, vec_enabled={}",
reopened.toc.indexes.vec.is_some(),
reopened.vec_enabled
);
let hits = reopened.search_vec(&[1.0, 0.0], 5).expect("search reopen");
assert_eq!(hits.first().map(|hit| hit.frame_id), Some(1));
});
}
#[test]
fn search_snippet_ranges_match_bytes() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("search.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
let options = PutOptions::builder()
.uri("mv2://docs/pricing.md")
.title("Pricing")
.build();
let text = "Capacity tickets are signed grants that raise per-file caps.";
mem.put_bytes_with_options(text.as_bytes(), options)
.expect("put doc");
mem.commit().expect("commit");
let response = mem
.search(SearchRequest {
query: "capacity tickets".into(),
top_k: 5,
snippet_chars: 160,
uri: None,
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("search");
assert_eq!(response.total_hits, 1);
assert_eq!(response.engine, SearchEngineKind::Tantivy);
let hit = response.hits.first().expect("hit");
let frame = mem
.toc
.frames
.get(hit.frame_id as usize)
.cloned()
.expect("frame");
let canonical = mem.frame_content(&frame).expect("content");
let bytes = canonical.as_bytes();
let (start, end) = hit.range;
assert!(end <= bytes.len());
assert_eq!(hit.text.as_bytes(), &bytes[start..end]);
let chunk = hit.chunk_range.expect("chunk range");
assert!(chunk.0 <= start);
assert!(chunk.1 >= end);
let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
let chunk_slice = &canonical[chunk.0..chunk.1];
assert_eq!(chunk_text, chunk_slice);
});
}
#[test]
fn search_chunk_range_reflects_chunk_offset() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("chunked.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
let options = PutOptions::builder()
.uri("mv2://docs/manual.txt")
.title("Manual")
.build();
let prefix = "alpha beta gamma delta. ".repeat(200);
let content = format!(
"{}target segment appears here. Trailing context for verification.",
prefix
);
mem.put_bytes_with_options(content.as_bytes(), options)
.expect("put doc");
mem.commit().expect("commit");
let response = mem
.search(SearchRequest {
query: "target segment".into(),
top_k: 5,
snippet_chars: 160,
uri: None,
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("search");
let hit = response.hits.first().expect("hit");
assert_eq!(response.engine, SearchEngineKind::Tantivy);
let chunk_range = hit.chunk_range.expect("chunk range");
assert!(chunk_range.0 > 0);
assert!(hit.range.0 >= chunk_range.0);
assert!(hit.range.1 <= chunk_range.1);
assert!(hit.text.contains("target segment"));
let chunk_text = hit.chunk_text.as_ref().expect("chunk text");
assert_eq!(chunk_text, &content[chunk_range.0..chunk_range.1]);
});
}
#[test]
fn auto_tag_populates_frame_metadata() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("autotag.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
let options = PutOptions::builder()
.search_text("Neural networks planning session 2024-10-08")
.auto_tag(true)
.extract_dates(true)
.build();
mem.put_bytes_with_options(b"agenda", options)
.expect("put bytes");
mem.commit().expect("commit");
let frame = mem.toc.frames.first().expect("frame present");
assert!(!frame.tags.is_empty());
assert!(frame.content_dates.iter().any(|date| date.contains("2024")));
});
}
#[test]
fn search_filters_by_uri_and_scope() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("filters.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
let options_a = PutOptions::builder()
.uri("mv2://docs/pricing.md")
.title("Pricing")
.build();
mem.put_bytes_with_options(b"Capacity tickets add per-file allowances", options_a)
.expect("put a");
let options_b = PutOptions::builder()
.uri("mv2://docs/faq.md")
.title("FAQ")
.build();
mem.put_bytes_with_options(b"Tickets can be issued by admins", options_b)
.expect("put b");
let options_c = PutOptions::builder()
.uri("mv2://blog/launch.md")
.title("Launch")
.build();
mem.put_bytes_with_options(b"Launch day tickets boost visibility", options_c)
.expect("put c");
mem.commit().expect("commit");
let uri_response = mem
.search(SearchRequest {
query: "tickets".into(),
top_k: 10,
snippet_chars: 120,
uri: Some("mv2://docs/pricing.md".into()),
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("uri search");
assert_eq!(uri_response.engine, SearchEngineKind::Tantivy);
assert!(
uri_response
.hits
.iter()
.all(|hit| hit.uri == "mv2://docs/pricing.md")
);
let scope_response = mem
.search(SearchRequest {
query: "tickets".into(),
top_k: 10,
snippet_chars: 120,
uri: None,
scope: Some("mv2://docs/".into()),
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("scope search");
assert_eq!(scope_response.engine, SearchEngineKind::Tantivy);
assert!(
scope_response
.hits
.iter()
.all(|hit| hit.uri.starts_with("mv2://docs/"))
);
});
}
#[test]
fn search_pagination_and_params() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("paging.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
for (idx, text) in [
"tickets unlock tier upgrades",
"tickets expire after 30 days",
"tickets may be revoked",
]
.iter()
.enumerate()
{
let uri = format!("mv2://docs/doc{idx}.md");
let options = PutOptions::builder()
.uri(&uri)
.title(format!("Doc {idx}"))
.build();
mem.put_bytes_with_options(text.as_bytes(), options)
.expect("put doc");
}
mem.commit().expect("commit");
let first_page = mem
.search(SearchRequest {
query: "tickets".into(),
top_k: 1,
snippet_chars: 90,
uri: None,
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("page one");
assert_eq!(first_page.engine, SearchEngineKind::Tantivy);
assert_eq!(first_page.hits.len(), 1);
assert_eq!(first_page.params.top_k, 1);
assert_eq!(first_page.params.snippet_chars, 90);
assert!(first_page.total_hits >= first_page.hits.len());
let cursor = first_page.next_cursor.clone().expect("cursor");
let first_id = first_page.hits[0].frame_id;
let second_page = mem
.search(SearchRequest {
query: "tickets".into(),
top_k: 1,
snippet_chars: 90,
uri: None,
scope: None,
cursor: Some(cursor),
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("page two");
assert_eq!(second_page.engine, SearchEngineKind::Tantivy);
assert_eq!(second_page.hits.len(), 1);
assert_ne!(second_page.hits[0].frame_id, first_id);
assert_eq!(second_page.total_hits, first_page.total_hits);
});
}
#[cfg(feature = "lex")]
#[test]
fn search_falls_back_when_tantivy_missing() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("fallback.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
mem.put_bytes(b"tickets fallback test").expect("put");
mem.commit().expect("commit");
assert!(
mem.tantivy.is_some(),
"Tantivy should be initialized after commit"
);
let response = mem
.search(SearchRequest {
query: "tickets".into(),
top_k: 5,
snippet_chars: 120,
uri: None,
scope: None,
cursor: None,
#[cfg(feature = "temporal_track")]
temporal: None,
as_of_frame: None,
as_of_ts: None,
no_sketch: false,
acl_context: None,
acl_enforcement_mode: crate::types::AclEnforcementMode::Audit,
})
.expect("search with tantivy");
assert_eq!(response.engine, SearchEngineKind::Tantivy);
assert!(!response.hits.is_empty());
});
}
#[test]
fn verify_reports_success() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("verify.mv2");
{
let mut mem = Memvid::create(&path).expect("create");
mem.enable_lex().expect("enable lex");
mem.enable_vec().expect("enable vec");
mem.put_with_embedding(b"check", vec![0.5, 0.1])
.expect("put");
mem.commit().expect("commit");
}
let report = Memvid::verify(&path, true).expect("verify");
assert_eq!(report.overall_status, VerificationStatus::Passed);
});
}
#[test]
fn test_create_enables_indexes_by_default() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("default_indexes.mv2");
let mem = Memvid::create(&path).expect("create");
let stats = mem.stats().expect("stats");
println!(
"After create (before drop): lex={}, vec={}",
stats.has_lex_index, stats.has_vec_index
);
drop(mem);
let reopened = Memvid::open(&path).expect("reopen");
let stats2 = reopened.stats().expect("stats after reopen");
println!(
"After reopen: lex={}, vec={}",
stats2.has_lex_index, stats2.has_vec_index
);
#[cfg(feature = "lex")]
assert!(
stats2.has_lex_index,
"lex index should be enabled by default"
);
#[cfg(feature = "vec")]
assert!(
stats2.has_vec_index,
"vec index should be enabled by default"
);
});
}
#[test]
fn doctor_rebuilds_time_index() {
use std::fs::OpenOptions;
use std::io::{Seek, SeekFrom, Write};
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("doctor.mv2");
let manifest = {
let mut mem = Memvid::create(&path).expect("create");
mem.put_bytes(b"repair").expect("put");
mem.commit().expect("commit");
mem.rebuild_indexes(&[], &[]).expect("rebuild");
mem.commit().expect("commit after rebuild");
println!(
"test: post-commit header footer_offset={}",
mem.header.footer_offset
);
println!(
"test: post-commit manifest offset={} length={}",
mem.toc
.time_index
.as_ref()
.map(|m| m.bytes_offset)
.unwrap_or(0),
mem.toc
.time_index
.as_ref()
.map(|m| m.bytes_length)
.unwrap_or(0)
);
mem.toc.time_index.clone().expect("time index manifest")
};
{
let mut file = OpenOptions::new()
.read(true)
.write(true)
.open(&path)
.expect("open file");
file.seek(SeekFrom::Start(manifest.bytes_offset))
.expect("seek");
let zeros = vec![0u8; usize::try_from(manifest.bytes_length).unwrap_or(0)];
file.write_all(&zeros).expect("corrupt time index");
file.flush().expect("flush");
file.sync_all().expect("sync");
}
println!(
"test: footer scan: {:?}",
crate::footer::find_last_valid_footer(&std::fs::read(&path).expect("read file"))
.as_ref()
.map(|s| (s.footer_offset, s.toc_offset, s.footer.toc_len))
);
println!("test: verifying corrupted memory");
match Memvid::verify(&path, false) {
Ok(report) => {
assert_eq!(report.overall_status, VerificationStatus::Failed);
}
Err(e) => {
println!("test: verify failed with error (expected): {e}");
}
}
println!("test: running doctor");
let report = Memvid::doctor(
&path,
DoctorOptions {
rebuild_time_index: true,
rebuild_lex_index: false,
..DoctorOptions::default()
},
)
.expect("doctor");
println!("test: doctor completed with status: {:?}", report.status);
println!("test: verifying repaired memory");
let reopened = Memvid::open(&path).expect("reopen after doctor");
assert!(
reopened.toc.time_index.is_some(),
"time index should exist after doctor"
);
});
}
#[test]
fn blob_reader_roundtrip_with_media_manifest() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("blob.mv2");
let payload = vec![0u8, 159, 1, 128, 42, 99, 200];
let manifest = MediaManifest {
kind: "video".to_string(),
mime: "video/mp4".to_string(),
bytes: payload.len() as u64,
filename: Some("clip.mp4".to_string()),
duration_ms: Some(1234),
width: Some(1920),
height: Some(1080),
codec: Some("h264".to_string()),
};
let mut doc_meta = DocMetadata::default();
doc_meta.media = Some(manifest.clone());
doc_meta.mime = Some("video/mp4".to_string());
doc_meta.bytes = Some(payload.len() as u64);
assert!(
!doc_meta.is_empty(),
"media manifest must count as metadata"
);
let options = PutOptions::builder()
.metadata(doc_meta)
.kind("video")
.uri("mv2://video/clip.mp4")
.build();
{
let mut mem = Memvid::create(&path).expect("create");
mem.put_bytes_with_options(&payload, options)
.expect("put bytes");
mem.commit().expect("commit");
}
let mut reopened = Memvid::open(&path).expect("open");
let mut reader = reopened
.blob_reader_by_uri("mv2://video/clip.mp4")
.expect("blob reader");
let mut buffered = Vec::new();
reader.read_to_end(&mut buffered).expect("read payload");
assert_eq!(buffered, payload);
let roundtrip = reopened
.media_manifest_by_uri("mv2://video/clip.mp4")
.expect("manifest lookup")
.expect("manifest present");
assert_eq!(roundtrip.mime, "video/mp4");
assert_eq!(roundtrip.kind, "video");
assert_eq!(roundtrip.bytes, payload.len() as u64);
assert_eq!(roundtrip.filename.as_deref(), Some("clip.mp4"));
assert_eq!(roundtrip.duration_ms, Some(1234));
assert_eq!(roundtrip.width, Some(1920));
assert_eq!(roundtrip.height, Some(1080));
assert_eq!(roundtrip.codec.as_deref(), Some("h264"));
drop(dir);
});
}
#[test]
fn video_frame_roundtrip_does_not_corrupt_toc() {
use crate::types::MediaManifest;
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("video.mv2");
let mut seed = 0xDEADBEEF_u64;
let mut video_bytes = vec![0u8; 1_600_000];
for byte in &mut video_bytes {
seed = seed ^ (seed << 7);
seed = seed ^ (seed >> 9);
seed = seed ^ (seed << 8);
*byte = (seed & 0xFF) as u8;
}
let hash_hex = blake3::hash(&video_bytes).to_hex().to_string();
let manifest = MediaManifest {
kind: "video".to_string(),
mime: "video/mp4".to_string(),
bytes: video_bytes.len() as u64,
filename: Some("clip.mp4".to_string()),
duration_ms: Some(1_000),
width: Some(1920),
height: Some(1080),
codec: Some("h264".to_string()),
};
let mut meta = DocMetadata::default();
meta.mime = Some("video/mp4".to_string());
meta.bytes = Some(video_bytes.len() as u64);
meta.hash = Some(hash_hex);
meta.caption = Some("Test clip".to_string());
meta.media = Some(manifest);
let options = PutOptions::builder()
.kind("video")
.metadata(meta)
.tag("kind", "video")
.uri("mv2://video/test.mp4")
.title("Test clip")
.build();
{
let mut mem = Memvid::create(&path).expect("create");
mem.put_bytes_with_options(&video_bytes, options)
.expect("put video");
mem.commit().expect("commit");
}
let reopened = Memvid::open(&path).expect("reopen");
let stats = reopened.stats().expect("stats");
assert_eq!(stats.frame_count, 1);
});
}
#[test]
#[allow(deprecated)]
fn ticket_sequence_enforced() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("ticket.mv2");
let mut mem = Memvid::create(&path).expect("create");
mem.apply_ticket(Ticket::new("issuer", 2))
.expect("apply first");
let err = mem
.apply_ticket(Ticket::new("issuer", 2))
.expect_err("sequence must increase");
assert!(matches!(err, MemvidError::TicketSequence { .. }));
});
}
#[test]
#[allow(deprecated)]
fn capacity_limit_enforced() {
run_serial_test(|| {
let dir = tempdir().expect("tmp");
let path = dir.path().join("capacity.mv2");
let mut mem = Memvid::create(&path).expect("create");
let base = mem.data_end;
mem.apply_ticket(Ticket::new("issuer", 2).capacity_bytes(base + 64))
.expect("apply ticket");
mem.put_bytes(&vec![0xFF; 32]).expect("first put");
mem.commit().expect("commit");
let err = mem.put_bytes(&[0xFF; 40]).expect_err("capacity exceeded");
assert!(matches!(err, MemvidError::CapacityExceeded { .. }));
});
}
}