use std::collections::HashMap;
use std::io::{Read, Seek, SeekFrom};
use std::panic::{AssertUnwindSafe, catch_unwind};
use crate::lex::{LexIndex, LexIndexArtifact, LexIndexBuilder};
use crate::memvid::lifecycle::Memvid;
use crate::types::{Frame, FrameId, FrameStatus, VectorCompression};
use crate::{MemvidError, Result, VecIndex, VecIndexArtifact};
impl Memvid {
#[allow(dead_code)]
pub(crate) fn build_lex_artifact(&mut self) -> Result<Option<(LexIndexArtifact, LexIndex)>> {
if !self.lex_enabled {
return Ok(None);
}
let mut builder = LexIndexBuilder::new();
let empty_tags = HashMap::new();
let frames: Vec<Frame> = self
.toc
.frames
.iter()
.filter(|frame| frame.status == FrameStatus::Active)
.cloned()
.collect();
for frame in frames {
let content = self.frame_content(&frame)?;
let uri = frame
.uri
.clone()
.unwrap_or_else(|| crate::default_uri(frame.id));
builder.add_document(
frame.id,
&uri,
frame.title.as_deref(),
&content,
&empty_tags,
);
}
let artifact = builder.finish()?;
let index = LexIndex::decode(&artifact.bytes)?;
Ok(Some((artifact, index)))
}
pub(crate) fn build_vec_artifact(
&mut self,
new_docs: &[(FrameId, Vec<f32>)],
) -> Result<Option<(VecIndexArtifact, VecIndex)>> {
if !self.vec_enabled {
return Ok(None);
}
let mut builder = VecIndexBuilder::new();
if let Some(index) = self.vec_index.as_ref() {
for (frame_id, embedding) in index.entries() {
if self.frame_is_active(frame_id) {
builder.add_document(frame_id, embedding.to_vec());
}
}
}
for (frame_id, embedding) in new_docs {
builder.add_document(*frame_id, embedding.clone());
}
let artifact = builder.finish()?;
let index = VecIndex::decode(&artifact.bytes)?;
Ok(Some((artifact, index)))
}
pub(crate) fn ensure_lex_index(&mut self) -> Result<()> {
if self.lex_index.is_some() {
return Ok(());
}
self.load_lex_index_from_manifest()
}
pub(crate) fn ensure_vec_index(&mut self) -> Result<()> {
if self.vec_index.is_some() {
return Ok(());
}
self.load_vec_index_from_manifest()?;
if self.vec_index.is_none()
&& self.vec_enabled
&& !self.toc.segment_catalog.vec_segments.is_empty()
{
self.build_vec_index_from_segments()?;
}
Ok(())
}
pub(crate) fn load_lex_index_from_manifest(&mut self) -> Result<()> {
if let Some(manifest) = &self.toc.indexes.lex {
if manifest.bytes_length == 0 {
self.lex_index = None;
return Ok(());
}
let bytes =
if let Ok(bytes) = self.read_range(manifest.bytes_offset, manifest.bytes_length) {
bytes
} else {
self.lex_index = None;
return Ok(());
};
match LexIndex::decode(&bytes) {
Ok(mut index) => {
self.hydrate_lex_index_metadata(&mut index);
self.lex_index = Some(index);
}
Err(_) => {
self.lex_index = None;
}
}
} else {
self.lex_index = None;
}
Ok(())
}
pub(crate) fn load_vec_index_from_manifest(&mut self) -> Result<()> {
self.vec_model = self.toc.indexes.vec.as_ref().and_then(|m| m.model.clone());
if let Some(manifest) = &self.toc.indexes.vec {
if manifest.bytes_length == 0 {
self.vec_index = None;
return Ok(());
}
let bytes =
if let Ok(bytes) = self.read_range(manifest.bytes_offset, manifest.bytes_length) {
bytes
} else {
self.vec_index = None;
return Ok(());
};
match catch_unwind(AssertUnwindSafe(|| VecIndex::decode(&bytes))) {
Ok(Ok(index)) => self.vec_index = Some(index),
Ok(Err(_)) | Err(_) => {
self.vec_index = None;
}
}
} else {
self.vec_index = None;
}
Ok(())
}
pub(crate) fn load_clip_index_from_manifest(&mut self) -> Result<()> {
use crate::clip::ClipIndex;
if let Some(manifest) = &self.toc.indexes.clip {
if manifest.bytes_length == 0 {
self.clip_index = None;
return Ok(());
}
let bytes =
if let Ok(bytes) = self.read_range(manifest.bytes_offset, manifest.bytes_length) {
bytes
} else {
self.clip_index = None;
return Ok(());
};
match catch_unwind(AssertUnwindSafe(|| ClipIndex::decode(&bytes))) {
Ok(Ok(index)) => self.clip_index = Some(index),
Ok(Err(_)) | Err(_) => {
self.clip_index = None;
}
}
} else {
self.clip_index = None;
}
Ok(())
}
pub fn read_range(&mut self, offset: u64, length: u64) -> Result<Vec<u8>> {
let file_len = self.file.metadata()?.len();
let end = offset.checked_add(length).ok_or(MemvidError::InvalidToc {
reason: "manifest range overflow".into(),
})?;
if end > file_len || length > crate::MAX_INDEX_BYTES {
return Err(MemvidError::InvalidToc {
reason: "manifest range invalid".into(),
});
}
self.file.seek(SeekFrom::Start(offset))?;
#[allow(clippy::cast_possible_truncation)]
let mut buf = vec![0u8; length as usize];
self.file.read_exact(&mut buf)?;
Ok(buf)
}
#[allow(dead_code)]
fn disable_lex(&mut self) {
self.lex_index = None;
self.lex_enabled = false;
self.toc.indexes.lex = None;
}
fn build_vec_index_from_segments(&mut self) -> Result<()> {
use crate::vec::VecIndexBuilder;
let mut builder = VecIndexBuilder::new();
let segments = self.toc.segment_catalog.vec_segments.clone();
for segment_desc in &segments {
let bytes = match self.read_range(
segment_desc.common.bytes_offset,
segment_desc.common.bytes_length,
) {
Ok(bytes) => bytes,
Err(err) => {
tracing::warn!(
error = %err,
segment_id = segment_desc.common.segment_id,
"failed to load vec segment, skipping"
);
continue;
}
};
let compression_hint = segment_desc.vector_compression.clone();
tracing::debug!(
segment_id = segment_desc.common.segment_id,
compression_hint = ?compression_hint,
bytes_len = bytes.len(),
"attempting to decode vec segment"
);
match VecIndex::decode_with_compression(&bytes, compression_hint) {
Ok(segment_index) => {
for (frame_id, embedding) in segment_index.entries() {
if self.frame_is_active(frame_id) {
builder.add_document(frame_id, embedding.to_vec());
}
}
}
Err(err) => {
tracing::warn!(
error = %err,
segment_id = segment_desc.common.segment_id,
"failed to decode vec segment, skipping"
);
}
}
}
let artifact = builder.finish()?;
if artifact.vector_count > 0 {
let index =
VecIndex::decode_with_compression(&artifact.bytes, VectorCompression::None)?;
self.vec_index = Some(index);
}
Ok(())
}
fn hydrate_lex_index_metadata(&self, index: &mut LexIndex) {
for document in index.documents_mut() {
let frame_idx = usize::try_from(document.frame_id).ok();
let frame_meta = frame_idx.and_then(|idx| self.toc.frames.get(idx));
if document.uri.is_none() {
let derived = frame_meta
.and_then(|frame| frame.uri.clone())
.unwrap_or_else(|| crate::default_uri(document.frame_id));
document.uri = Some(derived);
}
if document.title.is_none() {
let title = frame_meta
.and_then(|frame| frame.title.clone())
.or_else(|| {
document
.uri
.as_ref()
.and_then(|uri| crate::infer_title_from_uri(uri))
});
document.title = title;
}
}
}
}
use crate::vec::VecIndexBuilder;