1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// Safe expect: guaranteed non-empty iterators after length check.
#![allow(clippy::expect_used)]
use std::collections::HashMap;
use crate::memvid::lifecycle::Memvid;
use crate::memvid::mutation::augment_search_text;
use crate::types::Frame;
use crate::types::{
EmbeddingIdentity, EmbeddingIdentityCount, EmbeddingIdentitySummary, FrameStatus,
};
use crate::{MemvidError, Result};
impl Memvid {
/// Returns the dimension recorded in the vector index manifest of the MV2 file.
///
/// Yields `None` when the TOC carries no vector index manifest, or when the
/// manifest records a dimension of `0` (treated as "not recorded").
/// Handy for auto-detecting which embedding model was used to create the file.
#[must_use]
pub fn vec_index_dimension(&self) -> Option<u32> {
    let vec_manifest = self.toc.indexes.vec.as_ref()?;
    let recorded = vec_manifest.dimension;
    // A zero dimension carries no information, so report it as absent.
    if recorded > 0 {
        Some(recorded)
    } else {
        None
    }
}
/// Resolves the vector dimension of this memory from every source the TOC offers.
///
/// Two sources are consulted:
/// - the monolithic vector index manifest (`toc.indexes.vec.dimension`)
/// - the segment-based vector storage (`toc.segment_catalog.vec_segments[*].dimension`)
///
/// A dimension of `0` is treated as "not recorded" in both places. When both
/// sources report a dimension they must agree, and that shared value is returned.
/// `Ok(None)` means neither source carries dimension information.
///
/// # Errors
///
/// Returns [`MemvidError::InvalidToc`] when two segments record different
/// non-zero dimensions, or when the manifest and the segment catalog disagree.
pub fn effective_vec_index_dimension(&self) -> Result<Option<u32>> {
    // Same rule as the manifest-only accessor: a zero dimension counts as absent.
    let manifest_dim = self.vec_index_dimension();

    // The first non-zero segment dimension becomes the reference; every later
    // non-zero dimension has to match it.
    let mut segment_dim: Option<u32> = None;
    for descriptor in &self.toc.segment_catalog.vec_segments {
        let current = descriptor.dimension;
        if current == 0 {
            continue;
        }
        if let Some(existing) = segment_dim {
            if existing != current {
                return Err(MemvidError::InvalidToc {
                    reason: format!(
                        "mixed vector dimensions detected in segment catalog: {} and {}",
                        existing, current
                    )
                    .into(),
                });
            }
        } else {
            segment_dim = Some(current);
        }
    }

    // Segment-level conflicts are reported first; only then is the manifest
    // cross-checked against the (now consistent) segment dimension.
    if let (Some(manifest), Some(segment)) = (manifest_dim, segment_dim) {
        if manifest != segment {
            return Err(MemvidError::InvalidToc {
                reason: format!(
                    "vector dimension mismatch between manifest ({manifest}) and segment catalog ({segment})"
                )
                .into(),
            });
        }
    }

    // Prefer the manifest value, fall back to the segments, else `None`.
    Ok(manifest_dim.or(segment_dim))
}
/// Summarizes which embedding identity (or identities) produced this memory.
///
/// The identity is read from per-frame `extra_metadata` keys:
/// - `memvid.embedding.provider`
/// - `memvid.embedding.model`
/// - `memvid.embedding.dimension` (optional)
/// - `memvid.embedding.normalized` (optional)
///
/// This metadata lives on each frame rather than in the TOC schema so that the
/// `.mv2` binary layout stays compatible.
///
/// Only frames whose status is `Active` are inspected, and at most `max_frames`
/// of them. An active frame without identity metadata still counts toward that
/// limit. The result is:
/// - `Unknown` when no inspected frame carries identity metadata
///   (this includes `max_frames == 0`)
/// - `Single` when exactly one distinct identity was seen
/// - `Mixed` when several were seen, with per-identity counts sorted from the
///   highest count to the lowest; identities with equal counts appear in
///   hash-map iteration order, which is unspecified
#[must_use]
pub fn embedding_identity_summary(&self, max_frames: usize) -> EmbeddingIdentitySummary {
    let mut tally: HashMap<EmbeddingIdentity, u64> = HashMap::new();

    // Filter first, then cap: the limit applies to active frames only.
    let sampled = self
        .toc
        .frames
        .iter()
        .filter(|frame| frame.status == FrameStatus::Active)
        .take(max_frames);
    for frame in sampled {
        if let Some(identity) = EmbeddingIdentity::from_extra_metadata(&frame.extra_metadata) {
            *tally.entry(identity).or_default() += 1;
        }
    }

    match tally.len() {
        0 => EmbeddingIdentitySummary::Unknown,
        1 => {
            // The length check above guarantees the iterator yields one key.
            let sole = tally
                .into_keys()
                .next()
                .expect("counts.len()==1 implies one entry");
            EmbeddingIdentitySummary::Single(sole)
        }
        _ => {
            let mut ranked: Vec<EmbeddingIdentityCount> = tally
                .into_iter()
                .map(|(identity, count)| EmbeddingIdentityCount { identity, count })
                .collect();
            // Stable sort, most frequent identity first.
            ranked.sort_by_key(|entry| std::cmp::Reverse(entry.count));
            EmbeddingIdentitySummary::Mixed(ranked)
        }
    }
}
/// Builds the placeholder text used in place of a binary payload.
///
/// `len` is the payload size in bytes. The result always has the form
/// `<binary payload: N bytes>`, including for `len == 0`, so no separate
/// zero-length case is needed.
pub(crate) fn render_binary_summary(len: usize) -> String {
    format!("<binary payload: {len} bytes>")
}
/// Forwards `base` and the descriptive fields of `frame` to
/// `augment_search_text` and returns whatever that helper produces.
///
/// The fields passed along are the frame's URI, title, track, tags, labels,
/// extra metadata, content dates and (optional) metadata. Optional string
/// fields are handed over as borrowed `&str` views.
///
/// The body never touches `self`; the `&mut self` receiver is kept so the
/// method's interface stays as callers expect.
pub(crate) fn augment_text_for_frame(
    &mut self,
    base: Option<String>,
    frame: &Frame,
) -> Option<String> {
    // Borrow the optional fields up front so the call below reads as a plain
    // argument list.
    let uri = frame.uri.as_deref();
    let title = frame.title.as_deref();
    let track = frame.track.as_deref();
    let metadata = frame.metadata.as_ref();

    augment_search_text(
        base,
        uri,
        title,
        track,
        &frame.tags,
        &frame.labels,
        &frame.extra_metadata,
        &frame.content_dates,
        metadata,
    )
}
/// Returns the text under which `frame` should be searchable.
///
/// When the frame already carries `search_text`, a copy of it is returned
/// as-is. Otherwise the frame's content is loaded through `frame_content`
/// and passed through `augment_text_for_frame`; if augmentation yields
/// nothing, the result is an empty string.
///
/// # Errors
///
/// Propagates any error returned by `frame_content`.
pub(crate) fn frame_search_text(&mut self, frame: &Frame) -> Result<String> {
    // Fast path: the frame already stores its search text.
    if let Some(stored) = &frame.search_text {
        return Ok(stored.clone());
    }

    let content = self.frame_content(frame)?;
    let augmented = self.augment_text_for_frame(Some(content), frame);
    Ok(augmented.unwrap_or_default())
}
}