1use std::fmt;
18use std::fs;
19use std::io::{self, Read};
20use std::path::{Path, PathBuf};
21
22use skeg_vector::DiskVamanaIndex;
23
/// Summary of one vector index listed in a shard's `vindexes.registry`.
#[derive(Clone, Debug)]
pub struct VindexEntry {
    /// Index name as recorded in the registry.
    pub name: String,
    /// Dimension value recorded in the registry for this index.
    pub dim: usize,
    /// Vector count reported by the opened index, or `None` when the index
    /// could not be opened.
    pub n_vectors: Option<usize>,
    /// Size in bytes of the index's `graph.vmn` file (0 when its metadata is
    /// unavailable).
    pub graph_bytes: u64,
    /// Size in bytes of the index's `vectors.bin` file (0 when its metadata is
    /// unavailable).
    pub vectors_bytes: u64,
}
37
38#[derive(Debug, Clone)]
40pub struct ShardReport {
41 pub shard_id: usize,
42 pub vindexes: Vec<VindexEntry>,
43 pub kv_bytes: u64,
47}
48
49#[derive(Debug, Clone)]
51pub struct InspectReport {
52 pub data_dir: PathBuf,
53 pub shards: Vec<ShardReport>,
54}
55
56pub fn inspect(data_dir: &Path) -> io::Result<InspectReport> {
65 let mut shards = Vec::new();
66 if !data_dir.exists() {
67 return Err(io::Error::new(
68 io::ErrorKind::NotFound,
69 format!("data directory not found: {}", data_dir.display()),
70 ));
71 }
72 for entry in fs::read_dir(data_dir)? {
73 let entry = entry?;
74 let name = entry.file_name();
75 let name = name.to_string_lossy();
76 let Some(rest) = name.strip_prefix("shard-") else {
77 continue;
78 };
79 let Ok(shard_id) = rest.parse::<usize>() else {
80 continue;
81 };
82 shards.push(inspect_shard(shard_id, &entry.path())?);
83 }
84 shards.sort_by_key(|s| s.shard_id);
85 Ok(InspectReport {
86 data_dir: data_dir.to_path_buf(),
87 shards,
88 })
89}
90
91fn inspect_shard(shard_id: usize, shard_dir: &Path) -> io::Result<ShardReport> {
92 let entries = read_registry(&shard_dir.join("vindexes.registry"))?;
93 let mut vindexes = Vec::with_capacity(entries.len());
94 for (name, dim) in entries {
95 let vindex_dir = shard_dir.join(format!("vindex-{name}"));
96 let graph_bytes = fs::metadata(vindex_dir.join("graph.vmn"))
97 .map(|m| m.len())
98 .unwrap_or(0);
99 let vectors_bytes = fs::metadata(vindex_dir.join("vectors.bin"))
100 .map(|m| m.len())
101 .unwrap_or(0);
102 let n_vectors = DiskVamanaIndex::open(&vindex_dir).ok().map(|i| i.len());
103 vindexes.push(VindexEntry {
104 name,
105 dim,
106 n_vectors,
107 graph_bytes,
108 vectors_bytes,
109 });
110 }
111 let kv_bytes = kv_size_in_shard(shard_dir)?;
112 Ok(ShardReport {
113 shard_id,
114 vindexes,
115 kv_bytes,
116 })
117}
118
/// Sums the sizes of the regular files that sit directly inside `shard_dir`.
///
/// Subdirectories (and anything else that is not a regular file) are ignored,
/// as is the `vindexes.registry` file.
///
/// # Errors
///
/// Propagates any I/O error from listing the directory or reading an entry's
/// file type or metadata.
fn kv_size_in_shard(shard_dir: &Path) -> io::Result<u64> {
    let mut sum: u64 = 0;
    for item in fs::read_dir(shard_dir)? {
        let item = item?;
        if !item.file_type()?.is_file() {
            continue;
        }
        if item.file_name() == "vindexes.registry" {
            continue;
        }
        sum += item.metadata()?.len();
    }
    Ok(sum)
}
136
/// Parses the binary registry file at `path` into `(name, dim)` pairs.
///
/// Layout as decoded here (all integers little-endian): a `u32` entry count,
/// then for each entry a `u16` name length, that many name bytes (decoded
/// lossily as UTF-8), and a `u32` dim.
///
/// Parsing is best-effort: a missing file or a file shorter than the 4-byte
/// header yields an empty list, and a file that ends mid-entry yields the
/// entries that were fully decoded before the truncation.
///
/// # Errors
///
/// Propagates I/O errors other than "not found" from opening or reading the
/// file.
fn read_registry(path: &Path) -> io::Result<Vec<(String, usize)>> {
    // Size of the leading entry-count field.
    const HEADER_LEN: usize = 4;
    // Smallest possible entry: u16 name length + empty name + u32 dim.
    const MIN_ENTRY_LEN: usize = 2 + 4;

    let mut file = match fs::File::open(path) {
        Ok(f) => f,
        Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(Vec::new()),
        Err(e) => return Err(e),
    };
    let mut buf = Vec::new();
    file.read_to_end(&mut buf)?;
    if buf.len() < HEADER_LEN {
        return Ok(Vec::new());
    }

    let declared = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]) as usize;
    // The declared count comes straight from disk and may be corrupt. Clamp it
    // to the number of minimum-size entries the file could actually hold so a
    // bogus header cannot trigger a multi-gigabyte preallocation.
    let count = declared.min((buf.len() - HEADER_LEN) / MIN_ENTRY_LEN);

    let mut out = Vec::with_capacity(count);
    let mut i = HEADER_LEN;
    for _ in 0..count {
        if i + 2 > buf.len() {
            break;
        }
        let name_len = u16::from_le_bytes([buf[i], buf[i + 1]]) as usize;
        i += 2;
        // Need the whole name plus the trailing u32 dim; otherwise the file is
        // truncated and we stop with what we have.
        if i + name_len + 4 > buf.len() {
            break;
        }
        let name = String::from_utf8_lossy(&buf[i..i + name_len]).into_owned();
        i += name_len;
        let dim = u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]) as usize;
        i += 4;
        out.push((name, dim));
    }
    Ok(out)
}
170
171impl fmt::Display for InspectReport {
172 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
173 writeln!(f, "data_dir: {}", self.data_dir.display())?;
174 writeln!(f, "shards: {}", self.shards.len())?;
175 if self.shards.is_empty() {
176 writeln!(f, "(no shard-* subdirectories found)")?;
177 return Ok(());
178 }
179 let total_kv: u64 = self.shards.iter().map(|s| s.kv_bytes).sum();
180 let total_vindexes: usize = self.shards.iter().map(|s| s.vindexes.len()).sum();
181 writeln!(f, "kv_bytes: {}", human_bytes(total_kv))?;
182 writeln!(f, "vindexes: {total_vindexes}")?;
183 writeln!(f)?;
184 for shard in &self.shards {
185 writeln!(f, "[shard-{}]", shard.shard_id)?;
186 writeln!(f, " kv_bytes: {}", human_bytes(shard.kv_bytes))?;
187 if shard.vindexes.is_empty() {
188 writeln!(f, " (no vindexes)")?;
189 continue;
190 }
191 for v in &shard.vindexes {
192 let n = v
193 .n_vectors
194 .map_or_else(|| "?".to_string(), |n| n.to_string());
195 writeln!(
196 f,
197 " vindex={} dim={} n={} graph={} vectors={}",
198 v.name,
199 v.dim,
200 n,
201 human_bytes(v.graph_bytes),
202 human_bytes(v.vectors_bytes),
203 )?;
204 }
205 }
206 Ok(())
207 }
208}
209
/// Formats a byte count using 1024-based units.
///
/// Values of at least 1 KiB are shown with two decimals in the largest unit
/// (KiB, MiB or GiB) that does not exceed them; smaller values are shown as an
/// exact integer number of bytes.
// The u64 -> f64 cast is only used to pick a unit and print two decimals.
#[allow(clippy::cast_precision_loss)]
fn human_bytes(n: u64) -> String {
    // Largest unit first so the first match wins.
    const UNITS: [(f64, &str); 3] = [
        (1024.0 * 1024.0 * 1024.0, "GiB"),
        (1024.0 * 1024.0, "MiB"),
        (1024.0, "KiB"),
    ];
    let bytes = n as f64;
    for (scale, unit) in UNITS {
        if bytes >= scale {
            return format!("{:.2} {}", bytes / scale, unit);
        }
    }
    format!("{n} B")
}
226
#[cfg(test)]
mod tests {
    use super::*;
    use crate::build_index_from;
    use skeg_vector::VamanaConfig;
    use tempfile::TempDir;

    /// Deterministic test vector of length `dim` derived from `seed` with a
    /// small xorshift-style generator; each component is computed as
    /// `(low 16 bits / 32768) - 1`.
    #[allow(clippy::cast_precision_loss)]
    fn tvec(seed: u64, dim: usize) -> Vec<f32> {
        let mut state = (seed << 1) | 1;
        let mut values = Vec::with_capacity(dim);
        for _ in 0..dim {
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            values.push(((state & 0xFFFF) as f32 / 32768.0) - 1.0);
        }
        values
    }

    /// Building a real index and inspecting its output directory reports the
    /// single shard, the index name, its dimension and vector count.
    #[test]
    fn inspect_of_a_built_index_reports_vindex_and_dim() {
        let n: usize = 32;
        let dim: usize = 8;
        let mut flat: Vec<f32> = Vec::with_capacity(n * dim);
        for i in 0..n {
            flat.extend(tvec(i as u64 + 1, dim));
        }
        let out = TempDir::new().unwrap();
        build_index_from(flat, n, dim, out.path(), "docs", &VamanaConfig::default()).unwrap();

        let report = inspect(out.path()).unwrap();
        assert_eq!(report.shards.len(), 1);

        let shard = &report.shards[0];
        assert_eq!(shard.shard_id, 0);
        assert_eq!(shard.vindexes.len(), 1);

        let vindex = &shard.vindexes[0];
        assert_eq!(vindex.name, "docs");
        assert_eq!(vindex.dim, dim);
        assert_eq!(vindex.n_vectors, Some(n));
        assert!(vindex.graph_bytes > 0);
        assert!(vindex.vectors_bytes > 0);
    }

    /// A directory with no `shard-*` entries yields a report with no shards.
    #[test]
    fn inspect_of_an_empty_dir_returns_empty_report() {
        let dir = TempDir::new().unwrap();
        let report = inspect(dir.path()).unwrap();
        assert!(report.shards.is_empty());
    }

    /// Inspecting a path that does not exist is reported as an error.
    #[test]
    fn inspect_of_a_missing_dir_is_an_error() {
        let dir = TempDir::new().unwrap();
        let missing = dir.path().join("not-here");
        assert!(inspect(&missing).is_err());
    }

    /// A hand-built registry with one entry is decoded back to that entry.
    #[test]
    fn registry_with_one_entry_roundtrips() {
        let dir = TempDir::new().unwrap();
        let shard = dir.path().join("shard-0");
        fs::create_dir_all(&shard).unwrap();

        // count = 1, then name_len = 4, name = "docs", dim = 8.
        let bytes = [
            &1u32.to_le_bytes()[..],
            &4u16.to_le_bytes()[..],
            &b"docs"[..],
            &8u32.to_le_bytes()[..],
        ]
        .concat();
        let registry_path = shard.join("vindexes.registry");
        fs::write(&registry_path, &bytes).unwrap();

        let entries = read_registry(&registry_path).unwrap();
        assert_eq!(entries, vec![("docs".to_string(), 8)]);
    }
}