Skip to main content

skeg_cli/
inspect.rs

1//! Offline inspection of a skeg data directory.
2//!
3//! Walks the on-disk layout produced by `skeg-server` (or `skeg-cli build`)
4//! and reports what is in it: the VINDEX names per shard, vector counts,
5//! graph/vectors file sizes. The server does not need to be running.
6//!
7//! The layout this expects mirrors what the server writes:
8//!
9//! ```text
10//! <data-dir>/shard-<N>/
11//!   vindexes.registry        <- [u32 count] + per entry [u16 name_len][name][u32 dim]
12//!   vindex-<name>/
13//!     graph.vmn
14//!     vectors.bin
15//! ```
16
17use std::fmt;
18use std::fs;
19use std::io::{self, Read};
20use std::path::{Path, PathBuf};
21
22use skeg_vector::DiskVamanaIndex;
23
/// A single VINDEX listed in a shard's `vindexes.registry`, together with
/// what was found for it on disk.
#[derive(Debug, Clone)]
pub struct VindexEntry {
    /// VINDEX name as stored in the registry; its files live under
    /// `vindex-<name>/` inside the shard directory.
    pub name: String,
    /// Vector dimension recorded in the registry.
    pub dim: usize,
    /// Vector count reported by the opened `DiskVamanaIndex`. `None` when
    /// the index could not be opened (e.g. missing directory, corrupted or
    /// partially written files).
    pub n_vectors: Option<usize>,
    /// Size of `graph.vmn` in bytes. 0 when its metadata cannot be read
    /// (typically because the file is missing).
    pub graph_bytes: u64,
    /// Size of `vectors.bin` in bytes. 0 when its metadata cannot be read
    /// (typically because the file is missing).
    pub vectors_bytes: u64,
}
37
38/// One shard's contribution to the inspect report.
39#[derive(Debug, Clone)]
40pub struct ShardReport {
41    pub shard_id: usize,
42    pub vindexes: Vec<VindexEntry>,
43    /// Sum of every regular file directly under `shard-<N>/` that is not a
44    /// `vindex-*` directory. Mostly: the vLog segments and the index
45    /// snapshot.
46    pub kv_bytes: u64,
47}
48
49/// Full report on a data directory.
50#[derive(Debug, Clone)]
51pub struct InspectReport {
52    pub data_dir: PathBuf,
53    pub shards: Vec<ShardReport>,
54}
55
56/// Inspect a data directory. Returns an empty report if the directory
57/// contains no `shard-*` subdirectories.
58///
59/// # Errors
60///
61/// Returns an error only on filesystem errors that are not "missing file"
62/// (e.g. permission denied). A truncated or missing `vindexes.registry`
63/// is reported by leaving the shard's `vindexes` list empty.
64pub fn inspect(data_dir: &Path) -> io::Result<InspectReport> {
65    let mut shards = Vec::new();
66    if !data_dir.exists() {
67        return Err(io::Error::new(
68            io::ErrorKind::NotFound,
69            format!("data directory not found: {}", data_dir.display()),
70        ));
71    }
72    for entry in fs::read_dir(data_dir)? {
73        let entry = entry?;
74        let name = entry.file_name();
75        let name = name.to_string_lossy();
76        let Some(rest) = name.strip_prefix("shard-") else {
77            continue;
78        };
79        let Ok(shard_id) = rest.parse::<usize>() else {
80            continue;
81        };
82        shards.push(inspect_shard(shard_id, &entry.path())?);
83    }
84    shards.sort_by_key(|s| s.shard_id);
85    Ok(InspectReport {
86        data_dir: data_dir.to_path_buf(),
87        shards,
88    })
89}
90
91fn inspect_shard(shard_id: usize, shard_dir: &Path) -> io::Result<ShardReport> {
92    let entries = read_registry(&shard_dir.join("vindexes.registry"))?;
93    let mut vindexes = Vec::with_capacity(entries.len());
94    for (name, dim) in entries {
95        let vindex_dir = shard_dir.join(format!("vindex-{name}"));
96        let graph_bytes = fs::metadata(vindex_dir.join("graph.vmn"))
97            .map(|m| m.len())
98            .unwrap_or(0);
99        let vectors_bytes = fs::metadata(vindex_dir.join("vectors.bin"))
100            .map(|m| m.len())
101            .unwrap_or(0);
102        let n_vectors = DiskVamanaIndex::open(&vindex_dir).ok().map(|i| i.len());
103        vindexes.push(VindexEntry {
104            name,
105            dim,
106            n_vectors,
107            graph_bytes,
108            vectors_bytes,
109        });
110    }
111    let kv_bytes = kv_size_in_shard(shard_dir)?;
112    Ok(ShardReport {
113        shard_id,
114        vindexes,
115        kv_bytes,
116    })
117}
118
/// Total size in bytes of the regular files located directly inside
/// `shard_dir`, leaving out `vindexes.registry`.
///
/// Only entries whose own file type is "regular file" are counted, so
/// subdirectories (including `vindex-*`) contribute nothing and are not
/// descended into.
///
/// # Errors
///
/// Returns any error raised while listing the directory or reading an
/// entry's type or metadata.
fn kv_size_in_shard(shard_dir: &Path) -> io::Result<u64> {
    fs::read_dir(shard_dir)?.try_fold(0u64, |sum, dirent| -> io::Result<u64> {
        let dirent = dirent?;
        let counted =
            dirent.file_type()?.is_file() && dirent.file_name() != "vindexes.registry";
        if counted {
            Ok(sum + dirent.metadata()?.len())
        } else {
            Ok(sum)
        }
    })
}
136
/// Parse a `vindexes.registry` file into `(name, dim)` pairs.
///
/// Layout (all integers little-endian): a `u32` entry count, then per entry
/// a `u16` name length, the name bytes, and a `u32` dimension. Name bytes
/// that are not valid UTF-8 are converted lossily.
///
/// The parser is deliberately forgiving:
/// - a missing file, or one shorter than the 4-byte header, yields an empty
///   list;
/// - if the file ends before the declared number of entries has been read,
///   the entries decoded so far are returned;
/// - bytes after the declared number of entries are ignored.
///
/// # Errors
///
/// Returns any I/O error other than `NotFound` raised while opening or
/// reading the file.
fn read_registry(path: &Path) -> io::Result<Vec<(String, usize)>> {
    /// Smallest possible entry: `u16` name length + empty name + `u32` dim.
    const MIN_ENTRY_BYTES: usize = 2 + 4;

    let mut file = match fs::File::open(path) {
        Ok(f) => f,
        Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(Vec::new()),
        Err(e) => return Err(e),
    };
    let mut buf = Vec::new();
    file.read_to_end(&mut buf)?;
    if buf.len() < 4 {
        return Ok(Vec::new());
    }

    let (header, mut rest) = buf.split_at(4);
    let count = u32::from_le_bytes([header[0], header[1], header[2], header[3]]) as usize;

    // The count comes straight from disk; a corrupt header must not drive a
    // multi-gigabyte preallocation. Reserve only what the remaining bytes
    // could actually encode.
    let mut out = Vec::with_capacity(count.min(rest.len() / MIN_ENTRY_BYTES));

    for _ in 0..count {
        if rest.len() < 2 {
            break;
        }
        let (len_bytes, tail) = rest.split_at(2);
        let name_len = usize::from(u16::from_le_bytes([len_bytes[0], len_bytes[1]]));
        if tail.len() < name_len + 4 {
            break;
        }
        let (name_bytes, tail) = tail.split_at(name_len);
        let (dim_bytes, tail) = tail.split_at(4);
        let dim =
            u32::from_le_bytes([dim_bytes[0], dim_bytes[1], dim_bytes[2], dim_bytes[3]]) as usize;
        out.push((String::from_utf8_lossy(name_bytes).into_owned(), dim));
        rest = tail;
    }
    Ok(out)
}
170
171impl fmt::Display for InspectReport {
172    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
173        writeln!(f, "data_dir: {}", self.data_dir.display())?;
174        writeln!(f, "shards:   {}", self.shards.len())?;
175        if self.shards.is_empty() {
176            writeln!(f, "(no shard-* subdirectories found)")?;
177            return Ok(());
178        }
179        let total_kv: u64 = self.shards.iter().map(|s| s.kv_bytes).sum();
180        let total_vindexes: usize = self.shards.iter().map(|s| s.vindexes.len()).sum();
181        writeln!(f, "kv_bytes: {}", human_bytes(total_kv))?;
182        writeln!(f, "vindexes: {total_vindexes}")?;
183        writeln!(f)?;
184        for shard in &self.shards {
185            writeln!(f, "[shard-{}]", shard.shard_id)?;
186            writeln!(f, "  kv_bytes: {}", human_bytes(shard.kv_bytes))?;
187            if shard.vindexes.is_empty() {
188                writeln!(f, "  (no vindexes)")?;
189                continue;
190            }
191            for v in &shard.vindexes {
192                let n = v
193                    .n_vectors
194                    .map_or_else(|| "?".to_string(), |n| n.to_string());
195                writeln!(
196                    f,
197                    "  vindex={} dim={} n={} graph={} vectors={}",
198                    v.name,
199                    v.dim,
200                    n,
201                    human_bytes(v.graph_bytes),
202                    human_bytes(v.vectors_bytes),
203                )?;
204            }
205        }
206        Ok(())
207    }
208}
209
/// Format a byte count for display using binary units.
///
/// Values of at least 1 KiB are shown with two decimals in the largest of
/// KiB / MiB / GiB that does not exceed the value; smaller values are shown
/// as an exact integer followed by `B`.
// The cast to f64 is for display only; losing precision above 2^53 is fine.
#[allow(clippy::cast_precision_loss)]
fn human_bytes(n: u64) -> String {
    const KIB: f64 = 1024.0;
    const MIB: f64 = KIB * 1024.0;
    const GIB: f64 = MIB * 1024.0;
    // Largest unit first so the first threshold reached wins.
    const SCALES: [(f64, &str); 3] = [(GIB, "GiB"), (MIB, "MiB"), (KIB, "KiB")];

    let value = n as f64;
    for (threshold, suffix) in SCALES {
        if value >= threshold {
            return format!("{:.2} {suffix}", value / threshold);
        }
    }
    format!("{n} B")
}
226
#[cfg(test)]
mod tests {
    use super::*;
    use crate::build_index_from;
    use skeg_vector::VamanaConfig;
    use tempfile::TempDir;

    /// Deterministic pseudo-random vector of length `dim`: an xorshift
    /// sequence over a seed-derived odd state, with the low 16 bits of each
    /// step mapped into `[-1.0, 1.0)`.
    #[allow(clippy::cast_precision_loss)]
    fn tvec(seed: u64, dim: usize) -> Vec<f32> {
        let mut state = (seed << 1) | 1;
        let mut values = Vec::with_capacity(dim);
        for _ in 0..dim {
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            values.push(((state & 0xFFFF) as f32 / 32768.0) - 1.0);
        }
        values
    }

    /// Encode a registry holding exactly one `(name, dim)` entry, using the
    /// layout `read_registry` parses.
    fn single_entry_registry(name: &[u8], name_len: u16, dim: u32) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(&1u32.to_le_bytes());
        bytes.extend_from_slice(&name_len.to_le_bytes());
        bytes.extend_from_slice(name);
        bytes.extend_from_slice(&dim.to_le_bytes());
        bytes
    }

    #[test]
    fn inspect_of_a_built_index_reports_vindex_and_dim() {
        let n = 32;
        let dim = 8;
        let mut flat: Vec<f32> = Vec::new();
        for i in 0..n {
            flat.extend(tvec(i as u64 + 1, dim));
        }
        let out = TempDir::new().unwrap();
        build_index_from(flat, n, dim, out.path(), "docs", &VamanaConfig::default()).unwrap();

        let report = inspect(out.path()).unwrap();
        assert_eq!(report.shards.len(), 1);

        let first_shard = &report.shards[0];
        assert_eq!(first_shard.shard_id, 0);
        assert_eq!(first_shard.vindexes.len(), 1);

        let docs = &first_shard.vindexes[0];
        assert_eq!(docs.name, "docs");
        assert_eq!(docs.dim, dim);
        assert_eq!(docs.n_vectors, Some(n));
        assert!(docs.graph_bytes > 0);
        assert!(docs.vectors_bytes > 0);
    }

    #[test]
    fn inspect_of_an_empty_dir_returns_empty_report() {
        let empty = TempDir::new().unwrap();
        let report = inspect(empty.path()).unwrap();
        assert!(report.shards.is_empty());
    }

    #[test]
    fn inspect_of_a_missing_dir_is_an_error() {
        let parent = TempDir::new().unwrap();
        let absent = parent.path().join("not-here");
        assert!(inspect(&absent).is_err());
    }

    #[test]
    fn registry_with_one_entry_roundtrips() {
        let root = TempDir::new().unwrap();
        let shard_dir = root.path().join("shard-0");
        fs::create_dir_all(&shard_dir).unwrap();

        // count=1, name="docs", dim=8; mirrors what build_index writes.
        let registry_path = shard_dir.join("vindexes.registry");
        fs::write(&registry_path, single_entry_registry(b"docs", 4, 8)).unwrap();

        let parsed = read_registry(&registry_path).unwrap();
        assert_eq!(parsed, vec![("docs".to_string(), 8)]);
    }
}
298}