iqdb_eval/dataset.rs
1//! Dataset loaders for the TEXMEX SIFT family.
2//!
3//! The SIFT corpus (and its `siftsmall` and `GIST` siblings) is shipped as
4//! a pair of `.fvecs` files (base vectors and query vectors) and one
5//! `.ivecs` file (per-query top-100 ground-truth neighbour ids). All three
6//! share the same record layout: a little-endian `u32 dim` header
7//! followed by `dim` payload elements (`f32` for `.fvecs`, `i32` for
8//! `.ivecs`).
9//!
10//! The readers and [`load_sift_dataset`] are minimal, hand-rolled, and
11//! pull in no new external parsing dependencies. They generalize the
12//! one-off versions that previously lived in `iqdb-hnsw/tests/sift_recall.rs`.
13
14use std::fs::File;
15use std::io::{BufReader, Read};
16use std::path::{Path, PathBuf};
17
18use crate::error::{EvalError, Result};
19
/// An in-memory SIFT-family dataset: the base set, the query set, the
/// ground-truth neighbour lists, and the dimensionality the vectors share.
///
/// The position of a vector inside `base` doubles as its ID: `base[i]` is
/// the vector that [`crate::build_index_from_base`] registers under ID `i`,
/// and the ids stored in the `.ivecs` ground-truth entries are these same
/// row indices.
#[derive(Clone, Debug)]
pub struct SiftDataset {
    /// Base vectors from which the index under test is built.
    pub base: Vec<Vec<f32>>,
    /// Query vectors used to measure recall and latency.
    pub queries: Vec<Vec<f32>>,
    /// Exact top-`k` neighbour ids for each query; every id is a row index
    /// into `base`.
    pub ground_truth: Vec<Vec<u32>>,
    /// Dimensionality shared by every base and query vector.
    pub dim: usize,
}
37
/// Largest per-record dimensionality the `.fvecs` / `.ivecs` readers accept.
///
/// The `u32 dim` header of each record is read from an untrusted file, so a
/// damaged or malicious file may announce anything up to `u32::MAX`. Left
/// unchecked, that would trigger one ~16 GiB allocation (`4 * u32::MAX`
/// bytes) before the read could even fail. Real TEXMEX data tops out at
/// 960-D (GIST), so a limit of `2^20` sits orders of magnitude above any
/// legitimate dataset while keeping one record's scratch buffer at or below
/// 4 MiB. A header larger than this yields [`EvalError::Parse`].
const MAX_RECORD_DIM: usize = 1_048_576; // 2^20
49
50/// Read a length-prefixed TEXMEX record stream into one `Vec<T>` per record,
51/// decoding each little-endian 4-byte payload word with `decode`.
52///
53/// Shared by [`read_fvecs`] and [`read_ivecs`], which differ only in how the
54/// 4-byte words are interpreted. Centralizes the bounds check on the untrusted
55/// per-record dimension (see [`MAX_RECORD_DIM`]) and the truncated-record
56/// handling so both readers stay identical and hardened.
57fn read_vecs<T, F>(path: &Path, truncated_reason: &'static str, decode: F) -> Result<Vec<Vec<T>>>
58where
59 F: Fn([u8; 4]) -> T,
60{
61 let file = File::open(path).map_err(|source| EvalError::Io {
62 path: path.to_path_buf(),
63 source,
64 })?;
65 let mut r = BufReader::new(file);
66 let mut out: Vec<Vec<T>> = Vec::new();
67 let mut dim_buf = [0u8; 4];
68 loop {
69 match r.read_exact(&mut dim_buf) {
70 Ok(()) => {}
71 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
72 Err(source) => {
73 return Err(EvalError::Io {
74 path: path.to_path_buf(),
75 source,
76 });
77 }
78 }
79 let dim = u32::from_le_bytes(dim_buf) as usize;
80 if dim > MAX_RECORD_DIM {
81 return Err(EvalError::Parse {
82 path: path.to_path_buf(),
83 reason: "record dimension exceeds the maximum supported (file likely corrupt)",
84 });
85 }
86 // `dim <= MAX_RECORD_DIM` (2^20), so `dim * 4` cannot overflow `usize`.
87 let mut payload = vec![0u8; dim * 4];
88 r.read_exact(&mut payload).map_err(|source| {
89 if source.kind() == std::io::ErrorKind::UnexpectedEof {
90 EvalError::Parse {
91 path: path.to_path_buf(),
92 reason: truncated_reason,
93 }
94 } else {
95 EvalError::Io {
96 path: path.to_path_buf(),
97 source,
98 }
99 }
100 })?;
101 let row: Vec<T> = payload
102 .chunks_exact(4)
103 .map(|c| decode([c[0], c[1], c[2], c[3]]))
104 .collect();
105 out.push(row);
106 }
107 Ok(out)
108}
109
110/// Read a `.fvecs` file (TEXMEX corpus format) into one `Vec<f32>` per
111/// record.
112///
113/// Each on-disk record is a little-endian `u32 dim` followed by `dim`
114/// little-endian `f32` payload values. A truncated trailing record returns
115/// [`EvalError::Parse`]; a record whose header claims a dimension above the
116/// internal cap of `2^20` (treated as corruption) also returns
117/// [`EvalError::Parse`]; an open or read failure returns [`EvalError::Io`].
118///
119/// # Examples
120///
121/// ```no_run
122/// use iqdb_eval::read_fvecs;
123///
124/// # fn run() -> Result<(), iqdb_eval::EvalError> {
125/// let rows = read_fvecs(".bench-data/siftsmall/siftsmall_base.fvecs")?;
126/// assert!(!rows.is_empty());
127/// # Ok(())
128/// # }
129/// ```
130pub fn read_fvecs(path: impl AsRef<Path>) -> Result<Vec<Vec<f32>>> {
131 read_vecs(
132 path.as_ref(),
133 "truncated fvecs record payload",
134 f32::from_le_bytes,
135 )
136}
137
138/// Read an `.ivecs` file (TEXMEX corpus format) into one `Vec<u32>` per
139/// record.
140///
141/// Identical on-disk layout to [`read_fvecs`], but the payload is
142/// little-endian `i32`. SIFT ground-truth ids are always non-negative
143/// row indices, so `u32` is the natural fit; this reader does not check
144/// for negative values. The same `2^20` dimension bound and
145/// truncated-record handling as [`read_fvecs`] apply.
146///
147/// # Examples
148///
149/// ```no_run
150/// use iqdb_eval::read_ivecs;
151///
152/// # fn run() -> Result<(), iqdb_eval::EvalError> {
153/// let gt = read_ivecs(".bench-data/siftsmall/siftsmall_groundtruth.ivecs")?;
154/// assert!(!gt.is_empty());
155/// # Ok(())
156/// # }
157/// ```
158pub fn read_ivecs(path: impl AsRef<Path>) -> Result<Vec<Vec<u32>>> {
159 read_vecs(
160 path.as_ref(),
161 "truncated ivecs record payload",
162 u32::from_le_bytes,
163 )
164}
165
166/// Load a SIFT-family dataset rooted at `root` and named by `prefix`.
167///
168/// Resolves the canonical TEXMEX file names: `{prefix}_base.fvecs`,
169/// `{prefix}_query.fvecs`, and `{prefix}_groundtruth.ivecs` directly
170/// under `root`. For example, `load_sift_dataset(".bench-data/siftsmall",
171/// "siftsmall")` reads `.bench-data/siftsmall/siftsmall_base.fvecs` and
172/// its siblings.
173///
174/// Validates: every set is non-empty; every row in `base` and `queries`
175/// has the same dimensionality; `queries.len() == ground_truth.len()`.
176/// Returns [`EvalError::EmptyInput`], [`EvalError::DimensionMismatch`],
177/// or [`EvalError::LengthMismatch`] accordingly.
178///
179/// # Examples
180///
181/// ```no_run
182/// use iqdb_eval::load_sift_dataset;
183///
184/// # fn run() -> Result<(), iqdb_eval::EvalError> {
185/// let dataset = load_sift_dataset(".bench-data/siftsmall", "siftsmall")?;
186/// assert_eq!(dataset.queries.len(), dataset.ground_truth.len());
187/// # Ok(())
188/// # }
189/// ```
190pub fn load_sift_dataset(root: impl AsRef<Path>, prefix: &str) -> Result<SiftDataset> {
191 let root = root.as_ref();
192 let base_path: PathBuf = root.join(format!("{prefix}_base.fvecs"));
193 let query_path: PathBuf = root.join(format!("{prefix}_query.fvecs"));
194 let gt_path: PathBuf = root.join(format!("{prefix}_groundtruth.ivecs"));
195
196 let base = read_fvecs(&base_path)?;
197 let queries = read_fvecs(&query_path)?;
198 let ground_truth = read_ivecs(>_path)?;
199
200 if base.is_empty() {
201 return Err(EvalError::EmptyInput { kind: "base" });
202 }
203 if queries.is_empty() {
204 return Err(EvalError::EmptyInput { kind: "queries" });
205 }
206 if ground_truth.is_empty() {
207 return Err(EvalError::EmptyInput {
208 kind: "ground_truth",
209 });
210 }
211
212 let dim = base[0].len();
213 if let Some(row) = base.iter().find(|r| r.len() != dim) {
214 return Err(EvalError::DimensionMismatch {
215 expected: dim,
216 found: row.len(),
217 });
218 }
219 if let Some(row) = queries.iter().find(|r| r.len() != dim) {
220 return Err(EvalError::DimensionMismatch {
221 expected: dim,
222 found: row.len(),
223 });
224 }
225 if queries.len() != ground_truth.len() {
226 return Err(EvalError::LengthMismatch {
227 kind: "queries vs ground_truth",
228 expected: queries.len(),
229 found: ground_truth.len(),
230 });
231 }
232
233 Ok(SiftDataset {
234 base,
235 queries,
236 ground_truth,
237 dim,
238 })
239}
240
#[cfg(test)]
mod tests {
    // Tests fail by panicking, so `unwrap`/`expect` are the intended style here.
    #![allow(clippy::unwrap_used, clippy::expect_used)]

    use super::*;
    use std::fs;

    /// Encode rows in TEXMEX `.fvecs` layout: per record, a little-endian
    /// `u32` dimension followed by `dim` little-endian `f32` payload words.
    fn encode_fvecs(rows: &[&[f32]]) -> Vec<u8> {
        let mut bytes = Vec::new();
        for row in rows {
            bytes.extend_from_slice(&(row.len() as u32).to_le_bytes());
            for &x in *row {
                bytes.extend_from_slice(&x.to_le_bytes());
            }
        }
        bytes
    }

    /// Encode rows in TEXMEX `.ivecs` layout (same header, `u32`/`i32`
    /// payload words).
    fn encode_ivecs(rows: &[&[u32]]) -> Vec<u8> {
        let mut bytes = Vec::new();
        for row in rows {
            bytes.extend_from_slice(&(row.len() as u32).to_le_bytes());
            for &x in *row {
                bytes.extend_from_slice(&x.to_le_bytes());
            }
        }
        bytes
    }

    /// A temp file unique per test name *and* per process; removed on drop so
    /// failures do not leak files.
    ///
    /// The per-test name keeps parallel tests inside one test binary from
    /// colliding. The process id keeps two concurrently running test
    /// processes (which share the system temp directory) from overwriting or
    /// deleting each other's fixtures. No timestamp/random source is used.
    struct TempFile(PathBuf);
    impl TempFile {
        fn new(name: &str, bytes: &[u8]) -> Self {
            let file_name = format!("iqdb_eval_{}_{name}", std::process::id());
            let path = std::env::temp_dir().join(file_name);
            fs::write(&path, bytes).unwrap();
            Self(path)
        }
        fn path(&self) -> &Path {
            &self.0
        }
    }
    impl Drop for TempFile {
        fn drop(&mut self) {
            // Best-effort cleanup: a failed removal must not mask the test result.
            let _ = fs::remove_file(&self.0);
        }
    }

    #[test]
    fn fvecs_round_trips() {
        let rows: &[&[f32]] = &[&[1.0, 2.0, 3.0], &[-4.5, 0.0, 9.25]];
        let f = TempFile::new("rt.fvecs", &encode_fvecs(rows));
        let got = read_fvecs(f.path()).unwrap();
        assert_eq!(got, vec![vec![1.0, 2.0, 3.0], vec![-4.5, 0.0, 9.25]]);
    }

    #[test]
    fn ivecs_round_trips() {
        let rows: &[&[u32]] = &[&[0, 1, 2], &[7, 8, 9]];
        let f = TempFile::new("rt.ivecs", &encode_ivecs(rows));
        let got = read_ivecs(f.path()).unwrap();
        assert_eq!(got, vec![vec![0u32, 1, 2], vec![7, 8, 9]]);
    }

    #[test]
    fn empty_file_reads_empty() {
        let f = TempFile::new("empty.fvecs", &[]);
        assert!(read_fvecs(f.path()).unwrap().is_empty());
    }

    #[test]
    fn truncated_payload_is_parse_error() {
        // Header claims dim=3 but only two floats follow.
        let mut bytes = 3u32.to_le_bytes().to_vec();
        bytes.extend_from_slice(&1.0f32.to_le_bytes());
        bytes.extend_from_slice(&2.0f32.to_le_bytes());
        let f = TempFile::new("trunc.fvecs", &bytes);
        let err = read_fvecs(f.path()).unwrap_err();
        assert!(matches!(err, EvalError::Parse { .. }), "got {err:?}");
    }

    #[test]
    fn trailing_partial_header_stops_cleanly() {
        // One full record, then two stray bytes (an incomplete next header).
        let mut bytes = encode_fvecs(&[&[1.0, 2.0]]);
        bytes.extend_from_slice(&[0xAB, 0xCD]);
        let f = TempFile::new("partial.fvecs", &bytes);
        let got = read_fvecs(f.path()).unwrap();
        assert_eq!(got, vec![vec![1.0, 2.0]]);
    }

    #[test]
    fn oversized_dim_is_rejected_without_allocating() {
        // A hostile header claiming a dimension above the cap must error before
        // attempting the (here, ~16 GiB) payload allocation.
        let bytes = u32::MAX.to_le_bytes().to_vec();
        let f = TempFile::new("huge.fvecs", &bytes);
        let err = read_fvecs(f.path()).unwrap_err();
        match err {
            EvalError::Parse { reason, .. } => {
                assert!(reason.contains("dimension"), "unexpected reason: {reason}");
            }
            other => panic!("expected Parse, got {other:?}"),
        }
    }

    #[test]
    fn dim_exactly_at_cap_is_accepted_in_header() {
        // The cap itself is allowed by the bound check; the read then fails as
        // a truncated payload (we do not write 4 MiB), proving the boundary is
        // inclusive and that rejection is by truncation, not by the cap.
        let bytes = (MAX_RECORD_DIM as u32).to_le_bytes().to_vec();
        let f = TempFile::new("atcap.fvecs", &bytes);
        let err = read_fvecs(f.path()).unwrap_err();
        assert!(
            matches!(&err, EvalError::Parse { reason, .. } if reason.contains("truncated")),
            "expected truncated-payload parse error, got {err:?}",
        );
    }

    #[test]
    fn missing_file_is_io_error() {
        // This path is never created, so opening it must fail.
        let path = std::env::temp_dir().join("iqdb_eval_does_not_exist_xyz.fvecs");
        let err = read_fvecs(&path).unwrap_err();
        assert!(matches!(err, EvalError::Io { .. }), "got {err:?}");
    }
}