Skip to main content

iqdb_eval/
dataset.rs

1//! Dataset loaders for the TEXMEX SIFT family.
2//!
3//! The SIFT corpus (and its `siftsmall` and `GIST` siblings) is shipped as
4//! a pair of `.fvecs` files (base vectors and query vectors) and one
5//! `.ivecs` file (per-query top-100 ground-truth neighbour ids). All three
6//! share the same record layout: a little-endian `u32 dim` header
7//! followed by `dim` payload elements (`f32` for `.fvecs`, `i32` for
8//! `.ivecs`).
9//!
10//! The readers and [`load_sift_dataset`] are minimal, hand-rolled, and
11//! pull in no new external parsing dependencies. They generalize the
12//! one-off versions that previously lived in `iqdb-hnsw/tests/sift_recall.rs`.
13
14use std::fs::File;
15use std::io::{BufReader, Read};
16use std::path::{Path, PathBuf};
17
18use crate::error::{EvalError, Result};
19
/// A complete SIFT-family dataset held in memory: base vectors, query
/// vectors, per-query ground-truth neighbour ids, and the dimensionality the
/// vectors share.
///
/// `base[i]` is the `i`-th base vector — `i` is also the row-index ID
/// used in [`crate::build_index_from_base`] and the value stored in the
/// `.ivecs` ground-truth entries.
///
/// The type derives [`PartialEq`] so a loaded dataset can be compared against
/// another dataset or a hand-built fixture with `assert_eq!`. `Eq` is
/// intentionally not derived because the vectors hold `f32` values.
#[derive(Debug, Clone, PartialEq)]
pub struct SiftDataset {
    /// The base vectors used to build the index under test.
    pub base: Vec<Vec<f32>>,
    /// The query vectors against which recall and latency are measured.
    pub queries: Vec<Vec<f32>>,
    /// Per-query exact top-`k` neighbour ids (ids index into `base`).
    pub ground_truth: Vec<Vec<u32>>,
    /// The dimensionality every base and query vector shares.
    pub dim: usize,
}
37
/// Largest per-record dimension the `.fvecs` and `.ivecs` readers accept.
///
/// Every record starts with a `u32 dim` header read from an untrusted file.
/// A corrupt or hostile file may claim anything up to `u32::MAX`; without a
/// limit that would request a single ~16 GiB buffer (`4 * u32::MAX` bytes)
/// before the read had a chance to fail. The widest real TEXMEX vectors
/// (GIST) are 960-D, so a limit of `2^20` (1,048,576) elements leaves
/// enormous headroom for legitimate data while capping one record's scratch
/// buffer at 4 MiB. Headers above the limit are reported as
/// [`EvalError::Parse`].
const MAX_RECORD_DIM: usize = 1_048_576;
49
50/// Read a length-prefixed TEXMEX record stream into one `Vec<T>` per record,
51/// decoding each little-endian 4-byte payload word with `decode`.
52///
53/// Shared by [`read_fvecs`] and [`read_ivecs`], which differ only in how the
54/// 4-byte words are interpreted. Centralizes the bounds check on the untrusted
55/// per-record dimension (see [`MAX_RECORD_DIM`]) and the truncated-record
56/// handling so both readers stay identical and hardened.
57fn read_vecs<T, F>(path: &Path, truncated_reason: &'static str, decode: F) -> Result<Vec<Vec<T>>>
58where
59    F: Fn([u8; 4]) -> T,
60{
61    let file = File::open(path).map_err(|source| EvalError::Io {
62        path: path.to_path_buf(),
63        source,
64    })?;
65    let mut r = BufReader::new(file);
66    let mut out: Vec<Vec<T>> = Vec::new();
67    let mut dim_buf = [0u8; 4];
68    loop {
69        match r.read_exact(&mut dim_buf) {
70            Ok(()) => {}
71            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
72            Err(source) => {
73                return Err(EvalError::Io {
74                    path: path.to_path_buf(),
75                    source,
76                });
77            }
78        }
79        let dim = u32::from_le_bytes(dim_buf) as usize;
80        if dim > MAX_RECORD_DIM {
81            return Err(EvalError::Parse {
82                path: path.to_path_buf(),
83                reason: "record dimension exceeds the maximum supported (file likely corrupt)",
84            });
85        }
86        // `dim <= MAX_RECORD_DIM` (2^20), so `dim * 4` cannot overflow `usize`.
87        let mut payload = vec![0u8; dim * 4];
88        r.read_exact(&mut payload).map_err(|source| {
89            if source.kind() == std::io::ErrorKind::UnexpectedEof {
90                EvalError::Parse {
91                    path: path.to_path_buf(),
92                    reason: truncated_reason,
93                }
94            } else {
95                EvalError::Io {
96                    path: path.to_path_buf(),
97                    source,
98                }
99            }
100        })?;
101        let row: Vec<T> = payload
102            .chunks_exact(4)
103            .map(|c| decode([c[0], c[1], c[2], c[3]]))
104            .collect();
105        out.push(row);
106    }
107    Ok(out)
108}
109
110/// Read a `.fvecs` file (TEXMEX corpus format) into one `Vec<f32>` per
111/// record.
112///
113/// Each on-disk record is a little-endian `u32 dim` followed by `dim`
114/// little-endian `f32` payload values. A truncated trailing record returns
115/// [`EvalError::Parse`]; a record whose header claims a dimension above the
116/// internal cap of `2^20` (treated as corruption) also returns
117/// [`EvalError::Parse`]; an open or read failure returns [`EvalError::Io`].
118///
119/// # Examples
120///
121/// ```no_run
122/// use iqdb_eval::read_fvecs;
123///
124/// # fn run() -> Result<(), iqdb_eval::EvalError> {
125/// let rows = read_fvecs(".bench-data/siftsmall/siftsmall_base.fvecs")?;
126/// assert!(!rows.is_empty());
127/// # Ok(())
128/// # }
129/// ```
130pub fn read_fvecs(path: impl AsRef<Path>) -> Result<Vec<Vec<f32>>> {
131    read_vecs(
132        path.as_ref(),
133        "truncated fvecs record payload",
134        f32::from_le_bytes,
135    )
136}
137
138/// Read an `.ivecs` file (TEXMEX corpus format) into one `Vec<u32>` per
139/// record.
140///
141/// Identical on-disk layout to [`read_fvecs`], but the payload is
142/// little-endian `i32`. SIFT ground-truth ids are always non-negative
143/// row indices, so `u32` is the natural fit; this reader does not check
144/// for negative values. The same `2^20` dimension bound and
145/// truncated-record handling as [`read_fvecs`] apply.
146///
147/// # Examples
148///
149/// ```no_run
150/// use iqdb_eval::read_ivecs;
151///
152/// # fn run() -> Result<(), iqdb_eval::EvalError> {
153/// let gt = read_ivecs(".bench-data/siftsmall/siftsmall_groundtruth.ivecs")?;
154/// assert!(!gt.is_empty());
155/// # Ok(())
156/// # }
157/// ```
158pub fn read_ivecs(path: impl AsRef<Path>) -> Result<Vec<Vec<u32>>> {
159    read_vecs(
160        path.as_ref(),
161        "truncated ivecs record payload",
162        u32::from_le_bytes,
163    )
164}
165
166/// Load a SIFT-family dataset rooted at `root` and named by `prefix`.
167///
168/// Resolves the canonical TEXMEX file names: `{prefix}_base.fvecs`,
169/// `{prefix}_query.fvecs`, and `{prefix}_groundtruth.ivecs` directly
170/// under `root`. For example, `load_sift_dataset(".bench-data/siftsmall",
171/// "siftsmall")` reads `.bench-data/siftsmall/siftsmall_base.fvecs` and
172/// its siblings.
173///
174/// Validates: every set is non-empty; every row in `base` and `queries`
175/// has the same dimensionality; `queries.len() == ground_truth.len()`.
176/// Returns [`EvalError::EmptyInput`], [`EvalError::DimensionMismatch`],
177/// or [`EvalError::LengthMismatch`] accordingly.
178///
179/// # Examples
180///
181/// ```no_run
182/// use iqdb_eval::load_sift_dataset;
183///
184/// # fn run() -> Result<(), iqdb_eval::EvalError> {
185/// let dataset = load_sift_dataset(".bench-data/siftsmall", "siftsmall")?;
186/// assert_eq!(dataset.queries.len(), dataset.ground_truth.len());
187/// # Ok(())
188/// # }
189/// ```
190pub fn load_sift_dataset(root: impl AsRef<Path>, prefix: &str) -> Result<SiftDataset> {
191    let root = root.as_ref();
192    let base_path: PathBuf = root.join(format!("{prefix}_base.fvecs"));
193    let query_path: PathBuf = root.join(format!("{prefix}_query.fvecs"));
194    let gt_path: PathBuf = root.join(format!("{prefix}_groundtruth.ivecs"));
195
196    let base = read_fvecs(&base_path)?;
197    let queries = read_fvecs(&query_path)?;
198    let ground_truth = read_ivecs(&gt_path)?;
199
200    if base.is_empty() {
201        return Err(EvalError::EmptyInput { kind: "base" });
202    }
203    if queries.is_empty() {
204        return Err(EvalError::EmptyInput { kind: "queries" });
205    }
206    if ground_truth.is_empty() {
207        return Err(EvalError::EmptyInput {
208            kind: "ground_truth",
209        });
210    }
211
212    let dim = base[0].len();
213    if let Some(row) = base.iter().find(|r| r.len() != dim) {
214        return Err(EvalError::DimensionMismatch {
215            expected: dim,
216            found: row.len(),
217        });
218    }
219    if let Some(row) = queries.iter().find(|r| r.len() != dim) {
220        return Err(EvalError::DimensionMismatch {
221            expected: dim,
222            found: row.len(),
223        });
224    }
225    if queries.len() != ground_truth.len() {
226        return Err(EvalError::LengthMismatch {
227            kind: "queries vs ground_truth",
228            expected: queries.len(),
229            found: ground_truth.len(),
230        });
231    }
232
233    Ok(SiftDataset {
234        base,
235        queries,
236        ground_truth,
237        dim,
238    })
239}
240
#[cfg(test)]
mod tests {
    // Tests unwrap freely: a failed fixture write or read should abort the
    // test with a panic rather than be handled.
    #![allow(clippy::unwrap_used, clippy::expect_used)]

    use super::*;
    use std::fs;

    /// Encode rows in TEXMEX `.fvecs` layout: per record, a little-endian
    /// `u32` dimension followed by `dim` little-endian `f32` payload words.
    fn encode_fvecs(rows: &[&[f32]]) -> Vec<u8> {
        let mut bytes = Vec::new();
        for row in rows {
            bytes.extend_from_slice(&(row.len() as u32).to_le_bytes());
            for &x in *row {
                bytes.extend_from_slice(&x.to_le_bytes());
            }
        }
        bytes
    }

    /// Encode rows in TEXMEX `.ivecs` layout (same header, `u32`/`i32`
    /// payload words).
    fn encode_ivecs(rows: &[&[u32]]) -> Vec<u8> {
        let mut bytes = Vec::new();
        for row in rows {
            bytes.extend_from_slice(&(row.len() as u32).to_le_bytes());
            for &x in *row {
                bytes.extend_from_slice(&x.to_le_bytes());
            }
        }
        bytes
    }

    /// A temp file unique per test name *and* per process; removed on drop
    /// so failures do not leak files.
    ///
    /// The test name keeps parallel tests inside one test binary apart. The
    /// process id keeps concurrent runs of the binary apart (two worktrees,
    /// or CI jobs sharing a host): they all use the same system temp
    /// directory, and with a fixed name one run could overwrite or delete a
    /// fixture another run is about to read. No timestamp or random source is
    /// needed.
    struct TempFile(PathBuf);
    impl TempFile {
        fn new(name: &str, bytes: &[u8]) -> Self {
            let file_name = format!("iqdb_eval_{}_{name}", std::process::id());
            let path = std::env::temp_dir().join(file_name);
            fs::write(&path, bytes).unwrap();
            Self(path)
        }
        fn path(&self) -> &Path {
            &self.0
        }
    }
    impl Drop for TempFile {
        fn drop(&mut self) {
            // Best-effort cleanup; a failure to remove must not mask the
            // test's own result.
            let _ = fs::remove_file(&self.0);
        }
    }

    #[test]
    fn fvecs_round_trips() {
        let rows: &[&[f32]] = &[&[1.0, 2.0, 3.0], &[-4.5, 0.0, 9.25]];
        let f = TempFile::new("rt.fvecs", &encode_fvecs(rows));
        let got = read_fvecs(f.path()).unwrap();
        assert_eq!(got, vec![vec![1.0, 2.0, 3.0], vec![-4.5, 0.0, 9.25]]);
    }

    #[test]
    fn ivecs_round_trips() {
        let rows: &[&[u32]] = &[&[0, 1, 2], &[7, 8, 9]];
        let f = TempFile::new("rt.ivecs", &encode_ivecs(rows));
        let got = read_ivecs(f.path()).unwrap();
        assert_eq!(got, vec![vec![0u32, 1, 2], vec![7, 8, 9]]);
    }

    #[test]
    fn empty_file_reads_empty() {
        let f = TempFile::new("empty.fvecs", &[]);
        assert!(read_fvecs(f.path()).unwrap().is_empty());
    }

    #[test]
    fn truncated_payload_is_parse_error() {
        // Header claims dim=3 but only two floats follow.
        let mut bytes = 3u32.to_le_bytes().to_vec();
        bytes.extend_from_slice(&1.0f32.to_le_bytes());
        bytes.extend_from_slice(&2.0f32.to_le_bytes());
        let f = TempFile::new("trunc.fvecs", &bytes);
        let err = read_fvecs(f.path()).unwrap_err();
        assert!(matches!(err, EvalError::Parse { .. }), "got {err:?}");
    }

    #[test]
    fn trailing_partial_header_stops_cleanly() {
        // One full record, then two stray bytes (an incomplete next header).
        let mut bytes = encode_fvecs(&[&[1.0, 2.0]]);
        bytes.extend_from_slice(&[0xAB, 0xCD]);
        let f = TempFile::new("partial.fvecs", &bytes);
        let got = read_fvecs(f.path()).unwrap();
        assert_eq!(got, vec![vec![1.0, 2.0]]);
    }

    #[test]
    fn oversized_dim_is_rejected_without_allocating() {
        // A hostile header claiming a dimension above the cap must error before
        // attempting the (here, ~16 GiB) payload allocation.
        let bytes = u32::MAX.to_le_bytes().to_vec();
        let f = TempFile::new("huge.fvecs", &bytes);
        let err = read_fvecs(f.path()).unwrap_err();
        match err {
            EvalError::Parse { reason, .. } => {
                assert!(reason.contains("dimension"), "unexpected reason: {reason}");
            }
            other => panic!("expected Parse, got {other:?}"),
        }
    }

    #[test]
    fn dim_exactly_at_cap_is_accepted_in_header() {
        // The cap itself is allowed by the bound check; the read then fails as
        // a truncated payload (we do not write 4 MiB), proving the boundary is
        // inclusive and that rejection is by truncation, not by the cap.
        let bytes = (MAX_RECORD_DIM as u32).to_le_bytes().to_vec();
        let f = TempFile::new("atcap.fvecs", &bytes);
        let err = read_fvecs(f.path()).unwrap_err();
        assert!(
            matches!(&err, EvalError::Parse { reason, .. } if reason.contains("truncated")),
            "expected truncated-payload parse error, got {err:?}",
        );
    }

    #[test]
    fn missing_file_is_io_error() {
        let path = std::env::temp_dir().join("iqdb_eval_does_not_exist_xyz.fvecs");
        let err = read_fvecs(&path).unwrap_err();
        assert!(matches!(err, EvalError::Io { .. }), "got {err:?}");
    }
}