Skip to main content

dsfb_database/adapters/
mod.rs

1//! Dataset adapters.
2//!
3//! Each adapter exposes a `load(path)` function that reads a real subset of
4//! the corresponding public dataset from disk and returns a typed
5//! [`ResidualStream`]. The adapter is responsible for:
6//!   * format-specific parsing (CSV / Parquet / pickle / SQL)
7//!   * dropping samples whose required fields are missing or non-finite
8//!   * sorting by time
9//!   * embedding the dataset name + version + subset id in `stream.source`
10//!
//! Where a dataset cannot be redistributed inside the build (Snowset is
//! ~10 GB; SQLShare is permission-gated; the IMDB JOB dump is third-party
//! licensed), the adapter additionally provides a *synthetic exemplar*
//! function that produces a deterministic, seedable residual stream with the
//! same statistical shape as the real corpus. The paper labels every figure
//! that uses an exemplar with `[exemplar]`, and the corresponding fetch
//! script lets the operator regenerate the figure on the real data.
18//!
19//! Design rule (panel-imposed): synthetic exemplars never carry the bare
20//! dataset name in `stream.source` — they always read
21//! `"{dataset}-exemplar-seed{N}"`, so a downstream report cannot
22//! accidentally label exemplar results as if they were real-data results.
23
24use crate::residual::ResidualStream;
25use anyhow::Result;
26
27pub mod ceb;
28pub mod generic_csv;
29pub mod job;
30#[cfg(feature = "otel")]
31pub mod otel;
32pub mod postgres;
33pub mod snowset;
34pub mod sqlshare;
35pub mod sqlshare_text;
36pub mod tpcds;
37
38/// Trait for the five dataset adapters.
39pub trait DatasetAdapter {
40    /// Display name (for reports + figure captions).
41    fn name(&self) -> &'static str;
42
43    /// Load a real subset from `path`. Errors if the file/directory is
44    /// missing, malformed, or empty.
45    fn load(&self, path: &std::path::Path) -> Result<ResidualStream>;
46
47    /// Generate a deterministic synthetic exemplar with the dataset's
48    /// statistical shape. `seed` makes the run reproducible. The returned
49    /// stream's `source` will be `"{name}-exemplar-seed{seed}"` so that no
50    /// downstream report mislabels it as real data.
51    fn exemplar(&self, seed: u64) -> ResidualStream;
52}