tokitai-operator 0.1.0

Verified DL kernel compiler: formally-checked GEMM, p-adic, sheaf, contract-carrying ops. Paper-artifact grade.
Documentation
//! Local dataset type that holds a materialized collection of
//! `LocalSample`s together with its metadata (currently the sample count).
//!
//! [`LocalDataset`] is the user-facing entry point. It opens the two
//! SQLite files via [`SqliteDatasetReader`] and exposes a `Vec`-shaped
//! `iter()` API. We intentionally materialize the inner-join result
//! into a `Vec<LocalSample>` once so that:
//!
//! - The model driver can `.iter().cycle().take(n)` over the dataset
//!   for an arbitrary number of epochs without re-querying SQLite.
//! - The `n_samples()` accessor is `O(1)`.
//! - Shuffling, train/val split, and stratified sampling can be
//!   implemented as plain `Vec` slices in a follow-up phase.
//!
//! For the Phase 2.5 smoke tests the dataset is small (< 1000 samples
//! in the fixture) so materialization is essentially free.

use std::path::Path;

use crate::dataset_bridge::sqlite_reader::SqliteDatasetReader;
use crate::dataset_bridge::{FEATURE_DIM, LABEL_DIM};
use crate::error::Result;

/// A single joined (decision, outcome) training example.
///
/// Both vectors are stored flat. A well-formed sample satisfies
/// `features.len() == FEATURE_DIM` and `labels.len() == LABEL_DIM`;
/// the fields are public, so the type itself does not enforce this.
#[derive(Clone, Debug, PartialEq)]
pub struct LocalSample {
    /// Model input (96 values): 74 categorical one-hot entries
    /// (0.0 / 1.0), then 22 numerical entries in milli-units within
    /// `[0, 1000]`.
    pub features: Vec<f32>,
    /// Model target (20 values): a 12-way outcome one-hot, then 8
    /// auxiliary metric scalars within `[0, 1]`.
    pub labels: Vec<f32>,
}

impl LocalSample {
    /// Runtime shape check.
    ///
    /// Returns `true` only when `features` holds exactly `FEATURE_DIM`
    /// values and `labels` holds exactly `LABEL_DIM` values.
    pub fn shape_ok(&self) -> bool {
        let features_ok = self.features.len() == FEATURE_DIM;
        let labels_ok = self.labels.len() == LABEL_DIM;
        features_ok && labels_ok
    }
}

/// A materialized local dataset.
///
/// The inner-joined samples are held in memory so they can be
/// iterated repeatedly. Construction reads the SQLite files once;
/// afterwards obtaining an iterator or the sample count touches only
/// the in-memory `Vec`.
///
/// `Debug` is derived so the dataset can be used in `assert!` /
/// `dbg!` / error-context output like the sample type it contains.
#[derive(Debug)]
pub struct LocalDataset {
    /// The paired (decision, outcome) samples, materialized at open
    /// time. `samples.len() == n_samples()`.
    samples: Vec<LocalSample>,
}

impl LocalDataset {
    /// Open the two SQLite files, run the inner join, and materialize
    /// the paired samples.
    ///
    /// # Errors
    ///
    /// Propagates the error from opening the reader, from starting the
    /// pair iteration, or from the first row that fails to load. On
    /// error no partially-filled dataset is returned.
    pub fn open_sqlite(decisions_path: &Path, outcomes_path: &Path) -> Result<Self> {
        let reader = SqliteDatasetReader::open(decisions_path, outcomes_path)?;
        let mut samples: Vec<LocalSample> = Vec::new();
        // Each row is individually fallible; `?` stops at the first
        // failing row and returns its error to the caller.
        for row in reader.iter_pairs()? {
            samples.push(row?);
        }
        Ok(Self { samples })
    }

    /// Number of paired samples. `O(1)`.
    pub fn n_samples(&self) -> usize {
        self.samples.len()
    }

    /// Iterate over the materialized samples in row order.
    ///
    /// The returned iterator is `Clone`, exact-sized, and
    /// double-ended, so adaptors such as `.cycle()` (which requires
    /// `Clone`), `.len()`, and `.rev()` are available to callers.
    ///
    /// For a shuffled order, collect the samples (or references to
    /// them) into a `Vec` and shuffle that, e.g. via `rand`.
    pub fn iter(
        &self,
    ) -> impl DoubleEndedIterator<Item = &LocalSample> + ExactSizeIterator + Clone {
        self.samples.iter()
    }

    /// Borrow the materialized samples as a slice (useful for
    /// indexing, slicing, and tests).
    pub fn as_slice(&self) -> &[LocalSample] {
        &self.samples
    }
}