use crate::dataset::core::Dataset;
use crate::error::Result;
use crate::types::Float;
#[cfg(feature = "mmap")]
use memmap2::MmapOptions;
#[cfg(feature = "mmap")]
use scirs2_core::ndarray::Array;
#[cfg(feature = "mmap")]
use std::io::Write;
#[cfg(feature = "mmap")]
use std::path::Path;
/// Types that can be written out in this module's memory-mappable dataset format.
#[cfg(feature = "mmap")]
pub trait MmapSerializable {
/// Serializes `self` into `writer`.
///
/// # Errors
///
/// Returns an error when the implementation cannot serialize the value or
/// when writing to `writer` fails.
fn save_mmap_impl<W: Write>(&self, writer: W) -> Result<()>;
}
#[cfg(feature = "mmap")]
impl MmapSerializable
    for Dataset<scirs2_core::ndarray::Array2<Float>, scirs2_core::ndarray::Array1<Float>>
{
    /// Writes the dataset as `header | feature matrix | target vector`.
    ///
    /// The header always occupies exactly `MmapHeader::size()` bytes, so the
    /// feature matrix starts at the `data_offset` recorded in the header. The
    /// matrix is written row by row (standard layout) and the targets follow
    /// immediately after it; both use the platform's native byte order.
    ///
    /// # Errors
    ///
    /// * `ShapeMismatch` when the number of rows differs from the number of targets.
    /// * `Other` when the serialized header does not fit the reserved header block
    ///   or an array cannot be viewed as a contiguous slice.
    /// * `IoError` when writing to `writer` fails.
    fn save_mmap_impl<W: Write>(&self, mut writer: W) -> Result<()> {
        let (n_samples, n_features) = self.data.dim();
        if n_samples != self.target.len() {
            return Err(crate::error::SklearsError::ShapeMismatch {
                expected: format!("data.nrows() == target.len() ({n_samples})"),
                actual: format!(
                    "data.nrows()={}, target.len()={}",
                    n_samples,
                    self.target.len()
                ),
            });
        }

        let float_size = std::mem::size_of::<Float>();
        let header_size = MmapHeader::size();
        let data_size = n_samples * n_features * float_size;
        let data_offset = header_size;
        let target_offset = data_offset + data_size;
        let header = MmapHeader::new(
            n_samples,
            n_features,
            data_offset,
            target_offset,
            0,
            0,
            float_size,
            0,
            self.feature_names.clone(),
            self.target_names.clone(),
            self.description.clone(),
        );

        // Serialize the header into a scratch buffer and pad it to the reserved
        // block size. Without the padding the payload would start right after
        // the last encoded header field instead of at `data_offset`.
        let mut header_bytes = Vec::with_capacity(header_size);
        header.write(&mut header_bytes)?;
        if header_bytes.len() > header_size {
            return Err(crate::error::SklearsError::Other(format!(
                "Serialized header ({} bytes) exceeds the reserved {} bytes",
                header_bytes.len(),
                header_size
            )));
        }
        header_bytes.resize(header_size, 0);
        writer
            .write_all(&header_bytes)
            .map_err(crate::error::SklearsError::IoError)?;

        // Force row-major contiguous storage; the raw buffer of an array with a
        // different memory order would otherwise be written with its elements
        // in the wrong positions.
        let data = self.data.as_standard_layout();
        let data_values = data.as_slice().ok_or_else(|| {
            crate::error::SklearsError::Other(
                "feature matrix is not contiguous in memory".to_string(),
            )
        })?;
        // SAFETY: `data_values` is a live, initialized `&[Float]`; viewing it as
        // bytes is valid because `u8` has alignment 1 and the length is the
        // exact byte size of the slice.
        let data_bytes = unsafe {
            std::slice::from_raw_parts(
                data_values.as_ptr().cast::<u8>(),
                std::mem::size_of_val(data_values),
            )
        };
        writer
            .write_all(data_bytes)
            .map_err(crate::error::SklearsError::IoError)?;

        let target = self.target.as_standard_layout();
        let target_values = target.as_slice().ok_or_else(|| {
            crate::error::SklearsError::Other(
                "target vector is not contiguous in memory".to_string(),
            )
        })?;
        // SAFETY: same reasoning as for `data_bytes` above.
        let target_bytes = unsafe {
            std::slice::from_raw_parts(
                target_values.as_ptr().cast::<u8>(),
                std::mem::size_of_val(target_values),
            )
        };
        writer
            .write_all(target_bytes)
            .map_err(crate::error::SklearsError::IoError)?;
        Ok(())
    }
}
/// Generates a synthetic linear-regression dataset and streams it to `path`
/// in the memory-mappable format, one chunk at a time.
///
/// Features are drawn from a normal distribution with mean 0 and standard
/// deviation 1, the coefficients from a uniform distribution over
/// `[-10, 10)`, and every target is the dot product of its feature row with
/// the coefficients plus noise drawn from a normal distribution with mean 0
/// and standard deviation `noise`.
///
/// # Arguments
///
/// * `path` - destination file, created through `MmapDatasetBuilder`.
/// * `n_samples` - number of rows to generate.
/// * `n_features` - number of columns per row.
/// * `noise` - standard deviation of the additive target noise.
/// * `chunk_size` - rows generated per write; defaults to 1000. A value of 0
///   is treated as 1, because a zero-sized chunk can never make progress.
///
/// # Errors
///
/// Returns an error when a distribution cannot be constructed, when a chunk
/// cannot be shaped, or when creating or writing the output file fails.
#[cfg(feature = "mmap")]
pub fn make_large_regression<P: AsRef<Path>>(
    path: P,
    n_samples: usize,
    n_features: usize,
    noise: f64,
    chunk_size: Option<usize>,
) -> Result<()> {
    use scirs2_core::random::essentials::Uniform;
    use scirs2_core::random::prelude::*;
    use scirs2_core::random::{thread_rng, Distribution};

    // Clamp to one row per chunk: with a chunk size of zero the write loop
    // below would never advance `samples_written` and would spin forever.
    let chunk_size = chunk_size.unwrap_or(1000).max(1);
    let mut rng = thread_rng();
    let normal =
        Normal::new(0.0, 1.0).map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;
    let uniform =
        Uniform::new(-10.0, 10.0).map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;

    let mut coef = Vec::with_capacity(n_features);
    for _ in 0..n_features {
        coef.push(uniform.sample(&mut rng));
    }

    let mut builder = MmapDatasetBuilder::new(n_samples, n_features)
        .description("Large synthetic regression dataset".to_string())
        .build(path)?;
    let noise_dist =
        Normal::new(0.0, noise).map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;

    let mut samples_written = 0;
    while samples_written < n_samples {
        let current_chunk_size = std::cmp::min(chunk_size, n_samples - samples_written);

        // Feature block for this chunk, filled in row-major order.
        let mut x_data = Vec::with_capacity(current_chunk_size * n_features);
        for _ in 0..current_chunk_size * n_features {
            x_data.push(normal.sample(&mut rng));
        }
        let x_chunk = Array::from_shape_vec((current_chunk_size, n_features), x_data)
            .map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;

        // Targets: linear combination of the row plus one noise draw.
        let mut y_data = Vec::with_capacity(current_chunk_size);
        for i in 0..current_chunk_size {
            let mut y_i = 0.0;
            for j in 0..n_features {
                y_i += x_chunk[[i, j]] * coef[j];
            }
            y_i += noise_dist.sample(&mut rng);
            y_data.push(y_i);
        }
        let y_chunk = Array::from_vec(y_data);

        builder.write_chunk(&x_chunk, &y_chunk)?;
        samples_written += current_chunk_size;
    }
    builder.finish()?;
    Ok(())
}
/// Dataset whose feature matrix and target vector are read from a
/// memory-mapped byte region instead of being held in owned arrays.
#[cfg(feature = "mmap")]
#[derive(Debug)]
pub struct MmapDataset {
/// The mapped bytes; the header, features and targets are all read from it.
mmap: memmap2::Mmap,
/// `(n_samples, n_features)` as reported by the header.
shape: (usize, usize),
/// Byte offset into `mmap` at which the feature values start.
data_offset: usize,
/// Byte offset into `mmap` at which the target values start.
target_offset: usize,
/// Feature names taken from the parsed header.
feature_names: Vec<String>,
/// Optional target names taken from the parsed header.
target_names: Option<Vec<String>>,
/// Free-form description taken from the parsed header.
description: String,
}
#[cfg(feature = "mmap")]
impl MmapDataset {
    /// Opens `path`, memory-maps it and parses its header.
    ///
    /// # Errors
    ///
    /// * `IoError` when the file cannot be opened, inspected or mapped.
    /// * `InvalidInput` when the file is smaller than a header, too large to
    ///   address on this platform, or fails header parsing or validation.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = std::fs::File::open(&path).map_err(crate::error::SklearsError::IoError)?;
        let file_len = file
            .metadata()
            .map_err(crate::error::SklearsError::IoError)?
            .len();
        // A plain `as usize` would silently truncate on targets where `usize`
        // is narrower than 64 bits.
        let file_size = usize::try_from(file_len).map_err(|_| {
            crate::error::SklearsError::InvalidInput(format!(
                "File too large to map on this platform: {} bytes",
                file_len
            ))
        })?;
        // Checked before mapping so a short file is reported with a
        // file-specific message.
        if file_size < MmapHeader::size() {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "File too small: {} bytes, minimum: {} bytes",
                file_size,
                MmapHeader::size()
            )));
        }
        // SAFETY: the file is mapped read-only and was just opened
        // successfully. NOTE(review): as with any file mapping, this assumes
        // the file is not truncated or rewritten while the map is alive.
        let mmap = unsafe {
            MmapOptions::new()
                .map(&file)
                .map_err(crate::error::SklearsError::IoError)?
        };
        Self::from_mmap(mmap)
    }

    /// Builds a dataset from an existing memory map by parsing and validating
    /// the header found in its first `MmapHeader::size()` bytes.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` when the map is shorter than a header or the
    /// header fails parsing or validation.
    pub fn from_mmap(mmap: memmap2::Mmap) -> Result<Self> {
        let file_size = mmap.len();
        if file_size < MmapHeader::size() {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Memory map too small: {} bytes, minimum: {} bytes",
                file_size,
                MmapHeader::size()
            )));
        }
        let header = MmapHeader::from_bytes(&mmap[0..MmapHeader::size()])?;
        header.validate(file_size)?;
        Ok(Self {
            mmap,
            shape: (header.n_samples, header.n_features),
            data_offset: header.data_offset,
            target_offset: header.target_offset,
            feature_names: header.feature_names,
            target_names: header.target_names,
            description: header.description,
        })
    }

    /// Returns `(n_samples, n_features)`.
    pub fn shape(&self) -> (usize, usize) {
        self.shape
    }

    /// Returns the number of samples (rows).
    pub fn n_samples(&self) -> usize {
        self.shape.0
    }

    /// Returns the number of features (columns).
    pub fn n_features(&self) -> usize {
        self.shape.1
    }

    /// Returns the feature names stored with the dataset.
    pub fn feature_names(&self) -> &[String] {
        &self.feature_names
    }

    /// Returns the target names, if any were stored.
    pub fn target_names(&self) -> Option<&[String]> {
        self.target_names.as_deref()
    }

    /// Returns the dataset description.
    pub fn description(&self) -> &str {
        &self.description
    }

    /// Returns an iterator that yields the dataset in consecutive batches of
    /// at most `batch_size` samples, starting from the first sample.
    pub fn batch_iter(&self, batch_size: usize) -> MmapBatchIterator<'_> {
        MmapBatchIterator {
            dataset: self,
            batch_size,
            current_offset: 0,
        }
    }

    /// Returns a copy of the feature row and the target of sample `sample_idx`.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` when `sample_idx` is not smaller than
    /// `n_samples()`, or when the requested bytes fall outside the mapping
    /// (for example because the header describes more data than the file holds).
    pub fn get_sample(&self, sample_idx: usize) -> Result<(Vec<Float>, Float)> {
        if sample_idx >= self.n_samples() {
            // Report the sample count rather than `n_samples() - 1`, which
            // underflows when the dataset is empty.
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Sample index {} out of bounds (n_samples: {})",
                sample_idx,
                self.n_samples()
            )));
        }
        let n_features = self.n_features();
        // A saturated product is rejected by the checked arithmetic in `read_floats`.
        let features = self.read_floats(
            self.data_offset,
            sample_idx.saturating_mul(n_features),
            n_features,
        )?;
        let target = self
            .read_floats(self.target_offset, sample_idx, 1)?
            .first()
            .copied()
            .ok_or_else(|| {
                crate::error::SklearsError::InvalidInput(format!(
                    "No target value stored for sample {}",
                    sample_idx
                ))
            })?;
        Ok((features, target))
    }

    /// Decodes `count` consecutive values that start `first_elem` elements
    /// after byte offset `base` of the mapping.
    ///
    /// All offsets are computed with checked arithmetic and the bytes are
    /// fetched with a bounds-checked lookup, so inconsistent offsets produce
    /// an error instead of a panic. Values are decoded from native-endian
    /// bytes without assuming the region is aligned for `Float`.
    fn read_floats(&self, base: usize, first_elem: usize, count: usize) -> Result<Vec<Float>> {
        const FLOAT_SIZE: usize = std::mem::size_of::<Float>();
        let byte_range = first_elem
            .checked_mul(FLOAT_SIZE)
            .and_then(|skip| base.checked_add(skip))
            .and_then(|start| {
                let len = count.checked_mul(FLOAT_SIZE)?;
                Some(start..start.checked_add(len)?)
            });
        let bytes = byte_range
            .and_then(|range| self.mmap.get(range))
            .ok_or_else(|| {
                crate::error::SklearsError::InvalidInput(format!(
                    "Requested {} values at element {} (base offset {}) lie outside the {}-byte mapping",
                    count,
                    first_elem,
                    base,
                    self.mmap.len()
                ))
            })?;
        Ok(bytes
            .chunks_exact(FLOAT_SIZE)
            .map(|chunk| {
                let mut raw = [0u8; FLOAT_SIZE];
                raw.copy_from_slice(chunk);
                Float::from_ne_bytes(raw)
            })
            .collect())
    }
}
/// Iterator over consecutive batches of an [`MmapDataset`].
#[cfg(feature = "mmap")]
pub struct MmapBatchIterator<'a> {
/// Dataset the batches are read from.
dataset: &'a MmapDataset,
/// Upper bound on the number of samples per batch.
batch_size: usize,
/// Index of the first sample of the next batch.
current_offset: usize,
}
#[cfg(feature = "mmap")]
impl<'a> Iterator for MmapBatchIterator<'a> {
    type Item = Result<(
        scirs2_core::ndarray::Array2<Float>,
        scirs2_core::ndarray::Array1<Float>,
    )>;

    /// Yields the next `(features, targets)` batch, or `None` once every
    /// sample has been visited.
    ///
    /// The final batch may hold fewer than `batch_size` samples. A
    /// `batch_size` of zero cannot make progress, so it produces a single
    /// `InvalidInput` error and then ends the iteration instead of yielding
    /// empty batches forever.
    fn next(&mut self) -> Option<Self::Item> {
        let total = self.dataset.n_samples();
        if self.current_offset >= total {
            return None;
        }
        if self.batch_size == 0 {
            // Mark the iterator as exhausted so the error is reported only once.
            self.current_offset = total;
            return Some(Err(crate::error::SklearsError::InvalidInput(
                "batch_size must be greater than zero".to_string(),
            )));
        }
        let current_batch_size = self.batch_size.min(total - self.current_offset);
        let batch = self.read_batch(current_batch_size);
        // Advance even when the read failed so a persistent error cannot loop forever.
        self.current_offset += current_batch_size;
        Some(batch)
    }
}
#[cfg(feature = "mmap")]
impl<'a> MmapBatchIterator<'a> {
    /// Copies `batch_size` samples, starting at `current_offset`, out of the
    /// mapping into an owned `(batch_size, n_features)` matrix and a target
    /// vector of length `batch_size`.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` when the requested bytes fall outside the
    /// mapping and `Other` when the decoded values cannot be shaped.
    fn read_batch(
        &self,
        batch_size: usize,
    ) -> Result<(
        scirs2_core::ndarray::Array2<Float>,
        scirs2_core::ndarray::Array1<Float>,
    )> {
        let n_features = self.dataset.n_features();
        // Saturated products are rejected by the checked arithmetic in `read_floats`.
        let features_data = self.read_floats(
            self.dataset.data_offset,
            self.current_offset.saturating_mul(n_features),
            batch_size.saturating_mul(n_features),
        )?;
        let features = Array::from_shape_vec((batch_size, n_features), features_data)
            .map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;
        let targets_data =
            self.read_floats(self.dataset.target_offset, self.current_offset, batch_size)?;
        Ok((features, Array::from_vec(targets_data)))
    }

    /// Decodes `count` consecutive values that start `first_elem` elements
    /// after byte offset `base` of the dataset's mapping.
    ///
    /// Offsets use checked arithmetic and the bytes are fetched with a
    /// bounds-checked lookup, so inconsistent offsets yield an error instead
    /// of a panic. Values are decoded from native-endian bytes without
    /// assuming the region is aligned for `Float`.
    fn read_floats(&self, base: usize, first_elem: usize, count: usize) -> Result<Vec<Float>> {
        const FLOAT_SIZE: usize = std::mem::size_of::<Float>();
        let mapping = &self.dataset.mmap;
        let byte_range = first_elem
            .checked_mul(FLOAT_SIZE)
            .and_then(|skip| base.checked_add(skip))
            .and_then(|start| {
                let len = count.checked_mul(FLOAT_SIZE)?;
                Some(start..start.checked_add(len)?)
            });
        let bytes = byte_range
            .and_then(|range| mapping.get(range))
            .ok_or_else(|| {
                crate::error::SklearsError::InvalidInput(format!(
                    "Requested {} values at element {} (base offset {}) lie outside the {}-byte mapping",
                    count,
                    first_elem,
                    base,
                    mapping.len()
                ))
            })?;
        Ok(bytes
            .chunks_exact(FLOAT_SIZE)
            .map(|chunk| {
                let mut raw = [0u8; FLOAT_SIZE];
                raw.copy_from_slice(chunk);
                Float::from_ne_bytes(raw)
            })
            .collect())
    }
}
/// Incremental writer that fills a pre-sized dataset file chunk by chunk.
#[cfg(feature = "mmap")]
pub struct MmapDatasetBuilder {
/// Destination file that chunks are written into.
file: std::fs::File,
/// Number of samples written so far.
written_samples: usize,
/// Number of samples the file was sized for.
total_samples: usize,
/// Number of features every written sample must have.
n_features: usize,
/// Byte offset in `file` at which the feature values start.
data_offset: usize,
/// Byte offset in `file` at which the target values start.
target_offset: usize,
}
#[cfg(feature = "mmap")]
impl MmapDatasetBuilder {
    /// Starts configuring a dataset file for `total_samples` rows of
    /// `n_features` columns; call `build` on the returned configuration to
    /// create the file and obtain the builder itself.
    // `new` intentionally returns the configuration type rather than `Self`.
    #[allow(clippy::new_ret_no_self)]
    pub fn new(total_samples: usize, n_features: usize) -> MmapDatasetBuilderConfig {
        MmapDatasetBuilderConfig {
            total_samples,
            n_features,
            feature_names: Vec::new(),
            target_names: None,
            description: String::new(),
        }
    }

    /// Appends one chunk of samples after the samples written so far.
    ///
    /// Feature rows are stored in row-major order regardless of the memory
    /// layout of `features`.
    ///
    /// # Errors
    ///
    /// * `ShapeMismatch` when the column count differs from the configured
    ///   feature count or the row count differs from `targets.len()`.
    /// * `InvalidInput` when the chunk would exceed the configured capacity.
    /// * `Other` when an array cannot be viewed as a contiguous slice.
    /// * `IoError` when writing to the file fails.
    pub fn write_chunk(
        &mut self,
        features: &scirs2_core::ndarray::Array2<Float>,
        targets: &scirs2_core::ndarray::Array1<Float>,
    ) -> Result<()> {
        let (batch_samples, batch_features) = features.dim();
        if batch_features != self.n_features {
            return Err(crate::error::SklearsError::ShapeMismatch {
                expected: format!("n_features={}", self.n_features),
                actual: format!("batch_features={}", batch_features),
            });
        }
        if batch_samples != targets.len() {
            return Err(crate::error::SklearsError::ShapeMismatch {
                expected: format!("batch_samples={}", batch_samples),
                actual: format!("targets.len()={}", targets.len()),
            });
        }
        if self.written_samples + batch_samples > self.total_samples {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Writing {} samples would exceed total capacity of {}",
                self.written_samples + batch_samples,
                self.total_samples
            )));
        }

        let feature_size = std::mem::size_of::<Float>();
        let features_start =
            self.data_offset + self.written_samples * self.n_features * feature_size;
        let targets_start = self.target_offset + self.written_samples * feature_size;

        // Normalise to row-major contiguous storage: reading the raw buffer of
        // an array with a different memory order or stride would store the
        // values in the wrong positions.
        let feature_view = features.as_standard_layout();
        let feature_values = feature_view.as_slice().ok_or_else(|| {
            crate::error::SklearsError::Other(
                "feature chunk is not contiguous in memory".to_string(),
            )
        })?;
        let target_view = targets.as_standard_layout();
        let target_values = target_view.as_slice().ok_or_else(|| {
            crate::error::SklearsError::Other(
                "target chunk is not contiguous in memory".to_string(),
            )
        })?;

        self.write_floats_at(features_start, feature_values)?;
        self.write_floats_at(targets_start, target_values)?;
        // Only count the chunk once both regions were written successfully.
        self.written_samples += batch_samples;
        Ok(())
    }

    /// Verifies that every expected sample was written and flushes the file
    /// to disk.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` when fewer samples than configured were written
    /// and `IoError` when syncing the file fails.
    pub fn finish(self) -> Result<()> {
        if self.written_samples != self.total_samples {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Dataset incomplete: wrote {} samples, expected {}",
                self.written_samples, self.total_samples
            )));
        }
        self.file
            .sync_all()
            .map_err(crate::error::SklearsError::IoError)?;
        Ok(())
    }

    /// Writes the native-endian bytes of `values` at absolute byte `offset`.
    ///
    /// Uses seek + write instead of the Unix-only positional write so the
    /// module also builds on non-Unix targets.
    fn write_floats_at(&mut self, offset: usize, values: &[Float]) -> Result<()> {
        use std::io::{Seek, SeekFrom, Write};

        // SAFETY: `values` is a live, initialized `&[Float]`; viewing it as
        // bytes is valid because `u8` has alignment 1 and the length is the
        // exact byte size of the slice.
        let bytes = unsafe {
            std::slice::from_raw_parts(
                values.as_ptr().cast::<u8>(),
                std::mem::size_of_val(values),
            )
        };
        self.file
            .seek(SeekFrom::Start(offset as u64))
            .map_err(crate::error::SklearsError::IoError)?;
        self.file
            .write_all(bytes)
            .map_err(crate::error::SklearsError::IoError)
    }
}
/// Settings collected before the dataset file is created by `build`.
#[cfg(feature = "mmap")]
pub struct MmapDatasetBuilderConfig {
/// Number of samples the file will be sized for.
total_samples: usize,
/// Number of features per sample.
n_features: usize,
/// Feature names handed to the header; empty unless set.
feature_names: Vec<String>,
/// Target names handed to the header; `None` unless set.
target_names: Option<Vec<String>>,
/// Description handed to the header; empty unless set.
description: String,
}
#[cfg(feature = "mmap")]
impl MmapDatasetBuilderConfig {
    /// Sets the feature names passed to the header.
    pub fn feature_names(mut self, names: Vec<String>) -> Self {
        self.feature_names = names;
        self
    }

    /// Sets the target names passed to the header.
    pub fn target_names(mut self, names: Vec<String>) -> Self {
        self.target_names = Some(names);
        self
    }

    /// Sets the description passed to the header.
    pub fn description(mut self, description: String) -> Self {
        self.description = description;
        self
    }

    /// Creates the file at `path`, sizes it for the header plus every sample,
    /// writes the header and returns a builder positioned before the first sample.
    ///
    /// # Errors
    ///
    /// * `InvalidInput` when the requested dimensions do not fit in `usize`;
    ///   this is checked before the file is created or truncated.
    /// * `IoError` when the file cannot be created, resized or written.
    pub fn build<P: AsRef<Path>>(self, path: P) -> Result<MmapDatasetBuilder> {
        let header_size = MmapHeader::size();
        let feature_size = std::mem::size_of::<Float>();
        let total_samples = self.total_samples;
        let n_features = self.n_features;

        // Checked arithmetic: in release builds a wrapped product would size
        // the file, and place the target region, incorrectly without any error.
        let data_offset = header_size;
        let (target_offset, total_size) = total_samples
            .checked_mul(n_features)
            .and_then(|cells| cells.checked_mul(feature_size))
            .and_then(|data_size| {
                let target_size = total_samples.checked_mul(feature_size)?;
                let target_offset = data_offset.checked_add(data_size)?;
                let total_size = target_offset.checked_add(target_size)?;
                Some((target_offset, total_size))
            })
            .ok_or_else(|| {
                crate::error::SklearsError::InvalidInput(format!(
                    "Dataset of {} samples x {} features is too large to lay out",
                    total_samples, n_features
                ))
            })?;

        let file = std::fs::File::create(&path).map_err(crate::error::SklearsError::IoError)?;
        file.set_len(total_size as u64)
            .map_err(crate::error::SklearsError::IoError)?;

        let header = MmapHeader::new(
            total_samples,
            n_features,
            data_offset,
            target_offset,
            0,
            0,
            feature_size,
            0,
            self.feature_names,
            self.target_names,
            self.description,
        );
        // `&File` implements `Write`, so the header can be written without
        // giving up ownership of the file.
        let mut file_writer = &file;
        header.write(&mut file_writer)?;

        Ok(MmapDatasetBuilder {
            file,
            written_samples: 0,
            total_samples,
            n_features,
            data_offset,
            target_offset,
        })
    }
}
/// In-memory form of the fixed-size header stored at the start of a dataset file.
#[cfg(feature = "mmap")]
struct MmapHeader {
/// File-type tag; `new` always sets it to `MmapHeader::MAGIC`.
magic: [u8; 4],
/// Format version; `new` always sets it to `MmapHeader::VERSION`.
version: u32,
/// Number of samples (rows).
n_samples: usize,
/// Number of features (columns).
n_features: usize,
/// Byte offset of the feature values within the file.
data_offset: usize,
/// Byte offset of the target values within the file.
target_offset: usize,
/// Not read anywhere in the visible code; every caller in this file passes 0.
#[allow(dead_code)]
metadata_offset: usize,
/// Not read anywhere in the visible code; every caller in this file passes 0.
#[allow(dead_code)]
metadata_size: usize,
/// Size in bytes of one stored value; used by `validate` to size the regions.
data_type_size: usize,
/// Not read anywhere in the visible code; every caller in this file passes 0.
#[allow(dead_code)]
checksum: u64,
/// Names of the features.
feature_names: Vec<String>,
/// Optional names of the targets.
target_names: Option<Vec<String>>,
/// Free-form dataset description.
description: String,
}
#[cfg(feature = "mmap")]
impl MmapHeader {
    /// Tag stored in the first four bytes of every dataset file.
    const MAGIC: [u8; 4] = *b"SKML";
    /// Format version written after the magic tag.
    const VERSION: u32 = 1;

    /// Assembles a header from its parts; `magic` and `version` are always
    /// set to the current constants.
    // One argument per header field keeps call sites explicit.
    #[allow(clippy::too_many_arguments)]
    fn new(
        n_samples: usize,
        n_features: usize,
        data_offset: usize,
        target_offset: usize,
        metadata_offset: usize,
        metadata_size: usize,
        data_type_size: usize,
        checksum: u64,
        feature_names: Vec<String>,
        target_names: Option<Vec<String>>,
        description: String,
    ) -> Self {
        Self {
            magic: Self::MAGIC,
            version: Self::VERSION,
            n_samples,
            n_features,
            data_offset,
            target_offset,
            metadata_offset,
            metadata_size,
            data_type_size,
            checksum,
            feature_names,
            target_names,
            description,
        }
    }

    /// Size in bytes of the header block reserved at the start of a file.
    fn size() -> usize {
        1024
    }

    /// Writes the header as one block of exactly `size()` bytes.
    ///
    /// Layout (little-endian): magic `[0..4]`, version `u32` `[4..8]`, then
    /// `u64` values for `n_samples`, `n_features`, `data_offset`,
    /// `target_offset`, `metadata_offset`, `metadata_size`, `data_type_size`
    /// and `checksum` at `[8..72]`; the remainder is zero padding, so payload
    /// written right after the header starts at byte `size()`.
    ///
    /// NOTE(review): `feature_names`, `target_names` and `description` are
    /// not encoded; no on-disk layout for them is defined in this file.
    ///
    /// # Errors
    ///
    /// Returns `IoError` when writing to `writer` fails.
    fn write<W: Write>(&self, mut writer: W) -> Result<()> {
        let mut block = Vec::with_capacity(Self::size());
        block.extend_from_slice(&self.magic);
        block.extend_from_slice(&self.version.to_le_bytes());
        // Sizes are widened to `u64` so the layout does not depend on the
        // pointer width of the writing platform.
        let words = [
            self.n_samples as u64,
            self.n_features as u64,
            self.data_offset as u64,
            self.target_offset as u64,
            self.metadata_offset as u64,
            self.metadata_size as u64,
            self.data_type_size as u64,
            self.checksum,
        ];
        for word in words.iter() {
            block.extend_from_slice(&word.to_le_bytes());
        }
        block.resize(Self::size(), 0);
        writer
            .write_all(&block)
            .map_err(crate::error::SklearsError::IoError)
    }

    /// Parses a header block produced by `write`.
    ///
    /// Headers that only carry magic, version, `n_samples` and `n_features`
    /// (all layout fields zero) are interpreted with the default layout:
    /// features directly after the header block, targets directly after the
    /// features, values of `size_of::<Float>()` bytes.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` when `bytes` is shorter than `size()`, the
    /// magic or version does not match, a field does not fit in `usize`, or
    /// the stored value size differs from `size_of::<Float>()`.
    fn from_bytes(bytes: &[u8]) -> Result<Self> {
        if bytes.len() < Self::size() {
            return Err(crate::error::SklearsError::InvalidInput(
                "Header bytes too short".to_string(),
            ));
        }
        let magic = [bytes[0], bytes[1], bytes[2], bytes[3]];
        if magic != Self::MAGIC {
            return Err(crate::error::SklearsError::InvalidInput(
                "Invalid magic number".to_string(),
            ));
        }
        let version = u32::from_le_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]);
        if version != Self::VERSION {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Unsupported header version {}, expected {}",
                version,
                Self::VERSION
            )));
        }

        // The length check above guarantees every fixed field is in range.
        let word_at = |offset: usize| -> u64 {
            let mut raw = [0u8; 8];
            raw.copy_from_slice(&bytes[offset..offset + 8]);
            u64::from_le_bytes(raw)
        };
        let size_at = |offset: usize| -> Result<usize> {
            usize::try_from(word_at(offset)).map_err(|_| {
                crate::error::SklearsError::InvalidInput(format!(
                    "Header field at byte {} does not fit in usize",
                    offset
                ))
            })
        };

        let n_samples = size_at(8)?;
        let n_features = size_at(16)?;
        let mut data_offset = size_at(24)?;
        let mut target_offset = size_at(32)?;
        let metadata_offset = size_at(40)?;
        let metadata_size = size_at(48)?;
        let mut data_type_size = size_at(56)?;
        let checksum = word_at(64);

        let float_size = std::mem::size_of::<Float>();
        if data_offset == 0 && target_offset == 0 && data_type_size == 0 {
            // Header without layout fields: fall back to the layout the
            // builder in this file produces.
            data_type_size = float_size;
            data_offset = Self::size();
            target_offset = n_samples
                .checked_mul(n_features)
                .and_then(|cells| cells.checked_mul(data_type_size))
                .and_then(|data_bytes| data_offset.checked_add(data_bytes))
                .ok_or_else(|| {
                    crate::error::SklearsError::InvalidInput(
                        "Header dimensions overflow the addressable size".to_string(),
                    )
                })?;
        }
        // Readers in this file step through the payload in units of
        // `size_of::<Float>()`, so any other element size cannot be decoded.
        if data_type_size != float_size {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Unsupported element size {} bytes, expected {}",
                data_type_size, float_size
            )));
        }

        Ok(Self::new(
            n_samples,
            n_features,
            data_offset,
            target_offset,
            metadata_offset,
            metadata_size,
            data_type_size,
            checksum,
            Vec::new(),
            None,
            String::new(),
        ))
    }

    /// Checks that a file of `file_size` bytes can hold the regions this
    /// header describes.
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` when the sizes overflow `usize`, when the file
    /// is smaller than `data_offset` plus the feature and target bytes, or
    /// when the target region extends past the end of the file.
    fn validate(&self, file_size: usize) -> Result<()> {
        let overflow = || {
            crate::error::SklearsError::InvalidInput(
                "Header dimensions overflow the addressable size".to_string(),
            )
        };
        let target_bytes = self
            .n_samples
            .checked_mul(self.data_type_size)
            .ok_or_else(overflow)?;
        let data_bytes = self
            .n_samples
            .checked_mul(self.n_features)
            .and_then(|cells| cells.checked_mul(self.data_type_size))
            .ok_or_else(overflow)?;
        let expected_size = self
            .data_offset
            .checked_add(data_bytes)
            .and_then(|end| end.checked_add(target_bytes))
            .ok_or_else(overflow)?;
        if file_size < expected_size {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "File size {} too small, expected at least {}",
                file_size, expected_size
            )));
        }
        // The total above does not involve `target_offset`, so make sure the
        // target region itself also lies inside the file.
        let target_end = self
            .target_offset
            .checked_add(target_bytes)
            .ok_or_else(overflow)?;
        if file_size < target_end {
            return Err(crate::error::SklearsError::InvalidInput(format!(
                "Target region ends at byte {} but file size is {}",
                target_end, file_size
            )));
        }
        Ok(())
    }
}