use std::path::PathBuf;
use ndarray::{Axis, Ix1};
use ndarray_rand::rand::{SeedableRng, seq::SliceRandom};
use polars::prelude::*;
use crate::core::types::{Matrix, Vector};
/// Reads the CSV file at `path` into a polars [`DataFrame`].
///
/// The reader is built from `CsvReadOptions::default()`, so no custom
/// parsing options are applied.
///
/// # Errors
///
/// Returns the error reported by polars if the reader cannot be created for
/// `path` or if reading the file into a frame fails.
pub fn load_dataset(path: PathBuf) -> PolarsResult<DataFrame> {
    let options = CsvReadOptions::default();
    let reader = options.try_into_reader_with_file_path(Some(path))?;
    reader.finish()
}
#[cfg(test)]
mod load_dataset_tests {
    use crate::utils::data::load_dataset;
    use std::path::PathBuf;

    /// Loading `./datasets/advertising.csv` (resolved against the directory
    /// the tests run from) must succeed.
    #[test]
    fn test_load_dataset_existing_path() {
        let csv_path = PathBuf::from("./datasets/advertising.csv");
        // Printed to help diagnose a wrong working directory when the test fails.
        println!("path exists: {:?}", csv_path.exists());
        assert!(load_dataset(csv_path).is_ok());
    }

    /// A path that does not exist must surface as an `Err`, not a panic.
    #[test]
    fn test_load_dataset_with_non_existing_path() {
        let missing_path = PathBuf::from("./data/non_existing.csv");
        assert!(load_dataset(missing_path).is_err());
    }
}
/// Randomly partitions the rows of `x` and the matching entries of `y` into
/// a training set and a test set.
///
/// A permutation of the row indices `0..x.nrows()` is drawn from a `StdRng`
/// seeded with `seed`. The first `round(x.nrows() * train_perc)` shuffled
/// indices select the training rows; the remaining indices select the test
/// rows. The same indices are applied to `x` (along axis 0) and to `y`, so
/// samples and targets stay aligned.
///
/// # Arguments
///
/// * `x` - Sample matrix; one row per sample.
/// * `y` - Target vector, indexed with the same row indices as `x`.
/// * `train_perc` - Fraction of the rows assigned to the training set.
///   Values that would produce a split point beyond the number of rows
///   (for example `train_perc > 1.0`) put every row in the training set;
///   negative or NaN values put every row in the test set.
/// * `seed` - Seed for the random number generator; the same seed yields the
///   same split.
///
/// # Returns
///
/// The tuple `(x_train, y_train, x_test, y_test)`.
///
/// # Panics
///
/// Panics if `y` has fewer elements than `x` has rows, because a shuffled
/// row index is then out of bounds for `y`.
pub fn shuffle_split(
    x: &Matrix,
    y: &Vector,
    train_perc: f64,
    seed: i32,
) -> (Matrix, Vector, Matrix, Vector) {
    // The `as` cast sign-extends negative seeds. It is kept as-is so that
    // every existing seed keeps producing the same split.
    let mut rng = ndarray_rand::rand::rngs::StdRng::seed_from_u64(seed as u64);

    let n_samples = x.nrows();
    let indices: Vec<usize> = (0..n_samples).collect();
    // Choosing all `n_samples` elements yields the indices in random order.
    let shuffled_indices: Vec<usize> = indices
        .choose_multiple(&mut rng, n_samples)
        .copied()
        .collect();

    // The float-to-usize cast saturates (negative and NaN become 0). The
    // upper bound is enforced explicitly so that `train_perc > 1.0` cannot
    // produce a split point past the end of `shuffled_indices`.
    let split_index = ((n_samples as f64 * train_perc).round() as usize).min(n_samples);
    let (train_indices, test_indices) = shuffled_indices.split_at(split_index);

    let x_train = x.select(Axis(0), train_indices);
    let y_train = y.select(Axis(0), train_indices);
    let x_test = x.select(Axis(0), test_indices);
    let y_test = y.select(Axis(0), test_indices);

    (x_train, y_train, x_test, y_test)
}
#[cfg(test)]
mod shuffle_split_tests {
    use crate::core::types::{Matrix, Vector};
    use crate::utils::data::shuffle_split;
    use ndarray::{arr1, arr2};

    /// Builds the 10-sample fixture shared by the tests in this module:
    /// a 10x2 sample matrix and a 10-element target vector.
    fn sample_data() -> (Matrix, Vector) {
        let x = arr2(&[
            [1.0, 2.0],
            [3.0, 4.0],
            [5.0, 6.0],
            [7.0, 8.0],
            [9.0, 10.0],
            [11.0, 12.0],
            [13.0, 14.0],
            [15.0, 16.0],
            [17.0, 18.0],
            [19.0, 20.0],
        ]);
        let y = arr1(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
        (x, y)
    }

    /// A 0.7 split of 10 samples yields 7 training and 3 test samples.
    #[test]
    fn test_shuffle_split_train_test_ratio() {
        let (x, y) = sample_data();

        let (x_train, y_train, x_test, y_test) = shuffle_split(&x, &y, 0.7, 42);

        assert_eq!(x_train.nrows(), 7);
        assert_eq!(y_train.len(), 7);
        assert_eq!(x_test.nrows(), 3);
        assert_eq!(y_test.len(), 3);
    }

    /// Two different seeds must not produce the same ordered training set.
    #[test]
    fn test_shuffle_split_returns_sets_in_random_order() {
        let (x, y) = sample_data();

        let (x_train_1, y_train_1, _, _) = shuffle_split(&x, &y, 0.7, 42);
        let (x_train_2, y_train_2, _, _) = shuffle_split(&x, &y, 0.7, 100);

        // The sets differ as soon as any row position holds a different sample.
        let sets_are_different =
            (0..x_train_1.nrows()).any(|i| x_train_1.row(i) != x_train_2.row(i));

        assert_eq!(x_train_1.nrows(), 7);
        assert_eq!(x_train_2.nrows(), 7);
        assert_eq!(y_train_1.len(), 7);
        assert_eq!(y_train_2.len(), 7);
        assert!(
            sets_are_different,
            "Training sets should be different when using different seeds"
        );
    }
}
/// Extracts a feature matrix and a target vector from `df`.
///
/// The `features` columns are selected and converted to a two-dimensional
/// `f64` array; the `target` column is selected, converted the same way and
/// returned as its single column.
///
/// # Arguments
///
/// * `df` - Frame to read the columns from.
/// * `features` - Names of the columns that form the feature matrix, in the
///   order they should appear.
/// * `target` - Name of the column that forms the target vector.
///
/// # Errors
///
/// Returns the polars error if selecting the `features` or `target` columns
/// fails, or if the selected columns cannot be converted to an `f64` array.
pub fn get_features_and_target(
    df: &DataFrame,
    features: Vec<&str>,
    target: &str,
) -> PolarsResult<(Matrix, Vector)> {
    // Propagate polars failures to the caller instead of panicking: the
    // signature already promises a `PolarsResult`.
    let x = df
        .select(features)?
        .to_ndarray::<Float64Type>(IndexOrder::Fortran)?;

    let y = df
        .select([target])?
        .to_ndarray::<Float64Type>(IndexOrder::Fortran)?
        .column(0)
        .to_owned()
        .into_dimensionality::<Ix1>()
        .expect("a single column of a 2-D array is always one-dimensional");

    Ok((x, y))
}
#[cfg(test)]
mod get_features_and_target_tests {
    use crate::utils::data::{get_features_and_target, load_dataset};
    use std::path::PathBuf;

    /// Splitting `./datasets/advertising.csv` into three feature columns and
    /// the `Sales` target must give a 200x3 matrix and a 200-element vector.
    #[test]
    fn test_get_features_and_target() {
        let df = load_dataset(PathBuf::from("./datasets/advertising.csv")).unwrap();

        let (x, y) =
            get_features_and_target(&df, vec!["TV", "Radio", "Newspaper"], "Sales").unwrap();

        assert_eq!((x.nrows(), x.ncols()), (200, 3));
        assert_eq!(y.len(), 200);
    }
}