use ndarray::{
s, stack, Array1, Array2, ArrayBase, ArrayView2, ArrayViewMut2, Axis, Data, DataMut, Dimension,
Ix1, Ix2,
};
use rand::{seq::SliceRandom, Rng};
use std::collections::HashMap;
use super::{
super::traits::{Predict, PredictRef},
iter::{ChunksIter, DatasetIter, Iter},
AsTargets, AsTargetsMut, CountedTargets, Dataset, DatasetBase, DatasetView, Float,
FromTargetArray, Label, Labels, Records, Result,
};
impl<R: Records, S> DatasetBase<R, S> {
pub fn new<T: IntoTargets<S>>(records: R, targets: T) -> DatasetBase<R, S> {
let targets = targets.into();
DatasetBase {
records,
targets,
weights: Array1::zeros(0),
feature_names: Vec::new(),
}
}
pub fn targets(&self) -> &S {
&self.targets
}
pub fn weights(&self) -> Option<&[f32]> {
if !self.weights.is_empty() {
Some(self.weights.as_slice().unwrap())
} else {
None
}
}
pub fn weight_for(&self, idx: usize) -> f32 {
self.weights.get(idx).copied().unwrap_or(1.0)
}
pub fn feature_names(&self) -> Vec<String> {
if !self.feature_names.is_empty() {
self.feature_names.clone()
} else {
(0..self.records.nfeatures())
.map(|idx| format!("feature-{}", idx))
.collect()
}
}
pub fn records(&self) -> &R {
&self.records
}
pub fn with_records<T: Records>(self, records: T) -> DatasetBase<T, S> {
DatasetBase {
records,
targets: self.targets,
weights: Array1::zeros(0),
feature_names: Vec::new(),
}
}
pub fn with_targets<T>(self, targets: T) -> DatasetBase<R, T> {
DatasetBase {
records: self.records,
targets,
weights: self.weights,
feature_names: self.feature_names,
}
}
pub fn with_weights(mut self, weights: Array1<f32>) -> DatasetBase<R, S> {
self.weights = weights;
self
}
pub fn with_feature_names<I: Into<String>>(mut self, names: Vec<I>) -> DatasetBase<R, S> {
let feature_names = names.into_iter().map(|x| x.into()).collect();
self.feature_names = feature_names;
self
}
}
/// Target transformations for datasets whose targets can be viewed as a 2D array.
impl<L, R: Records, T: AsTargets<Elem = L>> DatasetBase<R, T> {
    /// Applies `fnc` to every target element and returns a dataset with the mapped
    /// targets stored as an owned `Array2`.
    ///
    /// Records, weights and feature names are moved over unchanged.
    pub fn map_targets<S, G: FnMut(&L) -> S>(self, fnc: G) -> DatasetBase<R, Array2<S>> {
        // Map first while the targets are still borrowed, then move the other fields out.
        let mapped = self.targets.as_multi_targets().map(fnc);
        DatasetBase {
            records: self.records,
            targets: mapped,
            weights: self.weights,
            feature_names: self.feature_names,
        }
    }

    /// Returns the number of target columns, i.e. the length of axis 1 of the targets.
    pub fn ntargets(&self) -> usize {
        let targets = self.targets.as_multi_targets();
        targets.shape()[1]
    }
}
impl<'a, F: Float, L, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
where
    D: Data<Elem = F>,
    T: AsTargets<Elem = L>,
{
    /// Creates an [`Iter`] from a view of the records and the 2D view of the targets.
    pub fn sample_iter(&'a self) -> Iter<'a, '_, F, T::Elem> {
        let records = self.records.view();
        let targets = self.targets.as_multi_targets();
        Iter::new(records, targets)
    }
}
/// Borrowed views and column-wise iterators.
impl<'a, F: Float, L: 'a, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
where
    D: Data<Elem = F>,
    T: AsTargets<Elem = L> + FromTargetArray<'a, L>,
{
    /// Returns a dataset that borrows the records and targets of `self`.
    ///
    /// Weights and feature names are cloned into the returned dataset.
    pub fn view(&'a self) -> DatasetBase<ArrayView2<'a, F>, T::View> {
        let targets = T::new_targets_view(self.as_multi_targets());
        DatasetBase::new(self.records().view(), targets)
            .with_weights(self.weights.clone())
            .with_feature_names(self.feature_names.clone())
    }

    /// Creates a [`DatasetIter`] over `self` with its flag set to `true`.
    pub fn feature_iter(&'a self) -> DatasetIter<'a, '_, ArrayBase<D, Ix2>, T> {
        let by_feature = true;
        DatasetIter::new(self, by_feature)
    }

    /// Creates a [`DatasetIter`] over `self` with its flag set to `false`.
    pub fn target_iter(&'a self) -> DatasetIter<'a, '_, ArrayBase<D, Ix2>, T> {
        let by_feature = false;
        DatasetIter::new(self, by_feature)
    }
}
impl<L, R: Records, T: AsTargets<Elem = L>> AsTargets for DatasetBase<R, T> {
type Elem = L;
fn as_multi_targets(&self) -> ArrayView2<'_, Self::Elem> {
self.targets.as_multi_targets()
}
}
impl<L, R: Records, T: AsTargetsMut<Elem = L>> AsTargetsMut for DatasetBase<R, T> {
type Elem = L;
fn as_multi_targets_mut(&mut self) -> ArrayViewMut2<'_, Self::Elem> {
self.targets.as_multi_targets_mut()
}
}
// The returned tuple of two fully spelled-out dataset view types trips clippy's complexity lint.
#[allow(clippy::type_complexity)]
impl<'a, L: 'a, F: Float, T> DatasetBase<ArrayView2<'a, F>, T>
where
    T: AsTargets<Elem = L> + FromTargetArray<'a, L>,
{
    /// Splits a dataset view into two views along the sample axis.
    ///
    /// The first view receives `ceil(nsamples * ratio)` samples, capped at `nsamples`, and
    /// the second view receives the rest. Feature names are cloned into both halves.
    /// Weights are split at the same index when there is exactly one weight per sample;
    /// otherwise both halves get an empty weight array.
    ///
    /// The cap means a `ratio` above `1.0`, or a product that overshoots because of `f32`
    /// rounding, yields `(everything, empty)` instead of panicking inside `split_at`.
    pub fn split_with_ratio(
        &'a self,
        ratio: f32,
    ) -> (
        DatasetBase<ArrayView2<'a, F>, T::View>,
        DatasetBase<ArrayView2<'a, F>, T::View>,
    ) {
        let nsamples = self.nsamples();
        // `as usize` saturates negative and NaN values to 0, so only the upper bound needs a clamp.
        let n = ((nsamples as f32 * ratio).ceil() as usize).min(nsamples);

        let (records_first, records_second) = self.records.view().split_at(Axis(0), n);
        let (targets_first, targets_second) = self.targets.as_multi_targets().split_at(Axis(0), n);
        let targets_first = T::new_targets_view(targets_first);
        let targets_second = T::new_targets_view(targets_second);

        let (first_weights, second_weights) = if self.weights.len() == nsamples {
            (
                self.weights.slice(s![..n]).to_owned(),
                self.weights.slice(s![n..]).to_owned(),
            )
        } else {
            (Array1::zeros(0), Array1::zeros(0))
        };

        let dataset1 = DatasetBase::new(records_first, targets_first)
            .with_weights(first_weights)
            .with_feature_names(self.feature_names.clone());
        let dataset2 = DatasetBase::new(records_second, targets_second)
            .with_weights(second_weights)
            .with_feature_names(self.feature_names.clone());
        (dataset1, dataset2)
    }
}
impl<L: Label, T: Labels<Elem = L>, R: Records> Labels for DatasetBase<R, T> {
type Elem = L;
fn label_count(&self) -> Vec<HashMap<L, usize>> {
self.targets().label_count()
}
}
// The return type nests a dataset view inside `Result<Vec<..>>`, which trips clippy's lint.
#[allow(clippy::type_complexity)]
impl<'a, 'b: 'a, F: Float, L: Label, T, D> DatasetBase<ArrayBase<D, Ix2>, T>
where
    D: Data<Elem = F>,
    T: AsTargets<Elem = L> + Labels<Elem = L>,
{
    /// Produces one boolean-target dataset per label.
    ///
    /// For every label returned by `self.labels()`, the resulting dataset borrows the
    /// records of `self` and carries a single target column that is `true` exactly where
    /// the original target equals that label. Weights and feature names are cloned into
    /// each dataset.
    ///
    /// # Errors
    ///
    /// Propagates the error of `try_single_target` on the targets.
    pub fn one_vs_all(
        &self,
    ) -> Result<Vec<DatasetBase<ArrayView2<'_, F>, CountedTargets<bool, Array2<bool>>>>> {
        let single = self.targets().try_single_target()?;

        let datasets = self
            .labels()
            .into_iter()
            .map(|label| {
                let is_label: Array1<bool> = single.iter().map(|x| x == &label).collect();
                let counted = CountedTargets::new(is_label.insert_axis(Axis(1)));

                DatasetBase::new(self.records().view(), counted)
                    .with_weights(self.weights.clone())
                    .with_feature_names(self.feature_names.clone())
            })
            .collect();

        Ok(datasets)
    }
}
/// Weighted label statistics.
impl<L: Label, R: Records, S: AsTargets<Elem = L>> DatasetBase<R, S> {
    /// Sums the sample weights per label, restricted to the samples selected by `mask`.
    ///
    /// Sample `i` is included when `mask[i]` is `true` or when `mask` has no entry at `i`,
    /// so an empty mask selects every sample. Each included sample adds `weight_for(i)` to
    /// the entry of every label found in its target row.
    pub fn label_frequencies_with_mask(&self, mask: &[bool]) -> HashMap<L, f32> {
        let mut freqs: HashMap<L, f32> = HashMap::new();
        let targets = self.targets.as_multi_targets();

        for (idx, row) in targets.axis_iter(Axis(0)).enumerate() {
            if !mask.get(idx).copied().unwrap_or(true) {
                continue;
            }
            let weight = self.weight_for(idx);

            for label in row {
                // One lookup on the hot path; the label is cloned only the first time it is seen.
                match freqs.get_mut(label) {
                    Some(total) => *total += weight,
                    None => {
                        freqs.insert(label.clone(), weight);
                    }
                }
            }
        }
        freqs
    }

    /// Sums the sample weights per label over all samples.
    pub fn label_frequencies(&self) -> HashMap<L, f32> {
        self.label_frequencies_with_mask(&[])
    }
}
/// Wraps a bare record array into a dataset with unit (`()`) targets.
impl<F: Float, D: Data<Elem = F>, I: Dimension> From<ArrayBase<D, I>>
    for DatasetBase<ArrayBase<D, I>, Array2<()>>
{
    /// The targets are a default-filled array with one row per entry along axis 0 of
    /// `records` and a single column. Weights and feature names start empty.
    fn from(records: ArrayBase<D, I>) -> Self {
        let nrows = records.len_of(Axis(0));
        DatasetBase {
            targets: Array2::default((nrows, 1)),
            records,
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }
}
/// Builds a dataset from a `(records, targets)` pair of two-dimensional arrays.
impl<F: Float, E, D, S> From<(ArrayBase<D, Ix2>, ArrayBase<S, Ix2>)>
    for DatasetBase<ArrayBase<D, Ix2>, ArrayBase<S, Ix2>>
where
    D: Data<Elem = F>,
    S: Data<Elem = E>,
{
    /// Both arrays are stored as given. Weights and feature names start empty.
    fn from((records, targets): (ArrayBase<D, Ix2>, ArrayBase<S, Ix2>)) -> Self {
        DatasetBase {
            records,
            targets,
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }
}
/// Builds a dataset from 2D records and a one-dimensional target array.
impl<F: Float, E, D, S> From<(ArrayBase<D, Ix2>, ArrayBase<S, Ix1>)>
    for DatasetBase<ArrayBase<D, Ix2>, ArrayBase<S, Ix2>>
where
    D: Data<Elem = F>,
    S: Data<Elem = E>,
{
    /// The targets gain a trailing axis of length one, so they are stored as a single
    /// column. Weights and feature names start empty.
    fn from((records, targets): (ArrayBase<D, Ix2>, ArrayBase<S, Ix1>)) -> Self {
        let targets = targets.insert_axis(Axis(1));
        DatasetBase {
            records,
            targets,
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }
}
impl<'b, F: Float, E: Copy + 'b, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
where
D: Data<Elem = F>,
T: AsTargets<Elem = E> + FromTargetArray<'b, E>,
T::Owned: AsTargets,
{
pub fn bootstrap<R: Rng>(
&'b self,
sample_feature_size: (usize, usize),
rng: &'b mut R,
) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b, E>>::Owned>> + 'b
{
std::iter::repeat(()).map(move |_| {
let indices = (0..sample_feature_size.0)
.map(|_| rng.gen_range(0, self.nsamples()))
.collect::<Vec<_>>();
let records = self.records().select(Axis(0), &indices);
let targets = T::new_targets(self.as_multi_targets().select(Axis(0), &indices));
let indices = (0..sample_feature_size.1)
.map(|_| rng.gen_range(0, self.nfeatures()))
.collect::<Vec<_>>();
let records = records.select(Axis(1), &indices);
DatasetBase::new(records, targets)
})
}
pub fn bootstrap_samples<R: Rng>(
&'b self,
num_samples: usize,
rng: &'b mut R,
) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b, E>>::Owned>> + 'b
{
std::iter::repeat(()).map(move |_| {
let indices = (0..num_samples)
.map(|_| rng.gen_range(0, self.nsamples()))
.collect::<Vec<_>>();
let records = self.records().select(Axis(0), &indices);
let targets = T::new_targets(self.as_multi_targets().select(Axis(0), &indices));
DatasetBase::new(records, targets)
})
}
pub fn bootstrap_features<R: Rng>(
&'b self,
num_features: usize,
rng: &'b mut R,
) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b, E>>::Owned>> + 'b
{
std::iter::repeat(()).map(move |_| {
let targets = T::new_targets(self.as_multi_targets().to_owned());
let indices = (0..num_features)
.map(|_| rng.gen_range(0, self.nfeatures()))
.collect::<Vec<_>>();
let records = self.records.select(Axis(1), &indices);
DatasetBase::new(records, targets)
})
}
pub fn shuffle<R: Rng>(&self, rng: &mut R) -> DatasetBase<Array2<F>, T::Owned> {
let mut indices = (0..self.nsamples()).collect::<Vec<_>>();
indices.shuffle(rng);
let records = (&self).records().select(Axis(0), &indices);
let targets = (&self).as_multi_targets().select(Axis(0), &indices);
let targets = T::new_targets(targets);
DatasetBase::new(records, targets)
}
#[allow(clippy::type_complexity)]
pub fn fold(
&self,
k: usize,
) -> Vec<(
DatasetBase<Array2<F>, T::Owned>,
DatasetBase<Array2<F>, T::Owned>,
)> {
let targets = self.as_multi_targets();
let fold_size = targets.len() / k;
let mut res = Vec::new();
let mut records_chunks: Vec<_> =
self.records.axis_chunks_iter(Axis(0), fold_size).collect();
let mut targets_chunks: Vec<_> = targets.axis_chunks_iter(Axis(0), fold_size).collect();
for i in 0..k {
let remaining_records = stack(Axis(0), &records_chunks.as_slice()[1..]).unwrap();
let remaining_targets = stack(Axis(0), &targets_chunks.as_slice()[1..]).unwrap();
res.push((
DatasetBase::new(remaining_records, T::new_targets(remaining_targets)),
DatasetBase::new(
records_chunks[0].into_owned(),
T::new_targets(targets_chunks[0].into_owned()),
),
));
if i < k - 1 {
records_chunks.swap(0, i + 1);
targets_chunks.swap(0, i + 1);
}
}
res
}
pub fn sample_chunks<'a: 'b>(&'b self, chunk_size: usize) -> ChunksIter<'b, 'a, F, T> {
ChunksIter::new(self.records().view(), &self.targets, chunk_size, Axis(0))
}
pub fn to_owned(&self) -> DatasetBase<Array2<F>, T::Owned> {
DatasetBase::new(
self.records().to_owned(),
T::new_targets(self.as_multi_targets().to_owned()),
)
}
}
/// In-place k-fold cross-validation for datasets whose records and targets are mutable arrays.
impl<'a, F: Float, E: Copy + 'a, D, S> DatasetBase<ArrayBase<D, Ix2>, ArrayBase<S, Ix2>>
where
D: DataMut<Elem = F>,
S: DataMut<Elem = E>,
{
/// Runs `fit_closure` once per fold and pairs each result with a validation chunk.
///
/// The fold size is `nsamples / k`. For fold `i`, the rows of fold `i` are swapped with the
/// rows of the first fold, the closure receives a view of every row after the first
/// `fold_size` rows as training data, and the swap is then undone. Once all `k` closures
/// have run, their results are zipped with `self.sample_chunks(fold_size)`.
///
/// # Panics
///
/// Panics if `k == 0` or `k > nsamples`, or if `as_slice_mut` returns `None` for the
/// records or targets (presumably when they are not contiguous in standard order; TODO
/// confirm against ndarray).
pub fn iter_fold<O, C: Fn(DatasetView<F, E>) -> O>(
&'a mut self,
k: usize,
fit_closure: C,
) -> impl Iterator<Item = (O, DatasetBase<ArrayView2<F>, ArrayView2<E>>)> {
assert!(k > 0);
assert!(k <= self.nsamples());
let samples_count = self.nsamples();
let fold_size = samples_count / k;
let features = self.nfeatures();
let targets = self.ntargets();
// Work on the flat backing slices so whole folds can be swapped in place.
let mut records_sl = self.records.as_slice_mut().unwrap();
let mut targets_sl2 = self.targets.as_multi_targets_mut();
let mut targets_sl = targets_sl2.as_slice_mut().unwrap();
let mut objs: Vec<O> = Vec::new();
for i in 0..k {
// Bring fold `i` to the front (no-op for `i == 0`).
assist_swap_array2(&mut records_sl, i, fold_size, features);
assist_swap_array2(&mut targets_sl, i, fold_size, targets);
// Training data is everything after the first `fold_size` rows.
let train = DatasetBase::new(
ArrayView2::from_shape(
(samples_count - fold_size, features),
records_sl.split_at(fold_size * features).1,
)
.unwrap(),
ArrayView2::from_shape(
(samples_count - fold_size, targets),
targets_sl.split_at(fold_size * targets).1,
)
.unwrap(),
);
let obj = fit_closure(train);
objs.push(obj);
// Undo the swap so the data is back in its original order for the next fold.
assist_swap_array2(&mut records_sl, i, fold_size, features);
assist_swap_array2(&mut targets_sl, i, fold_size, targets);
}
// `zip` stops at the shorter side, so at most `k` pairs are produced.
objs.into_iter().zip(self.sample_chunks(fold_size))
}
}
/// Swaps fold number `index` with the first fold inside a flat, row-major buffer.
///
/// A fold spans `fold_size * features` consecutive elements. Nothing happens for
/// `index == 0`, because the first fold is already in place.
///
/// # Panics
///
/// Panics if fold `index` does not lie entirely inside `slice`.
fn assist_swap_array2<F>(slice: &mut [F], index: usize, fold_size: usize, features: usize) {
    if index != 0 {
        let width = fold_size * features;
        // Everything before the requested fold goes to `head`; the fold itself starts `tail`.
        let (head, tail) = slice.split_at_mut(width * index);
        head[..width].swap_with_slice(&mut tail[..width]);
    }
}
/// Splitting of fully owned datasets.
impl<F: Float, E> Dataset<F, E> {
    /// Consumes the dataset and splits it into two owned datasets along the sample axis.
    ///
    /// The first dataset receives `ceil(nsamples * ratio)` samples, capped at `nsamples`,
    /// and the second dataset receives the rest. The cap means a `ratio` above `1.0`, or a
    /// product that overshoots because of `f32` rounding, yields `(everything, empty)`
    /// instead of panicking.
    ///
    /// Both halves get the names returned by `feature_names()`. Weights are split at the
    /// same index when there is exactly one weight per sample; otherwise the first half
    /// keeps the stored weights unchanged and the second half gets none.
    ///
    /// The raw buffers are cut at `n1 * ncols`, which presumably relies on records and
    /// targets being stored row by row. TODO confirm that callers never pass other layouts.
    ///
    /// # Panics
    ///
    /// Panics if the raw buffers do not match the expected `(rows, cols)` shapes.
    pub fn split_with_ratio(self, ratio: f32) -> (Self, Self) {
        let (nfeatures, ntargets) = (self.nfeatures(), self.ntargets());
        let nsamples = self.nsamples();
        // `as usize` saturates negative and NaN values to 0, so only the upper bound needs a clamp.
        let n1 = ((nsamples as f32 * ratio).ceil() as usize).min(nsamples);
        let n2 = nsamples - n1;
        let feature_names = self.feature_names();

        // Split the records by cutting the flat buffer at the row boundary.
        let mut array_buf = self.records.into_raw_vec();
        let second_array_buf = array_buf.split_off(n1 * nfeatures);
        let first = Array2::from_shape_vec((n1, nfeatures), array_buf).unwrap();
        let second = Array2::from_shape_vec((n2, nfeatures), second_array_buf).unwrap();

        // Same for the targets.
        let mut array_buf = self.targets.into_raw_vec();
        let second_array_buf = array_buf.split_off(n1 * ntargets);
        let first_targets = Array2::from_shape_vec((n1, ntargets), array_buf).unwrap();
        let second_targets = Array2::from_shape_vec((n2, ntargets), second_array_buf).unwrap();

        let (first_weights, second_weights) = if self.weights.len() == nsamples {
            let mut head = self.weights.into_raw_vec();
            let tail = head.split_off(n1);
            (Array1::from(head), Array1::from(tail))
        } else {
            (self.weights, Array1::zeros(0))
        };

        let dataset1 = Dataset::new(first, first_targets)
            .with_weights(first_weights)
            .with_feature_names(feature_names.clone());
        let dataset2 = Dataset::new(second, second_targets)
            .with_weights(second_weights)
            .with_feature_names(feature_names);
        (dataset1, dataset2)
    }
}
/// Predicting on an owned record array yields a dataset of those records and the predictions.
impl<F: Float, D, T, O> Predict<ArrayBase<D, Ix2>, DatasetBase<ArrayBase<D, Ix2>, T>> for O
where
    D: Data<Elem = F>,
    O: PredictRef<ArrayBase<D, Ix2>, T>,
{
    /// Calls `predict_ref` on `records`, then moves the records and the predicted targets
    /// into a new dataset.
    fn predict(&self, records: ArrayBase<D, Ix2>) -> DatasetBase<ArrayBase<D, Ix2>, T> {
        // The prediction must be computed before `records` is moved into the dataset.
        let predicted = self.predict_ref(&records);
        DatasetBase::new(records, predicted)
    }
}
impl<F: Float, R, T, S, O> Predict<DatasetBase<R, T>, DatasetBase<R, S>> for O
where
R: Records<Elem = F>,
O: PredictRef<R, S>,
{
fn predict(&self, ds: DatasetBase<R, T>) -> DatasetBase<R, S> {
let new_targets = self.predict_ref(&ds.records);
DatasetBase::new(ds.records, new_targets)
}
}
/// Predicting on a borrowed dataset returns only the predicted targets.
impl<'a, F: Float, R, T, S, O> Predict<&'a DatasetBase<R, T>, S> for O
where
    R: Records<Elem = F>,
    O: PredictRef<R, S>,
{
    /// Calls `predict_ref` on the records of `ds`.
    fn predict(&self, ds: &'a DatasetBase<R, T>) -> S {
        let records = &ds.records;
        self.predict_ref(records)
    }
}
/// Predicting on a borrowed record array returns only the predicted targets.
impl<'a, F: Float, D, T, O> Predict<&'a ArrayBase<D, Ix2>, T> for O
where
D: Data<Elem = F>,
O: PredictRef<ArrayBase<D, Ix2>, T>,
{
/// Forwards `records` directly to `predict_ref`.
fn predict(&self, records: &'a ArrayBase<D, Ix2>) -> T {
self.predict_ref(records)
}
}
impl<L: Label, S: Labels<Elem = L>> CountedTargets<L, S> {
    /// Wraps `targets` together with the result of its `label_count()`, computed once here.
    pub fn new(targets: S) -> Self {
        // Fields are evaluated in written order: count first (by reference), then move.
        CountedTargets {
            labels: targets.label_count(),
            targets,
        }
    }
}
/// Conversion of a value into the target representation `T`.
///
/// `DatasetBase::new` accepts any `IntoTargets<S>` and calls `into` on it to obtain the
/// stored targets.
pub trait IntoTargets<T> {
/// Consumes `self` and returns it as `T`.
fn into(self) -> T;
}
/// A one-dimensional array becomes two-dimensional targets by gaining a second axis.
impl<F, D: Data<Elem = F>> IntoTargets<ArrayBase<D, Ix2>> for ArrayBase<D, Ix1> {
/// Inserts a new axis at position 1, producing a single-column array.
fn into(self) -> ArrayBase<D, Ix2> {
self.insert_axis(Axis(1))
}
}
/// Identity conversion: any value is already targets of its own type.
impl<T> IntoTargets<T> for T {
/// Returns `self` unchanged.
fn into(self) -> T {
self
}
}