use super::{
super::traits::{Predict, PredictRef},
iter::{ChunksIter, DatasetIter, Iter},
AsTargets, AsTargetsMut, CountedTargets, Dataset, DatasetBase, DatasetView, Float,
FromTargetArray, Label, Labels, Records, Result,
};
use crate::traits::Fit;
use ndarray::{
concatenate, s, Array, Array1, Array2, ArrayBase, ArrayView1, ArrayView2, ArrayViewMut2, Axis,
Data, DataMut, Dimension, Ix1, Ix2, OwnedRepr,
};
use rand::{seq::SliceRandom, Rng};
use std::collections::HashMap;
use std::ops::AddAssign;
impl<R: Records, S> DatasetBase<R, S> {
    /// Build a dataset from `records` and anything convertible into the
    /// target container `S`. Weights start empty and feature names unset.
    pub fn new<T: IntoTargets<S>>(records: R, targets: T) -> DatasetBase<R, S> {
        DatasetBase {
            records,
            targets: targets.into(),
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }

    /// Borrow the target container.
    pub fn targets(&self) -> &S {
        &self.targets
    }

    /// Per-sample weights as a slice, or `None` when no weights were set.
    pub fn weights(&self) -> Option<&[f32]> {
        if self.weights.is_empty() {
            None
        } else {
            Some(self.weights.as_slice().unwrap())
        }
    }

    /// Weight of the sample at `idx`; samples without an explicit weight
    /// (missing index or empty weight vector) count as `1.0`.
    pub fn weight_for(&self, idx: usize) -> f32 {
        match self.weights.get(idx) {
            Some(w) => *w,
            None => 1.0,
        }
    }

    /// Feature names, or generated `feature-<i>` placeholders when none
    /// were set.
    pub fn feature_names(&self) -> Vec<String> {
        if self.feature_names.is_empty() {
            (0..self.records.nfeatures())
                .map(|idx| format!("feature-{}", idx))
                .collect()
        } else {
            self.feature_names.clone()
        }
    }

    /// Borrow the records.
    pub fn records(&self) -> &R {
        &self.records
    }

    /// Replace the records. Weights and feature names are reset because
    /// they may no longer match the new records' dimensions.
    pub fn with_records<T: Records>(self, records: T) -> DatasetBase<T, S> {
        DatasetBase {
            records,
            targets: self.targets,
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }

    /// Replace the targets, keeping weights and feature names.
    pub fn with_targets<T>(self, targets: T) -> DatasetBase<R, T> {
        DatasetBase {
            records: self.records,
            targets,
            weights: self.weights,
            feature_names: self.feature_names,
        }
    }

    /// Set the per-sample weights.
    pub fn with_weights(mut self, weights: Array1<f32>) -> DatasetBase<R, S> {
        self.weights = weights;
        self
    }

    /// Set the feature names, converting each entry into an owned `String`.
    pub fn with_feature_names<I: Into<String>>(mut self, names: Vec<I>) -> DatasetBase<R, S> {
        self.feature_names = names.into_iter().map(Into::into).collect();
        self
    }
}
impl<L, R: Records, T: AsTargets<Elem = L>> DatasetBase<R, T> {
    /// Apply `fnc` to every target element, producing a dataset with owned
    /// 2-D targets; records, weights and feature names are carried over.
    pub fn map_targets<S, G: FnMut(&L) -> S>(self, fnc: G) -> DatasetBase<R, Array2<S>> {
        // Borrow the targets just long enough to produce the mapped copy,
        // then move the remaining fields into the new dataset.
        let mapped = self.targets.as_multi_targets().map(fnc);
        DatasetBase {
            records: self.records,
            targets: mapped,
            weights: self.weights,
            feature_names: self.feature_names,
        }
    }

    /// Number of target columns.
    pub fn ntargets(&self) -> usize {
        self.targets.as_multi_targets().ncols()
    }
}
impl<'a, F: Float, L, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
where
    D: Data<Elem = F>,
    T: AsTargets<Elem = L>,
{
    /// Iterate over the dataset sample by sample, yielding paired views of
    /// each record row and its target row.
    pub fn sample_iter(&'a self) -> Iter<'a, '_, F, T::Elem> {
        Iter::new(self.records.view(), self.targets.as_multi_targets())
    }
}
impl<'a, F: Float, L: 'a, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
where
    D: Data<Elem = F>,
    T: AsTargets<Elem = L> + FromTargetArray<'a, L>,
{
    /// Borrow records and targets as a non-owning view dataset; weights and
    /// feature names are cloned into the view.
    pub fn view(&'a self) -> DatasetBase<ArrayView2<'a, F>, T::View> {
        let records = self.records().view();
        let targets = T::new_targets_view(self.as_multi_targets());
        DatasetBase::new(records, targets)
            .with_feature_names(self.feature_names.clone())
            .with_weights(self.weights.clone())
    }
    /// Iterator over the dataset's features; the boolean flag passed to
    /// `DatasetIter::new` distinguishes this from `target_iter` (see the
    /// `iter` module for the exact per-item shape).
    pub fn feature_iter(&'a self) -> DatasetIter<'a, '_, ArrayBase<D, Ix2>, T> {
        DatasetIter::new(self, true)
    }
    /// Iterator over the dataset's targets; counterpart of `feature_iter`
    /// with the flag set to `false`.
    pub fn target_iter(&'a self) -> DatasetIter<'a, '_, ArrayBase<D, Ix2>, T> {
        DatasetIter::new(self, false)
    }
}
/// A dataset exposes its targets as a 2-D view by delegating to the wrapped
/// target container.
impl<L, R: Records, T: AsTargets<Elem = L>> AsTargets for DatasetBase<R, T> {
    type Elem = L;
    fn as_multi_targets(&self) -> ArrayView2<'_, Self::Elem> {
        self.targets.as_multi_targets()
    }
}
/// Mutable counterpart of the `AsTargets` delegation above.
impl<L, R: Records, T: AsTargetsMut<Elem = L>> AsTargetsMut for DatasetBase<R, T> {
    type Elem = L;
    fn as_multi_targets_mut(&mut self) -> ArrayViewMut2<'_, Self::Elem> {
        self.targets.as_multi_targets_mut()
    }
}
#[allow(clippy::type_complexity)]
impl<'a, L: 'a, F: Float, T> DatasetBase<ArrayView2<'a, F>, T>
where
    T: AsTargets<Elem = L> + FromTargetArray<'a, L>,
{
    /// Split the dataset view in two at `ceil(nsamples * ratio)` samples.
    /// The first dataset holds the leading samples, the second the rest.
    /// Weights are split alongside when exactly one weight per sample is
    /// present; feature names are cloned into both halves.
    pub fn split_with_ratio(
        &'a self,
        ratio: f32,
    ) -> (
        DatasetBase<ArrayView2<'a, F>, T::View>,
        DatasetBase<ArrayView2<'a, F>, T::View>,
    ) {
        let split_at = (self.nsamples() as f32 * ratio).ceil() as usize;

        let (rec_a, rec_b) = self.records.view().split_at(Axis(0), split_at);
        let (tgt_a, tgt_b) = self.targets.as_multi_targets().split_at(Axis(0), split_at);

        // Propagate weights only when they cover every sample; otherwise
        // both halves fall back to empty (i.e. implicit unit) weights.
        let (w_a, w_b) = if self.weights.len() == self.nsamples() {
            (
                Array1::from(self.weights.slice(s![..split_at]).to_vec()),
                Array1::from(self.weights.slice(s![split_at..]).to_vec()),
            )
        } else {
            (Array1::zeros(0), Array1::zeros(0))
        };

        (
            DatasetBase::new(rec_a, T::new_targets_view(tgt_a))
                .with_weights(w_a)
                .with_feature_names(self.feature_names.clone()),
            DatasetBase::new(rec_b, T::new_targets_view(tgt_b))
                .with_weights(w_b)
                .with_feature_names(self.feature_names.clone()),
        )
    }
}
/// Label bookkeeping is delegated to the wrapped target container.
impl<L: Label, T: Labels<Elem = L>, R: Records> Labels for DatasetBase<R, T> {
    type Elem = L;
    fn label_count(&self) -> Vec<HashMap<L, usize>> {
        self.targets().label_count()
    }
}
#[allow(clippy::type_complexity)]
impl<'a, 'b: 'a, F: Float, L: Label, T, D> DatasetBase<ArrayBase<D, Ix2>, T>
where
    D: Data<Elem = F>,
    T: AsTargets<Elem = L> + Labels<Elem = L>,
{
    /// For every distinct label, build a binary dataset whose targets are
    /// `true` exactly where the original (single-column) target equals that
    /// label. Fails when the dataset does not have a single target column.
    pub fn one_vs_all(
        &self,
    ) -> Result<
        Vec<(
            L,
            DatasetBase<ArrayView2<'_, F>, CountedTargets<bool, Array2<bool>>>,
        )>,
    > {
        let single = self.targets().try_single_target()?;
        let mut binary_sets = Vec::new();
        for label in self.labels() {
            // One boolean per sample: does this sample carry `label`?
            let mask = single
                .iter()
                .map(|t| t == &label)
                .collect::<Array1<_>>()
                .insert_axis(Axis(1));
            let dataset = DatasetBase::new(self.records().view(), CountedTargets::new(mask))
                .with_feature_names(self.feature_names.clone())
                .with_weights(self.weights.clone());
            binary_sets.push((label, dataset));
        }
        Ok(binary_sets)
    }
}
impl<L: Label, R: Records, S: AsTargets<Elem = L>> DatasetBase<R, S> {
    /// Weighted frequency of each label among the samples whose mask entry
    /// is `true`.
    ///
    /// Samples beyond the mask's length (and therefore all samples when the
    /// mask is empty) are treated as selected. Every selected sample adds
    /// its weight (default `1.0`, see `weight_for`) to each label appearing
    /// in its target row.
    pub fn label_frequencies_with_mask(&self, mask: &[bool]) -> HashMap<L, f32> {
        let mut freqs = HashMap::new();
        for (elms, val) in self
            .targets
            .as_multi_targets()
            .axis_iter(Axis(0))
            .enumerate()
            .filter(|(i, _)| *mask.get(*i).unwrap_or(&true))
            .map(|(i, x)| (x, self.weight_for(i)))
        {
            for elm in elms {
                // Entry API: one hash lookup instead of the former
                // contains_key + insert + get_mut triple lookup.
                *freqs.entry(elm.clone()).or_insert(0.0) += val;
            }
        }
        freqs
    }
    /// Weighted frequency of each label over all samples (the empty mask
    /// selects everything).
    pub fn label_frequencies(&self) -> HashMap<L, f32> {
        self.label_frequencies_with_mask(&[])
    }
}
/// Build a dataset from records only; the targets become a single column of
/// unit values so row counts stay consistent with the records.
impl<F: Float, D: Data<Elem = F>, I: Dimension> From<ArrayBase<D, I>>
    for DatasetBase<ArrayBase<D, I>, Array2<()>>
{
    fn from(records: ArrayBase<D, I>) -> Self {
        // One `()` target per sample (rows of the records' first axis).
        let empty_targets = Array2::default((records.len_of(Axis(0)), 1));
        DatasetBase {
            records,
            targets: empty_targets,
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }
}
/// Build a dataset from a (records, 2-D targets) pair with empty weights
/// and no feature names.
impl<F: Float, E, D, S> From<(ArrayBase<D, Ix2>, ArrayBase<S, Ix2>)>
    for DatasetBase<ArrayBase<D, Ix2>, ArrayBase<S, Ix2>>
where
    D: Data<Elem = F>,
    S: Data<Elem = E>,
{
    fn from(rec_tar: (ArrayBase<D, Ix2>, ArrayBase<S, Ix2>)) -> Self {
        DatasetBase {
            records: rec_tar.0,
            targets: rec_tar.1,
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }
}
/// Build a dataset from a (records, 1-D targets) pair; the target vector is
/// promoted to a single-column 2-D array.
impl<F: Float, E, D, S> From<(ArrayBase<D, Ix2>, ArrayBase<S, Ix1>)>
    for DatasetBase<ArrayBase<D, Ix2>, ArrayBase<S, Ix2>>
where
    D: Data<Elem = F>,
    S: Data<Elem = E>,
{
    fn from(rec_tar: (ArrayBase<D, Ix2>, ArrayBase<S, Ix1>)) -> Self {
        DatasetBase {
            records: rec_tar.0,
            // Promote the vector to an (n, 1) column matrix.
            targets: rec_tar.1.insert_axis(Axis(1)),
            weights: Array1::zeros(0),
            feature_names: Vec::new(),
        }
    }
}
impl<'b, F: Float, E: Copy + 'b, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
where
D: Data<Elem = F>,
T: AsTargets<Elem = E> + FromTargetArray<'b, E>,
T::Owned: AsTargets,
{
pub fn bootstrap<R: Rng>(
&'b self,
sample_feature_size: (usize, usize),
rng: &'b mut R,
) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b, E>>::Owned>> + 'b
{
std::iter::repeat(()).map(move |_| {
let indices = (0..sample_feature_size.0)
.map(|_| rng.gen_range(0..self.nsamples()))
.collect::<Vec<_>>();
let records = self.records().select(Axis(0), &indices);
let targets = T::new_targets(self.as_multi_targets().select(Axis(0), &indices));
let indices = (0..sample_feature_size.1)
.map(|_| rng.gen_range(0..self.nfeatures()))
.collect::<Vec<_>>();
let records = records.select(Axis(1), &indices);
DatasetBase::new(records, targets)
})
}
pub fn bootstrap_samples<R: Rng>(
&'b self,
num_samples: usize,
rng: &'b mut R,
) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b, E>>::Owned>> + 'b
{
std::iter::repeat(()).map(move |_| {
let indices = (0..num_samples)
.map(|_| rng.gen_range(0..self.nsamples()))
.collect::<Vec<_>>();
let records = self.records().select(Axis(0), &indices);
let targets = T::new_targets(self.as_multi_targets().select(Axis(0), &indices));
DatasetBase::new(records, targets)
})
}
pub fn bootstrap_features<R: Rng>(
&'b self,
num_features: usize,
rng: &'b mut R,
) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b, E>>::Owned>> + 'b
{
std::iter::repeat(()).map(move |_| {
let targets = T::new_targets(self.as_multi_targets().to_owned());
let indices = (0..num_features)
.map(|_| rng.gen_range(0..self.nfeatures()))
.collect::<Vec<_>>();
let records = self.records.select(Axis(1), &indices);
DatasetBase::new(records, targets)
})
}
pub fn shuffle<R: Rng>(&self, rng: &mut R) -> DatasetBase<Array2<F>, T::Owned> {
let mut indices = (0..self.nsamples()).collect::<Vec<_>>();
indices.shuffle(rng);
let records = (&self).records().select(Axis(0), &indices);
let targets = (&self).as_multi_targets().select(Axis(0), &indices);
let targets = T::new_targets(targets);
DatasetBase::new(records, targets)
}
#[allow(clippy::type_complexity)]
pub fn fold(
&self,
k: usize,
) -> Vec<(
DatasetBase<Array2<F>, T::Owned>,
DatasetBase<Array2<F>, T::Owned>,
)> {
let targets = self.as_multi_targets();
let fold_size = targets.len() / k;
let mut res = Vec::new();
let mut records_chunks: Vec<_> =
self.records.axis_chunks_iter(Axis(0), fold_size).collect();
let mut targets_chunks: Vec<_> = targets.axis_chunks_iter(Axis(0), fold_size).collect();
for i in 0..k {
let remaining_records = concatenate(Axis(0), &records_chunks.as_slice()[1..]).unwrap();
let remaining_targets = concatenate(Axis(0), &targets_chunks.as_slice()[1..]).unwrap();
res.push((
DatasetBase::new(remaining_records, T::new_targets(remaining_targets)),
DatasetBase::new(
records_chunks[0].into_owned(),
T::new_targets(targets_chunks[0].into_owned()),
),
));
if i < k - 1 {
records_chunks.swap(0, i + 1);
targets_chunks.swap(0, i + 1);
}
}
res
}
pub fn sample_chunks<'a: 'b>(&'b self, chunk_size: usize) -> ChunksIter<'b, 'a, F, T> {
ChunksIter::new(self.records().view(), &self.targets, chunk_size, Axis(0))
}
pub fn to_owned(&self) -> DatasetBase<Array2<F>, T::Owned> {
DatasetBase::new(
self.records().to_owned(),
T::new_targets(self.as_multi_targets().to_owned()),
)
}
}
// Swap fold `$index` with the fold at the front of a flat, row-major 2-D
// buffer (`$fold_size` rows of `$features` scalars each), in place. A no-op
// for index 0 because that fold is already at the front. Used by `iter_fold`
// to rotate the validation fold into a fixed position and back.
macro_rules! assist_swap_array2 {
    ($slice: expr, $index: expr, $fold_size: expr, $features: expr) => {
        if $index != 0 {
            // Scalar elements per fold in the flattened buffer.
            let adj_fold_size = $fold_size * $features;
            let start = adj_fold_size * $index;
            // `first_s` holds folds 0..$index, `second_s` starts at fold $index.
            let (first_s, second_s) = $slice.split_at_mut(start);
            let (mut fold, _) = second_s.split_at_mut(adj_fold_size);
            first_s[..$fold_size * $features].swap_with_slice(&mut fold);
        }
    };
}
impl<'a, F: Float, E: Copy + 'a, D, S> DatasetBase<ArrayBase<D, Ix2>, ArrayBase<S, Ix2>>
where
    D: DataMut<Elem = F>,
    S: DataMut<Elem = E>,
{
    /// K-fold driver: for each of the `k` folds, calls `fit_closure` on a
    /// view of the remaining samples and yields the fitted object paired
    /// with the left-out fold as validation set.
    ///
    /// Fold rotation is done in place by swapping the current fold with the
    /// front of the underlying buffers (see `assist_swap_array2!`), which is
    /// why `self` is borrowed mutably; the data is swapped back after every
    /// fit, restoring the original order. Panics if `k == 0`,
    /// `k > nsamples`, or records/targets are not contiguous (the
    /// `as_slice_mut().unwrap()` calls below).
    pub fn iter_fold<O, C: Fn(&DatasetView<F, E>) -> O>(
        &'a mut self,
        k: usize,
        fit_closure: C,
    ) -> impl Iterator<Item = (O, DatasetBase<ArrayView2<F>, ArrayView2<E>>)> {
        assert!(k > 0);
        assert!(k <= self.nsamples());
        let samples_count = self.nsamples();
        let fold_size = samples_count / k;
        let features = self.nfeatures();
        let targets = self.ntargets();
        let mut objs: Vec<O> = Vec::new();
        {
            // Flat mutable views over the contiguous record/target buffers.
            let records_sl = self.records.as_slice_mut().unwrap();
            let mut targets_sl2 = self.targets.as_multi_targets_mut();
            let targets_sl = targets_sl2.as_slice_mut().unwrap();
            for i in 0..k {
                // Move fold `i` to the front so the training set becomes one
                // contiguous tail slice.
                assist_swap_array2!(records_sl, i, fold_size, features);
                assist_swap_array2!(targets_sl, i, fold_size, targets);
                {
                    // Training view: everything after the first fold.
                    let train = DatasetBase::new(
                        ArrayView2::from_shape(
                            (samples_count - fold_size, features),
                            records_sl.split_at(fold_size * features).1,
                        )
                        .unwrap(),
                        ArrayView2::from_shape(
                            (samples_count - fold_size, targets),
                            targets_sl.split_at(fold_size * targets).1,
                        )
                        .unwrap(),
                    );
                    let obj = fit_closure(&train);
                    objs.push(obj);
                }
                // Swap back to restore the original sample order.
                assist_swap_array2!(records_sl, i, fold_size, features);
                assist_swap_array2!(targets_sl, i, fold_size, targets);
            }
        }
        // Pair each fitted object with its validation fold, in order.
        objs.into_iter().zip(self.sample_chunks(fold_size))
    }
    /// Cross-validate several parameter sets with a multi-target evaluation
    /// closure; the result is averaged over the `k` folds and has one row
    /// per parameter set.
    pub fn cross_validate_multi<O, ER, M, FACC, C>(
        &'a mut self,
        k: usize,
        parameters: &[M],
        eval: C,
    ) -> std::result::Result<Array2<FACC>, ER>
    where
        ER: std::error::Error + std::convert::From<crate::error::Error>,
        M: for<'c> Fit<ArrayView2<'c, F>, ArrayView2<'c, E>, ER, Object = O>,
        O: for<'d> PredictRef<ArrayView2<'a, F>, Array2<E>>,
        FACC: Float,
        C: Fn(&Array2<E>, &ArrayView2<E>) -> std::result::Result<Array1<FACC>, crate::error::Error>,
    {
        let mut evaluations = Array2::from_elem((parameters.len(), self.ntargets()), FACC::zero());
        let folds_evaluations: std::result::Result<Vec<_>, ER> = self
            .iter_fold(k, |train| {
                // Fit every parameter set on the training fold.
                let fit_result: std::result::Result<Vec<_>, ER> =
                    parameters.iter().map(|p| p.fit(&train)).collect();
                fit_result
            })
            .map(|(models, valid)| {
                let targets = valid.targets();
                let models = models?;
                // NOTE(review): `targets.len()` is the total element count of
                // the validation targets; if `eval` yields one score per
                // target column this row length looks oversized — confirm.
                let mut eval_predictions =
                    Array2::from_elem((models.len(), targets.len()), FACC::zero());
                for (i, model) in models.iter().enumerate() {
                    let predicted = model.predict(valid.records());
                    let eval_pred = match eval(&predicted, &targets) {
                        Err(e) => Err(ER::from(e)),
                        Ok(res) => Ok(res),
                    }?;
                    eval_predictions.row_mut(i).add_assign(&eval_pred);
                }
                Ok(eval_predictions)
            })
            .collect();
        // Sum the per-fold score matrices, then average by the fold count.
        for fold_evaluation in folds_evaluations? {
            evaluations.add_assign(&fold_evaluation)
        }
        Ok(evaluations / FACC::from(k).unwrap())
    }
    /// Cross-validate several parameter sets with a per-column evaluation
    /// closure; the output dimension `I` must be 1-D (single target) or 2-D
    /// (parameters × targets), and the result is averaged over the folds.
    pub fn cross_validate<O, ER, M, FACC, C, I>(
        &'a mut self,
        k: usize,
        parameters: &[M],
        eval: C,
    ) -> std::result::Result<ArrayBase<OwnedRepr<FACC>, I>, ER>
    where
        ER: std::error::Error + std::convert::From<crate::error::Error>,
        M: for<'c> Fit<ArrayView2<'c, F>, ArrayView2<'c, E>, ER, Object = O>,
        O: for<'d> PredictRef<ArrayView2<'a, F>, ArrayBase<OwnedRepr<E>, I>>,
        FACC: Float,
        C: Fn(&ArrayView1<E>, &ArrayView1<E>) -> std::result::Result<FACC, crate::error::Error>,
        I: Dimension,
    {
        // Only 1-D and 2-D result shapes are supported; anything else is an
        // incompatible-shape error.
        let mut shape = match I::NDIM {
            Some(1) | Some(2) => Ok(I::zeros(I::NDIM.unwrap())),
            _ => Err(crate::Error::NdShape(ndarray::ShapeError::from_kind(
                ndarray::ErrorKind::IncompatibleShape,
            ))),
        }?;
        // First axis: parameter sets; second axis (2-D only): targets.
        let mut tmp = shape.as_array_view_mut();
        tmp[0] = parameters.len();
        if tmp.len() == 2 {
            tmp[1] = self.ntargets();
        }
        let folds_evaluations = self
            .iter_fold(k, |train| {
                // Fit every parameter set on the training fold.
                let fit_result: std::result::Result<Vec<_>, ER> =
                    parameters.iter().map(|p| p.fit(&train)).collect();
                fit_result
            })
            .map(|(models, valid)| {
                let targets = valid.as_multi_targets();
                let models = models?;
                let eval_predictions = models
                    .iter()
                    .map(|m| {
                        let nsamples = valid.nsamples();
                        let predicted = m.predict(valid.records());
                        // Normalize 1-D predictions to a single column so
                        // both output dimensionalities share one code path.
                        let ntargets = if predicted.ndim() == 1 {
                            1
                        } else {
                            predicted.len_of(Axis(1))
                        };
                        let predicted: Array2<_> =
                            predicted.into_shape((nsamples, ntargets)).unwrap();
                        // Score column-by-column against the true targets.
                        // NOTE(review): `gencolumns` is deprecated in newer
                        // ndarray (renamed `columns`) — fine while pinned.
                        predicted
                            .gencolumns()
                            .into_iter()
                            .zip(targets.gencolumns().into_iter())
                            .map(|(p, t)| eval(&p.view(), &t).map_err(ER::from))
                            .collect()
                    })
                    .collect::<std::result::Result<Vec<Vec<FACC>>, ER>>()?
                    .into_iter()
                    .flatten()
                    .collect();
                Ok(Array::from_shape_vec(shape.clone(), eval_predictions).unwrap())
            })
            .collect::<std::result::Result<Vec<_>, ER>>();
        // Sum the per-fold results, then average by the fold count.
        let res = folds_evaluations?
            .into_iter()
            .fold(Array::<FACC, _>::zeros(shape.clone()), std::ops::Add::add);
        Ok(res / FACC::cast(k))
    }
}
impl<F: Float, E> Dataset<F, E> {
    /// Consume the dataset and split it in two at `ceil(nsamples * ratio)`
    /// samples, reusing the underlying buffers (the record and target data
    /// are moved, not copied). Weights are split alongside when exactly one
    /// weight per sample is present; feature names are cloned into both
    /// halves.
    pub fn split_with_ratio(mut self, ratio: f32) -> (Self, Self) {
        let (nfeatures, ntargets) = (self.nfeatures(), self.ntargets());
        let n1 = (self.nsamples() as f32 * ratio).ceil() as usize;
        let n2 = self.nsamples() - n1;
        let feature_names = self.feature_names();
        // Split the flat row-major record buffer at the sample boundary.
        let mut array_buf = self.records.into_raw_vec();
        let second_array_buf = array_buf.split_off(n1 * nfeatures);
        let first = Array2::from_shape_vec((n1, nfeatures), array_buf).unwrap();
        let second = Array2::from_shape_vec((n2, nfeatures), second_array_buf).unwrap();
        // Same treatment for the target buffer.
        let mut array_buf = self.targets.into_raw_vec();
        let second_array_buf = array_buf.split_off(n1 * ntargets);
        let first_targets = Array2::from_shape_vec((n1, ntargets), array_buf).unwrap();
        let second_targets = Array2::from_shape_vec((n2, ntargets), second_array_buf).unwrap();
        // Split the weights only when they cover every sample; otherwise the
        // second half gets empty weights and the first keeps whatever is set.
        let second_weights = if self.weights.len() == n1 + n2 {
            let mut weights = self.weights.into_raw_vec();
            let weights2 = weights.split_off(n1);
            self.weights = Array1::from(weights);
            Array1::from(weights2)
        } else {
            Array1::zeros(0)
        };
        let dataset1 = Dataset::new(first, first_targets)
            .with_weights(self.weights)
            .with_feature_names(feature_names.clone());
        let dataset2 = Dataset::new(second, second_targets)
            .with_weights(second_weights)
            .with_feature_names(feature_names);
        (dataset1, dataset2)
    }
}
/// Predict on an owned record matrix, bundling the records and their
/// predictions into a new dataset.
impl<F: Float, D, T, O> Predict<ArrayBase<D, Ix2>, DatasetBase<ArrayBase<D, Ix2>, T>> for O
where
    D: Data<Elem = F>,
    O: PredictRef<ArrayBase<D, Ix2>, T>,
{
    fn predict(&self, records: ArrayBase<D, Ix2>) -> DatasetBase<ArrayBase<D, Ix2>, T> {
        let new_targets = self.predict_ref(&records);
        DatasetBase::new(records, new_targets)
    }
}
/// Predict on an owned dataset: the records are kept and the original
/// targets are replaced by the predictions.
impl<F: Float, R, T, S, O> Predict<DatasetBase<R, T>, DatasetBase<R, S>> for O
where
    R: Records<Elem = F>,
    O: PredictRef<R, S>,
{
    fn predict(&self, ds: DatasetBase<R, T>) -> DatasetBase<R, S> {
        let new_targets = self.predict_ref(&ds.records);
        DatasetBase::new(ds.records, new_targets)
    }
}
/// Predict on a borrowed dataset, returning only the predicted targets.
impl<'a, F: Float, R, T, S, O> Predict<&'a DatasetBase<R, T>, S> for O
where
    R: Records<Elem = F>,
    O: PredictRef<R, S>,
{
    fn predict(&self, ds: &'a DatasetBase<R, T>) -> S {
        self.predict_ref(&ds.records)
    }
}
/// Predict on a borrowed record matrix, returning only the predicted
/// targets.
impl<'a, F: Float, D, T, O> Predict<&'a ArrayBase<D, Ix2>, T> for O
where
    D: Data<Elem = F>,
    O: PredictRef<ArrayBase<D, Ix2>, T>,
{
    fn predict(&self, records: &'a ArrayBase<D, Ix2>) -> T {
        self.predict_ref(records)
    }
}
impl<L: Label, S: Labels<Elem = L>> CountedTargets<L, S> {
    /// Wrap a target container together with its precomputed label counts.
    pub fn new(targets: S) -> Self {
        let labels = targets.label_count();
        CountedTargets { targets, labels }
    }
}
/// Conversion of raw inputs into a dataset's target container; used by
/// `DatasetBase::new` so that, e.g., both 1-D and 2-D target arrays are
/// accepted.
pub trait IntoTargets<T> {
    fn into(self) -> T;
}
/// A 1-D target array converts into the 2-D representation by becoming a
/// single column.
impl<F, D: Data<Elem = F>> IntoTargets<ArrayBase<D, Ix2>> for ArrayBase<D, Ix1> {
    fn into(self) -> ArrayBase<D, Ix2> {
        self.insert_axis(Axis(1))
    }
}
/// Identity conversion: any target container converts into itself.
impl<T> IntoTargets<T> for T {
    fn into(self) -> T {
        self
    }
}