use std::path::Path;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::collections::HashMap;
use std::ops::Index;
use std::cell::{Ref, RefMut, RefCell};
use std::mem;
use polars::prelude::*;
use rayon::prelude::*;
use super::feature::*;
/// A set of training examples stored feature by feature.
///
/// Every entry of `features` holds the values of one feature over all
/// examples, while `target` holds one value per example.
#[derive(Debug)]
pub struct Sample {
/// Maps each feature name to its position in `features`.
pub(super) name_to_index: HashMap<String, usize>,
/// The features (columns) of this sample.
pub(super) features: Vec<Feature>,
/// The target values. Wrapped in a `RefCell` so that `target_mut` can
/// hand out mutable access through a shared reference.
pub(super) target: RefCell<Vec<f64>>,
/// The number of examples (rows).
pub(super) n_sample: usize,
/// The number of features (columns).
pub(super) n_feature: usize,
}
impl Sample {
/// Builds a `Sample` from a `DataFrame` of features and a `Series` of
/// target values.
///
/// Each column of `data` is converted into a dense feature, and the
/// reported shape is the shape of `data`.
///
/// # Errors
/// Returns an error of kind `io::ErrorKind::InvalidData` when `target`
/// is not of dtype f64 or when it contains null values.
pub fn from_dataframe(data: DataFrame, target: Series)
    -> io::Result<Self>
{
    let (n_sample, n_feature) = data.shape();

    // Report an unusable target as an error instead of panicking:
    // the caller already has to handle `io::Result`.
    let target = target.f64()
        .map_err(|e| io::Error::new(
            io::ErrorKind::InvalidData,
            format!("The target is not a dtype f64: {e}"),
        ))?
        .into_iter()
        .collect::<Option<Vec<_>>>()
        .ok_or_else(|| io::Error::new(
            io::ErrorKind::InvalidData,
            "The target contains null values",
        ))?;
    let target = RefCell::new(target);

    // Columns are independent of each other, so convert them in parallel.
    let features = data.get_columns()
        .into_par_iter()
        .map(|series|
            Feature::Dense(DenseFeature::from_series(series))
        )
        .collect::<Vec<_>>();

    let name_to_index = features.iter()
        .enumerate()
        .map(|(i, f)| (f.name().to_string(), i))
        .collect::<HashMap<_, _>>();

    let sample = Self {
        name_to_index, features, target, n_sample, n_feature,
    };
    Ok(sample)
}
/// Reads a comma-separated file of floating point values into a `Sample`
/// made of dense features.
///
/// When `has_header` is `true`, the first line supplies the feature names
/// (taken verbatim between the commas). Otherwise the features are named
/// `Feat. [1]`, `Feat. [2]`, ... according to the width of the first data
/// row. Lines that contain only whitespace are skipped.
///
/// The returned sample has an empty target; use `set_target` to promote
/// one of the columns.
///
/// # Errors
/// Returns any I/O error raised while opening or reading the file, and an
/// error of kind `io::ErrorKind::InvalidData` when
/// - `has_header` is `true` but the file has no lines,
/// - a cell cannot be parsed as `f64`, or
/// - a row does not have as many cells as there are features.
pub fn from_csv<P>(file: P, has_header: bool) -> io::Result<Self>
    where P: AsRef<Path>,
{
    // Shorthand for the "malformed content" error used below.
    fn invalid_data(msg: String) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, msg)
    }

    // Parses one comma-separated row into `f64` values.
    fn parse_row(line: &str, line_no: usize) -> io::Result<Vec<f64>> {
        line.split(',')
            .map(|x| {
                x.trim().parse::<f64>().map_err(|e| invalid_data(
                    format!("line {line_no}: cannot parse {x:?} as f64: {e}")
                ))
            })
            .collect()
    }

    let file = File::open(file)?;
    let mut lines = BufReader::new(file).lines();

    // 1-based number of the line being processed, for error messages.
    let mut line_no = 0_usize;
    let mut features = Vec::new();
    if has_header {
        let header = match lines.next() {
            Some(line) => line?,
            None => {
                return Err(invalid_data("missing header line".to_string()));
            },
        };
        line_no += 1;
        features = header.split(',')
            .map(|name| DenseFeature::new(name))
            .collect::<Vec<_>>();
    }

    // Without a header, the columns are created from the first data row.
    let mut initialized = has_header;
    let mut n_sample = 0_usize;
    for line in lines {
        line_no += 1;
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }

        let xs = parse_row(&line, line_no)?;
        if !initialized {
            features = (1..=xs.len())
                .map(|i| DenseFeature::new(format!("Feat. [{i}]")))
                .collect::<Vec<_>>();
            initialized = true;
        }

        // A ragged row would leave the features with different lengths
        // (or index out of bounds), so reject it up front.
        if xs.len() != features.len() {
            return Err(invalid_data(format!(
                "line {line_no}: expected {} values but found {}",
                features.len(), xs.len(),
            )));
        }

        for (feat, x) in features.iter_mut().zip(xs) {
            feat.append(x);
        }
        n_sample += 1;
    }

    let features = features.into_iter()
        .map(Feature::Dense)
        .collect::<Vec<_>>();
    let n_feature = features.len();

    // A CSV file does not designate a target column.
    let target = RefCell::new(Vec::new());

    let name_to_index = features.iter()
        .enumerate()
        .map(|(i, f)| (f.name().to_string(), i))
        .collect::<HashMap<_, _>>();

    let sample = Self {
        name_to_index, features, target, n_sample, n_feature,
    };
    Ok(sample)
}
pub fn target(&self) -> Ref<'_, [f64]> {
Ref::map(self.target.borrow(), |x| &x[..])
}
pub fn target_mut(&self) -> RefMut<'_, [f64]> {
RefMut::map(self.target.borrow_mut(), |x| &mut x[..])
}
/// Returns all features of this sample as a slice.
pub fn features(&self) -> &[Feature] {
    self.features.as_slice()
}
/// Turns the feature named `target` into the target of this sample.
///
/// The feature is removed from the feature list, its values replace the
/// current target, and the name lookup table is rebuilt for the remaining
/// features.
///
/// # Panics
/// Panics when no feature is named `target`.
pub fn set_target<S: AsRef<str>>(mut self, target: S) -> Self {
    let wanted = target.as_ref();
    let column = self.features.iter()
        .position(|feat| feat.name() == wanted)
        .expect("The target class does not exist");

    let extracted = self.features.remove(column);
    self.target = RefCell::new(extracted.into_target());
    self.n_feature -= 1;

    // Every feature after `column` moved one slot to the left,
    // so the whole lookup table is recomputed.
    let mut lookup = HashMap::with_capacity(self.features.len());
    for (i, feat) in self.features.iter().enumerate() {
        lookup.insert(feat.name().to_string(), i);
    }
    self.name_to_index = lookup;

    self
}
/// Reads a file in the SVMlight format into a `Sample` made of sparse
/// features.
///
/// Each non-empty line is `<label> <index>:<value> ...`. Everything after
/// a `#` is treated as a comment, and lines without any content are
/// skipped. The index read from the file is used directly as the
/// zero-based position of the feature; the feature at position `p` is
/// named `Feat. [p + 1]`. Features that received no value at all are
/// dropped before returning.
///
/// # Errors
/// Returns any I/O error raised while opening or reading the file, and an
/// error of kind `io::ErrorKind::InvalidData` when a label cannot be
/// parsed as `f64`.
///
/// # Panics
/// Panics when an `<index>:<value>` token is malformed, because the token
/// is parsed by `index_and_feature`.
pub fn from_svmlight<P: AsRef<Path>>(file: P) -> io::Result<Self> {
    let mut features = Vec::new();
    let mut target = Vec::new();
    let mut n_sample = 0_usize;

    let file = File::open(file)?;
    let lines = BufReader::new(file).lines();
    for (line_idx, line) in lines.enumerate() {
        let line = line?;

        // Everything after `#` is a comment in the SVMlight format.
        let content = line.split('#').next().unwrap_or("");
        let mut words = content.split_whitespace();

        // Blank or comment-only lines carry no example.
        let label = match words.next() {
            Some(label) => label,
            None => continue,
        };
        let y = label.parse::<f64>().map_err(|e| io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "line {}: cannot parse label {label:?} as f64: {e}",
                line_idx + 1,
            ),
        ))?;
        target.push(y);

        for word in words {
            let (i, x) = index_and_feature(word);
            // Grow the feature list until position `i` exists.
            while features.len() <= i {
                let k = features.len() + 1;
                let name = format!("Feat. [{k}]");
                features.push(SparseFeature::new(name));
            }
            features[i].append((n_sample, x));
        }
        n_sample += 1;
    }

    let target = RefCell::new(target);
    let n_feature = features.len();

    // The total number of examples is only known after the whole file
    // has been read, so it is stored into every feature here.
    let features = features.into_iter()
        .map(|mut feat| {
            feat.n_sample = n_sample;
            Feature::Sparse(feat)
        })
        .collect::<Vec<_>>();

    let name_to_index = features.iter()
        .enumerate()
        .map(|(i, f)| (f.name().to_string(), i))
        .collect::<HashMap<_, _>>();

    let mut sample = Self {
        name_to_index, features, target, n_sample, n_feature,
    };
    sample.remove_allzero_features();
    Ok(sample)
}
fn remove_allzero_features(&mut self) {
let features = mem::replace(&mut self.features, vec![]);
self.name_to_index = features.iter()
.filter_map(|feat| {
if feat.len() > 0 {
Some(feat.name().to_string())
} else {
None
}
})
.enumerate()
.map(|(i, name)| (name, i))
.collect();
self.features = features.into_iter()
.filter(|feat| feat.len() > 0)
.collect();
self.n_feature = self.features.len();
}
/// Returns `(number of examples, number of features)`.
pub fn shape(&self) -> (usize, usize) {
    let rows = self.n_sample;
    let cols = self.n_feature;
    (rows, cols)
}
/// Renames the features to `names`, in order, and returns the names they
/// had before.
///
/// The name lookup table is rebuilt from the new names.
///
/// # Panics
/// Panics when the number of names differs from the number of features
/// reported by `shape`.
pub fn replace_names<S, T>(&mut self, names: T) -> Vec<String>
    where S: ToString + std::fmt::Display,
          T: AsRef<[S]>,
{
    let names = names.as_ref();
    let (_, n_features) = self.shape();
    assert!(
        n_features == names.len(),
        "The number of names is not equals to the one of `self.features`"
    );

    let previous: Vec<String> = self.features.iter_mut()
        .zip(names)
        .map(|(feature, name)| feature.replace_name(name))
        .collect();

    self.name_to_index = self.features.iter()
        .enumerate()
        .map(|(i, feature)| (feature.name().to_string(), i))
        .collect();

    previous
}
/// Returns the `idx`-th example as a pair of its feature values (in
/// feature order) and its target value.
///
/// # Panics
/// Panics when `idx` is out of bounds for the target.
pub fn at(&self, idx: usize) -> (Vec<f64>, f64) {
    let mut x = Vec::with_capacity(self.features.len());
    for feat in &self.features {
        x.push(feat[idx]);
    }

    let y = {
        let target = self.target.borrow();
        target[idx]
    };

    (x, y)
}
}
/// Splits an `index:value` token into its index and its value.
///
/// Both parts are trimmed before being parsed, and anything after a
/// second `:` is ignored.
///
/// # Panics
/// Panics when the token has no `:`-separated value, when the index is
/// not a valid `usize`, or when the value is not a valid `f64`.
pub(self) fn index_and_feature(word: &str) -> (usize, f64) {
    let mut parts = word.split(':').map(str::trim);
    let index = parts.next().unwrap().parse::<usize>().unwrap();
    let value = parts.next().unwrap().parse::<f64>().unwrap();
    (index, value)
}
/// Looks a feature up by its name, e.g. `sample["name"]`.
///
/// # Panics
/// Panics when no feature has the given name.
impl<S> Index<S> for Sample
    where S: AsRef<str>
{
    type Output = Feature;

    fn index(&self, name: S) -> &Self::Output {
        let key: &str = name.as_ref();
        let position = self.name_to_index
            .get(key)
            .copied()
            .unwrap();
        &self.features[position]
    }
}