use std::path::Path;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::collections::{HashMap, HashSet};
use std::ops::Index;
use std::mem;
use polars::prelude::*;
use rayon::prelude::*;
use super::feature_struct::*;
/// A data set stored column-wise: one `Feature` per column plus a vector
/// of target values.
#[derive(Debug,Clone)]
pub struct Sample {
/// Maps a feature name to the position of that feature in `features`.
pub(super) name_to_index: HashMap<String, usize>,
/// The feature columns.
pub(super) features: Vec<Feature>,
/// The target values. `from_csv` leaves this empty; `set_target` fills it
/// from one of the feature columns.
pub(super) target: Vec<f64>,
/// The number of examples (rows).
pub(super) n_sample: usize,
/// The number of feature columns.
pub(super) n_feature: usize,
}
impl Sample {
/// Builds a placeholder sample with `n_sample` targets and a single
/// sparse feature named `"dummy"`.
///
/// The first `n_sample / 2` targets are `1.0` and the rest are `-1.0`.
pub fn dummy(n_sample: usize) -> Self {
    let n_positive = n_sample / 2;
    let target = (0..n_sample)
        .map(|row| if row < n_positive { 1f64 } else { -1f64 })
        .collect::<Vec<_>>();

    let features = vec![Feature::new_sparse("dummy")];

    let mut name_to_index = HashMap::new();
    name_to_index.insert("dummy".to_string(), 0);

    Self {
        name_to_index,
        features,
        target,
        n_sample,
        n_feature: 1usize,
    }
}
pub(super) fn from_csv<P>(file: P, mut has_header: bool)
-> io::Result<Self>
where P: AsRef<Path>,
{
let file = File::open(file)?;
let mut lines = BufReader::new(file).lines();
let mut features = Vec::new();
if has_header {
let line = lines.next().unwrap();
features = line?.split(',')
.map(DenseFeature::new)
.collect::<Vec<_>>();
}
let mut n_sample = 0_usize;
for (i, line) in lines.enumerate() {
let line = line?;
if !has_header {
let xs = line.split(',')
.map(|x| {
x.trim().parse::<f64>()
.unwrap_or_else(|_| {
panic!(
"The file contains non-numerical value. \
Got {x} in Line {i}"
)
})
})
.collect::<Vec<_>>();
let n_feature = xs.len();
features = (1..=n_feature).map(|i| {
let name = format!("Feat. [{i}]");
DenseFeature::new(name)
})
.collect::<Vec<_>>();
for (feat, x) in features.iter_mut().zip(xs) {
feat.append(x);
}
has_header = true;
n_sample += 1;
continue;
}
line.split(',')
.map(|x| x.trim().parse::<f64>().unwrap())
.enumerate()
.for_each(|(i, x)| {
features[i].append(x);
});
n_sample += 1;
}
let features = features.into_par_iter()
.map(Feature::Dense)
.collect::<Vec<_>>();
let n_feature = features.len();
let target = Vec::with_capacity(0);
let name_to_index = features.iter()
.enumerate()
.map(|(i, f)| (f.name().to_string(), i))
.collect::<HashMap<_, _>>();
let sample = Self {
name_to_index, features, target, n_sample, n_feature,
};
Ok(sample)
}
/// Builds a `Sample` from a polars `DataFrame` (the features) and a
/// `Series` (the target).
///
/// Every column of `data` becomes a dense feature via
/// `DenseFeature::from_series`.
///
/// # Errors
/// Returns an `io::Error` of kind `InvalidInput` when
/// - `target` is not of dtype `f64`,
/// - `target` contains a null value, or
/// - the length of `target` differs from the number of rows of `data`.
pub fn from_dataframe(data: DataFrame, target: Series)
    -> io::Result<Self>
{
    let (n_sample, n_feature) = data.shape();

    let target = target.f64()
        .map_err(|_| io::Error::new(
            io::ErrorKind::InvalidInput,
            "The target is not a dtype f64",
        ))?
        .into_iter()
        .collect::<Option<Vec<_>>>()
        .ok_or_else(|| io::Error::new(
            io::ErrorKind::InvalidInput,
            "The target contains null values",
        ))?;

    // A mismatch here would otherwise surface much later as an
    // out-of-bounds access or a "target is not specified" panic.
    if target.len() != n_sample {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!(
                "The target has {} values, but the data has {} rows",
                target.len(),
                n_sample,
            ),
        ));
    }

    let features = data.get_columns()
        .into_par_iter()
        .map(|series|
            Feature::Dense(DenseFeature::from_series(series))
        )
        .collect::<Vec<_>>();
    let name_to_index = features.iter()
        .enumerate()
        .map(|(i, f)| (f.name().to_string(), i))
        .collect::<HashMap<_, _>>();

    Ok(Self {
        name_to_index, features, target, n_sample, n_feature,
    })
}
/// Returns the target values as a slice.
pub fn target(&self) -> &[f64] {
    self.target.as_slice()
}
/// Returns the distinct target values in ascending order.
///
/// Sorting uses `f64::total_cmp`, so a `NaN` among the targets no longer
/// causes a panic; `NaN`s are ordered by `total_cmp` and collapsed into
/// a single entry.
pub fn unique_target(&self) -> Vec<f64> {
    let mut target = self.target().to_vec();
    target.sort_unstable_by(|a, b| a.total_cmp(b));
    // `NaN != NaN`, so plain `dedup` would keep every NaN.
    target.dedup_by(|a, b| *a == *b || (a.is_nan() && b.is_nan()));
    target
}
/// Returns the feature columns as a slice.
pub fn features(&self) -> &[Feature] {
    self.features.as_slice()
}
/// Moves the feature named `target` out of the feature list and uses it
/// as the target vector (via `Feature::into_target`).
///
/// The feature count is decremented and the name-to-index map is rebuilt
/// for the remaining features.
///
/// # Panics
/// Panics if no feature has the given name.
pub fn set_target<S: AsRef<str>>(mut self, target: S) -> Self {
    let wanted = target.as_ref();
    let column = self.features.iter()
        .enumerate()
        .find(|(_, feat)| feat.name() == wanted)
        .map(|(k, _)| k)
        .expect("The target class does not exist");

    let extracted = self.features.remove(column);
    self.target = extracted.into_target();
    self.n_feature -= 1;

    // Positions after `column` shifted by one, so rebuild the whole map.
    let mut name_to_index = HashMap::with_capacity(self.features.len());
    for (k, feat) in self.features.iter().enumerate() {
        name_to_index.insert(feat.name().to_string(), k);
    }
    self.name_to_index = name_to_index;

    self
}
pub(super) fn from_svmlight<P: AsRef<Path>>(file: P)
-> io::Result<Self>
{
let mut features = Vec::new();
let mut target = Vec::new();
let mut n_sample = 0_usize;
let file = File::open(file)?;
let lines = BufReader::new(file).lines();
for line in lines {
let line = line?;
let mut words = line.split_whitespace();
let y = words.next()
.unwrap()
.trim()
.parse::<f64>()
.expect("Failed to parse the target value.");
target.push(y);
for word in words {
let (i, x) = index_and_feature(word);
while features.len() <= i {
let k = features.len() + 1;
let name = format!("Feat. [{k}]");
features.push(SparseFeature::new(name));
}
features[i].append((n_sample, x));
}
n_sample += 1;
}
let n_feature = features.len();
let features = features.into_iter()
.map(|mut feat| {
feat.n_sample = n_sample;
Feature::Sparse(feat)
})
.collect::<Vec<_>>();
let name_to_index = features.iter()
.enumerate()
.map(|(i, f)| (f.name().to_string(), i))
.collect::<HashMap<_, _>>();
let mut sample = Self {
name_to_index, features, target, n_sample, n_feature,
};
sample.remove_allzero_features();
Ok(sample)
}
fn remove_allzero_features(&mut self) {
let features = mem::take(&mut self.features);
self.name_to_index = features.iter()
.filter_map(|feat| {
if feat.is_empty() {
None
} else {
Some(feat.name().to_string())
}
})
.enumerate()
.map(|(i, name)| (name, i))
.collect();
self.features = features.into_iter()
.filter(|feat| !feat.is_empty())
.collect();
self.n_feature = self.features.len();
}
pub fn shape(&self) -> (usize, usize) {
(self.n_sample, self.n_feature)
}
pub fn replace_names<S, T>(&mut self, names: T) -> Vec<String>
where S: ToString + std::fmt::Display,
T: AsRef<[S]>,
{
let names = names.as_ref();
let n_features = self.shape().1;
let n_names = names.len();
assert_eq!(
n_names, n_features,
"The number of names is \
not equal to the one of `self.features.`"
);
let old_names = names.iter()
.zip(&mut self.features[..])
.map(|(name, feature)| feature.replace_name(name))
.collect();
self.name_to_index = self.features.iter()
.map(|feature| feature.name().to_string())
.enumerate()
.map(|(i, name)| (name, i))
.collect();
old_names
}
/// Returns the `idx`-th example as `(feature values, target value)`.
///
/// # Panics
/// Panics if `idx` is out of range for a feature or for the target.
pub fn at(&self, idx: usize) -> (Vec<f64>, f64) {
    let mut row = Vec::with_capacity(self.features.len());
    for feat in &self.features {
        row.push(feat[idx]);
    }
    let label = self.target[idx];
    (row, label)
}
/// Checks that the target vector has one entry per example.
///
/// # Panics
/// Panics if the target length differs from the number of examples.
fn target_is_specified(&self) {
    let (n_sample, _) = self.shape();
    if self.target.len() == n_sample {
        return;
    }
    panic!(
        "The target class is not specified.\n\
         Use `Sample::set_target(\"Column Name\")`."
    );
}
/// Checks that the sample is usable as a binary classification instance.
///
/// Prints a warning to stdout when the two labels are not `1` and `-1`.
///
/// # Panics
/// Panics if
/// - the target is not specified,
/// - some target value is not integral, or
/// - the targets (cast to `i32`) do not take exactly two distinct values.
pub fn is_valid_binary_instance(&self) {
    self.target_is_specified();

    // Only the first five offenders are ever shown in the message.
    let examples = self.target.iter()
        .filter(|yi| yi.trunc() != **yi)
        .take(5)
        .map(|yi| yi.to_string())
        .collect::<Vec<_>>();
    if !examples.is_empty() {
        let line = examples.join(", ");
        panic!(
            "Target values are non-integer types.\n\
             Ex. [{line}, ...]."
        );
    }

    let mut labels = HashSet::new();
    for &yi in &self.target {
        labels.insert(yi as i32);
    }

    let n_label = labels.len();
    match n_label {
        2 => {},
        _ if n_label > 2 => panic!(
            "The target values take more than 2 kinds. \
             Expected 2 kinds, got {n_label} kinds."
        ),
        _ => panic!(
            "The target values take less than 2 kinds. \
             Expected 2 kinds, got {n_label} kinds."
        ),
    }

    let has_other_label = labels.iter().any(|y| *y != 1 && *y != -1);
    if has_other_label {
        let line = labels.iter()
            .map(|y| y.to_string())
            .collect::<Vec<_>>()
            .join(", ");
        println!(
            "Warning: the target values take values not in [-1.0, 1.0].\n\
             Currently, the labels are: [{line}]."
        );
    }
}
/// Calls `Feature::weighted_mean_and_variance` with `weight` on every
/// feature (in parallel) and returns the results in feature order.
pub fn weighted_mean_and_variance<T>(&self, weight: T)
    -> Vec<(f64, f64)>
    where T: AsRef<[f64]>
{
    let dist = weight.as_ref();
    let per_feature = self.features()
        .par_iter()
        .map(|column| column.weighted_mean_and_variance(dist));
    per_feature.collect()
}
/// Calls `Feature::weighted_mean` with `weight` on every feature
/// (in parallel) and returns the results in feature order.
pub fn weighted_mean<T>(&self, weight: T) -> Vec<f64>
    where T: AsRef<[f64]>
{
    let dist = weight.as_ref();
    let per_feature = self.features()
        .par_iter()
        .map(|column| column.weighted_mean(dist));
    per_feature.collect()
}
/// Calls `Feature::weighted_mean_for_label` with the label `y`, the
/// target vector and `weight` on every feature (in parallel), and
/// returns the results in feature order.
pub fn weighted_mean_for_label<T>(
    &self,
    y: f64,
    weight: T
) -> Vec<f64>
    where T: AsRef<[f64]>
{
    let dist = weight.as_ref();
    let labels = self.target();
    let per_feature = self.features()
        .par_iter()
        .map(|column| column.weighted_mean_for_label(y, labels, dist));
    per_feature.collect()
}
/// Calls `Feature::weighted_mean_and_variance_for_label` with the label
/// `y`, the target vector and `weight` on every feature (in parallel),
/// and returns the results in feature order.
pub fn weighted_mean_and_variance_for_label<T>(
    &self,
    y: f64,
    weight: T
) -> Vec<(f64, f64)>
    where T: AsRef<[f64]>
{
    let dist = weight.as_ref();
    let labels = self.target();
    let per_feature = self.features()
        .par_iter()
        .map(|column| {
            column.weighted_mean_and_variance_for_label(y, labels, dist)
        });
    per_feature.collect()
}
/// Appends one example: each value of `feat` is passed, together with
/// `row`, to the `append` method of the feature at the same position,
/// and `y` is pushed onto the target vector.
fn append(&mut self, row: usize, feat: Vec<f64>, y: f64) {
    let columns = self.features.par_iter_mut();
    columns.zip(feat)
        .for_each(|(column, value)| {
            column.append(row, value);
        });
    self.target.push(y);
}
pub(crate) fn split<T>(&self, ix: T, start: usize, end: usize)
-> (Sample, Sample)
where T: AsRef<[usize]>
{
let n_feature = self.features.len();
let test_size = end - start;
let train_size = self.n_sample - test_size;
let ix = ix.as_ref();
let name_to_ix = self.name_to_index.clone();
let mut train = Self {
n_sample: train_size,
n_feature: n_feature,
name_to_index: name_to_ix.clone(),
features: vec![Feature::new_sparse("dummy"); n_feature],
target: Vec::with_capacity(train_size),
};
let mut test = Self {
n_sample: test_size,
n_feature: n_feature,
name_to_index: name_to_ix,
features: vec![Feature::new_sparse("dummy"); n_feature],
target: Vec::with_capacity(test_size),
};
for (name, &i) in self.name_to_index.iter() {
if self.features[i].is_sparse() {
train.features[i] = Feature::new_sparse(name.to_string());
test.features[i] = Feature::new_sparse(name.to_string());
train.features[i].set_n_sample(train_size);
test.features[i].set_n_sample(test_size);
} else {
train.features[i] = Feature::new_dense(name.to_string());
test.features[i] = Feature::new_dense(name.to_string());
}
}
for i in 0..start {
let ii = ix[i];
let (x, y) = self.at(ii);
train.append(i, x, y);
}
for i in start..end {
let ii = ix[i];
let (x, y) = self.at(ii);
test.append(i, x, y);
}
for i in end..self.n_sample {
let ii = ix[i];
let (x, y) = self.at(ii);
train.append(i, x, y);
}
(train, test)
}
}
/// Parses a token of the form `index:value` into `(index, value)`.
///
/// Both parts are trimmed before parsing; anything after a second `:`
/// is ignored.
///
/// # Panics
/// Panics if the token has no `:`, if the index is not a `usize`, or if
/// the value is not an `f64`.
pub(self) fn index_and_feature(word: &str) -> (usize, f64) {
    let mut parts = word.split(':').map(str::trim);

    let index = parts.next()
        .unwrap()
        .parse::<usize>()
        .expect("Failed to parse an index.");
    let value = parts.next()
        .unwrap()
        .parse::<f64>()
        .expect("Failed to parse a feature value.");

    (index, value)
}
/// Looks up a feature by its name.
///
/// # Panics
/// Panics if no feature with the given name is registered.
impl<S> Index<S> for Sample
    where S: AsRef<str>
{
    type Output = Feature;

    fn index(&self, name: S) -> &Self::Output {
        let key: &str = name.as_ref();
        let position = self.name_to_index
            .get(key)
            .copied()
            .unwrap();
        &self.features[position]
    }
}