use whitenoise_validator::errors::*;
use whitenoise_validator::{proto, Float, Integer};
use whitenoise_validator::base::{Value, Array, Jagged, ReleaseNode, IndexKey};
use whitenoise_validator::utilities::{take_argument, standardize_numeric_argument};
use ndarray::{ArrayD, Axis};
use crate::NodeArguments;
use crate::components::Evaluable;
use crate::utilities::create_subset;
use crate::components::impute::{impute_float_gaussian, impute_float_uniform, impute_categorical};
use crate::utilities::get_num_columns;
use whitenoise_validator::utilities::array::{slow_select, slow_stack};
use std::cmp::Ordering;
use crate::utilities::noise::sample_uniform_int;
use ndarray::prelude::*;
use std::hash::Hash;
/// Runtime evaluation of the Resize component: pads or subsamples the data to a
/// requested number of rows and/or columns, dispatching on the argument types to
/// categorical (category-sampling) or continuous (uniform/gaussian) imputation.
impl Evaluable for proto::Resize {
fn evaluate(&self, privacy_definition: &Option<proto::PrivacyDefinition>, mut arguments: NodeArguments) -> Result<ReleaseNode> {
// constant-time sampling flag comes from the privacy definition; defaults to off when absent
let enforce_constant_time = privacy_definition.as_ref()
.map(|v| v.protect_elapsed_time).unwrap_or(false);
// each of these size arguments is optional; first_int pulls a scalar out of the array value
let mut number_rows = arguments.remove::<IndexKey>(&"number_rows".into())
.and_then(|v| v.array().ok()?.first_int().ok()).map(|v| v as i64);
let number_cols = arguments.remove::<IndexKey>(&"number_columns".into())
.and_then(|v| v.array().ok()?.first_int().ok()).map(|v| v as i64);
let minimum_rows = arguments.remove::<IndexKey>(&"minimum_rows".into())
.and_then(|v| v.array().ok()?.first_int().ok()).map(|v| v as i64);
// a supplied minimum becomes the row target; the resize_* helpers decide whether
// the data already satisfies it and may return the data unchanged
if let Some(minimum_rows) = minimum_rows {
number_rows = Some(minimum_rows)
}
// presence of "categories" selects the categorical resize path
if arguments.contains_key::<IndexKey>(&"categories".into()) {
// optional per-category sampling weights (jagged float matrix)
let weights = take_argument(&mut arguments, "weights")
.and_then(|v| v.jagged()).and_then(|v| v.float()).ok();
// data must be an array and categories a jagged matrix of the same scalar type
match (take_argument(&mut arguments, "data")?, take_argument(&mut arguments, "categories")?) {
(Value::Array(data), Value::Jagged(categories)) =>
Ok(match (data, categories) {
(Array::Float(_), Jagged::Float(_)) =>
return Err("categorical resizing over floats in not currently supported- try continuous imputation instead".into()),
(Array::Int(data), Jagged::Int(categories)) =>
resize_categorical(
data, number_rows, number_cols, categories, weights,
minimum_rows, enforce_constant_time)?.into(),
(Array::Bool(data), Jagged::Bool(categories)) =>
resize_categorical(
data, number_rows, number_cols, categories, weights,
minimum_rows, enforce_constant_time)?.into(),
(Array::Str(data), Jagged::Str(categories)) =>
resize_categorical(
data, number_rows, number_cols, categories, weights,
minimum_rows, enforce_constant_time)?.into(),
_ => return Err("types of data, categories, and nulls must be homogeneous, weights must be f64".into())
}),
_ => return Err("data and nulls must be arrays, categories must be a jagged matrix".into())
}
}
// continuous path: synthetic records are imputed within [lower, upper]
else {
match (
take_argument(&mut arguments, "data")?.array()?,
take_argument(&mut arguments, "lower")?.array()?,
take_argument(&mut arguments, "upper")?.array()?
) {
(Array::Float(data), Array::Float(lower), Array::Float(upper)) => {
// distribution defaults to "uniform" when the argument is absent
let distribution = match take_argument(&mut arguments, "distribution") {
Ok(distribution) => distribution.array()?.first_string()?,
Err(_) => "uniform".to_string()
};
// shift/scale are only required by gaussian imputation; validated downstream
let shift = match take_argument(&mut arguments, "shift") {
Ok(shift) => Some(shift.array()?.float()?),
Err(_) => None
};
let scale = match take_argument(&mut arguments, "scale") {
Ok(scale) => Some(scale.array()?.float()?),
Err(_) => None
};
Ok(resize_float(
data, number_rows, number_cols, &distribution,
lower, upper, shift, scale, minimum_rows,
enforce_constant_time)?.into())
}
// integer data is resized with uniform integer draws between the bounds
(Array::Int(data), Array::Int(lower), Array::Int(upper)) =>
Ok(resize_integer(
data, number_rows, number_cols,
lower, upper, minimum_rows,
enforce_constant_time)?.into()),
_ => Err("data, lower, and upper must be of a homogeneous numeric type".into())
}
}.map(ReleaseNode::new)
}
}
pub fn resize_float(
mut data: ArrayD<Float>,
number_rows: Option<i64>,
number_cols: Option<i64>,
distribution: &str,
lower: ArrayD<Float>, upper: ArrayD<Float>,
shift: Option<ArrayD<Float>>, scale: Option<ArrayD<Float>>,
minimum_rows: Option<i64>,
enforce_constant_time: bool
) -> Result<ArrayD<Float>> {
if let Some(number_cols) = number_cols {
let real_n = get_num_columns(&data)?;
data = match real_n.cmp(&number_cols) {
Ordering::Equal =>
data,
Ordering::Less => {
let mut synthetic_shape = data.shape().to_vec();
synthetic_shape[1] = (number_cols - real_n) as usize;
let synthetic_base = ndarray::ArrayD::from_elem(synthetic_shape, Float::NAN).into_dyn();
let synthetic = match distribution.to_lowercase().as_str() {
"uniform" => impute_float_uniform(synthetic_base, lower.clone(), upper.clone(), enforce_constant_time),
"gaussian" => impute_float_gaussian(
synthetic_base, lower.clone(), upper.clone(),
shift.clone().ok_or_else(|| Error::from("shift must be defined for gaussian imputation"))?,
scale.clone().ok_or_else(|| Error::from("scale must be defined for gaussian imputation"))?,
enforce_constant_time),
_ => Err("unrecognized distribution".into())
}?;
match ndarray::stack(Axis(1), &[data.view(), synthetic.view()]) {
Ok(value) => value,
Err(_) => return Err("failed to stack real and synthetic data".into())
}
}
Ordering::Greater =>
data.select(Axis(1), &create_sampling_indices(number_cols, real_n, enforce_constant_time)?)
}
}
if let Some(number_rows) = number_rows {
let real_n: i64 = data.len_of(Axis(0)) as i64;
if let Some(minimum_rows) = minimum_rows {
if minimum_rows > real_n {
return Ok(data)
}
}
data = match real_n.cmp(&number_rows) {
Ordering::Equal =>
data,
Ordering::Less => {
let mut synthetic_shape = data.shape().to_vec();
synthetic_shape[0] = (number_rows - real_n) as usize;
let synthetic_base = ndarray::ArrayD::from_elem(synthetic_shape, Float::NAN).into_dyn();
let synthetic = match distribution.to_lowercase().as_str() {
"uniform" => impute_float_uniform(synthetic_base, lower, upper, enforce_constant_time),
"gaussian" => impute_float_gaussian(
synthetic_base, lower, upper,
shift.ok_or_else(|| Error::from("shift must be defined for gaussian imputation"))?,
scale.ok_or_else(|| Error::from("scale must be defined for gaussian imputation"))?,
enforce_constant_time),
_ => Err("unrecognized distribution".into())
}?;
match ndarray::stack(Axis(0), &[data.view(), synthetic.view()]) {
Ok(value) => value,
Err(_) => return Err("failed to stack real and synthetic data".into())
}
}
Ordering::Greater =>
data.select(Axis(0), &create_sampling_indices(number_rows, real_n, enforce_constant_time)?)
}
}
Ok(data)
}
/// Resizes integer data to a target number of rows and/or columns.
///
/// When the data is smaller than the target along an axis, synthetic entries are
/// drawn uniformly from `[lower, upper]` and appended; when larger, a uniform
/// random subsample along that axis is retained.
///
/// # Arguments
/// * `data` - data to resize
/// * `number_rows` - target number of rows, if resizing rows
/// * `number_cols` - target number of columns, if resizing columns
/// * `lower` - per-column lower bounds on synthesized values
/// * `upper` - per-column upper bounds on synthesized values
/// * `minimum_rows` - when set, only pad data that falls short of this row count;
///   data already meeting the minimum is returned unchanged
/// * `enforce_constant_time` - whether subsampling should run in constant time
///
/// # Returns
/// The resized data.
pub fn resize_integer(
    mut data: ArrayD<Integer>,
    number_rows: Option<i64>,
    number_cols: Option<i64>,
    lower: ArrayD<Integer>, upper: ArrayD<Integer>,
    minimum_rows: Option<i64>,
    enforce_constant_time: bool
) -> Result<ArrayD<Integer>> {
    if let Some(number_cols) = number_cols {
        let real_n = get_num_columns(&data)?;
        data = match real_n.cmp(&number_cols) {
            // already the requested width: no-op
            Ordering::Equal =>
                data,
            // too narrow: synthesize the missing columns
            Ordering::Less => {
                let mut synthetic_shape = data.shape().to_vec();
                synthetic_shape[1] = (number_cols - real_n) as usize;
                // one (lower, upper) bound pair per synthetic column
                let lower = standardize_numeric_argument(lower.clone(), number_cols - real_n)?
                    .into_dimensionality::<Ix1>()?.to_vec();
                let upper = standardize_numeric_argument(upper.clone(), number_cols - real_n)?
                    .into_dimensionality::<Ix1>()?.to_vec();
                let mut synthetic = ndarray::ArrayD::zeros(synthetic_shape);
                // fill each synthetic column with uniform integer draws from [min, max]
                synthetic.gencolumns_mut().into_iter().zip(lower.into_iter().zip(upper.into_iter()))
                    .map(|(mut column, (min, max))| column.iter_mut()
                        .map(|v| {
                            *v = sample_uniform_int(min, max)?;
                            Ok(())
                        })
                        .collect::<Result<_>>())
                    .collect::<Result<_>>()?;
                // BUG FIX: synthetic columns must be appended along the column axis
                // (Axis(1)); stacking along Axis(0) always fails here because the
                // real and synthetic blocks differ in column count
                match ndarray::stack(Axis(1), &[data.view(), synthetic.view()]) {
                    Ok(value) => value,
                    Err(_) => return Err("failed to stack real and synthetic data".into())
                }
            }
            // too wide: keep a uniform random subsample of the columns
            Ordering::Greater =>
                data.select(Axis(1), &create_sampling_indices(number_cols, real_n, enforce_constant_time)?)
        }
    }
    if let Some(number_rows) = number_rows {
        let real_n = data.len_of(Axis(0)) as i64;
        if let Some(minimum_rows) = minimum_rows {
            // BUG FIX: a minimum row count only pads undersized data. If the data
            // already meets the minimum, return it unchanged. (The previous comparison
            // was inverted: it subsampled compliant data down to the minimum and left
            // undersized data unpadded.)
            if minimum_rows < real_n {
                return Ok(data)
            }
        }
        data = match real_n.cmp(&number_rows) {
            // already the requested length: no-op
            Ordering::Equal =>
                data,
            // too short: synthesize the missing rows
            Ordering::Less => {
                let mut synthetic_shape = data.shape().to_vec();
                synthetic_shape[0] = (number_rows - real_n) as usize;
                let num_columns = get_num_columns(&data)?;
                // one (lower, upper) bound pair per existing column
                let lower = standardize_numeric_argument(lower, num_columns)?
                    .into_dimensionality::<Ix1>()?.to_vec();
                let upper = standardize_numeric_argument(upper, num_columns)?
                    .into_dimensionality::<Ix1>()?.to_vec();
                let mut synthetic = ndarray::ArrayD::zeros(synthetic_shape);
                synthetic.gencolumns_mut().into_iter().zip(lower.into_iter().zip(upper.into_iter()))
                    .map(|(mut column, (min, max))| column.iter_mut()
                        .map(|v| {
                            *v = sample_uniform_int(min, max)?;
                            Ok(())
                        })
                        .collect::<Result<_>>())
                    .collect::<Result<_>>()?;
                match ndarray::stack(Axis(0), &[data.view(), synthetic.view()]) {
                    Ok(value) => value,
                    Err(_) => return Err("failed to stack real and synthetic data".into())
                }
            }
            // too long: keep a uniform random subsample of the rows
            Ordering::Greater =>
                data.select(Axis(0), &create_sampling_indices(number_rows, real_n, enforce_constant_time)?)
        }
    }
    Ok(data)
}
/// Resizes categorical data to a target number of rows and/or columns.
///
/// When the data is smaller than the target along an axis, synthetic entries are
/// sampled from `categories` (optionally weighted by `weights`) and appended;
/// when larger, a uniform random subsample along that axis is retained.
///
/// # Arguments
/// * `data` - data to resize
/// * `number_rows` - target number of rows, if resizing rows
/// * `number_cols` - target number of columns, if resizing columns
/// * `categories` - per-column sets of admissible category values
/// * `weights` - optional per-column sampling weights over the categories
/// * `minimum_rows` - when set, only pad data that falls short of this row count;
///   data already meeting the minimum is returned unchanged
/// * `enforce_constant_time` - whether sampling should run in constant time
///
/// # Returns
/// The resized data.
pub fn resize_categorical<T>(
    mut data: ArrayD<T>,
    number_rows: Option<i64>,
    number_cols: Option<i64>,
    categories: Vec<Vec<T>>,
    weights: Option<Vec<Vec<Float>>>,
    minimum_rows: Option<i64>,
    enforce_constant_time: bool
) -> Result<ArrayD<T>> where T: Clone, T: PartialEq, T: Default, T: Ord, T: Hash {
    if let Some(number_cols) = number_cols {
        let real_n = get_num_columns(&data)?;
        data = match real_n.cmp(&number_cols) {
            // already the requested width: no-op
            Ordering::Equal =>
                data,
            // too narrow: synthesize the missing columns
            Ordering::Less => {
                // BUG FIX: widening the data must grow the column axis (shape index 1);
                // the previous code grew shape index 0, silently appending rows instead
                let mut synthetic_shape = data.shape().to_vec();
                synthetic_shape[1] = (number_cols - real_n) as usize;
                let num_columns = get_num_columns(&data)?;
                // seed the synthetic block with T::default() so imputation treats
                // every cell as the null marker
                let mut synthetic = ndarray::Array::default(synthetic_shape).into_dyn();
                synthetic.gencolumns_mut().into_iter()
                    .for_each(|mut col| col.iter_mut()
                        .for_each(|v| *v = T::default()));
                // NOTE(review): null_value and categories are sized to the real data's
                // column count while the synthetic block has number_cols - real_n
                // columns — confirm impute_categorical's per-column expectations
                let null_value = (0..num_columns).map(|_| vec![T::default()]).collect::<Vec<Vec<T>>>();
                synthetic = impute_categorical(
                    synthetic, categories.clone(), weights.clone(), null_value, enforce_constant_time)?;
                // BUG FIX: append synthetic columns along Axis(1), not Axis(0)
                match slow_stack(Axis(1), &[data.view(), synthetic.view()]) {
                    Ok(value) => value,
                    Err(_) => return Err("failed to stack real and synthetic data".into())
                }
            }
            // too wide: keep a uniform random subsample of the columns
            // BUG FIX: column subsampling must select along Axis(1); the previous
            // Axis(0) selection dropped rows instead of columns
            Ordering::Greater =>
                slow_select(&data, Axis(1), &create_sampling_indices(number_cols, real_n, enforce_constant_time)?).to_owned(),
        }
    }
    if let Some(number_rows) = number_rows {
        let real_n: i64 = data.len_of(Axis(0)) as i64;
        if let Some(minimum_rows) = minimum_rows {
            // BUG FIX: a minimum row count only pads undersized data. If the data
            // already meets the minimum, return it unchanged. (The previous comparison
            // was inverted: it subsampled compliant data down to the minimum and left
            // undersized data unpadded.)
            if minimum_rows < real_n {
                return Ok(data)
            }
        }
        data = match real_n.cmp(&number_rows) {
            // already the requested length: no-op
            Ordering::Equal =>
                data,
            // too short: synthesize the missing rows
            Ordering::Less => {
                let mut synthetic_shape = data.shape().to_vec();
                synthetic_shape[0] = (number_rows - real_n) as usize;
                let num_columns = get_num_columns(&data)?;
                let mut synthetic = ndarray::Array::default(synthetic_shape).into_dyn();
                synthetic.gencolumns_mut().into_iter()
                    .for_each(|mut col| col.iter_mut()
                        .for_each(|v| *v = T::default()));
                let null_value = (0..num_columns).map(|_| vec![T::default()]).collect::<Vec<Vec<T>>>();
                synthetic = impute_categorical(
                    synthetic, categories, weights, null_value, enforce_constant_time)?;
                match slow_stack(Axis(0), &[data.view(), synthetic.view()]) {
                    Ok(value) => value,
                    Err(_) => return Err("failed to stack real and synthetic data".into())
                }
            }
            // too long: keep a uniform random subsample of the rows
            Ordering::Greater =>
                slow_select(&data, Axis(0), &create_sampling_indices(
                    number_rows, real_n, enforce_constant_time)?).to_owned(),
        }
    }
    Ok(data)
}
/// Draws `k` distinct indices uniformly at random from `0..n`.
///
/// All candidates receive equal weight, so the subset is a simple random sample.
///
/// # Returns
/// A vector of `k` sampled indices.
pub fn create_sampling_indices(k: i64, n: i64, enforce_constant_time: bool) -> Result<Vec<usize>> {
    // candidate pool: every index in 0..n, each with identical weight
    let candidates = (0..n as usize).collect::<Vec<usize>>();
    let uniform_weights = std::iter::repeat(1.).take(n as usize).collect::<Vec<f64>>();
    create_subset(&candidates, &uniform_weights, k as usize, enforce_constant_time)
}