use std::collections::BTreeSet;
use std::collections::HashMap;
use std::collections::HashSet;
use std::convert::TryInto;
use indexmap::IndexSet;
use rv::data::CategoricalSuffStat;
use rv::dist::Categorical;
use rv::dist::SymmetricDirichlet;
use serde::Deserialize;
use serde::Serialize;
use super::error::InsertDataError;
use crate::cc::feature::ColModel;
use crate::cc::feature::Column;
use crate::cc::feature::FType;
use crate::codebook::Codebook;
use crate::codebook::ColMetadataList;
use crate::codebook::ColType;
use crate::codebook::ValueMap;
use crate::codebook::ValueMapExtension;
use crate::codebook::ValueMapExtensionError;
use crate::data::Category;
use crate::data::Datum;
use crate::data::SparseContainer;
use crate::interface::HasCodebook;
use crate::ColumnIndex;
use crate::Engine;
use crate::HasStates;
use crate::OracleT;
use crate::RowIndex;
/// Governs whether an insert may write into cells of rows that already exist.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OverwriteMode {
/// Any existing cell, missing or present, may be overwritten.
Allow,
/// No existing cell may be overwritten, whether it is missing or present.
Deny,
/// Only existing cells that are currently missing may be written to.
MissingOnly,
}
/// Governs whether an insert may introduce new rows and/or new columns.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InsertMode {
/// New rows and new columns are both permitted.
Unrestricted,
/// The insert fails if it would add any new row.
DenyNewRows,
/// The insert fails if it would add any new column.
DenyNewColumns,
/// The insert fails if it would add any new row or any new column.
DenyNewRowsAndColumns,
}
/// Strategy applied when rows are appended. Defaults to `None`.
///
/// NOTE(review): the variants are not consumed anywhere in this part of the
/// file; the per-variant descriptions below are inferred from the names and
/// should be confirmed against the code that applies the strategy.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[derive(Default)]
pub enum AppendStrategy {
/// No special handling (the default).
#[default]
None,
/// Presumably a sliding window over the rows -- TODO confirm.
Window,
/// Presumably caps the table at `max_n_rows`, removing rows at `trench_ix`
/// -- TODO confirm.
Trench {
max_n_rows: usize,
trench_ix: usize,
},
}
/// Bundles every permission that applies to a single data-insert operation.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct WriteMode {
/// Whether new rows and/or columns may be created.
pub insert: InsertMode,
/// Whether existing cells may be overwritten.
pub overwrite: OverwriteMode,
/// Whether inserting an unseen categorical value may extend the support
/// of a column. Deserializes to `false` when absent.
#[serde(default)]
pub allow_extend_support: bool,
/// Row-append strategy. Deserializes to `AppendStrategy::None` when absent.
#[serde(default)]
pub append_strategy: AppendStrategy,
}
impl WriteMode {
#[inline]
pub fn new() -> Self {
Self {
insert: InsertMode::Unrestricted,
overwrite: OverwriteMode::Deny,
allow_extend_support: false,
append_strategy: AppendStrategy::None,
}
}
#[inline]
pub fn unrestricted() -> Self {
Self {
insert: InsertMode::Unrestricted,
overwrite: OverwriteMode::Allow,
allow_extend_support: true,
append_strategy: AppendStrategy::None,
}
}
}
impl Default for WriteMode {
fn default() -> Self {
Self::new()
}
}
/// A single datum destined for the column identified by `col_ix`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Value<C: ColumnIndex> {
/// Identifier of the target column.
pub col_ix: C,
/// The datum to write.
pub value: Datum,
}
impl<C: ColumnIndex> From<(C, Datum)> for Value<C> {
fn from(value: (C, Datum)) -> Self {
Self {
col_ix: value.0,
value: value.1,
}
}
}
/// A collection of values destined for the row identified by `row_ix`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Row<R: RowIndex, C: ColumnIndex> {
/// Identifier of the target row.
pub row_ix: R,
/// The cells to write in that row.
pub values: Vec<Value<C>>,
}
impl<R, C> From<(R, Vec<(C, Datum)>)> for Row<R, C>
where
R: RowIndex,
C: ColumnIndex,
{
fn from(mut row: (R, Vec<(C, Datum)>)) -> Self {
Self {
row_ix: row.0,
values: row.1.drain(..).map(Value::from).collect(),
}
}
}
impl<R: RowIndex, C: ColumnIndex> From<(R, Vec<Value<C>>)> for Row<R, C> {
fn from(row: (R, Vec<Value<C>>)) -> Self {
Self {
row_ix: row.0,
values: row.1,
}
}
}
impl<R: RowIndex, C: ColumnIndex> Row<R, C> {
#[inline]
pub fn len(&self) -> usize {
self.values.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.values.is_empty()
}
}
/// A `Value` whose column identifier has been resolved to an integer index.
#[derive(Debug, PartialEq)]
pub(crate) struct IndexValue {
/// Integer column index. For a column that does not exist yet this is the
/// engine's current column count plus the column's position in the
/// supplied column metadata.
pub col_ix: usize,
/// The datum to write.
pub value: Datum,
}
/// A `Row` whose row and column identifiers have been resolved to integer
/// indices.
#[derive(Debug, PartialEq)]
pub(crate) struct IndexRow {
/// Integer row index; new rows are numbered after the existing rows.
pub row_ix: usize,
/// The index-resolved cells of the row.
pub values: Vec<IndexValue>,
}
/// Describes how the support of a column was extended during an insert.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum SupportExtension {
/// A categorical column gained one or more categories.
Categorical {
/// Integer index of the extended column.
col_ix: usize,
/// Name of the extended column.
col_name: String,
/// The categories that were added to the column's value map.
value_map_extension: ValueMapExtension,
},
}
/// Record of the structural changes associated with an insert operation.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct InsertDataActions {
/// Names of newly added rows, in insertion order.
pub(crate) new_rows: IndexSet<String>,
/// Names of newly added columns.
pub(crate) new_cols: HashSet<String>,
/// Column support extensions that were applied.
pub(crate) support_extensions: Vec<SupportExtension>,
}
impl Default for InsertDataActions {
fn default() -> Self {
Self::new()
}
}
impl InsertDataActions {
pub fn new() -> Self {
Self {
new_rows: IndexSet::new(),
new_cols: HashSet::new(),
support_extensions: Vec::new(),
}
}
pub fn new_rows(&self) -> Option<&IndexSet<String>> {
if self.new_rows.is_empty() {
None
} else {
Some(&self.new_rows)
}
}
pub fn new_cols(&self) -> Option<&HashSet<String>> {
if self.new_cols.is_empty() {
None
} else {
Some(&self.new_cols)
}
}
pub fn support_extensions(&self) -> Option<&Vec<SupportExtension>> {
if self.support_extensions.is_empty() {
None
} else {
Some(&self.support_extensions)
}
}
}
/// Summary of what an insert would have to do, gathered while validating the
/// incoming rows and later checked against the `WriteMode`.
#[derive(Debug)]
pub(crate) struct InsertDataTasks {
/// Names of rows that do not exist yet, in the order first encountered.
pub new_rows: IndexSet<String>,
/// Names of columns that do not exist yet.
pub new_cols: HashSet<String>,
/// `true` if at least one existing cell that is currently missing would be
/// written.
pub overwrite_missing: bool,
/// `true` if at least one existing cell that currently holds a value would
/// be written.
pub overwrite_present: bool,
}
impl InsertDataTasks {
fn new() -> Self {
Self {
new_rows: IndexSet::new(),
new_cols: HashSet::new(),
overwrite_missing: false,
overwrite_present: false,
}
}
pub(crate) fn validate_insert_mode(
&self,
mode: WriteMode,
) -> Result<(), InsertDataError> {
match mode.overwrite {
OverwriteMode::Deny => {
if self.overwrite_present || self.overwrite_missing {
Err(InsertDataError::ModeForbidsOverwrite)
} else {
Ok(())
}
}
OverwriteMode::MissingOnly => {
if self.overwrite_present {
Err(InsertDataError::ModeForbidsOverwrite)
} else {
Ok(())
}
}
OverwriteMode::Allow => Ok(()),
}
.and_then(|_| match mode.insert {
InsertMode::DenyNewRows => {
if !self.new_rows.is_empty() {
Err(InsertDataError::ModeForbidsNewRows)
} else {
Ok(())
}
}
InsertMode::DenyNewColumns => {
if !self.new_cols.is_empty() {
Err(InsertDataError::ModeForbidsNewColumns)
} else {
Ok(())
}
}
InsertMode::DenyNewRowsAndColumns => {
if !(self.new_rows.is_empty() && self.new_cols.is_empty()) {
Err(InsertDataError::ModeForbidsNewRowsOrColumns)
} else {
Ok(())
}
}
_ => Ok(()),
})
}
}
/// Builds a name -> position lookup for the supplied column metadata.
///
/// Returns `None` when no column metadata was supplied. Positions are the
/// entries' indices within the metadata list.
#[inline]
fn ix_lookup_from_codebook(
    col_metadata: &Option<ColMetadataList>,
) -> Option<HashMap<&str, usize>> {
    let colmds = col_metadata.as_ref()?;
    let lookup = colmds
        .iter()
        .enumerate()
        .map(|(position, md)| (md.name.as_str(), position))
        .collect();
    Some(lookup)
}
/// Looks up the metadata position of the new column `col`.
///
/// # Errors
///
/// Returns `NewColumnNotInColumnMetadata` if there is no lookup at all or if
/// `col` is not in it.
#[inline]
fn col_ix_from_lookup(
    col: &str,
    lookup: &Option<HashMap<&str, usize>>,
) -> Result<usize, InsertDataError> {
    lookup
        .as_ref()
        .and_then(|lkp| lkp.get(col))
        .copied()
        .ok_or_else(|| {
            InsertDataError::NewColumnNotInColumnMetadata(col.to_owned())
        })
}
/// Appends the new, all-missing columns named in `tasks` to every state of
/// `engine` and registers their metadata in the engine's codebook.
///
/// Does nothing when `tasks` contains no new columns.
///
/// # Errors
///
/// - `WrongNumberOfColumnMetadataEntries` if there are new columns but no
///   metadata, or if the number of metadata entries differs from the number
///   of new columns
/// - `NewColumnNotInColumnMetadata` if a new column has no metadata entry
/// - any error produced while creating the new column models
///
/// # Panics
///
/// Panics if the codebook rejects the new column metadata.
pub(crate) fn append_empty_columns(
    tasks: &InsertDataTasks,
    col_metadata: Option<ColMetadataList>,
    engine: &mut Engine,
) -> Result<(), InsertDataError> {
    if tasks.new_cols.is_empty() {
        return Ok(());
    }

    let colmds = match col_metadata {
        Some(colmds) => colmds,
        None => {
            return Err(InsertDataError::WrongNumberOfColumnMetadataEntries {
                ncolmd: 0,
                nnew: tasks.new_cols.len(),
            });
        }
    };

    // Every new column must be described by the metadata...
    for col in tasks.new_cols.iter() {
        if !colmds.contains_key(col) {
            return Err(InsertDataError::NewColumnNotInColumnMetadata(
                col.clone(),
            ));
        }
    }

    // ...and the metadata must not describe anything else.
    if colmds.len() != tasks.new_cols.len() {
        return Err(InsertDataError::WrongNumberOfColumnMetadataEntries {
            ncolmd: colmds.len(),
            nnew: tasks.new_cols.len(),
        });
    }

    let shape = (engine.n_rows(), engine.n_cols());
    let col_models = create_new_columns(&colmds, shape, &mut engine.rng)?;

    let mut rng = &mut engine.rng;
    for state in engine.states.iter_mut() {
        state.append_blank_features(col_models.clone(), &mut rng);
    }
    engine.codebook.append_col_metadata(colmds).unwrap();

    Ok(())
}
/// Checks that `value` can be stored in the new column it names, according to
/// the column type declared in `new_metadata`.
///
/// # Errors
///
/// - `NewColumnNotInColumnMetadata` if there is no metadata, or none for this
///   column
/// - `DatumIncompatibleWithColumn` if the datum does not fit the column's
///   feature type
/// - `NonFiniteContinuousValue` if the datum is a compatible continuous value
///   that is not finite
fn validate_new_col_ftype(
    new_metadata: &Option<ColMetadataList>,
    value: &Value<&str>,
) -> Result<(), InsertDataError> {
    let not_in_metadata = || {
        InsertDataError::NewColumnNotInColumnMetadata(value.col_ix.into())
    };

    let (_, md) = new_metadata
        .as_ref()
        .ok_or_else(not_in_metadata)?
        .get(value.col_ix)
        .ok_or_else(not_in_metadata)?;

    let col_ftype = FType::from_coltype(&md.coltype);
    let (is_compat, compat_info) = col_ftype.datum_compatible(&value.value);

    // Incompatibility takes precedence over the finiteness check.
    if !is_compat {
        return Err(InsertDataError::DatumIncompatibleWithColumn {
            col: value.col_ix.to_owned(),
            ftype: compat_info.ftype,
            ftype_req: compat_info.ftype_req,
        });
    }

    let is_non_finite = match &value.value {
        Datum::Continuous(x) => !x.is_finite(),
        _ => false,
    };

    if is_non_finite {
        Err(InsertDataError::NonFiniteContinuousValue {
            col: value.col_ix.to_owned(),
            value: value.value.to_f64_opt().unwrap(),
        })
    } else {
        Ok(())
    }
}
fn validate_row_values<R: RowIndex, C: ColumnIndex>(
row: &Row<R, C>,
row_ix: usize,
row_exists: bool,
col_metadata: &Option<ColMetadataList>,
col_ix_lookup: &Option<HashMap<&str, usize>>,
insert_tasks: &mut InsertDataTasks,
engine: &Engine,
) -> Result<IndexRow, InsertDataError> {
let n_cols = engine.n_cols();
let mut index_row = IndexRow {
row_ix,
values: vec![],
};
row.values.iter().try_for_each(|value| {
match value.col_ix.col_ix(engine.codebook()) {
Ok(col_ix) => {
if row_exists {
if engine.datum(row_ix, col_ix).unwrap().is_missing() {
insert_tasks.overwrite_missing = true;
} else {
insert_tasks.overwrite_present = true;
}
}
let ftype_compat = engine
.ftype(col_ix)
.unwrap()
.datum_compatible(&value.value);
let bad_continuous_value = match value.value {
Datum::Continuous(ref x) => !x.is_finite(),
_ => false,
};
if ftype_compat.0 {
if bad_continuous_value {
let col = &engine.codebook.col_metadata[col_ix].name;
Err(InsertDataError::NonFiniteContinuousValue {
col: col.clone(),
value: value.value.to_f64_opt().unwrap(),
})
} else {
Ok(col_ix)
}
} else {
let col = &engine.codebook.col_metadata[col_ix].name;
Err(InsertDataError::DatumIncompatibleWithColumn {
col: col.clone(),
ftype_req: ftype_compat.1.ftype_req,
ftype: ftype_compat.1.ftype,
})
}
}
Err(_) => {
value
.col_ix
.col_str()
.ok_or_else(|| {
InsertDataError::IntegerIndexNewColumn(
value
.col_ix
.col_usize()
.expect("Column index does not have a string or usize representation")
)
})
.and_then(|name| {
let new_val = Value {
col_ix: name,
value: value.value.clone(),
};
validate_new_col_ftype(col_metadata, &new_val).and_then(
|_| {
insert_tasks.new_cols.insert(name.to_owned());
col_ix_from_lookup(name, col_ix_lookup)
.map(|ix| ix + n_cols)
},
)
})
}
}
.map(|col_ix| {
index_row.values.push(IndexValue {
col_ix,
value: value.value.clone(),
});
})
})?;
Ok(index_row)
}
/// Validates `rows` against the engine and works out what inserting them
/// would require.
///
/// Returns the collected `InsertDataTasks` (new rows, new columns, overwrite
/// flags) together with the rows converted to integer-indexed `IndexRow`s.
/// Existing rows keep their codebook index; new rows are numbered after the
/// existing rows in the order their names are first seen. If the same new row
/// name appears more than once in `rows`, every occurrence resolves to the
/// same index.
///
/// # Errors
///
/// - `EmptyRow` if a row carries no values
/// - `IntegerIndexNewRow` if a new row is addressed by integer instead of name
/// - any error from validating the row's values
///
/// # Panics
///
/// Panics if a new row's index has neither a string nor a usize
/// representation.
pub(crate) fn insert_data_tasks<R: RowIndex, C: ColumnIndex>(
    rows: &[Row<R, C>],
    col_metadata: &Option<ColMetadataList>,
    engine: &Engine,
) -> Result<(InsertDataTasks, Vec<IndexRow>), InsertDataError> {
    const EXISTING_ROW: bool = true;
    const NEW_ROW: bool = false;

    let col_ix_lookup = ix_lookup_from_codebook(col_metadata);
    let n_rows = engine.n_rows();
    let mut tasks = InsertDataTasks::new();
    let mut index_rows: Vec<IndexRow> = Vec::with_capacity(rows.len());

    for row in rows {
        let (row_ix, row_exists) = match row.row_ix.row_ix(engine.codebook()) {
            Ok(row_ix) => {
                if row.is_empty() {
                    let name = engine.codebook.row_names.name(row_ix).unwrap();
                    return Err(InsertDataError::EmptyRow(name.clone()));
                }
                (row_ix, EXISTING_ROW)
            }
            // The row is not in the codebook, so it is a new row.
            Err(_) => {
                if row.is_empty() {
                    return Err(InsertDataError::EmptyRow(format!(
                        "{:?}",
                        row.row_ix
                    )));
                }

                let row_name = row.row_ix.row_str().ok_or_else(|| {
                    let ix = row
                        .row_ix
                        .row_usize()
                        .expect("Index doesn't have a string or usize representation");
                    InsertDataError::IntegerIndexNewRow(ix)
                })?;

                // `insert_full` reports the position of the name whether it
                // was just added or was already present. Deriving the index
                // from the set's length instead would give a repeated new row
                // an index beyond the rows that will actually be appended.
                let (new_row_pos, _) =
                    tasks.new_rows.insert_full(String::from(row_name));

                (n_rows + new_row_pos, NEW_ROW)
            }
        };

        let index_row = validate_row_values(
            row,
            row_ix,
            row_exists,
            col_metadata,
            &col_ix_lookup,
            &mut tasks,
            engine,
        )?;
        index_rows.push(index_row);
    }

    Ok((tasks, index_rows))
}
pub(crate) fn maybe_add_categories<R: RowIndex, C: ColumnIndex>(
rows: &[Row<R, C>],
engine: &mut Engine,
mode: WriteMode,
) -> Result<Vec<SupportExtension>, InsertDataError> {
let mut extended_value_map: HashMap<usize, ValueMapExtension> =
HashMap::new();
rows.iter().try_for_each(|row| {
row.values.iter().try_for_each(|new_value| {
match new_value.col_ix.col_ix(engine.codebook()) {
Err(_) => Ok(()), Ok(col_ix) => {
let col_metadata = &engine.codebook.col_metadata[col_ix];
let col_name = col_metadata.name.as_str();
match &col_metadata.coltype {
ColType::Categorical { value_map, .. } => {
match (&new_value.value, value_map) {
(Datum::Categorical(Category::String(x)), ValueMap::String(vm)) => {
if !vm.contains_cat(x) {
let ext_vm = extended_value_map.entry(col_ix).or_insert_with(ValueMapExtension::new_string);
ext_vm.extend(Category::String(x.clone())).map_err(
|e| match e {
ValueMapExtensionError::ExtensionOfDifferingType(a, b) => {
InsertDataError::WrongCategoryAndType(a, b, col_name.to_string())
}
})?;
};
Ok(())
},
(Datum::Categorical(Category::UInt(x)), ValueMap::UInt(old_max)) => {
if *old_max as u32 <= *x {
let ext_max = extended_value_map.entry(col_ix).or_insert_with(ValueMapExtension::new_uint);
ext_max.extend(Category::UInt(*x)).map_err(
|e| match e {
ValueMapExtensionError::ExtensionOfDifferingType(a, b) => {
InsertDataError::WrongCategoryAndType(a, b, col_name.to_string())
}
})?;
}
Ok(())
},
(Datum::Missing, _) |
(Datum::Categorical(Category::Bool(_)), ValueMap::Bool) => {
Ok(())
},
_ => {
Err(
InsertDataError::DatumIncompatibleWithColumn {
col: (*col_name).into(),
ftype_req: FType::Categorical,
ftype: (&new_value.value).try_into().unwrap(),
},
)}
}
}
_ => Ok(())
}
}
}
})
})?;
if !mode.allow_extend_support && !extended_value_map.is_empty() {
return Err(InsertDataError::ModeForbidsCategoryExtension);
}
let mut cols_extended: Vec<SupportExtension> = Vec::new();
for (col_ix, value_map_extension) in extended_value_map.drain() {
incr_column_categories(engine, col_ix, &value_map_extension)?;
cols_extended.push(SupportExtension::Categorical {
col_ix,
col_name: engine.codebook.col_metadata[col_ix].name.clone(),
value_map_extension,
})
}
Ok(cols_extended)
}
/// Applies `value_map_extension` to the value map of categorical column
/// `col_ix` in `codebook` and updates the column's `k` to the new value map
/// length.
///
/// # Errors
///
/// Returns `WrongCategoryAndType` if the extension's category type differs
/// from the value map's.
///
/// # Panics
///
/// Panics if column `col_ix` is not categorical.
fn incr_category_in_codebook(
    codebook: &mut Codebook,
    col_ix: usize,
    value_map_extension: &ValueMapExtension,
) -> Result<(), InsertDataError> {
    // Cloned up front because the column is mutably borrowed below.
    let col_name = codebook.col_metadata[col_ix].name.clone();

    if let ColType::Categorical { k, value_map, .. } =
        &mut codebook.col_metadata[col_ix].coltype
    {
        let extended = value_map.extend(value_map_extension.clone());
        extended.map_err(|e| match e {
            ValueMapExtensionError::ExtensionOfDifferingType(a, b) => {
                InsertDataError::WrongCategoryAndType(a, b, col_name)
            }
        })?;
        *k = value_map.len();
        Ok(())
    } else {
        panic!("Tried to change cardinality of non-categorical column")
    }
}
/// Extends categorical column `col_ix` with `extended_value_map`, first in
/// the codebook and then in every state of the engine.
///
/// In each state the column's prior is rebuilt with the same alpha and the
/// new number of categories; every component's count vector is padded with
/// `0.0` and its log-weight vector with `f64::NEG_INFINITY` up to the new
/// number of categories.
///
/// # Errors
///
/// Propagates any error from updating the codebook.
///
/// # Panics
///
/// Panics if the column is not categorical in the codebook or in a state.
fn incr_column_categories(
    engine: &mut Engine,
    col_ix: usize,
    extended_value_map: &ValueMapExtension,
) -> Result<(), InsertDataError> {
    incr_category_in_codebook(
        &mut engine.codebook,
        col_ix,
        extended_value_map,
    )?;

    // The codebook now holds the post-extension cardinality.
    let n_cats_req = if let ColType::Categorical { k, .. } =
        engine.codebook.col_metadata[col_ix].coltype
    {
        k
    } else {
        panic!("Requested non-categorical column")
    };

    for state in engine.states.iter_mut() {
        let column = match state.feature_mut(col_ix) {
            ColModel::Categorical(column) => column,
            _ => panic!("Requested non-categorical column"),
        };

        column.prior = SymmetricDirichlet::new_unchecked(
            column.prior.alpha(),
            n_cats_req,
        );

        for cpnt in column.components.iter_mut() {
            let n = cpnt.stat.n();
            let mut counts = cpnt.stat.counts().clone();
            counts.resize(n_cats_req, 0.0);
            cpnt.stat = CategoricalSuffStat::from_parts_unchecked(n, counts);

            let mut ln_weights = cpnt.fx.ln_weights().clone();
            ln_weights.resize(n_cats_req, f64::NEG_INFINITY);
            cpnt.fx = Categorical::new_unchecked(ln_weights);
        }
    }

    Ok(())
}
// Builds the `ColModel` for one new, all-missing column of a feature type
// whose hyper prior is drawn with `h.draw(&mut rng)`.
//
// Arguments, in order:
// - `$coltype`: the `ColModel` variant to construct
// - `$htype`:   the hyper prior type, defaulted when only a prior is given
// - `$errvar`:  the `InsertDataError` variant returned when neither a hyper
//               prior nor a prior is available
// - `$colmd`:   the column metadata (used for the column name in the error)
// - `$hyper`:   the optional hyper prior
// - `$prior`:   the optional prior
// - `$n_rows`:  number of rows in the new column
// - `$id`:      the id of the new column
// - `$xtype`:   the column's data type
// - `$rng`:     the random number generator
//
// Behavior:
// - hyper given: use the supplied prior if there is one, otherwise draw one
//   from the hyper prior
// - only a prior given: use it with a default hyper prior and set
//   `ignore_hyper`
// - neither given: `Err(InsertDataError::$errvar(<column name>))`
macro_rules! new_col_arm {
    (
        $coltype: ident,
        $htype: ty,
        $errvar: ident,
        $colmd: ident,
        $hyper: ident,
        $prior: ident,
        $n_rows: ident,
        $id: ident,
        $xtype: ty,
        $rng: ident
    ) => {{
        let data: SparseContainer<$xtype> =
            SparseContainer::all_missing($n_rows);
        match ($hyper, $prior) {
            (Some(h), _) => {
                let pr = if let Some(pr) = $prior {
                    pr.clone()
                } else {
                    h.draw(&mut $rng)
                };
                let column = Column::new($id, data, pr, h.clone());
                Ok(ColModel::$coltype(column))
            }
            (None, Some(pr)) => {
                let mut column =
                    Column::new($id, data, pr.clone(), <$htype>::default());
                column.ignore_hyper = true;
                Ok(ColModel::$coltype(column))
            }
            // Report the error variant chosen by the caller, so that e.g. a
            // Count column is not reported as lacking a Gaussian hyper prior.
            (None, None) => {
                Err(InsertDataError::$errvar($colmd.name.clone()))
            }
        }
    }};
}
/// Creates one all-missing `ColModel` per entry of `col_metadata`.
///
/// `state_shape` is `(n_rows, n_cols)` of the states the columns will be
/// appended to: each new column holds `n_rows` missing cells and the `i`-th
/// entry of `col_metadata` gets the id `n_cols + i`.
///
/// For each column, a supplied prior is used as-is; if only a hyper prior is
/// supplied, the prior is drawn from it using `rng`; if only a prior is
/// supplied, a default hyper prior is attached and `ignore_hyper` is set.
///
/// # Errors
///
/// Returns an error if a column has neither a hyper prior nor a prior.
pub(crate) fn create_new_columns<R: rand::Rng>(
    col_metadata: &ColMetadataList,
    state_shape: (usize, usize),
    mut rng: &mut R,
) -> Result<Vec<ColModel>, InsertDataError> {
    use crate::stats::prior::csd::CsdHyper;

    let (n_rows, n_cols) = state_shape;

    col_metadata
        .iter()
        .enumerate()
        .map(|(offset, md)| {
            let id = offset + n_cols;
            match &md.coltype {
                ColType::Continuous { hyper, prior } => new_col_arm!(
                    Continuous,
                    crate::stats::prior::nix::NixHyper,
                    NoGaussianHyperForNewColumn,
                    md,
                    hyper,
                    prior,
                    n_rows,
                    id,
                    f64,
                    rng
                ),
                ColType::Count { hyper, prior } => new_col_arm!(
                    Count,
                    crate::stats::prior::pg::PgHyper,
                    NoPoissonHyperForNewColumn,
                    md,
                    hyper,
                    prior,
                    n_rows,
                    id,
                    u32,
                    rng
                ),
                // Handled by hand because drawing a categorical prior also
                // needs the number of categories `k`.
                ColType::Categorical {
                    k, hyper, prior, ..
                } => {
                    let data: SparseContainer<u32> =
                        SparseContainer::all_missing(n_rows);
                    match (hyper, prior) {
                        (Some(h), Some(pr)) => {
                            let column =
                                Column::new(id, data, pr.clone(), h.clone());
                            Ok(ColModel::Categorical(column))
                        }
                        (Some(h), None) => {
                            let pr = h.draw(*k, &mut rng);
                            let column = Column::new(id, data, pr, h.clone());
                            Ok(ColModel::Categorical(column))
                        }
                        (None, Some(pr)) => {
                            let mut column = Column::new(
                                id,
                                data,
                                pr.clone(),
                                CsdHyper::default(),
                            );
                            column.ignore_hyper = true;
                            Ok(ColModel::Categorical(column))
                        }
                        (None, None) => Err(
                            InsertDataError::NoCategoricalHyperForNewColumn(
                                md.name.clone(),
                            ),
                        ),
                    }
                }
            }
        })
        .collect()
}
/// Removes the datum at (`row_ix`, `col_ix`) from every state of the engine.
pub(crate) fn remove_cell(engine: &mut Engine, row_ix: usize, col_ix: usize) {
    for state in engine.states.iter_mut() {
        state.remove_datum(row_ix, col_ix);
    }
}
/// Removes column `col_ix` from the engine's codebook and from every state.
pub(crate) fn remove_col(engine: &mut Engine, col_ix: usize) {
    engine.codebook.col_metadata.remove_by_index(col_ix);

    // NOTE(review): the states draw from a *clone* of the engine's RNG, so
    // the engine's own RNG is not advanced by this call. Confirm whether that
    // is intended or just a way around borrowing `engine.rng` mutably.
    let mut rng = engine.rng.clone();
    for state in engine.states.iter_mut() {
        state.del_col(col_ix, &mut rng);
    }
}
/// Determines which columns should be removed as a result of cell removals.
///
/// `rm_cell_cols` maps a column index to a threshold (presumably the number
/// of cells being removed from that column -- confirm against the caller).
/// For each such column, the present (non-missing) cells in rows that are not
/// in `rm_rows` are counted; the column is kept if that count exceeds the
/// threshold and marked for removal otherwise.
///
/// # Panics
///
/// Panics if the engine cannot return the datum for a (row, column) pair.
pub(crate) fn check_if_removes_col(
    engine: &Engine,
    rm_rows: &BTreeSet<usize>,
    rm_cell_cols: HashMap<usize, i64>,
) -> BTreeSet<usize> {
    let n_rows = engine.n_rows();

    rm_cell_cols
        .into_iter()
        .filter(|&(col_ix, threshold)| {
            let mut present_count = 0_i64;
            for row_ix in 0..n_rows {
                // Early exit only: the decision is made after the loop.
                if present_count > threshold {
                    break;
                }
                if !rm_rows.contains(&row_ix)
                    && !engine.datum(row_ix, col_ix).unwrap().is_missing()
                {
                    present_count += 1;
                }
            }
            // Deciding here, rather than only at the top of each iteration,
            // ensures a present cell in the last row is counted against the
            // threshold too.
            present_count <= threshold
        })
        .map(|(col_ix, _)| col_ix)
        .collect()
}
/// Determines which rows should be removed as a result of cell removals.
///
/// `rm_cell_rows` maps a row index to a threshold (presumably the number of
/// cells being removed from that row -- confirm against the caller). For each
/// such row, the present (non-missing) cells in columns that are not in
/// `rm_cols` are counted; the row is kept if that count exceeds the threshold
/// and marked for removal otherwise.
///
/// # Panics
///
/// Panics if the engine cannot return the datum for a (row, column) pair.
pub(crate) fn check_if_removes_row(
    engine: &Engine,
    rm_cols: &BTreeSet<usize>,
    rm_cell_rows: HashMap<usize, i64>,
) -> BTreeSet<usize> {
    let n_cols = engine.n_cols();

    rm_cell_rows
        .into_iter()
        .filter(|&(row_ix, threshold)| {
            let mut present_count = 0_i64;
            for col_ix in 0..n_cols {
                // Early exit only: the decision is made after the loop.
                if present_count > threshold {
                    break;
                }
                if !rm_cols.contains(&col_ix)
                    && !engine.datum(row_ix, col_ix).unwrap().is_missing()
                {
                    present_count += 1;
                }
            }
            // Deciding here, rather than only at the top of each iteration,
            // ensures a present cell in the last column is counted against
            // the threshold too.
            present_count <= threshold
        })
        .map(|(row_ix, _)| row_ix)
        .collect()
}
#[cfg(test)]
mod tests {
use rand::SeedableRng;
use super::*;
use crate::codebook::ColMetadata;
use crate::codebook::ColType;
use crate::codebook::ValueMap;
use crate::data::data_source;
use crate::stats::prior::csd::CsdHyper;
#[cfg(feature = "examples")]
mod requiring_examples {
use super::*;
use crate::examples::Example;
// Inserting a value into a column the engine does not know ("does+taxes")
// without supplying any column metadata must fail with
// `NewColumnNotInColumnMetadata` naming that column.
#[test]
fn errors_when_no_col_metadata_when_new_columns() {
let engine = Example::Animals.engine().unwrap();
let moose_updates = Row::<String, String> {
row_ix: "moose".into(),
values: vec![
Value {
col_ix: "does+taxes".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let result = insert_data_tasks(&[moose_updates], &None, &engine);
assert!(result.is_err());
match result {
Err(InsertDataError::NewColumnNotInColumnMetadata(s)) => {
assert_eq!(s, String::from("does+taxes"))
}
Err(err) => panic!("wrong error: {:?}", err),
Ok(_) => panic!("failed to fail"),
}
}
// Column metadata is supplied, but only for "dances"; the new column
// "does+taxes" is not described, so the insert must fail with
// `NewColumnNotInColumnMetadata`.
#[test]
fn errors_when_new_column_not_in_col_metadata() {
let engine = Example::Animals.engine().unwrap();
let moose_updates = Row::<String, String> {
row_ix: "moose".into(),
values: vec![
Value {
col_ix: "does+taxes".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let col_metadata = ColMetadataList::new(vec![ColMetadata {
name: "dances".into(),
coltype: ColType::Categorical {
k: 2,
hyper: None,
prior: None,
value_map: ValueMap::UInt(2),
},
notes: None,
missing_not_at_random: false,
}])
.unwrap();
let result = insert_data_tasks(
&[moose_updates],
&Some(col_metadata),
&engine,
);
assert!(result.is_err());
assert_eq!(
result.unwrap_err(),
InsertDataError::NewColumnNotInColumnMetadata(
"does+taxes".into()
)
);
}
// Updating two existing columns of the existing row "moose": no new rows or
// columns, present cells get overwritten, and names resolve to the expected
// integer indices.
#[test]
fn tasks_on_one_existing_row() {
let engine = Example::Animals.engine().unwrap();
let moose_updates = Row::<String, String> {
row_ix: "moose".into(),
values: vec![
Value {
col_ix: "swims".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let rows = vec![moose_updates];
let (tasks, ixrows) =
insert_data_tasks(&rows, &None, &engine).unwrap();
assert!(tasks.new_rows.is_empty());
assert!(tasks.new_cols.is_empty());
assert!(!tasks.overwrite_missing);
assert!(tasks.overwrite_present);
assert_eq!(
ixrows,
vec![IndexRow {
row_ix: 15,
values: vec![
IndexValue {
col_ix: 36,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 34,
value: Datum::Categorical(1_u32.into())
},
]
}]
);
}
// Inserting the unknown row "pegasus" with existing columns: one new row is
// recorded, nothing is overwritten, and the row gets index 50.
#[test]
fn tasks_on_one_new_row() {
let engine = Example::Animals.engine().unwrap();
let pegasus = Row::<String, String> {
row_ix: "pegasus".into(),
values: vec![
Value {
col_ix: "swims".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let rows = vec![pegasus];
let (tasks, ixrows) =
insert_data_tasks(&rows, &None, &engine).unwrap();
assert_eq!(tasks.new_rows.len(), 1);
assert!(tasks.new_rows.contains("pegasus"));
assert!(tasks.new_cols.is_empty());
assert!(!tasks.overwrite_missing);
assert!(!tasks.overwrite_present);
assert_eq!(
ixrows,
vec![IndexRow {
row_ix: 50,
values: vec![
IndexValue {
col_ix: 36,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 34,
value: Datum::Categorical(1_u32.into())
},
]
}]
);
}
// Inserting two unknown rows: both are recorded as new and receive
// consecutive indices (50 and 51) in the order they were supplied.
#[test]
fn tasks_on_two_new_rows() {
let engine = Example::Animals.engine().unwrap();
let pegasus = Row::<String, String> {
row_ix: "pegasus".into(),
values: vec![
Value {
col_ix: "swims".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let man = Row::<String, String> {
row_ix: "man".into(),
values: vec![
Value {
col_ix: "smart".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "hunter".into(),
value: Datum::Categorical(0_u32.into()),
},
],
};
let rows = vec![pegasus, man];
let (tasks, ixrows) =
insert_data_tasks(&rows, &None, &engine).unwrap();
assert_eq!(tasks.new_rows.len(), 2);
assert!(tasks.new_rows.contains("pegasus"));
assert!(tasks.new_rows.contains("man"));
assert!(tasks.new_cols.is_empty());
assert!(!tasks.overwrite_missing);
assert!(!tasks.overwrite_present);
assert_eq!(
ixrows,
vec![
IndexRow {
row_ix: 50,
values: vec![
IndexValue {
col_ix: 36,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 34,
value: Datum::Categorical(1_u32.into())
},
]
},
IndexRow {
row_ix: 51,
values: vec![
IndexValue {
col_ix: 80,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 58,
value: Datum::Categorical(0_u32.into())
},
]
}
]
);
}
// Mixing a new row ("pegasus") with an existing one ("moose"): one new row is
// recorded, the existing row's present cells are flagged as overwritten, and
// each row resolves to its own index (50 and 15).
#[test]
fn tasks_on_one_new_and_one_existing_row() {
let engine = Example::Animals.engine().unwrap();
let pegasus = Row::<String, String> {
row_ix: "pegasus".into(),
values: vec![
Value {
col_ix: "swims".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let moose = Row::<String, String> {
row_ix: "moose".into(),
values: vec![
Value {
col_ix: "smart".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "hunter".into(),
value: Datum::Categorical(0_u32.into()),
},
],
};
let rows = vec![pegasus, moose];
let (tasks, ixrows) =
insert_data_tasks(&rows, &None, &engine).unwrap();
assert_eq!(tasks.new_rows.len(), 1);
assert!(tasks.new_rows.contains("pegasus"));
assert!(tasks.new_cols.is_empty());
assert!(!tasks.overwrite_missing);
assert!(tasks.overwrite_present);
assert_eq!(
ixrows,
vec![
IndexRow {
row_ix: 50,
values: vec![
IndexValue {
col_ix: 36,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 34,
value: Datum::Categorical(1_u32.into())
},
]
},
IndexRow {
row_ix: 15,
values: vec![
IndexValue {
col_ix: 80,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 58,
value: Datum::Categorical(0_u32.into())
},
]
}
]
);
}
// Adding the new column "dances" (described by the supplied metadata) to the
// existing row "moose": one new column is recorded and it resolves to index
// 85, while the existing column "flys" keeps index 34.
#[test]
fn tasks_on_one_new_col_in_existing_row() {
let engine = Example::Animals.engine().unwrap();
let col_metadata = ColMetadataList::new(vec![ColMetadata {
name: "dances".into(),
coltype: ColType::Categorical {
k: 2,
hyper: None,
prior: None,
value_map: ValueMap::UInt(2),
},
notes: None,
missing_not_at_random: false,
}])
.unwrap();
let moose_updates = Row::<String, String> {
row_ix: "moose".into(),
values: vec![
Value {
col_ix: "dances".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let rows = vec![moose_updates];
let (tasks, ixrows) =
insert_data_tasks(&rows, &Some(col_metadata), &engine).unwrap();
assert!(tasks.new_rows.is_empty());
assert_eq!(tasks.new_cols.len(), 1);
assert!(tasks.new_cols.contains("dances"));
assert!(!tasks.overwrite_missing);
assert!(tasks.overwrite_present);
assert_eq!(
ixrows,
vec![IndexRow {
row_ix: 15,
values: vec![
IndexValue {
col_ix: 85,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 34,
value: Datum::Categorical(1_u32.into())
},
]
}]
);
}
// Adding a new column ("dances") and a new row ("peanut") at once: both are
// recorded as new, nothing is overwritten, and they resolve to column index
// 85 and row index 50.
#[test]
fn tasks_on_one_new_col_in_new_row() {
let engine = Example::Animals.engine().unwrap();
let col_metadata = ColMetadataList::new(vec![ColMetadata {
name: "dances".into(),
coltype: ColType::Categorical {
k: 2,
hyper: None,
prior: None,
value_map: ValueMap::UInt(2),
},
notes: None,
missing_not_at_random: false,
}])
.unwrap();
let peanut = Row::<String, String> {
row_ix: "peanut".into(),
values: vec![
Value {
col_ix: "dances".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "flys".into(),
value: Datum::Categorical(0_u32.into()),
},
],
};
let rows = vec![peanut];
let (tasks, ixrows) =
insert_data_tasks(&rows, &Some(col_metadata), &engine).unwrap();
assert_eq!(tasks.new_rows.len(), 1);
assert!(tasks.new_rows.contains("peanut"));
assert_eq!(tasks.new_cols.len(), 1);
assert!(tasks.new_cols.contains("dances"));
assert!(!tasks.overwrite_missing);
assert!(!tasks.overwrite_present);
assert_eq!(
ixrows,
vec![IndexRow {
row_ix: 50,
values: vec![
IndexValue {
col_ix: 85,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 34,
value: Datum::Categorical(0_u32.into())
},
]
}]
);
}
// Adding two new columns to an existing row: the new columns are indexed by
// their position in the supplied metadata ("dances" -> 85, "eats+figs" -> 86),
// not by the order in which the row's values mention them.
#[test]
fn tasks_on_two_new_cols_in_existing_row() {
let engine = Example::Animals.engine().unwrap();
let col_metadata = ColMetadataList::new(vec![
ColMetadata {
name: "dances".into(),
coltype: ColType::Categorical {
k: 2,
hyper: None,
prior: None,
value_map: ValueMap::UInt(2),
},
notes: None,
missing_not_at_random: false,
},
ColMetadata {
name: "eats+figs".into(),
coltype: ColType::Categorical {
k: 2,
hyper: None,
prior: None,
value_map: ValueMap::UInt(2),
},
notes: None,
missing_not_at_random: false,
},
])
.unwrap();
let moose_updates = Row::<String, String> {
row_ix: "moose".into(),
values: vec![
Value {
col_ix: "flys".into(),
value: Datum::Categorical(1_u32.into()),
},
Value {
col_ix: "eats+figs".into(),
value: Datum::Categorical(0_u32.into()),
},
Value {
col_ix: "dances".into(),
value: Datum::Categorical(1_u32.into()),
},
],
};
let rows = vec![moose_updates];
let (tasks, ixrows) =
insert_data_tasks(&rows, &Some(col_metadata), &engine).unwrap();
assert!(tasks.new_rows.is_empty());
assert_eq!(tasks.new_cols.len(), 2);
assert!(tasks.new_cols.contains("dances"));
assert!(tasks.new_cols.contains("eats+figs"));
assert!(!tasks.overwrite_missing);
assert!(tasks.overwrite_present);
assert_eq!(
ixrows,
vec![IndexRow {
row_ix: 15,
values: vec![
IndexValue {
col_ix: 34,
value: Datum::Categorical(1_u32.into())
},
IndexValue {
col_ix: 86,
value: Datum::Categorical(0_u32.into())
},
IndexValue {
col_ix: 85,
value: Datum::Categorical(1_u32.into())
},
]
}]
);
}
}
// Builds a small codebook named "table" with three categorical columns:
// - "0" and "1": k = 2, value map built from the strings "red" and "green"
// - "2": k = 3, value map `ValueMap::UInt(3)`
fn quick_codebook() -> Codebook {
let coltype = ColType::Categorical {
k: 2,
hyper: None,
prior: None,
value_map: ValueMap::try_from(vec![
String::from("red"),
String::from("green"),
])
.unwrap(),
};
let md0 = ColMetadata {
name: "0".to_string(),
coltype: coltype.clone(),
notes: None,
missing_not_at_random: false,
};
let md1 = ColMetadata {
name: "1".to_string(),
coltype,
notes: None,
missing_not_at_random: false,
};
let md2 = ColMetadata {
name: "2".to_string(),
coltype: ColType::Categorical {
k: 3,
hyper: None,
prior: None,
value_map: ValueMap::UInt(3),
},
notes: None,
missing_not_at_random: false,
};
let col_metadata = ColMetadataList::new(vec![md0, md1, md2]).unwrap();
Codebook::new("table".to_string(), col_metadata)
}
// Extending the unsigned-integer column "2" with the category 3 must take it
// from k = 3 / `ValueMap::UInt(3)` to k = 4 / `ValueMap::UInt(4)`.
#[test]
fn incr_cats_in_codebook_without_suppl_metadata_for_no_valmap_col() {
let mut codebook = quick_codebook();
let n_cats_before = match &codebook.col_metadata[2].coltype {
ColType::Categorical {
k,
value_map: ValueMap::UInt(3),
..
} => *k,
ColType::Categorical { value_map, .. } => {
panic!(
"starting value_map should have been U32(3), was {:?}",
value_map
);
}
_ => panic!("should've been categorical"),
};
assert_eq!(n_cats_before, 3);
let mut extension = ValueMapExtension::new_uint();
extension.extend(Category::UInt(3)).unwrap();
let result = incr_category_in_codebook(&mut codebook, 2, &extension);
result.unwrap();
let n_cats_after = match &codebook.col_metadata[2].coltype {
ColType::Categorical {
k,
value_map: ValueMap::UInt(4),
..
} => *k,
ColType::Categorical { value_map, .. } => {
panic!("value_map should be U32(4), was: {:?}", value_map)
}
_ => panic!("should've been categorical"),
};
assert_eq!(n_cats_after, 4);
}
// Extending the string-valued column "0" with the category "blue" must grow
// both its value map and its `k` from 2 to 3.
#[test]
fn incr_cats_in_codebook_with_suppl_metadata_for_valmap_col() {
let mut codebook = quick_codebook();
match &codebook.col_metadata[0].coltype {
ColType::Categorical {
k, value_map: vm, ..
} => {
assert_eq!(*k, 2);
assert_eq!(vm.len(), 2);
}
_ => panic!("should've been categorical with valmap"),
};
let mut extension = ValueMapExtension::new_string();
extension
.extend(Category::String("blue".to_string()))
.unwrap();
let result = incr_category_in_codebook(&mut codebook, 0, &extension);
assert!(result.is_ok());
match &codebook.col_metadata[0].coltype {
ColType::Categorical {
k, value_map: vm, ..
} => {
assert_eq!(vm.len(), 3);
assert_eq!(*k, 3);
}
_ => panic!("should've been categorical with valmap"),
};
}
// Starting from an engine with an empty codebook and no data, a boolean
// categorical column can be created by a first insert (with metadata) and
// then appended to by a second insert (without metadata); both inserts must
// succeed.
#[test]
fn append_bool() {
let coltype = ColType::Categorical {
k: 2,
hyper: Some(CsdHyper::default()),
prior: None,
value_map: ValueMap::Bool,
};
let md0 = ColMetadata {
name: "bool_col".to_string(),
coltype: coltype.clone(),
notes: None,
missing_not_at_random: false,
};
let mut engine = Engine::new(
1,
Codebook::new(
"test".to_string(),
ColMetadataList::new(vec![]).unwrap(),
),
data_source::DataSource::Empty,
0,
rand_xoshiro::Xoshiro256Plus::seed_from_u64(0x1234),
)
.unwrap();
// First insert: creates the row "abc" and the column "bool_col".
engine
.insert_data(
vec![(
"abc",
vec![(
"bool_col",
Datum::Categorical(Category::Bool(false)),
)],
)
.into()],
Some(ColMetadataList::new(vec![md0]).unwrap()),
WriteMode::unrestricted(),
)
.unwrap();
// Second insert: adds the row "def" to the now-existing column.
engine
.insert_data(
vec![(
"def",
vec![(
"bool_col",
Datum::Categorical(Category::Bool(false)),
)],
)
.into()],
None,
WriteMode::unrestricted(),
)
.unwrap();
}
}