use ndarray::{Array1, Array2, Axis};
use std::collections::HashMap;
/// A one-dimensional sample of integer-coded observations together with the
/// summary statistics that the constructors derive from it.
///
/// All fields are public, so nothing prevents a caller from putting them out
/// of sync after construction; the constructors are the only place where the
/// relationships described below are established.
pub struct DiscreteDataset {
/// The observations, stored exactly as handed to the constructor.
pub data: Array1<i32>,
/// Number of occurrences recorded for each value. Computed from `data` by
/// `from_data`, or supplied by the caller in `from_counts_and_data`.
pub counts: HashMap<i32, usize>,
/// Number of observations, i.e. `data.len()` at construction time.
pub n: usize,
/// Number of keys in `counts`, i.e. `counts.len()` at construction time.
pub k: usize,
/// Relative frequency of each value: `counts[value] as f64 / n as f64`.
pub dist: HashMap<i32, f64>,
}
impl DiscreteDataset {
    /// Builds a dataset from raw observations, counting the value frequencies
    /// with [`count_frequencies`].
    ///
    /// All derived fields are produced by
    /// [`DiscreteDataset::from_counts_and_data`], so the two constructors
    /// cannot drift apart. Any panic raised by `count_frequencies` propagates.
    pub fn from_data(data: Array1<i32>) -> Self {
        let counts = count_frequencies(&data);
        Self::from_counts_and_data(data, counts)
    }

    /// Builds a dataset from observations and an already-computed frequency
    /// table.
    ///
    /// * `n` is `data.len()`.
    /// * `k` is `counts.len()`.
    /// * `dist` maps every key of `counts` to `count / n`.
    ///
    /// `counts` is taken on trust: it is not checked against `data`. If
    /// `data` is empty while `counts` is not, the division by zero produces
    /// infinite or NaN entries in `dist` rather than a panic.
    pub fn from_counts_and_data(data: Array1<i32>, counts: HashMap<i32, usize>) -> Self {
        let n = data.len();
        let k = counts.len();
        let n_f = n as f64;
        let dist: HashMap<i32, f64> = counts
            .iter()
            .map(|(&val, &cnt)| (val, cnt as f64 / n_f))
            .collect();
        Self {
            data,
            counts,
            n,
            k,
            dist,
        }
    }

    /// Replaces every observation in `data` with its probability from `dist`.
    ///
    /// # Panics
    ///
    /// Panics if some value in `data` has no entry in `dist`, which can happen
    /// when the dataset was built from a frequency table that does not cover
    /// the data, or when the public fields were modified afterwards.
    pub fn map_probs(&self) -> Array1<f64> {
        self.data.mapv(|v| {
            self.dist
                .get(&v)
                .copied()
                .unwrap_or_else(|| panic!("value {} from `data` has no entry in `dist`", v))
        })
    }
}
/// Counts how many times each value occurs in `data`.
///
/// When the array's storage is contiguous and in standard order the elements
/// are counted in place via [`count_frequencies_slice`]. Otherwise (for
/// example an owned array produced by a strided `slice_move`) `as_slice`
/// returns `None`; the elements are then copied into a temporary `Vec` in
/// logical order and counted from there instead of panicking.
pub fn count_frequencies(data: &Array1<i32>) -> HashMap<i32, usize> {
    match data.as_slice() {
        Some(values) => count_frequencies_slice(values),
        None => {
            let values: Vec<i32> = data.iter().copied().collect();
            count_frequencies_slice(&values)
        }
    }
}
/// Counts how many times each value occurs in `data`.
///
/// Returns a map from value to occurrence count; an empty slice yields an
/// empty map.
///
/// Two strategies are used, both producing the same result:
///
/// * **Dense**: when `max - min` is at most `MAX_DENSE_RANGE`, occurrences are
///   tallied in a flat counter array indexed by `value - min`, and the
///   non-zero counters are then copied into the map. This also applies to
///   negative values.
/// * **Sparse**: otherwise every value is counted directly in the hash map.
pub fn count_frequencies_slice(data: &[i32]) -> HashMap<i32, usize> {
    const MAX_DENSE_RANGE: i64 = 4096;

    let first = match data.first() {
        Some(&v) => v,
        None => return HashMap::new(),
    };
    let (min_v, max_v) = data
        .iter()
        .fold((first, first), |(lo, hi), &v| (lo.min(v), hi.max(v)));

    // Computed in i64 so the subtraction cannot overflow, even for a slice
    // containing both i32::MIN and i32::MAX.
    let span = i64::from(max_v) - i64::from(min_v);

    if span <= MAX_DENSE_RANGE {
        // `span` is in 0..=MAX_DENSE_RANGE here, so the cast is lossless.
        let mut dense = vec![0usize; span as usize + 1];
        let mut distinct = 0usize;
        for &v in data {
            // Offset is in 0..=span because min_v <= v <= max_v.
            let slot = &mut dense[(i64::from(v) - i64::from(min_v)) as usize];
            if *slot == 0 {
                distinct += 1;
            }
            *slot += 1;
        }

        // Size the map by the number of distinct values actually seen, not by
        // the width of the range: the map is returned to the caller and would
        // otherwise keep thousands of unused slots alive.
        let mut map = HashMap::with_capacity(distinct);
        for (offset, &cnt) in dense.iter().enumerate() {
            if cnt != 0 {
                // offset <= span <= MAX_DENSE_RANGE fits in i32, and
                // min_v + offset <= max_v, so the addition cannot overflow.
                map.insert(min_v + offset as i32, cnt);
            }
        }
        return map;
    }

    let mut frequency_map = HashMap::new();
    for &value in data {
        *frequency_map.entry(value).or_insert(0) += 1;
    }
    frequency_map
}
/// Splits a two-dimensional array into one owned `Array1` per row.
///
/// The rows are yielded by iterating over `Axis(0)` and appear in the output
/// in that order. `data` is consumed, but each row is copied out of it.
pub fn rows_as_vec(data: Array2<i32>) -> Vec<Array1<i32>> {
    let mut rows: Vec<Array1<i32>> = Vec::with_capacity(data.len_of(Axis(0)));
    for row_view in data.axis_iter(Axis(0)) {
        rows.push(row_view.to_owned());
    }
    rows
}
/// Collapses several aligned code arrays into a single array of joint codes.
///
/// Position `i` of the output identifies the tuple
/// `(code_arrays[0][i], code_arrays[1][i], ...)`. Identifiers are compact:
/// they start at 0 and are handed out in order of first appearance, so two
/// positions share an identifier exactly when their tuples are equal.
///
/// An empty `code_arrays` slice yields an empty array.
///
/// # Panics
///
/// Panics if the arrays do not all have the same length, or if the counter
/// for the next identifier overflows `i32`.
pub fn reduce_joint_space_compact(code_arrays: &[Array1<i32>]) -> Array1<i32> {
    let n_rows = match code_arrays.first() {
        Some(first) => first.len(),
        None => return Array1::zeros(0),
    };
    for arr in code_arrays {
        assert_eq!(
            arr.len(),
            n_rows,
            "All code arrays must have the same length for joint reduction"
        );
    }

    let mut pattern_ids: HashMap<Vec<i32>, i32> = HashMap::new();
    let mut fresh_id: i32 = 0;

    let labels: Vec<i32> = (0..n_rows)
        .map(|row| {
            // One key per position: the values of every array at `row`.
            let pattern: Vec<i32> = code_arrays.iter().map(|codes| codes[row]).collect();
            *pattern_ids.entry(pattern).or_insert_with(|| {
                let assigned = fresh_id;
                fresh_id = fresh_id
                    .checked_add(1)
                    .expect("Too many unique joint patterns to fit into i32");
                assigned
            })
        })
        .collect();

    Array1::from(labels)
}
/// Assigns a compact joint code to every row of a two-dimensional array.
///
/// Each column (iteration over `Axis(1)`) is copied into its own `Array1`
/// and the columns are then combined by `reduce_joint_space_compact`, so the
/// result has one entry per row. If `data` has no columns, the column list is
/// empty and `reduce_joint_space_compact` returns an empty array.
pub fn reduce_array2_compact(data: &Array2<i32>) -> Array1<i32> {
    let mut columns: Vec<Array1<i32>> = Vec::with_capacity(data.len_of(Axis(1)));
    for column_view in data.axis_iter(Axis(1)) {
        columns.push(column_view.to_owned());
    }
    reduce_joint_space_compact(&columns)
}