1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
use num_traits::ToPrimitive;
/// Represents a unique value found within a sorted dataset along with the indices of its first and last occurrences in the dataset
pub struct UniqueVal {
pub val: f64,
pub first: usize,
pub last: usize,
}
/// Represents a single bin in a classification, including the bin's lowest (inclusive) and highest (exclusive) values and the number of points within it
pub struct Bin {
pub bin_start: f64,
pub bin_end: f64,
pub count: u64,
}
impl PartialEq for Bin {
fn eq(&self, other: &Self) -> bool {
let starts_eq: bool = self.bin_start == other.bin_start;
let ends_eq: bool = self.bin_end == other.bin_end;
let counts_eq: bool = self.count == other.count;
starts_eq && ends_eq && counts_eq
}
}
/// Represents a full classification, which is a collection of Bin objects
pub type Classification = Vec<Bin>;
/// Translates generic numeric vectors to Vec<f64> using the ToPrimitive trait from the num crate
///
/// # Arguments
///
/// * `data` - A reference to a vector of generic type T where T implements the ToPrimitive trait
pub fn to_vec_f64<T: ToPrimitive>(data: &[T]) -> Vec<f64> {
let mut result: Vec<f64> = vec![];
for item in data {
result.push(item.to_f64().unwrap());
}
result
}
/// Populates an empty vector of UniqueVal objects for each unique value in the dataset in the format (value, first occurrence index, last occurrence index)
///
/// # Arguments
///
/// * `unique_val_map` - A mutable reference to an empty vector of UniqueVals
/// * `vals` - A reference to the data (sorted, ascending) to use in populating unique_val_map
pub fn create_unique_val_mapping(unique_val_map: &mut Vec<UniqueVal>, vals: &[f64]) {
unique_val_map.clear();
let mut idx: i64 = -1;
for (i, item) in vals.iter().enumerate() {
if unique_val_map.is_empty() {
idx += 1;
unique_val_map.push(UniqueVal {
val: *item,
first: i,
last: i,
});
} else if unique_val_map[idx as usize].val != *item {
unique_val_map[idx as usize].last = i - 1;
idx += 1;
unique_val_map.push(UniqueVal {
val: *item,
first: i,
last: i,
});
}
}
}
/// Adjusts break indices from unique value breaks to normal breaks, accounting for repeated values, given unique value breaks, a unique value map, and an empty vector for normal breaks
///
/// # Arguments
///
/// * `u_val_breaks` - A reference to a vector of uniquely valued breaks (sorted, ascending)
/// * `u_val_map` - A reference to a map of unique values to their first and last occurrences in the dataset
/// * `normal_breaks` - A mutable reference to an empty vector to populate with adjusted break indices
pub fn unique_to_normal_breaks(
u_val_breaks: &Vec<usize>,
u_val_map: &[UniqueVal],
normal_breaks: &mut Vec<usize>,
) {
if normal_breaks.len() != u_val_breaks.len() {
normal_breaks.resize(u_val_breaks.len(), 0);
}
for i in 0..u_val_breaks.len() {
normal_breaks[i] = u_val_map[u_val_breaks[i]].first;
}
}
/// Returns a Classification object given a set of breaks between bins and the original dataset
///
/// # Arguments
///
/// * `breaks` - A reference to a vector of breaks (f64) generated through any classification function or manually
/// * `data` - A reference to a vector of unsorted data points (f64) used to count the points in each bin
///
/// # Examples
///
/// ```
/// use classify::{breaks_to_classification};
/// use classify::{Classification, Bin};
///
/// let data: Vec<f64> = vec![1.0, 2.0, 4.0, 5.0, 7.0, 8.0];
/// let breaks: Vec<f64> = vec![2.0, 5.0];
///
/// let result: Classification = breaks_to_classification(&breaks, &data);
/// let expected: Classification = vec![
/// Bin{bin_start: 1.0, bin_end: 2.0, count: 1},
/// Bin{bin_start: 2.0, bin_end: 5.0, count: 2},
/// Bin{bin_start: 5.0, bin_end: 8.0, count: 3}
/// ];
///
/// assert!(result == expected);
/// ```
pub fn breaks_to_classification<T: ToPrimitive>(breaks: &Vec<f64>, data: &[T]) -> Classification {
let data = to_vec_f64(data);
let mut min_value = data[0];
let mut max_value = data[0];
for item in &data {
if *item < min_value {
min_value = *item;
}
if *item > max_value {
max_value = *item;
}
}
let mut bounds: Vec<f64> = vec![min_value];
for item in breaks {
bounds.push(*item);
}
bounds.push(max_value);
let mut results: Classification = vec![];
for i in 0..(bounds.len() - 1) {
results.push(Bin {
bin_start: bounds[i],
bin_end: bounds[i + 1],
count: 0,
});
}
let num_bins = results.len();
for bin in results.iter_mut().take(num_bins) {
for item in &data {
if bin.bin_start <= *item && *item < bin.bin_end {
bin.count += 1;
}
}
}
results[num_bins - 1].count += 1;
results
}
/// Returns an Option<usize> containing the index of the Bin within which a value should fall given the value and a Classification (returns None if the value is outside of the Classification's range)
///
/// # Arguments
///
/// * `val` - Data value to classify
/// * `class` - Classification object
///
/// # Examples
///
/// ```
/// use classify::{classify_val};
/// use classify::{Classification, Bin};
///
/// let vals: Vec<f64> = vec![0.0, 1.5, 3.5];
/// let class: Classification = vec![
/// Bin{bin_start: 0.0, bin_end: 1.0, count: 5},
/// Bin{bin_start: 1.0, bin_end: 2.0, count: 5},
/// Bin{bin_start: 2.0, bin_end: 3.0, count: 5}
/// ];
///
/// let mut results: Vec<Option<usize>> = vec![];
/// for val in vals {results.push(classify_val(val, &class))}
///
/// assert_eq!(results, vec![Some(0), Some(1), None])
/// ```
pub fn classify_val(val: f64, class: &Classification) -> Option<usize> {
if val < class[0].bin_start || val > class[class.len() - 1].bin_end {
return None;
}
for (i, _item) in class.iter().enumerate() {
if class[i].bin_start <= val && val < class[i].bin_end {
return Some(i);
}
}
Some(class.len() - 1) // Accounts for case where val is maximum within dataset
}