1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
use crate::utilities::Classification;
use crate::utilities::{breaks_to_classification, to_vec_f64};
use num_traits::ToPrimitive;
/// Returns a Classification object following the Hinge Breaks algorithm given the desired number of bins and one-dimensional data
///
/// # Arguments
///
/// * `hinge_coefficient` - A coefficient representing the size of the hinge as a multiple of the data's IQR (usually 1.5 or 3)
/// * `data` - A reference to a collection of unsorted data points to generate a Classification for
///
/// # Edge cases
///
/// * Inputting large u64/i64 data (near their max values) will result in loss of precision because data is being cast to f64
/// * If the data doesn't have outliers below/above the hinges, the algorithm may not produce all six bins
///
/// # Examples
///
/// ```
/// use classify::get_hinge_classification;
/// use classify::{Classification, Bin};
///
/// let data: Vec<f32> = vec![0.0, 1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 20.0, 25.0];
/// let hinge_coefficient = 1.5;
///
/// let result: Classification = get_hinge_classification(hinge_coefficient, &data);
/// let expected: Classification = vec![
/// Bin{bin_start: 0.0, bin_end: 3.0, count: 2},
/// Bin{bin_start: 3.0, bin_end: 10.5, count: 1},
/// Bin{bin_start: 10.5, bin_end: 13.0, count: 2},
/// Bin{bin_start: 13.0, bin_end: 15.5, count: 3},
/// Bin{bin_start: 15.5, bin_end: 23.0, count: 2},
/// Bin{bin_start: 23.0, bin_end: 25.0, count: 1}
/// ];
///
/// assert!(result == expected);
/// ```
pub fn get_hinge_classification<T: ToPrimitive, S: ToPrimitive>(
hinge_coefficient: S,
data: &[T],
) -> Classification {
let breaks: Vec<f64> = get_hinge_breaks(hinge_coefficient, data);
breaks_to_classification(&breaks, data)
}
/// Returns a vector of breaks generated through the Hinge Breaks algorithm given the desired number of bins and a dataset
///
/// # Arguments
///
/// * `hinge_coefficient` - A coefficient representing the size of the hinge as a multiple of the data's IQR (usually 1.5 or 3)
/// * `data` - A reference to a collection of unsorted data points to generate breaks for
///
/// # Edge cases
///
/// * Inputting large u64/i64 data (near their max values) will result in loss of precision because data is being cast to f64
/// * If the data doesn't have outliers below/above the hinges, the algorithm may not produce all six bins
///
/// # Examples
///
/// ```
/// use classify::get_hinge_breaks;
///
/// let data: Vec<usize> = vec![0, 1, 10, 11, 12, 13, 14, 15, 16, 20, 25];
/// let hinge_coefficient = 1.5;
///
/// let result: Vec<f64> = get_hinge_breaks(hinge_coefficient, &data);
///
/// assert_eq!(result, vec![3.0, 10.5, 13.0, 15.5, 23.0]);
/// ```
pub fn get_hinge_breaks<T: ToPrimitive, S: ToPrimitive>(
hinge_coefficient: S,
data: &[T],
) -> Vec<f64> {
let hinge_coefficient = hinge_coefficient.to_f64().unwrap();
let data = to_vec_f64(data);
let num_vals = data.len();
let mut sorted_data: Vec<f64> = vec![];
for item in data.iter().take(num_vals) {
sorted_data.push(*item);
}
sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let min_val = sorted_data[0];
let max_val = sorted_data[num_vals - 1];
let perc_25 = percentile(25, &sorted_data);
let perc_50 = percentile(50, &sorted_data);
let perc_75 = percentile(75, &sorted_data);
let iqr = perc_75 - perc_25;
let hinge = iqr * hinge_coefficient;
let mut breaks: Vec<f64> = vec![];
if perc_25 - hinge > min_val {
breaks.push(perc_25 - hinge);
}
breaks.push(perc_25);
breaks.push(perc_50);
breaks.push(perc_75);
if perc_75 + hinge < max_val {
breaks.push(perc_75 + hinge);
}
breaks
}
/// Calculates percentiles of a given dataset
pub fn percentile(perc: u8, data: &Vec<f64>) -> f64 {
let num_vals = data.len();
let mut sorted_data: Vec<f64> = vec![];
for item in data.iter().take(num_vals) {
sorted_data.push(*item);
}
sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let rank = (perc as f64 / 100.0) * (num_vals as f64 - 1.0);
if rank as usize == num_vals - 1 {
sorted_data[num_vals - 1]
} else {
let rank_int = rank as usize;
let rank_dec = rank - rank_int as f64;
sorted_data[rank_int] + rank_dec * (sorted_data[rank_int + 1] - sorted_data[rank_int])
}
}