1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
use num_traits::ToPrimitive;

/// Represents a unique value found within a sorted dataset along with the indices of its first and last occurrences in the dataset
pub struct UniqueVal {
    pub val: f64,
    pub first: usize,
    pub last: usize,
}

/// Represents a single bin in a classification, including the bin's lowest (inclusive) and highest (exclusive) values and the number of points within it
pub struct Bin {
    pub bin_start: f64,
    pub bin_end: f64,
    pub count: u64,
}

impl PartialEq for Bin {
    fn eq(&self, other: &Self) -> bool {
        let starts_eq: bool = self.bin_start == other.bin_start;
        let ends_eq: bool = self.bin_end == other.bin_end;
        let counts_eq: bool = self.count == other.count;
        starts_eq && ends_eq && counts_eq
    }
}

/// Represents a full classification, which is a collection of Bin objects
pub type Classification = Vec<Bin>;

/// Translates generic numeric vectors to Vec<f64> using the ToPrimitive trait from the num crate
///
/// # Arguments
///
/// * `data` - A reference to a vector of generic type T where T implements the ToPrimitive trait
pub fn to_vec_f64<T: ToPrimitive>(data: &[T]) -> Vec<f64> {
    let mut result: Vec<f64> = vec![];
    for item in data {
        result.push(item.to_f64().unwrap());
    }
    result
}

/// Populates an empty vector of UniqueVal objects for each unique value in the dataset in the format (value, first occurrence index, last occurrence index)
///
/// # Arguments
///
/// * `unique_val_map` - A mutable reference to an empty vector of UniqueVals
/// * `vals` - A reference to the data (sorted, ascending) to use in populating unique_val_map
pub fn create_unique_val_mapping(unique_val_map: &mut Vec<UniqueVal>, vals: &[f64]) {
    unique_val_map.clear();
    let mut idx: i64 = -1;

    for (i, item) in vals.iter().enumerate() {
        if unique_val_map.is_empty() {
            idx += 1;
            unique_val_map.push(UniqueVal {
                val: *item,
                first: i,
                last: i,
            });
        } else if unique_val_map[idx as usize].val != *item {
            unique_val_map[idx as usize].last = i - 1;
            idx += 1;
            unique_val_map.push(UniqueVal {
                val: *item,
                first: i,
                last: i,
            });
        }
    }
}

/// Adjusts break indices from unique value breaks to normal breaks, accounting for repeated values, given unique value breaks, a unique value map, and an empty vector for normal breaks
///
/// # Arguments
///
/// * `u_val_breaks` - A reference to a vector of uniquely valued breaks (sorted, ascending)
/// * `u_val_map` - A reference to a map of unique values to their first and last occurrences in the dataset
/// * `normal_breaks` - A mutable reference to an empty vector to populate with adjusted break indices
pub fn unique_to_normal_breaks(
    u_val_breaks: &Vec<usize>,
    u_val_map: &[UniqueVal],
    normal_breaks: &mut Vec<usize>,
) {
    if normal_breaks.len() != u_val_breaks.len() {
        normal_breaks.resize(u_val_breaks.len(), 0);
    }
    for i in 0..u_val_breaks.len() {
        normal_breaks[i] = u_val_map[u_val_breaks[i]].first;
    }
}

/// Returns a Classification object given a set of breaks between bins and the original dataset
///
/// # Arguments
///
/// * `breaks` - A reference to a vector of breaks (f64) generated through any classification function or manually
/// * `data` - A reference to a vector of unsorted data points (f64) used to count the points in each bin
///
/// # Examples
///
/// ```
/// use classify::{breaks_to_classification};
/// use classify::{Classification, Bin};
///
/// let data: Vec<f64> = vec![1.0, 2.0, 4.0, 5.0, 7.0, 8.0];
/// let breaks: Vec<f64> = vec![2.0, 5.0];
///
/// let result: Classification = breaks_to_classification(&breaks, &data);
/// let expected: Classification = vec![
///     Bin{bin_start: 1.0, bin_end: 2.0, count: 1},
///     Bin{bin_start: 2.0, bin_end: 5.0, count: 2},
///     Bin{bin_start: 5.0, bin_end: 8.0, count: 3}
/// ];
///
/// assert!(result == expected);
/// ```
pub fn breaks_to_classification<T: ToPrimitive>(breaks: &Vec<f64>, data: &[T]) -> Classification {
    let data = to_vec_f64(data);

    let mut min_value = data[0];
    let mut max_value = data[0];
    for item in &data {
        if *item < min_value {
            min_value = *item;
        }
        if *item > max_value {
            max_value = *item;
        }
    }

    let mut bounds: Vec<f64> = vec![min_value];
    for item in breaks {
        bounds.push(*item);
    }
    bounds.push(max_value);

    let mut results: Classification = vec![];
    for i in 0..(bounds.len() - 1) {
        results.push(Bin {
            bin_start: bounds[i],
            bin_end: bounds[i + 1],
            count: 0,
        });
    }

    let num_bins = results.len();
    for bin in results.iter_mut().take(num_bins) {
        for item in &data {
            if bin.bin_start <= *item && *item < bin.bin_end {
                bin.count += 1;
            }
        }
    }
    results[num_bins - 1].count += 1;

    results
}

/// Returns an Option<usize> containing the index of the Bin within which a value should fall given the value and a Classification (returns None if the value is outside of the Classification's range)
///
/// # Arguments
///
/// * `val` - Data value to classify
/// * `class` - Classification object
///
/// # Examples
///
/// ```
/// use classify::{classify_val};
/// use classify::{Classification, Bin};
///
/// let vals: Vec<f64> = vec![0.0, 1.5, 3.5];
/// let class: Classification = vec![
///     Bin{bin_start: 0.0, bin_end: 1.0, count: 5},
///     Bin{bin_start: 1.0, bin_end: 2.0, count: 5},
///     Bin{bin_start: 2.0, bin_end: 3.0, count: 5}
/// ];
///
/// let mut results: Vec<Option<usize>> = vec![];
/// for val in vals {results.push(classify_val(val, &class))}
///
/// assert_eq!(results, vec![Some(0), Some(1), None])
/// ```
pub fn classify_val(val: f64, class: &Classification) -> Option<usize> {
    if val < class[0].bin_start || val > class[class.len() - 1].bin_end {
        return None;
    }
    for (i, _item) in class.iter().enumerate() {
        if class[i].bin_start <= val && val < class[i].bin_end {
            return Some(i);
        }
    }
    Some(class.len() - 1) // Accounts for case where val is maximum within dataset
}