1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
use num_traits::ToPrimitive;
use rand::prelude::*;
use rand::rngs::StdRng;

use std::collections::HashSet;

use crate::utilities::{
    breaks_to_classification, create_unique_val_mapping, to_vec_f64, unique_to_normal_breaks,
};
use crate::utilities::{Classification, UniqueVal};

/// Returns a Classification object following the Jenks Natural Breaks algorithm given the desired number of bins and one-dimensional data
///
/// # Arguments
///
/// * `num_bins` - An integer (usize) representing the desired number of bins
/// * `data` - A reference to a collection of unsorted data points to generate a Classification for
///      
/// # Edge Cases
///
/// * Inputting large u64/i64 data (near their max values) will result in loss of precision because data is being cast to f64
/// * The maximum number of bins generated by this algorithm is the number of unique values in the dataset
///
/// # Examples
///
/// ```
/// use classify::get_jenks_classification;
/// use classify::{Classification, Bin};
/// use rand::prelude::*;
/// use rand::rngs::StdRng;
///
/// let data: Vec<usize> = vec![1, 2, 4, 5, 7, 8];
/// let num_bins = 3;
///
/// let result: Classification = get_jenks_classification(num_bins, &data);
/// let expected: Classification = vec![
///     Bin{bin_start: 1.0, bin_end: 4.0, count: 2},
///     Bin{bin_start: 4.0, bin_end: 7.0, count: 2},
///     Bin{bin_start: 7.0, bin_end: 8.0, count: 2}
/// ];
///
/// assert!(result == expected);
/// ```
pub fn get_jenks_classification<T: ToPrimitive>(num_bins: usize, data: &[T]) -> Classification {
    let breaks: Vec<f64> = get_jenks_breaks(num_bins, data);
    breaks_to_classification(&breaks, data)
}

/// Returns a vector of breaks generated through the Jenks Natural Breaks algorithm given the desired number of bins and a dataset
///
/// # Arguments
///
/// * `num_bins` - The desired number of bins
/// * `data` - A reference to a collection of unsorted data points to generate breaks for
///
/// # Edge Cases
///
/// * Inputting large u64/i64 data (near their max values) will result in loss of precision because data is being cast to f64
/// * The maximum number of bins generated by this algorithm is the number of unique values in the dataset
///
/// # Examples
///
/// ```
/// use classify::get_jenks_breaks;
/// use rand::prelude::*;
/// use rand::rngs::StdRng;
///
/// let data: Vec<i8> = vec![1, 2, 4, 5, 7, 8];
/// let num_bins = 3;
///
/// let result: Vec<f64> = get_jenks_breaks(num_bins, &data);
///
/// assert_eq!(result, vec![4.0, 7.0]);
/// ```
pub fn get_jenks_breaks<T: ToPrimitive>(num_bins: usize, data: &[T]) -> Vec<f64> {
    let data = to_vec_f64(data);

    let num_vals = data.len();

    let mut sorted_data: Vec<f64> = vec![];
    for item in data.iter().take(num_vals) {
        sorted_data.push(*item);
    }
    sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());

    let mut unique_val_map: Vec<UniqueVal> = vec![];
    create_unique_val_mapping(&mut unique_val_map, &sorted_data);

    let num_unique_vals = unique_val_map.len();
    let true_num_bins = std::cmp::min(num_unique_vals, num_bins);

    let gssd = calc_gssd(&sorted_data);

    let mut rand_breaks: Vec<usize> = vec![0_usize; true_num_bins - 1];
    let mut best_breaks: Vec<usize> = vec![0_usize; true_num_bins - 1];
    let mut unique_rand_breaks: Vec<usize> = vec![0_usize; true_num_bins - 1];

    let mut max_gvf: f64 = 0.0;

    let c = 5000 * 2200 * 4;
    let mut permutations = c / num_vals;
    if permutations < 10 {
        permutations = 10
    }
    if permutations > 10000 {
        permutations = 10000
    }
    println!("permutations: {}", permutations);

    let mut pseudo_rng = StdRng::seed_from_u64(123456789);

    for _ in 0..permutations {
        pick_rand_breaks(&mut unique_rand_breaks, &num_unique_vals, &mut pseudo_rng);
        unique_to_normal_breaks(&unique_rand_breaks, &unique_val_map, &mut rand_breaks);
        let new_gvf: f64 = calc_gvf(&rand_breaks, &sorted_data, &gssd);
        if new_gvf > max_gvf {
            max_gvf = new_gvf;
            best_breaks[..rand_breaks.len()].copy_from_slice(&rand_breaks[..]);
        }
    }

    let mut nat_breaks: Vec<f64> = vec![];
    nat_breaks.resize(best_breaks.len(), 0.0);
    for i in 0..best_breaks.len() {
        nat_breaks[i] = sorted_data[best_breaks[i]];
    }
    println!("Breaks: {:#?}", nat_breaks);

    nat_breaks
}

/// Populates a vector with a set of breaks as unique random integers that are valid indices within the dataset given the number of data points and an RNG
///
/// # Arguments
///
/// * `breaks` - A mutable reference to an empty vector of breaks whose length is taken to be the desired number of breaks
/// * `num_vals` - A reference to the number of data points
/// * `rng` - A mutable reference to a seedable random number generator (RNG) from the "rand" crate
pub fn pick_rand_breaks(breaks: &mut Vec<usize>, num_vals: &usize, rng: &mut StdRng) {
    let num_breaks = breaks.len();
    if num_breaks > num_vals - 1 {
        return;
    }

    let mut set = HashSet::new();
    while set.len() < num_breaks {
        set.insert(rng.gen_range(1..*num_vals));
    }
    let mut set_iter = set.iter();
    for item in breaks.iter_mut().take(set_iter.len()) {
        *item = *set_iter.next().unwrap();
    }
    breaks.sort_unstable();
}

/// Calculates goodness of variance fit (GVF) for a particular set of breaks on a dataset
///
/// # Arguments
///
/// * `breaks` - A reference to a vector (usize) of break indices (sorted, ascending)
/// * `vals` - A reference to a vector (f64) of data points (sorted, ascending)
/// * `gssd` - A reference to the global sum of squared deviations (GSSD)
pub fn calc_gvf(breaks: &Vec<usize>, vals: &Vec<f64>, gssd: &f64) -> f64 {
    let num_vals = vals.len();
    let num_bins = breaks.len() + 1;
    let mut tssd: f64 = 0.0;
    for i in 0..num_bins {
        let lower = if i == 0 { 0 } else { breaks[i - 1] };
        let upper = if i == num_bins - 1 {
            num_vals
        } else {
            breaks[i]
        };

        let mut mean: f64 = 0.0;
        let mut ssd: f64 = 0.0;
        for item in vals.iter().take(upper).skip(lower) {
            mean += item;
        }
        mean /= (upper - lower) as f64;
        for item in vals.iter().take(upper).skip(lower) {
            ssd += (item - mean) * (item - mean)
        }
        tssd += ssd;
    }
    1.0 - (tssd / gssd)
}

/// Calculates global sum of squared deviations (GSSD) for a particular dataset
///
/// # Arguments
///
/// * `data` - A reference to a vector (f64) of data points (sorted, ascending)
pub fn calc_gssd(data: &Vec<f64>) -> f64 {
    let num_vals = data.len();
    let mut mean = 0.0;
    let mut max_val: f64 = data[0];
    for item in data.iter().take(num_vals) {
        let val = *item;
        if val > max_val {
            max_val = val
        }
        mean += val;
    }
    mean /= num_vals as f64;

    let mut gssd: f64 = 0.0;
    for item in data.iter().take(num_vals) {
        let val = *item;
        gssd += (val - mean) * (val - mean);
    }

    gssd
}