ghostflow_ml/
neighbors.rs

1//! K-Nearest Neighbors algorithms
2
3use ghostflow_core::Tensor;
4use rayon::prelude::*;
5
/// K-Nearest Neighbors Classifier
///
/// Lazy learner: `fit` stores the training data verbatim; `predict` classifies
/// each query by a (possibly distance-weighted) vote among the `n_neighbors`
/// closest training points.
pub struct KNeighborsClassifier {
    /// Number of neighbors consulted per prediction.
    pub n_neighbors: usize,
    /// Vote weighting scheme (uniform or inverse-distance).
    pub weights: Weights,
    /// Distance metric used to rank neighbors.
    pub metric: Metric,
    /// Exponent for the Minkowski metric (2.0 is equivalent to Euclidean).
    pub p: f32,
    // Flattened row-major training features; populated by `fit`.
    x_train_: Option<Vec<f32>>,
    // Training labels stored as f32 class indices; populated by `fit`.
    y_train_: Option<Vec<f32>>,
    // Shape / label-count metadata cached by `fit`.
    n_samples_: usize,
    n_features_: usize,
    n_classes_: usize,
}
18
/// Neighbor vote weighting strategy.
#[derive(Clone, Copy, Debug)]
pub enum Weights {
    /// Every neighbor contributes equally.
    Uniform,
    /// Each neighbor contributes proportionally to 1 / distance.
    Distance,
}
24
/// Distance metric used to compare feature vectors.
#[derive(Clone, Copy, Debug)]
pub enum Metric {
    /// L2 (straight-line) distance.
    Euclidean,
    /// L1 (taxicab) distance.
    Manhattan,
    /// Lp distance with a configurable exponent `p`.
    Minkowski,
    /// One minus the cosine similarity.
    Cosine,
}
32
33impl KNeighborsClassifier {
34    pub fn new(n_neighbors: usize) -> Self {
35        KNeighborsClassifier {
36            n_neighbors,
37            weights: Weights::Uniform,
38            metric: Metric::Euclidean,
39            p: 2.0,
40            x_train_: None,
41            y_train_: None,
42            n_samples_: 0,
43            n_features_: 0,
44            n_classes_: 0,
45        }
46    }
47
48    pub fn weights(mut self, weights: Weights) -> Self {
49        self.weights = weights;
50        self
51    }
52
53    pub fn metric(mut self, metric: Metric) -> Self {
54        self.metric = metric;
55        self
56    }
57
58    fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
59        match self.metric {
60            Metric::Euclidean => {
61                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
62            }
63            Metric::Manhattan => {
64                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
65            }
66            Metric::Minkowski => {
67                a.iter().zip(b.iter())
68                    .map(|(&ai, &bi)| (ai - bi).abs().powf(self.p))
69                    .sum::<f32>()
70                    .powf(1.0 / self.p)
71            }
72            Metric::Cosine => {
73                let dot: f32 = a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum();
74                let norm_a: f32 = a.iter().map(|&x| x * x).sum::<f32>().sqrt();
75                let norm_b: f32 = b.iter().map(|&x| x * x).sum::<f32>().sqrt();
76                1.0 - dot / (norm_a * norm_b + 1e-10)
77            }
78        }
79    }
80
81    fn find_k_nearest(&self, query: &[f32]) -> Vec<(usize, f32)> {
82        let x_train = self.x_train_.as_ref().unwrap();
83        
84        let mut distances: Vec<(usize, f32)> = (0..self.n_samples_)
85            .map(|i| {
86                let train_point = &x_train[i * self.n_features_..(i + 1) * self.n_features_];
87                let dist = self.distance(query, train_point);
88                (i, dist)
89            })
90            .collect();
91
92        distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
93        distances.truncate(self.n_neighbors);
94        distances
95    }
96
97    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
98        let x_data = x.data_f32();
99        let y_data = y.data_f32();
100        
101        self.n_samples_ = x.dims()[0];
102        self.n_features_ = x.dims()[1];
103        self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
104        
105        self.x_train_ = Some(x_data);
106        self.y_train_ = Some(y_data);
107    }
108
109    pub fn predict(&self, x: &Tensor) -> Tensor {
110        let x_data = x.data_f32();
111        let n_samples = x.dims()[0];
112        let n_features = x.dims()[1];
113        
114        let y_train = self.y_train_.as_ref().expect("Model not fitted");
115        
116        let predictions: Vec<f32> = (0..n_samples)
117            .into_par_iter()
118            .map(|i| {
119                let query = &x_data[i * n_features..(i + 1) * n_features];
120                let neighbors = self.find_k_nearest(query);
121                
122                match self.weights {
123                    Weights::Uniform => {
124                        let mut class_counts = vec![0usize; self.n_classes_];
125                        for (neighbor_idx, _) in neighbors {
126                            let class = y_train[neighbor_idx] as usize;
127                            if class < self.n_classes_ {
128                                class_counts[class] += 1;
129                            }
130                        }
131                        class_counts.iter()
132                            .enumerate()
133                            .max_by_key(|(_, &count)| count)
134                            .map(|(class, _)| class as f32)
135                            .unwrap_or(0.0)
136                    }
137                    Weights::Distance => {
138                        let mut class_weights = vec![0.0f32; self.n_classes_];
139                        for (neighbor_idx, dist) in neighbors {
140                            let class = y_train[neighbor_idx] as usize;
141                            if class < self.n_classes_ {
142                                let weight = if dist < 1e-10 { 1e10 } else { 1.0 / dist };
143                                class_weights[class] += weight;
144                            }
145                        }
146                        class_weights.iter()
147                            .enumerate()
148                            .max_by(|(_, &a), (_, &b)| a.partial_cmp(&b).unwrap())
149                            .map(|(class, _)| class as f32)
150                            .unwrap_or(0.0)
151                    }
152                }
153            })
154            .collect();
155        
156        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
157    }
158
159    pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
160        let predictions = self.predict(x);
161        let pred_data = predictions.data_f32();
162        let y_data = y.data_f32();
163        
164        let correct: usize = pred_data.iter()
165            .zip(y_data.iter())
166            .filter(|(&p, &y)| (p - y).abs() < 0.5)
167            .count();
168        
169        correct as f32 / y_data.len() as f32
170    }
171}
172
173
/// K-Nearest Neighbors Regressor
///
/// Lazy learner: `fit` stores the training data verbatim; `predict` returns
/// the (possibly distance-weighted) mean target of the `n_neighbors` closest
/// training points.
pub struct KNeighborsRegressor {
    /// Number of neighbors averaged per prediction.
    pub n_neighbors: usize,
    /// Averaging scheme (uniform mean or inverse-distance weighted mean).
    pub weights: Weights,
    /// Distance metric used to rank neighbors.
    pub metric: Metric,
    /// Exponent for the Minkowski metric (2.0 is equivalent to Euclidean).
    pub p: f32,
    // Flattened row-major training features; populated by `fit`.
    x_train_: Option<Vec<f32>>,
    // Training targets; populated by `fit`.
    y_train_: Option<Vec<f32>>,
    // Shape metadata cached by `fit`.
    n_samples_: usize,
    n_features_: usize,
}
185
186impl KNeighborsRegressor {
187    pub fn new(n_neighbors: usize) -> Self {
188        KNeighborsRegressor {
189            n_neighbors,
190            weights: Weights::Uniform,
191            metric: Metric::Euclidean,
192            p: 2.0,
193            x_train_: None,
194            y_train_: None,
195            n_samples_: 0,
196            n_features_: 0,
197        }
198    }
199
200    pub fn weights(mut self, weights: Weights) -> Self {
201        self.weights = weights;
202        self
203    }
204
205    fn distance(&self, a: &[f32], b: &[f32]) -> f32 {
206        match self.metric {
207            Metric::Euclidean => {
208                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
209            }
210            Metric::Manhattan => {
211                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).abs()).sum()
212            }
213            _ => {
214                a.iter().zip(b.iter()).map(|(&ai, &bi)| (ai - bi).powi(2)).sum::<f32>().sqrt()
215            }
216        }
217    }
218
219    fn find_k_nearest(&self, query: &[f32]) -> Vec<(usize, f32)> {
220        let x_train = self.x_train_.as_ref().unwrap();
221        
222        let mut distances: Vec<(usize, f32)> = (0..self.n_samples_)
223            .map(|i| {
224                let train_point = &x_train[i * self.n_features_..(i + 1) * self.n_features_];
225                let dist = self.distance(query, train_point);
226                (i, dist)
227            })
228            .collect();
229
230        distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
231        distances.truncate(self.n_neighbors);
232        distances
233    }
234
235    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
236        self.n_samples_ = x.dims()[0];
237        self.n_features_ = x.dims()[1];
238        self.x_train_ = Some(x.data_f32());
239        self.y_train_ = Some(y.data_f32());
240    }
241
242    pub fn predict(&self, x: &Tensor) -> Tensor {
243        let x_data = x.data_f32();
244        let n_samples = x.dims()[0];
245        let n_features = x.dims()[1];
246        
247        let y_train = self.y_train_.as_ref().expect("Model not fitted");
248        
249        let predictions: Vec<f32> = (0..n_samples)
250            .into_par_iter()
251            .map(|i| {
252                let query = &x_data[i * n_features..(i + 1) * n_features];
253                let neighbors = self.find_k_nearest(query);
254                
255                match self.weights {
256                    Weights::Uniform => {
257                        let sum: f32 = neighbors.iter()
258                            .map(|(idx, _)| y_train[*idx])
259                            .sum();
260                        sum / neighbors.len() as f32
261                    }
262                    Weights::Distance => {
263                        let mut weighted_sum = 0.0f32;
264                        let mut total_weight = 0.0f32;
265                        for (idx, dist) in neighbors {
266                            let weight = if dist < 1e-10 { 1e10 } else { 1.0 / dist };
267                            weighted_sum += weight * y_train[idx];
268                            total_weight += weight;
269                        }
270                        weighted_sum / total_weight.max(1e-10)
271                    }
272                }
273            })
274            .collect();
275        
276        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
277    }
278
279    pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
280        let predictions = self.predict(x);
281        let pred_data = predictions.data_f32();
282        let y_data = y.data_f32();
283        
284        let y_mean: f32 = y_data.iter().sum::<f32>() / y_data.len() as f32;
285        let ss_res: f32 = pred_data.iter()
286            .zip(y_data.iter())
287            .map(|(&p, &y)| (y - p).powi(2))
288            .sum();
289        let ss_tot: f32 = y_data.iter()
290            .map(|&y| (y - y_mean).powi(2))
291            .sum();
292        
293        1.0 - ss_res / ss_tot.max(1e-10)
294    }
295}
296
#[cfg(test)]
mod tests {
    use super::*;

    /// Two tight, well-separated clusters — 3-NN should classify the
    /// training set itself reasonably well.
    #[test]
    fn test_knn_classifier() {
        let features = [0.0f32, 0.0, 0.1, 0.1, 1.0, 1.0, 1.1, 1.1];
        let labels = [0.0f32, 0.0, 1.0, 1.0];
        let x = Tensor::from_slice(&features, &[4, 2]).unwrap();
        let y = Tensor::from_slice(&labels, &[4]).unwrap();

        let mut model = KNeighborsClassifier::new(3);
        model.fit(&x, &y);

        assert!(model.score(&x, &y) >= 0.5);
    }

    /// Perfectly linear target y = 2x — 3-NN averaging should achieve a
    /// decent R² on the training data.
    #[test]
    fn test_knn_regressor() {
        let inputs = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let targets = [2.0f32, 4.0, 6.0, 8.0, 10.0];
        let x = Tensor::from_slice(&inputs, &[5, 1]).unwrap();
        let y = Tensor::from_slice(&targets, &[5]).unwrap();

        let mut model = KNeighborsRegressor::new(3);
        model.fit(&x, &y);

        assert!(model.score(&x, &y) >= 0.5);
    }
}
330
331