1use ghostflow_core::Tensor;
4use std::f32::consts::PI;
5
/// Gaussian Naive Bayes classifier.
///
/// Fits one univariate Gaussian per (class, feature) pair; `var_smoothing`
/// is added to every variance so the log-density stays finite.
pub struct GaussianNB {
    /// Additive variance smoothing term (default `1e-9`).
    pub var_smoothing: f32,
    /// Empirical prior P(class), filled in by `fit`.
    class_prior_: Option<Vec<f32>>,
    /// Number of training samples per class.
    class_count_: Option<Vec<usize>>,
    /// Per-class feature means, indexed `[class][feature]`.
    theta_: Option<Vec<Vec<f32>>>,
    /// Per-class (smoothed) feature variances, indexed `[class][feature]`.
    var_: Option<Vec<Vec<f32>>>,
    n_classes_: usize,
    n_features_: usize,
}
16
17impl GaussianNB {
18 pub fn new() -> Self {
19 GaussianNB {
20 var_smoothing: 1e-9,
21 class_prior_: None,
22 class_count_: None,
23 theta_: None,
24 var_: None,
25 n_classes_: 0,
26 n_features_: 0,
27 }
28 }
29
30 pub fn var_smoothing(mut self, smoothing: f32) -> Self {
31 self.var_smoothing = smoothing;
32 self
33 }
34
35 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
36 let x_data = x.data_f32();
37 let y_data = y.data_f32();
38 let n_samples = x.dims()[0];
39 let n_features = x.dims()[1];
40
41 self.n_features_ = n_features;
42 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
43
44 let mut class_count = vec![0usize; self.n_classes_];
46 for &label in &y_data {
47 class_count[label as usize] += 1;
48 }
49
50 let class_prior: Vec<f32> = class_count.iter()
52 .map(|&c| c as f32 / n_samples as f32)
53 .collect();
54
55 let mut theta = vec![vec![0.0f32; n_features]; self.n_classes_];
57 let mut var = vec![vec![0.0f32; n_features]; self.n_classes_];
58
59 for i in 0..n_samples {
61 let class = y_data[i] as usize;
62 for j in 0..n_features {
63 theta[class][j] += x_data[i * n_features + j];
64 }
65 }
66 for c in 0..self.n_classes_ {
67 if class_count[c] > 0 {
68 for j in 0..n_features {
69 theta[c][j] /= class_count[c] as f32;
70 }
71 }
72 }
73
74 for i in 0..n_samples {
76 let class = y_data[i] as usize;
77 for j in 0..n_features {
78 let diff = x_data[i * n_features + j] - theta[class][j];
79 var[class][j] += diff * diff;
80 }
81 }
82 for c in 0..self.n_classes_ {
83 if class_count[c] > 0 {
84 for j in 0..n_features {
85 var[c][j] = var[c][j] / class_count[c] as f32 + self.var_smoothing;
86 }
87 }
88 }
89
90 self.class_prior_ = Some(class_prior);
91 self.class_count_ = Some(class_count);
92 self.theta_ = Some(theta);
93 self.var_ = Some(var);
94 }
95
96 fn log_likelihood(&self, x: &[f32], class: usize) -> f32 {
97 let theta = self.theta_.as_ref().unwrap();
98 let var = self.var_.as_ref().unwrap();
99 let prior = self.class_prior_.as_ref().unwrap();
100
101 let mut log_prob = prior[class].ln();
102
103 for j in 0..self.n_features_ {
104 let mean = theta[class][j];
105 let variance = var[class][j];
106 let diff = x[j] - mean;
107
108 log_prob += -0.5 * (2.0 * PI * variance).ln() - (diff * diff) / (2.0 * variance);
110 }
111
112 log_prob
113 }
114
115 pub fn predict(&self, x: &Tensor) -> Tensor {
116 let x_data = x.data_f32();
117 let n_samples = x.dims()[0];
118 let n_features = x.dims()[1];
119
120 let predictions: Vec<f32> = (0..n_samples)
121 .map(|i| {
122 let sample = &x_data[i * n_features..(i + 1) * n_features];
123
124 let mut best_class = 0;
125 let mut best_log_prob = f32::NEG_INFINITY;
126
127 for c in 0..self.n_classes_ {
128 let log_prob = self.log_likelihood(sample, c);
129 if log_prob > best_log_prob {
130 best_log_prob = log_prob;
131 best_class = c;
132 }
133 }
134
135 best_class as f32
136 })
137 .collect();
138
139 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
140 }
141
142 pub fn predict_proba(&self, x: &Tensor) -> Tensor {
143 let x_data = x.data_f32();
144 let n_samples = x.dims()[0];
145 let n_features = x.dims()[1];
146
147 let mut probs = Vec::with_capacity(n_samples * self.n_classes_);
148
149 for i in 0..n_samples {
150 let sample = &x_data[i * n_features..(i + 1) * n_features];
151
152 let log_probs: Vec<f32> = (0..self.n_classes_)
153 .map(|c| self.log_likelihood(sample, c))
154 .collect();
155
156 let max_log = log_probs.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
158 let exp_sum: f32 = log_probs.iter().map(|&lp| (lp - max_log).exp()).sum();
159
160 for &lp in &log_probs {
161 probs.push((lp - max_log).exp() / exp_sum);
162 }
163 }
164
165 Tensor::from_slice(&probs, &[n_samples, self.n_classes_]).unwrap()
166 }
167
168 pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
169 let predictions = self.predict(x);
170 let pred_data = predictions.data_f32();
171 let y_data = y.data_f32();
172
173 let correct: usize = pred_data.iter()
174 .zip(y_data.iter())
175 .filter(|(&p, &y)| (p - y).abs() < 0.5)
176 .count();
177
178 correct as f32 / y_data.len() as f32
179 }
180}
181
182impl Default for GaussianNB {
183 fn default() -> Self {
184 Self::new()
185 }
186}
187
188
/// Multinomial Naive Bayes classifier for non-negative count features
/// (e.g. word counts), with additive (Lidstone) smoothing.
pub struct MultinomialNB {
    /// Additive smoothing parameter (default `1.0` = Laplace smoothing).
    pub alpha: f32,
    /// `ln P(class)`, filled in by `fit`.
    class_log_prior_: Option<Vec<f32>>,
    /// `ln P(feature | class)`, indexed `[class][feature]`.
    feature_log_prob_: Option<Vec<Vec<f32>>>,
    /// Number of training samples per class.
    class_count_: Option<Vec<f32>>,
    /// Summed feature counts per class, indexed `[class][feature]`.
    feature_count_: Option<Vec<Vec<f32>>>,
    n_classes_: usize,
    n_features_: usize,
}
199
200impl MultinomialNB {
201 pub fn new() -> Self {
202 MultinomialNB {
203 alpha: 1.0,
204 class_log_prior_: None,
205 feature_log_prob_: None,
206 class_count_: None,
207 feature_count_: None,
208 n_classes_: 0,
209 n_features_: 0,
210 }
211 }
212
213 pub fn alpha(mut self, alpha: f32) -> Self {
214 self.alpha = alpha;
215 self
216 }
217
218 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
219 let x_data = x.data_f32();
220 let y_data = y.data_f32();
221 let n_samples = x.dims()[0];
222 let n_features = x.dims()[1];
223
224 self.n_features_ = n_features;
225 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
226
227 let mut class_count = vec![0.0f32; self.n_classes_];
229 let mut feature_count = vec![vec![0.0f32; n_features]; self.n_classes_];
230
231 for i in 0..n_samples {
232 let class = y_data[i] as usize;
233 class_count[class] += 1.0;
234 for j in 0..n_features {
235 feature_count[class][j] += x_data[i * n_features + j];
236 }
237 }
238
239 let mut class_log_prior = vec![0.0f32; self.n_classes_];
241 let mut feature_log_prob = vec![vec![0.0f32; n_features]; self.n_classes_];
242
243 let total_samples = n_samples as f32;
244 for c in 0..self.n_classes_ {
245 class_log_prior[c] = (class_count[c] / total_samples).ln();
246
247 let smoothed_total: f32 = feature_count[c].iter().sum::<f32>() + self.alpha * n_features as f32;
248 for j in 0..n_features {
249 feature_log_prob[c][j] = ((feature_count[c][j] + self.alpha) / smoothed_total).ln();
250 }
251 }
252
253 self.class_log_prior_ = Some(class_log_prior);
254 self.feature_log_prob_ = Some(feature_log_prob);
255 self.class_count_ = Some(class_count);
256 self.feature_count_ = Some(feature_count);
257 }
258
259 pub fn predict(&self, x: &Tensor) -> Tensor {
260 let x_data = x.data_f32();
261 let n_samples = x.dims()[0];
262 let n_features = x.dims()[1];
263
264 let class_log_prior = self.class_log_prior_.as_ref().unwrap();
265 let feature_log_prob = self.feature_log_prob_.as_ref().unwrap();
266
267 let predictions: Vec<f32> = (0..n_samples)
268 .map(|i| {
269 let sample = &x_data[i * n_features..(i + 1) * n_features];
270
271 let mut best_class = 0;
272 let mut best_log_prob = f32::NEG_INFINITY;
273
274 for c in 0..self.n_classes_ {
275 let mut log_prob = class_log_prior[c];
276 for j in 0..n_features {
277 log_prob += sample[j] * feature_log_prob[c][j];
278 }
279
280 if log_prob > best_log_prob {
281 best_log_prob = log_prob;
282 best_class = c;
283 }
284 }
285
286 best_class as f32
287 })
288 .collect();
289
290 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
291 }
292
293 pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
294 let predictions = self.predict(x);
295 let pred_data = predictions.data_f32();
296 let y_data = y.data_f32();
297
298 let correct: usize = pred_data.iter()
299 .zip(y_data.iter())
300 .filter(|(&p, &y)| (p - y).abs() < 0.5)
301 .count();
302
303 correct as f32 / y_data.len() as f32
304 }
305}
306
307impl Default for MultinomialNB {
308 fn default() -> Self {
309 Self::new()
310 }
311}
312
/// Bernoulli Naive Bayes classifier for binary/boolean features.
///
/// Inputs may be binarized against a threshold before fitting/prediction.
pub struct BernoulliNB {
    /// Additive smoothing parameter (default `1.0`).
    pub alpha: f32,
    /// Threshold for binarizing inputs; `None` means use values as-is.
    /// Defaults to `Some(0.0)`.
    pub binarize: Option<f32>,
    /// `ln P(class)`, filled in by `fit`.
    class_log_prior_: Option<Vec<f32>>,
    /// Interleaved per-feature log-probs `[ln p, ln(1-p)]`,
    /// indexed `[class][feature * 2 + (0|1)]`.
    feature_log_prob_: Option<Vec<Vec<f32>>>,
    n_classes_: usize,
    n_features_: usize,
}
322
323impl BernoulliNB {
324 pub fn new() -> Self {
325 BernoulliNB {
326 alpha: 1.0,
327 binarize: Some(0.0),
328 class_log_prior_: None,
329 feature_log_prob_: None,
330 n_classes_: 0,
331 n_features_: 0,
332 }
333 }
334
335 pub fn binarize(mut self, threshold: f32) -> Self {
336 self.binarize = Some(threshold);
337 self
338 }
339
340 fn binarize_data(&self, x: &[f32]) -> Vec<f32> {
341 if let Some(threshold) = self.binarize {
342 x.iter().map(|&v| if v > threshold { 1.0 } else { 0.0 }).collect()
343 } else {
344 x.to_vec()
345 }
346 }
347
348 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
349 let x_data = self.binarize_data(&x.data_f32());
350 let y_data = y.data_f32();
351 let n_samples = x.dims()[0];
352 let n_features = x.dims()[1];
353
354 self.n_features_ = n_features;
355 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
356
357 let mut class_count = vec![0.0f32; self.n_classes_];
358 let mut feature_count = vec![vec![0.0f32; n_features]; self.n_classes_];
359
360 for i in 0..n_samples {
361 let class = y_data[i] as usize;
362 class_count[class] += 1.0;
363 for j in 0..n_features {
364 feature_count[class][j] += x_data[i * n_features + j];
365 }
366 }
367
368 let mut class_log_prior = vec![0.0f32; self.n_classes_];
369 let mut feature_log_prob = vec![vec![0.0f32; n_features * 2]; self.n_classes_];
370
371 let total_samples = n_samples as f32;
372 for c in 0..self.n_classes_ {
373 class_log_prior[c] = (class_count[c] / total_samples).ln();
374
375 for j in 0..n_features {
376 let p = (feature_count[c][j] + self.alpha) / (class_count[c] + 2.0 * self.alpha);
377 feature_log_prob[c][j * 2] = p.ln(); feature_log_prob[c][j * 2 + 1] = (1.0 - p).ln(); }
380 }
381
382 self.class_log_prior_ = Some(class_log_prior);
383 self.feature_log_prob_ = Some(feature_log_prob);
384 }
385
386 pub fn predict(&self, x: &Tensor) -> Tensor {
387 let x_data = self.binarize_data(&x.data_f32());
388 let n_samples = x.dims()[0];
389 let n_features = x.dims()[1];
390
391 let class_log_prior = self.class_log_prior_.as_ref().unwrap();
392 let feature_log_prob = self.feature_log_prob_.as_ref().unwrap();
393
394 let predictions: Vec<f32> = (0..n_samples)
395 .map(|i| {
396 let sample = &x_data[i * n_features..(i + 1) * n_features];
397
398 let mut best_class = 0;
399 let mut best_log_prob = f32::NEG_INFINITY;
400
401 for c in 0..self.n_classes_ {
402 let mut log_prob = class_log_prior[c];
403 for j in 0..n_features {
404 if sample[j] > 0.5 {
405 log_prob += feature_log_prob[c][j * 2];
406 } else {
407 log_prob += feature_log_prob[c][j * 2 + 1];
408 }
409 }
410
411 if log_prob > best_log_prob {
412 best_log_prob = log_prob;
413 best_class = c;
414 }
415 }
416
417 best_class as f32
418 })
419 .collect();
420
421 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
422 }
423
424 pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
425 let predictions = self.predict(x);
426 let pred_data = predictions.data_f32();
427 let y_data = y.data_f32();
428
429 let correct: usize = pred_data.iter()
430 .zip(y_data.iter())
431 .filter(|(&p, &y)| (p - y).abs() < 0.5)
432 .count();
433
434 correct as f32 / y_data.len() as f32
435 }
436}
437
438impl Default for BernoulliNB {
439 fn default() -> Self {
440 Self::new()
441 }
442}
443
/// Complement Naive Bayes classifier.
///
/// Estimates feature statistics from the *complement* of each class, which
/// is more robust on imbalanced data than standard multinomial NB.
pub struct ComplementNB {
    /// Additive smoothing parameter (default `1.0`).
    pub alpha: f32,
    /// When true, L1-normalizes each class's feature log-probabilities.
    pub norm: bool,
    /// `ln P(class)`, filled in by `fit` (not used by `predict`).
    class_log_prior_: Option<Vec<f32>>,
    /// Complement feature log-probabilities, indexed `[class][feature]`.
    feature_log_prob_: Option<Vec<Vec<f32>>>,
    n_classes_: usize,
    n_features_: usize,
}
453
454impl ComplementNB {
455 pub fn new() -> Self {
456 ComplementNB {
457 alpha: 1.0,
458 norm: false,
459 class_log_prior_: None,
460 feature_log_prob_: None,
461 n_classes_: 0,
462 n_features_: 0,
463 }
464 }
465
466 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
467 let x_data = x.data_f32();
468 let y_data = y.data_f32();
469 let n_samples = x.dims()[0];
470 let n_features = x.dims()[1];
471
472 self.n_features_ = n_features;
473 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
474
475 let mut total_feature_count = vec![0.0f32; n_features];
477 let mut class_feature_count = vec![vec![0.0f32; n_features]; self.n_classes_];
478 let mut class_count = vec![0.0f32; self.n_classes_];
479
480 for i in 0..n_samples {
481 let class = y_data[i] as usize;
482 class_count[class] += 1.0;
483 for j in 0..n_features {
484 let val = x_data[i * n_features + j];
485 total_feature_count[j] += val;
486 class_feature_count[class][j] += val;
487 }
488 }
489
490 let mut feature_log_prob = vec![vec![0.0f32; n_features]; self.n_classes_];
492
493 for c in 0..self.n_classes_ {
494 let mut complement_sum = 0.0f32;
495 for j in 0..n_features {
496 let complement_count = total_feature_count[j] - class_feature_count[c][j] + self.alpha;
497 complement_sum += complement_count;
498 }
499
500 for j in 0..n_features {
501 let complement_count = total_feature_count[j] - class_feature_count[c][j] + self.alpha;
502 feature_log_prob[c][j] = (complement_count / complement_sum).ln();
503 }
504
505 if self.norm {
507 let norm: f32 = feature_log_prob[c].iter().map(|&x| x.abs()).sum();
508 if norm > 0.0 {
509 for j in 0..n_features {
510 feature_log_prob[c][j] /= norm;
511 }
512 }
513 }
514 }
515
516 let class_log_prior: Vec<f32> = class_count.iter()
517 .map(|&c| (c / n_samples as f32).ln())
518 .collect();
519
520 self.class_log_prior_ = Some(class_log_prior);
521 self.feature_log_prob_ = Some(feature_log_prob);
522 }
523
524 pub fn predict(&self, x: &Tensor) -> Tensor {
525 let x_data = x.data_f32();
526 let n_samples = x.dims()[0];
527 let n_features = x.dims()[1];
528
529 let feature_log_prob = self.feature_log_prob_.as_ref().unwrap();
530
531 let predictions: Vec<f32> = (0..n_samples)
532 .map(|i| {
533 let sample = &x_data[i * n_features..(i + 1) * n_features];
534
535 let mut best_class = 0;
536 let mut best_score = f32::NEG_INFINITY;
537
538 for c in 0..self.n_classes_ {
539 let mut score = 0.0f32;
541 for j in 0..n_features {
542 score -= sample[j] * feature_log_prob[c][j];
543 }
544
545 if score > best_score {
546 best_score = score;
547 best_class = c;
548 }
549 }
550
551 best_class as f32
552 })
553 .collect();
554
555 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
556 }
557}
558
559impl Default for ComplementNB {
560 fn default() -> Self {
561 Self::new()
562 }
563}
564
#[cfg(test)]
mod tests {
    use super::*;

    /// Two well-separated 2-D clusters should be at least half-right.
    #[test]
    fn test_gaussian_nb() {
        let features = [1.0f32, 2.0, 1.5, 1.8, 5.0, 8.0, 6.0, 9.0];
        let labels = [0.0f32, 0.0, 1.0, 1.0];

        let x = Tensor::from_slice(&features, &[4, 2]).unwrap();
        let y = Tensor::from_slice(&labels, &[4]).unwrap();

        let mut model = GaussianNB::new();
        model.fit(&x, &y);

        assert!(model.score(&x, &y) >= 0.5);
    }

    /// Count features; only checks the prediction tensor shape.
    #[test]
    fn test_multinomial_nb() {
        let features = [
            2.0f32, 1.0, 0.0,
            1.0, 2.0, 0.0,
            0.0, 1.0, 2.0,
            0.0, 0.0, 3.0,
        ];
        let labels = [0.0f32, 0.0, 1.0, 1.0];

        let x = Tensor::from_slice(&features, &[4, 3]).unwrap();
        let y = Tensor::from_slice(&labels, &[4]).unwrap();

        let mut model = MultinomialNB::new();
        model.fit(&x, &y);

        assert_eq!(model.predict(&x).dims(), &[4]);
    }
}
603
604