1#![allow(non_snake_case)] use ndarray::{Array1, Array2};
18use serde::{Deserialize, Serialize};
19
20use so_core::error::{Error, Result};
21use so_linalg::solve;
22use so_stats::{mean, median, std};
23
/// Kernel weight functions for nonparametric smoothing.
///
/// Each kernel is a symmetric, non-negative weight function of the scaled
/// distance `u`; all variants except `Gaussian` have compact support on
/// `[-1, 1]` (see `Kernel::evaluate`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum Kernel {
    /// Standard normal density; positive everywhere (unbounded support).
    Gaussian,
    /// `0.75 * (1 - u^2)` on `[-1, 1]`.
    Epanechnikov,
    /// Constant `0.5` on `[-1, 1]` (box kernel).
    Uniform,
    /// `1 - |u|` on `[-1, 1]`.
    Triangular,
    /// `(15/16) * (1 - u^2)^2` on `[-1, 1]` (quartic kernel).
    Biweight,
    /// `(35/32) * (1 - u^2)^3` on `[-1, 1]`.
    Triweight,
    /// `(pi/4) * cos(pi*u/2)` on `[-1, 1]`.
    Cosine,
}
42
43impl Kernel {
44 fn evaluate(&self, u: f64) -> f64 {
46 let abs_u = u.abs();
47
48 match self {
49 Kernel::Gaussian => (-0.5 * u * u).exp() / (2.0 * std::f64::consts::PI).sqrt(),
50 Kernel::Epanechnikov => {
51 if abs_u <= 1.0 {
52 0.75 * (1.0 - u * u)
53 } else {
54 0.0
55 }
56 }
57 Kernel::Uniform => {
58 if abs_u <= 1.0 {
59 0.5
60 } else {
61 0.0
62 }
63 }
64 Kernel::Triangular => {
65 if abs_u <= 1.0 {
66 1.0 - abs_u
67 } else {
68 0.0
69 }
70 }
71 Kernel::Biweight => {
72 if abs_u <= 1.0 {
73 let t = 1.0 - u * u;
74 0.9375 * t * t } else {
76 0.0
77 }
78 }
79 Kernel::Triweight => {
80 if abs_u <= 1.0 {
81 let t = 1.0 - u * u;
82 1.09375 * t * t * t } else {
84 0.0
85 }
86 }
87 Kernel::Cosine => {
88 if abs_u <= 1.0 {
89 (std::f64::consts::PI / 2.0 * u).cos() * std::f64::consts::PI / 4.0
90 } else {
91 0.0
92 }
93 }
94 }
95 }
96
97 #[allow(dead_code)]
99 fn efficiency(&self) -> f64 {
100 match self {
101 Kernel::Gaussian => 0.951, Kernel::Epanechnikov => 1.0, Kernel::Uniform => 0.930, Kernel::Triangular => 0.986, Kernel::Biweight => 0.994, Kernel::Triweight => 0.999, Kernel::Cosine => 0.924, }
109 }
110}
111
/// Output of a fitted Nadaraya–Watson kernel regression.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KernelRegressionResults {
    /// Smoothed values, in the same order as the caller's input `x`.
    pub fitted_values: Array1<f64>,
    /// Sorted copy of the input `x` at which the fit was evaluated.
    pub evaluation_points: Array1<f64>,
    /// Bandwidth actually used (fixed by the caller or auto-selected).
    pub bandwidth: f64,
    /// Approximate effective degrees of freedom (trace of the smoother matrix).
    pub df: f64,
    /// Residual sum of squares.
    pub rss: f64,
}
126
/// Nadaraya–Watson kernel regression smoother (builder-style configuration).
pub struct KernelRegression {
    // Kernel used for local weighting; Gaussian by default.
    kernel: Kernel,
    // Fixed bandwidth, if supplied; otherwise chosen via `bandwidth_method`.
    bandwidth: Option<f64>,
    // Strategy used to select the bandwidth when none is fixed.
    bandwidth_method: BandwidthMethod,
}
133
/// Bandwidth selection strategies for kernel regression.
#[derive(Debug, Clone, Copy)]
pub enum BandwidthMethod {
    /// Silverman's rule of thumb using the robust min(sd, IQR/1.349) scale.
    Silverman,
    /// Scott's normal-reference rule.
    Scott,
    /// Least-squares (leave-one-out) cross-validation over a bandwidth grid.
    LSCV,
    /// Plug-in selector (currently implemented as a normal-reference rule).
    Plugin,
    /// User-supplied fixed bandwidth.
    Fixed(f64),
}
148
149impl KernelRegression {
150 pub fn new() -> Self {
152 Self {
153 kernel: Kernel::Gaussian,
154 bandwidth: None,
155 bandwidth_method: BandwidthMethod::Silverman,
156 }
157 }
158
159 pub fn kernel(mut self, kernel: Kernel) -> Self {
161 self.kernel = kernel;
162 self
163 }
164
165 pub fn bandwidth(mut self, bandwidth: f64) -> Self {
167 self.bandwidth = Some(bandwidth);
168 self.bandwidth_method = BandwidthMethod::Fixed(bandwidth);
169 self
170 }
171
172 pub fn bandwidth_method(mut self, method: BandwidthMethod) -> Self {
174 self.bandwidth_method = method;
175 self
176 }
177
178 pub fn fit(&self, x: &Array1<f64>, y: &Array1<f64>) -> Result<KernelRegressionResults> {
180 let n = x.len();
181
182 if n != y.len() {
183 return Err(Error::DataError(
184 "x and y must have the same length".to_string(),
185 ));
186 }
187
188 if n < 3 {
189 return Err(Error::DataError(
190 "Need at least 3 observations for kernel regression".to_string(),
191 ));
192 }
193
194 let h = match self.bandwidth {
196 Some(bw) => bw,
197 None => self.select_bandwidth(x, y)?,
198 };
199
200 let mut sorted_indices: Vec<usize> = (0..n).collect();
202 sorted_indices.sort_by(|&i, &j| x[i].partial_cmp(&x[j]).unwrap());
203
204 let x_sorted: Array1<f64> = sorted_indices.iter().map(|&i| x[i]).collect();
205 let mut fitted = Array1::zeros(n);
206
207 for (i, &x_i) in x_sorted.iter().enumerate() {
209 let mut numerator = 0.0;
210 let mut denominator = 0.0;
211
212 for j in 0..n {
213 let u = (x_i - x[j]) / h;
214 let k = self.kernel.evaluate(u);
215 numerator += k * y[j];
216 denominator += k;
217 }
218
219 if denominator > 1e-10 {
220 fitted[i] = numerator / denominator;
221 } else {
222 fitted[i] = mean(y).unwrap_or(0.0);
224 }
225 }
226
227 let mut fitted_original = Array1::zeros(n);
229 for (sorted_idx, &orig_idx) in sorted_indices.iter().enumerate() {
230 fitted_original[orig_idx] = fitted[sorted_idx];
231 }
232
233 let residuals = y - &fitted_original;
235 let rss = residuals.dot(&residuals);
236
237 let df = self.estimate_df(x, h);
239
240 Ok(KernelRegressionResults {
241 fitted_values: fitted_original,
242 evaluation_points: x_sorted,
243 bandwidth: h,
244 df,
245 rss,
246 })
247 }
248
249 fn select_bandwidth(&self, x: &Array1<f64>, _y: &Array1<f64>) -> Result<f64> {
251 let n = x.len() as f64;
252
253 match self.bandwidth_method {
254 BandwidthMethod::Silverman => {
255 let sigma = std(x, 1.0).unwrap_or(1.0);
257 let iqr = so_stats::iqr(x).unwrap_or(1.349 * sigma);
258 let scale = sigma.min(iqr / 1.349);
259 Ok(1.06 * scale * n.powf(-0.2))
260 }
261 BandwidthMethod::Scott => {
262 let sigma = std(x, 1.0).unwrap_or(1.0);
264 Ok(1.059 * sigma * n.powf(-0.2))
265 }
266 BandwidthMethod::LSCV => {
267 let mut best_h = 0.0;
269 let mut best_cv = f64::INFINITY;
270
271 let sigma = std(x, 1.0).unwrap_or(1.0);
273 let h_min = 0.1 * sigma * n.powf(-0.2);
274 let h_max = 2.0 * sigma * n.powf(-0.2);
275
276 for h in (1..=20).map(|i| h_min + (h_max - h_min) * (i as f64) / 20.0) {
277 let cv_score = self.cross_validation_score(x, _y, h);
278 if cv_score < best_cv {
279 best_cv = cv_score;
280 best_h = h;
281 }
282 }
283
284 Ok(best_h)
285 }
286 BandwidthMethod::Plugin => {
287 let sigma = std(x, 1.0).unwrap_or(1.0);
289 Ok(1.06 * sigma * n.powf(-0.2))
290 }
291 BandwidthMethod::Fixed(h) => Ok(h),
292 }
293 }
294
295 fn cross_validation_score(&self, x: &Array1<f64>, y: &Array1<f64>, h: f64) -> f64 {
297 let n = x.len();
298 let mut cv_sum = 0.0;
299
300 for i in 0..n {
301 let mut numerator = 0.0;
303 let mut denominator = 0.0;
304
305 for j in 0..n {
306 if i != j {
307 let u = (x[i] - x[j]) / h;
308 let k = self.kernel.evaluate(u);
309 numerator += k * y[j];
310 denominator += k;
311 }
312 }
313
314 if denominator > 1e-10 {
315 let y_pred = numerator / denominator;
316 cv_sum += (y[i] - y_pred).powi(2);
317 } else {
318 let y_mean = mean(y).unwrap_or(0.0);
320 cv_sum += (y[i] - y_mean).powi(2);
321 }
322 }
323
324 cv_sum / n as f64
325 }
326
327 fn estimate_df(&self, x: &Array1<f64>, h: f64) -> f64 {
329 let n = x.len();
330 let mut trace = 0.0;
331
332 for i in 0..n {
334 let mut weight_sum = 0.0;
335 for j in 0..n {
336 let u = (x[i] - x[j]) / h;
337 weight_sum += self.kernel.evaluate(u);
338 }
339 if weight_sum > 0.0 {
340 trace += self.kernel.evaluate(0.0) / weight_sum;
341 }
342 }
343
344 trace
345 }
346}
347
/// Output of a fitted local polynomial (LOESS-style) regression.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalRegressionResults {
    /// Smoothed values, in the same order as the caller's input `x`.
    pub fitted_values: Array1<f64>,
    /// Sorted copy of the input `x` at which the fit was evaluated.
    pub evaluation_points: Array1<f64>,
    /// Polynomial degree used in each local fit.
    pub degree: usize,
    /// Span (fraction of observations per local neighbourhood) used.
    pub span: f64,
    /// Residual sum of squares.
    pub rss: f64,
}
362
/// LOESS-style local polynomial regression smoother.
pub struct LocalRegression {
    // Polynomial degree of each local fit (capped at 2 by the setter).
    degree: usize,
    // Fraction of observations used in each local neighbourhood.
    span: f64,
    // Kernel for distance-based weighting within a neighbourhood.
    kernel: Kernel,
    // Whether to apply Tukey-bisquare robustness reweighting.
    robust: bool,
    // Number of fitting passes used for robustness iterations.
    iterations: usize,
}
371
372impl Default for LocalRegression {
373 fn default() -> Self {
374 Self {
375 degree: 1,
376 span: 0.75,
377 kernel: Kernel::Triweight, robust: false,
379 iterations: 4,
380 }
381 }
382}
383
384impl LocalRegression {
385 pub fn new() -> Self {
387 Self::default()
388 }
389
390 pub fn degree(mut self, degree: usize) -> Self {
392 self.degree = degree.min(2); self
394 }
395
396 pub fn span(mut self, span: f64) -> Self {
398 self.span = span.clamp(0.1, 1.0);
399 self
400 }
401
402 pub fn kernel(mut self, kernel: Kernel) -> Self {
404 self.kernel = kernel;
405 self
406 }
407
408 pub fn robust(mut self, robust: bool) -> Self {
410 self.robust = robust;
411 self
412 }
413
414 pub fn iterations(mut self, iterations: usize) -> Self {
416 self.iterations = iterations.max(1);
417 self
418 }
419
420 pub fn fit(&self, x: &Array1<f64>, y: &Array1<f64>) -> Result<LocalRegressionResults> {
422 let n = x.len();
423
424 if n != y.len() {
425 return Err(Error::DataError(
426 "x and y must have the same length".to_string(),
427 ));
428 }
429
430 if n < 3 {
431 return Err(Error::DataError(
432 "Need at least 3 observations for local regression".to_string(),
433 ));
434 }
435
436 let k = (self.span * n as f64).ceil() as usize;
438 let k = k.max(3).min(n);
439
440 let mut indices: Vec<usize> = (0..n).collect();
442 indices.sort_by(|&i, &j| x[i].partial_cmp(&x[j]).unwrap());
443
444 let x_sorted: Array1<f64> = indices.iter().map(|&i| x[i]).collect();
445 let y_sorted: Array1<f64> = indices.iter().map(|&i| y[i]).collect();
446
447 let mut fitted = Array1::zeros(n);
448 let mut robustness_weights = Array1::ones(n);
449
450 for iter in 0..self.iterations {
452 for i in 0..n {
453 let x0 = x_sorted[i];
454
455 let mut distances: Vec<(f64, usize)> =
457 (0..n).map(|j| ((x_sorted[j] - x0).abs(), j)).collect();
458
459 distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
460
461 let neighbor_indices: Vec<usize> =
462 distances[..k].iter().map(|&(_, idx)| idx).collect();
463
464 let max_dist = distances[k - 1].0;
466 let mut weights = Array1::zeros(k);
467
468 for (w_idx, &n_idx) in neighbor_indices.iter().enumerate() {
469 let dist = distances[w_idx].0;
470 let u = dist / max_dist; let kernel_weight = self.kernel.evaluate(u);
472 let robust_weight = robustness_weights[n_idx];
473 weights[w_idx] = kernel_weight * robust_weight;
474 }
475
476 let X_local = self
478 .build_design_matrix(&x_sorted.select(ndarray::Axis(0), &neighbor_indices), x0);
479 let y_local = y_sorted.select(ndarray::Axis(0), &neighbor_indices);
480
481 let W_sqrt = weights.mapv(|w: f64| w.sqrt());
483 let X_weighted = &X_local * &W_sqrt.clone().insert_axis(ndarray::Axis(1));
484 let y_weighted = &y_local * &W_sqrt;
485
486 if let Ok(beta) = solve(
487 &X_weighted.t().dot(&X_weighted),
488 &X_weighted.t().dot(&y_weighted),
489 ) {
490 fitted[i] = beta[0];
492 } else {
493 let weight_sum: f64 = weights.iter().sum();
495 if weight_sum > 0.0 {
496 fitted[i] = weights
497 .iter()
498 .zip(y_local.iter())
499 .map(|(&w, &y_val)| w * y_val)
500 .sum::<f64>()
501 / weight_sum;
502 } else {
503 fitted[i] = mean(&y_local).unwrap_or(0.0);
504 }
505 }
506 }
507
508 if self.robust && iter < self.iterations - 1 {
510 let residuals = &y_sorted - &fitted;
511 let mad = self.mad(&residuals);
512 let scale = mad / 0.6745;
513
514 if scale > 1e-10 {
515 for i in 0..n {
516 let u = residuals[i] / (6.0 * scale);
517 robustness_weights[i] = self.tukey_weight(u);
518 }
519 }
520 }
521 }
522
523 let mut fitted_original = Array1::zeros(n);
525 for (sorted_idx, &orig_idx) in indices.iter().enumerate() {
526 fitted_original[orig_idx] = fitted[sorted_idx];
527 }
528
529 let residuals = y - &fitted_original;
530 let rss = residuals.dot(&residuals);
531
532 Ok(LocalRegressionResults {
533 fitted_values: fitted_original,
534 evaluation_points: x_sorted,
535 degree: self.degree,
536 span: self.span,
537 rss,
538 })
539 }
540
541 fn build_design_matrix(&self, x_local: &Array1<f64>, x0: f64) -> Array2<f64> {
543 let n_local = x_local.len();
544 let mut X = Array2::ones((n_local, self.degree + 1));
545
546 for i in 0..n_local {
547 let centered = x_local[i] - x0;
548 for d in 1..=self.degree {
549 X[(i, d)] = centered.powi(d as i32);
550 }
551 }
552
553 X
554 }
555
556 fn mad(&self, data: &Array1<f64>) -> f64 {
558 let med = median(data).unwrap_or(0.0);
559 let abs_dev: Array1<f64> = data.mapv(|x| (x - med).abs());
560 median(&abs_dev).unwrap_or(0.0)
561 }
562
563 fn tukey_weight(&self, u: f64) -> f64 {
565 if u.abs() <= 1.0 {
566 let t = 1.0 - u * u;
567 t * t
568 } else {
569 0.0
570 }
571 }
572}
573
/// Output of a fitted penalized regression spline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmoothingSplineResults {
    /// Smoothed values, in the same order as the caller's input `x`.
    pub fitted_values: Array1<f64>,
    /// Knot locations used for the truncated power basis.
    pub knots: Array1<f64>,
    /// Coefficients of the basis functions.
    pub coefficients: Array1<f64>,
    /// Smoothing parameter used (fixed by the caller or selected at fit time).
    pub lambda: f64,
    /// Effective degrees of freedom of the fit.
    pub df: f64,
    /// Generalized cross-validation score.
    pub gcv: f64,
}
590
/// Penalized regression spline smoother (builder-style configuration).
pub struct SmoothingSpline {
    // Fixed smoothing parameter, if supplied; mutually exclusive with `df`.
    lambda: Option<f64>,
    // Target effective degrees of freedom, if supplied.
    df: Option<f64>,
    // Explicit knot locations, if supplied.
    knots: Option<Vec<f64>>,
    // Number of equally spaced knots used when `knots` is None.
    n_knots: usize,
}
598
599impl Default for SmoothingSpline {
600 fn default() -> Self {
601 Self {
602 lambda: None,
603 df: None,
604 knots: None,
605 n_knots: 20,
606 }
607 }
608}
609
610impl SmoothingSpline {
611 pub fn new() -> Self {
613 Self::default()
614 }
615
616 pub fn lambda(mut self, lambda: f64) -> Self {
618 self.lambda = Some(lambda.max(0.0));
619 self.df = None; self
621 }
622
623 pub fn df(mut self, df: f64) -> Self {
625 self.df = Some(df.max(1.0));
626 self.lambda = None; self
628 }
629
630 pub fn knots(mut self, knots: Vec<f64>) -> Self {
632 self.knots = Some(knots);
633 self
634 }
635
636 pub fn n_knots(mut self, n_knots: usize) -> Self {
638 self.n_knots = n_knots.max(3);
639 self
640 }
641
642 pub fn fit(&self, x: &Array1<f64>, y: &Array1<f64>) -> Result<SmoothingSplineResults> {
644 let n = x.len();
645
646 if n != y.len() {
647 return Err(Error::DataError(
648 "x and y must have the same length".to_string(),
649 ));
650 }
651
652 if n < 3 {
653 return Err(Error::DataError(
654 "Need at least 3 observations for smoothing spline".to_string(),
655 ));
656 }
657
658 let mut indices: Vec<usize> = (0..n).collect();
660 indices.sort_by(|&i, &j| x[i].partial_cmp(&x[j]).unwrap());
661
662 let x_sorted: Array1<f64> = indices.iter().map(|&i| x[i]).collect();
663 let y_sorted: Array1<f64> = indices.iter().map(|&i| y[i]).collect();
664
665 let knots = match &self.knots {
667 Some(k) => Array1::from(k.clone()),
668 None => {
669 let min_x = x_sorted[0];
670 let max_x = x_sorted[n - 1];
671 let step = (max_x - min_x) / (self.n_knots as f64 - 1.0);
672 Array1::from_iter((0..self.n_knots).map(|i| min_x + i as f64 * step))
673 }
674 };
675
676 let basis = self.build_basis(&x_sorted, &knots);
678
679 let penalty = self.build_penalty(&knots);
681
682 let lambda = match (self.lambda, self.df) {
684 (Some(lambda), _) => lambda,
685 (None, Some(df_target)) => {
686 self.find_lambda_for_df(&basis, &penalty, df_target, n as f64)?
687 }
688 (None, None) => {
689 self.find_lambda_by_gcv(&basis, &penalty, &y_sorted)?
691 }
692 };
693
694 let XtX = basis.t().dot(&basis);
696 let XtX_penalized = &XtX + &(penalty * lambda);
697 let Xty = basis.t().dot(&y_sorted);
698
699 let coefficients = solve(&XtX_penalized, &Xty)
700 .map_err(|e| Error::LinearAlgebraError(format!("Spline solve failed: {}", e)))?;
701
702 let fitted = basis.dot(&coefficients);
703
704 let p = basis.shape()[1];
707 let df = p as f64; let _S = Array2::<f64>::eye(basis.shape()[0]); let residuals = &y_sorted - &fitted;
712 let rss = residuals.dot(&residuals);
713 let gcv = rss / ((1.0 - df / n as f64).powi(2) * n as f64);
714
715 let mut fitted_original = Array1::zeros(n);
717 for (sorted_idx, &orig_idx) in indices.iter().enumerate() {
718 fitted_original[orig_idx] = fitted[sorted_idx];
719 }
720
721 Ok(SmoothingSplineResults {
722 fitted_values: fitted_original,
723 knots,
724 coefficients,
725 lambda,
726 df,
727 gcv,
728 })
729 }
730
731 fn build_basis(&self, x: &Array1<f64>, knots: &Array1<f64>) -> Array2<f64> {
733 let n = x.len();
734 let n_knots = knots.len();
735 let n_basis = n_knots + 2; let mut basis = Array2::zeros((n, n_basis));
738
739 for i in 0..n {
740 let xi = x[i];
741
742 basis[(i, 0)] = 1.0;
744 basis[(i, 1)] = xi;
745
746 for (j, &knot) in knots.iter().enumerate() {
748 let diff = xi - knot;
749 basis[(i, j + 2)] = if diff > 0.0 { diff.powi(3) } else { 0.0 };
750 }
751 }
752
753 basis
754 }
755
756 fn build_penalty(&self, knots: &Array1<f64>) -> Array2<f64> {
758 let n_knots = knots.len();
759 let n_basis = n_knots + 2;
760
761 let mut penalty = Array2::zeros((n_basis, n_basis));
762
763 for i in 2..n_basis {
766 penalty[(i, i)] = 1.0;
767 }
768
769 penalty
770 }
771
772 fn find_lambda_for_df(
774 &self,
775 _basis: &Array2<f64>,
776 _penalty: &Array2<f64>,
777 _df_target: f64,
778 _n: f64,
779 ) -> Result<f64> {
780 Ok(1.0)
782 }
783
784 fn find_lambda_by_gcv(
786 &self,
787 _basis: &Array2<f64>,
788 _penalty: &Array2<f64>,
789 _y: &Array1<f64>,
790 ) -> Result<f64> {
791 Ok(1.0)
793 }
794}