1use crate::distributions::{
62 beta::Beta,
63 binomial_distribution::Binomial,
64 chi_squared::ChiSquared,
65 f_distribution::FDistribution,
66 gamma_distribution::Gamma,
67 geometric::Geometric,
68 lognormal::LogNormal,
69 negative_binomial::NegativeBinomial,
70 normal_distribution::Normal,
71 poisson_distribution::Poisson,
72 student_t::StudentT,
73 traits::{DiscreteDistribution, Distribution},
74 uniform_distribution::Uniform,
75 weibull::Weibull,
76};
77use crate::error::{StatsError, StatsResult};
78use rayon::prelude::*;
79
80#[derive(Debug, Clone, Copy, PartialEq, Eq)]
84pub enum DataKind {
85 Discrete,
87 Continuous,
89}
90
91pub fn detect_data_type(data: &[f64]) -> DataKind {
101 if data
102 .iter()
103 .all(|&x| x >= 0.0 && x.fract() == 0.0 && x.is_finite())
104 {
105 DataKind::Discrete
106 } else {
107 DataKind::Continuous
108 }
109}
110
111#[derive(Debug, Clone, Copy)]
115pub struct KsResult {
116 pub statistic: f64,
118 pub p_value: f64,
120}
121
122pub fn ks_test(data: &[f64], cdf: impl Fn(f64) -> f64) -> KsResult {
126 let mut buf = Vec::with_capacity(data.len());
127 buf.extend_from_slice(data);
128 ks_test_with_scratch(&mut buf, cdf)
129}
130
131pub fn ks_test_with_scratch(scratch: &mut [f64], cdf: impl Fn(f64) -> f64) -> KsResult {
139 let n = scratch.len();
140 if n == 0 {
141 return KsResult {
142 statistic: 0.0,
143 p_value: 1.0,
144 };
145 }
146 scratch.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
147
148 let nf = n as f64;
149 let mut d = 0.0_f64;
150 for (i, &x) in scratch.iter().enumerate() {
151 let f = cdf(x);
152 let upper = (i + 1) as f64 / nf;
153 let lower = i as f64 / nf;
154 d = d.max((upper - f).abs()).max((f - lower).abs());
155 }
156
157 let p_value = kolmogorov_p(((nf).sqrt() + 0.12 + 0.11 / nf.sqrt()) * d);
158
159 KsResult {
160 statistic: d,
161 p_value,
162 }
163}
164
165pub fn ks_test_discrete(data: &[f64], cdf: impl Fn(u64) -> f64) -> KsResult {
167 let mut buf = Vec::with_capacity(data.len());
168 buf.extend_from_slice(data);
169 ks_test_discrete_with_scratch(&mut buf, cdf)
170}
171
172pub fn ks_test_discrete_with_scratch(scratch: &mut [f64], cdf: impl Fn(u64) -> f64) -> KsResult {
174 let n = scratch.len();
175 if n == 0 {
176 return KsResult {
177 statistic: 0.0,
178 p_value: 1.0,
179 };
180 }
181 scratch.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
182
183 let nf = n as f64;
184 let mut d = 0.0_f64;
185 for (i, &x) in scratch.iter().enumerate() {
186 let k = x.round() as u64;
187 let f = cdf(k);
188 let upper = (i + 1) as f64 / nf;
189 let lower = i as f64 / nf;
190 d = d.max((upper - f).abs()).max((f - lower).abs());
191 }
192
193 let p_value = kolmogorov_p(((nf).sqrt() + 0.12 + 0.11 / nf.sqrt()) * d);
194
195 KsResult {
196 statistic: d,
197 p_value,
198 }
199}
200
201fn kolmogorov_p(x: f64) -> f64 {
203 if x <= 0.0 {
204 return 1.0;
205 }
206 let mut sum = 0.0_f64;
208 for j in 1_u32..=100 {
209 let term = (-(2.0 * (j as f64).powi(2) * x * x)).exp();
210 if j % 2 == 1 {
211 sum += term;
212 } else {
213 sum -= term;
214 }
215 if term < 1e-15 {
216 break;
217 }
218 }
219 (2.0 * sum).clamp(0.0, 1.0)
220}
221
222#[derive(Debug, Clone)]
226pub struct FitResult {
227 pub name: String,
229 pub aic: f64,
231 pub bic: f64,
233 pub ks_statistic: f64,
235 pub ks_p_value: f64,
237}
238
239fn try_fit_continuous<D: Distribution<X = f64>>(data: &[f64], dist: D) -> Option<FitResult> {
245 let aic = dist.aic(data).ok().filter(|x| x.is_finite())?;
246 let bic = dist.bic(data).ok().filter(|x| x.is_finite())?;
247 let ks = ks_test(data, |x| dist.cdf(x).unwrap_or(0.0));
248 Some(FitResult {
249 name: dist.name().to_string(),
250 aic,
251 bic,
252 ks_statistic: ks.statistic,
253 ks_p_value: ks.p_value,
254 })
255}
256
257type ContinuousFitter = fn(&[f64]) -> Option<FitResult>;
261
262const CONTINUOUS_FITTERS: &[ContinuousFitter] = &[
263 |d| Normal::fit(d).ok().and_then(|x| try_fit_continuous(d, x)),
264 |d| {
265 crate::distributions::exponential_distribution::Exponential::fit(d)
266 .ok()
267 .and_then(|x| try_fit_continuous(d, x))
268 },
269 |d| Uniform::fit(d).ok().and_then(|x| try_fit_continuous(d, x)),
270 |d| Gamma::fit(d).ok().and_then(|x| try_fit_continuous(d, x)),
271 |d| {
272 LogNormal::fit(d)
273 .ok()
274 .and_then(|x| try_fit_continuous(d, x))
275 },
276 |d| Weibull::fit(d).ok().and_then(|x| try_fit_continuous(d, x)),
277 |d| Beta::fit(d).ok().and_then(|x| try_fit_continuous(d, x)),
278 |d| StudentT::fit(d).ok().and_then(|x| try_fit_continuous(d, x)),
279 |d| {
280 FDistribution::fit(d)
281 .ok()
282 .and_then(|x| try_fit_continuous(d, x))
283 },
284 |d| {
285 ChiSquared::fit(d)
286 .ok()
287 .and_then(|x| try_fit_continuous(d, x))
288 },
289];
290
291pub fn fit_all(data: &[f64]) -> StatsResult<Vec<FitResult>> {
297 if data.is_empty() {
298 return Err(StatsError::InvalidInput {
299 message: "fit_all: data must not be empty".to_string(),
300 });
301 }
302
303 let mut results: Vec<FitResult> = CONTINUOUS_FITTERS
304 .par_iter()
305 .filter_map(|fit| fit(data))
306 .collect();
307
308 if results.is_empty() {
309 return Err(StatsError::InvalidInput {
310 message: "fit_all: no distribution could be fitted to the data".to_string(),
311 });
312 }
313
314 results.sort_by(|a, b| {
315 a.aic
316 .partial_cmp(&b.aic)
317 .unwrap_or(std::cmp::Ordering::Equal)
318 });
319 Ok(results)
320}
321
322pub fn fit_best(data: &[f64]) -> StatsResult<FitResult> {
324 let mut all = fit_all(data)?;
325 Ok(all.remove(0))
326}
327
328#[derive(Debug, Clone)]
334pub struct SkippedFit {
335 pub name: &'static str,
337 pub reason: String,
339}
340
341fn try_fit_verbose<D: Distribution<X = f64>>(
358 name: &'static str,
359 data: &[f64],
360 fit_res: StatsResult<D>,
361) -> Result<FitResult, SkippedFit> {
362 let dist = fit_res.map_err(|e| SkippedFit {
363 name,
364 reason: format!("fit failed: {e}"),
365 })?;
366 let aic = dist
367 .aic(data)
368 .ok()
369 .filter(|x| x.is_finite())
370 .ok_or_else(|| SkippedFit {
371 name,
372 reason: "non-finite AIC (log-likelihood diverged)".to_string(),
373 })?;
374 let bic = dist
375 .bic(data)
376 .ok()
377 .filter(|x| x.is_finite())
378 .ok_or_else(|| SkippedFit {
379 name,
380 reason: "non-finite BIC".to_string(),
381 })?;
382 let ks = ks_test(data, |x| dist.cdf(x).unwrap_or(0.0));
383 Ok(FitResult {
384 name: dist.name().to_string(),
385 aic,
386 bic,
387 ks_statistic: ks.statistic,
388 ks_p_value: ks.p_value,
389 })
390}
391
392type ContinuousVerboseFitter = fn(&[f64]) -> Result<FitResult, SkippedFit>;
393
394const CONTINUOUS_VERBOSE_FITTERS: &[ContinuousVerboseFitter] = &[
395 |d| try_fit_verbose("Normal", d, Normal::fit(d)),
396 |d| {
397 try_fit_verbose(
398 "Exponential",
399 d,
400 crate::distributions::exponential_distribution::Exponential::fit(d),
401 )
402 },
403 |d| try_fit_verbose("Uniform", d, Uniform::fit(d)),
404 |d| try_fit_verbose("Gamma", d, Gamma::fit(d)),
405 |d| try_fit_verbose("LogNormal", d, LogNormal::fit(d)),
406 |d| try_fit_verbose("Weibull", d, Weibull::fit(d)),
407 |d| try_fit_verbose("Beta", d, Beta::fit(d)),
408 |d| try_fit_verbose("StudentT", d, StudentT::fit(d)),
409 |d| try_fit_verbose("FDistribution", d, FDistribution::fit(d)),
410 |d| try_fit_verbose("ChiSquared", d, ChiSquared::fit(d)),
411];
412
413pub fn fit_all_verbose(data: &[f64]) -> StatsResult<(Vec<FitResult>, Vec<SkippedFit>)> {
414 if data.is_empty() {
415 return Err(StatsError::InvalidInput {
416 message: "fit_all_verbose: data must not be empty".to_string(),
417 });
418 }
419
420 let outcomes: Vec<Result<FitResult, SkippedFit>> = CONTINUOUS_VERBOSE_FITTERS
421 .par_iter()
422 .map(|f| f(data))
423 .collect();
424
425 let (mut results, skipped): (Vec<FitResult>, Vec<SkippedFit>) =
426 outcomes
427 .into_iter()
428 .fold((Vec::new(), Vec::new()), |(mut ok, mut err), o| {
429 match o {
430 Ok(r) => ok.push(r),
431 Err(s) => err.push(s),
432 }
433 (ok, err)
434 });
435
436 if results.is_empty() {
437 return Err(StatsError::InvalidInput {
438 message: "fit_all_verbose: no distribution could be fitted to the data".to_string(),
439 });
440 }
441
442 results.sort_by(|a, b| {
443 a.aic
444 .partial_cmp(&b.aic)
445 .unwrap_or(std::cmp::Ordering::Equal)
446 });
447 Ok((results, skipped))
448}
449
450fn try_fit_discrete<D: DiscreteDistribution>(
453 data: &[f64],
454 int_data: &[u64],
455 dist: D,
456) -> Option<FitResult> {
457 let aic = dist.aic(int_data).ok().filter(|x| x.is_finite())?;
458 let bic = dist.bic(int_data).ok().filter(|x| x.is_finite())?;
459 let ks = ks_test_discrete(data, |k| dist.cdf(k).unwrap_or(0.0));
460 Some(FitResult {
461 name: dist.name().to_string(),
462 aic,
463 bic,
464 ks_statistic: ks.statistic,
465 ks_p_value: ks.p_value,
466 })
467}
468
469fn try_fit_discrete_verbose<D: DiscreteDistribution>(
470 name: &'static str,
471 data: &[f64],
472 int_data: &[u64],
473 fit_res: StatsResult<D>,
474) -> Result<FitResult, SkippedFit> {
475 let dist = fit_res.map_err(|e| SkippedFit {
476 name,
477 reason: format!("fit failed: {e}"),
478 })?;
479 let aic = dist
480 .aic(int_data)
481 .ok()
482 .filter(|x| x.is_finite())
483 .ok_or_else(|| SkippedFit {
484 name,
485 reason: "non-finite AIC".to_string(),
486 })?;
487 let bic = dist
488 .bic(int_data)
489 .ok()
490 .filter(|x| x.is_finite())
491 .ok_or_else(|| SkippedFit {
492 name,
493 reason: "non-finite BIC".to_string(),
494 })?;
495 let ks = ks_test_discrete(data, |k| dist.cdf(k).unwrap_or(0.0));
496 Ok(FitResult {
497 name: dist.name().to_string(),
498 aic,
499 bic,
500 ks_statistic: ks.statistic,
501 ks_p_value: ks.p_value,
502 })
503}
504
505type DiscreteFitter = fn(&[f64], &[u64]) -> Option<FitResult>;
506type DiscreteVerboseFitter = fn(&[f64], &[u64]) -> Result<FitResult, SkippedFit>;
507
508const DISCRETE_FITTERS: &[DiscreteFitter] = &[
509 |d, i| Poisson::fit(d).ok().and_then(|x| try_fit_discrete(d, i, x)),
510 |d, i| {
511 Geometric::fit(d)
512 .ok()
513 .and_then(|x| try_fit_discrete(d, i, x))
514 },
515 |d, i| {
516 NegativeBinomial::fit(d)
517 .ok()
518 .and_then(|x| try_fit_discrete(d, i, x))
519 },
520 |d, i| {
521 Binomial::fit(d)
522 .ok()
523 .and_then(|x| try_fit_discrete(d, i, x))
524 },
525];
526
527const DISCRETE_VERBOSE_FITTERS: &[DiscreteVerboseFitter] = &[
528 |d, i| try_fit_discrete_verbose("Poisson", d, i, Poisson::fit(d)),
529 |d, i| try_fit_discrete_verbose("Geometric", d, i, Geometric::fit(d)),
530 |d, i| try_fit_discrete_verbose("NegativeBinomial", d, i, NegativeBinomial::fit(d)),
531 |d, i| try_fit_discrete_verbose("Binomial", d, i, Binomial::fit(d)),
532];
533
534pub fn fit_all_discrete_verbose(data: &[f64]) -> StatsResult<(Vec<FitResult>, Vec<SkippedFit>)> {
536 if data.is_empty() {
537 return Err(StatsError::InvalidInput {
538 message: "fit_all_discrete_verbose: data must not be empty".to_string(),
539 });
540 }
541
542 let int_data: Vec<u64> = data.iter().map(|&x| x.round() as u64).collect();
543
544 let outcomes: Vec<Result<FitResult, SkippedFit>> = DISCRETE_VERBOSE_FITTERS
545 .par_iter()
546 .map(|f| f(data, &int_data))
547 .collect();
548
549 let (mut results, skipped): (Vec<FitResult>, Vec<SkippedFit>) =
550 outcomes
551 .into_iter()
552 .fold((Vec::new(), Vec::new()), |(mut ok, mut err), o| {
553 match o {
554 Ok(r) => ok.push(r),
555 Err(s) => err.push(s),
556 }
557 (ok, err)
558 });
559
560 if results.is_empty() {
561 return Err(StatsError::InvalidInput {
562 message: "fit_all_discrete_verbose: no distribution could be fitted".to_string(),
563 });
564 }
565
566 results.sort_by(|a, b| {
567 a.aic
568 .partial_cmp(&b.aic)
569 .unwrap_or(std::cmp::Ordering::Equal)
570 });
571 Ok((results, skipped))
572}
573
574pub fn fit_all_discrete(data: &[f64]) -> StatsResult<Vec<FitResult>> {
581 if data.is_empty() {
582 return Err(StatsError::InvalidInput {
583 message: "fit_all_discrete: data must not be empty".to_string(),
584 });
585 }
586
587 let int_data: Vec<u64> = data.iter().map(|&x| x.round() as u64).collect();
588
589 let mut results: Vec<FitResult> = DISCRETE_FITTERS
590 .par_iter()
591 .filter_map(|f| f(data, &int_data))
592 .collect();
593
594 if results.is_empty() {
595 return Err(StatsError::InvalidInput {
596 message: "fit_all_discrete: no distribution could be fitted to the data".to_string(),
597 });
598 }
599
600 results.sort_by(|a, b| {
601 a.aic
602 .partial_cmp(&b.aic)
603 .unwrap_or(std::cmp::Ordering::Equal)
604 });
605 Ok(results)
606}
607
608pub fn fit_best_discrete(data: &[f64]) -> StatsResult<FitResult> {
610 let mut all = fit_all_discrete(data)?;
611 Ok(all.remove(0))
612}
613
614pub fn auto_fit(data: &[f64]) -> StatsResult<FitResult> {
628 match detect_data_type(data) {
629 DataKind::Discrete => fit_best_discrete(data),
630 DataKind::Continuous => fit_best(data),
631 }
632}
633
634#[cfg(test)]
635mod tests {
636 use super::*;
637
638 #[test]
639 fn test_detect_data_type_discrete() {
640 assert_eq!(detect_data_type(&[0.0, 1.0, 2.0, 3.0]), DataKind::Discrete);
641 assert_eq!(detect_data_type(&[0.0, 0.0, 1.0]), DataKind::Discrete);
642 }
643
644 #[test]
645 fn test_detect_data_type_continuous() {
646 assert_eq!(detect_data_type(&[0.5, 1.5, 2.3]), DataKind::Continuous);
647 assert_eq!(detect_data_type(&[-1.0, 0.0, 1.0]), DataKind::Continuous);
648 assert_eq!(detect_data_type(&[1.0, 2.5, 3.0]), DataKind::Continuous);
649 }
650
651 #[test]
652 fn test_ks_test_uniform() {
653 let data: Vec<f64> = (0..20).map(|i| i as f64 / 20.0).collect();
655 let ks = ks_test(&data, |x| x.clamp(0.0, 1.0));
656 assert!(ks.statistic < 0.15);
657 }
658
659 #[test]
660 fn test_fit_all_returns_results() {
661 let data: Vec<f64> = (0..50).map(|i| (i as f64) * 0.1 + 0.5).collect();
662 let results = fit_all(&data).unwrap();
663 assert!(!results.is_empty());
664 for i in 1..results.len() {
666 assert!(results[i].aic >= results[i - 1].aic);
667 }
668 }
669
670 #[test]
671 fn test_fit_best_normal_data() {
672 let data = vec![
674 4.1, 5.2, 5.8, 4.7, 5.3, 4.9, 6.1, 4.5, 5.5, 5.0, 4.8, 5.1, 4.3, 5.7, 4.6, 5.4, 4.2,
675 5.9, 5.2, 4.4,
676 ];
677 let best = fit_best(&data).unwrap();
678 assert!(best.aic.is_finite());
680 }
681
682 #[test]
683 fn test_fit_all_discrete() {
684 let data = vec![0.0, 1.0, 2.0, 3.0, 1.0, 0.0, 2.0, 1.0, 0.0, 4.0];
685 let results = fit_all_discrete(&data).unwrap();
686 assert!(!results.is_empty());
687 }
688
689 #[test]
690 fn test_auto_fit_continuous() {
691 let data = vec![1.5, 2.3, 1.8, 2.1, 2.7, 1.9, 2.4, 2.0];
692 let best = auto_fit(&data).unwrap();
693 assert!(!best.name.is_empty());
694 }
695
696 #[test]
697 fn test_auto_fit_discrete() {
698 let data = vec![0.0, 1.0, 2.0, 1.0, 0.0, 3.0, 1.0, 2.0];
699 let best = auto_fit(&data).unwrap();
700 assert!(!best.name.is_empty());
701 }
702
703 #[test]
704 fn test_fit_all_empty_data() {
705 assert!(fit_all(&[]).is_err());
706 }
707}