1use crate::distributions::{
62 beta::Beta,
63 binomial_distribution::Binomial,
64 chi_squared::ChiSquared,
65 f_distribution::FDistribution,
66 gamma_distribution::Gamma,
67 geometric::Geometric,
68 lognormal::LogNormal,
69 negative_binomial::NegativeBinomial,
70 normal_distribution::Normal,
71 poisson_distribution::Poisson,
72 student_t::StudentT,
73 traits::{DiscreteDistribution, Distribution},
74 uniform_distribution::Uniform,
75 weibull::Weibull,
76};
77use crate::error::{StatsError, StatsResult};
78
79#[derive(Debug, Clone, Copy, PartialEq, Eq)]
83pub enum DataKind {
84 Discrete,
86 Continuous,
88}
89
90pub fn detect_data_type(data: &[f64]) -> DataKind {
100 if data
101 .iter()
102 .all(|&x| x >= 0.0 && x.fract() == 0.0 && x.is_finite())
103 {
104 DataKind::Discrete
105 } else {
106 DataKind::Continuous
107 }
108}
109
110#[derive(Debug, Clone, Copy)]
114pub struct KsResult {
115 pub statistic: f64,
117 pub p_value: f64,
119}
120
121pub fn ks_test(data: &[f64], cdf: impl Fn(f64) -> f64) -> KsResult {
125 let mut buf = Vec::with_capacity(data.len());
126 buf.extend_from_slice(data);
127 ks_test_with_scratch(&mut buf, cdf)
128}
129
130pub fn ks_test_with_scratch(scratch: &mut [f64], cdf: impl Fn(f64) -> f64) -> KsResult {
138 let n = scratch.len();
139 if n == 0 {
140 return KsResult {
141 statistic: 0.0,
142 p_value: 1.0,
143 };
144 }
145 scratch.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
146
147 let nf = n as f64;
148 let mut d = 0.0_f64;
149 for (i, &x) in scratch.iter().enumerate() {
150 let f = cdf(x);
151 let upper = (i + 1) as f64 / nf;
152 let lower = i as f64 / nf;
153 d = d.max((upper - f).abs()).max((f - lower).abs());
154 }
155
156 let p_value = kolmogorov_p(((nf).sqrt() + 0.12 + 0.11 / nf.sqrt()) * d);
157
158 KsResult {
159 statistic: d,
160 p_value,
161 }
162}
163
164pub fn ks_test_discrete(data: &[f64], cdf: impl Fn(u64) -> f64) -> KsResult {
166 let mut buf = Vec::with_capacity(data.len());
167 buf.extend_from_slice(data);
168 ks_test_discrete_with_scratch(&mut buf, cdf)
169}
170
171pub fn ks_test_discrete_with_scratch(scratch: &mut [f64], cdf: impl Fn(u64) -> f64) -> KsResult {
173 let n = scratch.len();
174 if n == 0 {
175 return KsResult {
176 statistic: 0.0,
177 p_value: 1.0,
178 };
179 }
180 scratch.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
181
182 let nf = n as f64;
183 let mut d = 0.0_f64;
184 for (i, &x) in scratch.iter().enumerate() {
185 let k = x.round() as u64;
186 let f = cdf(k);
187 let upper = (i + 1) as f64 / nf;
188 let lower = i as f64 / nf;
189 d = d.max((upper - f).abs()).max((f - lower).abs());
190 }
191
192 let p_value = kolmogorov_p(((nf).sqrt() + 0.12 + 0.11 / nf.sqrt()) * d);
193
194 KsResult {
195 statistic: d,
196 p_value,
197 }
198}
199
200fn kolmogorov_p(x: f64) -> f64 {
202 if x <= 0.0 {
203 return 1.0;
204 }
205 let mut sum = 0.0_f64;
207 for j in 1_u32..=100 {
208 let term = (-(2.0 * (j as f64).powi(2) * x * x)).exp();
209 if j % 2 == 1 {
210 sum += term;
211 } else {
212 sum -= term;
213 }
214 if term < 1e-15 {
215 break;
216 }
217 }
218 (2.0 * sum).clamp(0.0, 1.0)
219}
220
221#[derive(Debug, Clone)]
225pub struct FitResult {
226 pub name: String,
228 pub aic: f64,
230 pub bic: f64,
232 pub ks_statistic: f64,
234 pub ks_p_value: f64,
236}
237
238pub fn fit_all(data: &[f64]) -> StatsResult<Vec<FitResult>> {
244 if data.is_empty() {
245 return Err(StatsError::InvalidInput {
246 message: "fit_all: data must not be empty".to_string(),
247 });
248 }
249
250 let mut ks_buf: Vec<f64> = Vec::with_capacity(data.len());
252 let mut results: Vec<FitResult> = Vec::with_capacity(10);
253
254 macro_rules! try_fit {
255 ($dist_type:ty, $fit_expr:expr) => {
256 if let Ok(dist) = $fit_expr {
257 if let (Ok(aic), Ok(bic)) = (dist.aic(data), dist.bic(data)) {
258 if aic.is_finite() && bic.is_finite() {
259 ks_buf.clear();
260 ks_buf.extend_from_slice(data);
261 let ks = ks_test_with_scratch(&mut ks_buf, |x| dist.cdf(x).unwrap_or(0.0));
262 results.push(FitResult {
263 name: dist.name().to_string(),
264 aic,
265 bic,
266 ks_statistic: ks.statistic,
267 ks_p_value: ks.p_value,
268 });
269 }
270 }
271 }
272 };
273 }
274
275 try_fit!(Normal, Normal::fit(data));
276 try_fit!(
277 Exponential,
278 crate::distributions::exponential_distribution::Exponential::fit(data)
279 );
280 try_fit!(Uniform, Uniform::fit(data));
281 try_fit!(Gamma, Gamma::fit(data));
282 try_fit!(LogNormal, LogNormal::fit(data));
283 try_fit!(Weibull, Weibull::fit(data));
284 try_fit!(Beta, Beta::fit(data));
285 try_fit!(StudentT, StudentT::fit(data));
286 try_fit!(FDistribution, FDistribution::fit(data));
287 try_fit!(ChiSquared, ChiSquared::fit(data));
288
289 if results.is_empty() {
290 return Err(StatsError::InvalidInput {
291 message: "fit_all: no distribution could be fitted to the data".to_string(),
292 });
293 }
294
295 results.sort_by(|a, b| {
296 a.aic
297 .partial_cmp(&b.aic)
298 .unwrap_or(std::cmp::Ordering::Equal)
299 });
300 Ok(results)
301}
302
303pub fn fit_best(data: &[f64]) -> StatsResult<FitResult> {
305 let mut all = fit_all(data)?;
306 Ok(all.remove(0))
307}
308
309#[derive(Debug, Clone)]
315pub struct SkippedFit {
316 pub name: &'static str,
318 pub reason: String,
320}
321
322pub fn fit_all_verbose(data: &[f64]) -> StatsResult<(Vec<FitResult>, Vec<SkippedFit>)> {
337 if data.is_empty() {
338 return Err(StatsError::InvalidInput {
339 message: "fit_all_verbose: data must not be empty".to_string(),
340 });
341 }
342
343 let mut ks_buf: Vec<f64> = Vec::with_capacity(data.len());
344 let mut results: Vec<FitResult> = Vec::with_capacity(10);
345 let mut skipped: Vec<SkippedFit> = Vec::with_capacity(10);
346
347 macro_rules! try_fit_v {
348 ($name:literal, $fit_expr:expr) => {
349 match $fit_expr {
350 Err(e) => skipped.push(SkippedFit {
351 name: $name,
352 reason: format!("fit failed: {e}"),
353 }),
354 Ok(dist) => match (dist.aic(data), dist.bic(data)) {
355 (Ok(aic), Ok(bic)) if aic.is_finite() && bic.is_finite() => {
356 ks_buf.clear();
357 ks_buf.extend_from_slice(data);
358 let ks = ks_test_with_scratch(&mut ks_buf, |x| dist.cdf(x).unwrap_or(0.0));
359 results.push(FitResult {
360 name: dist.name().to_string(),
361 aic,
362 bic,
363 ks_statistic: ks.statistic,
364 ks_p_value: ks.p_value,
365 });
366 }
367 _ => skipped.push(SkippedFit {
368 name: $name,
369 reason: "non-finite AIC/BIC (log-likelihood diverged)".to_string(),
370 }),
371 },
372 }
373 };
374 }
375
376 try_fit_v!("Normal", Normal::fit(data));
377 try_fit_v!(
378 "Exponential",
379 crate::distributions::exponential_distribution::Exponential::fit(data)
380 );
381 try_fit_v!("Uniform", Uniform::fit(data));
382 try_fit_v!("Gamma", Gamma::fit(data));
383 try_fit_v!("LogNormal", LogNormal::fit(data));
384 try_fit_v!("Weibull", Weibull::fit(data));
385 try_fit_v!("Beta", Beta::fit(data));
386 try_fit_v!("StudentT", StudentT::fit(data));
387 try_fit_v!("FDistribution", FDistribution::fit(data));
388 try_fit_v!("ChiSquared", ChiSquared::fit(data));
389
390 if results.is_empty() {
391 return Err(StatsError::InvalidInput {
392 message: "fit_all_verbose: no distribution could be fitted to the data".to_string(),
393 });
394 }
395
396 results.sort_by(|a, b| {
397 a.aic
398 .partial_cmp(&b.aic)
399 .unwrap_or(std::cmp::Ordering::Equal)
400 });
401 Ok((results, skipped))
402}
403
404pub fn fit_all_discrete_verbose(data: &[f64]) -> StatsResult<(Vec<FitResult>, Vec<SkippedFit>)> {
406 if data.is_empty() {
407 return Err(StatsError::InvalidInput {
408 message: "fit_all_discrete_verbose: data must not be empty".to_string(),
409 });
410 }
411
412 let mut int_data: Vec<u64> = Vec::with_capacity(data.len());
413 int_data.extend(data.iter().map(|&x| x.round() as u64));
414 let mut ks_buf: Vec<f64> = Vec::with_capacity(data.len());
415 let mut results: Vec<FitResult> = Vec::with_capacity(4);
416 let mut skipped: Vec<SkippedFit> = Vec::with_capacity(4);
417
418 macro_rules! try_fit_disc_v {
419 ($name:literal, $fit_expr:expr) => {
420 match $fit_expr {
421 Err(e) => skipped.push(SkippedFit {
422 name: $name,
423 reason: format!("fit failed: {e}"),
424 }),
425 Ok(dist) => match (dist.aic(&int_data), dist.bic(&int_data)) {
426 (Ok(aic), Ok(bic)) if aic.is_finite() && bic.is_finite() => {
427 ks_buf.clear();
428 ks_buf.extend_from_slice(data);
429 let ks = ks_test_discrete_with_scratch(&mut ks_buf, |k| {
430 dist.cdf(k).unwrap_or(0.0)
431 });
432 results.push(FitResult {
433 name: dist.name().to_string(),
434 aic,
435 bic,
436 ks_statistic: ks.statistic,
437 ks_p_value: ks.p_value,
438 });
439 }
440 _ => skipped.push(SkippedFit {
441 name: $name,
442 reason: "non-finite AIC/BIC (log-likelihood diverged)".to_string(),
443 }),
444 },
445 }
446 };
447 }
448
449 try_fit_disc_v!("Poisson", Poisson::fit(data));
450 try_fit_disc_v!("Geometric", Geometric::fit(data));
451 try_fit_disc_v!("NegativeBinomial", NegativeBinomial::fit(data));
452 try_fit_disc_v!("Binomial", Binomial::fit(data));
453
454 if results.is_empty() {
455 return Err(StatsError::InvalidInput {
456 message: "fit_all_discrete_verbose: no distribution could be fitted".to_string(),
457 });
458 }
459
460 results.sort_by(|a, b| {
461 a.aic
462 .partial_cmp(&b.aic)
463 .unwrap_or(std::cmp::Ordering::Equal)
464 });
465 Ok((results, skipped))
466}
467
468pub fn fit_all_discrete(data: &[f64]) -> StatsResult<Vec<FitResult>> {
474 if data.is_empty() {
475 return Err(StatsError::InvalidInput {
476 message: "fit_all_discrete: data must not be empty".to_string(),
477 });
478 }
479
480 let mut int_data: Vec<u64> = Vec::with_capacity(data.len());
482 int_data.extend(data.iter().map(|&x| x.round() as u64));
483 let mut ks_buf: Vec<f64> = Vec::with_capacity(data.len());
484 let mut results: Vec<FitResult> = Vec::with_capacity(4);
485
486 macro_rules! try_fit_disc {
487 ($fit_expr:expr) => {
488 if let Ok(dist) = $fit_expr {
489 if let (Ok(aic), Ok(bic)) = (dist.aic(&int_data), dist.bic(&int_data)) {
490 if aic.is_finite() && bic.is_finite() {
491 ks_buf.clear();
492 ks_buf.extend_from_slice(data);
493 let ks = ks_test_discrete_with_scratch(&mut ks_buf, |k| {
494 dist.cdf(k).unwrap_or(0.0)
495 });
496 results.push(FitResult {
497 name: dist.name().to_string(),
498 aic,
499 bic,
500 ks_statistic: ks.statistic,
501 ks_p_value: ks.p_value,
502 });
503 }
504 }
505 }
506 };
507 }
508
509 try_fit_disc!(Poisson::fit(data));
510 try_fit_disc!(Geometric::fit(data));
511 try_fit_disc!(NegativeBinomial::fit(data));
512 try_fit_disc!(Binomial::fit(data));
513
514 if results.is_empty() {
515 return Err(StatsError::InvalidInput {
516 message: "fit_all_discrete: no distribution could be fitted to the data".to_string(),
517 });
518 }
519
520 results.sort_by(|a, b| {
521 a.aic
522 .partial_cmp(&b.aic)
523 .unwrap_or(std::cmp::Ordering::Equal)
524 });
525 Ok(results)
526}
527
528pub fn fit_best_discrete(data: &[f64]) -> StatsResult<FitResult> {
530 let mut all = fit_all_discrete(data)?;
531 Ok(all.remove(0))
532}
533
534pub fn auto_fit(data: &[f64]) -> StatsResult<FitResult> {
548 match detect_data_type(data) {
549 DataKind::Discrete => fit_best_discrete(data),
550 DataKind::Continuous => fit_best(data),
551 }
552}
553
554#[cfg(test)]
555mod tests {
556 use super::*;
557
558 #[test]
559 fn test_detect_data_type_discrete() {
560 assert_eq!(detect_data_type(&[0.0, 1.0, 2.0, 3.0]), DataKind::Discrete);
561 assert_eq!(detect_data_type(&[0.0, 0.0, 1.0]), DataKind::Discrete);
562 }
563
564 #[test]
565 fn test_detect_data_type_continuous() {
566 assert_eq!(detect_data_type(&[0.5, 1.5, 2.3]), DataKind::Continuous);
567 assert_eq!(detect_data_type(&[-1.0, 0.0, 1.0]), DataKind::Continuous);
568 assert_eq!(detect_data_type(&[1.0, 2.5, 3.0]), DataKind::Continuous);
569 }
570
571 #[test]
572 fn test_ks_test_uniform() {
573 let data: Vec<f64> = (0..20).map(|i| i as f64 / 20.0).collect();
575 let ks = ks_test(&data, |x| x.clamp(0.0, 1.0));
576 assert!(ks.statistic < 0.15);
577 }
578
579 #[test]
580 fn test_fit_all_returns_results() {
581 let data: Vec<f64> = (0..50).map(|i| (i as f64) * 0.1 + 0.5).collect();
582 let results = fit_all(&data).unwrap();
583 assert!(!results.is_empty());
584 for i in 1..results.len() {
586 assert!(results[i].aic >= results[i - 1].aic);
587 }
588 }
589
590 #[test]
591 fn test_fit_best_normal_data() {
592 let data = vec![
594 4.1, 5.2, 5.8, 4.7, 5.3, 4.9, 6.1, 4.5, 5.5, 5.0, 4.8, 5.1, 4.3, 5.7, 4.6, 5.4, 4.2,
595 5.9, 5.2, 4.4,
596 ];
597 let best = fit_best(&data).unwrap();
598 assert!(best.aic.is_finite());
600 }
601
602 #[test]
603 fn test_fit_all_discrete() {
604 let data = vec![0.0, 1.0, 2.0, 3.0, 1.0, 0.0, 2.0, 1.0, 0.0, 4.0];
605 let results = fit_all_discrete(&data).unwrap();
606 assert!(!results.is_empty());
607 }
608
609 #[test]
610 fn test_auto_fit_continuous() {
611 let data = vec![1.5, 2.3, 1.8, 2.1, 2.7, 1.9, 2.4, 2.0];
612 let best = auto_fit(&data).unwrap();
613 assert!(!best.name.is_empty());
614 }
615
616 #[test]
617 fn test_auto_fit_discrete() {
618 let data = vec![0.0, 1.0, 2.0, 1.0, 0.0, 3.0, 1.0, 2.0];
619 let best = auto_fit(&data).unwrap();
620 assert!(!best.name.is_empty());
621 }
622
623 #[test]
624 fn test_fit_all_empty_data() {
625 assert!(fit_all(&[]).is_err());
626 }
627}