use color_eyre::eyre::Report;
use color_eyre::Result;
use polars::polars_compute::rolling::QuantileMethod;
use polars::prelude::*;
use std::collections::HashMap;

pub fn collect_lazy(
    lf: LazyFrame,
    use_streaming: bool,
) -> std::result::Result<DataFrame, PolarsError> {
    #[cfg(feature = "streaming")]
    {
        if use_streaming {
            lf.with_new_streaming(true).collect()
        } else {
            lf.collect()
        }
    }
    #[cfg(not(feature = "streaming"))]
    {
        let _ = use_streaming;
        lf.collect()
    }
}
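
// A minimal usage sketch (hedged): the tiny frame below is hypothetical and
// only illustrates how `collect_lazy` is driven; when the `streaming` feature
// is disabled the second argument is simply ignored.
#[allow(dead_code)]
fn collect_lazy_example() -> std::result::Result<DataFrame, PolarsError> {
    let df = df!("x" => &[1i64, 2, 3])?;
    collect_lazy(df.lazy(), false)
}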

pub const SAMPLING_THRESHOLD: usize = 10_000;

#[derive(Clone)]
pub struct ColumnStatistics {
    pub name: String,
    pub dtype: DataType,
    pub count: usize,
    pub null_count: usize,
    pub numeric_stats: Option<NumericStatistics>,
    pub categorical_stats: Option<CategoricalStatistics>,
    pub distribution_info: Option<DistributionInfo>,
}

#[derive(Clone)]
pub struct NumericStatistics {
    pub mean: f64,
    pub std: f64,
    pub min: f64,
    pub max: f64,
    pub median: f64,
    pub q25: f64,
    pub q75: f64,
    pub percentiles: HashMap<u8, f64>,
    pub skewness: f64,
    pub kurtosis: f64,
    pub outliers_iqr: usize,
    pub outliers_zscore: usize,
}

#[derive(Clone)]
pub struct CategoricalStatistics {
    pub unique_count: usize,
    pub mode: Option<String>,
    pub top_values: Vec<(String, usize)>,
    pub min: Option<String>,
    pub max: Option<String>,
}

#[derive(Clone)]
pub struct DistributionInfo {
    pub distribution_type: DistributionType,
    pub confidence: f64,
    pub sample_size: usize,
    pub is_sampled: bool,
    pub fit_quality: Option<f64>,
    pub all_distribution_pvalues: HashMap<DistributionType, f64>,
}

#[derive(Clone)]
pub struct DistributionAnalysis {
    pub column_name: String,
    pub distribution_type: DistributionType,
    pub confidence: f64,
    pub fit_quality: f64,
    pub characteristics: DistributionCharacteristics,
    pub outliers: OutlierAnalysis,
    pub percentiles: PercentileBreakdown,
    pub sorted_sample_values: Vec<f64>,
    pub is_sampled: bool,
    pub sample_size: usize,
    pub all_distribution_pvalues: HashMap<DistributionType, f64>,
}

#[derive(Clone)]
pub struct DistributionCharacteristics {
    pub shapiro_wilk_stat: Option<f64>,
    pub shapiro_wilk_pvalue: Option<f64>,
    pub skewness: f64,
    pub kurtosis: f64,
    pub mean: f64,
    pub median: f64,
    pub std_dev: f64,
    pub variance: f64,
    pub coefficient_of_variation: f64,
    pub mode: Option<f64>,
}

#[derive(Clone)]
pub struct OutlierAnalysis {
    pub total_count: usize,
    pub percentage: f64,
    pub iqr_count: usize,
    pub zscore_count: usize,
    pub outlier_rows: Vec<OutlierRow>,
}

#[derive(Clone)]
pub struct OutlierRow {
    pub row_index: usize,
    pub column_value: f64,
    pub context_data: HashMap<String, String>,
    pub detection_method: OutlierMethod,
    pub z_score: Option<f64>,
    pub iqr_position: Option<IqrPosition>,
}

#[derive(Clone, Debug)]
pub enum OutlierMethod {
    IQR,
    ZScore,
    Both,
}

#[derive(Clone, Debug)]
pub enum IqrPosition {
    BelowLowerFence,
    AboveUpperFence,
}

#[derive(Clone)]
pub struct PercentileBreakdown {
    pub p1: f64,
    pub p5: f64,
    pub p25: f64,
    pub p50: f64,
    pub p75: f64,
    pub p95: f64,
    pub p99: f64,
}

#[derive(Clone)]
pub struct CorrelationMatrix {
    pub columns: Vec<String>,
    pub correlations: Vec<Vec<f64>>,
    pub p_values: Option<Vec<Vec<f64>>>,
    pub sample_sizes: Vec<Vec<usize>>,
}

#[derive(Clone)]
pub struct CorrelationPair {
    pub column1: String,
    pub column2: String,
    pub correlation: f64,
    pub p_value: Option<f64>,
    pub sample_size: usize,
    pub covariance: f64,
    pub r_squared: f64,
    pub stats1: ColumnStats,
    pub stats2: ColumnStats,
}

#[derive(Clone)]
pub struct ColumnStats {
    pub mean: f64,
    pub std: f64,
    pub min: f64,
    pub max: f64,
}

#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistributionType {
    #[default]
    Normal,
    LogNormal,
    Uniform,
    PowerLaw,
    Exponential,
    Beta,
    Gamma,
    ChiSquared,
    StudentsT,
    Poisson,
    Bernoulli,
    Binomial,
    Geometric,
    Weibull,
    Unknown,
}

impl std::fmt::Display for DistributionType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DistributionType::Normal => write!(f, "Normal"),
            DistributionType::LogNormal => write!(f, "Log-Normal"),
            DistributionType::Uniform => write!(f, "Uniform"),
            DistributionType::PowerLaw => write!(f, "Power Law"),
            DistributionType::Exponential => write!(f, "Exponential"),
            DistributionType::Beta => write!(f, "Beta"),
            DistributionType::Gamma => write!(f, "Gamma"),
            DistributionType::ChiSquared => write!(f, "Chi-Squared"),
            DistributionType::StudentsT => write!(f, "Student's t"),
            DistributionType::Poisson => write!(f, "Poisson"),
            DistributionType::Bernoulli => write!(f, "Bernoulli"),
            DistributionType::Binomial => write!(f, "Binomial"),
            DistributionType::Geometric => write!(f, "Geometric"),
            DistributionType::Weibull => write!(f, "Weibull"),
            DistributionType::Unknown => write!(f, "Unknown"),
        }
    }
}

#[derive(Clone)]
pub struct AnalysisResults {
    pub column_statistics: Vec<ColumnStatistics>,
    pub total_rows: usize,
    pub sample_size: Option<usize>,
    pub sample_seed: u64,
    pub correlation_matrix: Option<CorrelationMatrix>,
    pub distribution_analyses: Vec<DistributionAnalysis>,
}

pub struct AnalysisContext {
    pub has_query: bool,
    pub query: String,
    pub has_filters: bool,
    pub filter_count: usize,
    pub is_drilled_down: bool,
    pub group_key: Option<Vec<String>>,
    pub group_columns: Option<Vec<String>>,
}

#[derive(Debug, Clone, Copy)]
pub struct ComputeOptions {
    pub include_distribution_info: bool,
    pub include_distribution_analyses: bool,
    pub include_correlation_matrix: bool,
    pub include_skewness_kurtosis_outliers: bool,
    pub polars_streaming: bool,
}

impl Default for ComputeOptions {
    fn default() -> Self {
        Self {
            include_distribution_info: false,
            include_distribution_analyses: false,
            include_correlation_matrix: false,
            include_skewness_kurtosis_outliers: false,
            polars_streaming: true,
        }
    }
}

pub fn compute_statistics(
    lf: &LazyFrame,
    sample_size: Option<usize>,
    seed: u64,
) -> Result<AnalysisResults> {
    compute_statistics_with_options(lf, sample_size, seed, ComputeOptions::default())
}

pub fn compute_statistics_with_options(
    lf: &LazyFrame,
    sample_size: Option<usize>,
    seed: u64,
    options: ComputeOptions,
) -> Result<AnalysisResults> {
    let schema = lf.clone().collect_schema()?;
    let use_streaming = options.polars_streaming;
    let total_rows = {
        let count_df =
            collect_lazy(lf.clone().select([len()]), use_streaming).map_err(Report::from)?;
        if let Some(col) = count_df.get(0) {
            if let Some(AnyValue::UInt32(n)) = col.first() {
                *n as usize
            } else {
                0
            }
        } else {
            0
        }
    };

    let should_sample = sample_size.is_some_and(|t| total_rows >= t);
    let (sampling_threshold, actual_sample_size) = if should_sample {
        let t = sample_size.unwrap();
        (t, Some(t))
    } else {
        (0, None)
    };

    let df = if should_sample {
        sample_dataframe(lf, sampling_threshold, seed, use_streaming)?
    } else {
        collect_lazy(lf.clone(), use_streaming).map_err(Report::from)?
    };

    let mut column_statistics = Vec::new();

    for (name, dtype) in schema.iter() {
        let col = df.column(name)?;
        let series = col.as_materialized_series();
        let count = series.len();
        let null_count = series.null_count();

        let numeric_stats = if is_numeric_type(dtype) {
            Some(compute_numeric_stats(
                series,
                options.include_skewness_kurtosis_outliers,
            )?)
        } else {
            None
        };

        let categorical_stats = if is_categorical_type(dtype) {
            Some(compute_categorical_stats(series)?)
        } else {
            None
        };

        let distribution_info =
            if options.include_distribution_info && is_numeric_type(dtype) && null_count < count {
                Some(infer_distribution(
                    series,
                    series,
                    actual_sample_size.unwrap_or(count),
                    should_sample,
                ))
            } else {
                None
            };

        column_statistics.push(ColumnStatistics {
            name: name.to_string(),
            dtype: dtype.clone(),
            count,
            null_count,
            numeric_stats,
            categorical_stats,
            distribution_info,
        });
    }

    let distribution_analyses = if options.include_distribution_analyses {
        column_statistics
            .iter()
            .filter_map(|col_stat| {
                if let (Some(numeric_stats), Some(dist_info)) =
                    (&col_stat.numeric_stats, &col_stat.distribution_info)
                {
                    if let Ok(series_col) = df.column(&col_stat.name) {
                        let series = series_col.as_materialized_series();
                        Some(compute_advanced_distribution_analysis(
                            &col_stat.name,
                            series,
                            numeric_stats,
                            dist_info,
                            actual_sample_size.unwrap_or(total_rows),
                            should_sample,
                        ))
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
            .collect()
    } else {
        Vec::new()
    };

    let correlation_matrix = if options.include_correlation_matrix {
        compute_correlation_matrix(&df).ok()
    } else {
        None
    };

    Ok(AnalysisResults {
        column_statistics,
        total_rows,
        sample_size: actual_sample_size,
        sample_seed: seed,
        correlation_matrix,
        distribution_analyses,
    })
}
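
// A hedged sketch of the two entry points above; the column name and values
// are made up for illustration. `compute_statistics` uses
// `ComputeOptions::default()`, so the heavier statistics must be requested
// explicitly via `compute_statistics_with_options`.
#[allow(dead_code)]
fn compute_statistics_example() -> Result<()> {
    let lf = df!("value" => &[1.0f64, 2.0, 3.0, 4.0])?.lazy();
    let basic = compute_statistics(&lf, None, 42)?;
    assert_eq!(basic.total_rows, 4);

    let opts = ComputeOptions {
        include_skewness_kurtosis_outliers: true,
        ..ComputeOptions::default()
    };
    let _detailed = compute_statistics_with_options(&lf, None, 42, opts)?;
    Ok(())
}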

pub fn compute_describe_column(
    df: &DataFrame,
    schema: &Schema,
    column_index: usize,
    options: &ComputeOptions,
    actual_sample_size: Option<usize>,
    is_sampled: bool,
) -> Result<ColumnStatistics> {
    let (name, dtype) = schema
        .iter()
        .nth(column_index)
        .ok_or_else(|| color_eyre::eyre::eyre!("column index {} out of range", column_index))?;
    let col = df.column(name)?;
    let series = col.as_materialized_series();
    let count = series.len();
    let null_count = series.null_count();

    let numeric_stats = if is_numeric_type(dtype) {
        Some(compute_numeric_stats(
            series,
            options.include_skewness_kurtosis_outliers,
        )?)
    } else {
        None
    };

    let categorical_stats = if is_categorical_type(dtype) {
        Some(compute_categorical_stats(series)?)
    } else {
        None
    };

    let distribution_info =
        if options.include_distribution_info && is_numeric_type(dtype) && null_count < count {
            Some(infer_distribution(
                series,
                series,
                actual_sample_size.unwrap_or(count),
                is_sampled,
            ))
        } else {
            None
        };

    Ok(ColumnStatistics {
        name: name.to_string(),
        dtype: dtype.clone(),
        count,
        null_count,
        numeric_stats,
        categorical_stats,
        distribution_info,
    })
}

pub fn analysis_results_from_describe(
    column_statistics: Vec<ColumnStatistics>,
    total_rows: usize,
    sample_size: Option<usize>,
    sample_seed: u64,
) -> AnalysisResults {
    AnalysisResults {
        column_statistics,
        total_rows,
        sample_size,
        sample_seed,
        correlation_matrix: None,
        distribution_analyses: Vec::new(),
    }
}

fn build_describe_aggregation_exprs(schema: &Schema) -> Vec<Expr> {
    let mut exprs = Vec::new();
    for (name, dtype) in schema.iter() {
        let name = name.as_str();
        let prefix = format!("{}::", name);
        exprs.push(col(name).count().alias(format!("{}count", prefix)));
        exprs.push(
            col(name)
                .null_count()
                .alias(format!("{}null_count", prefix)),
        );
        if is_numeric_type(dtype) {
            let c = col(name).cast(DataType::Float64);
            exprs.push(c.clone().mean().alias(format!("{}mean", prefix)));
            exprs.push(c.clone().std(1).alias(format!("{}std", prefix)));
            exprs.push(c.clone().min().alias(format!("{}min", prefix)));
            exprs.push(
                c.clone()
                    .quantile(lit(0.25), QuantileMethod::Nearest)
                    .alias(format!("{}q25", prefix)),
            );
            exprs.push(
                c.clone()
                    .quantile(lit(0.5), QuantileMethod::Nearest)
                    .alias(format!("{}median", prefix)),
            );
            exprs.push(
                c.clone()
                    .quantile(lit(0.75), QuantileMethod::Nearest)
                    .alias(format!("{}q75", prefix)),
            );
            exprs.push(c.max().alias(format!("{}max", prefix)));
        } else if is_categorical_type(dtype) {
            exprs.push(col(name).min().alias(format!("{}min", prefix)));
            exprs.push(col(name).max().alias(format!("{}max", prefix)));
        }
    }
    exprs
}
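
// For a numeric column named "price", the expressions above yield aggregate
// columns "price::count", "price::null_count", "price::mean", "price::std",
// "price::min", "price::q25", "price::median", "price::q75", and "price::max"
// in a single-row frame, which `parse_describe_agg_row` below unpacks by the
// same "<column>::<stat>" naming convention.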

fn parse_describe_agg_row(agg_df: &DataFrame, schema: &Schema) -> Vec<ColumnStatistics> {
    let row = 0usize;
    let mut column_statistics = Vec::with_capacity(schema.len());
    for (name, dtype) in schema.iter() {
        let name_str = name.as_str();
        let prefix = format!("{}::", name_str);
        let count: usize = agg_df
            .column(&format!("{}count", prefix))
            .ok()
            .map(|s| match s.get(row) {
                Ok(AnyValue::UInt32(x)) => x as usize,
                _ => 0,
            })
            .unwrap_or(0);
        let null_count: usize = agg_df
            .column(&format!("{}null_count", prefix))
            .ok()
            .map(|s| match s.get(row) {
                Ok(AnyValue::UInt32(x)) => x as usize,
                _ => 0,
            })
            .unwrap_or(0);
        let numeric_stats = if is_numeric_type(dtype) {
            let mean = get_f64(agg_df, &format!("{}mean", prefix), row);
            let std = get_f64(agg_df, &format!("{}std", prefix), row);
            let min = get_f64(agg_df, &format!("{}min", prefix), row);
            let q25 = get_f64(agg_df, &format!("{}q25", prefix), row);
            let median = get_f64(agg_df, &format!("{}median", prefix), row);
            let q75 = get_f64(agg_df, &format!("{}q75", prefix), row);
            let max = get_f64(agg_df, &format!("{}max", prefix), row);
            let mut percentiles = HashMap::new();
            percentiles.insert(25u8, q25);
            percentiles.insert(50u8, median);
            percentiles.insert(75u8, q75);
            Some(NumericStatistics {
                mean,
                std,
                min,
                max,
                median,
                q25,
                q75,
                percentiles,
                // Sentinel defaults: compute_distribution_statistics treats
                // skewness == 0.0 with kurtosis == 3.0 (and zero outlier
                // counts) as "not yet computed" and fills in real values later.
                skewness: 0.0,
                kurtosis: 3.0,
                outliers_iqr: 0,
                outliers_zscore: 0,
            })
        } else {
            None
        };
        let categorical_stats = if is_categorical_type(dtype) {
            let min = get_str(agg_df, &format!("{}min", prefix), row);
            let max = get_str(agg_df, &format!("{}max", prefix), row);
            Some(CategoricalStatistics {
                unique_count: 0,
                mode: None,
                top_values: Vec::new(),
                min,
                max,
            })
        } else {
            None
        };
        column_statistics.push(ColumnStatistics {
            name: name_str.to_string(),
            dtype: dtype.clone(),
            count,
            null_count,
            numeric_stats,
            categorical_stats,
            distribution_info: None,
        });
    }
    column_statistics
}

pub fn compute_describe_from_lazy(
    lf: &LazyFrame,
    total_rows: usize,
    sample_size: Option<usize>,
    seed: u64,
    polars_streaming: bool,
) -> Result<AnalysisResults> {
    let schema = lf.clone().collect_schema()?;
    let should_sample = sample_size.is_some_and(|t| total_rows >= t);
    if should_sample {
        let threshold = sample_size.unwrap();
        let df = sample_dataframe(lf, threshold, seed, polars_streaming)?;
        return compute_describe_single_aggregation(
            &df,
            &schema,
            total_rows,
            Some(threshold),
            seed,
            polars_streaming,
        );
    }
    let exprs = build_describe_aggregation_exprs(&schema);
    let agg_df = collect_lazy(lf.clone().select(exprs), polars_streaming).map_err(Report::from)?;
    let column_statistics = parse_describe_agg_row(&agg_df, &schema);
    Ok(analysis_results_from_describe(
        column_statistics,
        total_rows,
        None,
        seed,
    ))
}

pub fn compute_describe_single_aggregation(
    df: &DataFrame,
    schema: &Schema,
    total_rows: usize,
    sample_size: Option<usize>,
    sample_seed: u64,
    polars_streaming: bool,
) -> Result<AnalysisResults> {
    let exprs = build_describe_aggregation_exprs(schema);
    let agg_df =
        collect_lazy(df.clone().lazy().select(exprs), polars_streaming).map_err(Report::from)?;
    let column_statistics = parse_describe_agg_row(&agg_df, schema);
    Ok(analysis_results_from_describe(
        column_statistics,
        total_rows,
        sample_size,
        sample_seed,
    ))
}

fn get_f64(df: &DataFrame, col_name: &str, row: usize) -> f64 {
    df.column(col_name)
        .ok()
        .and_then(|s| {
            let v = s.get(row).ok()?;
            match v {
                AnyValue::Float64(x) => Some(x),
                AnyValue::Float32(x) => Some(x as f64),
                AnyValue::Int32(x) => Some(x as f64),
                AnyValue::Int64(x) => Some(x as f64),
                AnyValue::UInt32(x) => Some(x as f64),
                AnyValue::Null => Some(f64::NAN),
                _ => None,
            }
        })
        .unwrap_or(f64::NAN)
}

fn get_str(df: &DataFrame, col_name: &str, row: usize) -> Option<String> {
    df.column(col_name)
        .ok()
        .and_then(|s| s.get(row).ok().map(|v| v.str_value().to_string()))
}

pub fn compute_distribution_statistics(
    results: &mut AnalysisResults,
    lf: &LazyFrame,
    sample_size: Option<usize>,
    seed: u64,
    polars_streaming: bool,
) -> Result<()> {
    let should_sample = sample_size.is_some_and(|t| results.total_rows >= t);
    let sampling_threshold = sample_size.unwrap_or(0);
    let actual_sample_size = if should_sample {
        Some(sampling_threshold)
    } else {
        None
    };

    let df = if should_sample {
        sample_dataframe(lf, sampling_threshold, seed, polars_streaming)?
    } else {
        collect_lazy(lf.clone(), polars_streaming).map_err(Report::from)?
    };

    for col_stat in &mut results.column_statistics {
        if col_stat.distribution_info.is_none()
            && is_numeric_type(&col_stat.dtype)
            && col_stat.null_count < col_stat.count
        {
            let series = df.column(&col_stat.name)?.as_materialized_series();
            col_stat.distribution_info = Some(infer_distribution(
                series,
                series,
                actual_sample_size.unwrap_or(col_stat.count),
                should_sample,
            ));
        }

        if let Some(ref mut num_stats) = col_stat.numeric_stats {
            let needs_advanced_stats = num_stats.skewness == 0.0
                && num_stats.kurtosis == 3.0
                && num_stats.outliers_iqr == 0
                && num_stats.outliers_zscore == 0
                && col_stat.count > 0;
            if needs_advanced_stats {
                let series = df.column(&col_stat.name)?.as_materialized_series();
                num_stats.skewness = compute_skewness(series);
                num_stats.kurtosis = compute_kurtosis(series);
                let (out_iqr, out_zscore) = detect_outliers(
                    series,
                    num_stats.q25,
                    num_stats.q75,
                    num_stats.median,
                    num_stats.mean,
                    num_stats.std,
                );
                num_stats.outliers_iqr = out_iqr;
                num_stats.outliers_zscore = out_zscore;
            }
        }
    }

    if results.distribution_analyses.is_empty() {
        results.distribution_analyses = results
            .column_statistics
            .iter()
            .filter_map(|col_stat| {
                if let (Some(numeric_stats), Some(dist_info)) =
                    (&col_stat.numeric_stats, &col_stat.distribution_info)
                {
                    if let Ok(series_col) = df.column(&col_stat.name) {
                        let series = series_col.as_materialized_series();
                        Some(compute_advanced_distribution_analysis(
                            &col_stat.name,
                            series,
                            numeric_stats,
                            dist_info,
                            actual_sample_size.unwrap_or(results.total_rows),
                            should_sample,
                        ))
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
            .collect();
    }

    Ok(())
}

pub fn compute_correlation_statistics(
    results: &mut AnalysisResults,
    lf: &LazyFrame,
    polars_streaming: bool,
) -> Result<()> {
    if results.correlation_matrix.is_none() {
        let df = collect_lazy(lf.clone(), polars_streaming).map_err(Report::from)?;
        results.correlation_matrix = compute_correlation_matrix(&df).ok();
    }
    Ok(())
}

fn is_numeric_type(dtype: &DataType) -> bool {
    dtype.is_numeric()
}

fn is_categorical_type(dtype: &DataType) -> bool {
    matches!(dtype, DataType::String | DataType::Categorical(..))
}

pub fn sample_dataframe(
    lf: &LazyFrame,
    sample_size: usize,
    seed: u64,
    polars_streaming: bool,
) -> Result<DataFrame> {
    let collect_multiplier = if sample_size <= 1000 {
        5
    } else if sample_size <= 5000 {
        3
    } else {
        2
    };

    let collect_limit = (sample_size * collect_multiplier).min(50_000);
    let df = collect_lazy(lf.clone().limit(collect_limit as u32), polars_streaming)
        .map_err(Report::from)?;
    let total_collected = df.height();

    if total_collected <= sample_size {
        return Ok(df);
    }

    let step = total_collected / sample_size;
    let start_offset = (seed as usize) % step;

    let indices: Vec<u32> = (0..sample_size)
        .map(|i| {
            let idx = start_offset + i * step;
            (idx.min(total_collected - 1)) as u32
        })
        .collect();

    let indices_ca = UInt32Chunked::new("indices".into(), indices);
    df.take(&indices_ca)
        .map_err(|e| color_eyre::eyre::eyre!("Sampling error: {}", e))
}
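
// A worked example of the systematic sampling above: with sample_size = 1_000
// the multiplier is 5, so up to 5_000 rows are collected; step = 5_000 / 1_000
// = 5, and seed = 42 gives start_offset = 42 % 5 = 2, so rows 2, 7, 12, ...
// are taken until 1_000 indices have been gathered.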
854
855fn get_numeric_values_as_f64(series: &Series) -> Vec<f64> {
856 let max_len = 10000;
857 let limited_series = if series.len() > max_len {
858 series.slice(0, max_len)
859 } else {
860 series.clone()
861 };
862
863 if let Ok(f64_series) = limited_series.f64() {
864 f64_series.iter().flatten().take(max_len).collect()
865 } else if let Ok(i64_series) = limited_series.i64() {
866 i64_series
867 .iter()
868 .filter_map(|v| v.map(|x| x as f64))
869 .take(max_len)
870 .collect()
871 } else if let Ok(i32_series) = limited_series.i32() {
872 i32_series
873 .iter()
874 .filter_map(|v| v.map(|x| x as f64))
875 .take(max_len)
876 .collect()
877 } else if let Ok(u64_series) = limited_series.u64() {
878 u64_series
879 .iter()
880 .filter_map(|v| v.map(|x| x as f64))
881 .take(max_len)
882 .collect()
883 } else if let Ok(u32_series) = limited_series.u32() {
884 u32_series
885 .iter()
886 .filter_map(|v| v.map(|x| x as f64))
887 .take(max_len)
888 .collect()
889 } else if let Ok(f32_series) = limited_series.f32() {
890 f32_series
891 .iter()
892 .filter_map(|v| v.map(|x| x as f64))
893 .take(max_len)
894 .collect()
895 } else {
896 match limited_series.cast(&DataType::Float64) {
897 Ok(cast_series) => {
898 if let Ok(f64_series) = cast_series.f64() {
899 f64_series.iter().flatten().take(max_len).collect()
900 } else {
901 Vec::new()
902 }
903 }
904 Err(_) => Vec::new(),
905 }
906 }
907}

fn compute_numeric_stats(series: &Series, include_advanced: bool) -> Result<NumericStatistics> {
    let mean = series.mean().unwrap_or(f64::NAN);
    let std = series.std(1).unwrap_or(f64::NAN);
    let min = if let Ok(min_val) = series.min::<f64>() {
        min_val.unwrap_or(f64::NAN)
    } else if let Ok(min_val) = series.min::<i64>() {
        min_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else if let Ok(min_val) = series.min::<i32>() {
        min_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else {
        f64::NAN
    };

    let max = if let Ok(max_val) = series.max::<f64>() {
        max_val.unwrap_or(f64::NAN)
    } else if let Ok(max_val) = series.max::<i64>() {
        max_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else if let Ok(max_val) = series.max::<i32>() {
        max_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else {
        f64::NAN
    };

    let mut percentiles = HashMap::new();
    let values: Vec<f64> = get_numeric_values_as_f64(series);

    if !values.is_empty() {
        let mut sorted = values.clone();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let n = sorted.len();

        for p in &[1, 5, 25, 50, 75, 95, 99] {
            let idx = ((*p as f64 / 100.0) * (n - 1) as f64).round() as usize;
            let idx = idx.min(n - 1);
            percentiles.insert(*p, sorted[idx]);
        }
    }

    let median = percentiles.get(&50).copied().unwrap_or(f64::NAN);
    let q25 = percentiles.get(&25).copied().unwrap_or(f64::NAN);
    let q75 = percentiles.get(&75).copied().unwrap_or(f64::NAN);

    let (skewness, kurtosis, outliers_iqr, outliers_zscore) = if include_advanced {
        let (out_iqr, out_zscore) = detect_outliers(series, q25, q75, median, mean, std);
        (
            compute_skewness(series),
            compute_kurtosis(series),
            out_iqr,
            out_zscore,
        )
    } else {
        // Sentinel defaults recognized by compute_distribution_statistics.
        (0.0, 3.0, 0, 0)
    };

    Ok(NumericStatistics {
        mean,
        std,
        min,
        max,
        median,
        q25,
        q75,
        percentiles,
        skewness,
        kurtosis,
        outliers_iqr,
        outliers_zscore,
    })
}

fn compute_skewness(series: &Series) -> f64 {
    let mean = series.mean().unwrap_or(0.0);
    let std = series.std(1).unwrap_or(1.0);
    let n = series.len() as f64;

    if std == 0.0 || n < 3.0 {
        return 0.0;
    }

    let values: Vec<f64> = get_numeric_values_as_f64(series);

    if values.is_empty() {
        return 0.0;
    }

    let sum_cubed_deviations: f64 = values
        .iter()
        .map(|v| {
            let deviation = (v - mean) / std;
            deviation * deviation * deviation
        })
        .sum();

    (n / ((n - 1.0) * (n - 2.0))) * sum_cubed_deviations
}
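
// The statistic above is the adjusted Fisher-Pearson sample skewness,
//   g1 = n / ((n - 1)(n - 2)) * sum_i ((x_i - mean) / s)^3,
// which is ~0 for symmetric data; it is only meaningful when n >= 3 and
// s > 0, hence the early return.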

fn compute_kurtosis(series: &Series) -> f64 {
    let mean = series.mean().unwrap_or(0.0);
    let std = series.std(1).unwrap_or(1.0);
    let n = series.len() as f64;

    if std == 0.0 || n < 4.0 {
        return 3.0;
    }

    let values: Vec<f64> = get_numeric_values_as_f64(series);

    if values.is_empty() {
        return 3.0;
    }

    let sum_fourth_deviations: f64 = values
        .iter()
        .map(|v| {
            let deviation = (v - mean) / std;
            let d2 = deviation * deviation;
            d2 * d2
        })
        .sum();

    let k = (n * (n + 1.0) / ((n - 1.0) * (n - 2.0) * (n - 3.0))) * sum_fourth_deviations
        - 3.0 * (n - 1.0) * (n - 1.0) / ((n - 2.0) * (n - 3.0));

    k + 3.0
}
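
// compute_kurtosis returns Pearson (non-excess) kurtosis: the bias-corrected
// excess-kurtosis estimator
//   g2 = [n(n + 1) / ((n - 1)(n - 2)(n - 3))] * sum_i ((x_i - mean) / s)^4
//        - 3(n - 1)^2 / ((n - 2)(n - 3))
// shifted by +3, so a normal distribution scores about 3.0 (the same value
// used as the "not yet computed" sentinel elsewhere in this module).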

fn detect_outliers(
    series: &Series,
    q25: f64,
    q75: f64,
    _median: f64,
    mean: f64,
    std: f64,
) -> (usize, usize) {
    if q25.is_nan() || q75.is_nan() {
        return (0, 0);
    }

    let iqr = q75 - q25;
    let lower_fence = q25 - 1.5 * iqr;
    let upper_fence = q75 + 1.5 * iqr;

    let mut outliers_iqr = 0;
    let mut outliers_zscore = 0;

    if std > 0.0 {
        let values: Vec<f64> = get_numeric_values_as_f64(series);

        for v in values {
            if v < lower_fence || v > upper_fence {
                outliers_iqr += 1;
            }

            let z = (v - mean).abs() / std;
            if z > 3.0 {
                outliers_zscore += 1;
            }
        }
    }

    (outliers_iqr, outliers_zscore)
}
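
// A worked example for detect_outliers: with q25 = 10 and q75 = 20, IQR = 10,
// so the fences are 10 - 1.5 * 10 = -5 and 20 + 1.5 * 10 = 35. A value of 40
// counts toward outliers_iqr, and it also counts toward outliers_zscore
// whenever |40 - mean| / std > 3; the two counters are tallied independently.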

fn compute_categorical_stats(series: &Series) -> Result<CategoricalStatistics> {
    let value_counts = series.value_counts(false, false, "counts".into(), false)?;
    let unique_count = value_counts.height();

    let mode = if unique_count > 0 {
        if let Some(first_row) = value_counts.get(0) {
            first_row.first().map(|v| v.str_value().to_string())
        } else {
            None
        }
    } else {
        None
    };

    // `DataFrame::get` indexes rows, not columns, so read the value and count
    // columns directly when collecting the top values.
    let mut top_values = Vec::new();
    let cols = value_counts.get_columns();
    if let (Some(value_col), Some(count_col)) = (cols.first(), cols.get(1)) {
        for i in 0..unique_count.min(10) {
            if let (Ok(value), Ok(count)) = (value_col.get(i), count_col.get(i)) {
                let value_str = value.str_value();
                if let Ok(count_u32) = count.try_extract::<u32>() {
                    top_values.push((value_str.to_string(), count_u32 as usize));
                }
            }
        }
    }

    let min = if let Ok(str_series) = series.str() {
        let mut min_val: Option<String> = None;
        for s in str_series.iter().flatten() {
            let s_str = s.to_string();
            min_val = match min_val {
                None => Some(s_str.clone()),
                Some(ref current) if s_str < *current => Some(s_str),
                Some(current) => Some(current),
            };
        }
        min_val
    } else {
        None
    };

    let max = if let Ok(str_series) = series.str() {
        let mut max_val: Option<String> = None;
        for s in str_series.iter().flatten() {
            let s_str = s.to_string();
            max_val = match max_val {
                None => Some(s_str.clone()),
                Some(ref current) if s_str > *current => Some(s_str),
                Some(current) => Some(current),
            };
        }
        max_val
    } else {
        None
    };

    Ok(CategoricalStatistics {
        unique_count,
        mode,
        top_values,
        min,
        max,
    })
}

fn infer_distribution(
    _series: &Series,
    sample: &Series,
    sample_size: usize,
    is_sampled: bool,
) -> DistributionInfo {
    if sample_size < 3 {
        return DistributionInfo {
            distribution_type: DistributionType::Unknown,
            confidence: 0.0,
            sample_size,
            is_sampled,
            fit_quality: None,
            all_distribution_pvalues: HashMap::new(),
        };
    }

    let max_convert = 10000.min(sample.len());
    let values: Vec<f64> = if sample.len() > max_convert {
        let all_values = get_numeric_values_as_f64(sample);
        all_values.into_iter().take(max_convert).collect()
    } else {
        get_numeric_values_as_f64(sample)
    };

    if values.is_empty() {
        return DistributionInfo {
            distribution_type: DistributionType::Unknown,
            confidence: 0.0,
            sample_size,
            is_sampled,
            fit_quality: None,
            all_distribution_pvalues: HashMap::new(),
        };
    }

    let mean: f64 = values.iter().sum::<f64>() / values.len() as f64;
    let variance: f64 =
        values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
    let std = variance.sqrt();

    let mut candidates: Vec<(DistributionType, f64, f64)> = Vec::new();

    let normal_fit = calculate_normal_fit_quality(&values, mean, std);
    let normal_confidence = normal_fit.min(0.95);
    candidates.push((DistributionType::Normal, normal_fit, normal_confidence));

    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() > 10 {
        let lognormal_fit = calculate_lognormal_fit_quality(&values);
        let lognormal_confidence = lognormal_fit.min(0.95);
        if lognormal_fit > 0.01 {
            candidates.push((
                DistributionType::LogNormal,
                lognormal_fit,
                lognormal_confidence,
            ));
        }
    }

    let uniformity_pvalue = chi_square_uniformity_test(&values);
    let uniform_fit = calculate_uniform_fit_quality(&values);
    let uniform_confidence = uniformity_pvalue.min(0.95);
    candidates.push((DistributionType::Uniform, uniform_fit, uniform_confidence));

    let powerlaw_pvalue = test_power_law(&values);
    let powerlaw_fit = calculate_power_law_fit_quality(&values);
    let powerlaw_confidence = powerlaw_pvalue.min(0.95);
    if powerlaw_pvalue > 0.0 {
        candidates.push((
            DistributionType::PowerLaw,
            powerlaw_fit,
            powerlaw_confidence,
        ));
    }

    let positive_exp: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_exp.len() > 10 {
        let exp_score = test_exponential(&values);
        let exp_fit = calculate_exponential_fit_quality(&values);
        let exp_confidence = exp_score.min(0.95);
        if exp_score > 0.0 {
            candidates.push((DistributionType::Exponential, exp_fit, exp_confidence));
        }
    }

    let beta_score = test_beta(&values);
    let beta_fit = calculate_beta_fit_quality(&values);
    let beta_confidence = beta_score.min(0.95);
    if beta_score > 0.0 {
        candidates.push((DistributionType::Beta, beta_fit, beta_confidence));
    }

    let gamma_score = test_gamma(&values);
    let gamma_fit = calculate_gamma_fit_quality(&values);
    let gamma_confidence = gamma_score.min(0.95);
    if gamma_score > 0.0 {
        candidates.push((DistributionType::Gamma, gamma_fit, gamma_confidence));
    }

    let chi2_score = test_chi_squared(&values);
    let chi2_fit = calculate_chi_squared_fit_quality(&values);
    let chi2_confidence = chi2_score.min(0.95);
    if chi2_score > 0.0 {
        candidates.push((DistributionType::ChiSquared, chi2_fit, chi2_confidence));
    }

    let t_score = test_students_t(&values);
    let t_fit = calculate_students_t_fit_quality(&values);
    let t_confidence = t_score.min(0.95);
    if t_score > 0.0 {
        candidates.push((DistributionType::StudentsT, t_fit, t_confidence));
    }

    let poisson_score = test_poisson(&values);
    let poisson_fit = calculate_poisson_fit_quality(&values);
    let poisson_confidence = poisson_score.min(0.95);
    if poisson_score > 0.0 {
        candidates.push((DistributionType::Poisson, poisson_fit, poisson_confidence));
    }

    let bernoulli_score = test_bernoulli(&values);
    let bernoulli_fit = calculate_bernoulli_fit_quality(&values);
    let bernoulli_confidence = bernoulli_score.min(0.95);
    let binary_count = values.iter().filter(|&&v| v == 0.0 || v == 1.0).count();
    if bernoulli_score > 0.01 && binary_count as f64 / values.len() as f64 > 0.9 {
        candidates.push((
            DistributionType::Bernoulli,
            bernoulli_fit,
            bernoulli_confidence,
        ));
    }

    let max_value = values.iter().fold(0.0f64, |a, &b| a.max(b));
    if max_value > 1.0 {
        let binomial_score = test_binomial(&values);
        let binomial_fit = calculate_binomial_fit_quality(&values);
        let binomial_confidence = binomial_score.min(0.95);
        let threshold = if values.len() > 5000 { 0.0 } else { 0.01 };
        if binomial_score > threshold {
            candidates.push((
                DistributionType::Binomial,
                binomial_fit,
                binomial_confidence,
            ));
        }
    }

    if values.len() <= 10000 {
        let non_negative_int_count = values
            .iter()
            .filter(|&&v| v >= 0.0 && v == v.floor() && v.is_finite())
            .count();

        if non_negative_int_count as f64 / values.len() as f64 > 0.9 {
            let geometric_score = test_geometric(&values);
            let geometric_fit = calculate_geometric_fit_quality(&values);
            let geometric_confidence = geometric_score.min(0.95);
            if geometric_score > 0.01 {
                candidates.push((
                    DistributionType::Geometric,
                    geometric_fit,
                    geometric_confidence,
                ));
            }
        }
    }

    let positive_weibull: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_weibull.len() > 10 {
        let weibull_score = test_weibull(&values);
        let weibull_fit = calculate_weibull_fit_quality(&values);
        let weibull_confidence = weibull_score.min(0.95);
        if weibull_score > 0.01 {
            candidates.push((DistributionType::Weibull, weibull_fit, weibull_confidence));
        }
    }

    let mut all_pvalues = HashMap::new();
    for (dist_type, _, confidence) in &candidates {
        all_pvalues.insert(*dist_type, *confidence);
    }

    if let Some(best) = candidates.iter().max_by(|a, b| {
        let p_cmp = a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal);
        if p_cmp != std::cmp::Ordering::Equal {
            return p_cmp;
        }
        if (a.2 - b.2).abs() < 0.01 {
            a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
        } else {
            p_cmp
        }
    }) {
        DistributionInfo {
            distribution_type: best.0,
            confidence: best.2,
            sample_size,
            is_sampled,
            fit_quality: Some(best.1),
            all_distribution_pvalues: all_pvalues,
        }
    } else {
        DistributionInfo {
            distribution_type: DistributionType::Unknown,
            confidence: 0.50,
            sample_size,
            is_sampled,
            fit_quality: Some(0.5),
            all_distribution_pvalues: HashMap::new(),
        }
    }
}

fn approximate_shapiro_wilk(values: &[f64]) -> (Option<f64>, Option<f64>) {
    let n = values.len();
    if n < 3 {
        return (None, None);
    }

    let mean: f64 = values.iter().sum::<f64>() / n as f64;
    let variance: f64 = values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
    let std = variance.sqrt();

    if std == 0.0 {
        return (None, None);
    }

    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let mut sum_expected_sq = 0.0;
    let mut sum_data_sq = 0.0;
    let mut sum_product = 0.0;

    for (i, &value) in sorted.iter().enumerate() {
        let p = (i as f64 + 1.0 - 0.375) / (n as f64 + 0.25);
        let expected_quantile = normal_quantile(p);
        let standardized_value = (value - mean) / std;

        sum_expected_sq += expected_quantile * expected_quantile;
        sum_data_sq += standardized_value * standardized_value;
        sum_product += expected_quantile * standardized_value;
    }

    let sw_stat = if sum_expected_sq > 0.0 && sum_data_sq > 0.0 {
        (sum_product * sum_product) / (sum_expected_sq * sum_data_sq)
    } else {
        0.0
    };

    let sw_stat = sw_stat.clamp(0.0, 1.0);

    let skewness: f64 = values
        .iter()
        .map(|v| ((v - mean) / std).powi(3))
        .sum::<f64>()
        / n as f64;

    let kurtosis: f64 = values
        .iter()
        .map(|v| ((v - mean) / std).powi(4))
        .sum::<f64>()
        / n as f64;

    let skew_penalty = (skewness.abs() / 2.0).min(1.0);
    let kurt_penalty = ((kurtosis - 3.0).abs() / 2.0).min(1.0);

    let w_factor = sw_stat;
    let penalty_factor = 1.0 - (skew_penalty + kurt_penalty) / 2.0;
    let pvalue = (w_factor * 0.7 + penalty_factor * 0.3).clamp(0.0, 1.0);

    (Some(sw_stat), Some(pvalue))
}
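
// A note on the approximation above: the W statistic is the squared
// correlation between the sorted, standardized sample and Blom-style normal
// quantiles, Phi^-1((i - 0.375) / (n + 0.25)). The pseudo p-value then blends
// W (weight 0.7) with penalties for skewness and excess kurtosis (weight 0.3)
// rather than using the exact Shapiro-Wilk null distribution.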

fn calculate_chi_square_uniformity(values: &[f64]) -> Option<f64> {
    let n = values.len();
    if n < 10 {
        return None;
    }

    let min = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let range = max - min;

    if range == 0.0 {
        return None;
    }

    let bins = 10;
    let mut counts = vec![0; bins];

    for &v in values {
        let bin = (((v - min) / range) * bins as f64) as usize;
        let bin = bin.min(bins - 1);
        counts[bin] += 1;
    }

    let expected = n as f64 / bins as f64;
    let chi_square: f64 = counts
        .iter()
        .map(|&count| {
            let diff = count as f64 - expected;
            diff * diff / expected
        })
        .sum();

    Some(chi_square)
}

fn kolmogorov_smirnov_test<F>(values: &[f64], theoretical_cdf: F) -> f64
where
    F: Fn(f64) -> f64,
{
    if values.is_empty() {
        return 1.0;
    }

    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let n = sorted.len();
    let mut max_distance: f64 = 0.0;

    for (i, &x) in sorted.iter().enumerate() {
        let empirical_cdf = (i + 1) as f64 / n as f64;
        let theoretical_cdf_val = theoretical_cdf(x);
        let distance = (empirical_cdf - theoretical_cdf_val).abs();
        max_distance = max_distance.max(distance);
    }

    max_distance
}
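
// A hedged sanity check for kolmogorov_smirnov_test: against the Uniform(0, 1)
// CDF (the identity on [0, 1]), an evenly spaced sample has zero KS distance.
// The values here are illustrative only.
#[allow(dead_code)]
fn kolmogorov_smirnov_example() {
    let values = [0.25, 0.5, 0.75, 1.0];
    let d = kolmogorov_smirnov_test(&values, |x| x.clamp(0.0, 1.0));
    assert!(d < 1e-12);
}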

fn chi_square_uniformity_test(values: &[f64]) -> f64 {
    if let Some(chi_square) = calculate_chi_square_uniformity(values) {
        (-chi_square / 20.0).exp().clamp(0.0, 1.0)
    } else {
        0.0
    }
}

fn estimate_power_law_mle(values: &[f64]) -> Option<(f64, f64)> {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return None;
    }

    let mut sorted = positive_values.clone();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let xmin = sorted[0];
    let sum_log_ratio: f64 = sorted.iter().map(|&x| (x / xmin).ln()).sum();

    if sum_log_ratio <= 0.0 {
        return None;
    }

    let n = sorted.len() as f64;
    let alpha = 1.0 + n / sum_log_ratio;
    if !(1.5..=4.0).contains(&alpha) {
        return None;
    }

    Some((xmin, alpha))
}
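
// estimate_power_law_mle uses the continuous maximum-likelihood (Hill-style)
// exponent estimate with xmin fixed at the smallest positive observation:
//   alpha = 1 + n / sum_i ln(x_i / xmin)
// and rejects fits whose exponent falls outside the plausible 1.5..=4.0 band.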

fn power_law_ks_test(values: &[f64], xmin: f64, alpha: f64) -> f64 {
    let mut filtered: Vec<f64> = values.iter().filter(|&&v| v >= xmin).copied().collect();
    if filtered.is_empty() {
        return 1.0;
    }
    filtered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let n = filtered.len();
    let mut max_distance: f64 = 0.0;

    for (i, &x) in filtered.iter().enumerate() {
        let empirical_cdf = (i + 1) as f64 / n as f64;
        let theoretical_cdf = powerlaw_cdf(x, xmin, alpha);
        let distance = (empirical_cdf - theoretical_cdf).abs();
        max_distance = max_distance.max(distance);
    }

    max_distance
}

fn approximate_ks_pvalue(ks_stat: f64, n: usize) -> f64 {
    if n < 1 || ks_stat <= 0.0 {
        return 0.0;
    }
    if ks_stat >= 1.0 {
        return 0.0;
    }

    let sqrt_n = (n as f64).sqrt();
    let exponent = -2.0 * (sqrt_n * ks_stat).powi(2);
    let p_value = 2.0 * exponent.exp();

    p_value.clamp(0.0, 1.0)
}
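
// approximate_ks_pvalue keeps only the leading term of the asymptotic
// Kolmogorov distribution, p ≈ 2 * exp(-2 * n * D^2); it is crude for small n
// but cheap, and monotone in the statistic, which is what the candidate
// ranking in infer_distribution relies on.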

fn chi_square_goodness_of_fit(observed: &[usize], expected: &[f64]) -> f64 {
    if observed.len() != expected.len() || observed.is_empty() {
        return 0.0;
    }

    let mut chi_square = 0.0;
    let mut total_observed = 0;
    let mut total_expected = 0.0;

    for (obs, exp) in observed.iter().zip(expected.iter()) {
        total_observed += obs;
        total_expected += exp;
        if *exp > 0.0 {
            let diff = *obs as f64 - exp;
            chi_square += (diff * diff) / exp;
        }
    }

    let df = (observed.len() as i32 - 2).max(1) as usize;
    let p_value = if df > 30 {
        let z = ((2.0 * chi_square).sqrt() - (2.0 * df as f64 - 1.0).sqrt()).max(0.0);
        (-z * z / 2.0).exp()
    } else {
        (-chi_square / (2.0 * df as f64)).exp()
    };

    p_value.clamp(0.0, 1.0)
}

fn power_law_log_likelihood(values: &[f64], xmin: f64, alpha: f64) -> f64 {
    let filtered: Vec<f64> = values.iter().filter(|&&v| v >= xmin).copied().collect();
    if filtered.is_empty() {
        return f64::NEG_INFINITY;
    }

    let n = filtered.len() as f64;
    let sum_log = filtered.iter().map(|&x| (x / xmin).ln()).sum::<f64>();

    n * (alpha - 1.0).ln() - n * xmin.ln() - alpha * sum_log
}

fn weibull_log_likelihood(values: &[f64], shape: f64, scale: f64) -> f64 {
    let positive: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive.is_empty() || shape <= 0.0 || scale <= 0.0 {
        return f64::NEG_INFINITY;
    }

    let n = positive.len() as f64;
    let sum_power = positive
        .iter()
        .map(|&x| (x / scale).powf(shape))
        .sum::<f64>();
    let sum_log = positive.iter().map(|&x| x.ln()).sum::<f64>();

    n * (shape / scale).ln() + (shape - 1.0) * sum_log - sum_power
}

fn test_power_law(values: &[f64]) -> f64 {
    let Some((xmin, alpha)) = estimate_power_law_mle(values) else {
        return 0.0;
    };

    let ks_stat = power_law_ks_test(values, xmin, alpha);

    let positive_values: Vec<f64> = values.iter().filter(|&&v| v >= xmin).copied().collect();
    let n = positive_values.len();
    if n < 10 {
        return 0.0;
    }

    let mut p_value = approximate_ks_pvalue(ks_stat, n);

    let uniform_p = chi_square_uniformity_test(values);
    if uniform_p > 0.1 {
        p_value *= 0.3;
    }

    if p_value > 0.05 {
        let n_f64 = n as f64;
        let mean = positive_values.iter().sum::<f64>() / n_f64;
        let scale = mean / 1.0;
        let shape = 1.5;

        let pl_likelihood = power_law_log_likelihood(values, xmin, alpha);
        let wb_likelihood = weibull_log_likelihood(values, shape, scale);

        if wb_likelihood > pl_likelihood + 5.0 {
            p_value *= 0.5;
        }
    }

    p_value
}

fn test_exponential(values: &[f64]) -> f64 {
    calculate_exponential_fit_quality(values)
}

fn test_beta(values: &[f64]) -> f64 {
    if values.is_empty() || values.len() < 10 {
        return 0.0;
    }

    let has_negatives = values.iter().any(|&v| v < 0.0);
    if has_negatives {
        return 0.0;
    }

    let values_in_range = values.iter().filter(|&&v| (0.0..=1.0).contains(&v)).count();
    let ratio_in_range = values_in_range as f64 / values.len() as f64;
    if ratio_in_range < 0.85 {
        return 0.0;
    }

    let valid_values: Vec<f64> = values
        .iter()
        .filter(|&&v| (0.0..=1.0).contains(&v))
        .copied()
        .collect();

    if valid_values.len() < 10 {
        return 0.0;
    }

    let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
    let variance: f64 = valid_values.iter().map(|v| (v - mean).powi(2)).sum::<f64>()
        / (valid_values.len() - 1) as f64;

    if variance <= 0.0 || mean <= 0.0 || mean >= 1.0 {
        return 0.0;
    }

    let temp = mean * (1.0 - mean) / variance - 1.0;
    if temp <= 0.0 {
        return 0.0;
    }
    let alpha = mean * temp;
    let beta = (1.0 - mean) * temp;

    if alpha <= 0.0 || beta <= 0.0 {
        return 0.0;
    }
    let alpha = alpha.min(1000.0);
    let beta = beta.min(1000.0);

    let ks_stat = kolmogorov_smirnov_test(&valid_values, |x| beta_cdf(x, alpha, beta));
    let n = valid_values.len();
    let ks_pvalue = approximate_ks_pvalue(ks_stat, n);

    let max_variance = mean * (1.0 - mean);
    let variance_score = if max_variance > 0.0 {
        (1.0 - (variance / max_variance).min(1.0)).max(0.0)
    } else {
        0.0
    };
    let confidence = if ks_pvalue > 0.1 {
        ks_pvalue.min(0.95)
    } else if ks_pvalue > 0.05 {
        (ks_pvalue * 7.0).min(0.7)
    } else if ks_pvalue > 0.01 {
        (ks_pvalue * 0.7 + variance_score * 0.3).min(0.5)
    } else {
        (ks_pvalue * 0.3 + variance_score * 0.7).min(0.4)
    };

    confidence.max(0.1)
}

fn gamma_log_likelihood(values: &[f64], shape: f64, scale: f64) -> f64 {
    let positive: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive.is_empty() || shape <= 0.0 || scale <= 0.0 {
        return f64::NEG_INFINITY;
    }

    let n = positive.len() as f64;
    let sum_log = positive.iter().map(|&x| x.ln()).sum::<f64>();
    let sum_x = positive.iter().sum::<f64>();

    let gamma_term = ln_gamma_approx(shape);
    -n * gamma_term - n * shape * scale.ln() + (shape - 1.0) * sum_log - sum_x / scale
}

fn lognormal_log_likelihood(values: &[f64], mu: f64, sigma: f64) -> f64 {
    let positive: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive.is_empty() || sigma <= 0.0 {
        return f64::NEG_INFINITY;
    }

    let n = positive.len() as f64;
    let sum_log = positive.iter().map(|&x| x.ln()).sum::<f64>();
    let sum_log_sq = positive.iter().map(|&x| (x.ln() - mu).powi(2)).sum::<f64>();

    -n / 2.0 * (2.0 * std::f64::consts::PI).ln()
        - n * sigma.ln()
        - sum_log
        - sum_log_sq / (2.0 * sigma * sigma)
}

fn test_gamma(values: &[f64]) -> f64 {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    let variance: f64 = positive_values
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (positive_values.len() - 1) as f64;

    if mean <= 0.0 || variance <= 0.0 {
        return 0.0;
    }

    let shape = (mean * mean) / variance;
    let scale = variance / mean;

    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| gamma_cdf(x, shape, scale));
    let n = positive_values.len();
    let mut p_value = approximate_ks_pvalue(ks_stat, n);

    let cv = variance.sqrt() / mean;
    let p_value_threshold = if n > 5000 { 0.001 } else { 0.01 };
    let ks_stat_threshold = if n > 5000 { 0.15 } else { 0.12 };
    if n > 5000 {
        if ks_stat < 0.3
            && p_value >= 0.0
            && shape > 0.1
            && shape < 500.0
            && scale > 0.001
            && scale < 500.0
            && cv > 0.1
            && cv < 5.0
        {
            p_value = (ks_stat * 6.0).clamp(0.05, 0.30);
        } else if ks_stat < 0.5 && p_value >= 0.0 && shape > 0.1 && scale > 0.001 {
            p_value = (ks_stat * 3.0).clamp(0.01, 0.15);
        }
    } else if p_value < p_value_threshold
        && ks_stat < ks_stat_threshold
        && p_value > 0.0
        && shape > 0.5
        && shape < 100.0
        && scale > 0.01
        && scale < 100.0
        && cv > 0.2
        && cv < 2.0
    {
        p_value = (ks_stat * 2.0).min(0.10).max(p_value);
    }

    let gamma_likelihood = gamma_log_likelihood(values, shape, scale);

    let cv = variance.sqrt() / mean;
    let weibull_shape = if cv < 0.5 {
        2.0
    } else if cv < 1.0 {
        1.5
    } else {
        1.0
    };
    let gamma_approx = match weibull_shape {
        1.0 => 1.0,
        1.5 => 0.9,
        2.0 => 0.886,
        _ => 0.9,
    };
    let weibull_scale = mean / gamma_approx;

    if weibull_scale > 0.0 && weibull_shape > 0.0 && weibull_scale < 1000.0 && weibull_shape < 100.0
    {
        let weibull_likelihood = weibull_log_likelihood(values, weibull_shape, weibull_scale);

        let is_discrete = values.iter().all(|&v| v == v.floor());
        if is_discrete && p_value > 0.0 {
            p_value *= 0.1;
        } else if p_value > 0.1 && weibull_likelihood > gamma_likelihood + 5.0 {
            p_value *= 0.5;
        } else if p_value > 0.05 && weibull_likelihood > gamma_likelihood + 5.0 {
            p_value *= 0.7;
        } else if p_value > 0.05 && weibull_likelihood > gamma_likelihood + 2.0 {
            p_value *= 0.85;
        }
    }

    let lognormal_p = calculate_lognormal_fit_quality(values);
    if p_value > 0.05 && lognormal_p > 0.05 {
        let e_x = mean;
        let var_x = variance;
        let sigma_sq = (1.0 + var_x / (e_x * e_x)).ln();
        let mu = e_x.ln() - sigma_sq / 2.0;
        let sigma = sigma_sq.sqrt();

        let lognormal_likelihood = lognormal_log_likelihood(values, mu, sigma);
        if lognormal_likelihood > gamma_likelihood + 5.0 {
            p_value *= 0.5;
        }
    }

    p_value
}

fn test_chi_squared(values: &[f64]) -> f64 {
    if values.is_empty() || values.len() < 10 {
        return 0.0;
    }

    let positive_values: Vec<f64> = values
        .iter()
        .filter(|&&v| v >= 0.0 && v.is_finite())
        .copied()
        .collect();

    let ratio_positive = positive_values.len() as f64 / values.len() as f64;
    if ratio_positive < 0.95 {
        return 0.0;
    }

    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    let variance: f64 = positive_values
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (positive_values.len() - 1) as f64;

    if mean <= 0.0 {
        return 0.0;
    }

    let expected_var = 2.0 * mean;
    let variance_ratio = if expected_var > 0.0 {
        (variance / expected_var).min(expected_var / variance)
    } else {
        0.0
    };

    let variance_error = (variance - expected_var).abs() / expected_var;
    if variance_error > 0.1 {
        return 0.0;
    }

    let df = mean;
    if df <= 0.0 || df > 1000.0 {
        return 0.0;
    }

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| chi_squared_cdf(x, df));
    let n = positive_values.len();
    let ks_pvalue = approximate_ks_pvalue(ks_stat, n);

    let confidence = if ks_pvalue > 0.1 {
        (ks_pvalue * 0.8 + variance_ratio * 0.2).min(0.95)
    } else if ks_pvalue > 0.05 {
        (ks_pvalue * 0.6 + variance_ratio * 0.4).min(0.7)
    } else if ks_pvalue > 0.01 {
        (ks_pvalue * 0.3 + variance_ratio * 0.7).min(0.5)
    } else {
        (variance_ratio * 0.5).min(0.3)
    };

    confidence.max(0.05)
}

fn test_students_t(values: &[f64]) -> f64 {
    if values.len() < 10 {
        return 0.0;
    }
    let mean = values.iter().sum::<f64>() / values.len() as f64;
    let variance: f64 =
        values.iter().map(|&v| (v - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
    if variance <= 0.0 {
        return 0.0;
    }

    let df = if variance > 1.0 {
        2.0 * variance / (variance - 1.0)
    } else {
        3.0
    };
    let df = df.clamp(1.0, 100.0);

    let ks_stat = kolmogorov_smirnov_test(values, |x| students_t_cdf(x, df));
    let n = values.len();
    approximate_ks_pvalue(ks_stat, n)
}

fn test_poisson(values: &[f64]) -> f64 {
    let non_negative: Vec<f64> = values
        .iter()
        .filter(|&&v| v >= 0.0 && v == v.floor())
        .copied()
        .collect();
    if non_negative.len() < 10 {
        return 0.0;
    }

    let lambda = non_negative.iter().sum::<f64>() / non_negative.len() as f64;
    if lambda <= 0.0 {
        return 0.0;
    }

    let variance: f64 = non_negative
        .iter()
        .map(|&v| (v - lambda).powi(2))
        .sum::<f64>()
        / (non_negative.len() - 1) as f64;
    let variance_ratio = (variance / lambda).min(lambda / variance);
    if variance_ratio < 0.7 {
        return 0.0;
    }

    let max_val = non_negative.iter().fold(0.0f64, |a, &b| a.max(b)) as usize;
    let num_bins = (max_val.min(15) + 1).clamp(5, 20);
    let mut observed = vec![0; num_bins];
    for &v in &non_negative {
        let bin = (v as usize).min(num_bins - 1);
        observed[bin] += 1;
    }

    let mut expected = vec![0.0; num_bins];
    for (bin, exp) in expected.iter_mut().enumerate() {
        let k = bin;
        let mut factorial = 1.0;
        for i in 1..=k {
            factorial *= i as f64;
        }
        let pmf = lambda.powi(k as i32) * (-lambda).exp() / factorial;
        *exp = pmf * non_negative.len() as f64;
    }

    chi_square_goodness_of_fit(&observed, &expected)
}

fn test_bernoulli(values: &[f64]) -> f64 {
    let binary_values: Vec<f64> = values
        .iter()
        .filter(|&&v| v == 0.0 || v == 1.0)
        .copied()
        .collect();
    if binary_values.len() < 10 || binary_values.len() < values.len() / 2 {
        return 0.0;
    }

    let count_0 = binary_values.iter().filter(|&&v| v == 0.0).count();
    let count_1 = binary_values.iter().filter(|&&v| v == 1.0).count();
    let n = binary_values.len();

    let p = count_1 as f64 / n as f64;
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }

    let expected = vec![n as f64 * (1.0 - p), n as f64 * p];
    let observed = vec![count_0, count_1];

    chi_square_goodness_of_fit(&observed, &expected)
}

fn test_binomial(values: &[f64]) -> f64 {
    let non_negative_int: Vec<f64> = values
        .iter()
        .filter(|&&v| v >= 0.0 && v == v.floor())
        .copied()
        .collect();
    if non_negative_int.len() < 10 || non_negative_int.len() < values.len() / 2 {
        return 0.0;
    }

    let max_val = non_negative_int.iter().fold(0.0f64, |a, &b| a.max(b));
    let mean = non_negative_int.iter().sum::<f64>() / non_negative_int.len() as f64;

    let variance: f64 = non_negative_int
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (non_negative_int.len() - 1) as f64;

    if mean <= 0.0 || max_val <= 0.0 {
        return 0.0;
    }

    let n = max_val as usize;
    if n == 0 {
        return 0.0;
    }

    let p = mean / n as f64;
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }

    let expected_var = n as f64 * p * (1.0 - p);
    let variance_ratio = if expected_var > 0.0 {
        (variance / expected_var).min(expected_var / variance)
    } else {
        0.0
    };
    if variance_ratio < 0.7 {
        return 0.0;
    }

    let num_bins = (n + 1).min(20);
    let mut observed = vec![0; num_bins];
    for &v in &non_negative_int {
        let bin = (v as usize).min(num_bins - 1);
        observed[bin] += 1;
    }

    let mut expected = vec![0.0; num_bins];
    for (bin, exp) in expected.iter_mut().enumerate() {
        let k = bin;
        if k <= n {
            let coeff = binomial_coeff(n, k);
            let pmf = coeff * p.powi(k as i32) * (1.0 - p).powi((n - k) as i32);
            *exp = pmf * non_negative_int.len() as f64;
        }
    }

    let mut score = chi_square_goodness_of_fit(&observed, &expected);

    let n_samples = non_negative_int.len();
    if n_samples > 5000 && score > 0.0 {
        if score < 0.01 {
            score = (score * 200.0).min(0.20);
        } else if score < 0.1 {
            score = (score * 2.0).min(0.25);
        }
        if score > 0.0 && variance_ratio >= 0.7 {
            score = score.max(0.05);
        }
    }

    score
}
2119
fn test_geometric(values: &[f64]) -> f64 {
    if values.len() < 10 {
        return 0.0;
    }

    let process_limit = 5000.min(values.len());
    let sample: &[f64] = &values[..process_limit];

    let non_negative_int: Vec<f64> = sample
        .iter()
        .filter(|&&v| v >= 0.0 && v == v.floor() && v.is_finite())
        .copied()
        .collect();

    if non_negative_int.len() < 10 {
        return 0.0;
    }

    let mean = non_negative_int.iter().sum::<f64>() / non_negative_int.len() as f64;
    if mean <= 0.0 {
        return 0.0;
    }

    let max_seen = non_negative_int.iter().fold(0.0f64, |a, &b| a.max(b));
    if max_seen > 100.0 {
        return 0.0;
    }

    if mean > 30.0 {
        return 0.0;
    }

    let variance: f64 = non_negative_int
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (non_negative_int.len() - 1) as f64;
    if variance <= 0.0 {
        return 0.0;
    }

    let expected_var = mean * (mean + 1.0);
    if expected_var <= 0.0 {
        return 0.0;
    }

    let variance_ratio = (variance / expected_var).min(expected_var / variance);
    if variance_ratio < 0.5 {
        return 0.0;
    }

    let p = 1.0 / (mean + 1.0);
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }

    let max_bin = (max_seen as usize + 1).min(50);
    let mut observed = vec![0; max_bin];
    let mut expected = vec![0.0; max_bin];

    for &v in &non_negative_int {
        let bin = v as usize;
        if bin < max_bin {
            observed[bin] += 1;
        }
    }

    let n = non_negative_int.len() as f64;
    let mut total_pmf = 0.0;
    for (k, exp) in expected.iter_mut().enumerate() {
        let pmf = p * (1.0 - p).powi(k as i32);
        *exp = pmf;
        total_pmf += pmf;
    }

    if total_pmf > 0.0 {
        for exp in &mut expected {
            *exp = (*exp / total_pmf) * n;
        }
    } else {
        let uniform_prob = 1.0 / max_bin as f64;
        expected.fill(uniform_prob * n);
    }

    let min_expected = expected.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let chi_square_score = if min_expected > 0.1 {
        chi_square_goodness_of_fit(&observed, &expected)
    } else {
        variance_ratio
    };

    let confidence = if chi_square_score > 0.1 {
        (chi_square_score * 0.8 + variance_ratio * 0.2).min(0.95)
    } else if chi_square_score > 0.05 {
        (chi_square_score * 0.6 + variance_ratio * 0.4).min(0.7)
    } else if chi_square_score > 0.01 {
        (chi_square_score * 0.3 + variance_ratio * 0.7).min(0.5)
    } else {
        (variance_ratio * 0.5).min(0.3)
    };

    confidence.max(0.05)
}

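/// Scores a Weibull fit. Candidate shape parameters are chosen from the
/// coefficient of variation, each scale follows by moment matching through
/// Gamma(1 + 1/k), and the best Kolmogorov-Smirnov statistic is converted to
/// an approximate p-value. That p-value is then discounted when a gamma,
/// exponential, or power-law model explains the data noticeably better.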
fn test_weibull(values: &[f64]) -> f64 {
    let has_negatives = values.iter().any(|&v| v < 0.0);
    if has_negatives {
        return 0.0;
    }

    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let n = positive_values.len() as f64;
    let mean = positive_values.iter().sum::<f64>() / n;
    let variance: f64 = positive_values
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (n - 1.0);

    if mean <= 0.0 || variance <= 0.0 {
        return 0.0;
    }

    let cv = variance.sqrt() / mean;

    let candidate_shapes: Vec<f64> = if cv < 0.3 {
        vec![3.0, 2.5, 2.0, 1.8]
    } else if cv < 0.6 {
        vec![2.0, 1.8, 1.5, 1.3]
    } else if cv < 0.8 {
        vec![1.5, 1.3, 1.2, 1.1]
    } else if cv < 1.0 {
        vec![1.2, 1.1, 1.0, 0.9]
    } else {
        vec![1.0, 0.9, 0.8, 0.7]
    };

    let mut best_shape = candidate_shapes[0];
    let mut best_scale = 1.0;
    let mut best_ks_stat = 1.0;

    for &candidate_shape in &candidate_shapes {
        let k_inv = 1.0 / candidate_shape;
        let gamma_1_plus_1_over_shape = ln_gamma_approx(1.0 + k_inv).exp();
        let candidate_scale = mean / gamma_1_plus_1_over_shape;

        if candidate_scale > 0.0 && candidate_scale < 1000.0 {
            let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| {
                weibull_cdf(x, candidate_shape, candidate_scale)
            });
            if ks_stat < best_ks_stat {
                best_ks_stat = ks_stat;
                best_shape = candidate_shape;
                best_scale = candidate_scale;
            }
        }
    }

    let shape = best_shape;
    let scale = best_scale;

    if scale <= 0.0 || shape <= 0.0 || scale > 1000.0 || shape > 100.0 {
        return 0.0;
    }

    let n_usize = positive_values.len();
    let mut p_value = approximate_ks_pvalue(best_ks_stat, n_usize);

    let gamma_shape = (mean * mean) / variance;
    let gamma_scale = variance / mean;

    if gamma_shape > 0.0 && gamma_scale > 0.0 && gamma_shape < 1000.0 && gamma_scale < 1000.0 {
        let wb_likelihood = weibull_log_likelihood(values, shape, scale);
        let gamma_likelihood = gamma_log_likelihood(values, gamma_shape, gamma_scale);

        if p_value > 0.1 && gamma_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.4;
        } else if p_value > 0.05 && gamma_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.5;
        } else if p_value > 0.05 && gamma_likelihood > wb_likelihood + 2.0 {
            p_value *= 0.7;
        } else if p_value > 0.02 && gamma_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.85;
        }
    }

    let shape_tolerance = if positive_values.len() > 5000 {
        0.5
    } else {
        0.3
    };
    if (shape - 1.0).abs() < shape_tolerance && p_value > 0.3 {
        let exp_lambda = 1.0 / mean;
        if exp_lambda > 0.0 {
            // Exponential log-likelihood: n * ln(lambda) - lambda * sum(x).
            let exp_likelihood = (positive_values.len() as f64) * exp_lambda.ln()
                - exp_lambda * positive_values.iter().sum::<f64>();
            let wb_likelihood = weibull_log_likelihood(values, shape, scale);

            if exp_likelihood > wb_likelihood + 0.5 {
                p_value *= 0.4;
            } else if exp_likelihood > wb_likelihood {
                p_value *= 0.6;
            } else if (shape - 1.0).abs() < 0.2 {
                p_value *= 0.7;
            }
        }
    }

    if let Some((xmin, alpha)) = estimate_power_law_mle(values) {
        let wb_likelihood = weibull_log_likelihood(values, shape, scale);
        let pl_likelihood = power_law_log_likelihood(values, xmin, alpha);

        if p_value > 0.05 && pl_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.5;
        }
    }

    p_value
}

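/// Assembles the full `DistributionAnalysis` for one column: sorts a capped
/// sample (at most 5000 values), runs an approximate Shapiro-Wilk test,
/// derives summary characteristics, and packages the outlier and percentile
/// breakdowns alongside the detected distribution.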
fn compute_advanced_distribution_analysis(
    column_name: &str,
    series: &Series,
    numeric_stats: &NumericStatistics,
    dist_info: &DistributionInfo,
    _sample_size: usize,
    is_sampled: bool,
) -> DistributionAnalysis {
    let max_values = 5000.min(series.len());
    let mut values: Vec<f64> = if series.len() > max_values {
        let limited_series = series.slice(0, max_values);
        get_numeric_values_as_f64(&limited_series)
    } else {
        get_numeric_values_as_f64(series)
    };

    values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let sorted_sample_values = values.clone();
    let actual_sample_size = sorted_sample_values.len();

    let (sw_stat, sw_pvalue) = if values.len() >= 3 {
        approximate_shapiro_wilk(&values)
    } else {
        (None, None)
    };
    let coefficient_of_variation = if numeric_stats.mean != 0.0 {
        numeric_stats.std / numeric_stats.mean.abs()
    } else {
        0.0
    };

    let mode = compute_mode(&values);

    let characteristics = DistributionCharacteristics {
        shapiro_wilk_stat: sw_stat,
        shapiro_wilk_pvalue: sw_pvalue,
        skewness: numeric_stats.skewness,
        kurtosis: numeric_stats.kurtosis,
        mean: numeric_stats.mean,
        median: numeric_stats.median,
        std_dev: numeric_stats.std,
        variance: numeric_stats.std * numeric_stats.std,
        coefficient_of_variation,
        mode,
    };

    let fit_quality = dist_info.fit_quality.unwrap_or_else(|| {
        calculate_fit_quality(
            &values,
            dist_info.distribution_type,
            numeric_stats.mean,
            numeric_stats.std,
        )
    });

    let outliers = compute_outlier_analysis(series, numeric_stats);

    let percentiles = PercentileBreakdown {
        p1: numeric_stats
            .percentiles
            .get(&1)
            .copied()
            .unwrap_or(f64::NAN),
        p5: numeric_stats
            .percentiles
            .get(&5)
            .copied()
            .unwrap_or(f64::NAN),
        p25: numeric_stats.q25,
        p50: numeric_stats.median,
        p75: numeric_stats.q75,
        p95: numeric_stats
            .percentiles
            .get(&95)
            .copied()
            .unwrap_or(f64::NAN),
        p99: numeric_stats
            .percentiles
            .get(&99)
            .copied()
            .unwrap_or(f64::NAN),
    };

    DistributionAnalysis {
        column_name: column_name.to_string(),
        distribution_type: dist_info.distribution_type,
        confidence: dist_info.confidence,
        fit_quality,
        characteristics,
        outliers,
        percentiles,
        sorted_sample_values,
        is_sampled,
        sample_size: actual_sample_size,
        all_distribution_pvalues: dist_info.all_distribution_pvalues.clone(),
    }
}

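/// Estimates the mode of a continuous sample with a coarse histogram: the
/// range is split into at most 50 bins and the mean of the fullest bin is
/// returned.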
fn compute_mode(values: &[f64]) -> Option<f64> {
    if values.is_empty() {
        return None;
    }

    let min = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let range = max - min;

    if range == 0.0 {
        return Some(min);
    }

    let bins = 50.min(values.len());
    let mut bin_counts = vec![0; bins];
    let mut bin_sums = vec![0.0; bins];

    for &v in values {
        let bin = (((v - min) / range) * (bins - 1) as f64) as usize;
        let bin = bin.min(bins - 1);
        bin_counts[bin] += 1;
        bin_sums[bin] += v;
    }

    let max_bin = bin_counts
        .iter()
        .enumerate()
        .max_by_key(|(_, &count)| count)
        .map(|(idx, _)| idx);

    max_bin.map(|idx| bin_sums[idx] / bin_counts[idx] as f64)
}

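/// Dispatches to the per-distribution fit-quality estimator. Most arms return
/// an approximate goodness-of-fit p-value in [0, 1], so scores are roughly
/// comparable across distribution types.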
pub fn calculate_fit_quality(
    values: &[f64],
    dist_type: DistributionType,
    mean: f64,
    std: f64,
) -> f64 {
    match dist_type {
        DistributionType::Normal => calculate_normal_fit_quality(values, mean, std),
        DistributionType::LogNormal => calculate_lognormal_fit_quality(values),
        DistributionType::Uniform => calculate_uniform_fit_quality(values),
        DistributionType::PowerLaw => calculate_power_law_fit_quality(values),
        DistributionType::Exponential => calculate_exponential_fit_quality(values),
        DistributionType::Beta => calculate_beta_fit_quality(values),
        DistributionType::Gamma => calculate_gamma_fit_quality(values),
        DistributionType::ChiSquared => calculate_chi_squared_fit_quality(values),
        DistributionType::StudentsT => calculate_students_t_fit_quality(values),
        DistributionType::Poisson => calculate_poisson_fit_quality(values),
        DistributionType::Bernoulli => calculate_bernoulli_fit_quality(values),
        DistributionType::Binomial => calculate_binomial_fit_quality(values),
        DistributionType::Geometric => {
            if values.len() > 10000 {
                return 0.0;
            }
            calculate_geometric_fit_quality(values)
        }
        DistributionType::Weibull => calculate_weibull_fit_quality(values),
        DistributionType::Unknown => 0.5,
    }
}

fn calculate_normal_fit_quality(values: &[f64], mean: f64, std: f64) -> f64 {
    if values.is_empty() || std == 0.0 {
        return 0.0;
    }

    let ks_stat = kolmogorov_smirnov_test(values, |x| normal_cdf(x, mean, std));
    let n = values.len();
    approximate_ks_pvalue(ks_stat, n)
}

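/// Inverse standard-normal CDF via the Abramowitz & Stegun 26.2.23 rational
/// approximation (absolute error below 4.5e-4 on the open unit interval).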
fn normal_quantile(p: f64) -> f64 {
    if p < 0.5 {
        -normal_quantile(1.0 - p)
    } else {
        let t = (-2.0 * (1.0 - p).ln()).sqrt();
        t - (2.515517 + 0.802853 * t + 0.010328 * t * t)
            / (1.0 + 1.432788 * t + 0.189269 * t * t + 0.001308 * t * t * t)
    }
}

fn calculate_lognormal_fit_quality(values: &[f64]) -> f64 {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    let variance: f64 = positive_values
        .iter()
        .map(|v| (v - mean).powi(2))
        .sum::<f64>()
        / (positive_values.len() - 1) as f64;

    if mean <= 0.0 || variance <= 0.0 {
        return 0.0;
    }

    // Method-of-moments estimates for the underlying normal parameters.
    let sigma_sq = (1.0 + variance / (mean * mean)).ln();
    let mu = mean.ln() - sigma_sq / 2.0;
    let sigma = sigma_sq.sqrt();

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| lognormal_cdf(x, mu, sigma));
    let n = positive_values.len();
    approximate_ks_pvalue(ks_stat, n)
}

fn calculate_uniform_fit_quality(values: &[f64]) -> f64 {
    if let Some(chi_square) = calculate_chi_square_uniformity(values) {
        let base_fit = 1.0 / (1.0 + chi_square / 17.0);
        base_fit.clamp(0.01, 1.0)
    } else {
        0.0
    }
}

fn calculate_power_law_fit_quality(values: &[f64]) -> f64 {
    test_power_law(values)
}

fn calculate_exponential_fit_quality(values: &[f64]) -> f64 {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    if mean <= 0.0 {
        return 0.0;
    }
    let lambda = 1.0 / mean;

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| exponential_cdf(x, lambda));
    let n = positive_values.len();
    approximate_ks_pvalue(ks_stat, n)
}

fn calculate_beta_fit_quality(values: &[f64]) -> f64 {
    test_beta(values)
}

fn calculate_gamma_fit_quality(values: &[f64]) -> f64 {
    test_gamma(values)
}

fn calculate_chi_squared_fit_quality(values: &[f64]) -> f64 {
    test_chi_squared(values)
}

fn calculate_students_t_fit_quality(values: &[f64]) -> f64 {
    test_students_t(values)
}

fn calculate_poisson_fit_quality(values: &[f64]) -> f64 {
    test_poisson(values)
}

fn calculate_bernoulli_fit_quality(values: &[f64]) -> f64 {
    test_bernoulli(values)
}

fn calculate_binomial_fit_quality(values: &[f64]) -> f64 {
    test_binomial(values)
}

fn calculate_geometric_fit_quality(values: &[f64]) -> f64 {
    if values.len() > 10000 {
        return 0.0;
    }
    test_geometric(values)
}

fn calculate_weibull_fit_quality(values: &[f64]) -> f64 {
    test_weibull(values)
}

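/// Normal CDF built on the Abramowitz & Stegun 7.1.26 polynomial
/// approximation of erf (maximum absolute error about 1.5e-7).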
fn normal_cdf(x: f64, mean: f64, std: f64) -> f64 {
    if std <= 0.0 {
        return if x < mean { 0.0 } else { 1.0 };
    }
    let z = (x - mean) / std;
    let z_normalized = z / std::f64::consts::SQRT_2;
    let t = 1.0 / (1.0 + 0.3275911 * z_normalized.abs());
    let a1 = 0.254829592;
    let a2 = -0.284496736;
    let a3 = 1.421413741;
    let a4 = -1.453152027;
    let a5 = 1.061405429;
    let erf_approx = 1.0
        - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1)
            * t
            * (-z_normalized * z_normalized).exp();
    let erf_val = if z_normalized >= 0.0 {
        erf_approx
    } else {
        -erf_approx
    };
    0.5 * (1.0 + erf_val)
}

fn lognormal_cdf(x: f64, mu: f64, sigma: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if sigma <= 0.0 {
        return if x < mu.exp() { 0.0 } else { 1.0 };
    }
    normal_cdf(x.ln(), mu, sigma)
}

fn exponential_cdf(x: f64, lambda: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if lambda <= 0.0 {
        return 1.0;
    }
    1.0 - (-lambda * x).exp()
}

fn powerlaw_cdf(x: f64, xmin: f64, alpha: f64) -> f64 {
    if x < xmin {
        return 0.0;
    }
    // Degenerate parameters: treat all mass as sitting at or above xmin.
    if alpha <= 1.0 || xmin <= 0.0 {
        return 1.0;
    }
    1.0 - (x / xmin).powf(-alpha + 1.0)
}

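/// Approximates ln Gamma(z) with Stirling's expansion, shifting small
/// arguments upward first via ln Gamma(z) = ln Gamma(z + 1) - ln z.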
fn ln_gamma_approx(z: f64) -> f64 {
    if z <= 0.0 {
        return f64::NAN;
    }
    if z > 50.0 {
        (z - 0.5) * z.ln() - z + 0.5 * (2.0 * std::f64::consts::PI).ln() + 1.0 / (12.0 * z)
    } else {
        let mut result = 0.0;
        let mut z_val = z;
        let max_iter = 100;
        let mut iter = 0;
        while z_val < 50.0 && iter < max_iter {
            result -= z_val.ln();
            z_val += 1.0;
            iter += 1;
        }
        if z_val >= 50.0 {
            result
                + ((z_val - 0.5) * z_val.ln() - z_val
                    + 0.5 * (2.0 * std::f64::consts::PI).ln()
                    + 1.0 / (12.0 * z_val))
        } else {
            result
        }
    }
}

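/// Rough beta CDF. Large alpha + beta falls back to a normal approximation;
/// the small-parameter branch is a crude single-term heuristic rather than
/// the regularized incomplete beta function, so treat its output as
/// indicative only.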
fn beta_cdf(x: f64, alpha: f64, beta: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if x >= 1.0 {
        return 1.0;
    }
    if alpha <= 0.0 || beta <= 0.0 {
        return 0.0;
    }
    if alpha + beta > 50.0 {
        let mean = alpha / (alpha + beta);
        let variance = (alpha * beta) / ((alpha + beta).powi(2) * (alpha + beta + 1.0));
        if variance > 0.0 {
            normal_cdf(x, mean, variance.sqrt())
        } else if x < mean {
            0.0
        } else {
            1.0
        }
    } else {
        let ln_beta =
            ln_gamma_approx(alpha) + ln_gamma_approx(beta) - ln_gamma_approx(alpha + beta);
        let beta_const = ln_beta.exp();
        if beta_const > 0.0 {
            let integrand = x.powf(alpha) * (1.0 - x).powf(beta) / beta_const;
            integrand.clamp(0.0, 1.0)
        } else {
            let mean = alpha / (alpha + beta);
            let variance = (alpha * beta) / ((alpha + beta).powi(2) * (alpha + beta + 1.0));
            if variance > 0.0 {
                normal_cdf(x, mean, variance.sqrt())
            } else if x < mean {
                0.0
            } else {
                1.0
            }
        }
    }
}

pub(crate) fn beta_pdf(x: f64, alpha: f64, beta: f64) -> f64 {
    if x <= 0.0 || x >= 1.0 || alpha <= 0.0 || beta <= 0.0 {
        return 0.0;
    }
    let ln_x = x.ln();
    let ln_one_minus_x = (1.0 - x).ln();
    let ln_beta = ln_gamma_approx(alpha) + ln_gamma_approx(beta) - ln_gamma_approx(alpha + beta);
    let ln_pdf = (alpha - 1.0) * ln_x + (beta - 1.0) * ln_one_minus_x - ln_beta;
    if ln_pdf < -700.0 {
        return 0.0;
    }
    if ln_pdf > 700.0 {
        return f64::INFINITY;
    }
    ln_pdf.exp()
}

pub(crate) fn gamma_cdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    if shape > 30.0 {
        let mean = shape * scale;
        let variance = shape * scale * scale;
        if variance > 0.0 {
            normal_cdf(x, mean, variance.sqrt())
        } else if x < mean {
            0.0
        } else {
            1.0
        }
    } else {
        // Poisson-sum form of the CDF; exact for integer shapes, an
        // approximation otherwise.
        let z = x / scale;
        let sum: f64 = (0..(shape as usize * 10).min(100))
            .map(|n| {
                if (n as f64) < shape {
                    (-z).exp() * z.powi(n as i32) / (1..=n).map(|i| i as f64).product::<f64>()
                } else {
                    0.0
                }
            })
            .sum();
        (1.0 - sum).clamp(0.0, 1.0)
    }
}

pub(crate) fn gamma_pdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 || shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    let ln_pdf = (shape - 1.0) * x.ln() - x / scale - shape * scale.ln() - ln_gamma_approx(shape);
    if ln_pdf < -700.0 {
        return 0.0;
    }
    if ln_pdf > 700.0 {
        return f64::INFINITY;
    }
    ln_pdf.exp()
}

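/// Gamma quantile by bisection on `gamma_cdf`, with a normal approximation
/// for large shapes and an exponential fallback for very small ones.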
pub(crate) fn gamma_quantile(p: f64, shape: f64, scale: f64) -> f64 {
    let p = p.clamp(0.0, 1.0);
    if p <= 0.0 {
        return 0.0;
    }
    if p >= 1.0 {
        let mean = shape * scale;
        let std = (shape * scale * scale).sqrt();
        return mean + 5.0 * std;
    }
    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }

    let mean = shape * scale;
    let std = (shape * scale * scale).sqrt();

    if shape > 30.0 {
        let z = normal_quantile(p);
        return (mean + z * std).max(0.0);
    }

    if shape < 0.1 {
        let exp_scale = mean;
        if exp_scale > 0.0 {
            return -exp_scale * (1.0 - p).ln();
        } else {
            let z = normal_quantile(p);
            return (mean + z * std.max(mean * 0.1)).max(0.0);
        }
    }

    let mut low = 0.0;
    let mut high = mean + 5.0 * std;

    while gamma_cdf(high, shape, scale) < p {
        high *= 2.0;
        if high > 1e10 {
            let z = normal_quantile(p);
            return (mean + z * std).max(0.0);
        }
    }

    let tolerance = 1e-6;
    let mut mid = (low + high) / 2.0;
    for _ in 0..100 {
        let cdf_val = gamma_cdf(mid, shape, scale);
        if (cdf_val - p).abs() < tolerance {
            return mid;
        }
        if cdf_val < p {
            low = mid;
        } else {
            high = mid;
        }
        mid = (low + high) / 2.0;
    }

    mid
}

fn chi_squared_cdf(x: f64, df: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if df <= 0.0 {
        return 0.0;
    }
    // Chi-squared with df degrees of freedom is Gamma(df / 2, 2).
    gamma_cdf(x, df / 2.0, 2.0)
}

pub(crate) fn chi_squared_pdf(x: f64, df: f64) -> f64 {
    if x <= 0.0 || df <= 0.0 {
        return 0.0;
    }
    gamma_pdf(x, df / 2.0, 2.0)
}

fn students_t_cdf(x: f64, df: f64) -> f64 {
    if df <= 0.0 {
        return 0.5;
    }
    if df > 30.0 {
        normal_cdf(x, 0.0, 1.0)
    } else {
        // Crude approximation: shrink x slightly for small df, then use the
        // standard normal CDF.
        let z = x * (1.0 - 1.0 / (4.0 * df));
        normal_cdf(z, 0.0, 1.0)
    }
}

pub(crate) fn students_t_pdf(x: f64, df: f64) -> f64 {
    if df <= 0.0 {
        return 0.0;
    }
    if df > 30.0 {
        // Large df: fall back to the standard normal density.
        let z = x;
        let pdf = (1.0 / (2.0 * std::f64::consts::PI).sqrt()) * (-0.5 * z * z).exp();
        return pdf;
    }
    let x_sq_over_df = (x * x) / df;
    let exponent = -(df + 1.0) / 2.0;
    let power_term = (1.0 + x_sq_over_df).powf(exponent);
    let ln_gamma_half_df_plus_one = ln_gamma_approx((df + 1.0) / 2.0);
    let ln_gamma_half_df = ln_gamma_approx(df / 2.0);
    let ln_const =
        ln_gamma_half_df_plus_one - ln_gamma_half_df - 0.5 * (df * std::f64::consts::PI).ln();
    let const_term = ln_const.exp();
    let pdf = const_term * power_term;
    if pdf.is_finite() && pdf > 0.0 {
        pdf
    } else {
        0.0
    }
}

fn poisson_cdf(x: f64, lambda: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if lambda <= 0.0 {
        return 1.0;
    }
    if lambda > 20.0 {
        normal_cdf(x, lambda, lambda.sqrt())
    } else {
        let k_max = x.floor() as usize;
        let mut cdf = 0.0;
        let mut factorial = 1.0;
        for k in 0..=k_max.min(100) {
            if k > 0 {
                factorial *= k as f64;
            }
            let ln_pmf = (k as f64) * lambda.ln() - lambda - factorial.ln();
            let pmf = ln_pmf.exp();
            cdf += pmf;
            if cdf > 1.0 {
                break;
            }
        }
        cdf.min(1.0)
    }
}

fn bernoulli_cdf(x: f64, p: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if x >= 1.0 {
        return 1.0;
    }
    if p < 0.0 {
        return 0.0;
    }
    if p > 1.0 {
        return 1.0;
    }
    // For 0 <= x < 1 the CDF equals P(X = 0) = 1 - p.
    1.0 - p
}

fn binomial_coeff(n: usize, k: usize) -> f64 {
    if k > n {
        0.0
    } else if k == 0 || k == n {
        1.0
    } else {
        // Multiplicative formula, using the symmetric smaller k to limit
        // intermediate growth.
        let k = k.min(n - k);
        (1..=k).map(|i| (n - k + i) as f64 / i as f64).product()
    }
}

fn binomial_cdf(x: f64, n: usize, p: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    // Degenerate cases: p = 0 puts all mass at 0, p = 1 puts all mass at n.
    if p <= 0.0 {
        return 1.0;
    }
    if p >= 1.0 {
        return if x >= n as f64 { 1.0 } else { 0.0 };
    }
    if n > 50 {
        let mean = n as f64 * p;
        let variance = n as f64 * p * (1.0 - p);
        if variance > 0.0 {
            // Normal approximation with a continuity correction.
            normal_cdf(x + 0.5, mean, variance.sqrt())
        } else if x < mean {
            0.0
        } else {
            1.0
        }
    } else {
        let k_max = x.floor() as usize;
        let mut cdf = 0.0;
        for k in 0..=k_max.min(n) {
            let coeff = binomial_coeff(n, k);
            let pmf = coeff * p.powi(k as i32) * (1.0 - p).powi((n - k) as i32);
            cdf += pmf;
        }
        cdf.min(1.0)
    }
}

fn geometric_cdf(x: f64, p: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if p <= 0.0 || p >= 1.0 {
        return if x >= 0.0 && p >= 1.0 { 1.0 } else { 0.0 };
    }

    // Failures-before-first-success convention: CDF(k) = 1 - (1 - p)^(k + 1).
    let k = x.floor().min(50.0);
    let log_one_minus_p = (1.0 - p).ln();
    if log_one_minus_p.is_nan() || log_one_minus_p.is_infinite() {
        return if x >= 0.0 { 1.0 } else { 0.0 };
    }

    let exponent = (k + 1.0) * log_one_minus_p;

    if exponent < -50.0 {
        return 1.0;
    }

    let one_minus_p_power = exponent.exp();
    let result = 1.0 - one_minus_p_power;
    result.clamp(0.0, 1.0)
}

pub(crate) fn geometric_pmf(x: f64, p: f64) -> f64 {
    if x < 0.0 || p <= 0.0 || p >= 1.0 {
        return 0.0;
    }
    let k = x.floor().min(100.0);
    let log_p = p.ln();
    let log_one_minus_p = (1.0 - p).ln();
    if log_p.is_nan()
        || log_p.is_infinite()
        || log_one_minus_p.is_nan()
        || log_one_minus_p.is_infinite()
    {
        return 0.0;
    }
    let ln_pmf = log_p + k * log_one_minus_p;
    if ln_pmf < -700.0 {
        return 0.0;
    }
    if ln_pmf > 700.0 {
        return f64::INFINITY;
    }
    ln_pmf.exp()
}

pub(crate) fn geometric_quantile(p: f64, p_param: f64) -> f64 {
    let p = p.clamp(0.0, 1.0);
    if p <= 0.0 {
        return 0.0;
    }
    if p >= 1.0 {
        return 100.0;
    }
    if p_param <= 0.0 || p_param >= 1.0 {
        return 0.0;
    }

    let log_one_minus_p = (1.0 - p).ln();
    let log_one_minus_p_param = (1.0 - p_param).ln();

    if log_one_minus_p.is_nan()
        || log_one_minus_p.is_infinite()
        || log_one_minus_p_param.is_nan()
        || log_one_minus_p_param.is_infinite()
        || log_one_minus_p_param.abs() < 1e-10
    {
        return geometric_quantile_binary_search(p, p_param);
    }

    let quantile = log_one_minus_p / log_one_minus_p_param;
    quantile.clamp(0.0, 100.0)
}

fn geometric_quantile_binary_search(p: f64, p_param: f64) -> f64 {
    let mut low = 0.0;
    let mut high = 100.0;

    while geometric_cdf(high, p_param) < p {
        high *= 2.0;
        if high > 1000.0 {
            return 100.0;
        }
    }

    let tolerance = 1e-6;
    let mut mid = (low + high) / 2.0;
    for _ in 0..100 {
        let cdf_val = geometric_cdf(mid, p_param);
        if (cdf_val - p).abs() < tolerance {
            return mid;
        }
        if cdf_val < p {
            low = mid;
        } else {
            high = mid;
        }
        mid = (low + high) / 2.0;
    }

    mid
}

fn weibull_cdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    1.0 - (-(x / scale).powf(shape)).exp()
}

pub(crate) fn weibull_pdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 || shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    let ratio = x / scale;
    let power = ratio.powf(shape);
    let pdf = (shape / scale) * ratio.powf(shape - 1.0) * (-power).exp();
    if pdf.is_finite() {
        pdf
    } else {
        0.0
    }
}

pub fn calculate_theoretical_probability_in_interval(
    dist: &DistributionAnalysis,
    dist_type: DistributionType,
    lower: f64,
    upper: f64,
) -> f64 {
    let mean = dist.characteristics.mean;
    let std = dist.characteristics.std_dev;
    let sorted_data = &dist.sorted_sample_values;

    match dist_type {
        DistributionType::Normal => {
            let cdf_upper = normal_cdf(upper, mean, std);
            let cdf_lower = normal_cdf(lower, mean, std);
            cdf_upper - cdf_lower
        }
        DistributionType::LogNormal => {
            if sorted_data.is_empty() || !sorted_data.iter().all(|&v| v > 0.0) {
                0.0
            } else {
                let e_x = mean;
                let var_x = std * std;
                let sigma_sq = (1.0 + var_x / (e_x * e_x)).ln();
                let mu = e_x.ln() - sigma_sq / 2.0;
                let sigma = sigma_sq.sqrt();

                if lower > 0.0 && upper > 0.0 {
                    let cdf_upper = lognormal_cdf(upper, mu, sigma);
                    let cdf_lower = lognormal_cdf(lower, mu, sigma);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
        }
        DistributionType::Uniform => {
            if sorted_data.is_empty() {
                0.0
            } else {
                let data_min = sorted_data[0];
                let data_max = sorted_data[sorted_data.len() - 1];
                let data_range = data_max - data_min;
                if data_range > 0.0 {
                    (upper - lower) / data_range
                } else {
                    0.0
                }
            }
        }
        DistributionType::Exponential => {
            if mean > 0.0 {
                let lambda = 1.0 / mean;
                let cdf_upper = exponential_cdf(upper, lambda);
                let cdf_lower = exponential_cdf(lower, lambda);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::PowerLaw => {
            if sorted_data.is_empty() || !sorted_data.iter().any(|&v| v > 0.0) {
                0.0
            } else {
                let positive_values: Vec<f64> =
                    sorted_data.iter().filter(|&&v| v > 0.0).copied().collect();
                if positive_values.is_empty() {
                    0.0
                } else {
                    let xmin = positive_values[0];
                    let n_pos = positive_values.len();
                    if n_pos < 2 || xmin <= 0.0 {
                        0.0
                    } else {
                        let sum_log = positive_values
                            .iter()
                            .map(|&x| (x / xmin).ln())
                            .sum::<f64>();
                        if sum_log > 0.0 {
                            // Continuous power-law MLE for the exponent.
                            let alpha = 1.0 + (n_pos as f64) / sum_log;
                            let cdf_upper = powerlaw_cdf(upper, xmin, alpha);
                            let cdf_lower = powerlaw_cdf(lower, xmin, alpha);
                            cdf_upper - cdf_lower
                        } else {
                            0.0
                        }
                    }
                }
            }
        }
        DistributionType::Beta => {
            let mean_val = mean;
            let variance = std * std;
            if mean_val > 0.0 && mean_val < 1.0 && variance > 0.0 {
                let max_var = mean_val * (1.0 - mean_val);
                if variance < max_var {
                    // Method-of-moments estimates for alpha and beta.
                    let sum = mean_val * (1.0 - mean_val) / variance - 1.0;
                    let alpha = mean_val * sum;
                    let beta = (1.0 - mean_val) * sum;
                    if alpha > 0.0 && beta > 0.0 {
                        let cdf_upper = beta_cdf(upper, alpha, beta);
                        let cdf_lower = beta_cdf(lower, alpha, beta);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        DistributionType::Gamma => {
            if mean > 0.0 && std > 0.0 {
                let variance = std * std;
                let shape = (mean * mean) / variance;
                let scale = variance / mean;
                if shape > 0.0 && scale > 0.0 {
                    let cdf_upper = gamma_cdf(upper, shape, scale);
                    let cdf_lower = gamma_cdf(lower, shape, scale);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        DistributionType::ChiSquared => {
            // The mean of a chi-squared distribution equals its df.
            let df = mean;
            if df > 0.0 {
                let cdf_upper = chi_squared_cdf(upper, df);
                let cdf_lower = chi_squared_cdf(lower, df);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::StudentsT => {
            // Invert Var(T) = df / (df - 2) to recover df from the variance.
            let variance = std * std;
            let df = if variance > 1.0 {
                2.0 * variance / (variance - 1.0)
            } else {
                30.0
            };
            let cdf_upper = students_t_cdf(upper, df);
            let cdf_lower = students_t_cdf(lower, df);
            cdf_upper - cdf_lower
        }
        DistributionType::Poisson => {
            let lambda = mean;
            if lambda > 0.0 {
                let cdf_upper = poisson_cdf(upper, lambda);
                let cdf_lower = poisson_cdf(lower, lambda);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::Bernoulli => {
            let p = mean;
            let cdf_upper = bernoulli_cdf(upper, p);
            let cdf_lower = bernoulli_cdf(lower, p);
            cdf_upper - cdf_lower
        }
        DistributionType::Binomial => {
            let sorted_data = &dist.sorted_sample_values;
            if !sorted_data.is_empty() {
                let max_val = sorted_data[sorted_data.len() - 1];
                let n = max_val.floor() as usize;
                let p = if n > 0 { mean / n as f64 } else { 0.5 };
                if n > 0 && p > 0.0 && p < 1.0 {
                    let cdf_upper = binomial_cdf(upper, n, p);
                    let cdf_lower = binomial_cdf(lower, n, p);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        DistributionType::Geometric => {
            let mean_val = mean;
            if mean_val > 0.0 {
                let p = 1.0 / (mean_val + 1.0);
                let cdf_upper = geometric_cdf(upper, p);
                let cdf_lower = geometric_cdf(lower, p);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::Weibull => {
            if mean > 0.0 && std > 0.0 {
                let cv = std / mean;
                let shape = if cv < 1.0 { 1.0 / cv } else { 1.0 };
                // Moment matching: E[X] = scale * Gamma(1 + 1/shape).
                let gamma_1_plus_1_over_shape = ln_gamma_approx(1.0 + 1.0 / shape).exp();
                let scale = mean / gamma_1_plus_1_over_shape;
                if shape > 0.0 && scale > 0.0 {
                    let cdf_upper = weibull_cdf(upper, shape, scale);
                    let cdf_lower = weibull_cdf(lower, shape, scale);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        _ => 0.0,
    }
}

pub fn calculate_theoretical_bin_probabilities(
    dist: &DistributionAnalysis,
    dist_type: DistributionType,
    bin_boundaries: &[f64],
) -> Vec<f64> {
    if bin_boundaries.len() < 2 {
        return vec![];
    }

    let mean = dist.characteristics.mean;
    let std = dist.characteristics.std_dev;
    let sorted_data = &dist.sorted_sample_values;

    let mut probabilities = Vec::new();

    for i in 0..(bin_boundaries.len() - 1) {
        let lower = bin_boundaries[i];
        let upper = bin_boundaries[i + 1];

        let prob = match dist_type {
            DistributionType::Normal => {
                let cdf_upper = normal_cdf(upper, mean, std);
                let cdf_lower = normal_cdf(lower, mean, std);
                cdf_upper - cdf_lower
            }
            DistributionType::LogNormal => {
                if sorted_data.is_empty() || !sorted_data.iter().all(|&v| v > 0.0) {
                    0.0
                } else {
                    let e_x = mean;
                    let var_x = std * std;
                    let sigma_sq = (1.0 + var_x / (e_x * e_x)).ln();
                    let mu = e_x.ln() - sigma_sq / 2.0;
                    let sigma = sigma_sq.sqrt();

                    if lower > 0.0 && upper > 0.0 {
                        let cdf_upper = lognormal_cdf(upper, mu, sigma);
                        let cdf_lower = lognormal_cdf(lower, mu, sigma);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                }
            }
            DistributionType::Uniform => {
                if sorted_data.is_empty() {
                    0.0
                } else {
                    let data_min = sorted_data[0];
                    let data_max = sorted_data[sorted_data.len() - 1];
                    let data_range = data_max - data_min;
                    if data_range > 0.0 {
                        (upper - lower) / data_range
                    } else if i == 0 {
                        // Degenerate range: put all mass in the first bin.
                        1.0
                    } else {
                        0.0
                    }
                }
            }
            DistributionType::Exponential => {
                if mean > 0.0 {
                    let lambda = 1.0 / mean;
                    let cdf_upper = exponential_cdf(upper, lambda);
                    let cdf_lower = exponential_cdf(lower, lambda);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
            DistributionType::PowerLaw => {
                if sorted_data.is_empty() || !sorted_data.iter().any(|&v| v > 0.0) {
                    0.0
                } else {
                    let positive_values: Vec<f64> =
                        sorted_data.iter().filter(|&&v| v > 0.0).copied().collect();
                    if positive_values.is_empty() {
                        0.0
                    } else {
                        let xmin = positive_values[0];
                        let n_pos = positive_values.len();
                        if n_pos < 2 || xmin <= 0.0 {
                            0.0
                        } else {
                            let sum_log = positive_values
                                .iter()
                                .map(|&x| (x / xmin).ln())
                                .sum::<f64>();
                            let alpha = if sum_log > 0.0 {
                                1.0 + (n_pos as f64) / sum_log
                            } else {
                                // Fallback exponent when the MLE is undefined.
                                2.5
                            };

                            if lower >= xmin && alpha > 1.0 {
                                let cdf_upper = powerlaw_cdf(upper, xmin, alpha);
                                let cdf_lower = powerlaw_cdf(lower, xmin, alpha);
                                cdf_upper - cdf_lower
                            } else {
                                0.0
                            }
                        }
                    }
                }
            }
            DistributionType::Beta => {
                let mean_val = mean;
                let variance = std * std;
                if mean_val > 0.0 && mean_val < 1.0 && variance > 0.0 {
                    let max_var = mean_val * (1.0 - mean_val);
                    if variance < max_var {
                        let sum = mean_val * (1.0 - mean_val) / variance - 1.0;
                        let alpha = mean_val * sum;
                        let beta = (1.0 - mean_val) * sum;
                        if alpha > 0.0 && beta > 0.0 {
                            let cdf_upper = beta_cdf(upper, alpha, beta);
                            let cdf_lower = beta_cdf(lower, alpha, beta);
                            cdf_upper - cdf_lower
                        } else {
                            0.0
                        }
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::Gamma => {
                if mean > 0.0 && std > 0.0 {
                    let variance = std * std;
                    let shape = (mean * mean) / variance;
                    let scale = variance / mean;
                    if shape > 0.0 && scale > 0.0 {
                        let cdf_upper = gamma_cdf(upper, shape, scale);
                        let cdf_lower = gamma_cdf(lower, shape, scale);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::ChiSquared => {
                let df = mean;
                if df > 0.0 {
                    let cdf_upper = chi_squared_cdf(upper, df);
                    let cdf_lower = chi_squared_cdf(lower, df);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
            DistributionType::StudentsT => {
                let variance = std * std;
                let df = if variance > 1.0 {
                    2.0 * variance / (variance - 1.0)
                } else {
                    30.0
                };
                let cdf_upper = students_t_cdf(upper, df);
                let cdf_lower = students_t_cdf(lower, df);
                cdf_upper - cdf_lower
            }
            DistributionType::Poisson => {
                let lambda = mean;
                if lambda > 0.0 {
                    let cdf_upper = poisson_cdf(upper, lambda);
                    let cdf_lower = poisson_cdf(lower, lambda);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
            DistributionType::Bernoulli => {
                let p = mean;
                let cdf_upper = bernoulli_cdf(upper, p);
                let cdf_lower = bernoulli_cdf(lower, p);
                cdf_upper - cdf_lower
            }
            DistributionType::Binomial => {
                if !sorted_data.is_empty() {
                    let max_val = sorted_data[sorted_data.len() - 1];
                    let n = max_val.floor() as usize;
                    let p = if n > 0 { mean / n as f64 } else { 0.5 };
                    if n > 0 && p > 0.0 && p < 1.0 {
                        let cdf_upper = binomial_cdf(upper, n, p);
                        let cdf_lower = binomial_cdf(lower, n, p);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::Geometric => {
                let mean_val = mean;
                if mean_val > 0.0 && mean_val <= 20.0 {
                    let p = 1.0 / (mean_val + 1.0);
                    let upper_capped = upper.min(50.0);
                    let lower_capped = lower.clamp(0.0, 50.0);
                    let cdf_upper = geometric_cdf(upper_capped, p);
                    let cdf_lower = geometric_cdf(lower_capped, p);
                    (cdf_upper - cdf_lower).max(0.0)
                } else {
                    0.0
                }
            }
            DistributionType::Weibull => {
                if mean > 0.0 && std > 0.0 {
                    let cv = std / mean;
                    let shape = if cv < 1.0 { 1.0 / cv } else { 1.0 };
                    // Moment matching: E[X] = scale * Gamma(1 + 1/shape).
                    let gamma_1_plus_1_over_shape = ln_gamma_approx(1.0 + 1.0 / shape).exp();
                    let scale = mean / gamma_1_plus_1_over_shape;
                    if shape > 0.0 && scale > 0.0 {
                        let cdf_upper = weibull_cdf(upper, shape, scale);
                        let cdf_lower = weibull_cdf(lower, shape, scale);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::Unknown => {
                if sorted_data.is_empty() {
                    0.0
                } else {
                    let data_min = sorted_data[0];
                    let data_max = sorted_data[sorted_data.len() - 1];
                    let data_range = data_max - data_min;
                    if data_range > 0.0 {
                        (upper - lower) / data_range
                    } else if i == 0 {
                        1.0
                    } else {
                        0.0
                    }
                }
            }
        };

        probabilities.push(prob.max(0.0));
    }

    probabilities
}

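/// Flags outliers with two complementary rules: values outside the 1.5 * IQR
/// fences and values with |z| > 3. Rows are sorted by absolute deviation from
/// the mean and capped at the 100 most extreme.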
fn compute_outlier_analysis(series: &Series, numeric_stats: &NumericStatistics) -> OutlierAnalysis {
    let values: Vec<f64> = get_numeric_values_as_f64(series);
    let q25 = numeric_stats.q25;
    let q75 = numeric_stats.q75;
    let mean = numeric_stats.mean;
    let std = numeric_stats.std;

    if q25.is_nan() || q75.is_nan() || std == 0.0 {
        return OutlierAnalysis {
            total_count: 0,
            percentage: 0.0,
            iqr_count: 0,
            zscore_count: 0,
            outlier_rows: Vec::new(),
        };
    }

    let iqr = q75 - q25;
    let lower_fence = q25 - 1.5 * iqr;
    let upper_fence = q75 + 1.5 * iqr;
    let z_threshold = 3.0;

    let mut outlier_rows = Vec::new();

    for (idx, &value) in values.iter().enumerate() {
        let mut methods = Vec::new();
        let mut z_score = None;
        let mut iqr_position = None;

        if value < lower_fence {
            methods.push(OutlierMethod::IQR);
            iqr_position = Some(IqrPosition::BelowLowerFence);
        } else if value > upper_fence {
            methods.push(OutlierMethod::IQR);
            iqr_position = Some(IqrPosition::AboveUpperFence);
        }

        if std > 0.0 {
            let z = (value - mean).abs() / std;
            z_score = Some(z);
            if z > z_threshold {
                methods.push(OutlierMethod::ZScore);
            }
        }

        if !methods.is_empty() {
            outlier_rows.push(OutlierRow {
                row_index: idx,
                column_value: value,
                context_data: HashMap::new(),
                detection_method: if methods.len() == 2 {
                    OutlierMethod::Both
                } else {
                    methods[0].clone()
                },
                z_score,
                iqr_position,
            });
        }
    }

    outlier_rows.sort_by(|a, b| {
        let a_dev = (a.column_value - mean).abs();
        let b_dev = (b.column_value - mean).abs();
        b_dev
            .partial_cmp(&a_dev)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    outlier_rows.truncate(100);

    let total_count = outlier_rows.len();
    let percentage = if values.is_empty() {
        0.0
    } else {
        (total_count as f64 / values.len() as f64) * 100.0
    };

    OutlierAnalysis {
        total_count,
        percentage,
        iqr_count: outlier_rows
            .iter()
            .filter(|r| matches!(r.detection_method, OutlierMethod::IQR | OutlierMethod::Both))
            .count(),
        zscore_count: outlier_rows
            .iter()
            .filter(|r| {
                matches!(
                    r.detection_method,
                    OutlierMethod::ZScore | OutlierMethod::Both
                )
            })
            .count(),
        outlier_rows,
    }
}

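/// Builds a pairwise Pearson correlation matrix over all numeric columns,
/// dropping rows where either column is null. Diagonal entries are 1.0; pairs
/// with fewer than 3 shared observations are reported as NaN.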
pub fn compute_correlation_matrix(df: &DataFrame) -> Result<CorrelationMatrix> {
    let schema = df.schema();
    let numeric_cols: Vec<String> = schema
        .iter()
        .filter(|(_, dtype)| is_numeric_type(dtype))
        .map(|(name, _)| name.to_string())
        .collect();

    if numeric_cols.len() < 2 {
        return Err(color_eyre::eyre::eyre!(
            "Need at least 2 numeric columns for correlation matrix"
        ));
    }

    let n = numeric_cols.len();
    let mut correlations = vec![vec![1.0; n]; n];
    let mut p_values = vec![vec![0.0; n]; n];
    let mut sample_sizes = vec![vec![0; n]; n];

    for i in 0..n {
        for j in (i + 1)..n {
            let col1 = df.column(&numeric_cols[i])?;
            let col2 = df.column(&numeric_cols[j])?;

            let mask = col1.is_not_null() & col2.is_not_null();
            let col1_clean = col1.filter(&mask)?;
            let col2_clean = col2.filter(&mask)?;

            let sample_size = col1_clean.len();
            sample_sizes[i][j] = sample_size;
            sample_sizes[j][i] = sample_size;

            if sample_size < 3 {
                correlations[i][j] = f64::NAN;
                correlations[j][i] = f64::NAN;
                continue;
            }

            let col1_series = col1_clean.as_materialized_series();
            let col2_series = col2_clean.as_materialized_series();
            let correlation = compute_pearson_correlation(col1_series, col2_series)?;
            correlations[i][j] = correlation;
            correlations[j][i] = correlation;

            let p_value = compute_correlation_p_value(correlation, sample_size);
            p_values[i][j] = p_value;
            p_values[j][i] = p_value;
        }
    }

    Ok(CorrelationMatrix {
        columns: numeric_cols,
        correlations,
        p_values: Some(p_values),
        sample_sizes,
    })
}

fn compute_pearson_correlation(col1: &Series, col2: &Series) -> Result<f64> {
    let values1: Vec<f64> = get_numeric_values_as_f64(col1);
    let values2: Vec<f64> = get_numeric_values_as_f64(col2);

    if values1.len() != values2.len() || values1.len() < 2 {
        return Err(color_eyre::eyre::eyre!("Invalid data for correlation"));
    }

    let mean1: f64 = values1.iter().sum::<f64>() / values1.len() as f64;
    let mean2: f64 = values2.iter().sum::<f64>() / values2.len() as f64;

    let numerator: f64 = values1
        .iter()
        .zip(values2.iter())
        .map(|(v1, v2)| (v1 - mean1) * (v2 - mean2))
        .sum();

    let var1: f64 = values1.iter().map(|v| (v - mean1).powi(2)).sum();
    let var2: f64 = values2.iter().map(|v| (v - mean2).powi(2)).sum();

    if var1 == 0.0 || var2 == 0.0 {
        return Ok(0.0);
    }

    let correlation = numerator / (var1.sqrt() * var2.sqrt());
    Ok(correlation)
}

fn compute_correlation_p_value(correlation: f64, n: usize) -> f64 {
    if n < 3 {
        return 1.0;
    }
    if correlation.abs() >= 1.0 {
        // A perfect correlation is maximally significant, not insignificant.
        return 0.0;
    }

    let t_statistic = correlation * ((n - 2) as f64 / (1.0 - correlation * correlation)).sqrt();

    // Two-sided p-value from a normal approximation to the t distribution
    // (the exact degrees of freedom are ignored; adequate for moderate n).
    let p_value = 2.0 * (1.0 - normal_cdf(t_statistic.abs(), 0.0, 1.0));
    p_value.clamp(0.0, 1.0)
}

pub fn compute_correlation_pair(
    df: &DataFrame,
    col1_name: &str,
    col2_name: &str,
) -> Result<CorrelationPair> {
    let col1 = df.column(col1_name)?;
    let col2 = df.column(col2_name)?;

    let mask = col1.is_not_null() & col2.is_not_null();
    let col1_clean = col1.filter(&mask)?;
    let col2_clean = col2.filter(&mask)?;

    let sample_size = col1_clean.len();
    if sample_size < 3 {
        return Err(color_eyre::eyre::eyre!("Not enough data for correlation"));
    }

    let col1_series = col1_clean.as_materialized_series();
    let col2_series = col2_clean.as_materialized_series();
    let correlation = compute_pearson_correlation(col1_series, col2_series)?;
    let p_value = Some(compute_correlation_p_value(correlation, sample_size));

    let mean1 = col1_series.mean().unwrap_or(0.0);
    let mean2 = col2_series.mean().unwrap_or(0.0);
    let values1: Vec<f64> = get_numeric_values_as_f64(col1_series);
    let values2: Vec<f64> = get_numeric_values_as_f64(col2_series);

    let covariance = if values1.len() == values2.len() {
        values1
            .iter()
            .zip(values2.iter())
            .map(|(v1, v2)| (v1 - mean1) * (v2 - mean2))
            .sum::<f64>()
            / (values1.len() - 1) as f64
    } else {
        0.0
    };

    let r_squared = correlation * correlation;

    let stats1 = ColumnStats {
        mean: mean1,
        std: col1_series.std(1).unwrap_or(0.0),
        min: col1_series
            .min::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
        max: col1_series
            .max::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
    };

    let stats2 = ColumnStats {
        mean: mean2,
        std: col2_series.std(1).unwrap_or(0.0),
        min: col2_series
            .min::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
        max: col2_series
            .max::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
    };

    Ok(CorrelationPair {
        column1: col1_name.to_string(),
        column2: col2_name.to_string(),
        correlation,
        p_value,
        sample_size,
        covariance,
        r_squared,
        stats1,
        stats2,
    })
}
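
// A minimal, illustrative test sketch for the correlation helpers above. The
// column names and data are made up for demonstration; `df!` comes from
// polars::prelude. Treat this as a usage sketch, not part of the API.
#[cfg(test)]
mod correlation_usage_sketch {
    use super::*;

    #[test]
    fn correlation_matrix_on_linear_data() {
        let df = df!(
            "x" => &[1.0, 2.0, 3.0, 4.0, 5.0],
            "y" => &[2.0, 4.0, 6.0, 8.0, 10.0]
        )
        .unwrap();
        let m = compute_correlation_matrix(&df).unwrap();
        assert_eq!(m.columns.len(), 2);
        // y = 2x exactly, so the off-diagonal correlation should be ~1.0.
        assert!((m.correlations[0][1] - 1.0).abs() < 1e-9);
    }
}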