use color_eyre::eyre::Report;
use color_eyre::Result;
use polars::polars_compute::rolling::QuantileMethod;
use polars::prelude::*;
use std::collections::HashMap;

pub fn collect_lazy(
    lf: LazyFrame,
    use_streaming: bool,
) -> std::result::Result<DataFrame, PolarsError> {
    #[cfg(feature = "streaming")]
    {
        if use_streaming {
            lf.with_new_streaming(true).collect()
        } else {
            lf.collect()
        }
    }
    #[cfg(not(feature = "streaming"))]
    {
        let _ = use_streaming;
        lf.collect()
    }
}
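
// A minimal usage sketch (hedged): the tiny frame below is hypothetical and
// only illustrates how `collect_lazy` is driven; when the `streaming` feature
// is disabled the second argument is simply ignored.
#[allow(dead_code)]
fn collect_lazy_example() -> std::result::Result<DataFrame, PolarsError> {
    let df = df!("x" => &[1i64, 2, 3])?;
    collect_lazy(df.lazy(), false)
}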

pub const SAMPLING_THRESHOLD: usize = 10_000;

#[derive(Clone)]
pub struct ColumnStatistics {
    pub name: String,
    pub dtype: DataType,
    pub count: usize,
    pub null_count: usize,
    pub numeric_stats: Option<NumericStatistics>,
    pub categorical_stats: Option<CategoricalStatistics>,
    pub distribution_info: Option<DistributionInfo>,
}

#[derive(Clone)]
pub struct NumericStatistics {
    pub mean: f64,
    pub std: f64,
    pub min: f64,
    pub max: f64,
    pub median: f64,
    pub q25: f64,
    pub q75: f64,
    pub percentiles: HashMap<u8, f64>,
    pub skewness: f64,
    pub kurtosis: f64,
    pub outliers_iqr: usize,
    pub outliers_zscore: usize,
}

#[derive(Clone)]
pub struct CategoricalStatistics {
    pub unique_count: usize,
    pub mode: Option<String>,
    pub top_values: Vec<(String, usize)>,
    pub min: Option<String>,
    pub max: Option<String>,
}

#[derive(Clone)]
pub struct DistributionInfo {
    pub distribution_type: DistributionType,
    pub confidence: f64,
    pub sample_size: usize,
    pub is_sampled: bool,
    pub fit_quality: Option<f64>,
    pub all_distribution_pvalues: HashMap<DistributionType, f64>,
}

#[derive(Clone)]
pub struct DistributionAnalysis {
    pub column_name: String,
    pub distribution_type: DistributionType,
    pub confidence: f64,
    pub fit_quality: f64,
    pub characteristics: DistributionCharacteristics,
    pub outliers: OutlierAnalysis,
    pub percentiles: PercentileBreakdown,
    pub sorted_sample_values: Vec<f64>,
    pub is_sampled: bool,
    pub sample_size: usize,
    pub all_distribution_pvalues: HashMap<DistributionType, f64>,
}

#[derive(Clone)]
pub struct DistributionCharacteristics {
    pub shapiro_wilk_stat: Option<f64>,
    pub shapiro_wilk_pvalue: Option<f64>,
    pub skewness: f64,
    pub kurtosis: f64,
    pub mean: f64,
    pub median: f64,
    pub std_dev: f64,
    pub variance: f64,
    pub coefficient_of_variation: f64,
    pub mode: Option<f64>,
}

#[derive(Clone)]
pub struct OutlierAnalysis {
    pub total_count: usize,
    pub percentage: f64,
    pub iqr_count: usize,
    pub zscore_count: usize,
    pub outlier_rows: Vec<OutlierRow>,
}

#[derive(Clone)]
pub struct OutlierRow {
    pub row_index: usize,
    pub column_value: f64,
    pub context_data: HashMap<String, String>,
    pub detection_method: OutlierMethod,
    pub z_score: Option<f64>,
    pub iqr_position: Option<IqrPosition>,
}

#[derive(Clone, Debug)]
pub enum OutlierMethod {
    IQR,
    ZScore,
    Both,
}

#[derive(Clone, Debug)]
pub enum IqrPosition {
    BelowLowerFence,
    AboveUpperFence,
}

#[derive(Clone)]
pub struct PercentileBreakdown {
    pub p1: f64,
    pub p5: f64,
    pub p25: f64,
    pub p50: f64,
    pub p75: f64,
    pub p95: f64,
    pub p99: f64,
}

#[derive(Clone)]
pub struct CorrelationMatrix {
    pub columns: Vec<String>,
    pub correlations: Vec<Vec<f64>>,
    pub p_values: Option<Vec<Vec<f64>>>,
    pub sample_sizes: Vec<Vec<usize>>,
}

#[derive(Clone)]
pub struct CorrelationPair {
    pub column1: String,
    pub column2: String,
    pub correlation: f64,
    pub p_value: Option<f64>,
    pub sample_size: usize,
    pub covariance: f64,
    pub r_squared: f64,
    pub stats1: ColumnStats,
    pub stats2: ColumnStats,
}

#[derive(Clone)]
pub struct ColumnStats {
    pub mean: f64,
    pub std: f64,
    pub min: f64,
    pub max: f64,
}

#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistributionType {
    #[default]
    Normal,
    LogNormal,
    Uniform,
    PowerLaw,
    Exponential,
    Beta,
    Gamma,
    ChiSquared,
    StudentsT,
    Poisson,
    Bernoulli,
    Binomial,
    Geometric,
    Weibull,
    Unknown,
}

impl std::fmt::Display for DistributionType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DistributionType::Normal => write!(f, "Normal"),
            DistributionType::LogNormal => write!(f, "Log-Normal"),
            DistributionType::Uniform => write!(f, "Uniform"),
            DistributionType::PowerLaw => write!(f, "Power Law"),
            DistributionType::Exponential => write!(f, "Exponential"),
            DistributionType::Beta => write!(f, "Beta"),
            DistributionType::Gamma => write!(f, "Gamma"),
            DistributionType::ChiSquared => write!(f, "Chi-Squared"),
            DistributionType::StudentsT => write!(f, "Student's t"),
            DistributionType::Poisson => write!(f, "Poisson"),
            DistributionType::Bernoulli => write!(f, "Bernoulli"),
            DistributionType::Binomial => write!(f, "Binomial"),
            DistributionType::Geometric => write!(f, "Geometric"),
            DistributionType::Weibull => write!(f, "Weibull"),
            DistributionType::Unknown => write!(f, "Unknown"),
        }
    }
}

#[derive(Clone)]
pub struct AnalysisResults {
    pub column_statistics: Vec<ColumnStatistics>,
    pub total_rows: usize,
    pub sample_size: Option<usize>,
    pub sample_seed: u64,
    pub correlation_matrix: Option<CorrelationMatrix>,
    pub distribution_analyses: Vec<DistributionAnalysis>,
}

pub struct AnalysisContext {
    pub has_query: bool,
    pub query: String,
    pub has_filters: bool,
    pub filter_count: usize,
    pub is_drilled_down: bool,
    pub group_key: Option<Vec<String>>,
    pub group_columns: Option<Vec<String>>,
}

#[derive(Debug, Clone, Copy)]
pub struct ComputeOptions {
    pub include_distribution_info: bool,
    pub include_distribution_analyses: bool,
    pub include_correlation_matrix: bool,
    pub include_skewness_kurtosis_outliers: bool,
    pub polars_streaming: bool,
}

impl Default for ComputeOptions {
    fn default() -> Self {
        Self {
            include_distribution_info: false,
            include_distribution_analyses: false,
            include_correlation_matrix: false,
            include_skewness_kurtosis_outliers: false,
            polars_streaming: true,
        }
    }
}

pub fn compute_statistics(
    lf: &LazyFrame,
    sample_size: Option<usize>,
    seed: u64,
) -> Result<AnalysisResults> {
    compute_statistics_with_options(lf, sample_size, seed, ComputeOptions::default())
}

pub fn compute_statistics_with_options(
    lf: &LazyFrame,
    sample_size: Option<usize>,
    seed: u64,
    options: ComputeOptions,
) -> Result<AnalysisResults> {
    let schema = lf.clone().collect_schema()?;
    let use_streaming = options.polars_streaming;
    let total_rows = {
        let count_df =
            collect_lazy(lf.clone().select([len()]), use_streaming).map_err(Report::from)?;
        if let Some(col) = count_df.get(0) {
            if let Some(AnyValue::UInt32(n)) = col.first() {
                *n as usize
            } else {
                0
            }
        } else {
            0
        }
    };

    let should_sample = sample_size.is_some_and(|t| total_rows >= t);
    let (sampling_threshold, actual_sample_size) = if should_sample {
        let t = sample_size.unwrap();
        (t, Some(t))
    } else {
        (0, None)
    };

    let df = if should_sample {
        sample_dataframe(lf, sampling_threshold, seed, use_streaming)?
    } else {
        collect_lazy(lf.clone(), use_streaming).map_err(Report::from)?
    };

    let mut column_statistics = Vec::new();

    for (name, dtype) in schema.iter() {
        let col = df.column(name)?;
        let series = col.as_materialized_series();
        let count = series.len();
        let null_count = series.null_count();

        let numeric_stats = if is_numeric_type(dtype) {
            Some(compute_numeric_stats(
                series,
                options.include_skewness_kurtosis_outliers,
            )?)
        } else {
            None
        };

        let categorical_stats = if is_categorical_type(dtype) {
            Some(compute_categorical_stats(series)?)
        } else {
            None
        };

        let distribution_info =
            if options.include_distribution_info && is_numeric_type(dtype) && null_count < count {
                Some(infer_distribution(
                    series,
                    series,
                    actual_sample_size.unwrap_or(count),
                    should_sample,
                ))
            } else {
                None
            };

        column_statistics.push(ColumnStatistics {
            name: name.to_string(),
            dtype: dtype.clone(),
            count,
            null_count,
            numeric_stats,
            categorical_stats,
            distribution_info,
        });
    }

    let distribution_analyses = if options.include_distribution_analyses {
        column_statistics
            .iter()
            .filter_map(|col_stat| {
                if let (Some(numeric_stats), Some(dist_info)) =
                    (&col_stat.numeric_stats, &col_stat.distribution_info)
                {
                    if let Ok(series_col) = df.column(&col_stat.name) {
                        let series = series_col.as_materialized_series();
                        Some(compute_advanced_distribution_analysis(
                            &col_stat.name,
                            series,
                            numeric_stats,
                            dist_info,
                            actual_sample_size.unwrap_or(total_rows),
                            should_sample,
                        ))
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
            .collect()
    } else {
        Vec::new()
    };

    let correlation_matrix = if options.include_correlation_matrix {
        compute_correlation_matrix(&df).ok()
    } else {
        None
    };

    Ok(AnalysisResults {
        column_statistics,
        total_rows,
        sample_size: actual_sample_size,
        sample_seed: seed,
        correlation_matrix,
        distribution_analyses,
    })
}
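
// A hedged sketch of the two entry points above; the column name and values
// are made up for illustration. `compute_statistics` uses
// `ComputeOptions::default()`, so the heavier statistics must be requested
// explicitly via `compute_statistics_with_options`.
#[allow(dead_code)]
fn compute_statistics_example() -> Result<()> {
    let lf = df!("value" => &[1.0f64, 2.0, 3.0, 4.0])?.lazy();
    let basic = compute_statistics(&lf, None, 42)?;
    assert_eq!(basic.total_rows, 4);

    let opts = ComputeOptions {
        include_skewness_kurtosis_outliers: true,
        ..ComputeOptions::default()
    };
    let _detailed = compute_statistics_with_options(&lf, None, 42, opts)?;
    Ok(())
}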

pub fn compute_describe_column(
    df: &DataFrame,
    schema: &Schema,
    column_index: usize,
    options: &ComputeOptions,
    actual_sample_size: Option<usize>,
    is_sampled: bool,
) -> Result<ColumnStatistics> {
    let (name, dtype) = schema
        .iter()
        .nth(column_index)
        .ok_or_else(|| color_eyre::eyre::eyre!("column index {} out of range", column_index))?;
    let col = df.column(name)?;
    let series = col.as_materialized_series();
    let count = series.len();
    let null_count = series.null_count();

    let numeric_stats = if is_numeric_type(dtype) {
        Some(compute_numeric_stats(
            series,
            options.include_skewness_kurtosis_outliers,
        )?)
    } else {
        None
    };

    let categorical_stats = if is_categorical_type(dtype) {
        Some(compute_categorical_stats(series)?)
    } else {
        None
    };

    let distribution_info =
        if options.include_distribution_info && is_numeric_type(dtype) && null_count < count {
            Some(infer_distribution(
                series,
                series,
                actual_sample_size.unwrap_or(count),
                is_sampled,
            ))
        } else {
            None
        };

    Ok(ColumnStatistics {
        name: name.to_string(),
        dtype: dtype.clone(),
        count,
        null_count,
        numeric_stats,
        categorical_stats,
        distribution_info,
    })
}

pub fn analysis_results_from_describe(
    column_statistics: Vec<ColumnStatistics>,
    total_rows: usize,
    sample_size: Option<usize>,
    sample_seed: u64,
) -> AnalysisResults {
    AnalysisResults {
        column_statistics,
        total_rows,
        sample_size,
        sample_seed,
        correlation_matrix: None,
        distribution_analyses: Vec::new(),
    }
}

fn build_describe_aggregation_exprs(schema: &Schema) -> Vec<Expr> {
    let mut exprs = Vec::new();
    for (name, dtype) in schema.iter() {
        let name = name.as_str();
        let prefix = format!("{}::", name);
        exprs.push(col(name).count().alias(format!("{}count", prefix)));
        exprs.push(
            col(name)
                .null_count()
                .alias(format!("{}null_count", prefix)),
        );
        if is_numeric_type(dtype) {
            let c = col(name).cast(DataType::Float64);
            exprs.push(c.clone().mean().alias(format!("{}mean", prefix)));
            exprs.push(c.clone().std(1).alias(format!("{}std", prefix)));
            exprs.push(c.clone().min().alias(format!("{}min", prefix)));
            exprs.push(
                c.clone()
                    .quantile(lit(0.25), QuantileMethod::Nearest)
                    .alias(format!("{}q25", prefix)),
            );
            exprs.push(
                c.clone()
                    .quantile(lit(0.5), QuantileMethod::Nearest)
                    .alias(format!("{}median", prefix)),
            );
            exprs.push(
                c.clone()
                    .quantile(lit(0.75), QuantileMethod::Nearest)
                    .alias(format!("{}q75", prefix)),
            );
            exprs.push(c.max().alias(format!("{}max", prefix)));
        } else if is_categorical_type(dtype) {
            exprs.push(col(name).min().alias(format!("{}min", prefix)));
            exprs.push(col(name).max().alias(format!("{}max", prefix)));
        }
    }
    exprs
}
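
// For a numeric column named "price", the expressions above yield aggregate
// columns "price::count", "price::null_count", "price::mean", "price::std",
// "price::min", "price::q25", "price::median", "price::q75", and "price::max"
// in a single-row frame, which `parse_describe_agg_row` below unpacks by the
// same "<column>::<stat>" naming convention.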

fn parse_describe_agg_row(agg_df: &DataFrame, schema: &Schema) -> Vec<ColumnStatistics> {
    let row = 0usize;
    let mut column_statistics = Vec::with_capacity(schema.len());
    for (name, dtype) in schema.iter() {
        let name_str = name.as_str();
        let prefix = format!("{}::", name_str);
        let count: usize = agg_df
            .column(&format!("{}count", prefix))
            .ok()
            .map(|s| match s.get(row) {
                Ok(AnyValue::UInt32(x)) => x as usize,
                _ => 0,
            })
            .unwrap_or(0);
        let null_count: usize = agg_df
            .column(&format!("{}null_count", prefix))
            .ok()
            .map(|s| match s.get(row) {
                Ok(AnyValue::UInt32(x)) => x as usize,
                _ => 0,
            })
            .unwrap_or(0);
        let numeric_stats = if is_numeric_type(dtype) {
            let mean = get_f64(agg_df, &format!("{}mean", prefix), row);
            let std = get_f64(agg_df, &format!("{}std", prefix), row);
            let min = get_f64(agg_df, &format!("{}min", prefix), row);
            let q25 = get_f64(agg_df, &format!("{}q25", prefix), row);
            let median = get_f64(agg_df, &format!("{}median", prefix), row);
            let q75 = get_f64(agg_df, &format!("{}q75", prefix), row);
            let max = get_f64(agg_df, &format!("{}max", prefix), row);
            let mut percentiles = HashMap::new();
            percentiles.insert(25u8, q25);
            percentiles.insert(50u8, median);
            percentiles.insert(75u8, q75);
            Some(NumericStatistics {
                mean,
                std,
                min,
                max,
                median,
                q25,
                q75,
                percentiles,
                // Sentinel defaults: compute_distribution_statistics treats
                // skewness == 0.0 with kurtosis == 3.0 (and zero outlier
                // counts) as "not yet computed" and fills in real values later.
                skewness: 0.0,
                kurtosis: 3.0,
                outliers_iqr: 0,
                outliers_zscore: 0,
            })
        } else {
            None
        };
        let categorical_stats = if is_categorical_type(dtype) {
            let min = get_str(agg_df, &format!("{}min", prefix), row);
            let max = get_str(agg_df, &format!("{}max", prefix), row);
            Some(CategoricalStatistics {
                unique_count: 0,
                mode: None,
                top_values: Vec::new(),
                min,
                max,
            })
        } else {
            None
        };
        column_statistics.push(ColumnStatistics {
            name: name_str.to_string(),
            dtype: dtype.clone(),
            count,
            null_count,
            numeric_stats,
            categorical_stats,
            distribution_info: None,
        });
    }
    column_statistics
}

pub fn compute_describe_from_lazy(
    lf: &LazyFrame,
    total_rows: usize,
    sample_size: Option<usize>,
    seed: u64,
    polars_streaming: bool,
) -> Result<AnalysisResults> {
    let schema = lf.clone().collect_schema()?;
    let should_sample = sample_size.is_some_and(|t| total_rows >= t);
    if should_sample {
        let threshold = sample_size.unwrap();
        let df = sample_dataframe(lf, threshold, seed, polars_streaming)?;
        return compute_describe_single_aggregation(
            &df,
            &schema,
            total_rows,
            Some(threshold),
            seed,
            polars_streaming,
        );
    }
    let exprs = build_describe_aggregation_exprs(&schema);
    let agg_df = collect_lazy(lf.clone().select(exprs), polars_streaming).map_err(Report::from)?;
    let column_statistics = parse_describe_agg_row(&agg_df, &schema);
    Ok(analysis_results_from_describe(
        column_statistics,
        total_rows,
        None,
        seed,
    ))
}

pub fn compute_describe_single_aggregation(
    df: &DataFrame,
    schema: &Schema,
    total_rows: usize,
    sample_size: Option<usize>,
    sample_seed: u64,
    polars_streaming: bool,
) -> Result<AnalysisResults> {
    let exprs = build_describe_aggregation_exprs(schema);
    let agg_df =
        collect_lazy(df.clone().lazy().select(exprs), polars_streaming).map_err(Report::from)?;
    let column_statistics = parse_describe_agg_row(&agg_df, schema);
    Ok(analysis_results_from_describe(
        column_statistics,
        total_rows,
        sample_size,
        sample_seed,
    ))
}

fn get_f64(df: &DataFrame, col_name: &str, row: usize) -> f64 {
    df.column(col_name)
        .ok()
        .and_then(|s| {
            let v = s.get(row).ok()?;
            match v {
                AnyValue::Float64(x) => Some(x),
                AnyValue::Float32(x) => Some(x as f64),
                AnyValue::Int32(x) => Some(x as f64),
                AnyValue::Int64(x) => Some(x as f64),
                AnyValue::UInt32(x) => Some(x as f64),
                AnyValue::Null => Some(f64::NAN),
                _ => None,
            }
        })
        .unwrap_or(f64::NAN)
}

fn get_str(df: &DataFrame, col_name: &str, row: usize) -> Option<String> {
    df.column(col_name)
        .ok()
        .and_then(|s| s.get(row).ok().map(|v| v.str_value().to_string()))
}

pub fn compute_distribution_statistics(
    results: &mut AnalysisResults,
    lf: &LazyFrame,
    sample_size: Option<usize>,
    seed: u64,
    polars_streaming: bool,
) -> Result<()> {
    let should_sample = sample_size.is_some_and(|t| results.total_rows >= t);
    let sampling_threshold = sample_size.unwrap_or(0);
    let actual_sample_size = if should_sample {
        Some(sampling_threshold)
    } else {
        None
    };

    let df = if should_sample {
        sample_dataframe(lf, sampling_threshold, seed, polars_streaming)?
    } else {
        collect_lazy(lf.clone(), polars_streaming).map_err(Report::from)?
    };

    for col_stat in &mut results.column_statistics {
        if col_stat.distribution_info.is_none()
            && is_numeric_type(&col_stat.dtype)
            && col_stat.null_count < col_stat.count
        {
            let series = df.column(&col_stat.name)?.as_materialized_series();
            col_stat.distribution_info = Some(infer_distribution(
                series,
                series,
                actual_sample_size.unwrap_or(col_stat.count),
                should_sample,
            ));
        }

        if let Some(ref mut num_stats) = col_stat.numeric_stats {
            let needs_advanced_stats = num_stats.skewness == 0.0
                && num_stats.kurtosis == 3.0
                && num_stats.outliers_iqr == 0
                && num_stats.outliers_zscore == 0
                && col_stat.count > 0;
            if needs_advanced_stats {
                let series = df.column(&col_stat.name)?.as_materialized_series();
                num_stats.skewness = compute_skewness(series);
                num_stats.kurtosis = compute_kurtosis(series);
                let (out_iqr, out_zscore) = detect_outliers(
                    series,
                    num_stats.q25,
                    num_stats.q75,
                    num_stats.median,
                    num_stats.mean,
                    num_stats.std,
                );
                num_stats.outliers_iqr = out_iqr;
                num_stats.outliers_zscore = out_zscore;
            }
        }
    }

    if results.distribution_analyses.is_empty() {
        results.distribution_analyses = results
            .column_statistics
            .iter()
            .filter_map(|col_stat| {
                if let (Some(numeric_stats), Some(dist_info)) =
                    (&col_stat.numeric_stats, &col_stat.distribution_info)
                {
                    if let Ok(series_col) = df.column(&col_stat.name) {
                        let series = series_col.as_materialized_series();
                        Some(compute_advanced_distribution_analysis(
                            &col_stat.name,
                            series,
                            numeric_stats,
                            dist_info,
                            actual_sample_size.unwrap_or(results.total_rows),
                            should_sample,
                        ))
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
            .collect();
    }

    Ok(())
}

pub fn compute_correlation_statistics(
    results: &mut AnalysisResults,
    lf: &LazyFrame,
    polars_streaming: bool,
) -> Result<()> {
    if results.correlation_matrix.is_none() {
        let df = collect_lazy(lf.clone(), polars_streaming).map_err(Report::from)?;
        results.correlation_matrix = compute_correlation_matrix(&df).ok();
    }
    Ok(())
}

fn is_numeric_type(dtype: &DataType) -> bool {
    dtype.is_numeric()
}

fn is_categorical_type(dtype: &DataType) -> bool {
    matches!(dtype, DataType::String | DataType::Categorical(..))
}

pub fn sample_dataframe(
    lf: &LazyFrame,
    sample_size: usize,
    seed: u64,
    polars_streaming: bool,
) -> Result<DataFrame> {
    let collect_multiplier = if sample_size <= 1000 {
        5
    } else if sample_size <= 5000 {
        3
    } else {
        2
    };

    let collect_limit = (sample_size * collect_multiplier).min(50_000);
    let df = collect_lazy(lf.clone().limit(collect_limit as u32), polars_streaming)
        .map_err(Report::from)?;
    let total_collected = df.height();

    if total_collected <= sample_size {
        return Ok(df);
    }

    let step = total_collected / sample_size;
    let start_offset = (seed as usize) % step;

    let indices: Vec<u32> = (0..sample_size)
        .map(|i| {
            let idx = start_offset + i * step;
            (idx.min(total_collected - 1)) as u32
        })
        .collect();

    let indices_ca = UInt32Chunked::new("indices".into(), indices);
    df.take(&indices_ca)
        .map_err(|e| color_eyre::eyre::eyre!("Sampling error: {}", e))
}
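
// A worked example of the systematic sampling above: with sample_size = 1_000
// the multiplier is 5, so up to 5_000 rows are collected; step = 5_000 / 1_000
// = 5, and seed = 42 gives start_offset = 42 % 5 = 2, so rows 2, 7, 12, ...
// are taken until 1_000 indices have been gathered.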
854
855fn get_numeric_values_as_f64(series: &Series) -> Vec<f64> {
856 let max_len = 10000;
857 let limited_series = if series.len() > max_len {
858 series.slice(0, max_len)
859 } else {
860 series.clone()
861 };
862
863 if let Ok(f64_series) = limited_series.f64() {
864 f64_series.iter().flatten().take(max_len).collect()
865 } else if let Ok(i64_series) = limited_series.i64() {
866 i64_series
867 .iter()
868 .filter_map(|v| v.map(|x| x as f64))
869 .take(max_len)
870 .collect()
871 } else if let Ok(i32_series) = limited_series.i32() {
872 i32_series
873 .iter()
874 .filter_map(|v| v.map(|x| x as f64))
875 .take(max_len)
876 .collect()
877 } else if let Ok(u64_series) = limited_series.u64() {
878 u64_series
879 .iter()
880 .filter_map(|v| v.map(|x| x as f64))
881 .take(max_len)
882 .collect()
883 } else if let Ok(u32_series) = limited_series.u32() {
884 u32_series
885 .iter()
886 .filter_map(|v| v.map(|x| x as f64))
887 .take(max_len)
888 .collect()
889 } else if let Ok(f32_series) = limited_series.f32() {
890 f32_series
891 .iter()
892 .filter_map(|v| v.map(|x| x as f64))
893 .take(max_len)
894 .collect()
895 } else {
896 match limited_series.cast(&DataType::Float64) {
897 Ok(cast_series) => {
898 if let Ok(f64_series) = cast_series.f64() {
899 f64_series.iter().flatten().take(max_len).collect()
900 } else {
901 Vec::new()
902 }
903 }
904 Err(_) => Vec::new(),
905 }
906 }
907}

fn compute_numeric_stats(series: &Series, include_advanced: bool) -> Result<NumericStatistics> {
    let mean = series.mean().unwrap_or(f64::NAN);
    let std = series.std(1).unwrap_or(f64::NAN);
    let min = if let Ok(min_val) = series.min::<f64>() {
        min_val.unwrap_or(f64::NAN)
    } else if let Ok(min_val) = series.min::<i64>() {
        min_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else if let Ok(min_val) = series.min::<i32>() {
        min_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else {
        f64::NAN
    };

    let max = if let Ok(max_val) = series.max::<f64>() {
        max_val.unwrap_or(f64::NAN)
    } else if let Ok(max_val) = series.max::<i64>() {
        max_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else if let Ok(max_val) = series.max::<i32>() {
        max_val.map(|v| v as f64).unwrap_or(f64::NAN)
    } else {
        f64::NAN
    };

    let mut percentiles = HashMap::new();
    let values: Vec<f64> = get_numeric_values_as_f64(series);

    if !values.is_empty() {
        let mut sorted = values.clone();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let n = sorted.len();

        for p in &[1, 5, 25, 50, 75, 95, 99] {
            let idx = ((*p as f64 / 100.0) * (n - 1) as f64).round() as usize;
            let idx = idx.min(n - 1);
            percentiles.insert(*p, sorted[idx]);
        }
    }

    let median = percentiles.get(&50).copied().unwrap_or(f64::NAN);
    let q25 = percentiles.get(&25).copied().unwrap_or(f64::NAN);
    let q75 = percentiles.get(&75).copied().unwrap_or(f64::NAN);

    let (skewness, kurtosis, outliers_iqr, outliers_zscore) = if include_advanced {
        let (out_iqr, out_zscore) = detect_outliers(series, q25, q75, median, mean, std);
        (
            compute_skewness(series),
            compute_kurtosis(series),
            out_iqr,
            out_zscore,
        )
    } else {
        // Sentinel defaults recognized by compute_distribution_statistics.
        (0.0, 3.0, 0, 0)
    };

    Ok(NumericStatistics {
        mean,
        std,
        min,
        max,
        median,
        q25,
        q75,
        percentiles,
        skewness,
        kurtosis,
        outliers_iqr,
        outliers_zscore,
    })
}

fn compute_skewness(series: &Series) -> f64 {
    let mean = series.mean().unwrap_or(0.0);
    let std = series.std(1).unwrap_or(1.0);
    let n = series.len() as f64;

    if std == 0.0 || n < 3.0 {
        return 0.0;
    }

    let values: Vec<f64> = get_numeric_values_as_f64(series);

    if values.is_empty() {
        return 0.0;
    }

    let sum_cubed_deviations: f64 = values
        .iter()
        .map(|v| {
            let deviation = (v - mean) / std;
            deviation * deviation * deviation
        })
        .sum();

    (n / ((n - 1.0) * (n - 2.0))) * sum_cubed_deviations
}
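
// The statistic above is the adjusted Fisher-Pearson sample skewness,
//   g1 = n / ((n - 1)(n - 2)) * sum_i ((x_i - mean) / s)^3,
// which is ~0 for symmetric data; it is only meaningful when n >= 3 and
// s > 0, hence the early return.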

fn compute_kurtosis(series: &Series) -> f64 {
    let mean = series.mean().unwrap_or(0.0);
    let std = series.std(1).unwrap_or(1.0);
    let n = series.len() as f64;

    if std == 0.0 || n < 4.0 {
        return 3.0;
    }

    let values: Vec<f64> = get_numeric_values_as_f64(series);

    if values.is_empty() {
        return 3.0;
    }

    let sum_fourth_deviations: f64 = values
        .iter()
        .map(|v| {
            let deviation = (v - mean) / std;
            let d2 = deviation * deviation;
            d2 * d2
        })
        .sum();

    let k = (n * (n + 1.0) / ((n - 1.0) * (n - 2.0) * (n - 3.0))) * sum_fourth_deviations
        - 3.0 * (n - 1.0) * (n - 1.0) / ((n - 2.0) * (n - 3.0));

    k + 3.0
}
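
// compute_kurtosis returns Pearson (non-excess) kurtosis: the bias-corrected
// excess-kurtosis estimator
//   g2 = [n(n + 1) / ((n - 1)(n - 2)(n - 3))] * sum_i ((x_i - mean) / s)^4
//        - 3(n - 1)^2 / ((n - 2)(n - 3))
// shifted by +3, so a normal distribution scores about 3.0 (the same value
// used as the "not yet computed" sentinel elsewhere in this module).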

fn detect_outliers(
    series: &Series,
    q25: f64,
    q75: f64,
    _median: f64,
    mean: f64,
    std: f64,
) -> (usize, usize) {
    if q25.is_nan() || q75.is_nan() {
        return (0, 0);
    }

    let iqr = q75 - q25;
    let lower_fence = q25 - 1.5 * iqr;
    let upper_fence = q75 + 1.5 * iqr;

    let mut outliers_iqr = 0;
    let mut outliers_zscore = 0;

    if std > 0.0 {
        let values: Vec<f64> = get_numeric_values_as_f64(series);

        for v in values {
            if v < lower_fence || v > upper_fence {
                outliers_iqr += 1;
            }

            let z = (v - mean).abs() / std;
            if z > 3.0 {
                outliers_zscore += 1;
            }
        }
    }

    (outliers_iqr, outliers_zscore)
}
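
// A worked example for detect_outliers: with q25 = 10 and q75 = 20, IQR = 10,
// so the fences are 10 - 1.5 * 10 = -5 and 20 + 1.5 * 10 = 35. A value of 40
// counts toward outliers_iqr, and it also counts toward outliers_zscore
// whenever |40 - mean| / std > 3; the two counters are tallied independently.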

fn compute_categorical_stats(series: &Series) -> Result<CategoricalStatistics> {
    let value_counts = series.value_counts(false, false, "counts".into(), false)?;
    let unique_count = value_counts.height();

    let mode = if unique_count > 0 {
        if let Some(first_row) = value_counts.get(0) {
            first_row.first().map(|v| v.str_value().to_string())
        } else {
            None
        }
    } else {
        None
    };

    // `DataFrame::get` indexes rows, not columns, so read the value and count
    // columns directly when collecting the top values.
    let mut top_values = Vec::new();
    let cols = value_counts.get_columns();
    if let (Some(value_col), Some(count_col)) = (cols.first(), cols.get(1)) {
        for i in 0..unique_count.min(10) {
            if let (Ok(value), Ok(count)) = (value_col.get(i), count_col.get(i)) {
                let value_str = value.str_value();
                if let Ok(count_u32) = count.try_extract::<u32>() {
                    top_values.push((value_str.to_string(), count_u32 as usize));
                }
            }
        }
    }

    let min = if let Ok(str_series) = series.str() {
        let mut min_val: Option<String> = None;
        for s in str_series.iter().flatten() {
            let s_str = s.to_string();
            min_val = match min_val {
                None => Some(s_str.clone()),
                Some(ref current) if s_str < *current => Some(s_str),
                Some(current) => Some(current),
            };
        }
        min_val
    } else {
        None
    };

    let max = if let Ok(str_series) = series.str() {
        let mut max_val: Option<String> = None;
        for s in str_series.iter().flatten() {
            let s_str = s.to_string();
            max_val = match max_val {
                None => Some(s_str.clone()),
                Some(ref current) if s_str > *current => Some(s_str),
                Some(current) => Some(current),
            };
        }
        max_val
    } else {
        None
    };

    Ok(CategoricalStatistics {
        unique_count,
        mode,
        top_values,
        min,
        max,
    })
}

fn infer_distribution(
    _series: &Series,
    sample: &Series,
    sample_size: usize,
    is_sampled: bool,
) -> DistributionInfo {
    if sample_size < 3 {
        return DistributionInfo {
            distribution_type: DistributionType::Unknown,
            confidence: 0.0,
            sample_size,
            is_sampled,
            fit_quality: None,
            all_distribution_pvalues: HashMap::new(),
        };
    }

    let max_convert = 10000.min(sample.len());
    let values: Vec<f64> = if sample.len() > max_convert {
        let all_values = get_numeric_values_as_f64(sample);
        all_values.into_iter().take(max_convert).collect()
    } else {
        get_numeric_values_as_f64(sample)
    };

    if values.is_empty() {
        return DistributionInfo {
            distribution_type: DistributionType::Unknown,
            confidence: 0.0,
            sample_size,
            is_sampled,
            fit_quality: None,
            all_distribution_pvalues: HashMap::new(),
        };
    }

    let mean: f64 = values.iter().sum::<f64>() / values.len() as f64;
    let variance: f64 =
        values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
    let std = variance.sqrt();

    let mut candidates: Vec<(DistributionType, f64, f64)> = Vec::new();

    let normal_fit = calculate_normal_fit_quality(&values, mean, std);
    let normal_confidence = normal_fit.min(0.95);
    candidates.push((DistributionType::Normal, normal_fit, normal_confidence));

    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() > 10 {
        let lognormal_fit = calculate_lognormal_fit_quality(&values);
        let lognormal_confidence = lognormal_fit.min(0.95);
        if lognormal_fit > 0.01 {
            candidates.push((
                DistributionType::LogNormal,
                lognormal_fit,
                lognormal_confidence,
            ));
        }
    }

    let uniformity_pvalue = chi_square_uniformity_test(&values);
    let uniform_fit = calculate_uniform_fit_quality(&values);
    let uniform_confidence = uniformity_pvalue.min(0.95);
    candidates.push((DistributionType::Uniform, uniform_fit, uniform_confidence));

    let powerlaw_pvalue = test_power_law(&values);
    let powerlaw_fit = calculate_power_law_fit_quality(&values);
    let powerlaw_confidence = powerlaw_pvalue.min(0.95);
    if powerlaw_pvalue > 0.0 {
        candidates.push((
            DistributionType::PowerLaw,
            powerlaw_fit,
            powerlaw_confidence,
        ));
    }

    let positive_exp: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_exp.len() > 10 {
        let exp_score = test_exponential(&values);
        let exp_fit = calculate_exponential_fit_quality(&values);
        let exp_confidence = exp_score.min(0.95);
        if exp_score > 0.0 {
            candidates.push((DistributionType::Exponential, exp_fit, exp_confidence));
        }
    }

    let beta_score = test_beta(&values);
    let beta_fit = calculate_beta_fit_quality(&values);
    let beta_confidence = beta_score.min(0.95);
    if beta_score > 0.0 {
        candidates.push((DistributionType::Beta, beta_fit, beta_confidence));
    }

    let gamma_score = test_gamma(&values);
    let gamma_fit = calculate_gamma_fit_quality(&values);
    let gamma_confidence = gamma_score.min(0.95);
    if gamma_score > 0.0 {
        candidates.push((DistributionType::Gamma, gamma_fit, gamma_confidence));
    }

    let chi2_score = test_chi_squared(&values);
    let chi2_fit = calculate_chi_squared_fit_quality(&values);
    let chi2_confidence = chi2_score.min(0.95);
    if chi2_score > 0.0 {
        candidates.push((DistributionType::ChiSquared, chi2_fit, chi2_confidence));
    }

    let t_score = test_students_t(&values);
    let t_fit = calculate_students_t_fit_quality(&values);
    let t_confidence = t_score.min(0.95);
    if t_score > 0.0 {
        candidates.push((DistributionType::StudentsT, t_fit, t_confidence));
    }

    let poisson_score = test_poisson(&values);
    let poisson_fit = calculate_poisson_fit_quality(&values);
    let poisson_confidence = poisson_score.min(0.95);
    if poisson_score > 0.0 {
        candidates.push((DistributionType::Poisson, poisson_fit, poisson_confidence));
    }

    let bernoulli_score = test_bernoulli(&values);
    let bernoulli_fit = calculate_bernoulli_fit_quality(&values);
    let bernoulli_confidence = bernoulli_score.min(0.95);
    let binary_count = values.iter().filter(|&&v| v == 0.0 || v == 1.0).count();
    if bernoulli_score > 0.01 && binary_count as f64 / values.len() as f64 > 0.9 {
        candidates.push((
            DistributionType::Bernoulli,
            bernoulli_fit,
            bernoulli_confidence,
        ));
    }

    let max_value = values.iter().fold(0.0f64, |a, &b| a.max(b));
    if max_value > 1.0 {
        let binomial_score = test_binomial(&values);
        let binomial_fit = calculate_binomial_fit_quality(&values);
        let binomial_confidence = binomial_score.min(0.95);
        let threshold = if values.len() > 5000 { 0.0 } else { 0.01 };
        if binomial_score > threshold {
            candidates.push((
                DistributionType::Binomial,
                binomial_fit,
                binomial_confidence,
            ));
        }
    }

    if values.len() <= 10000 {
        let non_negative_int_count = values
            .iter()
            .filter(|&&v| v >= 0.0 && v == v.floor() && v.is_finite())
            .count();

        if non_negative_int_count as f64 / values.len() as f64 > 0.9 {
            let geometric_score = test_geometric(&values);
            let geometric_fit = calculate_geometric_fit_quality(&values);
            let geometric_confidence = geometric_score.min(0.95);
            if geometric_score > 0.01 {
                candidates.push((
                    DistributionType::Geometric,
                    geometric_fit,
                    geometric_confidence,
                ));
            }
        }
    }

    let positive_weibull: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_weibull.len() > 10 {
        let weibull_score = test_weibull(&values);
        let weibull_fit = calculate_weibull_fit_quality(&values);
        let weibull_confidence = weibull_score.min(0.95);
        if weibull_score > 0.01 {
            candidates.push((DistributionType::Weibull, weibull_fit, weibull_confidence));
        }
    }

    let mut all_pvalues = HashMap::new();
    for (dist_type, _, confidence) in &candidates {
        all_pvalues.insert(*dist_type, *confidence);
    }

    if let Some(best) = candidates.iter().max_by(|a, b| {
        let p_cmp = a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal);
        if p_cmp != std::cmp::Ordering::Equal {
            return p_cmp;
        }
        if (a.2 - b.2).abs() < 0.01 {
            a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
        } else {
            p_cmp
        }
    }) {
        DistributionInfo {
            distribution_type: best.0,
            confidence: best.2,
            sample_size,
            is_sampled,
            fit_quality: Some(best.1),
            all_distribution_pvalues: all_pvalues,
        }
    } else {
        DistributionInfo {
            distribution_type: DistributionType::Unknown,
            confidence: 0.50,
            sample_size,
            is_sampled,
            fit_quality: Some(0.5),
            all_distribution_pvalues: HashMap::new(),
        }
    }
}

fn approximate_shapiro_wilk(values: &[f64]) -> (Option<f64>, Option<f64>) {
    let n = values.len();
    if n < 3 {
        return (None, None);
    }

    let mean: f64 = values.iter().sum::<f64>() / n as f64;
    let variance: f64 = values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
    let std = variance.sqrt();

    if std == 0.0 {
        return (None, None);
    }

    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let mut sum_expected_sq = 0.0;
    let mut sum_data_sq = 0.0;
    let mut sum_product = 0.0;

    for (i, &value) in sorted.iter().enumerate() {
        let p = (i as f64 + 1.0 - 0.375) / (n as f64 + 0.25);
        let expected_quantile = normal_quantile(p);
        let standardized_value = (value - mean) / std;

        sum_expected_sq += expected_quantile * expected_quantile;
        sum_data_sq += standardized_value * standardized_value;
        sum_product += expected_quantile * standardized_value;
    }

    let sw_stat = if sum_expected_sq > 0.0 && sum_data_sq > 0.0 {
        (sum_product * sum_product) / (sum_expected_sq * sum_data_sq)
    } else {
        0.0
    };

    let sw_stat = sw_stat.clamp(0.0, 1.0);

    let skewness: f64 = values
        .iter()
        .map(|v| ((v - mean) / std).powi(3))
        .sum::<f64>()
        / n as f64;

    let kurtosis: f64 = values
        .iter()
        .map(|v| ((v - mean) / std).powi(4))
        .sum::<f64>()
        / n as f64;

    let skew_penalty = (skewness.abs() / 2.0).min(1.0);
    let kurt_penalty = ((kurtosis - 3.0).abs() / 2.0).min(1.0);

    let w_factor = sw_stat;
    let penalty_factor = 1.0 - (skew_penalty + kurt_penalty) / 2.0;
    let pvalue = (w_factor * 0.7 + penalty_factor * 0.3).clamp(0.0, 1.0);

    (Some(sw_stat), Some(pvalue))
}
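
// A note on the approximation above: the W statistic is the squared
// correlation between the sorted, standardized sample and Blom-style normal
// quantiles, Phi^-1((i - 0.375) / (n + 0.25)). The pseudo p-value then blends
// W (weight 0.7) with penalties for skewness and excess kurtosis (weight 0.3)
// rather than using the exact Shapiro-Wilk null distribution.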

fn calculate_chi_square_uniformity(values: &[f64]) -> Option<f64> {
    let n = values.len();
    if n < 10 {
        return None;
    }

    let min = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let range = max - min;

    if range == 0.0 {
        return None;
    }

    let bins = 10;
    let mut counts = vec![0; bins];

    for &v in values {
        let bin = (((v - min) / range) * bins as f64) as usize;
        let bin = bin.min(bins - 1);
        counts[bin] += 1;
    }

    let expected = n as f64 / bins as f64;
    let chi_square: f64 = counts
        .iter()
        .map(|&count| {
            let diff = count as f64 - expected;
            diff * diff / expected
        })
        .sum();

    Some(chi_square)
}

fn kolmogorov_smirnov_test<F>(values: &[f64], theoretical_cdf: F) -> f64
where
    F: Fn(f64) -> f64,
{
    if values.is_empty() {
        return 1.0;
    }

    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let n = sorted.len();
    let mut max_distance: f64 = 0.0;

    for (i, &x) in sorted.iter().enumerate() {
        let empirical_cdf = (i + 1) as f64 / n as f64;
        let theoretical_cdf_val = theoretical_cdf(x);
        let distance = (empirical_cdf - theoretical_cdf_val).abs();
        max_distance = max_distance.max(distance);
    }

    max_distance
}
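
// A hedged sanity check for kolmogorov_smirnov_test: against the Uniform(0, 1)
// CDF (the identity on [0, 1]), an evenly spaced sample has zero KS distance.
// The values here are illustrative only.
#[allow(dead_code)]
fn kolmogorov_smirnov_example() {
    let values = [0.25, 0.5, 0.75, 1.0];
    let d = kolmogorov_smirnov_test(&values, |x| x.clamp(0.0, 1.0));
    assert!(d < 1e-12);
}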

fn chi_square_uniformity_test(values: &[f64]) -> f64 {
    if let Some(chi_square) = calculate_chi_square_uniformity(values) {
        (-chi_square / 20.0).exp().clamp(0.0, 1.0)
    } else {
        0.0
    }
}

fn estimate_power_law_mle(values: &[f64]) -> Option<(f64, f64)> {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return None;
    }

    let mut sorted = positive_values.clone();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let xmin = sorted[0];
    let sum_log_ratio: f64 = sorted.iter().map(|&x| (x / xmin).ln()).sum();

    if sum_log_ratio <= 0.0 {
        return None;
    }

    let n = sorted.len() as f64;
    let alpha = 1.0 + n / sum_log_ratio;
    if !(1.5..=4.0).contains(&alpha) {
        return None;
    }

    Some((xmin, alpha))
}
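
// estimate_power_law_mle uses the continuous maximum-likelihood (Hill-style)
// exponent estimate with xmin fixed at the smallest positive observation:
//   alpha = 1 + n / sum_i ln(x_i / xmin)
// and rejects fits whose exponent falls outside the plausible 1.5..=4.0 band.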

fn power_law_ks_test(values: &[f64], xmin: f64, alpha: f64) -> f64 {
    let mut filtered: Vec<f64> = values.iter().filter(|&&v| v >= xmin).copied().collect();
    if filtered.is_empty() {
        return 1.0;
    }
    filtered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let n = filtered.len();
    let mut max_distance: f64 = 0.0;

    for (i, &x) in filtered.iter().enumerate() {
        let empirical_cdf = (i + 1) as f64 / n as f64;
        let theoretical_cdf = powerlaw_cdf(x, xmin, alpha);
        let distance = (empirical_cdf - theoretical_cdf).abs();
        max_distance = max_distance.max(distance);
    }

    max_distance
}

fn approximate_ks_pvalue(ks_stat: f64, n: usize) -> f64 {
    if n < 1 || ks_stat <= 0.0 {
        return 0.0;
    }
    if ks_stat >= 1.0 {
        return 0.0;
    }

    let sqrt_n = (n as f64).sqrt();
    let exponent = -2.0 * (sqrt_n * ks_stat).powi(2);
    let p_value = 2.0 * exponent.exp();

    p_value.clamp(0.0, 1.0)
}
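
// approximate_ks_pvalue keeps only the leading term of the asymptotic
// Kolmogorov distribution, p ≈ 2 * exp(-2 * n * D^2); it is crude for small n
// but cheap, and monotone in the statistic, which is what the candidate
// ranking in infer_distribution relies on.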

fn chi_square_goodness_of_fit(observed: &[usize], expected: &[f64]) -> f64 {
    if observed.len() != expected.len() || observed.is_empty() {
        return 0.0;
    }

    let mut chi_square = 0.0;
    let mut total_observed = 0;
    let mut total_expected = 0.0;

    for (obs, exp) in observed.iter().zip(expected.iter()) {
        total_observed += obs;
        total_expected += exp;
        if *exp > 0.0 {
            let diff = *obs as f64 - exp;
            chi_square += (diff * diff) / exp;
        }
    }

    let df = (observed.len() as i32 - 2).max(1) as usize;
    let p_value = if df > 30 {
        let z = ((2.0 * chi_square).sqrt() - (2.0 * df as f64 - 1.0).sqrt()).max(0.0);
        (-z * z / 2.0).exp()
    } else {
        (-chi_square / (2.0 * df as f64)).exp()
    };

    p_value.clamp(0.0, 1.0)
}

fn power_law_log_likelihood(values: &[f64], xmin: f64, alpha: f64) -> f64 {
    let filtered: Vec<f64> = values.iter().filter(|&&v| v >= xmin).copied().collect();
    if filtered.is_empty() {
        return f64::NEG_INFINITY;
    }

    let n = filtered.len() as f64;
    let sum_log = filtered.iter().map(|&x| (x / xmin).ln()).sum::<f64>();

    n * (alpha - 1.0).ln() - n * xmin.ln() - alpha * sum_log
}

fn weibull_log_likelihood(values: &[f64], shape: f64, scale: f64) -> f64 {
    let positive: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive.is_empty() || shape <= 0.0 || scale <= 0.0 {
        return f64::NEG_INFINITY;
    }

    let n = positive.len() as f64;
    let sum_power = positive
        .iter()
        .map(|&x| (x / scale).powf(shape))
        .sum::<f64>();
    let sum_log = positive.iter().map(|&x| x.ln()).sum::<f64>();

    n * (shape / scale).ln() + (shape - 1.0) * sum_log - sum_power
}

fn test_power_law(values: &[f64]) -> f64 {
    let Some((xmin, alpha)) = estimate_power_law_mle(values) else {
        return 0.0;
    };

    let ks_stat = power_law_ks_test(values, xmin, alpha);

    let positive_values: Vec<f64> = values.iter().filter(|&&v| v >= xmin).copied().collect();
    let n = positive_values.len();
    if n < 10 {
        return 0.0;
    }

    let mut p_value = approximate_ks_pvalue(ks_stat, n);

    let uniform_p = chi_square_uniformity_test(values);
    if uniform_p > 0.1 {
        p_value *= 0.3;
    }

    if p_value > 0.05 {
        let n_f64 = n as f64;
        let mean = positive_values.iter().sum::<f64>() / n_f64;
        let scale = mean / 1.0;
        let shape = 1.5;

        let pl_likelihood = power_law_log_likelihood(values, xmin, alpha);
        let wb_likelihood = weibull_log_likelihood(values, shape, scale);

        if wb_likelihood > pl_likelihood + 5.0 {
            p_value *= 0.5;
        }
    }

    p_value
}

fn test_exponential(values: &[f64]) -> f64 {
    calculate_exponential_fit_quality(values)
}

fn test_beta(values: &[f64]) -> f64 {
    if values.is_empty() || values.len() < 10 {
        return 0.0;
    }

    let has_negatives = values.iter().any(|&v| v < 0.0);
    if has_negatives {
        return 0.0;
    }

    let values_in_range = values.iter().filter(|&&v| (0.0..=1.0).contains(&v)).count();
    let ratio_in_range = values_in_range as f64 / values.len() as f64;
    if ratio_in_range < 0.85 {
        return 0.0;
    }

    let valid_values: Vec<f64> = values
        .iter()
        .filter(|&&v| (0.0..=1.0).contains(&v))
        .copied()
        .collect();

    if valid_values.len() < 10 {
        return 0.0;
    }

    let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
    let variance: f64 = valid_values.iter().map(|v| (v - mean).powi(2)).sum::<f64>()
        / (valid_values.len() - 1) as f64;

    if variance <= 0.0 || mean <= 0.0 || mean >= 1.0 {
        return 0.0;
    }

    let temp = mean * (1.0 - mean) / variance - 1.0;
    if temp <= 0.0 {
        return 0.0;
    }
    let alpha = mean * temp;
    let beta = (1.0 - mean) * temp;

    if alpha <= 0.0 || beta <= 0.0 {
        return 0.0;
    }
    let alpha = alpha.min(1000.0);
    let beta = beta.min(1000.0);

    let ks_stat = kolmogorov_smirnov_test(&valid_values, |x| beta_cdf(x, alpha, beta));
    let n = valid_values.len();
    let ks_pvalue = approximate_ks_pvalue(ks_stat, n);

    let max_variance = mean * (1.0 - mean);
    let variance_score = if max_variance > 0.0 {
        (1.0 - (variance / max_variance).min(1.0)).max(0.0)
    } else {
        0.0
    };
    let confidence = if ks_pvalue > 0.1 {
        ks_pvalue.min(0.95)
    } else if ks_pvalue > 0.05 {
        (ks_pvalue * 7.0).min(0.7)
    } else if ks_pvalue > 0.01 {
        (ks_pvalue * 0.7 + variance_score * 0.3).min(0.5)
    } else {
        (ks_pvalue * 0.3 + variance_score * 0.7).min(0.4)
    };

    confidence.max(0.1)
}

fn gamma_log_likelihood(values: &[f64], shape: f64, scale: f64) -> f64 {
    let positive: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive.is_empty() || shape <= 0.0 || scale <= 0.0 {
        return f64::NEG_INFINITY;
    }

    let n = positive.len() as f64;
    let sum_log = positive.iter().map(|&x| x.ln()).sum::<f64>();
    let sum_x = positive.iter().sum::<f64>();

    let gamma_term = ln_gamma_approx(shape);
    -n * gamma_term - n * shape * scale.ln() + (shape - 1.0) * sum_log - sum_x / scale
}

fn lognormal_log_likelihood(values: &[f64], mu: f64, sigma: f64) -> f64 {
    let positive: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive.is_empty() || sigma <= 0.0 {
        return f64::NEG_INFINITY;
    }

    let n = positive.len() as f64;
    let sum_log = positive.iter().map(|&x| x.ln()).sum::<f64>();
    let sum_log_sq = positive.iter().map(|&x| (x.ln() - mu).powi(2)).sum::<f64>();

    -n / 2.0 * (2.0 * std::f64::consts::PI).ln()
        - n * sigma.ln()
        - sum_log
        - sum_log_sq / (2.0 * sigma * sigma)
}

fn test_gamma(values: &[f64]) -> f64 {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    let variance: f64 = positive_values
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (positive_values.len() - 1) as f64;

    if mean <= 0.0 || variance <= 0.0 {
        return 0.0;
    }

    let shape = (mean * mean) / variance;
    let scale = variance / mean;

    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| gamma_cdf(x, shape, scale));
    let n = positive_values.len();
    let mut p_value = approximate_ks_pvalue(ks_stat, n);

    let cv = variance.sqrt() / mean;
    let p_value_threshold = if n > 5000 { 0.001 } else { 0.01 };
    let ks_stat_threshold = if n > 5000 { 0.15 } else { 0.12 };
    if n > 5000 {
        if ks_stat < 0.3
            && p_value >= 0.0
            && shape > 0.1
            && shape < 500.0
            && scale > 0.001
            && scale < 500.0
            && cv > 0.1
            && cv < 5.0
        {
            p_value = (ks_stat * 6.0).clamp(0.05, 0.30);
        } else if ks_stat < 0.5 && p_value >= 0.0 && shape > 0.1 && scale > 0.001 {
            p_value = (ks_stat * 3.0).clamp(0.01, 0.15);
        }
    } else if p_value < p_value_threshold
        && ks_stat < ks_stat_threshold
        && p_value > 0.0
        && shape > 0.5
        && shape < 100.0
        && scale > 0.01
        && scale < 100.0
        && cv > 0.2
        && cv < 2.0
    {
        p_value = (ks_stat * 2.0).min(0.10).max(p_value);
    }

    let gamma_likelihood = gamma_log_likelihood(values, shape, scale);

    let cv = variance.sqrt() / mean;
    let weibull_shape = if cv < 0.5 {
        2.0
    } else if cv < 1.0 {
        1.5
    } else {
        1.0
    };
    let gamma_approx = match weibull_shape {
        1.0 => 1.0,
        1.5 => 0.9,
        2.0 => 0.886,
        _ => 0.9,
    };
    let weibull_scale = mean / gamma_approx;

    if weibull_scale > 0.0 && weibull_shape > 0.0 && weibull_scale < 1000.0 && weibull_shape < 100.0
    {
        let weibull_likelihood = weibull_log_likelihood(values, weibull_shape, weibull_scale);

        let is_discrete = values.iter().all(|&v| v == v.floor());
        if is_discrete && p_value > 0.0 {
            p_value *= 0.1;
        } else if p_value > 0.1 && weibull_likelihood > gamma_likelihood + 5.0 {
            p_value *= 0.5;
        } else if p_value > 0.05 && weibull_likelihood > gamma_likelihood + 5.0 {
            p_value *= 0.7;
        } else if p_value > 0.05 && weibull_likelihood > gamma_likelihood + 2.0 {
            p_value *= 0.85;
        }
    }

    let lognormal_p = calculate_lognormal_fit_quality(values);
    if p_value > 0.05 && lognormal_p > 0.05 {
        let e_x = mean;
        let var_x = variance;
        let sigma_sq = (1.0 + var_x / (e_x * e_x)).ln();
        let mu = e_x.ln() - sigma_sq / 2.0;
        let sigma = sigma_sq.sqrt();

        let lognormal_likelihood = lognormal_log_likelihood(values, mu, sigma);
        if lognormal_likelihood > gamma_likelihood + 5.0 {
            p_value *= 0.5;
        }
    }

    p_value
}

fn test_chi_squared(values: &[f64]) -> f64 {
    if values.is_empty() || values.len() < 10 {
        return 0.0;
    }

    let positive_values: Vec<f64> = values
        .iter()
        .filter(|&&v| v >= 0.0 && v.is_finite())
        .copied()
        .collect();

    let ratio_positive = positive_values.len() as f64 / values.len() as f64;
    if ratio_positive < 0.95 {
        return 0.0;
    }

    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    let variance: f64 = positive_values
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (positive_values.len() - 1) as f64;

    if mean <= 0.0 {
        return 0.0;
    }

    let expected_var = 2.0 * mean;
    let variance_ratio = if expected_var > 0.0 {
        (variance / expected_var).min(expected_var / variance)
    } else {
        0.0
    };

    let variance_error = (variance - expected_var).abs() / expected_var;
    if variance_error > 0.1 {
        return 0.0;
    }

    let df = mean;
    if df <= 0.0 || df > 1000.0 {
        return 0.0;
    }

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| chi_squared_cdf(x, df));
    let n = positive_values.len();
    let ks_pvalue = approximate_ks_pvalue(ks_stat, n);

    let confidence = if ks_pvalue > 0.1 {
        (ks_pvalue * 0.8 + variance_ratio * 0.2).min(0.95)
    } else if ks_pvalue > 0.05 {
        (ks_pvalue * 0.6 + variance_ratio * 0.4).min(0.7)
    } else if ks_pvalue > 0.01 {
        (ks_pvalue * 0.3 + variance_ratio * 0.7).min(0.5)
    } else {
        (variance_ratio * 0.5).min(0.3)
    };

    confidence.max(0.05)
}

fn test_students_t(values: &[f64]) -> f64 {
    if values.len() < 10 {
        return 0.0;
    }
    let mean = values.iter().sum::<f64>() / values.len() as f64;
    let variance: f64 =
        values.iter().map(|&v| (v - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
    if variance <= 0.0 {
        return 0.0;
    }

    let df = if variance > 1.0 {
        2.0 * variance / (variance - 1.0)
    } else {
        3.0
    };
    let df = df.clamp(1.0, 100.0);

    let ks_stat = kolmogorov_smirnov_test(values, |x| students_t_cdf(x, df));
    let n = values.len();
    approximate_ks_pvalue(ks_stat, n)
}

fn test_poisson(values: &[f64]) -> f64 {
    let non_negative: Vec<f64> = values
        .iter()
        .filter(|&&v| v >= 0.0 && v == v.floor())
        .copied()
        .collect();
    if non_negative.len() < 10 {
        return 0.0;
    }

    let lambda = non_negative.iter().sum::<f64>() / non_negative.len() as f64;
    if lambda <= 0.0 {
        return 0.0;
    }

    let variance: f64 = non_negative
        .iter()
        .map(|&v| (v - lambda).powi(2))
        .sum::<f64>()
        / (non_negative.len() - 1) as f64;
    let variance_ratio = (variance / lambda).min(lambda / variance);
    if variance_ratio < 0.7 {
        return 0.0;
    }

    let max_val = non_negative.iter().fold(0.0f64, |a, &b| a.max(b)) as usize;
    let num_bins = (max_val.min(15) + 1).clamp(5, 20);
    let mut observed = vec![0; num_bins];
    for &v in &non_negative {
        let bin = (v as usize).min(num_bins - 1);
        observed[bin] += 1;
    }

    let mut expected = vec![0.0; num_bins];
    for (bin, exp) in expected.iter_mut().enumerate() {
        let k = bin;
        let mut factorial = 1.0;
        for i in 1..=k {
            factorial *= i as f64;
        }
        let pmf = lambda.powi(k as i32) * (-lambda).exp() / factorial;
        *exp = pmf * non_negative.len() as f64;
    }

    chi_square_goodness_of_fit(&observed, &expected)
}

fn test_bernoulli(values: &[f64]) -> f64 {
    let binary_values: Vec<f64> = values
        .iter()
        .filter(|&&v| v == 0.0 || v == 1.0)
        .copied()
        .collect();
    if binary_values.len() < 10 || binary_values.len() < values.len() / 2 {
        return 0.0;
    }

    let count_0 = binary_values.iter().filter(|&&v| v == 0.0).count();
    let count_1 = binary_values.iter().filter(|&&v| v == 1.0).count();
    let n = binary_values.len();

    let p = count_1 as f64 / n as f64;
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }

    let expected = vec![n as f64 * (1.0 - p), n as f64 * p];
    let observed = vec![count_0, count_1];

    chi_square_goodness_of_fit(&observed, &expected)
}

fn test_binomial(values: &[f64]) -> f64 {
    let non_negative_int: Vec<f64> = values
        .iter()
        .filter(|&&v| v >= 0.0 && v == v.floor())
        .copied()
        .collect();
    if non_negative_int.len() < 10 || non_negative_int.len() < values.len() / 2 {
        return 0.0;
    }

    let max_val = non_negative_int.iter().fold(0.0f64, |a, &b| a.max(b));
    let mean = non_negative_int.iter().sum::<f64>() / non_negative_int.len() as f64;

    let variance: f64 = non_negative_int
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (non_negative_int.len() - 1) as f64;

    if mean <= 0.0 || max_val <= 0.0 {
        return 0.0;
    }

    let n = max_val as usize;
    if n == 0 {
        return 0.0;
    }

    let p = mean / n as f64;
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }

    let expected_var = n as f64 * p * (1.0 - p);
    let variance_ratio = if expected_var > 0.0 {
        (variance / expected_var).min(expected_var / variance)
    } else {
        0.0
    };
    if variance_ratio < 0.7 {
        return 0.0;
    }

    let num_bins = (n + 1).min(20);
    let mut observed = vec![0; num_bins];
    for &v in &non_negative_int {
        let bin = (v as usize).min(num_bins - 1);
        observed[bin] += 1;
    }

    let mut expected = vec![0.0; num_bins];
    for (bin, exp) in expected.iter_mut().enumerate() {
        let k = bin;
        if k <= n {
            let coeff = binomial_coeff(n, k);
            let pmf = coeff * p.powi(k as i32) * (1.0 - p).powi((n - k) as i32);
            *exp = pmf * non_negative_int.len() as f64;
        }
    }

    let mut score = chi_square_goodness_of_fit(&observed, &expected);

    let n_samples = non_negative_int.len();
    if n_samples > 5000 && score > 0.0 {
        if score < 0.01 {
            score = (score * 200.0).min(0.20);
        } else if score < 0.1 {
            score = (score * 2.0).min(0.25);
        }
        if score > 0.0 && variance_ratio >= 0.7 {
            score = score.max(0.05);
        }
    }

    score
}
2119
fn test_geometric(values: &[f64]) -> f64 {
    if values.len() < 10 {
        return 0.0;
    }

    let process_limit = 5000.min(values.len());
    let sample: &[f64] = &values[..process_limit];

    let non_negative_int: Vec<f64> = sample
        .iter()
        .filter(|&&v| v >= 0.0 && v == v.floor() && v.is_finite())
        .copied()
        .collect();

    if non_negative_int.len() < 10 {
        return 0.0;
    }

    let mean = non_negative_int.iter().sum::<f64>() / non_negative_int.len() as f64;
    if mean <= 0.0 {
        return 0.0;
    }

    let max_seen = non_negative_int.iter().fold(0.0f64, |a, &b| a.max(b));
    if max_seen > 100.0 {
        return 0.0;
    }

    if mean > 30.0 {
        return 0.0;
    }

    let variance: f64 = non_negative_int
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (non_negative_int.len() - 1) as f64;
    if variance <= 0.0 {
        return 0.0;
    }

    let expected_var = mean * (mean + 1.0);
    if expected_var <= 0.0 {
        return 0.0;
    }

    let variance_ratio = (variance / expected_var).min(expected_var / variance);
    if variance_ratio < 0.5 {
        return 0.0;
    }

    let p = 1.0 / (mean + 1.0);
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }

    let max_bin = (max_seen as usize + 1).min(50);
    let mut observed = vec![0; max_bin];
    let mut expected = vec![0.0; max_bin];

    for &v in &non_negative_int {
        let bin = v as usize;
        if bin < max_bin {
            observed[bin] += 1;
        }
    }

    let n = non_negative_int.len() as f64;
    let mut total_pmf = 0.0;
    for (k, exp) in expected.iter_mut().enumerate() {
        let pmf = p * (1.0 - p).powi(k as i32);
        *exp = pmf;
        total_pmf += pmf;
    }

    if total_pmf > 0.0 {
        for exp in &mut expected {
            *exp = (*exp / total_pmf) * n;
        }
    } else {
        let uniform_prob = 1.0 / max_bin as f64;
        expected.fill(uniform_prob * n);
    }

    let min_expected = expected.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let chi_square_score = if min_expected > 0.1 {
        chi_square_goodness_of_fit(&observed, &expected)
    } else {
        variance_ratio
    };

    let confidence = if chi_square_score > 0.1 {
        (chi_square_score * 0.8 + variance_ratio * 0.2).min(0.95)
    } else if chi_square_score > 0.05 {
        (chi_square_score * 0.6 + variance_ratio * 0.4).min(0.7)
    } else if chi_square_score > 0.01 {
        (chi_square_score * 0.3 + variance_ratio * 0.7).min(0.5)
    } else {
        (variance_ratio * 0.5).min(0.3)
    };

    confidence.max(0.05)
}

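/// Scores a Weibull fit. Candidate shape parameters are chosen from the
/// coefficient of variation, each scale follows by moment matching through
/// Gamma(1 + 1/k), and the best Kolmogorov-Smirnov statistic is converted to
/// an approximate p-value. That p-value is then discounted when a gamma,
/// exponential, or power-law model explains the data noticeably better.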
fn test_weibull(values: &[f64]) -> f64 {
    let has_negatives = values.iter().any(|&v| v < 0.0);
    if has_negatives {
        return 0.0;
    }

    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let n = positive_values.len() as f64;
    let mean = positive_values.iter().sum::<f64>() / n;
    let variance: f64 = positive_values
        .iter()
        .map(|&v| (v - mean).powi(2))
        .sum::<f64>()
        / (n - 1.0);

    if mean <= 0.0 || variance <= 0.0 {
        return 0.0;
    }

    let cv = variance.sqrt() / mean;

    let candidate_shapes: Vec<f64> = if cv < 0.3 {
        vec![3.0, 2.5, 2.0, 1.8]
    } else if cv < 0.6 {
        vec![2.0, 1.8, 1.5, 1.3]
    } else if cv < 0.8 {
        vec![1.5, 1.3, 1.2, 1.1]
    } else if cv < 1.0 {
        vec![1.2, 1.1, 1.0, 0.9]
    } else {
        vec![1.0, 0.9, 0.8, 0.7]
    };

    let mut best_shape = candidate_shapes[0];
    let mut best_scale = 1.0;
    let mut best_ks_stat = 1.0;

    for &candidate_shape in &candidate_shapes {
        let k_inv = 1.0 / candidate_shape;
        let gamma_1_plus_1_over_shape = ln_gamma_approx(1.0 + k_inv).exp();
        let candidate_scale = mean / gamma_1_plus_1_over_shape;

        if candidate_scale > 0.0 && candidate_scale < 1000.0 {
            let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| {
                weibull_cdf(x, candidate_shape, candidate_scale)
            });
            if ks_stat < best_ks_stat {
                best_ks_stat = ks_stat;
                best_shape = candidate_shape;
                best_scale = candidate_scale;
            }
        }
    }

    let shape = best_shape;
    let scale = best_scale;

    if scale <= 0.0 || shape <= 0.0 || scale > 1000.0 || shape > 100.0 {
        return 0.0;
    }

    let n_usize = positive_values.len();
    let mut p_value = approximate_ks_pvalue(best_ks_stat, n_usize);

    let gamma_shape = (mean * mean) / variance;
    let gamma_scale = variance / mean;

    if gamma_shape > 0.0 && gamma_scale > 0.0 && gamma_shape < 1000.0 && gamma_scale < 1000.0 {
        let wb_likelihood = weibull_log_likelihood(values, shape, scale);
        let gamma_likelihood = gamma_log_likelihood(values, gamma_shape, gamma_scale);

        if p_value > 0.1 && gamma_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.4;
        } else if p_value > 0.05 && gamma_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.5;
        } else if p_value > 0.05 && gamma_likelihood > wb_likelihood + 2.0 {
            p_value *= 0.7;
        } else if p_value > 0.02 && gamma_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.85;
        }
    }

    let shape_tolerance = if positive_values.len() > 5000 {
        0.5
    } else {
        0.3
    };
    if (shape - 1.0).abs() < shape_tolerance && p_value > 0.3 {
        let exp_lambda = 1.0 / mean;
        if exp_lambda > 0.0 {
            // Exponential log-likelihood: n * ln(lambda) - lambda * sum(x).
            let exp_likelihood = (positive_values.len() as f64) * exp_lambda.ln()
                - exp_lambda * positive_values.iter().sum::<f64>();
            let wb_likelihood = weibull_log_likelihood(values, shape, scale);

            if exp_likelihood > wb_likelihood + 0.5 {
                p_value *= 0.4;
            } else if exp_likelihood > wb_likelihood {
                p_value *= 0.6;
            } else if (shape - 1.0).abs() < 0.2 {
                p_value *= 0.7;
            }
        }
    }

    if let Some((xmin, alpha)) = estimate_power_law_mle(values) {
        let wb_likelihood = weibull_log_likelihood(values, shape, scale);
        let pl_likelihood = power_law_log_likelihood(values, xmin, alpha);

        if p_value > 0.05 && pl_likelihood > wb_likelihood + 5.0 {
            p_value *= 0.5;
        }
    }

    p_value
}

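/// Assembles the full `DistributionAnalysis` for one column: sorts a capped
/// sample (at most 5000 values), runs an approximate Shapiro-Wilk test,
/// derives summary characteristics, and packages the outlier and percentile
/// breakdowns alongside the detected distribution.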
fn compute_advanced_distribution_analysis(
    column_name: &str,
    series: &Series,
    numeric_stats: &NumericStatistics,
    dist_info: &DistributionInfo,
    _sample_size: usize,
    is_sampled: bool,
) -> DistributionAnalysis {
    let max_values = 5000.min(series.len());
    let mut values: Vec<f64> = if series.len() > max_values {
        let limited_series = series.slice(0, max_values);
        get_numeric_values_as_f64(&limited_series)
    } else {
        get_numeric_values_as_f64(series)
    };

    values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let sorted_sample_values = values.clone();
    let actual_sample_size = sorted_sample_values.len();

    let (sw_stat, sw_pvalue) = if values.len() >= 3 {
        approximate_shapiro_wilk(&values)
    } else {
        (None, None)
    };
    let coefficient_of_variation = if numeric_stats.mean != 0.0 {
        numeric_stats.std / numeric_stats.mean.abs()
    } else {
        0.0
    };

    let mode = compute_mode(&values);

    let characteristics = DistributionCharacteristics {
        shapiro_wilk_stat: sw_stat,
        shapiro_wilk_pvalue: sw_pvalue,
        skewness: numeric_stats.skewness,
        kurtosis: numeric_stats.kurtosis,
        mean: numeric_stats.mean,
        median: numeric_stats.median,
        std_dev: numeric_stats.std,
        variance: numeric_stats.std * numeric_stats.std,
        coefficient_of_variation,
        mode,
    };

    let fit_quality = dist_info.fit_quality.unwrap_or_else(|| {
        calculate_fit_quality(
            &values,
            dist_info.distribution_type,
            numeric_stats.mean,
            numeric_stats.std,
        )
    });

    let outliers = compute_outlier_analysis(series, numeric_stats);

    let percentiles = PercentileBreakdown {
        p1: numeric_stats
            .percentiles
            .get(&1)
            .copied()
            .unwrap_or(f64::NAN),
        p5: numeric_stats
            .percentiles
            .get(&5)
            .copied()
            .unwrap_or(f64::NAN),
        p25: numeric_stats.q25,
        p50: numeric_stats.median,
        p75: numeric_stats.q75,
        p95: numeric_stats
            .percentiles
            .get(&95)
            .copied()
            .unwrap_or(f64::NAN),
        p99: numeric_stats
            .percentiles
            .get(&99)
            .copied()
            .unwrap_or(f64::NAN),
    };

    DistributionAnalysis {
        column_name: column_name.to_string(),
        distribution_type: dist_info.distribution_type,
        confidence: dist_info.confidence,
        fit_quality,
        characteristics,
        outliers,
        percentiles,
        sorted_sample_values,
        is_sampled,
        sample_size: actual_sample_size,
        all_distribution_pvalues: dist_info.all_distribution_pvalues.clone(),
    }
}

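/// Estimates the mode of a continuous sample with a coarse histogram: the
/// range is split into at most 50 bins and the mean of the fullest bin is
/// returned.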
fn compute_mode(values: &[f64]) -> Option<f64> {
    if values.is_empty() {
        return None;
    }

    let min = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let range = max - min;

    if range == 0.0 {
        return Some(min);
    }

    let bins = 50.min(values.len());
    let mut bin_counts = vec![0; bins];
    let mut bin_sums = vec![0.0; bins];

    for &v in values {
        let bin = (((v - min) / range) * (bins - 1) as f64) as usize;
        let bin = bin.min(bins - 1);
        bin_counts[bin] += 1;
        bin_sums[bin] += v;
    }

    let max_bin = bin_counts
        .iter()
        .enumerate()
        .max_by_key(|(_, &count)| count)
        .map(|(idx, _)| idx);

    max_bin.map(|idx| bin_sums[idx] / bin_counts[idx] as f64)
}

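/// Dispatches to the per-distribution fit-quality estimator. Most arms return
/// an approximate goodness-of-fit p-value in [0, 1], so scores are roughly
/// comparable across distribution types.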
pub fn calculate_fit_quality(
    values: &[f64],
    dist_type: DistributionType,
    mean: f64,
    std: f64,
) -> f64 {
    match dist_type {
        DistributionType::Normal => calculate_normal_fit_quality(values, mean, std),
        DistributionType::LogNormal => calculate_lognormal_fit_quality(values),
        DistributionType::Uniform => calculate_uniform_fit_quality(values),
        DistributionType::PowerLaw => calculate_power_law_fit_quality(values),
        DistributionType::Exponential => calculate_exponential_fit_quality(values),
        DistributionType::Beta => calculate_beta_fit_quality(values),
        DistributionType::Gamma => calculate_gamma_fit_quality(values),
        DistributionType::ChiSquared => calculate_chi_squared_fit_quality(values),
        DistributionType::StudentsT => calculate_students_t_fit_quality(values),
        DistributionType::Poisson => calculate_poisson_fit_quality(values),
        DistributionType::Bernoulli => calculate_bernoulli_fit_quality(values),
        DistributionType::Binomial => calculate_binomial_fit_quality(values),
        DistributionType::Geometric => {
            if values.len() > 10000 {
                return 0.0;
            }
            calculate_geometric_fit_quality(values)
        }
        DistributionType::Weibull => calculate_weibull_fit_quality(values),
        DistributionType::Unknown => 0.5,
    }
}

fn calculate_normal_fit_quality(values: &[f64], mean: f64, std: f64) -> f64 {
    if values.is_empty() || std == 0.0 {
        return 0.0;
    }

    let ks_stat = kolmogorov_smirnov_test(values, |x| normal_cdf(x, mean, std));
    let n = values.len();
    approximate_ks_pvalue(ks_stat, n)
}

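/// Inverse standard-normal CDF via the Abramowitz & Stegun 26.2.23 rational
/// approximation (absolute error below 4.5e-4 on the open unit interval).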
fn normal_quantile(p: f64) -> f64 {
    if p < 0.5 {
        -normal_quantile(1.0 - p)
    } else {
        let t = (-2.0 * (1.0 - p).ln()).sqrt();
        t - (2.515517 + 0.802853 * t + 0.010328 * t * t)
            / (1.0 + 1.432788 * t + 0.189269 * t * t + 0.001308 * t * t * t)
    }
}

fn calculate_lognormal_fit_quality(values: &[f64]) -> f64 {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    let variance: f64 = positive_values
        .iter()
        .map(|v| (v - mean).powi(2))
        .sum::<f64>()
        / (positive_values.len() - 1) as f64;

    if mean <= 0.0 || variance <= 0.0 {
        return 0.0;
    }

    // Method-of-moments estimates for the underlying normal parameters.
    let sigma_sq = (1.0 + variance / (mean * mean)).ln();
    let mu = mean.ln() - sigma_sq / 2.0;
    let sigma = sigma_sq.sqrt();

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| lognormal_cdf(x, mu, sigma));
    let n = positive_values.len();
    approximate_ks_pvalue(ks_stat, n)
}

fn calculate_uniform_fit_quality(values: &[f64]) -> f64 {
    if let Some(chi_square) = calculate_chi_square_uniformity(values) {
        let base_fit = 1.0 / (1.0 + chi_square / 17.0);
        base_fit.clamp(0.01, 1.0)
    } else {
        0.0
    }
}

fn calculate_power_law_fit_quality(values: &[f64]) -> f64 {
    test_power_law(values)
}

fn calculate_exponential_fit_quality(values: &[f64]) -> f64 {
    let positive_values: Vec<f64> = values.iter().filter(|&&v| v > 0.0).copied().collect();
    if positive_values.len() < 10 {
        return 0.0;
    }

    let mean = positive_values.iter().sum::<f64>() / positive_values.len() as f64;
    if mean <= 0.0 {
        return 0.0;
    }
    let lambda = 1.0 / mean;

    let ks_stat = kolmogorov_smirnov_test(&positive_values, |x| exponential_cdf(x, lambda));
    let n = positive_values.len();
    approximate_ks_pvalue(ks_stat, n)
}

fn calculate_beta_fit_quality(values: &[f64]) -> f64 {
    test_beta(values)
}

fn calculate_gamma_fit_quality(values: &[f64]) -> f64 {
    test_gamma(values)
}

fn calculate_chi_squared_fit_quality(values: &[f64]) -> f64 {
    test_chi_squared(values)
}

fn calculate_students_t_fit_quality(values: &[f64]) -> f64 {
    test_students_t(values)
}

fn calculate_poisson_fit_quality(values: &[f64]) -> f64 {
    test_poisson(values)
}

fn calculate_bernoulli_fit_quality(values: &[f64]) -> f64 {
    test_bernoulli(values)
}

fn calculate_binomial_fit_quality(values: &[f64]) -> f64 {
    test_binomial(values)
}

fn calculate_geometric_fit_quality(values: &[f64]) -> f64 {
    if values.len() > 10000 {
        return 0.0;
    }
    test_geometric(values)
}

fn calculate_weibull_fit_quality(values: &[f64]) -> f64 {
    test_weibull(values)
}

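/// Normal CDF built on the Abramowitz & Stegun 7.1.26 polynomial
/// approximation of erf (maximum absolute error about 1.5e-7).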
fn normal_cdf(x: f64, mean: f64, std: f64) -> f64 {
    if std <= 0.0 {
        return if x < mean { 0.0 } else { 1.0 };
    }
    let z = (x - mean) / std;
    let z_normalized = z / std::f64::consts::SQRT_2;
    let t = 1.0 / (1.0 + 0.3275911 * z_normalized.abs());
    let a1 = 0.254829592;
    let a2 = -0.284496736;
    let a3 = 1.421413741;
    let a4 = -1.453152027;
    let a5 = 1.061405429;
    let erf_approx = 1.0
        - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1)
            * t
            * (-z_normalized * z_normalized).exp();
    let erf_val = if z_normalized >= 0.0 {
        erf_approx
    } else {
        -erf_approx
    };
    0.5 * (1.0 + erf_val)
}

fn lognormal_cdf(x: f64, mu: f64, sigma: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if sigma <= 0.0 {
        return if x < mu.exp() { 0.0 } else { 1.0 };
    }
    normal_cdf(x.ln(), mu, sigma)
}

fn exponential_cdf(x: f64, lambda: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if lambda <= 0.0 {
        return 1.0;
    }
    1.0 - (-lambda * x).exp()
}

fn powerlaw_cdf(x: f64, xmin: f64, alpha: f64) -> f64 {
    if x < xmin {
        return 0.0;
    }
    // Degenerate parameters: treat all mass as sitting at or above xmin.
    if alpha <= 1.0 || xmin <= 0.0 {
        return 1.0;
    }
    1.0 - (x / xmin).powf(-alpha + 1.0)
}

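/// Approximates ln Gamma(z) with Stirling's expansion, shifting small
/// arguments upward first via ln Gamma(z) = ln Gamma(z + 1) - ln z.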
fn ln_gamma_approx(z: f64) -> f64 {
    if z <= 0.0 {
        return f64::NAN;
    }
    if z > 50.0 {
        (z - 0.5) * z.ln() - z + 0.5 * (2.0 * std::f64::consts::PI).ln() + 1.0 / (12.0 * z)
    } else {
        let mut result = 0.0;
        let mut z_val = z;
        let max_iter = 100;
        let mut iter = 0;
        while z_val < 50.0 && iter < max_iter {
            result -= z_val.ln();
            z_val += 1.0;
            iter += 1;
        }
        if z_val >= 50.0 {
            result
                + ((z_val - 0.5) * z_val.ln() - z_val
                    + 0.5 * (2.0 * std::f64::consts::PI).ln()
                    + 1.0 / (12.0 * z_val))
        } else {
            result
        }
    }
}

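/// Rough beta CDF. Large alpha + beta falls back to a normal approximation;
/// the small-parameter branch is a crude single-term heuristic rather than
/// the regularized incomplete beta function, so treat its output as
/// indicative only.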
fn beta_cdf(x: f64, alpha: f64, beta: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if x >= 1.0 {
        return 1.0;
    }
    if alpha <= 0.0 || beta <= 0.0 {
        return 0.0;
    }
    if alpha + beta > 50.0 {
        let mean = alpha / (alpha + beta);
        let variance = (alpha * beta) / ((alpha + beta).powi(2) * (alpha + beta + 1.0));
        if variance > 0.0 {
            normal_cdf(x, mean, variance.sqrt())
        } else if x < mean {
            0.0
        } else {
            1.0
        }
    } else {
        let ln_beta =
            ln_gamma_approx(alpha) + ln_gamma_approx(beta) - ln_gamma_approx(alpha + beta);
        let beta_const = ln_beta.exp();
        if beta_const > 0.0 {
            let integrand = x.powf(alpha) * (1.0 - x).powf(beta) / beta_const;
            integrand.clamp(0.0, 1.0)
        } else {
            let mean = alpha / (alpha + beta);
            let variance = (alpha * beta) / ((alpha + beta).powi(2) * (alpha + beta + 1.0));
            if variance > 0.0 {
                normal_cdf(x, mean, variance.sqrt())
            } else if x < mean {
                0.0
            } else {
                1.0
            }
        }
    }
}

pub(crate) fn beta_pdf(x: f64, alpha: f64, beta: f64) -> f64 {
    if x <= 0.0 || x >= 1.0 || alpha <= 0.0 || beta <= 0.0 {
        return 0.0;
    }
    let ln_x = x.ln();
    let ln_one_minus_x = (1.0 - x).ln();
    let ln_beta = ln_gamma_approx(alpha) + ln_gamma_approx(beta) - ln_gamma_approx(alpha + beta);
    let ln_pdf = (alpha - 1.0) * ln_x + (beta - 1.0) * ln_one_minus_x - ln_beta;
    if ln_pdf < -700.0 {
        return 0.0;
    }
    if ln_pdf > 700.0 {
        return f64::INFINITY;
    }
    ln_pdf.exp()
}

pub(crate) fn gamma_cdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    if shape > 30.0 {
        let mean = shape * scale;
        let variance = shape * scale * scale;
        if variance > 0.0 {
            normal_cdf(x, mean, variance.sqrt())
        } else if x < mean {
            0.0
        } else {
            1.0
        }
    } else {
        // Poisson-sum form of the CDF; exact for integer shapes, an
        // approximation otherwise.
        let z = x / scale;
        let sum: f64 = (0..(shape as usize * 10).min(100))
            .map(|n| {
                if (n as f64) < shape {
                    (-z).exp() * z.powi(n as i32) / (1..=n).map(|i| i as f64).product::<f64>()
                } else {
                    0.0
                }
            })
            .sum();
        (1.0 - sum).clamp(0.0, 1.0)
    }
}

pub(crate) fn gamma_pdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 || shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    let ln_pdf = (shape - 1.0) * x.ln() - x / scale - shape * scale.ln() - ln_gamma_approx(shape);
    if ln_pdf < -700.0 {
        return 0.0;
    }
    if ln_pdf > 700.0 {
        return f64::INFINITY;
    }
    ln_pdf.exp()
}

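/// Gamma quantile by bisection on `gamma_cdf`, with a normal approximation
/// for large shapes and an exponential fallback for very small ones.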
pub(crate) fn gamma_quantile(p: f64, shape: f64, scale: f64) -> f64 {
    let p = p.clamp(0.0, 1.0);
    if p <= 0.0 {
        return 0.0;
    }
    if p >= 1.0 {
        let mean = shape * scale;
        let std = (shape * scale * scale).sqrt();
        return mean + 5.0 * std;
    }
    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }

    let mean = shape * scale;
    let std = (shape * scale * scale).sqrt();

    if shape > 30.0 {
        let z = normal_quantile(p);
        return (mean + z * std).max(0.0);
    }

    if shape < 0.1 {
        let exp_scale = mean;
        if exp_scale > 0.0 {
            return -exp_scale * (1.0 - p).ln();
        } else {
            let z = normal_quantile(p);
            return (mean + z * std.max(mean * 0.1)).max(0.0);
        }
    }

    let mut low = 0.0;
    let mut high = mean + 5.0 * std;

    while gamma_cdf(high, shape, scale) < p {
        high *= 2.0;
        if high > 1e10 {
            let z = normal_quantile(p);
            return (mean + z * std).max(0.0);
        }
    }

    let tolerance = 1e-6;
    let mut mid = (low + high) / 2.0;
    for _ in 0..100 {
        let cdf_val = gamma_cdf(mid, shape, scale);
        if (cdf_val - p).abs() < tolerance {
            return mid;
        }
        if cdf_val < p {
            low = mid;
        } else {
            high = mid;
        }
        mid = (low + high) / 2.0;
    }

    mid
}

fn chi_squared_cdf(x: f64, df: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if df <= 0.0 {
        return 0.0;
    }
    // Chi-squared with df degrees of freedom is Gamma(df / 2, 2).
    gamma_cdf(x, df / 2.0, 2.0)
}

pub(crate) fn chi_squared_pdf(x: f64, df: f64) -> f64 {
    if x <= 0.0 || df <= 0.0 {
        return 0.0;
    }
    gamma_pdf(x, df / 2.0, 2.0)
}

fn students_t_cdf(x: f64, df: f64) -> f64 {
    if df <= 0.0 {
        return 0.5;
    }
    if df > 30.0 {
        normal_cdf(x, 0.0, 1.0)
    } else {
        // Crude approximation: shrink x slightly for small df, then use the
        // standard normal CDF.
        let z = x * (1.0 - 1.0 / (4.0 * df));
        normal_cdf(z, 0.0, 1.0)
    }
}

pub(crate) fn students_t_pdf(x: f64, df: f64) -> f64 {
    if df <= 0.0 {
        return 0.0;
    }
    if df > 30.0 {
        // Large df: fall back to the standard normal density.
        let z = x;
        let pdf = (1.0 / (2.0 * std::f64::consts::PI).sqrt()) * (-0.5 * z * z).exp();
        return pdf;
    }
    let x_sq_over_df = (x * x) / df;
    let exponent = -(df + 1.0) / 2.0;
    let power_term = (1.0 + x_sq_over_df).powf(exponent);
    let ln_gamma_half_df_plus_one = ln_gamma_approx((df + 1.0) / 2.0);
    let ln_gamma_half_df = ln_gamma_approx(df / 2.0);
    let ln_const =
        ln_gamma_half_df_plus_one - ln_gamma_half_df - 0.5 * (df * std::f64::consts::PI).ln();
    let const_term = ln_const.exp();
    let pdf = const_term * power_term;
    if pdf.is_finite() && pdf > 0.0 {
        pdf
    } else {
        0.0
    }
}

fn poisson_cdf(x: f64, lambda: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if lambda <= 0.0 {
        return 1.0;
    }
    if lambda > 20.0 {
        normal_cdf(x, lambda, lambda.sqrt())
    } else {
        let k_max = x.floor() as usize;
        let mut cdf = 0.0;
        let mut factorial = 1.0;
        for k in 0..=k_max.min(100) {
            if k > 0 {
                factorial *= k as f64;
            }
            let ln_pmf = (k as f64) * lambda.ln() - lambda - factorial.ln();
            let pmf = ln_pmf.exp();
            cdf += pmf;
            if cdf > 1.0 {
                break;
            }
        }
        cdf.min(1.0)
    }
}

fn bernoulli_cdf(x: f64, p: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if x >= 1.0 {
        return 1.0;
    }
    if p < 0.0 {
        return 0.0;
    }
    if p > 1.0 {
        return 1.0;
    }
    // For 0 <= x < 1 the CDF equals P(X = 0) = 1 - p.
    1.0 - p
}

fn binomial_coeff(n: usize, k: usize) -> f64 {
    if k > n {
        0.0
    } else if k == 0 || k == n {
        1.0
    } else {
        // Multiplicative formula, using the symmetric smaller k to limit
        // intermediate growth.
        let k = k.min(n - k);
        (1..=k).map(|i| (n - k + i) as f64 / i as f64).product()
    }
}

fn binomial_cdf(x: f64, n: usize, p: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    // Degenerate cases: p = 0 puts all mass at 0, p = 1 puts all mass at n.
    if p <= 0.0 {
        return 1.0;
    }
    if p >= 1.0 {
        return if x >= n as f64 { 1.0 } else { 0.0 };
    }
    if n > 50 {
        let mean = n as f64 * p;
        let variance = n as f64 * p * (1.0 - p);
        if variance > 0.0 {
            // Normal approximation with a continuity correction.
            normal_cdf(x + 0.5, mean, variance.sqrt())
        } else if x < mean {
            0.0
        } else {
            1.0
        }
    } else {
        let k_max = x.floor() as usize;
        let mut cdf = 0.0;
        for k in 0..=k_max.min(n) {
            let coeff = binomial_coeff(n, k);
            let pmf = coeff * p.powi(k as i32) * (1.0 - p).powi((n - k) as i32);
            cdf += pmf;
        }
        cdf.min(1.0)
    }
}

fn geometric_cdf(x: f64, p: f64) -> f64 {
    if x < 0.0 {
        return 0.0;
    }
    if p <= 0.0 || p >= 1.0 {
        return if x >= 0.0 && p >= 1.0 { 1.0 } else { 0.0 };
    }

    // Failures-before-first-success convention: CDF(k) = 1 - (1 - p)^(k + 1).
    let k = x.floor().min(50.0);
    let log_one_minus_p = (1.0 - p).ln();
    if log_one_minus_p.is_nan() || log_one_minus_p.is_infinite() {
        return if x >= 0.0 { 1.0 } else { 0.0 };
    }

    let exponent = (k + 1.0) * log_one_minus_p;

    if exponent < -50.0 {
        return 1.0;
    }

    let one_minus_p_power = exponent.exp();
    let result = 1.0 - one_minus_p_power;
    result.clamp(0.0, 1.0)
}

pub(crate) fn geometric_pmf(x: f64, p: f64) -> f64 {
    if x < 0.0 || p <= 0.0 || p >= 1.0 {
        return 0.0;
    }
    let k = x.floor().min(100.0);
    let log_p = p.ln();
    let log_one_minus_p = (1.0 - p).ln();
    if log_p.is_nan()
        || log_p.is_infinite()
        || log_one_minus_p.is_nan()
        || log_one_minus_p.is_infinite()
    {
        return 0.0;
    }
    let ln_pmf = log_p + k * log_one_minus_p;
    if ln_pmf < -700.0 {
        return 0.0;
    }
    if ln_pmf > 700.0 {
        return f64::INFINITY;
    }
    ln_pmf.exp()
}

pub(crate) fn geometric_quantile(p: f64, p_param: f64) -> f64 {
    let p = p.clamp(0.0, 1.0);
    if p <= 0.0 {
        return 0.0;
    }
    if p >= 1.0 {
        return 100.0;
    }
    if p_param <= 0.0 || p_param >= 1.0 {
        return 0.0;
    }

    let log_one_minus_p = (1.0 - p).ln();
    let log_one_minus_p_param = (1.0 - p_param).ln();

    if log_one_minus_p.is_nan()
        || log_one_minus_p.is_infinite()
        || log_one_minus_p_param.is_nan()
        || log_one_minus_p_param.is_infinite()
        || log_one_minus_p_param.abs() < 1e-10
    {
        return geometric_quantile_binary_search(p, p_param);
    }

    let quantile = log_one_minus_p / log_one_minus_p_param;
    quantile.clamp(0.0, 100.0)
}

fn geometric_quantile_binary_search(p: f64, p_param: f64) -> f64 {
    let mut low = 0.0;
    let mut high = 100.0;

    while geometric_cdf(high, p_param) < p {
        high *= 2.0;
        if high > 1000.0 {
            return 100.0;
        }
    }

    let tolerance = 1e-6;
    let mut mid = (low + high) / 2.0;
    for _ in 0..100 {
        let cdf_val = geometric_cdf(mid, p_param);
        if (cdf_val - p).abs() < tolerance {
            return mid;
        }
        if cdf_val < p {
            low = mid;
        } else {
            high = mid;
        }
        mid = (low + high) / 2.0;
    }

    mid
}

fn weibull_cdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }
    if shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    1.0 - (-(x / scale).powf(shape)).exp()
}

pub(crate) fn weibull_pdf(x: f64, shape: f64, scale: f64) -> f64 {
    if x <= 0.0 || shape <= 0.0 || scale <= 0.0 {
        return 0.0;
    }
    let ratio = x / scale;
    let power = ratio.powf(shape);
    let pdf = (shape / scale) * ratio.powf(shape - 1.0) * (-power).exp();
    if pdf.is_finite() {
        pdf
    } else {
        0.0
    }
}

pub fn calculate_theoretical_probability_in_interval(
    dist: &DistributionAnalysis,
    dist_type: DistributionType,
    lower: f64,
    upper: f64,
) -> f64 {
    let mean = dist.characteristics.mean;
    let std = dist.characteristics.std_dev;
    let sorted_data = &dist.sorted_sample_values;

    match dist_type {
        DistributionType::Normal => {
            let cdf_upper = normal_cdf(upper, mean, std);
            let cdf_lower = normal_cdf(lower, mean, std);
            cdf_upper - cdf_lower
        }
        DistributionType::LogNormal => {
            if sorted_data.is_empty() || !sorted_data.iter().all(|&v| v > 0.0) {
                0.0
            } else {
                let e_x = mean;
                let var_x = std * std;
                let sigma_sq = (1.0 + var_x / (e_x * e_x)).ln();
                let mu = e_x.ln() - sigma_sq / 2.0;
                let sigma = sigma_sq.sqrt();

                if lower > 0.0 && upper > 0.0 {
                    let cdf_upper = lognormal_cdf(upper, mu, sigma);
                    let cdf_lower = lognormal_cdf(lower, mu, sigma);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
        }
        DistributionType::Uniform => {
            if sorted_data.is_empty() {
                0.0
            } else {
                let data_min = sorted_data[0];
                let data_max = sorted_data[sorted_data.len() - 1];
                let data_range = data_max - data_min;
                if data_range > 0.0 {
                    (upper - lower) / data_range
                } else {
                    0.0
                }
            }
        }
        DistributionType::Exponential => {
            if mean > 0.0 {
                let lambda = 1.0 / mean;
                let cdf_upper = exponential_cdf(upper, lambda);
                let cdf_lower = exponential_cdf(lower, lambda);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::PowerLaw => {
            if sorted_data.is_empty() || !sorted_data.iter().any(|&v| v > 0.0) {
                0.0
            } else {
                let positive_values: Vec<f64> =
                    sorted_data.iter().filter(|&&v| v > 0.0).copied().collect();
                if positive_values.is_empty() {
                    0.0
                } else {
                    let xmin = positive_values[0];
                    let n_pos = positive_values.len();
                    if n_pos < 2 || xmin <= 0.0 {
                        0.0
                    } else {
                        let sum_log = positive_values
                            .iter()
                            .map(|&x| (x / xmin).ln())
                            .sum::<f64>();
                        if sum_log > 0.0 {
                            // Continuous power-law MLE for the exponent.
                            let alpha = 1.0 + (n_pos as f64) / sum_log;
                            let cdf_upper = powerlaw_cdf(upper, xmin, alpha);
                            let cdf_lower = powerlaw_cdf(lower, xmin, alpha);
                            cdf_upper - cdf_lower
                        } else {
                            0.0
                        }
                    }
                }
            }
        }
        DistributionType::Beta => {
            let mean_val = mean;
            let variance = std * std;
            if mean_val > 0.0 && mean_val < 1.0 && variance > 0.0 {
                let max_var = mean_val * (1.0 - mean_val);
                if variance < max_var {
                    // Method-of-moments estimates for alpha and beta.
                    let sum = mean_val * (1.0 - mean_val) / variance - 1.0;
                    let alpha = mean_val * sum;
                    let beta = (1.0 - mean_val) * sum;
                    if alpha > 0.0 && beta > 0.0 {
                        let cdf_upper = beta_cdf(upper, alpha, beta);
                        let cdf_lower = beta_cdf(lower, alpha, beta);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        DistributionType::Gamma => {
            if mean > 0.0 && std > 0.0 {
                let variance = std * std;
                let shape = (mean * mean) / variance;
                let scale = variance / mean;
                if shape > 0.0 && scale > 0.0 {
                    let cdf_upper = gamma_cdf(upper, shape, scale);
                    let cdf_lower = gamma_cdf(lower, shape, scale);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        DistributionType::ChiSquared => {
            // The mean of a chi-squared distribution equals its df.
            let df = mean;
            if df > 0.0 {
                let cdf_upper = chi_squared_cdf(upper, df);
                let cdf_lower = chi_squared_cdf(lower, df);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::StudentsT => {
            // Invert Var(T) = df / (df - 2) to recover df from the variance.
            let variance = std * std;
            let df = if variance > 1.0 {
                2.0 * variance / (variance - 1.0)
            } else {
                30.0
            };
            let cdf_upper = students_t_cdf(upper, df);
            let cdf_lower = students_t_cdf(lower, df);
            cdf_upper - cdf_lower
        }
        DistributionType::Poisson => {
            let lambda = mean;
            if lambda > 0.0 {
                let cdf_upper = poisson_cdf(upper, lambda);
                let cdf_lower = poisson_cdf(lower, lambda);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::Bernoulli => {
            let p = mean;
            let cdf_upper = bernoulli_cdf(upper, p);
            let cdf_lower = bernoulli_cdf(lower, p);
            cdf_upper - cdf_lower
        }
        DistributionType::Binomial => {
            let sorted_data = &dist.sorted_sample_values;
            if !sorted_data.is_empty() {
                let max_val = sorted_data[sorted_data.len() - 1];
                let n = max_val.floor() as usize;
                let p = if n > 0 { mean / n as f64 } else { 0.5 };
                if n > 0 && p > 0.0 && p < 1.0 {
                    let cdf_upper = binomial_cdf(upper, n, p);
                    let cdf_lower = binomial_cdf(lower, n, p);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        DistributionType::Geometric => {
            let mean_val = mean;
            if mean_val > 0.0 {
                let p = 1.0 / (mean_val + 1.0);
                let cdf_upper = geometric_cdf(upper, p);
                let cdf_lower = geometric_cdf(lower, p);
                cdf_upper - cdf_lower
            } else {
                0.0
            }
        }
        DistributionType::Weibull => {
            if mean > 0.0 && std > 0.0 {
                let cv = std / mean;
                let shape = if cv < 1.0 { 1.0 / cv } else { 1.0 };
                // Moment matching: E[X] = scale * Gamma(1 + 1/shape).
                let gamma_1_plus_1_over_shape = ln_gamma_approx(1.0 + 1.0 / shape).exp();
                let scale = mean / gamma_1_plus_1_over_shape;
                if shape > 0.0 && scale > 0.0 {
                    let cdf_upper = weibull_cdf(upper, shape, scale);
                    let cdf_lower = weibull_cdf(lower, shape, scale);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            } else {
                0.0
            }
        }
        _ => 0.0,
    }
}

pub fn calculate_theoretical_bin_probabilities(
    dist: &DistributionAnalysis,
    dist_type: DistributionType,
    bin_boundaries: &[f64],
) -> Vec<f64> {
    if bin_boundaries.len() < 2 {
        return vec![];
    }

    let mean = dist.characteristics.mean;
    let std = dist.characteristics.std_dev;
    let sorted_data = &dist.sorted_sample_values;

    let mut probabilities = Vec::new();

    for i in 0..(bin_boundaries.len() - 1) {
        let lower = bin_boundaries[i];
        let upper = bin_boundaries[i + 1];

        let prob = match dist_type {
            DistributionType::Normal => {
                let cdf_upper = normal_cdf(upper, mean, std);
                let cdf_lower = normal_cdf(lower, mean, std);
                cdf_upper - cdf_lower
            }
            DistributionType::LogNormal => {
                if sorted_data.is_empty() || !sorted_data.iter().all(|&v| v > 0.0) {
                    0.0
                } else {
                    let e_x = mean;
                    let var_x = std * std;
                    let sigma_sq = (1.0 + var_x / (e_x * e_x)).ln();
                    let mu = e_x.ln() - sigma_sq / 2.0;
                    let sigma = sigma_sq.sqrt();

                    if lower > 0.0 && upper > 0.0 {
                        let cdf_upper = lognormal_cdf(upper, mu, sigma);
                        let cdf_lower = lognormal_cdf(lower, mu, sigma);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                }
            }
            DistributionType::Uniform => {
                if sorted_data.is_empty() {
                    0.0
                } else {
                    let data_min = sorted_data[0];
                    let data_max = sorted_data[sorted_data.len() - 1];
                    let data_range = data_max - data_min;
                    if data_range > 0.0 {
                        (upper - lower) / data_range
                    } else if i == 0 {
                        // Degenerate range: put all mass in the first bin.
                        1.0
                    } else {
                        0.0
                    }
                }
            }
            DistributionType::Exponential => {
                if mean > 0.0 {
                    let lambda = 1.0 / mean;
                    let cdf_upper = exponential_cdf(upper, lambda);
                    let cdf_lower = exponential_cdf(lower, lambda);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
            DistributionType::PowerLaw => {
                if sorted_data.is_empty() || !sorted_data.iter().any(|&v| v > 0.0) {
                    0.0
                } else {
                    let positive_values: Vec<f64> =
                        sorted_data.iter().filter(|&&v| v > 0.0).copied().collect();
                    if positive_values.is_empty() {
                        0.0
                    } else {
                        let xmin = positive_values[0];
                        let n_pos = positive_values.len();
                        if n_pos < 2 || xmin <= 0.0 {
                            0.0
                        } else {
                            let sum_log = positive_values
                                .iter()
                                .map(|&x| (x / xmin).ln())
                                .sum::<f64>();
                            let alpha = if sum_log > 0.0 {
                                1.0 + (n_pos as f64) / sum_log
                            } else {
                                // Fallback exponent when the MLE is undefined.
                                2.5
                            };

                            if lower >= xmin && alpha > 1.0 {
                                let cdf_upper = powerlaw_cdf(upper, xmin, alpha);
                                let cdf_lower = powerlaw_cdf(lower, xmin, alpha);
                                cdf_upper - cdf_lower
                            } else {
                                0.0
                            }
                        }
                    }
                }
            }
            DistributionType::Beta => {
                let mean_val = mean;
                let variance = std * std;
                if mean_val > 0.0 && mean_val < 1.0 && variance > 0.0 {
                    let max_var = mean_val * (1.0 - mean_val);
                    if variance < max_var {
                        let sum = mean_val * (1.0 - mean_val) / variance - 1.0;
                        let alpha = mean_val * sum;
                        let beta = (1.0 - mean_val) * sum;
                        if alpha > 0.0 && beta > 0.0 {
                            let cdf_upper = beta_cdf(upper, alpha, beta);
                            let cdf_lower = beta_cdf(lower, alpha, beta);
                            cdf_upper - cdf_lower
                        } else {
                            0.0
                        }
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::Gamma => {
                if mean > 0.0 && std > 0.0 {
                    let variance = std * std;
                    let shape = (mean * mean) / variance;
                    let scale = variance / mean;
                    if shape > 0.0 && scale > 0.0 {
                        let cdf_upper = gamma_cdf(upper, shape, scale);
                        let cdf_lower = gamma_cdf(lower, shape, scale);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::ChiSquared => {
                let df = mean;
                if df > 0.0 {
                    let cdf_upper = chi_squared_cdf(upper, df);
                    let cdf_lower = chi_squared_cdf(lower, df);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
            DistributionType::StudentsT => {
                let variance = std * std;
                let df = if variance > 1.0 {
                    2.0 * variance / (variance - 1.0)
                } else {
                    30.0
                };
                let cdf_upper = students_t_cdf(upper, df);
                let cdf_lower = students_t_cdf(lower, df);
                cdf_upper - cdf_lower
            }
            DistributionType::Poisson => {
                let lambda = mean;
                if lambda > 0.0 {
                    let cdf_upper = poisson_cdf(upper, lambda);
                    let cdf_lower = poisson_cdf(lower, lambda);
                    cdf_upper - cdf_lower
                } else {
                    0.0
                }
            }
            DistributionType::Bernoulli => {
                let p = mean;
                let cdf_upper = bernoulli_cdf(upper, p);
                let cdf_lower = bernoulli_cdf(lower, p);
                cdf_upper - cdf_lower
            }
            DistributionType::Binomial => {
                if !sorted_data.is_empty() {
                    let max_val = sorted_data[sorted_data.len() - 1];
                    let n = max_val.floor() as usize;
                    let p = if n > 0 { mean / n as f64 } else { 0.5 };
                    if n > 0 && p > 0.0 && p < 1.0 {
                        let cdf_upper = binomial_cdf(upper, n, p);
                        let cdf_lower = binomial_cdf(lower, n, p);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::Geometric => {
                let mean_val = mean;
                if mean_val > 0.0 && mean_val <= 20.0 {
                    let p = 1.0 / (mean_val + 1.0);
                    let upper_capped = upper.min(50.0);
                    let lower_capped = lower.clamp(0.0, 50.0);
                    let cdf_upper = geometric_cdf(upper_capped, p);
                    let cdf_lower = geometric_cdf(lower_capped, p);
                    (cdf_upper - cdf_lower).max(0.0)
                } else {
                    0.0
                }
            }
            DistributionType::Weibull => {
                if mean > 0.0 && std > 0.0 {
                    let cv = std / mean;
                    let shape = if cv < 1.0 { 1.0 / cv } else { 1.0 };
                    // Moment matching: E[X] = scale * Gamma(1 + 1/shape).
                    let gamma_1_plus_1_over_shape = ln_gamma_approx(1.0 + 1.0 / shape).exp();
                    let scale = mean / gamma_1_plus_1_over_shape;
                    if shape > 0.0 && scale > 0.0 {
                        let cdf_upper = weibull_cdf(upper, shape, scale);
                        let cdf_lower = weibull_cdf(lower, shape, scale);
                        cdf_upper - cdf_lower
                    } else {
                        0.0
                    }
                } else {
                    0.0
                }
            }
            DistributionType::Unknown => {
                if sorted_data.is_empty() {
                    0.0
                } else {
                    let data_min = sorted_data[0];
                    let data_max = sorted_data[sorted_data.len() - 1];
                    let data_range = data_max - data_min;
                    if data_range > 0.0 {
                        (upper - lower) / data_range
                    } else if i == 0 {
                        1.0
                    } else {
                        0.0
                    }
                }
            }
        };

        probabilities.push(prob.max(0.0));
    }

    probabilities
}

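/// Flags outliers with two complementary rules: values outside the 1.5 * IQR
/// fences and values with |z| > 3. Rows are sorted by absolute deviation from
/// the mean and capped at the 100 most extreme.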
fn compute_outlier_analysis(series: &Series, numeric_stats: &NumericStatistics) -> OutlierAnalysis {
    let values: Vec<f64> = get_numeric_values_as_f64(series);
    let q25 = numeric_stats.q25;
    let q75 = numeric_stats.q75;
    let mean = numeric_stats.mean;
    let std = numeric_stats.std;

    if q25.is_nan() || q75.is_nan() || std == 0.0 {
        return OutlierAnalysis {
            total_count: 0,
            percentage: 0.0,
            iqr_count: 0,
            zscore_count: 0,
            outlier_rows: Vec::new(),
        };
    }

    let iqr = q75 - q25;
    let lower_fence = q25 - 1.5 * iqr;
    let upper_fence = q75 + 1.5 * iqr;
    let z_threshold = 3.0;

    let mut outlier_rows = Vec::new();

    for (idx, &value) in values.iter().enumerate() {
        let mut methods = Vec::new();
        let mut z_score = None;
        let mut iqr_position = None;

        if value < lower_fence {
            methods.push(OutlierMethod::IQR);
            iqr_position = Some(IqrPosition::BelowLowerFence);
        } else if value > upper_fence {
            methods.push(OutlierMethod::IQR);
            iqr_position = Some(IqrPosition::AboveUpperFence);
        }

        if std > 0.0 {
            let z = (value - mean).abs() / std;
            z_score = Some(z);
            if z > z_threshold {
                methods.push(OutlierMethod::ZScore);
            }
        }

        if !methods.is_empty() {
            outlier_rows.push(OutlierRow {
                row_index: idx,
                column_value: value,
                context_data: HashMap::new(),
                detection_method: if methods.len() == 2 {
                    OutlierMethod::Both
                } else {
                    methods[0].clone()
                },
                z_score,
                iqr_position,
            });
        }
    }

    outlier_rows.sort_by(|a, b| {
        let a_dev = (a.column_value - mean).abs();
        let b_dev = (b.column_value - mean).abs();
        b_dev
            .partial_cmp(&a_dev)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    outlier_rows.truncate(100);

    let total_count = outlier_rows.len();
    let percentage = if values.is_empty() {
        0.0
    } else {
        (total_count as f64 / values.len() as f64) * 100.0
    };

    OutlierAnalysis {
        total_count,
        percentage,
        iqr_count: outlier_rows
            .iter()
            .filter(|r| matches!(r.detection_method, OutlierMethod::IQR | OutlierMethod::Both))
            .count(),
        zscore_count: outlier_rows
            .iter()
            .filter(|r| {
                matches!(
                    r.detection_method,
                    OutlierMethod::ZScore | OutlierMethod::Both
                )
            })
            .count(),
        outlier_rows,
    }
}

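/// Builds a pairwise Pearson correlation matrix over all numeric columns,
/// dropping rows where either column is null. Diagonal entries are 1.0; pairs
/// with fewer than 3 shared observations are reported as NaN.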
pub fn compute_correlation_matrix(df: &DataFrame) -> Result<CorrelationMatrix> {
    let schema = df.schema();
    let numeric_cols: Vec<String> = schema
        .iter()
        .filter(|(_, dtype)| is_numeric_type(dtype))
        .map(|(name, _)| name.to_string())
        .collect();

    if numeric_cols.len() < 2 {
        return Err(color_eyre::eyre::eyre!(
            "Need at least 2 numeric columns for correlation matrix"
        ));
    }

    let n = numeric_cols.len();
    let mut correlations = vec![vec![1.0; n]; n];
    let mut p_values = vec![vec![0.0; n]; n];
    let mut sample_sizes = vec![vec![0; n]; n];

    for i in 0..n {
        for j in (i + 1)..n {
            let col1 = df.column(&numeric_cols[i])?;
            let col2 = df.column(&numeric_cols[j])?;

            let mask = col1.is_not_null() & col2.is_not_null();
            let col1_clean = col1.filter(&mask)?;
            let col2_clean = col2.filter(&mask)?;

            let sample_size = col1_clean.len();
            sample_sizes[i][j] = sample_size;
            sample_sizes[j][i] = sample_size;

            if sample_size < 3 {
                correlations[i][j] = f64::NAN;
                correlations[j][i] = f64::NAN;
                continue;
            }

            let col1_series = col1_clean.as_materialized_series();
            let col2_series = col2_clean.as_materialized_series();
            let correlation = compute_pearson_correlation(col1_series, col2_series)?;
            correlations[i][j] = correlation;
            correlations[j][i] = correlation;

            let p_value = compute_correlation_p_value(correlation, sample_size);
            p_values[i][j] = p_value;
            p_values[j][i] = p_value;
        }
    }

    Ok(CorrelationMatrix {
        columns: numeric_cols,
        correlations,
        p_values: Some(p_values),
        sample_sizes,
    })
}

fn compute_pearson_correlation(col1: &Series, col2: &Series) -> Result<f64> {
    let values1: Vec<f64> = get_numeric_values_as_f64(col1);
    let values2: Vec<f64> = get_numeric_values_as_f64(col2);

    if values1.len() != values2.len() || values1.len() < 2 {
        return Err(color_eyre::eyre::eyre!("Invalid data for correlation"));
    }

    let mean1: f64 = values1.iter().sum::<f64>() / values1.len() as f64;
    let mean2: f64 = values2.iter().sum::<f64>() / values2.len() as f64;

    let numerator: f64 = values1
        .iter()
        .zip(values2.iter())
        .map(|(v1, v2)| (v1 - mean1) * (v2 - mean2))
        .sum();

    let var1: f64 = values1.iter().map(|v| (v - mean1).powi(2)).sum();
    let var2: f64 = values2.iter().map(|v| (v - mean2).powi(2)).sum();

    if var1 == 0.0 || var2 == 0.0 {
        return Ok(0.0);
    }

    let correlation = numerator / (var1.sqrt() * var2.sqrt());
    Ok(correlation)
}

fn compute_correlation_p_value(correlation: f64, n: usize) -> f64 {
    if n < 3 {
        return 1.0;
    }
    if correlation.abs() >= 1.0 {
        // A perfect correlation is maximally significant, not insignificant.
        return 0.0;
    }

    let t_statistic = correlation * ((n - 2) as f64 / (1.0 - correlation * correlation)).sqrt();

    // Two-sided p-value from a normal approximation to the t distribution
    // (the exact degrees of freedom are ignored; adequate for moderate n).
    let p_value = 2.0 * (1.0 - normal_cdf(t_statistic.abs(), 0.0, 1.0));
    p_value.clamp(0.0, 1.0)
}

pub fn compute_correlation_pair(
    df: &DataFrame,
    col1_name: &str,
    col2_name: &str,
) -> Result<CorrelationPair> {
    let col1 = df.column(col1_name)?;
    let col2 = df.column(col2_name)?;

    let mask = col1.is_not_null() & col2.is_not_null();
    let col1_clean = col1.filter(&mask)?;
    let col2_clean = col2.filter(&mask)?;

    let sample_size = col1_clean.len();
    if sample_size < 3 {
        return Err(color_eyre::eyre::eyre!("Not enough data for correlation"));
    }

    let col1_series = col1_clean.as_materialized_series();
    let col2_series = col2_clean.as_materialized_series();
    let correlation = compute_pearson_correlation(col1_series, col2_series)?;
    let p_value = Some(compute_correlation_p_value(correlation, sample_size));

    let mean1 = col1_series.mean().unwrap_or(0.0);
    let mean2 = col2_series.mean().unwrap_or(0.0);
    let values1: Vec<f64> = get_numeric_values_as_f64(col1_series);
    let values2: Vec<f64> = get_numeric_values_as_f64(col2_series);

    let covariance = if values1.len() == values2.len() {
        values1
            .iter()
            .zip(values2.iter())
            .map(|(v1, v2)| (v1 - mean1) * (v2 - mean2))
            .sum::<f64>()
            / (values1.len() - 1) as f64
    } else {
        0.0
    };

    let r_squared = correlation * correlation;

    let stats1 = ColumnStats {
        mean: mean1,
        std: col1_series.std(1).unwrap_or(0.0),
        min: col1_series
            .min::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
        max: col1_series
            .max::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
    };

    let stats2 = ColumnStats {
        mean: mean2,
        std: col2_series.std(1).unwrap_or(0.0),
        min: col2_series
            .min::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
        max: col2_series
            .max::<f64>()
            .unwrap_or(Some(f64::NAN))
            .unwrap_or(f64::NAN),
    };

    Ok(CorrelationPair {
        column1: col1_name.to_string(),
        column2: col2_name.to_string(),
        correlation,
        p_value,
        sample_size,
        covariance,
        r_squared,
        stats1,
        stats2,
    })
}
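
// A minimal, illustrative test sketch for the correlation helpers above. The
// column names and data are made up for demonstration; `df!` comes from
// polars::prelude. Treat this as a usage sketch, not part of the API.
#[cfg(test)]
mod correlation_usage_sketch {
    use super::*;

    #[test]
    fn correlation_matrix_on_linear_data() {
        let df = df!(
            "x" => &[1.0, 2.0, 3.0, 4.0, 5.0],
            "y" => &[2.0, 4.0, 6.0, 8.0, 10.0]
        )
        .unwrap();
        let m = compute_correlation_matrix(&df).unwrap();
        assert_eq!(m.columns.len(), 2);
        // y = 2x exactly, so the off-diagonal correlation should be ~1.0.
        assert!((m.correlations[0][1] - 1.0).abs() < 1e-9);
    }
}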