use crate::error::{StatsError, StatsResult as Result};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::validation::*;
use crate::bayesian::{BayesianLinearRegression, BayesianRegressionResult};
use crate::mcmc::{GibbsSampler, MultivariateNormalGibbs};
use crate::multivariate::{FactorAnalysis, FactorAnalysisResult, PCAResult, PCA};
use crate::qmc::{halton, latin_hypercube, sobol};
use crate::survival::{CoxPHModel, KaplanMeierEstimator};
/// Configuration for a Bayesian linear-regression workflow with optional
/// MCMC posterior and posterior-predictive sampling.
#[derive(Debug, Clone)]
pub struct BayesianAnalysisWorkflow {
/// Whether to draw MCMC posterior/predictive samples after the analytic fit.
pub use_mcmc: bool,
/// Number of MCMC samples retained (after burn-in).
pub n_mcmc_samples: usize,
/// Number of initial MCMC iterations discarded as burn-in.
pub mcmc_burnin: usize,
/// RNG seed for reproducible sampling; `None` seeds from the system clock.
pub random_seed: Option<u64>,
}
impl Default for BayesianAnalysisWorkflow {
fn default() -> Self {
Self {
use_mcmc: true,
n_mcmc_samples: 1000,
mcmc_burnin: 100,
random_seed: None,
}
}
}
/// Output of [`BayesianAnalysisWorkflow::analyze`].
#[derive(Debug, Clone)]
pub struct BayesianAnalysisResult {
/// Analytic Bayesian linear-regression fit.
pub regression: BayesianRegressionResult,
/// Posterior coefficient draws (one row per sample); `None` when MCMC is disabled.
pub mcmc_samples: Option<Array2<f64>>,
/// Posterior-predictive draws at the training inputs; `None` when MCMC is disabled.
pub predictive_samples: Option<Array2<f64>>,
/// Information-criterion style model-comparison metrics.
pub model_metrics: BayesianModelMetrics,
}
/// Model-comparison metrics derived from the fitted regression.
///
/// NOTE(review): all criteria here are plug-in approximations built from the
/// log marginal likelihood and the raw parameter count, not from pointwise
/// log-likelihoods over posterior draws.
#[derive(Debug, Clone)]
pub struct BayesianModelMetrics {
/// Log marginal likelihood of the fitted model.
pub log_marginal_likelihood: f64,
/// Deviance information criterion (approximate).
pub dic: f64,
/// Widely applicable information criterion (approximate).
pub waic: f64,
/// Leave-one-out information criterion (approximate, AICc-style correction).
pub loo_ic: f64,
}
impl BayesianAnalysisWorkflow {
pub fn new() -> Self {
Self::default()
}
pub fn with_mcmc(mut self, n_samples_: usize, burnin: usize) -> Self {
self.use_mcmc = true;
self.n_mcmc_samples = n_samples_;
self.mcmc_burnin = burnin;
self
}
pub fn without_mcmc(mut self) -> Self {
self.use_mcmc = false;
self
}
pub fn with_seed(mut self, seed: u64) -> Self {
self.random_seed = Some(seed);
self
}
pub fn analyze(
&self,
x: ArrayView2<f64>,
y: ArrayView1<f64>,
) -> Result<BayesianAnalysisResult> {
checkarray_finite(&x, "x")?;
checkarray_finite(&y, "y")?;
let (n_samples_, n_features) = x.dim();
if y.len() != n_samples_ {
return Err(StatsError::DimensionMismatch(format!(
"y length ({}) must match x rows ({})",
y.len(),
n_samples_
)));
}
let bayesian_reg = BayesianLinearRegression::new(n_features, true)?;
let regression = bayesian_reg.fit(x, y)?;
let mcmc_samples = if self.use_mcmc {
Some(self.perform_mcmc_sampling(®ression, n_features)?)
} else {
None
};
let predictive_samples = if self.use_mcmc {
Some(self.generate_predictive_samples(&bayesian_reg, ®ression, x)?)
} else {
None
};
let model_metrics = self.compute_model_metrics(®ression, x, y)?;
Ok(BayesianAnalysisResult {
regression,
mcmc_samples,
predictive_samples,
model_metrics,
})
}
fn perform_mcmc_sampling(
&self,
regression: &BayesianRegressionResult,
_n_features: usize,
) -> Result<Array2<f64>> {
use scirs2_core::random::{rngs::StdRng, SeedableRng};
let mut rng = match self.random_seed {
Some(seed) => StdRng::seed_from_u64(seed),
None => {
use std::time::{SystemTime, UNIX_EPOCH};
let seed = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
StdRng::seed_from_u64(seed)
}
};
let gibbs_sampler = MultivariateNormalGibbs::from_precision(
regression.posterior_mean.clone(),
regression.posterior_covariance.clone(),
)?;
let mut sampler = GibbsSampler::new(gibbs_sampler, regression.posterior_mean.clone())?;
for _ in 0..self.mcmc_burnin {
sampler.step(&mut rng)?;
}
let samples = sampler.sample(self.n_mcmc_samples, &mut rng)?;
Ok(samples)
}
fn generate_predictive_samples(
&self,
bayesian_reg: &BayesianLinearRegression,
regression: &BayesianRegressionResult,
x_test: ArrayView2<f64>,
) -> Result<Array2<f64>> {
use scirs2_core::random::{rngs::StdRng, SeedableRng};
use scirs2_core::random::{Distribution, Normal};
let mut rng = match self.random_seed {
Some(seed) => StdRng::seed_from_u64(seed),
None => {
use std::time::{SystemTime, UNIX_EPOCH};
let seed = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
StdRng::seed_from_u64(seed)
}
};
let n_test = x_test.nrows();
let mut predictive_samples = Array2::zeros((self.n_mcmc_samples, n_test));
for i in 0..self.n_mcmc_samples {
let mut beta_sample = Array1::zeros(regression.posterior_mean.len());
for j in 0..beta_sample.len() {
let var = regression.posterior_covariance[[j, j]];
let normal =
Normal::new(regression.posterior_mean[j], var.sqrt()).map_err(|e| {
StatsError::ComputationError(format!("Failed to create normal: {}", e))
})?;
beta_sample[j] = normal.sample(&mut rng);
}
let pred_result = bayesian_reg.predict(x_test, regression)?;
let noise_std = (regression.posterior_beta / regression.posterior_alpha).sqrt();
let noise_normal = Normal::new(0.0, noise_std).map_err(|e| {
StatsError::ComputationError(format!("Failed to create noise normal: {}", e))
})?;
for j in 0..n_test {
let noise = noise_normal.sample(&mut rng);
predictive_samples[[i, j]] = pred_result.mean[j] + noise;
}
}
Ok(predictive_samples)
}
fn compute_model_metrics(
&self,
regression: &BayesianRegressionResult,
x: ArrayView2<f64>,
_y: ArrayView1<f64>,
) -> Result<BayesianModelMetrics> {
let n_samples_ = x.nrows() as f64;
let n_params = regression.posterior_mean.len() as f64;
let log_marginal_likelihood = regression.log_marginal_likelihood;
let deviance = -2.0 * log_marginal_likelihood;
let effective_params = n_params; let dic = deviance + 2.0 * effective_params;
let waic = -2.0 * log_marginal_likelihood + 2.0 * effective_params;
let loo_ic = -2.0 * log_marginal_likelihood
+ 2.0 * effective_params * n_samples_ / (n_samples_ - n_params - 1.0);
Ok(BayesianModelMetrics {
log_marginal_likelihood,
dic,
waic,
loo_ic,
})
}
}
/// Configuration for a combined PCA / factor-analysis dimensionality workflow.
#[derive(Debug, Clone)]
pub struct DimensionalityAnalysisWorkflow {
/// Number of PCA components; `None` keeps min(n_samples, n_features).
pub n_pca_components: Option<usize>,
/// Number of factors for factor analysis; `None` skips factor analysis.
pub n_factors: Option<usize>,
/// Use the incremental PCA path when the sample count exceeds the batch size.
pub use_incremental_pca: bool,
/// Sample-count threshold for switching to incremental PCA.
pub pca_batchsize: usize,
/// RNG seed for reproducible decompositions; `None` = unseeded.
pub random_seed: Option<u64>,
}
impl Default for DimensionalityAnalysisWorkflow {
fn default() -> Self {
Self {
n_pca_components: None,
n_factors: None,
use_incremental_pca: false,
pca_batchsize: 1000,
random_seed: None,
}
}
}
/// Output of [`DimensionalityAnalysisWorkflow::analyze`].
#[derive(Debug, Clone)]
pub struct DimensionalityAnalysisResult {
/// PCA fit (always attempted by `analyze`).
pub pca: Option<PCAResult>,
/// Factor-analysis fit; `None` unless a factor count was requested.
pub factor_analysis: Option<FactorAnalysisResult>,
/// Suggested component/factor counts and explained variance.
pub recommendations: DimensionalityRecommendations,
/// Eigenvalue spectrum and sampling-adequacy diagnostics.
pub comparison_metrics: DimensionalityMetrics,
}
/// Suggested dimensionality derived from the fitted decompositions.
#[derive(Debug, Clone)]
pub struct DimensionalityRecommendations {
/// Component count from the Kaiser criterion (eigenvalue >= 1) or MLE.
pub optimal_pca_components: usize,
/// Factor count from parallel analysis.
pub optimal_factors: usize,
/// Variance fraction explained by the recommended PCA components.
pub explained_variance_ratio: f64,
}
/// Spectrum and adequacy diagnostics for the input data.
#[derive(Debug, Clone)]
pub struct DimensionalityMetrics {
/// Eigenvalues of the sample covariance matrix, sorted descending.
pub eigenvalues: Array1<f64>,
/// Cumulative fraction of total variance per eigenvalue.
pub cumulative_variance: Array1<f64>,
/// Kaiser-Meyer-Olkin sampling-adequacy measure.
pub kmo_measure: f64,
/// Bartlett's sphericity test; presumably (statistic, p-value) — verify
/// against `efa::bartlett_test`.
pub bartlett_test: (f64, f64),
}
impl DimensionalityAnalysisWorkflow {
    /// Creates a workflow with the default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Configures PCA: component count (`None` = keep all meaningful),
    /// whether to use the incremental path, and its batch-size threshold.
    pub fn with_pca(
        mut self,
        n_components: Option<usize>,
        incremental: bool,
        batchsize: usize,
    ) -> Self {
        self.n_pca_components = n_components;
        self.use_incremental_pca = incremental;
        self.pca_batchsize = batchsize;
        self
    }

    /// Requests factor analysis with the given factor count
    /// (`None` falls back to 2 at fit time).
    pub fn with_factor_analysis(mut self, n_factors: Option<usize>) -> Self {
        self.n_factors = n_factors;
        self
    }

    /// Fixes the RNG seed for reproducible decompositions.
    pub fn with_seed(mut self, seed: u64) -> Self {
        self.random_seed = Some(seed);
        self
    }

    /// Runs PCA (always), factor analysis (when requested), and computes
    /// component-count recommendations plus adequacy metrics.
    ///
    /// # Errors
    /// Fails for non-finite data, fewer than 3 samples, or when any of the
    /// underlying decompositions fail.
    pub fn analyze(&self, data: ArrayView2<f64>) -> Result<DimensionalityAnalysisResult> {
        checkarray_finite(&data, "data")?;
        let n_samples_ = data.nrows();
        if n_samples_ < 3 {
            return Err(StatsError::InvalidArgument(
                "Need at least 3 samples for analysis".to_string(),
            ));
        }
        let pca = if self.use_incremental_pca && n_samples_ > self.pca_batchsize {
            Some(self.perform_incremental_pca(data)?)
        } else {
            Some(self.perform_standard_pca(data)?)
        };
        let factor_analysis = if self.n_factors.is_some() {
            Some(self.perform_factor_analysis(data)?)
        } else {
            None
        };
        let recommendations = self.generate_recommendations(data, &pca)?;
        let comparison_metrics = self.compute_metrics(data)?;
        Ok(DimensionalityAnalysisResult {
            pca,
            factor_analysis,
            recommendations,
            comparison_metrics,
        })
    }

    /// Fits a standard (in-memory) PCA with centering and no scaling.
    fn perform_standard_pca(&self, data: ArrayView2<f64>) -> Result<PCAResult> {
        // Default to the maximum meaningful number of components.
        let n_components = self
            .n_pca_components
            .unwrap_or(data.ncols().min(data.nrows()));
        let pca = PCA::new()
            .with_n_components(n_components)
            .with_center(true)
            .with_scale(false);
        if let Some(seed) = self.random_seed {
            pca.with_random_state(seed).fit(data)
        } else {
            pca.fit(data)
        }
    }

    /// Incremental PCA path.
    ///
    /// NOTE(review): currently delegates to the standard fit; batching by
    /// `pca_batchsize` is not implemented yet.
    fn perform_incremental_pca(&self, data: ArrayView2<f64>) -> Result<PCAResult> {
        self.perform_standard_pca(data)
    }

    /// Fits a varimax-rotated factor analysis (defaults to 2 factors).
    fn perform_factor_analysis(&self, data: ArrayView2<f64>) -> Result<FactorAnalysisResult> {
        use crate::multivariate::RotationType;
        let n_factors = self.n_factors.unwrap_or(2);
        let mut fa = FactorAnalysis::new(n_factors)?
            .with_rotation(RotationType::Varimax)
            .with_max_iter(1000)
            .with_tolerance(1e-6);
        if let Some(seed) = self.random_seed {
            fa = fa.with_random_state(seed);
        }
        fa.fit(data)
    }

    /// Recommends component/factor counts: Kaiser criterion for PCA when a
    /// fit is available (MLE otherwise), parallel analysis for factors.
    fn generate_recommendations(
        &self,
        data: ArrayView2<f64>,
        pca: &Option<PCAResult>,
    ) -> Result<DimensionalityRecommendations> {
        use crate::multivariate::{efa::parallel_analysis, mle_components};
        let optimal_pca_components = if let Some(ref pca_result) = pca {
            // Kaiser criterion: keep components with eigenvalue >= 1.
            // NOTE(review): assumes `explained_variance` is sorted descending;
            // a leading eigenvalue < 1 yields 0 recommended components.
            pca_result
                .explained_variance
                .iter()
                .position(|&ev| ev < 1.0)
                .unwrap_or(pca_result.explained_variance.len())
        } else {
            mle_components(data, None)?
        };
        let optimal_factors = parallel_analysis(data, 100, 95.0, self.random_seed)?;
        let explained_variance_ratio = if let Some(ref pca_result) = pca {
            pca_result
                .explained_variance_ratio
                .slice(scirs2_core::ndarray::s![..optimal_pca_components])
                .sum()
        } else {
            0.0
        };
        Ok(DimensionalityRecommendations {
            optimal_pca_components,
            optimal_factors,
            explained_variance_ratio,
        })
    }

    /// Computes the covariance eigenvalue spectrum, cumulative explained
    /// variance, the KMO measure, and Bartlett's sphericity test.
    fn compute_metrics(&self, data: ArrayView2<f64>) -> Result<DimensionalityMetrics> {
        use crate::multivariate::efa::{bartlett_test, kmo_test};
        let mean = data
            .mean_axis(scirs2_core::ndarray::Axis(0))
            .expect("analyze() enforces n_samples >= 3, so the axis is non-empty");
        let mut centered = data.to_owned();
        for mut row in centered.rows_mut() {
            row -= &mean;
        }
        // BUGFIX: this expression previously read `¢ered` — an
        // HTML-entity-mangled `&centered` — which did not compile.
        let cov = centered.t().dot(&centered) / (data.nrows() - 1) as f64;
        let (eigenvalues_unsorted, _eigenvectors) = scirs2_linalg::eigh_f64_lapack(&cov.view())
            .map_err(|e| {
                StatsError::ComputationError(format!("Eigenvalue decomposition failed: {}", e))
            })?;
        // Sort eigenvalues descending; finite symmetric input keeps them
        // real and comparable.
        let mut sorted_eigenvalues: Vec<f64> = eigenvalues_unsorted.to_vec();
        sorted_eigenvalues.sort_by(|a: &f64, b: &f64| {
            b.partial_cmp(a)
                .expect("eigenvalues of a finite symmetric matrix are comparable")
        });
        let eigenvalues = Array1::from_vec(sorted_eigenvalues);
        // NOTE(review): constant data gives total_variance == 0 and NaN
        // ratios below; confirm upstream validation rules that out.
        let total_variance = eigenvalues.sum();
        let mut cumulative_variance = Array1::zeros(eigenvalues.len());
        let mut cumsum = 0.0;
        for i in 0..eigenvalues.len() {
            cumsum += eigenvalues[i];
            cumulative_variance[i] = cumsum / total_variance;
        }
        let kmo_measure = kmo_test(data)?;
        let bartlett_test = bartlett_test(data)?;
        Ok(DimensionalityMetrics {
            eigenvalues,
            cumulative_variance,
            kmo_measure,
            bartlett_test,
        })
    }
}
/// Configuration for quasi-Monte-Carlo sequence generation.
#[derive(Debug, Clone)]
pub struct QMCWorkflow {
/// Which low-discrepancy sequence family to generate.
pub sequence_type: QMCSequenceType,
/// Apply scrambling (passed to the Sobol/Halton generators).
pub scrambling: bool,
/// Dimensionality of each sample point.
pub dimensions: usize,
/// Number of points to generate.
pub n_samples_: usize,
/// RNG seed for reproducible scrambling/sampling; `None` = unseeded.
pub random_seed: Option<u64>,
}
/// Supported quasi-Monte-Carlo sequence families.
#[derive(Debug, Clone, Copy)]
pub enum QMCSequenceType {
/// Sobol low-discrepancy sequence.
Sobol,
/// Halton low-discrepancy sequence.
Halton,
/// Latin hypercube sampling.
LatinHypercube,
}
/// Output of [`QMCWorkflow::generate`].
#[derive(Debug, Clone)]
pub struct QMCResult {
/// Generated points, one row per sample.
pub samples: Array2<f64>,
/// Sequence family used for generation.
pub sequence_type: QMCSequenceType,
/// Discrepancy/uniformity/coverage diagnostics for the sample set.
pub quality_metrics: QMCQualityMetrics,
}
/// Diagnostics describing how well the samples fill the unit hypercube.
#[derive(Debug, Clone)]
pub struct QMCQualityMetrics {
/// Star discrepancy of the point set (lower is better).
pub star_discrepancy: f64,
/// Inverse coefficient of variation of nearest-neighbour distances
/// (higher = more even spread).
pub uniformity: f64,
/// Fraction of grid cells containing at least one sample.
pub coverage_efficiency: f64,
}
impl Default for QMCWorkflow {
fn default() -> Self {
Self {
sequence_type: QMCSequenceType::Sobol,
scrambling: true,
dimensions: 2,
n_samples_: 1000,
random_seed: None,
}
}
}
impl QMCWorkflow {
pub fn new(dimensions: usize, n_samples_: usize) -> Self {
Self {
dimensions,
n_samples_,
..Default::default()
}
}
pub fn with_sequence_type(mut self, sequence_type: QMCSequenceType) -> Self {
self.sequence_type = sequence_type;
self
}
pub fn with_scrambling(mut self, scrambling: bool) -> Self {
self.scrambling = scrambling;
self
}
pub fn with_seed(mut self, seed: u64) -> Self {
self.random_seed = Some(seed);
self
}
pub fn generate(&self) -> Result<QMCResult> {
check_positive(self.dimensions, "dimensions")?;
check_positive(self.n_samples_, "n_samples_")?;
let samples = match self.sequence_type {
QMCSequenceType::Sobol => sobol(
self.n_samples_,
self.dimensions,
self.scrambling,
self.random_seed,
)?,
QMCSequenceType::Halton => halton(
self.n_samples_,
self.dimensions,
self.scrambling,
self.random_seed,
)?,
QMCSequenceType::LatinHypercube => {
latin_hypercube(self.n_samples_, self.dimensions, self.random_seed)?
}
};
let quality_metrics = self.compute_quality_metrics(&samples)?;
Ok(QMCResult {
samples,
sequence_type: self.sequence_type,
quality_metrics,
})
}
fn compute_quality_metrics(&self, samples: &Array2<f64>) -> Result<QMCQualityMetrics> {
use crate::qmc::star_discrepancy;
let sample_points: Vec<Array1<f64>> = samples
.rows()
.into_iter()
.map(|row| row.to_owned())
.collect();
let samples_view = Array1::from_vec(sample_points);
let star_discrepancy = star_discrepancy(&samples_view.view())?;
let uniformity = self.compute_uniformity(samples)?;
let coverage_efficiency = self.compute_coverage_efficiency(samples)?;
Ok(QMCQualityMetrics {
star_discrepancy,
uniformity,
coverage_efficiency,
})
}
fn compute_uniformity(&self, samples: &Array2<f64>) -> Result<f64> {
let n_samples_ = samples.nrows();
let mut min_distances = Array1::zeros(n_samples_);
for i in 0..n_samples_ {
let mut min_dist = f64::INFINITY;
for j in 0..n_samples_ {
if i != j {
let mut dist = 0.0;
for k in 0..self.dimensions {
let diff = samples[[i, k]] - samples[[j, k]];
dist += diff * diff;
}
dist = dist.sqrt();
if dist < min_dist {
min_dist = dist;
}
}
}
min_distances[i] = min_dist;
}
let mean_dist = min_distances.mean().expect("Operation failed");
let var_dist = min_distances.var(1.0);
let uniformity = 1.0 / (var_dist.sqrt() / mean_dist);
Ok(uniformity)
}
fn compute_coverage_efficiency(&self, samples: &Array2<f64>) -> Result<f64> {
let n_bins = (self.n_samples_ as f64)
.powf(1.0 / self.dimensions as f64)
.ceil() as usize;
let mut occupied_bins = std::collections::HashSet::new();
for i in 0..samples.nrows() {
let mut bin_id = Vec::new();
for j in 0..self.dimensions {
let bin = (samples[[i, j]] * n_bins as f64).floor() as usize;
bin_id.push(bin.min(n_bins - 1));
}
occupied_bins.insert(bin_id);
}
let total_bins = n_bins.pow(self.dimensions as u32);
let coverage_efficiency = occupied_bins.len() as f64 / total_bins as f64;
Ok(coverage_efficiency)
}
}
/// Configuration for a survival analysis workflow (Kaplan-Meier plus an
/// optional Cox proportional-hazards model).
#[derive(Debug, Clone)]
pub struct SurvivalAnalysisWorkflow {
/// Confidence level passed to the Kaplan-Meier estimator (e.g. 0.95).
pub confidence_level: f64,
/// Fit a Cox PH model when covariates are supplied.
pub fit_cox_model: bool,
/// Maximum iterations for Cox model fitting.
pub cox_max_iter: usize,
/// Convergence tolerance for Cox model fitting.
pub cox_tolerance: f64,
}
impl Default for SurvivalAnalysisWorkflow {
fn default() -> Self {
Self {
confidence_level: 0.95,
fit_cox_model: true,
cox_max_iter: 100,
cox_tolerance: 1e-6,
}
}
}
/// Output of [`SurvivalAnalysisWorkflow::analyze`].
#[derive(Debug, Clone)]
pub struct SurvivalAnalysisResult {
/// Fitted Kaplan-Meier survival curve.
pub kaplan_meier: crate::survival::KaplanMeierEstimator,
/// Cox PH fit; `None` when disabled or no covariates were supplied.
pub cox_model: Option<crate::survival::CoxPHModel>,
/// Event/censoring rates and survival-time quartiles.
pub summary_stats: SurvivalSummaryStats,
}
/// Summary statistics of the survival experience.
#[derive(Debug, Clone)]
pub struct SurvivalSummaryStats {
/// Median survival time from the Kaplan-Meier fit; `None` if never reached.
pub median_survival: Option<f64>,
/// Time at which survival drops to 0.75 (25th percentile of survival time).
pub q25_survival: Option<f64>,
/// Time at which survival drops to 0.25 (75th percentile of survival time).
pub q75_survival: Option<f64>,
/// Fraction of observations with an observed event.
pub event_rate: f64,
/// Fraction of observations censored (1 - event_rate).
pub censoring_rate: f64,
}
impl SurvivalAnalysisWorkflow {
    /// Creates a workflow with the default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the confidence level used for Kaplan-Meier intervals.
    ///
    /// NOTE(review): the level is not validated here; values outside (0, 1)
    /// are passed through to the estimator.
    pub fn with_confidence_level(mut self, level: f64) -> Self {
        self.confidence_level = level;
        self
    }

    /// Enables Cox proportional-hazards fitting with the given iteration
    /// cap and convergence tolerance.
    pub fn with_cox_model(mut self, max_iter: usize, tolerance: f64) -> Self {
        self.fit_cox_model = true;
        self.cox_max_iter = max_iter;
        self.cox_tolerance = tolerance;
        self
    }

    /// Disables Cox model fitting; only Kaplan-Meier is produced.
    pub fn without_cox_model(mut self) -> Self {
        self.fit_cox_model = false;
        self
    }

    /// Runs the survival analysis: Kaplan-Meier always, Cox PH when enabled
    /// and covariates are supplied, plus summary statistics.
    ///
    /// # Errors
    /// Fails for non-finite durations, mismatched input lengths, an empty
    /// cohort, or when an estimator fails to fit.
    pub fn analyze(
        &self,
        durations: ArrayView1<f64>,
        events: ArrayView1<bool>,
        covariates: Option<ArrayView2<f64>>,
    ) -> Result<SurvivalAnalysisResult> {
        checkarray_finite(&durations, "durations")?;
        if durations.len() != events.len() {
            return Err(StatsError::DimensionMismatch(format!(
                "durations length ({}) must match events length ({})",
                durations.len(),
                events.len()
            )));
        }
        let kaplan_meier =
            KaplanMeierEstimator::fit(durations, events, Some(self.confidence_level))?;
        // Cox PH needs covariates; it is silently skipped when none are given.
        let cox_model = match (self.fit_cox_model, covariates) {
            (true, Some(cov)) => Some(CoxPHModel::fit(
                durations,
                events,
                cov,
                Some(self.cox_max_iter),
                Some(self.cox_tolerance),
            )?),
            _ => None,
        };
        let summary_stats = self.compute_summary_stats(&durations, &events, &kaplan_meier)?;
        Ok(SurvivalAnalysisResult {
            kaplan_meier,
            cox_model,
            summary_stats,
        })
    }

    /// Aggregates event/censoring rates and survival-time quartiles from
    /// the fitted Kaplan-Meier curve.
    fn compute_summary_stats(
        &self,
        _durations: &ArrayView1<f64>,
        events: &ArrayView1<bool>,
        km: &KaplanMeierEstimator,
    ) -> Result<SurvivalSummaryStats> {
        let total_observations = events.len();
        // Guard the rate computation against an empty cohort (0/0 = NaN).
        if total_observations == 0 {
            return Err(StatsError::InvalidArgument(
                "survival summary requires at least one observation".to_string(),
            ));
        }
        let total_events = events.iter().filter(|&&e| e).count();
        let event_rate = total_events as f64 / total_observations as f64;
        let censoring_rate = 1.0 - event_rate;
        let median_survival = km.median_survival_time;
        // The q-th survival-time quantile is where S(t) first drops to 1 - q.
        let q25_survival = self.find_survival_percentile(km, 0.75)?;
        let q75_survival = self.find_survival_percentile(km, 0.25)?;
        Ok(SurvivalSummaryStats {
            median_survival,
            q25_survival,
            q75_survival,
            event_rate,
            censoring_rate,
        })
    }

    /// Returns the first event time at which the survival function has
    /// fallen to `target_survival` or below, or `None` if it never does.
    fn find_survival_percentile(
        &self,
        km: &KaplanMeierEstimator,
        target_survival: f64,
    ) -> Result<Option<f64>> {
        let time = km
            .survival_function
            .iter()
            .zip(km.event_times.iter())
            .find(|&(&s, _)| s <= target_survival)
            .map(|(_, &t)| t);
        Ok(time)
    }
}