use super::*;
pub(crate) const DENSE_OUTER_MAX_P: usize = 1024;
pub(crate) const DENSE_OUTER_PARALLEL_FLOP_THRESHOLD: u64 = 100_000;
pub(crate) enum XtWxBackend {
Dense(DenseOuterState),
Sparse(SparseSpGemmState),
}
pub(crate) struct DenseOuterState {
pub(crate) xtwx_dense: Array2<f64>,
pub(crate) thread_buffers: Vec<Array2<f64>>,
}
pub(crate) struct SparseSpGemmState {
pub(crate) wxvalues: Vec<f64>,
pub(crate) wx_tvalues: Vec<f64>,
pub(crate) sqrt_weights: Vec<f64>,
pub(crate) info: SparseMatMulInfo,
pub(crate) scratch: MemBuffer,
pub(crate) par: Par,
}
pub(crate) struct SparseXtWxCache {
pub(crate) xtwx_symbolic: SymbolicSparseColMat<usize>,
pub(crate) xtwxvalues: Vec<f64>,
pub(crate) nrows: usize,
pub(crate) ncols: usize,
pub(crate) nnz: usize,
pub(crate) x_col_ptr: Vec<usize>,
pub(crate) xrow_idx: Vec<usize>,
pub(crate) x_t_csc: SparseColMat<usize, f64>,
pub(crate) backend: XtWxBackend,
}
impl SparseXtWxCache {
pub(crate) fn new(x: &SparseColMat<usize, f64>) -> Result<Self, EstimationError> {
let x_t_csc =
x.as_ref().transpose().to_col_major().map_err(|_| {
EstimationError::InvalidInput("failed to transpose to CSC".to_string())
})?;
let (xtwx_symbolic, info) = sparse_sparse_matmul_symbolic(x_t_csc.symbolic(), x.symbolic())
.map_err(|_| {
EstimationError::InvalidInput("failed to build symbolic XtWX cache".to_string())
})?;
let xtwxvalues = vec![0.0; xtwx_symbolic.row_idx().len()];
let backend = if x.ncols() <= DENSE_OUTER_MAX_P {
XtWxBackend::Dense(DenseOuterState {
xtwx_dense: Array2::<f64>::zeros((x.ncols(), x.ncols())),
thread_buffers: Vec::new(),
})
} else {
let par = get_global_parallelism();
let scratch = MemBuffer::new(sparse_sparse_matmul_numeric_scratch::<usize, f64>(
xtwx_symbolic.as_ref(),
par,
));
XtWxBackend::Sparse(SparseSpGemmState {
wxvalues: vec![0.0; x.val().len()],
wx_tvalues: vec![0.0; x_t_csc.val().len()],
sqrt_weights: vec![0.0; x.nrows()],
info,
scratch,
par,
})
};
Ok(Self {
xtwx_symbolic,
xtwxvalues,
nrows: x.nrows(),
ncols: x.ncols(),
nnz: x.val().len(),
x_col_ptr: x.symbolic().col_ptr().to_vec(),
xrow_idx: x.symbolic().row_idx().to_vec(),
x_t_csc,
backend,
})
}
pub(crate) fn matches(&self, x: &SparseColMat<usize, f64>) -> bool {
if self.nrows != x.nrows() || self.ncols != x.ncols() || self.nnz != x.val().len() {
return false;
}
let sym = x.symbolic();
self.x_col_ptr.as_slice() == sym.col_ptr() && self.xrow_idx.as_slice() == sym.row_idx()
}
pub(crate) fn compute_numeric(
&mut self,
x: &SparseColMat<usize, f64>,
weights: &Array1<f64>,
) -> Result<(), EstimationError> {
if weights.len() != self.nrows {
crate::bail_invalid_estim!(
"weights length {} does not match design rows {}",
weights.len(),
self.nrows
);
}
match &mut self.backend {
XtWxBackend::Dense(state) => {
state.compute(self.x_t_csc.as_ref(), weights, self.nrows, self.ncols);
let col_ptr = self.xtwx_symbolic.col_ptr();
let row_idx = self.xtwx_symbolic.row_idx();
let dense = &state.xtwx_dense;
for col in 0..self.ncols {
let start = col_ptr[col];
let end = col_ptr[col + 1];
for idx in start..end {
let row = row_idx[idx];
if row <= col {
self.xtwxvalues[idx] = dense[[row, col]];
}
}
}
}
XtWxBackend::Sparse(state) => state.compute(
x,
self.x_t_csc.as_ref(),
weights,
self.ncols,
self.xtwx_symbolic.as_ref(),
&mut self.xtwxvalues,
),
}
Ok(())
}
}
impl DenseOuterState {
pub(crate) fn compute(
&mut self,
x_t: SparseColMatRef<'_, usize, f64>,
weights: &Array1<f64>,
n: usize,
p: usize,
) {
assert_eq!(self.xtwx_dense.dim(), (p, p));
self.xtwx_dense.fill(0.0);
if n == 0 || p == 0 {
return;
}
let xtwx_start = std::time::Instant::now();
let nnz_total = x_t.symbolic().row_idx().len() as u64;
let work = nnz_total
.saturating_mul(nnz_total)
.checked_div(n as u64)
.unwrap_or(u64::MAX);
let n_threads = rayon::current_num_threads();
let parallelize = n_threads > 1 && work >= DENSE_OUTER_PARALLEL_FLOP_THRESHOLD;
if !parallelize {
accumulate_outer_upper(&mut self.xtwx_dense, x_t, weights, 0..n);
log::info!(
"[STAGE] PIRLS dense XᵀWX assembly (serial) n={} p={} flops~{} elapsed={:.3}s",
n,
p,
(n as u64).saturating_mul((p as u64).saturating_mul(p as u64)),
xtwx_start.elapsed().as_secs_f64(),
);
return;
}
if self.thread_buffers.len() != n_threads {
self.thread_buffers
.resize_with(n_threads, || Array2::<f64>::zeros((p, p)));
}
let chunk = n.div_ceil(n_threads);
self.thread_buffers
.par_iter_mut()
.enumerate()
.for_each(|(t, buf)| {
buf.fill(0.0);
let start = t * chunk;
let end = (start + chunk).min(n);
if start < end {
accumulate_outer_upper(buf, x_t, weights, start..end);
}
});
for buf in &self.thread_buffers {
self.xtwx_dense += buf;
}
log::info!(
"[STAGE] PIRLS dense XᵀWX assembly (parallel, threads={}) n={} p={} flops~{} elapsed={:.3}s",
rayon::current_num_threads(),
n,
p,
(n as u64).saturating_mul((p as u64).saturating_mul(p as u64)),
xtwx_start.elapsed().as_secs_f64(),
);
}
}
impl SparseSpGemmState {
pub(crate) fn compute(
&mut self,
x: &SparseColMat<usize, f64>,
x_t: SparseColMatRef<'_, usize, f64>,
weights: &Array1<f64>,
p: usize,
xtwx_symbolic: SymbolicSparseColMatRef<'_, usize>,
xtwxvalues: &mut [f64],
) {
let n = x_t.ncols();
assert_eq!(weights.len(), n);
assert_eq!(self.sqrt_weights.len(), n);
assert!(
weights.iter().all(|&w| w.is_finite() && w >= 0.0),
"SparseSpGemmState::compute requires finite nonnegative PIRLS weights"
);
let sqrt_w = self.sqrt_weights.as_mut_slice();
for (dst, &w) in sqrt_w.iter_mut().zip(weights.iter()) {
*dst = w.sqrt();
}
let sqrt_w: &[f64] = sqrt_w;
let x_ref = x.as_ref();
for col in 0..p {
let rows = x_ref.row_idx_of_col_raw(col);
let xvals = x_ref.val_of_col(col);
let range = x_ref.col_range(col);
let dst = &mut self.wxvalues[range];
for ((d, &s), row) in dst.iter_mut().zip(xvals.iter()).zip(rows.iter()) {
*d = s * sqrt_w[row.unbound()];
}
}
for col in 0..n {
let w = sqrt_w[col];
let xvals = x_t.val_of_col(col);
let range = x_t.col_range(col);
let dst = &mut self.wx_tvalues[range];
for (d, &s) in dst.iter_mut().zip(xvals.iter()) {
*d = s * w;
}
}
let wx_ref = SparseColMatRef::new(x.symbolic(), &self.wxvalues[..]);
let wx_t_ref = SparseColMatRef::new(x_t.symbolic(), &self.wx_tvalues[..]);
let stack = MemStack::new(&mut self.scratch);
let xtwxmut = SparseColMatMut::new(xtwx_symbolic, xtwxvalues);
sparse_sparse_matmul_numeric(
xtwxmut,
Accum::Replace,
wx_t_ref,
wx_ref,
1.0,
&self.info,
self.par,
stack,
);
}
}
#[inline]
pub(crate) fn accumulate_outer_upper(
acc: &mut Array2<f64>,
x_t: SparseColMatRef<'_, usize, f64>,
weights: &Array1<f64>,
rows: std::ops::Range<usize>,
) {
assert_eq!(acc.nrows(), acc.ncols());
let p = acc.ncols();
let acc_data = acc
.as_slice_mut()
.expect("dense XᵀWX accumulator is row-major and contiguous");
for i in rows {
let w_i = weights[i].max(0.0);
if w_i == 0.0 {
continue;
}
let cols = x_t.row_idx_of_col_raw(i);
let vals = x_t.val_of_col(i);
let nnz_i = cols.len();
for jj in 0..nnz_i {
let j = cols[jj].unbound();
let wvj = w_i * vals[jj];
let row = &mut acc_data[j * p..j * p + p];
for kk in jj..nnz_i {
let k = cols[kk].unbound();
row[k] += wvj * vals[kk];
}
}
}
}
pub(super) fn compute_jeffreys_pirls_diagnostics_sparse(
link: &InverseLink,
x_design_csr: &SparseRowMat<usize, f64>,
eta: ArrayView1<f64>,
observation_weights: ArrayView1<f64>,
) -> Result<(Array1<f64>, f64, Array1<f64>), EstimationError> {
let n = x_design_csr.nrows();
let p = x_design_csr.ncols();
let mut x_dense = Array2::<f64>::zeros((n, p));
let xview = x_design_csr.as_ref();
for i in 0..n {
let vals = xview.val_of_row(i);
let cols = xview.col_idx_of_row_raw(i);
if cols.len() != vals.len() {
crate::bail_invalid_estim!(
"sparse row structure mismatch: column/value lengths differ"
);
}
for (idx, &col) in cols.iter().enumerate() {
x_dense[[i, col.unbound()]] = vals[idx];
}
}
compute_jeffreys_pirls_diagnostics(link, x_dense.view(), eta, observation_weights)
}
pub(super) fn compute_jeffreys_pirls_diagnostics(
link: &InverseLink,
x_design: ArrayView2<f64>,
eta: ArrayView1<f64>,
observation_weights: ArrayView1<f64>,
) -> Result<(Array1<f64>, f64, Array1<f64>), EstimationError> {
let op = FirthDenseOperator::build_with_observation_weights_for_link(
link,
&x_design.to_owned(),
&eta.to_owned(),
observation_weights,
)?;
Ok((
op.pirls_hat_diag(),
op.jeffreys_logdet(),
op.pirls_firth_score_shift(),
))
}
pub(crate) fn ensure_positive_definitewithridge(
hess: &mut Array2<f64>,
label: &str,
) -> Result<f64, EstimationError> {
let ridge = if FIXED_STABILIZATION_RIDGE > 0.0 {
FIXED_STABILIZATION_RIDGE
} else {
0.0
};
if hess.cholesky(Side::Lower).is_ok() {
return Ok(0.0);
}
if ridge > 0.0 {
for i in 0..hess.nrows() {
hess[[i, i]] += ridge;
}
if hess.cholesky(Side::Lower).is_ok() {
log::debug!("{} stabilized with fixed ridge {:.1e}.", label, ridge);
return Ok(ridge);
}
}
if let Ok((evals, _)) = hess.eigh(Side::Lower) {
let min_eig = evals.iter().fold(f64::INFINITY, |a, &b| a.min(b));
return Err(EstimationError::HessianNotPositiveDefinite {
min_eigenvalue: min_eig,
});
}
Err(EstimationError::HessianNotPositiveDefinite {
min_eigenvalue: f64::NEG_INFINITY,
})
}
pub(super) fn solve_newton_direction_dense(
hessian: &Array2<f64>,
gradient: &Array1<f64>,
direction_out: &mut Array1<f64>,
) -> Result<(), EstimationError> {
solve_newton_direction_dense_with_factor(hessian, gradient, direction_out).map(|_| ())
}
pub(super) fn solve_direction_with_dense_factor(
factor: &FaerSymmetricFactor,
gradient: &Array1<f64>,
direction_out: &mut Array1<f64>,
) {
if direction_out.len() != gradient.len() {
*direction_out = Array1::zeros(gradient.len());
}
direction_out.assign(gradient);
let mut rhsview = array1_to_col_matmut(direction_out);
factor.solve_in_place(rhsview.as_mut());
direction_out.mapv_inplace(|v| -v);
}
pub(super) fn solve_newton_direction_dense_with_factor(
hessian: &Array2<f64>,
gradient: &Array1<f64>,
direction_out: &mut Array1<f64>,
) -> Result<Option<FaerSymmetricFactor>, EstimationError> {
let dense_solve_start = std::time::Instant::now();
let p = hessian.nrows();
if direction_out.len() != gradient.len() {
*direction_out = Array1::zeros(gradient.len());
}
if crate::gpu::cuda_selected() {
let rhs = Array2::from_shape_vec((p, 1), gradient.to_vec()).map_err(|e| {
EstimationError::InvalidInput(format!("CUDA PIRLS RHS layout failed: {e}"))
})?;
let (solved, _) =
crate::solver::gpu::pirls_gpu::cholesky_solve_gpu(hessian.view(), rhs.view())
.map_err(EstimationError::InvalidInput)?;
direction_out.assign(&solved.column(0));
direction_out.mapv_inplace(|v| -v);
if array_is_finite(direction_out) {
log::info!(
"[STAGE] PIRLS dense newton solve backend=CUDA p={} flops~{} elapsed={:.3}s route=\"cuSOLVER potrf/potrs\"",
p,
(p as u64).saturating_mul((p as u64).saturating_mul(p as u64)) / 3,
dense_solve_start.elapsed().as_secs_f64(),
);
return Ok(None);
}
}
let cpu_route = String::from("CPU stable solver");
let factor = StableSolver::new("pirls newton direction")
.factorize(hessian)
.map_err(EstimationError::LinearSystemSolveFailed)?;
solve_direction_with_dense_factor(&factor, gradient, direction_out);
let validation_residual = {
let h_delta = hessian.dot(direction_out);
h_delta
.iter()
.zip(gradient.iter())
.map(|(h, g)| (h + g).abs())
.fold(0.0_f64, f64::max)
};
let g_inf = gradient.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
let rel = validation_residual / (1.0 + g_inf);
if !rel.is_finite() || rel > 1.0e-3 {
let rhs = gradient.mapv(|v| -v);
if let Some(pseudo) = StableSolver::new("pirls newton direction (pseudoinverse fallback)")
.solve_with_pseudoinverse_fallback(hessian, &rhs, 1.0e-10, 1.0e-3, 1.0e-10)
{
direction_out.assign(&pseudo);
log::info!(
"[STAGE] PIRLS dense newton solve backend=CPU p={} elapsed={:.3}s route=\"{} + pseudoinverse fallback (rel={:.3e} > 1e-3)\"",
p,
dense_solve_start.elapsed().as_secs_f64(),
cpu_route,
rel,
);
return Ok(Some(factor));
}
}
if array_is_finite(direction_out) {
log::info!(
"[STAGE] PIRLS dense newton solve backend=CPU p={} flops~{} elapsed={:.3}s route=\"{}\"",
p,
(p as u64).saturating_mul((p as u64).saturating_mul(p as u64)) / 3,
dense_solve_start.elapsed().as_secs_f64(),
cpu_route,
);
return Ok(Some(factor));
}
Err(EstimationError::LinearSystemSolveFailed(
FaerLinalgError::FactorizationFailed {
context: "PIRLS dense newton solve exhausted",
},
))
}
pub fn solve_newton_direction_implicit<F>(
apply_xtwx: F,
xtwx_diag: ArrayView1<'_, f64>,
dense_penalties: &[(f64, &Array2<f64>)],
op_penalties: &[(f64, &dyn crate::terms::analytic_penalties::PenaltyOp)],
gradient: &Array1<f64>,
direction_out: &mut Array1<f64>,
ridge: f64,
rel_tol: f64,
max_iter: usize,
) -> Result<(), EstimationError>
where
F: Fn(&Array1<f64>) -> Array1<f64>,
{
let p = gradient.len();
if xtwx_diag.len() != p {
crate::bail_invalid_estim!(
"solve_newton_direction_implicit: xtwx_diag length {} != gradient length {}",
xtwx_diag.len(),
p
);
}
for (_, s) in dense_penalties.iter() {
if s.nrows() != p || s.ncols() != p {
crate::bail_invalid_estim!(
"solve_newton_direction_implicit: dense penalty dim {}×{} != p={}",
s.nrows(),
s.ncols(),
p
);
}
}
for (_, op) in op_penalties.iter() {
if op.dim() != p {
crate::bail_invalid_estim!(
"solve_newton_direction_implicit: op penalty dim {} != p={}",
op.dim(),
p
);
}
}
if direction_out.len() != p {
*direction_out = Array1::zeros(p);
}
let pcg_start = std::time::Instant::now();
let mut precond_diag = xtwx_diag.to_owned();
if ridge > 0.0 {
precond_diag.mapv_inplace(|d| d + ridge);
}
for (lambda, s) in dense_penalties.iter() {
if *lambda == 0.0 {
continue;
}
for i in 0..p {
precond_diag[i] += *lambda * s[[i, i]];
}
}
for (lambda, op) in op_penalties.iter() {
if *lambda == 0.0 {
continue;
}
let d = op.diag();
for i in 0..p {
precond_diag[i] += *lambda * d[i];
}
}
let apply_h = |v: &Array1<f64>| -> Array1<f64> {
let mut hv = apply_xtwx(v);
if ridge > 0.0 {
hv.zip_mut_with(v, |h, &x| *h += ridge * x);
}
for (lambda, s) in dense_penalties.iter() {
if *lambda == 0.0 {
continue;
}
let sv = fast_av(s, v);
hv.scaled_add(*lambda, &sv);
}
for (lambda, op) in op_penalties.iter() {
if *lambda == 0.0 {
continue;
}
let mut sv = Array1::<f64>::zeros(p);
op.matvec(v.view(), sv.view_mut());
hv.scaled_add(*lambda, &sv);
}
hv
};
let solution =
crate::linalg::utils::solve_spd_pcg(apply_h, gradient, &precond_diag, rel_tol, max_iter)
.ok_or(EstimationError::LinearSystemSolveFailed(
FaerLinalgError::FactorizationFailed {
context: "PIRLS implicit PCG solve exhausted",
},
))?;
direction_out.assign(&solution);
direction_out.mapv_inplace(|v| -v);
if !array_is_finite(direction_out) {
return Err(EstimationError::LinearSystemSolveFailed(
FaerLinalgError::FactorizationFailed {
context: "PIRLS implicit PCG non-finite direction",
},
));
}
log::info!(
"[STAGE] PIRLS implicit (PCG) newton solve p={} dense_pens={} op_pens={} elapsed={:.3}s",
p,
dense_penalties.len(),
op_penalties.len(),
pcg_start.elapsed().as_secs_f64(),
);
Ok(())
}
pub(super) fn project_coefficients_to_lower_bounds(
beta: &mut Array1<f64>,
lower_bounds: &Array1<f64>,
) {
for i in 0..beta.len() {
let lb = lower_bounds[i];
if lb.is_finite() && beta[i] < lb {
beta[i] = lb;
}
}
}
pub(crate) const ACTIVE_BOUND_REL_TOL: f64 = 1e-6;
pub(crate) const ACTIVE_BOUND_ABS_TOL: f64 = 1e-10;
pub(super) fn projected_gradient_norm(
gradient: &Array1<f64>,
beta: &Array1<f64>,
lower_bounds: Option<&Array1<f64>>,
) -> f64 {
let Some(lb) = lower_bounds else {
return gradient.dot(gradient).sqrt();
};
let mut sum_sq = 0.0;
for i in 0..gradient.len() {
let g = gradient[i];
if lb[i].is_finite() && g > 0.0 {
let slack = beta[i] - lb[i];
let scale = beta[i].abs().max(lb[i].abs()).max(1.0);
let tol = ACTIVE_BOUND_REL_TOL * scale + ACTIVE_BOUND_ABS_TOL;
if slack < tol {
continue;
}
}
sum_sq += g * g;
}
sum_sq.sqrt()
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(super) enum PirlsSoftAccept {
NearStationaryPlateau,
BoundarySaturation,
RelativeBandPlateau,
}
#[derive(Clone, Copy, Debug)]
pub(super) enum SoftAcceptProgress {
Realized { dev_change: f64 },
Predicted {
predicted_reduction: f64,
current_penalized: f64,
},
}
#[inline]
pub(super) fn pirls_soft_acceptance(
state: &WorkingState,
projected_grad: f64,
progress: SoftAcceptProgress,
max_abs_eta: f64,
progress_tol: f64,
kkt_tol: f64,
) -> Option<PirlsSoftAccept> {
let objective_scale = state.deviance.abs() + state.penalty_term.abs();
let scaled_dev_tol = progress_tol * objective_scale;
let near_stationary_plateau = match progress {
SoftAcceptProgress::Realized { dev_change } => {
state.near_stationary_kkt(projected_grad, kkt_tol) && dev_change.abs() < scaled_dev_tol
}
SoftAcceptProgress::Predicted {
predicted_reduction,
current_penalized,
} => {
let reduction_noise_floor = current_penalized.abs() * 1e-12;
state.near_stationary_kkt(projected_grad, kkt_tol)
&& predicted_reduction.abs() <= reduction_noise_floor
}
};
if near_stationary_plateau {
return Some(PirlsSoftAccept::NearStationaryPlateau);
}
let dev_change = match progress {
SoftAcceptProgress::Realized { dev_change } => dev_change,
SoftAcceptProgress::Predicted { .. } => return None,
};
if max_abs_eta >= PIRLS_ETA_ABS_CAP * (1.0 - 1e-12) && dev_change.abs() < scaled_dev_tol {
return Some(PirlsSoftAccept::BoundarySaturation);
}
if state.relative_gradient_norm(projected_grad) <= progress_tol.max(1e-6)
&& dev_change.abs() < scaled_dev_tol * 0.1
&& dev_change >= 0.0
{
return Some(PirlsSoftAccept::RelativeBandPlateau);
}
None
}
pub(super) fn constrained_stationarity_norm(
gradient: &Array1<f64>,
beta: &Array1<f64>,
lower_bounds: Option<&Array1<f64>>,
linear_constraints: Option<&LinearInequalityConstraints>,
) -> f64 {
if let Some(constraints) = linear_constraints {
let kkt = compute_constraint_kkt_diagnostics(beta, gradient, constraints);
return kkt
.primal_feasibility
.max(kkt.dual_feasibility)
.max(kkt.complementarity)
.max(kkt.stationarity);
}
projected_gradient_norm(gradient, beta, lower_bounds)
}
pub(crate) fn count_dense_upper_nnz(matrix: &Array2<f64>, tol: f64) -> usize {
let p = matrix.nrows().min(matrix.ncols());
let mut nnz = 0usize;
for col in 0..p {
for row in 0..=col {
if matrix[[row, col]].abs() > tol {
nnz += 1;
}
}
}
nnz
}
pub(crate) fn estimate_sparse_native_decision(
workspace: &mut PirlsWorkspace,
x_original: &DesignMatrix,
s_lambda: &Array2<f64>,
coefficient_lower_bounds: Option<&Array1<f64>>,
linear_constraints_original: Option<&LinearInequalityConstraints>,
) -> SparsePirlsDecision {
let p = x_original.ncols();
let nnz_s_lambda = count_dense_upper_nnz(s_lambda, 1e-12);
let dense_reject = |reason: &'static str, nnz_x: usize| SparsePirlsDecision {
path: PirlsLinearSolvePath::DenseTransformed,
reason,
p,
nnz_x,
nnz_xtwx_symbolic: None,
nnz_s_lambda,
nnz_h_est: None,
density_h_est: None,
};
let has_finite_lower_bounds = coefficient_lower_bounds
.map(|lb| lb.iter().any(|bound| bound.is_finite()))
.unwrap_or(false);
if has_finite_lower_bounds || linear_constraints_original.is_some() {
return dense_reject("constraints_present", 0);
}
let x_sparse = if let Some(sparse) = x_original.as_sparse() {
sparse
} else {
let row_chunk_start = std::time::Instant::now();
let n = x_original.nrows();
let chunk = row_chunk_for_byte_budget(n, x_original.ncols());
let mut nnz: usize = 0;
let mut chunks_processed = 0usize;
if chunk > 0 && n > 0 {
let mut start = 0;
while start < n {
let end = (start + chunk).min(n);
chunks_processed += 1;
match x_original.try_row_chunk(start..end) {
Ok(rows) => {
nnz = nnz.saturating_add(rows.iter().filter(|v| v.abs() > 1e-12).count());
}
Err(_) => {
nnz = nnz.saturating_add((end - start).saturating_mul(x_original.ncols()));
}
}
start = end;
}
}
log::info!(
"[STAGE] PIRLS row-chunk generation chunks={} n={} p={} nnz={} elapsed={:.3}s",
chunks_processed,
n,
x_original.ncols(),
nnz,
row_chunk_start.elapsed().as_secs_f64(),
);
return dense_reject("design_not_sparse", nnz);
};
let nnz_x = x_sparse.val().len();
match workspace.sparse_penalized_system_stats(x_sparse, s_lambda) {
Ok(stats) => SparsePirlsDecision {
path: if stats.density_upper <= SPARSE_NATIVE_MAX_H_DENSITY {
PirlsLinearSolvePath::SparseNative
} else {
PirlsLinearSolvePath::DenseTransformed
},
reason: if stats.density_upper <= SPARSE_NATIVE_MAX_H_DENSITY {
"sparse_native_eligible"
} else {
"penalized_hessian_too_dense"
},
p,
nnz_x,
nnz_xtwx_symbolic: Some(stats.nnz_xtwx_symbolic),
nnz_s_lambda: stats.nnz_s_lambda_upper,
nnz_h_est: Some(stats.nnz_h_upper),
density_h_est: Some(stats.density_upper),
},
Err(_) => dense_reject("sparse_stats_failed", nnz_x),
}
}
pub(super) fn should_use_sparse_native_pirls(
workspace: &mut PirlsWorkspace,
x_original: &DesignMatrix,
s_lambda: &Array2<f64>,
coefficient_lower_bounds: Option<&Array1<f64>>,
linear_constraints_original: Option<&LinearInequalityConstraints>,
) -> SparsePirlsDecision {
estimate_sparse_native_decision(
workspace,
x_original,
s_lambda,
coefficient_lower_bounds,
linear_constraints_original,
)
}
pub(super) fn ensure_sparse_positive_definitewithridge<F>(
mut assemble: F,
) -> Result<
(
SparseColMat<usize, f64>,
crate::linalg::sparse_exact::SparseExactFactor,
f64,
),
EstimationError,
>
where
F: FnMut(f64) -> Result<SparseColMat<usize, f64>, EstimationError>,
{
let h0 = assemble(0.0)?;
if let Ok(factor) = factorize_sparse_spd(&h0) {
return Ok((h0, factor, 0.0));
}
let h_eps = assemble(FIXED_STABILIZATION_RIDGE)?;
if let Ok(factor) = factorize_sparse_spd(&h_eps) {
return Ok((h_eps, factor, FIXED_STABILIZATION_RIDGE));
}
let (gershgorin_min, diag_scale) = gershgorin_min_eig_lower_bound(&h_eps);
let scale = diag_scale.max(1.0);
let margin = FIXED_STABILIZATION_RIDGE * scale;
let direct_ridge = (margin - gershgorin_min).max(FIXED_STABILIZATION_RIDGE);
log::warn!(
"sparse penalized Hessian is not positive definite (Gershgorin λ_min ≥ {:.3e}, \
diag scale {:.3e}); regularizing curvature with direct ridge {:.3e}. Exported \
curvature/SEs are stabilized, not exact — investigate rank-deficiency or weight \
underflow in the Hessian assembly.",
gershgorin_min,
scale,
direct_ridge,
);
for ridge in [direct_ridge, direct_ridge * 2.0] {
let h = assemble(ridge)?;
if let Ok(factor) = factorize_sparse_spd(&h) {
return Ok((h, factor, ridge));
}
}
Err(EstimationError::HessianNotPositiveDefinite {
min_eigenvalue: gershgorin_min,
})
}
pub(crate) fn gershgorin_min_eig_lower_bound(h: &SparseColMat<usize, f64>) -> (f64, f64) {
let n = h.ncols();
let mut diag = vec![0.0_f64; n];
let mut radius = vec![0.0_f64; n];
let (symbolic, values) = h.parts();
let col_ptr = symbolic.col_ptr();
let row_idx = symbolic.row_idx();
for col in 0..n {
let start = col_ptr[col];
let end = col_ptr[col + 1];
for idx in start..end {
let row = row_idx[idx];
let value = values[idx];
if row == col {
diag[col] += value;
} else {
let a = value.abs();
radius[row] += a;
radius[col] += a;
}
}
}
let mut min_bound = f64::INFINITY;
let mut diag_scale = 0.0_f64;
for i in 0..n {
min_bound = min_bound.min(diag[i] - radius[i]);
diag_scale = diag_scale.max(diag[i].abs());
}
if !min_bound.is_finite() {
min_bound = f64::NEG_INFINITY;
}
(min_bound, diag_scale)
}
pub(crate) fn solve_subsystem_direction(
h_sub: ndarray::ArrayView2<f64>,
g_sub: ndarray::ArrayView1<f64>,
out: &mut Array1<f64>,
) -> Result<(), EstimationError> {
let n = g_sub.len();
if out.len() != n {
*out = Array1::zeros(n);
}
if let Ok(factor) = StableSolver::new("pirls bounded subsystem").factorize_any(&h_sub) {
out.assign(&g_sub);
let mut rhs = array1_to_col_matmut(out);
factor.solve_in_place(rhs.as_mut());
out.mapv_inplace(|v| -v);
if array_is_finite(out) {
return Ok(());
}
}
let diag_scale = (0..n)
.map(|i| h_sub[[i, i]].abs())
.fold(0.0_f64, f64::max)
.max(1.0);
let mut tau = 1e-8 * diag_scale;
let mut h_reg = h_sub.to_owned();
for _ in 0..12 {
for i in 0..n {
h_reg[[i, i]] = h_sub[[i, i]] + tau;
}
if let Ok(factor) = StableSolver::new("pirls bounded subsystem ridge").factorize(&h_reg) {
out.assign(&g_sub);
let mut rhs = array1_to_col_matmut(out);
factor.solve_in_place(rhs.as_mut());
out.mapv_inplace(|v| -v);
if array_is_finite(out) {
return Ok(());
}
}
tau *= 10.0;
}
let gnorm = g_sub.dot(&g_sub).sqrt();
if gnorm > 0.0 {
let scale = 1.0 / gnorm.max(diag_scale);
for i in 0..n {
out[i] = -g_sub[i] * scale;
}
return Ok(());
}
out.fill(0.0);
Ok(())
}
pub(super) fn linear_constraints_from_lower_bounds(
lower_bounds: &Array1<f64>,
) -> Option<LinearInequalityConstraints> {
LinearInequalityConstraints::from_per_coordinate_lower_bounds(lower_bounds)
}
pub(super) fn compute_constraint_kkt_diagnostics(
beta: &Array1<f64>,
gradient: &Array1<f64>,
constraints: &LinearInequalityConstraints,
) -> ConstraintKktDiagnostics {
active_set::compute_constraint_kkt_diagnostics(beta, gradient, constraints)
}
pub(super) fn select_active_set_release(
gradient: &Array1<f64>,
hd: &Array1<f64>,
active_idx: &[usize],
use_blands: bool,
) -> Option<usize> {
if use_blands {
for &i in active_idx {
let lambda_i = gradient[i] + hd[i];
let scale = gradient[i].abs().max(hd[i].abs()).max(1.0);
let tol = 64.0 * f64::EPSILON * scale;
if lambda_i < -tol {
return Some(i);
}
}
None
} else {
let mut worst = 0.0_f64;
let mut idx = None;
for &i in active_idx {
let lambda_i = gradient[i] + hd[i];
if lambda_i < worst {
worst = lambda_i;
idx = Some(i);
}
}
idx
}
}
pub(crate) fn solve_newton_directionwith_lower_bounds(
hessian: &Array2<f64>,
gradient: &Array1<f64>,
beta: &Array1<f64>,
lower_bounds: &Array1<f64>,
direction_out: &mut Array1<f64>,
active_hint: Option<&mut Vec<usize>>,
) -> Result<(), EstimationError> {
let p = gradient.len();
if lower_bounds.len() != p || beta.len() != p {
crate::bail_invalid_estim!(
"lower-bound size mismatch: beta={}, gradient={}, bounds={}",
beta.len(),
gradient.len(),
lower_bounds.len()
);
}
if direction_out.len() != p {
*direction_out = Array1::zeros(p);
}
direction_out.fill(0.0);
let has_active_hint = active_hint
.as_ref()
.map(|hint| !hint.is_empty())
.unwrap_or(false);
if !has_active_hint && solve_newton_direction_dense(hessian, gradient, direction_out).is_ok() {
let mut feasible = true;
for i in 0..p {
let lb = lower_bounds[i];
if lb.is_finite() && beta[i] + direction_out[i] < lb {
feasible = false;
break;
}
}
if feasible {
return Ok(());
}
}
let mut active = vec![false; p];
if let Some(hint) = active_hint.as_ref() {
for &idx in hint.iter() {
if idx < p {
active[idx] = true;
}
}
}
for i in 0..p {
let lb = lower_bounds[i];
if lb.is_finite() && gradient[i] > 0.0 {
let scale = beta[i].abs().max(lb.abs()).max(1.0);
let tol = ACTIVE_BOUND_REL_TOL * scale + ACTIVE_BOUND_ABS_TOL;
if beta[i] <= lb + tol {
active[i] = true;
}
}
}
const BLANDS_RULE_GRACE: usize = 2;
let blands_threshold = BLANDS_RULE_GRACE * (p + 1);
let max_iters = 8 * (p + 1);
let mut d_free = Array1::<f64>::zeros(p);
let mut h_ff_buf = Array2::<f64>::zeros((p, p));
let mut g_f_buf = Array1::<f64>::zeros(p);
for it in 0..max_iters {
let use_blands = it >= blands_threshold;
let free_idx: Vec<usize> = (0..p).filter(|&i| !active[i]).collect();
let active_idx: Vec<usize> = (0..p).filter(|&i| active[i]).collect();
direction_out.fill(0.0);
for &i in &active_idx {
let lb = lower_bounds[i];
if lb.is_finite() {
direction_out[i] = lb - beta[i];
}
}
if free_idx.is_empty() {
let hd = fast_av(hessian, direction_out);
if let Some(idx) = select_active_set_release(gradient, &hd, &active_idx, use_blands) {
active[idx] = false;
continue;
}
if let Some(hint) = active_hint {
hint.clear();
hint.extend((0..p).filter(|&i| active[i]));
}
return Ok(());
}
let n_free = free_idx.len();
{
let mut h_ff = h_ff_buf.slice_mut(ndarray::s![..n_free, ..n_free]);
let mut g_f = g_f_buf.slice_mut(ndarray::s![..n_free]);
for (ii, &i) in free_idx.iter().enumerate() {
let mut gi = gradient[i];
for &j in &active_idx {
gi += hessian[[i, j]] * direction_out[j];
}
g_f[ii] = gi;
for (jj, &j) in free_idx.iter().enumerate() {
h_ff[[ii, jj]] = hessian[[i, j]];
}
}
}
solve_subsystem_direction(
h_ff_buf.slice(ndarray::s![..n_free, ..n_free]),
g_f_buf.slice(ndarray::s![..n_free]),
&mut d_free,
)?;
for (ii, &i) in free_idx.iter().enumerate() {
direction_out[i] = d_free[ii];
}
let mut hit_idx: Option<usize> = None;
let mut best_alpha = 1.0_f64;
for &i in &free_idx {
let lb = lower_bounds[i];
if !lb.is_finite() {
continue;
}
let slack = beta[i] - lb;
let di = direction_out[i];
if let Some(alpha_i) = boundary_hit_step_fraction(slack, di, best_alpha) {
best_alpha = alpha_i;
hit_idx = Some(i);
}
}
if let Some(i_hit) = hit_idx {
for i in 0..p {
direction_out[i] *= best_alpha;
}
active[i_hit] = true;
continue;
}
let hd = fast_av(hessian, direction_out);
if let Some(idx) = select_active_set_release(gradient, &hd, &active_idx, use_blands) {
active[idx] = false;
continue;
}
if let Some(hint) = active_hint {
hint.clear();
hint.extend((0..p).filter(|&i| active[i]));
}
return Ok(());
}
let gnorm = gradient.dot(gradient).sqrt();
if gnorm > 0.0 {
let diag_scale = (0..p)
.map(|i| hessian[[i, i]].abs())
.fold(0.0_f64, f64::max)
.max(1.0);
let step_scale = 1.0 / diag_scale;
for i in 0..p {
let di = -gradient[i] * step_scale;
let lb = lower_bounds[i];
if lb.is_finite() && beta[i] + di < lb {
direction_out[i] = lb - beta[i];
} else {
direction_out[i] = di;
}
}
} else {
direction_out.fill(0.0);
}
if let Some(hint) = active_hint {
hint.clear();
}
Ok(())
}
pub(super) fn solve_newton_directionwith_linear_constraints(
hessian: &Array2<f64>,
gradient: &Array1<f64>,
beta: &Array1<f64>,
constraints: &LinearInequalityConstraints,
direction_out: &mut Array1<f64>,
active_hint: Option<&mut Vec<usize>>,
) -> Result<(), EstimationError> {
active_set::solve_newton_direction_with_linear_constraints(
hessian,
gradient,
beta,
constraints,
direction_out,
active_hint,
)
}