1use ndarray::{Array1, Array2};
20
21use gam_linalg::triangular::{CholeskyGuard, cholesky_factor_in_place, cholesky_solve_vector};
22use crate::arrow_schur::{ArrowSchurSystem, DeviceSaePcgData, PcgDiagnostics};
23
24pub struct ArrowSchurGpuSolution {
26 pub delta_t: Array1<f64>,
27 pub delta_beta: Array1<f64>,
28 pub log_det_hessian: f64,
31}
32
33#[derive(Debug, Clone)]
37pub enum ArrowSchurGpuFailure {
38 Unavailable,
40 RidgeBumpRequired { row: usize, bump: f64 },
43 SchurFactorFailed { reason: String },
46 GpuRequiresDenseSystem {
52 had_hbb_matvec: bool,
53 had_htbeta_matvec: bool,
54 },
55}
56
57const RIDGE_BUMP_EPS_MARGIN: f64 = 1024.0;
70
71pub fn solve_arrow_newton_step(
75 sys: &ArrowSchurSystem,
76 ridge_t: f64,
77 ridge_beta: f64,
78) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
79 let n = sys.rows.len();
80 let d = sys.d;
81 let k = sys.k;
82
83 let had_hbb_matvec = sys.hbb_matvec.is_some();
88 let had_htbeta_matvec = sys.htbeta_matvec.is_some();
89 if had_hbb_matvec || had_htbeta_matvec {
90 return Err(ArrowSchurGpuFailure::GpuRequiresDenseSystem {
91 had_hbb_matvec,
92 had_htbeta_matvec,
93 });
94 }
95
96 if sys.hbb.dim() != (k, k) {
97 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
98 reason: "CUDA arrow-Schur requires a dense shared beta block".to_string(),
99 });
100 }
101 if n == 0 || d == 0 {
102 return Err(ArrowSchurGpuFailure::Unavailable);
103 }
104 if sys
105 .rows
106 .iter()
107 .any(|row| row.htt.dim() != (d, d) || row.htbeta.dim() != (d, k) || row.gt.len() != d)
108 {
109 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
110 reason: "row block dimension mismatch".to_string(),
111 });
112 }
113
114 #[cfg(not(target_os = "linux"))]
115 {
116 if ridge_t.is_nan() || ridge_beta.is_nan() {
117 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
118 reason: "ridge is NaN".to_string(),
119 });
120 }
121 Err(ArrowSchurGpuFailure::Unavailable)
122 }
123
124 #[cfg(target_os = "linux")]
125 {
126 if gam_gpu::device_runtime::GpuRuntime::global()
135 .map(gam_gpu::device_runtime::GpuRuntime::device_count)
136 .unwrap_or(0)
137 > 1
138 {
139 match cuda::solve_multi_gpu(sys, ridge_t, ridge_beta) {
140 Ok(sol) => return Ok(sol),
141 Err(ArrowSchurGpuFailure::RidgeBumpRequired { row, bump }) => {
142 return Err(ArrowSchurGpuFailure::RidgeBumpRequired { row, bump });
143 }
144 Err(ArrowSchurGpuFailure::SchurFactorFailed { reason }) => {
145 return Err(ArrowSchurGpuFailure::SchurFactorFailed { reason });
146 }
147 Err(_) => {}
150 }
151 }
152 if crate::gpu_kernels::arrow_schur_nvrtc::system_admits_fused_path(sys) {
158 match cuda::solve_fused(sys, ridge_t, ridge_beta) {
159 Ok(sol) => return Ok(sol),
160 Err(ArrowSchurGpuFailure::RidgeBumpRequired { row, bump }) => {
164 return Err(ArrowSchurGpuFailure::RidgeBumpRequired { row, bump });
165 }
166 Err(_) => {}
170 }
171 }
172 cuda::solve(sys, ridge_t, ridge_beta)
173 }
174}
175
176#[cfg(target_os = "linux")]
182fn pack_host(sys: &ArrowSchurSystem, ridge_t: f64) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
183 let n = sys.rows.len();
184 let d = sys.d;
185 let k = sys.k;
186 let mut d_buf = Vec::with_capacity(n * d * d);
187 let mut b_buf = Vec::with_capacity(n * d * k);
188 let mut g_buf = Vec::with_capacity(n * d);
189 for row in &sys.rows {
190 pack_block(row, ridge_t, d, k, &mut d_buf, &mut b_buf, &mut g_buf);
191 }
192 (d_buf, b_buf, g_buf)
193}
194
195#[cfg(target_os = "linux")]
196#[inline]
197fn pack_block(
198 row: &crate::arrow_schur::ArrowRowBlock,
199 ridge_t: f64,
200 d: usize,
201 k: usize,
202 d_buf: &mut Vec<f64>,
203 b_buf: &mut Vec<f64>,
204 g_buf: &mut Vec<f64>,
205) {
206 for col in 0..d {
207 for r in 0..d {
208 let mut value = row.htt[[r, col]];
209 if r == col {
210 value += ridge_t;
211 }
212 d_buf.push(value);
213 }
214 }
215 for col in 0..k {
216 for r in 0..d {
217 b_buf.push(row.htbeta[[r, col]]);
218 }
219 }
220 for r in 0..d {
221 g_buf.push(row.gt[r]);
222 }
223}
224
225#[doc(hidden)]
230#[cfg_attr(not(target_os = "linux"), allow(unused_variables))] pub fn solve_arrow_newton_step_fused_force(
232 sys: &ArrowSchurSystem,
233 ridge_t: f64,
234 ridge_beta: f64,
235) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
236 if ridge_t.is_nan() || ridge_beta.is_nan() {
237 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
238 reason: "ridge is NaN".to_string(),
239 });
240 }
241 #[cfg(not(target_os = "linux"))]
242 {
243 Err(ArrowSchurGpuFailure::Unavailable)
248 }
249 #[cfg(target_os = "linux")]
250 {
251 if crate::gpu_kernels::arrow_schur_nvrtc::plan_fused_launch(sys.rows.len(), sys.d, sys.k)
252 .is_none()
253 {
254 return Err(ArrowSchurGpuFailure::Unavailable);
255 }
256 cuda::solve_fused(sys, ridge_t, ridge_beta)
257 }
258}
259
260pub struct ResidentArrowFrameHandle {
270 #[cfg(target_os = "linux")]
271 inner: cuda::ResidentArrowFrame,
272 #[cfg(not(target_os = "linux"))]
273 _never: std::convert::Infallible,
274}
275
276impl ResidentArrowFrameHandle {
277 pub fn new(
279 sys: &ArrowSchurSystem,
280 ridge_t: f64,
281 ridge_beta: f64,
282 ) -> Result<Self, ArrowSchurGpuFailure> {
283 if sys.hbb_matvec.is_some() || sys.htbeta_matvec.is_some() {
286 return Err(ArrowSchurGpuFailure::GpuRequiresDenseSystem {
287 had_hbb_matvec: sys.hbb_matvec.is_some(),
288 had_htbeta_matvec: sys.htbeta_matvec.is_some(),
289 });
290 }
291 #[cfg(not(target_os = "linux"))]
292 {
293 if ridge_t.is_nan() || ridge_beta.is_nan() {
294 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
295 reason: "ridge is NaN".to_string(),
296 });
297 }
298 Err(ArrowSchurGpuFailure::Unavailable)
299 }
300 #[cfg(target_os = "linux")]
301 {
302 Ok(Self {
303 inner: cuda::ResidentArrowFrame::new(sys, ridge_t, ridge_beta)?,
304 })
305 }
306 }
307
308 pub fn solve_gradient(
310 &self,
311 g_t: &[f64],
312 g_beta: &[f64],
313 ) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
314 #[cfg(not(target_os = "linux"))]
315 {
316 if g_t.iter().chain(g_beta).any(|v| !v.is_finite()) {
317 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
318 reason: "non-finite gradient entry".to_string(),
319 });
320 }
321 Err(ArrowSchurGpuFailure::Unavailable)
322 }
323 #[cfg(target_os = "linux")]
324 {
325 self.inner.solve_gradient(g_t, g_beta)
326 }
327 }
328
329 #[must_use]
331 pub fn log_det_hessian(&self) -> f64 {
332 #[cfg(not(target_os = "linux"))]
333 {
334 panic!("ResidentArrowFrameHandle cannot be constructed off CUDA")
340 }
341 #[cfg(target_os = "linux")]
342 {
343 self.inner.log_det_hessian()
344 }
345 }
346}
347
348pub fn gpu_schur_matvec_backend(
391 sys: &ArrowSchurSystem,
392 ridge_t: f64,
393 ridge_beta: f64,
394) -> Result<crate::arrow_schur::GpuSchurMatvec, ArrowSchurGpuFailure> {
395 if sys.htbeta_matvec.is_some() {
398 return build_row_procedural_matvec(sys, ridge_t, ridge_beta);
399 }
400
401 #[cfg(not(target_os = "linux"))]
402 {
403 if ridge_t.is_nan() || ridge_beta.is_nan() {
406 return Err(ArrowSchurGpuFailure::Unavailable);
407 }
408 Err(ArrowSchurGpuFailure::Unavailable)
409 }
410
411 #[cfg(target_os = "linux")]
412 {
413 cuda::build_schur_matvec_backend(sys, ridge_t, ridge_beta)
414 }
415}
416
417fn build_row_procedural_matvec(
434 sys: &ArrowSchurSystem,
435 ridge_t: f64,
436 ridge_beta: f64,
437) -> Result<crate::arrow_schur::GpuSchurMatvec, ArrowSchurGpuFailure> {
438 use std::sync::Arc;
439 let n = sys.rows.len();
440 let k = sys.k;
441 let forward = sys
442 .htbeta_matvec
443 .clone()
444 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
445 let transpose = sys.htbeta_transpose_matvec.clone().ok_or_else(|| {
446 ArrowSchurGpuFailure::SchurFactorFailed {
451 reason: "row-procedural Schur matvec requires htbeta_transpose_matvec; \
452 forward operator installed without its sparse adjoint"
453 .to_string(),
454 }
455 })?;
456
457 let mut factors: Vec<Array2<f64>> = Vec::with_capacity(n);
462 for (i, row) in sys.rows.iter().enumerate() {
463 let di = row.htt.nrows();
464 if row.htt.ncols() != di || row.gt.len() != di {
465 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
466 reason: format!("row {i}: malformed H_tt block {:?}", row.htt.dim()),
467 });
468 }
469 let mut block = row.htt.clone();
470 for r in 0..di {
471 block[[r, r]] += ridge_t;
472 }
473 let factor = cholesky_factor_in_place(block.view(), CholeskyGuard::NonnegativePivot)
474 .ok_or_else(|| {
475 let scale = row
476 .htt
477 .diag()
478 .iter()
479 .map(|v| v.abs())
480 .fold(0.0_f64, f64::max)
481 .max(1.0);
482 ArrowSchurGpuFailure::RidgeBumpRequired {
483 row: i,
484 bump: scale * f64::EPSILON.sqrt() * RIDGE_BUMP_EPS_MARGIN,
485 }
486 })?;
487 factors.push(factor);
488 }
489
490 let penalty_op = sys.effective_penalty_op();
497 let row_dims: Vec<usize> = sys.rows.iter().map(|row| row.htt.nrows()).collect();
498
499 let closure: crate::arrow_schur::GpuSchurMatvec =
500 Arc::new(move |x: &Array1<f64>, out: &mut Array1<f64>| {
501 assert_eq!(x.len(), k, "row-procedural matvec: x.len() != k");
502 assert_eq!(out.len(), k, "row-procedural matvec: out.len() != k");
503
504 {
507 let x_slice = x.as_slice().expect("x must be contiguous");
508 let out_slice = out.as_slice_mut().expect("out must be contiguous");
509 for a in 0..k {
510 out_slice[a] = ridge_beta * x_slice[a];
511 }
512 penalty_op.matvec(x_slice, out_slice);
513 }
514
515 let parallel = n >= crate::arrow_schur::SCHUR_MATVEC_PARALLEL_ROW_MIN
542 && rayon::current_thread_index().is_none();
543 if parallel {
544 use rayon::prelude::*;
545 const CHUNK: usize = 64;
546 let partials: Vec<Array1<f64>> = (0..n)
547 .into_par_iter()
548 .chunks(CHUNK)
549 .map(|idxs| {
550 let mut neg = Array1::<f64>::zeros(k);
554 for i in idxs {
555 let di = row_dims[i];
556 let mut v_i = Array1::<f64>::zeros(di);
558 forward(i, x.view(), &mut v_i);
559 let w_i = cholesky_solve_vector(factors[i].view(), v_i.view());
561 transpose(i, w_i.view(), &mut neg);
563 }
564 neg
565 })
566 .collect();
567 let mut neg = Array1::<f64>::zeros(k);
576 for part in &partials {
577 for a in 0..k {
578 neg[a] += part[a];
579 }
580 }
581 for a in 0..k {
582 out[a] -= neg[a];
583 }
584 } else {
585 let mut neg = Array1::<f64>::zeros(k);
587 for i in 0..n {
588 let di = row_dims[i];
589 let mut v_i = Array1::<f64>::zeros(di);
591 forward(i, x.view(), &mut v_i);
592 let w_i = cholesky_solve_vector(factors[i].view(), v_i.view());
594 transpose(i, w_i.view(), &mut neg);
596 }
597 for a in 0..k {
598 out[a] -= neg[a];
599 }
600 }
601 });
602
603 Ok(closure)
604}
605
606pub fn solve_reduced_beta_pcg(
628 s_acc: &Array2<f64>,
629 rhs_beta: &Array1<f64>,
630 max_iterations: usize,
631 relative_tolerance: f64,
632) -> Result<Array1<f64>, ArrowSchurGpuFailure> {
633 solve_reduced_beta_pcg_with_diagnostics(s_acc, rhs_beta, max_iterations, relative_tolerance)
634 .map(|(x, _)| x)
635}
636
637#[doc(hidden)]
638pub fn solve_reduced_beta_pcg_with_diagnostics(
639 s_acc: &Array2<f64>,
640 rhs_beta: &Array1<f64>,
641 max_iterations: usize,
642 relative_tolerance: f64,
643) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurGpuFailure> {
644 let k = rhs_beta.len();
645 if s_acc.dim() != (k, k) {
646 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
647 reason: format!(
648 "reduced-β GPU PCG requires a square (k×k) Schur block; got {:?} for k={k}",
649 s_acc.dim()
650 ),
651 });
652 }
653 if k == 0 {
654 return Err(ArrowSchurGpuFailure::Unavailable);
655 }
656
657 #[cfg(not(target_os = "linux"))]
658 {
659 if relative_tolerance.is_nan() || max_iterations == 0 {
660 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
661 reason: "reduced-β GPU PCG: invalid CG controls".to_string(),
662 });
663 }
664 Err(ArrowSchurGpuFailure::Unavailable)
665 }
666
667 #[cfg(target_os = "linux")]
668 {
669 cuda::solve_reduced_beta_pcg_with_diagnostics(
670 s_acc,
671 rhs_beta,
672 max_iterations,
673 relative_tolerance,
674 )
675 }
676}
677
678pub fn solve_sae_matrix_free_pcg(
679 sys: &ArrowSchurSystem,
680 data: &DeviceSaePcgData,
681 ridge_t: f64,
682 ridge_beta: f64,
683 rhs_beta: &Array1<f64>,
684 max_iterations: usize,
685 relative_tolerance: f64,
686) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurGpuFailure> {
687 if sys.k != data.beta_dim || rhs_beta.len() != data.beta_dim || data.p == 0 {
688 return Err(ArrowSchurGpuFailure::Unavailable);
689 }
690 #[cfg(not(target_os = "linux"))]
691 {
692 if ridge_t.is_nan()
693 || ridge_beta.is_nan()
694 || relative_tolerance.is_nan()
695 || max_iterations == 0
696 {
697 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
698 reason: "SAE matrix-free GPU PCG: invalid controls".to_string(),
699 });
700 }
701 Err(ArrowSchurGpuFailure::Unavailable)
702 }
703 #[cfg(target_os = "linux")]
704 {
705 if data.frame.is_some() {
712 cuda::solve_sae_matrix_free_pcg_framed(
713 sys,
714 data,
715 ridge_t,
716 ridge_beta,
717 rhs_beta,
718 max_iterations,
719 relative_tolerance,
720 )
721 } else {
722 cuda::solve_sae_matrix_free_pcg(
723 sys,
724 data,
725 ridge_t,
726 ridge_beta,
727 rhs_beta,
728 max_iterations,
729 relative_tolerance,
730 )
731 }
732 }
733}
734
735#[doc(hidden)]
739pub fn solve_arrow_newton_step_dense_reference(
740 sys: &ArrowSchurSystem,
741 ridge_t: f64,
742 ridge_beta: f64,
743) -> Result<ArrowSchurGpuSolution, String> {
744 let n = sys.rows.len();
745 let d = sys.d;
746 let k = sys.k;
747 let total = n.checked_mul(d).ok_or("dimension overflow")? + k;
748 let mut h = Array2::<f64>::zeros((total, total));
749 let mut rhs = Array1::<f64>::zeros(total);
750 for (i, row) in sys.rows.iter().enumerate() {
751 let base = i * d;
752 for c in 0..d {
753 for r in 0..d {
754 h[[base + r, base + c]] = row.htt[[r, c]];
755 }
756 h[[base + c, base + c]] += ridge_t;
757 }
758 for c in 0..k {
759 for r in 0..d {
760 let value = row.htbeta[[r, c]];
761 h[[base + r, n * d + c]] = value;
762 h[[n * d + c, base + r]] = value;
763 }
764 }
765 for r in 0..d {
766 rhs[base + r] = -row.gt[r];
767 }
768 }
769 for c in 0..k {
770 for r in 0..k {
771 h[[n * d + r, n * d + c]] += sys.hbb[[r, c]];
772 }
773 h[[n * d + c, n * d + c]] += ridge_beta;
774 rhs[n * d + c] = -sys.gb[c];
775 }
776 let factor = cholesky_factor_in_place(h.view(), CholeskyGuard::NonnegativePivot)
777 .ok_or_else(|| "dense reference Cholesky failed".to_string())?;
778 let mut log_det = 0.0_f64;
779 for i in 0..total {
780 log_det += factor[[i, i]].ln();
781 }
782 log_det *= 2.0;
783 let solved = cholesky_solve_vector(factor.view(), rhs.view());
784 let delta_t = solved.slice(ndarray::s![..n * d]).to_owned();
785 let delta_beta = solved.slice(ndarray::s![n * d..]).to_owned();
786 Ok(ArrowSchurGpuSolution {
787 delta_t,
788 delta_beta,
789 log_det_hessian: log_det,
790 })
791}
792
793#[doc(hidden)]
804pub fn sae_framed_penalty_matvec_cpu(
805 data: &DeviceSaePcgData,
806 ridge_beta: f64,
807 x: &[f64],
808 out: &mut [f64],
809) {
810 let frame = data
811 .frame
812 .as_ref()
813 .expect("sae_framed_penalty_matvec_cpu requires frame metadata");
814 let k = data.beta_dim;
815 for a in 0..k {
816 out[a] = ridge_beta * x[a];
817 }
818 for (blk, &r) in data.smooth_blocks.iter().zip(frame.smooth_ranks.iter()) {
820 let off = blk.global_offset;
821 let m = blk.factor_a.nrows();
822 for i_a in 0..m {
823 for i_b in 0..r {
824 let mut acc = 0.0_f64;
825 for j_a in 0..m {
826 let s = blk.factor_a[[i_a, j_a]];
827 if s == 0.0 {
828 continue;
829 }
830 acc += s * x[off + j_a * r + i_b];
831 }
832 out[off + i_a * r + i_b] += acc;
833 }
834 }
835 }
836 for blk in &frame.frame_blocks {
838 let r_i = frame.ranks[blk.atom_i];
839 let r_j = frame.ranks[blk.atom_j];
840 let off_i = frame.border_offsets[blk.atom_i];
841 let off_j = frame.border_offsets[blk.atom_j];
842 let (m_i, m_j) = blk.g.dim();
843 for li in 0..m_i {
844 let yi_base = off_i + li * r_i;
845 for lj in 0..m_j {
846 let g = blk.g[[li, lj]];
847 if g == 0.0 {
848 continue;
849 }
850 let xj_base = off_j + lj * r_j;
851 for a in 0..r_i {
852 let mut acc = 0.0_f64;
853 for b in 0..r_j {
854 acc += blk.w[[a, b]] * x[xj_base + b];
855 }
856 out[yi_base + a] += g * acc;
857 }
858 }
859 }
860 }
861}
862
863#[doc(hidden)]
872pub fn sae_framed_schur_matvec_cpu(
873 sys: &ArrowSchurSystem,
874 data: &DeviceSaePcgData,
875 ridge_t: f64,
876 ridge_beta: f64,
877 x: &[f64],
878 out: &mut [f64],
879) -> Result<(), String> {
880 let frame = data
881 .frame
882 .as_ref()
883 .ok_or("sae_framed_schur_matvec_cpu requires frame metadata")?;
884 let k = data.beta_dim;
885 sae_framed_penalty_matvec_cpu(data, ridge_beta, x, out);
886 if frame.row_htbeta.len() != sys.rows.len() {
887 return Err(format!(
888 "sae_framed_schur_matvec_cpu: {} row_htbeta slabs but {} rows",
889 frame.row_htbeta.len(),
890 sys.rows.len()
891 ));
892 }
893 for (i, row) in sys.rows.iter().enumerate() {
894 let slab = &frame.row_htbeta[i];
895 if slab.is_empty() {
896 continue;
897 }
898 let qi = sys.row_dims[i];
899 if qi == 0 || slab.len() != qi * k {
900 continue;
901 }
902 let mut h = vec![0.0_f64; qi];
904 for c in 0..qi {
905 let base = c * k;
906 let mut acc = 0.0_f64;
907 for a in 0..k {
908 acc += slab[base + a] * x[a];
909 }
910 h[c] = acc;
911 }
912 let mut block = row.htt.clone();
914 for d in 0..qi {
915 block[[d, d]] += ridge_t;
916 }
917 let factor = cholesky_factor_in_place(block.view(), CholeskyGuard::NonnegativePivot)
918 .ok_or_else(|| format!("sae_framed_schur_matvec_cpu: row {i} H_tt not PD"))?;
919 let s = cholesky_solve_vector(factor.view(), Array1::from_vec(h).view());
920 for c in 0..qi {
922 let sc = s[c];
923 if sc == 0.0 {
924 continue;
925 }
926 let base = c * k;
927 for a in 0..k {
928 out[a] -= slab[base + a] * sc;
929 }
930 }
931 }
932 Ok(())
933}
934
935#[cfg(target_os = "linux")]
936mod cuda {
937 use super::{ArrowSchurGpuFailure, ArrowSchurGpuSolution, pack_block, pack_host};
938 use gam_gpu::driver::to_i32;
939 use gam_gpu::linalg_dispatch::{DispatchOp, route_through_gpu};
940 use crate::arrow_schur::{
941 ArrowSchurSystem, DeviceSaeFrameData, DeviceSaePcgData, PcgDiagnostics, PcgStopReason,
942 };
943 use cudarc::cublas::sys::{
944 cublasDiagType_t, cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasStatus_t,
945 };
946 use cudarc::cublas::{CudaBlas, Gemm, GemmConfig, Gemv, GemvConfig};
947 use cudarc::cusolver::{DnHandle, sys as cusolver_sys};
948 use cudarc::driver::{
949 CudaContext, CudaModule, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, LaunchConfig,
950 PushKernelArg,
951 };
952 use ndarray::Array1;
953 use std::sync::{Arc, OnceLock};
954
955 struct RowSlot {
960 d_block: Vec<f64>, b_block: Vec<f64>, g_vec: Vec<f64>, diag_scale: f64, l_block: Vec<f64>, u_vec: Vec<f64>, y_block: Vec<f64>, log_det_local: f64,
970 bump: Option<f64>,
973 tile_partial_schur: Option<Vec<f64>>, tile_partial_rhs: Option<Vec<f64>>, delta_t_block: Vec<f64>, }
979
980 pub(super) fn solve_multi_gpu(
1001 sys: &ArrowSchurSystem,
1002 ridge_t: f64,
1003 ridge_beta: f64,
1004 ) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
1005 let n = sys.rows.len();
1006 let d = sys.d;
1007 let k = sys.k;
1008 if n == 0 || d == 0 || k == 0 {
1009 return Err(ArrowSchurGpuFailure::Unavailable);
1010 }
1011 if sys.hbb_matvec.is_some() || sys.htbeta_matvec.is_some() || sys.hbb.dim() != (k, k) {
1015 return Err(ArrowSchurGpuFailure::Unavailable);
1016 }
1017
1018 let runtime = gam_gpu::device_runtime::GpuRuntime::global()
1019 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
1020 if runtime.device_count() < 2 {
1021 return Err(ArrowSchurGpuFailure::Unavailable);
1022 }
1023
1024 let mut slots: Vec<RowSlot> = Vec::with_capacity(n);
1026 for row in &sys.rows {
1027 if row.htt.dim() != (d, d) || row.htbeta.dim() != (d, k) || row.gt.len() != d {
1028 return Err(ArrowSchurGpuFailure::Unavailable);
1029 }
1030 let mut d_block = Vec::with_capacity(d * d);
1031 let mut b_block = Vec::with_capacity(d * k);
1032 let mut g_vec = Vec::with_capacity(d);
1033 pack_block(row, ridge_t, d, k, &mut d_block, &mut b_block, &mut g_vec);
1034 let diag_scale = row
1035 .htt
1036 .diag()
1037 .iter()
1038 .map(|v| v.abs())
1039 .fold(0.0_f64, f64::max)
1040 .max(1.0);
1041 slots.push(RowSlot {
1042 d_block,
1043 b_block,
1044 g_vec,
1045 diag_scale,
1046 l_block: Vec::new(),
1047 u_vec: Vec::new(),
1048 y_block: Vec::new(),
1049 log_det_local: 0.0,
1050 bump: None,
1051 tile_partial_schur: None,
1052 tile_partial_rhs: None,
1053 delta_t_block: vec![0.0; d],
1054 });
1055 }
1056
1057 let forward_ok = gam_gpu::pool::scatter_batched(runtime, &mut slots, |ordinal, tile| {
1059 forward_tile(ordinal, d, k, tile)
1060 });
1061 if forward_ok.is_none() {
1062 return Err(ArrowSchurGpuFailure::Unavailable);
1063 }
1064
1065 let row_base_of_tile = gam_gpu::pool::balanced_partition(runtime, n);
1067 if let Some((row, bump)) = slots
1068 .iter()
1069 .enumerate()
1070 .find_map(|(i, slot)| slot.bump.map(|b| (i, b)))
1071 {
1072 return Err(ArrowSchurGpuFailure::RidgeBumpRequired { row, bump });
1073 }
1074
1075 let mut schur_host = vec![0.0_f64; k * k];
1080 for col in 0..k {
1081 for row in 0..k {
1082 let mut v = sys.hbb[[row, col]];
1083 if row == col {
1084 v += ridge_beta;
1085 }
1086 schur_host[col * k + row] = v;
1087 }
1088 }
1089 let mut rhs_host: Vec<f64> = sys.gb.iter().map(|v| -v).collect();
1090 let mut log_det = 0.0_f64;
1091 for start in tile_starts(&row_base_of_tile) {
1092 let slot = &slots[start];
1093 let partial_schur = slot
1094 .tile_partial_schur
1095 .as_ref()
1096 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
1097 let partial_rhs = slot
1098 .tile_partial_rhs
1099 .as_ref()
1100 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
1101 for idx in 0..k * k {
1106 schur_host[idx] += partial_schur[idx];
1107 }
1108 for a in 0..k {
1109 rhs_host[a] += partial_rhs[a];
1110 }
1111 }
1112 for slot in &slots {
1113 log_det += slot.log_det_local;
1114 }
1115
1116 let primary = runtime.selected_device().ordinal;
1120 let stream = gam_gpu::device_runtime::cuda_context_for(primary)
1121 .and_then(|ctx| ctx.new_stream().ok())
1122 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
1123 let solver =
1124 DnHandle::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1125 let blas = CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1126 let mut schur_dev = stream
1127 .clone_htod(&schur_host)
1128 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1129 let mut rhs_dev = stream
1130 .clone_htod(&rhs_host)
1131 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1132 let info = potrf_single(&solver, &stream, k, &mut schur_dev)?;
1133 if info != 0 {
1134 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
1135 reason: format!("multi-GPU Schur Cholesky failed at pivot {info}"),
1136 });
1137 }
1138 trsm_single(&blas, &stream, k, &schur_dev, &mut rhs_dev, false, false)?;
1139 trsm_single(&blas, &stream, k, &schur_dev, &mut rhs_dev, false, true)?;
1140 let delta_beta_host = stream
1141 .clone_dtoh(&rhs_dev)
1142 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1143 let delta_beta = Array1::from_vec(delta_beta_host.clone());
1144 let l_schur_host = stream
1145 .clone_dtoh(&schur_dev)
1146 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1147 for j in 0..k {
1148 log_det += l_schur_host[j * k + j].ln();
1149 }
1150 log_det *= 2.0;
1151
1152 let delta_beta_ref = &delta_beta_host;
1154 let back_ok = gam_gpu::pool::scatter_batched(runtime, &mut slots, |ordinal, tile| {
1155 back_sub_tile(ordinal, d, k, delta_beta_ref, tile)
1156 });
1157 if back_ok.is_none() {
1158 return Err(ArrowSchurGpuFailure::Unavailable);
1159 }
1160
1161 let mut delta_t = Array1::<f64>::zeros(n * d);
1163 for (i, slot) in slots.iter().enumerate() {
1164 let base = i * d;
1165 for r in 0..d {
1166 delta_t[base + r] = slot.delta_t_block[r];
1167 }
1168 }
1169
1170 Ok(ArrowSchurGpuSolution {
1171 delta_t,
1172 delta_beta,
1173 log_det_hessian: log_det,
1174 })
1175 }
1176
1177 fn tile_starts(tiles: &[(usize, std::ops::Range<usize>)]) -> impl Iterator<Item = usize> + '_ {
1180 tiles.iter().map(|(_, range)| range.start)
1181 }
1182
1183 fn forward_tile(ordinal: usize, d: usize, k: usize, tile: &mut [RowSlot]) -> Option<()> {
1191 if tile.is_empty() {
1192 return Some(());
1193 }
1194 let stream = gam_gpu::device_runtime::cuda_context_for(ordinal)
1197 .and_then(|ctx| ctx.new_stream().ok())?;
1198 let solver = DnHandle::new(stream.clone()).ok()?;
1199 let blas = CudaBlas::new(stream.clone()).ok()?;
1200 let m = tile.len();
1201
1202 let mut d_host = Vec::with_capacity(m * d * d);
1205 let mut b_host = Vec::with_capacity(m * d * k);
1206 let mut g_host = Vec::with_capacity(m * d);
1207 for slot in tile.iter() {
1208 d_host.extend_from_slice(&slot.d_block);
1209 b_host.extend_from_slice(&slot.b_block);
1210 g_host.extend_from_slice(&slot.g_vec);
1211 }
1212 let mut d_dev = stream.clone_htod(&d_host).ok()?;
1213 let mut b_dev = stream.clone_htod(&b_host).ok()?;
1214 let mut g_dev = stream.clone_htod(&g_host).ok()?;
1215
1216 let info_host = potrf_batched(&solver, &stream, d, m, &mut d_dev).ok()?;
1218 if let Some(local) = info_host.iter().position(|info| *info != 0) {
1219 let pivot = info_host[local];
1220 tile[local].bump = Some(
1221 tile[local].diag_scale
1222 * (f64::from(pivot).abs()).max(1.0)
1223 * f64::EPSILON.sqrt()
1224 * super::RIDGE_BUMP_EPS_MARGIN,
1225 );
1226 return Some(());
1227 }
1228
1229 trsm_batched_lower_inplace(&blas, &stream, d, m, 1, &d_dev, &mut g_dev).ok()?;
1231 trsm_batched_lower_inplace(&blas, &stream, d, m, k, &d_dev, &mut b_dev).ok()?;
1232
1233 let mut schur_dev = stream.alloc_zeros::<f64>(k * k).ok()?;
1235 let mut rhs_dev = stream.alloc_zeros::<f64>(k).ok()?;
1236 accumulate_schur(&blas, d, k, m, &b_dev, &g_dev, &mut schur_dev, &mut rhs_dev).ok()?;
1237
1238 let l_host = stream.clone_dtoh(&d_dev).ok()?;
1240 let u_host = stream.clone_dtoh(&g_dev).ok()?;
1241 let y_host = stream.clone_dtoh(&b_dev).ok()?;
1242 let partial_schur = stream.clone_dtoh(&schur_dev).ok()?;
1243 let partial_rhs = stream.clone_dtoh(&rhs_dev).ok()?;
1244
1245 for (local, slot) in tile.iter_mut().enumerate() {
1246 let l_base = local * d * d;
1247 let u_base = local * d;
1248 let y_base = local * d * k;
1249 slot.l_block = l_host[l_base..l_base + d * d].to_vec();
1250 slot.u_vec = u_host[u_base..u_base + d].to_vec();
1251 slot.y_block = y_host[y_base..y_base + d * k].to_vec();
1252 let mut log_det_local = 0.0_f64;
1253 for j in 0..d {
1254 log_det_local += l_host[l_base + j * d + j].ln();
1255 }
1256 slot.log_det_local = log_det_local;
1257 }
1258 tile[0].tile_partial_schur = Some(partial_schur);
1259 tile[0].tile_partial_rhs = Some(partial_rhs);
1260 Some(())
1261 }
1262
1263 fn back_sub_tile(
1267 ordinal: usize,
1268 d: usize,
1269 k: usize,
1270 delta_beta: &[f64],
1271 tile: &mut [RowSlot],
1272 ) -> Option<()> {
1273 if tile.is_empty() {
1274 return Some(());
1275 }
1276 let stream = gam_gpu::device_runtime::cuda_context_for(ordinal)
1279 .and_then(|ctx| ctx.new_stream().ok())?;
1280 let blas = CudaBlas::new(stream.clone()).ok()?;
1281 let m = tile.len();
1282
1283 let mut l_host = Vec::with_capacity(m * d * d);
1284 let mut u_host = Vec::with_capacity(m * d);
1285 let mut y_host = Vec::with_capacity(m * d * k);
1286 for slot in tile.iter() {
1287 l_host.extend_from_slice(&slot.l_block);
1288 u_host.extend_from_slice(&slot.u_vec);
1289 y_host.extend_from_slice(&slot.y_block);
1290 }
1291 let d_dev = stream.clone_htod(&l_host).ok()?;
1292 let mut g_dev = stream.clone_htod(&u_host).ok()?;
1293 let b_dev = stream.clone_htod(&y_host).ok()?;
1294 let rhs_dev = stream.clone_htod(&delta_beta.to_vec()).ok()?;
1295
1296 accumulate_back_sub_rhs(&blas, d, k, m, &b_dev, &rhs_dev, &mut g_dev).ok()?;
1298 trsm_batched_lower_inplace_transposed(&blas, &stream, d, m, 1, &d_dev, &mut g_dev).ok()?;
1299 let x_host = stream.clone_dtoh(&g_dev).ok()?;
1300 for (local, slot) in tile.iter_mut().enumerate() {
1301 let base = local * d;
1302 for r in 0..d {
1303 slot.delta_t_block[r] = -x_host[base + r];
1304 }
1305 }
1306 Some(())
1307 }
1308
1309 pub(super) fn solve(
1310 sys: &ArrowSchurSystem,
1311 ridge_t: f64,
1312 ridge_beta: f64,
1313 ) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
1314 let n = sys.rows.len();
1315 let d = sys.d;
1316 let k = sys.k;
1317 let runtime = route_through_gpu(DispatchOp::SmallDenseBatchedPotrf { p: d, batch: n })
1318 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
1319
1320 let stream = gam_gpu::device_runtime::cuda_context_for(runtime.device.ordinal)
1321 .and_then(|ctx| ctx.new_stream().ok())
1322 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
1323 let solver =
1324 DnHandle::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1325 let blas = CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1326
1327 let (d_host, b_host, g_host) = pack_host(sys, ridge_t);
1329 let mut d_dev = stream
1330 .clone_htod(&d_host)
1331 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1332 let mut b_dev = stream
1333 .clone_htod(&b_host)
1334 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1335 let mut g_dev = stream
1336 .clone_htod(&g_host)
1337 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1338
1339 let info_host = potrf_batched(&solver, &stream, d, n, &mut d_dev)?;
1347 if let Some(idx) = info_host.iter().position(|info| *info != 0) {
1348 let pivot = info_host[idx];
1349 let scale = sys.rows[idx]
1350 .htt
1351 .diag()
1352 .iter()
1353 .map(|v| v.abs())
1354 .fold(0.0_f64, f64::max)
1355 .max(1.0);
1356 return Err(ArrowSchurGpuFailure::RidgeBumpRequired {
1357 row: idx,
1358 bump: scale * (pivot.abs() as f64).max(1.0) * f64::EPSILON.sqrt() * 1024.0,
1359 });
1360 }
1361
1362 trsm_batched_lower_inplace(&blas, &stream, d, n, 1, &d_dev, &mut g_dev)?;
1365 trsm_batched_lower_inplace(&blas, &stream, d, n, k, &d_dev, &mut b_dev)?;
1368
1369 let schur_init: Vec<f64> = {
1388 let mut tmp = Vec::with_capacity(k * k);
1389 for col in 0..k {
1390 for row in 0..k {
1391 let mut v = sys.hbb[[row, col]];
1392 if row == col {
1393 v += ridge_beta;
1394 }
1395 tmp.push(v);
1396 }
1397 }
1398 tmp
1399 };
1400 let mut schur_dev = stream
1401 .clone_htod(&schur_init)
1402 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1403 let rhs_init: Vec<f64> = sys.gb.iter().map(|v| -v).collect();
1404 let mut rhs_dev = stream
1405 .clone_htod(&rhs_init)
1406 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1407
1408 accumulate_schur(&blas, d, k, n, &b_dev, &g_dev, &mut schur_dev, &mut rhs_dev)?;
1409
1410 let info = potrf_single(&solver, &stream, k, &mut schur_dev)?;
1412 if info != 0 {
1413 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
1414 reason: format!("Schur Cholesky failed at pivot {info}"),
1415 });
1416 }
1417 trsm_single(&blas, &stream, k, &schur_dev, &mut rhs_dev, false, false)?;
1419 trsm_single(&blas, &stream, k, &schur_dev, &mut rhs_dev, false, true)?;
1420 let delta_beta_host = stream
1421 .clone_dtoh(&rhs_dev)
1422 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1423 let delta_beta = Array1::from_vec(delta_beta_host.clone());
1424
1425 accumulate_back_sub_rhs(&blas, d, k, n, &b_dev, &rhs_dev, &mut g_dev)?;
1433 trsm_batched_lower_inplace_transposed(&blas, &stream, d, n, 1, &d_dev, &mut g_dev)?;
1434
1435 let x_host = stream
1436 .clone_dtoh(&g_dev)
1437 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1438 let mut delta_t = Array1::<f64>::zeros(n * d);
1439 for (i, v) in x_host.iter().enumerate() {
1440 delta_t[i] = -*v;
1441 }
1442
1443 let l_local_host = stream
1445 .clone_dtoh(&d_dev)
1446 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1447 let l_schur_host = stream
1448 .clone_dtoh(&schur_dev)
1449 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1450 let mut log_det = 0.0_f64;
1451 for i in 0..n {
1452 let base = i * d * d;
1453 for j in 0..d {
1454 log_det += l_local_host[base + j * d + j].ln();
1455 }
1456 }
1457 for j in 0..k {
1458 log_det += l_schur_host[j * k + j].ln();
1459 }
1460 log_det *= 2.0;
1461
1462 Ok(ArrowSchurGpuSolution {
1463 delta_t,
1464 delta_beta,
1465 log_det_hessian: log_det,
1466 })
1467 }
1468
1469 fn potrf_batched(
1470 solver: &DnHandle,
1471 stream: &Arc<CudaStream>,
1472 p: usize,
1473 batch: usize,
1474 matrices: &mut CudaSlice<f64>,
1475 ) -> Result<Vec<i32>, ArrowSchurGpuFailure> {
1476 let p_i = to_i32(p).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1477 let batch_i = to_i32(batch).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1478 let matrix_len = p * p;
1479 let bytes_per = (matrix_len * std::mem::size_of::<f64>()) as u64;
1480 let (base_ptr, _record) = matrices.device_ptr_mut(stream);
1481 let mut ptrs = Vec::with_capacity(batch);
1482 for idx in 0..batch {
1483 ptrs.push(base_ptr + (idx as u64) * bytes_per);
1484 }
1485 let mut ptrs_dev = stream
1486 .clone_htod(&ptrs)
1487 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1488 let mut info_dev = stream
1489 .alloc_zeros::<i32>(batch)
1490 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1491 let status = {
1492 let (ptrs_ptr, _ptrs_record) = ptrs_dev.device_ptr_mut(stream);
1493 let (info_ptr, _info_record) = info_dev.device_ptr_mut(stream);
1494 unsafe {
1497 cusolver_sys::cusolverDnDpotrfBatched(
1498 solver.cu(),
1499 cusolver_sys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
1500 p_i,
1501 ptrs_ptr as *mut *mut f64,
1502 p_i,
1503 info_ptr as *mut i32,
1504 batch_i,
1505 )
1506 }
1507 };
1508 if status != cusolver_sys::cusolverStatus_t::CUSOLVER_STATUS_SUCCESS {
1509 return Err(ArrowSchurGpuFailure::Unavailable);
1510 }
1511 stream
1512 .clone_dtoh(&info_dev)
1513 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
1514 }
1515
1516 fn potrf_single(
1517 solver: &DnHandle,
1518 stream: &Arc<CudaStream>,
1519 p: usize,
1520 matrix: &mut CudaSlice<f64>,
1521 ) -> Result<i32, ArrowSchurGpuFailure> {
1522 let p_i = to_i32(p).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1523 let uplo = cusolver_sys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER;
1524 let mut lwork = 0_i32;
1525 {
1526 let (mat_ptr, _rec) = matrix.device_ptr_mut(stream);
1527 let status = unsafe {
1529 cusolver_sys::cusolverDnDpotrf_bufferSize(
1530 solver.cu(),
1531 uplo,
1532 p_i,
1533 mat_ptr as *mut f64,
1534 p_i,
1535 &mut lwork,
1536 )
1537 };
1538 if status != cusolver_sys::cusolverStatus_t::CUSOLVER_STATUS_SUCCESS {
1539 return Err(ArrowSchurGpuFailure::Unavailable);
1540 }
1541 }
1542 let lwork_usize = usize::try_from(lwork).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1543 let mut workspace = stream
1544 .alloc_zeros::<f64>(lwork_usize.max(1))
1545 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1546 let mut info_dev = stream
1547 .alloc_zeros::<i32>(1)
1548 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1549 {
1550 let (mat_ptr, _rec) = matrix.device_ptr_mut(stream);
1551 let (work_ptr, _wrec) = workspace.device_ptr_mut(stream);
1552 let (info_ptr, _irec) = info_dev.device_ptr_mut(stream);
1553 let status = unsafe {
1555 cusolver_sys::cusolverDnDpotrf(
1556 solver.cu(),
1557 uplo,
1558 p_i,
1559 mat_ptr as *mut f64,
1560 p_i,
1561 work_ptr as *mut f64,
1562 lwork,
1563 info_ptr as *mut i32,
1564 )
1565 };
1566 if status != cusolver_sys::cusolverStatus_t::CUSOLVER_STATUS_SUCCESS {
1567 return Err(ArrowSchurGpuFailure::Unavailable);
1568 }
1569 }
1570 let info_host = stream
1571 .clone_dtoh(&info_dev)
1572 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1573 Ok(info_host[0])
1574 }
1575
1576 fn trsm_batched_lower_inplace(
1580 blas: &CudaBlas,
1581 stream: &Arc<CudaStream>,
1582 d: usize,
1583 n: usize,
1584 nrhs: usize,
1585 l_stack: &CudaSlice<f64>,
1586 rhs_stack: &mut CudaSlice<f64>,
1587 ) -> Result<(), ArrowSchurGpuFailure> {
1588 trsm_batched_inplace_inner(blas, stream, d, n, nrhs, l_stack, rhs_stack, false)
1589 }
1590
1591 fn trsm_batched_lower_inplace_transposed(
1593 blas: &CudaBlas,
1594 stream: &Arc<CudaStream>,
1595 d: usize,
1596 n: usize,
1597 nrhs: usize,
1598 l_stack: &CudaSlice<f64>,
1599 rhs_stack: &mut CudaSlice<f64>,
1600 ) -> Result<(), ArrowSchurGpuFailure> {
1601 trsm_batched_inplace_inner(blas, stream, d, n, nrhs, l_stack, rhs_stack, true)
1602 }
1603
1604 fn trsm_batched_inplace_inner(
1605 blas: &CudaBlas,
1606 stream: &Arc<CudaStream>,
1607 d: usize,
1608 n: usize,
1609 nrhs: usize,
1610 l_stack: &CudaSlice<f64>,
1611 rhs_stack: &mut CudaSlice<f64>,
1612 transposed: bool,
1613 ) -> Result<(), ArrowSchurGpuFailure> {
1614 let alpha = 1.0_f64;
1615 let d_i = to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1616 let nrhs_i = to_i32(nrhs).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1617 let batch_i = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1618 let l_bytes_per = (d * d * std::mem::size_of::<f64>()) as u64;
1619 let rhs_bytes_per = (d * nrhs * std::mem::size_of::<f64>()) as u64;
1620 let (l_base, _l_record) = l_stack.device_ptr(stream);
1621 let (rhs_base, _rhs_record) = rhs_stack.device_ptr_mut(stream);
1622 let mut l_ptrs = Vec::with_capacity(n);
1623 let mut rhs_ptrs = Vec::with_capacity(n);
1624 for i in 0..n {
1625 l_ptrs.push(l_base + (i as u64) * l_bytes_per);
1626 rhs_ptrs.push(rhs_base + (i as u64) * rhs_bytes_per);
1627 }
1628 let mut l_ptrs_dev = stream
1629 .clone_htod(&l_ptrs)
1630 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1631 let mut rhs_ptrs_dev = stream
1632 .clone_htod(&rhs_ptrs)
1633 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1634 let (l_ptrs_ptr, _l_ptrs_rec) = l_ptrs_dev.device_ptr_mut(stream);
1635 let (rhs_ptrs_ptr, _rhs_ptrs_rec) = rhs_ptrs_dev.device_ptr_mut(stream);
1636 let op = if transposed {
1637 cublasOperation_t::CUBLAS_OP_T
1638 } else {
1639 cublasOperation_t::CUBLAS_OP_N
1640 };
1641 let handle = *blas.handle();
1642 let status = unsafe {
1645 cudarc::cublas::sys::cublasDtrsmBatched(
1646 handle,
1647 cublasSideMode_t::CUBLAS_SIDE_LEFT,
1648 cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
1649 op,
1650 cublasDiagType_t::CUBLAS_DIAG_NON_UNIT,
1651 d_i,
1652 nrhs_i,
1653 &alpha,
1654 l_ptrs_ptr as *const *const f64,
1655 d_i,
1656 rhs_ptrs_ptr as *const *mut f64,
1657 d_i,
1658 batch_i,
1659 )
1660 };
1661 if status != cublasStatus_t::CUBLAS_STATUS_SUCCESS {
1662 return Err(ArrowSchurGpuFailure::Unavailable);
1663 }
1664 Ok(())
1665 }
1666
1667 fn trsm_single(
1670 blas: &CudaBlas,
1671 stream: &Arc<CudaStream>,
1672 n: usize,
1673 l: &CudaSlice<f64>,
1674 rhs: &mut CudaSlice<f64>,
1675 upper: bool,
1676 transposed: bool,
1677 ) -> Result<(), ArrowSchurGpuFailure> {
1678 let alpha = 1.0_f64;
1679 let n_i = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
1680 let handle = *blas.handle();
1681 let (l_ptr, _l_rec) = l.device_ptr(stream);
1682 let (rhs_ptr, _rhs_rec) = rhs.device_ptr_mut(stream);
1683 let status = unsafe {
1685 cudarc::cublas::sys::cublasDtrsm_v2(
1686 handle,
1687 cublasSideMode_t::CUBLAS_SIDE_LEFT,
1688 if upper {
1689 cublasFillMode_t::CUBLAS_FILL_MODE_UPPER
1690 } else {
1691 cublasFillMode_t::CUBLAS_FILL_MODE_LOWER
1692 },
1693 if transposed {
1694 cublasOperation_t::CUBLAS_OP_T
1695 } else {
1696 cublasOperation_t::CUBLAS_OP_N
1697 },
1698 cublasDiagType_t::CUBLAS_DIAG_NON_UNIT,
1699 n_i,
1700 1,
1701 &alpha,
1702 l_ptr as *const f64,
1703 n_i,
1704 rhs_ptr as *mut f64,
1705 n_i,
1706 )
1707 };
1708 if status != cublasStatus_t::CUBLAS_STATUS_SUCCESS {
1709 return Err(ArrowSchurGpuFailure::Unavailable);
1710 }
1711 Ok(())
1712 }
1713
1714 fn accumulate_schur(
1718 blas: &CudaBlas,
1719 d: usize,
1720 k: usize,
1721 n: usize,
1722 y_stack: &CudaSlice<f64>,
1723 u_stack: &CudaSlice<f64>,
1724 schur: &mut CudaSlice<f64>,
1725 rhs: &mut CudaSlice<f64>,
1726 ) -> Result<(), ArrowSchurGpuFailure> {
1727 let y_block_elems = d * k;
1728 let u_block_elems = d;
1729 for i in 0..n {
1730 let y_slice = y_stack.slice(i * y_block_elems..(i + 1) * y_block_elems);
1731 let u_slice = u_stack.slice(i * u_block_elems..(i + 1) * u_block_elems);
1732 let gemm_cfg = GemmConfig::<f64> {
1734 transa: cublasOperation_t::CUBLAS_OP_T,
1735 transb: cublasOperation_t::CUBLAS_OP_N,
1736 m: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1737 n: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1738 k: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1739 alpha: -1.0,
1740 lda: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1741 ldb: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1742 beta: 1.0,
1743 ldc: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1744 };
1745 unsafe { blas.gemm(gemm_cfg, &y_slice, &y_slice, schur) }
1747 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1748 let gemv_cfg = GemvConfig::<f64> {
1750 trans: cublasOperation_t::CUBLAS_OP_T,
1751 m: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1752 n: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1753 alpha: 1.0,
1754 lda: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1755 incx: 1,
1756 beta: 1.0,
1757 incy: 1,
1758 };
1759 unsafe { blas.gemv(gemv_cfg, &y_slice, &u_slice, rhs) }
1762 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1763 }
1764 Ok(())
1765 }
1766
1767 fn accumulate_schur_rhs_only(
1775 blas: &CudaBlas,
1776 d: usize,
1777 k: usize,
1778 n: usize,
1779 y_stack: &CudaSlice<f64>,
1780 u_stack: &CudaSlice<f64>,
1781 rhs: &mut CudaSlice<f64>,
1782 ) -> Result<(), ArrowSchurGpuFailure> {
1783 let y_block_elems = d * k;
1784 let u_block_elems = d;
1785 for i in 0..n {
1786 let y_slice = y_stack.slice(i * y_block_elems..(i + 1) * y_block_elems);
1787 let u_slice = u_stack.slice(i * u_block_elems..(i + 1) * u_block_elems);
1788 let gemv_cfg = GemvConfig::<f64> {
1789 trans: cublasOperation_t::CUBLAS_OP_T,
1790 m: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1791 n: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1792 alpha: 1.0,
1793 lda: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1794 incx: 1,
1795 beta: 1.0,
1796 incy: 1,
1797 };
1798 unsafe { blas.gemv(gemv_cfg, &y_slice, &u_slice, rhs) }
1801 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1802 }
1803 Ok(())
1804 }
1805
1806 fn accumulate_back_sub_rhs(
1809 blas: &CudaBlas,
1810 d: usize,
1811 k: usize,
1812 n: usize,
1813 y_stack: &CudaSlice<f64>,
1814 delta_beta: &CudaSlice<f64>,
1815 u_stack: &mut CudaSlice<f64>,
1816 ) -> Result<(), ArrowSchurGpuFailure> {
1817 let y_block_elems = d * k;
1818 let u_block_elems = d;
1819 for i in 0..n {
1820 let y_slice = y_stack.slice(i * y_block_elems..(i + 1) * y_block_elems);
1821 let mut u_slice = u_stack.slice_mut(i * u_block_elems..(i + 1) * u_block_elems);
1822 let gemv_cfg = GemvConfig::<f64> {
1823 trans: cublasOperation_t::CUBLAS_OP_N,
1824 m: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1825 n: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1826 alpha: 1.0,
1827 lda: to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?,
1828 incx: 1,
1829 beta: 1.0,
1830 incy: 1,
1831 };
1832 unsafe { blas.gemv(gemv_cfg, &y_slice, delta_beta, &mut u_slice) }
1835 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1836 }
1837 Ok(())
1838 }
1839
1840 use std::collections::HashMap;
1856 use std::sync::Mutex;
1857
1858 struct FusedModuleCache {
1863 modules: Mutex<
1864 HashMap<crate::gpu_kernels::arrow_schur_nvrtc::FusedModuleCacheKey, Arc<CudaModule>>,
1865 >,
1866 }
1867
1868 fn fused_module_cache() -> &'static FusedModuleCache {
1869 static CACHE: OnceLock<FusedModuleCache> = OnceLock::new();
1870 CACHE.get_or_init(|| FusedModuleCache {
1871 modules: Mutex::new(HashMap::new()),
1872 })
1873 }
1874
1875 fn fused_module_for(
1876 ctx: &Arc<CudaContext>,
1877 key: crate::gpu_kernels::arrow_schur_nvrtc::FusedModuleCacheKey,
1878 ) -> Result<Arc<CudaModule>, ArrowSchurGpuFailure> {
1879 let cache = fused_module_cache();
1880 if let Ok(guard) = cache.modules.lock() {
1881 if let Some(existing) = guard.get(&key) {
1882 return Ok(existing.clone());
1883 }
1884 }
1885 let src = crate::gpu_kernels::arrow_schur_nvrtc::forward_kernel_source(
1886 key.p_max as usize,
1887 key.r_template as usize,
1888 );
1889 let ptx = cudarc::nvrtc::compile_ptx(&src).map_err(|err| {
1890 ArrowSchurGpuFailure::SchurFactorFailed {
1891 reason: format!(
1892 "arrow-schur fused NVRTC compile (p_max={}, r={}): {err}",
1893 key.p_max, key.r_template
1894 ),
1895 }
1896 })?;
1897 let module = ctx
1898 .load_module(ptx)
1899 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
1900 if let Ok(mut guard) = cache.modules.lock() {
1901 guard.entry(key).or_insert_with(|| module.clone());
1902 }
1903 Ok(module)
1904 }
1905
1906 const PCG_VECTOR_KERNEL_SOURCE: &str = r#"
1907extern "C" __global__ void arrow_pcg_jacobi_mul(
1908 const double* __restrict__ inv_diag,
1909 const double* __restrict__ r,
1910 double* __restrict__ z,
1911 int n
1912) {
1913 int idx = blockIdx.x * blockDim.x + threadIdx.x;
1914 if (idx < n) {
1915 z[idx] = inv_diag[idx] * r[idx];
1916 }
1917}
1918
1919extern "C" __global__ void arrow_pcg_update_p(
1920 const double* __restrict__ z,
1921 double beta,
1922 double* __restrict__ p,
1923 int n
1924) {
1925 int idx = blockIdx.x * blockDim.x + threadIdx.x;
1926 if (idx < n) {
1927 p[idx] = z[idx] + beta * p[idx];
1928 }
1929}
1930
1931extern "C" __global__ void arrow_sae_init(
1932 double* __restrict__ out,
1933 const double* __restrict__ x,
1934 double ridge,
1935 int n
1936) {
1937 int idx = blockIdx.x * blockDim.x + threadIdx.x;
1938 if (idx < n) {
1939 out[idx] = ridge * x[idx];
1940 }
1941}
1942
1943extern "C" __global__ void arrow_sae_smooth_matvec(
1944 const double* __restrict__ x,
1945 double* __restrict__ out,
1946 const int* __restrict__ block_offsets,
1947 const int* __restrict__ block_m,
1948 const int* __restrict__ factor_ptr,
1949 const double* __restrict__ factors,
1950 int p,
1951 int n_blocks
1952) {
1953 int block_id = blockIdx.y;
1954 int linear = blockIdx.x * blockDim.x + threadIdx.x;
1955 if (block_id >= n_blocks) {
1956 return;
1957 }
1958 int m = block_m[block_id];
1959 int total = m * p;
1960 if (linear >= total) {
1961 return;
1962 }
1963 int li = linear / p;
1964 int oc = linear - li * p;
1965 int off = block_offsets[block_id];
1966 int fbase = factor_ptr[block_id];
1967 double acc = 0.0;
1968 for (int lj = 0; lj < m; ++lj) {
1969 double a = factors[fbase + li * m + lj];
1970 acc += a * x[off + lj * p + oc];
1971 }
1972 out[off + li * p + oc] += acc;
1973}
1974
1975extern "C" __global__ void arrow_sae_sparse_g_matvec(
1976 const double* __restrict__ x,
1977 double* __restrict__ out,
1978 const int* __restrict__ row_off,
1979 const int* __restrict__ col_off,
1980 const int* __restrict__ rows,
1981 const int* __restrict__ cols,
1982 const int* __restrict__ data_ptr,
1983 const double* __restrict__ data,
1984 int p,
1985 int n_blocks
1986) {
1987 int block_id = blockIdx.y;
1988 int linear = blockIdx.x * blockDim.x + threadIdx.x;
1989 if (block_id >= n_blocks) {
1990 return;
1991 }
1992 int m_i = rows[block_id];
1993 int m_j = cols[block_id];
1994 int total = m_i * p;
1995 if (linear >= total) {
1996 return;
1997 }
1998 int li = linear / p;
1999 int oc = linear - li * p;
2000 int rbase = row_off[block_id];
2001 int cbase = col_off[block_id];
2002 int dbase = data_ptr[block_id];
2003 double acc = 0.0;
2004 for (int lj = 0; lj < m_j; ++lj) {
2005 acc += data[dbase + li * m_j + lj] * x[(cbase + lj) * p + oc];
2006 }
2007 // #1017 — a row atom co-occurs with multiple column atoms, so several
2008 // concurrent (atom_i, atom_j) blocks (blockIdx.y) write the SAME output
2009 // element `out[(rbase+li)*p+oc]`. A plain `+=` races and loses updates
2010 // (silently-wrong Schur matvec); accumulate atomically. `double` atomicAdd
2011 // needs sm_60+, guaranteed by the NVRTC arch pin (#1551).
2012 atomicAdd(&out[(rbase + li) * p + oc], acc);
2013}
2014
2015extern "C" __global__ void arrow_sae_gather_u(
2016 const double* __restrict__ x,
2017 const int* __restrict__ row_ptr,
2018 const int* __restrict__ beta_base,
2019 const double* __restrict__ phi,
2020 double* __restrict__ u,
2021 int p,
2022 int n_rows
2023) {
2024 int row = blockIdx.y;
2025 int oc = blockIdx.x * blockDim.x + threadIdx.x;
2026 if (row >= n_rows || oc >= p) {
2027 return;
2028 }
2029 double acc = 0.0;
2030 int start = row_ptr[row];
2031 int end = row_ptr[row + 1];
2032 for (int e = start; e < end; ++e) {
2033 acc += phi[e] * x[beta_base[e] + oc];
2034 }
2035 u[row * p + oc] = acc;
2036}
2037
2038extern "C" __global__ void arrow_sae_apply_l(
2039 const double* __restrict__ u,
2040 const int* __restrict__ jac_ptr,
2041 const double* __restrict__ jac,
2042 double* __restrict__ w,
2043 int p,
2044 int max_q,
2045 int n_rows
2046) {
2047 int row = blockIdx.y;
2048 int c = blockIdx.x * blockDim.x + threadIdx.x;
2049 if (row >= n_rows) {
2050 return;
2051 }
2052 int jstart = jac_ptr[row];
2053 int q = (jac_ptr[row + 1] - jstart) / p;
2054 if (c >= q) {
2055 return;
2056 }
2057 double acc = 0.0;
2058 for (int oc = 0; oc < p; ++oc) {
2059 acc += jac[jstart + c * p + oc] * u[row * p + oc];
2060 }
2061 w[row * max_q + c] = acc;
2062}
2063
2064extern "C" __global__ void arrow_sae_apply_ainv(
2065 const double* __restrict__ ainv,
2066 const double* __restrict__ w,
2067 double* __restrict__ v,
2068 int max_q,
2069 int n_rows
2070) {
2071 int row = blockIdx.y;
2072 int c = blockIdx.x * blockDim.x + threadIdx.x;
2073 if (row >= n_rows || c >= max_q) {
2074 return;
2075 }
2076 double acc = 0.0;
2077 int base = row * max_q * max_q;
2078 for (int j = 0; j < max_q; ++j) {
2079 acc += ainv[base + c * max_q + j] * w[row * max_q + j];
2080 }
2081 v[row * max_q + c] = acc;
2082}
2083
2084extern "C" __global__ void arrow_sae_scatter_sub(
2085 const double* __restrict__ v,
2086 const int* __restrict__ jac_ptr,
2087 const double* __restrict__ jac,
2088 const int* __restrict__ row_ptr,
2089 const int* __restrict__ beta_base,
2090 const double* __restrict__ phi,
2091 double* __restrict__ out,
2092 int p,
2093 int max_q,
2094 int n_rows
2095) {
2096 int row = blockIdx.y;
2097 int oc = blockIdx.x * blockDim.x + threadIdx.x;
2098 if (row >= n_rows || oc >= p) {
2099 return;
2100 }
2101 int jstart = jac_ptr[row];
2102 int q = (jac_ptr[row + 1] - jstart) / p;
2103 double lt_v = 0.0;
2104 for (int c = 0; c < q; ++c) {
2105 lt_v += jac[jstart + c * p + oc] * v[row * max_q + c];
2106 }
2107 int start = row_ptr[row];
2108 int end = row_ptr[row + 1];
2109 for (int e = start; e < end; ++e) {
2110 atomicAdd(&out[beta_base[e] + oc], -phi[e] * lt_v);
2111 }
2112}
2113
2114extern "C" __global__ void arrow_sae_diag_sub(
2115 double* __restrict__ diag,
2116 const double* __restrict__ ainv,
2117 const int* __restrict__ jac_ptr,
2118 const double* __restrict__ jac,
2119 const int* __restrict__ row_ptr,
2120 const int* __restrict__ beta_base,
2121 const double* __restrict__ phi,
2122 int p,
2123 int max_q,
2124 int n_rows
2125) {
2126 int row = blockIdx.y;
2127 int oc = blockIdx.x * blockDim.x + threadIdx.x;
2128 if (row >= n_rows || oc >= p) {
2129 return;
2130 }
2131 int jstart = jac_ptr[row];
2132 int q = (jac_ptr[row + 1] - jstart) / p;
2133 int abase = row * max_q * max_q;
2134 double quad = 0.0;
2135 for (int c = 0; c < q; ++c) {
2136 double lc = jac[jstart + c * p + oc];
2137 for (int d = 0; d < q; ++d) {
2138 quad += lc * ainv[abase + c * max_q + d] * jac[jstart + d * p + oc];
2139 }
2140 }
2141 int start = row_ptr[row];
2142 int end = row_ptr[row + 1];
2143 for (int e = start; e < end; ++e) {
2144 double pe = phi[e];
2145 atomicAdd(&diag[beta_base[e] + oc], -(pe * pe) * quad);
2146 }
2147}
2148
2149/* ── #1017/#1026 frames-engaged device kernels ─────────────────────────────
2150 * The factored β border is C-space (width Σ M_k·r_k). The penalty side is the
2151 * smooth `λ S_k ⊗ I_{r_k}` (per-block right-width r_k) plus the data-fit
2152 * `G_{ij} ⊗ W_{ij}` (W = U_iᵀU_j, dense r_i×r_j). The reduced-Schur term uses
2153 * the per-row DENSE cross-block H_tβ^(i) (q_i × border_dim, row-major). */
2154
2155extern "C" __global__ void arrow_sae_frame_smooth_matvec(
2156 const double* __restrict__ x,
2157 double* __restrict__ out,
2158 const int* __restrict__ block_offsets,
2159 const int* __restrict__ block_m,
2160 const int* __restrict__ block_r,
2161 const int* __restrict__ factor_ptr,
2162 const double* __restrict__ factors,
2163 int n_blocks
2164) {
2165 int block_id = blockIdx.y;
2166 int linear = blockIdx.x * blockDim.x + threadIdx.x;
2167 if (block_id >= n_blocks) {
2168 return;
2169 }
2170 int m = block_m[block_id];
2171 int r = block_r[block_id];
2172 int total = m * r;
2173 if (linear >= total) {
2174 return;
2175 }
2176 int li = linear / r;
2177 int ib = linear - li * r;
2178 int off = block_offsets[block_id];
2179 int fbase = factor_ptr[block_id];
2180 double acc = 0.0;
2181 for (int lj = 0; lj < m; ++lj) {
2182 double a = factors[fbase + li * m + lj];
2183 acc += a * x[off + lj * r + ib];
2184 }
2185 out[off + li * r + ib] += acc;
2186}
2187
2188extern "C" __global__ void arrow_sae_frame_g_matvec(
2189 const double* __restrict__ x,
2190 double* __restrict__ out,
2191 const int* __restrict__ off_i,
2192 const int* __restrict__ off_j,
2193 const int* __restrict__ r_i,
2194 const int* __restrict__ r_j,
2195 const int* __restrict__ m_i,
2196 const int* __restrict__ m_j,
2197 const int* __restrict__ g_ptr,
2198 const double* __restrict__ g_data,
2199 const int* __restrict__ w_ptr,
2200 const double* __restrict__ w_data,
2201 int n_blocks
2202) {
2203 int block_id = blockIdx.y;
2204 int linear = blockIdx.x * blockDim.x + threadIdx.x;
2205 if (block_id >= n_blocks) {
2206 return;
2207 }
2208 int ri = r_i[block_id];
2209 int rj = r_j[block_id];
2210 int mi = m_i[block_id];
2211 int mj = m_j[block_id];
2212 int total = mi * ri;
2213 if (linear >= total) {
2214 return;
2215 }
2216 int li = linear / ri; // basis row in atom i
2217 int a = linear - li * ri; // frame coord in atom i
2218 int oi = off_i[block_id];
2219 int oj = off_j[block_id];
2220 int gbase = g_ptr[block_id];
2221 int wbase = w_ptr[block_id];
2222 double acc = 0.0;
2223 for (int lj = 0; lj < mj; ++lj) {
2224 double g = g_data[gbase + li * mj + lj];
2225 if (g == 0.0) { continue; }
2226 int xj_base = oj + lj * rj;
2227 double inner = 0.0;
2228 for (int b = 0; b < rj; ++b) {
2229 inner += w_data[wbase + a * rj + b] * x[xj_base + b];
2230 }
2231 acc += g * inner;
2232 }
2233 // #1017 — same race as `arrow_sae_sparse_g_matvec`: atom i is the row atom of
2234 // multiple co-occurring (i,j) frame blocks running concurrently on
2235 // blockIdx.y, all targeting `out[oi+li*ri+a]`. Accumulate atomically so the
2236 // framed G⊗W matvec is correct (the CPU oracle sums these sequentially).
2237 atomicAdd(&out[oi + li * ri + a], acc);
2238}
2239
2240/* Per-row reduced-Schur subtraction with a DENSE cross-block H_tβ^(i).
2241 * h_i = H_tβ^(i) · x (length q_i)
2242 * s_i = (H_tt^(i)+ρ_t I)⁻¹ h_i (apply cached ainv, length q_i)
2243 * out -= (H_tβ^(i))ᵀ · s_i (scatter into border_dim)
2244 * `htb` is row-major (q_i × k) flattened, `htb_ptr` gives each row's base and
2245 * (htb_ptr[row+1]-htb_ptr[row])/k == q_i. `q_of` carries q_i directly. */
2246extern "C" __global__ void arrow_sae_frame_apply_h(
2247 const double* __restrict__ x,
2248 const int* __restrict__ htb_ptr,
2249 const double* __restrict__ htb,
2250 const int* __restrict__ q_of,
2251 double* __restrict__ hvec,
2252 int k,
2253 int max_q,
2254 int n_rows
2255) {
2256 int row = blockIdx.y;
2257 int c = blockIdx.x * blockDim.x + threadIdx.x;
2258 if (row >= n_rows) { return; }
2259 int q = q_of[row];
2260 if (c >= q) { return; }
2261 int base = htb_ptr[row] + c * k;
2262 double acc = 0.0;
2263 for (int a = 0; a < k; ++a) {
2264 acc += htb[base + a] * x[a];
2265 }
2266 hvec[row * max_q + c] = acc;
2267}
2268
2269extern "C" __global__ void arrow_sae_frame_apply_ainv(
2270 const double* __restrict__ ainv,
2271 const double* __restrict__ hvec,
2272 const int* __restrict__ q_of,
2273 double* __restrict__ svec,
2274 int max_q,
2275 int n_rows
2276) {
2277 int row = blockIdx.y;
2278 int c = blockIdx.x * blockDim.x + threadIdx.x;
2279 if (row >= n_rows || c >= max_q) { return; }
2280 int q = q_of[row];
2281 double acc = 0.0;
2282 int abase = row * max_q * max_q;
2283 for (int j = 0; j < q; ++j) {
2284 acc += ainv[abase + c * max_q + j] * hvec[row * max_q + j];
2285 }
2286 svec[row * max_q + c] = acc;
2287}
2288
2289extern "C" __global__ void arrow_sae_frame_scatter_h(
2290 const double* __restrict__ svec,
2291 const int* __restrict__ htb_ptr,
2292 const double* __restrict__ htb,
2293 const int* __restrict__ q_of,
2294 double* __restrict__ out,
2295 int k,
2296 int max_q,
2297 int n_rows
2298) {
2299 int row = blockIdx.y;
2300 int a = blockIdx.x * blockDim.x + threadIdx.x;
2301 if (row >= n_rows || a >= k) { return; }
2302 int q = q_of[row];
2303 int hbase = htb_ptr[row];
2304 double acc = 0.0;
2305 for (int c = 0; c < q; ++c) {
2306 acc += htb[hbase + c * k + a] * svec[row * max_q + c];
2307 }
2308 atomicAdd(&out[a], -acc);
2309}
2310
2311/* Frame Jacobi diagonal subtraction: diag[a] -= Σ_c Σ_d H_tβ[c,a]·ainv[c,d]·H_tβ[d,a]. */
2312extern "C" __global__ void arrow_sae_frame_diag_sub(
2313 double* __restrict__ diag,
2314 const double* __restrict__ ainv,
2315 const int* __restrict__ htb_ptr,
2316 const double* __restrict__ htb,
2317 const int* __restrict__ q_of,
2318 int k,
2319 int max_q,
2320 int n_rows
2321) {
2322 int row = blockIdx.y;
2323 int a = blockIdx.x * blockDim.x + threadIdx.x;
2324 if (row >= n_rows || a >= k) { return; }
2325 int q = q_of[row];
2326 int hbase = htb_ptr[row];
2327 int abase = row * max_q * max_q;
2328 double quad = 0.0;
2329 for (int c = 0; c < q; ++c) {
2330 double hc = htb[hbase + c * k + a];
2331 for (int d = 0; d < q; ++d) {
2332 quad += hc * ainv[abase + c * max_q + d] * htb[hbase + d * k + a];
2333 }
2334 }
2335 atomicAdd(&diag[a], -quad);
2336}
2337"#;
2338
2339 fn pcg_vector_module(
2340 ctx: &Arc<CudaContext>,
2341 ) -> Result<&'static Arc<CudaModule>, ArrowSchurGpuFailure> {
2342 static CACHE: gam_gpu::device_cache::PtxModuleCache =
2343 gam_gpu::device_cache::PtxModuleCache::new();
2344 CACHE
2345 .get_or_compile(ctx, "arrow_pcg_vector", PCG_VECTOR_KERNEL_SOURCE)
2346 .map_err(|err| {
2347 log::warn!("[#1551] pcg_vector_module get_or_compile failed: {err}");
2353 ArrowSchurGpuFailure::Unavailable
2354 })
2355 }
2356
2357 fn pcg_launch_config(n: usize) -> Result<LaunchConfig, ArrowSchurGpuFailure> {
2358 let threads = 256u32;
2359 let blocks = ((n as u32).saturating_add(threads - 1) / threads).max(1);
2360 Ok(LaunchConfig {
2361 grid_dim: (blocks, 1, 1),
2362 block_dim: (threads, 1, 1),
2363 shared_mem_bytes: 0,
2364 })
2365 }
2366
2367 fn launch_jacobi_mul(
2368 stream: &Arc<CudaStream>,
2369 module: &Arc<CudaModule>,
2370 inv_diag: &CudaSlice<f64>,
2371 r: &CudaSlice<f64>,
2372 z: &mut CudaSlice<f64>,
2373 n: usize,
2374 ) -> Result<(), ArrowSchurGpuFailure> {
2375 let kernel = module
2376 .load_function("arrow_pcg_jacobi_mul")
2377 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2378 let n_i32 = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
2379 let mut builder = stream.launch_builder(&kernel);
2380 builder.arg(inv_diag).arg(r).arg(z).arg(&n_i32);
2381 unsafe { builder.launch(pcg_launch_config(n)?) }
2384 .map(drop)
2385 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
2386 }
2387
2388 fn launch_update_p(
2389 stream: &Arc<CudaStream>,
2390 module: &Arc<CudaModule>,
2391 z: &CudaSlice<f64>,
2392 beta: f64,
2393 p: &mut CudaSlice<f64>,
2394 n: usize,
2395 ) -> Result<(), ArrowSchurGpuFailure> {
2396 let kernel = module
2397 .load_function("arrow_pcg_update_p")
2398 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2399 let n_i32 = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
2400 let mut builder = stream.launch_builder(&kernel);
2401 builder.arg(z).arg(&beta).arg(p).arg(&n_i32);
2402 unsafe { builder.launch(pcg_launch_config(n)?) }
2405 .map(drop)
2406 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
2407 }
2408
2409 struct DeviceSaePcgBuffers {
2410 row_ptr: CudaSlice<i32>,
2411 beta_base: CudaSlice<i32>,
2412 phi: CudaSlice<f64>,
2413 jac_ptr: CudaSlice<i32>,
2414 jac: CudaSlice<f64>,
2415 smooth_offsets: CudaSlice<i32>,
2416 smooth_m: CudaSlice<i32>,
2417 smooth_ptr: CudaSlice<i32>,
2418 smooth_data: CudaSlice<f64>,
2419 g_row_off: CudaSlice<i32>,
2420 g_col_off: CudaSlice<i32>,
2421 g_rows: CudaSlice<i32>,
2422 g_cols: CudaSlice<i32>,
2423 g_ptr: CudaSlice<i32>,
2424 g_data: CudaSlice<f64>,
2425 ainv: CudaSlice<f64>,
2426 u: CudaSlice<f64>,
2427 w: CudaSlice<f64>,
2428 v: CudaSlice<f64>,
2429 n_rows: usize,
2430 p: usize,
2431 k: usize,
2432 max_q: usize,
2433 smooth_blocks: usize,
2434 g_blocks: usize,
2435 }
2436
2437 fn checked_i32(value: usize) -> Result<i32, ArrowSchurGpuFailure> {
2438 to_i32(value).ok_or(ArrowSchurGpuFailure::Unavailable)
2439 }
2440
2441 fn sae_penalty_diag_host(
2442 data: &DeviceSaePcgData,
2443 ridge_beta: f64,
2444 ) -> Result<Vec<f64>, ArrowSchurGpuFailure> {
2445 let mut diag = vec![ridge_beta; data.beta_dim];
2446 for block in &data.smooth_blocks {
2447 let (rows, cols) = block.factor_a.dim();
2448 if rows != cols {
2449 return Err(ArrowSchurGpuFailure::Unavailable);
2450 }
2451 for row in 0..rows {
2452 let coeff = block.factor_a[[row, row]];
2453 let base = block
2454 .global_offset
2455 .checked_add(
2456 row.checked_mul(data.p)
2457 .ok_or(ArrowSchurGpuFailure::Unavailable)?,
2458 )
2459 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
2460 let end = base
2461 .checked_add(data.p)
2462 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
2463 if end > diag.len() {
2464 return Err(ArrowSchurGpuFailure::Unavailable);
2465 }
2466 for channel in 0..data.p {
2467 diag[base + channel] += coeff;
2468 }
2469 }
2470 }
2471 for block in &data.sparse_g_blocks {
2472 if block.row_off != block.col_off {
2473 continue;
2474 }
2475 let (rows, cols) = block.data.dim();
2476 for row in 0..rows.min(cols) {
2477 let coeff = block.data[[row, row]];
2478 let beta_row = block
2479 .row_off
2480 .checked_add(row)
2481 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
2482 let base = beta_row
2483 .checked_mul(data.p)
2484 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
2485 let end = base
2486 .checked_add(data.p)
2487 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
2488 if end > diag.len() {
2489 return Err(ArrowSchurGpuFailure::Unavailable);
2490 }
2491 for channel in 0..data.p {
2492 diag[base + channel] += coeff;
2493 }
2494 }
2495 }
2496 Ok(diag)
2497 }
2498
2499 fn flatten_device_sae_data(
2500 sys: &ArrowSchurSystem,
2501 data: &DeviceSaePcgData,
2502 ridge_t: f64,
2503 stream: &Arc<CudaStream>,
2504 ) -> Result<DeviceSaePcgBuffers, ArrowSchurGpuFailure> {
2505 let n_rows = sys.rows.len();
2506 let p = data.p;
2507 let k = data.beta_dim;
2508 if data.a_phi.len() != n_rows || data.local_jac.len() != n_rows {
2509 return Err(ArrowSchurGpuFailure::Unavailable);
2510 }
2511
2512 let mut row_ptr_host = Vec::with_capacity(n_rows + 1);
2513 let mut beta_base_host = Vec::<i32>::new();
2514 let mut phi_host = Vec::<f64>::new();
2515 row_ptr_host.push(0_i32);
2516 for row in data.a_phi.iter() {
2517 for &(base, phi) in row {
2518 beta_base_host.push(checked_i32(base)?);
2519 phi_host.push(phi);
2520 }
2521 row_ptr_host.push(checked_i32(beta_base_host.len())?);
2522 }
2523
2524 let mut jac_ptr_host = Vec::with_capacity(n_rows + 1);
2525 let mut jac_host = Vec::<f64>::new();
2526 let mut max_q = 0usize;
2527 jac_ptr_host.push(0_i32);
2528 for row_jac in data.local_jac.iter() {
2529 if row_jac.len() % p != 0 {
2530 return Err(ArrowSchurGpuFailure::Unavailable);
2531 }
2532 max_q = max_q.max(row_jac.len() / p);
2533 jac_host.extend_from_slice(row_jac);
2534 jac_ptr_host.push(checked_i32(jac_host.len())?);
2535 }
2536 if max_q == 0 {
2537 return Err(ArrowSchurGpuFailure::Unavailable);
2538 }
2539
2540 let mut smooth_offsets_host = Vec::with_capacity(data.smooth_blocks.len());
2541 let mut smooth_m_host = Vec::with_capacity(data.smooth_blocks.len());
2542 let mut smooth_ptr_host = Vec::with_capacity(data.smooth_blocks.len() + 1);
2543 let mut smooth_data_host = Vec::<f64>::new();
2544 smooth_ptr_host.push(0_i32);
2545 for block in &data.smooth_blocks {
2546 let (rows, cols) = block.factor_a.dim();
2547 if rows != cols {
2548 return Err(ArrowSchurGpuFailure::Unavailable);
2549 }
2550 smooth_offsets_host.push(checked_i32(block.global_offset)?);
2551 smooth_m_host.push(checked_i32(rows)?);
2552 for r in 0..rows {
2553 for c in 0..cols {
2554 smooth_data_host.push(block.factor_a[[r, c]]);
2555 }
2556 }
2557 smooth_ptr_host.push(checked_i32(smooth_data_host.len())?);
2558 }
2559
2560 let mut g_row_off_host = Vec::with_capacity(data.sparse_g_blocks.len());
2561 let mut g_col_off_host = Vec::with_capacity(data.sparse_g_blocks.len());
2562 let mut g_rows_host = Vec::with_capacity(data.sparse_g_blocks.len());
2563 let mut g_cols_host = Vec::with_capacity(data.sparse_g_blocks.len());
2564 let mut g_ptr_host = Vec::with_capacity(data.sparse_g_blocks.len() + 1);
2565 let mut g_data_host = Vec::<f64>::new();
2566 g_ptr_host.push(0_i32);
2567 for block in &data.sparse_g_blocks {
2568 let (rows, cols) = block.data.dim();
2569 g_row_off_host.push(checked_i32(block.row_off)?);
2570 g_col_off_host.push(checked_i32(block.col_off)?);
2571 g_rows_host.push(checked_i32(rows)?);
2572 g_cols_host.push(checked_i32(cols)?);
2573 for r in 0..rows {
2574 for c in 0..cols {
2575 g_data_host.push(block.data[[r, c]]);
2576 }
2577 }
2578 g_ptr_host.push(checked_i32(g_data_host.len())?);
2579 }
2580
2581 let mut ainv_host = vec![0.0_f64; n_rows * max_q * max_q];
2582 for (row_idx, row) in sys.rows.iter().enumerate() {
2583 let q = data.local_jac[row_idx].len() / p;
2584 if row.htt.dim() != (q, q) {
2585 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
2586 reason: format!(
2587 "SAE device PCG row {row_idx}: H_tt shape {:?} != ({q}, {q})",
2588 row.htt.dim()
2589 ),
2590 });
2591 }
2592 let mut block = row.htt.clone();
2593 for d in 0..q {
2594 block[[d, d]] += ridge_t;
2595 }
2596 let factor = gam_linalg::triangular::cholesky_factor_in_place(
2597 block.view(),
2598 gam_linalg::triangular::CholeskyGuard::NonnegativePivot,
2599 )
2600 .ok_or_else(|| {
2601 let scale = row
2602 .htt
2603 .diag()
2604 .iter()
2605 .map(|v| v.abs())
2606 .fold(0.0_f64, f64::max)
2607 .max(1.0);
2608 ArrowSchurGpuFailure::RidgeBumpRequired {
2609 row: row_idx,
2610 bump: scale * f64::EPSILON.sqrt() * super::RIDGE_BUMP_EPS_MARGIN,
2611 }
2612 })?;
2613 for col in 0..q {
2614 let mut e = Array1::<f64>::zeros(q);
2615 e[col] = 1.0;
2616 let solved =
2617 gam_linalg::triangular::cholesky_solve_vector(factor.view(), e.view());
2618 for r in 0..q {
2619 ainv_host[row_idx * max_q * max_q + r * max_q + col] = solved[r];
2620 }
2621 }
2622 }
2623
2624 Ok(DeviceSaePcgBuffers {
2625 row_ptr: stream
2626 .clone_htod(&row_ptr_host)
2627 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2628 beta_base: stream
2629 .clone_htod(&beta_base_host)
2630 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2631 phi: stream
2632 .clone_htod(&phi_host)
2633 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2634 jac_ptr: stream
2635 .clone_htod(&jac_ptr_host)
2636 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2637 jac: stream
2638 .clone_htod(&jac_host)
2639 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2640 smooth_offsets: stream
2641 .clone_htod(&smooth_offsets_host)
2642 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2643 smooth_m: stream
2644 .clone_htod(&smooth_m_host)
2645 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2646 smooth_ptr: stream
2647 .clone_htod(&smooth_ptr_host)
2648 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2649 smooth_data: stream
2650 .clone_htod(&smooth_data_host)
2651 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2652 g_row_off: stream
2653 .clone_htod(&g_row_off_host)
2654 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2655 g_col_off: stream
2656 .clone_htod(&g_col_off_host)
2657 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2658 g_rows: stream
2659 .clone_htod(&g_rows_host)
2660 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2661 g_cols: stream
2662 .clone_htod(&g_cols_host)
2663 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2664 g_ptr: stream
2665 .clone_htod(&g_ptr_host)
2666 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2667 g_data: stream
2668 .clone_htod(&g_data_host)
2669 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2670 ainv: stream
2671 .clone_htod(&ainv_host)
2672 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2673 u: stream
2674 .alloc_zeros::<f64>(n_rows * p)
2675 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2676 w: stream
2677 .alloc_zeros::<f64>(n_rows * max_q)
2678 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2679 v: stream
2680 .alloc_zeros::<f64>(n_rows * max_q)
2681 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
2682 n_rows,
2683 p,
2684 k,
2685 max_q,
2686 smooth_blocks: data.smooth_blocks.len(),
2687 g_blocks: data.sparse_g_blocks.len(),
2688 })
2689 }
2690
2691 fn launch_sae_init(
2692 stream: &Arc<CudaStream>,
2693 module: &Arc<CudaModule>,
2694 out: &mut CudaSlice<f64>,
2695 x: &CudaSlice<f64>,
2696 ridge: f64,
2697 n: usize,
2698 ) -> Result<(), ArrowSchurGpuFailure> {
2699 let kernel = module
2700 .load_function("arrow_sae_init")
2701 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2702 let n_i32 = checked_i32(n)?;
2703 let mut builder = stream.launch_builder(&kernel);
2704 builder.arg(out).arg(x).arg(&ridge).arg(&n_i32);
2705 unsafe { builder.launch(pcg_launch_config(n)?) }
2709 .map(drop)
2710 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
2711 }
2712
2713 fn launch_sae_penalty_matvec(
2714 stream: &Arc<CudaStream>,
2715 module: &Arc<CudaModule>,
2716 buffers: &mut DeviceSaePcgBuffers,
2717 x: &CudaSlice<f64>,
2718 out: &mut CudaSlice<f64>,
2719 ridge_beta: f64,
2720 ) -> Result<(), ArrowSchurGpuFailure> {
2721 launch_sae_init(stream, module, out, x, ridge_beta, buffers.k)?;
2722 if buffers.smooth_blocks > 0 {
2723 let kernel = module
2724 .load_function("arrow_sae_smooth_matvec")
2725 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2726 let max_m = buffers.k;
2727 let p_i32 = checked_i32(buffers.p)?;
2728 let blocks_i32 = checked_i32(buffers.smooth_blocks)?;
2729 let cfg = LaunchConfig {
2730 grid_dim: (
2731 ((max_m as u32).saturating_add(255) / 256).max(1),
2732 checked_i32(buffers.smooth_blocks)? as u32,
2733 1,
2734 ),
2735 block_dim: (256, 1, 1),
2736 shared_mem_bytes: 0,
2737 };
2738 let mut builder = stream.launch_builder(&kernel);
2739 builder
2740 .arg(x)
2741 .arg(&mut *out)
2742 .arg(&buffers.smooth_offsets)
2743 .arg(&buffers.smooth_m)
2744 .arg(&buffers.smooth_ptr)
2745 .arg(&buffers.smooth_data)
2746 .arg(&p_i32)
2747 .arg(&blocks_i32);
2748 unsafe { builder.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2753 }
2754 if buffers.g_blocks > 0 {
2755 let kernel = module
2756 .load_function("arrow_sae_sparse_g_matvec")
2757 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2758 let max_work = buffers
2759 .k
2760 .checked_div(buffers.p)
2761 .unwrap_or(0)
2762 .saturating_mul(buffers.p);
2763 let p_i32 = checked_i32(buffers.p)?;
2764 let blocks_i32 = checked_i32(buffers.g_blocks)?;
2765 let cfg = LaunchConfig {
2766 grid_dim: (
2767 ((max_work as u32).saturating_add(255) / 256).max(1),
2768 checked_i32(buffers.g_blocks)? as u32,
2769 1,
2770 ),
2771 block_dim: (256, 1, 1),
2772 shared_mem_bytes: 0,
2773 };
2774 let mut builder = stream.launch_builder(&kernel);
2775 builder
2776 .arg(x)
2777 .arg(&mut *out)
2778 .arg(&buffers.g_row_off)
2779 .arg(&buffers.g_col_off)
2780 .arg(&buffers.g_rows)
2781 .arg(&buffers.g_cols)
2782 .arg(&buffers.g_ptr)
2783 .arg(&buffers.g_data)
2784 .arg(&p_i32)
2785 .arg(&blocks_i32);
2786 unsafe { builder.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2791 }
2792 Ok(())
2793 }
2794
2795 fn launch_sae_row_schur_sub(
2796 stream: &Arc<CudaStream>,
2797 module: &Arc<CudaModule>,
2798 buffers: &mut DeviceSaePcgBuffers,
2799 x: &CudaSlice<f64>,
2800 out: &mut CudaSlice<f64>,
2801 ) -> Result<(), ArrowSchurGpuFailure> {
2802 let p_i32 = checked_i32(buffers.p)?;
2803 let max_q_i32 = checked_i32(buffers.max_q)?;
2804 let n_rows_i32 = checked_i32(buffers.n_rows)?;
2805 let cfg_p_rows = LaunchConfig {
2806 grid_dim: (
2807 ((buffers.p as u32).saturating_add(255) / 256).max(1),
2808 checked_i32(buffers.n_rows)? as u32,
2809 1,
2810 ),
2811 block_dim: (256, 1, 1),
2812 shared_mem_bytes: 0,
2813 };
2814 let gather = module
2815 .load_function("arrow_sae_gather_u")
2816 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2817 {
2818 let mut builder = stream.launch_builder(&gather);
2819 builder
2820 .arg(x)
2821 .arg(&buffers.row_ptr)
2822 .arg(&buffers.beta_base)
2823 .arg(&buffers.phi)
2824 .arg(&mut buffers.u)
2825 .arg(&p_i32)
2826 .arg(&n_rows_i32);
2827 unsafe { builder.launch(cfg_p_rows) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2831 }
2832
2833 let cfg_q_rows = LaunchConfig {
2834 grid_dim: (
2835 ((buffers.max_q as u32).saturating_add(255) / 256).max(1),
2836 checked_i32(buffers.n_rows)? as u32,
2837 1,
2838 ),
2839 block_dim: (256, 1, 1),
2840 shared_mem_bytes: 0,
2841 };
2842 let apply_l = module
2843 .load_function("arrow_sae_apply_l")
2844 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2845 {
2846 let mut builder = stream.launch_builder(&apply_l);
2847 builder
2848 .arg(&buffers.u)
2849 .arg(&buffers.jac_ptr)
2850 .arg(&buffers.jac)
2851 .arg(&mut buffers.w)
2852 .arg(&p_i32)
2853 .arg(&max_q_i32)
2854 .arg(&n_rows_i32);
2855 unsafe { builder.launch(cfg_q_rows) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2859 }
2860
2861 let apply_ainv = module
2862 .load_function("arrow_sae_apply_ainv")
2863 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2864 {
2865 let mut builder = stream.launch_builder(&apply_ainv);
2866 builder
2867 .arg(&buffers.ainv)
2868 .arg(&buffers.w)
2869 .arg(&mut buffers.v)
2870 .arg(&max_q_i32)
2871 .arg(&n_rows_i32);
2872 unsafe { builder.launch(cfg_q_rows) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2876 }
2877
2878 let scatter = module
2879 .load_function("arrow_sae_scatter_sub")
2880 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2881 {
2882 let mut builder = stream.launch_builder(&scatter);
2883 builder
2884 .arg(&buffers.v)
2885 .arg(&buffers.jac_ptr)
2886 .arg(&buffers.jac)
2887 .arg(&buffers.row_ptr)
2888 .arg(&buffers.beta_base)
2889 .arg(&buffers.phi)
2890 .arg(out)
2891 .arg(&p_i32)
2892 .arg(&max_q_i32)
2893 .arg(&n_rows_i32);
2894 unsafe { builder.launch(cfg_p_rows) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2898 }
2899 Ok(())
2900 }
2901
2902 fn launch_sae_diag_sub(
2903 stream: &Arc<CudaStream>,
2904 module: &Arc<CudaModule>,
2905 buffers: &DeviceSaePcgBuffers,
2906 diag: &mut CudaSlice<f64>,
2907 ) -> Result<(), ArrowSchurGpuFailure> {
2908 let kernel = module
2909 .load_function("arrow_sae_diag_sub")
2910 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
2911 let p_i32 = checked_i32(buffers.p)?;
2912 let max_q_i32 = checked_i32(buffers.max_q)?;
2913 let n_rows_i32 = checked_i32(buffers.n_rows)?;
2914 let cfg = LaunchConfig {
2915 grid_dim: (
2916 ((buffers.p as u32).saturating_add(255) / 256).max(1),
2917 checked_i32(buffers.n_rows)? as u32,
2918 1,
2919 ),
2920 block_dim: (256, 1, 1),
2921 shared_mem_bytes: 0,
2922 };
2923 let mut builder = stream.launch_builder(&kernel);
2924 builder
2925 .arg(diag)
2926 .arg(&buffers.ainv)
2927 .arg(&buffers.jac_ptr)
2928 .arg(&buffers.jac)
2929 .arg(&buffers.row_ptr)
2930 .arg(&buffers.beta_base)
2931 .arg(&buffers.phi)
2932 .arg(&p_i32)
2933 .arg(&max_q_i32)
2934 .arg(&n_rows_i32);
2935 unsafe { builder.launch(cfg) }
2939 .map(drop)
2940 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
2941 }
2942
2943 fn launch_sae_matvec(
2944 stream: &Arc<CudaStream>,
2945 module: &Arc<CudaModule>,
2946 buffers: &mut DeviceSaePcgBuffers,
2947 x: &CudaSlice<f64>,
2948 out: &mut CudaSlice<f64>,
2949 ridge_beta: f64,
2950 ) -> Result<(), ArrowSchurGpuFailure> {
2951 launch_sae_penalty_matvec(stream, module, buffers, x, out, ridge_beta)?;
2952 launch_sae_row_schur_sub(stream, module, buffers, x, out)
2953 }
2954
2955 fn pack_fused_host(
2960 sys: &ArrowSchurSystem,
2961 ridge_t: f64,
2962 p_max: usize,
2963 r_template: usize,
2964 ) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
2965 let n = sys.rows.len();
2966 let d = sys.d;
2967 let k = sys.k;
2968 let mut d_buf = vec![0.0_f64; n * p_max * p_max];
2969 let mut b_buf = vec![0.0_f64; n * p_max * r_template];
2970 let mut g_buf = vec![0.0_f64; n * p_max];
2971 for (i, row) in sys.rows.iter().enumerate() {
2972 for col in 0..d {
2974 let base = (i * p_max + col) * p_max;
2975 for r in 0..d {
2976 let mut value = row.htt[[r, col]];
2977 if r == col {
2978 value += ridge_t;
2979 }
2980 d_buf[base + r] = value;
2981 }
2982 }
2983 for col in 0..k {
2985 let base = (i * p_max + col) * p_max;
2986 for r in 0..d {
2987 b_buf[base + r] = row.htbeta[[r, col]];
2988 }
2989 }
2990 let g_base = i * p_max;
2992 for r in 0..d {
2993 g_buf[g_base + r] = row.gt[r];
2994 }
2995 }
2996 (d_buf, b_buf, g_buf)
2997 }
2998
2999 pub(super) struct ResidentArrowFrame {
3026 n: usize,
3027 d: usize,
3028 k: usize,
3029 stream: Arc<CudaStream>,
3030 blas: CudaBlas,
3031 l_dev: CudaSlice<f64>,
3034 y_dev: CudaSlice<f64>,
3037 schur_dev: CudaSlice<f64>,
3040 log_det_hessian: f64,
3043 }
3044
3045 impl ResidentArrowFrame {
3046 pub(super) fn new(
3050 sys: &ArrowSchurSystem,
3051 ridge_t: f64,
3052 ridge_beta: f64,
3053 ) -> Result<Self, ArrowSchurGpuFailure> {
3054 if ridge_t.is_nan() || ridge_beta.is_nan() {
3055 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
3056 reason: "ridge is NaN".to_string(),
3057 });
3058 }
3059 let n = sys.rows.len();
3060 let d = sys.d;
3061 let k = sys.k;
3062 let runtime = route_through_gpu(DispatchOp::SmallDenseBatchedPotrf { p: d, batch: n })
3063 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
3064 let stream = gam_gpu::device_runtime::cuda_context_for(runtime.device.ordinal)
3065 .and_then(|ctx| ctx.new_stream().ok())
3066 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
3067 let solver =
3068 DnHandle::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3069 let blas =
3070 CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3071
3072 let (d_host, b_host, _g_host) = pack_host(sys, ridge_t);
3074 let mut l_dev = stream
3075 .clone_htod(&d_host)
3076 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3077 let mut y_dev = stream
3078 .clone_htod(&b_host)
3079 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3080
3081 let info_host = potrf_batched(&solver, &stream, d, n, &mut l_dev)?;
3083 if let Some(idx) = info_host.iter().position(|info| *info != 0) {
3084 let pivot = info_host[idx];
3085 let scale = sys.rows[idx]
3086 .htt
3087 .diag()
3088 .iter()
3089 .map(|v| v.abs())
3090 .fold(0.0_f64, f64::max)
3091 .max(1.0);
3092 return Err(ArrowSchurGpuFailure::RidgeBumpRequired {
3093 row: idx,
3094 bump: scale * (pivot.abs() as f64).max(1.0) * f64::EPSILON.sqrt() * 1024.0,
3095 });
3096 }
3097
3098 trsm_batched_lower_inplace(&blas, &stream, d, n, k, &l_dev, &mut y_dev)?;
3100
3101 let schur_init: Vec<f64> = {
3106 let mut tmp = Vec::with_capacity(k * k);
3107 for col in 0..k {
3108 for row in 0..k {
3109 let mut v = sys.hbb[[row, col]];
3110 if row == col {
3111 v += ridge_beta;
3112 }
3113 tmp.push(v);
3114 }
3115 }
3116 tmp
3117 };
3118 let mut schur_dev = stream
3119 .clone_htod(&schur_init)
3120 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3121 let zero_u = stream
3124 .clone_htod(&vec![0.0_f64; n * d])
3125 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3126 let mut throwaway_rhs = stream
3127 .clone_htod(&vec![0.0_f64; k])
3128 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3129 accumulate_schur(
3130 &blas,
3131 d,
3132 k,
3133 n,
3134 &y_dev,
3135 &zero_u,
3136 &mut schur_dev,
3137 &mut throwaway_rhs,
3138 )?;
3139 let info = potrf_single(&solver, &stream, k, &mut schur_dev)?;
3140 if info != 0 {
3141 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
3142 reason: format!("Schur Cholesky failed at pivot {info}"),
3143 });
3144 }
3145
3146 let l_local_host = stream
3148 .clone_dtoh(&l_dev)
3149 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3150 let l_schur_host = stream
3151 .clone_dtoh(&schur_dev)
3152 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3153 let mut log_det = 0.0_f64;
3154 for i in 0..n {
3155 let base = i * d * d;
3156 for j in 0..d {
3157 log_det += l_local_host[base + j * d + j].ln();
3158 }
3159 }
3160 for j in 0..k {
3161 log_det += l_schur_host[j * k + j].ln();
3162 }
3163 log_det *= 2.0;
3164
3165 Ok(Self {
3166 n,
3167 d,
3168 k,
3169 stream,
3170 blas,
3171 l_dev,
3172 y_dev,
3173 schur_dev,
3174 log_det_hessian: log_det,
3175 })
3176 }
3177
3178 #[inline]
3179 pub(super) fn log_det_hessian(&self) -> f64 {
3180 self.log_det_hessian
3181 }
3182
3183 pub(super) fn solve_gradient(
3187 &self,
3188 g_t: &[f64],
3189 g_beta: &[f64],
3190 ) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
3191 let n = self.n;
3192 let d = self.d;
3193 let k = self.k;
3194 if g_t.len() != n * d || g_beta.len() != k {
3195 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
3196 reason: format!(
3197 "resident gradient shape mismatch: g_t={} (want {}), g_beta={} (want {})",
3198 g_t.len(),
3199 n * d,
3200 g_beta.len(),
3201 k
3202 ),
3203 });
3204 }
3205 let mut u_dev = self
3207 .stream
3208 .clone_htod(&g_t.to_vec())
3209 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3210 trsm_batched_lower_inplace(&self.blas, &self.stream, d, n, 1, &self.l_dev, &mut u_dev)?;
3211
3212 let rhs_init: Vec<f64> = g_beta.iter().map(|v| -v).collect();
3215 let mut rhs_dev = self
3216 .stream
3217 .clone_htod(&rhs_init)
3218 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3219 accumulate_schur_rhs_only(&self.blas, d, k, n, &self.y_dev, &u_dev, &mut rhs_dev)?;
3220
3221 trsm_single(
3223 &self.blas,
3224 &self.stream,
3225 k,
3226 &self.schur_dev,
3227 &mut rhs_dev,
3228 false,
3229 false,
3230 )?;
3231 trsm_single(
3232 &self.blas,
3233 &self.stream,
3234 k,
3235 &self.schur_dev,
3236 &mut rhs_dev,
3237 false,
3238 true,
3239 )?;
3240 let delta_beta_host = self
3241 .stream
3242 .clone_dtoh(&rhs_dev)
3243 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3244 let delta_beta = Array1::from_vec(delta_beta_host);
3245
3246 accumulate_back_sub_rhs(&self.blas, d, k, n, &self.y_dev, &rhs_dev, &mut u_dev)?;
3248 trsm_batched_lower_inplace_transposed(
3249 &self.blas,
3250 &self.stream,
3251 d,
3252 n,
3253 1,
3254 &self.l_dev,
3255 &mut u_dev,
3256 )?;
3257 let x_host = self
3258 .stream
3259 .clone_dtoh(&u_dev)
3260 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3261 let mut delta_t = Array1::<f64>::zeros(n * d);
3262 for (i, v) in x_host.iter().enumerate() {
3263 delta_t[i] = -*v;
3264 }
3265
3266 Ok(ArrowSchurGpuSolution {
3267 delta_t,
3268 delta_beta,
3269 log_det_hessian: self.log_det_hessian,
3270 })
3271 }
3272 }
3273
3274 pub(super) fn solve_fused(
3275 sys: &ArrowSchurSystem,
3276 ridge_t: f64,
3277 ridge_beta: f64,
3278 ) -> Result<ArrowSchurGpuSolution, ArrowSchurGpuFailure> {
3279 let n = sys.rows.len();
3280 let d = sys.d;
3281 let k = sys.k;
3282 let plan = crate::gpu_kernels::arrow_schur_nvrtc::plan_fused_launch(n, d, k)
3283 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
3284 let p_max = plan.p_max;
3285 let r_template = plan.r_template;
3286
3287 let runtime = gam_gpu::linalg_dispatch::route_through_gpu(
3288 gam_gpu::linalg_dispatch::DispatchOp::SmallDenseBatchedPotrf { p: d, batch: n },
3289 )
3290 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
3291 let ctx = gam_gpu::device_runtime::cuda_context_for(runtime.device.ordinal)
3292 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
3293 let stream = ctx
3294 .new_stream()
3295 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3296 let cap = &runtime.device.capability;
3297 let key = crate::gpu_kernels::arrow_schur_nvrtc::FusedModuleCacheKey {
3298 cc_major: cap.compute_major,
3299 cc_minor: cap.compute_minor,
3300 p_max: p_max as u32,
3301 r_template: r_template as u32,
3302 };
3303 let module = fused_module_for(&ctx, key)?;
3304 let forward = module
3305 .load_function("arrow_schur_forward_pgroup")
3306 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3307 let back_sub = module
3308 .load_function("arrow_schur_back_sub_pgroup")
3309 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3310
3311 let (d_host, b_host, g_host) = pack_fused_host(sys, ridge_t, p_max, r_template);
3313 let d_dev = stream
3314 .clone_htod(&d_host)
3315 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3316 let b_dev = stream
3317 .clone_htod(&b_host)
3318 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3319 let g_dev = stream
3320 .clone_htod(&g_host)
3321 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3322 let mut l_out = stream
3323 .alloc_zeros::<f64>(n * p_max * p_max)
3324 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3325 let mut u_out = stream
3326 .alloc_zeros::<f64>(n * p_max)
3327 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3328 let mut y_out = stream
3329 .alloc_zeros::<f64>(n * p_max * r_template)
3330 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3331 let mut partial_s = stream
3332 .alloc_zeros::<f64>(plan.partial_s_doubles)
3333 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3334 let mut partial_r = stream
3335 .alloc_zeros::<f64>(plan.partial_r_doubles)
3336 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3337 let mut status_dev = stream
3338 .alloc_zeros::<i32>(n)
3339 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3340
3341 let cfg = LaunchConfig {
3343 grid_dim: (plan.blocks, 1, 1),
3344 block_dim: (plan.threads_per_block, 1, 1),
3345 shared_mem_bytes: 0,
3346 };
3347 let n_i32 = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
3348 let p_i32 = to_i32(d).ok_or(ArrowSchurGpuFailure::Unavailable)?;
3349 let r_i32 = to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?;
3350 let ridge_arg = ridge_t;
3351 {
3352 let mut builder = stream.launch_builder(&forward);
3353 builder
3354 .arg(&d_dev)
3355 .arg(&b_dev)
3356 .arg(&g_dev)
3357 .arg(&n_i32)
3358 .arg(&p_i32)
3359 .arg(&r_i32)
3360 .arg(&ridge_arg)
3361 .arg(&mut l_out)
3362 .arg(&mut u_out)
3363 .arg(&mut y_out)
3364 .arg(&mut partial_s)
3365 .arg(&mut partial_r)
3366 .arg(&mut status_dev);
3367 unsafe { builder.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3371 }
3372 stream
3373 .synchronize()
3374 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3375
3376 let status_host = stream
3378 .clone_dtoh(&status_dev)
3379 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3380 if let Some(row) = status_host.iter().position(|s| *s != 0) {
3381 let pivot = status_host[row];
3382 let scale = sys.rows[row]
3383 .htt
3384 .diag()
3385 .iter()
3386 .map(|v| v.abs())
3387 .fold(0.0_f64, f64::max)
3388 .max(1.0);
3389 return Err(ArrowSchurGpuFailure::RidgeBumpRequired {
3390 row,
3391 bump: scale * (pivot.abs() as f64).max(1.0) * f64::EPSILON.sqrt() * 1024.0,
3392 });
3393 }
3394
3395 let partial_s_host = stream
3397 .clone_dtoh(&partial_s)
3398 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3399 let partial_r_host = stream
3400 .clone_dtoh(&partial_r)
3401 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3402 let mut schur_host = vec![0.0_f64; k * k];
3403 for col in 0..k {
3404 for row in 0..k {
3405 let mut v = sys.hbb[[row, col]];
3406 if row == col {
3407 v += ridge_beta;
3408 }
3409 schur_host[col * k + row] = v;
3410 }
3411 }
3412 let mut rhs_host: Vec<f64> = sys.gb.iter().map(|v| -v).collect();
3413 for i in 0..n {
3414 let s_base = i * r_template * r_template;
3417 for col in 0..k {
3418 let col_base = s_base + col * r_template;
3419 let dst_col_base = col * k;
3420 for row in 0..k {
3421 schur_host[dst_col_base + row] -= partial_s_host[col_base + row];
3422 }
3423 }
3424 let r_base = i * r_template;
3425 for a in 0..k {
3426 rhs_host[a] += partial_r_host[r_base + a];
3427 }
3428 }
3429
3430 let mut schur_dev = stream
3432 .clone_htod(&schur_host)
3433 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3434 let mut rhs_dev = stream
3435 .clone_htod(&rhs_host)
3436 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3437 let solver =
3438 DnHandle::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3439 let blas = CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3440 let info = potrf_single(&solver, &stream, k, &mut schur_dev)?;
3441 if info != 0 {
3442 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
3443 reason: format!("fused Schur Cholesky failed at pivot {info}"),
3444 });
3445 }
3446 trsm_single(&blas, &stream, k, &schur_dev, &mut rhs_dev, false, false)?;
3447 trsm_single(&blas, &stream, k, &schur_dev, &mut rhs_dev, false, true)?;
3448 let delta_beta_host = stream
3449 .clone_dtoh(&rhs_dev)
3450 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3451 let delta_beta = Array1::from_vec(delta_beta_host.clone());
3452
3453 let mut delta_t_dev = stream
3455 .alloc_zeros::<f64>(n * p_max)
3456 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3457 let back_cfg = LaunchConfig {
3458 grid_dim: (plan.blocks, 1, 1),
3459 block_dim: (plan.threads_per_block, 1, 1),
3460 shared_mem_bytes: 0,
3461 };
3462 {
3463 let mut builder = stream.launch_builder(&back_sub);
3464 builder
3465 .arg(&l_out)
3466 .arg(&u_out)
3467 .arg(&y_out)
3468 .arg(&rhs_dev)
3469 .arg(&n_i32)
3470 .arg(&p_i32)
3471 .arg(&r_i32)
3472 .arg(&mut delta_t_dev);
3473 unsafe { builder.launch(back_cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3477 }
3478 stream
3479 .synchronize()
3480 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3481
3482 let delta_t_host = stream
3483 .clone_dtoh(&delta_t_dev)
3484 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3485 let mut delta_t = Array1::<f64>::zeros(n * d);
3486 for i in 0..n {
3487 let src_base = i * p_max;
3488 let dst_base = i * d;
3489 for r in 0..d {
3490 delta_t[dst_base + r] = delta_t_host[src_base + r];
3491 }
3492 }
3493
3494 let l_local_host = stream
3496 .clone_dtoh(&l_out)
3497 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3498 let l_schur_host = stream
3499 .clone_dtoh(&schur_dev)
3500 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
3501 let mut log_det = 0.0_f64;
3502 for i in 0..n {
3503 let base = i * p_max * p_max;
3504 for j in 0..d {
3505 log_det += l_local_host[base + j * p_max + j].ln();
3506 }
3507 }
3508 for j in 0..k {
3509 log_det += l_schur_host[j * k + j].ln();
3510 }
3511 log_det *= 2.0;
3512
3513 Ok(ArrowSchurGpuSolution {
3514 delta_t,
3515 delta_beta,
3516 log_det_hessian: log_det,
3517 })
3518 }
3519
3520 pub(super) fn build_schur_matvec_backend(
3530 sys: &ArrowSchurSystem,
3531 ridge_t: f64,
3532 ridge_beta: f64,
3533 ) -> Result<crate::arrow_schur::GpuSchurMatvec, super::ArrowSchurGpuFailure> {
3534 let n = sys.rows.len();
3535 let d = sys.d;
3536 let k = sys.k;
3537 let plan = crate::gpu_kernels::arrow_schur_nvrtc::plan_fused_launch(n, d, k)
3538 .ok_or(super::ArrowSchurGpuFailure::Unavailable)?;
3539 let p_max = plan.p_max;
3540 let r_template = plan.r_template;
3541
3542 let runtime = gam_gpu::linalg_dispatch::route_through_gpu(
3543 gam_gpu::linalg_dispatch::DispatchOp::SmallDenseBatchedPotrf { p: d, batch: n },
3544 )
3545 .ok_or(super::ArrowSchurGpuFailure::Unavailable)?;
3546 let ctx = gam_gpu::device_runtime::cuda_context_for(runtime.device.ordinal)
3547 .ok_or(super::ArrowSchurGpuFailure::Unavailable)?;
3548 let stream = ctx
3549 .new_stream()
3550 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3551 let cap = &runtime.device.capability;
3552 let key = crate::gpu_kernels::arrow_schur_nvrtc::FusedModuleCacheKey {
3553 cc_major: cap.compute_major,
3554 cc_minor: cap.compute_minor,
3555 p_max: p_max as u32,
3556 r_template: r_template as u32,
3557 };
3558 let module = fused_module_for(&ctx, key)?;
3559 let forward = module
3560 .load_function("arrow_schur_forward_pgroup")
3561 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3562
3563 let (d_host, b_host, g_host) = pack_fused_host(sys, ridge_t, p_max, r_template);
3564 let d_dev = stream
3565 .clone_htod(&d_host)
3566 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3567 let b_dev = stream
3568 .clone_htod(&b_host)
3569 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3570 let g_dev = stream
3571 .clone_htod(&g_host)
3572 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3573 let mut l_out = stream
3574 .alloc_zeros::<f64>(n * p_max * p_max)
3575 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3576 let mut u_out = stream
3577 .alloc_zeros::<f64>(n * p_max)
3578 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3579 let mut y_out = stream
3580 .alloc_zeros::<f64>(n * p_max * r_template)
3581 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3582 let mut partial_s = stream
3583 .alloc_zeros::<f64>(plan.partial_s_doubles)
3584 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3585 let mut partial_r = stream
3586 .alloc_zeros::<f64>(plan.partial_r_doubles)
3587 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3588 let mut status_dev = stream
3589 .alloc_zeros::<i32>(n)
3590 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3591
3592 let cfg = LaunchConfig {
3593 grid_dim: (plan.blocks, 1, 1),
3594 block_dim: (plan.threads_per_block, 1, 1),
3595 shared_mem_bytes: 0,
3596 };
3597 let n_i32 = to_i32(n).ok_or(super::ArrowSchurGpuFailure::Unavailable)?;
3598 let p_i32 = to_i32(d).ok_or(super::ArrowSchurGpuFailure::Unavailable)?;
3599 let r_i32 = to_i32(k).ok_or(super::ArrowSchurGpuFailure::Unavailable)?;
3600 let ridge_arg = ridge_t;
3601 {
3602 let mut builder = stream.launch_builder(&forward);
3603 builder
3604 .arg(&d_dev)
3605 .arg(&b_dev)
3606 .arg(&g_dev)
3607 .arg(&n_i32)
3608 .arg(&p_i32)
3609 .arg(&r_i32)
3610 .arg(&ridge_arg)
3611 .arg(&mut l_out)
3612 .arg(&mut u_out)
3613 .arg(&mut y_out)
3614 .arg(&mut partial_s)
3615 .arg(&mut partial_r)
3616 .arg(&mut status_dev);
3617 unsafe { builder.launch(cfg) }.map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3620 }
3621 stream
3622 .synchronize()
3623 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3624
3625 let status_host = stream
3626 .clone_dtoh(&status_dev)
3627 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3628 if let Some(row) = status_host.iter().position(|s| *s != 0) {
3629 let pivot = status_host[row];
3630 let scale = sys.rows[row]
3631 .htt
3632 .diag()
3633 .iter()
3634 .map(|v| v.abs())
3635 .fold(0.0_f64, f64::max)
3636 .max(1.0);
3637 return Err(super::ArrowSchurGpuFailure::RidgeBumpRequired {
3638 row,
3639 bump: scale * (pivot.abs() as f64).max(1.0) * f64::EPSILON.sqrt() * 1024.0,
3640 });
3641 }
3642
3643 let y_host = stream
3645 .clone_dtoh(&y_out)
3646 .map_err(|_| super::ArrowSchurGpuFailure::Unavailable)?;
3647
3648 let hbb_host: Vec<f64> = sys.hbb.iter().copied().collect();
3651 let hbb_is_kk = sys.hbb.dim() == (k, k);
3652 let hbb_matvec_opt = sys.hbb_matvec.clone();
3653
3654 let closure: crate::arrow_schur::GpuSchurMatvec =
3655 Arc::new(move |x: &Array1<f64>, out: &mut Array1<f64>| {
3656 assert_eq!(x.len(), k, "gpu_schur_matvec: x.len() != k");
3657 assert_eq!(out.len(), k, "gpu_schur_matvec: out.len() != k");
3658
3659 if let Some(ref mv) = hbb_matvec_opt {
3661 mv(x.view(), out);
3662 for a in 0..k {
3663 out[a] += ridge_beta * x[a];
3664 }
3665 } else if hbb_is_kk {
3666 for a in 0..k {
3668 let mut acc = ridge_beta * x[a];
3669 for b in 0..k {
3670 acc += hbb_host[a * k + b] * x[b];
3671 }
3672 out[a] = acc;
3673 }
3674 } else {
3675 for a in 0..k {
3676 out[a] = ridge_beta * x[a];
3677 }
3678 }
3679
3680 let mut z = vec![0.0_f64; d];
3683 for i in 0..n {
3684 let y_base = i * p_max * r_template;
3685 for r in 0..d {
3686 let mut acc = 0.0;
3687 for c in 0..k {
3688 acc += y_host[y_base + c * p_max + r] * x[c];
3689 }
3690 z[r] = acc;
3691 }
3692 for c in 0..k {
3693 let mut acc = 0.0;
3694 for r in 0..d {
3695 acc += y_host[y_base + c * p_max + r] * z[r];
3696 }
3697 out[c] -= acc;
3698 }
3699 }
3700 });
3701
3702 Ok(closure)
3703 }
3704
3705 struct DeviceSaeFrameBuffers {
3708 s_off: CudaSlice<i32>,
3710 s_m: CudaSlice<i32>,
3711 s_r: CudaSlice<i32>,
3712 s_ptr: CudaSlice<i32>,
3713 s_data: CudaSlice<f64>,
3714 s_blocks: usize,
3715 g_off_i: CudaSlice<i32>,
3717 g_off_j: CudaSlice<i32>,
3718 g_ri: CudaSlice<i32>,
3719 g_rj: CudaSlice<i32>,
3720 g_mi: CudaSlice<i32>,
3721 g_mj: CudaSlice<i32>,
3722 g_ptr: CudaSlice<i32>,
3723 g_data: CudaSlice<f64>,
3724 w_ptr: CudaSlice<i32>,
3725 w_data: CudaSlice<f64>,
3726 g_blocks: usize,
3727 g_max_work: usize,
3728 htb_ptr: CudaSlice<i32>,
3730 htb: CudaSlice<f64>,
3731 q_of: CudaSlice<i32>,
3732 ainv: CudaSlice<f64>,
3733 hvec: CudaSlice<f64>,
3734 svec: CudaSlice<f64>,
3735 n_rows: usize,
3736 k: usize,
3737 max_q: usize,
3738 }
3739
3740 fn flatten_device_sae_frame_data(
3741 sys: &ArrowSchurSystem,
3742 data: &DeviceSaePcgData,
3743 frame: &DeviceSaeFrameData,
3744 ridge_t: f64,
3745 stream: &Arc<CudaStream>,
3746 ) -> Result<DeviceSaeFrameBuffers, ArrowSchurGpuFailure> {
3747 let n_rows = sys.rows.len();
3748 let k = data.beta_dim;
3749 if frame.row_htbeta.len() != n_rows
3750 || frame.ranks.len() != frame.basis_sizes.len()
3751 || frame.border_offsets.len() != frame.ranks.len()
3752 || data.smooth_blocks.len() != frame.smooth_ranks.len()
3753 {
3754 return Err(ArrowSchurGpuFailure::Unavailable);
3755 }
3756
3757 let mut s_off = Vec::new();
3759 let mut s_m = Vec::new();
3760 let mut s_r = Vec::new();
3761 let mut s_ptr = vec![0_i32];
3762 let mut s_data = Vec::<f64>::new();
3763 for (blk, &r) in data.smooth_blocks.iter().zip(frame.smooth_ranks.iter()) {
3764 let (m, mc) = blk.factor_a.dim();
3765 if m != mc {
3766 return Err(ArrowSchurGpuFailure::Unavailable);
3767 }
3768 s_off.push(checked_i32(blk.global_offset)?);
3769 s_m.push(checked_i32(m)?);
3770 s_r.push(checked_i32(r)?);
3771 for ri in 0..m {
3772 for ci in 0..m {
3773 s_data.push(blk.factor_a[[ri, ci]]);
3774 }
3775 }
3776 s_ptr.push(checked_i32(s_data.len())?);
3777 }
3778
3779 let mut g_off_i = Vec::new();
3781 let mut g_off_j = Vec::new();
3782 let mut g_ri = Vec::new();
3783 let mut g_rj = Vec::new();
3784 let mut g_mi = Vec::new();
3785 let mut g_mj = Vec::new();
3786 let mut g_ptr = vec![0_i32];
3787 let mut g_data = Vec::<f64>::new();
3788 let mut w_ptr = vec![0_i32];
3789 let mut w_data = Vec::<f64>::new();
3790 let mut g_max_work = 0usize;
3791 for blk in &frame.frame_blocks {
3792 let ri = frame.ranks[blk.atom_i];
3793 let rj = frame.ranks[blk.atom_j];
3794 let (mi, mj) = blk.g.dim();
3795 if blk.w.dim() != (ri, rj) {
3796 return Err(ArrowSchurGpuFailure::Unavailable);
3797 }
3798 g_off_i.push(checked_i32(frame.border_offsets[blk.atom_i])?);
3799 g_off_j.push(checked_i32(frame.border_offsets[blk.atom_j])?);
3800 g_ri.push(checked_i32(ri)?);
3801 g_rj.push(checked_i32(rj)?);
3802 g_mi.push(checked_i32(mi)?);
3803 g_mj.push(checked_i32(mj)?);
3804 for r in 0..mi {
3805 for c in 0..mj {
3806 g_data.push(blk.g[[r, c]]);
3807 }
3808 }
3809 g_ptr.push(checked_i32(g_data.len())?);
3810 for a in 0..ri {
3811 for b in 0..rj {
3812 w_data.push(blk.w[[a, b]]);
3813 }
3814 }
3815 w_ptr.push(checked_i32(w_data.len())?);
3816 g_max_work = g_max_work.max(mi * ri);
3817 }
3818
3819 let mut htb_ptr = vec![0_i32];
3821 let mut htb = Vec::<f64>::new();
3822 let mut q_of = Vec::<i32>::with_capacity(n_rows);
3823 let mut max_q = 0usize;
3824 for (i, slab) in frame.row_htbeta.iter().enumerate() {
3825 let qi = sys.row_dims[i];
3826 let q_eff = if !slab.is_empty() && slab.len() == qi * k {
3829 qi
3830 } else {
3831 0
3832 };
3833 q_of.push(checked_i32(q_eff)?);
3834 max_q = max_q.max(q_eff);
3835 if q_eff > 0 {
3836 htb.extend_from_slice(slab);
3837 }
3838 htb_ptr.push(checked_i32(htb.len())?);
3839 }
3840 if max_q == 0 {
3841 max_q = 1;
3844 }
3845
3846 let mut ainv = vec![0.0_f64; n_rows * max_q * max_q];
3847 for (i, row) in sys.rows.iter().enumerate() {
3848 let q = q_of[i] as usize;
3849 if q == 0 {
3850 continue;
3851 }
3852 if row.htt.dim() != (q, q) {
3853 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
3854 reason: format!(
3855 "framed SAE device PCG row {i}: H_tt shape {:?} != ({q}, {q})",
3856 row.htt.dim()
3857 ),
3858 });
3859 }
3860 let mut block = row.htt.clone();
3861 for d in 0..q {
3862 block[[d, d]] += ridge_t;
3863 }
3864 let factor = gam_linalg::triangular::cholesky_factor_in_place(
3865 block.view(),
3866 gam_linalg::triangular::CholeskyGuard::NonnegativePivot,
3867 )
3868 .ok_or_else(|| {
3869 let scale = row
3870 .htt
3871 .diag()
3872 .iter()
3873 .map(|v| v.abs())
3874 .fold(0.0_f64, f64::max)
3875 .max(1.0);
3876 ArrowSchurGpuFailure::RidgeBumpRequired {
3877 row: i,
3878 bump: scale * f64::EPSILON.sqrt() * super::RIDGE_BUMP_EPS_MARGIN,
3879 }
3880 })?;
3881 for col in 0..q {
3882 let mut e = Array1::<f64>::zeros(q);
3883 e[col] = 1.0;
3884 let solved =
3885 gam_linalg::triangular::cholesky_solve_vector(factor.view(), e.view());
3886 for r in 0..q {
3887 ainv[i * max_q * max_q + r * max_q + col] = solved[r];
3888 }
3889 }
3890 }
3891
3892 let htod_i = |v: &[i32]| {
3893 stream
3894 .clone_htod(v)
3895 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
3896 };
3897 let htod_f = |v: &[f64]| {
3898 stream
3899 .clone_htod(v)
3900 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
3901 };
3902 Ok(DeviceSaeFrameBuffers {
3903 s_off: htod_i(&s_off)?,
3904 s_m: htod_i(&s_m)?,
3905 s_r: htod_i(&s_r)?,
3906 s_ptr: htod_i(&s_ptr)?,
3907 s_data: htod_f(&s_data)?,
3908 s_blocks: data.smooth_blocks.len(),
3909 g_off_i: htod_i(&g_off_i)?,
3910 g_off_j: htod_i(&g_off_j)?,
3911 g_ri: htod_i(&g_ri)?,
3912 g_rj: htod_i(&g_rj)?,
3913 g_mi: htod_i(&g_mi)?,
3914 g_mj: htod_i(&g_mj)?,
3915 g_ptr: htod_i(&g_ptr)?,
3916 g_data: htod_f(&g_data)?,
3917 w_ptr: htod_i(&w_ptr)?,
3918 w_data: htod_f(&w_data)?,
3919 g_blocks: frame.frame_blocks.len(),
3920 g_max_work,
3921 htb_ptr: htod_i(&htb_ptr)?,
3922 htb: htod_f(&htb)?,
3923 q_of: htod_i(&q_of)?,
3924 ainv: htod_f(&ainv)?,
3925 hvec: stream
3926 .alloc_zeros::<f64>(n_rows * max_q)
3927 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
3928 svec: stream
3929 .alloc_zeros::<f64>(n_rows * max_q)
3930 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?,
3931 n_rows,
3932 k,
3933 max_q,
3934 })
3935 }
3936
3937 fn sae_frame_penalty_diag_host(
3938 data: &DeviceSaePcgData,
3939 frame: &DeviceSaeFrameData,
3940 ridge_beta: f64,
3941 ) -> Result<Vec<f64>, ArrowSchurGpuFailure> {
3942 let mut diag = vec![ridge_beta; data.beta_dim];
3943 for (blk, &r) in data.smooth_blocks.iter().zip(frame.smooth_ranks.iter()) {
3945 let m = blk.factor_a.nrows();
3946 for ia in 0..m {
3947 let coeff = blk.factor_a[[ia, ia]];
3948 let base = blk.global_offset + ia * r;
3949 for ib in 0..r {
3950 if base + ib >= diag.len() {
3951 return Err(ArrowSchurGpuFailure::Unavailable);
3952 }
3953 diag[base + ib] += coeff;
3954 }
3955 }
3956 }
3957 for blk in &frame.frame_blocks {
3959 if blk.atom_i != blk.atom_j {
3960 continue;
3961 }
3962 let r = frame.ranks[blk.atom_i];
3963 let off = frame.border_offsets[blk.atom_i];
3964 let (mi, mj) = blk.g.dim();
3965 for li in 0..mi.min(mj) {
3966 let gii = blk.g[[li, li]];
3967 let base = off + li * r;
3968 for a in 0..r {
3969 if base + a >= diag.len() {
3970 return Err(ArrowSchurGpuFailure::Unavailable);
3971 }
3972 diag[base + a] += gii * blk.w[[a, a]];
3973 }
3974 }
3975 }
3976 Ok(diag)
3977 }
3978
3979 fn frame_grid(work: usize, n_rows: usize) -> Result<LaunchConfig, ArrowSchurGpuFailure> {
3980 Ok(LaunchConfig {
3981 grid_dim: (
3982 ((work as u32).saturating_add(255) / 256).max(1),
3983 checked_i32(n_rows)? as u32,
3984 1,
3985 ),
3986 block_dim: (256, 1, 1),
3987 shared_mem_bytes: 0,
3988 })
3989 }
3990
3991 fn launch_sae_frame_matvec(
3992 stream: &Arc<CudaStream>,
3993 module: &Arc<CudaModule>,
3994 buffers: &mut DeviceSaeFrameBuffers,
3995 x: &CudaSlice<f64>,
3996 out: &mut CudaSlice<f64>,
3997 ridge_beta: f64,
3998 ) -> Result<(), ArrowSchurGpuFailure> {
3999 launch_sae_init(stream, module, out, x, ridge_beta, buffers.k)?;
4000 if buffers.s_blocks > 0 {
4002 let kernel = module
4003 .load_function("arrow_sae_frame_smooth_matvec")
4004 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4005 let blocks_i32 = checked_i32(buffers.s_blocks)?;
4006 let cfg = frame_grid(buffers.k, buffers.s_blocks)?;
4007 let mut b = stream.launch_builder(&kernel);
4008 b.arg(x)
4009 .arg(&mut *out)
4010 .arg(&buffers.s_off)
4011 .arg(&buffers.s_m)
4012 .arg(&buffers.s_r)
4013 .arg(&buffers.s_ptr)
4014 .arg(&buffers.s_data)
4015 .arg(&blocks_i32);
4016 unsafe { b.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4019 }
4020 if buffers.g_blocks > 0 {
4022 let kernel = module
4023 .load_function("arrow_sae_frame_g_matvec")
4024 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4025 let blocks_i32 = checked_i32(buffers.g_blocks)?;
4026 let cfg = frame_grid(buffers.g_max_work.max(1), buffers.g_blocks)?;
4027 let mut b = stream.launch_builder(&kernel);
4028 b.arg(x)
4029 .arg(&mut *out)
4030 .arg(&buffers.g_off_i)
4031 .arg(&buffers.g_off_j)
4032 .arg(&buffers.g_ri)
4033 .arg(&buffers.g_rj)
4034 .arg(&buffers.g_mi)
4035 .arg(&buffers.g_mj)
4036 .arg(&buffers.g_ptr)
4037 .arg(&buffers.g_data)
4038 .arg(&buffers.w_ptr)
4039 .arg(&buffers.w_data)
4040 .arg(&blocks_i32);
4041 unsafe { b.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4044 }
4045 let k_i32 = checked_i32(buffers.k)?;
4047 let max_q_i32 = checked_i32(buffers.max_q)?;
4048 let n_rows_i32 = checked_i32(buffers.n_rows)?;
4049 {
4050 let kernel = module
4051 .load_function("arrow_sae_frame_apply_h")
4052 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4053 let cfg = frame_grid(buffers.max_q, buffers.n_rows)?;
4054 let mut b = stream.launch_builder(&kernel);
4055 b.arg(x)
4056 .arg(&buffers.htb_ptr)
4057 .arg(&buffers.htb)
4058 .arg(&buffers.q_of)
4059 .arg(&mut buffers.hvec)
4060 .arg(&k_i32)
4061 .arg(&max_q_i32)
4062 .arg(&n_rows_i32);
4063 unsafe { b.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4066 }
4067 {
4068 let kernel = module
4069 .load_function("arrow_sae_frame_apply_ainv")
4070 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4071 let cfg = frame_grid(buffers.max_q, buffers.n_rows)?;
4072 let mut b = stream.launch_builder(&kernel);
4073 b.arg(&buffers.ainv)
4074 .arg(&buffers.hvec)
4075 .arg(&buffers.q_of)
4076 .arg(&mut buffers.svec)
4077 .arg(&max_q_i32)
4078 .arg(&n_rows_i32);
4079 unsafe { b.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4082 }
4083 {
4084 let kernel = module
4085 .load_function("arrow_sae_frame_scatter_h")
4086 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4087 let cfg = frame_grid(buffers.k, buffers.n_rows)?;
4088 let mut b = stream.launch_builder(&kernel);
4089 b.arg(&buffers.svec)
4090 .arg(&buffers.htb_ptr)
4091 .arg(&buffers.htb)
4092 .arg(&buffers.q_of)
4093 .arg(out)
4094 .arg(&k_i32)
4095 .arg(&max_q_i32)
4096 .arg(&n_rows_i32);
4097 unsafe { b.launch(cfg) }.map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4100 }
4101 Ok(())
4102 }
4103
4104 fn launch_sae_frame_diag_sub(
4105 stream: &Arc<CudaStream>,
4106 module: &Arc<CudaModule>,
4107 buffers: &DeviceSaeFrameBuffers,
4108 diag: &mut CudaSlice<f64>,
4109 ) -> Result<(), ArrowSchurGpuFailure> {
4110 let kernel = module
4111 .load_function("arrow_sae_frame_diag_sub")
4112 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4113 let k_i32 = checked_i32(buffers.k)?;
4114 let max_q_i32 = checked_i32(buffers.max_q)?;
4115 let n_rows_i32 = checked_i32(buffers.n_rows)?;
4116 let cfg = frame_grid(buffers.k, buffers.n_rows)?;
4117 let mut b = stream.launch_builder(&kernel);
4118 b.arg(diag)
4119 .arg(&buffers.ainv)
4120 .arg(&buffers.htb_ptr)
4121 .arg(&buffers.htb)
4122 .arg(&buffers.q_of)
4123 .arg(&k_i32)
4124 .arg(&max_q_i32)
4125 .arg(&n_rows_i32);
4126 unsafe { b.launch(cfg) }
4128 .map(drop)
4129 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
4130 }
4131
4132 pub(super) fn solve_sae_matrix_free_pcg_framed(
4133 sys: &ArrowSchurSystem,
4134 data: &DeviceSaePcgData,
4135 ridge_t: f64,
4136 ridge_beta: f64,
4137 rhs_beta: &Array1<f64>,
4138 max_iterations: usize,
4139 relative_tolerance: f64,
4140 ) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurGpuFailure> {
4141 let k = rhs_beta.len();
4142 if k == 0 || data.beta_dim != k || sys.k != k {
4143 return Err(ArrowSchurGpuFailure::Unavailable);
4144 }
4145 let frame = data
4146 .frame
4147 .as_ref()
4148 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4149 let runtime = gam_gpu::device_runtime::GpuRuntime::global()
4150 .filter(|rt| {
4151 rt.policy().reduced_schur_matvec_should_offload(
4152 sys.rows.len(),
4153 sys.k,
4154 sys.d,
4155 max_iterations,
4156 )
4157 })
4158 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4159 let ctx = gam_gpu::device_runtime::cuda_context_for(runtime.selected_device().ordinal)
4160 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4161 let stream = ctx
4162 .new_stream()
4163 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4164 let blas = CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4165 let vector_module = pcg_vector_module(&ctx)?;
4166 let mut buffers = flatten_device_sae_frame_data(sys, data, frame, ridge_t, &stream)?;
4167
4168 let rhs_norm = rhs_beta.iter().map(|v| v * v).sum::<f64>().sqrt();
4169 if rhs_norm == 0.0 {
4170 return Ok((Array1::<f64>::zeros(k), PcgDiagnostics::default()));
4171 }
4172 let tol = (relative_tolerance.max(0.0) * rhs_norm).max(1e-12);
4173 let rhs_dev = stream
4174 .clone_htod(
4175 rhs_beta
4176 .as_slice()
4177 .ok_or(ArrowSchurGpuFailure::Unavailable)?,
4178 )
4179 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4180 let diag_host = sae_frame_penalty_diag_host(data, frame, ridge_beta)?;
4181 let mut diag_dev = stream
4182 .clone_htod(&diag_host)
4183 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4184 launch_sae_frame_diag_sub(&stream, vector_module, &buffers, &mut diag_dev)?;
4185 let diag_host = stream
4186 .clone_dtoh(&diag_dev)
4187 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4188 let mut inv_diag = Vec::with_capacity(k);
4189 for (idx, &d) in diag_host.iter().enumerate() {
4190 if !d.is_finite() || d <= 1.0e-18 {
4191 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4192 reason: format!(
4193 "framed SAE GPU PCG: non-positive Jacobi diagonal at {idx}: {d:e}"
4194 ),
4195 });
4196 }
4197 inv_diag.push(1.0 / d);
4198 }
4199 let inv_diag_dev = stream
4200 .clone_htod(&inv_diag)
4201 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4202
4203 let mut x_dev = stream
4204 .alloc_zeros::<f64>(k)
4205 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4206 let mut r_dev = stream
4207 .alloc_zeros::<f64>(k)
4208 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4209 device_copy(&blas, &stream, k, &rhs_dev, &mut r_dev)?;
4210 let mut z_dev = stream
4211 .alloc_zeros::<f64>(k)
4212 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4213 launch_jacobi_mul(&stream, vector_module, &inv_diag_dev, &r_dev, &mut z_dev, k)?;
4214 let mut p_dev = stream
4215 .alloc_zeros::<f64>(k)
4216 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4217 device_copy(&blas, &stream, k, &z_dev, &mut p_dev)?;
4218 let mut ap_dev = stream
4219 .alloc_zeros::<f64>(k)
4220 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4221
4222 let mut rz = device_dot(&blas, &stream, k, &r_dev, &z_dev)?;
4223 if rz <= 0.0 || !rz.is_finite() {
4224 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4225 reason: format!("framed SAE GPU PCG: non-positive initial rᵀM⁻¹r={rz:e}"),
4226 });
4227 }
4228 let mut diag = PcgDiagnostics {
4229 precond_apply_calls: 1,
4230 stopping_reason: PcgStopReason::MaxIter,
4231 ..PcgDiagnostics::default()
4232 };
4233 for _ in 0..max_iterations.max(1) {
4234 launch_sae_frame_matvec(
4235 &stream,
4236 vector_module,
4237 &mut buffers,
4238 &p_dev,
4239 &mut ap_dev,
4240 ridge_beta,
4241 )?;
4242 diag.matvec_calls += 1;
4243 diag.iterations += 1;
4244 let pap = device_dot(&blas, &stream, k, &p_dev, &ap_dev)?;
4245 if pap <= 0.0 || !pap.is_finite() {
4246 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4247 reason: format!("framed SAE GPU PCG: non-positive curvature pᵀAp={pap:e}"),
4248 });
4249 }
4250 let alpha = rz / pap;
4251 device_axpy(&blas, &stream, k, alpha, &p_dev, &mut x_dev)?;
4252 device_axpy(&blas, &stream, k, -alpha, &ap_dev, &mut r_dev)?;
4253 let r_norm = device_nrm2(&blas, &stream, k, &r_dev)?;
4254 if r_norm <= tol {
4255 diag.final_relative_residual = r_norm / rhs_norm;
4256 diag.stopping_reason = PcgStopReason::Converged;
4257 break;
4258 }
4259 launch_jacobi_mul(&stream, vector_module, &inv_diag_dev, &r_dev, &mut z_dev, k)?;
4260 diag.precond_apply_calls += 1;
4261 let rz_new = device_dot(&blas, &stream, k, &r_dev, &z_dev)?;
4262 if rz_new <= 0.0 || !rz_new.is_finite() {
4263 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4264 reason: format!("framed SAE GPU PCG: non-positive rᵀM⁻¹r={rz_new:e}"),
4265 });
4266 }
4267 let beta = rz_new / rz;
4268 launch_update_p(&stream, vector_module, &z_dev, beta, &mut p_dev, k)?;
4269 rz = rz_new;
4270 }
4271 if diag.stopping_reason != PcgStopReason::Converged {
4272 let r_norm = device_nrm2(&blas, &stream, k, &r_dev)?;
4273 diag.final_relative_residual = r_norm / rhs_norm;
4274 diag.stopping_reason = PcgStopReason::MaxIter;
4275 }
4276 let x = stream
4277 .clone_dtoh(&x_dev)
4278 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4279 Ok((Array1::from_vec(x), diag))
4280 }
4281
4282 pub(super) fn solve_sae_matrix_free_pcg(
4289 sys: &ArrowSchurSystem,
4290 data: &DeviceSaePcgData,
4291 ridge_t: f64,
4292 ridge_beta: f64,
4293 rhs_beta: &Array1<f64>,
4294 max_iterations: usize,
4295 relative_tolerance: f64,
4296 ) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurGpuFailure> {
4297 let k = rhs_beta.len();
4298 if k == 0 || data.beta_dim != k || sys.k != k {
4299 return Err(ArrowSchurGpuFailure::Unavailable);
4300 }
4301 if data.frame.is_some() {
4305 return Err(ArrowSchurGpuFailure::Unavailable);
4306 }
4307 let runtime = gam_gpu::device_runtime::GpuRuntime::global()
4321 .filter(|rt| {
4322 rt.policy().reduced_schur_matvec_should_offload(
4323 sys.rows.len(),
4324 sys.k,
4325 sys.d,
4326 max_iterations,
4327 )
4328 })
4329 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4330 let ctx = gam_gpu::device_runtime::cuda_context_for(runtime.selected_device().ordinal)
4331 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4332 let stream = ctx
4333 .new_stream()
4334 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4335 let blas = CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4336 let vector_module = pcg_vector_module(&ctx)?;
4337 let mut buffers = flatten_device_sae_data(sys, data, ridge_t, &stream)?;
4338
4339 let rhs_norm = rhs_beta.iter().map(|v| v * v).sum::<f64>().sqrt();
4340 if rhs_norm == 0.0 {
4341 return Ok((Array1::<f64>::zeros(k), PcgDiagnostics::default()));
4342 }
4343 let tol = (relative_tolerance.max(0.0) * rhs_norm).max(1e-12);
4344 let rhs_dev = stream
4345 .clone_htod(
4346 rhs_beta
4347 .as_slice()
4348 .ok_or(ArrowSchurGpuFailure::Unavailable)?,
4349 )
4350 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4351 let diag_host = sae_penalty_diag_host(data, ridge_beta)?;
4352 let mut diag_dev = stream
4353 .clone_htod(&diag_host)
4354 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4355 launch_sae_diag_sub(&stream, vector_module, &buffers, &mut diag_dev)?;
4356 let diag_host = stream
4357 .clone_dtoh(&diag_dev)
4358 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4359 let mut inv_diag = Vec::with_capacity(k);
4360 for (idx, &d) in diag_host.iter().enumerate() {
4361 if !d.is_finite() || d <= 1.0e-18 {
4362 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4363 reason: format!(
4364 "SAE matrix-free GPU PCG: non-positive Schur Jacobi diagonal at {idx}: {d:e}"
4365 ),
4366 });
4367 }
4368 inv_diag.push(1.0 / d);
4369 }
4370 let inv_diag_dev = stream
4371 .clone_htod(&inv_diag)
4372 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4373
4374 let mut x_dev = stream
4375 .alloc_zeros::<f64>(k)
4376 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4377 let mut r_dev = stream
4378 .alloc_zeros::<f64>(k)
4379 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4380 device_copy(&blas, &stream, k, &rhs_dev, &mut r_dev)?;
4381 let mut z_dev = stream
4382 .alloc_zeros::<f64>(k)
4383 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4384 launch_jacobi_mul(&stream, vector_module, &inv_diag_dev, &r_dev, &mut z_dev, k)?;
4385 let mut p_dev = stream
4386 .alloc_zeros::<f64>(k)
4387 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4388 device_copy(&blas, &stream, k, &z_dev, &mut p_dev)?;
4389 let mut ap_dev = stream
4390 .alloc_zeros::<f64>(k)
4391 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4392
4393 let mut rz = device_dot(&blas, &stream, k, &r_dev, &z_dev)?;
4394 if rz <= 0.0 || !rz.is_finite() {
4395 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4396 reason: format!("SAE matrix-free GPU PCG: non-positive initial rᵀM⁻¹r={rz:e}"),
4397 });
4398 }
4399 let mut diag = PcgDiagnostics {
4400 precond_apply_calls: 1,
4401 stopping_reason: PcgStopReason::MaxIter,
4402 ..PcgDiagnostics::default()
4403 };
4404
4405 for _ in 0..max_iterations.max(1) {
4406 launch_sae_matvec(
4407 &stream,
4408 vector_module,
4409 &mut buffers,
4410 &p_dev,
4411 &mut ap_dev,
4412 ridge_beta,
4413 )?;
4414 diag.matvec_calls += 1;
4415 diag.iterations += 1;
4416 let pap = device_dot(&blas, &stream, k, &p_dev, &ap_dev)?;
4417 if pap <= 0.0 || !pap.is_finite() {
4418 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4419 reason: format!("SAE matrix-free GPU PCG: non-positive curvature pᵀAp={pap:e}"),
4420 });
4421 }
4422 let alpha = rz / pap;
4423 device_axpy(&blas, &stream, k, alpha, &p_dev, &mut x_dev)?;
4424 device_axpy(&blas, &stream, k, -alpha, &ap_dev, &mut r_dev)?;
4425 let r_norm = device_nrm2(&blas, &stream, k, &r_dev)?;
4426 if r_norm <= tol {
4427 diag.final_relative_residual = r_norm / rhs_norm;
4428 diag.stopping_reason = PcgStopReason::Converged;
4429 break;
4430 }
4431 launch_jacobi_mul(&stream, vector_module, &inv_diag_dev, &r_dev, &mut z_dev, k)?;
4432 diag.precond_apply_calls += 1;
4433 let rz_new = device_dot(&blas, &stream, k, &r_dev, &z_dev)?;
4434 if rz_new <= 0.0 || !rz_new.is_finite() {
4435 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4436 reason: format!("SAE matrix-free GPU PCG: non-positive rᵀM⁻¹r={rz_new:e}"),
4437 });
4438 }
4439 let beta = rz_new / rz;
4440 launch_update_p(&stream, vector_module, &z_dev, beta, &mut p_dev, k)?;
4441 rz = rz_new;
4442 }
4443 if diag.stopping_reason != PcgStopReason::Converged {
4444 let r_norm = device_nrm2(&blas, &stream, k, &r_dev)?;
4445 diag.final_relative_residual = r_norm / rhs_norm;
4446 diag.stopping_reason = PcgStopReason::MaxIter;
4447 }
4448 let x = stream
4449 .clone_dtoh(&x_dev)
4450 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4451 Ok((Array1::from_vec(x), diag))
4452 }
4453
4454 pub(super) fn solve_reduced_beta_pcg_with_diagnostics(
4455 s_acc: &ndarray::Array2<f64>,
4456 rhs_beta: &Array1<f64>,
4457 max_iterations: usize,
4458 relative_tolerance: f64,
4459 ) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurGpuFailure> {
4460 let k = rhs_beta.len();
4461 let cg_iters = max_iterations.max(1);
4473 let runtime = gam_gpu::linalg_dispatch::route_through_gpu(
4474 gam_gpu::linalg_dispatch::DispatchOp::Gemm {
4475 m: k,
4476 n: k,
4477 k: cg_iters,
4478 },
4479 )
4480 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4481 let stream = gam_gpu::device_runtime::cuda_context_for(runtime.device.ordinal)
4482 .and_then(|ctx| ctx.new_stream().ok())
4483 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4484 let blas = CudaBlas::new(stream.clone()).map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4485 let ctx = gam_gpu::device_runtime::cuda_context_for(runtime.device.ordinal)
4486 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4487 let vector_module = pcg_vector_module(&ctx)?;
4488
4489 let mut inv_diag = vec![0.0_f64; k];
4491 for j in 0..k {
4492 let djj = s_acc[[j, j]];
4493 if !(djj > 0.0) {
4494 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4495 reason: format!(
4496 "reduced-β GPU PCG: Jacobi diagonal S[{j},{j}]={djj:e} not positive"
4497 ),
4498 });
4499 }
4500 inv_diag[j] = 1.0 / djj;
4501 }
4502
4503 let mut s_host = vec![0.0_f64; k * k];
4505 for col in 0..k {
4506 for row in 0..k {
4507 s_host[col * k + row] = s_acc[[row, col]];
4508 }
4509 }
4510 let s_dev = stream
4511 .clone_htod(&s_host)
4512 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4513
4514 let rhs_norm = rhs_beta.iter().map(|v| v * v).sum::<f64>().sqrt();
4518 if rhs_norm == 0.0 {
4519 return Ok((Array1::<f64>::zeros(k), PcgDiagnostics::default()));
4520 }
4521 let tol = (relative_tolerance.max(0.0) * rhs_norm).max(1e-12);
4522
4523 let mut x_dev = stream
4526 .alloc_zeros::<f64>(k)
4527 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4528 let mut r_dev = stream
4529 .clone_htod(
4530 rhs_beta
4531 .as_slice()
4532 .ok_or(ArrowSchurGpuFailure::Unavailable)?,
4533 )
4534 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4535 let inv_diag_dev = stream
4536 .clone_htod(&inv_diag)
4537 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4538 let mut z_dev = stream
4539 .alloc_zeros::<f64>(k)
4540 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4541 launch_jacobi_mul(&stream, vector_module, &inv_diag_dev, &r_dev, &mut z_dev, k)?;
4542 let mut p_dev = stream
4543 .alloc_zeros::<f64>(k)
4544 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4545 device_copy(&blas, &stream, k, &z_dev, &mut p_dev)?;
4546 let mut sp_dev = stream
4547 .alloc_zeros::<f64>(k)
4548 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4549 let mut rz = device_dot(&blas, &stream, k, &r_dev, &z_dev)?;
4550 let mut diag = PcgDiagnostics {
4551 precond_apply_calls: 1,
4552 stopping_reason: PcgStopReason::MaxIter,
4553 ..PcgDiagnostics::default()
4554 };
4555 if rz <= 0.0 || !rz.is_finite() {
4556 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4557 reason: format!("reduced-β GPU PCG: non-positive initial rᵀM⁻¹r={rz:e}"),
4558 });
4559 }
4560
4561 let max_iters = max_iterations.max(1);
4562 for _ in 0..max_iters {
4563 let gemv_cfg = GemvConfig::<f64> {
4565 trans: cublasOperation_t::CUBLAS_OP_N,
4566 m: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
4567 n: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
4568 alpha: 1.0,
4569 lda: to_i32(k).ok_or(ArrowSchurGpuFailure::Unavailable)?,
4570 incx: 1,
4571 beta: 0.0,
4572 incy: 1,
4573 };
4574 unsafe { blas.gemv(gemv_cfg, &s_dev, &p_dev, &mut sp_dev) }
4576 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4577 diag.matvec_calls += 1;
4578 diag.iterations += 1;
4579
4580 let p_sp = device_dot(&blas, &stream, k, &p_dev, &sp_dev)?;
4581 if !(p_sp > 0.0) {
4582 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4585 reason: format!("reduced-β GPU PCG: non-positive curvature pᵀSp={p_sp:e}"),
4586 });
4587 }
4588 let alpha = rz / p_sp;
4589 device_axpy(&blas, &stream, k, alpha, &p_dev, &mut x_dev)?;
4590 device_axpy(&blas, &stream, k, -alpha, &sp_dev, &mut r_dev)?;
4591 let r_norm = device_nrm2(&blas, &stream, k, &r_dev)?;
4592 if r_norm <= tol {
4593 diag.final_relative_residual = r_norm / rhs_norm;
4594 diag.stopping_reason = PcgStopReason::Converged;
4595 break;
4596 }
4597 launch_jacobi_mul(&stream, vector_module, &inv_diag_dev, &r_dev, &mut z_dev, k)?;
4598 diag.precond_apply_calls += 1;
4599 let rz_new = device_dot(&blas, &stream, k, &r_dev, &z_dev)?;
4600 if rz_new <= 0.0 || !rz_new.is_finite() {
4601 return Err(ArrowSchurGpuFailure::SchurFactorFailed {
4602 reason: format!("reduced-β GPU PCG: non-positive rᵀM⁻¹r={rz_new:e}"),
4603 });
4604 }
4605 let beta = rz_new / rz;
4606 launch_update_p(&stream, vector_module, &z_dev, beta, &mut p_dev, k)?;
4607 rz = rz_new;
4608 }
4609 if diag.stopping_reason != PcgStopReason::Converged {
4610 let r_norm = device_nrm2(&blas, &stream, k, &r_dev)?;
4611 diag.final_relative_residual = r_norm / rhs_norm;
4612 diag.stopping_reason = PcgStopReason::MaxIter;
4613 }
4614
4615 let x = stream
4616 .clone_dtoh(&x_dev)
4617 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4618 Ok((Array1::from_vec(x), diag))
4619 }
4620
4621 fn device_copy(
4622 blas: &CudaBlas,
4623 stream: &Arc<CudaStream>,
4624 n: usize,
4625 src: &CudaSlice<f64>,
4626 dst: &mut CudaSlice<f64>,
4627 ) -> Result<(), ArrowSchurGpuFailure> {
4628 let n_i = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
4629 let (src_ptr, _src_rec) = src.device_ptr(stream);
4630 let (dst_ptr, _dst_rec) = dst.device_ptr_mut(stream);
4631 let status = unsafe {
4634 cudarc::cublas::sys::cublasDcopy_v2(
4635 *blas.handle(),
4636 n_i,
4637 src_ptr as *const f64,
4638 1,
4639 dst_ptr as *mut f64,
4640 1,
4641 )
4642 };
4643 if status == cublasStatus_t::CUBLAS_STATUS_SUCCESS {
4644 Ok(())
4645 } else {
4646 Err(ArrowSchurGpuFailure::Unavailable)
4647 }
4648 }
4649
4650 fn device_axpy(
4651 blas: &CudaBlas,
4652 stream: &Arc<CudaStream>,
4653 n: usize,
4654 alpha: f64,
4655 x: &CudaSlice<f64>,
4656 y: &mut CudaSlice<f64>,
4657 ) -> Result<(), ArrowSchurGpuFailure> {
4658 let n_i = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
4659 let (x_ptr, _x_rec) = x.device_ptr(stream);
4660 let (y_ptr, _y_rec) = y.device_ptr_mut(stream);
4661 let status = unsafe {
4664 cudarc::cublas::sys::cublasDaxpy_v2(
4665 *blas.handle(),
4666 n_i,
4667 &alpha,
4668 x_ptr as *const f64,
4669 1,
4670 y_ptr as *mut f64,
4671 1,
4672 )
4673 };
4674 if status == cublasStatus_t::CUBLAS_STATUS_SUCCESS {
4675 Ok(())
4676 } else {
4677 Err(ArrowSchurGpuFailure::Unavailable)
4678 }
4679 }
4680
4681 fn device_dot(
4682 blas: &CudaBlas,
4683 stream: &Arc<CudaStream>,
4684 n: usize,
4685 x: &CudaSlice<f64>,
4686 y: &CudaSlice<f64>,
4687 ) -> Result<f64, ArrowSchurGpuFailure> {
4688 let n_i = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
4689 let (x_ptr, _x_rec) = x.device_ptr(stream);
4690 let (y_ptr, _y_rec) = y.device_ptr(stream);
4691 let mut result = 0.0_f64;
4692 let status = unsafe {
4696 cudarc::cublas::sys::cublasDdot_v2(
4697 *blas.handle(),
4698 n_i,
4699 x_ptr as *const f64,
4700 1,
4701 y_ptr as *const f64,
4702 1,
4703 &mut result,
4704 )
4705 };
4706 if status == cublasStatus_t::CUBLAS_STATUS_SUCCESS {
4707 Ok(result)
4708 } else {
4709 Err(ArrowSchurGpuFailure::Unavailable)
4710 }
4711 }
4712
4713 fn device_nrm2(
4714 blas: &CudaBlas,
4715 stream: &Arc<CudaStream>,
4716 n: usize,
4717 x: &CudaSlice<f64>,
4718 ) -> Result<f64, ArrowSchurGpuFailure> {
4719 let n_i = to_i32(n).ok_or(ArrowSchurGpuFailure::Unavailable)?;
4720 let (x_ptr, _x_rec) = x.device_ptr(stream);
4721 let mut result = 0.0_f64;
4722 let status = unsafe {
4726 cudarc::cublas::sys::cublasDnrm2_v2(
4727 *blas.handle(),
4728 n_i,
4729 x_ptr as *const f64,
4730 1,
4731 &mut result,
4732 )
4733 };
4734 if status == cublasStatus_t::CUBLAS_STATUS_SUCCESS {
4735 Ok(result)
4736 } else {
4737 Err(ArrowSchurGpuFailure::Unavailable)
4738 }
4739 }
4740
4741 #[cfg(test)]
4742 mod tests {
4743 use super::*;
4748 use crate::arrow_schur::{
4749 ArrowSchurSystem, DeviceSaeFrameData, DeviceSaePcgData, DeviceSaeSmoothBlock,
4750 FactoredFrameGBlock,
4751 };
4752 use ndarray::Array2;
4753
4754 fn device_matvec_once(
4757 sys: &ArrowSchurSystem,
4758 data: &DeviceSaePcgData,
4759 ridge_t: f64,
4760 ridge_beta: f64,
4761 x_host: &[f64],
4762 ) -> Result<Vec<f64>, ArrowSchurGpuFailure> {
4763 let k = x_host.len();
4764 let frame = data
4765 .frame
4766 .as_ref()
4767 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4768 let runtime = gam_gpu::device_runtime::GpuRuntime::global()
4769 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4770 let ctx =
4771 gam_gpu::device_runtime::cuda_context_for(runtime.selected_device().ordinal)
4772 .ok_or(ArrowSchurGpuFailure::Unavailable)?;
4773 let stream = ctx
4774 .new_stream()
4775 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4776 let vector_module = pcg_vector_module(&ctx)?;
4777 let mut buffers = flatten_device_sae_frame_data(sys, data, frame, ridge_t, &stream)?;
4778 let x_dev = stream
4779 .clone_htod(x_host)
4780 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4781 let mut out_dev = stream
4782 .alloc_zeros::<f64>(k)
4783 .map_err(|_| ArrowSchurGpuFailure::Unavailable)?;
4784 launch_sae_frame_matvec(
4785 &stream,
4786 vector_module,
4787 &mut buffers,
4788 &x_dev,
4789 &mut out_dev,
4790 ridge_beta,
4791 )?;
4792 stream
4793 .clone_dtoh(&out_dev)
4794 .map_err(|_| ArrowSchurGpuFailure::Unavailable)
4795 }
4796
4797 #[test]
4803 fn framed_sae_device_matvec_stage_diff_tiny_1551() {
4804 if gam_gpu::device_runtime::GpuRuntime::global().is_none() {
4805 return;
4806 }
4807 let p = 3usize;
4808 let ranks = vec![2usize, 3usize];
4809 let basis_sizes = vec![2usize, 2usize];
4810 let mut border_offsets = Vec::new();
4811 let mut acc = 0usize;
4812 for k in 0..2 {
4813 border_offsets.push(acc);
4814 acc += basis_sizes[k] * ranks[k];
4815 }
4816 let border_dim = acc; let frame_of = |k: usize| -> Array2<f64> {
4818 Array2::from_shape_fn((p, ranks[k]), |(i, j)| {
4819 0.1 + 0.2 * ((i + 1) as f64) * ((j + 1 + 2 * k) as f64)
4820 })
4821 };
4822 let frames: Vec<Array2<f64>> = (0..2).map(frame_of).collect();
4823 let w_of = |i: usize, j: usize| -> Array2<f64> {
4824 let (ui, uj) = (&frames[i], &frames[j]);
4825 Array2::from_shape_fn((ranks[i], ranks[j]), |(a, b)| {
4826 (0..p).map(|c| ui[[c, a]] * uj[[c, b]]).sum()
4827 })
4828 };
4829 let mut frame_blocks = Vec::new();
4830 for &(i, j) in &[(0usize, 0usize), (1usize, 1usize), (0, 1), (1, 0)] {
4831 let (mi, mj) = (basis_sizes[i], basis_sizes[j]);
4832 let mut g =
4833 Array2::<f64>::from_shape_fn((mi, mj), |(r, c)| 0.1 * (r + 2 * c + 1) as f64);
4834 if i == j {
4835 for r in 0..mi.min(mj) {
4836 g[[r, r]] += mi as f64 + 2.0;
4837 }
4838 }
4839 frame_blocks.push(FactoredFrameGBlock {
4840 atom_i: i,
4841 atom_j: j,
4842 g,
4843 w: w_of(i, j),
4844 });
4845 }
4846 let mut smooth_blocks = Vec::new();
4847 for k in 0..2 {
4848 let m = basis_sizes[k];
4849 let mut s =
4850 Array2::<f64>::from_shape_fn((m, m), |(r, c)| 0.05 * (r + c + 1) as f64);
4851 for r in 0..m {
4852 s[[r, r]] += 1.0;
4853 }
4854 smooth_blocks.push(DeviceSaeSmoothBlock {
4855 global_offset: border_offsets[k],
4856 factor_a: s,
4857 });
4858 }
4859 let smooth_ranks = ranks.clone();
4860 let n = 2usize;
4861 let q = 2usize;
4862 let mut sys = ArrowSchurSystem::new(n, q, border_dim);
4863 let mut row_htbeta = Vec::new();
4864 for i in 0..n {
4865 let mut htt =
4866 Array2::<f64>::from_shape_fn((q, q), |(r, c)| 0.3 * (r + c + 1) as f64);
4867 for r in 0..q {
4868 htt[[r, r]] += q as f64 + 2.0;
4869 }
4870 sys.rows[i].htt = htt;
4871 let mut slab = vec![0.0_f64; q * border_dim];
4872 for c in 0..q {
4873 for col in 0..border_dim {
4874 let v = 0.01 * ((c + 1) * (col + 1) + i) as f64;
4875 slab[c * border_dim + col] = v;
4876 sys.rows[i].htbeta[[c, col]] = v;
4877 }
4878 }
4879 row_htbeta.push(slab);
4880 }
4881 let data = DeviceSaePcgData {
4882 p,
4883 beta_dim: border_dim,
4884 a_phi: std::sync::Arc::from(Vec::new().into_boxed_slice()),
4885 local_jac: std::sync::Arc::from(Vec::new().into_boxed_slice()),
4886 smooth_blocks,
4887 sparse_g_blocks: Vec::new(),
4888 frame: Some(DeviceSaeFrameData {
4889 ranks,
4890 basis_sizes,
4891 border_offsets,
4892 frame_blocks,
4893 smooth_ranks,
4894 row_htbeta,
4895 }),
4896 };
4897 let ridge_t = 1e-7;
4898 let ridge_beta = 1e-6;
4899 let mut first_bad: Option<usize> = None;
4900 let mut worst = 0.0_f64;
4901 let mut worst_at = 0usize;
4902 let mut worst_dev = 0.0_f64;
4903 let mut worst_cpu = 0.0_f64;
4904 for col in 0..border_dim {
4905 let mut x = vec![0.0_f64; border_dim];
4906 x[col] = 1.0;
4907 let dev = match device_matvec_once(&sys, &data, ridge_t, ridge_beta, &x) {
4908 Ok(v) => v,
4909 Err(_) => return,
4910 };
4911 let mut cpu = vec![0.0_f64; border_dim];
4912 super::super::sae_framed_schur_matvec_cpu(
4913 &sys, &data, ridge_t, ridge_beta, &x, &mut cpu,
4914 )
4915 .expect("cpu matvec");
4916 for r in 0..border_dim {
4917 let d = (dev[r] - cpu[r]).abs();
4918 if d > 1e-9 && first_bad.is_none() {
4919 first_bad = Some(r * border_dim + col);
4920 }
4921 if d > worst {
4922 worst = d;
4923 worst_at = r * border_dim + col;
4924 worst_dev = dev[r];
4925 worst_cpu = cpu[r];
4926 }
4927 }
4928 }
4929 assert!(
4930 worst <= 1e-9,
4931 "[#1551 stage-diff] device framed matvec != CPU oracle: worst abs={worst:e} at \
4932 (row*K+col)={worst_at} (dev={worst_dev:e} cpu={worst_cpu:e}), \
4933 first_bad_idx={first_bad:?}; border layout: atom0 [0..4) rank2, atom1 [4..10) \
4934 rank3 — which atom-range the bad row/col falls in pins the stage (smooth=diag, \
4935 G⊗W=cross, reduced-Schur=dense per-row)",
4936 );
4937 }
4938 }
4939}
4940
4941#[cfg(test)]
4942mod tests {
4943 use super::*;
4944 use crate::arrow_schur::ArrowSchurSystem;
4945 use ndarray::{Array2, ArrayView1};
4946
4947 fn build_fixture(n: usize, d: usize, k: usize, seed: u64) -> ArrowSchurSystem {
4948 let mut sys = ArrowSchurSystem::new(n, d, k);
4949 let mut state = seed.wrapping_mul(0x9E37_79B9_7F4A_7C15);
4950 let mut sample = || -> f64 {
4951 state = state
4952 .wrapping_mul(6364136223846793005)
4953 .wrapping_add(1442695040888963407);
4954 ((state >> 33) as f64) / ((1u64 << 31) as f64) - 1.0
4955 };
4956 for row in &mut sys.rows {
4957 let mut a = Array2::<f64>::zeros((d, d));
4958 for r in 0..d {
4959 for c in 0..d {
4960 a[[r, c]] = sample();
4961 }
4962 }
4963 let mut htt = a.t().dot(&a);
4964 for r in 0..d {
4965 htt[[r, r]] += d as f64 + 1.0;
4966 }
4967 row.htt = htt;
4968 for r in 0..d {
4969 for c in 0..k {
4970 row.htbeta[[r, c]] = 0.1 * sample();
4971 }
4972 row.gt[r] = sample();
4973 }
4974 }
4975 let mut hbb_a = Array2::<f64>::zeros((k, k));
4976 for r in 0..k {
4977 for c in 0..k {
4978 hbb_a[[r, c]] = sample();
4979 }
4980 }
4981 let mut hbb = hbb_a.t().dot(&hbb_a);
4982 for r in 0..k {
4983 hbb[[r, r]] += k as f64 + 1.0;
4984 }
4985 sys.hbb = hbb;
4986 for r in 0..k {
4987 sys.gb[r] = sample();
4988 }
4989 sys
4990 }
4991
4992 fn device_pcg_fixture(k: usize) -> (Array2<f64>, Array1<f64>) {
4993 let mut s = Array2::<f64>::zeros((k, k));
4994 for row in 0..k {
4995 s[[row, row]] = 2.5 + 0.001 * ((row % 17) as f64);
4996 if row + 1 < k {
4997 s[[row, row + 1]] = -0.05;
4998 s[[row + 1, row]] = -0.05;
4999 }
5000 if row + 7 < k {
5001 s[[row, row + 7]] = 0.01;
5002 s[[row + 7, row]] = 0.01;
5003 }
5004 }
5005 let rhs = Array1::from_shape_fn(k, |idx| ((idx as f64 + 1.0) * 0.013).sin());
5006 (s, rhs)
5007 }
5008
5009 fn dense_pcg_cpu_reference(
5010 s: &Array2<f64>,
5011 rhs: &Array1<f64>,
5012 max_iterations: usize,
5013 relative_tolerance: f64,
5014 ) -> Array1<f64> {
5015 let k = rhs.len();
5016 let rhs_norm = rhs.iter().map(|v| v * v).sum::<f64>().sqrt();
5017 if rhs_norm == 0.0 {
5018 return Array1::<f64>::zeros(k);
5019 }
5020 let tol = (relative_tolerance.max(0.0) * rhs_norm).max(1e-12);
5021 let inv_diag: Vec<f64> = (0..k).map(|idx| 1.0 / s[[idx, idx]]).collect();
5022 let mut x = Array1::<f64>::zeros(k);
5023 let mut r = rhs.clone();
5024 let mut z = Array1::from_shape_fn(k, |idx| inv_diag[idx] * r[idx]);
5025 let mut p = z.clone();
5026 let mut sp = Array1::<f64>::zeros(k);
5027 let mut rz = r.iter().zip(z.iter()).map(|(a, b)| a * b).sum::<f64>();
5028 for _ in 0..max_iterations.max(1) {
5029 for row in 0..k {
5030 let mut acc = 0.0;
5031 for col in 0..k {
5032 acc += s[[row, col]] * p[col];
5033 }
5034 sp[row] = acc;
5035 }
5036 let p_sp = p.iter().zip(sp.iter()).map(|(a, b)| a * b).sum::<f64>();
5037 let alpha = rz / p_sp;
5038 for idx in 0..k {
5039 x[idx] += alpha * p[idx];
5040 r[idx] -= alpha * sp[idx];
5041 }
5042 let r_norm = r.iter().map(|v| v * v).sum::<f64>().sqrt();
5043 if r_norm <= tol {
5044 break;
5045 }
5046 for idx in 0..k {
5047 z[idx] = inv_diag[idx] * r[idx];
5048 }
5049 let rz_next = r.iter().zip(z.iter()).map(|(a, b)| a * b).sum::<f64>();
5050 let beta = rz_next / rz;
5051 for idx in 0..k {
5052 p[idx] = z[idx] + beta * p[idx];
5053 }
5054 rz = rz_next;
5055 }
5056 x
5057 }
5058
5059 #[test]
5060 fn device_resident_pcg_matches_cpu_reference_when_cuda_admits() {
5061 let (s, rhs) = device_pcg_fixture(512);
5062 let max_iterations = 200usize;
5063 let relative_tolerance = 1.0e-12;
5064 let cpu = dense_pcg_cpu_reference(&s, &rhs, max_iterations, relative_tolerance);
5065 let (device, diag) = match solve_reduced_beta_pcg_with_diagnostics(
5066 &s,
5067 &rhs,
5068 max_iterations,
5069 relative_tolerance,
5070 ) {
5071 Ok(result) => result,
5072 Err(failure) => {
5079 assert!(
5080 gam_gpu::device_runtime::GpuRuntime::global().is_none(),
5081 "#1017: CUDA device present but the device reduced-beta PCG \
5082 declined/faulted instead of returning a result (tag: {failure:?}) — \
5083 the kernel does not run correctly on GPU"
5084 );
5085 return;
5086 }
5087 };
5088 let max_err = cpu
5089 .iter()
5090 .zip(device.iter())
5091 .map(|(a, b)| (a - b).abs())
5092 .fold(0.0_f64, f64::max);
5093 assert!(
5094 max_err <= 1.0e-10,
5095 "device resident PCG parity failed: max_err={max_err:e}, diag={diag:?}"
5096 );
5097 assert!(diag.matvec_calls > 0);
5098 assert_eq!(diag.matvec_calls, diag.iterations);
5099 }
5100
5101 #[test]
5102 fn dense_reference_matches_independent_solve() {
5103 let sys = build_fixture(4, 5, 3, 7);
5104 let solution = solve_arrow_newton_step_dense_reference(&sys, 0.0, 0.0).unwrap();
5105 let n = sys.rows.len();
5109 let d = sys.d;
5110 let k = sys.k;
5111 let total = n * d + k;
5112 let mut h = Array2::<f64>::zeros((total, total));
5113 let mut g = ndarray::Array1::<f64>::zeros(total);
5114 for (i, row) in sys.rows.iter().enumerate() {
5115 let base = i * d;
5116 for c in 0..d {
5117 for r in 0..d {
5118 h[[base + r, base + c]] = row.htt[[r, c]];
5119 }
5120 }
5121 for c in 0..k {
5122 for r in 0..d {
5123 h[[base + r, n * d + c]] = row.htbeta[[r, c]];
5124 h[[n * d + c, base + r]] = row.htbeta[[r, c]];
5125 }
5126 }
5127 for r in 0..d {
5128 g[base + r] = row.gt[r];
5129 }
5130 }
5131 for c in 0..k {
5132 for r in 0..k {
5133 h[[n * d + r, n * d + c]] += sys.hbb[[r, c]];
5134 }
5135 g[n * d + c] = sys.gb[c];
5136 }
5137 let l = cholesky_factor_in_place(h.view(), CholeskyGuard::NonnegativePivot).unwrap();
5138 let rhs = g.mapv(|v| -v);
5139 let expected = cholesky_solve_vector(l.view(), rhs.view());
5140 for i in 0..n * d {
5141 assert!(
5142 (solution.delta_t[i] - expected[i]).abs() < 1e-10 * (1.0 + expected[i].abs()),
5143 "delta_t[{i}] mismatch: got {} expected {}",
5144 solution.delta_t[i],
5145 expected[i]
5146 );
5147 }
5148 for a in 0..k {
5149 assert!(
5150 (solution.delta_beta[a] - expected[n * d + a]).abs()
5151 < 1e-10 * (1.0 + expected[n * d + a].abs()),
5152 "delta_beta[{a}] mismatch"
5153 );
5154 }
5155 }
5156
5157 #[test]
5171 fn row_procedural_matvec_parallel_deterministic_and_matches_serial() {
5172 use crate::arrow_schur::SCHUR_MATVEC_PARALLEL_ROW_MIN;
5173 let n = SCHUR_MATVEC_PARALLEL_ROW_MIN + 96; let d = 3usize;
5175 let k = 24usize;
5176 let mut sys = build_fixture(n, d, k, 0xA17C_0FFE);
5177 let slabs: Vec<Array2<f64>> = sys.rows.iter().map(|row| row.htbeta.clone()).collect();
5182 let forward_slabs = slabs.clone();
5183 let transpose_slabs = slabs;
5184 sys.set_row_htbeta_operator(
5185 move |row: usize, x: ArrayView1<'_, f64>, out: &mut Array1<f64>| {
5186 let h = &forward_slabs[row];
5187 for r in 0..h.nrows() {
5188 let mut acc = 0.0_f64;
5189 for c in 0..h.ncols() {
5190 acc += h[[r, c]] * x[c];
5191 }
5192 out[r] = acc;
5193 }
5194 },
5195 move |row: usize, v: ArrayView1<'_, f64>, out: &mut Array1<f64>| {
5196 let h = &transpose_slabs[row];
5197 for r in 0..h.nrows() {
5198 for c in 0..h.ncols() {
5199 out[c] += h[[r, c]] * v[r];
5200 }
5201 }
5202 },
5203 );
5204
5205 let matvec = gpu_schur_matvec_backend(&sys, 0.0, 0.0)
5206 .expect("row-procedural matvec backend builds for matrix-free system");
5207 let x = Array1::from_shape_fn(k, |i| ((i as f64 + 1.0) * 0.37).sin());
5208
5209 let mut out_parallel_a = Array1::<f64>::zeros(k);
5213 matvec(&x, &mut out_parallel_a);
5214 let mut out_parallel_b = Array1::<f64>::zeros(k);
5215 matvec(&x, &mut out_parallel_b);
5216 for a in 0..k {
5217 assert_eq!(
5218 out_parallel_a[a].to_bits(),
5219 out_parallel_b[a].to_bits(),
5220 "row-procedural matvec parallel reduction is non-deterministic at index {a}"
5221 );
5222 }
5223
5224 let mut out_serial = Array1::<f64>::zeros(k);
5229 rayon::ThreadPoolBuilder::new()
5230 .num_threads(2)
5231 .build()
5232 .expect("build rayon pool")
5233 .install(|| matvec(&x, &mut out_serial));
5234
5235 let max_abs = out_serial.iter().fold(0.0_f64, |m, v| m.max(v.abs()));
5236 for a in 0..k {
5237 let diff = (out_parallel_a[a] - out_serial[a]).abs();
5238 assert!(
5239 diff <= 1e-12 * (1.0 + max_abs),
5240 "row-procedural matvec parallel vs serial diverged beyond reassociation \
5241 at index {a}: {} vs {} (diff={diff:e})",
5242 out_parallel_a[a],
5243 out_serial[a]
5244 );
5245 }
5246 }
5247
5248 #[test]
5255 fn framed_sae_schur_matvec_matches_dense_reference() {
5256 use crate::arrow_schur::{
5257 BetaPenaltyOp, DeviceSaeFrameData, DeviceSaePcgData, DeviceSaeSmoothBlock,
5258 FactoredFrameGBlock, FactoredFrameKroneckerOp, IdentityRightKroneckerPenaltyOp,
5259 };
5260
5261 let p = 4usize;
5262 let ranks = vec![2usize, 4usize, 3usize];
5264 let basis_sizes = vec![2usize, 1usize, 2usize];
5265 let n_atoms = ranks.len();
5266 let mut border_offsets = Vec::with_capacity(n_atoms);
5267 let mut acc = 0usize;
5268 for k in 0..n_atoms {
5269 border_offsets.push(acc);
5270 acc += basis_sizes[k] * ranks[k];
5271 }
5272 let border_dim = acc; let mut state = 0x1234_5678_9abc_def0u64;
5275 let mut sample = || -> f64 {
5276 state = state
5277 .wrapping_mul(6364136223846793005)
5278 .wrapping_add(1442695040888963407);
5279 ((state >> 33) as f64) / ((1u64 << 31) as f64) - 1.0
5280 };
5281
5282 let mut frames: Vec<Array2<f64>> = Vec::with_capacity(n_atoms);
5285 for k in 0..n_atoms {
5286 let r = ranks[k];
5287 let mut u = Array2::<f64>::zeros((p, r));
5288 for i in 0..p {
5289 for j in 0..r {
5290 u[[i, j]] = if r == p && i == j {
5291 1.0
5292 } else if r == p {
5293 0.0
5294 } else {
5295 sample()
5296 };
5297 }
5298 }
5299 frames.push(u);
5300 }
5301 let w_of = |i: usize, j: usize| -> Array2<f64> {
5302 let (ui, uj) = (&frames[i], &frames[j]);
5303 let (ri, rj) = (ranks[i], ranks[j]);
5304 let mut w = Array2::<f64>::zeros((ri, rj));
5305 for a in 0..ri {
5306 for b in 0..rj {
5307 let mut s = 0.0;
5308 for c in 0..p {
5309 s += ui[[c, a]] * uj[[c, b]];
5310 }
5311 w[[a, b]] = s;
5312 }
5313 }
5314 w
5315 };
5316
5317 let mut frame_blocks: Vec<FactoredFrameGBlock> = Vec::new();
5319 let mut pairs = vec![(0usize, 0usize), (1, 1), (2, 2), (0, 2), (2, 0)];
5320 pairs.sort();
5321 for &(i, j) in &pairs {
5322 let (mi, mj) = (basis_sizes[i], basis_sizes[j]);
5323 let mut g = Array2::<f64>::zeros((mi, mj));
5324 for r in 0..mi {
5325 for c in 0..mj {
5326 g[[r, c]] = 0.3 * sample();
5327 }
5328 }
5329 if i == j {
5331 for r in 0..mi.min(mj) {
5332 g[[r, r]] += mi as f64 + 2.0;
5333 }
5334 }
5335 frame_blocks.push(FactoredFrameGBlock {
5336 atom_i: i,
5337 atom_j: j,
5338 g,
5339 w: w_of(i, j),
5340 });
5341 }
5342
5343 let mut smooth_blocks: Vec<DeviceSaeSmoothBlock> = Vec::with_capacity(n_atoms);
5345 let mut smooth_ranks: Vec<usize> = Vec::with_capacity(n_atoms);
5346 for k in 0..n_atoms {
5347 let m = basis_sizes[k];
5348 let mut a = Array2::<f64>::zeros((m, m));
5349 for r in 0..m {
5350 for c in 0..m {
5351 a[[r, c]] = 0.2 * sample();
5352 }
5353 }
5354 let mut s = a.t().dot(&a);
5355 for r in 0..m {
5356 s[[r, r]] += 1.0;
5357 }
5358 smooth_blocks.push(DeviceSaeSmoothBlock {
5359 global_offset: border_offsets[k],
5360 factor_a: s,
5361 });
5362 smooth_ranks.push(ranks[k]);
5363 }
5364
5365 let n = 6usize;
5367 let q = 3usize;
5368 let mut sys = ArrowSchurSystem::new(n, q, border_dim);
5369 let mut row_htbeta: Vec<Vec<f64>> = Vec::with_capacity(n);
5370 for i in 0..n {
5371 let mut a = Array2::<f64>::zeros((q, q));
5373 for r in 0..q {
5374 for c in 0..q {
5375 a[[r, c]] = sample();
5376 }
5377 }
5378 let mut htt = a.t().dot(&a);
5379 for r in 0..q {
5380 htt[[r, r]] += q as f64 + 1.0;
5381 }
5382 sys.rows[i].htt = htt;
5383 let mut slab = vec![0.0_f64; q * border_dim];
5384 for c in 0..q {
5385 for col in 0..border_dim {
5386 let v = 0.15 * sample();
5387 slab[c * border_dim + col] = v;
5388 sys.rows[i].htbeta[[c, col]] = v;
5389 }
5390 }
5391 row_htbeta.push(slab);
5392 }
5393
5394 let data_op =
5397 FactoredFrameKroneckerOp::new(ranks.clone(), basis_sizes.clone(), frame_blocks.clone())
5398 .expect("frame op");
5399 let mut hbb = data_op.to_dense();
5400 for k in 0..n_atoms {
5401 let op = IdentityRightKroneckerPenaltyOp {
5402 factor_a: smooth_blocks[k].factor_a.clone(),
5403 p: ranks[k],
5404 global_offset: border_offsets[k],
5405 k: border_dim,
5406 };
5407 let d = op.to_dense();
5408 for r in 0..border_dim {
5409 for c in 0..border_dim {
5410 hbb[[r, c]] += d[[r, c]];
5411 }
5412 }
5413 }
5414 sys.hbb = hbb;
5415
5416 let data = DeviceSaePcgData {
5417 p,
5418 beta_dim: border_dim,
5419 a_phi: std::sync::Arc::from(Vec::new().into_boxed_slice()),
5420 local_jac: std::sync::Arc::from(Vec::new().into_boxed_slice()),
5421 smooth_blocks,
5422 sparse_g_blocks: Vec::new(),
5423 frame: Some(DeviceSaeFrameData {
5424 ranks: ranks.clone(),
5425 basis_sizes: basis_sizes.clone(),
5426 border_offsets: border_offsets.clone(),
5427 frame_blocks,
5428 smooth_ranks,
5429 row_htbeta,
5430 }),
5431 };
5432
5433 let ridge_t = 1e-7;
5434 let ridge_beta = 1e-6;
5435
5436 let mut s_dense = Array2::<f64>::zeros((border_dim, border_dim));
5440 for r in 0..border_dim {
5441 for c in 0..border_dim {
5442 s_dense[[r, c]] = sys.hbb[[r, c]];
5443 }
5444 s_dense[[r, r]] += ridge_beta;
5445 }
5446 for row in &sys.rows {
5447 let mut htt = row.htt.clone();
5448 for d in 0..q {
5449 htt[[d, d]] += ridge_t;
5450 }
5451 let factor = cholesky_factor_in_place(htt.view(), CholeskyGuard::NonnegativePivot)
5452 .expect("htt PD");
5453 let mut y = Array2::<f64>::zeros((q, border_dim));
5455 for col in 0..border_dim {
5456 let mut e = Array1::<f64>::zeros(q);
5457 for r in 0..q {
5458 e[r] = row.htbeta[[r, col]];
5459 }
5460 let solved = cholesky_solve_vector(factor.view(), e.view());
5461 for r in 0..q {
5462 y[[r, col]] = solved[r];
5463 }
5464 }
5465 for r in 0..border_dim {
5466 for c in 0..border_dim {
5467 let mut acc = 0.0;
5468 for d in 0..q {
5469 acc += row.htbeta[[d, r]] * y[[d, c]];
5470 }
5471 s_dense[[r, c]] -= acc;
5472 }
5473 }
5474 }
5475
5476 let mut max_rel = 0.0_f64;
5478 for trial in 0..4 {
5479 let x: Vec<f64> = (0..border_dim)
5480 .map(|a| 0.3 * ((a as f64 + trial as f64) * 0.21).cos() - 0.1)
5481 .collect();
5482 let mut got = vec![0.0_f64; border_dim];
5483 sae_framed_schur_matvec_cpu(&sys, &data, ridge_t, ridge_beta, &x, &mut got)
5484 .expect("framed matvec");
5485 let mut want = vec![0.0_f64; border_dim];
5486 for r in 0..border_dim {
5487 let mut acc = 0.0;
5488 for c in 0..border_dim {
5489 acc += s_dense[[r, c]] * x[c];
5490 }
5491 want[r] = acc;
5492 }
5493 let scale = want.iter().fold(0.0_f64, |m, v| m.max(v.abs())).max(1.0);
5494 for a in 0..border_dim {
5495 let rel = (got[a] - want[a]).abs() / scale;
5496 max_rel = max_rel.max(rel);
5497 }
5498 }
5499 assert!(
5500 max_rel <= 1e-10,
5501 "framed SAE Schur matvec vs dense reference diverged: max_rel={max_rel:e}"
5502 );
5503 }
5504
5505 #[test]
5511 fn framed_sae_device_pcg_matches_cpu_when_cuda_admits() {
5512 use crate::arrow_schur::{
5513 BetaPenaltyOp, DeviceSaeFrameData, DeviceSaePcgData, DeviceSaeSmoothBlock,
5514 FactoredFrameGBlock, FactoredFrameKroneckerOp, IdentityRightKroneckerPenaltyOp,
5515 };
5516
5517 let p = 6usize;
5521 let n_atoms = 8usize;
5522 let ranks: Vec<usize> = (0..n_atoms)
5523 .map(|k| if k % 2 == 0 { 3usize } else { p })
5524 .collect();
5525 let basis_sizes: Vec<usize> = (0..n_atoms).map(|_| 3usize).collect();
5526 let mut border_offsets = Vec::with_capacity(n_atoms);
5527 let mut acc = 0usize;
5528 for k in 0..n_atoms {
5529 border_offsets.push(acc);
5530 acc += basis_sizes[k] * ranks[k];
5531 }
5532 let border_dim = acc; let mut state = 0xfeed_face_dead_beefu64;
5535 let mut sample = || -> f64 {
5536 state = state
5537 .wrapping_mul(6364136223846793005)
5538 .wrapping_add(1442695040888963407);
5539 ((state >> 33) as f64) / ((1u64 << 31) as f64) - 1.0
5540 };
5541 let mut frames: Vec<Array2<f64>> = Vec::new();
5542 for k in 0..n_atoms {
5543 let r = ranks[k];
5544 let mut u = Array2::<f64>::zeros((p, r));
5545 for i in 0..p {
5546 for j in 0..r {
5547 u[[i, j]] = if r == p && i == j {
5548 1.0
5549 } else if r == p {
5550 0.0
5551 } else {
5552 sample()
5553 };
5554 }
5555 }
5556 frames.push(u);
5557 }
5558 let w_of = |i: usize, j: usize| {
5559 let (ui, uj) = (&frames[i], &frames[j]);
5560 let (ri, rj) = (ranks[i], ranks[j]);
5561 let mut w = Array2::<f64>::zeros((ri, rj));
5562 for a in 0..ri {
5563 for b in 0..rj {
5564 let mut s = 0.0;
5565 for c in 0..p {
5566 s += ui[[c, a]] * uj[[c, b]];
5567 }
5568 w[[a, b]] = s;
5569 }
5570 }
5571 w
5572 };
5573 let mut pairs: Vec<(usize, usize)> = (0..n_atoms).map(|k| (k, k)).collect();
5574 for &(i, j) in &[(0usize, 1usize), (2, 4), (3, 6)] {
5576 pairs.push((i, j));
5577 pairs.push((j, i));
5578 }
5579 let mut frame_blocks = Vec::new();
5580 for &(i, j) in &pairs {
5581 let (mi, mj) = (basis_sizes[i], basis_sizes[j]);
5582 let mut g = Array2::<f64>::zeros((mi, mj));
5583 for r in 0..mi {
5584 for c in 0..mj {
5585 g[[r, c]] = 0.25 * sample();
5586 }
5587 }
5588 if i == j {
5589 for r in 0..mi.min(mj) {
5590 g[[r, r]] += mi as f64 + 2.0;
5591 }
5592 }
5593 frame_blocks.push(FactoredFrameGBlock {
5594 atom_i: i,
5595 atom_j: j,
5596 g,
5597 w: w_of(i, j),
5598 });
5599 }
5600 let mut smooth_blocks = Vec::new();
5601 let mut smooth_ranks = Vec::new();
5602 for k in 0..n_atoms {
5603 let m = basis_sizes[k];
5604 let mut a = Array2::<f64>::zeros((m, m));
5605 for r in 0..m {
5606 for c in 0..m {
5607 a[[r, c]] = 0.2 * sample();
5608 }
5609 }
5610 let mut s = a.t().dot(&a);
5611 for r in 0..m {
5612 s[[r, r]] += 1.0;
5613 }
5614 smooth_blocks.push(DeviceSaeSmoothBlock {
5615 global_offset: border_offsets[k],
5616 factor_a: s,
5617 });
5618 smooth_ranks.push(ranks[k]);
5619 }
5620 let n = 400usize;
5621 let q = 4usize;
5622 let mut sys = ArrowSchurSystem::new(n, q, border_dim);
5623 let mut row_htbeta = Vec::new();
5624 for i in 0..n {
5625 let mut a = Array2::<f64>::zeros((q, q));
5626 for r in 0..q {
5627 for c in 0..q {
5628 a[[r, c]] = sample();
5629 }
5630 }
5631 let mut htt = a.t().dot(&a);
5632 for r in 0..q {
5633 htt[[r, r]] += q as f64 + 1.0;
5634 }
5635 sys.rows[i].htt = htt;
5636 let mut slab = vec![0.0_f64; q * border_dim];
5637 for c in 0..q {
5638 for col in 0..border_dim {
5639 let v = 0.02 * sample();
5642 slab[c * border_dim + col] = v;
5643 sys.rows[i].htbeta[[c, col]] = v;
5644 }
5645 }
5646 row_htbeta.push(slab);
5647 }
5648 let data_op =
5649 FactoredFrameKroneckerOp::new(ranks.clone(), basis_sizes.clone(), frame_blocks.clone())
5650 .expect("frame op");
5651 let mut hbb = data_op.to_dense();
5652 for k in 0..n_atoms {
5653 let op = IdentityRightKroneckerPenaltyOp {
5654 factor_a: smooth_blocks[k].factor_a.clone(),
5655 p: ranks[k],
5656 global_offset: border_offsets[k],
5657 k: border_dim,
5658 };
5659 let d = op.to_dense();
5660 for r in 0..border_dim {
5661 for c in 0..border_dim {
5662 hbb[[r, c]] += d[[r, c]];
5663 }
5664 }
5665 }
5666 sys.hbb = hbb;
5667 let data = DeviceSaePcgData {
5668 p,
5669 beta_dim: border_dim,
5670 a_phi: std::sync::Arc::from(Vec::new().into_boxed_slice()),
5671 local_jac: std::sync::Arc::from(Vec::new().into_boxed_slice()),
5672 smooth_blocks,
5673 sparse_g_blocks: Vec::new(),
5674 frame: Some(DeviceSaeFrameData {
5675 ranks: ranks.clone(),
5676 basis_sizes: basis_sizes.clone(),
5677 border_offsets: border_offsets.clone(),
5678 frame_blocks,
5679 smooth_ranks,
5680 row_htbeta,
5681 }),
5682 };
5683 let ridge_t = 1e-7;
5684 let ridge_beta = 1e-6;
5685 let rhs: Array1<f64> =
5686 Array1::from_shape_fn(border_dim, |a| ((a as f64 + 1.0) * 0.17).sin());
5687
5688 let (device, diag) =
5689 match solve_sae_matrix_free_pcg(&sys, &data, ridge_t, ridge_beta, &rhs, 400, 1e-12) {
5690 Ok(result) => result,
5691 Err(failure) => {
5697 assert!(
5698 gam_gpu::device_runtime::GpuRuntime::global().is_none(),
5699 "#1017: CUDA device present but the framed device SAE PCG \
5700 declined/faulted instead of returning a result (tag: {failure:?}) — \
5701 the kernel does not run correctly on GPU"
5702 );
5703 return;
5704 }
5705 };
5706
5707 let mut s_dense = Array2::<f64>::zeros((border_dim, border_dim));
5710 for col in 0..border_dim {
5711 let mut e = vec![0.0_f64; border_dim];
5712 e[col] = 1.0;
5713 let mut sc = vec![0.0_f64; border_dim];
5714 sae_framed_schur_matvec_cpu(&sys, &data, ridge_t, ridge_beta, &e, &mut sc)
5715 .expect("cpu matvec");
5716 for r in 0..border_dim {
5717 s_dense[[r, col]] = sc[r];
5718 }
5719 }
5720 let factor = cholesky_factor_in_place(s_dense.view(), CholeskyGuard::NonnegativePivot)
5721 .expect("S PD");
5722 let cpu = cholesky_solve_vector(factor.view(), rhs.view());
5723
5724 let scale = cpu.iter().fold(0.0_f64, |m, v| m.max(v.abs())).max(1.0);
5725 let mut max_rel = 0.0_f64;
5726 for a in 0..border_dim {
5727 max_rel = max_rel.max((device[a] - cpu[a]).abs() / scale);
5728 }
5729 let mut s_dev_resid = 0.0_f64;
5738 {
5739 let sx = s_dense.dot(&device);
5740 for a in 0..border_dim {
5741 s_dev_resid = s_dev_resid.max((sx[a] - rhs[a]).abs());
5742 }
5743 }
5744 let s_cpu_resid = {
5745 let sc = s_dense.dot(&cpu);
5746 let mut m = 0.0_f64;
5747 for a in 0..border_dim {
5748 m = m.max((sc[a] - rhs[a]).abs());
5749 }
5750 m
5751 };
5752 assert!(
5753 max_rel <= 1e-7,
5754 "[#1551 framed-triage] max_rel={max_rel:e} | device-vs-CPU-operator residual \
5755 ‖S_cpu·device−rhs‖={s_dev_resid:e} (CPU's own ={s_cpu_resid:e}) | device PCG \
5756 stop={:?} iters={} final_rel_resid={:e} — large operator-residual ⇒ device matvec \
5757 is a different operator (kernel bug); small ⇒ PCG/precond or singular-S issue",
5758 diag.stopping_reason,
5759 diag.iterations,
5760 diag.final_relative_residual,
5761 );
5762 }
5763}