singe_cusolver/
irs.rs

1use std::{ptr, slice};
2
3use singe_cuda::{data_type::DataTypeLike, memory::DeviceMemory};
4
5use crate::{
6    context::Context,
7    error::{Error, Result},
8    layout::{MatrixMut, MatrixRef},
9    sys, try_ffi,
10    types::{IrsRefinement, PrecisionType},
11    utility::{to_i32, to_u64, to_usize},
12};
13
/// Owning wrapper around a cuSOLVER IRS parameter handle.
///
/// Besides the raw handle, the wrapper remembers the precisions that were
/// last set through its own setters so that typed solver entry points can
/// compare them against the element type being solved.
#[derive(Debug)]
pub struct IrsParams {
    /// Raw cuSOLVER handle; null handles are rejected by
    /// [`IrsParams::create`] and [`IrsParams::from_raw`].
    handle: sys::cusolverDnIRSParams_t,
    /// Main precision last set through this wrapper, `None` until a setter
    /// succeeds.
    main_precision: Option<PrecisionType>,
    /// Lowest precision last set through this wrapper, `None` until a setter
    /// succeeds.
    lowest_precision: Option<PrecisionType>,
}
20
/// Owning wrapper around a cuSOLVER IRS infos handle, which holds the
/// refinement information produced by one IRS solve.
///
/// NOTE(review): the derived `Default` builds the wrapper from the handle
/// type's default value (presumably a null pointer) and so skips the null
/// check performed by [`IrsInfos::create`] and [`IrsInfos::from_raw`] —
/// confirm that a defaulted value is never passed to cuSOLVER.
#[derive(Debug, Default)]
pub struct IrsInfos {
    /// Raw cuSOLVER handle.
    handle: sys::cusolverDnIRSInfos_t,
    /// Set once [`IrsInfos::request_residual_history`] has succeeded; the
    /// residual history getters refuse to run while this is `false`.
    residual_history_requested: bool,
}
26
/// One row of the residual history reported for an IRS solve.
///
/// Both values use the same element type `T` as the history they came from.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ResidualHistoryEntry<T> {
    /// Total number of iterations performed up to this outer iteration.
    pub total_iterations: T,
    /// Residual norm measured at this outer iteration.
    pub residual_norm: T,
}
32
/// Residual history copied out of an [`IrsInfos`] structure.
#[derive(Debug, Clone, PartialEq)]
pub struct ResidualHistory<T> {
    /// History rows, one [`ResidualHistoryEntry`] per row.
    pub rows: Vec<ResidualHistoryEntry<T>>,
    /// Leading dimension of the underlying residual matrix
    /// (`MaxIters + 1` for the solve that produced it).
    pub leading_dimension: usize,
}
38
/// Problem dimensions of an IRS solve.
///
/// The two values are forwarded unchanged to the solver entry points by
/// [`IrsSolve::workspace_size`] and [`IrsSolve::execute`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IrsSolve {
    /// Problem size `n` handed to the solver (presumably the order of the
    /// square coefficient matrix — confirm against `xgesv`).
    pub n: usize,
    /// Number of right-hand sides handed to the solver.
    pub right_hand_sides: usize,
}
44
45impl IrsSolve {
46    pub fn new(n: usize, right_hand_sides: usize) -> Self {
47        Self {
48            n,
49            right_hand_sides,
50        }
51    }
52
53    pub fn workspace_size<T: DataTypeLike>(
54        self,
55        ctx: &Context,
56        params: &mut IrsParams,
57    ) -> Result<usize> {
58        xgesv_buffer_size::<T>(ctx, params, self.n, self.right_hand_sides)
59    }
60
61    pub fn execute<T: DataTypeLike>(
62        self,
63        ctx: &Context,
64        params: &mut IrsParams,
65        infos: &IrsInfos,
66        bindings: IrsSolveBindings<'_, T>,
67    ) -> Result<i32> {
68        xgesv(
69            ctx,
70            params,
71            infos,
72            self.n,
73            self.right_hand_sides,
74            bindings.a,
75            bindings.b,
76            bindings.x,
77            bindings.device_workspace,
78            bindings.dev_info,
79        )
80    }
81}
82
/// Buffers borrowed for the duration of one [`IrsSolve::execute`] call.
#[derive(Debug)]
pub struct IrsSolveBindings<'a, T> {
    /// Matrix `a`, borrowed mutably (presumably the coefficient matrix, which
    /// the solver may overwrite — confirm against `xgesv`).
    pub a: MatrixMut<'a, T>,
    /// Matrix `b`, borrowed immutably (presumably the right-hand sides).
    pub b: MatrixRef<'a, T>,
    /// Matrix `x`, borrowed mutably (presumably receives the solution).
    pub x: MatrixMut<'a, T>,
    /// Device scratch memory handed to the solver.
    pub device_workspace: &'a mut DeviceMemory<u8>,
    /// Device-side `i32` status slot handed to the solver.
    pub dev_info: &'a mut DeviceMemory<i32>,
}
91
// SAFETY: each wrapper is the sole owner of its handle. Configuration changes
// go through `&mut self`, while shared references are only used for
// inspection, so immutable sharing follows the cuSOLVER contract.
// NOTE(review): `IrsSolve::execute` takes `&IrsInfos` although a solve
// presumably records its results in that handle — confirm that sharing one
// `IrsInfos` between concurrent solves is ruled out elsewhere.
unsafe impl Send for IrsParams {}
unsafe impl Sync for IrsParams {}
unsafe impl Send for IrsInfos {}
unsafe impl Sync for IrsInfos {}
98
99impl IrsParams {
100    /// Creates and initializes the parameter structure for IRS solvers such as
101    /// [`xgesv`] and [`xgels`].
102    ///
103    /// The returned parameter structure can be reused across calls to the same
104    /// IRS solver or to different IRS solvers.
105    ///
106    /// In CUDA 10.2, the behavior was different and a new parameter structure
107    /// was required for each IRS solve call.
108    ///
109    /// You can also reconfigure the parameters between solves, but only after
110    /// the previous IRS call has completed.
111    ///
112    /// # Errors
113    ///
114    /// Returns an error if cuSOLVER cannot allocate the required resources
115    /// or does not return a valid handle.
116    pub fn create() -> Result<Self> {
117        let mut handle = ptr::null_mut();
118        unsafe {
119            try_ffi!(sys::cusolverDnIRSParamsCreate(&raw mut handle))?;
120        }
121        if handle.is_null() {
122            return Err(Error::NullHandle);
123        }
124        let mut params = Self {
125            handle,
126            main_precision: None,
127            lowest_precision: None,
128        };
129        params.set_refinement_solver(IrsRefinement::None)?;
130        Ok(params)
131    }
132
133    /// Sets the refinement solver used by IRS operations such as [`xgesv`] and
134    /// [`xgels`].
135    ///
136    /// Configure the refinement algorithm before the first IRS solve. Newly created [`IrsParams`] do not set one by default.
137    ///
138    /// The supported values are described below.
139    ///
140    /// [`IrsRefinement::NotSet`]: Solver is not set. The IRS solver returns an
141    /// error if this value is used.
142    ///
143    /// [`IrsRefinement::None`]: No refinement solver; the IRS solver performs a factorization followed by a solve without any refinement.
144    /// For example, if the IRS solver was [`xgesv`], this is equivalent to an
145    /// [`xgesv`] solve without refinement, with the factorization carried out in
146    /// the lowest configured precision.
147    /// If both the main and lowest precision are [`PrecisionType::R64F`], the
148    /// solve is effectively performed in `f64`.
149    ///
150    /// [`IrsRefinement::Classical`]: Classical iterative refinement solver.
151    /// Similar to the value used in LAPACK operations.
152    ///
153    /// [`IrsRefinement::Gmres`]: GMRES (Generalized Minimal Residual) based iterative refinement solver.
154    /// Recent studies use GMRES as a refinement solver that can outperform
155    /// classical iterative refinement.
156    /// Recommended setting based on cuSOLVER experimentation.
157    ///
158    /// [`IrsRefinement::ClassicalGmres`]: Classical iterative refinement solver that uses the GMRES (Generalized Minimal Residual) internally to solve the correction equation at each iteration.
159    /// The classical refinement iteration is the outer iteration, and GMRES is
160    /// the inner iteration.
161    /// If the tolerance of the inner GMRES is set very low, for
162    /// example near machine precision, then the outer *classical refinement
163    /// iteration* performs only one iteration and this option behaves like
164    /// [`IrsRefinement::Gmres`].
165    ///
166    /// [`IrsRefinement::GmresGmres`]: GMRES-based iterative refinement solver
167    /// that uses another GMRES solve internally for the preconditioned system.
168    ///
169    /// # Errors
170    ///
171    /// Returns an error if cuSOLVER rejects the parameter structure.
172    pub fn set_refinement_solver(&mut self, refinement: IrsRefinement) -> Result<()> {
173        unsafe {
174            try_ffi!(sys::cusolverDnIRSParamsSetRefinementSolver(
175                self.as_raw(),
176                refinement.into(),
177            ))?;
178        }
179        Ok(())
180    }
181
182    /// Sets the main precision for the Iterative Refinement Solver (IRS).
183    ///
184    /// The main precision is the type of the input and output data.
185    /// Configure both the main and lowest precision before the first IRS solve. Those
186    /// values are not inferred when the parameter structure is created because
187    /// they depend on the input/output data type and the requested solver
188    /// configuration. You can set them independently or together with
189    /// [`IrsParams::set_solver_precisions`].
190    ///
191    /// # Errors
192    ///
193    /// Returns an error if cuSOLVER rejects the parameter structure.
194    pub fn set_main_precision(&mut self, precision: PrecisionType) -> Result<()> {
195        unsafe {
196            try_ffi!(sys::cusolverDnIRSParamsSetSolverMainPrecision(
197                self.as_raw(),
198                precision.into(),
199            ))?;
200        }
201        self.main_precision = Some(precision);
202        Ok(())
203    }
204
205    /// Sets the lowest precision that the IRS solver may use.
206    ///
207    /// The lowest precision is the minimum compute precision used
208    /// during the LU factorization process.
209    ///
210    /// Configure both the main and lowest precision before the first IRS solve. They
211    /// are not inferred when creating the parameter structure because they
212    /// depend on the input and output data types and the requested solver
213    /// configuration.
214    /// Usually the lowest precision defines the speedup that can be achieved.
215    /// The ratio between the performance of the lowest precision and the main
216    /// precision gives an approximate upper bound on the speedup.
217    /// More precisely, it depends on many factors, but for large matrices it is
218    /// often tied to the performance ratio of large GEMM-like kernels.
219    /// For instance, if the input/output precision is real double precision
220    /// [`PrecisionType::R64F`] and the lowest precision is
221    /// [`PrecisionType::R32F`], then a speedup of at most about 2x is expected
222    /// for large problem sizes.
223    /// If the lowest precision is [`PrecisionType::R16F`], expect 3x-4x.
224    /// A reasonable strategy accounts for the number of right-hand sides, the matrix size, and the convergence rate.
225    ///
226    /// # Errors
227    ///
228    /// Returns an error if cuSOLVER rejects the parameter structure.
229    pub fn set_lowest_precision(&mut self, precision: PrecisionType) -> Result<()> {
230        unsafe {
231            try_ffi!(sys::cusolverDnIRSParamsSetSolverLowestPrecision(
232                self.as_raw(),
233                precision.into(),
234            ))?;
235        }
236        self.lowest_precision = Some(precision);
237        Ok(())
238    }
239
240    /// Sets both the main and lowest precision for the Iterative Refinement
241    /// Solver (IRS).
242    ///
243    /// The main precision is the precision of the input and output data.
244    /// The lowest precision is the minimum compute precision used
245    /// during the LU factorization process.
246    ///
247    /// Configure both values before the first IRS solve. They are not inferred when
248    /// creating the parameter structure because they depend on the input and
249    /// output data types and the requested solver configuration.
250    ///
251    /// Convenience wrapper around
252    /// [`IrsParams::set_main_precision`] and
253    /// [`IrsParams::set_lowest_precision`].
254    /// All possible combinations of main/lowest precision are described in the table below.
255    /// Usually the lowest precision defines the speedup that can be achieved.
256    /// The ratio between the performance of the lowest precision and the main
257    /// precision gives an approximate upper bound on the speedup.
258    /// More precisely, it depends on many factors, but for large matrices it is
259    /// often tied to the performance ratio of large GEMM-like kernels.
260    /// For instance, if the input/output precision is real double precision
261    /// [`PrecisionType::R64F`] and the lowest precision is
262    /// [`PrecisionType::R32F`], then a speedup of at most about 2x is expected
263    /// for large problem sizes.
264    /// If the lowest precision is [`PrecisionType::R16F`], expect 3x-4x.
265    /// A reasonable strategy accounts for the number of right-hand sides, the matrix size, and the convergence rate.
266    ///
267    /// **Supported input/output data type and lower precision for the IRS solver**
268    ///
269    /// | **input/output Data Type (for example, main precision)** | **Supported values for the lowest precision** |
270    /// | --- | --- |
271    /// | [`PrecisionType::C64F`] | [`PrecisionType::C64F`], [`PrecisionType::C32F`], [`PrecisionType::C16F`], [`PrecisionType::C16Bf`], [`PrecisionType::CTf32`] |
272    /// | [`PrecisionType::C32F`] | [`PrecisionType::C32F`], [`PrecisionType::C16F`], [`PrecisionType::C16Bf`], [`PrecisionType::CTf32`] |
273    /// | [`PrecisionType::R64F`] | [`PrecisionType::R64F`], [`PrecisionType::R32F`], [`PrecisionType::R16F`], [`PrecisionType::R16Bf`], [`PrecisionType::RTf32`] |
274    /// | [`PrecisionType::R32F`] | [`PrecisionType::R32F`], [`PrecisionType::R16F`], [`PrecisionType::R16Bf`], [`PrecisionType::RTf32`] |
275    ///
276    /// # Errors
277    ///
278    /// Returns an error if cuSOLVER rejects the parameter structure.
279    pub fn set_solver_precisions(
280        &mut self,
281        main_precision: PrecisionType,
282        lowest_precision: PrecisionType,
283    ) -> Result<()> {
284        unsafe {
285            try_ffi!(sys::cusolverDnIRSParamsSetSolverPrecisions(
286                self.as_raw(),
287                main_precision.into(),
288                lowest_precision.into(),
289            ))?;
290        }
291        self.main_precision = Some(main_precision);
292        self.lowest_precision = Some(lowest_precision);
293        Ok(())
294    }
295
296    /// Sets the tolerance for the refinement solver.
297    /// By default it is such that all the RHS satisfy:
298    ///
299    /// `RNRM &lt; SQRT(N)*XNRM*ANRM*EPS*BWDMAX` where
300    ///
301    /// * RNRM is the infinity-norm of the residual
302    /// * XNRM is the infinity-norm of the solution
303    /// * ANRM is the infinity-operator-norm of the matrix A
304    /// * EPS is the machine epsilon for the input/output data type that matches
305    ///   LAPACK `xLAMCH('Epsilon')`
306    /// * BWDMAX, the value BWDMAX is fixed to 1.0
307    ///
308    /// Use this to set the tolerance to a lower or higher value.
309    /// The tolerance value is always stored in real double precision,
310    /// regardless of the input and output data type.
311    ///
312    /// # Errors
313    ///
314    /// Returns an error if cuSOLVER rejects the parameter structure.
315    pub fn set_tolerance(&mut self, tolerance: f64) -> Result<()> {
316        unsafe {
317            try_ffi!(sys::cusolverDnIRSParamsSetTol(self.as_raw(), tolerance))?;
318        }
319        Ok(())
320    }
321
322    /// Sets the tolerance for the inner refinement solver when
323    /// the refinement solver consists of two levels, for example
324    /// [`IrsRefinement::ClassicalGmres`] or [`IrsRefinement::GmresGmres`].
325    /// Ignored for one-level refinement solvers such as [`IrsRefinement::Classical`] or [`IrsRefinement::Gmres`].
326    /// The default value is 1e-4.
327    /// This sets the tolerance for the inner solver, such as the inner GMRES.
328    /// For example, if the refinement solver is
329    /// [`IrsRefinement::ClassicalGmres`], setting this tolerance means that the
330    /// inner GMRES solver converges to that tolerance at each outer
331    /// iteration of the classical refinement solver.
332    /// The tolerance value is always stored in real double precision,
333    /// regardless of the input and output data type.
334    ///
335    /// # Errors
336    ///
337    /// Returns an error if cuSOLVER rejects the parameter structure.
338    pub fn set_inner_tolerance(&mut self, tolerance: f64) -> Result<()> {
339        unsafe {
340            try_ffi!(sys::cusolverDnIRSParamsSetTolInner(
341                self.as_raw(),
342                tolerance,
343            ))?;
344        }
345        Ok(())
346    }
347
348    /// Sets the total number of allowed refinement iterations before the solver stops.
349    /// The total is the sum of the outer and inner iterations. Inner iterations are meaningful when a two-level refinement solver is configured.
350    /// The default value is 50.
351    ///
352    /// # Errors
353    ///
354    /// Returns an error if cuSOLVER rejects the parameter structure.
355    pub fn set_max_iterations(&mut self, max_iterations: i32) -> Result<()> {
356        unsafe {
357            try_ffi!(sys::cusolverDnIRSParamsSetMaxIters(
358                self.as_raw(),
359                max_iterations,
360            ))?;
361        }
362        Ok(())
363    }
364
365    /// Sets the maximum number of iterations allowed for the inner refinement solver.
366    /// Ignored for one-level refinement solvers such as [`IrsRefinement::Classical`] or [`IrsRefinement::Gmres`].
367    /// The inner refinement solver stops after reaching either the inner tolerance or `MaxItersInner`.
368    /// The default value is 50.
369    /// Cannot be larger than `MaxIters` because `MaxIters` is the total number of allowed iterations.
370    /// If [`IrsParams::set_max_iterations`] is called after this method, it has priority and overwrites `MaxItersInner` with `min(MaxIters, MaxItersInner)`.
371    ///
372    /// # Errors
373    ///
374    /// Returns an error if `max_iterations` is larger than `MaxIters`, or if
375    /// cuSOLVER rejects the parameter structure.
376    pub fn set_max_inner_iterations(&mut self, max_iterations: i32) -> Result<()> {
377        unsafe {
378            try_ffi!(sys::cusolverDnIRSParamsSetMaxItersInner(
379                self.as_raw(),
380                max_iterations,
381            ))?;
382        }
383        Ok(())
384    }
385
386    /// Returns the current maximum-iteration setting in this parameter structure.
387    /// Current parameter configuration, distinct from [`IrsInfos::max_iterations`], which returns the maximum number of iterations allowed for a particular IRS solver call.
388    /// The parameter structure can be reused across many IRS solver calls.
389    /// The allowed `MaxIters` value can change between calls, while the `Infos` structure contains information about one particular call and cannot be reused for different calls.
390    ///
391    /// # Errors
392    ///
393    /// Returns an error if cuSOLVER rejects the parameter structure.
394    pub fn max_iterations(&self) -> Result<i32> {
395        let mut value = 0;
396        unsafe {
397            try_ffi!(sys::cusolverDnIRSParamsGetMaxIters(
398                self.as_raw(),
399                &raw mut value,
400            ))?;
401        }
402        Ok(value)
403    }
404
405    /// Enables fallback to the main precision if the Iterative Refinement Solver (IRS) fails to converge.
406    /// If the IRS solver fails to converge, it returns a non-convergence code such as `niter < 0`.
407    /// With fallback disabled, it returns the non-convergent solution as-is.
408    /// With fallback enabled, it falls back to the main precision, which is the input/output data precision, and solves the problem again from scratch.
409    /// This fallback is the default behavior.
410    ///
411    /// # Errors
412    ///
413    /// Returns an error if cuSOLVER rejects the parameter structure.
414    pub fn enable_fallback(&mut self) -> Result<()> {
415        unsafe {
416            try_ffi!(sys::cusolverDnIRSParamsEnableFallback(self.as_raw()))?;
417        }
418        Ok(())
419    }
420
421    /// Disables fallback to the main precision if the Iterative Refinement Solver (IRS) fails to converge.
422    /// If the IRS solver fails to converge, it returns a non-convergence code such as `niter < 0`.
423    /// With fallback disabled, the returned solution is whatever the refinement solver reached before returning.
424    /// Disabling fallback does not guarantee that the solution is accurate.
425    /// Re-enable fallback with [`IrsParams::enable_fallback`].
426    ///
427    /// # Errors
428    ///
429    /// Returns an error if cuSOLVER rejects the parameter structure.
430    pub fn disable_fallback(&mut self) -> Result<()> {
431        unsafe {
432            try_ffi!(sys::cusolverDnIRSParamsDisableFallback(self.as_raw()))?;
433        }
434        Ok(())
435    }
436
437    fn ensure_type_precision<T: DataTypeLike>(&mut self) -> Result<()> {
438        let precision = PrecisionType::from_data_type(T::data_type())
439            .ok_or(Error::InvalidPrecisionConfiguration)?;
440        match self.main_precision {
441            Some(existing) if existing != precision => {
442                return Err(Error::InvalidPrecisionConfiguration);
443            }
444            None => self.set_main_precision(precision)?,
445            _ => {}
446        }
447        if self.lowest_precision.is_none() {
448            self.set_lowest_precision(precision)?;
449        }
450        Ok(())
451    }
452
    /// Returns the raw cuSOLVER IRS params handle without giving up
    /// ownership; the wrapper still destroys it on drop.
    pub fn as_raw(&self) -> sys::cusolverDnIRSParams_t {
        self.handle
    }
456
457    /// Takes ownership of a raw cuSOLVER IRS params handle.
458    ///
459    /// # Safety
460    ///
461    /// `handle` must be a valid `cusolverDnIRSParams_t` created by cuSOLVER.
462    /// The returned wrapper takes ownership and will destroy it with
463    /// `cusolverDnIRSParamsDestroy`; no other owner may destroy or keep using it.
464    pub unsafe fn from_raw(handle: sys::cusolverDnIRSParams_t) -> Result<Self> {
465        if handle.is_null() {
466            return Err(Error::NullHandle);
467        }
468        Ok(Self {
469            handle,
470            main_precision: None,
471            lowest_precision: None,
472        })
473    }
474
475    /// Releases ownership and returns the raw cuSOLVER IRS params handle.
476    ///
477    /// The caller becomes responsible for destroying the handle.
478    pub fn into_raw(self) -> sys::cusolverDnIRSParams_t {
479        let handle = self.handle;
480        std::mem::forget(self);
481        handle
482    }
483}
484
485impl Drop for IrsParams {
486    fn drop(&mut self) {
487        unsafe {
488            if let Err(err) = try_ffi!(sys::cusolverDnIRSParamsDestroy(self.handle)) {
489                #[cfg(debug_assertions)]
490                eprintln!("failed to destroy cusolver irs params: {err}");
491            }
492        }
493    }
494}
495
496impl IrsInfos {
497    /// Creates and initializes the `Infos` structure that holds refinement information for an Iterative Refinement Solver (IRS) call.
498    /// Such information includes the total number of iterations needed to converge (`Niters`), the number of outer iterations (meaningful when a two-level preconditioner such as [`IrsRefinement::ClassicalGmres`] is used), the maximum number of iterations allowed for that call, and a pointer to the convergence-history residual norm matrix.
499    /// Construct the `Infos` structure before calling an IRS solver.
500    /// The `Infos` structure is valid for only one call to an IRS solver, since it holds information about that solve; each solve requires its own `Infos` structure.
501    ///
502    /// # Errors
503    ///
504    /// Returns an error if cuSOLVER cannot allocate the required resources
505    /// or does not return a valid handle.
506    pub fn create() -> Result<Self> {
507        let mut handle = ptr::null_mut();
508        unsafe {
509            try_ffi!(sys::cusolverDnIRSInfosCreate(&raw mut handle))?;
510        }
511        if handle.is_null() {
512            return Err(Error::NullHandle);
513        }
514        Ok(Self {
515            handle,
516            residual_history_requested: false,
517        })
518    }
519
520    /// Returns the total number of iterations performed by the IRS solver.
521    /// If this value is negative, the IRS solver did not converge. If fallback to full precision was enabled, the solver fell back to a full-precision solution.
522    /// See [`xgesv`] and [`xgels`] for the meaning of negative `niters` values.
523    ///
524    /// # Errors
525    ///
526    /// Returns an error if cuSOLVER rejects the `Infos` structure.
527    pub fn niters(&self) -> Result<i32> {
528        let mut value = 0;
529        unsafe {
530            try_ffi!(sys::cusolverDnIRSInfosGetNiters(
531                self.as_raw(),
532                &raw mut value,
533            ))?;
534        }
535        Ok(value)
536    }
537
538    /// Returns the number of iterations performed by the outer refinement loop of the IRS solver.
539    /// For one-level solvers such as [`IrsRefinement::Classical`] or [`IrsRefinement::Gmres`], this is the same as `Niters`.
540    /// For two-level solvers such as [`IrsRefinement::ClassicalGmres`] or [`IrsRefinement::GmresGmres`], this is the number of outer-loop iterations.
541    /// See [`IrsRefinement`] for refinement mode details.
542    ///
543    /// # Errors
544    ///
545    /// Returns an error if cuSOLVER rejects the `Infos` structure.
546    pub fn outer_niters(&self) -> Result<i32> {
547        let mut value = 0;
548        unsafe {
549            try_ffi!(sys::cusolverDnIRSInfosGetOuterNiters(
550                self.as_raw(),
551                &raw mut value,
552            ))?;
553        }
554        Ok(value)
555    }
556
557    /// Returns the maximum number of iterations allowed for the corresponding IRS solver call.
558    /// Setting used when that call happened, distinct from [`IrsParams::max_iterations`], which returns the current setting in the `params` configuration structure.
559    /// The `params` structure can be reused for many IRS solver calls.
560    /// The allowed `MaxIters` value can change between calls, while this `Infos` structure contains information about one particular call and cannot be reused for different calls.
561    ///
562    /// # Errors
563    ///
564    /// Returns an error if cuSOLVER rejects the `Infos` structure.
565    pub fn max_iterations(&self) -> Result<i32> {
566        let mut value = 0;
567        unsafe {
568            try_ffi!(sys::cusolverDnIRSInfosGetMaxIters(
569                self.as_raw(),
570                &raw mut value,
571            ))?;
572        }
573        Ok(value)
574    }
575
576    /// Asks the IRS solver to store the convergence history
577    /// (residual norms) of the refinement phase so it can later be queried with
578    /// [`IrsInfos::residual_history_f32`] or [`IrsInfos::residual_history_f64`].
579    ///
580    /// # Errors
581    ///
582    /// Returns an error if cuSOLVER rejects the `Infos` structure.
583    pub fn request_residual_history(&mut self) -> Result<()> {
584        unsafe {
585            try_ffi!(sys::cusolverDnIRSInfosRequestResidual(self.as_raw()))?;
586        }
587        self.residual_history_requested = true;
588        Ok(())
589    }
590
591    /// Returns the convergence history stored by the IRS solver when [`IrsInfos::request_residual_history`] was called before solving.
592    /// The residual norm type depends on the input and output precision.
593    /// Double-precision real and complex configurations report `f64` residuals, while single-precision real and complex configurations report `f32` residuals.
594    ///
595    /// The residual history matrix has two columns, even for multiple right-hand sides, and `MaxIters + 1` rows.
596    /// Only the first `OuterNiters + 1` rows contain residual norms; the remaining rows are undefined.
597    /// In the first column, each row `i` contains the total number of iterations performed up to outer iteration `i`.
598    /// In the second column, each row contains the residual norm for that outer iteration.
599    /// Row 0 contains the initial residual before the refinement loop starts, and subsequent rows contain residuals obtained at each outer iteration.
600    /// The history only covers the outer loop.
601    ///
602    /// If the refinement solver was [`IrsRefinement::Classical`] or [`IrsRefinement::Gmres`], then `OuterNiters == Niters`, and there are `Niters + 1` rows of norms corresponding to the `Niters` outer iterations.
603    ///
604    /// If the refinement solver was [`IrsRefinement::ClassicalGmres`] or [`IrsRefinement::GmresGmres`], then `OuterNiters <= Niters` corresponds to the outer iterations performed by the outer refinement loop.
605    /// There are `OuterNiters + 1` residual norms. Row `i` corresponds to outer iteration `i`; the first column contains the total number of outer and inner iterations performed up to that step, and the second column contains the residual norm at that step.
606    ///
607    /// For example, if [`IrsRefinement::ClassicalGmres`] needs 3 outer iterations to converge and 4, 3, and 3 inner iterations at each outer iteration, it performs 10 total iterations.
608    /// Row 0 corresponds to the first residual before the refinement start, so it has 0 in its first column.
609    /// Row 1 corresponds to outer iteration 1 and contains 4 in its first column, row 2 contains 7, and row 3 contains 10.
610    ///
611    /// In summary, let `ldh = MaxIters + 1`, the leading dimension of the residual matrix. Then `residual_history[i]` contains the total number of iterations performed at outer iteration `i`, and `residual_history[i + ldh]` contains the residual norm at that outer iteration.
612    ///
613    /// # Errors
614    ///
615    /// Returns an error if residual history was not requested before solving,
616    /// or if cuSOLVER rejects the `Infos` structure.
617    pub fn residual_history_f32(&self) -> Result<ResidualHistory<f32>> {
618        if !self.residual_history_requested {
619            return Err(Error::InvalidPrecisionConfiguration);
620        }
621        let (leading_dimension, valid_rows) = self.residual_history_layout()?;
622        let mut history = ptr::null_mut();
623        unsafe {
624            try_ffi!(sys::cusolverDnIRSInfosGetResidualHistory(
625                self.as_raw(),
626                &raw mut history,
627            ))?;
628            Ok(copy_residual_history(
629                history.cast::<f32>(),
630                leading_dimension,
631                valid_rows,
632            ))
633        }
634    }
635
636    /// Returns the convergence history stored by the IRS solver when [`IrsInfos::request_residual_history`] was called before solving.
637    /// The residual norm type depends on the input and output precision.
638    /// Double-precision real and complex configurations report `f64` residuals, while single-precision real and complex configurations report `f32` residuals.
639    ///
640    /// The residual history matrix has two columns, even for multiple right-hand sides, and `MaxIters + 1` rows.
641    /// Only the first `OuterNiters + 1` rows contain residual norms; the remaining rows are undefined.
642    /// In the first column, each row `i` contains the total number of iterations performed up to outer iteration `i`.
643    /// In the second column, each row contains the residual norm for that outer iteration.
644    /// Row 0 contains the initial residual before the refinement loop starts, and subsequent rows contain residuals obtained at each outer iteration.
645    /// The history only covers the outer loop.
646    ///
647    /// If the refinement solver was [`IrsRefinement::Classical`] or [`IrsRefinement::Gmres`], then `OuterNiters == Niters`, and there are `Niters + 1` rows of norms corresponding to the `Niters` outer iterations.
648    ///
649    /// If the refinement solver was [`IrsRefinement::ClassicalGmres`] or [`IrsRefinement::GmresGmres`], then `OuterNiters <= Niters` corresponds to the outer iterations performed by the outer refinement loop.
650    /// There are `OuterNiters + 1` residual norms. Row `i` corresponds to outer iteration `i`; the first column contains the total number of outer and inner iterations performed up to that step, and the second column contains the residual norm at that step.
651    ///
652    /// For example, if [`IrsRefinement::ClassicalGmres`] needs 3 outer iterations to converge and 4, 3, and 3 inner iterations at each outer iteration, it performs 10 total iterations.
653    /// Row 0 corresponds to the first residual before the refinement start, so it has 0 in its first column.
654    /// Row 1 corresponds to outer iteration 1 and contains 4 in its first column, row 2 contains 7, and row 3 contains 10.
655    ///
656    /// In summary, let `ldh = MaxIters + 1`, the leading dimension of the residual matrix. Then `residual_history[i]` contains the total number of iterations performed at outer iteration `i`, and `residual_history[i + ldh]` contains the residual norm at that outer iteration.
657    ///
658    /// # Errors
659    ///
660    /// Returns an error if residual history was not requested before solving,
661    /// or if cuSOLVER rejects the `Infos` structure.
662    pub fn residual_history_f64(&self) -> Result<ResidualHistory<f64>> {
663        if !self.residual_history_requested {
664            return Err(Error::InvalidPrecisionConfiguration);
665        }
666        let (leading_dimension, valid_rows) = self.residual_history_layout()?;
667        let mut history = ptr::null_mut();
668        unsafe {
669            try_ffi!(sys::cusolverDnIRSInfosGetResidualHistory(
670                self.as_raw(),
671                &raw mut history,
672            ))?;
673            Ok(copy_residual_history(
674                history.cast::<f64>(),
675                leading_dimension,
676                valid_rows,
677            ))
678        }
679    }
680
681    pub fn as_raw(&self) -> sys::cusolverDnIRSInfos_t {
682        self.handle
683    }
684
685    /// Takes ownership of a raw cuSOLVER IRS infos handle.
686    ///
687    /// # Safety
688    ///
689    /// `handle` must be a valid `cusolverDnIRSInfos_t` created by cuSOLVER.
690    /// The returned wrapper takes ownership and will destroy it with
691    /// `cusolverDnIRSInfosDestroy`; no other owner may destroy or keep using it.
692    pub unsafe fn from_raw(handle: sys::cusolverDnIRSInfos_t) -> Result<Self> {
693        if handle.is_null() {
694            return Err(Error::NullHandle);
695        }
696        Ok(Self {
697            handle,
698            residual_history_requested: false,
699        })
700    }
701
702    /// Releases ownership and returns the raw cuSOLVER IRS infos handle.
703    ///
704    /// The caller becomes responsible for destroying the handle.
705    pub fn into_raw(self) -> sys::cusolverDnIRSInfos_t {
706        let handle = self.handle;
707        std::mem::forget(self);
708        handle
709    }
710
711    fn residual_history_layout(&self) -> Result<(usize, usize)> {
712        let leading_dimension = self
713            .max_iterations()?
714            .checked_add(1)
715            .ok_or(Error::InvalidResidualHistory)
716            .and_then(|value| {
717                usize::try_from(value).map_err(|_| Error::OutOfRange {
718                    name: "residual history leading dimension".into(),
719                })
720            })?;
721        let valid_rows = self
722            .outer_niters()?
723            .checked_add(1)
724            .ok_or(Error::InvalidResidualHistory)
725            .and_then(|value| {
726                usize::try_from(value).map_err(|_| Error::OutOfRange {
727                    name: "residual history rows".into(),
728                })
729            })?;
730
731        if valid_rows > leading_dimension {
732            return Err(Error::InvalidResidualHistory);
733        }
734
735        Ok((leading_dimension, valid_rows))
736    }
737}
738
739impl Drop for IrsInfos {
740    fn drop(&mut self) {
741        unsafe {
742            if let Err(err) = try_ffi!(sys::cusolverDnIRSInfosDestroy(self.handle)) {
743                #[cfg(debug_assertions)]
744                eprintln!("failed to destroy cusolver irs infos: {err}");
745            }
746        }
747    }
748}
749
750pub fn xgesv_buffer_size<T: DataTypeLike>(
751    ctx: &Context,
752    params: &mut IrsParams,
753    n: usize,
754    nrhs: usize,
755) -> Result<usize> {
756    ctx.bind()?;
757    if n == 0 || nrhs == 0 {
758        return Err(Error::InvalidMatrixShape);
759    }
760    params.ensure_type_precision::<T>()?;
761    let mut workspace_bytes = 0;
762    unsafe {
763        try_ffi!(sys::cusolverDnIRSXgesv_bufferSize(
764            ctx.as_raw(),
765            params.as_raw(),
766            to_i32(n, "n")?,
767            to_i32(nrhs, "nrhs")?,
768            &raw mut workspace_bytes,
769        ))?;
770    }
771    to_usize(workspace_bytes, "workspace size")
772}
773
774/// Provides the same solve as the typed cuSOLVER `gesv` entry
775/// points, but through a generic Rust wrapper that exposes IRS configuration
776/// and reporting more directly.
777/// [`xgesv`] allows additional control of the solver parameters such as setting:
778///
779/// * the main precision (input/output precision) of the solver
780/// * the lowest precision to be used internally by the solver
781/// * the refinement solver type
782/// * the maximum allowed number of iterations in the refinement phase
783/// * the tolerance of the refinement solver
784/// * the fallback to main precision
785/// * and more
786///
787/// through [`IrsParams`] and its helper methods.
788/// Moreover, [`xgesv`] provides additional output information such as the convergence history (for example, residual norms) at each iteration and the number of iterations needed to converge.
789/// [`IrsInfos`] exposes the information reported for a particular solve.
790///
791/// The returned value describes the solving process.
792/// `Ok` indicates that the solve finished successfully. An error indicates that one of the arguments is incorrect, that the parameter or info structures are misconfigured, or that the solve did not finish successfully.
/// Check the returned iteration count (`niters`) and `dev_info` for additional error details.
/// Provide the required device workspace through `device_workspace`.
795/// Query the required byte count with [`xgesv_buffer_size`].
796/// Apply any required configuration through the parameter structure before calling [`xgesv_buffer_size`] so the workspace size matches that configuration.
797///
798/// Tensor Float (TF32), introduced with NVIDIA Ampere architecture GPUs, is the most robust tensor core accelerated compute mode for the iterative refinement solver.
799/// It solves a broad range of HPC problems and can provide up to 4x and 5x
800/// speedups for real and complex systems, respectively.
801/// On Volta and Turing architecture GPUs, half precision tensor core acceleration is recommended.
802/// In cases where the iterative refinement solver fails to converge to the desired accuracy (main precision, input/output data precision), it is recommended to use main precision as internal lowest precision.
803///
804/// The following table provides all possible lowest-precision values corresponding to the input/output data type.
805/// If the lowest precision matches the input/output data type, the main
806/// precision factorization is used.
807///
808/// **Supported input/output data type and lower precision for the IRS solver**
809///
810/// | **input/output Data Type (for example, main precision)** | **Supported values for the lowest precision** |
811/// | --- | --- |
812/// | [`PrecisionType::C64F`] | [`PrecisionType::C64F`], [`PrecisionType::C32F`], [`PrecisionType::C16F`], [`PrecisionType::C16Bf`], [`PrecisionType::CTf32`] |
813/// | [`PrecisionType::C32F`] | [`PrecisionType::C32F`], [`PrecisionType::C16F`], [`PrecisionType::C16Bf`], [`PrecisionType::CTf32`] |
814/// | [`PrecisionType::R64F`] | [`PrecisionType::R64F`], [`PrecisionType::R32F`], [`PrecisionType::R16F`], [`PrecisionType::R16Bf`], [`PrecisionType::RTf32`] |
815/// | [`PrecisionType::R32F`] | [`PrecisionType::R32F`], [`PrecisionType::R16F`], [`PrecisionType::R16Bf`], [`PrecisionType::RTf32`] |
816///
817/// [`xgesv_buffer_size`] returns the required workspace size in bytes for the
818/// current [`IrsParams`] configuration.
819///
820/// # Errors
821///
822/// Returns an error if cuSOLVER rejects the matrix dimensions, leading
823/// dimensions, parameter structure, info structure, or workspace. The workspace
824/// can become invalid if [`xgesv_buffer_size`] is called and then an IRS
825/// configuration value, such as the lowest precision, is changed. cuSOLVER can
826/// also report an error if host memory allocation fails, if the selected IRS
827/// configuration is not supported on the current GPU architecture, if the
828/// library has not been initialized, or if the solve ends with an internal or
/// numerical failure. Check the returned iteration count (`niters`) and `dev_info` for additional solver details.
830pub fn xgesv<T: DataTypeLike>(
831    ctx: &Context,
832    params: &mut IrsParams,
833    infos: &IrsInfos,
834    n: usize,
835    nrhs: usize,
836    a: MatrixMut<'_, T>,
837    b: MatrixRef<'_, T>,
838    x: MatrixMut<'_, T>,
839    device_workspace: &mut DeviceMemory<u8>,
840    dev_info: &mut DeviceMemory<i32>,
841) -> Result<i32> {
842    ctx.bind()?;
843    validate_matrix(n, n, a.data.len(), a.leading_dimension)?;
844    validate_matrix(n, nrhs, b.data.len(), b.leading_dimension)?;
845    validate_matrix(n, nrhs, x.data.len(), x.leading_dimension)?;
846    require_info_buffer(dev_info)?;
847    let workspace_bytes = xgesv_buffer_size::<T>(ctx, params, n, nrhs)?;
848    require_workspace_bytes(device_workspace.byte_len(), workspace_bytes)?;
849    let mut niters = 0;
850    unsafe {
851        try_ffi!(sys::cusolverDnIRSXgesv(
852            ctx.as_raw(),
853            params.as_raw(),
854            infos.as_raw(),
855            to_i32(n, "n")?,
856            to_i32(nrhs, "nrhs")?,
857            a.data.as_mut_ptr() as _,
858            to_i32(a.leading_dimension, "ldda")?,
859            b.data.as_ptr() as _,
860            to_i32(b.leading_dimension, "lddb")?,
861            x.data.as_mut_ptr() as _,
862            to_i32(x.leading_dimension, "lddx")?,
863            device_workspace.as_mut_ptr() as _,
864            to_u64(workspace_bytes, "lwork_bytes")?,
865            &raw mut niters,
866            dev_info.as_mut_ptr() as _,
867        ))?;
868    }
869    Ok(niters)
870}
871
872pub fn xgels_buffer_size<T: DataTypeLike>(
873    ctx: &Context,
874    params: &mut IrsParams,
875    m: usize,
876    n: usize,
877    nrhs: usize,
878) -> Result<usize> {
879    ctx.bind()?;
880    if m == 0 || n == 0 || nrhs == 0 || n > m {
881        return Err(Error::InvalidMatrixShape);
882    }
883    params.ensure_type_precision::<T>()?;
884    let mut workspace_bytes = 0;
885    unsafe {
886        try_ffi!(sys::cusolverDnIRSXgels_bufferSize(
887            ctx.as_raw(),
888            params.as_raw(),
889            to_i32(m, "m")?,
890            to_i32(n, "n")?,
891            to_i32(nrhs, "nrhs")?,
892            &raw mut workspace_bytes,
893        ))?;
894    }
895    to_usize(workspace_bytes, "workspace size")
896}
897
898/// Provides the same solve as the typed cuSOLVER `gels` entry
899/// points, but through a generic Rust wrapper that exposes IRS configuration
900/// and reporting more directly.
901/// [`xgels`] allows additional control of the solver parameters such as setting:
902///
/// * the main precision (input/output precision) of the solver
/// * the lowest precision to be used internally by the solver
905/// * the refinement solver type
906/// * the maximum allowed number of iterations in the refinement phase
907/// * the tolerance of the refinement solver
908/// * the fallback to main precision
909/// * and others
910///
911/// through [`IrsParams`] and its helper methods.
912/// Moreover, [`xgels`] provides additional output information such as the convergence history (for example, residual norms) at each iteration and the number of iterations needed to converge.
913/// [`IrsInfos`] exposes the information reported for a particular solve.
914///
915/// The returned value describes the solving process.
916/// `Ok` indicates that the solve finished successfully. An error indicates that one of the arguments is incorrect, that the parameter or info structures are misconfigured, or that the solve did not finish successfully.
/// Check the returned iteration count (`niters`) and `dev_info` for additional error details.
/// Provide the required device workspace through `device_workspace`.
919/// Query the required byte count with [`xgels_buffer_size`].
920/// Apply any required configuration through the parameter structure before calling [`xgels_buffer_size`] so the workspace size matches that configuration.
921///
922/// The following table provides all possible lowest-precision values corresponding to the input/output data type.
923/// If the lowest precision matches the input/output data type, the main
924/// precision factorization is used.
925///
926/// Tensor Float (TF32), introduced with NVIDIA Ampere architecture GPUs, is the most robust tensor core accelerated compute mode for the iterative refinement solver.
927/// It solves a broad range of HPC problems and can provide up to 4x and 5x
928/// speedups for real and complex systems, respectively.
929/// On Volta and Turing architecture GPUs, half precision tensor core acceleration is recommended.
930/// In cases where the iterative refinement solver fails to converge to the desired accuracy (main precision, input/output data precision), it is recommended to use main precision as internal lowest precision.
931///
932/// **Supported input/output data type and lower precision for the IRS solver**
933///
934/// | **input/output Data Type (for example, main precision)** | **Supported values for the lowest precision** |
935/// | --- | --- |
936/// | [`PrecisionType::C64F`] | [`PrecisionType::C64F`], [`PrecisionType::C32F`], [`PrecisionType::C16F`], [`PrecisionType::C16Bf`], [`PrecisionType::CTf32`] |
937/// | [`PrecisionType::C32F`] | [`PrecisionType::C32F`], [`PrecisionType::C16F`], [`PrecisionType::C16Bf`], [`PrecisionType::CTf32`] |
938/// | [`PrecisionType::R64F`] | [`PrecisionType::R64F`], [`PrecisionType::R32F`], [`PrecisionType::R16F`], [`PrecisionType::R16Bf`], [`PrecisionType::RTf32`] |
939/// | [`PrecisionType::R32F`] | [`PrecisionType::R32F`], [`PrecisionType::R16F`], [`PrecisionType::R16Bf`], [`PrecisionType::RTf32`] |
940///
941/// [`xgels_buffer_size`] returns the required workspace size in bytes for the
942/// current [`IrsParams`] configuration.
943///
944/// # Errors
945///
946/// Returns an error if cuSOLVER rejects the matrix dimensions, leading
947/// dimensions, parameter structure, info structure, or workspace. The workspace
948/// can become invalid if [`xgels_buffer_size`] is called and then an IRS
949/// configuration value, such as the lowest precision, is changed. cuSOLVER can
950/// also report an error if host memory allocation fails, if the selected IRS
951/// configuration is not supported on the current GPU architecture, if the
952/// library has not been initialized, or if the solve ends with an internal or
/// numerical failure. Check the returned iteration count (`niters`) and `dev_info` for additional solver details.
954pub fn xgels<T: DataTypeLike>(
955    ctx: &Context,
956    params: &mut IrsParams,
957    infos: &IrsInfos,
958    m: usize,
959    n: usize,
960    nrhs: usize,
961    a: MatrixMut<'_, T>,
962    b: MatrixRef<'_, T>,
963    x: MatrixMut<'_, T>,
964    device_workspace: &mut DeviceMemory<u8>,
965    dev_info: &mut DeviceMemory<i32>,
966) -> Result<i32> {
967    ctx.bind()?;
968    if n > m {
969        return Err(Error::InvalidMatrixShape);
970    }
971    validate_matrix(m, n, a.data.len(), a.leading_dimension)?;
972    validate_matrix(m, nrhs, b.data.len(), b.leading_dimension)?;
973    validate_matrix(n, nrhs, x.data.len(), x.leading_dimension)?;
974    require_info_buffer(dev_info)?;
975    let workspace_bytes = xgels_buffer_size::<T>(ctx, params, m, n, nrhs)?;
976    require_workspace_bytes(device_workspace.byte_len(), workspace_bytes)?;
977    let mut niters = 0;
978    unsafe {
979        try_ffi!(sys::cusolverDnIRSXgels(
980            ctx.as_raw(),
981            params.as_raw(),
982            infos.as_raw(),
983            to_i32(m, "m")?,
984            to_i32(n, "n")?,
985            to_i32(nrhs, "nrhs")?,
986            a.data.as_mut_ptr() as _,
987            to_i32(a.leading_dimension, "ldda")?,
988            b.data.as_ptr() as _,
989            to_i32(b.leading_dimension, "lddb")?,
990            x.data.as_mut_ptr() as _,
991            to_i32(x.leading_dimension, "lddx")?,
992            device_workspace.as_mut_ptr() as _,
993            to_u64(workspace_bytes, "lwork_bytes")?,
994            &raw mut niters,
995            dev_info.as_mut_ptr() as _,
996        ))?;
997    }
998    Ok(niters)
999}
1000
1001fn require_info_buffer(dev_info: &DeviceMemory<i32>) -> Result<()> {
1002    if dev_info.is_empty() {
1003        return Err(Error::InvalidVectorShape);
1004    }
1005    Ok(())
1006}
1007
1008fn require_workspace_bytes(actual: usize, required: usize) -> Result<()> {
1009    if actual < required {
1010        return Err(Error::InsufficientWorkspaceSize { required, actual });
1011    }
1012    Ok(())
1013}
1014
1015unsafe fn copy_residual_history<T: Copy>(
1016    history: *const T,
1017    leading_dimension: usize,
1018    valid_rows: usize,
1019) -> ResidualHistory<T> {
1020    let history = unsafe { slice::from_raw_parts(history, leading_dimension.saturating_mul(2)) };
1021    let mut rows = Vec::with_capacity(valid_rows);
1022    for row in 0..valid_rows {
1023        rows.push(ResidualHistoryEntry {
1024            total_iterations: history[row],
1025            residual_norm: history[row + leading_dimension],
1026        });
1027    }
1028    ResidualHistory {
1029        rows,
1030        leading_dimension,
1031    }
1032}
1033
1034fn validate_matrix(rows: usize, cols: usize, len: usize, lda: usize) -> Result<()> {
1035    if rows == 0 || cols == 0 {
1036        return Err(Error::InvalidMatrixShape);
1037    }
1038    if lda < rows {
1039        return Err(Error::InvalidLeadingDimension);
1040    }
1041    let required = lda.checked_mul(cols).ok_or(Error::InvalidMatrixShape)?;
1042    if len < required {
1043        return Err(Error::InvalidMatrixShape);
1044    }
1045    Ok(())
1046}
1047
#[cfg(all(test, feature = "testing"))]
mod tests {
    use singe_cuda::memory::DeviceMemory;

    use super::*;
    use crate::testing::setup_context_if_available;

    /// Solves `diag(2, 4) * x = [6, 8]` in `f32` and expects `x = [3, 2]`.
    #[test]
    fn test_xgesv_solves_diagonal_system() -> Result<()> {
        // Nothing to check when the helper yields no context.
        let Some(ctx) = setup_context_if_available()? else {
            return Ok(());
        };
        let mut params = IrsParams::create()?;
        let infos = IrsInfos::create()?;

        // The matrix is diagonal, so its storage order does not matter.
        let mut matrix = DeviceMemory::from_slice(&[
            2.0_f32, 0.0, //
            0.0, 4.0,
        ])?;
        let rhs = DeviceMemory::from_slice(&[
            6.0_f32, //
            8.0,
        ])?;
        let mut solution = DeviceMemory::create(2)?;
        let required_bytes = xgesv_buffer_size::<f32>(&ctx, &mut params, 2, 1)?;
        // Allocate at least one byte even if the query reports zero.
        let mut workspace = DeviceMemory::create(required_bytes.max(1))?;
        let mut dev_info = DeviceMemory::create(1)?;

        // The iteration count is not part of this check.
        let _niters = xgesv(
            &ctx,
            &mut params,
            &infos,
            2,
            1,
            MatrixMut::new(&mut matrix, 2),
            MatrixRef::new(&rhs, 2),
            MatrixMut::new(&mut solution, 2),
            &mut workspace,
            &mut dev_info,
        )?;

        assert_eq!(dev_info.copy_to_host_vec()?, vec![0]);
        assert_eq!(solution.copy_to_host_vec()?, vec![3.0, 2.0]);
        Ok(())
    }
}
singe_cusolver/irs.rs

singe_cusolver/
irs.rs