Skip to main content

feral_amd/
lib.rs

1//! Approximate Minimum Degree (AMD) fill-reducing ordering.
2//!
3//! Standalone implementation of the in-place quotient-graph AMD
4//! algorithm (Amestoy, Davis & Duff 1996, 2004). See the crate
5//! README and `dev/plans/ordering-amd-upgrade.md` for scope and
6//! references.
7//!
8//! Slice B is complete: mass elimination (Commit 9) and
9//! supervariable detection (Commit 10) are both live, so the
10//! ordering matches SuiteSparse / faer on the full oracle
11//! fixture suite.
12//!
13//! The public surface conforms to the FERAL ordering-crate
14//! contract (`dev/plans/ordering-crate-contract.md`). `CscPattern`,
15//! `OrderingStats`, `OrderingError`, and `CONTRACT_VERSION` are
16//! re-exported from `feral-ordering-core`.
17
18#![forbid(unsafe_code)]
19#![deny(missing_docs)]
20
21mod stats;
22
23pub use feral_ordering_core::{CscPattern, OrderingError, OrderingStats, CONTRACT_VERSION};
24pub use stats::AmdStats;
25
26use feral_ordering_core::quotient_graph::{
27    finalize_permutation, order, run_elimination, MinDegree, Workspace, WorkspaceOptions,
28};
29use std::time::Instant;
30
31/// Tunable parameters for AMD ordering.
32///
33/// Defaults match faer / SuiteSparse: `aggressive = true`,
34/// `dense_alpha = 10.0`.
35#[derive(Debug, Clone)]
36pub struct AmdOptions {
37    /// Enable aggressive element absorption in the Pass-2 degree
38    /// loop (faer `amd.rs:404-407`).
39    pub aggressive: bool,
40    /// Dense-row threshold multiplier. A variable with initial
41    /// degree exceeding `min(max(16, floor(dense_alpha * sqrt(n))), n)`
42    /// is deferred to the end of the ordering — the `max(16)` floor is
43    /// applied before the `min(n)` cap, matching faer `amd.rs:173-179`
44    /// / SuiteSparse AMD (the order matters: it guarantees the
45    /// threshold is `<= n`). A negative value uses a raw threshold of
46    /// `n - 2` in place of `dense_alpha * sqrt(n)`, with the same
47    /// `max(16)`/`min(n)` clamps; for `n >= 18` that is exactly
48    /// `n - 2`, suppressing deferral for all but true hubs of degree
49    /// `n - 1`.
50    pub dense_alpha: f64,
51}
52
53impl Default for AmdOptions {
54    fn default() -> Self {
55        Self {
56            aggressive: true,
57            dense_alpha: 10.0,
58        }
59    }
60}
61
62/// Compute a fill-reducing AMD ordering.
63///
64/// Returns a permutation `perm` (new-to-old) such that factoring
65/// `P·A·Pᵀ` with `P[k] = perm[k]` produces less fill than the
66/// natural ordering. The input must be the full symmetric pattern
67/// (both halves present).
68pub fn amd_order(pattern: &CscPattern<'_>) -> Result<Vec<i32>, OrderingError> {
69    amd_order_opts(pattern, &AmdOptions::default()).map(|(perm, _)| perm)
70}
71
72/// Compute an AMD ordering and return the crate-specific diagnostic
73/// counters.
74///
75/// See [`amd_order`] and [`AmdStats`]. Callers that also need the
76/// shared [`OrderingStats`] (wall time, fill estimate) should use
77/// [`amd_order_full`] instead.
78pub fn amd_order_with_stats(
79    pattern: &CscPattern<'_>,
80) -> Result<(Vec<i32>, AmdStats), OrderingError> {
81    amd_order_opts(pattern, &AmdOptions::default())
82}
83
84/// Compute an AMD ordering with explicit options.
85///
86/// Returns `(perm, amd_stats)`. See [`amd_order_full`] for the
87/// contract-conforming three-tuple return.
88pub fn amd_order_opts(
89    pattern: &CscPattern<'_>,
90    opts: &AmdOptions,
91) -> Result<(Vec<i32>, AmdStats), OrderingError> {
92    amd_order_full(pattern, opts).map(|(perm, _, amd_stats)| (perm, amd_stats))
93}
94
95/// Contract-conforming ordering producer.
96///
97/// Signature matches the shape every FERAL ordering crate must
98/// expose per `dev/plans/ordering-crate-contract.md`: input is a
99/// full-symmetric [`CscPattern`] and options; output is a
100/// three-tuple of `(perm, OrderingStats, crate-stats)`, with
101/// errors in [`OrderingError`].
102///
103/// `OrderingStats.time_us` is the wall-clock time of this call.
104/// `fill_estimate` and `flop_estimate` are left as `None` for AMD —
105/// the per-crate [`AmdStats`] carries `ndiv` / `nms_lu` / `nms_ldl`
106/// flop counters that may be surfaced here in a future revision
107/// without bumping the contract.
108pub fn amd_order_full(
109    pattern: &CscPattern<'_>,
110    opts: &AmdOptions,
111) -> Result<(Vec<i32>, OrderingStats, AmdStats), OrderingError> {
112    let t0 = Instant::now();
113    let ws_opts = WorkspaceOptions {
114        dense_alpha: opts.dense_alpha,
115    };
116    let (perm, diag) = order::<MinDegree>(pattern, &ws_opts, opts.aggressive)?;
117    let amd_stats = AmdStats {
118        ncmpa: diag.ncmpa,
119        n_clear_flag: 0,
120        n_mass_elim: diag.n_mass_elim,
121        n_supervar_merge: diag.n_supervar_merge,
122        n_dense_deferred: diag.ndense.max(0) as u32,
123        ndiv: diag.flops.ndiv.max(0.0) as u64,
124        nms_lu: diag.flops.nms_lu.max(0.0) as u64,
125        nms_ldl: diag.flops.nms_ldl.max(0.0) as u64,
126    };
127    let ordering_stats = OrderingStats {
128        time_us: t0.elapsed().as_micros() as u64,
129        fill_estimate: None,
130        flop_estimate: None,
131    };
132    Ok((perm, ordering_stats, amd_stats))
133}
134
135/// Per-sub-stage wall-clock breakdown of one AMD call.
136///
137/// Returned by [`amd_order_substages`] alongside the permutation.
138/// All fields are wall-clock microseconds for that single call.
139/// Sum is approximately equal to the total time reported by
140/// [`amd_order_full`]'s [`OrderingStats::time_us`], modulo a few
141/// hundred ns of `Instant::now()` overhead.
142///
143/// Used by the small-n diagnostic probe
144/// `feral::bin::diag_amd_substages` to attribute the per-call AMD
145/// cost between workspace allocation, the main elimination loop,
146/// and permutation finalisation. See
147/// `dev/plans/phase-2.13-tail-diagnostic.md` step 5.
148#[derive(Debug, Clone, Copy, Default)]
149pub struct AmdSubstages {
150    /// Time spent in `AmdWorkspace::new`: input ingest, vector
151    /// allocations, initial degree lists, dense-row deferral.
152    pub workspace_new_us: u64,
153    /// Time spent in the main pivot/eliminate/finalize loop
154    /// (`select_pivot` + `create_element` + `finalize_step`).
155    pub run_elimination_us: u64,
156    /// Time spent in the assembly-tree postorder and final
157    /// permutation emission.
158    pub finalize_permutation_us: u64,
159}
160
161/// Compute an AMD ordering and report a per-sub-stage timing
162/// breakdown.
163///
164/// Behaves identically to [`amd_order_full`] but additionally
165/// returns an [`AmdSubstages`] split of the wall-clock time across
166/// `workspace::new`, `run_elimination`, and `finalize_permutation`.
167/// Diagnostic-only — production callers should keep using
168/// [`amd_order`] / [`amd_order_full`]. The `Instant::now()` calls
169/// add at most ~100 ns vs the un-profiled path, but the API surface
170/// is intentionally separate to avoid polluting the stable
171/// contract-conforming function.
172pub fn amd_order_substages(
173    pattern: &CscPattern<'_>,
174    opts: &AmdOptions,
175) -> Result<(Vec<i32>, AmdSubstages), OrderingError> {
176    let t = Instant::now();
177    let ws_opts = WorkspaceOptions {
178        dense_alpha: opts.dense_alpha,
179    };
180    let mut ws = Workspace::new(pattern, &ws_opts)?;
181    let workspace_new_us = t.elapsed().as_micros() as u64;
182
183    let t = Instant::now();
184    let _flops = run_elimination(&mut ws, opts.aggressive)?;
185    let run_elimination_us = t.elapsed().as_micros() as u64;
186
187    let t = Instant::now();
188    let perm = finalize_permutation(&mut ws);
189    let finalize_permutation_us = t.elapsed().as_micros() as u64;
190
191    Ok((
192        perm,
193        AmdSubstages {
194            workspace_new_us,
195            run_elimination_us,
196            finalize_permutation_us,
197        },
198    ))
199}