Skip to main content

ferrolearn_preprocess/
random_projection.rs

1//! Random projection transformers for dimensionality reduction.
2//!
3//! Random projections preserve pairwise distances in expectation (Johnson-Lindenstrauss lemma).
4//!
5//! - [`GaussianRandomProjection`] — dense Gaussian random matrix
6//! - [`SparseRandomProjection`] — sparse random matrix with `{-1, 0, +1}` entries
7//!
8//! ## REQ status
9//!
10//! Translation target: scikit-learn 1.5.2 `GaussianRandomProjection` +
11//! `SparseRandomProjection` (`sklearn/random_projection.py`). Tracking: #1387.
12//! Each REQ is BINARY — SHIPPED (impl + non-test consumer + tests + green
13//! verification) or NOT-STARTED (with a concrete open blocker). RNG-COUPLED
14//! unit: ferrolearn uses Rust `SmallRng`, sklearn numpy `RandomState`, so exact
15//! projection-matrix VALUE parity is impossible (carve-out); the SHIPPED claims
16//! are the projection DISTRIBUTION/scale (vs the sklearn formula) + transform
17//! contract + determinism, not bit-exact matrix values.
18//!
19//! | REQ | Scope | Status | Evidence / Blocker |
20//! |-----|-------|--------|--------------------|
21//! | REQ-1 | Gaussian projection DISTRIBUTION `N(0, 1/n_components)` + transform `X@R` + determinism-given-seed | SHIPPED | [`GaussianRandomProjection`] `fit` scale `1/sqrt(n_components)` matches sklearn `_gaussian_random_matrix` `random_projection.py:200`; empirical variance ≈ 1/k guard + determinism + transform tests in `tests/divergence_random_projection.rs`. Consumer: re-export `lib.rs:167` + `PipelineTransformer` |
22//! | REQ-2 | Sparse projection DISTRIBUTION (support `±sqrt(1/(d·n_components))`, probs `{d/2, 1-d, d/2}`, default density `1/sqrt(n_features)`, `density=1` case) + transform | SHIPPED | [`SparseRandomProjection`] `fit` scale/support/probs match sklearn `_sparse_random_matrix` `:301` + `_check_density` `:148-149`; nonzero-entry == ±scale (exact, tol 1e-12) + default-density + density=1 tests |
23//! | REQ-3 | Error/parameter contracts (n_components==0, density∉(0,1], zero rows, transform ncols, unfitted) | SHIPPED (scoped) | both `fit`/`Fitted*` `transform`; in-module + divergence error tests |
24//! | REQ-4 | Exact projection-matrix VALUE parity (numpy `RandomState` stream) | NOT-STARTED | RNG-coupled CARVE-OUT (Rust `SmallRng` ≠ numpy MT19937), no committed failing test — blocker #1388 |
25//! | REQ-5 | Sparse sampling METHOD (per-row `binomial` + `sample_without_replacement` vs per-entry Bernoulli) | NOT-STARTED | sklearn `random_projection.py:281-301` — blocker #1389 |
26//! | REQ-6 | `n_components='auto'` + `eps` + `johnson_lindenstrauss_min_dim` | SHIPPED | [`johnson_lindenstrauss_min_dim`] = `floor(4*ln(n_samples)/(eps^2/2-eps^3/3))` matches sklearn `random_projection.py:142-143` (int64 truncation), validates `eps∈(0,1)`/`n_samples>0` (`:133,136`); [`NComponents::Auto`] default resolves via JL at `fit` (sklearn `:388-391`) with the `n_components_ <= 0` reject (`:393-397`). Consumer: re-export `lib.rs:167` + both `fit`. Tests `tests/divergence_random_projection_jl_2347.rs` |
27//! | REQ-7 | `components_` `(n_components, n_features)` orientation (#2346) | SHIPPED (orientation) — CSR sparse storage + `dense_output` NOT-STARTED | `components()` / `projection()` now store `(n_components, n_features)` matching sklearn `:419`, transform `X @ components_.T` (`:604,810`); CSR sparse storage + `dense_output` still dense — blocker #1391. Tests `tests/divergence_random_projection_components_orientation_2345.rs` |
28//! | REQ-8 | `inverse_transform` + `compute_inverse_components` | NOT-STARTED | sklearn `random_projection.py:356,431-458` — blocker #1392 |
29//! | REQ-9 | `n_components > n_features` warning + `components_`/`n_components_`/`n_features_in_`/`density_` attrs + `get_feature_names_out` | SHIPPED (attrs) — `DataDimensionalityWarning` emission NOT-STARTED | `n_components_()`/`n_features_in_()`/`density_()` getters + `(n_components,n_features)` `components_` (`:416,419,788`); the `n_components>n_features` `DataDimensionalityWarning` (`:407-414`) is NOT emitted (no warning facade in ferrolearn — fit still proceeds) + `get_feature_names_out` absent — blocker #1393 |
30//! | REQ-10 | PyO3 binding | NOT-STARTED | no `ferrolearn-python` registration — blocker #1394 |
31//! | REQ-11 | ferray substrate | NOT-STARTED | dense `Array2` + `num_traits::Float` only — blocker #1395 |
32
33use ferrolearn_core::error::FerroError;
34use ferrolearn_core::pipeline::{FittedPipelineTransformer, PipelineTransformer};
35use ferrolearn_core::traits::{Fit, FitTransform, Transform};
36use ndarray::{Array1, Array2};
37use num_traits::Float;
38use rand::SeedableRng;
39use rand::rngs::SmallRng;
40use rand_distr::{Distribution, StandardNormal};
41
42// ---------------------------------------------------------------------------
43// johnson_lindenstrauss_min_dim + NComponents
44// ---------------------------------------------------------------------------
45
46/// Find a "safe" number of components to randomly project to.
47///
48/// Mirrors scikit-learn 1.5.2 `johnson_lindenstrauss_min_dim`
49/// (`sklearn/random_projection.py:64-143`). The minimum number of components
50/// that guarantees an `eps`-embedding for `n_samples` points is
51///
52/// ```text
53/// n_components >= 4 * ln(n_samples) / (eps^2 / 2 - eps^3 / 3)
54/// ```
55///
56/// sklearn casts the result with `.astype(np.int64)`
57/// (`random_projection.py:142-143`), which TRUNCATES toward zero (a FLOOR for
58/// the positive values this expression produces). This function matches that
59/// truncation, e.g. `johnson_lindenstrauss_min_dim(1000, 0.1) == 5920`.
60///
61/// # Errors
62///
63/// Returns [`FerroError::InvalidParameter`] if `eps` is not in the open
64/// interval `(0, 1)` (sklearn raises `ValueError`, `random_projection.py:133`),
65/// or if `n_samples == 0` (sklearn `random_projection.py:136-140`).
66#[must_use = "the returned target dimension should be used"]
67pub fn johnson_lindenstrauss_min_dim<F: Float>(
68    n_samples: usize,
69    eps: F,
70) -> Result<usize, FerroError> {
71    // sklearn random_projection.py:133 — reject eps outside ]0, 1[.
72    let zero = F::zero();
73    let one = F::one();
74    if !(eps > zero && eps < one) {
75        return Err(FerroError::InvalidParameter {
76            name: "eps".into(),
77            reason: "the JL bound is defined for eps in (0, 1)".into(),
78        });
79    }
80    // sklearn random_projection.py:136 — reject n_samples <= 0.
81    if n_samples == 0 {
82        return Err(FerroError::InvalidParameter {
83            name: "n_samples".into(),
84            reason: "the JL bound is defined for n_samples greater than zero".into(),
85        });
86    }
87
88    // denominator = eps^2 / 2 - eps^3 / 3 (random_projection.py:142).
89    let two = one + one;
90    let three = two + one;
91    let denominator = (eps * eps / two) - (eps * eps * eps / three);
92
93    let n = F::from(n_samples).unwrap_or_else(F::infinity);
94    let value = F::from(4.0).unwrap_or_else(F::one) * n.ln() / denominator;
95
96    // .astype(np.int64) truncates toward zero (random_projection.py:143).
97    // `value` is positive for n_samples >= 1 and eps in (0, 1), so truncation
98    // equals floor. Guard against a non-finite/negative result defensively.
99    let truncated = value.trunc();
100    if !truncated.is_finite() || truncated < zero {
101        return Err(FerroError::InvalidParameter {
102            name: "n_samples".into(),
103            reason: "the JL bound produced a non-finite target dimension".into(),
104        });
105    }
106    Ok(truncated.to_usize().unwrap_or(0))
107}
108
109/// Target projection dimensionality, mirroring sklearn's `n_components`
110/// parameter which accepts an integer or the string `'auto'`
111/// (`random_projection.py:314-317`).
112#[derive(Debug, Clone, Copy, PartialEq, Eq)]
113pub enum NComponents {
114    /// Resolve `n_components` from the Johnson-Lindenstrauss bound at fit time
115    /// using the configured `eps` (sklearn `n_components='auto'`,
116    /// `random_projection.py:388-391`). This is the sklearn default.
117    Auto,
118    /// An explicit, user-provided number of output dimensions.
119    Fixed(usize),
120}
121
122impl Default for NComponents {
123    /// sklearn defaults `n_components='auto'` (`random_projection.py:326,477`).
124    fn default() -> Self {
125        Self::Auto
126    }
127}
128
129impl From<usize> for NComponents {
130    fn from(value: usize) -> Self {
131        Self::Fixed(value)
132    }
133}
134
135/// Resolve the configured `n_components` to a concrete dimension at fit time.
136///
137/// For [`NComponents::Auto`] this calls [`johnson_lindenstrauss_min_dim`] with
138/// `n_samples` and `eps` (sklearn `random_projection.py:388-391`). For both
139/// branches a resolved value of `0` is rejected — sklearn rejects the `auto`
140/// bound resolving to `<= 0` (`random_projection.py:393-397`) and requires
141/// explicit `n_components >= 1` (`_parameter_constraints`, `:314-315`).
142///
143/// For the [`NComponents::Auto`] path ONLY, a resolved dimension GREATER than
144/// `n_features` is also rejected — sklearn raises `ValueError` when the
145/// JL-resolved `n_components_` exceeds the original space
146/// (`random_projection.py:399-405`). The [`NComponents::Fixed`] path does NOT
147/// error here: sklearn only emits a `DataDimensionalityWarning` for an explicit
148/// `n_components > n_features` (`random_projection.py:407-414`, NOT-STARTED
149/// warning REQ-9 / blocker #1393), so the fixed path proceeds unchanged.
150///
151/// # Errors
152///
153/// Returns [`FerroError::InvalidParameter`] if the resolved dimension is `0`,
154/// if the `Auto` resolved dimension exceeds `n_features`, or if the JL bound's
155/// `eps`/`n_samples` validation fails.
156fn resolve_n_components(
157    n_components: NComponents,
158    eps: f64,
159    n_samples: usize,
160    n_features: usize,
161) -> Result<usize, FerroError> {
162    let resolved = match n_components {
163        NComponents::Fixed(k) => k,
164        NComponents::Auto => johnson_lindenstrauss_min_dim(n_samples, eps)?,
165    };
166    if resolved == 0 {
167        return Err(FerroError::InvalidParameter {
168            name: "n_components".into(),
169            reason: match n_components {
170                NComponents::Auto => format!(
171                    "eps={eps} and n_samples={n_samples} lead to a target dimension of 0 which is invalid"
172                ),
173                NComponents::Fixed(_) => "must be >= 1".into(),
174            },
175        });
176    }
177    // AUTO PATH ONLY (sklearn `random_projection.py:399-405`): a JL-resolved
178    // target dimension larger than the original space is a ValueError. The
179    // Fixed path is intentionally excluded — sklearn only WARNS there (`:407-414`).
180    if matches!(n_components, NComponents::Auto) && resolved > n_features {
181        return Err(FerroError::InvalidParameter {
182            name: "n_components".into(),
183            reason: format!(
184                "eps={eps} and n_samples={n_samples} lead to a target dimension of {resolved} which is larger than the original space with n_features={n_features}"
185            ),
186        });
187    }
188    Ok(resolved)
189}
190
191// ---------------------------------------------------------------------------
192// GaussianRandomProjection
193// ---------------------------------------------------------------------------
194
195/// Gaussian random projection transformer.
196///
197/// Projects data into a lower-dimensional space using a random matrix drawn
198/// from `N(0, 1/n_components)`.
199///
200/// # Examples
201///
202/// ```
203/// use ferrolearn_preprocess::random_projection::GaussianRandomProjection;
204/// use ferrolearn_core::traits::{Fit, Transform};
205/// use ndarray::Array2;
206///
207/// let x = Array2::<f64>::ones((10, 50));
208/// let proj = GaussianRandomProjection::<f64>::new(5);
209/// let fitted = proj.fit(&x, &()).unwrap();
210/// let out = fitted.transform(&x).unwrap();
211/// assert_eq!(out.shape(), &[10, 5]);
212/// ```
213#[derive(Debug, Clone)]
214pub struct GaussianRandomProjection<F> {
215    /// Number of output dimensions, or [`NComponents::Auto`] to resolve via the
216    /// Johnson-Lindenstrauss bound at fit time. Defaults to `Auto`.
217    n_components: NComponents,
218    /// Distortion-rate parameter for the `Auto` JL bound. Defaults to `0.1`
219    /// (sklearn `eps=0.1`, `random_projection.py:328`).
220    eps: f64,
221    /// Optional RNG seed for reproducibility.
222    random_state: Option<u64>,
223    _marker: std::marker::PhantomData<F>,
224}
225
226impl<F: Float + Send + Sync + 'static> GaussianRandomProjection<F> {
227    /// Create a new Gaussian random projection with an explicit `n_components`
228    /// number of output dimensions (`NComponents::Fixed`).
229    #[must_use]
230    pub fn new(n_components: usize) -> Self {
231        Self {
232            n_components: NComponents::Fixed(n_components),
233            eps: 0.1,
234            random_state: None,
235            _marker: std::marker::PhantomData,
236        }
237    }
238
239    /// Create a new Gaussian random projection with `n_components='auto'`
240    /// (sklearn default): the target dimension is resolved at fit time from the
241    /// Johnson-Lindenstrauss bound for the configured `eps`
242    /// (`random_projection.py:388-391`).
243    #[must_use]
244    pub fn new_auto() -> Self {
245        Self {
246            n_components: NComponents::Auto,
247            eps: 0.1,
248            random_state: None,
249            _marker: std::marker::PhantomData,
250        }
251    }
252
253    /// Set the distortion-rate `eps` used by the `Auto` JL bound
254    /// (sklearn `eps`, `random_projection.py:328`).
255    #[must_use]
256    pub fn eps(mut self, eps: f64) -> Self {
257        self.eps = eps;
258        self
259    }
260
261    /// Set the random seed for reproducibility.
262    #[must_use]
263    pub fn random_state(mut self, seed: u64) -> Self {
264        self.random_state = Some(seed);
265        self
266    }
267}
268
269/// Fitted Gaussian random projection holding the projection matrix.
270#[derive(Debug, Clone)]
271pub struct FittedGaussianRandomProjection<F> {
272    /// Projection matrix of shape `(n_components, n_features)` — the sklearn
273    /// `components_` orientation (`random_projection.py:419`).
274    components: Array2<F>,
275    /// Resolved number of output dimensions (`n_components_`).
276    n_components: usize,
277    /// Number of features seen during fit (`n_features_in_`).
278    n_features_in: usize,
279}
280
281impl<F: Float + Send + Sync + 'static> FittedGaussianRandomProjection<F> {
282    /// Return a reference to the projection matrix `components_` of shape
283    /// `(n_components, n_features)` (sklearn `components_`,
284    /// `random_projection.py:419`).
285    #[must_use]
286    pub fn components(&self) -> &Array2<F> {
287        &self.components
288    }
289
290    /// Alias for [`Self::components`] kept for backward compatibility. Returns
291    /// the projection matrix in the sklearn `(n_components, n_features)`
292    /// orientation.
293    #[must_use]
294    pub fn projection(&self) -> &Array2<F> {
295        &self.components
296    }
297
298    /// Resolved number of output dimensions (`n_components_`).
299    #[must_use]
300    pub fn n_components_(&self) -> usize {
301        self.n_components
302    }
303
304    /// Number of features seen during fit (`n_features_in_`).
305    #[must_use]
306    pub fn n_features_in_(&self) -> usize {
307        self.n_features_in
308    }
309}
310
311impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for GaussianRandomProjection<F> {
312    type Fitted = FittedGaussianRandomProjection<F>;
313    type Error = FerroError;
314
315    /// Fit the projection by generating a random matrix `R ~ N(0, 1/n_components)`.
316    ///
317    /// When `n_components` is [`NComponents::Auto`] the target dimension is
318    /// resolved from the Johnson-Lindenstrauss bound for `eps`
319    /// (`random_projection.py:388-391`).
320    ///
321    /// # Errors
322    ///
323    /// Returns [`FerroError::InvalidParameter`] if the resolved `n_components`
324    /// is `0` (explicit, or the JL bound resolving to `<= 0`,
325    /// `random_projection.py:393-397`), or if `eps` is outside `(0, 1)` for the
326    /// `Auto` path. Returns [`FerroError::InsufficientSamples`] if `x` has zero
327    /// rows.
328    fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedGaussianRandomProjection<F>, FerroError> {
329        if x.nrows() == 0 {
330            return Err(FerroError::InsufficientSamples {
331                required: 1,
332                actual: 0,
333                context: "GaussianRandomProjection::fit".into(),
334            });
335        }
336
337        let n_features = x.ncols();
338        let n_components =
339            resolve_n_components(self.n_components, self.eps, x.nrows(), n_features)?;
340
341        let mut rng: SmallRng = match self.random_state {
342            Some(seed) => SmallRng::seed_from_u64(seed),
343            None => SmallRng::from_os_rng(),
344        };
345
346        let scale = F::one() / F::from(n_components).unwrap_or_else(F::one).sqrt();
347        let normal = StandardNormal;
348        // components_ orientation (n_components, n_features) — sklearn
349        // `random_projection.py:419`.
350        let mut components = Array2::zeros((n_components, n_features));
351        for v in &mut components {
352            let sample: f64 = normal.sample(&mut rng);
353            *v = F::from(sample).unwrap_or_else(F::zero) * scale;
354        }
355
356        Ok(FittedGaussianRandomProjection {
357            components,
358            n_components,
359            n_features_in: n_features,
360        })
361    }
362}
363
364impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedGaussianRandomProjection<F> {
365    type Output = Array2<F>;
366    type Error = FerroError;
367
368    /// Transform data by computing `X @ components_.T`
369    /// (sklearn `random_projection.py:604`).
370    ///
371    /// # Errors
372    ///
373    /// Returns [`FerroError::ShapeMismatch`] if `x.ncols() != n_features`.
374    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
375        if x.ncols() != self.n_features_in {
376            return Err(FerroError::ShapeMismatch {
377                expected: vec![x.nrows(), self.n_features_in],
378                actual: vec![x.nrows(), x.ncols()],
379                context: "FittedGaussianRandomProjection::transform".into(),
380            });
381        }
382        // components_ is (n_components, n_features); X @ components_.T gives
383        // (n_samples, n_components) — sklearn `random_projection.py:604`.
384        Ok(x.dot(&self.components.t()))
385    }
386}
387
388impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for GaussianRandomProjection<F> {
389    type Output = Array2<F>;
390    type Error = FerroError;
391
392    /// Always returns an error — the projection must be fitted first.
393    fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
394        Err(FerroError::InvalidParameter {
395            name: "GaussianRandomProjection".into(),
396            reason: "projection must be fitted before calling transform; use fit() first".into(),
397        })
398    }
399}
400
401impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for GaussianRandomProjection<F> {
402    type FitError = FerroError;
403
404    /// Fit and transform in one step.
405    fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
406        let fitted = self.fit(x, &())?;
407        fitted.transform(x)
408    }
409}
410
411impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for GaussianRandomProjection<F> {
412    fn fit_pipeline(
413        &self,
414        x: &Array2<F>,
415        _y: &Array1<F>,
416    ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
417        let fitted = self.fit(x, &())?;
418        Ok(Box::new(fitted))
419    }
420}
421
422impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F>
423    for FittedGaussianRandomProjection<F>
424{
425    fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
426        self.transform(x)
427    }
428}
429
430// ---------------------------------------------------------------------------
431// SparseRandomProjection
432// ---------------------------------------------------------------------------
433
434/// Sparse random projection transformer.
435///
436/// Projects data into a lower-dimensional space using a sparse random matrix
437/// with entries `{-1, 0, +1}` drawn with probabilities
438/// `{d/2, 1 - d, d/2}`, scaled by `sqrt(1 / (d * n_components))`.
439///
440/// The default density `d = 1 / sqrt(n_features)` is used when not specified.
441///
442/// # Examples
443///
444/// ```
445/// use ferrolearn_preprocess::random_projection::SparseRandomProjection;
446/// use ferrolearn_core::traits::{Fit, Transform};
447/// use ndarray::Array2;
448///
449/// let x = Array2::<f64>::ones((10, 100));
450/// let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
451/// let fitted = proj.fit(&x, &()).unwrap();
452/// let out = fitted.transform(&x).unwrap();
453/// assert_eq!(out.shape(), &[10, 5]);
454/// ```
455#[derive(Debug, Clone)]
456pub struct SparseRandomProjection<F> {
457    /// Number of output dimensions, or [`NComponents::Auto`] to resolve via the
458    /// Johnson-Lindenstrauss bound at fit time. Defaults to `Auto`.
459    n_components: NComponents,
460    /// Density of non-zero entries. `None` means `'auto' = 1/sqrt(n_features)`.
461    density: Option<f64>,
462    /// Distortion-rate parameter for the `Auto` JL bound. Defaults to `0.1`.
463    eps: f64,
464    /// Optional RNG seed for reproducibility.
465    random_state: Option<u64>,
466    _marker: std::marker::PhantomData<F>,
467}
468
469impl<F: Float + Send + Sync + 'static> SparseRandomProjection<F> {
470    /// Create a new sparse random projection with an explicit `n_components`
471    /// number of output dimensions (`NComponents::Fixed`).
472    #[must_use]
473    pub fn new(n_components: usize) -> Self {
474        Self {
475            n_components: NComponents::Fixed(n_components),
476            density: None,
477            eps: 0.1,
478            random_state: None,
479            _marker: std::marker::PhantomData,
480        }
481    }
482
483    /// Create a new sparse random projection with `n_components='auto'`
484    /// (sklearn default): resolved at fit time from the Johnson-Lindenstrauss
485    /// bound for the configured `eps` (`random_projection.py:388-391`).
486    #[must_use]
487    pub fn new_auto() -> Self {
488        Self {
489            n_components: NComponents::Auto,
490            density: None,
491            eps: 0.1,
492            random_state: None,
493            _marker: std::marker::PhantomData,
494        }
495    }
496
497    /// Set the density of non-zero entries.
498    #[must_use]
499    pub fn density(mut self, density: f64) -> Self {
500        self.density = Some(density);
501        self
502    }
503
504    /// Set the distortion-rate `eps` used by the `Auto` JL bound.
505    #[must_use]
506    pub fn eps(mut self, eps: f64) -> Self {
507        self.eps = eps;
508        self
509    }
510
511    /// Set the random seed for reproducibility.
512    #[must_use]
513    pub fn random_state(mut self, seed: u64) -> Self {
514        self.random_state = Some(seed);
515        self
516    }
517}
518
519/// Fitted sparse random projection holding the projection matrix.
520#[derive(Debug, Clone)]
521pub struct FittedSparseRandomProjection<F> {
522    /// Projection matrix of shape `(n_components, n_features)` — the sklearn
523    /// `components_` orientation (`random_projection.py:419`).
524    components: Array2<F>,
525    /// Resolved number of output dimensions (`n_components_`).
526    n_components: usize,
527    /// Number of features seen during fit (`n_features_in_`).
528    n_features_in: usize,
529    /// Resolved density (`density_`), i.e. `1/sqrt(n_features)` when `'auto'`
530    /// (sklearn `density_`, `random_projection.py:788`).
531    density: f64,
532}
533
534impl<F: Float + Send + Sync + 'static> FittedSparseRandomProjection<F> {
535    /// Return a reference to the projection matrix `components_` of shape
536    /// `(n_components, n_features)` (sklearn `components_`,
537    /// `random_projection.py:419`).
538    #[must_use]
539    pub fn components(&self) -> &Array2<F> {
540        &self.components
541    }
542
543    /// Alias for [`Self::components`] kept for backward compatibility. Returns
544    /// the projection matrix in the sklearn `(n_components, n_features)`
545    /// orientation.
546    #[must_use]
547    pub fn projection(&self) -> &Array2<F> {
548        &self.components
549    }
550
551    /// Resolved number of output dimensions (`n_components_`).
552    #[must_use]
553    pub fn n_components_(&self) -> usize {
554        self.n_components
555    }
556
557    /// Number of features seen during fit (`n_features_in_`).
558    #[must_use]
559    pub fn n_features_in_(&self) -> usize {
560        self.n_features_in
561    }
562
563    /// Resolved density (`density_`), i.e. `1/sqrt(n_features)` when the
564    /// constructor density was left at the `'auto'` default
565    /// (sklearn `density_`, `random_projection.py:788`).
566    #[must_use]
567    pub fn density_(&self) -> f64 {
568        self.density
569    }
570}
571
572impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for SparseRandomProjection<F> {
573    type Fitted = FittedSparseRandomProjection<F>;
574    type Error = FerroError;
575
576    /// Fit the projection by generating a sparse random matrix.
577    ///
578    /// When `n_components` is [`NComponents::Auto`] the target dimension is
579    /// resolved from the Johnson-Lindenstrauss bound for `eps`
580    /// (`random_projection.py:388-391`).
581    ///
582    /// # Errors
583    ///
584    /// Returns [`FerroError::InvalidParameter`] if the resolved `n_components`
585    /// is `0` or `density` is not in `(0, 1]`.
586    /// Returns [`FerroError::InsufficientSamples`] if `x` has zero rows.
587    fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedSparseRandomProjection<F>, FerroError> {
588        if x.nrows() == 0 {
589            return Err(FerroError::InsufficientSamples {
590                required: 1,
591                actual: 0,
592                context: "SparseRandomProjection::fit".into(),
593            });
594        }
595
596        let n_features = x.ncols();
597        let n_components =
598            resolve_n_components(self.n_components, self.eps, x.nrows(), n_features)?;
599
600        // density='auto' => 1/sqrt(n_features) (sklearn `_check_density:148-149`).
601        let d = self
602            .density
603            .unwrap_or_else(|| 1.0 / (n_features as f64).sqrt());
604
605        if d <= 0.0 || d > 1.0 {
606            return Err(FerroError::InvalidParameter {
607                name: "density".into(),
608                reason: format!("must be in (0, 1], got {d}"),
609            });
610        }
611
612        let mut rng: SmallRng = match self.random_state {
613            Some(seed) => SmallRng::seed_from_u64(seed),
614            None => SmallRng::from_os_rng(),
615        };
616
617        let scale = F::from(1.0 / (d * n_components as f64).sqrt()).unwrap_or_else(F::one);
618        let uniform =
619            rand::distr::Uniform::new(0.0_f64, 1.0).map_err(|e| FerroError::InvalidParameter {
620                name: "density".into(),
621                reason: format!("failed to build uniform sampler: {e}"),
622            })?;
623
624        // components_ orientation (n_components, n_features) — sklearn
625        // `random_projection.py:419`.
626        let mut components = Array2::zeros((n_components, n_features));
627        for v in &mut components {
628            let u: f64 = uniform.sample(&mut rng);
629            if u < d / 2.0 {
630                *v = scale.neg();
631            } else if u < d {
632                *v = scale;
633            }
634            // else: remains 0
635        }
636
637        Ok(FittedSparseRandomProjection {
638            components,
639            n_components,
640            n_features_in: n_features,
641            density: d,
642        })
643    }
644}
645
646impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSparseRandomProjection<F> {
647    type Output = Array2<F>;
648    type Error = FerroError;
649
650    /// Transform data by computing `X @ components_.T`
651    /// (sklearn `random_projection.py:810`).
652    ///
653    /// # Errors
654    ///
655    /// Returns [`FerroError::ShapeMismatch`] if `x.ncols() != n_features`.
656    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
657        if x.ncols() != self.n_features_in {
658            return Err(FerroError::ShapeMismatch {
659                expected: vec![x.nrows(), self.n_features_in],
660                actual: vec![x.nrows(), x.ncols()],
661                context: "FittedSparseRandomProjection::transform".into(),
662            });
663        }
664        // components_ is (n_components, n_features); X @ components_.T gives
665        // (n_samples, n_components) — sklearn `random_projection.py:810`.
666        Ok(x.dot(&self.components.t()))
667    }
668}
669
670impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for SparseRandomProjection<F> {
671    type Output = Array2<F>;
672    type Error = FerroError;
673
674    /// Always returns an error — the projection must be fitted first.
675    fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
676        Err(FerroError::InvalidParameter {
677            name: "SparseRandomProjection".into(),
678            reason: "projection must be fitted before calling transform; use fit() first".into(),
679        })
680    }
681}
682
683impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for SparseRandomProjection<F> {
684    type FitError = FerroError;
685
686    /// Fit and transform in one step.
687    fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
688        let fitted = self.fit(x, &())?;
689        fitted.transform(x)
690    }
691}
692
693impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for SparseRandomProjection<F> {
694    fn fit_pipeline(
695        &self,
696        x: &Array2<F>,
697        _y: &Array1<F>,
698    ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
699        let fitted = self.fit(x, &())?;
700        Ok(Box::new(fitted))
701    }
702}
703
704impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F>
705    for FittedSparseRandomProjection<F>
706{
707    fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
708        self.transform(x)
709    }
710}
711
712// ---------------------------------------------------------------------------
713// Tests
714// ---------------------------------------------------------------------------
715
716#[cfg(test)]
717mod tests {
718    use super::*;
719    use ndarray::Array2;
720
721    // -- GaussianRandomProjection --
722
723    #[test]
724    fn test_gaussian_rp_output_shape() {
725        let x = Array2::<f64>::ones((10, 50));
726        let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
727        let fitted = proj.fit(&x, &()).unwrap();
728        let out = fitted.transform(&x).unwrap();
729        assert_eq!(out.shape(), &[10, 5]);
730    }
731
732    #[test]
733    fn test_gaussian_rp_deterministic() {
734        let x = Array2::<f64>::ones((10, 20));
735        let proj = GaussianRandomProjection::<f64>::new(3).random_state(42);
736        let fitted1 = proj.fit(&x, &()).unwrap();
737        let out1 = fitted1.transform(&x).unwrap();
738        let fitted2 = proj.fit(&x, &()).unwrap();
739        let out2 = fitted2.transform(&x).unwrap();
740        for (a, b) in out1.iter().zip(out2.iter()) {
741            assert!((a - b).abs() < 1e-10);
742        }
743    }
744
745    #[test]
746    fn test_gaussian_rp_zero_components() {
747        let x = Array2::<f64>::ones((5, 10));
748        let proj = GaussianRandomProjection::<f64>::new(0);
749        assert!(proj.fit(&x, &()).is_err());
750    }
751
752    #[test]
753    fn test_gaussian_rp_empty_input() {
754        let x = Array2::<f64>::zeros((0, 10));
755        let proj = GaussianRandomProjection::<f64>::new(5);
756        assert!(proj.fit(&x, &()).is_err());
757    }
758
759    #[test]
760    fn test_gaussian_rp_shape_mismatch() {
761        let x_train = Array2::<f64>::ones((10, 20));
762        let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
763        let fitted = proj.fit(&x_train, &()).unwrap();
764        let x_bad = Array2::<f64>::ones((5, 15));
765        assert!(fitted.transform(&x_bad).is_err());
766    }
767
768    #[test]
769    fn test_gaussian_rp_fit_transform() {
770        let x = Array2::<f64>::ones((10, 20));
771        let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
772        let out = proj.fit_transform(&x).unwrap();
773        assert_eq!(out.shape(), &[10, 5]);
774    }
775
776    #[test]
777    fn test_gaussian_rp_f32() {
778        let x = Array2::<f32>::ones((5, 10));
779        let proj = GaussianRandomProjection::<f32>::new(3).random_state(42);
780        let fitted = proj.fit(&x, &()).unwrap();
781        let out = fitted.transform(&x).unwrap();
782        assert_eq!(out.shape(), &[5, 3]);
783    }
784
785    // -- SparseRandomProjection --
786
787    #[test]
788    fn test_sparse_rp_output_shape() {
789        let x = Array2::<f64>::ones((10, 100));
790        let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
791        let fitted = proj.fit(&x, &()).unwrap();
792        let out = fitted.transform(&x).unwrap();
793        assert_eq!(out.shape(), &[10, 5]);
794    }
795
796    #[test]
797    fn test_sparse_rp_deterministic() {
798        let x = Array2::<f64>::ones((10, 50));
799        let proj = SparseRandomProjection::<f64>::new(3).random_state(42);
800        let fitted1 = proj.fit(&x, &()).unwrap();
801        let out1 = fitted1.transform(&x).unwrap();
802        let fitted2 = proj.fit(&x, &()).unwrap();
803        let out2 = fitted2.transform(&x).unwrap();
804        for (a, b) in out1.iter().zip(out2.iter()) {
805            assert!((a - b).abs() < 1e-10);
806        }
807    }
808
809    #[test]
810    fn test_sparse_rp_sparsity() {
811        let x = Array2::<f64>::ones((5, 100));
812        let proj = SparseRandomProjection::<f64>::new(10).random_state(42);
813        let fitted = proj.fit(&x, &()).unwrap();
814        let r = fitted.projection();
815        // With density = 1/sqrt(100) = 0.1, about 90% should be zero
816        let total = r.len();
817        let zeros = r.iter().filter(|&&v| v == 0.0).count();
818        let sparsity = zeros as f64 / total as f64;
819        assert!(
820            sparsity > 0.5,
821            "expected sparse matrix, got sparsity={sparsity}"
822        );
823    }
824
825    #[test]
826    fn test_sparse_rp_custom_density() {
827        let x = Array2::<f64>::ones((5, 20));
828        let proj = SparseRandomProjection::<f64>::new(5)
829            .density(0.5)
830            .random_state(42);
831        let fitted = proj.fit(&x, &()).unwrap();
832        let out = fitted.transform(&x).unwrap();
833        assert_eq!(out.shape(), &[5, 5]);
834    }
835
836    #[test]
837    fn test_sparse_rp_zero_components() {
838        let x = Array2::<f64>::ones((5, 10));
839        let proj = SparseRandomProjection::<f64>::new(0);
840        assert!(proj.fit(&x, &()).is_err());
841    }
842
843    #[test]
844    fn test_sparse_rp_invalid_density() {
845        let x = Array2::<f64>::ones((5, 10));
846        let proj = SparseRandomProjection::<f64>::new(5).density(0.0);
847        assert!(proj.fit(&x, &()).is_err());
848    }
849
850    #[test]
851    fn test_sparse_rp_empty_input() {
852        let x = Array2::<f64>::zeros((0, 10));
853        let proj = SparseRandomProjection::<f64>::new(5);
854        assert!(proj.fit(&x, &()).is_err());
855    }
856
857    #[test]
858    fn test_sparse_rp_shape_mismatch() {
859        let x_train = Array2::<f64>::ones((10, 20));
860        let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
861        let fitted = proj.fit(&x_train, &()).unwrap();
862        let x_bad = Array2::<f64>::ones((5, 15));
863        assert!(fitted.transform(&x_bad).is_err());
864    }
865
866    #[test]
867    fn test_sparse_rp_fit_transform() {
868        let x = Array2::<f64>::ones((10, 20));
869        let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
870        let out = proj.fit_transform(&x).unwrap();
871        assert_eq!(out.shape(), &[10, 5]);
872    }
873
874    #[test]
875    fn test_sparse_rp_f32() {
876        let x = Array2::<f32>::ones((5, 10));
877        let proj = SparseRandomProjection::<f32>::new(3).random_state(42);
878        let fitted = proj.fit(&x, &()).unwrap();
879        let out = fitted.transform(&x).unwrap();
880        assert_eq!(out.shape(), &[5, 3]);
881    }
882}