//! Random projection transformers for dimensionality reduction.
//!
//! Random projections preserve pairwise distances in expectation (Johnson-Lindenstrauss lemma).
//!
//! - [`GaussianRandomProjection`] — dense Gaussian random matrix
//! - [`SparseRandomProjection`] — sparse random matrix with `{-1, 0, +1}` entries
//!
//! ## REQ status
//!
//! Translation target: scikit-learn 1.5.2 `GaussianRandomProjection` +
//! `SparseRandomProjection` (`sklearn/random_projection.py`). Tracking: #1387.
//! Each REQ is BINARY — SHIPPED (impl + non-test consumer + tests + green
//! verification) or NOT-STARTED (with a concrete open blocker). RNG-COUPLED
//! unit: ferrolearn uses Rust `SmallRng`, sklearn numpy `RandomState`, so exact
//! projection-matrix VALUE parity is impossible (carve-out); the SHIPPED claims
//! are the projection DISTRIBUTION/scale (vs the sklearn formula) + transform
//! contract + determinism, not bit-exact matrix values.
//!
//! | REQ | Scope | Status | Evidence / Blocker |
//! |-----|-------|--------|--------------------|
//! | REQ-1 | Gaussian projection DISTRIBUTION `N(0, 1/n_components)` + transform `X@R` + determinism-given-seed | SHIPPED | [`GaussianRandomProjection`] `fit` scale `1/sqrt(n_components)` matches sklearn `_gaussian_random_matrix` `random_projection.py:200`; empirical variance ≈ 1/k guard + determinism + transform tests in `tests/divergence_random_projection.rs`. Consumer: re-export `lib.rs:167` + `PipelineTransformer` |
//! | REQ-2 | Sparse projection DISTRIBUTION (support `±sqrt(1/(d·n_components))`, probs `{d/2, 1-d, d/2}`, default density `1/sqrt(n_features)`, `density=1` case) + transform | SHIPPED | [`SparseRandomProjection`] `fit` scale/support/probs match sklearn `_sparse_random_matrix` `:301` + `_check_density` `:148-149`; nonzero-entry == ±scale (exact, tol 1e-12) + default-density + density=1 tests |
//! | REQ-3 | Error/parameter contracts (n_components==0, density∉(0,1], zero rows, transform ncols, unfitted) | SHIPPED (scoped) | both `fit`/`Fitted*` `transform`; in-module + divergence error tests |
//! | REQ-4 | Exact projection-matrix VALUE parity (numpy `RandomState` stream) | NOT-STARTED | RNG-coupled CARVE-OUT (Rust `SmallRng` ≠ numpy MT19937), no committed failing test — blocker #1388 |
//! | REQ-5 | Sparse sampling METHOD (per-row `binomial` + `sample_without_replacement` vs per-entry Bernoulli) | NOT-STARTED | sklearn `random_projection.py:281-301` — blocker #1389 |
//! | REQ-6 | `n_components='auto'` + `eps` + `johnson_lindenstrauss_min_dim` | SHIPPED | [`johnson_lindenstrauss_min_dim`] = `floor(4*ln(n_samples)/(eps^2/2-eps^3/3))` matches sklearn `random_projection.py:142-143` (int64 truncation), validates `eps∈(0,1)`/`n_samples>0` (`:133,136`); [`NComponents::Auto`] default resolves via JL at `fit` (sklearn `:388-391`) with the `n_components_ <= 0` reject (`:393-397`). Consumer: re-export `lib.rs:167` + both `fit`. Tests `tests/divergence_random_projection_jl_2347.rs` |
//! | REQ-7 | `components_` `(n_components, n_features)` orientation (#2346) | SHIPPED (orientation) — CSR sparse storage + `dense_output` NOT-STARTED | `components()` / `projection()` now store `(n_components, n_features)` matching sklearn `:419`, transform `X @ components_.T` (`:604,810`); CSR sparse storage + `dense_output` still dense — blocker #1391. Tests `tests/divergence_random_projection_components_orientation_2345.rs` |
//! | REQ-8 | `inverse_transform` + `compute_inverse_components` | NOT-STARTED | sklearn `random_projection.py:356,431-458` — blocker #1392 |
//! | REQ-9 | `n_components > n_features` warning + `components_`/`n_components_`/`n_features_in_`/`density_` attrs + `get_feature_names_out` | SHIPPED (attrs) — `DataDimensionalityWarning` emission NOT-STARTED | `n_components_()`/`n_features_in_()`/`density_()` getters + `(n_components,n_features)` `components_` (`:416,419,788`); the `n_components>n_features` `DataDimensionalityWarning` (`:407-414`) is NOT emitted (no warning facade in ferrolearn — fit still proceeds) + `get_feature_names_out` absent — blocker #1393 |
//! | REQ-10 | PyO3 binding | NOT-STARTED | no `ferrolearn-python` registration — blocker #1394 |
//! | REQ-11 | ferray substrate | NOT-STARTED | dense `Array2` + `num_traits::Float` only — blocker #1395 |
use ferrolearn_core::error::FerroError;
use ferrolearn_core::pipeline::{FittedPipelineTransformer, PipelineTransformer};
use ferrolearn_core::traits::{Fit, FitTransform, Transform};
use ndarray::{Array1, Array2};
use num_traits::Float;
use rand::SeedableRng;
use rand::rngs::SmallRng;
use rand_distr::{Distribution, StandardNormal};
// ---------------------------------------------------------------------------
// johnson_lindenstrauss_min_dim + NComponents
// ---------------------------------------------------------------------------
/// Find a "safe" number of components to randomly project to.
///
/// Mirrors scikit-learn 1.5.2 `johnson_lindenstrauss_min_dim`
/// (`sklearn/random_projection.py:64-143`). The minimum number of components
/// that guarantees an `eps`-embedding for `n_samples` points is
///
/// ```text
/// n_components >= 4 * ln(n_samples) / (eps^2 / 2 - eps^3 / 3)
/// ```
///
/// sklearn casts the result with `.astype(np.int64)`
/// (`random_projection.py:142-143`), which TRUNCATES toward zero (a FLOOR for
/// the positive values this expression produces). This function matches that
/// truncation, e.g. `johnson_lindenstrauss_min_dim(1000, 0.1) == 5920`.
///
/// # Errors
///
/// Returns [`FerroError::InvalidParameter`] if `eps` is not in the open
/// interval `(0, 1)` (sklearn raises `ValueError`, `random_projection.py:133`),
/// or if `n_samples == 0` (sklearn `random_projection.py:136-140`).
#[must_use = "the returned target dimension should be used"]
pub fn johnson_lindenstrauss_min_dim<F: Float>(
n_samples: usize,
eps: F,
) -> Result<usize, FerroError> {
// sklearn random_projection.py:133 — reject eps outside ]0, 1[.
let zero = F::zero();
let one = F::one();
if !(eps > zero && eps < one) {
return Err(FerroError::InvalidParameter {
name: "eps".into(),
reason: "the JL bound is defined for eps in (0, 1)".into(),
});
}
// sklearn random_projection.py:136 — reject n_samples <= 0.
if n_samples == 0 {
return Err(FerroError::InvalidParameter {
name: "n_samples".into(),
reason: "the JL bound is defined for n_samples greater than zero".into(),
});
}
// denominator = eps^2 / 2 - eps^3 / 3 (random_projection.py:142).
let two = one + one;
let three = two + one;
let denominator = (eps * eps / two) - (eps * eps * eps / three);
let n = F::from(n_samples).unwrap_or_else(F::infinity);
let value = F::from(4.0).unwrap_or_else(F::one) * n.ln() / denominator;
// .astype(np.int64) truncates toward zero (random_projection.py:143).
// `value` is positive for n_samples >= 1 and eps in (0, 1), so truncation
// equals floor. Guard against a non-finite/negative result defensively.
let truncated = value.trunc();
if !truncated.is_finite() || truncated < zero {
return Err(FerroError::InvalidParameter {
name: "n_samples".into(),
reason: "the JL bound produced a non-finite target dimension".into(),
});
}
Ok(truncated.to_usize().unwrap_or(0))
}
/// Target projection dimensionality, mirroring sklearn's `n_components`
/// parameter which accepts an integer or the string `'auto'`
/// (`random_projection.py:314-317`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NComponents {
/// Resolve `n_components` from the Johnson-Lindenstrauss bound at fit time
/// using the configured `eps` (sklearn `n_components='auto'`,
/// `random_projection.py:388-391`). This is the sklearn default.
Auto,
/// An explicit, user-provided number of output dimensions.
Fixed(usize),
}
impl Default for NComponents {
/// sklearn defaults `n_components='auto'` (`random_projection.py:326,477`).
fn default() -> Self {
Self::Auto
}
}
impl From<usize> for NComponents {
fn from(value: usize) -> Self {
Self::Fixed(value)
}
}
/// Resolve the configured `n_components` to a concrete dimension at fit time.
///
/// For [`NComponents::Auto`] this calls [`johnson_lindenstrauss_min_dim`] with
/// `n_samples` and `eps` (sklearn `random_projection.py:388-391`). For both
/// branches a resolved value of `0` is rejected — sklearn rejects the `auto`
/// bound resolving to `<= 0` (`random_projection.py:393-397`) and requires
/// explicit `n_components >= 1` (`_parameter_constraints`, `:314-315`).
///
/// For the [`NComponents::Auto`] path ONLY, a resolved dimension GREATER than
/// `n_features` is also rejected — sklearn raises `ValueError` when the
/// JL-resolved `n_components_` exceeds the original space
/// (`random_projection.py:399-405`). The [`NComponents::Fixed`] path does NOT
/// error here: sklearn only emits a `DataDimensionalityWarning` for an explicit
/// `n_components > n_features` (`random_projection.py:407-414`, NOT-STARTED
/// warning REQ-9 / blocker #1393), so the fixed path proceeds unchanged.
///
/// # Errors
///
/// Returns [`FerroError::InvalidParameter`] if the resolved dimension is `0`,
/// if the `Auto` resolved dimension exceeds `n_features`, or if the JL bound's
/// `eps`/`n_samples` validation fails.
fn resolve_n_components(
n_components: NComponents,
eps: f64,
n_samples: usize,
n_features: usize,
) -> Result<usize, FerroError> {
let resolved = match n_components {
NComponents::Fixed(k) => k,
NComponents::Auto => johnson_lindenstrauss_min_dim(n_samples, eps)?,
};
if resolved == 0 {
return Err(FerroError::InvalidParameter {
name: "n_components".into(),
reason: match n_components {
NComponents::Auto => format!(
"eps={eps} and n_samples={n_samples} lead to a target dimension of 0 which is invalid"
),
NComponents::Fixed(_) => "must be >= 1".into(),
},
});
}
// AUTO PATH ONLY (sklearn `random_projection.py:399-405`): a JL-resolved
// target dimension larger than the original space is a ValueError. The
// Fixed path is intentionally excluded — sklearn only WARNS there (`:407-414`).
if matches!(n_components, NComponents::Auto) && resolved > n_features {
return Err(FerroError::InvalidParameter {
name: "n_components".into(),
reason: format!(
"eps={eps} and n_samples={n_samples} lead to a target dimension of {resolved} which is larger than the original space with n_features={n_features}"
),
});
}
Ok(resolved)
}
// ---------------------------------------------------------------------------
// GaussianRandomProjection
// ---------------------------------------------------------------------------
/// Gaussian random projection transformer.
///
/// Projects data into a lower-dimensional space using a random matrix drawn
/// from `N(0, 1/n_components)`.
///
/// # Examples
///
/// ```
/// use ferrolearn_preprocess::random_projection::GaussianRandomProjection;
/// use ferrolearn_core::traits::{Fit, Transform};
/// use ndarray::Array2;
///
/// let x = Array2::<f64>::ones((10, 50));
/// let proj = GaussianRandomProjection::<f64>::new(5);
/// let fitted = proj.fit(&x, &()).unwrap();
/// let out = fitted.transform(&x).unwrap();
/// assert_eq!(out.shape(), &[10, 5]);
/// ```
#[derive(Debug, Clone)]
pub struct GaussianRandomProjection<F> {
/// Number of output dimensions, or [`NComponents::Auto`] to resolve via the
/// Johnson-Lindenstrauss bound at fit time. Defaults to `Auto`.
n_components: NComponents,
/// Distortion-rate parameter for the `Auto` JL bound. Defaults to `0.1`
/// (sklearn `eps=0.1`, `random_projection.py:328`).
eps: f64,
/// Optional RNG seed for reproducibility.
random_state: Option<u64>,
_marker: std::marker::PhantomData<F>,
}
impl<F: Float + Send + Sync + 'static> GaussianRandomProjection<F> {
/// Create a new Gaussian random projection with an explicit `n_components`
/// number of output dimensions (`NComponents::Fixed`).
#[must_use]
pub fn new(n_components: usize) -> Self {
Self {
n_components: NComponents::Fixed(n_components),
eps: 0.1,
random_state: None,
_marker: std::marker::PhantomData,
}
}
/// Create a new Gaussian random projection with `n_components='auto'`
/// (sklearn default): the target dimension is resolved at fit time from the
/// Johnson-Lindenstrauss bound for the configured `eps`
/// (`random_projection.py:388-391`).
#[must_use]
pub fn new_auto() -> Self {
Self {
n_components: NComponents::Auto,
eps: 0.1,
random_state: None,
_marker: std::marker::PhantomData,
}
}
/// Set the distortion-rate `eps` used by the `Auto` JL bound
/// (sklearn `eps`, `random_projection.py:328`).
#[must_use]
pub fn eps(mut self, eps: f64) -> Self {
self.eps = eps;
self
}
/// Set the random seed for reproducibility.
#[must_use]
pub fn random_state(mut self, seed: u64) -> Self {
self.random_state = Some(seed);
self
}
}
/// Fitted Gaussian random projection holding the projection matrix.
#[derive(Debug, Clone)]
pub struct FittedGaussianRandomProjection<F> {
/// Projection matrix of shape `(n_components, n_features)` — the sklearn
/// `components_` orientation (`random_projection.py:419`).
components: Array2<F>,
/// Resolved number of output dimensions (`n_components_`).
n_components: usize,
/// Number of features seen during fit (`n_features_in_`).
n_features_in: usize,
}
impl<F: Float + Send + Sync + 'static> FittedGaussianRandomProjection<F> {
/// Return a reference to the projection matrix `components_` of shape
/// `(n_components, n_features)` (sklearn `components_`,
/// `random_projection.py:419`).
#[must_use]
pub fn components(&self) -> &Array2<F> {
&self.components
}
/// Alias for [`Self::components`] kept for backward compatibility. Returns
/// the projection matrix in the sklearn `(n_components, n_features)`
/// orientation.
#[must_use]
pub fn projection(&self) -> &Array2<F> {
&self.components
}
/// Resolved number of output dimensions (`n_components_`).
#[must_use]
pub fn n_components_(&self) -> usize {
self.n_components
}
/// Number of features seen during fit (`n_features_in_`).
#[must_use]
pub fn n_features_in_(&self) -> usize {
self.n_features_in
}
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for GaussianRandomProjection<F> {
type Fitted = FittedGaussianRandomProjection<F>;
type Error = FerroError;
/// Fit the projection by generating a random matrix `R ~ N(0, 1/n_components)`.
///
/// When `n_components` is [`NComponents::Auto`] the target dimension is
/// resolved from the Johnson-Lindenstrauss bound for `eps`
/// (`random_projection.py:388-391`).
///
/// # Errors
///
/// Returns [`FerroError::InvalidParameter`] if the resolved `n_components`
/// is `0` (explicit, or the JL bound resolving to `<= 0`,
/// `random_projection.py:393-397`), or if `eps` is outside `(0, 1)` for the
/// `Auto` path. Returns [`FerroError::InsufficientSamples`] if `x` has zero
/// rows.
fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedGaussianRandomProjection<F>, FerroError> {
if x.nrows() == 0 {
return Err(FerroError::InsufficientSamples {
required: 1,
actual: 0,
context: "GaussianRandomProjection::fit".into(),
});
}
let n_features = x.ncols();
let n_components =
resolve_n_components(self.n_components, self.eps, x.nrows(), n_features)?;
let mut rng: SmallRng = match self.random_state {
Some(seed) => SmallRng::seed_from_u64(seed),
None => SmallRng::from_os_rng(),
};
let scale = F::one() / F::from(n_components).unwrap_or_else(F::one).sqrt();
let normal = StandardNormal;
// components_ orientation (n_components, n_features) — sklearn
// `random_projection.py:419`.
let mut components = Array2::zeros((n_components, n_features));
for v in &mut components {
let sample: f64 = normal.sample(&mut rng);
*v = F::from(sample).unwrap_or_else(F::zero) * scale;
}
Ok(FittedGaussianRandomProjection {
components,
n_components,
n_features_in: n_features,
})
}
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedGaussianRandomProjection<F> {
type Output = Array2<F>;
type Error = FerroError;
/// Transform data by computing `X @ components_.T`
/// (sklearn `random_projection.py:604`).
///
/// # Errors
///
/// Returns [`FerroError::ShapeMismatch`] if `x.ncols() != n_features`.
fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
if x.ncols() != self.n_features_in {
return Err(FerroError::ShapeMismatch {
expected: vec![x.nrows(), self.n_features_in],
actual: vec![x.nrows(), x.ncols()],
context: "FittedGaussianRandomProjection::transform".into(),
});
}
// components_ is (n_components, n_features); X @ components_.T gives
// (n_samples, n_components) — sklearn `random_projection.py:604`.
Ok(x.dot(&self.components.t()))
}
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for GaussianRandomProjection<F> {
type Output = Array2<F>;
type Error = FerroError;
/// Always returns an error — the projection must be fitted first.
fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
Err(FerroError::InvalidParameter {
name: "GaussianRandomProjection".into(),
reason: "projection must be fitted before calling transform; use fit() first".into(),
})
}
}
impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for GaussianRandomProjection<F> {
type FitError = FerroError;
/// Fit and transform in one step.
fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
let fitted = self.fit(x, &())?;
fitted.transform(x)
}
}
impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for GaussianRandomProjection<F> {
fn fit_pipeline(
&self,
x: &Array2<F>,
_y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
let fitted = self.fit(x, &())?;
Ok(Box::new(fitted))
}
}
impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F>
for FittedGaussianRandomProjection<F>
{
fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
self.transform(x)
}
}
// ---------------------------------------------------------------------------
// SparseRandomProjection
// ---------------------------------------------------------------------------
/// Sparse random projection transformer.
///
/// Projects data into a lower-dimensional space using a sparse random matrix
/// with entries `{-1, 0, +1}` drawn with probabilities
/// `{d/2, 1 - d, d/2}`, scaled by `sqrt(1 / (d * n_components))`.
///
/// The default density `d = 1 / sqrt(n_features)` is used when not specified.
///
/// # Examples
///
/// ```
/// use ferrolearn_preprocess::random_projection::SparseRandomProjection;
/// use ferrolearn_core::traits::{Fit, Transform};
/// use ndarray::Array2;
///
/// let x = Array2::<f64>::ones((10, 100));
/// let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
/// let fitted = proj.fit(&x, &()).unwrap();
/// let out = fitted.transform(&x).unwrap();
/// assert_eq!(out.shape(), &[10, 5]);
/// ```
#[derive(Debug, Clone)]
pub struct SparseRandomProjection<F> {
/// Number of output dimensions, or [`NComponents::Auto`] to resolve via the
/// Johnson-Lindenstrauss bound at fit time. Defaults to `Auto`.
n_components: NComponents,
/// Density of non-zero entries. `None` means `'auto' = 1/sqrt(n_features)`.
density: Option<f64>,
/// Distortion-rate parameter for the `Auto` JL bound. Defaults to `0.1`.
eps: f64,
/// Optional RNG seed for reproducibility.
random_state: Option<u64>,
_marker: std::marker::PhantomData<F>,
}
impl<F: Float + Send + Sync + 'static> SparseRandomProjection<F> {
/// Create a new sparse random projection with an explicit `n_components`
/// number of output dimensions (`NComponents::Fixed`).
#[must_use]
pub fn new(n_components: usize) -> Self {
Self {
n_components: NComponents::Fixed(n_components),
density: None,
eps: 0.1,
random_state: None,
_marker: std::marker::PhantomData,
}
}
/// Create a new sparse random projection with `n_components='auto'`
/// (sklearn default): resolved at fit time from the Johnson-Lindenstrauss
/// bound for the configured `eps` (`random_projection.py:388-391`).
#[must_use]
pub fn new_auto() -> Self {
Self {
n_components: NComponents::Auto,
density: None,
eps: 0.1,
random_state: None,
_marker: std::marker::PhantomData,
}
}
/// Set the density of non-zero entries.
#[must_use]
pub fn density(mut self, density: f64) -> Self {
self.density = Some(density);
self
}
/// Set the distortion-rate `eps` used by the `Auto` JL bound.
#[must_use]
pub fn eps(mut self, eps: f64) -> Self {
self.eps = eps;
self
}
/// Set the random seed for reproducibility.
#[must_use]
pub fn random_state(mut self, seed: u64) -> Self {
self.random_state = Some(seed);
self
}
}
/// Fitted sparse random projection holding the projection matrix.
#[derive(Debug, Clone)]
pub struct FittedSparseRandomProjection<F> {
/// Projection matrix of shape `(n_components, n_features)` — the sklearn
/// `components_` orientation (`random_projection.py:419`).
components: Array2<F>,
/// Resolved number of output dimensions (`n_components_`).
n_components: usize,
/// Number of features seen during fit (`n_features_in_`).
n_features_in: usize,
/// Resolved density (`density_`), i.e. `1/sqrt(n_features)` when `'auto'`
/// (sklearn `density_`, `random_projection.py:788`).
density: f64,
}
impl<F: Float + Send + Sync + 'static> FittedSparseRandomProjection<F> {
/// Return a reference to the projection matrix `components_` of shape
/// `(n_components, n_features)` (sklearn `components_`,
/// `random_projection.py:419`).
#[must_use]
pub fn components(&self) -> &Array2<F> {
&self.components
}
/// Alias for [`Self::components`] kept for backward compatibility. Returns
/// the projection matrix in the sklearn `(n_components, n_features)`
/// orientation.
#[must_use]
pub fn projection(&self) -> &Array2<F> {
&self.components
}
/// Resolved number of output dimensions (`n_components_`).
#[must_use]
pub fn n_components_(&self) -> usize {
self.n_components
}
/// Number of features seen during fit (`n_features_in_`).
#[must_use]
pub fn n_features_in_(&self) -> usize {
self.n_features_in
}
/// Resolved density (`density_`), i.e. `1/sqrt(n_features)` when the
/// constructor density was left at the `'auto'` default
/// (sklearn `density_`, `random_projection.py:788`).
#[must_use]
pub fn density_(&self) -> f64 {
self.density
}
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for SparseRandomProjection<F> {
type Fitted = FittedSparseRandomProjection<F>;
type Error = FerroError;
/// Fit the projection by generating a sparse random matrix.
///
/// When `n_components` is [`NComponents::Auto`] the target dimension is
/// resolved from the Johnson-Lindenstrauss bound for `eps`
/// (`random_projection.py:388-391`).
///
/// # Errors
///
/// Returns [`FerroError::InvalidParameter`] if the resolved `n_components`
/// is `0` or `density` is not in `(0, 1]`.
/// Returns [`FerroError::InsufficientSamples`] if `x` has zero rows.
fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedSparseRandomProjection<F>, FerroError> {
if x.nrows() == 0 {
return Err(FerroError::InsufficientSamples {
required: 1,
actual: 0,
context: "SparseRandomProjection::fit".into(),
});
}
let n_features = x.ncols();
let n_components =
resolve_n_components(self.n_components, self.eps, x.nrows(), n_features)?;
// density='auto' => 1/sqrt(n_features) (sklearn `_check_density:148-149`).
let d = self
.density
.unwrap_or_else(|| 1.0 / (n_features as f64).sqrt());
if d <= 0.0 || d > 1.0 {
return Err(FerroError::InvalidParameter {
name: "density".into(),
reason: format!("must be in (0, 1], got {d}"),
});
}
let mut rng: SmallRng = match self.random_state {
Some(seed) => SmallRng::seed_from_u64(seed),
None => SmallRng::from_os_rng(),
};
let scale = F::from(1.0 / (d * n_components as f64).sqrt()).unwrap_or_else(F::one);
let uniform =
rand::distr::Uniform::new(0.0_f64, 1.0).map_err(|e| FerroError::InvalidParameter {
name: "density".into(),
reason: format!("failed to build uniform sampler: {e}"),
})?;
// components_ orientation (n_components, n_features) — sklearn
// `random_projection.py:419`.
let mut components = Array2::zeros((n_components, n_features));
for v in &mut components {
let u: f64 = uniform.sample(&mut rng);
if u < d / 2.0 {
*v = scale.neg();
} else if u < d {
*v = scale;
}
// else: remains 0
}
Ok(FittedSparseRandomProjection {
components,
n_components,
n_features_in: n_features,
density: d,
})
}
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSparseRandomProjection<F> {
type Output = Array2<F>;
type Error = FerroError;
/// Transform data by computing `X @ components_.T`
/// (sklearn `random_projection.py:810`).
///
/// # Errors
///
/// Returns [`FerroError::ShapeMismatch`] if `x.ncols() != n_features`.
fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
if x.ncols() != self.n_features_in {
return Err(FerroError::ShapeMismatch {
expected: vec![x.nrows(), self.n_features_in],
actual: vec![x.nrows(), x.ncols()],
context: "FittedSparseRandomProjection::transform".into(),
});
}
// components_ is (n_components, n_features); X @ components_.T gives
// (n_samples, n_components) — sklearn `random_projection.py:810`.
Ok(x.dot(&self.components.t()))
}
}
impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for SparseRandomProjection<F> {
type Output = Array2<F>;
type Error = FerroError;
/// Always returns an error — the projection must be fitted first.
fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
Err(FerroError::InvalidParameter {
name: "SparseRandomProjection".into(),
reason: "projection must be fitted before calling transform; use fit() first".into(),
})
}
}
impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for SparseRandomProjection<F> {
type FitError = FerroError;
/// Fit and transform in one step.
fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
let fitted = self.fit(x, &())?;
fitted.transform(x)
}
}
impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for SparseRandomProjection<F> {
fn fit_pipeline(
&self,
x: &Array2<F>,
_y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
let fitted = self.fit(x, &())?;
Ok(Box::new(fitted))
}
}
impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F>
for FittedSparseRandomProjection<F>
{
fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
self.transform(x)
}
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use ndarray::Array2;
// -- GaussianRandomProjection --
#[test]
fn test_gaussian_rp_output_shape() {
let x = Array2::<f64>::ones((10, 50));
let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
let fitted = proj.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[10, 5]);
}
#[test]
fn test_gaussian_rp_deterministic() {
let x = Array2::<f64>::ones((10, 20));
let proj = GaussianRandomProjection::<f64>::new(3).random_state(42);
let fitted1 = proj.fit(&x, &()).unwrap();
let out1 = fitted1.transform(&x).unwrap();
let fitted2 = proj.fit(&x, &()).unwrap();
let out2 = fitted2.transform(&x).unwrap();
for (a, b) in out1.iter().zip(out2.iter()) {
assert!((a - b).abs() < 1e-10);
}
}
#[test]
fn test_gaussian_rp_zero_components() {
let x = Array2::<f64>::ones((5, 10));
let proj = GaussianRandomProjection::<f64>::new(0);
assert!(proj.fit(&x, &()).is_err());
}
#[test]
fn test_gaussian_rp_empty_input() {
let x = Array2::<f64>::zeros((0, 10));
let proj = GaussianRandomProjection::<f64>::new(5);
assert!(proj.fit(&x, &()).is_err());
}
#[test]
fn test_gaussian_rp_shape_mismatch() {
let x_train = Array2::<f64>::ones((10, 20));
let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
let fitted = proj.fit(&x_train, &()).unwrap();
let x_bad = Array2::<f64>::ones((5, 15));
assert!(fitted.transform(&x_bad).is_err());
}
#[test]
fn test_gaussian_rp_fit_transform() {
let x = Array2::<f64>::ones((10, 20));
let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
let out = proj.fit_transform(&x).unwrap();
assert_eq!(out.shape(), &[10, 5]);
}
#[test]
fn test_gaussian_rp_f32() {
let x = Array2::<f32>::ones((5, 10));
let proj = GaussianRandomProjection::<f32>::new(3).random_state(42);
let fitted = proj.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[5, 3]);
}
// -- SparseRandomProjection --
#[test]
fn test_sparse_rp_output_shape() {
let x = Array2::<f64>::ones((10, 100));
let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
let fitted = proj.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[10, 5]);
}
#[test]
fn test_sparse_rp_deterministic() {
let x = Array2::<f64>::ones((10, 50));
let proj = SparseRandomProjection::<f64>::new(3).random_state(42);
let fitted1 = proj.fit(&x, &()).unwrap();
let out1 = fitted1.transform(&x).unwrap();
let fitted2 = proj.fit(&x, &()).unwrap();
let out2 = fitted2.transform(&x).unwrap();
for (a, b) in out1.iter().zip(out2.iter()) {
assert!((a - b).abs() < 1e-10);
}
}
#[test]
fn test_sparse_rp_sparsity() {
let x = Array2::<f64>::ones((5, 100));
let proj = SparseRandomProjection::<f64>::new(10).random_state(42);
let fitted = proj.fit(&x, &()).unwrap();
let r = fitted.projection();
// With density = 1/sqrt(100) = 0.1, about 90% should be zero
let total = r.len();
let zeros = r.iter().filter(|&&v| v == 0.0).count();
let sparsity = zeros as f64 / total as f64;
assert!(
sparsity > 0.5,
"expected sparse matrix, got sparsity={sparsity}"
);
}
#[test]
fn test_sparse_rp_custom_density() {
let x = Array2::<f64>::ones((5, 20));
let proj = SparseRandomProjection::<f64>::new(5)
.density(0.5)
.random_state(42);
let fitted = proj.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[5, 5]);
}
#[test]
fn test_sparse_rp_zero_components() {
let x = Array2::<f64>::ones((5, 10));
let proj = SparseRandomProjection::<f64>::new(0);
assert!(proj.fit(&x, &()).is_err());
}
#[test]
fn test_sparse_rp_invalid_density() {
let x = Array2::<f64>::ones((5, 10));
let proj = SparseRandomProjection::<f64>::new(5).density(0.0);
assert!(proj.fit(&x, &()).is_err());
}
#[test]
fn test_sparse_rp_empty_input() {
let x = Array2::<f64>::zeros((0, 10));
let proj = SparseRandomProjection::<f64>::new(5);
assert!(proj.fit(&x, &()).is_err());
}
#[test]
fn test_sparse_rp_shape_mismatch() {
let x_train = Array2::<f64>::ones((10, 20));
let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
let fitted = proj.fit(&x_train, &()).unwrap();
let x_bad = Array2::<f64>::ones((5, 15));
assert!(fitted.transform(&x_bad).is_err());
}
#[test]
fn test_sparse_rp_fit_transform() {
let x = Array2::<f64>::ones((10, 20));
let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
let out = proj.fit_transform(&x).unwrap();
assert_eq!(out.shape(), &[10, 5]);
}
#[test]
fn test_sparse_rp_f32() {
let x = Array2::<f32>::ones((5, 10));
let proj = SparseRandomProjection::<f32>::new(3).random_state(42);
let fitted = proj.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[5, 3]);
}
}