ferrolearn_preprocess/random_projection.rs
1//! Random projection transformers for dimensionality reduction.
2//!
3//! Random projections preserve pairwise distances in expectation (Johnson-Lindenstrauss lemma).
4//!
5//! - [`GaussianRandomProjection`] — dense Gaussian random matrix
6//! - [`SparseRandomProjection`] — sparse random matrix with `{-1, 0, +1}` entries
7//!
8//! ## REQ status
9//!
10//! Translation target: scikit-learn 1.5.2 `GaussianRandomProjection` +
11//! `SparseRandomProjection` (`sklearn/random_projection.py`). Tracking: #1387.
12//! Each REQ is BINARY — SHIPPED (impl + non-test consumer + tests + green
13//! verification) or NOT-STARTED (with a concrete open blocker). RNG-COUPLED
14//! unit: ferrolearn uses Rust `SmallRng`, sklearn numpy `RandomState`, so exact
15//! projection-matrix VALUE parity is impossible (carve-out); the SHIPPED claims
16//! are the projection DISTRIBUTION/scale (vs the sklearn formula) + transform
17//! contract + determinism, not bit-exact matrix values.
18//!
19//! | REQ | Scope | Status | Evidence / Blocker |
20//! |-----|-------|--------|--------------------|
21//! | REQ-1 | Gaussian projection DISTRIBUTION `N(0, 1/n_components)` + transform `X@R` + determinism-given-seed | SHIPPED | [`GaussianRandomProjection`] `fit` scale `1/sqrt(n_components)` matches sklearn `_gaussian_random_matrix` `random_projection.py:200`; empirical variance ≈ 1/k guard + determinism + transform tests in `tests/divergence_random_projection.rs`. Consumer: re-export `lib.rs:167` + `PipelineTransformer` |
22//! | REQ-2 | Sparse projection DISTRIBUTION (support `±sqrt(1/(d·n_components))`, probs `{d/2, 1-d, d/2}`, default density `1/sqrt(n_features)`, `density=1` case) + transform | SHIPPED | [`SparseRandomProjection`] `fit` scale/support/probs match sklearn `_sparse_random_matrix` `:301` + `_check_density` `:148-149`; nonzero-entry == ±scale (exact, tol 1e-12) + default-density + density=1 tests |
23//! | REQ-3 | Error/parameter contracts (n_components==0, density∉(0,1], zero rows, transform ncols, unfitted) | SHIPPED (scoped) | both `fit`/`Fitted*` `transform`; in-module + divergence error tests |
24//! | REQ-4 | Exact projection-matrix VALUE parity (numpy `RandomState` stream) | NOT-STARTED | RNG-coupled CARVE-OUT (Rust `SmallRng` ≠ numpy MT19937), no committed failing test — blocker #1388 |
25//! | REQ-5 | Sparse sampling METHOD (per-row `binomial` + `sample_without_replacement` vs per-entry Bernoulli) | NOT-STARTED | sklearn `random_projection.py:281-301` — blocker #1389 |
26//! | REQ-6 | `n_components='auto'` + `eps` + `johnson_lindenstrauss_min_dim` | SHIPPED | [`johnson_lindenstrauss_min_dim`] = `floor(4*ln(n_samples)/(eps^2/2-eps^3/3))` matches sklearn `random_projection.py:142-143` (int64 truncation), validates `eps∈(0,1)`/`n_samples>0` (`:133,136`); [`NComponents::Auto`] default resolves via JL at `fit` (sklearn `:388-391`) with the `n_components_ <= 0` reject (`:393-397`). Consumer: re-export `lib.rs:167` + both `fit`. Tests `tests/divergence_random_projection_jl_2347.rs` |
27//! | REQ-7 | `components_` `(n_components, n_features)` orientation (#2346) | SHIPPED (orientation) — CSR sparse storage + `dense_output` NOT-STARTED | `components()` / `projection()` now store `(n_components, n_features)` matching sklearn `:419`, transform `X @ components_.T` (`:604,810`); CSR sparse storage + `dense_output` still dense — blocker #1391. Tests `tests/divergence_random_projection_components_orientation_2345.rs` |
28//! | REQ-8 | `inverse_transform` + `compute_inverse_components` | NOT-STARTED | sklearn `random_projection.py:356,431-458` — blocker #1392 |
29//! | REQ-9 | `n_components > n_features` warning + `components_`/`n_components_`/`n_features_in_`/`density_` attrs + `get_feature_names_out` | SHIPPED (attrs) — `DataDimensionalityWarning` emission NOT-STARTED | `n_components_()`/`n_features_in_()`/`density_()` getters + `(n_components,n_features)` `components_` (`:416,419,788`); the `n_components>n_features` `DataDimensionalityWarning` (`:407-414`) is NOT emitted (no warning facade in ferrolearn — fit still proceeds) + `get_feature_names_out` absent — blocker #1393 |
30//! | REQ-10 | PyO3 binding | NOT-STARTED | no `ferrolearn-python` registration — blocker #1394 |
31//! | REQ-11 | ferray substrate | NOT-STARTED | dense `Array2` + `num_traits::Float` only — blocker #1395 |
32
33use ferrolearn_core::error::FerroError;
34use ferrolearn_core::pipeline::{FittedPipelineTransformer, PipelineTransformer};
35use ferrolearn_core::traits::{Fit, FitTransform, Transform};
36use ndarray::{Array1, Array2};
37use num_traits::Float;
38use rand::SeedableRng;
39use rand::rngs::SmallRng;
40use rand_distr::{Distribution, StandardNormal};
41
42// ---------------------------------------------------------------------------
43// johnson_lindenstrauss_min_dim + NComponents
44// ---------------------------------------------------------------------------
45
46/// Find a "safe" number of components to randomly project to.
47///
48/// Mirrors scikit-learn 1.5.2 `johnson_lindenstrauss_min_dim`
49/// (`sklearn/random_projection.py:64-143`). The minimum number of components
50/// that guarantees an `eps`-embedding for `n_samples` points is
51///
52/// ```text
53/// n_components >= 4 * ln(n_samples) / (eps^2 / 2 - eps^3 / 3)
54/// ```
55///
56/// sklearn casts the result with `.astype(np.int64)`
57/// (`random_projection.py:142-143`), which TRUNCATES toward zero (a FLOOR for
58/// the positive values this expression produces). This function matches that
59/// truncation, e.g. `johnson_lindenstrauss_min_dim(1000, 0.1) == 5920`.
60///
61/// # Errors
62///
63/// Returns [`FerroError::InvalidParameter`] if `eps` is not in the open
64/// interval `(0, 1)` (sklearn raises `ValueError`, `random_projection.py:133`),
65/// or if `n_samples == 0` (sklearn `random_projection.py:136-140`).
66#[must_use = "the returned target dimension should be used"]
67pub fn johnson_lindenstrauss_min_dim<F: Float>(
68 n_samples: usize,
69 eps: F,
70) -> Result<usize, FerroError> {
71 // sklearn random_projection.py:133 — reject eps outside ]0, 1[.
72 let zero = F::zero();
73 let one = F::one();
74 if !(eps > zero && eps < one) {
75 return Err(FerroError::InvalidParameter {
76 name: "eps".into(),
77 reason: "the JL bound is defined for eps in (0, 1)".into(),
78 });
79 }
80 // sklearn random_projection.py:136 — reject n_samples <= 0.
81 if n_samples == 0 {
82 return Err(FerroError::InvalidParameter {
83 name: "n_samples".into(),
84 reason: "the JL bound is defined for n_samples greater than zero".into(),
85 });
86 }
87
88 // denominator = eps^2 / 2 - eps^3 / 3 (random_projection.py:142).
89 let two = one + one;
90 let three = two + one;
91 let denominator = (eps * eps / two) - (eps * eps * eps / three);
92
93 let n = F::from(n_samples).unwrap_or_else(F::infinity);
94 let value = F::from(4.0).unwrap_or_else(F::one) * n.ln() / denominator;
95
96 // .astype(np.int64) truncates toward zero (random_projection.py:143).
97 // `value` is positive for n_samples >= 1 and eps in (0, 1), so truncation
98 // equals floor. Guard against a non-finite/negative result defensively.
99 let truncated = value.trunc();
100 if !truncated.is_finite() || truncated < zero {
101 return Err(FerroError::InvalidParameter {
102 name: "n_samples".into(),
103 reason: "the JL bound produced a non-finite target dimension".into(),
104 });
105 }
106 Ok(truncated.to_usize().unwrap_or(0))
107}
108
109/// Target projection dimensionality, mirroring sklearn's `n_components`
110/// parameter which accepts an integer or the string `'auto'`
111/// (`random_projection.py:314-317`).
112#[derive(Debug, Clone, Copy, PartialEq, Eq)]
113pub enum NComponents {
114 /// Resolve `n_components` from the Johnson-Lindenstrauss bound at fit time
115 /// using the configured `eps` (sklearn `n_components='auto'`,
116 /// `random_projection.py:388-391`). This is the sklearn default.
117 Auto,
118 /// An explicit, user-provided number of output dimensions.
119 Fixed(usize),
120}
121
122impl Default for NComponents {
123 /// sklearn defaults `n_components='auto'` (`random_projection.py:326,477`).
124 fn default() -> Self {
125 Self::Auto
126 }
127}
128
129impl From<usize> for NComponents {
130 fn from(value: usize) -> Self {
131 Self::Fixed(value)
132 }
133}
134
135/// Resolve the configured `n_components` to a concrete dimension at fit time.
136///
137/// For [`NComponents::Auto`] this calls [`johnson_lindenstrauss_min_dim`] with
138/// `n_samples` and `eps` (sklearn `random_projection.py:388-391`). For both
139/// branches a resolved value of `0` is rejected — sklearn rejects the `auto`
140/// bound resolving to `<= 0` (`random_projection.py:393-397`) and requires
141/// explicit `n_components >= 1` (`_parameter_constraints`, `:314-315`).
142///
143/// For the [`NComponents::Auto`] path ONLY, a resolved dimension GREATER than
144/// `n_features` is also rejected — sklearn raises `ValueError` when the
145/// JL-resolved `n_components_` exceeds the original space
146/// (`random_projection.py:399-405`). The [`NComponents::Fixed`] path does NOT
147/// error here: sklearn only emits a `DataDimensionalityWarning` for an explicit
148/// `n_components > n_features` (`random_projection.py:407-414`, NOT-STARTED
149/// warning REQ-9 / blocker #1393), so the fixed path proceeds unchanged.
150///
151/// # Errors
152///
153/// Returns [`FerroError::InvalidParameter`] if the resolved dimension is `0`,
154/// if the `Auto` resolved dimension exceeds `n_features`, or if the JL bound's
155/// `eps`/`n_samples` validation fails.
156fn resolve_n_components(
157 n_components: NComponents,
158 eps: f64,
159 n_samples: usize,
160 n_features: usize,
161) -> Result<usize, FerroError> {
162 let resolved = match n_components {
163 NComponents::Fixed(k) => k,
164 NComponents::Auto => johnson_lindenstrauss_min_dim(n_samples, eps)?,
165 };
166 if resolved == 0 {
167 return Err(FerroError::InvalidParameter {
168 name: "n_components".into(),
169 reason: match n_components {
170 NComponents::Auto => format!(
171 "eps={eps} and n_samples={n_samples} lead to a target dimension of 0 which is invalid"
172 ),
173 NComponents::Fixed(_) => "must be >= 1".into(),
174 },
175 });
176 }
177 // AUTO PATH ONLY (sklearn `random_projection.py:399-405`): a JL-resolved
178 // target dimension larger than the original space is a ValueError. The
179 // Fixed path is intentionally excluded — sklearn only WARNS there (`:407-414`).
180 if matches!(n_components, NComponents::Auto) && resolved > n_features {
181 return Err(FerroError::InvalidParameter {
182 name: "n_components".into(),
183 reason: format!(
184 "eps={eps} and n_samples={n_samples} lead to a target dimension of {resolved} which is larger than the original space with n_features={n_features}"
185 ),
186 });
187 }
188 Ok(resolved)
189}
190
191// ---------------------------------------------------------------------------
192// GaussianRandomProjection
193// ---------------------------------------------------------------------------
194
195/// Gaussian random projection transformer.
196///
197/// Projects data into a lower-dimensional space using a random matrix drawn
198/// from `N(0, 1/n_components)`.
199///
200/// # Examples
201///
202/// ```
203/// use ferrolearn_preprocess::random_projection::GaussianRandomProjection;
204/// use ferrolearn_core::traits::{Fit, Transform};
205/// use ndarray::Array2;
206///
207/// let x = Array2::<f64>::ones((10, 50));
208/// let proj = GaussianRandomProjection::<f64>::new(5);
209/// let fitted = proj.fit(&x, &()).unwrap();
210/// let out = fitted.transform(&x).unwrap();
211/// assert_eq!(out.shape(), &[10, 5]);
212/// ```
213#[derive(Debug, Clone)]
214pub struct GaussianRandomProjection<F> {
215 /// Number of output dimensions, or [`NComponents::Auto`] to resolve via the
216 /// Johnson-Lindenstrauss bound at fit time. Defaults to `Auto`.
217 n_components: NComponents,
218 /// Distortion-rate parameter for the `Auto` JL bound. Defaults to `0.1`
219 /// (sklearn `eps=0.1`, `random_projection.py:328`).
220 eps: f64,
221 /// Optional RNG seed for reproducibility.
222 random_state: Option<u64>,
223 _marker: std::marker::PhantomData<F>,
224}
225
226impl<F: Float + Send + Sync + 'static> GaussianRandomProjection<F> {
227 /// Create a new Gaussian random projection with an explicit `n_components`
228 /// number of output dimensions (`NComponents::Fixed`).
229 #[must_use]
230 pub fn new(n_components: usize) -> Self {
231 Self {
232 n_components: NComponents::Fixed(n_components),
233 eps: 0.1,
234 random_state: None,
235 _marker: std::marker::PhantomData,
236 }
237 }
238
239 /// Create a new Gaussian random projection with `n_components='auto'`
240 /// (sklearn default): the target dimension is resolved at fit time from the
241 /// Johnson-Lindenstrauss bound for the configured `eps`
242 /// (`random_projection.py:388-391`).
243 #[must_use]
244 pub fn new_auto() -> Self {
245 Self {
246 n_components: NComponents::Auto,
247 eps: 0.1,
248 random_state: None,
249 _marker: std::marker::PhantomData,
250 }
251 }
252
253 /// Set the distortion-rate `eps` used by the `Auto` JL bound
254 /// (sklearn `eps`, `random_projection.py:328`).
255 #[must_use]
256 pub fn eps(mut self, eps: f64) -> Self {
257 self.eps = eps;
258 self
259 }
260
261 /// Set the random seed for reproducibility.
262 #[must_use]
263 pub fn random_state(mut self, seed: u64) -> Self {
264 self.random_state = Some(seed);
265 self
266 }
267}
268
269/// Fitted Gaussian random projection holding the projection matrix.
270#[derive(Debug, Clone)]
271pub struct FittedGaussianRandomProjection<F> {
272 /// Projection matrix of shape `(n_components, n_features)` — the sklearn
273 /// `components_` orientation (`random_projection.py:419`).
274 components: Array2<F>,
275 /// Resolved number of output dimensions (`n_components_`).
276 n_components: usize,
277 /// Number of features seen during fit (`n_features_in_`).
278 n_features_in: usize,
279}
280
281impl<F: Float + Send + Sync + 'static> FittedGaussianRandomProjection<F> {
282 /// Return a reference to the projection matrix `components_` of shape
283 /// `(n_components, n_features)` (sklearn `components_`,
284 /// `random_projection.py:419`).
285 #[must_use]
286 pub fn components(&self) -> &Array2<F> {
287 &self.components
288 }
289
290 /// Alias for [`Self::components`] kept for backward compatibility. Returns
291 /// the projection matrix in the sklearn `(n_components, n_features)`
292 /// orientation.
293 #[must_use]
294 pub fn projection(&self) -> &Array2<F> {
295 &self.components
296 }
297
298 /// Resolved number of output dimensions (`n_components_`).
299 #[must_use]
300 pub fn n_components_(&self) -> usize {
301 self.n_components
302 }
303
304 /// Number of features seen during fit (`n_features_in_`).
305 #[must_use]
306 pub fn n_features_in_(&self) -> usize {
307 self.n_features_in
308 }
309}
310
311impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for GaussianRandomProjection<F> {
312 type Fitted = FittedGaussianRandomProjection<F>;
313 type Error = FerroError;
314
315 /// Fit the projection by generating a random matrix `R ~ N(0, 1/n_components)`.
316 ///
317 /// When `n_components` is [`NComponents::Auto`] the target dimension is
318 /// resolved from the Johnson-Lindenstrauss bound for `eps`
319 /// (`random_projection.py:388-391`).
320 ///
321 /// # Errors
322 ///
323 /// Returns [`FerroError::InvalidParameter`] if the resolved `n_components`
324 /// is `0` (explicit, or the JL bound resolving to `<= 0`,
325 /// `random_projection.py:393-397`), or if `eps` is outside `(0, 1)` for the
326 /// `Auto` path. Returns [`FerroError::InsufficientSamples`] if `x` has zero
327 /// rows.
328 fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedGaussianRandomProjection<F>, FerroError> {
329 if x.nrows() == 0 {
330 return Err(FerroError::InsufficientSamples {
331 required: 1,
332 actual: 0,
333 context: "GaussianRandomProjection::fit".into(),
334 });
335 }
336
337 let n_features = x.ncols();
338 let n_components =
339 resolve_n_components(self.n_components, self.eps, x.nrows(), n_features)?;
340
341 let mut rng: SmallRng = match self.random_state {
342 Some(seed) => SmallRng::seed_from_u64(seed),
343 None => SmallRng::from_os_rng(),
344 };
345
346 let scale = F::one() / F::from(n_components).unwrap_or_else(F::one).sqrt();
347 let normal = StandardNormal;
348 // components_ orientation (n_components, n_features) — sklearn
349 // `random_projection.py:419`.
350 let mut components = Array2::zeros((n_components, n_features));
351 for v in &mut components {
352 let sample: f64 = normal.sample(&mut rng);
353 *v = F::from(sample).unwrap_or_else(F::zero) * scale;
354 }
355
356 Ok(FittedGaussianRandomProjection {
357 components,
358 n_components,
359 n_features_in: n_features,
360 })
361 }
362}
363
364impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedGaussianRandomProjection<F> {
365 type Output = Array2<F>;
366 type Error = FerroError;
367
368 /// Transform data by computing `X @ components_.T`
369 /// (sklearn `random_projection.py:604`).
370 ///
371 /// # Errors
372 ///
373 /// Returns [`FerroError::ShapeMismatch`] if `x.ncols() != n_features`.
374 fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
375 if x.ncols() != self.n_features_in {
376 return Err(FerroError::ShapeMismatch {
377 expected: vec![x.nrows(), self.n_features_in],
378 actual: vec![x.nrows(), x.ncols()],
379 context: "FittedGaussianRandomProjection::transform".into(),
380 });
381 }
382 // components_ is (n_components, n_features); X @ components_.T gives
383 // (n_samples, n_components) — sklearn `random_projection.py:604`.
384 Ok(x.dot(&self.components.t()))
385 }
386}
387
388impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for GaussianRandomProjection<F> {
389 type Output = Array2<F>;
390 type Error = FerroError;
391
392 /// Always returns an error — the projection must be fitted first.
393 fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
394 Err(FerroError::InvalidParameter {
395 name: "GaussianRandomProjection".into(),
396 reason: "projection must be fitted before calling transform; use fit() first".into(),
397 })
398 }
399}
400
401impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for GaussianRandomProjection<F> {
402 type FitError = FerroError;
403
404 /// Fit and transform in one step.
405 fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
406 let fitted = self.fit(x, &())?;
407 fitted.transform(x)
408 }
409}
410
411impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for GaussianRandomProjection<F> {
412 fn fit_pipeline(
413 &self,
414 x: &Array2<F>,
415 _y: &Array1<F>,
416 ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
417 let fitted = self.fit(x, &())?;
418 Ok(Box::new(fitted))
419 }
420}
421
422impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F>
423 for FittedGaussianRandomProjection<F>
424{
425 fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
426 self.transform(x)
427 }
428}
429
430// ---------------------------------------------------------------------------
431// SparseRandomProjection
432// ---------------------------------------------------------------------------
433
434/// Sparse random projection transformer.
435///
436/// Projects data into a lower-dimensional space using a sparse random matrix
437/// with entries `{-1, 0, +1}` drawn with probabilities
438/// `{d/2, 1 - d, d/2}`, scaled by `sqrt(1 / (d * n_components))`.
439///
440/// The default density `d = 1 / sqrt(n_features)` is used when not specified.
441///
442/// # Examples
443///
444/// ```
445/// use ferrolearn_preprocess::random_projection::SparseRandomProjection;
446/// use ferrolearn_core::traits::{Fit, Transform};
447/// use ndarray::Array2;
448///
449/// let x = Array2::<f64>::ones((10, 100));
450/// let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
451/// let fitted = proj.fit(&x, &()).unwrap();
452/// let out = fitted.transform(&x).unwrap();
453/// assert_eq!(out.shape(), &[10, 5]);
454/// ```
455#[derive(Debug, Clone)]
456pub struct SparseRandomProjection<F> {
457 /// Number of output dimensions, or [`NComponents::Auto`] to resolve via the
458 /// Johnson-Lindenstrauss bound at fit time. Defaults to `Auto`.
459 n_components: NComponents,
460 /// Density of non-zero entries. `None` means `'auto' = 1/sqrt(n_features)`.
461 density: Option<f64>,
462 /// Distortion-rate parameter for the `Auto` JL bound. Defaults to `0.1`.
463 eps: f64,
464 /// Optional RNG seed for reproducibility.
465 random_state: Option<u64>,
466 _marker: std::marker::PhantomData<F>,
467}
468
469impl<F: Float + Send + Sync + 'static> SparseRandomProjection<F> {
470 /// Create a new sparse random projection with an explicit `n_components`
471 /// number of output dimensions (`NComponents::Fixed`).
472 #[must_use]
473 pub fn new(n_components: usize) -> Self {
474 Self {
475 n_components: NComponents::Fixed(n_components),
476 density: None,
477 eps: 0.1,
478 random_state: None,
479 _marker: std::marker::PhantomData,
480 }
481 }
482
483 /// Create a new sparse random projection with `n_components='auto'`
484 /// (sklearn default): resolved at fit time from the Johnson-Lindenstrauss
485 /// bound for the configured `eps` (`random_projection.py:388-391`).
486 #[must_use]
487 pub fn new_auto() -> Self {
488 Self {
489 n_components: NComponents::Auto,
490 density: None,
491 eps: 0.1,
492 random_state: None,
493 _marker: std::marker::PhantomData,
494 }
495 }
496
497 /// Set the density of non-zero entries.
498 #[must_use]
499 pub fn density(mut self, density: f64) -> Self {
500 self.density = Some(density);
501 self
502 }
503
504 /// Set the distortion-rate `eps` used by the `Auto` JL bound.
505 #[must_use]
506 pub fn eps(mut self, eps: f64) -> Self {
507 self.eps = eps;
508 self
509 }
510
511 /// Set the random seed for reproducibility.
512 #[must_use]
513 pub fn random_state(mut self, seed: u64) -> Self {
514 self.random_state = Some(seed);
515 self
516 }
517}
518
519/// Fitted sparse random projection holding the projection matrix.
520#[derive(Debug, Clone)]
521pub struct FittedSparseRandomProjection<F> {
522 /// Projection matrix of shape `(n_components, n_features)` — the sklearn
523 /// `components_` orientation (`random_projection.py:419`).
524 components: Array2<F>,
525 /// Resolved number of output dimensions (`n_components_`).
526 n_components: usize,
527 /// Number of features seen during fit (`n_features_in_`).
528 n_features_in: usize,
529 /// Resolved density (`density_`), i.e. `1/sqrt(n_features)` when `'auto'`
530 /// (sklearn `density_`, `random_projection.py:788`).
531 density: f64,
532}
533
534impl<F: Float + Send + Sync + 'static> FittedSparseRandomProjection<F> {
535 /// Return a reference to the projection matrix `components_` of shape
536 /// `(n_components, n_features)` (sklearn `components_`,
537 /// `random_projection.py:419`).
538 #[must_use]
539 pub fn components(&self) -> &Array2<F> {
540 &self.components
541 }
542
543 /// Alias for [`Self::components`] kept for backward compatibility. Returns
544 /// the projection matrix in the sklearn `(n_components, n_features)`
545 /// orientation.
546 #[must_use]
547 pub fn projection(&self) -> &Array2<F> {
548 &self.components
549 }
550
551 /// Resolved number of output dimensions (`n_components_`).
552 #[must_use]
553 pub fn n_components_(&self) -> usize {
554 self.n_components
555 }
556
557 /// Number of features seen during fit (`n_features_in_`).
558 #[must_use]
559 pub fn n_features_in_(&self) -> usize {
560 self.n_features_in
561 }
562
563 /// Resolved density (`density_`), i.e. `1/sqrt(n_features)` when the
564 /// constructor density was left at the `'auto'` default
565 /// (sklearn `density_`, `random_projection.py:788`).
566 #[must_use]
567 pub fn density_(&self) -> f64 {
568 self.density
569 }
570}
571
572impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for SparseRandomProjection<F> {
573 type Fitted = FittedSparseRandomProjection<F>;
574 type Error = FerroError;
575
576 /// Fit the projection by generating a sparse random matrix.
577 ///
578 /// When `n_components` is [`NComponents::Auto`] the target dimension is
579 /// resolved from the Johnson-Lindenstrauss bound for `eps`
580 /// (`random_projection.py:388-391`).
581 ///
582 /// # Errors
583 ///
584 /// Returns [`FerroError::InvalidParameter`] if the resolved `n_components`
585 /// is `0` or `density` is not in `(0, 1]`.
586 /// Returns [`FerroError::InsufficientSamples`] if `x` has zero rows.
587 fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedSparseRandomProjection<F>, FerroError> {
588 if x.nrows() == 0 {
589 return Err(FerroError::InsufficientSamples {
590 required: 1,
591 actual: 0,
592 context: "SparseRandomProjection::fit".into(),
593 });
594 }
595
596 let n_features = x.ncols();
597 let n_components =
598 resolve_n_components(self.n_components, self.eps, x.nrows(), n_features)?;
599
600 // density='auto' => 1/sqrt(n_features) (sklearn `_check_density:148-149`).
601 let d = self
602 .density
603 .unwrap_or_else(|| 1.0 / (n_features as f64).sqrt());
604
605 if d <= 0.0 || d > 1.0 {
606 return Err(FerroError::InvalidParameter {
607 name: "density".into(),
608 reason: format!("must be in (0, 1], got {d}"),
609 });
610 }
611
612 let mut rng: SmallRng = match self.random_state {
613 Some(seed) => SmallRng::seed_from_u64(seed),
614 None => SmallRng::from_os_rng(),
615 };
616
617 let scale = F::from(1.0 / (d * n_components as f64).sqrt()).unwrap_or_else(F::one);
618 let uniform =
619 rand::distr::Uniform::new(0.0_f64, 1.0).map_err(|e| FerroError::InvalidParameter {
620 name: "density".into(),
621 reason: format!("failed to build uniform sampler: {e}"),
622 })?;
623
624 // components_ orientation (n_components, n_features) — sklearn
625 // `random_projection.py:419`.
626 let mut components = Array2::zeros((n_components, n_features));
627 for v in &mut components {
628 let u: f64 = uniform.sample(&mut rng);
629 if u < d / 2.0 {
630 *v = scale.neg();
631 } else if u < d {
632 *v = scale;
633 }
634 // else: remains 0
635 }
636
637 Ok(FittedSparseRandomProjection {
638 components,
639 n_components,
640 n_features_in: n_features,
641 density: d,
642 })
643 }
644}
645
646impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedSparseRandomProjection<F> {
647 type Output = Array2<F>;
648 type Error = FerroError;
649
650 /// Transform data by computing `X @ components_.T`
651 /// (sklearn `random_projection.py:810`).
652 ///
653 /// # Errors
654 ///
655 /// Returns [`FerroError::ShapeMismatch`] if `x.ncols() != n_features`.
656 fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
657 if x.ncols() != self.n_features_in {
658 return Err(FerroError::ShapeMismatch {
659 expected: vec![x.nrows(), self.n_features_in],
660 actual: vec![x.nrows(), x.ncols()],
661 context: "FittedSparseRandomProjection::transform".into(),
662 });
663 }
664 // components_ is (n_components, n_features); X @ components_.T gives
665 // (n_samples, n_components) — sklearn `random_projection.py:810`.
666 Ok(x.dot(&self.components.t()))
667 }
668}
669
670impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for SparseRandomProjection<F> {
671 type Output = Array2<F>;
672 type Error = FerroError;
673
674 /// Always returns an error — the projection must be fitted first.
675 fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
676 Err(FerroError::InvalidParameter {
677 name: "SparseRandomProjection".into(),
678 reason: "projection must be fitted before calling transform; use fit() first".into(),
679 })
680 }
681}
682
683impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for SparseRandomProjection<F> {
684 type FitError = FerroError;
685
686 /// Fit and transform in one step.
687 fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
688 let fitted = self.fit(x, &())?;
689 fitted.transform(x)
690 }
691}
692
693impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for SparseRandomProjection<F> {
694 fn fit_pipeline(
695 &self,
696 x: &Array2<F>,
697 _y: &Array1<F>,
698 ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
699 let fitted = self.fit(x, &())?;
700 Ok(Box::new(fitted))
701 }
702}
703
704impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F>
705 for FittedSparseRandomProjection<F>
706{
707 fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
708 self.transform(x)
709 }
710}
711
712// ---------------------------------------------------------------------------
713// Tests
714// ---------------------------------------------------------------------------
715
716#[cfg(test)]
717mod tests {
718 use super::*;
719 use ndarray::Array2;
720
721 // -- GaussianRandomProjection --
722
723 #[test]
724 fn test_gaussian_rp_output_shape() {
725 let x = Array2::<f64>::ones((10, 50));
726 let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
727 let fitted = proj.fit(&x, &()).unwrap();
728 let out = fitted.transform(&x).unwrap();
729 assert_eq!(out.shape(), &[10, 5]);
730 }
731
732 #[test]
733 fn test_gaussian_rp_deterministic() {
734 let x = Array2::<f64>::ones((10, 20));
735 let proj = GaussianRandomProjection::<f64>::new(3).random_state(42);
736 let fitted1 = proj.fit(&x, &()).unwrap();
737 let out1 = fitted1.transform(&x).unwrap();
738 let fitted2 = proj.fit(&x, &()).unwrap();
739 let out2 = fitted2.transform(&x).unwrap();
740 for (a, b) in out1.iter().zip(out2.iter()) {
741 assert!((a - b).abs() < 1e-10);
742 }
743 }
744
745 #[test]
746 fn test_gaussian_rp_zero_components() {
747 let x = Array2::<f64>::ones((5, 10));
748 let proj = GaussianRandomProjection::<f64>::new(0);
749 assert!(proj.fit(&x, &()).is_err());
750 }
751
752 #[test]
753 fn test_gaussian_rp_empty_input() {
754 let x = Array2::<f64>::zeros((0, 10));
755 let proj = GaussianRandomProjection::<f64>::new(5);
756 assert!(proj.fit(&x, &()).is_err());
757 }
758
759 #[test]
760 fn test_gaussian_rp_shape_mismatch() {
761 let x_train = Array2::<f64>::ones((10, 20));
762 let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
763 let fitted = proj.fit(&x_train, &()).unwrap();
764 let x_bad = Array2::<f64>::ones((5, 15));
765 assert!(fitted.transform(&x_bad).is_err());
766 }
767
768 #[test]
769 fn test_gaussian_rp_fit_transform() {
770 let x = Array2::<f64>::ones((10, 20));
771 let proj = GaussianRandomProjection::<f64>::new(5).random_state(42);
772 let out = proj.fit_transform(&x).unwrap();
773 assert_eq!(out.shape(), &[10, 5]);
774 }
775
776 #[test]
777 fn test_gaussian_rp_f32() {
778 let x = Array2::<f32>::ones((5, 10));
779 let proj = GaussianRandomProjection::<f32>::new(3).random_state(42);
780 let fitted = proj.fit(&x, &()).unwrap();
781 let out = fitted.transform(&x).unwrap();
782 assert_eq!(out.shape(), &[5, 3]);
783 }
784
785 // -- SparseRandomProjection --
786
787 #[test]
788 fn test_sparse_rp_output_shape() {
789 let x = Array2::<f64>::ones((10, 100));
790 let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
791 let fitted = proj.fit(&x, &()).unwrap();
792 let out = fitted.transform(&x).unwrap();
793 assert_eq!(out.shape(), &[10, 5]);
794 }
795
796 #[test]
797 fn test_sparse_rp_deterministic() {
798 let x = Array2::<f64>::ones((10, 50));
799 let proj = SparseRandomProjection::<f64>::new(3).random_state(42);
800 let fitted1 = proj.fit(&x, &()).unwrap();
801 let out1 = fitted1.transform(&x).unwrap();
802 let fitted2 = proj.fit(&x, &()).unwrap();
803 let out2 = fitted2.transform(&x).unwrap();
804 for (a, b) in out1.iter().zip(out2.iter()) {
805 assert!((a - b).abs() < 1e-10);
806 }
807 }
808
809 #[test]
810 fn test_sparse_rp_sparsity() {
811 let x = Array2::<f64>::ones((5, 100));
812 let proj = SparseRandomProjection::<f64>::new(10).random_state(42);
813 let fitted = proj.fit(&x, &()).unwrap();
814 let r = fitted.projection();
815 // With density = 1/sqrt(100) = 0.1, about 90% should be zero
816 let total = r.len();
817 let zeros = r.iter().filter(|&&v| v == 0.0).count();
818 let sparsity = zeros as f64 / total as f64;
819 assert!(
820 sparsity > 0.5,
821 "expected sparse matrix, got sparsity={sparsity}"
822 );
823 }
824
825 #[test]
826 fn test_sparse_rp_custom_density() {
827 let x = Array2::<f64>::ones((5, 20));
828 let proj = SparseRandomProjection::<f64>::new(5)
829 .density(0.5)
830 .random_state(42);
831 let fitted = proj.fit(&x, &()).unwrap();
832 let out = fitted.transform(&x).unwrap();
833 assert_eq!(out.shape(), &[5, 5]);
834 }
835
836 #[test]
837 fn test_sparse_rp_zero_components() {
838 let x = Array2::<f64>::ones((5, 10));
839 let proj = SparseRandomProjection::<f64>::new(0);
840 assert!(proj.fit(&x, &()).is_err());
841 }
842
843 #[test]
844 fn test_sparse_rp_invalid_density() {
845 let x = Array2::<f64>::ones((5, 10));
846 let proj = SparseRandomProjection::<f64>::new(5).density(0.0);
847 assert!(proj.fit(&x, &()).is_err());
848 }
849
850 #[test]
851 fn test_sparse_rp_empty_input() {
852 let x = Array2::<f64>::zeros((0, 10));
853 let proj = SparseRandomProjection::<f64>::new(5);
854 assert!(proj.fit(&x, &()).is_err());
855 }
856
857 #[test]
858 fn test_sparse_rp_shape_mismatch() {
859 let x_train = Array2::<f64>::ones((10, 20));
860 let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
861 let fitted = proj.fit(&x_train, &()).unwrap();
862 let x_bad = Array2::<f64>::ones((5, 15));
863 assert!(fitted.transform(&x_bad).is_err());
864 }
865
866 #[test]
867 fn test_sparse_rp_fit_transform() {
868 let x = Array2::<f64>::ones((10, 20));
869 let proj = SparseRandomProjection::<f64>::new(5).random_state(42);
870 let out = proj.fit_transform(&x).unwrap();
871 assert_eq!(out.shape(), &[10, 5]);
872 }
873
874 #[test]
875 fn test_sparse_rp_f32() {
876 let x = Array2::<f32>::ones((5, 10));
877 let proj = SparseRandomProjection::<f32>::new(3).random_state(42);
878 let fitted = proj.fit(&x, &()).unwrap();
879 let out = fitted.transform(&x).unwrap();
880 assert_eq!(out.shape(), &[5, 3]);
881 }
882}