use std::{borrow::Cow, num::NonZeroUsize, ops::AddAssign};
use ndarray::{Array, ArrayBase, ArrayViewMut, Data, Dimension, Ix2, ShapeError, Zip, s};
use num_traits::{ConstOne, ConstZero, Float, FloatConst};
use numcodecs::{
AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
Codec, StaticCodec, StaticCodecConfig, StaticCodecVersion,
};
use schemars::{JsonSchema, Schema, SchemaGenerator, json_schema};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use thiserror::Error;
#[cfg(test)]
use ::serde_json as _;
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
pub struct RandomProjectionCodec {
pub seed: u64,
#[serde(flatten)]
pub reduction: RandomProjectionReduction,
#[serde(flatten)]
pub projection: RandomProjectionKind,
#[serde(default, rename = "_version")]
pub version: StaticCodecVersion<0, 1, 0>,
}
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
#[serde(tag = "reduction", rename_all = "kebab-case")]
pub enum RandomProjectionReduction {
JohnsonLindenstrauss {
epsilon: OpenClosedUnit<f64>,
},
Explicit {
k: NonZeroUsize,
},
}
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
#[serde(tag = "projection", rename_all = "kebab-case")]
pub enum RandomProjectionKind {
Gaussian,
Sparse {
#[serde(default, skip_serializing_if = "Option::is_none")]
density: Option<OpenClosedUnit<f64>>,
},
}
impl Codec for RandomProjectionCodec {
type Error = RandomProjectionCodecError;
fn encode(&self, data: AnyCowArray) -> Result<AnyArray, Self::Error> {
match data {
AnyCowArray::F32(data) => Ok(AnyArray::F32(
project_with_projection(data, self.seed, &self.reduction, &self.projection)?
.into_dyn(),
)),
AnyCowArray::F64(data) => Ok(AnyArray::F64(
project_with_projection(data, self.seed, &self.reduction, &self.projection)?
.into_dyn(),
)),
encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
encoded.dtype(),
)),
}
}
fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
match encoded {
AnyCowArray::F32(encoded) => Ok(AnyArray::F32(
reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
)),
AnyCowArray::F64(encoded) => Ok(AnyArray::F64(
reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
)),
encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
encoded.dtype(),
)),
}
}
fn decode_into(
&self,
encoded: AnyArrayView,
decoded: AnyArrayViewMut,
) -> Result<(), Self::Error> {
match (encoded, decoded) {
(AnyArrayView::F32(encoded), AnyArrayViewMut::F32(decoded)) => {
reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
}
(AnyArrayView::F64(encoded), AnyArrayViewMut::F64(decoded)) => {
reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
}
(encoded @ (AnyArrayView::F32(_) | AnyArrayView::F64(_)), decoded) => {
Err(RandomProjectionCodecError::MismatchedDecodeIntoArray {
source: AnyArrayAssignError::DTypeMismatch {
src: encoded.dtype(),
dst: decoded.dtype(),
},
})
}
(encoded, _decoded) => Err(RandomProjectionCodecError::UnsupportedDtype(
encoded.dtype(),
)),
}
}
}
impl StaticCodec for RandomProjectionCodec {
const CODEC_ID: &'static str = "random-projection.rs";
type Config<'de> = Self;
fn from_config(config: Self::Config<'_>) -> Self {
config
}
fn get_config(&self) -> StaticCodecConfig<'_, Self> {
StaticCodecConfig::from(self)
}
}
#[derive(Debug, Error)]
pub enum RandomProjectionCodecError {
#[error("RandomProjection does not support the dtype {0}")]
UnsupportedDtype(AnyArrayDType),
#[error("RandomProjection only supports matrix / 2d-shaped arrays")]
NonMatrixData {
#[from]
source: ShapeError,
},
#[error("RandomProjection does not support non-finite (infinite or NaN) floating point data")]
NonFiniteData,
#[error(
"RandomProjection cannot encode or decode from an array with {input} samples to an array with {output} samples"
)]
NumberOfSamplesMismatch {
input: usize,
output: usize,
},
#[error("RandomProjection cannot decode from an array with zero dimensionality `K`")]
ProjectedArrayZeroComponents,
#[error("RandomProjection cannot decode from an array with corrupted dimensionality metadata")]
CorruptedNumberOfComponents,
#[error(
"RandomProjection cannot decode into an array with {output} features that differs from the {metadata} features stored in the encoded metadata"
)]
NumberOfFeaturesMismatch {
metadata: usize,
output: usize,
},
#[error("RandomProjection cannot decode into the provided array")]
MismatchedDecodeIntoArray {
#[from]
source: AnyArrayAssignError,
},
}
pub fn project_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
data: ArrayBase<S, D>,
seed: u64,
reduction: &RandomProjectionReduction,
projection: &RandomProjectionKind,
) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
let data = data
.into_dimensionality()
.map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
let (n, d) = data.dim();
let k = match reduction {
RandomProjectionReduction::JohnsonLindenstrauss { epsilon } => {
johnson_lindenstrauss_min_k(n, *epsilon)
}
RandomProjectionReduction::Explicit { k } => k.get(),
};
let mut projected = Array::<T, Ix2>::from_elem((n, k + 1), T::ZERO);
for p in projected.slice_mut(s!(.., k)) {
*p = T::from_usize(d);
}
match projection {
RandomProjectionKind::Gaussian => project_into(
data,
projected.slice_mut(s!(.., ..k)),
|x, y| gaussian_project(x, y, seed),
gaussian_normaliser(k),
),
RandomProjectionKind::Sparse { density } => {
let density = density_or_ping_li_minimum(*density, d);
project_into(
data,
projected.slice_mut(s!(.., ..k)),
|x, y| sparse_project(x, y, density, seed),
sparse_normaliser(k, density),
)
}
}?;
Ok(projected)
}
#[expect(clippy::needless_pass_by_value)]
pub fn project_into<T: FloatExt, S: Data<Elem = T>>(
data: ArrayBase<S, Ix2>,
mut projected: ArrayViewMut<T, Ix2>,
projection: impl Fn(usize, usize) -> T,
normalizer: T,
) -> Result<(), RandomProjectionCodecError> {
let (n, d) = data.dim();
let (n2, _k) = projected.dim();
if n2 != n {
return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
input: n,
output: n2,
});
}
let mut skip_projection_column = Vec::with_capacity(d);
for (j, projected_j) in projected.columns_mut().into_iter().enumerate() {
skip_projection_column.clear();
for l in 0..d {
let p = projection(l, j);
if !p.is_zero() {
skip_projection_column.push((l, p));
}
}
for (data_i, projected_ij) in data.rows().into_iter().zip(projected_j) {
let mut acc = T::ZERO;
#[expect(clippy::indexing_slicing)]
for &(l, p) in &skip_projection_column {
acc += data_i[l] * p;
}
*projected_ij = acc * normalizer;
}
}
if !Zip::from(projected).all(|x| x.is_finite()) {
return Err(RandomProjectionCodecError::NonFiniteData);
}
Ok(())
}
pub fn reconstruct_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
projected: ArrayBase<S, D>,
seed: u64,
projection: &RandomProjectionKind,
) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
let projected = projected
.into_dimensionality()
.map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
if projected.is_empty() {
return Ok(projected.to_owned());
}
let (_n, k): (usize, usize) = projected.dim();
let Some(k) = k.checked_sub(1) else {
return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
};
let ds = projected.slice(s!(.., k));
let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
Ok(None) => Ok(Some(d.into_usize())),
Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
_ => Err(()),
}) else {
return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
};
let projected = projected.slice_move(s!(.., ..k));
match projection {
RandomProjectionKind::Gaussian => reconstruct(
projected,
d,
|x, y| gaussian_project(x, y, seed),
gaussian_normaliser(k),
),
RandomProjectionKind::Sparse { density } => {
let density = density_or_ping_li_minimum(*density, d);
reconstruct(
projected,
d,
|x, y| sparse_project(x, y, density, seed),
sparse_normaliser(k, density),
)
}
}
}
#[expect(clippy::needless_pass_by_value)]
pub fn reconstruct<T: FloatExt, S: Data<Elem = T>>(
projected: ArrayBase<S, Ix2>,
d: usize,
projection: impl Fn(usize, usize) -> T,
normalizer: T,
) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
if projected.is_empty() {
return Ok(projected.to_owned());
}
let (n, k) = projected.dim();
let mut reconstructed = Array::<T, Ix2>::from_elem((n, d), T::ZERO);
let mut skip_projection_row = Vec::with_capacity(k);
for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
skip_projection_row.clear();
for j in 0..k {
let p = projection(l, j);
if !p.is_zero() {
skip_projection_row.push((j, p));
}
}
for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
let mut acc = T::ZERO;
#[expect(clippy::indexing_slicing)]
for &(j, p) in &skip_projection_row {
acc += projected_i[j] * p;
}
*reconstructed_il = acc * normalizer;
}
}
if !Zip::from(&reconstructed).all(|x| x.is_finite()) {
return Err(RandomProjectionCodecError::NonFiniteData);
}
Ok(reconstructed)
}
pub fn reconstruct_into_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
projected: ArrayBase<S, D>,
reconstructed: ArrayViewMut<T, D>,
seed: u64,
projection: &RandomProjectionKind,
) -> Result<(), RandomProjectionCodecError> {
let projected: ArrayBase<S, Ix2> = projected
.into_dimensionality()
.map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
let reconstructed: ArrayViewMut<T, Ix2> = reconstructed
.into_dimensionality()
.map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
let (n, k) = projected.dim();
let (n2, d2) = reconstructed.dim();
if n2 != n {
return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
input: n,
output: n2,
});
}
let Some(k) = k.checked_sub(1) else {
return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
};
let ds = projected.slice(s!(.., k));
let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
Ok(None) => Ok(Some(d.into_usize())),
Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
_ => Err(()),
}) else {
return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
};
if d2 != d {
return Err(RandomProjectionCodecError::NumberOfFeaturesMismatch {
metadata: d,
output: d2,
});
}
let projected = projected.slice_move(s!(.., ..k));
match projection {
RandomProjectionKind::Gaussian => reconstruct_into(
projected,
reconstructed,
|x, y| gaussian_project(x, y, seed),
gaussian_normaliser(k),
),
RandomProjectionKind::Sparse { density } => {
let density = density_or_ping_li_minimum(*density, d);
reconstruct_into(
projected,
reconstructed,
|x, y| sparse_project(x, y, density, seed),
sparse_normaliser(k, density),
)
}
}
}
#[expect(clippy::needless_pass_by_value)]
pub fn reconstruct_into<T: FloatExt, S: Data<Elem = T>>(
projected: ArrayBase<S, Ix2>,
mut reconstructed: ArrayViewMut<T, Ix2>,
projection: impl Fn(usize, usize) -> T,
normalizer: T,
) -> Result<(), RandomProjectionCodecError> {
let (n, k) = projected.dim();
let (n2, _d) = reconstructed.dim();
if n2 != n {
return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
input: n,
output: n2,
});
}
let mut skip_projection_row = Vec::with_capacity(k);
for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
skip_projection_row.clear();
for j in 0..k {
let p = projection(l, j);
if !p.is_zero() {
skip_projection_row.push((j, p));
}
}
for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
let mut acc = T::ZERO;
#[expect(clippy::indexing_slicing)]
for &(j, p) in &skip_projection_row {
acc += projected_i[j] * p;
}
*reconstructed_il = acc * normalizer;
}
}
if !Zip::from(reconstructed).all(|x| x.is_finite()) {
return Err(RandomProjectionCodecError::NonFiniteData);
}
Ok(())
}
#[must_use]
pub fn johnson_lindenstrauss_min_k(
n_samples: usize,
OpenClosedUnit(epsilon): OpenClosedUnit<f64>,
) -> usize {
#[expect(clippy::suboptimal_flops)]
let denominator = (epsilon * epsilon * 0.5) - (epsilon * epsilon * epsilon / 3.0);
#[expect(clippy::cast_precision_loss)]
let min_k = (n_samples as f64).ln() * 4.0 / denominator;
#[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
let min_k = min_k as usize;
min_k
}
#[must_use]
pub fn density_or_ping_li_minimum<T: FloatExt>(
density: Option<OpenClosedUnit<f64>>,
d: usize,
) -> T {
match density {
Some(OpenClosedUnit(density)) => T::from_f64(density),
None => T::from_usize(d).sqrt().recip(),
}
}
fn gaussian_project<T: FloatExt>(x: usize, y: usize, seed: u64) -> T {
let (ClosedOpenUnit(u0), OpenClosedUnit(u1)) = T::u01x2(hash_matrix_index(x, y, seed));
let theta = -T::TAU() * u0;
let r = (-T::TWO * u1.ln()).sqrt();
r * theta.sin()
}
fn gaussian_normaliser<T: FloatExt>(k: usize) -> T {
T::from_usize(k).sqrt().recip()
}
fn sparse_project<T: FloatExt>(x: usize, y: usize, density: T, seed: u64) -> T {
let (ClosedOpenUnit(u0), _u1) = T::u01x2(hash_matrix_index(x, y, seed));
if u0 < (density * T::HALF) {
-T::ONE
} else if u0 < density {
T::ONE
} else {
T::ZERO
}
}
fn sparse_normaliser<T: FloatExt>(k: usize, density: T) -> T {
(T::from_usize(k) * density).recip().sqrt()
}
const fn hash_matrix_index(x: usize, y: usize, seed: u64) -> u64 {
seahash_diffuse(seahash_diffuse(x as u64) ^ seed ^ (y as u64))
}
const fn seahash_diffuse(mut x: u64) -> u64 {
x = x.wrapping_mul(0x6eed_0e9d_a4d9_4a4f);
let a = x >> 32;
let b = x >> 60;
x ^= a >> b;
x = x.wrapping_mul(0x6eed_0e9d_a4d9_4a4f);
x
}
#[expect(clippy::derive_partial_eq_without_eq)] #[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
pub struct ClosedOpenUnit<T: FloatExt>(T);
#[expect(clippy::derive_partial_eq_without_eq)] #[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
pub struct OpenClosedUnit<T: FloatExt>(T);
impl Serialize for OpenClosedUnit<f64> {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
serializer.serialize_f64(self.0)
}
}
impl<'de> Deserialize<'de> for OpenClosedUnit<f64> {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
let x = f64::deserialize(deserializer)?;
if x > 0.0 && x <= 1.0 {
Ok(Self(x))
} else {
Err(serde::de::Error::invalid_value(
serde::de::Unexpected::Float(x),
&"a value in (0.0, 1.0]",
))
}
}
}
impl JsonSchema for OpenClosedUnit<f64> {
fn schema_name() -> Cow<'static, str> {
Cow::Borrowed("OpenClosedUnitF64")
}
fn schema_id() -> Cow<'static, str> {
Cow::Borrowed(concat!(module_path!(), "::", "OpenClosedUnit<f64>"))
}
fn json_schema(_gen: &mut SchemaGenerator) -> Schema {
json_schema!({
"type": "number",
"exclusiveMinimum": 0.0,
"maximum": 1.0,
})
}
}
pub trait FloatExt: Float + ConstZero + ConstOne + FloatConst + AddAssign {
const HALF: Self;
const TWO: Self;
#[must_use]
fn from_f64(x: f64) -> Self;
#[must_use]
fn from_usize(n: usize) -> Self;
#[must_use]
fn into_usize(self) -> usize;
#[must_use]
fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>);
}
impl FloatExt for f32 {
const HALF: Self = 0.5;
const TWO: Self = 2.0;
#[expect(clippy::cast_possible_truncation)]
fn from_f64(x: f64) -> Self {
x as Self
}
#[expect(clippy::cast_precision_loss)]
fn from_usize(n: usize) -> Self {
n as Self
}
#[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
fn into_usize(self) -> usize {
self as usize
}
fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
#[expect(clippy::cast_possible_truncation)] let (low, high) = (
(hash & u64::from(u32::MAX)) as u32,
((hash >> 32) & u64::from(u32::MAX)) as u32,
);
#[expect(clippy::cast_precision_loss)]
let u0 = ClosedOpenUnit(((high >> 8) as Self) * Self::from_bits(0x3380_0000_u32));
#[expect(clippy::cast_precision_loss)]
let u1 = OpenClosedUnit((((low >> 8) + 1) as Self) * Self::from_bits(0x3380_0000_u32));
(u0, u1)
}
}
impl FloatExt for f64 {
const HALF: Self = 0.5;
const TWO: Self = 2.0;
fn from_f64(x: f64) -> Self {
x
}
#[expect(clippy::cast_precision_loss)]
fn from_usize(n: usize) -> Self {
n as Self
}
#[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
fn into_usize(self) -> usize {
self as usize
}
fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
#[expect(clippy::cast_precision_loss)]
let u0 =
ClosedOpenUnit(((hash >> 11) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64));
let hash = seahash_diffuse(hash + 1);
#[expect(clippy::cast_precision_loss)]
let u1 = OpenClosedUnit(
(((hash >> 11) + 1) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64),
);
(u0, u1)
}
}
#[cfg(test)]
#[expect(clippy::unwrap_used, clippy::expect_used)]
mod tests {
use ndarray_rand::RandomExt;
use ndarray_rand::rand_distr::{Distribution, Normal};
use super::*;
#[test]
fn gaussian_f32() {
test_error_decline::<f32>(
(100, 100),
Normal::new(42.0, 24.0).unwrap(),
42,
RandomProjectionKind::Gaussian,
);
}
#[test]
fn gaussian_f64() {
test_error_decline::<f64>(
(100, 100),
Normal::new(42.0, 24.0).unwrap(),
42,
RandomProjectionKind::Gaussian,
);
}
#[test]
fn achlioptas_sparse_f32() {
test_error_decline::<f32>(
(100, 100),
Normal::new(42.0, 24.0).unwrap(),
42,
RandomProjectionKind::Sparse {
density: Some(OpenClosedUnit(1.0 / 3.0)),
},
);
}
#[test]
fn achlioptas_sparse_f64() {
test_error_decline::<f64>(
(100, 100),
Normal::new(42.0, 24.0).unwrap(),
42,
RandomProjectionKind::Sparse {
density: Some(OpenClosedUnit(1.0 / 3.0)),
},
);
}
#[test]
fn ping_li_sparse_f32() {
test_error_decline::<f32>(
(100, 100),
Normal::new(42.0, 24.0).unwrap(),
42,
RandomProjectionKind::Sparse { density: None },
);
}
#[test]
fn ping_li_sparse_f64() {
test_error_decline::<f64>(
(100, 100),
Normal::new(42.0, 24.0).unwrap(),
42,
RandomProjectionKind::Sparse { density: None },
);
}
#[expect(clippy::needless_pass_by_value)]
fn test_error_decline<T: FloatExt + std::fmt::Display>(
shape: (usize, usize),
distribution: impl Distribution<T>,
seed: u64,
projection: RandomProjectionKind,
) {
let data = Array::<T, Ix2>::random(shape, distribution);
let mut max_rmse = rmse(
&data,
&roundtrip(&data, 42, OpenClosedUnit(1.0), &projection),
);
for epsilon in [
OpenClosedUnit(0.5),
OpenClosedUnit(0.1),
OpenClosedUnit(0.01),
] {
let new_rmse = rmse(&data, &roundtrip(&data, seed, epsilon, &projection));
assert!(
new_rmse <= max_rmse,
"{new_rmse} > {max_rmse} for {epsilon}",
epsilon = epsilon.0
);
max_rmse = new_rmse;
}
}
fn roundtrip<T: FloatExt>(
data: &Array<T, Ix2>,
seed: u64,
epsilon: OpenClosedUnit<f64>,
projection: &RandomProjectionKind,
) -> Array<T, Ix2> {
let projected = project_with_projection(
data.view(),
seed,
&RandomProjectionReduction::JohnsonLindenstrauss { epsilon },
projection,
)
.expect("projecting must not fail");
let reconstructed = reconstruct_with_projection(projected, seed, projection)
.expect("reconstruction must not fail");
#[expect(clippy::let_and_return)]
reconstructed
}
fn rmse<T: FloatExt>(data: &Array<T, Ix2>, reconstructed: &Array<T, Ix2>) -> T {
let mut err = T::ZERO;
for (&d, &r) in data.iter().zip(reconstructed) {
err += (d - r) * (d - r);
}
let mse = err * T::from_usize(data.len()).recip();
mse.sqrt()
}
}