rag_umap/
lib.rs

1//! UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
2//!
3//! A Rust implementation of the UMAP algorithm for manifold learning and dimension reduction.
4//! Based on the paper: "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction"
5//! by Leland McInnes, John Healy, and James Melville.
6
7use ndarray::{Array2, ArrayView1, ArrayView2};
8use thiserror::Error;
9
10mod distance;
11mod nearest_neighbor;
12mod simplicial_set;
13mod spectral;
14mod optimization;
15
16/// Errors that can occur during UMAP computation
17#[derive(Error, Debug)]
18pub enum UmapError {
19    #[error("Invalid parameter: {0}")]
20    InvalidParameter(String),
21    #[error("Computation error: {0}")]
22    ComputationError(String),
23    #[error("Dimension mismatch: expected {expected}, got {actual}")]
24    DimensionMismatch { expected: usize, actual: usize },
25    #[error("Insufficient data: need at least {min} samples, got {actual}")]
26    InsufficientData { min: usize, actual: usize },
27}
28
29pub type Result<T> = std::result::Result<T, UmapError>;
30
31/// Distance metric trait for computing distances between data points
32trait DistanceMetric: Send + Sync {
33    /// Compute the distance between two points
34    fn distance(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> f64;
35
36}
37
38/// UMAP hyperparameters and configuration
39#[derive(Debug, Clone)]
40struct UmapParams {
41    /// Number of nearest neighbors to consider for manifold approximation
42    pub n_neighbors: usize,
43    /// Target embedding dimension
44    pub n_components: usize,
45    /// Minimum distance between points in the embedding
46    pub min_dist: f64,
47    /// Number of optimization epochs
48    pub n_epochs: usize,
49    /// Number of negative samples per positive sample during optimization
50    pub negative_sample_rate: usize,
51    /// Random seed for reproducible results
52    pub random_seed: Option<u64>,
53    /// Spread parameter for low-dimensional representation
54    pub spread: f64,
55    /// Set connectivity parameter for local connectivity constraint
56    pub set_op_mix_ratio: f64,
57    /// Local connectivity parameter
58    pub local_connectivity: f64,
59    /// Repulsion strength parameter
60    pub repulsion_strength: f64,
61    /// Initial alpha for optimization
62    pub initial_alpha: f64,
63}
64
65impl Default for UmapParams {
66    fn default() -> Self {
67        Self {
68            n_neighbors: 15,
69            n_components: 2,
70            min_dist: 0.1,
71            n_epochs: 200,
72            negative_sample_rate: 5,
73            random_seed: None,
74            spread: 1.0,
75            set_op_mix_ratio: 1.0,
76            local_connectivity: 1.0,
77            repulsion_strength: 1.0,
78            initial_alpha: 1.0,
79        }
80    }
81}
82
83impl UmapParams {
84    /// Create a new UmapParams with default values
85    pub fn new() -> Self {
86        Self::default()
87    }
88
89    /// Set the number of neighbors
90    pub fn n_neighbors(mut self, n_neighbors: usize) -> Self {
91        self.n_neighbors = n_neighbors;
92        self
93    }
94
95    /// Set the number of components (embedding dimension)
96    pub fn n_components(mut self, n_components: usize) -> Self {
97        self.n_components = n_components;
98        self
99    }
100
101
102    /// Validate the parameters
103    pub fn validate(&self) -> Result<()> {
104        if self.n_neighbors < 2 {
105            return Err(UmapError::InvalidParameter(
106                "n_neighbors must be at least 2".to_string()
107            ));
108        }
109
110        if self.n_components < 1 {
111            return Err(UmapError::InvalidParameter(
112                "n_components must be at least 1".to_string()
113            ));
114        }
115
116        if self.min_dist < 0.0 {
117            return Err(UmapError::InvalidParameter(
118                "min_dist must be non-negative".to_string()
119            ));
120        }
121
122        if self.n_epochs == 0 {
123            return Err(UmapError::InvalidParameter(
124                "n_epochs must be positive".to_string()
125            ));
126        }
127
128        Ok(())
129    }
130}
131
132/// Represents a fuzzy simplicial set as a sparse graph
133#[derive(Debug, Clone)]
134struct FuzzySimplicialSet {
135    /// Adjacency matrix stored as (row, col, weight) tuples
136    pub edges: Vec<(usize, usize, f64)>,
137    /// Number of vertices in the graph
138    pub n_vertices: usize,
139}
140
141impl FuzzySimplicialSet {
142    /// Create a new empty fuzzy simplicial set
143    pub fn new(n_vertices: usize) -> Self {
144        Self {
145            edges: Vec::new(),
146            n_vertices,
147        }
148    }
149
150    /// Add an edge to the fuzzy simplicial set
151    pub fn add_edge(&mut self, i: usize, j: usize, weight: f64) {
152        if weight > 0.0 {
153            self.edges.push((i, j, weight));
154        }
155    }
156
157}
158
159/// Main UMAP struct for dimensionality reduction
160struct Umap {
161    /// UMAP parameters
162    pub params: UmapParams,
163    /// Distance metric to use
164    distance_metric: Box<dyn DistanceMetric>,
165    /// Fitted embedding (available after calling fit_transform)
166    pub embedding: Option<Array2<f64>>,
167    /// Fuzzy simplicial set representation of the data
168    pub fuzzy_set: Option<FuzzySimplicialSet>,
169}
170
171impl Umap {
172    /// Create a new UMAP instance with the given parameters and distance metric
173    pub fn new(params: UmapParams, distance_metric: Box<dyn DistanceMetric>) -> Result<Self> {
174        params.validate()?;
175
176        Ok(Self {
177            params,
178            distance_metric,
179            embedding: None,
180            fuzzy_set: None,
181        })
182    }
183
184    /// Create a new UMAP instance with default parameters and Euclidean distance
185    pub fn new_with_euclidean_distance() -> Result<Self> {
186        let params = UmapParams::default();
187        let distance_metric = Box::new(distance::EuclideanDistance);
188        Self::new(params, distance_metric)
189    }
190
191    /// Fit UMAP to the data and return the embedding
192    pub fn fit_transform(&mut self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
193        let n_samples = data.nrows();
194        let _n_features = data.ncols();
195
196        // Validate input
197        if n_samples < self.params.n_neighbors {
198            return Err(UmapError::InsufficientData {
199                min: self.params.n_neighbors,
200                actual: n_samples,
201            });
202        }
203
204        // Step 1: Construct fuzzy simplicial set representation
205        let fuzzy_set = self.construct_fuzzy_simplicial_set(data)?;
206        self.fuzzy_set = Some(fuzzy_set.clone());
207
208        // Step 2: Initialize embedding using spectral embedding
209        let mut embedding = self.spectral_layout(&fuzzy_set)?;
210
211        // Step 3: Optimize embedding using SGD
212        self.optimize_layout(&mut embedding, &fuzzy_set, data)?;
213
214        self.embedding = Some(embedding.clone());
215        Ok(embedding)
216    }
217
218
219    /// Construct fuzzy simplicial set representation of the data
220    fn construct_fuzzy_simplicial_set(&self, data: ArrayView2<f64>) -> Result<FuzzySimplicialSet> {
221        simplicial_set::fuzzy_simplicial_set(
222            data,
223            self.params.n_neighbors,
224            self.distance_metric.as_ref(),
225            self.params.local_connectivity,
226            self.params.set_op_mix_ratio,
227            self.params.random_seed,
228        )
229    }
230
231    /// Initialize embedding using spectral layout
232    fn spectral_layout(&self, fuzzy_set: &FuzzySimplicialSet) -> Result<Array2<f64>> {
233        spectral::spectral_layout(fuzzy_set, self.params.n_components)
234    }
235
236    /// Optimize the embedding layout using SGD
237    fn optimize_layout(
238        &self,
239        embedding: &mut Array2<f64>,
240        fuzzy_set: &FuzzySimplicialSet,
241        _data: ArrayView2<f64>,
242    ) -> Result<()> {
243        let opt_params = optimization::OptimizationParams {
244            n_epochs: self.params.n_epochs,
245            initial_alpha: self.params.initial_alpha,
246            min_dist: self.params.min_dist,
247            spread: self.params.spread,
248            negative_sample_rate: self.params.negative_sample_rate,
249            repulsion_strength: self.params.repulsion_strength,
250            random_seed: self.params.random_seed,
251        };
252
253        optimization::optimize_embedding(embedding, fuzzy_set, &opt_params)
254    }
255}
256
257/// Convert embeddings to 2D space using UMAP
258///
259/// # Arguments
260/// * `embeddings` - Input embeddings as Vec<Vec<T>> where T can be converted to f64
261///
262/// # Returns
263/// * `Result<Vec<Vec<f64>>>` - 2D embedding with same number of rows as input
264pub fn convert_to_2d<T: Into<f64> + Copy>(embeddings: Vec<Vec<T>>) -> Result<Vec<Vec<f64>>> {
265    let n_samples = embeddings.len();
266    let n_features = embeddings.first().map(|row| row.len()).unwrap_or(0);
267
268    let mut data = Array2::zeros((n_samples, n_features));
269    for (i, row) in embeddings.iter().enumerate() {
270        for (j, &val) in row.iter().enumerate() {
271            data[[i, j]] = val.into();
272        }
273    }
274
275    let params = UmapParams::new()
276        .n_components(2)
277        .n_neighbors(15.min(n_samples - 1));
278
279    let mut umap = Umap::new_with_euclidean_distance()?;
280    umap.params = params;
281    let result = umap.fit_transform(data.view())?;
282
283    let mut output = Vec::new();
284    for i in 0..result.nrows() {
285        output.push(vec![result[[i, 0]], result[[i, 1]]]);
286    }
287    Ok(output)
288}
289
290/// Convert embeddings to 3D space using UMAP
291///
292/// # Arguments
293/// * `embeddings` - Input embeddings as Vec<Vec<T>> where T can be converted to f64
294///
295/// # Returns
296/// * `Result<Vec<Vec<f64>>>` - 3D embedding with same number of rows as input
297pub fn convert_to_3d<T: Into<f64> + Copy>(embeddings: Vec<Vec<T>>) -> Result<Vec<Vec<f64>>> {
298    let n_samples = embeddings.len();
299    let n_features = embeddings.first().map(|row| row.len()).unwrap_or(0);
300
301    let mut data = Array2::zeros((n_samples, n_features));
302    for (i, row) in embeddings.iter().enumerate() {
303        for (j, &val) in row.iter().enumerate() {
304            data[[i, j]] = val.into();
305        }
306    }
307
308    let params = UmapParams::new()
309        .n_components(3)
310        .n_neighbors(15.min(n_samples - 1));
311
312    let mut umap = Umap::new_with_euclidean_distance()?;
313    umap.params = params;
314    let result = umap.fit_transform(data.view())?;
315
316    let mut output = Vec::new();
317    for i in 0..result.nrows() {
318        output.push(vec![result[[i, 0]], result[[i, 1]], result[[i, 2]]]);
319    }
320    Ok(output)
321}
322
323#[cfg(test)]
324mod tests {
325    use super::*;
326    use ndarray::Array2;
327
328    #[test]
329    fn test_umap_params_validation() {
330        let valid_params = UmapParams::new()
331            .n_neighbors(10)
332            .n_components(2)
333            .min_dist(0.1);
334
335        assert!(valid_params.validate().is_ok());
336
337        let invalid_params = UmapParams::new().n_neighbors(1);
338        assert!(invalid_params.validate().is_err());
339    }
340
341    #[test]
342    fn test_fuzzy_simplicial_set() {
343        let mut fs_set = FuzzySimplicialSet::new(3);
344        fs_set.add_edge(0, 1, 0.5);
345        fs_set.add_edge(1, 2, 0.8);
346
347        assert_eq!(fs_set.edges.len(), 2);
348        assert_eq!(fs_set.vertex_degree(0), 0.5);
349        assert_eq!(fs_set.vertex_degree(1), 0.8);
350    }
351}