umap_rs/
config.rs

1use serde::Deserialize;
2use serde::Serialize;
3
4/// Configuration for manifold shape and embedding space properties.
5///
6/// These parameters control the geometric properties of the low-dimensional
7/// embedding space and how the manifold is shaped.
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct ManifoldParams {
10  /// Minimum distance between points in the embedding space.
11  ///
12  /// Controls how tightly points can be packed together. Smaller values
13  /// create more clustered embeddings, larger values spread points out more.
14  ///
15  /// Default: 0.1
16  pub min_dist: f32,
17
18  /// The effective scale of embedded points.
19  ///
20  /// Together with `min_dist`, this determines the embedding's overall spread.
21  /// The curve used in optimization is calibrated using these parameters.
22  ///
23  /// Default: 1.0
24  pub spread: f32,
25
26  /// Parameter 'a' of the distance-to-probability curve: 1 / (1 + a*x^(2b))
27  ///
28  /// If `None`, will be automatically computed from `min_dist` and `spread`.
29  /// Manually setting this overrides automatic calibration.
30  ///
31  /// Default: None (auto-compute)
32  pub a: Option<f32>,
33
34  /// Parameter 'b' of the distance-to-probability curve: 1 / (1 + a*x^(2b))
35  ///
36  /// If `None`, will be automatically computed from `min_dist` and `spread`.
37  /// Manually setting this overrides automatic calibration.
38  ///
39  /// Default: None (auto-compute)
40  pub b: Option<f32>,
41}
42
43impl Default for ManifoldParams {
44  fn default() -> Self {
45    Self {
46      min_dist: 0.1,
47      spread: 1.0,
48      a: None,
49      b: None,
50    }
51  }
52}
53
54/// Configuration for k-nearest neighbor graph construction.
55///
56/// These parameters control how the high-dimensional manifold structure
57/// is captured via a fuzzy topological representation.
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct GraphParams {
60  /// Number of nearest neighbors to use for manifold approximation.
61  ///
62  /// Larger values capture more global structure but may miss fine details.
63  /// Smaller values focus on local structure but may fragment the manifold.
64  ///
65  /// Must be >= 2.
66  ///
67  /// Default: 15
68  pub n_neighbors: usize,
69
70  /// Local connectivity requirement (number of nearest neighbors assumed connected).
71  ///
72  /// Higher values make the manifold more locally connected, which can help
73  /// with datasets that have variable density. Should generally not exceed
74  /// the local intrinsic dimensionality.
75  ///
76  /// Default: 1.0
77  pub local_connectivity: f32,
78
79  /// Interpolation between fuzzy union (1.0) and fuzzy intersection (0.0).
80  ///
81  /// Controls how local fuzzy simplicial sets are combined. Pure union (1.0)
82  /// gives equal weight to all edges, pure intersection (0.0) only keeps
83  /// mutually nearest neighbors.
84  ///
85  /// Must be in range [0.0, 1.0].
86  ///
87  /// Default: 1.0
88  pub set_op_mix_ratio: f32,
89
90  /// Distance threshold beyond which edges are disconnected.
91  ///
92  /// If `None`, uses the metric's default (typically infinity for unbounded metrics).
93  /// Useful for bounded metrics or to explicitly remove long-range connections.
94  ///
95  /// Default: None (use metric default)
96  pub disconnection_distance: Option<f32>,
97
98  /// Whether to symmetrize the KNN graph.
99  ///
100  /// KNN graphs are directed (A→B doesn't imply B→A). Symmetrization makes
101  /// edges bidirectional using fuzzy set operations. This is the standard
102  /// UMAP behavior and produces slightly better cluster coherence.
103  ///
104  /// Set to `false` to skip symmetrization, which:
105  /// - Reduces memory usage (avoids nearly doubling edge count)
106  /// - Speeds up graph construction and SGD optimization
107  /// - May produce slightly less polished cluster boundaries
108  ///
109  /// For 2D visualization, the difference is typically subtle. Disabling
110  /// symmetrization is a reasonable tradeoff for very large datasets where
111  /// memory is constrained.
112  ///
113  /// Default: true
114  pub symmetrize: bool,
115}
116
117impl Default for GraphParams {
118  fn default() -> Self {
119    Self {
120      n_neighbors: 15,
121      local_connectivity: 1.0,
122      set_op_mix_ratio: 1.0,
123      disconnection_distance: None,
124      symmetrize: true,
125    }
126  }
127}
128
129/// Configuration for stochastic gradient descent optimization.
130///
131/// These parameters control the embedding optimization process via SGD
132/// on the fuzzy set cross-entropy.
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct OptimizationParams {
135  /// Number of optimization epochs.
136  ///
137  /// If `None`, will be automatically determined based on dataset size:
138  /// - <= 10,000 samples: 500 epochs
139  /// - > 10,000 samples: 200 epochs
140  ///
141  /// More epochs improve convergence but increase runtime.
142  ///
143  /// Default: None (auto-determine)
144  pub n_epochs: Option<usize>,
145
146  /// Initial learning rate for SGD.
147  ///
148  /// The learning rate decays linearly to 0 over the course of optimization.
149  /// Higher values converge faster but may overshoot; lower values are more stable.
150  ///
151  /// Default: 1.0
152  pub learning_rate: f32,
153
154  /// Number of negative samples per positive sample.
155  ///
156  /// Negative sampling is used to push apart non-neighboring points.
157  /// Higher values improve separation but increase computation.
158  ///
159  /// Default: 5
160  pub negative_sample_rate: usize,
161
162  /// Weight applied to negative samples (repulsion strength).
163  ///
164  /// Higher values push non-neighbors apart more strongly.
165  /// Lower values focus more on pulling neighbors together.
166  ///
167  /// Default: 1.0
168  pub repulsion_strength: f32,
169}
170
171impl Default for OptimizationParams {
172  fn default() -> Self {
173    Self {
174      n_epochs: None,
175      learning_rate: 1.0,
176      negative_sample_rate: 5,
177      repulsion_strength: 1.0,
178    }
179  }
180}
181
182/// Complete UMAP configuration.
183///
184/// Groups all parameters for dimensionality reduction into a coherent structure.
185/// All parameter groups have sensible defaults and can be customized individually.
186///
187/// # Example
188///
189/// ```ignore
190/// use umap::config::{UmapConfig, GraphParams};
191///
192/// // Use all defaults
193/// let config = UmapConfig::default();
194///
195/// // Customize specific groups
196/// let config = UmapConfig {
197///     n_components: 3,
198///     graph: GraphParams {
199///         n_neighbors: 30,
200///         ..Default::default()
201///     },
202///     ..Default::default()
203/// };
204/// ```
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct UmapConfig {
207  /// Number of dimensions in the output embedding.
208  ///
209  /// Typically 2 for visualization or 3-50 for downstream ML tasks.
210  ///
211  /// Must be >= 1.
212  ///
213  /// Default: 2
214  pub n_components: usize,
215
216  /// Manifold shape configuration.
217  pub manifold: ManifoldParams,
218
219  /// Graph construction configuration.
220  pub graph: GraphParams,
221
222  /// Optimization configuration.
223  pub optimization: OptimizationParams,
224}
225
226impl Default for UmapConfig {
227  fn default() -> Self {
228    Self {
229      n_components: 2,
230      manifold: ManifoldParams::default(),
231      graph: GraphParams::default(),
232      optimization: OptimizationParams::default(),
233    }
234  }
235}