umap_rs/config.rs
1use serde::Deserialize;
2use serde::Serialize;
3
4/// Configuration for manifold shape and embedding space properties.
5///
6/// These parameters control the geometric properties of the low-dimensional
7/// embedding space and how the manifold is shaped.
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct ManifoldParams {
10 /// Minimum distance between points in the embedding space.
11 ///
12 /// Controls how tightly points can be packed together. Smaller values
13 /// create more clustered embeddings, larger values spread points out more.
14 ///
15 /// Default: 0.1
16 pub min_dist: f32,
17
18 /// The effective scale of embedded points.
19 ///
20 /// Together with `min_dist`, this determines the embedding's overall spread.
21 /// The curve used in optimization is calibrated using these parameters.
22 ///
23 /// Default: 1.0
24 pub spread: f32,
25
26 /// Parameter 'a' of the distance-to-probability curve: 1 / (1 + a*x^(2b))
27 ///
28 /// If `None`, will be automatically computed from `min_dist` and `spread`.
29 /// Manually setting this overrides automatic calibration.
30 ///
31 /// Default: None (auto-compute)
32 pub a: Option<f32>,
33
34 /// Parameter 'b' of the distance-to-probability curve: 1 / (1 + a*x^(2b))
35 ///
36 /// If `None`, will be automatically computed from `min_dist` and `spread`.
37 /// Manually setting this overrides automatic calibration.
38 ///
39 /// Default: None (auto-compute)
40 pub b: Option<f32>,
41}
42
43impl Default for ManifoldParams {
44 fn default() -> Self {
45 Self {
46 min_dist: 0.1,
47 spread: 1.0,
48 a: None,
49 b: None,
50 }
51 }
52}
53
54/// Configuration for k-nearest neighbor graph construction.
55///
56/// These parameters control how the high-dimensional manifold structure
57/// is captured via a fuzzy topological representation.
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct GraphParams {
60 /// Number of nearest neighbors to use for manifold approximation.
61 ///
62 /// Larger values capture more global structure but may miss fine details.
63 /// Smaller values focus on local structure but may fragment the manifold.
64 ///
65 /// Must be >= 2.
66 ///
67 /// Default: 15
68 pub n_neighbors: usize,
69
70 /// Local connectivity requirement (number of nearest neighbors assumed connected).
71 ///
72 /// Higher values make the manifold more locally connected, which can help
73 /// with datasets that have variable density. Should generally not exceed
74 /// the local intrinsic dimensionality.
75 ///
76 /// Default: 1.0
77 pub local_connectivity: f32,
78
79 /// Interpolation between fuzzy union (1.0) and fuzzy intersection (0.0).
80 ///
81 /// Controls how local fuzzy simplicial sets are combined. Pure union (1.0)
82 /// gives equal weight to all edges, pure intersection (0.0) only keeps
83 /// mutually nearest neighbors.
84 ///
85 /// Must be in range [0.0, 1.0].
86 ///
87 /// Default: 1.0
88 pub set_op_mix_ratio: f32,
89
90 /// Distance threshold beyond which edges are disconnected.
91 ///
92 /// If `None`, uses the metric's default (typically infinity for unbounded metrics).
93 /// Useful for bounded metrics or to explicitly remove long-range connections.
94 ///
95 /// Default: None (use metric default)
96 pub disconnection_distance: Option<f32>,
97
98 /// Whether to symmetrize the KNN graph.
99 ///
100 /// KNN graphs are directed (A→B doesn't imply B→A). Symmetrization makes
101 /// edges bidirectional using fuzzy set operations. This is the standard
102 /// UMAP behavior and produces slightly better cluster coherence.
103 ///
104 /// Set to `false` to skip symmetrization, which:
105 /// - Reduces memory usage (avoids nearly doubling edge count)
106 /// - Speeds up graph construction and SGD optimization
107 /// - May produce slightly less polished cluster boundaries
108 ///
109 /// For 2D visualization, the difference is typically subtle. Disabling
110 /// symmetrization is a reasonable tradeoff for very large datasets where
111 /// memory is constrained.
112 ///
113 /// Default: true
114 pub symmetrize: bool,
115}
116
117impl Default for GraphParams {
118 fn default() -> Self {
119 Self {
120 n_neighbors: 15,
121 local_connectivity: 1.0,
122 set_op_mix_ratio: 1.0,
123 disconnection_distance: None,
124 symmetrize: true,
125 }
126 }
127}
128
129/// Configuration for stochastic gradient descent optimization.
130///
131/// These parameters control the embedding optimization process via SGD
132/// on the fuzzy set cross-entropy.
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct OptimizationParams {
135 /// Number of optimization epochs.
136 ///
137 /// If `None`, will be automatically determined based on dataset size:
138 /// - <= 10,000 samples: 500 epochs
139 /// - > 10,000 samples: 200 epochs
140 ///
141 /// More epochs improve convergence but increase runtime.
142 ///
143 /// Default: None (auto-determine)
144 pub n_epochs: Option<usize>,
145
146 /// Initial learning rate for SGD.
147 ///
148 /// The learning rate decays linearly to 0 over the course of optimization.
149 /// Higher values converge faster but may overshoot; lower values are more stable.
150 ///
151 /// Default: 1.0
152 pub learning_rate: f32,
153
154 /// Number of negative samples per positive sample.
155 ///
156 /// Negative sampling is used to push apart non-neighboring points.
157 /// Higher values improve separation but increase computation.
158 ///
159 /// Default: 5
160 pub negative_sample_rate: usize,
161
162 /// Weight applied to negative samples (repulsion strength).
163 ///
164 /// Higher values push non-neighbors apart more strongly.
165 /// Lower values focus more on pulling neighbors together.
166 ///
167 /// Default: 1.0
168 pub repulsion_strength: f32,
169}
170
171impl Default for OptimizationParams {
172 fn default() -> Self {
173 Self {
174 n_epochs: None,
175 learning_rate: 1.0,
176 negative_sample_rate: 5,
177 repulsion_strength: 1.0,
178 }
179 }
180}
181
182/// Complete UMAP configuration.
183///
184/// Groups all parameters for dimensionality reduction into a coherent structure.
185/// All parameter groups have sensible defaults and can be customized individually.
186///
187/// # Example
188///
189/// ```ignore
190/// use umap::config::{UmapConfig, GraphParams};
191///
192/// // Use all defaults
193/// let config = UmapConfig::default();
194///
195/// // Customize specific groups
196/// let config = UmapConfig {
197/// n_components: 3,
198/// graph: GraphParams {
199/// n_neighbors: 30,
200/// ..Default::default()
201/// },
202/// ..Default::default()
203/// };
204/// ```
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct UmapConfig {
207 /// Number of dimensions in the output embedding.
208 ///
209 /// Typically 2 for visualization or 3-50 for downstream ML tasks.
210 ///
211 /// Must be >= 1.
212 ///
213 /// Default: 2
214 pub n_components: usize,
215
216 /// Manifold shape configuration.
217 pub manifold: ManifoldParams,
218
219 /// Graph construction configuration.
220 pub graph: GraphParams,
221
222 /// Optimization configuration.
223 pub optimization: OptimizationParams,
224}
225
226impl Default for UmapConfig {
227 fn default() -> Self {
228 Self {
229 n_components: 2,
230 manifold: ManifoldParams::default(),
231 graph: GraphParams::default(),
232 optimization: OptimizationParams::default(),
233 }
234 }
235}