tenflowers-dataset 0.1.1

Data pipeline and dataset utilities for TenfloweRS
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
//! Classical Machine Learning Dataset Generation
//!
//! This module contains implementations for generating standard synthetic datasets
//! commonly used in machine learning research and benchmarking.

use super::core::{DatasetGenerator, SyntheticConfig, SyntheticDataset};
// use crate::Dataset; // Unused import removed
use scirs2_core::random::Random;
use std::f64::consts::PI;
use tenflowers_core::{Result, Tensor, TensorError};

impl DatasetGenerator {
    /// Generate two interleaving half circles ("moons") in two dimensions.
    ///
    /// The first `config.n_samples / 2` points are spaced evenly along the upper unit
    /// semicircle and labelled `0.0`; the remaining points are spaced evenly along a
    /// mirrored semicircle offset to the right and downwards and labelled `1.0`. Each
    /// coordinate is then jittered by a uniform draw from
    /// `[-config.noise_level, config.noise_level)`.
    ///
    /// * A non-positive `config.noise_level` disables the jitter (no draw is made)
    ///   instead of requesting a sample from an empty range.
    /// * An arc that holds a single point places it at angle `0` instead of evaluating
    ///   `0 / 0`, which previously produced NaN coordinates.
    /// * When `config.random_seed` is `None` the generator is seeded with `0`, so the
    ///   output is deterministic either way.
    /// * When `config.shuffle` is set, samples are shuffled with each point kept
    ///   together with its label.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Tensor::from_vec` while building the
    /// `[n_samples, 2]` feature tensor or the `[n_samples]` label tensor.
    pub fn make_moons(config: SyntheticConfig) -> Result<SyntheticDataset<f64>> {
        let mut rng = Random::seed(config.random_seed.unwrap_or(0));

        let n_samples = config.n_samples;
        let n_samples_out = n_samples / 2;
        let n_samples_in = n_samples - n_samples_out;
        let noise_level = config.noise_level;

        // Angle of the `i`-th of `n` evenly spaced points on [0, PI]. With a single
        // point the spacing denominator would be zero, so pin that point to angle 0.
        let arc_angle = |i: usize, n: usize| -> f64 {
            if n > 1 {
                PI * (i as f64) / (n as f64 - 1.0)
            } else {
                0.0
            }
        };

        // One uniform jitter draw; skipped entirely when the noise range would be empty.
        let mut jitter = || -> f64 {
            if noise_level > 0.0 {
                rng.gen_range(-noise_level..noise_level)
            } else {
                0.0
            }
        };

        // Keep every point together with its label so shuffling cannot separate them.
        let mut samples: Vec<(f64, f64, f64)> = Vec::with_capacity(n_samples);

        // Outer (upper) semicircle, class 0.
        for i in 0..n_samples_out {
            let angle = arc_angle(i, n_samples_out);
            let noise_x = jitter();
            let noise_y = jitter();
            samples.push((angle.cos() + noise_x, angle.sin() + noise_y, 0.0));
        }

        // Inner (lower, shifted) semicircle, class 1.
        for i in 0..n_samples_in {
            let angle = arc_angle(i, n_samples_in);
            let noise_x = jitter();
            let noise_y = jitter();
            samples.push((
                1.0 - angle.cos() + noise_x,
                1.0 - angle.sin() - 0.5 + noise_y,
                1.0,
            ));
        }

        if config.shuffle {
            rng.shuffle(&mut samples);
        }

        // Flatten into row-major features plus a parallel label vector.
        let mut features = Vec::with_capacity(samples.len() * 2);
        let mut labels = Vec::with_capacity(samples.len());
        for (x, y, label) in samples {
            features.push(x);
            features.push(y);
            labels.push(label);
        }

        let feature_tensor = Tensor::from_vec(features, &[n_samples, 2])?;
        let label_tensor = Tensor::from_vec(labels, &[n_samples])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }

    /// Generate two concentric circles in two dimensions.
    ///
    /// The first `config.n_samples / 2` points lie on the unit circle and are labelled
    /// `0.0`; the remaining points lie on a circle of radius `factor` and are labelled
    /// `1.0`. Angles are drawn uniformly from `[0, 2*PI)` and each coordinate is
    /// jittered by a uniform draw from `[-config.noise_level, config.noise_level)`.
    ///
    /// * `factor` is used directly as the inner radius and is not validated.
    /// * A non-positive `config.noise_level` disables the jitter (no draw is made)
    ///   instead of requesting a sample from an empty range.
    /// * When `config.random_seed` is `None` the generator is seeded with `0`.
    /// * When `config.shuffle` is set, samples are shuffled with each point kept
    ///   together with its label.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Tensor::from_vec` while building the
    /// `[n_samples, 2]` feature tensor or the `[n_samples]` label tensor.
    pub fn make_circles(config: SyntheticConfig, factor: f64) -> Result<SyntheticDataset<f64>> {
        let mut rng = Random::seed(config.random_seed.unwrap_or(0));

        let n_samples = config.n_samples;
        let n_samples_out = n_samples / 2;
        let n_samples_in = n_samples - n_samples_out;
        let noise_level = config.noise_level;

        // Draw one noisy point on the circle of the given radius. The draw order
        // (angle, x-noise, y-noise) is the same for both rings.
        let mut ring_point = |radius: f64| -> (f64, f64) {
            let angle: f64 = rng.gen_range(0.0..2.0 * PI);
            let x = radius * angle.cos();
            let y = radius * angle.sin();

            // Skip the jitter draws when the noise range would be empty.
            let (noise_x, noise_y): (f64, f64) = if noise_level > 0.0 {
                let nx = rng.gen_range(-noise_level..noise_level);
                let ny = rng.gen_range(-noise_level..noise_level);
                (nx, ny)
            } else {
                (0.0, 0.0)
            };

            (x + noise_x, y + noise_y)
        };

        // Keep every point together with its label so shuffling cannot separate them.
        let mut samples: Vec<(f64, f64, f64)> = Vec::with_capacity(n_samples);

        // Outer circle, class 0.
        for _ in 0..n_samples_out {
            let (x, y) = ring_point(1.0);
            samples.push((x, y, 0.0));
        }

        // Inner circle, class 1.
        for _ in 0..n_samples_in {
            let (x, y) = ring_point(factor);
            samples.push((x, y, 1.0));
        }

        if config.shuffle {
            rng.shuffle(&mut samples);
        }

        // Flatten into row-major features plus a parallel label vector.
        let mut features = Vec::with_capacity(samples.len() * 2);
        let mut labels = Vec::with_capacity(samples.len());
        for (x, y, label) in samples {
            features.push(x);
            features.push(y);
            labels.push(label);
        }

        let feature_tensor = Tensor::from_vec(features, &[n_samples, 2])?;
        let label_tensor = Tensor::from_vec(labels, &[n_samples])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }

    /// Generate Gaussian blobs
    pub fn make_blobs(
        config: SyntheticConfig,
        n_features: usize,
        centers: Option<usize>,
        cluster_std: f64,
        center_box: (f64, f64),
    ) -> Result<SyntheticDataset<f64>> {
        let mut rng = if let Some(seed) = config.random_seed {
            Random::seed(seed)
        } else {
            Random::seed(0)
        };

        let n_centers = centers.unwrap_or(3);

        // Generate random cluster centers
        let mut cluster_centers = Vec::new();
        for _ in 0..n_centers {
            let mut center = Vec::new();
            for _ in 0..n_features {
                center.push(rng.gen_range(center_box.0..center_box.1));
            }
            cluster_centers.push(center);
        }

        let mut features = Vec::new();
        let mut labels = Vec::new();

        // Generate samples for each cluster
        let samples_per_cluster = config.n_samples / n_centers;
        let remaining_samples = config.n_samples % n_centers;

        for (cluster_id, center) in cluster_centers.iter().enumerate() {
            let cluster_samples = if cluster_id < remaining_samples {
                samples_per_cluster + 1
            } else {
                samples_per_cluster
            };

            for _ in 0..cluster_samples {
                for &center_val in center.iter().take(n_features) {
                    let noise = rng.random_range(-cluster_std..cluster_std);
                    let value = center_val + noise;
                    features.push(value);
                }
                labels.push(cluster_id as f64);
            }
        }

        // Shuffle if requested
        if config.shuffle {
            let mut combined: Vec<(Vec<f64>, f64)> = features
                .chunks_exact(n_features)
                .zip(labels.iter())
                .map(|(chunk, &label)| (chunk.to_vec(), label))
                .collect();

            rng.shuffle(&mut combined);

            features.clear();
            labels.clear();
            for (feat_vec, label) in combined {
                features.extend(feat_vec);
                labels.push(label);
            }
        }

        let feature_tensor = Tensor::from_vec(features, &[config.n_samples, n_features])?;
        let label_tensor = Tensor::from_vec(labels, &[config.n_samples])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }

    /// Generate a synthetic multi-class classification dataset.
    ///
    /// Every sample is assigned a uniformly random class in `0..n_classes`. Its columns
    /// are laid out as:
    ///
    /// 1. `n_informative` informative features: a uniform draw from `[-1, 1)` plus a
    ///    per-class, per-feature offset (itself drawn once from `[-1, 1)`) plus noise.
    /// 2. `n_redundant` redundant features: fixed linear combinations of the sample's
    ///    informative features (weights drawn once from `[-0.5, 0.5)`) plus noise.
    /// 3. The remaining columns up to `n_features`: pure uniform noise in `[-1, 1)`.
    ///
    /// Noise terms are uniform draws from `[-config.noise_level, config.noise_level)`.
    /// With probability `flip_y` the stored label is replaced by a uniformly random
    /// class (which may coincide with the true one). Classes are separated only by the
    /// random offsets, so linear separability is not guaranteed.
    ///
    /// * The redundant-feature weights are drawn once and shared by all samples, so
    ///   those columns really are redundant with the informative ones. (They used to be
    ///   re-drawn for every sample, which made them behave like extra noise.)
    /// * A non-positive `config.noise_level` disables the noise terms (no draw is made)
    ///   instead of requesting a sample from an empty range.
    /// * When `config.random_seed` is `None` the generator is seeded with `0`.
    /// * When `config.shuffle` is set, rows are shuffled together with their labels.
    ///
    /// # Errors
    ///
    /// Returns an invalid-argument error when `n_informative + n_redundant` exceeds
    /// `n_features` (or overflows) or when `n_classes` is `0`. Also propagates any
    /// error returned by `Tensor::from_vec`.
    pub fn make_classification(
        config: SyntheticConfig,
        n_features: usize,
        n_informative: usize,
        n_redundant: usize,
        n_classes: usize,
        flip_y: f64,
    ) -> Result<SyntheticDataset<f64>> {
        let mut rng = Random::seed(config.random_seed.unwrap_or(0));

        // An overflowing sum certainly exceeds `n_features`, so treat it the same way.
        let exceeds_width = n_informative
            .checked_add(n_redundant)
            .map_or(true, |total| total > n_features);
        if exceeds_width {
            return Err(TensorError::invalid_argument(
                "n_informative + n_redundant cannot exceed n_features".to_string(),
            ));
        }
        if n_classes == 0 {
            return Err(TensorError::invalid_argument(
                "n_classes must be at least 1".to_string(),
            ));
        }

        let noise_level = config.noise_level;

        let mut features: Vec<f64> = vec![0.0; config.n_samples * n_features];
        let mut labels: Vec<f64> = Vec::with_capacity(config.n_samples);

        // Per-class offsets applied to the informative features.
        let mut class_weights: Vec<Vec<f64>> = Vec::with_capacity(n_classes);
        for _ in 0..n_classes {
            let mut weights = Vec::with_capacity(n_informative);
            for _ in 0..n_informative {
                weights.push(rng.gen_range(-1.0..1.0));
            }
            class_weights.push(weights);
        }

        // Mixing weights for the redundant features, shared by every sample so that each
        // redundant column is one consistent linear combination of the informative ones.
        let mut redundant_weights: Vec<Vec<f64>> = Vec::with_capacity(n_redundant);
        for _ in 0..n_redundant {
            let mut weights = Vec::with_capacity(n_informative);
            for _ in 0..n_informative {
                weights.push(rng.gen_range(-0.5..0.5));
            }
            redundant_weights.push(weights);
        }

        for sample_idx in 0..config.n_samples {
            // Choose random class.
            let class_id = rng.random_range(0..n_classes);

            // Split this sample's row into its three column groups. The argument check
            // above guarantees both split points lie inside the row.
            let row_start = sample_idx * n_features;
            let row = &mut features[row_start..row_start + n_features];
            let (informative, rest) = row.split_at_mut(n_informative);
            let (redundant, uninformative) = rest.split_at_mut(n_redundant);

            // Informative features: base draw + class offset + noise.
            for (slot, &class_bias) in informative.iter_mut().zip(&class_weights[class_id]) {
                let base_value: f64 = rng.gen_range(-1.0..1.0);
                let noise: f64 = if noise_level > 0.0 {
                    rng.gen_range(-noise_level..noise_level)
                } else {
                    0.0
                };
                *slot = base_value + class_bias + noise;
            }

            // Redundant features: fixed linear combination of the informative ones + noise.
            for (slot, weights) in redundant.iter_mut().zip(&redundant_weights) {
                let mut redundant_value = informative
                    .iter()
                    .zip(weights)
                    .fold(0.0_f64, |acc, (&feature, &weight)| acc + weight * feature);
                if noise_level > 0.0 {
                    redundant_value += rng.gen_range(-noise_level..noise_level);
                }
                *slot = redundant_value;
            }

            // Remaining columns carry no class information.
            for slot in uninformative.iter_mut() {
                *slot = rng.gen_range(-1.0..1.0);
            }

            // Assign label with possible flip.
            let final_label = if rng.gen_range(0.0..1.0) < flip_y {
                rng.random_range(0..n_classes)
            } else {
                class_id
            };

            labels.push(final_label as f64);
        }

        // Shuffle if requested. Row indices are shuffled rather than row copies: this
        // avoids one allocation per row and also works when rows have zero width.
        if config.shuffle {
            let mut order: Vec<usize> = (0..labels.len()).collect();
            rng.shuffle(&mut order);

            let mut shuffled_features = Vec::with_capacity(features.len());
            let mut shuffled_labels = Vec::with_capacity(labels.len());
            for &row in &order {
                let start = row * n_features;
                shuffled_features.extend_from_slice(&features[start..start + n_features]);
                shuffled_labels.push(labels[row]);
            }
            features = shuffled_features;
            labels = shuffled_labels;
        }

        let feature_tensor = Tensor::from_vec(features, &[config.n_samples, n_features])?;
        let label_tensor = Tensor::from_vec(labels, &[config.n_samples])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }

    /// Generate a synthetic linear regression dataset.
    ///
    /// The feature matrix has `config.n_samples` rows and `n_features` columns, every
    /// entry drawn uniformly from `[-1, 1)`. A ground-truth weight is drawn for each of
    /// the first `n_informative` columns; all other columns get weight `0`. Each target
    /// is `bias + features . weights` plus a uniform noise draw from
    /// `[-config.noise_level, config.noise_level)`.
    ///
    /// Weight magnitudes depend on `effective_rank`:
    ///
    /// * `None`: every informative weight is a uniform draw from `[-1, 1)`.
    /// * `Some(rank)`: the first `rank` informative weights are scaled by `100.0`, the
    ///   rest by `tail_strength`.
    ///
    /// Other notes:
    ///
    /// * If `n_informative` exceeds `n_features`, the surplus weights are still drawn
    ///   but never used.
    /// * A non-positive `config.noise_level` disables the target noise (no draw is
    ///   made) instead of requesting a sample from an empty range.
    /// * When `config.random_seed` is `None` the generator is seeded with `0`.
    /// * `config.shuffle` is not consulted by this generator.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Tensor::from_vec` while building the
    /// `[n_samples, n_features]` feature tensor or the `[n_samples]` target tensor.
    pub fn make_regression(
        config: SyntheticConfig,
        n_features: usize,
        n_informative: usize,
        effective_rank: Option<usize>,
        tail_strength: f64,
        bias: f64,
    ) -> Result<SyntheticDataset<f64>> {
        let mut rng = Random::seed(config.random_seed.unwrap_or(0));

        let n_samples = config.n_samples;
        let noise_level = config.noise_level;

        // Generate random X matrix (row-major).
        let n_entries = n_samples * n_features;
        let mut features: Vec<f64> = Vec::with_capacity(n_entries);
        for _ in 0..n_entries {
            features.push(rng.gen_range(-1.0..1.0));
        }

        // Generate random ground truth weights for the informative columns.
        let mut true_weights: Vec<f64> = Vec::with_capacity(n_informative.max(n_features));
        for i in 0..n_informative {
            let draw: f64 = rng.gen_range(-1.0..1.0);
            let weight = match effective_rank {
                Some(rank) if i < rank => 100.0 * draw,
                Some(_) => tail_strength * draw,
                None => draw,
            };
            true_weights.push(weight);
        }

        // Non-informative columns contribute nothing to the target.
        if true_weights.len() < n_features {
            true_weights.resize(n_features, 0.0);
        }

        // Generate targets: bias + dot(row, weights) + noise.
        let mut labels: Vec<f64> = Vec::with_capacity(n_samples);
        for sample_idx in 0..n_samples {
            let start = sample_idx * n_features;
            let row = &features[start..start + n_features];

            let mut target = row
                .iter()
                .zip(&true_weights)
                .fold(bias, |acc, (&feature_value, &weight)| acc + feature_value * weight);

            // Skip the draw when the noise range would be empty.
            if noise_level > 0.0 {
                target += rng.gen_range(-noise_level..noise_level);
            }
            labels.push(target);
        }

        let feature_tensor = Tensor::from_vec(features, &[n_samples, n_features])?;
        let label_tensor = Tensor::from_vec(labels, &[n_samples])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }

    /// Generate S-curve manifold
    pub fn make_s_curve(config: SyntheticConfig, noise: f64) -> Result<SyntheticDataset<f64>> {
        let mut rng = if let Some(seed) = config.random_seed {
            Random::seed(seed)
        } else {
            Random::seed(0)
        };

        let mut features = Vec::new();
        let mut labels = Vec::new(); // Will contain the parameter t for color coding

        for _ in 0..config.n_samples {
            let t = rng.gen_range(0.0..1.0);

            // S-curve parametric equations
            let arg: f64 = 1.5 * (1.5 * t - 1.0);
            let x = arg.sin();
            let y = 2.0 * rng.gen_range(-1.0..1.0); // Random y coordinate
            let z = arg.signum() * arg.cos();

            // Add noise
            features.push(x + noise * rng.gen_range(-1.0..1.0));
            features.push(y + noise * rng.gen_range(-1.0..1.0));
            features.push(z + noise * rng.gen_range(-1.0..1.0));

            labels.push(t);
        }

        let feature_tensor = Tensor::from_vec(features, &[config.n_samples, 3])?;
        let label_tensor = Tensor::from_vec(labels, &[config.n_samples])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }

    /// Generate points on a Swiss-roll manifold embedded in three dimensions.
    ///
    /// For each sample a roll parameter `t` is drawn uniformly from
    /// `[1.5*PI, 4.5*PI)` and a height uniformly from `[0, 21)`. The point is
    /// `(t*cos(t), height, t*sin(t))`, with every coordinate perturbed by `noise`
    /// times a uniform draw from `[-1, 1)`. The returned labels are the `t` values,
    /// which are useful for colouring points by their position along the roll.
    ///
    /// When `config.random_seed` is `None` the generator is seeded with `0`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Tensor::from_vec` while building the
    /// `[n_samples, 3]` feature tensor or the `[n_samples]` label tensor.
    pub fn make_swiss_roll(config: SyntheticConfig, noise: f64) -> Result<SyntheticDataset<f64>> {
        let mut rng = match config.random_seed {
            Some(seed) => Random::seed(seed),
            None => Random::seed(0),
        };

        let n_points = config.n_samples;
        let mut coords: Vec<f64> = Vec::new();
        let mut roll_params: Vec<f64> = Vec::new();

        for _ in 0..n_points {
            // Position along the spiral, then position along the roll's axis.
            let t: f64 = rng.gen_range(1.5 * PI..4.5 * PI);
            let height: f64 = rng.gen_range(0.0..21.0);

            // Per-axis perturbations, drawn in x, y, z order.
            let jitter_x = noise * rng.gen_range(-1.0..1.0);
            let jitter_y = noise * rng.gen_range(-1.0..1.0);
            let jitter_z = noise * rng.gen_range(-1.0..1.0);

            coords.extend_from_slice(&[
                t * t.cos() + jitter_x,
                height + jitter_y,
                t * t.sin() + jitter_z,
            ]);
            roll_params.push(t);
        }

        let feature_tensor = Tensor::from_vec(coords, &[n_points, 3])?;
        let label_tensor = Tensor::from_vec(roll_params, &[n_points])?;

        Ok(SyntheticDataset::new(feature_tensor, label_tensor))
    }
}