pub fn make_time_series(
n_samples: usize,
n_features: usize,
trend: bool,
seasonality: bool,
noise: f64,
randomseed: Option<u64>,
) -> Result<Dataset>
Expand description
Generate a random time series dataset
Examples found in repository
examples/noise_models_demo.rs (line 156)
152fn demonstrate_time_series_noise() {
153 println!("Testing different time series noise types:");
154
155 // Create a simple time series
156 let clean_ts = make_time_series(100, 2, true, true, 0.0, Some(42)).unwrap();
157
158 let noise_configs = [
159 vec![("gaussian", 0.2)],
160 vec![("spikes", 0.1)],
161 vec![("drift", 0.5)],
162 vec![("seasonal", 0.3)],
163 vec![("autocorrelated", 0.1)],
164 vec![("heteroscedastic", 0.2)],
165 vec![("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)], // Combined noise
166 ];
167
168 let noisenames = [
169 "Gaussian White Noise",
170 "Impulse Spikes",
171 "Linear Drift",
172 "Seasonal Pattern",
173 "Autocorrelated Noise",
174 "Heteroscedastic Noise",
175 "Combined Noise",
176 ];
177
178 for (config, name) in noise_configs.iter().zip(noisenames.iter()) {
179 let mut noisydata = clean_ts.data.clone();
180 let original_stats = calculate_basic_stats(&noisydata);
181
182 add_time_series_noise(&mut noisydata, config, Some(42)).unwrap();
183 let noisy_stats = calculate_basic_stats(&noisydata);
184
185 println!("{name}:");
186 println!(" Mean: {:.3} -> {:.3}", original_stats.0, noisy_stats.0);
187 println!(" Std: {:.3} -> {:.3}", original_stats.1, noisy_stats.1);
188 println!(
189 " Range: [{:.3}, {:.3}] -> [{:.3}, {:.3}]",
190 original_stats.2, original_stats.3, noisy_stats.2, noisy_stats.3
191 );
192 }
193}
194
195#[allow(dead_code)]
196fn demonstrate_comprehensive_corruption() {
197 println!("Testing comprehensive dataset corruption:");
198
199 // Load a real dataset
200 let iris = load_iris().unwrap();
201 println!(
202 "Original Iris dataset: {} samples, {} features",
203 iris.n_samples(),
204 iris.n_features()
205 );
206
207 let original_stats = calculate_basic_stats(&iris.data);
208 println!(
209 "Original stats - Mean: {:.3}, Std: {:.3}",
210 original_stats.0, original_stats.1
211 );
212
213 // Create different levels of corruption
214 let corruption_levels = [
215 (0.05, 0.02, "Light corruption"),
216 (0.1, 0.05, "Moderate corruption"),
217 (0.2, 0.1, "Heavy corruption"),
218 (0.3, 0.15, "Severe corruption"),
219 ];
220
221 for (missing_rate, outlier_rate, description) in corruption_levels {
222 let corrupted = make_corrupted_dataset(
223 &iris,
224 missing_rate,
225 MissingPattern::MAR, // More realistic than MCAR
226 outlier_rate,
227 OutlierType::Point,
228 2.5,
229 Some(42),
230 )
231 .unwrap();
232
233 // Calculate how much data is usable
234 let total_elements = corrupted.data.len();
235 let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
236 let usable_percentage =
237 ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;
238
239 println!("{description}:");
240 println!(" Missing data: {:.1}%", missing_rate * 100.0);
241 println!(" Outliers: {:.1}%", outlier_rate * 100.0);
242 println!(" Usable data: {:.1}%", usable_percentage);
243
244 // Show metadata
245 if let Some(missing_count) = corrupted.metadata.get("missing_count") {
246 println!(" Actual missing: {missing_count} elements");
247 }
248 if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
249 println!(" Actual outliers: {outlier_count} samples");
250 }
251 }
252}
253
254#[allow(dead_code)]
255fn demonstrate_real_world_applications() {
256 println!("Real-world application scenarios:");
257
258 println!("\n1. **Medical Data Simulation**:");
259 let medicaldata = load_iris().unwrap(); // Stand-in for medical measurements
260 let _corrupted_medical = make_corrupted_dataset(
261 &medicaldata,
262 0.15, // 15% missing - common in medical data
263 MissingPattern::MNAR, // High values often missing (privacy, measurement issues)
264 0.05, // 5% outliers - measurement errors
265 OutlierType::Point,
266 2.0,
267 Some(42),
268 )
269 .unwrap();
270
271 println!(" Medical dataset simulation:");
272 println!(" Missing data pattern: MNAR (high values more likely missing)");
273 println!(" Outliers: Point outliers (measurement errors)");
274 println!(" Use case: Testing imputation algorithms for clinical data");
275
276 println!("\n2. **Sensor Network Simulation**:");
277 let sensordata = make_time_series(200, 4, true, true, 0.1, Some(42)).unwrap();
278 let mut sensor_ts = sensordata.data.clone();
279
280 // Add realistic sensor noise
281 add_time_series_noise(
282 &mut sensor_ts,
283 &[
284 ("gaussian", 0.05), // Background noise
285 ("spikes", 0.02), // Electrical interference
286 ("drift", 0.1), // Sensor calibration drift
287 ("heteroscedastic", 0.03), // Temperature-dependent noise
288 ],
289 Some(42),
290 )
291 .unwrap();
292
293 // Add missing data (sensor failures)
294 inject_missing_data(&mut sensor_ts, 0.08, MissingPattern::Block, Some(42)).unwrap();
295
296 println!(" Sensor network simulation:");
297 println!(" Multiple noise types: gaussian + spikes + drift + heteroscedastic");
298 println!(" Missing data: Block pattern (sensor failures)");
299 println!(" Use case: Testing robust time series algorithms");
300
301 println!("\n3. **Survey Data Simulation**:");
302 let surveydata = load_iris().unwrap(); // Stand-in for survey responses
303 let _corrupted_survey = make_corrupted_dataset(
304 &surveydata,
305 0.25, // 25% missing - typical for surveys
306 MissingPattern::MAR, // Missing depends on other responses
307 0.08, // 8% outliers - data entry errors, extreme responses
308 OutlierType::Contextual,
309 1.5,
310 Some(42),
311 )
312 .unwrap();
313
314 println!(" Survey data simulation:");
315 println!(" Missing data pattern: MAR (depends on other responses)");
316 println!(" Outliers: Contextual (unusual response patterns)");
317 println!(" Use case: Testing survey analysis robustness");
318
319 println!("\n4. **Financial Data Simulation**:");
320 let mut financial_ts = make_time_series(500, 3, false, false, 0.02, Some(42))
321 .unwrap()
322 .data;
323
324 // Add financial market-specific noise
325 add_time_series_noise(
326 &mut financial_ts,
327 &[
328 ("gaussian", 0.1), // Market volatility
329 ("spikes", 0.05), // Market shocks
330 ("autocorrelated", 0.15), // Momentum effects
331 ("heteroscedastic", 0.2), // Volatility clustering
332 ],
333 Some(42),
334 )
335 .unwrap();
336
337 println!(" Financial data simulation:");
338 println!(" Noise types: volatility + shocks + momentum + clustering");
339 println!(" Use case: Testing financial models under realistic conditions");
340}
More examples
examples/data_generators.rs (lines 77-84)
7fn main() -> Result<(), Box<dyn std::error::Error>> {
8 println!("Creating synthetic datasets...\n");
9
10 // Generate classification dataset
11 let n_samples = 100;
12 let n_features = 5;
13
14 let classificationdata = make_classification(
15 n_samples,
16 n_features,
17 3, // 3 classes
18 2, // 2 clusters per class
19 3, // 3 informative features
20 Some(42), // random seed
21 )?;
22
23 // Train-test split
24 let (train, test) = train_test_split(&classificationdata, 0.2, Some(42))?;
25
26 println!("Classification dataset:");
27 println!(" Total samples: {}", classificationdata.n_samples());
28 println!(" Features: {}", classificationdata.n_features());
29 println!(" Training samples: {}", train.n_samples());
30 println!(" Test samples: {}", test.n_samples());
31
32 // Generate regression dataset
33 let regressiondata = make_regression(
34 n_samples,
35 n_features,
36 3, // 3 informative features
37 0.5, // noise level
38 Some(42),
39 )?;
40
41 println!("\nRegression dataset:");
42 println!(" Samples: {}", regressiondata.n_samples());
43 println!(" Features: {}", regressiondata.n_features());
44
45 // Normalize the data (in-place)
46 let mut data_copy = regressiondata.data.clone();
47 normalize(&mut data_copy);
48 println!(" Data normalized successfully");
49
50 // Generate clustering data (blobs)
51 let clusteringdata = make_blobs(
52 n_samples,
53 2, // 2 features for easy visualization
54 4, // 4 clusters
55 0.8, // cluster standard deviation
56 Some(42),
57 )?;
58
59 println!("\nClustering dataset (blobs):");
60 println!(" Samples: {}", clusteringdata.n_samples());
61 println!(" Features: {}", clusteringdata.n_features());
62
63 // Find the number of clusters by finding the max value of target
64 let num_clusters = clusteringdata.target.as_ref().map_or(0, |t| {
65 let mut max_val = -1.0;
66 for &val in t.iter() {
67 if val > max_val {
68 max_val = val;
69 }
70 }
71 (max_val as usize) + 1
72 });
73
74 println!(" Clusters: {num_clusters}");
75
76 // Generate time series data
77 let time_series = make_time_series(
78 100, // 100 time steps
79 3, // 3 features/variables
80 true, // with trend
81 true, // with seasonality
82 0.2, // noise level
83 Some(42),
84 )?;
85
86 println!("\nTime series dataset:");
87 println!(" Time steps: {}", time_series.n_samples());
88 println!(" Features: {}", time_series.n_features());
89
90 Ok(())
91}