Function make_time_series

pub fn make_time_series(
    n_samples: usize,
    n_features: usize,
    trend: bool,
    seasonality: bool,
    noise: f64,
    randomseed: Option<u64>,
) -> Result<Dataset>

Generate a random time series dataset with optional trend and seasonality components and a configurable noise level.

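A minimal usage sketch, assuming `make_time_series` is in scope (import path omitted) and that, as in the repository examples below, the returned `Dataset` exposes `n_samples()` and `n_features()`:

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 50 time steps, 2 features, with trend and seasonality,
    // noise level 0.1, and a fixed seed for reproducibility
    let ts = make_time_series(50, 2, true, true, 0.1, Some(7))?;

    println!("Time steps: {}", ts.n_samples());
    println!("Features: {}", ts.n_features());
    Ok(())
}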
Examples found in repository
examples/noise_models_demo.rs (line 156)
152fn demonstrate_time_series_noise() {
153    println!("Testing different time series noise types:");
154
155    // Create a simple time series
156    let clean_ts = make_time_series(100, 2, true, true, 0.0, Some(42)).unwrap();
157
158    let noise_configs = [
159        vec![("gaussian", 0.2)],
160        vec![("spikes", 0.1)],
161        vec![("drift", 0.5)],
162        vec![("seasonal", 0.3)],
163        vec![("autocorrelated", 0.1)],
164        vec![("heteroscedastic", 0.2)],
165        vec![("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)], // Combined noise
166    ];
167
168    let noisenames = [
169        "Gaussian White Noise",
170        "Impulse Spikes",
171        "Linear Drift",
172        "Seasonal Pattern",
173        "Autocorrelated Noise",
174        "Heteroscedastic Noise",
175        "Combined Noise",
176    ];
177
178    for (config, name) in noise_configs.iter().zip(noisenames.iter()) {
179        let mut noisydata = clean_ts.data.clone();
180        let original_stats = calculate_basic_stats(&noisydata);
181
182        add_time_series_noise(&mut noisydata, config, Some(42)).unwrap();
183        let noisy_stats = calculate_basic_stats(&noisydata);
184
185        println!("{name}:");
186        println!("  Mean: {:.3} -> {:.3}", original_stats.0, noisy_stats.0);
187        println!("  Std: {:.3} -> {:.3}", original_stats.1, noisy_stats.1);
188        println!(
189            "  Range: [{:.3}, {:.3}] -> [{:.3}, {:.3}]",
190            original_stats.2, original_stats.3, noisy_stats.2, noisy_stats.3
191        );
192    }
193}
194
195#[allow(dead_code)]
196fn demonstrate_comprehensive_corruption() {
197    println!("Testing comprehensive dataset corruption:");
198
199    // Load a real dataset
200    let iris = load_iris().unwrap();
201    println!(
202        "Original Iris dataset: {} samples, {} features",
203        iris.n_samples(),
204        iris.n_features()
205    );
206
207    let original_stats = calculate_basic_stats(&iris.data);
208    println!(
209        "Original stats - Mean: {:.3}, Std: {:.3}",
210        original_stats.0, original_stats.1
211    );
212
213    // Create different levels of corruption
214    let corruption_levels = [
215        (0.05, 0.02, "Light corruption"),
216        (0.1, 0.05, "Moderate corruption"),
217        (0.2, 0.1, "Heavy corruption"),
218        (0.3, 0.15, "Severe corruption"),
219    ];
220
221    for (missing_rate, outlier_rate, description) in corruption_levels {
222        let corrupted = make_corrupted_dataset(
223            &iris,
224            missing_rate,
225            MissingPattern::MAR, // More realistic than MCAR
226            outlier_rate,
227            OutlierType::Point,
228            2.5,
229            Some(42),
230        )
231        .unwrap();
232
233        // Calculate how much data is usable
234        let total_elements = corrupted.data.len();
235        let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
236        let usable_percentage =
237            ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;
238
239        println!("{description}:");
240        println!("  Missing data: {:.1}%", missing_rate * 100.0);
241        println!("  Outliers: {:.1}%", outlier_rate * 100.0);
242        println!("  Usable data: {:.1}%", usable_percentage);
243
244        // Show metadata
245        if let Some(missing_count) = corrupted.metadata.get("missing_count") {
246            println!("  Actual missing: {missing_count} elements");
247        }
248        if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
249            println!("  Actual outliers: {outlier_count} samples");
250        }
251    }
252}
253
254#[allow(dead_code)]
255fn demonstrate_real_world_applications() {
256    println!("Real-world application scenarios:");
257
258    println!("\n1. **Medical Data Simulation**:");
259    let medicaldata = load_iris().unwrap(); // Stand-in for medical measurements
260    let _corrupted_medical = make_corrupted_dataset(
261        &medicaldata,
262        0.15,                 // 15% missing - common in medical data
263        MissingPattern::MNAR, // High values often missing (privacy, measurement issues)
264        0.05,                 // 5% outliers - measurement errors
265        OutlierType::Point,
266        2.0,
267        Some(42),
268    )
269    .unwrap();
270
271    println!("  Medical dataset simulation:");
272    println!("    Missing data pattern: MNAR (high values more likely missing)");
273    println!("    Outliers: Point outliers (measurement errors)");
274    println!("    Use case: Testing imputation algorithms for clinical data");
275
276    println!("\n2. **Sensor Network Simulation**:");
277    let sensordata = make_time_series(200, 4, true, true, 0.1, Some(42)).unwrap();
278    let mut sensor_ts = sensordata.data.clone();
279
280    // Add realistic sensor noise
281    add_time_series_noise(
282        &mut sensor_ts,
283        &[
284            ("gaussian", 0.05),        // Background noise
285            ("spikes", 0.02),          // Electrical interference
286            ("drift", 0.1),            // Sensor calibration drift
287            ("heteroscedastic", 0.03), // Temperature-dependent noise
288        ],
289        Some(42),
290    )
291    .unwrap();
292
293    // Add missing data (sensor failures)
294    inject_missing_data(&mut sensor_ts, 0.08, MissingPattern::Block, Some(42)).unwrap();
295
296    println!("  Sensor network simulation:");
297    println!("    Multiple noise types: gaussian + spikes + drift + heteroscedastic");
298    println!("    Missing data: Block pattern (sensor failures)");
299    println!("    Use case: Testing robust time series algorithms");
300
301    println!("\n3. **Survey Data Simulation**:");
302    let surveydata = load_iris().unwrap(); // Stand-in for survey responses
303    let _corrupted_survey = make_corrupted_dataset(
304        &surveydata,
305        0.25,                // 25% missing - typical for surveys
306        MissingPattern::MAR, // Missing depends on other responses
307        0.08,                // 8% outliers - data entry errors, extreme responses
308        OutlierType::Contextual,
309        1.5,
310        Some(42),
311    )
312    .unwrap();
313
314    println!("  Survey data simulation:");
315    println!("    Missing data pattern: MAR (depends on other responses)");
316    println!("    Outliers: Contextual (unusual response patterns)");
317    println!("    Use case: Testing survey analysis robustness");
318
319    println!("\n4. **Financial Data Simulation**:");
320    let mut financial_ts = make_time_series(500, 3, false, false, 0.02, Some(42))
321        .unwrap()
322        .data;
323
324    // Add financial market-specific noise
325    add_time_series_noise(
326        &mut financial_ts,
327        &[
328            ("gaussian", 0.1),        // Market volatility
329            ("spikes", 0.05),         // Market shocks
330            ("autocorrelated", 0.15), // Momentum effects
331            ("heteroscedastic", 0.2), // Volatility clustering
332        ],
333        Some(42),
334    )
335    .unwrap();
336
337    println!("  Financial data simulation:");
338    println!("    Noise types: volatility + shocks + momentum + clustering");
339    println!("    Use case: Testing financial models under realistic conditions");
340}
More examples
examples/data_generators.rs (lines 77-84)
7fn main() -> Result<(), Box<dyn std::error::Error>> {
8    println!("Creating synthetic datasets...\n");
9
10    // Generate classification dataset
11    let n_samples = 100;
12    let n_features = 5;
13
14    let classificationdata = make_classification(
15        n_samples,
16        n_features,
17        3,        // 3 classes
18        2,        // 2 clusters per class
19        3,        // 3 informative features
20        Some(42), // random seed
21    )?;
22
23    // Train-test split
24    let (train, test) = train_test_split(&classificationdata, 0.2, Some(42))?;
25
26    println!("Classification dataset:");
27    println!("  Total samples: {}", classificationdata.n_samples());
28    println!("  Features: {}", classificationdata.n_features());
29    println!("  Training samples: {}", train.n_samples());
30    println!("  Test samples: {}", test.n_samples());
31
32    // Generate regression dataset
33    let regressiondata = make_regression(
34        n_samples,
35        n_features,
36        3,   // 3 informative features
37        0.5, // noise level
38        Some(42),
39    )?;
40
41    println!("\nRegression dataset:");
42    println!("  Samples: {}", regressiondata.n_samples());
43    println!("  Features: {}", regressiondata.n_features());
44
45    // Normalize the data (in-place)
46    let mut data_copy = regressiondata.data.clone();
47    normalize(&mut data_copy);
48    println!("  Data normalized successfully");
49
50    // Generate clustering data (blobs)
51    let clusteringdata = make_blobs(
52        n_samples,
53        2,   // 2 features for easy visualization
54        4,   // 4 clusters
55        0.8, // cluster standard deviation
56        Some(42),
57    )?;
58
59    println!("\nClustering dataset (blobs):");
60    println!("  Samples: {}", clusteringdata.n_samples());
61    println!("  Features: {}", clusteringdata.n_features());
62
63    // Find the number of clusters by finding the max value of target
64    let num_clusters = clusteringdata.target.as_ref().map_or(0, |t| {
65        let mut max_val = -1.0;
66        for &val in t.iter() {
67            if val > max_val {
68                max_val = val;
69            }
70        }
71        (max_val as usize) + 1
72    });
73
74    println!("  Clusters: {num_clusters}");
75
76    // Generate time series data
77    let time_series = make_time_series(
78        100,  // 100 time steps
79        3,    // 3 features/variables
80        true, // with trend
81        true, // with seasonality
82        0.2,  // noise level
83        Some(42),
84    )?;
85
86    println!("\nTime series dataset:");
87    println!("  Time steps: {}", time_series.n_samples());
88    println!("  Features: {}", time_series.n_features());
89
90    Ok(())
91}