// noise_models_demo/noise_models_demo.rs

//! Realistic noise models demonstration
//!
//! This example demonstrates the use of realistic noise injection utilities for creating
//! datasets with missing data, outliers, and various noise patterns that mimic real-world
//! data quality issues.

use scirs2_core::ndarray::Array2;
use scirs2_datasets::{
    add_time_series_noise, inject_missing_data, inject_outliers, load_iris, make_corrupted_dataset,
    make_time_series, MissingPattern, OutlierType,
};

13#[allow(dead_code)]
14fn main() {
15    println!("=== Realistic Noise Models Demonstration ===\n");
16
17    // Demonstrate missing data patterns
18    println!("=== Missing Data Patterns ========================");
19    demonstrate_missing_data_patterns();
20
21    // Demonstrate outlier injection
22    println!("\n=== Outlier Injection ============================");
23    demonstrate_outlier_injection();
24
25    // Demonstrate time series noise
26    println!("\n=== Time Series Noise ============================");
27    demonstrate_time_series_noise();
28
29    // Demonstrate comprehensive corruption
30    println!("\n=== Comprehensive Dataset Corruption =============");
31    demonstrate_comprehensive_corruption();
32
33    // Real-world applications
34    println!("\n=== Real-World Applications ======================");
35    demonstrate_real_world_applications();
36
37    println!("\n=== Noise Models Demo Complete ===================");
38}
39
40#[allow(dead_code)]
41fn demonstrate_missing_data_patterns() {
42    println!("Testing different missing data patterns on a sample dataset:");
43
44    let originaldata = Array2::from_shape_vec(
45        (8, 4),
46        vec![
47            1.0, 2.0, 3.0, 4.0, 2.0, 4.0, 6.0, 8.0, 3.0, 6.0, 9.0, 12.0, 4.0, 8.0, 12.0, 16.0, 5.0,
48            10.0, 15.0, 20.0, 6.0, 12.0, 18.0, 24.0, 7.0, 14.0, 21.0, 28.0, 8.0, 16.0, 24.0, 32.0,
49        ],
50    )
51    .unwrap();
52
53    let patterns = [
54        (MissingPattern::MCAR, "Missing Completely at Random"),
55        (MissingPattern::MAR, "Missing at Random"),
56        (MissingPattern::MNAR, "Missing Not at Random"),
57        (MissingPattern::Block, "Block-wise Missing"),
58    ];
59
60    for (pattern, description) in patterns {
61        let mut testdata = originaldata.clone();
62        let missing_mask = inject_missing_data(&mut testdata, 0.3, pattern, Some(42)).unwrap();
63
64        let missing_count = missing_mask.iter().filter(|&&x| x).count();
65        let total_elements = testdata.len();
66        let missing_percentage = (missing_count as f64 / total_elements as f64) * 100.0;
67
68        println!("{description}:");
69        println!(
70            "  Missing elements: {} / {} ({:.1}%)",
71            missing_count, total_elements, missing_percentage
72        );
73
74        // Show pattern of missing data
75        print!("  Pattern (X = missing): ");
76        for i in 0..testdata.nrows() {
77            for j in 0..testdata.ncols() {
78                if missing_mask[[i, j]] {
79                    print!("X ");
80                } else {
81                    print!(". ");
82                }
83            }
84            if i < testdata.nrows() - 1 {
85                print!("| ");
86            }
87        }
88        println!();
89    }
90}
91
92#[allow(dead_code)]
93fn demonstrate_outlier_injection() {
94    println!("Testing different outlier types on a sample dataset:");
95
96    // Create a clean dataset with known statistics
97    let mut cleandata = Array2::ones((20, 3));
98    // Add some structure
99    for i in 0..20 {
100        for j in 0..3 {
101            cleandata[[i, j]] = (i as f64 + j as f64) / 2.0;
102        }
103    }
104
105    let outlier_types = [
106        (OutlierType::Point, "Point Outliers"),
107        (OutlierType::Contextual, "Contextual Outliers"),
108        (OutlierType::Collective, "Collective Outliers"),
109    ];
110
111    for (outlier_type, description) in outlier_types {
112        let mut testdata = cleandata.clone();
113        let original_stats = calculate_basic_stats(&testdata);
114
115        let outlier_mask =
116            inject_outliers(&mut testdata, 0.2, outlier_type, 3.0, Some(42)).unwrap();
117        let corrupted_stats = calculate_basic_stats(&testdata);
118
119        let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
120
121        println!("{description}:");
122        println!(
123            "  Outliers injected: {} / {} samples",
124            outlier_count,
125            testdata.nrows()
126        );
127        println!(
128            "  Mean change: {:.3} -> {:.3} (Δ={:.3})",
129            original_stats.0,
130            corrupted_stats.0,
131            corrupted_stats.0 - original_stats.0
132        );
133        println!(
134            "  Std change: {:.3} -> {:.3} (Δ={:.3})",
135            original_stats.1,
136            corrupted_stats.1,
137            corrupted_stats.1 - original_stats.1
138        );
139
140        // Show which samples are outliers
141        print!("  Outlier samples: ");
142        for (i, &is_outlier) in outlier_mask.iter().enumerate() {
143            if is_outlier {
144                print!("{} ", i);
145            }
146        }
147        println!();
148    }
149}
150
151#[allow(dead_code)]
152fn demonstrate_time_series_noise() {
153    println!("Testing different time series noise types:");
154
155    // Create a simple time series
156    let clean_ts = make_time_series(100, 2, true, true, 0.0, Some(42)).unwrap();
157
158    let noise_configs = [
159        vec![("gaussian", 0.2)],
160        vec![("spikes", 0.1)],
161        vec![("drift", 0.5)],
162        vec![("seasonal", 0.3)],
163        vec![("autocorrelated", 0.1)],
164        vec![("heteroscedastic", 0.2)],
165        vec![("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)], // Combined noise
166    ];
167
168    let noisenames = [
169        "Gaussian White Noise",
170        "Impulse Spikes",
171        "Linear Drift",
172        "Seasonal Pattern",
173        "Autocorrelated Noise",
174        "Heteroscedastic Noise",
175        "Combined Noise",
176    ];
177
178    for (config, name) in noise_configs.iter().zip(noisenames.iter()) {
179        let mut noisydata = clean_ts.data.clone();
180        let original_stats = calculate_basic_stats(&noisydata);
181
182        add_time_series_noise(&mut noisydata, config, Some(42)).unwrap();
183        let noisy_stats = calculate_basic_stats(&noisydata);
184
185        println!("{name}:");
186        println!("  Mean: {:.3} -> {:.3}", original_stats.0, noisy_stats.0);
187        println!("  Std: {:.3} -> {:.3}", original_stats.1, noisy_stats.1);
188        println!(
189            "  Range: [{:.3}, {:.3}] -> [{:.3}, {:.3}]",
190            original_stats.2, original_stats.3, noisy_stats.2, noisy_stats.3
191        );
192    }
193}
194
195#[allow(dead_code)]
196fn demonstrate_comprehensive_corruption() {
197    println!("Testing comprehensive dataset corruption:");
198
199    // Load a real dataset
200    let iris = load_iris().unwrap();
201    println!(
202        "Original Iris dataset: {} samples, {} features",
203        iris.n_samples(),
204        iris.n_features()
205    );
206
207    let original_stats = calculate_basic_stats(&iris.data);
208    println!(
209        "Original stats - Mean: {:.3}, Std: {:.3}",
210        original_stats.0, original_stats.1
211    );
212
213    // Create different levels of corruption
214    let corruption_levels = [
215        (0.05, 0.02, "Light corruption"),
216        (0.1, 0.05, "Moderate corruption"),
217        (0.2, 0.1, "Heavy corruption"),
218        (0.3, 0.15, "Severe corruption"),
219    ];
220
221    for (missing_rate, outlier_rate, description) in corruption_levels {
222        let corrupted = make_corrupted_dataset(
223            &iris,
224            missing_rate,
225            MissingPattern::MAR, // More realistic than MCAR
226            outlier_rate,
227            OutlierType::Point,
228            2.5,
229            Some(42),
230        )
231        .unwrap();
232
233        // Calculate how much data is usable
234        let total_elements = corrupted.data.len();
235        let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
236        let usable_percentage =
237            ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;
238
239        println!("{description}:");
240        println!("  Missing data: {:.1}%", missing_rate * 100.0);
241        println!("  Outliers: {:.1}%", outlier_rate * 100.0);
242        println!("  Usable data: {:.1}%", usable_percentage);
243
244        // Show metadata
245        if let Some(missing_count) = corrupted.metadata.get("missing_count") {
246            println!("  Actual missing: {missing_count} elements");
247        }
248        if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
249            println!("  Actual outliers: {outlier_count} samples");
250        }
251    }
252}
253
254#[allow(dead_code)]
255fn demonstrate_real_world_applications() {
256    println!("Real-world application scenarios:");
257
258    println!("\n1. **Medical Data Simulation**:");
259    let medicaldata = load_iris().unwrap(); // Stand-in for medical measurements
260    let _corrupted_medical = make_corrupted_dataset(
261        &medicaldata,
262        0.15,                 // 15% missing - common in medical data
263        MissingPattern::MNAR, // High values often missing (privacy, measurement issues)
264        0.05,                 // 5% outliers - measurement errors
265        OutlierType::Point,
266        2.0,
267        Some(42),
268    )
269    .unwrap();
270
271    println!("  Medical dataset simulation:");
272    println!("    Missing data pattern: MNAR (high values more likely missing)");
273    println!("    Outliers: Point outliers (measurement errors)");
274    println!("    Use case: Testing imputation algorithms for clinical data");
275
276    println!("\n2. **Sensor Network Simulation**:");
277    let sensordata = make_time_series(200, 4, true, true, 0.1, Some(42)).unwrap();
278    let mut sensor_ts = sensordata.data.clone();
279
280    // Add realistic sensor noise
281    add_time_series_noise(
282        &mut sensor_ts,
283        &[
284            ("gaussian", 0.05),        // Background noise
285            ("spikes", 0.02),          // Electrical interference
286            ("drift", 0.1),            // Sensor calibration drift
287            ("heteroscedastic", 0.03), // Temperature-dependent noise
288        ],
289        Some(42),
290    )
291    .unwrap();
292
293    // Add missing data (sensor failures)
294    inject_missing_data(&mut sensor_ts, 0.08, MissingPattern::Block, Some(42)).unwrap();
295
296    println!("  Sensor network simulation:");
297    println!("    Multiple noise types: gaussian + spikes + drift + heteroscedastic");
298    println!("    Missing data: Block pattern (sensor failures)");
299    println!("    Use case: Testing robust time series algorithms");
300
301    println!("\n3. **Survey Data Simulation**:");
302    let surveydata = load_iris().unwrap(); // Stand-in for survey responses
303    let _corrupted_survey = make_corrupted_dataset(
304        &surveydata,
305        0.25,                // 25% missing - typical for surveys
306        MissingPattern::MAR, // Missing depends on other responses
307        0.08,                // 8% outliers - data entry errors, extreme responses
308        OutlierType::Contextual,
309        1.5,
310        Some(42),
311    )
312    .unwrap();
313
314    println!("  Survey data simulation:");
315    println!("    Missing data pattern: MAR (depends on other responses)");
316    println!("    Outliers: Contextual (unusual response patterns)");
317    println!("    Use case: Testing survey analysis robustness");
318
319    println!("\n4. **Financial Data Simulation**:");
320    let mut financial_ts = make_time_series(500, 3, false, false, 0.02, Some(42))
321        .unwrap()
322        .data;
323
324    // Add financial market-specific noise
325    add_time_series_noise(
326        &mut financial_ts,
327        &[
328            ("gaussian", 0.1),        // Market volatility
329            ("spikes", 0.05),         // Market shocks
330            ("autocorrelated", 0.15), // Momentum effects
331            ("heteroscedastic", 0.2), // Volatility clustering
332        ],
333        Some(42),
334    )
335    .unwrap();
336
337    println!("  Financial data simulation:");
338    println!("    Noise types: volatility + shocks + momentum + clustering");
339    println!("    Use case: Testing financial models under realistic conditions");
340}
341
342/// Calculate basic statistics for a 2D array
343#[allow(dead_code)]
344fn calculate_basic_stats(data: &Array2<f64>) -> (f64, f64, f64, f64) {
345    let valid_values: Vec<f64> = data.iter().filter(|&&x| !x.is_nan()).cloned().collect();
346
347    if valid_values.is_empty() {
348        return (0.0, 0.0, 0.0, 0.0);
349    }
350
351    let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
352    let variance = valid_values
353        .iter()
354        .map(|&x| (x - mean).powi(2))
355        .sum::<f64>()
356        / valid_values.len() as f64;
357    let std = variance.sqrt();
358    let min = valid_values.iter().cloned().fold(f64::INFINITY, f64::min);
359    let max = valid_values
360        .iter()
361        .cloned()
362        .fold(f64::NEG_INFINITY, f64::max);
363
364    (mean, std, min, max)
365}