Function inject_missing_data

Source
pub fn inject_missing_data(
    data: &mut Array2<f64>,
    missing_rate: f64,
    pattern: MissingPattern,
    random_seed: Option<u64>,
) -> Result<Array2<bool>>
Expand description

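Injects missing data into a dataset in place, following one of the realistic missingness patterns described by `MissingPattern`. On success it returns a boolean mask in which `true` marks an entry that was made missing.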

Examples found in repository?
examples/noise_models_demo.rs (line 60)
39fn demonstrate_missing_data_patterns() {
40    println!("Testing different missing data patterns on a sample dataset:");
41
42    let original_data = Array2::from_shape_vec(
43        (8, 4),
44        vec![
45            1.0, 2.0, 3.0, 4.0, 2.0, 4.0, 6.0, 8.0, 3.0, 6.0, 9.0, 12.0, 4.0, 8.0, 12.0, 16.0, 5.0,
46            10.0, 15.0, 20.0, 6.0, 12.0, 18.0, 24.0, 7.0, 14.0, 21.0, 28.0, 8.0, 16.0, 24.0, 32.0,
47        ],
48    )
49    .unwrap();
50
51    let patterns = [
52        (MissingPattern::MCAR, "Missing Completely at Random"),
53        (MissingPattern::MAR, "Missing at Random"),
54        (MissingPattern::MNAR, "Missing Not at Random"),
55        (MissingPattern::Block, "Block-wise Missing"),
56    ];
57
58    for (pattern, description) in patterns {
59        let mut test_data = original_data.clone();
60        let missing_mask = inject_missing_data(&mut test_data, 0.3, pattern, Some(42)).unwrap();
61
62        let missing_count = missing_mask.iter().filter(|&&x| x).count();
63        let total_elements = test_data.len();
64        let missing_percentage = (missing_count as f64 / total_elements as f64) * 100.0;
65
66        println!("{}:", description);
67        println!(
68            "  Missing elements: {} / {} ({:.1}%)",
69            missing_count, total_elements, missing_percentage
70        );
71
72        // Show pattern of missing data
73        print!("  Pattern (X = missing): ");
74        for i in 0..test_data.nrows() {
75            for j in 0..test_data.ncols() {
76                if missing_mask[[i, j]] {
77                    print!("X ");
78                } else {
79                    print!(". ");
80                }
81            }
82            if i < test_data.nrows() - 1 {
83                print!("| ");
84            }
85        }
86        println!();
87    }
88}
89
90fn demonstrate_outlier_injection() {
91    println!("Testing different outlier types on a sample dataset:");
92
93    // Create a clean dataset with known statistics
94    let mut clean_data = Array2::ones((20, 3));
95    // Add some structure
96    for i in 0..20 {
97        for j in 0..3 {
98            clean_data[[i, j]] = (i as f64 + j as f64) / 2.0;
99        }
100    }
101
102    let outlier_types = [
103        (OutlierType::Point, "Point Outliers"),
104        (OutlierType::Contextual, "Contextual Outliers"),
105        (OutlierType::Collective, "Collective Outliers"),
106    ];
107
108    for (outlier_type, description) in outlier_types {
109        let mut test_data = clean_data.clone();
110        let original_stats = calculate_basic_stats(&test_data);
111
112        let outlier_mask =
113            inject_outliers(&mut test_data, 0.2, outlier_type, 3.0, Some(42)).unwrap();
114        let corrupted_stats = calculate_basic_stats(&test_data);
115
116        let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
117
118        println!("{}:", description);
119        println!(
120            "  Outliers injected: {} / {} samples",
121            outlier_count,
122            test_data.nrows()
123        );
124        println!(
125            "  Mean change: {:.3} -> {:.3} (Δ={:.3})",
126            original_stats.0,
127            corrupted_stats.0,
128            corrupted_stats.0 - original_stats.0
129        );
130        println!(
131            "  Std change: {:.3} -> {:.3} (Δ={:.3})",
132            original_stats.1,
133            corrupted_stats.1,
134            corrupted_stats.1 - original_stats.1
135        );
136
137        // Show which samples are outliers
138        print!("  Outlier samples: ");
139        for (i, &is_outlier) in outlier_mask.iter().enumerate() {
140            if is_outlier {
141                print!("{} ", i);
142            }
143        }
144        println!();
145    }
146}
147
148fn demonstrate_time_series_noise() {
149    println!("Testing different time series noise types:");
150
151    // Create a simple time series
152    let clean_ts = make_time_series(100, 2, true, true, 0.0, Some(42)).unwrap();
153
154    let noise_configs = [
155        vec![("gaussian", 0.2)],
156        vec![("spikes", 0.1)],
157        vec![("drift", 0.5)],
158        vec![("seasonal", 0.3)],
159        vec![("autocorrelated", 0.1)],
160        vec![("heteroscedastic", 0.2)],
161        vec![("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)], // Combined noise
162    ];
163
164    let noise_names = [
165        "Gaussian White Noise",
166        "Impulse Spikes",
167        "Linear Drift",
168        "Seasonal Pattern",
169        "Autocorrelated Noise",
170        "Heteroscedastic Noise",
171        "Combined Noise",
172    ];
173
174    for (config, name) in noise_configs.iter().zip(noise_names.iter()) {
175        let mut noisy_data = clean_ts.data.clone();
176        let original_stats = calculate_basic_stats(&noisy_data);
177
178        add_time_series_noise(&mut noisy_data, config, Some(42)).unwrap();
179        let noisy_stats = calculate_basic_stats(&noisy_data);
180
181        println!("{}:", name);
182        println!("  Mean: {:.3} -> {:.3}", original_stats.0, noisy_stats.0);
183        println!("  Std: {:.3} -> {:.3}", original_stats.1, noisy_stats.1);
184        println!(
185            "  Range: [{:.3}, {:.3}] -> [{:.3}, {:.3}]",
186            original_stats.2, original_stats.3, noisy_stats.2, noisy_stats.3
187        );
188    }
189}
190
191fn demonstrate_comprehensive_corruption() {
192    println!("Testing comprehensive dataset corruption:");
193
194    // Load a real dataset
195    let iris = load_iris().unwrap();
196    println!(
197        "Original Iris dataset: {} samples, {} features",
198        iris.n_samples(),
199        iris.n_features()
200    );
201
202    let original_stats = calculate_basic_stats(&iris.data);
203    println!(
204        "Original stats - Mean: {:.3}, Std: {:.3}",
205        original_stats.0, original_stats.1
206    );
207
208    // Create different levels of corruption
209    let corruption_levels = [
210        (0.05, 0.02, "Light corruption"),
211        (0.1, 0.05, "Moderate corruption"),
212        (0.2, 0.1, "Heavy corruption"),
213        (0.3, 0.15, "Severe corruption"),
214    ];
215
216    for (missing_rate, outlier_rate, description) in corruption_levels {
217        let corrupted = make_corrupted_dataset(
218            &iris,
219            missing_rate,
220            MissingPattern::MAR, // More realistic than MCAR
221            outlier_rate,
222            OutlierType::Point,
223            2.5,
224            Some(42),
225        )
226        .unwrap();
227
228        // Calculate how much data is usable
229        let total_elements = corrupted.data.len();
230        let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
231        let usable_percentage =
232            ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;
233
234        println!("{}:", description);
235        println!("  Missing data: {:.1}%", missing_rate * 100.0);
236        println!("  Outliers: {:.1}%", outlier_rate * 100.0);
237        println!("  Usable data: {:.1}%", usable_percentage);
238
239        // Show metadata
240        if let Some(missing_count) = corrupted.metadata.get("missing_count") {
241            println!("  Actual missing: {} elements", missing_count);
242        }
243        if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
244            println!("  Actual outliers: {} samples", outlier_count);
245        }
246    }
247}
248
249fn demonstrate_real_world_applications() {
250    println!("Real-world application scenarios:");
251
252    println!("\n1. **Medical Data Simulation**:");
253    let medical_data = load_iris().unwrap(); // Stand-in for medical measurements
254    let _corrupted_medical = make_corrupted_dataset(
255        &medical_data,
256        0.15,                 // 15% missing - common in medical data
257        MissingPattern::MNAR, // High values often missing (privacy, measurement issues)
258        0.05,                 // 5% outliers - measurement errors
259        OutlierType::Point,
260        2.0,
261        Some(42),
262    )
263    .unwrap();
264
265    println!("  Medical dataset simulation:");
266    println!("    Missing data pattern: MNAR (high values more likely missing)");
267    println!("    Outliers: Point outliers (measurement errors)");
268    println!("    Use case: Testing imputation algorithms for clinical data");
269
270    println!("\n2. **Sensor Network Simulation**:");
271    let sensor_data = make_time_series(200, 4, true, true, 0.1, Some(42)).unwrap();
272    let mut sensor_ts = sensor_data.data.clone();
273
274    // Add realistic sensor noise
275    add_time_series_noise(
276        &mut sensor_ts,
277        &[
278            ("gaussian", 0.05),        // Background noise
279            ("spikes", 0.02),          // Electrical interference
280            ("drift", 0.1),            // Sensor calibration drift
281            ("heteroscedastic", 0.03), // Temperature-dependent noise
282        ],
283        Some(42),
284    )
285    .unwrap();
286
287    // Add missing data (sensor failures)
288    inject_missing_data(&mut sensor_ts, 0.08, MissingPattern::Block, Some(42)).unwrap();
289
290    println!("  Sensor network simulation:");
291    println!("    Multiple noise types: gaussian + spikes + drift + heteroscedastic");
292    println!("    Missing data: Block pattern (sensor failures)");
293    println!("    Use case: Testing robust time series algorithms");
294
295    println!("\n3. **Survey Data Simulation**:");
296    let survey_data = load_iris().unwrap(); // Stand-in for survey responses
297    let _corrupted_survey = make_corrupted_dataset(
298        &survey_data,
299        0.25,                // 25% missing - typical for surveys
300        MissingPattern::MAR, // Missing depends on other responses
301        0.08,                // 8% outliers - data entry errors, extreme responses
302        OutlierType::Contextual,
303        1.5,
304        Some(42),
305    )
306    .unwrap();
307
308    println!("  Survey data simulation:");
309    println!("    Missing data pattern: MAR (depends on other responses)");
310    println!("    Outliers: Contextual (unusual response patterns)");
311    println!("    Use case: Testing survey analysis robustness");
312
313    println!("\n4. **Financial Data Simulation**:");
314    let mut financial_ts = make_time_series(500, 3, false, false, 0.02, Some(42))
315        .unwrap()
316        .data;
317
318    // Add financial market-specific noise
319    add_time_series_noise(
320        &mut financial_ts,
321        &[
322            ("gaussian", 0.1),        // Market volatility
323            ("spikes", 0.05),         // Market shocks
324            ("autocorrelated", 0.15), // Momentum effects
325            ("heteroscedastic", 0.2), // Volatility clustering
326        ],
327        Some(42),
328    )
329    .unwrap();
330
331    println!("  Financial data simulation:");
332    println!("    Noise types: volatility + shocks + momentum + clustering");
333    println!("    Use case: Testing financial models under realistic conditions");
334}