pub fn inject_missing_data(
data: &mut Array2<f64>,
missing_rate: f64,
pattern: MissingPattern,
random_seed: Option<u64>,
) -> Result<Array2<bool>>
Inject missing data into a dataset with realistic patterns
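A minimal usage sketch is shown below, before the full repository demo. It assumes `inject_missing_data` and `MissingPattern` are imported from this crate (the exact import path is not shown here) and that `ndarray`'s `Array2` is in scope; the NaN convention for missing entries follows what the corruption demo further down suggests.

```rust
// Minimal sketch. Assumptions: inject_missing_data and MissingPattern are
// brought into scope from this crate, and missing entries are written as NaN
// (as the make_corrupted_dataset demo below implies).
use ndarray::Array2;

fn main() {
    // A small 4 x 3 matrix of clean values.
    let mut data = Array2::from_shape_vec(
        (4, 3),
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
    )
    .unwrap();

    // Mark roughly 25% of the entries as missing completely at random,
    // with a fixed seed so the result is reproducible.
    let mask = inject_missing_data(&mut data, 0.25, MissingPattern::MCAR, Some(42)).unwrap();

    // The returned boolean mask flags the positions that were removed.
    let n_missing = mask.iter().filter(|&&m| m).count();
    println!("{} of {} values are now missing", n_missing, data.len());
}
```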
Examples found in repository
examples/noise_models_demo.rs (line 60)
39fn demonstrate_missing_data_patterns() {
40 println!("Testing different missing data patterns on a sample dataset:");
41
42 let original_data = Array2::from_shape_vec(
43 (8, 4),
44 vec![
45 1.0, 2.0, 3.0, 4.0, 2.0, 4.0, 6.0, 8.0, 3.0, 6.0, 9.0, 12.0, 4.0, 8.0, 12.0, 16.0, 5.0,
46 10.0, 15.0, 20.0, 6.0, 12.0, 18.0, 24.0, 7.0, 14.0, 21.0, 28.0, 8.0, 16.0, 24.0, 32.0,
47 ],
48 )
49 .unwrap();
50
51 let patterns = [
52 (MissingPattern::MCAR, "Missing Completely at Random"),
53 (MissingPattern::MAR, "Missing at Random"),
54 (MissingPattern::MNAR, "Missing Not at Random"),
55 (MissingPattern::Block, "Block-wise Missing"),
56 ];
57
58 for (pattern, description) in patterns {
59 let mut test_data = original_data.clone();
60 let missing_mask = inject_missing_data(&mut test_data, 0.3, pattern, Some(42)).unwrap();
61
62 let missing_count = missing_mask.iter().filter(|&&x| x).count();
63 let total_elements = test_data.len();
64 let missing_percentage = (missing_count as f64 / total_elements as f64) * 100.0;
65
66 println!("{}:", description);
67 println!(
68 " Missing elements: {} / {} ({:.1}%)",
69 missing_count, total_elements, missing_percentage
70 );
71
72 // Show pattern of missing data
73 print!(" Pattern (X = missing): ");
74 for i in 0..test_data.nrows() {
75 for j in 0..test_data.ncols() {
76 if missing_mask[[i, j]] {
77 print!("X ");
78 } else {
79 print!(". ");
80 }
81 }
82 if i < test_data.nrows() - 1 {
83 print!("| ");
84 }
85 }
86 println!();
87 }
88}
89
90fn demonstrate_outlier_injection() {
91 println!("Testing different outlier types on a sample dataset:");
92
93 // Create a clean dataset with known statistics
94 let mut clean_data = Array2::ones((20, 3));
95 // Add some structure
96 for i in 0..20 {
97 for j in 0..3 {
98 clean_data[[i, j]] = (i as f64 + j as f64) / 2.0;
99 }
100 }
101
102 let outlier_types = [
103 (OutlierType::Point, "Point Outliers"),
104 (OutlierType::Contextual, "Contextual Outliers"),
105 (OutlierType::Collective, "Collective Outliers"),
106 ];
107
108 for (outlier_type, description) in outlier_types {
109 let mut test_data = clean_data.clone();
110 let original_stats = calculate_basic_stats(&test_data);
111
112 let outlier_mask =
113 inject_outliers(&mut test_data, 0.2, outlier_type, 3.0, Some(42)).unwrap();
114 let corrupted_stats = calculate_basic_stats(&test_data);
115
116 let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
117
118 println!("{}:", description);
119 println!(
120 " Outliers injected: {} / {} samples",
121 outlier_count,
122 test_data.nrows()
123 );
124 println!(
125 " Mean change: {:.3} -> {:.3} (Δ={:.3})",
126 original_stats.0,
127 corrupted_stats.0,
128 corrupted_stats.0 - original_stats.0
129 );
130 println!(
131 " Std change: {:.3} -> {:.3} (Δ={:.3})",
132 original_stats.1,
133 corrupted_stats.1,
134 corrupted_stats.1 - original_stats.1
135 );
136
137 // Show which samples are outliers
138 print!(" Outlier samples: ");
139 for (i, &is_outlier) in outlier_mask.iter().enumerate() {
140 if is_outlier {
141 print!("{} ", i);
142 }
143 }
144 println!();
145 }
146}
147
148fn demonstrate_time_series_noise() {
149 println!("Testing different time series noise types:");
150
151 // Create a simple time series
152 let clean_ts = make_time_series(100, 2, true, true, 0.0, Some(42)).unwrap();
153
154 let noise_configs = [
155 vec![("gaussian", 0.2)],
156 vec![("spikes", 0.1)],
157 vec![("drift", 0.5)],
158 vec![("seasonal", 0.3)],
159 vec![("autocorrelated", 0.1)],
160 vec![("heteroscedastic", 0.2)],
161 vec![("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)], // Combined noise
162 ];
163
164 let noise_names = [
165 "Gaussian White Noise",
166 "Impulse Spikes",
167 "Linear Drift",
168 "Seasonal Pattern",
169 "Autocorrelated Noise",
170 "Heteroscedastic Noise",
171 "Combined Noise",
172 ];
173
174 for (config, name) in noise_configs.iter().zip(noise_names.iter()) {
175 let mut noisy_data = clean_ts.data.clone();
176 let original_stats = calculate_basic_stats(&noisy_data);
177
178 add_time_series_noise(&mut noisy_data, config, Some(42)).unwrap();
179 let noisy_stats = calculate_basic_stats(&noisy_data);
180
181 println!("{}:", name);
182 println!(" Mean: {:.3} -> {:.3}", original_stats.0, noisy_stats.0);
183 println!(" Std: {:.3} -> {:.3}", original_stats.1, noisy_stats.1);
184 println!(
185 " Range: [{:.3}, {:.3}] -> [{:.3}, {:.3}]",
186 original_stats.2, original_stats.3, noisy_stats.2, noisy_stats.3
187 );
188 }
189}
190
191fn demonstrate_comprehensive_corruption() {
192 println!("Testing comprehensive dataset corruption:");
193
194 // Load a real dataset
195 let iris = load_iris().unwrap();
196 println!(
197 "Original Iris dataset: {} samples, {} features",
198 iris.n_samples(),
199 iris.n_features()
200 );
201
202 let original_stats = calculate_basic_stats(&iris.data);
203 println!(
204 "Original stats - Mean: {:.3}, Std: {:.3}",
205 original_stats.0, original_stats.1
206 );
207
208 // Create different levels of corruption
209 let corruption_levels = [
210 (0.05, 0.02, "Light corruption"),
211 (0.1, 0.05, "Moderate corruption"),
212 (0.2, 0.1, "Heavy corruption"),
213 (0.3, 0.15, "Severe corruption"),
214 ];
215
216 for (missing_rate, outlier_rate, description) in corruption_levels {
217 let corrupted = make_corrupted_dataset(
218 &iris,
219 missing_rate,
220 MissingPattern::MAR, // More realistic than MCAR
221 outlier_rate,
222 OutlierType::Point,
223 2.5,
224 Some(42),
225 )
226 .unwrap();
227
228 // Calculate how much data is usable
229 let total_elements = corrupted.data.len();
230 let missing_elements = corrupted.data.iter().filter(|&&x| x.is_nan()).count();
231 let usable_percentage =
232 ((total_elements - missing_elements) as f64 / total_elements as f64) * 100.0;
233
234 println!("{}:", description);
235 println!(" Missing data: {:.1}%", missing_rate * 100.0);
236 println!(" Outliers: {:.1}%", outlier_rate * 100.0);
237 println!(" Usable data: {:.1}%", usable_percentage);
238
239 // Show metadata
240 if let Some(missing_count) = corrupted.metadata.get("missing_count") {
241 println!(" Actual missing: {} elements", missing_count);
242 }
243 if let Some(outlier_count) = corrupted.metadata.get("outlier_count") {
244 println!(" Actual outliers: {} samples", outlier_count);
245 }
246 }
247}
248
249fn demonstrate_real_world_applications() {
250 println!("Real-world application scenarios:");
251
252 println!("\n1. **Medical Data Simulation**:");
253 let medical_data = load_iris().unwrap(); // Stand-in for medical measurements
254 let _corrupted_medical = make_corrupted_dataset(
255 &medical_data,
256 0.15, // 15% missing - common in medical data
257 MissingPattern::MNAR, // High values often missing (privacy, measurement issues)
258 0.05, // 5% outliers - measurement errors
259 OutlierType::Point,
260 2.0,
261 Some(42),
262 )
263 .unwrap();
264
265 println!(" Medical dataset simulation:");
266 println!(" Missing data pattern: MNAR (high values more likely missing)");
267 println!(" Outliers: Point outliers (measurement errors)");
268 println!(" Use case: Testing imputation algorithms for clinical data");
269
270 println!("\n2. **Sensor Network Simulation**:");
271 let sensor_data = make_time_series(200, 4, true, true, 0.1, Some(42)).unwrap();
272 let mut sensor_ts = sensor_data.data.clone();
273
274 // Add realistic sensor noise
275 add_time_series_noise(
276 &mut sensor_ts,
277 &[
278 ("gaussian", 0.05), // Background noise
279 ("spikes", 0.02), // Electrical interference
280 ("drift", 0.1), // Sensor calibration drift
281 ("heteroscedastic", 0.03), // Temperature-dependent noise
282 ],
283 Some(42),
284 )
285 .unwrap();
286
287 // Add missing data (sensor failures)
288 inject_missing_data(&mut sensor_ts, 0.08, MissingPattern::Block, Some(42)).unwrap();
289
290 println!(" Sensor network simulation:");
291 println!(" Multiple noise types: gaussian + spikes + drift + heteroscedastic");
292 println!(" Missing data: Block pattern (sensor failures)");
293 println!(" Use case: Testing robust time series algorithms");
294
295 println!("\n3. **Survey Data Simulation**:");
296 let survey_data = load_iris().unwrap(); // Stand-in for survey responses
297 let _corrupted_survey = make_corrupted_dataset(
298 &survey_data,
299 0.25, // 25% missing - typical for surveys
300 MissingPattern::MAR, // Missing depends on other responses
301 0.08, // 8% outliers - data entry errors, extreme responses
302 OutlierType::Contextual,
303 1.5,
304 Some(42),
305 )
306 .unwrap();
307
308 println!(" Survey data simulation:");
309 println!(" Missing data pattern: MAR (depends on other responses)");
310 println!(" Outliers: Contextual (unusual response patterns)");
311 println!(" Use case: Testing survey analysis robustness");
312
313 println!("\n4. **Financial Data Simulation**:");
314 let mut financial_ts = make_time_series(500, 3, false, false, 0.02, Some(42))
315 .unwrap()
316 .data;
317
318 // Add financial market-specific noise
319 add_time_series_noise(
320 &mut financial_ts,
321 &[
322 ("gaussian", 0.1), // Market volatility
323 ("spikes", 0.05), // Market shocks
324 ("autocorrelated", 0.15), // Momentum effects
325 ("heteroscedastic", 0.2), // Volatility clustering
326 ],
327 Some(42),
328 )
329 .unwrap();
330
331 println!(" Financial data simulation:");
332 println!(" Noise types: volatility + shocks + momentum + clustering");
333 println!(" Use case: Testing financial models under realistic conditions");
334}