pandrs 0.1.0-beta.2

A high-performance DataFrame library for Rust, providing a pandas-like API with advanced features including SIMD optimization, parallel processing, and distributed computing capabilities.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#[cfg(feature = "optimized")]
use pandrs::column::{Column, Float64Column, Int64Column};
#[cfg(feature = "optimized")]
use pandrs::ml::anomaly::{IsolationForest, LocalOutlierFactor, OneClassSVM};
#[cfg(feature = "optimized")]
use pandrs::ml::UnsupervisedModel;
#[cfg(feature = "optimized")]
use pandrs::optimized::convert;
#[cfg(feature = "optimized")]
use pandrs::optimized::OptimizedDataFrame;
#[cfg(feature = "optimized")]
use rand::rngs::StdRng;
#[cfg(feature = "optimized")]
use rand::Rng;
#[cfg(feature = "optimized")]
use rand::SeedableRng;

/// Fallback entry point used when the `optimized` feature is disabled.
///
/// Only prints instructions for re-running the example with the feature
/// turned on; no anomaly detection is performed.
#[cfg(not(feature = "optimized"))]
fn main() {
    let usage = [
        "This example requires the 'optimized' feature flag to be enabled.",
        "Please recompile with:",
        "  cargo run --example optimized_ml_anomaly_detection_example --features \"optimized\"",
    ];

    for line in usage {
        println!("{}", line);
    }
}

/// Anomaly-detection walkthrough on synthetic 2D data.
///
/// Steps:
/// 1. Generate 950 "normal" points and 50 outliers with a fixed RNG seed.
/// 2. Run Isolation Forest, Local Outlier Factor and One-Class SVM on the
///    `x`/`y` features.
/// 3. Compare the detectors with each other and with the known labels.
///
/// # Errors
///
/// Propagates any error returned while building the DataFrames, converting
/// between DataFrame representations, or fitting the models.
///
/// # Panics
///
/// Panics if a column that is expected to exist (or to have a specific type)
/// is missing from one of the DataFrames that are inspected for display.
#[cfg(feature = "optimized")]
#[allow(clippy::result_large_err)]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Generate synthetic anomaly detection data
    println!("✅ Example of Anomaly Detection Algorithms");
    println!("==========================");
    println!("1. Generating synthetic data");

    let mut rng = StdRng::seed_from_u64(42);
    let n_samples = 1000;

    // Generate normal data: 2D points drawn uniformly from the square
    // [-3, 3) x [-3, 3) centered at the origin.
    let n_normal = 950;
    let mut x_values: Vec<f64> = Vec::with_capacity(n_samples);
    let mut y_values: Vec<f64> = Vec::with_capacity(n_samples);
    let mut true_labels: Vec<i64> = Vec::with_capacity(n_samples);

    for _ in 0..n_normal {
        x_values.push(rng.random_range(-3.0..3.0));
        y_values.push(rng.random_range(-3.0..3.0));
        true_labels.push(0); // Normal data is labeled as 0
    }

    // Generate anomalies (data far from normal data)
    let n_anomalies = n_samples - n_normal;

    for _ in 0..n_anomalies {
        // Pick one of the four corner regions, then sample a point inside it.
        // The draw order (quadrant, x, y) is kept so the seeded data set stays
        // reproducible.
        let (x_range, y_range) = match rng.random_range(0..4) {
            0 => (-10.0..-5.0, 5.0..10.0),   // Top left
            1 => (5.0..10.0, 5.0..10.0),     // Top right
            2 => (-10.0..-5.0, -10.0..-5.0), // Bottom left
            _ => (5.0..10.0, -10.0..-5.0),   // Bottom right
        };
        x_values.push(rng.random_range(x_range));
        y_values.push(rng.random_range(y_range));
        true_labels.push(1); // Anomalies are labeled as 1
    }

    // Full DataFrame (features + ground truth), used for display only.
    let mut df = OptimizedDataFrame::new();
    df.add_column(
        "x".to_string(),
        Column::Float64(Float64Column::with_name(x_values.clone(), "x")),
    )?;
    df.add_column(
        "y".to_string(),
        Column::Float64(Float64Column::with_name(y_values.clone(), "y")),
    )?;
    // `true_labels` is cloned because it is reused for the evaluation below.
    df.add_column(
        "true_anomaly".to_string(),
        Column::Int64(Int64Column::with_name(true_labels.clone(), "true_anomaly")),
    )?;

    // Feature-only DataFrame handed to the detectors. The ground-truth column
    // is deliberately left out so the unsupervised models cannot see the
    // labels they are later evaluated against.
    let mut features = OptimizedDataFrame::new();
    features.add_column(
        "x".to_string(),
        Column::Float64(Float64Column::with_name(x_values, "x")),
    )?;
    features.add_column(
        "y".to_string(),
        Column::Float64(Float64Column::with_name(y_values, "y")),
    )?;

    println!(
        "Data generation complete: {} normal samples, {} anomaly samples",
        n_normal, n_anomalies
    );
    println!("First few rows of the DataFrame:");
    // Display the first 5 rows instead of using df.head()
    println!("DataFrame (first 5 rows):");
    for i in 0..std::cmp::min(5, df.row_count()) {
        if let (Ok(Some(x)), Ok(Some(y)), Ok(Some(anomaly))) = (
            df.column("x").unwrap().as_float64().unwrap().get(i),
            df.column("y").unwrap().as_float64().unwrap().get(i),
            df.column("true_anomaly")
                .unwrap()
                .as_int64()
                .unwrap()
                .get(i),
        ) {
            println!("Row {}: x={:.4}, y={:.4}, anomaly={}", i, x, y, anomaly);
        }
    }

    // Counts the samples a detector flagged as anomalous (label == 1).
    let count_anomalies = |labels: &[i64]| labels.iter().filter(|&&label| label == 1).count();

    // Anomaly detection using IsolationForest
    println!("\n2. Anomaly detection using Isolation Forest");
    let mut isolation_forest = IsolationForest::new()
        .n_estimators(100) // Number of trees
        .contamination(0.05) // Contamination rate 5%
        .random_seed(42); // Random seed

    // Convert to standard DataFrame for the algorithm
    let reg_df = convert::standard_dataframe(&features)?;
    let if_result = isolation_forest.fit_transform(&reg_df)?;
    // Convert back to OptimizedDataFrame for consistent handling
    let if_result_opt = convert::optimize_dataframe(&if_result)?;
    let if_anomalies = count_anomalies(isolation_forest.labels());

    println!("Isolation Forest detection complete");
    println!("Number of detected anomalies: {}", if_anomalies);
    println!("First few rows of the result:");
    // Display the first 5 rows instead of using if_result.head()
    println!("Isolation Forest result (first 5 rows):");
    for i in 0..std::cmp::min(5, if_result_opt.row_count()) {
        if let (Ok(Some(x)), Ok(Some(y)), Ok(Some(anomaly_score)), Ok(Some(anomaly))) = (
            if_result_opt
                .column("x")
                .unwrap()
                .as_float64()
                .unwrap()
                .get(i),
            if_result_opt
                .column("y")
                .unwrap()
                .as_float64()
                .unwrap()
                .get(i),
            if_result_opt
                .column("anomaly_score")
                .unwrap()
                .as_float64()
                .unwrap()
                .get(i),
            if_result_opt
                .column("anomaly")
                .unwrap()
                .as_int64()
                .unwrap()
                .get(i),
        ) {
            println!(
                "Row {}: x={:.4}, y={:.4}, score={:.4}, anomaly={}",
                i, x, y, anomaly_score, anomaly
            );
        }
    }

    // Anomaly detection using LOF
    println!("\n3. Anomaly detection using Local Outlier Factor");
    let mut lof = LocalOutlierFactor::new(20) // Number of neighbors
        .contamination(0.05); // Contamination rate 5%

    // Only `fit()` is used for LocalOutlierFactor here, so there is no result
    // DataFrame carrying `lof_score` / `anomaly` columns. The input frame is
    // only borrowed by `fit()`, so looking those columns up on a copy of it
    // would panic. The rows are therefore shown from the original data paired
    // with the labels stored on the fitted model.
    // NOTE(review): if the model exposes per-sample LOF scores, print them
    // here as well — confirm against the `ml::anomaly` API.
    lof.fit(&reg_df)?;
    let lof_anomalies = count_anomalies(lof.labels());

    println!("Local Outlier Factor detection complete");
    println!("Number of detected anomalies: {}", lof_anomalies);
    println!("First few rows of the result:");
    println!("Local Outlier Factor result (first 5 rows):");
    for (i, label) in lof.labels().iter().enumerate().take(5) {
        if let (Ok(Some(x)), Ok(Some(y))) = (
            df.column("x").unwrap().as_float64().unwrap().get(i),
            df.column("y").unwrap().as_float64().unwrap().get(i),
        ) {
            println!("Row {}: x={:.4}, y={:.4}, anomaly={}", i, x, y, label);
        }
    }

    // Anomaly detection using One-Class SVM
    println!("\n4. Anomaly detection using One-Class SVM");
    let mut one_class_svm = OneClassSVM::new()
        .nu(0.05) // nu parameter
        .gamma(0.1); // gamma parameter

    let svm_result = one_class_svm.fit_transform(&reg_df)?;
    // Convert back to OptimizedDataFrame for consistent handling
    let svm_result_opt = convert::optimize_dataframe(&svm_result)?;
    let svm_anomalies = count_anomalies(one_class_svm.labels());

    println!("One-Class SVM detection complete");
    println!("Number of detected anomalies: {}", svm_anomalies);
    println!("First few rows of the result:");
    // Display the first 5 rows instead of using svm_result.head()
    println!("One-Class SVM result (first 5 rows):");
    for i in 0..std::cmp::min(5, svm_result_opt.row_count()) {
        if let (Ok(Some(x)), Ok(Some(y)), Ok(Some(decision_value)), Ok(Some(anomaly))) = (
            svm_result_opt
                .column("x")
                .unwrap()
                .as_float64()
                .unwrap()
                .get(i),
            svm_result_opt
                .column("y")
                .unwrap()
                .as_float64()
                .unwrap()
                .get(i),
            svm_result_opt
                .column("decision_value")
                .unwrap()
                .as_float64()
                .unwrap()
                .get(i),
            svm_result_opt
                .column("anomaly")
                .unwrap()
                .as_int64()
                .unwrap()
                .get(i),
        ) {
            println!(
                "Row {}: x={:.4}, y={:.4}, decision={:.4}, anomaly={}",
                i, x, y, decision_value, anomaly
            );
        }
    }

    // Compare anomaly flags of each algorithm
    println!("\n5. Comparing detection results");

    println!("Isolation Forest: detected {} anomalies", if_anomalies);
    println!("Local Outlier Factor: detected {} anomalies", lof_anomalies);
    println!("One-Class SVM: detected {} anomalies", svm_anomalies);

    // Check agreement between algorithms
    let mut all_agree = 0;
    let mut if_lof_agree = 0;
    let mut if_svm_agree = 0;
    let mut lof_svm_agree = 0;
    // Separate counter for the printed consensus anomalies. `all_agree` also
    // counts rows that every detector considers normal, so it cannot be used
    // to limit the output to the first five anomalies.
    let mut consensus_anomalies_shown = 0;

    let label_triples = isolation_forest
        .labels()
        .iter()
        .zip(lof.labels().iter())
        .zip(one_class_svm.labels().iter());

    for (i, ((&if_label, &lof_label), &svm_label)) in label_triples.enumerate() {
        if if_label == lof_label && lof_label == svm_label {
            all_agree += 1;

            // Display the first few "all algorithms agree" anomalies
            if if_label == 1 && consensus_anomalies_shown < 5 {
                if let (Ok(Some(x)), Ok(Some(y))) = (
                    df.column("x").unwrap().as_float64().unwrap().get(i),
                    df.column("y").unwrap().as_float64().unwrap().get(i),
                ) {
                    println!(
                        "Sample {} is detected as an anomaly by all algorithms: x={:.2}, y={:.2}",
                        i, x, y
                    );
                    consensus_anomalies_shown += 1;
                }
            }
        }

        if if_label == lof_label {
            if_lof_agree += 1;
        }
        if if_label == svm_label {
            if_svm_agree += 1;
        }
        if lof_label == svm_label {
            lof_svm_agree += 1;
        }
    }

    println!(
        "Agreement rate of all algorithms: {:.1}%",
        100.0 * all_agree as f64 / n_samples as f64
    );
    println!(
        "Agreement rate of Isolation Forest and LOF: {:.1}%",
        100.0 * if_lof_agree as f64 / n_samples as f64
    );
    println!(
        "Agreement rate of Isolation Forest and SVM: {:.1}%",
        100.0 * if_svm_agree as f64 / n_samples as f64
    );
    println!(
        "Agreement rate of LOF and SVM: {:.1}%",
        100.0 * lof_svm_agree as f64 / n_samples as f64
    );

    // Compare with true anomalies
    println!("\n6. Comparison with true anomalies");

    // Prints precision, recall, F1 and accuracy of `labels` against `truth`.
    // Predictions are matched as 1 = anomaly and -1 = normal, ground truth as
    // 1 = anomaly and 0 = normal; any other combination is ignored.
    // NOTE(review): assumes the detectors emit -1 for normal samples — confirm
    // against the `ml::anomaly` documentation.
    let calc_metrics = |algorithm_name: &str, labels: &[i64], truth: &[i64]| {
        let mut tp = 0; // True positive
        let mut fp = 0; // False positive
        let mut tn = 0; // True negative
        let mut fn_count = 0; // False negative

        // `zip` stops at the shorter slice, so a length mismatch cannot cause
        // an out-of-bounds panic.
        for (&pred, &true_val) in labels.iter().zip(truth) {
            match (pred, true_val) {
                (1, 1) => tp += 1,        // True positive
                (1, 0) => fp += 1,        // False positive
                (-1, 0) => tn += 1,       // True negative
                (-1, 1) => fn_count += 1, // False negative
                _ => {}
            }
        }

        // Calculate precision, recall, and F1 score
        let precision = if tp + fp > 0 {
            tp as f64 / (tp + fp) as f64
        } else {
            0.0
        };
        let recall = if tp + fn_count > 0 {
            tp as f64 / (tp + fn_count) as f64
        } else {
            0.0
        };
        let f1 = if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        };
        // Guard against 0/0 (NaN) when no labels are available.
        let accuracy = if labels.is_empty() {
            0.0
        } else {
            (tp + tn) as f64 / labels.len() as f64
        };

        println!(
            "{}: Precision={:.1}%, Recall={:.1}%, F1={:.3}, Accuracy={:.1}%",
            algorithm_name,
            precision * 100.0,
            recall * 100.0,
            f1,
            accuracy * 100.0
        );
    };

    // Calculate evaluation metrics for each algorithm. The in-memory ground
    // truth is used directly so every prediction stays aligned with its label.
    calc_metrics("Isolation Forest", isolation_forest.labels(), &true_labels);
    calc_metrics("Local Outlier Factor", lof.labels(), &true_labels);
    calc_metrics("One-Class SVM", one_class_svm.labels(), &true_labels);

    println!("\n==========================");
    println!("✅ Anomaly detection example completed successfully");

    Ok(())
}