scirs2-spatial 0.4.1

Spatial algorithms module for SciRS2 (scirs2-spatial)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
//! SIMD Performance Optimization Example
//!
//! This example demonstrates the performance benefits of SIMD-accelerated
//! distance calculations and parallel spatial operations. It compares
//! scalar vs SIMD implementations across different data sizes and metrics.

use ndarray::Array2;
use rand::Rng;
use scirs2_spatial::simd_distance::{
    bench, parallel_pdist, simd_euclidean_distance, simd_euclidean_distance_batch, simd_knn_search,
};
use std::time::Instant;

#[allow(dead_code)]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("=== SIMD Performance Optimization Example ===\n");

    // Example 1: System SIMD capabilities
    println!("1. System SIMD Capabilities");
    system_capabilities_example();
    println!();

    // Example 2: Single distance computation performance
    println!("2. Single Distance Computation Performance");
    single_distance_performance_example()?;
    println!();

    // Example 3: Batch distance computation
    println!("3. Batch Distance Computation Performance");
    batch_distance_performance_example()?;
    println!();

    // Example 4: Distance matrix computation
    println!("4. Distance Matrix Computation Performance");
    distance_matrix_performance_example()?;
    println!();

    // Example 5: K-nearest neighbors search
    println!("5. K-Nearest Neighbors Search Performance");
    knn_performance_example()?;
    println!();

    // Example 6: Scalability analysis
    println!("6. Scalability Analysis");
    scalability_analysis_example()?;
    println!();

    // Example 7: Memory efficiency comparison
    println!("7. Memory Efficiency and Cache Performance");
    memory_efficiency_example()?;

    Ok(())
}

#[allow(dead_code)]
fn system_capabilities_example() {
    println!("Detecting available SIMD instruction sets:");
    bench::report_simd_features();

    // Report the vector width (f64 lanes) the SIMD kernels use on this target.
    #[cfg(target_arch = "x86_64")]
    {
        let lanes = 4;
        println!("\nOptimal SIMD width: {} elements (256-bit AVX)", lanes);
    }

    #[cfg(target_arch = "aarch64")]
    {
        let lanes = 2;
        println!("\nOptimal SIMD width: {} elements (128-bit NEON)", lanes);
    }

    // Any other architecture falls back to plain scalar code paths.
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    println!("\nUsing scalar fallback implementations");
}

/// Compare scalar vs SIMD single-pair Euclidean distance across vector sizes.
///
/// Times `iterations` repeated calls for each dimensionality and prints a
/// table of scalar time, SIMD time, and the resulting speedup.
#[allow(dead_code)]
fn single_distance_performance_example() -> Result<(), Box<dyn std::error::Error>> {
    println!("Comparing SIMD vs scalar for single distance computations:");

    // Test different vector sizes
    let dimensions = [3, 10, 50, 100, 500, 1000];
    let iterations = 100_000;

    println!(
        "{:>8} {:>12} {:>12} {:>12}",
        "Dim", "Scalar (ms)", "SIMD (ms)", "Speedup"
    );
    println!("{}", "-".repeat(48));

    for &dim in &dimensions {
        let mut rng = rand::rng();

        // Generate random vectors
        let a: Vec<f64> = (0..dim).map(|_| rng.random_range(-10.0..10.0)).collect();
        let b: Vec<f64> = (0..dim).map(|_| rng.random_range(-10.0..10.0)).collect();

        // Scalar timing. `black_box` keeps the optimizer from discarding the
        // unused result (and with it the entire timed loop).
        let start = Instant::now();
        for _ in 0..iterations {
            std::hint::black_box(scirs2_spatial::distance::euclidean(&a, &b));
        }
        // Measure in fractional milliseconds: `as_millis()` truncates to an
        // integer, which for fast runs yields 0 and a meaningless 0/0 speedup.
        let scalar_time = start.elapsed().as_secs_f64() * 1e3;

        // SIMD timing
        let start = Instant::now();
        for _ in 0..iterations {
            std::hint::black_box(simd_euclidean_distance(&a, &b)?);
        }
        let simd_time = start.elapsed().as_secs_f64() * 1e3;

        let speedup = scalar_time / simd_time;

        println!("{dim:>8} {scalar_time:>12.2} {simd_time:>12.2} {speedup:>12.2}x");
    }

    Ok(())
}

/// Batch distance computation: scalar vs SIMD over whole point sets.
#[allow(dead_code)]
fn batch_distance_performance_example() -> Result<(), Box<dyn std::error::Error>> {
    println!("Batch distance computation performance:");

    let n_points = 10_000;
    let dimensions = [2, 3, 5, 10, 20];

    println!(
        "{:>8} {:>12} {:>12} {:>12}",
        "Dim", "Scalar (ms)", "SIMD (ms)", "Speedup"
    );
    println!("{}", "-".repeat(48));

    for &dim in &dimensions {
        // Two independent random point sets of identical shape.
        let set_a = generate_random_points(n_points, dim);
        let set_b = generate_random_points(n_points, dim);

        // A single iteration suffices: each call already covers n_points pairs.
        let report = bench::benchmark_distance_computation(&set_a.view(), &set_b.view(), 1);

        // Times come back in seconds; convert to milliseconds for the table.
        println!(
            "{:>8} {:>12.1} {:>12.1} {:>12.2}x",
            dim,
            report.scalar_time * 1000.0,
            report.simd_f64_time * 1000.0,
            report.simd_f64_speedup
        );
    }

    Ok(())
}

/// Compare scalar `pdist` with the parallel SIMD `pdist` for growing point
/// counts, printing the time for each and the observed speedup.
#[allow(dead_code)]
fn distance_matrix_performance_example() -> Result<(), Box<dyn std::error::Error>> {
    println!("Distance matrix computation performance comparison:");

    let point_counts = [100, 200, 500, 1000];
    let dim = 3;

    println!(
        "{:>8} {:>12} {:>12} {:>12}",
        "Points", "Scalar (ms)", "Parallel (ms)", "Speedup"
    );
    println!("{}", "-".repeat(48));

    for &n_points in &point_counts {
        let points = generate_random_points(n_points, dim);

        // Scalar pdist
        let start = Instant::now();
        let _scalar_dists =
            scirs2_spatial::distance::pdist(&points, scirs2_spatial::distance::euclidean);
        // Fractional ms: integer `as_millis()` truncates sub-millisecond runs
        // to 0, which made the speedup below divide by zero (inf/NaN).
        let scalar_time = start.elapsed().as_secs_f64() * 1e3;

        // Parallel SIMD pdist
        let start = Instant::now();
        let _parallel_dists = parallel_pdist(&points.view(), "euclidean")?;
        let parallel_time = start.elapsed().as_secs_f64() * 1e3;

        let speedup = scalar_time / parallel_time;

        println!("{n_points:>8} {scalar_time:>12.1} {parallel_time:>12.1} {speedup:>12.2}x");
    }

    Ok(())
}

#[allow(dead_code)]
fn knn_performance_example() -> Result<(), Box<dyn std::error::Error>> {
    println!("K-nearest neighbors search performance:");

    let n_data = 5000;
    let n_queries = 1000;
    let k_values = [1, 5, 10, 20];
    let dim = 5;

    let data_points = generate_random_points(n_data, dim);
    let query_points = generate_random_points(n_queries, dim);

    println!(
        "{:>6} {:>15} {:>12}",
        "k", "Time (ms)", "Throughput (queries/s)"
    );
    println!("{}", "-".repeat(35));

    for &k in &k_values {
        let start = Instant::now();
        let (_indicesdistances) =
            simd_knn_search(&query_points.view(), &data_points.view(), k, "euclidean")?;
        let elapsed = start.elapsed().as_millis();

        let throughput = (n_queries * 1000) / elapsed as usize;

        println!("{k:>6} {elapsed:>15} {throughput:>12}");
    }

    // Compare different metrics
    println!("\nPerformance by distance metric (k=5):");
    let metrics = ["euclidean", "manhattan", "sqeuclidean", "chebyshev"];

    println!("{:>12} {:>12} {:>12}", "Metric", "Time (ms)", "Rel. Speed");
    println!("{}", "-".repeat(38));

    let mut base_time = 0;
    for (i, &metric) in metrics.iter().enumerate() {
        let start = Instant::now();
        let (_indicesdistances) =
            simd_knn_search(&query_points.view(), &data_points.view(), 5, metric)?;
        let elapsed = start.elapsed().as_millis();

        if i == 0 {
            base_time = elapsed;
        }

        let relative_speed = base_time as f64 / elapsed as f64;

        println!("{metric:>12} {elapsed:>12} {relative_speed:>12.2}x");
    }

    Ok(())
}

/// Track runtime, per-point cost, and result-buffer size as the problem grows.
#[allow(dead_code)]
fn scalability_analysis_example() -> Result<(), Box<dyn std::error::Error>> {
    println!("Scalability analysis with increasing problem sizes:");

    let base_points = [1000, 2000, 5000, 10000];
    let dim = 4;

    println!(
        "{:>8} {:>12} {:>12} {:>15}",
        "Points", "Time (ms)", "Time/Point (μs)", "Memory (MB)"
    );
    println!("{}", "-".repeat(50));

    for n_points in base_points {
        let points = generate_random_points(n_points, dim);

        let start = Instant::now();
        let distances = parallel_pdist(&points.view(), "euclidean")?;
        let elapsed = start.elapsed().as_millis();

        // Per-point cost in microseconds, plus the size of the condensed
        // distance vector that was actually allocated.
        let time_per_point = elapsed as f64 * 1000.0 / n_points as f64;
        let memory_mb = (std::mem::size_of::<f64>() * distances.len()) as f64 / (1024.0 * 1024.0);

        println!("{n_points:>8} {elapsed:>12} {time_per_point:>12.1} {memory_mb:>15.2}");
    }

    // Parallel efficiency analysis
    println!("\nParallel efficiency analysis:");
    test_parallel_efficiency()?;

    Ok(())
}

/// Print a speedup/efficiency table for a fixed workload.
#[allow(dead_code)]
fn test_parallel_efficiency() -> Result<(), Box<dyn std::error::Error>> {
    let n_points = 2000;
    let dim = 5;
    let points = generate_random_points(n_points, dim);

    // Simulate different thread counts by controlling Rayon's thread pool
    println!(
        "{:>8} {:>12} {:>12} {:>12}",
        "Threads", "Time (ms)", "Speedup", "Efficiency"
    );
    println!("{}", "-".repeat(48));

    // Note: This is a simplified demonstration
    // In practice, you'd need to control Rayon's global thread pool
    let thread_counts = [1, 2, 4, 8];
    let mut reference_ms = 0;

    for (idx, &thread_count) in thread_counts.iter().enumerate() {
        // For demonstration, we'll use the same parallel implementation
        // In a real scenario, you'd configure the thread pool
        let timer = Instant::now();
        let _distances = parallel_pdist(&points.view(), "euclidean")?;
        let run_ms = timer.elapsed().as_millis();

        // The first run (1 "thread") is the baseline for speedup/efficiency.
        if idx == 0 {
            reference_ms = run_ms;
        }

        let speedup = reference_ms as f64 / run_ms as f64;
        let efficiency = speedup / thread_count as f64;

        println!(
            "{:>8} {:>12} {:>12.2}x {:>12.1}%",
            thread_count,
            run_ms,
            speedup,
            efficiency * 100.0
        );
    }

    Ok(())
}

/// Contrast one batched (cache-friendly) pass with per-row copied access.
#[allow(dead_code)]
fn memory_efficiency_example() -> Result<(), Box<dyn std::error::Error>> {
    println!("Memory efficiency and cache performance analysis:");

    // Test different access patterns
    let n_points = 1000;
    let dimensions = [2, 5, 10, 20, 50];

    println!(
        "{:>8} {:>15} {:>15} {:>15}",
        "Dim", "Cache-friendly", "Random Access", "Improvement"
    );
    println!("{}", "-".repeat(60));

    for &dim in &dimensions {
        let points1 = generate_random_points(n_points, dim);
        let points2 = generate_random_points(n_points, dim);

        // Sequential access (cache-friendly): one batched call over both sets.
        let timer = Instant::now();
        let _distances = simd_euclidean_distance_batch(&points1.view(), &points2.view())?;
        let sequential_time = timer.elapsed().as_millis();

        // Simulated scattered access: copy each row pair out and compute the
        // distances one at a time.
        let timer = Instant::now();
        let mut _sum = 0.0;
        for (row_a, row_b) in points1.outer_iter().zip(points2.outer_iter()) {
            _sum += simd_euclidean_distance(&row_a.to_vec(), &row_b.to_vec())?;
        }
        let random_time = timer.elapsed().as_millis();

        let improvement = random_time as f64 / sequential_time as f64;

        println!("{dim:>8} {sequential_time:>15} {random_time:>15} {improvement:>15.2}x");
    }

    // Memory allocation analysis
    println!("\nMemory allocation patterns:");
    memory_allocation_analysis()?;

    Ok(())
}

/// Print an estimated memory budget for each major buffer in the pipeline.
#[allow(dead_code)]
fn memory_allocation_analysis() -> Result<(), Box<dyn std::error::Error>> {
    let n_points = 2000;
    let dim = 10;

    println!("Operation                     Memory (MB)  Allocations");
    println!("{}", "-".repeat(50));

    const MB: f64 = 1024.0 * 1024.0;
    let f64_size = std::mem::size_of::<f64>();

    // Point storage: one contiguous n_points x dim buffer.
    let _points = generate_random_points(n_points, dim);
    let points_memory = (n_points * dim * f64_size) as f64 / MB;
    println!("{:<30} {:>10.2} {:>10}", "Point storage", points_memory, 1);

    // Condensed distance matrix holds C(n, 2) entries.
    let pair_count = n_points * (n_points - 1) / 2;
    let distance_memory = (pair_count * f64_size) as f64 / MB;
    println!(
        "{:<30} {:>10.2} {:>10}",
        "Distance matrix", distance_memory, 1
    );

    // KNN results: an index array plus a distance array, n_points x k each.
    let k = 10;
    let knn_memory = (n_points * k * 2 * f64_size) as f64 / MB;
    println!("{:<30} {:>10.2} {:>10}", "KNN results", knn_memory, 2);

    // Peak = points plus the larger of the two result buffers.
    let total_memory = points_memory + distance_memory.max(knn_memory);
    println!("{:<30} {:>10.2} {:>10}", "Peak usage", total_memory, 3);

    Ok(())
}

/// Generate an `n_points x dim` matrix of uniform random coordinates in
/// [-10.0, 10.0), used as synthetic input for the benchmarks above.
#[allow(dead_code)]
fn generate_random_points(n_points: usize, dim: usize) -> Array2<f64> {
    let mut rng = rand::rng();
    // Fixes two compile errors in the original: the parameter was named
    // `_npoints` while the body referenced `_n_points`, and the method was
    // misspelled `fromshape_fn` (the ndarray constructor is `from_shape_fn`).
    Array2::from_shape_fn((n_points, dim), |_| rng.random_range(-10.0..10.0))
}

/// Demonstrate different optimization strategies
#[allow(dead_code)]
fn optimization_strategies_demo() -> Result<(), Box<dyn std::error::Error>> {
    println!("\nOptimization strategies comparison:");

    let n_points = 1000;
    let dim = 8;
    let points = generate_random_points(n_points, dim);

    // Strategy 1: Naive double loop over all unordered pairs, scalar metric.
    // The per-pair row copies are deliberate — they are part of the naive
    // cost being measured.
    let timer = Instant::now();
    let mut naive_dists = Vec::new();
    for i in 0..n_points {
        for j in (i + 1)..n_points {
            let row_i = points.row(i).to_vec();
            let row_j = points.row(j).to_vec();
            naive_dists.push(scirs2_spatial::distance::euclidean(&row_i, &row_j));
        }
    }
    let naive_time = timer.elapsed().as_millis();

    // Strategy 2: Same pair loop, but with the SIMD distance kernel.
    let timer = Instant::now();
    let mut simd_dists = Vec::new();
    for i in 0..n_points {
        for j in (i + 1)..n_points {
            let row_i = points.row(i).to_vec();
            let row_j = points.row(j).to_vec();
            simd_dists.push(simd_euclidean_distance(&row_i, &row_j)?);
        }
    }
    let simd_time = timer.elapsed().as_millis();

    // Strategy 3: Parallel + SIMD via the library's pdist implementation.
    let timer = Instant::now();
    let _parallel_dists = parallel_pdist(&points.view(), "euclidean")?;
    let parallel_time = timer.elapsed().as_millis();

    println!("Strategy                      Time (ms)    Speedup");
    println!("{}", "-".repeat(45));
    println!("{:<30} {:>8} {:>8.1}x", "Naive scalar", naive_time, 1.0);
    println!(
        "{:<30} {:>8} {:>8.1}x",
        "SIMD sequential",
        simd_time,
        naive_time as f64 / simd_time as f64
    );
    println!(
        "{:<30} {:>8} {:>8.1}x",
        "SIMD parallel",
        parallel_time,
        naive_time as f64 / parallel_time as f64
    );

    Ok(())
}

/// Benchmark I/O intensive operations
#[allow(dead_code)]
fn benchmark_io_intensive() -> Result<(), Box<dyn std::error::Error>> {
    println!("\nI/O intensive operations benchmark:");

    let sizes = [100, 500, 1000, 2000];
    let dim = 6;

    println!(
        "{:>8} {:>12} {:>15} {:>15}",
        "Size", "Read (ms)", "Compute (ms)", "Write (ms)"
    );
    println!("{}", "-".repeat(55));

    for size in sizes {
        // "Read" phase: generating the points stands in for loading data.
        let timer = Instant::now();
        let points = generate_random_points(size, dim);
        let read_time = timer.elapsed().as_millis();

        // "Compute" phase: the actual distance-matrix work.
        let timer = Instant::now();
        let distances = parallel_pdist(&points.view(), "euclidean")?;
        let compute_time = timer.elapsed().as_millis();

        // "Write" phase: summing the result stands in for serializing it.
        let timer = Instant::now();
        let _sum: f64 = distances.sum();
        let write_time = timer.elapsed().as_millis();

        println!("{size:>8} {read_time:>12} {compute_time:>15} {write_time:>15}");
    }

    Ok(())
}

/// Memory bandwidth utilization analysis
#[allow(dead_code)]
fn memory_bandwidth_analysis() -> Result<(), Box<dyn std::error::Error>> {
    println!("\nMemory bandwidth utilization:");

    let n_points = 5000;
    let dim = 10;
    let points = generate_random_points(n_points, dim);

    // Time one full distance-matrix pass over the data set.
    let timer = Instant::now();
    let _distances = parallel_pdist(&points.view(), "euclidean")?;
    let elapsed = timer.elapsed().as_secs_f64();

    // The estimate counts input bytes only; intermediate and output traffic
    // are not included, so this is a lower bound on bytes moved.
    let data_size = std::mem::size_of::<f64>() * n_points * dim;
    let bandwidth_gb_s = data_size as f64 / (elapsed * 1e9);

    println!(
        "Data processed: {:.2} MB",
        data_size as f64 / (1024.0 * 1024.0)
    );
    println!("Time elapsed: {elapsed:.3} seconds");
    println!("Effective bandwidth: {bandwidth_gb_s:.2} GB/s");

    // Compare with theoretical peak
    #[cfg(target_arch = "x86_64")]
    println!("Theoretical peak (DDR4-3200): ~25.6 GB/s");

    Ok(())
}