1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
// SPDX-FileCopyrightText: 2025-2026 Carlson Büth <code@cbueth.de>
//
// SPDX-License-Identifier: MIT OR Apache-2.0
// GPU-accelerated implementation of kernel entropy calculation
// This module is only included when the `gpu` feature flag is enabled
use crate::estimators::approaches::kernel::KernelEntropy;
use bytemuck::{Pod, Zeroable};
use futures_intrusive::channel::shared::oneshot_channel;
use ndarray::Array1;
use pollster::block_on;
use wgpu::util::DeviceExt;
// Define a struct for the point data that can be sent to the GPU
//
// `#[repr(C)]` pins the field layout so the byte image matches the WGSL
// shader's storage-buffer layout, and `Pod`/`Zeroable` allow the buffer
// contents to be produced with `bytemuck::cast_slice` without copying
// field-by-field. The fixed 32-slot array is the hard dimension cap of the
// GPU path; callers check `K <= 32` before using this type. Unused trailing
// slots are left at 0.0 and ignored by the shader (it reads `dim_count`
// dimensions from the config).
#[repr(C)]
#[derive(Copy, Clone, Pod, Zeroable)]
struct GpuPoint {
values: [f32; 32], // Support up to 32 dimensions
// Zero-length array contributes no bytes; kept to make the "no padding
// required" layout decision explicit at the definition site.
_padding: [f32; 0], // No padding needed
}
// Define a struct for the precision matrix that can be sent to the GPU (for Gaussian kernel)
//
// The matrix is stored row-major with a fixed row stride of 32 (i.e. element
// (r, c) lives at index `r * 32 + c`), matching how `run_gaussian_gpu_calculation`
// fills it, so the shader can index it without knowing the true dimension at
// compile time. Only the leading `dim_count` x `dim_count` sub-block is
// meaningful; the rest stays zero.
#[repr(C)]
#[derive(Copy, Clone, Pod, Zeroable)]
struct GpuPrecisionMatrix {
values: [f32; 1024], // Support up to 32x32 dimensions
dim_count: u32, // Actual number of dimensions
// Pads the struct to a 16-byte multiple as required for GPU buffer layouts.
_padding: [u32; 3], // Padding to ensure 16-byte alignment
}
// Define a struct for the bandwidth that can be sent to the GPU (for Box kernel)
//
// The box kernel uses a single isotropic bandwidth applied to every dimension,
// so one f32 suffices (unlike the Gaussian path, which ships a full precision
// matrix).
#[repr(C)]
#[derive(Copy, Clone, Pod, Zeroable)]
struct GpuBandwidth {
value: f32, // Single bandwidth value for all dimensions
dim_count: u32, // Actual number of dimensions
// Pads the struct to a 16-byte multiple as required for GPU buffer layouts.
_padding: [u32; 2], // Padding to ensure 16-byte alignment
}
// Define a struct for the configuration parameters
//
// Uploaded as a uniform buffer (binding 2 in both pipelines). Already
// 16 bytes of naturally aligned fields, so no explicit padding is needed.
#[repr(C)]
#[derive(Copy, Clone, Pod, Zeroable)]
struct GpuConfig {
point_count: u32,
dim_count: u32,
// Denominator of the KDE estimate; precomputed on the CPU in f64 and
// truncated to f32 for the shader.
normalization: f32,
// Squared cutoff distance for the Gaussian kernel; 0.0 (unused) for the
// box kernel.
adaptive_radius: f32,
}
impl<const K: usize> KernelEntropy<K> {
/// Computes local probability density values using a Gaussian kernel with GPU acceleration
///
/// Delegates the heavy pairwise computation to [`Self::run_gaussian_gpu_calculation`]
/// and converts its local entropy output back into densities. Falls back to the
/// CPU implementation when the dimensionality exceeds the GPU limit (32) or the
/// GPU calculation fails for any reason.
pub fn gaussian_kernel_density_gpu(&self) -> Array1<f64> {
    // The GPU path only handles up to 32 dimensions.
    if K > 32 {
        return self.gaussian_kernel_density_cpu();
    }
    // The GPU kernel produces local entropy values -ln(density); invert with
    // density = exp(-local_entropy). Any GPU error falls back to the CPU path.
    self.run_gaussian_gpu_calculation()
        .map(|local_h| local_h.mapv(|h| (-h).exp()))
        .unwrap_or_else(|_| self.gaussian_kernel_density_cpu())
}
/// Computes local probability density values using a box kernel with GPU acceleration
///
/// Delegates the heavy pairwise computation to [`Self::run_box_gpu_calculation`]
/// and converts its local entropy output back into densities. Falls back to the
/// CPU implementation when the dimensionality exceeds the GPU limit (32) or the
/// GPU calculation fails for any reason.
pub fn box_kernel_density_gpu(&self) -> Array1<f64> {
    // The GPU path only handles up to 32 dimensions.
    if K > 32 {
        return self.box_kernel_density_cpu();
    }
    // The GPU kernel produces local entropy values -ln(density); invert with
    // density = exp(-local_entropy). Any GPU error falls back to the CPU path.
    self.run_box_gpu_calculation()
        .map(|local_h| local_h.mapv(|h| (-h).exp()))
        .unwrap_or_else(|_| self.box_kernel_density_cpu())
}
/// Computes local entropy values using a Gaussian kernel with GPU acceleration
///
/// This implementation uses the GPU via wgpu to accelerate the calculation of
/// pairwise distances and Gaussian kernel values, which can provide significant
/// performance improvements for large datasets and high-dimensional data.
///
/// # Implementation Details
///
/// 1. The data points and scale factors are transferred to the GPU
/// 2. A compute shader calculates the Gaussian kernel contributions for all points in parallel
/// 3. The results are transferred back to the CPU
/// 4. The final entropy values are calculated by applying logarithm and dimension-dependent normalization
///
/// # Performance Characteristics
///
/// The GPU implementation provides dramatic speedups compared to the CPU implementation:
/// - For 1000 data points: ~4-17x faster, with higher speedups for higher dimensions
/// - For 5000 data points: ~89-131x faster, with significant gains even for low dimensions
/// - For 10000 data points: ~87-337x faster, with the most dramatic improvements for lower dimensions
///
/// # Adaptive Radius
///
/// The GPU implementation uses an adaptive squared cutoff radius scaled by the
/// largest eigenvalue of the scaled covariance, chosen for better parity with
/// full-sum estimators such as Scipy's:
/// - Datasets with more than 5000 points: 6σ radius
/// - Smaller datasets: a wider 8σ radius
///
/// # Fallback Behavior
///
/// This method automatically falls back to the CPU implementation in the following cases:
/// - If the dataset has fewer than 500 points (GPU overhead outweighs benefits)
/// - If the dimensionality exceeds 32 (current GPU implementation limitation)
/// - If any step of the GPU calculation fails (ensures robustness)
pub fn gaussian_kernel_local_values_gpu(&self) -> Array1<f64> {
    // Check if dimensions are within supported range
    if K > 32 {
        // Diagnostics go to stderr so they don't pollute program output.
        eprintln!(
            "GPU implementation only supports up to 32 dimensions, falling back to CPU implementation"
        );
        return self.gaussian_kernel_local_values();
    }
    // Check if we have enough points to make GPU acceleration worthwhile
    // Based on benchmark analysis, GPU is beneficial for Gaussian kernel when dataset size >= 500
    if self.points.len() < 500 {
        return self.gaussian_kernel_local_values();
    }
    // Try to run the GPU implementation, fall back to CPU if it fails
    match self.run_gaussian_gpu_calculation() {
        Ok(result) => result,
        Err(e) => {
            eprintln!("GPU calculation failed: {e}, falling back to CPU implementation");
            self.gaussian_kernel_local_values()
        }
    }
}
/// Computes local entropy values using a box kernel with GPU acceleration
///
/// This implementation uses the GPU via wgpu to accelerate the calculation of
/// pairwise distances and neighbor counting, which can provide significant
/// performance improvements for large datasets and high-dimensional data.
///
/// # Implementation Details
///
/// 1. The data points and bandwidth are transferred to the GPU
/// 2. A compute shader counts neighbors within bandwidth/2 for all points in parallel
/// 3. The results are transferred back to the CPU
/// 4. The final entropy values are calculated by applying logarithm
///
/// # Performance Characteristics
///
/// The Box kernel GPU implementation shows a different performance profile compared to the Gaussian kernel:
/// - For small datasets (100-1000 points), the CPU implementation is faster due to GPU setup overhead
/// - For medium datasets (5000 points), the GPU implementation shows significant speedups (1.7-12.6x)
/// - For large datasets (10000+ points), the GPU implementation provides dramatic speedups (9.5-37.1x)
/// - For high dimensions with large datasets, the GPU implementation completes calculations that
///   would timeout on the CPU
///
/// # Fallback Behavior
///
/// This method automatically falls back to the CPU implementation in the following cases:
/// - If the dataset has fewer than 5000 points (GPU overhead outweighs benefits)
/// - If the dimensionality exceeds 32 (current GPU implementation limitation)
/// - If any step of the GPU calculation fails (ensures robustness)
pub fn box_kernel_local_values_gpu(&self) -> Array1<f64> {
    // Check if dimensions are within supported range
    if K > 32 {
        // Diagnostics go to stderr so they don't pollute program output.
        eprintln!(
            "GPU implementation only supports up to 32 dimensions, falling back to CPU implementation"
        );
        return self.box_kernel_local_values();
    }
    // Check if we have enough points to make GPU acceleration worthwhile
    // Based on benchmark analysis, GPU is beneficial for Box kernel when dataset size >= 5000
    if self.points.len() < 5000 {
        return self.box_kernel_local_values();
    }
    // Try to run the GPU implementation, fall back to CPU if it fails
    match self.run_box_gpu_calculation() {
        Ok(result) => result,
        Err(e) => {
            eprintln!("GPU calculation failed: {e}, falling back to CPU implementation");
            self.box_kernel_local_values()
        }
    }
}
/// Main GPU calculation function for Gaussian kernel
///
/// This method handles the actual GPU computation for the Gaussian kernel entropy calculation.
/// It prepares the data for the GPU, runs the computation, and processes the results.
///
/// # Implementation Details
///
/// - Uses an adaptive radius calculation based on data size and bandwidth
/// - Applies dimension-dependent normalization to the results
/// - Uses a WGSL compute shader for parallel processing
/// - Handles numerical stability with Kahan summation for higher dimensions
///
/// # Returns
///
/// - `Ok(Array1<f64>)`: Array of local entropy values if the GPU calculation succeeds
/// - `Err(Box<dyn std::error::Error>)`: Error if any step of the GPU calculation fails
fn run_gaussian_gpu_calculation(&self) -> Result<Array1<f64>, Box<dyn std::error::Error>> {
let n = self.points.len();
// Calculate normalization factor: N * sqrt(det(2πΣ_scaled))
// det(2πΣ_scaled) = (2π)^K * det(Σ_scaled)
// det(Σ_scaled) = det(L * L^T) = det(L)^2
// Since L is lower triangular, det(L) is the product of its diagonal elements.
let det_scaled_cov = if let Some(ref l) = self.cholesky_factor {
let diag_prod: f64 = l.diag().iter().product();
diag_prod * diag_prod
} else {
// Fallback to diagonal covariance if cholesky_factor is None
self.std_devs
.iter()
.map(|&s| (self.bandwidth * s).powi(2))
.product()
};
// Normalization: N * (2π)^(K/2) * sqrt(det(Σ_scaled))
let normalization =
(n as f64) * (2.0 * std::f64::consts::PI).powf(K as f64 / 2.0) * det_scaled_cov.sqrt();
// Determine adaptive radius based on data density and max_eigenvalue
// We use a larger radius (6σ) to ensure better parity with Scipy which uses all points
let adaptive_radius = if self.n_samples > 5000 {
36.0 * self.max_eigenvalue // 6σ
} else {
64.0 * self.max_eigenvalue // 8σ
};
// Initialize wgpu
let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
// Request an adapter
let adapter = match block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
power_preference: wgpu::PowerPreference::HighPerformance,
compatible_surface: None,
force_fallback_adapter: false,
})) {
Ok(adapter) => adapter,
Err(_) => return Err("Failed to find an appropriate adapter".into()),
};
// Create device and queue
let (device, queue) = block_on(adapter.request_device(&wgpu::DeviceDescriptor {
label: Some("Gaussian Kernel Device"),
required_features: wgpu::Features::empty(),
required_limits: wgpu::Limits::default(),
memory_hints: wgpu::MemoryHints::default(),
trace: wgpu::Trace::default(),
}))?;
// Prepare data for GPU
let mut gpu_points = Vec::with_capacity(self.points.len());
for point in &self.points {
let mut gpu_point = GpuPoint {
values: [0.0; 32],
_padding: [],
};
// Copy point values to GPU point
for (i, &val) in point.iter().enumerate() {
gpu_point.values[i] = val as f32;
}
gpu_points.push(gpu_point);
}
// Prepare precision matrix for GPU
let mut gpu_precision_matrix = GpuPrecisionMatrix {
values: [0.0; 1024],
dim_count: K as u32,
_padding: [0; 3],
};
if let Some(ref prec) = self.precision_matrix {
for r in 0..K {
for c in 0..K {
// Store in row-major order with 32 columns for easy GPU indexing
gpu_precision_matrix.values[r * 32 + c] = prec[[r, c]] as f32;
}
}
} else {
// Fallback to diagonal precision
for i in 0..K {
let scale = self.bandwidth * self.std_devs[i];
if scale > 0.0 {
gpu_precision_matrix.values[i * 32 + i] = (1.0 / (scale * scale)) as f32;
}
}
}
let gpu_config = GpuConfig {
point_count: self.points.len() as u32,
dim_count: K as u32,
normalization: normalization as f32,
adaptive_radius: adaptive_radius as f32,
};
// Create buffers
let points_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("Points Buffer"),
contents: bytemuck::cast_slice(&gpu_points),
usage: wgpu::BufferUsages::STORAGE,
});
let precision_matrix_buffer =
device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("Precision Matrix Buffer"),
contents: bytemuck::bytes_of(&gpu_precision_matrix),
usage: wgpu::BufferUsages::STORAGE,
});
let config_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("Config Buffer"),
contents: bytemuck::bytes_of(&gpu_config),
usage: wgpu::BufferUsages::UNIFORM,
});
let output_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("Output Buffer"),
size: (self.points.len() * std::mem::size_of::<f32>()) as u64,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("Staging Buffer"),
size: (self.points.len() * std::mem::size_of::<f32>()) as u64,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
});
// Create shader module
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("Gaussian Kernel Shader"),
source: wgpu::ShaderSource::Wgsl(include_str!("gaussian_kernel.wgsl").into()),
});
// Create bind group layout
let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("Gaussian Kernel Bind Group Layout"),
entries: &[
wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: true },
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
},
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: true },
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
},
wgpu::BindGroupLayoutEntry {
binding: 2,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Uniform,
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
},
wgpu::BindGroupLayoutEntry {
binding: 3,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
},
],
});
// Create pipeline layout
let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("Gaussian Kernel Pipeline Layout"),
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
// Create compute pipeline
let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("Gaussian Kernel Pipeline"),
layout: Some(&pipeline_layout),
module: &shader,
entry_point: Some("main"),
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
// Create bind group
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("Gaussian Kernel Bind Group"),
layout: &bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: points_buffer.as_entire_binding(),
},
wgpu::BindGroupEntry {
binding: 1,
resource: precision_matrix_buffer.as_entire_binding(),
},
wgpu::BindGroupEntry {
binding: 2,
resource: config_buffer.as_entire_binding(),
},
wgpu::BindGroupEntry {
binding: 3,
resource: output_buffer.as_entire_binding(),
},
],
});
// Create command encoder
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("Gaussian Kernel Command Encoder"),
});
// Compute pass
{
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("Gaussian Kernel Compute Pass"),
timestamp_writes: None,
});
compute_pass.set_pipeline(&compute_pipeline);
compute_pass.set_bind_group(0, &bind_group, &[]);
// Dispatch workgroups
// Use 256 threads per workgroup
let workgroup_size = 256;
let workgroup_count = (self.points.len() as u32).div_ceil(workgroup_size);
compute_pass.dispatch_workgroups(workgroup_count, 1, 1);
}
// Copy output to staging buffer
encoder.copy_buffer_to_buffer(
&output_buffer,
0,
&staging_buffer,
0,
(self.points.len() * std::mem::size_of::<f32>()) as u64,
);
// Submit command buffer
queue.submit(std::iter::once(encoder.finish()));
// Read back results
let buffer_slice = staging_buffer.slice(..);
let (sender, receiver) = oneshot_channel();
buffer_slice.map_async(wgpu::MapMode::Read, move |v| {
sender.send(v).unwrap();
});
// Wait for the GPU to finish
device
.poll(wgpu::PollType::Wait)
.expect("Failed to poll device");
// Get the mapped data
if let Some(Ok(())) = block_on(receiver.receive()) {
let data = buffer_slice.get_mapped_range();
let result: Vec<f32> = bytemuck::cast_slice(&data).to_vec();
drop(data);
staging_buffer.unmap();
// Convert the results to f64 and return
let mut local_values = Array1::<f64>::zeros(self.points.len());
for (i, &val) in result.iter().enumerate() {
local_values[i] = val as f64;
}
Ok(local_values)
} else {
Err("Failed to read back results from GPU".into())
}
}
/// Main GPU calculation function for Box kernel
///
/// This method handles the actual GPU computation for the Box kernel entropy calculation.
/// It prepares the data for the GPU, runs the computation, and processes the results.
///
/// # Implementation Details
///
/// - Uses Manhattan distance to count neighbors within bandwidth/2
/// - Normalizes by the volume of the hypercube (bandwidth^d) and the number of samples
/// - Uses a WGSL compute shader for parallel processing
/// - Optimized for high-dimensional data and large datasets
///
/// # Returns
///
/// - `Ok(Array1<f64>)`: Array of local entropy values if the GPU calculation succeeds
/// - `Err(Box<dyn std::error::Error>)`: Error if any step of the GPU calculation fails.
///   Every failure mode returns `Err` (never panics) so callers can fall back to the
///   CPU implementation.
fn run_box_gpu_calculation(&self) -> Result<Array1<f64>, Box<dyn std::error::Error>> {
    // Calculate volume = bandwidth^d (where d = K)
    // This is the volume of the hypercube with side length = bandwidth
    let volume = self.bandwidth.powi(K as i32);
    // Normalization factor: N * volume
    // This is the denominator in the KDE formula: f̂(x) = (1/Nh^d) ∑ K((x - x_i)/h)
    // where K is the box kernel (uniform within the bandwidth)
    let normalization = self.n_samples as f64 * volume;
    // Initialize wgpu
    let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
    // Request an adapter; convert a missing adapter into a recoverable error
    // instead of panicking so the caller can fall back to the CPU path.
    let adapter = block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
        power_preference: wgpu::PowerPreference::HighPerformance,
        compatible_surface: None,
        force_fallback_adapter: false,
    }))
    .map_err(|_| "Failed to find an appropriate adapter")?;
    // Create device and queue
    let (device, queue) = block_on(adapter.request_device(&wgpu::DeviceDescriptor {
        label: Some("Box Kernel Device"),
        required_features: wgpu::Features::empty(),
        required_limits: wgpu::Limits::default(),
        memory_hints: wgpu::MemoryHints::default(),
        trace: wgpu::Trace::default(),
    }))?;
    // Prepare data for GPU: widen each f64 coordinate to f32 in a fixed 32-slot array
    let mut gpu_points = Vec::with_capacity(self.points.len());
    for point in &self.points {
        let mut gpu_point = GpuPoint {
            values: [0.0; 32],
            _padding: [],
        };
        // Copy point values to GPU point
        for (i, &val) in point.iter().enumerate() {
            gpu_point.values[i] = val as f32;
        }
        gpu_points.push(gpu_point);
    }
    // Prepare bandwidth for GPU
    let gpu_bandwidth = GpuBandwidth {
        value: self.bandwidth as f32,
        dim_count: K as u32,
        _padding: [0; 2],
    };
    let gpu_config = GpuConfig {
        point_count: self.points.len() as u32,
        dim_count: K as u32,
        normalization: normalization as f32,
        adaptive_radius: 0.0, // Not used for box kernel
    };
    // Create buffers
    let points_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: Some("Points Buffer"),
        contents: bytemuck::cast_slice(&gpu_points),
        usage: wgpu::BufferUsages::STORAGE,
    });
    let bandwidth_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: Some("Bandwidth Buffer"),
        contents: bytemuck::bytes_of(&gpu_bandwidth),
        usage: wgpu::BufferUsages::STORAGE,
    });
    let config_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: Some("Config Buffer"),
        contents: bytemuck::bytes_of(&gpu_config),
        usage: wgpu::BufferUsages::UNIFORM,
    });
    // Output lives on the GPU; a separate MAP_READ staging buffer is required
    // because STORAGE buffers cannot be mapped directly.
    let output_buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("Output Buffer"),
        size: (self.points.len() * std::mem::size_of::<f32>()) as u64,
        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
        mapped_at_creation: false,
    });
    let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("Staging Buffer"),
        size: (self.points.len() * std::mem::size_of::<f32>()) as u64,
        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    });
    // Create shader module
    let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: Some("Box Kernel Shader"),
        source: wgpu::ShaderSource::Wgsl(include_str!("box_kernel.wgsl").into()),
    });
    // Create bind group layout: points (0), bandwidth (1), config (2), output (3)
    let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("Box Kernel Bind Group Layout"),
        entries: &[
            wgpu::BindGroupLayoutEntry {
                binding: 0,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: true },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 1,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: true },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 2,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Uniform,
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 3,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: false },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
        ],
    });
    // Create pipeline layout
    let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
        label: Some("Box Kernel Pipeline Layout"),
        bind_group_layouts: &[&bind_group_layout],
        push_constant_ranges: &[],
    });
    // Create compute pipeline
    let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: Some("Box Kernel Pipeline"),
        layout: Some(&pipeline_layout),
        module: &shader,
        entry_point: Some("main"),
        compilation_options: wgpu::PipelineCompilationOptions::default(),
        cache: None,
    });
    // Create bind group
    let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("Box Kernel Bind Group"),
        layout: &bind_group_layout,
        entries: &[
            wgpu::BindGroupEntry {
                binding: 0,
                resource: points_buffer.as_entire_binding(),
            },
            wgpu::BindGroupEntry {
                binding: 1,
                resource: bandwidth_buffer.as_entire_binding(),
            },
            wgpu::BindGroupEntry {
                binding: 2,
                resource: config_buffer.as_entire_binding(),
            },
            wgpu::BindGroupEntry {
                binding: 3,
                resource: output_buffer.as_entire_binding(),
            },
        ],
    });
    // Create command encoder
    let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
        label: Some("Box Kernel Command Encoder"),
    });
    // Compute pass (scoped so the pass is ended before the copy is encoded)
    {
        let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("Box Kernel Compute Pass"),
            timestamp_writes: None,
        });
        compute_pass.set_pipeline(&compute_pipeline);
        compute_pass.set_bind_group(0, &bind_group, &[]);
        // Dispatch workgroups: one thread per point, 256 threads per workgroup
        let workgroup_size = 256;
        let workgroup_count = (self.points.len() as u32).div_ceil(workgroup_size);
        compute_pass.dispatch_workgroups(workgroup_count, 1, 1);
    }
    // Copy output to staging buffer
    encoder.copy_buffer_to_buffer(
        &output_buffer,
        0,
        &staging_buffer,
        0,
        (self.points.len() * std::mem::size_of::<f32>()) as u64,
    );
    // Submit command buffer
    queue.submit(std::iter::once(encoder.finish()));
    // Read back results
    let buffer_slice = staging_buffer.slice(..);
    let (sender, receiver) = oneshot_channel();
    buffer_slice.map_async(wgpu::MapMode::Read, move |v| {
        // Ignore a failed send (receiver dropped); the receive side already
        // treats a missing message as an error, so panicking here is unnecessary.
        let _ = sender.send(v);
    });
    // Wait for the GPU to finish. Propagate a poll failure as Err rather than
    // panicking so the caller can fall back to the CPU implementation.
    device
        .poll(wgpu::PollType::Wait)
        .map_err(|e| format!("Failed to poll device: {e}"))?;
    // Get the mapped data
    if let Some(Ok(())) = block_on(receiver.receive()) {
        let data = buffer_slice.get_mapped_range();
        let result: Vec<f32> = bytemuck::cast_slice(&data).to_vec();
        drop(data);
        staging_buffer.unmap();
        // Convert the results to f64 and return
        let mut local_values = Array1::<f64>::zeros(self.points.len());
        for (i, &val) in result.iter().enumerate() {
            local_values[i] = val as f64;
        }
        Ok(local_values)
    } else {
        Err("Failed to read back results from GPU".into())
    }
}
}