//! Fast, simple and cross-platform GPGPU parallel computing library
//! NOTE : there are still some problems with the Vulkan backend on Linux
//! ## Example
//! - this example is for the v4.0.0 C ABI (note that since v5.0.0 the `data`
//!   field of `DataBinder` is a `uint8_t**`, so the two `data.data` assignments
//!   below would instead pass the address of a data pointer)
//! ```c
//! #include <stdio.h>
//! #include <stdint.h>
//! #include <stdlib.h>
//! #include "EMCompute.h"
//!
//! int main() {
//!   // Define the kernel
//!   CKernel kernel;
//!   kernel.x = 60000;  // Number of workgroups in the x dimension
//!   kernel.y = 1000;   // Number of workgroups in the y dimension
//!   kernel.z = 100;    // Number of workgroups in the z dimension
//!
//!   // WGSL code to perform element-wise addition of example_data and example_data0
//!   const char* code =
//!     "@group(0)@binding(0) var<storage, read_write> v_indices: array<u32>; "
//!     "@group(0)@binding(1) var<storage, read> v_indices0: array<u32>; "
//!     "@compute @workgroup_size(10 , 1 , 1)"
//!     "fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { "
//!     "  let idx = global_id.x % 60000; "
//!     "  v_indices[idx] = v_indices[idx] + v_indices0[idx]; "
//!     "}";
//!
//!   uintptr_t index = set_kernel_default_config(&kernel);
//!   kernel.kernel_code_index = register_computing_kernel_code(index , code , "main");
//!
//!   // Initialize data
//!   uint32_t example_data[60000];
//!   uint32_t example_data0[60000];
//!
//!   for (int i = 0; i < 60000; ++i) {
//!     example_data[i] = 1;
//!     example_data0[i] = 1;
//!   }
//!
//!   // Bind data
//!   DataBinder data;
//!   data.bind = 0;
//!   data.data = (uint8_t *)example_data;
//!   data.data_len = sizeof(uint32_t)*60000/sizeof(uint8_t);
//!
//!   DataBinder data0;
//!   data0.bind = 1;
//!   data0.data = (uint8_t *)example_data0;
//!   data0.data_len = sizeof(uint32_t)*60000/sizeof(uint8_t);
//!
//!   DataBinder group0[] = {data, data0};
//!   GroupOfBinders wrapper;
//!   wrapper.group = 0;
//!   wrapper.datas = group0;
//!   wrapper.datas_len = 2;
//!
//!   GroupOfBinders groups[] = {wrapper};
//!
//!   compute(&kernel, groups, 1);
//!
//!   // Check results
//!   printf("example_data[50000]: %u\n", example_data[50000]);
//!   printf("example_data0[4]: %u\n", example_data0[4]);
//!
//!   free_compute_cache();
//!
//!   return 0;
//! }
//! ```


use std::os::raw::c_char;

use wgpu::util::DeviceExt;

use std::sync::{Arc, Mutex};

use core::ops::Range;

mod configuration;
pub use configuration::{
    GPUComputingBackend,
    GPUPowerSettings,
    GPUSpeedSettings,
    GPUMemorySettings,
    GPUComputingConfig,
};

mod util;
use util::cchar_as_string;
mod gpu_device;
pub use gpu_device::{
    GPUDeviceType,
    GPUDeviceInfo,
    GPUDevices,
    get_computing_gpu_infos,
    free_gpu_devices_infos,
};


struct GPUDeviceCollection {
    compute_pipeline: Arc<wgpu::ComputePipeline>,
}

struct GPUCollection {
    device: Arc<wgpu::Device>,
    queue: Arc<wgpu::Queue>,
    res: Option<Arc<Mutex<Vec<GPUDeviceCollection>>>>,
}


static mut GPU_RES_KEEPER: Option<Arc<Mutex<Vec<GPUCollection>>>> = None;

#[no_mangle]
/// since v4.0.0 you must call create_computing_gpu_resources first ;
/// it returns a gpu_res_descriptor as uintptr_t (usize) ,
/// which you then pass as the config_index value of your
/// CKernel variable
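///
/// as a quick illustration , a hedged C sketch of a manual setup ,
/// mirroring the defaults used by set_kernel_default_config (the enum
/// constant names here assume the generated header exposes the Rust
/// variant names as written) :
/// ```c
/// GPUComputingConfig config;
/// config.backend = opengl;
/// config.power = HighPerformance;
/// config.speed = low_speed;
/// config.memory = prefer_memory;
/// config.gpu_index_in_backend_group = -1; // negative -> let the api pick an adapter
///
/// GPUCustomSettings customize = {0}; // only read for custom_speed / custom_memory
///
/// uintptr_t gpu_res_index = create_computing_gpu_resources(config, customize);
/// ```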
pub extern "C" fn create_computing_gpu_resources(config: GPUComputingConfig, customize: GPUCustomSettings) -> usize {

    let backend = match config.backend {
        GPUComputingBackend::vulkan => wgpu::Backends::VULKAN,
        GPUComputingBackend::opengl => wgpu::Backends::GL,
        GPUComputingBackend::all => wgpu::Backends::all(),
        GPUComputingBackend::default_backend => wgpu::Backends::default(),
        GPUComputingBackend::metal => wgpu::Backends::METAL,
        GPUComputingBackend::direct_x12 => wgpu::Backends::DX12,
        GPUComputingBackend::highest_support => wgpu::Backends::PRIMARY,
        GPUComputingBackend::lowest_support => wgpu::Backends::SECONDARY,
        GPUComputingBackend::webgpu => wgpu::Backends::BROWSER_WEBGPU,
    };

    let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
        backends: backend,
        ..Default::default()
    });

    // a negative gpu_index_in_backend_group lets the api request any
    // adapter matching the power preference ; otherwise the adapter at
    // that index in the backend group is used
    let adapter: wgpu::Adapter = if config.gpu_index_in_backend_group < 0 {
        pollster::block_on(instance
            .request_adapter(&wgpu::RequestAdapterOptions {
                power_preference: match config.power {
                    GPUPowerSettings::none => wgpu::PowerPreference::None,
                    GPUPowerSettings::LowPower => wgpu::PowerPreference::LowPower,
                    GPUPowerSettings::HighPerformance => wgpu::PowerPreference::HighPerformance,
                },
                ..Default::default()
            }))
            .expect("ERROR : could not allocate gpu resources which match your configs")
    } else {
        let mut adapters = instance.enumerate_adapters(backend);
        adapters.swap_remove(config.gpu_index_in_backend_group as usize)
    };

    let (device, queue) = pollster::block_on(adapter
        .request_device(
            &wgpu::DeviceDescriptor {
                label: None,
                required_features: wgpu::Features::empty(),
                required_limits: match config.speed {
                    GPUSpeedSettings::lowest_speed => wgpu::Limits::downlevel_webgl2_defaults(),
                    GPUSpeedSettings::low_speed => wgpu::Limits::downlevel_defaults(),
                    GPUSpeedSettings::custom_speed => customize.gpu_speed_custom.to_gpu_limits(),
                    GPUSpeedSettings::default_speed => wgpu::Limits::default(),
                },
                memory_hints: match config.memory {
                    GPUMemorySettings::prefer_performance => wgpu::MemoryHints::Performance,
                    GPUMemorySettings::prefer_memory => wgpu::MemoryHints::MemoryUsage,
                    GPUMemorySettings::custom_memory => wgpu::MemoryHints::Manual {
                        suballocated_device_memory_block_size: customize.gpu_memory_custom.to_rs_range(),
                    },
                },
            },
            None,
        ))
        .expect("ERROR : could not allocate gpu resources which match your configs");

    unsafe {
        let device = Arc::new(device);
        let queue = Arc::new(queue);

        if GPU_RES_KEEPER.is_none() {
            GPU_RES_KEEPER = Some(Arc::new(Mutex::new(Vec::new())));
        }

        let arci = GPU_RES_KEEPER.clone().unwrap();
        let mut gpu_data = arci.lock().unwrap();

        let setting_cache_index = gpu_data.len();
        gpu_data.push(GPUCollection {
            device: Arc::clone(&device),
            queue: Arc::clone(&queue),
            res: None,
        });

        return setting_cache_index;
    }
}

#[no_mangle]
/// since v4.0.0 your kernel code must be registered before
/// you can use it . gpu_res_index is the gpu resource descriptor
/// which you get from create_computing_gpu_resources .
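///
/// a minimal C sketch (gpu_res_index is assumed to be the value
/// returned by create_computing_gpu_resources) :
/// ```c
/// const char* code =
///   "@group(0)@binding(0) var<storage, read_write> v_indices: array<u32>; "
///   "@compute @workgroup_size(10 , 1 , 1)"
///   "fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { "
///   "  v_indices[global_id.x] = v_indices[global_id.x] + 1; "
///   "}";
///
/// uintptr_t kernel_code_index = register_computing_kernel_code(gpu_res_index, code, "main");
/// ```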
pub extern "C" fn register_computing_kernel_code(gpu_res_index: usize, code: *const c_char, entry_point: *const c_char) -> usize {
    unsafe {
        match &GPU_RES_KEEPER {
            None => {
                panic!("ERROR : use the create_computing_gpu_resources function first to add and get the index of your config !");
            },
            Some(arci) => {
                let mut gpu_data = arci.lock().unwrap();
                if gpu_data.len() <= gpu_res_index {
                    panic!("ERROR : invalid gpu_res_index provided for register_computing_kernel_code , please use the number which you received from the create_computing_gpu_resources function");
                }

                let shader = gpu_data[gpu_res_index].device.create_shader_module(wgpu::ShaderModuleDescriptor {
                    label: Some("Shader"),
                    source: wgpu::ShaderSource::Wgsl(cchar_as_string(code).expect("ERROR : no computing kernel code provided , code field is not set .").into()),
                });

                let compute_pipeline = gpu_data[gpu_res_index].device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
                    label: None,
                    layout: None,
                    module: &shader,
                    entry_point: &cchar_as_string(entry_point).expect("ERROR : no entry_point provided , it must be the name of the function which your kernel code starts from"),
                    compilation_options: Default::default(),
                    cache: None,
                });

                let arc_compute_pipe = Arc::new(compute_pipeline);

                // create the per-config kernel-code cache on first use ,
                // then push the new pipeline in either case
                if gpu_data[gpu_res_index].res.is_none() {
                    gpu_data[gpu_res_index].res = Some(Arc::new(Mutex::new(Vec::new())));
                }

                let arci = gpu_data[gpu_res_index].res.clone().unwrap();
                let mut gpu_device_res = arci.lock().unwrap();
                let index = gpu_device_res.len();
                gpu_device_res.push(GPUDeviceCollection {
                    compute_pipeline: Arc::clone(&arc_compute_pipe),
                });

                return index;
            }
        }
    }
}

#[no_mangle]
/// when your work with the kernel codes is fully finished and you
/// won't need to use them anymore , you can use this
/// function to clean up all the mess they created in memory
pub extern "C" fn free_compute_kernel_codes(gpu_res_index: usize) {
    unsafe {
        match &GPU_RES_KEEPER {
            None => return,
            Some(arci) => {
                let mut gpu_data = arci.lock().unwrap();
                // note : <= instead of < , otherwise a gpu_res_index equal
                // to len would panic on the indexing below
                if gpu_data.len() <= gpu_res_index {
                    return;
                } else {
                    gpu_data[gpu_res_index].res = None;
                    return;
                }
            }
        }
    }
}


#[repr(C)]
#[derive(Debug, Clone)]
/// CKernel represents your GPU task ,
/// much like how Manifest.xml does in an Android
/// project
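///
/// a short C sketch of filling one by hand , assuming gpu_res_index and
/// kernel_code_index were returned by create_computing_gpu_resources and
/// register_computing_kernel_code :
/// ```c
/// CKernel kernel;
/// kernel.x = 60000; // workgroups in the x dimension
/// kernel.y = 1;
/// kernel.z = 1;
/// kernel.config_index = gpu_res_index;
/// kernel.kernel_code_index = kernel_code_index;
/// ```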
pub struct CKernel {
    /// sets the number of workgroups in the x dimension
    pub x: u32,
    /// sets the number of workgroups in the y dimension
    pub y: u32,
    /// sets the number of workgroups in the z dimension
    pub z: u32,
    /// since v4.0.0 , instead of directly passing
    /// kernel code , you have to pass the return
    /// value of register_computing_kernel_code
    /// to this field
    pub kernel_code_index: usize,
    /// since v4.0.0 , instead of directly passing
    /// the configs of your computing task ,
    /// you have to pass the create_computing_gpu_resources
    /// return value to this field
    pub config_index: usize,
}

#[no_mangle]
/// because setting the CKernel config can be annoying if you just
/// want to do a simple task , this function provides a general
/// config which will meet most of your needs . since v4.0.0
/// this function calls create_computing_gpu_resources automatically
/// and assigns its return value to the config_index of your CKernel variable .
/// only use this function once in your program ; instead of calling it
/// many times and causing memory leaks (though all that memory can be freed by the free_compute_cache function) ,
/// reuse the config_index field of your CKernel variable
pub extern "C" fn set_kernel_default_config(kernel: *mut CKernel) -> usize {
    if kernel.is_null() {
        panic!("ERROR : NULL value provided for set_kernel_default_config");
    }

    unsafe {
        let kernel = &mut *kernel;

        let config = GPUComputingConfig {
            backend: GPUComputingBackend::opengl,
            power: GPUPowerSettings::HighPerformance,
            speed: GPUSpeedSettings::low_speed,
            memory: GPUMemorySettings::prefer_memory,
            gpu_index_in_backend_group: -1,
        };

        let customize = GPUCustomSettings::default();

        let index = create_computing_gpu_resources(config, customize);

        kernel.config_index = index;

        return index;
    }
}


impl CKernel {
    // this function resolves the cached config and kernel-code indexes
    // to their equivalent gpu resources
    fn get_real_config(&self) -> (Arc<wgpu::Device>, Arc<wgpu::Queue>, Arc<wgpu::ComputePipeline>) {
        unsafe {
            match &GPU_RES_KEEPER {
                None => {
                    panic!("ERROR : before using the compute function you must use create_computing_gpu_resources and register_computing_kernel_code");
                },
                Some(arci) => {
                    let gpu_data = arci.lock().unwrap();
                    if gpu_data.len() <= self.config_index {
                        panic!("ERROR : invalid config_index used for CKernel arg");
                    }
                    if let Some(arcii) = &gpu_data[self.config_index].res {
                        let gpu_device_data = arcii.lock().unwrap();
                        if gpu_device_data.len() <= self.kernel_code_index {
                            panic!("ERROR : invalid kernel_code_index used for CKernel arg");
                        }

                        (
                            Arc::clone(&gpu_data[self.config_index].device),
                            Arc::clone(&gpu_data[self.config_index].queue),
                            Arc::clone(&gpu_device_data[self.kernel_code_index].compute_pipeline),
                        )
                    } else {
                        panic!("ERROR : before using the compute function you must use register_computing_kernel_code");
                    }
                },
            }
        }
    }
}

#[repr(C)]
#[derive(Debug, Clone)]
/// this struct is for passing
/// data based on its bind index
/// on the gpu side
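///
/// a hedged C sketch of one binder under the v5.0.0 ABI (the buffer is
/// heap-allocated here because compute() takes ownership of the old
/// buffer and repoints the pointer at the results) :
/// ```c
/// uint32_t *numbers = (uint32_t *)malloc(sizeof(uint32_t) * 60000);
/// uint8_t *bytes = (uint8_t *)numbers;
///
/// DataBinder binder;
/// binder.bind = 0; // matches @binding(0) in the kernel code
/// binder.data_len = sizeof(uint32_t) * 60000 / sizeof(uint8_t);
/// binder.data = &bytes; // uint8_t** ; read the results back through bytes after compute()
/// ```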
pub struct DataBinder {
    /// bind index of the data on the gpu side
    pub bind: u32,
    /// because the data must be passed as uint8_t (u8 in Rust) ,
    /// in C you have to compute the data len this way :
    ///
    /// sizeof(your type) * real_len_of_your_array / sizeof(uint8_t)
    pub data_len: usize,
    /// address of the pointer (since v5.0.0) which holds your data in memory ; it must be
    /// uint8_t** (*mut *mut u8 on the Rust side) .
    /// on the gpu side the type of this data will
    /// be set based on the CKernel code you provided
    pub data: *mut *mut u8,
}

#[repr(C)]
#[derive(Debug, Clone, Default)]
/// this struct represents the custom settings
pub struct GPUCustomSettings {
    /// this field keeps the custom speed settings
    pub gpu_speed_custom: GPUSpeedCustom,
    /// this field keeps the custom memory settings
    pub gpu_memory_custom: GPUMemoryCustom,
}

#[repr(C)]
#[derive(Debug, Clone, Default)]
/// with this struct you set the min - max amount of
/// memory you will need on the gpu side
pub struct GPUMemoryCustom {
    /// min memory needed on the gpu side
    pub min: u64,
    /// max memory needed on the gpu side
    pub max: u64,
}

impl GPUMemoryCustom {
    fn to_rs_range(&self) -> Range<u64> {
        Range { start: self.min, end: self.max }
    }
}

#[repr(C)]
#[derive(Debug, Clone, Default)]
/// this struct is used for advanced customizations referred to as
/// custom_speed settings
pub struct GPUSpeedCustom {
    pub max_texture_dimension_1d: u32,
    pub max_texture_dimension_2d: u32,
    pub max_texture_dimension_3d: u32,
    pub max_texture_array_layers: u32,
    pub max_bind_groups: u32,
    pub max_bindings_per_bind_group: u32,
    pub max_dynamic_uniform_buffers_per_pipeline_layout: u32,
    pub max_dynamic_storage_buffers_per_pipeline_layout: u32,
    pub max_sampled_textures_per_shader_stage: u32,
    pub max_samplers_per_shader_stage: u32,
    pub max_storage_buffers_per_shader_stage: u32,
    pub max_storage_textures_per_shader_stage: u32,
    pub max_uniform_buffers_per_shader_stage: u32,
    pub max_uniform_buffer_binding_size: u32,
    pub max_storage_buffer_binding_size: u32,
    pub max_vertex_buffers: u32,
    pub max_buffer_size: u64,
    pub max_vertex_attributes: u32,
    pub max_vertex_buffer_array_stride: u32,
    pub min_uniform_buffer_offset_alignment: u32,
    pub min_storage_buffer_offset_alignment: u32,
    pub max_inter_stage_shader_components: u32,
    pub max_color_attachments: u32,
    pub max_color_attachment_bytes_per_sample: u32,
    pub max_compute_workgroup_storage_size: u32,
    pub max_compute_invocations_per_workgroup: u32,
    pub max_compute_workgroup_size_x: u32,
    pub max_compute_workgroup_size_y: u32,
    pub max_compute_workgroup_size_z: u32,
    pub max_compute_workgroups_per_dimension: u32,
    pub min_subgroup_size: u32,
    pub max_subgroup_size: u32,
    pub max_push_constant_size: u32,
    pub max_non_sampler_bindings: u32,
}

impl GPUSpeedCustom {
    fn to_gpu_limits(&self) -> wgpu::Limits {
        wgpu::Limits {
            max_texture_dimension_1d: self.max_texture_dimension_1d,
            max_texture_dimension_2d: self.max_texture_dimension_2d,
            max_texture_dimension_3d: self.max_texture_dimension_3d,
            max_texture_array_layers: self.max_texture_array_layers,
            max_bind_groups: self.max_bind_groups,
            max_bindings_per_bind_group: self.max_bindings_per_bind_group,
            max_dynamic_uniform_buffers_per_pipeline_layout: self.max_dynamic_uniform_buffers_per_pipeline_layout,
            max_dynamic_storage_buffers_per_pipeline_layout: self.max_dynamic_storage_buffers_per_pipeline_layout,
            max_sampled_textures_per_shader_stage: self.max_sampled_textures_per_shader_stage,
            max_samplers_per_shader_stage: self.max_samplers_per_shader_stage,
            max_storage_buffers_per_shader_stage: self.max_storage_buffers_per_shader_stage,
            max_storage_textures_per_shader_stage: self.max_storage_textures_per_shader_stage,
            max_uniform_buffers_per_shader_stage: self.max_uniform_buffers_per_shader_stage,
            max_uniform_buffer_binding_size: self.max_uniform_buffer_binding_size,
            max_storage_buffer_binding_size: self.max_storage_buffer_binding_size,
            max_vertex_buffers: self.max_vertex_buffers,
            max_buffer_size: self.max_buffer_size,
            max_vertex_attributes: self.max_vertex_attributes,
            max_vertex_buffer_array_stride: self.max_vertex_buffer_array_stride,
            min_uniform_buffer_offset_alignment: self.min_uniform_buffer_offset_alignment,
            min_storage_buffer_offset_alignment: self.min_storage_buffer_offset_alignment,
            max_inter_stage_shader_components: self.max_inter_stage_shader_components,
            max_color_attachments: self.max_color_attachments,
            max_color_attachment_bytes_per_sample: self.max_color_attachment_bytes_per_sample,
            max_compute_workgroup_storage_size: self.max_compute_workgroup_storage_size,
            max_compute_invocations_per_workgroup: self.max_compute_invocations_per_workgroup,
            max_compute_workgroup_size_x: self.max_compute_workgroup_size_x,
            max_compute_workgroup_size_y: self.max_compute_workgroup_size_y,
            max_compute_workgroup_size_z: self.max_compute_workgroup_size_z,
            max_compute_workgroups_per_dimension: self.max_compute_workgroups_per_dimension,
            min_subgroup_size: self.min_subgroup_size,
            max_subgroup_size: self.max_subgroup_size,
            max_push_constant_size: self.max_push_constant_size,
            max_non_sampler_bindings: self.max_non_sampler_bindings,
        }
    }
}

/*
impl DataBinder {
    // this function is kept for future implementations
    unsafe fn data_as_vec(&self) -> Option<Vec<u8>> {
        if self.data.is_null() {
            None
        } else {
            // Create a Vec<u8> that shares the memory but doesn't deallocate it.
            Some(Vec::from_raw_parts(self.data, self.data_len, self.data_len))
        }
    }
}
*/

#[repr(C)]
#[derive(Debug, Clone)]
/// all DataBinder values which have
/// the same @group index in your kernel
/// code must be gathered in this
/// type
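///
/// a short C sketch , grouping two binders which share @group(0) in the
/// kernel code (binder0 and binder1 are assumed to be DataBinder values) :
/// ```c
/// DataBinder group0[] = {binder0, binder1};
///
/// GroupOfBinders wrapper;
/// wrapper.group = 0;
/// wrapper.datas = group0;
/// wrapper.datas_len = 2;
/// ```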
pub struct GroupOfBinders {
    /// index of the group in your kernel code
    pub group: u32,
    /// pointer to an array in which all of the
    /// DataBinders from the same group
    /// are gathered
    pub datas: *mut DataBinder,
    /// len of the datas array
    pub datas_len: usize,
}


#[no_mangle]
/// the simple and compact function for sending
/// your computing task to the gpu side
///
/// kernel param = CKernel type which acts as a Manifest for your gpu task
/// data_for_gpu = pointer to an array of GroupOfBinders which contains the data which must be sent to the gpu
/// gpu_data_len = len of that array of GroupOfBinders
///
/// unlike CUDA , you dont need to copy data to the gpu manually ; this function does it for you
/// in the most performant way possible
///
/// if you find any bug or any problem , help us to fix it -> https://github.com/SkillfulElectro/EMCompute.git
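///
/// a minimal C call site (kernel and wrapper come from the sketches
/// above) ; compute returns 0 on success and -1 when one of its pointer
/// args is NULL or a datas field is missing :
/// ```c
/// GroupOfBinders groups[] = {wrapper};
///
/// if (compute(&kernel, groups, 1) != 0) {
///     fprintf(stderr, "compute failed\n");
///     return 1;
/// }
/// ```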
pub extern "C" fn compute(kernel: *mut CKernel, data_for_gpu: *mut GroupOfBinders, gpu_data_len: usize) -> i32 {

    {
        if kernel.is_null() {
            println!("ERROR : kernel arg of compute function is NULL");
            return -1;
        }
        let kernel = unsafe { &mut *kernel };

        if data_for_gpu.is_null() {
            println!("ERROR : data_for_gpu arg of compute function is NULL");
            return -1;
        }

        let (device, queue, compute_pipeline) = kernel.get_real_config();

        let mut encoder =
            device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });

        let mut staging_buffers: Vec<wgpu::Buffer> = Vec::new();
        let mut sizes: Vec<wgpu::BufferAddress> = Vec::new();
        let mut storage_buffers: Vec<wgpu::Buffer> = Vec::new();

        let groups: &mut [GroupOfBinders] = unsafe { std::slice::from_raw_parts_mut(data_for_gpu, gpu_data_len) };

        {
            let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
                label: None,
                timestamp_writes: None,
            });

            for group in &mut *groups {
                let bind_group_layout = compute_pipeline.get_bind_group_layout(group.group);
                if group.datas.is_null() {
                    println!("ERROR : no data provided for datas field in data_for_gpu arg");
                    return -1;
                }

                let bindings: &mut [DataBinder] = unsafe {
                    std::slice::from_raw_parts_mut(group.datas, group.datas_len)
                };

                let mut tmp_staging_buffers: Vec<wgpu::Buffer> = Vec::new();
                let mut tmp_sizes: Vec<wgpu::BufferAddress> = Vec::new();
                let mut tmp_storage_buffers: Vec<wgpu::Buffer> = Vec::new();

                let mut entries: Vec<wgpu::BindGroupEntry> = Vec::new();

                for binder in &mut *bindings {
                    if binder.data.is_null() {
                        println!("ERROR : null data field in DataBinder found");
                        return -1;
                    }

                    let data: &[u8] = unsafe {
                        std::slice::from_raw_parts(*binder.data, binder.data_len)
                    };

                    let size = std::mem::size_of_val(data) as wgpu::BufferAddress;

                    // staging buffer , used to read the results back on the cpu side
                    let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
                        label: None,
                        size,
                        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
                        mapped_at_creation: false,
                    });

                    // storage buffer , initialized with the caller's data
                    let storage_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
                        label: Some("Storage Buffer"),
                        contents: data,
                        usage: wgpu::BufferUsages::STORAGE
                            | wgpu::BufferUsages::COPY_DST
                            | wgpu::BufferUsages::COPY_SRC,
                    });

                    tmp_sizes.push(size);
                    tmp_staging_buffers.push(staging_buffer);
                    tmp_storage_buffers.push(storage_buffer);
                }

                for (i, binder) in bindings.iter().enumerate() {
                    entries.push(wgpu::BindGroupEntry {
                        binding: binder.bind,
                        resource: tmp_storage_buffers[i].as_entire_binding(),
                    });
                }

                let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
                    label: None,
                    layout: &bind_group_layout,
                    entries: entries.as_slice(),
                });

                cpass.set_pipeline(&compute_pipeline);
                cpass.set_bind_group(group.group, &bind_group, &[]);

                storage_buffers.append(&mut tmp_storage_buffers);
                staging_buffers.append(&mut tmp_staging_buffers);
                sizes.append(&mut tmp_sizes);
            }

            cpass.insert_debug_marker("debug_marker");
            cpass.dispatch_workgroups(kernel.x, kernel.y, kernel.z);
        }

        for (index, storage_buffer) in storage_buffers.iter().enumerate() {
            encoder.copy_buffer_to_buffer(storage_buffer, 0, &staging_buffers[index], 0, sizes[index]);
        }

        queue.submit(Some(encoder.finish()));

        let mut index: usize = 0;
        for group in groups {
            let bindings: &mut [DataBinder] = unsafe {
                std::slice::from_raw_parts_mut(group.datas, group.datas_len)
            };

            for binder in bindings {
                // take ownership of the caller's previous buffer so that it is
                // freed when this Box is dropped ; *binder.data is repointed
                // below at a fresh buffer holding the gpu results
                let _old_data: Box<[u8]> = unsafe {
                    Box::from_raw(std::slice::from_raw_parts_mut(*binder.data, binder.data_len))
                };

                let buffer_slice = staging_buffers[index].slice(..);
                let (sender, receiver) = flume::bounded(1);
                buffer_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());

                device.poll(wgpu::Maintain::wait()).panic_on_timeout();

                if let Ok(Ok(())) = pollster::block_on(receiver.recv_async()) {
                    let mapped_data = buffer_slice.get_mapped_range();

                    unsafe {
                        let mapped_data_ptr = mapped_data.as_ptr();
                        let data: &[u8] = std::slice::from_raw_parts(mapped_data_ptr, binder.data_len);
                        // copy the mapped gpu results into a new heap buffer and
                        // hand its ownership to the caller through *binder.data
                        let tmp_box: Box<[u8]> = data.into();
                        *binder.data = tmp_box.as_ptr() as *mut u8;
                        std::mem::forget(tmp_box);
                    }

                    drop(mapped_data);
                    staging_buffers[index].unmap();
                } else {
                    panic!("failed to run compute on gpu!")
                }

                index += 1;
            }
        }

        return 0;
    }
}


#[no_mangle]
/// since version 2.0.0 the api does
/// caching of gpu resources in memory .
/// the api deallocates the caches
/// automatically , but in some cases
/// you might want to do it manually ,
/// so just call free_compute_cache();
pub extern "C" fn free_compute_cache() {
    unsafe {
        GPU_RES_KEEPER = None;
    }
}