nncombinator/cuda/
mod.rs

1//! Function to wrap and handle cuda kernel
2
3use std::fmt;
4use std::fmt::{Debug, Formatter};
5use std::marker::PhantomData;
6use std::sync::{Arc, Mutex};
7use cuda_runtime_sys::{cudaHostAllocDefault, dim3};
8use libc::{c_void};
9use rcudnn::Error;
10use rcudnn::utils::DataType;
11use rcudnn_sys::{cudaMemcpyKind, cudaStream_t, cudnnDataType_t};
12use crate::arr::{Arr, IntoConverter, MakeView, MakeViewMut, SerializedVec, SliceSize};
13use mem::{MemoryPool};
14use crate::device::{DeviceGpu};
15use crate::error::{CudaError, CudaRuntimeError, SizeMismatchError, TypeConvertError};
16use crate::layer::{BatchDataType, BatchSize};
17use crate::mem::AsRawSlice;
18use crate::ope::UnitValue;
19
20pub mod ffi;
21pub mod mem;
22pub mod kernel;
23pub mod cudnn;
24
25/// Trait to associate a type with a cudnn type
26pub trait DataTypeInfo {
27    /// get cudnn data type
28    fn cudnn_data_type() -> DataType;
29    /// get cudnn raw data type
30    fn cudnn_raw_data_type() -> cudnnDataType_t;
31    /// get size
32    fn size() -> usize;
33}
34impl DataTypeInfo for f32 {
35    fn cudnn_data_type() -> DataType {
36        DataType::Float
37    }
38    fn cudnn_raw_data_type() -> cudnnDataType_t {
39        cudnnDataType_t::CUDNN_DATA_FLOAT
40    }
41    fn size() -> usize {
42        4_usize
43    }
44}
45impl DataTypeInfo for f64 {
46    fn cudnn_data_type() -> DataType {
47        DataType::Double
48    }
49    fn cudnn_raw_data_type() -> cudnnDataType_t {
50        cudnnDataType_t::CUDNN_DATA_DOUBLE
51    }
52    fn size() -> usize {
53        8_usize
54    }
55}
56pub(crate) mod private {
57    pub trait AsConstKernelPtrBase {
58        fn as_const_kernel_ptr(&self) -> *mut libc::c_void;
59    }
60
61    pub trait AsMutKernelPtrBase {
62        fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void;
63    }
64
65    pub trait AsKernelPtrBase {
66        fn as_kernel_ptr(&mut self) -> *mut libc::c_void;
67    }
68}
69/// Trait defining the conversion to an immutable pointer type passed to the cuda kernel
70pub trait AsConstKernelPtr: private::AsConstKernelPtrBase {
71}
72/// Trait defining the conversion to an mutable pointer type passed to the cuda kernel
73pub trait AsMutKernelPtr: private::AsMutKernelPtrBase {
74}
75/// Trait defining the conversion to an pointer type passed to the cuda kernel
76pub trait AsKernelPtr: private::AsKernelPtrBase {
77}
78impl<T> AsConstKernelPtr for T where T: private::AsConstKernelPtrBase {}
79impl<T> AsMutKernelPtr for T where T: private::AsMutKernelPtrBase {}
80impl<T> private::AsKernelPtrBase for T where T: AsMutKernelPtr {
81    fn as_kernel_ptr(&mut self) -> *mut c_void {
82        self.as_mut_kernel_ptr()
83    }
84}
85impl<T> AsKernelPtr for T where T: private::AsKernelPtrBase {
86
87}
88/// Obtaining an immutable void pointer
89pub trait AsVoidPtr {
90    fn as_void_ptr(&self) -> *const libc::c_void;
91}
92/// Obtaining an mutable void pointer
93pub trait AsMutVoidPtr {
94    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void;
95}
96/// Obtaining an immutable pointer
97pub trait AsPtr<T> {
98    fn as_ptr(&self) -> *const T;
99}
100/// Obtaining an mutable pointer
101pub trait AsMutPtr<T> {
102    fn as_mut_ptr(&mut self) -> *mut T;
103}
104pub trait TryClone: Sized {
105    type Error;
106
107    fn try_clone(&self) -> Result<Self,Self::Error>;
108}
109impl AsVoidPtr for i32 {
110    fn as_void_ptr(&self) -> *const libc::c_void {
111        self as *const i32 as *const libc::c_void
112    }
113}
114impl AsMutVoidPtr for i32 {
115    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
116        self as *mut i32 as *mut libc::c_void
117    }
118}
119impl AsVoidPtr for u32 {
120    fn as_void_ptr(&self) -> *const libc::c_void {
121        self as *const u32 as *const libc::c_void
122    }
123}
124impl AsMutVoidPtr for u32 {
125    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
126        self as *mut u32 as *mut libc::c_void
127    }
128}
129impl AsVoidPtr for i64 {
130    fn as_void_ptr(&self) -> *const libc::c_void {
131        self as *const i64 as *const libc::c_void
132    }
133}
134impl AsMutVoidPtr for i64 {
135    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
136        self as *mut i64 as *mut libc::c_void
137    }
138}
139impl AsVoidPtr for u64 {
140    fn as_void_ptr(&self) -> *const libc::c_void {
141        self as *const u64 as *const libc::c_void
142    }
143}
144impl AsMutVoidPtr for u64 {
145    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
146        self as *mut u64 as *mut libc::c_void
147    }
148}
149impl AsVoidPtr for usize {
150    fn as_void_ptr(&self) -> *const libc::c_void {
151        self as *const usize as *const libc::c_void
152    }
153}
154impl AsMutVoidPtr for usize {
155    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
156        self as *mut usize as *mut libc::c_void
157    }
158}
159impl AsVoidPtr for f32 {
160    fn as_void_ptr(&self) -> *const libc::c_void {
161        self as *const f32 as *const libc::c_void
162    }
163}
164impl AsMutVoidPtr for f32 {
165    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
166        self as *mut f32 as *mut libc::c_void
167    }
168}
169impl AsVoidPtr for f64 {
170    fn as_void_ptr(&self) -> *const libc::c_void {
171        self as *const f64 as *const libc::c_void
172    }
173}
174impl AsMutVoidPtr for f64 {
175    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
176        self as *mut f64 as *mut libc::c_void
177    }
178}
179impl private::AsMutKernelPtrBase for i32 {
180    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
181       self as *mut i32 as *mut libc::c_void
182    }
183}
184impl private::AsMutKernelPtrBase for u32 {
185    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
186       self as *mut u32 as *mut libc::c_void
187    }
188}
189impl private::AsMutKernelPtrBase for i64 {
190    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
191       self as *mut i64 as *mut libc::c_void
192    }
193}
194impl private::AsMutKernelPtrBase for u64 {
195    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
196       self as *mut u64 as *mut libc::c_void
197    }
198}
199impl private::AsMutKernelPtrBase for usize {
200    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
201       self as *mut usize as *mut libc::c_void
202    }
203}
204impl private::AsMutKernelPtrBase for f32 {
205    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
206        self as *mut f32 as *mut libc::c_void
207    }
208}
209impl private::AsMutKernelPtrBase for f64 {
210    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
211        self as *mut f64 as *mut libc::c_void
212    }
213}
214/// Trait that defines the type of each element of a pointer
215pub trait PointerElement {
216    type Element: Default + Debug;
217}
218/// Trait to implement cuda synchronous memory read operations
219pub trait ReadMemory<T: Default + Debug>: PointerElement {
220    /// # Errors
221    ///
222    /// This function may return the following errors
223    /// * [`rcudnn::Error`]
224    ///
225    ///
226    fn read_to_vec(&self) -> Result<Vec<T>,rcudnn::Error>;
227    /// Read memory as Vec with size specified
228    /// # Arguments
229    /// * `size` - Number of elements of the value to be read
230    ///
231    /// # Errors
232    ///
233    /// This function may return the following errors
234    /// * [`rcudnn::Error`]
235    ///
236    fn read_to_vec_with_size(&self,size:usize) -> Result<Vec<T>,rcudnn::Error>;
237}
238/// Trait to implement cuda synchronous memory write operations
239pub trait WriteMemory<T: Default + Debug>: AsMutVoidPtr {
240    /// Memory Copy
241    /// # Arguments
242    /// * `p` - Pointer to source memory
243    /// * `len` - Number of elements of the value to be copied
244    ///
245    /// # Errors
246    ///
247    /// This function may return the following errors
248    /// * [`rcudnn::Error`]
249    ///
250    fn memcpy(&mut self, p:*const T,len:usize) -> Result<usize,rcudnn::Error>;
251    /// Repeatedly copy the contents of memory
252    /// # Arguments
253    /// * `p` - Pointer to source memory
254    /// * `len` - Number of elements of the value to be copied
255    /// * `count` - Number of times to copy repeatedly
256    ///
257    /// # Errors
258    ///
259    /// This function may return the following errors
260    /// * [`rcudnn::Error`]
261    ///
262    fn memcpy_repeat(&mut self, p:*const T,len:usize,count:usize) -> Result<usize,rcudnn::Error>;
263}
264/// Trait to implement cuda asynchronous memory read operations
265pub trait ReadMemoryAsync<T: Default + Debug> {
266    /// # Errors
267    ///
268    /// This function may return the following errors
269    /// * [`rcudnn::Error`]
270    ///
271    ///
272    fn read_to_vec_async(&self,stream:cudaStream_t) -> Result<Vec<T>,rcudnn::Error>;
273    /// Read memory as Vec with size specified
274    /// # Arguments
275    /// * `stream` - cuda stream
276    /// * `size` - Number of elements of the value to be read
277    ///
278    /// # Errors
279    ///
280    /// This function may return the following errors
281    /// * [`rcudnn::Error`]
282    ///
283    fn read_to_vec_with_size_async(&self,stream: cudaStream_t,size:usize) -> Result<Vec<T>,rcudnn::Error>;
284}
285/// Trait to implement cuda asynchronous memory write operations
286pub trait WriteMemoryAsync<T: Default + Debug>: AsMutVoidPtr {
287    /// Memory Copy
288    /// # Arguments
289    /// * `p` - Pointer to source memory
290    /// * `len` - Number of elements of the value to be copied
291    /// * `stream` - cuda stream
292    ///
293    /// # Errors
294    ///
295    /// This function may return the following errors
296    /// * [`rcudnn::Error`]
297    ///
298    fn memcpy_async(&mut self, p:*const T,len:usize,stream:cudaStream_t) -> Result<usize,rcudnn::Error>;
299    /// Repeatedly copy the contents of memory
300    /// # Arguments
301    /// * `p` - Pointer to source memory
302    /// * `len` - Number of elements of the value to be copied
303    /// * `count` - Number of times to copy repeatedly
304    /// * `stream` - cuda stream
305    ///
306    /// # Errors
307    ///
308    /// This function may return the following errors
309    /// * [`rcudnn::Error`]
310    fn memcpy_async_repeat(&mut self, p:*const T,len:usize,count:usize,stream:cudaStream_t) -> Result<usize,rcudnn::Error>;
311}
312/// Trait defining cuda's synchronous memory move to operations
313pub trait MemoryMoveTo<T: Default + Debug,D: AsMutPtr<T>>: AsPtr<T> {
314    /// Memory Copy To
315    /// # Arguments
316    /// * `dst` - Pointer to destination memory
317    /// * `len` - Number of elements of the value to be copied
318    ///
319    /// # Errors
320    ///
321    /// This function may return the following errors
322    /// * [`rcudnn::Error`]
323    ///
324    fn memcpy_to(&self, dst:&mut D,len:usize) -> Result<usize,rcudnn::Error>;
325}
326/// Trait defining cuda's asynchronous memory move to operations
327pub trait MemoryMoveToAsync<T: Default + Debug,D: AsMutPtr<T>>: AsPtr<T> {
328    /// Memory Copy To
329    /// # Arguments
330    /// * `dst` - Pointer to destination memory
331    /// * `len` - Number of elements of the value to be copied
332    /// * `stream` - cuda stream
333    ///
334    /// # Errors
335    ///
336    /// This function may return the following errors
337    /// * [`rcudnn::Error`]
338    ///
339    fn memcpy_to_async(&self, dst:&mut D,len:usize,stream:cudaStream_t) -> Result<usize,rcudnn::Error>;
340}
341/// Wrapper to handle cuda device memory
342#[derive(Debug)]
343pub struct CudaPtr<T> {
344    ptr:*mut T,
345    size:usize,
346}
347impl<T> CudaPtr<T> {
348    /// Create an instance of CudaPtr
349    /// # Arguments
350    /// * `size`- Number of value elements to be allocated
351    ///
352    /// # Errors
353    ///
354    /// This function may return the following errors
355    /// * [`CudaError`]
356    pub fn new(size: usize) -> Result<CudaPtr<T>, CudaError> {
357        let ptr: *mut T = ffi::malloc(size)?;
358
359        Ok(CudaPtr {
360            ptr: ptr,
361            size: size,
362        })
363    }
364}
365impl<T> PointerElement for CudaPtr<T> where T: Default + Debug {
366    type Element = T;
367}
368impl<T: Default + Debug> ReadMemory<T> for CudaPtr<T> {
369    fn read_to_vec(&self) -> Result<Vec<T>,rcudnn::Error> {
370        let mut r = Vec::with_capacity(self.size);
371        r.resize_with(self.size,Default::default);
372
373        ffi::memcpy(r.as_mut_ptr(),
374                         self.ptr,
375                         self.size,
376                         cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
377        Ok(r)
378    }
379
380    fn read_to_vec_with_size(&self,size:usize) -> Result<Vec<T>,rcudnn::Error> {
381        let mut r = Vec::with_capacity(size);
382        r.resize_with(size,Default::default);
383
384        ffi::memcpy(r.as_mut_ptr(),
385                         self.ptr,
386                         size,
387                         cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
388        Ok(r)
389    }
390}
391impl<T: Default + Debug> WriteMemory<T> for CudaPtr<T> {
392    fn memcpy(&mut self, p:*const T,len:usize) -> Result<usize,rcudnn::Error> {
393        ffi::memcpy(self.ptr,
394                    p,
395                    len,
396                    cudaMemcpyKind::cudaMemcpyHostToDevice)?;
397        Ok(len)
398    }
399
400    fn memcpy_repeat(&mut self, p: *const T, len: usize, count: usize) -> Result<usize, Error> {
401        for i in 0..count {
402            unsafe {
403                ffi::memcpy(self.ptr.add(i * len),
404                            p,
405                            len,
406                            cudaMemcpyKind::cudaMemcpyHostToDevice)?;
407            }
408        }
409        Ok(len * count)
410    }
411}
412impl<T: Default + Debug> MemoryMoveTo<T,CudaHostPtr<T>> for CudaPtr<T> {
413    fn memcpy_to(&self, dst: &mut CudaHostPtr<T>, len: usize) -> Result<usize, Error> {
414        ffi::memcpy(dst.as_mut_ptr(),
415                    self.ptr,
416                    len,
417                    cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
418        Ok(len)
419    }
420}
421impl<T: Default + Debug> MemoryMoveTo<T,CudaPtr<T>> for CudaPtr<T> {
422    fn memcpy_to(&self, dst: &mut CudaPtr<T>, len: usize) -> Result<usize, Error> {
423        ffi::memcpy(dst.as_mut_ptr(),
424                    self.ptr,
425                    len,
426                    cudaMemcpyKind::cudaMemcpyDeviceToDevice)?;
427        Ok(len)
428    }
429}
430impl<T: Default + Debug> MemoryMoveTo<T,CudaMemoryPoolPtr<T>> for CudaPtr<T> {
431    fn memcpy_to(&self, dst: &mut CudaMemoryPoolPtr<T>, len: usize) -> Result<usize, Error> {
432        ffi::memcpy(dst.as_mut_ptr(),
433                    self.ptr,
434                    len,
435                    cudaMemcpyKind::cudaMemcpyDeviceToDevice)?;
436        Ok(len)
437    }
438}
439impl<T> Drop for CudaPtr<T> {
440    fn drop(&mut self) {
441        ffi::free(self.ptr).unwrap();
442    }
443}
444impl<T> private::AsConstKernelPtrBase for CudaPtr<T> {
445    fn as_const_kernel_ptr(&self) -> *mut libc::c_void {
446        &self.ptr as *const *mut T as *mut libc::c_void
447    }
448}
449impl<T> private::AsMutKernelPtrBase for CudaPtr<T> {
450    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
451        &mut self.ptr as *mut *mut T as *mut libc::c_void
452    }
453}
454impl<T> AsVoidPtr for CudaPtr<T> {
455    fn as_void_ptr(&self) -> *const libc::c_void {
456        self.ptr as *const libc::c_void
457    }
458}
459impl<T> AsMutVoidPtr for CudaPtr<T> {
460    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
461        self.ptr as *mut libc::c_void
462    }
463}
464impl<T> AsPtr<T> for CudaPtr<T> {
465    fn as_ptr(&self) -> *const T {
466        self.ptr as *const T
467    }
468}
469impl<T> AsMutPtr<T> for CudaPtr<T> {
470    fn as_mut_ptr(&mut self) -> *mut T {
471        self.ptr
472    }
473}
474/// Wrapper to handle cuda host memory
475#[derive(Debug)]
476pub struct CudaHostPtr<T> {
477    ptr:*mut T,
478    size:usize,
479}
480impl<T> CudaHostPtr<T> {
481    /// Create an instance of CudaHostPtr
482    /// # Arguments
483    /// * `size`- Number of value elements to be allocated
484    /// * `flags` - Requested properties of allocated memory
485    ///
486    /// # Errors
487    ///
488    /// This function may return the following errors
489    /// * [`CudaError`]
490    pub fn new(size: usize, flags:libc::c_uint) -> Result<CudaHostPtr<T>, CudaError> {
491        let ptr: *mut T = ffi::malloc_host(size,flags)?;
492
493        Ok(CudaHostPtr {
494            ptr: ptr,
495            size: size,
496        })
497    }
498}
499impl<T: Default + Debug> PointerElement for CudaHostPtr<T> {
500    type Element = T;
501}
502impl<T: Default + Debug> ReadMemory<T> for CudaHostPtr<T> {
503    fn read_to_vec(&self) -> Result<Vec<T>,rcudnn::Error> {
504        let mut r = Vec::with_capacity(self.size);
505        r.resize_with(self.size,Default::default);
506
507        ffi::memcpy(r.as_mut_ptr(),
508                         self.ptr,
509                         self.size,
510                         cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
511        Ok(r)
512    }
513
514    fn read_to_vec_with_size(&self,size:usize) -> Result<Vec<T>,rcudnn::Error> {
515        let mut r = Vec::with_capacity(size);
516        r.resize_with(size,Default::default);
517
518        ffi::memcpy(r.as_mut_ptr(),
519                         self.ptr,
520                         size,
521                         cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
522        Ok(r)
523    }
524}
525impl<T: Default + Debug> WriteMemory<T> for CudaHostPtr<T> {
526    fn memcpy(&mut self, p:*const T,len:usize) -> Result<usize,rcudnn::Error> {
527        ffi::memcpy(self.ptr,
528                    p,
529                    len,
530                    cudaMemcpyKind::cudaMemcpyHostToDevice)?;
531        Ok(len)
532    }
533
534    fn memcpy_repeat(&mut self, p: *const T, len: usize, count: usize) -> Result<usize, Error> {
535        for i in 0..count {
536            unsafe {
537                ffi::memcpy(self.ptr.add(i * len),
538                            p,
539                            len,
540                            cudaMemcpyKind::cudaMemcpyHostToDevice)?;
541            }
542        }
543        Ok(len * count)
544    }
545}
546impl<T: Default + Debug> ReadMemoryAsync<T> for CudaHostPtr<T> {
547    fn read_to_vec_async(&self, stream: cudaStream_t) -> Result<Vec<T>,rcudnn::Error> {
548        let mut r = Vec::with_capacity(self.size);
549        r.resize_with(self.size,Default::default);
550
551        ffi::memcpy_async(r.as_mut_ptr(),
552                               self.ptr,
553                               self.size,
554                               cudaMemcpyKind::cudaMemcpyDeviceToHost,
555                               stream)?;
556        Ok(r)
557    }
558
559    fn read_to_vec_with_size_async(&self, stream: cudaStream_t, size:usize) -> Result<Vec<T>,rcudnn::Error> {
560        let mut r = Vec::with_capacity(size);
561        r.resize_with(size,Default::default);
562
563        ffi::memcpy_async(r.as_mut_ptr(),
564                               self.ptr,
565                               size,
566                               cudaMemcpyKind::cudaMemcpyDeviceToHost,
567                               stream)?;
568        Ok(r)
569    }
570}
571impl<T: Default + Debug> WriteMemoryAsync<T> for CudaHostPtr<T> {
572    fn memcpy_async(&mut self, p:*const T,len:usize,stream:cudaStream_t) -> Result<usize,rcudnn::Error> {
573        ffi::memcpy_async(self.ptr,
574                          p,
575                          len,
576                          cudaMemcpyKind::cudaMemcpyHostToDevice,stream)?;
577        Ok(len)
578    }
579
580    fn memcpy_async_repeat(&mut self, p: *const T, len: usize, count: usize, stream: cudaStream_t) -> Result<usize, Error> {
581        for i in 0..count {
582            unsafe {
583                ffi::memcpy_async(self.ptr.add(i * len),
584                                  p,
585                                  len,
586                                  cudaMemcpyKind::cudaMemcpyHostToDevice,stream)?;
587            }
588        }
589        Ok(len * count)
590
591    }
592}
593impl<T: Default + Debug> MemoryMoveTo<T,CudaHostPtr<T>> for CudaHostPtr<T> {
594    fn memcpy_to(&self, dst: &mut CudaHostPtr<T>, len: usize) -> Result<usize, Error> {
595        ffi::memcpy(dst.as_mut_ptr(),
596                    self.ptr,
597                    len,
598                    cudaMemcpyKind::cudaMemcpyHostToHost)?;
599        Ok(len)
600    }
601}
602impl<T: Default + Debug> MemoryMoveTo<T,CudaPtr<T>> for CudaHostPtr<T> {
603    fn memcpy_to(&self, dst: &mut CudaPtr<T>, len: usize) -> Result<usize, Error> {
604        ffi::memcpy(dst.as_mut_ptr(),
605                    self.ptr,
606                    len,
607                    cudaMemcpyKind::cudaMemcpyHostToDevice)?;
608        Ok(len)
609    }
610}
611impl<T: Default + Debug> MemoryMoveTo<T,CudaMemoryPoolPtr<T>> for CudaHostPtr<T> {
612    fn memcpy_to(&self, dst: &mut CudaMemoryPoolPtr<T>, len: usize) -> Result<usize, Error> {
613        ffi::memcpy(dst.as_mut_ptr(),
614                    self.ptr,
615                    len,
616                    cudaMemcpyKind::cudaMemcpyHostToDevice)?;
617        Ok(len)
618    }
619}
620impl<T: Default + Debug> MemoryMoveToAsync<T,CudaHostPtr<T>> for CudaHostPtr<T> {
621    fn memcpy_to_async(&self, dst: &mut CudaHostPtr<T>, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
622        ffi::memcpy_async(dst.as_mut_ptr(),
623                    self.ptr,
624                    len,
625                    cudaMemcpyKind::cudaMemcpyHostToHost,stream)?;
626        Ok(len)
627    }
628}
629impl<T: Default + Debug> MemoryMoveToAsync<T,CudaPtr<T>> for CudaHostPtr<T> {
630    fn memcpy_to_async(&self, dst: &mut CudaPtr<T>, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
631        ffi::memcpy_async(dst.as_mut_ptr(),
632                    self.ptr,
633                    len,
634                    cudaMemcpyKind::cudaMemcpyHostToDevice,stream)?;
635        Ok(len)
636    }
637}
638impl<T: Default + Debug> MemoryMoveToAsync<T,CudaMemoryPoolPtr<T>> for CudaHostPtr<T> {
639    fn memcpy_to_async(&self, dst: &mut CudaMemoryPoolPtr<T>, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
640        ffi::memcpy_async(dst.as_mut_ptr(),
641                    self.ptr,
642                    len,
643                    cudaMemcpyKind::cudaMemcpyHostToDevice,stream)?;
644        Ok(len)
645    }
646}
647impl<T> Drop for CudaHostPtr<T> {
648    fn drop(&mut self) {
649        ffi::free_host(self.ptr).unwrap()
650    }
651}
652impl<T> AsPtr<T> for CudaHostPtr<T> {
653    fn as_ptr(&self) -> *const T {
654        self.ptr as *const T
655    }
656}
657impl<T> AsMutPtr<T> for CudaHostPtr<T> {
658    fn as_mut_ptr(&mut self) -> *mut T {
659        self.ptr
660    }
661}
662impl<T> private::AsConstKernelPtrBase for CudaHostPtr<T> {
663    fn as_const_kernel_ptr(&self) -> *mut libc::c_void {
664        &self.ptr as *const *mut T as *mut libc::c_void
665    }
666}
667impl<T> private::AsMutKernelPtrBase for CudaHostPtr<T> {
668    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
669        &mut self.ptr as *mut *mut T as *mut libc::c_void
670    }
671}
672impl<T> AsVoidPtr for CudaHostPtr<T> {
673    fn as_void_ptr(&self) -> *const libc::c_void {
674        self.ptr as *const libc::c_void
675    }
676}
677impl<T> AsMutVoidPtr for CudaHostPtr<T> {
678    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
679        self.ptr as *mut libc::c_void
680    }
681}
682/// Cuda memory object allocated from the memory pool
683pub struct CudaMemoryPoolPtr<T> {
684    ptr:*mut T,
685    size:usize,
686    memory_pool:Arc<Mutex<MemoryPool>>
687}
688impl<T> CudaMemoryPoolPtr<T> {
689    /// Create an instance of CudaMemoryPoolPtr
690    /// # Arguments
691    /// * `size`- Number of value elements to be allocated
692    /// * `memory_pool` - memory pool object
693    ///
694    /// # Errors
695    ///
696    /// This function may return the following errors
697    /// * [`CudaError`]
698    pub fn new(size: usize,memory_pool:&Arc<Mutex<MemoryPool>>) -> Result<CudaMemoryPoolPtr<T>, CudaError> {
699        let ptr:*mut T = match memory_pool.lock() {
700            Ok(mut memory_pool) => {
701                memory_pool.alloc_device(size)?
702            },
703            Err(_) => {
704                return Err(CudaError::InvalidState(String::from(
705                    "Failed to secure exclusive lock on memory pool."
706                )));
707            }
708        };
709
710        Ok(CudaMemoryPoolPtr {
711            ptr: ptr,
712            size: size,
713            memory_pool:Arc::clone(memory_pool),
714        })
715    }
716}
717impl<T> CudaMemoryPoolPtr<T> where T: Default + Debug {
718    /// Create an instance of CudaMemoryPoolPtr
719    /// # Arguments
720    /// * `size`- Number of value elements to be allocated
721    /// * `memory_pool` - memory pool object
722    /// * `initializer` - Repeatedly called function to initialize each element
723    ///
724    /// # Errors
725    ///
726    /// This function may return the following errors
727    /// * [`CudaError`]
728    pub fn with_initializer<I: FnMut() -> T>(size: usize, memory_pool:&Arc<Mutex<MemoryPool>>, initializer: I) -> Result<CudaMemoryPoolPtr<T>, CudaError> {
729        let mut ptr = Self::new(size,memory_pool)?;
730
731        let mut src = Vec::with_capacity(size);
732
733        src.resize_with(size,initializer);
734
735        ptr.memcpy(src.into_boxed_slice().as_ptr(),size)?;
736
737        Ok(ptr)
738    }
739}
740impl<T: Default + Debug> PointerElement for CudaMemoryPoolPtr<T> {
741    type Element = T;
742}
743impl<T: Default + Debug> ReadMemory<T> for CudaMemoryPoolPtr<T> {
744    fn read_to_vec(&self) -> Result<Vec<T>,rcudnn::Error> {
745        let mut r = Vec::with_capacity(self.size);
746        r.resize_with(self.size,Default::default);
747
748        ffi::memcpy(r.as_mut_ptr(),
749                    self.ptr,
750                    self.size,
751                    cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
752        Ok(r)
753    }
754
755    fn read_to_vec_with_size(&self,size:usize) -> Result<Vec<T>,rcudnn::Error> {
756        let mut r = Vec::with_capacity(size);
757        r.resize_with(size,Default::default);
758
759        ffi::memcpy(r.as_mut_ptr(),
760                    self.ptr,
761                    size,
762                    cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
763        Ok(r)
764    }
765}
766impl<T: Default + Debug> WriteMemory<T> for CudaMemoryPoolPtr<T> {
767    fn memcpy(&mut self, p:*const T,len:usize) -> Result<usize,rcudnn::Error> {
768        ffi::memcpy(self.ptr,
769                    p,
770                    len,
771                    cudaMemcpyKind::cudaMemcpyHostToDevice)?;
772        Ok(len)
773    }
774
775    fn memcpy_repeat(&mut self, p: *const T, len: usize, count: usize) -> Result<usize, Error> {
776        for i in 0..count {
777            unsafe {
778                ffi::memcpy(self.ptr.add(i * len),
779                            p,
780                            len,
781                            cudaMemcpyKind::cudaMemcpyHostToDevice)?;
782            }
783        }
784        Ok(len * count)
785    }
786}
787impl<T: Default + Debug> MemoryMoveTo<T,CudaHostPtr<T>> for CudaMemoryPoolPtr<T> {
788    fn memcpy_to(&self, dst: &mut CudaHostPtr<T>, len: usize) -> Result<usize, Error> {
789        ffi::memcpy(dst.as_mut_ptr(),
790                    self.ptr,
791                    len,
792                    cudaMemcpyKind::cudaMemcpyDeviceToHost)?;
793        Ok(len)
794    }
795}
796impl<T: Default + Debug> MemoryMoveTo<T,CudaPtr<T>> for CudaMemoryPoolPtr<T> {
797    fn memcpy_to(&self, dst: &mut CudaPtr<T>, len: usize) -> Result<usize, Error> {
798        ffi::memcpy(dst.as_mut_ptr(),
799                    self.ptr,
800                    len,
801                    cudaMemcpyKind::cudaMemcpyDeviceToDevice)?;
802        Ok(len)
803    }
804}
805impl<T: Default + Debug> MemoryMoveTo<T,CudaMemoryPoolPtr<T>> for CudaMemoryPoolPtr<T> {
806    fn memcpy_to(&self, dst: &mut CudaMemoryPoolPtr<T>, len: usize) -> Result<usize, Error> {
807        ffi::memcpy(dst.as_mut_ptr(),
808                    self.ptr,
809                    len,
810                    cudaMemcpyKind::cudaMemcpyDeviceToDevice)?;
811        Ok(len)
812    }
813}
814impl<T> Drop for CudaMemoryPoolPtr<T> {
815    fn drop(&mut self) {
816        match self.memory_pool.lock() {
817            Ok(mut memory_pool) => {
818                memory_pool.deallocate(self.ptr).unwrap();
819            },
820            Err(_) => {
821                panic!("Failed to secure exclusive lock on memory pool.");
822            }
823        }
824    }
825}
826impl<T> private::AsConstKernelPtrBase for CudaMemoryPoolPtr<T> {
827    fn as_const_kernel_ptr(&self) -> *mut libc::c_void {
828        &self.ptr as *const *mut T as *mut libc::c_void
829    }
830}
831impl<T> private::AsMutKernelPtrBase for CudaMemoryPoolPtr<T> {
832    fn as_mut_kernel_ptr(&mut self) -> *mut libc::c_void {
833        &mut self.ptr as *mut *mut T as *mut libc::c_void
834    }
835}
836impl<T> AsVoidPtr for CudaMemoryPoolPtr<T> {
837    fn as_void_ptr(&self) -> *const libc::c_void {
838        self.ptr as *const libc::c_void
839    }
840}
841impl<T> AsMutVoidPtr for CudaMemoryPoolPtr<T> {
842    fn as_mut_void_ptr(&mut self) -> *mut libc::c_void {
843        self.ptr as *mut libc::c_void
844    }
845}
846impl<T> AsPtr<T> for CudaMemoryPoolPtr<T> {
847    fn as_ptr(&self) -> *const T {
848        self.ptr as *const T
849    }
850}
851impl<T> AsMutPtr<T> for CudaMemoryPoolPtr<T> {
852    fn as_mut_ptr(&mut self) -> *mut T {
853        self.ptr
854    }
855}
856impl<T> Debug for CudaMemoryPoolPtr<T> {
857    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
858        write!(f,"CudaMemoryPoolPtr {{ ptr: {:?}, size {:?} }}",self.ptr,self.size)
859    }
860}
861/// Type that represents a pointer of const type to be passed to Cuda
862#[derive(Debug)]
863pub struct CudaConstPtr<'a,T> where T: AsConstKernelPtr {
864    ptr:&'a T
865}
866impl<'a,T> CudaConstPtr<'a,T> where T: AsConstKernelPtr {
867    pub fn new(ptr: &'a T) -> CudaConstPtr<'a,T> {
868        CudaConstPtr {
869            ptr
870        }
871    }
872}
873impl<'a,T> private::AsKernelPtrBase for CudaConstPtr<'a,T> where T: AsConstKernelPtr {
874    fn as_kernel_ptr(&mut self) -> *mut c_void {
875        self.ptr.as_const_kernel_ptr()
876    }
877}
878/// Cuda memory object representing a 1D array with dimension number as type parameter
879#[derive(Debug)]
880pub struct CudaTensor1dPtr<T,const N:usize> where T: Default + Debug {
881    ptr:CudaMemoryPoolPtr<T>
882}
883impl<T,const N:usize> CudaTensor1dPtr<T,N> where T: Default + Debug {
884    /// Create an instance of CudaTensor1dPtr
885    /// # Arguments
886    /// * `memory_pool` - memory pool object
887    ///
888    /// # Errors
889    ///
890    /// This function may return the following errors
891    /// * [`CudaError`]
892    pub fn new(memory_pool:&Arc<Mutex<MemoryPool>>) -> Result<CudaTensor1dPtr<T,N>, CudaError> {
893        Ok(CudaTensor1dPtr {
894            ptr:CudaMemoryPoolPtr::new(N,memory_pool)?
895        })
896    }
897
898    /// Create an instance of CudaMemoryPoolPtr
899    /// # Arguments
900    /// * `memory_pool` - memory pool object
901    /// * `initializer` - Repeatedly called function to initialize each element
902    ///
903    /// # Errors
904    ///
905    /// This function may return the following errors
906    /// * [`CudaError`]
907    pub fn with_initializer<I: FnMut() -> T>(memory_pool:&Arc<Mutex<MemoryPool>>, initializer: I) -> Result<CudaTensor1dPtr<T,N>, CudaError> {
908        let mut ptr = CudaMemoryPoolPtr::new(N,memory_pool)?;
909
910        let mut src = Vec::with_capacity(N);
911
912        src.resize_with(N,initializer);
913
914        ptr.memcpy(src.into_boxed_slice().as_ptr(),N)?;
915
916        Ok(CudaTensor1dPtr {
917            ptr: ptr
918        })
919    }
920}
921impl<T,const N:usize> BatchDataType for CudaTensor1dPtr<T,N> where T: Default + Debug + UnitValue<T> {
922    type Type = CudaVec<T,CudaTensor1dPtr<T,N>>;
923}
924impl<T,const N:usize> AsCudaPtrRef for CudaTensor1dPtr<T,N> where T: Default + Debug {
925    type Pointer = CudaMemoryPoolPtr<T>;
926
927    #[inline]
928    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
929        &self.ptr
930    }
931}
932impl<T,const N:usize> AsCudaMutPtr for CudaTensor1dPtr<T,N> where T: Default + Debug {
933    type Pointer = CudaMemoryPoolPtr<T>;
934
935    #[inline]
936    fn as_cuda_mut_ptr<'a>(&'a mut self) -> CudaMutPtr<'a,Self::Pointer> {
937        CudaMutPtr::new(&mut self.ptr)
938    }
939}
940impl<T,const N:usize> TryClone for CudaTensor1dPtr<T,N> where T: Default + Debug {
941    type Error = CudaError;
942    fn try_clone(&self) -> Result<Self,CudaError> {
943        let mut dst = CudaMemoryPoolPtr::new(N,&self.ptr.memory_pool)?;
944
945        self.memcpy_to(&mut dst,N)?;
946
947        Ok(CudaTensor1dPtr {
948            ptr: dst
949        })
950    }
951}
952impl<'a,T,const N:usize> From<&'a CudaTensor1dPtr<T,N>> for &'a CudaMemoryPoolPtr<T> where T: Default + Debug {
953    fn from(value: &'a CudaTensor1dPtr<T,N>) -> Self {
954        &value.ptr
955    }
956}
957impl<'a,T,const N:usize> From<&'a mut CudaTensor1dPtr<T,N>> for &'a mut CudaMemoryPoolPtr<T> where T: Default + Debug {
958    fn from(value: &'a mut CudaTensor1dPtr<T,N>) -> Self {
959        &mut value.ptr
960    }
961}
962impl<'a,T,const N:usize> ToHost<T> for CudaTensor1dPtr<T,N>
963    where T: Default + Debug + Clone + Send + Sync + 'static {
964    type Output = Arr<T,N>;
965    fn to_host(self) -> Result<Self::Output,TypeConvertError> {
966        Ok(self.ptr.read_to_vec()?.try_into()?)
967    }
968}
969impl<T,const N:usize> MemorySize for CudaTensor1dPtr<T,N>
970    where T: Default + Debug {
971    #[inline]
972    fn size() -> usize {
973        N
974    }
975}
976impl<T,const N:usize> PointerElement for CudaTensor1dPtr<T,N> where T: Default + Debug {
977    type Element = T;
978}
979/// View into a Cuda memory object representing a 1D array with dimension number as a type parameter
980#[derive(Debug)]
981pub struct CudaTensor1dPtrView<'a,T,const N:usize>
982    where T: Default + Debug {
983    ptr:&'a CudaMemoryPoolPtr<T>
984}
985impl<'a,T,const N:usize> From<&'a CudaTensor1dPtr<T,N>> for CudaTensor1dPtrView<'a,T,N>
986    where T: Default + Debug {
987    fn from(value: &'a CudaTensor1dPtr<T, N>) -> Self {
988        CudaTensor1dPtrView {
989            ptr:&value.ptr
990        }
991    }
992}
993impl<'a,T,const N:usize> From<&'a CudaTensor1dPtrView<'a,T,N>> for CudaTensor1dPtrView<'a,T,N>
994    where T: Default + Debug {
995    fn from(value: &'a CudaTensor1dPtrView<'a,T,N>) -> Self {
996        CudaTensor1dPtrView {
997            ptr:&value.ptr
998        }
999    }
1000}
1001impl<'a,T,const N:usize> TryFrom<&'a CudaTensor1dPtrView<'a,T,N>> for CudaTensor1dPtr<T,N> where T: Default + Debug {
1002    type Error = CudaError;
1003    fn try_from(value: &'a CudaTensor1dPtrView<'a,T,N>) -> Result<CudaTensor1dPtr<T,N>,CudaError> {
1004        let mut dst = CudaMemoryPoolPtr::new(N,&value.ptr.memory_pool)?;
1005
1006        value.memcpy_to(&mut dst,N)?;
1007
1008        Ok(CudaTensor1dPtr {
1009            ptr: dst
1010        })
1011    }
1012}
1013impl<'a,T,const N:usize> PointerElement for CudaTensor1dPtrView<'a,T,N> where T: Default + Debug {
1014    type Element = T;
1015}
1016impl<'a,T,const N:usize> AsCudaPtrRef for CudaTensor1dPtrView<'a,T,N> where T: Default + Debug {
1017    type Pointer = CudaMemoryPoolPtr<T>;
1018
1019    #[inline]
1020    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1021        &self.ptr
1022    }
1023}
1024/// Cuda memory object representing a 2D array with dimension number as type parameter
1025#[derive(Debug)]
1026pub struct CudaTensor2dPtr<T,const N1:usize,const N2:usize> where T: Default + Debug {
1027    ptr:CudaMemoryPoolPtr<T>
1028}
1029impl<T,const N1:usize,const N2:usize> CudaTensor2dPtr<T,N1,N2> where T: Default + Debug {
1030    /// Create an instance of CudaTensor1dPtr
1031    /// # Arguments
1032    /// * `memory_pool` - memory pool object
1033    ///
1034    /// # Errors
1035    ///
1036    /// This function may return the following errors
1037    /// * [`CudaError`]
1038    pub fn new(memory_pool:&Arc<Mutex<MemoryPool>>) -> Result<CudaTensor2dPtr<T,N1,N2>, CudaError> {
1039        Ok(CudaTensor2dPtr {
1040            ptr:CudaMemoryPoolPtr::new(N1*N2,memory_pool)?
1041        })
1042    }
1043
1044    /// Create an instance of CudaMemoryPoolPtr
1045    /// # Arguments
1046    /// * `memory_pool` - memory pool object
1047    /// * `initializer` - Repeatedly called function to initialize each element
1048    ///
1049    /// # Errors
1050    ///
1051    /// This function may return the following errors
1052    /// * [`CudaError`]
1053    pub fn with_initializer<I: FnMut() -> T>(memory_pool:&Arc<Mutex<MemoryPool>>, initializer: I) -> Result<CudaTensor2dPtr<T,N1,N2>, CudaError> {
1054        let mut ptr = CudaMemoryPoolPtr::new(N1*N2,memory_pool)?;
1055
1056        let mut src = Vec::with_capacity(N1*N2);
1057
1058        src.resize_with(N1*N2,initializer);
1059
1060        ptr.memcpy(src.into_boxed_slice().as_ptr(),N1*N2)?;
1061
1062        Ok(CudaTensor2dPtr {
1063            ptr: ptr
1064        })
1065    }
1066}
1067impl<T,const N1:usize,const N2:usize> BatchDataType for CudaTensor2dPtr<T,N1,N2>
1068    where T: Default + Debug + UnitValue<T> {
1069    type Type = CudaVec<T,CudaTensor2dPtr<T,N1,N2>>;
1070}
1071impl<T,const N1:usize,const N2:usize> PointerElement for CudaTensor2dPtr<T,N1,N2> where T: Default + Debug {
1072    type Element = T;
1073}
1074impl<T,const N1:usize,const N2:usize> AsCudaPtrRef for CudaTensor2dPtr<T,N1,N2> where T: Default + Debug {
1075    type Pointer = CudaMemoryPoolPtr<T>;
1076    #[inline]
1077    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1078        &self.ptr
1079    }
1080}
1081impl<T,const N1:usize,const N2:usize> AsCudaMutPtr for CudaTensor2dPtr<T,N1,N2> where T: Default + Debug {
1082    type Pointer = CudaMemoryPoolPtr<T>;
1083
1084    #[inline]
1085    fn as_cuda_mut_ptr<'a>(&'a mut self) -> CudaMutPtr<'a,Self::Pointer> {
1086        CudaMutPtr::new(&mut self.ptr)
1087    }
1088}
1089impl<T,const N1:usize,const N2:usize> TryClone for CudaTensor2dPtr<T,N1,N2> where T: Default + Debug {
1090    type Error = CudaError;
1091    fn try_clone(&self) -> Result<Self,CudaError> {
1092        let mut dst = CudaMemoryPoolPtr::new(N1*N2,&self.ptr.memory_pool)?;
1093
1094        self.memcpy_to(&mut dst,N1*N2)?;
1095
1096        Ok(CudaTensor2dPtr {
1097            ptr: dst
1098        })
1099    }
1100}
1101impl<'a,T,const N1:usize,const N2:usize> From<&'a CudaTensor2dPtr<T,N1,N2>> for &'a CudaMemoryPoolPtr<T> where T: Default + Debug {
1102    fn from(value: &'a CudaTensor2dPtr<T,N1,N2>) -> Self {
1103        &value.ptr
1104    }
1105}
1106impl<'a,T,const N1:usize,const N2:usize> From<&'a mut CudaTensor2dPtr<T,N1,N2>> for &'a mut CudaMemoryPoolPtr<T> where T: Default + Debug {
1107    fn from(value: &'a mut CudaTensor2dPtr<T,N1,N2>) -> Self {
1108        &mut value.ptr
1109    }
1110}
1111impl<T,const N1:usize,const N2:usize> MemorySize for CudaTensor2dPtr<T,N1,N2>
1112    where T: Default + Debug {
1113    #[inline]
1114    fn size() -> usize {
1115        N1 * N2
1116    }
1117}
1118/// View into a Cuda memory object representing a 2D array with dimension number as a type parameter
1119#[derive(Debug)]
1120pub struct CudaTensor2dPtrView<'a,T,const N1:usize,const N2:usize>
1121    where T: Default + Debug {
1122    ptr:&'a CudaMemoryPoolPtr<T>
1123}
1124impl<'a,T,const N1:usize,const N2:usize> PointerElement for CudaTensor2dPtrView<'a,T,N1,N2> where T: Default + Debug {
1125    type Element = T;
1126}
1127impl<'a,T,const N1:usize,const N2:usize> AsCudaPtrRef for CudaTensor2dPtrView<'a,T,N1,N2> where T: Default + Debug {
1128    type Pointer = CudaMemoryPoolPtr<T>;
1129
1130    #[inline]
1131    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1132        &self.ptr
1133    }
1134}
1135impl<'a,T,const N1:usize,const N2:usize> From<&'a CudaTensor2dPtr<T,N1,N2>> for CudaTensor2dPtrView<'a,T,N1,N2>
1136    where T: Default + Debug {
1137    fn from(value: &'a CudaTensor2dPtr<T,N1,N2>) -> Self {
1138        CudaTensor2dPtrView {
1139            ptr:&value.ptr
1140        }
1141    }
1142}
1143impl<'a,T,const N1:usize,const N2:usize> From<&'a CudaTensor2dPtrView<'a,T,N1,N2>> for CudaTensor2dPtrView<'a,T,N1,N2>
1144    where T: Default + Debug {
1145    fn from(value: &'a CudaTensor2dPtrView<'a,T,N1,N2>) -> Self {
1146        CudaTensor2dPtrView {
1147            ptr:&value.ptr
1148        }
1149    }
1150}
1151impl<'a,T,const N1:usize,const N2:usize> TryFrom<&'a CudaTensor2dPtrView<'a,T,N1,N2>> for CudaTensor2dPtr<T,N1,N2>
1152    where T: Default + Debug {
1153    type Error = CudaError;
1154    fn try_from(value: &'a CudaTensor2dPtrView<'a,T,N1,N2>) -> Result<CudaTensor2dPtr<T,N1,N2>,CudaError> {
1155        let mut dst = CudaMemoryPoolPtr::new(N1*N2,&value.ptr.memory_pool)?;
1156
1157        value.memcpy_to(&mut dst,N1*N2)?;
1158
1159        Ok(CudaTensor2dPtr {
1160            ptr: dst
1161        })
1162    }
1163}
1164/// Cuda memory object representing a 3D array with dimension number as type parameter
1165#[derive(Debug)]
1166pub struct CudaTensor3dPtr<T,const N1:usize,const N2:usize,const N3:usize> where T: Default + Debug {
1167    ptr:CudaMemoryPoolPtr<T>
1168}
1169impl<T,const N1:usize,const N2:usize,const N3:usize> CudaTensor3dPtr<T,N1,N2,N3> where T: Default + Debug {
1170    /// Create an instance of CudaTensor1dPtr
1171    /// # Arguments
1172    /// * `memory_pool` - memory pool object
1173    ///
1174    /// # Errors
1175    ///
1176    /// This function may return the following errors
1177    /// * [`CudaError`]
1178    pub fn new(memory_pool:&Arc<Mutex<MemoryPool>>) -> Result<CudaTensor3dPtr<T,N1,N2,N3>, CudaError> {
1179        Ok(CudaTensor3dPtr {
1180            ptr:CudaMemoryPoolPtr::new(N1*N2*N3,memory_pool)?
1181        })
1182    }
1183
1184    /// Create an instance of CudaMemoryPoolPtr
1185    /// # Arguments
1186    /// * `memory_pool` - memory pool object
1187    /// * `initializer` - Repeatedly called function to initialize each element
1188    ///
1189    /// # Errors
1190    ///
1191    /// This function may return the following errors
1192    /// * [`CudaError`]
1193    pub fn with_initializer<I: FnMut() -> T>(memory_pool:&Arc<Mutex<MemoryPool>>, initializer: I) -> Result<CudaTensor3dPtr<T,N1,N2,N3>, CudaError> {
1194        let mut ptr = CudaMemoryPoolPtr::new(N1*N2*N3,memory_pool)?;
1195
1196        let mut src = Vec::with_capacity(N1*N2*N3);
1197
1198        src.resize_with(N1*N2*N3,initializer);
1199
1200        ptr.memcpy(src.into_boxed_slice().as_ptr(),N1*N2*N3)?;
1201
1202        Ok(CudaTensor3dPtr {
1203            ptr: ptr
1204        })
1205    }
1206}
1207impl<T,const N1:usize,const N2:usize,const N3:usize> BatchDataType for CudaTensor3dPtr<T,N1,N2,N3>
1208    where T: Default + Debug + UnitValue<T> {
1209    type Type = CudaVec<T,CudaTensor3dPtr<T,N1,N2,N3>>;
1210}
1211impl<T,const N1:usize,const N2:usize,const N3:usize> PointerElement for CudaTensor3dPtr<T,N1,N2,N3> where T: Default + Debug {
1212    type Element = T;
1213}
1214impl<T,const N1:usize,const N2:usize,const N3:usize> AsCudaPtrRef for CudaTensor3dPtr<T,N1,N2,N3> where T: Default + Debug {
1215    type Pointer = CudaMemoryPoolPtr<T>;
1216
1217    #[inline]
1218    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1219        &self.ptr
1220    }
1221}
1222impl<T,const N1:usize,const N2:usize,const N3:usize> AsCudaMutPtr for CudaTensor3dPtr<T,N1,N2,N3> where T: Default + Debug {
1223    type Pointer = CudaMemoryPoolPtr<T>;
1224
1225    #[inline]
1226    fn as_cuda_mut_ptr<'a>(&'a mut self) -> CudaMutPtr<'a,Self::Pointer> {
1227        CudaMutPtr::new(&mut self.ptr)
1228    }
1229}
1230impl<T,const N1:usize,const N2:usize,const N3:usize> TryClone for CudaTensor3dPtr<T,N1,N2,N3> where T: Default + Debug {
1231    type Error = CudaError;
1232    fn try_clone(&self) -> Result<Self,CudaError> {
1233        let mut dst = CudaMemoryPoolPtr::new(N1*N2*N3,&self.ptr.memory_pool)?;
1234
1235        self.memcpy_to(&mut dst,N1*N2*N3)?;
1236
1237        Ok(CudaTensor3dPtr {
1238            ptr: dst
1239        })
1240    }
1241}
1242impl<'a,T,const N1:usize,const N2:usize,const N3:usize> From<&'a CudaTensor3dPtr<T,N1,N2,N3>> for &'a CudaMemoryPoolPtr<T> where T: Default + Debug {
1243    fn from(value: &'a CudaTensor3dPtr<T,N1,N2,N3>) -> Self {
1244        &value.ptr
1245    }
1246}
1247impl<'a,T,const N1:usize,const N2:usize,const N3:usize> From<&'a mut CudaTensor3dPtr<T,N1,N2,N3>> for &'a mut CudaMemoryPoolPtr<T> where T: Default + Debug {
1248    fn from(value: &'a mut CudaTensor3dPtr<T,N1,N2,N3>) -> Self {
1249        &mut value.ptr
1250    }
1251}
1252impl<T,const N1:usize,const N2:usize,const N3:usize> MemorySize for CudaTensor3dPtr<T,N1,N2,N3>
1253    where T: Default + Debug {
1254    #[inline]
1255    fn size() -> usize {
1256        N1 * N2 * N3
1257    }
1258}
1259/// View into a Cuda memory object representing a 3D array with dimension number as a type parameter
1260#[derive(Debug)]
1261pub struct CudaTensor3dPtrView<'a,T,const N1:usize,const N2:usize,const N3:usize>
1262    where T: Default + Debug {
1263    ptr:&'a CudaMemoryPoolPtr<T>
1264}
1265impl<'a,T,const N1:usize,const N2:usize,const N3:usize> PointerElement for CudaTensor3dPtrView<'a,T,N1,N2,N3>
1266    where T: Default + Debug{
1267    type Element = T;
1268}
1269impl<'a,T,const N1:usize,const N2:usize,const N3:usize> AsCudaPtrRef for CudaTensor3dPtrView<'a,T,N1,N2,N3> where T: Default + Debug {
1270    type Pointer = CudaMemoryPoolPtr<T>;
1271
1272    #[inline]
1273    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1274        &self.ptr
1275    }
1276}
1277impl<'a,T,const N1:usize,const N2:usize,const N3:usize> From<&'a CudaTensor3dPtr<T,N1,N2,N3>> for CudaTensor3dPtrView<'a,T,N1,N2,N3>
1278    where T: Default + Debug {
1279    fn from(value: &'a CudaTensor3dPtr<T,N1,N2,N3>) -> Self {
1280        CudaTensor3dPtrView {
1281            ptr:&value.ptr
1282        }
1283    }
1284}
1285impl<'a,T,const N1:usize,const N2:usize,const N3:usize> From<&'a CudaTensor3dPtrView<'a,T,N1,N2,N3>> for CudaTensor3dPtrView<'a,T,N1,N2,N3>
1286    where T: Default + Debug {
1287    fn from(value: &'a CudaTensor3dPtrView<'a,T,N1,N2,N3>) -> Self {
1288        CudaTensor3dPtrView {
1289            ptr:&value.ptr
1290        }
1291    }
1292}
1293impl<'a,T,const N1:usize,const N2:usize,const N3:usize> TryFrom<&'a CudaTensor3dPtrView<'a,T,N1,N2,N3>> for CudaTensor3dPtr<T,N1,N2,N3>
1294    where T: Default + Debug {
1295    type Error = CudaError;
1296    fn try_from(value: &'a CudaTensor3dPtrView<'a,T,N1,N2,N3>) -> Result<CudaTensor3dPtr<T,N1,N2,N3>,CudaError> {
1297        let mut dst = CudaMemoryPoolPtr::new(N1*N2*N3,&value.ptr.memory_pool)?;
1298
1299        value.memcpy_to(&mut dst,N1*N2*N3)?;
1300
1301        Ok(CudaTensor3dPtr {
1302            ptr: dst
1303        })
1304    }
1305}
1306/// Cuda memory object representing a 4D array with dimension number as type parameter
1307#[derive(Debug)]
1308pub struct CudaTensor4dPtr<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> where T: Default + Debug {
1309    ptr:CudaMemoryPoolPtr<T>
1310}
1311impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> CudaTensor4dPtr<T,N1,N2,N3,N4> where T: Default + Debug {
1312    /// Create an instance of CudaTensor1dPtr
1313    /// # Arguments
1314    /// * `memory_pool` - memory pool object
1315    ///
1316    /// # Errors
1317    ///
1318    /// This function may return the following errors
1319    /// * [`CudaError`]
1320    pub fn new(memory_pool:&Arc<Mutex<MemoryPool>>) -> Result<CudaTensor4dPtr<T,N1,N2,N3,N4>, CudaError> {
1321        Ok(CudaTensor4dPtr {
1322            ptr:CudaMemoryPoolPtr::new(N1*N2*N3*N4,memory_pool)?
1323        })
1324    }
1325
1326    /// Create an instance of CudaMemoryPoolPtr
1327    /// # Arguments
1328    /// * `memory_pool` - memory pool object
1329    /// * `initializer` - Repeatedly called function to initialize each element
1330    ///
1331    /// # Errors
1332    ///
1333    /// This function may return the following errors
1334    /// * [`CudaError`]
1335    pub fn with_initializer<I: FnMut() -> T>(memory_pool:&Arc<Mutex<MemoryPool>>, initializer: I) -> Result<CudaTensor4dPtr<T,N1,N2,N3,N4>, CudaError> {
1336        let mut ptr = CudaMemoryPoolPtr::new(N1*N2*N3*N4,memory_pool)?;
1337
1338        let mut src = Vec::with_capacity(N1*N2*N3*N4);
1339
1340        src.resize_with(N1*N2*N3*N4,initializer);
1341
1342        ptr.memcpy(src.into_boxed_slice().as_ptr(),N1*N2*N3*N4)?;
1343
1344        Ok(CudaTensor4dPtr {
1345            ptr: ptr
1346        })
1347    }
1348}
1349impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> BatchDataType for CudaTensor4dPtr<T,N1,N2,N3,N4>
1350    where T: Default + Debug + UnitValue<T> {
1351    type Type = CudaVec<T,CudaTensor4dPtr<T,N1,N2,N3,N4>>;
1352}
1353impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> MemorySize for CudaTensor4dPtr<T,N1,N2,N3,N4>
1354    where T: Default + Debug {
1355    #[inline]
1356    fn size() -> usize {
1357        N1 * N2 * N3 * N4
1358    }
1359}
1360impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> PointerElement for CudaTensor4dPtr<T,N1,N2,N3,N4>
1361    where T: Default + Debug{
1362    type Element = T;
1363}
1364impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> AsCudaPtrRef for CudaTensor4dPtr<T,N1,N2,N3,N4>
1365    where T: Default + Debug {
1366    type Pointer = CudaMemoryPoolPtr<T>;
1367
1368    #[inline]
1369    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1370        &self.ptr
1371    }
1372}
1373impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> AsCudaMutPtr for CudaTensor4dPtr<T,N1,N2,N3,N4>
1374    where T: Default + Debug {
1375    type Pointer = CudaMemoryPoolPtr<T>;
1376
1377    #[inline]
1378    fn as_cuda_mut_ptr<'a>(&'a mut self) -> CudaMutPtr<'a,Self::Pointer> {
1379        CudaMutPtr::new(&mut self.ptr)
1380    }
1381}
1382impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> TryClone for CudaTensor4dPtr<T,N1,N2,N3,N4>
1383    where T: Default + Debug {
1384    type Error = CudaError;
1385    fn try_clone(&self) -> Result<Self,CudaError> {
1386        let mut dst = CudaMemoryPoolPtr::new(N1*N2*N3*N4,&self.ptr.memory_pool)?;
1387
1388        self.memcpy_to(&mut dst,N1*N2*N3*N4)?;
1389
1390        Ok(CudaTensor4dPtr {
1391            ptr: dst
1392        })
1393    }
1394}
1395impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> From<&'a CudaTensor4dPtr<T,N1,N2,N3,N4>> for &'a CudaMemoryPoolPtr<T> where T: Default + Debug {
1396    fn from(value: &'a CudaTensor4dPtr<T,N1,N2,N3,N4>) -> Self {
1397        &value.ptr
1398    }
1399}
1400impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> From<&'a mut CudaTensor4dPtr<T,N1,N2,N3,N4>> for &'a mut CudaMemoryPoolPtr<T> where T: Default + Debug {
1401    fn from(value: &'a mut CudaTensor4dPtr<T,N1,N2,N3,N4>) -> Self {
1402        &mut value.ptr
1403    }
1404}
1405/// View into a Cuda memory object representing a 4D array with dimension number as a type parameter
1406#[derive(Debug)]
1407pub struct CudaTensor4dPtrView<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize>
1408    where T: Default + Debug {
1409    ptr:&'a CudaMemoryPoolPtr<T>
1410}
1411impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> PointerElement for CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>
1412    where T: Default + Debug{
1413    type Element = T;
1414}
1415impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> AsCudaPtrRef for CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>
1416    where T: Default + Debug {
1417    type Pointer = CudaMemoryPoolPtr<T>;
1418
1419    #[inline]
1420    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1421        &self.ptr
1422    }
1423}
1424impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> From<&'a CudaTensor4dPtr<T,N1,N2,N3,N4>>
1425    for CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>
1426    where T: Default + Debug {
1427    fn from(value: &'a CudaTensor4dPtr<T,N1,N2,N3,N4>) -> Self {
1428        CudaTensor4dPtrView {
1429            ptr:&value.ptr
1430        }
1431    }
1432}
1433impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> From<&'a CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>>
1434    for CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>
1435    where T: Default + Debug {
1436    fn from(value: &'a CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>) -> Self {
1437        CudaTensor4dPtrView {
1438            ptr:&value.ptr
1439        }
1440    }
1441}
1442impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> TryFrom<&'a CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>>
1443    for CudaTensor4dPtr<T,N1,N2,N3,N4>
1444    where T: Default + Debug {
1445    type Error = CudaError;
1446
1447    fn try_from(value: &'a CudaTensor4dPtrView<'a, T, N1, N2, N3, N4>) -> Result<Self, CudaError> {
1448        let mut dst = CudaMemoryPoolPtr::new(N1*N2*N3*N4,&value.ptr.memory_pool)?;
1449
1450        value.memcpy_to(&mut dst,N1*N2*N3*N4)?;
1451
1452        Ok(CudaTensor4dPtr {
1453            ptr: dst
1454        })
1455    }
1456}
1457/// Trait that returns the size of Cuda smart point type memory (returns the number of elements)
1458pub trait MemorySize {
1459    fn size() -> usize;
1460}
1461#[derive(Debug)]
1462pub struct CudaVec<U,T>
1463    where U: UnitValue<U>,
1464          T: AsConstKernelPtr + AsKernelPtr {
1465    len: usize,
1466    ptr:CudaMemoryPoolPtr<U>,
1467    t:PhantomData<T>
1468}
1469impl<U,T> CudaVec<U,T>
1470    where U: UnitValue<U>,
1471          T: AsConstKernelPtr + AsKernelPtr + MemorySize {
1472    /// Create an instance of CudaVec
1473    /// # Arguments
1474    /// * `size`- Number of value elements to be allocated
1475    /// * `memory_pool` - memory pool object
1476    ///
1477    /// # Errors
1478    ///
1479    /// This function may return the following errors
1480    /// * [`CudaError`]
1481    pub fn new(size: usize, memory_pool:&Arc<Mutex<MemoryPool>>) -> Result<CudaVec<U,T>, CudaError> {
1482        let ptr = CudaMemoryPoolPtr::new(size * T::size(), memory_pool)?;
1483
1484        Ok(CudaVec {
1485            len:size,
1486            ptr,
1487            t:PhantomData::<T>
1488        })
1489    }
1490    /// Create an instance of CudaVec
1491    /// # Arguments
1492    /// * `size`- Number of value elements to be allocated
1493    /// * `memory_pool` - memory pool object
1494    /// * `initializer` - Repeatedly called function to initialize each element
1495    ///
1496    /// # Errors
1497    ///
1498    /// This function may return the following errors
1499    /// * [`CudaError`]
1500    pub fn with_initializer<I: FnMut() -> U>(size: usize, memory_pool:&Arc<Mutex<MemoryPool>>, initializer: I) -> Result<CudaVec<U,T>, CudaError> {
1501        let mut ptr = CudaMemoryPoolPtr::new(size * T::size(),memory_pool)?;
1502
1503        let mut src = Vec::with_capacity(size * T::size());
1504
1505        src.resize_with(size * T::size(),initializer);
1506
1507        ptr.memcpy(src.into_boxed_slice().as_ptr(),size * T::size())?;
1508
1509        Ok(CudaVec {
1510            len:size,
1511            ptr,
1512            t:PhantomData::<T>
1513        })
1514    }
1515}
1516impl<U,T> BatchSize for CudaVec<U,T>
1517    where U: UnitValue<U>,
1518          T: AsConstKernelPtr + AsKernelPtr + MemorySize {
1519    fn size(&self) -> usize {
1520        self.len
1521    } 
1522}
1523impl<U,T> PointerElement for CudaVec<U,T>
1524    where U: UnitValue<U>,
1525          T: AsConstKernelPtr + AsKernelPtr {
1526    type Element = U;
1527}
1528impl<U,T> AsCudaPtrRef for CudaVec<U,T>
1529    where U: UnitValue<U>,
1530          T: AsConstKernelPtr + AsKernelPtr + MemorySize {
1531    type Pointer = CudaMemoryPoolPtr<U>;
1532
1533    #[inline]
1534    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1535        &self.ptr
1536    }
1537}
1538impl<U,T> AsCudaMutPtr for CudaVec<U,T>
1539    where U: UnitValue<U>,
1540          T: AsConstKernelPtr + AsKernelPtr {
1541    type Pointer = CudaMemoryPoolPtr<U>;
1542
1543    #[inline]
1544    fn as_cuda_mut_ptr<'a>(&'a mut self) -> CudaMutPtr<'a,Self::Pointer> {
1545        CudaMutPtr::new(&mut self.ptr)
1546    }
1547}
1548impl<U,T> TryClone for CudaVec<U,T>
1549    where U: UnitValue<U>,
1550          T: AsConstKernelPtr + AsKernelPtr + MemorySize {
1551    type Error = CudaError;
1552    fn try_clone(&self) -> Result<Self,CudaError> {
1553        let mut dst = CudaMemoryPoolPtr::new(self.len * T::size(),&self.ptr.memory_pool)?;
1554
1555        self.memcpy_to(&mut dst,self.len * T::size())?;
1556
1557        Ok(CudaVec {
1558            len: self.len,
1559            ptr: dst,
1560            t:PhantomData::<T>
1561        })
1562    }
1563}
1564impl<'a,U,T> ToCuda<U> for &'a CudaVec<U,T>
1565    where U: UnitValue<U>,
1566          T: AsConstKernelPtr + AsKernelPtr + MemorySize {
1567    type Output = CudaVecView<'a,U,T>;
1568
1569    fn to_cuda(self, _: &DeviceGpu<U>) -> Result<Self::Output, TypeConvertError> {
1570        Ok(self.try_into()?)
1571    }
1572}
1573impl<U,T> ToCuda<U> for CudaVec<U,T>
1574    where U: UnitValue<U>,
1575          T: AsConstKernelPtr + AsKernelPtr + MemorySize {
1576    type Output = CudaVec<U,T>;
1577
1578    fn to_cuda(self, _: &DeviceGpu<U>) -> Result<Self::Output, TypeConvertError> {
1579        Ok(self)
1580    }
1581}
1582#[derive(Debug)]
1583pub struct CudaVecView<'a,U,T>
1584    where U: UnitValue<U>,
1585          T: AsConstKernelPtr {
1586    len: usize,
1587    ptr:&'a CudaMemoryPoolPtr<U>,
1588    t:PhantomData<T>
1589}
1590impl<'a,U,T> BatchSize for CudaVecView<'a,U,T>
1591    where U: UnitValue<U>,
1592          T: AsConstKernelPtr + MemorySize {
1593    fn size(&self) -> usize {
1594        self.len
1595    }
1596}
1597impl<'a,U,T> PointerElement for CudaVecView<'a,U,T>
1598    where U: UnitValue<U>,
1599          T: AsConstKernelPtr + AsKernelPtr {
1600    type Element = U;
1601}
1602impl<'a,U,T> AsCudaPtrRef for CudaVecView<'a,U,T>
1603    where U: UnitValue<U>,
1604          T: AsConstKernelPtr + MemorySize {
1605    type Pointer = CudaMemoryPoolPtr<U>;
1606
1607    #[inline]
1608    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1609        self.ptr
1610    }
1611}
1612impl<'a,U,T,R> TryFrom<&'a CudaVec<U,T>> for CudaVecView<'a,U,R>
1613    where U: UnitValue<U> + Default + Clone + Send,
1614          T: MemorySize + AsKernelPtr + AsConstKernelPtr,
1615          R: MemorySize + AsKernelPtr + AsConstKernelPtr + TryFrom<T> {
1616    type Error = TypeConvertError;
1617
1618    fn try_from(value: &'a CudaVec<U,T>) -> Result<Self, Self::Error> {
1619        if T::size() != R::size() {
1620            Err(TypeConvertError::SizeMismatchError(SizeMismatchError(T::size(),R::size())))
1621        } else {
1622            Ok(CudaVecView {
1623                len:value.size(),
1624                ptr: &value.ptr,
1625                t:PhantomData::<R>
1626            })
1627        }
1628    }
1629}
1630pub struct CudaVecViewConverter<'a,U,T>
1631    where U: UnitValue<U> + Default + Clone + Send,
1632          T: MemorySize + AsConstKernelPtr {
1633    len:usize,
1634    ptr:&'a CudaMemoryPoolPtr<U>,
1635    t:PhantomData<T>
1636}
1637impl<'a,U,T> IntoConverter for CudaVecView<'a,U,T>
1638    where U: UnitValue<U> + Default + Clone + Send,
1639          T: MemorySize + AsConstKernelPtr {
1640    type Converter = CudaVecViewConverter<'a,U,T>;
1641
1642    fn into_converter(self) -> Self::Converter {
1643        CudaVecViewConverter {
1644            len:self.len,
1645            ptr:self.ptr,
1646            t:PhantomData::<T>
1647        }
1648    }
1649}
1650impl<'a,U,T,R> TryFrom<CudaVecViewConverter<'a,U,T>> for CudaVecView<'a,U,R>
1651    where U: UnitValue<U> + Default + Clone + Send,
1652          T: MemorySize + AsConstKernelPtr,
1653          R: MemorySize + AsConstKernelPtr + From<T> {
1654    type Error = TypeConvertError;
1655
1656    #[inline]
1657    fn try_from(value: CudaVecViewConverter<'a,U,T>) -> Result<Self, Self::Error> {
1658        if T::size() != R::size() {
1659            Err(TypeConvertError::SizeMismatchError(SizeMismatchError(T::size(),R::size())))
1660        } else {
1661            let len = value.len;
1662
1663            Ok(CudaVecView {
1664                len:len,
1665                ptr: value.ptr,
1666                t:PhantomData::<R>
1667            })
1668        }
1669    }
1670}
1671impl<'a,U,T> TryFrom<&'a CudaVecView<'a,U,T>> for CudaVec<U,T>
1672    where U: UnitValue<U> + Default + Clone + Send,
1673          T: MemorySize + AsConstKernelPtr + AsKernelPtr {
1674    type Error = CudaError;
1675
1676    fn try_from(value: &'a CudaVecView<'a, U, T>) -> Result<Self, Self::Error> {
1677        let mut dst = CudaMemoryPoolPtr::new(value.size() * T::size(),&value.ptr.memory_pool)?;
1678
1679        value.memcpy_to(&mut dst,value.size() * T::size())?;
1680
1681        Ok(CudaVec {
1682            len: value.size(),
1683            ptr: dst,
1684            t:PhantomData::<T>
1685        })
1686    }
1687}
1688pub struct CudaVecConverter<U,T>
1689    where U: UnitValue<U> + Default + Clone + Send,
1690          T: MemorySize + AsKernelPtr + AsConstKernelPtr {
1691    len:usize,
1692    ptr:CudaMemoryPoolPtr<U>,
1693    u:PhantomData<U>,
1694    t:PhantomData<T>
1695}
1696impl<U,T> IntoConverter for CudaVec<U,T>
1697    where U: UnitValue<U> + Default + Clone + Send,
1698          T: MemorySize + AsKernelPtr + AsConstKernelPtr {
1699    type Converter = CudaVecConverter<U,T>;
1700
1701    fn into_converter(self) -> Self::Converter {
1702        CudaVecConverter {
1703            len:self.len,
1704            ptr:self.ptr,
1705            u:PhantomData::<U>,
1706            t:PhantomData::<T>
1707        }
1708    }
1709}
1710impl<U,T> BatchSize for CudaVecConverter<U,T>
1711    where U: UnitValue<U> + Default + Clone + Send,
1712          T: MemorySize + AsKernelPtr + AsConstKernelPtr {
1713    fn size(&self) -> usize {
1714        self.len
1715    }
1716}
1717impl<U,T> From<CudaVecConverter<U,T>> for CudaMemoryPoolPtr<U> 
1718    where U: UnitValue<U> + Default + Clone + Send,
1719          T: MemorySize + AsKernelPtr + AsConstKernelPtr {
1720    fn from(value: CudaVecConverter<U, T>) -> Self {
1721        value.ptr
1722    }
1723}
1724impl<U,T,R> TryFrom<CudaVecConverter<U,T>> for CudaVec<U,R>
1725    where U: UnitValue<U> + Default + Clone + Send,
1726          T: MemorySize + AsKernelPtr + AsConstKernelPtr,
1727          R: MemorySize + AsKernelPtr + AsConstKernelPtr + From<T> {
1728    type Error = TypeConvertError;
1729
1730    #[inline]
1731    fn try_from(value: CudaVecConverter<U,T>) -> Result<Self, Self::Error> {
1732        if T::size() != R::size() {
1733            Err(TypeConvertError::SizeMismatchError(SizeMismatchError(T::size(),R::size())))
1734        } else {
1735            let len = value.size();
1736
1737            Ok(CudaVec {
1738                len:len,
1739                ptr: value.into(),
1740                t:PhantomData::<R>
1741            })
1742        }
1743    }
1744}
1745impl<U,T,R> TryFrom<CudaVecConverter<U,T>> for SerializedVec<U,R>
1746    where U: Debug + Default + Clone + Copy + Send + UnitValue<U>,
1747          for<'a> T: MemorySize + AsKernelPtr + AsConstKernelPtr,
1748          for<'b> R: SliceSize + AsRawSlice<U> + MakeView<'b,U> + MakeViewMut<'b,U> {
1749    type Error = TypeConvertError;
1750    #[inline]
1751    fn try_from(value: CudaVecConverter<U,T>) -> Result<Self, Self::Error> {
1752        if T::size() != R::slice_size() {
1753            Err(TypeConvertError::SizeMismatchError(SizeMismatchError(T::size(),R::slice_size())))
1754        } else {
1755            Ok(value.ptr.read_to_vec()?.into_boxed_slice().try_into()?)
1756        }
1757    }
1758}
1759impl<U,T> ToHost<U> for CudaVec<U,T>
1760    where U: Debug + Default + Clone + Copy + Send + UnitValue<U>,
1761          SerializedVec<U,<T as ToHost<U>>::Output>: TryFrom<Box<[U]>,Error=TypeConvertError>,
1762          for<'a> <T as ToHost<U>>::Output: SliceSize + MakeView<'a,U>,
1763          for<'a> T: MemorySize + AsKernelPtr + AsConstKernelPtr + ToHost<U> {
1764    type Output = SerializedVec<U,<T as ToHost<U>>::Output>;
1765    #[inline]
1766    fn to_host(self) -> Result<Self::Output,TypeConvertError> {
1767        if T::size() != <T as ToHost<U>>::Output::slice_size() {
1768            Err(TypeConvertError::SizeMismatchError(SizeMismatchError(T::size(),<T as ToHost<U>>::Output::slice_size())))
1769        } else {
1770            Ok(self.ptr.read_to_vec()?.into_boxed_slice().try_into()?)
1771        }
1772    }
1773}
1774impl TryFrom<f32> for CudaPtr<f32> {
1775    type Error = CudaError;
1776
1777    fn try_from(value: f32) -> Result<Self, Self::Error> {
1778        let mut ptr:CudaPtr<f32> = CudaPtr::new(1)?;
1779        ptr.memcpy(&value as *const f32,1)?;
1780        Ok(ptr)
1781    }
1782}
1783impl TryFrom<f64> for CudaPtr<f64> {
1784    type Error = CudaError;
1785
1786    fn try_from(value: f64) -> Result<Self, Self::Error> {
1787        let mut ptr:CudaPtr<f64> = CudaPtr::new(1)?;
1788        ptr.memcpy(&value as *const f64,1)?;
1789        Ok(ptr)
1790    }
1791}
1792impl TryFrom<i32> for CudaPtr<i32> {
1793    type Error = CudaError;
1794
1795    fn try_from(value: i32) -> Result<Self, Self::Error> {
1796        let mut ptr:CudaPtr<i32> = CudaPtr::new(1)?;
1797        ptr.memcpy(&value as *const i32,1)?;
1798        Ok(ptr)
1799    }
1800}
1801impl TryFrom<i64> for CudaPtr<i64> {
1802    type Error = CudaError;
1803
1804    fn try_from(value: i64) -> Result<Self, Self::Error> {
1805        let mut ptr:CudaPtr<i64> = CudaPtr::new(1)?;
1806        ptr.memcpy(&value as *const i64,1)?;
1807        Ok(ptr)
1808    }
1809}
1810impl TryFrom<f32> for CudaHostPtr<f32> {
1811    type Error = CudaError;
1812
1813    fn try_from(value: f32) -> Result<Self, Self::Error> {
1814        let mut ptr:CudaHostPtr<f32> = CudaHostPtr::new(1,cudaHostAllocDefault)?;
1815        ptr.memcpy(&value as *const f32,1)?;
1816        Ok(ptr)
1817    }
1818}
1819impl TryFrom<f64> for CudaHostPtr<f64> {
1820    type Error = CudaError;
1821
1822    fn try_from(value: f64) -> Result<Self, Self::Error> {
1823        let mut ptr:CudaHostPtr<f64> = CudaHostPtr::new(1,cudaHostAllocDefault)?;
1824        ptr.memcpy(&value as *const f64,1)?;
1825        Ok(ptr)
1826    }
1827}
1828impl TryFrom<i32> for CudaHostPtr<i32> {
1829    type Error = CudaError;
1830
1831    fn try_from(value: i32) -> Result<Self, Self::Error> {
1832        let mut ptr:CudaHostPtr<i32> = CudaHostPtr::new(1,cudaHostAllocDefault)?;
1833        ptr.memcpy(&value as *const i32,1)?;
1834        Ok(ptr)
1835    }
1836}
1837impl TryFrom<i64> for CudaHostPtr<i64> {
1838    type Error = CudaError;
1839
1840    fn try_from(value: i64) -> Result<Self, Self::Error> {
1841        let mut ptr:CudaHostPtr<i64> = CudaHostPtr::new(1,cudaHostAllocDefault)?;
1842        ptr.memcpy(&value as *const i64,1)?;
1843        Ok(ptr)
1844    }
1845}
1846/// Trait to convert value to Cuda smart pointer type
1847pub trait ToCuda<T> where T: UnitValue<T> {
1848    type Output;
1849
1850    /// # Arguments
1851    /// * `device` - gpu device
1852    ///
1853    /// # Errors
1854    ///
1855    /// This function may return the following errors
1856    /// * [`TypeConvertError`]
1857    ///
1858    fn to_cuda(self,device:&DeviceGpu<T>) -> Result<Self::Output,TypeConvertError>;
1859}
1860/// Trait for inverse conversion of value to host memory type
1861pub trait ToHost<T> where T: Default + Clone + Send {
1862    type Output;
1863
1864    /// # Errors
1865    ///
1866    /// This function may return the following errors
1867    /// * [`TypeConvertError`]
1868    ///
1869    fn to_host(self) -> Result<Self::Output,TypeConvertError>;
1870}
1871impl<'a,T,const N:usize> ToCuda<T> for &'a CudaTensor1dPtr<T,N>
1872    where T :UnitValue<T> {
1873    type Output = CudaTensor1dPtrView<'a,T,N>;
1874
1875    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1876        Ok(self.into())
1877    }
1878}
1879impl<T,const N:usize> ToCuda<T> for CudaTensor1dPtr<T,N>
1880    where T :UnitValue<T> {
1881    type Output = CudaTensor1dPtr<T,N>;
1882
1883    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1884        Ok(self)
1885    }
1886}
1887impl<'a,T,const N1:usize,const N2:usize> ToCuda<T> for &'a CudaTensor2dPtr<T,N1,N2>
1888    where T :UnitValue<T> {
1889    type Output = CudaTensor2dPtrView<'a,T,N1,N2>;
1890
1891    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1892        Ok(self.into())
1893    }
1894}
1895impl<T,const N1:usize,const N2:usize> ToCuda<T> for CudaTensor2dPtr<T,N1,N2>
1896    where T :UnitValue<T> {
1897    type Output = CudaTensor2dPtr<T,N1,N2>;
1898
1899    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1900        Ok(self)
1901    }
1902}
1903impl<'a,T,const N1:usize,const N2:usize,const N3:usize> ToCuda<T> for &'a CudaTensor3dPtr<T,N1,N2,N3>
1904    where T :UnitValue<T> {
1905    type Output = CudaTensor3dPtrView<'a,T,N1,N2,N3>;
1906
1907    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1908        Ok(self.into())
1909    }
1910}
1911impl<T,const N1:usize,const N2:usize,const N3:usize> ToCuda<T> for CudaTensor3dPtr<T,N1,N2,N3>
1912    where T :UnitValue<T> {
1913    type Output = CudaTensor3dPtr<T,N1,N2,N3>;
1914
1915    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1916        Ok(self)
1917    }
1918}
1919impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> ToCuda<T> for &'a CudaTensor4dPtr<T,N1,N2,N3,N4>
1920    where T :UnitValue<T> {
1921    type Output = CudaTensor4dPtrView<'a,T,N1,N2,N3,N4>;
1922
1923    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1924        Ok(self.into())
1925    }
1926}
1927impl<T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> ToCuda<T> for CudaTensor4dPtr<T,N1,N2,N3,N4>
1928    where T :UnitValue<T> {
1929    type Output = CudaTensor4dPtr<T,N1,N2,N3,N4>;
1930
1931    fn to_cuda(self, _: &DeviceGpu<T>) -> Result<Self::Output,TypeConvertError> {
1932        Ok(self)
1933    }
1934}
1935/// Trait that defines the ability to get a reference to a cuda smart pointer
1936pub trait AsCudaPtrRef {
1937    /// Returned Cuda smart pointer type
1938    type Pointer: AsConstKernelPtr;
1939
1940    fn as_cuda_ptr_ref(&self) -> &Self::Pointer;
1941}
1942impl<'a,T,const N:usize> AsCudaPtrRef for &'a CudaTensor1dPtr<T,N>
1943    where T: Default + Debug {
1944    type Pointer = CudaMemoryPoolPtr<T>;
1945
1946    #[inline]
1947    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1948        &self.ptr
1949    }
1950}
1951impl<'a,T,const N1:usize,const N2:usize> AsCudaPtrRef for &'a CudaTensor2dPtr<T,N1,N2>
1952    where T: Default + Debug {
1953    type Pointer = CudaMemoryPoolPtr<T>;
1954
1955    #[inline]
1956    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1957        &self.ptr
1958    }
1959}
1960impl<'a,T,const N1:usize,const N2:usize,const N3:usize> AsCudaPtrRef for &'a CudaTensor3dPtr<T,N1,N2,N3>
1961    where T: Default + Debug {
1962    type Pointer = CudaMemoryPoolPtr<T>;
1963
1964    #[inline]
1965    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1966        &self.ptr
1967    }
1968}
1969impl<'a,T,const N1:usize,const N2:usize,const N3:usize,const N4:usize> AsCudaPtrRef for &'a CudaTensor4dPtr<T,N1,N2,N3,N4>
1970    where T: Default + Debug {
1971    type Pointer = CudaMemoryPoolPtr<T>;
1972
1973    #[inline]
1974    fn as_cuda_ptr_ref(&self) -> &Self::Pointer {
1975        &self.ptr
1976    }
1977}
1978impl<CP> private::AsConstKernelPtrBase for CP
1979    where CP: AsCudaPtrRef,
1980          <CP as AsCudaPtrRef>::Pointer: private::AsConstKernelPtrBase {
1981    #[inline]
1982    fn as_const_kernel_ptr(&self) -> *mut c_void {
1983        self.as_cuda_ptr_ref().as_const_kernel_ptr()
1984    }
1985}
1986impl<CP> AsVoidPtr for CP
1987    where CP: AsCudaPtrRef,
1988          <CP as AsCudaPtrRef>::Pointer: AsVoidPtr {
1989    #[inline]
1990    fn as_void_ptr(&self) -> *const c_void {
1991        self.as_cuda_ptr_ref().as_void_ptr()
1992    }
1993}
1994impl<CP,T> AsPtr<T> for CP
1995    where CP: AsCudaPtrRef,
1996          <CP as AsCudaPtrRef>::Pointer: AsPtr<T> {
1997    #[inline]
1998    fn as_ptr(&self) -> *const T {
1999        self.as_cuda_ptr_ref().as_ptr()
2000    }
2001}
2002impl<CP> ReadMemory<<CP as PointerElement>::Element> for CP
2003    where CP: AsCudaPtrRef + PointerElement,
2004          <CP as AsCudaPtrRef>::Pointer: ReadMemory<<CP as PointerElement>::Element> {
2005    #[inline]
2006    fn read_to_vec(&self) -> Result<Vec<<CP as PointerElement>::Element>, Error> {
2007        self.as_cuda_ptr_ref().read_to_vec()
2008    }
2009
2010    #[inline]
2011    fn read_to_vec_with_size(&self, size: usize) -> Result<Vec<<CP as PointerElement>::Element>, Error> {
2012        self.as_cuda_ptr_ref().read_to_vec_with_size(size)
2013    }
2014}
2015impl<CP> ReadMemoryAsync<<CP as PointerElement>::Element> for CP
2016    where CP: AsCudaPtrRef + PointerElement,
2017          <CP as AsCudaPtrRef>::Pointer: ReadMemoryAsync<<CP as PointerElement>::Element> {
2018    #[inline]
2019    fn read_to_vec_async(&self, stream: cudaStream_t) -> Result<Vec<<CP as PointerElement>::Element>, Error> {
2020        self.as_cuda_ptr_ref().read_to_vec_async(stream)
2021    }
2022    #[inline]
2023    fn read_to_vec_with_size_async(&self, stream: cudaStream_t, size: usize) -> Result<Vec<<CP as PointerElement>::Element>, Error> {
2024        self.as_cuda_ptr_ref().read_to_vec_with_size_async(stream,size)
2025    }
2026}
2027impl<CP,D> MemoryMoveTo<<CP as PointerElement>::Element,D> for CP
2028    where CP: AsCudaPtrRef + PointerElement,
2029          D: AsCudaMutPtr,
2030          <CP as AsCudaPtrRef>::Pointer: MemoryMoveTo<<CP as PointerElement>::Element,<D as AsCudaMutPtr>::Pointer>,
2031          <D as AsCudaMutPtr>::Pointer: AsMutPtr<<CP as PointerElement>::Element> {
2032    #[inline]
2033    fn memcpy_to(&self, dst: &mut D, len: usize) -> Result<usize, Error> {
2034        self.as_cuda_ptr_ref().memcpy_to(dst.as_cuda_mut_ptr().ptr,len)
2035    }
2036}
2037impl<CP> MemoryMoveTo<<CP as PointerElement>::Element,CudaPtr<<CP as PointerElement>::Element>> for CP
2038    where CP: AsCudaPtrRef + PointerElement,
2039          <CP as AsCudaPtrRef>::Pointer: MemoryMoveTo<<CP as PointerElement>::Element,CudaPtr<<CP as PointerElement>::Element>> {
2040    #[inline]
2041    fn memcpy_to(&self, dst: &mut CudaPtr<<CP as PointerElement>::Element>, len: usize) -> Result<usize, Error> {
2042        self.as_cuda_ptr_ref().memcpy_to(dst,len)
2043    }
2044}
2045impl<CP> MemoryMoveTo<<CP as PointerElement>::Element,CudaHostPtr<<CP as PointerElement>::Element>> for CP
2046    where CP: AsCudaPtrRef + PointerElement,
2047          <CP as AsCudaPtrRef>::Pointer: MemoryMoveTo<<CP as PointerElement>::Element,CudaHostPtr<<CP as PointerElement>::Element>> {
2048    #[inline]
2049    fn memcpy_to(&self, dst: &mut CudaHostPtr<<CP as PointerElement>::Element>, len: usize) -> Result<usize, Error> {
2050        self.as_cuda_ptr_ref().memcpy_to(dst,len)
2051    }
2052}
2053impl<CP> MemoryMoveTo<<CP as PointerElement>::Element,CudaMemoryPoolPtr<<CP as PointerElement>::Element>> for CP
2054    where CP: AsCudaPtrRef + PointerElement,
2055          <CP as AsCudaPtrRef>::Pointer: MemoryMoveTo<<CP as PointerElement>::Element,CudaMemoryPoolPtr<<CP as PointerElement>::Element>> {
2056    #[inline]
2057    fn memcpy_to(&self, dst: &mut CudaMemoryPoolPtr<<CP as PointerElement>::Element>, len: usize) -> Result<usize, Error> {
2058        self.as_cuda_ptr_ref().memcpy_to(dst,len)
2059    }
2060}
2061impl<CP,D> MemoryMoveToAsync<<CP as PointerElement>::Element,D> for CP
2062    where CP: AsCudaPtrRef + PointerElement,
2063          D: AsCudaMutPtr,
2064          <CP as AsCudaPtrRef>::Pointer: MemoryMoveToAsync<<CP as PointerElement>::Element,<D as AsCudaMutPtr>::Pointer>,
2065          <D as AsCudaMutPtr>::Pointer: AsMutPtr<<CP as PointerElement>::Element> {
2066    #[inline]
2067    fn memcpy_to_async(&self, dst: &mut D, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
2068        self.as_cuda_ptr_ref().memcpy_to_async(&mut dst.as_cuda_mut_ptr().ptr,len,stream)
2069    }
2070}
2071impl<CP> MemoryMoveToAsync<<CP as PointerElement>::Element,CudaPtr<<CP as PointerElement>::Element>> for CP
2072    where CP: AsCudaPtrRef + PointerElement,
2073          <CP as AsCudaPtrRef>::Pointer: MemoryMoveToAsync<<CP as PointerElement>::Element,CudaPtr<<CP as PointerElement>::Element>> {
2074    #[inline]
2075    fn memcpy_to_async(&self, dst: &mut CudaPtr<<CP as PointerElement>::Element>, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
2076        self.as_cuda_ptr_ref().memcpy_to_async(dst,len,stream)
2077    }
2078}
2079impl<CP> MemoryMoveToAsync<<CP as PointerElement>::Element,CudaHostPtr<<CP as PointerElement>::Element>> for CP
2080    where CP: AsCudaPtrRef + PointerElement,
2081          <CP as AsCudaPtrRef>::Pointer: MemoryMoveToAsync<<CP as PointerElement>::Element,CudaHostPtr<<CP as PointerElement>::Element>> {
2082    #[inline]
2083    fn memcpy_to_async(&self, dst: &mut CudaHostPtr<<CP as PointerElement>::Element>, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
2084        self.as_cuda_ptr_ref().memcpy_to_async(dst,len,stream)
2085    }
2086}
2087impl<CP> MemoryMoveToAsync<<CP as PointerElement>::Element,CudaMemoryPoolPtr<<CP as PointerElement>::Element>> for CP
2088    where CP: AsCudaPtrRef + PointerElement,
2089          <CP as AsCudaPtrRef>::Pointer: MemoryMoveToAsync<<CP as PointerElement>::Element,CudaMemoryPoolPtr<<CP as PointerElement>::Element>> {
2090    #[inline]
2091    fn memcpy_to_async(&self, dst: &mut CudaMemoryPoolPtr<<CP as PointerElement>::Element>, len: usize,stream:cudaStream_t) -> Result<usize, Error> {
2092        self.as_cuda_ptr_ref().memcpy_to_async(dst,len,stream)
2093    }
2094}
2095/// Proxy type to Cuda smart pointer type with write operation
2096pub struct CudaMutPtr<'a,P> {
2097    ptr:&'a mut P
2098}
2099impl<'a,P> CudaMutPtr<'a,P> {
2100    pub fn new(ptr:&'a mut P) -> CudaMutPtr<'a,P> {
2101        CudaMutPtr {
2102            ptr:ptr
2103        }
2104    }
2105}
2106impl<'a,P> private::AsMutKernelPtrBase for CudaMutPtr<'a,P> where P: private::AsMutKernelPtrBase {
2107    #[inline]
2108    fn as_mut_kernel_ptr(&mut self) -> *mut c_void {
2109        self.ptr.as_mut_kernel_ptr()
2110    }
2111}
2112impl<'a,P> AsMutVoidPtr for CudaMutPtr<'a,P> where P: AsMutVoidPtr {
2113    #[inline]
2114    fn as_mut_void_ptr(&mut self) -> *mut c_void {
2115        self.ptr.as_mut_void_ptr()
2116    }
2117}
2118impl<'a,P,T> AsMutPtr<T> for CudaMutPtr<'a,P> where P: AsMutPtr<T> {
2119    #[inline]
2120    fn as_mut_ptr(&mut self) -> *mut T {
2121        self.ptr.as_mut_ptr()
2122    }
2123}
2124impl<'a,P> PointerElement for CudaMutPtr<'a,P>
2125    where P: PointerElement {
2126    type Element = P::Element;
2127}
2128impl<'a,P> WriteMemory<<Self as PointerElement>::Element> for CudaMutPtr<'a,P>
2129    where P: WriteMemory<<Self as PointerElement>::Element> + PointerElement {
2130    #[inline]
2131    fn memcpy(&mut self, p: *const <Self as PointerElement>::Element, len: usize) -> Result<usize, Error> {
2132        self.ptr.memcpy(p,len)
2133    }
2134
2135    #[inline]
2136    fn memcpy_repeat(&mut self, p: *const <Self as PointerElement>::Element, len: usize, count: usize) -> Result<usize, Error> {
2137        self.ptr.memcpy_repeat(p,len,count)
2138    }
2139}
2140impl<'a,P> WriteMemoryAsync<<Self as PointerElement>::Element> for CudaMutPtr<'a,P>
2141    where P: WriteMemoryAsync<<Self as PointerElement>::Element> + PointerElement {
2142    #[inline]
2143    fn memcpy_async(&mut self, p: *const <Self as PointerElement>::Element, len: usize, stream: cudaStream_t) -> Result<usize, Error> {
2144        self.ptr.memcpy_async(p,len,stream)
2145    }
2146
2147    #[inline]
2148    fn memcpy_async_repeat(&mut self, p: *const <Self as PointerElement>::Element, len: usize, count: usize, stream: cudaStream_t) -> Result<usize, Error> {
2149        self.ptr.memcpy_async_repeat(p,len,count,stream)
2150    }
2151}
2152/// Characteristic that defines the ability to obtain a reference to a writable cuda smart pointer
2153pub trait AsCudaMutPtr {
2154    /// Returned Cuda smart pointer type
2155    type Pointer;
2156
2157    fn as_cuda_mut_ptr<'a>(&'a mut self) -> CudaMutPtr<'a,Self::Pointer>;
2158}
2159impl<CP> private::AsMutKernelPtrBase for CP
2160    where CP: AsCudaMutPtr,
2161          <CP as AsCudaMutPtr>::Pointer: private::AsMutKernelPtrBase {
2162    #[inline]
2163    fn as_mut_kernel_ptr(&mut self) -> *mut c_void {
2164        self.as_cuda_mut_ptr().as_mut_kernel_ptr()
2165    }
2166}
2167impl<CP> AsMutVoidPtr for CP
2168    where CP: AsCudaMutPtr,
2169          <CP as AsCudaMutPtr>::Pointer: AsMutVoidPtr {
2170    #[inline]
2171    fn as_mut_void_ptr(&mut self) -> *mut c_void {
2172        self.as_cuda_mut_ptr().as_mut_void_ptr()
2173    }
2174}
2175impl<CP,T> AsMutPtr<T> for CP
2176    where CP: AsCudaMutPtr,
2177          <CP as AsCudaMutPtr>::Pointer: AsMutPtr<T> {
2178    #[inline]
2179    fn as_mut_ptr(&mut self) -> *mut T {
2180        self.as_cuda_mut_ptr().as_mut_ptr()
2181    }
2182}
2183impl<CP> WriteMemory<<CP as PointerElement>::Element> for CP
2184    where CP: AsCudaMutPtr + PointerElement,
2185          <CP as AsCudaMutPtr>::Pointer: WriteMemory<<CP as PointerElement>::Element> {
2186    #[inline]
2187    fn memcpy(&mut self, p: *const <CP as PointerElement>::Element, len: usize) -> Result<usize, Error> {
2188        self.as_cuda_mut_ptr().ptr.memcpy(p,len)
2189    }
2190
2191    #[inline]
2192    fn memcpy_repeat(&mut self, p: *const <CP as PointerElement>::Element, len: usize, count: usize) -> Result<usize, Error> {
2193        self.as_cuda_mut_ptr().ptr.memcpy_repeat(p,len,count)
2194    }
2195}
2196impl<CP> WriteMemoryAsync<<CP as PointerElement>::Element> for CP
2197    where CP: AsCudaMutPtr + PointerElement,
2198          <CP as AsCudaMutPtr>::Pointer: WriteMemoryAsync<<CP as PointerElement>::Element> {
2199    #[inline]
2200    fn memcpy_async(&mut self, p: *const <CP as PointerElement>::Element, len: usize, stream: cudaStream_t) -> Result<usize, Error> {
2201        self.as_cuda_mut_ptr().ptr.memcpy_async(p,len,stream)
2202    }
2203
2204    #[inline]
2205    fn memcpy_async_repeat(&mut self, p: *const <CP as PointerElement>::Element, len: usize, count: usize, stream: cudaStream_t) -> Result<usize, Error> {
2206        self.as_cuda_mut_ptr().ptr.memcpy_async_repeat(p,len,count,stream)
2207    }
2208}
2209impl<'a,T,const N:usize> From<&'a mut CudaTensor1dPtr<T,N>> for CudaMutPtr<'a,CudaMemoryPoolPtr<T>> where T: Default + Debug {
2210    fn from(value: &'a mut CudaTensor1dPtr<T,N>) -> Self {
2211        value.as_cuda_mut_ptr()
2212    }
2213}
2214impl<'a,T,const N1:usize,const N2:usize> From<&'a mut CudaTensor2dPtr<T,N1,N2>> for CudaMutPtr<'a,CudaMemoryPoolPtr<T>> where T: Default + Debug {
2215    fn from(value: &'a mut CudaTensor2dPtr<T,N1,N2>) -> Self {
2216        value.as_cuda_mut_ptr()
2217    }
2218}
2219/// Trait that defines arguments passed to cuda kernel functions
2220pub trait KernelArgs {
2221    /// Returns a Vec<&mut dyn AsMutKernelPtr> of the type implementing AsMutKernelPtr,
2222    /// which is converted to a data type that can be passed to the cuda kernel in subsequent processing.
2223    fn as_vec(&mut self) ->  Vec<&mut dyn AsKernelPtr>;
2224}
2225/// Trait defining cuda kernel functions
2226pub trait Kernel {
2227    /// Object to be converted into a list of arguments to be passed to the cuda kernel function
2228    type Args: KernelArgs;
2229
2230    /// Pointer to cuda kernel function
2231    const FUNC_PTR: *const c_void;
2232
2233    /// cuda kernel startup function
2234    /// # Arguments
2235    /// * `grid_dim` - Number of dims in grid
2236    /// * `block_dim` - Number of blocks in grid
2237    /// * `args` - List of arguments passed to cuda kernel functions
2238    /// * `shared_mem` - Size (in bytes) of shared memory to allocate for use within cuda kernel functions.
2239    ///
2240    /// # Errors
2241    ///
2242    /// This function may return the following errors
2243    /// * [`CudaRuntimeError`]
2244    fn launch(&mut self,grid_dim:dim3,block_dim:dim3,args:&mut Self::Args,shared_mem:usize) -> Result<(),CudaRuntimeError> {
2245        ffi::launch(Self::FUNC_PTR,
2246                    grid_dim,
2247                    block_dim,
2248                    &mut args.as_vec().into_iter()
2249                        .map(|p| p.as_kernel_ptr())
2250                        .collect::<Vec<*mut c_void>>().as_mut_slice(),
2251                    shared_mem
2252        )
2253    }
2254
2255    /// Function that waits for the completion of the execution of the process passed to the Cuda kernel
2256    ///
2257    /// # Errors
2258    ///
2259    /// This function may return the following errors
2260    /// * [`CudaRuntimeError`]
2261    fn device_synchronize(&self) -> Result<(),CudaRuntimeError> {
2262        ffi::device_synchronize()
2263    }
2264}
2265/// Trait defining cuda cooperative kernel functions
2266pub trait CooperativeKernel {
2267    /// Object to be converted into a list of arguments to be passed to the cuda kernel function
2268    type Args: KernelArgs;
2269
2270    /// Pointer to cuda kernel function
2271    const FUNC_PTR: *const c_void;
2272
2273    /// cuda kernel startup function
2274    /// Launches a device function where thread blocks can cooperate and synchronize as they execute.
2275    /// # Arguments
2276    /// * `grid_dim` - Number of dims in grid
2277    /// * `block_dim` - Number of blocks in grid
2278    /// * `args` - List of arguments passed to cuda kernel functions
2279    /// * `shared_mem` - Size (in bytes) of shared memory to allocate for use within cuda kernel functions.
2280    ///
2281    /// # Errors
2282    ///
2283    /// This function may return the following errors
2284    /// * [`CudaRuntimeError`]
2285    fn launch(&mut self,grid_dim:dim3,block_dim:dim3,args:&mut Self::Args,shared_mem:usize) -> Result<(),CudaRuntimeError> {
2286        ffi::launch_cooperative(Self::FUNC_PTR,
2287                    grid_dim,
2288                    block_dim,
2289                    &mut args.as_vec().into_iter()
2290                        .map(|p| p.as_kernel_ptr())
2291                        .collect::<Vec<*mut c_void>>().as_mut_slice(),
2292                    shared_mem
2293        )
2294    }
2295
2296    /// Function that waits for the completion of the execution of the process passed to the Cuda kernel
2297    ///
2298    /// # Errors
2299    ///
2300    /// This function may return the following errors
2301    /// * [`CudaRuntimeError`]
2302    fn device_synchronize(&self) -> Result<(),CudaRuntimeError> {
2303        ffi::device_synchronize()
2304    }
2305}