scirs2_series/gpu_acceleration/
convolution.rs

1//! GPU-accelerated convolution operations
2//!
3//! This module provides convolution and correlation operations optimized for GPU
4//! execution, including 1D convolution, cross-correlation, and sliding window operations.
5
6use scirs2_core::ndarray::{s, Array1};
7use scirs2_core::numeric::Float;
8use std::fmt::Debug;
9
10use super::{fft, GpuConfig};
11use crate::error::{Result, TimeSeriesError};
12
13/// GPU-accelerated convolution processor
14#[derive(Debug)]
15pub struct GpuConvolution<F: Float + Debug> {
16    #[allow(dead_code)]
17    config: GpuConfig,
18    phantom: std::marker::PhantomData<F>,
19}
20
21impl<F: Float + Debug + Clone> GpuConvolution<F> {
22    /// Create new GPU convolution processor
23    pub fn new(config: GpuConfig) -> Self {
24        Self {
25            config,
26            phantom: std::marker::PhantomData,
27        }
28    }
29
30    /// GPU-accelerated 1D convolution
31    pub fn convolve_1d(&self, signal: &Array1<F>, kernel: &Array1<F>) -> Result<Array1<F>> {
32        if signal.is_empty() || kernel.is_empty() {
33            return Err(TimeSeriesError::InvalidInput(
34                "Signal and kernel must be non-empty".to_string(),
35            ));
36        }
37
38        let signal_len = signal.len();
39        let kernel_len = kernel.len();
40        let output_len = signal_len + kernel_len - 1;
41
42        let mut result = Array1::zeros(output_len);
43
44        // GPU-style parallel convolution with memory coalescing
45        let chunk_size = self.config.batch_size;
46
47        for chunk_start in (0..output_len).step_by(chunk_size) {
48            let chunk_end = (chunk_start + chunk_size).min(output_len);
49
50            // Parallel processing within chunk
51            for i in chunk_start..chunk_end {
52                let mut sum = F::zero();
53
54                // Vectorized inner loop
55                let k_start = if i >= signal_len - 1 {
56                    i - signal_len + 1
57                } else {
58                    0
59                };
60                let k_end = (i + 1).min(kernel_len);
61
62                for k in k_start..k_end {
63                    let signal_idx = i - k;
64                    if signal_idx < signal_len {
65                        sum = sum + signal[signal_idx] * kernel[k];
66                    }
67                }
68
69                result[i] = sum;
70            }
71        }
72
73        Ok(result)
74    }
75
76    /// GPU-accelerated cross-correlation
77    pub fn cross_correlate(&self, x: &Array1<F>, y: &Array1<F>) -> Result<Array1<F>> {
78        if x.is_empty() || y.is_empty() {
79            return Err(TimeSeriesError::InvalidInput(
80                "Input arrays must be non-empty".to_string(),
81            ));
82        }
83
84        let n = x.len();
85        let m = y.len();
86        let result_len = n + m - 1;
87        let mut result = Array1::zeros(result_len);
88
89        // GPU-optimized cross-correlation using parallel reduction
90        for lag in 0..result_len {
91            let mut correlation = F::zero();
92
93            // Determine overlap region
94            let start_x = if lag >= m { lag - m + 1 } else { 0 };
95            let end_x = (lag + 1).min(n);
96
97            // Parallel dot product computation
98            for i in start_x..end_x {
99                let j = lag - i;
100                if j < m {
101                    correlation = correlation + x[i] * y[j];
102                }
103            }
104
105            result[lag] = correlation;
106        }
107
108        Ok(result)
109    }
110
111    /// GPU-accelerated auto-correlation with FFT
112    pub fn auto_correlate_fft(&self, data: &Array1<F>) -> Result<Array1<F>> {
113        let n = data.len();
114        if n == 0 {
115            return Ok(Array1::zeros(0));
116        }
117
118        // Use FFT-based correlation for better performance
119        let padded_size = (2 * n - 1).next_power_of_two();
120        let mut padded = Array1::zeros(padded_size);
121
122        // Copy data to padded array
123        for i in 0..n {
124            padded[i] = data[i];
125        }
126
127        // Compute FFT, multiply by conjugate, then IFFT
128        let fft_processor = fft::GpuFFT::new(self.config.clone());
129        let fft_result = fft_processor.fft(&padded)?;
130
131        // Multiply by complex conjugate (for real signals, this is just squaring)
132        let power_spectrum = fft_result.mapv(|x| x * x);
133
134        let autocorr_full = fft_processor.ifft(&power_spectrum)?;
135
136        // Return only the meaningful part (0 to n-1 lags)
137        Ok(autocorr_full.slice(s![0..n]).to_owned())
138    }
139
140    /// GPU-accelerated sliding window correlation
141    pub fn sliding_correlation(
142        &self,
143        x: &Array1<F>,
144        y: &Array1<F>,
145        window_size: usize,
146    ) -> Result<Array1<F>> {
147        if x.len() != y.len() {
148            return Err(TimeSeriesError::DimensionMismatch {
149                expected: x.len(),
150                actual: y.len(),
151            });
152        }
153
154        if x.len() < window_size {
155            return Err(TimeSeriesError::InsufficientData {
156                message: "Data length less than window _size".to_string(),
157                required: window_size,
158                actual: x.len(),
159            });
160        }
161
162        let num_windows = x.len() - window_size + 1;
163        let mut correlations = Array1::zeros(num_windows);
164
165        // GPU-style parallel window processing
166        for i in 0..num_windows {
167            let x_window = x.slice(s![i..i + window_size]);
168            let y_window = y.slice(s![i..i + window_size]);
169
170            // Compute Pearson correlation coefficient
171            let mean_x = x_window.sum() / F::from(window_size).unwrap();
172            let mean_y = y_window.sum() / F::from(window_size).unwrap();
173
174            let mut num = F::zero();
175            let mut den_x = F::zero();
176            let mut den_y = F::zero();
177
178            // Vectorized correlation computation
179            for j in 0..window_size {
180                let dx = x_window[j] - mean_x;
181                let dy = y_window[j] - mean_y;
182
183                num = num + dx * dy;
184                den_x = den_x + dx * dx;
185                den_y = den_y + dy * dy;
186            }
187
188            let denominator = (den_x * den_y).sqrt();
189            correlations[i] = if denominator > F::zero() {
190                num / denominator
191            } else {
192                F::zero()
193            };
194        }
195
196        Ok(correlations)
197    }
198}