// sci_form/distgeom/parallel.rs
//! Parallel distance bounds evaluation and GPU-accelerated Floyd-Warshall.
//!
//! - Rayon-parallelized distance bounds matrix evaluation for ETKDG
//! - GPU Floyd-Warshall for shortest-path smoothing on molecules >500 atoms

/// Configuration for parallel distance bounds computation.
///
/// NOTE(review): `smooth_bounds_gpu` currently hard-codes the 500-atom
/// threshold rather than reading `gpu_threshold` — confirm intended wiring.
#[derive(Debug, Clone)]
pub struct ParallelBoundsConfig {
    /// Minimum molecule size (atom count) for GPU dispatch (default: 500).
    pub gpu_threshold: usize,
    /// Whether to attempt GPU Floyd-Warshall at all.
    pub use_gpu: bool,
}
14
15impl Default for ParallelBoundsConfig {
16    fn default() -> Self {
17        Self {
18            gpu_threshold: 500,
19            use_gpu: true,
20        }
21    }
22}
23
/// Result of parallel bounds evaluation.
///
/// Both matrices are symmetric and indexed as `mat[i * n + j]`.
#[derive(Debug, Clone)]
pub struct ParallelBoundsResult {
    /// Lower bounds matrix (flat, row-major, n×n).
    pub lower: Vec<f64>,
    /// Upper bounds matrix (flat, row-major, n×n); `f64::MAX` marks
    /// entries with no finite constraint.
    pub upper: Vec<f64>,
    /// Matrix dimension (number of atoms).
    pub n: usize,
    /// Whether GPU was used for smoothing.
    pub used_gpu: bool,
    /// Wall-clock execution time in milliseconds.
    pub time_ms: f64,
}
38
/// Evaluate distance bounds in parallel using rayon.
///
/// Each (i,j) pair's bounds are computed independently, making the O(N²)
/// bounds evaluation embarrassingly parallel.
///
/// Returns symmetric flat row-major `lower`/`upper` matrices of size
/// `n * n`. `used_gpu` is always `false` here — GPU is only attempted in
/// the smoothing step (`smooth_bounds_gpu`), not in bounds evaluation.
#[cfg(feature = "parallel")]
pub fn compute_bounds_parallel(
    elements: &[u8],
    bonds: &[(usize, usize, String)],
) -> ParallelBoundsResult {
    use rayon::prelude::*;
    use std::time::Instant;

    let start = Instant::now();
    let n = elements.len();

    // Adjacency list feeding the per-pair BFS shortest-path computation.
    let adj = build_adjacency(n, bonds);

    // All unordered pairs (i < j); each pair is independent work.
    let pairs: Vec<(usize, usize)> = (0..n)
        .flat_map(|i| ((i + 1)..n).map(move |j| (i, j)))
        .collect();

    let bounds: Vec<(usize, usize, f64, f64)> = pairs
        .par_iter()
        .map(|&(i, j)| {
            let (lower, upper) = compute_pair_bounds(elements, &adj, i, j);
            (i, j, lower, upper)
        })
        .collect();

    let mut lower_mat = vec![0.0f64; n * n];
    let mut upper_mat = vec![f64::MAX; n * n];

    // Fix: an atom's distance to itself is exactly zero. The pair loop
    // only covers i < j and Floyd-Warshall smoothing skips the diagonal,
    // so without this the diagonal upper bound would leak the f64::MAX
    // sentinel to downstream consumers.
    for i in 0..n {
        upper_mat[i * n + i] = 0.0;
    }

    // Scatter the pair results symmetrically.
    for &(i, j, lo, hi) in &bounds {
        lower_mat[i * n + j] = lo;
        lower_mat[j * n + i] = lo;
        upper_mat[i * n + j] = hi;
        upper_mat[j * n + i] = hi;
    }

    let time_ms = start.elapsed().as_secs_f64() * 1000.0;

    ParallelBoundsResult {
        lower: lower_mat,
        upper: upper_mat,
        n,
        used_gpu: false,
        time_ms,
    }
}
90
91/// GPU-accelerated Floyd-Warshall for triangle inequality smoothing.
92///
93/// For molecules >500 atoms, the O(N³) Floyd-Warshall can benefit from
94/// GPU parallelization. Each k-iteration updates all (i,j) pairs in parallel.
95pub fn smooth_bounds_gpu(bounds: &mut [f64], n: usize, is_upper: bool) -> bool {
96    // GPU dispatch via wgpu (if available)
97    #[cfg(feature = "experimental-gpu")]
98    {
99        if n >= 500 {
100            if let Some(result) = try_gpu_floyd_warshall(bounds, n, is_upper) {
101                bounds.copy_from_slice(&result);
102                return true;
103            }
104        }
105    }
106
107    // CPU fallback: standard Floyd-Warshall
108    floyd_warshall_cpu(bounds, n, is_upper);
109    false
110}
111
/// CPU Floyd-Warshall for triangle inequality smoothing.
///
/// `is_upper` selects the update direction: upper bounds are tightened
/// downward (classic shortest-path relaxation), lower bounds are raised
/// to the largest path-implied value. The matrix is updated in place;
/// the diagonal and row/column `k` are never modified during pass `k`.
pub fn floyd_warshall_cpu(matrix: &mut [f64], n: usize, is_upper: bool) {
    for k in 0..n {
        for i in 0..n {
            if i == k {
                continue;
            }
            // Invariant across the inner loop: column k of row i is never
            // written there (j == k is skipped), so hoist the read.
            let d_ik = matrix[i * n + k];
            for j in 0..n {
                if j == i || j == k {
                    continue;
                }
                let via_k = d_ik + matrix[k * n + j];
                let cell = &mut matrix[i * n + j];
                let tighter = if is_upper { via_k < *cell } else { via_k > *cell };
                if tighter {
                    *cell = via_k;
                }
            }
        }
    }
}
136
/// Parallel Floyd-Warshall using rayon, one row per parallel task.
///
/// Improvement over the update-list version: `par_chunks_mut(n)` hands
/// each worker exclusive mutable ownership of one matrix row, so rows
/// are updated in place. This removes the per-k allocation of O(n)
/// update vectors and the serial apply pass. Results are identical:
/// every cell is written at most once per k, and `via_k` depends only
/// on the row-k/column-k snapshots taken before the parallel region.
#[cfg(feature = "parallel")]
pub fn floyd_warshall_parallel(matrix: &mut [f64], n: usize, is_upper: bool) {
    use rayon::prelude::*;

    for k in 0..n {
        // Snapshot row k and column k for read-only access; neither is
        // modified during iteration k (i == k and j == k are skipped).
        let k_row: Vec<f64> = (0..n).map(|j| matrix[k * n + j]).collect();
        let k_col: Vec<f64> = (0..n).map(|i| matrix[i * n + k]).collect();

        // Disjoint row slices update concurrently without data races.
        matrix.par_chunks_mut(n).enumerate().for_each(|(i, row)| {
            if i == k {
                return;
            }
            let d_ik = k_col[i];
            for j in 0..n {
                if j == i || j == k {
                    continue;
                }
                let via_k = d_ik + k_row[j];
                if is_upper {
                    // Upper bounds: tighten to min.
                    if via_k < row[j] {
                        row[j] = via_k;
                    }
                } else if via_k > row[j] {
                    // Lower bounds: tighten to max.
                    row[j] = via_k;
                }
            }
        });
    }
}
182
/// Attempt GPU Floyd-Warshall dispatch.
///
/// Returns `Some(result)` only when the GPU path actually produced a
/// smoothed matrix; `None` means the caller must fall back to CPU.
/// Currently always returns `None` — the WGSL kernel is not wired up yet.
#[cfg(feature = "experimental-gpu")]
fn try_gpu_floyd_warshall(bounds: &[f64], n: usize, _is_upper: bool) -> Option<Vec<f64>> {
    use crate::gpu::context::GpuContext;

    // No GPU context available (no adapter/driver) → signal CPU fallback.
    let ctx = match GpuContext::try_create() {
        Ok(ctx) => ctx,
        Err(_) => return None,
    };

    // For very large matrices, GPU memory may be insufficient.
    let matrix_bytes = n * n * std::mem::size_of::<f32>();
    if matrix_bytes > 128 * 1024 * 1024 {
        return None; // > 128MB — skip GPU
    }

    // Convert to f32 for GPU, preserving the MAX "no constraint" sentinel.
    // NOTE(review): f64 → f32 narrowing loses precision in the bounds;
    // confirm downstream smoothing tolerates that before enabling dispatch.
    let data_f32: Vec<f32> = bounds
        .iter()
        .map(|&v| if v == f64::MAX { f32::MAX } else { v as f32 })
        .collect();

    // GPU Floyd-Warshall is implemented as N iterations of a
    // parallel (i,j) update kernel. Each iteration processes one k.
    // The actual WGSL dispatch would go here using ctx.
    // For now, fall back to CPU parallel.
    let _ = (ctx, data_f32);
    None
}
212
/// Build an undirected adjacency list from a bond list.
///
/// Every bond order ("SINGLE", "DOUBLE", "TRIPLE", "AROMATIC", anything
/// else) previously mapped to the same weight via a match whose arms were
/// all identical; the match is collapsed to a single constant. The
/// shortest path therefore measures pure bond-count (topological)
/// distance. Differentiate `TOPOLOGICAL_WEIGHT` per order here if bond
/// order should ever influence path length.
///
/// Panics if a bond references an atom index >= `n` (same as before).
fn build_adjacency(n: usize, bonds: &[(usize, usize, String)]) -> Vec<Vec<(usize, f64)>> {
    const TOPOLOGICAL_WEIGHT: f64 = 1.0;
    let mut adj: Vec<Vec<(usize, f64)>> = vec![Vec::new(); n];
    for (a, b, _order) in bonds {
        // Undirected graph: insert both directions.
        adj[*a].push((*b, TOPOLOGICAL_WEIGHT));
        adj[*b].push((*a, TOPOLOGICAL_WEIGHT));
    }
    adj
}
229
230/// Compute bounds for a single atom pair using BFS shortest path.
231fn compute_pair_bounds(
232    elements: &[u8],
233    adj: &[Vec<(usize, f64)>],
234    i: usize,
235    j: usize,
236) -> (f64, f64) {
237    let ri = crate::graph::get_covalent_radius(elements[i]);
238    let rj = crate::graph::get_covalent_radius(elements[j]);
239
240    // BFS shortest path distance
241    let n = adj.len();
242    let mut dist = vec![usize::MAX; n];
243    dist[i] = 0;
244    let mut queue = std::collections::VecDeque::new();
245    queue.push_back(i);
246
247    while let Some(node) = queue.pop_front() {
248        for &(neighbor, _) in &adj[node] {
249            if dist[neighbor] == usize::MAX {
250                dist[neighbor] = dist[node] + 1;
251                queue.push_back(neighbor);
252            }
253        }
254    }
255
256    let path_len = dist[j];
257    if path_len == usize::MAX {
258        // Not connected — only VdW constraints
259        let vi = crate::graph::get_vdw_radius(elements[i]);
260        let vj = crate::graph::get_vdw_radius(elements[j]);
261        return (vi + vj - 0.5, 100.0);
262    }
263
264    // Lower bound: sum of covalent radii minus tolerance
265    let lower = (ri + rj) * 0.8 * path_len as f64;
266
267    // Upper bound: extended chain length
268    let upper = (ri + rj + 0.4) * path_len as f64;
269
270    (lower.max(0.1), upper)
271}
272
#[cfg(test)]
mod tests {
    use super::*;

    /// Upper-bound smoothing on a 3-node path graph: the missing 0-2
    /// entry (f64::MAX) must tighten to 1 + 2 = 3 via node 1.
    #[test]
    fn test_floyd_warshall_cpu() {
        let mut m = vec![0.0, 1.0, f64::MAX, 1.0, 0.0, 2.0, f64::MAX, 2.0, 0.0];
        floyd_warshall_cpu(&mut m, 3, true);
        assert!((m[2] - 3.0).abs() < 1e-10);
    }

    /// Smoke test on a C-C-O chain: result has the right dimension and
    /// bonded atoms get a positive lower bound.
    #[cfg(feature = "parallel")]
    #[test]
    fn test_compute_bounds_parallel() {
        let elements = vec![6u8, 6, 8]; // C-C-O
        let bonds = vec![(0, 1, "SINGLE".to_string()), (1, 2, "SINGLE".to_string())];
        let result = compute_bounds_parallel(&elements, &bonds);
        assert_eq!(result.n, 3);
        assert!(result.lower[1] > 0.0);
    }
}