1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
use super::CompiledScanner;
pub(crate) type ShardDispatchResult = std::result::Result<Vec<Vec<u8>>, vyre::BackendError>;
// `GpuPhase1Output` (the {Hits, Done} two-phase return type used by
// the GPU scan_coalesced_* wrappers) is defined canonically in
// `gpu_scan_wrappers.rs`. A duplicate copy used to live here wrapped
// in a malformed outer `impl CompiledScanner {` opener (an enum can't
// be a member of an impl), which left the file unparseable and broke
// `cargo check` for the whole scanner crate - and via the dependency
// chain, every CI job in the repo. The orphan brace was removed and
// the duplicate enum dropped on 2026-05-26 (v0.5.17 prep).
impl CompiledScanner {
/// Dispatch `N` shards of the same program through whichever GPU
/// backend is active.
///
/// - wgpu path: `WgpuBackend::dispatch_borrowed_batch` records all
/// shards into one command encoder, one submit, one poll.
/// - CUDA path: `rayon::par_iter` over per-shard sync
/// `dispatch_borrowed` calls - CUDA's driver pipelines kernel
/// launches on the default stream while host threads enqueue
/// the next shard, giving roughly the same overlap with no
/// wgpu-specific batched API on the trait.
///
/// Returns one `Result<Vec<Vec<u8>>>` per shard in input order.
/// The outer `Result` is `Err` only on a setup-time failure
/// (e.g. no backend); per-shard dispatch errors land inside the
/// inner Result so partial-failure handling stays per-shard.
pub(crate) fn dispatch_gpu_shards(
&self,
program: &vyre::Program,
shard_inputs: &[&[&[u8]]],
configs: &[vyre::DispatchConfig],
) -> std::result::Result<Vec<ShardDispatchResult>, vyre::BackendError> {
debug_assert_eq!(shard_inputs.len(), configs.len());
if let Some(wgpu) = &self.wgpu_backend {
let jobs: Vec<(&vyre::Program, &[&[u8]], &vyre::DispatchConfig)> = shard_inputs
.iter()
.zip(configs.iter())
.map(|(inputs, config)| (program, *inputs, config))
.collect();
return wgpu.dispatch_borrowed_batch(&jobs);
}
let Some(backend) = self.gpu_backend.as_ref() else {
return Err(vyre::BackendError::new(
"no GPU backend acquired - keyhog should have routed to non-GPU before dispatch_gpu_shards. \
Fix: check select_backend() and the gpu_backend field after compile()."
.to_string(),
));
};
// CUDA / generic-trait path. rayon par_iter lets host threads
// submit kernel launches concurrently. The fire-async-then-
// await-all variant was measured slower on 1 GiB on RTX 5090
// (23.5 s vs 21.9 s for plain par_iter) because vyre's
// dispatch_borrowed_async sequentially queues onto the
// default CUDA stream - no real device-side concurrency until
// a multi-stream API is exposed at this layer.
use rayon::prelude::*;
let results: Vec<ShardDispatchResult> = shard_inputs
.par_iter()
.zip(configs.par_iter())
.map(|(inputs, config)| backend.dispatch_borrowed(program, inputs, config))
.collect();
Ok(results)
}
}
// Note: the doc-comment describing `scan_coalesced_megascan` formerly
// trailed here as an orphan. Its target function lives in
// `gpu_megascan.rs`; the doc was stranded when the file was split. Do
// not re-add `///` blocks at module scope without an item below them
// - rustc demands a binding for outer doc comments and the broken
// state cost a whole day of red CI.