use crate::device_work_queue::{
plan_cuda_device_work_queue, CudaDeviceWorkQueueError, CudaDeviceWorkQueuePlan,
CudaDeviceWorkQueueProfile, CudaWorkQueueHostSync,
};
use crate::frontier_typed_ir_adapter::CudaFrontierTypedIrInput;
use crate::megakernel_barrier_planner::{
plan_cuda_frontier_megakernel_execution_with_scratch, CudaMegakernelBarrierScratch,
CudaMegakernelFrontierExecutionPlan, CudaMegakernelFrontierExecutionPlanError,
};
use crate::megakernel_plan_cache::{
CudaMegakernelAnalysisKind, CudaMegakernelDeviceKey, CudaMegakernelPlanCache,
};
use crate::megakernel_scheduler::CudaMegakernelScheduleSample;
use crate::token_fact_graph_cuda_adapter::CudaTokenFactGraphLayout;
use vyre_driver::ResidentGraphReuseTelemetry;
#[derive(Clone, Debug, PartialEq)]
pub struct CudaTokenFactFrontierExecutionPlan {
pub frontier: CudaMegakernelFrontierExecutionPlan,
pub work_queue: CudaDeviceWorkQueuePlan,
pub resident_payload_bytes: u64,
pub resident_work_queue_bytes: u64,
pub total_required_bytes: u64,
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CudaTokenFactGraphResidency {
ColdUpload,
WarmResident,
}
#[derive(Clone, Debug, PartialEq)]
pub struct CudaTokenFactFrontierExecutionEnvelope {
pub plan: CudaTokenFactFrontierExecutionPlan,
pub graph_reuse: ResidentGraphReuseTelemetry,
pub resident_graph_bytes: u64,
pub graph_upload_bytes: u64,
pub avoided_graph_upload_bytes: u64,
pub total_resident_bytes: u64,
}
#[derive(Clone, Debug, PartialEq)]
pub enum CudaTokenFactFrontierExecutionError {
ZeroResidentGraphBytes,
ResidentGraphByteEnvelopeMismatch {
expected_bytes: u64,
actual_bytes: u64,
},
PayloadExceedsBudget {
payload_bytes: u64,
budget_bytes: u64,
},
ByteCountOverflow {
field: &'static str,
},
ActiveItemWaveCountMismatch {
waves: usize,
active_items: usize,
},
FrontierPlan(CudaMegakernelFrontierExecutionPlanError),
WorkQueue(CudaDeviceWorkQueueError),
}
impl std::fmt::Display for CudaTokenFactFrontierExecutionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::ZeroResidentGraphBytes => write!(
f,
"CUDA token/fact frontier plan received an empty resident graph topology. Fix: build a concrete token/fact graph before CUDA execution planning."
),
Self::ResidentGraphByteEnvelopeMismatch {
expected_bytes,
actual_bytes,
} => write!(
f,
"CUDA token/fact frontier layout reports {actual_bytes} resident bytes but node+edge+payload fields require {expected_bytes}. Fix: rebuild the CUDA token/fact layout from the canonical adapter before planning."
),
Self::PayloadExceedsBudget {
payload_bytes,
budget_bytes,
} => write!(
f,
"CUDA token/fact frontier plan payload requires {payload_bytes} bytes but budget allows {budget_bytes}. Fix: shard the token/fact payload slab before megakernel planning."
),
Self::ByteCountOverflow { field } => write!(
f,
"CUDA token/fact frontier planner overflowed while computing {field}. Fix: shard the resident token/fact graph before CUDA execution planning."
),
Self::ActiveItemWaveCountMismatch {
waves,
active_items,
} => write!(
f,
"CUDA token/fact frontier plan has {waves} wave envelope(s) but {active_items} active-item count(s). Fix: preserve one active-item entry per frontier wave before device work-queue planning."
),
Self::FrontierPlan(err) => write!(f, "{err}"),
Self::WorkQueue(err) => write!(f, "{err}"),
}
}
}
impl std::error::Error for CudaTokenFactFrontierExecutionError {}
impl From<CudaMegakernelFrontierExecutionPlanError> for CudaTokenFactFrontierExecutionError {
fn from(err: CudaMegakernelFrontierExecutionPlanError) -> Self {
Self::FrontierPlan(err)
}
}
impl From<CudaDeviceWorkQueueError> for CudaTokenFactFrontierExecutionError {
fn from(err: CudaDeviceWorkQueueError) -> Self {
Self::WorkQueue(err)
}
}
pub fn plan_cuda_token_fact_frontier_execution(
cache: &mut CudaMegakernelPlanCache,
graph_layout_hash: u64,
analysis_kind: CudaMegakernelAnalysisKind,
device: CudaMegakernelDeviceKey,
sample: CudaMegakernelScheduleSample,
graph_layout: CudaTokenFactGraphLayout,
frontier_input: &CudaFrontierTypedIrInput,
budget_bytes: u64,
launch_overhead_ns: f64,
fusion_pressure: f64,
) -> Result<CudaTokenFactFrontierExecutionPlan, CudaTokenFactFrontierExecutionError> {
let mut barrier_scratch = CudaMegakernelBarrierScratch::try_with_capacity(
frontier_input.waves.len(),
frontier_input.dependencies.len(),
)
.map_err(CudaMegakernelFrontierExecutionPlanError::Barrier)?;
plan_cuda_token_fact_frontier_execution_with_scratch(
cache,
graph_layout_hash,
analysis_kind,
device,
sample,
graph_layout,
frontier_input,
budget_bytes,
launch_overhead_ns,
fusion_pressure,
&mut barrier_scratch,
)
}
pub fn plan_cuda_token_fact_frontier_execution_envelope(
cache: &mut CudaMegakernelPlanCache,
graph_layout_hash: u64,
analysis_kind: CudaMegakernelAnalysisKind,
device: CudaMegakernelDeviceKey,
sample: CudaMegakernelScheduleSample,
graph_layout: CudaTokenFactGraphLayout,
graph_residency: CudaTokenFactGraphResidency,
frontier_input: &CudaFrontierTypedIrInput,
budget_bytes: u64,
launch_overhead_ns: f64,
fusion_pressure: f64,
) -> Result<CudaTokenFactFrontierExecutionEnvelope, CudaTokenFactFrontierExecutionError> {
let mut barrier_scratch = CudaMegakernelBarrierScratch::try_with_capacity(
frontier_input.waves.len(),
frontier_input.dependencies.len(),
)
.map_err(CudaMegakernelFrontierExecutionPlanError::Barrier)?;
plan_cuda_token_fact_frontier_execution_envelope_with_scratch(
cache,
graph_layout_hash,
analysis_kind,
device,
sample,
graph_layout,
graph_residency,
frontier_input,
budget_bytes,
launch_overhead_ns,
fusion_pressure,
&mut barrier_scratch,
)
}
pub fn plan_cuda_token_fact_frontier_execution_with_scratch(
cache: &mut CudaMegakernelPlanCache,
graph_layout_hash: u64,
analysis_kind: CudaMegakernelAnalysisKind,
device: CudaMegakernelDeviceKey,
sample: CudaMegakernelScheduleSample,
graph_layout: CudaTokenFactGraphLayout,
frontier_input: &CudaFrontierTypedIrInput,
budget_bytes: u64,
launch_overhead_ns: f64,
fusion_pressure: f64,
barrier_scratch: &mut CudaMegakernelBarrierScratch,
) -> Result<CudaTokenFactFrontierExecutionPlan, CudaTokenFactFrontierExecutionError> {
Ok(
plan_cuda_token_fact_frontier_execution_envelope_with_scratch(
cache,
graph_layout_hash,
analysis_kind,
device,
sample,
graph_layout,
CudaTokenFactGraphResidency::ColdUpload,
frontier_input,
budget_bytes,
launch_overhead_ns,
fusion_pressure,
barrier_scratch,
)?
.plan,
)
}
pub fn plan_cuda_token_fact_frontier_execution_envelope_with_scratch(
cache: &mut CudaMegakernelPlanCache,
graph_layout_hash: u64,
analysis_kind: CudaMegakernelAnalysisKind,
device: CudaMegakernelDeviceKey,
sample: CudaMegakernelScheduleSample,
graph_layout: CudaTokenFactGraphLayout,
graph_residency: CudaTokenFactGraphResidency,
frontier_input: &CudaFrontierTypedIrInput,
budget_bytes: u64,
launch_overhead_ns: f64,
fusion_pressure: f64,
barrier_scratch: &mut CudaMegakernelBarrierScratch,
) -> Result<CudaTokenFactFrontierExecutionEnvelope, CudaTokenFactFrontierExecutionError> {
if frontier_input.active_items.len() != frontier_input.waves.len() {
return Err(
CudaTokenFactFrontierExecutionError::ActiveItemWaveCountMismatch {
waves: frontier_input.waves.len(),
active_items: frontier_input.active_items.len(),
},
);
}
let resident_graph_bytes = graph_layout
.node_bytes
.checked_add(graph_layout.edge_bytes)
.ok_or(CudaTokenFactFrontierExecutionError::ByteCountOverflow {
field: "resident token/fact graph bytes",
})?;
if resident_graph_bytes == 0 {
return Err(CudaTokenFactFrontierExecutionError::ZeroResidentGraphBytes);
}
let expected_resident_bytes = resident_graph_bytes
.checked_add(graph_layout.payload_bytes)
.ok_or(CudaTokenFactFrontierExecutionError::ByteCountOverflow {
field: "resident token/fact graph envelope bytes",
})?;
if expected_resident_bytes != graph_layout.resident_bytes {
return Err(
CudaTokenFactFrontierExecutionError::ResidentGraphByteEnvelopeMismatch {
expected_bytes: expected_resident_bytes,
actual_bytes: graph_layout.resident_bytes,
},
);
}
let payload_budget = budget_bytes.checked_sub(graph_layout.payload_bytes).ok_or(
CudaTokenFactFrontierExecutionError::PayloadExceedsBudget {
payload_bytes: graph_layout.payload_bytes,
budget_bytes,
},
)?;
let active_items = total_active_items(&frontier_input.active_items)?;
let work_queue = if active_items == 0 {
empty_device_work_queue_plan()
} else {
plan_cuda_device_work_queue(CudaDeviceWorkQueueProfile {
initial_items: active_items,
queue_capacity: active_items,
entry_bytes: 4,
control_bytes: 16,
budget_bytes: payload_budget,
host_sync: CudaWorkQueueHostSync::FinalOnly,
})?
};
let scheduler_budget = payload_budget
.checked_sub(work_queue.resident_bytes)
.ok_or(CudaTokenFactFrontierExecutionError::ByteCountOverflow {
field: "scheduler budget after work queue",
})?;
let frontier = plan_cuda_frontier_megakernel_execution_with_scratch(
cache,
graph_layout_hash,
analysis_kind,
device,
sample,
graph_layout.graph_shape,
graph_layout.node_record_bytes,
graph_layout.edge_record_bytes,
&frontier_input.waves,
&frontier_input.dependencies,
scheduler_budget,
launch_overhead_ns,
fusion_pressure,
barrier_scratch,
)?;
let total_required_bytes = frontier
.execution
.memory
.required_bytes
.checked_add(graph_layout.payload_bytes)
.and_then(|bytes| bytes.checked_add(work_queue.resident_bytes))
.ok_or(CudaTokenFactFrontierExecutionError::ByteCountOverflow {
field: "total required bytes",
})?;
let plan = CudaTokenFactFrontierExecutionPlan {
frontier,
work_queue,
resident_payload_bytes: graph_layout.payload_bytes,
resident_work_queue_bytes: work_queue.resident_bytes,
total_required_bytes,
};
let graph_upload_bytes = match graph_residency {
CudaTokenFactGraphResidency::ColdUpload => resident_graph_bytes,
CudaTokenFactGraphResidency::WarmResident => 0,
};
let avoided_graph_upload_bytes = match graph_residency {
CudaTokenFactGraphResidency::ColdUpload => 0,
CudaTokenFactGraphResidency::WarmResident => resident_graph_bytes,
};
let graph_reuse = match graph_residency {
CudaTokenFactGraphResidency::ColdUpload => {
ResidentGraphReuseTelemetry::cold_upload(resident_graph_bytes)
}
CudaTokenFactGraphResidency::WarmResident => {
ResidentGraphReuseTelemetry::warm_reuse(resident_graph_bytes)
}
};
Ok(CudaTokenFactFrontierExecutionEnvelope {
total_resident_bytes: plan.total_required_bytes,
plan,
graph_reuse,
resident_graph_bytes,
graph_upload_bytes,
avoided_graph_upload_bytes,
})
}
fn total_active_items(active_items: &[u64]) -> Result<u64, CudaTokenFactFrontierExecutionError> {
let mut total = 0_u64;
for &items in active_items {
total = total.checked_add(items).ok_or(
CudaTokenFactFrontierExecutionError::ByteCountOverflow {
field: "total active frontier items",
},
)?;
}
Ok(total)
}
fn empty_device_work_queue_plan() -> CudaDeviceWorkQueuePlan {
CudaDeviceWorkQueuePlan {
queue_bytes: 0,
control_bytes: 0,
resident_bytes: 0,
initial_occupancy_bps: 0,
final_only_host_sync: true,
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::frontier_typed_ir_adapter::adapt_frontier_typed_ir_to_cuda;
use crate::token_fact_graph_cuda_adapter::adapt_token_fact_graph_to_cuda_layout;
use vyre_self_substrate::device_resident_token_fact_graph::{
plan_device_resident_token_fact_graph, TokenFactEdge, TokenFactEdgeKind, TokenFactNode,
TokenFactNodeKind,
};
use vyre_self_substrate::frontier_typed_ir::{
plan_frontier_typed_ir, FrontierDependency, FrontierDomain, FrontierNode,
};
#[test]
fn planner_combines_token_fact_residency_with_frontier_barriers() {
let graph = plan_device_resident_token_fact_graph(
&[
node(1, TokenFactNodeKind::Token, 0, 16),
node(2, TokenFactNodeKind::Semantic, 16, 16),
node(3, TokenFactNodeKind::Fact, 32, 16),
],
&[
edge(1, 2, TokenFactEdgeKind::SemanticFact),
edge(2, 3, TokenFactEdgeKind::FactDependency),
],
48,
)
.expect("Fix: token/fact graph should pack");
let graph_layout = adapt_token_fact_graph_to_cuda_layout(&graph, 32, 16)
.expect("Fix: token/fact graph should adapt");
let frontier_plan = plan_frontier_typed_ir(
&[
frontier_node(10, FrontierDomain::Parser, 4),
frontier_node(20, FrontierDomain::Semantic, 4),
frontier_node(30, FrontierDomain::Dataflow, 4),
],
&[
FrontierDependency {
before: 10,
after: 20,
},
FrontierDependency {
before: 20,
after: 30,
},
],
)
.expect("Fix: frontier plan should build");
let frontier_input = adapt_frontier_typed_ir_to_cuda(&frontier_plan, 8, 16, 8)
.expect("Fix: frontier plan should adapt");
let mut cache = CudaMegakernelPlanCache::new();
let plan = plan_cuda_token_fact_frontier_execution(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
CudaMegakernelScheduleSample {
dispatch_cost_ns: 100_000.0,
frontier_density: 0.10,
readback_bytes: 24,
},
graph_layout,
&frontier_input,
8_192,
1_000.0,
0.0,
)
.expect("Fix: token/fact frontier execution should plan");
assert_eq!(plan.frontier.barriers.global_barriers, 2);
assert_eq!(plan.work_queue.queue_bytes, 12 * 4);
assert_eq!(plan.resident_work_queue_bytes, 64);
assert_eq!(plan.resident_payload_bytes, 48);
assert!(plan.total_required_bytes >= plan.frontier.execution.memory.required_bytes);
}
#[test]
fn planner_rejects_payload_that_exceeds_budget_before_frontier_planning() {
let graph = plan_device_resident_token_fact_graph(
&[node(1, TokenFactNodeKind::Token, 0, 64)],
&[],
64,
)
.expect("Fix: token/fact graph should pack");
let graph_layout = adapt_token_fact_graph_to_cuda_layout(&graph, 32, 16)
.expect("Fix: token/fact graph should adapt");
let frontier_input = CudaFrontierTypedIrInput {
waves: Vec::new(),
active_items: Vec::new(),
dependencies: Vec::new(),
};
let mut cache = CudaMegakernelPlanCache::new();
assert_eq!(
plan_cuda_token_fact_frontier_execution(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
CudaMegakernelScheduleSample {
dispatch_cost_ns: 1.0,
frontier_density: 0.0,
readback_bytes: 0,
},
graph_layout,
&frontier_input,
63,
0.0,
0.0,
)
.expect_err("payload over budget should fail before cache planning"),
CudaTokenFactFrontierExecutionError::PayloadExceedsBudget {
payload_bytes: 64,
budget_bytes: 63,
}
);
}
#[test]
fn planner_rejects_invalid_public_token_fact_layout_envelope() {
let frontier_input = CudaFrontierTypedIrInput {
waves: Vec::new(),
active_items: Vec::new(),
dependencies: Vec::new(),
};
let mut cache = CudaMegakernelPlanCache::new();
let sample = CudaMegakernelScheduleSample {
dispatch_cost_ns: 1.0,
frontier_density: 0.0,
readback_bytes: 0,
};
assert_eq!(
plan_cuda_token_fact_frontier_execution(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
sample,
CudaTokenFactGraphLayout {
graph_shape: crate::megakernel_scheduler::CudaMegakernelGraphShape {
node_count: 0,
edge_count: 0,
},
node_record_bytes: 32,
edge_record_bytes: 16,
node_bytes: 0,
edge_bytes: 0,
payload_bytes: 0,
resident_bytes: 0,
},
&frontier_input,
8_192,
0.0,
0.0,
)
.expect_err("empty resident topology should fail before CUDA planning"),
CudaTokenFactFrontierExecutionError::ZeroResidentGraphBytes
);
assert_eq!(
plan_cuda_token_fact_frontier_execution(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
sample,
CudaTokenFactGraphLayout {
graph_shape: crate::megakernel_scheduler::CudaMegakernelGraphShape {
node_count: 1,
edge_count: 1,
},
node_record_bytes: 32,
edge_record_bytes: 16,
node_bytes: 32,
edge_bytes: 16,
payload_bytes: 8,
resident_bytes: 55,
},
&frontier_input,
8_192,
0.0,
0.0,
)
.expect_err("mismatched resident byte envelope should fail before CUDA planning"),
CudaTokenFactFrontierExecutionError::ResidentGraphByteEnvelopeMismatch {
expected_bytes: 56,
actual_bytes: 55,
}
);
}
#[test]
fn planner_accounts_warm_resident_graph_without_upload_pressure() {
let graph = plan_device_resident_token_fact_graph(
&[node(1, TokenFactNodeKind::Token, 0, 16)],
&[],
16,
)
.expect("Fix: token/fact graph should pack");
let graph_layout = adapt_token_fact_graph_to_cuda_layout(&graph, 32, 16)
.expect("Fix: token/fact graph should adapt");
let frontier_input = CudaFrontierTypedIrInput {
waves: Vec::new(),
active_items: Vec::new(),
dependencies: Vec::new(),
};
let mut cache = CudaMegakernelPlanCache::new();
let cold = plan_cuda_token_fact_frontier_execution_envelope(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
CudaMegakernelScheduleSample {
dispatch_cost_ns: 1.0,
frontier_density: 0.0,
readback_bytes: 0,
},
graph_layout,
CudaTokenFactGraphResidency::ColdUpload,
&frontier_input,
8_192,
0.0,
0.0,
)
.expect("Fix: cold token/fact graph should plan");
let warm = plan_cuda_token_fact_frontier_execution_envelope(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
CudaMegakernelScheduleSample {
dispatch_cost_ns: 1.0,
frontier_density: 0.0,
readback_bytes: 0,
},
graph_layout,
CudaTokenFactGraphResidency::WarmResident,
&frontier_input,
8_192,
0.0,
0.0,
)
.expect("Fix: warm token/fact graph should plan");
assert_eq!(cold.resident_graph_bytes, 32);
assert_eq!(cold.graph_upload_bytes, 32);
assert_eq!(cold.avoided_graph_upload_bytes, 0);
assert_eq!(
cold.graph_reuse,
ResidentGraphReuseTelemetry::cold_upload(32)
);
assert_eq!(warm.resident_graph_bytes, 32);
assert_eq!(warm.graph_upload_bytes, 0);
assert_eq!(warm.avoided_graph_upload_bytes, 32);
assert_eq!(
warm.graph_reuse,
ResidentGraphReuseTelemetry::warm_reuse(32)
);
assert_eq!(warm.total_resident_bytes, cold.total_resident_bytes);
}
#[test]
fn planner_rejects_frontier_waves_without_matching_active_item_counts() {
let graph = plan_device_resident_token_fact_graph(
&[node(1, TokenFactNodeKind::Token, 0, 16)],
&[],
16,
)
.expect("Fix: token/fact graph should pack");
let graph_layout = adapt_token_fact_graph_to_cuda_layout(&graph, 32, 16)
.expect("Fix: token/fact graph should adapt");
let frontier_input = CudaFrontierTypedIrInput {
waves: vec![
crate::megakernel_barrier_planner::CudaMegakernelFrontierWave {
frontier_bytes: 8,
scratch_bytes: 8,
output_bytes: 8,
},
],
active_items: Vec::new(),
dependencies: Vec::new(),
};
let mut cache = CudaMegakernelPlanCache::new();
assert_eq!(
plan_cuda_token_fact_frontier_execution(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
CudaMegakernelScheduleSample {
dispatch_cost_ns: 1.0,
frontier_density: 0.0,
readback_bytes: 0,
},
graph_layout,
&frontier_input,
8_192,
0.0,
0.0,
)
.expect_err("mismatched active-item counts should fail before queue planning"),
CudaTokenFactFrontierExecutionError::ActiveItemWaveCountMismatch {
waves: 1,
active_items: 0,
}
);
}
#[test]
fn planner_does_not_allocate_resident_work_queue_for_empty_frontier() {
let graph = plan_device_resident_token_fact_graph(
&[node(1, TokenFactNodeKind::Token, 0, 16)],
&[],
16,
)
.expect("Fix: token/fact graph should pack");
let graph_layout = adapt_token_fact_graph_to_cuda_layout(&graph, 32, 16)
.expect("Fix: token/fact graph should adapt");
let frontier_input = CudaFrontierTypedIrInput {
waves: Vec::new(),
active_items: Vec::new(),
dependencies: Vec::new(),
};
let mut cache = CudaMegakernelPlanCache::new();
let plan = plan_cuda_token_fact_frontier_execution(
&mut cache,
0xfeed,
CudaMegakernelAnalysisKind::ParserFrontend,
device(),
CudaMegakernelScheduleSample {
dispatch_cost_ns: 1.0,
frontier_density: 0.0,
readback_bytes: 0,
},
graph_layout,
&frontier_input,
8_192,
0.0,
0.0,
)
.expect("Fix: empty frontier should not need a resident work queue");
assert_eq!(plan.work_queue.queue_bytes, 0);
assert_eq!(plan.work_queue.control_bytes, 0);
assert_eq!(plan.resident_work_queue_bytes, 0);
assert!(plan.work_queue.final_only_host_sync);
}
fn node(
id: u32,
kind: TokenFactNodeKind,
payload_offset: u64,
payload_bytes: u64,
) -> TokenFactNode {
TokenFactNode {
id,
kind,
payload_offset,
payload_bytes,
}
}
fn edge(from: u32, to: u32, kind: TokenFactEdgeKind) -> TokenFactEdge {
TokenFactEdge { from, to, kind }
}
fn frontier_node(id: u32, domain: FrontierDomain, active_items: u32) -> FrontierNode {
FrontierNode {
id,
domain,
active_items,
}
}
fn device() -> CudaMegakernelDeviceKey {
CudaMegakernelDeviceKey {
sm_major: 12,
sm_minor: 0,
warp_size: 32,
supports_grid_sync: true,
supports_tensor_cores: true,
max_workgroup_size: 1024,
}
}
}