trueno/brick/exec_graph/node/execution.rs
1#![allow(missing_docs)]
2//! Execution Path Graph Types (PAR-201)
3//!
4//! Node, edge, and transfer types for the execution hierarchy.
5
6use super::BrickId;
7
/// Identifier of a node inside the execution graph.
///
/// A thin newtype over `u32`; the wrapped value is public, so callers can
/// read it directly via `.0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ExecutionNodeId(pub u32);
11
12impl ExecutionNodeId {
13 /// Maximum node ID budget (100k nodes).
14 pub const MAX_BUDGET: u32 = 100_000;
15
16 /// Validate this node ID is within budget.
17 #[inline]
18 pub fn validate(self) -> bool {
19 debug_assert!(
20 self.0 < Self::MAX_BUDGET,
21 "CB-BUDGET: node id {} exceeds max budget {}",
22 self.0,
23 Self::MAX_BUDGET
24 );
25 self.0 < Self::MAX_BUDGET
26 }
27}
28
29/// Execution graph node types.
30///
31/// PAR-201: Represents different levels of the execution hierarchy.
32#[derive(Debug, Clone)]
33pub enum ExecutionNode {
34 /// High-level brick (BrickId from v2)
35 Brick { id: BrickId, timing_ns: u64, elements: u64 },
36 /// GPU kernel launch
37 Kernel {
38 name: String,
39 /// FNV-1a hash of PTX source for identity
40 ptx_hash: u64,
41 /// Grid dimensions (blocks)
42 grid: (u32, u32, u32),
43 /// Block dimensions (threads)
44 block: (u32, u32, u32),
45 /// Shared memory bytes
46 shared_mem: u32,
47 /// Kernel execution time in nanoseconds (Phase 9: for CPA)
48 timing_ns: Option<u64>,
49 /// Arithmetic intensity (FLOPs/byte) for roofline analysis (Phase 9)
50 arithmetic_intensity: Option<f32>,
51 /// Achieved throughput in TFLOP/s (Phase 9)
52 achieved_tflops: Option<f32>,
53 },
54 /// Memory transfer operation (Phase 9: data movement topology)
55 Transfer {
56 /// Source location description
57 src: String,
58 /// Destination location description
59 dst: String,
60 /// Bytes transferred
61 bytes: u64,
62 /// Transfer direction
63 direction: TransferDirection,
64 /// Transfer time in nanoseconds
65 timing_ns: Option<u64>,
66 },
67 /// Rust function (from DWARF or manual annotation)
68 Function { name: String, file: Option<String>, line: Option<u32> },
69 /// Transformer layer grouping
70 Layer { index: u32 },
71 /// Phase 11 (E.9.4): Async task metrics for poll efficiency tracking
72 AsyncTask {
73 /// Task name for identification
74 name: String,
75 /// Number of times poll() was called
76 poll_count: u64,
77 /// Number of times poll() returned Pending
78 yield_count: u64,
79 /// Total time spent in poll() (nanoseconds)
80 total_poll_ns: u64,
81 },
82}
83
84impl ExecutionNode {
85 /// Get the display name of this node.
86 pub fn name(&self) -> String {
87 match self {
88 Self::Brick { id, .. } => id.name().to_string(),
89 Self::Kernel { name, .. } => name.clone(),
90 Self::Function { name, .. } => name.clone(),
91 Self::Layer { index } => format!("Layer{}", index),
92 Self::Transfer { src, dst, direction, .. } => {
93 let dir = match direction {
94 TransferDirection::H2D => "H2D",
95 TransferDirection::D2H => "D2H",
96 TransferDirection::D2D => "D2D",
97 };
98 format!("{}:{}->{}", dir, src, dst)
99 }
100 Self::AsyncTask { name, .. } => name.clone(),
101 }
102 }
103
104 /// Check if this is a kernel node.
105 pub fn is_kernel(&self) -> bool {
106 matches!(self, Self::Kernel { .. })
107 }
108
109 /// Check if this is a brick node.
110 pub fn is_brick(&self) -> bool {
111 matches!(self, Self::Brick { .. })
112 }
113
114 /// Check if this is a transfer node.
115 pub fn is_transfer(&self) -> bool {
116 matches!(self, Self::Transfer { .. })
117 }
118
119 /// Get timing if available (bricks, kernels, and transfers).
120 pub fn timing_ns(&self) -> Option<u64> {
121 match self {
122 Self::Brick { timing_ns, .. } => Some(*timing_ns),
123 Self::Kernel { timing_ns, .. } => *timing_ns,
124 Self::Transfer { timing_ns, .. } => *timing_ns,
125 _ => None,
126 }
127 }
128
129 /// Get PTX hash if available (kernels only).
130 pub fn ptx_hash(&self) -> Option<u64> {
131 match self {
132 Self::Kernel { ptx_hash, .. } => Some(*ptx_hash),
133 _ => None,
134 }
135 }
136
137 /// Get arithmetic intensity if available (kernels only, Phase 9).
138 pub fn arithmetic_intensity(&self) -> Option<f32> {
139 match self {
140 Self::Kernel { arithmetic_intensity, .. } => *arithmetic_intensity,
141 _ => None,
142 }
143 }
144
145 /// Get achieved TFLOP/s if available (kernels only, Phase 9).
146 pub fn achieved_tflops(&self) -> Option<f32> {
147 match self {
148 Self::Kernel { achieved_tflops, .. } => *achieved_tflops,
149 _ => None,
150 }
151 }
152
153 /// Get transfer bytes if available (transfers only, Phase 9).
154 pub fn transfer_bytes(&self) -> Option<u64> {
155 match self {
156 Self::Transfer { bytes, .. } => Some(*bytes),
157 _ => None,
158 }
159 }
160}
161
162/// Edge types in execution graph.
163///
164/// PAR-201: Describes relationships between execution nodes.
165/// Phase 9 (E.7.12): Added DependsOn and Transfer for advanced profiling.
166#[derive(Debug, Clone, PartialEq)]
167pub enum EdgeType {
168 /// Function calls function
169 Calls,
170 /// Brick contains sub-operations
171 Contains,
172 /// Function launches GPU kernel
173 Launches,
174 /// Temporal sequence (A happens before B)
175 Sequence,
176 /// Dependency edge for critical path analysis (CUDA events, stream sync)
177 /// PAR-201 Phase 9: CPA requires tracking true dependencies vs containment
178 DependsOn,
179 /// Data transfer edge with byte count (H2D/D2H/D2D)
180 /// PAR-201 Phase 9: For data movement topology and ping-pong detection
181 Transfer {
182 /// Bytes transferred
183 bytes: u64,
184 /// Transfer direction
185 direction: TransferDirection,
186 },
187}
188
/// Direction of a memory transfer.
///
/// PAR-201 Phase 9: used with `EdgeType::Transfer` for data movement analysis.
///
/// Derives `Hash` alongside `Eq` so a direction can key a `HashMap` or
/// `HashSet` (e.g. to bucket transfers by direction).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TransferDirection {
    /// Host to Device.
    H2D,
    /// Device to Host.
    D2H,
    /// Device to Device.
    D2D,
}
201
202/// An edge in the execution graph.
203#[derive(Debug, Clone)]
204pub struct ExecutionEdge {
205 /// Source node ID
206 pub src: ExecutionNodeId,
207 /// Destination node ID
208 pub dst: ExecutionNodeId,
209 /// Edge type
210 pub edge_type: EdgeType,
211 /// Optional weight (e.g., call count, timing)
212 pub weight: f32,
213}