#![allow(missing_docs)]

//! CUDA kernel generation for process intelligence: kernels are transpiled
//! from Rust definitions when the `cuda` feature is enabled, with handwritten
//! CUDA C fallbacks available unconditionally.

#[cfg(feature = "cuda")]
use ringkernel_cuda_codegen::{
    transpile_global_kernel, transpile_ring_kernel, transpile_stencil_kernel, RingKernelConfig,
    StencilConfig,
};

#[cfg(feature = "cuda")]
use syn::parse_quote;

/// A generated CUDA kernel and the metadata needed to launch it.
#[derive(Debug, Clone)]
pub struct KernelSource {
    /// Logical kernel name.
    pub name: String,
    /// CUDA C source code.
    pub source: String,
    /// Entry point symbol to launch.
    pub entry_point: String,
    /// Kind of kernel this source implements.
    pub kernel_type: KernelType,
}

/// Category of generated kernel.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelType {
    /// One-shot batch kernel over a flat index space.
    Global,
    /// Tiled stencil kernel over a 2D grid.
    Stencil,
    /// Persistent ring kernel driven by a message queue.
    Ring,
}
impl KernelSource {
    /// Create a kernel source; the entry point defaults to the kernel name.
    pub fn new(
        name: impl Into<String>,
        source: impl Into<String>,
        kernel_type: KernelType,
    ) -> Self {
        let name = name.into();
        Self {
            entry_point: name.clone(),
            name,
            source: source.into(),
            kernel_type,
        }
    }

    /// Override the entry point symbol for this kernel.
    pub fn with_entry_point(mut self, entry: impl Into<String>) -> Self {
        self.entry_point = entry.into();
        self
    }
}

/// Generate all process-intelligence kernels by transpiling the Rust kernel
/// definitions below.
#[cfg(feature = "cuda")]
pub fn generate_all_kernels() -> Result<Vec<KernelSource>, String> {
    let mut kernels = Vec::new();

    kernels.push(generate_dfg_batch_kernel()?);
    kernels.push(generate_pattern_batch_kernel()?);
    kernels.push(generate_partial_order_stencil_kernel()?);
    kernels.push(generate_dfg_ring_kernel()?);

    Ok(kernels)
}
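
// A minimal sketch of how a caller might assemble complete translation units:
// the shared type definitions are prepended to each generated kernel before the
// source is handed to whatever CUDA compiler the host runtime uses. The function
// name and the (entry point, source) pairing here are illustrative only, not an
// API this module exposes.
#[allow(dead_code)]
fn example_assemble_translation_units() -> Result<Vec<(String, String)>, String> {
    let header = cuda_type_definitions();
    Ok(generate_all_kernels()?
        .into_iter()
        // Pair each kernel's entry point with a self-contained source string.
        .map(|k| (k.entry_point.clone(), format!("{}\n{}", header, k.source)))
        .collect())
}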

/// Generate the DFG (directly-follows graph) construction kernel.
///
/// One thread per observed transition; edge frequency and total duration are
/// accumulated atomically into a dense `max_activities x max_activities` matrix.
#[cfg(feature = "cuda")]
pub fn generate_dfg_batch_kernel() -> Result<KernelSource, String> {
    let kernel_fn: syn::ItemFn = parse_quote! {
        fn dfg_construction(
            source_activities: &[u32],
            target_activities: &[u32],
            durations: &[u32],
            edge_frequencies: &mut [u32],
            edge_durations: &mut [u64],
            max_activities: i32,
            n: i32
        ) {
            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
            if idx >= n { return; }

            let source = source_activities[idx as usize];
            let target = target_activities[idx as usize];
            let duration = durations[idx as usize];

            // Flat index into the dense edge matrix.
            let edge_idx = (source as i32 * max_activities + target as i32) as usize;

            // Count this transition.
            atomic_add(&mut edge_frequencies[edge_idx], 1u32);

            // Accumulate total duration for later averaging.
            atomic_add(&mut edge_durations[edge_idx], duration as u64);
        }
    };

    let cuda_source = transpile_global_kernel(&kernel_fn)
        .map_err(|e| format!("Failed to transpile DFG batch kernel: {}", e))?;

    Ok(
        KernelSource::new("dfg_construction", cuda_source, KernelType::Global)
            .with_entry_point("dfg_construction"),
    )
}
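
// Launch-geometry note (illustrative, not part of the generated source): the
// batch kernels above assume a 1-D launch whose grid covers all `n` elements,
// with the `idx >= n` guard absorbing the final partial block. The helper below
// is a hypothetical sketch of that ceiling division.
#[allow(dead_code)]
fn example_grid_dim(n: u32, block_dim: u32) -> u32 {
    // Ceiling division: enough blocks so that grid * block >= n.
    (n + block_dim - 1) / block_dim
}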

/// Generate the pattern detection kernel.
///
/// One thread per DFG node; flags bottlenecks (type 7) and long-running
/// activities (type 6) together with a confidence score.
#[cfg(feature = "cuda")]
pub fn generate_pattern_batch_kernel() -> Result<KernelSource, String> {
    let kernel_fn: syn::ItemFn = parse_quote! {
        fn pattern_detection(
            event_counts: &[u32],
            avg_durations: &[f32],
            incoming_counts: &[u16],
            outgoing_counts: &[u16],
            pattern_types: &mut [u8],
            pattern_confidences: &mut [f32],
            bottleneck_threshold: f32,
            duration_threshold: f32,
            n: i32
        ) {
            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
            if idx >= n { return; }

            let event_count = event_counts[idx as usize];
            let avg_duration = avg_durations[idx as usize];
            let incoming = incoming_counts[idx as usize] as f32;
            let outgoing = outgoing_counts[idx as usize] as f32;

            // Default: no pattern detected.
            pattern_types[idx as usize] = 0u8;
            pattern_confidences[idx as usize] = 0.0f32;

            if event_count == 0 { return; }

            if incoming > bottleneck_threshold && outgoing < incoming * 0.5f32 {
                // Bottleneck: high fan-in with less than half the fan-out.
                pattern_types[idx as usize] = 7u8;
                pattern_confidences[idx as usize] = incoming / bottleneck_threshold;
                return;
            }

            if avg_duration > duration_threshold {
                // Long-running activity.
                pattern_types[idx as usize] = 6u8;
                pattern_confidences[idx as usize] = avg_duration / duration_threshold;
            }
        }
    };

    let cuda_source = transpile_global_kernel(&kernel_fn)
        .map_err(|e| format!("Failed to transpile pattern batch kernel: {}", e))?;

    Ok(
        KernelSource::new("pattern_detection", cuda_source, KernelType::Global)
            .with_entry_point("pattern_detection"),
    )
}
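
// The `6u8` and `7u8` literals written by the pattern kernel correspond to the
// PATTERN_LONG_RUNNING and PATTERN_BOTTLENECK defines emitted by
// `cuda_type_definitions()`. Mirroring just those two on the Rust side is an
// illustrative sketch; these constant names are not used elsewhere.
#[allow(dead_code)]
const PATTERN_LONG_RUNNING: u8 = 6;
#[allow(dead_code)]
const PATTERN_BOTTLENECK: u8 = 7;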

/// Generate the partial-order derivation stencil kernel.
///
/// Cell (y, x) records whether event `y` finishes before the event east of
/// `x` starts, producing a precedence matrix over events.
#[cfg(feature = "cuda")]
pub fn generate_partial_order_stencil_kernel() -> Result<KernelSource, String> {
    let stencil_fn: syn::ItemFn = parse_quote! {
        fn partial_order_derive(
            start_times: &[u64],
            end_times: &[u64],
            precedence: &mut [u32],
            pos: GridPos
        ) {
            let i_end = end_times[pos.y() as usize];
            // Start time of the neighboring event to the east.
            let j_start = pos.east(start_times);

            if i_end <= j_start {
                // Event i completes before event j starts: i precedes j.
                precedence[pos.idx()] = 1u32;
            } else {
                precedence[pos.idx()] = 0u32;
            }
        }
    };

    let config = StencilConfig::new("partial_order")
        .with_tile_size(16, 16)
        .with_halo(0);

    let cuda_source = transpile_stencil_kernel(&stencil_fn, &config)
        .map_err(|e| format!("Failed to transpile partial order stencil: {}", e))?;

    Ok(
        KernelSource::new("partial_order_derive", cuda_source, KernelType::Stencil)
            .with_entry_point("partial_order_derive"),
    )
}

/// Generate the persistent ring kernel that streams DFG edge updates.
///
/// Each handler invocation consumes one `GpuObjectEvent` and emits an
/// `EdgeUpdate` stamped with the current hybrid logical clock time.
#[cfg(feature = "cuda")]
pub fn generate_dfg_ring_kernel() -> Result<KernelSource, String> {
    let handler_fn: syn::ItemFn = parse_quote! {
        fn process_event(ctx: &RingContext, event: &GpuObjectEvent) -> EdgeUpdate {
            let tid = ctx.global_thread_id();

            // Advance the hybrid logical clock and capture a timestamp.
            let ts = ctx.tick();

            ctx.sync_threads();

            // Emit the edge update derived from this event.
            EdgeUpdate {
                source_activity: event.prev_activity,
                target_activity: event.activity_id,
                duration_ms: event.duration_ms,
                timestamp: ts.physical,
                thread_id: tid as u32,
            }
        }
    };

    let config = RingKernelConfig::new("dfg_processor")
        .with_block_size(256)
        .with_queue_capacity(4096)
        .with_hlc(true)
        .with_k2k(false);

    let cuda_source = transpile_ring_kernel(&handler_fn, &config)
        .map_err(|e| format!("Failed to transpile DFG ring kernel: {}", e))?;

    Ok(
        KernelSource::new("ring_kernel_dfg", cuda_source, KernelType::Ring)
            .with_entry_point("ring_kernel_dfg"),
    )
}

/// Shared CUDA type definitions prepended to every generated kernel.
pub fn cuda_type_definitions() -> &'static str {
    r#"
// Process Intelligence GPU Types
// Auto-generated - matches Rust struct layouts

struct __align__(128) GpuObjectEvent {
    unsigned long long event_id;
    unsigned long long object_id;
    unsigned int activity_id;
    unsigned char event_type;
    unsigned char _padding1[3];
    unsigned long long physical_ms;
    unsigned int logical;
    unsigned int node_id;
    unsigned int resource_id;
    unsigned int duration_ms;
    unsigned int flags;
    unsigned int attributes[4];
    unsigned int object_type_id;
    unsigned int prev_activity;
    unsigned long long related_object_id;
    unsigned char _reserved[32];
};

struct __align__(64) GpuDFGNode {
    unsigned int activity_id;
    unsigned int event_count;
    unsigned long long total_duration_ms;
    unsigned int min_duration_ms;
    unsigned int max_duration_ms;
    float avg_duration_ms;
    float std_duration_ms;
    unsigned long long first_seen_ms;
    unsigned long long last_seen_ms;
    unsigned char is_start;
    unsigned char is_end;
    unsigned char flags;
    unsigned char _padding;
    unsigned short incoming_count;
    unsigned short outgoing_count;
};

struct __align__(64) GpuDFGEdge {
    unsigned int source_activity;
    unsigned int target_activity;
    unsigned int frequency;
    unsigned int min_duration_ms;
    float avg_duration_ms;
    unsigned int max_duration_ms;
    float probability;
    unsigned char flags;
    unsigned char _padding[3];
    float total_cost;
    unsigned long long first_seen_ms;
    unsigned long long last_seen_ms;
    unsigned char _reserved[16];
};

struct __align__(64) GpuPatternMatch {
    unsigned char pattern_type;
    unsigned char severity;
    unsigned char activity_count;
    unsigned char flags;
    unsigned int activity_ids[8];
    float confidence;
    unsigned int frequency;
    float avg_duration_ms;
    float impact;
    unsigned char _reserved[4];
};

struct EdgeUpdate {
    unsigned int source_activity;
    unsigned int target_activity;
    unsigned int duration_ms;
    unsigned long long timestamp;
    unsigned int thread_id;
};

// Pattern type constants
#define PATTERN_SEQUENCE 0
#define PATTERN_CHOICE 1
#define PATTERN_LOOP 2
#define PATTERN_PARALLEL 3
#define PATTERN_SKIP 4
#define PATTERN_REWORK 5
#define PATTERN_LONG_RUNNING 6
#define PATTERN_BOTTLENECK 7
#define PATTERN_ANOMALY 8

// Severity constants
#define SEVERITY_INFO 0
#define SEVERITY_WARNING 1
#define SEVERITY_CRITICAL 2
"#
}

/// Stub: the `cuda` feature is not enabled.
#[cfg(not(feature = "cuda"))]
pub fn generate_all_kernels() -> Result<Vec<KernelSource>, String> {
    Err("CUDA feature not enabled. Build with --features cuda".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_dfg_batch_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_pattern_batch_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_partial_order_stencil_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_dfg_ring_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

/// Static fallback: DFG construction kernel from handwritten CUDA C.
pub fn generate_dfg_kernel() -> KernelSource {
    KernelSource::new("dfg_construction", STATIC_DFG_KERNEL, KernelType::Global)
        .with_entry_point("dfg_construction_kernel")
}

/// Static fallback: pattern detection kernel from handwritten CUDA C.
pub fn generate_pattern_kernel() -> KernelSource {
    KernelSource::new(
        "pattern_detection",
        STATIC_PATTERN_KERNEL,
        KernelType::Global,
    )
    .with_entry_point("pattern_detection_kernel")
}

/// Static fallback: partial-order derivation kernel from handwritten CUDA C.
pub fn generate_partial_order_kernel() -> KernelSource {
    KernelSource::new(
        "partial_order",
        STATIC_PARTIAL_ORDER_KERNEL,
        KernelType::Stencil,
    )
    .with_entry_point("partial_order_kernel")
}

/// Static fallback: conformance checking kernel from handwritten CUDA C.
pub fn generate_conformance_kernel() -> KernelSource {
    KernelSource::new("conformance", STATIC_CONFORMANCE_KERNEL, KernelType::Global)
        .with_entry_point("conformance_kernel")
}

const STATIC_DFG_KERNEL: &str = r#"
extern "C" __global__ void dfg_construction_kernel(
    const unsigned int* source_activities,
    const unsigned int* target_activities,
    const unsigned int* durations,
    unsigned int* edge_frequencies,
    unsigned long long* edge_durations,
    int max_activities,
    int n
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    unsigned int source = source_activities[idx];
    unsigned int target = target_activities[idx];
    unsigned int duration = durations[idx];

    int edge_idx = source * max_activities + target;
    atomicAdd(&edge_frequencies[edge_idx], 1);
    atomicAdd(&edge_durations[edge_idx], (unsigned long long)duration);
}
"#;

const STATIC_PATTERN_KERNEL: &str = r#"
extern "C" __global__ void pattern_detection_kernel(
    const unsigned int* event_counts,
    const float* avg_durations,
    const unsigned short* incoming_counts,
    const unsigned short* outgoing_counts,
    unsigned char* pattern_types,
    float* pattern_confidences,
    float bottleneck_threshold,
    float duration_threshold,
    int n
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    unsigned int event_count = event_counts[idx];
    float avg_duration = avg_durations[idx];
    float incoming = (float)incoming_counts[idx];
    float outgoing = (float)outgoing_counts[idx];

    pattern_types[idx] = 0;
    pattern_confidences[idx] = 0.0f;

    if (event_count == 0) return;

    // Bottleneck detection
    if (incoming > bottleneck_threshold && outgoing < incoming * 0.5f) {
        pattern_types[idx] = 7; // Bottleneck
        pattern_confidences[idx] = incoming / bottleneck_threshold;
        return;
    }

    // Long-running detection
    if (avg_duration > duration_threshold) {
        pattern_types[idx] = 6; // LongRunning
        pattern_confidences[idx] = avg_duration / duration_threshold;
    }
}
"#;

const STATIC_PARTIAL_ORDER_KERNEL: &str = r#"
extern "C" __global__ void partial_order_kernel(
    const unsigned long long* start_times,
    const unsigned long long* end_times,
    unsigned int* precedence,
    int width,
    int height
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    int idx = y * width + x;
    int x_next = min(x + 1, width - 1);

    unsigned long long i_end = end_times[y];
    unsigned long long j_start = start_times[x_next];

    precedence[idx] = (i_end <= j_start) ? 1 : 0;
}
"#;

const STATIC_CONFORMANCE_KERNEL: &str = r#"
extern "C" __global__ void conformance_kernel(
    const unsigned int* trace_activities,
    const int* trace_starts,
    const int* trace_lengths,
    const unsigned int* model_sources,
    const unsigned int* model_targets,
    int num_transitions,
    float* fitness_scores,
    int num_traces
) {
    int trace_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (trace_idx >= num_traces) return;

    int start = trace_starts[trace_idx];
    int len = trace_lengths[trace_idx];

    if (len <= 1) {
        fitness_scores[trace_idx] = 1.0f;
        return;
    }

    int valid_moves = 0;
    for (int i = 0; i < len - 1; i++) {
        unsigned int source = trace_activities[start + i];
        unsigned int target = trace_activities[start + i + 1];

        for (int t = 0; t < num_transitions; t++) {
            if (model_sources[t] == source && model_targets[t] == target) {
                valid_moves++;
                break;
            }
        }
    }

    fitness_scores[trace_idx] = (float)valid_moves / (float)(len - 1);
}
"#;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dfg_kernel_generation() {
        let source = generate_dfg_kernel();
        assert!(source.source.contains("dfg_construction_kernel"));
        assert!(source.source.contains("atomicAdd"));
    }

    #[test]
    fn test_pattern_kernel_generation() {
        let source = generate_pattern_kernel();
        assert!(source.source.contains("pattern_detection_kernel"));
    }

    #[test]
    fn test_partial_order_kernel_generation() {
        let source = generate_partial_order_kernel();
        assert!(source.source.contains("partial_order_kernel"));
        assert!(source.source.contains("precedence"));
    }

    #[test]
    fn test_conformance_kernel_generation() {
        let source = generate_conformance_kernel();
        assert!(source.source.contains("conformance_kernel"));
        assert!(source.source.contains("fitness"));
    }

    #[test]
    fn test_cuda_type_definitions() {
        let defs = cuda_type_definitions();
        assert!(defs.contains("GpuObjectEvent"));
        assert!(defs.contains("GpuDFGNode"));
        assert!(defs.contains("GpuPatternMatch"));
        assert!(defs.contains("__align__(64)"));
    }
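
    // Added check (covers behavior already defined above): `KernelSource::new`
    // defaults the entry point to the kernel name, and `with_entry_point`
    // overrides it.
    #[test]
    fn test_kernel_source_entry_point_default_and_override() {
        let default = KernelSource::new("demo", "src", KernelType::Global);
        assert_eq!(default.entry_point, "demo");

        let overridden = KernelSource::new("demo", "src", KernelType::Global)
            .with_entry_point("demo_kernel");
        assert_eq!(overridden.entry_point, "demo_kernel");
        assert_eq!(overridden.kernel_type, KernelType::Global);
    }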

    #[cfg(feature = "cuda")]
    #[test]
    fn test_transpiled_dfg_kernel() {
        let result = generate_dfg_batch_kernel();
        match result {
            Ok(kernel) => {
                println!(
                    "Generated DFG batch kernel ({} bytes):",
                    kernel.source.len()
                );
                println!("{}", kernel.source);
                assert!(kernel.source.contains("__global__"));
            }
            Err(e) => {
                println!("DFG kernel generation pending: {}", e);
            }
        }
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_transpiled_pattern_kernel() {
        let result = generate_pattern_batch_kernel();
        match result {
            Ok(kernel) => {
                println!(
                    "Generated pattern batch kernel ({} bytes):",
                    kernel.source.len()
                );
                println!("{}", kernel.source);
                assert!(kernel.source.contains("__global__"));
            }
            Err(e) => {
                println!("Pattern kernel generation pending: {}", e);
            }
        }
    }
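
    // Added check: under the `cuda` feature, `generate_all_kernels` should
    // yield the four kernels registered above (two batch, one stencil, one
    // ring). Transpilation may legitimately fail while the code generator is
    // still evolving, so an Err is reported rather than asserted on, matching
    // the style of the other transpiler tests.
    #[cfg(feature = "cuda")]
    #[test]
    fn test_generate_all_kernels_count() {
        match generate_all_kernels() {
            Ok(kernels) => {
                assert_eq!(kernels.len(), 4);
                assert!(kernels.iter().any(|k| k.kernel_type == KernelType::Stencil));
                assert!(kernels.iter().any(|k| k.kernel_type == KernelType::Ring));
            }
            Err(e) => {
                println!("Kernel generation pending: {}", e);
            }
        }
    }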
}