ringkernel_procint/cuda/codegen.rs

//! CUDA code generation for process intelligence kernels.
//!
//! Uses ringkernel-cuda-codegen to transpile Rust DSL to CUDA C.
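//!
//! A minimal usage sketch (illustrative only; assumes the `cuda` feature is
//! enabled and transpilation succeeds):
//!
//! ```ignore
//! let kernels = generate_all_kernels()?;
//! for k in &kernels {
//!     println!("{} ({:?}): {} bytes of CUDA C", k.name, k.kernel_type, k.source.len());
//! }
//! ```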

#![allow(missing_docs)]

#[cfg(feature = "cuda")]
use ringkernel_cuda_codegen::{
    transpile_global_kernel, transpile_ring_kernel, transpile_stencil_kernel, RingKernelConfig,
    StencilConfig,
};

#[cfg(feature = "cuda")]
use syn::parse_quote;

/// Kernel source code holder.
#[derive(Debug, Clone)]
pub struct KernelSource {
    /// Kernel name.
    pub name: String,
    /// CUDA C source code.
    pub source: String,
    /// Entry point function name.
    pub entry_point: String,
    /// Type of kernel.
    pub kernel_type: KernelType,
}

/// Type of kernel.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelType {
    /// Global/batch kernel.
    Global,
    /// Stencil kernel with halo.
    Stencil,
    /// Ring kernel (persistent actor).
    Ring,
}

impl KernelSource {
    /// Create a new kernel source.
    pub fn new(
        name: impl Into<String>,
        source: impl Into<String>,
        kernel_type: KernelType,
    ) -> Self {
        let name = name.into();
        Self {
            entry_point: name.clone(),
            name,
            source: source.into(),
            kernel_type,
        }
    }

    /// Set custom entry point.
    pub fn with_entry_point(mut self, entry: impl Into<String>) -> Self {
        self.entry_point = entry.into();
        self
    }
}

/// Generate all CUDA kernels for process intelligence.
#[cfg(feature = "cuda")]
pub fn generate_all_kernels() -> Result<Vec<KernelSource>, String> {
    Ok(vec![
        generate_dfg_batch_kernel()?,
        generate_pattern_batch_kernel()?,
        generate_partial_order_stencil_kernel()?,
        generate_dfg_ring_kernel()?,
    ])
}

/// Generate DFG construction batch kernel.
///
/// Each thread processes one event pair to build edge frequencies.
#[cfg(feature = "cuda")]
pub fn generate_dfg_batch_kernel() -> Result<KernelSource, String> {
    let kernel_fn: syn::ItemFn = parse_quote! {
        fn dfg_construction(
            source_activities: &[u32],
            target_activities: &[u32],
            durations: &[u32],
            edge_frequencies: &mut [u32],
            edge_durations: &mut [u64],
            max_activities: i32,
            n: i32
        ) {
            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
            if idx >= n { return; }

            let source = source_activities[idx as usize];
            let target = target_activities[idx as usize];
            let duration = durations[idx as usize];

            // Calculate edge index
            let edge_idx = (source as i32 * max_activities + target as i32) as usize;

            // Atomic increment frequency
            atomic_add(&mut edge_frequencies[edge_idx], 1u32);

            // Atomic add duration for averaging later
            atomic_add(&mut edge_durations[edge_idx], duration as u64);
        }
    };

    let cuda_source = transpile_global_kernel(&kernel_fn)
        .map_err(|e| format!("Failed to transpile DFG batch kernel: {}", e))?;

    Ok(
        KernelSource::new("dfg_construction", cuda_source, KernelType::Global)
            .with_entry_point("dfg_construction"),
    )
}
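
/// Illustrative helper (not part of the original transpiler API): a 1-D launch
/// configuration matching the batch kernels above, which use one thread per
/// element and guard with `if idx >= n`.
///
/// Returns `(grid_size, block_size)` with the grid rounded up to cover `n`.
#[allow(dead_code)]
fn launch_config_1d(n: usize, block_size: usize) -> (usize, usize) {
    debug_assert!(block_size > 0);
    let grid_size = (n + block_size - 1) / block_size;
    (grid_size, block_size)
}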

/// Generate pattern detection batch kernel.
///
/// Each thread analyzes one DFG node for patterns.
#[cfg(feature = "cuda")]
pub fn generate_pattern_batch_kernel() -> Result<KernelSource, String> {
    let kernel_fn: syn::ItemFn = parse_quote! {
        fn pattern_detection(
            event_counts: &[u32],
            avg_durations: &[f32],
            incoming_counts: &[u16],
            outgoing_counts: &[u16],
            pattern_types: &mut [u8],
            pattern_confidences: &mut [f32],
            bottleneck_threshold: f32,
            duration_threshold: f32,
            n: i32
        ) {
            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
            if idx >= n { return; }

            let event_count = event_counts[idx as usize];
            let avg_duration = avg_durations[idx as usize];
            let incoming = incoming_counts[idx as usize] as f32;
            let outgoing = outgoing_counts[idx as usize] as f32;

            // Default: no pattern
            pattern_types[idx as usize] = 0u8;
            pattern_confidences[idx as usize] = 0.0f32;

            if event_count == 0 { return; }

            // Bottleneck detection: high incoming, low outgoing
            if incoming > bottleneck_threshold && outgoing < incoming * 0.5f32 {
                pattern_types[idx as usize] = 7u8; // Bottleneck
                pattern_confidences[idx as usize] = incoming / bottleneck_threshold;
                return;
            }

            // Long-running detection
            if avg_duration > duration_threshold {
                pattern_types[idx as usize] = 6u8; // LongRunning
                pattern_confidences[idx as usize] = avg_duration / duration_threshold;
            }
        }
    };

    let cuda_source = transpile_global_kernel(&kernel_fn)
        .map_err(|e| format!("Failed to transpile pattern batch kernel: {}", e))?;

    Ok(
        KernelSource::new("pattern_detection", cuda_source, KernelType::Global)
            .with_entry_point("pattern_detection"),
    )
}

/// Generate partial order stencil kernel.
///
/// Uses GridPos abstraction for pairwise event comparison.
#[cfg(feature = "cuda")]
pub fn generate_partial_order_stencil_kernel() -> Result<KernelSource, String> {
    let stencil_fn: syn::ItemFn = parse_quote! {
        fn partial_order_derive(
            start_times: &[u64],
            end_times: &[u64],
            precedence: &mut [u32],
            pos: GridPos
        ) {
            // Each cell (i, j) in the grid represents whether event i precedes event j
            let i_end = end_times[pos.y() as usize];
            let j_start = pos.east(start_times);

            // i precedes j if i ends before j starts
            if i_end <= j_start {
                precedence[pos.idx()] = 1u32;
            } else {
                precedence[pos.idx()] = 0u32;
            }
        }
    };

    let config = StencilConfig::new("partial_order")
        .with_tile_size(16, 16)
        .with_halo(0); // No halo needed for pairwise comparison

    let cuda_source = transpile_stencil_kernel(&stencil_fn, &config)
        .map_err(|e| format!("Failed to transpile partial order stencil: {}", e))?;

    Ok(
        KernelSource::new("partial_order_derive", cuda_source, KernelType::Stencil)
            .with_entry_point("partial_order_derive"),
    )
}

/// Generate DFG ring kernel (persistent actor).
///
/// Continuous event processing with HLC timestamps.
#[cfg(feature = "cuda")]
pub fn generate_dfg_ring_kernel() -> Result<KernelSource, String> {
    let handler_fn: syn::ItemFn = parse_quote! {
        fn process_event(ctx: &RingContext, event: &GpuObjectEvent) -> EdgeUpdate {
            let tid = ctx.global_thread_id();

            // Get HLC timestamp for ordering
            let ts = ctx.tick();

            ctx.sync_threads();

            // Create edge update response
            EdgeUpdate {
                source_activity: event.prev_activity,
                target_activity: event.activity_id,
                duration_ms: event.duration_ms,
                timestamp: ts.physical,
                thread_id: tid as u32,
            }
        }
    };

    let config = RingKernelConfig::new("dfg_processor")
        .with_block_size(256)
        .with_queue_capacity(4096)
        .with_hlc(true)
        .with_k2k(false); // No K2K needed for DFG

    let cuda_source = transpile_ring_kernel(&handler_fn, &config)
        .map_err(|e| format!("Failed to transpile DFG ring kernel: {}", e))?;

    Ok(
        KernelSource::new("ring_kernel_dfg", cuda_source, KernelType::Ring)
            .with_entry_point("ring_kernel_dfg"),
    )
}

/// Get CUDA type definitions header.
pub fn cuda_type_definitions() -> &'static str {
    r#"
// Process Intelligence GPU Types
// Auto-generated - matches Rust struct layouts

struct __align__(128) GpuObjectEvent {
    unsigned long long event_id;
    unsigned long long object_id;
    unsigned int activity_id;
    unsigned char event_type;
    unsigned char _padding1[3];
    unsigned long long physical_ms;
    unsigned int logical;
    unsigned int node_id;
    unsigned int resource_id;
    unsigned int duration_ms;
    unsigned int flags;
    unsigned int attributes[4];
    unsigned int object_type_id;
    unsigned int prev_activity;
    unsigned long long related_object_id;
    unsigned char _reserved[32];
};

struct __align__(64) GpuDFGNode {
    unsigned int activity_id;
    unsigned int event_count;
    unsigned long long total_duration_ms;
    unsigned int min_duration_ms;
    unsigned int max_duration_ms;
    float avg_duration_ms;
    float std_duration_ms;
    unsigned long long first_seen_ms;
    unsigned long long last_seen_ms;
    unsigned char is_start;
    unsigned char is_end;
    unsigned char flags;
    unsigned char _padding;
    unsigned short incoming_count;
    unsigned short outgoing_count;
};

struct __align__(64) GpuDFGEdge {
    unsigned int source_activity;
    unsigned int target_activity;
    unsigned int frequency;
    unsigned int min_duration_ms;
    float avg_duration_ms;
    unsigned int max_duration_ms;
    float probability;
    unsigned char flags;
    unsigned char _padding[3];
    float total_cost;
    unsigned long long first_seen_ms;
    unsigned long long last_seen_ms;
    unsigned char _reserved[16];
};

struct __align__(64) GpuPatternMatch {
    unsigned char pattern_type;
    unsigned char severity;
    unsigned char activity_count;
    unsigned char flags;
    unsigned int activity_ids[8];
    float confidence;
    unsigned int frequency;
    float avg_duration_ms;
    float impact;
    unsigned char _reserved[4];
};

struct EdgeUpdate {
    unsigned int source_activity;
    unsigned int target_activity;
    unsigned int duration_ms;
    unsigned long long timestamp;
    unsigned int thread_id;
};

// Pattern type constants
#define PATTERN_SEQUENCE 0
#define PATTERN_CHOICE 1
#define PATTERN_LOOP 2
#define PATTERN_PARALLEL 3
#define PATTERN_SKIP 4
#define PATTERN_REWORK 5
#define PATTERN_LONG_RUNNING 6
#define PATTERN_BOTTLENECK 7
#define PATTERN_ANOMALY 8

// Severity constants
#define SEVERITY_INFO 0
#define SEVERITY_WARNING 1
#define SEVERITY_CRITICAL 2
"#
}

// Fallback implementations when cuda feature is not enabled
#[cfg(not(feature = "cuda"))]
pub fn generate_all_kernels() -> Result<Vec<KernelSource>, String> {
    Err("CUDA feature not enabled. Build with --features cuda".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_dfg_batch_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_pattern_batch_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_partial_order_stencil_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

#[cfg(not(feature = "cuda"))]
pub fn generate_dfg_ring_kernel() -> Result<KernelSource, String> {
    Err("CUDA feature not enabled".to_string())
}

/// Generate DFG construction kernel (legacy static version).
pub fn generate_dfg_kernel() -> KernelSource {
    KernelSource::new("dfg_construction", STATIC_DFG_KERNEL, KernelType::Global)
        .with_entry_point("dfg_construction_kernel")
}

/// Generate pattern detection kernel (legacy static version).
pub fn generate_pattern_kernel() -> KernelSource {
    KernelSource::new(
        "pattern_detection",
        STATIC_PATTERN_KERNEL,
        KernelType::Global,
    )
    .with_entry_point("pattern_detection_kernel")
}

/// Generate partial order kernel (legacy static version).
pub fn generate_partial_order_kernel() -> KernelSource {
    KernelSource::new(
        "partial_order",
        STATIC_PARTIAL_ORDER_KERNEL,
        KernelType::Stencil,
    )
    .with_entry_point("partial_order_kernel")
}

/// Generate conformance kernel (legacy static version).
pub fn generate_conformance_kernel() -> KernelSource {
    KernelSource::new("conformance", STATIC_CONFORMANCE_KERNEL, KernelType::Global)
        .with_entry_point("conformance_kernel")
}

// Static kernel sources (fallback when transpiler isn't available)
// Note: extern "C" prevents C++ name mangling so we can find the function by name
const STATIC_DFG_KERNEL: &str = r#"
extern "C" __global__ void dfg_construction_kernel(
    const unsigned int* source_activities,
    const unsigned int* target_activities,
    const unsigned int* durations,
    unsigned int* edge_frequencies,
    unsigned long long* edge_durations,
    int max_activities,
    int n
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    unsigned int source = source_activities[idx];
    unsigned int target = target_activities[idx];
    unsigned int duration = durations[idx];

    int edge_idx = source * max_activities + target;
    atomicAdd(&edge_frequencies[edge_idx], 1);
    atomicAdd(&edge_durations[edge_idx], (unsigned long long)duration);
}
"#;

const STATIC_PATTERN_KERNEL: &str = r#"
extern "C" __global__ void pattern_detection_kernel(
    const unsigned int* event_counts,
    const float* avg_durations,
    const unsigned short* incoming_counts,
    const unsigned short* outgoing_counts,
    unsigned char* pattern_types,
    float* pattern_confidences,
    float bottleneck_threshold,
    float duration_threshold,
    int n
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    unsigned int event_count = event_counts[idx];
    float avg_duration = avg_durations[idx];
    float incoming = (float)incoming_counts[idx];
    float outgoing = (float)outgoing_counts[idx];

    pattern_types[idx] = 0;
    pattern_confidences[idx] = 0.0f;

    if (event_count == 0) return;

    // Bottleneck detection
    if (incoming > bottleneck_threshold && outgoing < incoming * 0.5f) {
        pattern_types[idx] = 7; // Bottleneck
        pattern_confidences[idx] = incoming / bottleneck_threshold;
        return;
    }

    // Long-running detection
    if (avg_duration > duration_threshold) {
        pattern_types[idx] = 6; // LongRunning
        pattern_confidences[idx] = avg_duration / duration_threshold;
    }
}
"#;

const STATIC_PARTIAL_ORDER_KERNEL: &str = r#"
extern "C" __global__ void partial_order_kernel(
    const unsigned long long* start_times,
    const unsigned long long* end_times,
    unsigned int* precedence,
    int width,
    int height
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    int idx = y * width + x;
    int x_next = min(x + 1, width - 1);

    unsigned long long i_end = end_times[y];
    unsigned long long j_start = start_times[x_next];

    precedence[idx] = (i_end <= j_start) ? 1 : 0;
}
"#;

const STATIC_CONFORMANCE_KERNEL: &str = r#"
extern "C" __global__ void conformance_kernel(
    const unsigned int* trace_activities,
    const int* trace_starts,
    const int* trace_lengths,
    const unsigned int* model_sources,
    const unsigned int* model_targets,
    int num_transitions,
    float* fitness_scores,
    int num_traces
) {
    int trace_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (trace_idx >= num_traces) return;

    int start = trace_starts[trace_idx];
    int len = trace_lengths[trace_idx];

    if (len <= 1) {
        fitness_scores[trace_idx] = 1.0f;
        return;
    }

    int valid_moves = 0;
    for (int i = 0; i < len - 1; i++) {
        unsigned int source = trace_activities[start + i];
        unsigned int target = trace_activities[start + i + 1];

        for (int t = 0; t < num_transitions; t++) {
            if (model_sources[t] == source && model_targets[t] == target) {
                valid_moves++;
                break;
            }
        }
    }

    fitness_scores[trace_idx] = (float)valid_moves / (float)(len - 1);
}
"#;

#[cfg(test)]
mod tests {
    use super::*;

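    /// Minimal check of the `KernelSource` builder: the entry point defaults
    /// to the kernel name and can be overridden via `with_entry_point`.
    #[test]
    fn test_kernel_source_builder() {
        let default = KernelSource::new("demo", "// source", KernelType::Global);
        assert_eq!(default.entry_point, "demo");

        let custom = KernelSource::new("demo", "// source", KernelType::Global)
            .with_entry_point("demo_kernel");
        assert_eq!(custom.entry_point, "demo_kernel");
        assert_eq!(custom.name, "demo");
    }
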
    #[test]
    fn test_dfg_kernel_generation() {
        let source = generate_dfg_kernel();
        assert!(source.source.contains("dfg_construction_kernel"));
        assert!(source.source.contains("atomicAdd"));
    }

    #[test]
    fn test_pattern_kernel_generation() {
        let source = generate_pattern_kernel();
        assert!(source.source.contains("pattern_detection_kernel"));
    }

    #[test]
    fn test_partial_order_kernel_generation() {
        let source = generate_partial_order_kernel();
        assert!(source.source.contains("partial_order_kernel"));
        assert!(source.source.contains("precedence"));
    }

    #[test]
    fn test_conformance_kernel_generation() {
        let source = generate_conformance_kernel();
        assert!(source.source.contains("conformance_kernel"));
        assert!(source.source.contains("fitness"));
    }

    #[test]
    fn test_cuda_type_definitions() {
        let defs = cuda_type_definitions();
        assert!(defs.contains("GpuObjectEvent"));
        assert!(defs.contains("GpuDFGNode"));
        assert!(defs.contains("GpuPatternMatch"));
        assert!(defs.contains("__align__(64)"));
    }
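
    /// Illustrative test only: assumes the shared type header is prepended to
    /// a generated kernel before compilation; the real build pipeline may
    /// assemble sources differently.
    #[test]
    fn test_type_definitions_prepend_to_kernel() {
        let kernel = generate_dfg_kernel();
        let full_source = format!("{}\n{}", cuda_type_definitions(), kernel.source);
        assert!(full_source.contains("GpuDFGEdge"));
        assert!(full_source.contains("dfg_construction_kernel"));
    }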

    #[cfg(feature = "cuda")]
    #[test]
    fn test_transpiled_dfg_kernel() {
        let result = generate_dfg_batch_kernel();
        match result {
            Ok(kernel) => {
                println!(
                    "Generated DFG batch kernel ({} bytes):",
                    kernel.source.len()
                );
                println!("{}", kernel.source);
                assert!(kernel.source.contains("__global__"));
            }
            Err(e) => {
                println!("DFG kernel generation pending: {}", e);
            }
        }
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_transpiled_pattern_kernel() {
        let result = generate_pattern_batch_kernel();
        match result {
            Ok(kernel) => {
                println!(
                    "Generated pattern batch kernel ({} bytes):",
                    kernel.source.len()
                );
                println!("{}", kernel.source);
                assert!(kernel.source.contains("__global__"));
            }
            Err(e) => {
                println!("Pattern kernel generation pending: {}", e);
            }
        }
    }
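
    /// Sketch of an end-to-end check for `generate_all_kernels`: when
    /// transpilation succeeds it should yield the four kernels assembled
    /// above. Errors are only logged, mirroring the other cuda tests.
    #[cfg(feature = "cuda")]
    #[test]
    fn test_generate_all_kernels() {
        match generate_all_kernels() {
            Ok(kernels) => {
                assert_eq!(kernels.len(), 4);
                assert!(kernels.iter().any(|k| k.kernel_type == KernelType::Ring));
            }
            Err(e) => {
                println!("Kernel generation pending: {}", e);
            }
        }
    }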
}