ringkernel_procint/cuda/
codegen.rs

1//! CUDA code generation for process intelligence kernels.
2//!
3//! Uses ringkernel-cuda-codegen to transpile Rust DSL to CUDA C.
4
5#![allow(missing_docs)]
6
7#[cfg(feature = "cuda")]
8use ringkernel_cuda_codegen::{
9    transpile_global_kernel, transpile_ring_kernel, transpile_stencil_kernel, RingKernelConfig,
10    StencilConfig,
11};
12
13#[cfg(feature = "cuda")]
14use syn::parse_quote;
15
16/// Kernel source code holder.
17#[derive(Debug, Clone)]
18pub struct KernelSource {
19    /// Kernel name.
20    pub name: String,
21    /// CUDA C source code.
22    pub source: String,
23    /// Entry point function name.
24    pub entry_point: String,
25    /// Type of kernel.
26    pub kernel_type: KernelType,
27}
28
29/// Type of kernel.
30#[derive(Debug, Clone, Copy, PartialEq, Eq)]
31pub enum KernelType {
32    /// Global/batch kernel.
33    Global,
34    /// Stencil kernel with halo.
35    Stencil,
36    /// Ring kernel (persistent actor).
37    Ring,
38}
39
40impl KernelSource {
41    /// Create a new kernel source.
42    pub fn new(
43        name: impl Into<String>,
44        source: impl Into<String>,
45        kernel_type: KernelType,
46    ) -> Self {
47        let name = name.into();
48        Self {
49            entry_point: name.clone(),
50            name,
51            source: source.into(),
52            kernel_type,
53        }
54    }
55
56    /// Set custom entry point.
57    pub fn with_entry_point(mut self, entry: impl Into<String>) -> Self {
58        self.entry_point = entry.into();
59        self
60    }
61}
62
63/// Generate all CUDA kernels for process intelligence.
64#[cfg(feature = "cuda")]
65pub fn generate_all_kernels() -> Result<Vec<KernelSource>, String> {
66    let mut kernels = Vec::new();
67
68    kernels.push(generate_dfg_batch_kernel()?);
69    kernels.push(generate_pattern_batch_kernel()?);
70    kernels.push(generate_partial_order_stencil_kernel()?);
71    kernels.push(generate_dfg_ring_kernel()?);
72
73    Ok(kernels)
74}
75
76/// Generate DFG construction batch kernel.
77///
78/// Each thread processes one event pair to build edge frequencies.
79#[cfg(feature = "cuda")]
80pub fn generate_dfg_batch_kernel() -> Result<KernelSource, String> {
81    let kernel_fn: syn::ItemFn = parse_quote! {
82        fn dfg_construction(
83            source_activities: &[u32],
84            target_activities: &[u32],
85            durations: &[u32],
86            edge_frequencies: &mut [u32],
87            edge_durations: &mut [u64],
88            max_activities: i32,
89            n: i32
90        ) {
91            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
92            if idx >= n { return; }
93
94            let source = source_activities[idx as usize];
95            let target = target_activities[idx as usize];
96            let duration = durations[idx as usize];
97
98            // Calculate edge index
99            let edge_idx = (source as i32 * max_activities + target as i32) as usize;
100
101            // Atomic increment frequency
102            atomic_add(&mut edge_frequencies[edge_idx], 1u32);
103
104            // Atomic add duration for averaging later
105            atomic_add(&mut edge_durations[edge_idx], duration as u64);
106        }
107    };
108
109    let cuda_source = transpile_global_kernel(&kernel_fn)
110        .map_err(|e| format!("Failed to transpile DFG batch kernel: {}", e))?;
111
112    Ok(
113        KernelSource::new("dfg_construction", cuda_source, KernelType::Global)
114            .with_entry_point("dfg_construction"),
115    )
116}
117
118/// Generate pattern detection batch kernel.
119///
120/// Each thread analyzes one DFG node for patterns.
121#[cfg(feature = "cuda")]
122pub fn generate_pattern_batch_kernel() -> Result<KernelSource, String> {
123    let kernel_fn: syn::ItemFn = parse_quote! {
124        fn pattern_detection(
125            event_counts: &[u32],
126            avg_durations: &[f32],
127            incoming_counts: &[u16],
128            outgoing_counts: &[u16],
129            pattern_types: &mut [u8],
130            pattern_confidences: &mut [f32],
131            bottleneck_threshold: f32,
132            duration_threshold: f32,
133            n: i32
134        ) {
135            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
136            if idx >= n { return; }
137
138            let event_count = event_counts[idx as usize];
139            let avg_duration = avg_durations[idx as usize];
140            let incoming = incoming_counts[idx as usize] as f32;
141            let outgoing = outgoing_counts[idx as usize] as f32;
142
143            // Default: no pattern
144            pattern_types[idx as usize] = 0u8;
145            pattern_confidences[idx as usize] = 0.0f32;
146
147            if event_count == 0 { return; }
148
149            // Bottleneck detection: high incoming, low outgoing
150            if incoming > bottleneck_threshold && outgoing < incoming * 0.5f32 {
151                pattern_types[idx as usize] = 7u8; // Bottleneck
152                pattern_confidences[idx as usize] = incoming / bottleneck_threshold;
153                return;
154            }
155
156            // Long-running detection
157            if avg_duration > duration_threshold {
158                pattern_types[idx as usize] = 6u8; // LongRunning
159                pattern_confidences[idx as usize] = avg_duration / duration_threshold;
160            }
161        }
162    };
163
164    let cuda_source = transpile_global_kernel(&kernel_fn)
165        .map_err(|e| format!("Failed to transpile pattern batch kernel: {}", e))?;
166
167    Ok(
168        KernelSource::new("pattern_detection", cuda_source, KernelType::Global)
169            .with_entry_point("pattern_detection"),
170    )
171}
172
173/// Generate partial order stencil kernel.
174///
175/// Uses GridPos abstraction for pairwise event comparison.
176#[cfg(feature = "cuda")]
177pub fn generate_partial_order_stencil_kernel() -> Result<KernelSource, String> {
178    let stencil_fn: syn::ItemFn = parse_quote! {
179        fn partial_order_derive(
180            start_times: &[u64],
181            end_times: &[u64],
182            precedence: &mut [u32],
183            pos: GridPos
184        ) {
185            // Each cell (i, j) in the grid represents whether event i precedes event j
186            let i_end = end_times[pos.y() as usize];
187            let j_start = pos.east(start_times);
188
189            // i precedes j if i ends before j starts
190            if i_end <= j_start {
191                precedence[pos.idx()] = 1u32;
192            } else {
193                precedence[pos.idx()] = 0u32;
194            }
195        }
196    };
197
198    let config = StencilConfig::new("partial_order")
199        .with_tile_size(16, 16)
200        .with_halo(0); // No halo needed for pairwise comparison
201
202    let cuda_source = transpile_stencil_kernel(&stencil_fn, &config)
203        .map_err(|e| format!("Failed to transpile partial order stencil: {}", e))?;
204
205    Ok(
206        KernelSource::new("partial_order_derive", cuda_source, KernelType::Stencil)
207            .with_entry_point("partial_order_derive"),
208    )
209}
210
211/// Generate DFG ring kernel (persistent actor).
212///
213/// Continuous event processing with HLC timestamps.
214#[cfg(feature = "cuda")]
215pub fn generate_dfg_ring_kernel() -> Result<KernelSource, String> {
216    let handler_fn: syn::ItemFn = parse_quote! {
217        fn process_event(ctx: &RingContext, event: &GpuObjectEvent) -> EdgeUpdate {
218            let tid = ctx.global_thread_id();
219
220            // Get HLC timestamp for ordering
221            let ts = ctx.tick();
222
223            ctx.sync_threads();
224
225            // Create edge update response
226            EdgeUpdate {
227                source_activity: event.prev_activity,
228                target_activity: event.activity_id,
229                duration_ms: event.duration_ms,
230                timestamp: ts.physical,
231                thread_id: tid as u32,
232            }
233        }
234    };
235
236    let config = RingKernelConfig::new("dfg_processor")
237        .with_block_size(256)
238        .with_queue_capacity(4096)
239        .with_hlc(true)
240        .with_k2k(false); // No K2K needed for DFG
241
242    let cuda_source = transpile_ring_kernel(&handler_fn, &config)
243        .map_err(|e| format!("Failed to transpile DFG ring kernel: {}", e))?;
244
245    Ok(
246        KernelSource::new("ring_kernel_dfg", cuda_source, KernelType::Ring)
247            .with_entry_point("ring_kernel_dfg"),
248    )
249}
250
251/// Get CUDA type definitions header.
252pub fn cuda_type_definitions() -> &'static str {
253    r#"
254// Process Intelligence GPU Types
255// Auto-generated - matches Rust struct layouts
256
257struct __align__(128) GpuObjectEvent {
258    unsigned long long event_id;
259    unsigned long long object_id;
260    unsigned int activity_id;
261    unsigned char event_type;
262    unsigned char _padding1[3];
263    unsigned long long physical_ms;
264    unsigned int logical;
265    unsigned int node_id;
266    unsigned int resource_id;
267    unsigned int duration_ms;
268    unsigned int flags;
269    unsigned int attributes[4];
270    unsigned int object_type_id;
271    unsigned int prev_activity;
272    unsigned long long related_object_id;
273    unsigned char _reserved[32];
274};
275
276struct __align__(64) GpuDFGNode {
277    unsigned int activity_id;
278    unsigned int event_count;
279    unsigned long long total_duration_ms;
280    unsigned int min_duration_ms;
281    unsigned int max_duration_ms;
282    float avg_duration_ms;
283    float std_duration_ms;
284    unsigned long long first_seen_ms;
285    unsigned long long last_seen_ms;
286    unsigned char is_start;
287    unsigned char is_end;
288    unsigned char flags;
289    unsigned char _padding;
290    unsigned short incoming_count;
291    unsigned short outgoing_count;
292};
293
294struct __align__(64) GpuDFGEdge {
295    unsigned int source_activity;
296    unsigned int target_activity;
297    unsigned int frequency;
298    unsigned int min_duration_ms;
299    float avg_duration_ms;
300    unsigned int max_duration_ms;
301    float probability;
302    unsigned char flags;
303    unsigned char _padding[3];
304    float total_cost;
305    unsigned long long first_seen_ms;
306    unsigned long long last_seen_ms;
307    unsigned char _reserved[16];
308};
309
310struct __align__(64) GpuPatternMatch {
311    unsigned char pattern_type;
312    unsigned char severity;
313    unsigned char activity_count;
314    unsigned char flags;
315    unsigned int activity_ids[8];
316    float confidence;
317    unsigned int frequency;
318    float avg_duration_ms;
319    float impact;
320    unsigned char _reserved[4];
321};
322
323struct EdgeUpdate {
324    unsigned int source_activity;
325    unsigned int target_activity;
326    unsigned int duration_ms;
327    unsigned long long timestamp;
328    unsigned int thread_id;
329};
330
331// Pattern type constants
332#define PATTERN_SEQUENCE 0
333#define PATTERN_CHOICE 1
334#define PATTERN_LOOP 2
335#define PATTERN_PARALLEL 3
336#define PATTERN_SKIP 4
337#define PATTERN_REWORK 5
338#define PATTERN_LONG_RUNNING 6
339#define PATTERN_BOTTLENECK 7
340#define PATTERN_ANOMALY 8
341
342// Severity constants
343#define SEVERITY_INFO 0
344#define SEVERITY_WARNING 1
345#define SEVERITY_CRITICAL 2
346"#
347}
348
349// Fallback implementations when cuda feature is not enabled
350#[cfg(not(feature = "cuda"))]
351pub fn generate_all_kernels() -> Result<Vec<KernelSource>, String> {
352    Err("CUDA feature not enabled. Build with --features cuda".to_string())
353}
354
355#[cfg(not(feature = "cuda"))]
356pub fn generate_dfg_batch_kernel() -> Result<KernelSource, String> {
357    Err("CUDA feature not enabled".to_string())
358}
359
360#[cfg(not(feature = "cuda"))]
361pub fn generate_pattern_batch_kernel() -> Result<KernelSource, String> {
362    Err("CUDA feature not enabled".to_string())
363}
364
365#[cfg(not(feature = "cuda"))]
366pub fn generate_partial_order_stencil_kernel() -> Result<KernelSource, String> {
367    Err("CUDA feature not enabled".to_string())
368}
369
370#[cfg(not(feature = "cuda"))]
371pub fn generate_dfg_ring_kernel() -> Result<KernelSource, String> {
372    Err("CUDA feature not enabled".to_string())
373}
374
375/// Generate DFG construction kernel (legacy static version).
376pub fn generate_dfg_kernel() -> KernelSource {
377    KernelSource::new("dfg_construction", STATIC_DFG_KERNEL, KernelType::Global)
378        .with_entry_point("dfg_construction_kernel")
379}
380
381/// Generate pattern detection kernel (legacy static version).
382pub fn generate_pattern_kernel() -> KernelSource {
383    KernelSource::new(
384        "pattern_detection",
385        STATIC_PATTERN_KERNEL,
386        KernelType::Global,
387    )
388    .with_entry_point("pattern_detection_kernel")
389}
390
391/// Generate partial order kernel (legacy static version).
392pub fn generate_partial_order_kernel() -> KernelSource {
393    KernelSource::new(
394        "partial_order",
395        STATIC_PARTIAL_ORDER_KERNEL,
396        KernelType::Stencil,
397    )
398    .with_entry_point("partial_order_kernel")
399}
400
401/// Generate conformance kernel (legacy static version).
402pub fn generate_conformance_kernel() -> KernelSource {
403    KernelSource::new("conformance", STATIC_CONFORMANCE_KERNEL, KernelType::Global)
404        .with_entry_point("conformance_kernel")
405}
406
407// Static kernel sources (fallback when transpiler isn't available)
408// Note: extern "C" prevents C++ name mangling so we can find the function by name
409const STATIC_DFG_KERNEL: &str = r#"
410extern "C" __global__ void dfg_construction_kernel(
411    const unsigned int* source_activities,
412    const unsigned int* target_activities,
413    const unsigned int* durations,
414    unsigned int* edge_frequencies,
415    unsigned long long* edge_durations,
416    int max_activities,
417    int n
418) {
419    int idx = blockIdx.x * blockDim.x + threadIdx.x;
420    if (idx >= n) return;
421
422    unsigned int source = source_activities[idx];
423    unsigned int target = target_activities[idx];
424    unsigned int duration = durations[idx];
425
426    int edge_idx = source * max_activities + target;
427    atomicAdd(&edge_frequencies[edge_idx], 1);
428    atomicAdd(&edge_durations[edge_idx], (unsigned long long)duration);
429}
430"#;
431
432const STATIC_PATTERN_KERNEL: &str = r#"
433extern "C" __global__ void pattern_detection_kernel(
434    const unsigned int* event_counts,
435    const float* avg_durations,
436    const unsigned short* incoming_counts,
437    const unsigned short* outgoing_counts,
438    unsigned char* pattern_types,
439    float* pattern_confidences,
440    float bottleneck_threshold,
441    float duration_threshold,
442    int n
443) {
444    int idx = blockIdx.x * blockDim.x + threadIdx.x;
445    if (idx >= n) return;
446
447    unsigned int event_count = event_counts[idx];
448    float avg_duration = avg_durations[idx];
449    float incoming = (float)incoming_counts[idx];
450    float outgoing = (float)outgoing_counts[idx];
451
452    pattern_types[idx] = 0;
453    pattern_confidences[idx] = 0.0f;
454
455    if (event_count == 0) return;
456
457    // Bottleneck detection
458    if (incoming > bottleneck_threshold && outgoing < incoming * 0.5f) {
459        pattern_types[idx] = 7; // Bottleneck
460        pattern_confidences[idx] = incoming / bottleneck_threshold;
461        return;
462    }
463
464    // Long-running detection
465    if (avg_duration > duration_threshold) {
466        pattern_types[idx] = 6; // LongRunning
467        pattern_confidences[idx] = avg_duration / duration_threshold;
468    }
469}
470"#;
471
472const STATIC_PARTIAL_ORDER_KERNEL: &str = r#"
473extern "C" __global__ void partial_order_kernel(
474    const unsigned long long* start_times,
475    const unsigned long long* end_times,
476    unsigned int* precedence,
477    int width,
478    int height
479) {
480    int x = blockIdx.x * blockDim.x + threadIdx.x;
481    int y = blockIdx.y * blockDim.y + threadIdx.y;
482
483    if (x >= width || y >= height) return;
484
485    int idx = y * width + x;
486    int x_next = min(x + 1, width - 1);
487
488    unsigned long long i_end = end_times[y];
489    unsigned long long j_start = start_times[x_next];
490
491    precedence[idx] = (i_end <= j_start) ? 1 : 0;
492}
493"#;
494
495const STATIC_CONFORMANCE_KERNEL: &str = r#"
496extern "C" __global__ void conformance_kernel(
497    const unsigned int* trace_activities,
498    const int* trace_starts,
499    const int* trace_lengths,
500    const unsigned int* model_sources,
501    const unsigned int* model_targets,
502    int num_transitions,
503    float* fitness_scores,
504    int num_traces
505) {
506    int trace_idx = blockIdx.x * blockDim.x + threadIdx.x;
507    if (trace_idx >= num_traces) return;
508
509    int start = trace_starts[trace_idx];
510    int len = trace_lengths[trace_idx];
511
512    if (len <= 1) {
513        fitness_scores[trace_idx] = 1.0f;
514        return;
515    }
516
517    int valid_moves = 0;
518    for (int i = 0; i < len - 1; i++) {
519        unsigned int source = trace_activities[start + i];
520        unsigned int target = trace_activities[start + i + 1];
521
522        for (int t = 0; t < num_transitions; t++) {
523            if (model_sources[t] == source && model_targets[t] == target) {
524                valid_moves++;
525                break;
526            }
527        }
528    }
529
530    fitness_scores[trace_idx] = (float)valid_moves / (float)(len - 1);
531}
532"#;
533
534#[cfg(test)]
535mod tests {
536    use super::*;
537
538    #[test]
539    fn test_dfg_kernel_generation() {
540        let source = generate_dfg_kernel();
541        assert!(source.source.contains("dfg_construction_kernel"));
542        assert!(source.source.contains("atomicAdd"));
543    }
544
545    #[test]
546    fn test_pattern_kernel_generation() {
547        let source = generate_pattern_kernel();
548        assert!(source.source.contains("pattern_detection_kernel"));
549    }
550
551    #[test]
552    fn test_partial_order_kernel_generation() {
553        let source = generate_partial_order_kernel();
554        assert!(source.source.contains("partial_order_kernel"));
555        assert!(source.source.contains("precedence"));
556    }
557
558    #[test]
559    fn test_conformance_kernel_generation() {
560        let source = generate_conformance_kernel();
561        assert!(source.source.contains("conformance_kernel"));
562        assert!(source.source.contains("fitness"));
563    }
564
565    #[test]
566    fn test_cuda_type_definitions() {
567        let defs = cuda_type_definitions();
568        assert!(defs.contains("GpuObjectEvent"));
569        assert!(defs.contains("GpuDFGNode"));
570        assert!(defs.contains("GpuPatternMatch"));
571        assert!(defs.contains("__align__(64)"));
572    }
573
574    #[cfg(feature = "cuda")]
575    #[test]
576    fn test_transpiled_dfg_kernel() {
577        let result = generate_dfg_batch_kernel();
578        match result {
579            Ok(kernel) => {
580                println!(
581                    "Generated DFG batch kernel ({} bytes):",
582                    kernel.source.len()
583                );
584                println!("{}", kernel.source);
585                assert!(kernel.source.contains("__global__"));
586            }
587            Err(e) => {
588                println!("DFG kernel generation pending: {}", e);
589            }
590        }
591    }
592
593    #[cfg(feature = "cuda")]
594    #[test]
595    fn test_transpiled_pattern_kernel() {
596        let result = generate_pattern_batch_kernel();
597        match result {
598            Ok(kernel) => {
599                println!(
600                    "Generated pattern batch kernel ({} bytes):",
601                    kernel.source.len()
602                );
603                println!("{}", kernel.source);
604                assert!(kernel.source.contains("__global__"));
605            }
606            Err(e) => {
607                println!("Pattern kernel generation pending: {}", e);
608            }
609        }
610    }
611}