vyre-libs 0.6.1

vyre Category A library ecosystem - pure-IR compositions over vyre-ops hardware primitives
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
use crate::parsing::c::parse::gnu_builtins::gpu_builtin_hash_table_words;
use crate::parsing::c::preprocess::gpu_define_parse::gpu_define_parse;
use crate::parsing::c::preprocess::gpu_if_expression::gpu_if_expression;
use crate::parsing::c::preprocess::gpu_ifdef_value::gpu_ifdef_value;
use crate::parsing::c::preprocess::gpu_include_parse::gpu_include_parse;
use crate::parsing::c::preprocess::gpu_undef_parse::gpu_undef_parse;
use vyre::execution_plan::fusion::fuse_programs;

use super::buffers::{
    bucket_pow2, pack_u32_words_into, pad_to_u32_words_into, unpack_u32_words_exact_into,
};
use super::tokenization::reject_invalid_if_expression_values;
use super::{ClassifiedTokens, GpuDispatcher};

/// Parsed payload for one directive row.
///
/// Indexed by token position in the source-order token stream. Rows
/// whose `directive_kinds[i] == 0` (not a directive) get
/// [`DirectivePayload::None`].
///
/// All byte vectors are copies of spans of the classified source; the
/// `*_start` / `*_len` fields of [`DirectivePayload::Define`] carry the same
/// spans as raw offsets so callers can refer back to the source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DirectivePayload {
    /// Not a directive row.
    None,
    /// `#define name [(args)] body`.
    Define {
        /// Macro name bytes.
        name: Vec<u8>,
        /// Macro name byte offset in the filtered source.
        name_start: u32,
        /// Macro name byte length in the filtered source.
        name_len: u32,
        /// Comma-separated args (function-like macros only). Empty for
        /// object-like macros and whenever `args_len` is `0`.
        args: Vec<u8>,
        /// Argument-list byte offset in the filtered source.
        args_start: u32,
        /// Argument-list byte length in the filtered source.
        args_len: u32,
        /// Replacement body bytes (trailing whitespace already trimmed).
        /// Empty whenever `body_len` is `0`.
        body: Vec<u8>,
        /// Replacement body byte offset in the filtered source.
        body_start: u32,
        /// Replacement body byte length in the filtered source.
        body_len: u32,
        /// `true` if the directive used the `name(args)` form.
        is_function_like: bool,
    },
    /// `#undef name`. The name span comes from the dedicated undef-parse
    /// tables (`undef_name_start` / `undef_name_len`) produced by the fused
    /// define + include + undef parse dispatch, not from the define-parse
    /// name tables.
    Undef {
        /// Macro name to undefine. Empty when the parsed name length is `0`.
        name: Vec<u8>,
    },
    /// `#include <…>` or `#include "…"` (and `#include_next`).
    ///
    /// An include row whose parsed path length is `0` is reported as
    /// [`DirectivePayload::Other`] instead of this variant.
    Include {
        /// The path bytes between the delimiters.
        path: Vec<u8>,
        /// `true` for `<…>`, `false` for `"…"`.
        is_system: bool,
        /// `true` only for `#include_next`.
        is_next: bool,
    },
    /// `#ifdef name` / `#ifndef name`. The kernel's value column gives
    /// directive truth: definedness for `#ifdef`, complement for `#ifndef`.
    ///
    /// The driver extraction path does not evaluate conditions; there `value`
    /// is always the placeholder `0`.
    Ifdef {
        /// `1` if the conditional branch is active, else `0`.
        value: u32,
        /// `true` if the directive was `#ifndef` (value semantics inverted).
        negated: bool,
    },
    /// `#if expr` / `#elif expr`. Compatibility extractors may store a
    /// snapshot value here; the production driver re-evaluates reachable rows
    /// against the live GPU macro table before branch selection.
    ///
    /// The driver extraction path does not evaluate conditions; there `value`
    /// is always the placeholder `0`.
    IfExpr {
        /// The truth value of the expression (1 or 0).
        value: u32,
        /// `true` for `#elif`, `false` for `#if`.
        is_elif: bool,
    },
    /// `#else`. No payload - caller flips the conditional frame.
    Else,
    /// `#endif`. No payload - caller pops the conditional frame.
    Endif,
    /// `#pragma`, `#line`, `#error`, `#warning`, `#ident`, `#sccs`,
    /// `#null` (empty `#`). Carried by kind only; payload is opaque.
    /// Also used for any other non-zero directive kind without a dedicated
    /// variant, and for include rows with an empty path.
    Other,
}

/// Reusable host-side staging buffers for directive payload extraction.
///
/// Every buffer is cleared or overwritten by the extraction routine before it
/// is read, so one instance can be reused across calls to keep its
/// allocations alive.
#[derive(Default)]
pub(super) struct DirectiveExtractionScratch {
    /// Token start offsets packed as bytes by `pack_u32_words_into`.
    starts_b: Vec<u8>,
    /// Token lengths packed as bytes by `pack_u32_words_into`.
    lens_b: Vec<u8>,
    /// Per-token directive kinds packed as bytes by `pack_u32_words_into`.
    kinds_b: Vec<u8>,
    /// Source bytes staged by `pad_to_u32_words_into`.
    src_pad: Vec<u8>,
    /// All-zero bytes (4 per bucketed token slot) supplied as the initial
    /// contents of every kernel output table.
    zero_init: Vec<u8>,
    /// Concatenated macro-name bytes, zero-padded to a multiple of 4 bytes
    /// (minimum 4).
    macro_names: Vec<u8>,
    /// Packed end offsets into `macro_names`: a leading `0` followed by one
    /// entry per macro.
    macro_offsets_b: Vec<u8>,
    /// Builtin hash-table words followed by one little-endian `u32` per macro
    /// slot (`1` for a supplied macro, `0` for the filler slot used when no
    /// macros are supplied).
    macro_values_b: Vec<u8>,
    /// Raw output tables of the fused define + include + undef parse dispatch.
    parse_out: Vec<Vec<u8>>,
    /// Raw output table of the most recent condition-value dispatch
    /// (`#ifdef` values first, then `#if` expression values).
    condition_out: Vec<Vec<u8>>,
    /// Unpacked `#define` name start offsets, one per bucketed token slot.
    name_s: Vec<u32>,
    /// Unpacked `#define` name lengths.
    name_l: Vec<u32>,
    /// Unpacked `#define` argument-list start offsets.
    args_s: Vec<u32>,
    /// Unpacked `#define` argument-list lengths.
    args_l: Vec<u32>,
    /// Unpacked `#define` body start offsets.
    body_s: Vec<u32>,
    /// Unpacked `#define` body lengths.
    body_l: Vec<u32>,
    /// Unpacked function-like flags (`1` means function-like).
    is_func: Vec<u32>,
    /// Unpacked `#include` path start offsets.
    path_s: Vec<u32>,
    /// Unpacked `#include` path lengths.
    path_l: Vec<u32>,
    /// Unpacked system-include flags (`1` means `<…>` form).
    is_system: Vec<u32>,
    /// Unpacked `#undef` name start offsets.
    undef_name_s: Vec<u32>,
    /// Unpacked `#undef` name lengths.
    undef_name_l: Vec<u32>,
    /// Per-token `#ifdef` / `#ifndef` values, or zeros when conditions are
    /// not evaluated.
    ifdef_values: Vec<u32>,
    /// Per-token `#if` / `#elif` values, or zeros when conditions are not
    /// evaluated.
    if_values: Vec<u32>,
}

impl DirectiveExtractionScratch {
    /// Resets the zero-filled staging buffer to exactly `byte_len` zero bytes.
    ///
    /// The buffer is emptied first so stale contents from an earlier call can
    /// never leak through, and capacity is reserved fallibly before the fill.
    ///
    /// # Errors
    /// Returns a descriptive message when the allocator cannot provide
    /// `byte_len` bytes.
    fn prepare_zero_init(&mut self, byte_len: usize) -> Result<(), String> {
        let staging = &mut self.zero_init;
        staging.clear();
        if let Err(error) = staging.try_reserve_exact(byte_len) {
            return Err(format!(
                "gpu directive parse zero-init staging could not reserve {byte_len} bytes: {error:?}. Fix: shard preprocessing before directive payload extraction."
            ));
        }
        // Capacity is already in place, so this fill cannot reallocate.
        staging.resize(byte_len, 0);
        Ok(())
    }
}

/// Converts a count of `u32` words into the equivalent byte count.
///
/// `label` names the table being sized and only appears in the error text.
///
/// # Errors
/// Returns a descriptive message when `word_count * 4` does not fit in `usize`.
fn directive_word_bytes(word_count: usize, label: &'static str) -> Result<usize, String> {
    match word_count.checked_mul(4) {
        Some(byte_len) => Ok(byte_len),
        None => Err(format!(
            "gpu directive parse {label} word count {word_count} overflows host byte sizing. Fix: shard preprocessing before directive payload extraction."
        )),
    }
}

/// Rounds `byte_len` up to a whole number of `u32` words, expressed in bytes.
///
/// The result is never smaller than 4, so an empty table still occupies one
/// word. `label` names the table and only appears in the error text.
///
/// # Errors
/// Returns a descriptive message when `byte_len + 3` overflows `usize`.
fn directive_padded_u32_bytes(byte_len: usize, label: &'static str) -> Result<usize, String> {
    // Only the `+ 3` rounding bias can overflow; dividing by 4 and multiplying
    // back by 4 never exceeds the biased value.
    match byte_len.checked_add(3) {
        Some(biased) => {
            let whole_words = biased / 4;
            Ok((whole_words * 4).max(4))
        }
        None => Err(format!(
            "gpu directive parse {label} byte length {byte_len} overflows u32 padding. Fix: shard preprocessing before directive payload extraction."
        )),
    }
}

/// Fallibly reserves room for `additional` more elements in `out`.
///
/// The vector's contents and length are left untouched. `label` describes the
/// elements being reserved and only appears in the error text.
///
/// # Errors
/// Returns a descriptive message when the reservation fails (capacity
/// overflow or allocator failure).
fn reserve_directive_vec<T>(
    out: &mut Vec<T>,
    additional: usize,
    label: &'static str,
) -> Result<(), String> {
    match out.try_reserve_exact(additional) {
        Ok(()) => Ok(()),
        Err(error) => Err(format!(
            "gpu directive parse could not reserve {additional} {label}: {error:?}. Fix: shard preprocessing before directive payload extraction."
        )),
    }
}

/// Extract one [`DirectivePayload`] per token in `classified`, evaluating
/// conditional directives against the supplied `defined_macros` snapshot.
///
/// This is the compatibility entry point: it allocates a fresh scratch arena
/// for the call and asks the shared implementation to evaluate `#ifdef` /
/// `#ifndef` / `#if` / `#elif` values. Each parse kernel is dispatched over
/// the full token stream, and the host then walks the directive-kind column
/// to assemble payloads.
///
/// An empty token stream yields an empty vector. A non-empty stream without
/// any directive rows yields one [`DirectivePayload::None`] per token without
/// dispatching any kernel.
///
/// **Note on macro accuracy:** this compatibility API accepts only
/// defined macro names. `#ifdef` / `#ifndef` snapshots are exact for that
/// name set, while `#if` bare identifiers are definedness snapshots rather
/// than full object-like macro integer expansions. The production
/// preprocessor driver does not use these snapshots for branch selection; it
/// re-evaluates reachable conditionals against the live GPU macro table.
///
/// # Errors
/// Returns a message when a dispatch fails (the dispatcher's error text is
/// prefixed with the failing stage), when a stage returns an unexpected
/// number of output tables, when host-side sizing or allocation fails, or
/// when a parsed span falls outside the source.
pub fn gpu_extract_directive_payloads(
    dispatcher: &dyn GpuDispatcher,
    classified: &ClassifiedTokens,
    defined_macros: &[&[u8]],
) -> Result<Vec<DirectivePayload>, String> {
    // One-shot callers have no scratch to reuse, so a throwaway arena is
    // created for this call only.
    let evaluate_condition_values = true;
    let mut one_shot_scratch = DirectiveExtractionScratch::default();
    gpu_extract_directive_payloads_impl(
        dispatcher,
        classified,
        defined_macros,
        evaluate_condition_values,
        &mut one_shot_scratch,
    )
}

/// Driver-side payload extraction that reuses a caller-owned scratch arena.
///
/// No macro snapshot is supplied and condition values are not evaluated, so
/// only the fused define + include + undef parse is dispatched.
/// [`DirectivePayload::Ifdef`] and [`DirectivePayload::IfExpr`] rows carry the
/// placeholder value `0`. A token stream without directive rows yields an
/// empty vector rather than one `None` per token.
///
/// # Errors
/// Returns a message when the parse dispatch fails, returns an unexpected
/// number of tables, host-side sizing or allocation fails, or a parsed span
/// falls outside the source.
pub(super) fn gpu_extract_directive_payloads_for_driver_with_scratch(
    dispatcher: &dyn GpuDispatcher,
    classified: &ClassifiedTokens,
    scratch: &mut DirectiveExtractionScratch,
) -> Result<Vec<DirectivePayload>, String> {
    let no_snapshot_macros: &[&[u8]] = &[];
    let evaluate_condition_values = false;
    gpu_extract_directive_payloads_impl(
        dispatcher,
        classified,
        no_snapshot_macros,
        evaluate_condition_values,
        scratch,
    )
}

/// Shared implementation behind both payload-extraction entry points.
///
/// Steps:
/// 1. Return early for empty or directive-free token streams without
///    dispatching anything.
/// 2. Stage token starts, token lengths, directive kinds and the source bytes
///    into `scratch`.
/// 3. Dispatch the fused define + include + undef parse program and unpack
///    its 12 output tables.
/// 4. When `evaluate_condition_values` is `true`, pack `defined_macros` and
///    dispatch the `#ifdef` and `#if` value kernels; otherwise fill both value
///    tables with zeros.
/// 5. Walk `classified.directive_kinds` and build one [`DirectivePayload`]
///    per token.
///
/// Return shape for a non-empty, directive-free stream depends on the mode:
/// one `None` per token when `evaluate_condition_values` is `true`, an empty
/// vector otherwise.
///
/// # Errors
/// Returns a message when the source length or token bucket does not fit in
/// `u32`, when host-side sizing or allocation fails, when a dispatch fails or
/// returns an unexpected number of tables, when `#if` values are rejected by
/// `reject_invalid_if_expression_values`, or when a parsed span lies outside
/// the source.
fn gpu_extract_directive_payloads_impl(
    dispatcher: &dyn GpuDispatcher,
    classified: &ClassifiedTokens,
    defined_macros: &[&[u8]],
    evaluate_condition_values: bool,
    scratch: &mut DirectiveExtractionScratch,
) -> Result<Vec<DirectivePayload>, String> {
    use crate::parsing::c::lex::tokens::{
        TOK_PP_DEFINE, TOK_PP_ELIF, TOK_PP_ELSE, TOK_PP_ENDIF, TOK_PP_IF, TOK_PP_IFDEF,
        TOK_PP_IFNDEF, TOK_PP_INCLUDE, TOK_PP_INCLUDE_NEXT, TOK_PP_UNDEF,
    };
    let n = classified.tok_types.len();
    if n == 0 {
        return Ok(Vec::new());
    }
    // Fast path: if there are no directive rows at all (no `#define`,
    // `#include`, `#ifdef`, `#if`, etc.), every payload is `None`.
    // Skip the parse-kernel dispatches entirely. For fixtures that
    // arrive at this stage already preprocessor-clean (the common case
    // after the host has expanded includes / inlined defines upstream),
    // this avoids 3 cold native-compiles of the directive parse
    // kernels - each of which is ~MB-scale WGSL and ~20s cold-compile
    // on the wgpu Vulkan path. The kernels still dispatch when
    // directives are present.
    if !classified.has_directives() {
        if !evaluate_condition_values {
            return Ok(Vec::new());
        }
        return Ok(vec![DirectivePayload::None; n]);
    }
    // Bucket token-count output shapes only. Source and macro buffers are
    // runtime-sized U32-packed byte buffers, so they no longer specialize
    // shader programs or require power-of-two source padding.
    let n_bucket = bucket_pow2(n.max(1), 64);
    let n_pad = n_bucket;
    // The kernel builders take the bucket as `u32`. Convert once, checked: a
    // bare `as u32` would silently wrap for an oversized bucket and build
    // kernels for the wrong token count.
    let n_bucket_words = u32::try_from(n_bucket).map_err(|_| {
        format!(
            "gpu directive parse token bucket {n_bucket} exceeds u32 address space. Fix: shard preprocessing before directive payload extraction."
        )
    })?;
    let source_len = u32::try_from(classified.source.len()).map_err(|_| {
        format!(
            "gpu directive parse source length {} exceeds u32 address space. Fix: shard preprocessing before directive payload extraction.",
            classified.source.len()
        )
    })?;
    pack_u32_words_into(&mut scratch.starts_b, &classified.tok_starts, n_pad)?;
    pack_u32_words_into(&mut scratch.lens_b, &classified.tok_lens, n_pad)?;
    pack_u32_words_into(&mut scratch.kinds_b, &classified.directive_kinds, n_pad)?;
    pad_to_u32_words_into(&mut scratch.src_pad, &classified.source)?;

    // ---- Dispatch 1: define + include + undef parsers fused ----
    // All three kernels share the (tok_starts, tok_lens,
    // directive_kinds, source) inputs and have NO overlapping output
    // buffer names (undef_parse's outputs were renamed to
    // `undef_name_*` so they don't collide with define_parse's
    // `name_*`). Fusing reduces 3 separate dispatches to 1, cutting
    // host-side Vec<u8> round-trips substantially. Buffer order in
    // the fused program is set by `fuse_programs` iteration:
    //   shared:     tok_starts, tok_lens, directive_kinds, source
    //   define out: name_start, name_len, args_start, args_len,
    //               body_start, body_len, is_function_like
    //   include out: path_start, path_len, is_system
    //   undef out:  undef_name_start, undef_name_len
    let dp = gpu_define_parse(n_bucket_words, source_len);
    let ip = gpu_include_parse(n_bucket_words, source_len);
    let up = gpu_undef_parse(n_bucket_words, source_len);
    let parse_fused = fuse_programs(&[dp, ip, up])
        .map_err(|e| format!("fuse define+include+undef parse: {e}"))?
        .with_entry_op_id("vyre-libs::parsing::c::preprocess::define_include_undef_parse_fused");
    let zero_init_bytes = directive_word_bytes(n_pad, "zero-init")?;
    scratch.prepare_zero_init(zero_init_bytes)?;
    // Four shared inputs followed by twelve zero-initialised output tables,
    // in the buffer order listed above.
    let parse_inputs = [
        scratch.starts_b.as_slice(),
        scratch.lens_b.as_slice(),
        scratch.kinds_b.as_slice(),
        scratch.src_pad.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
        scratch.zero_init.as_slice(),
    ];
    dispatcher
        .dispatch_borrowed_into(&parse_fused, &parse_inputs, &mut scratch.parse_out)
        .map_err(|e| format!("gpu_define+include+undef_parse fused: {e}"))?;
    if scratch.parse_out.len() != 12 {
        return Err(format!(
            "gpu_define+include+undef_parse fused: expected exactly 12 outputs, got {}. Fix: backend must return the declared directive parse tables and no extras.",
            scratch.parse_out.len()
        ));
    }
    unpack_u32_words_exact_into(
        &scratch.parse_out[0],
        n_pad,
        "define name_start",
        &mut scratch.name_s,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[1],
        n_pad,
        "define name_len",
        &mut scratch.name_l,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[2],
        n_pad,
        "define args_start",
        &mut scratch.args_s,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[3],
        n_pad,
        "define args_len",
        &mut scratch.args_l,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[4],
        n_pad,
        "define body_start",
        &mut scratch.body_s,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[5],
        n_pad,
        "define body_len",
        &mut scratch.body_l,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[6],
        n_pad,
        "define is_function_like",
        &mut scratch.is_func,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[7],
        n_pad,
        "include path_start",
        &mut scratch.path_s,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[8],
        n_pad,
        "include path_len",
        &mut scratch.path_l,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[9],
        n_pad,
        "include is_system",
        &mut scratch.is_system,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[10],
        n_pad,
        "undef name_start",
        &mut scratch.undef_name_s,
    )?;
    unpack_u32_words_exact_into(
        &scratch.parse_out[11],
        n_pad,
        "undef name_len",
        &mut scratch.undef_name_l,
    )?;

    // ---- Dispatch 2: gpu_ifdef_value ----
    // Both gpu_ifdef_value and gpu_if_expression are kind-gated to
    // disjoint rows of `directive_values`, so they can be fused safely
    // (verified under reference-eval). Real-GPU cold-pipeline evidence
    // showed the merged shader compiles substantially slower, so the
    // release path keeps them as separate dispatches until compile-cache
    // telemetry proves fusion wins end-to-end.
    // gpu_ifdef_value and gpu_if_expression now read num_macros and
    // macro_names_len at runtime via Expr::buf_len, so the kernel
    // program shape is independent of how many macros the host packs.
    // No more bucketing needed for these dimensions - one cold compile
    // per process for the ifdef/if-expression pair.
    if evaluate_condition_values {
        // Pack defined_macros into the (names_packed, offsets, values)
        // layout only when the caller needs snapshot condition values. The
        // production driver skips this branch and re-evaluates reachable
        // conditionals against the live macro table instead.
        let macro_name_bytes =
            defined_macros
                .iter()
                .try_fold(0usize, |total, name| {
                    total.checked_add(name.len()).ok_or_else(|| {
                        "gpu directive parse macro-name byte total overflows usize. Fix: shard preprocessing before directive payload extraction.".to_string()
                    })
                })?;
        scratch.macro_names.clear();
        reserve_directive_vec(
            &mut scratch.macro_names,
            macro_name_bytes,
            "macro-name bytes",
        )?;
        let macro_offset_slots = defined_macros.len().checked_add(1).ok_or_else(|| {
            "gpu directive parse macro-offset slot count overflows usize. Fix: shard preprocessing before directive payload extraction.".to_string()
        })?;
        let mut macro_offsets: Vec<u32> = Vec::new();
        reserve_directive_vec(&mut macro_offsets, macro_offset_slots, "macro-offset slots")?;
        // Offsets are cumulative end positions, with a leading 0 so entry `k`
        // and `k + 1` bracket the k-th name.
        macro_offsets.push(0);
        for name in defined_macros {
            scratch.macro_names.extend_from_slice(name);
            macro_offsets.push(u32::try_from(scratch.macro_names.len()).map_err(|_| {
                format!(
                    "gpu directive parse macro-name byte offset {} exceeds u32 address space. Fix: shard preprocessing before directive payload extraction.",
                    scratch.macro_names.len()
                )
            })?);
        }
        let padded = directive_padded_u32_bytes(scratch.macro_names.len(), "macro names")?;
        let macro_name_padding = padded
            .checked_sub(scratch.macro_names.len())
            .ok_or_else(|| {
                "gpu directive parse macro-name padded length underflowed. Fix: repair directive padding sizing.".to_string()
            })?;
        reserve_directive_vec(
            &mut scratch.macro_names,
            macro_name_padding,
            "macro-name padding bytes",
        )?;
        scratch.macro_names.resize(padded, 0);
        pack_u32_words_into(
            &mut scratch.macro_offsets_b,
            &macro_offsets,
            macro_offsets.len(),
        )?;
        // At least one value slot is always emitted, even with no macros.
        let count = defined_macros.len().max(1);
        scratch.macro_values_b.clear();
        let builtin_hashes = gpu_builtin_hash_table_words();
        let macro_value_words = count.checked_add(builtin_hashes.len()).ok_or_else(|| {
            "gpu directive parse macro-value word count overflows usize. Fix: shard preprocessing before directive payload extraction.".to_string()
        })?;
        let macro_value_bytes = directive_word_bytes(macro_value_words, "macro values")?;
        reserve_directive_vec(
            &mut scratch.macro_values_b,
            macro_value_bytes,
            "macro-value bytes",
        )?;
        vyre_primitives::wire::append_u32_slice_le_bytes(
            &builtin_hashes,
            &mut scratch.macro_values_b,
        );
        for idx in 0..count {
            // 1 for every supplied macro; 0 only for the filler slot that
            // exists when the macro list is empty.
            let value = u32::from(idx < defined_macros.len());
            scratch
                .macro_values_b
                .extend_from_slice(&value.to_le_bytes());
        }

        let iv = gpu_ifdef_value(n_bucket_words, source_len);
        let iv_inputs = [
            scratch.starts_b.as_slice(),
            scratch.lens_b.as_slice(),
            scratch.kinds_b.as_slice(),
            scratch.src_pad.as_slice(),
            scratch.macro_names.as_slice(),
            scratch.macro_offsets_b.as_slice(),
            scratch.zero_init.as_slice(),
        ];
        dispatcher
            .dispatch_borrowed_into(&iv, &iv_inputs, &mut scratch.condition_out)
            .map_err(|e| format!("gpu_ifdef_value: {e}"))?;
        if scratch.condition_out.len() != 1 {
            return Err(format!(
                "gpu_ifdef_value: expected exactly 1 output, got {}. Fix: backend must return only the ifdef values table.",
                scratch.condition_out.len()
            ));
        }
        unpack_u32_words_exact_into(
            &scratch.condition_out[0],
            n_pad,
            "ifdef values",
            &mut scratch.ifdef_values,
        )?;

        // ---- Dispatch 3: gpu_if_expression ----
        let ie = gpu_if_expression(n_bucket_words, source_len);
        let ie_inputs = [
            scratch.starts_b.as_slice(),
            scratch.lens_b.as_slice(),
            scratch.kinds_b.as_slice(),
            scratch.src_pad.as_slice(),
            scratch.macro_names.as_slice(),
            scratch.macro_offsets_b.as_slice(),
            scratch.macro_values_b.as_slice(),
            scratch.zero_init.as_slice(),
        ];
        dispatcher
            .dispatch_borrowed_into(&ie, &ie_inputs, &mut scratch.condition_out)
            .map_err(|e| format!("gpu_if_expression: {e}"))?;
        if scratch.condition_out.len() != 1 {
            return Err(format!(
                "gpu_if_expression: expected exactly 1 output, got {}. Fix: backend must return only the #if expression values table.",
                scratch.condition_out.len()
            ));
        }
        unpack_u32_words_exact_into(
            &scratch.condition_out[0],
            n_pad,
            "if expression values",
            &mut scratch.if_values,
        )?;
        reject_invalid_if_expression_values(&scratch.if_values, classified)?;
    } else {
        // Condition values are not evaluated in this mode: publish all-zero
        // placeholder tables. Reserve fallibly first so an out-of-memory
        // condition surfaces as an error like every other staging buffer
        // here, instead of aborting inside `resize`.
        scratch.ifdef_values.clear();
        reserve_directive_vec(&mut scratch.ifdef_values, n, "ifdef value slots")?;
        scratch.ifdef_values.resize(n, 0);
        scratch.if_values.clear();
        reserve_directive_vec(&mut scratch.if_values, n, "if expression value slots")?;
        scratch.if_values.resize(n, 0);
    }

    // (define + include + undef fused above into one dispatch.
    // Total: 5 dispatches → 3.)

    // ---- Walk and assemble payloads ----
    let mut out = Vec::new();
    reserve_directive_vec(&mut out, n, "directive payload slots")?;
    for i in 0..n {
        let kind = classified.directive_kinds[i];
        let payload = match kind {
            0 => DirectivePayload::None,
            k if k == TOK_PP_DEFINE => {
                let nb = scratch.name_s[i] as usize;
                let nl = scratch.name_l[i] as usize;
                let ab = scratch.args_s[i] as usize;
                let al = scratch.args_l[i] as usize;
                let bb = scratch.body_s[i] as usize;
                let bl = scratch.body_l[i] as usize;
                let name = payload_span_bytes(&classified.source, nb, nl, i, "define name")?;
                // Zero-length args/body spans skip the bounds check so a
                // meaningless start offset cannot fail the row.
                let args = if al == 0 {
                    Vec::new()
                } else {
                    payload_span_bytes(&classified.source, ab, al, i, "define args")?
                };
                let body = if bl == 0 {
                    Vec::new()
                } else {
                    payload_span_bytes(&classified.source, bb, bl, i, "define body")?
                };
                DirectivePayload::Define {
                    name,
                    name_start: scratch.name_s[i],
                    name_len: scratch.name_l[i],
                    args,
                    args_start: scratch.args_s[i],
                    args_len: scratch.args_l[i],
                    body,
                    body_start: scratch.body_s[i],
                    body_len: scratch.body_l[i],
                    is_function_like: scratch.is_func[i] == 1,
                }
            }
            k if k == TOK_PP_UNDEF => {
                // Dedicated `gpu_undef_parse` kernel (5-byte keyword
                // step, single ident scan). Replaces the prior workaround
                // that routed `#undef` through `gpu_define_parse` which
                // bakes in a 6-byte `#define` keyword length.
                let nb = scratch.undef_name_s[i] as usize;
                let nl = scratch.undef_name_l[i] as usize;
                if nl == 0 {
                    DirectivePayload::Undef { name: Vec::new() }
                } else {
                    DirectivePayload::Undef {
                        name: payload_span_bytes(&classified.source, nb, nl, i, "undef name")?,
                    }
                }
            }
            k if k == TOK_PP_INCLUDE || k == TOK_PP_INCLUDE_NEXT => {
                let pb = scratch.path_s[i] as usize;
                let pl = scratch.path_l[i] as usize;
                if pl == 0 {
                    // No path was parsed: report the row by kind only.
                    DirectivePayload::Other
                } else {
                    DirectivePayload::Include {
                        path: payload_span_bytes(&classified.source, pb, pl, i, "include path")?,
                        is_system: scratch.is_system[i] == 1,
                        is_next: k == TOK_PP_INCLUDE_NEXT,
                    }
                }
            }
            k if k == TOK_PP_IFDEF => DirectivePayload::Ifdef {
                value: scratch.ifdef_values[i],
                negated: false,
            },
            k if k == TOK_PP_IFNDEF => DirectivePayload::Ifdef {
                value: scratch.ifdef_values[i],
                negated: true,
            },
            k if k == TOK_PP_IF => DirectivePayload::IfExpr {
                value: scratch.if_values[i],
                is_elif: false,
            },
            k if k == TOK_PP_ELIF => DirectivePayload::IfExpr {
                value: scratch.if_values[i],
                is_elif: true,
            },
            k if k == TOK_PP_ELSE => DirectivePayload::Else,
            k if k == TOK_PP_ENDIF => DirectivePayload::Endif,
            _ => DirectivePayload::Other,
        };
        out.push(payload);
    }
    Ok(out)
}


/// Copies the `len` bytes starting at `start` out of `source`.
///
/// `token_index` and `label` identify the directive row and field being
/// extracted; they are only used to build error messages.
///
/// # Errors
/// Returns a descriptive message when `start + len` overflows `usize` or when
/// the resulting range is not fully inside `source`.
fn payload_span_bytes(
    source: &[u8],
    start: usize,
    len: usize,
    token_index: usize,
    label: &str,
) -> Result<Vec<u8>, String> {
    let end = match start.checked_add(len) {
        Some(end) => end,
        None => {
            return Err(format!(
                "vyre-libs::gpu_pipeline: {label} span at token {token_index} overflows usize. Fix: repair GPU directive payload span emission."
            ));
        }
    };
    match source.get(start..end) {
        Some(span) => Ok(span.to_vec()),
        None => Err(format!(
            "vyre-libs::gpu_pipeline: {label} span {start}..{end} at token {token_index} is outside source length {}. Fix: repair GPU directive payload span emission.",
            source.len()
        )),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use vyre::ir::Program;

    /// Dispatcher whose `dispatch` always fails, so any test that passes with
    /// it proves the code under test never reached a dispatch.
    struct NoDispatch;

    impl GpuDispatcher for NoDispatch {
        fn dispatch(
            &self,
            _program: &Program,
            _inputs: &[Vec<u8>],
        ) -> Result<Vec<Vec<u8>>, String> {
            Err(String::from("directive-free fixture must not dispatch"))
        }
    }

    /// Three-token fixture without a single directive row.
    fn directive_free_classified() -> ClassifiedTokens {
        let source_bytes: &[u8] = b"int x";
        ClassifiedTokens {
            tok_types: vec![1; 3],
            tok_starts: vec![0, 4, 5],
            tok_lens: vec![3, 1, 1],
            directive_kinds: vec![0; 3],
            directive_count: 0,
            source: Arc::from(source_bytes),
        }
    }

    /// Driver mode reports directive-free inputs as an empty payload list.
    #[test]
    fn driver_payload_extraction_uses_empty_slice_for_directive_free_inputs() {
        let mut scratch = DirectiveExtractionScratch::default();
        let classified = directive_free_classified();
        let result = gpu_extract_directive_payloads_for_driver_with_scratch(
            &NoDispatch,
            &classified,
            &mut scratch,
        );
        let payloads =
            result.expect("Fix: directive-free production extraction must not dispatch");

        assert!(
            payloads.is_empty(),
            "production driver should use empty payload slices for directive-free inputs"
        );
    }

    /// Compatibility mode keeps the one-payload-per-token shape.
    #[test]
    fn compatibility_payload_extraction_preserves_per_token_none_contract() {
        let classified = directive_free_classified();
        let no_macros: &[&[u8]] = &[];
        let payloads = gpu_extract_directive_payloads(&NoDispatch, &classified, no_macros)
            .expect("Fix: compatibility extraction must not dispatch on directive-free inputs");

        let expected = vec![DirectivePayload::None; 3];
        assert_eq!(payloads, expected);
    }

    /// Sizing helpers round correctly and report overflow instead of wrapping.
    #[test]
    fn directive_staging_sizing_is_checked_and_fallible() {
        let three_words =
            directive_word_bytes(3, "test").expect("Fix: small directive table should fit");
        assert_eq!(three_words, 12);
        assert!(
            directive_word_bytes(usize::MAX, "test").is_err(),
            "Fix: directive word-to-byte sizing must reject usize overflow"
        );

        let empty_padded = directive_padded_u32_bytes(0, "test")
            .expect("Fix: empty macro-name table should pad to one u32");
        assert_eq!(empty_padded, 4);
        let five_padded = directive_padded_u32_bytes(5, "test")
            .expect("Fix: small macro-name table should pad to u32 bytes");
        assert_eq!(five_padded, 8);
        assert!(
            directive_padded_u32_bytes(usize::MAX, "test").is_err(),
            "Fix: directive padding must reject usize overflow"
        );

        let mut scratch = DirectiveExtractionScratch::default();
        scratch
            .prepare_zero_init(8)
            .expect("Fix: small directive zero staging should fit");
        assert_eq!(scratch.zero_init, vec![0; 8]);
    }
}