vyre-primitives 0.4.1

Compositional primitives for vyre — marker types (always on) + Tier 2.5 LEGO substrate (feature-gated per domain).
Documentation
//! Exploded supergraph primitive (G3).
//!
//! # What this is
//!
//! IFDS / IDE reframes interprocedural dataflow as a reachability
//! problem on the **exploded supergraph**: each `(proc, block,
//! fact)` triple is a graph vertex, and the edges are the flow
//! functions (GEN / KILL + summary + call-to-return). Once
//! expanded, the analysis collapses to a BFS over this graph —
//! which is the exact shape
//! [`crate::graph::csr_forward_traverse`] already handles.
//!
//! This module owns the **node encoding** — the bit-layout that
//! packs `(proc_id, block_id, fact_id)` into a single `u32` node id
//! — plus a CPU reference that builds the exploded CSR so tests in
//! `vyre-libs::dataflow::ifds_gpu` can prove the GPU kernel produces
//! byte-identical CSR output.
//!
//! # Bit layout
//!
//! ```text
//!   bits 31..20   proc_id   (12 bits — 4096 procedures per module)
//!   bits 19..10   block_id  (10 bits — 1024 blocks per procedure)
//!   bits 9..0     fact_id   (10 bits — 1024 facts per workgroup;
//!                            matches FACTS_PER_WORKGROUP and the
//!                            NFA subgroup sizing)
//! ```
//!
//! This deliberately leaves no room for >4096 procedures in a
//! single module. Any real codebase that exceeds that should be
//! split along a module boundary first; doing interprocedural
//! dataflow over 10 000+ procs in one pass is a different problem
//! that we don't solve here and shouldn't pretend to.
//!
//! # Status
//!
//! Node encoding, CSR builder, and tests. The GPU Program wrapper
//! (the actual kernel that walks edges in parallel) lives in
//! `vyre-libs::dataflow::ifds_gpu` and composes this encoding with
//! `csr_forward_traverse`.
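//!
//! # Example
//!
//! A minimal sketch of the packing roundtrip. The `vyre_primitives::ifds`
//! path below is assumed for illustration only; use whatever path this
//! module is actually mounted at.
//!
//! ```ignore
//! use vyre_primitives::ifds::{decode_node, encode_node, fits, MAX_PROC_ID};
//!
//! let node = encode_node(3, 7, 42).expect("all components in range");
//! assert_eq!(decode_node(node), (3, 7, 42));
//!
//! // Out-of-range components are rejected up front rather than masked.
//! assert!(!fits(MAX_PROC_ID + 1, 0, 0));
//! assert_eq!(encode_node(MAX_PROC_ID + 1, 0, 0), None);
//! ```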

/// Bits reserved for the procedure component of the packed node id.
pub const PROC_BITS: u32 = 12;
/// Bits reserved for the basic-block component of the packed node id.
pub const BLOCK_BITS: u32 = 10;
/// Bits reserved for the fact component of the packed node id.
pub const FACT_BITS: u32 = 10;
const _SANITY: () = assert!(PROC_BITS + BLOCK_BITS + FACT_BITS == 32);

/// Maximum encodable procedure id. Each maximum is one less than the
/// component's capacity because zero is a valid id.
pub const MAX_PROC_ID: u32 = (1 << PROC_BITS) - 1;
/// Maximum encodable basic-block id.
pub const MAX_BLOCK_ID: u32 = (1 << BLOCK_BITS) - 1;
/// Maximum encodable fact id.
pub const MAX_FACT_ID: u32 = (1 << FACT_BITS) - 1;

/// Number of facts tracked per workgroup. A 32-lane subgroup × 32 bits
/// per lane = 1024 facts; wider subgroup layouts preserve the same
/// budget. Matches the NFA window sizing in `nfa::subgroup_nfa` so
/// both subsystems share the same occupancy budget.
pub const FACTS_PER_WORKGROUP: usize = 1024;
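
// Illustrative cross-check of the sizing arithmetic above: 32 lanes ×
// 32 bits per lane = 1024 facts, which is exactly the `FACT_BITS`
// budget. A sketch in the spirit of the `_SANITY` assert above.
const _FACTS_SANITY: () =
    assert!(FACTS_PER_WORKGROUP == 32 * 32 && FACTS_PER_WORKGROUP == 1 << FACT_BITS);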

const BLOCK_SHIFT: u32 = FACT_BITS;
const PROC_SHIFT: u32 = FACT_BITS + BLOCK_BITS;
const FACT_MASK: u32 = MAX_FACT_ID;
const BLOCK_MASK: u32 = MAX_BLOCK_ID;
const PROC_MASK: u32 = MAX_PROC_ID;

/// Pack a `(proc_id, block_id, fact_id)` triple into a 32-bit
/// node id.
///
/// Returns `None` when any component is out of range: such triples
/// have no non-aliasing `u32` representation, so the failure is
/// explicit instead of silently clamping or masking.
#[must_use]
pub fn encode_node(proc_id: u32, block_id: u32, fact_id: u32) -> Option<u32> {
    fits(proc_id, block_id, fact_id)
        .then_some((proc_id << PROC_SHIFT) | (block_id << BLOCK_SHIFT) | fact_id)
}

/// Unpack a node id back into `(proc_id, block_id, fact_id)`.
#[must_use]
pub fn decode_node(node_id: u32) -> (u32, u32, u32) {
    let proc_id = (node_id >> PROC_SHIFT) & PROC_MASK;
    let block_id = (node_id >> BLOCK_SHIFT) & BLOCK_MASK;
    let fact_id = node_id & FACT_MASK;
    (proc_id, block_id, fact_id)
}

/// Whether a `(proc, block, fact)` triple fits in the packed
/// 32-bit representation. Callers on the production path should
/// verify this before calling [`encode_node`].
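///
/// A sketch of that check-then-encode pattern (the `proc_id`, `block_id`,
/// `fact_id`, and `worklist` bindings here are placeholders):
///
/// ```ignore
/// if fits(proc_id, block_id, fact_id) {
///     // Safe to unwrap: the components were just validated.
///     let node = encode_node(proc_id, block_id, fact_id).unwrap();
///     worklist.push(node);
/// }
/// ```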
#[must_use]
pub fn fits(proc_id: u32, block_id: u32, fact_id: u32) -> bool {
    proc_id <= MAX_PROC_ID && block_id <= MAX_BLOCK_ID && fact_id <= MAX_FACT_ID
}

/// CPU-reference CSR builder for the exploded supergraph.
///
/// `intra_edges` are `(proc, src_block, dst_block)` triples giving the
/// standard CFG within each procedure. `inter_edges` are `(src_proc,
/// src_block, dst_proc, dst_block)` call / return edges. Flow
/// functions are supplied as per-block GEN / KILL fact lists over the
/// fact domain.
///
/// Returns `(row_ptr, col_idx)` in the **dense** index space
/// `idx(p, b, f) = p * blocks * facts + b * facts + f`. This is
/// the space every traversal kernel operates in — packing via
/// [`encode_node`] is only used at the I/O boundary when the
/// caller needs to report results as `(proc, block, fact)`
/// triples. The two spaces coincide only in the degenerate case
/// `blocks_per_proc == 1 << BLOCK_BITS` and `facts_per_proc == 1 << FACT_BITS`;
/// the dense layout works for any dimensions whose components fit the
/// encoding and whose total node count fits in a `u32`.
#[must_use]
pub fn build_cpu_reference(
    num_procs: u32,
    blocks_per_proc: u32,
    facts_per_proc: u32,
    intra_edges: &[(u32, u32, u32)], // (proc, src_block, dst_block)
    inter_edges: &[(u32, u32, u32, u32)], // (src_proc, src_block, dst_proc, dst_block)
    flow_gen: &[(u32, u32, u32)],    // (proc, block, fact) — GEN bits
    flow_kill: &[(u32, u32, u32)],   // (proc, block, fact) — KILL bits
) -> (Vec<u32>, Vec<u32>) {
    if num_procs == 0 || blocks_per_proc == 0 || facts_per_proc == 0 {
        return (vec![0], Vec::new());
    }
    if !fits(
        num_procs.saturating_sub(1),
        blocks_per_proc.saturating_sub(1),
        facts_per_proc.saturating_sub(1),
    ) {
        return (vec![0], Vec::new());
    }

    // PHASE7_GRAPH C4: every multiply checked. The previous unchecked
    // chain (`blocks * facts`, then `procs * slots`) wraps silently
    // when the caller passes the maximum dimensions for each field:
    // 4096 × 1024 × 1024 = 2^32, which wraps to 0 on a 32-bit usize
    // and exceeds `u32::MAX` by one on 64-bit. Either case produced a
    // tiny `Vec<Vec<u32>>` and catastrophic OOB writes in the
    // edge-emit loops below.
    let Some(slots_per_proc) = (blocks_per_proc as usize).checked_mul(facts_per_proc as usize)
    else {
        return (vec![0], Vec::new());
    };
    let Some(total_nodes) = (num_procs as usize).checked_mul(slots_per_proc) else {
        return (vec![0], Vec::new());
    };
    if total_nodes > u32::MAX as usize {
        return (vec![0], Vec::new());
    }
    let mut edges_flat: Vec<(u32, u32)> = Vec::new();
    let block_count = (num_procs as usize) * (blocks_per_proc as usize);

    let idx = |p: u32, b: u32, f: u32| -> u32 {
        ((p as usize) * slots_per_proc + (b as usize) * facts_per_proc as usize + f as usize) as u32
    };
    let block_idx =
        |p: u32, b: u32| -> usize { (p as usize) * blocks_per_proc as usize + b as usize };
    let in_space =
        |p: u32, b: u32, f: u32| p < num_procs && b < blocks_per_proc && f < facts_per_proc;

    let mut killed = vec![false; total_nodes];
    for &(p, b, f) in flow_kill {
        if in_space(p, b, f) {
            killed[idx(p, b, f) as usize] = true;
        }
    }

    let mut gen_offsets = vec![0usize; block_count + 1];
    for &(p, b, f) in flow_gen {
        if in_space(p, b, f) {
            gen_offsets[block_idx(p, b) + 1] += 1;
        }
    }
    for i in 1..gen_offsets.len() {
        gen_offsets[i] += gen_offsets[i - 1];
    }
    let mut gen_cursor = gen_offsets[..block_count].to_vec();
    let mut gen_facts = vec![0u32; gen_offsets[block_count]];
    for &(p, b, f) in flow_gen {
        if in_space(p, b, f) {
            let key = block_idx(p, b);
            let slot = gen_cursor[key];
            gen_facts[slot] = f;
            gen_cursor[key] += 1;
        }
    }

    // Intra-procedural CFG edges, expanded over the fact domain: a CFG
    // edge (B_src -> B_dst) yields one exploded edge
    // (B_src, f) -> (B_dst, f) for every fact f that survives the flow
    // function at B_src (fact f propagates iff it is not killed there).
    for &(p, src_b, dst_b) in intra_edges {
        if p >= num_procs || src_b >= blocks_per_proc || dst_b >= blocks_per_proc {
            continue;
        }
        for f in 0..facts_per_proc {
            if killed[idx(p, src_b, f) as usize] {
                continue;
            }
            edges_flat.push((idx(p, src_b, f), idx(p, dst_b, f)));
        }
        // GEN edges: standard IFDS 0-fact encoding — fact 0 is the
        // tautological "always present" fact. `GEN(src_b, gf)` emits
        // edge `(src_b, 0) → (dst_b, gf)`, so seeding `(entry, 0)`
        // triggers every GEN along the reachable CFG. Callers that
        // don't use the 0-fact convention see GEN as a no-op.
        let gen_key = block_idx(p, src_b);
        for &gf in &gen_facts[gen_offsets[gen_key]..gen_offsets[gen_key + 1]] {
            edges_flat.push((idx(p, src_b, 0), idx(p, dst_b, gf)));
        }
    }

    // Inter-procedural call / return edges propagate every fact
    // (IFDS handles parameter mapping via summary edges in the full
    // algorithm; this CPU reference is the unfiltered
    // "every-fact-flows" upper bound used for correctness tests).
    for &(sp, sb, dp, db) in inter_edges {
        if sp >= num_procs || dp >= num_procs || sb >= blocks_per_proc || db >= blocks_per_proc {
            continue;
        }
        for f in 0..facts_per_proc {
            edges_flat.push((idx(sp, sb, f), idx(dp, db, f)));
        }
    }

    // Flatten into CSR — row_ptr has total_nodes+1 entries.
    if edges_flat.len() > u32::MAX as usize {
        return (vec![0], Vec::new());
    }
    let mut row_ptr = vec![0u32; total_nodes + 1];
    for &(src, _) in &edges_flat {
        let row = src as usize;
        row_ptr[row + 1] = row_ptr[row + 1].saturating_add(1);
    }
    for row in 1..row_ptr.len() {
        row_ptr[row] = row_ptr[row].saturating_add(row_ptr[row - 1]);
    }
    let mut cursor = row_ptr[..total_nodes]
        .iter()
        .map(|&offset| offset as usize)
        .collect::<Vec<_>>();
    let mut col_idx = vec![0u32; edges_flat.len()];
    for (src, dst) in edges_flat {
        let row = src as usize;
        let slot = cursor[row];
        col_idx[slot] = dst;
        cursor[row] += 1;
    }
    (row_ptr, col_idx)
}

/// Convert a dense `(proc, block, fact)` index — the space
/// [`build_cpu_reference`] operates in — into the packed
/// [`encode_node`] form for reporting or cross-subsystem handoff.
#[must_use]
pub fn dense_to_encoded(dense: u32, blocks_per_proc: u32, facts_per_proc: u32) -> Option<u32> {
    let slots_per_proc = blocks_per_proc.checked_mul(facts_per_proc)?;
    if slots_per_proc == 0 {
        return None;
    }
    let p = dense / slots_per_proc;
    let within_proc = dense % slots_per_proc;
    let b = within_proc / facts_per_proc;
    let f = within_proc % facts_per_proc;
    encode_node(p, b, f)
}

/// Inverse of [`dense_to_encoded`].
#[must_use]
pub fn encoded_to_dense(node_id: u32, blocks_per_proc: u32, facts_per_proc: u32) -> Option<u32> {
    let (p, b, f) = decode_node(node_id);
    let proc_span = blocks_per_proc.checked_mul(facts_per_proc)?;
    let proc_offset = p.checked_mul(proc_span)?;
    let block_offset = b.checked_mul(facts_per_proc)?;
    proc_offset.checked_add(block_offset)?.checked_add(f)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn encode_decode_roundtrips_at_max_values() {
        let n = encode_node(MAX_PROC_ID, MAX_BLOCK_ID, MAX_FACT_ID).unwrap();
        assert_eq!(n, u32::MAX);
        assert_eq!(decode_node(n), (MAX_PROC_ID, MAX_BLOCK_ID, MAX_FACT_ID));
    }

    #[test]
    fn encode_decode_roundtrips_at_zero() {
        let n = encode_node(0, 0, 0).unwrap();
        assert_eq!(n, 0);
        assert_eq!(decode_node(n), (0, 0, 0));
    }

    #[test]
    fn encode_decode_roundtrips_at_component_boundaries() {
        for (p, b, f) in [
            (0, 0, 1),
            (0, 1, 0),
            (1, 0, 0),
            (0, 0, MAX_FACT_ID),
            (0, MAX_BLOCK_ID, 0),
            (MAX_PROC_ID, 0, 0),
            (1, 2, 3),
            (42, 17, 99),
            (MAX_PROC_ID / 2, MAX_BLOCK_ID / 2, MAX_FACT_ID / 2),
        ] {
            let n = encode_node(p, b, f).unwrap();
            assert_eq!(
                decode_node(n),
                (p, b, f),
                "roundtrip failed for {p}/{b}/{f}"
            );
        }
    }

    #[test]
    fn fits_catches_over_range_components() {
        assert!(fits(MAX_PROC_ID, MAX_BLOCK_ID, MAX_FACT_ID));
        assert!(!fits(MAX_PROC_ID + 1, 0, 0));
        assert!(!fits(0, MAX_BLOCK_ID + 1, 0));
        assert!(!fits(0, 0, MAX_FACT_ID + 1));
        assert_eq!(encode_node(MAX_PROC_ID + 1, 0, 0), None);
    }

    #[test]
    fn csr_of_empty_graph_has_only_sentinel_row_ptr() {
        let (row_ptr, col_idx) = build_cpu_reference(1, 1, 1, &[], &[], &[], &[]);
        assert_eq!(row_ptr, vec![0, 0]);
        assert!(col_idx.is_empty());
    }

    // Dense-index helper mirrors the one inside build_cpu_reference.
    fn di(p: u32, b: u32, f: u32, blocks: u32, facts: u32) -> u32 {
        p * blocks * facts + b * facts + f
    }

    #[test]
    fn csr_single_intra_edge_produces_per_fact_duplicate_edges() {
        // 1 proc, 2 blocks (B0→B1), 4 facts; no kills → each fact
        // flows forward once.
        let (row_ptr, col_idx) = build_cpu_reference(1, 2, 4, &[(0, 0, 1)], &[], &[], &[]);
        assert_eq!(row_ptr.len(), 9);
        assert_eq!(col_idx.len(), 4);
        for f in 0..4 {
            let src = di(0, 0, f, 2, 4) as usize;
            let edge_start = row_ptr[src] as usize;
            assert_eq!(col_idx[edge_start], di(0, 1, f, 2, 4));
        }
    }

    #[test]
    fn csr_kill_suppresses_edge_for_that_fact() {
        let (row_ptr, col_idx) = build_cpu_reference(
            1,
            2,
            4,
            &[(0, 0, 1)],
            &[],
            &[],
            &[(0, 0, 2)], // KILL fact 2 at (0, 0)
        );
        let n_edges: u32 = row_ptr.windows(2).map(|w| w[1] - w[0]).sum();
        assert_eq!(n_edges, 3);
        let killed_src = di(0, 0, 2, 2, 4) as usize;
        assert_eq!(row_ptr[killed_src + 1] - row_ptr[killed_src], 0);
        let _ = col_idx;
    }

    #[test]
    fn csr_inter_edges_connect_procs() {
        let (row_ptr, col_idx) = build_cpu_reference(
            2,
            2,
            2,
            &[],
            &[(0, 1, 1, 0)], // call: P0/B1 → P1/B0
            &[],
            &[],
        );
        assert_eq!(row_ptr.len(), 9);
        assert_eq!(col_idx.len(), 2);
        let src0 = di(0, 1, 0, 2, 2) as usize;
        let src1 = di(0, 1, 1, 2, 2) as usize;
        assert_eq!(
            &col_idx[row_ptr[src0] as usize..row_ptr[src0 + 1] as usize],
            &[di(1, 0, 0, 2, 2)]
        );
        assert_eq!(
            &col_idx[row_ptr[src1] as usize..row_ptr[src1 + 1] as usize],
            &[di(1, 0, 1, 2, 2)]
        );
    }

    #[test]
    fn dense_encoded_roundtrips() {
        for &(p, b, f, blocks, facts) in &[
            (0_u32, 0_u32, 0_u32, 2_u32, 2_u32),
            (1, 1, 1, 2, 2),
            (42, 17, 99, 64, 128),
            (MAX_PROC_ID, 3, 7, 16, 16),
        ] {
            let d = di(p, b, f, blocks, facts);
            let enc = dense_to_encoded(d, blocks, facts).unwrap();
            assert_eq!(decode_node(enc), (p, b, f));
            let back = encoded_to_dense(enc, blocks, facts).unwrap();
            assert_eq!(back, d, "roundtrip mismatch {p}/{b}/{f}");
        }
    }
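
    // Illustrative example of the handoff the module docs describe:
    // traverse in the dense index space that `build_cpu_reference`
    // emits, and convert to the packed form only at the reporting
    // boundary.
    #[test]
    fn dense_traversal_reports_via_packed_encoding() {
        // 1 proc, 2 blocks (B0 → B1), 2 facts, no GEN / KILL.
        let (row_ptr, col_idx) = build_cpu_reference(1, 2, 2, &[(0, 0, 1)], &[], &[], &[]);
        // Successors of (proc 0, block 0, fact 1), looked up in dense space.
        let src = di(0, 0, 1, 2, 2) as usize;
        assert_eq!(row_ptr[src + 1] - row_ptr[src], 1);
        for &dst in &col_idx[row_ptr[src] as usize..row_ptr[src + 1] as usize] {
            // Report the destination as a packed (proc, block, fact) triple.
            let packed = dense_to_encoded(dst, 2, 2).expect("dims fit the encoding");
            assert_eq!(decode_node(packed), (0, 1, 1));
        }
    }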

    #[test]
    fn csr_gen_introduces_new_fact_flow_from_zero_fact() {
        // B0 → B1, GEN fact 2 at B0. Per IFDS 0-fact convention,
        // GEN emits edge (B0, 0) → (B1, 2). The intra loop still
        // propagates every non-killed fact (0..3), so total edges
        // are 4 (intra) + 1 (GEN from 0-fact) = 5. The GEN edge
        // specifically targets fact 2 at B1 even though fact 2 did
        // not flow in through any predecessor — that is the point.
        let (row_ptr, col_idx) = build_cpu_reference(1, 2, 4, &[(0, 0, 1)], &[], &[(0, 0, 2)], &[]);
        assert_eq!(col_idx.len(), 5);
        // Verify the GEN edge is attached to the 0-fact source, not
        // to fact-2 (which would be redundant with the intra edge).
        let zero_src = di(0, 0, 0, 2, 4) as usize;
        let fact2_dst = di(0, 1, 2, 2, 4);
        let zero_neighbours = &col_idx[row_ptr[zero_src] as usize..row_ptr[zero_src + 1] as usize];
        assert!(zero_neighbours.contains(&fact2_dst));
    }

    #[test]
    fn csr_rejects_dimensions_overflowing_encoding() {
        // num_procs = MAX_PROC_ID + 2 means the largest proc id would be
        // MAX_PROC_ID + 1, which does not fit in PROC_BITS.
        let (row_ptr, col_idx) = build_cpu_reference(MAX_PROC_ID + 2, 1, 1, &[], &[], &[], &[]);
        assert_eq!(row_ptr, vec![0]);
        assert!(col_idx.is_empty());
    }

    #[test]
    fn row_ptr_length_is_nodes_plus_one() {
        let procs = 3;
        let blocks = 4;
        let facts = 8;
        let (row_ptr, _) = build_cpu_reference(procs, blocks, facts, &[], &[], &[], &[]);
        assert_eq!(
            row_ptr.len(),
            (procs as usize * blocks as usize * facts as usize) + 1
        );
    }

    #[test]
    fn facts_per_workgroup_matches_max_fact_id_plus_one() {
        // G3 docstring claim: lane sizing matches NFA's.
        assert_eq!(FACTS_PER_WORKGROUP as u32, MAX_FACT_ID + 1);
    }
}