Skip to main content

vyre_driver/
arm_independence.rs

1//! D2 substrate: independent-arm detection for queue-parallel dispatch.
2//!
3//! Two megakernel arms can execute on independent native streams /
4//! portable queues if and only if their writes are disjoint and neither
5//! reads what the other writes. The dispatcher uses this to fan out
6//! N concurrent stream submissions instead of serialising on one
7//! queue  -  wins on backends with multiple async-engine count > 1
8//! (every modern discrete GPU + every portable adapter).
9//!
10//! This module owns the *decision*: given two `BindingPlan`s
11//! (or any structure that names input/output binding slots), are
12//! they safe to dispatch concurrently? Pure analysis, no Program
13//! walking  -  the caller passes the already-derived input/output
14//! sets.
15
16use smallvec::SmallVec;
17
18/// Small sorted binding-slot set optimized for dispatch-policy hot paths.
19///
20/// Dispatch summaries usually contain fewer than eight bindings. Keeping
21/// slots inline avoids allocating two tree nodes per arm while preserving
22/// deterministic equality/debug output and O(log n) membership checks.
23#[derive(Debug, Clone, PartialEq, Eq, Default)]
24pub struct BindingSlotSet {
25    slots: SmallVec<[u32; 8]>,
26}
27
28impl BindingSlotSet {
29    /// Insert `slot`, preserving sorted unique order.
30    pub fn insert(&mut self, slot: u32) -> bool {
31        match self.slots.binary_search(&slot) {
32            Ok(_) => false,
33            Err(pos) => {
34                self.slots.insert(pos, slot);
35                true
36            }
37        }
38    }
39
40    /// True when `slot` is present.
41    #[must_use]
42    pub fn contains(&self, slot: &u32) -> bool {
43        self.slots.binary_search(slot).is_ok()
44    }
45
46    /// True when this set and `other` share at least one slot.
47    #[must_use]
48    pub fn intersects(&self, other: &Self) -> bool {
49        let (small, large) = if self.slots.len() <= other.slots.len() {
50            (self, other)
51        } else {
52            (other, self)
53        };
54        small.slots.iter().any(|slot| large.contains(slot))
55    }
56}
57
58impl FromIterator<u32> for BindingSlotSet {
59    fn from_iter<T: IntoIterator<Item = u32>>(iter: T) -> Self {
60        let mut set = Self::default();
61        for slot in iter {
62            set.insert(slot);
63        }
64        set
65    }
66}
67
68/// Input/output set summary for one arm. Two of these are compared
69/// pairwise to decide whether the arms can launch on independent
70/// streams.
71#[derive(Debug, Clone, PartialEq, Eq)]
72pub struct ArmBindingSummary {
73    /// Slots this arm reads (LoadGlobal/LoadShared/LoadConstant/
74    /// AsyncLoad/Atomic-input/BufferLength source operand).
75    pub reads: BindingSlotSet,
76    /// Slots this arm writes (StoreGlobal/StoreShared/AsyncStore/
77    /// Atomic-target).
78    pub writes: BindingSlotSet,
79}
80
81impl ArmBindingSummary {
82    /// Empty summary (no reads, no writes). Useful as a starting
83    /// accumulator.
84    #[must_use]
85    pub fn new() -> Self {
86        Self::default()
87    }
88}
89
90impl Default for ArmBindingSummary {
91    fn default() -> Self {
92        Self {
93            reads: BindingSlotSet::default(),
94            writes: BindingSlotSet::default(),
95        }
96    }
97}
98
99/// Verdict from [`can_dispatch_concurrently`].
100#[derive(Debug, Clone, Copy, PartialEq, Eq)]
101pub enum ArmIndependenceVerdict {
102    /// Two arms touch disjoint resources; safe to launch on
103    /// independent streams.
104    Independent,
105    /// Two arms share at least one binding slot in a way that
106    /// would race on concurrent dispatch. Caller must serialise
107    /// (single stream, or stream A then sync then stream B).
108    SerializeRequired {
109        /// Why serialisation is required  -  names the offending
110        /// access pattern so telemetry and diagnostics can attribute
111        /// the missed concurrency.
112        reason: ArmConflict,
113    },
114}
115
116/// Reason two arms cannot launch concurrently.
117#[derive(Debug, Clone, Copy, PartialEq, Eq)]
118pub enum ArmConflict {
119    /// Both arms write the same slot  -  the second write would race
120    /// with the first.
121    WriteWriteConflict,
122    /// Arm A writes a slot that arm B reads  -  read-after-write.
123    /// B would see either the old or new value depending on stream
124    /// order; observable nondeterminism.
125    ReadAfterWrite,
126    /// Arm A reads a slot that arm B writes  -  write-after-read.
127    /// Symmetric of the above; equally observable.
128    WriteAfterRead,
129}
130
131/// Decide whether two arms (described by their `ArmBindingSummary`s)
132/// can dispatch concurrently on independent streams. Pure set
133/// arithmetic  -  no IR walk, no Program clone, and no heap allocation
134/// for the common <= 8 binding-slot case.
135///
136/// Read-only ↔ read-only on the same slot is always safe and does
137/// NOT count as a conflict.
138#[must_use]
139pub fn can_dispatch_concurrently(
140    a: &ArmBindingSummary,
141    b: &ArmBindingSummary,
142) -> ArmIndependenceVerdict {
143    if a.writes.intersects(&b.writes) {
144        return ArmIndependenceVerdict::SerializeRequired {
145            reason: ArmConflict::WriteWriteConflict,
146        };
147    }
148    if a.writes.intersects(&b.reads) {
149        return ArmIndependenceVerdict::SerializeRequired {
150            reason: ArmConflict::ReadAfterWrite,
151        };
152    }
153    if a.reads.intersects(&b.writes) {
154        return ArmIndependenceVerdict::SerializeRequired {
155            reason: ArmConflict::WriteAfterRead,
156        };
157    }
158    ArmIndependenceVerdict::Independent
159}
160
161#[cfg(test)]
162mod tests {
163    use super::*;
164
165    fn summary(reads: &[u32], writes: &[u32]) -> ArmBindingSummary {
166        ArmBindingSummary {
167            reads: reads.iter().copied().collect(),
168            writes: writes.iter().copied().collect(),
169        }
170    }
171
172    #[test]
173    fn fully_disjoint_arms_are_independent() {
174        let a = summary(&[0, 1], &[2]);
175        let b = summary(&[3, 4], &[5]);
176        assert_eq!(
177            can_dispatch_concurrently(&a, &b),
178            ArmIndependenceVerdict::Independent
179        );
180    }
181
182    #[test]
183    fn empty_arms_are_independent() {
184        let a = summary(&[], &[]);
185        let b = summary(&[], &[]);
186        assert_eq!(
187            can_dispatch_concurrently(&a, &b),
188            ArmIndependenceVerdict::Independent
189        );
190    }
191
192    #[test]
193    fn shared_read_only_slot_is_independent() {
194        let a = summary(&[7], &[1]);
195        let b = summary(&[7], &[2]);
196        // Both READ slot 7; neither writes it  -  no race.
197        assert_eq!(
198            can_dispatch_concurrently(&a, &b),
199            ArmIndependenceVerdict::Independent
200        );
201    }
202
203    #[test]
204    fn write_write_conflict_serialises() {
205        let a = summary(&[], &[3]);
206        let b = summary(&[], &[3]);
207        assert_eq!(
208            can_dispatch_concurrently(&a, &b),
209            ArmIndependenceVerdict::SerializeRequired {
210                reason: ArmConflict::WriteWriteConflict,
211            }
212        );
213    }
214
215    #[test]
216    fn read_after_write_serialises() {
217        let a = summary(&[0], &[5]);
218        let b = summary(&[5], &[1]);
219        assert_eq!(
220            can_dispatch_concurrently(&a, &b),
221            ArmIndependenceVerdict::SerializeRequired {
222                reason: ArmConflict::ReadAfterWrite,
223            }
224        );
225    }
226
227    #[test]
228    fn write_after_read_serialises() {
229        let a = summary(&[5], &[1]);
230        let b = summary(&[0], &[5]);
231        assert_eq!(
232            can_dispatch_concurrently(&a, &b),
233            ArmIndependenceVerdict::SerializeRequired {
234                reason: ArmConflict::WriteAfterRead,
235            }
236        );
237    }
238
239    #[test]
240    fn write_write_takes_precedence_over_other_conflicts() {
241        // Both write slot 3 AND a writes 1 / b reads 1. Verdict
242        // names the strongest conflict (write-write).
243        let a = summary(&[], &[1, 3]);
244        let b = summary(&[1], &[3]);
245        assert_eq!(
246            can_dispatch_concurrently(&a, &b),
247            ArmIndependenceVerdict::SerializeRequired {
248                reason: ArmConflict::WriteWriteConflict,
249            }
250        );
251    }
252
253    #[test]
254    fn verdict_is_symmetric_for_writes_and_reads() {
255        let a = summary(&[], &[10]);
256        let b = summary(&[], &[10]);
257        // ww conflict reported the same regardless of arg order.
258        let verdict_ab = can_dispatch_concurrently(&a, &b);
259        let verdict_ba = can_dispatch_concurrently(&b, &a);
260        assert_eq!(verdict_ab, verdict_ba);
261    }
262
263    #[test]
264    fn one_empty_arm_leaves_independent_when_other_alone() {
265        let a = summary(&[1, 2, 3], &[4]);
266        let b = ArmBindingSummary::new();
267        assert_eq!(
268            can_dispatch_concurrently(&a, &b),
269            ArmIndependenceVerdict::Independent
270        );
271        assert_eq!(
272            can_dispatch_concurrently(&b, &a),
273            ArmIndependenceVerdict::Independent
274        );
275    }
276}