morok-schedule 0.1.0-alpha.2

Optimization passes and pattern engine for the Morok ML compiler
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
use std::sync::Arc;

use morok_ir::{Op, UOp};

use crate::rangeify::kernel::split_store;
use crate::rangeify::{KernelContext, bufferize_to_store};
#[allow(unused_imports)]
use crate::test::unit::rangeify::helpers::extract_kernel;

/// Helper to call split_store with the new signature
fn call_split_store(x: &Arc<UOp>) -> Option<Arc<UOp>> {
    let mut uop_list = Vec::new();
    split_store(&mut uop_list, x)
}

#[test]
fn test_bufferize_to_store_global() {
    let mut ctx = KernelContext::new();

    // Create a simple BUFFERIZE with one range
    let compute = UOp::native_const(42.0f32);
    let range = UOp::range_const(10, 0);

    let bufferize = UOp::bufferize_global(compute.clone(), vec![range.clone()]);

    // Convert to STORE
    let result = bufferize_to_store(&bufferize, &mut ctx, true);

    assert!(result.is_some());
    let result = result.unwrap();

    // bufferize_to_store returns AFTER(passthrough=BUFFER, deps=[END(STORE)])
    // Following Tinygrad's architecture: .store().end(*rngs)
    // BUFFER → DEFINE_GLOBAL conversion happens later in split_store
    let Op::After { passthrough, deps } = result.op() else {
        panic!("Expected AFTER operation, got {:?}", result.op());
    };

    // Passthrough should be BUFFER (not DEFINE_GLOBAL - that conversion happens in split_store)
    assert!(matches!(passthrough.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", passthrough.op());
    assert_eq!(deps.len(), 1);

    // Deps should contain END wrapping STORE
    let Op::End { computation, ranges: end_ranges } = deps[0].op() else {
        panic!("Expected END operation in deps, got {:?}", deps[0].op());
    };

    // END should have 1 range
    assert_eq!(end_ranges.len(), 1);
    assert!(std::sync::Arc::ptr_eq(&end_ranges[0], &range));

    // Unwrap END to get STORE
    let Op::Store { index, value, ranges } = computation.op() else {
        panic!("Expected STORE operation inside END, got {:?}", computation.op());
    };

    // STORE should have empty ranges (iteration space on END)
    assert!(ranges.is_empty());

    // Index should contain the buffer reference
    let Op::Index { buffer, .. } = index.op() else {
        panic!("Expected INDEX operation, got {:?}", index.op());
    };
    assert!(matches!(buffer.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", buffer.op());
    assert!(std::sync::Arc::ptr_eq(buffer, passthrough));

    // Value should be the compute
    assert!(std::sync::Arc::ptr_eq(value, &compute));

    // Index should be INDEX operation with the range
    assert!(matches!(index.op(), Op::Index { .. }));

    // Verify context state - global_counter is NOT incremented for BUFFER ops
    // (it's only used for DEFINE_GLOBAL/DEFINE_LOCAL counters)
    assert_eq!(ctx.local_counter, 0);
    assert!(ctx.has_buffer(&bufferize));
}

#[test]
fn test_bufferize_to_store_local_with_barrier() {
    let mut ctx = KernelContext::new();

    // Create BUFFERIZE with LOCAL addrspace
    let compute = UOp::native_const(1.0f32);
    let range = UOp::range_const(5, 0);

    let bufferize = UOp::bufferize_local(compute.clone(), vec![range.clone()]);

    // Convert to STORE
    let result = bufferize_to_store(&bufferize, &mut ctx, true);

    assert!(result.is_some());
    let result = result.unwrap();

    // bufferize_to_store returns AFTER(passthrough=DEFINE_LOCAL, deps=[BARRIER(END(STORE))])
    let Op::After { passthrough, deps } = result.op() else {
        panic!("Expected AFTER operation, got {:?}", result.op());
    };

    // Passthrough should be DEFINE_LOCAL
    assert!(matches!(passthrough.op(), Op::DefineLocal(0)));
    assert_eq!(deps.len(), 1);

    // Unwrap AFTER to get BARRIER
    let Op::Barrier { src, .. } = deps[0].op() else {
        panic!("Expected BARRIER operation in deps");
    };

    // BARRIER wraps END(STORE)
    let Op::End { computation, ranges: end_ranges } = src.op() else {
        panic!("Expected END operation inside BARRIER, got {:?}", src.op());
    };

    // END should have 1 range
    assert_eq!(end_ranges.len(), 1);
    assert!(std::sync::Arc::ptr_eq(&end_ranges[0], &range));

    // Unwrap END to get STORE
    let Op::Store { index, value, ranges } = computation.op() else {
        panic!("Expected STORE operation inside END, got {:?}", computation.op());
    };

    // STORE should have empty ranges
    assert!(ranges.is_empty());

    // Index should contain the buffer reference
    let Op::Index { buffer, .. } = index.op() else {
        panic!("Expected INDEX operation, got {:?}", index.op());
    };
    assert!(matches!(buffer.op(), Op::DefineLocal(0)));
    assert!(std::sync::Arc::ptr_eq(value, &compute));

    // Verify context state
    assert_eq!(ctx.global_counter, 0);
    assert_eq!(ctx.local_counter, 1);
    assert!(ctx.has_buffer(&bufferize));
}

#[test]
#[should_panic(expected = "unexpected multi-range")]
fn test_bufferize_to_store_multiple_ranges() {
    let mut ctx = KernelContext::new();

    // Create BUFFERIZE with multiple ranges
    let compute = UOp::native_const(100i32);
    let range1 = UOp::range_const(4, 0);
    let range2 = UOp::range_const(8, 1);

    let bufferize = UOp::bufferize_global(compute.clone(), vec![range1.clone(), range2.clone()]);

    // Convert to STORE
    let result = bufferize_to_store(&bufferize, &mut ctx, true);

    assert!(result.is_some());
    let result = result.unwrap();

    // bufferize_to_store returns AFTER(passthrough=BUFFER, deps=[END(STORE)])
    // BUFFER → DEFINE_GLOBAL conversion happens later in split_store
    let Op::After { passthrough, deps } = result.op() else {
        panic!("Expected AFTER operation, got {:?}", result.op());
    };

    // Passthrough should be BUFFER (not DEFINE_GLOBAL)
    assert!(matches!(passthrough.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", passthrough.op());
    assert_eq!(deps.len(), 1);

    // Deps should contain END wrapping STORE
    let Op::End { computation, ranges: end_ranges } = deps[0].op() else {
        panic!("Expected END operation in deps, got {:?}", deps[0].op());
    };

    // END should have 2 ranges in the same order
    assert_eq!(end_ranges.len(), 2);
    assert!(std::sync::Arc::ptr_eq(&end_ranges[0], &range1));
    assert!(std::sync::Arc::ptr_eq(&end_ranges[1], &range2));

    // Unwrap END to get STORE
    let Op::Store { index, value, ranges } = computation.op() else {
        panic!("Expected STORE operation inside END, got {:?}", computation.op());
    };

    // STORE should have empty ranges
    assert!(ranges.is_empty());

    // Value should be the compute
    assert!(std::sync::Arc::ptr_eq(value, &compute));

    // Index should be INDEX with linearized index (2 ranges → 1 linear index)
    // For dims [4, 8], strides are [8, 1], so linear = range1 * 8 + range2
    let Op::Index { buffer: idx_buffer, indices, .. } = index.op() else {
        panic!("Expected INDEX operation");
    };

    // Buffer should be BUFFER (same as passthrough)
    assert!(matches!(idx_buffer.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", idx_buffer.op());
    assert!(std::sync::Arc::ptr_eq(idx_buffer, passthrough));

    // Should have 1 linearized index (not 2 separate ranges)
    assert_eq!(indices.len(), 1, "Multi-index should be linearized to single index");

    // Verify context
    assert!(ctx.has_buffer(&bufferize));
}

#[test]
fn test_non_bufferize_returns_none() {
    let mut ctx = KernelContext::new();

    // Create a non-BUFFERIZE operation
    let const_op = UOp::native_const(1.0f32);

    // Should return None
    let result = bufferize_to_store(&const_op, &mut ctx, true);
    assert!(result.is_none());
}

#[test]
fn test_buffer_tracked_in_context() {
    let mut ctx = KernelContext::new();

    let compute = UOp::native_const(1.0f32);
    let bufferize = UOp::bufferize_global(compute, vec![]);

    // Before conversion, buffer should not be tracked
    assert!(!ctx.has_buffer(&bufferize));

    // Convert to STORE
    bufferize_to_store(&bufferize, &mut ctx, true);

    // After conversion, buffer should be tracked
    assert!(ctx.has_buffer(&bufferize));

    // Should be able to get the AFTER wrapping BUFFER
    // bufferize_to_store stores AFTER(buffer, [STORE]) in context
    let replacement = ctx.get_buffer(&bufferize).unwrap();

    // Unwrap AFTER to get the actual BUFFER (not DEFINE_GLOBAL)
    let Op::After { passthrough, .. } = replacement.op() else {
        panic!("Expected AFTER operation, got {:?}", replacement.op());
    };
    assert!(matches!(passthrough.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", passthrough.op());
}

#[test]
fn test_bufferize_to_store_sequential_global_ids() {
    let mut ctx = KernelContext::new();

    // Create three BUFFERIZE operations
    for i in 0..3 {
        let compute = UOp::native_const((i as f64) as f32);
        let bufferize = UOp::bufferize_global(compute, vec![]);

        let result = bufferize_to_store(&bufferize, &mut ctx, true);
        assert!(result.is_some());

        // For BUFFER ops, global_counter is NOT incremented (it's only for DEFINE_GLOBAL)
        // But each BUFFERIZE should be tracked
        assert!(ctx.has_buffer(&bufferize));
        assert_eq!(ctx.local_counter, 0);
    }
}

#[test]
fn test_bufferize_to_store_sequential_local_ids() {
    let mut ctx = KernelContext::new();

    // Create three BUFFERIZE operations with LOCAL addrspace
    for i in 0..3 {
        let compute = UOp::native_const((i as f64) as f32);
        let bufferize = UOp::bufferize_local(compute, vec![]);

        bufferize_to_store(&bufferize, &mut ctx, true);

        // Counter should increment
        assert_eq!(ctx.global_counter, 0);
        assert_eq!(ctx.local_counter, (i + 1) as usize);
    }
}

#[test]
fn test_bufferize_to_store_mixed_global_local() {
    let mut ctx = KernelContext::new();

    let global_compute = UOp::native_const(1.0f32);
    let local_compute = UOp::native_const(2.0f32);

    // Global buffer
    let global_bufferize = UOp::bufferize_global(global_compute.clone(), vec![]);

    // Local buffer
    let local_bufferize = UOp::bufferize_local(local_compute.clone(), vec![]);

    // Convert both
    let global_result = bufferize_to_store(&global_bufferize, &mut ctx, true);
    let local_result = bufferize_to_store(&local_bufferize, &mut ctx, true);

    // For global (BUFFER), global_counter is NOT incremented
    // For local (DEFINE_LOCAL), local_counter IS incremented
    assert_eq!(ctx.local_counter, 1);

    // Global: AFTER(passthrough=BUFFER, deps=[STORE])
    // No ranges means no END wrapper, but still has AFTER
    let global_result = global_result.unwrap();
    let Op::After { passthrough: global_buf, deps: global_deps } = global_result.op() else {
        panic!("Expected AFTER operation for global, got {:?}", global_result.op());
    };
    assert!(matches!(global_buf.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", global_buf.op());
    assert_eq!(global_deps.len(), 1);

    // deps[0] should be STORE (no ranges = no END)
    let Op::Store { index, value, .. } = global_deps[0].op() else {
        panic!("Expected STORE in global AFTER deps, got {:?}", global_deps[0].op());
    };
    // Index should contain the buffer reference
    let Op::Index { buffer, .. } = index.op() else {
        panic!("Expected INDEX operation, got {:?}", index.op());
    };
    assert!(std::sync::Arc::ptr_eq(buffer, global_buf));
    assert!(std::sync::Arc::ptr_eq(value, &global_compute));

    // Local: AFTER(passthrough=DEFINE_LOCAL, deps=[BARRIER(STORE)])
    // No ranges means no END wrapper, but still has AFTER and BARRIER
    let local_result = local_result.unwrap();
    let Op::After { passthrough: local_buf, deps: local_deps } = local_result.op() else {
        panic!("Expected AFTER operation for local, got {:?}", local_result.op());
    };
    assert!(matches!(local_buf.op(), Op::DefineLocal(0)));
    assert_eq!(local_deps.len(), 1);

    // deps[0] should be BARRIER wrapping STORE
    let Op::Barrier { src, .. } = local_deps[0].op() else {
        panic!("Expected BARRIER in local AFTER deps, got {:?}", local_deps[0].op());
    };
    let Op::Store { index, value, .. } = src.op() else {
        panic!("Expected STORE inside BARRIER, got {:?}", src.op());
    };
    // Index should contain the buffer reference
    let Op::Index { buffer, .. } = index.op() else {
        panic!("Expected INDEX operation, got {:?}", index.op());
    };
    assert!(std::sync::Arc::ptr_eq(buffer, local_buf));
    assert!(std::sync::Arc::ptr_eq(value, &local_compute));

    // Verify both are tracked in context
    assert!(ctx.has_buffer(&global_bufferize));
    assert!(ctx.has_buffer(&local_bufferize));
}

#[test]
fn test_bufferize_to_store_integration_with_split_kernel() {
    let mut ctx = KernelContext::new();

    // Create a BUFFERIZE operation with non-OUTER range
    // split_store skips END operations with OUTER ranges (control flow markers)
    // so we use range_const which creates a non-OUTER range
    let compute = UOp::native_const(42.0f32);
    let range = UOp::range_const(10, 0);

    let bufferize = UOp::bufferize_global(compute.clone(), vec![range]);

    // Stage 1: BUFFERIZE → AFTER(BUFFER, [END(STORE)])
    let store_result = bufferize_to_store(&bufferize, &mut ctx, true).unwrap();

    // Verify buffer was tracked
    assert!(ctx.has_buffer(&bufferize));

    // Extract structure: AFTER(passthrough=BUFFER, deps=[END(STORE)])
    let Op::After { passthrough: buffer_node, deps } = store_result.op() else {
        panic!("Expected AFTER operation, got {:?}", store_result.op());
    };
    assert!(matches!(buffer_node.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", buffer_node.op());
    assert_eq!(deps.len(), 1);

    // deps[0] should be END wrapping STORE
    let end_op = &deps[0];
    let Op::End { computation, ranges: end_ranges } = end_op.op() else {
        panic!("Expected END in AFTER deps, got {:?}", end_op.op());
    };
    assert_eq!(end_ranges.len(), 1);
    assert!(matches!(computation.op(), Op::Store { .. }), "Expected STORE inside END");

    // Stage 2: split_store transforms END(STORE) to KERNEL
    // The BUFFER node will be converted to DEFINE_GLOBAL inside the KERNEL
    let kernel = call_split_store(end_op).expect("split_store should create a KERNEL");

    // Verify KERNEL structure
    let Op::Kernel { sources, ast } = kernel.op() else {
        panic!("Expected KERNEL operation, got {:?}", kernel.op());
    };

    // KERNEL sources should contain the BUFFER (mapped to itself by local_to_param_patterns)
    assert!(!sources.is_empty(), "KERNEL should have at least one source");

    // AST should be SINK wrapping the transformed computation
    let Op::Sink { sources: sink_sources } = ast.op() else {
        panic!("Expected SINK operation in kernel AST, got {:?}", ast.op());
    };
    assert_eq!(sink_sources.len(), 1, "SINK should have 1 source");

    // Verify the context stores AFTER with BUFFER passthrough
    let ctx_buffer = ctx.get_buffer(&bufferize).unwrap();
    let Op::After { passthrough, .. } = ctx_buffer.op() else {
        panic!("Expected AFTER in context buffer mapping");
    };
    assert!(matches!(passthrough.op(), Op::Buffer { .. }), "Expected BUFFER, got {:?}", passthrough.op());
}