use cutile;
use cutile::api;
use cutile::tile_kernel::{DeviceOp, TileKernel};
use cutile_compiler::compiler::utils::CompileOptions;
use cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
use cutile_compiler::specialization::{DivHint, SpecializationBits};
use std::path::Path;
use std::sync::{Arc, Mutex};
/// Shorthand for a [`DivHint`] with the given `divisor` and its `max` field fixed at 16.
fn dh(divisor: i32) -> DivHint {
    let max = 16;
    DivHint { divisor, max }
}
// Held by `launch_raw_ptr_scalar_kernel_and_read_mlir` for its whole body so that only one
// caller at a time clears, fills and reads the dump directory below.
static RAW_PTR_DUMP_LOCK: Mutex<()> = Mutex::new(());
// Directory that the launch helper wipes, recreates and scans for `.mlir` files. The same path
// is spelled out literally in the `dump_mlir_dir` attribute of `raw_ptr_scalar_kernel`, so the
// two must be kept in sync by hand.
const RAW_PTR_SCALAR_DUMP_DIR: &str = "/tmp/cutile_raw_ptr_scalar_mlir";
// Sibling test-support module; this file uses `common::with_test_stack` from it.
mod common;
// Kernels compiled by the tests in this file. The tests refer to them by string name
// ("spec_test_module" plus the function name) when constructing the compiler.
// NOTE(review): `__module_ast_self`, imported from this module further down, is not written
// here, so it is presumably generated by `#[cutile::module]` -- confirm against the macro.
#[cutile::module]
mod spec_test_module {
use cutile::core::*;
// Stores a tile of constant 1.0 (shaped like `output`) into `output`; no entry-level options.
#[cutile::entry()]
fn simple_kernel<const S: [i32; 1]>(output: &mut Tensor<f32, S>) {
let tile: Tile<f32, S> = constant(1.0f32, output.shape());
output.store(tile);
}
// Same body as `simple_kernel`, but declares an `sm_120` optimization hint with
// `max_divisibility = 8`; the `entry_max_divisibility_*` tests compile this kernel.
#[cutile::entry(optimization_hints = (sm_120 = (max_divisibility = 8,),))]
fn capped_kernel<const S: [i32; 1]>(output: &mut Tensor<f32, S>) {
let tile: Tile<f32, S> = constant(1.0f32, output.shape());
output.store(tile);
}
// Same body again, with `print_ir = true` and an extra, unused integer scalar `_n` that the
// scalar-hint tests attach a `DivHint` to.
#[cutile::entry(print_ir = true)]
fn scalar_kernel<const S: [i32; 1]>(output: &mut Tensor<f32, S>, _n: i32) {
let tile: Tile<f32, S> = constant(1.0f32, output.shape());
output.store(tile);
}
// Empty-bodied kernel taking a raw pointer and an integer scalar. The `dump_mlir_dir` path is
// the same string as `RAW_PTR_SCALAR_DUMP_DIR`; keep the two in sync.
#[cutile::entry(dump_mlir_dir = "/tmp/cutile_raw_ptr_scalar_mlir")]
unsafe fn raw_ptr_scalar_kernel(_ptr: *mut f32, _n: i32) {}
}
use spec_test_module::{__module_ast_self, raw_ptr_scalar_kernel};
/// Compiles kernel `name` with the given strides and specialization bits, using
/// `CompileOptions::default()`, and returns whatever `compile_with_spec_and_options` returns.
fn compile_with_spec(
    name: &str,
    strides: &[(&str, &[i32])],
    specs: &[(&str, &SpecializationBits)],
) -> String {
    let default_options = CompileOptions::default();
    compile_with_spec_and_options(name, strides, specs, &default_options)
}
fn compile_with_spec_and_options(
name: &str,
strides: &[(&str, &[i32])],
specs: &[(&str, &SpecializationBits)],
options: &CompileOptions,
) -> String {
compile_kernel(name, &[128.to_string()], strides, specs, &[], options)
}
/// Compiles the named kernel from `spec_test_module` for the `sm_120` target and returns the
/// compiled module rendered with `to_string()`.
///
/// # Panics
///
/// Panics if the module set cannot be built, the compiler cannot be constructed, or
/// compilation fails.
fn compile_kernel(
    name: &str,
    function_generic_args: &[String],
    strides: &[(&str, &[i32])],
    specs: &[(&str, &SpecializationBits)],
    scalar_hints: &[(&str, &DivHint)],
    options: &CompileOptions,
) -> String {
    let kernel_modules = CUDATileModules::from_kernel(__module_ast_self())
        .expect("Failed to create CUDATileModules");
    let function_compiler = CUDATileFunctionCompiler::new(
        &kernel_modules,
        "spec_test_module",
        name,
        function_generic_args,
        strides,
        specs,
        scalar_hints,
        None,
        String::from("sm_120"),
        options,
    )
    .expect("Failed to create compiler");
    let compiled_op = function_compiler.compile().expect("Failed to compile");
    let rendered = compiled_op.to_string();
    // Release the compiled op before the compiler that produced it, then hand back the text.
    drop(compiled_op);
    drop(function_compiler);
    rendered
}
/// Shape and base-pointer hints of 16 on `output` must show up as `div_by<16>` in the MLIR.
#[test]
fn spec_bits_div_16_produces_div_by_16() {
    common::with_test_stack(|| {
        let output_bits = SpecializationBits {
            shape_div: vec![dh(16)],
            stride_div: vec![dh(4)],
            stride_one: vec![true],
            base_ptr_div: dh(16),
            elements_disjoint: true,
        };
        let mlir = compile_with_spec(
            "simple_kernel",
            &[("output", &[1])],
            &[("output", &output_bits)],
        );
        println!("{mlir}");
        let has_div_by_16 = mlir.contains("div_by<16>");
        assert!(
            has_div_by_16,
            "Expected div_by<16> for shape divisible by 16.\nMLIR:\n{mlir}"
        );
    });
}
/// Shape and base-pointer hints of 8 on `output` must show up as `div_by<8>` in the MLIR.
#[test]
fn spec_bits_div_8_produces_div_by_8() {
    common::with_test_stack(|| {
        let output_bits = SpecializationBits {
            shape_div: vec![dh(8)],
            stride_div: vec![dh(4)],
            stride_one: vec![true],
            base_ptr_div: dh(8),
            elements_disjoint: true,
        };
        let mlir = compile_with_spec(
            "simple_kernel",
            &[("output", &[1])],
            &[("output", &output_bits)],
        );
        println!("{mlir}");
        let has_div_by_8 = mlir.contains("div_by<8>");
        assert!(
            has_div_by_8,
            "Expected div_by<8> for shape divisible by 8.\nMLIR:\n{mlir}"
        );
    });
}
/// Compiling without any specialization bits must not produce a `div_by` in the MLIR.
#[test]
fn no_spec_bits_no_div_by() {
    common::with_test_stack(|| {
        let mlir = compile_with_spec("simple_kernel", &[("output", &[1])], &[]);
        println!("{mlir}");
        let mentions_div_by = mlir.contains("div_by");
        assert!(
            !mentions_div_by,
            "Expected no div_by when no spec bits provided.\nMLIR:\n{mlir}"
        );
    });
}
/// Specialization bits whose divisors are all 1 must not produce a `div_by` in the MLIR.
#[test]
fn spec_bits_div_1_no_div_by() {
    common::with_test_stack(|| {
        let trivial_bits = SpecializationBits {
            shape_div: vec![dh(1)],
            stride_div: vec![dh(1)],
            stride_one: vec![true],
            base_ptr_div: dh(1),
            elements_disjoint: true,
        };
        let mlir = compile_with_spec(
            "simple_kernel",
            &[("output", &[1])],
            &[("output", &trivial_bits)],
        );
        println!("{mlir}");
        let mentions_div_by = mlir.contains("div_by");
        assert!(
            !mentions_div_by,
            "Expected no div_by when all divisors are 1.\nMLIR:\n{mlir}"
        );
    });
}
/// `TileFunctionKey`s that differ only in their specialization bits must compare unequal,
/// while keys built from equal bits must compare equal.
#[test]
fn different_spec_bits_different_cache_keys() {
    use cutile::tile_kernel::TileFunctionKey;

    // Every key below is built with the same constructor arguments except for the
    // specialization bits attached to "output".
    let key_for = |bits: &SpecializationBits| {
        TileFunctionKey::new(
            "m".into(),
            "f".into(),
            vec![],
            vec![],
            vec![("output".into(), bits.clone())],
            vec![],
            None,
            CompileOptions::default(),
        )
    };

    let bits_16 = SpecializationBits {
        shape_div: vec![dh(16)],
        stride_div: vec![dh(16)],
        stride_one: vec![true],
        base_ptr_div: dh(16),
        elements_disjoint: true,
    };
    let bits_8 = SpecializationBits {
        shape_div: vec![dh(8)],
        stride_div: vec![dh(8)],
        stride_one: vec![true],
        base_ptr_div: dh(8),
        elements_disjoint: true,
    };

    let key_a = key_for(&bits_16);
    let key_b = key_for(&bits_8);
    let key_a2 = key_for(&bits_16);

    assert_ne!(
        key_a, key_b,
        "Different spec bits should produce different cache keys"
    );
    assert_eq!(
        key_a, key_a2,
        "Same spec bits should produce equal cache keys"
    );
}
/// With hints of 16 supplied, `capped_kernel` (entry hint `max_divisibility = 8`) must emit
/// `div_by<8>` and never `div_by<16>`.
#[test]
fn entry_max_divisibility_caps_inferred_div() {
    common::with_test_stack(|| {
        let output_bits = SpecializationBits {
            shape_div: vec![dh(16)],
            stride_div: vec![dh(16)],
            stride_one: vec![true],
            base_ptr_div: dh(16),
            elements_disjoint: true,
        };
        let mlir = compile_with_spec(
            "capped_kernel",
            &[("output", &[1])],
            &[("output", &output_bits)],
        );
        println!("{mlir}");
        let has_capped = mlir.contains("div_by<8>");
        assert!(
            has_capped,
            "Expected div_by<8> (capped from 16 by max_divisibility=8).\nMLIR:\n{mlir}"
        );
        let has_uncapped = mlir.contains("div_by<16>");
        assert!(
            !has_uncapped,
            "Should not contain div_by<16> when max_divisibility=8.\nMLIR:\n{mlir}"
        );
    });
}
/// With hints of only 4 supplied, `capped_kernel` (entry hint `max_divisibility = 8`) must
/// emit `div_by<4>` and must not raise it to `div_by<8>`.
#[test]
fn entry_max_divisibility_does_not_inflate() {
    common::with_test_stack(|| {
        let output_bits = SpecializationBits {
            shape_div: vec![dh(4)],
            stride_div: vec![dh(4)],
            stride_one: vec![true],
            base_ptr_div: dh(4),
            elements_disjoint: true,
        };
        let mlir = compile_with_spec(
            "capped_kernel",
            &[("output", &[1])],
            &[("output", &output_bits)],
        );
        println!("{mlir}");
        let has_inferred = mlir.contains("div_by<4>");
        assert!(
            has_inferred,
            "Expected div_by<4> (not inflated by max_divisibility=8).\nMLIR:\n{mlir}"
        );
        let has_inflated = mlir.contains("div_by<8>");
        assert!(
            !has_inflated,
            "Should not contain div_by<8> when inferred is only 4.\nMLIR:\n{mlir}"
        );
    });
}
/// Setting `max_divisibility(4)` on the `CompileOptions` must limit hints of 16 to `div_by<4>`.
#[test]
fn runtime_max_divisibility_overrides_entry_hint() {
    common::with_test_stack(|| {
        let output_bits = SpecializationBits {
            shape_div: vec![dh(16)],
            stride_div: vec![dh(16)],
            stride_one: vec![true],
            base_ptr_div: dh(16),
            elements_disjoint: true,
        };
        let capped_options = CompileOptions::default().max_divisibility(4);
        let mlir = compile_with_spec_and_options(
            "simple_kernel",
            &[("output", &[1])],
            &[("output", &output_bits)],
            &capped_options,
        );
        println!("{mlir}");
        let has_capped = mlir.contains("div_by<4>");
        assert!(
            has_capped,
            "Expected div_by<4> from runtime max_divisibility override.\nMLIR:\n{mlir}"
        );
        let has_uncapped = mlir.contains("div_by<16>");
        assert!(
            !has_uncapped,
            "Should not contain div_by<16> when runtime max_divisibility=4.\nMLIR:\n{mlir}"
        );
    });
}
/// A scalar hint built with `DivHint::from_value(1024)` for `_n` must yield `div_by<16>`.
#[test]
fn scalar_int_hint_emits_assume_div_by_in_entry_wrapper() {
    common::with_test_stack(|| {
        let n_hint = DivHint::from_value(1024);
        let default_options = CompileOptions::default();
        let mlir = compile_kernel(
            "scalar_kernel",
            &[128.to_string()],
            &[("output", &[1])],
            &[],
            &[("_n", &n_hint)],
            &default_options,
        );
        println!("{mlir}");
        let has_div_by_16 = mlir.contains("div_by<16>");
        assert!(
            has_div_by_16,
            "Expected scalar hint to emit div_by<16>.\nMLIR:\n{mlir}"
        );
    });
}
/// With `max_divisibility(4)` in the options, the scalar hint for `_n` must be emitted as
/// `div_by<4>` rather than `div_by<16>`.
#[test]
fn scalar_int_hint_respects_runtime_max_divisibility() {
    common::with_test_stack(|| {
        let n_hint = DivHint::from_value(1024);
        let capped_options = CompileOptions::default().max_divisibility(4);
        let mlir = compile_kernel(
            "scalar_kernel",
            &[128.to_string()],
            &[("output", &[1])],
            &[],
            &[("_n", &n_hint)],
            &capped_options,
        );
        println!("{mlir}");
        let has_capped = mlir.contains("div_by<4>");
        assert!(
            has_capped,
            "Expected scalar hint to be capped to div_by<4>.\nMLIR:\n{mlir}"
        );
        let has_uncapped = mlir.contains("div_by<16>");
        assert!(
            !has_uncapped,
            "Should not contain div_by<16> when runtime max_divisibility=4.\nMLIR:\n{mlir}"
        );
    });
}
/// Hints for both the raw pointer `_ptr` and the scalar `_n` must each produce an
/// `assume div_by<16>`, giving exactly two occurrences in the MLIR.
#[test]
fn raw_pointer_integer_scalar_hint_emits_assume_div_by() {
    common::with_test_stack(|| {
        let pointer_hint = DivHint::from_ptr(0x1000);
        let n_hint = DivHint::from_value(1024);
        let default_options = CompileOptions::default();
        let mlir = compile_kernel(
            "raw_ptr_scalar_kernel",
            &[],
            &[],
            &[],
            &[("_ptr", &pointer_hint), ("_n", &n_hint)],
            &default_options,
        );
        println!("{mlir}");
        let assume_16_count = mlir.matches("assume div_by<16>").count();
        assert_eq!(
            assume_16_count,
            2,
            "Expected raw pointer and integer scalar hints to both emit div_by<16>.\nMLIR:\n{mlir}"
        );
    });
}
/// Launching `raw_ptr_scalar_kernel` with `n = 12` must dump MLIR containing the kernel entry,
/// an `assume div_by<4>`, an `assume div_by<16>`, and exactly two divisibility assumes in total.
#[test]
fn raw_pointer_launch_computes_scalar_div_hint() {
    common::with_test_stack(|| {
        let launch_options = CompileOptions::default().occupancy(3);
        let mlir = launch_raw_ptr_scalar_kernel_and_read_mlir(12, launch_options);

        let has_entry = mlir.contains("entry @raw_ptr_scalar_kernel_entry");
        assert!(
            has_entry,
            "Expected dumped MLIR for raw_ptr_scalar_kernel.\nMLIR:\n{mlir}"
        );

        let has_scalar_assume = mlir.contains("assume div_by<4>");
        assert!(
            has_scalar_assume,
            "Expected launcher-computed scalar hint for n=12 to emit div_by<4>.\nMLIR:\n{mlir}"
        );

        let has_pointer_assume = mlir.contains("assume div_by<16>");
        assert!(
            has_pointer_assume,
            "Expected launcher-computed raw pointer hint to emit div_by<16>.\nMLIR:\n{mlir}"
        );

        let total_assumes = mlir.matches("assume div_by<").count();
        assert_eq!(
            total_assumes,
            2,
            "Expected raw pointer and scalar argument to receive divisibility assumes.\nMLIR:\n{mlir}"
        );
    });
}
/// Launches `raw_ptr_scalar_kernel` for several values of `n` and checks the scalar assume
/// emitted for each: none for 1, the value itself for 2/4/8/16, and 16 for 32. Each case uses
/// a distinct `occupancy` value in its compile options.
#[test]
fn raw_pointer_launch_scalar_div_hint_covers_powers_of_two_through_16() {
    common::with_test_stack(|| {
        // (n, expected scalar divisor or None when no scalar assume is expected, occupancy)
        let cases = [
            (1, None, 4),
            (2, Some(2), 5),
            (4, Some(4), 6),
            (8, Some(8), 7),
            (16, Some(16), 8),
            (32, Some(16), 9),
        ];
        for (n, expected_divisor, occupancy) in cases {
            let launch_options = CompileOptions::default().occupancy(occupancy);
            let mlir = launch_raw_ptr_scalar_kernel_and_read_mlir(n, launch_options);
            assert!(
                mlir.contains("assume div_by<16>"),
                "Expected raw pointer hint for n={n} to emit div_by<16>.\nMLIR:\n{mlir}"
            );

            let total_assumes = mlir.matches("assume div_by<").count();
            let Some(divisor) = expected_divisor else {
                assert_eq!(
                    total_assumes,
                    1,
                    "Expected only the raw pointer argument to receive a divisibility assume for n={n}.\nMLIR:\n{mlir}"
                );
                continue;
            };

            let expected = format!("assume div_by<{divisor}>");
            if divisor != 16 {
                assert!(
                    mlir.contains(&expected),
                    "Expected launcher-computed scalar hint for n={n} to emit {expected}.\nMLIR:\n{mlir}"
                );
            } else {
                // The pointer assume is also div_by<16>, so the same text must appear twice.
                assert_eq!(
                    mlir.matches(&expected).count(),
                    2,
                    "Expected raw pointer and scalar hints for n={n} to both emit {expected}.\nMLIR:\n{mlir}"
                );
            }
            assert_eq!(
                total_assumes,
                2,
                "Expected raw pointer and scalar arguments to receive divisibility assumes for n={n}.\nMLIR:\n{mlir}"
            );
        }
    });
}
/// Launches `raw_ptr_scalar_kernel` with scalar argument `n` and the given compile `options`,
/// then returns the concatenated contents of every `.mlir` file found in
/// `RAW_PTR_SCALAR_DUMP_DIR`.
///
/// The dump directory is removed and recreated before the launch, and the whole sequence runs
/// under `RAW_PTR_DUMP_LOCK` so that concurrent callers do not see each other's files.
///
/// # Panics
///
/// Panics if the dump directory cannot be cleared or created, if the device allocation or the
/// kernel launch fails, or if a dumped file cannot be read.
fn launch_raw_ptr_scalar_kernel_and_read_mlir(n: i32, options: CompileOptions) -> String {
    // The mutex guards no data, so a lock poisoned by an earlier failing test is safe to reuse.
    // Recovering here keeps one test failure from cascading into unrelated lock panics.
    let _lock = RAW_PTR_DUMP_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let dump_dir = Path::new(RAW_PTR_SCALAR_DUMP_DIR);
    // A missing directory is the normal first-run case. Any other failure would leave stale
    // `.mlir` files behind and silently corrupt the counts the callers assert on.
    match std::fs::remove_dir_all(dump_dir) {
        Ok(()) => {}
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
        Err(err) => panic!("clear stale MLIR dump dir {dump_dir:?}: {err}"),
    }
    std::fs::create_dir_all(dump_dir).expect("create MLIR dump dir");
    // One-element device buffer whose pointer is handed to the kernel; `backing` stays alive
    // until the end of this function, i.e. past the synchronous launch below.
    let backing = api::copy_host_vec_to_device(&Arc::new(vec![0.0f32; 1]))
        .sync()
        .expect("alloc backing tensor");
    let ptr = backing.device_pointer();
    // SAFETY: NOTE(review) -- the kernel body is empty and `ptr` comes from the live `backing`
    // allocation above; confirm there are no further requirements on `raw_ptr_scalar_kernel`.
    unsafe { raw_ptr_scalar_kernel(ptr, n) }
        .grid((1, 1, 1))
        .compile_options(options)
        .sync()
        .expect("raw pointer scalar kernel launch");
    let mut mlir = String::new();
    for entry in std::fs::read_dir(dump_dir).expect("read MLIR dump dir") {
        let path = entry.expect("read MLIR dump entry").path();
        if path.extension().is_some_and(|ext| ext == "mlir") {
            mlir.push_str(
                &std::fs::read_to_string(&path)
                    .unwrap_or_else(|err| panic!("read dumped MLIR {path:?}: {err}")),
            );
        }
    }
    mlir
}