const { transpileCuda, Runtime } = require('cuda-rust-wasm');
const fs = require('fs').promises;
const path = require('path');
/**
 * Runs the vector-addition example end to end.
 *
 * Steps: read `../kernels/vector_add.cu` (relative to this file), transpile it
 * with `transpileCuda`, compile the `vectorAdd` kernel, fill two host vectors
 * with random values, copy them to device buffers, launch the kernel, copy the
 * result back, verify it against a host-side sum, and print timing figures.
 *
 * Any error thrown along the way is logged and the process exits with
 * status 1. A verification mismatch does not throw, but sets a non-zero exit
 * code so the failure is visible to calling scripts.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('๐ CUDA-Rust-WASM Basic Vector Operations Example\n');

  try {
    console.log('๐ Initializing runtime...');
    const runtime = new Runtime();

    console.log('๐ Loading CUDA kernels...');
    const vectorAddCode = await fs.readFile(
      path.join(__dirname, '../kernels/vector_add.cu'),
      'utf8'
    );

    console.log('๐ Transpiling CUDA to WebAssembly...');
    const transpiled = await transpileCuda(vectorAddCode, {
      target: 'wasm',
      optimize: true,
      profile: true
    });
    console.log('✅ Transpilation successful!');
    console.log(` Generated code size: ${transpiled.code.length} bytes`);
    if (transpiled.wasmBinary) {
      console.log(` WASM binary size: ${transpiled.wasmBinary.length} bytes`);
    }

    console.log('\n๐จ Compiling kernel...');
    const kernel = await runtime.compileKernel(transpiled.code, 'vectorAdd');

    // Problem size: n float32 elements, `size` bytes per vector.
    const n = 1024 * 1024;
    const size = n * Float32Array.BYTES_PER_ELEMENT;

    console.log(`\n๐ Setting up test data (${n} elements)...`);
    const hostA = new Float32Array(n);
    const hostB = new Float32Array(n);
    const hostC = new Float32Array(n);
    for (let i = 0; i < n; i++) {
      hostA[i] = Math.random() * 100;
      hostB[i] = Math.random() * 100;
    }

    console.log('๐พ Allocating device memory...');
    const deviceA = await runtime.allocate(size);
    const deviceB = await runtime.allocate(size);
    const deviceC = await runtime.allocate(size);
    // NOTE(review): the buffers are never released explicitly; if the runtime
    // exposes a free/dispose call it should be invoked during cleanup — confirm
    // against the cuda-rust-wasm API.

    console.log('๐ค Copying data to device...');
    const copyStart = performance.now();
    await deviceA.copyFrom(hostA.buffer);
    await deviceB.copyFrom(hostB.buffer);
    const copyTime = performance.now() - copyStart;

    // One thread per element: enough blocks of `blockSize` to cover all n.
    const blockSize = 256;
    const gridSize = Math.ceil(n / blockSize);
    console.log(`\n๐ Launching kernel (grid: ${gridSize}, block: ${blockSize})...`);
    kernel.setBlockDim(blockSize);
    kernel.setGridDim(gridSize);
    // Argument slots 0-2 are the three buffers, slot 3 is the element count.
    kernel.setBuffer(0, deviceA);
    kernel.setBuffer(1, deviceB);
    kernel.setBuffer(2, deviceC);
    kernel.setArg(3, n);

    const kernelStart = performance.now();
    await kernel.launch();
    const kernelTime = performance.now() - kernelStart;

    console.log('๐ฅ Copying results back...');
    const copyBackStart = performance.now();
    await deviceC.copyTo(hostC.buffer);
    const copyBackTime = performance.now() - copyBackStart;

    console.log('\n๐ Verifying results...');
    // Check every element, not just a prefix: a prefix only exercises the
    // first few blocks of the grid and would miss errors in later ones.
    // The expected value is computed in double precision while hostC holds
    // float32 values, hence the absolute tolerance.
    const tolerance = 1e-5;
    const maxReported = 10;
    let errors = 0;
    for (let i = 0; i < n; i++) {
      const expected = hostA[i] + hostB[i];
      const actual = hostC[i];
      if (Math.abs(actual - expected) > tolerance) {
        if (errors < maxReported) {
          console.log(` Error at index ${i}: expected ${expected}, got ${actual}`);
        }
        errors++;
      }
    }
    if (errors === 0) {
      console.log('✅ All results verified correct!');
    } else {
      console.log(`โ Found ${errors} errors`);
    }

    console.log('\n๐ Performance Summary:');
    const totalMs = copyTime + kernelTime + copyBackTime;
    console.log(` Data transfer to device: ${copyTime.toFixed(2)}ms`);
    console.log(` Kernel execution: ${kernelTime.toFixed(2)}ms`);
    console.log(` Data transfer from device: ${copyBackTime.toFixed(2)}ms`);
    console.log(` Total time: ${totalMs.toFixed(2)}ms`);

    // Rates are computed over the whole round trip (both transfers plus the
    // kernel), in seconds: one addition per element, three buffers moved.
    const totalTime = totalMs / 1000;
    const throughput = n / totalTime / 1e9;
    console.log(` Throughput: ${throughput.toFixed(2)} GFLOPS`);
    console.log(` Effective bandwidth: ${(3 * size / totalTime / 1e9).toFixed(2)} GB/s`);

    if (transpiled.profile) {
      console.log('\n๐ Profiling Data:');
      console.log(` Parse time: ${transpiled.profile.parseTime.toFixed(2)}ms`);
      console.log(` Transpile time: ${transpiled.profile.transpileTime.toFixed(2)}ms`);
      console.log(` Optimize time: ${transpiled.profile.optimizeTime.toFixed(2)}ms`);
      console.log(` Total transpile time: ${transpiled.profile.totalTime.toFixed(2)}ms`);
    }

    console.log('\n๐งน Cleaning up...');
    await runtime.synchronize();

    if (errors === 0) {
      console.log('✅ Example completed successfully!');
    } else {
      // Do not report success (or exit 0) when the kernel output was wrong.
      process.exitCode = 1;
      console.error('Example finished with verification errors.');
    }
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances; avoid
    // dereferencing properties on null/undefined or primitives.
    if (error instanceof Error) {
      console.error('โ Error:', error.message);
      if (error.stack) {
        console.error('Stack trace:', error.stack);
      }
    } else {
      console.error('โ Error:', error);
    }
    process.exit(1);
  }
}
// Script entry point. Last-resort handler: anything that still rejects out of
// main() is logged and reflected in the exit status instead of exiting 0.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});