// flodl_sys/lib.rs
//! Raw FFI bindings to the libtorch C++ shim.
//!
//! Every function that can fail returns a `*mut i8` error string (caller
//! must free it with [`flodl_free_string`]). A null pointer means success.
//!
//! `FlodlTensor` is an opaque `*mut c_void` handle to a heap-allocated
//! `torch::Tensor`. Caller owns it and must free with [`flodl_free_tensor`].

use std::ffi::c_void;

/// Opaque handle to a `torch::Tensor` on the C++ side.
///
/// The pointee is heap-allocated by the shim; the caller owns the handle
/// and must release it with [`flodl_free_tensor`].
pub type FlodlTensor = *mut c_void;

// --- DType constants (must match shim.h) ---
pub const FLODL_FLOAT16: i32 = 5;
pub const FLODL_BFLOAT16: i32 = 15;
pub const FLODL_FLOAT32: i32 = 6;
pub const FLODL_FLOAT64: i32 = 7;
pub const FLODL_INT32: i32 = 3;
pub const FLODL_INT64: i32 = 4;

// --- Device constants (must match shim.h) ---
pub const FLODL_CPU: i32 = 0;
pub const FLODL_CUDA: i32 = 1;
25
26unsafe extern "C" {
27    // --- Tensor creation ---
28
29    pub fn flodl_zeros(
30        shape: *mut i64, ndim: i32, dtype: i32,
31        device_type: i32, device_index: i32,
32        result: *mut FlodlTensor,
33    ) -> *mut i8;
34
35    pub fn flodl_ones(
36        shape: *mut i64, ndim: i32, dtype: i32,
37        device_type: i32, device_index: i32,
38        result: *mut FlodlTensor,
39    ) -> *mut i8;
40
41    pub fn flodl_rand(
42        shape: *mut i64, ndim: i32, dtype: i32,
43        device_type: i32, device_index: i32,
44        result: *mut FlodlTensor,
45    ) -> *mut i8;
46
47    pub fn flodl_randn(
48        shape: *mut i64, ndim: i32, dtype: i32,
49        device_type: i32, device_index: i32,
50        result: *mut FlodlTensor,
51    ) -> *mut i8;
52
53    pub fn flodl_from_blob(
54        data: *mut c_void, shape: *mut i64, ndim: i32,
55        dtype: i32, device_type: i32, device_index: i32,
56        result: *mut FlodlTensor,
57    ) -> *mut i8;
58
59    pub fn flodl_linspace(
60        start: f64, end: f64, steps: i64,
61        dtype: i32, device_type: i32, device_index: i32,
62        result: *mut FlodlTensor,
63    ) -> *mut i8;
64
65    pub fn flodl_arange(
66        start: f64, end: f64, step: f64,
67        dtype: i32, device_type: i32, device_index: i32,
68        result: *mut FlodlTensor,
69    ) -> *mut i8;
70
71    pub fn flodl_expand(
72        t: FlodlTensor, new_shape: *mut i64, ndim: i32,
73        result: *mut FlodlTensor,
74    ) -> *mut i8;
75
76    // --- Tensor lifecycle ---
77
78    pub fn flodl_free_tensor(t: FlodlTensor);
79    pub fn flodl_shallow_clone(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
80
81    // --- Tensor metadata ---
82
83    pub fn flodl_ndim(t: FlodlTensor) -> i32;
84    pub fn flodl_shape(t: FlodlTensor, dim: i32) -> i64;
85    pub fn flodl_dtype(t: FlodlTensor) -> i32;
86    pub fn flodl_device_type(t: FlodlTensor) -> i32;
87    pub fn flodl_device_index(t: FlodlTensor) -> i32;
88    pub fn flodl_numel(t: FlodlTensor) -> i64;
89
90    // --- Data access ---
91
92    pub fn flodl_copy_data(
93        t: FlodlTensor, buffer: *mut c_void, buffer_bytes: i64,
94    ) -> *mut i8;
95
96    // --- Arithmetic ---
97
98    pub fn flodl_add(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
99    pub fn flodl_sub(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
100    pub fn flodl_mul(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
101    pub fn flodl_div(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
102    pub fn flodl_matmul(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
103
104    pub fn flodl_add_scalar(
105        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
106    ) -> *mut i8;
107
108    pub fn flodl_mul_scalar(
109        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
110    ) -> *mut i8;
111
112    pub fn flodl_div_scalar(
113        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
114    ) -> *mut i8;
115
116    pub fn flodl_neg(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
117
118    // --- Activations ---
119
120    pub fn flodl_relu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
121    pub fn flodl_sigmoid(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
122    pub fn flodl_tanh_op(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
123    pub fn flodl_softmax(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
124    pub fn flodl_log_softmax(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
125    pub fn flodl_gelu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
126    pub fn flodl_silu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
127    pub fn flodl_leaky_relu(
128        t: FlodlTensor, negative_slope: f64, result: *mut FlodlTensor,
129    ) -> *mut i8;
130    pub fn flodl_elu(t: FlodlTensor, alpha: f64, result: *mut FlodlTensor) -> *mut i8;
131    pub fn flodl_softplus(
132        t: FlodlTensor, beta: f64, threshold: f64, result: *mut FlodlTensor,
133    ) -> *mut i8;
134    pub fn flodl_mish(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
135    pub fn flodl_selu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
136    pub fn flodl_hardswish(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
137    pub fn flodl_hardsigmoid(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
138    pub fn flodl_prelu(t: FlodlTensor, weight: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
139
140    // --- Layer normalization ---
141
142    pub fn flodl_native_layer_norm(
143        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
144        normalized_size: i64, eps: f64,
145        output: *mut FlodlTensor, mean: *mut FlodlTensor, rstd: *mut FlodlTensor,
146    ) -> *mut i8;
147
148    // --- Group normalization ---
149
150    pub fn flodl_group_norm(
151        input: FlodlTensor, num_groups: i64,
152        weight: FlodlTensor, bias: FlodlTensor,
153        eps: f64, result: *mut FlodlTensor,
154    ) -> *mut i8;
155
156    // --- Element-wise math ---
157
158    pub fn flodl_exp(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
159    pub fn flodl_log(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
160    pub fn flodl_sqrt(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
161    pub fn flodl_abs(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
162    pub fn flodl_triu(t: FlodlTensor, diagonal: i64, result: *mut FlodlTensor) -> *mut i8;
163    pub fn flodl_tril(t: FlodlTensor, diagonal: i64, result: *mut FlodlTensor) -> *mut i8;
164
165    pub fn flodl_pow_scalar(
166        t: FlodlTensor, exponent: f64, result: *mut FlodlTensor,
167    ) -> *mut i8;
168
169    pub fn flodl_clamp(
170        t: FlodlTensor, min_val: f64, max_val: f64, result: *mut FlodlTensor,
171    ) -> *mut i8;
172
173    pub fn flodl_clamp_min(
174        t: FlodlTensor, min_val: f64, result: *mut FlodlTensor,
175    ) -> *mut i8;
176
177    pub fn flodl_clamp_max(
178        t: FlodlTensor, max_val: f64, result: *mut FlodlTensor,
179    ) -> *mut i8;
180
181    pub fn flodl_log1p(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
182    pub fn flodl_expm1(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
183    pub fn flodl_log2(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
184    pub fn flodl_log10(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
185
186    // --- Reductions ---
187
188    pub fn flodl_sum(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
189    pub fn flodl_mean(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
190
191    pub fn flodl_sum_dim(
192        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
193    ) -> *mut i8;
194
195    pub fn flodl_mean_dim(
196        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
197    ) -> *mut i8;
198
199    pub fn flodl_prod(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
200
201    pub fn flodl_prod_dim(
202        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
203    ) -> *mut i8;
204
205    pub fn flodl_cumsum(
206        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
207    ) -> *mut i8;
208
209    pub fn flodl_logsumexp(
210        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
211    ) -> *mut i8;
212
213    pub fn flodl_min(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
214    pub fn flodl_max(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
215    pub fn flodl_norm(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
216
217    pub fn flodl_min_dim(
218        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
219    ) -> *mut i8;
220
221    pub fn flodl_max_dim(
222        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
223    ) -> *mut i8;
224
225    pub fn flodl_argmax(
226        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
227    ) -> *mut i8;
228
229    // --- Comparison (return float masks: 0.0 or 1.0) ---
230
231    pub fn flodl_gt_scalar(
232        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
233    ) -> *mut i8;
234
235    pub fn flodl_ge_scalar(
236        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
237    ) -> *mut i8;
238
239    pub fn flodl_le_scalar(
240        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
241    ) -> *mut i8;
242
243    pub fn flodl_lt_scalar(
244        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
245    ) -> *mut i8;
246
247    pub fn flodl_eq_scalar(
248        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
249    ) -> *mut i8;
250
251    pub fn flodl_ne_scalar(
252        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
253    ) -> *mut i8;
254
255    // --- Boolean / detection (return float masks) ---
256
257    pub fn flodl_isnan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
258    pub fn flodl_isinf(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
259    pub fn flodl_logical_and(
260        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
261    ) -> *mut i8;
262    pub fn flodl_logical_or(
263        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
264    ) -> *mut i8;
265    pub fn flodl_logical_not(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
266    pub fn flodl_any(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
267    pub fn flodl_all(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
268
269    // --- Shape operations ---
270
271    pub fn flodl_reshape(
272        t: FlodlTensor, shape: *mut i64, ndim: i32, result: *mut FlodlTensor,
273    ) -> *mut i8;
274
275    pub fn flodl_transpose(
276        t: FlodlTensor, dim0: i32, dim1: i32, result: *mut FlodlTensor,
277    ) -> *mut i8;
278
279    pub fn flodl_permute(
280        t: FlodlTensor, dims: *mut i64, ndim: i32, result: *mut FlodlTensor,
281    ) -> *mut i8;
282
283    pub fn flodl_select(
284        t: FlodlTensor, dim: i32, index: i64, result: *mut FlodlTensor,
285    ) -> *mut i8;
286
287    pub fn flodl_narrow(
288        t: FlodlTensor, dim: i32, start: i64, length: i64,
289        result: *mut FlodlTensor,
290    ) -> *mut i8;
291
292    pub fn flodl_squeeze(
293        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
294    ) -> *mut i8;
295
296    pub fn flodl_unsqueeze(
297        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
298    ) -> *mut i8;
299
300    pub fn flodl_flatten(
301        t: FlodlTensor, start_dim: i32, end_dim: i32, result: *mut FlodlTensor,
302    ) -> *mut i8;
303
304    // --- Scatter ---
305
306    pub fn flodl_select_scatter(
307        input: FlodlTensor, src: FlodlTensor, dim: i32, index: i64,
308        result: *mut FlodlTensor,
309    ) -> *mut i8;
310
311    pub fn flodl_narrow_scatter(
312        input: FlodlTensor, src: FlodlTensor, dim: i32, start: i64,
313        result: *mut FlodlTensor,
314    ) -> *mut i8;
315
316    // --- Indexing ---
317
318    pub fn flodl_index_select(
319        t: FlodlTensor, dim: i32, index: FlodlTensor,
320        result: *mut FlodlTensor,
321    ) -> *mut i8;
322
323    pub fn flodl_index_add(
324        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
325        result: *mut FlodlTensor,
326    ) -> *mut i8;
327
328    // --- Concatenation ---
329
330    pub fn flodl_cat2(
331        a: FlodlTensor, b: FlodlTensor, dim: i32, result: *mut FlodlTensor,
332    ) -> *mut i8;
333
334    pub fn flodl_cat(
335        tensors: *mut FlodlTensor, count: i32, dim: i32, result: *mut FlodlTensor,
336    ) -> *mut i8;
337
338    pub fn flodl_stack(
339        tensors: *mut FlodlTensor, count: i32, dim: i32, result: *mut FlodlTensor,
340    ) -> *mut i8;
341
342    // --- Masking ---
343
344    pub fn flodl_masked_fill(
345        t: FlodlTensor, mask: FlodlTensor, value: f64,
346        result: *mut FlodlTensor,
347    ) -> *mut i8;
348
349    // --- Conditional ---
350
351    pub fn flodl_where(
352        condition: FlodlTensor, x: FlodlTensor, y: FlodlTensor,
353        result: *mut FlodlTensor,
354    ) -> *mut i8;
355
356    // --- Like constructors ---
357
358    pub fn flodl_zeros_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
359    pub fn flodl_ones_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
360    pub fn flodl_full_like(
361        t: FlodlTensor, value: f64, result: *mut FlodlTensor,
362    ) -> *mut i8;
363    pub fn flodl_rand_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
364    pub fn flodl_randn_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
365
366    // --- Tensor creation (tier 2) ---
367
368    pub fn flodl_randint(
369        low: i64, high: i64, shape: *mut i64, ndim: i32,
370        dtype: i32, device_type: i32, device_index: i32,
371        result: *mut FlodlTensor,
372    ) -> *mut i8;
373
374    pub fn flodl_empty(
375        shape: *mut i64, ndim: i32, dtype: i32,
376        device_type: i32, device_index: i32,
377        result: *mut FlodlTensor,
378    ) -> *mut i8;
379
380    pub fn flodl_one_hot(
381        t: FlodlTensor, num_classes: i64,
382        result: *mut FlodlTensor,
383    ) -> *mut i8;
384
385    pub fn flodl_bernoulli(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
386
387    // --- Convolution ---
388
389    pub fn flodl_conv2d(
390        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
391        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
392        groups: i64, result: *mut FlodlTensor,
393    ) -> *mut i8;
394
395    // --- 1D convolution ---
396
397    pub fn flodl_conv1d(
398        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
399        stride: i64, padding: i64, dilation: i64,
400        groups: i64, result: *mut FlodlTensor,
401    ) -> *mut i8;
402
403    // --- Transposed convolution ---
404
405    pub fn flodl_conv_transpose2d(
406        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
407        stride: *mut i64, padding: *mut i64,
408        output_padding: *mut i64, dilation: *mut i64,
409        groups: i64, result: *mut FlodlTensor,
410    ) -> *mut i8;
411
412    // --- Transposed 1D convolution ---
413
414    pub fn flodl_conv_transpose1d(
415        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
416        stride: i64, padding: i64,
417        output_padding: i64, dilation: i64,
418        groups: i64, result: *mut FlodlTensor,
419    ) -> *mut i8;
420
421    // --- Pooling ---
422
423    pub fn flodl_max_pool2d(
424        input: FlodlTensor, kernel_size: *mut i64,
425        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
426        ceil_mode: i32, result: *mut FlodlTensor,
427    ) -> *mut i8;
428
429    pub fn flodl_avg_pool2d(
430        input: FlodlTensor, kernel_size: *mut i64,
431        stride: *mut i64, padding: *mut i64,
432        ceil_mode: i32, count_include_pad: i32,
433        result: *mut FlodlTensor,
434    ) -> *mut i8;
435
436    pub fn flodl_adaptive_avg_pool2d(
437        input: FlodlTensor, output_size: *mut i64,
438        result: *mut FlodlTensor,
439    ) -> *mut i8;
440
441    pub fn flodl_adaptive_max_pool2d(
442        input: FlodlTensor, output_size: *mut i64,
443        result: *mut FlodlTensor,
444    ) -> *mut i8;
445
446    // --- Unfold / Fold (im2col / col2im) ---
447
448    pub fn flodl_im2col(
449        input: FlodlTensor, kernel_size: *mut i64, dilation: *mut i64,
450        padding: *mut i64, stride: *mut i64, result: *mut FlodlTensor,
451    ) -> *mut i8;
452
453    pub fn flodl_col2im(
454        input: FlodlTensor, output_size: *mut i64,
455        kernel_size: *mut i64, dilation: *mut i64,
456        padding: *mut i64, stride: *mut i64, result: *mut FlodlTensor,
457    ) -> *mut i8;
458
459    // --- 3D convolution ---
460
461    pub fn flodl_conv3d(
462        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
463        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
464        groups: i64, result: *mut FlodlTensor,
465    ) -> *mut i8;
466
467    pub fn flodl_conv_transpose3d(
468        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
469        stride: *mut i64, padding: *mut i64, output_padding: *mut i64,
470        dilation: *mut i64, groups: i64, result: *mut FlodlTensor,
471    ) -> *mut i8;
472
473    // --- 1D pooling ---
474
475    pub fn flodl_max_pool1d(
476        input: FlodlTensor, kernel_size: i64,
477        stride: i64, padding: i64, dilation: i64,
478        ceil_mode: i32, result: *mut FlodlTensor,
479    ) -> *mut i8;
480
481    pub fn flodl_avg_pool1d(
482        input: FlodlTensor, kernel_size: i64,
483        stride: i64, padding: i64,
484        ceil_mode: i32, count_include_pad: i32,
485        result: *mut FlodlTensor,
486    ) -> *mut i8;
487
488    // --- Instance normalization ---
489
490    pub fn flodl_instance_norm(
491        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
492        running_mean: FlodlTensor, running_var: FlodlTensor,
493        use_input_stats: i32, momentum: f64, eps: f64,
494        result: *mut FlodlTensor,
495    ) -> *mut i8;
496
497    // --- PixelShuffle ---
498
499    pub fn flodl_pixel_shuffle(
500        input: FlodlTensor, upscale_factor: i64, result: *mut FlodlTensor,
501    ) -> *mut i8;
502
503    pub fn flodl_pixel_unshuffle(
504        input: FlodlTensor, downscale_factor: i64, result: *mut FlodlTensor,
505    ) -> *mut i8;
506
507    // --- Bilinear ---
508
509    pub fn flodl_bilinear(
510        input1: FlodlTensor, input2: FlodlTensor,
511        weight: FlodlTensor, bias: FlodlTensor,
512        result: *mut FlodlTensor,
513    ) -> *mut i8;
514
515    // --- Grid sampling ---
516
517    pub fn flodl_grid_sample(
518        input: FlodlTensor, grid: FlodlTensor,
519        mode: i32, padding_mode: i32, align_corners: i32,
520        result: *mut FlodlTensor,
521    ) -> *mut i8;
522
523    // --- Device ---
524
525    pub fn flodl_to_device(
526        t: FlodlTensor, device_type: i32, device_index: i32,
527        result: *mut FlodlTensor,
528    ) -> *mut i8;
529
530    pub fn flodl_to_device_async(
531        t: FlodlTensor, device_type: i32, device_index: i32,
532        result: *mut FlodlTensor,
533    ) -> *mut i8;
534
535    pub fn flodl_cuda_is_available() -> i32;
536    pub fn flodl_cuda_device_count() -> i32;
537    pub fn flodl_force_cuda_link() -> i32;
538    pub fn flodl_set_current_device(device_index: i32);
539    pub fn flodl_get_current_device() -> i32;
540    pub fn flodl_cuda_synchronize(device_index: i32);
541
542    // --- CUDA memory/utilization (monitor support) ---
543
544    pub fn flodl_cuda_mem_info(
545        device_index: i32, used_bytes: *mut u64, total_bytes: *mut u64,
546    ) -> *mut i8;
547
548    pub fn flodl_cuda_alloc_bytes(
549        device_index: i32, allocated_bytes: *mut u64,
550    ) -> *mut i8;
551
552    pub fn flodl_cuda_active_bytes(
553        device_index: i32, active_bytes: *mut u64,
554    ) -> *mut i8;
555
556    pub fn flodl_cuda_peak_active_bytes(
557        device_index: i32, peak_bytes: *mut u64,
558    ) -> *mut i8;
559
560    pub fn flodl_cuda_peak_reserved_bytes(
561        device_index: i32, peak_bytes: *mut u64,
562    ) -> *mut i8;
563
564    pub fn flodl_cuda_reset_peak_stats(device_index: i32);
565
566    pub fn flodl_cuda_empty_cache();
567
568    pub fn flodl_cuda_utilization(device_index: i32) -> i32;
569
570    pub fn flodl_cuda_device_name(
571        device_index: i32, buf: *mut i8, buf_len: i32,
572    ) -> *mut i8;
573
574    pub fn flodl_cuda_compute_capability(
575        device_index: i32, major: *mut i32, minor: *mut i32,
576    ) -> *mut i8;
577
578    // --- Dtype casting ---
579
580    pub fn flodl_to_dtype(
581        t: FlodlTensor, dtype: i32, result: *mut FlodlTensor,
582    ) -> *mut i8;
583
584    pub fn flodl_all_finite(t: FlodlTensor, result: *mut i32) -> *mut i8;
585
586    // --- Comparison (tensor-tensor, return float masks: 0.0 or 1.0) ---
587
588    pub fn flodl_gt_tensor(
589        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
590    ) -> *mut i8;
591
592    pub fn flodl_lt_tensor(
593        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
594    ) -> *mut i8;
595
596    pub fn flodl_ge_tensor(
597        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
598    ) -> *mut i8;
599
600    pub fn flodl_le_tensor(
601        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
602    ) -> *mut i8;
603
604    pub fn flodl_eq_tensor(
605        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
606    ) -> *mut i8;
607
608    pub fn flodl_ne_tensor(
609        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
610    ) -> *mut i8;
611
612    // --- Element-wise binary (differentiable) ---
613
614    pub fn flodl_atan2(
615        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
616    ) -> *mut i8;
617
618    pub fn flodl_maximum(
619        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
620    ) -> *mut i8;
621
622    pub fn flodl_minimum(
623        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
624    ) -> *mut i8;
625
626    // --- Additional reductions ---
627
628    pub fn flodl_argmin(
629        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
630    ) -> *mut i8;
631
632    pub fn flodl_var(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
633    pub fn flodl_std_op(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
634
635    pub fn flodl_var_dim(
636        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
637    ) -> *mut i8;
638
639    pub fn flodl_std_dim(
640        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
641    ) -> *mut i8;
642
643    pub fn flodl_cumprod(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
644    pub fn flodl_norm_p_dim(
645        t: FlodlTensor, p: f64, dim: i32, keepdim: i32, result: *mut FlodlTensor,
646    ) -> *mut i8;
647    pub fn flodl_sum_dims(
648        t: FlodlTensor, dims: *mut i64, ndims: i32, keepdim: i32,
649        result: *mut FlodlTensor,
650    ) -> *mut i8;
651    pub fn flodl_median(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
652    pub fn flodl_median_dim(
653        t: FlodlTensor, dim: i32, keepdim: i32,
654        values: *mut FlodlTensor, indices: *mut FlodlTensor,
655    ) -> *mut i8;
656    pub fn flodl_count_nonzero(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
657    pub fn flodl_count_nonzero_dim(
658        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
659    ) -> *mut i8;
660
661    // --- Query ops ---
662
663    pub fn flodl_nonzero(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
664    pub fn flodl_unique(
665        t: FlodlTensor, sorted: i32, return_inverse: i32,
666        output: *mut FlodlTensor, inverse_indices: *mut FlodlTensor,
667    ) -> *mut i8;
668    pub fn flodl_searchsorted(
669        sorted_seq: FlodlTensor, values: FlodlTensor,
670        result: *mut FlodlTensor,
671    ) -> *mut i8;
672
673    // --- Shape ops (advanced) ---
674
675    pub fn flodl_diagonal(
676        t: FlodlTensor, offset: i64, dim1: i32, dim2: i32,
677        result: *mut FlodlTensor,
678    ) -> *mut i8;
679    pub fn flodl_movedim(
680        t: FlodlTensor, src: i64, dst: i64, result: *mut FlodlTensor,
681    ) -> *mut i8;
682    pub fn flodl_tile(
683        t: FlodlTensor, reps: *mut i64, ndim: i32, result: *mut FlodlTensor,
684    ) -> *mut i8;
685
686    // --- Element-wise math (trig, rounding, sign) ---
687
688    pub fn flodl_sin(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
689    pub fn flodl_cos(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
690    pub fn flodl_tan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
691    pub fn flodl_asin(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
692    pub fn flodl_acos(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
693    pub fn flodl_atan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
694    pub fn flodl_sign(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
695    pub fn flodl_floor(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
696    pub fn flodl_ceil(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
697    pub fn flodl_round(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
698    pub fn flodl_reciprocal(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
699    pub fn flodl_erf(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
700    pub fn flodl_erfc(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
701    pub fn flodl_trunc(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
702    pub fn flodl_frac(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
703    pub fn flodl_fmod_scalar(t: FlodlTensor, scalar: f64, result: *mut FlodlTensor) -> *mut i8;
704    pub fn flodl_fmod_tensor(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
705    pub fn flodl_remainder_scalar(t: FlodlTensor, scalar: f64, result: *mut FlodlTensor) -> *mut i8;
706    pub fn flodl_remainder_tensor(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
707    pub fn flodl_lerp(a: FlodlTensor, b: FlodlTensor, weight: f64, result: *mut FlodlTensor) -> *mut i8;
708    pub fn flodl_lerp_tensor(a: FlodlTensor, b: FlodlTensor, weight: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
709    pub fn flodl_isclose(a: FlodlTensor, b: FlodlTensor, rtol: f64, atol: f64, result: *mut FlodlTensor) -> *mut i8;
710
711    // --- Fused mul-add ---
712
713    pub fn flodl_addmm(
714        bias: FlodlTensor, mat1: FlodlTensor, mat2: FlodlTensor,
715        beta: f64, alpha: f64, result: *mut FlodlTensor,
716    ) -> *mut i8;
717    pub fn flodl_addcmul(
718        self_: FlodlTensor, t1: FlodlTensor, t2: FlodlTensor,
719        value: f64, result: *mut FlodlTensor,
720    ) -> *mut i8;
721    pub fn flodl_addcdiv(
722        self_: FlodlTensor, t1: FlodlTensor, t2: FlodlTensor,
723        value: f64, result: *mut FlodlTensor,
724    ) -> *mut i8;
725
726    // --- Advanced indexing ---
727
728    pub fn flodl_gather(
729        t: FlodlTensor, dim: i32, index: FlodlTensor,
730        result: *mut FlodlTensor,
731    ) -> *mut i8;
732
733    pub fn flodl_scatter_add(
734        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
735        result: *mut FlodlTensor,
736    ) -> *mut i8;
737
738    // --- Sorting ---
739
740    pub fn flodl_topk(
741        t: FlodlTensor, k: i64, dim: i32, largest: i32, sorted: i32,
742        values: *mut FlodlTensor, indices: *mut FlodlTensor,
743    ) -> *mut i8;
744
745    pub fn flodl_sort(
746        t: FlodlTensor, dim: i32, descending: i32,
747        values: *mut FlodlTensor, indices: *mut FlodlTensor,
748    ) -> *mut i8;
749
750    // --- Tensor creation (additional) ---
751
752    pub fn flodl_eye(
753        n: i64, dtype: i32, device_type: i32, device_index: i32,
754        result: *mut FlodlTensor,
755    ) -> *mut i8;
756
757    pub fn flodl_full(
758        shape: *mut i64, ndim: i32, value: f64, dtype: i32,
759        device_type: i32, device_index: i32,
760        result: *mut FlodlTensor,
761    ) -> *mut i8;
762
763    pub fn flodl_randperm(
764        n: i64, dtype: i32, device_type: i32, device_index: i32,
765        result: *mut FlodlTensor,
766    ) -> *mut i8;
767
768    pub fn flodl_multinomial(
769        probs: FlodlTensor, num_samples: i64, replacement: i32,
770        result: *mut FlodlTensor,
771    ) -> *mut i8;
772
773    // --- Normalization ---
774
775    pub fn flodl_normalize(
776        t: FlodlTensor, p: f64, dim: i32, result: *mut FlodlTensor,
777    ) -> *mut i8;
778
779    // --- Shape operations (additional) ---
780
781    pub fn flodl_chunk(
782        t: FlodlTensor, chunks: i32, dim: i32,
783        results: *mut *mut FlodlTensor, count: *mut i32,
784    ) -> *mut i8;
785
786    pub fn flodl_repeat(
787        t: FlodlTensor, repeats: *mut i64, ndim: i32,
788        result: *mut FlodlTensor,
789    ) -> *mut i8;
790
791    pub fn flodl_pad(
792        t: FlodlTensor, padding: *mut i64, pad_len: i32, value: f64,
793        result: *mut FlodlTensor,
794    ) -> *mut i8;
795
796    // mode: 0=constant, 1=reflect, 2=replicate, 3=circular
797    pub fn flodl_pad_mode(
798        t: FlodlTensor, padding: *mut i64, pad_len: i32,
799        mode: i32, value: f64,
800        result: *mut FlodlTensor,
801    ) -> *mut i8;
802
803    // mode: 0=nearest, 1=bilinear, 2=bicubic, 3=trilinear
804    pub fn flodl_interpolate(
805        input: FlodlTensor, output_size: *mut i64, ndim: i32,
806        mode: i32, align_corners: i32,
807        result: *mut FlodlTensor,
808    ) -> *mut i8;
809
810    pub fn flodl_flip(
811        t: FlodlTensor, dims: *mut i64, ndim: i32,
812        result: *mut FlodlTensor,
813    ) -> *mut i8;
814
815    pub fn flodl_roll(
816        t: FlodlTensor, shift: i64, dim: i32,
817        result: *mut FlodlTensor,
818    ) -> *mut i8;
819
820    pub fn flodl_split(
821        t: FlodlTensor, split_size: i64, dim: i32,
822        results: *mut *mut FlodlTensor, count: *mut i32,
823    ) -> *mut i8;
824
825    pub fn flodl_unbind(
826        t: FlodlTensor, dim: i32,
827        results: *mut *mut FlodlTensor, count: *mut i32,
828    ) -> *mut i8;
829
830    pub fn flodl_contiguous(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
831    pub fn flodl_is_contiguous(t: FlodlTensor) -> i32;
832
833    pub fn flodl_argsort(
834        t: FlodlTensor, dim: i32, descending: i32,
835        result: *mut FlodlTensor,
836    ) -> *mut i8;
837
838    pub fn flodl_scatter(
839        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
840        result: *mut FlodlTensor,
841    ) -> *mut i8;
842
    // --- Autograd ---

    /// Write to `*result` a handle to `t` with `requires_grad` set to the
    /// given boolean flag (0/1).
    pub fn flodl_set_requires_grad(
        t: FlodlTensor, requires_grad: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Nonzero if `t` requires gradient.
    pub fn flodl_requires_grad(t: FlodlTensor) -> i32;

    /// Force creation of the AccumulateGrad node for a leaf tensor with
    /// `requires_grad=true`. The node's stream is pinned to the current
    /// CUDA stream at the moment of this call. Use under `StreamGuard`
    /// to ensure DDP workers' parameters accumulate on the training
    /// stream, not the autograd engine's default stream.
    ///
    /// Writes an opaque handle to `*handle_out` that keeps the node
    /// alive. The caller must later pass it to
    /// [`flodl_grad_accumulator_delete`] to free it. For non-leaf or
    /// non-requires-grad tensors `*handle_out` is set to null.
    pub fn flodl_ensure_grad_accumulator(
        t: FlodlTensor, handle_out: *mut *mut c_void,
    ) -> *mut i8;

    /// Free a handle returned by [`flodl_ensure_grad_accumulator`].
    /// Safe to call with a null pointer.
    pub fn flodl_grad_accumulator_delete(handle: *mut c_void);

    /// Run the backward pass starting from `t` (mirrors
    /// `torch::Tensor::backward`).
    pub fn flodl_backward(t: FlodlTensor) -> *mut i8;

    /// Write `t`'s accumulated gradient tensor to `*result`.
    pub fn flodl_grad(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    /// Replace `t`'s gradient with `grad`.
    pub fn flodl_set_grad(t: FlodlTensor, grad: FlodlTensor) -> *mut i8;

    /// Zero `t`'s gradient in place.
    pub fn flodl_zero_grad(t: FlodlTensor) -> *mut i8;

    /// Write to `*result` a version of `t` detached from the autograd
    /// graph.
    pub fn flodl_detach(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    /// Detach `t` from the autograd graph in place.
    pub fn flodl_detach_(t: FlodlTensor) -> *mut i8;

    /// Nonzero if `t` is a leaf in the autograd graph.
    pub fn flodl_is_leaf(t: FlodlTensor) -> i32;
882
    // --- Autograd context ---

    /// Create a guard that disables gradient recording; returns an opaque
    /// guard pointer to be released with [`flodl_no_grad_guard_delete`].
    pub fn flodl_no_grad_guard_new() -> *mut c_void;
    /// Delete a guard obtained from [`flodl_no_grad_guard_new`].
    pub fn flodl_no_grad_guard_delete(guard: *mut c_void);
    /// Nonzero if gradient recording is currently enabled.
    pub fn flodl_is_grad_enabled() -> i32;

    // --- Autocast (automatic mixed precision) ---

    /// Create a guard enabling autocast for `device_type`
    /// (`FLODL_CPU`/`FLODL_CUDA`) with the given `FLODL_*` dtype constant;
    /// release with [`flodl_autocast_guard_delete`].
    pub fn flodl_autocast_guard_new(device_type: i32, dtype: i32) -> *mut c_void;
    /// Delete a guard obtained from [`flodl_autocast_guard_new`].
    pub fn flodl_autocast_guard_delete(guard: *mut c_void);
    /// Nonzero if autocast is enabled for `device_type`.
    pub fn flodl_is_autocast_enabled(device_type: i32) -> i32;
894
    // --- Meshgrid ---

    /// Coordinate grids from `count` 1-D inputs (mirrors
    /// `torch::meshgrid`). Writes an array of result handles to `*results`
    /// and its length to `*result_count`, same out-parameter convention as
    /// `flodl_split`.
    pub fn flodl_meshgrid(
        tensors: *mut FlodlTensor, count: i32,
        results: *mut *mut FlodlTensor, result_count: *mut i32,
    ) -> *mut i8;

    // --- Pairwise distance ---

    /// Batched pairwise p-norm distance between the rows of `x` and `y`
    /// (mirrors `torch::cdist`).
    pub fn flodl_cdist(
        x: FlodlTensor, y: FlodlTensor, p: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Cosine similarity ---

    /// Cosine similarity of `a` and `b` along `dim`, with `eps` as the
    /// small-denominator guard (mirrors `torch::cosine_similarity`).
    pub fn flodl_cosine_similarity(
        a: FlodlTensor, b: FlodlTensor,
        dim: i64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
916
    // --- Fused ops ---

    /// Affine transform `input * weight^T + bias` (mirrors
    /// `torch::linear`).
    pub fn flodl_linear(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// One GRU cell step. `hx` is the previous hidden state; `w_ih`/`w_hh`
    /// are the input-to-hidden / hidden-to-hidden weights and
    /// `b_ih`/`b_hh` their biases. The new hidden state goes to `*result`.
    pub fn flodl_gru_cell(
        input: FlodlTensor, hx: FlodlTensor,
        w_ih: FlodlTensor, w_hh: FlodlTensor,
        b_ih: FlodlTensor, b_hh: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// One LSTM cell step. `hx`/`cx` are the previous hidden and cell
    /// states; the new hidden state goes to `*h_out` and the new cell
    /// state to `*c_out`.
    pub fn flodl_lstm_cell(
        input: FlodlTensor, hx: FlodlTensor, cx: FlodlTensor,
        w_ih: FlodlTensor, w_hh: FlodlTensor,
        b_ih: FlodlTensor, b_hh: FlodlTensor,
        h_out: *mut FlodlTensor, c_out: *mut FlodlTensor,
    ) -> *mut i8;
937
    // Fused sequence ops (cuDNN-accelerated)

    /// Multi-layer LSTM over a full sequence. `params` points at a flat
    /// array of `num_params` weight/bias tensors; `batch_first` selects
    /// the (batch, seq, feature) input layout; `flatten` presumably
    /// requests cuDNN flattened weights — confirm against shim.h. Writes
    /// the output sequence plus final hidden/cell states.
    /// NOTE(review): these two declarations use Rust `bool` (ABI-compatible
    /// with C `_Bool`) where the rest of the API uses `i32` flags — must
    /// stay in sync with shim.h.
    pub fn flodl_lstm(
        input: FlodlTensor, h_0: FlodlTensor, c_0: FlodlTensor,
        params: *const FlodlTensor, num_params: i64,
        num_layers: i64, batch_first: bool, flatten: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor, c_n: *mut FlodlTensor,
    ) -> *mut i8;

    /// Multi-layer GRU over a full sequence; same conventions as
    /// [`flodl_lstm`] without the cell state.
    pub fn flodl_gru(
        input: FlodlTensor, h_0: FlodlTensor,
        params: *const FlodlTensor, num_params: i64,
        num_layers: i64, batch_first: bool, flatten: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor,
    ) -> *mut i8;
952
953    // Cached RNN params (zero per-forward overhead)
954    pub fn flodl_rnn_params_create(
955        params: *const FlodlTensor, num_params: i64,
956        mode: i64, num_layers: i64, batch_first: bool, flatten: bool,
957        out: *mut *mut std::os::raw::c_void,
958    ) -> *mut i8;
959    pub fn flodl_rnn_params_free(rp: *mut std::os::raw::c_void);
960    pub fn flodl_lstm_cached(
961        input: FlodlTensor, h_0: FlodlTensor, c_0: FlodlTensor,
962        rp: *mut std::os::raw::c_void, num_layers: i64, batch_first: bool,
963        output: *mut FlodlTensor, h_n: *mut FlodlTensor, c_n: *mut FlodlTensor,
964    ) -> *mut i8;
965    pub fn flodl_gru_cached(
966        input: FlodlTensor, h_0: FlodlTensor,
967        rp: *mut std::os::raw::c_void, num_layers: i64, batch_first: bool,
968        output: *mut FlodlTensor, h_n: *mut FlodlTensor,
969    ) -> *mut i8;
970
    // --- cuDNN benchmark ---

    /// Toggle the cuDNN autotuner benchmark mode (boolean flag, 0/1).
    pub fn flodl_set_cudnn_benchmark(enable: i32);

    // --- RNG seed ---

    /// Seed the default random number generator.
    pub fn flodl_manual_seed(seed: u64);
    /// Seed the RNGs of all CUDA devices.
    pub fn flodl_cuda_manual_seed_all(seed: u64);

    // --- In-place operations ---
    // All of these mutate `t` directly; as everywhere in this API they
    // return null on success or an error string.

    /// `t += other`, element-wise.
    pub fn flodl_add_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// `t -= other`, element-wise.
    pub fn flodl_sub_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// `t *= scalar`.
    pub fn flodl_mul_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    /// `t += scalar`.
    pub fn flodl_add_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    /// Set every element of `t` to zero.
    pub fn flodl_zero_(t: FlodlTensor) -> *mut i8;
    /// `t *= other`, element-wise.
    pub fn flodl_mul_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// `t /= scalar`.
    pub fn flodl_div_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    /// `t /= other`, element-wise.
    pub fn flodl_div_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// Set every element of `t` to `value`.
    pub fn flodl_fill_(t: FlodlTensor, value: f64) -> *mut i8;
991
    // --- Fused Adam step ---

    /// One Adam optimizer update, in place on `param`, using gradient
    /// `grad` and first/second-moment buffers `m`/`v`. `step` is the step
    /// counter (assumed 1-based for bias correction — confirm in shim.h).
    pub fn flodl_adam_step(
        param: FlodlTensor, grad: FlodlTensor,
        m: FlodlTensor, v: FlodlTensor,
        lr: f64, beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
    ) -> *mut i8;

    // --- Batched Adam step ---

    /// Adam update over `count` (param, grad, m, v) tuples in one call,
    /// with a per-parameter learning-rate array `lrs` of the same length.
    pub fn flodl_adam_step_batched(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        ms: *mut FlodlTensor, vs: *mut FlodlTensor,
        lrs: *mut f64, count: i32,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
    ) -> *mut i8;

    // --- Fused Adam/AdamW (multi-tensor kernel) ---

    /// Fused multi-tensor Adam over `count` tensors. `grad_scale` and
    /// `found_inf` are the AMP gradient-scaler tensors.
    /// NOTE(review): presumably nullable when AMP is not in use — confirm
    /// against shim.h.
    pub fn flodl_fused_adam_(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        exp_avgs: *mut FlodlTensor, exp_avg_sqs: *mut FlodlTensor,
        count: i32, lr: f64,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
        grad_scale: FlodlTensor, found_inf: FlodlTensor,
    ) -> *mut i8;

    /// AdamW (decoupled weight decay) variant of [`flodl_fused_adam_`];
    /// identical calling convention.
    pub fn flodl_fused_adamw_(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        exp_avgs: *mut FlodlTensor, exp_avg_sqs: *mut FlodlTensor,
        count: i32, lr: f64,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
        grad_scale: FlodlTensor, found_inf: FlodlTensor,
    ) -> *mut i8;
1030
    // --- Pinned memory ---

    /// Write to `*result` a copy of `t` in page-locked (pinned) host
    /// memory.
    pub fn flodl_pin_memory(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Nonzero if `t` resides in pinned host memory.
    pub fn flodl_is_pinned(t: FlodlTensor) -> i32;

    // --- Memory diagnostics ---

    /// Ask the C allocator to release free heap memory back to the OS;
    /// returns the shim's status value (presumably `malloc_trim`'s return
    /// code — confirm in shim.h).
    pub fn flodl_malloc_trim() -> i32;

    // --- Zero grad (set_to_none) ---

    /// Drop `t`'s gradient entirely rather than zeroing it (the
    /// `set_to_none` flavor of zero-grad). Infallible, hence no error
    /// string.
    pub fn flodl_zero_grad_set_to_none(t: FlodlTensor);

    // --- Fused clip_grad_norm ---

    /// Clip the combined gradient norm of `count` parameters to `max_norm`
    /// in place, writing the pre-clip total norm to `*total_norm_out`.
    pub fn flodl_clip_grad_norm(
        params: *mut FlodlTensor, count: i32,
        max_norm: f64, total_norm_out: *mut f64,
    ) -> *mut i8;
1050
    // --- Multi-tensor foreach operations ---
    // Each call applies one operation across `count` tensors in a single
    // fused kernel launch; `_` suffix means in place.

    /// `t += scalar` for each of the `count` tensors.
    pub fn flodl_foreach_add_scalar_(
        tensors: *mut FlodlTensor, count: i32, scalar: f64,
    ) -> *mut i8;

    /// `t *= scalar` for each of the `count` tensors.
    pub fn flodl_foreach_mul_scalar_(
        tensors: *mut FlodlTensor, count: i32, scalar: f64,
    ) -> *mut i8;

    /// Zero each of the `count` tensors in place.
    pub fn flodl_foreach_zero_(
        tensors: *mut FlodlTensor, count: i32,
    ) -> *mut i8;

    /// `tensors1[i] += alpha * tensors2[i]` for each pair.
    pub fn flodl_foreach_add_list_(
        tensors1: *mut FlodlTensor, tensors2: *mut FlodlTensor,
        count: i32, alpha: f64,
    ) -> *mut i8;

    /// `ord`-norm of each tensor; writes `count` result handles into the
    /// caller-provided `results` array.
    pub fn flodl_foreach_norm(
        tensors: *mut FlodlTensor, count: i32, ord: f64,
        results: *mut FlodlTensor,
    ) -> *mut i8;

    /// In-place linear interpolation of each pair with scalar `weight`
    /// (mirrors `torch::lerp`).
    pub fn flodl_foreach_lerp_scalar_(
        tensors1: *mut FlodlTensor, tensors2: *mut FlodlTensor,
        count: i32, weight: f64,
    ) -> *mut i8;

    /// Element-wise square root of each tensor, in place.
    pub fn flodl_foreach_sqrt_(
        tensors: *mut FlodlTensor, count: i32,
    ) -> *mut i8;

    // --- Autograd diagnostics ---

    /// Number of autograd graph nodes reachable from `t` (diagnostic aid;
    /// exact counting rule defined in shim.h).
    pub fn flodl_autograd_node_count(t: FlodlTensor) -> i64;
1087
    // --- Fused loss functions ---
    // `reduction` selects the reduction mode; its integer values are
    // defined in shim.h (presumably none/mean/sum — confirm there).

    /// Mean-squared-error loss between `pred` and `target`.
    pub fn flodl_mse_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Cross-entropy loss; targets matching `ignore_index` are skipped and
    /// `label_smoothing` is the smoothing factor.
    pub fn flodl_cross_entropy_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, ignore_index: i64, label_smoothing: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Binary cross-entropy on raw logits (fused sigmoid + BCE).
    pub fn flodl_bce_with_logits_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Binary cross-entropy on probabilities.
    pub fn flodl_bce_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Mean-absolute-error (L1) loss.
    pub fn flodl_l1_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Smooth L1 (Huber-style) loss with transition point `beta`.
    pub fn flodl_smooth_l1_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, beta: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Kullback-Leibler divergence; `log_target` (0/1) says whether
    /// `target` is already in log space.
    pub fn flodl_kl_div_loss(
        input: FlodlTensor, target: FlodlTensor,
        reduction: i64, log_target: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Negative log-likelihood loss over log-probabilities; targets equal
    /// to `ignore_index` are skipped.
    pub fn flodl_nll_loss(
        input: FlodlTensor, target: FlodlTensor,
        reduction: i64, ignore_index: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Connectionist temporal classification loss; `blank` is the blank
    /// label index, `input_lengths`/`target_lengths` give per-sample
    /// sequence lengths.
    pub fn flodl_ctc_loss(
        log_probs: FlodlTensor, targets: FlodlTensor,
        input_lengths: FlodlTensor, target_lengths: FlodlTensor,
        blank: i64, reduction: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
1140
    // --- Fused batch normalization ---

    /// Batch normalization (mirrors `torch::batch_norm`). `training` is a
    /// boolean flag (0/1); `running_mean`/`running_var` are the running
    /// statistic buffers and `momentum`/`eps` the usual batch-norm
    /// hyperparameters.
    pub fn flodl_batch_norm(
        input: FlodlTensor, weight: FlodlTensor,
        bias: FlodlTensor, running_mean: FlodlTensor,
        running_var: FlodlTensor, training: i32,
        momentum: f64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Fused dropout ---

    /// Element dropout with drop probability `p`; active only when
    /// `training` (0/1) is nonzero.
    pub fn flodl_dropout(
        input: FlodlTensor, p: f64, training: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Channel-wise (feature) dropout, same flags as [`flodl_dropout`].
    pub fn flodl_feature_dropout(
        input: FlodlTensor, p: f64, training: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- In-place copy ---

    /// Copy `src` into `dst` in place; `non_blocking` (0/1) requests an
    /// asynchronous copy where supported.
    pub fn flodl_copy_(dst: FlodlTensor, src: FlodlTensor, non_blocking: i32) -> *mut i8;

    // --- Memory format ---

    /// Write to `*result` a version of `t` in channels-last memory format.
    pub fn flodl_to_channels_last(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Nonzero if `t` is in channels-last memory format.
    pub fn flodl_is_channels_last(t: FlodlTensor) -> i32;

    // --- Embedding bag ---

    /// Pooled embedding lookup (mirrors `torch::embedding_bag`): gathers
    /// rows of `weight` at `indices`, pooled per bag as delimited by
    /// `offsets`; `mode` selects the pooling op (constants in shim.h).
    pub fn flodl_embedding_bag(
        weight: FlodlTensor, indices: FlodlTensor, offsets: FlodlTensor,
        mode: i64, result: *mut FlodlTensor,
    ) -> *mut i8;
1178
    // --- CUDA Graphs ---
    // Memory-pool ids are 128-bit values passed/returned as two u64
    // halves (`pool_hi`, `pool_lo`).

    /// Create a CUDA graph object; handle written to `*graph_out`.
    /// Free with [`flodl_cuda_graph_delete`].
    pub fn flodl_cuda_graph_new(graph_out: *mut *mut c_void) -> *mut i8;
    /// Begin stream capture into `graph`, optionally sharing the memory
    /// pool identified by `pool_hi`/`pool_lo`; `mode` is the capture mode
    /// (constants defined in shim.h).
    pub fn flodl_cuda_graph_capture_begin(
        graph: *mut c_void, pool_hi: u64, pool_lo: u64, mode: i32,
    ) -> *mut i8;
    /// End capture started by [`flodl_cuda_graph_capture_begin`].
    pub fn flodl_cuda_graph_capture_end(graph: *mut c_void) -> *mut i8;
    /// Replay the captured graph.
    pub fn flodl_cuda_graph_replay(graph: *mut c_void) -> *mut i8;
    /// Discard the captured state of `graph`.
    pub fn flodl_cuda_graph_reset(graph: *mut c_void) -> *mut i8;
    /// Free a graph from [`flodl_cuda_graph_new`].
    pub fn flodl_cuda_graph_delete(graph: *mut c_void);
    /// Read back the memory-pool id associated with `graph`.
    pub fn flodl_cuda_graph_pool(
        graph: *mut c_void, pool_hi: *mut u64, pool_lo: *mut u64,
    );
    /// Obtain a fresh shareable memory-pool id.
    pub fn flodl_cuda_graph_pool_handle(pool_hi: *mut u64, pool_lo: *mut u64);
1193
    // --- CUDA Events ---

    /// Create a CUDA event with the given creation `flags` (values defined
    /// in shim.h); handle written to `*event_out`. Free with
    /// [`flodl_cuda_event_delete`].
    pub fn flodl_cuda_event_new(flags: i32, event_out: *mut *mut c_void) -> *mut i8;
    /// Record `event` on the current stream.
    pub fn flodl_cuda_event_record(event: *mut c_void) -> *mut i8;
    /// Record `event` on a specific `stream`.
    pub fn flodl_cuda_event_record_on_stream(
        event: *mut c_void, stream: *mut c_void,
    ) -> *mut i8;
    /// Block the host until the work captured by `event` completes.
    pub fn flodl_cuda_event_synchronize(event: *mut c_void) -> *mut i8;
    /// Elapsed time between two recorded events, in milliseconds, written
    /// to `*ms_out`.
    pub fn flodl_cuda_event_elapsed_time(
        start: *mut c_void, end: *mut c_void, ms_out: *mut f32,
    ) -> *mut i8;
    /// Nonzero once the work captured by `event` has completed
    /// (non-blocking poll).
    pub fn flodl_cuda_event_query(event: *mut c_void) -> i32;
    /// Free an event from [`flodl_cuda_event_new`].
    pub fn flodl_cuda_event_delete(event: *mut c_void);
1207
    // --- CUDA Streams ---

    /// Create a CUDA stream on `device_index`; `high_priority` is a
    /// boolean flag (0/1). Handle written to `*stream_out`; free with
    /// [`flodl_cuda_stream_delete`].
    pub fn flodl_cuda_stream_new(
        device_index: i32, high_priority: i32, stream_out: *mut *mut c_void,
    ) -> *mut i8;
    /// Block the host until all work queued on `stream` completes.
    pub fn flodl_cuda_stream_synchronize(stream: *mut c_void) -> *mut i8;
    /// Make `stream` wait on `event` (device-side ordering, non-blocking
    /// for the host).
    pub fn flodl_cuda_stream_wait_event(
        stream: *mut c_void, event: *mut c_void,
    ) -> *mut i8;
    /// Nonzero if all work queued on `stream` has completed
    /// (non-blocking poll).
    pub fn flodl_cuda_stream_query(stream: *mut c_void) -> i32;
    /// Make `stream` the current stream for its device.
    pub fn flodl_cuda_stream_set_current(stream: *mut c_void);
    /// Current stream of `device_index` as an opaque pointer.
    /// NOTE(review): ownership of the returned pointer (borrowed vs.
    /// caller-freed) is defined by shim.h — confirm there.
    pub fn flodl_cuda_stream_get_current(device_index: i32) -> *mut c_void;
    /// Restore the default stream as current on `device_index`.
    pub fn flodl_cuda_stream_restore_default(device_index: i32);
    /// Free a stream from [`flodl_cuda_stream_new`].
    pub fn flodl_cuda_stream_delete(stream: *mut c_void);
1222
    // --- NCCL Collective Operations ---
    // Single-process, multi-GPU flavor: one handle drives all devices.

    /// Create an NCCL communicator over the `ndev` CUDA devices listed in
    /// `devlist`; handle written to `*handle_out`. Free with
    /// [`flodl_nccl_destroy`].
    pub fn flodl_nccl_init(
        ndev: i32, devlist: *const i32, handle_out: *mut *mut c_void,
    ) -> *mut i8;
    /// Destroy a communicator from [`flodl_nccl_init`].
    pub fn flodl_nccl_destroy(handle: *mut c_void);
    /// All-reduce across the communicator with reduction `op` (constants
    /// in shim.h). `tensors` and `streams` are presumably one entry per
    /// device of the communicator — confirm against shim.h.
    pub fn flodl_nccl_all_reduce(
        handle: *mut c_void, tensors: *mut FlodlTensor,
        streams: *mut *mut c_void, op: i32,
    ) -> *mut i8;
    /// Broadcast from rank `root` to all devices; same array convention
    /// as [`flodl_nccl_all_reduce`].
    pub fn flodl_nccl_broadcast(
        handle: *mut c_void, tensors: *mut FlodlTensor,
        streams: *mut *mut c_void, root: i32,
    ) -> *mut i8;
    /// Number of ranks in the communicator.
    pub fn flodl_nccl_size(handle: *mut c_void) -> i32;

    // --- NCCL Per-Rank Operations ---
    // Multi-process flavor: each rank holds its own handle, bootstrapped
    // from a shared unique id.

    /// Generate an NCCL unique id into `uid_out`.
    /// NOTE(review): the required buffer size (sizeof(ncclUniqueId)) is
    /// fixed by shim.h — confirm there before allocating.
    pub fn flodl_nccl_get_unique_id(uid_out: *mut u8) -> *mut i8;
    /// Join the communicator identified by `uid` as rank `rank` of
    /// `nranks`; per-rank handle written to `*handle_out`.
    pub fn flodl_nccl_init_rank(
        rank: i32, nranks: i32, uid: *const u8, handle_out: *mut *mut c_void,
    ) -> *mut i8;
    /// Destroy a per-rank handle from [`flodl_nccl_init_rank`].
    pub fn flodl_nccl_destroy_rank(handle: *mut c_void);
    /// Abort outstanding NCCL operations on this rank's communicator
    /// (error-recovery path).
    pub fn flodl_nccl_abort_rank(handle: *mut c_void) -> *mut i8;
    /// All-reduce `ntensors` tensors on this rank's `stream` with
    /// reduction `op` (constants in shim.h).
    pub fn flodl_nccl_all_reduce_rank(
        handle: *mut c_void, tensors: *mut FlodlTensor, ntensors: i32,
        stream: *mut c_void, op: i32,
    ) -> *mut i8;
    /// Derive a per-rank handle for `rank` from a group communicator
    /// handle; written to `*rank_handle_out`.
    pub fn flodl_nccl_split_rank(
        group_handle: *mut c_void, rank: i32,
        rank_handle_out: *mut *mut c_void,
    ) -> *mut i8;
1255
    // --- Utility ---

    /// Free an error string returned by any fallible `flodl_*` function.
    /// NOTE(review): error strings are typed `*mut i8` throughout; on
    /// targets where C `char` is unsigned, `core::ffi::c_char` is `u8` —
    /// consider migrating the API to `c_char` for type-level portability
    /// (the ABI is identical either way).
    pub fn flodl_free_string(s: *mut i8);
}