// flodl_sys/lib.rs
//! Raw FFI bindings to the libtorch C++ shim.
//!
//! Every function that can fail returns a `*mut i8` error string (caller
//! must free it with [`flodl_free_string`]). A null pointer means success.
//!
//! `FlodlTensor` is an opaque `*mut c_void` handle to a heap-allocated
//! `torch::Tensor`. Caller owns it and must free with [`flodl_free_tensor`].
//!
//! NOTE(review): error strings are typed `*mut i8` rather than
//! `*mut std::ffi::c_char`. On targets where C `char` is unsigned
//! (e.g. aarch64-linux, where `c_char = u8`) the two aliases differ —
//! confirm the intended targets before relying on `i8` here.

use std::ffi::c_void;

/// Opaque handle to a `torch::Tensor` on the C++ side.
///
/// Never dereferenced on the Rust side; only passed back into `flodl_*`
/// functions. Ownership rules are described in the module docs above.
pub type FlodlTensor = *mut c_void;
13
// --- DType constants (must match shim.h) ---
//
// The numeric values appear to mirror torch's `c10::ScalarType` enum
// (Half=5, Float=6, Double=7, Int=3, Long=4, BFloat16=15) — verify against
// shim.h before adding new dtype constants.
pub const FLODL_FLOAT16: i32 = 5;
pub const FLODL_BFLOAT16: i32 = 15;
pub const FLODL_FLOAT32: i32 = 6;
pub const FLODL_FLOAT64: i32 = 7;
pub const FLODL_INT32: i32 = 3;
pub const FLODL_INT64: i32 = 4;

// --- Device constants (must match shim.h) ---
//
// Presumably mirror `c10::DeviceType` (CPU=0, CUDA=1) — confirm in shim.h.
pub const FLODL_CPU: i32 = 0;
pub const FLODL_CUDA: i32 = 1;
25
// Foreign interface to the C++ shim. Every declaration below must stay in
// byte-for-byte agreement with shim.h; calling any of these is `unsafe`.
unsafe extern "C" {
    // --- Tensor creation ---
    //
    // Shape-taking constructors read `ndim` i64 extents from `shape` and
    // write a newly allocated tensor handle through `result`. Per the
    // module contract, the return value is null on success or a
    // heap-allocated error string the caller must free.

    /// Zero-filled tensor of the given shape/dtype/device.
    pub fn flodl_zeros(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// One-filled tensor of the given shape/dtype/device.
    pub fn flodl_ones(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Uniform-random tensor of the given shape/dtype/device.
    pub fn flodl_rand(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Normal-random tensor of the given shape/dtype/device.
    pub fn flodl_randn(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Build a tensor from a raw host buffer.
    ///
    /// NOTE(review): presumably wraps `torch::from_blob`, i.e. may borrow
    /// `data` without copying — confirm the shim copies, or document that
    /// the caller must keep `data` alive for the tensor's lifetime.
    pub fn flodl_from_blob(
        data: *mut c_void, shape: *mut i64, ndim: i32,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `steps` evenly spaced values in `[start, end]`.
    pub fn flodl_linspace(
        start: f64, end: f64, steps: i64,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Values from `start` to `end` (exclusive) with stride `step`.
    pub fn flodl_arange(
        start: f64, end: f64, step: f64,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Broadcast `t` to `new_shape` (`ndim` extents).
    pub fn flodl_expand(
        t: FlodlTensor, new_shape: *mut i64, ndim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
75
    // --- Tensor lifecycle ---

    /// Destroy a tensor handle. After this call the handle is dangling.
    pub fn flodl_free_tensor(t: FlodlTensor);
    /// New independent handle to the same underlying tensor storage.
    pub fn flodl_shallow_clone(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    // --- Tensor metadata ---
    //
    // Infallible queries: plain return values, no error-string channel.

    /// Number of dimensions.
    pub fn flodl_ndim(t: FlodlTensor) -> i32;
    /// Extent of dimension `dim`.
    pub fn flodl_shape(t: FlodlTensor, dim: i32) -> i64;
    /// Dtype as one of the `FLODL_*` dtype constants.
    pub fn flodl_dtype(t: FlodlTensor) -> i32;
    /// Device type as `FLODL_CPU` / `FLODL_CUDA`.
    pub fn flodl_device_type(t: FlodlTensor) -> i32;
    pub fn flodl_device_index(t: FlodlTensor) -> i32;
    /// Total element count.
    pub fn flodl_numel(t: FlodlTensor) -> i64;

    // --- Data access ---

    /// Copy the tensor's data into a caller-owned buffer of
    /// `buffer_bytes` bytes.
    pub fn flodl_copy_data(
        t: FlodlTensor, buffer: *mut c_void, buffer_bytes: i64,
    ) -> *mut i8;
95
    // --- Arithmetic ---
    //
    // Binary ops take two tensor handles and write a new result handle;
    // `_scalar` variants combine a tensor with an f64 scalar.

    pub fn flodl_add(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sub(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_mul(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_div(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_matmul(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_add_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_mul_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_div_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Element-wise negation.
    pub fn flodl_neg(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    // --- Activations ---

    pub fn flodl_relu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sigmoid(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// `_op` suffix presumably avoids clashing with libm's C `tanh` symbol
    /// — confirm in shim.h.
    pub fn flodl_tanh_op(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Softmax along dimension `dim`.
    pub fn flodl_softmax(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log_softmax(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_gelu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_silu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_leaky_relu(
        t: FlodlTensor, negative_slope: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_elu(t: FlodlTensor, alpha: f64, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_softplus(
        t: FlodlTensor, beta: f64, threshold: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_mish(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_selu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_hardswish(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_hardsigmoid(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// PReLU with a learnable per-channel `weight` tensor.
    pub fn flodl_prelu(t: FlodlTensor, weight: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
139
    // --- Layer normalization ---

    /// Layer norm returning three handles: the normalized `output` plus
    /// the per-row `mean` and reciprocal std `rstd` (as produced by
    /// torch's `native_layer_norm`).
    pub fn flodl_native_layer_norm(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        normalized_size: i64, eps: f64,
        output: *mut FlodlTensor, mean: *mut FlodlTensor, rstd: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Group normalization ---

    pub fn flodl_group_norm(
        input: FlodlTensor, num_groups: i64,
        weight: FlodlTensor, bias: FlodlTensor,
        eps: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Element-wise math ---

    pub fn flodl_exp(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sqrt(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_abs(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Upper-triangular part on/above the given `diagonal`.
    pub fn flodl_triu(t: FlodlTensor, diagonal: i64, result: *mut FlodlTensor) -> *mut i8;
    /// Lower-triangular part on/below the given `diagonal`.
    pub fn flodl_tril(t: FlodlTensor, diagonal: i64, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_pow_scalar(
        t: FlodlTensor, exponent: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Clamp to `[min_val, max_val]`.
    pub fn flodl_clamp(
        t: FlodlTensor, min_val: f64, max_val: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_clamp_min(
        t: FlodlTensor, min_val: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_clamp_max(
        t: FlodlTensor, max_val: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_log1p(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_expm1(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log2(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log10(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
185
    // --- Reductions ---
    //
    // `_dim` variants reduce along a single dimension; `keepdim` is an
    // i32 flag (nonzero presumably meaning "retain the reduced dim with
    // size 1" — confirm against shim.h).

    /// Sum of all elements (scalar tensor result).
    pub fn flodl_sum(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Mean of all elements (scalar tensor result).
    pub fn flodl_mean(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_sum_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_mean_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_prod(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_prod_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Cumulative sum along `dim`.
    pub fn flodl_cumsum(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Numerically stable log-sum-exp along `dim`.
    pub fn flodl_logsumexp(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_min(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_max(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_norm(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_min_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_max_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Indices of the maxima along `dim`.
    pub fn flodl_argmax(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;
228
    // --- Comparison (return float masks: 0.0 or 1.0) ---
    //
    // Scalar comparisons: element-wise `t <op> scalar`, encoded as a
    // float mask rather than a bool tensor.

    pub fn flodl_gt_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ge_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_le_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_lt_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_eq_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ne_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Boolean / detection (return float masks) ---

    pub fn flodl_isnan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_isinf(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_logical_and(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_logical_or(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_logical_not(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_any(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_all(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
268
    // --- Shape operations ---

    /// Reinterpret `t` with a new shape (`ndim` extents in `shape`).
    pub fn flodl_reshape(
        t: FlodlTensor, shape: *mut i64, ndim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Swap dimensions `dim0` and `dim1`.
    pub fn flodl_transpose(
        t: FlodlTensor, dim0: i32, dim1: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Reorder all dimensions per the `dims` permutation (`ndim` entries).
    pub fn flodl_permute(
        t: FlodlTensor, dims: *mut i64, ndim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Slice out `index` along `dim`, dropping that dimension.
    pub fn flodl_select(
        t: FlodlTensor, dim: i32, index: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `length`-element slice of `dim` starting at `start`.
    pub fn flodl_narrow(
        t: FlodlTensor, dim: i32, start: i64, length: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Drop size-1 dimension `dim`.
    pub fn flodl_squeeze(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Insert a size-1 dimension at `dim`.
    pub fn flodl_unsqueeze(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Collapse dimensions `start_dim..=end_dim` into one.
    pub fn flodl_flatten(
        t: FlodlTensor, start_dim: i32, end_dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;
303
    // --- Scatter ---

    /// Copy of `input` with `src` written at position `index` of `dim`.
    pub fn flodl_select_scatter(
        input: FlodlTensor, src: FlodlTensor, dim: i32, index: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Copy of `input` with `src` written into the slice of `dim`
    /// beginning at `start` (length implied by `src`'s extent).
    pub fn flodl_narrow_scatter(
        input: FlodlTensor, src: FlodlTensor, dim: i32, start: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Indexing ---
    //
    // `index` arguments are tensors of indices (dtype expectations are
    // defined by the shim — presumably int64; confirm in shim.h).

    pub fn flodl_index_select(
        t: FlodlTensor, dim: i32, index: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_index_add(
        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Concatenation ---

    /// Two-tensor convenience form of [`flodl_cat`].
    pub fn flodl_cat2(
        a: FlodlTensor, b: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Concatenate `count` handles read from `tensors` along `dim`.
    pub fn flodl_cat(
        tensors: *mut FlodlTensor, count: i32, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Stack `count` handles along a new dimension `dim`.
    pub fn flodl_stack(
        tensors: *mut FlodlTensor, count: i32, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Masking ---

    /// Where `mask` is set, replace elements of `t` with `value`.
    pub fn flodl_masked_fill(
        t: FlodlTensor, mask: FlodlTensor, value: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Conditional ---

    /// Element-wise select: `condition ? x : y`.
    pub fn flodl_where(
        condition: FlodlTensor, x: FlodlTensor, y: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;
355
    // --- Like constructors ---
    //
    // Each produces a new tensor matching `t`'s shape/dtype/device.

    pub fn flodl_zeros_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_ones_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Like-constructor filled with the scalar `value`.
    pub fn flodl_full_like(
        t: FlodlTensor, value: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_rand_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_randn_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    // --- Tensor creation (tier 2) ---

    /// Random integers in `[low, high)` (torch `randint` convention —
    /// confirm exclusivity in shim.h).
    pub fn flodl_randint(
        low: i64, high: i64, shape: *mut i64, ndim: i32,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Uninitialized tensor; contents are arbitrary until written.
    pub fn flodl_empty(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// One-hot encode integer tensor `t` into `num_classes` columns.
    pub fn flodl_one_hot(
        t: FlodlTensor, num_classes: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Bernoulli samples with per-element probabilities taken from `t`.
    pub fn flodl_bernoulli(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
386
    // --- Convolution ---
    //
    // 2-D variants take per-axis parameter pairs via i64 pointers
    // (presumably two entries each: H then W — confirm in shim.h);
    // 1-D variants pass single scalars instead.

    pub fn flodl_conv2d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- 1D convolution ---

    pub fn flodl_conv1d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: i64, padding: i64, dilation: i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Transposed convolution ---

    pub fn flodl_conv_transpose2d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64,
        output_padding: *mut i64, dilation: *mut i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Transposed 1D convolution ---

    pub fn flodl_conv_transpose1d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: i64, padding: i64,
        output_padding: i64, dilation: i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Pooling ---

    /// 2-D max pool; `ceil_mode` is an i32 flag.
    pub fn flodl_max_pool2d(
        input: FlodlTensor, kernel_size: *mut i64,
        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
        ceil_mode: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// 2-D average pool; `count_include_pad` controls whether padding
    /// contributes to the divisor.
    pub fn flodl_avg_pool2d(
        input: FlodlTensor, kernel_size: *mut i64,
        stride: *mut i64, padding: *mut i64,
        ceil_mode: i32, count_include_pad: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Pool to a fixed `output_size` regardless of input extent.
    pub fn flodl_adaptive_avg_pool2d(
        input: FlodlTensor, output_size: *mut i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_adaptive_max_pool2d(
        input: FlodlTensor, output_size: *mut i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
445
    // --- Unfold / Fold (im2col / col2im) ---

    /// Extract sliding local patches into columns (torch `im2col`).
    pub fn flodl_im2col(
        input: FlodlTensor, kernel_size: *mut i64, dilation: *mut i64,
        padding: *mut i64, stride: *mut i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Inverse of [`flodl_im2col`]: fold columns back into an image of
    /// spatial extent `output_size`.
    pub fn flodl_col2im(
        input: FlodlTensor, output_size: *mut i64,
        kernel_size: *mut i64, dilation: *mut i64,
        padding: *mut i64, stride: *mut i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- 3D convolution ---

    pub fn flodl_conv3d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_conv_transpose3d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64, output_padding: *mut i64,
        dilation: *mut i64, groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- 1D pooling ---

    pub fn flodl_max_pool1d(
        input: FlodlTensor, kernel_size: i64,
        stride: i64, padding: i64, dilation: i64,
        ceil_mode: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_avg_pool1d(
        input: FlodlTensor, kernel_size: i64,
        stride: i64, padding: i64,
        ceil_mode: i32, count_include_pad: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
487
    // --- Instance normalization ---

    /// Instance norm; `use_input_stats` (i32 flag) chooses between batch
    /// statistics and the supplied `running_mean`/`running_var`.
    pub fn flodl_instance_norm(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        running_mean: FlodlTensor, running_var: FlodlTensor,
        use_input_stats: i32, momentum: f64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- PixelShuffle ---

    /// Rearrange channels into spatial blocks of side `upscale_factor`.
    pub fn flodl_pixel_shuffle(
        input: FlodlTensor, upscale_factor: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Inverse of [`flodl_pixel_shuffle`].
    pub fn flodl_pixel_unshuffle(
        input: FlodlTensor, downscale_factor: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Bilinear ---

    /// Bilinear map of two inputs through `weight` (plus `bias`).
    pub fn flodl_bilinear(
        input1: FlodlTensor, input2: FlodlTensor,
        weight: FlodlTensor, bias: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Grid sampling ---

    /// Sample `input` at the coordinates in `grid`; `mode` and
    /// `padding_mode` are integer codes defined by the shim (confirm the
    /// encodings in shim.h), `align_corners` is an i32 flag.
    pub fn flodl_grid_sample(
        input: FlodlTensor, grid: FlodlTensor,
        mode: i32, padding_mode: i32, align_corners: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
522
    // --- Device ---

    /// Move/copy `t` to the given device.
    pub fn flodl_to_device(
        t: FlodlTensor, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Async variant of [`flodl_to_device`]; presumably a non-blocking
    /// copy — pair with [`flodl_cuda_synchronize`] before reading the
    /// result (confirm against shim.h).
    pub fn flodl_to_device_async(
        t: FlodlTensor, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // Infallible CUDA runtime queries / controls.
    pub fn flodl_cuda_is_available() -> i32;
    pub fn flodl_cuda_device_count() -> i32;
    /// NOTE(review): presumably exists only to force the linker to keep
    /// the CUDA objects — confirm intent in the shim.
    pub fn flodl_force_cuda_link() -> i32;
    pub fn flodl_set_current_device(device_index: i32);
    pub fn flodl_get_current_device() -> i32;
    /// Block until all queued work on `device_index` has finished.
    pub fn flodl_cuda_synchronize(device_index: i32);

    // --- CUDA memory/utilization (monitor support) ---
    //
    // Byte counters are written through `*mut u64` out-params; the error
    // string channel reports failures as usual.

    pub fn flodl_cuda_mem_info(
        device_index: i32, used_bytes: *mut u64, total_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_alloc_bytes(
        device_index: i32, allocated_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_active_bytes(
        device_index: i32, active_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_peak_active_bytes(
        device_index: i32, peak_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_peak_reserved_bytes(
        device_index: i32, peak_bytes: *mut u64,
    ) -> *mut i8;

    /// Reset the peak counters reported by the two functions above.
    pub fn flodl_cuda_reset_peak_stats(device_index: i32);

    /// Release cached allocator blocks back to the driver.
    pub fn flodl_cuda_empty_cache();

    pub fn flodl_cuda_utilization(device_index: i32) -> i32;

    /// Write the device name into a caller-supplied buffer of `buf_len`
    /// bytes.
    pub fn flodl_cuda_device_name(
        device_index: i32, buf: *mut i8, buf_len: i32,
    ) -> *mut i8;

    /// Compute capability written through `major`/`minor` out-params.
    pub fn flodl_cuda_compute_capability(
        device_index: i32, major: *mut i32, minor: *mut i32,
    ) -> *mut i8;
577
    // --- Dtype casting ---

    /// Cast `t` to the dtype given by a `FLODL_*` constant.
    pub fn flodl_to_dtype(
        t: FlodlTensor, dtype: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Writes an i32 flag through `result` (finiteness check — exact
    /// 0/1 encoding defined by the shim).
    pub fn flodl_all_finite(t: FlodlTensor, result: *mut i32) -> *mut i8;

    // --- Comparison (tensor-tensor, return float masks: 0.0 or 1.0) ---

    pub fn flodl_gt_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_lt_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ge_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_le_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_eq_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ne_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Element-wise binary (differentiable) ---

    pub fn flodl_atan2(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Element-wise maximum of two tensors.
    pub fn flodl_maximum(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Element-wise minimum of two tensors.
    pub fn flodl_minimum(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;
625
    // --- Additional reductions ---

    pub fn flodl_argmin(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_var(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// `_op` suffix presumably avoids a name clash (cf. `flodl_tanh_op`)
    /// — confirm in shim.h.
    pub fn flodl_std_op(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_var_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_std_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Cumulative product along `dim`.
    pub fn flodl_cumprod(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
    /// p-norm along a single dimension.
    pub fn flodl_norm_p_dim(
        t: FlodlTensor, p: f64, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// Sum over the `ndims` dimensions listed in `dims`.
    pub fn flodl_sum_dims(
        t: FlodlTensor, dims: *mut i64, ndims: i32, keepdim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_median(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Per-dimension median: writes both `values` and their `indices`.
    pub fn flodl_median_dim(
        t: FlodlTensor, dim: i32, keepdim: i32,
        values: *mut FlodlTensor, indices: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_count_nonzero(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_count_nonzero_dim(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Query ops ---

    /// Indices of all non-zero elements.
    pub fn flodl_nonzero(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Unique elements; `inverse_indices` is written only meaningfully
    /// when `return_inverse` is set (i32 flags).
    pub fn flodl_unique(
        t: FlodlTensor, sorted: i32, return_inverse: i32,
        output: *mut FlodlTensor, inverse_indices: *mut FlodlTensor,
    ) -> *mut i8;
    /// Insertion positions of `values` within `sorted_seq`.
    pub fn flodl_searchsorted(
        sorted_seq: FlodlTensor, values: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;
672
    // --- Shape ops (advanced) ---

    /// Diagonal at `offset` taken over dimensions `dim1`/`dim2`.
    pub fn flodl_diagonal(
        t: FlodlTensor, offset: i64, dim1: i32, dim2: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
    /// Move dimension `src` to position `dst`.
    pub fn flodl_movedim(
        t: FlodlTensor, src: i64, dst: i64, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// Tile `t` by the `ndim` repeat counts in `reps`.
    pub fn flodl_tile(
        t: FlodlTensor, reps: *mut i64, ndim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Element-wise math (trig, rounding, sign) ---

    pub fn flodl_sin(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_cos(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_tan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_asin(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_acos(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_atan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sign(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_floor(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_ceil(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_round(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_reciprocal(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_erf(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_erfc(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_trunc(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_frac(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_fmod_scalar(t: FlodlTensor, scalar: f64, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_fmod_tensor(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_remainder_scalar(t: FlodlTensor, scalar: f64, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_remainder_tensor(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Linear interpolation between `a` and `b` with scalar `weight`.
    pub fn flodl_lerp(a: FlodlTensor, b: FlodlTensor, weight: f64, result: *mut FlodlTensor) -> *mut i8;
    /// Linear interpolation with a per-element `weight` tensor.
    pub fn flodl_lerp_tensor(a: FlodlTensor, b: FlodlTensor, weight: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Element-wise approximate equality with relative/absolute tolerances.
    pub fn flodl_isclose(a: FlodlTensor, b: FlodlTensor, rtol: f64, atol: f64, result: *mut FlodlTensor) -> *mut i8;
710
    // --- Fused mul-add ---

    /// `beta * bias + alpha * (mat1 @ mat2)` (torch `addmm` layout).
    pub fn flodl_addmm(
        bias: FlodlTensor, mat1: FlodlTensor, mat2: FlodlTensor,
        beta: f64, alpha: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// `self_ + value * t1 * t2`.
    pub fn flodl_addcmul(
        self_: FlodlTensor, t1: FlodlTensor, t2: FlodlTensor,
        value: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// `self_ + value * t1 / t2`.
    pub fn flodl_addcdiv(
        self_: FlodlTensor, t1: FlodlTensor, t2: FlodlTensor,
        value: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Advanced indexing ---

    pub fn flodl_gather(
        t: FlodlTensor, dim: i32, index: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_scatter_add(
        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Sorting ---

    /// Top `k` along `dim`; `largest`/`sorted` are i32 flags. Writes both
    /// the `values` and their `indices`.
    pub fn flodl_topk(
        t: FlodlTensor, k: i64, dim: i32, largest: i32, sorted: i32,
        values: *mut FlodlTensor, indices: *mut FlodlTensor,
    ) -> *mut i8;

    /// Sort along `dim`; writes `values` and `indices`.
    pub fn flodl_sort(
        t: FlodlTensor, dim: i32, descending: i32,
        values: *mut FlodlTensor, indices: *mut FlodlTensor,
    ) -> *mut i8;
749
    // --- Tensor creation (additional) ---

    /// `n × n` identity matrix.
    pub fn flodl_eye(
        n: i64, dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Tensor filled with the scalar `value`.
    pub fn flodl_full(
        shape: *mut i64, ndim: i32, value: f64, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Random permutation of `0..n`.
    pub fn flodl_randperm(
        n: i64, dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Draw `num_samples` category indices from `probs`;
    /// `replacement` is an i32 flag.
    pub fn flodl_multinomial(
        probs: FlodlTensor, num_samples: i64, replacement: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Normalization ---

    /// p-norm normalization along `dim`.
    pub fn flodl_normalize(
        t: FlodlTensor, p: f64, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Shape operations (additional) ---

    /// Split into up to `chunks` pieces along `dim`. The shim writes an
    /// array of handles through `results` and the actual piece count
    /// through `count` — ownership/allocation of that array is defined by
    /// the shim (confirm how it must be freed).
    pub fn flodl_chunk(
        t: FlodlTensor, chunks: i32, dim: i32,
        results: *mut *mut FlodlTensor, count: *mut i32,
    ) -> *mut i8;

    /// Repeat `t` per-dimension by the `ndim` counts in `repeats`.
    pub fn flodl_repeat(
        t: FlodlTensor, repeats: *mut i64, ndim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Constant-pad with `value`; `padding` holds `pad_len` entries.
    pub fn flodl_pad(
        t: FlodlTensor, padding: *mut i64, pad_len: i32, value: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
795
    /// Pad with a selectable mode.
    /// mode: 0=constant, 1=reflect, 2=replicate, 3=circular
    /// (`value` is only meaningful for constant mode).
    pub fn flodl_pad_mode(
        t: FlodlTensor, padding: *mut i64, pad_len: i32,
        mode: i32, value: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Resize to `output_size` (`ndim` spatial extents).
    /// mode: 0=nearest, 1=bilinear, 2=bicubic, 3=trilinear
    pub fn flodl_interpolate(
        input: FlodlTensor, output_size: *mut i64, ndim: i32,
        mode: i32, align_corners: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Reverse along the `ndim` dimensions listed in `dims`.
    pub fn flodl_flip(
        t: FlodlTensor, dims: *mut i64, ndim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Circularly shift by `shift` along `dim`.
    pub fn flodl_roll(
        t: FlodlTensor, shift: i64, dim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Split into pieces of `split_size` along `dim`; array-of-handles
    /// out-params as in [`flodl_chunk`].
    pub fn flodl_split(
        t: FlodlTensor, split_size: i64, dim: i32,
        results: *mut *mut FlodlTensor, count: *mut i32,
    ) -> *mut i8;

    /// Remove `dim`, yielding one handle per slice; array-of-handles
    /// out-params as in [`flodl_chunk`].
    pub fn flodl_unbind(
        t: FlodlTensor, dim: i32,
        results: *mut *mut FlodlTensor, count: *mut i32,
    ) -> *mut i8;

    /// Densely laid-out copy (or the same layout if already contiguous —
    /// shim-defined).
    pub fn flodl_contiguous(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_is_contiguous(t: FlodlTensor) -> i32;

    /// Indices that would sort `t` along `dim`.
    pub fn flodl_argsort(
        t: FlodlTensor, dim: i32, descending: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Scatter `src` into a copy of `t` at positions given by `index`.
    pub fn flodl_scatter(
        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;
842
    // --- Autograd ---

    /// Returns (via `result`) a handle with the requires-grad flag set to
    /// the given i32 value.
    pub fn flodl_set_requires_grad(
        t: FlodlTensor, requires_grad: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_requires_grad(t: FlodlTensor) -> i32;

    /// Run the backward pass from `t`.
    pub fn flodl_backward(t: FlodlTensor) -> *mut i8;

    /// Fetch `t`'s accumulated gradient as a new handle.
    pub fn flodl_grad(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    /// Overwrite `t`'s gradient with `grad`.
    pub fn flodl_set_grad(t: FlodlTensor, grad: FlodlTensor) -> *mut i8;

    pub fn flodl_zero_grad(t: FlodlTensor) -> *mut i8;

    /// Graph-detached view of `t` as a new handle.
    pub fn flodl_detach(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    /// In-place detach (trailing `_` follows torch's in-place convention).
    pub fn flodl_detach_(t: FlodlTensor) -> *mut i8;

    pub fn flodl_is_leaf(t: FlodlTensor) -> i32;

    // --- Autograd context ---
    //
    // Guard objects are opaque heap pointers: create with `_new`, destroy
    // with the matching `_delete`. The grad/autocast state presumably
    // lasts for the guard's lifetime (RAII on the C++ side) — confirm in
    // the shim.

    pub fn flodl_no_grad_guard_new() -> *mut c_void;
    pub fn flodl_no_grad_guard_delete(guard: *mut c_void);
    pub fn flodl_is_grad_enabled() -> i32;

    // --- Autocast (automatic mixed precision) ---

    pub fn flodl_autocast_guard_new(device_type: i32, dtype: i32) -> *mut c_void;
    pub fn flodl_autocast_guard_delete(guard: *mut c_void);
    pub fn flodl_is_autocast_enabled(device_type: i32) -> i32;
876
    // --- Meshgrid ---

    /// Builds coordinate grids from `count` input tensors. The output
    /// array (`*results`, `*result_count`) follows the same ownership
    /// convention as [`flodl_split`].
    pub fn flodl_meshgrid(
        tensors: *mut FlodlTensor, count: i32,
        results: *mut *mut FlodlTensor, result_count: *mut i32,
    ) -> *mut i8;

    // --- Pairwise distance ---

    /// p-norm pairwise distances between the rows of `x` and `y`.
    pub fn flodl_cdist(
        x: FlodlTensor, y: FlodlTensor, p: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Cosine similarity ---

    /// Cosine similarity of `a` and `b` along `dim`; `eps` guards against
    /// division by zero for near-zero norms.
    pub fn flodl_cosine_similarity(
        a: FlodlTensor, b: FlodlTensor,
        dim: i64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Fused ops ---

    /// Fused affine transform (`input @ weight^T + bias`).
    /// NOTE(review): whether a null `bias` handle means "no bias" is
    /// decided by the shim — confirm in shim.h.
    pub fn flodl_linear(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Single GRU cell step: new hidden state from `input` and previous
    /// hidden `hx`, using input-to-hidden (`w_ih`/`b_ih`) and
    /// hidden-to-hidden (`w_hh`/`b_hh`) parameters.
    pub fn flodl_gru_cell(
        input: FlodlTensor, hx: FlodlTensor,
        w_ih: FlodlTensor, w_hh: FlodlTensor,
        b_ih: FlodlTensor, b_hh: FlodlTensor,
        result: FlodlTensor,
    ) -> *mut i8;

    /// Single LSTM cell step from previous hidden `hx` and cell `cx`
    /// states; writes the new hidden state to `h_out` and the new cell
    /// state to `c_out` (both caller-owned on success).
    pub fn flodl_lstm_cell(
        input: FlodlTensor, hx: FlodlTensor, cx: FlodlTensor,
        w_ih: FlodlTensor, w_hh: FlodlTensor,
        b_ih: FlodlTensor, b_hh: FlodlTensor,
        h_out: *mut FlodlTensor, c_out: *mut FlodlTensor,
    ) -> *mut i8;
919
    // Fused sequence ops (cuDNN-accelerated)

    /// Multi-layer LSTM over a full sequence. `params` is an array of
    /// `num_params` weight/bias handles in the layout the shim expects
    /// (presumably w_ih/w_hh/b_ih/b_hh per layer — confirm against
    /// shim.h). `batch_first` selects (batch, seq, feature) input layout;
    /// `flatten` presumably requests cuDNN weight flattening. Writes the
    /// full `output` sequence plus final hidden `h_n` and cell `c_n`
    /// states (all caller-owned on success).
    pub fn flodl_lstm(
        input: FlodlTensor, h_0: FlodlTensor, c_0: FlodlTensor,
        params: *const FlodlTensor, num_params: i64,
        num_layers: i64, batch_first: bool, flatten: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor, c_n: *mut FlodlTensor,
    ) -> *mut i8;

    /// GRU counterpart of [`flodl_lstm`] (no cell state).
    pub fn flodl_gru(
        input: FlodlTensor, h_0: FlodlTensor,
        params: *const FlodlTensor, num_params: i64,
        num_layers: i64, batch_first: bool, flatten: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor,
    ) -> *mut i8;

    // Cached RNN params (zero per-forward overhead)

    /// Pre-packs RNN parameters into an opaque cache returned via `out`,
    /// so repeated forwards skip per-call parameter preparation. `mode`
    /// selects the RNN kind (enum values defined in shim.h — confirm).
    /// Free the cache with [`flodl_rnn_params_free`].
    /// NOTE(review): these declarations spell out `std::os::raw::c_void`
    /// while the rest of the file uses the imported `c_void` — same type;
    /// consider unifying for consistency.
    pub fn flodl_rnn_params_create(
        params: *const FlodlTensor, num_params: i64,
        mode: i64, num_layers: i64, batch_first: bool, flatten: bool,
        out: *mut *mut std::os::raw::c_void,
    ) -> *mut i8;
    /// Releases a cache created by [`flodl_rnn_params_create`].
    pub fn flodl_rnn_params_free(rp: *mut std::os::raw::c_void);
    /// LSTM forward using a pre-packed parameter cache `rp`.
    pub fn flodl_lstm_cached(
        input: FlodlTensor, h_0: FlodlTensor, c_0: FlodlTensor,
        rp: *mut std::os::raw::c_void, num_layers: i64, batch_first: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor, c_n: *mut FlodlTensor,
    ) -> *mut i8;
    /// GRU forward using a pre-packed parameter cache `rp`.
    pub fn flodl_gru_cached(
        input: FlodlTensor, h_0: FlodlTensor,
        rp: *mut std::os::raw::c_void, num_layers: i64, batch_first: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor,
    ) -> *mut i8;

    // --- cuDNN benchmark ---

    /// Enables (non-zero) or disables cuDNN autotuner benchmarking.
    pub fn flodl_set_cudnn_benchmark(enable: i32);

    // --- RNG seed ---

    /// Seeds the default RNG.
    pub fn flodl_manual_seed(seed: u64);
    /// Seeds the RNGs of all CUDA devices.
    pub fn flodl_cuda_manual_seed_all(seed: u64);
961
    // --- In-place operations ---
    // All mutate their first argument and return the usual error-string-
    // or-null result; trailing underscore mirrors the libtorch in-place
    // naming convention.

    /// `t += other`
    pub fn flodl_add_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// `t -= other`
    pub fn flodl_sub_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// `t *= scalar`
    pub fn flodl_mul_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    /// `t += scalar`
    pub fn flodl_add_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    /// Sets all elements of `t` to zero.
    pub fn flodl_zero_(t: FlodlTensor) -> *mut i8;
    /// `t *= other` (element-wise)
    pub fn flodl_mul_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// `t /= scalar`
    pub fn flodl_div_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    /// `t /= other` (element-wise)
    pub fn flodl_div_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    /// Sets all elements of `t` to `value`.
    pub fn flodl_fill_(t: FlodlTensor, value: f64) -> *mut i8;
973
    // --- Fused Adam step ---

    /// One Adam update for a single parameter: mutates `param` and the
    /// first/second-moment buffers `m`/`v` in place. `step` is the
    /// 1-based step count used for bias correction (presumably —
    /// confirm in shim.h).
    pub fn flodl_adam_step(
        param: FlodlTensor, grad: FlodlTensor,
        m: FlodlTensor, v: FlodlTensor,
        lr: f64, beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
    ) -> *mut i8;

    // --- Batched Adam step ---

    /// Adam update over `count` parameters at once; `params`, `grads`,
    /// `ms`, `vs`, and `lrs` are parallel arrays of length `count`
    /// (per-parameter learning rates).
    pub fn flodl_adam_step_batched(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        ms: *mut FlodlTensor, vs: *mut FlodlTensor,
        lrs: *mut f64, count: i32,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
    ) -> *mut i8;

    // --- Fused Adam/AdamW (multi-tensor kernel) ---

    /// Multi-tensor fused Adam: parallel arrays of length `count`, one
    /// shared `lr`. `grad_scale` / `found_inf` support AMP grad scaling
    /// (NOTE(review): presumably nullable when not using AMP — confirm
    /// against shim.h).
    pub fn flodl_fused_adam_(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        exp_avgs: *mut FlodlTensor, exp_avg_sqs: *mut FlodlTensor,
        count: i32, lr: f64,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
        grad_scale: FlodlTensor, found_inf: FlodlTensor,
    ) -> *mut i8;

    /// AdamW variant of [`flodl_fused_adam_`] (decoupled weight decay).
    pub fn flodl_fused_adamw_(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        exp_avgs: *mut FlodlTensor, exp_avg_sqs: *mut FlodlTensor,
        count: i32, lr: f64,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
        grad_scale: FlodlTensor, found_inf: FlodlTensor,
    ) -> *mut i8;
1012
    // --- Pinned memory ---

    /// Copies `t` into page-locked (pinned) host memory for faster
    /// host↔device transfers.
    pub fn flodl_pin_memory(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// 1 if `t` resides in pinned memory, else 0.
    pub fn flodl_is_pinned(t: FlodlTensor) -> i32;

    // --- Memory diagnostics ---

    /// Asks the allocator to return freed memory to the OS
    /// (presumably glibc `malloc_trim`; return value meaning per that
    /// convention — confirm in shim.h).
    pub fn flodl_malloc_trim() -> i32;

    // --- Zero grad (set_to_none) ---

    /// Drops `t`'s gradient entirely rather than zero-filling it
    /// (cheaper than [`flodl_zero_grad`]). Cannot fail.
    pub fn flodl_zero_grad_set_to_none(t: FlodlTensor);

    // --- Fused clip_grad_norm ---

    /// Clips the combined gradient norm of `count` parameters to
    /// `max_norm`, writing the pre-clip total norm to `total_norm_out`.
    pub fn flodl_clip_grad_norm(
        params: *mut FlodlTensor, count: i32,
        max_norm: f64, total_norm_out: *mut f64,
    ) -> *mut i8;
1032
    // --- Multi-tensor foreach operations ---
    // Each operates on an array of `count` tensor handles in one fused
    // launch; the `_` suffix marks in-place mutation of the inputs.

    /// `t += scalar` for every tensor in the array.
    pub fn flodl_foreach_add_scalar_(
        tensors: *mut FlodlTensor, count: i32, scalar: f64,
    ) -> *mut i8;

    /// `t *= scalar` for every tensor in the array.
    pub fn flodl_foreach_mul_scalar_(
        tensors: *mut FlodlTensor, count: i32, scalar: f64,
    ) -> *mut i8;

    /// Zero-fills every tensor in the array.
    pub fn flodl_foreach_zero_(
        tensors: *mut FlodlTensor, count: i32,
    ) -> *mut i8;

    /// `tensors1[i] += alpha * tensors2[i]` for each pair.
    pub fn flodl_foreach_add_list_(
        tensors1: *mut FlodlTensor, tensors2: *mut FlodlTensor,
        count: i32, alpha: f64,
    ) -> *mut i8;

    /// `ord`-norm of each tensor. NOTE(review): whether `results` is a
    /// caller-provided array of `count` slots or filled by the shim is
    /// not visible here — confirm against shim.h.
    pub fn flodl_foreach_norm(
        tensors: *mut FlodlTensor, count: i32, ord: f64,
        results: *mut FlodlTensor,
    ) -> *mut i8;

    /// `tensors1[i] = lerp(tensors1[i], tensors2[i], weight)` in place
    /// (e.g. for EMA weight updates).
    pub fn flodl_foreach_lerp_scalar_(
        tensors1: *mut FlodlTensor, tensors2: *mut FlodlTensor,
        count: i32, weight: f64,
    ) -> *mut i8;

    /// Element-wise in-place square root of every tensor in the array.
    pub fn flodl_foreach_sqrt_(
        tensors: *mut FlodlTensor, count: i32,
    ) -> *mut i8;

    // --- Autograd diagnostics ---

    /// Number of nodes in `t`'s autograd graph (diagnostic; semantics
    /// defined by the shim — confirm in shim.h).
    pub fn flodl_autograd_node_count(t: FlodlTensor) -> i64;
1069
    // --- Fused loss functions ---
    // `reduction` is an integer code defined by shim.h (presumably the
    // libtorch none/mean/sum enum — confirm).

    /// Mean-squared-error loss between `pred` and `target`.
    pub fn flodl_mse_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Cross-entropy loss on logits `pred` and class-index `target`.
    /// Targets equal to `ignore_index` are excluded; `label_smoothing`
    /// in [0, 1] softens the target distribution.
    pub fn flodl_cross_entropy_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, ignore_index: i64, label_smoothing: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Binary cross-entropy on raw logits (numerically stabler than
    /// sigmoid followed by [`flodl_bce_loss`]).
    pub fn flodl_bce_with_logits_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Binary cross-entropy on probabilities (`pred` expected in [0, 1]).
    pub fn flodl_bce_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Mean-absolute-error (L1) loss.
    pub fn flodl_l1_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Smooth L1 (Huber-style) loss; `beta` is the L1/L2 transition point.
    pub fn flodl_smooth_l1_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, beta: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Kullback-Leibler divergence; `input` is expected in log space, and
    /// `log_target` (0/1) says whether `target` is too.
    pub fn flodl_kl_div_loss(
        input: FlodlTensor, target: FlodlTensor,
        reduction: i64, log_target: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Negative log-likelihood loss on log-probability `input`; targets
    /// equal to `ignore_index` are excluded.
    pub fn flodl_nll_loss(
        input: FlodlTensor, target: FlodlTensor,
        reduction: i64, ignore_index: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Connectionist temporal classification loss; `blank` is the blank
    /// label index, lengths are given as integer tensors.
    pub fn flodl_ctc_loss(
        log_probs: FlodlTensor, targets: FlodlTensor,
        input_lengths: FlodlTensor, target_lengths: FlodlTensor,
        blank: i64, reduction: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
1122
    // --- Fused batch normalization ---

    /// Batch normalization. With `training` non-zero, batch statistics
    /// are used and `running_mean`/`running_var` are updated with
    /// `momentum`; otherwise the running statistics are used. `eps` is
    /// added to the variance for numerical stability.
    pub fn flodl_batch_norm(
        input: FlodlTensor, weight: FlodlTensor,
        bias: FlodlTensor, running_mean: FlodlTensor,
        running_var: FlodlTensor, training: i32,
        momentum: f64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Fused dropout ---

    /// Element dropout with probability `p`; identity when `training`
    /// is 0.
    pub fn flodl_dropout(
        input: FlodlTensor, p: f64, training: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Channel-wise dropout (zeroes whole feature maps) with probability
    /// `p`; identity when `training` is 0.
    pub fn flodl_feature_dropout(
        input: FlodlTensor, p: f64, training: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- In-place copy ---

    /// Copies `src` into `dst` in place; `non_blocking` non-zero allows
    /// an asynchronous host↔device copy (requires pinned memory to
    /// actually overlap — see [`flodl_pin_memory`]).
    pub fn flodl_copy_(dst: FlodlTensor, src: FlodlTensor, non_blocking: i32) -> *mut i8;

    // --- Memory format ---

    /// Returns `t` in channels-last (NHWC) memory format.
    pub fn flodl_to_channels_last(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// 1 if `t` is in channels-last memory format, else 0.
    pub fn flodl_is_channels_last(t: FlodlTensor) -> i32;

    // --- Embedding bag ---

    /// Embedding lookup with per-bag reduction: `indices` are gathered
    /// from `weight` and reduced per bag delimited by `offsets`; `mode`
    /// selects the reduction (integer code defined in shim.h —
    /// presumably sum/mean/max; confirm).
    pub fn flodl_embedding_bag(
        weight: FlodlTensor, indices: FlodlTensor, offsets: FlodlTensor,
        mode: i64, result: *mut FlodlTensor,
    ) -> *mut i8;
1160
    // --- CUDA Graphs ---
    // Graph/event/stream handles are opaque; free with the matching
    // `_delete` function.

    /// Allocates a new CUDA graph object into `graph_out`.
    pub fn flodl_cuda_graph_new(graph_out: *mut *mut c_void) -> *mut i8;
    /// Begins capturing work into `graph`. `pool_hi`/`pool_lo` identify a
    /// memory pool (a pool id split across two u64s — presumably; obtain
    /// via [`flodl_cuda_graph_pool_handle`], confirm encoding in shim.h);
    /// `mode` is the capture mode code.
    pub fn flodl_cuda_graph_capture_begin(
        graph: *mut c_void, pool_hi: u64, pool_lo: u64, mode: i32,
    ) -> *mut i8;
    /// Ends capture; the graph can then be replayed.
    pub fn flodl_cuda_graph_capture_end(graph: *mut c_void) -> *mut i8;
    /// Replays the captured graph.
    pub fn flodl_cuda_graph_replay(graph: *mut c_void) -> *mut i8;
    /// Discards the captured work, allowing re-capture.
    pub fn flodl_cuda_graph_reset(graph: *mut c_void) -> *mut i8;
    /// Frees the graph object.
    pub fn flodl_cuda_graph_delete(graph: *mut c_void);
    /// Reads back the memory-pool id associated with `graph`.
    pub fn flodl_cuda_graph_pool(
        graph: *mut c_void, pool_hi: *mut u64, pool_lo: *mut u64,
    );
    /// Creates a fresh memory-pool id (for sharing a pool across graphs).
    pub fn flodl_cuda_graph_pool_handle(pool_hi: *mut u64, pool_lo: *mut u64);

    // --- CUDA Events ---

    /// Creates a CUDA event; `flags` is a creation-flags code defined in
    /// shim.h (presumably mirrors cudaEventCreateWithFlags — confirm).
    pub fn flodl_cuda_event_new(flags: i32, event_out: *mut *mut c_void) -> *mut i8;
    /// Records `event` on the current stream.
    pub fn flodl_cuda_event_record(event: *mut c_void) -> *mut i8;
    /// Records `event` on a specific `stream`.
    pub fn flodl_cuda_event_record_on_stream(
        event: *mut c_void, stream: *mut c_void,
    ) -> *mut i8;
    /// Blocks the host until `event` has completed.
    pub fn flodl_cuda_event_synchronize(event: *mut c_void) -> *mut i8;
    /// Milliseconds elapsed between two recorded events, written to
    /// `ms_out`.
    pub fn flodl_cuda_event_elapsed_time(
        start: *mut c_void, end: *mut c_void, ms_out: *mut f32,
    ) -> *mut i8;
    /// 1 if `event` has completed, else 0 (non-blocking poll).
    pub fn flodl_cuda_event_query(event: *mut c_void) -> i32;
    /// Frees the event object.
    pub fn flodl_cuda_event_delete(event: *mut c_void);

    // --- CUDA Streams ---

    /// Creates a stream on `device_index`; `high_priority` non-zero
    /// requests a high-priority stream.
    pub fn flodl_cuda_stream_new(
        device_index: i32, high_priority: i32, stream_out: *mut *mut c_void,
    ) -> *mut i8;
    /// Blocks the host until all work queued on `stream` has completed.
    pub fn flodl_cuda_stream_synchronize(stream: *mut c_void) -> *mut i8;
    /// Makes `stream` wait (on-device) for `event` before proceeding.
    pub fn flodl_cuda_stream_wait_event(
        stream: *mut c_void, event: *mut c_void,
    ) -> *mut i8;
    /// 1 if all work on `stream` has completed, else 0 (non-blocking).
    pub fn flodl_cuda_stream_query(stream: *mut c_void) -> i32;
    /// Makes `stream` the current stream for subsequent operations.
    pub fn flodl_cuda_stream_set_current(stream: *mut c_void);
    /// Restores the default stream as current on `device_index`.
    pub fn flodl_cuda_stream_restore_default(device_index: i32);
    /// Frees the stream object.
    pub fn flodl_cuda_stream_delete(stream: *mut c_void);
1203
    // --- NCCL Collective Operations ---
    // Single-process, multi-GPU communicator: one handle drives all
    // devices listed at init.

    /// Creates a communicator over `ndev` devices listed in `devlist`.
    /// Destroy with [`flodl_nccl_destroy`].
    pub fn flodl_nccl_init(
        ndev: i32, devlist: *const i32, handle_out: *mut *mut c_void,
    ) -> *mut i8;
    /// Destroys a communicator from [`flodl_nccl_init`].
    pub fn flodl_nccl_destroy(handle: *mut c_void);
    /// In-place all-reduce across the communicator's devices; `tensors`
    /// and `streams` are per-device arrays (length = communicator size —
    /// presumably; confirm in shim.h). `op` is a reduction-op code.
    pub fn flodl_nccl_all_reduce(
        handle: *mut c_void, tensors: *mut FlodlTensor,
        streams: *mut *mut c_void, op: i32,
    ) -> *mut i8;
    /// Broadcasts from device index `root` to all devices; array
    /// conventions as in [`flodl_nccl_all_reduce`].
    pub fn flodl_nccl_broadcast(
        handle: *mut c_void, tensors: *mut FlodlTensor,
        streams: *mut *mut c_void, root: i32,
    ) -> *mut i8;
    /// Number of ranks in the communicator.
    pub fn flodl_nccl_size(handle: *mut c_void) -> i32;

    // --- NCCL Per-Rank Operations ---
    // Multi-process flavor: each rank holds its own handle, joined via a
    // shared unique id.

    /// Writes the NCCL unique id into `uid_out` (fixed-size buffer —
    /// 128 bytes in NCCL's ABI, presumably; confirm the size shim.h
    /// expects). Distribute this id to all ranks before init.
    pub fn flodl_nccl_get_unique_id(uid_out: *mut u8) -> *mut i8;
    /// Joins the communicator identified by `uid` as rank `rank` of
    /// `nranks`. Destroy with [`flodl_nccl_destroy_rank`].
    pub fn flodl_nccl_init_rank(
        rank: i32, nranks: i32, uid: *const u8, handle_out: *mut *mut c_void,
    ) -> *mut i8;
    /// Destroys a per-rank communicator handle.
    pub fn flodl_nccl_destroy_rank(handle: *mut c_void);
    /// Aborts outstanding NCCL operations on this rank (for error
    /// recovery; leaves the communicator unusable).
    pub fn flodl_nccl_abort_rank(handle: *mut c_void) -> *mut i8;
    /// All-reduces `ntensors` tensors belonging to this rank on `stream`;
    /// `op` is a reduction-op code.
    pub fn flodl_nccl_all_reduce_rank(
        handle: *mut c_void, tensors: *mut FlodlTensor, ntensors: i32,
        stream: *mut c_void, op: i32,
    ) -> *mut i8;
    /// Extracts a single-rank handle for `rank` from a grouped
    /// communicator `group_handle` (semantics defined by the shim —
    /// confirm in shim.h).
    pub fn flodl_nccl_split_rank(
        group_handle: *mut c_void, rank: i32,
        rank_handle_out: *mut *mut c_void,
    ) -> *mut i8;

    // --- Utility ---

    /// Frees an error string returned by any `flodl_*` call.
    /// NOTE(review): error strings are typed `*mut i8` throughout; the
    /// portable spelling is `*mut std::os::raw::c_char` (`c_char` is `u8`
    /// on some targets, e.g. aarch64-linux). Changing it is a Rust-side
    /// API break for existing callers — consider for a major version.
    pub fn flodl_free_string(s: *mut i8);
1240}