Skip to main content

flodl_sys/
lib.rs

1//! Raw FFI bindings to the libtorch C++ shim.
2//!
3//! Every function that can fail returns a `*mut i8` error string (caller
4//! must free it with [`flodl_free_string`]). A null pointer means success.
5//!
6//! `FlodlTensor` is an opaque `*mut c_void` handle to a heap-allocated
7//! `torch::Tensor`. Caller owns it and must free with [`flodl_free_tensor`].
8
9use std::ffi::c_void;
10
/// Opaque handle to a `torch::Tensor` on the C++ side.
///
/// The pointee is a heap-allocated `torch::Tensor` (see module docs);
/// never dereference the pointer from Rust. Handles written through a
/// `result` out-parameter are owned by the caller and must be released
/// with [`flodl_free_tensor`].
pub type FlodlTensor = *mut c_void;
13
// --- DType constants (must match shim.h) ---
// NOTE(review): the numeric values look like c10::ScalarType codes
// (Half=5, BFloat16=15, Float=6, Double=7, Int=3, Long=4) — if shim.h is
// ever regenerated against a new libtorch, re-verify these.
pub const FLODL_FLOAT16: i32 = 5;
pub const FLODL_BFLOAT16: i32 = 15;
pub const FLODL_FLOAT32: i32 = 6;
pub const FLODL_FLOAT64: i32 = 7;
pub const FLODL_INT32: i32 = 3;
pub const FLODL_INT64: i32 = 4;

// --- Device constants (must match shim.h) ---
// Passed as the `device_type` argument of the functions below; the
// companion `device_index` selects the ordinal (meaningful for CUDA).
pub const FLODL_CPU: i32 = 0;
pub const FLODL_CUDA: i32 = 1;
25
26unsafe extern "C" {
27    // --- Tensor creation ---
28
    // Factory functions. Each writes a newly allocated, caller-owned
    // tensor handle to `result` and returns null on success or a
    // heap-allocated error string (free with `flodl_free_string`; see
    // module docs). `shape` points to `ndim` dimension sizes.
    // NOTE(review): `shape` is declared `*mut i64` although the shim
    // presumably only reads it — confirm against shim.h.

    /// Zero-filled tensor with the given shape, dtype and device.
    pub fn flodl_zeros(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// One-filled tensor with the given shape, dtype and device.
    pub fn flodl_ones(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Uniform-random tensor (presumably `torch::rand`, i.e. `[0, 1)`).
    pub fn flodl_rand(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Standard-normal random tensor (presumably `torch::randn`).
    pub fn flodl_randn(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Tensor built from a caller-supplied buffer `data`.
    /// NOTE(review): whether the shim copies the buffer or aliases it is
    /// not visible here — confirm before freeing `data` while the
    /// resulting tensor is still alive.
    pub fn flodl_from_blob(
        data: *mut c_void, shape: *mut i64, ndim: i32,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `steps` evenly spaced values from `start` to `end`
    /// (`torch::linspace` semantics — presumably endpoint-inclusive).
    pub fn flodl_linspace(
        start: f64, end: f64, steps: i64,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Values from `start` toward `end` in `step` increments
    /// (`torch::arange` semantics — presumably end-exclusive).
    pub fn flodl_arange(
        start: f64, end: f64, step: f64,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Broadcasts `t` to `new_shape` (`ndim` entries), as in
    /// `Tensor::expand`.
    pub fn flodl_expand(
        t: FlodlTensor, new_shape: *mut i64, ndim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
75
    // --- Tensor lifecycle ---

    /// Releases a tensor handle obtained from any `result` out-parameter.
    /// Call at most once per handle (module docs: caller owns handles).
    pub fn flodl_free_tensor(t: FlodlTensor);
    /// New handle to the same underlying tensor storage ("shallow"), as a
    /// separately owned handle the caller must also free.
    pub fn flodl_shallow_clone(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    // --- Tensor metadata ---
    // Plain accessors; unlike the rest of the API they return the value
    // directly and have no error-string channel.

    /// Number of dimensions of `t`.
    pub fn flodl_ndim(t: FlodlTensor) -> i32;
    /// Size of dimension `dim` of `t`.
    pub fn flodl_shape(t: FlodlTensor, dim: i32) -> i64;
    /// Dtype code (one of the `FLODL_*` dtype constants above).
    pub fn flodl_dtype(t: FlodlTensor) -> i32;
    /// Device type (`FLODL_CPU` or `FLODL_CUDA`).
    pub fn flodl_device_type(t: FlodlTensor) -> i32;
    /// Device ordinal (meaningful for CUDA tensors).
    pub fn flodl_device_index(t: FlodlTensor) -> i32;
    /// Total element count.
    pub fn flodl_numel(t: FlodlTensor) -> i64;

    // --- Data access ---

    /// Copies the tensor's raw data into `buffer` (capacity
    /// `buffer_bytes`). Presumably errors if the buffer is too small —
    /// confirm against the shim.
    pub fn flodl_copy_data(
        t: FlodlTensor, buffer: *mut c_void, buffer_bytes: i64,
    ) -> *mut i8;
95
    // --- Arithmetic ---
    // All ops below allocate a fresh caller-owned tensor into `result`;
    // input handles are not consumed. Broadcasting behavior (if any)
    // lives in the C++ shim.

    pub fn flodl_add(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sub(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_mul(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_div(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Matrix product (presumably `torch::matmul` semantics).
    pub fn flodl_matmul(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    /// `t + scalar` (element-wise).
    pub fn flodl_add_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `t * scalar` (element-wise).
    pub fn flodl_mul_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `t / scalar` (element-wise).
    pub fn flodl_div_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `-t` (element-wise negation).
    pub fn flodl_neg(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    // --- Activations ---
    // NOTE(review): `flodl_tanh_op` carries an `_op` suffix unlike its
    // siblings — presumably to avoid a name clash in the shim; confirm
    // the export name matches shim.h.

    pub fn flodl_relu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sigmoid(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_tanh_op(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Softmax along dimension `dim`.
    pub fn flodl_softmax(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
    /// Log-softmax along dimension `dim`.
    pub fn flodl_log_softmax(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_gelu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_silu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Leaky ReLU with the given slope for negative inputs.
    pub fn flodl_leaky_relu(
        t: FlodlTensor, negative_slope: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_elu(t: FlodlTensor, alpha: f64, result: *mut FlodlTensor) -> *mut i8;
    /// Softplus with `beta`/`threshold` as in `torch::softplus`.
    pub fn flodl_softplus(
        t: FlodlTensor, beta: f64, threshold: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_mish(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_selu(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_hardswish(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_hardsigmoid(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// PReLU; `weight` is the learned per-channel (or scalar) slope tensor.
    pub fn flodl_prelu(t: FlodlTensor, weight: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
139
    // --- Layer normalization ---

    /// `torch::native_layer_norm`-style op. Produces THREE caller-owned
    /// handles: the normalized `output` plus the per-row `mean` and
    /// reciprocal-std `rstd` statistics. `normalized_size` is the size of
    /// the trailing dimension being normalized.
    pub fn flodl_native_layer_norm(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        normalized_size: i64, eps: f64,
        output: *mut FlodlTensor, mean: *mut FlodlTensor, rstd: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Group normalization ---

    /// Group norm over `num_groups` channel groups with affine
    /// `weight`/`bias`.
    pub fn flodl_group_norm(
        input: FlodlTensor, num_groups: i64,
        weight: FlodlTensor, bias: FlodlTensor,
        eps: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Element-wise math ---
    // Unary ops: fresh result tensor, input untouched.

    pub fn flodl_exp(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sqrt(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_abs(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Upper-triangular part, offset by `diagonal` (as in `torch::triu`).
    pub fn flodl_triu(t: FlodlTensor, diagonal: i64, result: *mut FlodlTensor) -> *mut i8;
    /// Lower-triangular part, offset by `diagonal` (as in `torch::tril`).
    pub fn flodl_tril(t: FlodlTensor, diagonal: i64, result: *mut FlodlTensor) -> *mut i8;

    /// `t ** exponent` (element-wise).
    pub fn flodl_pow_scalar(
        t: FlodlTensor, exponent: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Clamps every element into `[min_val, max_val]`.
    pub fn flodl_clamp(
        t: FlodlTensor, min_val: f64, max_val: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Clamps from below only.
    pub fn flodl_clamp_min(
        t: FlodlTensor, min_val: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Clamps from above only.
    pub fn flodl_clamp_max(
        t: FlodlTensor, max_val: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_log1p(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_expm1(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log2(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_log10(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
185
    // --- Reductions ---
    // Full reductions (no `dim`) reduce over all elements; `_dim`
    // variants reduce along one axis, with `keepdim` as a 0/1 flag
    // controlling whether the reduced axis is retained with size 1.

    pub fn flodl_sum(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_mean(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_sum_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_mean_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_prod(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_prod_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Running sum along `dim` (same shape as input).
    pub fn flodl_cumsum(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Numerically stable `log(sum(exp(t)))` along `dim`.
    pub fn flodl_logsumexp(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_min(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_max(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Norm of the whole tensor — which `p` is fixed inside the shim
    /// (presumably Frobenius/2-norm); see `flodl_norm_p_dim` for the
    /// parameterized variant.
    pub fn flodl_norm(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    /// NOTE(review): unlike `torch::min(dim)`, only one output handle is
    /// taken — presumably values only, indices dropped by the shim.
    pub fn flodl_min_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// See the note on [`flodl_min_dim`].
    pub fn flodl_max_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Index of the maximum along `dim`.
    pub fn flodl_argmax(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Comparison (return float masks: 0.0 or 1.0) ---
    // Element-wise `t <op> scalar`, producing a float mask tensor.

    pub fn flodl_gt_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ge_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_le_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_lt_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_eq_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ne_scalar(
        t: FlodlTensor, scalar: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Boolean / detection (return float masks) ---

    pub fn flodl_isnan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_isinf(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_logical_and(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_logical_or(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_logical_not(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// True (1.0) if any element is truthy; full reduction.
    pub fn flodl_any(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// True (1.0) if all elements are truthy; full reduction.
    pub fn flodl_all(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
268
    // --- Shape operations ---
    // Results are new caller-owned handles; whether they alias the input
    // storage (views) or copy is decided in the shim — do not assume
    // either here.

    /// Reshape to `shape[0..ndim]`.
    pub fn flodl_reshape(
        t: FlodlTensor, shape: *mut i64, ndim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Swap dimensions `dim0` and `dim1`.
    pub fn flodl_transpose(
        t: FlodlTensor, dim0: i32, dim1: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Reorder dimensions by `dims[0..ndim]`.
    pub fn flodl_permute(
        t: FlodlTensor, dims: *mut i64, ndim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Slice out `index` along `dim` (removes that dimension).
    pub fn flodl_select(
        t: FlodlTensor, dim: i32, index: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Slice `[start, start + length)` along `dim`.
    pub fn flodl_narrow(
        t: FlodlTensor, dim: i32, start: i64, length: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Remove size-1 dimension `dim`.
    pub fn flodl_squeeze(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Insert a size-1 dimension at `dim`.
    pub fn flodl_unsqueeze(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Flatten dimensions `start_dim..=end_dim` into one.
    pub fn flodl_flatten(
        t: FlodlTensor, start_dim: i32, end_dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Scatter ---
    // Out-of-place: `result` is a copy of `input` with the slice replaced
    // by `src` (as in `torch::select_scatter` / `slice_scatter`).

    pub fn flodl_select_scatter(
        input: FlodlTensor, src: FlodlTensor, dim: i32, index: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Replaces the narrow starting at `start` along `dim` with `src`
    /// (length presumably taken from `src`'s extent — confirm in shim).
    pub fn flodl_narrow_scatter(
        input: FlodlTensor, src: FlodlTensor, dim: i32, start: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Indexing ---
    // `index` is itself a tensor handle (presumably int64 indices).

    pub fn flodl_index_select(
        t: FlodlTensor, dim: i32, index: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// `result = t` with `src` rows accumulated at `index` along `dim`.
    pub fn flodl_index_add(
        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Concatenation ---

    /// Two-tensor convenience form of [`flodl_cat`].
    pub fn flodl_cat2(
        a: FlodlTensor, b: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Concatenates `count` handles from the `tensors` array along `dim`.
    pub fn flodl_cat(
        tensors: *mut FlodlTensor, count: i32, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Stacks `count` handles along a new dimension `dim`.
    pub fn flodl_stack(
        tensors: *mut FlodlTensor, count: i32, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Masking ---

    /// Copy of `t` with `value` written wherever `mask` is truthy.
    pub fn flodl_masked_fill(
        t: FlodlTensor, mask: FlodlTensor, value: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Conditional ---

    /// Element-wise select: `x` where `condition` is truthy, else `y`.
    pub fn flodl_where(
        condition: FlodlTensor, x: FlodlTensor, y: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;
355
    // --- Like constructors ---
    // New tensor with the same shape/dtype/device as `t`.

    pub fn flodl_zeros_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_ones_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Like-constructor filled with `value`.
    pub fn flodl_full_like(
        t: FlodlTensor, value: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_rand_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_randn_like(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    // --- Tensor creation (tier 2) ---

    /// Random integers in `[low, high)` (presumably `torch::randint`
    /// semantics — end-exclusive).
    pub fn flodl_randint(
        low: i64, high: i64, shape: *mut i64, ndim: i32,
        dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Uninitialized tensor — contents are garbage until written.
    pub fn flodl_empty(
        shape: *mut i64, ndim: i32, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// One-hot encoding of an index tensor into `num_classes` columns.
    pub fn flodl_one_hot(
        t: FlodlTensor, num_classes: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Bernoulli samples using `t` as the per-element probabilities.
    pub fn flodl_bernoulli(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
386
    // --- Convolution ---
    // NOTE(review): for the 2-D/3-D ops the `stride`/`padding`/`dilation`
    // pointers presumably reference per-spatial-dim arrays (2 entries for
    // 2-D, 3 for 3-D) while the 1-D ops take plain scalars — confirm the
    // expected array lengths against shim.h. `bias` is presumably allowed
    // to be null for bias-free convolution — confirm. Flags such as
    // `ceil_mode` / `count_include_pad` / `align_corners` are 0/1 i32.

    pub fn flodl_conv2d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- 1D convolution ---

    pub fn flodl_conv1d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: i64, padding: i64, dilation: i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Transposed convolution ---

    pub fn flodl_conv_transpose2d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64,
        output_padding: *mut i64, dilation: *mut i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Transposed 1D convolution ---

    pub fn flodl_conv_transpose1d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: i64, padding: i64,
        output_padding: i64, dilation: i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Pooling ---

    pub fn flodl_max_pool2d(
        input: FlodlTensor, kernel_size: *mut i64,
        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
        ceil_mode: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_avg_pool2d(
        input: FlodlTensor, kernel_size: *mut i64,
        stride: *mut i64, padding: *mut i64,
        ceil_mode: i32, count_include_pad: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Pools to a fixed `output_size` regardless of input spatial size.
    pub fn flodl_adaptive_avg_pool2d(
        input: FlodlTensor, output_size: *mut i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_adaptive_max_pool2d(
        input: FlodlTensor, output_size: *mut i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Unfold / Fold (im2col / col2im) ---

    /// Extracts sliding local blocks into columns (`torch::im2col`).
    pub fn flodl_im2col(
        input: FlodlTensor, kernel_size: *mut i64, dilation: *mut i64,
        padding: *mut i64, stride: *mut i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Inverse of [`flodl_im2col`]: folds columns back into an image of
    /// `output_size`.
    pub fn flodl_col2im(
        input: FlodlTensor, output_size: *mut i64,
        kernel_size: *mut i64, dilation: *mut i64,
        padding: *mut i64, stride: *mut i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- 3D convolution ---

    pub fn flodl_conv3d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64, dilation: *mut i64,
        groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_conv_transpose3d(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        stride: *mut i64, padding: *mut i64, output_padding: *mut i64,
        dilation: *mut i64, groups: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- 1D pooling ---

    pub fn flodl_max_pool1d(
        input: FlodlTensor, kernel_size: i64,
        stride: i64, padding: i64, dilation: i64,
        ceil_mode: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_avg_pool1d(
        input: FlodlTensor, kernel_size: i64,
        stride: i64, padding: i64,
        ceil_mode: i32, count_include_pad: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Instance normalization ---

    /// Instance norm; `use_input_stats` is a 0/1 flag selecting batch
    /// statistics vs. `running_mean`/`running_var`.
    pub fn flodl_instance_norm(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        running_mean: FlodlTensor, running_var: FlodlTensor,
        use_input_stats: i32, momentum: f64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- PixelShuffle ---
    // Rearranges channels <-> spatial resolution by the given factor.

    pub fn flodl_pixel_shuffle(
        input: FlodlTensor, upscale_factor: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_pixel_unshuffle(
        input: FlodlTensor, downscale_factor: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Bilinear ---

    /// Bilinear transform `x1 A x2 + b` (as in `torch::bilinear`).
    pub fn flodl_bilinear(
        input1: FlodlTensor, input2: FlodlTensor,
        weight: FlodlTensor, bias: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Grid sampling ---

    /// Samples `input` at the coordinates in `grid`. `mode` and
    /// `padding_mode` are integer codes defined by the shim — their
    /// mapping is not visible here; `align_corners` is a 0/1 flag.
    pub fn flodl_grid_sample(
        input: FlodlTensor, grid: FlodlTensor,
        mode: i32, padding_mode: i32, align_corners: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
522
    // --- Device ---

    /// Copies/moves `t` to the given device; result is a new handle.
    pub fn flodl_to_device(
        t: FlodlTensor, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Non-blocking variant of [`flodl_to_device`]; presumably requires a
    /// later synchronization (`flodl_cuda_synchronize`) before the data
    /// is safe to read — confirm against the shim.
    pub fn flodl_to_device_async(
        t: FlodlTensor, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// 1 if CUDA is usable, else 0.
    pub fn flodl_cuda_is_available() -> i32;
    /// Number of visible CUDA devices.
    pub fn flodl_cuda_device_count() -> i32;
    /// Forces the CUDA runtime objects to be linked in; return value is
    /// presumably a dummy — confirm against the shim.
    pub fn flodl_force_cuda_link() -> i32;
    pub fn flodl_set_current_device(device_index: i32);
    pub fn flodl_get_current_device() -> i32;
    /// Blocks until all queued work on the device has finished.
    pub fn flodl_cuda_synchronize(device_index: i32);

    // --- CUDA memory/utilization (monitor support) ---
    // Byte counters are written through `*mut u64` out-params; the error
    // string convention is the same as everywhere else.

    /// Used/total device memory in bytes.
    pub fn flodl_cuda_mem_info(
        device_index: i32, used_bytes: *mut u64, total_bytes: *mut u64,
    ) -> *mut i8;

    /// Bytes currently allocated by the caching allocator.
    pub fn flodl_cuda_alloc_bytes(
        device_index: i32, allocated_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_active_bytes(
        device_index: i32, active_bytes: *mut u64,
    ) -> *mut i8;

    /// Peak counters since the last [`flodl_cuda_reset_peak_stats`].
    pub fn flodl_cuda_peak_active_bytes(
        device_index: i32, peak_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_peak_reserved_bytes(
        device_index: i32, peak_bytes: *mut u64,
    ) -> *mut i8;

    pub fn flodl_cuda_reset_peak_stats(device_index: i32);

    /// Releases cached allocator blocks back to the driver.
    pub fn flodl_cuda_empty_cache();

    /// GPU utilization; presumably a 0–100 percentage — confirm.
    pub fn flodl_cuda_utilization(device_index: i32) -> i32;

    /// Writes the device name into the caller's `buf` (capacity
    /// `buf_len`); presumably NUL-terminated and truncated to fit —
    /// confirm against the shim.
    pub fn flodl_cuda_device_name(
        device_index: i32, buf: *mut i8, buf_len: i32,
    ) -> *mut i8;
573
    // --- Dtype casting ---

    /// Casts `t` to the dtype code (one of the `FLODL_*` constants).
    pub fn flodl_to_dtype(
        t: FlodlTensor, dtype: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Writes a 0/1 flag to `result`: whether every element is finite
    /// (no NaN/Inf). Note `result` here is `*mut i32`, not a tensor.
    pub fn flodl_all_finite(t: FlodlTensor, result: *mut i32) -> *mut i8;

    // --- Comparison (tensor-tensor, return float masks: 0.0 or 1.0) ---

    pub fn flodl_gt_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_lt_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ge_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_le_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_eq_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_ne_tensor(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Element-wise binary (differentiable) ---

    pub fn flodl_atan2(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Element-wise max of two tensors.
    pub fn flodl_maximum(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Element-wise min of two tensors.
    pub fn flodl_minimum(
        a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Additional reductions ---

    /// Index of the minimum along `dim` (`keepdim` is a 0/1 flag).
    pub fn flodl_argmin(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // `_op` suffix on std presumably avoids a shim name clash — confirm.
    pub fn flodl_var(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_std_op(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;

    pub fn flodl_var_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    pub fn flodl_std_dim(
        t: FlodlTensor, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Running product along `dim`.
    pub fn flodl_cumprod(t: FlodlTensor, dim: i32, result: *mut FlodlTensor) -> *mut i8;
    /// p-norm along one dimension (parameterized version of `flodl_norm`).
    pub fn flodl_norm_p_dim(
        t: FlodlTensor, p: f64, dim: i32, keepdim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// Sum over multiple dimensions at once (`dims[0..ndims]`).
    pub fn flodl_sum_dims(
        t: FlodlTensor, dims: *mut i64, ndims: i32, keepdim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_median(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Median along `dim`; yields both `values` and `indices` handles,
    /// each caller-owned.
    pub fn flodl_median_dim(
        t: FlodlTensor, dim: i32, keepdim: i32,
        values: *mut FlodlTensor, indices: *mut FlodlTensor,
    ) -> *mut i8;
    pub fn flodl_count_nonzero(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_count_nonzero_dim(
        t: FlodlTensor, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;
656
    // --- Query ops ---

    /// Indices of the non-zero elements (as in `torch::nonzero`).
    pub fn flodl_nonzero(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Unique elements; `sorted`/`return_inverse` are 0/1 flags.
    /// NOTE(review): whether `inverse_indices` is populated when
    /// `return_inverse == 0` is not visible here — confirm before
    /// freeing it unconditionally.
    pub fn flodl_unique(
        t: FlodlTensor, sorted: i32, return_inverse: i32,
        output: *mut FlodlTensor, inverse_indices: *mut FlodlTensor,
    ) -> *mut i8;
    /// Insertion positions of `values` into the sorted `sorted_seq`.
    pub fn flodl_searchsorted(
        sorted_seq: FlodlTensor, values: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Shape ops (advanced) ---

    /// Diagonal of the (`dim1`, `dim2`) planes, shifted by `offset`.
    pub fn flodl_diagonal(
        t: FlodlTensor, offset: i64, dim1: i32, dim2: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;
    /// Moves dimension `src` to position `dst`.
    pub fn flodl_movedim(
        t: FlodlTensor, src: i64, dst: i64, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// Tiles `t` by `reps[0..ndim]` repetition counts.
    pub fn flodl_tile(
        t: FlodlTensor, reps: *mut i64, ndim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Element-wise math (trig, rounding, sign) ---
    // Uniform unary signature: fresh result tensor, null-or-error return.

    pub fn flodl_sin(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_cos(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_tan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_asin(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_acos(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_atan(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_sign(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_floor(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_ceil(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_round(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_reciprocal(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_erf(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_erfc(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_trunc(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_frac(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    // fmod truncates toward zero; remainder follows the divisor's sign
    // (C/torch convention — confirm the shim forwards to those ops).
    pub fn flodl_fmod_scalar(t: FlodlTensor, scalar: f64, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_fmod_tensor(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_remainder_scalar(t: FlodlTensor, scalar: f64, result: *mut FlodlTensor) -> *mut i8;
    pub fn flodl_remainder_tensor(a: FlodlTensor, b: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Linear interpolation `a + weight * (b - a)` with a scalar weight.
    pub fn flodl_lerp(a: FlodlTensor, b: FlodlTensor, weight: f64, result: *mut FlodlTensor) -> *mut i8;
    /// Linear interpolation with a per-element weight tensor.
    pub fn flodl_lerp_tensor(a: FlodlTensor, b: FlodlTensor, weight: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Element-wise closeness mask under relative/absolute tolerances.
    pub fn flodl_isclose(a: FlodlTensor, b: FlodlTensor, rtol: f64, atol: f64, result: *mut FlodlTensor) -> *mut i8;
706
    // --- Fused mul-add ---

    /// `beta * bias + alpha * (mat1 @ mat2)` (as in `torch::addmm`).
    pub fn flodl_addmm(
        bias: FlodlTensor, mat1: FlodlTensor, mat2: FlodlTensor,
        beta: f64, alpha: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// `self_ + value * t1 * t2` (element-wise).
    pub fn flodl_addcmul(
        self_: FlodlTensor, t1: FlodlTensor, t2: FlodlTensor,
        value: f64, result: *mut FlodlTensor,
    ) -> *mut i8;
    /// `self_ + value * t1 / t2` (element-wise).
    pub fn flodl_addcdiv(
        self_: FlodlTensor, t1: FlodlTensor, t2: FlodlTensor,
        value: f64, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Advanced indexing ---

    /// Gathers values along `dim` at the positions in `index`.
    pub fn flodl_gather(
        t: FlodlTensor, dim: i32, index: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Copy of `t` with `src` values accumulated at `index` along `dim`.
    pub fn flodl_scatter_add(
        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Sorting ---
    // Both yield two caller-owned handles: `values` and `indices`.
    // `largest`/`sorted`/`descending` are 0/1 flags.

    /// Top `k` entries along `dim`.
    pub fn flodl_topk(
        t: FlodlTensor, k: i64, dim: i32, largest: i32, sorted: i32,
        values: *mut FlodlTensor, indices: *mut FlodlTensor,
    ) -> *mut i8;

    /// Full sort along `dim`.
    pub fn flodl_sort(
        t: FlodlTensor, dim: i32, descending: i32,
        values: *mut FlodlTensor, indices: *mut FlodlTensor,
    ) -> *mut i8;
745
    // --- Tensor creation (additional) ---

    /// `n` x `n` identity matrix.
    pub fn flodl_eye(
        n: i64, dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Tensor of the given shape filled with `value`.
    pub fn flodl_full(
        shape: *mut i64, ndim: i32, value: f64, dtype: i32,
        device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Random permutation of `0..n`.
    pub fn flodl_randperm(
        n: i64, dtype: i32, device_type: i32, device_index: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Draws `num_samples` indices from the `probs` distribution;
    /// `replacement` is a 0/1 flag.
    pub fn flodl_multinomial(
        probs: FlodlTensor, num_samples: i64, replacement: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Normalization ---

    /// Lp-normalizes `t` along `dim` (as in `F::normalize`).
    pub fn flodl_normalize(
        t: FlodlTensor, p: f64, dim: i32, result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Shape operations (additional) ---
    // Splitting ops write a shim-provided array of handles to `results`
    // and its length to `count`. Each handle is caller-owned.
    // NOTE(review): how the handle ARRAY itself is allocated/freed is not
    // visible in this file — confirm the protocol against shim.h before
    // wrapping these.

    pub fn flodl_chunk(
        t: FlodlTensor, chunks: i32, dim: i32,
        results: *mut *mut FlodlTensor, count: *mut i32,
    ) -> *mut i8;

    /// Repeats `t` along each dimension by `repeats[0..ndim]`.
    pub fn flodl_repeat(
        t: FlodlTensor, repeats: *mut i64, ndim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Constant-pads with `value`; `padding` holds `pad_len` entries in
    /// torch's (last-dim-first) pad order — confirm against the shim.
    pub fn flodl_pad(
        t: FlodlTensor, padding: *mut i64, pad_len: i32, value: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // mode: 0=constant, 1=reflect, 2=replicate, 3=circular
    /// Like [`flodl_pad`] but with a selectable padding mode; `value` is
    /// presumably only used for mode 0 (constant).
    pub fn flodl_pad_mode(
        t: FlodlTensor, padding: *mut i64, pad_len: i32,
        mode: i32, value: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // mode: 0=nearest, 1=bilinear, 2=bicubic, 3=trilinear
    /// Resamples `input` to `output_size` (`ndim` spatial sizes);
    /// `align_corners` is a 0/1 flag.
    pub fn flodl_interpolate(
        input: FlodlTensor, output_size: *mut i64, ndim: i32,
        mode: i32, align_corners: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Reverses `t` along the dimensions listed in `dims[0..ndim]`.
    pub fn flodl_flip(
        t: FlodlTensor, dims: *mut i64, ndim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Circularly shifts `t` by `shift` along `dim`.
    pub fn flodl_roll(
        t: FlodlTensor, shift: i64, dim: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Splits into chunks of `split_size` along `dim` (last may be
    /// smaller); see the array-output note above.
    pub fn flodl_split(
        t: FlodlTensor, split_size: i64, dim: i32,
        results: *mut *mut FlodlTensor, count: *mut i32,
    ) -> *mut i8;

    /// Removes `dim` and yields one handle per slice; see the
    /// array-output note above.
    pub fn flodl_unbind(
        t: FlodlTensor, dim: i32,
        results: *mut *mut FlodlTensor, count: *mut i32,
    ) -> *mut i8;

    /// Contiguous-memory copy (or the same layout if already contiguous —
    /// shim decides).
    pub fn flodl_contiguous(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// 0/1: whether `t`'s memory is contiguous.
    pub fn flodl_is_contiguous(t: FlodlTensor) -> i32;

    /// Indices that would sort `t` along `dim`.
    pub fn flodl_argsort(
        t: FlodlTensor, dim: i32, descending: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Copy of `t` with `src` values written (not accumulated) at
    /// `index` along `dim`; compare [`flodl_scatter_add`].
    pub fn flodl_scatter(
        t: FlodlTensor, dim: i32, index: FlodlTensor, src: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;
838
839    // --- Autograd ---
840
841    pub fn flodl_set_requires_grad(
842        t: FlodlTensor, requires_grad: i32, result: *mut FlodlTensor,
843    ) -> *mut i8;
844
845    pub fn flodl_requires_grad(t: FlodlTensor) -> i32;
846
847    pub fn flodl_backward(t: FlodlTensor) -> *mut i8;
848
849    pub fn flodl_grad(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
850
851    pub fn flodl_set_grad(t: FlodlTensor, grad: FlodlTensor) -> *mut i8;
852
853    pub fn flodl_zero_grad(t: FlodlTensor) -> *mut i8;
854
855    pub fn flodl_detach(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
856
857    pub fn flodl_detach_(t: FlodlTensor) -> *mut i8;
858
859    pub fn flodl_is_leaf(t: FlodlTensor) -> i32;
860
861    // --- Autograd context ---
862
863    pub fn flodl_no_grad_guard_new() -> *mut c_void;
864    pub fn flodl_no_grad_guard_delete(guard: *mut c_void);
865    pub fn flodl_is_grad_enabled() -> i32;
866
867    // --- Autocast (automatic mixed precision) ---
868
869    pub fn flodl_autocast_guard_new(device_type: i32, dtype: i32) -> *mut c_void;
870    pub fn flodl_autocast_guard_delete(guard: *mut c_void);
871    pub fn flodl_is_autocast_enabled(device_type: i32) -> i32;
872
    // --- Meshgrid ---

    /// Builds coordinate grids from `count` input tensors (torch
    /// `meshgrid`-style). Writes an array of output handles to `results`
    /// and its length to `result_count`.
    /// NOTE(review): as with `flodl_unbind`, confirm who allocates and how
    /// to free the `results` array itself.
    pub fn flodl_meshgrid(
        tensors: *mut FlodlTensor, count: i32,
        results: *mut *mut FlodlTensor, result_count: *mut i32,
    ) -> *mut i8;

    // --- Pairwise distance ---

    /// p-norm pairwise distance between `x` and `y`; new owned handle in
    /// `result`.
    pub fn flodl_cdist(
        x: FlodlTensor, y: FlodlTensor, p: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Cosine similarity ---

    /// Cosine similarity of `a` and `b` along `dim`, with `eps` guarding
    /// against division by zero; new owned handle in `result`.
    pub fn flodl_cosine_similarity(
        a: FlodlTensor, b: FlodlTensor,
        dim: i64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
894
    // --- Fused ops ---

    /// Fused linear layer over `input` with `weight` and `bias`.
    /// NOTE(review): whether a null `bias` handle is accepted is not
    /// visible from this file — confirm against shim.h.
    pub fn flodl_linear(
        input: FlodlTensor, weight: FlodlTensor, bias: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Single GRU cell step: previous hidden state `hx`, input/hidden
    /// weights `w_ih`/`w_hh` and biases `b_ih`/`b_hh`; the next hidden
    /// state is written to `result`.
    pub fn flodl_gru_cell(
        input: FlodlTensor, hx: FlodlTensor,
        w_ih: FlodlTensor, w_hh: FlodlTensor,
        b_ih: FlodlTensor, b_hh: FlodlTensor,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Single LSTM cell step: previous hidden/cell states `hx`/`cx`;
    /// writes the next hidden state to `h_out` and next cell state to
    /// `c_out` (two owned handles).
    pub fn flodl_lstm_cell(
        input: FlodlTensor, hx: FlodlTensor, cx: FlodlTensor,
        w_ih: FlodlTensor, w_hh: FlodlTensor,
        b_ih: FlodlTensor, b_hh: FlodlTensor,
        h_out: *mut FlodlTensor, c_out: *mut FlodlTensor,
    ) -> *mut i8;

    // Fused sequence ops (cuDNN-accelerated)

    /// Full multi-layer LSTM over a sequence. `params` points at
    /// `num_params` flat weight/bias handles; `batch_first` selects the
    /// input layout; `flatten` presumably requests cuDNN weight
    /// flattening — confirm in shim.h. Note `bool` here must match the
    /// C++ side's bool (ABI-compatible with C `_Bool`).
    /// Outputs: full sequence `output` plus final states `h_n`/`c_n`.
    pub fn flodl_lstm(
        input: FlodlTensor, h_0: FlodlTensor, c_0: FlodlTensor,
        params: *const FlodlTensor, num_params: i64,
        num_layers: i64, batch_first: bool, flatten: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor, c_n: *mut FlodlTensor,
    ) -> *mut i8;

    /// Full multi-layer GRU over a sequence; parameter conventions as in
    /// [`flodl_lstm`], minus the cell state.
    pub fn flodl_gru(
        input: FlodlTensor, h_0: FlodlTensor,
        params: *const FlodlTensor, num_params: i64,
        num_layers: i64, batch_first: bool, flatten: bool,
        output: *mut FlodlTensor, h_n: *mut FlodlTensor,
    ) -> *mut i8;
930
931    // Cached RNN params (zero per-forward overhead)
932    pub fn flodl_rnn_params_create(
933        params: *const FlodlTensor, num_params: i64,
934        mode: i64, num_layers: i64, batch_first: bool, flatten: bool,
935        out: *mut *mut std::os::raw::c_void,
936    ) -> *mut i8;
937    pub fn flodl_rnn_params_free(rp: *mut std::os::raw::c_void);
938    pub fn flodl_lstm_cached(
939        input: FlodlTensor, h_0: FlodlTensor, c_0: FlodlTensor,
940        rp: *mut std::os::raw::c_void, num_layers: i64, batch_first: bool,
941        output: *mut FlodlTensor, h_n: *mut FlodlTensor, c_n: *mut FlodlTensor,
942    ) -> *mut i8;
943    pub fn flodl_gru_cached(
944        input: FlodlTensor, h_0: FlodlTensor,
945        rp: *mut std::os::raw::c_void, num_layers: i64, batch_first: bool,
946        output: *mut FlodlTensor, h_n: *mut FlodlTensor,
947    ) -> *mut i8;
948
    // --- cuDNN benchmark ---

    /// Toggles cuDNN benchmark-mode autotuning (`enable` is a C-style bool).
    pub fn flodl_set_cudnn_benchmark(enable: i32);

    // --- RNG seed ---

    /// Seeds the global RNG.
    pub fn flodl_manual_seed(seed: u64);
    /// Seeds the RNG on all CUDA devices.
    pub fn flodl_cuda_manual_seed_all(seed: u64);

    // --- In-place operations ---
    //
    // Trailing-underscore functions mutate `t` directly; the returned
    // `*mut i8` is the usual error string (null = success).

    pub fn flodl_add_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    pub fn flodl_sub_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    pub fn flodl_mul_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    pub fn flodl_add_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    pub fn flodl_zero_(t: FlodlTensor) -> *mut i8;
    pub fn flodl_mul_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    pub fn flodl_div_scalar_(t: FlodlTensor, scalar: f64) -> *mut i8;
    pub fn flodl_div_(t: FlodlTensor, other: FlodlTensor) -> *mut i8;
    pub fn flodl_fill_(t: FlodlTensor, value: f64) -> *mut i8;
969
    // --- Fused Adam step ---

    /// One Adam update on a single parameter; mutates `param` and the
    /// moment buffers `m`/`v` in place.
    /// NOTE(review): `step` is presumably the 1-based step count used for
    /// bias correction — confirm against the shim.
    pub fn flodl_adam_step(
        param: FlodlTensor, grad: FlodlTensor,
        m: FlodlTensor, v: FlodlTensor,
        lr: f64, beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
    ) -> *mut i8;

    // --- Batched Adam step ---

    /// Adam update over `count` parameters at once. `params`, `grads`,
    /// `ms`, `vs`, and `lrs` each point at `count` entries; note the
    /// per-parameter learning rate in `lrs`.
    pub fn flodl_adam_step_batched(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        ms: *mut FlodlTensor, vs: *mut FlodlTensor,
        lrs: *mut f64, count: i32,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
    ) -> *mut i8;

    // --- Fused Adam/AdamW (multi-tensor kernel) ---

    /// Multi-tensor fused Adam over `count` parameters (shared `lr`).
    /// `grad_scale`/`found_inf` look like AMP gradient-scaler tensors —
    /// NOTE(review): confirm whether null handles are accepted when AMP is
    /// not in use.
    pub fn flodl_fused_adam_(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        exp_avgs: *mut FlodlTensor, exp_avg_sqs: *mut FlodlTensor,
        count: i32, lr: f64,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
        grad_scale: FlodlTensor, found_inf: FlodlTensor,
    ) -> *mut i8;

    /// Multi-tensor fused AdamW (decoupled weight decay); arguments as in
    /// [`flodl_fused_adam_`].
    pub fn flodl_fused_adamw_(
        params: *mut FlodlTensor, grads: *mut FlodlTensor,
        exp_avgs: *mut FlodlTensor, exp_avg_sqs: *mut FlodlTensor,
        count: i32, lr: f64,
        beta1: f64, beta2: f64, eps: f64,
        weight_decay: f64, step: i64,
        grad_scale: FlodlTensor, found_inf: FlodlTensor,
    ) -> *mut i8;
1008
    // --- Pinned memory ---

    /// Writes to `result` a handle to a page-locked (pinned) version of `t`.
    pub fn flodl_pin_memory(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Non-zero if `t`'s storage is pinned (infallible query).
    pub fn flodl_is_pinned(t: FlodlTensor) -> i32;

    // --- Memory diagnostics ---

    /// Asks the allocator to return free memory to the OS.
    /// NOTE(review): the meaning of the i32 result (presumably glibc
    /// `malloc_trim`'s 0/1) is defined by the shim — confirm in shim.h.
    pub fn flodl_malloc_trim() -> i32;

    // --- Zero grad (set_to_none) ---

    /// Clears `t`'s gradient by dropping it rather than zero-filling
    /// (per the section name); infallible, no error string.
    pub fn flodl_zero_grad_set_to_none(t: FlodlTensor);

    // --- Fused clip_grad_norm ---

    /// Clips the combined gradient norm of `count` parameters to `max_norm`
    /// and writes the total norm to `total_norm_out`.
    pub fn flodl_clip_grad_norm(
        params: *mut FlodlTensor, count: i32,
        max_norm: f64, total_norm_out: *mut f64,
    ) -> *mut i8;
1028
    // --- Multi-tensor foreach operations ---
    //
    // Trailing-underscore variants mutate all `count` tensors in place;
    // the returned `*mut i8` is the usual error string (null = success).

    /// Adds `scalar` to each of the `count` tensors.
    pub fn flodl_foreach_add_scalar_(
        tensors: *mut FlodlTensor, count: i32, scalar: f64,
    ) -> *mut i8;

    /// Multiplies each of the `count` tensors by `scalar`.
    pub fn flodl_foreach_mul_scalar_(
        tensors: *mut FlodlTensor, count: i32, scalar: f64,
    ) -> *mut i8;

    /// Zero-fills each of the `count` tensors.
    pub fn flodl_foreach_zero_(
        tensors: *mut FlodlTensor, count: i32,
    ) -> *mut i8;

    /// Pairwise `tensors1[i] += alpha * tensors2[i]`.
    pub fn flodl_foreach_add_list_(
        tensors1: *mut FlodlTensor, tensors2: *mut FlodlTensor,
        count: i32, alpha: f64,
    ) -> *mut i8;

    /// `ord`-norm of each tensor, written to `results`.
    /// NOTE(review): `results` presumably points at `count` caller-provided
    /// handle slots — confirm the allocation contract.
    pub fn flodl_foreach_norm(
        tensors: *mut FlodlTensor, count: i32, ord: f64,
        results: *mut FlodlTensor,
    ) -> *mut i8;

    /// Pairwise in-place lerp of `tensors1[i]` toward `tensors2[i]` by
    /// scalar `weight`.
    pub fn flodl_foreach_lerp_scalar_(
        tensors1: *mut FlodlTensor, tensors2: *mut FlodlTensor,
        count: i32, weight: f64,
    ) -> *mut i8;

    /// In-place square root of each of the `count` tensors.
    pub fn flodl_foreach_sqrt_(
        tensors: *mut FlodlTensor, count: i32,
    ) -> *mut i8;

    // --- Autograd diagnostics ---

    /// Number of nodes reachable in `t`'s autograd graph (infallible
    /// diagnostic query).
    pub fn flodl_autograd_node_count(t: FlodlTensor) -> i64;
1065
    // --- Fused loss functions ---
    //
    // `reduction` is an integer code defined in shim.h; its values are not
    // visible from this file — NOTE(review): confirm the none/mean/sum
    // mapping against shim.h before relying on a literal.

    /// Mean-squared-error loss between `pred` and `target`.
    pub fn flodl_mse_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Cross-entropy loss; `ignore_index` skips that target class and
    /// `label_smoothing` is the smoothing factor.
    pub fn flodl_cross_entropy_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, ignore_index: i64, label_smoothing: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Binary cross-entropy on raw logits (fused sigmoid + BCE).
    pub fn flodl_bce_with_logits_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Binary cross-entropy on probabilities.
    pub fn flodl_bce_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// L1 (mean absolute error) loss.
    pub fn flodl_l1_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Smooth-L1 (Huber-style) loss with transition point `beta`.
    pub fn flodl_smooth_l1_loss(
        pred: FlodlTensor, target: FlodlTensor,
        reduction: i64, beta: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// KL-divergence loss; `log_target` (C-style bool) says whether
    /// `target` is already in log space.
    pub fn flodl_kl_div_loss(
        input: FlodlTensor, target: FlodlTensor,
        reduction: i64, log_target: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Negative log-likelihood loss; `ignore_index` skips that target class.
    pub fn flodl_nll_loss(
        input: FlodlTensor, target: FlodlTensor,
        reduction: i64, ignore_index: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// CTC loss over `log_probs` with per-sequence `input_lengths` /
    /// `target_lengths` tensors; `blank` is the blank-label index.
    pub fn flodl_ctc_loss(
        log_probs: FlodlTensor, targets: FlodlTensor,
        input_lengths: FlodlTensor, target_lengths: FlodlTensor,
        blank: i64, reduction: i64,
        result: *mut FlodlTensor,
    ) -> *mut i8;
1118
    // --- Fused batch normalization ---

    /// Batch normalization over `input`. `training` is a C-style bool;
    /// NOTE(review): in training mode the running statistics are presumably
    /// updated in place using `momentum` — confirm against shim.h.
    pub fn flodl_batch_norm(
        input: FlodlTensor, weight: FlodlTensor,
        bias: FlodlTensor, running_mean: FlodlTensor,
        running_var: FlodlTensor, training: i32,
        momentum: f64, eps: f64,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- Fused dropout ---

    /// Element dropout with probability `p`; `training` (0/1) gates whether
    /// dropout is applied at all.
    pub fn flodl_dropout(
        input: FlodlTensor, p: f64, training: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    /// Feature (channel-wise) dropout variant of [`flodl_dropout`].
    pub fn flodl_feature_dropout(
        input: FlodlTensor, p: f64, training: i32,
        result: *mut FlodlTensor,
    ) -> *mut i8;

    // --- In-place copy ---

    /// Copies `src` into `dst` in place; `non_blocking` (C-style bool)
    /// requests an asynchronous device copy where supported.
    pub fn flodl_copy_(dst: FlodlTensor, src: FlodlTensor, non_blocking: i32) -> *mut i8;

    // --- Memory format ---

    /// Writes to `result` a channels-last version of `t`.
    pub fn flodl_to_channels_last(t: FlodlTensor, result: *mut FlodlTensor) -> *mut i8;
    /// Non-zero if `t` is in channels-last memory format (infallible query).
    pub fn flodl_is_channels_last(t: FlodlTensor) -> i32;

    // --- Embedding bag ---

    /// Embedding-bag lookup over `weight` with `indices` grouped into bags
    /// by `offsets`; `mode` is an integer aggregation code from shim.h
    /// (values not visible here).
    pub fn flodl_embedding_bag(
        weight: FlodlTensor, indices: FlodlTensor, offsets: FlodlTensor,
        mode: i64, result: *mut FlodlTensor,
    ) -> *mut i8;
1156
    // --- CUDA Graphs ---
    //
    // `graph` handles are opaque C++ objects. Memory-pool ids cross the
    // FFI boundary split into two u64 halves (`pool_hi`/`pool_lo`).

    /// Allocates a new CUDA graph object into `graph_out` (caller-owned;
    /// free with [`flodl_cuda_graph_delete`]).
    pub fn flodl_cuda_graph_new(graph_out: *mut *mut c_void) -> *mut i8;
    /// Begins capture into `graph`. `pool_hi`/`pool_lo` identify a memory
    /// pool to share; `mode` is a shim-defined capture-mode code —
    /// NOTE(review): confirm the zero-pool and mode conventions in shim.h.
    pub fn flodl_cuda_graph_capture_begin(
        graph: *mut c_void, pool_hi: u64, pool_lo: u64, mode: i32,
    ) -> *mut i8;
    /// Ends capture for `graph`.
    pub fn flodl_cuda_graph_capture_end(graph: *mut c_void) -> *mut i8;
    /// Replays the captured graph.
    pub fn flodl_cuda_graph_replay(graph: *mut c_void) -> *mut i8;
    /// Resets `graph` so it can be captured again.
    pub fn flodl_cuda_graph_reset(graph: *mut c_void) -> *mut i8;
    /// Frees the graph object (infallible).
    pub fn flodl_cuda_graph_delete(graph: *mut c_void);
    /// Reads `graph`'s memory-pool id into `pool_hi`/`pool_lo`.
    pub fn flodl_cuda_graph_pool(
        graph: *mut c_void, pool_hi: *mut u64, pool_lo: *mut u64,
    );
    /// Creates a fresh shareable memory-pool id, written to
    /// `pool_hi`/`pool_lo`.
    pub fn flodl_cuda_graph_pool_handle(pool_hi: *mut u64, pool_lo: *mut u64);

    // --- Utility ---

    /// Frees an error string returned by any fallible `flodl_*` call
    /// (see the file header: null return means success, non-null must be
    /// released here).
    pub fn flodl_free_string(s: *mut i8);
1175}