1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
//! C ABI declarations for the CUDA kernels.
//!
//! `dsfb_gpu_run_pipeline` is the host-side wrapper defined in
//! `cuda/kernels.cu`. It owns the device-side allocations: the Rust caller
//! supplies host buffers sized to the contract's grid, and the wrapper
//! handles `cudaMalloc` / `cudaMemcpy` / kernel launches / `cudaFree`. The
//! return value is the raw `cudaError_t` (0 = success).
#![cfg(feature = "cuda")]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(missing_docs)]
use core::ffi::c_int;
use dsfb_gpu_debug_core::candidate::CandidateInterval;
use dsfb_gpu_debug_core::consensus::ConsensusCell;
use dsfb_gpu_debug_core::detector::{DetectorCell, DetectorCellWide};
use dsfb_gpu_debug_core::event::GpuTraceEventCompact;
use dsfb_gpu_debug_core::residual::ResidualCell;
use dsfb_gpu_debug_core::sign::SignCell;
use dsfb_gpu_debug_core::window::WindowFeature;
/// repr-C mirror of `cuda/layout.cuh::DetectorThresholds`. The Rust
/// `DetectorThresholds` from the core crate carries the same fields in the
/// same order but has additional methods; the FFI form drops the methods
/// and keeps only the data.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct DetectorThresholdsFfi {
pub spike_q16_raw: i32,
pub sustain_q16_raw: i32,
pub slew_shock_q16_raw: i32,
pub plateau_min_q16_raw: i32,
pub plateau_slew_max_q16_raw: i32,
pub plateau_windows: u32,
pub oscillation_window: u32,
pub oscillation_alternations: u32,
pub deadband_low_q16_raw: i32,
pub deadband_high_q16_raw: i32,
pub error_burst_q16_raw: i32,
pub coupling_lat_q16_raw: i32,
pub coupling_err_q16_raw: i32,
pub variance_window: u32,
pub variance_threshold_q16_raw: i32,
pub ramp_window: u32,
pub recovery_min_norm_q16_raw: i32,
pub clean_band_q16_raw: i32,
pub confuser_min_q16_raw: i32,
pub fanout_drift_q16_raw: i32,
pub entity_anomaly_factor_q16_raw: i32,
pub history_window: u32,
}
impl From<&dsfb_gpu_debug_core::detector::DetectorThresholds> for DetectorThresholdsFfi {
fn from(t: &dsfb_gpu_debug_core::detector::DetectorThresholds) -> Self {
Self {
spike_q16_raw: t.spike_q16_raw,
sustain_q16_raw: t.sustain_q16_raw,
slew_shock_q16_raw: t.slew_shock_q16_raw,
plateau_min_q16_raw: t.plateau_min_q16_raw,
plateau_slew_max_q16_raw: t.plateau_slew_max_q16_raw,
plateau_windows: t.plateau_windows,
oscillation_window: t.oscillation_window,
oscillation_alternations: t.oscillation_alternations,
deadband_low_q16_raw: t.deadband_low_q16_raw,
deadband_high_q16_raw: t.deadband_high_q16_raw,
error_burst_q16_raw: t.error_burst_q16_raw,
coupling_lat_q16_raw: t.coupling_lat_q16_raw,
coupling_err_q16_raw: t.coupling_err_q16_raw,
variance_window: t.variance_window,
variance_threshold_q16_raw: t.variance_threshold_q16_raw,
ramp_window: t.ramp_window,
recovery_min_norm_q16_raw: t.recovery_min_norm_q16_raw,
clean_band_q16_raw: t.clean_band_q16_raw,
confuser_min_q16_raw: t.confuser_min_q16_raw,
fanout_drift_q16_raw: t.fanout_drift_q16_raw,
entity_anomaly_factor_q16_raw: t.entity_anomaly_factor_q16_raw,
history_window: t.history_window,
}
}
}
/// R.9.c-diagnostic — per-stage cudaEvent timings for the D64
/// Throughput tree pipeline. Mirrors
/// `cuda/kernels.cu::D64ThroughputStageTimings` byte-for-byte under
/// `repr(C)`. Populated when the caller passes a non-null
/// `*mut D64ThroughputStageTimingsFfi` to the D64 throughput FFI;
/// when null, the FFI does no event work and pays zero overhead
/// — the existing R.9.b.3 wall time stays the canonical baseline.
///
/// All fields are CUDA-event-derived microseconds, recorded against
/// the pinned/async stream. The field order is the launch order of
/// the current D64 pipeline (residual → sign → wide detector →
/// wide consensus → wide candidate → 4 tree digests → D2H). Each
/// digest is reported individually because the R.9.b.3 16× wall-time
/// regression has several plausible culprits and we need per-kernel
/// granularity to distinguish them.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
pub struct D64ThroughputStageTimingsFfi {
/// `cudaMemcpyAsync` H2D for `WindowFeature[]`.
pub h2d_us: f32,
/// `residual_field_kernel`.
pub residual_us: f32,
/// `drift_slew_sign_kernel` (per-entity EWMA recurrence).
pub sign_us: f32,
/// `detector_motif_kernel_wide_d64` — the wide-mask detector
/// kernel writing 264-byte `DetectorCellWide` cells. Suspected
/// dominant cost at full scale; this field confirms or
/// falsifies that.
pub detector_wide_us: f32,
/// `consensus_grid_kernel_wide` — reads wide detector cells,
/// projects to canonical u16 on-device, writes
/// `ConsensusCell[]`.
pub consensus_wide_us: f32,
/// R.10a — `axis5_grid_sum_kernel_wide`: per-window i64 sum of
/// `axis7_consensus_q` across entities. Hoisted out of the
/// candidate kernel's flush loop so that loop drops from
/// O(length × n_entities) to O(length).
pub axis5_grid_sum_us: f32,
/// `candidate_collapse_kernel_wide` — entity-serial walker.
/// Post-R.10a the flush loop reads the precomputed grid sums
/// instead of re-scanning the entity axis per candidate.
pub candidate_wide_us: f32,
/// Tree-digest stage 0 (residual cells, 16 B/cell).
pub residual_digest_us: f32,
/// Tree-digest stage 1 (sign cells, 20 B/cell).
pub sign_digest_us: f32,
/// Tree-digest stage 2 (wide detector cells, 264 B/cell).
/// The R.9.c-compaction experiment isolated this stage's cost
/// as ~wash on wall time even when shrunk 16×, so this field
/// quantifies that empirically.
pub detector_digest_us: f32,
/// Tree-digest stage 3 (consensus cells, 32 B/cell).
pub consensus_digest_us: f32,
/// `cudaMemcpyAsync` D2H for candidates + counts +
/// 4×32-byte stage digests.
pub d2h_us: f32,
/// Sum of the above: device-side wall time from the H2D
/// `cudaEventRecord` to the D2H `cudaEventRecord` on the
/// captured stream.
pub total_device_us: f32,
}
/// R.8 — per-stage timings for the pinned/async Throughput-digests
/// pipeline. Mirrors `cuda/kernels.cu::R8StageTimings` byte-for-byte
/// under `repr(C)`. Populated when the caller passes a non-null
/// `*mut R8StageTimingsFfi` to the R.6b async FFI; ignored when null.
/// All fields are CUDA-event-derived microseconds.
///
/// Field order is the launch order: H2D → residual → drift/slew sign →
/// detector → consensus → candidate-collapse → 4 digest kernels → D2H.
/// `total_device_us` is the wall time from the H2D event to the D2H
/// event on the captured stream.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
pub struct R8StageTimingsFfi {
/// `cudaMemcpyAsync` H2D for `WindowFeature[]`.
pub h2d_us: f32,
/// `residual_field_kernel`.
pub residual_us: f32,
/// `drift_slew_sign_kernel` (per-entity EWMA recurrence).
pub sign_us: f32,
/// `detector_motif_kernel` (param or const variant).
pub detector_us: f32,
/// `consensus_grid_kernel`.
pub consensus_us: f32,
/// `candidate_collapse_kernel`.
pub candidate_us: f32,
/// 4 per-stage device SHA-256 digest kernels combined (Tier 3B).
pub digests_us: f32,
/// `cudaMemcpyAsync` D2H for candidates + count + 4 stage digests.
pub d2h_us: f32,
/// Sum of the above; effectively the device-side wall time.
pub total_device_us: f32,
}
/// Per-stage CUDA-event timings in microseconds. Mirrors the
/// `PipelineTimings` struct emitted by the host wrapper in
/// `cuda/kernels.cu`. All fields are floats because `cudaEventElapsedTime`
/// returns milliseconds as a `float`; the C++ side converts to µs before
/// writing. `total_us` includes alloc and free; the per-stage fields are
/// pairwise differences between consecutive event records.
///
/// Field order is fixed and must match the C ABI of the C++ struct in
/// `kernels.cu` verbatim.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default)]
#[allow(clippy::struct_field_names)]
pub struct PipelineTimingsFfi {
/// Wall time spent in the seven `cudaMalloc` calls.
pub alloc_us: f32,
/// Wall time for the single host-to-device copy of the
/// `WindowFeature` array.
pub h2d_us: f32,
/// `residual_field` kernel time (from end of H2D to end of kernel 1).
pub k1_residual_us: f32,
/// `drift_slew_sign` kernel time.
pub k2_sign_us: f32,
/// `detector_motif` kernel time.
pub k3_detector_us: f32,
/// `consensus_grid` kernel time.
pub k4_consensus_us: f32,
/// `candidate_collapse` kernel time.
pub k5_candidate_us: f32,
/// Wall time for the six device-to-host copies of intermediates.
pub d2h_us: f32,
/// Wall time spent in `cudaFree` for the seven device buffers.
pub free_us: f32,
/// Total wall time on the device side, from the first event to the
/// last. Equal to the sum of the per-stage fields ± event-record
/// granularity.
pub total_us: f32,
}
extern "C" {
/// Batched allocation: allocates seven device buffers sized for
/// `n_catalogs * n_entities * n_windows` cells each. Pairs with
/// `dsfb_gpu_workspace_free` (the free routine is layout-agnostic).
pub fn dsfb_gpu_workspace_alloc_batched(
n_catalogs: i32,
n_entities: i32,
n_windows: i32,
max_candidates_per_entity: i32,
d_features_out: *mut *mut WindowFeature,
d_residuals_out: *mut *mut ResidualCell,
d_signs_out: *mut *mut SignCell,
d_detectors_out: *mut *mut DetectorCell,
d_consensus_out: *mut *mut ConsensusCell,
d_candidates_out: *mut *mut CandidateInterval,
d_candidate_count_out: *mut *mut i32,
) -> c_int;
/// Batched dispatch: runs K independent catalogs through the
/// deterministic-inference kernels in a single graph of launches.
/// `h_features` carries `n_catalogs * n_entities * n_windows`
/// WindowFeature records laid out catalog-major; outputs follow the
/// same layout. Each kernel reads its catalog index from
/// `blockIdx.z`. Per-catalog outputs are byte-identical to a
/// single-catalog dispatch on the same inputs.
pub fn dsfb_gpu_run_pipeline_batched(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
h_features: *const WindowFeature,
n_catalogs: i32,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_residuals: *mut ResidualCell,
h_signs: *mut SignCell,
h_detectors: *mut DetectorCell,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
timings_out: *mut PipelineTimingsFfi,
) -> c_int;
/// Allocate the seven device buffers a pipeline run needs and write
/// the resulting pointers through the out-parameters. Pairs with
/// `dsfb_gpu_workspace_free`. Returns 0 on success; on failure the
/// already-allocated buffers are rolled back and every out-pointer is
/// set to null.
pub fn dsfb_gpu_workspace_alloc(
n_entities: i32,
n_windows: i32,
max_candidates_per_entity: i32,
d_features_out: *mut *mut WindowFeature,
d_residuals_out: *mut *mut ResidualCell,
d_signs_out: *mut *mut SignCell,
d_detectors_out: *mut *mut DetectorCell,
d_consensus_out: *mut *mut ConsensusCell,
d_candidates_out: *mut *mut CandidateInterval,
d_candidate_count_out: *mut *mut i32,
) -> c_int;
/// Free every non-null device pointer in the workspace. Best-effort;
/// returns the last non-success cudaError seen.
pub fn dsfb_gpu_workspace_free(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
) -> c_int;
/// Run the deterministic pipeline on a pre-allocated workspace. Same
/// outputs as `dsfb_gpu_run_pipeline` but skips the per-call
/// cudaMalloc/cudaFree storm. `timings_out` may be null; when set,
/// `alloc_us` and `free_us` are written as 0 because no allocation
/// occurs here.
pub fn dsfb_gpu_run_pipeline_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_residuals: *mut ResidualCell,
h_signs: *mut SignCell,
h_detectors: *mut DetectorCell,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
timings_out: *mut PipelineTimingsFfi,
) -> c_int;
/// Run the full deterministic pipeline on the GPU. All host buffers are
/// allocated by the caller and must be sized exactly:
///
/// * `h_features`, `h_residuals`, `h_signs`, `h_detectors`, `h_consensus`
/// — each `n_entities * n_windows` elements.
/// * `h_candidates` — `n_entities * max_candidates_per_entity` elements.
/// * `h_candidate_count_per_entity` — `n_entities` elements.
///
/// `timings_out` may be null. When non-null the wrapper records CUDA
/// events around each stage and writes per-stage microsecond timings
/// before returning. Event creation/destruction is fully inside the
/// `if (want_timings)` branch on the C++ side, so passing `null`
/// reduces the function to its non-instrumented path with zero
/// CUDA-event overhead.
///
/// Returns 0 on success or a non-zero `cudaError_t` on failure.
pub fn dsfb_gpu_run_pipeline(
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_residuals: *mut ResidualCell,
h_signs: *mut SignCell,
h_detectors: *mut DetectorCell,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
timings_out: *mut PipelineTimingsFfi,
) -> c_int;
/// Tier 3B self-test: hash `len` bytes from `host_data` on the GPU
/// using the `__device__ SHA-256` defined in `cuda/sha256.cuh`. The
/// 32-byte big-endian digest is written to `out_digest`. The Rust
/// acceptance test asserts byte-equality with the host
/// `dsfb_gpu_debug_core::hash::sha256` implementation over three
/// known-vector inputs (empty, 55 bytes, 64 KiB). Single thread on
/// device; the SHA-256 algorithm is inherently serial.
pub fn dsfb_gpu_sha256_self_test(host_data: *const u8, len: u64, out_digest: *mut u8) -> c_int;
/// S-PERF.14b.1 streaming self-test: hashes `len` bytes from
/// `host_data` using the streaming SHA-256 init/update/finalize
/// helpers defined in `cuda/sha256.cuh` (per-byte updates,
/// stress-testing the buffering + tail-padding logic). The
/// 32-byte big-endian digest is written to `out_digest`. The
/// Rust acceptance test asserts byte-equality with
/// `dsfb_gpu_sha256_self_test` (one-shot) over three known-
/// vector inputs (empty, 55 bytes, 64 KiB) BEFORE the streaming
/// helpers are allowed to be consumed by the rewritten
/// `compact_densor_digest_v1_root_kernel_blockcoop`. Single
/// thread on device; SHA-256 is inherently serial within a
/// stream.
pub fn dsfb_gpu_sha256_streaming_self_test(
host_data: *const u8,
len: u64,
out_digest: *mut u8,
) -> c_int;
/// Generic device-byte allocator used by the workspace digest
/// buffer extension. `size` bytes are `cudaMalloc`'d and written
/// through `out_ptr`. Returns 0 on success or the raw `cudaError_t`.
pub fn dsfb_gpu_alloc_bytes(size: u64, out_ptr: *mut *mut u8) -> c_int;
/// Generic device-byte free. Accepts a null pointer (returns
/// `cudaSuccess`).
pub fn dsfb_gpu_free_bytes(ptr: *mut u8) -> c_int;
/// S-PERF.15.a Step 0 — synchronous device-to-host byte copy
/// helper. Required by the panel-locked pre-fusion byte-capture
/// harness: the harness D2Hs the post-dispatch
/// `d_detectors_wide` + `d_detector_digest_compact` arenas,
/// SHA-256s them, and asserts byte-equality against the
/// `PINNED_PRE_S_PERF_15_A_*_ARENA_SHA256` constants. Returns
/// `cudaSuccess` (= 0) on success. `size == 0` is a no-op; null
/// `d_src` or `h_dst` with non-zero size returns
/// `cudaErrorInvalidValue`.
pub fn dsfb_gpu_memcpy_d2h_bytes(d_src: *const u8, h_dst: *mut u8, size: u64) -> c_int;
/// S-PERF.15.d Step 1 — synchronous `cudaMemset` device-byte
/// helper. Required by the panel-locked Direction A.1 (D64
/// hot-lane projection with one-time zero-init): the
/// `d_detectors_wide` buffer is zero-initialised exactly once
/// at allocation time so that the rewritten
/// `detector_motif_fused_d64_kernel` can skip writing the cold
/// `mask[1..31]` lanes per dispatch without corrupting the
/// byte-identity contract (`PINNED_PRE_S_PERF_15_D_DETECTOR_CELLWIDE_ARENA_SHA256`
/// stays byte-identical because the cold bytes remain stable
/// zero across the buffer's lifetime). `value` is the byte
/// pattern (low 8 bits of `i32`, per `cudaMemset` semantics).
/// Returns `cudaSuccess` (= 0) on success. `size == 0` is a
/// no-op; null `d_dst` with non-zero size returns
/// `cudaErrorInvalidValue`.
pub fn dsfb_gpu_memset_bytes(d_dst: *mut u8, value: c_int, size: u64) -> c_int;
/// S-PERF.14b.1 v3 tile-size sweep entry point. Allocates a
/// synthetic leaf buffer of `n_chunks_per_catalog * 32 *
/// n_catalogs` bytes (cuda-memset to 0xA5; digest values are
/// NOT examined — only kernel wall-time is measured), runs
/// the streaming_blockcoop kernel `n_warmup` warm-up
/// iterations + `n_timed` timed iterations with the given
/// `tile_bytes`, and writes the mean per-iteration wall in
/// nanoseconds to `out_mean_ns`. Timing uses `cudaEvent_t`
/// (sub-µs precision; sufficient for tile-size comparison
/// without requiring sudo ncu). Path 1b v2 is INACTIVE in
/// production until a tile size beats Path 1a's pinned
/// 925.2 µs per-call wall; this entry point is the sweep
/// harness that finds (or fails to find) such a tile size.
pub fn dsfb_gpu_compact_densor_root_streaming_sweep_time(
n_chunks_per_catalog: u32,
chunk_size: u32,
stage_id: u32,
n_catalogs: u32,
tile_bytes: u32,
n_warmup: c_int,
n_timed: c_int,
out_mean_ns: *mut u64,
) -> c_int;
/// S-PERF.14b.1 v3 Path 1a baseline measurement (apples-
/// to-apples comparator). Same harness shape as the
/// streaming sweep above but invokes
/// `compact_densor_digest_v1_root_kernel_blockcoop` (the
/// active production backend). Used by the v3 sweep test
/// to produce per-stage Path 1a walls so the streaming
/// kernel comparison can be honest at every stage shape
/// (residual / sign / detector-compact-pack / consensus)
/// rather than relying on a single averaged Path 1a number.
pub fn dsfb_gpu_compact_densor_root_path1a_sweep_time(
n_chunks_per_catalog: u32,
chunk_size: u32,
stage_id: u32,
n_catalogs: u32,
n_warmup: c_int,
n_timed: c_int,
out_mean_ns: *mut u64,
) -> c_int;
/// R.6a — pinned host-byte allocator. `size` bytes are
/// `cudaMallocHost`'d (page-locked) and written through
/// `out_ptr`. Pinned host memory is the prerequisite for
/// `cudaMemcpyAsync` (without staging through a pageable copy),
/// which R.6b will use for stream-based double-buffered dispatch.
/// Returns 0 on success or the raw `cudaError_t` (and zeroes
/// `*out_ptr` on failure).
pub fn dsfb_gpu_alloc_pinned_bytes(size: u64, out_ptr: *mut *mut u8) -> c_int;
/// R.6a — pinned host-byte free. Mirror of
/// `dsfb_gpu_alloc_pinned_bytes`. Accepts a null pointer
/// (returns `cudaSuccess`). Calls `cudaFreeHost` rather than
/// `cudaFree`; mixing the two is a runtime error.
pub fn dsfb_gpu_free_pinned_bytes(ptr: *mut u8) -> c_int;
/// R.6b — create a CUDA stream. The returned handle is opaque
/// (`cudaStream_t` as `u64`); the Rust side never dereferences
/// it. Round-trip the handle through
/// `dsfb_gpu_run_pipeline_throughput_digests_async_on_workspace`
/// and free it with `dsfb_gpu_destroy_stream`. Returns 0 on
/// success or the raw `cudaError_t`; on failure `*out_stream`
/// is set to 0.
pub fn dsfb_gpu_create_stream(out_stream: *mut u64) -> c_int;
/// R.6b — destroy a CUDA stream created via
/// `dsfb_gpu_create_stream`. Accepts handle = 0 (no-op).
pub fn dsfb_gpu_destroy_stream(stream: u64) -> c_int;
/// R.8.5 — Throughput tree-digest dispatch. Mirrors
/// `dsfb_gpu_run_pipeline_throughput_digests_async_on_workspace`
/// but replaces the 4 single-thread `*_digest_kernel_batched`
/// kernels with the parallel tree-digest pair (one leaf block
/// per chunk + one root block per catalog). Produces a digest
/// that is INTENTIONALLY DIFFERENT from the serial-digest
/// output — the case file records `digest_mode = tree_sha256_v1`
/// plus the chunk_size + chunk_count so replay catches a
/// mode-mismatched receipt.
///
/// The caller supplies two workspace-owned scratch arenas:
/// `d_tree_leaves` holds per-stage leaf digests, striped at
/// `tree_leaves_stride_bytes` per stage; `d_tree_scratch` holds
/// the root kernel's per-stage concatenation scratch striped
/// at `tree_scratch_stride_bytes` per stage. Both are allocated
/// once at `GpuWorkspace::new_with_pinned_async` and sized for
/// the contract's worst case.
///
/// K=1 only at v0. The catalog axis is locked to 1 inside the
/// FFI; batched-K tree digest is a follow-up.
#[allow(clippy::too_many_arguments)]
pub fn dsfb_gpu_run_pipeline_throughput_tree_digests_async_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
d_tree_leaves: *mut u8,
tree_leaves_stride_bytes: u64,
d_tree_scratch: *mut u8,
tree_scratch_stride_bytes: u64,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
stream_handle: u64,
use_const_thresholds: c_int,
tree_chunk_size: u32,
) -> c_int;
/// R.6d — upload `DetectorThresholds` into the device-side
/// `__constant__ c_detector_thresholds` symbol. Synchronous;
/// not bound to any stream so it cannot accidentally be baked
/// into a captured CUDA Graph. Callers upload once at workspace
/// construction; the dispatch wrappers then prefer the
/// `_const` kernel variant. On failure callers retain the
/// param-passing path (the FFI signatures accept a
/// `use_const_thresholds` flag).
///
/// Returns 0 on success or the raw `cudaError_t`. Idempotent:
/// uploading the same canonical thresholds twice is a no-op
/// from the device's perspective.
pub fn dsfb_gpu_upload_detector_thresholds(h_thresholds: *const DetectorThresholdsFfi)
-> c_int;
/// R.6b — explicit-stream async variant of
/// `dsfb_gpu_run_pipeline_throughput_digests_on_workspace`. Same
/// pipeline, same outputs; uses `cudaMemcpyAsync` and
/// stream-bound kernel launches throughout, then issues a
/// single `cudaStreamSynchronize` before returning so the
/// caller's contract (host buffers populated on return) is
/// preserved. `stream_handle = 0` falls back to the default
/// stream (matching the legacy sync wrapper's behaviour).
#[allow(clippy::too_many_arguments)]
pub fn dsfb_gpu_run_pipeline_throughput_digests_async_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
stream_handle: u64,
// R.6d — when non-zero, launch the `detector_motif_kernel_const`
// variant which reads thresholds from `c_detector_thresholds`
// (uploaded once at workspace construction via
// `dsfb_gpu_upload_detector_thresholds`). When zero, falls back
// to the original `detector_motif_kernel` with thresholds
// passed by value.
use_const_thresholds: c_int,
// R.8 — when non-null, the wrapper records cudaEvent_t markers
// around each kernel + each memcpy and writes per-stage
// microsecond timings into the struct. When null, no event
// overhead.
stage_timings_out: *mut R8StageTimingsFfi,
) -> c_int;
/// Tier 3B single-catalog dispatch: runs the pipeline + 4 digest
/// kernels on a pre-existing workspace. Copies back consensus
/// (needed by the bank stage's axis-5 entity-locality gate),
/// candidates, counts, and 4 × 32-byte stage digests. The
/// residual / sign / detector cell buffers stay on the device.
pub fn dsfb_gpu_run_pipeline_throughput_digests_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
) -> c_int;
/// R.4 fused single-catalog dispatch: same evidence chain as
/// `dsfb_gpu_run_pipeline_throughput_digests_on_workspace`, but
/// the residual + sign stages run as a Pre-Alpha EWMA kernel
/// followed by a cell-parallel fused R+S kernel instead of the
/// legacy entity-serial sign kernel. Detector / consensus /
/// candidate / digest kernels are unchanged. Byte-preserving.
///
/// `d_drifts` is the workspace's Pre-Alpha drift buffer
/// (`int32_t*`, sized `n_entities * n_windows`); the C++ side
/// uses it to stage the EWMA recurrence so the fused R+S kernel
/// can read it cell-locally.
pub fn dsfb_gpu_run_pipeline_fused_throughput_digests_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
d_drifts: *mut u8,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
) -> c_int;
/// R.6c — opt-in CUDA Graph capture for the Throughput-digests
/// pipeline. Wraps the R.6b async kernel sequence in a
/// `cudaStreamBeginCapture`/`cudaStreamEndCapture` pair and
/// instantiates the captured topology into a `cudaGraphExec_t`
/// returned as an opaque `u64` through `out_graph_exec`. The
/// host buffers passed here are the workspace's pinned shadows
/// — the captured graph references those exact pointers, so
/// refreshing the pinned `h_features` between launches feeds
/// fresh inputs into the graph without re-capture.
///
/// Capture can legitimately fail (driver / device does not
/// support graphs, etc.). On failure the wrapper drains
/// capture mode and returns the raw `cudaError_t`; the Rust
/// caller demotes to the R.6b async path.
///
/// Implementation note: the C++ wrapper creates a private
/// scratch stream for `cudaStreamBeginCapture` and uses
/// `cudaStreamCaptureModeThreadLocal`. This ensures (a) the
/// workspace's launch stream is never put into capture mode,
/// so the demoted fallback can use it unmodified, and (b)
/// concurrent CUDA work on other threads (cargo test
/// parallelism, etc.) does not invalidate this capture or
/// vice-versa. `stream_handle` is accepted for API symmetry
/// with the R.6b async FFI but is not used inside capture —
/// the resulting `cudaGraphExec_t` is launchable on any
/// stream by design.
#[allow(clippy::too_many_arguments)]
pub fn dsfb_gpu_try_capture_throughput_graph(
out_graph_exec: *mut u64,
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
stream_handle: u64,
// R.6d — non-zero bakes the `detector_motif_kernel_const`
// variant into the captured topology; zero bakes the
// param-passing variant. The caller is responsible for
// uploading thresholds via
// `dsfb_gpu_upload_detector_thresholds` BEFORE this call
// when passing 1, because the upload is intentionally
// kept outside the captured graph.
use_const_thresholds: c_int,
) -> c_int;
/// R.6c — launch a previously captured graph on the given
/// stream and `cudaStreamSynchronize` before returning so the
/// host pinned shadows are valid to read on return. Returns 0
/// on success or the raw `cudaError_t`. Rejects
/// `graph_exec = 0` or `stream_handle = 0` with
/// `cudaErrorInvalidValue`.
pub fn dsfb_gpu_launch_throughput_graph(graph_exec: u64, stream_handle: u64) -> c_int;
/// R.9.b.3 — full D64 Throughput pipeline (wide detector +
/// wide consensus + wide candidate + tree digest of wide
/// stage bytes). Mirrors `dsfb_gpu_run_pipeline_throughput_
/// tree_digests_async_on_workspace` but routes through the
/// wide-mask kernels with on-device projection to the
/// canonical 16-motif basis (the bank ABI is unchanged).
/// Wide detector cells stay on device; only candidates +
/// counts + 4 × 32-byte stage digests cross PCIe.
///
/// R.9.c-diagnostic: when `timings_out` is non-null, the FFI
/// records per-stage cudaEvents on the captured stream and
/// writes the elapsed-time breakdown into the
/// `D64ThroughputStageTimingsFfi` struct. Null = zero overhead
/// and the byte output is identical to the pre-diagnostic
/// path.
#[allow(clippy::too_many_arguments)]
pub fn dsfb_gpu_run_pipeline_throughput_d64_tree_async_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
// S-PERF.14 — Pre-Alpha drift EWMA precompute buffer.
// Sized n_entities × n_windows × i32 per catalog.
// Written by `drift_ewma_precompute_kernel`; consumed
// by `drift_slew_sign_kernel_cellpar`. Allocated by
// `GpuWorkspace::ensure_drift_buffer()`.
d_drift_buffer: *mut i32,
d_detectors_wide: *mut DetectorCellWide,
d_consensus: *mut ConsensusCell,
d_axis5_grid_sum: *mut i64,
d_detector_digest_compact: *mut u8,
d_candidate_fired: *mut u8,
d_candidate_boundaries: *mut u8,
// S-PERF.14c — per-entity intermediate run-boundary
// scratch (`n_entities × MAX_CANDIDATES_PER_ENTITY × 8`
// bytes). Allocated by
// `GpuWorkspace::ensure_candidate_run_buffer()`. Written
// by `candidate_boundary_precompute_kernel`; consumed by
// `candidate_boundary_cellpar_emit_kernel`.
d_candidate_run_buffer: *mut u8,
// S-PERF.14c — per-entity surviving-run count scratch
// (`n_entities × 4` bytes). Allocated by
// `GpuWorkspace::ensure_candidate_run_buffer()`. Written
// by `candidate_boundary_precompute_kernel`; consumed by
// thread 0 of each (entity, catalog) cellpar emit block.
d_candidate_run_count: *mut i32,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
d_tree_leaves: *mut u8,
tree_leaves_stride_bytes: u64,
d_tree_scratch: *mut u8,
tree_scratch_stride_bytes: u64,
d_events: *mut GpuTraceEventCompact,
h_events: *const GpuTraceEventCompact,
n_events: u64,
ticks_per_event_ns: u64,
window_size_ns: u64,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
profile_id: i32,
wide_mask_words_used: i32,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
stream_handle: u64,
tree_chunk_size: u32,
digest_mode_id: i32,
timings_out: *mut D64ThroughputStageTimingsFfi,
) -> c_int;
/// R.9.d.1 — full D128 Throughput pipeline. Mirrors the D64
/// throughput FFI but routes through the D128-specific kernels
/// (`detector_motif_kernel_wide_d128`, `consensus_grid_kernel_
/// wide_d128`, `candidate_pack_kernel_wide_d128`) so the bank
/// sees the OR-projected u16 mask over 8 variants per motif
/// instead of 4. The bank ABI is unchanged — only the wider
/// detector evidence drives more candidate firings.
///
/// R.9.d.1 deliberately omits the R.10b compact-wide-detector-
/// digest pack: the D128 detector tree-digest hashes the full
/// 264-byte `DetectorCellWide` stride. Compact-pack for D128
/// (a 28 B/cell `[header, mask_word_0, mask_word_1]` form) is
/// R.9.d.1-followup work, gated by the post-D128 R.12b sweep.
///
/// No optional `D64ThroughputStageTimings*` here — R.9.d.1
/// keeps the surface narrow; the D128 saturation bench
/// measures host wall-time directly.
#[allow(clippy::too_many_arguments)]
pub fn dsfb_gpu_run_pipeline_throughput_d128_tree_async_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors_wide: *mut DetectorCellWide,
d_consensus: *mut ConsensusCell,
d_axis5_grid_sum: *mut i64,
d_candidate_fired: *mut u8,
d_candidate_boundaries: *mut u8,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
d_tree_leaves: *mut u8,
tree_leaves_stride_bytes: u64,
d_tree_scratch: *mut u8,
tree_scratch_stride_bytes: u64,
d_events: *mut GpuTraceEventCompact,
h_events: *const GpuTraceEventCompact,
n_events: u64,
ticks_per_event_ns: u64,
window_size_ns: u64,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
stream_handle: u64,
tree_chunk_size: u32,
) -> c_int;
/// R.9.d.2.1 — D205 wide-mask throughput dispatch. Mirrors the
/// D128 entry-point signature exactly; the only difference is
/// the underlying CUDA kernels (detector_motif / consensus_grid
/// / candidate_pack are D205-specific). The same workspace
/// fields are used (D205's 264-byte `DetectorCellWide` cell is
/// identical to D128's). D205 is a scaling-ladder
/// byte-equivalence proof, NOT a new performance headline.
pub fn dsfb_gpu_run_pipeline_throughput_d205_tree_async_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors_wide: *mut DetectorCellWide,
d_consensus: *mut ConsensusCell,
d_axis5_grid_sum: *mut i64,
d_candidate_fired: *mut u8,
d_candidate_boundaries: *mut u8,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
d_tree_leaves: *mut u8,
tree_leaves_stride_bytes: u64,
d_tree_scratch: *mut u8,
tree_scratch_stride_bytes: u64,
d_events: *mut GpuTraceEventCompact,
h_events: *const GpuTraceEventCompact,
n_events: u64,
ticks_per_event_ns: u64,
window_size_ns: u64,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
stream_handle: u64,
tree_chunk_size: u32,
) -> c_int;
/// R.9.b.2 — wide-mask detector dispatch (D64). Runs the
/// standard residual → drift/slew sign → wide-detector
/// pipeline on the workspace's device buffers, then copies
/// the resulting `DetectorCellWide[]` (264 bytes per cell)
/// back to the host. Stops at the detector stage so the
/// parity claim against the CPU `evaluate_wide(D64, ...)`
/// reference is precise. Full pipeline integration is R.9.b.3.
///
/// `use_const_thresholds` is accepted for API symmetry with
/// the throughput dispatch path; the wide kernel currently
/// passes thresholds by value and ignores the flag — caller
/// should pass 0.
///
/// Returns 0 on success or the raw `cudaError_t` from the
/// first failing CUDA call.
pub fn dsfb_gpu_evaluate_detector_wide_d64_on_workspace(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors_wide: *mut DetectorCellWide,
h_features: *const WindowFeature,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
h_detectors_wide: *mut DetectorCellWide,
stream_handle: u64,
use_const_thresholds: c_int,
) -> c_int;
/// R.6c — destroy a graph exec created by
/// `dsfb_gpu_try_capture_throughput_graph`. Accepts
/// `graph_exec = 0` (no-op, returns `cudaSuccess`).
pub fn dsfb_gpu_destroy_throughput_graph(graph_exec: u64) -> c_int;
/// Tier 3B batched dispatch: K-catalog pipeline + parallel digest
/// kernels. Residual / sign / detector cells stay on device;
/// per-catalog consensus + candidates + counts + (4 × 32) stage
/// digests come back. The digest buffer layout is stage-major:
/// `[K × residual][K × sign][K × detector][K × consensus]` for a
/// total of `4 × 32 × K` bytes.
pub fn dsfb_gpu_run_pipeline_batched_throughput_digests(
d_features: *mut WindowFeature,
d_residuals: *mut ResidualCell,
d_signs: *mut SignCell,
d_detectors: *mut DetectorCell,
d_consensus: *mut ConsensusCell,
d_candidates: *mut CandidateInterval,
d_candidate_count: *mut i32,
d_stage_digests: *mut u8,
h_features: *const WindowFeature,
n_catalogs: i32,
n_entities: i32,
n_windows: i32,
alpha_q16_raw: i32,
baseline_latency_us: u32,
baseline_error_rate_q_raw: i32,
h_thresholds: *const DetectorThresholdsFfi,
min_detector_count: i32,
min_residual_q_raw: i32,
min_length_windows: i32,
max_candidates_per_entity: i32,
h_consensus: *mut ConsensusCell,
h_candidates: *mut CandidateInterval,
h_candidate_count_per_entity: *mut i32,
h_stage_digests: *mut u8,
) -> c_int;
}