rype 1.0.0-rc.1

High-performance genomic sequence classification using minimizer-based k-mer sketching in RY space
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
#ifndef RYPE_H
#define RYPE_H

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Rype: RY-encoded K-mer Partitioning Engine
 *
 * Version: 2.0.0
 *
 * A high-performance genomic sequence classification library using
 * minimizer-based k-mer sketching in RY (purine/pyrimidine) space.
 *
 * ## Quick Start
 *
 *     RypeIndex* idx = rype_index_load("index.ryxdi");
 *     if (!idx) {
 *         fprintf(stderr, "Error: %s\n", rype_get_last_error());
 *         return 1;
 *     }
 *
 *     RypeQuery queries[1] = {{
 *         .id = 0,
 *         .seq = "ACGTACGTACGT...",
 *         .seq_len = 100,
 *         .pair_seq = NULL,
 *         .pair_len = 0
 *     }};
 *
 *     RypeResultArray* results = rype_classify(idx, queries, 1, 0.1);
 *     if (results) {
 *         for (size_t i = 0; i < results->len; i++) {
 *             const char* name = rype_bucket_name(idx, results->data[i].bucket_id);
 *             printf("Query %lld -> %s (score: %.4f)\n",
 *                    (long long)results->data[i].query_id,
 *                    name ? name : "unknown",
 *                    results->data[i].score);
 *         }
 *         rype_results_free(results);
 *     }
 *
 *     rype_index_free(idx);
 *
 * ## Index Format
 *
 * This library uses Parquet-based sharded inverted indices stored as directories:
 *
 *     index.ryxdi/
 *     ├── manifest.toml           # TOML metadata (k, w, salt, bucket info)
 *     ├── buckets.parquet         # Bucket metadata (id, name, sources)
 *     └── inverted/
 *         ├── shard.0.parquet     # (minimizer, bucket_id) pairs
 *         ├── shard.1.parquet     # Additional shards for large indices
 *         └── ...
 *
 * All indices are sharded inverted indices. The format is auto-detected by
 * rype_index_load() when opening a .ryxdi directory.
 *
 * ## Thread Safety
 *
 * - Index loading/freeing: NOT thread-safe (use external synchronization)
 * - Index metadata queries: Thread-safe (read-only)
 * - Classification: Thread-safe (multiple threads can classify with same RypeIndex)
 * - Results: NOT thread-safe (each thread needs its own result array)
 * - Error reporting: Thread-safe (thread-local errors)
 *
 * ## Memory Management
 *
 * - All pointers returned by rype_*_load() must be freed with corresponding _free function
 * - Do NOT free RypeIndex while any rype_classify() calls are in progress
 * - Do NOT free RypeResultArray twice (undefined behavior)
 * - Do NOT free RypeResultArray from multiple threads simultaneously
 * - Strings returned by rype_bucket_name() are owned by the RypeIndex (do NOT free)
 * - Strings returned by rype_get_last_error() are valid until next API call on same thread
 *
 * ## Score Semantics
 *
 * The score returned in RypeHit represents the fraction of query minimizers
 * that match the bucket:
 *
 *     score = matching_minimizers / total_query_minimizers
 *
 * - Range: 0.0 to 1.0
 * - Higher scores indicate stronger matches
 * - Threshold of 0.1 means >= 10% of query minimizers must match
 * - Typical values: 0.05-0.2 for metagenomic classification
 */

// ============================================================================
// OPAQUE TYPES
// ============================================================================

/**
 * Opaque pointer to a Parquet inverted index
 *
 * RypeIndex represents a sharded Parquet inverted index stored as a directory:
 *
 *     index.ryxdi/
 *     ├── manifest.toml       # Index metadata
 *     ├── buckets.parquet     # Bucket names and sources
 *     └── inverted/           # Parquet shards with (minimizer, bucket_id) pairs
 *
 * Create with rype_index_load(), free with rype_index_free().
 *
 * The struct is only forward-declared here, so callers can hold and pass
 * RypeIndex pointers but cannot inspect, copy, or allocate one themselves.
 */
typedef struct RypeIndex RypeIndex;

/**
 * Opaque pointer to a negative minimizer set
 *
 * Contains a shared reference (via Arc) to an index used to filter query
 * minimizers during classification. Used to filter out contaminating sequences
 * (e.g., host DNA, adapters).
 *
 * Memory-efficient: Uses sharded filtering that loads one shard at a time,
 * rather than loading all minimizers into memory at once. Creating a negative
 * set from an index is a cheap operation (increments a reference count) rather
 * than copying data.
 *
 * Create with rype_negative_set_create(), free with rype_negative_set_free().
 * Only forward-declared here, so it can be used through pointers only.
 */
typedef struct RypeNegativeSet RypeNegativeSet;

// ============================================================================
// DATA STRUCTURES
// ============================================================================

/**
 * Query structure for sequence classification
 *
 * One RypeQuery describes one read (single-end) or one read pair (paired-end).
 * Sequence lengths are passed explicitly alongside each pointer.
 *
 * ## Field Requirements
 *
 * @field id       User-defined query identifier, returned unchanged in results.
 *                 Can be any int64_t value.
 *
 * @field seq      Pointer to primary sequence bytes.
 *                 MUST be non-NULL and point to at least seq_len bytes.
 *                 Accepts: A, C, G, T (case-insensitive). N and other IUPAC
 *                 codes reset k-mer extraction (reduce sensitivity, won't crash).
 *                 NOTE(review): since the length is explicit, a NUL terminator
 *                 is presumably not required - confirm against the implementation.
 *
 * @field seq_len  Length of seq in bytes.
 *                 MUST be > 0 and <= 2,000,000,000 (2GB).
 *                 Sequences shorter than k (index k-mer size) produce no minimizers.
 *
 * @field pair_seq Pointer to paired-end sequence bytes, or NULL for single-end.
 *                 If non-NULL, MUST point to at least pair_len bytes.
 *                 Same base requirements as seq.
 *
 * @field pair_len Length of pair_seq in bytes.
 *                 MUST be 0 if pair_seq is NULL.
 *                 MUST be > 0 and <= 2GB if pair_seq is non-NULL.
 *
 * ## Memory Lifetime
 *
 * All pointers must remain valid for the duration of the classification call
 * the query is passed to (rype_classify() and the other functions in this
 * header that take a RypeQuery array).
 * Sequences are NOT copied internally.
 *
 * ## Example
 *
 *     // Single-end query
 *     RypeQuery single = {
 *         .id = 42,
 *         .seq = "ACGTACGTACGT",
 *         .seq_len = 12,
 *         .pair_seq = NULL,
 *         .pair_len = 0
 *     };
 *
 *     // Paired-end query
 *     RypeQuery paired = {
 *         .id = 43,
 *         .seq = read1_data,
 *         .seq_len = read1_len,
 *         .pair_seq = read2_data,
 *         .pair_len = read2_len
 *     };
 */
typedef struct {
    int64_t id;           ///< User-defined query identifier
    const char* seq;      ///< Primary sequence (ACGT, case-insensitive)
    size_t seq_len;       ///< Length of seq in bytes
    const char* pair_seq; ///< Optional paired-end sequence (NULL if single-end)
    size_t pair_len;      ///< Length of pair_seq (0 if single-end)
} RypeQuery;

/**
 * A single (query, bucket) classification hit
 *
 * One RypeHit is reported per bucket that a query matched, so the same
 * query_id may appear in several hits.
 *
 * @field query_id  The id value from the corresponding RypeQuery
 * @field bucket_id Numeric ID of the matched bucket/reference
 * @field score     Classification score: fraction of query minimizers matching
 *                  this bucket. Range [0.0, 1.0]. Higher is better.
 *
 * ## Score Interpretation
 *
 * - 1.0 = All query minimizers found in bucket (perfect match)
 * - 0.5 = Half of query minimizers found
 * - 0.1 = 10% of query minimizers found (default threshold)
 *
 * NOTE(review): the classify functions in this header always take the
 * threshold as an explicit argument, so "default" above presumably refers to
 * the recommended/CLI value rather than an API default - confirm.
 *
 * Multiple hits per query are possible if multiple buckets exceed threshold.
 * Use rype_bucket_name() to convert bucket_id to human-readable name.
 */
typedef struct {
    int64_t query_id;    ///< Query ID from RypeQuery
    uint32_t bucket_id;  ///< Matched bucket/reference ID
    double score;        ///< Classification score (0.0 - 1.0)
} RypeHit;

/**
 * Array of classification results
 *
 * Contains zero or more hits for each query that exceeded the threshold.
 * Queries with no matches above threshold are not included, so len may be 0
 * and is unrelated to the number of queries submitted.
 *
 * @field data     Pointer to array of RypeHit structs
 * @field len      Number of valid hits in data array
 * @field capacity Internal capacity (ignore this field)
 *
 * ## Memory
 *
 * Owned by caller once the classification function that produced it returns
 * (rype_classify(), rype_classify_with_negative(), rype_classify_best_hit(),
 * rype_classify_best_hit_with_negative()).
 * MUST be freed with rype_results_free() exactly once.
 * Do NOT free individual RypeHit elements.
 */
typedef struct {
    RypeHit* data;       ///< Array of hits
    size_t len;          ///< Number of hits
    size_t capacity;     ///< Capacity (internal use)
} RypeResultArray;

// ============================================================================
// INDEX API
// ============================================================================

/**
 * Load an index from disk
 *
 * Supported format:
 * - Parquet inverted index directory (.ryxdi with manifest.toml)
 *
 * @param path  Null-terminated UTF-8 path to index directory
 * @return      Non-NULL RypeIndex pointer on success, NULL on failure
 *
 * ## Errors (returns NULL)
 *
 * - path is NULL
 * - Directory not found or cannot be opened
 * - Missing or corrupted manifest.toml
 * - Unsupported format version
 * - Out of memory
 *
 * ## Thread Safety
 *
 * NOT thread-safe. Use external synchronization if loading from multiple threads.
 *
 * ## Memory
 *
 * Returned RypeIndex must be freed with rype_index_free() when no longer needed.
 *
 * ## Error Details
 *
 * Call rype_get_last_error() for detailed error message. Read or copy the
 * message before making another API call on the same thread; the returned
 * string is only valid until then.
 *
 * ## Example
 *
 *     RypeIndex* idx = rype_index_load("bacteria.ryxdi");
 *     if (!idx) {
 *         fprintf(stderr, "Error: %s\n", rype_get_last_error());
 *         return 1;
 *     }
 */
RypeIndex* rype_index_load(const char* path);

/**
 * Free an index
 *
 * @param index  RypeIndex pointer from rype_index_load(), or NULL (no-op)
 *
 * ## Effects on derived objects
 *
 * - Strings previously returned by rype_bucket_name() for this index are
 *   owned by the index and become invalid.
 * - A RypeNegativeSet created from this index remains valid; it shares the
 *   underlying data by reference counting.
 *
 * ## Thread Safety
 *
 * NOT thread-safe. Do NOT call while any thread is using the RypeIndex
 * in rype_classify() - this causes use-after-free.
 */
void rype_index_free(RypeIndex* index);

/**
 * Get the k-mer size of an index
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       K-mer size (16, 32, or 64), or 0 if index is NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
size_t rype_index_k(const RypeIndex* index);

/**
 * Get the window size of an index
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Window size for minimizer selection, or 0 if index is NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
size_t rype_index_w(const RypeIndex* index);

/**
 * Get the salt value of an index
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Salt XOR'd with k-mer hashes, or 0 if index is NULL
 *
 * ## Note
 *
 * Indices with different salts are incompatible for comparison.
 *
 * NOTE(review): if an index may legitimately store a salt of 0, the return
 * value alone cannot distinguish that from the NULL case - check the pointer
 * before calling.
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
uint64_t rype_index_salt(const RypeIndex* index);

/**
 * Get the number of buckets in an index
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Number of buckets, or 0 if index is NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
uint32_t rype_index_num_buckets(const RypeIndex* index);

/**
 * Check if an index is sharded (DEPRECATED)
 *
 * @deprecated Always returns 1 for a non-NULL index. All indices are
 *             Parquet-based and sharded, so the result carries no
 *             information; see rype_index_num_shards() for the shard count.
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Always 1 (all Parquet indices are sharded), or 0 if NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
int rype_index_is_sharded(const RypeIndex* index);

/**
 * Get the number of shards in an index
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Number of shards (>= 1), or 0 if NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
uint32_t rype_index_num_shards(const RypeIndex* index);

// ============================================================================
// MEMORY UTILITIES
// ============================================================================

/**
 * Get the estimated memory footprint of the index in bytes
 *
 * Returns a **lower bound estimate** of memory needed to load all shards.
 * Actual usage will be 1.5-2x higher due to Arrow array overhead and
 * temporary allocations. Shards are loaded on-demand during classification,
 * so this figure describes the all-shards total rather than what is resident
 * at the time of the call. For the single-shard figure see
 * rype_index_largest_shard_bytes().
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Estimated memory in bytes (lower bound), 0 if NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
size_t rype_index_memory_bytes(const RypeIndex* index);

/**
 * Get the estimated size of the largest shard in bytes
 *
 * Returns the estimated memory needed to load the largest single shard.
 * Use this for memory planning when classifying against sharded indices.
 * For the all-shards total see rype_index_memory_bytes().
 *
 * @param index  RypeIndex pointer; expected non-NULL, but NULL is tolerated
 *               and reported through the return value
 * @return       Largest shard size in bytes, or 0 if NULL
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
size_t rype_index_largest_shard_bytes(const RypeIndex* index);

/**
 * Detect available system memory
 *
 * Detection order:
 * - Linux: cgroups v2, cgroups v1, /proc/meminfo
 * - macOS: sysctl hw.memsize
 * - Fallback: 8GB
 *
 * @return  Available memory in bytes
 *
 * No error value is documented for this function.
 * NOTE(review): presumably the 8GB fallback is returned whenever detection
 * fails, so callers cannot tell a detected value from the fallback - confirm.
 *
 * ## Thread Safety
 *
 * Thread-safe.
 */
size_t rype_detect_available_memory(void);

/**
 * Parse a byte size string (e.g., "4G", "512M", "1024K")
 *
 * Supported suffixes (case-insensitive): B, K, KB, M, MB, G, GB, T, TB
 * Decimal values are supported: "1.5G"
 *
 * @param str  Null-terminated string to parse
 * @return     Size in bytes, or 0 on parse error or NULL input
 *
 * ## Notes
 *
 * - "auto" returns 0 (use rype_detect_available_memory() instead)
 * - Returns 0 for invalid input - no error message set
 * - Because 0 is the only failure indicator, a return of 0 does not by itself
 *   tell a parse error, NULL input, and "auto" apart
 *
 * ## Thread Safety
 *
 * Thread-safe.
 */
size_t rype_parse_byte_suffix(const char* str);

/**
 * Recommend an optimal batch size for Arrow streaming classification
 *
 * The Arrow streaming C API (rype_classify_arrow) processes one RecordBatch at a
 * time, and each batch triggers a full shard loop. If batches are too small, shard
 * I/O dominates and performance suffers. This function computes the batch size that
 * the CLI would use, given the index characteristics, read profile, and available
 * memory.
 *
 * Use rype_calculate_batch_config() instead when the per-batch and peak memory
 * estimates are needed as well as the batch size.
 *
 * @param index            Non-NULL RypeIndex pointer from rype_index_load()
 * @param avg_read_length  Average nucleotide length of individual reads (must be > 0)
 * @param is_paired        Non-zero for paired-end, 0 for single-end
 * @param max_memory       Maximum memory budget in bytes, or 0 to auto-detect
 * @return                 Recommended number of rows per RecordBatch (>= 1000 on
 *                         success), or 0 on error
 *
 * ## Error Handling
 *
 * Returns 0 on error. Call rype_get_last_error() for details.
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access to index).
 */
size_t rype_recommend_batch_size(
    const RypeIndex* index,
    size_t avg_read_length,
    int is_paired,
    size_t max_memory);

/**
 * Full batch configuration returned by rype_calculate_batch_config()
 *
 * Returned by value, so there is nothing to free.
 *
 * On error, all fields are 0. Check batch_size == 0 to detect errors,
 * then call rype_get_last_error() for details.
 */
typedef struct {
    size_t batch_size;       ///< Number of records per batch (>= 1000 on success)
    size_t batch_count;      ///< Reserved for forward-compatibility; always 1.
                             ///< Do not write application logic that depends on this being > 1.
    size_t per_batch_memory; ///< Estimated memory per batch in bytes
    size_t peak_memory;      ///< Estimated peak memory usage in bytes
} RypeBatchConfig;

/*
 * Compile-time ABI safety check: catch Rust/C struct layout drift.
 *
 * This header is meant to be includable from C++ (see the extern "C" guard),
 * but _Static_assert is a C11 keyword that C++ compilers are not required to
 * accept. Use the C++11 spelling there and keep the C11 spelling for C, so the
 * same check is performed in both languages. Pre-C++11 translation units skip
 * the check rather than failing to compile.
 */
#if defined(__cplusplus)
#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1600)
static_assert(sizeof(RypeBatchConfig) == 4 * sizeof(size_t),
    "RypeBatchConfig layout mismatch - update rype.h if struct fields changed");
#  endif
#else
_Static_assert(sizeof(RypeBatchConfig) == 4 * sizeof(size_t),
    "RypeBatchConfig layout mismatch - update rype.h if struct fields changed");
#endif

/**
 * Calculate the full batch configuration for Arrow streaming classification
 *
 * Returns the same information that the CLI uses internally to size batches,
 * including per-batch memory estimates and peak memory. This is a superset of
 * rype_recommend_batch_size() which returns only batch_size.
 *
 * @param index            Non-NULL RypeIndex pointer from rype_index_load()
 * @param avg_read_length  Average nucleotide length of individual reads (must be > 0)
 * @param is_paired        Non-zero for paired-end, 0 for single-end
 * @param max_memory       Maximum memory budget in bytes, or 0 to auto-detect
 * @return                 RypeBatchConfig struct. On error all fields are 0.
 *
 * ## Error Handling
 *
 * Check batch_size == 0 to detect errors. Call rype_get_last_error() for details.
 *
 * ## Memory
 *
 * The configuration is returned by value; there is nothing to free.
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access to index).
 */
RypeBatchConfig rype_calculate_batch_config(
    const RypeIndex* index,
    size_t avg_read_length,
    int is_paired,
    size_t max_memory);

/**
 * Get the name of a bucket by ID
 *
 * @param index      RypeIndex pointer; expected non-NULL, but NULL is
 *                   tolerated and reported through the return value
 * @param bucket_id  Bucket ID from RypeHit.bucket_id
 * @return           Bucket name string (usable with printf("%s"), as in the
 *                   pattern below), or NULL if not found
 *
 * ## NULL Return Cases
 *
 * - index is NULL
 * - bucket_id doesn't exist in the index
 *
 * **WARNING**: ALWAYS check return value before use. Passing NULL to
 * printf("%s") causes undefined behavior. Use pattern:
 *
 *     const char* name = rype_bucket_name(idx, hit->bucket_id);
 *     printf("%s\n", name ? name : "unknown");
 *
 * ## Memory
 *
 * Returned string is owned by the RypeIndex. Do NOT free it.
 * String is valid until RypeIndex is freed; copy it if it must outlive the
 * index.
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
const char* rype_bucket_name(const RypeIndex* index, uint32_t bucket_id);

// ============================================================================
// BUCKET FILE STATISTICS API
// ============================================================================

/**
 * Per-bucket file statistics.
 *
 * Statistics are computed over the total sequence lengths of each input file
 * within a bucket. All values are in base pairs, stored as doubles (so the
 * mean, median and stdev may be fractional).
 *
 * Instances are obtained through RypeBucketFileStatsArray, as returned by
 * rype_bucket_file_stats().
 */
typedef struct {
    uint32_t bucket_id;  ///< Bucket ID this stats entry belongs to
    double mean;         ///< Mean of per-file total sequence lengths
    double median;       ///< Median of per-file total sequence lengths
    double stdev;        ///< Population standard deviation of per-file total sequence lengths
    double min;          ///< Minimum per-file total sequence length
    double max;          ///< Maximum per-file total sequence length
} RypeBucketFileStats;

/**
 * Array of per-bucket file statistics.
 *
 * Returned by rype_bucket_file_stats(). Free with rype_bucket_file_stats_free().
 * Entries are sorted by bucket_id. Only buckets that have statistics are
 * present, so count is not necessarily the number of buckets in the index.
 */
typedef struct {
    RypeBucketFileStats* stats;  ///< Pointer to array of stats entries
    size_t count;                ///< Number of entries in the array
} RypeBucketFileStatsArray;

/**
 * Get per-bucket file statistics for all buckets that have them
 *
 * @param index  Non-NULL RypeIndex pointer from rype_index_load()
 * @return       Pointer to stats array, or NULL if no stats available
 *
 * Returns NULL if the index has no file statistics (e.g., old format indices
 * or merged indices). Entries are sorted by bucket_id.
 *
 * A NULL return is therefore not necessarily an error.
 * NOTE(review): the behavior for a NULL index is not documented here;
 * presumably it also returns NULL - confirm against the implementation.
 *
 * ## Memory
 *
 * Caller takes ownership and MUST call rype_bucket_file_stats_free() when done.
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
RypeBucketFileStatsArray* rype_bucket_file_stats(const RypeIndex* index);

/**
 * Free a bucket file stats array
 *
 * @param stats  Pointer from rype_bucket_file_stats(), or NULL (no-op)
 *
 * Do NOT call twice on the same pointer (undefined behavior).
 * Do NOT access stats->stats or any entry after this call.
 */
void rype_bucket_file_stats_free(RypeBucketFileStatsArray* stats);

// ============================================================================
// NEGATIVE FILTERING API
// ============================================================================

/**
 * Create a negative minimizer set from an index
 *
 * @param negative_index  RypeIndex containing sequences to filter out
 * @return                RypeNegativeSet pointer on success, NULL on failure
 *
 * The negative set uses memory-efficient sharded filtering. During
 * classification, negative shards are processed one at a time, so memory
 * usage is O(single_shard) rather than O(entire_index).
 *
 * The failure conditions are not enumerated here.
 * NOTE(review): a NULL negative_index presumably yields NULL - confirm.
 *
 * ## Example
 *
 * (Error checks omitted for brevity; both calls below can return NULL.)
 *
 *     RypeIndex* neg_idx = rype_index_load("contaminants.ryxdi");
 *     RypeNegativeSet* neg_set = rype_negative_set_create(neg_idx);
 *     // neg_idx can be freed after creating neg_set if desired
 *     rype_index_free(neg_idx);
 *
 *     // Use neg_set for classification
 *     RypeResultArray* results = rype_classify_with_negative(
 *         main_idx, neg_set, queries, num_queries, threshold
 *     );
 *
 *     rype_negative_set_free(neg_set);
 *
 * ## Thread Safety
 *
 * NOT thread-safe. Use external synchronization when creating from multiple threads.
 *
 * ## Memory
 *
 * The returned RypeNegativeSet must be freed with rype_negative_set_free().
 * The RypeNegativeSet shares the underlying index data with the source RypeIndex
 * via reference counting (Arc). This means:
 * - Creating a negative set is cheap (no data copying)
 * - The original negative_index can be freed after creating the negative set
 * - The negative set remains valid and usable after freeing the source index
 */
RypeNegativeSet* rype_negative_set_create(const RypeIndex* negative_index);

/**
 * Free a negative set
 *
 * @param neg_set  RypeNegativeSet pointer from rype_negative_set_create(),
 *                 or NULL (no-op)
 *
 * ## Thread Safety
 *
 * NOT thread-safe. Do not free while classification is in progress.
 */
void rype_negative_set_free(RypeNegativeSet* neg_set);

/**
 * Get the total minimizer count in a negative set
 *
 * @param neg_set  RypeNegativeSet pointer; NULL is tolerated and reported
 *                 through the return value
 * @return         Total minimizer count from the index manifest, or 0 if NULL
 *
 * Note: This returns the total entry count from the index manifest.
 * Since shards may contain duplicate entries, this is an upper bound
 * rather than an exact count of unique minimizers.
 */
size_t rype_negative_set_size(const RypeNegativeSet* neg_set);

// ============================================================================
// CLASSIFICATION API
// ============================================================================

/**
 * Classify a batch of sequences against an index
 *
 * Equivalent to rype_classify_with_negative(index, NULL, queries, num_queries, threshold).
 *
 * @param index        Non-NULL RypeIndex pointer from rype_index_load()
 * @param queries      Array of RypeQuery structs (at least num_queries entries)
 * @param num_queries  Number of queries (must be > 0 and < INTPTR_MAX)
 * @param threshold    Classification threshold (0.0-1.0, must be finite)
 * @return             Non-NULL RypeResultArray on success, NULL on failure.
 *                     Free with rype_results_free().
 *
 * See rype_classify_with_negative() for full documentation.
 */
RypeResultArray* rype_classify(
    const RypeIndex* index,
    const RypeQuery* queries,
    size_t num_queries,
    double threshold
);

/**
 * Classify a batch of sequences with optional negative filtering
 *
 * Processes queries against a Parquet sharded inverted index, loading
 * shards sequentially to minimize memory usage.
 *
 * @param index        Non-NULL RypeIndex pointer from rype_index_load()
 * @param negative_set Optional RypeNegativeSet for filtering (NULL to disable)
 * @param queries      Array of RypeQuery structs (at least num_queries entries)
 * @param num_queries  Number of queries (must be > 0 and < INTPTR_MAX)
 * @param threshold    Classification threshold (0.0-1.0, must be finite)
 * @return             Non-NULL RypeResultArray on success, NULL on failure
 *
 * ## Threshold
 *
 * The threshold is the minimum fraction of query minimizers that must match
 * a bucket for it to be reported. Typical values:
 *
 * - 0.05 - High sensitivity, more false positives
 * - 0.10 - Balanced (default)
 * - 0.20 - High specificity, may miss distant matches
 *
 * ## Negative Filtering
 *
 * When negative_set is provided, minimizers matching the negative set are
 * excluded from the query before scoring. This improves specificity when
 * contaminating sequences (e.g., host DNA) share minimizers with targets.
 *
 * ## Results
 *
 * Success is signalled by a non-NULL return. Queries without any bucket above
 * the threshold contribute no hits (see RypeResultArray), so a successful
 * result may have len == 0.
 *
 * ## Errors (returns NULL)
 *
 * - index is NULL
 * - queries is NULL or num_queries is 0
 * - threshold is NaN, infinity, < 0.0, or > 1.0
 * - Any query violates RypeQuery requirements
 * - Out of memory
 *
 * NOTE(review): presumably rype_get_last_error() describes the failure, as
 * documented for rype_index_load() - confirm.
 *
 * ## Thread Safety
 *
 * Thread-safe. Multiple threads can classify concurrently with same RypeIndex.
 *
 * ## Memory
 *
 * Returned RypeResultArray MUST be freed with rype_results_free().
 * Sequence buffers referenced by queries are not copied and must stay valid
 * until this call returns.
 *
 * ## Performance
 *
 * Uses parallel processing internally. For best throughput, batch many
 * queries together rather than calling one at a time.
 */
RypeResultArray* rype_classify_with_negative(
    const RypeIndex* index,
    const RypeNegativeSet* negative_set,
    const RypeQuery* queries,
    size_t num_queries,
    double threshold
);

/**
 * Classify a batch of sequences and return only the best hit per query
 *
 * Same as rype_classify() but filters results to keep only the highest-scoring
 * bucket for each query. If multiple buckets tie for the best score, one is
 * chosen arbitrarily.
 *
 * @param index        Non-NULL RypeIndex pointer from rype_index_load()
 * @param queries      Array of RypeQuery structs (at least num_queries entries)
 * @param num_queries  Number of queries (must be > 0 and < INTPTR_MAX)
 * @param threshold    Classification threshold (0.0-1.0, must be finite)
 * @return             Non-NULL RypeResultArray on success, NULL on failure.
 *                     Free with rype_results_free().
 *
 * The result array contains at most one hit per query_id. Queries with no
 * bucket above the threshold still produce no hit at all.
 */
RypeResultArray* rype_classify_best_hit(
    const RypeIndex* index,
    const RypeQuery* queries,
    size_t num_queries,
    double threshold
);

/**
 * Classify with negative filtering, returning only the best hit per query
 *
 * Same as rype_classify_with_negative() but filters results to keep only the
 * highest-scoring bucket for each query.
 *
 * @param index         Non-NULL RypeIndex pointer from rype_index_load()
 * @param negative_set  Optional RypeNegativeSet (NULL to disable filtering)
 * @param queries       Array of RypeQuery structs (at least num_queries entries)
 * @param num_queries   Number of queries (must be > 0 and < INTPTR_MAX)
 * @param threshold     Classification threshold (0.0-1.0, must be finite)
 * @return              Non-NULL RypeResultArray on success, NULL on failure.
 *                      Free with rype_results_free().
 *
 * The result array contains at most one hit per query_id.
 * NOTE(review): ties are presumably broken arbitrarily, as documented for
 * rype_classify_best_hit() - confirm.
 */
RypeResultArray* rype_classify_best_hit_with_negative(
    const RypeIndex* index,
    const RypeNegativeSet* negative_set,
    const RypeQuery* queries,
    size_t num_queries,
    double threshold
);

/**
 * Free a result array
 *
 * @param results  RypeResultArray pointer returned by rype_classify(),
 *                 rype_classify_with_negative(), rype_classify_best_hit() or
 *                 rype_classify_best_hit_with_negative(), or NULL (no-op)
 *
 * ## Thread Safety
 *
 * NOT thread-safe. Each result array should be freed by one thread.
 *
 * ## Warning
 *
 * - Do NOT call twice on the same pointer (undefined behavior)
 * - Do NOT access results->data after calling this function
 */
void rype_results_free(RypeResultArray* results);

// ============================================================================
// LOG-RATIO CLASSIFICATION API
// ============================================================================
//
// Log-ratio classification computes log10(numerator_score / denominator_score)
// for each read against two single-bucket indices. This is useful for
// differential abundance analysis.
//
// ## Score Semantics
//
// - Positive log-ratio: read matches numerator more strongly
// - Negative log-ratio: read matches denominator more strongly
// - +inf: numerator > 0, denominator = 0 (or fast-path: numerator exceeded skip threshold)
// - -inf: numerator = 0, denominator > 0
// - NaN: both numerator and denominator = 0 (no evidence)
//
// ## Fast-Path
//
// When numerator_skip_threshold is set (0.0 < threshold <= 1.0), reads whose
// numerator score exceeds the threshold are assigned +inf without classifying
// against the denominator, saving computation.

/**
 * Log-ratio classification result for a single query
 *
 * @field query_id   The id value from the corresponding RypeQuery
 * @field log_ratio  log10(numerator_score / denominator_score).
 *                   Can be +inf, -inf, or NaN (see score semantics above),
 *                   so check it with isinf()/isnan() before using it in
 *                   arithmetic or comparisons.
 * @field fast_path  0 = computed exactly (both indices classified),
 *                   1 = numerator exceeded skip threshold (+inf fast-path)
 */
typedef struct {
    int64_t query_id;    ///< Query ID from RypeQuery
    double log_ratio;    ///< log10(num/denom), can be +inf, -inf, NaN
    int32_t fast_path;   ///< 0 = computed exactly (None), 1 = numerator fast-path (NumHigh)
} RypeLogRatioHit;

/**
 * Array of log-ratio classification results
 *
 * Contains one result per input query (unlike RypeResultArray which only
 * includes queries exceeding the threshold).
 *
 * @field data     Pointer to array of RypeLogRatioHit structs
 * @field len      Number of results in data array (valid indices 0..len-1)
 * @field capacity Internal capacity (ignore this field; do not use it as
 *                 the element count)
 *
 * ## Memory
 *
 * Owned by caller after rype_classify_log_ratio() returns.
 * MUST be freed with rype_log_ratio_results_free() exactly once.
 * Do NOT free the data pointer separately.
 */
typedef struct {
    RypeLogRatioHit* data;  ///< Array of log-ratio results
    size_t len;             ///< Number of results in data
    size_t capacity;        ///< Capacity (internal use only)
} RypeLogRatioResultArray;

/**
 * Validate two indices are compatible for log-ratio classification
 *
 * Checks that both indices have exactly 1 bucket and that their k, w, and
 * salt parameters match. Intended to be called before
 * rype_classify_log_ratio() so incompatibilities are reported up front.
 *
 * @param numerator    Non-NULL RypeIndex pointer (must be single-bucket)
 * @param denominator  Non-NULL RypeIndex pointer (must be single-bucket)
 * @return             0 on success, -1 on error (the reason is reported
 *                     through rype_get_last_error())
 *
 * ## Thread Safety
 *
 * Thread-safe (read-only access).
 */
int rype_validate_log_ratio_indices(
    const RypeIndex* numerator,
    const RypeIndex* denominator
);

/**
 * Classify a batch using log-ratio (numerator vs denominator)
 *
 * Produces log10(numerator_score / denominator_score) for every query.
 * Use rype_validate_log_ratio_indices() to check that the two indices are
 * compatible.
 *
 * @param numerator                Non-NULL RypeIndex pointer (single-bucket)
 * @param denominator              Non-NULL RypeIndex pointer (single-bucket)
 * @param queries                  Array of RypeQuery structs
 * @param num_queries              Number of queries (must be > 0)
 * @param numerator_skip_threshold Fast-path threshold:
 *                                 <= 0.0: disabled (classify all against both)
 *                                 (0.0, 1.0]: enabled (fast-path for high-scoring reads)
 *                                 > 1.0, NaN, inf: error (returns NULL)
 * @return                         Non-NULL RypeLogRatioResultArray on success, NULL on failure
 *
 * ## Results
 *
 * The returned array holds one RypeLogRatioHit per input query. Reads taken
 * through the fast-path carry log_ratio = +inf and fast_path = 1.
 *
 * ## Paired-End Support
 *
 * Paired-end reads are supported via RypeQuery pair_seq/pair_len fields.
 * Minimizers from both ends are combined before scoring.
 *
 * ## Thread Safety
 *
 * Thread-safe. Multiple threads can classify concurrently with the same
 * RypeIndex pointers.
 *
 * ## Memory
 *
 * Returned RypeLogRatioResultArray MUST be freed with rype_log_ratio_results_free().
 */
RypeLogRatioResultArray* rype_classify_log_ratio(
    const RypeIndex* numerator,
    const RypeIndex* denominator,
    const RypeQuery* queries,
    size_t num_queries,
    double numerator_skip_threshold
);

/**
 * Free a log-ratio result array
 *
 * @param results  RypeLogRatioResultArray pointer returned by
 *                 rype_classify_log_ratio(), or NULL (no-op)
 *
 * ## Warning
 *
 * - Do NOT call twice on the same pointer (undefined behavior)
 * - Do NOT access results or results->data after calling this function
 */
void rype_log_ratio_results_free(RypeLogRatioResultArray* results);

// ============================================================================
// ERROR HANDLING
// ============================================================================

/**
 * Get the last error message from a failed API call
 *
 * @return  Null-terminated error string, or NULL if no error
 *
 * ## Lifetime
 *
 * - Error messages are thread-local (each thread has its own)
 * - Error is cleared on successful API calls
 * - Returned pointer is valid until next API call on same thread; copy the
 *   string first if it must outlive that call
 * - Do NOT free the returned pointer
 *
 * ## Thread Safety
 *
 * Thread-safe (returns thread-local error). Because the message is
 * thread-local, query it from the same thread that made the failing call.
 */
const char* rype_get_last_error(void);

// ============================================================================
// MINIMIZER EXTRACTION API
// ============================================================================
//
// These functions extract minimizer hashes (and optionally positions) from
// raw DNA sequences without requiring an index. Useful for building custom
// pipelines, sequence comparison, or pre-computing minimizer sketches.
//
// ## Parameters (common to both functions)
//
// - seq:     Pointer to DNA sequence bytes (A/C/G/T, case-insensitive)
// - seq_len: Length of seq in bytes (must be > 0)
// - k:       K-mer size (must be 16, 32, or 64)
// - w:       Window size for minimizer selection (must be > 0)
// - salt:    XOR salt applied to k-mer hashes (use 0 for no salt)
//
// ## Thread Safety
//
// All extraction functions are thread-safe (pure computation, no shared state).

/**
 * Array of uint64_t values.
 *
 * Used as a building block for extraction results (it is the member type of
 * RypeMinimizerSetResult). Do NOT free this directly - free the containing
 * result struct instead.
 */
typedef struct {
    uint64_t* data;  ///< Array of values (NULL if len == 0, so check len before dereferencing)
    size_t len;      ///< Number of elements in data
} RypeU64Array;

/**
 * Result of rype_extract_minimizer_set()
 *
 * Contains sorted, deduplicated minimizer hash sets for forward and
 * reverse complement strands. The two arrays are independent and may have
 * different lengths; either may be empty (len == 0, data == NULL).
 *
 * Free with rype_minimizer_set_result_free(). Do NOT call twice.
 * Do NOT free the member arrays individually.
 */
typedef struct {
    RypeU64Array forward;              ///< Sorted, deduplicated forward strand hashes
    RypeU64Array reverse_complement;   ///< Sorted, deduplicated RC strand hashes
} RypeMinimizerSetResult;

/**
 * Minimizer hashes and positions for a single strand (SoA layout)
 *
 * The two arrays are parallel: hashes[i] corresponds to positions[i], and
 * both arrays have length len.
 * Positions are 0-based byte offsets into the input sequence where the
 * k-mer starts.
 */
typedef struct {
    uint64_t* hashes;    ///< Minimizer hash values (len entries)
    uint64_t* positions; ///< 0-based k-mer start positions in sequence (len entries)
    size_t len;          ///< Number of minimizers (length of both arrays)
} RypeStrandResult;

/**
 * Result of rype_extract_strand_minimizers()
 *
 * Contains ordered minimizer hashes and positions for forward and
 * reverse complement strands. Positions are non-decreasing within
 * each strand. Each strand has its own len; the two need not be equal.
 *
 * Free with rype_strand_minimizers_result_free(). Do NOT call twice.
 * Do NOT free the per-strand arrays individually.
 */
typedef struct {
    RypeStrandResult forward;              ///< Forward strand minimizers
    RypeStrandResult reverse_complement;   ///< Reverse complement minimizers
} RypeStrandMinimizersResult;

/**
 * Extract sorted, deduplicated minimizer hash sets per strand
 *
 * Returns two sorted, deduplicated arrays of minimizer hashes - one for the
 * forward strand and one for the reverse complement. No index is required.
 *
 * @param seq      Pointer to DNA sequence bytes (A/C/G/T, case-insensitive)
 * @param seq_len  Length of seq in bytes (must be > 0)
 * @param k        K-mer size (must be 16, 32, or 64)
 * @param w        Window size (must be > 0)
 * @param salt     XOR salt for k-mer hashing (use 0 for no salt)
 * @return         Non-NULL result on success, NULL on error
 *
 * ## Notes
 *
 * - Sequences shorter than k produce empty (len=0) arrays; this is a
 *   successful (non-NULL) result, not an error
 * - N bases and other non-ACGT characters reset k-mer extraction
 *
 * ## Thread Safety
 *
 * Thread-safe (pure computation, no shared state).
 *
 * ## Memory
 *
 * Caller must free with rype_minimizer_set_result_free().
 */
RypeMinimizerSetResult* rype_extract_minimizer_set(
    const uint8_t* seq,
    size_t seq_len,
    size_t k,
    size_t w,
    uint64_t salt
);

/**
 * Free a minimizer set result
 *
 * Releases the result struct returned by rype_extract_minimizer_set(); the
 * forward and reverse_complement arrays must not be freed separately.
 *
 * @param result  Pointer from rype_extract_minimizer_set(), or NULL (no-op)
 *
 * Do NOT call twice on the same pointer.
 */
void rype_minimizer_set_result_free(RypeMinimizerSetResult* result);

/**
 * Extract ordered minimizers with positions per strand (SoA layout)
 *
 * Returns minimizer hashes and their 0-based positions for both forward
 * and reverse complement strands. Positions are non-decreasing within
 * each strand. No index is required.
 *
 * @param seq      Pointer to DNA sequence bytes (A/C/G/T, case-insensitive)
 * @param seq_len  Length of seq in bytes (must be > 0)
 * @param k        K-mer size (must be 16, 32, or 64)
 * @param w        Window size (must be > 0)
 * @param salt     XOR salt for k-mer hashing (use 0 for no salt)
 * @return         Non-NULL result on success, NULL on error
 *
 * ## Notes
 *
 * - Sequences shorter than k produce empty (len=0) results; this is a
 *   successful (non-NULL) result, not an error
 * - positions[i] + k <= seq_len for all i
 *
 * ## Thread Safety
 *
 * Thread-safe (pure computation, no shared state).
 *
 * ## Memory
 *
 * Caller must free with rype_strand_minimizers_result_free().
 */
RypeStrandMinimizersResult* rype_extract_strand_minimizers(
    const uint8_t* seq,
    size_t seq_len,
    size_t k,
    size_t w,
    uint64_t salt
);

/**
 * Free a strand minimizers result
 *
 * Releases the result struct returned by rype_extract_strand_minimizers();
 * the per-strand hashes/positions arrays must not be freed separately.
 *
 * @param result  Pointer from rype_extract_strand_minimizers(), or NULL (no-op)
 *
 * Do NOT call twice on the same pointer.
 */
void rype_strand_minimizers_result_free(RypeStrandMinimizersResult* result);

// ============================================================================
// ARROW C DATA INTERFACE API (Optional Feature)
// ============================================================================
//
// These functions are only available when rype is built with --features arrow-ffi.
// They use the Arrow C Data Interface for zero-copy data exchange with
// Arrow-compatible systems (Python/PyArrow, R/arrow, DuckDB, Polars, etc.).
//
// Reference: https://arrow.apache.org/docs/format/CDataInterface.html
//
// ## Building with Arrow support
//
//     cargo build --release --features arrow-ffi
//
// ## Linking
//
// When linking against rype with Arrow support, you must define RYPE_ARROW
// before including this header to enable the Arrow API declarations:
//
//     #define RYPE_ARROW
//     #include "rype.h"
//

#ifdef RYPE_ARROW

// ----------------------------------------------------------------------------
// Arrow C Data Interface Structures
// ----------------------------------------------------------------------------
// These structures match the Arrow C Data Interface specification exactly.
// See: https://arrow.apache.org/docs/format/CDataInterface.html

// The data-interface structs (ArrowSchema, ArrowArray) and the
// stream-interface struct (ArrowArrayStream) are guarded separately, using the
// two macro names from the Arrow specification. Other headers that vendor the
// same definitions (Arrow's own abi.h, nanoarrow, ...) use these guards, so
// this header can be included before or after them without redefining or
// omitting any struct.

#ifndef ARROW_C_DATA_INTERFACE
#define ARROW_C_DATA_INTERFACE

// Bit values for ArrowSchema.flags.
#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4

/** Type description of an Arrow array (Arrow C Data Interface). */
struct ArrowSchema {
    const char* format;                    ///< Null-terminated type format string
    const char* name;                      ///< Optional field name (may be NULL)
    const char* metadata;                  ///< Optional binary key/value metadata (may be NULL)
    int64_t flags;                         ///< Bitwise OR of ARROW_FLAG_* values
    int64_t n_children;                    ///< Number of entries in children
    struct ArrowSchema** children;         ///< Child schemas (nested types)
    struct ArrowSchema* dictionary;        ///< Dictionary value type, or NULL
    void (*release)(struct ArrowSchema*);  ///< Release callback; NULL once released
    void* private_data;                    ///< Opaque producer-owned data
};

/** Data of an Arrow array (Arrow C Data Interface). */
struct ArrowArray {
    int64_t length;                        ///< Logical number of items
    int64_t null_count;                    ///< Number of nulls, or -1 if not computed
    int64_t offset;                        ///< Logical offset into the buffers
    int64_t n_buffers;                     ///< Number of entries in buffers
    int64_t n_children;                    ///< Number of entries in children
    const void** buffers;                  ///< Physical buffers backing the array
    struct ArrowArray** children;          ///< Child arrays (nested types)
    struct ArrowArray* dictionary;         ///< Dictionary values, or NULL
    void (*release)(struct ArrowArray*);   ///< Release callback; NULL once released
    void* private_data;                    ///< Opaque producer-owned data
};

#endif // ARROW_C_DATA_INTERFACE

#ifndef ARROW_C_STREAM_INTERFACE
#define ARROW_C_STREAM_INTERFACE

/** Pull-based stream of Arrow arrays sharing one schema (Arrow C Stream Interface). */
struct ArrowArrayStream {
    /// Writes the stream schema to out; returns 0 on success, an errno-style code otherwise.
    int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
    /// Writes the next batch to out; at end of stream returns 0 with out->release == NULL.
    int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
    /// Describes the last failed get_schema/get_next call, or returns NULL.
    const char* (*get_last_error)(struct ArrowArrayStream*);
    /// Release callback; must be invoked as stream->release(stream). NULL once released.
    void (*release)(struct ArrowArrayStream*);
    void* private_data;                    ///< Opaque producer-owned data
};

#endif // ARROW_C_STREAM_INTERFACE

// ----------------------------------------------------------------------------
// Arrow Classification Functions
// ----------------------------------------------------------------------------

/**
 * Classify sequences from an Arrow stream
 *
 * TRUE STREAMING: Processes one batch at a time. Memory usage is O(batch_size),
 * not O(total_data). Results are available as soon as each batch is processed.
 *
 * @param index          Non-NULL RypeIndex pointer from rype_index_load()
 * @param negative_set   Optional RypeNegativeSet for filtering (NULL to disable)
 * @param input_stream   Input ArrowArrayStream containing sequence batches
 * @param threshold      Classification threshold (0.0-1.0)
 * @param out_stream     Output ArrowArrayStream for results (caller-allocated)
 * @return               0 on success, -1 on error
 *
 * ## Input Schema
 *
 * | Column         | Type                    | Nullable | Description              |
 * |----------------|-------------------------|----------|--------------------------|
 * | id             | Int64                   | No       | Query identifier         |
 * | sequence       | Binary or LargeBinary   | No       | DNA sequence bytes       |
 * | pair_sequence  | Binary or LargeBinary   | Yes      | Paired-end sequence      |
 *
 * ## Output Schema
 *
 * | Column    | Type    | Description                         |
 * |-----------|---------|-------------------------------------|
 * | query_id  | Int64   | Matching query ID from input        |
 * | bucket_id | UInt32  | Matched bucket/reference ID         |
 * | score     | Float64 | Classification score (0.0-1.0)      |
 *
 * ## Memory Management
 *
 * - This function TAKES OWNERSHIP of input_stream; do not use or release
 *   input_stream after the call
 * - Caller owns out_stream and MUST call out_stream->release(out_stream)
 *   when done (the release callback takes the stream itself as argument)
 * - Do NOT release out_stream if this function returns -1
 *
 * ## Thread Safety
 *
 * Thread-safe for concurrent classification with the same RypeIndex.
 */
int rype_classify_arrow(
    const RypeIndex* index,
    const RypeNegativeSet* negative_set,
    struct ArrowArrayStream* input_stream,
    double threshold,
    struct ArrowArrayStream* out_stream
);

/**
 * Classify from Arrow stream, returning only the best hit per query
 *
 * Same as rype_classify_arrow() but filters results to keep only the
 * highest-scoring bucket for each query. Parameters, input/output schemas,
 * return convention and stream ownership rules are those documented for
 * rype_classify_arrow().
 *
 * @param index          Non-NULL RypeIndex pointer from rype_index_load()
 * @param negative_set   Optional RypeNegativeSet for filtering (NULL to disable)
 * @param input_stream   Input ArrowArrayStream containing sequence batches
 * @param threshold      Classification threshold (0.0-1.0)
 * @param out_stream     Output ArrowArrayStream for results (caller-allocated)
 * @return               0 on success, -1 on error
 */
int rype_classify_arrow_best_hit(
    const RypeIndex* index,
    const RypeNegativeSet* negative_set,
    struct ArrowArrayStream* input_stream,
    double threshold,
    struct ArrowArrayStream* out_stream
);

/**
 * Get the output schema for Arrow classification results
 *
 * Schema: query_id (Int64), bucket_id (UInt32), score (Float64), i.e. the
 * output columns documented for rype_classify_arrow().
 *
 * @param out_schema  Pointer to caller-allocated ArrowSchema to initialize
 * @return            0 on success, -1 on error
 *
 * Caller must call out_schema->release(out_schema) when done.
 */
int rype_arrow_result_schema(struct ArrowSchema* out_schema);

// ----------------------------------------------------------------------------
// Arrow Log-Ratio Classification Functions
// ----------------------------------------------------------------------------

/**
 * Get the output schema for Arrow log-ratio results
 *
 * Schema: query_id (Int64), log_ratio (Float64), fast_path (Int32). These
 * are the output columns of rype_classify_arrow_log_ratio().
 *
 * @param out_schema  Pointer to caller-allocated ArrowSchema to initialize
 * @return            0 on success, -1 on error
 *
 * Caller must call out_schema->release(out_schema) when done.
 */
int rype_arrow_log_ratio_result_schema(struct ArrowSchema* out_schema);

/**
 * Classify sequences from an Arrow stream using log-ratio
 *
 * TRUE STREAMING: Processes one batch at a time. Memory usage is O(batch_size).
 *
 * @param numerator                Non-NULL RypeIndex pointer (single-bucket)
 * @param denominator              Non-NULL RypeIndex pointer (single-bucket)
 * @param input_stream             Input ArrowArrayStream containing sequence batches
 * @param numerator_skip_threshold Fast-path threshold (see rype_classify_log_ratio)
 * @param out_stream               Output ArrowArrayStream for results (caller-allocated)
 * @return                         0 on success, -1 on error
 *
 * ## Output Schema
 *
 * | Column    | Type    | Description                                     |
 * |-----------|---------|-------------------------------------------------|
 * | query_id  | Int64   | Query ID from input                             |
 * | log_ratio | Float64 | log10(num/denom), can be +inf, -inf, NaN        |
 * | fast_path | Int32   | 0 = computed exactly, 1 = numerator fast-path   |
 *
 * ## Memory Management
 *
 * - This function TAKES OWNERSHIP of input_stream; do not use or release
 *   input_stream after the call
 * - Caller owns out_stream and MUST call out_stream->release(out_stream)
 *   when done (the release callback takes the stream itself as argument)
 * - Do NOT release out_stream if this function returns -1
 *
 * ## Thread Safety
 *
 * Thread-safe for concurrent classification with the same RypeIndex pointers.
 */
int rype_classify_arrow_log_ratio(
    const RypeIndex* numerator,
    const RypeIndex* denominator,
    struct ArrowArrayStream* input_stream,
    double numerator_skip_threshold,
    struct ArrowArrayStream* out_stream
);

// ----------------------------------------------------------------------------
// Arrow Minimizer Extraction Functions
// ----------------------------------------------------------------------------

/**
 * Get the output schema for Arrow minimizer set extraction
 *
 * Schema: id (Int64), fwd_set (List<UInt64>), rc_set (List<UInt64>). These
 * are the output columns of rype_extract_minimizer_set_arrow().
 *
 * @param out_schema  Pointer to caller-allocated ArrowSchema to initialize
 * @return            0 on success, -1 on error
 *
 * Caller must call out_schema->release(out_schema) when done.
 */
int rype_arrow_minimizer_set_schema(struct ArrowSchema* out_schema);

/**
 * Get the output schema for Arrow strand minimizers extraction
 *
 * Schema: id (Int64), fwd_hashes (List<UInt64>), fwd_positions (List<UInt64>),
 * rc_hashes (List<UInt64>), rc_positions (List<UInt64>). These are the
 * output columns of rype_extract_strand_minimizers_arrow().
 *
 * @param out_schema  Pointer to caller-allocated ArrowSchema to initialize
 * @return            0 on success, -1 on error
 *
 * Caller must call out_schema->release(out_schema) when done.
 */
int rype_arrow_strand_minimizers_schema(struct ArrowSchema* out_schema);

/**
 * Extract minimizer sets from an Arrow stream
 *
 * TRUE STREAMING: Processes one batch at a time.
 *
 * @param input_stream  Input ArrowArrayStream with id (Int64) and sequence (Binary) columns
 * @param k             K-mer size (must be 16, 32, or 64)
 * @param w             Window size (must be > 0)
 * @param salt          XOR salt for k-mer hashing
 * @param out_stream    Output ArrowArrayStream for results (caller-allocated)
 * @return              0 on success, -1 on error
 *
 * ## Output Schema
 *
 * | Column   | Type           | Description                          |
 * |----------|----------------|--------------------------------------|
 * | id       | Int64          | Query identifier from input          |
 * | fwd_set  | List<UInt64>   | Sorted, deduplicated forward hashes  |
 * | rc_set   | List<UInt64>   | Sorted, deduplicated RC hashes       |
 *
 * ## Memory Management
 *
 * - This function TAKES OWNERSHIP of input_stream; do not use or release
 *   input_stream after the call
 * - Caller owns out_stream and MUST call out_stream->release(out_stream)
 *   when done (the release callback takes the stream itself as argument)
 *
 * NOTE(review): rype_classify_arrow() forbids releasing out_stream when -1
 * is returned; presumably the same applies here - confirm.
 */
int rype_extract_minimizer_set_arrow(
    struct ArrowArrayStream* input_stream,
    size_t k,
    size_t w,
    uint64_t salt,
    struct ArrowArrayStream* out_stream
);

/**
 * Extract strand minimizers (hashes + positions) from an Arrow stream
 *
 * TRUE STREAMING: Processes one batch at a time.
 *
 * @param input_stream  Input ArrowArrayStream with id (Int64) and sequence (Binary) columns
 * @param k             K-mer size (must be 16, 32, or 64)
 * @param w             Window size (must be > 0)
 * @param salt          XOR salt for k-mer hashing
 * @param out_stream    Output ArrowArrayStream for results (caller-allocated)
 * @return              0 on success, -1 on error
 *
 * ## Output Schema
 *
 * | Column        | Type           | Description                     |
 * |---------------|----------------|---------------------------------|
 * | id            | Int64          | Query identifier from input     |
 * | fwd_hashes    | List<UInt64>   | Forward strand hash values      |
 * | fwd_positions | List<UInt64>   | Forward strand positions        |
 * | rc_hashes     | List<UInt64>   | RC strand hash values           |
 * | rc_positions  | List<UInt64>   | RC strand positions             |
 *
 * ## Memory Management
 *
 * - This function TAKES OWNERSHIP of input_stream; do not use or release
 *   input_stream after the call
 * - Caller owns out_stream and MUST call out_stream->release(out_stream)
 *   when done (the release callback takes the stream itself as argument)
 *
 * NOTE(review): rype_classify_arrow() forbids releasing out_stream when -1
 * is returned; presumably the same applies here - confirm.
 */
int rype_extract_strand_minimizers_arrow(
    struct ArrowArrayStream* input_stream,
    size_t k,
    size_t w,
    uint64_t salt,
    struct ArrowArrayStream* out_stream
);

#endif // RYPE_ARROW

#ifdef __cplusplus
}
#endif

#endif // RYPE_H