ragc_core/ffi/
splitter_check.rs

1// FFI helper for splitter checking - micro-function callable from C++
2// Tests that our splitter data structures work correctly
3
4use ahash::AHashSet;
5
/// Check if a k-mer is a splitter
///
/// Matches C++ AGC's splitter checking logic (agc_compressor.cpp:2034):
/// ```cpp
/// if (bloom_splitters.check(d) && hs_splitters.check(d))
/// ```
///
/// This is the critical decision point in compress_contig() that determines
/// where to split segments.
///
/// The lookup is a binary search, so the array behind `splitters_ptr` must be
/// sorted in ascending order (for example the array produced by
/// `ragc_create_splitter_checker()`). On unsorted input the result is
/// unspecified.
///
/// A null `splitters_ptr` or a `splitters_len` of zero is treated as an empty
/// splitter set, for which the answer is always `false`.
///
/// # Safety
/// - splitters_ptr must be null or point to a valid array of splitters_len elements
/// - This function does not take ownership of the splitters array
#[no_mangle]
pub extern "C" fn ragc_is_splitter(
    kmer_value: u64,
    splitters_ptr: *const u64,
    splitters_len: usize,
) -> bool {
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and C/C++ callers commonly pass a null pointer for an
    // empty container, so the empty case is answered before building a slice.
    if splitters_ptr.is_null() || splitters_len == 0 {
        return false;
    }

    // SAFETY: the pointer is non-null (checked above) and the caller
    // guarantees it references `splitters_len` readable u64 values.
    let splitters = unsafe { std::slice::from_raw_parts(splitters_ptr, splitters_len) };

    // Binary search: relies on the documented ascending-order precondition.
    splitters.binary_search(&kmer_value).is_ok()
}
33
/// Sorted splitter array handed across the FFI boundary.
///
/// Produced by `ragc_create_splitter_checker()`. The buffer is allocated by
/// Rust and must be released with `ragc_free_splitter_checker()`.
#[repr(C)]
pub struct SplitterChecker {
    /// Pointer to the first of `len` splitters, sorted in ascending order.
    /// When `len` is zero this is a non-null placeholder that must not be
    /// dereferenced.
    pub splitters: *mut u64,
    /// Number of u64 values behind `splitters`.
    pub len: usize,
}

/// Create a splitter checker with proper data structures
///
/// C++ AGC uses both a bloom filter and a hash set. For now this takes a
/// simpler approach: the input is copied and sorted so that the result can be
/// binary searched. The input array itself is left untouched.
///
/// A null `splitters_ptr` or a `splitters_len` of zero produces an empty
/// checker (`len == 0`).
///
/// Returns a pointer to a sorted array of splitters that must be freed
/// with ragc_free_splitter_checker().
///
/// # Safety
/// - splitters_ptr must be null or point to a valid array of splitters_len elements
/// - This function does not take ownership of the input array
#[no_mangle]
pub extern "C" fn ragc_create_splitter_checker(
    splitters_ptr: *const u64,
    splitters_len: usize,
) -> SplitterChecker {
    // `slice::from_raw_parts` must not be called with a null pointer, even for
    // length 0, so the empty case never builds a slice from the raw input.
    let mut sorted: Vec<u64> = if splitters_ptr.is_null() || splitters_len == 0 {
        Vec::new()
    } else {
        // SAFETY: the pointer is non-null (checked above) and the caller
        // guarantees it references `splitters_len` readable u64 values.
        unsafe { std::slice::from_raw_parts(splitters_ptr, splitters_len).to_vec() }
    };

    // Sort for binary search
    sorted.sort_unstable();

    // Converting to a boxed slice drops any spare capacity, so the allocation
    // holds exactly `len` elements. The free routine rebuilds the buffer with
    // capacity == len, which is only sound under that guarantee.
    let boxed: Box<[u64]> = sorted.into_boxed_slice();
    let len = boxed.len();
    let splitters = Box::into_raw(boxed) as *mut u64;

    SplitterChecker { splitters, len }
}
71
72#[no_mangle]
73pub extern "C" fn ragc_free_splitter_checker(checker: SplitterChecker) {
74    unsafe {
75        if !checker.splitters.is_null() && checker.len > 0 {
76            let _ = Vec::from_raw_parts(checker.splitters, checker.len, checker.len);
77        }
78    }
79}
80
81/// Batch check if multiple k-mers are splitters
82///
83/// More efficient than calling ragc_is_splitter() repeatedly.
84/// Returns array of bools indicating which k-mers are splitters.
85///
86/// # Safety
87/// - kmers_ptr must point to valid array of kmers_len elements
88/// - splitters_ptr must point to valid array of splitters_len elements
89/// - Returned array must be freed with ragc_free_bool_array()
90#[repr(C)]
91pub struct BoolArray {
92    pub data: *mut bool,
93    pub len: usize,
94}
95
96#[no_mangle]
97pub extern "C" fn ragc_check_splitters_batch(
98    kmers_ptr: *const u64,
99    kmers_len: usize,
100    splitters_ptr: *const u64,
101    splitters_len: usize,
102) -> BoolArray {
103    unsafe {
104        let kmers = std::slice::from_raw_parts(kmers_ptr, kmers_len);
105        let splitters_slice = std::slice::from_raw_parts(splitters_ptr, splitters_len);
106
107        // Build AHashSet for O(1) lookups (faster than std HashSet for u64)
108        let splitter_set: AHashSet<u64> = splitters_slice.iter().copied().collect();
109
110        // Check each k-mer
111        let mut results: Vec<bool> = kmers
112            .iter()
113            .map(|kmer| splitter_set.contains(kmer))
114            .collect();
115
116        let ptr = results.as_mut_ptr();
117        let len = results.len();
118
119        std::mem::forget(results);
120
121        BoolArray { data: ptr, len }
122    }
123}
124
125#[no_mangle]
126pub extern "C" fn ragc_free_bool_array(array: BoolArray) {
127    unsafe {
128        if !array.data.is_null() && array.len > 0 {
129            let _ = Vec::from_raw_parts(array.data, array.len, array.len);
130        }
131    }
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Single-k-mer lookups against an already sorted splitter array.
    #[test]
    fn test_is_splitter() {
        let sorted_splitters: Vec<u64> = vec![100, 200, 300, 400, 500];
        let ptr = sorted_splitters.as_ptr();
        let len = sorted_splitters.len();

        // (k-mer, expected): an interior hit, a miss, then the first and last
        // elements of the array.
        let cases = [(300u64, true), (350, false), (100, true), (500, true)];
        for (kmer, expected) in cases {
            assert_eq!(ragc_is_splitter(kmer, ptr, len), expected);
        }
    }

    /// The checker must hold a sorted copy of an unsorted input.
    #[test]
    fn test_splitter_checker() {
        let unsorted: Vec<u64> = vec![300, 100, 500, 200, 400];
        let checker = ragc_create_splitter_checker(unsorted.as_ptr(), unsorted.len());

        // SAFETY: the checker was just created and has not been freed yet.
        let view = unsafe { std::slice::from_raw_parts(checker.splitters, checker.len) };

        // Should be sorted
        assert_eq!(view, &[100, 200, 300, 400, 500]);

        ragc_free_splitter_checker(checker);
    }

    /// The batch result has one flag per k-mer, in input order.
    #[test]
    fn test_batch_check() {
        let splitters: Vec<u64> = vec![100, 200, 300];
        let kmers: Vec<u64> = vec![50, 100, 150, 200, 250, 300, 350];

        let flags = ragc_check_splitters_batch(
            kmers.as_ptr(),
            kmers.len(),
            splitters.as_ptr(),
            splitters.len(),
        );

        // SAFETY: the array was just created and has not been freed yet.
        let view = unsafe { std::slice::from_raw_parts(flags.data, flags.len) };
        assert_eq!(view, &[false, true, false, true, false, true, false]);

        ragc_free_bool_array(flags);
    }
}