ragc_core/ffi/kmer_helpers.rs
1// FFI helpers for k-mer operations - micro-functions callable from C++
2// Tests that Rust k-mer canonicalization matches C++ CKmer exactly
3
4use crate::kmer::{Kmer, KmerMode};
5use crate::kmer_extract::{
6 find_new_splitters_kmers, remove_non_singletons, remove_non_singletons_with_duplicates,
7};
8use std::slice;
9
10/// Extract all canonical k-mer values from a contig
11///
12/// Matches C++ CKmer behavior:
13/// - Scans through contig building rolling k-mers
14/// - Resets on non-ACGT bases (> 3)
15/// - Returns canonical representation of each k-mer
16///
17/// # Safety
18/// - Caller must ensure contig_data points to valid memory of contig_len bytes
19/// - Returned array must be freed with ragc_free_kmer_array()
20#[repr(C)]
21pub struct KmerArray {
22 /// Array of k-mer values (canonical)
23 pub data: *mut u64,
24 /// Number of k-mers
25 pub len: usize,
26}
27
28#[no_mangle]
29pub extern "C" fn ragc_extract_canonical_kmers(
30 contig_data: *const u8,
31 contig_len: usize,
32 k: u32,
33) -> KmerArray {
34 unsafe {
35 let contig = slice::from_raw_parts(contig_data, contig_len);
36 let mut kmers = Vec::new();
37 let mut kmer = Kmer::new(k, KmerMode::Canonical);
38
39 for &base in contig {
40 if base > 3 {
41 // Non-ACGT base, reset k-mer
42 kmer.reset();
43 } else {
44 kmer.insert(base as u64);
45
46 if kmer.is_full() {
47 kmers.push(kmer.data());
48 }
49 }
50 }
51
52 let mut result = kmers;
53 let result_ptr = result.as_mut_ptr();
54 let result_len = result.len();
55
56 // Prevent Rust from freeing the allocation
57 std::mem::forget(result);
58
59 KmerArray {
60 data: result_ptr,
61 len: result_len,
62 }
63 }
64}
65
66/// Free a k-mer array allocated by ragc_extract_canonical_kmers()
67///
68/// # Safety
69/// - Must only be called once per KmerArray
70/// - array.data must be a valid pointer from ragc_extract_canonical_kmers()
71#[no_mangle]
72pub extern "C" fn ragc_free_kmer_array(array: KmerArray) {
73 unsafe {
74 if !array.data.is_null() && array.len > 0 {
75 // Reconstruct the Vec and let it drop
76 let _ = Vec::from_raw_parts(array.data, array.len, array.len);
77 }
78 }
79}
80
81/// Extract a single k-mer at a specific position
82///
83/// Returns the canonical k-mer value at position `pos` in the contig,
84/// or u64::MAX if the k-mer cannot be extracted (position out of bounds,
85/// non-ACGT base in k-mer window).
86///
87/// # Safety
88/// - Caller must ensure contig_data points to valid memory of contig_len bytes
89#[no_mangle]
90pub extern "C" fn ragc_extract_kmer_at_position(
91 contig_data: *const u8,
92 contig_len: usize,
93 k: u32,
94 pos: usize,
95) -> u64 {
96 unsafe {
97 let contig = slice::from_raw_parts(contig_data, contig_len);
98 let k = k as usize;
99
100 // Check bounds
101 if pos + k > contig_len {
102 return u64::MAX;
103 }
104
105 // Check for non-ACGT bases in the k-mer window
106 for i in 0..k {
107 if contig[pos + i] > 3 {
108 return u64::MAX;
109 }
110 }
111
112 // Build k-mer
113 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
114 for i in 0..k {
115 kmer.insert(contig[pos + i] as u64);
116 }
117
118 kmer.data()
119 }
120}
121
122/// Remove non-singleton k-mers from a sorted vector
123///
124/// Modifies the vector in place to keep only k-mers that appear exactly once.
125/// K-mers before `virtual_begin` are not checked and remain in the output.
126///
127/// Returns the new length of the vector.
128///
129/// # Safety
130/// - vec_ptr must point to a valid vector allocation of at least vec_capacity elements
131/// - The vector must be sorted (for correct singleton detection)
132/// - The caller must resize the vector to the returned length
133/// - Caller maintains ownership of the allocation
134#[no_mangle]
135pub extern "C" fn ragc_remove_non_singletons(
136 vec_ptr: *mut u64,
137 vec_len: usize,
138 vec_capacity: usize,
139 virtual_begin: usize,
140) -> usize {
141 unsafe {
142 // Reconstruct the Vec from raw parts (temporarily borrow ownership)
143 let mut vec = Vec::from_raw_parts(vec_ptr, vec_len, vec_capacity);
144
145 // Call the Rust implementation
146 remove_non_singletons(&mut vec, virtual_begin);
147
148 // Get the new length after modification
149 let new_len = vec.len();
150
151 // Prevent Rust from freeing the allocation (C++ owns it)
152 std::mem::forget(vec);
153
154 new_len
155 }
156}
157
/// Result of remove_non_singletons_with_duplicates containing both new lengths
///
/// Plain `repr(C)` pair of lengths returned by value across the FFI boundary.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct RemoveSingletonsResult {
    /// New length of the main vector (singletons only)
    pub vec_new_len: usize,
    /// New length of the duplicates vector
    pub dup_new_len: usize,
}
166
167/// Remove non-singleton k-mers from a sorted vector, collecting duplicates
168///
169/// Modifies the vector in place to keep only k-mers that appear exactly once,
170/// and collects k-mers appearing more than once into a separate vector.
171/// K-mers before `virtual_begin` are not checked and remain in the output.
172///
173/// Returns a struct containing the new lengths of both vectors.
174///
175/// # Safety
176/// - vec_ptr must point to a valid vector allocation of at least vec_capacity elements
177/// - dup_ptr must point to a valid vector allocation
178/// - The main vector must be sorted (for correct singleton detection)
179/// - The caller must resize both vectors to the returned lengths
180/// - Caller maintains ownership of both allocations
181/// - The duplicates vector will be cleared and filled with new values
182#[no_mangle]
183pub extern "C" fn ragc_remove_non_singletons_with_duplicates(
184 vec_ptr: *mut u64,
185 vec_len: usize,
186 vec_capacity: usize,
187 dup_ptr: *mut u64,
188 dup_len: usize,
189 dup_capacity: usize,
190 virtual_begin: usize,
191) -> RemoveSingletonsResult {
192 unsafe {
193 // Reconstruct both Vecs from raw parts (temporarily borrow ownership)
194 let mut vec = Vec::from_raw_parts(vec_ptr, vec_len, vec_capacity);
195 let mut duplicated = Vec::from_raw_parts(dup_ptr, dup_len, dup_capacity);
196
197 // Call the Rust implementation
198 remove_non_singletons_with_duplicates(&mut vec, &mut duplicated, virtual_begin);
199
200 // Get the new lengths after modification
201 let new_vec_len = vec.len();
202 let new_dup_len = duplicated.len();
203
204 // Prevent Rust from freeing the allocations (C++ owns them)
205 std::mem::forget(vec);
206 std::mem::forget(duplicated);
207
208 RemoveSingletonsResult {
209 vec_new_len: new_vec_len,
210 dup_new_len: new_dup_len,
211 }
212 }
213}
214
215/// Find new splitter k-mers from a contig by excluding reference k-mers
216///
217/// Implements the k-mer filtering workflow from C++ AGC's find_new_splitters():
218/// 1. Extract canonical k-mers from contig
219/// 2. Filter to singletons only
220/// 3. Exclude k-mers that appear in reference singletons
221/// 4. Exclude k-mers that appear in reference duplicates
222///
223/// # Parameters
224/// - contig_data: Pointer to contig sequence data (numeric encoding: A=0, C=1, G=2, T=3)
225/// - contig_len: Length of contig
226/// - k: K-mer length
227/// - candidate_kmers_ptr: Pointer to sorted reference singleton k-mers
228/// - candidate_kmers_len: Length of reference singleton array
229/// - candidate_kmers_offset: Offset to start reading from candidate k-mers
230/// - duplicated_kmers_ptr: Pointer to sorted reference duplicate k-mers
231/// - duplicated_kmers_len: Length of reference duplicate array
232///
233/// # Returns
234/// KmerArray containing novel k-mer values (must be freed with ragc_free_kmer_array)
235///
236/// # Safety
237/// - Caller must ensure all pointers point to valid memory
238/// - All k-mer arrays must be sorted
239/// - Returned array must be freed with ragc_free_kmer_array()
240#[no_mangle]
241pub extern "C" fn ragc_find_new_splitters_kmers(
242 contig_data: *const u8,
243 contig_len: usize,
244 k: u32,
245 candidate_kmers_ptr: *const u64,
246 candidate_kmers_len: usize,
247 candidate_kmers_offset: usize,
248 duplicated_kmers_ptr: *const u64,
249 duplicated_kmers_len: usize,
250) -> KmerArray {
251 unsafe {
252 let contig = slice::from_raw_parts(contig_data, contig_len);
253 let candidate_kmers = slice::from_raw_parts(candidate_kmers_ptr, candidate_kmers_len);
254 let duplicated_kmers = slice::from_raw_parts(duplicated_kmers_ptr, duplicated_kmers_len);
255
256 let mut result = find_new_splitters_kmers(
257 contig,
258 k,
259 candidate_kmers,
260 candidate_kmers_offset,
261 duplicated_kmers,
262 );
263
264 let len = result.len();
265 let ptr = result.as_mut_ptr();
266 std::mem::forget(result);
267
268 KmerArray { data: ptr, len }
269 }
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the FFI extraction on `contig`, reads the number of returned
    /// k-mers through a slice view, and releases the array again.
    fn canonical_kmer_count(contig: &[u8], k: u32) -> usize {
        let array = ragc_extract_canonical_kmers(contig.as_ptr(), contig.len(), k);
        let count = unsafe { slice::from_raw_parts(array.data, array.len) }.len();
        ragc_free_kmer_array(array);
        count
    }

    /// Thin wrapper so the position tests read as one call per case.
    fn kmer_at(contig: &[u8], k: u32, pos: usize) -> u64 {
        ragc_extract_kmer_at_position(contig.as_ptr(), contig.len(), k, pos)
    }

    #[test]
    fn test_extract_canonical_kmers() {
        // ACGTACGT: every window is valid, so length - k + 1 = 6 k-mers
        let contig: [u8; 8] = [0, 1, 2, 3, 0, 1, 2, 3];
        assert_eq!(canonical_kmer_count(&contig, 3), 6);
    }

    #[test]
    fn test_extract_kmers_with_reset() {
        // ACGNACGT: the non-ACGT base (N = 4) at index 3 resets the rolling k-mer
        let contig: [u8; 8] = [0, 1, 2, 4, 0, 1, 2, 3];
        // Expected windows: ACG (0-2), then after the reset ACG (4-6) and CGT (5-7)
        assert_eq!(canonical_kmer_count(&contig, 3), 3);
    }

    #[test]
    fn test_extract_kmer_at_position() {
        // ACGTACGT
        let contig: [u8; 8] = [0, 1, 2, 3, 0, 1, 2, 3];
        let k = 3;

        // Position 0 (ACG) and position 5 (CGT) are both fully inside the contig
        assert_ne!(kmer_at(&contig, k, 0), u64::MAX);
        assert_ne!(kmer_at(&contig, k, 5), u64::MAX);

        // Position 10 lies past the end of the contig
        assert_eq!(kmer_at(&contig, k, 10), u64::MAX);
    }

    #[test]
    fn test_extract_kmer_with_n() {
        // ANGT: the window starting at 0 covers the N at index 1
        let contig: [u8; 4] = [0, 4, 2, 3];
        assert_eq!(kmer_at(&contig, 3, 0), u64::MAX);
    }
}
335}