ragc_core/ffi/
kmer_helpers.rs

// FFI helpers for k-mer operations: small functions callable from C++.
// Used to test that Rust k-mer canonicalization matches C++ CKmer exactly.
3
4use crate::kmer::{Kmer, KmerMode};
5use crate::kmer_extract::{
6    find_new_splitters_kmers, remove_non_singletons, remove_non_singletons_with_duplicates,
7};
8use std::slice;
9
10/// Extract all canonical k-mer values from a contig
11///
12/// Matches C++ CKmer behavior:
13/// - Scans through contig building rolling k-mers
14/// - Resets on non-ACGT bases (> 3)
15/// - Returns canonical representation of each k-mer
16///
17/// # Safety
18/// - Caller must ensure contig_data points to valid memory of contig_len bytes
19/// - Returned array must be freed with ragc_free_kmer_array()
20#[repr(C)]
21pub struct KmerArray {
22    /// Array of k-mer values (canonical)
23    pub data: *mut u64,
24    /// Number of k-mers
25    pub len: usize,
26}
27
28#[no_mangle]
29pub extern "C" fn ragc_extract_canonical_kmers(
30    contig_data: *const u8,
31    contig_len: usize,
32    k: u32,
33) -> KmerArray {
34    unsafe {
35        let contig = slice::from_raw_parts(contig_data, contig_len);
36        let mut kmers = Vec::new();
37        let mut kmer = Kmer::new(k, KmerMode::Canonical);
38
39        for &base in contig {
40            if base > 3 {
41                // Non-ACGT base, reset k-mer
42                kmer.reset();
43            } else {
44                kmer.insert(base as u64);
45
46                if kmer.is_full() {
47                    kmers.push(kmer.data());
48                }
49            }
50        }
51
52        let mut result = kmers;
53        let result_ptr = result.as_mut_ptr();
54        let result_len = result.len();
55
56        // Prevent Rust from freeing the allocation
57        std::mem::forget(result);
58
59        KmerArray {
60            data: result_ptr,
61            len: result_len,
62        }
63    }
64}
65
66/// Free a k-mer array allocated by ragc_extract_canonical_kmers()
67///
68/// # Safety
69/// - Must only be called once per KmerArray
70/// - array.data must be a valid pointer from ragc_extract_canonical_kmers()
71#[no_mangle]
72pub extern "C" fn ragc_free_kmer_array(array: KmerArray) {
73    unsafe {
74        if !array.data.is_null() && array.len > 0 {
75            // Reconstruct the Vec and let it drop
76            let _ = Vec::from_raw_parts(array.data, array.len, array.len);
77        }
78    }
79}
80
81/// Extract a single k-mer at a specific position
82///
83/// Returns the canonical k-mer value at position `pos` in the contig,
84/// or u64::MAX if the k-mer cannot be extracted (position out of bounds,
85/// non-ACGT base in k-mer window).
86///
87/// # Safety
88/// - Caller must ensure contig_data points to valid memory of contig_len bytes
89#[no_mangle]
90pub extern "C" fn ragc_extract_kmer_at_position(
91    contig_data: *const u8,
92    contig_len: usize,
93    k: u32,
94    pos: usize,
95) -> u64 {
96    unsafe {
97        let contig = slice::from_raw_parts(contig_data, contig_len);
98        let k = k as usize;
99
100        // Check bounds
101        if pos + k > contig_len {
102            return u64::MAX;
103        }
104
105        // Check for non-ACGT bases in the k-mer window
106        for i in 0..k {
107            if contig[pos + i] > 3 {
108                return u64::MAX;
109            }
110        }
111
112        // Build k-mer
113        let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
114        for i in 0..k {
115            kmer.insert(contig[pos + i] as u64);
116        }
117
118        kmer.data()
119    }
120}
121
122/// Remove non-singleton k-mers from a sorted vector
123///
124/// Modifies the vector in place to keep only k-mers that appear exactly once.
125/// K-mers before `virtual_begin` are not checked and remain in the output.
126///
127/// Returns the new length of the vector.
128///
129/// # Safety
130/// - vec_ptr must point to a valid vector allocation of at least vec_capacity elements
131/// - The vector must be sorted (for correct singleton detection)
132/// - The caller must resize the vector to the returned length
133/// - Caller maintains ownership of the allocation
134#[no_mangle]
135pub extern "C" fn ragc_remove_non_singletons(
136    vec_ptr: *mut u64,
137    vec_len: usize,
138    vec_capacity: usize,
139    virtual_begin: usize,
140) -> usize {
141    unsafe {
142        // Reconstruct the Vec from raw parts (temporarily borrow ownership)
143        let mut vec = Vec::from_raw_parts(vec_ptr, vec_len, vec_capacity);
144
145        // Call the Rust implementation
146        remove_non_singletons(&mut vec, virtual_begin);
147
148        // Get the new length after modification
149        let new_len = vec.len();
150
151        // Prevent Rust from freeing the allocation (C++ owns it)
152        std::mem::forget(vec);
153
154        new_len
155    }
156}
157
158/// Result of remove_non_singletons_with_duplicates containing both new lengths
159#[repr(C)]
160pub struct RemoveSingletonsResult {
161    /// New length of the main vector (singletons only)
162    pub vec_new_len: usize,
163    /// New length of the duplicates vector
164    pub dup_new_len: usize,
165}
166
167/// Remove non-singleton k-mers from a sorted vector, collecting duplicates
168///
169/// Modifies the vector in place to keep only k-mers that appear exactly once,
170/// and collects k-mers appearing more than once into a separate vector.
171/// K-mers before `virtual_begin` are not checked and remain in the output.
172///
173/// Returns a struct containing the new lengths of both vectors.
174///
175/// # Safety
176/// - vec_ptr must point to a valid vector allocation of at least vec_capacity elements
177/// - dup_ptr must point to a valid vector allocation
178/// - The main vector must be sorted (for correct singleton detection)
179/// - The caller must resize both vectors to the returned lengths
180/// - Caller maintains ownership of both allocations
181/// - The duplicates vector will be cleared and filled with new values
182#[no_mangle]
183pub extern "C" fn ragc_remove_non_singletons_with_duplicates(
184    vec_ptr: *mut u64,
185    vec_len: usize,
186    vec_capacity: usize,
187    dup_ptr: *mut u64,
188    dup_len: usize,
189    dup_capacity: usize,
190    virtual_begin: usize,
191) -> RemoveSingletonsResult {
192    unsafe {
193        // Reconstruct both Vecs from raw parts (temporarily borrow ownership)
194        let mut vec = Vec::from_raw_parts(vec_ptr, vec_len, vec_capacity);
195        let mut duplicated = Vec::from_raw_parts(dup_ptr, dup_len, dup_capacity);
196
197        // Call the Rust implementation
198        remove_non_singletons_with_duplicates(&mut vec, &mut duplicated, virtual_begin);
199
200        // Get the new lengths after modification
201        let new_vec_len = vec.len();
202        let new_dup_len = duplicated.len();
203
204        // Prevent Rust from freeing the allocations (C++ owns them)
205        std::mem::forget(vec);
206        std::mem::forget(duplicated);
207
208        RemoveSingletonsResult {
209            vec_new_len: new_vec_len,
210            dup_new_len: new_dup_len,
211        }
212    }
213}
214
215/// Find new splitter k-mers from a contig by excluding reference k-mers
216///
217/// Implements the k-mer filtering workflow from C++ AGC's find_new_splitters():
218/// 1. Extract canonical k-mers from contig
219/// 2. Filter to singletons only
220/// 3. Exclude k-mers that appear in reference singletons
221/// 4. Exclude k-mers that appear in reference duplicates
222///
223/// # Parameters
224/// - contig_data: Pointer to contig sequence data (numeric encoding: A=0, C=1, G=2, T=3)
225/// - contig_len: Length of contig
226/// - k: K-mer length
227/// - candidate_kmers_ptr: Pointer to sorted reference singleton k-mers
228/// - candidate_kmers_len: Length of reference singleton array
229/// - candidate_kmers_offset: Offset to start reading from candidate k-mers
230/// - duplicated_kmers_ptr: Pointer to sorted reference duplicate k-mers
231/// - duplicated_kmers_len: Length of reference duplicate array
232///
233/// # Returns
234/// KmerArray containing novel k-mer values (must be freed with ragc_free_kmer_array)
235///
236/// # Safety
237/// - Caller must ensure all pointers point to valid memory
238/// - All k-mer arrays must be sorted
239/// - Returned array must be freed with ragc_free_kmer_array()
240#[no_mangle]
241pub extern "C" fn ragc_find_new_splitters_kmers(
242    contig_data: *const u8,
243    contig_len: usize,
244    k: u32,
245    candidate_kmers_ptr: *const u64,
246    candidate_kmers_len: usize,
247    candidate_kmers_offset: usize,
248    duplicated_kmers_ptr: *const u64,
249    duplicated_kmers_len: usize,
250) -> KmerArray {
251    unsafe {
252        let contig = slice::from_raw_parts(contig_data, contig_len);
253        let candidate_kmers = slice::from_raw_parts(candidate_kmers_ptr, candidate_kmers_len);
254        let duplicated_kmers = slice::from_raw_parts(duplicated_kmers_ptr, duplicated_kmers_len);
255
256        let mut result = find_new_splitters_kmers(
257            contig,
258            k,
259            candidate_kmers,
260            candidate_kmers_offset,
261            duplicated_kmers,
262        );
263
264        let len = result.len();
265        let ptr = result.as_mut_ptr();
266        std::mem::forget(result);
267
268        KmerArray { data: ptr, len }
269    }
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// K-mer length shared by all tests below.
    const K: u32 = 3;

    #[test]
    fn test_extract_canonical_kmers() {
        // ACGTACGT in numeric encoding (A=0, C=1, G=2, T=3)
        let bases: [u8; 8] = [0, 1, 2, 3, 0, 1, 2, 3];
        let extracted = ragc_extract_canonical_kmers(bases.as_ptr(), bases.len(), K);

        // One k-mer per window: length - k + 1 = 6
        assert_eq!(extracted.len, 6);

        ragc_free_kmer_array(extracted);
    }

    #[test]
    fn test_extract_kmers_with_reset() {
        // ACGNACGT: a non-ACGT base (N = 4) sits at index 3
        let bases: [u8; 8] = [0, 1, 2, 4, 0, 1, 2, 3];
        let extracted = ragc_extract_canonical_kmers(bases.as_ptr(), bases.len(), K);

        // Expect 3 k-mers: ACG (0-2), reset at N, then ACG (4-6) and CGT (5-7)
        assert_eq!(extracted.len, 3);

        ragc_free_kmer_array(extracted);
    }

    #[test]
    fn test_extract_kmer_at_position() {
        // ACGTACGT
        let bases: [u8; 8] = [0, 1, 2, 3, 0, 1, 2, 3];
        let at = |pos: usize| ragc_extract_kmer_at_position(bases.as_ptr(), bases.len(), K, pos);

        // Position 0 (ACG) is a valid window
        assert_ne!(at(0), u64::MAX);

        // Position 5 (CGT) is the last valid window
        assert_ne!(at(5), u64::MAX);

        // Position 10 lies beyond the end of the contig
        assert_eq!(at(10), u64::MAX);
    }

    #[test]
    fn test_extract_kmer_with_n() {
        // ANGT: the window starting at 0 contains N
        let bases: [u8; 4] = [0, 4, 2, 3];

        let value = ragc_extract_kmer_at_position(bases.as_ptr(), bases.len(), K, 0);
        assert_eq!(value, u64::MAX);
    }
}