ragc_core/ffi/
base_validation.rs

1// FFI helper for DNA base validation
2// Simple but fundamental check used throughout segmentation
3
/// Reports whether an encoded base is one of the four ACGT nucleotides.
///
/// Mirrors the base check in C++ AGC (agc_compressor.cpp:2025), which resets
/// the k-mer as soon as any bit above the low two is set:
/// ```cpp
/// if (x >> 2)         // x > 3
///     kmer.Reset();
/// ```
///
/// Base encoding:
/// - A = 0
/// - C = 1
/// - G = 2
/// - T = 3
/// - N or anything else = 4+ (invalid, triggers a k-mer reset)
///
/// # Arguments
/// * `base` - Encoded base value
///
/// # Returns
/// `true` when `base` is in `0..=3`, `false` for every other value.
#[no_mangle]
pub extern "C" fn ragc_is_valid_base(base: u8) -> bool {
    // Same outcome as the C++ `!(x >> 2)`: only the codes 0 through 3 pass.
    matches!(base, 0..=3)
}
31
/// Reports whether an encoded base forces a k-mer reset.
///
/// This is the logical inverse of `ragc_is_valid_base()`: it returns `true`
/// for every non-ACGT code (4 and above) and `false` for the codes 0-3.
///
/// Matches C++ AGC: `if (x >> 2) kmer.Reset();`
#[no_mangle]
pub extern "C" fn ragc_should_reset_kmer(base: u8) -> bool {
    // Non-zero after dropping the low two bits means the code is above 3.
    (base >> 2) != 0
}
42
43/// Validate an entire sequence, counting valid and invalid bases
44///
45/// Returns (n_valid, n_invalid) counts
46#[repr(C)]
47pub struct BaseCounts {
48    pub n_valid: usize,
49    pub n_invalid: usize,
50}
51
52#[no_mangle]
53pub extern "C" fn ragc_count_base_validity(sequence: *const u8, length: usize) -> BaseCounts {
54    unsafe {
55        let seq = std::slice::from_raw_parts(sequence, length);
56
57        let mut n_valid = 0;
58        let mut n_invalid = 0;
59
60        for &base in seq {
61            if ragc_is_valid_base(base) {
62                n_valid += 1;
63            } else {
64                n_invalid += 1;
65            }
66        }
67
68        BaseCounts { n_valid, n_invalid }
69    }
70}
71
72/// Find positions of invalid bases (N or other) in a sequence
73///
74/// Returns array of positions where invalid bases occur.
75/// Useful for debugging or understanding where k-mer resets happen.
76///
77/// # Safety
78/// - sequence must point to valid memory of length bytes
79/// - Returned array must be freed with ragc_free_position_array()
80#[repr(C)]
81pub struct PositionArray {
82    pub data: *mut usize,
83    pub len: usize,
84}
85
86#[no_mangle]
87pub extern "C" fn ragc_find_invalid_base_positions(
88    sequence: *const u8,
89    length: usize,
90) -> PositionArray {
91    unsafe {
92        let seq = std::slice::from_raw_parts(sequence, length);
93
94        let mut positions: Vec<usize> = seq
95            .iter()
96            .enumerate()
97            .filter(|(_, &base)| !ragc_is_valid_base(base))
98            .map(|(pos, _)| pos)
99            .collect();
100
101        let ptr = positions.as_mut_ptr();
102        let len = positions.len();
103
104        std::mem::forget(positions);
105
106        PositionArray { data: ptr, len }
107    }
108}
109
110#[no_mangle]
111pub extern "C" fn ragc_free_position_array(array: PositionArray) {
112    unsafe {
113        if !array.data.is_null() && array.len > 0 {
114            let _ = Vec::from_raw_parts(array.data, array.len, array.len);
115        }
116    }
117}
118
#[cfg(test)]
mod tests {
    use super::*;

    /// Codes that represent A, C, G and T.
    const ACGT_CODES: &[u8] = &[0, 1, 2, 3];
    /// A sample of codes outside the ACGT range (4 is N).
    const OTHER_CODES: &[u8] = &[4, 5, 255];

    /// The four ACGT codes are accepted and everything sampled above them is rejected.
    #[test]
    fn test_valid_bases() {
        for &code in ACGT_CODES {
            assert!(ragc_is_valid_base(code));
        }
        for &code in OTHER_CODES {
            assert!(!ragc_is_valid_base(code));
        }
    }

    /// A k-mer reset is requested only for non-ACGT codes.
    #[test]
    fn test_should_reset() {
        for &code in ACGT_CODES {
            assert!(!ragc_should_reset_kmer(code));
        }
        for &code in OTHER_CODES {
            assert!(ragc_should_reset_kmer(code));
        }
    }

    /// ACGTNACX contains six valid bases (A,C,G,T,A,C) and two invalid ones (N,X).
    #[test]
    fn test_count_validity() {
        let sequence: Vec<u8> = vec![0, 1, 2, 3, 4, 0, 1, 5];
        let BaseCounts { n_valid, n_invalid } =
            ragc_count_base_validity(sequence.as_ptr(), sequence.len());

        assert_eq!(n_valid, 6);
        assert_eq!(n_invalid, 2);
    }

    /// ACNGTXA has its invalid bases (N and X) at positions 2 and 5.
    #[test]
    fn test_find_invalid_positions() {
        let sequence: Vec<u8> = vec![0, 1, 4, 2, 3, 5, 0];
        let found = ragc_find_invalid_base_positions(sequence.as_ptr(), sequence.len());

        // Copy the positions out so the array can be released before asserting.
        let observed: Vec<usize> =
            unsafe { std::slice::from_raw_parts(found.data, found.len) }.to_vec();
        ragc_free_position_array(found);

        assert_eq!(observed, vec![2usize, 5]);
    }

    /// ACGTACGT contains no invalid bases, so the result is empty.
    #[test]
    fn test_all_valid() {
        let sequence: Vec<u8> = vec![0, 1, 2, 3, 0, 1, 2, 3];
        let found = ragc_find_invalid_base_positions(sequence.as_ptr(), sequence.len());
        let count = found.len;
        ragc_free_position_array(found);

        assert_eq!(count, 0);
    }
}