Skip to main content

yara_mapper/
mapper.rs

1use std::ffi::{CStr, CString};
2use std::path::Path;
3use std::ptr::NonNull;
4
5use crate::error::YaraError;
6use crate::ffi_helpers::{collect_contig_lengths, collect_contig_names, path_to_cstring};
7use crate::options::MapperOptions;
8use crate::record::{CigarOp, YaraRecord};
9
/// Borrowed view of a single mate (read 1 or read 2) of a paired-end read.
///
/// The struct only borrows its data; it performs no validation itself.
/// Callers are expected to supply ASCII DNA (ACGTN) in `seq` and phred+33
/// encoded base qualities in `qual`.
#[derive(Debug, Clone, Copy)]
pub struct ReadEnd<'a> {
    /// Nucleotide sequence as ASCII bytes.
    pub seq: &'a [u8],
    /// Per-base qualities as phred+33 ASCII bytes.
    pub qual: &'a [u8],
}
20
/// A batch of paired-end reads to map.
///
/// The batch is stored column-wise: one vector each for the read names, the
/// read-1 sequences/qualities and the read-2 sequences/qualities.  Entry `i`
/// of every vector belongs to read pair `i`.  All values are kept as
/// [`CString`]s so their pointers can be handed to the C API directly.
///
/// Sequences are expected to be ASCII DNA strings (ACGTN) and qualities
/// phred+33 ASCII strings.
#[derive(Default)]
pub struct ReadBatch {
    /// Read-pair names.
    names: Vec<CString>,
    /// Read-1 sequences.
    r1_seqs: Vec<CString>,
    /// Read-1 base qualities.
    r1_quals: Vec<CString>,
    /// Read-2 sequences.
    r2_seqs: Vec<CString>,
    /// Read-2 base qualities.
    r2_quals: Vec<CString>,
}
33
34impl ReadBatch {
35    /// Create a new empty batch.
36    #[must_use]
37    pub fn new() -> Self {
38        Self::default()
39    }
40
41    /// Create a batch with pre-allocated capacity for `n` pairs.
42    #[must_use]
43    pub fn with_capacity(n: usize) -> Self {
44        Self {
45            names: Vec::with_capacity(n),
46            r1_seqs: Vec::with_capacity(n),
47            r1_quals: Vec::with_capacity(n),
48            r2_seqs: Vec::with_capacity(n),
49            r2_quals: Vec::with_capacity(n),
50        }
51    }
52
53    /// Add a read pair to the batch.
54    ///
55    /// # Errors
56    ///
57    /// Returns [`YaraError::InvalidInput`] if any input contains a null byte
58    /// or if sequence and quality lengths do not match.
59    pub fn push(&mut self, name: &str, r1: ReadEnd<'_>, r2: ReadEnd<'_>) -> Result<(), YaraError> {
60        if r1.seq.len() != r1.qual.len() {
61            return Err(YaraError::InvalidInput(format!(
62                "r1 seq/qual length mismatch: {} vs {}",
63                r1.seq.len(),
64                r1.qual.len()
65            )));
66        }
67        if r2.seq.len() != r2.qual.len() {
68            return Err(YaraError::InvalidInput(format!(
69                "r2 seq/qual length mismatch: {} vs {}",
70                r2.seq.len(),
71                r2.qual.len()
72            )));
73        }
74        self.names
75            .push(CString::new(name).map_err(|e| YaraError::InvalidInput(format!("name: {e}")))?);
76        self.r1_seqs.push(
77            CString::new(r1.seq).map_err(|e| YaraError::InvalidInput(format!("r1_seq: {e}")))?,
78        );
79        self.r1_quals.push(
80            CString::new(r1.qual).map_err(|e| YaraError::InvalidInput(format!("r1_qual: {e}")))?,
81        );
82        self.r2_seqs.push(
83            CString::new(r2.seq).map_err(|e| YaraError::InvalidInput(format!("r2_seq: {e}")))?,
84        );
85        self.r2_quals.push(
86            CString::new(r2.qual).map_err(|e| YaraError::InvalidInput(format!("r2_qual: {e}")))?,
87        );
88        Ok(())
89    }
90
91    /// Number of read pairs in the batch.
92    #[must_use]
93    pub fn len(&self) -> usize {
94        self.names.len()
95    }
96
97    /// Whether the batch is empty.
98    #[must_use]
99    pub fn is_empty(&self) -> bool {
100        self.names.is_empty()
101    }
102}
103
104/// Handle to a loaded YARA mapper with a pre-built FM index.
105///
106/// The mapper is configured at construction time and can be used to map
107/// multiple batches of reads.  The FM index stays loaded in memory for
108/// the lifetime of this handle.
109///
110/// # Thread safety
111///
112/// The underlying C++ mapper uses OpenMP internally but is not safe for
113/// concurrent `map_paired` calls.  [`YaraMapper`] is [`Send`] but not
114/// [`Sync`].
115pub struct YaraMapper {
116    handle: NonNull<yara_mapper_sys::YaraMapperHandle>,
117}
118
119// SAFETY: The C++ handle owns all its memory and can be moved between threads.
120// It is NOT safe to call map_paired concurrently (OpenMP parallelism is
121// internal to each call), so we implement Send but not Sync.
122unsafe impl Send for YaraMapper {}
123
124impl YaraMapper {
125    /// Open a pre-built YARA index and create a mapper.
126    ///
127    /// `index_prefix` is the path prefix used when building the index (e.g.,
128    /// `ref/hla.fasta` if the index files are `ref/hla.fasta.yara.*`).
129    ///
130    /// # Errors
131    ///
132    /// Returns [`YaraError::IndexOpen`] if the index files cannot be found or
133    /// loaded, or if `index_prefix` contains non-UTF-8 characters or null bytes.
134    pub fn open<P: AsRef<Path>>(
135        index_prefix: P,
136        options: &MapperOptions,
137    ) -> Result<Self, YaraError> {
138        let prefix_cstr =
139            path_to_cstring(index_prefix.as_ref(), "index_prefix", YaraError::IndexOpen)?;
140
141        let ffi_opts = options.to_ffi();
142        let mut error_buf = vec![0u8; 1024];
143
144        let handle = unsafe {
145            yara_mapper_sys::yara_mapper_open(
146                prefix_cstr.as_ptr(),
147                &ffi_opts,
148                error_buf.as_mut_ptr().cast(),
149                error_buf.len(),
150            )
151        };
152
153        NonNull::new(handle).map(|h| Self { handle: h }).ok_or_else(|| {
154            let msg = unsafe { CStr::from_ptr(error_buf.as_ptr().cast()) };
155            YaraError::IndexOpen(msg.to_string_lossy().into_owned())
156        })
157    }
158
159    /// Map a batch of paired-end reads and return alignment records.
160    ///
161    /// The returned records include primary and (depending on options)
162    /// secondary alignments, plus unmapped entries for reads that could
163    /// not be aligned.
164    ///
165    /// # Errors
166    ///
167    /// Returns [`YaraError::Mapping`] if the underlying C++ mapper encounters
168    /// an error during alignment.
169    pub fn map_paired(&self, reads: &ReadBatch) -> Result<Vec<YaraRecord>, YaraError> {
170        if reads.is_empty() {
171            return Ok(Vec::new());
172        }
173
174        // Build pointer arrays for the C API.
175        let name_ptrs = cstring_ptrs(&reads.names);
176        let r1_seq_ptrs = cstring_ptrs(&reads.r1_seqs);
177        let r1_qual_ptrs = cstring_ptrs(&reads.r1_quals);
178        let r2_seq_ptrs = cstring_ptrs(&reads.r2_seqs);
179        let r2_qual_ptrs = cstring_ptrs(&reads.r2_quals);
180
181        let batch = yara_mapper_sys::YaraReadBatch {
182            names: name_ptrs.as_ptr(),
183            r1_seqs: r1_seq_ptrs.as_ptr(),
184            r1_quals: r1_qual_ptrs.as_ptr(),
185            r2_seqs: r2_seq_ptrs.as_ptr(),
186            r2_quals: r2_qual_ptrs.as_ptr(),
187            count: reads.len(),
188        };
189
190        let capacity = reads.len() * RECORDS_PER_PAIR;
191        // SAFETY: YaraAlignmentRecord is #[repr(C)] POD — all-zeros is a valid
192        // representation (null pointers, zero scalars).
193        let mut out_records: Vec<yara_mapper_sys::YaraAlignmentRecord> =
194            vec![unsafe { std::mem::zeroed() }; capacity];
195
196        let mut error_buf = [0u8; 1024];
197
198        let count = unsafe {
199            yara_mapper_sys::yara_mapper_map_paired(
200                self.handle.as_ptr(),
201                &batch,
202                out_records.as_mut_ptr(),
203                capacity,
204                error_buf.as_mut_ptr().cast(),
205                error_buf.len(),
206            )
207        };
208
209        if count < 0 {
210            // C++ already freed any partially written records before returning -1.
211            let msg = unsafe { CStr::from_ptr(error_buf.as_ptr().cast()) };
212            return Err(YaraError::Mapping(msg.to_string_lossy().into_owned()));
213        }
214
215        #[expect(
216            clippy::cast_possible_truncation,
217            clippy::cast_sign_loss,
218            reason = "count is non-negative and bounded by capacity (which is usize)"
219        )]
220        let n = count as usize;
221
222        // Convert each C record to an owned Rust type, freeing the C++ memory
223        // for that record immediately.  This avoids holding two copies of all
224        // records simultaneously and is safe against panics (only the current
225        // record's C++ memory can leak if convert_record panics).
226        let results: Vec<YaraRecord> = out_records[..n]
227            .iter()
228            .map(|rec| {
229                let converted = convert_record(rec);
230                unsafe {
231                    yara_mapper_sys::yara_mapper_free_record(std::ptr::from_ref(rec).cast_mut());
232                }
233                converted
234            })
235            .collect();
236
237        Ok(results)
238    }
239
240    /// Number of reference contigs in the loaded index.
241    #[must_use]
242    pub fn contig_count(&self) -> usize {
243        unsafe { yara_mapper_sys::yara_mapper_contig_count(self.handle.as_ptr()) }
244    }
245
246    /// Reference contig names (for SAM/BAM header construction).
247    #[must_use]
248    pub fn contig_names(&self) -> Vec<String> {
249        let n = self.contig_count();
250        unsafe {
251            collect_contig_names(n, |i| {
252                yara_mapper_sys::yara_mapper_contig_name(self.handle.as_ptr(), i)
253            })
254        }
255    }
256
257    /// Reference contig lengths.
258    #[must_use]
259    pub fn contig_lengths(&self) -> Vec<usize> {
260        let n = self.contig_count();
261        collect_contig_lengths(n, |i| unsafe {
262            yara_mapper_sys::yara_mapper_contig_length(self.handle.as_ptr(), i)
263        })
264    }
265}
266
267impl Drop for YaraMapper {
268    fn drop(&mut self) {
269        unsafe { yara_mapper_sys::yara_mapper_close(self.handle.as_ptr()) }
270    }
271}
272
/// Heuristic ceiling on the number of output records produced per read pair.
///
/// It sizes the output buffer handed to the C API.  In
/// `SecondaryMode::Record` mode every secondary alignment becomes its own
/// record; a factor of 10 leaves generous headroom for that.
const RECORDS_PER_PAIR: usize = 10;
276
/// Collect raw pointers from a slice of `CString` values for the C API.
///
/// The returned pointers borrow from `strings`; they are only valid while
/// the slice's `CString`s are alive and unmodified.
///
/// The element type is `c_char` rather than a hard-coded `i8`, because
/// `CString::as_ptr` yields `*const c_char` and `c_char` is `u8` on some
/// targets (e.g. aarch64 Linux).
fn cstring_ptrs(strings: &[CString]) -> Vec<*const std::ffi::c_char> {
    strings.iter().map(|s| s.as_ptr()).collect()
}
281
282/// Convert a single FFI record to an owned Rust record.
283fn convert_record(rec: &yara_mapper_sys::YaraAlignmentRecord) -> YaraRecord {
284    // CIGAR
285    let cigar = if !rec.cigar.is_null() && rec.cigar_len > 0 {
286        let slice = unsafe { std::slice::from_raw_parts(rec.cigar, rec.cigar_len as usize) };
287        slice.iter().map(|&encoded| CigarOp::from_bam(encoded)).collect()
288    } else {
289        Vec::new()
290    };
291
292    // Sequence
293    let seq = if !rec.seq.is_null() && rec.seq_len > 0 {
294        let slice =
295            unsafe { std::slice::from_raw_parts(rec.seq.cast::<u8>(), rec.seq_len as usize) };
296        Some(slice.to_vec())
297    } else {
298        None
299    };
300
301    // Quality
302    let qual = if !rec.qual.is_null() && rec.seq_len > 0 {
303        let slice =
304            unsafe { std::slice::from_raw_parts(rec.qual.cast::<u8>(), rec.seq_len as usize) };
305        Some(slice.to_vec())
306    } else {
307        None
308    };
309
310    // XA tag
311    let xa = if rec.xa.is_null() {
312        None
313    } else {
314        Some(unsafe { CStr::from_ptr(rec.xa) }.to_string_lossy().into_owned())
315    };
316
317    YaraRecord {
318        read_pair_index: rec.read_pair_index,
319        is_read1: rec.is_read1 != 0,
320        contig_id: rec.contig_id,
321        pos: rec.pos,
322        is_reverse: rec.is_reverse != 0,
323        is_secondary: rec.is_secondary != 0,
324        is_unmapped: rec.is_unmapped != 0,
325        mapq: rec.mapq,
326        nm: rec.nm,
327        x0: rec.x0,
328        x1: rec.x1,
329        mate_contig_id: rec.mate_contig_id,
330        mate_pos: rec.mate_pos,
331        tlen: rec.tlen,
332        flag: rec.flag,
333        cigar,
334        seq,
335        qual,
336        xa,
337    }
338}