Skip to main content

yara_mapper/
mapper.rs

1use std::ffi::{CStr, CString};
2use std::path::Path;
3use std::ptr::NonNull;
4
5use crate::error::YaraError;
6use crate::ffi_helpers::{
7    bytes_to_cstring, collect_contig_lengths, collect_contig_names, path_to_cstring,
8};
9use crate::options::{MapperOptions, SecondaryMode};
10use crate::record::{CigarOp, YaraRecord};
11
/// Borrowed view of a single mate (read 1 or read 2) of a read pair.
///
/// `seq` is expected to hold ASCII DNA bases (ACGTN) and `qual` the matching
/// phred+33 base qualities.
#[derive(Clone, Copy, Debug)]
pub struct ReadEnd<'a> {
    /// ASCII-encoded DNA bases.
    pub seq: &'a [u8],
    /// Phred+33 ASCII base qualities.
    pub qual: &'a [u8],
}
22
/// A batch of paired-end reads to map.
///
/// The batch is stored column-wise: every read pair contributes exactly one
/// entry to each of the five vectors below, so they always have the same
/// length.  Sequences are ASCII DNA strings (ACGTN) and qualities are
/// phred+33 ASCII strings; everything is kept as `CString` so raw pointers
/// can be handed to the C API.
#[derive(Debug, Default)]
pub struct ReadBatch {
    /// Read-pair names, one per pair.
    names: Vec<CString>,
    /// Read 1 sequences.
    r1_seqs: Vec<CString>,
    /// Read 1 base qualities.
    r1_quals: Vec<CString>,
    /// Read 2 sequences.
    r2_seqs: Vec<CString>,
    /// Read 2 base qualities.
    r2_quals: Vec<CString>,
}
35
36impl ReadBatch {
37    /// Create a new empty batch.
38    #[must_use]
39    pub fn new() -> Self {
40        Self::default()
41    }
42
43    /// Create a batch with pre-allocated capacity for `n` pairs.
44    #[must_use]
45    pub fn with_capacity(n: usize) -> Self {
46        Self {
47            names: Vec::with_capacity(n),
48            r1_seqs: Vec::with_capacity(n),
49            r1_quals: Vec::with_capacity(n),
50            r2_seqs: Vec::with_capacity(n),
51            r2_quals: Vec::with_capacity(n),
52        }
53    }
54
55    /// Add a read pair to the batch.
56    ///
57    /// # Errors
58    ///
59    /// Returns [`YaraError::InvalidInput`] if any input contains a null byte
60    /// or if sequence and quality lengths do not match.
61    pub fn push(&mut self, name: &str, r1: ReadEnd<'_>, r2: ReadEnd<'_>) -> Result<(), YaraError> {
62        if r1.seq.len() != r1.qual.len() {
63            return Err(YaraError::InvalidInput(format!(
64                "r1 seq/qual length mismatch: {} vs {}",
65                r1.seq.len(),
66                r1.qual.len()
67            )));
68        }
69        if r2.seq.len() != r2.qual.len() {
70            return Err(YaraError::InvalidInput(format!(
71                "r2 seq/qual length mismatch: {} vs {}",
72                r2.seq.len(),
73                r2.qual.len()
74            )));
75        }
76        self.names
77            .push(CString::new(name).map_err(|e| YaraError::InvalidInput(format!("name: {e}")))?);
78        self.r1_seqs.push(bytes_to_cstring(r1.seq));
79        self.r1_quals.push(bytes_to_cstring(r1.qual));
80        self.r2_seqs.push(bytes_to_cstring(r2.seq));
81        self.r2_quals.push(bytes_to_cstring(r2.qual));
82        Ok(())
83    }
84
85    /// Number of read pairs in the batch.
86    #[must_use]
87    pub fn len(&self) -> usize {
88        self.names.len()
89    }
90
91    /// Whether the batch is empty.
92    #[must_use]
93    pub fn is_empty(&self) -> bool {
94        self.names.is_empty()
95    }
96
97    /// Clear all reads, retaining allocated capacity for reuse.
98    pub fn clear(&mut self) {
99        self.names.clear();
100        self.r1_seqs.clear();
101        self.r1_quals.clear();
102        self.r2_seqs.clear();
103        self.r2_quals.clear();
104    }
105}
106
107/// Handle to a loaded YARA mapper with a pre-built FM index.
108///
109/// The mapper is configured at construction time and can be used to map
110/// multiple batches of reads.  The FM index stays loaded in memory for
111/// the lifetime of this handle.
112///
113/// # Thread safety
114///
115/// The underlying C++ mapper uses OpenMP internally but is not safe for
116/// concurrent `map_paired` calls.  [`YaraMapper`] is [`Send`] but not
117/// [`Sync`].
118pub struct YaraMapper {
119    handle: NonNull<yara_mapper_sys::YaraMapperHandle>,
120    secondary_mode: SecondaryMode,
121}
122
123// SAFETY: The C++ handle owns all its memory and can be moved between threads.
124// It is NOT safe to call map_paired concurrently (OpenMP parallelism is
125// internal to each call), so we implement Send but not Sync.
126unsafe impl Send for YaraMapper {}
127
128impl YaraMapper {
129    /// Open a pre-built YARA index and create a mapper.
130    ///
131    /// `index_prefix` is the path prefix used when building the index (e.g.,
132    /// `ref/hla.fasta` if the index files are `ref/hla.fasta.yara.*`).
133    ///
134    /// # Errors
135    ///
136    /// Returns [`YaraError::IndexOpen`] if the index files cannot be found or
137    /// loaded, or if `index_prefix` contains non-UTF-8 characters or null bytes.
138    pub fn open<P: AsRef<Path>>(
139        index_prefix: P,
140        options: &MapperOptions,
141    ) -> Result<Self, YaraError> {
142        let prefix_cstr =
143            path_to_cstring(index_prefix.as_ref(), "index_prefix", YaraError::IndexOpen)?;
144
145        let ffi_opts = options.to_ffi();
146        let mut error_buf = vec![0u8; 1024];
147
148        let handle = unsafe {
149            yara_mapper_sys::yara_mapper_open(
150                prefix_cstr.as_ptr(),
151                &ffi_opts,
152                error_buf.as_mut_ptr().cast(),
153                error_buf.len(),
154            )
155        };
156
157        NonNull::new(handle)
158            .map(|h| Self { handle: h, secondary_mode: options.secondary_mode })
159            .ok_or_else(|| {
160                let msg = unsafe { CStr::from_ptr(error_buf.as_ptr().cast()) };
161                YaraError::IndexOpen(msg.to_string_lossy().into_owned())
162            })
163    }
164
165    /// Map a batch of paired-end reads and return alignment records.
166    ///
167    /// The returned records include primary and (depending on options)
168    /// secondary alignments, plus unmapped entries for reads that could
169    /// not be aligned.
170    ///
171    /// # Errors
172    ///
173    /// Returns [`YaraError::Mapping`] if the underlying C++ mapper encounters
174    /// an error during alignment.
175    pub fn map_paired(&self, reads: &ReadBatch) -> Result<Vec<YaraRecord>, YaraError> {
176        if reads.is_empty() {
177            return Ok(Vec::new());
178        }
179
180        // Build pointer arrays for the C API.
181        let name_ptrs = cstring_ptrs(&reads.names);
182        let r1_seq_ptrs = cstring_ptrs(&reads.r1_seqs);
183        let r1_qual_ptrs = cstring_ptrs(&reads.r1_quals);
184        let r2_seq_ptrs = cstring_ptrs(&reads.r2_seqs);
185        let r2_qual_ptrs = cstring_ptrs(&reads.r2_quals);
186
187        let batch = yara_mapper_sys::YaraReadBatch {
188            names: name_ptrs.as_ptr(),
189            r1_seqs: r1_seq_ptrs.as_ptr(),
190            r1_quals: r1_qual_ptrs.as_ptr(),
191            r2_seqs: r2_seq_ptrs.as_ptr(),
192            r2_quals: r2_qual_ptrs.as_ptr(),
193            count: reads.len(),
194        };
195
196        let capacity = reads.len() * records_per_pair(self.secondary_mode);
197        // SAFETY: YaraAlignmentRecord is #[repr(C)] POD — all-zeros is a valid
198        // representation (null pointers, zero scalars).  The C++ shim also
199        // memsets the buffer, but we zero here defensively.
200        let mut out_records: Vec<yara_mapper_sys::YaraAlignmentRecord> =
201            vec![unsafe { std::mem::zeroed() }; capacity];
202
203        let mut error_buf = [0u8; 1024];
204
205        let count = unsafe {
206            yara_mapper_sys::yara_mapper_map_paired(
207                self.handle.as_ptr(),
208                &batch,
209                out_records.as_mut_ptr(),
210                capacity,
211                error_buf.as_mut_ptr().cast(),
212                error_buf.len(),
213            )
214        };
215
216        if count < 0 {
217            // C++ already freed any partially written records before returning -1.
218            let msg = unsafe { CStr::from_ptr(error_buf.as_ptr().cast()) };
219            return Err(YaraError::Mapping(msg.to_string_lossy().into_owned()));
220        }
221
222        #[expect(
223            clippy::cast_possible_truncation,
224            clippy::cast_sign_loss,
225            reason = "count is non-negative and bounded by capacity (which is usize)"
226        )]
227        let n = count as usize;
228
229        // Convert each C record to an owned Rust type, then batch-free the
230        // C++ memory.  Pool-managed fields (seq/qual/cigar) are freed when the
231        // pool is cleared on the next mapPaired call; only XA strings are freed
232        // by free_records.
233        let results: Vec<YaraRecord> = out_records[..n].iter().map(convert_record).collect();
234
235        unsafe {
236            yara_mapper_sys::yara_mapper_free_records(out_records.as_mut_ptr(), n);
237        }
238
239        Ok(results)
240    }
241
242    /// Number of reference contigs in the loaded index.
243    #[must_use]
244    pub fn contig_count(&self) -> usize {
245        unsafe { yara_mapper_sys::yara_mapper_contig_count(self.handle.as_ptr()) }
246    }
247
248    /// Reference contig names (for SAM/BAM header construction).
249    #[must_use]
250    pub fn contig_names(&self) -> Vec<String> {
251        let n = self.contig_count();
252        unsafe {
253            collect_contig_names(n, |i| {
254                yara_mapper_sys::yara_mapper_contig_name(self.handle.as_ptr(), i)
255            })
256        }
257    }
258
259    /// Reference contig lengths.
260    #[must_use]
261    pub fn contig_lengths(&self) -> Vec<usize> {
262        let n = self.contig_count();
263        collect_contig_lengths(n, |i| unsafe {
264            yara_mapper_sys::yara_mapper_contig_length(self.handle.as_ptr(), i)
265        })
266    }
267}
268
269impl Drop for YaraMapper {
270    fn drop(&mut self) {
271        unsafe { yara_mapper_sys::yara_mapper_close(self.handle.as_ptr()) }
272    }
273}
274
275/// Upper bound on records per read pair, depending on secondary mode.
276/// `Tag`/`Omit`: each read end produces exactly one record, so 2 per pair.
277/// `Record`: secondaries are separate records; 10 per pair is generous.
278fn records_per_pair(mode: SecondaryMode) -> usize {
279    match mode {
280        SecondaryMode::Tag | SecondaryMode::Omit => 2,
281        SecondaryMode::Record => 10,
282    }
283}
284
/// Collect raw pointers from a slice of `CString` values for the C API.
///
/// The element type is `c_char` rather than a hard-coded `i8`: `c_char` is
/// unsigned on some targets (e.g. aarch64 Linux), where `*const i8` would
/// not match what `CString::as_ptr` returns.
///
/// The returned pointers borrow from `strings` and are only valid while the
/// corresponding `CString` values are alive and unmodified.
fn cstring_ptrs(strings: &[CString]) -> Vec<*const std::ffi::c_char> {
    strings.iter().map(|s| s.as_ptr()).collect()
}
289
290/// Convert a single FFI record to an owned Rust record.
291fn convert_record(rec: &yara_mapper_sys::YaraAlignmentRecord) -> YaraRecord {
292    // CIGAR
293    let cigar = if !rec.cigar.is_null() && rec.cigar_len > 0 {
294        let slice = unsafe { std::slice::from_raw_parts(rec.cigar, rec.cigar_len as usize) };
295        slice.iter().map(|&encoded| CigarOp::from_bam(encoded)).collect()
296    } else {
297        Vec::new()
298    };
299
300    // Sequence
301    let seq = if !rec.seq.is_null() && rec.seq_len > 0 {
302        let slice =
303            unsafe { std::slice::from_raw_parts(rec.seq.cast::<u8>(), rec.seq_len as usize) };
304        Some(slice.to_vec())
305    } else {
306        None
307    };
308
309    // Quality
310    let qual = if !rec.qual.is_null() && rec.seq_len > 0 {
311        let slice =
312            unsafe { std::slice::from_raw_parts(rec.qual.cast::<u8>(), rec.seq_len as usize) };
313        Some(slice.to_vec())
314    } else {
315        None
316    };
317
318    // XA tag
319    let xa = if rec.xa.is_null() {
320        None
321    } else {
322        Some(unsafe { CStr::from_ptr(rec.xa) }.to_string_lossy().into_owned())
323    };
324
325    YaraRecord {
326        read_pair_index: rec.read_pair_index,
327        is_read1: rec.is_read1 != 0,
328        contig_id: rec.contig_id,
329        pos: rec.pos,
330        is_reverse: rec.is_reverse != 0,
331        is_secondary: rec.is_secondary != 0,
332        is_unmapped: rec.is_unmapped != 0,
333        mapq: rec.mapq,
334        nm: rec.nm,
335        x0: rec.x0,
336        x1: rec.x1,
337        mate_contig_id: rec.mate_contig_id,
338        mate_pos: rec.mate_pos,
339        tlen: rec.tlen,
340        flag: rec.flag,
341        cigar,
342        seq,
343        qual,
344        xa,
345    }
346}