// yara_mapper/mapper.rs

use std::ffi::{CStr, CString};
use std::path::Path;
use std::ptr::NonNull;

use crate::error::YaraError;
use crate::ffi_helpers::{collect_contig_lengths, collect_contig_names, path_to_cstring};
use crate::options::{MapperOptions, SecondaryMode};
use crate::record::{CigarOp, YaraRecord};
9
/// One end (read 1 or read 2) of a paired-end read.
///
/// Both fields are borrowed slices, so a `ReadEnd` is a cheap `Copy` view
/// over caller-owned buffers.  [`ReadBatch::push`] rejects an end whose
/// `seq` and `qual` differ in length or that contains a NUL byte.
///
/// Sequence should be ASCII DNA (ACGTN) and quality should be phred+33.
#[derive(Debug, Clone, Copy)]
pub struct ReadEnd<'a> {
    /// DNA sequence bytes (ASCII).
    pub seq: &'a [u8],
    /// Base quality bytes (phred+33 ASCII), one per base in `seq`.
    pub qual: &'a [u8],
}
20
21/// A batch of paired-end reads to map.
22///
23/// All slices must have the same length (`count`).  Sequences are ASCII
24/// DNA strings (ACGTN) and qualities are phred+33 ASCII strings.
25#[derive(Default)]
26pub struct ReadBatch {
27    names: Vec<CString>,
28    r1_seqs: Vec<CString>,
29    r1_quals: Vec<CString>,
30    r2_seqs: Vec<CString>,
31    r2_quals: Vec<CString>,
32}
33
34impl ReadBatch {
35    /// Create a new empty batch.
36    #[must_use]
37    pub fn new() -> Self {
38        Self::default()
39    }
40
41    /// Create a batch with pre-allocated capacity for `n` pairs.
42    #[must_use]
43    pub fn with_capacity(n: usize) -> Self {
44        Self {
45            names: Vec::with_capacity(n),
46            r1_seqs: Vec::with_capacity(n),
47            r1_quals: Vec::with_capacity(n),
48            r2_seqs: Vec::with_capacity(n),
49            r2_quals: Vec::with_capacity(n),
50        }
51    }
52
53    /// Add a read pair to the batch.
54    ///
55    /// # Errors
56    ///
57    /// Returns [`YaraError::InvalidInput`] if any input contains a null byte
58    /// or if sequence and quality lengths do not match.
59    pub fn push(&mut self, name: &str, r1: ReadEnd<'_>, r2: ReadEnd<'_>) -> Result<(), YaraError> {
60        if r1.seq.len() != r1.qual.len() {
61            return Err(YaraError::InvalidInput(format!(
62                "r1 seq/qual length mismatch: {} vs {}",
63                r1.seq.len(),
64                r1.qual.len()
65            )));
66        }
67        if r2.seq.len() != r2.qual.len() {
68            return Err(YaraError::InvalidInput(format!(
69                "r2 seq/qual length mismatch: {} vs {}",
70                r2.seq.len(),
71                r2.qual.len()
72            )));
73        }
74        self.names
75            .push(CString::new(name).map_err(|e| YaraError::InvalidInput(format!("name: {e}")))?);
76        self.r1_seqs.push(
77            CString::new(r1.seq).map_err(|e| YaraError::InvalidInput(format!("r1_seq: {e}")))?,
78        );
79        self.r1_quals.push(
80            CString::new(r1.qual).map_err(|e| YaraError::InvalidInput(format!("r1_qual: {e}")))?,
81        );
82        self.r2_seqs.push(
83            CString::new(r2.seq).map_err(|e| YaraError::InvalidInput(format!("r2_seq: {e}")))?,
84        );
85        self.r2_quals.push(
86            CString::new(r2.qual).map_err(|e| YaraError::InvalidInput(format!("r2_qual: {e}")))?,
87        );
88        Ok(())
89    }
90
91    /// Number of read pairs in the batch.
92    #[must_use]
93    pub fn len(&self) -> usize {
94        self.names.len()
95    }
96
97    /// Whether the batch is empty.
98    #[must_use]
99    pub fn is_empty(&self) -> bool {
100        self.names.is_empty()
101    }
102}
103
104/// Handle to a loaded YARA mapper with a pre-built FM index.
105///
106/// The mapper is configured at construction time and can be used to map
107/// multiple batches of reads.  The FM index stays loaded in memory for
108/// the lifetime of this handle.
109///
110/// # Thread safety
111///
112/// The underlying C++ mapper uses OpenMP internally but is not safe for
113/// concurrent `map_paired` calls.  [`YaraMapper`] is [`Send`] but not
114/// [`Sync`].
115pub struct YaraMapper {
116    handle: NonNull<yara_mapper_sys::YaraMapperHandle>,
117    secondary_mode: SecondaryMode,
118}
119
120// SAFETY: The C++ handle owns all its memory and can be moved between threads.
121// It is NOT safe to call map_paired concurrently (OpenMP parallelism is
122// internal to each call), so we implement Send but not Sync.
123unsafe impl Send for YaraMapper {}
124
125impl YaraMapper {
126    /// Open a pre-built YARA index and create a mapper.
127    ///
128    /// `index_prefix` is the path prefix used when building the index (e.g.,
129    /// `ref/hla.fasta` if the index files are `ref/hla.fasta.yara.*`).
130    ///
131    /// # Errors
132    ///
133    /// Returns [`YaraError::IndexOpen`] if the index files cannot be found or
134    /// loaded, or if `index_prefix` contains non-UTF-8 characters or null bytes.
135    pub fn open<P: AsRef<Path>>(
136        index_prefix: P,
137        options: &MapperOptions,
138    ) -> Result<Self, YaraError> {
139        let prefix_cstr =
140            path_to_cstring(index_prefix.as_ref(), "index_prefix", YaraError::IndexOpen)?;
141
142        let ffi_opts = options.to_ffi();
143        let mut error_buf = vec![0u8; 1024];
144
145        let handle = unsafe {
146            yara_mapper_sys::yara_mapper_open(
147                prefix_cstr.as_ptr(),
148                &ffi_opts,
149                error_buf.as_mut_ptr().cast(),
150                error_buf.len(),
151            )
152        };
153
154        NonNull::new(handle)
155            .map(|h| Self { handle: h, secondary_mode: options.secondary_mode })
156            .ok_or_else(|| {
157                let msg = unsafe { CStr::from_ptr(error_buf.as_ptr().cast()) };
158                YaraError::IndexOpen(msg.to_string_lossy().into_owned())
159            })
160    }
161
162    /// Map a batch of paired-end reads and return alignment records.
163    ///
164    /// The returned records include primary and (depending on options)
165    /// secondary alignments, plus unmapped entries for reads that could
166    /// not be aligned.
167    ///
168    /// # Errors
169    ///
170    /// Returns [`YaraError::Mapping`] if the underlying C++ mapper encounters
171    /// an error during alignment.
172    pub fn map_paired(&self, reads: &ReadBatch) -> Result<Vec<YaraRecord>, YaraError> {
173        if reads.is_empty() {
174            return Ok(Vec::new());
175        }
176
177        // Build pointer arrays for the C API.
178        let name_ptrs = cstring_ptrs(&reads.names);
179        let r1_seq_ptrs = cstring_ptrs(&reads.r1_seqs);
180        let r1_qual_ptrs = cstring_ptrs(&reads.r1_quals);
181        let r2_seq_ptrs = cstring_ptrs(&reads.r2_seqs);
182        let r2_qual_ptrs = cstring_ptrs(&reads.r2_quals);
183
184        let batch = yara_mapper_sys::YaraReadBatch {
185            names: name_ptrs.as_ptr(),
186            r1_seqs: r1_seq_ptrs.as_ptr(),
187            r1_quals: r1_qual_ptrs.as_ptr(),
188            r2_seqs: r2_seq_ptrs.as_ptr(),
189            r2_quals: r2_qual_ptrs.as_ptr(),
190            count: reads.len(),
191        };
192
193        let capacity = reads.len() * records_per_pair(self.secondary_mode);
194        // SAFETY: YaraAlignmentRecord is #[repr(C)] POD — all-zeros is a valid
195        // representation (null pointers, zero scalars).
196        let mut out_records: Vec<yara_mapper_sys::YaraAlignmentRecord> =
197            vec![unsafe { std::mem::zeroed() }; capacity];
198
199        let mut error_buf = [0u8; 1024];
200
201        let count = unsafe {
202            yara_mapper_sys::yara_mapper_map_paired(
203                self.handle.as_ptr(),
204                &batch,
205                out_records.as_mut_ptr(),
206                capacity,
207                error_buf.as_mut_ptr().cast(),
208                error_buf.len(),
209            )
210        };
211
212        if count < 0 {
213            // C++ already freed any partially written records before returning -1.
214            let msg = unsafe { CStr::from_ptr(error_buf.as_ptr().cast()) };
215            return Err(YaraError::Mapping(msg.to_string_lossy().into_owned()));
216        }
217
218        #[expect(
219            clippy::cast_possible_truncation,
220            clippy::cast_sign_loss,
221            reason = "count is non-negative and bounded by capacity (which is usize)"
222        )]
223        let n = count as usize;
224
225        // Convert each C record to an owned Rust type, freeing the C++ memory
226        // for that record immediately.  This avoids holding two copies of all
227        // records simultaneously and is safe against panics (only the current
228        // record's C++ memory can leak if convert_record panics).
229        let results: Vec<YaraRecord> = out_records[..n]
230            .iter()
231            .map(|rec| {
232                let converted = convert_record(rec);
233                unsafe {
234                    yara_mapper_sys::yara_mapper_free_record(std::ptr::from_ref(rec).cast_mut());
235                }
236                converted
237            })
238            .collect();
239
240        Ok(results)
241    }
242
243    /// Number of reference contigs in the loaded index.
244    #[must_use]
245    pub fn contig_count(&self) -> usize {
246        unsafe { yara_mapper_sys::yara_mapper_contig_count(self.handle.as_ptr()) }
247    }
248
249    /// Reference contig names (for SAM/BAM header construction).
250    #[must_use]
251    pub fn contig_names(&self) -> Vec<String> {
252        let n = self.contig_count();
253        unsafe {
254            collect_contig_names(n, |i| {
255                yara_mapper_sys::yara_mapper_contig_name(self.handle.as_ptr(), i)
256            })
257        }
258    }
259
260    /// Reference contig lengths.
261    #[must_use]
262    pub fn contig_lengths(&self) -> Vec<usize> {
263        let n = self.contig_count();
264        collect_contig_lengths(n, |i| unsafe {
265            yara_mapper_sys::yara_mapper_contig_length(self.handle.as_ptr(), i)
266        })
267    }
268}
269
270impl Drop for YaraMapper {
271    fn drop(&mut self) {
272        unsafe { yara_mapper_sys::yara_mapper_close(self.handle.as_ptr()) }
273    }
274}
275
276/// Upper bound on records per read pair, depending on secondary mode.
277/// `Tag`/`Omit`: each read end produces exactly one record, so 2 per pair.
278/// `Record`: secondaries are separate records; 10 per pair is generous.
279fn records_per_pair(mode: SecondaryMode) -> usize {
280    match mode {
281        SecondaryMode::Tag | SecondaryMode::Omit => 2,
282        SecondaryMode::Record => 10,
283    }
284}
285
/// Collect raw pointers from a slice of `CString` values for the C API.
///
/// The returned pointers borrow from `strings` and are only valid while the
/// `CString` values are alive and unmodified.  The element type is
/// `c_char` rather than a hard-coded `i8` because `CString::as_ptr` returns
/// `*const c_char`, whose signedness differs between targets.
fn cstring_ptrs(strings: &[CString]) -> Vec<*const std::ffi::c_char> {
    strings.iter().map(|s| s.as_ptr()).collect()
}
290
291/// Convert a single FFI record to an owned Rust record.
292fn convert_record(rec: &yara_mapper_sys::YaraAlignmentRecord) -> YaraRecord {
293    // CIGAR
294    let cigar = if !rec.cigar.is_null() && rec.cigar_len > 0 {
295        let slice = unsafe { std::slice::from_raw_parts(rec.cigar, rec.cigar_len as usize) };
296        slice.iter().map(|&encoded| CigarOp::from_bam(encoded)).collect()
297    } else {
298        Vec::new()
299    };
300
301    // Sequence
302    let seq = if !rec.seq.is_null() && rec.seq_len > 0 {
303        let slice =
304            unsafe { std::slice::from_raw_parts(rec.seq.cast::<u8>(), rec.seq_len as usize) };
305        Some(slice.to_vec())
306    } else {
307        None
308    };
309
310    // Quality
311    let qual = if !rec.qual.is_null() && rec.seq_len > 0 {
312        let slice =
313            unsafe { std::slice::from_raw_parts(rec.qual.cast::<u8>(), rec.seq_len as usize) };
314        Some(slice.to_vec())
315    } else {
316        None
317    };
318
319    // XA tag
320    let xa = if rec.xa.is_null() {
321        None
322    } else {
323        Some(unsafe { CStr::from_ptr(rec.xa) }.to_string_lossy().into_owned())
324    };
325
326    YaraRecord {
327        read_pair_index: rec.read_pair_index,
328        is_read1: rec.is_read1 != 0,
329        contig_id: rec.contig_id,
330        pos: rec.pos,
331        is_reverse: rec.is_reverse != 0,
332        is_secondary: rec.is_secondary != 0,
333        is_unmapped: rec.is_unmapped != 0,
334        mapq: rec.mapq,
335        nm: rec.nm,
336        x0: rec.x0,
337        x1: rec.x1,
338        mate_contig_id: rec.mate_contig_id,
339        mate_pos: rec.mate_pos,
340        tlen: rec.tlen,
341        flag: rec.flag,
342        cigar,
343        seq,
344        qual,
345        xa,
346    }
347}