nanalogue_core/lib.rs
1//! # Nanalogue Core
2//!
3//! ## Introduction
4//!
5//! Nanalogue = *N*ucleic Acid *Analogue*.
6//! Nanalogue is a tool to parse or analyse BAM/Mod BAM files with a single-molecule focus.
7//!
8//! [](https://github.com/DNAReplicationLab/nanalogue/actions/workflows/ci.yml)
9//! [](https://github.com/DNAReplicationLab/nanalogue/actions/workflows/cargo-llvm-cov.yml)
10//! [](https://crates.io/crates/nanalogue)
11//! [](https://opensource.org/licenses/MIT)
12//!
13//! A common pain point in genomics analyses is that BAM files are information-dense
14//! which makes it difficult to gain insight from them. Nanalogue hopes to make it easy
15//! to extract and process this information, with a particular focus on single-molecule
16//! aspects and DNA/RNA modifications. Despite this focus, some of nanalogue's commands
17//! and functions are quite general and can be applied to almost any BAM file.
18//!
19//! We process and calculate data associated with DNA/RNA molecules, their alignments to
20//! reference genomes, modification information on them, and other miscellaneous
21//! information. We can process any type of DNA/RNA modifications occuring in any pattern
22//! (single/multiple mods, spatially-isolated/non-isolated etc.). All we require is that
23//! the data is stored in a BAM file in the mod BAM format (i.e. using `MM/ML` tags as
24//! laid down in the [specifications](https://samtools.github.io/hts-specs/SAMtags.pdf)).
25//!
26//! Nanalogue is both an executable that can be run from the command line and a library
27//! whose functionality can be used by others writing rust code. The library's functions
28//! are presented here. The executable exposes the modules `subcommands::*`, and a
29//! separate executable `nanalogue_sim_bam` exposes the `simulate_mod_bam` functionality
30//! (see below).
31//!
32//! For developers: if you are looking to make a custom BAM file containing synthetic, simulated
33//! DNA/RNA modification data to develop/test your tool, you may be interested in `nanalogue_sim_bam`.
34//! This is an executable that ships with nanalogue that can create a BAM file according to your
35//! specifications. Please run `nanalogue_sim_bam --help`. If you are a rust developer looking
36//! to use this functionality in your library, please look at the documentation of the module
37//! [`crate::simulate_mod_bam`].
38//!
39//! This documentation is supplemented by a companion [cookbook](https://www.nanalogue.com).
40//!
41//! ## Sample code
42//!
43//! The [`InputBam`] and the [`InputMods`] structs allow us to set input options
44//! for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
45//! command is called to process data from a BAM file with some input options.
46//!
47//! ```
48//! use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
49//! OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
50//!
51//! let mut bam = InputBamBuilder::default()
52//! .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
53//! .region("dummyI".into())
54//! .build()?;
55//! let mut mods = InputModsBuilder::<OptionalTag>::default()
56//! .mod_prob_filter(ThresholdState::GtEq(0))
57//! .build()?;
58//!
59//! let mut buffer = Vec::new();
60//! let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
61//! let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
62//! read_info::run(
63//! &mut buffer,
64//! bam_rc_records.rc_records
65//! .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
66//! mods,
67//! None,
68//! )?;
69//! assert!(str::from_utf8(buffer.as_slice())?
70//! .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
71//! # Ok::<(), Error>(())
72//! ```
73//!
74//! If you want to write custom functionality yourself, please familiarize yourself with the
75//! [`crate::read_utils::CurrRead`] struct. This is the centerpiece of our library, which receives
76//! BAM record data, processes the DNA/RNA modification information amongst other pieces of information,
77//! and exposes them for downstream usage.
78
79use bedrs::{Bed3, Coordinates as _};
80use bio::alphabets::dna::revcomp;
81use bio_types::sequence::SequenceRead as _;
82use fibertools_rs::utils::{
83 bamannotations::{FiberAnnotation, Ranges},
84 basemods::{BaseMod, BaseMods},
85 bio_io::{convert_seq_uppercase, get_u8_tag},
86};
87use rand::random;
88use regex::Regex;
89use rust_htslib::{bam, bam::ext::BamRecordExtensions as _, bam::record::Aux, tpool};
90use std::collections::HashSet;
91use std::fs::File;
92use std::io::{BufRead as _, BufReader};
93use std::sync::{LazyLock, Once};
94
95// Declare the modules.
96pub mod analysis;
97pub mod cli;
98pub mod commands;
99pub mod error;
100pub mod file_utils;
101pub mod read_utils;
102pub mod simulate_mod_bam;
103pub mod subcommands;
104pub mod utils;
105
106// Re-exports
107pub use cli::{
108 InputBam, InputBamBuilder, InputModOptions, InputMods, InputModsBuilder, InputRegionOptions,
109 InputWindowing, InputWindowingBuilder, OptionalTag, RequiredTag, SeqDisplayOptions,
110};
111pub use error::Error;
112pub use file_utils::{
113 nanalogue_bam_reader, nanalogue_bam_reader_from_stdin, nanalogue_bam_reader_from_url,
114 nanalogue_indexed_bam_reader, nanalogue_indexed_bam_reader_from_url, write_bam_denovo,
115 write_fasta,
116};
117pub use read_utils::{
118 AlignmentInfoBuilder, CurrRead, CurrReadBuilder, ModTableEntryBuilder, curr_reads_to_dataframe,
119};
120pub use simulate_mod_bam::SimulationConfig;
121pub use subcommands::{
122 find_modified_reads, peek, read_info, read_stats, reads_table, window_reads,
123};
124pub use utils::{
125 AllowedAGCTN, Contains, DNARestrictive, F32AbsValAtMost1, F32Bw0and1, FilterByRefCoords,
126 GenomicRegion, GetDNARestrictive, Intersects, ModChar, OrdPair, PathOrURLOrStdin, ReadState,
127 ReadStates, RestrictModCalledStrand, SeqCoordCalls, ThresholdState,
128};
129
130/// Static initialization guard for SSL certificate configuration.
131static SSL_INIT: Once = Once::new();
132
133/// Initialize SSL certificate paths for HTTPS support.
134///
135/// This function automatically detects and configures SSL certificate locations
136/// to enable HTTPS file access through libcurl. It uses the `openssl-probe`
137/// crate to find system CA certificate bundles.
138///
139/// # Usage
140///
141/// ```
142/// use nanalogue_core::init_ssl_certificates;
143/// init_ssl_certificates();
144/// ```
145///
146/// # Why This Is Needed
147///
148/// The statically-compiled libcurl in rust-htslib doesn't know where to find
149/// the system's CA certificate bundle for SSL certificate verification. This
150/// function sets the appropriate environment variables to point to the system's
151/// certificate bundle, enabling secure HTTPS connections.
152pub fn init_ssl_certificates() {
153 SSL_INIT.call_once(|| {
154 // SAFETY: This function probes for SSL certificates and sets environment
155 // variables. We're setting these before any SSL/TLS
156 // operations happen, which is the safe usage pattern.
157 unsafe {
158 if openssl_probe::try_init_openssl_env_vars() {
159 // Also set CURL_CA_BUNDLE if SSL_CERT_FILE was set by openssl-probe.
160 // The statically-compiled libcurl in rust-htslib specifically checks
161 // CURL_CA_BUNDLE, not just SSL_CERT_FILE.
162 if let Ok(cert_file) = std::env::var("SSL_CERT_FILE")
163 && std::env::var("CURL_CA_BUNDLE").is_err()
164 {
165 std::env::set_var("CURL_CA_BUNDLE", cert_file);
166 }
167 }
168 }
169 });
170}
171
172/// Extracts mod information from BAM record to the `fibertools-rs` `BaseMods` Struct.
173///
174/// We are copying and modifying code from the fibertools-rs repository
175/// (<https://github.com/fiberseq/fibertools-rs>) which is under the MIT license
176/// (please see their Cargo.toml) to create this function.
177///
178/// # Tag Variant Support
179///
180/// We support MM/ML (standard) and Mm/Ml (fallback) tag variants, but no other variants.
181/// The specification recommends MM/ML, but we support the Mm/Ml variant as some sequencing
182/// technologies use this capitalization. Other mixed-case variants (e.g., mM/mL) and fully
183/// lowercase variants (mm/ml) are not recognized.
184///
185/// Function should cover almost all mod bam cases, but will fail in the following scenarios:
186/// - If multiple mods are present on the same base e.g. methylation and hydroxymethylation,
187/// most BAM files the author has come across use the notation MM:Z:C+m,...;C+h,...;,
188/// which this function can parse. But the notation MM:Z:C+mh,...; is also allowed.
189/// We do not parse this for now, please contribute code if you want to add this functionality!
190/// - We do not know any technology/technique which, for instance, replaces both C and T
191/// with `BrdU` i.e. two different bases have the same substitution. This also leads to failure.
192/// If you deal with this, please notify us! On the other hand, it is possible that both C
193/// and A are methylated e.g. 5-Methylcytosine and 6-Methyladenine. Our program can deal
194/// with this as the "tags" corresponding to these are 'm' and 'a' respectively i.e. the
195/// tags are different. For this failure mode, the modifications have to be identical,
196/// not just conceptually identical i.e. although 5-Methylcytosine and 6-Methyladenine
197/// fall under "methylation", they are not chemically identical and thus have different
198/// tags associated with them in the mod BAM format, and thus will not lead to failure.
199///
200/// Following is an example of reading and parsing modification data, with
201/// no filters applied.
202/// We are using an example mod BAM file which has very short reads and very few
203/// modified positions, and just examining two reads in it below.
204/// ```
205/// use nanalogue_core::{Error, nanalogue_bam_reader, nanalogue_mm_ml_parser};
206/// use rust_htslib::bam::Read;
207/// use fibertools_rs::utils::basemods::{BaseMods, BaseMod};
208/// use fibertools_rs::utils::bamannotations::{FiberAnnotation, Ranges};
209/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
210/// let mut count = 0;
211/// for record in reader.records(){
212/// let r = record?;
213/// let Ok(BaseMods{base_mods: v}) = nanalogue_mm_ml_parser(&r, |&_| true, |&_| true,
214/// |&_, &_, &_| true, 0) else { unreachable!() };
215/// match count {
216/// 0 => assert_eq!(v, vec![BaseMod{
217/// modified_base: b'T',
218/// strand: '+',
219/// modification_type: 'T',
220/// ranges: Ranges {
221/// annotations: vec![
222/// FiberAnnotation {
223/// start: 0, end: 1, length: 1, qual: 4,
224/// reference_start: Some(9), reference_end: Some(9),
225/// reference_length: Some(0), extra_columns: None,
226/// },
227/// FiberAnnotation {
228/// start: 3, end: 4, length: 1, qual: 7,
229/// reference_start: Some(12), reference_end: Some(12),
230/// reference_length: Some(0), extra_columns: None,
231/// },
232/// FiberAnnotation {
233/// start: 4, end: 5, length: 1, qual: 9,
234/// reference_start: Some(13), reference_end: Some(13),
235/// reference_length: Some(0), extra_columns: None,
236/// },
237/// FiberAnnotation {
238/// start: 7, end: 8, length: 1, qual: 6,
239/// reference_start: Some(16), reference_end: Some(16),
240/// reference_length: Some(0), extra_columns: None,
241/// },
242/// ],
243/// seq_len: 8,
244/// reverse: false,
245/// },
246/// record_is_reverse: false,
247/// }]),
248/// 2 => assert_eq!(v, vec![BaseMod{
249/// modified_base: b'T',
250/// strand: '+',
251/// modification_type: 'T',
252/// ranges: Ranges {
253/// annotations: vec![
254/// FiberAnnotation {
255/// start: 12, end: 13, length: 1, qual: 3,
256/// reference_start: Some(15), reference_end: Some(15),
257/// reference_length: Some(0), extra_columns: None,
258/// },
259/// FiberAnnotation {
260/// start: 13, end: 14, length: 1, qual: 3,
261/// reference_start: Some(16), reference_end: Some(16),
262/// reference_length: Some(0), extra_columns: None,
263/// },
264/// FiberAnnotation {
265/// start: 16, end: 17, length: 1, qual: 4,
266/// reference_start: Some(19), reference_end: Some(19),
267/// reference_length: Some(0), extra_columns: None,
268/// },
269/// FiberAnnotation {
270/// start: 19, end: 20, length: 1, qual: 3,
271/// reference_start: Some(22), reference_end: Some(22),
272/// reference_length: Some(0), extra_columns: None,
273/// },
274/// FiberAnnotation {
275/// start: 20, end: 21, length: 1, qual: 182,
276/// reference_start: Some(23), reference_end: Some(23),
277/// reference_length: Some(0), extra_columns: None,
278/// },
279/// ],
280/// seq_len: 33,
281/// reverse: true,
282/// },
283/// record_is_reverse: true,
284/// }]),
285/// _ => {},
286/// }
287/// count = count + 1;
288/// }
289/// # Ok::<(), Error>(())
290/// ```
291///
292/// # Errors
293/// If MM/ML BAM tags are malformed, you will get `InvalidModProbs` or `InvalidModCoords`.
294/// Most integer overflows are dealt with using `except`, except one which gives `ArithmeticError`.
295/// `InvalidDuplicates` occurs if the same tag, strand combination occurs many times.
296/// Please read the function documentation above as well, which explains some scenarios where
297/// even valid tags can be marked as malformed.
298///
299#[expect(clippy::too_many_lines, reason = "Complex BAM MM/ML tag parsing logic")]
300#[expect(
301 clippy::missing_panics_doc,
302 reason = "either impossible scenarios or integer overflows \
303which are also unlikely as genomic coordinates are much less than ~2^63"
304)]
305pub fn nanalogue_mm_ml_parser<F, G, H>(
306 record: &bam::Record,
307 filter_mod_prob: F,
308 filter_mod_pos: G,
309 filter_mod_base_strand_tag: H,
310 min_qual: u8,
311) -> Result<BaseMods, Error>
312where
313 F: Fn(&u8) -> bool,
314 G: Fn(&usize) -> bool,
315 H: Fn(&u8, &char, &ModChar) -> bool,
316{
317 // Regular expression for matching modification data in the MM tag
318 static MM_RE: LazyLock<Regex> = LazyLock::new(|| {
319 Regex::new(r"((([ACGTUN])([-+])([A-Za-z]+|[0-9]+)([.?]?))((,[0-9]+)*;)*)")
320 .expect("no error")
321 });
322 // Array to store all the different modifications within the MM tag
323 let mut rtn: Vec<BaseMod> = Vec::new();
324
325 let ml_tag: Vec<u8> = if record.aux(b"ML").is_ok() {
326 get_u8_tag(record, b"ML")
327 } else {
328 get_u8_tag(record, b"Ml")
329 };
330
331 let mut num_mods_seen: usize = 0;
332
333 let is_reverse = record.is_reverse();
334
335 // if there is an MM tag iterate over all the regex matches
336 if let Ok(Aux::String(mm_text)) = record.aux(b"MM").or_else(|_| record.aux(b"Mm")) {
337 // base qualities; must reverse if rev comp.
338 // NOTE this is always equal to number of bases in the sequence, otherwise
339 // rust_htslib will throw an error, so we don't have to check this.
340 let base_qual: Vec<u8> = match (min_qual, is_reverse) {
341 (0, _) => Vec::new(),
342 (_, true) => record.qual().iter().rev().copied().collect(),
343 (_, false) => record.qual().to_vec(),
344 };
345
346 // get forward sequence bases from the bam record
347 let forward_bases = {
348 let seq = convert_seq_uppercase(record.seq().as_bytes());
349 if is_reverse { revcomp(seq) } else { seq }
350 };
351
352 let seq_len = forward_bases.len();
353
354 let pos_map = {
355 let temp: Vec<Option<i64>> = {
356 if record.is_unmapped() {
357 std::iter::repeat_n(None, seq_len).collect()
358 } else {
359 record
360 .aligned_pairs_full()
361 .filter(|x| x[0].is_some())
362 .map(|x| x[1])
363 .collect()
364 }
365 };
366 if temp.len() == seq_len {
367 temp
368 } else {
369 return Err(Error::InvalidState(format!(
370 "rust_htslib failure! seq coordinates malformed {} != {}",
371 temp.len(),
372 seq_len
373 )));
374 }
375 };
376
377 for cap in MM_RE.captures_iter(mm_text) {
378 let mod_base = cap
379 .get(3)
380 .map(|m| {
381 *m.as_str()
382 .as_bytes()
383 .first()
384 .expect("regex match guaranteed to have at least 1 byte")
385 })
386 .expect("no error");
387 let mod_strand = cap
388 .get(4)
389 .map_or("", |m| m.as_str())
390 .chars()
391 .next()
392 .expect("no error");
393
394 // get modification type
395 let modification_type: ModChar = cap
396 .get(5)
397 .map_or("", |m| m.as_str())
398 .parse()
399 .expect("no error");
400
401 let is_implicit = match cap.get(6).map_or("", |m| m.as_str()).as_bytes() {
402 b"" | b"." => true,
403 b"?" => false,
404 _ => unreachable!("our regex expression must have prevented other possibilities"),
405 };
406 let mod_dists_str = cap.get(7).map_or("", |m| m.as_str());
407 // parse the string containing distances between modifications into a vector of i64
408
409 let mod_dists: Vec<usize> = mod_dists_str
410 .trim_end_matches(';')
411 .split(',')
412 .map(str::trim)
413 .filter(|s| !s.is_empty())
414 .map(|s| s.parse().unwrap())
415 .collect();
416
417 // do we include bases with zero probabilities?
418 let is_include_zero_prob = filter_mod_prob(&0);
419
420 // find real positions in the forward sequence
421 let mut cur_mod_idx: usize = 0;
422 let mut dist_from_last_mod_base: usize = 0;
423
424 // declare vectors with an approximate with_capacity
425 let (mut modified_positions, mut modified_probabilities) = {
426 #[expect(
427 clippy::arithmetic_side_effects,
428 reason = "in rare chance of overflow, vectors are initially missized but will be resized anyway \
429as they are populated. This will result in a small performance hit but we are ok as this will probably never happen \
430when usize is 64-bit as genomic sequences are not that long"
431 )]
432 let mod_data_len_approx = if is_implicit {
433 // In implicit mode, there may be any number of bases
434 // after the MM data is over, which must be assumed as unmodified.
435 // So we cannot know the exact length of the data before actually
436 // parsing it, and this is just a lower bound of the length.
437 mod_dists.len() + mod_dists.iter().sum::<usize>()
438 } else {
439 mod_dists.len()
440 };
441 (
442 Vec::<usize>::with_capacity(mod_data_len_approx),
443 Vec::<u8>::with_capacity(mod_data_len_approx),
444 )
445 };
446
447 #[expect(
448 clippy::arithmetic_side_effects,
449 reason = "one counter is checked for overflow and the other is incremented only when below a ceiling"
450 )]
451 for (cur_seq_idx, &_) in forward_bases
452 .iter()
453 .enumerate()
454 .filter(|&(_, &k)| mod_base == b'N' || k == mod_base)
455 {
456 let is_seq_pos_pass: bool = filter_mod_pos(&cur_seq_idx)
457 && (min_qual > 0).then(|| {
458 base_qual
459 .get(cur_seq_idx)
460 .filter(|&x| *x >= min_qual && *x != 255u8)
461 .is_some()
462 }) != Some(false);
463 if cur_mod_idx < mod_dists.len()
464 && dist_from_last_mod_base
465 == *mod_dists
466 .get(cur_mod_idx)
467 .expect("cur_mod_idx < mod_dists.len()")
468 {
469 let prob = ml_tag
470 .get(cur_mod_idx..)
471 .expect("cur_mod_idx < mod_dists.len() and ml_tag has same length")
472 .get(num_mods_seen)
473 .ok_or(Error::InvalidModProbs(
474 "ML tag appears to be insufficiently long!".into(),
475 ))?;
476 if filter_mod_prob(prob) && is_seq_pos_pass {
477 modified_positions.push(cur_seq_idx);
478 modified_probabilities.push(*prob);
479 }
480 dist_from_last_mod_base = 0;
481 cur_mod_idx += 1;
482 } else if cur_mod_idx < mod_dists.len()
483 && dist_from_last_mod_base
484 > *mod_dists
485 .get(cur_mod_idx)
486 .expect("cur_mod_idx < mod_dists.len()")
487 {
488 return Err(Error::InvalidModCoords(String::from(
489 "Problem with parsing distances in MM/ML data",
490 )));
491 } else {
492 if is_include_zero_prob && is_implicit && is_seq_pos_pass {
493 modified_positions.push(cur_seq_idx);
494 modified_probabilities.push(0);
495 }
496 dist_from_last_mod_base += 1;
497 }
498 }
499
500 if cur_mod_idx == mod_dists.len() {
501 num_mods_seen = num_mods_seen
502 .checked_add(cur_mod_idx)
503 .ok_or(Error::Arithmetic("in MM ML parsing".to_owned()))?;
504 } else {
505 return Err(Error::InvalidModCoords(format!(
506 "Problem with parsing MM/ML data, counts do not match {} != {}",
507 cur_mod_idx,
508 mod_dists.len(),
509 )));
510 }
511
512 // if data matches filters, add to struct.
513 modified_positions.shrink_to(0);
514 modified_probabilities.shrink_to(0);
515
516 #[expect(
517 clippy::arithmetic_side_effects,
518 reason = "`seq_len - 1 - k` (protected as mod pos cannot exceed seq_len), \
519`k.0 + 1` (overflow unlikely as genomic coords << 2^63)"
520 )]
521 #[expect(
522 clippy::indexing_slicing,
523 reason = "`pos_map[k.0]`; neither pos_map's len nor mod pos entry can exceed seq_len"
524 )]
525 if filter_mod_base_strand_tag(&mod_base, &mod_strand, &modification_type) {
526 let annotations: Vec<FiberAnnotation> = modified_positions
527 .iter()
528 .map(|k| if is_reverse { seq_len - 1 - k } else { *k })
529 .zip(modified_probabilities.iter())
530 .map(|k| {
531 Ok(FiberAnnotation {
532 start: i64::try_from(k.0)?,
533 end: i64::try_from(k.0 + 1)?,
534 length: 1,
535 qual: *k.1,
536 reference_start: pos_map[k.0],
537 reference_end: pos_map[k.0],
538 reference_length: pos_map[k.0].is_some().then_some(0),
539 extra_columns: None,
540 })
541 })
542 .collect::<Result<Vec<FiberAnnotation>, Error>>()?;
543 let mods = BaseMod {
544 modified_base: mod_base,
545 strand: mod_strand,
546 modification_type: modification_type.val(),
547 ranges: Ranges {
548 annotations: if is_reverse {
549 annotations.into_iter().rev().collect()
550 } else {
551 annotations
552 },
553 seq_len: i64::try_from(seq_len)?,
554 reverse: is_reverse,
555 },
556 record_is_reverse: is_reverse,
557 };
558 rtn.push(mods);
559 }
560 }
561 }
562
563 if num_mods_seen == ml_tag.len() {
564 // needed so I can compare methods
565 rtn.sort();
566
567 // Check for duplicate strand, modification_type combinations
568 let mut seen_combinations = HashSet::new();
569 for base_mod in &rtn {
570 let combination = (base_mod.strand, base_mod.modification_type);
571 if seen_combinations.contains(&combination) {
572 return Err(Error::InvalidDuplicates(format!(
573 "Duplicate strand '{}' and modification_type '{}' combination found",
574 base_mod.strand, base_mod.modification_type
575 )));
576 }
577 let _: bool = seen_combinations.insert(combination);
578 }
579
580 Ok(BaseMods { base_mods: rtn })
581 } else {
582 Err(Error::InvalidModProbs(
583 "MM and ML tag lengths do not match!".to_owned(),
584 ))
585 }
586}
587
588/// A global struct which contains BAM records for further usage.
589/// NOTE: we don't derive many traits here as the `RcRecords` object
590/// does not have many traits.
591///
592/// ```
593/// use nanalogue_core::{Error, nanalogue_bam_reader, BamRcRecords, InputBam, InputMods,
594/// OptionalTag};
595/// use rust_htslib::bam::Read;
596/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
597/// let BamRcRecords = BamRcRecords::new(&mut reader, &mut InputBam::default(),
598/// &mut InputMods::<OptionalTag>::default())?;
599/// assert_eq!(BamRcRecords.header.tid(b"dummyI"), Some(0));
600/// assert_eq!(BamRcRecords.header.tid(b"dummyII"), Some(1));
601/// assert_eq!(BamRcRecords.header.tid(b"dummyIII"), Some(2));
602/// # Ok::<(), Error>(())
603/// ```
604#[derive(Debug)]
605#[non_exhaustive]
606pub struct BamRcRecords<'a, R>
607where
608 R: bam::Read,
609{
610 /// `RcRecords` object output by rust htslib which we can iterate over
611 pub rc_records: bam::RcRecords<'a, R>,
612 /// Header of the bam file
613 pub header: bam::HeaderView,
614}
615
616impl<'a, R: bam::Read> BamRcRecords<'a, R> {
617 /// Extracts `RcRecords` from a BAM Reader
618 ///
619 /// # Errors
620 /// Returns an error if thread pool creation, BAM region fetching/processing,
621 /// or read ID file processing fails.
622 pub fn new<T: InputRegionOptions>(
623 bam_reader: &'a mut R,
624 bam_opts: &mut InputBam,
625 mod_opts: &mut T,
626 ) -> Result<Self, Error> {
627 let header = bam_reader.header().clone();
628 let tp = tpool::ThreadPool::new(bam_opts.threads.get())?;
629 bam_reader.set_thread_pool(&tp)?;
630
631 // Load read ID list if specified
632 match (
633 bam_opts.read_id_set.is_some(),
634 bam_opts.read_id_list.is_some(),
635 ) {
636 (_, false) => {}
637 (true, true) => {
638 return Err(Error::InvalidState(
639 "cannot set both `read_id_set` and `read_id_list` in `bam_opts` in `BamRcRecords`".to_owned(),
640 ));
641 }
642 (false, true) => {
643 bam_opts.read_id_set = if let Some(file_path) = bam_opts.read_id_list.as_ref() {
644 let file = File::open(file_path).map_err(Error::InputOutputError)?;
645 let reader = BufReader::new(file);
646 let mut read_ids = HashSet::new();
647
648 for raw_line in reader.lines() {
649 let temp_line = raw_line.map_err(Error::InputOutputError)?;
650 let line = temp_line.trim();
651 if !line.is_empty() && !line.starts_with('#') {
652 let _: bool = read_ids.insert(line.to_string());
653 }
654 }
655 Some(read_ids)
656 } else {
657 None
658 };
659 }
660 }
661
662 bam_opts.convert_region_to_bed3(header.clone())?;
663 mod_opts.convert_region_to_bed3(header.clone())?;
664 let rc_records = bam_reader.rc_records();
665 Ok(BamRcRecords::<R> { rc_records, header })
666 }
667}
668
669/// Trait that performs filtration
670pub trait BamPreFilt {
671 /// apply default filtration
672 fn pre_filt(&self, _bam_opts: &InputBam) -> bool {
673 unimplemented!()
674 }
675 /// filtration by length
676 fn filt_by_len(&self, _min_seq_len: u64, _include_zero_len: bool) -> bool {
677 unimplemented!()
678 }
679 /// filtration by alignment length
680 fn filt_by_align_len(&self, _min_align_len: i64) -> bool {
681 unimplemented!()
682 }
683 /// filtration by read id
684 fn filt_by_read_id(&self, _read_id: &str) -> bool {
685 unimplemented!()
686 }
687 /// filtration by read id set
688 fn filt_by_read_id_set(&self, _read_id_set: &HashSet<String>) -> bool {
689 unimplemented!()
690 }
691 /// filtration using flags
692 fn filt_by_bitwise_or_flags(&self, _states: &ReadStates) -> bool {
693 unimplemented!()
694 }
695 /// random filtration
696 fn filt_random_subset(&self, _fraction: F32Bw0and1) -> bool {
697 unimplemented!()
698 }
699 /// filtration by mapq
700 fn filt_by_mapq(&self, _min_mapq: u8, _exclude_mapq_unavail: bool) -> bool {
701 unimplemented!()
702 }
703 /// filtration by region
704 fn filt_by_region(&self, _region: &Bed3<i32, u64>, _full_region: bool) -> bool {
705 unimplemented!()
706 }
707}
708
709/// Trait that performs filtration on `rust_htslib` Record
710///
711/// Filter in action below, 100 bp read passed with a 20 bp filter
712/// but fails with a 120 bp filter
713/// ```
714/// use nanalogue_core::{BamPreFilt, InputBam, Error};
715/// use rust_htslib::bam::record;
716/// use rust_htslib::bam::record::{Cigar, CigarString};
717/// let mut bam_record = record::Record::new();
718/// bam_record.set(&vec![b'r',b'e',b'a',b'd'], Some(&CigarString(vec![Cigar::Match(100)])),
719/// &vec![ b'A' as u8; 100], &vec![255 as u8; 100]);
720/// let mut bam_opts = InputBam::default();
721/// bam_opts.min_seq_len = 20;
722/// assert_eq!(bam_record.pre_filt(&bam_opts), true);
723/// bam_opts.min_seq_len = 120;
724/// assert_eq!(bam_record.pre_filt(&bam_opts), false);
725/// # Ok::<(), Error>(())
726/// ```
727/// By default, zero-length records are excluded to avoid processing errors.
728/// ```
729/// # use nanalogue_core::{BamPreFilt, InputBam, Error};
730/// # use rust_htslib::bam::record;
731/// # use rust_htslib::bam::record::{Cigar, CigarString};
732/// let mut bam_record = record::Record::new();
733/// let mut bam_opts = InputBam::default();
734/// bam_opts.min_seq_len = 20;
735/// bam_opts.include_zero_len = false; // default behavior
736/// assert_eq!(bam_record.pre_filt(&bam_opts), false); // excluded
737/// # Ok::<(), Error>(())
738/// ```
739/// Zero-length records can be included if explicitly requested.
740/// ```
741/// # use nanalogue_core::{BamPreFilt, InputBam, Error};
742/// # use rust_htslib::bam::record;
743/// # use rust_htslib::bam::record::{Cigar, CigarString};
744/// let mut bam_record = record::Record::new();
745/// let mut bam_opts = InputBam::default();
746/// bam_opts.min_seq_len = 20;
747/// bam_opts.include_zero_len = true;
748/// assert_eq!(bam_record.pre_filt(&bam_opts), true); // included
749/// # Ok::<(), Error>(())
750/// ```
751impl BamPreFilt for bam::Record {
752 /// apply default filtration by read length
753 fn pre_filt(&self, bam_opts: &InputBam) -> bool {
754 self.filt_random_subset(bam_opts.sample_fraction)
755 & self.filt_by_len(bam_opts.min_seq_len, bam_opts.include_zero_len)
756 & self.filt_by_mapq(bam_opts.mapq_filter, bam_opts.exclude_mapq_unavail)
757 & {
758 if let Some(v) = bam_opts.read_id.as_ref() {
759 self.filt_by_read_id(v)
760 } else if let Some(read_id_set) = bam_opts.read_id_set.as_ref() {
761 self.filt_by_read_id_set(read_id_set)
762 } else {
763 true
764 }
765 }
766 & {
767 if let Some(v) = bam_opts.min_align_len {
768 self.filt_by_align_len(v)
769 } else {
770 true
771 }
772 }
773 & {
774 if let Some(v) = bam_opts.read_filter.as_ref() {
775 self.filt_by_bitwise_or_flags(v)
776 } else {
777 true
778 }
779 }
780 & {
781 if let Some(v) = bam_opts.region_filter().as_ref() {
782 self.filt_by_region(v, bam_opts.is_full_overlap())
783 } else {
784 true
785 }
786 }
787 }
788 /// filtration by read length
789 fn filt_by_len(&self, min_seq_len: u64, include_zero_len: bool) -> bool {
790 // self.len() returns a usize which we convert to u64
791 match (min_seq_len, self.len() as u64, include_zero_len) {
792 (_, 0, false) => false, // Exclude zero-length by default
793 (_, 0, true) => true, // Include zero-length when explicitly requested
794 (l_min, v, _) => v >= l_min,
795 }
796 }
797 /// filtration by alignment length
798 fn filt_by_align_len(&self, min_align_len: i64) -> bool {
799 !self.is_unmapped() && {
800 let ref_end = self.reference_end();
801 let ref_start = self.pos();
802 ref_end >= ref_start
803 && ref_start >= 0
804 && ref_end
805 .checked_sub(ref_start)
806 .expect("ref_end >= ref_start so overflow is impossible")
807 >= min_align_len
808 }
809 }
810 /// filtration by read id
811 fn filt_by_read_id(&self, read_id: &str) -> bool {
812 read_id.as_bytes() == self.name()
813 }
814 /// filtration by read id set
815 fn filt_by_read_id_set(&self, read_id_set: &HashSet<String>) -> bool {
816 if let Ok(name_str) = std::str::from_utf8(self.name()) {
817 read_id_set.contains(name_str)
818 } else {
819 false
820 }
821 }
822 /// filtration by flag list
823 fn filt_by_bitwise_or_flags(&self, states: &ReadStates) -> bool {
824 states.bam_flags().contains(&self.flags())
825 }
826 /// random filtration
827 fn filt_random_subset(&self, fraction: F32Bw0and1) -> bool {
828 match fraction.val() {
829 1.0 => true,
830 0.0 => false,
831 v => {
832 let random_number: f32 = random();
833 random_number < v
834 }
835 }
836 }
837 /// filtration by mapq
838 fn filt_by_mapq(&self, min_mapq: u8, exclude_mapq_unavail: bool) -> bool {
839 !(exclude_mapq_unavail && self.mapq() == 255) && self.mapq() >= min_mapq
840 }
841 /// filtration by region
842 fn filt_by_region(&self, region: &Bed3<i32, u64>, full_region: bool) -> bool {
843 !self.is_unmapped() && (self.tid() == *region.chr()) && {
844 let region_start = region.start();
845 let region_end = region.end();
846 let start: u64 = self.pos().try_into().expect("no error");
847 let end: u64 = self.reference_end().try_into().expect("no error");
848 if full_region {
849 (start..end).contains(®ion_start) && (start..=end).contains(®ion_end)
850 } else {
851 (start..end).intersects(&(region_start..region_end))
852 }
853 }
854 }
855}
856
857#[cfg(test)]
858mod mod_parse_tests {
859 use super::*;
860 use rust_htslib::bam::Read as _;
861
862 /// Tests if Mod BAM modification parsing is alright with fallback tag support.
863 /// Tests both MM/ML (standard uppercase) and Mm/Ml (fallback lowercase) tag variants.
864 /// Note: Only these specific variants are supported - fully lowercase (mm/ml) and
865 /// other mixed-case variants (e.g., mM/mL) are not recognized.
866 /// Some of the test cases here may be a repeat of the doctest above.
867 #[test]
868 #[expect(clippy::too_many_lines, reason = "Comprehensive integration test")]
869 fn mod_bam_parsing_from_example_1_compatible_variants() -> Result<(), Error> {
870 for file_path in ["examples/example_1.bam", "examples/example_1.Mm_Ml.bam"] {
871 let mut reader = nanalogue_bam_reader(file_path)?;
872 for (count, record) in reader.records().enumerate() {
873 let r = record?;
874 let BaseMods { base_mods: v } =
875 nanalogue_mm_ml_parser(&r, |&_| true, |&_| true, |&_, &_, &_| true, 0).unwrap();
876 match count {
877 0 => assert_eq!(
878 v,
879 vec![BaseMod {
880 modified_base: b'T',
881 strand: '+',
882 modification_type: 'T',
883 ranges: Ranges {
884 annotations: vec![
885 FiberAnnotation {
886 start: 0,
887 end: 1,
888 length: 1,
889 qual: 4,
890 reference_start: Some(9),
891 reference_end: Some(9),
892 reference_length: Some(0),
893 extra_columns: None,
894 },
895 FiberAnnotation {
896 start: 3,
897 end: 4,
898 length: 1,
899 qual: 7,
900 reference_start: Some(12),
901 reference_end: Some(12),
902 reference_length: Some(0),
903 extra_columns: None,
904 },
905 FiberAnnotation {
906 start: 4,
907 end: 5,
908 length: 1,
909 qual: 9,
910 reference_start: Some(13),
911 reference_end: Some(13),
912 reference_length: Some(0),
913 extra_columns: None,
914 },
915 FiberAnnotation {
916 start: 7,
917 end: 8,
918 length: 1,
919 qual: 6,
920 reference_start: Some(16),
921 reference_end: Some(16),
922 reference_length: Some(0),
923 extra_columns: None,
924 },
925 ],
926 seq_len: 8,
927 reverse: false,
928 },
929 record_is_reverse: false,
930 }]
931 ),
932 1 => assert_eq!(
933 v,
934 vec![BaseMod {
935 modified_base: b'T',
936 strand: '+',
937 modification_type: 'T',
938 ranges: Ranges {
939 annotations: vec![
940 FiberAnnotation {
941 start: 3,
942 end: 4,
943 length: 1,
944 qual: 221,
945 reference_start: Some(26),
946 reference_end: Some(26),
947 reference_length: Some(0),
948 extra_columns: None,
949 },
950 FiberAnnotation {
951 start: 8,
952 end: 9,
953 length: 1,
954 qual: 242,
955 reference_start: Some(31),
956 reference_end: Some(31),
957 reference_length: Some(0),
958 extra_columns: None,
959 },
960 FiberAnnotation {
961 start: 27,
962 end: 28,
963 length: 1,
964 qual: 3,
965 reference_start: Some(50),
966 reference_end: Some(50),
967 reference_length: Some(0),
968 extra_columns: None,
969 },
970 FiberAnnotation {
971 start: 39,
972 end: 40,
973 length: 1,
974 qual: 47,
975 reference_start: Some(62),
976 reference_end: Some(62),
977 reference_length: Some(0),
978 extra_columns: None,
979 },
980 FiberAnnotation {
981 start: 47,
982 end: 48,
983 length: 1,
984 qual: 239,
985 reference_start: Some(70),
986 reference_end: Some(70),
987 reference_length: Some(0),
988 extra_columns: None,
989 },
990 ],
991 seq_len: 48,
992 reverse: false,
993 },
994 record_is_reverse: false,
995 }]
996 ),
997 2 => assert_eq!(
998 v,
999 vec![BaseMod {
1000 modified_base: b'T',
1001 strand: '+',
1002 modification_type: 'T',
1003 ranges: Ranges {
1004 annotations: vec![
1005 FiberAnnotation {
1006 start: 12,
1007 end: 13,
1008 length: 1,
1009 qual: 3,
1010 reference_start: Some(15),
1011 reference_end: Some(15),
1012 reference_length: Some(0),
1013 extra_columns: None,
1014 },
1015 FiberAnnotation {
1016 start: 13,
1017 end: 14,
1018 length: 1,
1019 qual: 3,
1020 reference_start: Some(16),
1021 reference_end: Some(16),
1022 reference_length: Some(0),
1023 extra_columns: None,
1024 },
1025 FiberAnnotation {
1026 start: 16,
1027 end: 17,
1028 length: 1,
1029 qual: 4,
1030 reference_start: Some(19),
1031 reference_end: Some(19),
1032 reference_length: Some(0),
1033 extra_columns: None,
1034 },
1035 FiberAnnotation {
1036 start: 19,
1037 end: 20,
1038 length: 1,
1039 qual: 3,
1040 reference_start: Some(22),
1041 reference_end: Some(22),
1042 reference_length: Some(0),
1043 extra_columns: None,
1044 },
1045 FiberAnnotation {
1046 start: 20,
1047 end: 21,
1048 length: 1,
1049 qual: 182,
1050 reference_start: Some(23),
1051 reference_end: Some(23),
1052 reference_length: Some(0),
1053 extra_columns: None,
1054 },
1055 ],
1056 seq_len: 33,
1057 reverse: true,
1058 },
1059 record_is_reverse: true,
1060 }]
1061 ),
1062 3 => assert_eq!(
1063 v,
1064 vec![
1065 BaseMod {
1066 modified_base: b'G',
1067 strand: '-',
1068 modification_type: '\u{1C20}',
1069 ranges: Ranges {
1070 annotations: vec![
1071 FiberAnnotation {
1072 start: 28,
1073 end: 29,
1074 length: 1,
1075 qual: 0,
1076 reference_start: None,
1077 reference_end: None,
1078 reference_length: None,
1079 extra_columns: None,
1080 },
1081 FiberAnnotation {
1082 start: 29,
1083 end: 30,
1084 length: 1,
1085 qual: 0,
1086 reference_start: None,
1087 reference_end: None,
1088 reference_length: None,
1089 extra_columns: None,
1090 },
1091 FiberAnnotation {
1092 start: 30,
1093 end: 31,
1094 length: 1,
1095 qual: 0,
1096 reference_start: None,
1097 reference_end: None,
1098 reference_length: None,
1099 extra_columns: None,
1100 },
1101 FiberAnnotation {
1102 start: 32,
1103 end: 33,
1104 length: 1,
1105 qual: 0,
1106 reference_start: None,
1107 reference_end: None,
1108 reference_length: None,
1109 extra_columns: None,
1110 },
1111 FiberAnnotation {
1112 start: 43,
1113 end: 44,
1114 length: 1,
1115 qual: 77,
1116 reference_start: None,
1117 reference_end: None,
1118 reference_length: None,
1119 extra_columns: None,
1120 },
1121 FiberAnnotation {
1122 start: 44,
1123 end: 45,
1124 length: 1,
1125 qual: 0,
1126 reference_start: None,
1127 reference_end: None,
1128 reference_length: None,
1129 extra_columns: None,
1130 },
1131 ],
1132 seq_len: 48,
1133 reverse: false,
1134 },
1135 record_is_reverse: false,
1136 },
1137 BaseMod {
1138 modified_base: b'T',
1139 strand: '+',
1140 modification_type: 'T',
1141 ranges: Ranges {
1142 annotations: vec![
1143 FiberAnnotation {
1144 start: 3,
1145 end: 4,
1146 length: 1,
1147 qual: 221,
1148 reference_start: None,
1149 reference_end: None,
1150 reference_length: None,
1151 extra_columns: None,
1152 },
1153 FiberAnnotation {
1154 start: 8,
1155 end: 9,
1156 length: 1,
1157 qual: 242,
1158 reference_start: None,
1159 reference_end: None,
1160 reference_length: None,
1161 extra_columns: None,
1162 },
1163 FiberAnnotation {
1164 start: 27,
1165 end: 28,
1166 length: 1,
1167 qual: 0,
1168 reference_start: None,
1169 reference_end: None,
1170 reference_length: None,
1171 extra_columns: None,
1172 },
1173 FiberAnnotation {
1174 start: 39,
1175 end: 40,
1176 length: 1,
1177 qual: 47,
1178 reference_start: None,
1179 reference_end: None,
1180 reference_length: None,
1181 extra_columns: None,
1182 },
1183 FiberAnnotation {
1184 start: 47,
1185 end: 48,
1186 length: 1,
1187 qual: 239,
1188 reference_start: None,
1189 reference_end: None,
1190 reference_length: None,
1191 extra_columns: None,
1192 },
1193 ],
1194 seq_len: 48,
1195 reverse: false,
1196 },
1197 record_is_reverse: false,
1198 }
1199 ]
1200 ),
1201 _ => {}
1202 }
1203 }
1204 }
1205 Ok(())
1206 }
1207
1208 /// Tests that case variants that are not supported (mixed case or fully lowercase)
1209 /// correctly return empty `BaseMods`, since the parser should not find these tags.
1210 #[test]
1211 fn mod_bam_parsing_from_example_1_failure_variants() -> Result<(), Error> {
1212 for file_path in [
1213 "examples/example_1.mM_mL.bam",
1214 "examples/example_1.mm_ml.bam",
1215 ] {
1216 let mut reader = nanalogue_bam_reader(file_path)?;
1217 for record in reader.records() {
1218 let r = record?;
1219 let BaseMods { base_mods: v } =
1220 nanalogue_mm_ml_parser(&r, |&_| true, |&_| true, |&_, &_, &_| true, 0).unwrap();
1221 assert_eq!(v, vec![], "Expected empty BaseMods for file: {file_path}");
1222 }
1223 }
1224 Ok(())
1225 }
1226
1227 fn create_test_record_and_parse(mm_value: &str, ml_values: &[u8]) -> Result<BaseMods, Error> {
1228 let mut record = bam::Record::new();
1229
1230 // Create a sequence - ATCG repeated
1231 let seq_bytes = b"ATCGATCGATCGATCGATCG";
1232 let qname = b"test_read";
1233 let cigar = bam::record::CigarString::from(vec![bam::record::Cigar::Match(20)]);
1234 let quals = vec![30u8; 20]; // Quality scores of 30 for all bases
1235 record.set(qname, Some(&cigar), seq_bytes, &quals);
1236
1237 // Set MM tag
1238 record.push_aux(b"MM", Aux::String(mm_value))?;
1239
1240 // Set ML tag (modification probabilities)
1241 record.push_aux(b"ML", Aux::ArrayU8((&ml_values).into()))?;
1242
1243 // Call the parser and return result
1244 nanalogue_mm_ml_parser(
1245 &record,
1246 |&_| true, // Accept all probabilities
1247 |&_| true, // Accept all positions
1248 |&_, &_, &_| true, // Accept all base/strand/tag combinations
1249 0, // No quality threshold
1250 )
1251 }
1252
1253 #[test]
1254 #[should_panic(expected = "InvalidDuplicates")]
1255 fn nanalogue_mm_ml_parser_detects_duplicates() {
1256 // Two T+ modifications with same strand and modification_type
1257 let mm_value = "T+T,0,3;T+T,1,2;";
1258 let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8]);
1259 let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1260 }
1261
1262 #[test]
1263 fn nanalogue_mm_ml_parser_accepts_unique_combinations() {
1264 // T+, C+, T- (all different combinations)
1265 let mm_value = "T+T,0,3;C+m,0,1;T-T,1,2;";
1266 let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8, 120u8, 140u8]);
1267 let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1268 }
1269
1270 #[test]
1271 #[should_panic(expected = "InvalidModCoords")]
1272 fn nanalogue_mm_ml_parser_detects_invalid_mod_coords_1() {
1273 // Invalid coordinates: seq does not have 11 As (4 + 1 + 5 + 1)
1274 let mm_value = "T+T,0,3;A-T,4,5;";
1275 let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8]);
1276 let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1277 }
1278
1279 #[test]
1280 #[should_panic(expected = "InvalidModCoords")]
1281 fn nanalogue_mm_ml_parser_detects_invalid_mod_coords_2() {
1282 // Invalid coords: there aren't 11 C in the sequence
1283 let mm_value = "T+T,0,3;C+m,4,5;T-T,1,2;";
1284 let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8, 120u8, 140u8]);
1285 let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1286 }
1287
1288 #[test]
1289 #[should_panic(expected = "InvalidModProbs")]
1290 fn nanalogue_mm_ml_parser_detects_ml_tag_longer_than_mm_data() {
1291 // ML tag has more values than modifications in MM tag
1292 let mm_value = "T+T,0,3;"; // 2 modifications
1293 let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8]); // 4 values - too many!
1294 let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1295 }
1296
1297 #[test]
1298 #[should_panic(expected = "InvalidModProbs")]
1299 fn nanalogue_mm_ml_parser_detects_ml_tag_shorter_than_mm_data() {
1300 // ML tag has fewer values than modifications in MM tag
1301 let mm_value = "T+T,0,3;"; // 2 modifications
1302 let ml_values = Vec::from([100u8]); // 1 value - too few!
1303 let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1304 }
1305}
1306
1307#[cfg(test)]
1308mod zero_length_filtering_tests {
1309 use super::*;
1310 use rust_htslib::bam::Read as _;
1311
1312 #[test]
1313 fn zero_length_filtering_with_example_2_zero_len_sam() -> Result<(), Error> {
1314 // Test with include_zero_len = false and min_seq_len = 1 (should get 1 read)
1315 let mut reader = nanalogue_bam_reader("examples/example_2_zero_len.sam")?;
1316 let bam_opts_exclude_zero: InputBam =
1317 serde_json::from_str(r#"{"min_seq_len": 1}"#).unwrap();
1318
1319 let mut count_exclude_zero = 0;
1320 for record_result in reader.records() {
1321 let record = record_result?;
1322 if record.pre_filt(&bam_opts_exclude_zero) {
1323 count_exclude_zero += 1;
1324 }
1325 }
1326
1327 // Test with include_zero_len = true and min_seq_len = 1 (should get 2 reads)
1328 let mut reader_same_file = nanalogue_bam_reader("examples/example_2_zero_len.sam")?;
1329 let bam_opts_include_zero: InputBam =
1330 serde_json::from_str(r#"{"min_seq_len": 1, "include_zero_len": true}"#).unwrap();
1331
1332 let mut count_include_zero = 0;
1333 for record_result in reader_same_file.records() {
1334 let record = record_result?;
1335 if record.pre_filt(&bam_opts_include_zero) {
1336 count_include_zero += 1;
1337 }
1338 }
1339
1340 assert_eq!(count_exclude_zero, 1);
1341 assert_eq!(count_include_zero, 2);
1342
1343 Ok(())
1344 }
1345}
1346
1347#[cfg(test)]
1348mod invalid_seq_length_tests {
1349 use super::*;
1350 use rust_htslib::bam::Read as _;
1351
1352 #[test]
1353 fn invalid_seq_length_error_with_example_4() {
1354 let mut reader =
1355 nanalogue_bam_reader("examples/example_4_invalid_basequal_len.sam").unwrap();
1356
1357 let mut cnt = 0;
1358 for record_result in reader.records() {
1359 cnt += 1;
1360 let _: rust_htslib::errors::Error = record_result.unwrap_err();
1361 }
1362 assert_eq!(cnt, 1);
1363 }
1364}
1365
1366#[cfg(test)]
1367mod base_qual_filtering_tests {
1368 use super::*;
1369 use rust_htslib::bam::Read as _;
1370
1371 #[test]
1372 fn base_qual_filtering_with_example_5_valid_basequal_sam() -> Result<(), Error> {
1373 let mut reader = nanalogue_bam_reader("examples/example_5_valid_basequal.sam")?;
1374
1375 for record_result in reader.records() {
1376 let record = record_result?;
1377 let BaseMods { base_mods: v } =
1378 nanalogue_mm_ml_parser(&record, |&_| true, |&_| true, |&_, &_, &_| true, 60)
1379 .unwrap();
1380
1381 assert_eq!(v.len(), 1);
1382 let base_mod = v.first().expect("v has exactly 1 element");
1383 assert_eq!(base_mod.modified_base, b'T');
1384 assert_eq!(base_mod.strand, '+');
1385 assert_eq!(base_mod.modification_type, 'T');
1386 assert_eq!(base_mod.ranges.qual(), vec![7, 9]);
1387 assert_eq!(base_mod.ranges.starts(), vec![3, 4]);
1388 assert_eq!(base_mod.ranges.ends(), vec![4, 5]);
1389 }
1390
1391 Ok(())
1392 }
1393}
1394
1395#[cfg(test)]
1396mod bam_rc_record_tests {
1397 use super::*;
1398 use rand::random_range;
1399 use rust_htslib::bam::record;
1400 use rust_htslib::bam::record::{Cigar, CigarString};
1401
1402 #[test]
1403 fn bam_rc_records() {
1404 let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1405 let bam_rc_records = BamRcRecords::new(
1406 &mut reader,
1407 &mut InputBam::default(),
1408 &mut InputMods::<OptionalTag>::default(),
1409 )
1410 .unwrap();
1411 assert_eq!(bam_rc_records.header.tid(b"dummyI"), Some(0));
1412 assert_eq!(bam_rc_records.header.tid(b"dummyII"), Some(1));
1413 assert_eq!(bam_rc_records.header.tid(b"dummyIII"), Some(2));
1414 }
1415
1416 #[test]
1417 fn example_3_read_list_1() {
1418 // Test with example_3_subset_1 - should see 2 reads
1419 let mut reader = nanalogue_bam_reader("examples/example_3.bam").unwrap();
1420 let mut bam_opts: InputBam =
1421 serde_json::from_str(r#"{"read_id_list": "examples/example_3_subset_1"}"#).unwrap();
1422
1423 let bam_rc_records = BamRcRecords::new(
1424 &mut reader,
1425 &mut bam_opts,
1426 &mut InputMods::<OptionalTag>::default(),
1427 )
1428 .unwrap();
1429
1430 // Count lines in the read ID file
1431 let file = File::open("examples/example_3_subset_1").unwrap();
1432 let reader_file = BufReader::new(file);
1433 let mut line_count = 0;
1434 for raw_line in reader_file.lines() {
1435 let temp_line = raw_line.unwrap();
1436 let line = temp_line.trim();
1437 if !line.is_empty() && !line.starts_with('#') {
1438 line_count += 1;
1439 }
1440 }
1441 assert_eq!(line_count, 2);
1442
1443 let mut count = 0;
1444 for record_result in bam_rc_records.rc_records {
1445 let record = record_result.unwrap();
1446 if record.pre_filt(&bam_opts) {
1447 count += 1;
1448 }
1449 }
1450 assert_eq!(count, 2);
1451 }
1452
1453 #[test]
1454 fn example_3_read_list_2() {
1455 // Test with example_3_subset_w_invalid - should see 2 reads
1456 // even though file contains 3 read IDs
1457 let mut reader = nanalogue_bam_reader("examples/example_3.bam").unwrap();
1458 let mut bam_opts: InputBam =
1459 serde_json::from_str(r#"{"read_id_list": "examples/example_3_subset_w_invalid"}"#)
1460 .unwrap();
1461 let bam_rc_records = BamRcRecords::new(
1462 &mut reader,
1463 &mut bam_opts,
1464 &mut InputMods::<OptionalTag>::default(),
1465 )
1466 .unwrap();
1467
1468 // Count lines in the read ID file
1469 let file = File::open("examples/example_3_subset_w_invalid").unwrap();
1470 let reader_file = BufReader::new(file);
1471 let mut line_count = 0;
1472 for raw_line in reader_file.lines() {
1473 let temp_line = raw_line.unwrap();
1474 let line = temp_line.trim();
1475 if !line.is_empty() && !line.starts_with('#') {
1476 line_count += 1;
1477 }
1478 }
1479 assert_eq!(line_count, 3);
1480
1481 let mut count = 0;
1482 for record_result in bam_rc_records.rc_records {
1483 let record = record_result.unwrap();
1484 if record.pre_filt(&bam_opts) {
1485 count += 1;
1486 }
1487 }
1488 assert_eq!(count, 2);
1489 }
1490
1491 #[test]
1492 fn random_retrieval() {
1493 let mut count_retained = 0;
1494 let bam_opts: InputBam =
1495 serde_json::from_str(r#"{"sample_fraction": 0.5, "include_zero_len": true}"#).unwrap();
1496
1497 for _ in 0..10000 {
1498 let record = record::Record::new();
1499 if record.pre_filt(&bam_opts) {
1500 count_retained += 1;
1501 }
1502 }
1503
1504 // 50% retention rate => 5000 reads, so we test if 4500-5500 reads come through
1505 // (this is quite lax actually)
1506 assert!((4500..=5500).contains(&count_retained));
1507 }
1508
1509 #[test]
1510 fn single_read_id_filtering() {
1511 let mut count_retained = 0;
1512
1513 let bam_opts: InputBam =
1514 serde_json::from_str(r#"{"read_id": "read2", "include_zero_len": true}"#).unwrap();
1515
1516 for i in 1..=1000 {
1517 let mut record = record::Record::new();
1518 let read_id = format!("read{i}");
1519 record.set_qname(read_id.as_bytes());
1520
1521 if record.pre_filt(&bam_opts) {
1522 count_retained += 1;
1523 }
1524 }
1525
1526 assert_eq!(count_retained, 1);
1527 }
1528
1529 #[test]
1530 fn seq_and_align_len_filtering() {
1531 let bam_opts_min_len: InputBam = serde_json::from_str(r#"{"min_seq_len": 5000}"#).unwrap();
1532 let bam_opts_min_align_len: InputBam =
1533 serde_json::from_str(r#"{"min_align_len": 2500}"#).unwrap();
1534
1535 let mut count_retained = (0, 0);
1536
1537 for _ in 1..=10000 {
1538 let mut record = record::Record::new();
1539 let seq_len: usize = random_range(2..=10000);
1540 let match_len = seq_len / 2;
1541 let hard_clip_len = seq_len - match_len;
1542
1543 record.set(
1544 b"read",
1545 Some(&CigarString(vec![
1546 Cigar::Match(u32::try_from(match_len).unwrap()),
1547 Cigar::HardClip(u32::try_from(hard_clip_len).unwrap()),
1548 ])),
1549 &vec![b'A'; seq_len],
1550 &vec![50; seq_len],
1551 );
1552 record.unset_flags();
1553 record.set_flags(loop {
1554 let random_state: ReadState = random();
1555 match random_state {
1556 ReadState::Unmapped => {}
1557 v @ (ReadState::PrimaryFwd
1558 | ReadState::PrimaryRev
1559 | ReadState::SecondaryFwd
1560 | ReadState::SecondaryRev
1561 | ReadState::SupplementaryFwd
1562 | ReadState::SupplementaryRev) => break u16::from(v),
1563 }
1564 });
1565 if record.pre_filt(&bam_opts_min_align_len) {
1566 count_retained.0 += 1;
1567 }
1568 if record.pre_filt(&bam_opts_min_len) {
1569 count_retained.1 += 1;
1570 }
1571 }
1572
1573 assert!(count_retained.0 >= 4500 && count_retained.0 <= 5500);
1574 assert!(count_retained.1 >= 4500 && count_retained.1 <= 5500);
1575 }
1576
1577 #[test]
1578 fn filt_by_bitwise_or_flags() {
1579 // Create random subset of read states (ensure at least one)
1580 let selected_states = {
1581 let mut selected_states = Vec::new();
1582 let all_states = vec!["0", "4", "16", "256", "272", "2048", "2064"];
1583 for state in &all_states {
1584 if random::<f32>() < 0.5 {
1585 selected_states.push(*state);
1586 }
1587 }
1588 if selected_states.is_empty() {
1589 let random_idx = random_range(0..all_states.len());
1590 selected_states.push(
1591 *all_states
1592 .get(random_idx)
1593 .expect("random_idx is within all_states range"),
1594 );
1595 }
1596 selected_states
1597 };
1598
1599 let states_string = selected_states.join(",");
1600 let bam_opts: InputBam = serde_json::from_str(
1601 format!("{{\"read_filter\": [{states_string}], \"include_zero_len\": true}}").as_str(),
1602 )
1603 .unwrap();
1604
1605 let mut count_retained = 0;
1606
1607 for _ in 1..=70000 {
1608 let mut record = record::Record::new();
1609 record.unset_flags();
1610 record.set_flags({
1611 let random_state: ReadState = random();
1612 u16::from(random_state)
1613 });
1614 if record.pre_filt(&bam_opts) {
1615 count_retained += 1;
1616 }
1617 }
1618
1619 let expected_count = selected_states.len() * 10000;
1620 let tolerance = expected_count.div_ceil(5);
1621 let min_count = expected_count.saturating_sub(tolerance);
1622 let max_count = expected_count + tolerance;
1623
1624 assert!(count_retained >= min_count && count_retained <= max_count);
1625 }
1626
1627 #[test]
1628 fn filt_by_region() {
1629 let mut count_retained = (0, 0);
1630
1631 for _ in 1..=10000 {
1632 let mut record = record::Record::new();
1633
1634 // Draw four numbers from 0 to 10000
1635 let four_nums = [
1636 random_range(0..=10000),
1637 random_range(0..=10000),
1638 random_range(0..=10000),
1639 random_range(0..=10000u64),
1640 ];
1641
1642 // First two for record
1643 let mut pos_nums = [four_nums[0], four_nums[1]];
1644 pos_nums.sort_unstable();
1645 let start_pos = pos_nums[0];
1646 let end_pos = pos_nums[1];
1647
1648 // Use the last two to set up region, with a random contig chosen
1649 // from a set of two.
1650 let region_tid = i32::from(random::<bool>());
1651 let mut region_nums = [four_nums[2], four_nums[3]];
1652 region_nums.sort_unstable();
1653 let region_start = region_nums[0];
1654 let region_end = region_nums[1];
1655 let bam_opts_no_full_region: InputBam = serde_json::from_str(
1656 format!("{{\"region_bed3\": [{region_tid}, {region_start}, {region_end}]}}")
1657 .as_str(),
1658 )
1659 .unwrap();
1660 let bam_opts_full_region: InputBam = serde_json::from_str(
1661 format!("{{\"full_region\": true, \"region_bed3\": [{region_tid}, {region_start}, {region_end}]}}")
1662 .as_str(),
1663 )
1664 .unwrap();
1665
1666 // Calculate sequence length (ensure at least 1)
1667 let seq_len = if end_pos == start_pos {
1668 1
1669 } else {
1670 end_pos - start_pos
1671 };
1672
1673 // Set sequence details first
1674 record.set(
1675 b"read",
1676 Some(&CigarString(vec![Cigar::Match(
1677 u32::try_from(seq_len).unwrap(),
1678 )])),
1679 &vec![b'A'; usize::try_from(seq_len).unwrap()],
1680 &vec![255; usize::try_from(seq_len).unwrap()],
1681 );
1682
1683 // Set tid and position, contig chosen from a random set of two.
1684 let tid = i32::from(random::<bool>());
1685 record.set_tid(tid);
1686 record.set_pos(i64::try_from(start_pos).unwrap());
1687
1688 // Verify reference_end calculation
1689 let expected_ref_end = i64::try_from(start_pos + seq_len).unwrap();
1690 assert_eq!(record.reference_end(), expected_ref_end);
1691
1692 if record.pre_filt(&bam_opts_no_full_region) {
1693 count_retained.0 += 1;
1694 }
1695 if record.pre_filt(&bam_opts_full_region) {
1696 count_retained.1 += 1;
1697 }
1698 }
1699
1700 // the chance that two randomly chosen intervals on the interval [0, l]
1701 // intersect is 2/3, and then we have the added random element of one
1702 // of two contigs, so the probability is 2/3 * 1/2 = 1/3.
1703 let expected_count = 10000usize.div_ceil(3); // ~3333
1704 let tolerance = expected_count.div_ceil(5);
1705 let min_count = expected_count.saturating_sub(tolerance);
1706 let max_count = expected_count + tolerance;
1707 assert!(count_retained.0 >= min_count && count_retained.0 <= max_count);
1708
1709 // the chance that two randomly chosen intervals are such that one is
1710 // contained within the other is 1/3, and we are looking for the region
1711 // being contained completely within the read and NOT the read being contained
1712 // within the region, so the probability is 1/6. This is one fourth of the
1713 // 2/3 used above. So the same criterion will work with the second count quadrupled.
1714 assert!(4 * count_retained.1 >= min_count && 4 * count_retained.1 <= max_count);
1715 }
1716
1717 #[test]
1718 #[should_panic(expected = "InvalidState")]
1719 fn bam_rc_records_conflicting_read_id_set_and_list() {
1720 let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1721 let mut read_id_set = HashSet::new();
1722 let _: bool = read_id_set.insert("some-read".to_owned());
1723
1724 // Build InputBam with read_id_list, then manually set read_id_set to create conflict
1725 let mut bam_opts = InputBamBuilder::default()
1726 .bam_path(PathOrURLOrStdin::Path("examples/example_1.bam".into()))
1727 .read_id_list("examples/example_3_subset_1")
1728 .build()
1729 .unwrap();
1730
1731 // Manually set read_id_set to create the forbidden (true, true) state
1732 bam_opts.read_id_set = Some(read_id_set);
1733
1734 let _: BamRcRecords<_> = BamRcRecords::new(
1735 &mut reader,
1736 &mut bam_opts,
1737 &mut InputMods::<OptionalTag>::default(),
1738 )
1739 .unwrap();
1740 }
1741
1742 #[test]
1743 fn bam_rc_records_read_id_list_none_read_id_set_none() {
1744 // Test (false, false) case: both None, both should remain None
1745 let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1746 let mut bam_opts = InputBamBuilder::default()
1747 .bam_path(PathOrURLOrStdin::Path("examples/example_1.bam".into()))
1748 .build()
1749 .unwrap();
1750
1751 assert!(bam_opts.read_id_list.is_none());
1752 assert!(bam_opts.read_id_set.is_none());
1753
1754 let _: BamRcRecords<_> = BamRcRecords::new(
1755 &mut reader,
1756 &mut bam_opts,
1757 &mut InputMods::<OptionalTag>::default(),
1758 )
1759 .unwrap();
1760
1761 // Both should still be None
1762 assert!(bam_opts.read_id_list.is_none());
1763 assert!(bam_opts.read_id_set.is_none());
1764 }
1765
1766 #[test]
1767 fn bam_rc_records_read_id_list_none_read_id_set_some() {
1768 // Test (true, false) case: read_id_set is Some, read_id_list is None
1769 // read_id_set should remain unchanged
1770 let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1771 let mut read_id_set = HashSet::new();
1772 let _: bool = read_id_set.insert("read1".to_owned());
1773 let _: bool = read_id_set.insert("read2".to_owned());
1774
1775 let mut bam_opts = InputBamBuilder::default()
1776 .bam_path(PathOrURLOrStdin::Path("examples/example_1.bam".into()))
1777 .read_id_set(read_id_set.clone())
1778 .build()
1779 .unwrap();
1780
1781 assert!(bam_opts.read_id_list.is_none());
1782 assert!(bam_opts.read_id_set.is_some());
1783
1784 let _: BamRcRecords<_> = BamRcRecords::new(
1785 &mut reader,
1786 &mut bam_opts,
1787 &mut InputMods::<OptionalTag>::default(),
1788 )
1789 .unwrap();
1790
1791 // read_id_set should remain unchanged
1792 assert!(bam_opts.read_id_list.is_none());
1793 assert_eq!(bam_opts.read_id_set, Some(read_id_set));
1794 }
1795}