nanalogue_core/
lib.rs

1//! # Nanalogue Core
2//!
3//! ## Introduction
4//!
5//! Nanalogue = *N*ucleic Acid *Analogue*.
6//! Nanalogue is a tool to parse or analyse BAM/Mod BAM files with a single-molecule focus.
7//!
8//! [![Cargo Build & Test](https://github.com/DNAReplicationLab/nanalogue/actions/workflows/ci.yml/badge.svg)](https://github.com/DNAReplicationLab/nanalogue/actions/workflows/ci.yml)
9//! [![Code test coverage > 92\%](https://github.com/DNAReplicationLab/nanalogue/actions/workflows/cargo-llvm-cov.yml/badge.svg)](https://github.com/DNAReplicationLab/nanalogue/actions/workflows/cargo-llvm-cov.yml)
10//! [![crates.io](https://img.shields.io/crates/v/nanalogue.svg)](https://crates.io/crates/nanalogue)
11//! [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
12//!
13//! A common pain point in genomics analyses is that BAM files are information-dense
14//! which makes it difficult to gain insight from them. Nanalogue hopes to make it easy
15//! to extract and process this information, with a particular focus on single-molecule
16//! aspects and DNA/RNA modifications. Despite this focus, some of nanalogue's commands
17//! and functions are quite general and can be applied to almost any BAM file.
18//!
19//! We process and calculate data associated with DNA/RNA molecules, their alignments to
20//! reference genomes, modification information on them, and other miscellaneous
21//! information.  We can process any type of DNA/RNA modifications occuring in any pattern
22//! (single/multiple mods, spatially-isolated/non-isolated etc.). All we require is that
23//! the data is stored in a BAM file in the mod BAM format (i.e. using `MM/ML` tags as
24//! laid down in the [specifications](https://samtools.github.io/hts-specs/SAMtags.pdf)).
25//!
26//! Nanalogue is both an executable that can be run from the command line and a library
27//! whose functionality can be used by others writing rust code. The library's functions
28//! are presented here. The executable exposes the modules `subcommands::*`, and a
29//! separate executable `nanalogue_sim_bam` exposes the `simulate_mod_bam` functionality
30//! (see below).
31//!
32//! For developers: if you are looking to make a custom BAM file containing synthetic, simulated
33//! DNA/RNA modification data to develop/test your tool, you may be interested in `nanalogue_sim_bam`.
34//! This is an executable that ships with nanalogue that can create a BAM file according to your
35//! specifications. Please run `nanalogue_sim_bam --help`. If you are a rust developer looking
36//! to use this functionality in your library, please look at the documentation of the module
37//! [`crate::simulate_mod_bam`].
38//!
39//! This documentation is supplemented by a companion [cookbook](https://www.nanalogue.com).
40//!
41//! ## Sample code
42//!
43//! The [`InputBam`] and the [`InputMods`] structs allow us to set input options
44//! for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
45//! command is called to process data from a BAM file with some input options.
46//!
47//! ```
48//! use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
49//!     OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
50//!
51//! let mut bam = InputBamBuilder::default()
52//!     .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
53//!     .region("dummyI".into())
54//!     .build()?;
55//! let mut mods = InputModsBuilder::<OptionalTag>::default()
56//!     .mod_prob_filter(ThresholdState::GtEq(0))
57//!     .build()?;
58//!
59//! let mut buffer = Vec::new();
60//! let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
61//! let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
62//! read_info::run(
63//!     &mut buffer,
64//!     bam_rc_records.rc_records
65//!         .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
66//!     mods,
67//!     None,
68//! )?;
69//! assert!(str::from_utf8(buffer.as_slice())?
70//!     .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
71//! # Ok::<(), Error>(())
72//! ```
73//!
74//! If you want to write custom functionality yourself, please familiarize yourself with the
75//! [`crate::read_utils::CurrRead`] struct. This is the centerpiece of our library, which receives
76//! BAM record data, processes the DNA/RNA modification information amongst other pieces of information,
77//! and exposes them for downstream usage.
78
79use bedrs::{Bed3, Coordinates as _};
80use bio::alphabets::dna::revcomp;
81use bio_types::sequence::SequenceRead as _;
82use fibertools_rs::utils::{
83    bamannotations::{FiberAnnotation, Ranges},
84    basemods::{BaseMod, BaseMods},
85    bio_io::{convert_seq_uppercase, get_u8_tag},
86};
87use rand::random;
88use regex::Regex;
89use rust_htslib::{bam, bam::ext::BamRecordExtensions as _, bam::record::Aux, tpool};
90use std::collections::HashSet;
91use std::fs::File;
92use std::io::{BufRead as _, BufReader};
93use std::sync::{LazyLock, Once};
94
95// Declare the modules.
96pub mod analysis;
97pub mod cli;
98pub mod commands;
99pub mod error;
100pub mod file_utils;
101pub mod read_utils;
102pub mod simulate_mod_bam;
103pub mod subcommands;
104pub mod utils;
105
106// Re-exports
107pub use cli::{
108    InputBam, InputBamBuilder, InputModOptions, InputMods, InputModsBuilder, InputRegionOptions,
109    InputWindowing, InputWindowingBuilder, OptionalTag, RequiredTag, SeqDisplayOptions,
110};
111pub use error::Error;
112pub use file_utils::{
113    nanalogue_bam_reader, nanalogue_bam_reader_from_stdin, nanalogue_bam_reader_from_url,
114    nanalogue_indexed_bam_reader, nanalogue_indexed_bam_reader_from_url, write_bam_denovo,
115    write_fasta,
116};
117pub use read_utils::{
118    AlignmentInfoBuilder, CurrRead, CurrReadBuilder, ModTableEntryBuilder, curr_reads_to_dataframe,
119};
120pub use simulate_mod_bam::SimulationConfig;
121pub use subcommands::{
122    find_modified_reads, peek, read_info, read_stats, reads_table, window_reads,
123};
124pub use utils::{
125    AllowedAGCTN, Contains, DNARestrictive, F32AbsValAtMost1, F32Bw0and1, FilterByRefCoords,
126    GenomicRegion, GetDNARestrictive, Intersects, ModChar, OrdPair, PathOrURLOrStdin, ReadState,
127    ReadStates, RestrictModCalledStrand, SeqCoordCalls, ThresholdState,
128};
129
130/// Static initialization guard for SSL certificate configuration.
131static SSL_INIT: Once = Once::new();
132
133/// Initialize SSL certificate paths for HTTPS support.
134///
135/// This function automatically detects and configures SSL certificate locations
136/// to enable HTTPS file access through libcurl. It uses the `openssl-probe`
137/// crate to find system CA certificate bundles.
138///
139/// # Usage
140///
141/// ```
142/// use nanalogue_core::init_ssl_certificates;
143/// init_ssl_certificates();
144/// ```
145///
146/// # Why This Is Needed
147///
148/// The statically-compiled libcurl in rust-htslib doesn't know where to find
149/// the system's CA certificate bundle for SSL certificate verification. This
150/// function sets the appropriate environment variables to point to the system's
151/// certificate bundle, enabling secure HTTPS connections.
152pub fn init_ssl_certificates() {
153    SSL_INIT.call_once(|| {
154        // SAFETY: This function probes for SSL certificates and sets environment
155        // variables. We're setting these before any SSL/TLS
156        // operations happen, which is the safe usage pattern.
157        unsafe {
158            if openssl_probe::try_init_openssl_env_vars() {
159                // Also set CURL_CA_BUNDLE if SSL_CERT_FILE was set by openssl-probe.
160                // The statically-compiled libcurl in rust-htslib specifically checks
161                // CURL_CA_BUNDLE, not just SSL_CERT_FILE.
162                if let Ok(cert_file) = std::env::var("SSL_CERT_FILE")
163                    && std::env::var("CURL_CA_BUNDLE").is_err()
164                {
165                    std::env::set_var("CURL_CA_BUNDLE", cert_file);
166                }
167            }
168        }
169    });
170}
171
172/// Extracts mod information from BAM record to the `fibertools-rs` `BaseMods` Struct.
173///
174/// We are copying and modifying code from the fibertools-rs repository
175/// (<https://github.com/fiberseq/fibertools-rs>) which is under the MIT license
176/// (please see their Cargo.toml) to create this function.
177///
178/// # Tag Variant Support
179///
180/// We support MM/ML (standard) and Mm/Ml (fallback) tag variants, but no other variants.
181/// The specification recommends MM/ML, but we support the Mm/Ml variant as some sequencing
182/// technologies use this capitalization. Other mixed-case variants (e.g., mM/mL) and fully
183/// lowercase variants (mm/ml) are not recognized.
184///
185/// Function should cover almost all mod bam cases, but will fail in the following scenarios:
186/// - If multiple mods are present on the same base e.g. methylation and hydroxymethylation,
187///   most BAM files the author has come across use the notation MM:Z:C+m,...;C+h,...;,
188///   which this function can parse. But the notation MM:Z:C+mh,...; is also allowed.
189///   We do not parse this for now, please contribute code if you want to add this functionality!
190/// - We do not know any technology/technique which, for instance, replaces both C and T
191///   with `BrdU` i.e. two different bases have the same substitution. This also leads to failure.
192///   If you deal with this, please notify us! On the other hand, it is possible that both C
193///   and A are methylated e.g. 5-Methylcytosine and 6-Methyladenine. Our program can deal
194///   with this as the "tags" corresponding to these are 'm' and 'a' respectively i.e. the
195///   tags are different. For this failure mode, the modifications have to be identical,
196///   not just conceptually identical i.e. although 5-Methylcytosine and 6-Methyladenine
197///   fall under "methylation", they are not chemically identical and thus have different
198///   tags associated with them in the mod BAM format, and thus will not lead to failure.
199///
200/// Following is an example of reading and parsing modification data, with
201/// no filters applied.
202/// We are using an example mod BAM file which has very short reads and very few
203/// modified positions, and just examining two reads in it below.
204/// ```
205/// use nanalogue_core::{Error, nanalogue_bam_reader, nanalogue_mm_ml_parser};
206/// use rust_htslib::bam::Read;
207/// use fibertools_rs::utils::basemods::{BaseMods, BaseMod};
208/// use fibertools_rs::utils::bamannotations::{FiberAnnotation, Ranges};
209/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
210/// let mut count = 0;
211/// for record in reader.records(){
212///     let r = record?;
213///     let Ok(BaseMods{base_mods: v}) = nanalogue_mm_ml_parser(&r, |&_| true, |&_| true,
214///         |&_, &_, &_| true, 0) else { unreachable!() };
215///     match count {
216///     0 => assert_eq!(v, vec![BaseMod{
217///             modified_base: b'T',
218///             strand: '+',
219///             modification_type: 'T',
220///             ranges: Ranges {
221///                 annotations: vec![
222///                     FiberAnnotation {
223///                         start: 0, end: 1, length: 1, qual: 4,
224///                         reference_start: Some(9), reference_end: Some(9),
225///                         reference_length: Some(0), extra_columns: None,
226///                     },
227///                     FiberAnnotation {
228///                         start: 3, end: 4, length: 1, qual: 7,
229///                         reference_start: Some(12), reference_end: Some(12),
230///                         reference_length: Some(0), extra_columns: None,
231///                     },
232///                     FiberAnnotation {
233///                         start: 4, end: 5, length: 1, qual: 9,
234///                         reference_start: Some(13), reference_end: Some(13),
235///                         reference_length: Some(0), extra_columns: None,
236///                     },
237///                     FiberAnnotation {
238///                         start: 7, end: 8, length: 1, qual: 6,
239///                         reference_start: Some(16), reference_end: Some(16),
240///                         reference_length: Some(0), extra_columns: None,
241///                     },
242///                 ],
243///                 seq_len: 8,
244///                 reverse: false,
245///             },
246///             record_is_reverse: false,
247///     }]),
248///     2 => assert_eq!(v, vec![BaseMod{
249///             modified_base: b'T',
250///             strand: '+',
251///             modification_type: 'T',
252///             ranges: Ranges {
253///                 annotations: vec![
254///                     FiberAnnotation {
255///                         start: 12, end: 13, length: 1, qual: 3,
256///                         reference_start: Some(15), reference_end: Some(15),
257///                         reference_length: Some(0), extra_columns: None,
258///                     },
259///                     FiberAnnotation {
260///                         start: 13, end: 14, length: 1, qual: 3,
261///                         reference_start: Some(16), reference_end: Some(16),
262///                         reference_length: Some(0), extra_columns: None,
263///                     },
264///                     FiberAnnotation {
265///                         start: 16, end: 17, length: 1, qual: 4,
266///                         reference_start: Some(19), reference_end: Some(19),
267///                         reference_length: Some(0), extra_columns: None,
268///                     },
269///                     FiberAnnotation {
270///                         start: 19, end: 20, length: 1, qual: 3,
271///                         reference_start: Some(22), reference_end: Some(22),
272///                         reference_length: Some(0), extra_columns: None,
273///                     },
274///                     FiberAnnotation {
275///                         start: 20, end: 21, length: 1, qual: 182,
276///                         reference_start: Some(23), reference_end: Some(23),
277///                         reference_length: Some(0), extra_columns: None,
278///                     },
279///                 ],
280///                 seq_len: 33,
281///                 reverse: true,
282///             },
283///             record_is_reverse: true,
284///     }]),
285///     _ => {},
286///     }
287///     count = count + 1;
288/// }
289/// # Ok::<(), Error>(())
290/// ```
291///
292/// # Errors
293/// If MM/ML BAM tags are malformed, you will get `InvalidModProbs` or `InvalidModCoords`.
294/// Most integer overflows are dealt with using `except`, except one which gives `ArithmeticError`.
295/// `InvalidDuplicates` occurs if the same tag, strand combination occurs many times.
296/// Please read the function documentation above as well, which explains some scenarios where
297/// even valid tags can be marked as malformed.
298///
299#[expect(clippy::too_many_lines, reason = "Complex BAM MM/ML tag parsing logic")]
300#[expect(
301    clippy::missing_panics_doc,
302    reason = "either impossible scenarios or integer overflows \
303which are also unlikely as genomic coordinates are much less than ~2^63"
304)]
305pub fn nanalogue_mm_ml_parser<F, G, H>(
306    record: &bam::Record,
307    filter_mod_prob: F,
308    filter_mod_pos: G,
309    filter_mod_base_strand_tag: H,
310    min_qual: u8,
311) -> Result<BaseMods, Error>
312where
313    F: Fn(&u8) -> bool,
314    G: Fn(&usize) -> bool,
315    H: Fn(&u8, &char, &ModChar) -> bool,
316{
317    // Regular expression for matching modification data in the MM tag
318    static MM_RE: LazyLock<Regex> = LazyLock::new(|| {
319        Regex::new(r"((([ACGTUN])([-+])([A-Za-z]+|[0-9]+)([.?]?))((,[0-9]+)*;)*)")
320            .expect("no error")
321    });
322    // Array to store all the different modifications within the MM tag
323    let mut rtn: Vec<BaseMod> = Vec::new();
324
325    let ml_tag: Vec<u8> = if record.aux(b"ML").is_ok() {
326        get_u8_tag(record, b"ML")
327    } else {
328        get_u8_tag(record, b"Ml")
329    };
330
331    let mut num_mods_seen: usize = 0;
332
333    let is_reverse = record.is_reverse();
334
335    // if there is an MM tag iterate over all the regex matches
336    if let Ok(Aux::String(mm_text)) = record.aux(b"MM").or_else(|_| record.aux(b"Mm")) {
337        // base qualities; must reverse if rev comp.
338        // NOTE this is always equal to number of bases in the sequence, otherwise
339        // rust_htslib will throw an error, so we don't have to check this.
340        let base_qual: Vec<u8> = match (min_qual, is_reverse) {
341            (0, _) => Vec::new(),
342            (_, true) => record.qual().iter().rev().copied().collect(),
343            (_, false) => record.qual().to_vec(),
344        };
345
346        // get forward sequence bases from the bam record
347        let forward_bases = {
348            let seq = convert_seq_uppercase(record.seq().as_bytes());
349            if is_reverse { revcomp(seq) } else { seq }
350        };
351
352        let seq_len = forward_bases.len();
353
354        let pos_map = {
355            let temp: Vec<Option<i64>> = {
356                if record.is_unmapped() {
357                    std::iter::repeat_n(None, seq_len).collect()
358                } else {
359                    record
360                        .aligned_pairs_full()
361                        .filter(|x| x[0].is_some())
362                        .map(|x| x[1])
363                        .collect()
364                }
365            };
366            if temp.len() == seq_len {
367                temp
368            } else {
369                return Err(Error::InvalidState(format!(
370                    "rust_htslib failure! seq coordinates malformed {} != {}",
371                    temp.len(),
372                    seq_len
373                )));
374            }
375        };
376
377        for cap in MM_RE.captures_iter(mm_text) {
378            let mod_base = cap
379                .get(3)
380                .map(|m| {
381                    *m.as_str()
382                        .as_bytes()
383                        .first()
384                        .expect("regex match guaranteed to have at least 1 byte")
385                })
386                .expect("no error");
387            let mod_strand = cap
388                .get(4)
389                .map_or("", |m| m.as_str())
390                .chars()
391                .next()
392                .expect("no error");
393
394            // get modification type
395            let modification_type: ModChar = cap
396                .get(5)
397                .map_or("", |m| m.as_str())
398                .parse()
399                .expect("no error");
400
401            let is_implicit = match cap.get(6).map_or("", |m| m.as_str()).as_bytes() {
402                b"" | b"." => true,
403                b"?" => false,
404                _ => unreachable!("our regex expression must have prevented other possibilities"),
405            };
406            let mod_dists_str = cap.get(7).map_or("", |m| m.as_str());
407            // parse the string containing distances between modifications into a vector of i64
408
409            let mod_dists: Vec<usize> = mod_dists_str
410                .trim_end_matches(';')
411                .split(',')
412                .map(str::trim)
413                .filter(|s| !s.is_empty())
414                .map(|s| s.parse().unwrap())
415                .collect();
416
417            // do we include bases with zero probabilities?
418            let is_include_zero_prob = filter_mod_prob(&0);
419
420            // find real positions in the forward sequence
421            let mut cur_mod_idx: usize = 0;
422            let mut dist_from_last_mod_base: usize = 0;
423
424            // declare vectors with an approximate with_capacity
425            let (mut modified_positions, mut modified_probabilities) = {
426                #[expect(
427                    clippy::arithmetic_side_effects,
428                    reason = "in rare chance of overflow, vectors are initially missized but will be resized anyway \
429as they are populated. This will result in a small performance hit but we are ok as this will probably never happen \
430when usize is 64-bit as genomic sequences are not that long"
431                )]
432                let mod_data_len_approx = if is_implicit {
433                    // In implicit mode, there may be any number of bases
434                    // after the MM data is over, which must be assumed as unmodified.
435                    // So we cannot know the exact length of the data before actually
436                    // parsing it, and this is just a lower bound of the length.
437                    mod_dists.len() + mod_dists.iter().sum::<usize>()
438                } else {
439                    mod_dists.len()
440                };
441                (
442                    Vec::<usize>::with_capacity(mod_data_len_approx),
443                    Vec::<u8>::with_capacity(mod_data_len_approx),
444                )
445            };
446
447            #[expect(
448                clippy::arithmetic_side_effects,
449                reason = "one counter is checked for overflow and the other is incremented only when below a ceiling"
450            )]
451            for (cur_seq_idx, &_) in forward_bases
452                .iter()
453                .enumerate()
454                .filter(|&(_, &k)| mod_base == b'N' || k == mod_base)
455            {
456                let is_seq_pos_pass: bool = filter_mod_pos(&cur_seq_idx)
457                    && (min_qual > 0).then(|| {
458                        base_qual
459                            .get(cur_seq_idx)
460                            .filter(|&x| *x >= min_qual && *x != 255u8)
461                            .is_some()
462                    }) != Some(false);
463                if cur_mod_idx < mod_dists.len()
464                    && dist_from_last_mod_base
465                        == *mod_dists
466                            .get(cur_mod_idx)
467                            .expect("cur_mod_idx < mod_dists.len()")
468                {
469                    let prob = ml_tag
470                        .get(cur_mod_idx..)
471                        .expect("cur_mod_idx < mod_dists.len() and ml_tag has same length")
472                        .get(num_mods_seen)
473                        .ok_or(Error::InvalidModProbs(
474                            "ML tag appears to be insufficiently long!".into(),
475                        ))?;
476                    if filter_mod_prob(prob) && is_seq_pos_pass {
477                        modified_positions.push(cur_seq_idx);
478                        modified_probabilities.push(*prob);
479                    }
480                    dist_from_last_mod_base = 0;
481                    cur_mod_idx += 1;
482                } else if cur_mod_idx < mod_dists.len()
483                    && dist_from_last_mod_base
484                        > *mod_dists
485                            .get(cur_mod_idx)
486                            .expect("cur_mod_idx < mod_dists.len()")
487                {
488                    return Err(Error::InvalidModCoords(String::from(
489                        "Problem with parsing distances in MM/ML data",
490                    )));
491                } else {
492                    if is_include_zero_prob && is_implicit && is_seq_pos_pass {
493                        modified_positions.push(cur_seq_idx);
494                        modified_probabilities.push(0);
495                    }
496                    dist_from_last_mod_base += 1;
497                }
498            }
499
500            if cur_mod_idx == mod_dists.len() {
501                num_mods_seen = num_mods_seen
502                    .checked_add(cur_mod_idx)
503                    .ok_or(Error::Arithmetic("in MM ML parsing".to_owned()))?;
504            } else {
505                return Err(Error::InvalidModCoords(format!(
506                    "Problem with parsing MM/ML data, counts do not match {} != {}",
507                    cur_mod_idx,
508                    mod_dists.len(),
509                )));
510            }
511
512            // if data matches filters, add to struct.
513            modified_positions.shrink_to(0);
514            modified_probabilities.shrink_to(0);
515
516            #[expect(
517                clippy::arithmetic_side_effects,
518                reason = "`seq_len - 1 - k` (protected as mod pos cannot exceed seq_len), \
519`k.0 + 1` (overflow unlikely as genomic coords << 2^63)"
520            )]
521            #[expect(
522                clippy::indexing_slicing,
523                reason = "`pos_map[k.0]`; neither pos_map's len nor mod pos entry can exceed seq_len"
524            )]
525            if filter_mod_base_strand_tag(&mod_base, &mod_strand, &modification_type) {
526                let annotations: Vec<FiberAnnotation> = modified_positions
527                    .iter()
528                    .map(|k| if is_reverse { seq_len - 1 - k } else { *k })
529                    .zip(modified_probabilities.iter())
530                    .map(|k| {
531                        Ok(FiberAnnotation {
532                            start: i64::try_from(k.0)?,
533                            end: i64::try_from(k.0 + 1)?,
534                            length: 1,
535                            qual: *k.1,
536                            reference_start: pos_map[k.0],
537                            reference_end: pos_map[k.0],
538                            reference_length: pos_map[k.0].is_some().then_some(0),
539                            extra_columns: None,
540                        })
541                    })
542                    .collect::<Result<Vec<FiberAnnotation>, Error>>()?;
543                let mods = BaseMod {
544                    modified_base: mod_base,
545                    strand: mod_strand,
546                    modification_type: modification_type.val(),
547                    ranges: Ranges {
548                        annotations: if is_reverse {
549                            annotations.into_iter().rev().collect()
550                        } else {
551                            annotations
552                        },
553                        seq_len: i64::try_from(seq_len)?,
554                        reverse: is_reverse,
555                    },
556                    record_is_reverse: is_reverse,
557                };
558                rtn.push(mods);
559            }
560        }
561    }
562
563    if num_mods_seen == ml_tag.len() {
564        // needed so I can compare methods
565        rtn.sort();
566
567        // Check for duplicate strand, modification_type combinations
568        let mut seen_combinations = HashSet::new();
569        for base_mod in &rtn {
570            let combination = (base_mod.strand, base_mod.modification_type);
571            if seen_combinations.contains(&combination) {
572                return Err(Error::InvalidDuplicates(format!(
573                    "Duplicate strand '{}' and modification_type '{}' combination found",
574                    base_mod.strand, base_mod.modification_type
575                )));
576            }
577            let _: bool = seen_combinations.insert(combination);
578        }
579
580        Ok(BaseMods { base_mods: rtn })
581    } else {
582        Err(Error::InvalidModProbs(
583            "MM and ML tag lengths do not match!".to_owned(),
584        ))
585    }
586}
587
588/// A global struct which contains BAM records for further usage.
589/// NOTE: we don't derive many traits here as the `RcRecords` object
590/// does not have many traits.
591///
592/// ```
593/// use nanalogue_core::{Error, nanalogue_bam_reader, BamRcRecords, InputBam, InputMods,
594///     OptionalTag};
595/// use rust_htslib::bam::Read;
596/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
597/// let BamRcRecords = BamRcRecords::new(&mut reader, &mut InputBam::default(),
598///     &mut InputMods::<OptionalTag>::default())?;
599/// assert_eq!(BamRcRecords.header.tid(b"dummyI"), Some(0));
600/// assert_eq!(BamRcRecords.header.tid(b"dummyII"), Some(1));
601/// assert_eq!(BamRcRecords.header.tid(b"dummyIII"), Some(2));
602/// # Ok::<(), Error>(())
603/// ```
604#[derive(Debug)]
605#[non_exhaustive]
606pub struct BamRcRecords<'a, R>
607where
608    R: bam::Read,
609{
610    /// `RcRecords` object output by rust htslib which we can iterate over
611    pub rc_records: bam::RcRecords<'a, R>,
612    /// Header of the bam file
613    pub header: bam::HeaderView,
614}
615
616impl<'a, R: bam::Read> BamRcRecords<'a, R> {
617    /// Extracts `RcRecords` from a BAM Reader
618    ///
619    /// # Errors
620    /// Returns an error if thread pool creation, BAM region fetching/processing,
621    /// or read ID file processing fails.
622    pub fn new<T: InputRegionOptions>(
623        bam_reader: &'a mut R,
624        bam_opts: &mut InputBam,
625        mod_opts: &mut T,
626    ) -> Result<Self, Error> {
627        let header = bam_reader.header().clone();
628        let tp = tpool::ThreadPool::new(bam_opts.threads.get())?;
629        bam_reader.set_thread_pool(&tp)?;
630
631        // Load read ID list if specified
632        match (
633            bam_opts.read_id_set.is_some(),
634            bam_opts.read_id_list.is_some(),
635        ) {
636            (_, false) => {}
637            (true, true) => {
638                return Err(Error::InvalidState(
639                    "cannot set both `read_id_set` and `read_id_list` in `bam_opts` in `BamRcRecords`".to_owned(),
640                ));
641            }
642            (false, true) => {
643                bam_opts.read_id_set = if let Some(file_path) = bam_opts.read_id_list.as_ref() {
644                    let file = File::open(file_path).map_err(Error::InputOutputError)?;
645                    let reader = BufReader::new(file);
646                    let mut read_ids = HashSet::new();
647
648                    for raw_line in reader.lines() {
649                        let temp_line = raw_line.map_err(Error::InputOutputError)?;
650                        let line = temp_line.trim();
651                        if !line.is_empty() && !line.starts_with('#') {
652                            let _: bool = read_ids.insert(line.to_string());
653                        }
654                    }
655                    Some(read_ids)
656                } else {
657                    None
658                };
659            }
660        }
661
662        bam_opts.convert_region_to_bed3(header.clone())?;
663        mod_opts.convert_region_to_bed3(header.clone())?;
664        let rc_records = bam_reader.rc_records();
665        Ok(BamRcRecords::<R> { rc_records, header })
666    }
667}
668
669/// Trait that performs filtration
670pub trait BamPreFilt {
671    /// apply default filtration
672    fn pre_filt(&self, _bam_opts: &InputBam) -> bool {
673        unimplemented!()
674    }
675    /// filtration by length
676    fn filt_by_len(&self, _min_seq_len: u64, _include_zero_len: bool) -> bool {
677        unimplemented!()
678    }
679    /// filtration by alignment length
680    fn filt_by_align_len(&self, _min_align_len: i64) -> bool {
681        unimplemented!()
682    }
683    /// filtration by read id
684    fn filt_by_read_id(&self, _read_id: &str) -> bool {
685        unimplemented!()
686    }
687    /// filtration by read id set
688    fn filt_by_read_id_set(&self, _read_id_set: &HashSet<String>) -> bool {
689        unimplemented!()
690    }
691    /// filtration using flags
692    fn filt_by_bitwise_or_flags(&self, _states: &ReadStates) -> bool {
693        unimplemented!()
694    }
695    /// random filtration
696    fn filt_random_subset(&self, _fraction: F32Bw0and1) -> bool {
697        unimplemented!()
698    }
699    /// filtration by mapq
700    fn filt_by_mapq(&self, _min_mapq: u8, _exclude_mapq_unavail: bool) -> bool {
701        unimplemented!()
702    }
703    /// filtration by region
704    fn filt_by_region(&self, _region: &Bed3<i32, u64>, _full_region: bool) -> bool {
705        unimplemented!()
706    }
707}
708
709/// Trait that performs filtration on `rust_htslib` Record
710///
711/// Filter in action below, 100 bp read passed with a 20 bp filter
712/// but fails with a 120 bp filter
713/// ```
714/// use nanalogue_core::{BamPreFilt, InputBam, Error};
715/// use rust_htslib::bam::record;
716/// use rust_htslib::bam::record::{Cigar, CigarString};
717/// let mut bam_record = record::Record::new();
718/// bam_record.set(&vec![b'r',b'e',b'a',b'd'], Some(&CigarString(vec![Cigar::Match(100)])),
719///       &vec![ b'A' as u8; 100], &vec![255 as u8; 100]);
720/// let mut bam_opts = InputBam::default();
721/// bam_opts.min_seq_len = 20;
722/// assert_eq!(bam_record.pre_filt(&bam_opts), true);
723/// bam_opts.min_seq_len = 120;
724/// assert_eq!(bam_record.pre_filt(&bam_opts), false);
725/// # Ok::<(), Error>(())
726/// ```
727/// By default, zero-length records are excluded to avoid processing errors.
728/// ```
729/// # use nanalogue_core::{BamPreFilt, InputBam, Error};
730/// # use rust_htslib::bam::record;
731/// # use rust_htslib::bam::record::{Cigar, CigarString};
732/// let mut bam_record = record::Record::new();
733/// let mut bam_opts = InputBam::default();
734/// bam_opts.min_seq_len = 20;
735/// bam_opts.include_zero_len = false;  // default behavior
736/// assert_eq!(bam_record.pre_filt(&bam_opts), false);  // excluded
737/// # Ok::<(), Error>(())
738/// ```
739/// Zero-length records can be included if explicitly requested.
740/// ```
741/// # use nanalogue_core::{BamPreFilt, InputBam, Error};
742/// # use rust_htslib::bam::record;
743/// # use rust_htslib::bam::record::{Cigar, CigarString};
744/// let mut bam_record = record::Record::new();
745/// let mut bam_opts = InputBam::default();
746/// bam_opts.min_seq_len = 20;
747/// bam_opts.include_zero_len = true;
748/// assert_eq!(bam_record.pre_filt(&bam_opts), true);  // included
749/// # Ok::<(), Error>(())
750/// ```
751impl BamPreFilt for bam::Record {
752    /// apply default filtration by read length
753    fn pre_filt(&self, bam_opts: &InputBam) -> bool {
754        self.filt_random_subset(bam_opts.sample_fraction)
755            & self.filt_by_len(bam_opts.min_seq_len, bam_opts.include_zero_len)
756            & self.filt_by_mapq(bam_opts.mapq_filter, bam_opts.exclude_mapq_unavail)
757            & {
758                if let Some(v) = bam_opts.read_id.as_ref() {
759                    self.filt_by_read_id(v)
760                } else if let Some(read_id_set) = bam_opts.read_id_set.as_ref() {
761                    self.filt_by_read_id_set(read_id_set)
762                } else {
763                    true
764                }
765            }
766            & {
767                if let Some(v) = bam_opts.min_align_len {
768                    self.filt_by_align_len(v)
769                } else {
770                    true
771                }
772            }
773            & {
774                if let Some(v) = bam_opts.read_filter.as_ref() {
775                    self.filt_by_bitwise_or_flags(v)
776                } else {
777                    true
778                }
779            }
780            & {
781                if let Some(v) = bam_opts.region_filter().as_ref() {
782                    self.filt_by_region(v, bam_opts.is_full_overlap())
783                } else {
784                    true
785                }
786            }
787    }
788    /// filtration by read length
789    fn filt_by_len(&self, min_seq_len: u64, include_zero_len: bool) -> bool {
790        // self.len() returns a usize which we convert to u64
791        match (min_seq_len, self.len() as u64, include_zero_len) {
792            (_, 0, false) => false, // Exclude zero-length by default
793            (_, 0, true) => true,   // Include zero-length when explicitly requested
794            (l_min, v, _) => v >= l_min,
795        }
796    }
797    /// filtration by alignment length
798    fn filt_by_align_len(&self, min_align_len: i64) -> bool {
799        !self.is_unmapped() && {
800            let ref_end = self.reference_end();
801            let ref_start = self.pos();
802            ref_end >= ref_start
803                && ref_start >= 0
804                && ref_end
805                    .checked_sub(ref_start)
806                    .expect("ref_end >= ref_start so overflow is impossible")
807                    >= min_align_len
808        }
809    }
810    /// filtration by read id
811    fn filt_by_read_id(&self, read_id: &str) -> bool {
812        read_id.as_bytes() == self.name()
813    }
814    /// filtration by read id set
815    fn filt_by_read_id_set(&self, read_id_set: &HashSet<String>) -> bool {
816        if let Ok(name_str) = std::str::from_utf8(self.name()) {
817            read_id_set.contains(name_str)
818        } else {
819            false
820        }
821    }
822    /// filtration by flag list
823    fn filt_by_bitwise_or_flags(&self, states: &ReadStates) -> bool {
824        states.bam_flags().contains(&self.flags())
825    }
826    /// random filtration
827    fn filt_random_subset(&self, fraction: F32Bw0and1) -> bool {
828        match fraction.val() {
829            1.0 => true,
830            0.0 => false,
831            v => {
832                let random_number: f32 = random();
833                random_number < v
834            }
835        }
836    }
837    /// filtration by mapq
838    fn filt_by_mapq(&self, min_mapq: u8, exclude_mapq_unavail: bool) -> bool {
839        !(exclude_mapq_unavail && self.mapq() == 255) && self.mapq() >= min_mapq
840    }
841    /// filtration by region
842    fn filt_by_region(&self, region: &Bed3<i32, u64>, full_region: bool) -> bool {
843        !self.is_unmapped() && (self.tid() == *region.chr()) && {
844            let region_start = region.start();
845            let region_end = region.end();
846            let start: u64 = self.pos().try_into().expect("no error");
847            let end: u64 = self.reference_end().try_into().expect("no error");
848            if full_region {
849                (start..end).contains(&region_start) && (start..=end).contains(&region_end)
850            } else {
851                (start..end).intersects(&(region_start..region_end))
852            }
853        }
854    }
855}
856
857#[cfg(test)]
858mod mod_parse_tests {
859    use super::*;
860    use rust_htslib::bam::Read as _;
861
862    /// Tests if Mod BAM modification parsing is alright with fallback tag support.
863    /// Tests both MM/ML (standard uppercase) and Mm/Ml (fallback lowercase) tag variants.
864    /// Note: Only these specific variants are supported - fully lowercase (mm/ml) and
865    /// other mixed-case variants (e.g., mM/mL) are not recognized.
866    /// Some of the test cases here may be a repeat of the doctest above.
867    #[test]
868    #[expect(clippy::too_many_lines, reason = "Comprehensive integration test")]
869    fn mod_bam_parsing_from_example_1_compatible_variants() -> Result<(), Error> {
870        for file_path in ["examples/example_1.bam", "examples/example_1.Mm_Ml.bam"] {
871            let mut reader = nanalogue_bam_reader(file_path)?;
872            for (count, record) in reader.records().enumerate() {
873                let r = record?;
874                let BaseMods { base_mods: v } =
875                    nanalogue_mm_ml_parser(&r, |&_| true, |&_| true, |&_, &_, &_| true, 0).unwrap();
876                match count {
877                    0 => assert_eq!(
878                        v,
879                        vec![BaseMod {
880                            modified_base: b'T',
881                            strand: '+',
882                            modification_type: 'T',
883                            ranges: Ranges {
884                                annotations: vec![
885                                    FiberAnnotation {
886                                        start: 0,
887                                        end: 1,
888                                        length: 1,
889                                        qual: 4,
890                                        reference_start: Some(9),
891                                        reference_end: Some(9),
892                                        reference_length: Some(0),
893                                        extra_columns: None,
894                                    },
895                                    FiberAnnotation {
896                                        start: 3,
897                                        end: 4,
898                                        length: 1,
899                                        qual: 7,
900                                        reference_start: Some(12),
901                                        reference_end: Some(12),
902                                        reference_length: Some(0),
903                                        extra_columns: None,
904                                    },
905                                    FiberAnnotation {
906                                        start: 4,
907                                        end: 5,
908                                        length: 1,
909                                        qual: 9,
910                                        reference_start: Some(13),
911                                        reference_end: Some(13),
912                                        reference_length: Some(0),
913                                        extra_columns: None,
914                                    },
915                                    FiberAnnotation {
916                                        start: 7,
917                                        end: 8,
918                                        length: 1,
919                                        qual: 6,
920                                        reference_start: Some(16),
921                                        reference_end: Some(16),
922                                        reference_length: Some(0),
923                                        extra_columns: None,
924                                    },
925                                ],
926                                seq_len: 8,
927                                reverse: false,
928                            },
929                            record_is_reverse: false,
930                        }]
931                    ),
932                    1 => assert_eq!(
933                        v,
934                        vec![BaseMod {
935                            modified_base: b'T',
936                            strand: '+',
937                            modification_type: 'T',
938                            ranges: Ranges {
939                                annotations: vec![
940                                    FiberAnnotation {
941                                        start: 3,
942                                        end: 4,
943                                        length: 1,
944                                        qual: 221,
945                                        reference_start: Some(26),
946                                        reference_end: Some(26),
947                                        reference_length: Some(0),
948                                        extra_columns: None,
949                                    },
950                                    FiberAnnotation {
951                                        start: 8,
952                                        end: 9,
953                                        length: 1,
954                                        qual: 242,
955                                        reference_start: Some(31),
956                                        reference_end: Some(31),
957                                        reference_length: Some(0),
958                                        extra_columns: None,
959                                    },
960                                    FiberAnnotation {
961                                        start: 27,
962                                        end: 28,
963                                        length: 1,
964                                        qual: 3,
965                                        reference_start: Some(50),
966                                        reference_end: Some(50),
967                                        reference_length: Some(0),
968                                        extra_columns: None,
969                                    },
970                                    FiberAnnotation {
971                                        start: 39,
972                                        end: 40,
973                                        length: 1,
974                                        qual: 47,
975                                        reference_start: Some(62),
976                                        reference_end: Some(62),
977                                        reference_length: Some(0),
978                                        extra_columns: None,
979                                    },
980                                    FiberAnnotation {
981                                        start: 47,
982                                        end: 48,
983                                        length: 1,
984                                        qual: 239,
985                                        reference_start: Some(70),
986                                        reference_end: Some(70),
987                                        reference_length: Some(0),
988                                        extra_columns: None,
989                                    },
990                                ],
991                                seq_len: 48,
992                                reverse: false,
993                            },
994                            record_is_reverse: false,
995                        }]
996                    ),
997                    2 => assert_eq!(
998                        v,
999                        vec![BaseMod {
1000                            modified_base: b'T',
1001                            strand: '+',
1002                            modification_type: 'T',
1003                            ranges: Ranges {
1004                                annotations: vec![
1005                                    FiberAnnotation {
1006                                        start: 12,
1007                                        end: 13,
1008                                        length: 1,
1009                                        qual: 3,
1010                                        reference_start: Some(15),
1011                                        reference_end: Some(15),
1012                                        reference_length: Some(0),
1013                                        extra_columns: None,
1014                                    },
1015                                    FiberAnnotation {
1016                                        start: 13,
1017                                        end: 14,
1018                                        length: 1,
1019                                        qual: 3,
1020                                        reference_start: Some(16),
1021                                        reference_end: Some(16),
1022                                        reference_length: Some(0),
1023                                        extra_columns: None,
1024                                    },
1025                                    FiberAnnotation {
1026                                        start: 16,
1027                                        end: 17,
1028                                        length: 1,
1029                                        qual: 4,
1030                                        reference_start: Some(19),
1031                                        reference_end: Some(19),
1032                                        reference_length: Some(0),
1033                                        extra_columns: None,
1034                                    },
1035                                    FiberAnnotation {
1036                                        start: 19,
1037                                        end: 20,
1038                                        length: 1,
1039                                        qual: 3,
1040                                        reference_start: Some(22),
1041                                        reference_end: Some(22),
1042                                        reference_length: Some(0),
1043                                        extra_columns: None,
1044                                    },
1045                                    FiberAnnotation {
1046                                        start: 20,
1047                                        end: 21,
1048                                        length: 1,
1049                                        qual: 182,
1050                                        reference_start: Some(23),
1051                                        reference_end: Some(23),
1052                                        reference_length: Some(0),
1053                                        extra_columns: None,
1054                                    },
1055                                ],
1056                                seq_len: 33,
1057                                reverse: true,
1058                            },
1059                            record_is_reverse: true,
1060                        }]
1061                    ),
1062                    3 => assert_eq!(
1063                        v,
1064                        vec![
1065                            BaseMod {
1066                                modified_base: b'G',
1067                                strand: '-',
1068                                modification_type: '\u{1C20}',
1069                                ranges: Ranges {
1070                                    annotations: vec![
1071                                        FiberAnnotation {
1072                                            start: 28,
1073                                            end: 29,
1074                                            length: 1,
1075                                            qual: 0,
1076                                            reference_start: None,
1077                                            reference_end: None,
1078                                            reference_length: None,
1079                                            extra_columns: None,
1080                                        },
1081                                        FiberAnnotation {
1082                                            start: 29,
1083                                            end: 30,
1084                                            length: 1,
1085                                            qual: 0,
1086                                            reference_start: None,
1087                                            reference_end: None,
1088                                            reference_length: None,
1089                                            extra_columns: None,
1090                                        },
1091                                        FiberAnnotation {
1092                                            start: 30,
1093                                            end: 31,
1094                                            length: 1,
1095                                            qual: 0,
1096                                            reference_start: None,
1097                                            reference_end: None,
1098                                            reference_length: None,
1099                                            extra_columns: None,
1100                                        },
1101                                        FiberAnnotation {
1102                                            start: 32,
1103                                            end: 33,
1104                                            length: 1,
1105                                            qual: 0,
1106                                            reference_start: None,
1107                                            reference_end: None,
1108                                            reference_length: None,
1109                                            extra_columns: None,
1110                                        },
1111                                        FiberAnnotation {
1112                                            start: 43,
1113                                            end: 44,
1114                                            length: 1,
1115                                            qual: 77,
1116                                            reference_start: None,
1117                                            reference_end: None,
1118                                            reference_length: None,
1119                                            extra_columns: None,
1120                                        },
1121                                        FiberAnnotation {
1122                                            start: 44,
1123                                            end: 45,
1124                                            length: 1,
1125                                            qual: 0,
1126                                            reference_start: None,
1127                                            reference_end: None,
1128                                            reference_length: None,
1129                                            extra_columns: None,
1130                                        },
1131                                    ],
1132                                    seq_len: 48,
1133                                    reverse: false,
1134                                },
1135                                record_is_reverse: false,
1136                            },
1137                            BaseMod {
1138                                modified_base: b'T',
1139                                strand: '+',
1140                                modification_type: 'T',
1141                                ranges: Ranges {
1142                                    annotations: vec![
1143                                        FiberAnnotation {
1144                                            start: 3,
1145                                            end: 4,
1146                                            length: 1,
1147                                            qual: 221,
1148                                            reference_start: None,
1149                                            reference_end: None,
1150                                            reference_length: None,
1151                                            extra_columns: None,
1152                                        },
1153                                        FiberAnnotation {
1154                                            start: 8,
1155                                            end: 9,
1156                                            length: 1,
1157                                            qual: 242,
1158                                            reference_start: None,
1159                                            reference_end: None,
1160                                            reference_length: None,
1161                                            extra_columns: None,
1162                                        },
1163                                        FiberAnnotation {
1164                                            start: 27,
1165                                            end: 28,
1166                                            length: 1,
1167                                            qual: 0,
1168                                            reference_start: None,
1169                                            reference_end: None,
1170                                            reference_length: None,
1171                                            extra_columns: None,
1172                                        },
1173                                        FiberAnnotation {
1174                                            start: 39,
1175                                            end: 40,
1176                                            length: 1,
1177                                            qual: 47,
1178                                            reference_start: None,
1179                                            reference_end: None,
1180                                            reference_length: None,
1181                                            extra_columns: None,
1182                                        },
1183                                        FiberAnnotation {
1184                                            start: 47,
1185                                            end: 48,
1186                                            length: 1,
1187                                            qual: 239,
1188                                            reference_start: None,
1189                                            reference_end: None,
1190                                            reference_length: None,
1191                                            extra_columns: None,
1192                                        },
1193                                    ],
1194                                    seq_len: 48,
1195                                    reverse: false,
1196                                },
1197                                record_is_reverse: false,
1198                            }
1199                        ]
1200                    ),
1201                    _ => {}
1202                }
1203            }
1204        }
1205        Ok(())
1206    }
1207
1208    /// Tests that case variants that are not supported (mixed case or fully lowercase)
1209    /// correctly return empty `BaseMods`, since the parser should not find these tags.
1210    #[test]
1211    fn mod_bam_parsing_from_example_1_failure_variants() -> Result<(), Error> {
1212        for file_path in [
1213            "examples/example_1.mM_mL.bam",
1214            "examples/example_1.mm_ml.bam",
1215        ] {
1216            let mut reader = nanalogue_bam_reader(file_path)?;
1217            for record in reader.records() {
1218                let r = record?;
1219                let BaseMods { base_mods: v } =
1220                    nanalogue_mm_ml_parser(&r, |&_| true, |&_| true, |&_, &_, &_| true, 0).unwrap();
1221                assert_eq!(v, vec![], "Expected empty BaseMods for file: {file_path}");
1222            }
1223        }
1224        Ok(())
1225    }
1226
1227    fn create_test_record_and_parse(mm_value: &str, ml_values: &[u8]) -> Result<BaseMods, Error> {
1228        let mut record = bam::Record::new();
1229
1230        // Create a sequence - ATCG repeated
1231        let seq_bytes = b"ATCGATCGATCGATCGATCG";
1232        let qname = b"test_read";
1233        let cigar = bam::record::CigarString::from(vec![bam::record::Cigar::Match(20)]);
1234        let quals = vec![30u8; 20]; // Quality scores of 30 for all bases
1235        record.set(qname, Some(&cigar), seq_bytes, &quals);
1236
1237        // Set MM tag
1238        record.push_aux(b"MM", Aux::String(mm_value))?;
1239
1240        // Set ML tag (modification probabilities)
1241        record.push_aux(b"ML", Aux::ArrayU8((&ml_values).into()))?;
1242
1243        // Call the parser and return result
1244        nanalogue_mm_ml_parser(
1245            &record,
1246            |&_| true,         // Accept all probabilities
1247            |&_| true,         // Accept all positions
1248            |&_, &_, &_| true, // Accept all base/strand/tag combinations
1249            0,                 // No quality threshold
1250        )
1251    }
1252
1253    #[test]
1254    #[should_panic(expected = "InvalidDuplicates")]
1255    fn nanalogue_mm_ml_parser_detects_duplicates() {
1256        // Two T+ modifications with same strand and modification_type
1257        let mm_value = "T+T,0,3;T+T,1,2;";
1258        let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8]);
1259        let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1260    }
1261
1262    #[test]
1263    fn nanalogue_mm_ml_parser_accepts_unique_combinations() {
1264        // T+, C+, T- (all different combinations)
1265        let mm_value = "T+T,0,3;C+m,0,1;T-T,1,2;";
1266        let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8, 120u8, 140u8]);
1267        let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1268    }
1269
1270    #[test]
1271    #[should_panic(expected = "InvalidModCoords")]
1272    fn nanalogue_mm_ml_parser_detects_invalid_mod_coords_1() {
1273        // Invalid coordinates: seq does not have 11 As (4 + 1 + 5 + 1)
1274        let mm_value = "T+T,0,3;A-T,4,5;";
1275        let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8]);
1276        let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1277    }
1278
1279    #[test]
1280    #[should_panic(expected = "InvalidModCoords")]
1281    fn nanalogue_mm_ml_parser_detects_invalid_mod_coords_2() {
1282        // Invalid coords: there aren't 11 C in the sequence
1283        let mm_value = "T+T,0,3;C+m,4,5;T-T,1,2;";
1284        let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8, 120u8, 140u8]);
1285        let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1286    }
1287
1288    #[test]
1289    #[should_panic(expected = "InvalidModProbs")]
1290    fn nanalogue_mm_ml_parser_detects_ml_tag_longer_than_mm_data() {
1291        // ML tag has more values than modifications in MM tag
1292        let mm_value = "T+T,0,3;"; // 2 modifications
1293        let ml_values = Vec::from([100u8, 200u8, 150u8, 180u8]); // 4 values - too many!
1294        let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1295    }
1296
1297    #[test]
1298    #[should_panic(expected = "InvalidModProbs")]
1299    fn nanalogue_mm_ml_parser_detects_ml_tag_shorter_than_mm_data() {
1300        // ML tag has fewer values than modifications in MM tag
1301        let mm_value = "T+T,0,3;"; // 2 modifications
1302        let ml_values = Vec::from([100u8]); // 1 value - too few!
1303        let _: BaseMods = create_test_record_and_parse(mm_value, &ml_values).unwrap();
1304    }
1305}
1306
1307#[cfg(test)]
1308mod zero_length_filtering_tests {
1309    use super::*;
1310    use rust_htslib::bam::Read as _;
1311
1312    #[test]
1313    fn zero_length_filtering_with_example_2_zero_len_sam() -> Result<(), Error> {
1314        // Test with include_zero_len = false and min_seq_len = 1 (should get 1 read)
1315        let mut reader = nanalogue_bam_reader("examples/example_2_zero_len.sam")?;
1316        let bam_opts_exclude_zero: InputBam =
1317            serde_json::from_str(r#"{"min_seq_len": 1}"#).unwrap();
1318
1319        let mut count_exclude_zero = 0;
1320        for record_result in reader.records() {
1321            let record = record_result?;
1322            if record.pre_filt(&bam_opts_exclude_zero) {
1323                count_exclude_zero += 1;
1324            }
1325        }
1326
1327        // Test with include_zero_len = true and min_seq_len = 1 (should get 2 reads)
1328        let mut reader_same_file = nanalogue_bam_reader("examples/example_2_zero_len.sam")?;
1329        let bam_opts_include_zero: InputBam =
1330            serde_json::from_str(r#"{"min_seq_len": 1, "include_zero_len": true}"#).unwrap();
1331
1332        let mut count_include_zero = 0;
1333        for record_result in reader_same_file.records() {
1334            let record = record_result?;
1335            if record.pre_filt(&bam_opts_include_zero) {
1336                count_include_zero += 1;
1337            }
1338        }
1339
1340        assert_eq!(count_exclude_zero, 1);
1341        assert_eq!(count_include_zero, 2);
1342
1343        Ok(())
1344    }
1345}
1346
1347#[cfg(test)]
1348mod invalid_seq_length_tests {
1349    use super::*;
1350    use rust_htslib::bam::Read as _;
1351
1352    #[test]
1353    fn invalid_seq_length_error_with_example_4() {
1354        let mut reader =
1355            nanalogue_bam_reader("examples/example_4_invalid_basequal_len.sam").unwrap();
1356
1357        let mut cnt = 0;
1358        for record_result in reader.records() {
1359            cnt += 1;
1360            let _: rust_htslib::errors::Error = record_result.unwrap_err();
1361        }
1362        assert_eq!(cnt, 1);
1363    }
1364}
1365
1366#[cfg(test)]
1367mod base_qual_filtering_tests {
1368    use super::*;
1369    use rust_htslib::bam::Read as _;
1370
1371    #[test]
1372    fn base_qual_filtering_with_example_5_valid_basequal_sam() -> Result<(), Error> {
1373        let mut reader = nanalogue_bam_reader("examples/example_5_valid_basequal.sam")?;
1374
1375        for record_result in reader.records() {
1376            let record = record_result?;
1377            let BaseMods { base_mods: v } =
1378                nanalogue_mm_ml_parser(&record, |&_| true, |&_| true, |&_, &_, &_| true, 60)
1379                    .unwrap();
1380
1381            assert_eq!(v.len(), 1);
1382            let base_mod = v.first().expect("v has exactly 1 element");
1383            assert_eq!(base_mod.modified_base, b'T');
1384            assert_eq!(base_mod.strand, '+');
1385            assert_eq!(base_mod.modification_type, 'T');
1386            assert_eq!(base_mod.ranges.qual(), vec![7, 9]);
1387            assert_eq!(base_mod.ranges.starts(), vec![3, 4]);
1388            assert_eq!(base_mod.ranges.ends(), vec![4, 5]);
1389        }
1390
1391        Ok(())
1392    }
1393}
1394
1395#[cfg(test)]
1396mod bam_rc_record_tests {
1397    use super::*;
1398    use rand::random_range;
1399    use rust_htslib::bam::record;
1400    use rust_htslib::bam::record::{Cigar, CigarString};
1401
1402    #[test]
1403    fn bam_rc_records() {
1404        let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1405        let bam_rc_records = BamRcRecords::new(
1406            &mut reader,
1407            &mut InputBam::default(),
1408            &mut InputMods::<OptionalTag>::default(),
1409        )
1410        .unwrap();
1411        assert_eq!(bam_rc_records.header.tid(b"dummyI"), Some(0));
1412        assert_eq!(bam_rc_records.header.tid(b"dummyII"), Some(1));
1413        assert_eq!(bam_rc_records.header.tid(b"dummyIII"), Some(2));
1414    }
1415
1416    #[test]
1417    fn example_3_read_list_1() {
1418        // Test with example_3_subset_1 - should see 2 reads
1419        let mut reader = nanalogue_bam_reader("examples/example_3.bam").unwrap();
1420        let mut bam_opts: InputBam =
1421            serde_json::from_str(r#"{"read_id_list": "examples/example_3_subset_1"}"#).unwrap();
1422
1423        let bam_rc_records = BamRcRecords::new(
1424            &mut reader,
1425            &mut bam_opts,
1426            &mut InputMods::<OptionalTag>::default(),
1427        )
1428        .unwrap();
1429
1430        // Count lines in the read ID file
1431        let file = File::open("examples/example_3_subset_1").unwrap();
1432        let reader_file = BufReader::new(file);
1433        let mut line_count = 0;
1434        for raw_line in reader_file.lines() {
1435            let temp_line = raw_line.unwrap();
1436            let line = temp_line.trim();
1437            if !line.is_empty() && !line.starts_with('#') {
1438                line_count += 1;
1439            }
1440        }
1441        assert_eq!(line_count, 2);
1442
1443        let mut count = 0;
1444        for record_result in bam_rc_records.rc_records {
1445            let record = record_result.unwrap();
1446            if record.pre_filt(&bam_opts) {
1447                count += 1;
1448            }
1449        }
1450        assert_eq!(count, 2);
1451    }
1452
1453    #[test]
1454    fn example_3_read_list_2() {
1455        // Test with example_3_subset_w_invalid - should see 2 reads
1456        // even though file contains 3 read IDs
1457        let mut reader = nanalogue_bam_reader("examples/example_3.bam").unwrap();
1458        let mut bam_opts: InputBam =
1459            serde_json::from_str(r#"{"read_id_list": "examples/example_3_subset_w_invalid"}"#)
1460                .unwrap();
1461        let bam_rc_records = BamRcRecords::new(
1462            &mut reader,
1463            &mut bam_opts,
1464            &mut InputMods::<OptionalTag>::default(),
1465        )
1466        .unwrap();
1467
1468        // Count lines in the read ID file
1469        let file = File::open("examples/example_3_subset_w_invalid").unwrap();
1470        let reader_file = BufReader::new(file);
1471        let mut line_count = 0;
1472        for raw_line in reader_file.lines() {
1473            let temp_line = raw_line.unwrap();
1474            let line = temp_line.trim();
1475            if !line.is_empty() && !line.starts_with('#') {
1476                line_count += 1;
1477            }
1478        }
1479        assert_eq!(line_count, 3);
1480
1481        let mut count = 0;
1482        for record_result in bam_rc_records.rc_records {
1483            let record = record_result.unwrap();
1484            if record.pre_filt(&bam_opts) {
1485                count += 1;
1486            }
1487        }
1488        assert_eq!(count, 2);
1489    }
1490
1491    #[test]
1492    fn random_retrieval() {
1493        let mut count_retained = 0;
1494        let bam_opts: InputBam =
1495            serde_json::from_str(r#"{"sample_fraction": 0.5, "include_zero_len": true}"#).unwrap();
1496
1497        for _ in 0..10000 {
1498            let record = record::Record::new();
1499            if record.pre_filt(&bam_opts) {
1500                count_retained += 1;
1501            }
1502        }
1503
1504        // 50% retention rate => 5000 reads, so we test if 4500-5500 reads come through
1505        // (this is quite lax actually)
1506        assert!((4500..=5500).contains(&count_retained));
1507    }
1508
1509    #[test]
1510    fn single_read_id_filtering() {
1511        let mut count_retained = 0;
1512
1513        let bam_opts: InputBam =
1514            serde_json::from_str(r#"{"read_id": "read2", "include_zero_len": true}"#).unwrap();
1515
1516        for i in 1..=1000 {
1517            let mut record = record::Record::new();
1518            let read_id = format!("read{i}");
1519            record.set_qname(read_id.as_bytes());
1520
1521            if record.pre_filt(&bam_opts) {
1522                count_retained += 1;
1523            }
1524        }
1525
1526        assert_eq!(count_retained, 1);
1527    }
1528
1529    #[test]
1530    fn seq_and_align_len_filtering() {
1531        let bam_opts_min_len: InputBam = serde_json::from_str(r#"{"min_seq_len": 5000}"#).unwrap();
1532        let bam_opts_min_align_len: InputBam =
1533            serde_json::from_str(r#"{"min_align_len": 2500}"#).unwrap();
1534
1535        let mut count_retained = (0, 0);
1536
1537        for _ in 1..=10000 {
1538            let mut record = record::Record::new();
1539            let seq_len: usize = random_range(2..=10000);
1540            let match_len = seq_len / 2;
1541            let hard_clip_len = seq_len - match_len;
1542
1543            record.set(
1544                b"read",
1545                Some(&CigarString(vec![
1546                    Cigar::Match(u32::try_from(match_len).unwrap()),
1547                    Cigar::HardClip(u32::try_from(hard_clip_len).unwrap()),
1548                ])),
1549                &vec![b'A'; seq_len],
1550                &vec![50; seq_len],
1551            );
1552            record.unset_flags();
1553            record.set_flags(loop {
1554                let random_state: ReadState = random();
1555                match random_state {
1556                    ReadState::Unmapped => {}
1557                    v @ (ReadState::PrimaryFwd
1558                    | ReadState::PrimaryRev
1559                    | ReadState::SecondaryFwd
1560                    | ReadState::SecondaryRev
1561                    | ReadState::SupplementaryFwd
1562                    | ReadState::SupplementaryRev) => break u16::from(v),
1563                }
1564            });
1565            if record.pre_filt(&bam_opts_min_align_len) {
1566                count_retained.0 += 1;
1567            }
1568            if record.pre_filt(&bam_opts_min_len) {
1569                count_retained.1 += 1;
1570            }
1571        }
1572
1573        assert!(count_retained.0 >= 4500 && count_retained.0 <= 5500);
1574        assert!(count_retained.1 >= 4500 && count_retained.1 <= 5500);
1575    }
1576
1577    #[test]
1578    fn filt_by_bitwise_or_flags() {
1579        // Create random subset of read states (ensure at least one)
1580        let selected_states = {
1581            let mut selected_states = Vec::new();
1582            let all_states = vec!["0", "4", "16", "256", "272", "2048", "2064"];
1583            for state in &all_states {
1584                if random::<f32>() < 0.5 {
1585                    selected_states.push(*state);
1586                }
1587            }
1588            if selected_states.is_empty() {
1589                let random_idx = random_range(0..all_states.len());
1590                selected_states.push(
1591                    *all_states
1592                        .get(random_idx)
1593                        .expect("random_idx is within all_states range"),
1594                );
1595            }
1596            selected_states
1597        };
1598
1599        let states_string = selected_states.join(",");
1600        let bam_opts: InputBam = serde_json::from_str(
1601            format!("{{\"read_filter\": [{states_string}], \"include_zero_len\": true}}").as_str(),
1602        )
1603        .unwrap();
1604
1605        let mut count_retained = 0;
1606
1607        for _ in 1..=70000 {
1608            let mut record = record::Record::new();
1609            record.unset_flags();
1610            record.set_flags({
1611                let random_state: ReadState = random();
1612                u16::from(random_state)
1613            });
1614            if record.pre_filt(&bam_opts) {
1615                count_retained += 1;
1616            }
1617        }
1618
1619        let expected_count = selected_states.len() * 10000;
1620        let tolerance = expected_count.div_ceil(5);
1621        let min_count = expected_count.saturating_sub(tolerance);
1622        let max_count = expected_count + tolerance;
1623
1624        assert!(count_retained >= min_count && count_retained <= max_count);
1625    }
1626
1627    #[test]
1628    fn filt_by_region() {
1629        let mut count_retained = (0, 0);
1630
1631        for _ in 1..=10000 {
1632            let mut record = record::Record::new();
1633
1634            // Draw four numbers from 0 to 10000
1635            let four_nums = [
1636                random_range(0..=10000),
1637                random_range(0..=10000),
1638                random_range(0..=10000),
1639                random_range(0..=10000u64),
1640            ];
1641
1642            // First two for record
1643            let mut pos_nums = [four_nums[0], four_nums[1]];
1644            pos_nums.sort_unstable();
1645            let start_pos = pos_nums[0];
1646            let end_pos = pos_nums[1];
1647
1648            // Use the last two to set up region, with a random contig chosen
1649            // from a set of two.
1650            let region_tid = i32::from(random::<bool>());
1651            let mut region_nums = [four_nums[2], four_nums[3]];
1652            region_nums.sort_unstable();
1653            let region_start = region_nums[0];
1654            let region_end = region_nums[1];
1655            let bam_opts_no_full_region: InputBam = serde_json::from_str(
1656                format!("{{\"region_bed3\": [{region_tid}, {region_start}, {region_end}]}}")
1657                    .as_str(),
1658            )
1659            .unwrap();
1660            let bam_opts_full_region: InputBam = serde_json::from_str(
1661                format!("{{\"full_region\": true, \"region_bed3\": [{region_tid}, {region_start}, {region_end}]}}")
1662                    .as_str(),
1663            )
1664            .unwrap();
1665
1666            // Calculate sequence length (ensure at least 1)
1667            let seq_len = if end_pos == start_pos {
1668                1
1669            } else {
1670                end_pos - start_pos
1671            };
1672
1673            // Set sequence details first
1674            record.set(
1675                b"read",
1676                Some(&CigarString(vec![Cigar::Match(
1677                    u32::try_from(seq_len).unwrap(),
1678                )])),
1679                &vec![b'A'; usize::try_from(seq_len).unwrap()],
1680                &vec![255; usize::try_from(seq_len).unwrap()],
1681            );
1682
1683            // Set tid and position, contig chosen from a random set of two.
1684            let tid = i32::from(random::<bool>());
1685            record.set_tid(tid);
1686            record.set_pos(i64::try_from(start_pos).unwrap());
1687
1688            // Verify reference_end calculation
1689            let expected_ref_end = i64::try_from(start_pos + seq_len).unwrap();
1690            assert_eq!(record.reference_end(), expected_ref_end);
1691
1692            if record.pre_filt(&bam_opts_no_full_region) {
1693                count_retained.0 += 1;
1694            }
1695            if record.pre_filt(&bam_opts_full_region) {
1696                count_retained.1 += 1;
1697            }
1698        }
1699
1700        // the chance that two randomly chosen intervals on the interval [0, l]
1701        // intersect is 2/3, and then we have the added random element of one
1702        // of two contigs, so the probability is 2/3 * 1/2 = 1/3.
1703        let expected_count = 10000usize.div_ceil(3); // ~3333
1704        let tolerance = expected_count.div_ceil(5);
1705        let min_count = expected_count.saturating_sub(tolerance);
1706        let max_count = expected_count + tolerance;
1707        assert!(count_retained.0 >= min_count && count_retained.0 <= max_count);
1708
1709        // the chance that two randomly chosen intervals are such that one is
1710        // contained within the other is 1/3, and we are looking for the region
1711        // being contained completely within the read and NOT the read being contained
1712        // within the region, so the probability is 1/6. This is one fourth of the
1713        // 2/3 used above. So the same criterion will work with the second count quadrupled.
1714        assert!(4 * count_retained.1 >= min_count && 4 * count_retained.1 <= max_count);
1715    }
1716
1717    #[test]
1718    #[should_panic(expected = "InvalidState")]
1719    fn bam_rc_records_conflicting_read_id_set_and_list() {
1720        let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1721        let mut read_id_set = HashSet::new();
1722        let _: bool = read_id_set.insert("some-read".to_owned());
1723
1724        // Build InputBam with read_id_list, then manually set read_id_set to create conflict
1725        let mut bam_opts = InputBamBuilder::default()
1726            .bam_path(PathOrURLOrStdin::Path("examples/example_1.bam".into()))
1727            .read_id_list("examples/example_3_subset_1")
1728            .build()
1729            .unwrap();
1730
1731        // Manually set read_id_set to create the forbidden (true, true) state
1732        bam_opts.read_id_set = Some(read_id_set);
1733
1734        let _: BamRcRecords<_> = BamRcRecords::new(
1735            &mut reader,
1736            &mut bam_opts,
1737            &mut InputMods::<OptionalTag>::default(),
1738        )
1739        .unwrap();
1740    }
1741
1742    #[test]
1743    fn bam_rc_records_read_id_list_none_read_id_set_none() {
1744        // Test (false, false) case: both None, both should remain None
1745        let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1746        let mut bam_opts = InputBamBuilder::default()
1747            .bam_path(PathOrURLOrStdin::Path("examples/example_1.bam".into()))
1748            .build()
1749            .unwrap();
1750
1751        assert!(bam_opts.read_id_list.is_none());
1752        assert!(bam_opts.read_id_set.is_none());
1753
1754        let _: BamRcRecords<_> = BamRcRecords::new(
1755            &mut reader,
1756            &mut bam_opts,
1757            &mut InputMods::<OptionalTag>::default(),
1758        )
1759        .unwrap();
1760
1761        // Both should still be None
1762        assert!(bam_opts.read_id_list.is_none());
1763        assert!(bam_opts.read_id_set.is_none());
1764    }
1765
1766    #[test]
1767    fn bam_rc_records_read_id_list_none_read_id_set_some() {
1768        // Test (true, false) case: read_id_set is Some, read_id_list is None
1769        // read_id_set should remain unchanged
1770        let mut reader = nanalogue_bam_reader("examples/example_1.bam").unwrap();
1771        let mut read_id_set = HashSet::new();
1772        let _: bool = read_id_set.insert("read1".to_owned());
1773        let _: bool = read_id_set.insert("read2".to_owned());
1774
1775        let mut bam_opts = InputBamBuilder::default()
1776            .bam_path(PathOrURLOrStdin::Path("examples/example_1.bam".into()))
1777            .read_id_set(read_id_set.clone())
1778            .build()
1779            .unwrap();
1780
1781        assert!(bam_opts.read_id_list.is_none());
1782        assert!(bam_opts.read_id_set.is_some());
1783
1784        let _: BamRcRecords<_> = BamRcRecords::new(
1785            &mut reader,
1786            &mut bam_opts,
1787            &mut InputMods::<OptionalTag>::default(),
1788        )
1789        .unwrap();
1790
1791        // read_id_set should remain unchanged
1792        assert!(bam_opts.read_id_list.is_none());
1793        assert_eq!(bam_opts.read_id_set, Some(read_id_set));
1794    }
1795}