nanalogue_core/
simulate_mod_bam.rs

1//! # Write Simulated Mod BAM
2//! Generates simulated BAM files with base modifications for testing purposes.
3//! Accepts JSON configuration to produce both BAM and FASTA reference files.
4//! Please note that both BAM files and FASTA files are created from scratch,
5//! so please do not specify pre-existing BAM or FASTA files in the output
6//! path; if so, they will be overwritten.
7//!
8//! ## Example Usage
9//!
10//! We've shown how to construct `SimulationConfig` which contains input options
11//! and then use it to create a BAM file below. You can also use [`SimulationConfigBuilder`]
12//! to achieve this without using a `json` structure. Some of the fields in the `json` entry
13//! shown below are optional; please read the comments following it.
14//!
15//! ```
16//! use nanalogue_core::{Error, SimulationConfig, simulate_mod_bam::run};
17//! use uuid::Uuid;
18//!
19//! let config_json = r#"{
20//!   "contigs": {
21//!     "number": 4,
22//!     "len_range": [1000, 8000],
23//!     "repeated_seq": "ATCGAATT"
24//!   },
25//!   "reads": [{
26//!     "number": 1000,
27//!     "mapq_range": [10, 20],
28//!     "base_qual_range": [10, 20],
29//!     "len_range": [0.1, 0.8],
30//!     "barcode": "ACGTAA",
31//!     "delete": [0.5, 0.7],
32//!     "insert_middle": "ATCG",
33//!     "mismatch": 0.5,
34//!     "mods": [{
35//!         "base": "T",
36//!         "is_strand_plus": true,
37//!         "mod_code": "T",
38//!         "win": [4, 5],
39//!         "mod_range": [[0.1, 0.2], [0.3, 0.4]]
40//!     }]
41//!   }]
42//! }"#;
43//!
44//! // Note: * We generate four contigs `4` with a length chosen randomly from the range `1000-8000`
45//! //         in the example above. Contigs are called `contig_00000`, ..., `contig_00003`.
46//! //       * "repeated_seq" field is optional in contigs; if set, contigs are made by
47//! //         repeating this sequence instead of generating random sequences.
48//! //       * Reads are generated with options shown. In the example, 1000 reads are made, with a
49//! //         mapping quality in the range 10-20, a basecalling quality in the range 10-20,
50//! //         lengths in the range 10%-80% of a contig (each read is mapped to a contig chosen
51//! //         randomly with equal probability for all contigs). Reads are randomly assigned to be
52//! //         in one of seven states -> primary/secondary/supplementary X fwd/rev plus unmapped.
53//! //         Read sequences are drawn from the associated contig at a random location.
54//! //         (Unmapped reads are also drawn from contig sequences; they are just marked unmapped
55//! //          after the fact).
56//! //         Read names are 'n.uuid' where n is the read group number and uuid is a random UUID
57//! //         (UUID is a random string that looks like this for example ->
58//! //          "e9529f28-d27a-497a-be7e-bffdb77e8bc1"). Read group number is assigned
59//! //         automatically, depending on how many entries are in the `reads` field.
60//! //         In the example here, there is only one entry, so the read group number is 0,
61//! //         so all the reads will have a name like '0.uuid'.
62//! //       * "barcode" field is optional and can be omitted; barcodes added to both ends.
63//! //         As sequences and lengths are generated independently of barcodes,
64//! //         a barcode will _add_ 2 times so many bp to sequence length statistics.
65//! //       * "delete" is optional. If specified, the shown section is deleted on
66//! //         all reads. In the example, the section between the middle of the read (50%)
67//! //         and seven-tenths of the read (70%) is deleted. The direction in the instructions
68//! //         here is parallel to the reference genome.
69//! //       * "insert_middle" is optional. If specified, the specified sequence will be inserted
70//! //         at the middle of the read.
71//! //       * "mismatch" is optional. If specified, this random fraction of bases is mismatched i.e.
72//! //         we make the base different from what it is supposed to be.
73//! //       * insertion/deletion/barcode are applied after sequence lengths are determined
74//! //         according to the given options. In other words, these options will cause sequence
75//! //         lengths to be different from the given statistics e.g. an insertion of
76//! //         10 bp will make all reads longer than the desired sequence statistics by 10 bp.
77//! //       * "mods" are optional as well, omitting them will create a BAM file
78//! //          with no modification information.
79//! //       * mod information is specified using windows i.e. in window of given size,
80//! //         we enforce that mod probabilities per base are in the given range.
81//! //         Multiple windows can be specified, and they repeat along the
82//! //         read until it ends i.e. we cycle windows so we go 4Ts, 5Ts, 4Ts, 5Ts, ...
83//! //         and probabilities are cycled as (0.1, 0.2), (0.3, 0.4), (0.1, 0.2), ...
84//! //       * if windows and mod_range lists are of different sizes, the cycles may not line up.
85//! //         e.g. if there are three window values and two mod ranges, then
86//! //         windows repeat in cycles of 3 whereas mod ranges will repeat in cycles of 2.
87//! //         You can use such inputs if this is what you want.
88//!
89//! // Paths used here must not exist already as these are files created anew.
90//! let config: SimulationConfig = serde_json::from_str(&config_json)?;
91//! let temp_dir = std::env::temp_dir();
92//! let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
93//! let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
94//! let bai_path = bam_path.with_extension("bam.bai");
95//! run(config, &bam_path, &fasta_path)?;
96//! std::fs::remove_file(&bam_path)?;
97//! std::fs::remove_file(&fasta_path)?;
98//! std::fs::remove_file(&bai_path)?;
99//! # Ok::<(), Error>(())
100//! ```
101
102use crate::{
103    AllowedAGCTN, DNARestrictive, Error, F32Bw0and1, GetDNARestrictive, ModChar, OrdPair, ReadState,
104};
105use crate::{write_bam_denovo, write_fasta};
106use derive_builder::Builder;
107use itertools::join;
108use rand::seq::IteratorRandom as _;
109use rand::{Rng, random};
110use rust_htslib::bam;
111use rust_htslib::bam::record::{Aux, Cigar, CigarString};
112use serde::{Deserialize, Serialize};
113use std::iter;
114use std::num::{NonZeroU32, NonZeroU64};
115use std::ops::RangeInclusive;
116use std::path::Path;
117use std::str::FromStr as _;
118use uuid::Uuid;
119
120/// We need a shorthand for this for a one-liner we use
121/// in a macro provided by `derive_builder`.
122type OF = OrdPair<F32Bw0and1>;
123
124/// Creates an `OrdPair` of `F32Bw0and1` values
125///
126/// We do not export this macro as it involves `unwrap`,
127/// so we only use it locally.
128macro_rules! ord_pair_f32_bw0and1 {
129    ($low:expr, $high:expr) => {
130        OrdPair::new(
131            F32Bw0and1::new($low).unwrap(),
132            F32Bw0and1::new($high).unwrap(),
133        )
134        .unwrap()
135    };
136}
137
138/// Main configuration struct for simulation.
139///
140/// After construction, you can pass it to [`crate::simulate_mod_bam::run`]
141/// to create a mod BAM file. You can build this using [`SimulationConfigBuilder`]
142/// as shown below.
143///
144/// # Example
145///
146/// Below struct can be used in appropriate routines to
147/// create a BAM file.
148///
149/// ```
150/// use nanalogue_core::Error;
151/// use nanalogue_core::simulate_mod_bam::{ContigConfigBuilder, SimulationConfigBuilder,
152///     ReadConfigBuilder, ModConfigBuilder};
153///
154/// // Request two contigs with lengths randomly chosen from 1000 - 2000 bp.
155/// let contig_config = ContigConfigBuilder::default()
156///     .number(2)
157///     .len_range((1000,2000)).build()?;
158///
159/// // First set of reads
160/// let read_config1 = ReadConfigBuilder::default()
161///     .number(100)
162///     .mapq_range((10, 20))
163///     .base_qual_range((30, 40))
164///     .len_range((0.5, 0.6));
165///
166/// // second set of reads, now 200 requested with a modification and a barcode
167/// let mod_config = ModConfigBuilder::default()
168///     .base('C')
169///     .is_strand_plus(true)
170///     .mod_code("m".into())
171///     .win(vec![2, 3])
172///     .mod_range(vec![(0.1, 0.8), (0.3, 0.4)]).build()?;
173///
174/// let read_config2 = read_config1.clone()
175///     .number(200)
176///     .barcode("AATTG".into())
177///     .mods(vec![mod_config]);
178///
179/// // set simulation config, ready to be fed into an appropriate function
180/// let sim_config = SimulationConfigBuilder::default()
181///     .contigs(contig_config)
182///     .reads(vec![read_config1.build()?, read_config2.build()?]).build()?;
183/// # Ok::<(), Error>(())
184/// ```
185#[derive(Builder, Debug, Default, PartialEq, Clone, Serialize, Deserialize)]
186#[serde(default)]
187#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
188#[non_exhaustive]
189pub struct SimulationConfig {
190    /// Configuration for contig generation
191    pub contigs: ContigConfig,
192    /// Configuration for read generation
193    pub reads: Vec<ReadConfig>,
194}
195
196/// Configuration for contig generation
197///
198/// The struct contains parameters which can then be passed to other
199/// functions to construct the actual contig. Also refer to [`SimulationConfig`]
200/// to see how this `struct` can be used, and [`ContigConfigBuilder`] for how
201/// to build this (as shown below in the examples).
202///
203/// # Examples
204///
205/// Request two contigs with lengths randomly chosen from 1000 - 2000 bp.
206/// The DNA of the contigs are randomly-generated.
207///
208/// ```
209/// use nanalogue_core::Error;
210/// use nanalogue_core::simulate_mod_bam::ContigConfigBuilder;
211///
212/// let contig_config = ContigConfigBuilder::default()
213///     .number(2)
214///     .len_range((1000,2000)).build()?;
215///
216/// # Ok::<(), Error>(())
217/// ```
218///
219/// Request similar contigs as above but with the DNA of the contigs
220/// consisting of the same sequence repeated over and over.
221///
222/// ```
223/// # use nanalogue_core::Error;
224/// # use nanalogue_core::simulate_mod_bam::ContigConfigBuilder;
225/// let contig_config = ContigConfigBuilder::default()
226///     .number(2)
227///     .len_range((1000,2000))
228///     .repeated_seq("AAGCTTGA".into()).build()?;
229///
230/// # Ok::<(), Error>(())
231/// ```
232///
233/// Request a contig with a fixed length and sequence.
234/// As the repeated sequence's length is equal to the
235/// contig length, and the contig length is precisely fixed,
236/// you get a non-randomly generated contig.
237///
238/// ```
239/// # use nanalogue_core::Error;
240/// # use nanalogue_core::simulate_mod_bam::ContigConfigBuilder;
241/// let contig_config = ContigConfigBuilder::default()
242///     .number(1)
243///     .len_range((20,20))
244///     .repeated_seq("ACGTACGTACGTACGTACGT".into()).build()?;
245///
246/// # Ok::<(), Error>(())
247/// ```
248#[derive(Builder, Debug, PartialEq, Clone, Serialize, Deserialize)]
249#[serde(default)]
250#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
251#[non_exhaustive]
252pub struct ContigConfig {
253    /// Number of contigs to generate
254    #[builder(field(ty = "u32", build = "self.number.try_into()?"))]
255    pub number: NonZeroU32,
256    /// Contig length range in bp [min, max]
257    #[builder(field(ty = "(u64, u64)", build = "self.len_range.try_into()?"))]
258    pub len_range: OrdPair<NonZeroU64>,
259    /// Optional repeated sequence to use for contigs instead of random generation
260    #[builder(field(
261        ty = "String",
262        build = "(!self.repeated_seq.is_empty()).then(|| DNARestrictive::from_str(&self.repeated_seq)).transpose()?"
263    ))]
264    pub repeated_seq: Option<DNARestrictive>,
265}
266
267/// Configuration for read generation. Also refer to [`SimulationConfig`]
268/// to see how this `struct` can be used, and to [`ReadConfigBuilder`] for
269/// how to build this (as shown below in the examples).
270///
271/// # Example 1
272///
273///  Request reads without modifications
274/// ```
275/// use nanalogue_core::Error;
276/// use nanalogue_core::simulate_mod_bam::ReadConfigBuilder;
277///
278/// // First set of reads
279/// let read_config1 = ReadConfigBuilder::default()
280///     .number(100)
281///     .mapq_range((10, 20))
282///     .base_qual_range((30, 40))
283///     .len_range((0.5, 0.6)).build()?;
284///
285/// # Ok::<(), Error>(())
286/// ```
287/// # Example 2
288///
289///  Request reads with modifications, but also with insertions, deletions, and a mismatch rate.
290/// ```
291/// use nanalogue_core::Error;
292/// use nanalogue_core::simulate_mod_bam::{ReadConfigBuilder, ModConfigBuilder};
293///
294/// let mod_config = ModConfigBuilder::default()
295///     .base('C')
296///     .is_strand_plus(true)
297///     .mod_code("m".into())
298///     .win(vec![2, 3])
299///     .mod_range(vec![(0.1, 0.8), (0.3, 0.4)]).build()?;
300///
301/// let read_config2 = ReadConfigBuilder::default()
302///     .number(200)
303///     .mapq_range((30, 40))
304///     .base_qual_range((50, 60))
305///     .barcode("AATTG".into())
306///     .len_range((0.7, 0.8))
307///     .delete((0.2, 0.3))
308///     .insert_middle("ATCGA".into())
309///     .mismatch(0.1)
310///     .mods(vec![mod_config]).build()?;
311///
312/// # Ok::<(), Error>(())
313/// ```
314#[derive(Builder, Debug, Clone, PartialEq, Serialize, Deserialize)]
315#[serde(default)]
316#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
317#[non_exhaustive]
318pub struct ReadConfig {
319    /// Total number of reads to generate
320    #[builder(field(ty = "u32", build = "self.number.try_into()?"))]
321    pub number: NonZeroU32,
322    /// Mapping quality range [min, max]
323    #[builder(field(ty = "(u8, u8)", build = "self.mapq_range.try_into()?"))]
324    pub mapq_range: OrdPair<u8>,
325    /// Base quality score range [min, max]
326    #[builder(field(ty = "(u8, u8)", build = "self.base_qual_range.try_into()?"))]
327    pub base_qual_range: OrdPair<u8>,
328    /// Read length range as fraction of contig [min, max] (e.g.: [0.1, 0.8] = 10% to 80%)
329    #[builder(field(ty = "(f32, f32)", build = "self.len_range.try_into()?"))]
330    pub len_range: OrdPair<F32Bw0and1>,
331    /// Optional barcode DNA sequence to add to read ends
332    #[builder(field(
333        ty = "String",
334        build = "(!self.barcode.is_empty()).then(|| DNARestrictive::from_str(&self.barcode)).transpose()?"
335    ))]
336    pub barcode: Option<DNARestrictive>,
337    /// Optional deletion: fractional range [start, end] of read to delete (e.g., [0.5, 0.7] deletes middle 20%)
338    #[builder(field(
339        ty = "(f32, f32)",
340        build = "Some(self.delete).map(TryInto::try_into).transpose()?"
341    ))]
342    pub delete: Option<OrdPair<F32Bw0and1>>,
343    /// Optional sequence to insert at the middle of the read
344    #[builder(field(
345        ty = "String",
346        build = "(!self.insert_middle.is_empty()).then(|| DNARestrictive::from_str(&self.insert_middle)).transpose()?"
347    ))]
348    pub insert_middle: Option<DNARestrictive>,
349    /// Optional mismatch fraction: random fraction of bases to mutate (e.g., 0.5 = 50% of bases)
350    #[builder(field(ty = "f32", build = "Some(F32Bw0and1::try_from(self.mismatch)?)"))]
351    pub mismatch: Option<F32Bw0and1>,
352    /// Modification configurations
353    pub mods: Vec<ModConfig>,
354}
355
356/// Configuration for modification generation in simulated BAM files.
357/// Also refer to [`SimulationConfig`] to see how this `struct` can be used,
358/// and [`ModConfigBuilder`] to see how it can be built as shown below.
359///
360/// # Example
361///
362/// Configure modification for cytosines on plus strand.
363/// Here, 2Cs have probabilities randomly chosen b/w 0.4-0.8,
364///       3Cs have probabilities b/w 0.5-0.7,
365///       2Cs have ...,
366///       3Cs have ...,
367///       ...
368///  repeated forever. This can be passed as an input to a routine
369///  sometime later and applied on a given sequence.
370/// ```
371/// use nanalogue_core::Error;
372/// use nanalogue_core::simulate_mod_bam::ModConfigBuilder;
373///
374/// let mod_config_c = ModConfigBuilder::default()
375///     .base('C')
376///     .is_strand_plus(true)
377///     .mod_code("m".into())
378///     .win(vec![2, 3])
379///     .mod_range(vec![(0.4, 0.8), (0.5, 0.7)]).build()?;
380/// # Ok::<(), Error>(())
381/// ```
382#[derive(Builder, Debug, Clone, PartialEq, Serialize, Deserialize)]
383#[serde(default)]
384#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
385#[non_exhaustive]
386pub struct ModConfig {
387    /// Base that is modified (A, C, G, T, or N)
388    #[builder(field(ty = "char", build = "self.base.try_into()?"))]
389    pub base: AllowedAGCTN,
390    /// Whether this is on the plus strand
391    pub is_strand_plus: bool,
392    /// Modification code (character or numeric)
393    #[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
394    pub mod_code: ModChar,
395    /// Vector of window sizes for modification density variation.
396    /// e.g. if you want reads with a window of 100 bases with each
397    /// modified with a probability in the range 0.4-0.6 and the next
398    /// 200 modified in the range 0.1-0.2 (and this two window config
399    /// repeating forever), you need to set this win array to
400    /// [100, 200] and the mod range array to [[0.4, 0.6], [0.1, 0.2]]
401    /// (Remembering to adjust the datatypes of the inputs suitably
402    /// i.e. use `NonZeroU32` etc.)
403    /// NOTE: "bases" in the above description refer to bases of interest
404    /// set in the base field i.e. 100 bases with base set to "T" mean
405    /// 100 thymidines specifically.
406    #[builder(field(
407        ty = "Vec<u32>",
408        build = "self.win.iter().map(|&x| NonZeroU32::new(x).ok_or(Error::Zero(\"cannot use zero-\
409sized windows in builder\".to_owned()))).collect::<Result<Vec<NonZeroU32>,_>>()?"
410    ))]
411    pub win: Vec<NonZeroU32>,
412    /// Vector of modification density range e.g. [[0.4, 0.6], [0.1, 0.2]].
413    /// Also see description of `win` above.
414    #[builder(field(
415        ty = "Vec<(f32, f32)>",
416        build = "self.mod_range.iter().map(|&x| OF::try_from(x)).collect::<Result<Vec<OF>, _>>()?"
417    ))]
418    pub mod_range: Vec<OrdPair<F32Bw0and1>>,
419}
420
421/// Represents a contig with name and sequence.
422/// Can be built using [`ContigConfigBuilder`] as shown below.
423///
424/// # Example
425///
426/// Example contig with a simple sequence and name, to be passed
427/// somewhere else to be constructed.
428///
429/// ```
430/// use nanalogue_core::{Error, simulate_mod_bam::ContigBuilder};
431///
432/// let contig = ContigBuilder::default()
433///     .name("chr1")
434///     .seq("ACGTACGTACGTACGT".into()).build()?;
435/// # Ok::<(), Error>(())
436#[derive(Builder, Debug, Clone, Serialize, Deserialize)]
437#[serde(default)]
438#[builder(default, build_fn(error = "Error"))]
439#[non_exhaustive]
440pub struct Contig {
441    /// Contig name
442    #[builder(setter(into))]
443    pub name: String,
444    /// Contig sequence (A, C, G, T)
445    #[builder(field(ty = "String", build = "self.seq.parse()?"))]
446    pub seq: DNARestrictive,
447}
448
449impl GetDNARestrictive for Contig {
450    /// Returns a reference to the contig's DNA
451    fn get_dna_restrictive(&self) -> &DNARestrictive {
452        &self.seq
453    }
454}
455
456impl Default for Contig {
457    fn default() -> Self {
458        Self {
459            name: String::new(),
460            seq: DNARestrictive::from_str("A").expect("valid default DNA"),
461        }
462    }
463}
464
465impl Default for ContigConfig {
466    fn default() -> Self {
467        Self {
468            number: NonZeroU32::new(1).unwrap(),
469            len_range: OrdPair::new(NonZeroU64::new(1).unwrap(), NonZeroU64::new(1).unwrap())
470                .unwrap(),
471            repeated_seq: None,
472        }
473    }
474}
475
476impl Default for ReadConfig {
477    fn default() -> Self {
478        Self {
479            number: NonZeroU32::new(1).unwrap(),
480            mapq_range: OrdPair::new(0, 0).unwrap(),
481            base_qual_range: OrdPair::new(0, 0).unwrap(),
482            len_range: ord_pair_f32_bw0and1!(0.0, 0.0),
483            barcode: None,
484            delete: None,
485            insert_middle: None,
486            mismatch: None,
487            mods: Vec::new(),
488        }
489    }
490}
491
492impl Default for ModConfig {
493    fn default() -> Self {
494        Self {
495            base: AllowedAGCTN::C,
496            is_strand_plus: true,
497            mod_code: ModChar::new('m'),
498            win: vec![NonZeroU32::new(1).unwrap()],
499            mod_range: vec![ord_pair_f32_bw0and1!(0.0, 1.0)],
500        }
501    }
502}
503
504/// Builder for creating a sequence with cigar string and optional barcode
505#[derive(Debug, Clone, Serialize, Deserialize)]
506pub struct PerfectSeqMatchToNot {
507    /// The DNA sequence to be processed
508    seq: Vec<u8>,
509    /// Optional barcode to add to both ends of the sequence
510    barcode: Option<DNARestrictive>,
511    /// Optional deletion: fractional range of sequence to delete
512    delete: Option<OrdPair<F32Bw0and1>>,
513    /// Optional sequence to insert at the middle
514    insert_middle: Option<DNARestrictive>,
515    /// Optional mismatch fraction
516    mismatch: Option<F32Bw0and1>,
517}
518
519impl PerfectSeqMatchToNot {
520    /// Creates a new builder with the given sequence
521    ///
522    /// # Errors
523    /// Returns `Error::InvalidState` if the sequence is empty
524    ///
525    /// # Examples
526    ///
527    /// Valid sequence:
528    /// ```
529    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
530    ///
531    /// let builder = PerfectSeqMatchToNot::seq(b"ACGT".to_vec()).unwrap();
532    /// ```
533    ///
534    /// Empty sequence returns error:
535    /// ```
536    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
537    /// use nanalogue_core::Error;
538    ///
539    /// let result = PerfectSeqMatchToNot::seq(b"".to_vec());
540    /// assert!(matches!(result, Err(Error::InvalidState(_))));
541    /// ```
542    pub fn seq(seq: Vec<u8>) -> Result<Self, Error> {
543        if seq.is_empty() {
544            return Err(Error::InvalidState(
545                "Sequence length is 0; cannot create PerfectSeqMatchToNot with empty sequence"
546                    .into(),
547            ));
548        }
549        Ok(Self {
550            seq,
551            barcode: None,
552            delete: None,
553            insert_middle: None,
554            mismatch: None,
555        })
556    }
557
558    /// Sets the barcode for this sequence
559    #[must_use]
560    pub fn barcode(mut self, barcode: DNARestrictive) -> Self {
561        self.barcode = Some(barcode);
562        self
563    }
564
565    /// Sets the deletion range for this sequence
566    #[must_use]
567    pub fn delete(mut self, delete: OrdPair<F32Bw0and1>) -> Self {
568        self.delete = Some(delete);
569        self
570    }
571
572    /// Sets the sequence to insert at the middle
573    #[must_use]
574    pub fn insert_middle(mut self, insert: DNARestrictive) -> Self {
575        self.insert_middle = Some(insert);
576        self
577    }
578
579    /// Sets the mismatch fraction for this sequence
580    #[must_use]
581    pub fn mismatch(mut self, mismatch: F32Bw0and1) -> Self {
582        self.mismatch = Some(mismatch);
583        self
584    }
585
586    /// Builds the final sequence with barcode (if specified) and corresponding cigar string
587    ///
588    /// # Examples
589    ///
590    /// Without barcode, forward read:
591    /// ```
592    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
593    /// use nanalogue_core::{ReadState, Error};
594    /// use rand::Rng;
595    ///
596    /// let mut rng = rand::rng();
597    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
598    ///     .build(ReadState::PrimaryFwd, &mut rng)?;
599    /// assert_eq!(seq, b"GGGGGGGG");
600    /// assert_eq!(cigar.expect("no error").to_string(), "8M");
601    /// # Ok::<(), Error>(())
602    /// ```
603    ///
604    /// Without barcode, reverse read:
605    /// ```
606    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
607    /// use nanalogue_core::{ReadState, Error};
608    /// use rand::Rng;
609    ///
610    /// let mut rng = rand::rng();
611    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
612    ///     .build(ReadState::PrimaryRev, &mut rng)?;
613    /// assert_eq!(seq, b"GGGGGGGG");
614    /// assert_eq!(cigar.expect("no error").to_string(), "8M");
615    /// # Ok::<(), Error>(())
616    /// ```
617    ///
618    /// With barcode, forward read (barcode + seq + revcomp(barcode)):
619    /// ```
620    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
621    /// use nanalogue_core::{ReadState, DNARestrictive, Error};
622    /// use std::str::FromStr;
623    /// use rand::Rng;
624    ///
625    /// let mut rng = rand::rng();
626    /// let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
627    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
628    ///     .barcode(barcode)
629    ///     .build(ReadState::PrimaryFwd, &mut rng)?;
630    /// assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
631    /// assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
632    /// # Ok::<(), Error>(())
633    /// ```
634    ///
635    /// With barcode, reverse read (comp(barcode) + seq + rev(barcode)):
636    /// ```
637    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
638    /// use nanalogue_core::{ReadState, DNARestrictive, Error};
639    /// use std::str::FromStr;
640    /// use rand::Rng;
641    ///
642    /// let mut rng = rand::rng();
643    /// let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
644    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
645    ///     .barcode(barcode)
646    ///     .build(ReadState::PrimaryRev, &mut rng)?;
647    /// assert_eq!(seq, b"TGCATTGGGGGGGGAATGCA");
648    /// assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
649    /// # Ok::<(), Error>(())
650    /// ```
651    ///
652    /// Unmapped read returns None for cigar:
653    /// ```
654    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
655    /// use nanalogue_core::{ReadState, Error};
656    /// use rand::Rng;
657    ///
658    /// let mut rng = rand::rng();
659    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
660    ///     .build(ReadState::Unmapped, &mut rng)?;
661    /// assert_eq!(seq, b"GGGGGGGG");
662    /// assert!(cigar.is_none());
663    /// # Ok::<(), Error>(())
664    /// ```
665    ///
666    /// With deletion (removes a portion of the sequence):
667    /// ```
668    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
669    /// use nanalogue_core::{ReadState, F32Bw0and1, OrdPair, Error};
670    /// use rand::Rng;
671    ///
672    /// let mut rng = rand::rng();
673    /// let delete_range = OrdPair::new(
674    ///     F32Bw0and1::try_from(0.5)?,
675    ///     F32Bw0and1::try_from(0.75)?
676    /// )?;
677    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAATTTTCCCCGGGG".to_vec())?
678    ///     .delete(delete_range)
679    ///     .build(ReadState::PrimaryFwd, &mut rng)?;
680    /// // Deletes positions 8-12 (CCCC), resulting in AAAATTTTGGGG
681    /// assert_eq!(seq, b"AAAATTTTGGGG");
682    /// assert_eq!(cigar.expect("no error").to_string(), "8M4D4M");
683    /// # Ok::<(), Error>(())
684    /// ```
685    ///
686    /// With insertion in the middle:
687    /// ```
688    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
689    /// use nanalogue_core::{ReadState, DNARestrictive, Error};
690    /// use std::str::FromStr;
691    /// use rand::Rng;
692    ///
693    /// let mut rng = rand::rng();
694    /// let insert_seq = DNARestrictive::from_str("TTTT")?;
695    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAGGGG".to_vec())?
696    ///     .insert_middle(insert_seq)
697    ///     .build(ReadState::PrimaryFwd, &mut rng)?;
698    /// // Inserts TTTT at position 4 (middle), resulting in AAAATTTTGGGG
699    /// assert_eq!(seq, b"AAAATTTTGGGG");
700    /// assert_eq!(cigar.expect("no error").to_string(), "4M4I4M");
701    /// # Ok::<(), Error>(())
702    /// ```
703    ///
704    /// With mismatches (randomly changes bases):
705    /// ```
706    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
707    /// use nanalogue_core::{ReadState, F32Bw0and1, Error};
708    /// use rand::Rng;
709    ///
710    /// let mut rng = rand::rng();
711    /// let mismatch_frac = F32Bw0and1::try_from(0.5)?;
712    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
713    ///     .mismatch(mismatch_frac)
714    ///     .build(ReadState::PrimaryFwd, &mut rng)?;
715    /// // 50% of bases (4 out of 8) will be changed to different bases
716    /// let mismatch_count = seq.iter().filter(|&&b| b != b'A').count();
717    /// assert_eq!(mismatch_count, 4);
718    /// // CIGAR remains 8M (mismatches don't change CIGAR)
719    /// assert_eq!(cigar.expect("no error").to_string(), "8M");
720    /// # Ok::<(), Error>(())
721    /// ```
722    ///
723    /// Combining deletion and insertion:
724    /// ```
725    /// use nanalogue_core::simulate_mod_bam::PerfectSeqMatchToNot;
726    /// use nanalogue_core::{ReadState, F32Bw0and1, OrdPair, DNARestrictive, Error};
727    /// use std::str::FromStr;
728    /// use rand::Rng;
729    ///
730    /// let mut rng = rand::rng();
731    /// let delete_range = OrdPair::new(
732    ///     F32Bw0and1::try_from(0.5)?,
733    ///     F32Bw0and1::try_from(0.75)?
734    /// )?;
735    /// let insert_seq = DNARestrictive::from_str("TT")?;
736    /// let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAACCCCGGGG".to_vec())?
737    ///     .delete(delete_range)
738    ///     .insert_middle(insert_seq)
739    ///     .build(ReadState::PrimaryFwd, &mut rng)?;
740    /// // Applies both deletion and insertion operations
741    /// assert_eq!(seq, b"AAAACCTTGGG");
742    /// assert_eq!(cigar.expect("no error").to_string(), "6M2I3D3M");
743    /// # Ok::<(), Error>(())
744    /// ```
745    ///
746    /// # Returns
747    /// A tuple of (`final_sequence`, `cigar_string`) wrapped in a `Result` where:
748    /// - `final_sequence` includes barcodes if specified
749    /// - `cigar_string` includes `SoftClip` operations for barcodes, or `None` if unmapped
750    ///
751    /// # Errors
752    /// - If parameters result in a sequence with no mapped bases i.e. a sequence without
753    ///   any M characters in the CIGAR string.
754    /// - If a deletion occurs right after a softclip at the start of a sequence
755    ///   a cigar like "20S10D..." is not valid as we can just shift the start of the sequence
756    ///   and make the cigar "20S...". We will end up in this state if the deletion is right at
757    ///   the beginning of the sequence. We may deal with this in the future by shifting the start
758    ///   of the sequence, but we don't want to do this right now.
759    ///
760    /// # Panics
761    /// Panics on number conversion errors (sequence or barcode length overflow)
762    #[expect(
763        clippy::cast_precision_loss,
764        clippy::cast_possible_truncation,
765        clippy::cast_sign_loss,
766        reason = "Casting and arithmetic needed for fractional calculations on sequence positions"
767    )]
768    #[expect(
769        clippy::too_many_lines,
770        reason = "performs many steps to alter sequence (indel, mismatch, barcode)"
771    )]
772    pub fn build<R: Rng>(
773        self,
774        read_state: ReadState,
775        rng: &mut R,
776    ) -> Result<(Vec<u8>, Option<CigarString>), Error> {
777        let seq = self.seq;
778
779        // Step 1: Create initial representation as (base, operation) tuples
780        // All bases start with 'M' for Match operation
781        let mut bases_and_ops: Vec<(u8, u8)> = seq.iter().map(|&base| (base, b'M')).collect();
782
783        // Step 2: Apply mismatch (randomly mutate bases, keep 'M' operation)
784        // We use `choose_multiple` on indices to preserve position information,
785        // unlike `partial_shuffle` which would reorder elements and destroy positions.
786        if let Some(mismatch_frac) = self.mismatch {
787            let num_mismatches =
788                ((bases_and_ops.len() as f32) * mismatch_frac.val()).round() as usize;
789
790            let indices_to_mutate: Vec<usize> =
791                (0..bases_and_ops.len()).choose_multiple(rng, num_mismatches);
792
793            for idx in indices_to_mutate {
794                let item = bases_and_ops
795                    .get_mut(idx)
796                    .expect("idx is within bounds from choose_multiple");
797                let current_base = item.0;
798                let new_base = match current_base {
799                    v @ (b'A' | b'C' | b'G' | b'T') => {
800                        // Sample random bases until we get A/C/G/T different from the current
801                        loop {
802                            let candidate: AllowedAGCTN = rng.random();
803                            let candidate_u8: u8 = candidate.into();
804                            if candidate_u8 != v && candidate_u8 != b'N' {
805                                break candidate_u8;
806                            }
807                        }
808                    }
809                    other => other, // Keep N and other bases as is
810                };
811                item.0 = new_base;
812            }
813        }
814
815        // Step 3: Apply delete (mark bases as deleted with 'D' operation)
816        if let Some(delete_range) = self.delete {
817            let start = ((bases_and_ops.len() as f32) * delete_range.low().val()).round() as usize;
818            let end_raw =
819                ((bases_and_ops.len() as f32) * delete_range.high().val()).round() as usize;
820            let end = end_raw.min(bases_and_ops.len());
821
822            if let Some(slice) = bases_and_ops.get_mut(start..end) {
823                for item in slice {
824                    item.1 = b'D';
825                }
826            }
827        }
828
829        // Step 4: Apply insert_middle (add new tuples with 'I' operation)
830        if let Some(insert_seq) = self.insert_middle {
831            let middle = bases_and_ops.len() / 2;
832            let insert_bases = insert_seq.get_dna_restrictive().get();
833            let insertions: Vec<(u8, u8)> = insert_bases.iter().map(|&b| (b, b'I')).collect();
834
835            drop(
836                bases_and_ops
837                    .splice(middle..middle, insertions)
838                    .collect::<Vec<_>>(),
839            );
840        }
841
842        // Step 5: Apply barcode (add tuples with 'S' for SoftClip operation)
843        if let Some(barcode) = self.barcode {
844            // Use add_barcode with empty sequence to get the transformed barcodes
845            let barcoded_seq = add_barcode(&[], barcode.clone(), read_state);
846            let barcode_len = barcode.get_dna_restrictive().get().len();
847
848            // Split the result into start and end barcodes
849            let (start_bc, end_bc) = barcoded_seq.split_at(barcode_len);
850
851            // Prepend start barcode with 'S' operations
852            let bc_tuples_start: Vec<(u8, u8)> = start_bc.iter().map(|&b| (b, b'S')).collect();
853            drop(
854                bases_and_ops
855                    .splice(0..0, bc_tuples_start)
856                    .collect::<Vec<_>>(),
857            );
858
859            // Append end barcode with 'S' operations
860            let bc_tuples_end: Vec<(u8, u8)> = end_bc.iter().map(|&b| (b, b'S')).collect();
861            bases_and_ops.extend(bc_tuples_end);
862        }
863
864        // Step 6: Build CIGAR string from operations (second entries)
865        let cigar_string = CigarString(
866            bases_and_ops
867                .iter()
868                .map(|&(_, op)| op)
869                .fold(Vec::<(u8, u32)>::new(), |mut acc, op| {
870                    match acc.last_mut() {
871                        Some(&mut (last_op, ref mut count)) if last_op == op => {
872                            *count = count.saturating_add(1);
873                        }
874                        _ => acc.push((op, 1u32)),
875                    }
876                    acc
877                })
878                .into_iter()
879                .map(|(op, count)| match op {
880                    b'M' => Cigar::Match(count),
881                    b'I' => Cigar::Ins(count),
882                    b'D' => Cigar::Del(count),
883                    b'S' => Cigar::SoftClip(count),
884                    _ => unreachable!("Invalid CIGAR operation: {op}"),
885                })
886                .collect::<Vec<_>>(),
887        );
888
889        // Step 7: Build final sequence by filtering out deletions and collecting bases
890        let final_seq: Vec<u8> = bases_and_ops
891            .iter()
892            .copied()
893            .filter_map(|item| (item.1 != b'D').then_some(item.0))
894            .collect();
895
896        // Step 8: final checks
897        let first_ok = (cigar_string.0.as_slice().first().map(|x| x.char()) == Some('M'))
898            || (cigar_string
899                .0
900                .as_slice()
901                .first_chunk::<2>()
902                .map(|&x| [x[0].char(), x[1].char()])
903                == Some(['S', 'M']));
904        let last_ok = (cigar_string.0.as_slice().last().map(|x| x.char()) == Some('M'))
905            || (cigar_string
906                .0
907                .as_slice()
908                .last_chunk::<2>()
909                .map(|&x| [x[0].char(), x[1].char()])
910                == Some(['M', 'S']));
911        if first_ok && last_ok {
912            Ok((
913                final_seq,
914                (!read_state.is_unmapped()).then_some(cigar_string),
915            ))
916        } else {
917            Err(Error::SimulateDNASeqCIGAREndProblem(
918                "too few bp or insertion/deletion close to end".to_owned(),
919            ))
920        }
921    }
922}
923
924/// Generates BAM MM and ML tags with DNA modification probabilities
925///
926/// # Examples
927///
928/// ```
929/// use std::str::FromStr;
930/// use nanalogue_core::{DNARestrictive, Error};
931/// use nanalogue_core::simulate_mod_bam::{ModConfigBuilder, generate_random_dna_modification};
932/// use rand::Rng;
933///
934/// // Create a DNA sequence with multiple cytosines
935/// let seq = DNARestrictive::from_str("ACGTCGCGATCGACGTCGCGATCG").unwrap();
936///
937/// // Configure modification for cytosines on plus strand.
938/// // NOTE: we use mod ranges with an identical low and high value below
939/// // because we do not want to deal with random values in an example.
940///
941/// let mod_config_c = ModConfigBuilder::default()
942///     .base('C')
943///     .is_strand_plus(true)
944///     .mod_code("m".into())
945///     .win(vec![2, 3])
946///     .mod_range(vec![(0.8, 0.8), (0.4, 0.4)]).build()?;
947///
948/// // Put modifications for A on the minus strand.
949/// // Again, we use equal low and high value of 0.2 as we do not
950/// // want to deal with values drawn randomly from a range.
951///
952/// let mod_config_a = ModConfigBuilder::default()
953///     .base('A')
954///     .is_strand_plus(false)
955///     .mod_code("20000".into())
956///     .win([2].into())
957///     .mod_range(vec![(0.2, 0.2)]).build()?;
958///
959/// let mod_config = vec![mod_config_c, mod_config_a];
960///
961/// let mut rng = rand::rng();
962/// let (mm_str, ml_str) = generate_random_dna_modification(&mod_config, &seq, &mut rng);
963///
964/// // MM string format: C+m?gap1,gap2,gap3,gap4,...;A-20000?,...;
965/// assert_eq!(mm_str, String::from("C+m?,0,0,0,0,0,0,0,0;A-20000?,0,0,0,0;"));
966///
967/// // ML string contains probabilities
968/// assert_eq!(ml_str, vec![204, 204, 102, 102, 102, 204, 204, 102, 51, 51, 51, 51]);
969/// # Ok::<(), Error>(())
970/// ```
971///
972pub fn generate_random_dna_modification<R: Rng, S: GetDNARestrictive>(
973    mod_configs: &[ModConfig],
974    seq: &S,
975    rng: &mut R,
976) -> (String, Vec<u8>) {
977    let seq_bytes = seq.get_dna_restrictive().get();
978    let mut mm_str = String::new();
979
980    // rough estimate of capacity below
981    let mut ml_vec: Vec<u8> = Vec::with_capacity(seq_bytes.len());
982
983    for mod_config in mod_configs {
984        let base = u8::from(mod_config.base);
985        let strand = if mod_config.is_strand_plus { '+' } else { '-' };
986        let mod_code = mod_config.mod_code;
987        let mut count = if base == b'N' {
988            seq_bytes.len()
989        } else {
990            seq_bytes
991                .iter()
992                .zip(iter::repeat(&base))
993                .filter(|&(&a, &b)| a == b)
994                .count()
995        };
996        let mut output: Vec<u8> = Vec::with_capacity(count);
997        for k in mod_config
998            .win
999            .iter()
1000            .cycle()
1001            .zip(mod_config.mod_range.iter().cycle())
1002        {
1003            let low = u8::from(k.1.low());
1004            let high = u8::from(k.1.high());
1005            #[expect(
1006                clippy::redundant_else,
1007                reason = "so that the clippy arithmetic lint fits better with the code"
1008            )]
1009            #[expect(
1010                clippy::arithmetic_side_effects,
1011                reason = "we loop only if count > 0 and catch count == 0, thus count doesn't underflow"
1012            )]
1013            if count == 0 {
1014                break;
1015            } else {
1016                for _ in 0..k.0.get() {
1017                    output.push(rng.random_range(low..=high));
1018                    count -= 1;
1019                    if count == 0 {
1020                        break;
1021                    }
1022                }
1023            }
1024        }
1025        if !output.is_empty() {
1026            let mod_len = output.len();
1027            ml_vec.append(&mut output);
1028            mm_str += format!(
1029                "{}{}{}?,{};",
1030                base as char,
1031                strand,
1032                mod_code,
1033                join(vec![0; mod_len], ",")
1034            )
1035            .as_str();
1036        }
1037    }
1038    ml_vec.shrink_to_fit();
1039    (mm_str, ml_vec)
1040}
1041
1042/// Generates random DNA sequence of given length
1043///
1044/// # Panics
1045/// Panics if the sequence length exceeds `usize::MAX` (2^32 - 1 on 32-bit systems, 2^64 - 1 on 64-bit systems).
1046///
1047/// ```
1048/// use std::num::NonZeroU64;
1049/// use rand::Rng;
1050/// use nanalogue_core::simulate_mod_bam::generate_random_dna_sequence;
1051///
1052/// let mut rng = rand::rng();
1053/// let seq = generate_random_dna_sequence(100.try_into().expect("no error"), &mut rng);
1054/// assert_eq!(seq.len(), 100);
1055/// assert!(seq.iter().all(|&base| [b'A', b'C', b'G', b'T'].contains(&base)));
1056/// ```
1057pub fn generate_random_dna_sequence<R: Rng>(length: NonZeroU64, rng: &mut R) -> Vec<u8> {
1058    const DNA_BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
1059    iter::repeat_with(|| {
1060        *DNA_BASES
1061            .get(rng.random_range(0..4))
1062            .expect("random_range(0..4) is always 0-3")
1063    })
1064    .take(usize::try_from(length.get()).expect("sequence length exceeds usize::MAX"))
1065    .collect()
1066}
1067
1068/// Adds barcodes to read sequence based on strand orientation.
1069///
1070/// For forward and unmapped reads:
1071/// - Prepends barcode to read sequence
1072/// - Appends reverse complement of barcode to read sequence
1073///
1074/// For reverse reads:
1075/// - Prepends complement of barcode to read sequence
1076/// - Appends reverse of barcode (not reverse complement) to read sequence
1077///
1078/// # Examples
1079/// ```
1080/// use nanalogue_core::simulate_mod_bam::add_barcode;
1081/// use nanalogue_core::{ReadState, DNARestrictive};
1082/// use std::str::FromStr;
1083///
1084/// // Forward read: barcode + seq + revcomp(barcode)
1085/// let result = add_barcode("GGGGGGGG".as_bytes(), "ACGTAA".parse().unwrap(), ReadState::PrimaryFwd);
1086/// assert_eq!(result, b"ACGTAAGGGGGGGGTTACGT".to_vec());
1087///
1088/// // Reverse read: comp(barcode) + seq + rev(barcode)
1089/// let result = add_barcode("GGGGGGGG".as_bytes(), "ACGTAA".parse().unwrap(), ReadState::PrimaryRev);
1090/// assert_eq!(result, b"TGCATTGGGGGGGGAATGCA".to_vec());
1091/// ```
1092#[must_use]
1093#[expect(
1094    clippy::needless_pass_by_value,
1095    reason = "barcode by value allows passing `&'static str` easily (I think, may fix in future). \
1096If you don't know what this means, ignore this; this is _not_ a bug."
1097)]
1098pub fn add_barcode(read_seq: &[u8], barcode: DNARestrictive, read_state: ReadState) -> Vec<u8> {
1099    let bc_bytes = barcode.get();
1100
1101    match read_state {
1102        ReadState::PrimaryFwd
1103        | ReadState::SecondaryFwd
1104        | ReadState::SupplementaryFwd
1105        | ReadState::Unmapped => {
1106            // Forward/Unmapped: barcode + read_seq + reverse_complement(barcode)
1107            let revcomp_bc = bio::alphabets::dna::revcomp(bc_bytes);
1108            [bc_bytes, read_seq, &revcomp_bc[..]].concat()
1109        }
1110        ReadState::PrimaryRev | ReadState::SecondaryRev | ReadState::SupplementaryRev => {
1111            // Reverse: complement(barcode) + read_seq + reverse(barcode)
1112            let comp_bc: Vec<u8> = bc_bytes
1113                .iter()
1114                .map(|&b| bio::alphabets::dna::complement(b))
1115                .collect();
1116            let rev_bc: Vec<u8> = bc_bytes.iter().copied().rev().collect();
1117            [&comp_bc[..], read_seq, &rev_bc[..]].concat()
1118        }
1119    }
1120}
1121
1122/// Generates contigs with random sequence according to configuration
1123///
1124/// ```
1125/// use std::num::{NonZeroU32, NonZeroU64};
1126/// use nanalogue_core::{OrdPair, GetDNARestrictive};
1127/// use nanalogue_core::simulate_mod_bam::generate_contigs_denovo;
1128/// use rand::Rng;
1129///
1130/// let mut rng = rand::rng();
1131/// let contigs = generate_contigs_denovo(
1132///     3.try_into().unwrap(),
1133///     (100, 200).try_into().unwrap(),
1134///     &mut rng
1135/// );
1136/// assert_eq!(contigs.len(), 3);
1137/// assert!(contigs.iter().all(|c| (100..=200).contains(&c.get_dna_restrictive().get().len())));
1138/// ```
1139///
1140#[expect(
1141    clippy::missing_panics_doc,
1142    reason = "no length number or DNA conversion problems expected"
1143)]
1144pub fn generate_contigs_denovo<R: Rng>(
1145    contig_number: NonZeroU32,
1146    len_range: OrdPair<NonZeroU64>,
1147    rng: &mut R,
1148) -> Vec<Contig> {
1149    (0..contig_number.get())
1150        .map(|i| {
1151            let length = rng.random_range(len_range.low().get()..=len_range.high().get());
1152            let seq_bytes =
1153                generate_random_dna_sequence(NonZeroU64::try_from(length).expect("no error"), rng);
1154            let seq_str = String::from_utf8(seq_bytes).expect("valid DNA sequence");
1155            ContigBuilder::default()
1156                .name(format!("contig_{i:05}"))
1157                .seq(seq_str)
1158                .build()
1159                .expect("no error")
1160        })
1161        .collect()
1162}
1163
1164/// Generates contigs with repeated sequence pattern
1165///
1166/// Creates contigs by repeating a given DNA sequence to reach a random length
1167/// within the specified range. The last repetition is clipped if needed.
1168///
1169/// ```
1170/// use std::num::{NonZeroU32, NonZeroU64};
1171/// use std::str::FromStr;
1172/// use nanalogue_core::{DNARestrictive, GetDNARestrictive, OrdPair};
1173/// use nanalogue_core::simulate_mod_bam::generate_contigs_denovo_repeated_seq;
1174/// use rand::Rng;
1175///
1176/// let mut rng = rand::rng();
1177/// let seq = DNARestrictive::from_str("ACGT").unwrap();
1178/// let contigs = generate_contigs_denovo_repeated_seq(
1179///     2.try_into().unwrap(),
1180///     (10, 12).try_into().unwrap(),
1181///     &seq,
1182///     &mut rng
1183/// );
1184/// assert_eq!(contigs.len(), 2);
1185/// for (i, contig) in contigs.iter().enumerate() {
1186///     assert_eq!(contig.name, format!("contig_0000{i}"));
1187///     match contig.get_dna_restrictive().get() {
1188///         b"ACGTACGTAC" | b"ACGTACGTACG" | b"ACGTACGTACGT" => {},
1189///         _ => panic!("Unexpected sequence"),
1190///     }
1191/// }
1192/// ```
1193#[expect(
1194    clippy::missing_panics_doc,
1195    reason = "no length number or DNA conversion problems expected"
1196)]
1197pub fn generate_contigs_denovo_repeated_seq<R: Rng, S: GetDNARestrictive>(
1198    contig_number: NonZeroU32,
1199    len_range: OrdPair<NonZeroU64>,
1200    seq: &S,
1201    rng: &mut R,
1202) -> Vec<Contig> {
1203    (0..contig_number.get())
1204        .map(|i| {
1205            let length = rng.random_range(len_range.low().get()..=len_range.high().get());
1206            let contig_seq: Vec<u8> = seq
1207                .get_dna_restrictive()
1208                .get()
1209                .iter()
1210                .cycle()
1211                .take(usize::try_from(length).expect("number conversion error"))
1212                .copied()
1213                .collect();
1214            let seq_str =
1215                String::from_utf8(contig_seq).expect("valid UTF-8 from repeated DNA sequence");
1216            ContigBuilder::default()
1217                .name(format!("contig_{i:05}"))
1218                .seq(seq_str)
1219                .build()
1220                .expect("valid DNA sequence from repeated pattern")
1221        })
1222        .collect()
1223}
1224
1225/// Generates reads that align to contigs
1226///
1227/// # Example
1228///
1229/// Here we generate a contig, specify read properties, and ask for them to be generated.
1230/// Although we have used `ContigBuilder` here, you can also use `DNARestrictive` to pass
1231/// DNA sequences directly in the first argument i.e. you can use any `struct` that implements
1232/// `GetDNARestrictive`.
1233///
1234/// ```
1235/// use nanalogue_core::Error;
1236/// use nanalogue_core::simulate_mod_bam::{ContigBuilder, ReadConfigBuilder, generate_reads_denovo};
1237/// use rand::Rng;
1238///
1239/// let mut contig = ContigBuilder::default()
1240///     .name("chr1")
1241///     .seq("ACGTACGTACGTACGT".into()).build()?;
1242/// let contigs = vec![contig];
1243///
1244/// let mut config = ReadConfigBuilder::default()
1245///     .number(10)
1246///     .mapq_range((10, 20))
1247///     .base_qual_range((20, 30))
1248///     .len_range((0.2, 0.5)).build()?;
1249/// // NOTE: barcodes are optional, and will add 2*barcode_length to length statistics.
1250/// //       i.e. length stats are imposed independent of barcodes.
1251/// let mut rng = rand::rng();
1252/// let reads = generate_reads_denovo(&contigs, &config, "RG1", &mut rng).unwrap();
1253/// assert_eq!(reads.len(), 10);
1254/// # Ok::<(), Error>(())
1255/// ```
1256///
1257/// # Errors
1258/// Returns an error if the contigs input slice is empty,
1259/// if parameters are such that zero read lengths are produced,
1260/// or if BAM record creation fails, such as when making RG, MM, or ML tags.
1261#[expect(
1262    clippy::missing_panics_doc,
1263    clippy::too_many_lines,
1264    reason = "number conversion errors or mis-generation of DNA bases are unlikely \
1265    as we generate DNA sequences ourselves here, and genomic data are unlikely \
1266    to exceed ~2^63 bp or have ~2^32 contigs. Function length is acceptable for read generation"
1267)]
1268#[expect(
1269    clippy::cast_possible_truncation,
1270    reason = "read length calculated as a fraction of contig length, managed with trunc()"
1271)]
1272pub fn generate_reads_denovo<R: Rng, S: GetDNARestrictive>(
1273    contigs: &[S],
1274    read_config: &ReadConfig,
1275    read_group: &str,
1276    rng: &mut R,
1277) -> Result<Vec<bam::Record>, Error> {
1278    if contigs.is_empty() {
1279        return Err(Error::UnavailableData("no contigs found".to_owned()));
1280    }
1281
1282    let mut reads = Vec::new();
1283
1284    for _ in 0..read_config.number.get() {
1285        // Select a random contig
1286        let contig_idx = rng.random_range(0..contigs.len());
1287        let contig = contigs
1288            .get(contig_idx)
1289            .expect("contig_idx is within contigs range");
1290        let contig_len = contig.get_dna_restrictive().get().len() as u64;
1291
1292        // Calculate read length as fraction of contig length
1293        #[expect(
1294            clippy::cast_precision_loss,
1295            reason = "u64->f32 causes precision loss but we are fine with this for now"
1296        )]
1297        #[expect(
1298            clippy::cast_sign_loss,
1299            reason = "these are positive numbers so no problem"
1300        )]
1301        let read_len = ((rng
1302            .random_range(read_config.len_range.low().val()..=read_config.len_range.high().val())
1303            * contig_len as f32)
1304            .trunc() as u64)
1305            .min(contig_len);
1306
1307        // Set starting position
1308        let start_pos = rng.random_range(0..=(contig_len.saturating_sub(read_len)));
1309
1310        // Extract sequence from contig
1311        let random_state: ReadState = random();
1312        let end_pos = usize::try_from(start_pos.checked_add(read_len).expect("u64 overflow"))
1313            .expect("number conversion error");
1314        let (read_seq, cigar) = {
1315            let start_idx = usize::try_from(start_pos).expect("number conversion error");
1316            let temp_seq = contig
1317                .get_dna_restrictive()
1318                .get()
1319                .get(start_idx..end_pos)
1320                .expect("start_idx and end_pos are within contig bounds")
1321                .to_vec();
1322            let mut builder = PerfectSeqMatchToNot::seq(temp_seq)?;
1323            if let Some(barcode) = read_config.barcode.clone() {
1324                builder = builder.barcode(barcode);
1325            }
1326            if let Some(delete) = read_config.delete {
1327                builder = builder.delete(delete);
1328            }
1329            if let Some(insert_middle) = read_config.insert_middle.clone() {
1330                builder = builder.insert_middle(insert_middle);
1331            }
1332            if let Some(mismatch) = read_config.mismatch {
1333                builder = builder.mismatch(mismatch);
1334            }
1335            builder.build(random_state, rng)?
1336        };
1337
1338        // Generate quality scores (for final read length including any adjustments like barcodes)
1339        let qual: Vec<u8> = iter::repeat_with(|| {
1340            rng.random_range(RangeInclusive::from(read_config.base_qual_range))
1341        })
1342        .take(read_seq.len())
1343        .collect();
1344
1345        // Generate modification information after reverse complementing sequence if needed
1346        let seq: DNARestrictive;
1347        let (mod_prob_mm_tag, mod_pos_ml_tag) = generate_random_dna_modification(
1348            &read_config.mods,
1349            match random_state.strand() {
1350                '.' | '+' => {
1351                    seq = DNARestrictive::from_str(str::from_utf8(&read_seq).expect("no error"))
1352                        .expect("no error");
1353                    &seq
1354                }
1355                '-' => {
1356                    seq = DNARestrictive::from_str(
1357                        str::from_utf8(&bio::alphabets::dna::revcomp(&read_seq)).expect("no error"),
1358                    )
1359                    .expect("no error");
1360                    &seq
1361                }
1362                _ => unreachable!("`strand` is supposed to return one of +/-/."),
1363            },
1364            rng,
1365        );
1366
1367        // Create BAM record
1368        let record = {
1369            let mut record = bam::Record::new();
1370            let qname = format!("{}.{}", read_group, Uuid::new_v4()).into_bytes();
1371            record.unset_flags();
1372            record.set_flags(u16::from(random_state));
1373            if random_state == ReadState::Unmapped {
1374                record.set(&qname, None, &read_seq, &qual);
1375                record.set_mapq(255);
1376                record.set_tid(-1);
1377                record.set_pos(-1);
1378            } else {
1379                let mapq = rng.random_range(RangeInclusive::from(read_config.mapq_range));
1380                record.set(&qname, cigar.as_ref(), &read_seq, &qual);
1381                record.set_mapq(mapq);
1382                record.set_tid(i32::try_from(contig_idx).expect("number conversion error"));
1383                record.set_pos(i64::try_from(start_pos).expect("number conversion error"));
1384            }
1385            record.set_mpos(-1);
1386            record.set_mtid(-1);
1387            record.push_aux(b"RG", Aux::String(read_group))?;
1388            if !mod_prob_mm_tag.is_empty() && !mod_pos_ml_tag.is_empty() {
1389                record.push_aux(b"MM", Aux::String(mod_prob_mm_tag.as_str()))?;
1390                record.push_aux(b"ML", Aux::ArrayU8((&mod_pos_ml_tag).into()))?;
1391            }
1392            record
1393        };
1394        reads.push(record);
1395    }
1396
1397    Ok(reads)
1398}
1399
1400/// Main function to generate simulated BAM file
1401///
1402/// # Example
1403///
1404/// ```
1405/// use nanalogue_core::{Error, SimulationConfig, simulate_mod_bam::run};
1406/// use uuid::Uuid;
1407///
1408/// let config_json = r#"{
1409///   "contigs": {
1410///     "number": 4,
1411///     "len_range": [1000, 8000]
1412///   },
1413///   "reads": [{
1414///     "number": 1000,
1415///     "mapq_range": [10, 20],
1416///     "base_qual_range": [10, 20],
1417///     "len_range": [0.1, 0.8]
1418///   }]
1419/// }"#;
1420///
1421/// // Note: * Optional "barcode" field can be added to reads (e.g., "barcode": "ACGTAA")
1422/// //         Length statistics are imposed independent of barcodes; so they will add
1423/// //         2x barcode length on top of these.
1424/// //       * Optional "repeated_seq" field can be added to contigs (e.g., "repeated_seq": "ACGT")
1425/// //         to create contigs by repeating a sequence instead of random generation.
1426/// //       * Optional modifications can be added using the "mods" field
1427/// // Note: The files used below should not exist because they will be created by this function.
1428/// //       If they exist, they will be overwritten.
1429///
1430/// let config: SimulationConfig = serde_json::from_str(&config_json)?;
1431/// let temp_dir = std::env::temp_dir();
1432/// let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1433/// let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1434/// let bai_path = bam_path.with_extension("bam.bai");
1435/// run(config, &bam_path, &fasta_path)?;
1436/// std::fs::remove_file(&bam_path)?;
1437/// std::fs::remove_file(&fasta_path)?;
1438/// std::fs::remove_file(&bai_path)?;
1439/// # Ok::<(), Error>(())
1440/// ```
1441///
1442/// # Errors
1443/// Returns an error if JSON parsing fails, read generation fails, or BAM/FASTA writing fails.
1444pub fn run<F>(
1445    config: SimulationConfig,
1446    bam_output_path: &F,
1447    fasta_output_path: &F,
1448) -> Result<(), Error>
1449where
1450    F: AsRef<Path> + ?Sized,
1451{
1452    let mut rng = rand::rng();
1453
1454    let contigs = match config.contigs.repeated_seq {
1455        Some(seq) => generate_contigs_denovo_repeated_seq(
1456            config.contigs.number,
1457            config.contigs.len_range,
1458            &seq,
1459            &mut rng,
1460        ),
1461        None => generate_contigs_denovo(config.contigs.number, config.contigs.len_range, &mut rng),
1462    };
1463    let read_groups: Vec<String> = (0..config.reads.len()).map(|k| k.to_string()).collect();
1464    let reads = {
1465        let mut temp_reads = Vec::new();
1466        for k in config.reads.into_iter().zip(read_groups.clone()) {
1467            temp_reads.append(&mut generate_reads_denovo(&contigs, &k.0, &k.1, &mut rng)?);
1468        }
1469        temp_reads.sort_by_key(|k| (k.is_unmapped(), k.tid(), k.pos(), k.is_reverse()));
1470        temp_reads
1471    };
1472
1473    write_bam_denovo(
1474        reads,
1475        contigs
1476            .iter()
1477            .map(|k| (k.name.clone(), k.get_dna_restrictive().get().len())),
1478        read_groups,
1479        vec![String::from("simulated BAM file, not real data")],
1480        bam_output_path,
1481    )?;
1482    write_fasta(
1483        contigs.into_iter().map(|k| (k.name.clone(), k)),
1484        fasta_output_path,
1485    )?;
1486
1487    Ok(())
1488}
1489
1490/// Temporary BAM simulation with automatic cleanup
1491///
1492/// Creates temporary BAM and FASTA files for testing purposes and
1493/// automatically removes them when dropped.
1494#[derive(Debug, Serialize, Deserialize)]
1495pub struct TempBamSimulation {
1496    /// Path to bam file that will be created
1497    bam_path: String,
1498    /// Path to fasta file that will be created
1499    fasta_path: String,
1500}
1501
1502impl TempBamSimulation {
1503    /// Creates a new temporary BAM simulation from JSON configuration
1504    ///
1505    /// # Errors
1506    /// Returns an error if the simulation run fails
1507    pub fn new(config: SimulationConfig) -> Result<Self, Error> {
1508        let temp_dir = std::env::temp_dir();
1509        let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1510        let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1511
1512        run(config, &bam_path, &fasta_path)?;
1513
1514        Ok(Self {
1515            bam_path: bam_path.to_string_lossy().to_string(),
1516            fasta_path: fasta_path.to_string_lossy().to_string(),
1517        })
1518    }
1519
1520    /// Returns the path to the temporary BAM file
1521    #[must_use]
1522    pub fn bam_path(&self) -> &str {
1523        &self.bam_path
1524    }
1525
1526    /// Returns the path to the temporary FASTA file
1527    #[must_use]
1528    pub fn fasta_path(&self) -> &str {
1529        &self.fasta_path
1530    }
1531}
1532
1533impl Drop for TempBamSimulation {
1534    fn drop(&mut self) {
1535        // Ignore errors during cleanup - files may already be deleted
1536        drop(std::fs::remove_file(&self.bam_path));
1537        drop(std::fs::remove_file(&self.fasta_path));
1538        drop(std::fs::remove_file(&(self.bam_path.clone() + ".bai")));
1539    }
1540}
1541
1542#[cfg(test)]
1543mod random_dna_generation_test {
1544    use super::*;
1545
1546    /// Test for generation of random DNA of a given length
1547    #[test]
1548    fn generate_random_dna_sequence_works() {
1549        let seq = generate_random_dna_sequence(NonZeroU64::new(100).unwrap(), &mut rand::rng());
1550        assert_eq!(seq.len(), 100);
1551        for base in seq {
1552            assert!([b'A', b'C', b'G', b'T'].contains(&base));
1553        }
1554    }
1555}
1556
1557#[cfg(test)]
1558mod read_generation_no_mods_tests {
1559    use super::*;
1560    use rust_htslib::bam::Read as _;
1561
1562    /// Tests read generation with desired properties but no modifications
1563    #[test]
1564    fn generate_reads_denovo_no_mods_works() {
1565        let mut rng = rand::rng();
1566        let contigs = vec![
1567            ContigBuilder::default()
1568                .name("contig_0")
1569                .seq("ACGTACGTACGTACGTACGT".into())
1570                .build()
1571                .unwrap(),
1572            ContigBuilder::default()
1573                .name("contig_1")
1574                .seq("TGCATGCATGCATGCATGCA".into())
1575                .build()
1576                .unwrap(),
1577        ];
1578
1579        let config = ReadConfigBuilder::default()
1580            .number(10)
1581            .mapq_range((10, 20))
1582            .base_qual_range((30, 50))
1583            .len_range((0.2, 0.8))
1584            .build()
1585            .unwrap();
1586
1587        let reads = generate_reads_denovo(&contigs, &config, "ph", &mut rng).unwrap();
1588        assert_eq!(reads.len(), 10);
1589
1590        for read in &reads {
1591            // check mapping information
1592            if !read.is_unmapped() {
1593                assert!((10..=20).contains(&read.mapq()));
1594                assert!((0..2).contains(&read.tid()));
1595            }
1596
1597            // check sequence length, quality, and check that no mods are present
1598            assert!((4..=16).contains(&read.seq_len()));
1599            assert!(read.qual().iter().all(|x| (30..=50).contains(x)));
1600            let _: rust_htslib::errors::Error = read.aux(b"MM").unwrap_err();
1601            let _: rust_htslib::errors::Error = read.aux(b"ML").unwrap_err();
1602
1603            // check read name, must be "ph.UUID"
1604            assert_eq!(read.qname().get(0..3).unwrap(), *b"ph.");
1605            let _: Uuid =
1606                Uuid::parse_str(str::from_utf8(read.qname().get(3..).unwrap()).unwrap()).unwrap();
1607        }
1608    }
1609
1610    /// Tests full BAM creation
1611    #[test]
1612    fn full_bam_generation() {
1613        let config_json = r#"{
1614            "contigs": {
1615                "number": 2,
1616                "len_range": [100, 200]
1617            },
1618            "reads": [{
1619                "number": 1000,
1620                "mapq_range": [10, 20],
1621                "base_qual_range": [10, 20],
1622                "len_range": [0.1, 0.8]
1623            }]
1624        }"#;
1625
1626        // create files with random names and check for existence
1627        let temp_dir = std::env::temp_dir();
1628        let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1629        let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1630
1631        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1632        run(config, &bam_path, &fasta_path).unwrap();
1633
1634        assert!(bam_path.exists());
1635        assert!(fasta_path.exists());
1636
1637        // read BAM file and check contig, read count
1638        let mut reader = bam::Reader::from_path(&bam_path).unwrap();
1639        let header = reader.header();
1640        assert_eq!(header.target_count(), 2);
1641        assert_eq!(reader.records().count(), 1000);
1642
1643        // delete files
1644        let bai_path = bam_path.with_extension("bam.bai");
1645        std::fs::remove_file(&bam_path).unwrap();
1646        std::fs::remove_file(fasta_path).unwrap();
1647        std::fs::remove_file(bai_path).unwrap();
1648    }
1649
1650    /// Tests `TempBamSimulation` struct functionality without mods
1651    #[test]
1652    fn temp_bam_simulation_struct_no_mods() {
1653        let config_json = r#"{
1654            "contigs": {
1655                "number": 2,
1656                "len_range": [100, 200]
1657            },
1658            "reads": [{
1659                "number": 1000,
1660                "mapq_range": [10, 20],
1661                "base_qual_range": [10, 20],
1662                "len_range": [0.1, 0.8]
1663            }]
1664        }"#;
1665
1666        // Create temporary simulation
1667        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1668        let sim = TempBamSimulation::new(config).unwrap();
1669
1670        // Verify files exist
1671        assert!(Path::new(sim.bam_path()).exists());
1672        assert!(Path::new(sim.fasta_path()).exists());
1673
1674        // Read BAM file and check contig count
1675        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
1676        let header = reader.header();
1677        assert_eq!(header.target_count(), 2);
1678        assert_eq!(reader.records().count(), 1000);
1679    }
1680
1681    /// Tests `TempBamSimulation` automatic cleanup
1682    #[test]
1683    fn temp_bam_simulation_cleanup() {
1684        let config_json = r#"{
1685            "contigs": {
1686                "number": 1,
1687                "len_range": [50, 50]
1688            },
1689            "reads": [{
1690                "number": 10,
1691                "len_range": [0.5, 0.5]
1692            }]
1693        }"#;
1694
1695        let bam_path: String;
1696        let fasta_path: String;
1697
1698        {
1699            let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1700            let sim = TempBamSimulation::new(config).unwrap();
1701            bam_path = sim.bam_path().to_string();
1702            fasta_path = sim.fasta_path().to_string();
1703
1704            // Files should exist while sim is in scope
1705            assert!(Path::new(&bam_path).exists());
1706            assert!(Path::new(&fasta_path).exists());
1707        } // sim is dropped here
1708
1709        // Files should be cleaned up after drop
1710        assert!(!Path::new(&bam_path).exists());
1711        assert!(!Path::new(&fasta_path).exists());
1712    }
1713
1714    /// Tests error when generating reads with empty contigs slice
1715    #[test]
1716    fn generate_reads_denovo_empty_contigs_error() {
1717        let contigs: Vec<Contig> = vec![];
1718        let config = ReadConfig::default();
1719        let mut rng = rand::rng();
1720
1721        let result = generate_reads_denovo(&contigs, &config, "test", &mut rng);
1722        assert!(matches!(result, Err(Error::UnavailableData(_))));
1723    }
1724
1725    /// Tests error when read length would be zero
1726    #[test]
1727    fn generate_reads_denovo_zero_length_error() {
1728        let contigs = [ContigBuilder::default()
1729            .name("tiny")
1730            .seq("ACGT".into())
1731            .build()
1732            .unwrap()];
1733
1734        let config = ReadConfigBuilder::default()
1735            .number(1)
1736            .len_range((0.0, 0.0))
1737            .build()
1738            .unwrap();
1739
1740        let mut rng = rand::rng();
1741        let result = generate_reads_denovo(&contigs, &config, "test", &mut rng);
1742        assert!(matches!(result, Err(Error::InvalidState(_))));
1743    }
1744
1745    /// Tests invalid JSON structure causing empty reads generation
1746    #[test]
1747    fn run_empty_reads_error() {
1748        let invalid_json = r#"{ "reads": [] }"#; // Empty reads array
1749        let temp_dir = std::env::temp_dir();
1750        let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1751        let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1752
1753        let config: SimulationConfig = serde_json::from_str(invalid_json).unwrap();
1754        let result = run(config, &bam_path, &fasta_path);
1755        // With empty reads, this should succeed but produce an empty BAM (valid)
1756        // So we won't assert error here, just test it doesn't crash
1757        drop(result);
1758        let bai_path = bam_path.with_extension("bam.bai");
1759        drop(std::fs::remove_file(&bam_path));
1760        drop(std::fs::remove_file(&fasta_path));
1761        drop(std::fs::remove_file(&bai_path));
1762    }
1763
1764    /// Tests multiple read groups in BAM generation
1765    #[test]
1766    fn multiple_read_groups_work() {
1767        let config_json = r#"{
1768            "contigs": {
1769                "number": 2,
1770                "len_range": [100, 200]
1771            },
1772            "reads": [
1773                {
1774                    "number": 50,
1775                    "mapq_range": [10, 20],
1776                    "base_qual_range": [20, 30],
1777                    "len_range": [0.1, 0.5]
1778                },
1779                {
1780                    "number": 75,
1781                    "mapq_range": [30, 40],
1782                    "base_qual_range": [25, 35],
1783                    "len_range": [0.3, 0.7]
1784                },
1785                {
1786                    "number": 25,
1787                    "mapq_range": [5, 15],
1788                    "base_qual_range": [15, 25],
1789                    "len_range": [0.2, 0.6]
1790                }
1791            ]
1792        }"#;
1793
1794        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1795        let sim = TempBamSimulation::new(config).unwrap();
1796        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
1797
1798        // Should have 50 + 75 + 25 = 150 reads total
1799        assert_eq!(reader.records().count(), 150);
1800
1801        // Verify read groups exist in header
1802        let mut reader2 = bam::Reader::from_path(sim.bam_path()).unwrap();
1803        let header = reader2.header();
1804        let header_text = std::str::from_utf8(header.as_bytes()).unwrap();
1805        assert!(header_text.contains("@RG\tID:0"));
1806        assert!(header_text.contains("@RG\tID:1"));
1807        assert!(header_text.contains("@RG\tID:2"));
1808
1809        // Count reads per read group and verify read name format
1810        let mut rg_counts = [0, 0, 0];
1811        for record in reader2.records() {
1812            let read = record.unwrap();
1813            let qname = std::str::from_utf8(read.qname()).unwrap();
1814
1815            if let Ok(Aux::String(rg)) = read.aux(b"RG") {
1816                // Verify read name format: should be "rg.uuid"
1817                let parts: Vec<&str> = qname.split('.').collect();
1818                assert_eq!(parts.len(), 2, "Read name should be in format 'n.uuid'");
1819
1820                let read_group_prefix =
1821                    parts.first().expect("parts should have at least 1 element");
1822                let uuid_part = parts.get(1).expect("parts should have 2 elements");
1823
1824                assert_eq!(
1825                    *read_group_prefix, rg,
1826                    "Read name prefix should match read group"
1827                );
1828
1829                // Verify the second part is a valid UUID
1830                assert!(
1831                    Uuid::parse_str(uuid_part).is_ok(),
1832                    "Read name should contain a valid UUID after the dot"
1833                );
1834
1835                match rg {
1836                    "0" => *rg_counts.get_mut(0).unwrap() += 1,
1837                    "1" => *rg_counts.get_mut(1).unwrap() += 1,
1838                    "2" => *rg_counts.get_mut(2).unwrap() += 1,
1839                    unexpected => unreachable!("Unexpected read group: {unexpected}"),
1840                }
1841            }
1842        }
1843        assert_eq!(rg_counts[0], 50);
1844        assert_eq!(rg_counts[1], 75);
1845        assert_eq!(rg_counts[2], 25);
1846    }
1847
1848    /// Tests that read sequences are valid substrings of their parent contigs
1849    #[expect(
1850        clippy::cast_sign_loss,
1851        reason = "tid and pos are non-negative in valid BAM"
1852    )]
1853    #[expect(
1854        clippy::cast_possible_truncation,
1855        reason = "tid fits in usize for normal BAM files"
1856    )]
1857    #[test]
1858    fn read_sequences_match_contigs() {
1859        let contigs = vec![
1860            ContigBuilder::default()
1861                .name("chr1")
1862                .seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
1863                .build()
1864                .unwrap(),
1865            ContigBuilder::default()
1866                .name("chr2")
1867                .seq("TGCATGCATGCATGCATGCATGCATGCATGCA".into())
1868                .build()
1869                .unwrap(),
1870        ];
1871
1872        let read_config = ReadConfigBuilder::default()
1873            .number(50)
1874            .len_range((0.2, 0.8))
1875            .build()
1876            .unwrap();
1877
1878        let mut rng = rand::rng();
1879        let reads = generate_reads_denovo(&contigs, &read_config, "test", &mut rng).unwrap();
1880
1881        let mut validated_count = 0;
1882        for read in &reads {
1883            // Only validate forward mapped reads without soft clips for simplicity
1884            if !read.is_unmapped() && !read.is_reverse() {
1885                let cigar = read.cigar();
1886
1887                // Skip reads with soft clips (shouldn't happen without barcodes, but be safe)
1888                let has_soft_clip = cigar.iter().any(|op| matches!(op, Cigar::SoftClip(_)));
1889                if has_soft_clip {
1890                    continue;
1891                }
1892
1893                let tid = read.tid() as usize;
1894                let pos = read.pos() as usize;
1895                let contig = contigs.get(tid).expect("valid tid");
1896                let contig_seq = contig.get_dna_restrictive().get();
1897
1898                // Get alignment length
1899                let mut aligned_len = 0usize;
1900                for op in &cigar {
1901                    if let Cigar::Match(len) = *op {
1902                        aligned_len = aligned_len.checked_add(len as usize).expect("overflow");
1903                    }
1904                }
1905
1906                let read_seq = read.seq().as_bytes();
1907                let expected_seq = contig_seq
1908                    .get(pos..pos.checked_add(aligned_len).expect("overflow"))
1909                    .expect("contig subsequence exists");
1910
1911                assert_eq!(
1912                    read_seq, expected_seq,
1913                    "Forward read sequence should exactly match contig substring"
1914                );
1915                validated_count += 1;
1916            }
1917        }
1918
1919        // Ensure we validated at least some reads
1920        assert!(
1921            validated_count > 0,
1922            "Should have validated at least some forward reads"
1923        );
1924    }
1925
1926    /// Tests different read states (unmapped, secondary, supplementary)
1927    #[test]
1928    fn different_read_states_work() {
1929        let config_json = r#"{
1930            "contigs": {
1931                "number": 1,
1932                "len_range": [200, 200]
1933            },
1934            "reads": [{
1935                "number": 1000,
1936                "mapq_range": [10, 20],
1937                "base_qual_range": [20, 30],
1938                "len_range": [0.2, 0.8]
1939            }]
1940        }"#;
1941
1942        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1943        let sim = TempBamSimulation::new(config).unwrap();
1944        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
1945
1946        let mut has_unmapped = false;
1947        let mut has_forward = false;
1948        let mut has_reverse = false;
1949        let mut has_secondary = false;
1950        let mut has_supplementary = false;
1951
1952        for record in reader.records() {
1953            let read = record.unwrap();
1954
1955            if read.is_unmapped() {
1956                has_unmapped = true;
1957                // Verify unmapped read properties
1958                assert_eq!(read.tid(), -1);
1959                assert_eq!(read.pos(), -1);
1960                assert_eq!(read.mapq(), 255);
1961            } else {
1962                if read.is_reverse() {
1963                    has_reverse = true;
1964                } else {
1965                    has_forward = true;
1966                }
1967
1968                if read.is_secondary() {
1969                    has_secondary = true;
1970                }
1971
1972                if read.is_supplementary() {
1973                    has_supplementary = true;
1974                }
1975            }
1976        }
1977
1978        // With 1000 reads, we should have diverse read states
1979        assert!(has_unmapped, "Should have unmapped reads");
1980        assert!(has_forward, "Should have forward reads");
1981        assert!(has_reverse, "Should have reverse reads");
1982        assert!(has_secondary, "Should have secondary reads");
1983        assert!(has_supplementary, "Should have supplementary reads");
1984    }
1985
1986    /// Tests read length at 100% of contig length
1987    #[test]
1988    fn read_length_full_contig_works() {
1989        let contigs = vec![
1990            ContigBuilder::default()
1991                .name("chr1")
1992                .seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
1993                .build()
1994                .unwrap(),
1995        ];
1996
1997        let config_full_length = ReadConfigBuilder::default()
1998            .number(20)
1999            .len_range((1.0, 1.0))
2000            .build()
2001            .unwrap();
2002
2003        let mut rng = rand::rng();
2004        let reads_full =
2005            generate_reads_denovo(&contigs, &config_full_length, "test", &mut rng).unwrap();
2006
2007        // All mapped reads should be exactly contig length
2008        for read in &reads_full {
2009            if !read.is_unmapped() {
2010                assert_eq!(read.seq_len(), 32, "Read should be full contig length");
2011            }
2012        }
2013    }
2014
2015    /// Tests read length with very small fraction of contig
2016    #[test]
2017    fn read_length_small_fraction_works() {
2018        // Use 200bp contig so 1% = 2bp (won't round to 0)
2019        let contigs = [ContigBuilder::default()
2020            .name("chr1")
2021            .seq("ACGT".repeat(50))
2022            .build()
2023            .unwrap()];
2024
2025        let config_small = ReadConfigBuilder::default()
2026            .number(20)
2027            .len_range((0.01, 0.05))
2028            .build()
2029            .unwrap();
2030
2031        let mut rng = rand::rng();
2032        let reads_small = generate_reads_denovo(&contigs, &config_small, "test", &mut rng).unwrap();
2033
2034        // All mapped reads should be very small (2-10 bp given 200bp contig)
2035        for read in &reads_small {
2036            if !read.is_unmapped() {
2037                assert!(read.seq_len() <= 10, "Read should be very small");
2038                assert!(read.seq_len() >= 2, "Read should be at least 2bp");
2039            }
2040        }
2041    }
2042}
2043
2044#[cfg(test)]
2045mod read_generation_barcodes {
2046    use super::*;
2047    use rust_htslib::bam::Read as _;
2048
2049    /// Tests `add_barcode` function with forward and reverse reads
2050    #[expect(
2051        clippy::shadow_unrelated,
2052        reason = "repetition is fine; each block is clearly separated"
2053    )]
2054    #[test]
2055    fn add_barcode_works() {
2056        let read_seq = b"GGGGGGGG".to_vec();
2057        let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
2058
2059        // Test forward read: barcode + seq + revcomp(barcode)
2060        let result = add_barcode(&read_seq, barcode.clone(), ReadState::PrimaryFwd);
2061        assert_eq!(result, b"ACGTAAGGGGGGGGTTACGT".to_vec());
2062
2063        // Test reverse read: comp(barcode) + seq + rev(barcode)
2064        let result = add_barcode(&read_seq, barcode, ReadState::PrimaryRev);
2065        assert_eq!(result, b"TGCATTGGGGGGGGAATGCA".to_vec());
2066    }
2067
2068    /// Tests CIGAR string validity with and without barcodes
2069    #[test]
2070    fn cigar_strings_valid_with_or_without_barcodes() {
2071        // Test without barcodes
2072        let contigs = [ContigBuilder::default()
2073            .name("chr1")
2074            .seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
2075            .build()
2076            .unwrap()];
2077
2078        let config_no_barcode = ReadConfigBuilder::default()
2079            .number(20)
2080            .len_range((0.3, 0.7))
2081            .build()
2082            .unwrap();
2083
2084        let mut rng = rand::rng();
2085        let reads_no_barcode =
2086            generate_reads_denovo(&contigs, &config_no_barcode, "test", &mut rng).unwrap();
2087
2088        for read in &reads_no_barcode {
2089            if !read.is_unmapped() {
2090                let cigar = read.cigar();
2091                // Without barcode, CIGAR should be just a single Match operation
2092                assert_eq!(cigar.len(), 1);
2093                assert!(matches!(cigar.first().unwrap(), Cigar::Match(_)));
2094
2095                // Verify CIGAR length matches sequence length
2096                let cigar_len: u32 = cigar
2097                    .iter()
2098                    .map(|op| match *op {
2099                        Cigar::Match(len) | Cigar::SoftClip(len) => len,
2100                        Cigar::Ins(_)
2101                        | Cigar::Del(_)
2102                        | Cigar::RefSkip(_)
2103                        | Cigar::HardClip(_)
2104                        | Cigar::Pad(_)
2105                        | Cigar::Equal(_)
2106                        | Cigar::Diff(_) => unreachable!(),
2107                    })
2108                    .sum();
2109                assert_eq!(cigar_len as usize, read.seq_len());
2110            }
2111        }
2112
2113        // Test with barcodes
2114        let config_with_barcode = ReadConfigBuilder::default()
2115            .number(20)
2116            .len_range((0.3, 0.7))
2117            .barcode("ACGTAA".into())
2118            .build()
2119            .unwrap();
2120
2121        let reads_with_barcode =
2122            generate_reads_denovo(&contigs, &config_with_barcode, "test", &mut rng).unwrap();
2123
2124        for read in &reads_with_barcode {
2125            if !read.is_unmapped() {
2126                let cigar = read.cigar();
2127                // With barcode, CIGAR should be: SoftClip + Match + SoftClip
2128                assert_eq!(cigar.len(), 3);
2129                assert!(matches!(cigar.first().unwrap(), Cigar::SoftClip(6))); // barcode length
2130                assert!(matches!(cigar.get(1).unwrap(), Cigar::Match(_)));
2131                assert!(matches!(cigar.get(2).unwrap(), Cigar::SoftClip(6))); // barcode length
2132
2133                // Verify total CIGAR length matches sequence length
2134                let cigar_len: u32 = cigar
2135                    .iter()
2136                    .map(|op| match *op {
2137                        Cigar::Match(len) | Cigar::SoftClip(len) => len,
2138                        Cigar::Ins(_)
2139                        | Cigar::Del(_)
2140                        | Cigar::RefSkip(_)
2141                        | Cigar::HardClip(_)
2142                        | Cigar::Pad(_)
2143                        | Cigar::Equal(_)
2144                        | Cigar::Diff(_) => unreachable!(),
2145                    })
2146                    .sum();
2147                assert_eq!(cigar_len as usize, read.seq_len());
2148            }
2149        }
2150    }
2151
2152    /// Tests read generation with barcode
2153    #[test]
2154    fn generate_reads_denovo_with_barcode_works() {
2155        let config_json = r#"{
2156            "contigs": {
2157                "number": 1,
2158                "len_range": [20, 20]
2159            },
2160            "reads": [{
2161                "number": 10,
2162                "len_range": [0.2, 0.8],
2163                "barcode": "GTACGG"
2164            }]
2165        }"#;
2166
2167        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
2168        let sim = TempBamSimulation::new(config).unwrap();
2169        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
2170
2171        for record in reader.records() {
2172            let read = record.unwrap();
2173            let seq = read.seq().as_bytes();
2174            let seq_len = seq.len();
2175
2176            if read.is_reverse() {
2177                // Reverse: comp(GTACGG)=CATGCC at start, rev(GTACGG)=GGCATG at end
2178                assert_eq!(seq.get(..6).expect("seq has at least 6 bases"), b"CATGCC");
2179                assert_eq!(
2180                    seq.get(seq_len.saturating_sub(6)..)
2181                        .expect("seq has at least 6 bases"),
2182                    b"GGCATG"
2183                );
2184            } else {
2185                // Forward: GTACGG at start, revcomp(GTACGG)=CCGTAC at end
2186                assert_eq!(seq.get(..6).expect("seq has at least 6 bases"), b"GTACGG");
2187                assert_eq!(
2188                    seq.get(seq_len.saturating_sub(6)..)
2189                        .expect("seq has at least 6 bases"),
2190                    b"CCGTAC"
2191                );
2192            }
2193        }
2194    }
2195}
2196
2197#[cfg(test)]
2198mod read_generation_with_mods_tests {
2199    use super::*;
2200    use crate::{CurrRead, ThresholdState, curr_reads_to_dataframe};
2201    use rust_htslib::bam::Read as _;
2202
2203    /// Tests read generation with desired properties but with modifications
2204    #[test]
2205    fn generate_reads_denovo_with_mods_works() {
2206        let mut rng = rand::rng();
2207        let contigs = vec![
2208            ContigBuilder::default()
2209                .name("contig_0")
2210                .seq("ACGTACGTACGTACGTACGT".into())
2211                .build()
2212                .unwrap(),
2213            ContigBuilder::default()
2214                .name("contig_1")
2215                .seq("TGCATGCATGCATGCATGCA".into())
2216                .build()
2217                .unwrap(),
2218        ];
2219
2220        let config = ReadConfigBuilder::default()
2221            .number(10000)
2222            .mapq_range((10, 20))
2223            .base_qual_range((30, 50))
2224            .len_range((0.2, 0.8))
2225            .mods(
2226                [ModConfigBuilder::default()
2227                    .base('C')
2228                    .mod_code("m".into())
2229                    .win([4].into())
2230                    .mod_range([(0f32, 1f32)].into())
2231                    .build()
2232                    .unwrap()]
2233                .into(),
2234            )
2235            .build()
2236            .unwrap();
2237
2238        let reads = generate_reads_denovo(&contigs, &config, "1", &mut rng).unwrap();
2239        assert_eq!(reads.len(), 10000);
2240
2241        let mut zero_to_255_visited = vec![false; 256];
2242
2243        for read in &reads {
2244            // check mapping information
2245            if !read.is_unmapped() {
2246                assert!((10..=20).contains(&read.mapq()));
2247                assert!((0..2).contains(&read.tid()));
2248            }
2249
2250            // check other information
2251            assert!((4..=16).contains(&read.seq_len()));
2252            assert!(read.qual().iter().all(|x| (30..=50).contains(x)));
2253
2254            // check modification information
2255            let Aux::String(mod_pos_mm_tag) = read.aux(b"MM").unwrap() else {
2256                unreachable!()
2257            };
2258            let Aux::ArrayU8(mod_prob_ml_tag) = read.aux(b"ML").unwrap() else {
2259                unreachable!()
2260            };
2261            // Parse and verify the actual gap position values in MM tag
2262            let pos: Vec<&str> = mod_pos_mm_tag
2263                .strip_prefix("C+m?,")
2264                .unwrap()
2265                .strip_suffix(';')
2266                .unwrap()
2267                .split(',')
2268                .collect();
2269            assert!(pos.iter().all(|&x| x == "0"), "All MM values should be 0");
2270
2271            // verify probability values which should have been converted to 0-255.
2272            #[expect(
2273                clippy::indexing_slicing,
2274                reason = "we are not gonna bother, this should be fine"
2275            )]
2276            for k in 0..mod_prob_ml_tag.len() {
2277                zero_to_255_visited[usize::from(mod_prob_ml_tag.get(k).expect("no error"))] = true;
2278            }
2279        }
2280
2281        // check that all values from 0-255 have been visited as we are using a huge number of
2282        // reads.
2283        assert!(zero_to_255_visited.into_iter().all(|x| x));
2284    }
2285
2286    /// Tests `TempBamSimulation` struct functionality with mods
2287    #[test]
2288    fn temp_bam_simulation_struct_with_mods() {
2289        let config_json = r#"{
2290            "contigs": {
2291                "number": 2,
2292                "len_range": [100, 200]
2293            },
2294            "reads": [{
2295                "number": 1000,
2296                "mapq_range": [10, 20],
2297                "base_qual_range": [10, 20],
2298                "len_range": [0.1, 0.8],
2299                "mods": [{
2300                    "base": "T",
2301                    "is_strand_plus": true,
2302                    "mod_code": "T",
2303                    "win": [4, 5],
2304                    "mod_range": [[0.1, 0.2], [0.3, 0.4]]
2305                }]
2306            }]
2307        }"#;
2308
2309        // Create temporary simulation
2310        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
2311        let sim = TempBamSimulation::new(config).unwrap();
2312
2313        // Verify files exist
2314        assert!(Path::new(sim.bam_path()).exists());
2315        assert!(Path::new(sim.fasta_path()).exists());
2316
2317        // Read BAM file and check contig, read count
2318        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
2319        let header = reader.header();
2320        assert_eq!(header.target_count(), 2);
2321        assert_eq!(reader.records().count(), 1000);
2322    }
2323
2324    /// Tests `generate_random_dna_modification` with empty mod config
2325    #[test]
2326    fn generate_random_dna_modification_empty_config() {
2327        let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
2328        let mod_configs: Vec<ModConfig> = vec![];
2329        let mut rng = rand::rng();
2330
2331        let (mm_str, ml_vec) = generate_random_dna_modification(&mod_configs, &seq, &mut rng);
2332
2333        assert_eq!(mm_str, String::new());
2334        assert_eq!(ml_vec.len(), 0);
2335    }
2336
2337    /// Tests `generate_random_dna_modification` with single modification config
2338    #[test]
2339    fn generate_random_dna_modification_single_mod() {
2340        let seq = DNARestrictive::from_str("ACGTCGCGATCG").unwrap();
2341        let mod_config = ModConfigBuilder::default()
2342            .base('C')
2343            .mod_code("m".into())
2344            .win(vec![2])
2345            .mod_range(vec![(0.5, 0.5)])
2346            .build()
2347            .unwrap();
2348
2349        let mut rng = rand::rng();
2350        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2351
2352        // Sequence has 4 C's, so we expect 4 modifications
2353        assert_eq!(ml_vec.len(), 4);
2354        // All should be 128 (0.5 * 255)
2355        assert!(ml_vec.iter().all(|&x| x == 128u8));
2356        // All should be zero
2357        let probs: Vec<&str> = mm_str
2358            .strip_prefix("C+m?,")
2359            .unwrap()
2360            .strip_suffix(';')
2361            .unwrap()
2362            .split(',')
2363            .collect();
2364        assert_eq!(probs.len(), 4);
2365        assert!(probs.iter().all(|&x| x == "0"));
2366    }
2367
2368    /// Tests `generate_random_dna_modification` with multiple modification configs
2369    #[test]
2370    fn generate_random_dna_modification_multiple_mods() {
2371        let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
2372
2373        let mod_config_c = ModConfigBuilder::default()
2374            .base('C')
2375            .mod_code('m'.into())
2376            .win(vec![1])
2377            .mod_range(vec![(0.8, 0.8)])
2378            .build()
2379            .unwrap();
2380
2381        let mod_config_t = ModConfigBuilder::default()
2382            .base('T')
2383            .is_strand_plus(false)
2384            .mod_code('t'.into())
2385            .win(vec![1])
2386            .mod_range(vec![(0.4, 0.4)])
2387            .build()
2388            .unwrap();
2389
2390        let mut rng = rand::rng();
2391        let (mm_str, ml_vec) =
2392            generate_random_dna_modification(&[mod_config_c, mod_config_t], &seq, &mut rng);
2393
2394        // Sequence has 2 C's and 2 T's
2395        assert!(mm_str.contains("C+m?,"));
2396        assert!(mm_str.contains("T-t?,"));
2397        assert_eq!(
2398            ml_vec,
2399            vec![204u8, 204u8, 102u8, 102u8],
2400            "ML values should be 204,204,102,102"
2401        );
2402
2403        // Parse and verify the C modifications
2404        let c_section = mm_str
2405            .split(';')
2406            .find(|s| s.starts_with("C+m?"))
2407            .expect("Should have C+m? section");
2408        let c_probs: Vec<&str> = c_section
2409            .strip_prefix("C+m?,")
2410            .unwrap()
2411            .split(',')
2412            .collect();
2413        assert_eq!(
2414            c_probs.len(),
2415            2,
2416            "Should have exactly 2 C modifications in ACGTACGT"
2417        );
2418        // All C pos should be 0
2419        assert!(
2420            c_probs.iter().all(|&x| x == "0"),
2421            "All C modifications should have gap pos 0",
2422        );
2423
2424        // Parse and verify the T modifications
2425        let t_section = mm_str
2426            .split(';')
2427            .find(|s| s.starts_with("T-t?"))
2428            .expect("Should have T-t? section");
2429        let t_probs: Vec<&str> = t_section
2430            .strip_prefix("T-t?,")
2431            .unwrap()
2432            .split(',')
2433            .collect();
2434        assert_eq!(
2435            t_probs.len(),
2436            2,
2437            "Should have exactly 2 T modifications in ACGTACGT"
2438        );
2439        // All T pos should be 0
2440        assert!(
2441            t_probs.iter().all(|&x| x == "0"),
2442            "All T modifications should have a gap pos of 0"
2443        );
2444    }
2445
2446    /// Tests `generate_random_dna_modification` with N base (all bases)
2447    #[test]
2448    fn generate_random_dna_modification_n_base() {
2449        let seq = DNARestrictive::from_str("ACGT").unwrap();
2450
2451        let mod_config = ModConfigBuilder::default()
2452            .base('N')
2453            .is_strand_plus(true)
2454            .mod_code("n".into())
2455            .win([4].into())
2456            .mod_range([(0.5, 0.5)].into())
2457            .build()
2458            .unwrap();
2459        let mut rng = rand::rng();
2460        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2461
2462        // N base means all 4 bases should be marked
2463        assert_eq!(ml_vec.len(), 4);
2464        // Parse and verify the actual pos values in MM tag
2465        let pos: Vec<&str> = mm_str
2466            .strip_prefix("N+n?,")
2467            .unwrap()
2468            .strip_suffix(';')
2469            .unwrap()
2470            .split(',')
2471            .collect();
2472        assert_eq!(pos.len(), 4, "Should have exactly 4 modifications for ACGT");
2473
2474        // All positions should be 0
2475        assert!(
2476            pos.iter().all(|&x| x == "0"),
2477            "All N base modifications should have 0 as their gap position coordinate"
2478        );
2479
2480        // Verify all ML values are 128 (probabilities)
2481        assert_eq!(
2482            ml_vec,
2483            vec![128u8, 128u8, 128u8, 128u8],
2484            "All ML values should be 128 and number of values is 4"
2485        );
2486    }
2487
2488    /// Tests `generate_random_dna_modification` with cycling windows
2489    #[test]
2490    fn generate_random_dna_modification_cycling_windows() {
2491        let seq = DNARestrictive::from_str("CCCCCCCCCCCCCCCC").unwrap(); // 16 C's
2492
2493        let mod_config = ModConfigBuilder::default()
2494            .base('C')
2495            .mod_code("m".into())
2496            .win([3, 2].into())
2497            .mod_range([(0.8, 0.8), (0.4, 0.4)].into())
2498            .build()
2499            .unwrap();
2500
2501        let mut rng = rand::rng();
2502        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2503
2504        // Should have 16 modifications, cycling pattern: 3@0.8, 2@0.4, 3@0.8, 2@0.4, ...
2505        let pos: Vec<&str> = mm_str
2506            .strip_prefix("C+m?,")
2507            .unwrap()
2508            .strip_suffix(';')
2509            .unwrap()
2510            .split(',')
2511            .collect();
2512        assert_eq!(pos, vec!["0"; 16]);
2513        // Verify the cycling pattern (0.8 = 204, 0.4 = 102)
2514        let expected_pattern: Vec<u8> = vec![
2515            204, 204, 204, 102, 102, 204, 204, 204, 102, 102, 204, 204, 204, 102, 102, 204,
2516        ];
2517        assert_eq!(ml_vec, expected_pattern);
2518    }
2519    /// Tests multiple simultaneous modifications on different bases
2520    #[expect(
2521        clippy::similar_names,
2522        reason = "has_c_mod, has_a_mod, has_t_mod are clear in context"
2523    )]
2524    #[test]
2525    fn multiple_simultaneous_modifications_work() {
2526        let config_json = r#"{
2527            "contigs": {
2528                "number": 1,
2529                "len_range": [100, 100],
2530                "repeated_seq": "ACGTACGTACGTACGT"
2531            },
2532            "reads": [{
2533                "number": 20,
2534                "mapq_range": [20, 30],
2535                "base_qual_range": [20, 30],
2536                "len_range": [0.8, 1.0],
2537                "mods": [
2538                    {
2539                        "base": "C",
2540                        "is_strand_plus": true,
2541                        "mod_code": "m",
2542                        "win": [2],
2543                        "mod_range": [[0.7, 0.9]]
2544                    },
2545                    {
2546                        "base": "A",
2547                        "is_strand_plus": true,
2548                        "mod_code": "a",
2549                        "win": [3],
2550                        "mod_range": [[0.3, 0.5]]
2551                    },
2552                    {
2553                        "base": "T",
2554                        "is_strand_plus": false,
2555                        "mod_code": "t",
2556                        "win": [1],
2557                        "mod_range": [[0.5, 0.6]]
2558                    }
2559                ]
2560            }]
2561        }"#;
2562
2563        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
2564        let sim = TempBamSimulation::new(config).unwrap();
2565        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
2566
2567        let mut has_c_mod = false;
2568        let mut has_a_mod = false;
2569        let mut has_t_mod = false;
2570
2571        for record in reader.records() {
2572            let read = record.unwrap();
2573            if let Ok(Aux::String(mm_tag)) = read.aux(b"MM") {
2574                // Check that all three modification types are present
2575                if mm_tag.contains("C+m?") {
2576                    has_c_mod = true;
2577                }
2578                if mm_tag.contains("A+a?") {
2579                    has_a_mod = true;
2580                }
2581                if mm_tag.contains("T-t?") {
2582                    has_t_mod = true;
2583                }
2584
2585                // Verify ML tag exists and has correct format
2586                assert!(
2587                    matches!(read.aux(b"ML").unwrap(), Aux::ArrayU8(_)),
2588                    "ML tag should be ArrayU8"
2589                );
2590            }
2591        }
2592
2593        assert!(has_c_mod, "Should have C+m modifications");
2594        assert!(has_a_mod, "Should have A+a modifications");
2595        assert!(has_t_mod, "Should have T-t modifications");
2596    }
2597
2598    /// Tests that modifications only target the specified bases
2599    #[expect(
2600        clippy::shadow_unrelated,
2601        reason = "clear variable reuse in sequential tests"
2602    )]
2603    #[test]
2604    fn modifications_target_correct_bases() {
2605        // Create sequence with known base counts
2606        let seq = DNARestrictive::from_str("AAAACCCCGGGGTTTT").unwrap();
2607
2608        // template we will reuse for various mods
2609        let mod_config_template = ModConfigBuilder::default()
2610            .base('C')
2611            .mod_code("m".into())
2612            .win([4].into())
2613            .mod_range([(1.0, 1.0)].into());
2614
2615        // Test C modification - should only mark C bases
2616        let mod_config_c = mod_config_template.clone().build().unwrap();
2617
2618        let mut rng = rand::rng();
2619        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_c], &seq, &mut rng);
2620
2621        // Sequence has exactly 4 C's, so should have 4 modifications
2622        assert_eq!(ml_vec.len(), 4);
2623        let probs: Vec<&str> = mm_str
2624            .strip_prefix("C+m?,")
2625            .unwrap()
2626            .strip_suffix(';')
2627            .unwrap()
2628            .split(',')
2629            .collect();
2630        assert_eq!(probs.len(), 4, "Should have exactly 4 C modifications");
2631
2632        // Test T modification - should only mark T bases
2633        let mod_config_t = mod_config_template
2634            .clone()
2635            .base('T')
2636            .is_strand_plus(false)
2637            .mod_code("t".into())
2638            .build()
2639            .unwrap();
2640
2641        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_t], &seq, &mut rng);
2642
2643        // Sequence has exactly 4 T's, so should have 4 modifications
2644        assert_eq!(ml_vec.len(), 4);
2645        let probs: Vec<&str> = mm_str
2646            .strip_prefix("T-t?,")
2647            .unwrap()
2648            .strip_suffix(';')
2649            .unwrap()
2650            .split(',')
2651            .collect();
2652        assert_eq!(probs.len(), 4, "Should have exactly 4 T modifications");
2653
2654        // Test A modification - should only mark A bases
2655        let mod_config_a = mod_config_template
2656            .clone()
2657            .base('A')
2658            .mod_code("a".into())
2659            .build()
2660            .unwrap();
2661
2662        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_a], &seq, &mut rng);
2663
2664        // Sequence has exactly 4 A's, so should have 4 modifications
2665        assert_eq!(ml_vec.len(), 4);
2666        let probs: Vec<&str> = mm_str
2667            .strip_prefix("A+a?,")
2668            .unwrap()
2669            .strip_suffix(';')
2670            .unwrap()
2671            .split(',')
2672            .collect();
2673        assert_eq!(probs.len(), 4, "Should have exactly 4 A modifications");
2674
2675        // Test G modification - should only mark G bases
2676        let mod_config_g = mod_config_template
2677            .clone()
2678            .base('G')
2679            .mod_code("g".into())
2680            .build()
2681            .unwrap();
2682
2683        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_g], &seq, &mut rng);
2684
2685        // Sequence has exactly 4 G's, so should have 4 modifications
2686        assert_eq!(ml_vec.len(), 4);
2687        let probs: Vec<&str> = mm_str
2688            .strip_prefix("G+g?,")
2689            .unwrap()
2690            .strip_suffix(';')
2691            .unwrap()
2692            .split(',')
2693            .collect();
2694        assert_eq!(probs.len(), 4, "Should have exactly 4 G modifications");
2695    }
2696
2697    /// Tests edge case: sequence with no target base for modification
2698    #[test]
2699    fn edge_case_no_target_base_for_modification() {
2700        let seq = DNARestrictive::from_str("AAAAAAAAAA").unwrap(); // Only A's
2701
2702        let mod_config = ModConfigBuilder::default()
2703            .base('C')
2704            .mod_code("m".into())
2705            .win([5].into())
2706            .mod_range([(0.5, 0.5)].into())
2707            .build()
2708            .unwrap();
2709
2710        let mut rng = rand::rng();
2711        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2712
2713        // Should have no modifications since no C's exist
2714        assert_eq!(mm_str, String::new());
2715        assert_eq!(ml_vec.len(), 0);
2716    }
2717
2718    /// Tests edge case: modification with probability 0.0 and 1.0
2719    #[expect(
2720        clippy::shadow_unrelated,
2721        reason = "clear variable reuse in sequential tests"
2722    )]
2723    #[test]
2724    fn edge_case_modification_probability_extremes() {
2725        let seq = DNARestrictive::from_str("CCCCCCCC").unwrap();
2726
2727        // template to build modification configurations
2728        let mod_config_template = ModConfigBuilder::default()
2729            .base('C')
2730            .mod_code("m".into())
2731            .win([8].into());
2732
2733        let mod_config_zero = mod_config_template
2734            .clone()
2735            .mod_range([(0.0, 0.0)].into())
2736            .build()
2737            .unwrap();
2738
2739        let mut rng = rand::rng();
2740        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_zero], &seq, &mut rng);
2741
2742        assert_eq!(ml_vec, vec![0u8; 8]);
2743        let pos: Vec<&str> = mm_str
2744            .strip_prefix("C+m?,")
2745            .unwrap()
2746            .strip_suffix(';')
2747            .unwrap()
2748            .split(',')
2749            .collect();
2750        assert!(pos.iter().all(|&x| x == "0"));
2751
2752        // Test probability 1.0
2753        let mod_config_one = mod_config_template
2754            .clone()
2755            .mod_range([(1.0, 1.0)].into())
2756            .build()
2757            .unwrap();
2758
2759        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_one], &seq, &mut rng);
2760
2761        assert_eq!(ml_vec, vec![255u8; 8]);
2762        let pos: Vec<&str> = mm_str
2763            .strip_prefix("C+m?,")
2764            .unwrap()
2765            .strip_suffix(';')
2766            .unwrap()
2767            .split(',')
2768            .collect();
2769        assert!(pos.iter().all(|&x| x == "0"));
2770    }
2771
2772    /// Tests config deserialization from JSON string with mods
2773    #[expect(
2774        clippy::indexing_slicing,
2775        reason = "test validates length before indexing"
2776    )]
2777    #[test]
2778    fn config_deserialization_works_with_json_with_mods() {
2779        // Create a JSON config string
2780        let json_str = r#"{
2781            "contigs": {
2782                "number": 5,
2783                "len_range": [100, 500],
2784                "repeated_seq": "ACGTACGT"
2785            },
2786            "reads": [
2787                {
2788                    "number": 100,
2789                    "mapq_range": [10, 30],
2790                    "base_qual_range": [20, 40],
2791                    "len_range": [0.1, 0.9],
2792                    "barcode": "ACGTAA",
2793                    "mods": [{
2794                        "base": "C",
2795                        "is_strand_plus": true,
2796                        "mod_code": "m",
2797                        "win": [5, 3],
2798                        "mod_range": [[0.3, 0.7], [0.1, 0.5]]
2799                    }]
2800                },
2801                {
2802                    "number": 50,
2803                    "mapq_range": [5, 15],
2804                    "base_qual_range": [15, 25],
2805                    "len_range": [0.2, 0.8]
2806                }
2807            ]
2808        }"#;
2809
2810        // Deserialize
2811        let config: SimulationConfig = serde_json::from_str(json_str).unwrap();
2812
2813        // Verify all fields deserialized correctly
2814        assert_eq!(config.contigs.number.get(), 5);
2815        assert_eq!(config.contigs.len_range.low().get(), 100);
2816        assert_eq!(config.contigs.len_range.high().get(), 500);
2817        assert!(config.contigs.repeated_seq.is_some());
2818
2819        assert_eq!(config.reads.len(), 2);
2820
2821        // Verify first read config
2822        assert_eq!(config.reads[0].number.get(), 100);
2823        assert_eq!(config.reads[0].mapq_range.low(), 10);
2824        assert_eq!(config.reads[0].mapq_range.high(), 30);
2825        assert_eq!(config.reads[0].base_qual_range.low(), 20);
2826        assert_eq!(config.reads[0].base_qual_range.high(), 40);
2827        assert!(config.reads[0].barcode.is_some());
2828        assert_eq!(config.reads[0].mods.len(), 1);
2829        assert!(matches!(config.reads[0].mods[0].base, AllowedAGCTN::C));
2830        assert_eq!(config.reads[0].mods[0].win.len(), 2);
2831
2832        // Verify second read config
2833        assert_eq!(config.reads[1].number.get(), 50);
2834        assert_eq!(config.reads[1].mods.len(), 0);
2835        assert!(config.reads[1].barcode.is_none());
2836    }
2837
2838    /// Tests that modifications are applied to barcode bases
2839    /// Creates a sequence with no C's, but barcode has C's, so modifications should appear
2840    #[test]
2841    fn barcodes_with_modifications_integration_works() {
2842        // Create contig with only A's (no C's)
2843        let contigs = [ContigBuilder::default()
2844            .name("chr1")
2845            .seq("A".repeat(50))
2846            .build()
2847            .unwrap()];
2848
2849        let config = ReadConfigBuilder::default()
2850            .number(20)
2851            .len_range((0.3, 0.7))
2852            .barcode("ACGTAA".into()) // barcode contains Cs i.e. barcode sequence has one C
2853            .mods(
2854                [ModConfigBuilder::default()
2855                    .base('C')
2856                    .mod_code("m".into())
2857                    .win([10].into())
2858                    .mod_range([(0.8, 0.8)].into())
2859                    .build()
2860                    .unwrap()]
2861                .into(),
2862            )
2863            .build()
2864            .unwrap();
2865
2866        let mut rng = rand::rng();
2867        let reads = generate_reads_denovo(&contigs, &config, "test", &mut rng).unwrap();
2868
2869        let mut has_modifications = false;
2870        for read in &reads {
2871            if !read.is_unmapped() {
2872                // The read sequence should have barcodes added
2873                // Forward: ACGTAA + AAAAA... + TTACGT (barcode has 1 C, rev comp has 0 C)
2874                // Reverse: TGCATT + AAAAA... + AATGCA (complement has 1 C, reverse has 0 C)
2875                // So each read should have at least 1 C from the barcode
2876
2877                let mm_pos_result = read.aux(b"MM");
2878                let ml_prob_result = read.aux(b"ML");
2879
2880                // Should have MM and ML tags because barcode introduced C's
2881                assert!(
2882                    mm_pos_result.is_ok(),
2883                    "MM tag should be present due to C in barcode"
2884                );
2885                assert!(
2886                    ml_prob_result.is_ok(),
2887                    "ML tag should be present due to C in barcode"
2888                );
2889
2890                if let Ok(Aux::String(mm_str)) = mm_pos_result {
2891                    assert!(
2892                        mm_str.starts_with("C+m?,"),
2893                        "MM tag should contain C+m modifications"
2894                    );
2895                    assert!(!mm_str.is_empty(), "MM tag should not be empty");
2896                    has_modifications = true;
2897                }
2898            }
2899        }
2900
2901        assert!(
2902            has_modifications,
2903            "Should have found C modifications from barcode bases"
2904        );
2905    }
2906
2907    /// Tests edge case: window size larger than number of target bases in sequence
2908    #[test]
2909    fn edge_case_window_larger_than_sequence() {
2910        let seq = DNARestrictive::from_str("ACGTACGT").unwrap(); // Only 2 C's
2911
2912        let mod_config = ModConfigBuilder::default()
2913            .base('C')
2914            .mod_code("m".into())
2915            .win([100].into()) // window of 100 Cs but only 2Cs exist in the sequence.
2916            .mod_range([(0.5, 0.5)].into())
2917            .build()
2918            .unwrap();
2919
2920        let mut rng = rand::rng();
2921        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2922
2923        // Should only generate modifications for the 2 C's that exist
2924        assert_eq!(
2925            ml_vec,
2926            vec![128u8, 128u8],
2927            "All prob should be 128 (0.5*255)"
2928        );
2929        let pos: Vec<&str> = mm_str
2930            .strip_prefix("C+m?,")
2931            .unwrap()
2932            .strip_suffix(';')
2933            .unwrap()
2934            .split(',')
2935            .collect();
2936        assert_eq!(pos.len(), 2, "Should have exactly 2 position values");
2937        assert!(pos.iter().all(|&x| x == "0"), "All pos should be 0");
2938    }
2939
2940    /// Tests edge case: single-base windows with multiple cycles
2941    #[test]
2942    fn edge_case_single_base_windows() {
2943        let seq = DNARestrictive::from_str("C".repeat(16).as_str()).unwrap(); // 16 C's
2944
2945        let mod_config = ModConfigBuilder::default()
2946            .base('C')
2947            .is_strand_plus(true)
2948            .mod_code("m".into())
2949            .win([1].into())
2950            .mod_range([(0.9, 0.9), (0.1, 0.1)].into())
2951            .build()
2952            .unwrap();
2953
2954        let mut rng = rand::rng();
2955        let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2956
2957        // Should have 16 modifications, alternating pattern: 0.9, 0.1, 0.9, 0.1, ...
2958        // Verify alternating pattern (0.9*255=229.5→230, 0.1*255=25.5→26)
2959        let expected_pattern: Vec<u8> = vec![
2960            230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26,
2961        ];
2962        assert_eq!(ml_vec, expected_pattern);
2963        let pos: Vec<&str> = mm_str
2964            .strip_prefix("C+m?,")
2965            .unwrap()
2966            .strip_suffix(';')
2967            .unwrap()
2968            .split(',')
2969            .collect();
2970        assert_eq!(pos, vec!["0"; 16], "pos should have 16 entries, all 0");
2971    }
2972
2973    /// Tests that modifications on reverse reads are correctly applied to reverse-complemented sequence
2974    #[test]
2975    fn reverse_reads_modification_positions_correct() {
2976        // Create sequence with 3 C's: "AAAAACAAAAACAAAAAC"
2977        let contigs = [ContigBuilder::default()
2978            .name("chr1")
2979            .seq("AAAAACAAAAACAAAAAC".into())
2980            .build()
2981            .unwrap()];
2982
2983        let config = ReadConfigBuilder::default()
2984            .number(100)
2985            .len_range((1.0, 1.0))
2986            .mods(
2987                [ModConfigBuilder::default()
2988                    .base('C')
2989                    .mod_code("m".into())
2990                    .win([10].into())
2991                    .mod_range([(0.8, 0.8)].into())
2992                    .build()
2993                    .unwrap()]
2994                .into(),
2995            )
2996            .build()
2997            .unwrap();
2998
2999        let mut rng = rand::rng();
3000        let reads = generate_reads_denovo(&contigs, &config, "test", &mut rng).unwrap();
3001
3002        let mut forward_with_mods = 0;
3003
3004        for read in &reads {
3005            if !read.is_unmapped() {
3006                let mm_result = read.aux(b"MM");
3007
3008                if read.is_reverse() {
3009                    // Reverse complement of "AAAAACAAAAACAAAAAC"
3010                    // has 3 G's but 0 C's, so no C modifications should exist
3011                    assert!(
3012                        mm_result.is_err(),
3013                        "Reverse reads should have no MM tag (revcomp has no C's)"
3014                    );
3015                } else {
3016                    // Forward reads should have MM tag with 3 C modifications
3017                    assert!(mm_result.is_ok(), "Forward reads should have MM tag");
3018                    if let Ok(Aux::String(mm_str)) = mm_result {
3019                        // Parse and count modifications
3020                        let probs: Vec<&str> = mm_str
3021                            .strip_prefix("C+m?,")
3022                            .unwrap()
3023                            .strip_suffix(';')
3024                            .unwrap()
3025                            .split(',')
3026                            .collect();
3027                        assert_eq!(probs.len(), 3, "Should have exactly 3 C modifications");
3028                        forward_with_mods += 1;
3029                    }
3030                }
3031            }
3032        }
3033
3034        assert!(
3035            forward_with_mods > 0,
3036            "Should have validated some forward reads with modifications"
3037        );
3038    }
3039
3040    /// Tests that mismatches affect modification reference positions while preserving mod quality
3041    ///
3042    /// This test creates two read groups:
3043    /// - Group 0: 100% mod probability (quality 255), no mismatches - mods at expected C positions
3044    /// - Group 1: 51% mod probability (quality ~130), 100% mismatch - mods at shifted positions
3045    ///
3046    /// For a "ACGT" repeated contig, C's are at positions 1, 5, 9, 13, ... (form 4n+1).
3047    /// On reverse complement, G's become C's at positions 2, 6, 10, 14, ... (form 4n+2).
3048    /// With 100% mismatch, positions should be shifted and no longer follow these patterns.
3049    #[expect(
3050        clippy::too_many_lines,
3051        reason = "test requires setup, iteration, and multiple assertions for thorough validation"
3052    )]
3053    #[test]
3054    fn mismatch_mod_check() {
3055        let json_str = r#"{
3056            "contigs": {
3057                "number": 1,
3058                "len_range": [100, 100],
3059                "repeated_seq": "ACGT"
3060            },
3061            "reads": [
3062                {
3063                    "number": 100,
3064                    "len_range": [1.0, 1.0],
3065                    "mods": [{
3066                        "base": "C",
3067                        "is_strand_plus": true,
3068                        "mod_code": "m",
3069                        "win": [1],
3070                        "mod_range": [[1.0, 1.0]]
3071                    }]
3072                },
3073                {
3074                    "number": 100,
3075                    "len_range": [1.0, 1.0],
3076                    "mismatch": 1.0,
3077                    "mods": [{
3078                        "base": "C",
3079                        "is_strand_plus": true,
3080                        "mod_code": "m",
3081                        "win": [1],
3082                        "mod_range": [[0.51, 0.51]]
3083                    }]
3084                }
3085            ]
3086        }"#;
3087
3088        let config: SimulationConfig = serde_json::from_str(json_str).unwrap();
3089        let sim = TempBamSimulation::new(config).unwrap();
3090
3091        let mut bam = bam::Reader::from_path(sim.bam_path()).unwrap();
3092        let mut df_collection = Vec::new();
3093
3094        for k in bam.records() {
3095            let record = k.unwrap();
3096            let curr_read = CurrRead::default()
3097                .try_from_only_alignment(&record)
3098                .unwrap()
3099                .set_mod_data(&record, ThresholdState::GtEq(0), 0)
3100                .unwrap();
3101            df_collection.push(curr_read);
3102        }
3103
3104        let df = curr_reads_to_dataframe(df_collection.as_slice()).unwrap();
3105
3106        let read_id_col = df.column("read_id").unwrap().str().unwrap();
3107        let ref_position_col = df.column("ref_position").unwrap().i64().unwrap();
3108        let alignment_type_col = df.column("alignment_type").unwrap().str().unwrap();
3109        let mod_quality_col = df.column("mod_quality").unwrap().u32().unwrap();
3110
3111        let mut row_count_0: usize = 0;
3112        let mut row_count_1: usize = 0;
3113
3114        // Violation counters for debugging
3115        let mut group0_forward_position_violations: usize = 0;
3116        let mut group0_reverse_position_violations: usize = 0;
3117        let mut group0_unmapped_position_violations: usize = 0;
3118        let mut group0_quality_violations: usize = 0;
3119        let mut group1_forward_position_violations: usize = 0;
3120        let mut group1_reverse_position_violations: usize = 0;
3121        let mut group1_unmapped_position_violations: usize = 0;
3122        let mut group1_quality_violations: usize = 0;
3123        let mut read_id_violations: usize = 0;
3124
3125        for i in 0..df.height() {
3126            let read_id = read_id_col.get(i).unwrap();
3127            let ref_position = ref_position_col.get(i).unwrap();
3128            let alignment_type = alignment_type_col.get(i).unwrap();
3129            let mod_quality = mod_quality_col.get(i).unwrap();
3130
3131            if read_id.starts_with("0.") {
3132                // Group 0: no mismatch, mod positions should follow 4n+1 (forward) or 4n+2 (reverse)
3133                if alignment_type.contains("forward") {
3134                    if ref_position % 4 != 1 {
3135                        group0_forward_position_violations += 1;
3136                    }
3137                } else if alignment_type.contains("reverse") {
3138                    if ref_position % 4 != 2 {
3139                        group0_reverse_position_violations += 1;
3140                    }
3141                } else {
3142                    // unmapped reads should have ref_position of -1
3143                    if ref_position != -1 {
3144                        group0_unmapped_position_violations += 1;
3145                    }
3146                }
3147                if mod_quality != 255 {
3148                    group0_quality_violations += 1;
3149                }
3150                row_count_0 += 1;
3151            } else if read_id.starts_with("1.") {
3152                // Group 1: 100% mismatch, mod positions should NOT follow expected patterns
3153                if alignment_type.contains("forward") {
3154                    if ref_position % 4 == 1 {
3155                        group1_forward_position_violations += 1;
3156                    }
3157                } else if alignment_type.contains("reverse") {
3158                    if ref_position % 4 == 2 {
3159                        group1_reverse_position_violations += 1;
3160                    }
3161                } else {
3162                    // unmapped reads should have ref_position of -1
3163                    if ref_position != -1 {
3164                        group1_unmapped_position_violations += 1;
3165                    }
3166                }
3167                if mod_quality != 130 {
3168                    group1_quality_violations += 1;
3169                }
3170                row_count_1 += 1;
3171            } else {
3172                read_id_violations += 1;
3173            }
3174        }
3175
3176        // Assert all violations are zero
3177        assert_eq!(
3178            group0_forward_position_violations, 0,
3179            "Group 0 forward: expected 0 position violations"
3180        );
3181        assert_eq!(
3182            group0_reverse_position_violations, 0,
3183            "Group 0 reverse: expected 0 position violations"
3184        );
3185        assert_eq!(
3186            group0_unmapped_position_violations, 0,
3187            "Group 0 unmapped: expected 0 position violations"
3188        );
3189        assert_eq!(
3190            group0_quality_violations, 0,
3191            "Group 0: expected 0 quality violations"
3192        );
3193        assert_eq!(
3194            group1_forward_position_violations, 0,
3195            "Group 1 forward: expected 0 position violations"
3196        );
3197        assert_eq!(
3198            group1_reverse_position_violations, 0,
3199            "Group 1 reverse: expected 0 position violations"
3200        );
3201        assert_eq!(
3202            group1_unmapped_position_violations, 0,
3203            "Group 1 unmapped: expected 0 position violations"
3204        );
3205        assert_eq!(
3206            group1_quality_violations, 0,
3207            "Group 1: expected 0 quality violations"
3208        );
3209        assert_eq!(read_id_violations, 0, "expected 0 read_id violations");
3210        assert!(
3211            row_count_0 > 0,
3212            "Should have processed some rows from group 0"
3213        );
3214        assert!(
3215            row_count_1 > 0,
3216            "Should have processed some rows from group 1"
3217        );
3218    }
3219}
3220
3221#[cfg(test)]
3222mod contig_generation_tests {
3223    use super::*;
3224    use rust_htslib::bam::Read as _;
3225
3226    /// Tests edge case: repeated sequence contig generation
3227    #[test]
3228    fn edge_case_repeated_sequence_exact_length() {
3229        let config_json = r#"{
3230            "contigs": {
3231                "number": 1,
3232                "len_range": [16, 16],
3233                "repeated_seq": "ACGT"
3234            },
3235            "reads": [{
3236                "number": 5,
3237                "len_range": [0.5, 0.5]
3238            }]
3239        }"#;
3240
3241        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
3242        let sim = TempBamSimulation::new(config).unwrap();
3243        let reader = bam::Reader::from_path(sim.bam_path()).unwrap();
3244
3245        // Verify contig is exactly ACGTACGTACGTACGT (4 repeats)
3246        let header = reader.header();
3247        assert_eq!(header.target_len(0).unwrap(), 16);
3248    }
3249
3250    /// Tests edge case: minimum contig size (1 bp)
3251    #[test]
3252    fn edge_case_minimum_contig_size() {
3253        let config_json = r#"{
3254            "contigs": {
3255                "number": 1,
3256                "len_range": [1, 1]
3257            },
3258            "reads": [{
3259                "number": 10,
3260                "mapq_range": [10, 20],
3261                "base_qual_range": [20, 30],
3262                "len_range": [1.0, 1.0]
3263            }]
3264        }"#;
3265
3266        let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
3267        let sim = TempBamSimulation::new(config).unwrap();
3268        let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
3269
3270        // Verify 1bp contig works
3271        let header = reader.header();
3272        assert_eq!(header.target_count(), 1);
3273        assert_eq!(header.target_len(0).unwrap(), 1);
3274
3275        // Some reads should exist (though some may be unmapped)
3276        assert!(reader.records().count() > 0);
3277    }
3278
3279    /// Tests contig generation
3280    #[test]
3281    fn generate_contigs_denovo_works() {
3282        let config = ContigConfigBuilder::default()
3283            .number(5)
3284            .len_range((100, 200))
3285            .build()
3286            .unwrap();
3287        let contigs = generate_contigs_denovo(config.number, config.len_range, &mut rand::rng());
3288        assert_eq!(contigs.len(), 5);
3289        for (i, contig) in contigs.iter().enumerate() {
3290            assert_eq!(contig.name, format!("contig_0000{i}"));
3291            assert!((100..=200).contains(&contig.get_dna_restrictive().get().len()));
3292            for base in contig.get_dna_restrictive().get() {
3293                assert!([b'A', b'C', b'G', b'T'].contains(base));
3294            }
3295        }
3296    }
3297
3298    /// Tests contig generation with repeated sequence
3299    #[test]
3300    fn generate_contigs_denovo_repeated_seq_works() {
3301        let seq = DNARestrictive::from_str("ACGT").unwrap();
3302        let contigs = generate_contigs_denovo_repeated_seq(
3303            NonZeroU32::new(10000).unwrap(),
3304            OrdPair::new(NonZeroU64::new(10).unwrap(), NonZeroU64::new(12).unwrap()).unwrap(),
3305            &seq,
3306            &mut rand::rng(),
3307        );
3308
3309        let mut counts = [0, 0, 0];
3310
3311        assert_eq!(contigs.len(), 10000);
3312        for (i, contig) in contigs.iter().enumerate() {
3313            assert_eq!(contig.name, format!("contig_{i:05}"));
3314            let idx = match contig.get_dna_restrictive().get() {
3315                b"ACGTACGTAC" => 0,
3316                b"ACGTACGTACG" => 1,
3317                b"ACGTACGTACGT" => 2,
3318                _ => unreachable!(),
3319            };
3320            *counts
3321                .get_mut(idx)
3322                .expect("idx is 0, 1, or 2; counts has 3 elements") += 1;
3323        }
3324
3325        // 3000/10000 is quite lax actually - we expect ~3333 each
3326        for count in counts {
3327            assert!(count >= 3000);
3328        }
3329    }
3330}
3331
3332/// Tests for `PerfectSeqMatchToNot` methods
3333#[cfg(test)]
3334mod perfect_seq_match_to_not_tests {
3335    use super::*;
3336
3337    #[test]
3338    fn seq_with_valid_sequence() -> Result<(), Error> {
3339        let _builder = PerfectSeqMatchToNot::seq(b"ACGT".to_vec())?;
3340        Ok(())
3341    }
3342
3343    #[test]
3344    fn seq_with_empty_sequence_returns_error() {
3345        let result = PerfectSeqMatchToNot::seq(b"".to_vec());
3346        assert!(matches!(result, Err(Error::InvalidState(_))));
3347    }
3348
3349    #[test]
3350    fn build_without_barcode_forward_read() -> Result<(), Error> {
3351        let mut rng = rand::rng();
3352        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3353            .build(ReadState::PrimaryFwd, &mut rng)?;
3354        assert_eq!(seq, b"GGGGGGGG");
3355        assert_eq!(cigar.expect("no error").to_string(), "8M");
3356        Ok(())
3357    }
3358
3359    #[test]
3360    fn build_without_barcode_reverse_read() -> Result<(), Error> {
3361        let mut rng = rand::rng();
3362        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3363            .build(ReadState::PrimaryRev, &mut rng)?;
3364        assert_eq!(seq, b"GGGGGGGG");
3365        assert_eq!(cigar.expect("no error").to_string(), "8M");
3366        Ok(())
3367    }
3368
3369    #[test]
3370    fn build_with_barcode_forward_read() -> Result<(), Error> {
3371        let mut rng = rand::rng();
3372        let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
3373        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3374            .barcode(barcode)
3375            .build(ReadState::PrimaryFwd, &mut rng)?;
3376        assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
3377        assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
3378        Ok(())
3379    }
3380
3381    #[test]
3382    fn build_with_barcode_reverse_read() -> Result<(), Error> {
3383        let mut rng = rand::rng();
3384        let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
3385        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3386            .barcode(barcode)
3387            .build(ReadState::PrimaryRev, &mut rng)?;
3388        assert_eq!(seq, b"TGCATTGGGGGGGGAATGCA");
3389        assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
3390        Ok(())
3391    }
3392
3393    #[test]
3394    fn build_unmapped_read_returns_none_cigar() -> Result<(), Error> {
3395        let mut rng = rand::rng();
3396        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3397            .build(ReadState::Unmapped, &mut rng)?;
3398        assert_eq!(seq, b"GGGGGGGG");
3399        assert!(cigar.is_none());
3400        Ok(())
3401    }
3402
3403    #[test]
3404    fn build_unmapped_read_with_barcode_returns_none_cigar() -> Result<(), Error> {
3405        let mut rng = rand::rng();
3406        let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
3407        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3408            .barcode(barcode)
3409            .build(ReadState::Unmapped, &mut rng)?;
3410        assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
3411        assert!(cigar.is_none());
3412        Ok(())
3413    }
3414
3415    #[test]
3416    fn build_with_delete() -> Result<(), Error> {
3417        let mut rng = rand::rng();
3418        let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.75)?)?;
3419        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAATTTTCCCCGGGG".to_vec())?
3420            .delete(delete_range)
3421            .build(ReadState::PrimaryFwd, &mut rng)?;
3422
3423        // Original: AAAATTTTCCCCGGGG (16 bases)
3424        // Delete 50%-75%: positions 8-12, deletes CCCC (4 bases)
3425        // Result: AAAATTTTGGGG (12 bases)
3426        assert_eq!(seq, b"AAAATTTTGGGG");
3427        assert_eq!(cigar.expect("no error").to_string(), "8M4D4M");
3428        Ok(())
3429    }
3430
3431    #[test]
3432    fn build_with_insert_middle() -> Result<(), Error> {
3433        let mut rng = rand::rng();
3434        let insert_seq = DNARestrictive::from_str("TTTT").unwrap();
3435        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAGGGG".to_vec())?
3436            .insert_middle(insert_seq)
3437            .build(ReadState::PrimaryFwd, &mut rng)?;
3438
3439        // Original: AAAAGGGG (8 bases)
3440        // Middle is at position 4
3441        // Insert TTTT at position 4
3442        // Result: AAAATTTTGGGG (12 bases)
3443        assert_eq!(seq, b"AAAATTTTGGGG");
3444        assert_eq!(cigar.expect("no error").to_string(), "4M4I4M");
3445        Ok(())
3446    }
3447
3448    #[test]
3449    fn build_with_mismatch() -> Result<(), Error> {
3450        let mut rng = rand::rng();
3451        let mismatch_frac = F32Bw0and1::try_from(0.5)?;
3452        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
3453            .mismatch(mismatch_frac)
3454            .build(ReadState::PrimaryFwd, &mut rng)?;
3455
3456        // Original: AAAAAAAA (8 bases)
3457        // 50% mismatch = 4 bases should be changed
3458        // Count how many bases are NOT 'A'
3459        let mismatch_count = seq.iter().filter(|&&b| b != b'A').count();
3460        assert_eq!(mismatch_count, 4);
3461
3462        // CIGAR should still be 8M (mismatches don't change CIGAR)
3463        assert_eq!(cigar.expect("no error").to_string(), "8M");
3464        Ok(())
3465    }
3466
3467    #[test]
3468    fn build_with_delete_and_insert() -> Result<(), Error> {
3469        let mut rng = rand::rng();
3470        let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.75)?)?;
3471        let insert_seq = DNARestrictive::from_str("TT").unwrap();
3472        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAACCCCGGGG".to_vec())?
3473            .delete(delete_range)
3474            .insert_middle(insert_seq)
3475            .build(ReadState::PrimaryFwd, &mut rng)?;
3476
3477        // Original: AAAACCCCGGGG (12 bases)
3478        // Mark positions 6-8 for deletion (CCG)
3479        // Insert TT at position 6 (middle of 12)
3480        // Final operations: 6M (AAAACC) + 2I (TT) + 3D (CCG) + 3M (GGG)
3481        // Result: AAAACCTTGGG (11 bases)
3482        assert_eq!(seq, b"AAAACCTTGGG");
3483        assert_eq!(cigar.expect("no error").to_string(), "6M2I3D3M");
3484        Ok(())
3485    }
3486
3487    #[test]
3488    fn build_with_all_features() -> Result<(), Error> {
3489        let mut rng = rand::rng();
3490        let delete_range = OrdPair::new(F32Bw0and1::try_from(0.25)?, F32Bw0and1::try_from(0.5)?)?;
3491        let insert_seq = DNARestrictive::from_str("CC").unwrap();
3492        let mismatch_frac = F32Bw0and1::try_from(0.25)?;
3493        let barcode = DNARestrictive::from_str("AAA").unwrap();
3494
3495        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGGGGGGGGGG".to_vec())?
3496            .delete(delete_range)
3497            .insert_middle(insert_seq)
3498            .mismatch(mismatch_frac)
3499            .barcode(barcode)
3500            .build(ReadState::PrimaryFwd, &mut rng)?;
3501
3502        // Original: GGGGGGGGGGGGGGGG (16 bases)
3503        // Mismatch 25% = 4 bases changed (but to C, T, or A)
3504        // Delete 25%-50%: positions 4-8, deletes GGGG (4 bases) -> 12 bases
3505        // Insert CC at middle (position 6) -> 14 bases
3506        // Add barcode AAA to both ends -> 20 bases total
3507        assert_eq!(seq.len(), 20);
3508
3509        // CIGAR should have SoftClips, deletion, insertion, and matches
3510        let cigar_str = cigar.expect("no error").to_string();
3511        assert!(cigar_str.starts_with("3S")); // Barcode at start
3512        assert!(cigar_str.ends_with("3S")); // Barcode at end
3513        assert!(cigar_str.contains('D')); // Deletion
3514        assert!(cigar_str.contains('I')); // Insertion
3515        Ok(())
3516    }
3517
3518    #[test]
3519    fn build_with_delete_zero_length() -> Result<(), Error> {
3520        let mut rng = rand::rng();
3521        // Delete nothing (start == end after rounding)
3522        let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.5)?)?;
3523        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
3524            .delete(delete_range)
3525            .build(ReadState::PrimaryFwd, &mut rng)?;
3526
3527        // No deletion should occur
3528        assert_eq!(seq, b"AAAAAAAA");
3529        assert_eq!(cigar.expect("no error").to_string(), "8M");
3530        Ok(())
3531    }
3532
3533    #[test]
3534    fn build_with_mismatch_zero_fraction() -> Result<(), Error> {
3535        let mut rng = rand::rng();
3536        let mismatch_frac = F32Bw0and1::try_from(0.0)?;
3537        let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
3538            .mismatch(mismatch_frac)
3539            .build(ReadState::PrimaryFwd, &mut rng)?;
3540
3541        // No mismatches should occur
3542        assert_eq!(seq, b"AAAAAAAA");
3543        assert_eq!(cigar.expect("no error").to_string(), "8M");
3544        Ok(())
3545    }
3546
3547    #[test]
3548    #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3549    fn build_with_delete_whole_length() {
3550        let mut rng = rand::rng();
3551        let delete_range = OrdPair::new(F32Bw0and1::zero(), F32Bw0and1::one()).unwrap();
3552        let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())
3553            .unwrap()
3554            .delete(delete_range)
3555            .build(ReadState::PrimaryFwd, &mut rng)
3556            .unwrap();
3557    }
3558
3559    #[test]
3560    #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3561    fn build_with_insert_almost_whole_length() {
3562        let mut rng = rand::rng();
3563        let insert_seq = DNARestrictive::from_str("AATT").unwrap();
3564        let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"A".to_vec())
3565            .unwrap()
3566            .insert_middle(insert_seq)
3567            .build(ReadState::PrimaryFwd, &mut rng)
3568            .unwrap();
3569    }
3570
3571    #[test]
3572    #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3573    fn build_with_delete_whole_length_with_barcode() {
3574        let mut rng = rand::rng();
3575        let delete_range = OrdPair::new(F32Bw0and1::zero(), F32Bw0and1::one()).unwrap();
3576        let barcode = DNARestrictive::from_str("CGCG").unwrap();
3577        let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())
3578            .unwrap()
3579            .delete(delete_range)
3580            .barcode(barcode)
3581            .build(ReadState::PrimaryFwd, &mut rng)
3582            .unwrap();
3583    }
3584
3585    #[test]
3586    #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3587    fn build_with_insert_almost_whole_length_with_barcode() {
3588        let mut rng = rand::rng();
3589        let insert_seq = DNARestrictive::from_str("AATT").unwrap();
3590        let barcode = DNARestrictive::from_str("CGCG").unwrap();
3591        let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"A".to_vec())
3592            .unwrap()
3593            .insert_middle(insert_seq)
3594            .barcode(barcode)
3595            .build(ReadState::PrimaryFwd, &mut rng)
3596            .unwrap();
3597    }
3598}