nanalogue 0.1.9

//! Implements `CurrRead` Struct for processing information
//! and the mod information in the BAM file using a parser implemented in
//! another module.

use crate::{
    AllowedAGCTN, Contains as _, Error, F32Bw0and1, FilterByRefCoords, InputModOptions,
    InputRegionOptions, InputWindowing, ModChar, ReadState, ThresholdState, nanalogue_mm_ml_parser,
};
use bedrs::prelude::{Intersect as _, StrandedBed3};
use bedrs::{Bed3, Coordinates as _, Strand};
use bio_types::genome::AbstractInterval as _;
use derive_builder::Builder;
use fibertools_rs::utils::{
    bamannotations::{FiberAnnotation, Ranges},
    basemods::{BaseMod, BaseMods},
};
use polars::{df, prelude::DataFrame};
use rust_htslib::{bam::ext::BamRecordExtensions as _, bam::record::Record};
use serde::{Deserialize, Serialize};
use std::{
    cmp::Ordering,
    collections::{HashMap, HashSet},
    fmt::{self, Write as _},
    ops::Range,
    rc::Rc,
};

/// Shows [`CurrRead`] has no data, a state-type parameter
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct NoData;

/// Shows [`CurrRead`] has only alignment data, a state-type parameter
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct OnlyAlignData;

/// Shows [`CurrRead`] has only alignment data but with all fields filled, a state-type parameter
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct OnlyAlignDataComplete;

/// Shows [`CurrRead`] has alignment and modification data, a state-type parameter
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct AlignAndModData;

/// Dummy trait, associated with the state-type pattern in [`CurrRead`]
pub trait CurrReadState {}

impl CurrReadState for NoData {}
impl CurrReadState for OnlyAlignData {}
impl CurrReadState for OnlyAlignDataComplete {}
impl CurrReadState for AlignAndModData {}

/// Another dummy trait, associated with the state-type pattern in [`CurrRead`]
pub trait CurrReadStateWithAlign {}

impl CurrReadStateWithAlign for OnlyAlignData {}
impl CurrReadStateWithAlign for OnlyAlignDataComplete {}
impl CurrReadStateWithAlign for AlignAndModData {}

/// Another dummy trait, associated with the state-type pattern in [`CurrRead`]
pub trait CurrReadStateOnlyAlign {}

impl CurrReadStateOnlyAlign for OnlyAlignData {}
impl CurrReadStateOnlyAlign for OnlyAlignDataComplete {}

/// Our main struct that receives and stores from one BAM record.
/// Also has methods for processing this information.
/// Also see [`CurrReadBuilder`] for a way to build this without BAM records.
///
/// We call this `CurrRead` as in 'current read'. `Read` is used
/// within `rust-htslib`, so we don't want to create another `Read` struct.
///
/// The information within the struct is hard to access without
/// the methods defined here. This is to ensure the struct
/// doesn't fall into an invalid state, which could cause mistakes
/// in calculations associated with the struct. For example:
/// if I want to measure mean modification density along windows
/// of the raw modification data, I need a guarantee that the
/// modification data is sorted by position. We can guarantee
/// this when the modification data is parsed, but we cannot
/// guarantee this if we allow free access to the struct.
/// NOTE: we could have implemented these as a trait extension
/// to the `rust_htslib` `Record` struct, but we have chosen not to,
/// as we may want to persist data like modifications and do
/// multiple operations on them. And `Record` has inconvenient
/// return types like i64 instead of u64 for positions along the
/// genome.
#[derive(Debug, Clone, PartialEq)]
pub struct CurrRead<S: CurrReadState> {
    /// Stores alignment type.
    state: ReadState,

    /// Read ID of molecule, also called query name in some contexts.
    read_id: String,

    /// Length of the stored sequence in a BAM file. This is usually the basecalled sequence.
    seq_len: Option<u64>,

    /// Length of the segment on the reference genome the molecule maps to.
    align_len: Option<u64>,

    /// Stores modification information along with any applied thresholds.
    mods: (BaseMods, ThresholdState),

    /// ID of the reference genome contig and the starting position on
    /// contig that the molecule maps or aligns to.
    /// NOTE: the contig here is numeric and refers to an index on the BAM
    /// header. To convert this into an alphanumeric string, you have to
    /// process the header and store it in `contig_name` below.
    /// We have left it this way as it is easier to store and process integers.
    contig_id_and_start: Option<(i32, u64)>,

    /// Contig name.
    contig_name: Option<String>,

    /// Base PHRED-quality threshold (no offset). Mods could have been filtered by this.
    mod_base_qual_thres: u8,

    /// `PhantomData` marker for compiler's sake
    marker: std::marker::PhantomData<S>,
}

/// Implements defaults for `CurrRead`
impl Default for CurrRead<NoData> {
    fn default() -> Self {
        CurrRead::<NoData> {
            state: ReadState::PrimaryFwd,
            read_id: String::new(),
            seq_len: None,
            align_len: None,
            mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
            contig_id_and_start: None,
            contig_name: None,
            mod_base_qual_thres: 0,
            marker: std::marker::PhantomData::<NoData>,
        }
    }
}

impl CurrRead<NoData> {
    /// sets the alignment of the read and read ID using BAM record
    ///
    /// # Errors
    /// While we support normal BAM reads from `ONT`, `PacBio` etc. that contain modifications,
    /// we do not support some BAM flags like paired, duplicate, quality check failed etc.
    /// This is because of our design choices e.g. if mods are called on paired reads,
    /// then we'll have to include both records as one read in our statistics
    /// and we do not have functionality in place to do this.
    /// So, we return errors if such flags or an invalid combination of flags (e.g.
    /// secondary and supplementary bits are set) are encountered.
    /// We also return error upon invalid read id but this is expected to be in violation
    /// of the BAM format (UTF-8 error).
    ///
    /// ```
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader, ReadState};
    /// use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?;
    ///     match count {
    ///         0 => assert_eq!(curr_read.read_state(), ReadState::PrimaryFwd),
    ///         1 => assert_eq!(curr_read.read_state(), ReadState::PrimaryFwd),
    ///         2 => assert_eq!(curr_read.read_state(), ReadState::PrimaryRev),
    ///         3 => assert_eq!(curr_read.read_state(), ReadState::Unmapped),
    ///         _ => unreachable!(),
    ///     }
    ///     count = count + 1;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    pub fn set_read_state_and_id(self, record: &Record) -> Result<CurrRead<OnlyAlignData>, Error> {
        // extract read id
        let read_id = match str::from_utf8(record.qname()) {
            Ok(v) => v.to_string(),
            Err(e) => {
                return Err(Error::InvalidReadID(format!(
                    "error in setting read id, which possibly violates BAM requirements: {e}"
                )));
            }
        };
        // check for unsupported flags
        if record.is_paired()
            || record.is_proper_pair()
            || record.is_first_in_template()
            || record.is_last_in_template()
            || record.is_mate_reverse()
            || record.is_mate_unmapped()
            || record.is_duplicate()
            || record.is_quality_check_failed()
        {
            return Err(Error::NotImplemented(format!(
                "paired-read/mate-read/duplicate/qual-check-failed flags not supported! read_id: {read_id}"
            )));
        }
        // set read state
        let state = match (
            record.is_reverse(),
            record.is_unmapped(),
            record.is_secondary(),
            record.is_supplementary(),
        ) {
            (false, true, false, false) => ReadState::Unmapped,
            (true, false, false, false) => ReadState::PrimaryRev,
            (true, false, true, false) => ReadState::SecondaryRev,
            (false, false, true, false) => ReadState::SecondaryFwd,
            (true, false, false, true) => ReadState::SupplementaryRev,
            (false, false, false, true) => ReadState::SupplementaryFwd,
            (false, false, false, false) => ReadState::PrimaryFwd,
            _ => {
                return Err(Error::UnknownAlignState(format!(
                    "invalid flag combination! read_id: {read_id}",
                )));
            }
        };
        Ok(CurrRead::<OnlyAlignData> {
            state,
            read_id,
            seq_len: None,
            align_len: None,
            mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
            contig_id_and_start: None,
            contig_name: None,
            mod_base_qual_thres: 0,
            marker: std::marker::PhantomData::<OnlyAlignData>,
        })
    }

    /// Runs [`CurrRead<NoData>::try_from_only_alignment_seq_len_optional`], forcing sequence
    /// length retrieval. See comments there and the comments below.
    ///
    /// Some BAM records have zero-length sequence fields i.e. marked by a '*'.
    /// This may be intentional e.g. a secondary alignment has the same sequence
    /// as a corresponding primary alignment and repeating a sequence is not space-efficient.
    /// Or it may be intentional due to other reasons.
    /// For modification processing, we cannot deal with these records as we have to match
    /// sequences across different records.
    /// So, our easy-to-use-function here forces sequence length retrieval and will fail
    /// if zero length sequences are found.
    ///
    /// Another function below [`CurrRead<NoData>::try_from_only_alignment_zero_seq_len`], allows
    /// zero length sequences through and can be used if zero length sequences are really
    /// needed. In such a scenario, the user has to carefully watch for errors.
    /// So we discourage its use unless really necessary.
    ///
    /// # Errors
    pub fn try_from_only_alignment(
        self,
        record: &Record,
    ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
        self.try_from_only_alignment_seq_len_optional(record, true)
    }

    /// Runs [`CurrRead<NoData>::try_from_only_alignment_seq_len_optional`], avoiding sequence
    /// length retrieval and setting it to zero. See comments there and the comments below.
    ///
    /// See notes on [`CurrRead<NoData>::try_from_only_alignment`].
    /// Use of this function is discouraged unless really necessary as we cannot parse
    /// modification information from zero-length sequences without errors.
    ///
    /// # Errors
    pub fn try_from_only_alignment_zero_seq_len(
        self,
        record: &Record,
    ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
        self.try_from_only_alignment_seq_len_optional(record, false)
    }

    /// Uses only alignment information and no modification information to
    /// create the struct. Use this if you want to perform operations that
    /// do not involve reading or manipulating the modification data.
    /// If `is_seq_len_non_zero` is set to false, then sequence length is
    /// not retrieved and is set to zero.
    ///
    /// # Errors
    /// Upon failure in retrieving record information.
    pub fn try_from_only_alignment_seq_len_optional(
        self,
        record: &Record,
        is_seq_len_non_zero: bool,
    ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
        let mut curr_read_state = CurrRead::default().set_read_state_and_id(record)?;
        if !curr_read_state.read_state().is_unmapped() {
            curr_read_state = curr_read_state
                .set_align_len(record)?
                .set_contig_id_and_start(record)?
                .set_contig_name(record)?;
        }
        if is_seq_len_non_zero {
            curr_read_state = curr_read_state.set_seq_len(record)?;
        } else {
            curr_read_state.seq_len = Some(0u64);
        }
        let CurrRead::<OnlyAlignData> {
            state,
            read_id,
            seq_len,
            align_len,
            contig_id_and_start,
            contig_name,
            ..
        } = curr_read_state;
        Ok(CurrRead::<OnlyAlignDataComplete> {
            state,
            read_id,
            seq_len,
            align_len,
            mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
            contig_id_and_start,
            contig_name,
            mod_base_qual_thres: 0,
            marker: std::marker::PhantomData::<OnlyAlignDataComplete>,
        })
    }
}

impl<S: CurrReadStateWithAlign + CurrReadState> CurrRead<S> {
    /// gets the read state
    #[must_use]
    pub fn read_state(&self) -> ReadState {
        self.state
    }
    /// set length of sequence from BAM record
    ///
    /// # Errors
    /// Errors are returned if sequence length is already set or
    /// sequence length is not non-zero.
    ///
    /// ```
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?.set_seq_len(&r)?;
    ///     let Ok(len) = curr_read.seq_len() else { unreachable!() };
    ///     match count {
    ///         0 => assert_eq!(len, 8),
    ///         1 => assert_eq!(len, 48),
    ///         2 => assert_eq!(len, 33),
    ///         3 => assert_eq!(len, 48),
    ///         _ => unreachable!(),
    ///     }
    ///     count = count + 1;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    ///
    /// If we call the method twice, we should hit a panic
    /// ```should_panic
    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// # use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?
    ///         .set_seq_len(&r)?.set_seq_len(&r)?;
    ///     break;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    pub fn set_seq_len(mut self, record: &Record) -> Result<Self, Error> {
        self.seq_len = match self.seq_len {
            Some(_) => {
                return Err(Error::InvalidDuplicates(format!(
                    "cannot set sequence length again! read_id: {}",
                    self.read_id()
                )));
            }
            None => match record.seq_len() {
                0 => {
                    return Err(Error::ZeroSeqLen(format!(
                        "avoid including 0-len sequences while parsing mod data in this program, read_id: {}",
                        self.read_id()
                    )));
                }
                l => Some(u64::try_from(l)?),
            },
        };
        Ok(self)
    }
    /// gets length of sequence
    ///
    /// # Errors
    /// Error if sequence length is not set
    pub fn seq_len(&self) -> Result<u64, Error> {
        self.seq_len.ok_or(Error::UnavailableData(format!(
            "seq len not available, read_id: {}",
            self.read_id()
        )))
    }
    /// set alignment length from BAM record if available
    ///
    /// # Errors
    /// Returns errors if alignment len is already set, instance is
    /// unmapped, or if alignment coordinates are malformed
    /// (e.g. end < start).
    pub fn set_align_len(mut self, record: &Record) -> Result<Self, Error> {
        self.align_len = match self.align_len {
            Some(_) => Err(Error::InvalidDuplicates(format!(
                "cannot set alignment length again! read_id: {}",
                self.read_id()
            ))),
            None => {
                if self.read_state().is_unmapped() {
                    Err(Error::Unmapped(format!(
                        "cannot set alignment properties for unmapped reads, read_id: {}",
                        self.read_id()
                    )))
                } else {
                    // NOTE: right now, I don't know of a way to test the error below
                    // as rust htslib initializes an empty record with an alignment
                    // length of 1 (see the code below). This is only a note about
                    // the error variant, not the normal function of this code block
                    // which is fine.
                    // ```
                    // use rust_htslib::bam::ext::BamRecordExtensions;
                    // let r = Record::new();
                    // assert_eq!(r.seq_len(), 0);
                    // assert_eq!(r.pos(), 0);
                    // assert_eq!(r.reference_end(), 1);
                    // ```
                    let st = record.pos();
                    let en = record.reference_end();
                    if en > st && st >= 0 {
                        #[expect(
                            clippy::arithmetic_side_effects,
                            reason = "en > st && st >= 0 guarantee no i64 overflows"
                        )]
                        #[expect(
                            clippy::missing_panics_doc,
                            reason = "en > st && st >= 0 guarantee no panic"
                        )]
                        Ok(Some(
                            (en - st)
                                .try_into()
                                .expect("en>st && st>=0 guarantee no problems i64->u64"),
                        ))
                    } else {
                        Err(Error::InvalidAlignLength(format!(
                            "in `set_align_len`, start: {st}, end: {en} invalid! i.e. en <= st or st < 0, read_id: {}",
                            self.read_id()
                        )))
                    }
                }
            }
        }?;
        Ok(self)
    }
    /// gets alignment length
    ///
    /// # Errors
    /// if instance is unmapped or alignment length is not set
    pub fn align_len(&self) -> Result<u64, Error> {
        if self.read_state().is_unmapped() {
            Err(Error::Unmapped(format!(
                "request alignment data on unmapped, read_id: {}",
                self.read_id()
            )))
        } else {
            self.align_len.ok_or(Error::UnavailableData(format!(
                "alignment data not available, read_id: {}",
                self.read_id()
            )))
        }
    }
    /// sets contig ID and start from BAM record if available
    ///
    /// # Errors
    /// if instance is unmapped, if these data are already set and
    /// the user is trying to set them again, or if coordinates
    /// are malformed (start position is negative)
    ///
    /// ```
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let curr_read =
    ///         CurrRead::default().set_read_state_and_id(&r)?.set_contig_id_and_start(&r)?;
    ///     match (count, curr_read.contig_id_and_start()) {
    ///         (0, Ok((0, 9))) |
    ///         (1, Ok((2, 23))) |
    ///         (2, Ok((1, 3))) => {},
    ///         _ => unreachable!(),
    ///     }
    ///     count = count + 1;
    ///     if count == 3 { break; } // the fourth entry is unmapped, and will lead to an error.
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    ///
    /// If we call the method on an unmapped read, we should see an error.
    /// ```should_panic
    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// # use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     if count < 3 {
    ///         count = count + 1;
    ///         continue;
    ///     }
    ///     // the fourth read is unmapped
    ///     let curr_read =
    ///         CurrRead::default().set_read_state_and_id(&r)?.set_contig_id_and_start(&r)?;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    ///
    /// If we call the method twice, we should hit a panic
    /// ```should_panic
    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// # use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?
    ///         .set_contig_id_and_start(&r)?.set_contig_id_and_start(&r)?;
    ///     break;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    pub fn set_contig_id_and_start(mut self, record: &Record) -> Result<Self, Error> {
        self.contig_id_and_start = match self.contig_id_and_start {
            Some(_) => Err(Error::InvalidDuplicates(format!(
                "cannot set contig and start again! read_id: {}",
                self.read_id()
            ))),
            None => {
                if self.read_state().is_unmapped() {
                    Err(Error::Unmapped(format!(
                        "setting alignment data on unmapped read, read_id: {}",
                        self.read_id()
                    )))
                } else {
                    Ok(Some((record.tid(), record.pos().try_into()?)))
                }
            }
        }?;
        Ok(self)
    }
    /// gets contig ID and start
    ///
    /// # Errors
    /// If instance is unmapped or if data (contig id and start) are not set
    pub fn contig_id_and_start(&self) -> Result<(i32, u64), Error> {
        if self.read_state().is_unmapped() {
            Err(Error::Unmapped(format!(
                "requesting alignment data on unmapped read, read_id: {}",
                self.read_id()
            )))
        } else {
            self.contig_id_and_start
                .ok_or(Error::UnavailableData(format!(
                    "contig id, start not available, read_id: {}",
                    self.read_id()
                )))
        }
    }
    /// sets contig name
    ///
    /// # Errors
    /// Returns error if instance is unmapped or contig name has already been set
    ///
    /// ```
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?.set_contig_name(&r)?;
    ///     let Ok(contig_name) = curr_read.contig_name() else {unreachable!()};
    ///     match (count, contig_name) {
    ///         (0, "dummyI") |
    ///         (1, "dummyIII") |
    ///         (2, "dummyII") => {},
    ///         _ => unreachable!(),
    ///     }
    ///     count = count + 1;
    ///     if count == 3 { break; } // the fourth entry is unmapped, and will lead to an error.
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    ///
    /// If we try to set contig name on an unmapped read, we will get an error
    ///
    /// ```should_panic
    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// # use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     if count < 3 {
    ///         count = count + 1;
    ///         continue;
    ///     }
    ///     let r = record?;
    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?;
    ///     curr_read.set_contig_name(&r)?;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    ///
    /// If we call the method twice, we should hit a panic
    /// ```should_panic
    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// # use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?
    ///         .set_contig_name(&r)?.set_contig_name(&r)?;
    ///     break;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    pub fn set_contig_name(mut self, record: &Record) -> Result<Self, Error> {
        self.contig_name = match (self.read_state().is_unmapped(), self.contig_name.is_none()) {
            (true, _) => Err(Error::Unmapped(format!(
                "set align data on unmapped read! read_id: {}",
                self.read_id()
            ))),
            (_, false) => Err(Error::InvalidDuplicates(format!(
                "cannot set contig name again! read_id: {}",
                self.read_id()
            ))),
            (_, true) => Ok(Some(String::from(record.contig()))),
        }?;
        Ok(self)
    }
    /// gets contig name
    ///
    /// # Errors
    /// If instance is unmapped or contig name has not been set
    pub fn contig_name(&self) -> Result<&str, Error> {
        match (self.read_state().is_unmapped(), self.contig_name.as_ref()) {
            (true, _) => Err(Error::Unmapped(format!(
                "get align data on unmapped read, read_id: {}",
                self.read_id()
            ))),
            (_, None) => Err(Error::UnavailableData(format!(
                "contig name unavailable, read_id: {}",
                self.read_id()
            ))),
            (_, Some(v)) => Ok(v.as_str()),
        }
    }
    /// gets read id
    #[must_use]
    pub fn read_id(&self) -> &str {
        self.read_id.as_str()
    }
    /// Returns the character corresponding to the strand
    ///
    /// ```
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?;
    ///     let strand = curr_read.strand() else { unreachable!() };
    ///     match (count, strand) {
    ///         (0, '+') | (1, '+') | (2, '-') | (3, '.') => {},
    ///         _ => unreachable!(),
    ///     }
    ///     count = count + 1;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    #[must_use]
    pub fn strand(&self) -> char {
        self.read_state().strand()
    }
    /// Returns read sequence overlapping with a genomic region
    ///
    /// # Errors
    /// If getting sequence coordinates from reference coordinates fails, see
    /// [`CurrRead::seq_and_qual_on_ref_coords`]
    ///
    /// # Example
    ///
    /// ```
    /// use bedrs::Bed3;
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    ///
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records() {
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
    ///
    ///     // Skip unmapped reads
    ///     if !curr_read.read_state().is_unmapped() {
    ///         let (contig_id, start) = curr_read.contig_id_and_start()?;
    ///         let align_len = curr_read.align_len()?;
    ///
    ///         // Create a region that overlaps with the read but is short of one bp.
    ///         // Note that this BAM file has reads with all bases matching perfectly
    ///         // with the reference.
    ///         let region = Bed3::new(contig_id, start, start + align_len - 1);
    ///         let seq_subset = curr_read.seq_on_ref_coords(&r, &region)?;
    ///
    ///         // Check for sequence length match
    ///         assert_eq!(curr_read.seq_len()? - 1, u64::try_from(seq_subset.len())?);
    ///
    ///         // Create a region with no overlap at all and check we get no data
    ///         let region = Bed3::new(contig_id, start + align_len, start + align_len + 2);
    ///         match curr_read.seq_on_ref_coords(&r, &region){
    ///             Err(Error::UnavailableData(_)) => (),
    ///             _ => unreachable!(),
    ///         };
    ///
    ///     }
    ///     count += 1;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    pub fn seq_on_ref_coords(
        &self,
        record: &Record,
        region: &Bed3<i32, u64>,
    ) -> Result<Vec<u8>, Error> {
        Ok(self
            .seq_and_qual_on_ref_coords(record, region)?
            .into_iter()
            .map(|x| x.map_or(b'.', |y| y.1))
            .collect::<Vec<u8>>())
    }
    /// Returns match-or-mismatch, read sequence, base-quality values overlapping with genomic region.
    ///
    /// Returns a vector of Options:
    /// * is a `None` at a deletion i.e. a base on the reference and not on the read.
    /// * is a `Some(bool, u8, u8)` at a match/mismatch/insertion.
    ///   The first `u8` is the base itself and the `bool` is `true` if a match/mismatch
    ///   and `false` if an insertion, and the second `u8` is the base quality (0-93),
    ///   which the BAM format sets to 255 if no quality is available for the entire read.
    ///
    /// Because sequences are encoded using 4-bit values into a `[u8]`, we need to use
    /// `rust-htslib` functions to convert them into 8-bit values and then use
    /// the `Index<usize>` trait on sequences from `rust-htslib`.
    ///
    /// # Errors
    /// If the read does not intersect with the specified region, see
    /// [`CurrRead::seq_coords_from_ref_coords`]
    ///
    /// # Example
    ///
    /// Example 1
    ///
    /// ```
    /// use bedrs::Bed3;
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    ///
    /// let mut reader = nanalogue_bam_reader(&"examples/example_5_valid_basequal.sam")?;
    /// for record in reader.records() {
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
    ///
    ///     let region = Bed3::new(0, 0, 20);
    ///     let seq_subset = curr_read.seq_and_qual_on_ref_coords(&r, &region)?;
    ///     assert_eq!(seq_subset, [Some((true, b'T', 32)), Some((true, b'C', 0)),
    ///         Some((true, b'G', 69)), Some((true, b'T', 80)), Some((true, b'T', 79)),
    ///         Some((true, b'T', 81)), Some((true, b'C', 29)), Some((true, b'T', 30))]);
    ///
    ///     // Create a region with no overlap at all and check we get no data
    ///     let region = Bed3::new(0, 20, 22);
    ///     match curr_read.seq_and_qual_on_ref_coords(&r, &region){
    ///         Err(Error::UnavailableData(_)) => (),
    ///         _ => unreachable!(),
    ///     };
    ///
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    ///
    /// Example 2
    ///
    /// ```
    /// use bedrs::Bed3;
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    ///
    /// let mut reader = nanalogue_bam_reader(&"examples/example_7.sam")?;
    /// for record in reader.records() {
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
    ///
    ///     let region = Bed3::new(0, 0, 20);
    ///     let seq_subset = curr_read.seq_and_qual_on_ref_coords(&r, &region)?;
    ///     assert_eq!(seq_subset, [Some((true, b'T', 32)), None, None,
    ///         Some((false, b'A', 0)), Some((true, b'T', 0)), Some((true, b'T', 79)),
    ///         Some((true, b'T', 81)), Some((true, b'G', 29)), Some((true, b'T', 30))]);
    ///
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    #[expect(
        clippy::indexing_slicing,
        reason = "qual, seq same len and coords will not exceed these"
    )]
    #[expect(
        clippy::type_complexity,
        reason = "not complex enough, I think types will make this less clear"
    )]
    pub fn seq_and_qual_on_ref_coords(
        &self,
        record: &Record,
        region: &Bed3<i32, u64>,
    ) -> Result<Vec<Option<(bool, u8, u8)>>, Error> {
        let seq = record.seq();
        let qual = record.qual();
        Ok(self
            .seq_coords_from_ref_coords(record, region)?
            .into_iter()
            .map(|x| x.map(|y| (y.0, seq[y.1], qual[y.1])))
            .collect::<Vec<Option<(bool, u8, u8)>>>())
    }
    /// Extract sequence coordinates corresponding to a region on the reference genome.
    ///
    /// The vector we return contains `Some((bool, usize))` entries where both the reference and the read
    /// have bases, and `None` where bases from the reference are missing on the read:
    /// * matches or mismatches, we record the coordinate.
    ///   so SNPs for example (i.e. a 1 bp difference from the ref) will show up as `Some((true, _))`.
    /// * a deletion or a ref skip ("N" in cigar) will show up as `None`.
    /// * insertions are preserved i.e. bases in the middle of a read present
    ///   on the read but not on the reference are `Some((false, _))`
    /// * clipped bases at the end of the read are not preserved. These are bases
    ///   on the read but not on the reference and are denoted as soft or hard
    ///   clips on the CIGAR string e.g. barcodes from sequencing
    /// * some CIGAR combinations are illogical and we are assuming they do not happen
    ///   e.g. a read's CIGAR can end with, say 10D20S, this means last 10 bp
    ///   are in a deletion and the next 20 are a softclip. This is illogical,
    ///   as they must be combined into a 30-bp softclip i.e. 30S. So if the
    ///   aligner produces such illogical states, then the sequence coordinates reported
    ///   here may be erroneous.
    ///
    /// If the read does not intersect with the region, we return an `Error` (see below).
    /// If the read does intersect with the region but we cannot retrieve any bases,
    /// we return an empty vector (I am not sure if we will run into this scenario).
    ///
    /// # Errors
    /// If the read does not intersect with the specified region, or if `usize` conversions fail.
    ///
    /// # Example
    ///
    /// ```
    /// use bedrs::Bed3;
    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
    /// use rust_htslib::bam::Read;
    ///
    /// let mut reader = nanalogue_bam_reader(&"examples/example_7.sam")?;
    /// for record in reader.records() {
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
    ///
    ///     let region = Bed3::new(0, 9, 13);
    ///     let seq_subset = curr_read.seq_coords_from_ref_coords(&r, &region)?;
    ///     // there are deletions on the read above
    ///     assert_eq!(seq_subset, vec![Some((true, 0)), None, None, Some((false, 1)), Some((true, 2))]);
    ///
    ///     // Create a region with no overlap at all and check we get no data
    ///     let region = Bed3::new(0, 20, 22);
    ///     match curr_read.seq_coords_from_ref_coords(&r, &region){
    ///         Err(Error::UnavailableData(_)) => (),
    ///         _ => unreachable!(),
    ///     };
    ///
    /// }
    /// # Ok::<(), Error>(())
    pub fn seq_coords_from_ref_coords(
        &self,
        record: &Record,
        region: &Bed3<i32, u64>,
    ) -> Result<Vec<Option<(bool, usize)>>, Error> {
        #[expect(
            clippy::missing_panics_doc,
            reason = "genomic coordinates are far less than (2^64-1)/2 i.e. u64->i64 should be ok"
        )]
        let interval = {
            region
                .intersect(&StrandedBed3::<i32, u64>::try_from(self)?)
                .and_then(|v| {
                    let start = i64::try_from(v.start()).expect("genomic coordinates << 2^63");
                    let end = i64::try_from(v.end()).expect("genomic coordinates << 2^63");
                    (start < end).then_some(start..end)
                })
                .ok_or(Error::UnavailableData(
                    "coord-retrieval: region does not intersect with read".to_owned(),
                ))
        }?;

        // Initialize coord calculation.
        // We don't know how long the subset will be, we initialize with a guess
        // of 2 * interval size
        #[expect(
            clippy::arithmetic_side_effects,
            reason = "genomic coordinates far less than i64::MAX (approx (2^64-1)/2)"
        )]
        let mut s: Vec<Option<(bool, usize)>> =
            Vec::with_capacity(usize::try_from(2 * (interval.end - interval.start))?);

        // we may have to trim the sequence if we hit a bunch of unaligned base
        // pairs right at the end e.g. a softclip.
        let mut trim_end_bp: u64 = 0;

        for w in record
            .aligned_pairs_full()
            .skip_while(|x| x[1].is_none_or(|y| !interval.contains(&y)))
            .take_while(|x| x[1].is_none_or(|y| interval.contains(&y)))
        {
            #[expect(
                clippy::arithmetic_side_effects,
                reason = "coordinates far less than u64::MAX (2^64-1) so no chance of counter overflow"
            )]
            match w {
                [Some(x), Some(_)] => {
                    s.push(Some((true, usize::try_from(x)?)));
                    trim_end_bp = 0;
                }
                [Some(x), None] => {
                    s.push(Some((false, usize::try_from(x)?)));
                    trim_end_bp += 1;
                }
                [None, Some(_)] => {
                    s.push(None);
                    trim_end_bp = 0;
                }
                [None, None] => unreachable!(
                    "impossible to find bases that are neither on the read nor on the reference"
                ),
            }
        }

        // if last few bp in sequence are all unmapped, we remove them here.
        for _ in 0..trim_end_bp {
            let Some(_) = s.pop() else {
                unreachable!("trim_end_bp cannot exceed length of s")
            };
        }

        // Trim excess allocated capacity and return
        s.shrink_to(0);
        Ok(s)
    }
    /// sets modification data using the BAM record
    ///
    /// # Errors
    /// If tags in the BAM record containing the modification information (MM, ML)
    /// contain mistakes.
    pub fn set_mod_data(
        self,
        record: &Record,
        mod_thres: ThresholdState,
        min_qual: u8,
    ) -> Result<CurrRead<AlignAndModData>, Error> {
        let result = nanalogue_mm_ml_parser(
            record,
            |x| mod_thres.contains(x),
            |_| true,
            |_, _, _| true,
            min_qual,
        )?;
        Ok(CurrRead::<AlignAndModData> {
            state: self.state,
            read_id: self.read_id,
            seq_len: self.seq_len,
            align_len: self.align_len,
            mods: (result, mod_thres),
            contig_id_and_start: self.contig_id_and_start,
            contig_name: self.contig_name,
            mod_base_qual_thres: min_qual,
            marker: std::marker::PhantomData::<AlignAndModData>,
        })
    }
    /// sets modification data using BAM record but restricted to the specified filters
    ///
    /// # Errors
    /// If tags in the BAM record containing the modification information (MM, ML)
    /// contain mistakes.
    pub fn set_mod_data_restricted<G, H>(
        self,
        record: &Record,
        mod_thres: ThresholdState,
        mod_fwd_pos_filter: G,
        mod_filter_base_strand_tag: H,
        min_qual: u8,
    ) -> Result<CurrRead<AlignAndModData>, Error>
    where
        G: Fn(&usize) -> bool,
        H: Fn(&u8, &char, &ModChar) -> bool,
    {
        let result = nanalogue_mm_ml_parser(
            record,
            |x| mod_thres.contains(x),
            mod_fwd_pos_filter,
            mod_filter_base_strand_tag,
            min_qual,
        )?;
        Ok(CurrRead::<AlignAndModData> {
            state: self.state,
            read_id: self.read_id,
            seq_len: self.seq_len,
            align_len: self.align_len,
            mods: (result, mod_thres),
            contig_id_and_start: self.contig_id_and_start,
            contig_name: self.contig_name,
            mod_base_qual_thres: min_qual,
            marker: std::marker::PhantomData::<AlignAndModData>,
        })
    }
}

impl CurrRead<OnlyAlignDataComplete> {
    /// sets modification data using BAM record but with restrictions
    /// applied by the [`crate::InputMods`] options for example.
    ///
    /// # Errors
    /// If a region filter is specified and we fail to convert current instance to Bed,
    /// and if parsing the MM/ML BAM tags fails (presumably because they are malformed).
    #[expect(
        clippy::missing_panics_doc,
        reason = "integer conversions (u64->usize, u64->i64) are expected to not fail as \
genomic coordinates are far smaller than ~2^63"
    )]
    pub fn set_mod_data_restricted_options<S: InputModOptions + InputRegionOptions>(
        self,
        record: &Record,
        mod_options: &S,
    ) -> Result<CurrRead<AlignAndModData>, Error> {
        let l = usize::try_from(self.seq_len().expect("no error"))
            .expect("bit conversion errors unlikely");
        let w = mod_options.trim_read_ends_mod();
        let interval = if let Some(bed3) = mod_options.region_filter().as_ref() {
            let stranded_bed3 = StrandedBed3::<i32, u64>::try_from(&self)?;
            if let Some(v) = bed3.intersect(&stranded_bed3) {
                if v.start() == stranded_bed3.start() && v.end() == stranded_bed3.end() {
                    None
                } else {
                    Some(v.start()..v.end())
                }
            } else {
                Some(0..0)
            }
        } else {
            None
        };
        Ok({
            let mut read = self.set_mod_data_restricted(
                record,
                mod_options.mod_prob_filter(),
                |x| w == 0 || (w..l.checked_sub(w).unwrap_or_default()).contains(x),
                |_, &s, &t| {
                    mod_options.tag().is_none_or(|x| x == t)
                        && mod_options.mod_strand().is_none_or(|v| s == char::from(v))
                },
                mod_options.base_qual_filter_mod(),
            )?;
            if let Some(v) = interval {
                match v.start.cmp(&v.end) {
                    Ordering::Less => read.filter_by_ref_pos(
                        i64::try_from(v.start)
                            .expect("no error as genomic coordinates far less than ~2^63"),
                        i64::try_from(v.end)
                            .expect("no error as genomic coordinates far less than ~2^63"),
                    )?,
                    Ordering::Equal => read.filter_by_ref_pos(i64::MAX - 1, i64::MAX)?,
                    Ordering::Greater => {
                        unreachable!("`bedrs` should not allow malformed intervals!")
                    }
                }
            }
            read
        })
    }
}

impl CurrRead<AlignAndModData> {
    /// gets modification data
    #[must_use]
    pub fn mod_data(&self) -> &(BaseMods, ThresholdState) {
        &self.mods
    }
    /// window modification data with restrictions.
    /// If a read has the same modification on both the basecalled
    /// strand and its complement, then windows along both are returned.
    ///
    /// If `win_size` exceeds the modification data length, no windows are produced.
    ///
    /// # Errors
    /// Returns an error if the window function returns an error.
    #[expect(
        clippy::pattern_type_mismatch,
        reason = "suggested notation is verbose but I am not sure"
    )]
    pub fn windowed_mod_data_restricted<F>(
        &self,
        window_function: &F,
        win_options: InputWindowing,
        tag: ModChar,
    ) -> Result<Vec<F32Bw0and1>, Error>
    where
        F: Fn(&[u8]) -> Result<F32Bw0and1, Error>,
    {
        let win_size = win_options.win.get();
        let slide_size = win_options.step.get();
        let mut result = Vec::<F32Bw0and1>::new();
        let tag_char = tag.val();
        let (BaseMods { base_mods: v }, _) = &self.mods;

        // we make a few assumptions below:
        // * data is sorted by coordinate along sequence
        // * illegal types like strand not '+' or '-', or multiple entries
        //   corresponding to the same modification strand combination
        //   e.g. C+m occuring twice.
        // we control data flow into CurrRead, checking these do not happen
        // during ingress. To be future-proof etc., we should check these things
        // here but we do not as there is no way right now to test error checking
        // as there is no way to make CurrRead fall into these illegal states.
        #[expect(
            clippy::missing_panics_doc,
            reason = "checked_sub ensures win_size <= mod_data.len() before windowing"
        )]
        for k in v {
            match k {
                BaseMod {
                    modification_type: x,
                    ranges: track,
                    ..
                } if *x == tag_char => {
                    let mod_data = track.qual();
                    if let Some(l) = mod_data.len().checked_sub(win_size) {
                        result.extend(
                            (0..=l)
                                .step_by(slide_size)
                                .map(|i| {
                                    window_function(
                                        mod_data
                                            .get(i..)
                                            .expect("i <= len - win_size")
                                            .get(0..win_size)
                                            .expect("checked len>=win_size so no error"),
                                    )
                                })
                                .collect::<Result<Vec<F32Bw0and1>, _>>()?,
                        );
                    }
                }
                _ => {}
            }
        }
        Ok(result)
    }
    /// Performs a count of number of bases per modified type.
    /// Note that this result depends on the type of filtering done
    /// while the struct was created e.g. by modification threshold.
    ///
    /// # Panics
    /// Panics if the number of modifications exceeds `u32::MAX` (approximately 4.2 billion).
    ///
    /// ```
    /// use nanalogue_core::{CurrRead, Error, ModChar, nanalogue_bam_reader, ThresholdState};
    /// use rust_htslib::bam::Read;
    /// use std::collections::HashMap;
    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
    /// let mut count = 0;
    /// for record in reader.records(){
    ///     let r = record?;
    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?
    ///         .set_mod_data(&r, ThresholdState::GtEq(180), 0)?;
    ///     let mod_count = curr_read.base_count_per_mod();
    ///     let zero_count = HashMap::from([(ModChar::new('T'), 0)]);
    ///     let a = HashMap::from([(ModChar::new('T'), 3)]);
    ///     let b = HashMap::from([(ModChar::new('T'), 1)]);
    ///     let c = HashMap::from([(ModChar::new('T'), 3),(ModChar::new('ᰠ'), 0)]);
    ///     match (count, mod_count) {
    ///         (0, v) => assert_eq!(v, zero_count),
    ///         (1, v) => assert_eq!(v, a),
    ///         (2, v) => assert_eq!(v, b),
    ///         (3, v) => assert_eq!(v, c),
    ///         _ => unreachable!(),
    ///     }
    ///     count = count + 1;
    /// }
    /// # Ok::<(), Error>(())
    /// ```
    #[must_use]
    pub fn base_count_per_mod(&self) -> HashMap<ModChar, u32> {
        let mut output = HashMap::<ModChar, u32>::new();
        #[expect(
            clippy::arithmetic_side_effects,
            reason = "u32::MAX approx 4.2 Gb, v unlikely 1 molecule is this modified"
        )]
        for k in &self.mod_data().0.base_mods {
            let base_count =
                u32::try_from(k.ranges.annotations.len()).expect("number conversion error");
            let _: &mut u32 = output
                .entry(ModChar::new(k.modification_type))
                .and_modify(|e| *e += base_count)
                .or_insert(base_count);
        }
        output
    }
}

/// To format and display modification data in a condensed manner.
trait DisplayCondensedModData {
    /// Outputs mod data section in a condensed manner.
    fn mod_data_section(&self) -> Result<String, fmt::Error>;
}

/// No mod data means no display is produced
impl<S> DisplayCondensedModData for CurrRead<S>
where
    S: CurrReadStateOnlyAlign + CurrReadState,
{
    fn mod_data_section(&self) -> Result<String, fmt::Error> {
        Ok(String::new())
    }
}

/// Implements display when mod data is available
impl DisplayCondensedModData for CurrRead<AlignAndModData> {
    #[expect(
        clippy::pattern_type_mismatch,
        reason = "suggested notation is verbose but I am not sure"
    )]
    fn mod_data_section(&self) -> Result<String, fmt::Error> {
        let mut mod_count_str = String::new();
        let (BaseMods { base_mods: v }, w) = &self.mods;
        for k in v {
            write!(
                mod_count_str,
                "{}{}{}:{};",
                k.modified_base as char,
                k.strand,
                ModChar::new(k.modification_type),
                k.ranges.annotations.len()
            )?;
        }
        if mod_count_str.is_empty() {
            write!(mod_count_str, "NA")?;
        } else {
            write!(
                mod_count_str,
                "({}, PHRED base qual >= {})",
                w, self.mod_base_qual_thres
            )?;
        }
        Ok(format!(",\n\t\"mod_count\": \"{mod_count_str}\""))
    }
}

impl<S> fmt::Display for CurrRead<S>
where
    S: CurrReadState,
    CurrRead<S>: DisplayCondensedModData,
{
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut output_string = String::from("{\n");

        writeln!(output_string, "\t\"read_id\": \"{}\",", self.read_id)?;

        if let Some(v) = self.seq_len {
            writeln!(output_string, "\t\"sequence_length\": {v},")?;
        }

        if let Some((v, w)) = self.contig_id_and_start {
            let num_str = &v.to_string();
            writeln!(
                output_string,
                "\t\"contig\": \"{}\",",
                if let Some(x) = self.contig_name.as_ref() {
                    x
                } else {
                    num_str
                }
            )?;
            writeln!(output_string, "\t\"reference_start\": {w},")?;
            if let Some(x) = self.align_len {
                writeln!(
                    output_string,
                    "\t\"reference_end\": {},",
                    w.checked_add(x)
                        .expect("numeric overflow in calculating reference_end")
                )?;
                writeln!(output_string, "\t\"alignment_length\": {x},")?;
            }
        }

        write!(output_string, "\t\"alignment_type\": \"{}\"", self.state)?;
        writeln!(output_string, "{}", &self.mod_data_section()?)?;
        output_string.push('}');
        output_string.fmt(f)
    }
}

/// Converts `CurrRead` to `StrandedBed3`
///
/// ```
/// use bedrs::{Coordinates, Strand};
/// use bedrs::prelude::StrandedBed3;
/// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
/// use rust_htslib::bam::Read;
/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
/// let mut count = 0;
/// for record in reader.records(){
///     let r = record?;
///     let mut curr_read = CurrRead::default().try_from_only_alignment(&r)?;
///     let Ok(bed3_stranded) = StrandedBed3::try_from(&curr_read) else {unreachable!()};
///     let exp_bed3_stranded = match count {
///         0 => StrandedBed3::new(0, 9, 17, Strand::Forward),
///         1 => StrandedBed3::new(2, 23, 71, Strand::Forward),
///         2 => StrandedBed3::new(1, 3, 36, Strand::Reverse),
///         3 => StrandedBed3::empty(),
///         _ => unreachable!(),
///     };
///     assert_eq!(*bed3_stranded.chr(), *exp_bed3_stranded.chr());
///     assert_eq!(bed3_stranded.start(), exp_bed3_stranded.start());
///     assert_eq!(bed3_stranded.end(), exp_bed3_stranded.end());
///     assert_eq!(bed3_stranded.strand(), exp_bed3_stranded.strand());
///     count = count + 1;
/// }
/// # Ok::<(), Error>(())
/// ```
impl<S: CurrReadStateWithAlign + CurrReadState> TryFrom<&CurrRead<S>> for StrandedBed3<i32, u64> {
    type Error = Error;

    #[expect(
        clippy::arithmetic_side_effects,
        reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
    )]
    fn try_from(value: &CurrRead<S>) -> Result<Self, Self::Error> {
        match (
            value.read_state().strand(),
            value.align_len().ok(),
            value.contig_id_and_start().ok(),
        ) {
            ('.', _, _) => Ok(StrandedBed3::empty()),
            (_, None, _) => Err(Error::InvalidAlignLength(format!(
                "align len not set while converting to bed3! read_id: {}",
                value.read_id()
            ))),
            (_, _, None) => Err(Error::InvalidContigAndStart(format!(
                "contig id and start not set while converting to bed3! read_id: {}",
                value.read_id()
            ))),
            ('+', Some(al), Some((cg, st))) => {
                Ok(StrandedBed3::new(cg, st, st + al, Strand::Forward))
            }
            ('-', Some(al), Some((cg, st))) => {
                Ok(StrandedBed3::new(cg, st, st + al, Strand::Reverse))
            }
            (_, _, _) => unreachable!("strand should be +/-/., so we should never reach this"),
        }
    }
}

/// Convert a rust htslib record to our `CurrRead` struct.
/// NOTE: This operation loads many types of data from the
/// record and you may not be interested in all of them.
/// So, unless you know for sure that you are dealing with
/// a small number of reads, please do not use this function,
/// and call only a subset of the individual invocations below
/// in your program for the sake of speed and/or memory.
impl TryFrom<Record> for CurrRead<AlignAndModData> {
    type Error = Error;

    fn try_from(record: Record) -> Result<Self, Self::Error> {
        let curr_read_state = CurrRead::default()
            .try_from_only_alignment(&record)?
            .set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
        Ok(curr_read_state)
    }
}

/// Convert a rust htslib rc record into our struct.
/// I think the rc datatype is just like the normal record,
/// except the record datatype is not destroyed and created
/// every time a new record is read (or something like that).
/// All comments I've made for the `TryFrom<Record>` function
/// apply here as well.
impl TryFrom<Rc<Record>> for CurrRead<AlignAndModData> {
    type Error = Error;

    fn try_from(record: Rc<Record>) -> Result<Self, Self::Error> {
        let curr_read_state = CurrRead::default()
            .try_from_only_alignment(&record)?
            .set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
        Ok(curr_read_state)
    }
}

/// Implements filter by reference coordinates for our `CurrRead`.
/// This only filters modification data.
impl FilterByRefCoords for CurrRead<AlignAndModData> {
    /// filters modification data by reference position i.e. all pos such that
    /// start <= pos < end are retained. does not use contig in filtering.
    fn filter_by_ref_pos(&mut self, start: i64, end: i64) -> Result<(), Error> {
        for k in &mut self.mods.0.base_mods {
            k.ranges.filter_by_ref_pos(start, end)?;
        }
        Ok(())
    }
}

/// Serialized representation of [`CurrRead`]; can also be used as a builder
/// to build [`CurrRead`]. This is useful if modification data is in formats
/// other than mod BAM, so we can transform it into [`CurrRead`] and use it
/// with functions from our library.
///
/// # Examples
///
/// First example, unmapped read with very little information
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder};
/// let read = CurrReadBuilder::default().build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// Add some simple information, still unmapped.
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder};
/// let read = CurrReadBuilder::default().read_id("some_read".into()).seq_len(40).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// If we try to build a mapped read, we hit a panic as we haven't specified alignment information.
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder,ReadState};
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment_type(ReadState::PrimaryFwd).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// Mapped read building
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ReadState};
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment_type(ReadState::PrimaryFwd)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1).build()?).build()?;
/// # Ok::<(), Error>(())
/// ```
/// ## Mapped read building with modification information.
/// Mod table entries are associated with one type of modification each.
/// The data tuples are: `sequence coordinate`, `reference coordinate`,
/// `modification probability`. The first varies from 0 to `sequence length` - 1,
/// the second varies within alignment coordinates and is -1 for an unmapped
/// position or all entries are -1 if the read as a whole is unmapped, and
/// the third entry varies from 0 - 255 where 0 means no modification with
/// certainty, and 255 means modification with certainty.
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
///     ReadState};
///
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, 15, 200), (2, 25, 100)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment_type(ReadState::PrimaryFwd)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1).build()?)
///         .mod_table([mod_table_entry].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// ## Mapped read building with multiple mod types.
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
///     ReadState};
///
/// let mod_table_entry_1 = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, 15, 200), (2, 25, 100)]).build()?;
///
/// let mod_table_entry_2 = ModTableEntryBuilder::default()
///     .base('A')
///     .is_strand_plus(true)
///     .mod_code("a".into())
///     .data([(1, 20, 50), (3, 30, 225)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment_type(ReadState::PrimaryFwd)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1).build()?)
///         .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
/// ## Unmapped read building with multiple mod types.
///
/// Note that all the reference coordinates are set to -1.
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
///
/// let mod_table_entry_1 = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, -1, 200), (2, -1, 100)]).build()?;
///
/// let mod_table_entry_2 = ModTableEntryBuilder::default()
///     .base('A')
///     .is_strand_plus(true)
///     .mod_code("a".into())
///     .data([(1, -1, 50), (3, -1, 225)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// ## Erroneous Usage
///
/// ### Coordinates are not sorted
///
/// Please note that even for reverse-aligned reads, coordinates must always
/// be ascending.
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
///     ReadState};
///
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(2, 25, 100), (0, 15, 200)]).build()?;
///
/// let read_before_build = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1).build()?)
///         .mod_table([mod_table_entry].into());
///
/// let _ = read_before_build.alignment_type(ReadState::PrimaryFwd).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// ```should_panic
/// # use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
/// #    ReadState};
/// #
/// # let mod_table_entry = ModTableEntryBuilder::default()
/// #    .base('C')
/// #    .is_strand_plus(true)
/// #    .mod_code("m".into())
/// #    .data([(2, 25, 100), (0, 15, 200)]).build()?;
/// #
/// # let read_before_build = CurrReadBuilder::default()
/// #    .read_id("some_read".into())
/// #    .seq_len(40)
/// #    .alignment(AlignmentInfoBuilder::default()
/// #        .start(10)
/// #        .end(60)
/// #        .contig("chr1".into())
/// #        .contig_id(1).build()?)
/// #        .mod_table([mod_table_entry].into());
/// #
/// let _ = read_before_build.alignment_type(ReadState::PrimaryRev).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// ### Unmapped read but mod reference coordinates are not set to `-1`.
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
///
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, -1, 200), (2, 11, 100)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .mod_table([mod_table_entry].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// ### Read with mod coordinates larger than sequence length
///
/// This will cause a problem irrespective of whether a read is mapped or not.
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
///
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, -1, 200), (42, -1, 100)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .mod_table([mod_table_entry].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
///     ReadState};
///
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, 20, 200), (42, 30, 100)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1).build()?)
///     .mod_table([mod_table_entry].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
/// ### Read with mod coordinates beyond alignment coordinates
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
///     ReadState};
///
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, 1, 200), (22, 30, 100)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1).build()?)
///     .mod_table([mod_table_entry].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
/// ### Reads with multiple tracks of the same kind of modification.
///
/// Here we have two `C+m` tracks. Note that two cytosine tracks or two
/// methylation tracks or two cytosine methylation tracks on opposite
/// strands are all allowed. Only the case where there are two tracks
/// of the same base with the same modification on the same strand
/// are disallowed as the same kind of information is being populated twice.
///
/// ```should_panic
/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
///
/// let mod_table_entry_1 = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, -1, 200), (2, -1, 100)]).build()?;
///
/// let mod_table_entry_2 = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(1, -1, 50), (3, -1, 225)]).build()?;
///
/// let read = CurrReadBuilder::default()
///     .read_id("some_read".into())
///     .seq_len(40)
///     .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
/// # Ok::<(), Error>(())
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct CurrReadBuilder {
    /// The type of alignment
    alignment_type: ReadState,
    /// Alignment information, None for unmapped reads
    #[serde(skip_serializing_if = "Option::is_none")]
    alignment: Option<AlignmentInfo>,
    /// Condensed modification data table
    mod_table: Vec<ModTableEntry>,
    /// Read identifier
    read_id: String,
    /// Sequence length
    seq_len: u64,
}

impl Default for CurrReadBuilder {
    fn default() -> Self {
        Self {
            alignment_type: ReadState::Unmapped, // note that default is unmapped now, not primary
            alignment: None,
            mod_table: Vec::new(),
            read_id: String::new(),
            seq_len: 0,
        }
    }
}

/// Alignment information for mapped reads
///
/// See documentation of [`CurrReadBuilder`] on how to use
/// this struct.
#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned")]
pub struct AlignmentInfo {
    /// Start position on reference
    start: u64,
    /// End position on reference
    end: u64,
    /// Contig/chromosome name
    contig: String,
    /// Contig/chromosome ID
    contig_id: i32,
}

/// Data per type of modification in [`CurrReadBuilder`].
///
/// See documentation of [`CurrReadBuilder`] on how to use
/// this struct.
#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned")]
pub struct ModTableEntry {
    /// Base that is modified (A, C, G, T, etc.)
    #[builder(field(ty = "char", build = "self.base.try_into()?"))]
    base: AllowedAGCTN,
    /// Whether this is on the plus strand
    is_strand_plus: bool,
    /// Modification code (character or numeric)
    #[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
    mod_code: ModChar,
    /// Modification data as [`start`, `ref_start`, `mod_qual`] tuples
    #[builder(setter(into))]
    data: Vec<(u64, i64, u8)>,
}

impl CurrReadBuilder {
    /// Sets alignment type
    #[must_use]
    pub fn alignment_type(mut self, value: ReadState) -> Self {
        self.alignment_type = value;
        self
    }
    /// Sets alignment information
    #[must_use]
    pub fn alignment(mut self, value: AlignmentInfo) -> Self {
        self.alignment = Some(value);
        self
    }
    /// Sets mod information
    #[must_use]
    pub fn mod_table(mut self, value: Vec<ModTableEntry>) -> Self {
        self.mod_table = value;
        self
    }
    /// Sets read id
    #[must_use]
    pub fn read_id(mut self, value: String) -> Self {
        self.read_id = value;
        self
    }
    /// Sets sequence length
    #[must_use]
    pub fn seq_len(mut self, value: u64) -> Self {
        self.seq_len = value;
        self
    }
    /// Build `CurrRead`
    ///
    /// # Errors
    /// If the conversion fails. This could happen if invalid
    /// data was used in the building.
    pub fn build(self) -> Result<CurrRead<AlignAndModData>, Error> {
        CurrRead::<AlignAndModData>::try_from(self)
    }
}

impl Serialize for CurrRead<AlignAndModData> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let serialized_curr_read: CurrReadBuilder =
            self.clone().try_into().map_err(serde::ser::Error::custom)?;
        serialized_curr_read.serialize(serializer)
    }
}

impl TryFrom<CurrRead<AlignAndModData>> for CurrReadBuilder {
    type Error = Error;

    fn try_from(curr_read: CurrRead<AlignAndModData>) -> Result<Self, Self::Error> {
        let alignment_type = curr_read.read_state();

        #[expect(
            clippy::arithmetic_side_effects,
            reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
        )]
        let alignment = if curr_read.read_state().is_unmapped() {
            None
        } else {
            let (contig_id, start) = curr_read.contig_id_and_start()?;
            let align_len = curr_read.align_len()?;
            let contig = curr_read.contig_name()?.to_string();
            let end = start + align_len;

            Some(AlignmentInfo {
                start,
                end,
                contig,
                contig_id,
            })
        };

        let mod_table = condense_base_mods(&curr_read.mod_data().0)?;

        let read_id = curr_read.read_id().to_string();
        let seq_len = curr_read.seq_len()?;

        Ok(CurrReadBuilder {
            alignment_type,
            alignment,
            mod_table,
            read_id,
            seq_len,
        })
    }
}

impl<'de> Deserialize<'de> for CurrRead<AlignAndModData> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let serialized = CurrReadBuilder::deserialize(deserializer)?;
        serialized.try_into().map_err(serde::de::Error::custom)
    }
}

impl TryFrom<CurrReadBuilder> for CurrRead<AlignAndModData> {
    type Error = Error;

    fn try_from(serialized: CurrReadBuilder) -> Result<Self, Self::Error> {
        // Extract alignment information
        let (align_len, contig_id_and_start, contig_name, ref_range) = match (
            serialized.alignment_type.is_unmapped(),
            serialized.alignment.as_ref(),
        ) {
            (false, Some(alignment)) => {
                let align_len = {
                    if let Some(v) = alignment.end.checked_sub(alignment.start) {
                        Ok(Some(v))
                    } else {
                        Err(Error::InvalidAlignCoords(format!(
                            "is align end {0} < align start {1}? read {2} failed in `CurrRead` building!",
                            alignment.end, alignment.start, serialized.read_id
                        )))
                    }
                }?;
                let contig_id_and_start = Some((alignment.contig_id, alignment.start));
                let contig_name = Some(alignment.contig.clone());
                (
                    align_len,
                    contig_id_and_start,
                    contig_name,
                    i64::try_from(alignment.start)?..i64::try_from(alignment.end)?,
                )
            }
            (true, None) => (None, None, None, (-1i64)..0),
            (_, _) => {
                return Err(Error::UnknownAlignState(format!(
                    "alignment_type and alignment info not matching while building CurrRead! read_id: {}",
                    serialized.read_id,
                )));
            }
        };

        // Reconstruct BaseMods from mod_table
        let base_mods = reconstruct_base_mods(
            &serialized.mod_table,
            serialized.alignment_type.strand() == '-',
            ref_range,
            serialized.seq_len,
        )?;

        // Create CurrRead directly
        Ok(CurrRead {
            state: serialized.alignment_type,
            read_id: serialized.read_id,
            seq_len: Some(serialized.seq_len),
            align_len,
            mods: (base_mods, ThresholdState::GtEq(0)), // Default threshold
            contig_id_and_start,
            contig_name,
            mod_base_qual_thres: 0, // Default value
            marker: std::marker::PhantomData,
        })
    }
}

/// Convert `BaseMods` to condensed `mod_table` format
fn condense_base_mods(base_mods: &BaseMods) -> Result<Vec<ModTableEntry>, Error> {
    let mut mod_table = Vec::new();

    for base_mod in &base_mods.base_mods {
        let entries: Result<Vec<_>, Error> = base_mod
            .ranges
            .annotations
            .iter()
            .map(|k| {
                let start: u64 = k.start.try_into()?;
                let ref_start = k.reference_start.unwrap_or(-1);
                Ok((start, ref_start, k.qual))
            })
            .collect();

        mod_table.push(ModTableEntry {
            base: AllowedAGCTN::try_from(base_mod.modified_base)?,
            is_strand_plus: base_mod.strand == '+',
            mod_code: ModChar::new(base_mod.modification_type),
            data: entries?,
        });
    }

    Ok(mod_table)
}

/// Reconstruct `BaseMods` from condensed `mod_table` format
fn reconstruct_base_mods(
    mod_table: &[ModTableEntry],
    is_reverse: bool,
    ref_range: Range<i64>,
    seq_len: u64,
) -> Result<BaseMods, Error> {
    let mut base_mods = Vec::new();

    for entry in mod_table {
        let annotations = {
            let mut annotations = Vec::<FiberAnnotation>::with_capacity(entry.data.len());
            let mut valid_range = 0..seq_len;

            #[expect(
                clippy::arithmetic_side_effects,
                reason = "overflow errors not possible as genomic coordinates << 2^64"
            )]
            for &(start, ref_start, qual) in &entry.data {
                // Check that sequence coordinates are in range [prev_start + 1, seq_len)
                if !valid_range.contains(&start) {
                    return Err(Error::InvalidModCoords(String::from(
                        "in mod table, read coords > seq length or read coords not sorted (NOTE: \
ascending needed even if reversed read)!",
                    )));
                }
                valid_range = start + 1..seq_len;

                let ref_start_after_check = match (ref_start, ref_range.contains(&ref_start)) {
                    (-1, _) => None,
                    (v, true) if v > -1 => Some(v),
                    (v, _) => {
                        return Err(Error::InvalidAlignCoords(format!(
                            "coordinate {v} invalid in mod table (exceeds alignment coords or is < -1)"
                        )));
                    }
                };
                let start_i64 = i64::try_from(start)?;
                annotations.push(FiberAnnotation {
                    start: start_i64,
                    end: start_i64 + 1,
                    length: 1,
                    qual,
                    reference_start: ref_start_after_check,
                    reference_end: ref_start_after_check,
                    reference_length: ref_start_after_check.is_some().then_some(0),
                    extra_columns: None,
                });
            }
            annotations
        };

        let ranges = Ranges::from_annotations(annotations, seq_len.try_into()?, is_reverse);

        let strand = if entry.is_strand_plus { '+' } else { '-' };

        base_mods.push(BaseMod {
            modified_base: u8::try_from(char::from(entry.base))?,
            strand,
            modification_type: entry.mod_code.val(),
            ranges,
            record_is_reverse: is_reverse,
        });
    }

    base_mods.sort();

    // Check for duplicate strand, modification_type combinations
    let mut seen_combinations = HashSet::new();
    for base_mod in &base_mods {
        let combination = (base_mod.strand, base_mod.modification_type);
        if seen_combinations.contains(&combination) {
            return Err(Error::InvalidDuplicates(format!(
                "Duplicate strand '{}' and modification_type '{}' combination found",
                base_mod.strand, base_mod.modification_type
            )));
        }
        let _: bool = seen_combinations.insert(combination);
    }

    Ok(BaseMods { base_mods })
}

/// Convert a vector of `CurrRead<AlignAndModData>` to a Polars `DataFrame`
/// with one row per modification data point
///
/// Each modification data point becomes a row in the `DataFrame`, with read-level
/// and alignment-level information repeated across rows for the same read.
///
/// # Example
///
/// ```
/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
///     ReadState, curr_reads_to_dataframe};
/// use polars::prelude::*;
///
/// // Build modification table entries with two data points
/// let mod_table_entry = ModTableEntryBuilder::default()
///     .base('C')
///     .is_strand_plus(true)
///     .mod_code("m".into())
///     .data([(0, 15, 200), (2, 25, 100)])
///     .build()?;
///
/// // Build a CurrRead<AlignAndModData>
/// let read = CurrReadBuilder::default()
///     .read_id("test_read".into())
///     .seq_len(40)
///     .alignment_type(ReadState::PrimaryFwd)
///     .alignment(AlignmentInfoBuilder::default()
///         .start(10)
///         .end(60)
///         .contig("chr1".into())
///         .contig_id(1)
///         .build()?)
///     .mod_table([mod_table_entry].into())
///     .build()?;
///
/// // Convert to DataFrame
/// let df = curr_reads_to_dataframe(&[read])?;
///
/// // Verify DataFrame structure
/// assert_eq!(df.height(), 2); // Two modification data points
/// assert_eq!(df.width(), 13); // 13 columns
///
/// // Check read-level fields (repeated across both rows)
/// let read_id_col = df.column("read_id")?.str()?;
/// assert_eq!(read_id_col.get(0), Some("test_read"));
/// assert_eq!(read_id_col.get(1), Some("test_read"));
///
/// let seq_len_col = df.column("seq_len")?.u64()?;
/// assert_eq!(seq_len_col.get(0), Some(40));
/// assert_eq!(seq_len_col.get(1), Some(40));
///
/// let alignment_type_col = df.column("alignment_type")?.str()?;
/// assert_eq!(alignment_type_col.get(0), Some("primary_forward"));
/// assert_eq!(alignment_type_col.get(1), Some("primary_forward"));
///
/// // Check alignment fields (repeated across both rows)
/// let align_start_col = df.column("align_start")?.u64()?;
/// assert_eq!(align_start_col.get(0), Some(10));
/// assert_eq!(align_start_col.get(1), Some(10));
///
/// let align_end_col = df.column("align_end")?.u64()?;
/// assert_eq!(align_end_col.get(0), Some(60));
/// assert_eq!(align_end_col.get(1), Some(60));
///
/// let contig_col = df.column("contig")?.str()?;
/// assert_eq!(contig_col.get(0), Some("chr1"));
/// assert_eq!(contig_col.get(1), Some("chr1"));
///
/// let contig_id_col = df.column("contig_id")?.i32()?;
/// assert_eq!(contig_id_col.get(0), Some(1));
/// assert_eq!(contig_id_col.get(1), Some(1));
///
/// // Check modification entry fields (repeated across both rows)
/// let base_col = df.column("base")?.str()?;
/// assert_eq!(base_col.get(0), Some("C"));
/// assert_eq!(base_col.get(1), Some("C"));
///
/// let is_strand_plus_col = df.column("is_strand_plus")?.bool()?;
/// assert_eq!(is_strand_plus_col.get(0), Some(true));
/// assert_eq!(is_strand_plus_col.get(1), Some(true));
///
/// let mod_code_col = df.column("mod_code")?.str()?;
/// assert_eq!(mod_code_col.get(0), Some("m"));
/// assert_eq!(mod_code_col.get(1), Some("m"));
///
/// // Check modification data points (unique per row)
/// let position_col = df.column("position")?.u64()?;
/// assert_eq!(position_col.get(0), Some(0)); // First mod position
/// assert_eq!(position_col.get(1), Some(2)); // Second mod position
///
/// let ref_position_col = df.column("ref_position")?.i64()?;
/// assert_eq!(ref_position_col.get(0), Some(15)); // First ref position
/// assert_eq!(ref_position_col.get(1), Some(25)); // Second ref position
///
/// let mod_quality_col = df.column("mod_quality")?.u32()?;
/// assert_eq!(mod_quality_col.get(0), Some(200)); // First mod_quality score
/// assert_eq!(mod_quality_col.get(1), Some(100)); // Second mod_quality score
///
/// # Ok::<(), Error>(())
/// ```
///
/// # Errors
/// Returns nanalogue `Error` if `DataFrame` construction fails or if
/// data extraction from `CurrRead` fails
#[expect(
    clippy::arithmetic_side_effects,
    reason = "start + alen cannot overflow as genomic coordinates are far below u64::MAX"
)]
pub fn curr_reads_to_dataframe(reads: &[CurrRead<AlignAndModData>]) -> Result<DataFrame, Error> {
    // Vectors to hold column data
    let mut read_ids: Vec<String> = Vec::new();
    let mut seq_lens: Vec<Option<u64>> = Vec::new();
    let mut alignment_types: Vec<String> = Vec::new();

    // Alignment info columns
    let mut align_starts: Vec<Option<u64>> = Vec::new();
    let mut align_ends: Vec<Option<u64>> = Vec::new();
    let mut contigs: Vec<Option<String>> = Vec::new();
    let mut contig_ids: Vec<Option<i32>> = Vec::new();

    // Modification entry columns
    let mut bases: Vec<String> = Vec::new();
    let mut is_strand_plus: Vec<bool> = Vec::new();
    let mut mod_codes: Vec<String> = Vec::new();

    // Modification data point columns (the unique part)
    let mut positions: Vec<u64> = Vec::new();
    let mut ref_positions: Vec<i64> = Vec::new();
    let mut mod_qualities: Vec<u32> = Vec::new();

    // Iterate through each CurrRead
    for read in reads {
        // Extract read-level information
        let read_id = read.read_id();
        let seq_len = read.seq_len().ok();
        let alignment_type = read.read_state();

        // Extract alignment information (may be None for unmapped reads)
        let (contig_id, align_start, align_end, contig_name) =
            if let (Ok((cid, start)), Ok(alen), Ok(cname)) = (
                read.contig_id_and_start(),
                read.align_len(),
                read.contig_name(),
            ) {
                (
                    Some(cid),
                    Some(start),
                    Some(start + alen),
                    Some(cname.to_string()),
                )
            } else {
                (None, None, None, None)
            };

        // Get BaseMods and condense to ModTableEntry format
        let mod_data = read.mod_data();
        let mod_table = condense_base_mods(&mod_data.0)?;

        // For each modification table entry
        for mod_entry in &mod_table {
            // For each data point in this mod entry
            for &(start, ref_start, qual) in &mod_entry.data {
                // Add read-level fields (repeated)
                read_ids.push(read_id.to_string());
                seq_lens.push(seq_len);
                alignment_types.push(alignment_type.to_string());

                // Add alignment fields (repeated, may be None)
                align_starts.push(align_start);
                align_ends.push(align_end);
                contigs.push(contig_name.clone());
                contig_ids.push(contig_id);

                // Add mod entry fields (repeated)
                bases.push(mod_entry.base.to_string());
                is_strand_plus.push(mod_entry.is_strand_plus);
                mod_codes.push(mod_entry.mod_code.to_string());

                // Add data point fields (unique per row)
                positions.push(start);
                ref_positions.push(ref_start);
                mod_qualities.push(u32::from(qual));
            }
        }
    }

    // Build the DataFrame
    let df = df! {
        "read_id" => read_ids,
        "seq_len" => seq_lens,
        "alignment_type" => alignment_types,
        "align_start" => align_starts,
        "align_end" => align_ends,
        "contig" => contigs,
        "contig_id" => contig_ids,
        "base" => bases,
        "is_strand_plus" => is_strand_plus,
        "mod_code" => mod_codes,
        "position" => positions,
        "ref_position" => ref_positions,
        "mod_quality" => mod_qualities,
    }?;

    Ok(df)
}

#[cfg(test)]
mod test_error_handling {
    use super::*;

    #[test]
    fn set_read_state_not_implemented_error() {
        for flag_value in 0..4096u16 {
            let mut record = Record::new();
            record.set_qname(b"test_read");
            record.set_flags(flag_value);
            let curr_read = CurrRead::default();
            // * first line below is usual primary, secondary, supplementary,
            //   unmapped with reversed set or not with the first 3 categories
            // * second line is if unmapped is set with a mapped state, or
            //   secondary and supplementary are both set
            // * third line are flags we have chosen to exclude from our program.
            match (flag_value, curr_read.set_read_state_and_id(&record)) {
                (0 | 4 | 16 | 256 | 272 | 2048 | 2064, Ok(_))
                | (
                    20 | 260 | 276 | 2052 | 2068 | 2304 | 2308 | 2320 | 2324,
                    Err(Error::UnknownAlignState(_)),
                )
                | (_, Err(Error::NotImplemented(_))) => {}
                (_, _) => unreachable!(),
            }
        }
    }

    #[test]
    #[should_panic(expected = "InvalidDuplicates")]
    fn reconstruct_base_mods_detects_duplicates() {
        // Create a test case with duplicate strand and modification_type combinations
        // i.e. we have two entries for T+T below with different data
        let mod_entries = vec![
            ModTableEntry {
                base: AllowedAGCTN::T,
                is_strand_plus: true,
                mod_code: ModChar::new('T'),
                data: vec![(0, 0, 10)],
            },
            ModTableEntry {
                base: AllowedAGCTN::T,
                is_strand_plus: true,
                mod_code: ModChar::new('T'),
                data: vec![(1, 1, 20)],
            },
        ];

        // Test reconstruct_base_mods with duplicate entries
        let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
    }

    #[test]
    fn reconstruct_base_mods_accepts_unique_combinations() {
        // Create a valid case with different combinations
        // First entry: T+ modification
        // Second entry: C+ modification (different base, same strand)
        // Third entry: T- modification (same base, different strand)
        let mod_entries = vec![
            ModTableEntry {
                base: AllowedAGCTN::T,
                is_strand_plus: true,
                mod_code: ModChar::new('T'),
                data: vec![(0, 0, 10)],
            },
            ModTableEntry {
                base: AllowedAGCTN::C,
                is_strand_plus: true,
                mod_code: ModChar::new('m'),
                data: vec![(1, 1, 20)],
            },
            ModTableEntry {
                base: AllowedAGCTN::T,
                is_strand_plus: false,
                mod_code: ModChar::new('T'),
                data: vec![(2, 2, 30)],
            },
        ];

        // This should succeed since all combinations are unique
        let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
    }
}

#[cfg(test)]
mod test_serde {
    use super::*;
    use crate::nanalogue_bam_reader;
    use indoc::indoc;
    use rust_htslib::bam::Read as _;

    #[test]
    fn first_record_serde() -> Result<(), Error> {
        // Read the first record from the example BAM file
        let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
        let first_record = reader.records().next().unwrap()?;

        // Create CurrRead with alignment and modification data
        let curr_read = CurrRead::default()
            .try_from_only_alignment(&first_record)?
            .set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;

        let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;

        let expected_json_str = indoc! {r#"
            {
              "alignment_type": "primary_forward",
              "alignment": {
                "start": 9,
                "end": 17,
                "contig": "dummyI",
                "contig_id": 0
              },
              "mod_table": [
                {
                  "base": "T",
                  "is_strand_plus": true,
                  "mod_code": "T",
                  "data": [
                    [0, 9, 4],
                    [3, 12, 7],
                    [4, 13, 9],
                    [7, 16, 6]
                  ]
                }
              ],
              "read_id": "5d10eb9a-aae1-4db8-8ec6-7ebb34d32575",
              "seq_len": 8
            }"#};

        let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;

        // Compare expected and actual outputs
        assert_eq!(actual_json, expected_json);

        // Also test deserialization: deserialize the expected JSON and compare with original CurrRead
        let deserialized_curr_read: CurrRead<AlignAndModData> =
            serde_json::from_str(expected_json_str)?;
        assert_eq!(deserialized_curr_read, curr_read);

        Ok(())
    }

    #[test]
    fn first_record_roundtrip() -> Result<(), Error> {
        // Read the first record from the example BAM file (same as serialization test)
        let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
        let first_record = reader.records().next().unwrap()?;

        // Create the original CurrRead with alignment and modification data
        let original_curr_read = CurrRead::default()
            .try_from_only_alignment(&first_record)?
            .set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;

        // Serialize to JSON
        let json_str = serde_json::to_string_pretty(&original_curr_read)?;

        // Deserialize back to CurrRead
        let deserialized_curr_read: CurrRead<AlignAndModData> = serde_json::from_str(&json_str)?;

        // The deserialized CurrRead should be equal to the original
        assert_eq!(deserialized_curr_read, original_curr_read);

        Ok(())
    }

    #[test]
    fn blank_json_record_roundtrip() -> Result<(), Error> {
        let json_str = indoc! {r"
            {
            }"};

        // Deserialize JSON to CurrRead
        let curr_read: CurrRead<AlignAndModData> = serde_json::from_str(json_str)?;

        // Serialize back to JSON
        let serialized_json = serde_json::to_string_pretty(&curr_read)?;

        // Deserialize again
        let roundtrip_curr_read: CurrRead<AlignAndModData> =
            serde_json::from_str(&serialized_json)?;

        // Check that the roundtrip preserves equality
        assert_eq!(curr_read, roundtrip_curr_read);

        Ok(())
    }

    #[test]
    fn example_1_unmapped() -> Result<(), Error> {
        // Read the fourth record from the example BAM file (this is the unmapped read)
        let fourth_record = {
            let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
            let mut records = reader.records();
            for _ in 0..3 {
                let _: Record = records.next().unwrap()?;
            }
            records.next().unwrap()?
        };

        // Create CurrRead with alignment and modification data
        let curr_read = CurrRead::default()
            .try_from_only_alignment(&fourth_record)?
            .set_mod_data(&fourth_record, ThresholdState::GtEq(0), 0)?;

        let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;

        let expected_json_str = indoc! {r#"
            {
              "alignment_type": "unmapped",
              "mod_table": [
                {
                  "base": "G",
                  "is_strand_plus": false,
                  "mod_code": "7200",
                  "data": [
                    [28, -1, 0],
                    [29, -1, 0],
                    [30, -1, 0],
                    [32, -1, 0],
                    [43, -1, 77],
                    [44, -1, 0]
                  ]
                },
                {
                  "base": "T",
                  "is_strand_plus": true,
                  "mod_code": "T",
                  "data": [
                    [3, -1, 221],
                    [8, -1, 242],
                    [27, -1, 0],
                    [39, -1, 47],
                    [47, -1, 239]
                  ]
                }
              ],
              "read_id": "a4f36092-b4d5-47a9-813e-c22c3b477a0c",
              "seq_len": 48
            }"#};

        let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;

        // Compare expected and actual outputs
        assert_eq!(actual_json, expected_json);

        Ok(())
    }

    #[test]
    #[should_panic(expected = "invalid alignment coordinates")]
    fn invalid_align_coords_unmapped_with_reference_positions() {
        let invalid_json = r#"{
            "mod_table": [
                {
                    "data": [[2, 3, 200]]
                }
            ],
            "seq_len": 10
        }"#;

        // Deserialize JSON to CurrRead - this should panic with InvalidAlignCoords
        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
    }

    #[test]
    #[should_panic(expected = "invalid mod coordinates")]
    fn invalid_sequence_length() {
        let invalid_json = r#"{
            "mod_table": [
                {
                    "data": [[20, -1, 200]]
                }
            ],
            "seq_len": 10
        }"#;

        // Deserialize JSON to CurrRead - this should panic with InvalidSeqLength
        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
    }

    #[test]
    #[should_panic(expected = "invalid value")]
    fn invalid_sequence_coordinate() {
        let invalid_json = r#"{
            "alignment_type": "primary_forward",
            "alignment": {
                "start": 0,
                "end": 30
            },
            "mod_table": [
                {
                    "data": [[-1, 1, 200]]
                }
            ],
            "seq_len": 30
        }"#;

        // Deserialize JSON to CurrRead - this should panic with InvalidSeqLength
        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
    }

    #[test]
    #[should_panic(expected = "invalid alignment coordinates")]
    fn invalid_alignment_coordinates() {
        let invalid_json = r#"{
            "alignment_type": "primary_forward",
            "alignment": {
                "start": 10,
                "end": 25
            },
            "mod_table": [
                {
                    "data": [[0, 10, 200], [1, 20, 180], [2, 30, 220]]
                }
            ],
            "seq_len": 3
        }"#;

        // Deserialize JSON to CurrRead - this should panic with InvalidAlignCoords
        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
    }

    #[test]
    #[should_panic(expected = "invalid mod coordinates")]
    fn invalid_sorting_forward_alignment() {
        let invalid_json = r#"{
            "alignment_type": "primary_forward",
            "alignment": {
                "start": 10,
                "end": 40
            },
            "mod_table": [
                {
                    "data": [[0, 10, 200], [2, 30, 180], [1, 20, 220]]
                }
            ],
            "seq_len": 3
        }"#;

        // Deserialize JSON to CurrRead - this should panic with InvalidSorting
        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
    }

    #[test]
    #[should_panic(expected = "invalid mod coordinates")]
    fn invalid_sorting_reverse_alignment() {
        let invalid_json = r#"{
            "alignment_type": "primary_reverse",
            "alignment": {
                "start": 10,
                "end": 40
            },
            "mod_table": [
                {
                    "data": [[2, 30, 180], [1, 20, 220], [0, 10, 200]]
                }
            ],
            "seq_len": 3
        }"#;

        // Deserialize JSON to CurrRead - this should panic with InvalidSorting
        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
    }
}