nanalogue_core/
read_utils.rs

1//! Implements `CurrRead` Struct for processing information
2//! and the mod information in the BAM file using a parser implemented in
3//! another module.
4
5use crate::{
6    AllowedAGCTN, Contains as _, Error, F32Bw0and1, FilterByRefCoords, InputModOptions,
7    InputRegionOptions, InputWindowing, ModChar, ReadState, ThresholdState, nanalogue_mm_ml_parser,
8};
9use bedrs::prelude::{Intersect as _, StrandedBed3};
10use bedrs::{Bed3, Coordinates as _, Strand};
11use bio_types::genome::AbstractInterval as _;
12use derive_builder::Builder;
13use fibertools_rs::utils::{
14    bamannotations::{FiberAnnotation, Ranges},
15    basemods::{BaseMod, BaseMods},
16};
17use polars::{df, prelude::DataFrame};
18use rust_htslib::{bam::ext::BamRecordExtensions as _, bam::record::Record};
19use serde::{Deserialize, Serialize};
20use std::{
21    cmp::Ordering,
22    collections::{HashMap, HashSet},
23    fmt::{self, Write as _},
24    ops::Range,
25    rc::Rc,
26};
27
28/// Shows [`CurrRead`] has no data, a state-type parameter
29#[derive(Debug, Default, Copy, Clone, PartialEq)]
30#[non_exhaustive]
31pub struct NoData;
32
33/// Shows [`CurrRead`] has only alignment data, a state-type parameter
34#[derive(Debug, Default, Copy, Clone, PartialEq)]
35#[non_exhaustive]
36pub struct OnlyAlignData;
37
38/// Shows [`CurrRead`] has only alignment data but with all fields filled, a state-type parameter
39#[derive(Debug, Default, Copy, Clone, PartialEq)]
40#[non_exhaustive]
41pub struct OnlyAlignDataComplete;
42
43/// Shows [`CurrRead`] has alignment and modification data, a state-type parameter
44#[derive(Debug, Default, Copy, Clone, PartialEq)]
45#[non_exhaustive]
46pub struct AlignAndModData;
47
48/// Dummy trait, associated with the state-type pattern in [`CurrRead`]
49pub trait CurrReadState {}
50
51impl CurrReadState for NoData {}
52impl CurrReadState for OnlyAlignData {}
53impl CurrReadState for OnlyAlignDataComplete {}
54impl CurrReadState for AlignAndModData {}
55
56/// Another dummy trait, associated with the state-type pattern in [`CurrRead`]
57pub trait CurrReadStateWithAlign {}
58
59impl CurrReadStateWithAlign for OnlyAlignData {}
60impl CurrReadStateWithAlign for OnlyAlignDataComplete {}
61impl CurrReadStateWithAlign for AlignAndModData {}
62
63/// Another dummy trait, associated with the state-type pattern in [`CurrRead`]
64pub trait CurrReadStateOnlyAlign {}
65
66impl CurrReadStateOnlyAlign for OnlyAlignData {}
67impl CurrReadStateOnlyAlign for OnlyAlignDataComplete {}
68
69/// Our main struct that receives and stores from one BAM record.
70/// Also has methods for processing this information.
71/// Also see [`CurrReadBuilder`] for a way to build this without BAM records.
72///
73/// We call this `CurrRead` as in 'current read'. `Read` is used
74/// within `rust-htslib`, so we don't want to create another `Read` struct.
75///
76/// The information within the struct is hard to access without
77/// the methods defined here. This is to ensure the struct
78/// doesn't fall into an invalid state, which could cause mistakes
79/// in calculations associated with the struct. For example:
80/// if I want to measure mean modification density along windows
81/// of the raw modification data, I need a guarantee that the
82/// modification data is sorted by position. We can guarantee
83/// this when the modification data is parsed, but we cannot
84/// guarantee this if we allow free access to the struct.
85/// NOTE: we could have implemented these as a trait extension
86/// to the `rust_htslib` `Record` struct, but we have chosen not to,
87/// as we may want to persist data like modifications and do
88/// multiple operations on them. And `Record` has inconvenient
89/// return types like i64 instead of u64 for positions along the
90/// genome.
91#[derive(Debug, Clone, PartialEq)]
92pub struct CurrRead<S: CurrReadState> {
93    /// Stores alignment type.
94    state: ReadState,
95
96    /// Read ID of molecule, also called query name in some contexts.
97    read_id: String,
98
99    /// Length of the stored sequence in a BAM file. This is usually the basecalled sequence.
100    seq_len: Option<u64>,
101
102    /// Length of the segment on the reference genome the molecule maps to.
103    align_len: Option<u64>,
104
105    /// Stores modification information along with any applied thresholds.
106    mods: (BaseMods, ThresholdState),
107
108    /// ID of the reference genome contig and the starting position on
109    /// contig that the molecule maps or aligns to.
110    /// NOTE: the contig here is numeric and refers to an index on the BAM
111    /// header. To convert this into an alphanumeric string, you have to
112    /// process the header and store it in `contig_name` below.
113    /// We have left it this way as it is easier to store and process integers.
114    contig_id_and_start: Option<(i32, u64)>,
115
116    /// Contig name.
117    contig_name: Option<String>,
118
119    /// Base PHRED-quality threshold (no offset). Mods could have been filtered by this.
120    mod_base_qual_thres: u8,
121
122    /// `PhantomData` marker for compiler's sake
123    marker: std::marker::PhantomData<S>,
124}
125
126/// Implements defaults for `CurrRead`
127impl Default for CurrRead<NoData> {
128    fn default() -> Self {
129        CurrRead::<NoData> {
130            state: ReadState::PrimaryFwd,
131            read_id: String::new(),
132            seq_len: None,
133            align_len: None,
134            mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
135            contig_id_and_start: None,
136            contig_name: None,
137            mod_base_qual_thres: 0,
138            marker: std::marker::PhantomData::<NoData>,
139        }
140    }
141}
142
143impl CurrRead<NoData> {
144    /// sets the alignment of the read and read ID using BAM record
145    ///
146    /// # Errors
147    /// While we support normal BAM reads from `ONT`, `PacBio` etc. that contain modifications,
148    /// we do not support some BAM flags like paired, duplicate, quality check failed etc.
149    /// This is because of our design choices e.g. if mods are called on paired reads,
150    /// then we'll have to include both records as one read in our statistics
151    /// and we do not have functionality in place to do this.
152    /// So, we return errors if such flags or an invalid combination of flags (e.g.
153    /// secondary and supplementary bits are set) are encountered.
154    /// We also return error upon invalid read id but this is expected to be in violation
155    /// of the BAM format (UTF-8 error).
156    ///
157    /// ```
158    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader, ReadState};
159    /// use rust_htslib::bam::Read;
160    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
161    /// let mut count = 0;
162    /// for record in reader.records(){
163    ///     let r = record?;
164    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?;
165    ///     match count {
166    ///         0 => assert_eq!(curr_read.read_state(), ReadState::PrimaryFwd),
167    ///         1 => assert_eq!(curr_read.read_state(), ReadState::PrimaryFwd),
168    ///         2 => assert_eq!(curr_read.read_state(), ReadState::PrimaryRev),
169    ///         3 => assert_eq!(curr_read.read_state(), ReadState::Unmapped),
170    ///         _ => unreachable!(),
171    ///     }
172    ///     count = count + 1;
173    /// }
174    /// # Ok::<(), Error>(())
175    /// ```
176    pub fn set_read_state_and_id(self, record: &Record) -> Result<CurrRead<OnlyAlignData>, Error> {
177        // extract read id
178        let read_id = match str::from_utf8(record.qname()) {
179            Ok(v) => v.to_string(),
180            Err(e) => {
181                return Err(Error::InvalidReadID(format!(
182                    "error in setting read id, which possibly violates BAM requirements: {e}"
183                )));
184            }
185        };
186        // check for unsupported flags
187        if record.is_paired()
188            || record.is_proper_pair()
189            || record.is_first_in_template()
190            || record.is_last_in_template()
191            || record.is_mate_reverse()
192            || record.is_mate_unmapped()
193            || record.is_duplicate()
194            || record.is_quality_check_failed()
195        {
196            return Err(Error::NotImplemented(format!(
197                "paired-read/mate-read/duplicate/qual-check-failed flags not supported! read_id: {read_id}"
198            )));
199        }
200        // set read state
201        let state = match (
202            record.is_reverse(),
203            record.is_unmapped(),
204            record.is_secondary(),
205            record.is_supplementary(),
206        ) {
207            (false, true, false, false) => ReadState::Unmapped,
208            (true, false, false, false) => ReadState::PrimaryRev,
209            (true, false, true, false) => ReadState::SecondaryRev,
210            (false, false, true, false) => ReadState::SecondaryFwd,
211            (true, false, false, true) => ReadState::SupplementaryRev,
212            (false, false, false, true) => ReadState::SupplementaryFwd,
213            (false, false, false, false) => ReadState::PrimaryFwd,
214            _ => {
215                return Err(Error::UnknownAlignState(format!(
216                    "invalid flag combination! read_id: {read_id}",
217                )));
218            }
219        };
220        Ok(CurrRead::<OnlyAlignData> {
221            state,
222            read_id,
223            seq_len: None,
224            align_len: None,
225            mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
226            contig_id_and_start: None,
227            contig_name: None,
228            mod_base_qual_thres: 0,
229            marker: std::marker::PhantomData::<OnlyAlignData>,
230        })
231    }
232
233    /// Runs [`CurrRead<NoData>::try_from_only_alignment_seq_len_optional`], forcing sequence
234    /// length retrieval. See comments there and the comments below.
235    ///
236    /// Some BAM records have zero-length sequence fields i.e. marked by a '*'.
237    /// This may be intentional e.g. a secondary alignment has the same sequence
238    /// as a corresponding primary alignment and repeating a sequence is not space-efficient.
239    /// Or it may be intentional due to other reasons.
240    /// For modification processing, we cannot deal with these records as we have to match
241    /// sequences across different records.
242    /// So, our easy-to-use-function here forces sequence length retrieval and will fail
243    /// if zero length sequences are found.
244    ///
245    /// Another function below [`CurrRead<NoData>::try_from_only_alignment_zero_seq_len`], allows
246    /// zero length sequences through and can be used if zero length sequences are really
247    /// needed. In such a scenario, the user has to carefully watch for errors.
248    /// So we discourage its use unless really necessary.
249    ///
250    /// # Errors
251    pub fn try_from_only_alignment(
252        self,
253        record: &Record,
254    ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
255        self.try_from_only_alignment_seq_len_optional(record, true)
256    }
257
258    /// Runs [`CurrRead<NoData>::try_from_only_alignment_seq_len_optional`], avoiding sequence
259    /// length retrieval and setting it to zero. See comments there and the comments below.
260    ///
261    /// See notes on [`CurrRead<NoData>::try_from_only_alignment`].
262    /// Use of this function is discouraged unless really necessary as we cannot parse
263    /// modification information from zero-length sequences without errors.
264    ///
265    /// # Errors
266    pub fn try_from_only_alignment_zero_seq_len(
267        self,
268        record: &Record,
269    ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
270        self.try_from_only_alignment_seq_len_optional(record, false)
271    }
272
273    /// Uses only alignment information and no modification information to
274    /// create the struct. Use this if you want to perform operations that
275    /// do not involve reading or manipulating the modification data.
276    /// If `is_seq_len_non_zero` is set to false, then sequence length is
277    /// not retrieved and is set to zero.
278    ///
279    /// # Errors
280    /// Upon failure in retrieving record information.
281    pub fn try_from_only_alignment_seq_len_optional(
282        self,
283        record: &Record,
284        is_seq_len_non_zero: bool,
285    ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
286        let mut curr_read_state = CurrRead::default().set_read_state_and_id(record)?;
287        if !curr_read_state.read_state().is_unmapped() {
288            curr_read_state = curr_read_state
289                .set_align_len(record)?
290                .set_contig_id_and_start(record)?
291                .set_contig_name(record)?;
292        }
293        if is_seq_len_non_zero {
294            curr_read_state = curr_read_state.set_seq_len(record)?;
295        } else {
296            curr_read_state.seq_len = Some(0u64);
297        }
298        let CurrRead::<OnlyAlignData> {
299            state,
300            read_id,
301            seq_len,
302            align_len,
303            contig_id_and_start,
304            contig_name,
305            ..
306        } = curr_read_state;
307        Ok(CurrRead::<OnlyAlignDataComplete> {
308            state,
309            read_id,
310            seq_len,
311            align_len,
312            mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
313            contig_id_and_start,
314            contig_name,
315            mod_base_qual_thres: 0,
316            marker: std::marker::PhantomData::<OnlyAlignDataComplete>,
317        })
318    }
319}
320
321impl<S: CurrReadStateWithAlign + CurrReadState> CurrRead<S> {
322    /// gets the read state
323    #[must_use]
324    pub fn read_state(&self) -> ReadState {
325        self.state
326    }
327    /// set length of sequence from BAM record
328    ///
329    /// # Errors
330    /// Errors are returned if sequence length is already set or
331    /// sequence length is not non-zero.
332    ///
333    /// ```
334    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
335    /// use rust_htslib::bam::Read;
336    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
337    /// let mut count = 0;
338    /// for record in reader.records(){
339    ///     let r = record?;
340    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?.set_seq_len(&r)?;
341    ///     let Ok(len) = curr_read.seq_len() else { unreachable!() };
342    ///     match count {
343    ///         0 => assert_eq!(len, 8),
344    ///         1 => assert_eq!(len, 48),
345    ///         2 => assert_eq!(len, 33),
346    ///         3 => assert_eq!(len, 48),
347    ///         _ => unreachable!(),
348    ///     }
349    ///     count = count + 1;
350    /// }
351    /// # Ok::<(), Error>(())
352    /// ```
353    ///
354    /// If we call the method twice, we should hit a panic
355    /// ```should_panic
356    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
357    /// # use rust_htslib::bam::Read;
358    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
359    /// for record in reader.records(){
360    ///     let r = record?;
361    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?
362    ///         .set_seq_len(&r)?.set_seq_len(&r)?;
363    ///     break;
364    /// }
365    /// # Ok::<(), Error>(())
366    /// ```
367    pub fn set_seq_len(mut self, record: &Record) -> Result<Self, Error> {
368        self.seq_len = match self.seq_len {
369            Some(_) => {
370                return Err(Error::InvalidDuplicates(format!(
371                    "cannot set sequence length again! read_id: {}",
372                    self.read_id()
373                )));
374            }
375            None => match record.seq_len() {
376                0 => {
377                    return Err(Error::ZeroSeqLen(format!(
378                        "avoid including 0-len sequences while parsing mod data in this program, read_id: {}",
379                        self.read_id()
380                    )));
381                }
382                l => Some(u64::try_from(l)?),
383            },
384        };
385        Ok(self)
386    }
387    /// gets length of sequence
388    ///
389    /// # Errors
390    /// Error if sequence length is not set
391    pub fn seq_len(&self) -> Result<u64, Error> {
392        self.seq_len.ok_or(Error::UnavailableData(format!(
393            "seq len not available, read_id: {}",
394            self.read_id()
395        )))
396    }
397    /// set alignment length from BAM record if available
398    ///
399    /// # Errors
400    /// Returns errors if alignment len is already set, instance is
401    /// unmapped, or if alignment coordinates are malformed
402    /// (e.g. end < start).
403    pub fn set_align_len(mut self, record: &Record) -> Result<Self, Error> {
404        self.align_len = match self.align_len {
405            Some(_) => Err(Error::InvalidDuplicates(format!(
406                "cannot set alignment length again! read_id: {}",
407                self.read_id()
408            ))),
409            None => {
410                if self.read_state().is_unmapped() {
411                    Err(Error::Unmapped(format!(
412                        "cannot set alignment properties for unmapped reads, read_id: {}",
413                        self.read_id()
414                    )))
415                } else {
416                    // NOTE: right now, I don't know of a way to test the error below
417                    // as rust htslib initializes an empty record with an alignment
418                    // length of 1 (see the code below). This is only a note about
419                    // the error variant, not the normal function of this code block
420                    // which is fine.
421                    // ```
422                    // use rust_htslib::bam::ext::BamRecordExtensions;
423                    // let r = Record::new();
424                    // assert_eq!(r.seq_len(), 0);
425                    // assert_eq!(r.pos(), 0);
426                    // assert_eq!(r.reference_end(), 1);
427                    // ```
428                    let st = record.pos();
429                    let en = record.reference_end();
430                    if en > st && st >= 0 {
431                        #[expect(
432                            clippy::arithmetic_side_effects,
433                            reason = "en > st && st >= 0 guarantee no i64 overflows"
434                        )]
435                        #[expect(
436                            clippy::missing_panics_doc,
437                            reason = "en > st && st >= 0 guarantee no panic"
438                        )]
439                        Ok(Some(
440                            (en - st)
441                                .try_into()
442                                .expect("en>st && st>=0 guarantee no problems i64->u64"),
443                        ))
444                    } else {
445                        Err(Error::InvalidAlignLength(format!(
446                            "in `set_align_len`, start: {st}, end: {en} invalid! i.e. en <= st or st < 0, read_id: {}",
447                            self.read_id()
448                        )))
449                    }
450                }
451            }
452        }?;
453        Ok(self)
454    }
455    /// gets alignment length
456    ///
457    /// # Errors
458    /// if instance is unmapped or alignment length is not set
459    pub fn align_len(&self) -> Result<u64, Error> {
460        if self.read_state().is_unmapped() {
461            Err(Error::Unmapped(format!(
462                "request alignment data on unmapped, read_id: {}",
463                self.read_id()
464            )))
465        } else {
466            self.align_len.ok_or(Error::UnavailableData(format!(
467                "alignment data not available, read_id: {}",
468                self.read_id()
469            )))
470        }
471    }
472    /// sets contig ID and start from BAM record if available
473    ///
474    /// # Errors
475    /// if instance is unmapped, if these data are already set and
476    /// the user is trying to set them again, or if coordinates
477    /// are malformed (start position is negative)
478    ///
479    /// ```
480    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
481    /// use rust_htslib::bam::Read;
482    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
483    /// let mut count = 0;
484    /// for record in reader.records(){
485    ///     let r = record?;
486    ///     let curr_read =
487    ///         CurrRead::default().set_read_state_and_id(&r)?.set_contig_id_and_start(&r)?;
488    ///     match (count, curr_read.contig_id_and_start()) {
489    ///         (0, Ok((0, 9))) |
490    ///         (1, Ok((2, 23))) |
491    ///         (2, Ok((1, 3))) => {},
492    ///         _ => unreachable!(),
493    ///     }
494    ///     count = count + 1;
495    ///     if count == 3 { break; } // the fourth entry is unmapped, and will lead to an error.
496    /// }
497    /// # Ok::<(), Error>(())
498    /// ```
499    ///
500    /// If we call the method on an unmapped read, we should see an error.
501    /// ```should_panic
502    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
503    /// # use rust_htslib::bam::Read;
504    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
505    /// let mut count = 0;
506    /// for record in reader.records(){
507    ///     let r = record?;
508    ///     if count < 3 {
509    ///         count = count + 1;
510    ///         continue;
511    ///     }
512    ///     // the fourth read is unmapped
513    ///     let curr_read =
514    ///         CurrRead::default().set_read_state_and_id(&r)?.set_contig_id_and_start(&r)?;
515    /// }
516    /// # Ok::<(), Error>(())
517    /// ```
518    ///
519    /// If we call the method twice, we should hit a panic
520    /// ```should_panic
521    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
522    /// # use rust_htslib::bam::Read;
523    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
524    /// for record in reader.records(){
525    ///     let r = record?;
526    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?
527    ///         .set_contig_id_and_start(&r)?.set_contig_id_and_start(&r)?;
528    ///     break;
529    /// }
530    /// # Ok::<(), Error>(())
531    /// ```
532    pub fn set_contig_id_and_start(mut self, record: &Record) -> Result<Self, Error> {
533        self.contig_id_and_start = match self.contig_id_and_start {
534            Some(_) => Err(Error::InvalidDuplicates(format!(
535                "cannot set contig and start again! read_id: {}",
536                self.read_id()
537            ))),
538            None => {
539                if self.read_state().is_unmapped() {
540                    Err(Error::Unmapped(format!(
541                        "setting alignment data on unmapped read, read_id: {}",
542                        self.read_id()
543                    )))
544                } else {
545                    Ok(Some((record.tid(), record.pos().try_into()?)))
546                }
547            }
548        }?;
549        Ok(self)
550    }
551    /// gets contig ID and start
552    ///
553    /// # Errors
554    /// If instance is unmapped or if data (contig id and start) are not set
555    pub fn contig_id_and_start(&self) -> Result<(i32, u64), Error> {
556        if self.read_state().is_unmapped() {
557            Err(Error::Unmapped(format!(
558                "requesting alignment data on unmapped read, read_id: {}",
559                self.read_id()
560            )))
561        } else {
562            self.contig_id_and_start
563                .ok_or(Error::UnavailableData(format!(
564                    "contig id, start not available, read_id: {}",
565                    self.read_id()
566                )))
567        }
568    }
569    /// sets contig name
570    ///
571    /// # Errors
572    /// Returns error if instance is unmapped or contig name has already been set
573    ///
574    /// ```
575    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
576    /// use rust_htslib::bam::Read;
577    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
578    /// let mut count = 0;
579    /// for record in reader.records(){
580    ///     let r = record?;
581    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?.set_contig_name(&r)?;
582    ///     let Ok(contig_name) = curr_read.contig_name() else {unreachable!()};
583    ///     match (count, contig_name) {
584    ///         (0, "dummyI") |
585    ///         (1, "dummyIII") |
586    ///         (2, "dummyII") => {},
587    ///         _ => unreachable!(),
588    ///     }
589    ///     count = count + 1;
590    ///     if count == 3 { break; } // the fourth entry is unmapped, and will lead to an error.
591    /// }
592    /// # Ok::<(), Error>(())
593    /// ```
594    ///
595    /// If we try to set contig name on an unmapped read, we will get an error
596    ///
597    /// ```should_panic
598    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
599    /// # use rust_htslib::bam::Read;
600    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
601    /// let mut count = 0;
602    /// for record in reader.records(){
603    ///     if count < 3 {
604    ///         count = count + 1;
605    ///         continue;
606    ///     }
607    ///     let r = record?;
608    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?;
609    ///     curr_read.set_contig_name(&r)?;
610    /// }
611    /// # Ok::<(), Error>(())
612    /// ```
613    ///
614    /// If we call the method twice, we should hit a panic
615    /// ```should_panic
616    /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
617    /// # use rust_htslib::bam::Read;
618    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
619    /// for record in reader.records(){
620    ///     let r = record?;
621    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?
622    ///         .set_contig_name(&r)?.set_contig_name(&r)?;
623    ///     break;
624    /// }
625    /// # Ok::<(), Error>(())
626    /// ```
627    pub fn set_contig_name(mut self, record: &Record) -> Result<Self, Error> {
628        self.contig_name = match (self.read_state().is_unmapped(), self.contig_name.is_none()) {
629            (true, _) => Err(Error::Unmapped(format!(
630                "set align data on unmapped read! read_id: {}",
631                self.read_id()
632            ))),
633            (_, false) => Err(Error::InvalidDuplicates(format!(
634                "cannot set contig name again! read_id: {}",
635                self.read_id()
636            ))),
637            (_, true) => Ok(Some(String::from(record.contig()))),
638        }?;
639        Ok(self)
640    }
641    /// gets contig name
642    ///
643    /// # Errors
644    /// If instance is unmapped or contig name has not been set
645    pub fn contig_name(&self) -> Result<&str, Error> {
646        match (self.read_state().is_unmapped(), self.contig_name.as_ref()) {
647            (true, _) => Err(Error::Unmapped(format!(
648                "get align data on unmapped read, read_id: {}",
649                self.read_id()
650            ))),
651            (_, None) => Err(Error::UnavailableData(format!(
652                "contig name unavailable, read_id: {}",
653                self.read_id()
654            ))),
655            (_, Some(v)) => Ok(v.as_str()),
656        }
657    }
658    /// gets read id
659    #[must_use]
660    pub fn read_id(&self) -> &str {
661        self.read_id.as_str()
662    }
663    /// Returns the character corresponding to the strand
664    ///
665    /// ```
666    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
667    /// use rust_htslib::bam::Read;
668    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
669    /// let mut count = 0;
670    /// for record in reader.records(){
671    ///     let r = record?;
672    ///     let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?;
673    ///     let strand = curr_read.strand() else { unreachable!() };
674    ///     match (count, strand) {
675    ///         (0, '+') | (1, '+') | (2, '-') | (3, '.') => {},
676    ///         _ => unreachable!(),
677    ///     }
678    ///     count = count + 1;
679    /// }
680    /// # Ok::<(), Error>(())
681    /// ```
682    #[must_use]
683    pub fn strand(&self) -> char {
684        self.read_state().strand()
685    }
686    /// Returns read sequence overlapping with a genomic region
687    ///
688    /// # Errors
689    /// If getting sequence coordinates from reference coordinates fails, see
690    /// [`CurrRead::seq_and_qual_on_ref_coords`]
691    ///
692    /// # Example
693    ///
694    /// ```
695    /// use bedrs::Bed3;
696    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
697    /// use rust_htslib::bam::Read;
698    ///
699    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
700    /// let mut count = 0;
701    /// for record in reader.records() {
702    ///     let r = record?;
703    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
704    ///
705    ///     // Skip unmapped reads
706    ///     if !curr_read.read_state().is_unmapped() {
707    ///         let (contig_id, start) = curr_read.contig_id_and_start()?;
708    ///         let align_len = curr_read.align_len()?;
709    ///
710    ///         // Create a region that overlaps with the read but is short of one bp.
711    ///         // Note that this BAM file has reads with all bases matching perfectly
712    ///         // with the reference.
713    ///         let region = Bed3::new(contig_id, start, start + align_len - 1);
714    ///         let seq_subset = curr_read.seq_on_ref_coords(&r, &region)?;
715    ///
716    ///         // Check for sequence length match
717    ///         assert_eq!(curr_read.seq_len()? - 1, u64::try_from(seq_subset.len())?);
718    ///
719    ///         // Create a region with no overlap at all and check we get no data
720    ///         let region = Bed3::new(contig_id, start + align_len, start + align_len + 2);
721    ///         match curr_read.seq_on_ref_coords(&r, &region){
722    ///             Err(Error::UnavailableData(_)) => (),
723    ///             _ => unreachable!(),
724    ///         };
725    ///
726    ///     }
727    ///     count += 1;
728    /// }
729    /// # Ok::<(), Error>(())
730    /// ```
731    pub fn seq_on_ref_coords(
732        &self,
733        record: &Record,
734        region: &Bed3<i32, u64>,
735    ) -> Result<Vec<u8>, Error> {
736        Ok(self
737            .seq_and_qual_on_ref_coords(record, region)?
738            .into_iter()
739            .map(|x| x.map_or(b'.', |y| y.1))
740            .collect::<Vec<u8>>())
741    }
742    /// Returns match-or-mismatch, read sequence, base-quality values overlapping with genomic region.
743    ///
744    /// Returns a vector of Options:
745    /// * is a `None` at a deletion i.e. a base on the reference and not on the read.
746    /// * is a `Some(bool, u8, u8)` at a match/mismatch/insertion.
747    ///   The first `u8` is the base itself and the `bool` is `true` if a match/mismatch
748    ///   and `false` if an insertion, and the second `u8` is the base quality (0-93),
749    ///   which the BAM format sets to 255 if no quality is available for the entire read.
750    ///
751    /// Because sequences are encoded using 4-bit values into a `[u8]`, we need to use
752    /// `rust-htslib` functions to convert them into 8-bit values and then use
753    /// the `Index<usize>` trait on sequences from `rust-htslib`.
754    ///
755    /// # Errors
756    /// If the read does not intersect with the specified region, see
757    /// [`CurrRead::seq_coords_from_ref_coords`]
758    ///
759    /// # Example
760    ///
761    /// Example 1
762    ///
763    /// ```
764    /// use bedrs::Bed3;
765    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
766    /// use rust_htslib::bam::Read;
767    ///
768    /// let mut reader = nanalogue_bam_reader(&"examples/example_5_valid_basequal.sam")?;
769    /// for record in reader.records() {
770    ///     let r = record?;
771    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
772    ///
773    ///     let region = Bed3::new(0, 0, 20);
774    ///     let seq_subset = curr_read.seq_and_qual_on_ref_coords(&r, &region)?;
775    ///     assert_eq!(seq_subset, [Some((true, b'T', 32)), Some((true, b'C', 0)),
776    ///         Some((true, b'G', 69)), Some((true, b'T', 80)), Some((true, b'T', 79)),
777    ///         Some((true, b'T', 81)), Some((true, b'C', 29)), Some((true, b'T', 30))]);
778    ///
779    ///     // Create a region with no overlap at all and check we get no data
780    ///     let region = Bed3::new(0, 20, 22);
781    ///     match curr_read.seq_and_qual_on_ref_coords(&r, &region){
782    ///         Err(Error::UnavailableData(_)) => (),
783    ///         _ => unreachable!(),
784    ///     };
785    ///
786    /// }
787    /// # Ok::<(), Error>(())
788    /// ```
789    ///
790    /// Example 2
791    ///
792    /// ```
793    /// use bedrs::Bed3;
794    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
795    /// use rust_htslib::bam::Read;
796    ///
797    /// let mut reader = nanalogue_bam_reader(&"examples/example_7.sam")?;
798    /// for record in reader.records() {
799    ///     let r = record?;
800    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
801    ///
802    ///     let region = Bed3::new(0, 0, 20);
803    ///     let seq_subset = curr_read.seq_and_qual_on_ref_coords(&r, &region)?;
804    ///     assert_eq!(seq_subset, [Some((true, b'T', 32)), None, None,
805    ///         Some((false, b'A', 0)), Some((true, b'T', 0)), Some((true, b'T', 79)),
806    ///         Some((true, b'T', 81)), Some((true, b'G', 29)), Some((true, b'T', 30))]);
807    ///
808    /// }
809    /// # Ok::<(), Error>(())
810    /// ```
811    #[expect(
812        clippy::indexing_slicing,
813        reason = "qual, seq same len and coords will not exceed these"
814    )]
815    #[expect(
816        clippy::type_complexity,
817        reason = "not complex enough, I think types will make this less clear"
818    )]
819    pub fn seq_and_qual_on_ref_coords(
820        &self,
821        record: &Record,
822        region: &Bed3<i32, u64>,
823    ) -> Result<Vec<Option<(bool, u8, u8)>>, Error> {
824        let seq = record.seq();
825        let qual = record.qual();
826        Ok(self
827            .seq_coords_from_ref_coords(record, region)?
828            .into_iter()
829            .map(|x| x.map(|y| (y.0, seq[y.1], qual[y.1])))
830            .collect::<Vec<Option<(bool, u8, u8)>>>())
831    }
832    /// Extract sequence coordinates corresponding to a region on the reference genome.
833    ///
834    /// The vector we return contains `Some((bool, usize))` entries where both the reference and the read
835    /// have bases, and `None` where bases from the reference are missing on the read:
836    /// * matches or mismatches, we record the coordinate.
837    ///   so SNPs for example (i.e. a 1 bp difference from the ref) will show up as `Some((true, _))`.
838    /// * a deletion or a ref skip ("N" in cigar) will show up as `None`.
839    /// * insertions are preserved i.e. bases in the middle of a read present
840    ///   on the read but not on the reference are `Some((false, _))`
841    /// * clipped bases at the end of the read are not preserved. These are bases
842    ///   on the read but not on the reference and are denoted as soft or hard
843    ///   clips on the CIGAR string e.g. barcodes from sequencing
844    /// * some CIGAR combinations are illogical and we are assuming they do not happen
845    ///   e.g. a read's CIGAR can end with, say 10D20S, this means last 10 bp
846    ///   are in a deletion and the next 20 are a softclip. This is illogical,
847    ///   as they must be combined into a 30-bp softclip i.e. 30S. So if the
848    ///   aligner produces such illogical states, then the sequence coordinates reported
849    ///   here may be erroneous.
850    ///
851    /// If the read does not intersect with the region, we return an `Error` (see below).
852    /// If the read does intersect with the region but we cannot retrieve any bases,
853    /// we return an empty vector (I am not sure if we will run into this scenario).
854    ///
855    /// # Errors
856    /// If the read does not intersect with the specified region, or if `usize` conversions fail.
857    ///
858    /// # Example
859    ///
860    /// ```
861    /// use bedrs::Bed3;
862    /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
863    /// use rust_htslib::bam::Read;
864    ///
865    /// let mut reader = nanalogue_bam_reader(&"examples/example_7.sam")?;
866    /// for record in reader.records() {
867    ///     let r = record?;
868    ///     let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
869    ///
870    ///     let region = Bed3::new(0, 9, 13);
871    ///     let seq_subset = curr_read.seq_coords_from_ref_coords(&r, &region)?;
872    ///     // there are deletions on the read above
873    ///     assert_eq!(seq_subset, vec![Some((true, 0)), None, None, Some((false, 1)), Some((true, 2))]);
874    ///
875    ///     // Create a region with no overlap at all and check we get no data
876    ///     let region = Bed3::new(0, 20, 22);
877    ///     match curr_read.seq_coords_from_ref_coords(&r, &region){
878    ///         Err(Error::UnavailableData(_)) => (),
879    ///         _ => unreachable!(),
880    ///     };
881    ///
882    /// }
883    /// # Ok::<(), Error>(())
884    pub fn seq_coords_from_ref_coords(
885        &self,
886        record: &Record,
887        region: &Bed3<i32, u64>,
888    ) -> Result<Vec<Option<(bool, usize)>>, Error> {
889        #[expect(
890            clippy::missing_panics_doc,
891            reason = "genomic coordinates are far less than (2^64-1)/2 i.e. u64->i64 should be ok"
892        )]
893        let interval = {
894            region
895                .intersect(&StrandedBed3::<i32, u64>::try_from(self)?)
896                .and_then(|v| {
897                    let start = i64::try_from(v.start()).expect("genomic coordinates << 2^63");
898                    let end = i64::try_from(v.end()).expect("genomic coordinates << 2^63");
899                    (start < end).then_some(start..end)
900                })
901                .ok_or(Error::UnavailableData(
902                    "coord-retrieval: region does not intersect with read".to_owned(),
903                ))
904        }?;
905
906        // Initialize coord calculation.
907        // We don't know how long the subset will be, we initialize with a guess
908        // of 2 * interval size
909        #[expect(
910            clippy::arithmetic_side_effects,
911            reason = "genomic coordinates far less than i64::MAX (approx (2^64-1)/2)"
912        )]
913        let mut s: Vec<Option<(bool, usize)>> =
914            Vec::with_capacity(usize::try_from(2 * (interval.end - interval.start))?);
915
916        // we may have to trim the sequence if we hit a bunch of unaligned base
917        // pairs right at the end e.g. a softclip.
918        let mut trim_end_bp: u64 = 0;
919
920        for w in record
921            .aligned_pairs_full()
922            .skip_while(|x| x[1].is_none_or(|y| !interval.contains(&y)))
923            .take_while(|x| x[1].is_none_or(|y| interval.contains(&y)))
924        {
925            #[expect(
926                clippy::arithmetic_side_effects,
927                reason = "coordinates far less than u64::MAX (2^64-1) so no chance of counter overflow"
928            )]
929            match w {
930                [Some(x), Some(_)] => {
931                    s.push(Some((true, usize::try_from(x)?)));
932                    trim_end_bp = 0;
933                }
934                [Some(x), None] => {
935                    s.push(Some((false, usize::try_from(x)?)));
936                    trim_end_bp += 1;
937                }
938                [None, Some(_)] => {
939                    s.push(None);
940                    trim_end_bp = 0;
941                }
942                [None, None] => unreachable!(
943                    "impossible to find bases that are neither on the read nor on the reference"
944                ),
945            }
946        }
947
948        // if last few bp in sequence are all unmapped, we remove them here.
949        for _ in 0..trim_end_bp {
950            let Some(_) = s.pop() else {
951                unreachable!("trim_end_bp cannot exceed length of s")
952            };
953        }
954
955        // Trim excess allocated capacity and return
956        s.shrink_to(0);
957        Ok(s)
958    }
959    /// sets modification data using the BAM record
960    ///
961    /// # Errors
962    /// If tags in the BAM record containing the modification information (MM, ML)
963    /// contain mistakes.
964    pub fn set_mod_data(
965        self,
966        record: &Record,
967        mod_thres: ThresholdState,
968        min_qual: u8,
969    ) -> Result<CurrRead<AlignAndModData>, Error> {
970        let result = nanalogue_mm_ml_parser(
971            record,
972            |x| mod_thres.contains(x),
973            |_| true,
974            |_, _, _| true,
975            min_qual,
976        )?;
977        Ok(CurrRead::<AlignAndModData> {
978            state: self.state,
979            read_id: self.read_id,
980            seq_len: self.seq_len,
981            align_len: self.align_len,
982            mods: (result, mod_thres),
983            contig_id_and_start: self.contig_id_and_start,
984            contig_name: self.contig_name,
985            mod_base_qual_thres: min_qual,
986            marker: std::marker::PhantomData::<AlignAndModData>,
987        })
988    }
989    /// sets modification data using BAM record but restricted to the specified filters
990    ///
991    /// # Errors
992    /// If tags in the BAM record containing the modification information (MM, ML)
993    /// contain mistakes.
994    pub fn set_mod_data_restricted<G, H>(
995        self,
996        record: &Record,
997        mod_thres: ThresholdState,
998        mod_fwd_pos_filter: G,
999        mod_filter_base_strand_tag: H,
1000        min_qual: u8,
1001    ) -> Result<CurrRead<AlignAndModData>, Error>
1002    where
1003        G: Fn(&usize) -> bool,
1004        H: Fn(&u8, &char, &ModChar) -> bool,
1005    {
1006        let result = nanalogue_mm_ml_parser(
1007            record,
1008            |x| mod_thres.contains(x),
1009            mod_fwd_pos_filter,
1010            mod_filter_base_strand_tag,
1011            min_qual,
1012        )?;
1013        Ok(CurrRead::<AlignAndModData> {
1014            state: self.state,
1015            read_id: self.read_id,
1016            seq_len: self.seq_len,
1017            align_len: self.align_len,
1018            mods: (result, mod_thres),
1019            contig_id_and_start: self.contig_id_and_start,
1020            contig_name: self.contig_name,
1021            mod_base_qual_thres: min_qual,
1022            marker: std::marker::PhantomData::<AlignAndModData>,
1023        })
1024    }
1025}
1026
1027impl CurrRead<OnlyAlignDataComplete> {
1028    /// sets modification data using BAM record but with restrictions
1029    /// applied by the [`crate::InputMods`] options for example.
1030    ///
1031    /// # Errors
1032    /// If a region filter is specified and we fail to convert current instance to Bed,
1033    /// and if parsing the MM/ML BAM tags fails (presumably because they are malformed).
1034    #[expect(
1035        clippy::missing_panics_doc,
1036        reason = "integer conversions (u64->usize, u64->i64) are expected to not fail as \
1037genomic coordinates are far smaller than ~2^63"
1038    )]
1039    pub fn set_mod_data_restricted_options<S: InputModOptions + InputRegionOptions>(
1040        self,
1041        record: &Record,
1042        mod_options: &S,
1043    ) -> Result<CurrRead<AlignAndModData>, Error> {
1044        let l = usize::try_from(self.seq_len().expect("no error"))
1045            .expect("bit conversion errors unlikely");
1046        let w = mod_options.trim_read_ends_mod();
1047        let interval = if let Some(bed3) = mod_options.region_filter().as_ref() {
1048            let stranded_bed3 = StrandedBed3::<i32, u64>::try_from(&self)?;
1049            if let Some(v) = bed3.intersect(&stranded_bed3) {
1050                if v.start() == stranded_bed3.start() && v.end() == stranded_bed3.end() {
1051                    None
1052                } else {
1053                    Some(v.start()..v.end())
1054                }
1055            } else {
1056                Some(0..0)
1057            }
1058        } else {
1059            None
1060        };
1061        Ok({
1062            let mut read = self.set_mod_data_restricted(
1063                record,
1064                mod_options.mod_prob_filter(),
1065                |x| w == 0 || (w..l.checked_sub(w).unwrap_or_default()).contains(x),
1066                |_, &s, &t| {
1067                    mod_options.tag().is_none_or(|x| x == t)
1068                        && mod_options.mod_strand().is_none_or(|v| s == char::from(v))
1069                },
1070                mod_options.base_qual_filter_mod(),
1071            )?;
1072            if let Some(v) = interval {
1073                match v.start.cmp(&v.end) {
1074                    Ordering::Less => read.filter_by_ref_pos(
1075                        i64::try_from(v.start)
1076                            .expect("no error as genomic coordinates far less than ~2^63"),
1077                        i64::try_from(v.end)
1078                            .expect("no error as genomic coordinates far less than ~2^63"),
1079                    )?,
1080                    Ordering::Equal => read.filter_by_ref_pos(i64::MAX - 1, i64::MAX)?,
1081                    Ordering::Greater => {
1082                        unreachable!("`bedrs` should not allow malformed intervals!")
1083                    }
1084                }
1085            }
1086            read
1087        })
1088    }
1089}
1090
1091impl CurrRead<AlignAndModData> {
1092    /// gets modification data
1093    #[must_use]
1094    pub fn mod_data(&self) -> &(BaseMods, ThresholdState) {
1095        &self.mods
1096    }
1097    /// window modification data with restrictions.
1098    /// If a read has the same modification on both the basecalled
1099    /// strand and its complement, then windows along both are returned.
1100    ///
1101    /// If `win_size` exceeds the modification data length, no windows are produced.
1102    ///
1103    /// # Errors
1104    /// Returns an error if the window function returns an error.
1105    #[expect(
1106        clippy::pattern_type_mismatch,
1107        reason = "suggested notation is verbose but I am not sure"
1108    )]
1109    pub fn windowed_mod_data_restricted<F>(
1110        &self,
1111        window_function: &F,
1112        win_options: InputWindowing,
1113        tag: ModChar,
1114    ) -> Result<Vec<F32Bw0and1>, Error>
1115    where
1116        F: Fn(&[u8]) -> Result<F32Bw0and1, Error>,
1117    {
1118        let win_size = win_options.win.get();
1119        let slide_size = win_options.step.get();
1120        let mut result = Vec::<F32Bw0and1>::new();
1121        let tag_char = tag.val();
1122        let (BaseMods { base_mods: v }, _) = &self.mods;
1123
1124        // we make a few assumptions below:
1125        // * data is sorted by coordinate along sequence
1126        // * illegal types like strand not '+' or '-', or multiple entries
1127        //   corresponding to the same modification strand combination
1128        //   e.g. C+m occuring twice.
1129        // we control data flow into CurrRead, checking these do not happen
1130        // during ingress. To be future-proof etc., we should check these things
1131        // here but we do not as there is no way right now to test error checking
1132        // as there is no way to make CurrRead fall into these illegal states.
1133        #[expect(
1134            clippy::missing_panics_doc,
1135            reason = "checked_sub ensures win_size <= mod_data.len() before windowing"
1136        )]
1137        for k in v {
1138            match k {
1139                BaseMod {
1140                    modification_type: x,
1141                    ranges: track,
1142                    ..
1143                } if *x == tag_char => {
1144                    let mod_data = track.qual();
1145                    if let Some(l) = mod_data.len().checked_sub(win_size) {
1146                        result.extend(
1147                            (0..=l)
1148                                .step_by(slide_size)
1149                                .map(|i| {
1150                                    window_function(
1151                                        mod_data
1152                                            .get(i..)
1153                                            .expect("i <= len - win_size")
1154                                            .get(0..win_size)
1155                                            .expect("checked len>=win_size so no error"),
1156                                    )
1157                                })
1158                                .collect::<Result<Vec<F32Bw0and1>, _>>()?,
1159                        );
1160                    }
1161                }
1162                _ => {}
1163            }
1164        }
1165        Ok(result)
1166    }
1167    /// Performs a count of number of bases per modified type.
1168    /// Note that this result depends on the type of filtering done
1169    /// while the struct was created e.g. by modification threshold.
1170    ///
1171    /// # Panics
1172    /// Panics if the number of modifications exceeds `u32::MAX` (approximately 4.2 billion).
1173    ///
1174    /// ```
1175    /// use nanalogue_core::{CurrRead, Error, ModChar, nanalogue_bam_reader, ThresholdState};
1176    /// use rust_htslib::bam::Read;
1177    /// use std::collections::HashMap;
1178    /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
1179    /// let mut count = 0;
1180    /// for record in reader.records(){
1181    ///     let r = record?;
1182    ///     let curr_read = CurrRead::default().set_read_state_and_id(&r)?
1183    ///         .set_mod_data(&r, ThresholdState::GtEq(180), 0)?;
1184    ///     let mod_count = curr_read.base_count_per_mod();
1185    ///     let zero_count = HashMap::from([(ModChar::new('T'), 0)]);
1186    ///     let a = HashMap::from([(ModChar::new('T'), 3)]);
1187    ///     let b = HashMap::from([(ModChar::new('T'), 1)]);
1188    ///     let c = HashMap::from([(ModChar::new('T'), 3),(ModChar::new('ᰠ'), 0)]);
1189    ///     match (count, mod_count) {
1190    ///         (0, v) => assert_eq!(v, zero_count),
1191    ///         (1, v) => assert_eq!(v, a),
1192    ///         (2, v) => assert_eq!(v, b),
1193    ///         (3, v) => assert_eq!(v, c),
1194    ///         _ => unreachable!(),
1195    ///     }
1196    ///     count = count + 1;
1197    /// }
1198    /// # Ok::<(), Error>(())
1199    /// ```
1200    #[must_use]
1201    pub fn base_count_per_mod(&self) -> HashMap<ModChar, u32> {
1202        let mut output = HashMap::<ModChar, u32>::new();
1203        #[expect(
1204            clippy::arithmetic_side_effects,
1205            reason = "u32::MAX approx 4.2 Gb, v unlikely 1 molecule is this modified"
1206        )]
1207        for k in &self.mod_data().0.base_mods {
1208            let base_count =
1209                u32::try_from(k.ranges.annotations.len()).expect("number conversion error");
1210            let _: &mut u32 = output
1211                .entry(ModChar::new(k.modification_type))
1212                .and_modify(|e| *e += base_count)
1213                .or_insert(base_count);
1214        }
1215        output
1216    }
1217}
1218
1219/// To format and display modification data in a condensed manner.
1220trait DisplayCondensedModData {
1221    /// Outputs mod data section in a condensed manner.
1222    fn mod_data_section(&self) -> Result<String, fmt::Error>;
1223}
1224
1225/// No mod data means no display is produced
1226impl<S> DisplayCondensedModData for CurrRead<S>
1227where
1228    S: CurrReadStateOnlyAlign + CurrReadState,
1229{
1230    fn mod_data_section(&self) -> Result<String, fmt::Error> {
1231        Ok(String::new())
1232    }
1233}
1234
1235/// Implements display when mod data is available
1236impl DisplayCondensedModData for CurrRead<AlignAndModData> {
1237    #[expect(
1238        clippy::pattern_type_mismatch,
1239        reason = "suggested notation is verbose but I am not sure"
1240    )]
1241    fn mod_data_section(&self) -> Result<String, fmt::Error> {
1242        let mut mod_count_str = String::new();
1243        let (BaseMods { base_mods: v }, w) = &self.mods;
1244        for k in v {
1245            write!(
1246                mod_count_str,
1247                "{}{}{}:{};",
1248                k.modified_base as char,
1249                k.strand,
1250                ModChar::new(k.modification_type),
1251                k.ranges.annotations.len()
1252            )?;
1253        }
1254        if mod_count_str.is_empty() {
1255            write!(mod_count_str, "NA")?;
1256        } else {
1257            write!(
1258                mod_count_str,
1259                "({}, PHRED base qual >= {})",
1260                w, self.mod_base_qual_thres
1261            )?;
1262        }
1263        Ok(format!(",\n\t\"mod_count\": \"{mod_count_str}\""))
1264    }
1265}
1266
1267impl<S> fmt::Display for CurrRead<S>
1268where
1269    S: CurrReadState,
1270    CurrRead<S>: DisplayCondensedModData,
1271{
1272    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1273        let mut output_string = String::from("{\n");
1274
1275        writeln!(output_string, "\t\"read_id\": \"{}\",", self.read_id)?;
1276
1277        if let Some(v) = self.seq_len {
1278            writeln!(output_string, "\t\"sequence_length\": {v},")?;
1279        }
1280
1281        if let Some((v, w)) = self.contig_id_and_start {
1282            let num_str = &v.to_string();
1283            writeln!(
1284                output_string,
1285                "\t\"contig\": \"{}\",",
1286                if let Some(x) = self.contig_name.as_ref() {
1287                    x
1288                } else {
1289                    num_str
1290                }
1291            )?;
1292            writeln!(output_string, "\t\"reference_start\": {w},")?;
1293            if let Some(x) = self.align_len {
1294                writeln!(
1295                    output_string,
1296                    "\t\"reference_end\": {},",
1297                    w.checked_add(x)
1298                        .expect("numeric overflow in calculating reference_end")
1299                )?;
1300                writeln!(output_string, "\t\"alignment_length\": {x},")?;
1301            }
1302        }
1303
1304        write!(output_string, "\t\"alignment_type\": \"{}\"", self.state)?;
1305        writeln!(output_string, "{}", &self.mod_data_section()?)?;
1306        output_string.push('}');
1307        output_string.fmt(f)
1308    }
1309}
1310
1311/// Converts `CurrRead` to `StrandedBed3`
1312///
1313/// ```
1314/// use bedrs::{Coordinates, Strand};
1315/// use bedrs::prelude::StrandedBed3;
1316/// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
1317/// use rust_htslib::bam::Read;
1318/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
1319/// let mut count = 0;
1320/// for record in reader.records(){
1321///     let r = record?;
1322///     let mut curr_read = CurrRead::default().try_from_only_alignment(&r)?;
1323///     let Ok(bed3_stranded) = StrandedBed3::try_from(&curr_read) else {unreachable!()};
1324///     let exp_bed3_stranded = match count {
1325///         0 => StrandedBed3::new(0, 9, 17, Strand::Forward),
1326///         1 => StrandedBed3::new(2, 23, 71, Strand::Forward),
1327///         2 => StrandedBed3::new(1, 3, 36, Strand::Reverse),
1328///         3 => StrandedBed3::empty(),
1329///         _ => unreachable!(),
1330///     };
1331///     assert_eq!(*bed3_stranded.chr(), *exp_bed3_stranded.chr());
1332///     assert_eq!(bed3_stranded.start(), exp_bed3_stranded.start());
1333///     assert_eq!(bed3_stranded.end(), exp_bed3_stranded.end());
1334///     assert_eq!(bed3_stranded.strand(), exp_bed3_stranded.strand());
1335///     count = count + 1;
1336/// }
1337/// # Ok::<(), Error>(())
1338/// ```
1339impl<S: CurrReadStateWithAlign + CurrReadState> TryFrom<&CurrRead<S>> for StrandedBed3<i32, u64> {
1340    type Error = Error;
1341
1342    #[expect(
1343        clippy::arithmetic_side_effects,
1344        reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
1345    )]
1346    fn try_from(value: &CurrRead<S>) -> Result<Self, Self::Error> {
1347        match (
1348            value.read_state().strand(),
1349            value.align_len().ok(),
1350            value.contig_id_and_start().ok(),
1351        ) {
1352            ('.', _, _) => Ok(StrandedBed3::empty()),
1353            (_, None, _) => Err(Error::InvalidAlignLength(format!(
1354                "align len not set while converting to bed3! read_id: {}",
1355                value.read_id()
1356            ))),
1357            (_, _, None) => Err(Error::InvalidContigAndStart(format!(
1358                "contig id and start not set while converting to bed3! read_id: {}",
1359                value.read_id()
1360            ))),
1361            ('+', Some(al), Some((cg, st))) => {
1362                Ok(StrandedBed3::new(cg, st, st + al, Strand::Forward))
1363            }
1364            ('-', Some(al), Some((cg, st))) => {
1365                Ok(StrandedBed3::new(cg, st, st + al, Strand::Reverse))
1366            }
1367            (_, _, _) => unreachable!("strand should be +/-/., so we should never reach this"),
1368        }
1369    }
1370}
1371
1372/// Convert a rust htslib record to our `CurrRead` struct.
1373/// NOTE: This operation loads many types of data from the
1374/// record and you may not be interested in all of them.
1375/// So, unless you know for sure that you are dealing with
1376/// a small number of reads, please do not use this function,
1377/// and call only a subset of the individual invocations below
1378/// in your program for the sake of speed and/or memory.
1379impl TryFrom<Record> for CurrRead<AlignAndModData> {
1380    type Error = Error;
1381
1382    fn try_from(record: Record) -> Result<Self, Self::Error> {
1383        let curr_read_state = CurrRead::default()
1384            .try_from_only_alignment(&record)?
1385            .set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
1386        Ok(curr_read_state)
1387    }
1388}
1389
1390/// Convert a rust htslib rc record into our struct.
1391/// I think the rc datatype is just like the normal record,
1392/// except the record datatype is not destroyed and created
1393/// every time a new record is read (or something like that).
1394/// All comments I've made for the `TryFrom<Record>` function
1395/// apply here as well.
1396impl TryFrom<Rc<Record>> for CurrRead<AlignAndModData> {
1397    type Error = Error;
1398
1399    fn try_from(record: Rc<Record>) -> Result<Self, Self::Error> {
1400        let curr_read_state = CurrRead::default()
1401            .try_from_only_alignment(&record)?
1402            .set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
1403        Ok(curr_read_state)
1404    }
1405}
1406
1407/// Implements filter by reference coordinates for our `CurrRead`.
1408/// This only filters modification data.
1409impl FilterByRefCoords for CurrRead<AlignAndModData> {
1410    /// filters modification data by reference position i.e. all pos such that
1411    /// start <= pos < end are retained. does not use contig in filtering.
1412    fn filter_by_ref_pos(&mut self, start: i64, end: i64) -> Result<(), Error> {
1413        for k in &mut self.mods.0.base_mods {
1414            k.ranges.filter_by_ref_pos(start, end)?;
1415        }
1416        Ok(())
1417    }
1418}
1419
1420/// Serialized representation of [`CurrRead`]; can also be used as a builder
1421/// to build [`CurrRead`]. This is useful if modification data is in formats
1422/// other than mod BAM, so we can transform it into [`CurrRead`] and use it
1423/// with functions from our library.
1424///
1425/// # Examples
1426///
1427/// First example, unmapped read with very little information
1428///
1429/// ```
1430/// use nanalogue_core::{Error, CurrReadBuilder};
1431/// let read = CurrReadBuilder::default().build()?;
1432/// # Ok::<(), Error>(())
1433/// ```
1434///
1435/// Add some simple information, still unmapped.
1436///
1437/// ```
1438/// use nanalogue_core::{Error, CurrReadBuilder};
1439/// let read = CurrReadBuilder::default().read_id("some_read".into()).seq_len(40).build()?;
1440/// # Ok::<(), Error>(())
1441/// ```
1442///
1443/// If we try to build a mapped read, we hit a panic as we haven't specified alignment information.
1444///
1445/// ```should_panic
1446/// use nanalogue_core::{Error, CurrReadBuilder,ReadState};
1447/// let read = CurrReadBuilder::default()
1448///     .read_id("some_read".into())
1449///     .seq_len(40)
1450///     .alignment_type(ReadState::PrimaryFwd).build()?;
1451/// # Ok::<(), Error>(())
1452/// ```
1453///
1454/// Mapped read building
1455///
1456/// ```
1457/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ReadState};
1458/// let read = CurrReadBuilder::default()
1459///     .read_id("some_read".into())
1460///     .seq_len(40)
1461///     .alignment_type(ReadState::PrimaryFwd)
1462///     .alignment(AlignmentInfoBuilder::default()
1463///         .start(10)
1464///         .end(60)
1465///         .contig("chr1".into())
1466///         .contig_id(1).build()?).build()?;
1467/// # Ok::<(), Error>(())
1468/// ```
1469/// ## Mapped read building with modification information.
1470/// Mod table entries are associated with one type of modification each.
1471/// The data tuples are: `sequence coordinate`, `reference coordinate`,
1472/// `modification probability`. The first varies from 0 to `sequence length` - 1,
1473/// the second varies within alignment coordinates and is -1 for an unmapped
1474/// position or all entries are -1 if the read as a whole is unmapped, and
1475/// the third entry varies from 0 - 255 where 0 means no modification with
1476/// certainty, and 255 means modification with certainty.
1477///
1478/// ```
1479/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1480///     ReadState};
1481///
1482/// let mod_table_entry = ModTableEntryBuilder::default()
1483///     .base('C')
1484///     .is_strand_plus(true)
1485///     .mod_code("m".into())
1486///     .data([(0, 15, 200), (2, 25, 100)]).build()?;
1487///
1488/// let read = CurrReadBuilder::default()
1489///     .read_id("some_read".into())
1490///     .seq_len(40)
1491///     .alignment_type(ReadState::PrimaryFwd)
1492///     .alignment(AlignmentInfoBuilder::default()
1493///         .start(10)
1494///         .end(60)
1495///         .contig("chr1".into())
1496///         .contig_id(1).build()?)
1497///         .mod_table([mod_table_entry].into()).build()?;
1498/// # Ok::<(), Error>(())
1499/// ```
1500///
1501/// ## Mapped read building with multiple mod types.
1502///
1503/// ```
1504/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1505///     ReadState};
1506///
1507/// let mod_table_entry_1 = ModTableEntryBuilder::default()
1508///     .base('C')
1509///     .is_strand_plus(true)
1510///     .mod_code("m".into())
1511///     .data([(0, 15, 200), (2, 25, 100)]).build()?;
1512///
1513/// let mod_table_entry_2 = ModTableEntryBuilder::default()
1514///     .base('A')
1515///     .is_strand_plus(true)
1516///     .mod_code("a".into())
1517///     .data([(1, 20, 50), (3, 30, 225)]).build()?;
1518///
1519/// let read = CurrReadBuilder::default()
1520///     .read_id("some_read".into())
1521///     .seq_len(40)
1522///     .alignment_type(ReadState::PrimaryFwd)
1523///     .alignment(AlignmentInfoBuilder::default()
1524///         .start(10)
1525///         .end(60)
1526///         .contig("chr1".into())
1527///         .contig_id(1).build()?)
1528///         .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
1529/// # Ok::<(), Error>(())
1530/// ```
1531/// ## Unmapped read building with multiple mod types.
1532///
1533/// Note that all the reference coordinates are set to -1.
1534///
1535/// ```
1536/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1537///
1538/// let mod_table_entry_1 = ModTableEntryBuilder::default()
1539///     .base('C')
1540///     .is_strand_plus(true)
1541///     .mod_code("m".into())
1542///     .data([(0, -1, 200), (2, -1, 100)]).build()?;
1543///
1544/// let mod_table_entry_2 = ModTableEntryBuilder::default()
1545///     .base('A')
1546///     .is_strand_plus(true)
1547///     .mod_code("a".into())
1548///     .data([(1, -1, 50), (3, -1, 225)]).build()?;
1549///
1550/// let read = CurrReadBuilder::default()
1551///     .read_id("some_read".into())
1552///     .seq_len(40)
1553///     .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
1554/// # Ok::<(), Error>(())
1555/// ```
1556///
1557/// ## Erroneous Usage
1558///
1559/// ### Coordinates are not sorted
1560///
1561/// Please note that even for reverse-aligned reads, coordinates must always
1562/// be ascending.
1563///
1564/// ```should_panic
1565/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1566///     ReadState};
1567///
1568/// let mod_table_entry = ModTableEntryBuilder::default()
1569///     .base('C')
1570///     .is_strand_plus(true)
1571///     .mod_code("m".into())
1572///     .data([(2, 25, 100), (0, 15, 200)]).build()?;
1573///
1574/// let read_before_build = CurrReadBuilder::default()
1575///     .read_id("some_read".into())
1576///     .seq_len(40)
1577///     .alignment(AlignmentInfoBuilder::default()
1578///         .start(10)
1579///         .end(60)
1580///         .contig("chr1".into())
1581///         .contig_id(1).build()?)
1582///         .mod_table([mod_table_entry].into());
1583///
1584/// let _ = read_before_build.alignment_type(ReadState::PrimaryFwd).build()?;
1585/// # Ok::<(), Error>(())
1586/// ```
1587///
1588/// ```should_panic
1589/// # use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1590/// #    ReadState};
1591/// #
1592/// # let mod_table_entry = ModTableEntryBuilder::default()
1593/// #    .base('C')
1594/// #    .is_strand_plus(true)
1595/// #    .mod_code("m".into())
1596/// #    .data([(2, 25, 100), (0, 15, 200)]).build()?;
1597/// #
1598/// # let read_before_build = CurrReadBuilder::default()
1599/// #    .read_id("some_read".into())
1600/// #    .seq_len(40)
1601/// #    .alignment(AlignmentInfoBuilder::default()
1602/// #        .start(10)
1603/// #        .end(60)
1604/// #        .contig("chr1".into())
1605/// #        .contig_id(1).build()?)
1606/// #        .mod_table([mod_table_entry].into());
1607/// #
1608/// let _ = read_before_build.alignment_type(ReadState::PrimaryRev).build()?;
1609/// # Ok::<(), Error>(())
1610/// ```
1611///
1612/// ### Unmapped read but mod reference coordinates are not set to `-1`.
1613///
1614/// ```should_panic
1615/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1616///
1617/// let mod_table_entry = ModTableEntryBuilder::default()
1618///     .base('C')
1619///     .is_strand_plus(true)
1620///     .mod_code("m".into())
1621///     .data([(0, -1, 200), (2, 11, 100)]).build()?;
1622///
1623/// let read = CurrReadBuilder::default()
1624///     .read_id("some_read".into())
1625///     .seq_len(40)
1626///     .mod_table([mod_table_entry].into()).build()?;
1627/// # Ok::<(), Error>(())
1628/// ```
1629///
1630/// ### Read with mod coordinates larger than sequence length
1631///
1632/// This will cause a problem irrespective of whether a read is mapped or not.
1633///
1634/// ```should_panic
1635/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1636///
1637/// let mod_table_entry = ModTableEntryBuilder::default()
1638///     .base('C')
1639///     .is_strand_plus(true)
1640///     .mod_code("m".into())
1641///     .data([(0, -1, 200), (42, -1, 100)]).build()?;
1642///
1643/// let read = CurrReadBuilder::default()
1644///     .read_id("some_read".into())
1645///     .seq_len(40)
1646///     .mod_table([mod_table_entry].into()).build()?;
1647/// # Ok::<(), Error>(())
1648/// ```
1649///
1650/// ```should_panic
1651/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1652///     ReadState};
1653///
1654/// let mod_table_entry = ModTableEntryBuilder::default()
1655///     .base('C')
1656///     .is_strand_plus(true)
1657///     .mod_code("m".into())
1658///     .data([(0, 20, 200), (42, 30, 100)]).build()?;
1659///
1660/// let read = CurrReadBuilder::default()
1661///     .read_id("some_read".into())
1662///     .seq_len(40)
1663///     .alignment(AlignmentInfoBuilder::default()
1664///         .start(10)
1665///         .end(60)
1666///         .contig("chr1".into())
1667///         .contig_id(1).build()?)
1668///     .mod_table([mod_table_entry].into()).build()?;
1669/// # Ok::<(), Error>(())
1670/// ```
1671/// ### Read with mod coordinates beyond alignment coordinates
1672///
1673/// ```should_panic
1674/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1675///     ReadState};
1676///
1677/// let mod_table_entry = ModTableEntryBuilder::default()
1678///     .base('C')
1679///     .is_strand_plus(true)
1680///     .mod_code("m".into())
1681///     .data([(0, 1, 200), (22, 30, 100)]).build()?;
1682///
1683/// let read = CurrReadBuilder::default()
1684///     .read_id("some_read".into())
1685///     .seq_len(40)
1686///     .alignment(AlignmentInfoBuilder::default()
1687///         .start(10)
1688///         .end(60)
1689///         .contig("chr1".into())
1690///         .contig_id(1).build()?)
1691///     .mod_table([mod_table_entry].into()).build()?;
1692/// # Ok::<(), Error>(())
1693/// ```
1694/// ### Reads with multiple tracks of the same kind of modification.
1695///
1696/// Here we have two `C+m` tracks. Note that two cytosine tracks or two
1697/// methylation tracks or two cytosine methylation tracks on opposite
1698/// strands are all allowed. Only the case where there are two tracks
1699/// of the same base with the same modification on the same strand
1700/// are disallowed as the same kind of information is being populated twice.
1701///
1702/// ```should_panic
1703/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1704///
1705/// let mod_table_entry_1 = ModTableEntryBuilder::default()
1706///     .base('C')
1707///     .is_strand_plus(true)
1708///     .mod_code("m".into())
1709///     .data([(0, -1, 200), (2, -1, 100)]).build()?;
1710///
1711/// let mod_table_entry_2 = ModTableEntryBuilder::default()
1712///     .base('C')
1713///     .is_strand_plus(true)
1714///     .mod_code("m".into())
1715///     .data([(1, -1, 50), (3, -1, 225)]).build()?;
1716///
1717/// let read = CurrReadBuilder::default()
1718///     .read_id("some_read".into())
1719///     .seq_len(40)
1720///     .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
1721/// # Ok::<(), Error>(())
1722/// ```
1723#[derive(Debug, Clone, Serialize, Deserialize)]
1724#[serde(default)]
1725pub struct CurrReadBuilder {
1726    /// The type of alignment
1727    alignment_type: ReadState,
1728    /// Alignment information, None for unmapped reads
1729    #[serde(skip_serializing_if = "Option::is_none")]
1730    alignment: Option<AlignmentInfo>,
1731    /// Condensed modification data table
1732    mod_table: Vec<ModTableEntry>,
1733    /// Read identifier
1734    read_id: String,
1735    /// Sequence length
1736    seq_len: u64,
1737}
1738
1739impl Default for CurrReadBuilder {
1740    fn default() -> Self {
1741        Self {
1742            alignment_type: ReadState::Unmapped, // note that default is unmapped now, not primary
1743            alignment: None,
1744            mod_table: Vec::new(),
1745            read_id: String::new(),
1746            seq_len: 0,
1747        }
1748    }
1749}
1750
1751/// Alignment information for mapped reads
1752///
1753/// See documentation of [`CurrReadBuilder`] on how to use
1754/// this struct.
1755#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
1756#[serde(default)]
1757#[builder(default, build_fn(error = "Error"), pattern = "owned")]
1758pub struct AlignmentInfo {
1759    /// Start position on reference
1760    start: u64,
1761    /// End position on reference
1762    end: u64,
1763    /// Contig/chromosome name
1764    contig: String,
1765    /// Contig/chromosome ID
1766    contig_id: i32,
1767}
1768
1769/// Data per type of modification in [`CurrReadBuilder`].
1770///
1771/// See documentation of [`CurrReadBuilder`] on how to use
1772/// this struct.
1773#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
1774#[serde(default)]
1775#[builder(default, build_fn(error = "Error"), pattern = "owned")]
1776pub struct ModTableEntry {
1777    /// Base that is modified (A, C, G, T, etc.)
1778    #[builder(field(ty = "char", build = "self.base.try_into()?"))]
1779    base: AllowedAGCTN,
1780    /// Whether this is on the plus strand
1781    is_strand_plus: bool,
1782    /// Modification code (character or numeric)
1783    #[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
1784    mod_code: ModChar,
1785    /// Modification data as [`start`, `ref_start`, `mod_qual`] tuples
1786    #[builder(setter(into))]
1787    data: Vec<(u64, i64, u8)>,
1788}
1789
1790impl CurrReadBuilder {
1791    /// Sets alignment type
1792    #[must_use]
1793    pub fn alignment_type(mut self, value: ReadState) -> Self {
1794        self.alignment_type = value;
1795        self
1796    }
1797    /// Sets alignment information
1798    #[must_use]
1799    pub fn alignment(mut self, value: AlignmentInfo) -> Self {
1800        self.alignment = Some(value);
1801        self
1802    }
1803    /// Sets mod information
1804    #[must_use]
1805    pub fn mod_table(mut self, value: Vec<ModTableEntry>) -> Self {
1806        self.mod_table = value;
1807        self
1808    }
1809    /// Sets read id
1810    #[must_use]
1811    pub fn read_id(mut self, value: String) -> Self {
1812        self.read_id = value;
1813        self
1814    }
1815    /// Sets sequence length
1816    #[must_use]
1817    pub fn seq_len(mut self, value: u64) -> Self {
1818        self.seq_len = value;
1819        self
1820    }
1821    /// Build `CurrRead`
1822    ///
1823    /// # Errors
1824    /// If the conversion fails. This could happen if invalid
1825    /// data was used in the building.
1826    pub fn build(self) -> Result<CurrRead<AlignAndModData>, Error> {
1827        CurrRead::<AlignAndModData>::try_from(self)
1828    }
1829}
1830
1831impl Serialize for CurrRead<AlignAndModData> {
1832    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1833    where
1834        S: serde::Serializer,
1835    {
1836        let serialized_curr_read: CurrReadBuilder =
1837            self.clone().try_into().map_err(serde::ser::Error::custom)?;
1838        serialized_curr_read.serialize(serializer)
1839    }
1840}
1841
1842impl TryFrom<CurrRead<AlignAndModData>> for CurrReadBuilder {
1843    type Error = Error;
1844
1845    fn try_from(curr_read: CurrRead<AlignAndModData>) -> Result<Self, Self::Error> {
1846        let alignment_type = curr_read.read_state();
1847
1848        #[expect(
1849            clippy::arithmetic_side_effects,
1850            reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
1851        )]
1852        let alignment = if curr_read.read_state().is_unmapped() {
1853            None
1854        } else {
1855            let (contig_id, start) = curr_read.contig_id_and_start()?;
1856            let align_len = curr_read.align_len()?;
1857            let contig = curr_read.contig_name()?.to_string();
1858            let end = start + align_len;
1859
1860            Some(AlignmentInfo {
1861                start,
1862                end,
1863                contig,
1864                contig_id,
1865            })
1866        };
1867
1868        let mod_table = condense_base_mods(&curr_read.mod_data().0)?;
1869
1870        let read_id = curr_read.read_id().to_string();
1871        let seq_len = curr_read.seq_len()?;
1872
1873        Ok(CurrReadBuilder {
1874            alignment_type,
1875            alignment,
1876            mod_table,
1877            read_id,
1878            seq_len,
1879        })
1880    }
1881}
1882
1883impl<'de> Deserialize<'de> for CurrRead<AlignAndModData> {
1884    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1885    where
1886        D: serde::Deserializer<'de>,
1887    {
1888        let serialized = CurrReadBuilder::deserialize(deserializer)?;
1889        serialized.try_into().map_err(serde::de::Error::custom)
1890    }
1891}
1892
1893impl TryFrom<CurrReadBuilder> for CurrRead<AlignAndModData> {
1894    type Error = Error;
1895
1896    fn try_from(serialized: CurrReadBuilder) -> Result<Self, Self::Error> {
1897        // Extract alignment information
1898        let (align_len, contig_id_and_start, contig_name, ref_range) = match (
1899            serialized.alignment_type.is_unmapped(),
1900            serialized.alignment.as_ref(),
1901        ) {
1902            (false, Some(alignment)) => {
1903                let align_len = {
1904                    if let Some(v) = alignment.end.checked_sub(alignment.start) {
1905                        Ok(Some(v))
1906                    } else {
1907                        Err(Error::InvalidAlignCoords(format!(
1908                            "is align end {0} < align start {1}? read {2} failed in `CurrRead` building!",
1909                            alignment.end, alignment.start, serialized.read_id
1910                        )))
1911                    }
1912                }?;
1913                let contig_id_and_start = Some((alignment.contig_id, alignment.start));
1914                let contig_name = Some(alignment.contig.clone());
1915                (
1916                    align_len,
1917                    contig_id_and_start,
1918                    contig_name,
1919                    i64::try_from(alignment.start)?..i64::try_from(alignment.end)?,
1920                )
1921            }
1922            (true, None) => (None, None, None, (-1i64)..0),
1923            (_, _) => {
1924                return Err(Error::UnknownAlignState(format!(
1925                    "alignment_type and alignment info not matching while building CurrRead! read_id: {}",
1926                    serialized.read_id,
1927                )));
1928            }
1929        };
1930
1931        // Reconstruct BaseMods from mod_table
1932        let base_mods = reconstruct_base_mods(
1933            &serialized.mod_table,
1934            serialized.alignment_type.strand() == '-',
1935            ref_range,
1936            serialized.seq_len,
1937        )?;
1938
1939        // Create CurrRead directly
1940        Ok(CurrRead {
1941            state: serialized.alignment_type,
1942            read_id: serialized.read_id,
1943            seq_len: Some(serialized.seq_len),
1944            align_len,
1945            mods: (base_mods, ThresholdState::GtEq(0)), // Default threshold
1946            contig_id_and_start,
1947            contig_name,
1948            mod_base_qual_thres: 0, // Default value
1949            marker: std::marker::PhantomData,
1950        })
1951    }
1952}
1953
1954/// Convert `BaseMods` to condensed `mod_table` format
1955fn condense_base_mods(base_mods: &BaseMods) -> Result<Vec<ModTableEntry>, Error> {
1956    let mut mod_table = Vec::new();
1957
1958    for base_mod in &base_mods.base_mods {
1959        let entries: Result<Vec<_>, Error> = base_mod
1960            .ranges
1961            .annotations
1962            .iter()
1963            .map(|k| {
1964                let start: u64 = k.start.try_into()?;
1965                let ref_start = k.reference_start.unwrap_or(-1);
1966                Ok((start, ref_start, k.qual))
1967            })
1968            .collect();
1969
1970        mod_table.push(ModTableEntry {
1971            base: AllowedAGCTN::try_from(base_mod.modified_base)?,
1972            is_strand_plus: base_mod.strand == '+',
1973            mod_code: ModChar::new(base_mod.modification_type),
1974            data: entries?,
1975        });
1976    }
1977
1978    Ok(mod_table)
1979}
1980
1981/// Reconstruct `BaseMods` from condensed `mod_table` format
1982fn reconstruct_base_mods(
1983    mod_table: &[ModTableEntry],
1984    is_reverse: bool,
1985    ref_range: Range<i64>,
1986    seq_len: u64,
1987) -> Result<BaseMods, Error> {
1988    let mut base_mods = Vec::new();
1989
1990    for entry in mod_table {
1991        let annotations = {
1992            let mut annotations = Vec::<FiberAnnotation>::with_capacity(entry.data.len());
1993            let mut valid_range = 0..seq_len;
1994
1995            #[expect(
1996                clippy::arithmetic_side_effects,
1997                reason = "overflow errors not possible as genomic coordinates << 2^64"
1998            )]
1999            for &(start, ref_start, qual) in &entry.data {
2000                // Check that sequence coordinates are in range [prev_start + 1, seq_len)
2001                if !valid_range.contains(&start) {
2002                    return Err(Error::InvalidModCoords(String::from(
2003                        "in mod table, read coords > seq length or read coords not sorted (NOTE: \
2004ascending needed even if reversed read)!",
2005                    )));
2006                }
2007                valid_range = start + 1..seq_len;
2008
2009                let ref_start_after_check = match (ref_start, ref_range.contains(&ref_start)) {
2010                    (-1, _) => None,
2011                    (v, true) if v > -1 => Some(v),
2012                    (v, _) => {
2013                        return Err(Error::InvalidAlignCoords(format!(
2014                            "coordinate {v} invalid in mod table (exceeds alignment coords or is < -1)"
2015                        )));
2016                    }
2017                };
2018                let start_i64 = i64::try_from(start)?;
2019                annotations.push(FiberAnnotation {
2020                    start: start_i64,
2021                    end: start_i64 + 1,
2022                    length: 1,
2023                    qual,
2024                    reference_start: ref_start_after_check,
2025                    reference_end: ref_start_after_check,
2026                    reference_length: ref_start_after_check.is_some().then_some(0),
2027                    extra_columns: None,
2028                });
2029            }
2030            annotations
2031        };
2032
2033        let ranges = Ranges::from_annotations(annotations, seq_len.try_into()?, is_reverse);
2034
2035        let strand = if entry.is_strand_plus { '+' } else { '-' };
2036
2037        base_mods.push(BaseMod {
2038            modified_base: u8::try_from(char::from(entry.base))?,
2039            strand,
2040            modification_type: entry.mod_code.val(),
2041            ranges,
2042            record_is_reverse: is_reverse,
2043        });
2044    }
2045
2046    base_mods.sort();
2047
2048    // Check for duplicate strand, modification_type combinations
2049    let mut seen_combinations = HashSet::new();
2050    for base_mod in &base_mods {
2051        let combination = (base_mod.strand, base_mod.modification_type);
2052        if seen_combinations.contains(&combination) {
2053            return Err(Error::InvalidDuplicates(format!(
2054                "Duplicate strand '{}' and modification_type '{}' combination found",
2055                base_mod.strand, base_mod.modification_type
2056            )));
2057        }
2058        let _: bool = seen_combinations.insert(combination);
2059    }
2060
2061    Ok(BaseMods { base_mods })
2062}
2063
2064/// Convert a vector of `CurrRead<AlignAndModData>` to a Polars `DataFrame`
2065/// with one row per modification data point
2066///
2067/// Each modification data point becomes a row in the `DataFrame`, with read-level
2068/// and alignment-level information repeated across rows for the same read.
2069///
2070/// # Example
2071///
2072/// ```
2073/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
2074///     ReadState, curr_reads_to_dataframe};
2075/// use polars::prelude::*;
2076///
2077/// // Build modification table entries with two data points
2078/// let mod_table_entry = ModTableEntryBuilder::default()
2079///     .base('C')
2080///     .is_strand_plus(true)
2081///     .mod_code("m".into())
2082///     .data([(0, 15, 200), (2, 25, 100)])
2083///     .build()?;
2084///
2085/// // Build a CurrRead<AlignAndModData>
2086/// let read = CurrReadBuilder::default()
2087///     .read_id("test_read".into())
2088///     .seq_len(40)
2089///     .alignment_type(ReadState::PrimaryFwd)
2090///     .alignment(AlignmentInfoBuilder::default()
2091///         .start(10)
2092///         .end(60)
2093///         .contig("chr1".into())
2094///         .contig_id(1)
2095///         .build()?)
2096///     .mod_table([mod_table_entry].into())
2097///     .build()?;
2098///
2099/// // Convert to DataFrame
2100/// let df = curr_reads_to_dataframe(&[read])?;
2101///
2102/// // Verify DataFrame structure
2103/// assert_eq!(df.height(), 2); // Two modification data points
2104/// assert_eq!(df.width(), 13); // 13 columns
2105///
2106/// // Check read-level fields (repeated across both rows)
2107/// let read_id_col = df.column("read_id")?.str()?;
2108/// assert_eq!(read_id_col.get(0), Some("test_read"));
2109/// assert_eq!(read_id_col.get(1), Some("test_read"));
2110///
2111/// let seq_len_col = df.column("seq_len")?.u64()?;
2112/// assert_eq!(seq_len_col.get(0), Some(40));
2113/// assert_eq!(seq_len_col.get(1), Some(40));
2114///
2115/// let alignment_type_col = df.column("alignment_type")?.str()?;
2116/// assert_eq!(alignment_type_col.get(0), Some("primary_forward"));
2117/// assert_eq!(alignment_type_col.get(1), Some("primary_forward"));
2118///
2119/// // Check alignment fields (repeated across both rows)
2120/// let align_start_col = df.column("align_start")?.u64()?;
2121/// assert_eq!(align_start_col.get(0), Some(10));
2122/// assert_eq!(align_start_col.get(1), Some(10));
2123///
2124/// let align_end_col = df.column("align_end")?.u64()?;
2125/// assert_eq!(align_end_col.get(0), Some(60));
2126/// assert_eq!(align_end_col.get(1), Some(60));
2127///
2128/// let contig_col = df.column("contig")?.str()?;
2129/// assert_eq!(contig_col.get(0), Some("chr1"));
2130/// assert_eq!(contig_col.get(1), Some("chr1"));
2131///
2132/// let contig_id_col = df.column("contig_id")?.i32()?;
2133/// assert_eq!(contig_id_col.get(0), Some(1));
2134/// assert_eq!(contig_id_col.get(1), Some(1));
2135///
2136/// // Check modification entry fields (repeated across both rows)
2137/// let base_col = df.column("base")?.str()?;
2138/// assert_eq!(base_col.get(0), Some("C"));
2139/// assert_eq!(base_col.get(1), Some("C"));
2140///
2141/// let is_strand_plus_col = df.column("is_strand_plus")?.bool()?;
2142/// assert_eq!(is_strand_plus_col.get(0), Some(true));
2143/// assert_eq!(is_strand_plus_col.get(1), Some(true));
2144///
2145/// let mod_code_col = df.column("mod_code")?.str()?;
2146/// assert_eq!(mod_code_col.get(0), Some("m"));
2147/// assert_eq!(mod_code_col.get(1), Some("m"));
2148///
2149/// // Check modification data points (unique per row)
2150/// let position_col = df.column("position")?.u64()?;
2151/// assert_eq!(position_col.get(0), Some(0)); // First mod position
2152/// assert_eq!(position_col.get(1), Some(2)); // Second mod position
2153///
2154/// let ref_position_col = df.column("ref_position")?.i64()?;
2155/// assert_eq!(ref_position_col.get(0), Some(15)); // First ref position
2156/// assert_eq!(ref_position_col.get(1), Some(25)); // Second ref position
2157///
2158/// let mod_quality_col = df.column("mod_quality")?.u32()?;
2159/// assert_eq!(mod_quality_col.get(0), Some(200)); // First mod_quality score
2160/// assert_eq!(mod_quality_col.get(1), Some(100)); // Second mod_quality score
2161///
2162/// # Ok::<(), Error>(())
2163/// ```
2164///
2165/// # Errors
2166/// Returns nanalogue `Error` if `DataFrame` construction fails or if
2167/// data extraction from `CurrRead` fails
2168#[expect(
2169    clippy::arithmetic_side_effects,
2170    reason = "start + alen cannot overflow as genomic coordinates are far below u64::MAX"
2171)]
2172pub fn curr_reads_to_dataframe(reads: &[CurrRead<AlignAndModData>]) -> Result<DataFrame, Error> {
2173    // Vectors to hold column data
2174    let mut read_ids: Vec<String> = Vec::new();
2175    let mut seq_lens: Vec<Option<u64>> = Vec::new();
2176    let mut alignment_types: Vec<String> = Vec::new();
2177
2178    // Alignment info columns
2179    let mut align_starts: Vec<Option<u64>> = Vec::new();
2180    let mut align_ends: Vec<Option<u64>> = Vec::new();
2181    let mut contigs: Vec<Option<String>> = Vec::new();
2182    let mut contig_ids: Vec<Option<i32>> = Vec::new();
2183
2184    // Modification entry columns
2185    let mut bases: Vec<String> = Vec::new();
2186    let mut is_strand_plus: Vec<bool> = Vec::new();
2187    let mut mod_codes: Vec<String> = Vec::new();
2188
2189    // Modification data point columns (the unique part)
2190    let mut positions: Vec<u64> = Vec::new();
2191    let mut ref_positions: Vec<i64> = Vec::new();
2192    let mut mod_qualities: Vec<u32> = Vec::new();
2193
2194    // Iterate through each CurrRead
2195    for read in reads {
2196        // Extract read-level information
2197        let read_id = read.read_id();
2198        let seq_len = read.seq_len().ok();
2199        let alignment_type = read.read_state();
2200
2201        // Extract alignment information (may be None for unmapped reads)
2202        let (contig_id, align_start, align_end, contig_name) =
2203            if let (Ok((cid, start)), Ok(alen), Ok(cname)) = (
2204                read.contig_id_and_start(),
2205                read.align_len(),
2206                read.contig_name(),
2207            ) {
2208                (
2209                    Some(cid),
2210                    Some(start),
2211                    Some(start + alen),
2212                    Some(cname.to_string()),
2213                )
2214            } else {
2215                (None, None, None, None)
2216            };
2217
2218        // Get BaseMods and condense to ModTableEntry format
2219        let mod_data = read.mod_data();
2220        let mod_table = condense_base_mods(&mod_data.0)?;
2221
2222        // For each modification table entry
2223        for mod_entry in &mod_table {
2224            // For each data point in this mod entry
2225            for &(start, ref_start, qual) in &mod_entry.data {
2226                // Add read-level fields (repeated)
2227                read_ids.push(read_id.to_string());
2228                seq_lens.push(seq_len);
2229                alignment_types.push(alignment_type.to_string());
2230
2231                // Add alignment fields (repeated, may be None)
2232                align_starts.push(align_start);
2233                align_ends.push(align_end);
2234                contigs.push(contig_name.clone());
2235                contig_ids.push(contig_id);
2236
2237                // Add mod entry fields (repeated)
2238                bases.push(mod_entry.base.to_string());
2239                is_strand_plus.push(mod_entry.is_strand_plus);
2240                mod_codes.push(mod_entry.mod_code.to_string());
2241
2242                // Add data point fields (unique per row)
2243                positions.push(start);
2244                ref_positions.push(ref_start);
2245                mod_qualities.push(u32::from(qual));
2246            }
2247        }
2248    }
2249
2250    // Build the DataFrame
2251    let df = df! {
2252        "read_id" => read_ids,
2253        "seq_len" => seq_lens,
2254        "alignment_type" => alignment_types,
2255        "align_start" => align_starts,
2256        "align_end" => align_ends,
2257        "contig" => contigs,
2258        "contig_id" => contig_ids,
2259        "base" => bases,
2260        "is_strand_plus" => is_strand_plus,
2261        "mod_code" => mod_codes,
2262        "position" => positions,
2263        "ref_position" => ref_positions,
2264        "mod_quality" => mod_qualities,
2265    }?;
2266
2267    Ok(df)
2268}
2269
2270#[cfg(test)]
2271mod test_error_handling {
2272    use super::*;
2273
2274    #[test]
2275    fn set_read_state_not_implemented_error() {
2276        for flag_value in 0..4096u16 {
2277            let mut record = Record::new();
2278            record.set_qname(b"test_read");
2279            record.set_flags(flag_value);
2280            let curr_read = CurrRead::default();
2281            // * first line below is usual primary, secondary, supplementary,
2282            //   unmapped with reversed set or not with the first 3 categories
2283            // * second line is if unmapped is set with a mapped state, or
2284            //   secondary and supplementary are both set
2285            // * third line are flags we have chosen to exclude from our program.
2286            match (flag_value, curr_read.set_read_state_and_id(&record)) {
2287                (0 | 4 | 16 | 256 | 272 | 2048 | 2064, Ok(_))
2288                | (
2289                    20 | 260 | 276 | 2052 | 2068 | 2304 | 2308 | 2320 | 2324,
2290                    Err(Error::UnknownAlignState(_)),
2291                )
2292                | (_, Err(Error::NotImplemented(_))) => {}
2293                (_, _) => unreachable!(),
2294            }
2295        }
2296    }
2297
2298    #[test]
2299    #[should_panic(expected = "InvalidDuplicates")]
2300    fn reconstruct_base_mods_detects_duplicates() {
2301        // Create a test case with duplicate strand and modification_type combinations
2302        // i.e. we have two entries for T+T below with different data
2303        let mod_entries = vec![
2304            ModTableEntry {
2305                base: AllowedAGCTN::T,
2306                is_strand_plus: true,
2307                mod_code: ModChar::new('T'),
2308                data: vec![(0, 0, 10)],
2309            },
2310            ModTableEntry {
2311                base: AllowedAGCTN::T,
2312                is_strand_plus: true,
2313                mod_code: ModChar::new('T'),
2314                data: vec![(1, 1, 20)],
2315            },
2316        ];
2317
2318        // Test reconstruct_base_mods with duplicate entries
2319        let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
2320    }
2321
2322    #[test]
2323    fn reconstruct_base_mods_accepts_unique_combinations() {
2324        // Create a valid case with different combinations
2325        // First entry: T+ modification
2326        // Second entry: C+ modification (different base, same strand)
2327        // Third entry: T- modification (same base, different strand)
2328        let mod_entries = vec![
2329            ModTableEntry {
2330                base: AllowedAGCTN::T,
2331                is_strand_plus: true,
2332                mod_code: ModChar::new('T'),
2333                data: vec![(0, 0, 10)],
2334            },
2335            ModTableEntry {
2336                base: AllowedAGCTN::C,
2337                is_strand_plus: true,
2338                mod_code: ModChar::new('m'),
2339                data: vec![(1, 1, 20)],
2340            },
2341            ModTableEntry {
2342                base: AllowedAGCTN::T,
2343                is_strand_plus: false,
2344                mod_code: ModChar::new('T'),
2345                data: vec![(2, 2, 30)],
2346            },
2347        ];
2348
2349        // This should succeed since all combinations are unique
2350        let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
2351    }
2352}
2353
2354#[cfg(test)]
2355mod test_serde {
2356    use super::*;
2357    use crate::nanalogue_bam_reader;
2358    use indoc::indoc;
2359    use rust_htslib::bam::Read as _;
2360
2361    #[test]
2362    fn first_record_serde() -> Result<(), Error> {
2363        // Read the first record from the example BAM file
2364        let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
2365        let first_record = reader.records().next().unwrap()?;
2366
2367        // Create CurrRead with alignment and modification data
2368        let curr_read = CurrRead::default()
2369            .try_from_only_alignment(&first_record)?
2370            .set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;
2371
2372        let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;
2373
2374        let expected_json_str = indoc! {r#"
2375            {
2376              "alignment_type": "primary_forward",
2377              "alignment": {
2378                "start": 9,
2379                "end": 17,
2380                "contig": "dummyI",
2381                "contig_id": 0
2382              },
2383              "mod_table": [
2384                {
2385                  "base": "T",
2386                  "is_strand_plus": true,
2387                  "mod_code": "T",
2388                  "data": [
2389                    [0, 9, 4],
2390                    [3, 12, 7],
2391                    [4, 13, 9],
2392                    [7, 16, 6]
2393                  ]
2394                }
2395              ],
2396              "read_id": "5d10eb9a-aae1-4db8-8ec6-7ebb34d32575",
2397              "seq_len": 8
2398            }"#};
2399
2400        let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;
2401
2402        // Compare expected and actual outputs
2403        assert_eq!(actual_json, expected_json);
2404
2405        // Also test deserialization: deserialize the expected JSON and compare with original CurrRead
2406        let deserialized_curr_read: CurrRead<AlignAndModData> =
2407            serde_json::from_str(expected_json_str)?;
2408        assert_eq!(deserialized_curr_read, curr_read);
2409
2410        Ok(())
2411    }
2412
2413    #[test]
2414    fn first_record_roundtrip() -> Result<(), Error> {
2415        // Read the first record from the example BAM file (same as serialization test)
2416        let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
2417        let first_record = reader.records().next().unwrap()?;
2418
2419        // Create the original CurrRead with alignment and modification data
2420        let original_curr_read = CurrRead::default()
2421            .try_from_only_alignment(&first_record)?
2422            .set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;
2423
2424        // Serialize to JSON
2425        let json_str = serde_json::to_string_pretty(&original_curr_read)?;
2426
2427        // Deserialize back to CurrRead
2428        let deserialized_curr_read: CurrRead<AlignAndModData> = serde_json::from_str(&json_str)?;
2429
2430        // The deserialized CurrRead should be equal to the original
2431        assert_eq!(deserialized_curr_read, original_curr_read);
2432
2433        Ok(())
2434    }
2435
2436    #[test]
2437    fn blank_json_record_roundtrip() -> Result<(), Error> {
2438        let json_str = indoc! {r"
2439            {
2440            }"};
2441
2442        // Deserialize JSON to CurrRead
2443        let curr_read: CurrRead<AlignAndModData> = serde_json::from_str(json_str)?;
2444
2445        // Serialize back to JSON
2446        let serialized_json = serde_json::to_string_pretty(&curr_read)?;
2447
2448        // Deserialize again
2449        let roundtrip_curr_read: CurrRead<AlignAndModData> =
2450            serde_json::from_str(&serialized_json)?;
2451
2452        // Check that the roundtrip preserves equality
2453        assert_eq!(curr_read, roundtrip_curr_read);
2454
2455        Ok(())
2456    }
2457
2458    #[test]
2459    fn example_1_unmapped() -> Result<(), Error> {
2460        // Read the fourth record from the example BAM file (this is the unmapped read)
2461        let fourth_record = {
2462            let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
2463            let mut records = reader.records();
2464            for _ in 0..3 {
2465                let _: Record = records.next().unwrap()?;
2466            }
2467            records.next().unwrap()?
2468        };
2469
2470        // Create CurrRead with alignment and modification data
2471        let curr_read = CurrRead::default()
2472            .try_from_only_alignment(&fourth_record)?
2473            .set_mod_data(&fourth_record, ThresholdState::GtEq(0), 0)?;
2474
2475        let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;
2476
2477        let expected_json_str = indoc! {r#"
2478            {
2479              "alignment_type": "unmapped",
2480              "mod_table": [
2481                {
2482                  "base": "G",
2483                  "is_strand_plus": false,
2484                  "mod_code": "7200",
2485                  "data": [
2486                    [28, -1, 0],
2487                    [29, -1, 0],
2488                    [30, -1, 0],
2489                    [32, -1, 0],
2490                    [43, -1, 77],
2491                    [44, -1, 0]
2492                  ]
2493                },
2494                {
2495                  "base": "T",
2496                  "is_strand_plus": true,
2497                  "mod_code": "T",
2498                  "data": [
2499                    [3, -1, 221],
2500                    [8, -1, 242],
2501                    [27, -1, 0],
2502                    [39, -1, 47],
2503                    [47, -1, 239]
2504                  ]
2505                }
2506              ],
2507              "read_id": "a4f36092-b4d5-47a9-813e-c22c3b477a0c",
2508              "seq_len": 48
2509            }"#};
2510
2511        let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;
2512
2513        // Compare expected and actual outputs
2514        assert_eq!(actual_json, expected_json);
2515
2516        Ok(())
2517    }
2518
2519    #[test]
2520    #[should_panic(expected = "invalid alignment coordinates")]
2521    fn invalid_align_coords_unmapped_with_reference_positions() {
2522        let invalid_json = r#"{
2523            "mod_table": [
2524                {
2525                    "data": [[2, 3, 200]]
2526                }
2527            ],
2528            "seq_len": 10
2529        }"#;
2530
2531        // Deserialize JSON to CurrRead - this should panic with InvalidAlignCoords
2532        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2533    }
2534
2535    #[test]
2536    #[should_panic(expected = "invalid mod coordinates")]
2537    fn invalid_sequence_length() {
2538        let invalid_json = r#"{
2539            "mod_table": [
2540                {
2541                    "data": [[20, -1, 200]]
2542                }
2543            ],
2544            "seq_len": 10
2545        }"#;
2546
2547        // Deserialize JSON to CurrRead - this should panic with InvalidSeqLength
2548        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2549    }
2550
2551    #[test]
2552    #[should_panic(expected = "invalid value")]
2553    fn invalid_sequence_coordinate() {
2554        let invalid_json = r#"{
2555            "alignment_type": "primary_forward",
2556            "alignment": {
2557                "start": 0,
2558                "end": 30
2559            },
2560            "mod_table": [
2561                {
2562                    "data": [[-1, 1, 200]]
2563                }
2564            ],
2565            "seq_len": 30
2566        }"#;
2567
2568        // Deserialize JSON to CurrRead - this should panic with InvalidSeqLength
2569        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2570    }
2571
2572    #[test]
2573    #[should_panic(expected = "invalid alignment coordinates")]
2574    fn invalid_alignment_coordinates() {
2575        let invalid_json = r#"{
2576            "alignment_type": "primary_forward",
2577            "alignment": {
2578                "start": 10,
2579                "end": 25
2580            },
2581            "mod_table": [
2582                {
2583                    "data": [[0, 10, 200], [1, 20, 180], [2, 30, 220]]
2584                }
2585            ],
2586            "seq_len": 3
2587        }"#;
2588
2589        // Deserialize JSON to CurrRead - this should panic with InvalidAlignCoords
2590        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2591    }
2592
2593    #[test]
2594    #[should_panic(expected = "invalid mod coordinates")]
2595    fn invalid_sorting_forward_alignment() {
2596        let invalid_json = r#"{
2597            "alignment_type": "primary_forward",
2598            "alignment": {
2599                "start": 10,
2600                "end": 40
2601            },
2602            "mod_table": [
2603                {
2604                    "data": [[0, 10, 200], [2, 30, 180], [1, 20, 220]]
2605                }
2606            ],
2607            "seq_len": 3
2608        }"#;
2609
2610        // Deserialize JSON to CurrRead - this should panic with InvalidSorting
2611        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2612    }
2613
2614    #[test]
2615    #[should_panic(expected = "invalid mod coordinates")]
2616    fn invalid_sorting_reverse_alignment() {
2617        let invalid_json = r#"{
2618            "alignment_type": "primary_reverse",
2619            "alignment": {
2620                "start": 10,
2621                "end": 40
2622            },
2623            "mod_table": [
2624                {
2625                    "data": [[2, 30, 180], [1, 20, 220], [0, 10, 200]]
2626                }
2627            ],
2628            "seq_len": 3
2629        }"#;
2630
2631        // Deserialize JSON to CurrRead - this should panic with InvalidSorting
2632        let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2633    }
2634}
nanalogue_core/read_utils.rs

nanalogue_core/
read_utils.rs