nanalogue_core/read_utils.rs
1//! Implements `CurrRead` Struct for processing information
2//! and the mod information in the BAM file using a parser implemented in
3//! another module.
4
5use crate::{
6 AllowedAGCTN, Contains as _, Error, F32Bw0and1, FilterByRefCoords, InputModOptions,
7 InputRegionOptions, InputWindowing, ModChar, ReadState, ThresholdState, nanalogue_mm_ml_parser,
8};
9use bedrs::prelude::{Intersect as _, StrandedBed3};
10use bedrs::{Bed3, Coordinates as _, Strand};
11use bio_types::genome::AbstractInterval as _;
12use derive_builder::Builder;
13use fibertools_rs::utils::{
14 bamannotations::{FiberAnnotation, Ranges},
15 basemods::{BaseMod, BaseMods},
16};
17use polars::{df, prelude::DataFrame};
18use rust_htslib::{bam::ext::BamRecordExtensions as _, bam::record::Record};
19use serde::{Deserialize, Serialize};
20use std::{
21 cmp::Ordering,
22 collections::{HashMap, HashSet},
23 fmt::{self, Write as _},
24 ops::Range,
25 rc::Rc,
26};
27
28/// Shows [`CurrRead`] has no data, a state-type parameter
29#[derive(Debug, Default, Copy, Clone, PartialEq)]
30#[non_exhaustive]
31pub struct NoData;
32
33/// Shows [`CurrRead`] has only alignment data, a state-type parameter
34#[derive(Debug, Default, Copy, Clone, PartialEq)]
35#[non_exhaustive]
36pub struct OnlyAlignData;
37
38/// Shows [`CurrRead`] has only alignment data but with all fields filled, a state-type parameter
39#[derive(Debug, Default, Copy, Clone, PartialEq)]
40#[non_exhaustive]
41pub struct OnlyAlignDataComplete;
42
43/// Shows [`CurrRead`] has alignment and modification data, a state-type parameter
44#[derive(Debug, Default, Copy, Clone, PartialEq)]
45#[non_exhaustive]
46pub struct AlignAndModData;
47
48/// Dummy trait, associated with the state-type pattern in [`CurrRead`]
49pub trait CurrReadState {}
50
51impl CurrReadState for NoData {}
52impl CurrReadState for OnlyAlignData {}
53impl CurrReadState for OnlyAlignDataComplete {}
54impl CurrReadState for AlignAndModData {}
55
56/// Another dummy trait, associated with the state-type pattern in [`CurrRead`]
57pub trait CurrReadStateWithAlign {}
58
59impl CurrReadStateWithAlign for OnlyAlignData {}
60impl CurrReadStateWithAlign for OnlyAlignDataComplete {}
61impl CurrReadStateWithAlign for AlignAndModData {}
62
63/// Another dummy trait, associated with the state-type pattern in [`CurrRead`]
64pub trait CurrReadStateOnlyAlign {}
65
66impl CurrReadStateOnlyAlign for OnlyAlignData {}
67impl CurrReadStateOnlyAlign for OnlyAlignDataComplete {}
68
69/// Our main struct that receives and stores from one BAM record.
70/// Also has methods for processing this information.
71/// Also see [`CurrReadBuilder`] for a way to build this without BAM records.
72///
73/// We call this `CurrRead` as in 'current read'. `Read` is used
74/// within `rust-htslib`, so we don't want to create another `Read` struct.
75///
76/// The information within the struct is hard to access without
77/// the methods defined here. This is to ensure the struct
78/// doesn't fall into an invalid state, which could cause mistakes
79/// in calculations associated with the struct. For example:
80/// if I want to measure mean modification density along windows
81/// of the raw modification data, I need a guarantee that the
82/// modification data is sorted by position. We can guarantee
83/// this when the modification data is parsed, but we cannot
84/// guarantee this if we allow free access to the struct.
85/// NOTE: we could have implemented these as a trait extension
86/// to the `rust_htslib` `Record` struct, but we have chosen not to,
87/// as we may want to persist data like modifications and do
88/// multiple operations on them. And `Record` has inconvenient
89/// return types like i64 instead of u64 for positions along the
90/// genome.
91#[derive(Debug, Clone, PartialEq)]
92pub struct CurrRead<S: CurrReadState> {
93 /// Stores alignment type.
94 state: ReadState,
95
96 /// Read ID of molecule, also called query name in some contexts.
97 read_id: String,
98
99 /// Length of the stored sequence in a BAM file. This is usually the basecalled sequence.
100 seq_len: Option<u64>,
101
102 /// Length of the segment on the reference genome the molecule maps to.
103 align_len: Option<u64>,
104
105 /// Stores modification information along with any applied thresholds.
106 mods: (BaseMods, ThresholdState),
107
108 /// ID of the reference genome contig and the starting position on
109 /// contig that the molecule maps or aligns to.
110 /// NOTE: the contig here is numeric and refers to an index on the BAM
111 /// header. To convert this into an alphanumeric string, you have to
112 /// process the header and store it in `contig_name` below.
113 /// We have left it this way as it is easier to store and process integers.
114 contig_id_and_start: Option<(i32, u64)>,
115
116 /// Contig name.
117 contig_name: Option<String>,
118
119 /// Base PHRED-quality threshold (no offset). Mods could have been filtered by this.
120 mod_base_qual_thres: u8,
121
122 /// `PhantomData` marker for compiler's sake
123 marker: std::marker::PhantomData<S>,
124}
125
126/// Implements defaults for `CurrRead`
127impl Default for CurrRead<NoData> {
128 fn default() -> Self {
129 CurrRead::<NoData> {
130 state: ReadState::PrimaryFwd,
131 read_id: String::new(),
132 seq_len: None,
133 align_len: None,
134 mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
135 contig_id_and_start: None,
136 contig_name: None,
137 mod_base_qual_thres: 0,
138 marker: std::marker::PhantomData::<NoData>,
139 }
140 }
141}
142
143impl CurrRead<NoData> {
144 /// sets the alignment of the read and read ID using BAM record
145 ///
146 /// # Errors
147 /// While we support normal BAM reads from `ONT`, `PacBio` etc. that contain modifications,
148 /// we do not support some BAM flags like paired, duplicate, quality check failed etc.
149 /// This is because of our design choices e.g. if mods are called on paired reads,
150 /// then we'll have to include both records as one read in our statistics
151 /// and we do not have functionality in place to do this.
152 /// So, we return errors if such flags or an invalid combination of flags (e.g.
153 /// secondary and supplementary bits are set) are encountered.
154 /// We also return error upon invalid read id but this is expected to be in violation
155 /// of the BAM format (UTF-8 error).
156 ///
157 /// ```
158 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader, ReadState};
159 /// use rust_htslib::bam::Read;
160 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
161 /// let mut count = 0;
162 /// for record in reader.records(){
163 /// let r = record?;
164 /// let curr_read = CurrRead::default().set_read_state_and_id(&r)?;
165 /// match count {
166 /// 0 => assert_eq!(curr_read.read_state(), ReadState::PrimaryFwd),
167 /// 1 => assert_eq!(curr_read.read_state(), ReadState::PrimaryFwd),
168 /// 2 => assert_eq!(curr_read.read_state(), ReadState::PrimaryRev),
169 /// 3 => assert_eq!(curr_read.read_state(), ReadState::Unmapped),
170 /// _ => unreachable!(),
171 /// }
172 /// count = count + 1;
173 /// }
174 /// # Ok::<(), Error>(())
175 /// ```
176 pub fn set_read_state_and_id(self, record: &Record) -> Result<CurrRead<OnlyAlignData>, Error> {
177 // extract read id
178 let read_id = match str::from_utf8(record.qname()) {
179 Ok(v) => v.to_string(),
180 Err(e) => {
181 return Err(Error::InvalidReadID(format!(
182 "error in setting read id, which possibly violates BAM requirements: {e}"
183 )));
184 }
185 };
186 // check for unsupported flags
187 if record.is_paired()
188 || record.is_proper_pair()
189 || record.is_first_in_template()
190 || record.is_last_in_template()
191 || record.is_mate_reverse()
192 || record.is_mate_unmapped()
193 || record.is_duplicate()
194 || record.is_quality_check_failed()
195 {
196 return Err(Error::NotImplemented(format!(
197 "paired-read/mate-read/duplicate/qual-check-failed flags not supported! read_id: {read_id}"
198 )));
199 }
200 // set read state
201 let state = match (
202 record.is_reverse(),
203 record.is_unmapped(),
204 record.is_secondary(),
205 record.is_supplementary(),
206 ) {
207 (false, true, false, false) => ReadState::Unmapped,
208 (true, false, false, false) => ReadState::PrimaryRev,
209 (true, false, true, false) => ReadState::SecondaryRev,
210 (false, false, true, false) => ReadState::SecondaryFwd,
211 (true, false, false, true) => ReadState::SupplementaryRev,
212 (false, false, false, true) => ReadState::SupplementaryFwd,
213 (false, false, false, false) => ReadState::PrimaryFwd,
214 _ => {
215 return Err(Error::UnknownAlignState(format!(
216 "invalid flag combination! read_id: {read_id}",
217 )));
218 }
219 };
220 Ok(CurrRead::<OnlyAlignData> {
221 state,
222 read_id,
223 seq_len: None,
224 align_len: None,
225 mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
226 contig_id_and_start: None,
227 contig_name: None,
228 mod_base_qual_thres: 0,
229 marker: std::marker::PhantomData::<OnlyAlignData>,
230 })
231 }
232
233 /// Runs [`CurrRead<NoData>::try_from_only_alignment_seq_len_optional`], forcing sequence
234 /// length retrieval. See comments there and the comments below.
235 ///
236 /// Some BAM records have zero-length sequence fields i.e. marked by a '*'.
237 /// This may be intentional e.g. a secondary alignment has the same sequence
238 /// as a corresponding primary alignment and repeating a sequence is not space-efficient.
239 /// Or it may be intentional due to other reasons.
240 /// For modification processing, we cannot deal with these records as we have to match
241 /// sequences across different records.
242 /// So, our easy-to-use-function here forces sequence length retrieval and will fail
243 /// if zero length sequences are found.
244 ///
245 /// Another function below [`CurrRead<NoData>::try_from_only_alignment_zero_seq_len`], allows
246 /// zero length sequences through and can be used if zero length sequences are really
247 /// needed. In such a scenario, the user has to carefully watch for errors.
248 /// So we discourage its use unless really necessary.
249 ///
250 /// # Errors
251 pub fn try_from_only_alignment(
252 self,
253 record: &Record,
254 ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
255 self.try_from_only_alignment_seq_len_optional(record, true)
256 }
257
258 /// Runs [`CurrRead<NoData>::try_from_only_alignment_seq_len_optional`], avoiding sequence
259 /// length retrieval and setting it to zero. See comments there and the comments below.
260 ///
261 /// See notes on [`CurrRead<NoData>::try_from_only_alignment`].
262 /// Use of this function is discouraged unless really necessary as we cannot parse
263 /// modification information from zero-length sequences without errors.
264 ///
265 /// # Errors
266 pub fn try_from_only_alignment_zero_seq_len(
267 self,
268 record: &Record,
269 ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
270 self.try_from_only_alignment_seq_len_optional(record, false)
271 }
272
273 /// Uses only alignment information and no modification information to
274 /// create the struct. Use this if you want to perform operations that
275 /// do not involve reading or manipulating the modification data.
276 /// If `is_seq_len_non_zero` is set to false, then sequence length is
277 /// not retrieved and is set to zero.
278 ///
279 /// # Errors
280 /// Upon failure in retrieving record information.
281 pub fn try_from_only_alignment_seq_len_optional(
282 self,
283 record: &Record,
284 is_seq_len_non_zero: bool,
285 ) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
286 let mut curr_read_state = CurrRead::default().set_read_state_and_id(record)?;
287 if !curr_read_state.read_state().is_unmapped() {
288 curr_read_state = curr_read_state
289 .set_align_len(record)?
290 .set_contig_id_and_start(record)?
291 .set_contig_name(record)?;
292 }
293 if is_seq_len_non_zero {
294 curr_read_state = curr_read_state.set_seq_len(record)?;
295 } else {
296 curr_read_state.seq_len = Some(0u64);
297 }
298 let CurrRead::<OnlyAlignData> {
299 state,
300 read_id,
301 seq_len,
302 align_len,
303 contig_id_and_start,
304 contig_name,
305 ..
306 } = curr_read_state;
307 Ok(CurrRead::<OnlyAlignDataComplete> {
308 state,
309 read_id,
310 seq_len,
311 align_len,
312 mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
313 contig_id_and_start,
314 contig_name,
315 mod_base_qual_thres: 0,
316 marker: std::marker::PhantomData::<OnlyAlignDataComplete>,
317 })
318 }
319}
320
321impl<S: CurrReadStateWithAlign + CurrReadState> CurrRead<S> {
322 /// gets the read state
323 #[must_use]
324 pub fn read_state(&self) -> ReadState {
325 self.state
326 }
327 /// set length of sequence from BAM record
328 ///
329 /// # Errors
330 /// Errors are returned if sequence length is already set or
331 /// sequence length is not non-zero.
332 ///
333 /// ```
334 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
335 /// use rust_htslib::bam::Read;
336 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
337 /// let mut count = 0;
338 /// for record in reader.records(){
339 /// let r = record?;
340 /// let curr_read = CurrRead::default().set_read_state_and_id(&r)?.set_seq_len(&r)?;
341 /// let Ok(len) = curr_read.seq_len() else { unreachable!() };
342 /// match count {
343 /// 0 => assert_eq!(len, 8),
344 /// 1 => assert_eq!(len, 48),
345 /// 2 => assert_eq!(len, 33),
346 /// 3 => assert_eq!(len, 48),
347 /// _ => unreachable!(),
348 /// }
349 /// count = count + 1;
350 /// }
351 /// # Ok::<(), Error>(())
352 /// ```
353 ///
354 /// If we call the method twice, we should hit a panic
355 /// ```should_panic
356 /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
357 /// # use rust_htslib::bam::Read;
358 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
359 /// for record in reader.records(){
360 /// let r = record?;
361 /// let curr_read = CurrRead::default().set_read_state_and_id(&r)?
362 /// .set_seq_len(&r)?.set_seq_len(&r)?;
363 /// break;
364 /// }
365 /// # Ok::<(), Error>(())
366 /// ```
367 pub fn set_seq_len(mut self, record: &Record) -> Result<Self, Error> {
368 self.seq_len = match self.seq_len {
369 Some(_) => {
370 return Err(Error::InvalidDuplicates(format!(
371 "cannot set sequence length again! read_id: {}",
372 self.read_id()
373 )));
374 }
375 None => match record.seq_len() {
376 0 => {
377 return Err(Error::ZeroSeqLen(format!(
378 "avoid including 0-len sequences while parsing mod data in this program, read_id: {}",
379 self.read_id()
380 )));
381 }
382 l => Some(u64::try_from(l)?),
383 },
384 };
385 Ok(self)
386 }
387 /// gets length of sequence
388 ///
389 /// # Errors
390 /// Error if sequence length is not set
391 pub fn seq_len(&self) -> Result<u64, Error> {
392 self.seq_len.ok_or(Error::UnavailableData(format!(
393 "seq len not available, read_id: {}",
394 self.read_id()
395 )))
396 }
397 /// set alignment length from BAM record if available
398 ///
399 /// # Errors
400 /// Returns errors if alignment len is already set, instance is
401 /// unmapped, or if alignment coordinates are malformed
402 /// (e.g. end < start).
403 pub fn set_align_len(mut self, record: &Record) -> Result<Self, Error> {
404 self.align_len = match self.align_len {
405 Some(_) => Err(Error::InvalidDuplicates(format!(
406 "cannot set alignment length again! read_id: {}",
407 self.read_id()
408 ))),
409 None => {
410 if self.read_state().is_unmapped() {
411 Err(Error::Unmapped(format!(
412 "cannot set alignment properties for unmapped reads, read_id: {}",
413 self.read_id()
414 )))
415 } else {
416 // NOTE: right now, I don't know of a way to test the error below
417 // as rust htslib initializes an empty record with an alignment
418 // length of 1 (see the code below). This is only a note about
419 // the error variant, not the normal function of this code block
420 // which is fine.
421 // ```
422 // use rust_htslib::bam::ext::BamRecordExtensions;
423 // let r = Record::new();
424 // assert_eq!(r.seq_len(), 0);
425 // assert_eq!(r.pos(), 0);
426 // assert_eq!(r.reference_end(), 1);
427 // ```
428 let st = record.pos();
429 let en = record.reference_end();
430 if en > st && st >= 0 {
431 #[expect(
432 clippy::arithmetic_side_effects,
433 reason = "en > st && st >= 0 guarantee no i64 overflows"
434 )]
435 #[expect(
436 clippy::missing_panics_doc,
437 reason = "en > st && st >= 0 guarantee no panic"
438 )]
439 Ok(Some(
440 (en - st)
441 .try_into()
442 .expect("en>st && st>=0 guarantee no problems i64->u64"),
443 ))
444 } else {
445 Err(Error::InvalidAlignLength(format!(
446 "in `set_align_len`, start: {st}, end: {en} invalid! i.e. en <= st or st < 0, read_id: {}",
447 self.read_id()
448 )))
449 }
450 }
451 }
452 }?;
453 Ok(self)
454 }
455 /// gets alignment length
456 ///
457 /// # Errors
458 /// if instance is unmapped or alignment length is not set
459 pub fn align_len(&self) -> Result<u64, Error> {
460 if self.read_state().is_unmapped() {
461 Err(Error::Unmapped(format!(
462 "request alignment data on unmapped, read_id: {}",
463 self.read_id()
464 )))
465 } else {
466 self.align_len.ok_or(Error::UnavailableData(format!(
467 "alignment data not available, read_id: {}",
468 self.read_id()
469 )))
470 }
471 }
472 /// sets contig ID and start from BAM record if available
473 ///
474 /// # Errors
475 /// if instance is unmapped, if these data are already set and
476 /// the user is trying to set them again, or if coordinates
477 /// are malformed (start position is negative)
478 ///
479 /// ```
480 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
481 /// use rust_htslib::bam::Read;
482 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
483 /// let mut count = 0;
484 /// for record in reader.records(){
485 /// let r = record?;
486 /// let curr_read =
487 /// CurrRead::default().set_read_state_and_id(&r)?.set_contig_id_and_start(&r)?;
488 /// match (count, curr_read.contig_id_and_start()) {
489 /// (0, Ok((0, 9))) |
490 /// (1, Ok((2, 23))) |
491 /// (2, Ok((1, 3))) => {},
492 /// _ => unreachable!(),
493 /// }
494 /// count = count + 1;
495 /// if count == 3 { break; } // the fourth entry is unmapped, and will lead to an error.
496 /// }
497 /// # Ok::<(), Error>(())
498 /// ```
499 ///
500 /// If we call the method on an unmapped read, we should see an error.
501 /// ```should_panic
502 /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
503 /// # use rust_htslib::bam::Read;
504 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
505 /// let mut count = 0;
506 /// for record in reader.records(){
507 /// let r = record?;
508 /// if count < 3 {
509 /// count = count + 1;
510 /// continue;
511 /// }
512 /// // the fourth read is unmapped
513 /// let curr_read =
514 /// CurrRead::default().set_read_state_and_id(&r)?.set_contig_id_and_start(&r)?;
515 /// }
516 /// # Ok::<(), Error>(())
517 /// ```
518 ///
519 /// If we call the method twice, we should hit a panic
520 /// ```should_panic
521 /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
522 /// # use rust_htslib::bam::Read;
523 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
524 /// for record in reader.records(){
525 /// let r = record?;
526 /// let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?
527 /// .set_contig_id_and_start(&r)?.set_contig_id_and_start(&r)?;
528 /// break;
529 /// }
530 /// # Ok::<(), Error>(())
531 /// ```
532 pub fn set_contig_id_and_start(mut self, record: &Record) -> Result<Self, Error> {
533 self.contig_id_and_start = match self.contig_id_and_start {
534 Some(_) => Err(Error::InvalidDuplicates(format!(
535 "cannot set contig and start again! read_id: {}",
536 self.read_id()
537 ))),
538 None => {
539 if self.read_state().is_unmapped() {
540 Err(Error::Unmapped(format!(
541 "setting alignment data on unmapped read, read_id: {}",
542 self.read_id()
543 )))
544 } else {
545 Ok(Some((record.tid(), record.pos().try_into()?)))
546 }
547 }
548 }?;
549 Ok(self)
550 }
551 /// gets contig ID and start
552 ///
553 /// # Errors
554 /// If instance is unmapped or if data (contig id and start) are not set
555 pub fn contig_id_and_start(&self) -> Result<(i32, u64), Error> {
556 if self.read_state().is_unmapped() {
557 Err(Error::Unmapped(format!(
558 "requesting alignment data on unmapped read, read_id: {}",
559 self.read_id()
560 )))
561 } else {
562 self.contig_id_and_start
563 .ok_or(Error::UnavailableData(format!(
564 "contig id, start not available, read_id: {}",
565 self.read_id()
566 )))
567 }
568 }
569 /// sets contig name
570 ///
571 /// # Errors
572 /// Returns error if instance is unmapped or contig name has already been set
573 ///
574 /// ```
575 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
576 /// use rust_htslib::bam::Read;
577 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
578 /// let mut count = 0;
579 /// for record in reader.records(){
580 /// let r = record?;
581 /// let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?.set_contig_name(&r)?;
582 /// let Ok(contig_name) = curr_read.contig_name() else {unreachable!()};
583 /// match (count, contig_name) {
584 /// (0, "dummyI") |
585 /// (1, "dummyIII") |
586 /// (2, "dummyII") => {},
587 /// _ => unreachable!(),
588 /// }
589 /// count = count + 1;
590 /// if count == 3 { break; } // the fourth entry is unmapped, and will lead to an error.
591 /// }
592 /// # Ok::<(), Error>(())
593 /// ```
594 ///
595 /// If we try to set contig name on an unmapped read, we will get an error
596 ///
597 /// ```should_panic
598 /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
599 /// # use rust_htslib::bam::Read;
600 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
601 /// let mut count = 0;
602 /// for record in reader.records(){
603 /// if count < 3 {
604 /// count = count + 1;
605 /// continue;
606 /// }
607 /// let r = record?;
608 /// let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?;
609 /// curr_read.set_contig_name(&r)?;
610 /// }
611 /// # Ok::<(), Error>(())
612 /// ```
613 ///
614 /// If we call the method twice, we should hit a panic
615 /// ```should_panic
616 /// # use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
617 /// # use rust_htslib::bam::Read;
618 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
619 /// for record in reader.records(){
620 /// let r = record?;
621 /// let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?
622 /// .set_contig_name(&r)?.set_contig_name(&r)?;
623 /// break;
624 /// }
625 /// # Ok::<(), Error>(())
626 /// ```
627 pub fn set_contig_name(mut self, record: &Record) -> Result<Self, Error> {
628 self.contig_name = match (self.read_state().is_unmapped(), self.contig_name.is_none()) {
629 (true, _) => Err(Error::Unmapped(format!(
630 "set align data on unmapped read! read_id: {}",
631 self.read_id()
632 ))),
633 (_, false) => Err(Error::InvalidDuplicates(format!(
634 "cannot set contig name again! read_id: {}",
635 self.read_id()
636 ))),
637 (_, true) => Ok(Some(String::from(record.contig()))),
638 }?;
639 Ok(self)
640 }
641 /// gets contig name
642 ///
643 /// # Errors
644 /// If instance is unmapped or contig name has not been set
645 pub fn contig_name(&self) -> Result<&str, Error> {
646 match (self.read_state().is_unmapped(), self.contig_name.as_ref()) {
647 (true, _) => Err(Error::Unmapped(format!(
648 "get align data on unmapped read, read_id: {}",
649 self.read_id()
650 ))),
651 (_, None) => Err(Error::UnavailableData(format!(
652 "contig name unavailable, read_id: {}",
653 self.read_id()
654 ))),
655 (_, Some(v)) => Ok(v.as_str()),
656 }
657 }
658 /// gets read id
659 #[must_use]
660 pub fn read_id(&self) -> &str {
661 self.read_id.as_str()
662 }
663 /// Returns the character corresponding to the strand
664 ///
665 /// ```
666 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
667 /// use rust_htslib::bam::Read;
668 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
669 /// let mut count = 0;
670 /// for record in reader.records(){
671 /// let r = record?;
672 /// let mut curr_read = CurrRead::default().set_read_state_and_id(&r)?;
673 /// let strand = curr_read.strand() else { unreachable!() };
674 /// match (count, strand) {
675 /// (0, '+') | (1, '+') | (2, '-') | (3, '.') => {},
676 /// _ => unreachable!(),
677 /// }
678 /// count = count + 1;
679 /// }
680 /// # Ok::<(), Error>(())
681 /// ```
682 #[must_use]
683 pub fn strand(&self) -> char {
684 self.read_state().strand()
685 }
686 /// Returns read sequence overlapping with a genomic region
687 ///
688 /// # Errors
689 /// If getting sequence coordinates from reference coordinates fails, see
690 /// [`CurrRead::seq_and_qual_on_ref_coords`]
691 ///
692 /// # Example
693 ///
694 /// ```
695 /// use bedrs::Bed3;
696 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
697 /// use rust_htslib::bam::Read;
698 ///
699 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
700 /// let mut count = 0;
701 /// for record in reader.records() {
702 /// let r = record?;
703 /// let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
704 ///
705 /// // Skip unmapped reads
706 /// if !curr_read.read_state().is_unmapped() {
707 /// let (contig_id, start) = curr_read.contig_id_and_start()?;
708 /// let align_len = curr_read.align_len()?;
709 ///
710 /// // Create a region that overlaps with the read but is short of one bp.
711 /// // Note that this BAM file has reads with all bases matching perfectly
712 /// // with the reference.
713 /// let region = Bed3::new(contig_id, start, start + align_len - 1);
714 /// let seq_subset = curr_read.seq_on_ref_coords(&r, ®ion)?;
715 ///
716 /// // Check for sequence length match
717 /// assert_eq!(curr_read.seq_len()? - 1, u64::try_from(seq_subset.len())?);
718 ///
719 /// // Create a region with no overlap at all and check we get no data
720 /// let region = Bed3::new(contig_id, start + align_len, start + align_len + 2);
721 /// match curr_read.seq_on_ref_coords(&r, ®ion){
722 /// Err(Error::UnavailableData(_)) => (),
723 /// _ => unreachable!(),
724 /// };
725 ///
726 /// }
727 /// count += 1;
728 /// }
729 /// # Ok::<(), Error>(())
730 /// ```
731 pub fn seq_on_ref_coords(
732 &self,
733 record: &Record,
734 region: &Bed3<i32, u64>,
735 ) -> Result<Vec<u8>, Error> {
736 Ok(self
737 .seq_and_qual_on_ref_coords(record, region)?
738 .into_iter()
739 .map(|x| x.map_or(b'.', |y| y.1))
740 .collect::<Vec<u8>>())
741 }
742 /// Returns match-or-mismatch, read sequence, base-quality values overlapping with genomic region.
743 ///
744 /// Returns a vector of Options:
745 /// * is a `None` at a deletion i.e. a base on the reference and not on the read.
746 /// * is a `Some(bool, u8, u8)` at a match/mismatch/insertion.
747 /// The first `u8` is the base itself and the `bool` is `true` if a match/mismatch
748 /// and `false` if an insertion, and the second `u8` is the base quality (0-93),
749 /// which the BAM format sets to 255 if no quality is available for the entire read.
750 ///
751 /// Because sequences are encoded using 4-bit values into a `[u8]`, we need to use
752 /// `rust-htslib` functions to convert them into 8-bit values and then use
753 /// the `Index<usize>` trait on sequences from `rust-htslib`.
754 ///
755 /// # Errors
756 /// If the read does not intersect with the specified region, see
757 /// [`CurrRead::seq_coords_from_ref_coords`]
758 ///
759 /// # Example
760 ///
761 /// Example 1
762 ///
763 /// ```
764 /// use bedrs::Bed3;
765 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
766 /// use rust_htslib::bam::Read;
767 ///
768 /// let mut reader = nanalogue_bam_reader(&"examples/example_5_valid_basequal.sam")?;
769 /// for record in reader.records() {
770 /// let r = record?;
771 /// let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
772 ///
773 /// let region = Bed3::new(0, 0, 20);
774 /// let seq_subset = curr_read.seq_and_qual_on_ref_coords(&r, ®ion)?;
775 /// assert_eq!(seq_subset, [Some((true, b'T', 32)), Some((true, b'C', 0)),
776 /// Some((true, b'G', 69)), Some((true, b'T', 80)), Some((true, b'T', 79)),
777 /// Some((true, b'T', 81)), Some((true, b'C', 29)), Some((true, b'T', 30))]);
778 ///
779 /// // Create a region with no overlap at all and check we get no data
780 /// let region = Bed3::new(0, 20, 22);
781 /// match curr_read.seq_and_qual_on_ref_coords(&r, ®ion){
782 /// Err(Error::UnavailableData(_)) => (),
783 /// _ => unreachable!(),
784 /// };
785 ///
786 /// }
787 /// # Ok::<(), Error>(())
788 /// ```
789 ///
790 /// Example 2
791 ///
792 /// ```
793 /// use bedrs::Bed3;
794 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
795 /// use rust_htslib::bam::Read;
796 ///
797 /// let mut reader = nanalogue_bam_reader(&"examples/example_7.sam")?;
798 /// for record in reader.records() {
799 /// let r = record?;
800 /// let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
801 ///
802 /// let region = Bed3::new(0, 0, 20);
803 /// let seq_subset = curr_read.seq_and_qual_on_ref_coords(&r, ®ion)?;
804 /// assert_eq!(seq_subset, [Some((true, b'T', 32)), None, None,
805 /// Some((false, b'A', 0)), Some((true, b'T', 0)), Some((true, b'T', 79)),
806 /// Some((true, b'T', 81)), Some((true, b'G', 29)), Some((true, b'T', 30))]);
807 ///
808 /// }
809 /// # Ok::<(), Error>(())
810 /// ```
811 #[expect(
812 clippy::indexing_slicing,
813 reason = "qual, seq same len and coords will not exceed these"
814 )]
815 #[expect(
816 clippy::type_complexity,
817 reason = "not complex enough, I think types will make this less clear"
818 )]
819 pub fn seq_and_qual_on_ref_coords(
820 &self,
821 record: &Record,
822 region: &Bed3<i32, u64>,
823 ) -> Result<Vec<Option<(bool, u8, u8)>>, Error> {
824 let seq = record.seq();
825 let qual = record.qual();
826 Ok(self
827 .seq_coords_from_ref_coords(record, region)?
828 .into_iter()
829 .map(|x| x.map(|y| (y.0, seq[y.1], qual[y.1])))
830 .collect::<Vec<Option<(bool, u8, u8)>>>())
831 }
832 /// Extract sequence coordinates corresponding to a region on the reference genome.
833 ///
834 /// The vector we return contains `Some((bool, usize))` entries where both the reference and the read
835 /// have bases, and `None` where bases from the reference are missing on the read:
836 /// * matches or mismatches, we record the coordinate.
837 /// so SNPs for example (i.e. a 1 bp difference from the ref) will show up as `Some((true, _))`.
838 /// * a deletion or a ref skip ("N" in cigar) will show up as `None`.
839 /// * insertions are preserved i.e. bases in the middle of a read present
840 /// on the read but not on the reference are `Some((false, _))`
841 /// * clipped bases at the end of the read are not preserved. These are bases
842 /// on the read but not on the reference and are denoted as soft or hard
843 /// clips on the CIGAR string e.g. barcodes from sequencing
844 /// * some CIGAR combinations are illogical and we are assuming they do not happen
845 /// e.g. a read's CIGAR can end with, say 10D20S, this means last 10 bp
846 /// are in a deletion and the next 20 are a softclip. This is illogical,
847 /// as they must be combined into a 30-bp softclip i.e. 30S. So if the
848 /// aligner produces such illogical states, then the sequence coordinates reported
849 /// here may be erroneous.
850 ///
851 /// If the read does not intersect with the region, we return an `Error` (see below).
852 /// If the read does intersect with the region but we cannot retrieve any bases,
853 /// we return an empty vector (I am not sure if we will run into this scenario).
854 ///
855 /// # Errors
856 /// If the read does not intersect with the specified region, or if `usize` conversions fail.
857 ///
858 /// # Example
859 ///
860 /// ```
861 /// use bedrs::Bed3;
862 /// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
863 /// use rust_htslib::bam::Read;
864 ///
865 /// let mut reader = nanalogue_bam_reader(&"examples/example_7.sam")?;
866 /// for record in reader.records() {
867 /// let r = record?;
868 /// let curr_read = CurrRead::default().try_from_only_alignment(&r)?;
869 ///
870 /// let region = Bed3::new(0, 9, 13);
871 /// let seq_subset = curr_read.seq_coords_from_ref_coords(&r, ®ion)?;
872 /// // there are deletions on the read above
873 /// assert_eq!(seq_subset, vec![Some((true, 0)), None, None, Some((false, 1)), Some((true, 2))]);
874 ///
875 /// // Create a region with no overlap at all and check we get no data
876 /// let region = Bed3::new(0, 20, 22);
877 /// match curr_read.seq_coords_from_ref_coords(&r, ®ion){
878 /// Err(Error::UnavailableData(_)) => (),
879 /// _ => unreachable!(),
880 /// };
881 ///
882 /// }
883 /// # Ok::<(), Error>(())
884 pub fn seq_coords_from_ref_coords(
885 &self,
886 record: &Record,
887 region: &Bed3<i32, u64>,
888 ) -> Result<Vec<Option<(bool, usize)>>, Error> {
889 #[expect(
890 clippy::missing_panics_doc,
891 reason = "genomic coordinates are far less than (2^64-1)/2 i.e. u64->i64 should be ok"
892 )]
893 let interval = {
894 region
895 .intersect(&StrandedBed3::<i32, u64>::try_from(self)?)
896 .and_then(|v| {
897 let start = i64::try_from(v.start()).expect("genomic coordinates << 2^63");
898 let end = i64::try_from(v.end()).expect("genomic coordinates << 2^63");
899 (start < end).then_some(start..end)
900 })
901 .ok_or(Error::UnavailableData(
902 "coord-retrieval: region does not intersect with read".to_owned(),
903 ))
904 }?;
905
906 // Initialize coord calculation.
907 // We don't know how long the subset will be, we initialize with a guess
908 // of 2 * interval size
909 #[expect(
910 clippy::arithmetic_side_effects,
911 reason = "genomic coordinates far less than i64::MAX (approx (2^64-1)/2)"
912 )]
913 let mut s: Vec<Option<(bool, usize)>> =
914 Vec::with_capacity(usize::try_from(2 * (interval.end - interval.start))?);
915
916 // we may have to trim the sequence if we hit a bunch of unaligned base
917 // pairs right at the end e.g. a softclip.
918 let mut trim_end_bp: u64 = 0;
919
920 for w in record
921 .aligned_pairs_full()
922 .skip_while(|x| x[1].is_none_or(|y| !interval.contains(&y)))
923 .take_while(|x| x[1].is_none_or(|y| interval.contains(&y)))
924 {
925 #[expect(
926 clippy::arithmetic_side_effects,
927 reason = "coordinates far less than u64::MAX (2^64-1) so no chance of counter overflow"
928 )]
929 match w {
930 [Some(x), Some(_)] => {
931 s.push(Some((true, usize::try_from(x)?)));
932 trim_end_bp = 0;
933 }
934 [Some(x), None] => {
935 s.push(Some((false, usize::try_from(x)?)));
936 trim_end_bp += 1;
937 }
938 [None, Some(_)] => {
939 s.push(None);
940 trim_end_bp = 0;
941 }
942 [None, None] => unreachable!(
943 "impossible to find bases that are neither on the read nor on the reference"
944 ),
945 }
946 }
947
948 // if last few bp in sequence are all unmapped, we remove them here.
949 for _ in 0..trim_end_bp {
950 let Some(_) = s.pop() else {
951 unreachable!("trim_end_bp cannot exceed length of s")
952 };
953 }
954
955 // Trim excess allocated capacity and return
956 s.shrink_to(0);
957 Ok(s)
958 }
959 /// sets modification data using the BAM record
960 ///
961 /// # Errors
962 /// If tags in the BAM record containing the modification information (MM, ML)
963 /// contain mistakes.
964 pub fn set_mod_data(
965 self,
966 record: &Record,
967 mod_thres: ThresholdState,
968 min_qual: u8,
969 ) -> Result<CurrRead<AlignAndModData>, Error> {
970 let result = nanalogue_mm_ml_parser(
971 record,
972 |x| mod_thres.contains(x),
973 |_| true,
974 |_, _, _| true,
975 min_qual,
976 )?;
977 Ok(CurrRead::<AlignAndModData> {
978 state: self.state,
979 read_id: self.read_id,
980 seq_len: self.seq_len,
981 align_len: self.align_len,
982 mods: (result, mod_thres),
983 contig_id_and_start: self.contig_id_and_start,
984 contig_name: self.contig_name,
985 mod_base_qual_thres: min_qual,
986 marker: std::marker::PhantomData::<AlignAndModData>,
987 })
988 }
989 /// sets modification data using BAM record but restricted to the specified filters
990 ///
991 /// # Errors
992 /// If tags in the BAM record containing the modification information (MM, ML)
993 /// contain mistakes.
994 pub fn set_mod_data_restricted<G, H>(
995 self,
996 record: &Record,
997 mod_thres: ThresholdState,
998 mod_fwd_pos_filter: G,
999 mod_filter_base_strand_tag: H,
1000 min_qual: u8,
1001 ) -> Result<CurrRead<AlignAndModData>, Error>
1002 where
1003 G: Fn(&usize) -> bool,
1004 H: Fn(&u8, &char, &ModChar) -> bool,
1005 {
1006 let result = nanalogue_mm_ml_parser(
1007 record,
1008 |x| mod_thres.contains(x),
1009 mod_fwd_pos_filter,
1010 mod_filter_base_strand_tag,
1011 min_qual,
1012 )?;
1013 Ok(CurrRead::<AlignAndModData> {
1014 state: self.state,
1015 read_id: self.read_id,
1016 seq_len: self.seq_len,
1017 align_len: self.align_len,
1018 mods: (result, mod_thres),
1019 contig_id_and_start: self.contig_id_and_start,
1020 contig_name: self.contig_name,
1021 mod_base_qual_thres: min_qual,
1022 marker: std::marker::PhantomData::<AlignAndModData>,
1023 })
1024 }
1025}
1026
1027impl CurrRead<OnlyAlignDataComplete> {
1028 /// sets modification data using BAM record but with restrictions
1029 /// applied by the [`crate::InputMods`] options for example.
1030 ///
1031 /// # Errors
1032 /// If a region filter is specified and we fail to convert current instance to Bed,
1033 /// and if parsing the MM/ML BAM tags fails (presumably because they are malformed).
1034 #[expect(
1035 clippy::missing_panics_doc,
1036 reason = "integer conversions (u64->usize, u64->i64) are expected to not fail as \
1037genomic coordinates are far smaller than ~2^63"
1038 )]
1039 pub fn set_mod_data_restricted_options<S: InputModOptions + InputRegionOptions>(
1040 self,
1041 record: &Record,
1042 mod_options: &S,
1043 ) -> Result<CurrRead<AlignAndModData>, Error> {
1044 let l = usize::try_from(self.seq_len().expect("no error"))
1045 .expect("bit conversion errors unlikely");
1046 let w = mod_options.trim_read_ends_mod();
1047 let interval = if let Some(bed3) = mod_options.region_filter().as_ref() {
1048 let stranded_bed3 = StrandedBed3::<i32, u64>::try_from(&self)?;
1049 if let Some(v) = bed3.intersect(&stranded_bed3) {
1050 if v.start() == stranded_bed3.start() && v.end() == stranded_bed3.end() {
1051 None
1052 } else {
1053 Some(v.start()..v.end())
1054 }
1055 } else {
1056 Some(0..0)
1057 }
1058 } else {
1059 None
1060 };
1061 Ok({
1062 let mut read = self.set_mod_data_restricted(
1063 record,
1064 mod_options.mod_prob_filter(),
1065 |x| w == 0 || (w..l.checked_sub(w).unwrap_or_default()).contains(x),
1066 |_, &s, &t| {
1067 mod_options.tag().is_none_or(|x| x == t)
1068 && mod_options.mod_strand().is_none_or(|v| s == char::from(v))
1069 },
1070 mod_options.base_qual_filter_mod(),
1071 )?;
1072 if let Some(v) = interval {
1073 match v.start.cmp(&v.end) {
1074 Ordering::Less => read.filter_by_ref_pos(
1075 i64::try_from(v.start)
1076 .expect("no error as genomic coordinates far less than ~2^63"),
1077 i64::try_from(v.end)
1078 .expect("no error as genomic coordinates far less than ~2^63"),
1079 )?,
1080 Ordering::Equal => read.filter_by_ref_pos(i64::MAX - 1, i64::MAX)?,
1081 Ordering::Greater => {
1082 unreachable!("`bedrs` should not allow malformed intervals!")
1083 }
1084 }
1085 }
1086 read
1087 })
1088 }
1089}
1090
1091impl CurrRead<AlignAndModData> {
1092 /// gets modification data
1093 #[must_use]
1094 pub fn mod_data(&self) -> &(BaseMods, ThresholdState) {
1095 &self.mods
1096 }
1097 /// window modification data with restrictions.
1098 /// If a read has the same modification on both the basecalled
1099 /// strand and its complement, then windows along both are returned.
1100 ///
1101 /// If `win_size` exceeds the modification data length, no windows are produced.
1102 ///
1103 /// # Errors
1104 /// Returns an error if the window function returns an error.
1105 #[expect(
1106 clippy::pattern_type_mismatch,
1107 reason = "suggested notation is verbose but I am not sure"
1108 )]
1109 pub fn windowed_mod_data_restricted<F>(
1110 &self,
1111 window_function: &F,
1112 win_options: InputWindowing,
1113 tag: ModChar,
1114 ) -> Result<Vec<F32Bw0and1>, Error>
1115 where
1116 F: Fn(&[u8]) -> Result<F32Bw0and1, Error>,
1117 {
1118 let win_size = win_options.win.get();
1119 let slide_size = win_options.step.get();
1120 let mut result = Vec::<F32Bw0and1>::new();
1121 let tag_char = tag.val();
1122 let (BaseMods { base_mods: v }, _) = &self.mods;
1123
1124 // we make a few assumptions below:
1125 // * data is sorted by coordinate along sequence
1126 // * illegal types like strand not '+' or '-', or multiple entries
1127 // corresponding to the same modification strand combination
1128 // e.g. C+m occuring twice.
1129 // we control data flow into CurrRead, checking these do not happen
1130 // during ingress. To be future-proof etc., we should check these things
1131 // here but we do not as there is no way right now to test error checking
1132 // as there is no way to make CurrRead fall into these illegal states.
1133 #[expect(
1134 clippy::missing_panics_doc,
1135 reason = "checked_sub ensures win_size <= mod_data.len() before windowing"
1136 )]
1137 for k in v {
1138 match k {
1139 BaseMod {
1140 modification_type: x,
1141 ranges: track,
1142 ..
1143 } if *x == tag_char => {
1144 let mod_data = track.qual();
1145 if let Some(l) = mod_data.len().checked_sub(win_size) {
1146 result.extend(
1147 (0..=l)
1148 .step_by(slide_size)
1149 .map(|i| {
1150 window_function(
1151 mod_data
1152 .get(i..)
1153 .expect("i <= len - win_size")
1154 .get(0..win_size)
1155 .expect("checked len>=win_size so no error"),
1156 )
1157 })
1158 .collect::<Result<Vec<F32Bw0and1>, _>>()?,
1159 );
1160 }
1161 }
1162 _ => {}
1163 }
1164 }
1165 Ok(result)
1166 }
1167 /// Performs a count of number of bases per modified type.
1168 /// Note that this result depends on the type of filtering done
1169 /// while the struct was created e.g. by modification threshold.
1170 ///
1171 /// # Panics
1172 /// Panics if the number of modifications exceeds `u32::MAX` (approximately 4.2 billion).
1173 ///
1174 /// ```
1175 /// use nanalogue_core::{CurrRead, Error, ModChar, nanalogue_bam_reader, ThresholdState};
1176 /// use rust_htslib::bam::Read;
1177 /// use std::collections::HashMap;
1178 /// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
1179 /// let mut count = 0;
1180 /// for record in reader.records(){
1181 /// let r = record?;
1182 /// let curr_read = CurrRead::default().set_read_state_and_id(&r)?
1183 /// .set_mod_data(&r, ThresholdState::GtEq(180), 0)?;
1184 /// let mod_count = curr_read.base_count_per_mod();
1185 /// let zero_count = HashMap::from([(ModChar::new('T'), 0)]);
1186 /// let a = HashMap::from([(ModChar::new('T'), 3)]);
1187 /// let b = HashMap::from([(ModChar::new('T'), 1)]);
1188 /// let c = HashMap::from([(ModChar::new('T'), 3),(ModChar::new('á° '), 0)]);
1189 /// match (count, mod_count) {
1190 /// (0, v) => assert_eq!(v, zero_count),
1191 /// (1, v) => assert_eq!(v, a),
1192 /// (2, v) => assert_eq!(v, b),
1193 /// (3, v) => assert_eq!(v, c),
1194 /// _ => unreachable!(),
1195 /// }
1196 /// count = count + 1;
1197 /// }
1198 /// # Ok::<(), Error>(())
1199 /// ```
1200 #[must_use]
1201 pub fn base_count_per_mod(&self) -> HashMap<ModChar, u32> {
1202 let mut output = HashMap::<ModChar, u32>::new();
1203 #[expect(
1204 clippy::arithmetic_side_effects,
1205 reason = "u32::MAX approx 4.2 Gb, v unlikely 1 molecule is this modified"
1206 )]
1207 for k in &self.mod_data().0.base_mods {
1208 let base_count =
1209 u32::try_from(k.ranges.annotations.len()).expect("number conversion error");
1210 let _: &mut u32 = output
1211 .entry(ModChar::new(k.modification_type))
1212 .and_modify(|e| *e += base_count)
1213 .or_insert(base_count);
1214 }
1215 output
1216 }
1217}
1218
1219/// To format and display modification data in a condensed manner.
1220trait DisplayCondensedModData {
1221 /// Outputs mod data section in a condensed manner.
1222 fn mod_data_section(&self) -> Result<String, fmt::Error>;
1223}
1224
1225/// No mod data means no display is produced
1226impl<S> DisplayCondensedModData for CurrRead<S>
1227where
1228 S: CurrReadStateOnlyAlign + CurrReadState,
1229{
1230 fn mod_data_section(&self) -> Result<String, fmt::Error> {
1231 Ok(String::new())
1232 }
1233}
1234
1235/// Implements display when mod data is available
1236impl DisplayCondensedModData for CurrRead<AlignAndModData> {
1237 #[expect(
1238 clippy::pattern_type_mismatch,
1239 reason = "suggested notation is verbose but I am not sure"
1240 )]
1241 fn mod_data_section(&self) -> Result<String, fmt::Error> {
1242 let mut mod_count_str = String::new();
1243 let (BaseMods { base_mods: v }, w) = &self.mods;
1244 for k in v {
1245 write!(
1246 mod_count_str,
1247 "{}{}{}:{};",
1248 k.modified_base as char,
1249 k.strand,
1250 ModChar::new(k.modification_type),
1251 k.ranges.annotations.len()
1252 )?;
1253 }
1254 if mod_count_str.is_empty() {
1255 write!(mod_count_str, "NA")?;
1256 } else {
1257 write!(
1258 mod_count_str,
1259 "({}, PHRED base qual >= {})",
1260 w, self.mod_base_qual_thres
1261 )?;
1262 }
1263 Ok(format!(",\n\t\"mod_count\": \"{mod_count_str}\""))
1264 }
1265}
1266
1267impl<S> fmt::Display for CurrRead<S>
1268where
1269 S: CurrReadState,
1270 CurrRead<S>: DisplayCondensedModData,
1271{
1272 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1273 let mut output_string = String::from("{\n");
1274
1275 writeln!(output_string, "\t\"read_id\": \"{}\",", self.read_id)?;
1276
1277 if let Some(v) = self.seq_len {
1278 writeln!(output_string, "\t\"sequence_length\": {v},")?;
1279 }
1280
1281 if let Some((v, w)) = self.contig_id_and_start {
1282 let num_str = &v.to_string();
1283 writeln!(
1284 output_string,
1285 "\t\"contig\": \"{}\",",
1286 if let Some(x) = self.contig_name.as_ref() {
1287 x
1288 } else {
1289 num_str
1290 }
1291 )?;
1292 writeln!(output_string, "\t\"reference_start\": {w},")?;
1293 if let Some(x) = self.align_len {
1294 writeln!(
1295 output_string,
1296 "\t\"reference_end\": {},",
1297 w.checked_add(x)
1298 .expect("numeric overflow in calculating reference_end")
1299 )?;
1300 writeln!(output_string, "\t\"alignment_length\": {x},")?;
1301 }
1302 }
1303
1304 write!(output_string, "\t\"alignment_type\": \"{}\"", self.state)?;
1305 writeln!(output_string, "{}", &self.mod_data_section()?)?;
1306 output_string.push('}');
1307 output_string.fmt(f)
1308 }
1309}
1310
1311/// Converts `CurrRead` to `StrandedBed3`
1312///
1313/// ```
1314/// use bedrs::{Coordinates, Strand};
1315/// use bedrs::prelude::StrandedBed3;
1316/// use nanalogue_core::{CurrRead, Error, nanalogue_bam_reader};
1317/// use rust_htslib::bam::Read;
1318/// let mut reader = nanalogue_bam_reader(&"examples/example_1.bam")?;
1319/// let mut count = 0;
1320/// for record in reader.records(){
1321/// let r = record?;
1322/// let mut curr_read = CurrRead::default().try_from_only_alignment(&r)?;
1323/// let Ok(bed3_stranded) = StrandedBed3::try_from(&curr_read) else {unreachable!()};
1324/// let exp_bed3_stranded = match count {
1325/// 0 => StrandedBed3::new(0, 9, 17, Strand::Forward),
1326/// 1 => StrandedBed3::new(2, 23, 71, Strand::Forward),
1327/// 2 => StrandedBed3::new(1, 3, 36, Strand::Reverse),
1328/// 3 => StrandedBed3::empty(),
1329/// _ => unreachable!(),
1330/// };
1331/// assert_eq!(*bed3_stranded.chr(), *exp_bed3_stranded.chr());
1332/// assert_eq!(bed3_stranded.start(), exp_bed3_stranded.start());
1333/// assert_eq!(bed3_stranded.end(), exp_bed3_stranded.end());
1334/// assert_eq!(bed3_stranded.strand(), exp_bed3_stranded.strand());
1335/// count = count + 1;
1336/// }
1337/// # Ok::<(), Error>(())
1338/// ```
1339impl<S: CurrReadStateWithAlign + CurrReadState> TryFrom<&CurrRead<S>> for StrandedBed3<i32, u64> {
1340 type Error = Error;
1341
1342 #[expect(
1343 clippy::arithmetic_side_effects,
1344 reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
1345 )]
1346 fn try_from(value: &CurrRead<S>) -> Result<Self, Self::Error> {
1347 match (
1348 value.read_state().strand(),
1349 value.align_len().ok(),
1350 value.contig_id_and_start().ok(),
1351 ) {
1352 ('.', _, _) => Ok(StrandedBed3::empty()),
1353 (_, None, _) => Err(Error::InvalidAlignLength(format!(
1354 "align len not set while converting to bed3! read_id: {}",
1355 value.read_id()
1356 ))),
1357 (_, _, None) => Err(Error::InvalidContigAndStart(format!(
1358 "contig id and start not set while converting to bed3! read_id: {}",
1359 value.read_id()
1360 ))),
1361 ('+', Some(al), Some((cg, st))) => {
1362 Ok(StrandedBed3::new(cg, st, st + al, Strand::Forward))
1363 }
1364 ('-', Some(al), Some((cg, st))) => {
1365 Ok(StrandedBed3::new(cg, st, st + al, Strand::Reverse))
1366 }
1367 (_, _, _) => unreachable!("strand should be +/-/., so we should never reach this"),
1368 }
1369 }
1370}
1371
1372/// Convert a rust htslib record to our `CurrRead` struct.
1373/// NOTE: This operation loads many types of data from the
1374/// record and you may not be interested in all of them.
1375/// So, unless you know for sure that you are dealing with
1376/// a small number of reads, please do not use this function,
1377/// and call only a subset of the individual invocations below
1378/// in your program for the sake of speed and/or memory.
1379impl TryFrom<Record> for CurrRead<AlignAndModData> {
1380 type Error = Error;
1381
1382 fn try_from(record: Record) -> Result<Self, Self::Error> {
1383 let curr_read_state = CurrRead::default()
1384 .try_from_only_alignment(&record)?
1385 .set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
1386 Ok(curr_read_state)
1387 }
1388}
1389
1390/// Convert a rust htslib rc record into our struct.
1391/// I think the rc datatype is just like the normal record,
1392/// except the record datatype is not destroyed and created
1393/// every time a new record is read (or something like that).
1394/// All comments I've made for the `TryFrom<Record>` function
1395/// apply here as well.
1396impl TryFrom<Rc<Record>> for CurrRead<AlignAndModData> {
1397 type Error = Error;
1398
1399 fn try_from(record: Rc<Record>) -> Result<Self, Self::Error> {
1400 let curr_read_state = CurrRead::default()
1401 .try_from_only_alignment(&record)?
1402 .set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
1403 Ok(curr_read_state)
1404 }
1405}
1406
1407/// Implements filter by reference coordinates for our `CurrRead`.
1408/// This only filters modification data.
1409impl FilterByRefCoords for CurrRead<AlignAndModData> {
1410 /// filters modification data by reference position i.e. all pos such that
1411 /// start <= pos < end are retained. does not use contig in filtering.
1412 fn filter_by_ref_pos(&mut self, start: i64, end: i64) -> Result<(), Error> {
1413 for k in &mut self.mods.0.base_mods {
1414 k.ranges.filter_by_ref_pos(start, end)?;
1415 }
1416 Ok(())
1417 }
1418}
1419
1420/// Serialized representation of [`CurrRead`]; can also be used as a builder
1421/// to build [`CurrRead`]. This is useful if modification data is in formats
1422/// other than mod BAM, so we can transform it into [`CurrRead`] and use it
1423/// with functions from our library.
1424///
1425/// # Examples
1426///
1427/// First example, unmapped read with very little information
1428///
1429/// ```
1430/// use nanalogue_core::{Error, CurrReadBuilder};
1431/// let read = CurrReadBuilder::default().build()?;
1432/// # Ok::<(), Error>(())
1433/// ```
1434///
1435/// Add some simple information, still unmapped.
1436///
1437/// ```
1438/// use nanalogue_core::{Error, CurrReadBuilder};
1439/// let read = CurrReadBuilder::default().read_id("some_read".into()).seq_len(40).build()?;
1440/// # Ok::<(), Error>(())
1441/// ```
1442///
1443/// If we try to build a mapped read, we hit a panic as we haven't specified alignment information.
1444///
1445/// ```should_panic
1446/// use nanalogue_core::{Error, CurrReadBuilder,ReadState};
1447/// let read = CurrReadBuilder::default()
1448/// .read_id("some_read".into())
1449/// .seq_len(40)
1450/// .alignment_type(ReadState::PrimaryFwd).build()?;
1451/// # Ok::<(), Error>(())
1452/// ```
1453///
1454/// Mapped read building
1455///
1456/// ```
1457/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ReadState};
1458/// let read = CurrReadBuilder::default()
1459/// .read_id("some_read".into())
1460/// .seq_len(40)
1461/// .alignment_type(ReadState::PrimaryFwd)
1462/// .alignment(AlignmentInfoBuilder::default()
1463/// .start(10)
1464/// .end(60)
1465/// .contig("chr1".into())
1466/// .contig_id(1).build()?).build()?;
1467/// # Ok::<(), Error>(())
1468/// ```
1469/// ## Mapped read building with modification information.
1470/// Mod table entries are associated with one type of modification each.
1471/// The data tuples are: `sequence coordinate`, `reference coordinate`,
1472/// `modification probability`. The first varies from 0 to `sequence length` - 1,
1473/// the second varies within alignment coordinates and is -1 for an unmapped
1474/// position or all entries are -1 if the read as a whole is unmapped, and
1475/// the third entry varies from 0 - 255 where 0 means no modification with
1476/// certainty, and 255 means modification with certainty.
1477///
1478/// ```
1479/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1480/// ReadState};
1481///
1482/// let mod_table_entry = ModTableEntryBuilder::default()
1483/// .base('C')
1484/// .is_strand_plus(true)
1485/// .mod_code("m".into())
1486/// .data([(0, 15, 200), (2, 25, 100)]).build()?;
1487///
1488/// let read = CurrReadBuilder::default()
1489/// .read_id("some_read".into())
1490/// .seq_len(40)
1491/// .alignment_type(ReadState::PrimaryFwd)
1492/// .alignment(AlignmentInfoBuilder::default()
1493/// .start(10)
1494/// .end(60)
1495/// .contig("chr1".into())
1496/// .contig_id(1).build()?)
1497/// .mod_table([mod_table_entry].into()).build()?;
1498/// # Ok::<(), Error>(())
1499/// ```
1500///
1501/// ## Mapped read building with multiple mod types.
1502///
1503/// ```
1504/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1505/// ReadState};
1506///
1507/// let mod_table_entry_1 = ModTableEntryBuilder::default()
1508/// .base('C')
1509/// .is_strand_plus(true)
1510/// .mod_code("m".into())
1511/// .data([(0, 15, 200), (2, 25, 100)]).build()?;
1512///
1513/// let mod_table_entry_2 = ModTableEntryBuilder::default()
1514/// .base('A')
1515/// .is_strand_plus(true)
1516/// .mod_code("a".into())
1517/// .data([(1, 20, 50), (3, 30, 225)]).build()?;
1518///
1519/// let read = CurrReadBuilder::default()
1520/// .read_id("some_read".into())
1521/// .seq_len(40)
1522/// .alignment_type(ReadState::PrimaryFwd)
1523/// .alignment(AlignmentInfoBuilder::default()
1524/// .start(10)
1525/// .end(60)
1526/// .contig("chr1".into())
1527/// .contig_id(1).build()?)
1528/// .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
1529/// # Ok::<(), Error>(())
1530/// ```
1531/// ## Unmapped read building with multiple mod types.
1532///
1533/// Note that all the reference coordinates are set to -1.
1534///
1535/// ```
1536/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1537///
1538/// let mod_table_entry_1 = ModTableEntryBuilder::default()
1539/// .base('C')
1540/// .is_strand_plus(true)
1541/// .mod_code("m".into())
1542/// .data([(0, -1, 200), (2, -1, 100)]).build()?;
1543///
1544/// let mod_table_entry_2 = ModTableEntryBuilder::default()
1545/// .base('A')
1546/// .is_strand_plus(true)
1547/// .mod_code("a".into())
1548/// .data([(1, -1, 50), (3, -1, 225)]).build()?;
1549///
1550/// let read = CurrReadBuilder::default()
1551/// .read_id("some_read".into())
1552/// .seq_len(40)
1553/// .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
1554/// # Ok::<(), Error>(())
1555/// ```
1556///
1557/// ## Erroneous Usage
1558///
1559/// ### Coordinates are not sorted
1560///
1561/// Please note that even for reverse-aligned reads, coordinates must always
1562/// be ascending.
1563///
1564/// ```should_panic
1565/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1566/// ReadState};
1567///
1568/// let mod_table_entry = ModTableEntryBuilder::default()
1569/// .base('C')
1570/// .is_strand_plus(true)
1571/// .mod_code("m".into())
1572/// .data([(2, 25, 100), (0, 15, 200)]).build()?;
1573///
1574/// let read_before_build = CurrReadBuilder::default()
1575/// .read_id("some_read".into())
1576/// .seq_len(40)
1577/// .alignment(AlignmentInfoBuilder::default()
1578/// .start(10)
1579/// .end(60)
1580/// .contig("chr1".into())
1581/// .contig_id(1).build()?)
1582/// .mod_table([mod_table_entry].into());
1583///
1584/// let _ = read_before_build.alignment_type(ReadState::PrimaryFwd).build()?;
1585/// # Ok::<(), Error>(())
1586/// ```
1587///
1588/// ```should_panic
1589/// # use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1590/// # ReadState};
1591/// #
1592/// # let mod_table_entry = ModTableEntryBuilder::default()
1593/// # .base('C')
1594/// # .is_strand_plus(true)
1595/// # .mod_code("m".into())
1596/// # .data([(2, 25, 100), (0, 15, 200)]).build()?;
1597/// #
1598/// # let read_before_build = CurrReadBuilder::default()
1599/// # .read_id("some_read".into())
1600/// # .seq_len(40)
1601/// # .alignment(AlignmentInfoBuilder::default()
1602/// # .start(10)
1603/// # .end(60)
1604/// # .contig("chr1".into())
1605/// # .contig_id(1).build()?)
1606/// # .mod_table([mod_table_entry].into());
1607/// #
1608/// let _ = read_before_build.alignment_type(ReadState::PrimaryRev).build()?;
1609/// # Ok::<(), Error>(())
1610/// ```
1611///
1612/// ### Unmapped read but mod reference coordinates are not set to `-1`.
1613///
1614/// ```should_panic
1615/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1616///
1617/// let mod_table_entry = ModTableEntryBuilder::default()
1618/// .base('C')
1619/// .is_strand_plus(true)
1620/// .mod_code("m".into())
1621/// .data([(0, -1, 200), (2, 11, 100)]).build()?;
1622///
1623/// let read = CurrReadBuilder::default()
1624/// .read_id("some_read".into())
1625/// .seq_len(40)
1626/// .mod_table([mod_table_entry].into()).build()?;
1627/// # Ok::<(), Error>(())
1628/// ```
1629///
1630/// ### Read with mod coordinates larger than sequence length
1631///
1632/// This will cause a problem irrespective of whether a read is mapped or not.
1633///
1634/// ```should_panic
1635/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1636///
1637/// let mod_table_entry = ModTableEntryBuilder::default()
1638/// .base('C')
1639/// .is_strand_plus(true)
1640/// .mod_code("m".into())
1641/// .data([(0, -1, 200), (42, -1, 100)]).build()?;
1642///
1643/// let read = CurrReadBuilder::default()
1644/// .read_id("some_read".into())
1645/// .seq_len(40)
1646/// .mod_table([mod_table_entry].into()).build()?;
1647/// # Ok::<(), Error>(())
1648/// ```
1649///
1650/// ```should_panic
1651/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1652/// ReadState};
1653///
1654/// let mod_table_entry = ModTableEntryBuilder::default()
1655/// .base('C')
1656/// .is_strand_plus(true)
1657/// .mod_code("m".into())
1658/// .data([(0, 20, 200), (42, 30, 100)]).build()?;
1659///
1660/// let read = CurrReadBuilder::default()
1661/// .read_id("some_read".into())
1662/// .seq_len(40)
1663/// .alignment(AlignmentInfoBuilder::default()
1664/// .start(10)
1665/// .end(60)
1666/// .contig("chr1".into())
1667/// .contig_id(1).build()?)
1668/// .mod_table([mod_table_entry].into()).build()?;
1669/// # Ok::<(), Error>(())
1670/// ```
1671/// ### Read with mod coordinates beyond alignment coordinates
1672///
1673/// ```should_panic
1674/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
1675/// ReadState};
1676///
1677/// let mod_table_entry = ModTableEntryBuilder::default()
1678/// .base('C')
1679/// .is_strand_plus(true)
1680/// .mod_code("m".into())
1681/// .data([(0, 1, 200), (22, 30, 100)]).build()?;
1682///
1683/// let read = CurrReadBuilder::default()
1684/// .read_id("some_read".into())
1685/// .seq_len(40)
1686/// .alignment(AlignmentInfoBuilder::default()
1687/// .start(10)
1688/// .end(60)
1689/// .contig("chr1".into())
1690/// .contig_id(1).build()?)
1691/// .mod_table([mod_table_entry].into()).build()?;
1692/// # Ok::<(), Error>(())
1693/// ```
1694/// ### Reads with multiple tracks of the same kind of modification.
1695///
1696/// Here we have two `C+m` tracks. Note that two cytosine tracks or two
1697/// methylation tracks or two cytosine methylation tracks on opposite
1698/// strands are all allowed. Only the case where there are two tracks
1699/// of the same base with the same modification on the same strand
1700/// are disallowed as the same kind of information is being populated twice.
1701///
1702/// ```should_panic
1703/// use nanalogue_core::{Error, CurrReadBuilder, ModTableEntryBuilder, ReadState};
1704///
1705/// let mod_table_entry_1 = ModTableEntryBuilder::default()
1706/// .base('C')
1707/// .is_strand_plus(true)
1708/// .mod_code("m".into())
1709/// .data([(0, -1, 200), (2, -1, 100)]).build()?;
1710///
1711/// let mod_table_entry_2 = ModTableEntryBuilder::default()
1712/// .base('C')
1713/// .is_strand_plus(true)
1714/// .mod_code("m".into())
1715/// .data([(1, -1, 50), (3, -1, 225)]).build()?;
1716///
1717/// let read = CurrReadBuilder::default()
1718/// .read_id("some_read".into())
1719/// .seq_len(40)
1720/// .mod_table([mod_table_entry_1, mod_table_entry_2].into()).build()?;
1721/// # Ok::<(), Error>(())
1722/// ```
1723#[derive(Debug, Clone, Serialize, Deserialize)]
1724#[serde(default)]
1725pub struct CurrReadBuilder {
1726 /// The type of alignment
1727 alignment_type: ReadState,
1728 /// Alignment information, None for unmapped reads
1729 #[serde(skip_serializing_if = "Option::is_none")]
1730 alignment: Option<AlignmentInfo>,
1731 /// Condensed modification data table
1732 mod_table: Vec<ModTableEntry>,
1733 /// Read identifier
1734 read_id: String,
1735 /// Sequence length
1736 seq_len: u64,
1737}
1738
1739impl Default for CurrReadBuilder {
1740 fn default() -> Self {
1741 Self {
1742 alignment_type: ReadState::Unmapped, // note that default is unmapped now, not primary
1743 alignment: None,
1744 mod_table: Vec::new(),
1745 read_id: String::new(),
1746 seq_len: 0,
1747 }
1748 }
1749}
1750
1751/// Alignment information for mapped reads
1752///
1753/// See documentation of [`CurrReadBuilder`] on how to use
1754/// this struct.
1755#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
1756#[serde(default)]
1757#[builder(default, build_fn(error = "Error"), pattern = "owned")]
1758pub struct AlignmentInfo {
1759 /// Start position on reference
1760 start: u64,
1761 /// End position on reference
1762 end: u64,
1763 /// Contig/chromosome name
1764 contig: String,
1765 /// Contig/chromosome ID
1766 contig_id: i32,
1767}
1768
1769/// Data per type of modification in [`CurrReadBuilder`].
1770///
1771/// See documentation of [`CurrReadBuilder`] on how to use
1772/// this struct.
1773#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
1774#[serde(default)]
1775#[builder(default, build_fn(error = "Error"), pattern = "owned")]
1776pub struct ModTableEntry {
1777 /// Base that is modified (A, C, G, T, etc.)
1778 #[builder(field(ty = "char", build = "self.base.try_into()?"))]
1779 base: AllowedAGCTN,
1780 /// Whether this is on the plus strand
1781 is_strand_plus: bool,
1782 /// Modification code (character or numeric)
1783 #[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
1784 mod_code: ModChar,
1785 /// Modification data as [`start`, `ref_start`, `mod_qual`] tuples
1786 #[builder(setter(into))]
1787 data: Vec<(u64, i64, u8)>,
1788}
1789
1790impl CurrReadBuilder {
1791 /// Sets alignment type
1792 #[must_use]
1793 pub fn alignment_type(mut self, value: ReadState) -> Self {
1794 self.alignment_type = value;
1795 self
1796 }
1797 /// Sets alignment information
1798 #[must_use]
1799 pub fn alignment(mut self, value: AlignmentInfo) -> Self {
1800 self.alignment = Some(value);
1801 self
1802 }
1803 /// Sets mod information
1804 #[must_use]
1805 pub fn mod_table(mut self, value: Vec<ModTableEntry>) -> Self {
1806 self.mod_table = value;
1807 self
1808 }
1809 /// Sets read id
1810 #[must_use]
1811 pub fn read_id(mut self, value: String) -> Self {
1812 self.read_id = value;
1813 self
1814 }
1815 /// Sets sequence length
1816 #[must_use]
1817 pub fn seq_len(mut self, value: u64) -> Self {
1818 self.seq_len = value;
1819 self
1820 }
1821 /// Build `CurrRead`
1822 ///
1823 /// # Errors
1824 /// If the conversion fails. This could happen if invalid
1825 /// data was used in the building.
1826 pub fn build(self) -> Result<CurrRead<AlignAndModData>, Error> {
1827 CurrRead::<AlignAndModData>::try_from(self)
1828 }
1829}
1830
1831impl Serialize for CurrRead<AlignAndModData> {
1832 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1833 where
1834 S: serde::Serializer,
1835 {
1836 let serialized_curr_read: CurrReadBuilder =
1837 self.clone().try_into().map_err(serde::ser::Error::custom)?;
1838 serialized_curr_read.serialize(serializer)
1839 }
1840}
1841
1842impl TryFrom<CurrRead<AlignAndModData>> for CurrReadBuilder {
1843 type Error = Error;
1844
1845 fn try_from(curr_read: CurrRead<AlignAndModData>) -> Result<Self, Self::Error> {
1846 let alignment_type = curr_read.read_state();
1847
1848 #[expect(
1849 clippy::arithmetic_side_effects,
1850 reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
1851 )]
1852 let alignment = if curr_read.read_state().is_unmapped() {
1853 None
1854 } else {
1855 let (contig_id, start) = curr_read.contig_id_and_start()?;
1856 let align_len = curr_read.align_len()?;
1857 let contig = curr_read.contig_name()?.to_string();
1858 let end = start + align_len;
1859
1860 Some(AlignmentInfo {
1861 start,
1862 end,
1863 contig,
1864 contig_id,
1865 })
1866 };
1867
1868 let mod_table = condense_base_mods(&curr_read.mod_data().0)?;
1869
1870 let read_id = curr_read.read_id().to_string();
1871 let seq_len = curr_read.seq_len()?;
1872
1873 Ok(CurrReadBuilder {
1874 alignment_type,
1875 alignment,
1876 mod_table,
1877 read_id,
1878 seq_len,
1879 })
1880 }
1881}
1882
1883impl<'de> Deserialize<'de> for CurrRead<AlignAndModData> {
1884 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1885 where
1886 D: serde::Deserializer<'de>,
1887 {
1888 let serialized = CurrReadBuilder::deserialize(deserializer)?;
1889 serialized.try_into().map_err(serde::de::Error::custom)
1890 }
1891}
1892
1893impl TryFrom<CurrReadBuilder> for CurrRead<AlignAndModData> {
1894 type Error = Error;
1895
1896 fn try_from(serialized: CurrReadBuilder) -> Result<Self, Self::Error> {
1897 // Extract alignment information
1898 let (align_len, contig_id_and_start, contig_name, ref_range) = match (
1899 serialized.alignment_type.is_unmapped(),
1900 serialized.alignment.as_ref(),
1901 ) {
1902 (false, Some(alignment)) => {
1903 let align_len = {
1904 if let Some(v) = alignment.end.checked_sub(alignment.start) {
1905 Ok(Some(v))
1906 } else {
1907 Err(Error::InvalidAlignCoords(format!(
1908 "is align end {0} < align start {1}? read {2} failed in `CurrRead` building!",
1909 alignment.end, alignment.start, serialized.read_id
1910 )))
1911 }
1912 }?;
1913 let contig_id_and_start = Some((alignment.contig_id, alignment.start));
1914 let contig_name = Some(alignment.contig.clone());
1915 (
1916 align_len,
1917 contig_id_and_start,
1918 contig_name,
1919 i64::try_from(alignment.start)?..i64::try_from(alignment.end)?,
1920 )
1921 }
1922 (true, None) => (None, None, None, (-1i64)..0),
1923 (_, _) => {
1924 return Err(Error::UnknownAlignState(format!(
1925 "alignment_type and alignment info not matching while building CurrRead! read_id: {}",
1926 serialized.read_id,
1927 )));
1928 }
1929 };
1930
1931 // Reconstruct BaseMods from mod_table
1932 let base_mods = reconstruct_base_mods(
1933 &serialized.mod_table,
1934 serialized.alignment_type.strand() == '-',
1935 ref_range,
1936 serialized.seq_len,
1937 )?;
1938
1939 // Create CurrRead directly
1940 Ok(CurrRead {
1941 state: serialized.alignment_type,
1942 read_id: serialized.read_id,
1943 seq_len: Some(serialized.seq_len),
1944 align_len,
1945 mods: (base_mods, ThresholdState::GtEq(0)), // Default threshold
1946 contig_id_and_start,
1947 contig_name,
1948 mod_base_qual_thres: 0, // Default value
1949 marker: std::marker::PhantomData,
1950 })
1951 }
1952}
1953
1954/// Convert `BaseMods` to condensed `mod_table` format
1955fn condense_base_mods(base_mods: &BaseMods) -> Result<Vec<ModTableEntry>, Error> {
1956 let mut mod_table = Vec::new();
1957
1958 for base_mod in &base_mods.base_mods {
1959 let entries: Result<Vec<_>, Error> = base_mod
1960 .ranges
1961 .annotations
1962 .iter()
1963 .map(|k| {
1964 let start: u64 = k.start.try_into()?;
1965 let ref_start = k.reference_start.unwrap_or(-1);
1966 Ok((start, ref_start, k.qual))
1967 })
1968 .collect();
1969
1970 mod_table.push(ModTableEntry {
1971 base: AllowedAGCTN::try_from(base_mod.modified_base)?,
1972 is_strand_plus: base_mod.strand == '+',
1973 mod_code: ModChar::new(base_mod.modification_type),
1974 data: entries?,
1975 });
1976 }
1977
1978 Ok(mod_table)
1979}
1980
1981/// Reconstruct `BaseMods` from condensed `mod_table` format
1982fn reconstruct_base_mods(
1983 mod_table: &[ModTableEntry],
1984 is_reverse: bool,
1985 ref_range: Range<i64>,
1986 seq_len: u64,
1987) -> Result<BaseMods, Error> {
1988 let mut base_mods = Vec::new();
1989
1990 for entry in mod_table {
1991 let annotations = {
1992 let mut annotations = Vec::<FiberAnnotation>::with_capacity(entry.data.len());
1993 let mut valid_range = 0..seq_len;
1994
1995 #[expect(
1996 clippy::arithmetic_side_effects,
1997 reason = "overflow errors not possible as genomic coordinates << 2^64"
1998 )]
1999 for &(start, ref_start, qual) in &entry.data {
2000 // Check that sequence coordinates are in range [prev_start + 1, seq_len)
2001 if !valid_range.contains(&start) {
2002 return Err(Error::InvalidModCoords(String::from(
2003 "in mod table, read coords > seq length or read coords not sorted (NOTE: \
2004ascending needed even if reversed read)!",
2005 )));
2006 }
2007 valid_range = start + 1..seq_len;
2008
2009 let ref_start_after_check = match (ref_start, ref_range.contains(&ref_start)) {
2010 (-1, _) => None,
2011 (v, true) if v > -1 => Some(v),
2012 (v, _) => {
2013 return Err(Error::InvalidAlignCoords(format!(
2014 "coordinate {v} invalid in mod table (exceeds alignment coords or is < -1)"
2015 )));
2016 }
2017 };
2018 let start_i64 = i64::try_from(start)?;
2019 annotations.push(FiberAnnotation {
2020 start: start_i64,
2021 end: start_i64 + 1,
2022 length: 1,
2023 qual,
2024 reference_start: ref_start_after_check,
2025 reference_end: ref_start_after_check,
2026 reference_length: ref_start_after_check.is_some().then_some(0),
2027 extra_columns: None,
2028 });
2029 }
2030 annotations
2031 };
2032
2033 let ranges = Ranges::from_annotations(annotations, seq_len.try_into()?, is_reverse);
2034
2035 let strand = if entry.is_strand_plus { '+' } else { '-' };
2036
2037 base_mods.push(BaseMod {
2038 modified_base: u8::try_from(char::from(entry.base))?,
2039 strand,
2040 modification_type: entry.mod_code.val(),
2041 ranges,
2042 record_is_reverse: is_reverse,
2043 });
2044 }
2045
2046 base_mods.sort();
2047
2048 // Check for duplicate strand, modification_type combinations
2049 let mut seen_combinations = HashSet::new();
2050 for base_mod in &base_mods {
2051 let combination = (base_mod.strand, base_mod.modification_type);
2052 if seen_combinations.contains(&combination) {
2053 return Err(Error::InvalidDuplicates(format!(
2054 "Duplicate strand '{}' and modification_type '{}' combination found",
2055 base_mod.strand, base_mod.modification_type
2056 )));
2057 }
2058 let _: bool = seen_combinations.insert(combination);
2059 }
2060
2061 Ok(BaseMods { base_mods })
2062}
2063
2064/// Convert a vector of `CurrRead<AlignAndModData>` to a Polars `DataFrame`
2065/// with one row per modification data point
2066///
2067/// Each modification data point becomes a row in the `DataFrame`, with read-level
2068/// and alignment-level information repeated across rows for the same read.
2069///
2070/// # Example
2071///
2072/// ```
2073/// use nanalogue_core::{Error, CurrReadBuilder, AlignmentInfoBuilder, ModTableEntryBuilder,
2074/// ReadState, curr_reads_to_dataframe};
2075/// use polars::prelude::*;
2076///
2077/// // Build modification table entries with two data points
2078/// let mod_table_entry = ModTableEntryBuilder::default()
2079/// .base('C')
2080/// .is_strand_plus(true)
2081/// .mod_code("m".into())
2082/// .data([(0, 15, 200), (2, 25, 100)])
2083/// .build()?;
2084///
2085/// // Build a CurrRead<AlignAndModData>
2086/// let read = CurrReadBuilder::default()
2087/// .read_id("test_read".into())
2088/// .seq_len(40)
2089/// .alignment_type(ReadState::PrimaryFwd)
2090/// .alignment(AlignmentInfoBuilder::default()
2091/// .start(10)
2092/// .end(60)
2093/// .contig("chr1".into())
2094/// .contig_id(1)
2095/// .build()?)
2096/// .mod_table([mod_table_entry].into())
2097/// .build()?;
2098///
2099/// // Convert to DataFrame
2100/// let df = curr_reads_to_dataframe(&[read])?;
2101///
2102/// // Verify DataFrame structure
2103/// assert_eq!(df.height(), 2); // Two modification data points
2104/// assert_eq!(df.width(), 13); // 13 columns
2105///
2106/// // Check read-level fields (repeated across both rows)
2107/// let read_id_col = df.column("read_id")?.str()?;
2108/// assert_eq!(read_id_col.get(0), Some("test_read"));
2109/// assert_eq!(read_id_col.get(1), Some("test_read"));
2110///
2111/// let seq_len_col = df.column("seq_len")?.u64()?;
2112/// assert_eq!(seq_len_col.get(0), Some(40));
2113/// assert_eq!(seq_len_col.get(1), Some(40));
2114///
2115/// let alignment_type_col = df.column("alignment_type")?.str()?;
2116/// assert_eq!(alignment_type_col.get(0), Some("primary_forward"));
2117/// assert_eq!(alignment_type_col.get(1), Some("primary_forward"));
2118///
2119/// // Check alignment fields (repeated across both rows)
2120/// let align_start_col = df.column("align_start")?.u64()?;
2121/// assert_eq!(align_start_col.get(0), Some(10));
2122/// assert_eq!(align_start_col.get(1), Some(10));
2123///
2124/// let align_end_col = df.column("align_end")?.u64()?;
2125/// assert_eq!(align_end_col.get(0), Some(60));
2126/// assert_eq!(align_end_col.get(1), Some(60));
2127///
2128/// let contig_col = df.column("contig")?.str()?;
2129/// assert_eq!(contig_col.get(0), Some("chr1"));
2130/// assert_eq!(contig_col.get(1), Some("chr1"));
2131///
2132/// let contig_id_col = df.column("contig_id")?.i32()?;
2133/// assert_eq!(contig_id_col.get(0), Some(1));
2134/// assert_eq!(contig_id_col.get(1), Some(1));
2135///
2136/// // Check modification entry fields (repeated across both rows)
2137/// let base_col = df.column("base")?.str()?;
2138/// assert_eq!(base_col.get(0), Some("C"));
2139/// assert_eq!(base_col.get(1), Some("C"));
2140///
2141/// let is_strand_plus_col = df.column("is_strand_plus")?.bool()?;
2142/// assert_eq!(is_strand_plus_col.get(0), Some(true));
2143/// assert_eq!(is_strand_plus_col.get(1), Some(true));
2144///
2145/// let mod_code_col = df.column("mod_code")?.str()?;
2146/// assert_eq!(mod_code_col.get(0), Some("m"));
2147/// assert_eq!(mod_code_col.get(1), Some("m"));
2148///
2149/// // Check modification data points (unique per row)
2150/// let position_col = df.column("position")?.u64()?;
2151/// assert_eq!(position_col.get(0), Some(0)); // First mod position
2152/// assert_eq!(position_col.get(1), Some(2)); // Second mod position
2153///
2154/// let ref_position_col = df.column("ref_position")?.i64()?;
2155/// assert_eq!(ref_position_col.get(0), Some(15)); // First ref position
2156/// assert_eq!(ref_position_col.get(1), Some(25)); // Second ref position
2157///
2158/// let mod_quality_col = df.column("mod_quality")?.u32()?;
2159/// assert_eq!(mod_quality_col.get(0), Some(200)); // First mod_quality score
2160/// assert_eq!(mod_quality_col.get(1), Some(100)); // Second mod_quality score
2161///
2162/// # Ok::<(), Error>(())
2163/// ```
2164///
2165/// # Errors
2166/// Returns nanalogue `Error` if `DataFrame` construction fails or if
2167/// data extraction from `CurrRead` fails
2168#[expect(
2169 clippy::arithmetic_side_effects,
2170 reason = "start + alen cannot overflow as genomic coordinates are far below u64::MAX"
2171)]
2172pub fn curr_reads_to_dataframe(reads: &[CurrRead<AlignAndModData>]) -> Result<DataFrame, Error> {
2173 // Vectors to hold column data
2174 let mut read_ids: Vec<String> = Vec::new();
2175 let mut seq_lens: Vec<Option<u64>> = Vec::new();
2176 let mut alignment_types: Vec<String> = Vec::new();
2177
2178 // Alignment info columns
2179 let mut align_starts: Vec<Option<u64>> = Vec::new();
2180 let mut align_ends: Vec<Option<u64>> = Vec::new();
2181 let mut contigs: Vec<Option<String>> = Vec::new();
2182 let mut contig_ids: Vec<Option<i32>> = Vec::new();
2183
2184 // Modification entry columns
2185 let mut bases: Vec<String> = Vec::new();
2186 let mut is_strand_plus: Vec<bool> = Vec::new();
2187 let mut mod_codes: Vec<String> = Vec::new();
2188
2189 // Modification data point columns (the unique part)
2190 let mut positions: Vec<u64> = Vec::new();
2191 let mut ref_positions: Vec<i64> = Vec::new();
2192 let mut mod_qualities: Vec<u32> = Vec::new();
2193
2194 // Iterate through each CurrRead
2195 for read in reads {
2196 // Extract read-level information
2197 let read_id = read.read_id();
2198 let seq_len = read.seq_len().ok();
2199 let alignment_type = read.read_state();
2200
2201 // Extract alignment information (may be None for unmapped reads)
2202 let (contig_id, align_start, align_end, contig_name) =
2203 if let (Ok((cid, start)), Ok(alen), Ok(cname)) = (
2204 read.contig_id_and_start(),
2205 read.align_len(),
2206 read.contig_name(),
2207 ) {
2208 (
2209 Some(cid),
2210 Some(start),
2211 Some(start + alen),
2212 Some(cname.to_string()),
2213 )
2214 } else {
2215 (None, None, None, None)
2216 };
2217
2218 // Get BaseMods and condense to ModTableEntry format
2219 let mod_data = read.mod_data();
2220 let mod_table = condense_base_mods(&mod_data.0)?;
2221
2222 // For each modification table entry
2223 for mod_entry in &mod_table {
2224 // For each data point in this mod entry
2225 for &(start, ref_start, qual) in &mod_entry.data {
2226 // Add read-level fields (repeated)
2227 read_ids.push(read_id.to_string());
2228 seq_lens.push(seq_len);
2229 alignment_types.push(alignment_type.to_string());
2230
2231 // Add alignment fields (repeated, may be None)
2232 align_starts.push(align_start);
2233 align_ends.push(align_end);
2234 contigs.push(contig_name.clone());
2235 contig_ids.push(contig_id);
2236
2237 // Add mod entry fields (repeated)
2238 bases.push(mod_entry.base.to_string());
2239 is_strand_plus.push(mod_entry.is_strand_plus);
2240 mod_codes.push(mod_entry.mod_code.to_string());
2241
2242 // Add data point fields (unique per row)
2243 positions.push(start);
2244 ref_positions.push(ref_start);
2245 mod_qualities.push(u32::from(qual));
2246 }
2247 }
2248 }
2249
2250 // Build the DataFrame
2251 let df = df! {
2252 "read_id" => read_ids,
2253 "seq_len" => seq_lens,
2254 "alignment_type" => alignment_types,
2255 "align_start" => align_starts,
2256 "align_end" => align_ends,
2257 "contig" => contigs,
2258 "contig_id" => contig_ids,
2259 "base" => bases,
2260 "is_strand_plus" => is_strand_plus,
2261 "mod_code" => mod_codes,
2262 "position" => positions,
2263 "ref_position" => ref_positions,
2264 "mod_quality" => mod_qualities,
2265 }?;
2266
2267 Ok(df)
2268}
2269
2270#[cfg(test)]
2271mod test_error_handling {
2272 use super::*;
2273
2274 #[test]
2275 fn set_read_state_not_implemented_error() {
2276 for flag_value in 0..4096u16 {
2277 let mut record = Record::new();
2278 record.set_qname(b"test_read");
2279 record.set_flags(flag_value);
2280 let curr_read = CurrRead::default();
2281 // * first line below is usual primary, secondary, supplementary,
2282 // unmapped with reversed set or not with the first 3 categories
2283 // * second line is if unmapped is set with a mapped state, or
2284 // secondary and supplementary are both set
2285 // * third line are flags we have chosen to exclude from our program.
2286 match (flag_value, curr_read.set_read_state_and_id(&record)) {
2287 (0 | 4 | 16 | 256 | 272 | 2048 | 2064, Ok(_))
2288 | (
2289 20 | 260 | 276 | 2052 | 2068 | 2304 | 2308 | 2320 | 2324,
2290 Err(Error::UnknownAlignState(_)),
2291 )
2292 | (_, Err(Error::NotImplemented(_))) => {}
2293 (_, _) => unreachable!(),
2294 }
2295 }
2296 }
2297
2298 #[test]
2299 #[should_panic(expected = "InvalidDuplicates")]
2300 fn reconstruct_base_mods_detects_duplicates() {
2301 // Create a test case with duplicate strand and modification_type combinations
2302 // i.e. we have two entries for T+T below with different data
2303 let mod_entries = vec![
2304 ModTableEntry {
2305 base: AllowedAGCTN::T,
2306 is_strand_plus: true,
2307 mod_code: ModChar::new('T'),
2308 data: vec![(0, 0, 10)],
2309 },
2310 ModTableEntry {
2311 base: AllowedAGCTN::T,
2312 is_strand_plus: true,
2313 mod_code: ModChar::new('T'),
2314 data: vec![(1, 1, 20)],
2315 },
2316 ];
2317
2318 // Test reconstruct_base_mods with duplicate entries
2319 let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
2320 }
2321
2322 #[test]
2323 fn reconstruct_base_mods_accepts_unique_combinations() {
2324 // Create a valid case with different combinations
2325 // First entry: T+ modification
2326 // Second entry: C+ modification (different base, same strand)
2327 // Third entry: T- modification (same base, different strand)
2328 let mod_entries = vec![
2329 ModTableEntry {
2330 base: AllowedAGCTN::T,
2331 is_strand_plus: true,
2332 mod_code: ModChar::new('T'),
2333 data: vec![(0, 0, 10)],
2334 },
2335 ModTableEntry {
2336 base: AllowedAGCTN::C,
2337 is_strand_plus: true,
2338 mod_code: ModChar::new('m'),
2339 data: vec![(1, 1, 20)],
2340 },
2341 ModTableEntry {
2342 base: AllowedAGCTN::T,
2343 is_strand_plus: false,
2344 mod_code: ModChar::new('T'),
2345 data: vec![(2, 2, 30)],
2346 },
2347 ];
2348
2349 // This should succeed since all combinations are unique
2350 let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
2351 }
2352}
2353
2354#[cfg(test)]
2355mod test_serde {
2356 use super::*;
2357 use crate::nanalogue_bam_reader;
2358 use indoc::indoc;
2359 use rust_htslib::bam::Read as _;
2360
2361 #[test]
2362 fn first_record_serde() -> Result<(), Error> {
2363 // Read the first record from the example BAM file
2364 let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
2365 let first_record = reader.records().next().unwrap()?;
2366
2367 // Create CurrRead with alignment and modification data
2368 let curr_read = CurrRead::default()
2369 .try_from_only_alignment(&first_record)?
2370 .set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;
2371
2372 let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;
2373
2374 let expected_json_str = indoc! {r#"
2375 {
2376 "alignment_type": "primary_forward",
2377 "alignment": {
2378 "start": 9,
2379 "end": 17,
2380 "contig": "dummyI",
2381 "contig_id": 0
2382 },
2383 "mod_table": [
2384 {
2385 "base": "T",
2386 "is_strand_plus": true,
2387 "mod_code": "T",
2388 "data": [
2389 [0, 9, 4],
2390 [3, 12, 7],
2391 [4, 13, 9],
2392 [7, 16, 6]
2393 ]
2394 }
2395 ],
2396 "read_id": "5d10eb9a-aae1-4db8-8ec6-7ebb34d32575",
2397 "seq_len": 8
2398 }"#};
2399
2400 let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;
2401
2402 // Compare expected and actual outputs
2403 assert_eq!(actual_json, expected_json);
2404
2405 // Also test deserialization: deserialize the expected JSON and compare with original CurrRead
2406 let deserialized_curr_read: CurrRead<AlignAndModData> =
2407 serde_json::from_str(expected_json_str)?;
2408 assert_eq!(deserialized_curr_read, curr_read);
2409
2410 Ok(())
2411 }
2412
2413 #[test]
2414 fn first_record_roundtrip() -> Result<(), Error> {
2415 // Read the first record from the example BAM file (same as serialization test)
2416 let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
2417 let first_record = reader.records().next().unwrap()?;
2418
2419 // Create the original CurrRead with alignment and modification data
2420 let original_curr_read = CurrRead::default()
2421 .try_from_only_alignment(&first_record)?
2422 .set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;
2423
2424 // Serialize to JSON
2425 let json_str = serde_json::to_string_pretty(&original_curr_read)?;
2426
2427 // Deserialize back to CurrRead
2428 let deserialized_curr_read: CurrRead<AlignAndModData> = serde_json::from_str(&json_str)?;
2429
2430 // The deserialized CurrRead should be equal to the original
2431 assert_eq!(deserialized_curr_read, original_curr_read);
2432
2433 Ok(())
2434 }
2435
2436 #[test]
2437 fn blank_json_record_roundtrip() -> Result<(), Error> {
2438 let json_str = indoc! {r"
2439 {
2440 }"};
2441
2442 // Deserialize JSON to CurrRead
2443 let curr_read: CurrRead<AlignAndModData> = serde_json::from_str(json_str)?;
2444
2445 // Serialize back to JSON
2446 let serialized_json = serde_json::to_string_pretty(&curr_read)?;
2447
2448 // Deserialize again
2449 let roundtrip_curr_read: CurrRead<AlignAndModData> =
2450 serde_json::from_str(&serialized_json)?;
2451
2452 // Check that the roundtrip preserves equality
2453 assert_eq!(curr_read, roundtrip_curr_read);
2454
2455 Ok(())
2456 }
2457
2458 #[test]
2459 fn example_1_unmapped() -> Result<(), Error> {
2460 // Read the fourth record from the example BAM file (this is the unmapped read)
2461 let fourth_record = {
2462 let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
2463 let mut records = reader.records();
2464 for _ in 0..3 {
2465 let _: Record = records.next().unwrap()?;
2466 }
2467 records.next().unwrap()?
2468 };
2469
2470 // Create CurrRead with alignment and modification data
2471 let curr_read = CurrRead::default()
2472 .try_from_only_alignment(&fourth_record)?
2473 .set_mod_data(&fourth_record, ThresholdState::GtEq(0), 0)?;
2474
2475 let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;
2476
2477 let expected_json_str = indoc! {r#"
2478 {
2479 "alignment_type": "unmapped",
2480 "mod_table": [
2481 {
2482 "base": "G",
2483 "is_strand_plus": false,
2484 "mod_code": "7200",
2485 "data": [
2486 [28, -1, 0],
2487 [29, -1, 0],
2488 [30, -1, 0],
2489 [32, -1, 0],
2490 [43, -1, 77],
2491 [44, -1, 0]
2492 ]
2493 },
2494 {
2495 "base": "T",
2496 "is_strand_plus": true,
2497 "mod_code": "T",
2498 "data": [
2499 [3, -1, 221],
2500 [8, -1, 242],
2501 [27, -1, 0],
2502 [39, -1, 47],
2503 [47, -1, 239]
2504 ]
2505 }
2506 ],
2507 "read_id": "a4f36092-b4d5-47a9-813e-c22c3b477a0c",
2508 "seq_len": 48
2509 }"#};
2510
2511 let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;
2512
2513 // Compare expected and actual outputs
2514 assert_eq!(actual_json, expected_json);
2515
2516 Ok(())
2517 }
2518
2519 #[test]
2520 #[should_panic(expected = "invalid alignment coordinates")]
2521 fn invalid_align_coords_unmapped_with_reference_positions() {
2522 let invalid_json = r#"{
2523 "mod_table": [
2524 {
2525 "data": [[2, 3, 200]]
2526 }
2527 ],
2528 "seq_len": 10
2529 }"#;
2530
2531 // Deserialize JSON to CurrRead - this should panic with InvalidAlignCoords
2532 let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2533 }
2534
2535 #[test]
2536 #[should_panic(expected = "invalid mod coordinates")]
2537 fn invalid_sequence_length() {
2538 let invalid_json = r#"{
2539 "mod_table": [
2540 {
2541 "data": [[20, -1, 200]]
2542 }
2543 ],
2544 "seq_len": 10
2545 }"#;
2546
2547 // Deserialize JSON to CurrRead - this should panic with InvalidSeqLength
2548 let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2549 }
2550
2551 #[test]
2552 #[should_panic(expected = "invalid value")]
2553 fn invalid_sequence_coordinate() {
2554 let invalid_json = r#"{
2555 "alignment_type": "primary_forward",
2556 "alignment": {
2557 "start": 0,
2558 "end": 30
2559 },
2560 "mod_table": [
2561 {
2562 "data": [[-1, 1, 200]]
2563 }
2564 ],
2565 "seq_len": 30
2566 }"#;
2567
2568 // Deserialize JSON to CurrRead - this should panic with InvalidSeqLength
2569 let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2570 }
2571
2572 #[test]
2573 #[should_panic(expected = "invalid alignment coordinates")]
2574 fn invalid_alignment_coordinates() {
2575 let invalid_json = r#"{
2576 "alignment_type": "primary_forward",
2577 "alignment": {
2578 "start": 10,
2579 "end": 25
2580 },
2581 "mod_table": [
2582 {
2583 "data": [[0, 10, 200], [1, 20, 180], [2, 30, 220]]
2584 }
2585 ],
2586 "seq_len": 3
2587 }"#;
2588
2589 // Deserialize JSON to CurrRead - this should panic with InvalidAlignCoords
2590 let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2591 }
2592
2593 #[test]
2594 #[should_panic(expected = "invalid mod coordinates")]
2595 fn invalid_sorting_forward_alignment() {
2596 let invalid_json = r#"{
2597 "alignment_type": "primary_forward",
2598 "alignment": {
2599 "start": 10,
2600 "end": 40
2601 },
2602 "mod_table": [
2603 {
2604 "data": [[0, 10, 200], [2, 30, 180], [1, 20, 220]]
2605 }
2606 ],
2607 "seq_len": 3
2608 }"#;
2609
2610 // Deserialize JSON to CurrRead - this should panic with InvalidSorting
2611 let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2612 }
2613
2614 #[test]
2615 #[should_panic(expected = "invalid mod coordinates")]
2616 fn invalid_sorting_reverse_alignment() {
2617 let invalid_json = r#"{
2618 "alignment_type": "primary_reverse",
2619 "alignment": {
2620 "start": 10,
2621 "end": 40
2622 },
2623 "mod_table": [
2624 {
2625 "data": [[2, 30, 180], [1, 20, 220], [0, 10, 200]]
2626 }
2627 ],
2628 "seq_len": 3
2629 }"#;
2630
2631 // Deserialize JSON to CurrRead - this should panic with InvalidSorting
2632 let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
2633 }
2634}