use crate::{
AllowedAGCTN, Contains as _, Error, F32Bw0and1, FilterByRefCoords, InputModOptions,
InputRegionOptions, InputWindowing, ModChar, ReadState, ThresholdState, nanalogue_mm_ml_parser,
};
use bedrs::prelude::{Intersect as _, StrandedBed3};
use bedrs::{Bed3, Coordinates as _, Strand};
use bio_types::genome::AbstractInterval as _;
use derive_builder::Builder;
use fibertools_rs::utils::{
bamannotations::{FiberAnnotation, Ranges},
basemods::{BaseMod, BaseMods},
};
use polars::{df, prelude::DataFrame};
use rust_htslib::{bam::ext::BamRecordExtensions as _, bam::record::Record};
use serde::{Deserialize, Serialize};
use std::{
cmp::Ordering,
collections::{HashMap, HashSet},
fmt::{self, Write as _},
ops::Range,
rc::Rc,
};
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct NoData;
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct OnlyAlignData;
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct OnlyAlignDataComplete;
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[non_exhaustive]
pub struct AlignAndModData;
pub trait CurrReadState {}
impl CurrReadState for NoData {}
impl CurrReadState for OnlyAlignData {}
impl CurrReadState for OnlyAlignDataComplete {}
impl CurrReadState for AlignAndModData {}
pub trait CurrReadStateWithAlign {}
impl CurrReadStateWithAlign for OnlyAlignData {}
impl CurrReadStateWithAlign for OnlyAlignDataComplete {}
impl CurrReadStateWithAlign for AlignAndModData {}
pub trait CurrReadStateOnlyAlign {}
impl CurrReadStateOnlyAlign for OnlyAlignData {}
impl CurrReadStateOnlyAlign for OnlyAlignDataComplete {}
#[derive(Debug, Clone, PartialEq)]
pub struct CurrRead<S: CurrReadState> {
state: ReadState,
read_id: String,
seq_len: Option<u64>,
align_len: Option<u64>,
mods: (BaseMods, ThresholdState),
contig_id_and_start: Option<(i32, u64)>,
contig_name: Option<String>,
mod_base_qual_thres: u8,
marker: std::marker::PhantomData<S>,
}
impl Default for CurrRead<NoData> {
fn default() -> Self {
CurrRead::<NoData> {
state: ReadState::PrimaryFwd,
read_id: String::new(),
seq_len: None,
align_len: None,
mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
contig_id_and_start: None,
contig_name: None,
mod_base_qual_thres: 0,
marker: std::marker::PhantomData::<NoData>,
}
}
}
impl CurrRead<NoData> {
pub fn set_read_state_and_id(self, record: &Record) -> Result<CurrRead<OnlyAlignData>, Error> {
let read_id = match str::from_utf8(record.qname()) {
Ok(v) => v.to_string(),
Err(e) => {
return Err(Error::InvalidReadID(format!(
"error in setting read id, which possibly violates BAM requirements: {e}"
)));
}
};
if record.is_paired()
|| record.is_proper_pair()
|| record.is_first_in_template()
|| record.is_last_in_template()
|| record.is_mate_reverse()
|| record.is_mate_unmapped()
|| record.is_duplicate()
|| record.is_quality_check_failed()
{
return Err(Error::NotImplemented(format!(
"paired-read/mate-read/duplicate/qual-check-failed flags not supported! read_id: {read_id}"
)));
}
let state = match (
record.is_reverse(),
record.is_unmapped(),
record.is_secondary(),
record.is_supplementary(),
) {
(false, true, false, false) => ReadState::Unmapped,
(true, false, false, false) => ReadState::PrimaryRev,
(true, false, true, false) => ReadState::SecondaryRev,
(false, false, true, false) => ReadState::SecondaryFwd,
(true, false, false, true) => ReadState::SupplementaryRev,
(false, false, false, true) => ReadState::SupplementaryFwd,
(false, false, false, false) => ReadState::PrimaryFwd,
_ => {
return Err(Error::UnknownAlignState(format!(
"invalid flag combination! read_id: {read_id}",
)));
}
};
Ok(CurrRead::<OnlyAlignData> {
state,
read_id,
seq_len: None,
align_len: None,
mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
contig_id_and_start: None,
contig_name: None,
mod_base_qual_thres: 0,
marker: std::marker::PhantomData::<OnlyAlignData>,
})
}
pub fn try_from_only_alignment(
self,
record: &Record,
) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
self.try_from_only_alignment_seq_len_optional(record, true)
}
pub fn try_from_only_alignment_zero_seq_len(
self,
record: &Record,
) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
self.try_from_only_alignment_seq_len_optional(record, false)
}
pub fn try_from_only_alignment_seq_len_optional(
self,
record: &Record,
is_seq_len_non_zero: bool,
) -> Result<CurrRead<OnlyAlignDataComplete>, Error> {
let mut curr_read_state = CurrRead::default().set_read_state_and_id(record)?;
if !curr_read_state.read_state().is_unmapped() {
curr_read_state = curr_read_state
.set_align_len(record)?
.set_contig_id_and_start(record)?
.set_contig_name(record)?;
}
if is_seq_len_non_zero {
curr_read_state = curr_read_state.set_seq_len(record)?;
} else {
curr_read_state.seq_len = Some(0u64);
}
let CurrRead::<OnlyAlignData> {
state,
read_id,
seq_len,
align_len,
contig_id_and_start,
contig_name,
..
} = curr_read_state;
Ok(CurrRead::<OnlyAlignDataComplete> {
state,
read_id,
seq_len,
align_len,
mods: (BaseMods { base_mods: vec![] }, ThresholdState::default()),
contig_id_and_start,
contig_name,
mod_base_qual_thres: 0,
marker: std::marker::PhantomData::<OnlyAlignDataComplete>,
})
}
}
impl<S: CurrReadStateWithAlign + CurrReadState> CurrRead<S> {
#[must_use]
pub fn read_state(&self) -> ReadState {
self.state
}
pub fn set_seq_len(mut self, record: &Record) -> Result<Self, Error> {
self.seq_len = match self.seq_len {
Some(_) => {
return Err(Error::InvalidDuplicates(format!(
"cannot set sequence length again! read_id: {}",
self.read_id()
)));
}
None => match record.seq_len() {
0 => {
return Err(Error::ZeroSeqLen(format!(
"avoid including 0-len sequences while parsing mod data in this program, read_id: {}",
self.read_id()
)));
}
l => Some(u64::try_from(l)?),
},
};
Ok(self)
}
pub fn seq_len(&self) -> Result<u64, Error> {
self.seq_len.ok_or(Error::UnavailableData(format!(
"seq len not available, read_id: {}",
self.read_id()
)))
}
pub fn set_align_len(mut self, record: &Record) -> Result<Self, Error> {
self.align_len = match self.align_len {
Some(_) => Err(Error::InvalidDuplicates(format!(
"cannot set alignment length again! read_id: {}",
self.read_id()
))),
None => {
if self.read_state().is_unmapped() {
Err(Error::Unmapped(format!(
"cannot set alignment properties for unmapped reads, read_id: {}",
self.read_id()
)))
} else {
let st = record.pos();
let en = record.reference_end();
if en > st && st >= 0 {
#[expect(
clippy::arithmetic_side_effects,
reason = "en > st && st >= 0 guarantee no i64 overflows"
)]
#[expect(
clippy::missing_panics_doc,
reason = "en > st && st >= 0 guarantee no panic"
)]
Ok(Some(
(en - st)
.try_into()
.expect("en>st && st>=0 guarantee no problems i64->u64"),
))
} else {
Err(Error::InvalidAlignLength(format!(
"in `set_align_len`, start: {st}, end: {en} invalid! i.e. en <= st or st < 0, read_id: {}",
self.read_id()
)))
}
}
}
}?;
Ok(self)
}
pub fn align_len(&self) -> Result<u64, Error> {
if self.read_state().is_unmapped() {
Err(Error::Unmapped(format!(
"request alignment data on unmapped, read_id: {}",
self.read_id()
)))
} else {
self.align_len.ok_or(Error::UnavailableData(format!(
"alignment data not available, read_id: {}",
self.read_id()
)))
}
}
pub fn set_contig_id_and_start(mut self, record: &Record) -> Result<Self, Error> {
self.contig_id_and_start = match self.contig_id_and_start {
Some(_) => Err(Error::InvalidDuplicates(format!(
"cannot set contig and start again! read_id: {}",
self.read_id()
))),
None => {
if self.read_state().is_unmapped() {
Err(Error::Unmapped(format!(
"setting alignment data on unmapped read, read_id: {}",
self.read_id()
)))
} else {
Ok(Some((record.tid(), record.pos().try_into()?)))
}
}
}?;
Ok(self)
}
pub fn contig_id_and_start(&self) -> Result<(i32, u64), Error> {
if self.read_state().is_unmapped() {
Err(Error::Unmapped(format!(
"requesting alignment data on unmapped read, read_id: {}",
self.read_id()
)))
} else {
self.contig_id_and_start
.ok_or(Error::UnavailableData(format!(
"contig id, start not available, read_id: {}",
self.read_id()
)))
}
}
pub fn set_contig_name(mut self, record: &Record) -> Result<Self, Error> {
self.contig_name = match (self.read_state().is_unmapped(), self.contig_name.is_none()) {
(true, _) => Err(Error::Unmapped(format!(
"set align data on unmapped read! read_id: {}",
self.read_id()
))),
(_, false) => Err(Error::InvalidDuplicates(format!(
"cannot set contig name again! read_id: {}",
self.read_id()
))),
(_, true) => Ok(Some(String::from(record.contig()))),
}?;
Ok(self)
}
pub fn contig_name(&self) -> Result<&str, Error> {
match (self.read_state().is_unmapped(), self.contig_name.as_ref()) {
(true, _) => Err(Error::Unmapped(format!(
"get align data on unmapped read, read_id: {}",
self.read_id()
))),
(_, None) => Err(Error::UnavailableData(format!(
"contig name unavailable, read_id: {}",
self.read_id()
))),
(_, Some(v)) => Ok(v.as_str()),
}
}
#[must_use]
pub fn read_id(&self) -> &str {
self.read_id.as_str()
}
#[must_use]
pub fn strand(&self) -> char {
self.read_state().strand()
}
pub fn seq_on_ref_coords(
&self,
record: &Record,
region: &Bed3<i32, u64>,
) -> Result<Vec<u8>, Error> {
Ok(self
.seq_and_qual_on_ref_coords(record, region)?
.into_iter()
.map(|x| x.map_or(b'.', |y| y.1))
.collect::<Vec<u8>>())
}
#[expect(
clippy::indexing_slicing,
reason = "qual, seq same len and coords will not exceed these"
)]
#[expect(
clippy::type_complexity,
reason = "not complex enough, I think types will make this less clear"
)]
pub fn seq_and_qual_on_ref_coords(
&self,
record: &Record,
region: &Bed3<i32, u64>,
) -> Result<Vec<Option<(bool, u8, u8)>>, Error> {
let seq = record.seq();
let qual = record.qual();
Ok(self
.seq_coords_from_ref_coords(record, region)?
.into_iter()
.map(|x| x.map(|y| (y.0, seq[y.1], qual[y.1])))
.collect::<Vec<Option<(bool, u8, u8)>>>())
}
pub fn seq_coords_from_ref_coords(
&self,
record: &Record,
region: &Bed3<i32, u64>,
) -> Result<Vec<Option<(bool, usize)>>, Error> {
#[expect(
clippy::missing_panics_doc,
reason = "genomic coordinates are far less than (2^64-1)/2 i.e. u64->i64 should be ok"
)]
let interval = {
region
.intersect(&StrandedBed3::<i32, u64>::try_from(self)?)
.and_then(|v| {
let start = i64::try_from(v.start()).expect("genomic coordinates << 2^63");
let end = i64::try_from(v.end()).expect("genomic coordinates << 2^63");
(start < end).then_some(start..end)
})
.ok_or(Error::UnavailableData(
"coord-retrieval: region does not intersect with read".to_owned(),
))
}?;
#[expect(
clippy::arithmetic_side_effects,
reason = "genomic coordinates far less than i64::MAX (approx (2^64-1)/2)"
)]
let mut s: Vec<Option<(bool, usize)>> =
Vec::with_capacity(usize::try_from(2 * (interval.end - interval.start))?);
let mut trim_end_bp: u64 = 0;
for w in record
.aligned_pairs_full()
.skip_while(|x| x[1].is_none_or(|y| !interval.contains(&y)))
.take_while(|x| x[1].is_none_or(|y| interval.contains(&y)))
{
#[expect(
clippy::arithmetic_side_effects,
reason = "coordinates far less than u64::MAX (2^64-1) so no chance of counter overflow"
)]
match w {
[Some(x), Some(_)] => {
s.push(Some((true, usize::try_from(x)?)));
trim_end_bp = 0;
}
[Some(x), None] => {
s.push(Some((false, usize::try_from(x)?)));
trim_end_bp += 1;
}
[None, Some(_)] => {
s.push(None);
trim_end_bp = 0;
}
[None, None] => unreachable!(
"impossible to find bases that are neither on the read nor on the reference"
),
}
}
for _ in 0..trim_end_bp {
let Some(_) = s.pop() else {
unreachable!("trim_end_bp cannot exceed length of s")
};
}
s.shrink_to(0);
Ok(s)
}
pub fn set_mod_data(
self,
record: &Record,
mod_thres: ThresholdState,
min_qual: u8,
) -> Result<CurrRead<AlignAndModData>, Error> {
let result = nanalogue_mm_ml_parser(
record,
|x| mod_thres.contains(x),
|_| true,
|_, _, _| true,
min_qual,
)?;
Ok(CurrRead::<AlignAndModData> {
state: self.state,
read_id: self.read_id,
seq_len: self.seq_len,
align_len: self.align_len,
mods: (result, mod_thres),
contig_id_and_start: self.contig_id_and_start,
contig_name: self.contig_name,
mod_base_qual_thres: min_qual,
marker: std::marker::PhantomData::<AlignAndModData>,
})
}
pub fn set_mod_data_restricted<G, H>(
self,
record: &Record,
mod_thres: ThresholdState,
mod_fwd_pos_filter: G,
mod_filter_base_strand_tag: H,
min_qual: u8,
) -> Result<CurrRead<AlignAndModData>, Error>
where
G: Fn(&usize) -> bool,
H: Fn(&u8, &char, &ModChar) -> bool,
{
let result = nanalogue_mm_ml_parser(
record,
|x| mod_thres.contains(x),
mod_fwd_pos_filter,
mod_filter_base_strand_tag,
min_qual,
)?;
Ok(CurrRead::<AlignAndModData> {
state: self.state,
read_id: self.read_id,
seq_len: self.seq_len,
align_len: self.align_len,
mods: (result, mod_thres),
contig_id_and_start: self.contig_id_and_start,
contig_name: self.contig_name,
mod_base_qual_thres: min_qual,
marker: std::marker::PhantomData::<AlignAndModData>,
})
}
}
impl CurrRead<OnlyAlignDataComplete> {
#[expect(
clippy::missing_panics_doc,
reason = "integer conversions (u64->usize, u64->i64) are expected to not fail as \
genomic coordinates are far smaller than ~2^63"
)]
pub fn set_mod_data_restricted_options<S: InputModOptions + InputRegionOptions>(
self,
record: &Record,
mod_options: &S,
) -> Result<CurrRead<AlignAndModData>, Error> {
let l = usize::try_from(self.seq_len().expect("no error"))
.expect("bit conversion errors unlikely");
let w = mod_options.trim_read_ends_mod();
let interval = if let Some(bed3) = mod_options.region_filter().as_ref() {
let stranded_bed3 = StrandedBed3::<i32, u64>::try_from(&self)?;
if let Some(v) = bed3.intersect(&stranded_bed3) {
if v.start() == stranded_bed3.start() && v.end() == stranded_bed3.end() {
None
} else {
Some(v.start()..v.end())
}
} else {
Some(0..0)
}
} else {
None
};
Ok({
let mut read = self.set_mod_data_restricted(
record,
mod_options.mod_prob_filter(),
|x| w == 0 || (w..l.checked_sub(w).unwrap_or_default()).contains(x),
|_, &s, &t| {
mod_options.tag().is_none_or(|x| x == t)
&& mod_options.mod_strand().is_none_or(|v| s == char::from(v))
},
mod_options.base_qual_filter_mod(),
)?;
if let Some(v) = interval {
match v.start.cmp(&v.end) {
Ordering::Less => read.filter_by_ref_pos(
i64::try_from(v.start)
.expect("no error as genomic coordinates far less than ~2^63"),
i64::try_from(v.end)
.expect("no error as genomic coordinates far less than ~2^63"),
)?,
Ordering::Equal => read.filter_by_ref_pos(i64::MAX - 1, i64::MAX)?,
Ordering::Greater => {
unreachable!("`bedrs` should not allow malformed intervals!")
}
}
}
read
})
}
}
impl CurrRead<AlignAndModData> {
#[must_use]
pub fn mod_data(&self) -> &(BaseMods, ThresholdState) {
&self.mods
}
#[expect(
clippy::pattern_type_mismatch,
reason = "suggested notation is verbose but I am not sure"
)]
pub fn windowed_mod_data_restricted<F>(
&self,
window_function: &F,
win_options: InputWindowing,
tag: ModChar,
) -> Result<Vec<F32Bw0and1>, Error>
where
F: Fn(&[u8]) -> Result<F32Bw0and1, Error>,
{
let win_size = win_options.win.get();
let slide_size = win_options.step.get();
let mut result = Vec::<F32Bw0and1>::new();
let tag_char = tag.val();
let (BaseMods { base_mods: v }, _) = &self.mods;
#[expect(
clippy::missing_panics_doc,
reason = "checked_sub ensures win_size <= mod_data.len() before windowing"
)]
for k in v {
match k {
BaseMod {
modification_type: x,
ranges: track,
..
} if *x == tag_char => {
let mod_data = track.qual();
if let Some(l) = mod_data.len().checked_sub(win_size) {
result.extend(
(0..=l)
.step_by(slide_size)
.map(|i| {
window_function(
mod_data
.get(i..)
.expect("i <= len - win_size")
.get(0..win_size)
.expect("checked len>=win_size so no error"),
)
})
.collect::<Result<Vec<F32Bw0and1>, _>>()?,
);
}
}
_ => {}
}
}
Ok(result)
}
#[must_use]
pub fn base_count_per_mod(&self) -> HashMap<ModChar, u32> {
let mut output = HashMap::<ModChar, u32>::new();
#[expect(
clippy::arithmetic_side_effects,
reason = "u32::MAX approx 4.2 Gb, v unlikely 1 molecule is this modified"
)]
for k in &self.mod_data().0.base_mods {
let base_count =
u32::try_from(k.ranges.annotations.len()).expect("number conversion error");
let _: &mut u32 = output
.entry(ModChar::new(k.modification_type))
.and_modify(|e| *e += base_count)
.or_insert(base_count);
}
output
}
}
trait DisplayCondensedModData {
fn mod_data_section(&self) -> Result<String, fmt::Error>;
}
impl<S> DisplayCondensedModData for CurrRead<S>
where
S: CurrReadStateOnlyAlign + CurrReadState,
{
fn mod_data_section(&self) -> Result<String, fmt::Error> {
Ok(String::new())
}
}
impl DisplayCondensedModData for CurrRead<AlignAndModData> {
#[expect(
clippy::pattern_type_mismatch,
reason = "suggested notation is verbose but I am not sure"
)]
fn mod_data_section(&self) -> Result<String, fmt::Error> {
let mut mod_count_str = String::new();
let (BaseMods { base_mods: v }, w) = &self.mods;
for k in v {
write!(
mod_count_str,
"{}{}{}:{};",
k.modified_base as char,
k.strand,
ModChar::new(k.modification_type),
k.ranges.annotations.len()
)?;
}
if mod_count_str.is_empty() {
write!(mod_count_str, "NA")?;
} else {
write!(
mod_count_str,
"({}, PHRED base qual >= {})",
w, self.mod_base_qual_thres
)?;
}
Ok(format!(",\n\t\"mod_count\": \"{mod_count_str}\""))
}
}
impl<S> fmt::Display for CurrRead<S>
where
S: CurrReadState,
CurrRead<S>: DisplayCondensedModData,
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut output_string = String::from("{\n");
writeln!(output_string, "\t\"read_id\": \"{}\",", self.read_id)?;
if let Some(v) = self.seq_len {
writeln!(output_string, "\t\"sequence_length\": {v},")?;
}
if let Some((v, w)) = self.contig_id_and_start {
let num_str = &v.to_string();
writeln!(
output_string,
"\t\"contig\": \"{}\",",
if let Some(x) = self.contig_name.as_ref() {
x
} else {
num_str
}
)?;
writeln!(output_string, "\t\"reference_start\": {w},")?;
if let Some(x) = self.align_len {
writeln!(
output_string,
"\t\"reference_end\": {},",
w.checked_add(x)
.expect("numeric overflow in calculating reference_end")
)?;
writeln!(output_string, "\t\"alignment_length\": {x},")?;
}
}
write!(output_string, "\t\"alignment_type\": \"{}\"", self.state)?;
writeln!(output_string, "{}", &self.mod_data_section()?)?;
output_string.push('}');
output_string.fmt(f)
}
}
impl<S: CurrReadStateWithAlign + CurrReadState> TryFrom<&CurrRead<S>> for StrandedBed3<i32, u64> {
type Error = Error;
#[expect(
clippy::arithmetic_side_effects,
reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
)]
fn try_from(value: &CurrRead<S>) -> Result<Self, Self::Error> {
match (
value.read_state().strand(),
value.align_len().ok(),
value.contig_id_and_start().ok(),
) {
('.', _, _) => Ok(StrandedBed3::empty()),
(_, None, _) => Err(Error::InvalidAlignLength(format!(
"align len not set while converting to bed3! read_id: {}",
value.read_id()
))),
(_, _, None) => Err(Error::InvalidContigAndStart(format!(
"contig id and start not set while converting to bed3! read_id: {}",
value.read_id()
))),
('+', Some(al), Some((cg, st))) => {
Ok(StrandedBed3::new(cg, st, st + al, Strand::Forward))
}
('-', Some(al), Some((cg, st))) => {
Ok(StrandedBed3::new(cg, st, st + al, Strand::Reverse))
}
(_, _, _) => unreachable!("strand should be +/-/., so we should never reach this"),
}
}
}
impl TryFrom<Record> for CurrRead<AlignAndModData> {
type Error = Error;
fn try_from(record: Record) -> Result<Self, Self::Error> {
let curr_read_state = CurrRead::default()
.try_from_only_alignment(&record)?
.set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
Ok(curr_read_state)
}
}
impl TryFrom<Rc<Record>> for CurrRead<AlignAndModData> {
type Error = Error;
fn try_from(record: Rc<Record>) -> Result<Self, Self::Error> {
let curr_read_state = CurrRead::default()
.try_from_only_alignment(&record)?
.set_mod_data(&record, ThresholdState::GtEq(128), 0)?;
Ok(curr_read_state)
}
}
impl FilterByRefCoords for CurrRead<AlignAndModData> {
fn filter_by_ref_pos(&mut self, start: i64, end: i64) -> Result<(), Error> {
for k in &mut self.mods.0.base_mods {
k.ranges.filter_by_ref_pos(start, end)?;
}
Ok(())
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct CurrReadBuilder {
alignment_type: ReadState,
#[serde(skip_serializing_if = "Option::is_none")]
alignment: Option<AlignmentInfo>,
mod_table: Vec<ModTableEntry>,
read_id: String,
seq_len: u64,
}
impl Default for CurrReadBuilder {
fn default() -> Self {
Self {
alignment_type: ReadState::Unmapped, alignment: None,
mod_table: Vec::new(),
read_id: String::new(),
seq_len: 0,
}
}
}
#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned")]
pub struct AlignmentInfo {
start: u64,
end: u64,
contig: String,
contig_id: i32,
}
#[derive(Builder, Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned")]
pub struct ModTableEntry {
#[builder(field(ty = "char", build = "self.base.try_into()?"))]
base: AllowedAGCTN,
is_strand_plus: bool,
#[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
mod_code: ModChar,
#[builder(setter(into))]
data: Vec<(u64, i64, u8)>,
}
impl CurrReadBuilder {
#[must_use]
pub fn alignment_type(mut self, value: ReadState) -> Self {
self.alignment_type = value;
self
}
#[must_use]
pub fn alignment(mut self, value: AlignmentInfo) -> Self {
self.alignment = Some(value);
self
}
#[must_use]
pub fn mod_table(mut self, value: Vec<ModTableEntry>) -> Self {
self.mod_table = value;
self
}
#[must_use]
pub fn read_id(mut self, value: String) -> Self {
self.read_id = value;
self
}
#[must_use]
pub fn seq_len(mut self, value: u64) -> Self {
self.seq_len = value;
self
}
pub fn build(self) -> Result<CurrRead<AlignAndModData>, Error> {
CurrRead::<AlignAndModData>::try_from(self)
}
}
impl Serialize for CurrRead<AlignAndModData> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let serialized_curr_read: CurrReadBuilder =
self.clone().try_into().map_err(serde::ser::Error::custom)?;
serialized_curr_read.serialize(serializer)
}
}
impl TryFrom<CurrRead<AlignAndModData>> for CurrReadBuilder {
type Error = Error;
fn try_from(curr_read: CurrRead<AlignAndModData>) -> Result<Self, Self::Error> {
let alignment_type = curr_read.read_state();
#[expect(
clippy::arithmetic_side_effects,
reason = "u64 variables won't overflow with genomic coords (<2^64-1)"
)]
let alignment = if curr_read.read_state().is_unmapped() {
None
} else {
let (contig_id, start) = curr_read.contig_id_and_start()?;
let align_len = curr_read.align_len()?;
let contig = curr_read.contig_name()?.to_string();
let end = start + align_len;
Some(AlignmentInfo {
start,
end,
contig,
contig_id,
})
};
let mod_table = condense_base_mods(&curr_read.mod_data().0)?;
let read_id = curr_read.read_id().to_string();
let seq_len = curr_read.seq_len()?;
Ok(CurrReadBuilder {
alignment_type,
alignment,
mod_table,
read_id,
seq_len,
})
}
}
impl<'de> Deserialize<'de> for CurrRead<AlignAndModData> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let serialized = CurrReadBuilder::deserialize(deserializer)?;
serialized.try_into().map_err(serde::de::Error::custom)
}
}
impl TryFrom<CurrReadBuilder> for CurrRead<AlignAndModData> {
type Error = Error;
fn try_from(serialized: CurrReadBuilder) -> Result<Self, Self::Error> {
let (align_len, contig_id_and_start, contig_name, ref_range) = match (
serialized.alignment_type.is_unmapped(),
serialized.alignment.as_ref(),
) {
(false, Some(alignment)) => {
let align_len = {
if let Some(v) = alignment.end.checked_sub(alignment.start) {
Ok(Some(v))
} else {
Err(Error::InvalidAlignCoords(format!(
"is align end {0} < align start {1}? read {2} failed in `CurrRead` building!",
alignment.end, alignment.start, serialized.read_id
)))
}
}?;
let contig_id_and_start = Some((alignment.contig_id, alignment.start));
let contig_name = Some(alignment.contig.clone());
(
align_len,
contig_id_and_start,
contig_name,
i64::try_from(alignment.start)?..i64::try_from(alignment.end)?,
)
}
(true, None) => (None, None, None, (-1i64)..0),
(_, _) => {
return Err(Error::UnknownAlignState(format!(
"alignment_type and alignment info not matching while building CurrRead! read_id: {}",
serialized.read_id,
)));
}
};
let base_mods = reconstruct_base_mods(
&serialized.mod_table,
serialized.alignment_type.strand() == '-',
ref_range,
serialized.seq_len,
)?;
Ok(CurrRead {
state: serialized.alignment_type,
read_id: serialized.read_id,
seq_len: Some(serialized.seq_len),
align_len,
mods: (base_mods, ThresholdState::GtEq(0)), contig_id_and_start,
contig_name,
mod_base_qual_thres: 0, marker: std::marker::PhantomData,
})
}
}
fn condense_base_mods(base_mods: &BaseMods) -> Result<Vec<ModTableEntry>, Error> {
let mut mod_table = Vec::new();
for base_mod in &base_mods.base_mods {
let entries: Result<Vec<_>, Error> = base_mod
.ranges
.annotations
.iter()
.map(|k| {
let start: u64 = k.start.try_into()?;
let ref_start = k.reference_start.unwrap_or(-1);
Ok((start, ref_start, k.qual))
})
.collect();
mod_table.push(ModTableEntry {
base: AllowedAGCTN::try_from(base_mod.modified_base)?,
is_strand_plus: base_mod.strand == '+',
mod_code: ModChar::new(base_mod.modification_type),
data: entries?,
});
}
Ok(mod_table)
}
fn reconstruct_base_mods(
mod_table: &[ModTableEntry],
is_reverse: bool,
ref_range: Range<i64>,
seq_len: u64,
) -> Result<BaseMods, Error> {
let mut base_mods = Vec::new();
for entry in mod_table {
let annotations = {
let mut annotations = Vec::<FiberAnnotation>::with_capacity(entry.data.len());
let mut valid_range = 0..seq_len;
#[expect(
clippy::arithmetic_side_effects,
reason = "overflow errors not possible as genomic coordinates << 2^64"
)]
for &(start, ref_start, qual) in &entry.data {
if !valid_range.contains(&start) {
return Err(Error::InvalidModCoords(String::from(
"in mod table, read coords > seq length or read coords not sorted (NOTE: \
ascending needed even if reversed read)!",
)));
}
valid_range = start + 1..seq_len;
let ref_start_after_check = match (ref_start, ref_range.contains(&ref_start)) {
(-1, _) => None,
(v, true) if v > -1 => Some(v),
(v, _) => {
return Err(Error::InvalidAlignCoords(format!(
"coordinate {v} invalid in mod table (exceeds alignment coords or is < -1)"
)));
}
};
let start_i64 = i64::try_from(start)?;
annotations.push(FiberAnnotation {
start: start_i64,
end: start_i64 + 1,
length: 1,
qual,
reference_start: ref_start_after_check,
reference_end: ref_start_after_check,
reference_length: ref_start_after_check.is_some().then_some(0),
extra_columns: None,
});
}
annotations
};
let ranges = Ranges::from_annotations(annotations, seq_len.try_into()?, is_reverse);
let strand = if entry.is_strand_plus { '+' } else { '-' };
base_mods.push(BaseMod {
modified_base: u8::try_from(char::from(entry.base))?,
strand,
modification_type: entry.mod_code.val(),
ranges,
record_is_reverse: is_reverse,
});
}
base_mods.sort();
let mut seen_combinations = HashSet::new();
for base_mod in &base_mods {
let combination = (base_mod.strand, base_mod.modification_type);
if seen_combinations.contains(&combination) {
return Err(Error::InvalidDuplicates(format!(
"Duplicate strand '{}' and modification_type '{}' combination found",
base_mod.strand, base_mod.modification_type
)));
}
let _: bool = seen_combinations.insert(combination);
}
Ok(BaseMods { base_mods })
}
#[expect(
clippy::arithmetic_side_effects,
reason = "start + alen cannot overflow as genomic coordinates are far below u64::MAX"
)]
pub fn curr_reads_to_dataframe(reads: &[CurrRead<AlignAndModData>]) -> Result<DataFrame, Error> {
let mut read_ids: Vec<String> = Vec::new();
let mut seq_lens: Vec<Option<u64>> = Vec::new();
let mut alignment_types: Vec<String> = Vec::new();
let mut align_starts: Vec<Option<u64>> = Vec::new();
let mut align_ends: Vec<Option<u64>> = Vec::new();
let mut contigs: Vec<Option<String>> = Vec::new();
let mut contig_ids: Vec<Option<i32>> = Vec::new();
let mut bases: Vec<String> = Vec::new();
let mut is_strand_plus: Vec<bool> = Vec::new();
let mut mod_codes: Vec<String> = Vec::new();
let mut positions: Vec<u64> = Vec::new();
let mut ref_positions: Vec<i64> = Vec::new();
let mut mod_qualities: Vec<u32> = Vec::new();
for read in reads {
let read_id = read.read_id();
let seq_len = read.seq_len().ok();
let alignment_type = read.read_state();
let (contig_id, align_start, align_end, contig_name) =
if let (Ok((cid, start)), Ok(alen), Ok(cname)) = (
read.contig_id_and_start(),
read.align_len(),
read.contig_name(),
) {
(
Some(cid),
Some(start),
Some(start + alen),
Some(cname.to_string()),
)
} else {
(None, None, None, None)
};
let mod_data = read.mod_data();
let mod_table = condense_base_mods(&mod_data.0)?;
for mod_entry in &mod_table {
for &(start, ref_start, qual) in &mod_entry.data {
read_ids.push(read_id.to_string());
seq_lens.push(seq_len);
alignment_types.push(alignment_type.to_string());
align_starts.push(align_start);
align_ends.push(align_end);
contigs.push(contig_name.clone());
contig_ids.push(contig_id);
bases.push(mod_entry.base.to_string());
is_strand_plus.push(mod_entry.is_strand_plus);
mod_codes.push(mod_entry.mod_code.to_string());
positions.push(start);
ref_positions.push(ref_start);
mod_qualities.push(u32::from(qual));
}
}
}
let df = df! {
"read_id" => read_ids,
"seq_len" => seq_lens,
"alignment_type" => alignment_types,
"align_start" => align_starts,
"align_end" => align_ends,
"contig" => contigs,
"contig_id" => contig_ids,
"base" => bases,
"is_strand_plus" => is_strand_plus,
"mod_code" => mod_codes,
"position" => positions,
"ref_position" => ref_positions,
"mod_quality" => mod_qualities,
}?;
Ok(df)
}
#[cfg(test)]
mod test_error_handling {
use super::*;
#[test]
fn set_read_state_not_implemented_error() {
for flag_value in 0..4096u16 {
let mut record = Record::new();
record.set_qname(b"test_read");
record.set_flags(flag_value);
let curr_read = CurrRead::default();
match (flag_value, curr_read.set_read_state_and_id(&record)) {
(0 | 4 | 16 | 256 | 272 | 2048 | 2064, Ok(_))
| (
20 | 260 | 276 | 2052 | 2068 | 2304 | 2308 | 2320 | 2324,
Err(Error::UnknownAlignState(_)),
)
| (_, Err(Error::NotImplemented(_))) => {}
(_, _) => unreachable!(),
}
}
}
#[test]
#[should_panic(expected = "InvalidDuplicates")]
fn reconstruct_base_mods_detects_duplicates() {
let mod_entries = vec![
ModTableEntry {
base: AllowedAGCTN::T,
is_strand_plus: true,
mod_code: ModChar::new('T'),
data: vec![(0, 0, 10)],
},
ModTableEntry {
base: AllowedAGCTN::T,
is_strand_plus: true,
mod_code: ModChar::new('T'),
data: vec![(1, 1, 20)],
},
];
let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
}
#[test]
fn reconstruct_base_mods_accepts_unique_combinations() {
let mod_entries = vec![
ModTableEntry {
base: AllowedAGCTN::T,
is_strand_plus: true,
mod_code: ModChar::new('T'),
data: vec![(0, 0, 10)],
},
ModTableEntry {
base: AllowedAGCTN::C,
is_strand_plus: true,
mod_code: ModChar::new('m'),
data: vec![(1, 1, 20)],
},
ModTableEntry {
base: AllowedAGCTN::T,
is_strand_plus: false,
mod_code: ModChar::new('T'),
data: vec![(2, 2, 30)],
},
];
let _: BaseMods = reconstruct_base_mods(&mod_entries, false, 0..i64::MAX, 10).unwrap();
}
}
#[cfg(test)]
mod test_serde {
use super::*;
use crate::nanalogue_bam_reader;
use indoc::indoc;
use rust_htslib::bam::Read as _;
#[test]
fn first_record_serde() -> Result<(), Error> {
let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
let first_record = reader.records().next().unwrap()?;
let curr_read = CurrRead::default()
.try_from_only_alignment(&first_record)?
.set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;
let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;
let expected_json_str = indoc! {r#"
{
"alignment_type": "primary_forward",
"alignment": {
"start": 9,
"end": 17,
"contig": "dummyI",
"contig_id": 0
},
"mod_table": [
{
"base": "T",
"is_strand_plus": true,
"mod_code": "T",
"data": [
[0, 9, 4],
[3, 12, 7],
[4, 13, 9],
[7, 16, 6]
]
}
],
"read_id": "5d10eb9a-aae1-4db8-8ec6-7ebb34d32575",
"seq_len": 8
}"#};
let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;
assert_eq!(actual_json, expected_json);
let deserialized_curr_read: CurrRead<AlignAndModData> =
serde_json::from_str(expected_json_str)?;
assert_eq!(deserialized_curr_read, curr_read);
Ok(())
}
#[test]
fn first_record_roundtrip() -> Result<(), Error> {
let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
let first_record = reader.records().next().unwrap()?;
let original_curr_read = CurrRead::default()
.try_from_only_alignment(&first_record)?
.set_mod_data(&first_record, ThresholdState::GtEq(0), 0)?;
let json_str = serde_json::to_string_pretty(&original_curr_read)?;
let deserialized_curr_read: CurrRead<AlignAndModData> = serde_json::from_str(&json_str)?;
assert_eq!(deserialized_curr_read, original_curr_read);
Ok(())
}
#[test]
fn blank_json_record_roundtrip() -> Result<(), Error> {
let json_str = indoc! {r"
{
}"};
let curr_read: CurrRead<AlignAndModData> = serde_json::from_str(json_str)?;
let serialized_json = serde_json::to_string_pretty(&curr_read)?;
let roundtrip_curr_read: CurrRead<AlignAndModData> =
serde_json::from_str(&serialized_json)?;
assert_eq!(curr_read, roundtrip_curr_read);
Ok(())
}
#[test]
fn example_1_unmapped() -> Result<(), Error> {
let fourth_record = {
let mut reader = nanalogue_bam_reader("examples/example_1.bam")?;
let mut records = reader.records();
for _ in 0..3 {
let _: Record = records.next().unwrap()?;
}
records.next().unwrap()?
};
let curr_read = CurrRead::default()
.try_from_only_alignment(&fourth_record)?
.set_mod_data(&fourth_record, ThresholdState::GtEq(0), 0)?;
let actual_json: serde_json::Value = serde_json::to_value(&curr_read)?;
let expected_json_str = indoc! {r#"
{
"alignment_type": "unmapped",
"mod_table": [
{
"base": "G",
"is_strand_plus": false,
"mod_code": "7200",
"data": [
[28, -1, 0],
[29, -1, 0],
[30, -1, 0],
[32, -1, 0],
[43, -1, 77],
[44, -1, 0]
]
},
{
"base": "T",
"is_strand_plus": true,
"mod_code": "T",
"data": [
[3, -1, 221],
[8, -1, 242],
[27, -1, 0],
[39, -1, 47],
[47, -1, 239]
]
}
],
"read_id": "a4f36092-b4d5-47a9-813e-c22c3b477a0c",
"seq_len": 48
}"#};
let expected_json: serde_json::Value = serde_json::from_str(expected_json_str)?;
assert_eq!(actual_json, expected_json);
Ok(())
}
#[test]
#[should_panic(expected = "invalid alignment coordinates")]
fn invalid_align_coords_unmapped_with_reference_positions() {
let invalid_json = r#"{
"mod_table": [
{
"data": [[2, 3, 200]]
}
],
"seq_len": 10
}"#;
let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
}
#[test]
#[should_panic(expected = "invalid mod coordinates")]
fn invalid_sequence_length() {
let invalid_json = r#"{
"mod_table": [
{
"data": [[20, -1, 200]]
}
],
"seq_len": 10
}"#;
let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
}
#[test]
#[should_panic(expected = "invalid value")]
fn invalid_sequence_coordinate() {
let invalid_json = r#"{
"alignment_type": "primary_forward",
"alignment": {
"start": 0,
"end": 30
},
"mod_table": [
{
"data": [[-1, 1, 200]]
}
],
"seq_len": 30
}"#;
let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
}
#[test]
#[should_panic(expected = "invalid alignment coordinates")]
fn invalid_alignment_coordinates() {
let invalid_json = r#"{
"alignment_type": "primary_forward",
"alignment": {
"start": 10,
"end": 25
},
"mod_table": [
{
"data": [[0, 10, 200], [1, 20, 180], [2, 30, 220]]
}
],
"seq_len": 3
}"#;
let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
}
#[test]
#[should_panic(expected = "invalid mod coordinates")]
fn invalid_sorting_forward_alignment() {
let invalid_json = r#"{
"alignment_type": "primary_forward",
"alignment": {
"start": 10,
"end": 40
},
"mod_table": [
{
"data": [[0, 10, 200], [2, 30, 180], [1, 20, 220]]
}
],
"seq_len": 3
}"#;
let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
}
#[test]
#[should_panic(expected = "invalid mod coordinates")]
fn invalid_sorting_reverse_alignment() {
let invalid_json = r#"{
"alignment_type": "primary_reverse",
"alignment": {
"start": 10,
"end": 40
},
"mod_table": [
{
"data": [[2, 30, 180], [1, 20, 220], [0, 10, 200]]
}
],
"seq_len": 3
}"#;
let _: CurrRead<AlignAndModData> = serde_json::from_str(invalid_json).unwrap();
}
}