use std::cmp;
use std::str::FromStr;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
/// One alignment record of a PAF (Pairwise mApping Format) line.
///
/// Fields are declared in the column order used by the tab-separated text
/// form exercised in this file's tests: twelve positional columns followed by
/// the `tp`, `cm`, `s1`, `dv` and `rl` tags. Because the record is
/// (de)serialized positionally, the field order here is significant.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub(crate) struct PafRecord {
/// Query sequence name as raw bytes; trailing NUL bytes are dropped and
/// invalid UTF-8 is replaced lossily when the record is written.
#[serde(
serialize_with = "serialize_bytes",
deserialize_with = "deserialize_bytes"
)]
pub query_name: Vec<u8>,
/// Total length of the query sequence.
pub query_len: i32,
/// Start of the alignment on the query (presumably 0-based, per the PAF
/// convention -- confirm against the producer).
pub query_start: i32,
/// End of the alignment on the query.
pub query_end: i32,
/// Relative strand; `is_internal` treats `'+'` as forward and any other
/// value as reverse.
pub strand: char,
/// Target sequence name as raw bytes; written the same way as `query_name`.
#[serde(
serialize_with = "serialize_bytes",
deserialize_with = "deserialize_bytes"
)]
pub target_name: Vec<u8>,
/// Total length of the target sequence.
pub target_len: i32,
/// Start of the alignment on the target.
pub target_start: i32,
/// End of the alignment on the target.
pub target_end: i32,
/// Tenth PAF column (number of matching residues in the PAF specification).
pub match_len: i32,
/// Eleventh PAF column (alignment block length in the PAF specification).
pub block_len: i32,
/// Twelfth PAF column (mapping quality in the PAF specification).
pub mapq: u32,
/// `tp` tag, written as `tp:A:<char>`.
#[serde(serialize_with = "serialize_tp", deserialize_with = "deserialize_tag")]
pub tp: char,
/// `cm` tag, written as `cm:i:<int>`.
#[serde(serialize_with = "serialize_cm", deserialize_with = "deserialize_tag")]
pub cm: i32,
/// `s1` tag, written as `s1:i:<int>`.
#[serde(serialize_with = "serialize_s1", deserialize_with = "deserialize_tag")]
pub s1: i32,
/// `dv` tag, written as `dv:f:<value>` with four decimals, or `dv:f:0`
/// when the value is below `f32::EPSILON`.
#[serde(serialize_with = "serialize_dv", deserialize_with = "deserialize_tag")]
pub dv: f32,
/// `rl` tag, written as `rl:i:<int>`.
#[serde(serialize_with = "serialize_rl", deserialize_with = "deserialize_tag")]
pub rl: i32,
}
impl PafRecord {
    /// Reports whether the overhang of this mapping, relative to the aligned
    /// span, stays strictly below `max_overhang_ratio`.
    ///
    /// The overhang is the sum of the shorter unaligned flank on each side of
    /// the alignment. For a `'+'` strand, query and target flanks are paired
    /// start-with-start and end-with-end; for any other strand the target
    /// flanks are swapped. The aligned span is the longer of the query and
    /// target alignment intervals.
    ///
    /// A zero-length alignment produces a non-finite ratio (division by zero
    /// in `f32`), which is compared as-is.
    pub(crate) fn is_internal(&self, max_overhang_ratio: f32) -> bool {
        let query_tail = self.query_len - self.query_end;
        let target_tail = self.target_len - self.target_end;

        let (leading, trailing) = match self.strand {
            '+' => (
                cmp::min(self.query_start, self.target_start),
                cmp::min(query_tail, target_tail),
            ),
            // Every non-'+' strand is handled as a reverse-strand mapping.
            _ => (
                cmp::min(self.query_start, target_tail),
                cmp::min(query_tail, self.target_start),
            ),
        };

        let aligned_span = cmp::max(
            self.query_end - self.query_start,
            self.target_end - self.target_start,
        );

        let ratio = (leading + trailing) as f32 / aligned_span as f32;
        ratio < max_overhang_ratio
    }
}
/// Serializes a byte-string field as text.
///
/// Trailing NUL bytes are removed first, then the remainder is decoded as
/// UTF-8 with invalid sequences replaced (lossy) and emitted as a string.
fn serialize_bytes<S>(bytes: &[u8], serializer: S) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    let text = String::from_utf8_lossy(trim_null_bytes(bytes));
    serializer.serialize_str(&text)
}
/// Returns `data` with all trailing NUL (`0`) bytes removed.
///
/// NUL bytes that are followed by a non-NUL byte are kept; an input that is
/// empty or consists solely of NUL bytes yields an empty slice.
fn trim_null_bytes(data: &[u8]) -> &[u8] {
    // Length of the prefix that ends at the last non-NUL byte (0 if none).
    let kept = data
        .iter()
        .rposition(|&byte| byte != 0)
        .map_or(0, |last| last + 1);
    &data[..kept]
}
fn deserialize_bytes<'de, D>(deserializer: D) -> Result<Vec<u8>, D::Error>
where
D: Deserializer<'de>,
{
let s: &str = Deserialize::deserialize(deserializer)?;
Ok(s.as_bytes().to_vec())
}
/// Serializes `value` as the `tp` tag by delegating to
/// `serialize_tag_with_name` with the tag name `"tp"`.
fn serialize_tp<S, T>(value: &T, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
T: std::fmt::Display,
{
serialize_tag_with_name("tp", value, serializer)
}
/// Serializes `value` as the `cm` tag by delegating to
/// `serialize_tag_with_name` with the tag name `"cm"`.
fn serialize_cm<S, T>(value: &T, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
T: std::fmt::Display,
{
serialize_tag_with_name("cm", value, serializer)
}
/// Serializes `value` as the `s1` tag by delegating to
/// `serialize_tag_with_name` with the tag name `"s1"`.
fn serialize_s1<S, T>(value: &T, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
T: std::fmt::Display,
{
serialize_tag_with_name("s1", value, serializer)
}
/// Serializes the `dv` tag.
///
/// Values below `f32::EPSILON` are written as the bare digit `0`; every other
/// value is rendered with exactly four decimal places. The rendered text is
/// then handed to `serialize_tag_with_name` under the name `"dv"`.
fn serialize_dv<S>(value: &f32, serializer: S) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    let rendered = match *value {
        dv if dv < f32::EPSILON => String::from("0"),
        dv => format!("{dv:.4}"),
    };
    serialize_tag_with_name("dv", &rendered, serializer)
}
/// Serializes `value` as the `rl` tag by delegating to
/// `serialize_tag_with_name` with the tag name `"rl"`.
fn serialize_rl<S, T>(value: &T, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
T: std::fmt::Display,
{
serialize_tag_with_name("rl", value, serializer)
}
/// Serializes a tag as the string `<name>:<type>:<value>`.
///
/// The one-letter type code is chosen from the Rust type of `value`
/// (`char` -> `A`, `i32` -> `i`, `f32` -> `f`); for any other type the raw
/// `std::any::type_name` output is used as the code. The `dv` tag is always
/// written with type code `f`, whatever the type of `value`, because its
/// value arrives pre-formatted.
fn serialize_tag_with_name<S, T>(name: &str, value: &T, serializer: S) -> Result<S::Ok, S::Error>
where
    S: Serializer,
    T: std::fmt::Display,
{
    let type_code = if name == "dv" {
        "f"
    } else {
        match std::any::type_name::<T>() {
            "char" => "A",
            "i32" => "i",
            "f32" => "f",
            other => other,
        }
    };
    serializer.serialize_str(&format!("{name}:{type_code}:{value}"))
}
fn deserialize_tag<'de, T, D>(deserializer: D) -> Result<T, D::Error>
where
T: FromStr,
T::Err: std::fmt::Display,
D: Deserializer<'de>,
{
let s: &str = Deserialize::deserialize(deserializer)?;
s.split(':')
.next_back()
.ok_or_else(|| serde::de::Error::custom("Invalid field format"))
.and_then(|val| val.parse::<T>().map_err(serde::de::Error::custom))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference record shared by the tests; individual tests override only
    /// the fields they exercise via struct-update syntax.
    fn sample_record() -> PafRecord {
        PafRecord {
            query_name: b"SRR28370649.1".to_vec(),
            query_len: 4402,
            query_start: 40,
            query_end: 237,
            strand: '-',
            target_name: b"SRR28370649.7311".to_vec(),
            target_len: 5094,
            target_start: 41,
            target_end: 238,
            match_len: 190,
            block_len: 197,
            mapq: 0,
            tp: 'S',
            cm: 59,
            s1: 190,
            dv: 0.0022,
            rl: 56,
        }
    }

    /// Expected serialized line for `sample_record()` with the given rendering
    /// of the `dv` value.
    fn expected_line(dv: &str) -> String {
        format!(
            "SRR28370649.1\t4402\t40\t237\t-\tSRR28370649.7311\t5094\t41\t238\t190\t197\t0\ttp:A:S\tcm:i:59\ts1:i:190\tdv:f:{dv}\trl:i:56\n"
        )
    }

    /// Serializes one record with a tab-delimited, header-less CSV writer and
    /// returns the produced text.
    fn to_paf_line(record: PafRecord) -> String {
        let mut wtr = csv::WriterBuilder::new()
            .delimiter(b'\t')
            .has_headers(false)
            .from_writer(vec![]);
        wtr.serialize(record).unwrap();
        String::from_utf8(wtr.into_inner().unwrap()).unwrap()
    }

    /// Forward-strand record with the given `(len, start, end)` coordinates;
    /// the remaining fields are irrelevant to `is_internal`.
    fn forward_overlap(query: (i32, i32, i32), target: (i32, i32, i32)) -> PafRecord {
        let (query_len, query_start, query_end) = query;
        let (target_len, target_start, target_end) = target;
        PafRecord {
            strand: '+',
            query_len,
            query_start,
            query_end,
            target_len,
            target_start,
            target_end,
            ..sample_record()
        }
    }

    #[test]
    fn test_deserialize_mapping() {
        let buf = b"SRR28370649.1\t4402\t40\t237\t-\tSRR28370649.7311\t5094\t41\t238\t190\t197\t0\ttp:A:S\tcm:i:59\ts1:i:190\tdv:f:0.0022\trl:i:56";
        let mut rdr = csv::ReaderBuilder::new()
            .delimiter(b'\t')
            .has_headers(false)
            .from_reader(&buf[..]);
        // Collect first so the test fails if the reader yields no record at
        // all, instead of passing vacuously.
        let records = rdr
            .deserialize::<PafRecord>()
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(records, vec![sample_record()]);
    }

    #[test]
    fn test_serialize_mapping() {
        assert_eq!(to_paf_line(sample_record()), expected_line("0.0022"));
    }

    #[test]
    fn test_serialize_mapping_null_terminated_qname() {
        let mapping = PafRecord {
            query_name: b"SRR28370649.1\0".to_vec(),
            ..sample_record()
        };
        assert_eq!(to_paf_line(mapping), expected_line("0.0022"));
    }

    #[test]
    fn test_serialize_mapping_dv_round_down() {
        let mapping = PafRecord {
            dv: 0.0022111,
            ..sample_record()
        };
        assert_eq!(to_paf_line(mapping), expected_line("0.0022"));
    }

    #[test]
    fn test_serialize_mapping_dv_round_up() {
        let mapping = PafRecord {
            dv: 0.0021999,
            ..sample_record()
        };
        assert_eq!(to_paf_line(mapping), expected_line("0.0022"));
    }

    #[test]
    fn test_serialize_mapping_dv_fill_to_decimal_places() {
        let mapping = PafRecord {
            dv: 0.004,
            ..sample_record()
        };
        assert_eq!(to_paf_line(mapping), expected_line("0.0040"));
    }

    #[test]
    fn test_serialize_mapping_dv_zero() {
        let mapping = PafRecord {
            dv: 0.0,
            ..sample_record()
        };
        assert_eq!(to_paf_line(mapping), expected_line("0"));
    }

    #[test]
    fn test_is_internal() {
        let mapping = forward_overlap((390, 46, 317), (278, 4, 275));
        assert!(mapping.is_internal(0.2));
    }

    #[test]
    fn test_is_internal2() {
        let mapping = forward_overlap((298, 1, 297), (398, 54, 350));
        assert!(mapping.is_internal(0.2));
    }

    #[test]
    fn test_is_internal3() {
        let mapping = forward_overlap((390, 0, 355), (418, 39, 394));
        assert!(!mapping.is_internal(0.05));
    }

    #[test]
    fn test_is_internal_reverse_strand() {
        // Reverse strand pairs the query start with the target tail:
        // overhang = min(40, 4856) + min(4165, 41) = 81 over a span of 197.
        let mapping = sample_record();
        assert!(mapping.is_internal(0.5));
        assert!(!mapping.is_internal(0.2));
    }
}