use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
use crate::{Error, ErrorKind, Result};
/// A typed value carried by an optional field.
///
/// The variant is selected by `Type::parse` from the field's type code.
#[derive(Debug)]
pub enum Type {
    /// Signed integer value (type code `i`).
    Int(i64),
    /// Floating-point value (type code `f`).
    Float(f64),
    /// Text value (type code `Z`, and the fallback for any unrecognized code).
    String(String),
    /// Single character value (type code `A`).
    Char(char),
}

impl Type {
    /// Converts the textual `value` of an optional field into a typed value.
    ///
    /// * `"i"` parses `value` as `i64`.
    /// * `"f"` parses `value` as `f64`.
    /// * `"A"` keeps the first character of `value`.
    /// * anything else (including `"Z"`) keeps `value` as text.
    ///
    /// Returns `None` when a numeric parse fails or when an `"A"` value is empty.
    fn parse(field_type: &str, value: &str) -> Option<Self> {
        let parsed = match field_type {
            "i" => Type::Int(value.parse().ok()?),
            "f" => Type::Float(value.parse().ok()?),
            "A" => Type::Char(value.chars().next()?),
            // "Z" and every unknown type code are stored verbatim as text.
            _ => Type::String(value.to_owned()),
        };
        Some(parsed)
    }

    /// Returns the integer payload, or `None` for any other variant.
    pub fn get_int(&self) -> Option<&i64> {
        if let Type::Int(number) = self {
            Some(number)
        } else {
            None
        }
    }

    /// Returns the floating-point payload, or `None` for any other variant.
    pub fn get_float(&self) -> Option<&f64> {
        if let Type::Float(number) = self {
            Some(number)
        } else {
            None
        }
    }

    /// Returns the text payload, or `None` for any other variant.
    pub fn get_string(&self) -> Option<&String> {
        if let Type::String(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Returns the character payload, or `None` for any other variant.
    pub fn get_char(&self) -> Option<&char> {
        if let Type::Char(symbol) = self {
            Some(symbol)
        } else {
            None
        }
    }
}
/// An optional field of a record, identified by its two-letter tag.
///
/// Every variant wraps the parsed [`Type`] value of that field. `Tag::parse`
/// maps a textual tag onto the variant with the same spelling.
#[derive(Debug)]
// Variant names repeat the tag strings matched in `Tag::parse` (mixed case),
// so the camel-case naming lint is silenced for this enum.
#[allow(non_camel_case_types)]
pub enum Tag {
tp(Type),
cm(Type),
s1(Type),
s2(Type),
NM(Type),
MD(Type),
AS(Type),
SA(Type),
ms(Type),
nn(Type),
ts(Type),
cg(Type),
cs(Type),
dv(Type),
de(Type),
rl(Type),
zd(Type),
}
impl Tag {
pub fn parse(tag: &str, value: Type) -> Result<Self> {
match tag {
"tp" => Ok(Tag::tp(value)),
"cm" => Ok(Tag::cm(value)),
"s1" => Ok(Tag::s1(value)),
"s2" => Ok(Tag::s2(value)),
"NM" => Ok(Tag::NM(value)),
"MD" => Ok(Tag::MD(value)),
"AS" => Ok(Tag::AS(value)),
"SA" => Ok(Tag::SA(value)),
"ms" => Ok(Tag::ms(value)),
"nn" => Ok(Tag::nn(value)),
"ts" => Ok(Tag::ts(value)),
"cg" => Ok(Tag::cg(value)),
"cs" => Ok(Tag::cs(value)),
"dv" => Ok(Tag::dv(value)),
"de" => Ok(Tag::de(value)),
"rl" => Ok(Tag::rl(value)),
"zd" => Ok(Tag::zd(value)),
_ => Err(Error::new(ErrorKind::ReadRecord(format!(
"Invalid PAF tag: {}",
tag
)))),
}
}
fn to_string(&self) -> String {
match self {
Tag::tp(_) => "tp".into(),
Tag::cm(_) => "cm".into(),
Tag::s1(_) => "s1".into(),
Tag::s2(_) => "s2".into(),
Tag::NM(_) => "NM".into(),
Tag::MD(_) => "MD".into(),
Tag::AS(_) => "AS".into(),
Tag::SA(_) => "SA".into(),
Tag::ms(_) => "ms".into(),
Tag::nn(_) => "nn".into(),
Tag::ts(_) => "ts".into(),
Tag::cg(_) => "cg".into(),
Tag::cs(_) => "cs".into(),
Tag::dv(_) => "dv".into(),
Tag::de(_) => "de".into(),
Tag::rl(_) => "rl".into(),
Tag::zd(_) => "zd".into(),
}
}
}
/// One parsed record: twelve mandatory columns plus a map of optional tagged fields.
///
/// `Reader::read_record` fills the mandatory fields from the first twelve
/// tab-separated columns of a line, in the order they are declared here.
#[derive(Debug)]
pub struct PafRecord {
/// Column 1.
query_name: String,
/// Column 2, parsed as `u32`.
query_len: u32,
/// Column 3, parsed as `u32`.
query_start: u32,
/// Column 4, parsed as `u32`.
query_end: u32,
/// Column 5; the reader only accepts `'+'` or `'-'`.
strand: char,
/// Column 6.
target_name: String,
/// Column 7, parsed as `u32`.
target_len: u32,
/// Column 8, parsed as `u32`.
target_start: u32,
/// Column 9, parsed as `u32`.
target_end: u32,
/// Column 10, parsed as `u32`.
residue_matches: u32,
/// Column 11, parsed as `u32`.
alignment_block_len: u32,
/// Column 12, parsed as `u8`.
mapping_quality: u8,
/// Optional fields (columns 13 onward), keyed by tag name.
optional: HashMap<String, Tag>,
}
impl PafRecord {
    /// Builds a record from already-parsed values; no validation is performed.
    ///
    /// `optional` is expected to map each tag name to the [`Tag`] variant of
    /// the same name. Entries that do not follow that convention are simply
    /// not visible through the typed tag accessors.
    // The argument list mirrors the twelve mandatory columns plus the optional
    // map; it is part of the public interface and therefore kept as is.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        query_name: String,
        query_len: u32,
        query_start: u32,
        query_end: u32,
        strand: char,
        target_name: String,
        target_len: u32,
        target_start: u32,
        target_end: u32,
        residue_matches: u32,
        alignment_block_len: u32,
        mapping_quality: u8,
        optional: HashMap<String, Tag>,
    ) -> PafRecord {
        PafRecord {
            query_name,
            query_len,
            query_start,
            query_end,
            strand,
            target_name,
            target_len,
            target_start,
            target_end,
            residue_matches,
            alignment_block_len,
            mapping_quality,
            optional,
        }
    }

    /// Returns the `query_name` field.
    pub fn query_name(&self) -> &str {
        &self.query_name
    }

    /// Returns the `query_len` field.
    pub fn query_len(&self) -> u32 {
        self.query_len
    }

    /// Returns the `query_start` field.
    pub fn query_start(&self) -> u32 {
        self.query_start
    }

    /// Returns the `query_end` field.
    pub fn query_end(&self) -> u32 {
        self.query_end
    }

    /// Returns the `target_name` field.
    pub fn target_name(&self) -> &str {
        &self.target_name
    }

    /// Returns the `target_len` field.
    pub fn target_len(&self) -> u32 {
        self.target_len
    }

    /// Returns the `target_start` field.
    pub fn target_start(&self) -> u32 {
        self.target_start
    }

    /// Returns the `target_end` field.
    pub fn target_end(&self) -> u32 {
        self.target_end
    }

    /// Returns the `residue_matches` field.
    pub fn residue_matches(&self) -> u32 {
        self.residue_matches
    }

    /// Returns the `alignment_block_len` field.
    pub fn alignment_block_len(&self) -> u32 {
        self.alignment_block_len
    }

    /// Returns the `mapping_quality` field.
    pub fn mapping_quality(&self) -> u8 {
        self.mapping_quality
    }

    /// Returns the `strand` field.
    pub fn strand(&self) -> char {
        self.strand
    }

    /// Returns every optional field of the record, keyed by tag name.
    pub fn optional_fields(&self) -> &HashMap<String, Tag> {
        &self.optional
    }

    // Typed tag accessors.
    //
    // Each accessor returns `None` when the tag is absent, when the entry
    // stored under that key is a different `Tag` variant, or when the value
    // was not stored with the expected type (for example `NM:Z:...`). Input
    // data decides all three cases, so none of them may panic.

    /// Character value of the `tp` tag, if present with that type.
    pub fn tp(&self) -> Option<&char> {
        match self.optional.get("tp")? {
            Tag::tp(value) => value.get_char(),
            _ => None,
        }
    }

    /// Integer value of the `cm` tag, if present with that type.
    pub fn cm(&self) -> Option<&i64> {
        match self.optional.get("cm")? {
            Tag::cm(value) => value.get_int(),
            _ => None,
        }
    }

    /// Integer value of the `s1` tag, if present with that type.
    pub fn s1(&self) -> Option<&i64> {
        match self.optional.get("s1")? {
            Tag::s1(value) => value.get_int(),
            _ => None,
        }
    }

    /// Integer value of the `s2` tag, if present with that type.
    pub fn s2(&self) -> Option<&i64> {
        match self.optional.get("s2")? {
            Tag::s2(value) => value.get_int(),
            _ => None,
        }
    }

    /// Integer value of the `NM` tag, if present with that type.
    pub fn nm(&self) -> Option<&i64> {
        match self.optional.get("NM")? {
            Tag::NM(value) => value.get_int(),
            _ => None,
        }
    }

    /// Text value of the `MD` tag, if present with that type.
    pub fn md(&self) -> Option<&String> {
        match self.optional.get("MD")? {
            Tag::MD(value) => value.get_string(),
            _ => None,
        }
    }

    /// Integer value of the `AS` tag, if present with that type.
    ///
    /// Named `as_` because `as` is a reserved word.
    pub fn as_(&self) -> Option<&i64> {
        match self.optional.get("AS")? {
            Tag::AS(value) => value.get_int(),
            _ => None,
        }
    }

    /// Text value of the `SA` tag, if present with that type.
    pub fn sa(&self) -> Option<&String> {
        match self.optional.get("SA")? {
            Tag::SA(value) => value.get_string(),
            _ => None,
        }
    }

    /// Integer value of the `ms` tag, if present with that type.
    pub fn ms(&self) -> Option<&i64> {
        match self.optional.get("ms")? {
            Tag::ms(value) => value.get_int(),
            _ => None,
        }
    }

    /// Integer value of the `nn` tag, if present with that type.
    pub fn nn(&self) -> Option<&i64> {
        match self.optional.get("nn")? {
            Tag::nn(value) => value.get_int(),
            _ => None,
        }
    }

    /// Character value of the `ts` tag, if present with that type.
    pub fn ts(&self) -> Option<&char> {
        match self.optional.get("ts")? {
            Tag::ts(value) => value.get_char(),
            _ => None,
        }
    }

    /// Text value of the `cg` tag, if present with that type.
    pub fn cg(&self) -> Option<&String> {
        match self.optional.get("cg")? {
            Tag::cg(value) => value.get_string(),
            _ => None,
        }
    }

    /// Text value of the `cs` tag, if present with that type.
    pub fn cs(&self) -> Option<&String> {
        match self.optional.get("cs")? {
            Tag::cs(value) => value.get_string(),
            _ => None,
        }
    }

    /// Floating-point value of the `dv` tag, if present with that type.
    pub fn dv(&self) -> Option<&f64> {
        match self.optional.get("dv")? {
            Tag::dv(value) => value.get_float(),
            _ => None,
        }
    }

    /// Floating-point value of the `de` tag, if present with that type.
    pub fn de(&self) -> Option<&f64> {
        match self.optional.get("de")? {
            Tag::de(value) => value.get_float(),
            _ => None,
        }
    }

    /// Integer value of the `rl` tag, if present with that type.
    pub fn rl(&self) -> Option<&i64> {
        match self.optional.get("rl")? {
            Tag::rl(value) => value.get_int(),
            _ => None,
        }
    }

    /// Integer value of the `zd` tag, if present with that type.
    ///
    /// NOTE(review): the integer type is assumed from records that carry
    /// `zd:i:<n>`; a `zd` field stored with another type yields `None`.
    pub fn zd(&self) -> Option<&i64> {
        match self.optional.get("zd")? {
            Tag::zd(value) => value.get_int(),
            _ => None,
        }
    }
}
/// Line-oriented reader that parses records from an underlying source `R`.
pub struct Reader<R> {
/// Buffered wrapper around the underlying source.
reader: io::BufReader<R>,
/// Running line counter quoted in `read_record` error messages; starts at 0.
line: u64,
}
impl Reader<File> {
    /// Opens the file at `path` and wraps it in a reader.
    ///
    /// # Errors
    /// Propagates the failure of `File::open`, converted by `?` into this
    /// crate's error type.
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Reader<File>> {
        let file = File::open(path)?;
        Ok(Self::new(file))
    }

    /// Builds a reader over any `io::Read` source.
    ///
    /// Although declared on `Reader<File>`, the returned reader is generic
    /// over the source type `R`.
    pub fn from_reader<R: io::Read>(rdr: R) -> Reader<R> {
        Reader::<R>::new(rdr)
    }
}
/// Parses the optional `TAG:TYPE:VALUE` columns of a record into a map keyed
/// by tag name.
///
/// Only the first two `:` separate tag, type code and value. Any further `:`
/// belongs to the value, so values that themselves contain colons (for
/// example `cs:Z::6-ata:10+gtc`) are kept whole instead of being cut at the
/// third colon. When a tag occurs more than once, the last occurrence wins.
///
/// # Errors
/// Returns an `ErrorKind::ReadRecord` error when a field has fewer than three
/// `:`-separated parts, when its value cannot be parsed for its type code, or
/// when `Tag::parse` rejects the tag name.
fn parse_optional_fields(fields: &[&str]) -> Result<HashMap<String, Tag>> {
    let mut map = HashMap::with_capacity(fields.len());
    for field in fields {
        // `splitn(3, ..)` rather than `split`: the value is everything after
        // the second colon, including any colons it contains.
        let mut parts = field.splitn(3, ':');
        let (tag, type_code, raw_value) = match (parts.next(), parts.next(), parts.next()) {
            (Some(tag), Some(type_code), Some(raw_value)) => (tag, type_code, raw_value),
            _ => {
                return Err(Error::new(ErrorKind::ReadRecord(
                    "Invalid PAF line: invalid optional field - too few parts".into(),
                )))
            }
        };
        let value = Type::parse(type_code, raw_value).ok_or_else(|| {
            Error::new(ErrorKind::ReadRecord(format!(
                "Invalid PAF line: invalid optional field type: {}",
                type_code
            )))
        })?;
        let tag = Tag::parse(tag, value)?;
        map.insert(tag.to_string(), tag);
    }
    Ok(map)
}
impl<R: io::Read> Reader<R> {
pub fn new(rdr: R) -> Self {
Reader {
reader: io::BufReader::new(rdr),
line: 0,
}
}
pub fn records(&mut self) -> RecordsIter<R> {
RecordsIter::new(self)
}
pub fn into_records(self) -> RecordsIntoIter<R> {
RecordsIntoIter::new(self)
}
pub fn read_record(&mut self) -> Result<Option<PafRecord>> {
let mut line = String::new();
let bytes_read = match self.reader.read_line(&mut line) {
Ok(b) => b,
Err(e) => return Err(Error::new(ErrorKind::Io(e))),
};
if bytes_read == 0 {
return Ok(None); }
let columns: Vec<&str> = line.trim().split('\t').collect();
if columns.len() < 12 {
return Err(Error::new(ErrorKind::ReadRecord(format!(
"Invalid PAF at line {}: less than 12 mandatory fields",
self.line
))));
}
let query_name = columns[0].to_string();
let query_len = columns[1].parse::<u32>()?;
let query_start = columns[2].parse::<u32>()?;
let query_end = columns[3].parse::<u32>()?;
let strand = columns[4]
.chars()
.next()
.ok_or_else(|| Error::new(ErrorKind::ReadRecord("Empty strand field".into())))?;
if strand != '+' && strand != '-' {
return Err(Error::new(ErrorKind::ReadRecord(format!(
"Invalid strand field at line {}: {}",
self.line, strand
))));
}
let target_name = columns[5].to_string();
let target_len = columns[6].parse::<u32>()?;
let target_start = columns[7].parse::<u32>()?;
let target_end = columns[8].parse::<u32>()?;
let residue_matches = columns[9].parse::<u32>()?;
let alignment_block_len = columns[10].parse::<u32>()?;
let mapping_quality = columns[11].parse::<u8>()?;
let optional = parse_optional_fields(&columns[12..])?;
let record = PafRecord {
query_name,
query_len,
query_start,
query_end,
strand,
target_name,
target_len,
target_start,
target_end,
residue_matches,
alignment_block_len,
mapping_quality,
optional,
};
Ok(Some(record))
}
}
pub struct RecordsIter<'r, R: 'r> {
rdr: &'r mut Reader<R>,
}
impl<'r, R: io::Read> RecordsIter<'r, R> {
fn new(rdr: &'r mut Reader<R>) -> RecordsIter<'r, R> {
RecordsIter { rdr }
}
pub fn reader(&self) -> &Reader<R> {
self.rdr
}
pub fn reader_mut(&mut self) -> &mut Reader<R> {
self.rdr
}
}
impl<'r, R: io::Read> Iterator for RecordsIter<'r, R> {
type Item = Result<PafRecord>;
fn next(&mut self) -> Option<Result<PafRecord>> {
match self.rdr.read_record() {
Ok(Some(r)) => {
self.rdr.line += 1;
Some(Ok(r))
}
Ok(None) => None,
Err(e) => Some(Err(e)),
}
}
}
pub struct RecordsIntoIter<R> {
rdr: Reader<R>,
}
impl<R: io::Read> RecordsIntoIter<R> {
fn new(rdr: Reader<R>) -> RecordsIntoIter<R> {
RecordsIntoIter { rdr }
}
pub fn reader(&self) -> &Reader<R> {
&self.rdr
}
pub fn reader_mut(&mut self) -> &mut Reader<R> {
&mut self.rdr
}
pub fn into_reader(self) -> Reader<R> {
self.rdr
}
}
impl<R: io::Read> Iterator for RecordsIntoIter<R> {
type Item = Result<PafRecord>;
fn next(&mut self) -> Option<Result<PafRecord>> {
match self.rdr.read_record() {
Ok(Some(r)) => {
self.rdr.line += 1;
Some(Ok(r))
}
Ok(None) => None,
Err(e) => Some(Err(e)),
}
}
}
#[cfg(test)]
mod tests {
    use super::Reader;

    // One record: twelve mandatory columns followed by optional tags.
    // Columns are joined with explicit `\t` escapes so the fixture stays
    // tab-delimited (the reader splits on '\t') even if an editor or formatter
    // rewrites literal whitespace.
    const PAF_RECORD_1: &[u8] = b"NC_041798.1\t41841605\t28850796\t29394458\t+\tSUPER_10\t44636193\t31974877\t32470190\t495111\t515145\t60\tNM:i:48730\tms:i:488389\tAS:i:439775\tnn:i:28696\ttp:A:P\tcm:i:46495\ts1:i:466570\ts2:i:10896\tde:f:0.0003\tzd:i:3\trl:i:3568165\tcg:Z:770M1D945M1D389M1I9141M1I356M1D196M1I30268M2D789M3I992M2D1819M1D7M1D7M1I10M6D2922M1D17899M2D1010M4D12324M1I1376M1D5549M6D1839M1I2206M1D770M1D2287M1D16103M1D3238M1D2014M1D140M5I14M1D8496M2I2151M1I335M1D14424M1D1093M1I567M1D1835M2D1995M1D5257M1D639M1I699M1I133M1I52M1I99M2I26M1I195M1I1543M1I240M1I176M1I412M2D159M1I261M1D1158M1I933M2D12836M1D993M1D12263M2D4975M2I16452M3I396M1I3924M2D929M3I3015M1D225M1D4225M1D717M2D752M1D2051M1D5110M1D15073M1D1053M2D4369M1D619M3I13564M2I4386M1D1431M2D617M1I612M2I3445M2I252M1D220M1D237M1I903M1I145M1I53M1I197M1I1280M1D4201M1D1736M1D1289M1I3344M2D5456M1D488M1I1655M2D1830M1D796M1I19341M2D1165M1D1926M1D6041M1D2170M1D3917M1D926M1D759M1D400M2I8802M1I836M1I381M48451I166M1I4896M2D1522M49D2729M1D947M2D927M6D911M2D800M2D3040M1D13213M1D8999M3D847M1D220M1I673M1D165M1I901M1I2887M1I105M2I597M1I1201M1I53M2I494M1I23M1D99M1I146M1D29906M1D5661M1I27598M1D520M1I166M2D11600M1D388M1D844M1D4583M1D8390M1D5789M2D3773M1D4494M1D448M1D846M3D531M";

    #[test]
    fn test_read_record() {
        let mut parser = Reader::from_reader(&PAF_RECORD_1[..]);
        let record = parser.read_record().unwrap().unwrap();

        // Mandatory columns.
        assert_eq!(record.query_name(), "NC_041798.1");
        assert_eq!(record.query_len(), 41841605);
        assert_eq!(record.query_start(), 28850796);
        assert_eq!(record.query_end(), 29394458);
        assert_eq!(record.strand(), '+');
        assert_eq!(record.target_name(), "SUPER_10");
        assert_eq!(record.target_len(), 44636193);
        assert_eq!(record.target_start(), 31974877);
        assert_eq!(record.target_end(), 32470190);
        assert_eq!(record.residue_matches(), 495111);
        assert_eq!(record.alignment_block_len(), 515145);
        assert_eq!(record.mapping_quality(), 60);

        // Optional tags present in the fixture.
        let nm = record.nm().unwrap();
        assert_eq!(nm, &48730);
        assert_eq!(record.ms(), Some(&488389));
        assert_eq!(record.as_(), Some(&439775));
        assert_eq!(record.nn(), Some(&28696));
        assert_eq!(record.tp(), Some(&'P'));
        assert_eq!(record.cm(), Some(&46495));
        assert_eq!(record.s1(), Some(&466570));
        assert_eq!(record.s2(), Some(&10896));
        assert_eq!(record.de(), Some(&0.0003));
        assert_eq!(record.rl(), Some(&3568165));
        let cigar = record.cg().unwrap();
        assert!(cigar.starts_with("770M1D945M"));
        assert!(cigar.ends_with("846M3D531M"));

        // Tags absent from the fixture are reported as `None`.
        assert!(record.md().is_none());
        assert!(record.cs().is_none());

        // The fixture holds a single line, so the next read hits end of input.
        assert!(parser.read_record().unwrap().is_none());
    }
}