#![allow(unused)]
use std::{
collections::HashMap,
fs::File,
io::{self, ErrorKind, Read, Seek, SeekFrom},
path::Path,
};
#[cfg(feature = "encode")]
use bincode::{Decode, Encode};
use na_seq::{Seq, seq_from_str};
/// Number of bytes read for the header that follows the 4-byte `ABIF` marker;
/// this is the buffer size `Header::from_bytes` expects.
const HEADER_SIZE: usize = 26;
/// Number of bytes read for each directory entry; this is the buffer size
/// `Dir::from_bytes` expects.
const DIR_SIZE: usize = 28;
/// One record decoded from an AB1 file.
///
/// The reader in this file fills a field when it meets the tag named in that
/// field's comment. Fields without a tag note are not assigned by the code
/// visible in this file and keep their `Default` value.
#[cfg_attr(feature = "encode", derive(Encode, Decode))]
#[derive(Clone, Debug, Default)]
pub struct SeqRecordAb1 {
/// Text of the `SMPL1` tag.
pub id: String,
/// Not assigned by the reader in this file.
pub name: String,
/// Not assigned by the reader in this file.
pub description: String,
/// Sequence built from the text of the `PBAS2` tag.
pub sequence: Seq,
/// Sequence built from the text of the `PBAS1` tag, when that tag is present.
pub sequence_user: Option<Seq>,
/// Not assigned by the reader in this file.
pub annotations: HashMap<String, String>,
/// Bytes of the `PCON2` tag text, when that tag is present.
pub quality: Option<Vec<u8>>,
/// Bytes of the `PCON1` tag text, when that tag is present.
pub quality_user: Option<Vec<u8>>,
/// Not assigned by the reader in this file.
pub peak_heights: Vec<u16>,
/// Values of the `DATA9` tag.
pub data_ch1: Vec<u16>,
/// Values of the `DATA10` tag.
pub data_ch2: Vec<u16>,
/// Values of the `DATA11` tag.
pub data_ch3: Vec<u16>,
/// Values of the `DATA12` tag.
pub data_ch4: Vec<u16>,
/// Values of the `PLOC2` tag.
pub peak_locations: Vec<u16>,
/// Values of the `PLOC1` tag, when that tag is present.
pub peak_locations_user: Option<Vec<u16>>,
}
#[derive(Debug)]
struct Header {
pub file_version: u16,
pub tag_name: Seq, pub tag_number: u32,
pub element_type_code: u16,
pub element_size: u16,
pub num_elements: usize,
pub data_size: u32,
pub data_offset: u32,
}
impl Header {
pub fn from_bytes(bytes: [u8; HEADER_SIZE]) -> io::Result<Self> {
let seq_str = std::str::from_utf8(&bytes[2..6]).unwrap().to_owned(); Ok(Self {
file_version: u16::from_be_bytes(bytes[0..2].try_into().unwrap()),
tag_name: seq_from_str(&seq_str),
tag_number: u32::from_be_bytes(bytes[6..10].try_into().unwrap()),
element_type_code: u16::from_be_bytes(bytes[10..12].try_into().unwrap()),
element_size: u16::from_be_bytes(bytes[12..14].try_into().unwrap()),
num_elements: u32::from_be_bytes(bytes[14..18].try_into().unwrap()) as usize,
data_size: u32::from_be_bytes(bytes[18..22].try_into().unwrap()),
data_offset: u32::from_be_bytes(bytes[22..26].try_into().unwrap()),
})
}
}
/// One decoded entry of the AB1 tag directory.
///
/// All multi-byte integers are stored big-endian. Bytes 10..12 and 24..28 of
/// the raw entry are not decoded.
#[derive(Debug)]
struct Dir {
    /// Bytes 0..4 as text.
    pub tag_name: String,
    /// Bytes 4..8.
    pub tag_number: u32,
    /// Bytes 8..10; selects how the tag's data is decoded.
    pub elem_code: u16,
    /// Bytes 12..16.
    pub num_elements: usize,
    /// Bytes 16..20; number of data bytes belonging to the tag.
    pub data_size: usize,
    /// Bytes 20..24; position of the tag's data in the stream.
    pub data_offset: usize,
    /// Position of this entry in the stream, as supplied by the caller.
    pub tag_offset: usize,
}

impl Dir {
    /// Decodes a directory entry from its raw bytes.
    ///
    /// `tag_offset` is stored unchanged so the caller can later locate data
    /// that lives inside the entry itself.
    ///
    /// # Errors
    ///
    /// Returns `ErrorKind::InvalidData` when the four tag-name bytes are not
    /// valid UTF-8. The bytes come straight from the file, so a corrupt entry
    /// must surface as an error rather than a panic.
    pub fn from_bytes(bytes: [u8; DIR_SIZE], tag_offset: usize) -> io::Result<Self> {
        let tag_name = std::str::from_utf8(&bytes[..4])
            .map_err(|err| {
                io::Error::new(
                    ErrorKind::InvalidData,
                    format!("Invalid AB1 directory tag name: {err}"),
                )
            })?
            .to_owned();

        // The offsets below are constants inside a fixed-size array, so the
        // big-endian decoding itself cannot fail.
        let be_u16 = |at: usize| u16::from_be_bytes([bytes[at], bytes[at + 1]]);
        let be_u32 = |at: usize| {
            u32::from_be_bytes([bytes[at], bytes[at + 1], bytes[at + 2], bytes[at + 3]])
        };

        Ok(Self {
            tag_name,
            tag_number: be_u32(4),
            elem_code: be_u16(8),
            num_elements: be_u32(12) as usize,
            data_size: be_u32(16) as usize,
            data_offset: be_u32(20) as usize,
            tag_offset,
        })
    }
}
#[derive(Debug)]
struct AbiIterator<R: Read + Seek> {
stream: R,
}
impl<R: Read + Seek> AbiIterator<R> {
pub fn new(mut stream: R) -> io::Result<Self> {
let mut marker = [0; 4];
stream.read_exact(&mut marker)?;
if &marker != b"ABIF" {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid AB1 file start marker",
));
}
Ok(Self { stream })
}
pub fn next(&mut self) -> io::Result<Option<SeqRecordAb1>> {
let mut result = SeqRecordAb1::default();
let mut header_data = [0; HEADER_SIZE];
if self.stream.read(&mut header_data)? == 0 {
return Ok(None); }
let header = Header::from_bytes(header_data)?;
for i in 0..header.num_elements {
let start = header.data_offset as usize + i * header.element_size as usize;
self.stream.seek(SeekFrom::Start(start as u64))?;
let mut dir_buf = [0; DIR_SIZE];
if self.stream.read(&mut dir_buf)? == 0 {
return Ok(None); };
let mut dir = Dir::from_bytes(dir_buf, start)?;
let key = format!("{}{}", dir.tag_name, dir.tag_number);
if dir.data_size <= 4 {
dir.data_offset = dir.tag_offset + 20;
}
self.stream.seek(SeekFrom::Start(dir.data_offset as u64))?;
let mut tag_buf = vec![0; dir.data_size];
if self.stream.read(&mut tag_buf)? == 0 {
return Ok(None); };
let tag_data = parse_tag_data(dir.elem_code, dir.num_elements, &tag_buf)?;
match key.as_str() {
"PBAS1" => match tag_data {
TagData::Str(s) => {
result.sequence_user = Some(seq_from_str(&s));
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid PBAS sequence",
));
}
},
"PBAS2" => match tag_data {
TagData::Str(s) => {
result.sequence = seq_from_str(&s);
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid PBAS sequence",
));
}
},
"PCON1" => {
match tag_data {
TagData::Str(s) => {
result.quality_user = Some(s.as_bytes().to_vec());
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid quality data",
));
}
}
}
"PCON2" => {
match tag_data {
TagData::Str(s) => {
result.quality = Some(s.as_bytes().to_vec());
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid quality data",
));
}
}
}
"SMPL1" => match tag_data {
TagData::Str(s) => result.id = s,
_ => return Err(io::Error::new(ErrorKind::InvalidData, "Invalid sample ID")),
},
"PLOC1" => match tag_data {
TagData::U16(d) => {
result.peak_locations_user = Some(d);
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid peak location data",
));
}
},
"PLOC2" => match tag_data {
TagData::U16(d) => {
result.peak_locations = d;
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid peak location data",
));
}
},
"DATA9" => match tag_data {
TagData::U16(d) => {
result.data_ch1 = d;
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid height data",
));
}
},
"DATA10" => match tag_data {
TagData::U16(d) => {
result.data_ch2 = d;
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid height data",
));
}
},
"DATA11" => match tag_data {
TagData::U16(d) => {
result.data_ch3 = d;
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid height data",
));
}
},
"DATA12" => match tag_data {
TagData::U16(d) => {
result.data_ch4 = d;
}
_ => {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Invalid height data",
));
}
},
_ => {
eprintln!("Invalid key in AB1 file: {key:?}");
eprintln!("Tag data for this key: {tag_data:?}");
}
}
}
Ok(Some(result))
}
}
/// Splits a raw tag identifier into its name and number.
///
/// The first four bytes are the name (decoded lossily as UTF-8) and the next
/// four are a big-endian `u32`, returned in decimal text form. Bytes beyond
/// the first eight are ignored.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidData` when `data` holds fewer than eight bytes.
fn parse_abi_tag(data: &[u8]) -> io::Result<(String, String)> {
    // Checked slicing: a short buffer is reported as an error, not a panic.
    let head = data.get(..8).ok_or_else(|| {
        io::Error::new(
            ErrorKind::InvalidData,
            "AB1 tag identifier needs at least 8 bytes",
        )
    })?;
    let (name, number) = head.split_at(4);

    let tag_name = String::from_utf8_lossy(name).into_owned();
    let tag_number = u32::from_be_bytes([number[0], number[1], number[2], number[3]]);

    Ok((tag_name, tag_number.to_string()))
}
/// Decoded payload of one AB1 tag.
#[derive(Debug)]
enum TagData {
    /// Raw bytes (element type 1).
    U8(Vec<u8>),
    /// Big-endian 16-bit values (element type 4).
    U16(Vec<u16>),
    /// Big-endian 32-bit values (element type 5).
    U32(Vec<u32>),
    /// Text (element types 2, 18 and 19).
    Str(String),
}

/// Decodes the raw bytes of a tag according to its ABIF element type code.
///
/// Supported codes:
///
/// * `1`  – bytes, returned unchanged.
/// * `2`  – characters.
/// * `4`  – big-endian 16-bit values; a trailing odd byte is ignored.
/// * `5`  – big-endian 32-bit values; trailing bytes short of four are ignored.
/// * `18` – length-prefixed string: the first byte is the character count.
/// * `19` – NUL-terminated string: one trailing zero byte is dropped.
///
/// Text that is not valid UTF-8 is stored as an empty string.
/// `_elem_num` is accepted for interface compatibility and not used.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidData` for any other element type code.
fn parse_tag_data(elem_code: u16, _elem_num: usize, data: &[u8]) -> io::Result<TagData> {
    let text = |bytes: &[u8]| TagData::Str(std::str::from_utf8(bytes).unwrap_or("").to_string());

    match elem_code {
        1 => Ok(TagData::U8(data.to_vec())),
        2 => Ok(text(data)),
        4 => {
            let as_u16 = data
                .chunks_exact(2)
                .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
                .collect();
            Ok(TagData::U16(as_u16))
        }
        5 => {
            let as_u32 = data
                .chunks_exact(4)
                .map(|chunk| u32::from_be_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
                .collect();
            Ok(TagData::U32(as_u32))
        }
        18 => {
            // Honour the declared length, but never read past the bytes that
            // are actually present.
            let body = match data.split_first() {
                Some((&declared, rest)) => rest.get(..usize::from(declared)).unwrap_or(rest),
                None => data,
            };
            Ok(text(body))
        }
        19 => {
            let body = match data.split_last() {
                Some((&0, rest)) => rest,
                _ => data,
            };
            Ok(text(body))
        }
        _ => Err(io::Error::new(
            ErrorKind::InvalidData,
            format!("Invalid data type in AB1 file: {elem_code}"),
        )),
    }
}
/// Reads exactly `length` bytes from `reader` and returns them as text.
///
/// Invalid UTF-8 is replaced lossily, and any NUL characters at the end of
/// the decoded text are removed. Fails with the reader's error when fewer
/// than `length` bytes are available.
fn read_string<R: Read>(reader: &mut R, length: usize) -> io::Result<String> {
    let mut raw = vec![0u8; length];
    reader.read_exact(&mut raw)?;

    let decoded = String::from_utf8_lossy(&raw);
    let without_padding = decoded.trim_end_matches('\0');
    Ok(without_padding.to_owned())
}
/// Opens the AB1 file at `path` and collects every record its reader yields.
///
/// # Errors
///
/// Returns the error from opening the file, from validating the start
/// marker, or the first error produced while decoding a record.
pub fn import_ab1(path: &Path) -> io::Result<Vec<SeqRecordAb1>> {
    let mut reader = AbiIterator::new(File::open(path)?)?;
    // `transpose` turns `Ok(None)` into the end of iteration and lets the
    // first `Err` short-circuit the collection.
    std::iter::from_fn(|| reader.next().transpose()).collect()
}