use crate::types::*;
use atoi::atoi;
use chrono::{
format::{strftime::StrftimeItems, Parsed},
NaiveDate,
};
use nom::{
bytes::complete::{is_not, tag, take, take_while},
character::complete::{anychar, char, line_ending, multispace1, not_line_ending},
combinator::{map, map_res, peek},
IResult,
};
/// Parser for a value that can be read from raw bytes without any external
/// lookup tables.
///
/// Implementors provide `parse`; the `parse_into*` helpers run it and store
/// the result in a caller-supplied destination, handing back the unconsumed
/// input.
pub trait FieldParser {
    /// Value produced by a successful parse.
    type Output;

    /// Parses `inp`, yielding the remaining input together with the parsed value.
    fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output>;

    /// Parses `inp` and overwrites `*dst` with the parsed value.
    ///
    /// Returns the input left over after the parse.
    ///
    /// # Panics
    ///
    /// Panics with the message `parse error` if `parse` fails.
    fn parse_into<'a, 'b>(inp: &'a [u8], dst: &'b mut Self::Output) -> &'a [u8] {
        let (rest, value) = Self::parse(inp).expect("parse error");
        *dst = value;
        rest
    }

    /// Parses `inp` and appends the parsed value to `dst`.
    ///
    /// Returns the input left over after the parse.
    ///
    /// # Panics
    ///
    /// Panics with the message `parse error` if `parse` fails.
    fn parse_into_vec<'a>(inp: &'a [u8], dst: &mut Vec<Self::Output>) -> &'a [u8] {
        let (rest, value) = Self::parse(inp).expect("parse error");
        dst.push(value);
        rest
    }

    /// Parses `inp` and stores the parsed value in `dst` as `Some(..)`,
    /// replacing whatever was there before.
    ///
    /// Returns the input left over after the parse.
    ///
    /// # Panics
    ///
    /// Panics with the message `parse error` if `parse` fails.
    fn parse_into_option<'a>(inp: &'a [u8], dst: &mut Option<Self::Output>) -> &'a [u8] {
        let (rest, value) = Self::parse(inp).expect("parse error");
        *dst = Some(value);
        rest
    }
}
/// Parser for a value whose interpretation needs the modified amino-acid and
/// modified nucleotide tables.
///
/// Implementors provide `parse`; the `parse_into*` helpers run it and store
/// the result in a caller-supplied destination, handing back the unconsumed
/// input.
pub trait FieldParserWithModifiedTable {
    /// Value produced by a successful parse.
    type Output;

    /// Parses `inp` with access to both lookup tables, yielding the remaining
    /// input together with the parsed value.
    fn parse<'a>(
        inp: &'a [u8],
        modified_aa: &ModifiedAminoAcidTable,
        modified_nuc: &ModifiedNucleotideTable,
    ) -> IResult<&'a [u8], Self::Output>;

    /// Parses `inp` and overwrites `*dst` with the parsed value.
    ///
    /// Returns the input left over after the parse.
    ///
    /// # Panics
    ///
    /// Panics with the message `parse error` if `parse` fails.
    fn parse_into<'a, 'b>(
        inp: &'a [u8],
        dst: &'b mut Self::Output,
        modified_aa: &ModifiedAminoAcidTable,
        modified_nuc: &ModifiedNucleotideTable,
    ) -> &'a [u8] {
        let outcome = Self::parse(inp, modified_aa, modified_nuc);
        let (rest, value) = outcome.expect("parse error");
        *dst = value;
        rest
    }

    /// Parses `inp` and appends the parsed value to `dst`.
    ///
    /// Returns the input left over after the parse.
    ///
    /// # Panics
    ///
    /// Panics with the message `parse error` if `parse` fails.
    fn parse_into_vec<'a>(
        inp: &'a [u8],
        dst: &mut Vec<Self::Output>,
        modified_aa: &ModifiedAminoAcidTable,
        modified_nuc: &ModifiedNucleotideTable,
    ) -> &'a [u8] {
        let outcome = Self::parse(inp, modified_aa, modified_nuc);
        let (rest, value) = outcome.expect("parse error");
        dst.push(value);
        rest
    }
}
/// Skips the remainder of the current line, including its line ending.
///
/// Fails if the bytes up to the end of the input are not followed by a line
/// ending.
pub(crate) fn jump_newline(inp: &[u8]) -> IResult<&[u8], ()> {
    // First discard everything up to (but excluding) the line terminator...
    let (at_eol, _) = not_line_ending(inp)?;
    // ...then consume the terminator itself and drop its matched bytes.
    line_ending(at_eol).map(|(rest, _)| (rest, ()))
}
/// Returns `true` only for the ASCII space byte (`0x20`); tabs, newlines and
/// every other byte yield `false`.
fn char_is_space(c: u8) -> bool {
    matches!(c, b' ')
}
pub(crate) fn parse_date(i: &[u8]) -> IResult<&[u8], NaiveDate> {
let (i, day) = take(2usize)(i)?;
let (i, _) = take(1usize)(i)?;
let (i, month) = parse_month(i)?;
let (i, _) = take(1usize)(i)?;
let (i, year) = take(2usize)(i)?;
let mut year = atoi::<i32>(year).unwrap();
if year < 50i32 {
year += 2000
} else {
year += 1900
}
Ok((
i,
NaiveDate::from_ymd(year, month, atoi::<u32>(day).unwrap()),
))
}
/// Parses a three-byte abbreviated month name using chrono's `%b` format item
/// and returns the month number stored by chrono.
///
/// # Errors
///
/// Returns a nom error (produced by `map_res`) when fewer than three bytes
/// are available, when the bytes are not valid UTF-8, when chrono does not
/// recognise them as a month abbreviation, or when chrono reports no month
/// after parsing. None of these cases panic.
fn parse_month(i: &[u8]) -> IResult<&[u8], u32> {
    map_res(take(3usize), |s: &[u8]| -> Result<u32, ()> {
        // Validate the bytes rather than assuming UTF-8: the input comes
        // straight from the file being parsed.
        let s = std::str::from_utf8(s).map_err(|_| ())?;
        let mut parsed = Parsed::new();
        chrono::format::parse(&mut parsed, s, StrftimeItems::new("%b")).map_err(|_| ())?;
        parsed.month.ok_or(())
    })(i)
}
pub(crate) fn parse_right<'a, T>(i: &'a [u8], length: usize) -> IResult<&'a [u8], T>
where
T: std::str::FromStr,
{
let (i, s) = take_while(char_is_space)(i)?;
let l = s.len();
if l >= length {
panic!("Failed to parse int")
}
let (i, digit) = take(length - l)(i)?;
let digit = unsafe { std::str::from_utf8_unchecked(digit) };
match digit.parse() {
Err(_) => Err(nom::Err::Error((i, nom::error::ErrorKind::Digit))),
Ok(x) => Ok((i, x)),
}
}
pub(crate) fn parse_multiline_list(inp: &[u8]) -> IResult<&[u8], Vec<String>> {
let (mut inp, _) = take(4usize)(inp)?; let mut v: Vec<String> = Vec::new();
loop {
let (i, item) = take_while(|c| c != b',' && c != b'\n' && c != b'\r')(inp)?;
let item = unsafe { std::str::from_utf8_unchecked(item) };
v.push(item.trim().to_owned());
let (i, comma_or_newline) = anychar(i)?; if comma_or_newline == ',' {
let (i, char_after_comma) = peek(anychar)(i)?;
if char_after_comma == ' ' {
let (i, second_char_after_comma) = peek(anychar)(anychar(i)?.0)?;
if !second_char_after_comma.is_alphanumeric() {
inp = multispace1(i)?.0;
inp = take(10usize)(inp)?.0; } else {
inp = i;
}
} else {
inp = i;
}
} else {
inp = take_while(|x| x == b'\n')(i)?.0;
return Ok((inp, v));
}
}
}
pub(crate) fn parse_multiline_string<'a>(
inp: &'a [u8],
record_identifier: &[u8],
) -> IResult<&'a [u8], String> {
let (mut inp, _) = take(4usize)(inp)?; let mut s = String::new();
loop {
let (i, item) = not_line_ending(inp)?;
let item = std::str::from_utf8(item).unwrap();
s.push_str(item.trim_end());
let (i, _) = line_ending(i)?;
if peek(take(6usize))(i)?.1 != record_identifier {
return Ok((i, s));
}
let (i, _) = take(10usize)(i)?;
inp = i;
}
}
/// Repeatedly applies `parse_oneline` to consecutive lines of the same record
/// and collects the parsed items.
///
/// The first four bytes of `inp` are skipped before the first item. After
/// each item, the next six bytes are compared with `record_identifier`: on a
/// match, a prefix of ten bytes (`continuation == true`) or six bytes
/// (`continuation == false`) is skipped and another item is parsed; otherwise
/// the collected items are returned together with the remaining input.
///
/// Fewer than six remaining bytes after an item simply means there are no
/// further lines of this record; it is not an error.
///
/// NOTE(review): the initial skip is four bytes regardless of `continuation`,
/// while later lines skip `offset` bytes — confirm that callers pass input
/// positioned accordingly.
///
/// # Errors
///
/// Propagates any error from `parse_oneline`, and returns a nom error when
/// the input is too short for a prefix that has to be skipped.
pub(crate) fn parse_multiline<'a, T, F>(
    inp: &'a [u8],
    record_identifier: &[u8],
    continuation: bool,
    parse_oneline: F,
) -> IResult<&'a [u8], Vec<T>>
where
    F: Fn(&'a [u8]) -> IResult<&'a [u8], T>,
{
    let offset = if continuation { 10usize } else { 6usize };
    let (mut inp, _) = take(4usize)(inp)?;
    let mut res = Vec::<T>::new();
    loop {
        let (i, item) = parse_oneline(inp)?;
        res.push(item);

        // Non-failing lookahead: `get` yields `None` near the end of input,
        // so the items parsed so far are returned instead of being discarded
        // by an end-of-input error.
        if i.get(..6) != Some(record_identifier) {
            return Ok((i, res));
        }
        let (i, _) = take(offset)(i)?;
        inp = i;
    }
}
use std::collections::HashMap;
pub(crate) fn parse_residue<'a, 'b>(
inp: &'a [u8],
modified_aa: &'b ModifiedAminoAcidTable,
modified_nuc: &'b ModifiedNucleotideTable,
) -> IResult<&'a [u8], Residue> {
let (inp, residue) = take(3usize)(inp)?;
let residue_s = unsafe { std::str::from_utf8_unchecked(residue).to_owned() };
let residue = if let Some(res) = StandardAminoAcid::try_parse_fw3(&residue) {
Residue::AminoAcid(AminoAcid::Standard(res))
} else if let Some(_res) = modified_aa.get(&residue_s) {
Residue::AminoAcid(AminoAcid::Modified(residue_s))
} else if let Some(res) = StandardNucleotide::try_parse_fw3(&residue) {
Residue::Nucleotide(Nucleotide::Standard(res))
} else if let Some(_res) = modified_nuc.get(&residue_s) {
Residue::Nucleotide(Nucleotide::Modified(residue_s))
} else {
match residue {
b"HOH" => Residue::Water,
b"UNX" => Residue::UnknownAtomOrIon,
b"UNL" => Residue::UnknownLigand,
_ => Residue::Other(residue_s),
}
};
Ok((inp, residue))
}
/// Takes the next `n` bytes of `inp` and returns them as an owned `String`
/// with leading and trailing whitespace removed.
///
/// Fails with a nom error if fewer than `n` bytes are available.
///
/// # Safety
///
/// The `n` bytes taken are converted with `from_utf8_unchecked`; the caller
/// must guarantee that they are valid UTF-8.
pub(crate) unsafe fn take_trim_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
    let (rest, raw) = take(n)(inp)?;
    let text = std::str::from_utf8_unchecked(raw);
    Ok((rest, String::from(text.trim())))
}
/// Takes the next `n` bytes of `inp` and returns them as an owned `String`
/// with leading whitespace removed; trailing whitespace is kept.
///
/// Fails with a nom error if fewer than `n` bytes are available.
///
/// # Safety
///
/// The `n` bytes taken are converted with `from_utf8_unchecked`; the caller
/// must guarantee that they are valid UTF-8.
pub(crate) unsafe fn take_trim_start_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
    let (rest, raw) = take(n)(inp)?;
    let text = std::str::from_utf8_unchecked(raw);
    Ok((rest, String::from(text.trim_start())))
}