nom_pdb/common/parser.rs
1// Copyright (c) 2020 Tianyi Shi
2//
3// This software is released under the MIT License.
4// https://opensource.org/licenses/MIT
5
6use crate::types::*;
7use atoi::atoi;
8use chrono::{
9 format::{strftime::StrftimeItems, Parsed},
10 NaiveDate,
11};
12use nom::{
13 bytes::complete::{is_not, tag, take, take_while},
14 character::complete::{anychar, char, line_ending, multispace1, not_line_ending},
15 combinator::{map, map_res, peek},
16 IResult,
17};
18
19pub trait FieldParser {
20 type Output;
21 fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output>;
22 fn parse_into<'a, 'b>(inp: &'a [u8], dst: &'b mut Self::Output) -> &'a [u8] {
23 let (i, data) = Self::parse(inp).expect("parse error");
24 *dst = data;
25 i
26 }
27 fn parse_into_vec<'a>(inp: &'a [u8], dst: &mut Vec<Self::Output>) -> &'a [u8] {
28 let (i, data) = Self::parse(inp).expect("parse error");
29 dst.push(data);
30 i
31 }
32 fn parse_into_option<'a>(inp: &'a [u8], dst: &mut Option<Self::Output>) -> &'a [u8] {
33 let (i, data) = Self::parse(inp).expect("parse error");
34 *dst = Some(data);
35 i
36 }
37}
38
39pub trait FieldParserWithModifiedTable {
40 type Output;
41 fn parse<'a>(
42 inp: &'a [u8],
43 modified_aa: &ModifiedAminoAcidTable,
44 modified_nuc: &ModifiedNucleotideTable,
45 ) -> IResult<&'a [u8], Self::Output>;
46 fn parse_into<'a, 'b>(
47 inp: &'a [u8],
48 dst: &'b mut Self::Output,
49 modified_aa: &ModifiedAminoAcidTable,
50 modified_nuc: &ModifiedNucleotideTable,
51 ) -> &'a [u8] {
52 let (i, data) = Self::parse(inp, modified_aa, modified_nuc).expect("parse error");
53 *dst = data;
54 i
55 }
56 fn parse_into_vec<'a>(
57 inp: &'a [u8],
58 dst: &mut Vec<Self::Output>,
59 modified_aa: &ModifiedAminoAcidTable,
60 modified_nuc: &ModifiedNucleotideTable,
61 ) -> &'a [u8] {
62 let (i, data) = Self::parse(inp, modified_aa, modified_nuc).expect("parse error");
63 dst.push(data);
64 i
65 }
66}
67
68// fn ws<'a, F: 'a, O, E: ParseError<&'a [u8]>>(inner: F) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O, E>
69// where
70// F: Fn(&'a [u8]) -> IResult<&'a [u8], O, E>,
71// {
72// preceded(multispace0, &inner)(i)
73// }
74
75pub(crate) fn jump_newline(inp: &[u8]) -> IResult<&[u8], ()> {
76 let (inp, _) = not_line_ending(inp)?;
77 let (inp, _) = line_ending(inp)?;
78 Ok((inp, ()))
79}
80
/// Returns `true` only for the ASCII space byte (`0x20`); tabs and line
/// terminators do not count.
fn char_is_space(c: u8) -> bool {
    matches!(c, b' ')
}
84
85pub(crate) fn parse_date(i: &[u8]) -> IResult<&[u8], NaiveDate> {
86 let (i, day) = take(2usize)(i)?;
87 let (i, _) = take(1usize)(i)?;
88 let (i, month) = parse_month(i)?;
89 let (i, _) = take(1usize)(i)?;
90 let (i, year) = take(2usize)(i)?;
91 let mut year = atoi::<i32>(year).unwrap();
92 if year < 50i32 {
93 year += 2000
94 } else {
95 year += 1900
96 }
97 Ok((
98 i,
99 NaiveDate::from_ymd(year, month, atoi::<u32>(day).unwrap()),
100 ))
101}
102
103fn parse_month(i: &[u8]) -> IResult<&[u8], u32> {
104 map_res(take(3usize), |s: &[u8]| -> Result<u32, ()> {
105 let s = unsafe { std::str::from_utf8_unchecked(s) };
106 let mut parsed = Parsed::new();
107 chrono::format::parse(&mut parsed, s, StrftimeItems::new("%b"))
108 .expect("Failed to parse month");
109 Ok(parsed.month.unwrap())
110 })(i)
111}
112
113pub(crate) fn parse_right<'a, T>(i: &'a [u8], length: usize) -> IResult<&'a [u8], T>
114where
115 T: std::str::FromStr,
116{
117 let (i, s) = take_while(char_is_space)(i)?;
118 let l = s.len();
119 if l >= length {
120 panic!("Failed to parse int")
121 }
122 let (i, digit) = take(length - l)(i)?;
123 let digit = unsafe { std::str::from_utf8_unchecked(digit) };
124 match digit.parse() {
125 Err(_) => Err(nom::Err::Error((i, nom::error::ErrorKind::Digit))),
126 Ok(x) => Ok((i, x)),
127 }
128}
129
130// * MULTILINE PARSERS ---------------------------------------------------------
131
132pub(crate) fn parse_multiline_list(inp: &[u8]) -> IResult<&[u8], Vec<String>> {
133 // ! need improvement
134 let (mut inp, _) = take(4usize)(inp)?; // 7 - 10
135 let mut v: Vec<String> = Vec::new();
136 loop {
137 let (i, item) = take_while(|c| c != b',' && c != b'\n' && c != b'\r')(inp)?;
138 let item = unsafe { std::str::from_utf8_unchecked(item) };
139 v.push(item.trim().to_owned());
140 let (i, comma_or_newline) = anychar(i)?; // consume \r or \n if newline
141 if comma_or_newline == ',' {
142 let (i, char_after_comma) = peek(anychar)(i)?;
143 if char_after_comma == ' ' {
144 let (i, second_char_after_comma) = peek(anychar)(anychar(i)?.0)?;
145 if !second_char_after_comma.is_alphanumeric() {
146 // newline
147 inp = multispace1(i)?.0;
148 inp = take(10usize)(inp)?.0; // 1 - 10
149 } else {
150 inp = i;
151 }
152 } else {
153 inp = i;
154 }
155 } else {
156 // end
157 inp = take_while(|x| x == b'\n')(i)?.0;
158 return Ok((inp, v));
159 }
160 }
161}
162
163pub(crate) fn parse_multiline_string<'a>(
164 inp: &'a [u8],
165 record_identifier: &[u8],
166) -> IResult<&'a [u8], String> {
167 // ! need improvement
168 let (mut inp, _) = take(4usize)(inp)?; // 7 - 10
169 let mut s = String::new();
170 loop {
171 let (i, item) = not_line_ending(inp)?;
172 let item = std::str::from_utf8(item).unwrap();
173 s.push_str(item.trim_end());
174 let (i, _) = line_ending(i)?;
175 if peek(take(6usize))(i)?.1 != record_identifier {
176 return Ok((i, s));
177 }
178 let (i, _) = take(10usize)(i)?;
179 inp = i;
180 }
181}
182
183pub(crate) fn parse_multiline<'a, T, F>(
184 inp: &'a [u8],
185 record_identifier: &[u8],
186 continuation: bool,
187 parse_oneline: F,
188) -> IResult<&'a [u8], Vec<T>>
189where
190 F: Fn(&'a [u8]) -> IResult<&'a [u8], T>,
191{
192 // ! need improvement
193 let offset = if continuation { 10usize } else { 6usize };
194 let (mut inp, _) = take(4usize)(inp)?; // 7 - 10
195 let mut res = Vec::<T>::new();
196 loop {
197 let (i, item) = parse_oneline(inp)?;
198 res.push(item);
199 if peek(take(6usize))(i)?.1 != record_identifier {
200 return Ok((i, res));
201 }
202 let (i, _) = take(offset)(i)?;
203 inp = i;
204 }
205}
206
207use std::collections::HashMap;
208
209pub(crate) fn parse_residue<'a, 'b>(
210 inp: &'a [u8],
211 modified_aa: &'b ModifiedAminoAcidTable,
212 modified_nuc: &'b ModifiedNucleotideTable,
213) -> IResult<&'a [u8], Residue> {
214 let (inp, residue) = take(3usize)(inp)?;
215 let residue_s = unsafe { std::str::from_utf8_unchecked(residue).to_owned() };
216 let residue = if let Some(res) = StandardAminoAcid::try_parse_fw3(&residue) {
217 Residue::AminoAcid(AminoAcid::Standard(res))
218 } else if let Some(_res) = modified_aa.get(&residue_s) {
219 Residue::AminoAcid(AminoAcid::Modified(residue_s))
220 } else if let Some(res) = StandardNucleotide::try_parse_fw3(&residue) {
221 Residue::Nucleotide(Nucleotide::Standard(res))
222 } else if let Some(_res) = modified_nuc.get(&residue_s) {
223 Residue::Nucleotide(Nucleotide::Modified(residue_s))
224 } else {
225 match residue {
226 b"HOH" => Residue::Water,
227 b"UNX" => Residue::UnknownAtomOrIon,
228 b"UNL" => Residue::UnknownLigand,
229 _ => Residue::Other(residue_s),
230 }
231 };
232 Ok((inp, residue))
233}
234
235pub(crate) unsafe fn take_trim_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
236 let (inp, x) = take(n)(inp)?;
237 Ok((inp, std::str::from_utf8_unchecked(x).trim().to_owned()))
238}
239
240pub(crate) unsafe fn take_trim_start_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
241 let (inp, x) = take(n)(inp)?;
242 Ok((
243 inp,
244 std::str::from_utf8_unchecked(x).trim_start().to_owned(),
245 ))
246}
247
248// pub(crate) unsafe fn take_trim_end_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
249// let (inp, x) = take(n)(inp)?;
250// Ok((inp, std::str::from_utf8_unchecked(x).trim_end().to_owned()))
251// }
252
253// pub(crate) fn parse_specification(inp: &[u8]) -> IResult<&[u8], Token> {
254// let (mut inp, _) = take(4usize)(inp)?;
255// let (inp, token) = is_not(":")(inp)?;
256// }
257
258// Represents keys of CMPND and SOURCE records
259// #[derive(Debug, PartialEq, Clone)]
260// pub enum Token {
261// MoleculeId(u32),
262// Molecule(String),
263// Chain { identifiers: Vec<String> },
264// Fragment(String),
265// Synonym { synonyms: Vec<String> },
266// Ec { commission_numbers: Vec<String> },
267// Engineered(bool),
268// Mutation(bool),
269// OtherDetails(String),
270// Synthetic(String),
271// OrganismScientific(String),
272// OrganismCommon { organisms: Vec<String> },
273// OrganismTaxId { id: Vec<u32> },
274// Strain(String),
275// Variant(String),
276// CellLine(String),
277// Atcc(u32),
278// Organ(String),
279// Tissue(String),
280// Cell(String),
281// Organelle(String),
282// Secretion(String),
283// CellularLocation(String),
284// Plasmid(String),
285// Gene { gene: Vec<String> },
286// ExpressionSystem(String),
287// ExpressionSystemCommon { systems: Vec<String> },
288// ExpressionSystemTaxId { id: Vec<u32> },
289// ExpressionSystemStrain(String),
290// ExpressionSystemVariant(String),
291// ExpressionSystemCellLine(String),
292// ExpressionSystemAtcc(u32),
293// ExpressionSystemOrgan(String),
294// ExpressionSystemTissue(String),
295// ExpressionSystemCell(String),
296// ExpressionSystemOrganelle(String),
297// ExpressionSystemCellularLocation(String),
298// ExpressionSystemVectorType(String),
299// ExpressionSystemVector(String),
300// ExpressionSystemPlasmid(String),
301// ExpressionSystemGene(String),
302// }
303
304// /// Represents a modification made to this pdb entry.
305// #[derive(Debug, Clone)]
306// pub struct Revdat {
307// pub modification_number: u32,
308// pub modification_date: NaiveDate,
309// pub idcode: String,
310// pub modification_type: ModificationType,
311// pub modification_detail: Vec<String>,
312// }
313
314// /// modification type of REVDAT record
315// #[derive(Debug, Clone)]
316// pub enum ModificationType {
317// /// initial release of the entry. Indicated as 0
318// /// in a REVDAT record
319// InitialRelease,
320// /// modifications other than initial release
321// /// Indicated with 1 in a REVDAT record.
322// OtherModification,
323// /// modification type other than 0 or 1
324// UnknownModification,
325// }
326
327// /// Serial Number Type of a JRNL REFN record
328// #[derive(Debug, Clone, PartialEq)]
329// pub enum SerialNumber {
330// Issn,
331// Essn,
332// }
333
// /// contains HEADER record information
335// #[derive(Debug, Clone)]
336// pub struct Header {
337// pub classification: String,
338// pub deposition_date: NaiveDate,
339// pub id_code: String,
340// }
341
342// impl std::default::Default for Header {
343// fn default() -> Self {
344// Header {
345// classification: String::default(),
346// deposition_date: NaiveDate::from_ymd(1900, 1, 1),
347// id_code: String::default(),
348// }
349// }
350// }
351
352// /// result of a TITLE record
353// #[derive(Debug, Clone, Default)]
354// pub struct Title {
355// pub title: String,
356// }
357
// /// contains the ids of the PDB entries that replaced
// /// this (now obsolete) entry
360// #[derive(Debug, Clone)]
361// pub struct Obslte {
362// pub replacement_date: NaiveDate,
363// pub replacement_ids: Vec<String>,
364// }
365
366// impl std::default::Default for Obslte {
367// fn default() -> Self {
368// Obslte {
369// replacement_date: NaiveDate::from_ymd(1900, 1, 1),
370// replacement_ids: Vec::new(),
371// }
372// }
373// }
374
375// /// if this entry is a part of bigger
376// /// structure, this struct holds ids of other
377// /// parts of the bigger structure
378// #[derive(Debug, Clone, Default)]
379// pub struct Split {
380// pub id_codes: Vec<String>,
381// }
382
383// /// fallacies of this entry
384// #[derive(Debug, Clone, Default)]
385// pub struct Caveat {
386// pub id_code: String,
387// pub comment: String,
388// }
389
390// /// pdb entry ids made obsolete by this entry
391// #[derive(Debug, Clone)]
392// pub struct Sprsde {
393// pub sprsde_date: NaiveDate,
394// pub id_code: String,
395// pub superseeded: Vec<String>,
396// }
397
398// impl std::default::Default for Sprsde {
399// fn default() -> Self {
400// Sprsde {
401// sprsde_date: NaiveDate::from_ymd(1900, 1, 1),
402// superseeded: Vec::new(),
403// id_code: String::default(),
404// }
405// }
406// }
407
408// #[derive(Debug, Clone, Default)]
409// pub struct Seqres {
410// pub chain_id: Option<char>,
411// pub residues: Vec<String>,
412// }
413
414// /// model type of the entry
415// #[derive(Debug, Clone, Default)]
416// pub struct Mdltyp {
417// pub structural_annotation: Vec<String>,
418// }
419
420// /// collection of revisions
421// #[derive(Debug, Clone, Default)]
422// pub struct Revdats {
423// pub revdat: Vec<Revdat>,
424// }
425
426// /// collection of tokens in a CMPND record
427// #[derive(Debug, Clone, Default)]
428// pub struct Cmpnd {
429// pub tokens: Vec<Token>,
430// }
431
432// /// collection of tokens in a SOURCE record
433// #[derive(Debug, Clone, Default)]
434// pub struct Source {
435// pub tokens: Vec<Token>,
436// }
437
438// /// keywords related to the entry
439// #[derive(Debug, Clone, Default)]
440// pub struct Keywds {
441// pub keywords: Vec<String>,
442// }
443
444// /// journal author collection
445// #[derive(Debug, Clone, Default)]
446// pub struct JournalAuthors<'a> {
447// pub authors: Vec<&'a [u8]>,
448// }
449
450// /// journal title
451// #[derive(Debug, Clone, Default)]
452// pub struct JournalTitle {
453// pub title: String,
454// }
455
456// /// journal editor collection
457// #[derive(Debug, Clone, Default)]
458// pub struct JournalEditors<'a> {
459// pub name: Vec<&'a [u8]>,
460// }
461
462// /// journal reference
463// #[derive(Debug, Clone, Default)]
464// pub struct JournalReference {
465// pub publication_name: String,
466// pub volume: Option<u32>,
467// pub page: Option<u32>,
468// pub year: Option<u32>,
469// }
470
471// /// journal Citation fields
472// #[derive(Debug, Clone, Default)]
473// pub struct JournalCitation {
474// pub serial_type: Option<SerialNumber>,
475// pub serial: Option<String>,
476// }
477
478// /// journal publication fields
479// #[derive(Debug, Clone, Default)]
480// pub struct JournalPublication {
481// pub publication: String,
482// }
483
484// /// journal PubMed id
485// #[derive(Debug, Clone, Default)]
486// pub struct JournalPubMedId {
487// pub id: u32,
488// }
489
490// /// digital object identifier of related e-pub
491// #[derive(Debug, Clone, Default)]
492// pub struct JournalDoi {
493// pub id: String,
494// }
495
496// /// number of models in this file
497// #[derive(Debug, Clone, Default)]
498// pub struct Nummdl {
499// pub num: u32,
500// }
501
502// /// cross references to other sequence databases
503// #[derive(Debug, Clone, Default)]
504// pub struct Dbref {
505// pub idcode: String,
506// pub chain_id: char,
507// pub seq_begin: u32,
508// pub initial_sequence: Option<char>,
509// pub seq_end: u32,
510// pub ending_sequence: Option<char>,
511// pub database: String,
512// pub db_accession: String,
513// pub db_idcode: String,
514// pub db_seq_begin: u32,
515// pub idbns_begin: Option<char>,
516// pub db_seq_end: u32,
517// pub dbins_end: Option<char>,
518// }