nom_pdb/common/
parser.rs

1// Copyright (c) 2020 Tianyi Shi
2//
3// This software is released under the MIT License.
4// https://opensource.org/licenses/MIT
5
6use crate::types::*;
7use atoi::atoi;
8use chrono::{
9    format::{strftime::StrftimeItems, Parsed},
10    NaiveDate,
11};
12use nom::{
13    bytes::complete::{is_not, tag, take, take_while},
14    character::complete::{anychar, char, line_ending, multispace1, not_line_ending},
15    combinator::{map, map_res, peek},
16    IResult,
17};
18
19pub trait FieldParser {
20    type Output;
21    fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output>;
22    fn parse_into<'a, 'b>(inp: &'a [u8], dst: &'b mut Self::Output) -> &'a [u8] {
23        let (i, data) = Self::parse(inp).expect("parse error");
24        *dst = data;
25        i
26    }
27    fn parse_into_vec<'a>(inp: &'a [u8], dst: &mut Vec<Self::Output>) -> &'a [u8] {
28        let (i, data) = Self::parse(inp).expect("parse error");
29        dst.push(data);
30        i
31    }
32    fn parse_into_option<'a>(inp: &'a [u8], dst: &mut Option<Self::Output>) -> &'a [u8] {
33        let (i, data) = Self::parse(inp).expect("parse error");
34        *dst = Some(data);
35        i
36    }
37}
38
39pub trait FieldParserWithModifiedTable {
40    type Output;
41    fn parse<'a>(
42        inp: &'a [u8],
43        modified_aa: &ModifiedAminoAcidTable,
44        modified_nuc: &ModifiedNucleotideTable,
45    ) -> IResult<&'a [u8], Self::Output>;
46    fn parse_into<'a, 'b>(
47        inp: &'a [u8],
48        dst: &'b mut Self::Output,
49        modified_aa: &ModifiedAminoAcidTable,
50        modified_nuc: &ModifiedNucleotideTable,
51    ) -> &'a [u8] {
52        let (i, data) = Self::parse(inp, modified_aa, modified_nuc).expect("parse error");
53        *dst = data;
54        i
55    }
56    fn parse_into_vec<'a>(
57        inp: &'a [u8],
58        dst: &mut Vec<Self::Output>,
59        modified_aa: &ModifiedAminoAcidTable,
60        modified_nuc: &ModifiedNucleotideTable,
61    ) -> &'a [u8] {
62        let (i, data) = Self::parse(inp, modified_aa, modified_nuc).expect("parse error");
63        dst.push(data);
64        i
65    }
66}
67
68// fn ws<'a, F: 'a, O, E: ParseError<&'a [u8]>>(inner: F) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O, E>
69// where
70//     F: Fn(&'a [u8]) -> IResult<&'a [u8], O, E>,
71// {
72//     preceded(multispace0, &inner)(i)
73// }
74
75pub(crate) fn jump_newline(inp: &[u8]) -> IResult<&[u8], ()> {
76    let (inp, _) = not_line_ending(inp)?;
77    let (inp, _) = line_ending(inp)?;
78    Ok((inp, ()))
79}
80
/// Returns `true` only for the ASCII space byte (`0x20`); tabs and line
/// breaks do not count.
fn char_is_space(c: u8) -> bool {
    matches!(c, b' ')
}
84
85pub(crate) fn parse_date(i: &[u8]) -> IResult<&[u8], NaiveDate> {
86    let (i, day) = take(2usize)(i)?;
87    let (i, _) = take(1usize)(i)?;
88    let (i, month) = parse_month(i)?;
89    let (i, _) = take(1usize)(i)?;
90    let (i, year) = take(2usize)(i)?;
91    let mut year = atoi::<i32>(year).unwrap();
92    if year < 50i32 {
93        year += 2000
94    } else {
95        year += 1900
96    }
97    Ok((
98        i,
99        NaiveDate::from_ymd(year, month, atoi::<u32>(day).unwrap()),
100    ))
101}
102
103fn parse_month(i: &[u8]) -> IResult<&[u8], u32> {
104    map_res(take(3usize), |s: &[u8]| -> Result<u32, ()> {
105        let s = unsafe { std::str::from_utf8_unchecked(s) };
106        let mut parsed = Parsed::new();
107        chrono::format::parse(&mut parsed, s, StrftimeItems::new("%b"))
108            .expect("Failed to parse month");
109        Ok(parsed.month.unwrap())
110    })(i)
111}
112
113pub(crate) fn parse_right<'a, T>(i: &'a [u8], length: usize) -> IResult<&'a [u8], T>
114where
115    T: std::str::FromStr,
116{
117    let (i, s) = take_while(char_is_space)(i)?;
118    let l = s.len();
119    if l >= length {
120        panic!("Failed to parse int")
121    }
122    let (i, digit) = take(length - l)(i)?;
123    let digit = unsafe { std::str::from_utf8_unchecked(digit) };
124    match digit.parse() {
125        Err(_) => Err(nom::Err::Error((i, nom::error::ErrorKind::Digit))),
126        Ok(x) => Ok((i, x)),
127    }
128}
129
130// * MULTILINE PARSERS ---------------------------------------------------------
131
132pub(crate) fn parse_multiline_list(inp: &[u8]) -> IResult<&[u8], Vec<String>> {
133    // ! need improvement
134    let (mut inp, _) = take(4usize)(inp)?; // 7 - 10
135    let mut v: Vec<String> = Vec::new();
136    loop {
137        let (i, item) = take_while(|c| c != b',' && c != b'\n' && c != b'\r')(inp)?;
138        let item = unsafe { std::str::from_utf8_unchecked(item) };
139        v.push(item.trim().to_owned());
140        let (i, comma_or_newline) = anychar(i)?; // consume \r or \n if newline
141        if comma_or_newline == ',' {
142            let (i, char_after_comma) = peek(anychar)(i)?;
143            if char_after_comma == ' ' {
144                let (i, second_char_after_comma) = peek(anychar)(anychar(i)?.0)?;
145                if !second_char_after_comma.is_alphanumeric() {
146                    // newline
147                    inp = multispace1(i)?.0;
148                    inp = take(10usize)(inp)?.0; // 1 - 10
149                } else {
150                    inp = i;
151                }
152            } else {
153                inp = i;
154            }
155        } else {
156            // end
157            inp = take_while(|x| x == b'\n')(i)?.0;
158            return Ok((inp, v));
159        }
160    }
161}
162
163pub(crate) fn parse_multiline_string<'a>(
164    inp: &'a [u8],
165    record_identifier: &[u8],
166) -> IResult<&'a [u8], String> {
167    // ! need improvement
168    let (mut inp, _) = take(4usize)(inp)?; // 7 - 10
169    let mut s = String::new();
170    loop {
171        let (i, item) = not_line_ending(inp)?;
172        let item = std::str::from_utf8(item).unwrap();
173        s.push_str(item.trim_end());
174        let (i, _) = line_ending(i)?;
175        if peek(take(6usize))(i)?.1 != record_identifier {
176            return Ok((i, s));
177        }
178        let (i, _) = take(10usize)(i)?;
179        inp = i;
180    }
181}
182
183pub(crate) fn parse_multiline<'a, T, F>(
184    inp: &'a [u8],
185    record_identifier: &[u8],
186    continuation: bool,
187    parse_oneline: F,
188) -> IResult<&'a [u8], Vec<T>>
189where
190    F: Fn(&'a [u8]) -> IResult<&'a [u8], T>,
191{
192    // ! need improvement
193    let offset = if continuation { 10usize } else { 6usize };
194    let (mut inp, _) = take(4usize)(inp)?; // 7 - 10
195    let mut res = Vec::<T>::new();
196    loop {
197        let (i, item) = parse_oneline(inp)?;
198        res.push(item);
199        if peek(take(6usize))(i)?.1 != record_identifier {
200            return Ok((i, res));
201        }
202        let (i, _) = take(offset)(i)?;
203        inp = i;
204    }
205}
206
207use std::collections::HashMap;
208
209pub(crate) fn parse_residue<'a, 'b>(
210    inp: &'a [u8],
211    modified_aa: &'b ModifiedAminoAcidTable,
212    modified_nuc: &'b ModifiedNucleotideTable,
213) -> IResult<&'a [u8], Residue> {
214    let (inp, residue) = take(3usize)(inp)?;
215    let residue_s = unsafe { std::str::from_utf8_unchecked(residue).to_owned() };
216    let residue = if let Some(res) = StandardAminoAcid::try_parse_fw3(&residue) {
217        Residue::AminoAcid(AminoAcid::Standard(res))
218    } else if let Some(_res) = modified_aa.get(&residue_s) {
219        Residue::AminoAcid(AminoAcid::Modified(residue_s))
220    } else if let Some(res) = StandardNucleotide::try_parse_fw3(&residue) {
221        Residue::Nucleotide(Nucleotide::Standard(res))
222    } else if let Some(_res) = modified_nuc.get(&residue_s) {
223        Residue::Nucleotide(Nucleotide::Modified(residue_s))
224    } else {
225        match residue {
226            b"HOH" => Residue::Water,
227            b"UNX" => Residue::UnknownAtomOrIon,
228            b"UNL" => Residue::UnknownLigand,
229            _ => Residue::Other(residue_s),
230        }
231    };
232    Ok((inp, residue))
233}
234
235pub(crate) unsafe fn take_trim_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
236    let (inp, x) = take(n)(inp)?;
237    Ok((inp, std::str::from_utf8_unchecked(x).trim().to_owned()))
238}
239
240pub(crate) unsafe fn take_trim_start_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
241    let (inp, x) = take(n)(inp)?;
242    Ok((
243        inp,
244        std::str::from_utf8_unchecked(x).trim_start().to_owned(),
245    ))
246}
247
248// pub(crate) unsafe fn take_trim_end_own(inp: &[u8], n: usize) -> IResult<&[u8], String> {
249//     let (inp, x) = take(n)(inp)?;
250//     Ok((inp, std::str::from_utf8_unchecked(x).trim_end().to_owned()))
251// }
252
253// pub(crate) fn parse_specification(inp: &[u8]) -> IResult<&[u8], Token> {
254//     let (mut inp, _) = take(4usize)(inp)?;
255//     let (inp, token) = is_not(":")(inp)?;
256// }
257
258// Represents keys of CMPND and SOURCE records
259// #[derive(Debug, PartialEq, Clone)]
260// pub enum Token {
261//     MoleculeId(u32),
262//     Molecule(String),
263//     Chain { identifiers: Vec<String> },
264//     Fragment(String),
265//     Synonym { synonyms: Vec<String> },
266//     Ec { commission_numbers: Vec<String> },
267//     Engineered(bool),
268//     Mutation(bool),
269//     OtherDetails(String),
270//     Synthetic(String),
271//     OrganismScientific(String),
272//     OrganismCommon { organisms: Vec<String> },
273//     OrganismTaxId { id: Vec<u32> },
274//     Strain(String),
275//     Variant(String),
276//     CellLine(String),
277//     Atcc(u32),
278//     Organ(String),
279//     Tissue(String),
280//     Cell(String),
281//     Organelle(String),
282//     Secretion(String),
283//     CellularLocation(String),
284//     Plasmid(String),
285//     Gene { gene: Vec<String> },
286//     ExpressionSystem(String),
287//     ExpressionSystemCommon { systems: Vec<String> },
288//     ExpressionSystemTaxId { id: Vec<u32> },
289//     ExpressionSystemStrain(String),
290//     ExpressionSystemVariant(String),
291//     ExpressionSystemCellLine(String),
292//     ExpressionSystemAtcc(u32),
293//     ExpressionSystemOrgan(String),
294//     ExpressionSystemTissue(String),
295//     ExpressionSystemCell(String),
296//     ExpressionSystemOrganelle(String),
297//     ExpressionSystemCellularLocation(String),
298//     ExpressionSystemVectorType(String),
299//     ExpressionSystemVector(String),
300//     ExpressionSystemPlasmid(String),
301//     ExpressionSystemGene(String),
302// }
303
304// /// Represents a modification made to this pdb entry.
305// #[derive(Debug, Clone)]
306// pub struct Revdat {
307//     pub modification_number: u32,
308//     pub modification_date: NaiveDate,
309//     pub idcode: String,
310//     pub modification_type: ModificationType,
311//     pub modification_detail: Vec<String>,
312// }
313
314// /// modification type of REVDAT record
315// #[derive(Debug, Clone)]
316// pub enum ModificationType {
317//     /// initial release of the entry. Indicated as 0
318//     /// in a REVDAT record
319//     InitialRelease,
320//     /// modifications other than initial release
321//     /// Indicated with 1 in a REVDAT record.
322//     OtherModification,
323//     /// modification type other than 0 or 1
324//     UnknownModification,
325// }
326
327// /// Serial Number Type of a JRNL REFN record
328// #[derive(Debug, Clone, PartialEq)]
329// pub enum SerialNumber {
330//     Issn,
331//     Essn,
332// }
333
// /// contains HEADER record information
335// #[derive(Debug, Clone)]
336// pub struct Header {
337//     pub classification: String,
338//     pub deposition_date: NaiveDate,
339//     pub id_code: String,
340// }
341
342// impl std::default::Default for Header {
343//     fn default() -> Self {
344//         Header {
345//             classification: String::default(),
346//             deposition_date: NaiveDate::from_ymd(1900, 1, 1),
347//             id_code: String::default(),
348//         }
349//     }
350// }
351
352// /// result of a TITLE record
353// #[derive(Debug, Clone, Default)]
354// pub struct Title {
355//     pub title: String,
356// }
357
358// /// contains pdb entry ids which removed
359// /// this one from PDB
360// #[derive(Debug, Clone)]
361// pub struct Obslte {
362//     pub replacement_date: NaiveDate,
363//     pub replacement_ids: Vec<String>,
364// }
365
366// impl std::default::Default for Obslte {
367//     fn default() -> Self {
368//         Obslte {
369//             replacement_date: NaiveDate::from_ymd(1900, 1, 1),
370//             replacement_ids: Vec::new(),
371//         }
372//     }
373// }
374
375// /// if this entry is a part of bigger
376// /// structure, this struct holds ids of other
377// /// parts of the bigger structure
378// #[derive(Debug, Clone, Default)]
379// pub struct Split {
380//     pub id_codes: Vec<String>,
381// }
382
383// /// fallacies of this entry
384// #[derive(Debug, Clone, Default)]
385// pub struct Caveat {
386//     pub id_code: String,
387//     pub comment: String,
388// }
389
390// /// pdb entry ids made obsolete by this entry
391// #[derive(Debug, Clone)]
392// pub struct Sprsde {
393//     pub sprsde_date: NaiveDate,
394//     pub id_code: String,
395//     pub superseeded: Vec<String>,
396// }
397
398// impl std::default::Default for Sprsde {
399//     fn default() -> Self {
400//         Sprsde {
401//             sprsde_date: NaiveDate::from_ymd(1900, 1, 1),
402//             superseeded: Vec::new(),
403//             id_code: String::default(),
404//         }
405//     }
406// }
407
408// #[derive(Debug, Clone, Default)]
409// pub struct Seqres {
410//     pub chain_id: Option<char>,
411//     pub residues: Vec<String>,
412// }
413
414// /// model type of the entry
415// #[derive(Debug, Clone, Default)]
416// pub struct Mdltyp {
417//     pub structural_annotation: Vec<String>,
418// }
419
420// /// collection of revisions
421// #[derive(Debug, Clone, Default)]
422// pub struct Revdats {
423//     pub revdat: Vec<Revdat>,
424// }
425
426// /// collection of tokens in a CMPND record
427// #[derive(Debug, Clone, Default)]
428// pub struct Cmpnd {
429//     pub tokens: Vec<Token>,
430// }
431
432// /// collection of tokens in a SOURCE record
433// #[derive(Debug, Clone, Default)]
434// pub struct Source {
435//     pub tokens: Vec<Token>,
436// }
437
438// /// keywords related to the entry
439// #[derive(Debug, Clone, Default)]
440// pub struct Keywds {
441//     pub keywords: Vec<String>,
442// }
443
444// /// journal author collection
445// #[derive(Debug, Clone, Default)]
446// pub struct JournalAuthors<'a> {
447//     pub authors: Vec<&'a [u8]>,
448// }
449
450// /// journal title
451// #[derive(Debug, Clone, Default)]
452// pub struct JournalTitle {
453//     pub title: String,
454// }
455
456// /// journal editor collection
457// #[derive(Debug, Clone, Default)]
458// pub struct JournalEditors<'a> {
459//     pub name: Vec<&'a [u8]>,
460// }
461
462// /// journal reference
463// #[derive(Debug, Clone, Default)]
464// pub struct JournalReference {
465//     pub publication_name: String,
466//     pub volume: Option<u32>,
467//     pub page: Option<u32>,
468//     pub year: Option<u32>,
469// }
470
471// /// journal Citation fields
472// #[derive(Debug, Clone, Default)]
473// pub struct JournalCitation {
474//     pub serial_type: Option<SerialNumber>,
475//     pub serial: Option<String>,
476// }
477
478// /// journal publication fields
479// #[derive(Debug, Clone, Default)]
480// pub struct JournalPublication {
481//     pub publication: String,
482// }
483
484// /// journal PubMed id
485// #[derive(Debug, Clone, Default)]
486// pub struct JournalPubMedId {
487//     pub id: u32,
488// }
489
490// /// digital object identifier of related e-pub
491// #[derive(Debug, Clone, Default)]
492// pub struct JournalDoi {
493//     pub id: String,
494// }
495
496// /// number of models in this file
497// #[derive(Debug, Clone, Default)]
498// pub struct Nummdl {
499//     pub num: u32,
500// }
501
502// /// cross references to other sequence databases
503// #[derive(Debug, Clone, Default)]
504// pub struct Dbref {
505//     pub idcode: String,
506//     pub chain_id: char,
507//     pub seq_begin: u32,
508//     pub initial_sequence: Option<char>,
509//     pub seq_end: u32,
510//     pub ending_sequence: Option<char>,
511//     pub database: String,
512//     pub db_accession: String,
513//     pub db_idcode: String,
514//     pub db_seq_begin: u32,
515//     pub idbns_begin: Option<char>,
516//     pub db_seq_end: u32,
517//     pub dbins_end: Option<char>,
518// }