1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/// A simple single-thread parser.
use crate::{coordinate::*, crystallography::*, primary_structure::*, title_section::*};

// use crate::common::error::PdbParseError;
use crate::common::parser::FieldParser;
use nom::bytes::complete::take;
use nom::character::complete::{line_ending, not_line_ending};
use nom::IResult;
use protein_core::types::model::Model;
use serde::{Deserialize, Serialize};
// use nom::Err::Error;

use protein_core::io::pdb::Pdb;

/// http://www.wwpdb.org/documentation/file-format-content/format33/sect1.html
#[derive(Eq, PartialEq, Debug, Ord, PartialOrd)]
enum Fields {
    Header,  // M
    Obslte,  // O : Mandatory in entries that have been replaced by a newer entry.
    Title,   // M
    Split, //   O : Mandatory when  large macromolecular complexes are split into multiple PDB entries.
    Caveat, // O : Mandatory when there are outstanding errors such as chirality.
    Compnd, // M
    Source, // M
    Keywds, // M
    Expdta, // M
    Nummdl, // O : Mandatory for  NMR ensemble entries.
    Mdltyp, // O : Mandatory for  NMR minimized average Structures or when the entire  polymer chain contains C alpha or P atoms only.
    Author, // M
    Revdat, // M
    Sprsde, // O : Mandatory for a replacement entry.
    Jrnl,   //   O: Mandatory for a publication describes the experiment.
    Remark0, // O : Mandatory for a re-refined structure
    Remark1, // O
    Remark2, // M
    Remark3, // M
    RemarkN, // O : Mandatory under certain conditions
    Dbref,  //  O : Mandatory for all polymers.
    Dbref1,
    Dbref2, // O : Mandatory when certain sequence database accession and/or sequence numbering does not fit preceding DBREF format.
    SeqAdv, // O : Mandatory if sequence  conflict exists.
    SeqRes, // O : Mandatory if ATOM records exist.
    Modres, // O : Mandatory if modified group exists in the coordinates.
    Het,    // O : Mandatory if a non-standard group other than water appears in the coordinates.
    Hetnam, // O : Mandatory if a non-standard group otherthan  water appears in the coordinates.
    Hetsyn, // O
    Formul, // O : Mandatory if a non-standard group or water appears in the coordinates.
    Helix,  // O
    Sheet,  // O
    Ssbond, // O : Mandatory if a  disulfide bond is present.
    Link,   // O : Mandatory if  non-standard residues appear in a polymer
    Cispep, // O
    Site,   // O
    Cryst1, // M
    Origx1, // M
    Origx2, // M
    Origx3, // M
    Scale1, // M
    Scale2, // M
    Scale3, // M
    Mtrix1, // O Mandatory if  the complete asymmetric unit
    Mtrix2, // O must  be generated from the given coordinates
    Mtrix3, // O using non-crystallographic symmetry.
    Model,  // O : Mandatory if more than one model is present in the entry.
    Atom,   // O : Mandatory if standard residues exist.
    Anisou, // O
    Ter,    // O : Mandatory if ATOM records exist.
    Hetatm, // O : Mandatory if non-standard group exists.
    Endmdl, // O : Mandatory if MODEL appears.
    Conect, // O : Mandatory if non-standard group appears and if LINK or SSBOND records exist.
    Master, // M
    End,    // M
}

#[derive(Eq, PartialEq, Debug, Ord, PartialOrd)]
enum Section {
    Title, // HEADER,  OBSLTE, TITLE, SPLIT, CAVEAT, COMPND, SOURCE, KEYWDS, EXPDTA, NUMMDL, MDLTYP, AUTHOR, REVDAT, PRSDE, JRNL
    Remark, // REMARKs  0-999
    PrimaryStructure, // DBREF, SEQADV, SEQRES MODRES
    Heterogen, // HET, HETNAM, HETSYN, FORMUL
    SecondaryStructure, // HELIX, SHEET
    Connectivity, // CONECT, SSBOND, LINK, CISPEP
    Misc,  // SITE
    Crystallography, // CRYST1
    CoordinateTransformation, // ORIGXn,  SCALEn, MTRIXn,
    Coordinate, // MODEL, ATOM, ANISOU, TER, HETATM, ENDMDL
}
/// Commas, colons, and semi-colons are used as list delimiters in records that have one of the following data types:
///
/// - List
/// - SList
/// - Specification List
/// - Specification
///
/// If a comma, colon, or semi-colon is used in any context other than as a delimiting character, then the character must be escaped, i.e., immediately preceded by a backslash, "\".
enum Dtype {
    // To interpret a String, concatenate the contents of all continued fields together, collapse all sequences of multiple blanks to a single blank, and remove any leading and trailing blanks. This permits very long strings to be properly reconstructed.
    List,              // A String that is composed of text separated with commas.
    SList,             // A String that is composed of text separated with semi-colons.
    Specification, // A String composed of a token and its  associated value separated by a colon.
    SpecificationList, // A sequence of Specifications, separated by semi-colons.
    String,
    Oneline,
}

enum ParserState {
    FirstLine,
    Continue,
}

pub struct Parser<'a> {
    state: ParserState,
    remaining: &'a str,
    buffer: String,
}

impl<'a> Parser<'a> {
    pub fn parse(mut inp: &str) -> nom::IResult<&str, Pdb> {
        let mut pdb = Pdb::default();
        pdb.models = vec![Model::default()];
        let mut model_idx = 0;
        loop {
            let (i, tag) = take(6usize)(inp)?;
            inp = match tag {
                "HEADER" => HeaderParser::parse_into(&i, &mut pdb.header),
                "TITLE " => TitleParser::parse_into(&i, &mut pdb.title),
                "AUTHOR" => AuthorsParser::parse_into(&i, &mut pdb.authors),
                "CRYST1" => Cryst1Parser::parse_into(&i, &mut pdb.cryst1),
                "SEQRES" => SeqResParser::parse_into(&i, &mut pdb.seqres),
                "MODRES" => ModresParser::parse_into(&i, &mut pdb.modres),
                "EXPDTA" => {
                    ExperimentalTechniquesParser::parse_into(&i, &mut pdb.experimental_techniques)
                }
                "ATOM  " => AtomParser::parse_into_vec(&i, &mut pdb.models[model_idx].atoms),
                "ANISOU" => AnisouParser::parse_into_vec(&i, &mut pdb.models[model_idx].anisou),
                "ENDMDL" => {
                    pdb.models.push(Model::default());
                    model_idx += 1;
                    let (i, _) = not_line_ending(i)?;
                    let (i, _) = line_ending(i)?;
                    i
                }
                "END   " => {
                    inp = "";
                    break;
                }
                _ => {
                    // new line
                    let (i, _) = not_line_ending(i)?;
                    let (i, _) = line_ending(i)?;
                    i
                } //panic!("Unkown field"),
            }
        }
        Ok((inp, pdb))
    }
}