1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
// Copyright (c) 2020 Tianyi Shi
//
// This software is released under the MIT License.
// https://opensource.org/licenses/MIT

use crate::common::parser::FieldParser;
use crate::common::parser::{jump_newline, parse_right, take_trim_start_own};
use crate::types::{
    AtomName, Helix, HelixClass, ParseFw4, Registration, ResidueSerial, SecondaryStructureSerial,
    Sense, Sheet, Ssbond, Strand,
};
use nom::{bytes::complete::take, character::complete::anychar, combinator::map, IResult};

/// # Overview
///
/// HELIX records are used to identify the position of helices in the molecule. Helices are named, numbered, and classified by type. The residues where the helix begins and ends are noted, as well as the total length.
///
/// # Record Format
///
/// | COLUMNS | DATA  TYPE    | FIELD       | DEFINITION                                 |
/// | ------- | ------------- | ----------- | ------------------------------------------ |
/// | 1 -  6  | Record name   | "HELIX "    |                                            |
/// | 8 - 10  | Integer       | serNum      | Serial number of the helix. This starts    |
/// |         |               |             | at 1  and increases incrementally.         |
/// | 12 - 14 | LString(3)    | helixID     | Helix  identifier. In addition to a serial |
/// |         |               |             | number, each helix is given an             |
/// |         |               |             | alphanumeric character helix identifier.   |
/// | 16 - 18 | Residue name  | initResName | Name of the initial residue.               |
/// | 20      | Character     | initChainID | Chain identifier for the chain containing  |
/// |         |               |             | this  helix.                               |
/// | 22 - 25 | Integer       | initSeqNum  | Sequence number of the initial residue.    |
/// | 26      | AChar         | initICode   | Insertion code of the initial residue.     |
/// | 28 - 30 | Residue  name | endResName  | Name of the terminal residue of the helix. |
/// | 32      | Character     | endChainID  | Chain identifier for the chain containing  |
/// |         |               |             | this  helix.                               |
/// | 34 - 37 | Integer       | endSeqNum   | Sequence number of the terminal residue.   |
/// | 38      | AChar         | endICode    | Insertion code of the terminal residue.    |
/// | 39 - 40 | Integer       | helixClass  | Helix class (see below).                   |
/// | 41 - 70 | String        | comment     | Comment about this helix.                  |
/// | 72 - 76 | Integer       | length      | Length of this helix.                      |
///
/// # Details
///
/// Additional HELIX records with different serial numbers and identifiers occur if more than one helix is present.
/// The initial residue of the helix is the N-terminal residue.
/// Helices are classified as follows:
///
/// |TYPE OF HELIX                   | CLASS NUMBER (COLUMNS 39 - 40) |
/// |--------------------------------|--------------------------------|
/// |Right-handed alpha (default)    |            1                 |
/// |Right-handed omega              |            2                 |
/// |Right-handed pi                 |            3                 |
/// |Right-handed gamma              |            4                 |
/// |Right-handed 3 - 10             |            5                 |
/// |Left-handed alpha               |            6                 |
/// |Left-handed omega               |            7                 |
/// |Left-handed gamma               |            8                 |
/// |2 - 7 ribbon/helix              |            9                 |
/// |Polyproline                     |           10                 |
///
/// # Parsing notes
///
/// The `FieldParser` implementation expects its input to begin at column 7,
/// i.e. directly after the six-character record name. Of the fields listed
/// above it keeps `helixID`, the two chain identifiers, the two sequence
/// numbers, `helixClass` and `comment`; the serial number, residue names and
/// insertion codes are read past and discarded.
pub struct HelixParser;

impl FieldParser for HelixParser {
    type Output = Helix;

    /// Parses the body of a single HELIX record into a [`Helix`].
    ///
    /// `inp` must start at column 7 of the record (right after the record
    /// name). Fixed-width columns that are not stored are skipped with
    /// `take`, so a truncated record produces a parse error instead of an
    /// out-of-bounds panic.
    ///
    /// # Errors
    ///
    /// Returns the nom error of the first field parser that fails, which
    /// includes the record being too short for any fixed-width column range.
    fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output> {
        let (inp, _) = take(5usize)(inp)?; // 7; 8 - 10; 11
        let (inp, id) = take(3usize)(inp)?; // 12 - 14
        let (inp, _) = take(5usize)(inp)?; // 15; 16 - 18; 19
        let (inp, start_chain) = anychar(inp)?; // 20
        let (inp, _) = take(1usize)(inp)?; // 21
        let (inp, start_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 22 - 25
        let (inp, _start_icode) = anychar(inp)?; // 26
        let (inp, _) = take(5usize)(inp)?; // 27; 28 - 30; 31
        let (inp, end_chain) = anychar(inp)?; // 32
        let (inp, _) = take(1usize)(inp)?; // 33
        let (inp, end_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 34 - 37
        let (inp, _end_icode) = anychar(inp)?; // 38
        let (inp, class) = Self::parse_helix_class(inp)?; // 39 - 40
        let (inp, comment) = take(30usize)(inp)?; // 41 - 70
        let (inp, _) = jump_newline(inp)?;
        let helix = Helix {
            // The bytes come straight from the input file, so they are not
            // guaranteed to be valid UTF-8. A lossy conversion yields the same
            // text for valid input without relying on `unsafe`.
            id: String::from_utf8_lossy(id).trim().to_owned(),
            class,
            start: (start_chain, start_serial),
            end: (end_chain, end_serial),
            comment: String::from_utf8_lossy(comment).trim().to_owned(),
        };
        Ok((inp, helix))
    }
}

impl HelixParser {
    /// Parses the two-column helix class code (columns 39 - 40) into a
    /// [`HelixClass`].
    ///
    /// The class numbers are 1-based: `1` is right-handed alpha and `10` is
    /// polyproline. Any other number maps to `HelixClass::Unknown`.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `parse_right` when the two-column
    /// field cannot be read as an integer.
    pub fn parse_helix_class(inp: &[u8]) -> IResult<&[u8], HelixClass> {
        use HelixClass::*;
        let (inp, code) = parse_right::<usize>(inp, 2)?;

        // An explicit match keeps the 1-based class numbers next to their
        // variants and cannot index out of bounds.
        let class = match code {
            1 => RightHandedAlpha,
            2 => RightHandedOmega,
            3 => RightHandedPi,
            4 => RightHandedGamma,
            5 => RightHanded310,
            6 => LeftHandedAlpha,
            7 => LeftHandedOmega,
            8 => LeftHandedGamma,
            9 => TwoSevenRibbonHelix,
            10 => Polyproline,
            _ => Unknown,
        };
        Ok((inp, class))
    }
}

/// # Overview
///
/// SHEET records are used to identify the position of sheets in the molecule. Sheets are both named and numbered. The residues where the sheet begins and ends are noted.
///
/// # Record Format
///
/// | COLUMNS | DATA  TYPE   | FIELD       | DEFINITION                                        |
/// | ------- | ------------ | ----------- | ------------------------------------------------- |
/// | 1 -  6  | Record name  | "SHEET "    |                                                   |
/// | 8 - 10  | Integer      | strand      | Strand  number which starts at 1 for each         |
/// |         |              |             | strand within a sheet and increases by one.       |
/// | 12 - 14 | LString(3)   | sheetID     | Sheet  identifier.                                |
/// | 15 - 16 | Integer      | numStrands  | Number  of strands in sheet.                      |
/// | 18 - 20 | Residue name | initResName | Residue  name of initial residue.                 |
/// | 22      | Character    | initChainID | Chain identifier of initial residue               |
/// |         |              |             | in strand.                                        |
/// | 23 - 26 | Integer      | initSeqNum  | Sequence number of initial residue                |
/// |         |              |             | in strand.                                        |
/// | 27      | AChar        | initICode   | Insertion code of initial residue                 |
/// |         |              |             | in  strand.                                       |
/// | 29 - 31 | Residue name | endResName  | Residue name of terminal residue.                 |
/// | 33      | Character    | endChainID  | Chain identifier of terminal residue.             |
/// | 34 - 37 | Integer      | endSeqNum   | Sequence number of terminal residue.              |
/// | 38      | AChar        | endICode    | Insertion code of terminal residue.               |
/// | 39 - 40 | Integer      | sense       | Sense of strand with respect to previous          |
/// |         |              |             | strand in the sheet. 0 if first strand,           |
/// |         |              |             | 1 if parallel, and -1 if anti-parallel.           |
/// | 42 - 45 | Atom         | curAtom     | Registration.  Atom name in current strand.       |
/// | 46 - 48 | Residue name | curResName  | Registration.  Residue name in current strand     |
/// | 50      | Character    | curChainId  | Registration. Chain identifier in                 |
/// |         |              |             | current strand.                                   |
/// | 51 - 54 | Integer      | curResSeq   | Registration.  Residue sequence number            |
/// |         |              |             | in current strand.                                |
/// | 55      | AChar        | curICode    | Registration. Insertion code in                   |
/// |         |              |             | current strand.                                   |
/// | 57 - 60 | Atom         | prevAtom    | Registration.  Atom name in previous strand.      |
/// | 61 - 63 | Residue name | prevResName | Registration.  Residue name in                    |
/// |         |              |             | previous strand.                                  |
/// | 65      | Character    | prevChainId | Registration.  Chain identifier in                |
/// |         |              |             | previous  strand.                                 |
/// | 66 - 69 | Integer      | prevResSeq  | Registration. Residue sequence number             |
/// |         |              |             | in previous strand.                               |
/// | 70      | AChar        | prevICode   | Registration.  Insertion code in previous strand. |
///
/// # Details
///
/// - The initial residue for a strand is its N-terminus. Strand registration information is provided in columns 39 - 70. Strands are listed starting with one edge of the sheet and continuing to the spatially adjacent strand.
/// - The sense in columns 39 - 40 indicates whether strand n is parallel (sense = 1) or anti-parallel (sense = -1) to strand n-1. Sense is equal to zero (0) for the first strand of a sheet.
/// - The registration (columns 42 - 70) of strand n to strand n-1 may be specified by one hydrogen bond between each such pair of strands. This is done by providing the hydrogen bonding between the current and previous strands. No register information should be provided for the first strand.
/// - Split strands, or strands with two or more runs of residues from discontinuous parts of the amino acid sequence, are explicitly listed. A detailed description can be included in REMARK 700.
///
/// # Parsing notes
///
/// One call to the parser consumes every SHEET line of a single sheet. The
/// first line supplies the sheet identifier, `numStrands` and the first
/// strand; further lines are then read until the strand number found in
/// columns 8 - 10 reaches `numStrands`. The input is expected to begin at
/// column 7 of the sheet's first line.
pub struct SheetParser;

impl FieldParser for SheetParser {
    type Output = Sheet;
    fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output> {
        Self::parse_sheet(inp)
    }
}

impl SheetParser {
    fn parse_sheet(inp: &[u8]) -> IResult<&[u8], Sheet> {
        let mut sheet = Sheet::default();
        // first line
        let inp = &inp[5..]; // 7 - 11
        let (inp, id) = unsafe { take_trim_start_own(inp, 3usize)? }; // 12 - 14
        sheet.id = id;
        let (inp, num_strands) = parse_right::<SecondaryStructureSerial>(inp, 2)?; // 15 - 16
        let inp = &inp[1..]; // 17
        let (inp, first_strand) = Self::parse_first_line(inp)?;
        sheet.strands.push(first_strand);
        let mut i = 1 as SecondaryStructureSerial;
        let mut last_inp = inp;
        while i < num_strands {
            let (inp, _) = take(7usize)(last_inp)?; // 1 - 7
            let (inp, idx) = parse_right::<SecondaryStructureSerial>(inp, 3)?; // 8 - 10
            i = idx;
            let inp = &inp[7..]; // 11 - 17
            let (inp, (strand, registration)) = Self::parse_line(inp)?;
            sheet.strands.push(strand);
            sheet.registration.push(registration);
            last_inp = inp;
        }
        Ok((last_inp, sheet))
    }

    fn parse_first_line(inp: &[u8]) -> IResult<&[u8], Strand> {
        let (inp, res) = Self::parse_strand(inp)?;
        let (inp, _) = jump_newline(inp)?;
        Ok((inp, res))
    }

    fn parse_line(inp: &[u8]) -> IResult<&[u8], (Strand, Registration)> {
        let (inp, strand) = Self::parse_strand(inp)?;
        let inp = &inp[1..];
        let (inp, registration) = Self::parse_registration(inp)?;
        Ok((inp, (strand, registration)))
    }

    fn parse_strand(inp: &[u8]) -> IResult<&[u8], Strand> {
        // let (inp, _start_res) = map(take(3usize), parse_amino_acid)(inp)?;
        let inp = &inp[3..]; // 18 - 20
        let inp = &inp[1..]; //           21
        let (inp, start_chain) = anychar(inp)?; // 22
        let (inp, start_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 23 - 26
        let (inp, _start_icode) = anychar(inp)?; // 27
        let inp = &inp[1..]; // 28
                             // let (inp, _end_res) = map(take(3usize), parse_amino_acid)(inp)?;
        let inp = &inp[3..]; // 29 - 31
        let inp = &inp[1..]; //      32
        let (inp, end_chain) = anychar(inp)?; // 33
        let (inp, end_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 34 - 37
        let (inp, _end_icode) = anychar(inp)?; // 38
        let (inp, sense) = Self::parse_sense(inp)?;
        let strand = Strand {
            start: (start_chain, start_serial),
            end: (end_chain, end_serial),
            sense,
        };
        Ok((inp, strand))
    }
    fn parse_registration(inp: &[u8]) -> IResult<&[u8], Registration> {
        // | 42 - 45 | Atom         | curAtom     | Registration.  Atom name in current strand.       |
        // | 46 - 48 | Residue name | curResName  | Registration.  Residue name in current strand     |
        // | 50      | Character    | curChainId  | Registration. Chain identifier in                 |
        // |         |              |             | current strand.                                   |
        // | 51 - 54 | Integer      | curResSeq   | Registration.  Residue sequence number            |
        // |         |              |             | in current strand.                                |
        // | 55      | AChar        | curICode    | Registration. Insertion code in                   |
        // |         |              |             | current strand.                                   |
        // | 57 - 60 | Atom         | prevAtom    | Registration.  Atom name in previous strand.      |
        // | 61 - 63 | Residue name | prevResName | Registration.  Residue name in                    |
        // |         |              |             | previous strand.                                  |
        // | 65      | Character    | prevChainId | Registration.  Chain identifier in                |
        // |         |              |             | previous  strand.                                 |
        // | 66 - 69 | Integer      | prevResSeq  | Registration. Residue sequence number             |
        // |         |              |             | in previous strand.                               |
        // | 70      | AChar        | prevICode   | Registration.  Insertion code in previous strand. |
        let (inp, cur_atom) = map(take(4usize), AtomName::parse_fw4)(inp)?; // 42 - 45
        let inp = &inp[4..]; // 46 - 48; 49
        let (inp, cur_chain) = anychar(inp)?; // 50
        let (inp, cur_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 51 - 54
        let inp = &inp[2..]; // 55; 56
        let (inp, prev_atom) = map(take(4usize), AtomName::parse_fw4)(inp)?; // 57 - 60
        let inp = &inp[4..]; // 61 - 63; 64
        let (inp, prev_chain) = anychar(inp)?; // 65
        let (inp, prev_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 66 - 69
        let (inp, _) = jump_newline(inp)?;
        let registration = Registration {
            curr: (cur_atom, cur_chain, cur_serial),
            prev: (prev_atom, prev_chain, prev_serial),
        };
        Ok((inp, registration))
    }

    fn parse_sense(inp: &[u8]) -> IResult<&[u8], Sense> {
        let (inp, sense) = take(2usize)(inp)?;
        let sense = match sense {
            b" 1" => Sense::Parallel,
            b" 0" => Sense::Unknown,
            b"-1" => Sense::Antiparallel,
            _ => panic!("Error when parsing beta-strand sense!"),
        };
        Ok((inp, sense))
    }
}

/// # SSBOND
///
/// The SSBOND record identifies each disulfide bond in protein and polypeptide structures by identifying the two residues involved in the bond.
///
/// The disulfide bond distance is included after the symmetry operations at the end of the SSBOND record.
///
/// ## Record Format
///
/// | COLUMNS | DATA  TYPE  | FIELD    | DEFINITION                       |
/// | ------- | ----------- | -------- | -------------------------------- |
/// | 1 -  6  | Record name | "SSBOND" |                                  |
/// | 8 - 10  | Integer     | serNum   | Serial number.                   |
/// | 12 - 14 | LString(3)  | "CYS"    | Residue name.                    |
/// | 16      | Character   | chainID1 | Chain identifier.                |
/// | 18 - 21 | Integer     | seqNum1  | Residue sequence number.         |
/// | 22      | AChar       | icode1   | Insertion code.                  |
/// | 26 - 28 | LString(3)  | "CYS"    | Residue name.                    |
/// | 30      | Character   | chainID2 | Chain identifier.                |
/// | 32 - 35 | Integer     | seqNum2  | Residue sequence number.         |
/// | 36      | AChar       | icode2   | Insertion code.                  |
/// | 60 - 65 | SymOP       | sym1     | Symmetry operator for residue 1. |
/// | 67 - 72 | SymOP       | sym2     | Symmetry operator for residue 2. |
/// | 74 - 78 | Real(5.2)   | Length   | Disulfide bond distance          |
///
/// ## Details
///
/// - Bond distances between the sulfur atoms must be close to expected value.
/// - sym1 and sym2 are right justified and are always given even when identity operator (no cell translation) is to be applied to the residue.
///
/// Verification/Validation/Value Authority Control
///
/// wwPDB processing programs generate these records automatically.
///
/// Relationships to Other Record Types
///
/// CONECT records are generated for the disulfide bonds when SG atoms of both cysteines are present in the coordinate records.
///
/// Example
///
/// ```ignore
///          1         2          3        4         5         6         7         8
/// 12345678901234567890123456789012345678901234567890123456789012345678901234567890
/// SSBOND   1 CYS A    6    CYS A  127                          1555   1555  2.03
/// SSBOND   2 CYS A   30    CYS A  115                          1555   1555  2.07
/// SSBOND   3 CYS A   64    CYS A   80                          1555   1555  2.06
/// SSBOND   4 CYS A   76    CYS A   94                          1555   1555  2.04
/// ```
///
/// ## Known Problems
///
/// If SG of cysteine is disordered then there are possible alternate linkages. wwPDB practice is to put together all possible SSBOND records. This is problematic because the alternate location identifier is not specified in the SSBOND record.
///
/// ## Parsing notes
///
/// The `FieldParser` implementation expects its input to begin at column 7,
/// i.e. directly after the record name. For each of the two residues it keeps
/// only the chain identifier and the residue sequence number; the serial
/// number, residue names, insertion codes, symmetry operators and bond
/// length are not stored.
pub struct SsbondParser;

impl FieldParser for SsbondParser {
    type Output = Ssbond;
    fn parse(inp: &[u8]) -> IResult<&[u8], Ssbond> {
        let inp = &inp[9..]; // 7 - 15
        let (inp, chain_a) = anychar(inp)?; // 16
        let inp = &inp[1..]; // 17
        let (inp, serial_a) = parse_right::<ResidueSerial>(inp, 4usize)?;
        let (inp, _insertion_code) = anychar(inp)?;
        let inp = &inp[7..]; // 23 - 29;
        let (inp, chain_b) = anychar(inp)?;
        let inp = &inp[1..];
        let (inp, serial_b) = parse_right::<ResidueSerial>(inp, 4usize)?;
        let (inp, _) = jump_newline(inp)?;
        Ok((
            inp,
            Ssbond {
                a: (chain_a, serial_a),
                b: (chain_b, serial_b),
            },
        ))
    }
}