nom_pdb/
secondary_structure.rs

1// Copyright (c) 2020 Tianyi Shi
2//
3// This software is released under the MIT License.
4// https://opensource.org/licenses/MIT
5
6use crate::common::parser::FieldParser;
7use crate::common::parser::{jump_newline, parse_right, take_trim_start_own};
8use crate::types::{
9    AtomName, Helix, HelixClass, ParseFw4, Registration, ResidueSerial, SecondaryStructureSerial,
10    Sense, Sheet, Ssbond, Strand,
11};
12use nom::{bytes::complete::take, character::complete::anychar, combinator::map, IResult};
13
14/// # Overview
15///
16/// HELIX records are used to identify the position of helices in the molecule. Helices are named, numbered, and classified by type. The residues where the helix begins and ends are noted, as well as the total length.
17///
18/// # Record Format
19///
20/// | COLUMNS | DATA  TYPE    | FIELD       | DEFINITION                                 |
21/// | ------- | ------------- | ----------- | ------------------------------------------ |
22/// | 1 -  6  | Record name   | "HELIX "    |                                            |
23/// | 8 - 10  | Integer       | serNum      | Serial number of the helix. This starts    |
24/// |         |               |             | at 1  and increases incrementally.         |
25/// | 12 - 14 | LString(3)    | helixID     | Helix  identifier. In addition to a serial |
26/// |         |               |             | number, each helix is given an             |
27/// |         |               |             | alphanumeric character helix identifier.   |
28/// | 16 - 18 | Residue name  | initResName | Name of the initial residue.               |
29/// | 20      | Character     | initChainID | Chain identifier for the chain containing  |
30/// |         |               |             | this  helix.                               |
31/// | 22 - 25 | Integer       | initSeqNum  | Sequence number of the initial residue.    |
32/// | 26      | AChar         | initICode   | Insertion code of the initial residue.     |
33/// | 28 - 30 | Residue  name | endResName  | Name of the terminal residue of the helix. |
34/// | 32      | Character     | endChainID  | Chain identifier for the chain containing  |
35/// |         |               |             | this  helix.                               |
36/// | 34 - 37 | Integer       | endSeqNum   | Sequence number of the terminal residue.   |
37/// | 38      | AChar         | endICode    | Insertion code of the terminal residue.    |
38/// | 39 - 40 | Integer       | helixClass  | Helix class (see below).                   |
39/// | 41 - 70 | String        | comment     | Comment about this helix.                  |
40/// | 72 - 76 | Integer       | length      | Length of this helix.                      |
41///
42/// # Details
43///
44/// Additional HELIX records with different serial numbers and identifiers occur if more than one helix is present.
45/// The initial residue of the helix is the N-terminal residue.
46/// Helices are classified as follows:
47///
48/// |                                |     CLASS NUMBER             |
49/// |TYPE OF  HELIX                  |   (COLUMNS 39 - 40)          |
50/// |--------------------------------|------------------------------|
51/// |Right-handed alpha (default)    |            1                 |
52/// |Right-handed omega              |            2                 |
53/// |Right-handed pi                 |            3                 |
54/// |Right-handed gamma              |            4                 |
55/// |Right-handed 3 - 10             |            5                 |
56/// |Left-handed alpha               |            6                 |
57/// |Left-handed omega               |            7                 |
58/// |Left-handed gamma               |            8                 |
59/// |2 - 7 ribbon/helix              |            9                 |
60/// |Polyproline                     |           10                 |
// Zero-sized marker type: the derives cost nothing and let callers print,
// copy and default-construct the parser handle.
#[derive(Debug, Clone, Copy, Default)]
pub struct HelixParser;
62
63impl FieldParser for HelixParser {
64    type Output = Helix;
65    fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output> {
66        let inp = &inp[5..]; // 7; 8 - 10; 11
67        let (inp, id) = take(3usize)(inp)?; // 12 - 14
68        let inp = &inp[5..]; // 15; 16 - 18; 19
69        let (inp, start_chain) = anychar(inp)?; // 20
70        let inp = &inp[1..]; // 21
71        let (inp, start_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 22 - 25
72        let (inp, _start_icode) = anychar(inp)?; // 26
73        let inp = &inp[5..]; // 27; 28 - 30; 31
74        let (inp, end_chain) = anychar(inp)?; // 32
75        let inp = &inp[1..]; // 33
76        let (inp, end_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 34 - 37
77        let (inp, _end_icode) = anychar(inp)?; // 38
78        let (inp, class) = Self::parse_helix_class(inp)?; // 39 - 40
79        let (inp, comment) = take(30usize)(inp)?; // 41 - 70
80        let (inp, _) = jump_newline(inp)?;
81        let helix = Helix {
82            id: unsafe { std::str::from_utf8_unchecked(id).trim().to_owned() },
83            class,
84            start: (start_chain, start_serial),
85            end: (end_chain, end_serial),
86            comment: unsafe { std::str::from_utf8_unchecked(comment).trim().to_owned() },
87        };
88        Ok((inp, helix))
89    }
90}
91
92impl HelixParser {
93    pub fn parse_helix_class(inp: &[u8]) -> IResult<&[u8], HelixClass> {
94        use HelixClass::*;
95        let (inp, code) = parse_right::<usize>(inp, 2)?;
96
97        let class = if code < 11 {
98            [
99                RightHandedAlpha,
100                RightHandedOmega,
101                RightHandedPi,
102                RightHandedGamma,
103                RightHanded310,
104                LeftHandedAlpha,
105                LeftHandedOmega,
106                LeftHandedGamma,
107                TwoSevenRibbonHelix,
108                Polyproline,
109            ][code]
110        } else {
111            Unknown
112        };
113        Ok((inp, class))
114    }
115}
116
117/// # Overview
118///
119/// SHEET records are used to identify the position of sheets in the molecule. Sheets are both named and numbered. The residues where the sheet begins and ends are noted.
120///
121/// # Record Format
122///
123/// | COLUMNS | DATA  TYPE   | FIELD       | DEFINITION                                        |
124/// | ------- | ------------ | ----------- | ------------------------------------------------- |
125/// | 1 -  6  | Record name  | "SHEET "    |                                                   |
126/// | 8 - 10  | Integer      | strand      | Strand  number which starts at 1 for each         |
127/// |         |              |             | strand within a sheet and increases by one.       |
128/// | 12 - 14 | LString(3)   | sheetID     | Sheet  identifier.                                |
129/// | 15 - 16 | Integer      | numStrands  | Number  of strands in sheet.                      |
130/// | 18 - 20 | Residue name | initResName | Residue  name of initial residue.                 |
131/// | 22      | Character    | initChainID | Chain identifier of initial residue               |
132/// |         |              |             | in strand.                                        |
133/// | 23 - 26 | Integer      | initSeqNum  | Sequence number of initial residue                |
134/// |         |              |             | in strand.                                        |
135/// | 27      | AChar        | initICode   | Insertion code of initial residue                 |
136/// |         |              |             | in  strand.                                       |
137/// | 29 - 31 | Residue name | endResName  | Residue name of terminal residue.                 |
138/// | 33      | Character    | endChainID  | Chain identifier of terminal residue.             |
139/// | 34 - 37 | Integer      | endSeqNum   | Sequence number of terminal residue.              |
140/// | 38      | AChar        | endICode    | Insertion code of terminal residue.               |
141/// | 39 - 40 | Integer      | sense       | Sense of strand with respect to previous          |
142/// |         |              |             | strand in the sheet. 0 if first strand,           |
143/// |         |              |             | 1 if  parallel,and -1 if anti-parallel.           |
144/// | 42 - 45 | Atom         | curAtom     | Registration.  Atom name in current strand.       |
145/// | 46 - 48 | Residue name | curResName  | Registration.  Residue name in current strand     |
146/// | 50      | Character    | curChainId  | Registration. Chain identifier in                 |
147/// |         |              |             | current strand.                                   |
148/// | 51 - 54 | Integer      | curResSeq   | Registration.  Residue sequence number            |
149/// |         |              |             | in current strand.                                |
150/// | 55      | AChar        | curICode    | Registration. Insertion code in                   |
151/// |         |              |             | current strand.                                   |
152/// | 57 - 60 | Atom         | prevAtom    | Registration.  Atom name in previous strand.      |
153/// | 61 - 63 | Residue name | prevResName | Registration.  Residue name in                    |
154/// |         |              |             | previous strand.                                  |
155/// | 65      | Character    | prevChainId | Registration.  Chain identifier in                |
156/// |         |              |             | previous  strand.                                 |
157/// | 66 - 69 | Integer      | prevResSeq  | Registration. Residue sequence number             |
158/// |         |              |             | in previous strand.                               |
159/// | 70      | AChar        | prevICode   | Registration.  Insertion code in previous strand. |
160///
161/// # Details
162///
163/// - The initial residue for a strand is its N-terminus. Strand registration information is provided in columns 39 - 70. Strands are listed starting with one edge of the sheet and continuing to the spatially adjacent strand.
164/// - The sense in columns 39 - 40 indicates whether strand n is parallel (sense = 1) or anti-parallel (sense = -1) to strand n-1. Sense is equal to zero (0) for the first strand of a sheet.
165/// - The registration (columns 42 - 70) of strand n to strand n-1 may be specified by one hydrogen bond between each such pair of strands. This is done by providing the hydrogen bonding between the current and previous strands. No register information should be provided for the first strand.
/// - Split strands, or strands with two or more runs of residues from discontinuous parts of the amino acid sequence, are explicitly listed. A detailed description can be included in REMARK 700.
// Zero-sized marker type: the derives cost nothing and let callers print,
// copy and default-construct the parser handle.
#[derive(Debug, Clone, Copy, Default)]
pub struct SheetParser;
168
169impl FieldParser for SheetParser {
170    type Output = Sheet;
171    fn parse(inp: &[u8]) -> IResult<&[u8], Self::Output> {
172        Self::parse_sheet(inp)
173    }
174}
175
176impl SheetParser {
177    fn parse_sheet(inp: &[u8]) -> IResult<&[u8], Sheet> {
178        let mut sheet = Sheet::default();
179        // first line
180        let inp = &inp[5..]; // 7 - 11
181        let (inp, id) = unsafe { take_trim_start_own(inp, 3usize)? }; // 12 - 14
182        sheet.id = id;
183        let (inp, num_strands) = parse_right::<SecondaryStructureSerial>(inp, 2)?; // 15 - 16
184        let inp = &inp[1..]; // 17
185        let (inp, first_strand) = Self::parse_first_line(inp)?;
186        sheet.strands.push(first_strand);
187        let mut i = 1 as SecondaryStructureSerial;
188        let mut last_inp = inp;
189        while i < num_strands {
190            let (inp, _) = take(7usize)(last_inp)?; // 1 - 7
191            let (inp, idx) = parse_right::<SecondaryStructureSerial>(inp, 3)?; // 8 - 10
192            i = idx;
193            let inp = &inp[7..]; // 11 - 17
194            let (inp, (strand, registration)) = Self::parse_line(inp)?;
195            sheet.strands.push(strand);
196            sheet.registration.push(registration);
197            last_inp = inp;
198        }
199        Ok((last_inp, sheet))
200    }
201
202    fn parse_first_line(inp: &[u8]) -> IResult<&[u8], Strand> {
203        let (inp, res) = Self::parse_strand(inp)?;
204        let (inp, _) = jump_newline(inp)?;
205        Ok((inp, res))
206    }
207
208    fn parse_line(inp: &[u8]) -> IResult<&[u8], (Strand, Registration)> {
209        let (inp, strand) = Self::parse_strand(inp)?;
210        let inp = &inp[1..];
211        let (inp, registration) = Self::parse_registration(inp)?;
212        Ok((inp, (strand, registration)))
213    }
214
215    fn parse_strand(inp: &[u8]) -> IResult<&[u8], Strand> {
216        // let (inp, _start_res) = map(take(3usize), parse_amino_acid)(inp)?;
217        let inp = &inp[3..]; // 18 - 20
218        let inp = &inp[1..]; //           21
219        let (inp, start_chain) = anychar(inp)?; // 22
220        let (inp, start_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 23 - 26
221        let (inp, _start_icode) = anychar(inp)?; // 27
222        let inp = &inp[1..]; // 28
223                             // let (inp, _end_res) = map(take(3usize), parse_amino_acid)(inp)?;
224        let inp = &inp[3..]; // 29 - 31
225        let inp = &inp[1..]; //      32
226        let (inp, end_chain) = anychar(inp)?; // 33
227        let (inp, end_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 34 - 37
228        let (inp, _end_icode) = anychar(inp)?; // 38
229        let (inp, sense) = Self::parse_sense(inp)?;
230        let strand = Strand {
231            start: (start_chain, start_serial),
232            end: (end_chain, end_serial),
233            sense,
234        };
235        Ok((inp, strand))
236    }
237    fn parse_registration(inp: &[u8]) -> IResult<&[u8], Registration> {
238        // | 42 - 45 | Atom         | curAtom     | Registration.  Atom name in current strand.       |
239        // | 46 - 48 | Residue name | curResName  | Registration.  Residue name in current strand     |
240        // | 50      | Character    | curChainId  | Registration. Chain identifier in                 |
241        // |         |              |             | current strand.                                   |
242        // | 51 - 54 | Integer      | curResSeq   | Registration.  Residue sequence number            |
243        // |         |              |             | in current strand.                                |
244        // | 55      | AChar        | curICode    | Registration. Insertion code in                   |
245        // |         |              |             | current strand.                                   |
246        // | 57 - 60 | Atom         | prevAtom    | Registration.  Atom name in previous strand.      |
247        // | 61 - 63 | Residue name | prevResName | Registration.  Residue name in                    |
248        // |         |              |             | previous strand.                                  |
249        // | 65      | Character    | prevChainId | Registration.  Chain identifier in                |
250        // |         |              |             | previous  strand.                                 |
251        // | 66 - 69 | Integer      | prevResSeq  | Registration. Residue sequence number             |
252        // |         |              |             | in previous strand.                               |
253        // | 70      | AChar        | prevICode   | Registration.  Insertion code in previous strand. |
254        let (inp, cur_atom) = map(take(4usize), AtomName::parse_fw4)(inp)?; // 42 - 45
255        let inp = &inp[4..]; // 46 - 48; 49
256        let (inp, cur_chain) = anychar(inp)?; // 50
257        let (inp, cur_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 51 - 54
258        let inp = &inp[2..]; // 55; 56
259        let (inp, prev_atom) = map(take(4usize), AtomName::parse_fw4)(inp)?; // 57 - 60
260        let inp = &inp[4..]; // 61 - 63; 64
261        let (inp, prev_chain) = anychar(inp)?; // 65
262        let (inp, prev_serial) = parse_right::<ResidueSerial>(inp, 4)?; // 66 - 69
263        let (inp, _) = jump_newline(inp)?;
264        let registration = Registration {
265            curr: (cur_atom, cur_chain, cur_serial),
266            prev: (prev_atom, prev_chain, prev_serial),
267        };
268        Ok((inp, registration))
269    }
270
271    fn parse_sense(inp: &[u8]) -> IResult<&[u8], Sense> {
272        let (inp, sense) = take(2usize)(inp)?;
273        let sense = match sense {
274            b" 1" => Sense::Parallel,
275            b" 0" => Sense::Unknown,
276            b"-1" => Sense::Antiparallel,
277            _ => panic!("Error when parsing beta-strand sense!"),
278        };
279        Ok((inp, sense))
280    }
281}
282
283/// # SSBOND
284///
285/// The SSBOND record identifies each disulfide bond in protein and polypeptide structures by identifying the two residues involved in the bond.
286///
287/// The disulfide bond distance is included after the symmetry operations at the end of the SSBOND record.
288///
289/// ## Record Format
290///
291/// | COLUMNS | DATA  TYPE  | FIELD    | DEFINITION                       |
292/// | ------- | ----------- | -------- | -------------------------------- |
293/// | 1 -  6  | Record name | "SSBOND" |                                  |
294/// | 8 - 10  | Integer     | serNum   | Serial number.                   |
295/// | 12 - 14 | LString(3)  | "CYS"    | Residue name.                    |
296/// | 16      | Character   | chainID1 | Chain identifier.                |
297/// | 18 - 21 | Integer     | seqNum1  | Residue sequence number.         |
298/// | 22      | AChar       | icode1   | Insertion code.                  |
299/// | 26 - 28 | LString(3)  | "CYS"    | Residue name.                    |
300/// | 30      | Character   | chainID2 | Chain identifier.                |
301/// | 32 - 35 | Integer     | seqNum2  | Residue sequence number.         |
302/// | 36      | AChar       | icode2   | Insertion code.                  |
303/// | 60 - 65 | SymOP       | sym1     | Symmetry operator for residue 1. |
304/// | 67 - 72 | SymOP       | sym2     | Symmetry operator for residue 2. |
/// | 74 - 78 | Real(5.2)   | Length   | Disulfide bond distance          |
306///
307/// ## Details
308///
309/// - Bond distances between the sulfur atoms must be close to expected value.
310/// - sym1 and sym2 are right justified and are always given even when identity operator (no cell translation) is to be applied to the residue.
311///
312/// Verification/Validation/Value Authority Control
313///
314/// wwPDB processing programs generate these records automatically.
315///
316/// Relationships to Other Record Types
317///
318/// CONECT records are generated for the disulfide bonds when SG atoms of both cysteines are present in the coordinate records.
319///
320/// Example
321///
322/// ```ignore
323///          1         2          3        4         5         6         7         8
324/// 12345678901234567890123456789012345678901234567890123456789012345678901234567890
325/// SSBOND   1 CYS A    6    CYS A  127                          1555   1555  2.03
326/// SSBOND   2 CYS A   30    CYS A  115                          1555   1555  2.07
327/// SSBOND   3 CYS A   64    CYS A   80                          1555   1555  2.06
328/// SSBOND   4 CYS A   76    CYS A   94                          1555   1555  2.04
/// ```
330///
331/// ## Known Problems
332///
333/// If SG of cysteine is disordered then there are possible alternate linkages. wwPDB practice is to put together all possible SSBOND records. This is problematic because the alternate location identifier is not specified in the SSBOND record.
// Zero-sized marker type: the derives cost nothing and let callers print,
// copy and default-construct the parser handle.
#[derive(Debug, Clone, Copy, Default)]
pub struct SsbondParser;
335
336impl FieldParser for SsbondParser {
337    type Output = Ssbond;
338    fn parse(inp: &[u8]) -> IResult<&[u8], Ssbond> {
339        let inp = &inp[9..]; // 7 - 15
340        let (inp, chain_a) = anychar(inp)?; // 16
341        let inp = &inp[1..]; // 17
342        let (inp, serial_a) = parse_right::<ResidueSerial>(inp, 4usize)?;
343        let (inp, _insertion_code) = anychar(inp)?;
344        let inp = &inp[7..]; // 23 - 29;
345        let (inp, chain_b) = anychar(inp)?;
346        let inp = &inp[1..];
347        let (inp, serial_b) = parse_right::<ResidueSerial>(inp, 4usize)?;
348        let (inp, _) = jump_newline(inp)?;
349        Ok((
350            inp,
351            Ssbond {
352                a: (chain_a, serial_a),
353                b: (chain_b, serial_b),
354            },
355        ))
356    }
357}