Skip to main content

cyanea_struct/
pdb.rs

1//! PDB format parser.
2//!
3//! Parses ATOM, HETATM, TER, HEADER, and MODEL/ENDMDL records from PDB-format
4//! text. Only the first MODEL is returned for multi-model (NMR) files.
5
6use cyanea_core::{CyaneaError, Result};
7
8use crate::types::{Atom, Chain, Point3D, Residue, Structure};
9
10use alloc::string::{String, ToString};
11use alloc::vec::Vec;
12
13/// Parse a PDB-format string into a [`Structure`].
14///
15/// # Errors
16///
17/// Returns an error if no ATOM records are found or if an ATOM record is
18/// malformed (wrong column widths, unparseable coordinates).
19pub fn parse_pdb(input: &str) -> Result<Structure> {
20    let mut id = String::from("UNKN");
21    let mut current_chain_id: Option<char> = None;
22    let mut current_residue_key: Option<(i32, Option<char>, String)> = None;
23    let mut chains: Vec<Chain> = Vec::new();
24    let mut residues: Vec<Residue> = Vec::new();
25    let mut atoms: Vec<Atom> = Vec::new();
26    let mut atom_count = 0u32;
27    let mut in_first_model = true;
28
29    for line in input.lines() {
30        if line.starts_with("ENDMDL") {
31            break; // only first model
32        }
33        if line.starts_with("MODEL") {
34            if !in_first_model {
35                break;
36            }
37            in_first_model = false;
38            continue;
39        }
40
41        if line.starts_with("HEADER") && line.len() >= 66 {
42            let pdb_id = line[62..66].trim();
43            if !pdb_id.is_empty() {
44                id = pdb_id.into();
45            }
46        }
47
48        let is_atom = line.starts_with("ATOM  ");
49        let is_hetatm = line.starts_with("HETATM");
50
51        if is_atom || is_hetatm {
52            let atom = parse_atom_record(line, is_hetatm)?;
53            atom_count += 1;
54
55            let chain_id = parse_chain_id(line);
56            let seq_num = parse_residue_seq(line)?;
57            let i_code = parse_insertion_code(line);
58            let res_name = parse_residue_name(line);
59
60            let res_key = (seq_num, i_code, res_name.clone());
61
62            // Chain changed?
63            if current_chain_id != Some(chain_id) {
64                // Flush current residue
65                if !atoms.is_empty() {
66                    if let Some((sn, ic, rn)) = current_residue_key.take() {
67                        residues.push(Residue {
68                            name: rn,
69                            seq_num: sn,
70                            i_code: ic,
71                            atoms: core::mem::take(&mut atoms),
72                        });
73                    }
74                }
75                // Flush current chain
76                if let Some(cid) = current_chain_id {
77                    if !residues.is_empty() {
78                        chains.push(Chain::new(cid, core::mem::take(&mut residues)));
79                    }
80                }
81                current_chain_id = Some(chain_id);
82                current_residue_key = Some(res_key);
83            } else if current_residue_key.as_ref() != Some(&res_key) {
84                // Residue changed within same chain
85                if !atoms.is_empty() {
86                    if let Some((sn, ic, rn)) = current_residue_key.take() {
87                        residues.push(Residue {
88                            name: rn,
89                            seq_num: sn,
90                            i_code: ic,
91                            atoms: core::mem::take(&mut atoms),
92                        });
93                    }
94                }
95                current_residue_key = Some(res_key);
96            }
97
98            atoms.push(atom);
99        }
100
101        if line.starts_with("TER") {
102            // Flush residue + chain
103            if !atoms.is_empty() {
104                if let Some((sn, ic, rn)) = current_residue_key.take() {
105                    residues.push(Residue {
106                        name: rn,
107                        seq_num: sn,
108                        i_code: ic,
109                        atoms: core::mem::take(&mut atoms),
110                    });
111                }
112            }
113            if let Some(cid) = current_chain_id.take() {
114                if !residues.is_empty() {
115                    chains.push(Chain::new(cid, core::mem::take(&mut residues)));
116                }
117            }
118        }
119    }
120
121    // Flush remaining
122    if !atoms.is_empty() {
123        if let Some((sn, ic, rn)) = current_residue_key.take() {
124            residues.push(Residue {
125                name: rn,
126                seq_num: sn,
127                i_code: ic,
128                atoms,
129            });
130        }
131    }
132    if let Some(cid) = current_chain_id {
133        if !residues.is_empty() {
134            chains.push(Chain::new(cid, residues));
135        }
136    }
137
138    if atom_count == 0 {
139        return Err(CyaneaError::Parse("no ATOM records found".into()));
140    }
141
142    Ok(Structure { id, chains })
143}
144
145/// Parse a PDB file from disk.
146#[cfg(feature = "std")]
147pub fn parse_pdb_file(path: impl AsRef<::std::path::Path>) -> Result<Structure> {
148    let contents = ::std::fs::read_to_string(path)?;
149    parse_pdb(&contents)
150}
151
152fn parse_atom_record(line: &str, is_hetatm: bool) -> Result<Atom> {
153    // PDB format is fixed-width columns. We need at least 54 chars for coords.
154    if line.len() < 54 {
155        return Err(CyaneaError::Parse(alloc::format!(
156            "ATOM record too short ({} chars): {}",
157            line.len(),
158            line
159        )));
160    }
161
162    let serial = safe_slice(line, 6, 11)
163        .trim()
164        .parse::<u32>()
165        .map_err(|e| CyaneaError::Parse(alloc::format!("bad atom serial: {}", e)))?;
166
167    let name = safe_slice(line, 12, 16).to_string();
168
169    let alt_loc = {
170        let c = safe_slice(line, 16, 17).chars().next().unwrap_or(' ');
171        if c == ' ' {
172            None
173        } else {
174            Some(c)
175        }
176    };
177
178    let x = safe_slice(line, 30, 38)
179        .trim()
180        .parse::<f64>()
181        .map_err(|e| CyaneaError::Parse(alloc::format!("bad x coordinate: {}", e)))?;
182    let y = safe_slice(line, 38, 46)
183        .trim()
184        .parse::<f64>()
185        .map_err(|e| CyaneaError::Parse(alloc::format!("bad y coordinate: {}", e)))?;
186    let z = safe_slice(line, 46, 54)
187        .trim()
188        .parse::<f64>()
189        .map_err(|e| CyaneaError::Parse(alloc::format!("bad z coordinate: {}", e)))?;
190
191    let occupancy = if line.len() >= 60 {
192        safe_slice(line, 54, 60).trim().parse::<f64>().unwrap_or(1.0)
193    } else {
194        1.0
195    };
196
197    let temp_factor = if line.len() >= 66 {
198        safe_slice(line, 60, 66).trim().parse::<f64>().unwrap_or(0.0)
199    } else {
200        0.0
201    };
202
203    let element = if line.len() >= 78 {
204        let e = safe_slice(line, 76, 78).trim();
205        if e.is_empty() {
206            None
207        } else {
208            Some(e.to_string())
209        }
210    } else {
211        None
212    };
213
214    let charge = if line.len() >= 80 {
215        let ch = safe_slice(line, 78, 80).trim();
216        parse_pdb_charge(ch)
217    } else {
218        None
219    };
220
221    Ok(Atom {
222        serial,
223        name,
224        alt_loc,
225        coords: Point3D::new(x, y, z),
226        occupancy,
227        temp_factor,
228        element,
229        charge,
230        is_hetatm,
231    })
232}
233
234fn parse_chain_id(line: &str) -> char {
235    safe_slice(line, 21, 22).chars().next().unwrap_or(' ')
236}
237
238fn parse_residue_seq(line: &str) -> Result<i32> {
239    safe_slice(line, 22, 26)
240        .trim()
241        .parse::<i32>()
242        .map_err(|e| CyaneaError::Parse(alloc::format!("bad residue seq number: {}", e)))
243}
244
245fn parse_insertion_code(line: &str) -> Option<char> {
246    let c = safe_slice(line, 26, 27).chars().next().unwrap_or(' ');
247    if c == ' ' {
248        None
249    } else {
250        Some(c)
251    }
252}
253
254fn parse_residue_name(line: &str) -> String {
255    safe_slice(line, 17, 20).trim().to_string()
256}
257
258fn parse_pdb_charge(s: &str) -> Option<i8> {
259    if s.is_empty() {
260        return None;
261    }
262    // PDB charge format: "2+" or "1-" or "+2" etc.
263    let s = s.trim();
264    if s.is_empty() {
265        return None;
266    }
267    // Try "2+" / "2-" format
268    if s.len() == 2 {
269        let chars: Vec<char> = s.chars().collect();
270        if chars[1] == '+' || chars[1] == '-' {
271            if let Some(d) = chars[0].to_digit(10) {
272                let sign: i8 = if chars[1] == '+' { 1 } else { -1 };
273                return Some(sign * d as i8);
274            }
275        }
276        if chars[0] == '+' || chars[0] == '-' {
277            if let Some(d) = chars[1].to_digit(10) {
278                let sign: i8 = if chars[0] == '+' { 1 } else { -1 };
279                return Some(sign * d as i8);
280            }
281        }
282    }
283    None
284}
285
286/// Safe substring that handles short lines gracefully.
287fn safe_slice(s: &str, start: usize, end: usize) -> &str {
288    let bytes = s.as_bytes();
289    let len = bytes.len();
290    if start >= len {
291        return "";
292    }
293    let actual_end = end.min(len);
294    // Safety: PDB files are ASCII, so byte boundaries = char boundaries
295    &s[start..actual_end]
296}
297
298#[cfg(test)]
299mod tests {
300    use super::*;
301
302    fn minimal_pdb() -> &'static str {
303        "\
304HEADER                                                        1CRN\n\
305ATOM      1  N   THR A   1       2.464   9.901  13.546  1.00 10.00           N\n\
306ATOM      2  CA  THR A   1       2.135  10.226  12.120  1.00 10.00           C\n\
307ATOM      3  C   THR A   1       3.427  10.018  11.354  1.00 10.00           C\n\
308ATOM      4  O   THR A   1       3.426  10.335  10.184  1.00 10.00           O\n\
309ATOM      5  N   ILE A   2       4.462   9.470  11.952  1.00 10.00           N\n\
310ATOM      6  CA  ILE A   2       5.735   9.197  11.275  1.00 10.00           C\n\
311TER       7      ILE A   2\n\
312END\n"
313    }
314
315    #[test]
316    fn parse_single_chain() {
317        let s = parse_pdb(minimal_pdb()).unwrap();
318        assert_eq!(s.id, "1CRN");
319        assert_eq!(s.chain_count(), 1);
320        assert_eq!(s.get_chain('A').unwrap().residue_count(), 2);
321        assert_eq!(s.atom_count(), 6);
322    }
323
324    #[test]
325    fn parse_multi_chain() {
326        let input = "\
327ATOM      1  CA  ALA A   1       1.000   2.000   3.000  1.00  0.00           C\n\
328TER\n\
329ATOM      2  CA  ALA B   1       4.000   5.000   6.000  1.00  0.00           C\n\
330TER\n\
331END\n";
332        let s = parse_pdb(input).unwrap();
333        assert_eq!(s.chain_count(), 2);
334        assert_eq!(s.get_chain('A').unwrap().atom_count(), 1);
335        assert_eq!(s.get_chain('B').unwrap().atom_count(), 1);
336    }
337
338    #[test]
339    fn parse_hetatm() {
340        let input = "\
341ATOM      1  CA  ALA A   1       1.000   2.000   3.000  1.00  0.00           C\n\
342HETATM    2  O   HOH A   2       4.000   5.000   6.000  1.00  0.00           O\n\
343END\n";
344        let s = parse_pdb(input).unwrap();
345        let chain = s.get_chain('A').unwrap();
346        assert_eq!(chain.residue_count(), 2);
347        let water = &chain.residues[1];
348        assert_eq!(water.name, "HOH");
349        assert!(water.atoms[0].is_hetatm);
350    }
351
352    #[test]
353    fn parse_insertion_codes() {
354        // Use properly column-formatted PDB lines.
355        // Col  1- 6: record type
356        // Col  7-11: serial
357        // Col 13-16: atom name
358        // Col 18-20: res name
359        // Col 22:    chain ID
360        // Col 23-26: seq num (right-justified)
361        // Col 27:    iCode
362        // Col 31-38: x  (8.3f)
363        // Col 39-46: y  (8.3f)
364        // Col 47-54: z  (8.3f)
365        //       1         2         3         4         5         6         7
366        // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
367        let input = "\
368ATOM      1  CA  ALA A  10       1.000   2.000   3.000  1.00  0.00           C\n\
369ATOM      2  CA  ALA A  10A      4.000   5.000   6.000  1.00  0.00           C\n\
370END\n";
371        let s = parse_pdb(input).unwrap();
372        let chain = s.get_chain('A').unwrap();
373        assert_eq!(chain.residue_count(), 2);
374        assert_eq!(chain.residues[0].i_code, None);
375        assert_eq!(chain.residues[1].i_code, Some('A'));
376    }
377
378    #[test]
379    fn parse_malformed_record() {
380        let input = "ATOM   BAD\n";
381        assert!(parse_pdb(input).is_err());
382    }
383}