use crate::molecule::{
Atom, Bond, Element, HelixType, Molecule, SecondaryStructureAssignment,
};
use crate::parser::topology;
use crate::parser::ParseError;
use nalgebra::Vector3;
use std::collections::{HashMap, HashSet};
pub fn parse_pdb(content: &str) -> Result<Molecule, ParseError> {
let mut molecule = Molecule::new();
let mut in_first_model = true;
let mut seen_model = false;
let mut chain_set: HashSet<char> = HashSet::new();
let mut ter_positions: HashSet<(char, i32)> = HashSet::new();
for (line_num, line) in content.lines().enumerate() {
let record_type = if line.len() >= 6 {
&line[0..6]
} else {
line
};
match record_type.trim() {
"MODEL" => {
if seen_model {
in_first_model = false;
}
seen_model = true;
}
"ENDMDL" => {
if seen_model {
break;
}
}
"ATOM" | "HETATM" if in_first_model => {
let atom = parse_atom_record(line, line_num + 1, record_type.trim() == "HETATM")?;
chain_set.insert(atom.chain_id);
molecule.atoms.push(atom);
}
"TER" if in_first_model => {
let padded = format!("{:80}", line);
let chain_id = padded.chars().nth(21).unwrap_or(' ');
let res_seq: i32 = padded[22..26].trim().parse().unwrap_or(0);
if chain_id != ' ' || res_seq != 0 {
ter_positions.insert((chain_id, res_seq));
}
}
"HELIX" => {
if let Some(ss) = parse_helix_record(line, line_num + 1)? {
molecule.secondary_structure.push(ss);
}
}
"SHEET" => {
if let Some(ss) = parse_sheet_record(line, line_num + 1)? {
molecule.secondary_structure.push(ss);
}
}
"END" => break,
_ => {}
}
}
molecule.chains = chain_set.into_iter().collect();
molecule.chains.sort();
molecule.bonds = determine_bonds_with_ter(&molecule.atoms, &ter_positions);
Ok(molecule)
}
fn parse_atom_record(line: &str, line_num: usize, is_hetatm: bool) -> Result<Atom, ParseError> {
let line = format!("{:80}", line);
let serial = parse_int(&line[6..11], line_num, "serial number")?;
let name = line[12..16].trim().to_string();
let alt_loc = {
let c = line.chars().nth(16).unwrap_or(' ');
if c == ' ' { None } else { Some(c) }
};
let residue_name = line[17..20].trim().to_string();
let chain_id = line.chars().nth(21).unwrap_or('A');
let residue_seq = parse_int(&line[22..26], line_num, "residue sequence")?;
let ins_code = {
let c = line.chars().nth(26).unwrap_or(' ');
if c == ' ' { None } else { Some(c) }
};
let x = parse_float(&line[30..38], line_num, "X coordinate")?;
let y = parse_float(&line[38..46], line_num, "Y coordinate")?;
let z = parse_float(&line[46..54], line_num, "Z coordinate")?;
let occupancy = parse_float(&line[54..60], line_num, "occupancy").unwrap_or(1.0);
let temp_factor = parse_float(&line[60..66], line_num, "temperature factor").unwrap_or(0.0);
let element_str = line[76..78].trim();
let element = if !element_str.is_empty() {
Element::from_symbol(element_str)
} else {
infer_element_from_name(&name)
};
Ok(Atom {
serial: serial as u32,
name,
alt_loc,
residue_name,
chain_id,
residue_seq,
ins_code,
coord: Vector3::new(x, y, z),
occupancy,
temp_factor,
element,
is_hetatm,
})
}
fn parse_helix_record(
line: &str,
_line_num: usize,
) -> Result<Option<SecondaryStructureAssignment>, ParseError> {
let line = format!("{:80}", line);
let chain_id = line.chars().nth(19).unwrap_or(' ');
let start_seq: i32 = line[21..25].trim().parse().unwrap_or(0);
let end_seq: i32 = line[33..37].trim().parse().unwrap_or(0);
let helix_class: i32 = line[38..40].trim().parse().unwrap_or(1);
let helix_type = match helix_class {
1 => HelixType::Alpha,
3 => HelixType::Pi,
5 => HelixType::ThreeTen,
_ => HelixType::Alpha,
};
Ok(Some(SecondaryStructureAssignment::helix(
chain_id, start_seq, end_seq, helix_type,
)))
}
fn parse_sheet_record(
line: &str,
_line_num: usize,
) -> Result<Option<SecondaryStructureAssignment>, ParseError> {
let line = format!("{:80}", line);
let chain_id = line.chars().nth(21).unwrap_or(' ');
let start_seq: i32 = line[22..26].trim().parse().unwrap_or(0);
let end_seq: i32 = line[33..37].trim().parse().unwrap_or(0);
Ok(Some(SecondaryStructureAssignment::sheet(
chain_id, start_seq, end_seq,
)))
}
fn parse_int(s: &str, line_num: usize, field: &str) -> Result<i32, ParseError> {
s.trim()
.parse()
.map_err(|_| ParseError::ParseError {
line: line_num,
message: format!("Invalid {} value: '{}'", field, s.trim()),
})
}
fn parse_float(s: &str, line_num: usize, field: &str) -> Result<f32, ParseError> {
s.trim()
.parse()
.map_err(|_| ParseError::ParseError {
line: line_num,
message: format!("Invalid {} value: '{}'", field, s.trim()),
})
}
fn infer_element_from_name(name: &str) -> Element {
let letters: String = name.chars().filter(|c| c.is_ascii_alphabetic()).collect();
if letters.is_empty() {
return Element::Unknown;
}
if letters.len() >= 2 {
let two_letter = &letters[0..2];
let elem = Element::from_symbol(two_letter);
if elem != Element::Unknown {
return elem;
}
}
Element::from_symbol(&letters[0..1])
}
pub fn determine_bonds_shared(atoms: &[Atom]) -> Vec<Bond> {
determine_bonds_with_ter(atoms, &HashSet::new())
}
fn determine_bonds_with_ter(atoms: &[Atom], ter_positions: &HashSet<(char, i32)>) -> Vec<Bond> {
let mut bonds = Vec::new();
let mut residue_atoms: HashMap<(char, i32, Option<char>), Vec<usize>> = HashMap::new();
for (idx, atom) in atoms.iter().enumerate() {
residue_atoms
.entry((atom.chain_id, atom.residue_seq, atom.ins_code))
.or_default()
.push(idx);
}
for indices in residue_atoms.values() {
if indices.is_empty() {
continue;
}
let residue_name = &atoms[indices[0]].residue_name;
let topo_bonds = topology::get_residue_bonds(residue_name);
for (name1, name2) in topo_bonds {
let atom1_idx = indices.iter().find(|&&i| atoms[i].name == name1).copied();
let atom2_idx = indices.iter().find(|&&i| atoms[i].name == name2).copied();
if let (Some(a1), Some(a2)) = (atom1_idx, atom2_idx) {
bonds.push(Bond::single(a1, a2));
}
}
}
let mut prev_c: Option<usize> = None;
let mut prev_chain: Option<char> = None;
let mut prev_seq: Option<i32> = None;
let mut prev_ins_code: Option<char> = None;
for (idx, atom) in atoms.iter().enumerate() {
if atom.name == "N" && !atom.is_hetatm {
if let (Some(c_idx), Some(pc), Some(ps)) = (prev_c, prev_chain, prev_seq) {
if atom.chain_id == pc {
let has_ter = ter_positions.contains(&(pc, ps));
if !has_ter {
let is_consecutive = if atom.residue_seq == ps + 1 && atom.ins_code.is_none() && prev_ins_code.is_none() {
true
} else if atom.residue_seq == ps {
match (prev_ins_code, atom.ins_code) {
(None, Some(_)) => true, (Some(p), Some(c)) => (c as u8) == (p as u8) + 1, _ => false,
}
} else if atom.residue_seq == ps + 1 && prev_ins_code.is_some() && atom.ins_code.is_none() {
true
} else {
false
};
if is_consecutive {
bonds.push(Bond::single(c_idx, idx));
}
}
}
}
}
if atom.name == "C" && !atom.is_hetatm {
prev_c = Some(idx);
prev_chain = Some(atom.chain_id);
prev_seq = Some(atom.residue_seq);
prev_ins_code = atom.ins_code;
}
}
add_distance_bonds(atoms, &mut bonds);
bonds
}
fn add_distance_bonds(atoms: &[Atom], bonds: &mut Vec<Bond>) {
let hetatm_indices: Vec<usize> = atoms
.iter()
.enumerate()
.filter(|(_, a)| a.is_hetatm && !a.is_water())
.map(|(i, _)| i)
.collect();
if hetatm_indices.is_empty() {
return;
}
const CELL_SIZE: f32 = 5.5;
let mut grid: HashMap<(i32, i32, i32), Vec<usize>> = HashMap::new();
for &idx in &hetatm_indices {
let coord = &atoms[idx].coord;
let cell = (
(coord.x / CELL_SIZE).floor() as i32,
(coord.y / CELL_SIZE).floor() as i32,
(coord.z / CELL_SIZE).floor() as i32,
);
grid.entry(cell).or_default().push(idx);
}
let mut checked: HashSet<(usize, usize)> = HashSet::new();
for &idx1 in &hetatm_indices {
let coord1 = &atoms[idx1].coord;
let cell = (
(coord1.x / CELL_SIZE).floor() as i32,
(coord1.y / CELL_SIZE).floor() as i32,
(coord1.z / CELL_SIZE).floor() as i32,
);
for dx in -1..=1 {
for dy in -1..=1 {
for dz in -1..=1 {
let neighbor_cell = (cell.0 + dx, cell.1 + dy, cell.2 + dz);
if let Some(neighbors) = grid.get(&neighbor_cell) {
for &idx2 in neighbors {
if idx1 >= idx2 {
continue; }
let pair = (idx1.min(idx2), idx1.max(idx2));
if checked.contains(&pair) {
continue;
}
checked.insert(pair);
let dist = (atoms[idx1].coord - atoms[idx2].coord).magnitude();
let max_bond_dist =
atoms[idx1].vdw_radius() + atoms[idx2].vdw_radius() - 0.4;
if dist < max_bond_dist && dist > 0.4 {
bonds.push(Bond::single(idx1, idx2));
}
}
}
}
}
}
}
}