use crate::constants::{AMINOACIDS, DNA};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor};
use std::collections::{HashMap, HashSet};
mod constants;
/// Coarse classification of a residue, as assigned by `identify_molecular_types`.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so values can be stored in
/// `HashSet`s or used as `HashMap` keys.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum MolecularType {
    /// Residue whose upper-cased name is listed in `AMINOACIDS`.
    Protein,
    /// Residue whose upper-cased name is listed in `DNA`.
    Dna,
    /// Residue whose name is in neither list.
    Other,
}
impl From<MolecularType> for String {
    /// Converts a molecular type into its lowercase label:
    /// `"protein"`, `"dna"` or `"other"`.
    fn from(val: MolecularType) -> Self {
        // Pick the static label first, then allocate exactly once.
        let label: &str = match val {
            MolecularType::Protein => "protein",
            MolecularType::Dna => "dna",
            MolecularType::Other => "other",
        };
        label.to_string()
    }
}
/// Determines which molecular types occur in each chain of `structure`.
///
/// Each residue is classified by its upper-cased name: `Protein` if the name is in
/// `AMINOACIDS`, `Dna` if it is in `DNA`, and `Other` otherwise.
///
/// # Returns
///
/// A map from chain id to the distinct molecular types found in that chain, in order
/// of first appearance. Each type is listed at most once per chain, however many
/// residues of that type the chain contains.
///
/// # Panics
///
/// Panics if `name()` returns `None` for any residue.
pub fn identify_molecular_types(structure: &pdbtbx::PDB) -> HashMap<String, Vec<MolecularType>> {
    let mut mol_types = HashMap::new();
    for chain in structure.chains() {
        let mut unique_mol_types: Vec<MolecularType> = Vec::new();
        for res in chain.residues() {
            let res_name = res.name().unwrap().to_uppercase();
            let mol_type = if AMINOACIDS.contains(&res_name.as_str()) {
                MolecularType::Protein
            } else if DNA.contains(&res_name.as_str()) {
                MolecularType::Dna
            } else {
                MolecularType::Other
            };
            // There are only three variants, so a linear scan is cheap; it also only
            // needs `PartialEq` and keeps first-seen order.
            if !unique_mol_types.contains(&mol_type) {
                unique_mol_types.push(mol_type);
            }
        }
        mol_types.insert(chain.id().to_string(), unique_mol_types);
    }
    mol_types
}
/// Lists the id of every chain in `structure`, in the order `structure.chains()`
/// yields them.
pub fn identify_chains(structure: &pdbtbx::PDB) -> Vec<String> {
    let mut chain_ids = Vec::new();
    for chain in structure.chains() {
        chain_ids.push(chain.id().to_string());
    }
    chain_ids
}
/// Collects the residue serial numbers present in each chain of `structure`.
///
/// # Returns
///
/// A map from chain id to that chain's distinct residue serial numbers, rendered as
/// strings and sorted in ascending *numeric* order (so `"2"` comes before `"10"`).
pub fn identify_residue_numbers(structure: &pdbtbx::PDB) -> HashMap<String, Vec<String>> {
    structure
        .chains()
        .map(|chain| {
            // Sort and de-duplicate while the values are still numbers; sorting the
            // rendered strings would order them lexicographically.
            let mut serials: Vec<_> = chain
                .residues()
                .map(|res| res.serial_number())
                .collect();
            serials.sort_unstable();
            serials.dedup();
            let resnumbers: Vec<String> = serials
                .into_iter()
                .map(|serial| serial.to_string())
                .collect();
            (chain.id().to_string(), resnumbers)
        })
        .collect()
}
/// Finds, per chain, the residue names that appear in neither `AMINOACIDS` nor `DNA`.
///
/// Names are compared case-insensitively (both sides are upper-cased) but reported
/// with their original spelling.
///
/// # Returns
///
/// A map from chain id to the sorted, de-duplicated list of unrecognised residue
/// names. Chains without any such residue map to an empty list.
///
/// # Panics
///
/// Panics if `name()` returns `None` for any residue.
pub fn identify_unknowns(structure: &pdbtbx::PDB) -> HashMap<String, Vec<String>> {
    // Upper-cased lookup table of every recognised residue name.
    let mut recognised: HashSet<String> = HashSet::new();
    for entry in AMINOACIDS.iter().chain(DNA.iter()) {
        recognised.insert(entry.to_uppercase());
    }

    structure
        .chains()
        .map(|chain| {
            let mut unknown_names = Vec::new();
            for residue in chain.residues() {
                let name = residue.name().unwrap();
                if !recognised.contains(&name.to_uppercase()) {
                    unknown_names.push(name.to_string());
                }
            }
            // Sorting first makes equal names adjacent so `dedup` removes them all.
            unknown_names.sort();
            unknown_names.dedup();
            (chain.id().to_string(), unknown_names)
        })
        .collect()
}
pub fn chains_in_contact(structure: &pdbtbx::PDB) -> Vec<(String, String)> {
let mut contacts: HashSet<Vec<String>> = HashSet::new();
for (chain_x, chain_y) in structure
.chains()
.flat_map(|cx| structure.chains().map(move |cy| (cx, cy)))
{
if chain_x.id() == chain_y.id() {
continue;
}
let mut in_contacts = false;
for contact in &contacts {
if contact.contains(&chain_x.id().to_string())
&& contact.contains(&chain_y.id().to_string())
{
in_contacts = true;
break;
}
}
if in_contacts {
continue;
}
for res_x in chain_x.residues() {
for res_y in chain_y.residues() {
for atom_i in res_x.atoms() {
for atom_j in res_y.atoms() {
let dist = atom_i.distance(atom_j);
if dist <= 5.0 {
contacts
.insert(vec![chain_x.id().to_string(), chain_y.id().to_string()]);
}
}
}
}
}
}
contacts
.into_iter()
.map(|pair| (pair[0].clone(), pair[1].clone()))
.collect()
}
/// Reads the file at `pdb_f` and returns an in-memory reader over its contents with
/// every line that starts with `REMARK` dropped.
///
/// Each retained line is terminated with a single `\n` in the output.
///
/// # Panics
///
/// Panics if the file cannot be opened or if reading a line fails.
pub fn remove_remark(pdb_f: &str) -> BufReader<Cursor<Vec<u8>>> {
    let source = BufReader::new(File::open(pdb_f).unwrap());
    let mut kept = String::new();
    for entry in source.lines() {
        let text = entry.unwrap();
        if text.starts_with("REMARK") {
            continue;
        }
        kept.push_str(&text);
        kept.push('\n');
    }
    BufReader::new(Cursor::new(kept.into_bytes()))
}
/// Reads the file at `pdb_f` and returns an in-memory reader in which every line
/// starting with `ATOM` is exactly 80 bytes long.
///
/// Shorter `ATOM` lines are right-padded with spaces; longer ones are truncated. If
/// byte 80 falls inside a multi-byte UTF-8 character, the line is cut at the
/// preceding character boundary and then padded, so this never panics on non-ASCII
/// input and the 80-byte width still holds. All other lines are copied unchanged.
/// Every output line is terminated with a single `\n`.
///
/// # Panics
///
/// Panics if the file cannot be opened or if reading a line fails.
pub fn pad_lines(pdb_f: &str) -> BufReader<Cursor<Vec<u8>>> {
    /// Width, in bytes, that every `ATOM` line is normalised to.
    const LINE_WIDTH: usize = 80;

    let input_file = File::open(pdb_f).unwrap();
    let reader = BufReader::new(input_file);
    let mut content: Vec<u8> = Vec::new();
    for line in reader.lines() {
        let mut line = line.unwrap();
        if line.starts_with("ATOM") {
            if line.len() > LINE_WIDTH {
                // Back off to a character boundary: `truncate` panics when asked to
                // split a multi-byte character. Index 0 is always a boundary, so the
                // loop terminates.
                let mut cut = LINE_WIDTH;
                while !line.is_char_boundary(cut) {
                    cut -= 1;
                }
                line.truncate(cut);
            }
            let missing = LINE_WIDTH - line.len();
            line.push_str(&" ".repeat(missing));
        }
        content.extend_from_slice(line.as_bytes());
        content.push(b'\n');
    }
    BufReader::new(Cursor::new(content))
}
#[cfg(test)]
mod tests {
    use super::*;
    use pdbtbx::ReadOptions;
    use std::collections::HashMap;

    /// Parses the PDB-format fixture at `path`, discarding the second element of the
    /// tuple the reader returns.
    fn load_structure(path: &str) -> pdbtbx::PDB {
        let (structure, _) = ReadOptions::default()
            .set_format(pdbtbx::Format::Pdb)
            .read(path)
            .unwrap();
        structure
    }

    /// Runs `pad_lines` on `path` and reports whether every `ATOM` line of the result
    /// is exactly 80 bytes long.
    fn atom_lines_are_80_wide(path: &str) -> bool {
        let lines: Vec<String> = pad_lines(path)
            .lines()
            .map(|line| line.unwrap())
            .collect();
        lines
            .iter()
            .filter(|line| line.starts_with("ATOM"))
            .all(|line| line.len() == 80)
    }

    #[test]
    fn test_identify_molecular_types() {
        let structure = load_structure("test_data/prot_ligand.pdb");
        let expected = HashMap::from([(
            "A".to_string(),
            vec![MolecularType::Protein, MolecularType::Other],
        )]);
        assert_eq!(identify_molecular_types(&structure), expected);
    }

    #[test]
    fn test_identify_chains() {
        let structure = load_structure("test_data/chains.pdb");
        let expected: Vec<String> = ["A", "B", "C"].iter().map(|id| id.to_string()).collect();
        assert_eq!(identify_chains(&structure), expected);
    }

    #[test]
    fn test_identify_residue_numbers() {
        let structure = load_structure("test_data/prot_ligand.pdb");
        let expected = HashMap::from([(
            "A".to_string(),
            vec!["104".to_string(), "201".to_string()],
        )]);
        assert_eq!(identify_residue_numbers(&structure), expected);
    }

    #[test]
    fn test_identify_unknowns() {
        let structure = load_structure("test_data/prot_ligand.pdb");
        let expected = HashMap::from([("A".to_string(), vec!["I09".to_string()])]);
        assert_eq!(identify_unknowns(&structure), expected);
    }

    #[test]
    fn test_chains_in_contact() {
        let structure = load_structure("test_data/chains_in_contact.pdb");
        let expected = vec![("A".to_string(), "B".to_string())];
        assert_eq!(chains_in_contact(&structure), expected);
    }

    #[test]
    fn test_remove_remarks() {
        let lines: Vec<String> = remove_remark("test_data/pdb_w_remark.pdb")
            .lines()
            .map(|line| line.unwrap())
            .collect();
        let has_remark = lines.iter().any(|line| line.starts_with("REMARK"));
        assert!(!has_remark);
    }

    #[test]
    fn test_pad_short_lines() {
        assert!(atom_lines_are_80_wide("test_data/pdb_w_short_lines.pdb"));
    }

    #[test]
    fn test_pad_long_lines() {
        assert!(atom_lines_are_80_wide("test_data/pdb_w_long_lines.pdb"));
    }
}