pdb_handler/lib.rs
1use crate::constants::{AMINOACIDS, DNA};
2use serde::{Deserialize, Serialize};
3use std::fs::File;
4use std::io::{BufRead, BufReader, Cursor};
5
6use std::collections::{HashMap, HashSet};
7
8mod constants;
9
10#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
11pub enum MolecularType {
12 Protein,
13 Dna,
14 Other,
15}
16
17impl From<MolecularType> for String {
18 fn from(val: MolecularType) -> Self {
19 match val {
20 MolecularType::Protein => "protein".to_string(),
21 MolecularType::Dna => "dna".to_string(),
22 MolecularType::Other => "other".to_string(),
23 }
24 }
25}
26
27/// Identifies molecular types in the given PDB structure.
28///
29/// This function analyzes the chains and residues in a PDB structure to categorize each residue
30/// into molecular types such as Protein, DNA, or Other. It returns a `HashMap` where the keys
31/// are chain IDs and the values are vectors of unique `MolecularType`s present in each chain.
32///
33/// # Arguments
34///
35/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
36///
37/// # Returns
38///
39/// A `HashMap<String, Vec<MolecularType>>` where each key is a chain ID and each value is a vector
40/// of unique `MolecularType`s found in that chain.
41///
42/// # Example
43///
44/// ```rust
45/// use pdbtbx::PDB;
46/// use pdb_handler::{identify_molecular_types, MolecularType};
47///
48/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
49/// let mol_types = identify_molecular_types(&pdb);
50///
51/// for (chain_id, types) in mol_types {
52/// println!("Chain {}: {:?}", chain_id, types);
53/// }
54/// ```
55///
56/// # Panics
57///
58/// This function will panic if the residue name cannot be retrieved (`res.name().unwrap()`).
59///
60pub fn identify_molecular_types(structure: &pdbtbx::PDB) -> HashMap<String, Vec<MolecularType>> {
61 let mut mol_types = HashMap::new();
62
63 for chain in structure.chains() {
64 let chain_id = chain.id().to_string();
65 let chain_mol_types = chain.residues().map(|res| {
66 let res_name = res.name().unwrap().to_uppercase();
67 if AMINOACIDS.contains(&res_name.as_str()) {
68 MolecularType::Protein
69 } else if DNA.contains(&res_name.as_str()) {
70 MolecularType::Dna
71 } else {
72 MolecularType::Other
73 }
74 });
75
76 let unique_mol_types = chain_mol_types.into_iter().collect();
77
78 mol_types.insert(chain_id, unique_mol_types);
79 }
80
81 mol_types
82}
83
84/// Identifies all chain IDs in the given PDB structure.
85///
86/// This function iterates over all chains in a PDB structure and collects their IDs into a vector of strings.
87///
88/// # Arguments
89///
90/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
91///
92/// # Returns
93///
94/// A `Vec<String>` containing the IDs of all chains present in the PDB structure.
95///
96/// # Example
97///
98/// ```rust
99/// use pdbtbx::PDB;
100/// use pdb_handler::identify_chains;
101///
102/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
103/// let chains = identify_chains(&pdb);
104///
105/// for chain_id in chains {
106/// println!("Chain ID: {}", chain_id);
107/// }
108/// ```
109pub fn identify_chains(structure: &pdbtbx::PDB) -> Vec<String> {
110 structure
111 .chains()
112 .map(|chain| chain.id().to_string())
113 .collect()
114}
115
116/// Identifies residue numbers in each chain of the given PDB structure.
117///
118/// This function iterates over all chains in a PDB structure, collects the residue numbers
119/// within each chain, and returns them in a `HashMap`. The keys in the `HashMap` are chain IDs,
120/// and the values are vectors of unique residue numbers represented as strings.
121///
122/// # Arguments
123///
124/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
125///
126/// # Returns
127///
128/// A `HashMap<String, Vec<String>>` where each key is a chain ID and each value is a vector of unique
129/// residue numbers found in that chain.
130///
131/// # Example
132///
133/// ```rust
134/// use pdbtbx::PDB;
135/// use pdb_handler::identify_residue_numbers;
136///
137/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
138/// let residue_numbers = identify_residue_numbers(&pdb);
139///
140/// for (chain_id, numbers) in residue_numbers {
141/// println!("Chain {}: {:?}", chain_id, numbers);
142/// }
143/// ```
144///
145/// # Panics
146///
147/// This function will panic if the residue serial number cannot be retrieved.
148pub fn identify_residue_numbers(structure: &pdbtbx::PDB) -> HashMap<String, Vec<String>> {
149 structure
150 .chains()
151 .map(|chain| {
152 let resnumbers: Vec<String> = chain
153 .residues()
154 .map(|res| res.serial_number().to_string())
155 .collect::<Vec<_>>()
156 .into_iter()
157 .collect::<std::collections::HashSet<_>>()
158 .into_iter()
159 .collect();
160 // Sort the residue numbers
161 let mut resnumbers = resnumbers.into_iter().collect::<Vec<_>>();
162 resnumbers.sort();
163 (chain.id().to_string(), resnumbers)
164 })
165 .collect()
166}
167
168/// Identifies unknown residues in each chain of the given PDB structure.
169///
170/// This function iterates over all chains in a PDB structure, filters out known residues (amino acids and DNA),
171/// and collects the names of unknown residues. It returns a `HashMap` where the keys are chain IDs and the
172/// values are vectors of unique unknown residue names.
173///
174/// # Arguments
175///
176/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
177///
178/// # Returns
179///
180/// A `HashMap<String, Vec<String>>` where each key is a chain ID and each value is a vector of unique
181/// unknown residue names found in that chain.
182///
183/// # Example
184///
185/// ```rust
186/// use pdbtbx::PDB;
187/// use pdb_handler::identify_unknowns;
188///
189/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
190/// let unknown_residues = identify_unknowns(&pdb);
191///
192/// for (chain_id, residues) in unknown_residues {
193/// println!("Chain {}: {:?}", chain_id, residues);
194/// }
195/// ```
196///
197/// # Panics
198///
199/// This function will panic if the residue name cannot be retrieved.
200pub fn identify_unknowns(structure: &pdbtbx::PDB) -> HashMap<String, Vec<String>> {
201 let mut res_map = HashMap::new();
202
203 let known_residues: HashSet<_> = AMINOACIDS
204 .iter()
205 .chain(DNA.iter())
206 .map(|s| s.to_uppercase())
207 .collect();
208
209 for chain in structure.chains() {
210 let chain_residues: Vec<_> = chain
211 .residues()
212 .filter(|res| !known_residues.contains(&res.name().unwrap().to_uppercase()))
213 .map(|res| res.name().unwrap().to_string())
214 .collect();
215
216 let mut chain_residues = chain_residues;
217
218 chain_residues.sort();
219 chain_residues.dedup();
220
221 res_map.insert(chain.id().to_string(), chain_residues);
222 }
223
224 res_map
225}
226
227/// Identifies unknown residues in each chain of the given PDB structure.
228///
229/// This function iterates over all chains in a PDB structure, filters out known residues (amino acids and DNA),
230/// and collects the names of unknown residues. It returns a `HashMap` where the keys are chain IDs and the
231/// values are vectors of unique unknown residue names.
232///
233/// # Arguments
234///
235/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
236///
237/// # Returns
238///
239/// A `HashMap<String, Vec<String>>` where each key is a chain ID and each value is a vector of unique
240/// unknown residue names found in that chain.
241///
242/// # Example
243///
244/// ```rust
245/// use pdbtbx::PDB;
246/// use pdb_handler::identify_unknowns;
247///
248/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
249/// let unknown_residues = identify_unknowns(&pdb);
250///
251/// for (chain_id, residues) in unknown_residues {
252/// println!("Chain {}: {:?}", chain_id, residues);
253/// }
254/// ```
255///
256/// # Panics
257///
258/// This function will panic if the residue name cannot be retrieved.
259pub fn chains_in_contact(structure: &pdbtbx::PDB) -> Vec<(String, String)> {
260 let mut contacts: HashSet<Vec<String>> = HashSet::new();
261
262 for (chain_x, chain_y) in structure
263 .chains()
264 .flat_map(|cx| structure.chains().map(move |cy| (cx, cy)))
265 {
266 if chain_x.id() == chain_y.id() {
267 continue;
268 }
269
270 let mut in_contacts = false;
271 for contact in &contacts {
272 if contact.contains(&chain_x.id().to_string())
273 && contact.contains(&chain_y.id().to_string())
274 {
275 in_contacts = true;
276 break;
277 }
278 }
279
280 if in_contacts {
281 continue;
282 }
283
284 for res_x in chain_x.residues() {
285 for res_y in chain_y.residues() {
286 for atom_i in res_x.atoms() {
287 for atom_j in res_y.atoms() {
288 let dist = atom_i.distance(atom_j);
289 if dist <= 5.0 {
290 contacts
291 .insert(vec![chain_x.id().to_string(), chain_y.id().to_string()]);
292 }
293 }
294 }
295 }
296 }
297 }
298
299 contacts
300 .into_iter()
301 .map(|pair| (pair[0].clone(), pair[1].clone()))
302 .collect()
303}
304
/// Strips every line that starts with `REMARK` from a PDB file.
///
/// The file at `pdb_f` is read line by line; lines beginning with the literal prefix
/// `REMARK` are dropped and all remaining lines are written, each followed by `\n`,
/// into an in-memory buffer. The buffer is handed back wrapped in a `BufReader`, so the
/// filtered content can be consumed without creating a temporary file.
///
/// # Arguments
///
/// * `pdb_f` - A string slice that holds the path to the input PDB file.
///
/// # Returns
///
/// * `BufReader<Cursor<Vec<u8>>>` - A `BufReader` over the filtered content.
///
/// # Panics
///
/// This function will panic if the input file cannot be opened or if reading a line
/// from it fails.
///
/// # Examples
///
/// ```
/// use pdb_handler::remove_remark;
/// use std::io::BufRead;
/// let reader = remove_remark("example-pdbs/1crn.pdb");
/// for line in reader.lines() {
///     println!("{:?}", line.unwrap());
/// }
/// ```
pub fn remove_remark(pdb_f: &str) -> BufReader<Cursor<Vec<u8>>> {
    let source = BufReader::new(File::open(pdb_f).unwrap());

    // Accumulate every non-REMARK line, re-adding the newline that `lines()` strips.
    let mut kept = String::new();
    for line in source.lines() {
        let line = line.unwrap();
        if line.starts_with("REMARK") {
            continue;
        }
        kept.push_str(&line);
        kept.push('\n');
    }

    BufReader::new(Cursor::new(kept.into_bytes()))
}
355
/// Normalizes every `ATOM` line of a PDB file to exactly 80 bytes.
///
/// The file at `pdb_f` is read line by line. Lines that start with `ATOM` are
/// right-padded with spaces when shorter than 80 bytes and truncated when longer;
/// all other lines are passed through unchanged. Every output line is terminated with
/// `\n`. The result is returned as a buffered reader over an in-memory buffer.
///
/// Truncation never splits a multi-byte UTF-8 character: if byte 80 falls inside a
/// character, that character is dropped as a whole and the line is padded with spaces
/// back up to 80 bytes.
///
/// # Arguments
///
/// * `pdb_f` - A string slice that holds the path to the input text file.
///
/// # Returns
///
/// A `BufReader` wrapped around a `Cursor<Vec<u8>>` holding the processed content.
///
/// # Panics
///
/// This function panics if the file cannot be opened or if reading a line from it
/// fails.
///
/// # Examples
///
/// ```rust
/// use pdb_handler::pad_lines;
/// use std::io::Read;
///
/// let mut padded_reader = pad_lines("example-pdbs/dna.pdb");
/// let mut buffer = String::new();
/// padded_reader.read_to_string(&mut buffer).unwrap();
/// println!("Padded content:\n{}", buffer);
/// ```
///
/// This example reads lines from "dna.pdb", pads each line that starts with `ATOM` with
/// spaces to reach 80 bytes, and then prints out the padded content.
pub fn pad_lines(pdb_f: &str) -> BufReader<Cursor<Vec<u8>>> {
    /// Fixed record width, in bytes, applied to `ATOM` lines.
    const RECORD_WIDTH: usize = 80;

    // Open the input file
    let input_file = File::open(pdb_f).unwrap();
    let reader = BufReader::new(input_file);

    let mut padded_content: Vec<u8> = Vec::new();
    for line in reader.lines() {
        let mut line = line.unwrap();

        if line.starts_with("ATOM") {
            if line.len() > RECORD_WIDTH {
                // Back off to the nearest char boundary at or below the record width;
                // slicing at a raw byte offset would panic inside a multi-byte char.
                let mut cut = RECORD_WIDTH;
                while !line.is_char_boundary(cut) {
                    cut -= 1;
                }
                line.truncate(cut);
            }
            // `line.len() <= RECORD_WIDTH` holds here, so the subtraction cannot wrap.
            let missing = RECORD_WIDTH - line.len();
            line.push_str(&" ".repeat(missing));
        }

        padded_content.extend_from_slice(line.as_bytes());
        padded_content.push(b'\n'); // Append newline
    }

    // Create a BufReader over an in-memory buffer
    BufReader::new(Cursor::new(padded_content))
}
418
#[cfg(test)]
mod tests {

    use pdbtbx::ReadOptions;

    use super::*;
    use std::collections::HashMap;

    /// Reads `path` as a PDB-format file and returns the parsed structure; the second
    /// element of the reader's result tuple is ignored.
    fn load_structure(path: &str) -> pdbtbx::PDB {
        let (structure, _) = ReadOptions::default()
            .set_format(pdbtbx::Format::Pdb)
            .read(path)
            .unwrap();
        structure
    }

    /// Runs `pad_lines` on `path` and asserts that every `ATOM` line is 80 bytes long.
    fn assert_atom_lines_are_80_wide(path: &str) {
        let lines: Vec<String> = pad_lines(path).lines().map(Result::unwrap).collect();

        let all_padded = lines
            .iter()
            .filter(|line| line.starts_with("ATOM"))
            .all(|line| line.len() == 80);

        assert!(all_padded);
    }

    #[test]
    fn test_identify_molecular_types() {
        let structure = load_structure("test_data/prot_ligand.pdb");

        let expected = HashMap::from([(
            "A".to_string(),
            vec![MolecularType::Protein, MolecularType::Other],
        )]);

        assert_eq!(identify_molecular_types(&structure), expected);
    }

    #[test]
    fn test_identify_chains() {
        let structure = load_structure("test_data/chains.pdb");

        let expected = vec!["A".to_string(), "B".to_string(), "C".to_string()];

        assert_eq!(identify_chains(&structure), expected);
    }

    #[test]
    fn test_identify_residue_numbers() {
        let structure = load_structure("test_data/prot_ligand.pdb");

        let expected = HashMap::from([(
            "A".to_string(),
            vec!["104".to_string(), "201".to_string()],
        )]);

        assert_eq!(identify_residue_numbers(&structure), expected);
    }

    #[test]
    fn test_identify_unknowns() {
        let structure = load_structure("test_data/prot_ligand.pdb");

        let expected = HashMap::from([("A".to_string(), vec!["I09".to_string()])]);

        assert_eq!(identify_unknowns(&structure), expected);
    }

    #[test]
    fn test_chains_in_contact() {
        let structure = load_structure("test_data/chains_in_contact.pdb");

        let expected = vec![("A".to_string(), "B".to_string())];

        assert_eq!(chains_in_contact(&structure), expected);
    }

    #[test]
    fn test_remove_remarks() {
        let reader = remove_remark("test_data/pdb_w_remark.pdb");

        // Gather the filtered output and make sure no REMARK line survived.
        let lines: Vec<String> = reader.lines().map(Result::unwrap).collect();
        let has_remark = lines.iter().any(|line| line.starts_with("REMARK"));

        assert!(!has_remark);
    }

    #[test]
    fn test_pad_short_lines() {
        assert_atom_lines_are_80_wide("test_data/pdb_w_short_lines.pdb");
    }

    #[test]
    fn test_pad_long_lines() {
        assert_atom_lines_are_80_wide("test_data/pdb_w_long_lines.pdb");
    }
}