pdb_handler/
lib.rs

1use crate::constants::{AMINOACIDS, DNA};
2use serde::{Deserialize, Serialize};
3use std::fs::File;
4use std::io::{BufRead, BufReader, Cursor};
5
6use std::collections::{HashMap, HashSet};
7
8mod constants;
9
10#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
11pub enum MolecularType {
12    Protein,
13    Dna,
14    Other,
15}
16
17impl From<MolecularType> for String {
18    fn from(val: MolecularType) -> Self {
19        match val {
20            MolecularType::Protein => "protein".to_string(),
21            MolecularType::Dna => "dna".to_string(),
22            MolecularType::Other => "other".to_string(),
23        }
24    }
25}
26
27/// Identifies molecular types in the given PDB structure.
28///
29/// This function analyzes the chains and residues in a PDB structure to categorize each residue
30/// into molecular types such as Protein, DNA, or Other. It returns a `HashMap` where the keys
31/// are chain IDs and the values are vectors of unique `MolecularType`s present in each chain.
32///
33/// # Arguments
34///
35/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
36///
37/// # Returns
38///
39/// A `HashMap<String, Vec<MolecularType>>` where each key is a chain ID and each value is a vector
40/// of unique `MolecularType`s found in that chain.
41///
42/// # Example
43///
44/// ```rust
45/// use pdbtbx::PDB;
46/// use pdb_handler::{identify_molecular_types, MolecularType};
47///
48/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
49/// let mol_types = identify_molecular_types(&pdb);
50///
51/// for (chain_id, types) in mol_types {
52///     println!("Chain {}: {:?}", chain_id, types);
53/// }
54/// ```
55///
56/// # Panics
57///
58/// This function will panic if the residue name cannot be retrieved (`res.name().unwrap()`).
59///
60pub fn identify_molecular_types(structure: &pdbtbx::PDB) -> HashMap<String, Vec<MolecularType>> {
61    let mut mol_types = HashMap::new();
62
63    for chain in structure.chains() {
64        let chain_id = chain.id().to_string();
65        let chain_mol_types = chain.residues().map(|res| {
66            let res_name = res.name().unwrap().to_uppercase();
67            if AMINOACIDS.contains(&res_name.as_str()) {
68                MolecularType::Protein
69            } else if DNA.contains(&res_name.as_str()) {
70                MolecularType::Dna
71            } else {
72                MolecularType::Other
73            }
74        });
75
76        let unique_mol_types = chain_mol_types.into_iter().collect();
77
78        mol_types.insert(chain_id, unique_mol_types);
79    }
80
81    mol_types
82}
83
84/// Identifies all chain IDs in the given PDB structure.
85///
86/// This function iterates over all chains in a PDB structure and collects their IDs into a vector of strings.
87///
88/// # Arguments
89///
90/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
91///
92/// # Returns
93///
94/// A `Vec<String>` containing the IDs of all chains present in the PDB structure.
95///
96/// # Example
97///
98/// ```rust
99/// use pdbtbx::PDB;
100/// use pdb_handler::identify_chains;
101///
102/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
103/// let chains = identify_chains(&pdb);
104///
105/// for chain_id in chains {
106///     println!("Chain ID: {}", chain_id);
107/// }
108/// ```
109pub fn identify_chains(structure: &pdbtbx::PDB) -> Vec<String> {
110    structure
111        .chains()
112        .map(|chain| chain.id().to_string())
113        .collect()
114}
115
116/// Identifies residue numbers in each chain of the given PDB structure.
117///
118/// This function iterates over all chains in a PDB structure, collects the residue numbers
119/// within each chain, and returns them in a `HashMap`. The keys in the `HashMap` are chain IDs,
120/// and the values are vectors of unique residue numbers represented as strings.
121///
122/// # Arguments
123///
124/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
125///
126/// # Returns
127///
128/// A `HashMap<String, Vec<String>>` where each key is a chain ID and each value is a vector of unique
129/// residue numbers found in that chain.
130///
131/// # Example
132///
133/// ```rust
134/// use pdbtbx::PDB;
135/// use pdb_handler::identify_residue_numbers;
136///
137/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
138/// let residue_numbers = identify_residue_numbers(&pdb);
139///
140/// for (chain_id, numbers) in residue_numbers {
141///     println!("Chain {}: {:?}", chain_id, numbers);
142/// }
143/// ```
144///
145/// # Panics
146///
147/// This function will panic if the residue serial number cannot be retrieved.
148pub fn identify_residue_numbers(structure: &pdbtbx::PDB) -> HashMap<String, Vec<String>> {
149    structure
150        .chains()
151        .map(|chain| {
152            let resnumbers: Vec<String> = chain
153                .residues()
154                .map(|res| res.serial_number().to_string())
155                .collect::<Vec<_>>()
156                .into_iter()
157                .collect::<std::collections::HashSet<_>>()
158                .into_iter()
159                .collect();
160            // Sort the residue numbers
161            let mut resnumbers = resnumbers.into_iter().collect::<Vec<_>>();
162            resnumbers.sort();
163            (chain.id().to_string(), resnumbers)
164        })
165        .collect()
166}
167
168/// Identifies unknown residues in each chain of the given PDB structure.
169///
170/// This function iterates over all chains in a PDB structure, filters out known residues (amino acids and DNA),
171/// and collects the names of unknown residues. It returns a `HashMap` where the keys are chain IDs and the
172/// values are vectors of unique unknown residue names.
173///
174/// # Arguments
175///
176/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
177///
178/// # Returns
179///
180/// A `HashMap<String, Vec<String>>` where each key is a chain ID and each value is a vector of unique
181/// unknown residue names found in that chain.
182///
183/// # Example
184///
185/// ```rust
186/// use pdbtbx::PDB;
187/// use pdb_handler::identify_unknowns;
188///
189/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
190/// let unknown_residues = identify_unknowns(&pdb);
191///
192/// for (chain_id, residues) in unknown_residues {
193///    println!("Chain {}: {:?}", chain_id, residues);
194/// }
195/// ```
196///
197/// # Panics
198///
199/// This function will panic if the residue name cannot be retrieved.
200pub fn identify_unknowns(structure: &pdbtbx::PDB) -> HashMap<String, Vec<String>> {
201    let mut res_map = HashMap::new();
202
203    let known_residues: HashSet<_> = AMINOACIDS
204        .iter()
205        .chain(DNA.iter())
206        .map(|s| s.to_uppercase())
207        .collect();
208
209    for chain in structure.chains() {
210        let chain_residues: Vec<_> = chain
211            .residues()
212            .filter(|res| !known_residues.contains(&res.name().unwrap().to_uppercase()))
213            .map(|res| res.name().unwrap().to_string())
214            .collect();
215
216        let mut chain_residues = chain_residues;
217
218        chain_residues.sort();
219        chain_residues.dedup();
220
221        res_map.insert(chain.id().to_string(), chain_residues);
222    }
223
224    res_map
225}
226
227/// Identifies unknown residues in each chain of the given PDB structure.
228///
229/// This function iterates over all chains in a PDB structure, filters out known residues (amino acids and DNA),
230/// and collects the names of unknown residues. It returns a `HashMap` where the keys are chain IDs and the
231/// values are vectors of unique unknown residue names.
232///
233/// # Arguments
234///
235/// * `structure` - A reference to a `pdbtbx::PDB` structure representing the PDB file to be analyzed.
236///
237/// # Returns
238///
239/// A `HashMap<String, Vec<String>>` where each key is a chain ID and each value is a vector of unique
240/// unknown residue names found in that chain.
241///
242/// # Example
243///
244/// ```rust
245/// use pdbtbx::PDB;
246/// use pdb_handler::identify_unknowns;
247///
248/// let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1crn.pdb").unwrap();
249/// let unknown_residues = identify_unknowns(&pdb);
250///
251/// for (chain_id, residues) in unknown_residues {
252///     println!("Chain {}: {:?}", chain_id, residues);
253/// }
254/// ```
255///
256/// # Panics
257///
258/// This function will panic if the residue name cannot be retrieved.
259pub fn chains_in_contact(structure: &pdbtbx::PDB) -> Vec<(String, String)> {
260    let mut contacts: HashSet<Vec<String>> = HashSet::new();
261
262    for (chain_x, chain_y) in structure
263        .chains()
264        .flat_map(|cx| structure.chains().map(move |cy| (cx, cy)))
265    {
266        if chain_x.id() == chain_y.id() {
267            continue;
268        }
269
270        let mut in_contacts = false;
271        for contact in &contacts {
272            if contact.contains(&chain_x.id().to_string())
273                && contact.contains(&chain_y.id().to_string())
274            {
275                in_contacts = true;
276                break;
277            }
278        }
279
280        if in_contacts {
281            continue;
282        }
283
284        for res_x in chain_x.residues() {
285            for res_y in chain_y.residues() {
286                for atom_i in res_x.atoms() {
287                    for atom_j in res_y.atoms() {
288                        let dist = atom_i.distance(atom_j);
289                        if dist <= 5.0 {
290                            contacts
291                                .insert(vec![chain_x.id().to_string(), chain_y.id().to_string()]);
292                        }
293                    }
294                }
295            }
296        }
297    }
298
299    contacts
300        .into_iter()
301        .map(|pair| (pair[0].clone(), pair[1].clone()))
302        .collect()
303}
304
/// Loads a PDB file and returns its content with every `REMARK` record stripped.
///
/// The file is read line by line. Lines that begin with the text `REMARK` are dropped; all
/// other lines are kept in their original order, each terminated by a single `\n`. The result
/// lives in memory, so it can be handed to a parser without creating a temporary file.
///
/// # Arguments
///
/// * `pdb_f` - A string slice that holds the path to the input PDB file.
///
/// # Returns
///
/// * `BufReader<Cursor<Vec<u8>>>` - A `BufReader` over the filtered content.
///
/// # Panics
///
/// This function will panic if the input file cannot be opened or read.
///
/// # Examples
///
/// ```
/// use pdb_handler::remove_remark;
/// use std::io::BufRead;
/// let reader = remove_remark("example-pdbs/1crn.pdb");
/// for line in reader.lines() {
///     println!("{:?}", line.unwrap());
/// }
/// ```
pub fn remove_remark(pdb_f: &str) -> BufReader<Cursor<Vec<u8>>> {
    let source = BufReader::new(File::open(pdb_f).unwrap());

    // Accumulate the surviving records in one growing text buffer.
    let mut kept = String::new();
    for record in source.lines() {
        let record = record.unwrap();
        if record.starts_with("REMARK") {
            continue;
        }
        kept.push_str(&record);
        kept.push('\n');
    }

    BufReader::new(Cursor::new(kept.into_bytes()))
}
355
/// Reads a text file specified by `pdb_f`, normalises each line that starts with `ATOM` to
/// exactly 80 bytes, and returns a buffered reader over an in-memory buffer containing the
/// resulting content.
///
/// `ATOM` lines shorter than 80 bytes are right-padded with spaces; `ATOM` lines longer than
/// 80 bytes are truncated. Truncation never splits a multi-byte UTF-8 character: if byte 80
/// falls inside one, the line is cut just before that character and padded back up to 80
/// bytes. All other lines are passed through unchanged. Every output line ends with `\n`.
///
/// # Arguments
///
/// * `pdb_f` - A string slice that holds the path to the input text file.
///
/// # Returns
///
/// A `BufReader` wrapped around a `Cursor<Vec<u8>>`, holding the processed lines.
///
/// # Panics
///
/// This function panics if it encounters any I/O errors while reading or
/// processing the file.
///
/// # Examples
///
/// ```rust
/// use pdb_handler::pad_lines;
/// use std::io::Read;
///
/// let mut padded_reader = pad_lines("example-pdbs/dna.pdb");
/// let mut buffer = String::new();
/// padded_reader.read_to_string(&mut buffer).unwrap();
/// println!("Padded content:\n{}", buffer);
/// ```
///
/// This example reads lines from "dna.pdb", brings each line that starts with `ATOM` to
/// 80 bytes, and then prints out the processed content.
pub fn pad_lines(pdb_f: &str) -> BufReader<Cursor<Vec<u8>>> {
    /// Fixed width of an `ATOM` record in the output.
    const RECORD_WIDTH: usize = 80;

    // Open the input file
    let input_file = File::open(pdb_f).unwrap();
    let reader = BufReader::new(input_file);

    let mut content: Vec<u8> = Vec::new();
    for line in reader.lines() {
        let mut line = line.unwrap();

        if line.starts_with("ATOM") {
            if line.len() > RECORD_WIDTH {
                // Back off to the nearest character boundary so that slicing a
                // multi-byte character in half can never panic.
                let mut cut = RECORD_WIDTH;
                while !line.is_char_boundary(cut) {
                    cut -= 1;
                }
                line.truncate(cut);
            }
            // `line.len() <= RECORD_WIDTH` holds here, so this cannot underflow.
            let missing = RECORD_WIDTH - line.len();
            line.push_str(&" ".repeat(missing));
        }

        line.push('\n');
        content.extend_from_slice(line.as_bytes());
    }

    // Create a BufReader over an in-memory buffer
    BufReader::new(Cursor::new(content))
}
418
#[cfg(test)]
mod tests {
    use super::*;
    use pdbtbx::ReadOptions;
    use std::collections::HashMap;

    /// Parses a fixture in PDB format, panicking if it cannot be read.
    fn load_structure(path: &str) -> pdbtbx::PDB {
        let (structure, _) = ReadOptions::default()
            .set_format(pdbtbx::Format::Pdb)
            .read(path)
            .unwrap();
        structure
    }

    /// Drains a reader into its individual lines, panicking on read errors.
    fn collect_lines(reader: impl BufRead) -> Vec<String> {
        reader.lines().map(|line| line.unwrap()).collect()
    }

    /// Checks that every `ATOM` line produced by `pad_lines` is exactly 80 bytes long.
    fn assert_atom_lines_are_80_wide(path: &str) {
        let lines = collect_lines(pad_lines(path));

        assert!(lines
            .iter()
            .filter(|line| line.starts_with("ATOM"))
            .all(|line| line.len() == 80));
    }

    #[test]
    fn test_identify_molecular_types() {
        let structure = load_structure("test_data/prot_ligand.pdb");

        let expected = HashMap::from([(
            "A".to_string(),
            vec![MolecularType::Protein, MolecularType::Other],
        )]);

        assert_eq!(identify_molecular_types(&structure), expected);
    }

    #[test]
    fn test_identify_chains() {
        let structure = load_structure("test_data/chains.pdb");

        assert_eq!(
            identify_chains(&structure),
            vec!["A".to_string(), "B".to_string(), "C".to_string()]
        );
    }

    #[test]
    fn test_identify_residue_numbers() {
        let structure = load_structure("test_data/prot_ligand.pdb");

        let expected = HashMap::from([(
            "A".to_string(),
            vec!["104".to_string(), "201".to_string()],
        )]);

        assert_eq!(identify_residue_numbers(&structure), expected);
    }

    #[test]
    fn test_identify_unknowns() {
        let structure = load_structure("test_data/prot_ligand.pdb");

        let expected = HashMap::from([("A".to_string(), vec!["I09".to_string()])]);

        assert_eq!(identify_unknowns(&structure), expected);
    }

    #[test]
    fn test_chains_in_contact() {
        let structure = load_structure("test_data/chains_in_contact.pdb");

        assert_eq!(
            chains_in_contact(&structure),
            vec![("A".to_string(), "B".to_string())]
        );
    }

    #[test]
    fn test_remove_remarks() {
        let lines = collect_lines(remove_remark("test_data/pdb_w_remark.pdb"));

        // No surviving line may still be a REMARK record.
        assert!(!lines.iter().any(|line| line.starts_with("REMARK")));
    }

    #[test]
    fn test_pad_short_lines() {
        assert_atom_lines_are_80_wide("test_data/pdb_w_short_lines.pdb");
    }

    #[test]
    fn test_pad_long_lines() {
        assert_atom_lines_are_80_wide("test_data/pdb_w_long_lines.pdb");
    }
}