haddock_restraints/core/structure.rs
1use core::f64;
2use std::collections::{HashMap, HashSet};
3
4use itertools::Itertools;
5use kd_tree::KdTree;
6use nalgebra::Vector3;
7use pdbtbx::{Atom, Element, PDB, ReadOptions};
8use pdbtbx::{PDBError, Residue};
9use rand::SeedableRng;
10use rand::prelude::IndexedRandom;
11use rand::rngs::StdRng;
12use std::fs::File;
13use std::io::BufReader;
14use std::io::Cursor;
15use std::io::Write;
16
17#[derive(Clone)]
18pub struct Bead {
19 pub position: Vector3<f64>,
20}
21
22#[derive(Debug, Clone, PartialEq)]
23pub struct Pair {
24 pub chain_i: String,
25 pub res_i: isize,
26 pub atom_i: String,
27 pub chain_j: String,
28 pub res_j: isize,
29 pub atom_j: String,
30 pub distance: f64,
31}
32
33/// The `Finder` struct contains two hashmaps to quickly retrieve the chain ID and residue number associated with a given atom by its serial number. These lookups are initialized from the provided PDB file during instantiation.
34///
35struct Finder {
36 chain_lookup: HashMap<usize, String>,
37 residue_lookup: HashMap<usize, isize>,
38}
39
40impl Finder {
41 /// Creates a new `Finder` instance.
42 ///
43 /// This method initializes the `Finder` struct by extracting the chain and residue information from the provided `PDB` reference. It constructs two hashmaps: `chain_lookup` (for mapping atom serial numbers to chain IDs) and `residue_lookup` (for mapping atom serial numbers to residue serial numbers).
44 ///
45 fn new(pdb: &PDB) -> Self {
46 let chain_lookup = pdb
47 .chains()
48 .flat_map(|chain| {
49 let chain_id = chain.id().to_string();
50 chain.residues().flat_map(move |residue| {
51 residue.atoms().map({
52 let v = chain_id.clone();
53 move |atom| (atom.serial_number(), v.clone())
54 })
55 })
56 })
57 .collect();
58
59 let residue_lookup = pdb
60 .chains()
61 .flat_map(|chain| {
62 chain.residues().flat_map(|residue| {
63 let res_num = residue.serial_number();
64 residue
65 .atoms()
66 .map(move |atom| (atom.serial_number(), res_num))
67 })
68 })
69 .collect();
70
71 Finder {
72 chain_lookup,
73 residue_lookup,
74 }
75 }
76
77 /// Finds the chain ID associated with a given atom.
78 ///
79 /// This method looks up the chain ID for an atom based on its serial number. If no chain is found, the default value "A" is returned.
80 ///
81 fn find_chain_id(&self, atom: &Atom) -> String {
82 self.chain_lookup
83 .get(&atom.serial_number())
84 .unwrap_or(&"A".to_string())
85 .clone()
86 }
87
88 /// Finds the residue serial number associated with a given atom.
89 ///
90 /// This method looks up the residue serial number for an atom based on its serial number. If no residue number is found, the default value `0` is returned.
91 ///
92 fn find_residue_number(&self, atom: &Atom) -> isize {
93 *self.residue_lookup.get(&atom.serial_number()).unwrap_or(&0)
94 }
95}
96
97/// Performs a neighbor search for residues within a specified radius of target residues.
98///
99/// This function uses a K-d tree to efficiently find residues that are within a given radius
100/// of any atom in the target residues.
101///
102/// # Arguments
103///
104/// * `pdb` - A `pdbtbx::PDB` structure representing the entire protein.
105/// * `target_residues_numbers` - A vector of references to `pdbtbx::Residue` objects representing the target residues.
106/// * `radius` - A `f64` value specifying the search radius in Angstroms.
107///
108/// # Returns
109///
110/// A `Vec<isize>` containing the sorted serial numbers of residues that are within the specified
111/// radius of any atom in the target residues, excluding the target residues themselves.
112///
113/// # Algorithm
114///
115/// 1. Constructs a K-d tree from all atom coordinates in the PDB structure.
116/// 2. For each atom in the target residues:
117/// a. Performs a radius search in the K-d tree.
118/// b. Identifies the residues corresponding to the atoms found in the search.
119/// 3. Removes the target residues from the result set.
120/// 4. Sorts the resulting residue serial numbers.
121///
122/// # Notes
123///
124/// - The function uses the `KdTree` data structure for efficient spatial searching.
125/// - The current implementation may have performance bottlenecks in the atom-to-residue mapping step.
126/// - The function assumes that atom coordinates are unique for identification purposes.
127///
128/// # TODO
129///
130/// - Optimize the atom-to-residue mapping step for better performance.
131pub fn neighbor_search(
132 pdb: pdbtbx::PDB,
133 target_residues_numbers: Vec<&pdbtbx::Residue>,
134 radius: f64,
135) -> Vec<isize> {
136 let points: Vec<[f64; 3]> = pdb
137 .atoms()
138 .map(|atom| {
139 let x = atom.x();
140 let y = atom.y();
141 let z = atom.z();
142 [x, y, z]
143 })
144 .collect();
145
146 let kdtree = KdTree::build_by_ordered_float(points.clone());
147
148 // Find the coordinates of each target residue
149 // let mut neighbors: Vec<&Atom> = Vec::new();
150 let mut result: HashSet<isize> = HashSet::new();
151 for residue in &target_residues_numbers {
152 let query_vec: Vec<[f64; 3]> = residue
153 .atoms()
154 .map(|atom| {
155 let x = atom.x();
156 let y = atom.y();
157 let z = atom.z();
158 [x, y, z]
159 })
160 .collect();
161
162 for query in query_vec {
163 let found_vec = kdtree.within_radius(&query, radius);
164 for found in found_vec {
165 // Loop over the whole PDB and find an atom that matches the found point
166 // FIXME: Optimize this
167 for residue in pdb.residues() {
168 for atom in residue.atoms() {
169 let x = atom.x();
170 let y = atom.y();
171 let z = atom.z();
172 if x == found[0] && y == found[1] && z == found[2] {
173 result.insert(residue.serial_number());
174 }
175 }
176 }
177 }
178 }
179 }
180 // Remove the target residues from the result
181 for residue in target_residues_numbers {
182 result.remove(&residue.serial_number());
183 }
184
185 // Sort the result
186 let mut result: Vec<isize> = result.into_iter().collect();
187 result.sort();
188
189 result
190}
191
192/// Retrieves specific residues from a PDB structure based on their serial numbers.
193///
194/// # Arguments
195///
196/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
197/// * `target` - A vector of `isize` values representing the serial numbers of the residues to retrieve.
198///
199/// # Returns
200///
201/// A `Vec<&Residue>` containing references to the residues whose serial numbers match those in the `target` vector.
202///
203/// # Notes
204///
205/// - This function performs a linear search through all residues in the PDB structure.
206/// - The order of residues in the result vector matches the order they appear in the PDB structure,
207/// not necessarily the order of serial numbers in the `target` vector.
208/// - If a serial number in `target` doesn't match any residue in the PDB, it is simply ignored.
209pub fn get_residues(pdb: &pdbtbx::PDB, target: Vec<isize>) -> Vec<&Residue> {
210 let mut result: Vec<&Residue> = Vec::new();
211 for residue in pdb.residues() {
212 if target.contains(&residue.serial_number()) {
213 result.push(residue);
214 }
215 }
216 result
217}
218
219/// Identifies the true interface residues between chains in a PDB structure.
220///
221/// This function determines interface residues by finding pairs of residues from different chains
222/// that have atoms within a specified cutoff distance of each other.
223///
224/// # Arguments
225///
226/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
227/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
228/// for residues to be considered part of the interface.
229///
230/// # Returns
231///
232/// A `HashMap<String, HashSet<isize>>` where:
233/// - The keys are chain identifiers (as strings).
234/// - The values are `HashSet`s of residue serial numbers (as `isize`) that are part of the interface for that chain.
235///
236/// # Algorithm
237///
238/// 1. Iterates over all pairs of chains in the PDB structure.
239/// 2. For each pair of chains, compares all residues between the two chains.
240/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
241/// 4. If atoms are within the cutoff, both residues are added to the interface set for their respective chains.
242///
243/// # Notes
244///
245/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
246/// - The cutoff is applied to atom-atom distances, not residue-residue distances.
247/// - A residue is considered part of the interface if any of its atoms are within the cutoff of any atom in a residue from another chain.
248/// - The function does not distinguish between different types of atoms or residues.
249pub fn get_true_interface(pdb: &pdbtbx::PDB, cutoff: f64) -> HashMap<String, HashSet<isize>> {
250 let mut true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
251 for chain_i in pdb.chains() {
252 for chain_j in pdb.chains() {
253 if chain_i.id() == chain_j.id() {
254 continue;
255 }
256 for res_i in chain_i.residues() {
257 for res_j in chain_j.residues() {
258 // Check if any of the atoms are touching each other
259 for atom_i in res_i.atoms() {
260 for atom_j in res_j.atoms() {
261 let distance = atom_i.distance(atom_j);
262 if distance < cutoff {
263 true_interface
264 .entry(chain_i.id().to_string())
265 .or_default()
266 .insert(res_i.serial_number());
267 true_interface
268 .entry(chain_j.id().to_string())
269 .or_default()
270 .insert(res_j.serial_number());
271 break;
272 }
273 }
274 }
275 }
276 }
277 }
278 }
279 true_interface
280}
281
282/// Identifies pairs of chains in a PDB structure that are in contact with each other.
283///
284/// This function determines which chains are in contact by finding pairs of residues from different chains
285/// that have atoms within a specified cutoff distance of each other.
286///
287/// # Arguments
288///
289/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
290/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
291/// for chains to be considered in contact.
292///
293/// # Returns
294///
295/// A `HashSet<(String, String)>` where each tuple represents a pair of chain identifiers that are in contact.
296/// The order of chain identifiers in each tuple is arbitrary.
297///
298/// # Algorithm
299///
300/// 1. Iterates over all pairs of chains in the PDB structure.
301/// 2. For each pair of chains, compares all residues between the two chains.
302/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
303/// 4. If atoms are within the cutoff, the pair of chain identifiers is added to the result set.
304///
305/// # Notes
306///
307/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
308/// - The cutoff is applied to atom-atom distances, not residue-residue or chain-chain distances.
309/// - A pair of chains is considered in contact if any atom from one chain is within the cutoff distance of any atom from the other chain.
310/// - The function does not distinguish between different types of atoms or residues.
311/// - Each pair of chains appears only once in the result set, regardless of the number of contacts between them.
312pub fn get_chains_in_contact(pdb: &pdbtbx::PDB, cutoff: f64) -> HashSet<(String, String)> {
313 let mut chains_in_contact: HashSet<(String, String)> = HashSet::new();
314
315 for chain_i in pdb.chains() {
316 for chain_j in pdb.chains() {
317 if chain_i.id() == chain_j.id() {
318 continue;
319 }
320 for res_i in chain_i.residues() {
321 for res_j in chain_j.residues() {
322 // Check if any of the atoms are touching each other
323 for atom_i in res_i.atoms() {
324 for atom_j in res_j.atoms() {
325 let distance = atom_i.distance(atom_j);
326 if distance < cutoff {
327 chains_in_contact
328 .insert((chain_i.id().to_string(), chain_j.id().to_string()));
329 break;
330 }
331 }
332 }
333 }
334 }
335 }
336 }
337 chains_in_contact
338}
339
340/// Represents two residues in a protein chain.
341///
342/// A `Gap` is defined by two residues that are sequential in the primary structure
343/// but have a distance between their atoms that exceeds a certain threshold in the
344/// tertiary structure.
345#[derive(Debug)]
346pub struct Gap {
347 /// The chain identifier where the gap is located.
348 pub chain: String,
349
350 /// The name of the atom in the first residue used for distance calculation.
351 pub atom_i: String,
352
353 /// The name of the atom in the second residue used for distance calculation.
354 pub atom_j: String,
355
356 /// The sequence number of the first residue.
357 pub res_i: isize,
358
359 /// The sequence number of the second residue.
360 pub res_j: isize,
361
362 /// The distance between the specified atoms of the two residues.
363 pub distance: f64,
364}
365
366/// Filters a vector of `Gap` elements, retaining only the unique elements based on the `atom_j` value.
367///
368/// This function removes duplicate `Gap` elements where the `atom_j` value is the same across multiple entries,
369/// keeping only the first occurrence of each unique `atom_j`. This is useful when you only need to retain
370/// one `Gap` entry for each unique `atom_j` regardless of the `atom_i` or other fields.
371///
372pub fn filter_unique_by_atom_j(gaps: Vec<Gap>) -> Vec<Gap> {
373 let mut seen = HashSet::new();
374 let mut unique: Vec<Gap> = Vec::new();
375
376 gaps.into_iter().for_each(|g| {
377 if !seen.contains(&g.atom_j) {
378 seen.insert(g.atom_j.clone());
379 unique.push(g)
380 }
381 });
382
383 unique
384}
385
386/// Identifies separate bodies within a protein structure based on CA atom distances.
387///
388/// This function segments the protein into separate "bodies" by analyzing the distances
389/// between consecutive CA (alpha carbon) atoms in each chain. A new body is defined when
390/// the distance between consecutive CA atoms exceeds 4 Angstroms.
391///
392/// # Arguments
393///
394/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
395///
396/// # Returns
397///
398/// A `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` where:
399/// - The key is a body identifier (starting from 0).
400/// - The value is a vector of tuples, each containing:
401/// - The residue serial number (`isize`)
402/// - The chain identifier (`&str`)
403/// - A reference to the CA atom (`&pdbtbx::Atom`)
404///
405/// # Algorithm
406///
407/// 1. Extracts all CA atoms from the PDB structure, along with their chain and residue information.
408/// 2. Iterates through consecutive CA atoms within each chain.
409/// 3. If the distance between consecutive CA atoms exceeds 4 Angstroms, a new body is started.
410/// 4. Groups CA atoms into bodies based on this distance criterion.
411///
412/// # Notes
413///
414/// - This function only considers CA atoms for body identification.
415/// - The 4 Angstrom cutoff is hard-coded and may not be suitable for all use cases.
416/// - Bodies are numbered sequentially starting from 0.
417/// - Gaps between chains always result in a new body, regardless of spatial proximity.
418/// - The function assumes that CA atoms within a chain are listed in sequence order.
419pub fn find_bodies(pdb: &pdbtbx::PDB) -> HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> {
420 // Check if the distance of a given atom to its next one is higher than 4A
421
422 // Get only the `CA` atoms
423 let mut ca_atoms: Vec<(&str, isize, &pdbtbx::Atom)> = Vec::new();
424 pdb.chains().for_each(|chain| {
425 chain.residues().for_each(|residue| {
426 residue.atoms().for_each(|atom| {
427 if atom.name() == "CA" {
428 ca_atoms.push((chain.id(), residue.serial_number(), atom));
429 }
430 });
431 });
432 });
433 let mut bodies: HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> = HashMap::new();
434 let mut body_id = 0;
435 for (i, j) in ca_atoms.iter().zip(ca_atoms.iter().skip(1)) {
436 let (chain_i, res_i, atom_i) = i;
437 let (chain_j, _res_j, atom_j) = j;
438 if chain_i != chain_j {
439 continue;
440 }
441 let distance = atom_i.distance(atom_j);
442 if distance > 4.0 {
443 body_id += 1;
444 }
445 bodies
446 .entry(body_id)
447 .or_default()
448 .push((*res_i, chain_i, atom_i));
449 }
450
451 bodies
452}
453
454/// Creates a list of gaps between different bodies in a protein structure.
455///
456/// This function analyzes the spatial relationships between separate bodies of a protein
457/// structure and generates a list of `Gap` instances representing the distances between
458/// randomly selected pairs of atoms from different bodies.
459///
460/// # Arguments
461///
462/// * `bodies` - A reference to a `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` representing
463/// the bodies of the protein structure, as returned by the `find_bodies` function.
464///
465/// # Returns
466///
467/// A `Vec<Gap>` containing `Gap` instances that represent the distances between pairs of
468/// atoms from different bodies.
469///
470/// # Algorithm
471///
472/// 1. Initializes a random number generator with a fixed seed (42) for reproducibility.
473/// 2. Iterates over all unique pairs of bodies.
474/// 3. For each pair of bodies with at least 2 atoms each:
475/// a. Randomly selects 2 atoms from each body.
476/// b. Creates 2 `Gap` instances, one for each pair of selected atoms.
477/// 4. Collects all created `Gap` instances into a vector.
478///
479/// # Notes
480///
481/// - The function uses a fixed random seed (42) for reproducibility. This means that
482/// for the same input, it will always produce the same output.
483/// - Only bodies with at least 2 atoms are considered for gap creation.
484/// - The function creates 2 gaps for each pair of bodies, using different randomly selected atoms.
485/// - The `Gap` instances contain information about the chain, atom names, residue numbers,
486/// and the distance between the selected atoms.
487/// - This function assumes that the input `bodies` HashMap is structured as expected,
488/// with each value being a vector of tuples (residue number, chain ID, atom reference).
489///
490/// # Safety
491///
492/// This function uses `unwrap()` on the results of `choose_multiple()`. It assumes that
493/// the bodies contain at least 2 atoms each, as checked earlier in the function.
494pub fn create_iter_body_gaps(
495 bodies: &HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>,
496) -> Vec<Gap> {
497 let mut rng = StdRng::seed_from_u64(42);
498 let mut pairs: Vec<Gap> = Vec::new();
499 let mut body_ids: Vec<isize> = bodies.keys().cloned().collect();
500 body_ids.sort();
501 for (i, &body_id1) in body_ids.iter().enumerate() {
502 for &body_id2 in body_ids[i + 1..].iter() {
503 if let (Some(atoms1), Some(atoms2)) = (bodies.get(&body_id1), bodies.get(&body_id2))
504 && atoms1.len() >= 2 && atoms2.len() >= 2 {
505 let selected1: Vec<_> = atoms1.choose_multiple(&mut rng, 2).cloned().collect();
506 let selected2: Vec<_> = atoms2.choose_multiple(&mut rng, 2).cloned().collect();
507
508 for i in 0..2 {
509 pairs.push(Gap {
510 chain: selected1[i].1.to_string(),
511 atom_i: selected1[i].2.name().to_string(),
512 atom_j: selected2[i].2.name().to_string(),
513 res_i: selected1[i].0,
514 res_j: selected2[i].0,
515 distance: selected1[i].2.distance(selected2[i].2),
516 });
517 }
518 }
519 }
520 }
521
522 pairs
523}
524
525// pub fn calculate_z_axis(selection1: &[pdbtbx::Atom], selection2: &[pdbtbx::Atom]) -> Vector3<f64> {
526// let center1 = calculate_geometric_center(selection1);
527// let center2 = calculate_geometric_center(selection2);
528// (center2 - center1).normalize()
529// }
530
531pub fn calculate_geometric_center(atoms: &[pdbtbx::Atom]) -> Vector3<f64> {
532 let mut center = Vector3::zeros();
533 for atom in atoms {
534 center += Vector3::new(atom.x(), atom.y(), atom.z());
535 }
536 center / (atoms.len() as f64)
537}
538
539// pub fn generate_axis_beads(start_z: f64, end_z: f64, num_beads: usize) -> Vec<Bead> {
540// let mut beads = Vec::with_capacity(num_beads);
541// let length = end_z - start_z;
542
543// for i in 0..num_beads {
544// let t = (i as f64) / ((num_beads - 1) as f64);
545// let z = start_z + t * length;
546// let position = Vector3::new(0.0, 0.0, z);
547// beads.push(Bead { position });
548// }
549// beads
550// }
551
552pub fn generate_grid_beads(center_z: f64, grid_size: usize, spacing: f64) -> Vec<Bead> {
553 let mut beads = Vec::with_capacity(grid_size * grid_size);
554
555 let grid_offset = (grid_size as f64 - 1.0) * spacing / 2.0;
556
557 for i in 0..grid_size {
558 for j in 0..grid_size {
559 let x = (i as f64) * spacing - grid_offset;
560 let y = (j as f64) * spacing - grid_offset;
561
562 let final_pos = Vector3::new(x, y, center_z);
563 beads.push(Bead {
564 position: final_pos,
565 });
566 }
567 }
568
569 beads
570}
571
572pub fn write_beads_pdb(beads: &[Bead], filename: &str) -> std::io::Result<()> {
573 let mut file = File::create(filename)?;
574
575 for (i, bead) in beads.iter().enumerate() {
576 writeln!(
577 file,
578 "{:<6}{:>5} {:<4}{:1}{:<3} {:1}{:>4}{:1} {:>8.3}{:>8.3}{:>8.3}{:>6.2}{:>6.2} {:>2}{:>2}",
579 "ATOM",
580 i + 1,
581 "SHA",
582 " ",
583 "SHA",
584 "S",
585 i + 1,
586 " ",
587 bead.position.x,
588 bead.position.y,
589 bead.position.z,
590 1.00,
591 1.00,
592 "H",
593 ""
594 )?;
595 }
596 writeln!(file, "END")?;
597
598 Ok(())
599}
600
601pub fn find_furthest_selections(selections: &[Vec<isize>], pdb: &PDB) -> (Vec<Atom>, Vec<Atom>) {
602 let sele: HashMap<usize, (Vec<Atom>, Vector3<f64>)> = selections
603 .iter()
604 .enumerate()
605 .map(|(i, sel)| {
606 let atoms = get_atoms_from_resnumbers(pdb, sel);
607 let center = calculate_geometric_center(&atoms);
608 (i, (atoms, center))
609 })
610 .collect();
611
612 // Find the two selections that are the furthest apart
613 let mut max_distance = 0.0;
614 let mut atoms1 = Vec::new();
615 let mut atoms2 = Vec::new();
616 for (i, j) in (0..selections.len()).tuple_combinations() {
617 let distance = (sele[&i].1 - sele[&j].1).norm();
618 if distance > max_distance {
619 max_distance = distance;
620 atoms1 = sele[&i].0.clone();
621 atoms2 = sele[&j].0.clone();
622 }
623 }
624
625 (atoms1, atoms2)
626}
627
628pub fn get_atoms_from_resnumbers(pdb: &PDB, selection: &[isize]) -> Vec<Atom> {
629 pdb.residues()
630 .filter(|res| selection.contains(&res.serial_number()))
631 .flat_map(|res| res.atoms())
632 .cloned() // This clones each &Atom into an owned Atom
633 .collect()
634}
635
636/// Loads a PDB structure from a file path
637///
638/// # Arguments
639/// * `input_pdb` - Path to the PDB file
640///
641/// # Returns
642/// Result containing either the parsed PDB or a list of errors
643pub fn load_pdb(input_pdb: &str) -> Result<PDB, Vec<PDBError>> {
644 std::fs::read_to_string(input_pdb)
645 .map_err(|e| {
646 vec![PDBError::new(
647 pdbtbx::ErrorLevel::GeneralWarning,
648 "File read error",
649 format!("Failed to read file {}: {}", input_pdb, e),
650 pdbtbx::Context::None,
651 )]
652 })
653 .and_then(|content| process_pdb_contents(&content))
654}
655
656/// Loads a PDB structure from string content
657///
658/// # Arguments
659/// * `content` - PDB file contents as a string
660///
661/// # Returns
662/// Result containing either the parsed PDB or a list of errors
663pub fn load_pdb_from_content(content: &str) -> Result<PDB, Vec<PDBError>> {
664 process_pdb_contents(content)
665}
666
667/// Processes raw PDB content by removing remarks and padding lines
668///
669/// # Arguments
670/// * `content` - Raw PDB content
671///
672/// # Returns
673/// Processed PDB content ready for parsing
674pub fn process_pdb_contents(pdb_content: &str) -> Result<PDB, Vec<PDBError>> {
675 let processed_content = process_pdb_string(pdb_content);
676
677 let mut opts = ReadOptions::new();
678 opts.set_format(pdbtbx::Format::Pdb)
679 .set_level(pdbtbx::StrictnessLevel::Loose);
680
681 let cursor = Cursor::new(processed_content.into_bytes());
682 let reader = BufReader::new(cursor);
683
684 match opts.read_raw(reader) {
685 Ok((pdb, _)) => Ok(pdb),
686 Err(e) => Err(e),
687 }
688}
689
690/// Processes raw PDB content by removing remarks and padding lines
691///
692/// # Arguments
693/// * `content` - Raw PDB content
694///
695/// # Returns
696/// Processed PDB content ready for parsing
697fn process_pdb_string(content: &str) -> String {
698 let mut output = String::with_capacity(content.len());
699
700 // Process lines with single allocation
701 for line in content.lines() {
702 if !line.starts_with("REMARK") {
703 output.push_str(line);
704 if line.len() < 80 {
705 output.push_str(&" ".repeat(80 - line.len()));
706 }
707 output.push('\n');
708 }
709 }
710
711 output
712}
713
714/// Finds the closest residue pairs between different protein chains within a specified distance cutoff.
715///
716/// This function analyzes inter-chain interactions by identifying the closest atom pairs between residues
717/// from different chains, using the alpha carbon (CA) atoms for distance calculation.
718///
719/// # Arguments
720///
721/// * pdb - A reference to a parsed PDB structure.
722/// * cutoff - A floating-point value specifying the maximum distance (in Angstroms) for considering residue pairs.
723///
724/// # Returns
725///
726/// A Vec<Pair> containing the closest residue pairs within the specified cutoff, sorted by distance.
727///
728/// # Functionality
729///
730/// 1. Iterates through all unique chain pairs in the PDB structure.
731/// 2. For each chain pair, finds the closest atoms between residues.
732/// 3. Selects pairs where the inter-residue distance is less than the specified cutoff.
733/// 4. Uses alpha carbon (CA) atoms for distance calculation and pair representation.
734/// 5. Sorts the resulting pairs by their inter-alpha carbon distance.
735///
736/// # Notes
737///
738/// - Only inter-chain interactions are considered.
739/// - The distance is calculated based on the closest atom pairs between residues.
740/// - The resulting pairs are sorted from shortest to longest distance.
741pub fn get_closest_residue_pairs(pdb: &pdbtbx::PDB, cutoff: f64) -> Vec<Pair> {
742 let mut closest_pairs = Vec::new();
743
744 let chains: Vec<_> = pdb.chains().collect();
745
746 for i in 0..chains.len() {
747 for j in (i + 1)..chains.len() {
748 let chain_i = &chains[i];
749 let chain_j = &chains[j];
750
751 for res_i in chain_i.residues() {
752 let mut min_distance = f64::MAX;
753 let mut closest_pair = None;
754
755 for res_j in chain_j.residues() {
756 let mut atom_dist = f64::MAX;
757 for atom_i in res_i.atoms() {
758 for atom_j in res_j.atoms() {
759 let distance = atom_i.distance(atom_j);
760 if distance < atom_dist {
761 atom_dist = distance;
762 }
763 }
764 }
765 if atom_dist < min_distance {
766 min_distance = atom_dist;
767 closest_pair = Some((res_j, res_i));
768 }
769 }
770
771 if min_distance < cutoff
772 && let Some((res_j, res_i)) = closest_pair {
773 let ca_i = res_i.atoms().find(|atom| atom.name() == "CA").unwrap();
774 let ca_j = res_j.atoms().find(|atom| atom.name() == "CA").unwrap();
775 let ca_ca_dist = ca_i.distance(ca_j);
776 closest_pairs.push(Pair {
777 chain_i: chain_i.id().to_string(),
778 res_i: res_i.serial_number(),
779 atom_i: ca_i.name().to_string(),
780 chain_j: chain_j.id().to_string(),
781 res_j: res_j.serial_number(),
782 atom_j: ca_j.name().to_string(),
783 distance: ca_ca_dist,
784 });
785 }
786 }
787 }
788 }
789
790 closest_pairs.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap());
791
792 closest_pairs
793}
794
795/// Finds pairs of atoms between a ligand and nearby protein residues within a certain distance.
796///
797/// This function processes the given PDB structure to identify heteroatoms (ligand atoms) and
798/// their neighboring atoms within a specified distance threshold (5.0 Å). It then finds the closest
799/// protein atom for each ligand atom and returns a vector of `Pair` structures containing information
800/// about the ligand atom and its closest protein atom. The result is a list of ligand-protein pairs
801/// and their corresponding distance, residue, and chain information.
802///
803pub fn gaps_around_ligand(pdb: &PDB) -> Vec<Gap> {
804 let ligand_atoms: Vec<&Atom> = pdb
805 .atoms()
806 .filter(|a| a.hetero() && *a.element().unwrap() != Element::H)
807 .collect();
808
809 let ligand_residues: Vec<&Residue> = pdb
810 .residues()
811 .filter(|r| r.atoms().any(|a| a.hetero()))
812 .collect();
813
814 let neighbors = neighbor_search(pdb.clone(), ligand_residues, 5.0);
815 let protein_atoms = get_atoms_from_resnumbers(pdb, &neighbors);
816
817 let mut result: Vec<Gap> = Vec::new();
818
819 let finder = Finder::new(pdb);
820
821 for ligand_atom in ligand_atoms {
822 let closest_protein_atom = protein_atoms
823 .iter()
824 .filter(|a| *a.element().unwrap() != Element::H)
825 .map(|a| (a, ligand_atom.distance(a)))
826 .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
827 .unwrap();
828
829 let ligand_chain = finder.find_chain_id(ligand_atom);
830 let protein_chain = finder.find_chain_id(closest_protein_atom.0);
831
832 if ligand_chain == protein_chain {
833 result.push(Gap {
834 chain: ligand_chain,
835 res_i: finder.find_residue_number(ligand_atom),
836 atom_i: ligand_atom.name().to_string(),
837 res_j: finder.find_residue_number(closest_protein_atom.0),
838 atom_j: closest_protein_atom.0.name().to_string(),
839 distance: closest_protein_atom.1,
840 })
841 }
842 }
843
844 result
845}
846
847#[cfg(test)]
848mod tests {
849
850 use std::env;
851
852 use tempfile::NamedTempFile;
853
854 use super::*;
855
856 #[test]
857 fn test_get_true_interface() {
858 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
859
860 let observed_true_interface = get_true_interface(&pdb, 5.0);
861
862 let mut expected_true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
863 let chain_a = "A".to_string();
864 let res_a: HashSet<isize> = vec![934, 936, 933, 946, 950, 938, 940, 941, 937, 931]
865 .into_iter()
866 .collect();
867 let chain_b = "B";
868 let res_b: HashSet<isize> = vec![66, 48, 68, 49, 46, 45, 44, 47, 69, 6, 70, 8, 42]
869 .into_iter()
870 .collect();
871
872 expected_true_interface.insert(chain_a, res_a);
873 expected_true_interface.insert(chain_b.to_string(), res_b);
874
875 assert_eq!(observed_true_interface, expected_true_interface);
876 }
877
878 #[test]
879 fn test_load_pdb_short_lines() {
880 let pdb_path = env::current_dir()
881 .unwrap()
882 .join("tests/data/leu_short_lines.pdb");
883 let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
884 assert_eq!(pdb.atoms().count(), 8);
885 }
886
887 #[test]
888 fn test_load_pdb_normal_lines() {
889 let pdb_path = env::current_dir()
890 .unwrap()
891 .join("tests/data/leu_normal_lines.pdb");
892 let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
893 assert_eq!(pdb.atoms().count(), 8);
894 }
895 // Updated test helper function that works with older Rust versions
896 fn create_test_pdb(remarks: usize, atoms: usize) -> String {
897 let mut pdb = String::new();
898
899 // Add remarks
900 for i in 0..remarks {
901 pdb.push_str(&format!("REMARK {} Test remark\n", i));
902 }
903
904 // Add atoms (ATOM records)
905 for i in 0..atoms {
906 pdb.push_str(&format!(
907 "ATOM {:>5} N ALA A{:>4} {:>8}{:>8}{:>8} 1.00 0.00 N\n",
908 i + 1,
909 i + 1,
910 format!("{:.3}", i as f32),
911 format!("{:.3}", i as f32),
912 format!("{:.3}", i as f32)
913 ));
914 }
915
916 pdb
917 }
918
919 #[test]
920 fn test_process_pdb_string_removes_remarks() {
921 let input = "REMARK 1 Test\nATOM 1 N ALA A 1 0.000 0.000 0.000\n";
922 let processed = process_pdb_string(input);
923
924 assert!(!processed.contains("REMARK"));
925 assert!(processed.contains("ATOM"));
926 }
927
928 #[test]
929 fn test_process_pdb_string_pads_lines() {
930 let short_line = "ATOM 1 N ALA A 1 0.000 0.000 0.000";
931 let processed = process_pdb_string(short_line);
932
933 // Check line is padded to 80 chars (including newline)
934 assert_eq!(processed.lines().next().unwrap().len(), 80);
935 assert_eq!(processed.chars().count(), 81); // 80 + newline
936 }
937
938 #[test]
939 fn test_process_pdb_string_preserves_long_lines() {
940 let long_line =
941 "ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N EXTRA";
942 let processed = process_pdb_string(long_line);
943
944 // Line should remain unchanged (except newline)
945 assert_eq!(processed.trim(), long_line);
946 }
947
948 #[test]
949 fn test_process_pdb_contents_valid_pdb() {
950 let pdb_content = create_test_pdb(3, 5);
951 let result = process_pdb_contents(&pdb_content);
952
953 assert!(result.is_ok());
954 let pdb = result.unwrap();
955 assert_eq!(pdb.atoms().count(), 5);
956 }
957
958 #[test]
959 fn test_process_pdb_contents_empty_input() {
960 let result = process_pdb_contents("");
961 assert!(result.is_err());
962
963 if let Err(errors) = result {
964 assert_eq!(errors.len(), 1);
965 }
966 }
967
968 #[test]
969 fn test_process_pdb_contents_invalid_pdb() {
970 let invalid_content = "This is not a PDB file\n";
971 let result = process_pdb_contents(invalid_content);
972
973 assert!(result.is_err());
974 if let Err(errors) = result {
975 assert!(!errors.is_empty());
976 }
977 }
978
979 #[test]
980 fn test_process_pdb_string_empty_input() {
981 let processed = process_pdb_string("");
982 assert_eq!(processed, "");
983 }
984
985 #[test]
986 fn test_process_pdb_string_mixed_content() {
987 let input = "\
988REMARK 1 Test
989ATOM 1 N ALA A 1 0.000 0.000 0.000
990REMARK 2 Another
991ATOM 2 N ALA A 2 1.000 1.000 1.000
992";
993 let processed = process_pdb_string(input);
994 let lines: Vec<&str> = processed.lines().collect();
995
996 assert_eq!(lines.len(), 2); // Only ATOM lines should remain
997 assert!(lines[0].starts_with("ATOM"));
998 assert!(lines[1].starts_with("ATOM"));
999 }
1000
1001 #[test]
1002 fn test_process_pdb_contents_with_real_file() {
1003 // Create a temporary PDB file
1004 let mut file = NamedTempFile::new().unwrap();
1005 let pdb_content = create_test_pdb(2, 3);
1006 write!(file, "{}", pdb_content).unwrap();
1007
1008 // Test with file content
1009 let file_content = std::fs::read_to_string(file.path()).unwrap();
1010 let result = process_pdb_contents(&file_content);
1011
1012 assert!(result.is_ok());
1013 let pdb = result.unwrap();
1014 assert_eq!(pdb.atoms().count(), 3);
1015 }
1016
1017 #[test]
1018 fn test_get_closest_residue_pairs() {
1019 let pdb = load_pdb("tests/data/two_res.pdb").unwrap();
1020 let observed_pairs = get_closest_residue_pairs(&pdb, 5.0);
1021 let expected_pairs = &[Pair {
1022 chain_i: "A".to_string(),
1023 chain_j: "B".to_string(),
1024 atom_i: "CA".to_string(),
1025 atom_j: "CA".to_string(),
1026 res_i: 2,
1027 res_j: 10,
1028 distance: 9.1,
1029 }];
1030 assert_eq!(observed_pairs.len(), expected_pairs.len());
1031 let pair = &observed_pairs[0];
1032 assert_eq!(pair.chain_i, expected_pairs[0].chain_i);
1033 assert_eq!(pair.chain_j, expected_pairs[0].chain_j);
1034 assert_eq!(pair.atom_i, expected_pairs[0].atom_i);
1035 assert_eq!(pair.atom_j, expected_pairs[0].atom_j);
1036 assert_eq!(pair.res_i, expected_pairs[0].res_i);
1037 assert_eq!(pair.res_j, expected_pairs[0].res_j);
1038 assert!((pair.distance - 9.1).abs() < 0.1);
1039 }
1040 #[test]
1041 fn test_gaps_around_ligand() {
1042 let pdb = load_pdb("tests/data/prot_ligand.pdb").unwrap();
1043 let observed = gaps_around_ligand(&pdb);
1044 let expected = &[
1045 Gap {
1046 chain: "E".to_string(),
1047 res_i: 351,
1048 atom_i: "N1".to_string(),
1049 res_j: 123,
1050 atom_j: "N".to_string(),
1051 distance: 3.0400192433601467,
1052 },
1053 Gap {
1054 chain: "E".to_string(),
1055 res_i: 351,
1056 atom_i: "C2".to_string(),
1057 res_j: 327,
1058 atom_j: "CE2".to_string(),
1059 distance: 3.79,
1060 },
1061 ];
1062
1063 // Compare chain, residue, and atom names
1064 assert_eq!(observed[0].chain, expected[0].chain);
1065 assert_eq!(observed[0].res_i, expected[0].res_i);
1066 assert_eq!(observed[0].atom_i, expected[0].atom_i);
1067 assert_eq!(observed[0].res_j, expected[0].res_j);
1068 assert_eq!(observed[0].atom_j, expected[0].atom_j);
1069
1070 assert_eq!(observed[1].chain, expected[1].chain);
1071 assert_eq!(observed[1].res_i, expected[1].res_i);
1072 assert_eq!(observed[1].atom_i, expected[1].atom_i);
1073 assert_eq!(observed[1].res_j, expected[1].res_j);
1074 assert_eq!(observed[1].atom_j, expected[1].atom_j);
1075
1076 // TODO: Find a smart way to check the distances
1077 //
1078 // assert_eq!(observed[0].distance, expected[0].distance);
1079 // assert_eq!(observed[1].distance, expected[1].distance);
1080 }
1081
1082 #[test]
1083 fn test_finder() {
1084 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1085 let finder = Finder::new(&pdb);
1086
1087 assert_eq!(finder.chain_lookup.keys().len(), 924);
1088 assert_eq!(finder.residue_lookup.keys().len(), 924)
1089 }
1090
1091 #[test]
1092 fn test_finder_find_chain_id() {
1093 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1094 let finder = Finder::new(&pdb);
1095
1096 let found = finder.find_chain_id(pdb.atom(1).expect(""));
1097 assert_eq!(found, "A");
1098
1099 let found = finder.find_chain_id(pdb.atom(700).expect(""));
1100 assert_eq!(found, "B");
1101 }
1102
1103 #[test]
1104 fn test_finder_find_residue_number() {
1105 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1106 let finder = Finder::new(&pdb);
1107
1108 let found = finder.find_residue_number(pdb.atom(1).unwrap());
1109
1110 assert_eq!(found, 929)
1111 }
1112
1113 #[test]
1114 fn test_filter_unique_by_atom_j() {
1115 let gaps = vec![
1116 Gap {
1117 chain: "A".to_string(),
1118 atom_i: "C".to_string(),
1119 atom_j: "B".to_string(),
1120 res_i: 1,
1121 res_j: 2,
1122 distance: 3.5,
1123 },
1124 Gap {
1125 chain: "A".to_string(),
1126 atom_i: "A".to_string(),
1127 atom_j: "B".to_string(),
1128 res_i: 3,
1129 res_j: 4,
1130 distance: 2.8,
1131 },
1132 Gap {
1133 chain: "B".to_string(),
1134 atom_i: "C".to_string(),
1135 atom_j: "D".to_string(),
1136 res_i: 5,
1137 res_j: 6,
1138 distance: 4.1,
1139 },
1140 Gap {
1141 chain: "A".to_string(),
1142 atom_i: "E".to_string(),
1143 atom_j: "B".to_string(),
1144 res_i: 7,
1145 res_j: 8,
1146 distance: 1.5,
1147 },
1148 ];
1149
1150 let filtered = filter_unique_by_atom_j(gaps);
1151
1152 // The function should remove duplicates based on `atom_j` and keep only the first occurrence.
1153 assert_eq!(filtered.len(), 2); // "B" and "D" should remain
1154 assert_eq!(filtered[0].atom_j, "B");
1155 assert_eq!(filtered[1].atom_j, "D");
1156
1157 // Verify the correct `atom_j` is retained
1158 assert!(
1159 filtered
1160 .iter()
1161 .all(|gap| gap.atom_j != "B" || gap.atom_i == "C")
1162 );
1163 }
1164
1165 #[test]
1166 fn test_no_duplicates() {
1167 let gaps = vec![
1168 Gap {
1169 chain: "A".to_string(),
1170 atom_i: "C".to_string(),
1171 atom_j: "B".to_string(),
1172 res_i: 1,
1173 res_j: 2,
1174 distance: 3.5,
1175 },
1176 Gap {
1177 chain: "A".to_string(),
1178 atom_i: "A".to_string(),
1179 atom_j: "D".to_string(),
1180 res_i: 3,
1181 res_j: 4,
1182 distance: 2.8,
1183 },
1184 ];
1185
1186 let filtered = filter_unique_by_atom_j(gaps);
1187 assert_eq!(filtered.len(), 2); // No duplicates, so the length should be the same
1188 }
1189
1190 #[test]
1191 fn test_empty_input() {
1192 let gaps: Vec<Gap> = Vec::new();
1193 let filtered = filter_unique_by_atom_j(gaps);
1194 assert!(filtered.is_empty()); // Should return an empty list
1195 }
1196
1197 #[test]
1198 fn test_single_element() {
1199 let gaps = vec![Gap {
1200 chain: "A".to_string(),
1201 atom_i: "C".to_string(),
1202 atom_j: "B".to_string(),
1203 res_i: 1,
1204 res_j: 2,
1205 distance: 3.5,
1206 }];
1207
1208 let filtered = filter_unique_by_atom_j(gaps);
1209 assert_eq!(filtered.len(), 1); // Single element, should remain
1210 assert_eq!(filtered[0].atom_j, "B");
1211 }
1212}