haddock_restraints/core/structure.rs
1use core::f64;
2use std::collections::{HashMap, HashSet};
3
4use itertools::Itertools;
5use kd_tree::KdTree;
6use nalgebra::Vector3;
7use pdbtbx::{Atom, Element, PDB, ReadOptions};
8use pdbtbx::{PDBError, Residue};
9use rand::SeedableRng;
10use rand::prelude::IndexedRandom;
11use rand::rngs::StdRng;
12use std::fs::File;
13use std::io::BufReader;
14use std::io::Cursor;
15use std::io::Write;
16
17#[derive(Clone)]
18pub struct Bead {
19 pub position: Vector3<f64>,
20}
21
22#[derive(Debug, Clone, PartialEq)]
23pub struct Pair {
24 pub chain_i: String,
25 pub res_i: isize,
26 pub atom_i: String,
27 pub chain_j: String,
28 pub res_j: isize,
29 pub atom_j: String,
30 pub distance: f64,
31}
32
33/// The `Finder` struct contains two hashmaps to quickly retrieve the chain ID and residue number associated with a given atom by its serial number. These lookups are initialized from the provided PDB file during instantiation.
34///
35struct Finder {
36 chain_lookup: HashMap<usize, String>,
37 residue_lookup: HashMap<usize, isize>,
38}
39
40impl Finder {
41 /// Creates a new `Finder` instance.
42 ///
43 /// This method initializes the `Finder` struct by extracting the chain and residue information from the provided `PDB` reference. It constructs two hashmaps: `chain_lookup` (for mapping atom serial numbers to chain IDs) and `residue_lookup` (for mapping atom serial numbers to residue serial numbers).
44 ///
45 fn new(pdb: &PDB) -> Self {
46 let chain_lookup = pdb
47 .chains()
48 .flat_map(|chain| {
49 let chain_id = chain.id().to_string();
50 chain.residues().flat_map(move |residue| {
51 residue.atoms().map({
52 let v = chain_id.clone();
53 move |atom| (atom.serial_number(), v.clone())
54 })
55 })
56 })
57 .collect();
58
59 let residue_lookup = pdb
60 .chains()
61 .flat_map(|chain| {
62 chain.residues().flat_map(|residue| {
63 let res_num = residue.serial_number();
64 residue
65 .atoms()
66 .map(move |atom| (atom.serial_number(), res_num))
67 })
68 })
69 .collect();
70
71 Finder {
72 chain_lookup,
73 residue_lookup,
74 }
75 }
76
77 /// Finds the chain ID associated with a given atom.
78 ///
79 /// This method looks up the chain ID for an atom based on its serial number. If no chain is found, the default value "A" is returned.
80 ///
81 fn find_chain_id(&self, atom: &Atom) -> String {
82 self.chain_lookup
83 .get(&atom.serial_number())
84 .unwrap_or(&"A".to_string())
85 .clone()
86 }
87
88 /// Finds the residue serial number associated with a given atom.
89 ///
90 /// This method looks up the residue serial number for an atom based on its serial number. If no residue number is found, the default value `0` is returned.
91 ///
92 fn find_residue_number(&self, atom: &Atom) -> isize {
93 *self.residue_lookup.get(&atom.serial_number()).unwrap_or(&0)
94 }
95}
96
97/// Performs a neighbor search for residues within a specified radius of target residues.
98///
99/// This function uses a K-d tree to efficiently find residues that are within a given radius
100/// of any atom in the target residues.
101///
102/// # Arguments
103///
104/// * `pdb` - A `pdbtbx::PDB` structure representing the entire protein.
105/// * `target_residues_numbers` - A vector of references to `pdbtbx::Residue` objects representing the target residues.
106/// * `radius` - A `f64` value specifying the search radius in Angstroms.
107///
108/// # Returns
109///
110/// A `Vec<isize>` containing the sorted serial numbers of residues that are within the specified
111/// radius of any atom in the target residues, excluding the target residues themselves.
112///
113/// # Algorithm
114///
115/// 1. Constructs a K-d tree from all atom coordinates in the PDB structure.
116/// 2. For each atom in the target residues:
117/// a. Performs a radius search in the K-d tree.
118/// b. Identifies the residues corresponding to the atoms found in the search.
119/// 3. Removes the target residues from the result set.
120/// 4. Sorts the resulting residue serial numbers.
121///
122/// # Notes
123///
124/// - The function uses the `KdTree` data structure for efficient spatial searching.
125/// - The current implementation may have performance bottlenecks in the atom-to-residue mapping step.
126/// - The function assumes that atom coordinates are unique for identification purposes.
127///
128/// # TODO
129///
130/// - Optimize the atom-to-residue mapping step for better performance.
131pub fn neighbor_search(
132 pdb: pdbtbx::PDB,
133 target_residues_numbers: Vec<&pdbtbx::Residue>,
134 radius: f64,
135) -> Vec<isize> {
136 let points: Vec<[f64; 3]> = pdb
137 .atoms()
138 .map(|atom| {
139 let x = atom.x();
140 let y = atom.y();
141 let z = atom.z();
142 [x, y, z]
143 })
144 .collect();
145
146 let kdtree = KdTree::build_by_ordered_float(points.clone());
147
148 // Find the coordinates of each target residue
149 // let mut neighbors: Vec<&Atom> = Vec::new();
150 let mut result: HashSet<isize> = HashSet::new();
151 for residue in &target_residues_numbers {
152 let query_vec: Vec<[f64; 3]> = residue
153 .atoms()
154 .map(|atom| {
155 let x = atom.x();
156 let y = atom.y();
157 let z = atom.z();
158 [x, y, z]
159 })
160 .collect();
161
162 for query in query_vec {
163 let found_vec = kdtree.within_radius(&query, radius);
164 for found in found_vec {
165 // Loop over the whole PDB and find an atom that matches the found point
166 // FIXME: Optimize this
167 for residue in pdb.residues() {
168 for atom in residue.atoms() {
169 let x = atom.x();
170 let y = atom.y();
171 let z = atom.z();
172 if x == found[0] && y == found[1] && z == found[2] {
173 result.insert(residue.serial_number());
174 }
175 }
176 }
177 }
178 }
179 }
180 // Remove the target residues from the result
181 for residue in target_residues_numbers {
182 result.remove(&residue.serial_number());
183 }
184
185 // Sort the result
186 let mut result: Vec<isize> = result.into_iter().collect();
187 result.sort();
188
189 result
190}
191
192/// Retrieves specific residues from a PDB structure based on their serial numbers.
193///
194/// # Arguments
195///
196/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
197/// * `target` - A vector of `isize` values representing the serial numbers of the residues to retrieve.
198///
199/// # Returns
200///
201/// A `Vec<&Residue>` containing references to the residues whose serial numbers match those in the `target` vector.
202///
203/// # Notes
204///
205/// - This function performs a linear search through all residues in the PDB structure.
206/// - The order of residues in the result vector matches the order they appear in the PDB structure,
207/// not necessarily the order of serial numbers in the `target` vector.
208/// - If a serial number in `target` doesn't match any residue in the PDB, it is simply ignored.
209pub fn get_residues(pdb: &pdbtbx::PDB, target: Vec<isize>) -> Vec<&Residue> {
210 let mut result: Vec<&Residue> = Vec::new();
211 for residue in pdb.residues() {
212 if target.contains(&residue.serial_number()) {
213 result.push(residue);
214 }
215 }
216 result
217}
218
219/// Identifies the true interface residues between chains in a PDB structure.
220///
221/// This function determines interface residues by finding pairs of residues from different chains
222/// that have atoms within a specified cutoff distance of each other.
223///
224/// # Arguments
225///
226/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
227/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
228/// for residues to be considered part of the interface.
229///
230/// # Returns
231///
232/// A `HashMap<String, HashSet<isize>>` where:
233/// - The keys are chain identifiers (as strings).
234/// - The values are `HashSet`s of residue serial numbers (as `isize`) that are part of the interface for that chain.
235///
236/// # Algorithm
237///
238/// 1. Iterates over all pairs of chains in the PDB structure.
239/// 2. For each pair of chains, compares all residues between the two chains.
240/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
241/// 4. If atoms are within the cutoff, both residues are added to the interface set for their respective chains.
242///
243/// # Notes
244///
245/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
246/// - The cutoff is applied to atom-atom distances, not residue-residue distances.
247/// - A residue is considered part of the interface if any of its atoms are within the cutoff of any atom in a residue from another chain.
248/// - The function does not distinguish between different types of atoms or residues.
249pub fn get_true_interface(pdb: &pdbtbx::PDB, cutoff: f64) -> HashMap<String, HashSet<isize>> {
250 let mut true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
251 for chain_i in pdb.chains() {
252 for chain_j in pdb.chains() {
253 if chain_i.id() == chain_j.id() {
254 continue;
255 }
256 for res_i in chain_i.residues() {
257 for res_j in chain_j.residues() {
258 // Check if any of the atoms are touching each other
259 for atom_i in res_i.atoms() {
260 for atom_j in res_j.atoms() {
261 let distance = atom_i.distance(atom_j);
262 if distance < cutoff {
263 true_interface
264 .entry(chain_i.id().to_string())
265 .or_default()
266 .insert(res_i.serial_number());
267 true_interface
268 .entry(chain_j.id().to_string())
269 .or_default()
270 .insert(res_j.serial_number());
271 break;
272 }
273 }
274 }
275 }
276 }
277 }
278 }
279 true_interface
280}
281
282/// Identifies pairs of chains in a PDB structure that are in contact with each other.
283///
284/// This function determines which chains are in contact by finding pairs of residues from different chains
285/// that have atoms within a specified cutoff distance of each other.
286///
287/// # Arguments
288///
289/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
290/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
291/// for chains to be considered in contact.
292///
293/// # Returns
294///
295/// A `HashSet<(String, String)>` where each tuple represents a pair of chain identifiers that are in contact.
296/// The order of chain identifiers in each tuple is arbitrary.
297///
298/// # Algorithm
299///
300/// 1. Iterates over all pairs of chains in the PDB structure.
301/// 2. For each pair of chains, compares all residues between the two chains.
302/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
303/// 4. If atoms are within the cutoff, the pair of chain identifiers is added to the result set.
304///
305/// # Notes
306///
307/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
308/// - The cutoff is applied to atom-atom distances, not residue-residue or chain-chain distances.
309/// - A pair of chains is considered in contact if any atom from one chain is within the cutoff distance of any atom from the other chain.
310/// - The function does not distinguish between different types of atoms or residues.
311/// - Each pair of chains appears only once in the result set, regardless of the number of contacts between them.
312pub fn get_chains_in_contact(pdb: &pdbtbx::PDB, cutoff: f64) -> HashSet<(String, String)> {
313 let mut chains_in_contact: HashSet<(String, String)> = HashSet::new();
314
315 for chain_i in pdb.chains() {
316 for chain_j in pdb.chains() {
317 if chain_i.id() == chain_j.id() {
318 continue;
319 }
320 for res_i in chain_i.residues() {
321 for res_j in chain_j.residues() {
322 // Check if any of the atoms are touching each other
323 for atom_i in res_i.atoms() {
324 for atom_j in res_j.atoms() {
325 let distance = atom_i.distance(atom_j);
326 if distance < cutoff {
327 chains_in_contact
328 .insert((chain_i.id().to_string(), chain_j.id().to_string()));
329 break;
330 }
331 }
332 }
333 }
334 }
335 }
336 }
337 chains_in_contact
338}
339
340/// Represents two residues in a protein chain.
341///
342/// A `Gap` is defined by two residues that are sequential in the primary structure
343/// but have a distance between their atoms that exceeds a certain threshold in the
344/// tertiary structure.
345#[derive(Debug)]
346pub struct Gap {
347 /// The chain identifier where the gap is located.
348 pub chain: String,
349
350 /// The name of the atom in the first residue used for distance calculation.
351 pub atom_i: String,
352
353 /// The name of the atom in the second residue used for distance calculation.
354 pub atom_j: String,
355
356 /// The sequence number of the first residue.
357 pub res_i: isize,
358
359 /// The sequence number of the second residue.
360 pub res_j: isize,
361
362 /// The distance between the specified atoms of the two residues.
363 pub distance: f64,
364}
365
366/// Filters a vector of `Gap` elements, retaining only the unique elements based on the `atom_j` value.
367///
368/// This function removes duplicate `Gap` elements where the `atom_j` value is the same across multiple entries,
369/// keeping only the first occurrence of each unique `atom_j`. This is useful when you only need to retain
370/// one `Gap` entry for each unique `atom_j` regardless of the `atom_i` or other fields.
371///
372pub fn filter_unique_by_atom_j(gaps: Vec<Gap>) -> Vec<Gap> {
373 let mut seen = HashSet::new();
374 let mut unique: Vec<Gap> = Vec::new();
375
376 gaps.into_iter().for_each(|g| {
377 if !seen.contains(&g.atom_j) {
378 seen.insert(g.atom_j.clone());
379 unique.push(g)
380 }
381 });
382
383 unique
384}
385
386/// Identifies separate bodies within a protein structure based on CA atom distances.
387///
388/// This function segments the protein into separate "bodies" by analyzing the distances
389/// between consecutive CA (alpha carbon) atoms in each chain. A new body is defined when
390/// the distance between consecutive CA atoms exceeds 4 Angstroms.
391///
392/// # Arguments
393///
394/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
395///
396/// # Returns
397///
398/// A `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` where:
399/// - The key is a body identifier (starting from 0).
400/// - The value is a vector of tuples, each containing:
401/// - The residue serial number (`isize`)
402/// - The chain identifier (`&str`)
403/// - A reference to the CA atom (`&pdbtbx::Atom`)
404///
405/// # Algorithm
406///
407/// 1. Extracts all CA atoms from the PDB structure, along with their chain and residue information.
408/// 2. Iterates through consecutive CA atoms within each chain.
409/// 3. If the distance between consecutive CA atoms exceeds 4 Angstroms, a new body is started.
410/// 4. Groups CA atoms into bodies based on this distance criterion.
411///
412/// # Notes
413///
414/// - This function only considers CA atoms for body identification.
415/// - The 4 Angstrom cutoff is hard-coded and may not be suitable for all use cases.
416/// - Bodies are numbered sequentially starting from 0.
417/// - Gaps between chains always result in a new body, regardless of spatial proximity.
418/// - The function assumes that CA atoms within a chain are listed in sequence order.
419pub fn find_bodies(pdb: &pdbtbx::PDB) -> HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> {
420 // Check if the distance of a given atom to its next one is higher than 4A
421
422 // Get only the `CA` atoms
423 let mut ca_atoms: Vec<(&str, isize, &pdbtbx::Atom)> = Vec::new();
424 pdb.chains().for_each(|chain| {
425 chain.residues().for_each(|residue| {
426 residue.atoms().for_each(|atom| {
427 if atom.name() == "CA" {
428 ca_atoms.push((chain.id(), residue.serial_number(), atom));
429 }
430 });
431 });
432 });
433 let mut bodies: HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> = HashMap::new();
434 let mut body_id = 0;
435 for (i, j) in ca_atoms.iter().zip(ca_atoms.iter().skip(1)) {
436 let (chain_i, res_i, atom_i) = i;
437 let (chain_j, _res_j, atom_j) = j;
438 if chain_i != chain_j {
439 continue;
440 }
441 let distance = atom_i.distance(atom_j);
442 if distance > 4.0 {
443 body_id += 1;
444 }
445 bodies
446 .entry(body_id)
447 .or_default()
448 .push((*res_i, chain_i, atom_i));
449 }
450
451 bodies
452}
453
454/// Creates a list of gaps between different bodies in a protein structure.
455///
456/// This function analyzes the spatial relationships between separate bodies of a protein
457/// structure and generates a list of `Gap` instances representing the distances between
458/// randomly selected pairs of atoms from different bodies.
459///
460/// # Arguments
461///
462/// * `bodies` - A reference to a `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` representing
463/// the bodies of the protein structure, as returned by the `find_bodies` function.
464///
465/// # Returns
466///
467/// A `Vec<Gap>` containing `Gap` instances that represent the distances between pairs of
468/// atoms from different bodies.
469///
470/// # Algorithm
471///
472/// 1. Initializes a random number generator with a fixed seed (42) for reproducibility.
473/// 2. Iterates over all unique pairs of bodies.
474/// 3. For each pair of bodies with at least 2 atoms each:
475/// a. Randomly selects 2 atoms from each body.
476/// b. Creates 2 `Gap` instances, one for each pair of selected atoms.
477/// 4. Collects all created `Gap` instances into a vector.
478///
479/// # Notes
480///
481/// - The function uses a fixed random seed (42) for reproducibility. This means that
482/// for the same input, it will always produce the same output.
483/// - Only bodies with at least 2 atoms are considered for gap creation.
484/// - The function creates 2 gaps for each pair of bodies, using different randomly selected atoms.
485/// - The `Gap` instances contain information about the chain, atom names, residue numbers,
486/// and the distance between the selected atoms.
487/// - This function assumes that the input `bodies` HashMap is structured as expected,
488/// with each value being a vector of tuples (residue number, chain ID, atom reference).
489///
490/// # Safety
491///
492/// This function uses `unwrap()` on the results of `choose_multiple()`. It assumes that
493/// the bodies contain at least 2 atoms each, as checked earlier in the function.
494pub fn create_iter_body_gaps(
495 bodies: &HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>,
496) -> Vec<Gap> {
497 let mut rng = StdRng::seed_from_u64(42);
498 let mut pairs: Vec<Gap> = Vec::new();
499 let mut body_ids: Vec<isize> = bodies.keys().cloned().collect();
500 body_ids.sort();
501 for (i, &body_id1) in body_ids.iter().enumerate() {
502 for &body_id2 in body_ids[i + 1..].iter() {
503 if let (Some(atoms1), Some(atoms2)) = (bodies.get(&body_id1), bodies.get(&body_id2)) {
504 if atoms1.len() >= 2 && atoms2.len() >= 2 {
505 let selected1: Vec<_> = atoms1.choose_multiple(&mut rng, 2).cloned().collect();
506 let selected2: Vec<_> = atoms2.choose_multiple(&mut rng, 2).cloned().collect();
507
508 for i in 0..2 {
509 pairs.push(Gap {
510 chain: selected1[i].1.to_string(),
511 atom_i: selected1[i].2.name().to_string(),
512 atom_j: selected2[i].2.name().to_string(),
513 res_i: selected1[i].0,
514 res_j: selected2[i].0,
515 distance: selected1[i].2.distance(selected2[i].2),
516 });
517 }
518 }
519 }
520 }
521 }
522
523 pairs
524}
525
526// pub fn calculate_z_axis(selection1: &[pdbtbx::Atom], selection2: &[pdbtbx::Atom]) -> Vector3<f64> {
527// let center1 = calculate_geometric_center(selection1);
528// let center2 = calculate_geometric_center(selection2);
529// (center2 - center1).normalize()
530// }
531
532pub fn calculate_geometric_center(atoms: &[pdbtbx::Atom]) -> Vector3<f64> {
533 let mut center = Vector3::zeros();
534 for atom in atoms {
535 center += Vector3::new(atom.x(), atom.y(), atom.z());
536 }
537 center / (atoms.len() as f64)
538}
539
540// pub fn generate_axis_beads(start_z: f64, end_z: f64, num_beads: usize) -> Vec<Bead> {
541// let mut beads = Vec::with_capacity(num_beads);
542// let length = end_z - start_z;
543
544// for i in 0..num_beads {
545// let t = (i as f64) / ((num_beads - 1) as f64);
546// let z = start_z + t * length;
547// let position = Vector3::new(0.0, 0.0, z);
548// beads.push(Bead { position });
549// }
550// beads
551// }
552
553pub fn generate_grid_beads(center_z: f64, grid_size: usize, spacing: f64) -> Vec<Bead> {
554 let mut beads = Vec::with_capacity(grid_size * grid_size);
555
556 let grid_offset = (grid_size as f64 - 1.0) * spacing / 2.0;
557
558 for i in 0..grid_size {
559 for j in 0..grid_size {
560 let x = (i as f64) * spacing - grid_offset;
561 let y = (j as f64) * spacing - grid_offset;
562
563 let final_pos = Vector3::new(x, y, center_z);
564 beads.push(Bead {
565 position: final_pos,
566 });
567 }
568 }
569
570 beads
571}
572
573pub fn write_beads_pdb(beads: &[Bead], filename: &str) -> std::io::Result<()> {
574 let mut file = File::create(filename)?;
575
576 for (i, bead) in beads.iter().enumerate() {
577 writeln!(
578 file,
579 "{:<6}{:>5} {:<4}{:1}{:<3} {:1}{:>4}{:1} {:>8.3}{:>8.3}{:>8.3}{:>6.2}{:>6.2} {:>2}{:>2}",
580 "ATOM",
581 i + 1,
582 "SHA",
583 " ",
584 "SHA",
585 "S",
586 i + 1,
587 " ",
588 bead.position.x,
589 bead.position.y,
590 bead.position.z,
591 1.00,
592 1.00,
593 "H",
594 ""
595 )?;
596 }
597 writeln!(file, "END")?;
598
599 Ok(())
600}
601
602pub fn find_furthest_selections(selections: &[Vec<isize>], pdb: &PDB) -> (Vec<Atom>, Vec<Atom>) {
603 let sele: HashMap<usize, (Vec<Atom>, Vector3<f64>)> = selections
604 .iter()
605 .enumerate()
606 .map(|(i, sel)| {
607 let atoms = get_atoms_from_resnumbers(pdb, sel);
608 let center = calculate_geometric_center(&atoms);
609 (i, (atoms, center))
610 })
611 .collect();
612
613 // Find the two selections that are the furthest apart
614 let mut max_distance = 0.0;
615 let mut atoms1 = Vec::new();
616 let mut atoms2 = Vec::new();
617 for (i, j) in (0..selections.len()).tuple_combinations() {
618 let distance = (sele[&i].1 - sele[&j].1).norm();
619 if distance > max_distance {
620 max_distance = distance;
621 atoms1 = sele[&i].0.clone();
622 atoms2 = sele[&j].0.clone();
623 }
624 }
625
626 (atoms1, atoms2)
627}
628
629pub fn get_atoms_from_resnumbers(pdb: &PDB, selection: &[isize]) -> Vec<Atom> {
630 pdb.residues()
631 .filter(|res| selection.contains(&res.serial_number()))
632 .flat_map(|res| res.atoms())
633 .cloned() // This clones each &Atom into an owned Atom
634 .collect()
635}
636
637/// Loads a PDB structure from a file path
638///
639/// # Arguments
640/// * `input_pdb` - Path to the PDB file
641///
642/// # Returns
643/// Result containing either the parsed PDB or a list of errors
644pub fn load_pdb(input_pdb: &str) -> Result<PDB, Vec<PDBError>> {
645 std::fs::read_to_string(input_pdb)
646 .map_err(|e| {
647 vec![PDBError::new(
648 pdbtbx::ErrorLevel::GeneralWarning,
649 "File read error",
650 format!("Failed to read file {}: {}", input_pdb, e),
651 pdbtbx::Context::None,
652 )]
653 })
654 .and_then(|content| process_pdb_contents(&content))
655}
656
657/// Loads a PDB structure from string content
658///
659/// # Arguments
660/// * `content` - PDB file contents as a string
661///
662/// # Returns
663/// Result containing either the parsed PDB or a list of errors
664pub fn load_pdb_from_content(content: &str) -> Result<PDB, Vec<PDBError>> {
665 process_pdb_contents(content)
666}
667
668/// Processes raw PDB content by removing remarks and padding lines
669///
670/// # Arguments
671/// * `content` - Raw PDB content
672///
673/// # Returns
674/// Processed PDB content ready for parsing
675pub fn process_pdb_contents(pdb_content: &str) -> Result<PDB, Vec<PDBError>> {
676 let processed_content = process_pdb_string(pdb_content);
677
678 let mut opts = ReadOptions::new();
679 opts.set_format(pdbtbx::Format::Pdb)
680 .set_level(pdbtbx::StrictnessLevel::Loose);
681
682 let cursor = Cursor::new(processed_content.into_bytes());
683 let reader = BufReader::new(cursor);
684
685 match opts.read_raw(reader) {
686 Ok((pdb, _)) => Ok(pdb),
687 Err(e) => Err(e),
688 }
689}
690
691/// Processes raw PDB content by removing remarks and padding lines
692///
693/// # Arguments
694/// * `content` - Raw PDB content
695///
696/// # Returns
697/// Processed PDB content ready for parsing
698fn process_pdb_string(content: &str) -> String {
699 let mut output = String::with_capacity(content.len());
700
701 // Process lines with single allocation
702 for line in content.lines() {
703 if !line.starts_with("REMARK") {
704 output.push_str(line);
705 if line.len() < 80 {
706 output.push_str(&" ".repeat(80 - line.len()));
707 }
708 output.push('\n');
709 }
710 }
711
712 output
713}
714
715/// Finds the closest residue pairs between different protein chains within a specified distance cutoff.
716///
717/// This function analyzes inter-chain interactions by identifying the closest atom pairs between residues
718/// from different chains, using the alpha carbon (CA) atoms for distance calculation.
719///
720/// # Arguments
721///
722/// * pdb - A reference to a parsed PDB structure.
723/// * cutoff - A floating-point value specifying the maximum distance (in Angstroms) for considering residue pairs.
724///
725/// # Returns
726///
727/// A Vec<Pair> containing the closest residue pairs within the specified cutoff, sorted by distance.
728///
729/// # Functionality
730///
731/// 1. Iterates through all unique chain pairs in the PDB structure.
732/// 2. For each chain pair, finds the closest atoms between residues.
733/// 3. Selects pairs where the inter-residue distance is less than the specified cutoff.
734/// 4. Uses alpha carbon (CA) atoms for distance calculation and pair representation.
735/// 5. Sorts the resulting pairs by their inter-alpha carbon distance.
736///
737/// # Notes
738///
739/// - Only inter-chain interactions are considered.
740/// - The distance is calculated based on the closest atom pairs between residues.
741/// - The resulting pairs are sorted from shortest to longest distance.
742pub fn get_closest_residue_pairs(pdb: &pdbtbx::PDB, cutoff: f64) -> Vec<Pair> {
743 let mut closest_pairs = Vec::new();
744
745 let chains: Vec<_> = pdb.chains().collect();
746
747 for i in 0..chains.len() {
748 for j in (i + 1)..chains.len() {
749 let chain_i = &chains[i];
750 let chain_j = &chains[j];
751
752 for res_i in chain_i.residues() {
753 let mut min_distance = f64::MAX;
754 let mut closest_pair = None;
755
756 for res_j in chain_j.residues() {
757 let mut atom_dist = f64::MAX;
758 for atom_i in res_i.atoms() {
759 for atom_j in res_j.atoms() {
760 let distance = atom_i.distance(atom_j);
761 if distance < atom_dist {
762 atom_dist = distance;
763 }
764 }
765 }
766 if atom_dist < min_distance {
767 min_distance = atom_dist;
768 closest_pair = Some((res_j, res_i));
769 }
770 }
771
772 if min_distance < cutoff {
773 if let Some((res_j, res_i)) = closest_pair {
774 let ca_i = res_i.atoms().find(|atom| atom.name() == "CA").unwrap();
775 let ca_j = res_j.atoms().find(|atom| atom.name() == "CA").unwrap();
776 let ca_ca_dist = ca_i.distance(ca_j);
777 closest_pairs.push(Pair {
778 chain_i: chain_i.id().to_string(),
779 res_i: res_i.serial_number(),
780 atom_i: ca_i.name().to_string(),
781 chain_j: chain_j.id().to_string(),
782 res_j: res_j.serial_number(),
783 atom_j: ca_j.name().to_string(),
784 distance: ca_ca_dist,
785 });
786 }
787 }
788 }
789 }
790 }
791
792 closest_pairs.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap());
793
794 closest_pairs
795}
796
797/// Finds pairs of atoms between a ligand and nearby protein residues within a certain distance.
798///
799/// This function processes the given PDB structure to identify heteroatoms (ligand atoms) and
800/// their neighboring atoms within a specified distance threshold (5.0 Å). It then finds the closest
801/// protein atom for each ligand atom and returns a vector of `Pair` structures containing information
802/// about the ligand atom and its closest protein atom. The result is a list of ligand-protein pairs
803/// and their corresponding distance, residue, and chain information.
804///
805pub fn gaps_around_ligand(pdb: &PDB) -> Vec<Gap> {
806 let ligand_atoms: Vec<&Atom> = pdb
807 .atoms()
808 .filter(|a| a.hetero() && *a.element().unwrap() != Element::H)
809 .collect();
810
811 let ligand_residues: Vec<&Residue> = pdb
812 .residues()
813 .filter(|r| r.atoms().any(|a| a.hetero()))
814 .collect();
815
816 let neighbors = neighbor_search(pdb.clone(), ligand_residues, 5.0);
817 let protein_atoms = get_atoms_from_resnumbers(pdb, &neighbors);
818
819 let mut result: Vec<Gap> = Vec::new();
820
821 let finder = Finder::new(pdb);
822
823 for ligand_atom in ligand_atoms {
824 let closest_protein_atom = protein_atoms
825 .iter()
826 .filter(|a| *a.element().unwrap() != Element::H)
827 .map(|a| (a, ligand_atom.distance(a)))
828 .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
829 .unwrap();
830
831 let ligand_chain = finder.find_chain_id(ligand_atom);
832 let protein_chain = finder.find_chain_id(closest_protein_atom.0);
833
834 if ligand_chain == protein_chain {
835 result.push(Gap {
836 chain: ligand_chain,
837 res_i: finder.find_residue_number(ligand_atom),
838 atom_i: ligand_atom.name().to_string(),
839 res_j: finder.find_residue_number(closest_protein_atom.0),
840 atom_j: closest_protein_atom.0.name().to_string(),
841 distance: closest_protein_atom.1,
842 })
843 }
844 }
845
846 result
847}
848
849#[cfg(test)]
850mod tests {
851
852 use std::env;
853
854 use tempfile::NamedTempFile;
855
856 use super::*;
857
858 #[test]
859 fn test_get_true_interface() {
860 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
861
862 let observed_true_interface = get_true_interface(&pdb, 5.0);
863
864 let mut expected_true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
865 let chain_a = "A".to_string();
866 let res_a: HashSet<isize> = vec![934, 936, 933, 946, 950, 938, 940, 941, 937, 931]
867 .into_iter()
868 .collect();
869 let chain_b = "B";
870 let res_b: HashSet<isize> = vec![66, 48, 68, 49, 46, 45, 44, 47, 69, 6, 70, 8, 42]
871 .into_iter()
872 .collect();
873
874 expected_true_interface.insert(chain_a, res_a);
875 expected_true_interface.insert(chain_b.to_string(), res_b);
876
877 assert_eq!(observed_true_interface, expected_true_interface);
878 }
879
880 #[test]
881 fn test_load_pdb_short_lines() {
882 let pdb_path = env::current_dir()
883 .unwrap()
884 .join("tests/data/leu_short_lines.pdb");
885 let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
886 assert_eq!(pdb.atoms().count(), 8);
887 }
888
889 #[test]
890 fn test_load_pdb_normal_lines() {
891 let pdb_path = env::current_dir()
892 .unwrap()
893 .join("tests/data/leu_normal_lines.pdb");
894 let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
895 assert_eq!(pdb.atoms().count(), 8);
896 }
897 // Updated test helper function that works with older Rust versions
898 fn create_test_pdb(remarks: usize, atoms: usize) -> String {
899 let mut pdb = String::new();
900
901 // Add remarks
902 for i in 0..remarks {
903 pdb.push_str(&format!("REMARK {} Test remark\n", i));
904 }
905
906 // Add atoms (ATOM records)
907 for i in 0..atoms {
908 pdb.push_str(&format!(
909 "ATOM {:>5} N ALA A{:>4} {:>8}{:>8}{:>8} 1.00 0.00 N\n",
910 i + 1,
911 i + 1,
912 format!("{:.3}", i as f32),
913 format!("{:.3}", i as f32),
914 format!("{:.3}", i as f32)
915 ));
916 }
917
918 pdb
919 }
920
921 #[test]
922 fn test_process_pdb_string_removes_remarks() {
923 let input = "REMARK 1 Test\nATOM 1 N ALA A 1 0.000 0.000 0.000\n";
924 let processed = process_pdb_string(input);
925
926 assert!(!processed.contains("REMARK"));
927 assert!(processed.contains("ATOM"));
928 }
929
930 #[test]
931 fn test_process_pdb_string_pads_lines() {
932 let short_line = "ATOM 1 N ALA A 1 0.000 0.000 0.000";
933 let processed = process_pdb_string(short_line);
934
935 // Check line is padded to 80 chars (including newline)
936 assert_eq!(processed.lines().next().unwrap().len(), 80);
937 assert_eq!(processed.chars().count(), 81); // 80 + newline
938 }
939
940 #[test]
941 fn test_process_pdb_string_preserves_long_lines() {
942 let long_line =
943 "ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N EXTRA";
944 let processed = process_pdb_string(long_line);
945
946 // Line should remain unchanged (except newline)
947 assert_eq!(processed.trim(), long_line);
948 }
949
950 #[test]
951 fn test_process_pdb_contents_valid_pdb() {
952 let pdb_content = create_test_pdb(3, 5);
953 let result = process_pdb_contents(&pdb_content);
954
955 assert!(result.is_ok());
956 let pdb = result.unwrap();
957 assert_eq!(pdb.atoms().count(), 5);
958 }
959
960 #[test]
961 fn test_process_pdb_contents_empty_input() {
962 let result = process_pdb_contents("");
963 assert!(result.is_err());
964
965 if let Err(errors) = result {
966 assert_eq!(errors.len(), 1);
967 }
968 }
969
970 #[test]
971 fn test_process_pdb_contents_invalid_pdb() {
972 let invalid_content = "This is not a PDB file\n";
973 let result = process_pdb_contents(invalid_content);
974
975 assert!(result.is_err());
976 if let Err(errors) = result {
977 assert!(!errors.is_empty());
978 }
979 }
980
981 #[test]
982 fn test_process_pdb_string_empty_input() {
983 let processed = process_pdb_string("");
984 assert_eq!(processed, "");
985 }
986
987 #[test]
988 fn test_process_pdb_string_mixed_content() {
989 let input = "\
990REMARK 1 Test
991ATOM 1 N ALA A 1 0.000 0.000 0.000
992REMARK 2 Another
993ATOM 2 N ALA A 2 1.000 1.000 1.000
994";
995 let processed = process_pdb_string(input);
996 let lines: Vec<&str> = processed.lines().collect();
997
998 assert_eq!(lines.len(), 2); // Only ATOM lines should remain
999 assert!(lines[0].starts_with("ATOM"));
1000 assert!(lines[1].starts_with("ATOM"));
1001 }
1002
1003 #[test]
1004 fn test_process_pdb_contents_with_real_file() {
1005 // Create a temporary PDB file
1006 let mut file = NamedTempFile::new().unwrap();
1007 let pdb_content = create_test_pdb(2, 3);
1008 write!(file, "{}", pdb_content).unwrap();
1009
1010 // Test with file content
1011 let file_content = std::fs::read_to_string(file.path()).unwrap();
1012 let result = process_pdb_contents(&file_content);
1013
1014 assert!(result.is_ok());
1015 let pdb = result.unwrap();
1016 assert_eq!(pdb.atoms().count(), 3);
1017 }
1018
1019 #[test]
1020 fn test_get_closest_residue_pairs() {
1021 let pdb = load_pdb("tests/data/two_res.pdb").unwrap();
1022 let observed_pairs = get_closest_residue_pairs(&pdb, 5.0);
1023 let expected_pairs = &[Pair {
1024 chain_i: "A".to_string(),
1025 chain_j: "B".to_string(),
1026 atom_i: "CA".to_string(),
1027 atom_j: "CA".to_string(),
1028 res_i: 2,
1029 res_j: 10,
1030 distance: 9.1,
1031 }];
1032 assert_eq!(observed_pairs.len(), expected_pairs.len());
1033 let pair = &observed_pairs[0];
1034 assert_eq!(pair.chain_i, expected_pairs[0].chain_i);
1035 assert_eq!(pair.chain_j, expected_pairs[0].chain_j);
1036 assert_eq!(pair.atom_i, expected_pairs[0].atom_i);
1037 assert_eq!(pair.atom_j, expected_pairs[0].atom_j);
1038 assert_eq!(pair.res_i, expected_pairs[0].res_i);
1039 assert_eq!(pair.res_j, expected_pairs[0].res_j);
1040 assert!((pair.distance - 9.1).abs() < 0.1);
1041 }
1042 #[test]
1043 fn test_gaps_around_ligand() {
1044 let pdb = load_pdb("tests/data/prot_ligand.pdb").unwrap();
1045 let observed = gaps_around_ligand(&pdb);
1046 let expected = &[
1047 Gap {
1048 chain: "E".to_string(),
1049 res_i: 351,
1050 atom_i: "N1".to_string(),
1051 res_j: 123,
1052 atom_j: "N".to_string(),
1053 distance: 3.0400192433601467,
1054 },
1055 Gap {
1056 chain: "E".to_string(),
1057 res_i: 351,
1058 atom_i: "C2".to_string(),
1059 res_j: 327,
1060 atom_j: "CE2".to_string(),
1061 distance: 3.79,
1062 },
1063 ];
1064
1065 // Compare chain, residue, and atom names
1066 assert_eq!(observed[0].chain, expected[0].chain);
1067 assert_eq!(observed[0].res_i, expected[0].res_i);
1068 assert_eq!(observed[0].atom_i, expected[0].atom_i);
1069 assert_eq!(observed[0].res_j, expected[0].res_j);
1070 assert_eq!(observed[0].atom_j, expected[0].atom_j);
1071
1072 assert_eq!(observed[1].chain, expected[1].chain);
1073 assert_eq!(observed[1].res_i, expected[1].res_i);
1074 assert_eq!(observed[1].atom_i, expected[1].atom_i);
1075 assert_eq!(observed[1].res_j, expected[1].res_j);
1076 assert_eq!(observed[1].atom_j, expected[1].atom_j);
1077
1078 // TODO: Find a smart way to check the distances
1079 //
1080 // assert_eq!(observed[0].distance, expected[0].distance);
1081 // assert_eq!(observed[1].distance, expected[1].distance);
1082 }
1083
1084 #[test]
1085 fn test_finder() {
1086 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1087 let finder = Finder::new(&pdb);
1088
1089 assert_eq!(finder.chain_lookup.keys().len(), 924);
1090 assert_eq!(finder.residue_lookup.keys().len(), 924)
1091 }
1092
1093 #[test]
1094 fn test_finder_find_chain_id() {
1095 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1096 let finder = Finder::new(&pdb);
1097
1098 let found = finder.find_chain_id(pdb.atom(1).expect(""));
1099 assert_eq!(found, "A");
1100
1101 let found = finder.find_chain_id(pdb.atom(700).expect(""));
1102 assert_eq!(found, "B");
1103 }
1104
1105 #[test]
1106 fn test_finder_find_residue_number() {
1107 let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1108 let finder = Finder::new(&pdb);
1109
1110 let found = finder.find_residue_number(pdb.atom(1).unwrap());
1111
1112 assert_eq!(found, 929)
1113 }
1114
1115 #[test]
1116 fn test_filter_unique_by_atom_j() {
1117 let gaps = vec![
1118 Gap {
1119 chain: "A".to_string(),
1120 atom_i: "C".to_string(),
1121 atom_j: "B".to_string(),
1122 res_i: 1,
1123 res_j: 2,
1124 distance: 3.5,
1125 },
1126 Gap {
1127 chain: "A".to_string(),
1128 atom_i: "A".to_string(),
1129 atom_j: "B".to_string(),
1130 res_i: 3,
1131 res_j: 4,
1132 distance: 2.8,
1133 },
1134 Gap {
1135 chain: "B".to_string(),
1136 atom_i: "C".to_string(),
1137 atom_j: "D".to_string(),
1138 res_i: 5,
1139 res_j: 6,
1140 distance: 4.1,
1141 },
1142 Gap {
1143 chain: "A".to_string(),
1144 atom_i: "E".to_string(),
1145 atom_j: "B".to_string(),
1146 res_i: 7,
1147 res_j: 8,
1148 distance: 1.5,
1149 },
1150 ];
1151
1152 let filtered = filter_unique_by_atom_j(gaps);
1153
1154 // The function should remove duplicates based on `atom_j` and keep only the first occurrence.
1155 assert_eq!(filtered.len(), 2); // "B" and "D" should remain
1156 assert_eq!(filtered[0].atom_j, "B");
1157 assert_eq!(filtered[1].atom_j, "D");
1158
1159 // Verify the correct `atom_j` is retained
1160 assert!(
1161 filtered
1162 .iter()
1163 .all(|gap| gap.atom_j != "B" || gap.atom_i == "C")
1164 );
1165 }
1166
1167 #[test]
1168 fn test_no_duplicates() {
1169 let gaps = vec![
1170 Gap {
1171 chain: "A".to_string(),
1172 atom_i: "C".to_string(),
1173 atom_j: "B".to_string(),
1174 res_i: 1,
1175 res_j: 2,
1176 distance: 3.5,
1177 },
1178 Gap {
1179 chain: "A".to_string(),
1180 atom_i: "A".to_string(),
1181 atom_j: "D".to_string(),
1182 res_i: 3,
1183 res_j: 4,
1184 distance: 2.8,
1185 },
1186 ];
1187
1188 let filtered = filter_unique_by_atom_j(gaps);
1189 assert_eq!(filtered.len(), 2); // No duplicates, so the length should be the same
1190 }
1191
1192 #[test]
1193 fn test_empty_input() {
1194 let gaps: Vec<Gap> = Vec::new();
1195 let filtered = filter_unique_by_atom_j(gaps);
1196 assert!(filtered.is_empty()); // Should return an empty list
1197 }
1198
1199 #[test]
1200 fn test_single_element() {
1201 let gaps = vec![Gap {
1202 chain: "A".to_string(),
1203 atom_i: "C".to_string(),
1204 atom_j: "B".to_string(),
1205 res_i: 1,
1206 res_j: 2,
1207 distance: 3.5,
1208 }];
1209
1210 let filtered = filter_unique_by_atom_j(gaps);
1211 assert_eq!(filtered.len(), 1); // Single element, should remain
1212 assert_eq!(filtered[0].atom_j, "B");
1213 }
1214}