haddock_restraints/core/
structure.rs

1use core::f64;
2use std::collections::{HashMap, HashSet};
3
4use itertools::Itertools;
5use kd_tree::KdTree;
6use nalgebra::Vector3;
7use pdbtbx::{Atom, Element, PDB, ReadOptions};
8use pdbtbx::{PDBError, Residue};
9use rand::SeedableRng;
10use rand::prelude::IndexedRandom;
11use rand::rngs::StdRng;
12use std::fs::File;
13use std::io::BufReader;
14use std::io::Cursor;
15use std::io::Write;
16
17#[derive(Clone)]
18pub struct Bead {
19    pub position: Vector3<f64>,
20}
21
22#[derive(Debug, Clone, PartialEq)]
23pub struct Pair {
24    pub chain_i: String,
25    pub res_i: isize,
26    pub atom_i: String,
27    pub chain_j: String,
28    pub res_j: isize,
29    pub atom_j: String,
30    pub distance: f64,
31}
32
33/// The `Finder` struct contains two hashmaps to quickly retrieve the chain ID and residue number associated with a given atom by its serial number. These lookups are initialized from the provided PDB file during instantiation.
34///
35struct Finder {
36    chain_lookup: HashMap<usize, String>,
37    residue_lookup: HashMap<usize, isize>,
38}
39
40impl Finder {
41    /// Creates a new `Finder` instance.
42    ///
43    /// This method initializes the `Finder` struct by extracting the chain and residue information from the provided `PDB` reference. It constructs two hashmaps: `chain_lookup` (for mapping atom serial numbers to chain IDs) and `residue_lookup` (for mapping atom serial numbers to residue serial numbers).
44    ///
45    fn new(pdb: &PDB) -> Self {
46        let chain_lookup = pdb
47            .chains()
48            .flat_map(|chain| {
49                let chain_id = chain.id().to_string();
50                chain.residues().flat_map(move |residue| {
51                    residue.atoms().map({
52                        let v = chain_id.clone();
53                        move |atom| (atom.serial_number(), v.clone())
54                    })
55                })
56            })
57            .collect();
58
59        let residue_lookup = pdb
60            .chains()
61            .flat_map(|chain| {
62                chain.residues().flat_map(|residue| {
63                    let res_num = residue.serial_number();
64                    residue
65                        .atoms()
66                        .map(move |atom| (atom.serial_number(), res_num))
67                })
68            })
69            .collect();
70
71        Finder {
72            chain_lookup,
73            residue_lookup,
74        }
75    }
76
77    /// Finds the chain ID associated with a given atom.
78    ///
79    /// This method looks up the chain ID for an atom based on its serial number. If no chain is found, the default value "A" is returned.
80    ///
81    fn find_chain_id(&self, atom: &Atom) -> String {
82        self.chain_lookup
83            .get(&atom.serial_number())
84            .unwrap_or(&"A".to_string())
85            .clone()
86    }
87
88    /// Finds the residue serial number associated with a given atom.
89    ///
90    /// This method looks up the residue serial number for an atom based on its serial number. If no residue number is found, the default value `0` is returned.
91    ///
92    fn find_residue_number(&self, atom: &Atom) -> isize {
93        *self.residue_lookup.get(&atom.serial_number()).unwrap_or(&0)
94    }
95}
96
97/// Performs a neighbor search for residues within a specified radius of target residues.
98///
99/// This function uses a K-d tree to efficiently find residues that are within a given radius
100/// of any atom in the target residues.
101///
102/// # Arguments
103///
104/// * `pdb` - A `pdbtbx::PDB` structure representing the entire protein.
105/// * `target_residues_numbers` - A vector of references to `pdbtbx::Residue` objects representing the target residues.
106/// * `radius` - A `f64` value specifying the search radius in Angstroms.
107///
108/// # Returns
109///
110/// A `Vec<isize>` containing the sorted serial numbers of residues that are within the specified
111/// radius of any atom in the target residues, excluding the target residues themselves.
112///
113/// # Algorithm
114///
115/// 1. Constructs a K-d tree from all atom coordinates in the PDB structure.
116/// 2. For each atom in the target residues:
117///    a. Performs a radius search in the K-d tree.
118///    b. Identifies the residues corresponding to the atoms found in the search.
119/// 3. Removes the target residues from the result set.
120/// 4. Sorts the resulting residue serial numbers.
121///
122/// # Notes
123///
124/// - The function uses the `KdTree` data structure for efficient spatial searching.
125/// - The current implementation may have performance bottlenecks in the atom-to-residue mapping step.
126/// - The function assumes that atom coordinates are unique for identification purposes.
127///
128/// # TODO
129///
130/// - Optimize the atom-to-residue mapping step for better performance.
131pub fn neighbor_search(
132    pdb: pdbtbx::PDB,
133    target_residues_numbers: Vec<&pdbtbx::Residue>,
134    radius: f64,
135) -> Vec<isize> {
136    let points: Vec<[f64; 3]> = pdb
137        .atoms()
138        .map(|atom| {
139            let x = atom.x();
140            let y = atom.y();
141            let z = atom.z();
142            [x, y, z]
143        })
144        .collect();
145
146    let kdtree = KdTree::build_by_ordered_float(points.clone());
147
148    // Find the coordinates of each target residue
149    // let mut neighbors: Vec<&Atom> = Vec::new();
150    let mut result: HashSet<isize> = HashSet::new();
151    for residue in &target_residues_numbers {
152        let query_vec: Vec<[f64; 3]> = residue
153            .atoms()
154            .map(|atom| {
155                let x = atom.x();
156                let y = atom.y();
157                let z = atom.z();
158                [x, y, z]
159            })
160            .collect();
161
162        for query in query_vec {
163            let found_vec = kdtree.within_radius(&query, radius);
164            for found in found_vec {
165                // Loop over the whole PDB and find an atom that matches the found point
166                // FIXME: Optimize this
167                for residue in pdb.residues() {
168                    for atom in residue.atoms() {
169                        let x = atom.x();
170                        let y = atom.y();
171                        let z = atom.z();
172                        if x == found[0] && y == found[1] && z == found[2] {
173                            result.insert(residue.serial_number());
174                        }
175                    }
176                }
177            }
178        }
179    }
180    // Remove the target residues from the result
181    for residue in target_residues_numbers {
182        result.remove(&residue.serial_number());
183    }
184
185    // Sort the result
186    let mut result: Vec<isize> = result.into_iter().collect();
187    result.sort();
188
189    result
190}
191
192/// Retrieves specific residues from a PDB structure based on their serial numbers.
193///
194/// # Arguments
195///
196/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
197/// * `target` - A vector of `isize` values representing the serial numbers of the residues to retrieve.
198///
199/// # Returns
200///
201/// A `Vec<&Residue>` containing references to the residues whose serial numbers match those in the `target` vector.
202///
203/// # Notes
204///
205/// - This function performs a linear search through all residues in the PDB structure.
206/// - The order of residues in the result vector matches the order they appear in the PDB structure,
207///   not necessarily the order of serial numbers in the `target` vector.
208/// - If a serial number in `target` doesn't match any residue in the PDB, it is simply ignored.
209pub fn get_residues(pdb: &pdbtbx::PDB, target: Vec<isize>) -> Vec<&Residue> {
210    let mut result: Vec<&Residue> = Vec::new();
211    for residue in pdb.residues() {
212        if target.contains(&residue.serial_number()) {
213            result.push(residue);
214        }
215    }
216    result
217}
218
219/// Identifies the true interface residues between chains in a PDB structure.
220///
221/// This function determines interface residues by finding pairs of residues from different chains
222/// that have atoms within a specified cutoff distance of each other.
223///
224/// # Arguments
225///
226/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
227/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
228///   for residues to be considered part of the interface.
229///
230/// # Returns
231///
232/// A `HashMap<String, HashSet<isize>>` where:
233/// - The keys are chain identifiers (as strings).
234/// - The values are `HashSet`s of residue serial numbers (as `isize`) that are part of the interface for that chain.
235///
236/// # Algorithm
237///
238/// 1. Iterates over all pairs of chains in the PDB structure.
239/// 2. For each pair of chains, compares all residues between the two chains.
240/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
241/// 4. If atoms are within the cutoff, both residues are added to the interface set for their respective chains.
242///
243/// # Notes
244///
245/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
246/// - The cutoff is applied to atom-atom distances, not residue-residue distances.
247/// - A residue is considered part of the interface if any of its atoms are within the cutoff of any atom in a residue from another chain.
248/// - The function does not distinguish between different types of atoms or residues.
249pub fn get_true_interface(pdb: &pdbtbx::PDB, cutoff: f64) -> HashMap<String, HashSet<isize>> {
250    let mut true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
251    for chain_i in pdb.chains() {
252        for chain_j in pdb.chains() {
253            if chain_i.id() == chain_j.id() {
254                continue;
255            }
256            for res_i in chain_i.residues() {
257                for res_j in chain_j.residues() {
258                    // Check if any of the atoms are touching each other
259                    for atom_i in res_i.atoms() {
260                        for atom_j in res_j.atoms() {
261                            let distance = atom_i.distance(atom_j);
262                            if distance < cutoff {
263                                true_interface
264                                    .entry(chain_i.id().to_string())
265                                    .or_default()
266                                    .insert(res_i.serial_number());
267                                true_interface
268                                    .entry(chain_j.id().to_string())
269                                    .or_default()
270                                    .insert(res_j.serial_number());
271                                break;
272                            }
273                        }
274                    }
275                }
276            }
277        }
278    }
279    true_interface
280}
281
282/// Identifies pairs of chains in a PDB structure that are in contact with each other.
283///
284/// This function determines which chains are in contact by finding pairs of residues from different chains
285/// that have atoms within a specified cutoff distance of each other.
286///
287/// # Arguments
288///
289/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
290/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
291///   for chains to be considered in contact.
292///
293/// # Returns
294///
295/// A `HashSet<(String, String)>` where each tuple represents a pair of chain identifiers that are in contact.
296/// The order of chain identifiers in each tuple is arbitrary.
297///
298/// # Algorithm
299///
300/// 1. Iterates over all pairs of chains in the PDB structure.
301/// 2. For each pair of chains, compares all residues between the two chains.
302/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
303/// 4. If atoms are within the cutoff, the pair of chain identifiers is added to the result set.
304///
305/// # Notes
306///
307/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
308/// - The cutoff is applied to atom-atom distances, not residue-residue or chain-chain distances.
309/// - A pair of chains is considered in contact if any atom from one chain is within the cutoff distance of any atom from the other chain.
310/// - The function does not distinguish between different types of atoms or residues.
311/// - Each pair of chains appears only once in the result set, regardless of the number of contacts between them.
312pub fn get_chains_in_contact(pdb: &pdbtbx::PDB, cutoff: f64) -> HashSet<(String, String)> {
313    let mut chains_in_contact: HashSet<(String, String)> = HashSet::new();
314
315    for chain_i in pdb.chains() {
316        for chain_j in pdb.chains() {
317            if chain_i.id() == chain_j.id() {
318                continue;
319            }
320            for res_i in chain_i.residues() {
321                for res_j in chain_j.residues() {
322                    // Check if any of the atoms are touching each other
323                    for atom_i in res_i.atoms() {
324                        for atom_j in res_j.atoms() {
325                            let distance = atom_i.distance(atom_j);
326                            if distance < cutoff {
327                                chains_in_contact
328                                    .insert((chain_i.id().to_string(), chain_j.id().to_string()));
329                                break;
330                            }
331                        }
332                    }
333                }
334            }
335        }
336    }
337    chains_in_contact
338}
339
340/// Represents two residues in a protein chain.
341///
342/// A `Gap` is defined by two residues that are sequential in the primary structure
343/// but have a distance between their atoms that exceeds a certain threshold in the
344/// tertiary structure.
345#[derive(Debug)]
346pub struct Gap {
347    /// The chain identifier where the gap is located.
348    pub chain: String,
349
350    /// The name of the atom in the first residue used for distance calculation.
351    pub atom_i: String,
352
353    /// The name of the atom in the second residue used for distance calculation.
354    pub atom_j: String,
355
356    /// The sequence number of the first residue.
357    pub res_i: isize,
358
359    /// The sequence number of the second residue.
360    pub res_j: isize,
361
362    /// The distance between the specified atoms of the two residues.
363    pub distance: f64,
364}
365
366/// Filters a vector of `Gap` elements, retaining only the unique elements based on the `atom_j` value.
367///
368/// This function removes duplicate `Gap` elements where the `atom_j` value is the same across multiple entries,
369/// keeping only the first occurrence of each unique `atom_j`. This is useful when you only need to retain
370/// one `Gap` entry for each unique `atom_j` regardless of the `atom_i` or other fields.
371///
372pub fn filter_unique_by_atom_j(gaps: Vec<Gap>) -> Vec<Gap> {
373    let mut seen = HashSet::new();
374    let mut unique: Vec<Gap> = Vec::new();
375
376    gaps.into_iter().for_each(|g| {
377        if !seen.contains(&g.atom_j) {
378            seen.insert(g.atom_j.clone());
379            unique.push(g)
380        }
381    });
382
383    unique
384}
385
386/// Identifies separate bodies within a protein structure based on CA atom distances.
387///
388/// This function segments the protein into separate "bodies" by analyzing the distances
389/// between consecutive CA (alpha carbon) atoms in each chain. A new body is defined when
390/// the distance between consecutive CA atoms exceeds 4 Angstroms.
391///
392/// # Arguments
393///
394/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
395///
396/// # Returns
397///
398/// A `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` where:
399/// - The key is a body identifier (starting from 0).
400/// - The value is a vector of tuples, each containing:
401///   - The residue serial number (`isize`)
402///   - The chain identifier (`&str`)
403///   - A reference to the CA atom (`&pdbtbx::Atom`)
404///
405/// # Algorithm
406///
407/// 1. Extracts all CA atoms from the PDB structure, along with their chain and residue information.
408/// 2. Iterates through consecutive CA atoms within each chain.
409/// 3. If the distance between consecutive CA atoms exceeds 4 Angstroms, a new body is started.
410/// 4. Groups CA atoms into bodies based on this distance criterion.
411///
412/// # Notes
413///
414/// - This function only considers CA atoms for body identification.
415/// - The 4 Angstrom cutoff is hard-coded and may not be suitable for all use cases.
416/// - Bodies are numbered sequentially starting from 0.
417/// - Gaps between chains always result in a new body, regardless of spatial proximity.
418/// - The function assumes that CA atoms within a chain are listed in sequence order.
419pub fn find_bodies(pdb: &pdbtbx::PDB) -> HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> {
420    // Check if the distance of a given atom to its next one is higher than 4A
421
422    // Get only the `CA` atoms
423    let mut ca_atoms: Vec<(&str, isize, &pdbtbx::Atom)> = Vec::new();
424    pdb.chains().for_each(|chain| {
425        chain.residues().for_each(|residue| {
426            residue.atoms().for_each(|atom| {
427                if atom.name() == "CA" {
428                    ca_atoms.push((chain.id(), residue.serial_number(), atom));
429                }
430            });
431        });
432    });
433    let mut bodies: HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> = HashMap::new();
434    let mut body_id = 0;
435    for (i, j) in ca_atoms.iter().zip(ca_atoms.iter().skip(1)) {
436        let (chain_i, res_i, atom_i) = i;
437        let (chain_j, _res_j, atom_j) = j;
438        if chain_i != chain_j {
439            continue;
440        }
441        let distance = atom_i.distance(atom_j);
442        if distance > 4.0 {
443            body_id += 1;
444        }
445        bodies
446            .entry(body_id)
447            .or_default()
448            .push((*res_i, chain_i, atom_i));
449    }
450
451    bodies
452}
453
454/// Creates a list of gaps between different bodies in a protein structure.
455///
456/// This function analyzes the spatial relationships between separate bodies of a protein
457/// structure and generates a list of `Gap` instances representing the distances between
458/// randomly selected pairs of atoms from different bodies.
459///
460/// # Arguments
461///
462/// * `bodies` - A reference to a `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` representing
463///   the bodies of the protein structure, as returned by the `find_bodies` function.
464///
465/// # Returns
466///
467/// A `Vec<Gap>` containing `Gap` instances that represent the distances between pairs of
468/// atoms from different bodies.
469///
470/// # Algorithm
471///
472/// 1. Initializes a random number generator with a fixed seed (42) for reproducibility.
473/// 2. Iterates over all unique pairs of bodies.
474/// 3. For each pair of bodies with at least 2 atoms each:
475///    a. Randomly selects 2 atoms from each body.
476///    b. Creates 2 `Gap` instances, one for each pair of selected atoms.
477/// 4. Collects all created `Gap` instances into a vector.
478///
479/// # Notes
480///
481/// - The function uses a fixed random seed (42) for reproducibility. This means that
482///   for the same input, it will always produce the same output.
483/// - Only bodies with at least 2 atoms are considered for gap creation.
484/// - The function creates 2 gaps for each pair of bodies, using different randomly selected atoms.
485/// - The `Gap` instances contain information about the chain, atom names, residue numbers,
486///   and the distance between the selected atoms.
487/// - This function assumes that the input `bodies` HashMap is structured as expected,
488///   with each value being a vector of tuples (residue number, chain ID, atom reference).
489///
490/// # Safety
491///
492/// This function uses `unwrap()` on the results of `choose_multiple()`. It assumes that
493/// the bodies contain at least 2 atoms each, as checked earlier in the function.
494pub fn create_iter_body_gaps(
495    bodies: &HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>,
496) -> Vec<Gap> {
497    let mut rng = StdRng::seed_from_u64(42);
498    let mut pairs: Vec<Gap> = Vec::new();
499    let mut body_ids: Vec<isize> = bodies.keys().cloned().collect();
500    body_ids.sort();
501    for (i, &body_id1) in body_ids.iter().enumerate() {
502        for &body_id2 in body_ids[i + 1..].iter() {
503            if let (Some(atoms1), Some(atoms2)) = (bodies.get(&body_id1), bodies.get(&body_id2))
504                && atoms1.len() >= 2 && atoms2.len() >= 2 {
505                    let selected1: Vec<_> = atoms1.choose_multiple(&mut rng, 2).cloned().collect();
506                    let selected2: Vec<_> = atoms2.choose_multiple(&mut rng, 2).cloned().collect();
507
508                    for i in 0..2 {
509                        pairs.push(Gap {
510                            chain: selected1[i].1.to_string(),
511                            atom_i: selected1[i].2.name().to_string(),
512                            atom_j: selected2[i].2.name().to_string(),
513                            res_i: selected1[i].0,
514                            res_j: selected2[i].0,
515                            distance: selected1[i].2.distance(selected2[i].2),
516                        });
517                    }
518                }
519        }
520    }
521
522    pairs
523}
524
525// pub fn calculate_z_axis(selection1: &[pdbtbx::Atom], selection2: &[pdbtbx::Atom]) -> Vector3<f64> {
526//     let center1 = calculate_geometric_center(selection1);
527//     let center2 = calculate_geometric_center(selection2);
528//     (center2 - center1).normalize()
529// }
530
531pub fn calculate_geometric_center(atoms: &[pdbtbx::Atom]) -> Vector3<f64> {
532    let mut center = Vector3::zeros();
533    for atom in atoms {
534        center += Vector3::new(atom.x(), atom.y(), atom.z());
535    }
536    center / (atoms.len() as f64)
537}
538
539// pub fn generate_axis_beads(start_z: f64, end_z: f64, num_beads: usize) -> Vec<Bead> {
540//     let mut beads = Vec::with_capacity(num_beads);
541//     let length = end_z - start_z;
542
543//     for i in 0..num_beads {
544//         let t = (i as f64) / ((num_beads - 1) as f64);
545//         let z = start_z + t * length;
546//         let position = Vector3::new(0.0, 0.0, z);
547//         beads.push(Bead { position });
548//     }
549//     beads
550// }
551
552pub fn generate_grid_beads(center_z: f64, grid_size: usize, spacing: f64) -> Vec<Bead> {
553    let mut beads = Vec::with_capacity(grid_size * grid_size);
554
555    let grid_offset = (grid_size as f64 - 1.0) * spacing / 2.0;
556
557    for i in 0..grid_size {
558        for j in 0..grid_size {
559            let x = (i as f64) * spacing - grid_offset;
560            let y = (j as f64) * spacing - grid_offset;
561
562            let final_pos = Vector3::new(x, y, center_z);
563            beads.push(Bead {
564                position: final_pos,
565            });
566        }
567    }
568
569    beads
570}
571
572pub fn write_beads_pdb(beads: &[Bead], filename: &str) -> std::io::Result<()> {
573    let mut file = File::create(filename)?;
574
575    for (i, bead) in beads.iter().enumerate() {
576        writeln!(
577            file,
578            "{:<6}{:>5} {:<4}{:1}{:<3} {:1}{:>4}{:1}   {:>8.3}{:>8.3}{:>8.3}{:>6.2}{:>6.2}          {:>2}{:>2}",
579            "ATOM",
580            i + 1,
581            "SHA",
582            " ",
583            "SHA",
584            "S",
585            i + 1,
586            " ",
587            bead.position.x,
588            bead.position.y,
589            bead.position.z,
590            1.00,
591            1.00,
592            "H",
593            ""
594        )?;
595    }
596    writeln!(file, "END")?;
597
598    Ok(())
599}
600
601pub fn find_furthest_selections(selections: &[Vec<isize>], pdb: &PDB) -> (Vec<Atom>, Vec<Atom>) {
602    let sele: HashMap<usize, (Vec<Atom>, Vector3<f64>)> = selections
603        .iter()
604        .enumerate()
605        .map(|(i, sel)| {
606            let atoms = get_atoms_from_resnumbers(pdb, sel);
607            let center = calculate_geometric_center(&atoms);
608            (i, (atoms, center))
609        })
610        .collect();
611
612    // Find the two selections that are the furthest apart
613    let mut max_distance = 0.0;
614    let mut atoms1 = Vec::new();
615    let mut atoms2 = Vec::new();
616    for (i, j) in (0..selections.len()).tuple_combinations() {
617        let distance = (sele[&i].1 - sele[&j].1).norm();
618        if distance > max_distance {
619            max_distance = distance;
620            atoms1 = sele[&i].0.clone();
621            atoms2 = sele[&j].0.clone();
622        }
623    }
624
625    (atoms1, atoms2)
626}
627
628pub fn get_atoms_from_resnumbers(pdb: &PDB, selection: &[isize]) -> Vec<Atom> {
629    pdb.residues()
630        .filter(|res| selection.contains(&res.serial_number()))
631        .flat_map(|res| res.atoms())
632        .cloned() // This clones each &Atom into an owned Atom
633        .collect()
634}
635
636/// Loads a PDB structure from a file path
637///
638/// # Arguments
639/// * `input_pdb` - Path to the PDB file
640///
641/// # Returns
642/// Result containing either the parsed PDB or a list of errors
643pub fn load_pdb(input_pdb: &str) -> Result<PDB, Vec<PDBError>> {
644    std::fs::read_to_string(input_pdb)
645        .map_err(|e| {
646            vec![PDBError::new(
647                pdbtbx::ErrorLevel::GeneralWarning,
648                "File read error",
649                format!("Failed to read file {}: {}", input_pdb, e),
650                pdbtbx::Context::None,
651            )]
652        })
653        .and_then(|content| process_pdb_contents(&content))
654}
655
656/// Loads a PDB structure from string content
657///
658/// # Arguments
659/// * `content` - PDB file contents as a string
660///
661/// # Returns
662/// Result containing either the parsed PDB or a list of errors
663pub fn load_pdb_from_content(content: &str) -> Result<PDB, Vec<PDBError>> {
664    process_pdb_contents(content)
665}
666
667/// Processes raw PDB content by removing remarks and padding lines
668///
669/// # Arguments
670/// * `content` - Raw PDB content
671///
672/// # Returns
673/// Processed PDB content ready for parsing
674pub fn process_pdb_contents(pdb_content: &str) -> Result<PDB, Vec<PDBError>> {
675    let processed_content = process_pdb_string(pdb_content);
676
677    let mut opts = ReadOptions::new();
678    opts.set_format(pdbtbx::Format::Pdb)
679        .set_level(pdbtbx::StrictnessLevel::Loose);
680
681    let cursor = Cursor::new(processed_content.into_bytes());
682    let reader = BufReader::new(cursor);
683
684    match opts.read_raw(reader) {
685        Ok((pdb, _)) => Ok(pdb),
686        Err(e) => Err(e),
687    }
688}
689
690/// Processes raw PDB content by removing remarks and padding lines
691///
692/// # Arguments
693/// * `content` - Raw PDB content
694///
695/// # Returns
696/// Processed PDB content ready for parsing
697fn process_pdb_string(content: &str) -> String {
698    let mut output = String::with_capacity(content.len());
699
700    // Process lines with single allocation
701    for line in content.lines() {
702        if !line.starts_with("REMARK") {
703            output.push_str(line);
704            if line.len() < 80 {
705                output.push_str(&" ".repeat(80 - line.len()));
706            }
707            output.push('\n');
708        }
709    }
710
711    output
712}
713
714/// Finds the closest residue pairs between different protein chains within a specified distance cutoff.
715///
716/// This function analyzes inter-chain interactions by identifying the closest atom pairs between residues
717/// from different chains, using the alpha carbon (CA) atoms for distance calculation.
718///
719/// # Arguments
720///
721/// * pdb - A reference to a parsed PDB structure.
722/// * cutoff - A floating-point value specifying the maximum distance (in Angstroms) for considering residue pairs.
723///
724/// # Returns
725///
726/// A Vec<Pair> containing the closest residue pairs within the specified cutoff, sorted by distance.
727///
728/// # Functionality
729///
730/// 1. Iterates through all unique chain pairs in the PDB structure.
731/// 2. For each chain pair, finds the closest atoms between residues.
732/// 3. Selects pairs where the inter-residue distance is less than the specified cutoff.
733/// 4. Uses alpha carbon (CA) atoms for distance calculation and pair representation.
734/// 5. Sorts the resulting pairs by their inter-alpha carbon distance.
735///
736/// # Notes
737///
738/// - Only inter-chain interactions are considered.
739/// - The distance is calculated based on the closest atom pairs between residues.
740/// - The resulting pairs are sorted from shortest to longest distance.
741pub fn get_closest_residue_pairs(pdb: &pdbtbx::PDB, cutoff: f64) -> Vec<Pair> {
742    let mut closest_pairs = Vec::new();
743
744    let chains: Vec<_> = pdb.chains().collect();
745
746    for i in 0..chains.len() {
747        for j in (i + 1)..chains.len() {
748            let chain_i = &chains[i];
749            let chain_j = &chains[j];
750
751            for res_i in chain_i.residues() {
752                let mut min_distance = f64::MAX;
753                let mut closest_pair = None;
754
755                for res_j in chain_j.residues() {
756                    let mut atom_dist = f64::MAX;
757                    for atom_i in res_i.atoms() {
758                        for atom_j in res_j.atoms() {
759                            let distance = atom_i.distance(atom_j);
760                            if distance < atom_dist {
761                                atom_dist = distance;
762                            }
763                        }
764                    }
765                    if atom_dist < min_distance {
766                        min_distance = atom_dist;
767                        closest_pair = Some((res_j, res_i));
768                    }
769                }
770
771                if min_distance < cutoff
772                    && let Some((res_j, res_i)) = closest_pair {
773                        let ca_i = res_i.atoms().find(|atom| atom.name() == "CA").unwrap();
774                        let ca_j = res_j.atoms().find(|atom| atom.name() == "CA").unwrap();
775                        let ca_ca_dist = ca_i.distance(ca_j);
776                        closest_pairs.push(Pair {
777                            chain_i: chain_i.id().to_string(),
778                            res_i: res_i.serial_number(),
779                            atom_i: ca_i.name().to_string(),
780                            chain_j: chain_j.id().to_string(),
781                            res_j: res_j.serial_number(),
782                            atom_j: ca_j.name().to_string(),
783                            distance: ca_ca_dist,
784                        });
785                    }
786            }
787        }
788    }
789
790    closest_pairs.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap());
791
792    closest_pairs
793}
794
795/// Finds pairs of atoms between a ligand and nearby protein residues within a certain distance.
796///
797/// This function processes the given PDB structure to identify heteroatoms (ligand atoms) and
798/// their neighboring atoms within a specified distance threshold (5.0 Å). It then finds the closest
799/// protein atom for each ligand atom and returns a vector of `Pair` structures containing information
800/// about the ligand atom and its closest protein atom. The result is a list of ligand-protein pairs
801/// and their corresponding distance, residue, and chain information.
802///
803pub fn gaps_around_ligand(pdb: &PDB) -> Vec<Gap> {
804    let ligand_atoms: Vec<&Atom> = pdb
805        .atoms()
806        .filter(|a| a.hetero() && *a.element().unwrap() != Element::H)
807        .collect();
808
809    let ligand_residues: Vec<&Residue> = pdb
810        .residues()
811        .filter(|r| r.atoms().any(|a| a.hetero()))
812        .collect();
813
814    let neighbors = neighbor_search(pdb.clone(), ligand_residues, 5.0);
815    let protein_atoms = get_atoms_from_resnumbers(pdb, &neighbors);
816
817    let mut result: Vec<Gap> = Vec::new();
818
819    let finder = Finder::new(pdb);
820
821    for ligand_atom in ligand_atoms {
822        let closest_protein_atom = protein_atoms
823            .iter()
824            .filter(|a| *a.element().unwrap() != Element::H)
825            .map(|a| (a, ligand_atom.distance(a)))
826            .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
827            .unwrap();
828
829        let ligand_chain = finder.find_chain_id(ligand_atom);
830        let protein_chain = finder.find_chain_id(closest_protein_atom.0);
831
832        if ligand_chain == protein_chain {
833            result.push(Gap {
834                chain: ligand_chain,
835                res_i: finder.find_residue_number(ligand_atom),
836                atom_i: ligand_atom.name().to_string(),
837                res_j: finder.find_residue_number(closest_protein_atom.0),
838                atom_j: closest_protein_atom.0.name().to_string(),
839                distance: closest_protein_atom.1,
840            })
841        }
842    }
843
844    result
845}
846
847#[cfg(test)]
848mod tests {
849
850    use std::env;
851
852    use tempfile::NamedTempFile;
853
854    use super::*;
855
856    #[test]
857    fn test_get_true_interface() {
858        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
859
860        let observed_true_interface = get_true_interface(&pdb, 5.0);
861
862        let mut expected_true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
863        let chain_a = "A".to_string();
864        let res_a: HashSet<isize> = vec![934, 936, 933, 946, 950, 938, 940, 941, 937, 931]
865            .into_iter()
866            .collect();
867        let chain_b = "B";
868        let res_b: HashSet<isize> = vec![66, 48, 68, 49, 46, 45, 44, 47, 69, 6, 70, 8, 42]
869            .into_iter()
870            .collect();
871
872        expected_true_interface.insert(chain_a, res_a);
873        expected_true_interface.insert(chain_b.to_string(), res_b);
874
875        assert_eq!(observed_true_interface, expected_true_interface);
876    }
877
878    #[test]
879    fn test_load_pdb_short_lines() {
880        let pdb_path = env::current_dir()
881            .unwrap()
882            .join("tests/data/leu_short_lines.pdb");
883        let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
884        assert_eq!(pdb.atoms().count(), 8);
885    }
886
887    #[test]
888    fn test_load_pdb_normal_lines() {
889        let pdb_path = env::current_dir()
890            .unwrap()
891            .join("tests/data/leu_normal_lines.pdb");
892        let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
893        assert_eq!(pdb.atoms().count(), 8);
894    }
895    // Updated test helper function that works with older Rust versions
896    fn create_test_pdb(remarks: usize, atoms: usize) -> String {
897        let mut pdb = String::new();
898
899        // Add remarks
900        for i in 0..remarks {
901            pdb.push_str(&format!("REMARK {} Test remark\n", i));
902        }
903
904        // Add atoms (ATOM records)
905        for i in 0..atoms {
906            pdb.push_str(&format!(
907                "ATOM  {:>5}  N   ALA A{:>4}    {:>8}{:>8}{:>8}  1.00  0.00           N\n",
908                i + 1,
909                i + 1,
910                format!("{:.3}", i as f32),
911                format!("{:.3}", i as f32),
912                format!("{:.3}", i as f32)
913            ));
914        }
915
916        pdb
917    }
918
919    #[test]
920    fn test_process_pdb_string_removes_remarks() {
921        let input = "REMARK 1 Test\nATOM      1  N   ALA A   1       0.000   0.000   0.000\n";
922        let processed = process_pdb_string(input);
923
924        assert!(!processed.contains("REMARK"));
925        assert!(processed.contains("ATOM"));
926    }
927
928    #[test]
929    fn test_process_pdb_string_pads_lines() {
930        let short_line = "ATOM      1  N   ALA A   1       0.000   0.000   0.000";
931        let processed = process_pdb_string(short_line);
932
933        // Check line is padded to 80 chars (including newline)
934        assert_eq!(processed.lines().next().unwrap().len(), 80);
935        assert_eq!(processed.chars().count(), 81); // 80 + newline
936    }
937
938    #[test]
939    fn test_process_pdb_string_preserves_long_lines() {
940        let long_line =
941            "ATOM      1  N   ALA A   1       0.000   0.000   0.000  1.00  0.00           N  EXTRA";
942        let processed = process_pdb_string(long_line);
943
944        // Line should remain unchanged (except newline)
945        assert_eq!(processed.trim(), long_line);
946    }
947
948    #[test]
949    fn test_process_pdb_contents_valid_pdb() {
950        let pdb_content = create_test_pdb(3, 5);
951        let result = process_pdb_contents(&pdb_content);
952
953        assert!(result.is_ok());
954        let pdb = result.unwrap();
955        assert_eq!(pdb.atoms().count(), 5);
956    }
957
958    #[test]
959    fn test_process_pdb_contents_empty_input() {
960        let result = process_pdb_contents("");
961        assert!(result.is_err());
962
963        if let Err(errors) = result {
964            assert_eq!(errors.len(), 1);
965        }
966    }
967
968    #[test]
969    fn test_process_pdb_contents_invalid_pdb() {
970        let invalid_content = "This is not a PDB file\n";
971        let result = process_pdb_contents(invalid_content);
972
973        assert!(result.is_err());
974        if let Err(errors) = result {
975            assert!(!errors.is_empty());
976        }
977    }
978
979    #[test]
980    fn test_process_pdb_string_empty_input() {
981        let processed = process_pdb_string("");
982        assert_eq!(processed, "");
983    }
984
985    #[test]
986    fn test_process_pdb_string_mixed_content() {
987        let input = "\
988REMARK 1 Test
989ATOM      1  N   ALA A   1       0.000   0.000   0.000
990REMARK 2 Another
991ATOM      2  N   ALA A   2       1.000   1.000   1.000
992";
993        let processed = process_pdb_string(input);
994        let lines: Vec<&str> = processed.lines().collect();
995
996        assert_eq!(lines.len(), 2); // Only ATOM lines should remain
997        assert!(lines[0].starts_with("ATOM"));
998        assert!(lines[1].starts_with("ATOM"));
999    }
1000
1001    #[test]
1002    fn test_process_pdb_contents_with_real_file() {
1003        // Create a temporary PDB file
1004        let mut file = NamedTempFile::new().unwrap();
1005        let pdb_content = create_test_pdb(2, 3);
1006        write!(file, "{}", pdb_content).unwrap();
1007
1008        // Test with file content
1009        let file_content = std::fs::read_to_string(file.path()).unwrap();
1010        let result = process_pdb_contents(&file_content);
1011
1012        assert!(result.is_ok());
1013        let pdb = result.unwrap();
1014        assert_eq!(pdb.atoms().count(), 3);
1015    }
1016
1017    #[test]
1018    fn test_get_closest_residue_pairs() {
1019        let pdb = load_pdb("tests/data/two_res.pdb").unwrap();
1020        let observed_pairs = get_closest_residue_pairs(&pdb, 5.0);
1021        let expected_pairs = &[Pair {
1022            chain_i: "A".to_string(),
1023            chain_j: "B".to_string(),
1024            atom_i: "CA".to_string(),
1025            atom_j: "CA".to_string(),
1026            res_i: 2,
1027            res_j: 10,
1028            distance: 9.1,
1029        }];
1030        assert_eq!(observed_pairs.len(), expected_pairs.len());
1031        let pair = &observed_pairs[0];
1032        assert_eq!(pair.chain_i, expected_pairs[0].chain_i);
1033        assert_eq!(pair.chain_j, expected_pairs[0].chain_j);
1034        assert_eq!(pair.atom_i, expected_pairs[0].atom_i);
1035        assert_eq!(pair.atom_j, expected_pairs[0].atom_j);
1036        assert_eq!(pair.res_i, expected_pairs[0].res_i);
1037        assert_eq!(pair.res_j, expected_pairs[0].res_j);
1038        assert!((pair.distance - 9.1).abs() < 0.1);
1039    }
1040    #[test]
1041    fn test_gaps_around_ligand() {
1042        let pdb = load_pdb("tests/data/prot_ligand.pdb").unwrap();
1043        let observed = gaps_around_ligand(&pdb);
1044        let expected = &[
1045            Gap {
1046                chain: "E".to_string(),
1047                res_i: 351,
1048                atom_i: "N1".to_string(),
1049                res_j: 123,
1050                atom_j: "N".to_string(),
1051                distance: 3.0400192433601467,
1052            },
1053            Gap {
1054                chain: "E".to_string(),
1055                res_i: 351,
1056                atom_i: "C2".to_string(),
1057                res_j: 327,
1058                atom_j: "CE2".to_string(),
1059                distance: 3.79,
1060            },
1061        ];
1062
1063        // Compare chain, residue, and atom names
1064        assert_eq!(observed[0].chain, expected[0].chain);
1065        assert_eq!(observed[0].res_i, expected[0].res_i);
1066        assert_eq!(observed[0].atom_i, expected[0].atom_i);
1067        assert_eq!(observed[0].res_j, expected[0].res_j);
1068        assert_eq!(observed[0].atom_j, expected[0].atom_j);
1069
1070        assert_eq!(observed[1].chain, expected[1].chain);
1071        assert_eq!(observed[1].res_i, expected[1].res_i);
1072        assert_eq!(observed[1].atom_i, expected[1].atom_i);
1073        assert_eq!(observed[1].res_j, expected[1].res_j);
1074        assert_eq!(observed[1].atom_j, expected[1].atom_j);
1075
1076        // TODO: Find a smart way to check the distances
1077        //
1078        // assert_eq!(observed[0].distance, expected[0].distance);
1079        // assert_eq!(observed[1].distance, expected[1].distance);
1080    }
1081
1082    #[test]
1083    fn test_finder() {
1084        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1085        let finder = Finder::new(&pdb);
1086
1087        assert_eq!(finder.chain_lookup.keys().len(), 924);
1088        assert_eq!(finder.residue_lookup.keys().len(), 924)
1089    }
1090
1091    #[test]
1092    fn test_finder_find_chain_id() {
1093        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1094        let finder = Finder::new(&pdb);
1095
1096        let found = finder.find_chain_id(pdb.atom(1).expect(""));
1097        assert_eq!(found, "A");
1098
1099        let found = finder.find_chain_id(pdb.atom(700).expect(""));
1100        assert_eq!(found, "B");
1101    }
1102
1103    #[test]
1104    fn test_finder_find_residue_number() {
1105        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1106        let finder = Finder::new(&pdb);
1107
1108        let found = finder.find_residue_number(pdb.atom(1).unwrap());
1109
1110        assert_eq!(found, 929)
1111    }
1112
1113    #[test]
1114    fn test_filter_unique_by_atom_j() {
1115        let gaps = vec![
1116            Gap {
1117                chain: "A".to_string(),
1118                atom_i: "C".to_string(),
1119                atom_j: "B".to_string(),
1120                res_i: 1,
1121                res_j: 2,
1122                distance: 3.5,
1123            },
1124            Gap {
1125                chain: "A".to_string(),
1126                atom_i: "A".to_string(),
1127                atom_j: "B".to_string(),
1128                res_i: 3,
1129                res_j: 4,
1130                distance: 2.8,
1131            },
1132            Gap {
1133                chain: "B".to_string(),
1134                atom_i: "C".to_string(),
1135                atom_j: "D".to_string(),
1136                res_i: 5,
1137                res_j: 6,
1138                distance: 4.1,
1139            },
1140            Gap {
1141                chain: "A".to_string(),
1142                atom_i: "E".to_string(),
1143                atom_j: "B".to_string(),
1144                res_i: 7,
1145                res_j: 8,
1146                distance: 1.5,
1147            },
1148        ];
1149
1150        let filtered = filter_unique_by_atom_j(gaps);
1151
1152        // The function should remove duplicates based on `atom_j` and keep only the first occurrence.
1153        assert_eq!(filtered.len(), 2); // "B" and "D" should remain
1154        assert_eq!(filtered[0].atom_j, "B");
1155        assert_eq!(filtered[1].atom_j, "D");
1156
1157        // Verify the correct `atom_j` is retained
1158        assert!(
1159            filtered
1160                .iter()
1161                .all(|gap| gap.atom_j != "B" || gap.atom_i == "C")
1162        );
1163    }
1164
1165    #[test]
1166    fn test_no_duplicates() {
1167        let gaps = vec![
1168            Gap {
1169                chain: "A".to_string(),
1170                atom_i: "C".to_string(),
1171                atom_j: "B".to_string(),
1172                res_i: 1,
1173                res_j: 2,
1174                distance: 3.5,
1175            },
1176            Gap {
1177                chain: "A".to_string(),
1178                atom_i: "A".to_string(),
1179                atom_j: "D".to_string(),
1180                res_i: 3,
1181                res_j: 4,
1182                distance: 2.8,
1183            },
1184        ];
1185
1186        let filtered = filter_unique_by_atom_j(gaps);
1187        assert_eq!(filtered.len(), 2); // No duplicates, so the length should be the same
1188    }
1189
1190    #[test]
1191    fn test_empty_input() {
1192        let gaps: Vec<Gap> = Vec::new();
1193        let filtered = filter_unique_by_atom_j(gaps);
1194        assert!(filtered.is_empty()); // Should return an empty list
1195    }
1196
1197    #[test]
1198    fn test_single_element() {
1199        let gaps = vec![Gap {
1200            chain: "A".to_string(),
1201            atom_i: "C".to_string(),
1202            atom_j: "B".to_string(),
1203            res_i: 1,
1204            res_j: 2,
1205            distance: 3.5,
1206        }];
1207
1208        let filtered = filter_unique_by_atom_j(gaps);
1209        assert_eq!(filtered.len(), 1); // Single element, should remain
1210        assert_eq!(filtered[0].atom_j, "B");
1211    }
1212}