haddock_restraints/core/
structure.rs

1use core::f64;
2use std::collections::{HashMap, HashSet};
3
4use itertools::Itertools;
5use kd_tree::KdTree;
6use nalgebra::Vector3;
7use pdbtbx::{Atom, Element, PDB, ReadOptions};
8use pdbtbx::{PDBError, Residue};
9use rand::SeedableRng;
10use rand::prelude::IndexedRandom;
11use rand::rngs::StdRng;
12use std::fs::File;
13use std::io::BufReader;
14use std::io::Cursor;
15use std::io::Write;
16
17#[derive(Clone)]
18pub struct Bead {
19    pub position: Vector3<f64>,
20}
21
22#[derive(Debug, Clone, PartialEq)]
23pub struct Pair {
24    pub chain_i: String,
25    pub res_i: isize,
26    pub atom_i: String,
27    pub chain_j: String,
28    pub res_j: isize,
29    pub atom_j: String,
30    pub distance: f64,
31}
32
33/// The `Finder` struct contains two hashmaps to quickly retrieve the chain ID and residue number associated with a given atom by its serial number. These lookups are initialized from the provided PDB file during instantiation.
34///
35struct Finder {
36    chain_lookup: HashMap<usize, String>,
37    residue_lookup: HashMap<usize, isize>,
38}
39
40impl Finder {
41    /// Creates a new `Finder` instance.
42    ///
43    /// This method initializes the `Finder` struct by extracting the chain and residue information from the provided `PDB` reference. It constructs two hashmaps: `chain_lookup` (for mapping atom serial numbers to chain IDs) and `residue_lookup` (for mapping atom serial numbers to residue serial numbers).
44    ///
45    fn new(pdb: &PDB) -> Self {
46        let chain_lookup = pdb
47            .chains()
48            .flat_map(|chain| {
49                let chain_id = chain.id().to_string();
50                chain.residues().flat_map(move |residue| {
51                    residue.atoms().map({
52                        let v = chain_id.clone();
53                        move |atom| (atom.serial_number(), v.clone())
54                    })
55                })
56            })
57            .collect();
58
59        let residue_lookup = pdb
60            .chains()
61            .flat_map(|chain| {
62                chain.residues().flat_map(|residue| {
63                    let res_num = residue.serial_number();
64                    residue
65                        .atoms()
66                        .map(move |atom| (atom.serial_number(), res_num))
67                })
68            })
69            .collect();
70
71        Finder {
72            chain_lookup,
73            residue_lookup,
74        }
75    }
76
77    /// Finds the chain ID associated with a given atom.
78    ///
79    /// This method looks up the chain ID for an atom based on its serial number. If no chain is found, the default value "A" is returned.
80    ///
81    fn find_chain_id(&self, atom: &Atom) -> String {
82        self.chain_lookup
83            .get(&atom.serial_number())
84            .unwrap_or(&"A".to_string())
85            .clone()
86    }
87
88    /// Finds the residue serial number associated with a given atom.
89    ///
90    /// This method looks up the residue serial number for an atom based on its serial number. If no residue number is found, the default value `0` is returned.
91    ///
92    fn find_residue_number(&self, atom: &Atom) -> isize {
93        *self.residue_lookup.get(&atom.serial_number()).unwrap_or(&0)
94    }
95}
96
97/// Performs a neighbor search for residues within a specified radius of target residues.
98///
99/// This function uses a K-d tree to efficiently find residues that are within a given radius
100/// of any atom in the target residues.
101///
102/// # Arguments
103///
104/// * `pdb` - A `pdbtbx::PDB` structure representing the entire protein.
105/// * `target_residues_numbers` - A vector of references to `pdbtbx::Residue` objects representing the target residues.
106/// * `radius` - A `f64` value specifying the search radius in Angstroms.
107///
108/// # Returns
109///
110/// A `Vec<isize>` containing the sorted serial numbers of residues that are within the specified
111/// radius of any atom in the target residues, excluding the target residues themselves.
112///
113/// # Algorithm
114///
115/// 1. Constructs a K-d tree from all atom coordinates in the PDB structure.
116/// 2. For each atom in the target residues:
117///    a. Performs a radius search in the K-d tree.
118///    b. Identifies the residues corresponding to the atoms found in the search.
119/// 3. Removes the target residues from the result set.
120/// 4. Sorts the resulting residue serial numbers.
121///
122/// # Notes
123///
124/// - The function uses the `KdTree` data structure for efficient spatial searching.
125/// - The current implementation may have performance bottlenecks in the atom-to-residue mapping step.
126/// - The function assumes that atom coordinates are unique for identification purposes.
127///
128/// # TODO
129///
130/// - Optimize the atom-to-residue mapping step for better performance.
131pub fn neighbor_search(
132    pdb: pdbtbx::PDB,
133    target_residues_numbers: Vec<&pdbtbx::Residue>,
134    radius: f64,
135) -> Vec<isize> {
136    let points: Vec<[f64; 3]> = pdb
137        .atoms()
138        .map(|atom| {
139            let x = atom.x();
140            let y = atom.y();
141            let z = atom.z();
142            [x, y, z]
143        })
144        .collect();
145
146    let kdtree = KdTree::build_by_ordered_float(points.clone());
147
148    // Find the coordinates of each target residue
149    // let mut neighbors: Vec<&Atom> = Vec::new();
150    let mut result: HashSet<isize> = HashSet::new();
151    for residue in &target_residues_numbers {
152        let query_vec: Vec<[f64; 3]> = residue
153            .atoms()
154            .map(|atom| {
155                let x = atom.x();
156                let y = atom.y();
157                let z = atom.z();
158                [x, y, z]
159            })
160            .collect();
161
162        for query in query_vec {
163            let found_vec = kdtree.within_radius(&query, radius);
164            for found in found_vec {
165                // Loop over the whole PDB and find an atom that matches the found point
166                // FIXME: Optimize this
167                for residue in pdb.residues() {
168                    for atom in residue.atoms() {
169                        let x = atom.x();
170                        let y = atom.y();
171                        let z = atom.z();
172                        if x == found[0] && y == found[1] && z == found[2] {
173                            result.insert(residue.serial_number());
174                        }
175                    }
176                }
177            }
178        }
179    }
180    // Remove the target residues from the result
181    for residue in target_residues_numbers {
182        result.remove(&residue.serial_number());
183    }
184
185    // Sort the result
186    let mut result: Vec<isize> = result.into_iter().collect();
187    result.sort();
188
189    result
190}
191
192/// Retrieves specific residues from a PDB structure based on their serial numbers.
193///
194/// # Arguments
195///
196/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
197/// * `target` - A vector of `isize` values representing the serial numbers of the residues to retrieve.
198///
199/// # Returns
200///
201/// A `Vec<&Residue>` containing references to the residues whose serial numbers match those in the `target` vector.
202///
203/// # Notes
204///
205/// - This function performs a linear search through all residues in the PDB structure.
206/// - The order of residues in the result vector matches the order they appear in the PDB structure,
207///   not necessarily the order of serial numbers in the `target` vector.
208/// - If a serial number in `target` doesn't match any residue in the PDB, it is simply ignored.
209pub fn get_residues(pdb: &pdbtbx::PDB, target: Vec<isize>) -> Vec<&Residue> {
210    let mut result: Vec<&Residue> = Vec::new();
211    for residue in pdb.residues() {
212        if target.contains(&residue.serial_number()) {
213            result.push(residue);
214        }
215    }
216    result
217}
218
219/// Identifies the true interface residues between chains in a PDB structure.
220///
221/// This function determines interface residues by finding pairs of residues from different chains
222/// that have atoms within a specified cutoff distance of each other.
223///
224/// # Arguments
225///
226/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
227/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
228///   for residues to be considered part of the interface.
229///
230/// # Returns
231///
232/// A `HashMap<String, HashSet<isize>>` where:
233/// - The keys are chain identifiers (as strings).
234/// - The values are `HashSet`s of residue serial numbers (as `isize`) that are part of the interface for that chain.
235///
236/// # Algorithm
237///
238/// 1. Iterates over all pairs of chains in the PDB structure.
239/// 2. For each pair of chains, compares all residues between the two chains.
240/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
241/// 4. If atoms are within the cutoff, both residues are added to the interface set for their respective chains.
242///
243/// # Notes
244///
245/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
246/// - The cutoff is applied to atom-atom distances, not residue-residue distances.
247/// - A residue is considered part of the interface if any of its atoms are within the cutoff of any atom in a residue from another chain.
248/// - The function does not distinguish between different types of atoms or residues.
249pub fn get_true_interface(pdb: &pdbtbx::PDB, cutoff: f64) -> HashMap<String, HashSet<isize>> {
250    let mut true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
251    for chain_i in pdb.chains() {
252        for chain_j in pdb.chains() {
253            if chain_i.id() == chain_j.id() {
254                continue;
255            }
256            for res_i in chain_i.residues() {
257                for res_j in chain_j.residues() {
258                    // Check if any of the atoms are touching each other
259                    for atom_i in res_i.atoms() {
260                        for atom_j in res_j.atoms() {
261                            let distance = atom_i.distance(atom_j);
262                            if distance < cutoff {
263                                true_interface
264                                    .entry(chain_i.id().to_string())
265                                    .or_default()
266                                    .insert(res_i.serial_number());
267                                true_interface
268                                    .entry(chain_j.id().to_string())
269                                    .or_default()
270                                    .insert(res_j.serial_number());
271                                break;
272                            }
273                        }
274                    }
275                }
276            }
277        }
278    }
279    true_interface
280}
281
282/// Identifies pairs of chains in a PDB structure that are in contact with each other.
283///
284/// This function determines which chains are in contact by finding pairs of residues from different chains
285/// that have atoms within a specified cutoff distance of each other.
286///
287/// # Arguments
288///
289/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
290/// * `cutoff` - A `f64` value specifying the maximum distance (in Angstroms) between atoms
291///   for chains to be considered in contact.
292///
293/// # Returns
294///
295/// A `HashSet<(String, String)>` where each tuple represents a pair of chain identifiers that are in contact.
296/// The order of chain identifiers in each tuple is arbitrary.
297///
298/// # Algorithm
299///
300/// 1. Iterates over all pairs of chains in the PDB structure.
301/// 2. For each pair of chains, compares all residues between the two chains.
302/// 3. For each pair of residues, checks if any pair of atoms (one from each residue) is within the cutoff distance.
303/// 4. If atoms are within the cutoff, the pair of chain identifiers is added to the result set.
304///
305/// # Notes
306///
307/// - This function performs an exhaustive search, which may be computationally expensive for large structures.
308/// - The cutoff is applied to atom-atom distances, not residue-residue or chain-chain distances.
309/// - A pair of chains is considered in contact if any atom from one chain is within the cutoff distance of any atom from the other chain.
310/// - The function does not distinguish between different types of atoms or residues.
311/// - Each pair of chains appears only once in the result set, regardless of the number of contacts between them.
312pub fn get_chains_in_contact(pdb: &pdbtbx::PDB, cutoff: f64) -> HashSet<(String, String)> {
313    let mut chains_in_contact: HashSet<(String, String)> = HashSet::new();
314
315    for chain_i in pdb.chains() {
316        for chain_j in pdb.chains() {
317            if chain_i.id() == chain_j.id() {
318                continue;
319            }
320            for res_i in chain_i.residues() {
321                for res_j in chain_j.residues() {
322                    // Check if any of the atoms are touching each other
323                    for atom_i in res_i.atoms() {
324                        for atom_j in res_j.atoms() {
325                            let distance = atom_i.distance(atom_j);
326                            if distance < cutoff {
327                                chains_in_contact
328                                    .insert((chain_i.id().to_string(), chain_j.id().to_string()));
329                                break;
330                            }
331                        }
332                    }
333                }
334            }
335        }
336    }
337    chains_in_contact
338}
339
340/// Represents two residues in a protein chain.
341///
342/// A `Gap` is defined by two residues that are sequential in the primary structure
343/// but have a distance between their atoms that exceeds a certain threshold in the
344/// tertiary structure.
345#[derive(Debug)]
346pub struct Gap {
347    /// The chain identifier where the gap is located.
348    pub chain: String,
349
350    /// The name of the atom in the first residue used for distance calculation.
351    pub atom_i: String,
352
353    /// The name of the atom in the second residue used for distance calculation.
354    pub atom_j: String,
355
356    /// The sequence number of the first residue.
357    pub res_i: isize,
358
359    /// The sequence number of the second residue.
360    pub res_j: isize,
361
362    /// The distance between the specified atoms of the two residues.
363    pub distance: f64,
364}
365
366/// Filters a vector of `Gap` elements, retaining only the unique elements based on the `atom_j` value.
367///
368/// This function removes duplicate `Gap` elements where the `atom_j` value is the same across multiple entries,
369/// keeping only the first occurrence of each unique `atom_j`. This is useful when you only need to retain
370/// one `Gap` entry for each unique `atom_j` regardless of the `atom_i` or other fields.
371///
372pub fn filter_unique_by_atom_j(gaps: Vec<Gap>) -> Vec<Gap> {
373    let mut seen = HashSet::new();
374    let mut unique: Vec<Gap> = Vec::new();
375
376    gaps.into_iter().for_each(|g| {
377        if !seen.contains(&g.atom_j) {
378            seen.insert(g.atom_j.clone());
379            unique.push(g)
380        }
381    });
382
383    unique
384}
385
386/// Identifies separate bodies within a protein structure based on CA atom distances.
387///
388/// This function segments the protein into separate "bodies" by analyzing the distances
389/// between consecutive CA (alpha carbon) atoms in each chain. A new body is defined when
390/// the distance between consecutive CA atoms exceeds 4 Angstroms.
391///
392/// # Arguments
393///
394/// * `pdb` - A reference to a `pdbtbx::PDB` structure representing the entire protein.
395///
396/// # Returns
397///
398/// A `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` where:
399/// - The key is a body identifier (starting from 0).
400/// - The value is a vector of tuples, each containing:
401///   - The residue serial number (`isize`)
402///   - The chain identifier (`&str`)
403///   - A reference to the CA atom (`&pdbtbx::Atom`)
404///
405/// # Algorithm
406///
407/// 1. Extracts all CA atoms from the PDB structure, along with their chain and residue information.
408/// 2. Iterates through consecutive CA atoms within each chain.
409/// 3. If the distance between consecutive CA atoms exceeds 4 Angstroms, a new body is started.
410/// 4. Groups CA atoms into bodies based on this distance criterion.
411///
412/// # Notes
413///
414/// - This function only considers CA atoms for body identification.
415/// - The 4 Angstrom cutoff is hard-coded and may not be suitable for all use cases.
416/// - Bodies are numbered sequentially starting from 0.
417/// - Gaps between chains always result in a new body, regardless of spatial proximity.
418/// - The function assumes that CA atoms within a chain are listed in sequence order.
419pub fn find_bodies(pdb: &pdbtbx::PDB) -> HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> {
420    // Check if the distance of a given atom to its next one is higher than 4A
421
422    // Get only the `CA` atoms
423    let mut ca_atoms: Vec<(&str, isize, &pdbtbx::Atom)> = Vec::new();
424    pdb.chains().for_each(|chain| {
425        chain.residues().for_each(|residue| {
426            residue.atoms().for_each(|atom| {
427                if atom.name() == "CA" {
428                    ca_atoms.push((chain.id(), residue.serial_number(), atom));
429                }
430            });
431        });
432    });
433    let mut bodies: HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>> = HashMap::new();
434    let mut body_id = 0;
435    for (i, j) in ca_atoms.iter().zip(ca_atoms.iter().skip(1)) {
436        let (chain_i, res_i, atom_i) = i;
437        let (chain_j, _res_j, atom_j) = j;
438        if chain_i != chain_j {
439            continue;
440        }
441        let distance = atom_i.distance(atom_j);
442        if distance > 4.0 {
443            body_id += 1;
444        }
445        bodies
446            .entry(body_id)
447            .or_default()
448            .push((*res_i, chain_i, atom_i));
449    }
450
451    bodies
452}
453
454/// Creates a list of gaps between different bodies in a protein structure.
455///
456/// This function analyzes the spatial relationships between separate bodies of a protein
457/// structure and generates a list of `Gap` instances representing the distances between
458/// randomly selected pairs of atoms from different bodies.
459///
460/// # Arguments
461///
462/// * `bodies` - A reference to a `HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>` representing
463///   the bodies of the protein structure, as returned by the `find_bodies` function.
464///
465/// # Returns
466///
467/// A `Vec<Gap>` containing `Gap` instances that represent the distances between pairs of
468/// atoms from different bodies.
469///
470/// # Algorithm
471///
472/// 1. Initializes a random number generator with a fixed seed (42) for reproducibility.
473/// 2. Iterates over all unique pairs of bodies.
474/// 3. For each pair of bodies with at least 2 atoms each:
475///    a. Randomly selects 2 atoms from each body.
476///    b. Creates 2 `Gap` instances, one for each pair of selected atoms.
477/// 4. Collects all created `Gap` instances into a vector.
478///
479/// # Notes
480///
481/// - The function uses a fixed random seed (42) for reproducibility. This means that
482///   for the same input, it will always produce the same output.
483/// - Only bodies with at least 2 atoms are considered for gap creation.
484/// - The function creates 2 gaps for each pair of bodies, using different randomly selected atoms.
485/// - The `Gap` instances contain information about the chain, atom names, residue numbers,
486///   and the distance between the selected atoms.
487/// - This function assumes that the input `bodies` HashMap is structured as expected,
488///   with each value being a vector of tuples (residue number, chain ID, atom reference).
489///
490/// # Safety
491///
492/// This function uses `unwrap()` on the results of `choose_multiple()`. It assumes that
493/// the bodies contain at least 2 atoms each, as checked earlier in the function.
494pub fn create_iter_body_gaps(
495    bodies: &HashMap<isize, Vec<(isize, &str, &pdbtbx::Atom)>>,
496) -> Vec<Gap> {
497    let mut rng = StdRng::seed_from_u64(42);
498    let mut pairs: Vec<Gap> = Vec::new();
499    let mut body_ids: Vec<isize> = bodies.keys().cloned().collect();
500    body_ids.sort();
501    for (i, &body_id1) in body_ids.iter().enumerate() {
502        for &body_id2 in body_ids[i + 1..].iter() {
503            if let (Some(atoms1), Some(atoms2)) = (bodies.get(&body_id1), bodies.get(&body_id2)) {
504                if atoms1.len() >= 2 && atoms2.len() >= 2 {
505                    let selected1: Vec<_> = atoms1.choose_multiple(&mut rng, 2).cloned().collect();
506                    let selected2: Vec<_> = atoms2.choose_multiple(&mut rng, 2).cloned().collect();
507
508                    for i in 0..2 {
509                        pairs.push(Gap {
510                            chain: selected1[i].1.to_string(),
511                            atom_i: selected1[i].2.name().to_string(),
512                            atom_j: selected2[i].2.name().to_string(),
513                            res_i: selected1[i].0,
514                            res_j: selected2[i].0,
515                            distance: selected1[i].2.distance(selected2[i].2),
516                        });
517                    }
518                }
519            }
520        }
521    }
522
523    pairs
524}
525
526// pub fn calculate_z_axis(selection1: &[pdbtbx::Atom], selection2: &[pdbtbx::Atom]) -> Vector3<f64> {
527//     let center1 = calculate_geometric_center(selection1);
528//     let center2 = calculate_geometric_center(selection2);
529//     (center2 - center1).normalize()
530// }
531
532pub fn calculate_geometric_center(atoms: &[pdbtbx::Atom]) -> Vector3<f64> {
533    let mut center = Vector3::zeros();
534    for atom in atoms {
535        center += Vector3::new(atom.x(), atom.y(), atom.z());
536    }
537    center / (atoms.len() as f64)
538}
539
540// pub fn generate_axis_beads(start_z: f64, end_z: f64, num_beads: usize) -> Vec<Bead> {
541//     let mut beads = Vec::with_capacity(num_beads);
542//     let length = end_z - start_z;
543
544//     for i in 0..num_beads {
545//         let t = (i as f64) / ((num_beads - 1) as f64);
546//         let z = start_z + t * length;
547//         let position = Vector3::new(0.0, 0.0, z);
548//         beads.push(Bead { position });
549//     }
550//     beads
551// }
552
553pub fn generate_grid_beads(center_z: f64, grid_size: usize, spacing: f64) -> Vec<Bead> {
554    let mut beads = Vec::with_capacity(grid_size * grid_size);
555
556    let grid_offset = (grid_size as f64 - 1.0) * spacing / 2.0;
557
558    for i in 0..grid_size {
559        for j in 0..grid_size {
560            let x = (i as f64) * spacing - grid_offset;
561            let y = (j as f64) * spacing - grid_offset;
562
563            let final_pos = Vector3::new(x, y, center_z);
564            beads.push(Bead {
565                position: final_pos,
566            });
567        }
568    }
569
570    beads
571}
572
573pub fn write_beads_pdb(beads: &[Bead], filename: &str) -> std::io::Result<()> {
574    let mut file = File::create(filename)?;
575
576    for (i, bead) in beads.iter().enumerate() {
577        writeln!(
578            file,
579            "{:<6}{:>5} {:<4}{:1}{:<3} {:1}{:>4}{:1}   {:>8.3}{:>8.3}{:>8.3}{:>6.2}{:>6.2}          {:>2}{:>2}",
580            "ATOM",
581            i + 1,
582            "SHA",
583            " ",
584            "SHA",
585            "S",
586            i + 1,
587            " ",
588            bead.position.x,
589            bead.position.y,
590            bead.position.z,
591            1.00,
592            1.00,
593            "H",
594            ""
595        )?;
596    }
597    writeln!(file, "END")?;
598
599    Ok(())
600}
601
602pub fn find_furthest_selections(selections: &[Vec<isize>], pdb: &PDB) -> (Vec<Atom>, Vec<Atom>) {
603    let sele: HashMap<usize, (Vec<Atom>, Vector3<f64>)> = selections
604        .iter()
605        .enumerate()
606        .map(|(i, sel)| {
607            let atoms = get_atoms_from_resnumbers(pdb, sel);
608            let center = calculate_geometric_center(&atoms);
609            (i, (atoms, center))
610        })
611        .collect();
612
613    // Find the two selections that are the furthest apart
614    let mut max_distance = 0.0;
615    let mut atoms1 = Vec::new();
616    let mut atoms2 = Vec::new();
617    for (i, j) in (0..selections.len()).tuple_combinations() {
618        let distance = (sele[&i].1 - sele[&j].1).norm();
619        if distance > max_distance {
620            max_distance = distance;
621            atoms1 = sele[&i].0.clone();
622            atoms2 = sele[&j].0.clone();
623        }
624    }
625
626    (atoms1, atoms2)
627}
628
629pub fn get_atoms_from_resnumbers(pdb: &PDB, selection: &[isize]) -> Vec<Atom> {
630    pdb.residues()
631        .filter(|res| selection.contains(&res.serial_number()))
632        .flat_map(|res| res.atoms())
633        .cloned() // This clones each &Atom into an owned Atom
634        .collect()
635}
636
637/// Loads a PDB structure from a file path
638///
639/// # Arguments
640/// * `input_pdb` - Path to the PDB file
641///
642/// # Returns
643/// Result containing either the parsed PDB or a list of errors
644pub fn load_pdb(input_pdb: &str) -> Result<PDB, Vec<PDBError>> {
645    std::fs::read_to_string(input_pdb)
646        .map_err(|e| {
647            vec![PDBError::new(
648                pdbtbx::ErrorLevel::GeneralWarning,
649                "File read error",
650                format!("Failed to read file {}: {}", input_pdb, e),
651                pdbtbx::Context::None,
652            )]
653        })
654        .and_then(|content| process_pdb_contents(&content))
655}
656
657/// Loads a PDB structure from string content
658///
659/// # Arguments
660/// * `content` - PDB file contents as a string
661///
662/// # Returns
663/// Result containing either the parsed PDB or a list of errors
664pub fn load_pdb_from_content(content: &str) -> Result<PDB, Vec<PDBError>> {
665    process_pdb_contents(content)
666}
667
668/// Processes raw PDB content by removing remarks and padding lines
669///
670/// # Arguments
671/// * `content` - Raw PDB content
672///
673/// # Returns
674/// Processed PDB content ready for parsing
675pub fn process_pdb_contents(pdb_content: &str) -> Result<PDB, Vec<PDBError>> {
676    let processed_content = process_pdb_string(pdb_content);
677
678    let mut opts = ReadOptions::new();
679    opts.set_format(pdbtbx::Format::Pdb)
680        .set_level(pdbtbx::StrictnessLevel::Loose);
681
682    let cursor = Cursor::new(processed_content.into_bytes());
683    let reader = BufReader::new(cursor);
684
685    match opts.read_raw(reader) {
686        Ok((pdb, _)) => Ok(pdb),
687        Err(e) => Err(e),
688    }
689}
690
691/// Processes raw PDB content by removing remarks and padding lines
692///
693/// # Arguments
694/// * `content` - Raw PDB content
695///
696/// # Returns
697/// Processed PDB content ready for parsing
698fn process_pdb_string(content: &str) -> String {
699    let mut output = String::with_capacity(content.len());
700
701    // Process lines with single allocation
702    for line in content.lines() {
703        if !line.starts_with("REMARK") {
704            output.push_str(line);
705            if line.len() < 80 {
706                output.push_str(&" ".repeat(80 - line.len()));
707            }
708            output.push('\n');
709        }
710    }
711
712    output
713}
714
715/// Finds the closest residue pairs between different protein chains within a specified distance cutoff.
716///
717/// This function analyzes inter-chain interactions by identifying the closest atom pairs between residues
718/// from different chains, using the alpha carbon (CA) atoms for distance calculation.
719///
720/// # Arguments
721///
722/// * pdb - A reference to a parsed PDB structure.
723/// * cutoff - A floating-point value specifying the maximum distance (in Angstroms) for considering residue pairs.
724///
725/// # Returns
726///
727/// A Vec<Pair> containing the closest residue pairs within the specified cutoff, sorted by distance.
728///
729/// # Functionality
730///
731/// 1. Iterates through all unique chain pairs in the PDB structure.
732/// 2. For each chain pair, finds the closest atoms between residues.
733/// 3. Selects pairs where the inter-residue distance is less than the specified cutoff.
734/// 4. Uses alpha carbon (CA) atoms for distance calculation and pair representation.
735/// 5. Sorts the resulting pairs by their inter-alpha carbon distance.
736///
737/// # Notes
738///
739/// - Only inter-chain interactions are considered.
740/// - The distance is calculated based on the closest atom pairs between residues.
741/// - The resulting pairs are sorted from shortest to longest distance.
742pub fn get_closest_residue_pairs(pdb: &pdbtbx::PDB, cutoff: f64) -> Vec<Pair> {
743    let mut closest_pairs = Vec::new();
744
745    let chains: Vec<_> = pdb.chains().collect();
746
747    for i in 0..chains.len() {
748        for j in (i + 1)..chains.len() {
749            let chain_i = &chains[i];
750            let chain_j = &chains[j];
751
752            for res_i in chain_i.residues() {
753                let mut min_distance = f64::MAX;
754                let mut closest_pair = None;
755
756                for res_j in chain_j.residues() {
757                    let mut atom_dist = f64::MAX;
758                    for atom_i in res_i.atoms() {
759                        for atom_j in res_j.atoms() {
760                            let distance = atom_i.distance(atom_j);
761                            if distance < atom_dist {
762                                atom_dist = distance;
763                            }
764                        }
765                    }
766                    if atom_dist < min_distance {
767                        min_distance = atom_dist;
768                        closest_pair = Some((res_j, res_i));
769                    }
770                }
771
772                if min_distance < cutoff {
773                    if let Some((res_j, res_i)) = closest_pair {
774                        let ca_i = res_i.atoms().find(|atom| atom.name() == "CA").unwrap();
775                        let ca_j = res_j.atoms().find(|atom| atom.name() == "CA").unwrap();
776                        let ca_ca_dist = ca_i.distance(ca_j);
777                        closest_pairs.push(Pair {
778                            chain_i: chain_i.id().to_string(),
779                            res_i: res_i.serial_number(),
780                            atom_i: ca_i.name().to_string(),
781                            chain_j: chain_j.id().to_string(),
782                            res_j: res_j.serial_number(),
783                            atom_j: ca_j.name().to_string(),
784                            distance: ca_ca_dist,
785                        });
786                    }
787                }
788            }
789        }
790    }
791
792    closest_pairs.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap());
793
794    closest_pairs
795}
796
797/// Finds pairs of atoms between a ligand and nearby protein residues within a certain distance.
798///
799/// This function processes the given PDB structure to identify heteroatoms (ligand atoms) and
800/// their neighboring atoms within a specified distance threshold (5.0 Å). It then finds the closest
801/// protein atom for each ligand atom and returns a vector of `Pair` structures containing information
802/// about the ligand atom and its closest protein atom. The result is a list of ligand-protein pairs
803/// and their corresponding distance, residue, and chain information.
804///
805pub fn gaps_around_ligand(pdb: &PDB) -> Vec<Gap> {
806    let ligand_atoms: Vec<&Atom> = pdb
807        .atoms()
808        .filter(|a| a.hetero() && *a.element().unwrap() != Element::H)
809        .collect();
810
811    let ligand_residues: Vec<&Residue> = pdb
812        .residues()
813        .filter(|r| r.atoms().any(|a| a.hetero()))
814        .collect();
815
816    let neighbors = neighbor_search(pdb.clone(), ligand_residues, 5.0);
817    let protein_atoms = get_atoms_from_resnumbers(pdb, &neighbors);
818
819    let mut result: Vec<Gap> = Vec::new();
820
821    let finder = Finder::new(pdb);
822
823    for ligand_atom in ligand_atoms {
824        let closest_protein_atom = protein_atoms
825            .iter()
826            .filter(|a| *a.element().unwrap() != Element::H)
827            .map(|a| (a, ligand_atom.distance(a)))
828            .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
829            .unwrap();
830
831        let ligand_chain = finder.find_chain_id(ligand_atom);
832        let protein_chain = finder.find_chain_id(closest_protein_atom.0);
833
834        if ligand_chain == protein_chain {
835            result.push(Gap {
836                chain: ligand_chain,
837                res_i: finder.find_residue_number(ligand_atom),
838                atom_i: ligand_atom.name().to_string(),
839                res_j: finder.find_residue_number(closest_protein_atom.0),
840                atom_j: closest_protein_atom.0.name().to_string(),
841                distance: closest_protein_atom.1,
842            })
843        }
844    }
845
846    result
847}
848
849#[cfg(test)]
850mod tests {
851
852    use std::env;
853
854    use tempfile::NamedTempFile;
855
856    use super::*;
857
858    #[test]
859    fn test_get_true_interface() {
860        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
861
862        let observed_true_interface = get_true_interface(&pdb, 5.0);
863
864        let mut expected_true_interface: HashMap<String, HashSet<isize>> = HashMap::new();
865        let chain_a = "A".to_string();
866        let res_a: HashSet<isize> = vec![934, 936, 933, 946, 950, 938, 940, 941, 937, 931]
867            .into_iter()
868            .collect();
869        let chain_b = "B";
870        let res_b: HashSet<isize> = vec![66, 48, 68, 49, 46, 45, 44, 47, 69, 6, 70, 8, 42]
871            .into_iter()
872            .collect();
873
874        expected_true_interface.insert(chain_a, res_a);
875        expected_true_interface.insert(chain_b.to_string(), res_b);
876
877        assert_eq!(observed_true_interface, expected_true_interface);
878    }
879
880    #[test]
881    fn test_load_pdb_short_lines() {
882        let pdb_path = env::current_dir()
883            .unwrap()
884            .join("tests/data/leu_short_lines.pdb");
885        let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
886        assert_eq!(pdb.atoms().count(), 8);
887    }
888
889    #[test]
890    fn test_load_pdb_normal_lines() {
891        let pdb_path = env::current_dir()
892            .unwrap()
893            .join("tests/data/leu_normal_lines.pdb");
894        let pdb = load_pdb(pdb_path.to_str().unwrap()).unwrap();
895        assert_eq!(pdb.atoms().count(), 8);
896    }
897    // Updated test helper function that works with older Rust versions
898    fn create_test_pdb(remarks: usize, atoms: usize) -> String {
899        let mut pdb = String::new();
900
901        // Add remarks
902        for i in 0..remarks {
903            pdb.push_str(&format!("REMARK {} Test remark\n", i));
904        }
905
906        // Add atoms (ATOM records)
907        for i in 0..atoms {
908            pdb.push_str(&format!(
909                "ATOM  {:>5}  N   ALA A{:>4}    {:>8}{:>8}{:>8}  1.00  0.00           N\n",
910                i + 1,
911                i + 1,
912                format!("{:.3}", i as f32),
913                format!("{:.3}", i as f32),
914                format!("{:.3}", i as f32)
915            ));
916        }
917
918        pdb
919    }
920
921    #[test]
922    fn test_process_pdb_string_removes_remarks() {
923        let input = "REMARK 1 Test\nATOM      1  N   ALA A   1       0.000   0.000   0.000\n";
924        let processed = process_pdb_string(input);
925
926        assert!(!processed.contains("REMARK"));
927        assert!(processed.contains("ATOM"));
928    }
929
930    #[test]
931    fn test_process_pdb_string_pads_lines() {
932        let short_line = "ATOM      1  N   ALA A   1       0.000   0.000   0.000";
933        let processed = process_pdb_string(short_line);
934
935        // Check line is padded to 80 chars (including newline)
936        assert_eq!(processed.lines().next().unwrap().len(), 80);
937        assert_eq!(processed.chars().count(), 81); // 80 + newline
938    }
939
940    #[test]
941    fn test_process_pdb_string_preserves_long_lines() {
942        let long_line =
943            "ATOM      1  N   ALA A   1       0.000   0.000   0.000  1.00  0.00           N  EXTRA";
944        let processed = process_pdb_string(long_line);
945
946        // Line should remain unchanged (except newline)
947        assert_eq!(processed.trim(), long_line);
948    }
949
950    #[test]
951    fn test_process_pdb_contents_valid_pdb() {
952        let pdb_content = create_test_pdb(3, 5);
953        let result = process_pdb_contents(&pdb_content);
954
955        assert!(result.is_ok());
956        let pdb = result.unwrap();
957        assert_eq!(pdb.atoms().count(), 5);
958    }
959
960    #[test]
961    fn test_process_pdb_contents_empty_input() {
962        let result = process_pdb_contents("");
963        assert!(result.is_err());
964
965        if let Err(errors) = result {
966            assert_eq!(errors.len(), 1);
967        }
968    }
969
970    #[test]
971    fn test_process_pdb_contents_invalid_pdb() {
972        let invalid_content = "This is not a PDB file\n";
973        let result = process_pdb_contents(invalid_content);
974
975        assert!(result.is_err());
976        if let Err(errors) = result {
977            assert!(!errors.is_empty());
978        }
979    }
980
981    #[test]
982    fn test_process_pdb_string_empty_input() {
983        let processed = process_pdb_string("");
984        assert_eq!(processed, "");
985    }
986
987    #[test]
988    fn test_process_pdb_string_mixed_content() {
989        let input = "\
990REMARK 1 Test
991ATOM      1  N   ALA A   1       0.000   0.000   0.000
992REMARK 2 Another
993ATOM      2  N   ALA A   2       1.000   1.000   1.000
994";
995        let processed = process_pdb_string(input);
996        let lines: Vec<&str> = processed.lines().collect();
997
998        assert_eq!(lines.len(), 2); // Only ATOM lines should remain
999        assert!(lines[0].starts_with("ATOM"));
1000        assert!(lines[1].starts_with("ATOM"));
1001    }
1002
1003    #[test]
1004    fn test_process_pdb_contents_with_real_file() {
1005        // Create a temporary PDB file
1006        let mut file = NamedTempFile::new().unwrap();
1007        let pdb_content = create_test_pdb(2, 3);
1008        write!(file, "{}", pdb_content).unwrap();
1009
1010        // Test with file content
1011        let file_content = std::fs::read_to_string(file.path()).unwrap();
1012        let result = process_pdb_contents(&file_content);
1013
1014        assert!(result.is_ok());
1015        let pdb = result.unwrap();
1016        assert_eq!(pdb.atoms().count(), 3);
1017    }
1018
1019    #[test]
1020    fn test_get_closest_residue_pairs() {
1021        let pdb = load_pdb("tests/data/two_res.pdb").unwrap();
1022        let observed_pairs = get_closest_residue_pairs(&pdb, 5.0);
1023        let expected_pairs = &[Pair {
1024            chain_i: "A".to_string(),
1025            chain_j: "B".to_string(),
1026            atom_i: "CA".to_string(),
1027            atom_j: "CA".to_string(),
1028            res_i: 2,
1029            res_j: 10,
1030            distance: 9.1,
1031        }];
1032        assert_eq!(observed_pairs.len(), expected_pairs.len());
1033        let pair = &observed_pairs[0];
1034        assert_eq!(pair.chain_i, expected_pairs[0].chain_i);
1035        assert_eq!(pair.chain_j, expected_pairs[0].chain_j);
1036        assert_eq!(pair.atom_i, expected_pairs[0].atom_i);
1037        assert_eq!(pair.atom_j, expected_pairs[0].atom_j);
1038        assert_eq!(pair.res_i, expected_pairs[0].res_i);
1039        assert_eq!(pair.res_j, expected_pairs[0].res_j);
1040        assert!((pair.distance - 9.1).abs() < 0.1);
1041    }
1042    #[test]
1043    fn test_gaps_around_ligand() {
1044        let pdb = load_pdb("tests/data/prot_ligand.pdb").unwrap();
1045        let observed = gaps_around_ligand(&pdb);
1046        let expected = &[
1047            Gap {
1048                chain: "E".to_string(),
1049                res_i: 351,
1050                atom_i: "N1".to_string(),
1051                res_j: 123,
1052                atom_j: "N".to_string(),
1053                distance: 3.0400192433601467,
1054            },
1055            Gap {
1056                chain: "E".to_string(),
1057                res_i: 351,
1058                atom_i: "C2".to_string(),
1059                res_j: 327,
1060                atom_j: "CE2".to_string(),
1061                distance: 3.79,
1062            },
1063        ];
1064
1065        // Compare chain, residue, and atom names
1066        assert_eq!(observed[0].chain, expected[0].chain);
1067        assert_eq!(observed[0].res_i, expected[0].res_i);
1068        assert_eq!(observed[0].atom_i, expected[0].atom_i);
1069        assert_eq!(observed[0].res_j, expected[0].res_j);
1070        assert_eq!(observed[0].atom_j, expected[0].atom_j);
1071
1072        assert_eq!(observed[1].chain, expected[1].chain);
1073        assert_eq!(observed[1].res_i, expected[1].res_i);
1074        assert_eq!(observed[1].atom_i, expected[1].atom_i);
1075        assert_eq!(observed[1].res_j, expected[1].res_j);
1076        assert_eq!(observed[1].atom_j, expected[1].atom_j);
1077
1078        // TODO: Find a smart way to check the distances
1079        //
1080        // assert_eq!(observed[0].distance, expected[0].distance);
1081        // assert_eq!(observed[1].distance, expected[1].distance);
1082    }
1083
1084    #[test]
1085    fn test_finder() {
1086        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1087        let finder = Finder::new(&pdb);
1088
1089        assert_eq!(finder.chain_lookup.keys().len(), 924);
1090        assert_eq!(finder.residue_lookup.keys().len(), 924)
1091    }
1092
1093    #[test]
1094    fn test_finder_find_chain_id() {
1095        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1096        let finder = Finder::new(&pdb);
1097
1098        let found = finder.find_chain_id(pdb.atom(1).expect(""));
1099        assert_eq!(found, "A");
1100
1101        let found = finder.find_chain_id(pdb.atom(700).expect(""));
1102        assert_eq!(found, "B");
1103    }
1104
1105    #[test]
1106    fn test_finder_find_residue_number() {
1107        let pdb = load_pdb("tests/data/complex.pdb").unwrap();
1108        let finder = Finder::new(&pdb);
1109
1110        let found = finder.find_residue_number(pdb.atom(1).unwrap());
1111
1112        assert_eq!(found, 929)
1113    }
1114
1115    #[test]
1116    fn test_filter_unique_by_atom_j() {
1117        let gaps = vec![
1118            Gap {
1119                chain: "A".to_string(),
1120                atom_i: "C".to_string(),
1121                atom_j: "B".to_string(),
1122                res_i: 1,
1123                res_j: 2,
1124                distance: 3.5,
1125            },
1126            Gap {
1127                chain: "A".to_string(),
1128                atom_i: "A".to_string(),
1129                atom_j: "B".to_string(),
1130                res_i: 3,
1131                res_j: 4,
1132                distance: 2.8,
1133            },
1134            Gap {
1135                chain: "B".to_string(),
1136                atom_i: "C".to_string(),
1137                atom_j: "D".to_string(),
1138                res_i: 5,
1139                res_j: 6,
1140                distance: 4.1,
1141            },
1142            Gap {
1143                chain: "A".to_string(),
1144                atom_i: "E".to_string(),
1145                atom_j: "B".to_string(),
1146                res_i: 7,
1147                res_j: 8,
1148                distance: 1.5,
1149            },
1150        ];
1151
1152        let filtered = filter_unique_by_atom_j(gaps);
1153
1154        // The function should remove duplicates based on `atom_j` and keep only the first occurrence.
1155        assert_eq!(filtered.len(), 2); // "B" and "D" should remain
1156        assert_eq!(filtered[0].atom_j, "B");
1157        assert_eq!(filtered[1].atom_j, "D");
1158
1159        // Verify the correct `atom_j` is retained
1160        assert!(
1161            filtered
1162                .iter()
1163                .all(|gap| gap.atom_j != "B" || gap.atom_i == "C")
1164        );
1165    }
1166
1167    #[test]
1168    fn test_no_duplicates() {
1169        let gaps = vec![
1170            Gap {
1171                chain: "A".to_string(),
1172                atom_i: "C".to_string(),
1173                atom_j: "B".to_string(),
1174                res_i: 1,
1175                res_j: 2,
1176                distance: 3.5,
1177            },
1178            Gap {
1179                chain: "A".to_string(),
1180                atom_i: "A".to_string(),
1181                atom_j: "D".to_string(),
1182                res_i: 3,
1183                res_j: 4,
1184                distance: 2.8,
1185            },
1186        ];
1187
1188        let filtered = filter_unique_by_atom_j(gaps);
1189        assert_eq!(filtered.len(), 2); // No duplicates, so the length should be the same
1190    }
1191
1192    #[test]
1193    fn test_empty_input() {
1194        let gaps: Vec<Gap> = Vec::new();
1195        let filtered = filter_unique_by_atom_j(gaps);
1196        assert!(filtered.is_empty()); // Should return an empty list
1197    }
1198
1199    #[test]
1200    fn test_single_element() {
1201        let gaps = vec![Gap {
1202            chain: "A".to_string(),
1203            atom_i: "C".to_string(),
1204            atom_j: "B".to_string(),
1205            res_i: 1,
1206            res_j: 2,
1207            distance: 3.5,
1208        }];
1209
1210        let filtered = filter_unique_by_atom_j(gaps);
1211        assert_eq!(filtered.len(), 1); // Single element, should remain
1212        assert_eq!(filtered[0].atom_j, "B");
1213    }
1214}