pdbtbx 0.10.1

A library to open/edit/save (crystallographic) Protein Data Bank (PDB) and mmCIF files.
Documentation
use crate::error::*;
use crate::structs::*;

use std::collections::HashMap;

/// Validate the SEQRES data found, if there is any
#[allow(
    clippy::comparison_chain,
    clippy::cast_possible_wrap,
    clippy::cast_sign_loss
)]
pub fn validate_seqres(
    pdb: &mut PDB,
    sequence: HashMap<String, Vec<(usize, usize, Vec<String>)>>,
    lines: Vec<String>,
    start_linenumber: usize,
    context: &Context,
) -> Vec<PDBError> {
    let mut errors = Vec::new();
    for (chain_id, data) in sequence {
        if let Some(chain) = pdb.chains_mut().find(|c| c.id() == chain_id) {
            let mut chain_sequence = Vec::new();
            let mut serial = 1;
            let mut residues = 0;
            for (line, (ser_num, res_num, seq)) in data.into_iter().enumerate() {
                if serial != ser_num {
                    errors.push(PDBError::new(
                        ErrorLevel::StrictWarning,
                        "SEQRES serial number invalid",
                        &format!("The serial number for SEQRES chain \"{}\" with number \"{}\" does not follow sequentially from the previous row.", chain_id, ser_num),
                        context.clone()
                    ));
                }
                serial += 1;
                if residues == 0 {
                    residues = res_num;
                } else if residues != res_num {
                    errors.push(PDBError::new(
                        ErrorLevel::StrictWarning,
                        "SEQRES residue total invalid",
                        &format!("The residue total for SEQRES chain \"{}\" with number \"{}\" does not match the total on the first row for this chain.", chain_id, ser_num),
                        context.clone()
                    ));
                }
                chain_sequence.extend(
                    seq.into_iter()
                        .enumerate()
                        .map(|(column, item)| (item, (line, column))),
                );
            }
            if chain_sequence.len() != residues {
                errors.push(PDBError::new(
                    ErrorLevel::LooseWarning,
                    "SEQRES residue total invalid",
                    &format!("The residue total for SEQRES chain \"{}\" does not match the total residues found in the seqres records.", chain_id),
                    context.clone()
                ));
            }
            let mut offset = 0;
            if let Some(db_ref) = chain.database_reference() {
                offset = db_ref.pdb_position.start;
                for dif in &db_ref.differences {
                    if dif.database_residue.is_none() && dif.residue.1 < db_ref.pdb_position.start {
                        // If there is a residue in front of the db sequence
                        offset -= 1;
                    }
                }
                if db_ref.pdb_position.end - offset + 1 != residues as isize {
                    errors.push(PDBError::new(
                        ErrorLevel::LooseWarning,
                        "SEQRES residue total invalid",
                        &format!("The residue total ({}) for SEQRES chain \"{}\" does not match the total residues found in the dbref record ({}).", residues, chain_id, db_ref.pdb_position.end - offset + 1),
                        context.clone()
                    ));
                }
            }

            let copy = chain.clone();
            let mut chain_res = copy.residues();
            let mut next = chain_res.next();
            // Contain all inconsistencies between the SEQRES and found residues, group all by `chain_id` and keep a list of index, seqres item position, and chain_sequence.
            let mut seqres_inconsistent = Vec::new();

            for (raw_index, (seq, position)) in chain_sequence.iter().enumerate() {
                let index = raw_index as isize + offset;
                if let Some(n) = next {
                    if index == n.serial_number() {
                        if let Some(name) = n.name() {
                            if *seq != name {
                                #[allow(clippy::type_complexity)]
                                if let Some(item) = seqres_inconsistent.iter_mut().find(
                                    |item: &&mut (&str, Vec<(isize, (usize, usize), &str)>)| {
                                        item.0 == chain_id
                                    },
                                ) {
                                    item.1.push((index, *position, name));
                                } else {
                                    seqres_inconsistent
                                        .push((&chain_id, vec![(index, *position, name)]));
                                }
                            }
                        } else {
                            errors.push(PDBError::new(
                                ErrorLevel::StrictWarning,
                                "Multiple residues in SEQRES validation",
                                &format!("The residue index {} in chain {} has no conformers or multiple with different names. The program cannot validate the SEQRES record in this way.", index, chain_id),
                                context.clone()
                            )); // TODO: show found residues
                        }
                        next = chain_res.next();
                    } else if index < n.serial_number() {
                        chain.add_residue(
                            Residue::new(
                                index,
                                None,
                                Some(
                                    Conformer::new(seq, None, None)
                                        .expect("Invalid characters in Conformer generation"),
                                ),
                            )
                            .expect("Invalid characters in Residue generation"),
                        );
                        chain.sort();
                    } else {
                        errors.push(PDBError::new(
                            ErrorLevel::LooseWarning,
                            "Chain residue invalid",
                            &format!("The residue index {} value \"{:?}\" for Chain \"{}\" is not sequentially increasing, value expected: {}.", n.serial_number(), n.name(), chain_id, index),
                            context.clone()
                        ));
                        #[allow(clippy::while_let_on_iterator)]
                        while let Some(n) = chain_res.next() {
                            if n.serial_number() == index {
                                next = chain_res.next();
                                break;
                            }
                        }
                    }
                } else {
                    chain.add_residue(
                        Residue::new(
                            index,
                            None,
                            Some(
                                Conformer::new(seq, None, None)
                                    .expect("Invalid characters in Conformer generation"),
                            ),
                        )
                        .expect("Invalid characters in Residue generation"),
                    );
                    chain.sort();
                }
            }

            if !seqres_inconsistent.is_empty() {
                for (_chain, inconsistencies) in seqres_inconsistent {
                    // Find the extremes of the used lines from all SEQRES lines, assuming that all lines are continuous.
                    let used_lines = inconsistencies
                        .iter()
                        .fold((usize::MAX, usize::MIN), |a, v| {
                            (v.1 .0.min(a.0), v.1 .0.max(a.1))
                        });
                    let context_lines = &lines[used_lines.0..=used_lines.1];
                    let mut highlights = Vec::new();
                    let mut found_residues = Vec::new();

                    // Create all highlights in the original SEQRES definition.
                    for inconsistency in inconsistencies {
                        let line_offset = inconsistency.1 .0 - used_lines.0;
                        highlights.push((line_offset, 19 + 4 * inconsistency.1 .1, 3)); // Calculate the correct character position for the highlight (is index right now)

                        // Add the found residue to the list of found residues, in the correct line
                        if let Some(line) = found_residues
                            .iter_mut()
                            .find(|v: &&mut (usize, Vec<(usize, &str)>)| v.0 == line_offset)
                        {
                            line.1.push((inconsistency.1 .1, inconsistency.2));
                        } else {
                            found_residues
                                .push((line_offset, vec![(inconsistency.1 .1, inconsistency.2)]));
                        }
                    }
                    // Add all found residues into lines matching the SEQRES definition
                    let found_residues = found_residues
                        .iter()
                        .map(|line| {
                            line.1
                                .iter()
                                .enumerate()
                                .fold(("".to_string(), 0), |acc, v| {
                                    (
                                        acc.0
                                            + if v.1 .0.saturating_sub(1) == acc.1 || v.0 == 0 {
                                                " "
                                            } else {
                                                " ... "
                                            }
                                            + v.1 .1,
                                        v.1 .0,
                                    )
                                })
                                .0
                        })
                        .collect();

                    // Create the final error message. See example:
                    // LooseWarning: SEQRES inconsistent residues
                    //
                    //      |
                    // 1    | SEQRES 1   B 475   GLY PRO ASN ILE CYS THR THR ARG GLY VAL SER SER CYS (SEQRES definition)
                    //      |                                ^^^ ^^^ ^^^ ^^^
                    //      |
                    //
                    //      |
                    // 1    |                     HOH HOH HOH HOH (found residues)
                    //      |
                    // The residues as defined in the SEQRES records do not match with the found residues, see above for details.
                    errors.push(PDBError::new(
                        ErrorLevel::LooseWarning,
                        "SEQRES inconsistent residues",
                        "The residues as defined in the SEQRES records do not match with the found residues, see above for details.",
                        Context::Multiple { contexts: vec![
                            (Some("SEQRES definition".to_string()), Context::RangeHighlights{start_linenumber, lines: context_lines.to_vec(), highlights}),
                            (Some("Residues found in ATOM definitions".to_string()), Context::RangeHighlights { start_linenumber: 0, lines: found_residues, highlights: Vec::new() })]}
                    ));
                }
            }

            let total_found = chain
                .residues()
                .filter(|r| !r.atoms().any(Atom::hetero)) // TODO: It filters out all residues with at least one HETATM, this should be changed to include in the total residues defined in HETNAM.
                .count();
            if chain_sequence.len() != total_found {
                errors.push(PDBError::new(
                    ErrorLevel::LooseWarning,
                    "SEQRES residue total invalid",
                    &format!("The residue total ({}) for SEQRES chain \"{}\" does not match the total residues found in the chain ({}).", chain_sequence.len(), chain_id, total_found),
                    context.clone()
                ));
            }
        }
    }
    errors
}