kira-mmcif 0.2.0

Low-level, streaming mmCIF/BinaryCIF parser focused on protein coordinates.
Documentation
use smallvec::SmallVec;

/// Top-level container for parsed coordinates: an ordered list of [`Model`]s.
///
/// `Default` yields a structure with no models.
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Structure {
    /// Models held by this structure, in the order they were stored.
    pub models: Vec<Model>,
}

/// One model of a [`Structure`]: an ordered list of [`Chain`]s.
///
/// `Default` yields a model with no chains.
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Model {
    /// Chains held by this model, in the order they were stored.
    pub chains: Vec<Chain>,
}

/// A single chain: its identifier plus the residues that belong to it.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Chain {
    /// Identifier of this chain (see [`ChainId`]).
    pub id: ChainId,
    /// Residues held by this chain, in the order they were stored.
    pub residues: Vec<Residue>,
}

/// A single residue: its component name, optional sequence number and atoms.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Residue {
    /// Residue type; anything outside the known set is [`ResidueName::UNK`].
    pub name: ResidueName,
    /// `None` corresponds to mmCIF `label_seq_id == "."`.
    pub seq_id: Option<i32>,
    /// Atoms of this residue. Up to four are stored inline in the `SmallVec`;
    /// that matches the four [`AtomName`] variants, so presumably the inline
    /// storage is normally sufficient (confirm against the parser).
    pub atoms: SmallVec<[Atom; 4]>,
}

/// Chain identifier.
///
/// Stores up to [`ChainId::MAX_LEN`] bytes of the original `label_asym_id`,
/// preserving the original case (so `A` and `a` are distinct chains). The type
/// stays `Copy + Hash + Eq` and never allocates on the heap.
///
/// Values built through [`ChainId::from_bytes`] always hold between 1 and
/// [`ChainId::MAX_LEN`] bytes drawn from `[A-Za-z0-9_]`, zero-padded on the
/// right. With the `serde` feature enabled, the derived `Deserialize` fills the
/// private fields directly and bypasses that validation, so none of the
/// accessors rely on it for memory safety or panic freedom.
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChainId {
    // Label bytes, left-aligned; unused trailing slots are zero so that the
    // derived comparisons and hash treat equal labels as equal.
    bytes: [u8; 8],
    // Number of leading bytes in `bytes` that belong to the label.
    len: u8,
}

impl ChainId {
    /// Maximum number of bytes a chain identifier can hold.
    pub const MAX_LEN: usize = 8;

    /// Builds a [`ChainId`] from an mmCIF `label_asym_id`. Returns `None` if the
    /// label is empty, longer than [`ChainId::MAX_LEN`], or contains characters
    /// outside `[A-Za-z0-9_]`.
    pub fn from_label_asym_id(label: &str) -> Option<Self> {
        Self::from_bytes(label.as_bytes())
    }

    /// Byte-slice variant of [`ChainId::from_label_asym_id`] with the same
    /// acceptance rules: 1 to [`ChainId::MAX_LEN`] bytes, each one ASCII
    /// alphanumeric or `_`. Returns `None` otherwise.
    pub fn from_bytes(label: &[u8]) -> Option<Self> {
        if label.is_empty() || label.len() > Self::MAX_LEN {
            return None;
        }
        if !label.iter().all(|&c| c.is_ascii_alphanumeric() || c == b'_') {
            return None;
        }
        let mut bytes = [0u8; Self::MAX_LEN];
        bytes[..label.len()].copy_from_slice(label);
        Some(Self {
            bytes,
            // Cannot truncate: the length was bounded by `MAX_LEN` above.
            len: label.len() as u8,
        })
    }

    /// Returns the identifier as a string slice.
    ///
    /// The conversion is checked rather than `unsafe`: a value that did not
    /// come from [`ChainId::from_bytes`] (for example one produced by the
    /// derived `Deserialize`) may hold arbitrary bytes. In that case only the
    /// longest valid UTF-8 prefix is returned instead of invoking undefined
    /// behaviour.
    pub fn as_str(&self) -> &str {
        let raw = self.as_bytes();
        match core::str::from_utf8(raw) {
            Ok(text) => text,
            // `valid_up_to` marks the end of the longest valid prefix, so the
            // second conversion cannot fail; `unwrap_or` merely avoids a
            // panic path.
            Err(err) => core::str::from_utf8(&raw[..err.valid_up_to()]).unwrap_or(""),
        }
    }

    /// Returns the identifier bytes without the zero padding.
    ///
    /// The stored length is clamped to [`ChainId::MAX_LEN`] so that a corrupt
    /// `len` field can never cause an out-of-bounds slice panic.
    pub fn as_bytes(&self) -> &[u8] {
        let end = usize::from(self.len).min(Self::MAX_LEN);
        &self.bytes[..end]
    }

    /// Number of bytes in the identifier; always equals `as_bytes().len()`.
    pub fn len(&self) -> usize {
        self.as_bytes().len()
    }

    /// Returns `true` when the identifier holds no bytes. Values produced by
    /// [`ChainId::from_bytes`] are never empty.
    pub fn is_empty(&self) -> bool {
        self.as_bytes().is_empty()
    }
}

/// Diagnostic formatting: the label wrapped as `ChainId("…")`, with the label
/// itself rendered through `str`'s `Debug` (quoted and escaped).
impl core::fmt::Debug for ChainId {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let label: &str = self.as_str();
        f.write_fmt(format_args!("ChainId({:?})", label))
    }
}

/// User-facing formatting: the bare label, written verbatim. Width, fill and
/// alignment flags on the formatter are not applied.
impl core::fmt::Display for ChainId {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let label: &str = self.as_str();
        f.write_str(label)
    }
}

/// Atom names recognised by this crate.
///
/// Only `N`, `CA`, `C` and `O` are representable; every other
/// `label_atom_id` is rejected by the constructors. Each variant carries an
/// explicit `u8` discriminant, exposed through [`AtomName::as_u8`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum AtomName {
    N = 0,
    CA = 1,
    C = 2,
    O = 3,
}

impl AtomName {
    /// Parses an mmCIF `label_atom_id` given as text.
    ///
    /// Returns `None` unless the label is exactly `N`, `CA`, `C` or `O`
    /// (case-sensitive, no surrounding whitespace).
    pub fn from_label_atom_id(label: &str) -> Option<Self> {
        let raw = label.as_bytes();
        Self::from_label_atom_id_bytes(raw)
    }

    /// Byte-slice variant of [`AtomName::from_label_atom_id`] with the same
    /// exact-match rules.
    #[inline]
    pub fn from_label_atom_id_bytes(label: &[u8]) -> Option<Self> {
        let atom = match label {
            [b'N'] => Self::N,
            [b'C', b'A'] => Self::CA,
            [b'C'] => Self::C,
            [b'O'] => Self::O,
            _ => return None,
        };
        Some(atom)
    }

    /// Returns the variant's numeric discriminant (`N` = 0 … `O` = 3).
    #[inline]
    pub fn as_u8(self) -> u8 {
        self as u8
    }
}

/// Residue (component) names recognised by this crate.
///
/// Twenty three-letter codes have dedicated variants numbered 0 through 19;
/// any other `label_comp_id` collapses to [`ResidueName::UNK`], whose
/// discriminant is 255. Discriminants are exposed through
/// [`ResidueName::as_u8`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ResidueName {
    ALA = 0,
    ARG = 1,
    ASN = 2,
    ASP = 3,
    CYS = 4,
    GLN = 5,
    GLU = 6,
    GLY = 7,
    HIS = 8,
    ILE = 9,
    LEU = 10,
    LYS = 11,
    MET = 12,
    PHE = 13,
    PRO = 14,
    SER = 15,
    THR = 16,
    TRP = 17,
    TYR = 18,
    VAL = 19,
    UNK = 255,
}

impl ResidueName {
    /// Parses an mmCIF `label_comp_id` given as text.
    ///
    /// Matching is exact and case-sensitive; unrecognised labels (including
    /// the empty string) yield [`ResidueName::UNK`] rather than an error.
    pub fn from_label_comp_id(label: &str) -> Self {
        let raw = label.as_bytes();
        Self::from_label_comp_id_bytes(raw)
    }

    /// Byte-slice variant of [`ResidueName::from_label_comp_id`] with the same
    /// exact-match rules and the same `UNK` fallback.
    #[inline]
    pub fn from_label_comp_id_bytes(label: &[u8]) -> Self {
        // Every known code is exactly three bytes long, so anything else is
        // unknown without looking at its content.
        let code = match *label {
            [first, second, third] => [first, second, third],
            _ => return Self::UNK,
        };
        match &code {
            b"ALA" => Self::ALA,
            b"ARG" => Self::ARG,
            b"ASN" => Self::ASN,
            b"ASP" => Self::ASP,
            b"CYS" => Self::CYS,
            b"GLN" => Self::GLN,
            b"GLU" => Self::GLU,
            b"GLY" => Self::GLY,
            b"HIS" => Self::HIS,
            b"ILE" => Self::ILE,
            b"LEU" => Self::LEU,
            b"LYS" => Self::LYS,
            b"MET" => Self::MET,
            b"PHE" => Self::PHE,
            b"PRO" => Self::PRO,
            b"SER" => Self::SER,
            b"THR" => Self::THR,
            b"TRP" => Self::TRP,
            b"TYR" => Self::TYR,
            b"VAL" => Self::VAL,
            _ => Self::UNK,
        }
    }

    /// Returns the variant's numeric discriminant (0–19, or 255 for `UNK`).
    #[inline]
    pub fn as_u8(self) -> u8 {
        self as u8
    }
}

/// A single atom: its name plus a three-component `f32` position.
///
/// `PartialEq` is derived, so two atoms compare equal only when the name and
/// all three coordinates are equal; a `NaN` coordinate never compares equal.
#[derive(Debug, Clone, Copy, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Atom {
    /// Which of the recognised atom names this atom carries.
    pub name: AtomName,
    // NOTE(review): x/y/z are presumably the mmCIF `Cartn_x`/`Cartn_y`/`Cartn_z`
    // Cartesian coordinates — confirm units and source column against the parser.
    /// X coordinate.
    pub x: f32,
    /// Y coordinate.
    pub y: f32,
    /// Z coordinate.
    pub z: f32,
}