use std::{
collections::HashMap,
fs,
fs::File,
io,
io::{ErrorKind, Write},
path::Path,
str::FromStr,
};
use bio_apis::{drugbank, pdbe, pubchem, pubchem::StructureSearchNamespace};
use lin_alg::f64::Vec3;
use na_seq::Element;
use crate::{
AtomGeneric, BondGeneric, BondType, ChainGeneric, Mol2, PharmacophoreFeatureGeneric,
PharmacophoreTypeGeneric, ResidueEnd, ResidueGeneric, ResidueType,
};
#[derive(Clone, Debug)]
pub struct Sdf {
pub ident: String,
pub metadata: HashMap<String, String>,
pub atoms: Vec<AtomGeneric>,
pub bonds: Vec<BondGeneric>,
pub chains: Vec<ChainGeneric>,
pub residues: Vec<ResidueGeneric>,
pub pharmacophore_features: Vec<PharmacophoreFeatureGeneric>,
}
#[derive(Clone, Copy, Debug, Default)]
pub enum SdfFormat {
#[default]
V2000,
V3000,
}
fn split_atom_bond_count_col(col: &str, split: usize) -> io::Result<(usize, usize)> {
if col.len() < split {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!("Atom/bond count column too short to split: {col}"),
));
}
let n_atoms = col[..split]
.parse::<usize>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse number of atoms"))?;
let n_bonds = col[split..]
.parse::<usize>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse number of bonds"))?;
Ok((n_atoms, n_bonds))
}
fn split_bond_indices_line(
col: &str,
prev: Option<&str>,
_next: Option<&str>, max_atom_index: usize,
) -> io::Result<(usize, usize)> {
let mut valid_splits = Vec::new();
for i in 1..col.len() {
let s1 = &col[..i];
let s2 = &col[i..];
if let (Ok(n1), Ok(n2)) = (s1.parse::<usize>(), s2.parse::<usize>()) {
if n1 <= max_atom_index && n2 <= max_atom_index && n1 > 0 && n2 > 0 {
valid_splits.push((n1, n2));
}
}
}
match valid_splits.len() {
0 => Err(io::Error::new(
ErrorKind::InvalidData,
format!("No valid bond split found for '{col}' with max atom {max_atom_index}"),
)),
1 => Ok(valid_splits[0]),
_ => {
let p_val = prev.and_then(|s| s.parse::<usize>().ok()).unwrap_or(0);
valid_splits.sort_by_key(|(n1, _)| {
if *n1 >= p_val {
n1 - p_val
} else {
(p_val - n1) + 10000
}
});
Ok(valid_splits[0])
}
}
}
fn parse_v2000_ctab(lines: &[&str]) -> io::Result<(Vec<AtomGeneric>, Vec<BondGeneric>, usize)> {
let counts_line = lines[3];
let counts_cols: Vec<&str> = counts_line.split_whitespace().collect();
if counts_cols.len() < 2 {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Counts line doesn't have enough fields",
));
}
let mut n_atoms = counts_cols[0]
.parse::<usize>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse number of atoms"))?;
let mut n_bonds = counts_cols[1]
.parse::<usize>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse number of bonds"))?;
let first_atom_line = 4;
let mut last_atom_line = first_atom_line + n_atoms;
let mut first_bond_line = last_atom_line;
let mut last_bond_line = first_bond_line + n_bonds;
if lines.len() < last_atom_line {
let atom_bond_count = counts_cols[0];
if atom_bond_count.len() >= 4 {
(n_atoms, n_bonds) = split_atom_bond_count_col(atom_bond_count, 3)?;
last_atom_line = first_atom_line + n_atoms;
first_bond_line = last_atom_line;
last_bond_line = first_bond_line + n_bonds;
if lines.len() < last_atom_line {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!(
"Not enough lines for the declared atom block (after 3/3 split.) \
Lines: {}, expected: {} atoms",
lines.len(),
last_atom_line - 4
),
));
}
} else {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Not enough lines for the declared atom block",
));
}
}
let mut atoms = Vec::with_capacity(n_atoms);
#[allow(clippy::needless_range_loop)]
for i in first_atom_line..last_atom_line {
let line = lines[i];
let cols: Vec<&str> = line.split_whitespace().collect();
if cols.len() < 4 {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!("Atom line {i} does not have enough columns"),
));
}
let x = cols[0]
.parse::<f64>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse X coordinate"))?;
let y = cols[1]
.parse::<f64>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse Y coordinate"))?;
let z = cols[2]
.parse::<f64>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse Z coordinate"))?;
let element = cols[3];
atoms.push(AtomGeneric {
serial_number: (i - first_atom_line) as u32 + 1,
posit: Vec3 { x, y, z }, element: Element::from_letter(element)?,
hetero: true,
..Default::default()
});
}
let mut bonds = Vec::with_capacity(n_bonds);
for i in first_bond_line..last_bond_line {
let line = lines[i];
let cols: Vec<&str> = line.split_whitespace().collect();
if cols.len() < 3 {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!("Bond line {i} does not have enough columns"),
));
}
let mut atom_0_sn = cols[0]
.parse::<u32>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse bond atom 0"))?;
let mut atom_1_sn = cols[1]
.parse::<u32>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "Could not parse bond atom 1"))?;
let bond_type = if cols[2] == "0" {
let prev = if i != first_bond_line {
let line_p = lines[i - 1];
let cols_p: Vec<&str> = line_p.split_whitespace().collect();
Some(cols_p[0])
} else {
None
};
let next = if i != last_bond_line - 1 {
let line_p = lines[i + 1];
let cols_p: Vec<&str> = line_p.split_whitespace().collect();
Some(cols_p[0])
} else {
None
};
let (atom_0, atom_1) = split_bond_indices_line(cols[0], prev, next, atoms.len())?;
atom_0_sn = atom_0 as u32;
atom_1_sn = atom_1 as u32;
if atom_0_sn as usize > atoms.len() || atom_1_sn as usize > atoms.len() {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!(
"Bond indices out of bounds, during split: {}. Atom len: {}. A0: {atom_0_sn} A1: {atom_1_sn}",
cols[0],
atoms.len(),
),
));
}
BondType::from_str(cols[1])?
} else {
BondType::from_str(cols[2])?
};
bonds.push(BondGeneric {
atom_0_sn,
atom_1_sn,
bond_type,
})
}
Ok((atoms, bonds, last_bond_line))
}
fn parse_v3000_ctab(lines: &[&str]) -> io::Result<(Vec<AtomGeneric>, Vec<BondGeneric>)> {
let ctab_begin = lines
.iter()
.position(|l| l.trim() == "M V30 BEGIN CTAB")
.ok_or_else(|| {
io::Error::new(ErrorKind::InvalidData, "V3000: no M V30 BEGIN CTAB found")
})?;
let counts_idx = lines[ctab_begin..]
.iter()
.position(|l| {
let t = l.trim();
t.starts_with("M V30 COUNTS") || t.starts_with("M V30 COUNTS")
})
.ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "V3000: no COUNTS line"))?
+ ctab_begin;
let counts_cols: Vec<&str> = lines[counts_idx].split_whitespace().collect();
if counts_cols.len() < 5 {
return Err(io::Error::new(
ErrorKind::InvalidData,
"V3000 COUNTS line too short",
));
}
let n_atoms = counts_cols[3]
.parse::<usize>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: could not parse atom count"))?;
let n_bonds = counts_cols[4]
.parse::<usize>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: could not parse bond count"))?;
let atom_begin = lines[ctab_begin..]
.iter()
.position(|l| l.trim() == "M V30 BEGIN ATOM")
.ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "V3000: no BEGIN ATOM"))?
+ ctab_begin
+ 1;
let atom_end = lines[atom_begin..]
.iter()
.position(|l| l.trim() == "M V30 END ATOM")
.ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "V3000: no END ATOM"))?
+ atom_begin;
let mut atoms = Vec::with_capacity(n_atoms);
for i in atom_begin..atom_end {
let line = lines[i];
let cols: Vec<&str> = line.split_whitespace().collect();
if cols.len() < 7 {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!("V3000 atom line {i} too short (got {} cols)", cols.len()),
));
}
let serial_number = cols[2]
.parse::<u32>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: bad atom serial number"))?;
let element_str = cols[3];
let x = cols[4]
.parse::<f64>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: bad X coordinate"))?;
let y = cols[5]
.parse::<f64>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: bad Y coordinate"))?;
let z = cols[6]
.parse::<f64>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: bad Z coordinate"))?;
atoms.push(AtomGeneric {
serial_number,
posit: Vec3 { x, y, z },
element: Element::from_letter(element_str)?,
hetero: true,
..Default::default()
});
}
let bond_begin = lines[ctab_begin..]
.iter()
.position(|l| l.trim() == "M V30 BEGIN BOND")
.ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "V3000: no BEGIN BOND"))?
+ ctab_begin
+ 1;
let bond_end = lines[bond_begin..]
.iter()
.position(|l| l.trim() == "M V30 END BOND")
.ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "V3000: no END BOND"))?
+ bond_begin;
let mut bonds = Vec::with_capacity(n_bonds);
for i in bond_begin..bond_end {
let line = lines[i];
let cols: Vec<&str> = line.split_whitespace().collect();
if cols.len() < 6 {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!("V3000 bond line {i} too short (got {} cols)", cols.len()),
));
}
let bond_type = BondType::from_str(cols[3])?;
let atom_0_sn = cols[4]
.parse::<u32>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: bad bond atom 0"))?;
let atom_1_sn = cols[5]
.parse::<u32>()
.map_err(|_| io::Error::new(ErrorKind::InvalidData, "V3000: bad bond atom 1"))?;
bonds.push(BondGeneric {
atom_0_sn,
atom_1_sn,
bond_type,
});
}
Ok((atoms, bonds))
}
fn parse_metadata_section(
lines: &[&str],
atoms: &mut Vec<AtomGeneric>,
fallback_start: usize,
) -> io::Result<(HashMap<String, String>, Vec<PharmacophoreFeatureGeneric>)> {
let mut metadata: HashMap<String, String> = HashMap::new();
let mut pharmacophore_features: Vec<PharmacophoreFeatureGeneric> = Vec::new();
for (i, line) in lines.iter().enumerate() {
if line.contains("> <PUBCHEM_COMPOUND_CID>")
&& let Some(value_line) = lines.get(i + 1)
{
let value = value_line.trim();
let _ = value.parse::<u32>(); }
if line.contains("> <DATABASE_ID>")
&& let Some(value_line) = lines.get(i + 1)
{
let value = value_line.trim();
let _ = (!value.is_empty()).then(|| value.to_string()); }
}
let mut idx = if let Some(m_end) = lines.iter().position(|l| l.trim() == "M END") {
m_end + 1
} else {
fallback_start
};
while idx < lines.len() {
let line = lines[idx].trim();
if line == "$$$$" {
break;
}
if line.starts_with('>')
&& let (Some(l), Some(r)) = (line.find('<'), line.rfind('>'))
&& r > l + 1
{
let key = &line[l + 1..r];
idx += 1;
let mut rows_pharm: Vec<&str> = Vec::new();
while idx < lines.len() {
let v = lines[idx];
let v_trim = v.trim_end();
if v_trim.is_empty() || v_trim == "$$$$" || v_trim.starts_with("> <") {
break;
}
rows_pharm.push(v_trim);
idx += 1;
}
if key == "PUBCHEM_PHARMACOPHORE_FEATURES" {
match parse_pharmacophore_features(&rows_pharm) {
Ok(v) => pharmacophore_features = v,
Err(e) => {
metadata.insert(key.to_string(), rows_pharm.join("\n"));
eprintln!("Failed to parse PUBCHEM_PHARMACOPHORE_FEATURES: {e}");
}
}
} else {
metadata.insert(key.to_string(), rows_pharm.join("\n"));
}
if key == "atom.dprop.PartialCharge" {
let joined = rows_pharm.join(" ");
let charges: Vec<&str> = joined.split_whitespace().collect();
for (i, q) in charges.into_iter().enumerate() {
if i < atoms.len() {
atoms[i].partial_charge = Some(q.parse().unwrap_or(0.));
}
}
}
if key == "PUBCHEM_MMFF94_PARTIAL_CHARGES" {
if rows_pharm.is_empty() {
eprintln!("No values for PUBCHEM_MMFF94_PARTIAL_CHARGES");
} else {
let n = rows_pharm[0].trim().parse::<usize>().unwrap_or(0);
if rows_pharm.len().saturating_sub(1) != n {
eprintln!(
"Charge count mismatch: expected {}, got {}",
n,
rows_pharm.len().saturating_sub(1)
);
}
for line in rows_pharm.iter().skip(1).take(n) {
let mut it = line.split_whitespace();
let i1 = it.next().and_then(|s| s.parse::<usize>().ok());
let q = it.next().and_then(|s| s.parse::<f32>().ok());
if let (Some(i1), Some(q)) = (i1, q) {
if (1..=atoms.len()).contains(&i1) {
atoms[i1 - 1].partial_charge = Some(q); } else {
eprintln!(
"Atom index {} out of range (n_atoms={})",
i1,
atoms.len()
);
}
}
}
}
}
continue;
}
idx += 1;
}
Ok((metadata, pharmacophore_features))
}
impl Sdf {
pub fn new(text: &str) -> io::Result<Self> {
let lines: Vec<&str> = text.lines().collect();
if lines.len() < 4 {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Not enough lines to parse an SDF header",
));
}
let ident = lines[0].trim().to_string();
let format = if lines[3].contains("V3000") {
SdfFormat::V3000
} else {
SdfFormat::V2000
};
let (mut atoms, bonds, fallback_start) = match format {
SdfFormat::V2000 => parse_v2000_ctab(&lines)?,
SdfFormat::V3000 => {
let (a, b) = parse_v3000_ctab(&lines)?;
(a, b, lines.len())
}
};
let (metadata, pharmacophore_features) =
parse_metadata_section(&lines, &mut atoms, fallback_start)?;
let atom_sns: Vec<_> = atoms.iter().map(|a| a.serial_number).collect();
let mut chains = Vec::new();
let mut residues = Vec::new();
residues.push(ResidueGeneric {
serial_number: 0,
res_type: ResidueType::Other("Unknown".to_string()),
atom_sns: atom_sns.clone(),
end: ResidueEnd::Hetero,
});
chains.push(ChainGeneric {
id: "A".to_string(),
residue_sns: vec![0],
atom_sns,
});
Ok(Self {
ident,
metadata,
atoms,
chains,
residues,
bonds,
pharmacophore_features,
})
}
pub fn save(&self, path: &Path, format: SdfFormat) -> io::Result<()> {
let mut file = File::create(path)?;
writeln!(file, "{}", self.ident)?;
writeln!(file)?;
writeln!(file)?;
let natoms = self.atoms.len();
let nbonds = self.bonds.len();
match format {
SdfFormat::V2000 => {
let w = natoms.max(nbonds).to_string().len().max(3) + 1;
writeln!(
file,
"{:>w$}{:>w$} 0 0 0 0 0999 V2000",
natoms,
nbonds,
w = w,
)?;
for atom in &self.atoms {
let x = atom.posit.x;
let y = atom.posit.y;
let z = atom.posit.z;
let symbol = atom.element.to_letter();
writeln!(
file,
"{:>10.4}{:>10.4}{:>10.4} {:<2} 0 0 0 0 0 0 0 0 0 0",
x, y, z, symbol
)?;
}
let max_sn = self
.bonds
.iter()
.flat_map(|b| [b.atom_0_sn, b.atom_1_sn])
.max()
.unwrap_or(0);
let w = max_sn.to_string().len().max(3) + 1;
for bond in &self.bonds {
writeln!(
file,
"{:>w$}{:>w$}{:>3} 0 0 0 0",
bond.atom_0_sn,
bond.atom_1_sn,
bond.bond_type.to_str_sdf(),
w = w,
)?;
}
writeln!(file, "M END")?;
}
SdfFormat::V3000 => {
writeln!(file, " 0 0 0 0 0 0 V3000")?;
writeln!(file, "M V30 BEGIN CTAB")?;
writeln!(file, "M V30 COUNTS {} {} 0 0 0", natoms, nbonds)?;
writeln!(file, "M V30 BEGIN ATOM")?;
for (i, atom) in self.atoms.iter().enumerate() {
writeln!(
file,
"M V30 {} {} {:.6} {:.6} {:.6} 0",
i + 1,
atom.element.to_letter(),
atom.posit.x,
atom.posit.y,
atom.posit.z,
)?;
}
writeln!(file, "M V30 END ATOM")?;
writeln!(file, "M V30 BEGIN BOND")?;
for (i, bond) in self.bonds.iter().enumerate() {
writeln!(
file,
"M V30 {} {} {} {}",
i + 1,
bond.bond_type.to_str_sdf(),
bond.atom_0_sn,
bond.atom_1_sn,
)?;
}
writeln!(file, "M V30 END BOND")?;
writeln!(file, "M V30 END CTAB")?;
writeln!(file, "M END")?;
}
}
for m in &self.metadata {
write_metadata(m.0, m.1, &mut file)?;
}
let mut partial_charges = Vec::new();
let mut all_partial_charges_present = true;
for atom in &self.atoms {
match atom.partial_charge {
Some(q) => partial_charges.push(q),
None => {
all_partial_charges_present = false;
break;
}
}
}
if all_partial_charges_present {
let charges_formated: Vec<_> =
partial_charges.iter().map(|q| format!("{q:.8}")).collect();
let charge_str = charges_formated.join(" ");
write_metadata("atom.dprop.PartialCharge", &charge_str, &mut file)?;
}
if !self.pharmacophore_features.is_empty() {
let v = format_pharmacophore_features(&self.pharmacophore_features);
write_metadata("PUBCHEM_PHARMACOPHORE_FEATURES", &v, &mut file)?;
}
writeln!(file, "$$$$")?;
Ok(())
}
pub fn load(path: &Path) -> io::Result<Self> {
let data_str = fs::read_to_string(path)?;
Self::new(&data_str)
}
pub fn load_drugbank(ident: &str) -> io::Result<Self> {
let data_str = drugbank::load_sdf(ident)
.map_err(|e| io::Error::other(format!("Error loading: {e:?}")))?;
Self::new(&data_str)
}
pub fn load_pubchem(id_type: StructureSearchNamespace, id: &str) -> io::Result<Self> {
let data_str = pubchem::load_sdf(id_type, id)
.map_err(|e| io::Error::other(format!("Error loading: {e:?}")))?;
Self::new(&data_str)
}
pub fn load_pdbe(ident: &str) -> io::Result<Self> {
let data_str =
pdbe::load_sdf(ident).map_err(|e| io::Error::other(format!("Error loading: {e:?}")))?;
Self::new(&data_str)
}
}
impl From<Mol2> for Sdf {
fn from(m: Mol2) -> Self {
Self {
ident: m.ident.clone(),
metadata: m.metadata.clone(),
atoms: m.atoms.clone(),
bonds: m.bonds.clone(),
chains: Vec::new(),
residues: Vec::new(),
pharmacophore_features: Vec::new(),
}
}
}
fn write_metadata(key: &str, val: &str, file: &mut File) -> io::Result<()> {
writeln!(file, "> <{key}>")?;
writeln!(file, "{val}")?;
writeln!(file)?;
Ok(())
}
pub(crate) fn parse_pharmacophore_features(
rows: &[&str],
) -> io::Result<Vec<PharmacophoreFeatureGeneric>> {
if rows.is_empty() {
return Ok(Vec::new());
}
let n = rows[0].trim().parse::<usize>().map_err(|_| {
io::Error::new(
ErrorKind::InvalidData,
"Bad PUBCHEM_PHARMACOPHORE_FEATURES count",
)
})?;
let mut out = Vec::with_capacity(n);
for line in rows.iter().skip(1).take(n) {
let cols: Vec<&str> = line.split_whitespace().collect();
if cols.len() < 2 {
return Err(io::Error::new(
ErrorKind::InvalidData,
"Bad PUBCHEM_PHARMACOPHORE_FEATURES line",
));
}
let k = cols[0].parse::<usize>().map_err(|_| {
io::Error::new(
ErrorKind::InvalidData,
"Bad PUBCHEM_PHARMACOPHORE_FEATURES atom-count",
)
})?;
if cols.len() < 1 + k + 1 {
return Err(io::Error::new(
ErrorKind::InvalidData,
"PUBCHEM_PHARMACOPHORE_FEATURES line missing atoms/type",
));
}
let mut atom_sns = Vec::with_capacity(k);
for t in cols.iter().skip(1).take(k) {
let sn = t.parse::<u32>().map_err(|_| {
io::Error::new(
ErrorKind::InvalidData,
"Bad PUBCHEM_PHARMACOPHORE_FEATURES atom index",
)
})?;
atom_sns.push(sn);
}
let type_str = cols[1 + k..].join(" ");
let type_ = PharmacophoreTypeGeneric::from_pubchem_str(&type_str).ok_or_else(|| {
io::Error::new(
ErrorKind::InvalidData,
format!("Unknown pharmacophore type: {type_str}"),
)
})?;
out.push(PharmacophoreFeatureGeneric { atom_sns, type_ });
}
Ok(out)
}
pub(crate) fn format_pharmacophore_features(features: &[PharmacophoreFeatureGeneric]) -> String {
let mut s = String::new();
s.push_str(&format!("{}\n", features.len()));
for f in features {
s.push_str(&format!("{}", f.atom_sns.len()));
for sn in &f.atom_sns {
s.push_str(&format!(" {sn}"));
}
s.push(' ');
s.push_str(&f.type_.clone().to_pubchem_str());
s.push('\n');
}
s
}