use std::collections::HashMap;
use thiserror::Error;
#[derive(Debug, Error)]
pub enum DescriptorError {
#[error("unsupported or invalid molecule string: {0}")]
ParseError(String),
#[error("unknown element: {0}")]
UnknownElement(String),
}
pub fn molecular_weight(mol: &str) -> Result<f64, DescriptorError> {
let weights = atomic_weights();
if mol.starts_with("InChI=") {
if let Some(rest) = mol.splitn(2, '/').nth(1) {
let formula = rest.split('/').next().unwrap_or(rest);
return molecular_weight_from_formula(formula, &weights);
} else {
return Err(DescriptorError::ParseError(mol.to_string()));
}
}
#[derive(Debug)]
struct Atom {
element: String,
explicit_bonds: usize,
explicit_h: Option<usize>, }
let mut atoms: Vec<Atom> = Vec::new();
let chars: Vec<char> = mol.chars().collect();
let mut i = 0usize;
let mut pending_bond: Option<usize> = None;
while i < chars.len() {
let c = chars[i];
match c {
'[' => {
let mut j = i + 1;
let mut content = String::new();
while j < chars.len() && chars[j] != ']' {
content.push(chars[j]);
j += 1;
}
if j >= chars.len() {
return Err(DescriptorError::ParseError(mol.to_string()));
}
i = j + 1;
let content_chars: Vec<char> = content.chars().collect();
if content_chars.is_empty() {
return Err(DescriptorError::ParseError(mol.to_string()));
}
let mut el = content_chars[0].to_string();
if content_chars.len() >= 2 && content_chars[1].is_lowercase() {
el.push(content_chars[1]);
}
let mut explicit_h: Option<usize> = None;
if let Some(pos) = content.find('H') {
let digits: String = content[pos + 1..]
.chars()
.take_while(|ch| ch.is_ascii_digit())
.collect();
if digits.is_empty() {
explicit_h = Some(1);
} else if let Ok(n) = digits.parse() {
explicit_h = Some(n);
}
}
let mut explicit_bonds = 0usize;
if let Some(b) = pending_bond {
explicit_bonds += b;
pending_bond = None;
}
atoms.push(Atom {
element: el,
explicit_bonds,
explicit_h,
});
continue;
}
'=' => {
pending_bond = Some(2);
i += 1;
continue;
}
'#' => {
pending_bond = Some(3);
i += 1;
continue;
}
':' => {
pending_bond = Some(1);
i += 1;
continue;
}
'-' => {
pending_bond = Some(1);
i += 1;
continue;
}
'(' | ')' | '%' => {
return Err(DescriptorError::ParseError(
"SMILES with branches/rings not supported in simple parser".to_string(),
));
}
ch if ch.is_whitespace() => {
i += 1;
continue;
}
_ => {
let mut el = String::new();
if c.is_uppercase() {
el.push(c);
if i + 1 < chars.len() && chars[i + 1].is_lowercase() {
el.push(chars[i + 1]);
i += 1;
}
} else if c.is_lowercase() {
let mapped = match c {
'c' => "C",
'n' => "N",
'o' => "O",
's' => "S",
'p' => "P",
'b' => "B",
'r' => "R", _ => {
return Err(DescriptorError::ParseError(format!(
"unknown aromatic element: {}",
c
)))
}
};
el.push_str(mapped);
} else {
return Err(DescriptorError::ParseError(format!(
"unexpected character in SMILES: {}",
c
)));
}
let mut explicit_bonds = 0usize;
if let Some(b) = pending_bond {
explicit_bonds += b;
pending_bond = None;
}
atoms.push(Atom {
element: el,
explicit_bonds,
explicit_h: None,
});
i += 1;
continue;
}
}
}
if atoms.is_empty() {
return Err(DescriptorError::ParseError(mol.to_string()));
}
for idx in 0..atoms.len() - 1 {
atoms[idx].explicit_bonds += 1;
atoms[idx + 1].explicit_bonds += 1;
}
let valences: HashMap<&str, usize> = [
("H", 1),
("C", 4),
("N", 3),
("O", 2),
("S", 2), ("P", 3),
("F", 1),
("Cl", 1),
("Br", 1),
("I", 1),
("B", 3),
("Si", 4),
]
.iter()
.cloned()
.collect();
let mut total_mass = 0f64;
for atom in atoms.iter() {
let el = atom.element.as_str();
let mass = weights
.get(el)
.ok_or_else(|| DescriptorError::UnknownElement(el.to_string()))?;
let explicit_bonds = atom.explicit_bonds;
let implicit_h = if let Some(explicit_h) = atom.explicit_h {
explicit_h
} else {
if let Some(&val) = valences.get(el) {
if val >= explicit_bonds {
val - explicit_bonds
} else {
0usize
}
} else {
0usize
}
};
total_mass += *mass;
if implicit_h > 0 {
let h_mass = *weights
.get("H")
.ok_or_else(|| DescriptorError::UnknownElement("H".to_string()))?;
total_mass += (implicit_h as f64) * h_mass;
}
}
Ok(total_mass)
}
fn molecular_weight_from_formula(
formula: &str,
weights: &HashMap<&'static str, f64>,
) -> Result<f64, DescriptorError> {
let mut i = 0usize;
let chars: Vec<char> = formula.chars().collect();
let mut total = 0f64;
while i < chars.len() {
let c = chars[i];
if !c.is_uppercase() {
return Err(DescriptorError::ParseError(formula.to_string()));
}
let mut el = c.to_string();
if i + 1 < chars.len() && chars[i + 1].is_lowercase() {
el.push(chars[i + 1]);
i += 1;
}
i += 1;
let mut digits = String::new();
while i < chars.len() && chars[i].is_ascii_digit() {
digits.push(chars[i]);
i += 1;
}
let count: usize = if digits.is_empty() { 1 } else { digits.parse().unwrap_or(1) };
let mass = weights
.get(el.as_str())
.ok_or_else(|| DescriptorError::UnknownElement(el.clone()))?;
total += (*mass) * (count as f64);
}
Ok(total)
}
fn atomic_weights() -> HashMap<&'static str, f64> {
let mut m = HashMap::new();
m.insert("H", 1.00782503223);
m.insert("C", 12.0);
m.insert("N", 14.00307400443);
m.insert("O", 15.99491461956);
m.insert("S", 31.9720711744);
m.insert("P", 30.97376199842);
m.insert("F", 18.998403163);
m.insert("Cl", 34.968852682);
m.insert("Br", 78.9183376);
m.insert("I", 126.90447);
m.insert("B", 11.00930536);
m.insert("Si", 27.976926532);
m
}
#[cfg(test)]
mod tests {
use super::*;
const EPS: f64 = 1e-8;
#[test]
fn water_molecular_weight_smiles_o() {
let mw = molecular_weight("O").expect("calculate MW");
let expected = 2.0 * 1.00782503223 + 15.99491461956;
assert!(
(mw - expected).abs() < EPS,
"water MW mismatch: got {}, expected {}",
mw,
expected
);
}
#[test]
fn ethanol_molecular_weight_smiles_cco() {
let mw = molecular_weight("CCO").expect("calculate MW");
let expected = 2.0 * 12.0 + 6.0 * 1.00782503223 + 15.99491461956;
assert!(
(mw - expected).abs() < EPS,
"ethanol MW mismatch: got {}, expected {}",
mw,
expected
);
}
#[test]
fn formula_parsing_h2o() {
let mw = molecular_weight_from_formula("H2O", &atomic_weights()).unwrap();
let expected = 2.0 * 1.00782503223 + 15.99491461956;
assert!((mw - expected).abs() < EPS);
}
}