use std::fmt;
use serde::{Deserialize, Serialize};
/// A SMILES string wrapper.
///
/// Values built through [`Molecule::parse`] hold trimmed, non-empty text that
/// passed that function's character and bracket-balance checks.
///
/// The type is serialized transparently, i.e. as the bare inner string.
///
/// NOTE(review): because `Deserialize` is derived with `#[serde(transparent)]`,
/// a deserialized value is built straight from the incoming string and does not
/// appear to pass through [`Molecule::parse`] — confirm that untrusted input is
/// validated before it is deserialized into this type.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Molecule {
    /// The SMILES text exactly as stored (no further normalization).
    smiles: String,
}
/// Reasons [`Molecule::parse`] rejects an input string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MoleculeError {
    /// The input was empty once surrounding whitespace was trimmed.
    Empty,
    /// A `)` appeared without a matching `(`, or a `(` was never closed.
    UnbalancedParens,
    /// A `]` appeared without a matching `[`, or a `[` was never closed.
    UnbalancedBrackets,
    /// The input contained a character outside the accepted alphabet.
    InvalidChar(char),
}

impl fmt::Display for MoleculeError {
    /// Writes a short, human-readable description of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Empty => f.write_str("empty SMILES"),
            Self::UnbalancedParens => f.write_str("unbalanced parentheses"),
            Self::UnbalancedBrackets => f.write_str("unbalanced brackets"),
            Self::InvalidChar(c) => write!(f, "invalid SMILES character: {c:?}"),
        }
    }
}

// Implementing the standard error trait lets callers box this error or
// propagate it with `?` into `Box<dyn std::error::Error>`-style results.
impl std::error::Error for MoleculeError {}
impl Molecule {
    /// Validates `raw` and wraps it as a [`Molecule`].
    ///
    /// Surrounding whitespace is trimmed first. The remaining text is scanned
    /// once, left to right. The check is lexical only: it verifies the
    /// character set and that `()` and `[]` are each balanced (tracked
    /// independently of one another); it does not interpret the SMILES.
    ///
    /// # Errors
    ///
    /// * [`MoleculeError::Empty`] if nothing is left after trimming.
    /// * [`MoleculeError::UnbalancedParens`] / [`MoleculeError::UnbalancedBrackets`]
    ///   if a closer has no opener, or an opener is still pending at the end
    ///   (parentheses are reported before brackets in the latter case).
    /// * [`MoleculeError::InvalidChar`] for the first character that is not an
    ///   ASCII letter, ASCII digit, or one of `( ) [ ] - = # : ~ . / \ @ + %`.
    pub fn parse(raw: &str) -> Result<Self, MoleculeError> {
        let text = raw.trim();
        if text.is_empty() {
            return Err(MoleculeError::Empty);
        }

        // Signed counters: a closer without an opener drives the depth negative.
        let (mut open_parens, mut open_brackets) = (0i32, 0i32);
        for symbol in text.chars() {
            match symbol {
                '(' => open_parens += 1,
                '[' => open_brackets += 1,
                ')' => {
                    open_parens -= 1;
                    if open_parens < 0 {
                        return Err(MoleculeError::UnbalancedParens);
                    }
                }
                ']' => {
                    open_brackets -= 1;
                    if open_brackets < 0 {
                        return Err(MoleculeError::UnbalancedBrackets);
                    }
                }
                atom_or_digit if atom_or_digit.is_ascii_alphanumeric() => {}
                '-' | '=' | '#' | ':' | '~' | '.' | '/' | '\\' | '@' | '+' | '%' => {}
                other => return Err(MoleculeError::InvalidChar(other)),
            }
        }

        if open_parens != 0 {
            Err(MoleculeError::UnbalancedParens)
        } else if open_brackets != 0 {
            Err(MoleculeError::UnbalancedBrackets)
        } else {
            Ok(Self {
                smiles: text.to_owned(),
            })
        }
    }

    /// Borrows the stored SMILES text.
    pub fn as_str(&self) -> &str {
        self.smiles.as_str()
    }

    /// Length of the stored SMILES text in bytes.
    pub fn len(&self) -> usize {
        self.as_str().len()
    }

    /// Returns `true` when the stored SMILES text has zero length.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns a copy of the SMILES text with ASCII letters upper-cased.
    pub fn upper(&self) -> String {
        let mut shouted = self.smiles.clone();
        shouted.make_ascii_uppercase();
        shouted
    }
}
/// Displays a molecule as its stored SMILES text, written verbatim.
impl fmt::Display for Molecule {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.smiles)
    }
}
/// Coarse structural flags reported by [`detect_functional_groups`].
///
/// Variants are serialized with snake_case names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FunctionalGroup {
    /// Phosphorus carrying an oxygen substituent.
    Phosphonate,
    /// Phosphorus with a substituent in a molecule that also contains fluorine.
    PhosphorusFluoride,
    /// Nitro group.
    Nitro,
    /// Two adjacent oxygens.
    Peroxide,
    /// Azide group.
    Azide,
    /// Amine nitrogen attached to a benzene ring.
    AromaticAmine,
    /// N-nitroso group.
    NNitroso,
    /// Trifluoromethyl-style carbon (`C(F)(F)F`).
    PerfluoroCarbon,
    /// 2-chloroethyl thioether fragment.
    Mustard,
    /// Isocyanate group.
    Isocyanate,
    /// Three-membered cyclic ether.
    Epoxide,
    /// Aldehyde.
    ///
    /// NOTE(review): nothing in this file emits this variant.
    Aldehyde,
    /// Explicit `[SH]` bracket atom.
    Thiol,
    /// Bracket atom of Hg, Pb, Cd, As, Cr, U or Tl.
    HeavyMetal,
    /// At least one `@` chirality marker is present.
    StereoCenter,
}
/// Element symbols whose bracket atoms raise [`FunctionalGroup::HeavyMetal`].
const HEAVY_METAL_SYMBOLS: [&str; 7] = ["Hg", "Pb", "Cd", "As", "Cr", "U", "Tl"];

/// Reports whether the text between `[` and `]` of a bracket atom names one of
/// [`HEAVY_METAL_SYMBOLS`], regardless of isotope prefix, charge or H-count.
fn is_heavy_metal_bracket_atom(atom: &str) -> bool {
    // An isotope number may precede the element symbol, e.g. `203Hg`.
    let from_symbol = atom.trim_start_matches(|c: char| c.is_ascii_digit());
    HEAVY_METAL_SYMBOLS.iter().any(|symbol| {
        matches!(
            from_symbol.strip_prefix(*symbol),
            // A lowercase letter right after the symbol would mean a
            // different, longer element symbol.
            Some(rest) if !rest.starts_with(|c: char| c.is_ascii_lowercase())
        )
    })
}

/// Flags functional groups in `mol` using substring heuristics on its SMILES.
///
/// This is text matching, not graph matching: only the specific SMILES
/// spellings listed below are recognised. Groups are returned in a fixed
/// order (the order of the checks below), each at most once.
///
/// [`FunctionalGroup::Aldehyde`] is never produced by this function.
///
/// Heavy metals are detected by inspecting every bracket atom, so charged or
/// isotopically labelled atoms such as `[Hg+2]` or `[203Hg]` are flagged as
/// well as the bare `[Hg]` form.
pub fn detect_functional_groups(mol: &Molecule) -> Vec<FunctionalGroup> {
    let s = mol.as_str();
    // Upper-cased copy used by the phosphorus and peroxide checks.
    let upper = mol.upper();
    let mut groups = Vec::new();

    if upper.contains("P(=O)") || upper.contains("P(O)") {
        groups.push(FunctionalGroup::Phosphonate);
    }
    // "P(=O)" always contains "P(", so a single "P(" test covers both spellings.
    if upper.contains("P(") && upper.contains('F') {
        groups.push(FunctionalGroup::PhosphorusFluoride);
    }
    if s.contains("[N+](=O)[O-]") || s.contains("N(=O)=O") || s.contains("[NO2]") {
        groups.push(FunctionalGroup::Nitro);
    }
    if upper.contains("OO") {
        groups.push(FunctionalGroup::Peroxide);
    }
    if s.contains("N=N=N") || s.contains("[N-]=[N+]=[N-]") {
        groups.push(FunctionalGroup::Azide);
    }
    if s.contains("Nc1ccccc1") || s.contains("c1ccc(N)cc1") || s.contains("c1cc(N)ccc1") {
        groups.push(FunctionalGroup::AromaticAmine);
    }
    if s.contains("NN=O") || s.contains("N(=O)N") {
        groups.push(FunctionalGroup::NNitroso);
    }
    if s.contains("C(F)(F)F") {
        groups.push(FunctionalGroup::PerfluoroCarbon);
    }
    if s.contains("ClCCS") || s.contains("SCCCl") {
        groups.push(FunctionalGroup::Mustard);
    }
    if s.contains("N=C=O") {
        groups.push(FunctionalGroup::Isocyanate);
    }
    if s.contains("C1OC1") || s.contains("C1(O1)") {
        groups.push(FunctionalGroup::Epoxide);
    }
    if s.contains("[SH]") {
        groups.push(FunctionalGroup::Thiol);
    }

    // Walk the bracket atoms: each chunk after a '[' runs up to its ']'.
    let has_heavy_metal = s
        .split('[')
        .skip(1)
        .map(|chunk| chunk.split(']').next().unwrap_or(chunk))
        .any(is_heavy_metal_bracket_atom);
    if has_heavy_metal {
        groups.push(FunctionalGroup::HeavyMetal);
    }

    if s.contains('@') {
        groups.push(FunctionalGroup::StereoCenter);
    }
    groups
}
/// Version tag for the rule library.
///
/// NOTE(review): presumably this covers the rule tables in this file and should
/// be bumped when their contents change — confirm with the consumers of this
/// constant.
pub const SMARTS_RULE_LIBRARY_VERSION: u32 = 1;
/// One entry of a static rule table.
///
/// Despite the name, [`match_rules`] treats `patterns` as plain substrings of
/// the SMILES text rather than evaluating them as SMARTS.
#[derive(Debug, Clone)]
pub struct SmartsRule {
    /// Stable identifier of the rule (e.g. `"CWC-SA-01"`).
    pub id: &'static str,
    /// Human-readable description of what the rule flags.
    pub label: &'static str,
    /// Alternative spellings; the rule fires when any one of them is found.
    pub patterns: &'static [&'static str],
    /// Severity copied into every match produced by this rule.
    pub severity: RuleSeverity,
    /// Free-form hazard category copied into every match.
    pub hazard_class: &'static str,
}
/// Severity level attached to a [`SmartsRule`] and its matches.
///
/// Serialized in lowercase. How callers react to each level is not defined in
/// this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RuleSeverity {
    /// The stricter of the two levels.
    Fail,
    /// The softer of the two levels.
    Advisory,
}
/// A single rule hit reported by [`match_rules`].
#[derive(Debug, Clone)]
pub struct RuleMatch {
    /// Identifier of the rule that fired.
    pub rule_id: &'static str,
    /// Label of the rule that fired.
    pub label: &'static str,
    /// The specific pattern of the rule that was found in the SMILES text.
    pub matched_pattern: &'static str,
    /// Severity inherited from the rule.
    pub severity: RuleSeverity,
    /// Hazard category inherited from the rule.
    pub hazard_class: &'static str,
}
/// Rule table for chemical-weapons-related structural alerts, intended for use
/// with [`match_rules`].
///
/// Every pattern is matched as a literal substring of the SMILES text, so each
/// rule lists the textual spellings it recognises. Patterns must not contain
/// whitespace: [`Molecule::parse`] rejects whitespace inside a SMILES string,
/// so such a pattern could never match a parsed molecule.
pub static CWC_RULES: &[SmartsRule] = &[
    SmartsRule {
        id: "CWC-SA-01",
        label: "Alkylphosphonofluoridate (G-series core)",
        patterns: &["P(=O)(F)", "P(F)(=O)"],
        severity: RuleSeverity::Fail,
        hazard_class: "cwc-schedule-1",
    },
    SmartsRule {
        id: "CWC-SA-02",
        label: "Phosphoramidocyanidate (tabun-class core)",
        patterns: &["P(=O)(N)(C#N)", "P(N)(=O)(C#N)"],
        severity: RuleSeverity::Fail,
        hazard_class: "cwc-schedule-1",
    },
    SmartsRule {
        id: "CWC-SA-03",
        label: "Sulfur mustard (bis-chloroethyl thioether)",
        patterns: &["ClCCSCCCl"],
        severity: RuleSeverity::Fail,
        hazard_class: "cwc-schedule-1",
    },
    SmartsRule {
        id: "CWC-SA-04",
        label: "Nitrogen mustard (bis-chloroethyl amine)",
        patterns: &["ClCCN(CCCl)"],
        severity: RuleSeverity::Fail,
        hazard_class: "cwc-schedule-1",
    },
    SmartsRule {
        id: "CWC-SA-05",
        label: "Alkylphosphonyl difluoride (DF precursor)",
        patterns: &["P(=O)(F)F", "P(F)(F)=O"],
        severity: RuleSeverity::Advisory,
        hazard_class: "cwc-schedule-2",
    },
    SmartsRule {
        id: "CWC-SA-06",
        label: "Dialkylaminoethanol (precursor amine)",
        patterns: &["N(C)(C)CCO", "OCCN(C)C"],
        severity: RuleSeverity::Advisory,
        hazard_class: "cwc-schedule-2",
    },
    SmartsRule {
        id: "CWC-SA-07",
        label: "Thiodiglycol (mustard precursor)",
        // The former second entry "OCCSCC O" held a stray space and was a
        // dead duplicate of this pattern.
        patterns: &["OCCSCCO"],
        severity: RuleSeverity::Advisory,
        hazard_class: "cwc-schedule-2",
    },
    SmartsRule {
        id: "CWC-SA-08",
        label: "Alkylphosphonate with leaving group",
        patterns: &["P(=O)(OC)(F)", "P(=O)(F)(OC)"],
        severity: RuleSeverity::Advisory,
        hazard_class: "cwc-structural-alert",
    },
];
/// Builds an advisory-severity rule in the `"explosive"` hazard class.
const fn explosive_advisory(
    id: &'static str,
    label: &'static str,
    patterns: &'static [&'static str],
) -> SmartsRule {
    SmartsRule {
        id,
        label,
        patterns,
        severity: RuleSeverity::Advisory,
        hazard_class: "explosive",
    }
}

/// Rule table for energetic-material structural alerts, intended for use with
/// [`match_rules`].
///
/// NOTE(review): the label of `EXP-SA-01` speaks of three or more nitro groups,
/// but [`match_rules`] only tests whether a pattern occurs at all, so a single
/// nitro group is enough to fire it — confirm the intended threshold.
pub static EXPLOSIVE_RULES: &[SmartsRule] = &[
    explosive_advisory(
        "EXP-SA-01",
        "Polynitro compound (≥3 nitro groups)",
        &["[N+](=O)[O-]"],
    ),
    explosive_advisory("EXP-SA-02", "Organic peroxide", &["OO"]),
    explosive_advisory(
        "EXP-SA-03",
        "Organic azide",
        &["N=N=N", "[N-]=[N+]=[N-]"],
    ),
];
/// Checks `mol` against every rule in `rules`.
///
/// A rule fires when any of its patterns occurs as a literal substring of the
/// molecule's SMILES text. Each rule contributes at most one [`RuleMatch`],
/// recording the first of its patterns (in declaration order) that was found.
/// Matches are returned in the order of `rules`.
pub fn match_rules(mol: &Molecule, rules: &[SmartsRule]) -> Vec<RuleMatch> {
    let smiles = mol.as_str();
    rules
        .iter()
        .filter_map(|rule| {
            let first_hit = rule
                .patterns
                .iter()
                .copied()
                .find(|pattern| smiles.contains(*pattern))?;
            Some(RuleMatch {
                rule_id: rule.id,
                label: rule.label,
                matched_pattern: first_hit,
                severity: rule.severity,
                hazard_class: rule.hazard_class,
            })
        })
        .collect()
}
/// Result of [`complexity_score`]: the raw text-derived measurements plus the
/// combined score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComplexityScore {
    /// Ring estimate derived from ring-closure markers in the SMILES text.
    pub ring_count: usize,
    /// Stereo estimate derived from `@` chirality markers in the SMILES text.
    pub stereo_count: usize,
    /// Heteroatom tally divided by the number of ASCII letters in the text.
    pub heteroatom_ratio: f64,
    /// Length of the SMILES text in bytes.
    pub smiles_length: usize,
    /// Weighted combination of the components, clamped to `0.0..=1.0`.
    pub score: f64,
}
/// Counts ring-bond markers that appear outside bracket atoms.
///
/// A single digit is one marker. A `%` introduces a two-digit ring number, so
/// `%` together with up to two digits that follow it counts as one marker.
fn count_ring_bond_marks(smiles: &str) -> usize {
    let mut marks = 0usize;
    let mut in_bracket = false;
    // Digits still belonging to the most recent `%nn` marker.
    let mut percent_digits_left = 0u8;
    for ch in smiles.chars() {
        // Any character other than a digit ends a pending `%nn` marker, so the
        // pending count is cleared up front and restored only by the digit arm.
        let digits_owed = std::mem::take(&mut percent_digits_left);
        match ch {
            '[' => in_bracket = true,
            ']' => in_bracket = false,
            '%' if !in_bracket => {
                marks += 1;
                percent_digits_left = 2;
            }
            '0'..='9' if !in_bracket => {
                if digits_owed > 0 {
                    percent_digits_left = digits_owed - 1;
                } else {
                    marks += 1;
                }
            }
            _ => {}
        }
    }
    marks
}

/// Counts chirality markers, treating a run of `@` (e.g. `@@`) as one centre.
fn count_stereo_centers(smiles: &str) -> usize {
    let mut centers = 0usize;
    let mut previous_was_at = false;
    for ch in smiles.chars() {
        let is_at = ch == '@';
        if is_at && !previous_was_at {
            centers += 1;
        }
        previous_was_at = is_at;
    }
    centers
}

/// Computes a rough, text-based complexity score for `mol`.
///
/// Components (each capped at 1.0 before weighting):
///
/// * rings — ring-bond markers outside bracket atoms, halved because every
///   ring closure appears twice; saturates at 6 rings (weight 0.30);
/// * stereo — chirality markers, where `@` and `@@` each count as one centre;
///   saturates at 4 (weight 0.20);
/// * length — SMILES length in bytes; saturates at 300 (weight 0.25);
/// * heteroatoms — occurrences of `N O S P F I n o s` plus `Cl` and `Br`,
///   divided by the number of ASCII letters (weight 0.25).
///
/// The weighted sum is clamped to `0.0..=1.0`.
pub fn complexity_score(mol: &Molecule) -> ComplexityScore {
    let s = mol.as_str();
    let len = s.len();

    let ring_count = count_ring_bond_marks(s) / 2;
    let stereo_count = count_stereo_centers(s);

    // `.max(1)` keeps the ratio finite for text without any letters.
    let atom_chars: usize = s.chars().filter(|c| c.is_ascii_alphabetic()).count().max(1);
    let hetero_chars: usize = s
        .chars()
        .filter(|c| matches!(c, 'N' | 'O' | 'S' | 'P' | 'F' | 'I' | 'n' | 'o' | 's'))
        .count();
    let cl_count = count_non_overlapping(s, "Cl");
    let br_count = count_non_overlapping(s, "Br");
    let total_hetero = hetero_chars + cl_count + br_count;
    let heteroatom_ratio = total_hetero as f64 / atom_chars as f64;

    let ring_score = (ring_count as f64 / 6.0).min(1.0);
    let stereo_score = (stereo_count as f64 / 4.0).min(1.0);
    let len_score = (len as f64 / 300.0).min(1.0);
    let hetero_score = heteroatom_ratio.min(1.0);
    let score = (0.30 * ring_score + 0.20 * stereo_score + 0.25 * len_score + 0.25 * hetero_score)
        .clamp(0.0, 1.0);

    ComplexityScore {
        ring_count,
        stereo_count,
        heteroatom_ratio,
        smiles_length: len,
        score,
    }
}
/// Counts non-overlapping occurrences of `pat` in `haystack`.
///
/// An empty `pat` yields 0 rather than matching at every position.
fn count_non_overlapping(haystack: &str, pat: &str) -> usize {
    match pat {
        "" => 0,
        needle => haystack.match_indices(needle).count(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a fixture SMILES string, panicking if it is rejected.
    fn mol(smiles: &str) -> Molecule {
        Molecule::parse(smiles).unwrap()
    }

    /// Runs functional-group detection on a fixture SMILES string.
    fn groups_of(smiles: &str) -> Vec<FunctionalGroup> {
        detect_functional_groups(&mol(smiles))
    }

    /// Reports whether a rule with `rule_id` is among `matches`.
    fn fired(matches: &[RuleMatch], rule_id: &str) -> bool {
        matches.iter().any(|m| m.rule_id == rule_id)
    }

    // ---- Molecule::parse and basic accessors ----

    #[test]
    fn parse_valid_smiles() {
        for smiles in ["CCO", "c1ccccc1", "[Na]Cl", "CC(=O)O", "C(/F)=C/Cl"] {
            assert!(Molecule::parse(smiles).is_ok());
        }
    }

    #[test]
    fn parse_rejects_empty() {
        for blank in ["", " "] {
            assert_eq!(Molecule::parse(blank), Err(MoleculeError::Empty));
        }
    }

    #[test]
    fn parse_rejects_unbalanced_parens() {
        for broken in ["CC(=O", "CC)"] {
            assert_eq!(
                Molecule::parse(broken),
                Err(MoleculeError::UnbalancedParens)
            );
        }
    }

    #[test]
    fn parse_rejects_unbalanced_brackets() {
        let outcome = Molecule::parse("[Na");
        assert_eq!(outcome, Err(MoleculeError::UnbalancedBrackets));
    }

    #[test]
    fn parse_strips_whitespace() {
        assert_eq!(mol(" CCO ").as_str(), "CCO");
    }

    #[test]
    fn molecule_display() {
        let ethanol = mol("CCO");
        assert_eq!(format!("{ethanol}"), "CCO");
    }

    #[test]
    fn molecule_serde_roundtrip() {
        let original = mol("CC(=O)O");
        let json = serde_json::to_string(&original).unwrap();
        let restored: Molecule = serde_json::from_str(&json).unwrap();
        assert_eq!(original, restored);
    }

    // ---- detect_functional_groups ----

    #[test]
    fn detect_phosphonate() {
        assert!(groups_of("CCP(=O)(OC)OC").contains(&FunctionalGroup::Phosphonate));
    }

    #[test]
    fn detect_phosphorus_fluoride() {
        assert!(groups_of("CCP(=O)(F)OC").contains(&FunctionalGroup::PhosphorusFluoride));
    }

    #[test]
    fn detect_nitro_group() {
        assert!(groups_of("CC[N+](=O)[O-]").contains(&FunctionalGroup::Nitro));
    }

    #[test]
    fn detect_peroxide() {
        assert!(groups_of("COOC").contains(&FunctionalGroup::Peroxide));
    }

    #[test]
    fn detect_azide() {
        assert!(groups_of("CCN=N=N").contains(&FunctionalGroup::Azide));
    }

    #[test]
    fn detect_aromatic_amine() {
        assert!(groups_of("Nc1ccccc1").contains(&FunctionalGroup::AromaticAmine));
    }

    #[test]
    fn detect_perfluoro() {
        assert!(groups_of("CC(F)(F)F").contains(&FunctionalGroup::PerfluoroCarbon));
    }

    #[test]
    fn detect_mustard() {
        assert!(groups_of("ClCCSCCCl").contains(&FunctionalGroup::Mustard));
    }

    #[test]
    fn detect_heavy_metal() {
        assert!(groups_of("[Hg](Cl)Cl").contains(&FunctionalGroup::HeavyMetal));
    }

    #[test]
    fn detect_stereo() {
        assert!(groups_of("C[C@@H](O)F").contains(&FunctionalGroup::StereoCenter));
    }

    #[test]
    fn clean_molecule_no_groups() {
        let groups = groups_of("CCO");
        assert!(
            groups.is_empty(),
            "ethanol should have no alerts: {groups:?}"
        );
    }

    // ---- match_rules against the static rule tables ----

    #[test]
    fn cwc_sa01_matches_g_series_core() {
        let matches = match_rules(&mol("CCP(=O)(F)OC"), CWC_RULES);
        assert!(
            fired(&matches, "CWC-SA-01"),
            "G-series core should trigger CWC-SA-01: {matches:?}"
        );
    }

    #[test]
    fn cwc_sa03_matches_sulfur_mustard() {
        let matches = match_rules(&mol("ClCCSCCCl"), CWC_RULES);
        assert!(
            fired(&matches, "CWC-SA-03"),
            "sulfur mustard should trigger CWC-SA-03: {matches:?}"
        );
    }

    #[test]
    fn cwc_sa04_matches_nitrogen_mustard() {
        let matches = match_rules(&mol("ClCCN(CCCl)C"), CWC_RULES);
        assert!(
            fired(&matches, "CWC-SA-04"),
            "nitrogen mustard should trigger CWC-SA-04: {matches:?}"
        );
    }

    #[test]
    fn clean_molecule_no_cwc_matches() {
        let matches = match_rules(&mol("CCO"), CWC_RULES);
        assert!(matches.is_empty(), "ethanol should not trigger CWC rules");
    }

    #[test]
    fn explosive_rules_detect_polynitro() {
        let trinitro = mol("CC([N+](=O)[O-])([N+](=O)[O-])[N+](=O)[O-]");
        let matches = match_rules(&trinitro, EXPLOSIVE_RULES);
        assert!(
            fired(&matches, "EXP-SA-01"),
            "polynitro should trigger EXP-SA-01"
        );
    }

    #[test]
    fn smarts_rule_library_version() {
        assert_eq!(SMARTS_RULE_LIBRARY_VERSION, 1);
    }

    // ---- complexity_score ----

    #[test]
    fn simple_molecule_low_complexity() {
        let cs = complexity_score(&mol("CCO"));
        assert!(
            cs.score < 0.3,
            "ethanol should be low complexity: {}",
            cs.score
        );
        assert_eq!(cs.ring_count, 0);
        assert_eq!(cs.stereo_count, 0);
    }

    #[test]
    fn benzene_has_rings() {
        let cs = complexity_score(&mol("c1ccccc1"));
        assert!(cs.ring_count >= 1, "benzene should have ≥1 ring");
    }

    #[test]
    fn complex_molecule_higher_score() {
        let cs = complexity_score(&mol("CC1(C)SC2C(NC(=O)C2N)C1C(=O)O"));
        assert!(
            cs.score > 0.2,
            "penicillin-like should be moderate complexity: {}",
            cs.score
        );
    }

    #[test]
    fn long_smiles_high_length_component() {
        let chain = "C".repeat(300);
        let cs = complexity_score(&mol(&chain));
        assert!(cs.smiles_length >= 300);
        assert!(cs.score > 0.2, "long SMILES should have non-trivial score");
    }

    #[test]
    fn stereo_molecule_counted() {
        let cs = complexity_score(&mol("C[C@@H](O)[C@H](O)C"));
        assert!(cs.stereo_count >= 2);
    }
}