use crate::types::OrganicInorganic;
use serde::{Deserialize, Serialize};
/// Functional groups that `detect_functional_groups` can report for a SMILES string.
///
/// Variants are serialised in `snake_case` (see the `rename_all` attribute),
/// e.g. `SulphonicAcid` becomes `sulphonic_acid`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FunctionalGroup {
/// Acid anhydride, matched by patterns such as `C(=O)OC(=O)`.
Anhydride,
/// Isocyanate, matched by `N=C=O` / `O=C=N`.
Isocyanate,
/// Nitrile, matched by `C#N` / `N#C`.
Nitrile,
/// Nitro group, matched by patterns such as `[N+](=O)[O-]`.
Nitro,
/// Epoxide (three-membered cyclic ether), matched by patterns such as `C1CO1`.
Epoxide,
/// Sulphonic acid, matched by patterns such as `S(=O)(=O)O`.
SulphonicAcid,
/// Phosphate, matched by patterns such as `OP(=O)`.
Phosphate,
/// Amide, matched by patterns such as `C(=O)N`.
Amide,
/// Ester, matched by patterns such as `C(=O)OC`.
Ester,
/// Carboxylic acid, matched by patterns such as `C(=O)O`.
CarboxylicAcid,
/// Aldehyde, matched by a terminal `C=O`.
Aldehyde,
/// Ketone, matched by patterns such as `C(=O)C`.
Ketone,
/// Phenol (oxygen on an aromatic carbon), matched by patterns such as `c(O)`.
Phenol,
/// Thiol, matched by patterns such as `[SH]`.
Thiol,
/// Sulphide (thioether), matched by patterns such as `CSC`.
Sulphide,
/// Alcohol, matched by patterns such as `C(O)` or a trailing `O`.
Alcohol,
/// Ether, matched by patterns such as `COC`.
Ether,
/// Amine, matched by patterns such as `CN` or `[NH2]`.
Amine,
/// Halide, matched by patterns such as `CF`, `CCl`, `CBr`, `CI`.
Halide,
/// Aromatic ring, signalled by lowercase (aromatic) atom symbols.
AromaticRing,
}
impl FunctionalGroup {
pub fn label(self) -> &'static str {
match self {
Self::Anhydride => "Anhydride",
Self::Isocyanate => "Isocyanate",
Self::Nitrile => "Nitrile",
Self::Nitro => "Nitro",
Self::Epoxide => "Epoxide",
Self::SulphonicAcid => "SulphonicAcid",
Self::Phosphate => "Phosphate",
Self::Amide => "Amide",
Self::Ester => "Ester",
Self::CarboxylicAcid => "CarboxylicAcid",
Self::Aldehyde => "Aldehyde",
Self::Ketone => "Ketone",
Self::Phenol => "Phenol",
Self::Thiol => "Thiol",
Self::Sulphide => "Sulphide",
Self::Alcohol => "Alcohol",
Self::Ether => "Ether",
Self::Amine => "Amine",
Self::Halide => "Halide",
Self::AromaticRing => "AromaticRing",
}
}
}
/// Classifies a SMILES string as organic, inorganic or organometallic.
///
/// The decision is a text heuristic, not a full SMILES parse:
///
/// 1. Spaces are stripped from the input.
/// 2. A string with no carbon atom is [`OrganicInorganic::Inorganic`].
/// 3. A string that exactly equals one of a few carbon-containing species
///    conventionally treated as inorganic (CO2, carbonate, CO, CS2, cyanide)
///    is [`OrganicInorganic::Inorganic`].
/// 4. If any listed metal atom is written directly next to a carbon atom the
///    result is [`OrganicInorganic::Organometallic`].
/// 5. Everything else is [`OrganicInorganic::Organic`].
pub fn classify_organic(smiles: &str) -> OrganicInorganic {
    // Carbon-containing species that are nevertheless classed as inorganic.
    const INORGANIC_CARBON_SPECIES: &[&str] = &[
        "O=C=O",
        "[O-]C(=O)[O-]",
        "[O-]C([O-])=O",
        "[C-]#[O+]",
        "[C+]#[O-]",
        "S=C=S",
        "[C-]#N",
        "[N+]#[C-]",
        "C(=O)([O-])[O-]",
    ];
    // Bracket atoms regarded as metals for the organometallic check.
    const METAL_SYMBOLS: &[&str] = &[
        "[Fe]", "[Co]", "[Ni]", "[Cr]", "[Mn]", "[Mo]", "[W]", "[Ti]", "[V]", "[Ru]", "[Rh]",
        "[Pd]", "[Os]", "[Ir]", "[Pt]", "[Zn]", "[Al]", "[Pb]", "[Sn]", "[Hg]", "[Tl]",
    ];

    let normalised = smiles.replace(' ', "");

    // A raw scan for 'C'/'c' would also hit Cl, Ca, Cu, Sc, ...; only real
    // carbon atoms make a compound a candidate for "organic".
    if !smiles_contains_carbon(&normalised) {
        return OrganicInorganic::Inorganic;
    }
    if INORGANIC_CARBON_SPECIES.contains(&normalised.as_str()) {
        return OrganicInorganic::Inorganic;
    }

    // Inspect every occurrence of every metal, not just the first one.
    let metal_next_to_carbon = METAL_SYMBOLS.iter().any(|metal| {
        normalised.match_indices(*metal).any(|(idx, found)| {
            let before = &normalised[..idx];
            let after = &normalised[idx + found.len()..];
            // "Cl" directly after the metal is chlorine, not carbon.
            let carbon_after =
                after.starts_with('c') || (after.starts_with('C') && !after.starts_with("Cl"));
            let carbon_before = before.ends_with('C') || before.ends_with('c');
            carbon_after || carbon_before
        })
    });
    if metal_next_to_carbon {
        return OrganicInorganic::Organometallic;
    }

    OrganicInorganic::Organic
}

/// Returns `true` when `smiles` contains at least one carbon atom.
///
/// Outside brackets `C` is carbon unless it is the first letter of `Cl`, and
/// `c` is aromatic carbon. Inside brackets only the element symbol (the first
/// character after any isotope digits) is examined, and an uppercase `C`
/// followed by a lowercase letter is a two-letter element such as `Cl`, `Ca`
/// or `Cu`.
fn smiles_contains_carbon(smiles: &str) -> bool {
    let mut chars = smiles.chars().peekable();
    let mut in_bracket = false;
    let mut awaiting_symbol = false;
    while let Some(ch) = chars.next() {
        let next = chars.peek().copied();
        match ch {
            '[' => {
                in_bracket = true;
                awaiting_symbol = true;
            }
            ']' => {
                in_bracket = false;
                awaiting_symbol = false;
            }
            _ if in_bracket => {
                if awaiting_symbol && !ch.is_ascii_digit() {
                    awaiting_symbol = false;
                    let two_letter = matches!(next, Some(n) if n.is_ascii_lowercase());
                    if ch == 'c' || (ch == 'C' && !two_letter) {
                        return true;
                    }
                }
            }
            'c' => return true,
            'C' if next != Some('l') => return true,
            _ => {}
        }
    }
    false
}
/// Detects functional groups in a SMILES string using substring heuristics.
///
/// The input is not parsed; each group is recognised by looking for a small
/// set of literal SMILES fragments. Groups are tested in priority order, and
/// a more specific group suppresses less specific ones that share atoms (an
/// ester is not also reported as a carboxylic acid or ether, an amide is not
/// also reported as a ketone, and so on).
///
/// Returns the detected groups in detection order; each group appears at most
/// once.
pub fn detect_functional_groups(smiles: &str) -> Vec<FunctionalGroup> {
    let mut groups: Vec<FunctionalGroup> = Vec::new();
    let any = |patterns: &[&str]| -> bool { patterns.iter().any(|p| smiles.contains(*p)) };

    // --- Anhydride (linear, or cyclic with ring-closure digit 1-9) ---------
    let cyclic_anhydride =
        (1u8..=9).any(|n| smiles.contains(format!("O=C{}OC(=O)", n).as_str()));
    let has_anhydride = smiles.contains("C(=O)OC(=O)") || cyclic_anhydride;
    if has_anhydride {
        groups.push(FunctionalGroup::Anhydride);
    }

    // --- Nitrogen / sulphur / phosphorus groups with distinctive text ------
    if any(&["N=C=O", "O=C=N"]) {
        groups.push(FunctionalGroup::Isocyanate);
    }
    if any(&["C#N", "N#C"]) {
        groups.push(FunctionalGroup::Nitrile);
    }
    if any(&[
        "O=[N+]([O-])",
        "[N+](=O)[O-]",
        "N(=O)=O",
        "[N+]([O-])=O",
        "[N+](=O)([O-])",
    ]) {
        groups.push(FunctionalGroup::Nitro);
    }
    if any(&["C1CO1", "C1OC1", "[C@@H]1O[C@H]1", "[C@H]1O[C@@H]1"]) {
        groups.push(FunctionalGroup::Epoxide);
    }
    if any(&["S(=O)(=O)O", "S(=O)(=O)[OH]", "S(O)(=O)=O", "[S](=O)(=O)O"]) {
        groups.push(FunctionalGroup::SulphonicAcid);
    }
    if smiles.contains('P')
        && any(&["P(=O)(O)", "P(=O)([O", "P(O)(O)", "P([OH])", "OP(=O)", "P(=O)O"])
    {
        groups.push(FunctionalGroup::Phosphate);
    }

    // --- Amide (suppressed when an isocyanate or nitrile explains the N) ---
    if any(&[
        "NC(=O)",
        "NC(C",
        "C(N)=O",
        "C(=O)N",
        "C(=O)[NH",
        "[NH]C(=O)",
        "[NH2]C(=O)",
        "N)=O",
    ]) {
        let has_iso = groups.contains(&FunctionalGroup::Isocyanate);
        let has_nitrile = groups.contains(&FunctionalGroup::Nitrile);
        if !has_iso && !has_nitrile {
            groups.push(FunctionalGroup::Amide);
        }
    }

    // --- Ester, then carboxylic acid (each yields to the one before) -------
    if !has_anhydride
        && any(&["OC(C)=O", "OC(=O)C", "C(=O)OC", "C(=O)Oc", "OC(CC", "OC(c"])
    {
        groups.push(FunctionalGroup::Ester);
    }
    let has_ester = groups.contains(&FunctionalGroup::Ester);
    if !has_ester && !has_anhydride && any(&["C(=O)O", "C(O)=O", "C(=O)[OH]"]) {
        groups.push(FunctionalGroup::CarboxylicAcid);
    }
    let has_acid = groups.contains(&FunctionalGroup::CarboxylicAcid);

    // --- Aldehyde / ketone (only when no higher-priority carbonyl matched) -
    let has_higher_carbonyl = groups.iter().any(|g| {
        matches!(
            g,
            FunctionalGroup::Amide
                | FunctionalGroup::Ester
                | FunctionalGroup::CarboxylicAcid
                | FunctionalGroup::Anhydride
        )
    });
    if !has_higher_carbonyl {
        // An oxygen-first carbonyl ("O=C...") is only an aldehyde when the
        // carbonyl carbon continues as a plain chain (or ends the string).
        // A branch, ring-closure digit or second multiple bond right after it
        // means the carbon has two substituents or is cumulated (ketone,
        // O=C=O, O=C=N...), so those forms must not be reported as aldehydes.
        let oxygen_first_aldehyde = smiles.strip_prefix("O=C").map_or(false, |rest| {
            !rest.starts_with(|c: char| {
                matches!(c, '(' | '=' | '#' | '%') || c.is_ascii_digit()
            })
        });
        let aldehyde = smiles.ends_with("C=O")
            || smiles.ends_with("[CH]=O")
            || oxygen_first_aldehyde
            || any(&["[CH]=O", "[CHO]"]);
        if aldehyde {
            groups.push(FunctionalGroup::Aldehyde);
        }
    }
    let has_aldehyde = groups.contains(&FunctionalGroup::Aldehyde);
    if !has_higher_carbonyl
        && !has_aldehyde
        && any(&["C(C)=O", "C(CC)=O", "C(CCC)=O", "C(c)=O", "c(=O)C", "C(=O)C"])
    {
        groups.push(FunctionalGroup::Ketone);
    }

    // --- Phenol ------------------------------------------------------------
    if any(&[
        "c1ccccc1O",
        "Oc1ccccc1",
        "c(O)",
        "c([OH])",
        "Oc1cc",
        "Oc1ccc",
        "c1cc(O)",
        "c1ccc(O)",
    ]) {
        groups.push(FunctionalGroup::Phenol);
    }
    let has_phenol = groups.contains(&FunctionalGroup::Phenol);

    // --- Thiol, then sulphide ------------------------------------------------
    if any(&["[SH]", "C[SH]", "c[SH]"]) || smiles.ends_with("CS") || smiles.ends_with("cS") {
        groups.push(FunctionalGroup::Thiol);
    }
    let has_sulphonic = groups.contains(&FunctionalGroup::SulphonicAcid);
    let has_thiol = groups.contains(&FunctionalGroup::Thiol);
    if !has_sulphonic
        && !has_thiol
        && smiles.contains('S')
        && any(&["CSC", "cSC", "CSc", "cSc", "C(S)C"])
    {
        groups.push(FunctionalGroup::Sulphide);
    }

    // --- Alcohol (suppressed by any group that already explains the O) -----
    if !has_phenol && !has_acid && !has_ester && !has_anhydride && !has_aldehyde {
        // A trailing 'O' is a hydroxyl only when it is single-bonded; a
        // trailing "=O" is a carbonyl oxygen (e.g. acetone "CC(C)=O").
        let terminal_hydroxyl = smiles.ends_with('O') && !smiles.ends_with("=O");
        // "C[OH]" and "C([OH])" are already covered by "[OH]".
        let alcohol = terminal_hydroxyl || any(&["[OH]", "C(O)"]);
        if alcohol {
            groups.push(FunctionalGroup::Alcohol);
        }
    }

    // --- Ether ---------------------------------------------------------------
    let has_epoxide = groups.contains(&FunctionalGroup::Epoxide);
    if !has_epoxide
        && !has_ester
        && !has_acid
        && !has_anhydride
        && any(&["COC", "cOC", "COc", "cOc"])
    {
        groups.push(FunctionalGroup::Ether);
    }

    // --- Amine (an amide only counts if a separate amine N is visible) -----
    let has_amide = groups.contains(&FunctionalGroup::Amide);
    let has_nitrile = groups.contains(&FunctionalGroup::Nitrile);
    let has_nitro = groups.contains(&FunctionalGroup::Nitro);
    if smiles.contains('N') && !has_nitrile && !has_nitro {
        let amine = any(&["CN", "NC", "[NH2]", "[NH3+]", "[NH]", "cN", "Nc"]);
        if amine && (!has_amide || any(&["[NH2]", "[NH3+]", "CN(", "N(C)C"])) {
            groups.push(FunctionalGroup::Amine);
        }
    }

    // --- Halide --------------------------------------------------------------
    if any(&[
        "CF", "CCl", "CBr", "CI", "Fc", "Clc", "Brc", "Ic", "[F]", "[Cl]", "[Br]", "[I]",
        "c[F]", "c[Cl]", "c[Br]", "c[I]", "CF3", "CCl3", "CHF", "CHCl", "CHBr",
    ]) {
        groups.push(FunctionalGroup::Halide);
    }

    // --- Aromatic ring -------------------------------------------------------
    if smiles_has_aromatic_atom(smiles) {
        groups.push(FunctionalGroup::AromaticRing);
    }

    groups
}

/// Returns `true` when `smiles` contains an aromatic (lowercase) `c`, `n`,
/// `o`, `s` or `p` atom.
///
/// Inside brackets only the element symbol (the first character after any
/// isotope digits) is examined, so the lowercase second letter of elements
/// such as `[Zn]`, `[Sn]`, `[Co]` or `[Os]` is not mistaken for an aromatic
/// atom.
fn smiles_has_aromatic_atom(smiles: &str) -> bool {
    let mut in_bracket = false;
    let mut awaiting_symbol = false;
    for ch in smiles.chars() {
        match ch {
            '[' => {
                in_bracket = true;
                awaiting_symbol = true;
            }
            ']' => {
                in_bracket = false;
                awaiting_symbol = false;
            }
            _ if in_bracket => {
                if awaiting_symbol && !ch.is_ascii_digit() {
                    awaiting_symbol = false;
                    if matches!(ch, 'c' | 'n' | 'o' | 's' | 'p') {
                        return true;
                    }
                }
            }
            'c' | 'n' | 'o' | 's' | 'p' => return true,
            _ => {}
        }
    }
    false
}
/// Coarse structural descriptors derived from a SMILES string.
///
/// Values are produced by `detect_structural_features`; `Default` yields all
/// counts at zero and all flags `false`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StructuralFeatures {
/// Number of carbon atoms counted in the SMILES string.
pub carbon_count: u32,
/// Number of hydroxyl-like oxygens counted in the SMILES string.
pub hydroxyl_count: u32,
/// Number of double-bonded oxygens (`=O`) found in the SMILES string.
pub carbonyl_count: u32,
/// Whether a ring-closure digit was found outside bracket atoms.
pub has_ring: bool,
/// Whether an aromatic (lowercase) carbon was found.
pub has_aromatic_ring: bool,
/// Whether a carbon-carbon double bond pattern was found.
pub has_cc_double_bond: bool,
/// Whether a halogen (F, Cl, Br, I) was found.
pub has_halogen: bool,
}
/// Derives coarse structural descriptors from a SMILES string.
///
/// Carbon and hydroxyl counts, ring presence and C=C detection are delegated
/// to the module's helper functions. The remaining fields are computed here:
///
/// * `carbonyl_count` counts double-bonded oxygens in both notations
///   (`...=O` and a component-leading `O=...`), so `CC(C)=O` and `O=C(C)C`
///   agree.
/// * `has_aromatic_ring` is true when an aromatic carbon atom (`c`) is present.
/// * `has_halogen` is true when an F, Cl, Br or I atom is present.
///
/// Aromatic-carbon and halogen detection look at element symbols rather than
/// raw characters, so bracket atoms such as `[Fe]`, `[Ir]`, `[In]` or `[Sc]`
/// are not mistaken for F, I or aromatic carbon.
pub fn detect_structural_features(smiles: &str) -> StructuralFeatures {
    let (has_aromatic_ring, has_halogen) = scan_aromatic_carbon_and_halogen(smiles);
    StructuralFeatures {
        carbon_count: count_carbons(smiles),
        hydroxyl_count: count_hydroxyls(smiles),
        carbonyl_count: count_double_bonded_oxygens(smiles),
        has_ring: ring_present(smiles),
        has_aromatic_ring,
        has_cc_double_bond: cc_double_bond_present(smiles),
        has_halogen,
    }
}

/// Counts double-bonded oxygens written either as `=O` or as a leading `O=`.
fn count_double_bonded_oxygens(smiles: &str) -> u32 {
    let oxygen_last = smiles.matches("=O").count();
    let bytes = smiles.as_bytes();
    // "O=" starts a new double bond only when the oxygen opens the string or
    // a dot-separated component. "O=O" is skipped because its single bond is
    // already counted through the "=O" form.
    let oxygen_first = smiles
        .match_indices("O=")
        .filter(|&(idx, _)| idx == 0 || bytes[idx - 1] == b'.')
        .filter(|&(idx, _)| !smiles[idx + 2..].starts_with('O'))
        .count();
    (oxygen_last + oxygen_first) as u32
}

/// Walks the atom symbols of `smiles` and reports
/// `(aromatic carbon present, halogen present)`.
///
/// Outside brackets `c` is aromatic carbon and `F`, `Cl`, `Br`, `I` are
/// halogens. Inside brackets only the element symbol (first character after
/// any isotope digits, plus an optional lowercase second letter) is examined.
fn scan_aromatic_carbon_and_halogen(smiles: &str) -> (bool, bool) {
    let mut aromatic = false;
    let mut halogen = false;
    let mut in_bracket = false;
    let mut awaiting_symbol = false;
    let mut chars = smiles.chars().peekable();
    while let Some(ch) = chars.next() {
        let next = chars.peek().copied();
        match ch {
            '[' => {
                in_bracket = true;
                awaiting_symbol = true;
            }
            ']' => {
                in_bracket = false;
                awaiting_symbol = false;
            }
            _ if in_bracket => {
                if !awaiting_symbol || ch.is_ascii_digit() {
                    continue;
                }
                awaiting_symbol = false;
                // Uppercase + lowercase inside brackets is a two-letter element.
                let two_letter = matches!(next, Some(n) if n.is_ascii_lowercase());
                match ch {
                    'c' => aromatic = true,
                    'F' | 'I' if !two_letter => halogen = true,
                    'C' if next == Some('l') => halogen = true,
                    'B' if next == Some('r') => halogen = true,
                    _ => {}
                }
            }
            'c' => aromatic = true,
            'F' | 'I' => halogen = true,
            'C' if next == Some('l') => halogen = true,
            'B' if next == Some('r') => halogen = true,
            _ => {}
        }
    }
    (aromatic, halogen)
}
/// Counts the carbon atoms in a SMILES string.
///
/// * Outside brackets, `C` is carbon unless it is the first letter of `Cl`,
///   and `c` is aromatic carbon.
/// * Inside brackets, only the element symbol is examined: the first
///   character after any isotope digits. An uppercase `C` followed by a
///   lowercase letter is a two-letter element (`Cl`, `Ca`, `Co`, `Cu`, ...)
///   and is not counted.
fn count_carbons(smiles: &str) -> u32 {
    let mut count = 0u32;
    let mut chars = smiles.chars().peekable();
    let mut in_bracket = false;
    // True from '[' until the element symbol of the bracket atom has been seen.
    let mut awaiting_symbol = false;
    while let Some(ch) = chars.next() {
        match ch {
            '[' => {
                in_bracket = true;
                awaiting_symbol = true;
            }
            ']' => {
                in_bracket = false;
                awaiting_symbol = false;
            }
            _ if in_bracket => {
                // Skip isotope digits and everything after the element symbol
                // (hydrogen count, charge, chirality).
                if !awaiting_symbol || ch.is_ascii_digit() {
                    continue;
                }
                awaiting_symbol = false;
                let two_letter = matches!(chars.peek(), Some(n) if n.is_ascii_lowercase());
                if ch == 'c' || (ch == 'C' && !two_letter) {
                    count += 1;
                }
            }
            'C' => {
                // Consume the 'l' of "Cl" so chlorine is not counted as carbon.
                if chars.next_if_eq(&'l').is_none() {
                    count += 1;
                }
            }
            'c' => count += 1,
            _ => {}
        }
    }
    count
}
/// Counts hydroxyl-like oxygens in a SMILES string.
///
/// * A bracket atom counts when its symbol (after any isotope digits) starts
///   with `OH`, e.g. `[OH]` or `[OH-]`.
/// * A bare `O` counts when it is single-bonded and has at most one
///   neighbour in the notation, i.e. it starts or ends a chain, branch or
///   dot-separated component.
///
/// Oxygens that are double-bonded on either side (`C=O`, `O=C`) or that sit
/// between two atoms (ethers, esters, ring oxygens such as `C1CO1`, branch
/// ethers such as `C(OC)C`) are not counted. A lone `O` (water) counts as one.
fn count_hydroxyls(smiles: &str) -> u32 {
    let chars: Vec<char> = smiles.chars().collect();
    let len = chars.len();
    let mut count = 0u32;
    let mut i = 0;
    while i < len {
        match chars[i] {
            '[' => {
                // Locate the closing bracket; an unterminated bracket runs to
                // the end of the string.
                let close = chars[i + 1..]
                    .iter()
                    .position(|&c| c == ']')
                    .map_or(len, |offset| i + 1 + offset);
                let symbol: String = chars[i + 1..close]
                    .iter()
                    .skip_while(|c| c.is_ascii_digit())
                    .collect();
                if symbol.starts_with("OH") {
                    count += 1;
                }
                i = close + 1;
                continue;
            }
            'O' => {
                let prev = if i > 0 { Some(chars[i - 1]) } else { None };
                let next = chars.get(i + 1).copied();
                let multiple_bond =
                    matches!(prev, Some('=') | Some('#')) || matches!(next, Some('=') | Some('#'));
                // Anything before the oxygen (atom, ring digit, branch
                // parenthesis) ties it to a preceding atom, except a
                // component separator.
                let bonded_before = !matches!(prev, None | Some('.'));
                // The oxygen continues to another atom unless the chain,
                // branch or component ends right after it.
                let bonded_after = !matches!(next, None | Some(')') | Some('.'));
                if !multiple_bond && !(bonded_before && bonded_after) {
                    count += 1;
                }
            }
            _ => {}
        }
        i += 1;
    }
    count
}
/// Reports whether the SMILES string contains a ring-closure digit.
///
/// Any ASCII digit outside a `[...]` bracket atom is treated as a ring
/// closure; digits inside brackets (isotopes, hydrogen counts, charges) are
/// ignored.
fn ring_present(smiles: &str) -> bool {
    let mut inside_bracket = false;
    smiles.chars().any(|symbol| {
        if symbol == '[' {
            inside_bracket = true;
            false
        } else if symbol == ']' {
            inside_bracket = false;
            false
        } else {
            !inside_bracket && symbol.is_ascii_digit()
        }
    })
}
/// Reports whether the SMILES string shows a carbon-carbon double bond.
///
/// This is a literal substring test for `C=C` in any aliphatic/aromatic case
/// combination, plus a branch consisting solely of a double-bonded carbon
/// (`(=C)` / `(=c)`).
fn cc_double_bond_present(smiles: &str) -> bool {
    const DOUBLE_BOND_FRAGMENTS: [&str; 6] = ["C=C", "c=c", "C=c", "c=C", "(=C)", "(=c)"];
    DOUBLE_BOND_FRAGMENTS
        .iter()
        .any(|&fragment| smiles.contains(fragment))
}
// Unit tests for the SMILES heuristics in this module: organic/inorganic
// classification, functional-group detection and structural features.
#[cfg(test)]
mod tests {
use super::*;
// Shorthand: all functional groups detected for `smiles`.
fn fg(smiles: &str) -> Vec<FunctionalGroup> {
detect_functional_groups(smiles)
}
// Shorthand: whether group `g` is detected for `smiles`.
fn has(smiles: &str, g: FunctionalGroup) -> bool {
fg(smiles).contains(&g)
}
// ---- classify_organic ----
#[test]
fn co2_is_inorganic() {
assert_eq!(classify_organic("O=C=O"), OrganicInorganic::Inorganic);
}
#[test]
fn water_is_inorganic() {
assert_eq!(classify_organic("O"), OrganicInorganic::Inorganic);
}
#[test]
fn ethanol_is_organic() {
assert_eq!(classify_organic("CCO"), OrganicInorganic::Organic);
}
#[test]
fn benzene_is_organic() {
assert_eq!(classify_organic("c1ccccc1"), OrganicInorganic::Organic);
}
// ---- detect_functional_groups ----
// Several tests also assert that a lower-priority group is NOT reported,
// pinning the suppression rules between overlapping groups.
#[test]
fn acetic_acid_detected() {
assert!(has("CC(=O)O", FunctionalGroup::CarboxylicAcid));
assert!(!has("CC(=O)O", FunctionalGroup::Ester));
}
#[test]
fn ethyl_acetate_detected_as_ester() {
assert!(has("CCOC(C)=O", FunctionalGroup::Ester));
assert!(!has("CCOC(C)=O", FunctionalGroup::CarboxylicAcid));
}
#[test]
fn phthalic_anhydride_detected() {
let groups = fg("O=C1OC(=O)c2ccccc21");
assert!(groups.contains(&FunctionalGroup::Anhydride));
assert!(!groups.contains(&FunctionalGroup::Ester));
}
#[test]
fn acetaldehyde_detected() {
assert!(has("CC=O", FunctionalGroup::Aldehyde));
assert!(!has("CC=O", FunctionalGroup::Ketone));
}
#[test]
fn acetaldehyde_not_classified_as_alcohol() {
assert!(!has("CC=O", FunctionalGroup::Alcohol),
"aldehyde SMILES 'CC=O' must not produce Alcohol group");
}
#[test]
fn acetone_detected_as_ketone() {
assert!(has("CC(C)=O", FunctionalGroup::Ketone));
assert!(!has("CC(C)=O", FunctionalGroup::Aldehyde));
}
#[test]
fn ethanol_detected_as_alcohol() {
assert!(has("CCO", FunctionalGroup::Alcohol));
assert!(!has("CCO", FunctionalGroup::Ether));
}
#[test]
fn dimethyl_ether_detected() {
assert!(has("COC", FunctionalGroup::Ether));
assert!(!has("COC", FunctionalGroup::Alcohol));
}
#[test]
fn methylamine_detected() {
assert!(has("CN", FunctionalGroup::Amine));
}
#[test]
fn acetamide_detected() {
assert!(has("CC(N)=O", FunctionalGroup::Amide));
assert!(!has("CC(N)=O", FunctionalGroup::Ketone));
}
#[test]
fn acetonitrile_detected() {
assert!(has("CC#N", FunctionalGroup::Nitrile));
}
#[test]
fn chloromethane_detected() {
assert!(has("CCl", FunctionalGroup::Halide));
}
#[test]
fn ethylene_oxide_detected() {
assert!(has("C1CO1", FunctionalGroup::Epoxide));
}
#[test]
fn benzene_detected_as_aromatic() {
assert!(has("c1ccccc1", FunctionalGroup::AromaticRing));
}
#[test]
fn phenol_detected() {
assert!(has("Oc1ccccc1", FunctionalGroup::Phenol));
}
#[test]
fn nitrobenzene_detected() {
assert!(has("O=[N+]([O-])c1ccccc1", FunctionalGroup::Nitro));
}
#[test]
fn ethanesulfonic_acid_detected() {
assert!(has("CCS(=O)(=O)O", FunctionalGroup::SulphonicAcid));
}
#[test]
fn dimethyl_sulfide_detected() {
assert!(has("CSC", FunctionalGroup::Sulphide));
}
#[test]
fn methanethiol_detected() {
assert!(has("C[SH]", FunctionalGroup::Thiol));
}
#[test]
fn isocyanate_detected() {
assert!(has("CN=C=O", FunctionalGroup::Isocyanate));
}
#[test]
fn trimethyl_phosphate_detected() {
assert!(has("COP(=O)(OC)OC", FunctionalGroup::Phosphate));
}
// ---- detect_structural_features ----
// Shorthand: structural features computed for `smiles`.
fn sf(smiles: &str) -> StructuralFeatures {
detect_structural_features(smiles)
}
#[test]
fn acetone_carbon_count_3() {
let f = sf("CC(C)=O");
assert_eq!(f.carbon_count, 3);
assert!(!f.has_ring);
assert!(!f.has_aromatic_ring);
assert!(!f.has_cc_double_bond);
assert_eq!(f.carbonyl_count, 1);
}
#[test]
fn ethanol_hydroxyl_count_1() {
let f = sf("CCO");
assert_eq!(f.carbon_count, 2);
assert_eq!(f.hydroxyl_count, 1);
}
#[test]
fn ethylene_glycol_hydroxyl_count_2() {
let f = sf("OCCO");
assert_eq!(f.carbon_count, 2);
assert_eq!(f.hydroxyl_count, 2);
}
#[test]
fn glycerol_hydroxyl_count_3() {
let f = sf("OCC(O)CO");
assert_eq!(f.carbon_count, 3);
assert_eq!(f.hydroxyl_count, 3);
}
#[test]
fn ether_oxygen_not_counted_as_oh() {
let f = sf("COC");
assert_eq!(f.hydroxyl_count, 0);
}
#[test]
fn acetic_acid_one_oh() {
let f = sf("CC(=O)O");
assert_eq!(f.carbon_count, 2);
assert_eq!(f.hydroxyl_count, 1);
assert_eq!(f.carbonyl_count, 1);
}
#[test]
fn acrylic_acid_has_cc_double_bond() {
let f = sf("C=CC(=O)O");
assert!(f.has_cc_double_bond);
assert_eq!(f.carbon_count, 3);
}
// Exercises the `(=C)` branch form of the C=C check.
#[test]
fn methacrylic_acid_has_cc_double_bond() {
let f = sf("CC(=C)C(=O)O");
assert!(f.has_cc_double_bond);
assert_eq!(f.carbon_count, 4);
}
#[test]
fn benzene_has_aromatic_ring() {
let f = sf("c1ccccc1");
assert!(f.has_ring);
assert!(f.has_aromatic_ring);
assert_eq!(f.carbon_count, 6);
}
#[test]
fn cyclohexanone_is_ring_no_aromatic() {
let f = sf("O=C1CCCCC1");
assert!(f.has_ring);
assert!(!f.has_aromatic_ring);
assert_eq!(f.carbon_count, 6);
}
// "Cl" must register as a halogen without inflating the carbon count.
#[test]
fn chlorobenzene_has_halogen() {
let f = sf("Clc1ccccc1");
assert!(f.has_halogen);
assert_eq!(f.carbon_count, 6);
}
#[test]
fn methanol_carbon_count_1() {
let f = sf("CO");
assert_eq!(f.carbon_count, 1);
assert_eq!(f.hydroxyl_count, 1);
}
}