use std::collections::{HashMap, HashSet};
use chematic_core::{AtomIdx, BondIdx, BondOrder, Molecule};
pub fn canonical_smiles(mol: &Molecule) -> String {
if mol.atom_count() == 0 {
return String::new();
}
let ranks = morgan_ranks(mol);
CanonicalWriter::new(mol, &ranks).write_all()
}
pub fn morgan_ranks(mol: &Molecule) -> Vec<u64> {
let n = mol.atom_count();
let mut ranks: Vec<u64> = (0..n)
.map(|i| initial_invariant(mol, AtomIdx(i as u32)))
.collect();
let max_iter = n + 2;
for _ in 0..max_iter {
let old_distinct = count_distinct(&ranks);
let new_ranks: Vec<u64> = (0..n)
.map(|i| {
let idx = AtomIdx(i as u32);
let mut neighbor_ranks: Vec<u64> = mol
.neighbors(idx)
.map(|(nb, _)| ranks[nb.0 as usize])
.collect();
neighbor_ranks.sort_unstable();
fnv_hash_sequence(ranks[i], &neighbor_ranks)
})
.collect();
let new_distinct = count_distinct(&new_ranks);
ranks = new_ranks;
if new_distinct <= old_distinct {
break;
}
}
normalize_ranks(&ranks)
}
fn initial_invariant(mol: &Molecule, idx: AtomIdx) -> u64 {
let atom = mol.atom(idx);
if atom.wildcard {
return 0;
}
let an = atom.element.atomic_number() as u64;
let degree = mol.degree(idx) as u64;
let charge = (atom.charge as i64 + 128) as u64;
let iso = atom.isotope.unwrap_or(0) as u64;
let arom = atom.aromatic as u64;
let h_flag = atom.hydrogen_count.map(|h| h as u64 + 1).unwrap_or(0);
(an << 56)
| (degree << 48)
| (charge << 40)
| (iso << 24)
| (h_flag << 16)
| (arom << 8)
}
fn fnv_hash_sequence(base: u64, values: &[u64]) -> u64 {
const FNV_PRIME: u64 = 0x0000_0100_0000_01B3;
const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
let mut h = FNV_OFFSET ^ base.wrapping_mul(FNV_PRIME);
for &v in values {
h ^= v;
h = h.wrapping_mul(FNV_PRIME);
}
h
}
fn count_distinct(ranks: &[u64]) -> usize {
let mut seen: Vec<u64> = ranks.to_vec();
seen.sort_unstable();
seen.dedup();
seen.len()
}
fn normalize_ranks(ranks: &[u64]) -> Vec<u64> {
let mut sorted: Vec<(u64, usize)> = ranks
.iter()
.copied()
.enumerate()
.map(|(i, v)| (v, i))
.collect();
sorted.sort_unstable_by_key(|&(v, _)| v);
let mut result = vec![0u64; ranks.len()];
let mut current_rank: u64 = 0;
let mut prev_val = sorted[0].0;
for (val, idx) in sorted {
if val != prev_val {
current_rank += 1;
prev_val = val;
}
result[idx] = current_rank;
}
result
}
struct CanonicalWriter<'a> {
mol: &'a Molecule,
ranks: &'a [u64],
written: Vec<bool>,
ring_bonds: HashSet<BondIdx>,
atom_ring_nums: HashMap<AtomIdx, Vec<(u8, BondOrder)>>,
next_ring: u8,
out: String,
}
impl<'a> CanonicalWriter<'a> {
fn new(mol: &'a Molecule, ranks: &'a [u64]) -> Self {
let n = mol.atom_count();
Self {
mol,
ranks,
written: vec![false; n],
ring_bonds: HashSet::new(),
atom_ring_nums: HashMap::new(),
next_ring: 1,
out: String::new(),
}
}
fn write_all(mut self) -> String {
self.find_ring_closures();
let starts = self.canonical_atom_list();
let mut first = true;
for start in starts {
if self.written[start.0 as usize] { continue; }
if !first { self.out.push('.'); }
first = false;
self.write_chain(start, None, None);
}
self.out
}
fn canonical_atom_list(&self) -> Vec<AtomIdx> {
let mut atoms: Vec<AtomIdx> = (0..self.mol.atom_count())
.map(|i| AtomIdx(i as u32))
.collect();
atoms.sort_by(|&a, &b| self.canonical_cmp(b, a)); atoms
}
fn canonical_cmp(&self, a: AtomIdx, b: AtomIdx) -> std::cmp::Ordering {
let ra = self.ranks[a.0 as usize];
let rb = self.ranks[b.0 as usize];
if ra != rb { return ra.cmp(&rb); }
let atom_a = self.mol.atom(a);
let atom_b = self.mol.atom(b);
atom_a.element.atomic_number().cmp(&atom_b.element.atomic_number())
.then_with(|| atom_a.isotope.unwrap_or(0).cmp(&atom_b.isotope.unwrap_or(0)))
.then_with(|| atom_a.charge.cmp(&atom_b.charge))
.then_with(|| (atom_a.aromatic as u8).cmp(&(atom_b.aromatic as u8)))
.then_with(|| self.mol.degree(a).cmp(&self.mol.degree(b)))
}
fn find_ring_closures(&mut self) {
let n = self.mol.atom_count();
let mut visited = vec![false; n];
let mut in_stack = vec![false; n];
let starts = self.canonical_atom_list();
for start in starts {
if !visited[start.0 as usize] {
self.dfs_mark(start, None, &mut visited, &mut in_stack);
}
}
}
fn dfs_mark(
&mut self,
atom: AtomIdx,
from_bond: Option<BondIdx>,
visited: &mut Vec<bool>,
in_stack: &mut Vec<bool>,
) {
visited[atom.0 as usize] = true;
in_stack[atom.0 as usize] = true;
let mut neighbors: Vec<(AtomIdx, BondIdx)> = self.mol.neighbors(atom).collect();
self.sort_neighbors_canonical(&mut neighbors);
for (neighbor, bidx) in neighbors {
if Some(bidx) == from_bond { continue; }
if self.ring_bonds.contains(&bidx) { continue; }
if !visited[neighbor.0 as usize] {
self.dfs_mark(neighbor, Some(bidx), visited, in_stack);
} else if in_stack[neighbor.0 as usize] {
self.ring_bonds.insert(bidx);
let rn = self.next_ring;
self.next_ring += 1;
let order = self.mol.bond(bidx).order;
self.atom_ring_nums.entry(neighbor).or_default().push((rn, order));
self.atom_ring_nums.entry(atom).or_default().push((rn, order));
}
}
in_stack[atom.0 as usize] = false;
}
fn write_chain(
&mut self,
atom: AtomIdx,
from_atom: Option<AtomIdx>,
incoming_bond: Option<BondOrder>,
) {
self.written[atom.0 as usize] = true;
if let Some(bond) = incoming_bond {
self.out.push(bond.smiles_char());
}
self.emit_atom(atom);
if let Some(rings) = self.atom_ring_nums.remove(&atom) {
for (rn, bond_order) in rings {
let atom_arom = self.mol.atom(atom).aromatic;
if !(bond_order == BondOrder::Aromatic && atom_arom)
&& bond_order != BondOrder::Single
{
self.out.push(bond_order.smiles_char());
}
if rn >= 10 {
self.out.push('%');
self.out.push(char::from_digit((rn / 10) as u32, 10).unwrap());
self.out.push(char::from_digit((rn % 10) as u32, 10).unwrap());
} else {
self.out.push(char::from_digit(rn as u32, 10).unwrap());
}
}
}
let mut children: Vec<(AtomIdx, BondOrder)> = self.mol
.neighbors(atom)
.filter(|(nb, bidx)| {
Some(*nb) != from_atom
&& !self.written[nb.0 as usize]
&& !self.ring_bonds.contains(bidx)
})
.map(|(nb, bidx)| (nb, self.mol.bond(bidx).order))
.collect();
children.sort_by(|&(a, _), &(b, _)| self.canonical_cmp(a, b));
let n = children.len();
for (i, (child, bond_order)) in children.into_iter().enumerate() {
let is_last = i == n - 1;
let parent_arom = self.mol.atom(atom).aromatic;
let child_arom = self.mol.atom(child).aromatic;
let implicit = match bond_order {
BondOrder::Single => !(parent_arom && child_arom),
BondOrder::Aromatic => parent_arom && child_arom,
_ => false,
};
let written_bond = if implicit { None } else { Some(bond_order) };
if !is_last {
self.out.push('(');
self.write_chain(child, Some(atom), written_bond);
self.out.push(')');
} else {
self.write_chain(child, Some(atom), written_bond);
}
}
}
fn sort_neighbors_canonical(&self, neighbors: &mut Vec<(AtomIdx, BondIdx)>) {
neighbors.sort_by(|&(a, _), &(b, _)| self.canonical_cmp(b, a)); }
fn emit_atom(&mut self, idx: AtomIdx) {
let atom = self.mol.atom(idx);
if atom.wildcard {
self.out.push('[');
self.out.push('*');
self.out.push(']');
return;
}
let needs_bracket = atom.isotope.is_some()
|| atom.charge != 0
|| atom.hydrogen_count.is_some()
|| !atom.element.is_organic_subset()
|| atom.atom_map.is_some();
if needs_bracket {
self.out.push('[');
if let Some(iso) = atom.isotope {
self.out.push_str(&iso.to_string());
}
let sym = if atom.aromatic {
atom.element.symbol().to_lowercase()
} else {
atom.element.symbol().to_string()
};
self.out.push_str(&sym);
match atom.chirality {
chematic_core::Chirality::CounterClockwise => self.out.push('@'),
chematic_core::Chirality::Clockwise => self.out.push_str("@@"),
chematic_core::Chirality::None => {}
}
if let Some(h) = atom.hydrogen_count {
if h > 0 {
self.out.push('H');
if h > 1 { self.out.push_str(&h.to_string()); }
}
}
match atom.charge {
0 => {}
1 => self.out.push('+'),
-1 => self.out.push('-'),
c if c > 0 => { self.out.push('+'); self.out.push_str(&c.to_string()); }
c => self.out.push_str(&c.to_string()),
}
if let Some(m) = atom.atom_map {
self.out.push(':');
self.out.push_str(&m.to_string());
}
self.out.push(']');
} else if atom.aromatic {
self.out.push_str(&atom.element.symbol().to_lowercase());
} else {
self.out.push_str(atom.element.symbol());
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::parse;
fn is_stable(smiles: &str) -> bool {
let mol1 = parse(smiles).expect(smiles);
let c1 = canonical_smiles(&mol1);
assert!(!c1.is_empty(), "canonical_smiles returned empty for '{smiles}'");
let mol2 = parse(&c1).unwrap_or_else(|e| {
panic!("canonical SMILES '{c1}' is not parseable: {e}")
});
let c2 = canonical_smiles(&mol2);
c1 == c2
}
fn same_canonical(a: &str, b: &str) -> bool {
let mol_a = parse(a).expect(a);
let mol_b = parse(b).expect(b);
canonical_smiles(&mol_a) == canonical_smiles(&mol_b)
}
#[test]
fn test_methane_stable() { assert!(is_stable("C")); }
#[test]
fn test_ethane_stable() { assert!(is_stable("CC")); }
#[test]
fn test_ethanol_stable() { assert!(is_stable("CCO")); }
#[test]
fn test_acetic_acid_stable() { assert!(is_stable("CC(=O)O")); }
#[test]
fn test_benzene_stable() { assert!(is_stable("c1ccccc1")); }
#[test]
fn test_pyridine_stable() { assert!(is_stable("c1ccncc1")); }
#[test]
fn test_naphthalene_stable() { assert!(is_stable("c1ccc2ccccc2c1")); }
#[test]
fn test_aspirin_stable() { assert!(is_stable("CC(=O)Oc1ccccc1C(=O)O")); }
#[test]
fn test_caffeine_stable() { assert!(is_stable("Cn1cnc2c1c(=O)n(c(=O)n2C)C")); }
#[test]
fn test_ethanol_same_from_different_starts() {
assert!(same_canonical("CCO", "OCC"));
}
#[test]
fn test_isobutane_same_canonical() {
assert!(same_canonical("CC(C)C", "C(C)(C)C"));
}
#[test]
fn test_wildcard_roundtrip() {
let mol = parse("[*]CC").unwrap();
let c = canonical_smiles(&mol);
assert!(!c.is_empty());
let mol2 = parse(&c).unwrap();
assert_eq!(mol.atom_count(), mol2.atom_count());
assert!(is_stable("[*]CC"));
}
#[test]
fn test_disconnected_stable() {
assert!(is_stable("[Na+].[Cl-]"));
}
}