sdfrust 0.6.0

A fast, pure-Rust parser for SDF, MOL2, and XYZ chemical structure files
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
use std::collections::HashMap;

use crate::atom::Atom;
use crate::bond::{Bond, BondOrder};
use crate::collection::Collection;
use crate::sgroup::SGroup;
use crate::stereogroup::StereoGroup;

/// SDF format version.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum SdfFormat {
    /// V2000 format (fixed-width columns, max 999 atoms/bonds).
    #[default]
    V2000,
    /// V3000 format (variable-width, unlimited atoms/bonds).
    V3000,
}

impl SdfFormat {
    /// Returns the format string for the counts line.
    pub fn to_str(&self) -> &'static str {
        match self {
            SdfFormat::V2000 => "V2000",
            SdfFormat::V3000 => "V3000",
        }
    }

    /// Creates an SdfFormat from a string.
    pub fn parse(s: &str) -> Option<Self> {
        if s.contains("V3000") {
            Some(SdfFormat::V3000)
        } else if s.contains("V2000") {
            Some(SdfFormat::V2000)
        } else {
            None
        }
    }
}

/// Represents a molecule with atoms, bonds, and properties.
#[derive(Debug, Clone, PartialEq)]
pub struct Molecule {
    /// Molecule name (from the first line of the molfile).
    pub name: String,

    /// Program/timestamp line (second line of molfile).
    pub program_line: Option<String>,

    /// Comment line (third line of molfile).
    pub comment: Option<String>,

    /// List of atoms in the molecule.
    pub atoms: Vec<Atom>,

    /// List of bonds in the molecule.
    pub bonds: Vec<Bond>,

    /// Properties from the SDF data block (key-value pairs).
    pub properties: HashMap<String, String>,

    /// Format version (V2000 or V3000).
    pub format_version: SdfFormat,

    /// Enhanced stereochemistry groups (V3000).
    pub stereogroups: Vec<StereoGroup>,

    /// SGroups for superatoms, polymers, etc. (V3000).
    pub sgroups: Vec<SGroup>,

    /// Collections for atom lists, R-groups, etc. (V3000).
    pub collections: Vec<Collection>,
}

impl Molecule {
    /// Creates a new empty molecule with the given name.
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_string(),
            program_line: None,
            comment: None,
            atoms: Vec::new(),
            bonds: Vec::new(),
            properties: HashMap::new(),
            format_version: SdfFormat::V2000,
            stereogroups: Vec::new(),
            sgroups: Vec::new(),
            collections: Vec::new(),
        }
    }

    /// Returns true if this molecule requires V3000 format.
    ///
    /// Returns true if:
    /// - The molecule has more than 999 atoms or bonds
    /// - The molecule has V3000-only features (stereogroups, sgroups, collections)
    /// - Any atom has V3000-specific fields set
    /// - Any bond has extended bond types (coordination, hydrogen)
    pub fn needs_v3000(&self) -> bool {
        // V2000 limit is 999 atoms/bonds
        if self.atoms.len() > 999 || self.bonds.len() > 999 {
            return true;
        }

        // V3000-specific structures
        if !self.stereogroups.is_empty() || !self.sgroups.is_empty() || !self.collections.is_empty()
        {
            return true;
        }

        // V3000-only atom fields
        for atom in &self.atoms {
            if atom.v3000_id.is_some()
                || atom.atom_atom_mapping.is_some()
                || atom.rgroup_label.is_some()
                || atom.radical.is_some()
            {
                return true;
            }
        }

        // V3000-only bond types
        for bond in &self.bonds {
            if matches!(
                bond.order,
                crate::bond::BondOrder::Coordination | crate::bond::BondOrder::Hydrogen
            ) {
                return true;
            }
            if bond.v3000_id.is_some() || bond.reacting_center.is_some() {
                return true;
            }
        }

        false
    }

    /// Returns the number of atoms in the molecule.
    pub fn atom_count(&self) -> usize {
        self.atoms.len()
    }

    /// Returns the number of bonds in the molecule.
    pub fn bond_count(&self) -> usize {
        self.bonds.len()
    }

    /// Returns true if the molecule has no atoms.
    pub fn is_empty(&self) -> bool {
        self.atoms.is_empty()
    }

    /// Returns an iterator over all atoms.
    pub fn atoms(&self) -> impl Iterator<Item = &Atom> {
        self.atoms.iter()
    }

    /// Returns an iterator over all bonds.
    pub fn bonds(&self) -> impl Iterator<Item = &Bond> {
        self.bonds.iter()
    }

    /// Returns the atom at the given index, if it exists.
    pub fn get_atom(&self, index: usize) -> Option<&Atom> {
        self.atoms.get(index)
    }

    /// Returns all bonds connected to the given atom index.
    pub fn bonds_for_atom(&self, atom_index: usize) -> Vec<&Bond> {
        self.bonds
            .iter()
            .filter(|b| b.contains_atom(atom_index))
            .collect()
    }

    /// Returns the indices of atoms connected to the given atom.
    pub fn neighbors(&self, atom_index: usize) -> Vec<usize> {
        self.bonds
            .iter()
            .filter_map(|b| b.other_atom(atom_index))
            .collect()
    }

    /// Returns the molecular formula as a string (e.g., "C6H12O6").
    pub fn formula(&self) -> String {
        let mut counts: HashMap<&str, usize> = HashMap::new();
        for atom in &self.atoms {
            *counts.entry(atom.element.as_str()).or_insert(0) += 1;
        }

        // Standard order: C, H, then alphabetical
        let mut formula = String::new();

        if let Some(&c) = counts.get("C") {
            formula.push('C');
            if c > 1 {
                formula.push_str(&c.to_string());
            }
            counts.remove("C");
        }

        if let Some(&h) = counts.get("H") {
            formula.push('H');
            if h > 1 {
                formula.push_str(&h.to_string());
            }
            counts.remove("H");
        }

        let mut remaining: Vec<_> = counts.into_iter().collect();
        remaining.sort_by_key(|(elem, _)| *elem);

        for (elem, count) in remaining {
            formula.push_str(elem);
            if count > 1 {
                formula.push_str(&count.to_string());
            }
        }

        formula
    }

    /// Returns the total formal charge of the molecule.
    pub fn total_charge(&self) -> i32 {
        self.atoms.iter().map(|a| a.formal_charge as i32).sum()
    }

    /// Returns the centroid (geometric center) of the molecule.
    pub fn centroid(&self) -> Option<(f64, f64, f64)> {
        if self.atoms.is_empty() {
            return None;
        }

        let n = self.atoms.len() as f64;
        let sum_x: f64 = self.atoms.iter().map(|a| a.x).sum();
        let sum_y: f64 = self.atoms.iter().map(|a| a.y).sum();
        let sum_z: f64 = self.atoms.iter().map(|a| a.z).sum();

        Some((sum_x / n, sum_y / n, sum_z / n))
    }

    /// Translates the molecule by the given vector.
    pub fn translate(&mut self, dx: f64, dy: f64, dz: f64) {
        for atom in &mut self.atoms {
            atom.x += dx;
            atom.y += dy;
            atom.z += dz;
        }
    }

    /// Centers the molecule at the origin.
    pub fn center(&mut self) {
        if let Some((cx, cy, cz)) = self.centroid() {
            self.translate(-cx, -cy, -cz);
        }
    }

    /// Returns a property value by key, if it exists.
    pub fn get_property(&self, key: &str) -> Option<&str> {
        self.properties.get(key).map(|s| s.as_str())
    }

    /// Sets a property value.
    pub fn set_property(&mut self, key: &str, value: &str) {
        self.properties.insert(key.to_string(), value.to_string());
    }

    /// Returns true if the molecule contains any aromatic bonds.
    pub fn has_aromatic_bonds(&self) -> bool {
        self.bonds.iter().any(|b| b.is_aromatic())
    }

    /// Returns true if the molecule contains any charged atoms.
    pub fn has_charges(&self) -> bool {
        self.atoms.iter().any(|a| a.is_charged())
    }

    /// Returns the count of each element in the molecule.
    pub fn element_counts(&self) -> HashMap<String, usize> {
        let mut counts = HashMap::new();
        for atom in &self.atoms {
            *counts.entry(atom.element.clone()).or_insert(0) += 1;
        }
        counts
    }

    /// Calculates the sum of bond orders (useful for validation).
    pub fn total_bond_order(&self) -> f64 {
        self.bonds.iter().map(|b| b.order.order()).sum()
    }

    /// Returns atoms that match the given element.
    pub fn atoms_by_element(&self, element: &str) -> Vec<&Atom> {
        self.atoms.iter().filter(|a| a.element == element).collect()
    }

    /// Returns bonds with the given order.
    pub fn bonds_by_order(&self, order: BondOrder) -> Vec<&Bond> {
        self.bonds.iter().filter(|b| b.order == order).collect()
    }

    // ============================================================
    // Descriptor convenience methods
    // ============================================================

    /// Calculate the molecular weight (sum of atomic weights).
    ///
    /// Uses standard atomic weights (IUPAC 2021) for each element.
    /// Returns `None` if any atom has an unknown element.
    ///
    /// # Example
    ///
    /// ```rust
    /// use sdfrust::{Molecule, Atom};
    ///
    /// let mut mol = Molecule::new("water");
    /// mol.atoms.push(Atom::new(0, "O", 0.0, 0.0, 0.0));
    /// mol.atoms.push(Atom::new(1, "H", 1.0, 0.0, 0.0));
    /// mol.atoms.push(Atom::new(2, "H", -0.3, 0.95, 0.0));
    ///
    /// let mw = mol.molecular_weight().unwrap();
    /// assert!((mw - 18.015).abs() < 0.01);
    /// ```
    pub fn molecular_weight(&self) -> Option<f64> {
        crate::descriptors::molecular::molecular_weight(self)
    }

    /// Calculate the exact (monoisotopic) mass.
    ///
    /// Uses the mass of the most abundant isotope for each element.
    /// Returns `None` if any atom has an unknown element.
    pub fn exact_mass(&self) -> Option<f64> {
        crate::descriptors::molecular::exact_mass(self)
    }

    /// Count non-hydrogen atoms (heavy atoms).
    ///
    /// Heavy atoms are all atoms except hydrogen (H), deuterium (D), and tritium (T).
    pub fn heavy_atom_count(&self) -> usize {
        crate::descriptors::molecular::heavy_atom_count(self)
    }

    /// Count bonds by bond order.
    ///
    /// Returns a HashMap mapping each BondOrder to its count.
    pub fn bond_type_counts(&self) -> HashMap<BondOrder, usize> {
        crate::descriptors::molecular::bond_type_counts(self)
    }

    /// Count the number of rings in the molecule.
    ///
    /// Uses the Euler characteristic formula: rings = bonds - atoms + components.
    pub fn ring_count(&self) -> usize {
        crate::descriptors::topological::ring_count(self)
    }

    /// Check if an atom at the given index is in a ring.
    ///
    /// Returns `false` if the index is out of bounds.
    pub fn is_atom_in_ring(&self, idx: usize) -> bool {
        if idx >= self.atoms.len() {
            return false;
        }
        crate::descriptors::topological::ring_atoms(self)[idx]
    }

    /// Check if a bond at the given index is in a ring.
    ///
    /// Returns `false` if the index is out of bounds.
    pub fn is_bond_in_ring(&self, idx: usize) -> bool {
        if idx >= self.bonds.len() {
            return false;
        }
        crate::descriptors::topological::ring_bonds(self)[idx]
    }

    /// Count rotatable bonds.
    ///
    /// A bond is rotatable if it is a single bond, not in a ring,
    /// not terminal, and doesn't involve hydrogen atoms.
    pub fn rotatable_bond_count(&self) -> usize {
        crate::descriptors::topological::rotatable_bond_count(self)
    }
}

impl Default for Molecule {
    fn default() -> Self {
        Self::new("")
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::atom::Atom;
    use crate::bond::{Bond, BondOrder};

    #[test]
    fn test_needs_v3000_basic() {
        let mol = Molecule::new("test");
        assert!(!mol.needs_v3000());
    }

    #[test]
    fn test_needs_v3000_stereogroups() {
        let mut mol = Molecule::new("test");
        mol.stereogroups.push(StereoGroup::default());
        assert!(mol.needs_v3000());
    }

    #[test]
    fn test_needs_v3000_coordination_bond() {
        let mut mol = Molecule::new("test");
        mol.atoms.push(Atom::new(0, "C", 0.0, 0.0, 0.0));
        mol.atoms.push(Atom::new(1, "N", 1.0, 0.0, 0.0));
        mol.bonds.push(Bond::new(0, 1, BondOrder::Coordination));
        assert!(mol.needs_v3000());
    }
}