Expand description
§sdfrust
A fast, pure-Rust parser for SDF (Structure Data File), MOL2, and XYZ chemical structure files.
§Features
- Parse SDF V2000 and V3000 format files (single and multi-molecule)
- Parse TRIPOS MOL2 format files (single and multi-molecule)
- Parse XYZ coordinate files (single and multi-molecule, atomic numbers supported)
- Write SDF V2000 and V3000 format files
- Automatic format detection for SDF, MOL2, and XYZ files
- Transparent gzip decompression for all file parsers (optional `gzip` feature)
- Support for molecules with >999 atoms/bonds (V3000)
- Enhanced stereochemistry, SGroups, and collections (V3000)
- Iterate over large files without loading everything into memory
- Access atom coordinates, bonds, and molecule properties
- Zero external dependencies for parsing (only `thiserror` for error handling)
§Quick Start
§Parse a single molecule
use sdfrust::{parse_sdf_file, Molecule};
let molecule = parse_sdf_file("molecule.sdf")?;
println!("Name: {}", molecule.name);
println!("Atoms: {}", molecule.atom_count());
println!("Formula: {}", molecule.formula());
§Parse multiple molecules
use sdfrust::parse_sdf_file_multi;
let molecules = parse_sdf_file_multi("database.sdf")?;
for mol in &molecules {
println!("{}: {} atoms", mol.name, mol.atom_count());
}
§Iterate over a large file
use sdfrust::iter_sdf_file;
for result in iter_sdf_file("large_database.sdf")? {
let mol = result?;
// Process each molecule without loading all into memory
}
§Parse from string
use sdfrust::parse_sdf_string;
let sdf_content = r#"methane
5 4 0 0 0 0 0 0 0 0999 V2000
0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.6289 0.6289 0.6289 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.6289 -0.6289 0.6289 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.6289 0.6289 -0.6289 H 0 0 0 0 0 0 0 0 0 0 0 0
0.6289 -0.6289 -0.6289 H 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
1 3 1 0 0 0 0
1 4 1 0 0 0 0
1 5 1 0 0 0 0
M END
$$$$
"#;
let mol = parse_sdf_string(sdf_content).unwrap();
assert_eq!(mol.name, "methane");
assert_eq!(mol.atom_count(), 5);
assert_eq!(mol.formula(), "CH4");
§Write molecules
use sdfrust::{Molecule, Atom, Bond, BondOrder, write_sdf_string};
let mut mol = Molecule::new("water");
mol.atoms.push(Atom::new(0, "O", 0.0, 0.0, 0.0));
mol.atoms.push(Atom::new(1, "H", 0.96, 0.0, 0.0));
mol.atoms.push(Atom::new(2, "H", -0.24, 0.93, 0.0));
mol.bonds.push(Bond::new(0, 1, BondOrder::Single));
mol.bonds.push(Bond::new(0, 2, BondOrder::Single));
let sdf_output = write_sdf_string(&mol).unwrap();
println!("{}", sdf_output);
§Parse MOL2 files
use sdfrust::parse_mol2_string;
let mol2_content = r#"@<TRIPOS>MOLECULE
water
3 2 0 0 0
SMALL
NO_CHARGES
@<TRIPOS>ATOM
1 O1 0.0000 0.0000 0.0000 O.3 1 MOL 0.0000
2 H1 0.9572 0.0000 0.0000 H 1 MOL 0.0000
3 H2 -0.2400 0.9266 0.0000 H 1 MOL 0.0000
@<TRIPOS>BOND
1 1 2 1
2 1 3 1
"#;
let mol = parse_mol2_string(mol2_content).unwrap();
assert_eq!(mol.name, "water");
assert_eq!(mol.formula(), "H2O");
§Error Handling
All parsing functions return `Result<T, SdfError>`. The library provides specific
error variants for different failure modes:
use sdfrust::{parse_sdf_string, SdfError};
let result = parse_sdf_string("invalid content");
match result {
Ok(mol) => println!("Parsed: {}", mol.name),
Err(SdfError::EmptyFile) => println!("File was empty"),
Err(SdfError::Parse { line, message }) => {
println!("Parse error at line {}: {}", line, message);
}
Err(SdfError::InvalidCountsLine(s)) => {
println!("Bad counts line: {}", s);
}
Err(e) => println!("Other error: {}", e),
}
§Common Error Types
- `SdfError::Io` - File I/O errors (file not found, permission denied)
- `SdfError::Parse` - General parse errors with line number
- `SdfError::EmptyFile` - The file contains no data
- `SdfError::AtomCountMismatch` - Declared atom count doesn’t match actual atoms
- `SdfError::BondCountMismatch` - Declared bond count doesn’t match actual bonds
- `SdfError::InvalidAtomIndex` - Bond references non-existent atom
- `SdfError::InvalidBondOrder` - Unrecognized bond type
- `SdfError::InvalidCountsLine` - Malformed counts line in header
- `SdfError::MissingSection` - Required section not found (MOL2)
§Handling Multi-Molecule Files with Errors
When iterating, each molecule is parsed independently:
use sdfrust::iter_sdf_file;
let mut success_count = 0;
let mut error_count = 0;
for result in iter_sdf_file("database.sdf")? {
match result {
Ok(mol) => success_count += 1,
Err(e) => {
eprintln!("Skipping molecule: {}", e);
error_count += 1;
}
}
}
println!("Parsed {} molecules, {} errors", success_count, error_count);
§Working with Properties
SDF files can contain key-value properties in the data block. These are
stored as a `HashMap<String, String>` on the molecule.
§Getting Properties
use sdfrust::parse_sdf_string;
let sdf = r#"aspirin
1 0 0 0 0 0 0 0 0 0999 V2000
0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
M END
> <PUBCHEM_CID>
2244
> <MOLECULAR_WEIGHT>
180.16
$$$$
"#;
let mol = parse_sdf_string(sdf).unwrap();
// Get a single property
if let Some(cid) = mol.get_property("PUBCHEM_CID") {
assert_eq!(cid, "2244");
}
// Check if property exists
assert!(mol.properties.contains_key("MOLECULAR_WEIGHT"));
§Setting Properties
use sdfrust::Molecule;
let mut mol = Molecule::new("example");
mol.set_property("SMILES", "CCO");
mol.set_property("SOURCE", "generated");
assert_eq!(mol.get_property("SMILES"), Some("CCO"));
§Iterating Over Properties
use sdfrust::Molecule;
let mut mol = Molecule::new("example");
mol.set_property("MW", "180.16");
mol.set_property("CID", "2244");
for (key, value) in &mol.properties {
println!("{}: {}", key, value);
}
§Molecule Operations
The `Molecule` struct provides many useful methods for working with
chemical structure data.
§Molecular Formula
use sdfrust::parse_sdf_string;
let sdf = r#"water
3 2 0 0 0 0 0 0 0 0999 V2000
0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
0.9572 0.0000 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.2400 0.9266 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
1 3 1 0 0 0 0
M END
$$$$
"#;
let mol = parse_sdf_string(sdf).unwrap();
assert_eq!(mol.formula(), "H2O");
§Geometric Center (Centroid)
use sdfrust::{Molecule, Atom};
let mut mol = Molecule::new("example");
mol.atoms.push(Atom::new(0, "C", 0.0, 0.0, 0.0));
mol.atoms.push(Atom::new(1, "C", 2.0, 0.0, 0.0));
let (cx, cy, cz) = mol.centroid().unwrap();
assert!((cx - 1.0).abs() < 1e-6);
assert!((cy - 0.0).abs() < 1e-6);
§Bond Connectivity (Neighbors)
use sdfrust::{Molecule, Atom, Bond, BondOrder};
let mut mol = Molecule::new("methane");
mol.atoms.push(Atom::new(0, "C", 0.0, 0.0, 0.0));
mol.atoms.push(Atom::new(1, "H", 1.0, 0.0, 0.0));
mol.atoms.push(Atom::new(2, "H", -1.0, 0.0, 0.0));
mol.bonds.push(Bond::new(0, 1, BondOrder::Single));
mol.bonds.push(Bond::new(0, 2, BondOrder::Single));
// Get neighbors of the carbon (index 0)
let neighbors = mol.neighbors(0);
assert_eq!(neighbors.len(), 2);
assert!(neighbors.contains(&1));
assert!(neighbors.contains(&2));
§Element Counts
use sdfrust::{Molecule, Atom};
let mut mol = Molecule::new("ethanol");
mol.atoms.push(Atom::new(0, "C", 0.0, 0.0, 0.0));
mol.atoms.push(Atom::new(1, "C", 1.5, 0.0, 0.0));
mol.atoms.push(Atom::new(2, "O", 2.5, 0.0, 0.0));
mol.atoms.push(Atom::new(3, "H", 0.0, 1.0, 0.0));
let counts = mol.element_counts();
assert_eq!(counts.get("C"), Some(&2));
assert_eq!(counts.get("O"), Some(&1));
assert_eq!(counts.get("H"), Some(&1));
§Other Useful Methods
- `atom_count()` / `bond_count()` - Get counts
- `is_empty()` - Check if molecule has atoms
- `total_charge()` - Sum of formal charges
- `has_aromatic_bonds()` - Check for aromaticity
- `has_charges()` - Check for charged atoms
- `atoms_by_element("C")` - Filter atoms by element
- `bonds_by_order(BondOrder::Double)` - Filter bonds by type
- `translate(dx, dy, dz)` - Move molecule
- `center()` - Move centroid to origin
§Performance Tips
§Use Iterators for Large Files
For files with thousands of molecules, use the iterator API to process molecules one at a time without loading all into memory:
use sdfrust::iter_sdf_file;
// Memory efficient - processes one molecule at a time
for result in iter_sdf_file("large_database.sdf")? {
let mol = result?;
// Process and discard
}
// vs. loading all at once (uses more memory)
let all_molecules = parse_sdf_file_multi("large_database.sdf")?;
§Release Builds for Benchmarks
Parsing performance improves significantly with optimizations:
cargo build --release
cargo run --release --example benchmark
§Streaming vs Load-All Tradeoffs
| Approach | Memory | Speed | Use Case |
|---|---|---|---|
| `iter_sdf_file` | O(1) | Fast | Large files, filtering |
| `parse_sdf_file_multi` | O(n) | Fast | Need random access |
| `parse_sdf_string` | O(1) | Fastest | Single molecule |
§Format Notes
§Supported Formats
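- SDF V2000: Full support for reading and writing (up to 999 atoms/bonds)
- SDF V3000: Full support for reading and writing (see the SDF V3000 section below)
- MOL2 TRIPOS: Full support for reading (MOLECULE, ATOM, BOND sections)
- XYZ: Support for reading coordinate files (single and multi-molecule)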
§SDF V3000
SDF V3000 format is fully supported for both parsing and writing:
use sdfrust::{parse_sdf_auto_string, write_sdf_auto_string, SdfFormat};
// V3000 content is automatically detected and parsed
let v3000_content = r#"test
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 C 0.0000 0.0000 0.0000 0
M V30 2 O 1.2000 0.0000 0.0000 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 2 1 2
M V30 END BOND
M V30 END CTAB
M END
$$$$
"#;
let mol = parse_sdf_auto_string(v3000_content).unwrap();
assert_eq!(mol.format_version, SdfFormat::V3000);
§Format Detection
The library uses file content to determine format:
- SDF V2000 files contain `V2000` in the counts line
- SDF V3000 files contain `V3000` in the counts line
- MOL2 files start with `@<TRIPOS>MOLECULE`
Re-exports§
pub use atom::Atom;pub use bond::Bond;pub use bond::BondOrder;pub use bond::BondStereo;pub use collection::Collection;pub use collection::CollectionType;pub use error::Result;pub use error::SdfError;pub use molecule::Molecule;pub use molecule::SdfFormat;pub use sgroup::SGroup;pub use sgroup::SGroupType;pub use stereogroup::StereoGroup;pub use stereogroup::StereoGroupType;pub use parser::SdfIterator;pub use parser::SdfParser;pub use parser::detect_sdf_format;pub use parser::iter_sdf_file;pub use parser::parse_sdf_auto_file;pub use parser::parse_sdf_auto_file_multi;pub use parser::parse_sdf_auto_string;pub use parser::parse_sdf_auto_string_multi;pub use parser::parse_sdf_file;pub use parser::parse_sdf_file_multi;pub use parser::parse_sdf_string;pub use parser::parse_sdf_string_multi;pub use parser::AutoIterator;pub use parser::FileFormat;pub use parser::detect_format;pub use parser::iter_auto_file;pub use parser::parse_auto_file;pub use parser::parse_auto_file_multi;pub use parser::parse_auto_string;pub use parser::parse_auto_string_multi;pub use parser::SdfV3000Iterator;pub use parser::SdfV3000Parser;pub use parser::iter_sdf_v3000_file;pub use parser::parse_sdf_v3000_file;pub use parser::parse_sdf_v3000_file_multi;pub use parser::parse_sdf_v3000_string;pub use parser::parse_sdf_v3000_string_multi;pub use parser::Mol2Iterator;pub use parser::Mol2Parser;pub use parser::iter_mol2_file;pub use parser::parse_mol2_file;pub use parser::parse_mol2_file_multi;pub use parser::parse_mol2_string;pub use parser::parse_mol2_string_multi;pub use parser::XyzIterator;pub use parser::XyzParser;pub use parser::iter_xyz_file;pub use parser::parse_xyz_file;pub use parser::parse_xyz_file_multi;pub use parser::parse_xyz_string;pub use parser::parse_xyz_string_multi;pub use writer::write_sdf;pub use writer::write_sdf_file;pub use writer::write_sdf_file_multi;pub use writer::write_sdf_multi;pub use writer::write_sdf_string;pub use writer::needs_v3000;pub use writer::write_sdf_auto;pub use 
writer::write_sdf_auto_file;pub use writer::write_sdf_auto_string;pub use writer::write_sdf_v3000;pub use writer::write_sdf_v3000_file;pub use writer::write_sdf_v3000_file_multi;pub use writer::write_sdf_v3000_multi;pub use writer::write_sdf_v3000_string;pub use writer::write_mol2;pub use writer::write_mol2_file;pub use writer::write_mol2_file_multi;pub use writer::write_mol2_multi;pub use writer::write_mol2_string;pub use descriptors::BondInferenceConfig;pub use descriptors::infer_bonds;pub use descriptors::infer_bonds_with_config;pub use graph::AdjacencyList;pub use featurize::ogb::OgbAtomFeatures;pub use featurize::ogb::OgbBondFeatures;pub use featurize::ogb::OgbGraphFeatures;pub use fingerprints::ecfp::EcfpFingerprint;
Modules§
- atom
- bond
- collection
- Collection types for SDF V3000 format.
- descriptors
- Molecular descriptors for chemical structure analysis.
- error
- featurize
- ML-ready molecular featurization.
- fingerprints
- Molecular fingerprint algorithms.
- graph
- Graph adjacency infrastructure for molecular structures.
- molecule
- parser
- sgroup
- SGroup types for SDF V3000 format.
- stereogroup
- Enhanced stereochemistry types for SDF V3000 format.
- writer