use super::OcrResult;
/// Produces SMILES strings from OCR output and offers a few lightweight
/// helpers (delimiter validation, element tallies, approximate weights).
///
/// The two options are recorded by the builder methods; none of the methods
/// defined in this block consult them yet.
#[derive(Debug, Clone)]
pub struct SmilesGenerator {
    /// Whether canonical output was requested.
    canonical: bool,
    /// Whether stereochemistry markers were requested.
    include_stereochemistry: bool,
}

impl SmilesGenerator {
    /// Creates a generator with both options switched on.
    pub fn new() -> Self {
        Self {
            canonical: true,
            include_stereochemistry: true,
        }
    }

    /// Builder-style setter for the `canonical` option.
    pub fn canonical(mut self, canonical: bool) -> Self {
        self.canonical = canonical;
        self
    }

    /// Builder-style setter for the `include_stereochemistry` option.
    pub fn stereochemistry(mut self, include: bool) -> Self {
        self.include_stereochemistry = include;
        self
    }

    /// Extracts a SMILES string from an OCR result.
    ///
    /// A SMILES string already present in `result.formats.smiles` is returned
    /// as-is. Otherwise every line whose type is `"chemistry"` or `"molecule"`
    /// is tried in order and the first one that converts wins.
    ///
    /// # Errors
    ///
    /// Returns the conversion error of the first chemistry/molecule line when
    /// none of them converts, or `"No chemical structure data found"` when
    /// there is no such line at all.
    pub fn generate_from_result(&self, result: &OcrResult) -> Result<String, String> {
        if let Some(smiles) = &result.formats.smiles {
            return Ok(smiles.clone());
        }

        // Keep scanning after a failed line: a later line may still hold a
        // usable structure. Only the first failure is remembered for reporting.
        let mut first_error: Option<String> = None;
        if let Some(line_data) = &result.line_data {
            for line in line_data {
                if line.line_type == "chemistry" || line.line_type == "molecule" {
                    match self.parse_chemical_notation(&line.text) {
                        Ok(smiles) => return Ok(smiles),
                        Err(err) => {
                            first_error.get_or_insert(err);
                        }
                    }
                }
            }
        }

        Err(first_error.unwrap_or_else(|| "No chemical structure data found".to_string()))
    }

    /// Converts a single piece of chemical notation into SMILES.
    ///
    /// Surrounding whitespace is ignored. The known-formula table is consulted
    /// first because some formulas (for example `CO2`) consist solely of
    /// characters that also pass the SMILES heuristic and would otherwise be
    /// returned verbatim.
    fn parse_chemical_notation(&self, notation: &str) -> Result<String, String> {
        let trimmed = notation.trim();
        if let Some(smiles) = self.simple_formula_to_smiles(trimmed) {
            return Ok(smiles);
        }
        if self.is_smiles(trimmed) {
            return Ok(trimmed.to_string());
        }
        Err(format!("Cannot convert '{}' to SMILES", notation))
    }

    /// Character-level heuristic: does `s` look like a SMILES string?
    ///
    /// Outside `[...]` only organic-subset atoms, bonds, branches, ring digits
    /// and the `.`, `/`, `\`, `%` separators are accepted. Inside `[...]` any
    /// ASCII letter or digit is accepted together with charge, chirality,
    /// class and wildcard markers, so bracket atoms such as `[Na+]` or `[C@H]`
    /// pass. The empty string is never SMILES. This does not prove the string
    /// is well-formed.
    fn is_smiles(&self, s: &str) -> bool {
        const OUTSIDE_BRACKETS: &str = "CNOPSFClBrI[]()=#@+-0123456789cnops./\\%";
        const INSIDE_BRACKETS_EXTRA: &str = "+-@:*]";

        if s.is_empty() {
            return false;
        }

        let mut in_bracket = false;
        s.chars().all(|c| {
            let allowed = if in_bracket {
                c.is_ascii_alphanumeric() || INSIDE_BRACKETS_EXTRA.contains(c)
            } else {
                OUTSIDE_BRACKETS.contains(c)
            };
            match c {
                '[' => in_bracket = true,
                ']' => in_bracket = false,
                _ => {}
            }
            allowed
        })
    }

    /// Looks up a handful of well-known formulas and names (exact match after
    /// trimming) and returns their SMILES form.
    fn simple_formula_to_smiles(&self, formula: &str) -> Option<String> {
        let smiles = match formula.trim() {
            "H2O" | "water" => "O",
            "CO2" | "carbon dioxide" => "O=C=O",
            "CH4" | "methane" => "C",
            "C2H6" | "ethane" => "CC",
            "C2H5OH" | "ethanol" => "CCO",
            "CH3COOH" | "acetic acid" => "CC(=O)O",
            "C6H6" | "benzene" => "c1ccccc1",
            "C6H12O6" | "glucose" => "OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O",
            "NH3" | "ammonia" => "N",
            "H2SO4" | "sulfuric acid" => "OS(=O)(=O)O",
            "NaCl" | "sodium chloride" => "[Na+].[Cl-]",
            _ => return None,
        };
        Some(smiles.to_string())
    }

    /// Checks that parentheses and square brackets are balanced.
    ///
    /// Parentheses are checked over the whole string before brackets, so a
    /// string with both problems reports the parenthesis error.
    ///
    /// # Errors
    ///
    /// `"Unbalanced parentheses"` or `"Unbalanced brackets"`.
    pub fn validate(&self, smiles: &str) -> Result<(), String> {
        Self::check_balanced(smiles, '(', ')', "Unbalanced parentheses")?;
        Self::check_balanced(smiles, '[', ']', "Unbalanced brackets")
    }

    /// Fails with `message` if `close` ever appears without a pending `open`,
    /// or if any `open` is left unclosed at the end.
    fn check_balanced(smiles: &str, open: char, close: char, message: &str) -> Result<(), String> {
        let mut depth = 0usize;
        for c in smiles.chars() {
            if c == open {
                depth += 1;
            } else if c == close {
                depth = depth.checked_sub(1).ok_or_else(|| message.to_string())?;
            }
        }
        if depth == 0 {
            Ok(())
        } else {
            Err(message.to_string())
        }
    }

    /// Builds a formula string from the atoms written in `smiles`.
    ///
    /// Elements are emitted in the order C, H, N, O, S, P, F followed by any
    /// other element alphabetically; a count of one is left implicit.
    /// Two-letter halogens (`Cl`, `Br`), aromatic lowercase atoms and bracket
    /// atoms (including their explicit `H` count) are recognised. Implicit
    /// hydrogens are not inferred, so `"CCO"` yields `"C2O"`.
    ///
    /// # Errors
    ///
    /// Propagates [`SmilesGenerator::validate`] errors and returns
    /// `"Could not determine molecular formula"` when no atom is found.
    pub fn to_molecular_formula(&self, smiles: &str) -> Result<String, String> {
        self.validate(smiles)?;

        let mut counts: std::collections::BTreeMap<String, usize> =
            std::collections::BTreeMap::new();
        Self::visit_elements(smiles, |symbol, n| {
            *counts.entry(symbol.to_string()).or_insert(0) += n;
        });

        let mut formula = String::new();
        for symbol in &["C", "H", "N", "O", "S", "P", "F"] {
            if let Some(count) = counts.remove(*symbol) {
                Self::push_term(&mut formula, symbol, count);
            }
        }
        // Whatever is left is emitted in the map's (alphabetical) key order.
        for (symbol, count) in &counts {
            Self::push_term(&mut formula, symbol, *count);
        }

        if formula.is_empty() {
            Err("Could not determine molecular formula".to_string())
        } else {
            Ok(formula)
        }
    }

    /// Appends `symbol` and, when greater than one, its count.
    fn push_term(formula: &mut String, symbol: &str, count: usize) {
        formula.push_str(symbol);
        if count > 1 {
            formula.push_str(&count.to_string());
        }
    }

    /// Sums rounded atomic weights of the atoms written in `smiles`.
    ///
    /// Atoms are recognised the same way as in
    /// [`SmilesGenerator::to_molecular_formula`]. Implicit hydrogens are not
    /// inferred, and elements without a table entry contribute nothing, so
    /// the result is a lower bound rather than a true molecular weight.
    ///
    /// # Errors
    ///
    /// Propagates [`SmilesGenerator::validate`] errors.
    pub fn molecular_weight(&self, smiles: &str) -> Result<f32, String> {
        self.validate(smiles)?;

        let mut total_weight = 0.0_f32;
        Self::visit_elements(smiles, |symbol, n| {
            if let Some(weight) = Self::atomic_weight(symbol) {
                total_weight += weight * n as f32;
            }
        });
        Ok(total_weight)
    }

    /// Rounded standard atomic weight for the elements this module knows.
    fn atomic_weight(symbol: &str) -> Option<f32> {
        let weight = match symbol {
            "H" => 1.008,
            "B" => 10.81,
            "C" => 12.01,
            "N" => 14.01,
            "O" => 16.00,
            "F" => 19.00,
            "Na" => 22.99,
            "P" => 30.97,
            "S" => 32.07,
            "Cl" => 35.45,
            "Br" => 79.90,
            "I" => 126.90,
            _ => return None,
        };
        Some(weight)
    }

    /// Walks `smiles` left to right and calls `visit(symbol, count)` for every
    /// element occurrence, in the order written.
    fn visit_elements(smiles: &str, mut visit: impl FnMut(&str, usize)) {
        let mut rest = smiles;
        while let Some(first) = rest.chars().next() {
            if first == '[' {
                let inner = &rest[1..];
                // An unterminated bracket swallows the remainder of the input.
                let (body, tail) = match inner.find(']') {
                    Some(end) => (&inner[..end], &inner[end + 1..]),
                    None => (inner, ""),
                };
                Self::visit_bracket_atom(body, &mut visit);
                rest = tail;
                continue;
            }

            // Outside brackets only Cl and Br are two-letter symbols; in
            // something like "Cc" the lowercase letter is its own aromatic atom.
            let width = if rest.starts_with("Cl") || rest.starts_with("Br") {
                2
            } else {
                first.len_utf8()
            };
            if first.is_ascii_uppercase() {
                visit(&rest[..width], 1);
            } else if let Some(symbol) = Self::aromatic_symbol(&rest[..width]) {
                visit(symbol, 1);
            }
            rest = &rest[width..];
        }
    }

    /// Reports the element of one bracket atom (text between `[` and `]`)
    /// plus its explicit hydrogen count, if any.
    ///
    /// Extended chirality classes such as `@TH1` are not interpreted.
    fn visit_bracket_atom(body: &str, visit: &mut impl FnMut(&str, usize)) {
        // Leading digits are an isotope label, not part of the symbol.
        let body = body.trim_start_matches(|c: char| c.is_ascii_digit());
        let mut letters = body.chars();
        let first = match letters.next() {
            Some(c) => c,
            None => return,
        };
        let second_is_lower = letters.next().map_or(false, |c| c.is_ascii_lowercase());

        let after_symbol = if first.is_ascii_uppercase() {
            let width = if second_is_lower { 2 } else { 1 };
            visit(&body[..width], 1);
            &body[width..]
        } else {
            let width = if body.starts_with("se") || body.starts_with("as") {
                2
            } else {
                first.len_utf8()
            };
            match Self::aromatic_symbol(&body[..width]) {
                Some(symbol) => visit(symbol, 1),
                // Wildcards and anything unrecognised carry no element.
                None => return,
            }
            &body[width..]
        };

        let after_chirality = after_symbol.trim_start_matches('@');
        if after_chirality.starts_with('H') {
            // `H` alone means one hydrogen; it may be followed by one digit.
            let hydrogens = after_chirality[1..]
                .chars()
                .next()
                .and_then(|c| c.to_digit(10))
                .map_or(1, |d| d as usize);
            if hydrogens > 0 {
                visit("H", hydrogens);
            }
        }
    }

    /// Maps an aromatic (lowercase) atom token to its element symbol.
    fn aromatic_symbol(token: &str) -> Option<&'static str> {
        match token {
            "b" => Some("B"),
            "c" => Some("C"),
            "n" => Some("N"),
            "o" => Some("O"),
            "p" => Some("P"),
            "s" => Some("S"),
            "se" => Some("Se"),
            "as" => Some("As"),
            _ => None,
        }
    }
}

impl Default for SmilesGenerator {
    /// Same as [`SmilesGenerator::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Stateless helper that extracts simple facts from SMILES text: element
/// tallies and ring-closure labels.
#[derive(Debug, Clone, Copy)]
pub struct SmilesParser;

impl SmilesParser {
    /// Creates a parser; it carries no state.
    pub fn new() -> Self {
        Self
    }

    /// Counts the atoms written in `smiles`, keyed by element symbol.
    ///
    /// Outside brackets only `Cl` and `Br` are read as two-letter symbols, so
    /// an aliphatic atom followed by an aromatic one (`"Cc1ccccc1"`) is counted
    /// as two separate atoms. Aromatic lowercase atoms are counted under their
    /// element symbol. Inside `[...]` the element symbol and its explicit
    /// hydrogen count are read; isotope, chirality and charge are skipped.
    /// Implicit hydrogens are not inferred.
    pub fn count_atoms(&self, smiles: &str) -> std::collections::HashMap<String, usize> {
        let mut counts = std::collections::HashMap::new();
        let mut rest = smiles;

        while let Some(first) = rest.chars().next() {
            if first == '[' {
                let inner = &rest[1..];
                // An unterminated bracket swallows the remainder of the input.
                let (body, tail) = match inner.find(']') {
                    Some(end) => (&inner[..end], &inner[end + 1..]),
                    None => (inner, ""),
                };
                Self::tally_bracket_atom(body, &mut counts);
                rest = tail;
                continue;
            }

            let width = if rest.starts_with("Cl") || rest.starts_with("Br") {
                2
            } else {
                first.len_utf8()
            };
            if first.is_ascii_uppercase() {
                Self::bump(&mut counts, &rest[..width], 1);
            } else if let Some(symbol) = Self::aromatic_symbol(&rest[..width]) {
                Self::bump(&mut counts, symbol, 1);
            }
            rest = &rest[width..];
        }

        counts
    }

    /// Adds `by` to the tally kept for `symbol`.
    fn bump(counts: &mut std::collections::HashMap<String, usize>, symbol: &str, by: usize) {
        *counts.entry(symbol.to_string()).or_insert(0) += by;
    }

    /// Tallies one bracket atom (text between `[` and `]`): its element and
    /// its explicit hydrogen count, if any.
    ///
    /// Extended chirality classes such as `@TH1` are not interpreted.
    fn tally_bracket_atom(body: &str, counts: &mut std::collections::HashMap<String, usize>) {
        // Leading digits are an isotope label, not part of the symbol.
        let body = body.trim_start_matches(|c: char| c.is_ascii_digit());
        let mut letters = body.chars();
        let first = match letters.next() {
            Some(c) => c,
            None => return,
        };
        let second_is_lower = letters.next().map_or(false, |c| c.is_ascii_lowercase());

        let after_symbol = if first.is_ascii_uppercase() {
            let width = if second_is_lower { 2 } else { 1 };
            Self::bump(counts, &body[..width], 1);
            &body[width..]
        } else {
            let width = if body.starts_with("se") || body.starts_with("as") {
                2
            } else {
                first.len_utf8()
            };
            match Self::aromatic_symbol(&body[..width]) {
                Some(symbol) => Self::bump(counts, symbol, 1),
                // Wildcards and anything unrecognised carry no element.
                None => return,
            }
            &body[width..]
        };

        let after_chirality = after_symbol.trim_start_matches('@');
        if after_chirality.starts_with('H') {
            // `H` alone means one hydrogen; it may be followed by one digit.
            let hydrogens = after_chirality[1..]
                .chars()
                .next()
                .and_then(|c| c.to_digit(10))
                .map_or(1, |d| d as usize);
            if hydrogens > 0 {
                Self::bump(counts, "H", hydrogens);
            }
        }
    }

    /// Maps an aromatic (lowercase) atom token to its element symbol.
    fn aromatic_symbol(token: &str) -> Option<&'static str> {
        match token {
            "b" => Some("B"),
            "c" => Some("C"),
            "n" => Some("N"),
            "o" => Some("O"),
            "p" => Some("P"),
            "s" => Some("S"),
            "se" => Some("Se"),
            "as" => Some("As"),
            _ => None,
        }
    }

    /// Lists ring-closure labels in the order they appear.
    ///
    /// Each label normally shows up twice (once where the ring opens, once
    /// where it closes). Digits inside `[...]` are isotope labels, hydrogen
    /// counts or charges and are skipped. A `%` introduces a two-digit label,
    /// reported as a single number.
    pub fn find_rings(&self, smiles: &str) -> Vec<usize> {
        let mut rings = Vec::new();
        let mut in_bracket = false;
        let mut chars = smiles.chars().peekable();

        while let Some(c) = chars.next() {
            match c {
                '[' => in_bracket = true,
                ']' => in_bracket = false,
                _ if in_bracket => {}
                '%' => {
                    // Consume up to two digits; a lone digit after `%` is
                    // still reported rather than dropped.
                    let mut label: Option<usize> = None;
                    for _ in 0..2 {
                        match chars.peek().and_then(|d| d.to_digit(10)) {
                            Some(d) => {
                                label = Some(label.unwrap_or(0) * 10 + d as usize);
                                chars.next();
                            }
                            None => break,
                        }
                    }
                    rings.extend(label);
                }
                _ => {
                    if let Some(digit) = c.to_digit(10) {
                        rings.push(digit as usize);
                    }
                }
            }
        }

        rings
    }
}

impl Default for SmilesParser {
    /// Same as [`SmilesParser::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Locals are named `generator` rather than `gen`, which is a reserved
    // keyword in the Rust 2024 edition.

    /// The character heuristic accepts plain SMILES and rejects free text.
    #[test]
    fn test_is_smiles() {
        let generator = SmilesGenerator::new();
        for accepted in &["CCO", "c1ccccc1", "CC(=O)O"] {
            assert!(generator.is_smiles(accepted));
        }
        assert!(!generator.is_smiles("not smiles!"));
    }

    /// Known formulas and names map to their SMILES form.
    #[test]
    fn test_simple_formula_conversion() {
        let generator = SmilesGenerator::new();
        let expectations = [
            ("H2O", "O"),
            ("CO2", "O=C=O"),
            ("CH4", "C"),
            ("benzene", "c1ccccc1"),
        ];
        for (input, smiles) in expectations.iter() {
            assert_eq!(
                generator.simple_formula_to_smiles(input),
                Some(smiles.to_string())
            );
        }
    }

    /// Balanced delimiters validate; a missing or extra parenthesis does not.
    #[test]
    fn test_validate_smiles() {
        let generator = SmilesGenerator::new();
        for balanced in &["CCO", "CC(O)C", "c1ccccc1"] {
            assert!(generator.validate(balanced).is_ok());
        }
        for unbalanced in &["CC(O", "CC)O"] {
            assert!(generator.validate(unbalanced).is_err());
        }
    }

    /// The formula for ethanol mentions both carbon and oxygen.
    #[test]
    fn test_molecular_formula() {
        let generator = SmilesGenerator::new();
        let formula = generator.to_molecular_formula("CCO").unwrap();
        assert!(formula.contains('C'));
        assert!(formula.contains('O'));
    }

    /// Weights are positive and grow with the number of heavy atoms.
    #[test]
    fn test_molecular_weight() {
        let generator = SmilesGenerator::new();
        let water = generator.molecular_weight("O").unwrap();
        assert!(water > 0.0);
        let ethanol = generator.molecular_weight("CCO").unwrap();
        assert!(ethanol > 30.0);
    }

    /// Atom tallies for ethanol and acetic acid.
    #[test]
    fn test_count_atoms() {
        let parser = SmilesParser::new();

        let ethanol = parser.count_atoms("CCO");
        assert_eq!(ethanol.get("C"), Some(&2));
        assert_eq!(ethanol.get("O"), Some(&1));

        let acetic_acid = parser.count_atoms("CC(=O)O");
        assert_eq!(acetic_acid.get("C"), Some(&2));
        assert_eq!(acetic_acid.get("O"), Some(&2));
    }

    /// Each ring label is reported once for opening and once for closing.
    #[test]
    fn test_find_rings() {
        let parser = SmilesParser::new();
        for cyclic in &["c1ccccc1", "C1CC1"] {
            assert_eq!(parser.find_rings(cyclic), vec![1, 1]);
        }
    }
}