use std::collections::HashMap;
use crate::{AtomId, BondOrder, Molecule, MoleculeBuilder};
/// Errors reported while turning a one-letter sequence string into a molecule.
///
/// The `#[error(...)]` attributes supply the `Display` text for each variant.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum SequenceError {
/// A character had no entry in the fragment map being used.
///
/// `pos` is the zero-based index of the character among the `char`s of the
/// input (skipped separator characters are counted too).
#[error("unknown residue character '{ch}' at position {pos}")]
UnknownResidue { ch: char, pos: usize },
/// The input was empty, or contained nothing but skipped separator characters.
#[error("sequence string is empty")]
EmptySequence,
/// A character that is not permitted in a sequence string.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// variant; confirm whether it is used elsewhere.
#[error("invalid character '{ch}' at position {pos}")]
InvalidCharacter { ch: char, pos: usize },
/// Converting the assembled SMILES string into a molecule failed; the payload
/// is a human-readable reason.
#[error("SMILES construction failed: {0}")]
SmilesParseError(String),
/// Building the molecule failed; the payload is a human-readable reason.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// variant; confirm whether it is used elsewhere.
#[error("molecule construction failed: {0}")]
BuilderError(String),
}
/// Builds the lookup table from one-letter amino-acid codes to SMILES fragments.
///
/// The table has one entry for each of the 20 upper-case codes listed below.
/// Every fragment starts at the backbone nitrogen and ends with `C(=O)`, so
/// fragments can be concatenated directly and closed with a final `O`
/// (see `build_peptide_smiles`).
///
/// A fresh map is allocated on every call.
fn build_protein_fragment_map() -> HashMap<char, &'static str> {
    const RESIDUE_FRAGMENTS: [(char, &'static str); 20] = [
        ('A', "N[C@@H](C)C(=O)"),
        ('C', "N[C@@H](CS)C(=O)"),
        ('D', "N[C@@H](CC(=O)O)C(=O)"),
        ('E', "N[C@@H](CCC(=O)O)C(=O)"),
        ('F', "N[C@@H](Cc1ccccc1)C(=O)"),
        ('G', "NCC(=O)"),
        ('H', "N[C@@H](Cc1c[nH]cn1)C(=O)"),
        ('I', "N[C@@H]([C@@H](C)CC)C(=O)"),
        ('K', "N[C@@H](CCCCN)C(=O)"),
        ('L', "N[C@@H](CC(C)C)C(=O)"),
        ('M', "N[C@@H](CCSC)C(=O)"),
        ('N', "N[C@@H](CC(=O)N)C(=O)"),
        ('P', "N1[C@@H](CCC1)C(=O)"),
        ('Q', "N[C@@H](CCC(=O)N)C(=O)"),
        ('R', "N[C@@H](CCCNC(=N)N)C(=O)"),
        ('S', "N[C@@H](CO)C(=O)"),
        ('T', "N[C@@H]([C@@H](C)O)C(=O)"),
        ('V', "N[C@@H](C(C)C)C(=O)"),
        ('W', "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)"),
        ('Y', "N[C@@H](Cc1ccc(cc1)O)C(=O)"),
    ];

    RESIDUE_FRAGMENTS.iter().copied().collect()
}
/// Builds the lookup table from DNA base letters (`A`, `C`, `G`, `T`) to SMILES fragments.
///
/// A fresh map is allocated on every call.
///
/// NOTE(review): each fragment ends in `C(=O)`, exactly like the peptide
/// fragments, and `build_peptide_smiles` appends a trailing `O` to the joined
/// string. Confirm that these strings encode the intended nucleotide units.
fn build_dna_fragment_map() -> HashMap<char, &'static str> {
    const BASE_FRAGMENTS: [(char, &'static str); 4] = [
        (
            'A',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc23)[C@@H](O[C@@H]1O)C(=O)",
        ),
        (
            'C',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2ccc(N)nc2=O)[C@@H](O[C@@H]1O)C(=O)",
        ),
        (
            'G',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(=O)[nH]c(N)nc23)[C@@H](O[C@@H]1O)C(=O)",
        ),
        (
            'T',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2c(=O)[nH]c(=O)c(C)c2)[C@@H](O[C@@H]1O)C(=O)",
        ),
    ];

    BASE_FRAGMENTS.iter().copied().collect()
}
/// Builds the lookup table from RNA base letters (`A`, `C`, `G`, `U`) to SMILES fragments.
///
/// A fresh map is allocated on every call.
///
/// NOTE(review): in the visible part of this file only the unit tests call
/// this function; `mol_from_sequence_with_type` never selects it. Confirm
/// whether an RNA entry point exists elsewhere.
fn build_rna_fragment_map() -> HashMap<char, &'static str> {
    const BASE_FRAGMENTS: [(char, &'static str); 4] = [
        (
            'A',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc23)[C@H](O)[C@@H](O[C@@H]1O)C(=O)",
        ),
        (
            'C',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2ccc(N)nc2=O)[C@H](O)[C@@H](O[C@@H]1O)C(=O)",
        ),
        (
            'G',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(=O)[nH]c(N)nc23)[C@H](O)[C@@H](O[C@@H]1O)C(=O)",
        ),
        (
            'U',
            "OP(=O)([O-])OC[C@H]1O[C@@H](n2c(=O)[nH]c(=O)cc2)[C@H](O)[C@@H](O[C@@H]1O)C(=O)",
        ),
    ];

    BASE_FRAGMENTS.iter().copied().collect()
}
/// Concatenates the SMILES fragments for every residue in `sequence`.
///
/// Each character is looked up in `fragment_map` and its fragment is appended
/// in order; a single `O` is appended after the last fragment. Newline,
/// carriage return, `-`, space, tab and `.` are treated as separators and
/// skipped. Lookup is case-sensitive.
///
/// # Errors
///
/// * [`SequenceError::EmptySequence`] if `sequence` is empty or consists only
///   of skipped separator characters.
/// * [`SequenceError::UnknownResidue`] for the first character with no entry
///   in `fragment_map`. Its `pos` is the zero-based `char` index within
///   `sequence`, counting skipped separators as well.
fn build_peptide_smiles(
    sequence: &str,
    fragment_map: &HashMap<char, &'static str>,
) -> Result<String, SequenceError> {
    if sequence.is_empty() {
        return Err(SequenceError::EmptySequence);
    }

    let mut assembled = String::new();
    for (pos, ch) in sequence.chars().enumerate() {
        // Separators contribute no fragment but still advance `pos`.
        if matches!(ch, '\n' | '\r' | '-' | ' ' | '\t' | '.') {
            continue;
        }
        let fragment = fragment_map
            .get(&ch)
            .ok_or(SequenceError::UnknownResidue { ch, pos })?;
        assembled.push_str(fragment);
    }

    // Nothing but separators: report it the same way as an empty input.
    if assembled.is_empty() {
        return Err(SequenceError::EmptySequence);
    }

    // Every fragment ends in `C(=O)`; the final `O` terminates the chain.
    assembled.push('O');
    Ok(assembled)
}
/// Builds a molecule from a one-letter protein sequence.
///
/// Shorthand for `mol_from_sequence_with_type(sequence, true)`; see that
/// function for the accepted input and the errors returned.
pub fn mol_from_sequence(sequence: &str) -> Result<Molecule, SequenceError> {
mol_from_sequence_with_type(sequence, true)
}
/// Builds a molecule from a one-letter sequence string.
///
/// When `is_protein` is `true` the characters are looked up in the protein
/// fragment map; otherwise the DNA fragment map is used. The fragments are
/// joined by `build_peptide_smiles` and the resulting SMILES string is handed
/// to `parse_smiles_to_molecule`.
///
/// # Errors
///
/// * [`SequenceError::EmptySequence`] if `sequence` is empty or contains only
///   skipped separator characters.
/// * [`SequenceError::UnknownResidue`] if a character is missing from the
///   selected fragment map.
/// * Whatever error `parse_smiles_to_molecule` returns for the assembled
///   SMILES string.
pub fn mol_from_sequence_with_type(
    sequence: &str,
    is_protein: bool,
) -> Result<Molecule, SequenceError> {
    if sequence.is_empty() {
        return Err(SequenceError::EmptySequence);
    }

    let fragment_map = if is_protein {
        build_protein_fragment_map()
    } else {
        build_dna_fragment_map()
    };

    build_peptide_smiles(sequence, &fragment_map)
        .and_then(|smiles| parse_smiles_to_molecule(&smiles))
}
/// Converts a SMILES string into a `Molecule`.
///
/// Currently forwards straight to `build_from_smiles_fallback`, so the result
/// is whatever that function returns.
fn parse_smiles_to_molecule(smiles: &str) -> Result<Molecule, SequenceError> {
build_from_smiles_fallback(smiles)
}
/// Placeholder used until a SMILES parser is available.
///
/// The input is ignored and the call always fails with
/// [`SequenceError::SmilesParseError`] carrying a fixed explanatory message.
fn build_from_smiles_fallback(_smiles: &str) -> Result<Molecule, SequenceError> {
    let reason = String::from(
        "SMILES parser is not yet implemented; use the future `crate::smiles::parse_smiles`",
    );
    Err(SequenceError::SmilesParseError(reason))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// SMILES fragment the protein map is expected to hold for alanine.
    const ALA: &str = "N[C@@H](C)C(=O)";

    /// Expected output for `count` consecutive alanines: the joined fragments
    /// followed by the single terminating `O`.
    fn poly_ala(count: usize) -> String {
        let mut expected = ALA.repeat(count);
        expected.push('O');
        expected
    }

    #[test]
    fn test_protein_fragment_map_has_20_standard() {
        let map = build_protein_fragment_map();
        for ch in "ACDEFGHIKLMNPQRSTVWY".chars() {
            assert!(map.contains_key(&ch), "missing fragment for {}", ch);
        }
        assert_eq!(map.len(), 20);
    }

    #[test]
    fn test_dna_fragment_map_has_4_bases() {
        let map = build_dna_fragment_map();
        for ch in "ACGT".chars() {
            assert!(map.contains_key(&ch), "missing DNA fragment for {}", ch);
        }
        assert_eq!(map.len(), 4);
    }

    #[test]
    fn test_rna_fragment_map_has_4_bases() {
        let map = build_rna_fragment_map();
        for ch in "ACGU".chars() {
            assert!(map.contains_key(&ch), "missing RNA fragment for {}", ch);
        }
        assert_eq!(map.len(), 4);
    }

    #[test]
    fn test_empty_sequence_returns_error() {
        assert_eq!(mol_from_sequence(""), Err(SequenceError::EmptySequence));
    }

    #[test]
    fn test_only_separators_returns_empty_sequence_error() {
        // Input that is non-empty but yields no fragments must be reported as empty.
        assert_eq!(
            build_peptide_smiles(" -\n\r\t.", &build_protein_fragment_map()),
            Err(SequenceError::EmptySequence)
        );
    }

    #[test]
    fn test_unknown_residue_returns_error() {
        assert!(matches!(
            mol_from_sequence("ABZ"),
            Err(SequenceError::UnknownResidue { ch: 'B', pos: 1 })
        ));
    }

    #[test]
    fn test_unknown_residue_position_counts_skipped_characters() {
        // `pos` is the index in the raw input, so the skipped `-` is counted.
        assert_eq!(
            build_peptide_smiles("A-Z", &build_protein_fragment_map()),
            Err(SequenceError::UnknownResidue { ch: 'Z', pos: 2 })
        );
    }

    #[test]
    fn test_dna_mode_rejects_letters_outside_the_dna_map() {
        assert!(matches!(
            mol_from_sequence_with_type("U", false),
            Err(SequenceError::UnknownResidue { ch: 'U', pos: 0 })
        ));
    }

    #[test]
    fn test_whitespace_and_dashes_are_skipped() {
        let smiles = build_peptide_smiles("A A-A\nA\rA", &build_protein_fragment_map())
            .expect("separator characters must be skipped, not rejected");
        // Compare the whole string so stray output between fragments is caught,
        // not just the first fragment and the terminator.
        assert_eq!(smiles, poly_ala(5));
    }

    #[test]
    fn test_tabs_and_dots_are_skipped() {
        let smiles = build_peptide_smiles("A\tA.A", &build_protein_fragment_map())
            .expect("tab and '.' must be skipped, not rejected");
        assert_eq!(smiles, poly_ala(3));
    }

    #[test]
    fn test_build_peptide_smiles_known_sequence() {
        let smiles = build_peptide_smiles("AAA", &build_protein_fragment_map())
            .expect("'AAA' contains only known residues");
        let expected = concat!("N[C@@H](C)C(=O)", "N[C@@H](C)C(=O)", "N[C@@H](C)C(=O)", "O");
        assert_eq!(smiles, expected);
    }
}