use std::collections::BTreeMap;
use crate::{AtomQueryPredicate, BondOrder, BondQueryPredicate, QueryNode, SmartsParseError};
/// Parsed form of a SMARTS pattern, stored as flat lists of queries.
///
/// The parser in this file appends to the three vectors in the order the
/// corresponding tokens are encountered. No explicit atom-to-bond
/// connectivity is stored in this structure.
#[derive(Debug, Clone)]
pub struct SmartsMolecule {
/// One query per parsed atom, in input order.
pub atom_queries: Vec<QueryNode<AtomQueryPredicate>>,
/// Bond queries collected while parsing chains, branches and `.` separators.
pub bond_queries: Vec<QueryNode<BondQueryPredicate>>,
/// `(ring-closure number, index of the most recently parsed atom)` pairs,
/// one entry per ring-closure token seen.
pub ring_closures: Vec<(u8, usize)>,
}
impl SmartsMolecule {
#[must_use]
pub fn new(
atom_queries: Vec<QueryNode<AtomQueryPredicate>>,
bond_queries: Vec<QueryNode<BondQueryPredicate>>,
ring_closures: Vec<(u8, usize)>,
) -> Self {
Self {
atom_queries,
bond_queries,
ring_closures,
}
}
#[must_use]
pub fn num_atoms(&self) -> usize {
self.atom_queries.len()
}
#[must_use]
pub fn atom_query(&self, idx: usize) -> Option<&QueryNode<AtomQueryPredicate>> {
self.atom_queries.get(idx)
}
#[must_use]
pub fn bond_query(&self, idx: usize) -> Option<&QueryNode<BondQueryPredicate>> {
self.bond_queries.get(idx)
}
}
/// Options accepted by [`parse_smarts_with_params`].
///
/// NOTE(review): within this file only `replacements` is read by the parser;
/// the boolean flags are stored but not consulted here — confirm whether
/// other modules use them.
#[derive(Debug, Clone)]
pub struct SmartsParseParams {
    /// Defaults to `true`.
    pub allow_cxsmiles: bool,
    /// Defaults to `true`.
    pub strict_cxsmiles: bool,
    /// Defaults to `true`.
    pub parse_name: bool,
    /// Defaults to `false`.
    pub merge_hs: bool,
    /// Defaults to `false`.
    pub skip_cleanup: bool,
    /// Defaults to `false`.
    pub debug_parse: bool,
    /// Textual substitutions applied to the trimmed input before parsing
    /// (key is replaced by value). Empty by default.
    pub replacements: BTreeMap<String, String>,
}

impl Default for SmartsParseParams {
    /// Returns the default option set: the three CX/name flags enabled,
    /// every other flag disabled, and no replacements.
    fn default() -> Self {
        const ENABLED: bool = true;
        const DISABLED: bool = false;
        SmartsParseParams {
            allow_cxsmiles: ENABLED,
            strict_cxsmiles: ENABLED,
            parse_name: ENABLED,
            merge_hs: DISABLED,
            skip_cleanup: DISABLED,
            debug_parse: DISABLED,
            replacements: BTreeMap::new(),
        }
    }
}
/// Parses `smarts` using [`SmartsParseParams::default`].
///
/// # Errors
///
/// Returns whatever [`parse_smarts_with_params`] reports for the input.
pub fn parse_smarts(smarts: &str) -> Result<SmartsMolecule, SmartsParseError> {
    let defaults = SmartsParseParams::default();
    parse_smarts_with_params(smarts, &defaults)
}
/// Parses a SMARTS string with explicit options.
///
/// The input goes through four stages: textual preprocessing (trim and
/// replacements), labelling of recursive `$(...)` patterns, tokenizing, and
/// finally recursive-descent parsing of the token stream.
///
/// # Errors
///
/// Propagates any [`SmartsParseError`] raised by the tokenizer or parser.
pub fn parse_smarts_with_params(
    smarts: &str,
    _params: &SmartsParseParams,
) -> Result<SmartsMolecule, SmartsParseError> {
    let substituted = preprocess_smarts(smarts, _params);
    let labelled = label_recursive_patterns(&substituted);
    let tokens = tokenize(&labelled)?;
    SmartsParser::new(&tokens, &labelled).parse_smarts_molecule()
}
fn preprocess_smarts(smarts: &str, params: &SmartsParseParams) -> String {
let trimmed = smarts.trim();
if params.replacements.is_empty() {
return trimmed.to_string();
}
let mut result = trimmed.to_string();
loop {
let mut replaced = false;
for (key, val) in ¶ms.replacements {
if result.contains(key.as_str()) {
result = result.replace(key.as_str(), val.as_str());
replaced = true;
}
}
if !replaced {
break;
}
}
result
}
/// Appends a `_<label>` suffix to every recursive `$(...)` group in `sma`
/// that does not already carry one.
///
/// Identical group texts receive the same label; labels are decimal numbers
/// starting at 100, assigned in order of first completion. Groups already
/// followed by `_` are left alone. If a `)` is found with nothing open, the
/// input is returned unchanged so that the real parser can report the error.
fn label_recursive_patterns(sma: &str) -> String {
    /// What an open parenthesis belongs to.
    #[derive(Clone, Copy, PartialEq)]
    enum Scope {
        /// An ordinary branch `(`.
        Branch,
        /// A recursive group; carries the index of its `$`.
        Recursive(usize),
    }

    let symbols: Vec<char> = sma.chars().collect();
    let mut open: Vec<Scope> = Vec::new();
    let mut labels: BTreeMap<String, String> = BTreeMap::new();
    let mut out = String::new();

    let mut idx = 0usize;
    while let Some(&current) = symbols.get(idx) {
        out.push(current);
        match current {
            '$' if symbols.get(idx + 1) == Some(&'(') => {
                open.push(Scope::Recursive(idx));
                // Copy the '(' now so it is not mistaken for a branch opener.
                idx += 1;
                out.push('(');
            }
            '(' => open.push(Scope::Branch),
            ')' => {
                let closed = match open.pop() {
                    Some(scope) => scope,
                    // Unbalanced input: hand the original text to the parser.
                    None => return sma.to_string(),
                };
                if let Scope::Recursive(dollar) = closed {
                    let already_labelled = symbols.get(idx + 1) == Some(&'_');
                    if !already_labelled {
                        let text: String = symbols[dollar..=idx].iter().collect();
                        let fresh = labels.len() + 100;
                        let label = labels
                            .entry(text)
                            .or_insert_with(|| fresh.to_string())
                            .clone();
                        out.push('_');
                        out.push_str(&label);
                    }
                }
            }
            _ => {}
        }
        idx += 1;
    }
    out
}
/// Lexical tokens produced by `tokenize` and consumed by `SmartsParser`.
#[derive(Debug, Clone, PartialEq)]
enum Token {
/// A bare (unbracketed) atom symbol starting with an upper-case letter or
/// one of a few lower-case letters, or the wildcard `*`.
OrganicElement(String),
/// A bare lower-case aromatic symbol (`c`, `n`, `o`, `s`, `p`, `a`, `b`).
AromaticElement(String),
/// The text between a matching `[` and `]`, brackets excluded.
BracketContent(String),
/// A bond character: `-`, `=`, `#`, `:`, `~`, `@`, `/` or `\`.
BondSpec(char),
/// `(`
OpenParen,
/// `)`
CloseParen,
/// Not produced by `tokenize` in this file (brackets become `BracketContent`).
OpenBracket,
/// Not produced by `tokenize` in this file; the chain parser stops on it.
CloseBracket,
/// A single-digit ring-closure number.
RingClosureDigit(u8),
/// A two-digit ring-closure number written as `%nn`.
RingClosurePercent(u8),
/// `&` or `;` outside brackets.
And,
/// `,` outside brackets.
Or,
/// `!` outside brackets.
Not,
/// `$` outside brackets.
Dollar,
/// `.`
Dot,
/// Sentinel appended after the last real token.
EndOfStream,
}
/// Splits a SMARTS string into tokens paired with their character offset.
///
/// Whitespace is skipped. A bracket atom `[...]` becomes a single
/// [`Token::BracketContent`] holding the text between the brackets (nested
/// brackets, as found inside recursive `$(...)` groups, are balanced). The
/// returned vector always ends with [`Token::EndOfStream`] positioned at the
/// character length of the input.
///
/// Outside brackets an upper-case letter is merged with the following
/// lower-case letter only for `Cl`, `Br`, `Si`, `As`, `Se` and `Te`, the
/// two-letter symbols the bare-atom conversion understands. Merging any
/// element symbol greedily would swallow an adjacent aromatic atom: `Cn`,
/// `Co`, `Sc` or `Sn` would turn into one unknown atom instead of an
/// aliphatic atom followed by an aromatic one. Other elements must be
/// written in brackets.
///
/// # Errors
///
/// * [`SmartsParseError::UnclosedBracket`] when a `[` has no matching `]`.
/// * [`SmartsParseError::UnexpectedCharacter`] when `%` is not followed by
///   two digits, or when a character cannot start any token.
fn tokenize(input: &str) -> Result<Vec<(Token, usize)>, SmartsParseError> {
    let chars: Vec<char> = input.chars().collect();
    let len = chars.len();
    let mut tokens = Vec::with_capacity(len + 1);
    let mut i = 0;

    while i < len {
        let ch = chars[i];

        // Tokens that are exactly one character long.
        let single = match ch {
            ' ' | '\t' | '\n' | '\r' => {
                i += 1;
                continue;
            }
            '-' | '=' | '#' | ':' | '~' | '@' | '/' | '\\' => Some(Token::BondSpec(ch)),
            '(' => Some(Token::OpenParen),
            ')' => Some(Token::CloseParen),
            '&' | ';' => Some(Token::And),
            ',' => Some(Token::Or),
            '!' => Some(Token::Not),
            '$' => Some(Token::Dollar),
            '.' => Some(Token::Dot),
            '*' => Some(Token::OrganicElement("*".to_string())),
            'c' | 'n' | 'o' | 's' | 'p' | 'a' | 'b' => {
                Some(Token::AromaticElement(ch.to_string()))
            }
            _ => None,
        };
        if let Some(token) = single {
            tokens.push((token, i));
            i += 1;
            continue;
        }

        match ch {
            '[' => {
                let start = i;
                i += 1;
                let content_start = i;
                let mut depth = 1u32;
                while i < len && depth > 0 {
                    match chars[i] {
                        '[' => depth += 1,
                        ']' => depth -= 1,
                        _ => {}
                    }
                    i += 1;
                }
                if depth > 0 {
                    return Err(SmartsParseError::UnclosedBracket(start));
                }
                // `i` is one past the closing ']'.
                let content: String = chars[content_start..i - 1].iter().collect();
                tokens.push((Token::BracketContent(content), start));
            }
            '%' => {
                let tens = chars.get(i + 1).and_then(|c| c.to_digit(10));
                let ones = chars.get(i + 2).and_then(|c| c.to_digit(10));
                match (tens, ones) {
                    (Some(tens), Some(ones)) => {
                        // At most 99, so the narrowing cast cannot truncate.
                        let num = (tens * 10 + ones) as u8;
                        tokens.push((Token::RingClosurePercent(num), i));
                        i += 3;
                    }
                    _ => {
                        return Err(SmartsParseError::UnexpectedCharacter {
                            position: i,
                            character: ch,
                            context: "expected two digits after %".to_string(),
                        });
                    }
                }
            }
            d if d.is_ascii_digit() => {
                let num = d as u8 - b'0';
                tokens.push((Token::RingClosureDigit(num), i));
                i += 1;
            }
            'B' | 'C' | 'N' | 'O' | 'S' | 'P' | 'F' | 'I' | 'H' | 'R' | 'X' | 'D' | 'v' | 'V'
            | 'r' | 'u' | 'A' | 'T' | 'Z' | 'K' | 'W' | 'U' | 'Y' | 'G' | 'L' | 'J' | 'E' | 'M'
            | 'Q' => {
                let start = i;
                i += 1;
                // Only merge the two-letter symbols that are valid bare atoms;
                // anything else is left for the next iteration to tokenize.
                if i < len
                    && matches!(
                        (ch, chars[i]),
                        ('C', 'l') | ('B', 'r') | ('S', 'i') | ('A', 's') | ('S', 'e') | ('T', 'e')
                    )
                {
                    i += 1;
                }
                let name: String = chars[start..i].iter().collect();
                tokens.push((Token::OrganicElement(name), start));
            }
            _ => {
                return Err(SmartsParseError::UnexpectedCharacter {
                    position: i,
                    character: ch,
                    context: "unexpected character in SMARTS string".to_string(),
                });
            }
        }
    }

    tokens.push((Token::EndOfStream, len));
    Ok(tokens)
}
/// Recursive-descent parser over the token stream produced by `tokenize`.
struct SmartsParser<'a> {
/// Tokens paired with their character offset in the input.
tokens: &'a [(Token, usize)],
/// The text the tokens were produced from. Stored by `new`; not read by
/// any method visible in this file.
input: &'a str,
/// Index of the current token in `tokens`.
pos: usize,
/// Per ring-closure number: `(atom index, bond query, token offset)` recorded
/// the first time that number is seen. Entries are never removed here.
ring_closure_targets: BTreeMap<u8, (usize, QueryNode<BondQueryPredicate>, usize)>,
}
impl<'a> SmartsParser<'a> {
fn new(tokens: &'a [(Token, usize)], input: &'a str) -> Self {
Self {
tokens,
input,
pos: 0,
ring_closure_targets: BTreeMap::new(),
}
}
/// Returns the current token and its offset without consuming it.
///
/// Panics if the cursor has moved past the end of the token slice.
fn peek(&self) -> &(Token, usize) {
    let current = &self.tokens[self.pos];
    current
}
/// Moves the cursor to the next token.
fn advance(&mut self) {
    self.pos = self.pos + 1;
}
/// Character offset of the current token, used for error reporting.
fn pos_info(&self) -> usize {
    let (_, offset) = &self.tokens[self.pos];
    *offset
}
/// Parses the whole token stream into a [`SmartsMolecule`].
///
/// The pattern must start with an atom; everything after it is handled by
/// `parse_smarts_chain`.
///
/// # Errors
///
/// Propagates any error from parsing the first atom or the following chain.
fn parse_smarts_molecule(&mut self) -> Result<SmartsMolecule, SmartsParseError> {
    let leading_atom = self.parse_atom()?;

    let mut atoms: Vec<QueryNode<AtomQueryPredicate>> = vec![leading_atom];
    let mut bonds: Vec<QueryNode<BondQueryPredicate>> = Vec::new();
    let mut closures: Vec<(u8, usize)> = Vec::new();
    self.parse_smarts_chain(&mut atoms, &mut bonds, &mut closures)?;

    Ok(SmartsMolecule::new(atoms, bonds, closures))
}
/// Parses bonds, atoms, ring closures, branches and `.` separators until the
/// end of the stream or a closing parenthesis/bracket token is reached.
///
/// Results are appended to the three output vectors. The terminating token
/// is left unconsumed so that the caller (a branch or the top level) can
/// inspect it.
///
/// NOTE(review): after a non-empty branch an additional unspecified bond
/// query is inserted at the position where the branch's bonds begin, even
/// though the branch's first atom already contributed a bond. This looks
/// like it yields one more bond than atoms added — confirm against the
/// consumers of `bond_queries` before changing it.
///
/// # Errors
///
/// Propagates atom/bond parse errors and reports a missing `)` after a
/// branch as [`SmartsParseError::UnexpectedCharacter`].
fn parse_smarts_chain(
    &mut self,
    atom_queries: &mut Vec<QueryNode<AtomQueryPredicate>>,
    bond_queries: &mut Vec<QueryNode<BondQueryPredicate>>,
    ring_closures: &mut Vec<(u8, usize)>,
) -> Result<(), SmartsParseError> {
    loop {
        match self.peek() {
            // Terminators are left for the caller to consume.
            (Token::EndOfStream, _) | (Token::CloseParen, _) | (Token::CloseBracket, _) => {
                return Ok(());
            }
            // Explicit bond expression followed by an atom.
            (Token::BondSpec(_), _) | (Token::Not, _) | (Token::And, _) => {
                let bond = self.parse_bond()?;
                bond_queries.push(bond);
                let atom = self.parse_atom()?;
                atom_queries.push(atom);
            }
            // Ring closure attached to the most recently parsed atom.
            (Token::RingClosureDigit(n), pos) | (Token::RingClosurePercent(n), pos) => {
                let (ring_number, token_offset) = (*n, *pos);
                let atom_count = atom_queries.len();
                self.advance();
                let owner = atom_count - 1;
                ring_closures.push((ring_number, owner));
                // Only the first sighting of a ring number is remembered.
                self.ring_closure_targets.entry(ring_number).or_insert((
                    owner,
                    unspecified_smarts_bond_query(),
                    token_offset,
                ));
            }
            // Branch: parse the nested chain, then require ')'.
            (Token::OpenParen, _) => {
                self.advance();
                let bonds_before = bond_queries.len();
                let atoms_before = atom_queries.len();
                self.parse_smarts_chain(atom_queries, bond_queries, ring_closures)?;

                let (tok, pos) = self.peek();
                if *tok != Token::CloseParen {
                    return Err(SmartsParseError::UnexpectedCharacter {
                        position: *pos,
                        character: format!("{:?}", tok).chars().next().unwrap_or('?'),
                        context: "expected close parenthesis".to_string(),
                    });
                }
                self.advance();

                if atom_queries.len() > atoms_before {
                    bond_queries.insert(bonds_before, unspecified_smarts_bond_query());
                }
            }
            // Disconnected component: recorded with an "any" bond.
            (Token::Dot, _) => {
                self.advance();
                bond_queries.push(QueryNode::Predicate(BondQueryPredicate::Any));
                let atom = self.parse_atom()?;
                atom_queries.push(atom);
            }
            // Anything else must be an atom joined by an implicit bond.
            _ => {
                bond_queries.push(unspecified_smarts_bond_query());
                let atom = self.parse_atom()?;
                atom_queries.push(atom);
            }
        }
    }
}
/// Parses one atom expression at the cursor: a bare symbol, an aromatic
/// symbol, or a bracket atom.
///
/// The token is consumed only when it is an atom token.
///
/// # Errors
///
/// * [`SmartsParseError::UnexpectedEnd`] at end of stream.
/// * [`SmartsParseError::UnexpectedCharacter`] for any non-atom token.
/// * Whatever bracket-content parsing reports.
fn parse_atom(&mut self) -> Result<QueryNode<AtomQueryPredicate>, SmartsParseError> {
    // Copy the slice reference out of `self` so the token can stay borrowed
    // while `self` is mutated below.
    let stream = self.tokens;
    let (token, offset) = &stream[self.pos];

    match token {
        Token::BracketContent(content) => {
            self.advance();
            self.parse_bracket_atom_content(content)
        }
        Token::OrganicElement(symbol) => {
            let query = organic_element_to_query(symbol);
            self.advance();
            Ok(query)
        }
        Token::AromaticElement(symbol) => {
            let query = aromatic_element_to_query(symbol);
            self.advance();
            Ok(query)
        }
        Token::EndOfStream => Err(SmartsParseError::UnexpectedEnd(
            "expected atom but reached end".to_string(),
        )),
        _ => Err(SmartsParseError::UnexpectedCharacter {
            position: *offset,
            character: '?',
            context: "expected atom expression".to_string(),
        }),
    }
}
/// Parses the text inside `[...]` into a single atom query.
///
/// A plain hydrogen atom (`[H]`, `[2H]`, `[H+]`, ...) is recognised first.
/// Otherwise the content is read as primitives combined by operators, from
/// tightest to loosest binding: `!` (negates the next primitive), adjacency
/// or `&` (and), `,` (or), `;` (and). Empty content yields the "any atom"
/// query.
///
/// # Errors
///
/// Propagates errors from primitive parsing.
fn parse_bracket_atom_content(
    &mut self,
    content: &str,
) -> Result<QueryNode<AtomQueryPredicate>, SmartsParseError> {
    type AtomQuery = QueryNode<AtomQueryPredicate>;

    /// Empties `nodes` and folds them: nothing, the single node itself, or
    /// `combine` applied to all of them.
    fn fold(nodes: &mut Vec<AtomQuery>, combine: fn(Vec<AtomQuery>) -> AtomQuery) -> Option<AtomQuery> {
        let mut taken = std::mem::take(nodes);
        match taken.len() {
            0 => None,
            1 => taken.pop(),
            _ => Some(combine(taken)),
        }
    }
    /// Closes the current and-term and adds it to the or-alternatives.
    fn close_term(and_terms: &mut Vec<AtomQuery>, or_terms: &mut Vec<AtomQuery>) {
        if let Some(term) = fold(and_terms, QueryNode::And) {
            or_terms.push(term);
        }
    }
    /// Closes the current `;`-separated clause.
    fn close_clause(
        and_terms: &mut Vec<AtomQuery>,
        or_terms: &mut Vec<AtomQuery>,
        clauses: &mut Vec<AtomQuery>,
    ) {
        close_term(and_terms, or_terms);
        if let Some(clause) = fold(or_terms, QueryNode::Or) {
            clauses.push(clause);
        }
    }

    let chars: Vec<char> = content.chars().collect();
    let len = chars.len();
    if let Some(hydrogen) = self.try_parse_hydrogen_atom(&chars, len)? {
        return Ok(hydrogen);
    }

    let mut and_terms: Vec<AtomQuery> = Vec::new();
    let mut or_terms: Vec<AtomQuery> = Vec::new();
    let mut clauses: Vec<AtomQuery> = Vec::new();
    let mut invert_next = false;

    let mut cursor = 0;
    while cursor < len {
        match chars[cursor] {
            '!' => {
                invert_next = !invert_next;
                cursor += 1;
            }
            // Explicit high-precedence "and" is the same as adjacency.
            '&' => cursor += 1,
            ',' => {
                close_term(&mut and_terms, &mut or_terms);
                cursor += 1;
            }
            ';' => {
                close_clause(&mut and_terms, &mut or_terms, &mut clauses);
                cursor += 1;
            }
            _ => {
                let (primitive, next) = self.parse_atom_primitive(&chars, cursor, len)?;
                let primitive = if invert_next {
                    invert_next = false;
                    QueryNode::not(primitive)
                } else {
                    primitive
                };
                and_terms.push(primitive);
                cursor = next;
            }
        }
    }
    close_clause(&mut and_terms, &mut or_terms, &mut clauses);

    Ok(fold(&mut clauses, QueryNode::And)
        .unwrap_or(QueryNode::Predicate(AtomQueryPredicate::Any)))
}
/// Recognises bracket content that denotes a hydrogen *atom* rather than a
/// hydrogen-count primitive.
///
/// Accepted shape: optional isotope digits, `H`, optional charge, optional
/// `:<map number>`, and nothing else. In particular `H` followed by a digit
/// is rejected so that it is read as a hydrogen count by the general parser.
///
/// Returns `Ok(None)` when the content does not have this shape.
///
/// # Errors
///
/// Propagates number/primitive parse errors (for example `:` without digits).
fn try_parse_hydrogen_atom(
    &self,
    chars: &[char],
    len: usize,
) -> Result<Option<QueryNode<AtomQueryPredicate>>, SmartsParseError> {
    if len == 0 {
        return Ok(None);
    }
    let mut cursor = 0usize;

    let isotope = if chars[cursor].is_ascii_digit() {
        let (value, next) = self.parse_number(chars, cursor, len)?;
        cursor = next;
        Some(value as u16)
    } else {
        None
    };

    let at_hydrogen = cursor < len && chars[cursor] == 'H';
    if !at_hydrogen {
        return Ok(None);
    }
    cursor += 1;
    if cursor < len && chars[cursor].is_ascii_digit() {
        // "H<n>" is a hydrogen count, not a hydrogen atom.
        return Ok(None);
    }

    let formal_charge = if cursor < len && matches!(chars[cursor], '+' | '-') {
        match self.parse_atom_primitive(chars, cursor, len)? {
            (QueryNode::Predicate(AtomQueryPredicate::FormalCharge(charge)), next) => {
                cursor = next;
                Some(charge)
            }
            _ => return Ok(None),
        }
    } else {
        None
    };

    let atom_map = if cursor < len && chars[cursor] == ':' {
        let (value, next) = self.parse_number(chars, cursor + 1, len)?;
        cursor = next;
        Some(value)
    } else {
        None
    };

    if cursor != len {
        return Ok(None);
    }

    let mut parts = vec![QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(1))];
    parts.extend(isotope.map(|v| QueryNode::Predicate(AtomQueryPredicate::Isotope(v))));
    parts.extend(formal_charge.map(|v| QueryNode::Predicate(AtomQueryPredicate::FormalCharge(v))));
    parts.extend(atom_map.map(|v| QueryNode::Predicate(AtomQueryPredicate::AtomMapNumber(v))));

    let query = if parts.len() == 1 {
        parts.remove(0)
    } else {
        QueryNode::And(parts)
    };
    Ok(Some(query))
}
fn parse_atom_primitive(
&self,
chars: &[char],
i: usize,
len: usize,
) -> Result<(QueryNode<AtomQueryPredicate>, usize), SmartsParseError> {
if i >= len {
return Err(SmartsParseError::UnexpectedEnd(
"expected atom primitive".to_string(),
));
}
let ch = chars[i];
if ch == '#' {
let (num, consumed) = self.parse_number(chars, i + 1, len)?;
return Ok((
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(num as u8)),
consumed,
));
}
if ch == '$' {
if chars.get(i + 1) != Some(&'(') {
return Err(SmartsParseError::InvalidAtomPrimitive {
position: i,
detail: "expected '(' after '$'".to_string(),
});
}
let mut depth = 1usize;
let mut end = i + 2;
while end < len && depth > 0 {
match chars[end] {
'(' => depth += 1,
')' => depth -= 1,
_ => {}
}
end += 1;
}
if depth != 0 {
return Err(SmartsParseError::UnclosedParenthesis(i + 1));
}
let recursive_smarts: String = chars[i..end].iter().collect();
let mut consumed = end;
if chars.get(consumed) == Some(&'_') {
consumed += 1;
while consumed < len && chars[consumed].is_ascii_digit() {
consumed += 1;
}
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::RecursiveSmarts(recursive_smarts)),
consumed,
));
}
if ch == '+' {
let start = i + 1;
if start < len && chars[start] == '+' {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::FormalCharge(2)),
start + 1,
));
}
let (num, consumed) = self.parse_optional_number(chars, start, len);
let charge = if consumed == start { 1 } else { num as i8 };
return Ok((
QueryNode::Predicate(AtomQueryPredicate::FormalCharge(charge)),
consumed,
));
}
if ch == '-' {
let start = i + 1;
if start < len && chars[start] == '-' {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::FormalCharge(-2)),
start + 1,
));
}
let (num, consumed) = self.parse_optional_number(chars, start, len);
let charge = if consumed == start { -1 } else { -(num as i8) };
return Ok((
QueryNode::Predicate(AtomQueryPredicate::FormalCharge(charge)),
consumed,
));
}
if ch == '@' {
let start = i + 1;
if start < len && chars[start] == '@' {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::ChiralTagMatch(
crate::ChiralTag::TetrahedralCw,
)),
start + 1,
));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::ChiralTagMatch(
crate::ChiralTag::TetrahedralCcw,
)),
start,
));
}
if ch == 'h' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if consumed == i + 1 {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::HasImplicitHydrogen),
consumed,
));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::ImplicitHydrogenCount(num as u8)),
consumed,
));
}
if ch == 'H' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if consumed == i + 1 {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::HydrogenCount(1)),
consumed,
));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::HydrogenCount(num as u8)),
consumed,
));
}
if ch == 'R' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if num == 0 {
return Ok((QueryNode::Predicate(AtomQueryPredicate::InRing), consumed));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::NumRingBonds(num as u8)),
consumed,
));
}
if ch == 'r' {
if chars.get(i + 1) == Some(&'{') {
let (predicate, consumed) = self.parse_ring_size_range(chars, i + 2, len)?;
return Ok((predicate, consumed));
}
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if num == 0 {
return Ok((QueryNode::Predicate(AtomQueryPredicate::InRing), consumed));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::SmallestRingSize(num as u8)),
consumed,
));
}
if ch == 'X' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if num == 0 {
return Ok((QueryNode::Predicate(AtomQueryPredicate::Any), consumed));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::Connectivity(num as u8)),
consumed,
));
}
if ch == 'x' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if num == 0 {
return Ok((QueryNode::Predicate(AtomQueryPredicate::Any), consumed));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::NumRingBonds(num as u8)),
consumed,
));
}
if ch == 'D' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if num == 0 {
return Ok((QueryNode::Predicate(AtomQueryPredicate::Any), consumed));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::Degree(num as u8)),
consumed,
));
}
if ch == '^' {
let (num, consumed) = self.parse_number(chars, i + 1, len)?;
let hybridization = match num {
0 => crate::Hybridization::S,
1 => crate::Hybridization::Sp,
2 => crate::Hybridization::Sp2,
3 => crate::Hybridization::Sp3,
4 => crate::Hybridization::Sp3d,
5 => crate::Hybridization::Sp3d2,
_ => crate::Hybridization::Other,
};
return Ok((
QueryNode::Predicate(AtomQueryPredicate::HybridizationMatch(hybridization)),
consumed,
));
}
if ch == 'v' {
let (num, consumed) = self.parse_optional_number(chars, i + 1, len);
if num == 0 {
return Ok((QueryNode::Predicate(AtomQueryPredicate::Any), consumed));
}
return Ok((
QueryNode::Predicate(AtomQueryPredicate::TotalDegree(num as u8)),
consumed,
));
}
if ch == ':' {
let (num, consumed) = self.parse_number(chars, i + 1, len)?;
return Ok((
QueryNode::Predicate(AtomQueryPredicate::AtomMapNumber(num)),
consumed,
));
}
if ch == 'a' {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
i + 1,
));
}
if ch == 'A' {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(false)),
i + 1,
));
}
if ch == 'u' {
return Ok((
QueryNode::Predicate(AtomQueryPredicate::IsUnsaturated),
i + 1,
));
}
if ch.is_ascii_uppercase() {
let start = i;
let end = i + 1;
if end < len && chars[end].is_ascii_lowercase() {
let two_char: String = chars[start..=end].iter().collect();
if let Some(atomic_num) = element_symbol_to_atomic_number(&two_char) {
let query = match two_char.as_str() {
"B" | "C" | "N" | "O" | "P" | "S" | "F" | "Cl" | "Br" | "I" | "Si"
| "As" | "Se" | "Te" | "H" => organic_element_to_query(&two_char),
_ => QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(atomic_num)),
};
return Ok((query, end + 1));
}
}
let one_char: String = chars[start..end].iter().collect();
if let Some(atomic_num) = element_symbol_to_atomic_number(&one_char) {
let query = match one_char.as_str() {
"B" | "C" | "N" | "O" | "P" | "S" | "F" | "H" => {
organic_element_to_query(&one_char)
}
_ => QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(atomic_num)),
};
return Ok((query, end));
}
return Ok((QueryNode::Predicate(AtomQueryPredicate::Any), end));
}
if ch.is_ascii_digit() {
let (num, consumed) = self.parse_number(chars, i, len)?;
return Ok((
QueryNode::Predicate(AtomQueryPredicate::Isotope(num as u16)),
consumed,
));
}
if ch.is_ascii_lowercase() && ch != 'a' && ch != 'u' && ch != 'v' && ch != 'r' && ch != 'h'
{
let name = ch.to_string();
let query = aromatic_element_to_query(&name);
return Ok((query, i + 1));
}
if ch == '*' {
return Ok((QueryNode::Predicate(AtomQueryPredicate::Any), i + 1));
}
Err(SmartsParseError::InvalidAtomPrimitive {
position: i,
detail: format!("unexpected character '{}'", ch),
})
}
/// Parses the body of an `r{lo-hi}` ring-size range; `start` is the index
/// just after the `{`.
///
/// Either bound may be omitted (`r{5-}`, `r{-7}`), but not both. Returns the
/// resulting query and the index just past the closing `}`. Errors are
/// reported at `start - 2` (saturating), i.e. at the `r` when called from
/// the primitive parser.
///
/// # Errors
///
/// [`SmartsParseError::InvalidAtomPrimitive`] when the `-` or `}` is missing
/// or both bounds are absent.
fn parse_ring_size_range(
    &self,
    chars: &[char],
    start: usize,
    len: usize,
) -> Result<(QueryNode<AtomQueryPredicate>, usize), SmartsParseError> {
    let error_position = start.saturating_sub(2);
    let invalid = |detail: &str| SmartsParseError::InvalidAtomPrimitive {
        position: error_position,
        detail: detail.to_string(),
    };
    // Reads an optional bound at `at`, returning it with the next index.
    let read_bound = |at: usize| -> Result<(Option<u8>, usize), SmartsParseError> {
        if at < len && chars[at].is_ascii_digit() {
            let (value, after) = self.parse_number(chars, at, len)?;
            Ok((Some(value as u8), after))
        } else {
            Ok((None, at))
        }
    };

    let (lower, after_lower) = read_bound(start)?;
    if chars.get(after_lower) != Some(&'-') {
        return Err(invalid("expected '-' in ring-size range"));
    }

    let (upper, after_upper) = read_bound(after_lower + 1)?;
    if chars.get(after_upper) != Some(&'}') {
        return Err(invalid("expected '}' to close ring-size range"));
    }
    let end = after_upper + 1;

    let at_least = |min: u8| {
        QueryNode::Predicate(AtomQueryPredicate::SmallestRingSizeGreaterEqual(min))
    };
    let at_most = |max: u8| {
        QueryNode::Predicate(AtomQueryPredicate::SmallestRingSizeLessEqual(max))
    };
    let predicate = match (lower, upper) {
        (None, None) => return Err(invalid("empty ring-size range")),
        (Some(min), None) => at_least(min),
        (None, Some(max)) => at_most(max),
        (Some(min), Some(max)) => QueryNode::And(vec![at_least(min), at_most(max)]),
    };
    Ok((predicate, end))
}
/// Parses a mandatory run of ASCII digits starting at `chars[i]`.
///
/// Returns the value and the index just past the last digit.
///
/// # Errors
///
/// * [`SmartsParseError::UnexpectedEnd`] when there is no digit at `i`.
/// * [`SmartsParseError::InvalidAtomPrimitive`] when the digits do not fit
///   in a `u32` (previously this overflowed).
fn parse_number(
    &self,
    chars: &[char],
    i: usize,
    len: usize,
) -> Result<(u32, usize), SmartsParseError> {
    let mut val = 0u32;
    let mut pos = i;
    while pos < len {
        // `to_digit(10)` is `Some` exactly for the ASCII digits.
        let digit = match chars[pos].to_digit(10) {
            Some(digit) => digit,
            None => break,
        };
        val = val
            .checked_mul(10)
            .and_then(|scaled| scaled.checked_add(digit))
            .ok_or_else(|| SmartsParseError::InvalidAtomPrimitive {
                position: i,
                detail: "number is too large".to_string(),
            })?;
        pos += 1;
    }
    if pos == i {
        return Err(SmartsParseError::UnexpectedEnd(
            "expected number".to_string(),
        ));
    }
    Ok((val, pos))
}
/// Parses an optional run of ASCII digits starting at `chars[i]`.
///
/// Returns `(0, i)` when there is no digit at `i`; callers tell "no number"
/// apart from an explicit `0` by comparing the returned index with `i`.
/// Values too large for a `u32` saturate at `u32::MAX` instead of
/// overflowing, because this function has no error channel.
fn parse_optional_number(&self, chars: &[char], i: usize, len: usize) -> (u32, usize) {
    let mut val = 0u32;
    let mut pos = i;
    while pos < len {
        // `to_digit(10)` is `Some` exactly for the ASCII digits.
        let digit = match chars[pos].to_digit(10) {
            Some(digit) => digit,
            None => break,
        };
        val = val.saturating_mul(10).saturating_add(digit);
        pos += 1;
    }
    (val, pos)
}
/// Parses a bond expression: a run of bond symbols and the operators `!`,
/// `&`/`;` and `,`.
///
/// `!` negates the next bond symbol. "Any bond" predicates are dropped from
/// the collected list. The remaining predicates are combined with OR if a
/// `,` was seen anywhere in the run and there is more than one of them,
/// otherwise with AND; a single predicate is returned as is. If tokens were
/// consumed but no predicate remains, the result is "any bond".
///
/// # Errors
///
/// [`SmartsParseError::UnexpectedCharacter`] when the current token does not
/// start a bond expression.
fn parse_bond(&mut self) -> Result<QueryNode<BondQueryPredicate>, SmartsParseError> {
    let mut invert_next = false;
    let mut saw_or = false;
    let mut consumed_any = false;
    let mut terms: Vec<QueryNode<BondQueryPredicate>> = Vec::new();

    loop {
        // Operators update local state; only bond symbols yield a character.
        let spec = match &self.peek().0 {
            Token::BondSpec(ch) => Some(*ch),
            Token::Not => {
                invert_next = !invert_next;
                None
            }
            Token::And => None,
            Token::Or => {
                saw_or = true;
                None
            }
            _ => break,
        };
        consumed_any = true;
        self.advance();

        if let Some(ch) = spec {
            let query = bond_spec_to_query(ch);
            let query = if invert_next {
                invert_next = false;
                QueryNode::not(query)
            } else {
                query
            };
            terms.push(query);
        }
    }

    let any_bond = QueryNode::Predicate(BondQueryPredicate::Any);
    terms.retain(|query| *query != any_bond);

    if terms.is_empty() {
        return if consumed_any {
            Ok(any_bond)
        } else {
            Err(SmartsParseError::UnexpectedCharacter {
                position: self.pos_info(),
                character: '?',
                context: "expected bond specifier".to_string(),
            })
        };
    }
    if saw_or && terms.len() > 1 {
        return Ok(QueryNode::Or(terms));
    }
    if terms.len() == 1 {
        return Ok(terms.remove(0));
    }
    Ok(QueryNode::And(terms))
}
}
/// Converts a bare (unbracketed) atom symbol into an atom query.
///
/// Known element symbols become an aliphatic [`AtomQueryPredicate::AtomType`]
/// with the matching atomic number; `A` means "not aromatic"; `*` and every
/// unrecognised name match any atom.
fn organic_element_to_query(name: &str) -> QueryNode<AtomQueryPredicate> {
    let atomic_number: u8 = match name {
        "A" => return QueryNode::Predicate(AtomQueryPredicate::IsAromatic(false)),
        "H" => 1,
        "B" => 5,
        "C" => 6,
        "N" => 7,
        "O" => 8,
        "F" => 9,
        "Si" => 14,
        "P" => 15,
        "S" => 16,
        "Cl" => 17,
        "As" => 33,
        "Se" => 34,
        "Br" => 35,
        "Te" => 52,
        "I" => 53,
        // "*" and anything unknown: wildcard.
        _ => return QueryNode::Predicate(AtomQueryPredicate::Any),
    };
    QueryNode::Predicate(AtomQueryPredicate::AtomType {
        atomic_number,
        aromatic: false,
    })
}
fn aromatic_element_to_query(name: &str) -> QueryNode<AtomQueryPredicate> {
match name {
"c" => QueryNode::And(vec![
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(6)),
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
]),
"n" => QueryNode::And(vec![
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(7)),
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
]),
"o" => QueryNode::And(vec![
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(8)),
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
]),
"s" => QueryNode::And(vec![
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(16)),
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
]),
"p" => QueryNode::And(vec![
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(15)),
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
]),
"b" => QueryNode::And(vec![
QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(5)),
QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
]),
"a" => QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true)),
_ => QueryNode::Predicate(AtomQueryPredicate::Any),
}
}
/// Converts a single bond character into a bond query.
///
/// `-`, `=`, `#` select a bond order, `:` an aromatic bond and `@` a ring
/// bond. `/` and `\` are treated like an unspecified bond. `~` and any other
/// character match any bond.
fn bond_spec_to_query(ch: char) -> QueryNode<BondQueryPredicate> {
    let predicate = match ch {
        '/' | '\\' => return unspecified_smarts_bond_query(),
        '-' => BondQueryPredicate::Order(BondOrder::Single),
        '=' => BondQueryPredicate::Order(BondOrder::Double),
        '#' => BondQueryPredicate::Order(BondOrder::Triple),
        ':' => BondQueryPredicate::IsAromatic(true),
        '@' => BondQueryPredicate::IsInRing(true),
        // '~' and everything unrecognised.
        _ => BondQueryPredicate::Any,
    };
    QueryNode::Predicate(predicate)
}
/// Query used when no bond symbol is written between two atoms: matches a
/// single bond or an aromatic bond.
fn unspecified_smarts_bond_query() -> QueryNode<BondQueryPredicate> {
    let alternatives = vec![
        BondQueryPredicate::Order(BondOrder::Single),
        BondQueryPredicate::IsAromatic(true),
    ];
    QueryNode::Or(alternatives.into_iter().map(QueryNode::Predicate).collect())
}
/// Looks up the atomic number (1 to 118) for a case-sensitive element symbol.
///
/// Returns `None` for anything that is not an exact element symbol.
fn element_symbol_to_atomic_number(symbol: &str) -> Option<u8> {
    // Symbols in periodic-table order: index + 1 is the atomic number.
    const SYMBOLS: &[&str] = &[
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", // 1-10
        "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", // 11-20
        "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", // 21-30
        "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", // 31-40
        "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", // 41-50
        "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", // 51-60
        "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", // 61-70
        "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", // 71-80
        "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", // 81-90
        "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", // 91-100
        "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", // 101-110
        "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og", // 111-118
    ];
    SYMBOLS
        .iter()
        .position(|&known| known == symbol)
        .map(|index| (index + 1) as u8)
}
/// Unit tests for the tokenizer, the helper conversions and the parser
/// entry points.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_simple_smarts() {
        let tokens = tokenize("CC").unwrap();
        // Two atoms plus the end-of-stream sentinel.
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].0, Token::OrganicElement("C".to_string()));
        assert_eq!(tokens[1].0, Token::OrganicElement("C".to_string()));
        assert_eq!(tokens[2].0, Token::EndOfStream);
    }

    #[test]
    fn test_tokenize_bracket_atom() {
        let tokens = tokenize("[N+]").unwrap();
        // One bracket token plus the end-of-stream sentinel.
        assert_eq!(tokens.len(), 2);
        match &tokens[0].0 {
            Token::BracketContent(content) => {
                assert_eq!(content, "N+");
            }
            _ => panic!("expected bracket content"),
        }
    }

    #[test]
    fn test_tokenize_ring_closure() {
        let tokens = tokenize("C1CC1").unwrap();
        // Three atoms, two ring-closure digits, end-of-stream.
        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[0].0, Token::OrganicElement("C".to_string()));
        assert_eq!(tokens[1].0, Token::RingClosureDigit(1));
        assert_eq!(tokens[4].0, Token::RingClosureDigit(1));
    }

    #[test]
    fn test_tokenize_percent_ring_closure() {
        let tokens = tokenize("C%10CC%10").unwrap();
        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens[1].0, Token::RingClosurePercent(10));
        assert_eq!(tokens[4].0, Token::RingClosurePercent(10));
    }

    #[test]
    fn test_tokenize_bond_specs() {
        let tokens = tokenize("C=O").unwrap();
        assert_eq!(tokens[1].0, Token::BondSpec('='));
        let tokens = tokenize("C#N").unwrap();
        assert_eq!(tokens[1].0, Token::BondSpec('#'));
        let tokens = tokenize("C~C").unwrap();
        assert_eq!(tokens[1].0, Token::BondSpec('~'));
    }

    #[test]
    fn test_tokenize_unclosed_bracket() {
        let result = tokenize("[NH");
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            SmartsParseError::UnclosedBracket(_)
        ));
    }

    #[test]
    fn test_organic_element_to_query() {
        assert_eq!(
            organic_element_to_query("C"),
            QueryNode::Predicate(AtomQueryPredicate::AtomType {
                atomic_number: 6,
                aromatic: false,
            })
        );
        assert_eq!(
            organic_element_to_query("*"),
            QueryNode::Predicate(AtomQueryPredicate::Any)
        );
    }

    #[test]
    fn test_bracket_atom_element() {
        let mol = parse_smarts("[C]").unwrap();
        assert_eq!(mol.atom_queries.len(), 1);
        assert_eq!(mol.bond_queries.len(), 0);
    }

    #[test]
    fn test_parse_simple_smarts() {
        let mol = parse_smarts("CC").unwrap();
        assert_eq!(mol.atom_queries.len(), 2);
        assert_eq!(mol.bond_queries.len(), 1);
        assert_eq!(
            mol.atom_queries[0],
            QueryNode::Predicate(AtomQueryPredicate::AtomType {
                atomic_number: 6,
                aromatic: false,
            })
        );
        assert_eq!(
            mol.bond_queries[0],
            QueryNode::Or(vec![
                QueryNode::Predicate(BondQueryPredicate::Order(BondOrder::Single)),
                QueryNode::Predicate(BondQueryPredicate::IsAromatic(true)),
            ])
        );
    }

    #[test]
    fn test_parse_bonded_smarts() {
        let mol = parse_smarts("C=O").unwrap();
        assert_eq!(mol.atom_queries.len(), 2);
        assert_eq!(mol.bond_queries.len(), 1);
        assert_eq!(
            mol.bond_queries[0],
            QueryNode::Predicate(BondQueryPredicate::Order(BondOrder::Double))
        );
    }

    #[test]
    fn test_bracket_with_charge() {
        let mol = parse_smarts("[N+]").unwrap();
        assert_eq!(mol.atom_queries.len(), 1);
        assert_eq!(
            mol.atom_queries[0],
            QueryNode::And(vec![
                QueryNode::Predicate(AtomQueryPredicate::AtomType {
                    atomic_number: 7,
                    aromatic: false,
                }),
                QueryNode::Predicate(AtomQueryPredicate::FormalCharge(1)),
            ])
        );
    }

    #[test]
    fn test_bracket_with_negative_charge_defaults_to_minus_one() {
        let mol = parse_smarts("[O-]").unwrap();
        assert_eq!(mol.atom_queries.len(), 1);
        assert_eq!(
            mol.atom_queries[0],
            QueryNode::And(vec![
                QueryNode::Predicate(AtomQueryPredicate::AtomType {
                    atomic_number: 8,
                    aromatic: false,
                }),
                QueryNode::Predicate(AtomQueryPredicate::FormalCharge(-1)),
            ])
        );
    }

    #[test]
    fn test_bracket_with_chirality() {
        let mol = parse_smarts("[C@@H]").unwrap();
        assert_eq!(mol.atom_queries.len(), 1);
    }

    #[test]
    fn test_parse_ring_closure() {
        let mol = parse_smarts("C1CC1").unwrap();
        assert_eq!(mol.atom_queries.len(), 3);
        assert_eq!(mol.ring_closures.len(), 2);
    }

    #[test]
    fn test_parse_branch() {
        let mol = parse_smarts("C(C)C").unwrap();
        assert_eq!(mol.atom_queries.len(), 3);
    }

    #[test]
    fn test_bracket_atomic_number_primitive() {
        let mol = parse_smarts("[#6]").unwrap();
        assert_eq!(mol.atom_queries.len(), 1);
    }

    #[test]
    fn test_label_recursive_patterns_noop() {
        assert_eq!(label_recursive_patterns("CCO"), "CCO");
    }

    #[test]
    fn test_label_recursive_patterns_simple() {
        // The first recursive group receives label 100, appended right after
        // its closing parenthesis. The previous assertion also accepted the
        // unlabelled input, so it could not detect a broken labeller.
        assert_eq!(label_recursive_patterns("[$([N])]"), "[$([N])_100]");
    }

    #[test]
    fn test_element_symbol_lookup() {
        assert_eq!(element_symbol_to_atomic_number("C"), Some(6));
        assert_eq!(element_symbol_to_atomic_number("O"), Some(8));
        assert_eq!(element_symbol_to_atomic_number("Cl"), Some(17));
        assert_eq!(element_symbol_to_atomic_number("Br"), Some(35));
        assert_eq!(element_symbol_to_atomic_number("Xx"), None);
    }

    #[test]
    fn test_parse_aromatic_smarts() {
        let mol = parse_smarts("c1ccccc1").unwrap();
        assert_eq!(mol.atom_queries.len(), 6);
        for aq in &mol.atom_queries {
            match aq {
                QueryNode::And(children) => {
                    assert_eq!(children.len(), 2);
                    assert_eq!(
                        children[0],
                        QueryNode::Predicate(AtomQueryPredicate::AtomicNumber(6))
                    );
                    assert_eq!(
                        children[1],
                        QueryNode::Predicate(AtomQueryPredicate::IsAromatic(true))
                    );
                }
                _ => panic!("expected And node for aromatic atom"),
            }
        }
    }

    #[test]
    fn test_smarts_molecule_num_atoms() {
        let mol = parse_smarts("CCO").unwrap();
        assert_eq!(mol.num_atoms(), 3);
        assert!(mol.atom_query(0).is_some());
        assert!(mol.bond_query(0).is_some());
        assert!(mol.bond_query(1).is_some());
    }

    #[test]
    fn test_smarts_parse_params_default() {
        let params = SmartsParseParams::default();
        assert!(params.allow_cxsmiles);
        assert!(!params.merge_hs);
    }

    #[test]
    fn test_varied_bonds() {
        let mol = parse_smarts("C#N").unwrap();
        assert_eq!(
            mol.bond_queries[0],
            QueryNode::Predicate(BondQueryPredicate::Order(BondOrder::Triple))
        );
        let mol = parse_smarts("C:N").unwrap();
        assert_eq!(
            mol.bond_queries[0],
            QueryNode::Predicate(BondQueryPredicate::IsAromatic(true))
        );
    }

    #[test]
    fn test_empty_smarts_molecule() {
        let result = parse_smarts("");
        assert!(result.is_err());
    }
}