1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
//! MACCS 166-bit structural keys fingerprint.
//!
//! Each of the 166 MACCS keys corresponds to a SMARTS query.
//! A bit is set if the query matches at least once in the molecule.
//! Key 1 (bit 0) is unused by convention; keys 2–166 run from element
//! presence (mostly heavy and rare elements) through ring membership,
//! bonded-atom patterns, and common functional groups.
use chematic_core::Molecule;
use chematic_smarts::{find_matches, parse_smarts};
use crate::bitvec::BitVec2048;
/// SMARTS patterns backing the 166 structural keys.
///
/// Entry `i` holds the query for key `i + 1`; bit `i` of the fingerprint is
/// set when that query matches the molecule at least once. An empty string
/// marks a key that has no query and is therefore never set.
///
/// Layout of the table:
/// - key 1: unused placeholder
/// - keys 2–76: element presence, by atomic number
/// - keys 77–82: ring heteroatoms and S / P / Si presence
/// - keys 83–166: bonded-atom patterns, functional groups, ring and
///   aromaticity features
///
/// A double-bonded carbonyl oxygen is terminal: it has no further neighbours
/// and cannot be a ring member. Patterns therefore start or end a chain at
/// that oxygen (or place it in a branch) and never constrain it with `R`.
static MACCS_SMARTS: &[&str] = &[
    "",                       // key 1 - unused
    "[#103]",                 // key 2 - Lr
    "[#102]",                 // key 3 - No
    "[#101]",                 // key 4 - Md
    "[#100]",                 // key 5 - Fm
    "[#99]",                  // key 6 - Es
    "[#98]",                  // key 7 - Cf
    "[#97]",                  // key 8 - Bk
    "[#96]",                  // key 9 - Cm
    "[#95]",                  // key 10 - Am
    "[#94]",                  // key 11 - Pu
    "[#93]",                  // key 12 - Np
    "[#92]",                  // key 13 - U
    "[#90,#91]",              // key 14 - Th/Pa
    "[#89]",                  // key 15 - Ac
    "[#88]",                  // key 16 - Ra
    "[#87]",                  // key 17 - Fr
    "[#85]",                  // key 18 - At
    "[#84]",                  // key 19 - Po
    "[#83]",                  // key 20 - Bi
    "[#82]",                  // key 21 - Pb
    "[#81]",                  // key 22 - Tl
    "[#80]",                  // key 23 - Hg
    "[#79]",                  // key 24 - Au
    "[#77,#78]",              // key 25 - Ir/Pt
    "[#76]",                  // key 26 - Os
    "[#75]",                  // key 27 - Re
    "[#74]",                  // key 28 - W
    "[#73]",                  // key 29 - Ta
    "[#72]",                  // key 30 - Hf
    "[#71]",                  // key 31 - Lu
    "[#70]",                  // key 32 - Yb
    "[#69]",                  // key 33 - Tm
    "[#68]",                  // key 34 - Er
    "[#67]",                  // key 35 - Ho
    "[#66]",                  // key 36 - Dy
    "[#65]",                  // key 37 - Tb
    "[#64]",                  // key 38 - Gd
    "[#63]",                  // key 39 - Eu
    "[#62]",                  // key 40 - Sm
    "[#61]",                  // key 41 - Pm
    "[#60]",                  // key 42 - Nd
    "[#59]",                  // key 43 - Pr
    "[#58]",                  // key 44 - Ce
    "[#57]",                  // key 45 - La
    "[#55,#56]",              // key 46 - Cs/Ba
    "[#52,#53,#54]",          // key 47 - Te/I/Xe
    "[#51]",                  // key 48 - Sb
    "[#50]",                  // key 49 - Sn
    "[#49]",                  // key 50 - In
    "[#47,#48]",              // key 51 - Ag/Cd
    "[#46]",                  // key 52 - Pd
    "[#45]",                  // key 53 - Rh
    "[#44]",                  // key 54 - Ru
    "[#43]",                  // key 55 - Tc
    "[#42]",                  // key 56 - Mo
    "[#41]",                  // key 57 - Nb
    "[#40]",                  // key 58 - Zr
    "[#39]",                  // key 59 - Y
    "[#37,#38]",              // key 60 - Rb/Sr
    "[#36]",                  // key 61 - Kr
    "[#35]",                  // key 62 - Br
    "[#34]",                  // key 63 - Se
    "[#33]",                  // key 64 - As
    "[#32]",                  // key 65 - Ge
    "[#31]",                  // key 66 - Ga
    "[#30]",                  // key 67 - Zn
    "[#29]",                  // key 68 - Cu
    "[#28]",                  // key 69 - Ni
    "[#27]",                  // key 70 - Co
    "[#26]",                  // key 71 - Fe
    "[#25]",                  // key 72 - Mn
    "[#24]",                  // key 73 - Cr
    "[#23]",                  // key 74 - V
    "[#22]",                  // key 75 - Ti
    "[#21]",                  // key 76 - Sc
    "[#16;R]",                // key 77 - S in ring
    "[#8;R]",                 // key 78 - O in ring
    "[#7;R]",                 // key 79 - N in ring
    "[#16]",                  // key 80 - any S
    "[#15]",                  // key 81 - any P
    "[#14]",                  // key 82 - Si
    "[#6]~[#16]",             // key 83 - C-S bond
    "[#7]~[#6]~[#7]",         // key 84 - N-C-N
    "[#7]~[#7]",              // key 85 - N-N bond
    "[#8]~[#8]",              // key 86 - O-O bond
    "[#8]~[#15]",             // key 87 - O-P bond
    "[#16]~[#8]",             // key 88 - S-O bond
    "[#6]=[#16]",             // key 89 - C=S
    "[#16]=[#7]",             // key 90 - S=N
    "[#6]=[#7]",              // key 91 - C=N
    "[#7]~[#6]=[#8]",         // key 92 - N-C=O (amide-like)
    "[#8]~[#6]=[#8]",         // key 93 - O-C=O (ester/acid)
    "[#6]=[#6]",              // key 94 - C=C
    "[#6]#[#7]",              // key 95 - C#N (nitrile)
    "[#6]#[#6]",              // key 96 - C#C (alkyne)
    "[#6]~[#15]",             // key 97 - C-P
    "[#6]~[#8]~[#6]",         // key 98 - C-O-C (ether)
    "[#6]~[#7]~[#6]",         // key 99 - C-N-C
    "[#6]~[#16]~[#6]",        // key 100 - C-S-C
    "[#8]~[#6]~[#8]",         // key 101 - O-C-O
    "[#7]~[#6]~[#8]",         // key 102 - N-C-O
    "[#7]~[#6]~[#16]",        // key 103 - N-C-S
    "[#6]=[#6]~[#6]",         // key 104 - C=C-C
    "[#6]=[#6]~[#7]",         // key 105 - C=C-N
    "[#6]=[#6]~[#8]",         // key 106 - C=C-O
    "[#6]=[#6]~[#16]",        // key 107 - C=C-S
    "[#6]=[#6]~[#6]=[#6]",    // key 108 - diene
    "[#6]=[#7]~[#6]=[#8]",    // key 109 - C=N-C=O
    "[#6]=[#7]~[#6]=[#7]",    // key 110 - C=N-C=N
    // Starts at the carbonyl O; a chain running C=O~N would need the
    // double-bonded oxygen to carry a second neighbour.
    "[#8]=[#6]~[#7]~[#6]=[#8]", // key 111 - imide O=C-N-C=O
    "[#6]~[#6]~[#8]~[#6]=[#8]", // key 112 - ester chain
    "[#6]~[#6]~[#7]~[#6]=[#8]", // key 113 - amide chain
    "[#6]~[#8]~[#6]=[#8]",    // key 114 - O-C=O ester
    "[#7]~[#6](=[#8])~[#7]",  // key 115 - urea
    "[#6]=[#8]",              // key 116 - C=O carbonyl
    "[#6]~[#7](~[#6])~[#6]",  // key 117 - tertiary amine
    "[#8]~[#6]~[#7]",         // key 118 - O-C-N (reverse of key 102)
    "[!#1;!#6]~[#6]=[#8]",    // key 119 - heteroatom adj to C=O
    // The O-O link hangs off the carbonyl carbon, not off the carbonyl O.
    "[#8]=[#6]~[#8]~[#8]",    // key 120 - peracid O=C-O-O
    "[#7]=[#8]",              // key 121 - N=O
    "[#7;R]~[#6;R]=[#7;R]",   // key 122 - amidin in ring
    "[#6]~[#8]~[#8]~[#6]",    // key 123 - peroxide
    "[#16]=[#8]",             // key 124 - S=O
    "[!#6;!#1]~[!#6;!#1]",    // key 125 - het-het bond
    "[!#6;!#1;!#7;!#8;!#16;!#15;!#9;!#17;!#35;!#53]", // key 126 - unusual het
    "[#7]~[#6]~[#7]~[#6]~[#8]", // key 127 - N-C-N-C-O
    "[#7]~[#6]~[#16]",        // key 128 - N-C-S (same pattern as key 103)
    "[#7]~[#7]~[#6]",         // key 129 - N-N-C
    "[#7]~[#6]=[#6]~[#7]",    // key 130 - N-C=C-N
    "[#6]=[#7]~[#7]=[#6]",    // key 131 - C=N-N=C
    "[#8]~[#16](=[#8])=[#8]", // key 132 - sulfate
    "[#16]~[#6]~[#16]",       // key 133 - S-C-S
    "[!#1;!#6]~[!#1;!#6]~[!#1;!#6]", // key 134 - 3 het chain
    "[#6]~[#16]~[#8]~[#6]",   // key 135 - C-S-O-C
    "[#6]~[#7]~[#8]",         // key 136 - C-N-O
    "[#7]~[#7]~[#7]",         // key 137 - triazole/azide
    "[#6]~[#7]~[#7]~[#7]",    // key 138 - C-N-N-N
    "[#8;!R]~[#6;R]",         // key 139 - exocyclic O on ring C
    "[#7;R]~[#6;!R]=[#8]",    // key 140 - exocyclic C=O on ring N
    "[#6]~[#8]~[#6]~[#8]",    // key 141 - C-O-C-O
    "[#7]~[#6](~[#8])~[#7]",  // key 142 - urea variant
    "[!#1;!#6]~[!#1;!#6]~[!#1;!#6]~[!#1;!#6]", // key 143 - 4 het chain
    "[#6]~[#7;R]~[#6]~[#7;R]", // key 144 - C-N-C-N with ring N
    "[#6]~[#6]~[#8]~[#6]~[#6]", // key 145 - C-C-O-C-C
    "[#7]~[#7]~[#6]=[#8]",    // key 146 - hydrazide
    "[#6]~[#6]~[#7]~[#7]",    // key 147 - C-C-N-N
    "[#7;R]~[#6;R]~[#7;R]~[#6;R]", // key 148 - N-C-N-C in ring
    "[#6]~[#8]~[#6]~[#6]",    // key 149 - C-O-C-C
    "[#16;R]~[#6;R]~[#7;R]",  // key 150 - S-C-N in ring
    "[#16;R]~[#6;R]~[#8;R]",  // key 151 - S-C-O in ring
    "[#16;R]~[#6;R]=[#7;R]",  // key 152 - S-C=N in ring
    "[#7;R]~[#6;R]=[#7;R]",   // key 153 - imidazole N-C=N (same pattern as key 122)
    // The carbonyl O of a lactam/lactone is exocyclic, so only the N/O and
    // the carbonyl carbon carry the ring constraint.
    "[#7;R]~[#6;R]=[#8]",     // key 154 - lactam N-C=O
    "[#8;R]~[#6;R]=[#8]",     // key 155 - lactone O-C=O
    "[#8;R]~[#6;R]~[#7;R]",   // key 156 - O-C-N in ring
    "[#8;R]~[#6;R]~[#8;R]",   // key 157 - O-C-O in ring
    "[#8;R]~[#6;R]~[#6;R]",   // key 158 - O-C-C in ring
    "[#7;R]~[#6;R]~[#6;R]",   // key 159 - N-C-C in ring
    "[#6;R]~[#6;R]~[#6;R]~[#6;R]~[#6;R]~[#6;R]", // key 160 - 6C chain in ring
    "[a]~[a]~[a]~[a]~[a]~[a]", // key 161 - 6-atom aromatic chain
    "[a]",                    // key 162 - any aromatic atom
    "[!#6;a]",                // key 163 - heteroaromatic
    "[!#6;!#1]",              // key 164 - any heteroatom (non-C, non-H)
    "[#6;R]",                 // key 165 - any ring C
    "[R]",                    // key 166 - any ring atom
];
/// Build the 166-bit structural-key fingerprint of `mol`.
///
/// Every non-empty entry of `MACCS_SMARTS` is parsed as a SMARTS query and
/// searched for in `mol`. Bit `i` (0-indexed) of the result is set when the
/// query for key `i + 1` yields at least one match; all other bits stay clear.
///
/// The patterns are re-parsed on every call.
pub fn maccs(mol: &Molecule) -> BitVec2048 {
    let mut bits = BitVec2048::new();

    // Empty entries are placeholder keys with no query to run.
    let active_keys = MACCS_SMARTS
        .iter()
        .enumerate()
        .filter(|(_, smarts)| !smarts.is_empty());

    for (bit, &smarts) in active_keys {
        // Best effort: a pattern that fails to parse is skipped on purpose,
        // which leaves its bit clear.
        let Ok(query) = parse_smarts(smarts) else {
            continue;
        };
        if !find_matches(&query, mol).is_empty() {
            bits.set(bit);
        }
    }

    bits
}
#[cfg(test)]
mod tests {
    use super::*;
    use chematic_smiles::parse;

    /// SMILES for benzene, shared by the tests that only need an aromatic ring.
    const BENZENE: &str = "c1ccccc1";

    /// Benzene must set at least one key.
    #[test]
    fn maccs_benzene_nonzero() {
        let benzene = parse(BENZENE).unwrap();
        assert!(
            maccs(&benzene).popcount() > 0,
            "benzene maccs should be nonzero"
        );
    }

    /// Ethanol (`CCO`) must set at least one key.
    #[test]
    fn maccs_ethanol_nonzero() {
        let ethanol = parse("CCO").unwrap();
        assert!(
            maccs(&ethanol).popcount() > 0,
            "ethanol maccs should be nonzero"
        );
    }

    /// Key 162 (index 161) is the "any aromatic atom" pattern.
    #[test]
    fn maccs_benzene_has_aromatic_bit() {
        let benzene = parse(BENZENE).unwrap();
        let keys = maccs(&benzene);
        assert!(
            keys.get(161),
            "benzene should have aromatic bit (key 162, index 161) set"
        );
    }

    /// Fingerprinting the same molecule twice must give equal bit vectors.
    #[test]
    fn maccs_deterministic() {
        let benzene = parse(BENZENE).unwrap();
        let first = maccs(&benzene);
        let second = maccs(&benzene);
        assert_eq!(first, second, "maccs must be deterministic");
    }

    /// Aspirin contains C=O, which is key 116 (index 115).
    #[test]
    fn maccs_aspirin_has_carbonyl_bit() {
        let aspirin = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
        let keys = maccs(&aspirin);
        assert!(
            keys.get(115),
            "aspirin should have C=O bit (key 116, index 115) set"
        );
    }

    /// Acetonitrile (`CC#N`) contains C#N, which is key 95 (index 94).
    #[test]
    fn maccs_acetonitrile_has_triple_bond_bit() {
        let acetonitrile = parse("CC#N").unwrap();
        let keys = maccs(&acetonitrile);
        assert!(
            keys.get(94),
            "acetonitrile should have C#N bit (key 95, index 94) set"
        );
    }

    /// Bromobenzene contains Br, which is key 62 (index 61).
    #[test]
    fn maccs_bromobenzene_has_bromine_bit() {
        let bromobenzene = parse("c1ccccc1Br").unwrap();
        let keys = maccs(&bromobenzene);
        assert!(
            keys.get(61),
            "bromobenzene should have Br bit (key 62, index 61) set"
        );
    }
}