1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
use std::{
num::NonZeroU16,
ops::{Range, RangeBounds},
};
use context_error::*;
use crate::{
chemistry::{Chemical, Element, MolecularFormula},
glycan::MonoSaccharide,
helper_functions::{RangeExtension, explain_number_error},
sequence::SequencePosition,
};
enum Brick {
Element(Element),
Formula(MolecularFormula),
}
/// # Errors
/// Errors if the provided text is not a Unimod composition brick.
fn parse_unimod_composition_brick(
text: &str,
range: Range<usize>,
) -> Result<Brick, BoxedError<'_, BasicKind>> {
match text[range.clone()].to_lowercase().as_str() {
"ac" => Ok(Brick::Formula(molecular_formula!(C 2 H 2 O 1))),
"me" => Ok(Brick::Formula(molecular_formula!(C 1 H 2))),
"kdn" => Ok(Brick::Formula(molecular_formula!(C 9 H 14 O 8))),
"kdo" => Ok(Brick::Formula(molecular_formula!(C 8 H 12 O 7))),
"sulf" => Ok(Brick::Formula(molecular_formula!(S 1 O 3))),
"phos" => Ok(Brick::Formula(molecular_formula!(P 1 O 3))),
"water" => Ok(Brick::Formula(molecular_formula!(H 2 O 1))),
_ => {
Element::try_from(text[range.clone()].to_lowercase().as_str()).map_or_else(|()| if let Ok((ms, _)) =
MonoSaccharide::from_short_iupac(text, range.start_index(), 0)
{
Ok(Brick::Formula(ms.formula_inner(SequencePosition::default(),0)))
} else {
Err(BoxedError::new(BasicKind::Error,
"Invalid Unimod chemical formula",
"Unknown Unimod composition brick, use an element or one of the unimod shorthands. Eg: 'H(13) C(12) N O(3)'.",
Context::line_range(None, text, range)))
}, |el| Ok(Brick::Element(el)))
}
}
}
impl MolecularFormula {
/// Parses Unimod compositions into molecular formulas. As Unimod compositions can have glycans in them these are reported as molecular formula.
/// ```text
/// H(25) C(8) 13C(7) N 15N(2) O(3)
/// H(6) C(4) N(2) dHex
/// ```
/// # Errors
/// If the formula is not valid according to the above specification, with some help on what is going wrong.
/// # Panics
/// It panics if the string contains not UTF8 symbols.
pub fn from_unimod(
value: &str,
range: impl RangeBounds<usize>,
) -> Result<Self, BoxedError<'_, BasicKind>> {
let (mut index, end) = range.bounds(value.len());
assert!(value.is_ascii());
let mut formula = Self::default();
let mut isotope = None;
let mut last_name_index = -1_isize;
let mut last_name = String::new();
while index < end {
match value.as_bytes()[index] {
b'(' => {
let length = value
.chars()
.skip(index + 1)
.take_while(|c| *c == '-' || *c == '+' || c.is_ascii_digit())
.count();
let num = value[index + 1..index + 1 + length]
.parse::<i32>()
.map_err(|err| {
BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
format!("The element amount {}", explain_number_error(&err)),
Context::line(None, value, index + 1, length),
)
})?;
match parse_unimod_composition_brick(
value,
last_name_index as usize..last_name_index as usize + last_name.len(),
)? {
Brick::Element(el) => {
if !formula.add((el, isotope.take(), num)) {
return Err(BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
"An element or isotope without a defined mass was found",
Context::line_range(
None,
value,
last_name_index as usize
..last_name_index as usize + last_name.len(),
),
));
}
}
Brick::Formula(f) => formula += f * num,
}
last_name.clear();
last_name_index = -1;
index += length + 2;
if value.as_bytes()[index - 1] != b')' {
return Err(BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
"The amount of an element should be closed by ')'",
Context::line(None, value, index - 1, 1),
));
}
}
b' ' => {
if !last_name.is_empty() {
match parse_unimod_composition_brick(
value,
last_name_index as usize..last_name_index as usize + last_name.len(),
)? {
Brick::Element(el) => {
if !formula.add((el, isotope.take(), 1)) {
return Err(BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
"An element or isotope without a defined mass was found",
Context::line_range(
None,
value,
last_name_index as usize
..last_name_index as usize + last_name.len(),
),
));
}
}
Brick::Formula(f) => formula += f,
}
last_name.clear();
last_name_index = -1;
}
index += 1;
}
n if n.is_ascii_digit() => {
let length = value
.chars()
.skip(index)
.take_while(char::is_ascii_digit)
.count();
isotope = Some(value[index..index + length].parse::<NonZeroU16>().map_err(
|err| {
BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
format!("The isotope {}", explain_number_error(&err)),
Context::line(None, value, index, length),
)
},
)?);
index += length;
}
n if n.is_ascii_alphabetic() => {
last_name.push(n as char);
if last_name_index == -1 {
last_name_index = isize::try_from(index).unwrap();
}
index += 1;
}
_ => {
return Err(BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
"Unexpected character, use an element or one of the unimod shorthands. Eg: 'H(13) C(12) N O(3)'.",
Context::line(None, value, index, 1),
));
}
}
}
if !last_name.is_empty() {
match parse_unimod_composition_brick(
value,
last_name_index as usize..last_name_index as usize + last_name.len(),
)? {
Brick::Element(el) => {
if !formula.add((el, isotope.take(), 1)) {
return Err(BoxedError::new(
BasicKind::Error,
"Invalid Unimod chemical formula",
"An element or isotope without a defined mass was found",
Context::line_range(
None,
value,
last_name_index as usize
..last_name_index as usize + last_name.len(),
),
));
}
}
Brick::Formula(f) => formula += f,
}
}
Ok(formula)
}
}