use regex::Regex;
use crate::chemistry::unimod::unimod_modifications_mass;
/// Splits a UNIMOD-annotated sequence into tokens.
///
/// Every character outside a `[UNIMOD:<digits>]` annotation becomes its own
/// single-character token, in input order.
///
/// # Arguments
///
/// * `sequence` - text that may contain `[UNIMOD:<digits>]` annotations.
/// * `group_modifications` - when `true`, each annotation is fused with the
///   character immediately preceding it into a single token (for example
///   `C[UNIMOD:4]`); when `false`, the annotation is a token of its own.
///
/// An annotation that has no preceding character to attach to (it opens the
/// sequence or directly follows another annotation) is emitted as a bare
/// annotation token in both modes, rather than being prefixed with a NUL
/// placeholder character.
pub fn unimod_sequence_to_tokens(sequence: &str, group_modifications: bool) -> Vec<String> {
    let pattern = Regex::new(r"\[UNIMOD:\d+\]").unwrap();
    let mut tokens = Vec::new();
    let mut last_index = 0;

    for mat in pattern.find_iter(sequence) {
        // Plain characters between the previous annotation and this one.
        let residues = &sequence[last_index..mat.start()];
        let annotation = mat.as_str();

        // Byte offset of the final character of `residues`, if there is one.
        // Splitting there keeps the slice on a char boundary even when that
        // character is multi-byte.
        let split = if group_modifications {
            residues.char_indices().next_back().map(|(offset, _)| offset)
        } else {
            None
        };

        match split {
            Some(offset) => {
                tokens.extend(residues[..offset].chars().map(|c| c.to_string()));
                // Fuse the last character with the annotation that follows it.
                tokens.push(format!("{}{}", &residues[offset..], annotation));
            }
            None => {
                // Ungrouped mode, or nothing precedes the annotation.
                tokens.extend(residues.chars().map(|c| c.to_string()));
                tokens.push(annotation.to_string());
            }
        }

        last_index = mat.end();
    }

    // Whatever follows the last annotation (possibly nothing).
    tokens.extend(sequence[last_index..].chars().map(|c| c.to_string()));
    tokens
}
/// Separates a UNIMOD-annotated sequence into its bare text and a vector of
/// per-position modification values.
///
/// # Returns
///
/// A tuple of:
/// * the input with every `[UNIMOD:<digits>]` annotation removed, and
/// * the vector produced by `calculate_modifications` for that stripped
///   sequence and the located annotations.
pub fn find_unimod_patterns(input_string: &str) -> (String, Vec<f64>) {
    // Locate the annotations and translate their offsets into positions in
    // the annotation-free sequence.
    let annotation_hits = extract_unimod_patterns(input_string);
    let position_list = generate_index_list(&annotation_hits, input_string);

    let bare_sequence = remove_unimod_annotation(input_string);
    let per_position = calculate_modifications(&position_list, &bare_sequence);

    (bare_sequence, per_position)
}
/// Returns a copy of `sequence` with every `[UNIMOD:<digits>]` annotation
/// deleted.
fn remove_unimod_annotation(sequence: &str) -> String {
    let annotation = Regex::new(r"\[UNIMOD:\d+]").unwrap();
    let stripped = annotation.replace_all(sequence, "");
    stripped.into_owned()
}
/// Finds every `[UNIMOD:<digits>]` annotation in `input_string`.
///
/// Each entry of the returned vector is `(start, end, text)`: the match's
/// start and end offsets in `input_string` and the matched text itself, in
/// order of appearance.
fn extract_unimod_patterns(input_string: &str) -> Vec<(usize, usize, String)> {
    let annotation = Regex::new(r"\[UNIMOD:\d+]").unwrap();
    let mut found = Vec::new();
    for hit in annotation.find_iter(input_string) {
        found.push((hit.start(), hit.end(), hit.as_str().to_string()));
    }
    found
}
/// Translates annotation offsets in the annotated `sequence` into positions
/// in the annotation-free sequence.
///
/// # Arguments
///
/// * `results` - `(start, end, _)` byte ranges of the annotations inside
///   `sequence`, in order of appearance. The third field is not read; the
///   annotation text is re-sliced from `sequence`.
/// * `sequence` - the annotated sequence the ranges refer to.
///
/// # Returns
///
/// One `(index, annotation)` pair per input range. `index` is the offset of
/// the character directly before the annotation once all earlier annotations
/// have been removed. An annotation with nothing before it (at the very
/// start, or following only other annotations) is assigned index `0`.
///
/// # Panics
///
/// Panics if a range does not lie on valid boundaries within `sequence`.
fn generate_index_list(results: &[(usize, usize, String)], sequence: &str) -> Vec<(usize, String)> {
    let mut index_list = Vec::with_capacity(results.len());
    // Total length of the annotations seen so far, i.e. how far offsets in
    // the annotated string are shifted relative to the stripped one.
    let mut chars_removed_counter: usize = 0;

    for (start, end, _) in results {
        let (start, end) = (*start, *end);
        let mod_str = &sequence[start..end];

        // Saturating arithmetic: when the sequence opens with several
        // back-to-back annotations, `start` equals `chars_removed_counter`
        // and a plain `start - 1 - chars_removed_counter` would underflow.
        let later_aa_index = start
            .saturating_sub(1)
            .saturating_sub(chars_removed_counter);

        index_list.push((later_aa_index, mod_str.to_string()));
        chars_removed_counter += end - start;
    }

    index_list
}
/// Builds a per-position vector of accumulated modification values.
///
/// # Arguments
///
/// * `index_list` - `(index, annotation)` pairs; `index` addresses a position
///   in `stripped_sequence`.
/// * `stripped_sequence` - the annotation-free sequence; the result has one
///   entry per byte of this string.
///
/// # Returns
///
/// A vector initialised to `0.0` in which each listed position has had the
/// value looked up for its annotation added. Annotations missing from the
/// lookup table are skipped, as are indices that fall outside the vector
/// (e.g. an input that consists only of annotations).
fn calculate_modifications(index_list: &[(usize, String)], stripped_sequence: &str) -> Vec<f64> {
    let mut mods: Vec<f64> = vec![0.0; stripped_sequence.len()];

    // Fetch the lookup table once instead of once per annotation.
    // NOTE(review): presumably maps annotation strings such as "[UNIMOD:4]"
    // to mass shifts — confirm against `unimod_modifications_mass`.
    let mass_table = unimod_modifications_mass();

    for (index, mod_str) in index_list {
        if let Some(mass) = mass_table.get(mod_str.as_str()) {
            // `get_mut` keeps an out-of-range index from panicking.
            if let Some(slot) = mods.get_mut(*index) {
                *slot += mass;
            }
        }
    }

    mods
}
/// Rearranges a flat vector into a nested `29 x 2 x 3` structure.
///
/// The first 174 values of `flat_array` are read as three consecutive
/// groups of 58 values, one per innermost index `c`. Within a group, the
/// first 29 values fill `[row][0][c]` for `row = 0..29` and the next 29 fill
/// `[row][1][c]`. Any values beyond the first 174 are ignored.
///
/// # Panics
///
/// Panics if `flat_array` holds fewer than 174 values.
pub fn reshape_prosit_array(flat_array: Vec<f64>) -> Vec<Vec<Vec<f64>>> {
    const ROWS: usize = 29;
    const MIDDLE: usize = 2;
    const INNER: usize = 3;

    let mut shaped: Vec<Vec<Vec<f64>>> = vec![vec![vec![0.0; INNER]; MIDDLE]; ROWS];

    for c in 0..INNER {
        for m in 0..MIDDLE {
            // Start of the 29-value run that feeds `[..][m][c]`.
            let base = (c * MIDDLE + m) * ROWS;
            for row in 0..ROWS {
                shaped[row][m][c] = flat_array[base + row];
            }
        }
    }

    shaped
}