use super::types::{ErrorType, ResolvedAction};
use std::cmp::min;
/// A single automatic fix that was detected in an input string.
///
/// `start..end` is the half-open span of the input that `corrected` replaces.
/// The detectors in this file fill these in as byte offsets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectedCorrection {
    /// Category of the problem that was detected.
    pub error_type: ErrorType,
    /// The text as it appeared in the input.
    pub original: String,
    /// The replacement text; empty when the original is simply removed.
    pub corrected: String,
    /// Offset at which the affected span begins.
    pub start: usize,
    /// Offset one past the end of the affected span.
    pub end: usize,
}

impl DetectedCorrection {
    /// Builds a correction record, converting both text arguments into owned strings.
    pub fn new(
        error_type: ErrorType,
        original: impl Into<String>,
        corrected: impl Into<String>,
        start: usize,
        end: usize,
    ) -> Self {
        let original = original.into();
        let corrected = corrected.into();
        DetectedCorrection {
            error_type,
            original,
            corrected,
            start,
            end,
        }
    }

    /// Renders a one-line warning: error type, start position, and the
    /// original/corrected text in single quotes.
    pub fn warning_message(&self) -> String {
        let Self {
            error_type,
            original,
            corrected,
            start,
            ..
        } = self;
        format!(
            "{} at position {}: '{}' → '{}'",
            error_type, start, original, corrected
        )
    }
}
/// Replaces dash look-alikes with the ASCII hyphen-minus `-`.
///
/// The characters handled are U+2013 (en dash), U+2014 (em dash) and U+2212
/// (minus sign). Every replacement is reported as a `WrongDashCharacter`
/// correction whose span is the byte range of the replaced character in `input`.
///
/// Returns the rewritten string together with the corrections, in input order.
pub fn correct_dash_characters(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut result = String::with_capacity(input.len());
    let mut corrections = Vec::new();
    for (i, c) in input.char_indices() {
        match c {
            // All three look-alikes are handled identically, so they share one arm.
            '\u{2013}' | '\u{2014}' | '\u{2212}' => {
                corrections.push(DetectedCorrection::new(
                    ErrorType::WrongDashCharacter,
                    c.to_string(),
                    "-",
                    i,
                    i + c.len_utf8(),
                ));
                result.push('-');
            }
            _ => result.push(c),
        }
    }
    (result, corrections)
}
/// Replaces typographic quotes with their ASCII equivalents.
///
/// U+2018/U+2019 become `'` and U+201C/U+201D become `"`. Each replacement is
/// reported as a `WrongQuoteCharacter` correction spanning the byte range of
/// the replaced character in `input`.
pub fn correct_quote_characters(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut corrections = Vec::new();
    let result: String = input
        .char_indices()
        .map(|(offset, ch)| {
            let ascii_quote = match ch {
                '\u{2018}' | '\u{2019}' => '\'',
                '\u{201C}' | '\u{201D}' => '"',
                // Anything else passes through untouched.
                _ => return ch,
            };
            corrections.push(DetectedCorrection::new(
                ErrorType::WrongQuoteCharacter,
                ch.to_string(),
                ascii_quote.to_string(),
                offset,
                offset + ch.len_utf8(),
            ));
            ascii_quote
        })
        .collect();
    (result, corrections)
}
/// Normalizes whitespace in `input`.
///
/// * Leading and trailing whitespace is trimmed; this is reported as one
///   `ExtraWhitespace` correction covering the whole input.
/// * Whitespace directly before one of `. : > < + - _ ( ) [ ]` is removed.
/// * Any other run of whitespace is collapsed to a single ASCII space
///   (this collapse is not reported).
///
/// At most one correction is recorded per call: once an `ExtraWhitespace`
/// entry is the most recent correction, further removals are not reported.
///
/// Returns the normalized string and the recorded corrections.
pub fn correct_whitespace(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut corrections: Vec<DetectedCorrection> = Vec::new();
    let trimmed = input.trim();
    if input != trimmed {
        corrections.push(DetectedCorrection::new(
            ErrorType::ExtraWhitespace,
            input,
            trimmed,
            0,
            input.len(),
        ));
    }

    let mut result = String::with_capacity(trimmed.len());
    // True while a whitespace run has been seen but not yet emitted.
    let mut pending_space = false;
    let mut chars = trimmed.char_indices().peekable();
    while let Some((i, c)) = chars.next() {
        if !c.is_whitespace() {
            if pending_space {
                result.push(' ');
                pending_space = false;
            }
            result.push(c);
            continue;
        }

        let before_punctuation = chars.peek().map_or(false, |&(_, next_c)| {
            matches!(
                next_c,
                '.' | ':' | '>' | '<' | '+' | '-' | '_' | '(' | ')' | '[' | ']'
            )
        });
        if before_punctuation {
            let already_reported = corrections
                .last()
                .map_or(false, |last| last.error_type == ErrorType::ExtraWhitespace);
            if !already_reported {
                corrections.push(DetectedCorrection::new(
                    ErrorType::ExtraWhitespace,
                    // Reported as a plain space regardless of the actual whitespace character.
                    " ",
                    "",
                    i,
                    // Use the real encoded width so the span never ends inside a
                    // multi-byte whitespace character such as U+00A0.
                    i + c.len_utf8(),
                ));
            }
            // Dropping this whitespace also drops any space still pending from the same run.
            pending_space = false;
        } else {
            pending_space = true;
        }
    }
    (result, corrections)
}
/// Looks for a position written as a bare `0`, i.e. `".0"` that is not
/// followed by another ASCII digit.
///
/// Returns the byte offset of the `0` of the first such occurrence, or `None`
/// when there is none.
pub fn detect_position_zero(input: &str) -> Option<usize> {
    // '.' and '0' are ASCII, so scanning raw bytes finds exactly the same
    // occurrences as scanning chars: UTF-8 continuation bytes never equal an
    // ASCII byte and never count as ASCII digits.
    let bytes = input.as_bytes();
    bytes
        .windows(2)
        .enumerate()
        .find(|&(dot_idx, pair)| {
            let followed_by_digit = bytes
                .get(dot_idx + 2)
                .map_or(false, |next| next.is_ascii_digit());
            matches!(pair, [b'.', b'0']) && !followed_by_digit
        })
        .map(|(dot_idx, _)| dot_idx + 1)
}
pub fn correct_protein_arrow(input: &str) -> (String, Vec<DetectedCorrection>) {
let mut corrections = Vec::new();
if let Some(arrow_pos) = input.find('>') {
if input.starts_with("p.") || input.contains(":p.") {
let after_arrow = &input[arrow_pos + 1..];
if after_arrow.len() >= 3 {
let potential_aa = &after_arrow[..3];
if is_amino_acid_like(potential_aa) {
let corrected = format!("{}{}", &input[..arrow_pos], &input[arrow_pos + 1..]);
corrections.push(DetectedCorrection::new(
ErrorType::ProteinSubstitutionArrow,
">",
"",
arrow_pos,
arrow_pos + 1,
));
return (corrected, corrections);
}
}
}
}
(input.to_string(), corrections)
}
/// Returns `true` when the first three bytes of `s` are all ASCII letters.
///
/// Strings shorter than three bytes are never amino-acid-like. Only the shape
/// is checked; the letters are not matched against real residue codes.
fn is_amino_acid_like(s: &str) -> bool {
    s.as_bytes()
        .get(..3)
        .map_or(false, |head| head.iter().all(|b| b.is_ascii_alphabetic()))
}
/// Fixes the capitalization of a three-letter amino-acid code.
///
/// When `token` equals one of the known codes ignoring case but is not
/// already in canonical form (e.g. `val` or `VAL` instead of `Val`), returns
/// the canonical spelling paired with `ErrorType::LowercaseAminoAcid`.
/// Returns `None` for canonical or unknown tokens.
#[allow(dead_code)]
pub fn correct_amino_acid_case(token: &str) -> Option<(String, ErrorType)> {
    // (lowercase form, canonical form)
    const CODES: [(&str, &str); 23] = [
        ("ala", "Ala"),
        ("arg", "Arg"),
        ("asn", "Asn"),
        ("asp", "Asp"),
        ("cys", "Cys"),
        ("gln", "Gln"),
        ("glu", "Glu"),
        ("gly", "Gly"),
        ("his", "His"),
        ("ile", "Ile"),
        ("leu", "Leu"),
        ("lys", "Lys"),
        ("met", "Met"),
        ("phe", "Phe"),
        ("pro", "Pro"),
        ("sec", "Sec"),
        ("ser", "Ser"),
        ("thr", "Thr"),
        ("trp", "Trp"),
        ("tyr", "Tyr"),
        ("val", "Val"),
        ("ter", "Ter"),
        ("xaa", "Xaa"),
    ];

    let lowered = token.to_lowercase();
    CODES
        .iter()
        .find(|(lowercase, _)| *lowercase == lowered)
        // Already canonical: nothing to correct.
        .filter(|(_, canonical)| token != *canonical)
        .map(|(_, canonical)| (canonical.to_string(), ErrorType::LowercaseAminoAcid))
}
/// Maps a one-letter amino-acid code to its three-letter code.
///
/// The lookup is ASCII case-insensitive. `*` maps to `Ter` and `X` to `Xaa`.
/// Returns `None` for any character that is not in the table.
#[allow(dead_code)]
pub fn single_to_three_letter_aa(single: char) -> Option<&'static str> {
    const CODES: [(char, &str); 23] = [
        ('A', "Ala"),
        ('R', "Arg"),
        ('N', "Asn"),
        ('D', "Asp"),
        ('C', "Cys"),
        ('Q', "Gln"),
        ('E', "Glu"),
        ('G', "Gly"),
        ('H', "His"),
        ('I', "Ile"),
        ('L', "Leu"),
        ('K', "Lys"),
        ('M', "Met"),
        ('F', "Phe"),
        ('P', "Pro"),
        ('U', "Sec"),
        ('S', "Ser"),
        ('T', "Thr"),
        ('W', "Trp"),
        ('Y', "Tyr"),
        ('V', "Val"),
        ('*', "Ter"),
        ('X', "Xaa"),
    ];

    let wanted = single.to_ascii_uppercase();
    CODES
        .iter()
        .find(|(letter, _)| *letter == wanted)
        .map(|&(_, three_letter)| three_letter)
}
/// Upper-cases a known accession prefix at the start of `input`.
///
/// For example `nm_000088.3` becomes `NM_000088.3`. The prefixes recognised
/// are `NM_ NP_ NC_ NG_ NR_ XM_ XP_ XR_ ENST ENSP LRG_`, matched ignoring
/// case. When the prefix is already upper-case, or no prefix matches, the
/// input is returned unchanged with no corrections.
///
/// On a match, one `LowercaseAccessionPrefix` correction spanning the prefix
/// bytes is returned.
pub fn correct_accession_prefix_case(input: &str) -> (String, Vec<DetectedCorrection>) {
    // (lowercase form, canonical form)
    const PREFIXES: [(&str, &str); 11] = [
        ("nm_", "NM_"),
        ("np_", "NP_"),
        ("nc_", "NC_"),
        ("ng_", "NG_"),
        ("nr_", "NR_"),
        ("xm_", "XM_"),
        ("xp_", "XP_"),
        ("xr_", "XR_"),
        ("enst", "ENST"),
        ("ensp", "ENSP"),
        ("lrg_", "LRG_"),
    ];

    // Lower-case once instead of re-allocating for every table entry.
    let lowered = input.to_lowercase();
    let hit = PREFIXES
        .iter()
        .find(|(lower, correct)| lowered.starts_with(*lower) && !input.starts_with(*correct));

    match hit {
        Some(&(lower, correct)) => {
            let (found, rest) = input.split_at(lower.len());
            let corrected = format!("{}{}", correct, rest);
            let correction = DetectedCorrection::new(
                ErrorType::LowercaseAccessionPrefix,
                found,
                correct,
                0,
                lower.len(),
            );
            (corrected, vec![correction])
        }
        None => (input.to_string(), Vec::new()),
    }
}
/// Fixes the capitalization of an edit-type keyword.
///
/// When `token` equals one of `del`, `ins`, `dup`, `inv`, `con` or `delins`
/// ignoring case but is not already all lower-case, returns the lower-case
/// keyword paired with `ErrorType::MixedCaseEditType`. Returns `None` for
/// tokens that are already canonical or are not a known keyword.
#[allow(dead_code)]
pub fn correct_edit_type_case(token: &str) -> Option<(String, ErrorType)> {
    const KEYWORDS: [&str; 6] = ["del", "ins", "dup", "inv", "con", "delins"];

    let lowered = token.to_lowercase();
    let canonical = KEYWORDS.iter().find(|keyword| **keyword == lowered)?;
    if token == *canonical {
        // Already written correctly.
        None
    } else {
        Some((canonical.to_string(), ErrorType::MixedCaseEditType))
    }
}
/// Detects a reference-sequence accession that lacks a `.N` version suffix.
///
/// The prefixes `NM_ NP_ NC_ NG_ NR_ XM_ XP_ XR_ LRG_` are checked in that
/// order, each at its first occurrence in `input`. The accession body runs
/// from the prefix up to the first `:` (or the end of the input). `ENST` and
/// `ENSP` accessions are never reported, because their version is optional.
///
/// An accession counts as versioned when the part before any `(` (so a gene
/// symbol such as `(CXCR4)` is ignored) ends in `.` followed by one or more
/// ASCII digits.
///
/// Returns `Some((start, accession))`, where `start` is the byte offset of
/// the prefix and `accession` is the prefix plus body exactly as written, for
/// the first unversioned accession found; `None` otherwise.
pub fn detect_missing_version(input: &str) -> Option<(usize, String)> {
    const VERSIONED_PREFIXES: [&str; 9] = [
        "NM_", "NP_", "NC_", "NG_", "NR_", "XM_", "XP_", "XR_", "LRG_",
    ];

    for prefix in VERSIONED_PREFIXES {
        let start = match input.find(prefix) {
            Some(start) => start,
            None => continue,
        };
        let after_prefix = &input[start + prefix.len()..];
        let end = after_prefix.find(':').unwrap_or(after_prefix.len());
        let accession_body = &after_prefix[..end];

        // A trailing "(GENE)" annotation must not hide the version number.
        let versioned_part = accession_body.split('(').next().unwrap_or(accession_body);
        let has_version = versioned_part
            .rsplit_once('.')
            .map_or(false, |(_, version)| {
                // An empty suffix ("NM_000088.") is not a version.
                !version.is_empty() && version.bytes().all(|b| b.is_ascii_digit())
            });

        if !has_version {
            return Some((start, format!("{}{}", prefix, accession_body)));
        }
    }
    None
}
/// Detects a range whose start position is greater than its end position,
/// e.g. `c.200_100del`.
///
/// The range is searched for right after the coordinate marker:
/// * the first of `.c. .g. .n. .m. .o. .r.` found in `input`, or otherwise
/// * the first occurrence of any of `c g n m r`, provided it is directly
///   followed by `.`.
///
/// Each position is an optionally negative integer, optionally followed by
/// an offset (`+5`, `-3`). Only the base integers are compared. When the
/// first is greater, a `SwappedPositions` correction is returned. Its span
/// covers both complete position tokens, offsets included, and its
/// `corrected` text is the two tokens in swapped order, so applying it never
/// drops or re-attaches an offset.
///
/// Returns `None` when no marker or well-formed range is found, when a
/// number does not fit in `i64`, or when the positions are already ordered.
pub fn detect_swapped_positions(input: &str) -> Option<DetectedCorrection> {
    const COORD_MARKERS: [&str; 6] = [".c.", ".g.", ".n.", ".m.", ".o.", ".r."];

    let search_start = COORD_MARKERS
        .iter()
        .find_map(|marker| input.find(*marker).map(|pos| pos + marker.len()))
        .or_else(|| {
            let pos = input.find(['c', 'g', 'n', 'm', 'r'])?;
            if input.get(pos + 1..pos + 2) == Some(".") {
                Some(pos + 2)
            } else {
                None
            }
        })?;

    // Everything parsed below is ASCII, so byte indices into `tail` are
    // always valid char boundaries.
    let tail = &input[search_start..];
    let bytes = tail.as_bytes();

    // First position: base number, then any run of offset characters.
    let first_base_end = scan_signed_number_end(bytes, 0)?;
    let mut first_end = first_base_end;
    while first_end < bytes.len() && matches!(bytes[first_end], b'+' | b'-' | b'0'..=b'9') {
        first_end += 1;
    }
    if bytes.get(first_end) != Some(&b'_') {
        return None;
    }

    // Second position: base number, then at most one signed offset. A sign
    // with no digits after it (as in "100-?") is left in the suffix.
    let second_start = first_end + 1;
    let second_base_end = scan_signed_number_end(bytes, second_start)?;
    let mut second_end = second_base_end;
    if matches!(bytes.get(second_end).copied(), Some(b'+') | Some(b'-')) {
        let mut offset_end = second_end + 1;
        while offset_end < bytes.len() && bytes[offset_end].is_ascii_digit() {
            offset_end += 1;
        }
        if offset_end > second_end + 1 {
            second_end = offset_end;
        }
    }

    let first_num: i64 = tail[..first_base_end].parse().ok()?;
    let second_num: i64 = tail[second_start..second_base_end].parse().ok()?;
    if first_num <= second_num {
        return None;
    }

    let first_token = &tail[..first_end];
    let second_token = &tail[second_start..second_end];
    Some(DetectedCorrection::new(
        ErrorType::SwappedPositions,
        format!("{}_{}", first_token, second_token),
        format!("{}_{}", second_token, first_token),
        search_start,
        search_start + second_end,
    ))
}

/// Scans an optional `-` followed by one or more ASCII digits starting at
/// `from`; returns the index just past the digits, or `None` if no digit is found.
fn scan_signed_number_end(bytes: &[u8], from: usize) -> Option<usize> {
    let mut idx = from;
    if bytes.get(idx) == Some(&b'-') {
        idx += 1;
    }
    let digits_start = idx;
    while idx < bytes.len() && bytes[idx].is_ascii_digit() {
        idx += 1;
    }
    if idx > digits_start {
        Some(idx)
    } else {
        None
    }
}
/// Strips a trailing protein annotation such as ` (p.Val695Ile)` from a variant.
///
/// The annotation is the last `(p.` in the input, and it is stripped only
/// when its first `)` is followed by nothing but whitespace. Whitespace
/// between the variant and the annotation is removed as well.
///
/// Returns the stripped string with one `TrailingAnnotation` correction that
/// spans from the end of the kept text to the end of `input`. If nothing is
/// stripped, the input is returned unchanged with no corrections.
pub fn strip_trailing_annotation(input: &str) -> (String, Vec<DetectedCorrection>) {
    let candidate = input.trim_end();
    let annotation_open = candidate.rfind("(p.").filter(|&open| {
        candidate[open..].find(')').map_or(false, |close_offset| {
            // Only a parenthesis that closes at the very end counts.
            candidate[open + close_offset + 1..].trim().is_empty()
        })
    });

    match annotation_open {
        None => (input.to_string(), Vec::new()),
        Some(open) => {
            let kept = candidate[..open].trim_end();
            let cut = kept.len();
            let removed = input[cut..].trim();
            let correction = DetectedCorrection::new(
                ErrorType::TrailingAnnotation,
                removed,
                "",
                cut,
                input.len(),
            );
            (kept.to_string(), vec![correction])
        }
    }
}
/// Inserts a missing `g.` after the colon of a genomic accession,
/// e.g. `NC_000001.11:12345A>G` becomes `NC_000001.11:g.12345A>G`.
///
/// The prefix is inserted only when all of the following hold:
/// * the text before the first `:` starts with `NC_`, `NG_`, `NT_` or `NW_`,
///   or starts with `LRG_` and contains no `t`;
/// * the text after the colon does not already start with one of
///   `g. c. p. n. r. m.`;
/// * the text after the colon starts with an ASCII digit, `(`, `?` or `[`.
///
/// Otherwise the input is returned unchanged with no corrections. On success
/// one `MissingCoordinatePrefix` correction spanning the accession and colon
/// is returned.
pub fn correct_missing_coordinate_prefix(input: &str) -> (String, Vec<DetectedCorrection>) {
    let (accession, after_colon) = match input.split_once(':') {
        Some(parts) => parts,
        None => return (input.to_string(), Vec::new()),
    };
    let colon_pos = accession.len();

    let genomic = ["NC_", "NG_", "NT_", "NW_"]
        .iter()
        .any(|prefix| accession.starts_with(*prefix))
        || (accession.starts_with("LRG_") && !accession.contains('t'));
    let already_prefixed = ["g.", "c.", "p.", "n.", "r.", "m."]
        .iter()
        .any(|prefix| after_colon.starts_with(*prefix));
    let plausible_position = matches!(
        after_colon.chars().next(),
        Some(first) if first.is_ascii_digit() || matches!(first, '(' | '?' | '[')
    );

    if !genomic || already_prefixed || !plausible_position {
        return (input.to_string(), Vec::new());
    }

    let corrected = format!("{}:g.{}", accession, after_colon);
    let correction = DetectedCorrection::new(
        ErrorType::MissingCoordinatePrefix,
        format!("{}:", accession),
        format!("{}:g.", accession),
        0,
        colon_pos + 1,
    );
    (corrected, vec![correction])
}
/// Applies the fix found by `detect_swapped_positions`, if any.
///
/// Returns the input with the correction's span replaced by its `corrected`
/// text, together with that single correction. If nothing is detected, the
/// input is returned unchanged with no corrections.
pub fn correct_swapped_positions(input: &str) -> (String, Vec<DetectedCorrection>) {
    match detect_swapped_positions(input) {
        None => (input.to_string(), Vec::new()),
        Some(swap) => {
            let mut fixed = String::with_capacity(input.len());
            fixed.push_str(&input[..swap.start]);
            fixed.push_str(&swap.corrected);
            fixed.push_str(&input[swap.end..]);
            (fixed, vec![swap])
        }
    }
}
/// Rewrites the allele form `ACC:[c.1A>G;c.2C>T]` as `ACC:c.[1A>G;2C>T]`.
///
/// The rewrite happens only when all of the following hold:
/// * the input contains `:[`;
/// * the bracket content starts with one of `c. g. n. r. m. p.`;
/// * a closing `]` exists;
/// * every `;`-separated part, after trimming, starts with that same prefix.
///
/// Otherwise the input is returned unchanged with no corrections. On success
/// one `OldAlleleFormat` correction spanning `:[` through `]` is returned.
pub fn correct_old_allele_format(input: &str) -> (String, Vec<DetectedCorrection>) {
    let attempt = || -> Option<(String, DetectedCorrection)> {
        let open = input.find(":[")?;
        let inner_start = open + 2;
        let coord_type = ["c.", "g.", "n.", "r.", "m.", "p."]
            .iter()
            .copied()
            .find(|kind| input[inner_start..].starts_with(*kind))?;
        let close = open + input[open..].find(']')?;

        // Every allele must carry the same coordinate prefix; strip it from each one.
        let stripped: Vec<&str> = input[inner_start..close]
            .split(';')
            .map(|part| part.trim().strip_prefix(coord_type))
            .collect::<Option<_>>()?;

        let new_content = stripped.join(";");
        let corrected_pattern = format!(":{}[{}]", coord_type, new_content);
        let corrected = format!(
            "{}{}{}",
            &input[..open],
            corrected_pattern,
            &input[close + 1..]
        );
        let correction = DetectedCorrection::new(
            ErrorType::OldAlleleFormat,
            &input[open..close + 1],
            corrected_pattern,
            open,
            close + 1,
        );
        Some((corrected, correction))
    };

    match attempt() {
        Some((corrected, correction)) => (corrected, vec![correction]),
        None => (input.to_string(), Vec::new()),
    }
}
/// Applies `correction` to `original` according to `action`.
///
/// * `Reject` and `Accept` return the original text with `false`.
/// * `WarnCorrect` returns the corrected text with `true`.
/// * `SilentCorrect` returns the corrected text with `false`.
///
/// The corrected text is `original` with the `start..end` span replaced by
/// `correction.corrected`.
///
/// # Panics
///
/// Panics if `correction.start` or `correction.end` is not a valid slice
/// index into `original`.
#[allow(dead_code)]
pub fn apply_correction(
    original: &str,
    correction: &DetectedCorrection,
    action: ResolvedAction,
) -> (String, bool) {
    let flag = match action {
        ResolvedAction::Reject | ResolvedAction::Accept => {
            return (original.to_string(), false)
        }
        ResolvedAction::WarnCorrect => true,
        ResolvedAction::SilentCorrect => false,
    };

    let corrected = [
        &original[..correction.start],
        correction.corrected.as_str(),
        &original[correction.end..],
    ]
    .concat();
    (corrected, flag)
}
/// Computes the Levenshtein edit distance between `a` and `b`.
///
/// The distance is counted in `char`s. Insertions, deletions and
/// substitutions each cost 1; a transposition therefore costs 2.
pub fn levenshtein_distance(a: &str, b: &str) -> usize {
    let target: Vec<char> = b.chars().collect();
    if target.is_empty() {
        return a.chars().count();
    }

    // One row of the DP table, updated in place. `diagonal` carries the value
    // from the previous row, one column to the left.
    let mut row: Vec<usize> = (0..=target.len()).collect();
    for (source_idx, source_char) in a.chars().enumerate() {
        let mut diagonal = row[0];
        row[0] = source_idx + 1;
        for (col, target_char) in target.iter().enumerate() {
            let above = row[col + 1];
            let substitution = diagonal + usize::from(source_char != *target_char);
            row[col + 1] = min(min(above + 1, row[col] + 1), substitution);
            diagonal = above;
        }
    }
    row[target.len()]
}
/// Result of a fuzzy lookup: the candidate chosen and its distance from the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FuzzyMatch {
    /// The candidate that matched, as it is spelled in the candidate list.
    pub matched: String,
    /// Edit distance between the input and the candidate.
    pub distance: usize,
}

impl FuzzyMatch {
    /// Creates a match record, taking ownership of the candidate text.
    pub fn new(matched: impl Into<String>, distance: usize) -> Self {
        let matched = matched.into();
        FuzzyMatch { matched, distance }
    }
}
/// Accession prefixes accepted as correct; also the candidate list used when
/// suggesting a fix for a mistyped prefix.
const ACCESSION_PREFIXES: &[&str] = &[
"NM_", "NP_", "NC_", "NG_", "NR_", "NW_", "NT_", "XM_", "XP_", "XR_", "ENST", "ENSG", "ENSP",
"ENSE", "LRG_",
];
/// Edit-type keywords accepted as correct; also the candidate list used when
/// suggesting a fix for a mistyped edit type.
const EDIT_TYPES: &[&str] = &["del", "ins", "dup", "inv", "con", "delins", "fs", "ext"];
/// Three-letter amino-acid codes (including `Ter` and `Xaa`) accepted as
/// correct; also the candidate list used when suggesting a fix for a mistyped code.
const AMINO_ACIDS: &[&str] = &[
"Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys", "Met",
"Phe", "Pro", "Sec", "Ser", "Thr", "Trp", "Tyr", "Val", "Ter", "Xaa",
];
/// Finds the candidate closest to `input` by Levenshtein distance.
///
/// Candidates further than `max_distance` away are ignored. When several
/// candidates share the smallest distance, the one that comes first in
/// `candidates` wins. With `case_sensitive == false`, both sides are
/// lower-cased before comparison; the returned match still carries the
/// candidate's original spelling.
///
/// Returns `None` when no candidate is within `max_distance`.
pub fn find_closest_match(
    input: &str,
    candidates: &[&str],
    max_distance: usize,
    case_sensitive: bool,
) -> Option<FuzzyMatch> {
    let normalize = |text: &str| -> String {
        if case_sensitive {
            text.to_string()
        } else {
            text.to_lowercase()
        }
    };

    let needle = normalize(input);
    candidates
        .iter()
        .map(|&candidate| {
            (
                candidate,
                levenshtein_distance(&needle, &normalize(candidate)),
            )
        })
        .filter(|&(_, distance)| distance <= max_distance)
        // `min_by_key` keeps the first of several equally small elements.
        .min_by_key(|&(_, distance)| distance)
        .map(|(candidate, distance)| FuzzyMatch::new(candidate, distance))
}
/// Suggests a known accession prefix when `input` appears to start with a
/// mistyped one, e.g. `MN_000088.3` suggests `NM_`.
///
/// Returns `None` when any of the following hold:
/// * the input, upper-cased, already starts with a known prefix;
/// * the candidate prefix is shorter than two bytes;
/// * no known prefix is within edit distance 2 of the candidate.
///
/// The candidate prefix is the text before the first ASCII digit found among
/// the first five characters, or those first five characters (fewer if the
/// input is shorter) when none of them is a digit. Matching is case-insensitive.
pub fn detect_accession_typo(input: &str) -> Option<FuzzyMatch> {
    let upper = input.to_uppercase();
    if ACCESSION_PREFIXES
        .iter()
        .any(|prefix| upper.starts_with(*prefix))
    {
        return None;
    }

    // Byte offset just past the fifth character. Measuring in characters keeps
    // the slice below on a char boundary even for non-ASCII input.
    let window_end = input
        .char_indices()
        .nth(5)
        .map_or(input.len(), |(idx, _)| idx);
    let prefix_end = input[..window_end]
        .find(|c: char| c.is_ascii_digit())
        .unwrap_or(window_end);
    if prefix_end < 2 {
        return None;
    }

    let potential_prefix = &input[..prefix_end];
    find_closest_match(potential_prefix, ACCESSION_PREFIXES, 2, false).filter(|m| m.distance > 0)
}
/// Suggests a known edit-type keyword for a token that is one edit away from it.
///
/// Returns `None` when `input`, lower-cased, already is a known keyword, or
/// when no keyword is within edit distance 1. Matching is case-insensitive.
pub fn detect_edit_type_typo(input: &str) -> Option<FuzzyMatch> {
    let lowered = input.to_lowercase();
    let already_valid = EDIT_TYPES.iter().any(|keyword| *keyword == lowered);
    if already_valid {
        None
    } else {
        find_closest_match(input, EDIT_TYPES, 1, false)
    }
}
/// Suggests a three-letter amino-acid code for a token that is one edit away from it.
///
/// Returns `None` when `input` already equals a known code ignoring ASCII
/// case, or when no code is within edit distance 1. Matching is case-insensitive.
pub fn detect_amino_acid_typo(input: &str) -> Option<FuzzyMatch> {
    let already_valid = AMINO_ACIDS
        .iter()
        .any(|code| input.eq_ignore_ascii_case(code));
    if already_valid {
        return None;
    }
    find_closest_match(input, AMINO_ACIDS, 1, false)
}
/// A "did you mean …?" suggestion for a token that looks like a misspelling
/// of a known accession prefix, edit type or amino-acid code.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TypoSuggestion {
/// The suspicious token as it appears in the input.
pub original: String,
/// The known token proposed as a replacement.
pub suggestion: String,
/// Which vocabulary the suggestion comes from.
pub token_type: TypoTokenType,
/// Offset in the input at which the token starts.
pub start: usize,
/// Offset in the input one past the end of the token.
pub end: usize,
/// Edit distance between the token and the suggestion.
pub distance: usize,
}
/// The vocabulary a typo suggestion was drawn from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TypoTokenType {
    /// A reference-sequence accession prefix such as `NM_`.
    AccessionPrefix,
    /// An edit keyword such as `del`.
    EditType,
    /// A three-letter amino-acid code such as `Val`.
    AminoAcid,
}

impl std::fmt::Display for TypoTokenType {
    /// Writes the lower-case, human-readable name of the token type.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            TypoTokenType::AccessionPrefix => "accession prefix",
            TypoTokenType::EditType => "edit type",
            TypoTokenType::AminoAcid => "amino acid",
        };
        f.write_str(label)
    }
}
impl TypoSuggestion {
    /// Renders the suggestion as a one-line "possible typo … did you mean …?" message.
    pub fn suggestion_message(&self) -> String {
        let Self {
            token_type,
            original,
            suggestion,
            ..
        } = self;
        format!(
            "possible typo in {}: '{}' → did you mean '{}'?",
            token_type, original, suggestion
        )
    }
}
/// Collects typo suggestions for a whole variant description.
///
/// Three checks run in this order, and their results are appended in that order:
/// 1. a mistyped accession prefix at the start of the input;
/// 2. mistyped edit-type keywords following a position;
/// 3. mistyped three-letter amino-acid codes, checked only when the input
///    contains `p.`.
///
/// Returns an empty vector when nothing looks suspicious.
pub fn detect_typos(input: &str) -> Vec<TypoSuggestion> {
    let mut suggestions = Vec::new();

    if let Some(fuzzy) = detect_accession_typo(input) {
        let prefix_end = input
            .char_indices()
            .take(10)
            .find(|(_, c)| c.is_ascii_digit())
            .map(|(i, _)| i)
            .unwrap_or_else(|| {
                // No digit nearby: fall back to the length of the suggested
                // prefix, clamped to the input and moved back to a char
                // boundary so the slice below cannot panic.
                let mut end = fuzzy.matched.len().min(input.len());
                while !input.is_char_boundary(end) {
                    end -= 1;
                }
                end
            });
        suggestions.push(TypoSuggestion {
            original: input[..prefix_end].to_string(),
            suggestion: fuzzy.matched,
            token_type: TypoTokenType::AccessionPrefix,
            start: 0,
            end: prefix_end,
            distance: fuzzy.distance,
        });
    }

    for (start, end, token) in regex_lite_find_edit_types(input) {
        if let Some(fuzzy) = detect_edit_type_typo(&token) {
            suggestions.push(TypoSuggestion {
                original: token,
                suggestion: fuzzy.matched,
                token_type: TypoTokenType::EditType,
                start,
                end,
                distance: fuzzy.distance,
            });
        }
    }

    // Any input containing ":p." also contains "p.", so one check suffices.
    if input.contains("p.") {
        for (start, end, token) in find_potential_amino_acids(input) {
            if let Some(fuzzy) = detect_amino_acid_typo(&token) {
                suggestions.push(TypoSuggestion {
                    original: token,
                    suggestion: fuzzy.matched,
                    token_type: TypoTokenType::AminoAcid,
                    start,
                    end,
                    distance: fuzzy.distance,
                });
            }
        }
    }

    suggestions
}
/// Finds alphabetic tokens that directly follow a position and might be a
/// misspelled edit type.
///
/// A position is a run of ASCII digits, `+` and `-` that begins with a digit.
/// The run of ASCII letters right after it is reported only when all of the
/// following hold:
/// * it is 2 to 7 letters long;
/// * it does not start with `ins`, ignoring case;
/// * it is not a known edit type;
/// * it is not a known amino-acid code.
///
/// Returns `(start, end, token)` triples, where `start..end` is the byte
/// range of the token in `input`.
fn regex_lite_find_edit_types(input: &str) -> Vec<(usize, usize, String)> {
    // Every character of interest is ASCII, so the scan runs over bytes. This
    // yields byte offsets directly and never splits a multi-byte character.
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut results = Vec::new();
    let mut i = 0;
    while i < len {
        if !bytes[i].is_ascii_digit() {
            i += 1;
            continue;
        }
        while i < len && matches!(bytes[i], b'0'..=b'9' | b'+' | b'-') {
            i += 1;
        }
        let start = i;
        while i < len && bytes[i].is_ascii_alphabetic() {
            i += 1;
        }
        let token = &input[start..i];
        if (2..=7).contains(&token.len()) {
            let lower = token.to_lowercase();
            if !lower.starts_with("ins")
                && !EDIT_TYPES.contains(&lower.as_str())
                && !AMINO_ACIDS.iter().any(|aa| aa.eq_ignore_ascii_case(token))
            {
                results.push((start, i, token.to_string()));
            }
        }
    }
    results
}
/// Finds tokens in the protein part of `input` that are shaped like
/// three-letter amino-acid codes.
///
/// Scanning starts right after the first `:p.` or, when there is none, after
/// the first `p.`. A token is one ASCII upper-case letter followed by ASCII
/// lower-case letters. Only tokens of length 3 or 4 are reported, so a
/// one-letter insertion typo such as `Vasl` is still caught.
///
/// Returns `(start, end, token)` triples, where `start..end` is the byte
/// range of the token in `input`. The result is empty when the input has no
/// `p.`.
fn find_potential_amino_acids(input: &str) -> Vec<(usize, usize, String)> {
    let scan_from = match input.find(":p.") {
        Some(pos) => pos + 3,
        None => match input.find("p.") {
            Some(pos) => pos + 2,
            None => return Vec::new(),
        },
    };

    // Tokens are pure ASCII, so a byte scan gives correct byte offsets even
    // when non-ASCII characters appear elsewhere in the input.
    let bytes = input.as_bytes();
    let mut results = Vec::new();
    let mut i = scan_from;
    while i < bytes.len() {
        if !bytes[i].is_ascii_uppercase() {
            i += 1;
            continue;
        }
        let start = i;
        i += 1;
        while i < bytes.len() && bytes[i].is_ascii_lowercase() {
            i += 1;
        }
        if (3..=4).contains(&(i - start)) {
            results.push((start, i, input[start..i].to_string()));
        }
    }
    results
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_correct_dash_en_dash() {
let (corrected, corrections) = correct_dash_characters("c.100\u{2013}200del");
assert_eq!(corrected, "c.100-200del");
assert_eq!(corrections.len(), 1);
assert_eq!(corrections[0].error_type, ErrorType::WrongDashCharacter);
}
#[test]
fn test_correct_dash_em_dash() {
let (corrected, corrections) = correct_dash_characters("c.100\u{2014}200del");
assert_eq!(corrected, "c.100-200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_dash_minus_sign() {
let (corrected, corrections) = correct_dash_characters("c.100\u{2212}200del");
assert_eq!(corrected, "c.100-200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_dash_no_change() {
let (corrected, corrections) = correct_dash_characters("c.100-200del");
assert_eq!(corrected, "c.100-200del");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_smart_single_quotes() {
let (corrected, corrections) = correct_quote_characters("c.100ins\u{2018}A\u{2019}");
assert_eq!(corrected, "c.100ins'A'");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_smart_double_quotes() {
let (corrected, corrections) = correct_quote_characters("c.100ins\u{201C}ATG\u{201D}");
assert_eq!(corrected, "c.100ins\"ATG\"");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_whitespace_trim() {
let (corrected, corrections) = correct_whitespace(" c.100A>G ");
assert_eq!(corrected, "c.100A>G");
assert!(!corrections.is_empty());
}
#[test]
fn test_correct_whitespace_no_change() {
let (corrected, _corrections) = correct_whitespace("c.100A>G");
assert_eq!(corrected, "c.100A>G");
assert_eq!(corrected, "c.100A>G");
}
#[test]
fn test_detect_position_zero() {
assert!(detect_position_zero("c.0A>G").is_some());
assert!(detect_position_zero("g.0del").is_some());
assert!(detect_position_zero("c.10A>G").is_none());
assert!(detect_position_zero("c.100A>G").is_none());
}
#[test]
fn test_correct_protein_arrow() {
let (corrected, corrections) = correct_protein_arrow("p.Val600>Glu");
assert_eq!(corrected, "p.Val600Glu");
assert_eq!(corrections.len(), 1);
assert_eq!(
corrections[0].error_type,
ErrorType::ProteinSubstitutionArrow
);
}
#[test]
fn test_correct_protein_arrow_no_change() {
let (corrected, corrections) = correct_protein_arrow("p.Val600Glu");
assert_eq!(corrected, "p.Val600Glu");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_protein_arrow_not_protein() {
let (corrected, corrections) = correct_protein_arrow("c.100A>G");
assert_eq!(corrected, "c.100A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_amino_acid_case_lowercase() {
let result = correct_amino_acid_case("val");
assert!(result.is_some());
let (corrected, error_type) = result.unwrap();
assert_eq!(corrected, "Val");
assert_eq!(error_type, ErrorType::LowercaseAminoAcid);
}
#[test]
fn test_correct_amino_acid_case_uppercase() {
let result = correct_amino_acid_case("VAL");
assert!(result.is_some());
let (corrected, _) = result.unwrap();
assert_eq!(corrected, "Val");
}
#[test]
fn test_correct_amino_acid_case_correct() {
let result = correct_amino_acid_case("Val");
assert!(result.is_none());
}
#[test]
fn test_single_to_three_letter_aa() {
assert_eq!(single_to_three_letter_aa('V'), Some("Val"));
assert_eq!(single_to_three_letter_aa('E'), Some("Glu"));
assert_eq!(single_to_three_letter_aa('*'), Some("Ter"));
assert_eq!(single_to_three_letter_aa('Z'), None);
}
#[test]
fn test_correct_accession_prefix_lowercase() {
let (corrected, corrections) = correct_accession_prefix_case("nm_000088.3");
assert_eq!(corrected, "NM_000088.3");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_accession_prefix_correct() {
let (corrected, corrections) = correct_accession_prefix_case("NM_000088.3");
assert_eq!(corrected, "NM_000088.3");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_edit_type_uppercase() {
let result = correct_edit_type_case("DEL");
assert!(result.is_some());
let (corrected, _) = result.unwrap();
assert_eq!(corrected, "del");
}
#[test]
fn test_correct_edit_type_mixed() {
let result = correct_edit_type_case("Del");
assert!(result.is_some());
let (corrected, _) = result.unwrap();
assert_eq!(corrected, "del");
}
#[test]
fn test_correct_edit_type_correct() {
let result = correct_edit_type_case("del");
assert!(result.is_none());
}
#[test]
fn test_detected_correction_warning_message() {
let correction =
DetectedCorrection::new(ErrorType::WrongDashCharacter, "\u{2013}", "-", 5, 8);
let msg = correction.warning_message();
assert!(msg.contains("wrong dash character"));
assert!(msg.contains("position 5"));
}
#[test]
fn test_detect_missing_version_no_version() {
let result = detect_missing_version("NM_000088:c.100A>G");
assert!(result.is_some());
let (pos, acc) = result.unwrap();
assert_eq!(pos, 0);
assert_eq!(acc, "NM_000088");
}
#[test]
fn test_detect_missing_version_with_version() {
let result = detect_missing_version("NM_000088.3:c.100A>G");
assert!(result.is_none());
}
#[test]
fn test_detect_missing_version_ensembl() {
let result = detect_missing_version("ENST00000123456:c.100A>G");
assert!(result.is_none());
}
#[test]
fn test_detect_missing_version_nc() {
let result = detect_missing_version("NC_000001:g.12345A>G");
assert!(result.is_some());
}
#[test]
fn test_detect_swapped_positions_swapped() {
let result = detect_swapped_positions("c.200_100del");
assert!(result.is_some());
let correction = result.unwrap();
assert_eq!(correction.error_type, ErrorType::SwappedPositions);
assert_eq!(correction.original, "200_100");
assert_eq!(correction.corrected, "100_200");
}
#[test]
fn test_detect_swapped_positions_correct() {
let result = detect_swapped_positions("c.100_200del");
assert!(result.is_none());
}
#[test]
fn test_detect_swapped_positions_with_accession() {
let result = detect_swapped_positions("NM_000088.3:c.500_100del");
assert!(result.is_some());
let correction = result.unwrap();
assert_eq!(correction.original, "500_100");
assert_eq!(correction.corrected, "100_500");
}
#[test]
fn test_detect_swapped_positions_genomic() {
let result = detect_swapped_positions("g.2000_1000del");
assert!(result.is_some());
}
#[test]
fn test_detect_swapped_positions_negative() {
let result = detect_swapped_positions("c.-10_-50del");
assert!(result.is_some());
}
#[test]
fn test_correct_swapped_positions() {
let (corrected, corrections) = correct_swapped_positions("c.200_100del");
assert_eq!(corrected, "c.100_200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_swapped_positions_no_change() {
let (corrected, corrections) = correct_swapped_positions("c.100_200del");
assert_eq!(corrected, "c.100_200del");
assert!(corrections.is_empty());
}
#[test]
fn test_levenshtein_distance_identical() {
assert_eq!(levenshtein_distance("del", "del"), 0);
assert_eq!(levenshtein_distance("", ""), 0);
}
#[test]
fn test_levenshtein_distance_one_empty() {
assert_eq!(levenshtein_distance("", "abc"), 3);
assert_eq!(levenshtein_distance("abc", ""), 3);
}
#[test]
fn test_levenshtein_distance_substitution() {
assert_eq!(levenshtein_distance("del", "dek"), 1);
assert_eq!(levenshtein_distance("cat", "bat"), 1);
}
#[test]
fn test_levenshtein_distance_insertion() {
assert_eq!(levenshtein_distance("del", "delx"), 1);
assert_eq!(levenshtein_distance("abc", "abcd"), 1);
}
#[test]
fn test_levenshtein_distance_deletion() {
assert_eq!(levenshtein_distance("delx", "del"), 1);
assert_eq!(levenshtein_distance("abcd", "abc"), 1);
}
#[test]
fn test_levenshtein_distance_transposition() {
assert_eq!(levenshtein_distance("ab", "ba"), 2);
assert_eq!(levenshtein_distance("NM_", "MN_"), 2);
}
#[test]
fn test_levenshtein_distance_classic_example() {
assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
}
#[test]
fn test_find_closest_match_exact() {
let candidates = vec!["del", "ins", "dup"];
let result = find_closest_match("del", &candidates, 1, true);
assert!(result.is_some());
assert_eq!(result.unwrap().distance, 0);
}
#[test]
fn test_find_closest_match_typo() {
let candidates = vec!["del", "ins", "dup"];
let result = find_closest_match("dek", &candidates, 1, true);
assert!(result.is_some());
let m = result.unwrap();
assert_eq!(m.matched, "del");
assert_eq!(m.distance, 1);
}
#[test]
fn test_find_closest_match_case_insensitive() {
let candidates = vec!["Val", "Glu", "Ala"];
let result = find_closest_match("VAL", &candidates, 0, false);
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "Val");
}
#[test]
fn test_find_closest_match_no_match() {
let candidates = vec!["del", "ins", "dup"];
let result = find_closest_match("xyz", &candidates, 1, true);
assert!(result.is_none());
}
#[test]
fn test_detect_accession_typo_transposed() {
let result = detect_accession_typo("MN_000088.3");
assert!(result.is_some());
let m = result.unwrap();
assert_eq!(m.matched, "NM_");
}
#[test]
fn test_detect_accession_typo_correct() {
let result = detect_accession_typo("NM_000088.3");
assert!(result.is_none());
}
#[test]
fn test_detect_accession_typo_nc() {
let result = detect_accession_typo("CN_000001.10");
assert!(result.is_some());
let matched = result.unwrap().matched;
assert!(matched == "NM_" || matched == "NC_");
}
#[test]
fn test_detect_accession_typo_ensembl() {
let result = detect_accession_typo("ESNT00000123456");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "ENST");
}
#[test]
fn test_detect_edit_type_typo_dek() {
let result = detect_edit_type_typo("dek");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "del");
}
#[test]
fn test_detect_edit_type_typo_inx() {
let result = detect_edit_type_typo("inx");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "ins");
}
#[test]
fn test_detect_edit_type_typo_correct() {
assert!(detect_edit_type_typo("del").is_none());
assert!(detect_edit_type_typo("ins").is_none());
assert!(detect_edit_type_typo("dup").is_none());
}
#[test]
fn test_detect_amino_acid_typo_vasl() {
let result = detect_amino_acid_typo("Vasl");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "Val");
}
#[test]
fn test_detect_amino_acid_typo_gul() {
let result = detect_amino_acid_typo("Gul");
assert!(result.is_none());
}
#[test]
fn test_detect_amino_acid_typo_single_char_error() {
let result = detect_amino_acid_typo("Vak");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "Val");
}
#[test]
fn test_detect_amino_acid_typo_correct() {
assert!(detect_amino_acid_typo("Val").is_none());
assert!(detect_amino_acid_typo("Glu").is_none());
assert!(detect_amino_acid_typo("Ala").is_none());
}
#[test]
fn test_detect_typos_accession_prefix() {
let typos = detect_typos("MN_000088.3:c.100A>G");
assert!(!typos.is_empty());
assert_eq!(typos[0].token_type, TypoTokenType::AccessionPrefix);
assert_eq!(typos[0].suggestion, "NM_");
}
#[test]
fn test_detect_typos_no_typos() {
let typos = detect_typos("NM_000088.3:c.100A>G");
assert!(typos.is_empty());
}
/// A misspelled edit keyword (`dek`) inside a full variant string must be
/// reported as an `EditType` typo suggesting `del`.
#[test]
fn test_detect_typos_edit_type() {
    let typos = detect_typos("NM_000088.3:c.100dek");
    // A successful `find` already implies the list is non-empty, so a single
    // `expect` covers both the emptiness check and the `is_some()` + `unwrap()` pair.
    let edit_typo = typos
        .iter()
        .find(|t| t.token_type == TypoTokenType::EditType)
        .expect("`NM_000088.3:c.100dek` should yield an edit-type typo");
    assert_eq!(edit_typo.suggestion, "del");
}
/// In a protein variant string, the first reported typo must be the amino acid
/// `Vasl` with `Val` as its suggestion.
#[test]
fn test_detect_typos_amino_acid() {
    let found = detect_typos("p.Vasl600Glu");
    assert!(!found.is_empty());

    let first = &found[0];
    assert_eq!(first.token_type, TypoTokenType::AminoAcid);
    assert_eq!(first.suggestion, "Val");
}
/// The rendered suggestion message must mention the token kind, the original
/// text and the suggested replacement.
#[test]
fn test_typo_suggestion_message() {
    let typo = TypoSuggestion {
        token_type: TypoTokenType::AccessionPrefix,
        original: String::from("MN_"),
        suggestion: String::from("NM_"),
        distance: 2,
        start: 0,
        end: 3,
    };

    let rendered = typo.suggestion_message();
    for fragment in ["accession prefix", "MN_", "NM_"] {
        assert!(rendered.contains(fragment));
    }
}
/// Each `TypoTokenType` variant must display as its lowercase, human-readable label.
#[test]
fn test_typo_token_type_display() {
    let labelled = [
        (TypoTokenType::AccessionPrefix, "accession prefix"),
        (TypoTokenType::EditType, "edit type"),
        (TypoTokenType::AminoAcid, "amino acid"),
    ];
    for (variant, label) in labelled {
        assert_eq!(variant.to_string(), label);
    }
}
/// `FuzzyMatch::new` must store the matched text and the distance it was given.
#[test]
fn test_fuzzy_match_struct() {
    let (text, edits) = ("del", 1);
    let built = FuzzyMatch::new(text, edits);
    assert_eq!(built.matched, text);
    assert_eq!(built.distance, edits);
}
/// A trailing ` (p.Lys236=)` annotation is removed and logged as a single
/// `TrailingAnnotation` correction.
#[test]
fn test_strip_trailing_annotation_synonymous() {
    let annotated = "NM_003467.3(CXCR4):c.708G>A (p.Lys236=)";
    let (stripped, fixes) = strip_trailing_annotation(annotated);

    assert_eq!(stripped, "NM_003467.3(CXCR4):c.708G>A");
    assert_eq!(fixes.len(), 1);
    assert_eq!(fixes[0].error_type, ErrorType::TrailingAnnotation);
}
/// A trailing ` (p.Val695Ile)` annotation is removed with exactly one correction.
#[test]
fn test_strip_trailing_annotation_missense() {
    let annotated = "NM_021831.6(AGBL5):c.2083G>A (p.Val695Ile)";
    let (stripped, fixes) = strip_trailing_annotation(annotated);

    assert_eq!(stripped, "NM_021831.6(AGBL5):c.2083G>A");
    assert_eq!(fixes.len(), 1);
}
/// A trailing ` (p.Asn366fs)` annotation is removed with exactly one correction.
#[test]
fn test_strip_trailing_annotation_frameshift() {
    let annotated = "NM_178127.5(ANGPTL5):c.1097dup (p.Asn366fs)";
    let (stripped, fixes) = strip_trailing_annotation(annotated);

    assert_eq!(stripped, "NM_178127.5(ANGPTL5):c.1097dup");
    assert_eq!(fixes.len(), 1);
}
/// The annotation is also removed when no space separates it from the variant.
#[test]
fn test_strip_trailing_annotation_no_space() {
    let annotated = "NM_000088.3:c.459A>G(p.Lys153=)";
    let (stripped, fixes) = strip_trailing_annotation(annotated);

    assert_eq!(stripped, "NM_000088.3:c.459A>G");
    assert_eq!(fixes.len(), 1);
}
/// Input without a trailing annotation is returned unchanged with no corrections.
#[test]
fn test_strip_trailing_annotation_no_annotation() {
    let plain = "NM_000088.3:c.459A>G";
    let (stripped, fixes) = strip_trailing_annotation(plain);

    assert_eq!(stripped, plain);
    assert!(fixes.is_empty());
}
/// The parenthesised form `p.(Val600Glu)` must be left untouched: it is not
/// treated as a trailing annotation.
#[test]
fn test_strip_trailing_annotation_valid_uncertain() {
    let uncertain = "NP_000079.2:p.(Val600Glu)";
    let (stripped, fixes) = strip_trailing_annotation(uncertain);

    assert_eq!(stripped, uncertain);
    assert!(fixes.is_empty());
}
/// A plain protein variant with no parentheses is returned unchanged.
#[test]
fn test_strip_trailing_annotation_protein_variant() {
    let protein = "NP_000079.2:p.Val600Glu";
    let (stripped, fixes) = strip_trailing_annotation(protein);

    assert_eq!(stripped, protein);
    assert!(fixes.is_empty());
}
/// Table-driven check: every `<variant> (p.…)` input must be reduced to the
/// bare variant, with exactly one correction recorded for each.
#[test]
fn test_strip_trailing_annotation_all_parse_gaps() {
    // Each row is (annotated input, expected output after stripping).
    const CASES: &[(&str, &str)] = &[
        ("NM_003467.3(CXCR4):c.708G>A (p.Lys236=)", "NM_003467.3(CXCR4):c.708G>A"),
        ("NM_001267550.2(TTN):c.30570C>A (p.Thr10190=)", "NM_001267550.2(TTN):c.30570C>A"),
        ("NM_000199.5(SGSH):c.1428C>T (p.His476=)", "NM_000199.5(SGSH):c.1428C>T"),
        ("NM_001366385.1(CARD14):c.27C>T (p.Ser9=)", "NM_001366385.1(CARD14):c.27C>T"),
        ("NM_001267550.2(TTN):c.49476T>C (p.Pro16492=)", "NM_001267550.2(TTN):c.49476T>C"),
        ("NM_198253.3(TERT):c.603G>A (p.Arg201=)", "NM_198253.3(TERT):c.603G>A"),
        ("NM_178127.5(ANGPTL5):c.1097dup (p.Asn366fs)", "NM_178127.5(ANGPTL5):c.1097dup"),
        ("NM_032303.5(HSDL2):c.894A>G (p.Lys298=)", "NM_032303.5(HSDL2):c.894A>G"),
        ("NM_021831.6(AGBL5):c.2083G>A (p.Val695Ile)", "NM_021831.6(AGBL5):c.2083G>A"),
        ("NM_201555.2(FHL2):c.507C>T (p.Ile169=)", "NM_201555.2(FHL2):c.507C>T"),
        ("NM_025137.4(SPG11):c.1821A>G (p.Ser607=)", "NM_025137.4(SPG11):c.1821A>G"),
        ("NM_025137.4(SPG11):c.6892A>G (p.Ile2298Val)", "NM_025137.4(SPG11):c.6892A>G"),
        ("NM_001148.6(ANK2):c.8484T>C (p.Asp2828=)", "NM_001148.6(ANK2):c.8484T>C"),
        ("NM_001148.6(ANK2):c.6492T>G (p.Leu2164=)", "NM_001148.6(ANK2):c.6492T>G"),
        ("NM_001148.6(ANK2):c.231G>A (p.Val77=)", "NM_001148.6(ANK2):c.231G>A"),
        ("NM_000059.3(BRCA2):c.3570G>C (p.Arg1190=)", "NM_000059.3(BRCA2):c.3570G>C"),
        ("NM_030773.4(TUBB1):c.1045G>A (p.Val349Ile)", "NM_030773.4(TUBB1):c.1045G>A"),
    ];

    for &(annotated, bare) in CASES {
        let (stripped, fixes) = strip_trailing_annotation(annotated);
        assert_eq!(stripped, bare, "Failed for input: {}", annotated);
        assert_eq!(fixes.len(), 1, "No correction for: {}", annotated);
    }
}
/// An `NC_` accession followed directly by a position gains a `g.` prefix,
/// logged as one `MissingCoordinatePrefix` correction.
#[test]
fn test_correct_missing_coordinate_prefix_nc() {
    let unprefixed = "NC_000017.11:12345A>G";
    let (fixed, log) = correct_missing_coordinate_prefix(unprefixed);

    assert_eq!(fixed, "NC_000017.11:g.12345A>G");
    assert_eq!(log.len(), 1);
    assert_eq!(log[0].error_type, ErrorType::MissingCoordinatePrefix);
}
/// The `g.` prefix is also inserted when the position starts with a
/// parenthesised `(?_…)` range.
#[test]
fn test_correct_missing_coordinate_prefix_uncertain_range() {
    let unprefixed = "NC_000017.11:(?_31094927)_(31377677_?)del";
    let (fixed, log) = correct_missing_coordinate_prefix(unprefixed);

    assert_eq!(fixed, "NC_000017.11:g.(?_31094927)_(31377677_?)del");
    assert_eq!(log.len(), 1);
}
/// An `NG_` accession without a coordinate prefix gains `g.` with one correction.
#[test]
fn test_correct_missing_coordinate_prefix_ng() {
    let unprefixed = "NG_007489.1:100_200del";
    let (fixed, log) = correct_missing_coordinate_prefix(unprefixed);

    assert_eq!(fixed, "NG_007489.1:g.100_200del");
    assert_eq!(log.len(), 1);
}
/// Input that already carries `g.` is returned unchanged with no corrections.
#[test]
fn test_correct_missing_coordinate_prefix_already_present() {
    let prefixed = "NC_000017.11:g.12345A>G";
    let (fixed, log) = correct_missing_coordinate_prefix(prefixed);

    assert_eq!(fixed, prefixed);
    assert!(log.is_empty());
}
/// An `NM_` accession without a prefix is left alone: no prefix is inserted
/// and no correction is recorded.
#[test]
fn test_correct_missing_coordinate_prefix_nm() {
    let transcript = "NM_000088.3:459A>G";
    let (fixed, log) = correct_missing_coordinate_prefix(transcript);

    assert_eq!(fixed, transcript);
    assert!(log.is_empty());
}
/// A bare `LRG_<n>` accession without a prefix gains `g.` with one correction.
#[test]
fn test_correct_missing_coordinate_prefix_lrg() {
    let unprefixed = "LRG_292:100_200del";
    let (fixed, log) = correct_missing_coordinate_prefix(unprefixed);

    assert_eq!(fixed, "LRG_292:g.100_200del");
    assert_eq!(log.len(), 1);
}
/// An `LRG_<n>t<m>` accession is left alone: no prefix is inserted and no
/// correction is recorded.
#[test]
fn test_correct_missing_coordinate_prefix_lrg_transcript() {
    let transcript = "LRG_292t1:100_200del";
    let (fixed, log) = correct_missing_coordinate_prefix(transcript);

    assert_eq!(fixed, transcript);
    assert!(log.is_empty());
}
/// The `g.` prefix is inserted ahead of a bracketed `[...]` variant as well.
#[test]
fn test_correct_missing_coordinate_prefix_bracket() {
    let unprefixed = "NC_000004.12:[144539078A>G]";
    let (fixed, log) = correct_missing_coordinate_prefix(unprefixed);

    assert_eq!(fixed, "NC_000004.12:g.[144539078A>G]");
    assert_eq!(log.len(), 1);
}
}