use super::types::{ErrorType, ResolvedAction};
use std::cmp::min;
/// One problem found in an input string, together with its proposed replacement.
///
/// `start..end` is the span of `original` in the text that was scanned; the
/// detectors in this module fill it with byte offsets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectedCorrection {
    /// Category of the detected problem.
    pub error_type: ErrorType,
    /// The text exactly as it appeared in the input.
    pub original: String,
    /// The text that should replace `original` (may be empty).
    pub corrected: String,
    /// Offset of the first byte of the span.
    pub start: usize,
    /// Offset one past the last byte of the span.
    pub end: usize,
}

impl DetectedCorrection {
    /// Builds a correction record, converting both text arguments into owned strings.
    pub fn new(
        error_type: ErrorType,
        original: impl Into<String>,
        corrected: impl Into<String>,
        start: usize,
        end: usize,
    ) -> Self {
        let original = original.into();
        let corrected = corrected.into();
        Self {
            error_type,
            original,
            corrected,
            start,
            end,
        }
    }

    /// Renders a one-line, human-readable description of this correction.
    pub fn warning_message(&self) -> String {
        let Self {
            error_type,
            original,
            corrected,
            start,
            ..
        } = self;
        format!(
            "{} at position {}: '{}' → '{}'",
            error_type, start, original, corrected
        )
    }
}
/// Replaces typographic dashes with the ASCII hyphen-minus.
///
/// EN DASH (U+2013), EM DASH (U+2014) and MINUS SIGN (U+2212) each become `-`.
/// Every replacement is reported with the byte span of the original character
/// in `input`.
pub fn correct_dash_characters(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut corrections = Vec::new();
    let result: String = input
        .char_indices()
        .map(|(offset, ch)| {
            if !matches!(ch, '\u{2013}' | '\u{2014}' | '\u{2212}') {
                return ch;
            }
            corrections.push(DetectedCorrection::new(
                ErrorType::WrongDashCharacter,
                ch.to_string(),
                "-",
                offset,
                offset + ch.len_utf8(),
            ));
            '-'
        })
        .collect();
    (result, corrections)
}
/// Replaces curly quotes with their ASCII counterparts.
///
/// U+2018/U+2019 become `'` and U+201C/U+201D become `"`. Every replacement is
/// reported with the byte span of the original character in `input`.
pub fn correct_quote_characters(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut result = String::with_capacity(input.len());
    let mut corrections = Vec::new();
    for (offset, ch) in input.char_indices() {
        let ascii_quote = match ch {
            '\u{2018}' | '\u{2019}' => Some('\''),
            '\u{201C}' | '\u{201D}' => Some('"'),
            _ => None,
        };
        match ascii_quote {
            Some(quote) => {
                corrections.push(DetectedCorrection::new(
                    ErrorType::WrongQuoteCharacter,
                    ch.to_string(),
                    quote.to_string(),
                    offset,
                    offset + ch.len_utf8(),
                ));
                result.push(quote);
            }
            None => result.push(ch),
        }
    }
    (result, corrections)
}
/// Returns `true` for Unicode whitespace and for the zero-width characters
/// U+200B, U+200C, U+200D and U+FEFF.
#[inline]
fn is_invisible_whitespace(c: char) -> bool {
    match c {
        // U+200B..=U+200D are exactly ZWSP, ZWNJ and ZWJ.
        '\u{200B}'..='\u{200D}' | '\u{FEFF}' => true,
        other => other.is_whitespace(),
    }
}
/// Removes every run of whitespace / zero-width characters from `input`.
///
/// Each maximal run (as classified by `is_invisible_whitespace`) is dropped
/// from the output and reported as one correction whose span is the byte range
/// of the run and whose replacement is the empty string.
pub fn correct_whitespace(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut kept = String::with_capacity(input.len());
    let mut corrections = Vec::new();
    let mut pending_run: Option<usize> = None;

    // A trailing `None` sentinel at `input.len()` closes a run that reaches the
    // end of the input, so one code path handles both cases.
    let end_sentinel = std::iter::once((input.len(), None));
    let items = input
        .char_indices()
        .map(|(offset, ch)| (offset, Some(ch)))
        .chain(end_sentinel);

    for (offset, item) in items {
        match item {
            Some(ch) if is_invisible_whitespace(ch) => {
                if pending_run.is_none() {
                    pending_run = Some(offset);
                }
            }
            _ => {
                if let Some(run_start) = pending_run.take() {
                    corrections.push(DetectedCorrection::new(
                        ErrorType::ExtraWhitespace,
                        &input[run_start..offset],
                        "",
                        run_start,
                        offset,
                    ));
                }
                if let Some(ch) = item {
                    kept.push(ch);
                }
            }
        }
    }
    (kept, corrections)
}
/// Finds the first `.0` that is not the start of a longer number.
///
/// Returns the byte offset of the `0` when a `.` is followed by `0` and that
/// `0` is followed by a non-digit or by the end of the input; otherwise `None`.
pub fn detect_position_zero(input: &str) -> Option<usize> {
    let bytes = input.as_bytes();
    input
        .match_indices('.')
        .map(|(dot, _)| dot)
        .find(|&dot| {
            let zero_follows = bytes.get(dot + 1) == Some(&b'0');
            let digit_after_zero = bytes.get(dot + 2).map_or(false, |b| b.is_ascii_digit());
            zero_follows && !digit_after_zero
        })
        .map(|dot| dot + 1)
}
pub fn correct_protein_arrow(input: &str) -> (String, Vec<DetectedCorrection>) {
let mut corrections = Vec::new();
if let Some(arrow_pos) = input.find('>') {
if input.starts_with("p.") || input.contains(":p.") {
let after_arrow = &input[arrow_pos + 1..];
if after_arrow.len() >= 3 {
let potential_aa = &after_arrow[..3];
if is_amino_acid_like(potential_aa) {
let corrected = format!("{}{}", &input[..arrow_pos], &input[arrow_pos + 1..]);
corrections.push(DetectedCorrection::new(
ErrorType::ProteinSubstitutionArrow,
">",
"",
arrow_pos,
arrow_pos + 1,
));
return (corrected, corrections);
}
}
}
}
(input.to_string(), corrections)
}
/// Rewrites deprecated `*` / `X` stop-codon spellings inside protein descriptions to `Ter`.
///
/// Inputs that neither start with `p.` nor contain `:p.` or `[p.` are returned
/// unchanged. Otherwise the input is scanned byte by byte and two shapes are rewritten,
/// each only when `in_protein_segment` accepts the position of the `*` / `X`:
///
/// * `fs*<digits>` / `fsX<digits>` becomes `fsTer<digits>`;
/// * a `*` or `X` that directly follows a digit becomes `Ter`.
///
/// Returns the rewritten string and one correction per rewrite; spans are byte offsets
/// into `input`.
pub fn correct_deprecated_protein_forms(input: &str) -> (String, Vec<DetectedCorrection>) {
let mut corrections = Vec::new();
// Cheap gate: nothing to do unless a protein description marker is present.
if !input.starts_with("p.") && !input.contains(":p.") && !input.contains("[p.") {
return (input.to_string(), corrections);
}
let bytes = input.as_bytes();
let mut out = String::with_capacity(input.len());
let mut i = 0;
while i < bytes.len() {
let c = bytes[i];
// Case 1: "fs" + ('*' | 'X') + at least one digit, inside a protein segment.
if c == b'f'
&& i + 1 < bytes.len()
&& bytes[i + 1] == b's'
&& i + 2 < bytes.len()
&& (bytes[i + 2] == b'*' || bytes[i + 2] == b'X')
&& i + 3 < bytes.len()
&& bytes[i + 3].is_ascii_digit()
&& in_protein_segment(input, i + 2)
{
let star_or_x = bytes[i + 2];
let digits_start = i + 3;
let mut digits_end = digits_start;
// Consume the whole digit run that follows the '*' / 'X'.
while digits_end < bytes.len() && bytes[digits_end].is_ascii_digit() {
digits_end += 1;
}
let digits = &input[digits_start..digits_end];
// The reported span starts at the '*' / 'X'; the leading "fs" is kept as-is.
let original = &input[i + 2..digits_end]; let corrected = format!("Ter{}", digits);
let error_type = if star_or_x == b'*' {
ErrorType::DeprecatedFrameshiftStar
} else {
ErrorType::DeprecatedFrameshiftX
};
corrections.push(DetectedCorrection::new(
error_type,
original.to_string(),
corrected.clone(),
i + 2,
digits_end,
));
out.push_str("fs");
out.push_str(&corrected);
i = digits_end;
continue;
}
// Case 2: a lone '*' / 'X' directly after a digit.
if (c == b'*' || c == b'X') && i > 0 && bytes[i - 1].is_ascii_digit() {
let next_is_digit = i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit();
let next_is_alpha = i + 1 < bytes.len() && bytes[i + 1].is_ascii_alphabetic();
// An 'X' followed by a letter is left alone (presumably so that spellings
// such as "Xaa" are not mangled — confirm against callers).
let invalid_for_x = c == b'X' && next_is_alpha;
if !next_is_digit && !invalid_for_x && in_protein_segment(input, i) {
let error_type = if c == b'*' {
ErrorType::DeprecatedStopCodonStar
} else {
ErrorType::DeprecatedStopCodonX
};
corrections.push(DetectedCorrection::new(
error_type,
(c as char).to_string(),
"Ter".to_string(),
i,
i + 1,
));
out.push_str("Ter");
i += 1;
continue;
}
}
// Default: copy one character through unchanged. ASCII bytes are pushed directly;
// anything else is copied as a whole UTF-8 character so the output stays valid.
if c.is_ascii() {
out.push(c as char);
i += 1;
} else {
let ch = input[i..]
.chars()
.next()
.expect("input is valid UTF-8 and i is at a char boundary");
let len = ch.len_utf8();
out.push_str(&input[i..i + len]);
i += len;
}
}
(out, corrections)
}
/// Decides whether byte offset `pos` lies inside a `p.` description.
///
/// Walks backwards from `pos`. A `p.` ending at the cursor means "yes"; a `:`
/// immediately before the cursor means "no". If neither is met before the
/// cursor drops below offset 2, the answer is whether `input` starts with `p.`.
fn in_protein_segment(input: &str, pos: usize) -> bool {
    let bytes = input.as_bytes();
    for cursor in (2..=pos).rev() {
        match (bytes[cursor - 2], bytes[cursor - 1]) {
            (b'p', b'.') => return true,
            (_, b':') => return false,
            _ => {}
        }
    }
    input.starts_with("p.")
}
/// Returns `true` when the first three bytes of `s` are all ASCII letters.
///
/// Strings shorter than three bytes never qualify.
fn is_amino_acid_like(s: &str) -> bool {
    s.as_bytes()
        .get(..3)
        .map_or(false, |head| head.iter().all(u8::is_ascii_alphabetic))
}
/// Returns the canonical capitalisation of a three-letter amino-acid code.
///
/// `token` is compared case-insensitively against the known codes (including
/// `Ter` and `Xaa`). The result is `Some((canonical, LowercaseAminoAcid))` only
/// when the token names a known code but is not already spelled canonically;
/// otherwise `None`.
pub fn correct_amino_acid_case(token: &str) -> Option<(String, ErrorType)> {
    const CANONICAL: [&str; 23] = [
        "Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys",
        "Met", "Phe", "Pro", "Sec", "Ser", "Thr", "Trp", "Tyr", "Val", "Ter", "Xaa",
    ];
    // `lowered` contains no ASCII uppercase letters, so an ASCII case-insensitive
    // comparison against a canonical code is the same as comparing it with the
    // all-lowercase spelling of that code.
    let lowered = token.to_lowercase();
    CANONICAL
        .iter()
        .copied()
        .find(|name| name.eq_ignore_ascii_case(&lowered))
        .filter(|name| *name != token)
        .map(|name| (name.to_string(), ErrorType::LowercaseAminoAcid))
}
pub fn correct_amino_acid_case_in_protein(input: &str) -> (String, Vec<DetectedCorrection>) {
let mut corrections = Vec::new();
let Some(start) = find_protein_segment_start(input) else {
return (input.to_string(), corrections);
};
let prefix = &input[..start];
let body = &input[start..];
let mut out = String::with_capacity(input.len());
out.push_str(prefix);
let bytes = body.as_bytes();
let len = bytes.len();
let mut i = 0usize;
while i < len {
let c = bytes[i];
if c.is_ascii_alphabetic() {
let mut run_end = i;
while run_end < len && bytes[run_end].is_ascii_alphabetic() {
run_end += 1;
}
let mut j = i;
while j + 3 <= run_end {
let token = &body[j..j + 3];
if let Some((canonical, error_type)) = correct_amino_acid_case(token) {
let abs_start = start + j;
corrections.push(DetectedCorrection::new(
error_type,
token.to_string(),
canonical.clone(),
abs_start,
abs_start + 3,
));
out.push_str(&canonical);
j += 3;
} else {
out.push(bytes[j] as char);
j += 1;
}
}
while j < run_end {
out.push(bytes[j] as char);
j += 1;
}
i = run_end;
} else if c.is_ascii() {
out.push(c as char);
i += 1;
} else {
let ch_len = utf8_char_len(c);
out.push_str(&body[i..i + ch_len]);
i += ch_len;
}
}
(out, corrections)
}
/// Maps a byte to a UTF-8 sequence length based purely on its value.
///
/// Bytes below `0xC0` (ASCII and continuation bytes) map to 1, `0xC0..=0xDF`
/// to 2, `0xE0..=0xEF` to 3 and everything from `0xF0` upwards to 4.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        _ => 4,
    }
}
/// Expands single-letter amino-acid codes in a protein description to three-letter codes.
///
/// Only the text after the protein marker located by `find_protein_segment_start`
/// is examined; inputs without such a marker are returned unchanged. At each
/// ASCII uppercase letter:
///
/// * if the next three bytes form a canonical three-letter code, it is copied
///   through untouched;
/// * otherwise, if `single_to_three_letter_aa` knows the letter, it is replaced
///   by the three-letter code and reported;
/// * otherwise the letter is copied as-is.
///
/// Correction spans are byte offsets into `input`.
pub fn correct_single_letter_aa_in_protein(input: &str) -> (String, Vec<DetectedCorrection>) {
    let mut corrections = Vec::new();
    let Some(start) = find_protein_segment_start(input) else {
        return (input.to_string(), corrections);
    };
    let body = &input[start..];
    let bytes = body.as_bytes();
    let len = bytes.len();

    let mut out = String::with_capacity(input.len());
    out.push_str(&input[..start]);

    let mut i = 0usize;
    while i < len {
        let c = bytes[i];
        if c.is_ascii_uppercase() {
            // `get` returns `None` when fewer than three bytes remain or when
            // `i + 3` falls inside a multi-byte character; indexing with
            // `[i..i + 3]` would panic in the latter case.
            let canonical = body
                .get(i..i + 3)
                .filter(|token| is_canonical_three_letter_aa(token));
            if let Some(token) = canonical {
                out.push_str(token);
                i += 3;
                continue;
            }
            if let Some(three) = single_to_three_letter_aa(c as char) {
                let abs_start = start + i;
                corrections.push(DetectedCorrection::new(
                    ErrorType::SingleLetterAminoAcid,
                    (c as char).to_string(),
                    three,
                    abs_start,
                    abs_start + 1,
                ));
                out.push_str(three);
                i += 1;
                continue;
            }
        }
        if c.is_ascii() {
            out.push(c as char);
            i += 1;
        } else {
            // Copy a multi-byte character whole so the output stays valid UTF-8.
            let ch_len = utf8_char_len(c);
            out.push_str(&body[i..i + ch_len]);
            i += ch_len;
        }
    }
    (out, corrections)
}
/// Returns the byte offset just past the protein marker, if any.
///
/// The first `:p.` anywhere in `input` wins; failing that, a leading `p.` is
/// accepted. `None` means the input has neither.
fn find_protein_segment_start(input: &str) -> Option<usize> {
    input
        .find(":p.")
        .map(|marker| marker + 3)
        .or_else(|| input.starts_with("p.").then(|| 2))
}
/// Returns `true` when `token` is exactly one of the canonically-cased
/// three-letter amino-acid codes (including `Ter` and `Xaa`).
fn is_canonical_three_letter_aa(token: &str) -> bool {
    const CODES: [&str; 23] = [
        "Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys",
        "Met", "Phe", "Pro", "Sec", "Ser", "Thr", "Trp", "Tyr", "Val", "Ter", "Xaa",
    ];
    CODES.contains(&token)
}
/// Looks up the three-letter code for a single-letter amino-acid code.
///
/// The lookup is ASCII case-insensitive. `*` maps to `Ter` and `X` to `Xaa`;
/// characters without an entry yield `None`.
pub fn single_to_three_letter_aa(single: char) -> Option<&'static str> {
    const TABLE: [(char, &str); 23] = [
        ('A', "Ala"),
        ('R', "Arg"),
        ('N', "Asn"),
        ('D', "Asp"),
        ('C', "Cys"),
        ('Q', "Gln"),
        ('E', "Glu"),
        ('G', "Gly"),
        ('H', "His"),
        ('I', "Ile"),
        ('L', "Leu"),
        ('K', "Lys"),
        ('M', "Met"),
        ('F', "Phe"),
        ('P', "Pro"),
        ('U', "Sec"),
        ('S', "Ser"),
        ('T', "Thr"),
        ('W', "Trp"),
        ('Y', "Tyr"),
        ('V', "Val"),
        ('*', "Ter"),
        ('X', "Xaa"),
    ];
    let key = single.to_ascii_uppercase();
    TABLE
        .iter()
        .find(|(code, _)| *code == key)
        .map(|&(_, name)| name)
}
/// Upper-cases a recognised accession prefix at the start of `input`.
///
/// The known prefixes are tried in table order. The first one that `input`
/// starts with case-insensitively, but not already in canonical upper case, is
/// replaced by its canonical form and reported with the span `0..prefix_len`.
/// If no prefix needs fixing the input is returned unchanged.
pub fn correct_accession_prefix_case(input: &str) -> (String, Vec<DetectedCorrection>) {
    const PREFIXES: [(&str, &str); 11] = [
        ("nm_", "NM_"),
        ("np_", "NP_"),
        ("nc_", "NC_"),
        ("ng_", "NG_"),
        ("nr_", "NR_"),
        ("xm_", "XM_"),
        ("xp_", "XP_"),
        ("xr_", "XR_"),
        ("enst", "ENST"),
        ("ensp", "ENSP"),
        ("lrg_", "LRG_"),
    ];
    // Lower-case the input once instead of once per table entry.
    let lowered = input.to_lowercase();
    for (lower, correct) in PREFIXES {
        if lowered.starts_with(lower) && !input.starts_with(correct) {
            let corrected = format!("{}{}", correct, &input[lower.len()..]);
            let correction = DetectedCorrection::new(
                ErrorType::LowercaseAccessionPrefix,
                &input[..lower.len()],
                correct,
                0,
                lower.len(),
            );
            return (corrected, vec![correction]);
        }
    }
    (input.to_string(), Vec::new())
}
/// Returns the lower-case spelling of an edit keyword that was written with
/// different capitalisation.
///
/// Recognised keywords are `del`, `ins`, `dup`, `inv`, `con` and `delins`.
/// The result is `Some((keyword, MixedCaseEditType))` only when `token` names
/// one of them but is not already all lower case; otherwise `None`.
#[allow(dead_code)]
pub fn correct_edit_type_case(token: &str) -> Option<(String, ErrorType)> {
    const KEYWORDS: [&str; 6] = ["del", "ins", "dup", "inv", "con", "delins"];
    let lowered = token.to_lowercase();
    let keyword = KEYWORDS.iter().copied().find(|kw| *kw == lowered)?;
    if token == keyword {
        None
    } else {
        Some((keyword.to_string(), ErrorType::MixedCaseEditType))
    }
}
/// Reports the first accession (in prefix-table order) that lacks a version suffix.
///
/// For each known prefix, the first occurrence in `input` is located and the
/// accession body is taken up to the next `:` (or the end of the input). The
/// body counts as versioned when the text after its last `.` is a non-empty
/// run of ASCII digits. Prefixes flagged as version-optional (`ENST`, `ENSP`)
/// are never reported.
///
/// Returns `Some((start, accession))`, where `start` is the byte offset of the
/// prefix and `accession` is prefix + body; `None` when nothing is missing.
pub fn detect_missing_version(input: &str) -> Option<(usize, String)> {
    // (prefix, version is optional)
    const PREFIXES: [(&str, bool); 11] = [
        ("NM_", false),
        ("NP_", false),
        ("NC_", false),
        ("NG_", false),
        ("NR_", false),
        ("XM_", false),
        ("XP_", false),
        ("XR_", false),
        ("ENST", true),
        ("ENSP", true),
        ("LRG_", false),
    ];
    for (prefix, optional) in PREFIXES {
        if optional {
            continue;
        }
        let Some(start) = input.find(prefix) else {
            continue;
        };
        let after_prefix = &input[start + prefix.len()..];
        let accession_body = after_prefix.split(':').next().unwrap_or(after_prefix);
        // A trailing dot with nothing after it ("NM_000088.") is not a version:
        // `all` on an empty iterator is true, so emptiness is checked explicitly.
        let has_version = accession_body.rsplit_once('.').map_or(false, |(_, version)| {
            !version.is_empty() && version.bytes().all(|b| b.is_ascii_digit())
        });
        if !has_version {
            return Some((start, format!("{}{}", prefix, accession_body)));
        }
    }
    None
}
/// Scans `input` for every recognised accession that has no `.<digit>` version suffix.
///
/// A candidate starts at an ASCII uppercase letter and consists of a run of ASCII
/// letters / underscores followed by a run of ASCII digits. The letter part is classified
/// by `classify_accession_prefix`; unrecognised prefixes and prefixes whose version is
/// optional are not reported. An `LRG_` accession whose digits are directly followed by a
/// letter is also not reported.
///
/// Each hit is a `MissingVersion` correction whose `original` is the accession text, whose
/// `corrected` is empty, and whose span is the byte range of the accession in `input`.
pub fn detect_missing_versions(input: &str) -> Vec<DetectedCorrection> {
let mut hits = Vec::new();
let bytes = input.as_bytes();
let mut i = 0usize;
while i < bytes.len() {
// Candidates can only begin at an ASCII uppercase letter.
if !bytes[i].is_ascii_uppercase() {
i += 1;
continue;
}
let prefix_start = i;
let mut j = i;
// Letter / underscore part of the candidate (e.g. "NM_").
while j < bytes.len() && (bytes[j].is_ascii_alphabetic() || bytes[j] == b'_') {
j += 1;
}
let alpha_end = j;
// Numeric part of the candidate.
while j < bytes.len() && bytes[j].is_ascii_digit() {
j += 1;
}
let digit_end = j;
// No digits after the letters: not an accession. Resume one past the letter run
// (and always make progress).
if alpha_end == prefix_start || digit_end == alpha_end {
i = (alpha_end + 1).max(i + 1);
continue;
}
let accession_alpha = &input[prefix_start..alpha_end];
let accession_full = &input[prefix_start..digit_end];
let (recognised, optional) = classify_accession_prefix(accession_alpha);
if !recognised {
i = digit_end;
continue;
}
// A version is a '.' immediately followed by at least one digit.
let has_version = digit_end < bytes.len()
&& bytes[digit_end] == b'.'
&& bytes
.get(digit_end + 1)
.copied()
.map(|b| b.is_ascii_digit())
.unwrap_or(false);
if !has_version && !optional {
// "LRG_<n>" directly followed by a letter is skipped (presumably LRG
// transcript / protein references such as "LRG_199t1" — confirm).
let is_lrg_suffix_ref = accession_alpha == "LRG_"
&& digit_end < bytes.len()
&& bytes[digit_end].is_ascii_alphabetic();
if !is_lrg_suffix_ref {
hits.push(DetectedCorrection::new(
ErrorType::MissingVersion,
accession_full.to_string(),
String::new(),
prefix_start,
digit_end,
));
}
}
i = digit_end;
}
hits
}
/// Classifies an accession prefix as `(recognised, version_optional)`.
///
/// RefSeq-style and `LRG_` prefixes are recognised with a mandatory version;
/// the listed `ENS*` prefixes are recognised with an optional version; anything
/// else is `(false, false)`.
fn classify_accession_prefix(prefix: &str) -> (bool, bool) {
    const VERSION_REQUIRED: [&str; 11] = [
        "NM_", "NP_", "NC_", "NG_", "NR_", "NT_", "NW_", "XM_", "XP_", "XR_", "LRG_",
    ];
    const VERSION_OPTIONAL: [&str; 4] = ["ENST", "ENSP", "ENSG", "ENSR"];

    if VERSION_REQUIRED.contains(&prefix) {
        (true, false)
    } else if VERSION_OPTIONAL.contains(&prefix) {
        (true, true)
    } else {
        (false, false)
    }
}
/// Detects a position range written back-to-front (`200_100` instead of `100_200`).
///
/// The range is looked for right after a coordinate marker: first any of
/// `.c.`, `.g.`, `.n.`, `.m.`, `.o.`, `.r.` (tried in that order), otherwise
/// the first occurrence of one of `c`, `g`, `n`, `m`, `r` provided it is
/// directly followed by `.`. The text there must have the shape
/// `<first>[offset]_<second>[offset]`, where each base position is an
/// optionally negative integer. Only the base positions are compared.
///
/// When the first base is greater than the second, the returned correction
/// spans the whole range (both positions including their offsets) and its
/// replacement swaps the two positions, each keeping its own offset — so
/// `200+5_100` becomes `100_200+5`. Reporting only the bare numbers while
/// spanning the offsets would silently drop an offset when the correction is
/// applied. For ranges without offsets the result is simply `second_first`.
///
/// Returns `None` when no marker or range is found, when a number does not fit
/// in `i64`, or when the positions are already ordered.
pub fn detect_swapped_positions(input: &str) -> Option<DetectedCorrection> {
    const COORD_MARKERS: [&str; 6] = [".c.", ".g.", ".n.", ".m.", ".o.", ".r."];
    let bytes = input.as_bytes();

    let search_start = COORD_MARKERS
        .iter()
        .find_map(|marker| input.find(*marker).map(|pos| pos + marker.len()))
        .or_else(|| {
            // Fallback: a bare "c." / "g." / ... Only the first candidate letter
            // is considered.
            let pos = input.find(['c', 'g', 'n', 'm', 'r'])?;
            (bytes.get(pos + 1) == Some(&b'.')).then(|| pos + 2)
        })?;

    // Index just past the run of ASCII digits starting at `at`.
    let digits_end = |mut at: usize| {
        while bytes.get(at).map_or(false, u8::is_ascii_digit) {
            at += 1;
        }
        at
    };
    // End of an optionally negative integer starting at `at`; `None` without digits.
    let signed_number_end = |at: usize| -> Option<usize> {
        let digits_from = if bytes.get(at) == Some(&b'-') { at + 1 } else { at };
        let end = digits_end(digits_from);
        (end > digits_from).then(|| end)
    };

    // First position: base number, then any offset characters up to the '_'.
    let first_base_end = signed_number_end(search_start)?;
    let mut first_end = first_base_end;
    while matches!(bytes.get(first_end), Some(b'+' | b'-' | b'0'..=b'9')) {
        first_end += 1;
    }
    if bytes.get(first_end) != Some(&b'_') {
        return None;
    }

    // Second position: base number.
    let second_start = first_end + 1;
    let second_base_end = signed_number_end(second_start)?;

    let first_num: i64 = input[search_start..first_base_end].parse().ok()?;
    let second_num: i64 = input[second_start..second_base_end].parse().ok()?;
    if first_num <= second_num {
        return None;
    }

    // Keep a trailing "+N" / "-N" attached to the second position.
    let mut second_end = second_base_end;
    if matches!(bytes.get(second_end), Some(b'+' | b'-')) {
        let offset_end = digits_end(second_end + 1);
        if offset_end > second_end + 1 {
            second_end = offset_end;
        }
    }

    // Every index above sits next to an ASCII byte, so these slices are on
    // character boundaries.
    let first_text = &input[search_start..first_end];
    let second_text = &input[second_start..second_end];
    Some(DetectedCorrection::new(
        ErrorType::SwappedPositions,
        &input[search_start..second_end],
        format!("{}_{}", second_text, first_text),
        search_start,
        second_end,
    ))
}
/// Removes a trailing `(p. ...)` annotation from the end of `input`.
///
/// The last `(p.` in the right-trimmed input is located. If the first `)`
/// after it is followed by nothing but whitespace, everything from the
/// annotation onwards (plus any whitespace before it) is cut off and reported
/// as one correction. Otherwise the input is returned unchanged.
pub fn strip_trailing_annotation(input: &str) -> (String, Vec<DetectedCorrection>) {
    let unchanged = || (input.to_string(), Vec::new());

    let trimmed = input.trim_end();
    let Some(open) = trimmed.rfind("(p.") else {
        return unchanged();
    };
    let Some(close_offset) = trimmed[open..].find(')') else {
        return unchanged();
    };
    let tail = &trimmed[open + close_offset + 1..];
    if !tail.trim().is_empty() {
        return unchanged();
    }

    let kept = trimmed[..open].trim_end();
    let cut_at = kept.len();
    let correction = DetectedCorrection::new(
        ErrorType::TrailingAnnotation,
        input[cut_at..].trim(),
        "",
        cut_at,
        input.len(),
    );
    (kept.to_string(), vec![correction])
}
/// Inserts a missing `g.` after the colon of a genomic accession.
///
/// The input is split at its first `:`. The correction applies only when
///
/// * the accession starts with `NC_`, `NG_`, `NT_` or `NW_`, or starts with
///   `LRG_` and contains no `t`;
/// * the description does not already start with one of `g.`, `c.`, `p.`,
///   `n.`, `r.`, `m.`; and
/// * the description starts with an ASCII digit, `(`, `?` or `[`.
///
/// The reported span is `0..=colon`, replaced by `<accession>:g.`.
pub fn correct_missing_coordinate_prefix(input: &str) -> (String, Vec<DetectedCorrection>) {
    let unchanged = || (input.to_string(), Vec::new());

    let Some(colon_pos) = input.find(':') else {
        return unchanged();
    };
    let (accession, after_colon) = (&input[..colon_pos], &input[colon_pos + 1..]);

    let is_genomic_accession = ["NC_", "NG_", "NT_", "NW_"]
        .iter()
        .any(|prefix| accession.starts_with(*prefix))
        || (accession.starts_with("LRG_") && !accession.contains('t'));
    if !is_genomic_accession {
        return unchanged();
    }

    let has_prefix = ["g.", "c.", "p.", "n.", "r.", "m."]
        .iter()
        .any(|prefix| after_colon.starts_with(*prefix));
    if has_prefix {
        return unchanged();
    }

    let looks_like_position = matches!(
        after_colon.chars().next(),
        Some(first) if first.is_ascii_digit() || matches!(first, '(' | '?' | '[')
    );
    if !looks_like_position {
        return unchanged();
    }

    let corrected = format!("{}:g.{}", accession, after_colon);
    let correction = DetectedCorrection::new(
        ErrorType::MissingCoordinatePrefix,
        format!("{}:", accession),
        format!("{}:g.", accession),
        0,
        colon_pos + 1,
    );
    (corrected, vec![correction])
}
/// Applies the correction found by `detect_swapped_positions`, if any.
///
/// The detected span of `input` is replaced by the correction's `corrected`
/// text. Without a detection the input is returned unchanged with no corrections.
pub fn correct_swapped_positions(input: &str) -> (String, Vec<DetectedCorrection>) {
    match detect_swapped_positions(input) {
        None => (input.to_string(), vec![]),
        Some(correction) => {
            let head = &input[..correction.start];
            let tail = &input[correction.end..];
            let corrected = format!("{}{}{}", head, correction.corrected, tail);
            (corrected, vec![correction])
        }
    }
}
/// Rewrites `ACC:[c.A;c.B]` to `ACC:c.[A;B]`.
///
/// The first `:[` must be followed directly by a coordinate prefix (`c.`, `g.`,
/// `n.`, `r.`, `m.` or `p.`). The bracket content up to the first `]` is split
/// on `;`, and every part (after trimming) must start with that same prefix;
/// otherwise the input is returned unchanged. The prefix is then hoisted in
/// front of the bracket and removed from every part. Text after the first `]`
/// is carried over verbatim.
///
/// NOTE(review): only the first bracket group is rewritten; any further groups
/// after the first `]` keep whatever prefixes they had — confirm callers
/// handle multi-group alleles separately.
pub fn correct_old_allele_format(input: &str) -> (String, Vec<DetectedCorrection>) {
    let unchanged = || (input.to_string(), Vec::new());

    let Some(colon_bracket) = input.find(":[") else {
        return unchanged();
    };
    let after_bracket = &input[colon_bracket + 2..];
    let Some(coord_type) = ["c.", "g.", "n.", "r.", "m.", "p."]
        .iter()
        .copied()
        .find(|prefix| after_bracket.starts_with(*prefix))
    else {
        return unchanged();
    };
    let Some(close_offset) = input[colon_bracket..].find(']') else {
        return unchanged();
    };
    let close_bracket = colon_bracket + close_offset;

    // Strip the shared prefix from every allele part; bail out on the first
    // part that does not carry it.
    let content = &input[colon_bracket + 2..close_bracket];
    let mut stripped_parts: Vec<&str> = Vec::new();
    for part in content.split(';') {
        match part.trim().strip_prefix(coord_type) {
            Some(rest) => stripped_parts.push(rest),
            None => return unchanged(),
        }
    }
    let new_content = stripped_parts.join(";");

    let accession = &input[..colon_bracket];
    let remaining = &input[close_bracket + 1..];
    let corrected = format!("{}:{}[{}]{}", accession, coord_type, new_content, remaining);

    let original_pattern = &input[colon_bracket..close_bracket + 1];
    let corrected_pattern = format!(":{}[{}]", coord_type, new_content);
    let correction = DetectedCorrection::new(
        ErrorType::OldAlleleFormat,
        original_pattern,
        corrected_pattern,
        colon_bracket,
        close_bracket + 1,
    );
    (corrected, vec![correction])
}
/// Rewrites old multi-base `>` substitutions as `delins`.
///
/// The input is scanned for `<pos>[_<pos>]<ref bases>><alt bases>`, where a
/// position is an optional `-` / `*` followed by ASCII digits and must not be
/// preceded by an ASCII letter, digit or `_`. Bases are IUPAC codes (see
/// `is_iupac_base`).
///
/// * A plain single-base substitution (`76A>C`: no range, one reference base,
///   one alternate base) is left untouched.
/// * Anything else with at least one alternate base becomes
///   `<positions>delins<alt>`. When there is no explicit range but several
///   reference bases, and the position parses as an integer >= 1, the range
///   `<n>_<n + ref_len - 1>` is synthesised.
///
/// Text that does not match is copied through one whole character at a time,
/// so non-ASCII input is preserved exactly (pushing individual bytes as `char`
/// would turn each byte of a multi-byte character into a Latin-1 character).
///
/// Correction spans are byte offsets into `input`.
pub fn correct_old_substitution_syntax(input: &str) -> (String, Vec<DetectedCorrection>) {
    let bytes = input.as_bytes();
    let mut result = String::with_capacity(input.len());
    let mut corrections = Vec::new();
    let mut i = 0usize;
    while i < bytes.len() {
        // A position may not start in the middle of an identifier or number.
        let prev_ok = i == 0 || !(bytes[i - 1].is_ascii_alphanumeric() || bytes[i - 1] == b'_');

        // First position: optional '-' / '*' followed by digits.
        let pos1_start = i;
        let mut j = i;
        if matches!(bytes[j], b'-' | b'*') {
            j += 1;
        }
        let digits_start = j;
        while j < bytes.len() && bytes[j].is_ascii_digit() {
            j += 1;
        }
        if j == digits_start || !prev_ok {
            // Not a position: copy one whole character and move on.
            let ch = input[i..]
                .chars()
                .next()
                .expect("i only ever advances by whole characters");
            result.push(ch);
            i += ch.len_utf8();
            continue;
        }
        let pos1_end = j;

        // Optional "_<second position>".
        let has_range = bytes.get(j).copied() == Some(b'_');
        let mut pos2_end = j;
        if has_range {
            let mut k = j + 1;
            if k < bytes.len() && matches!(bytes[k], b'-' | b'*') {
                k += 1;
            }
            let d2 = k;
            while k < bytes.len() && bytes[k].is_ascii_digit() {
                k += 1;
            }
            if k > d2 {
                pos2_end = k;
            }
        }
        let positions_end = if has_range && pos2_end > j { pos2_end } else { j };

        // Reference bases, then the mandatory '>'.
        let refs_start = positions_end;
        let mut k = positions_end;
        while k < bytes.len() && is_iupac_base(bytes[k] as char) {
            k += 1;
        }
        let ref_count = k - refs_start;
        if k >= bytes.len() || bytes[k] != b'>' {
            result.push_str(&input[pos1_start..positions_end]);
            i = positions_end;
            continue;
        }

        // Alternate bases after the '>'.
        let arrow = k;
        let mut m = arrow + 1;
        while m < bytes.len() && is_iupac_base(bytes[m] as char) {
            m += 1;
        }
        let rhs_count = m - (arrow + 1);
        if rhs_count == 0 {
            result.push_str(&input[pos1_start..positions_end]);
            i = positions_end;
            continue;
        }

        // A canonical single-base substitution is already valid syntax.
        let is_canonical_sub = !has_range && ref_count == 1 && rhs_count == 1;
        if is_canonical_sub {
            result.push_str(&input[pos1_start..m]);
            i = m;
            continue;
        }

        let pos_str = &input[pos1_start..positions_end];
        let rhs_str = &input[arrow + 1..m];
        let original = &input[pos1_start..m];
        let new_pos = if has_range {
            pos_str.to_string()
        } else if ref_count > 1 {
            // No explicit range: derive one from the number of reference bases
            // when the start is a plain positive integer.
            let pos1_text = &input[pos1_start..pos1_end];
            match pos1_text.parse::<i64>().ok() {
                Some(n) if n >= 1 => format!("{}_{}", n, n + ref_count as i64 - 1),
                _ => pos1_text.to_string(),
            }
        } else {
            pos_str.to_string()
        };
        let corrected = format!("{}delins{}", new_pos, rhs_str);
        corrections.push(DetectedCorrection::new(
            ErrorType::OldSubstitutionSyntax,
            original,
            corrected.clone(),
            pos1_start,
            m,
        ));
        result.push_str(&corrected);
        i = m;
    }
    (result, corrections)
}
/// Returns `true` when `c` is one of the upper-case IUPAC nucleotide codes
/// accepted here: `A C G T U R Y S W K M B D H V N`.
fn is_iupac_base(c: char) -> bool {
    const IUPAC_CODES: &str = "ACGTURYSWKMBDHVN";
    IUPAC_CODES.contains(c)
}
/// Finds deprecated `IVS<n>` intron notation after a `c.`, `n.` or `r.` prefix.
///
/// A hit requires the prefix (`c.IVS`, `n.IVS`, `r.IVS`) not to be preceded by
/// an ASCII letter or digit, and at least one digit after `IVS`. The reported
/// span covers `IVS` plus its digits (not the coordinate prefix); the
/// replacement text is empty. Hits are returned sorted by start offset.
pub fn detect_deprecated_ivs(input: &str) -> Vec<DetectedCorrection> {
    const PREFIXES: [&[u8]; 3] = [b"c.IVS", b"n.IVS", b"r.IVS"];
    let bytes = input.as_bytes();
    let mut found = Vec::new();

    for prefix in PREFIXES.iter() {
        let mut cursor = 0usize;
        while let Some(offset) = find_subslice(&bytes[cursor..], prefix) {
            let prefix_start = cursor + offset;
            let ivs_start = prefix_start + 2;
            let digits_start = prefix_start + prefix.len();
            let digit_count = bytes[digits_start..]
                .iter()
                .take_while(|b| b.is_ascii_digit())
                .count();
            let left_boundary_ok =
                prefix_start == 0 || !bytes[prefix_start - 1].is_ascii_alphanumeric();

            if left_boundary_ok && digit_count > 0 {
                let ivs_end = digits_start + digit_count;
                found.push(DetectedCorrection::new(
                    ErrorType::DeprecatedIvsNotation,
                    &input[ivs_start..ivs_end],
                    "",
                    ivs_start,
                    ivs_end,
                ));
                cursor = ivs_end;
            } else {
                // Not a real hit: retry one byte further on.
                cursor = prefix_start + 1;
            }
        }
    }
    found.sort_by_key(|hit| hit.start);
    found
}
/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty needle matches at index 0. This is handled explicitly because
/// `slice::windows` panics when asked for zero-length windows.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
/// Rewrites the deprecated conversion syntax `<range>con<source>` as `<range>delins<source>`.
///
/// A `con` is rewritten only when it directly follows a digit, a position range
/// precedes it (`has_range_position_before`), and the text after it looks like
/// the start of a source (`has_valid_con_source_start`). The source extends to
/// the next whitespace character or the end of the input. The reported span
/// covers `con` plus the source.
///
/// All other text is copied through one whole character at a time, and the
/// source is delimited on character boundaries, so non-ASCII input is
/// preserved exactly and never sliced inside a multi-byte character.
pub fn correct_deprecated_con(input: &str) -> (String, Vec<DetectedCorrection>) {
    let bytes = input.as_bytes();
    let mut result = String::with_capacity(input.len() + 4);
    let mut corrections = Vec::new();
    let mut i = 0usize;
    while i < bytes.len() {
        let is_con_edit = bytes[i..].starts_with(b"con")
            && i > 0
            && bytes[i - 1].is_ascii_digit()
            && has_range_position_before(bytes, i)
            && has_valid_con_source_start(bytes, i + 3);

        if is_con_edit {
            let src_start = i + 3;
            // The first source character is a non-whitespace ASCII character
            // (checked above), so the source is never empty.
            let src_end = input[src_start..]
                .char_indices()
                .find(|&(_, ch)| ch.is_whitespace())
                .map_or(input.len(), |(offset, _)| src_start + offset);
            let corrected = format!("delins{}", &input[src_start..src_end]);
            corrections.push(DetectedCorrection::new(
                ErrorType::DeprecatedConSyntax,
                &input[i..src_end],
                corrected.clone(),
                i,
                src_end,
            ));
            result.push_str(&corrected);
            i = src_end;
            continue;
        }

        // Copy one whole character; `i` therefore always stays on a boundary.
        let ch = input[i..]
            .chars()
            .next()
            .expect("i only ever advances by whole characters");
        result.push(ch);
        i += ch.len_utf8();
    }
    (result, corrections)
}
/// Checks whether the bytes ending at `end` look like a position range `<left>_<right>`.
///
/// Scanning backwards from `end`, the right-hand position must consist of:
/// at least one digit, optionally preceded by one of `+ - *`, optionally
/// preceded by more digits, optionally preceded by `-` or `*`. Directly before
/// it there must be a `_`, and directly before the `_` at least one digit.
/// Nothing further to the left influences the result.
fn has_range_position_before(bytes: &[u8], end: usize) -> bool {
    // Moves `at` left over a run of ASCII digits.
    let skip_digits = |mut at: usize| {
        while at > 0 && bytes[at - 1].is_ascii_digit() {
            at -= 1;
        }
        at
    };
    // Moves `at` left over at most one byte from `allowed`.
    let skip_one_of = |at: usize, allowed: &[u8]| {
        if at > 0 && allowed.contains(&bytes[at - 1]) {
            at - 1
        } else {
            at
        }
    };

    // Right-hand position: <[-*]><digits><[+-*]><digits>, read right to left.
    let offset_start = skip_digits(end);
    if offset_start == end {
        return false;
    }
    let base_end = skip_one_of(offset_start, b"+-*");
    let base_start = skip_digits(base_end);
    let right_start = skip_one_of(base_start, b"-*");

    // Range separator.
    if right_start == 0 || bytes[right_start - 1] != b'_' {
        return false;
    }
    let underscore = right_start - 1;

    // Left-hand position: at least one digit directly before the '_'.
    skip_digits(underscore) != underscore
}
/// Checks whether the byte at `start` can begin the source of a `con` edit.
///
/// Accepted are an ASCII digit, an ASCII uppercase letter, `-`, `*`, `(`, or
/// one of `g c n r m` immediately followed by `.`. An out-of-range `start`
/// yields `false`.
fn has_valid_con_source_start(bytes: &[u8], start: usize) -> bool {
    let Some(&first) = bytes.get(start) else {
        return false;
    };
    match first {
        b'0'..=b'9' | b'A'..=b'Z' | b'-' | b'*' | b'(' => true,
        b'g' | b'c' | b'n' | b'r' | b'm' => bytes.get(start + 1) == Some(&b'.'),
        _ => false,
    }
}
/// Applies `correction` to `original` according to the resolved `action`.
///
/// Returns the resulting text and a flag that is `true` only for `WarnCorrect`.
/// `Reject` and `Accept` return the text unchanged; `WarnCorrect` and
/// `SilentCorrect` splice `correction.corrected` over
/// `correction.start..correction.end`.
#[allow(dead_code)]
pub fn apply_correction(
    original: &str,
    correction: &DetectedCorrection,
    action: ResolvedAction,
) -> (String, bool) {
    let (splice, flag) = match action {
        ResolvedAction::Reject | ResolvedAction::Accept => (false, false),
        ResolvedAction::WarnCorrect => (true, true),
        ResolvedAction::SilentCorrect => (true, false),
    };
    if !splice {
        return (original.to_string(), false);
    }
    let head = &original[..correction.start];
    let tail = &original[correction.end..];
    let corrected = format!("{}{}{}", head, correction.corrected, tail);
    (corrected, flag)
}
/// Computes the Levenshtein edit distance between `a` and `b`.
///
/// The distance is counted in Unicode scalar values (`char`s), with unit cost
/// for insertion, deletion and substitution.
pub fn levenshtein_distance(a: &str, b: &str) -> usize {
    let target: Vec<char> = b.chars().collect();
    if target.is_empty() {
        return a.chars().count();
    }

    // Single rolling row: before processing source character `i`, `row[j]`
    // holds the distance between the first `i` chars of `a` and the first `j`
    // chars of `b`.
    let mut row: Vec<usize> = (0..=target.len()).collect();
    for (i, source_char) in a.chars().enumerate() {
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, target_char) in target.iter().enumerate() {
            let above = row[j + 1];
            let cost = usize::from(source_char != *target_char);
            row[j + 1] = min(min(above + 1, row[j] + 1), diagonal + cost);
            diagonal = above;
        }
    }
    row[target.len()]
}
/// A candidate selected by fuzzy matching, together with its edit distance.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FuzzyMatch {
    /// The candidate that was matched.
    pub matched: String,
    /// Edit distance between the query and `matched`.
    pub distance: usize,
}

impl FuzzyMatch {
    /// Creates a match record, taking ownership of the candidate text.
    pub fn new(matched: impl Into<String>, distance: usize) -> Self {
        let matched = matched.into();
        FuzzyMatch { matched, distance }
    }
}
/// Accession prefixes used as fuzzy-match candidates by `detect_accession_typo`.
const ACCESSION_PREFIXES: &[&str] = &[
"NM_", "NP_", "NC_", "NG_", "NR_", "NW_", "NT_", "XM_", "XP_", "XR_", "ENST", "ENSG", "ENSP",
"ENSE", "LRG_",
];
/// Edit-type keywords (lower case) used as fuzzy-match candidates by `detect_edit_type_typo`.
const EDIT_TYPES: &[&str] = &["del", "ins", "dup", "inv", "con", "delins", "fs", "ext"];
/// Canonically-cased three-letter amino-acid codes (including `Ter` and `Xaa`) used as
/// fuzzy-match candidates by `detect_amino_acid_typo`.
const AMINO_ACIDS: &[&str] = &[
"Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys", "Met",
"Phe", "Pro", "Sec", "Ser", "Thr", "Trp", "Tyr", "Val", "Ter", "Xaa",
];
/// Picks the candidate closest to `input` by Levenshtein distance.
///
/// Candidates further away than `max_distance` are ignored. Among the rest the
/// smallest distance wins, with ties going to the candidate listed first. When
/// `case_sensitive` is `false`, both sides are lower-cased before comparison.
/// The returned match carries the candidate exactly as listed.
pub fn find_closest_match(
    input: &str,
    candidates: &[&str],
    max_distance: usize,
    case_sensitive: bool,
) -> Option<FuzzyMatch> {
    let normalize = |text: &str| {
        if case_sensitive {
            text.to_string()
        } else {
            text.to_lowercase()
        }
    };
    let needle = normalize(input);

    candidates
        .iter()
        .map(|&candidate| {
            let distance = levenshtein_distance(&needle, &normalize(candidate));
            (distance, candidate)
        })
        .filter(|&(distance, _)| distance <= max_distance)
        // `min_by_key` keeps the first of several equally small elements.
        .min_by_key(|&(distance, _)| distance)
        .map(|(distance, candidate)| FuzzyMatch::new(candidate, distance))
}
/// Suggests a known accession prefix when `input` starts with a near-miss.
///
/// Returns `None` when the upper-cased input already starts with a known
/// prefix. Otherwise the candidate prefix is the text before the first ASCII
/// digit among the first five characters or, without such a digit, the first
/// five characters (fewer if the input is shorter). Candidates shorter than
/// two bytes are ignored. The candidate is matched case-insensitively against
/// the known prefixes with a maximum distance of 2; exact matches are not
/// reported.
pub fn detect_accession_typo(input: &str) -> Option<FuzzyMatch> {
    let upper = input.to_uppercase();
    if ACCESSION_PREFIXES
        .iter()
        .any(|prefix| upper.starts_with(*prefix))
    {
        return None;
    }

    let prefix_end = input
        .char_indices()
        .take(5)
        .find(|(_, c)| c.is_ascii_digit())
        .map(|(i, _)| i)
        // Cut after five *characters*: a raw byte count of 5 may fall inside a
        // multi-byte character and make the slice below panic.
        .unwrap_or_else(|| input.char_indices().nth(5).map_or(input.len(), |(i, _)| i));
    if prefix_end < 2 {
        return None;
    }

    let potential_prefix = &input[..prefix_end];
    find_closest_match(potential_prefix, ACCESSION_PREFIXES, 2, false).filter(|m| m.distance > 0)
}
/// Suggests a known edit keyword when `input` is one edit away from it.
///
/// Inputs that already are a known keyword (ignoring case) yield `None`;
/// otherwise the closest keyword within distance 1, compared
/// case-insensitively, is returned.
pub fn detect_edit_type_typo(input: &str) -> Option<FuzzyMatch> {
    let already_known = EDIT_TYPES.contains(&input.to_lowercase().as_str());
    if already_known {
        None
    } else {
        find_closest_match(input, EDIT_TYPES, 1, false)
    }
}
/// Suggests a known amino-acid code when `input` is one edit away from it.
///
/// Inputs that already are a known code (ignoring ASCII case) yield `None`;
/// otherwise the closest code within distance 1, compared case-insensitively,
/// is returned.
pub fn detect_amino_acid_typo(input: &str) -> Option<FuzzyMatch> {
    let already_known = AMINO_ACIDS
        .iter()
        .any(|code| input.eq_ignore_ascii_case(code));
    if already_known {
        return None;
    }
    find_closest_match(input, AMINO_ACIDS, 1, false)
}
/// A "did you mean ...?" suggestion for a token that looks like a typo.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TypoSuggestion {
    /// The token as written in the input.
    pub original: String,
    /// The suggested replacement.
    pub suggestion: String,
    /// What kind of token this is.
    pub token_type: TypoTokenType,
    /// Start offset of the token in the input.
    pub start: usize,
    /// End offset (exclusive) of the token in the input.
    pub end: usize,
    /// Edit distance between `original` and `suggestion`.
    pub distance: usize,
}

/// The kind of token a [`TypoSuggestion`] refers to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TypoTokenType {
    AccessionPrefix,
    EditType,
    AminoAcid,
}

impl std::fmt::Display for TypoTokenType {
    /// Writes the lower-case, human-readable name of the token kind.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            TypoTokenType::AccessionPrefix => "accession prefix",
            TypoTokenType::EditType => "edit type",
            TypoTokenType::AminoAcid => "amino acid",
        };
        f.write_str(label)
    }
}

impl TypoSuggestion {
    /// Renders the suggestion as a one-line, human-readable message.
    pub fn suggestion_message(&self) -> String {
        let Self {
            token_type,
            original,
            suggestion,
            ..
        } = self;
        format!(
            "possible typo in {}: '{}' → did you mean '{}'?",
            token_type, original, suggestion
        )
    }
}
/// Collects typo suggestions for the accession prefix, edit keywords and amino-acid codes.
///
/// * Accession prefix: if `detect_accession_typo` fires, the suggestion covers
///   the input up to the first ASCII digit among the first ten characters, or
///   otherwise as many leading characters as the suggested prefix is long
///   (clamped to the input).
/// * Edit keywords: every token from `regex_lite_find_edit_types` that
///   `detect_edit_type_typo` flags.
/// * Amino acids: only when the input contains `p.`; every token from
///   `find_potential_amino_acids` that `detect_amino_acid_typo` flags.
pub fn detect_typos(input: &str) -> Vec<TypoSuggestion> {
    let mut suggestions = Vec::new();

    if let Some(fuzzy) = detect_accession_typo(input) {
        let prefix_end = input
            .char_indices()
            .take(10)
            .find(|(_, c)| c.is_ascii_digit())
            .map(|(i, _)| i)
            // Fall back to the suggestion's length, measured in characters and
            // clamped to the input: a raw byte length may lie past the end of a
            // short input (e.g. "XM") or inside a multi-byte character, and the
            // slice below would panic.
            .unwrap_or_else(|| {
                input
                    .char_indices()
                    .nth(fuzzy.matched.len())
                    .map_or(input.len(), |(i, _)| i)
            });
        suggestions.push(TypoSuggestion {
            original: input[..prefix_end].to_string(),
            suggestion: fuzzy.matched,
            token_type: TypoTokenType::AccessionPrefix,
            start: 0,
            end: prefix_end,
            distance: fuzzy.distance,
        });
    }

    for (start, end, token) in regex_lite_find_edit_types(input) {
        if let Some(fuzzy) = detect_edit_type_typo(&token) {
            suggestions.push(TypoSuggestion {
                original: token,
                suggestion: fuzzy.matched,
                token_type: TypoTokenType::EditType,
                start,
                end,
                distance: fuzzy.distance,
            });
        }
    }

    // Any input containing ":p." also contains "p.", so one check suffices.
    if input.contains("p.") {
        for (start, end, token) in find_potential_amino_acids(input) {
            if let Some(fuzzy) = detect_amino_acid_typo(&token) {
                suggestions.push(TypoSuggestion {
                    original: token,
                    suggestion: fuzzy.matched,
                    token_type: TypoTokenType::AminoAcid,
                    start,
                    end,
                    distance: fuzzy.distance,
                });
            }
        }
    }
    suggestions
}
/// Finds letter tokens that directly follow a position and are not a known keyword.
///
/// A position is a run of ASCII digits, `+` and `-` that starts with a digit.
/// The run of ASCII letters right after it is returned as
/// `(start, end, token)` when it is 2 to 7 letters long, does not start with
/// `ins` (case-insensitively), and is neither a known edit keyword nor a known
/// amino-acid code.
///
/// `start` and `end` are byte offsets into `input`, so `&input[start..end]`
/// is the token even when non-ASCII characters precede it.
fn regex_lite_find_edit_types(input: &str) -> Vec<(usize, usize, String)> {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut results = Vec::new();
    let mut i = 0;
    while i < len {
        if !bytes[i].is_ascii_digit() {
            i += 1;
            continue;
        }
        // Skip the position itself.
        while i < len && (bytes[i].is_ascii_digit() || matches!(bytes[i], b'+' | b'-')) {
            i += 1;
        }
        // Collect the letters that follow; both ends sit next to ASCII bytes,
        // so the slice is on character boundaries.
        let start = i;
        while i < len && bytes[i].is_ascii_alphabetic() {
            i += 1;
        }
        let token = &input[start..i];
        if !(2..=7).contains(&token.len()) {
            continue;
        }
        let lower = token.to_ascii_lowercase();
        let is_known = lower.starts_with("ins")
            || EDIT_TYPES.contains(&lower.as_str())
            || AMINO_ACIDS.iter().any(|aa| aa.eq_ignore_ascii_case(token));
        if !is_known {
            results.push((start, i, token.to_string()));
        }
    }
    results
}
/// Finds capitalised tokens after the protein marker that could be
/// three-letter amino-acid codes.
///
/// Scanning starts right after the first `:p.` in `input`, or after the first
/// `p.` when there is no `:p.`. A token is one ASCII uppercase letter followed
/// by ASCII lowercase letters; only tokens of length 3 or 4 are reported.
///
/// Returns `(start, end, token)` triples where `start..end` is the token's
/// byte range in `input`. Returns an empty vector when no marker is present.
fn find_potential_amino_acids(input: &str) -> Vec<(usize, usize, String)> {
    let mut results = Vec::new();
    let p_pos = if let Some(pos) = input.find(":p.") {
        pos + 3
    } else if let Some(pos) = input.find("p.") {
        pos + 2
    } else {
        return results;
    };

    // Scan bytes rather than chars: `p_pos` is a byte offset, so mixing it
    // with a char counter would misplace every span once a multi-byte
    // character appears after the marker. All tests are ASCII-only, so
    // continuation bytes are simply skipped.
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut i = p_pos;
    while i < len {
        if !bytes[i].is_ascii_uppercase() {
            i += 1;
            continue;
        }
        let start = i;
        i += 1;
        while i < len && bytes[i].is_ascii_lowercase() {
            i += 1;
        }
        if (3..=4).contains(&(i - start)) {
            results.push((start, i, input[start..i].to_string()));
        }
    }
    results
}
/// Reports whether `bytes` contains a coordinate-type marker other than `p.`.
///
/// A marker is one of `g`, `c`, `n`, `r`, `m`, `o` immediately followed by
/// `.`, located either at the very start of the input or right after `:`,
/// `(`, `[` or `;`.
fn has_non_protein_description(bytes: &[u8]) -> bool {
    bytes.windows(2).enumerate().any(|(idx, pair)| {
        let is_marker =
            pair[1] == b'.' && matches!(pair[0], b'g' | b'c' | b'n' | b'r' | b'm' | b'o');
        // The letter only counts when it opens a description, not when it is
        // the tail of some longer word.
        let opens_description = idx == 0 || matches!(bytes[idx - 1], b':' | b'(' | b'[' | b';');
        is_marker && opens_description
    })
}
/// Returns the byte index just past the UTF-8 sequence that starts at `i`.
///
/// The sequence length is read from the lead byte alone. A byte that is not
/// a valid lead byte (for example a continuation byte) is treated as one byte
/// wide. The result never exceeds `bytes.len()`, and an out-of-range `i` is
/// returned unchanged.
#[inline]
fn next_char_end(bytes: &[u8], i: usize) -> usize {
    let Some(&lead) = bytes.get(i) else {
        return i;
    };
    let width = match lead {
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF7 => 4,
        // ASCII, stray continuation bytes and invalid lead bytes.
        _ => 1,
    };
    bytes.len().min(i + width)
}
/// Reports every `del<digits>` occurrence where the digit run ends the edit.
///
/// Nothing is reported unless `has_non_protein_description` accepts the
/// input. A `del` that is immediately followed by `ins` is skipped, as is a
/// `del` with no digits after it. The digit run must be followed by end of
/// input, `)`, `;`, `]` or ASCII whitespace.
///
/// Each hit covers the byte range of `del` plus its digits, carries that text
/// as `original`, and has an empty `corrected` string. The input itself is
/// not rewritten here.
pub fn detect_del_size_suffix(input: &str) -> Vec<DetectedCorrection> {
    let raw = input.as_bytes();
    let mut found = Vec::new();
    if !has_non_protein_description(raw) {
        return found;
    }

    let total = raw.len();
    let mut pos = 0usize;
    while pos + 3 <= total {
        if !raw[pos..].starts_with(b"del") {
            pos += 1;
            continue;
        }
        let after_kw = pos + 3;
        if raw[after_kw..].starts_with(b"ins") {
            // `delins` is a different edit; jump over the whole keyword.
            pos = after_kw + 3;
            continue;
        }

        let digit_count = raw[after_kw..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        if digit_count == 0 {
            pos = after_kw;
            continue;
        }

        let size_end = after_kw + digit_count;
        let terminated = match raw.get(size_end).copied() {
            None => true,
            Some(b) => matches!(b, b')' | b';' | b']') || b.is_ascii_whitespace(),
        };
        if terminated {
            found.push(DetectedCorrection::new(
                ErrorType::DelSizeSuffix,
                &input[pos..size_end],
                String::new(),
                pos,
                size_end,
            ));
        }
        pos = size_end;
    }
    found
}
/// Rewrites `delins` to `del` wherever nothing is inserted after it.
///
/// The input is returned unchanged (with no corrections) unless
/// `has_non_protein_description` accepts it. A `delins` counts as empty when
/// it is followed by end of input, `)`, `;`, `]` or ASCII whitespace.
///
/// Returns the rewritten string and one correction per replaced keyword; the
/// correction's byte range refers to the original `input`.
pub fn correct_empty_delins(input: &str) -> (String, Vec<DetectedCorrection>) {
    let raw = input.as_bytes();
    let mut found = Vec::new();
    if !has_non_protein_description(raw) {
        return (input.to_string(), found);
    }

    let mut out = String::with_capacity(input.len());
    let mut pos = 0usize;
    while pos < raw.len() {
        let insert_missing = raw[pos..].starts_with(b"delins")
            && match raw.get(pos + 6).copied() {
                None => true,
                Some(b) => matches!(b, b')' | b';' | b']') || b.is_ascii_whitespace(),
            };

        if !insert_missing {
            // Copy one whole character so multi-byte text stays intact.
            let next = next_char_end(raw, pos);
            out.push_str(&input[pos..next]);
            pos = next;
            continue;
        }

        let keyword_end = pos + 6;
        found.push(DetectedCorrection::new(
            ErrorType::EmptyDelinsInsert,
            "delins",
            "del",
            pos,
            keyword_end,
        ));
        out.push_str("del");
        pos = keyword_end;
    }
    (out, found)
}
/// Matches `<pos>_<pos>` starting at byte `i`, where both sides are textually
/// identical.
///
/// Each side is an optional leading `-` followed by one or more ASCII digits.
/// On success returns the byte index just past the second position together
/// with the shared position text.
///
/// Returns `None` when the shape does not match, when the two sides differ,
/// or when `i` is not the start of a whole position, i.e. the preceding byte
/// is an ASCII digit, `-`, `+` or `*`. Without that last check a scan that
/// probes every offset would match the tail of a longer position
/// (`1100_100` at the second digit, or `-5_5` after the sign) and collapse a
/// genuine range into a single position.
///
/// `bytes` must be the bytes of `input`.
fn match_equal_position_pair(bytes: &[u8], i: usize, input: &str) -> Option<(usize, String)> {
    let continues_previous_token = i > 0
        && bytes
            .get(i - 1)
            .copied()
            .is_some_and(|b| b.is_ascii_digit() || matches!(b, b'-' | b'+' | b'*'));
    if continues_previous_token {
        return None;
    }

    let mut j = i;
    let first_start = j;
    if j < bytes.len() && bytes[j] == b'-' {
        j += 1;
    }
    let num1_digit_start = j;
    while j < bytes.len() && bytes[j].is_ascii_digit() {
        j += 1;
    }
    if j == num1_digit_start {
        return None;
    }
    let first_end = j;

    if j >= bytes.len() || bytes[j] != b'_' {
        return None;
    }
    j += 1;

    let second_start = j;
    if j < bytes.len() && bytes[j] == b'-' {
        j += 1;
    }
    let num2_digit_start = j;
    while j < bytes.len() && bytes[j].is_ascii_digit() {
        j += 1;
    }
    if j == num2_digit_start {
        return None;
    }
    let second_end = j;

    // All slice edges sit next to ASCII bytes, so they are char boundaries.
    let first = &input[first_start..first_end];
    let second = &input[second_start..second_end];
    if first != second {
        return None;
    }
    Some((j, first.to_string()))
}
/// Reports whether `del`, `dup` or `inv` starts at byte `i`.
///
/// A `del` that is immediately followed by `ins` (i.e. `delins`) does not
/// count. Returns `false` when fewer than three bytes remain at `i`.
fn matches_single_pos_keyword(bytes: &[u8], i: usize) -> bool {
    let Some(tail) = bytes.get(i..) else {
        return false;
    };
    if tail.starts_with(b"del") {
        !tail[3..].starts_with(b"ins")
    } else {
        tail.starts_with(b"dup") || tail.starts_with(b"inv")
    }
}
pub fn correct_single_position_range(input: &str) -> (String, Vec<DetectedCorrection>) {
let mut hits = Vec::new();
let bytes = input.as_bytes();
if !has_non_protein_description(bytes) {
return (input.to_string(), hits);
}
let mut result = String::with_capacity(input.len());
let mut p = 0usize;
while p < bytes.len() {
if let Some((pair_end, value_text)) = match_equal_position_pair(bytes, p, input) {
if matches_single_pos_keyword(bytes, pair_end) {
hits.push(DetectedCorrection::new(
ErrorType::SinglePositionRange,
&input[p..pair_end],
value_text.clone(),
p,
pair_end,
));
result.push_str(&value_text);
p = pair_end;
continue;
}
}
let ch_end = next_char_end(bytes, p);
result.push_str(&input[p..ch_end]);
p = ch_end;
}
(result, hits)
}
/// Matches `<pos>_<pos>` starting at byte `i` and returns the index just past
/// it.
///
/// Each side is an optional leading `-` followed by one or more ASCII digits.
/// Unlike `match_equal_position_pair`, the two sides need not be equal.
/// Returns `None` when the shape does not match.
fn match_position_pair(bytes: &[u8], i: usize) -> Option<usize> {
    // End index of an optionally negative number starting at `from`, or
    // `None` when no digit is present.
    let signed_number_end = |from: usize| -> Option<usize> {
        let digits_from = if bytes.get(from).copied() == Some(b'-') {
            from + 1
        } else {
            from
        };
        let digit_count = bytes
            .get(digits_from..)?
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        (digit_count > 0).then_some(digits_from + digit_count)
    };

    let first_end = signed_number_end(i)?;
    if bytes.get(first_end).copied() != Some(b'_') {
        return None;
    }
    signed_number_end(first_end + 1)
}
/// Drops the base letters written between a position range and `[` inside an
/// `r.` description.
///
/// The first `r.` that sits at the start of the input or right after `:`,
/// `(`, `[` or `;` opens the region that is scanned; without one the input is
/// returned unchanged. Scanning stops at the next `c.`, `g.`, `m.`, `n.`,
/// `o.` or `p.` marker that follows `:`, `(`, `[`, `;` or `|`, and everything
/// from there on is copied verbatim.
///
/// Inside the region, a range accepted by `match_position_pair` that is
/// followed by one or more of `a`, `c`, `g`, `u` and then `[` has those
/// letters removed.
///
/// Returns the rewritten string and one correction per removed run; each
/// correction covers the removed letters' byte range in the original `input`
/// and has an empty `corrected` string.
pub fn correct_redundant_repeat_label(input: &str) -> (String, Vec<DetectedCorrection>) {
    let raw = input.as_bytes();
    let mut found = Vec::new();

    let rna_marker = raw.windows(2).enumerate().find_map(|(idx, pair)| {
        let anchored = idx == 0 || matches!(raw[idx - 1], b':' | b'(' | b'[' | b';');
        (pair == b"r." && anchored).then_some(idx + 2)
    });
    let Some(desc_start) = rna_marker else {
        return (input.to_string(), found);
    };

    let mut out = String::with_capacity(input.len());
    out.push_str(&input[..desc_start]);

    let mut cursor = desc_start;
    while cursor < raw.len() {
        let next_description = cursor > desc_start
            && matches!(raw[cursor], b'c' | b'g' | b'm' | b'n' | b'o' | b'p')
            && raw.get(cursor + 1).copied() == Some(b'.')
            && matches!(raw[cursor - 1], b':' | b'(' | b'[' | b';' | b'|');
        if next_description {
            break;
        }

        if let Some(bases_start) = match_position_pair(raw, cursor) {
            let base_count = raw[bases_start..]
                .iter()
                .take_while(|b| matches!(**b, b'a' | b'c' | b'g' | b'u'))
                .count();
            let bases_end = bases_start + base_count;
            if base_count > 0 && raw.get(bases_end).copied() == Some(b'[') {
                // Keep the range, skip the letters; the `[` is copied by a
                // later iteration.
                out.push_str(&input[cursor..bases_start]);
                found.push(DetectedCorrection::new(
                    ErrorType::RedundantRepeatLabel,
                    &input[bases_start..bases_end],
                    String::new(),
                    bases_start,
                    bases_end,
                ));
                cursor = bases_end;
                continue;
            }
        }

        // Copy one whole character so multi-byte text stays intact.
        let next = next_char_end(raw, cursor);
        out.push_str(&input[cursor..next]);
        cursor = next;
    }

    // Whatever follows an early stop is passed through untouched.
    out.push_str(&input[cursor..]);
    (out, found)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_correct_dash_en_dash() {
let (corrected, corrections) = correct_dash_characters("c.100\u{2013}200del");
assert_eq!(corrected, "c.100-200del");
assert_eq!(corrections.len(), 1);
assert_eq!(corrections[0].error_type, ErrorType::WrongDashCharacter);
}
#[test]
fn test_correct_dash_em_dash() {
let (corrected, corrections) = correct_dash_characters("c.100\u{2014}200del");
assert_eq!(corrected, "c.100-200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_dash_minus_sign() {
let (corrected, corrections) = correct_dash_characters("c.100\u{2212}200del");
assert_eq!(corrected, "c.100-200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_dash_no_change() {
let (corrected, corrections) = correct_dash_characters("c.100-200del");
assert_eq!(corrected, "c.100-200del");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_smart_single_quotes() {
let (corrected, corrections) = correct_quote_characters("c.100ins\u{2018}A\u{2019}");
assert_eq!(corrected, "c.100ins'A'");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_smart_double_quotes() {
let (corrected, corrections) = correct_quote_characters("c.100ins\u{201C}ATG\u{201D}");
assert_eq!(corrected, "c.100ins\"ATG\"");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_whitespace_trim() {
let (corrected, corrections) = correct_whitespace(" c.100A>G ");
assert_eq!(corrected, "c.100A>G");
assert!(!corrections.is_empty());
}
#[test]
fn test_correct_whitespace_no_change() {
let (corrected, _corrections) = correct_whitespace("c.100A>G");
assert_eq!(corrected, "c.100A>G");
assert_eq!(corrected, "c.100A>G");
}
#[test]
fn test_correct_whitespace_strips_zero_width_chars() {
let input = "NM_000088.3:c.100\u{200B}A>G";
let (corrected, corrections) = correct_whitespace(input);
assert_eq!(corrected, "NM_000088.3:c.100A>G");
assert_eq!(corrections.len(), 1, "should record one correction");
assert_eq!(corrections[0].error_type, ErrorType::ExtraWhitespace);
let input = "\u{FEFF}NM_000088.3:c.100A>G";
let (corrected, _) = correct_whitespace(input);
assert_eq!(corrected, "NM_000088.3:c.100A>G");
let input = "c.100\u{200C}A>G";
let (corrected, _) = correct_whitespace(input);
assert_eq!(corrected, "c.100A>G");
let input = "c.100\u{200D}A>G";
let (corrected, _) = correct_whitespace(input);
assert_eq!(corrected, "c.100A>G");
}
#[test]
fn test_correct_whitespace_embedded_single_run_one_warning() {
let (corrected, corrections) = correct_whitespace("c.100 A>G");
assert_eq!(corrected, "c.100A>G");
assert_eq!(corrections.len(), 1);
assert_eq!(corrections[0].original, " ");
assert_eq!(corrections[0].start, 5);
assert_eq!(corrections[0].end, 8);
}
#[test]
fn test_correct_whitespace_multiple_runs_count() {
let (corrected, corrections) = correct_whitespace(" c.100 A> G");
assert_eq!(corrected, "c.100A>G");
assert_eq!(corrections.len(), 3);
}
#[test]
fn test_correct_whitespace_idempotent() {
let (first, _) = correct_whitespace(" c. 100 A>G ");
let (second, corrections) = correct_whitespace(&first);
assert_eq!(first, second);
assert!(corrections.is_empty());
}
#[test]
fn test_correct_whitespace_mixed_unicode_whitespace() {
let input = "c.100\t\u{00A0}\u{2003}\u{000B}A>G";
let (corrected, corrections) = correct_whitespace(input);
assert_eq!(corrected, "c.100A>G");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_detect_position_zero() {
assert!(detect_position_zero("c.0A>G").is_some());
assert!(detect_position_zero("g.0del").is_some());
assert!(detect_position_zero("c.10A>G").is_none());
assert!(detect_position_zero("c.100A>G").is_none());
}
#[test]
fn test_correct_protein_arrow() {
let (corrected, corrections) = correct_protein_arrow("p.Val600>Glu");
assert_eq!(corrected, "p.Val600Glu");
assert_eq!(corrections.len(), 1);
assert_eq!(
corrections[0].error_type,
ErrorType::ProteinSubstitutionArrow
);
}
#[test]
fn test_correct_protein_arrow_no_change() {
let (corrected, corrections) = correct_protein_arrow("p.Val600Glu");
assert_eq!(corrected, "p.Val600Glu");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_protein_arrow_not_protein() {
let (corrected, corrections) = correct_protein_arrow("c.100A>G");
assert_eq!(corrected, "c.100A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_amino_acid_case_lowercase() {
let result = correct_amino_acid_case("val");
assert!(result.is_some());
let (corrected, error_type) = result.unwrap();
assert_eq!(corrected, "Val");
assert_eq!(error_type, ErrorType::LowercaseAminoAcid);
}
#[test]
fn test_correct_amino_acid_case_uppercase() {
let result = correct_amino_acid_case("VAL");
assert!(result.is_some());
let (corrected, _) = result.unwrap();
assert_eq!(corrected, "Val");
}
#[test]
fn test_correct_amino_acid_case_correct() {
let result = correct_amino_acid_case("Val");
assert!(result.is_none());
}
#[test]
fn test_single_to_three_letter_aa() {
assert_eq!(single_to_three_letter_aa('V'), Some("Val"));
assert_eq!(single_to_three_letter_aa('E'), Some("Glu"));
assert_eq!(single_to_three_letter_aa('*'), Some("Ter"));
assert_eq!(single_to_three_letter_aa('Z'), None);
}
#[test]
fn test_correct_amino_acid_case_in_protein_lowercase() {
let (corrected, corrections) =
correct_amino_acid_case_in_protein("NP_000079.2:p.val600glu");
assert_eq!(corrected, "NP_000079.2:p.Val600Glu");
assert_eq!(corrections.len(), 2);
assert!(corrections
.iter()
.all(|c| c.error_type == ErrorType::LowercaseAminoAcid));
assert_eq!(corrections[0].original, "val");
assert_eq!(corrections[0].corrected, "Val");
assert_eq!(corrections[1].original, "glu");
assert_eq!(corrections[1].corrected, "Glu");
}
#[test]
fn test_correct_amino_acid_case_in_protein_uppercase() {
let (corrected, corrections) =
correct_amino_acid_case_in_protein("NP_000079.2:p.VAL600GLU");
assert_eq!(corrected, "NP_000079.2:p.Val600Glu");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_amino_acid_case_in_protein_canonical_no_warning() {
let (corrected, corrections) =
correct_amino_acid_case_in_protein("NP_000079.2:p.Val600Glu");
assert_eq!(corrected, "NP_000079.2:p.Val600Glu");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_amino_acid_case_in_protein_only_runs_on_protein() {
let (corrected, corrections) = correct_amino_acid_case_in_protein("NM_000088.3:c.100A>G");
assert_eq!(corrected, "NM_000088.3:c.100A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_amino_acid_case_in_protein_predicted_parens() {
let (corrected, corrections) =
correct_amino_acid_case_in_protein("NP_000079.2:p.(val600glu)");
assert_eq!(corrected, "NP_000079.2:p.(Val600Glu)");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_amino_acid_case_in_protein_idempotent() {
let (once, _) = correct_amino_acid_case_in_protein("NP_000079.2:p.val600glu");
let (twice, corrections) = correct_amino_acid_case_in_protein(&once);
assert_eq!(twice, once);
assert!(corrections.is_empty());
}
#[test]
fn test_correct_single_letter_aa_substitution() {
let (corrected, corrections) = correct_single_letter_aa_in_protein("NP_000079.2:p.V600E");
assert_eq!(corrected, "NP_000079.2:p.Val600Glu");
assert_eq!(corrections.len(), 2);
assert!(corrections
.iter()
.all(|c| c.error_type == ErrorType::SingleLetterAminoAcid));
}
#[test]
fn test_correct_single_letter_aa_canonical_no_warning() {
let (corrected, corrections) =
correct_single_letter_aa_in_protein("NP_000079.2:p.Val600Glu");
assert_eq!(corrected, "NP_000079.2:p.Val600Glu");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_single_letter_aa_no_p_segment() {
let (corrected, corrections) = correct_single_letter_aa_in_protein("NM_000088.3:c.459A>G");
assert_eq!(corrected, "NM_000088.3:c.459A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_single_letter_aa_predicted() {
let (corrected, corrections) = correct_single_letter_aa_in_protein("NP_000079.2:p.(V600E)");
assert_eq!(corrected, "NP_000079.2:p.(Val600Glu)");
assert_eq!(corrections.len(), 2);
}
#[test]
fn test_correct_single_letter_aa_delins() {
let (corrected, corrections) =
correct_single_letter_aa_in_protein("NP_000079.2:p.V600_E601delinsK");
assert_eq!(corrected, "NP_000079.2:p.Val600_Glu601delinsLys");
assert_eq!(corrections.len(), 3);
}
#[test]
fn test_correct_single_letter_aa_idempotent() {
let (once, _) = correct_single_letter_aa_in_protein("NP_000079.2:p.V600E");
let (twice, corrections) = correct_single_letter_aa_in_protein(&once);
assert_eq!(twice, once);
assert!(corrections.is_empty());
}
#[test]
fn test_detect_missing_versions_single() {
let hits = detect_missing_versions("NM_000088:c.100A>G");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].original, "NM_000088");
assert_eq!(hits[0].error_type, ErrorType::MissingVersion);
assert_eq!(hits[0].start, 0);
assert_eq!(hits[0].end, "NM_000088".len());
}
#[test]
fn test_detect_missing_versions_with_version_no_hit() {
let hits = detect_missing_versions("NM_000088.3:c.100A>G");
assert!(hits.is_empty());
}
#[test]
fn test_detect_missing_versions_inner_accession() {
let hits = detect_missing_versions("NG_012232(NM_004006.2):c.93+1G>T");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].original, "NG_012232");
}
#[test]
fn test_detect_missing_versions_both_missing() {
let hits = detect_missing_versions("NG_012232(NM_004006):c.93+1G>T");
assert_eq!(hits.len(), 2);
let originals: Vec<&str> = hits.iter().map(|h| h.original.as_str()).collect();
assert!(originals.contains(&"NG_012232"));
assert!(originals.contains(&"NM_004006"));
}
#[test]
fn test_detect_missing_versions_ensembl_optional() {
let hits = detect_missing_versions("ENST00000380152:c.100A>G");
assert!(hits.is_empty());
}
#[test]
fn test_detect_missing_versions_no_accession() {
let hits = detect_missing_versions("c.100A>G");
assert!(hits.is_empty());
}
#[test]
fn test_detect_missing_versions_lrg() {
let hits = detect_missing_versions("LRG_199:c.100A>G");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].original, "LRG_199");
}
#[test]
fn test_detect_missing_versions_lrg_transcript_no_hit() {
let hits = detect_missing_versions("LRG_292t1:c.100A>G");
assert!(
hits.is_empty(),
"LRG transcript suffix should not trigger W3001, got {hits:?}",
);
}
#[test]
fn test_detect_missing_versions_lrg_protein_no_hit() {
let hits = detect_missing_versions("LRG_292p1:p.Ser68Arg");
assert!(
hits.is_empty(),
"LRG protein suffix should not trigger W3001, got {hits:?}",
);
}
#[test]
fn test_correct_accession_prefix_lowercase() {
let (corrected, corrections) = correct_accession_prefix_case("nm_000088.3");
assert_eq!(corrected, "NM_000088.3");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_accession_prefix_correct() {
let (corrected, corrections) = correct_accession_prefix_case("NM_000088.3");
assert_eq!(corrected, "NM_000088.3");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_edit_type_uppercase() {
let result = correct_edit_type_case("DEL");
assert!(result.is_some());
let (corrected, _) = result.unwrap();
assert_eq!(corrected, "del");
}
#[test]
fn test_correct_edit_type_mixed() {
let result = correct_edit_type_case("Del");
assert!(result.is_some());
let (corrected, _) = result.unwrap();
assert_eq!(corrected, "del");
}
#[test]
fn test_correct_edit_type_correct() {
let result = correct_edit_type_case("del");
assert!(result.is_none());
}
#[test]
fn test_detected_correction_warning_message() {
let correction =
DetectedCorrection::new(ErrorType::WrongDashCharacter, "\u{2013}", "-", 5, 8);
let msg = correction.warning_message();
assert!(msg.contains("wrong dash character"));
assert!(msg.contains("position 5"));
}
#[test]
fn test_detect_missing_version_no_version() {
let result = detect_missing_version("NM_000088:c.100A>G");
assert!(result.is_some());
let (pos, acc) = result.unwrap();
assert_eq!(pos, 0);
assert_eq!(acc, "NM_000088");
}
#[test]
fn test_detect_missing_version_with_version() {
let result = detect_missing_version("NM_000088.3:c.100A>G");
assert!(result.is_none());
}
#[test]
fn test_detect_missing_version_ensembl() {
let result = detect_missing_version("ENST00000123456:c.100A>G");
assert!(result.is_none());
}
#[test]
fn test_detect_missing_version_nc() {
let result = detect_missing_version("NC_000001:g.12345A>G");
assert!(result.is_some());
}
#[test]
fn test_detect_swapped_positions_swapped() {
let result = detect_swapped_positions("c.200_100del");
assert!(result.is_some());
let correction = result.unwrap();
assert_eq!(correction.error_type, ErrorType::SwappedPositions);
assert_eq!(correction.original, "200_100");
assert_eq!(correction.corrected, "100_200");
}
#[test]
fn test_detect_swapped_positions_correct() {
let result = detect_swapped_positions("c.100_200del");
assert!(result.is_none());
}
#[test]
fn test_detect_swapped_positions_with_accession() {
let result = detect_swapped_positions("NM_000088.3:c.500_100del");
assert!(result.is_some());
let correction = result.unwrap();
assert_eq!(correction.original, "500_100");
assert_eq!(correction.corrected, "100_500");
}
#[test]
fn test_detect_swapped_positions_genomic() {
let result = detect_swapped_positions("g.2000_1000del");
assert!(result.is_some());
}
#[test]
fn test_detect_swapped_positions_negative() {
let result = detect_swapped_positions("c.-10_-50del");
assert!(result.is_some());
}
#[test]
fn test_correct_swapped_positions() {
let (corrected, corrections) = correct_swapped_positions("c.200_100del");
assert_eq!(corrected, "c.100_200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_swapped_positions_no_change() {
let (corrected, corrections) = correct_swapped_positions("c.100_200del");
assert_eq!(corrected, "c.100_200del");
assert!(corrections.is_empty());
}
#[test]
fn test_levenshtein_distance_identical() {
assert_eq!(levenshtein_distance("del", "del"), 0);
assert_eq!(levenshtein_distance("", ""), 0);
}
#[test]
fn test_levenshtein_distance_one_empty() {
assert_eq!(levenshtein_distance("", "abc"), 3);
assert_eq!(levenshtein_distance("abc", ""), 3);
}
#[test]
fn test_levenshtein_distance_substitution() {
assert_eq!(levenshtein_distance("del", "dek"), 1);
assert_eq!(levenshtein_distance("cat", "bat"), 1);
}
#[test]
fn test_levenshtein_distance_insertion() {
assert_eq!(levenshtein_distance("del", "delx"), 1);
assert_eq!(levenshtein_distance("abc", "abcd"), 1);
}
#[test]
fn test_levenshtein_distance_deletion() {
assert_eq!(levenshtein_distance("delx", "del"), 1);
assert_eq!(levenshtein_distance("abcd", "abc"), 1);
}
#[test]
fn test_levenshtein_distance_transposition() {
assert_eq!(levenshtein_distance("ab", "ba"), 2);
assert_eq!(levenshtein_distance("NM_", "MN_"), 2);
}
#[test]
fn test_levenshtein_distance_classic_example() {
assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
}
#[test]
fn test_find_closest_match_exact() {
let candidates = vec!["del", "ins", "dup"];
let result = find_closest_match("del", &candidates, 1, true);
assert!(result.is_some());
assert_eq!(result.unwrap().distance, 0);
}
#[test]
fn test_find_closest_match_typo() {
let candidates = vec!["del", "ins", "dup"];
let result = find_closest_match("dek", &candidates, 1, true);
assert!(result.is_some());
let m = result.unwrap();
assert_eq!(m.matched, "del");
assert_eq!(m.distance, 1);
}
#[test]
fn test_find_closest_match_case_insensitive() {
let candidates = vec!["Val", "Glu", "Ala"];
let result = find_closest_match("VAL", &candidates, 0, false);
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "Val");
}
#[test]
fn test_find_closest_match_no_match() {
let candidates = vec!["del", "ins", "dup"];
let result = find_closest_match("xyz", &candidates, 1, true);
assert!(result.is_none());
}
#[test]
fn test_detect_accession_typo_transposed() {
let result = detect_accession_typo("MN_000088.3");
assert!(result.is_some());
let m = result.unwrap();
assert_eq!(m.matched, "NM_");
}
#[test]
fn test_detect_accession_typo_correct() {
let result = detect_accession_typo("NM_000088.3");
assert!(result.is_none());
}
#[test]
fn test_detect_accession_typo_nc() {
let result = detect_accession_typo("CN_000001.10");
assert!(result.is_some());
let matched = result.unwrap().matched;
assert!(matched == "NM_" || matched == "NC_");
}
#[test]
fn test_detect_accession_typo_ensembl() {
let result = detect_accession_typo("ESNT00000123456");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "ENST");
}
#[test]
fn test_detect_edit_type_typo_dek() {
let result = detect_edit_type_typo("dek");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "del");
}
#[test]
fn test_detect_edit_type_typo_inx() {
let result = detect_edit_type_typo("inx");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "ins");
}
#[test]
fn test_detect_edit_type_typo_correct() {
assert!(detect_edit_type_typo("del").is_none());
assert!(detect_edit_type_typo("ins").is_none());
assert!(detect_edit_type_typo("dup").is_none());
}
#[test]
fn test_detect_amino_acid_typo_vasl() {
let result = detect_amino_acid_typo("Vasl");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "Val");
}
#[test]
fn test_detect_amino_acid_typo_gul() {
let result = detect_amino_acid_typo("Gul");
assert!(result.is_none());
}
#[test]
fn test_detect_amino_acid_typo_single_char_error() {
let result = detect_amino_acid_typo("Vak");
assert!(result.is_some());
assert_eq!(result.unwrap().matched, "Val");
}
#[test]
fn test_detect_amino_acid_typo_correct() {
assert!(detect_amino_acid_typo("Val").is_none());
assert!(detect_amino_acid_typo("Glu").is_none());
assert!(detect_amino_acid_typo("Ala").is_none());
}
#[test]
fn test_detect_typos_accession_prefix() {
let typos = detect_typos("MN_000088.3:c.100A>G");
assert!(!typos.is_empty());
assert_eq!(typos[0].token_type, TypoTokenType::AccessionPrefix);
assert_eq!(typos[0].suggestion, "NM_");
}
#[test]
fn test_detect_typos_no_typos() {
let typos = detect_typos("NM_000088.3:c.100A>G");
assert!(typos.is_empty());
}
#[test]
fn test_detect_typos_edit_type() {
let typos = detect_typos("NM_000088.3:c.100dek");
assert!(!typos.is_empty());
let edit_typo = typos
.iter()
.find(|t| t.token_type == TypoTokenType::EditType);
assert!(edit_typo.is_some());
assert_eq!(edit_typo.unwrap().suggestion, "del");
}
#[test]
fn test_detect_typos_amino_acid() {
let typos = detect_typos("p.Vasl600Glu");
assert!(!typos.is_empty());
assert_eq!(typos[0].token_type, TypoTokenType::AminoAcid);
assert_eq!(typos[0].suggestion, "Val");
}
#[test]
fn test_typo_suggestion_message() {
let suggestion = TypoSuggestion {
original: "MN_".to_string(),
suggestion: "NM_".to_string(),
token_type: TypoTokenType::AccessionPrefix,
start: 0,
end: 3,
distance: 2,
};
let msg = suggestion.suggestion_message();
assert!(msg.contains("accession prefix"));
assert!(msg.contains("MN_"));
assert!(msg.contains("NM_"));
}
#[test]
fn test_typo_token_type_display() {
assert_eq!(
format!("{}", TypoTokenType::AccessionPrefix),
"accession prefix"
);
assert_eq!(format!("{}", TypoTokenType::EditType), "edit type");
assert_eq!(format!("{}", TypoTokenType::AminoAcid), "amino acid");
}
#[test]
fn test_fuzzy_match_struct() {
let m = FuzzyMatch::new("del", 1);
assert_eq!(m.matched, "del");
assert_eq!(m.distance, 1);
}
#[test]
fn test_strip_trailing_annotation_synonymous() {
let (corrected, corrections) =
strip_trailing_annotation("NM_003467.3(CXCR4):c.708G>A (p.Lys236=)");
assert_eq!(corrected, "NM_003467.3(CXCR4):c.708G>A");
assert_eq!(corrections.len(), 1);
assert_eq!(corrections[0].error_type, ErrorType::TrailingAnnotation);
}
#[test]
fn test_strip_trailing_annotation_missense() {
let (corrected, corrections) =
strip_trailing_annotation("NM_021831.6(AGBL5):c.2083G>A (p.Val695Ile)");
assert_eq!(corrected, "NM_021831.6(AGBL5):c.2083G>A");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_strip_trailing_annotation_frameshift() {
let (corrected, corrections) =
strip_trailing_annotation("NM_178127.5(ANGPTL5):c.1097dup (p.Asn366fs)");
assert_eq!(corrected, "NM_178127.5(ANGPTL5):c.1097dup");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_strip_trailing_annotation_no_space() {
let (corrected, corrections) = strip_trailing_annotation("NM_000088.3:c.459A>G(p.Lys153=)");
assert_eq!(corrected, "NM_000088.3:c.459A>G");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_strip_trailing_annotation_no_annotation() {
let (corrected, corrections) = strip_trailing_annotation("NM_000088.3:c.459A>G");
assert_eq!(corrected, "NM_000088.3:c.459A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_strip_trailing_annotation_valid_uncertain() {
let (corrected, corrections) = strip_trailing_annotation("NP_000079.2:p.(Val600Glu)");
assert_eq!(corrected, "NP_000079.2:p.(Val600Glu)");
assert!(corrections.is_empty());
}
#[test]
fn test_strip_trailing_annotation_protein_variant() {
let (corrected, corrections) = strip_trailing_annotation("NP_000079.2:p.Val600Glu");
assert_eq!(corrected, "NP_000079.2:p.Val600Glu");
assert!(corrections.is_empty());
}
#[test]
fn test_strip_trailing_annotation_all_parse_gaps() {
let patterns = [
(
"NM_003467.3(CXCR4):c.708G>A (p.Lys236=)",
"NM_003467.3(CXCR4):c.708G>A",
),
(
"NM_001267550.2(TTN):c.30570C>A (p.Thr10190=)",
"NM_001267550.2(TTN):c.30570C>A",
),
(
"NM_000199.5(SGSH):c.1428C>T (p.His476=)",
"NM_000199.5(SGSH):c.1428C>T",
),
(
"NM_001366385.1(CARD14):c.27C>T (p.Ser9=)",
"NM_001366385.1(CARD14):c.27C>T",
),
(
"NM_001267550.2(TTN):c.49476T>C (p.Pro16492=)",
"NM_001267550.2(TTN):c.49476T>C",
),
(
"NM_198253.3(TERT):c.603G>A (p.Arg201=)",
"NM_198253.3(TERT):c.603G>A",
),
(
"NM_178127.5(ANGPTL5):c.1097dup (p.Asn366fs)",
"NM_178127.5(ANGPTL5):c.1097dup",
),
(
"NM_032303.5(HSDL2):c.894A>G (p.Lys298=)",
"NM_032303.5(HSDL2):c.894A>G",
),
(
"NM_021831.6(AGBL5):c.2083G>A (p.Val695Ile)",
"NM_021831.6(AGBL5):c.2083G>A",
),
(
"NM_201555.2(FHL2):c.507C>T (p.Ile169=)",
"NM_201555.2(FHL2):c.507C>T",
),
(
"NM_025137.4(SPG11):c.1821A>G (p.Ser607=)",
"NM_025137.4(SPG11):c.1821A>G",
),
(
"NM_025137.4(SPG11):c.6892A>G (p.Ile2298Val)",
"NM_025137.4(SPG11):c.6892A>G",
),
(
"NM_001148.6(ANK2):c.8484T>C (p.Asp2828=)",
"NM_001148.6(ANK2):c.8484T>C",
),
(
"NM_001148.6(ANK2):c.6492T>G (p.Leu2164=)",
"NM_001148.6(ANK2):c.6492T>G",
),
(
"NM_001148.6(ANK2):c.231G>A (p.Val77=)",
"NM_001148.6(ANK2):c.231G>A",
),
(
"NM_000059.3(BRCA2):c.3570G>C (p.Arg1190=)",
"NM_000059.3(BRCA2):c.3570G>C",
),
(
"NM_030773.4(TUBB1):c.1045G>A (p.Val349Ile)",
"NM_030773.4(TUBB1):c.1045G>A",
),
];
for (input, expected) in patterns {
let (corrected, corrections) = strip_trailing_annotation(input);
assert_eq!(corrected, expected, "Failed for input: {}", input);
assert_eq!(corrections.len(), 1, "No correction for: {}", input);
}
}
#[test]
fn test_correct_missing_coordinate_prefix_nc() {
let (corrected, corrections) = correct_missing_coordinate_prefix("NC_000017.11:12345A>G");
assert_eq!(corrected, "NC_000017.11:g.12345A>G");
assert_eq!(corrections.len(), 1);
assert_eq!(
corrections[0].error_type,
ErrorType::MissingCoordinatePrefix
);
}
#[test]
fn test_correct_missing_coordinate_prefix_uncertain_range() {
let (corrected, corrections) =
correct_missing_coordinate_prefix("NC_000017.11:(?_31094927)_(31377677_?)del");
assert_eq!(corrected, "NC_000017.11:g.(?_31094927)_(31377677_?)del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_missing_coordinate_prefix_ng() {
let (corrected, corrections) = correct_missing_coordinate_prefix("NG_007489.1:100_200del");
assert_eq!(corrected, "NG_007489.1:g.100_200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_missing_coordinate_prefix_already_present() {
let (corrected, corrections) = correct_missing_coordinate_prefix("NC_000017.11:g.12345A>G");
assert_eq!(corrected, "NC_000017.11:g.12345A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_missing_coordinate_prefix_nm() {
let (corrected, corrections) = correct_missing_coordinate_prefix("NM_000088.3:459A>G");
assert_eq!(corrected, "NM_000088.3:459A>G");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_missing_coordinate_prefix_lrg() {
let (corrected, corrections) = correct_missing_coordinate_prefix("LRG_292:100_200del");
assert_eq!(corrected, "LRG_292:g.100_200del");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_correct_missing_coordinate_prefix_lrg_transcript() {
let (corrected, corrections) = correct_missing_coordinate_prefix("LRG_292t1:100_200del");
assert_eq!(corrected, "LRG_292t1:100_200del");
assert!(corrections.is_empty());
}
#[test]
fn test_correct_missing_coordinate_prefix_bracket() {
let (corrected, corrections) =
correct_missing_coordinate_prefix("NC_000004.12:[144539078A>G]");
assert_eq!(corrected, "NC_000004.12:g.[144539078A>G]");
assert_eq!(corrections.len(), 1);
}
#[test]
fn test_deprecated_stop_star_substitution() {
let (corrected, corrections) = correct_deprecated_protein_forms("NP_000079.2:p.Arg97*");
assert_eq!(corrected, "NP_000079.2:p.Arg97Ter");
assert_eq!(corrections.len(), 1);
assert_eq!(
corrections[0].error_type,
ErrorType::DeprecatedStopCodonStar
);
assert_eq!(corrections[0].original, "*");
assert_eq!(corrections[0].corrected, "Ter");
}
/// A trailing `X` after a protein position is rewritten to `Ter` and tagged
/// as `DeprecatedStopCodonX`.
#[test]
fn test_deprecated_stop_x_substitution() {
    let input = "NP_000079.2:p.Arg97X";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(fixed, "NP_000079.2:p.Arg97Ter");
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].error_type, ErrorType::DeprecatedStopCodonX);
}
/// `fs*23` becomes `fsTer23`; the correction covers the `*23` token and is
/// tagged as `DeprecatedFrameshiftStar`.
#[test]
fn test_deprecated_frameshift_star() {
    let input = "NP_000079.2:p.Arg97fs*23";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(fixed, "NP_000079.2:p.Arg97fsTer23");
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::DeprecatedFrameshiftStar);
    assert_eq!(first.original, "*23");
    assert_eq!(first.corrected, "Ter23");
}
/// `fsX23` becomes `fsTer23` and is tagged as `DeprecatedFrameshiftX`.
#[test]
fn test_deprecated_frameshift_x() {
    let input = "NP_000079.2:p.Arg97fsX23";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(fixed, "NP_000079.2:p.Arg97fsTer23");
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].error_type, ErrorType::DeprecatedFrameshiftX);
}
/// Frameshifts that name the new amino acid (`Arg97Profs…`) are normalised the
/// same way for both the `*` and the `X` spelling; only the reported error
/// type differs.
#[test]
fn test_deprecated_frameshift_with_new_aa() {
    // Shared checks for one spelling; both must normalise to the same output.
    let check = |input: &str, expected_kind: ErrorType| {
        let (fixed, found) = correct_deprecated_protein_forms(input);
        assert_eq!(fixed, "NP_000079.2:p.Arg97ProfsTer23");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].error_type, expected_kind);
    };

    check(
        "NP_000079.2:p.Arg97Profs*23",
        ErrorType::DeprecatedFrameshiftStar,
    );
    check(
        "NP_000079.2:p.Arg97ProfsX23",
        ErrorType::DeprecatedFrameshiftX,
    );
}
/// Each listed input must pass through `correct_deprecated_protein_forms`
/// unchanged and without any reported corrections.
#[test]
fn test_deprecated_canonical_input_no_corrections() {
    let canonical_inputs = [
        "NP_000079.2:p.Arg97Ter",
        "NP_000079.2:p.Arg97ProfsTer23",
        "NP_000079.2:p.Tyr180fs",
        "NP_000079.2:p.Val600Glu",
        "NP_000079.2:p.Arg782Xaa",
        "NP_000079.2:p.Met1ext-5",
        "NP_001166937.1:p.Ter514LeuextTer?",
    ];

    for &input in canonical_inputs.iter() {
        let (fixed, found) = correct_deprecated_protein_forms(input);
        assert_eq!(fixed, input, "input changed: {}", input);
        assert!(
            found.is_empty(),
            "expected no corrections for {}, got {:?}",
            input,
            found
        );
    }
}
/// `del6` (a deletion followed by a size) is detected once and reported as
/// `DelSizeSuffix` with the `del6` token as the original text.
#[test]
fn test_detect_del_size_suffix_basic() {
    let input = "NG_012232.1:g.123del6";
    let found = detect_del_size_suffix(input);
    assert_eq!(found.len(), 1, "expected one hit, got {:?}", found);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::DelSizeSuffix);
    assert_eq!(first.original, "del6");
}
/// A range deletion with no size suffix yields no detections.
#[test]
fn test_detect_del_size_suffix_canonical_no_hit() {
    let input = "NG_012232.1:g.123_128del";
    let found = detect_del_size_suffix(input);
    assert!(found.is_empty());
}
/// A single-position `del` with nothing after it yields no detections.
#[test]
fn test_detect_del_size_suffix_plain_del_no_hit() {
    let input = "NM_000088.3:c.123del";
    let found = detect_del_size_suffix(input);
    assert!(found.is_empty());
}
/// `delins` followed by either a number or a sequence is not mistaken for a
/// `del<size>` suffix.
#[test]
fn test_detect_del_size_suffix_delins_no_hit() {
    let delins_inputs = [
        "NC_000001.11:g.100_102delins10",
        "NC_000001.11:g.100_102delinsATG",
    ];

    for &input in delins_inputs.iter() {
        let found = detect_del_size_suffix(input);
        assert!(found.is_empty());
    }
}
/// A size suffix is still detected when the deletion is given as a range.
#[test]
fn test_detect_del_size_suffix_with_range_still_hits() {
    let input = "NG_012232.1:g.100_120del6";
    let found = detect_del_size_suffix(input);
    assert_eq!(found.len(), 1);
}
/// A `p.` description containing `del32` produces no detections.
#[test]
fn test_detect_del_size_suffix_protein_no_hit() {
    let input = "NP_000079.2:p.Lys100del32";
    let found = detect_del_size_suffix(input);
    assert!(found.is_empty());
}
/// A `delins` with nothing after it is rewritten to `del`, and the single
/// correction records `delins` → `del` as `EmptyDelinsInsert`.
#[test]
fn test_correct_empty_delins_basic() {
    let input = "NC_000001.11:g.100_102delins";
    let (fixed, found) = correct_empty_delins(input);
    assert_eq!(fixed, "NC_000001.11:g.100_102del");
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::EmptyDelinsInsert);
    assert_eq!(first.original, "delins");
    assert_eq!(first.corrected, "del");
}
/// `delins` followed by any payload (sequence, repeat, bracket list,
/// parenthesised range or number) must be left alone and must not warn.
#[test]
fn test_correct_empty_delins_with_payload_no_change() {
    let with_payload = [
        "NC_000001.11:g.100_102delinsATG",
        "NC_000001.11:g.100_102delinsN[12]",
        "NC_000001.11:g.100_102delins[A;G]",
        "NC_000001.11:g.100_102delins(10_20)",
        "NC_000001.11:g.100_102delins10",
    ];

    for &input in with_payload.iter() {
        let (fixed, found) = correct_empty_delins(input);
        assert_eq!(fixed, input, "input {} should not change", input);
        assert!(
            found.is_empty(),
            "input {} should not warn, got {:?}",
            input,
            found
        );
    }
}
/// Running the correction on its own output changes nothing and reports no
/// further corrections.
#[test]
fn test_deprecated_idempotent() {
    let (first_pass, _) = correct_deprecated_protein_forms("NP_000079.2:p.Arg97fs*23");
    let (second_pass, second_found) = correct_deprecated_protein_forms(&first_pass);
    assert_eq!(second_pass, first_pass);
    assert!(second_found.is_empty());
}
/// A trailing `*` or `X` in a `c.` description (no `p.` segment) is not
/// rewritten and produces no corrections.
#[test]
fn test_deprecated_no_protein_context_no_corrections() {
    let non_protein_inputs = ["NM_000088.3:c.123*", "NM_000088.3:c.123X"];

    for &input in non_protein_inputs.iter() {
        let (fixed, found) = correct_deprecated_protein_forms(input);
        assert_eq!(fixed, input);
        assert!(found.is_empty());
    }
}
/// Both members of a bracketed allele are corrected, and the corrections are
/// reported in input order (`*` first, then `X`).
#[test]
fn test_deprecated_multiple_in_compound_allele() {
    let input = "NP_000079.2:p.[Arg97*;Arg100X]";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(fixed, "NP_000079.2:p.[Arg97Ter;Arg100Ter]");
    assert_eq!(found.len(), 2);
    assert_eq!(found[0].error_type, ErrorType::DeprecatedStopCodonStar);
    assert_eq!(found[1].error_type, ErrorType::DeprecatedStopCodonX);
}
/// `Xaa` and an `ext*-5` extension are left exactly as written, with no
/// corrections reported for either.
#[test]
fn test_deprecated_does_not_touch_xaa_or_extension() {
    let untouched_inputs = ["NP_000079.2:p.Arg782Xaa", "NP_000079.2:p.Met1ext*-5"];

    for &input in untouched_inputs.iter() {
        let (fixed, found) = correct_deprecated_protein_forms(input);
        assert_eq!(fixed, input);
        assert!(found.is_empty());
    }
}
/// A `*` immediately before the closing parenthesis of `p.(…)` is rewritten
/// to `Ter`, with one correction reported.
#[test]
fn test_deprecated_predicted_paren_form() {
    let input = "NP_000079.2:p.(Arg97*)";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(fixed, "NP_000079.2:p.(Arg97Ter)");
    assert_eq!(found.len(), 1);
}
/// The reported span refers to the original input: it starts at the `*` and
/// ends at the end of the string (the `*23` token is the final token here).
#[test]
fn test_deprecated_byte_offsets_track_original_input() {
    let input = "NP_000079.2:p.Arg97fs*23";
    let (_, found) = correct_deprecated_protein_forms(input);
    assert_eq!(found.len(), 1);

    let star_at = input.find('*').unwrap();
    let only = &found[0];
    assert_eq!(only.start, star_at);
    assert_eq!(only.end, input.len());
}
/// Non-ASCII characters following the corrected token survive intact: the
/// output still contains both of them.
#[test]
fn test_deprecated_preserves_non_ascii_utf8() {
    let input = "NP_000079.2:p.Arg97* αβ";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(found.len(), 1);
    assert_eq!(fixed, "NP_000079.2:p.Arg97Ter αβ");

    let non_ascii_count = fixed.chars().filter(|ch| !ch.is_ascii()).count();
    assert_eq!(non_ascii_count, 2);
}
/// An `fs*23` that appears before the `p.` segment is not rewritten; the
/// input comes back unchanged with no corrections.
#[test]
fn test_deprecated_fs_branch_gated_to_protein_segment() {
    let input = "fs*23:p.Arg97Ter";
    let (fixed, found) = correct_deprecated_protein_forms(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// An empty `delins` terminated by `;` inside a bracketed allele is rewritten
/// to `del`, leaving the other member untouched.
#[test]
fn test_correct_empty_delins_in_compound_allele() {
    let input = "NM_000088.3:c.[100_102delins;200T>G]";
    let (fixed, found) = correct_empty_delins(input);
    assert_eq!(fixed, "NM_000088.3:c.[100_102del;200T>G]");
    assert_eq!(found.len(), 1);
}
/// The first pass reports one correction; feeding its output back in yields
/// the same `del` form and no further corrections.
#[test]
fn test_correct_empty_delins_idempotent() {
    let (first_pass, first_found) = correct_empty_delins("NC_000001.11:g.100_102delins");
    let (second_pass, second_found) = correct_empty_delins(&first_pass);
    assert_eq!(second_pass, "NC_000001.11:g.100_102del");
    assert_eq!(first_found.len(), 1);
    assert!(second_found.is_empty());
}
/// In an `r.` description, the sequence label between the range and the
/// repeat count is removed; the correction records `cug` → empty string.
#[test]
fn test_correct_redundant_repeat_label_basic() {
    let input = "NM_000088.3:r.100_102cug[4]";
    let (fixed, found) = correct_redundant_repeat_label(input);
    assert_eq!(fixed, "NM_000088.3:r.100_102[4]");
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::RedundantRepeatLabel);
    assert_eq!(first.original, "cug");
    assert_eq!(first.corrected, "");
}
/// The label is also stripped when the range uses negative positions.
#[test]
fn test_correct_redundant_repeat_label_negative_positions() {
    let input = "NM_000088.3:r.-125_-123cug[4]";
    let (fixed, found) = correct_redundant_repeat_label(input);
    assert_eq!(fixed, "NM_000088.3:r.-125_-123[4]");
    assert_eq!(found.len(), 1);
}
/// A repeat that has no label is returned unchanged with no corrections.
#[test]
fn test_correct_redundant_repeat_label_canonical_no_change() {
    let input = "NM_000088.3:r.100_102[4]";
    let (fixed, found) = correct_redundant_repeat_label(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// A labelled repeat in a `c.` description is not touched.
#[test]
fn test_correct_redundant_repeat_label_dna_no_change() {
    let input = "NM_000088.3:c.100_102CAG[4]";
    let (fixed, found) = correct_redundant_repeat_label(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// The label is stripped when the repeat count is itself a range (`[4_8]`).
#[test]
fn test_correct_redundant_repeat_label_range_count() {
    let input = "NM_000088.3:r.100_102cug[4_8]";
    let (fixed, found) = correct_redundant_repeat_label(input);
    assert_eq!(fixed, "NM_000088.3:r.100_102[4_8]");
    assert_eq!(found.len(), 1);
}
/// A second pass over already-corrected output is a no-op with no reported
/// corrections.
#[test]
fn test_correct_redundant_repeat_label_idempotent() {
    let (first_pass, _) = correct_redundant_repeat_label("NM_000088.3:r.100_102cug[4]");
    let (second_pass, second_found) = correct_redundant_repeat_label(&first_pass);
    assert_eq!(first_pass, second_pass);
    assert!(second_found.is_empty());
}
/// When an `r.` description is followed by a `c.` description, only the label
/// in the `r.` part is stripped; the `acg` label after `c.` stays.
#[test]
fn test_correct_redundant_repeat_label_does_not_strip_non_rna_repeat_after_r_description() {
    let (fixed, found) = correct_redundant_repeat_label("r.100_102cug[4];c.50_52acg[3]");
    assert_eq!(fixed, "r.100_102[4];c.50_52acg[3]");
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].original, "cug");
}
/// A `del` range whose two endpoints are equal collapses to a single position;
/// the correction records `123_123` → `123` as `SinglePositionRange`.
#[test]
fn test_correct_single_position_range_del() {
    let input = "NM_000088.3:c.123_123del";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, "NM_000088.3:c.123del");
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::SinglePositionRange);
    assert_eq!(first.original, "123_123");
    assert_eq!(first.corrected, "123");
}
/// A `dup` range with identical endpoints collapses to a single position.
#[test]
fn test_correct_single_position_range_dup() {
    let input = "NM_000088.3:c.123_123dup";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, "NM_000088.3:c.123dup");
    assert_eq!(found.len(), 1);
}
/// An `inv` range with identical endpoints collapses to a single position.
#[test]
fn test_correct_single_position_range_inv() {
    let input = "NM_000088.3:c.100_100inv";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, "NM_000088.3:c.100inv");
    assert_eq!(found.len(), 1);
}
/// Ranges whose endpoints differ are left unchanged for `del`, `dup` and
/// `inv`, and none of them warns.
#[test]
fn test_correct_single_position_range_distinct_positions_no_hit() {
    let real_ranges = [
        "NM_000088.3:c.123_126del",
        "NM_000088.3:c.123_126dup",
        "NM_000088.3:c.100_102inv",
    ];

    for &input in real_ranges.iter() {
        let (fixed, found) = correct_single_position_range(input);
        assert_eq!(fixed, input);
        assert!(found.is_empty(), "input {} should not warn", input);
    }
}
/// Identical negative endpoints (`-50_-50`) collapse to one negative position.
#[test]
fn test_correct_single_position_range_negative_positions() {
    let input = "NM_000088.3:c.-50_-50del";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, "NM_000088.3:c.-50del");
    assert_eq!(found.len(), 1);
}
/// Every degenerate range inside a bracketed allele is collapsed, one
/// correction per occurrence.
#[test]
fn test_correct_single_position_range_multiple_in_allele() {
    let input = "NM_000088.3:c.[100_100del;200_200dup]";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, "NM_000088.3:c.[100del;200dup]");
    assert_eq!(found.len(), 2);
}
/// Re-running the correction on its own output changes nothing and reports
/// nothing.
#[test]
fn test_correct_single_position_range_idempotent() {
    let (first_pass, _) = correct_single_position_range("NM_000088.3:c.123_123del");
    let (second_pass, second_found) = correct_single_position_range(&first_pass);
    assert_eq!(first_pass, second_pass);
    assert!(second_found.is_empty());
}
/// An `ins` with identical endpoints is not collapsed.
#[test]
fn test_correct_single_position_range_does_not_touch_ins() {
    let input = "NM_000088.3:c.100_100insATG";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// A plain single-position substitution passes through unchanged.
#[test]
fn test_correct_single_position_range_does_not_touch_substitution() {
    let input = "NM_000088.3:c.100A>G";
    let (fixed, found) = correct_single_position_range(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// A range substitution written as `79_80GC>TT` becomes `79_80delinsTT`; the
/// correction spans the whole position-plus-change token.
#[test]
fn test_correct_old_substitution_with_refs() {
    let input = "NM_000088.3:c.79_80GC>TT";
    let (fixed, found) = correct_old_substitution_syntax(input);
    assert_eq!(fixed, "NM_000088.3:c.79_80delinsTT");
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::OldSubstitutionSyntax);
    assert_eq!(first.original, "79_80GC>TT");
    assert_eq!(first.corrected, "79_80delinsTT");
}
/// A range followed directly by `>ATG` (no reference bases) is rewritten to
/// `delinsATG`.
#[test]
fn test_correct_old_substitution_no_refs() {
    let input = "NM_000088.3:c.100_102>ATG";
    let (fixed, found) = correct_old_substitution_syntax(input);
    assert_eq!(fixed, "NM_000088.3:c.100_102delinsATG");
    assert_eq!(found.len(), 1);
}
/// A single start position with two reference bases (`79GC>TT`) is expanded
/// to the range `79_80` and rewritten as a `delins`.
#[test]
fn test_correct_old_substitution_single_pos_multi_ref() {
    let input = "NM_000088.3:c.79GC>TT";
    let (fixed, found) = correct_old_substitution_syntax(input);
    assert_eq!(fixed, "NM_000088.3:c.79_80delinsTT");
    assert_eq!(found.len(), 1);
}
/// A single-base substitution is left as-is with no corrections.
#[test]
fn test_correct_old_substitution_canonical_no_op() {
    let input = "NM_000088.3:c.100A>G";
    let (fixed, found) = correct_old_substitution_syntax(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// The rewritten `delins` form is stable: a second pass returns it unchanged
/// and reports nothing.
#[test]
fn test_correct_old_substitution_idempotent() {
    let (first_pass, _) = correct_old_substitution_syntax("c.79_80GC>TT");
    let (second_pass, second_found) = correct_old_substitution_syntax(&first_pass);
    assert_eq!(first_pass, second_pass);
    assert!(second_found.is_empty());
}
/// Negative range endpoints are preserved when `>ATG` is rewritten to
/// `delinsATG`.
#[test]
fn test_correct_old_substitution_negative_position() {
    let input = "c.-10_-8>ATG";
    let (fixed, found) = correct_old_substitution_syntax(input);
    assert_eq!(fixed, "c.-10_-8delinsATG");
    assert_eq!(found.len(), 1);
}
/// `IVS2` after `c.` is detected once as `DeprecatedIvsNotation`, with the
/// `IVS2` token (without the `+2` offset) as the original text.
#[test]
fn test_detect_deprecated_ivs_basic() {
    let input = "NM_000088.3:c.IVS2+2T>G";
    let found = detect_deprecated_ivs(input);
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::DeprecatedIvsNotation);
    assert_eq!(first.original, "IVS2");
}
/// An `IVS` token followed by a minus offset is detected, and the reported
/// original text excludes the offset.
#[test]
fn test_detect_deprecated_ivs_minus_offset() {
    let input = "NM_000088.3:c.IVS5-1G>T";
    let found = detect_deprecated_ivs(input);
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].original, "IVS5");
}
/// A numeric position with an offset (`88+2`) yields no detections.
#[test]
fn test_detect_deprecated_ivs_canonical_no_op() {
    let input = "NM_000088.3:c.88+2T>G";
    let found = detect_deprecated_ivs(input);
    assert!(found.is_empty());
}
/// The letters `IVS` inside the accession part (before the colon) are not
/// reported.
#[test]
fn test_detect_deprecated_ivs_does_not_match_in_accession() {
    let input = "NM_IVS_TEST.3:c.100A>G";
    let found = detect_deprecated_ivs(input);
    assert!(found.is_empty());
}
/// `IVS` notation is also detected after an `n.` prefix.
#[test]
fn test_detect_deprecated_ivs_n_prefix() {
    let input = "NR_001234.1:n.IVS3+1G>A";
    let found = detect_deprecated_ivs(input);
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].original, "IVS3");
}
/// `IVS` notation is also detected after an `r.` prefix, with the same error
/// type and original text as for the other prefixes.
#[test]
fn test_detect_deprecated_ivs_r_prefix() {
    let input = "NR_001234.1:r.IVS3+1g>a";
    let found = detect_deprecated_ivs(input);
    assert_eq!(found.len(), 1);

    let first = &found[0];
    assert_eq!(first.error_type, ErrorType::DeprecatedIvsNotation);
    assert_eq!(first.original, "IVS3");
}
/// `con` between a range and a reference-qualified source is rewritten to
/// `delins` and reported as `DeprecatedConSyntax`.
#[test]
fn test_correct_deprecated_con_basic() {
    let input = "NM_004006.2:c.100_200conNM_001.1:c.5_105";
    let (fixed, found) = correct_deprecated_con(input);
    assert_eq!(fixed, "NM_004006.2:c.100_200delinsNM_001.1:c.5_105");
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].error_type, ErrorType::DeprecatedConSyntax);
}
/// Input that already uses `delins` is returned unchanged with no corrections.
#[test]
fn test_correct_deprecated_con_canonical_no_op() {
    let input = "NM_004006.2:c.100_200delinsNM_001.1:c.5_105";
    let (fixed, found) = correct_deprecated_con(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// The `con` → `delins` rewrite also applies to a `g.` description.
#[test]
fn test_correct_deprecated_con_genomic() {
    let input = "g.1000_2000conNC_000022.10:g.5_1005";
    let (fixed, found) = correct_deprecated_con(input);
    assert_eq!(fixed, "g.1000_2000delinsNC_000022.10:g.5_1005");
    assert_eq!(found.len(), 1);
}
/// A second pass over the rewritten output is a no-op with no corrections.
#[test]
fn test_correct_deprecated_con_idempotent() {
    let (first_pass, _) = correct_deprecated_con("c.100_200conNM_001.1:c.5_105");
    let (second_pass, second_found) = correct_deprecated_con(&first_pass);
    assert_eq!(first_pass, second_pass);
    assert!(second_found.is_empty());
}
/// Free text that merely starts with the letters `con` is not rewritten.
#[test]
fn test_correct_deprecated_con_does_not_match_inside_word() {
    let input = "concept text";
    let (fixed, found) = correct_deprecated_con(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// `con` that continues into a longer word right after a range
/// (`conditional`) is not rewritten.
#[test]
fn test_correct_deprecated_con_does_not_match_no_separator_word() {
    let input = "c.100_200conditional";
    let (fixed, found) = correct_deprecated_con(input);
    assert_eq!(fixed, input);
    assert!(found.is_empty());
}
/// `con` followed by a bare position range (no reference accession) is
/// rewritten to `delins`.
#[test]
fn test_correct_deprecated_con_bare_position_source() {
    let input = "c.100_200con5_105";
    let (fixed, found) = correct_deprecated_con(input);
    assert_eq!(fixed, "c.100_200delins5_105");
    assert_eq!(found.len(), 1);
}
}