use crate::hgvs::edit::{Base, NaEdit};
use crate::hgvs::interval::{CdsInterval, GenomeInterval};
use crate::hgvs::location::{CdsPos, GenomePos};
use crate::hgvs::variant::{Accession, CdsVariant, GenomeVariant, HgvsVariant, LocEdit};
/// Returns `true` when `b` is one of the uppercase nucleotide codes accepted
/// by the fast path.
///
/// The accepted bytes are `A`, `C`, `G`, `T`, `U` and the ambiguity codes
/// `R`, `Y`, `S`, `W`, `K`, `M`, `B`, `D`, `H`, `V`, `N`. Lowercase letters
/// and every other byte are rejected.
#[inline]
const fn is_iupac_base(b: u8) -> bool {
    // Same sixteen letters, grouped into contiguous ASCII runs:
    // A-D, G-H, K, M-N, R-W and Y.
    matches!(
        b,
        b'A'..=b'D' | b'G'..=b'H' | b'K' | b'M'..=b'N' | b'R'..=b'W' | b'Y'
    )
}
/// Scans a run of ASCII decimal digits in `bytes`, beginning at index `start`.
///
/// Returns `(value, end)`, where `end` is the index of the first byte after
/// the digit run and `value` is the decimal value of the run. When there is
/// no digit at `start` (including when `start` is at or past the end of
/// `bytes`) the result is `(0, start)`.
///
/// The accumulation saturates at `u64::MAX` rather than overflowing, so an
/// arbitrarily long digit run neither panics in debug builds nor wraps around
/// in release builds. A returned value of `u64::MAX` therefore means "at
/// least `u64::MAX`".
#[inline]
fn scan_digits(bytes: &[u8], start: usize) -> (u64, usize) {
    let mut end = start;
    let mut value = 0u64;
    // `get` keeps an out-of-range `start` harmless: the loop simply never runs.
    while let Some(&byte) = bytes.get(end) {
        if !byte.is_ascii_digit() {
            break;
        }
        value = value
            .saturating_mul(10)
            .saturating_add(u64::from(byte - b'0'));
        end += 1;
    }
    (value, end)
}
/// Outcome of [`try_fast_path`].
// The size gap between the two variants is accepted rather than boxing the
// payload. NOTE(review): presumably to keep the fast path free of a heap
// allocation for the result — confirm before changing.
#[allow(clippy::large_enum_variant)]
pub enum FastPathResult {
    /// The input was recognised and parsed on the fast path; carries the
    /// resulting variant.
    Success(HgvsVariant),
    /// The fast path declined the input.
    /// NOTE(review): presumably the caller then runs the full parser — confirm
    /// at the call site.
    Fallback,
}
/// Attempts to parse `input` as a simple single-position substitution without
/// going through the full parser.
///
/// The function only ever returns [`FastPathResult::Success`] for inputs that
///
/// * are at least 10 bytes long,
/// * end in `<base>><base>` (for example `...12345A>G`), and
/// * start with one of the accession shapes dispatched below (`N?_`/`X?_`,
///   `ENS...`, `LRG_...`, `GRCh...` or `hg...`).
///
/// Everything else yields [`FastPathResult::Fallback`]. In particular `n.` and
/// `r.` variants are always declined, and `c.` variants are declined when the
/// position starts with `*` or `-` or contains a `+`/`-` before the reference
/// base.
///
/// This function never panics on malformed input: a `c.` variant whose
/// position part is too short to hold any digits (such as `NM_0001:c.A>G`) is
/// declined instead of being sliced with an inverted range.
#[inline]
pub fn try_fast_path(input: &str) -> FastPathResult {
    let bytes = input.as_bytes();
    let len = bytes.len();

    if len < 10 {
        return FastPathResult::Fallback;
    }

    // Cheap tail check: the last three bytes must read `<base>><base>`.
    // `len >= 10` makes all three indices valid.
    if bytes[len - 2] != b'>'
        || !is_iupac_base(bytes[len - 1])
        || !is_iupac_base(bytes[len - 3])
    {
        return FastPathResult::Fallback;
    }

    // Pre-filter on the variant type that follows the first `:`.
    if let Some(colon_pos) = memchr::memchr(b':', bytes) {
        if colon_pos + 2 < len && bytes[colon_pos + 2] == b'.' {
            match bytes[colon_pos + 1] {
                b'n' | b'r' => return FastPathResult::Fallback,
                b'c' if colon_pos + 3 < len => {
                    let first = bytes[colon_pos + 3];
                    if first == b'*' || first == b'-' {
                        return FastPathResult::Fallback;
                    }
                    // Bytes between the first position byte and the reference
                    // base. `get` returns `None` when that range is inverted,
                    // i.e. when the position has no room for even one digit;
                    // such input cannot be a simple substitution.
                    match bytes.get(colon_pos + 4..len - 3) {
                        Some(middle) if memchr::memchr2(b'+', b'-', middle).is_none() => {}
                        _ => return FastPathResult::Fallback,
                    }
                }
                _ => {}
            }
        }
    }

    // Dispatch on the accession shape. `len >= 10` keeps the fixed indices
    // below in bounds.
    match bytes[0] {
        b'N' | b'X' if bytes[2] == b'_' => try_refseq_fast_path(input, bytes),
        b'E' if bytes.starts_with(b"ENS") => try_ensembl_fast_path(input, bytes),
        b'L' if bytes.starts_with(b"LRG_") => try_lrg_fast_path(input, bytes),
        b'G' if bytes.starts_with(b"GRCh") => try_assembly_fast_path(input, bytes),
        b'h' if bytes[1] == b'g' => try_assembly_fast_path(input, bytes),
        _ => FastPathResult::Fallback,
    }
}
/// Fast path for accessions shaped like `NC_000001.11:g.12345A>G`.
///
/// Expected layout: two uppercase ASCII letters, `_`, one or more digits, an
/// optional `.<version>`, `:`, a one-letter variant type, `.`, then the edit.
/// `g.` and `c.` edits are forwarded to the substitution parsers; any other
/// variant type, or any deviation from the layout, yields
/// [`FastPathResult::Fallback`].
///
/// `bytes` is expected to be `input.as_bytes()`.
///
/// A version that does not fit in `u32` is declined rather than truncated,
/// and the accession number is only measured, never converted to an integer,
/// so arbitrarily long digit runs cannot overflow.
#[inline]
fn try_refseq_fast_path(input: &str, bytes: &[u8]) -> FastPathResult {
    const PREFIX_LEN: usize = 2;
    const NUMBER_START: usize = PREFIX_LEN + 1;

    if bytes.len() < NUMBER_START
        || !bytes[0].is_ascii_uppercase()
        || !bytes[1].is_ascii_uppercase()
        || bytes[2] != b'_'
    {
        return FastPathResult::Fallback;
    }

    // Accession number: kept as text, so only its extent matters.
    let number_digits = bytes[NUMBER_START..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if number_digits == 0 {
        return FastPathResult::Fallback;
    }
    let number_end = NUMBER_START + number_digits;
    let number_str = &input[NUMBER_START..number_end];

    // Optional `.<version>`.
    let (version, version_end) = if bytes.get(number_end) == Some(&b'.') {
        let version_start = number_end + 1;
        let version_digits = bytes[version_start..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        if version_digits == 0 {
            return FastPathResult::Fallback;
        }
        let version_end = version_start + version_digits;
        // `parse` reports out-of-range values as an error instead of wrapping.
        match input[version_start..version_end].parse::<u32>() {
            Ok(v) => (Some(v), version_end),
            Err(_) => return FastPathResult::Fallback,
        }
    } else {
        (None, number_end)
    };

    if bytes.get(version_end) != Some(&b':') {
        return FastPathResult::Fallback;
    }

    // `<type>.` must follow the colon.
    let type_start = version_end + 1;
    if bytes.get(type_start + 1) != Some(&b'.') {
        return FastPathResult::Fallback;
    }

    let prefix = &input[..PREFIX_LEN];
    let accession = Accession::with_style(
        prefix.to_string(),
        number_str.to_string(),
        version,
        false,
    );

    let edit_start = type_start + 2;
    match bytes[type_start] {
        b'g' => try_parse_genome_substitution(input, bytes, edit_start, accession),
        b'c' => try_parse_cds_substitution(input, bytes, edit_start, accession),
        _ => FastPathResult::Fallback,
    }
}
/// Fast path for accessions shaped like `ENSG00000141510.5:g.12345A>G`.
///
/// Expected layout: `ENS`, one of `T`/`G`/`P`/`E`/`R`, 11 to 15 digits, an
/// optional `.<version>`, `:`, a one-letter variant type, `.`, then the edit.
/// `g.` and `c.` edits are forwarded to the substitution parsers; any other
/// variant type, or any deviation from the layout, yields
/// [`FastPathResult::Fallback`].
///
/// `bytes` is expected to be `input.as_bytes()`.
///
/// A version that does not fit in `u32` is declined rather than truncated,
/// and the accession number is only measured, never converted to an integer,
/// so long digit runs cannot overflow.
#[inline]
fn try_ensembl_fast_path(input: &str, bytes: &[u8]) -> FastPathResult {
    const PREFIX_LEN: usize = 4;

    if bytes.len() < 15 || !bytes.starts_with(b"ENS") {
        return FastPathResult::Fallback;
    }
    if !matches!(bytes[3], b'T' | b'G' | b'P' | b'E' | b'R') {
        return FastPathResult::Fallback;
    }

    // Accession number: kept as text, so only its extent matters.
    let number_digits = bytes[PREFIX_LEN..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if !(11..=15).contains(&number_digits) {
        return FastPathResult::Fallback;
    }
    let number_end = PREFIX_LEN + number_digits;
    let number_str = &input[PREFIX_LEN..number_end];

    // Optional `.<version>`.
    let (version, version_end) = if bytes.get(number_end) == Some(&b'.') {
        let version_start = number_end + 1;
        let version_digits = bytes[version_start..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        if version_digits == 0 {
            return FastPathResult::Fallback;
        }
        let version_end = version_start + version_digits;
        // `parse` reports out-of-range values as an error instead of wrapping.
        match input[version_start..version_end].parse::<u32>() {
            Ok(v) => (Some(v), version_end),
            Err(_) => return FastPathResult::Fallback,
        }
    } else {
        (None, number_end)
    };

    if bytes.get(version_end) != Some(&b':') {
        return FastPathResult::Fallback;
    }

    // `<type>.` must follow the colon.
    let var_type_start = version_end + 1;
    if bytes.get(var_type_start + 1) != Some(&b'.') {
        return FastPathResult::Fallback;
    }

    let prefix = &input[..PREFIX_LEN];
    let accession = Accession::with_style(
        prefix.to_string(),
        number_str.to_string(),
        version,
        true,
    );

    let edit_start = var_type_start + 2;
    match bytes[var_type_start] {
        b'g' => try_parse_genome_substitution(input, bytes, edit_start, accession),
        b'c' => try_parse_cds_substitution(input, bytes, edit_start, accession),
        _ => FastPathResult::Fallback,
    }
}
/// Fast path for accessions shaped like `LRG_1:g.12345A>G` or
/// `LRG_1t1:c.100A>G`.
///
/// Expected layout: `LRG_`, one or more digits, an optional `t<digits>` or
/// `p<digits>` suffix, `:`, a one-letter variant type, `.`, then the edit.
/// The accession is built with prefix `"LRG"`, no version, and a number made
/// of everything between `LRG_` and the colon (suffix included). `g.` and
/// `c.` edits are forwarded to the substitution parsers; anything else yields
/// [`FastPathResult::Fallback`].
///
/// `bytes` is expected to be `input.as_bytes()`.
///
/// The digit runs are only measured, never converted to integers, so
/// arbitrarily long runs cannot overflow.
#[inline]
fn try_lrg_fast_path(input: &str, bytes: &[u8]) -> FastPathResult {
    const NUMBER_START: usize = 4;

    if bytes.len() < 8 || !bytes.starts_with(b"LRG_") {
        return FastPathResult::Fallback;
    }

    let number_digits = bytes[NUMBER_START..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if number_digits == 0 {
        return FastPathResult::Fallback;
    }
    let number_end = NUMBER_START + number_digits;

    // Optional `t<digits>` / `p<digits>` suffix, kept as part of the number.
    let accession_end = match bytes.get(number_end) {
        Some(&b't') | Some(&b'p') => {
            let suffix_start = number_end + 1;
            let suffix_digits = bytes[suffix_start..]
                .iter()
                .take_while(|b| b.is_ascii_digit())
                .count();
            if suffix_digits == 0 {
                return FastPathResult::Fallback;
            }
            suffix_start + suffix_digits
        }
        _ => number_end,
    };
    let full_number = &input[NUMBER_START..accession_end];

    if bytes.get(accession_end) != Some(&b':') {
        return FastPathResult::Fallback;
    }

    // `<type>.` must follow the colon.
    let type_start = accession_end + 1;
    if bytes.get(type_start + 1) != Some(&b'.') {
        return FastPathResult::Fallback;
    }

    let accession = Accession::with_style(
        "LRG".to_string(),
        full_number.to_string(),
        None,
        false,
    );

    let edit_start = type_start + 2;
    match bytes[type_start] {
        b'g' => try_parse_genome_substitution(input, bytes, edit_start, accession),
        b'c' => try_parse_cds_substitution(input, bytes, edit_start, accession),
        _ => FastPathResult::Fallback,
    }
}
/// Fast path for assembly-qualified variants such as
/// `GRCh38(chr1):g.12345A>G` or `hg19(chr1):g.12345A>G`.
///
/// Expected layout: one of the assembly names `GRCh36`, `GRCh37`, `GRCh38`,
/// `hg18`, `hg19`, `hg38`, then `(<chromosome>)`, `:`, `g.`, then the edit.
/// Only `g.` variants are handled; any other variant type, an unknown
/// assembly name, an empty chromosome name, or any deviation from the layout
/// yields [`FastPathResult::Fallback`].
///
/// `bytes` is expected to be `input.as_bytes()`.
#[inline]
fn try_assembly_fast_path(input: &str, bytes: &[u8]) -> FastPathResult {
    let open = match bytes.iter().position(|&b| b == b'(') {
        Some(index) => index,
        None => return FastPathResult::Fallback,
    };

    let assembly = &input[..open];
    if !matches!(
        assembly,
        "GRCh37" | "GRCh38" | "hg19" | "hg38" | "hg18" | "GRCh36"
    ) {
        return FastPathResult::Fallback;
    }

    let close = match bytes[open + 1..].iter().position(|&b| b == b')') {
        Some(offset) => open + 1 + offset,
        None => return FastPathResult::Fallback,
    };

    let chromosome = &input[open + 1..close];
    // `GRCh38():g...` names no chromosome; decline it rather than building
    // an accession with an empty chromosome.
    if chromosome.is_empty() {
        return FastPathResult::Fallback;
    }

    if bytes.get(close + 1) != Some(&b':') {
        return FastPathResult::Fallback;
    }

    // `<type>.` must follow the colon.
    let type_start = close + 2;
    if bytes.get(type_start + 1) != Some(&b'.') {
        return FastPathResult::Fallback;
    }

    let accession = Accession::from_assembly(assembly.to_string(), chromosome.to_string());

    let edit_start = type_start + 2;
    match bytes[type_start] {
        b'g' => try_parse_genome_substitution(input, bytes, edit_start, accession),
        _ => FastPathResult::Fallback,
    }
}
/// Parses the `<position><ref>><alt>` part of a `g.` substitution.
///
/// `bytes[edit_start..]` must consist of one or more decimal digits followed
/// by exactly three bytes `<base>><base>` reaching the end of `bytes`; any
/// other shape yields [`FastPathResult::Fallback`]. On success the result is
/// a genome variant for `accession` at that single position, with no gene
/// symbol.
///
/// Positions longer than 19 digits are declined, which guarantees that the
/// numeric value always fits in `u64` without overflow.
#[inline]
fn try_parse_genome_substitution(
    _input: &str,
    bytes: &[u8],
    edit_start: usize,
    accession: Accession,
) -> FastPathResult {
    // Length of the trailing `<ref>><alt>`.
    const TAIL_LEN: usize = 3;
    // `u64::MAX` has 20 digits, so any 19-digit number is representable.
    const MAX_POSITION_DIGITS: usize = 19;

    // The position must occupy exactly `edit_start..tail_start`.
    let tail_start = match bytes.len().checked_sub(TAIL_LEN) {
        Some(t) if t > edit_start && t - edit_start <= MAX_POSITION_DIGITS => t,
        _ => return FastPathResult::Fallback,
    };

    // Scanning only up to `tail_start` bounds the digit count, so the
    // accumulation cannot overflow.
    let (position, pos_end) = scan_digits(&bytes[..tail_start], edit_start);
    if pos_end != tail_start {
        return FastPathResult::Fallback;
    }

    if !is_iupac_base(bytes[tail_start])
        || bytes[tail_start + 1] != b'>'
        || !is_iupac_base(bytes[tail_start + 2])
    {
        return FastPathResult::Fallback;
    }

    // NOTE(review): these unwraps rely on `Base::from_char` accepting every
    // byte that `is_iupac_base` accepts — confirm against `Base`.
    let reference = Base::from_char(bytes[tail_start] as char).unwrap();
    let alternative = Base::from_char(bytes[tail_start + 2] as char).unwrap();

    let interval = GenomeInterval::point(GenomePos::new(position));
    let edit = NaEdit::Substitution {
        reference,
        alternative,
    };
    FastPathResult::Success(HgvsVariant::Genome(GenomeVariant {
        accession,
        gene_symbol: None,
        loc_edit: LocEdit::new(interval, edit),
    }))
}
/// Parses the `<position><ref>><alt>` part of a `c.` substitution.
///
/// `bytes[edit_start..]` must consist of one or more decimal digits followed
/// by exactly three bytes `<base>><base>` reaching the end of `bytes`; any
/// other shape — including positions carrying a `+`/`-` offset — yields
/// [`FastPathResult::Fallback`]. On success the result is a CDS variant for
/// `accession` at that single position, with no gene symbol.
///
/// Positions longer than 18 digits are declined, which guarantees that the
/// numeric value fits in `i64` (and therefore in `u64`) without overflow or
/// sign wrap-around.
#[inline]
fn try_parse_cds_substitution(
    _input: &str,
    bytes: &[u8],
    edit_start: usize,
    accession: Accession,
) -> FastPathResult {
    // Length of the trailing `<ref>><alt>`.
    const TAIL_LEN: usize = 3;
    // `i64::MAX` has 19 digits, so any 18-digit number is representable.
    const MAX_POSITION_DIGITS: usize = 18;

    // The position must occupy exactly `edit_start..tail_start`.
    let tail_start = match bytes.len().checked_sub(TAIL_LEN) {
        Some(t) if t > edit_start && t - edit_start <= MAX_POSITION_DIGITS => t,
        _ => return FastPathResult::Fallback,
    };

    // Scanning only up to `tail_start` bounds the digit count, so the
    // accumulation cannot overflow. A `+` or `-` inside the span stops the
    // scan early and is rejected by the check below.
    let (position, pos_end) = scan_digits(&bytes[..tail_start], edit_start);
    if pos_end != tail_start {
        return FastPathResult::Fallback;
    }

    if !is_iupac_base(bytes[tail_start])
        || bytes[tail_start + 1] != b'>'
        || !is_iupac_base(bytes[tail_start + 2])
    {
        return FastPathResult::Fallback;
    }

    // NOTE(review): these unwraps rely on `Base::from_char` accepting every
    // byte that `is_iupac_base` accepts — confirm against `Base`.
    let reference = Base::from_char(bytes[tail_start] as char).unwrap();
    let alternative = Base::from_char(bytes[tail_start + 2] as char).unwrap();

    // Lossless: `position` has at most 18 digits (see above).
    let interval = CdsInterval::point(CdsPos::new(position as i64));
    let edit = NaEdit::Substitution {
        reference,
        alternative,
    };
    FastPathResult::Success(HgvsVariant::Cds(CdsVariant {
        accession,
        gene_symbol: None,
        loc_edit: LocEdit::new(interval, edit),
    }))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the fast path on `input` and returns the parsed variant.
    ///
    /// Panics with `Expected success for <what>` when the fast path declines.
    fn expect_success(input: &str, what: &str) -> HgvsVariant {
        match try_fast_path(input) {
            FastPathResult::Success(variant) => variant,
            FastPathResult::Fallback => panic!("Expected success for {}", what),
        }
    }

    #[test]
    fn test_refseq_genomic_substitution() {
        let variant = expect_success("NC_000001.11:g.12345A>G", "RefSeq genomic");
        assert!(matches!(variant, HgvsVariant::Genome(_)));
        if let HgvsVariant::Genome(g) = variant {
            assert_eq!(&*g.accession.prefix, "NC");
            assert_eq!(&*g.accession.number, "000001");
            assert_eq!(g.accession.version, Some(11));
        }
    }

    #[test]
    fn test_refseq_cds_substitution() {
        let variant = expect_success("NM_000088.3:c.459A>G", "RefSeq CDS");
        assert!(matches!(variant, HgvsVariant::Cds(_)));
        if let HgvsVariant::Cds(c) = variant {
            assert_eq!(&*c.accession.prefix, "NM");
            assert_eq!(&*c.accession.number, "000088");
            assert_eq!(c.accession.version, Some(3));
        }
    }

    #[test]
    fn test_ensembl_genomic_substitution() {
        let variant = expect_success("ENSG00000141510.5:g.12345A>G", "Ensembl genomic");
        assert!(matches!(variant, HgvsVariant::Genome(_)));
        if let HgvsVariant::Genome(g) = variant {
            assert_eq!(&*g.accession.prefix, "ENSG");
            assert!(g.accession.ensembl_style);
        }
    }

    #[test]
    fn test_ensembl_cds_substitution() {
        let variant = expect_success("ENST00000012345.1:c.100A>G", "Ensembl CDS");
        assert!(matches!(variant, HgvsVariant::Cds(_)));
    }

    #[test]
    fn test_lrg_substitution() {
        let variant = expect_success("LRG_1:g.12345A>G", "LRG");
        assert!(matches!(variant, HgvsVariant::Genome(_)));
    }

    #[test]
    fn test_assembly_substitution() {
        let variant = expect_success("GRCh38(chr1):g.12345A>G", "assembly");
        assert!(matches!(variant, HgvsVariant::Genome(_)));
    }

    #[test]
    fn test_hg_assembly_substitution() {
        let variant = expect_success("hg38(chr1):g.12345A>G", "hg assembly");
        assert!(matches!(variant, HgvsVariant::Genome(_)));
    }

    #[test]
    fn test_fallback_for_complex_patterns() {
        // Inputs the fast path must decline: offset / `*` CDS positions,
        // non-substitution edits, ranges, and protein variants.
        const DECLINED: [&str; 6] = [
            "NM_000088.3:c.100+5A>G",
            "NM_000088.3:c.*100A>G",
            "NC_000001.11:g.12345del",
            "NC_000001.11:g.12345_12346insA",
            "NC_000001.11:g.12345_12346del",
            "NP_000079.2:p.Arg100Gly",
        ];
        for &input in DECLINED.iter() {
            assert!(matches!(try_fast_path(input), FastPathResult::Fallback));
        }
    }

    #[test]
    fn test_fallback_for_unknown_patterns() {
        // An unrecognised accession prefix and an input below the minimum length.
        const DECLINED: [&str; 2] = ["ABC_12345.1:g.100A>G", "N:g.1A>G"];
        for &input in DECLINED.iter() {
            assert!(matches!(try_fast_path(input), FastPathResult::Fallback));
        }
    }
}