use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
/// Upper bound on a single HGVS string; `validate_hgvs` compares it against
/// `str::len`, i.e. a byte count.
const MAX_HGVS_LENGTH: usize = 1000;
/// Largest timeout, in seconds, accepted by `validate_timeout` (inclusive).
const MAX_TIMEOUT_SECONDS: u32 = 300;
/// Smallest timeout, in seconds, accepted by `validate_timeout` (inclusive).
const MIN_TIMEOUT_SECONDS: u32 = 1;
/// Largest number of variants `validate_hgvs_batch` accepts in one call.
const MAX_BATCH_SIZE: usize = 1000;
/// Lazily compiled regex describing the overall shape `validate_hgvs` requires.
///
/// The literal uses verbose (`(?x)`) mode, so the whitespace and `#` comments
/// inside it are ignored by the regex engine. The pattern is anchored at both
/// ends and requires, in order:
///
/// 1. a reference accession in one of the alternatives listed in the literal
///    (underscore-style accessions, `ENS*` identifiers, two GenBank-style
///    forms, or an assembly name followed by a parenthesised part);
/// 2. a `:` followed by one letter from `cgmnpro` and a literal `.`;
/// 3. one or more further characters (`.+`), which this pattern does not
///    constrain any further.
///
/// # Panics
///
/// Panics on first access if the pattern literal fails to compile.
static HGVS_PATTERN: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"(?x)
^
(?:
[A-Za-z]{2,4}_\d+(?:\.\d+)? # RefSeq / LRG (NM_000088.3, LRG_1)
| ENS[TGPER]\d+(?:\.\d+)? # Ensembl (transcript/gene/protein/exon/regulatory)
| [A-Z]\d{5}(?:\.\d+)? # GenBank (U12345.1)
| [A-Z]{2}\d{6}(?:\.\d+)? # GenBank (AF118569.1)
| (?:GRCh\d+|hg(?:18|19|38))\([^)]+\) # Assembly-prefixed (GRCh38(chr1), hg38(chr1))
)
:[cgmnpro]\.
.+
$
",
)
// The pattern is a fixed literal, so a failure here is a programming error.
.unwrap()
});
/// Reasons an input can be rejected by the validation functions in this file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ValidationError {
/// The input string was empty.
Empty,
/// The input's length (`actual`, from `str::len`) exceeded `max`.
TooLong { max: usize, actual: usize },
/// The input contained at least one non-ASCII character.
NonAscii,
/// The input did not match `HGVS_PATTERN`.
InvalidFormat,
/// The input contained a character rejected by `validate_hgvs`'s
/// dangerous-character check.
DangerousCharacters,
/// The timeout `actual` was outside the inclusive range `min..=max`.
InvalidTimeout { min: u32, max: u32, actual: u32 },
/// The batch held `actual` entries, more than the allowed `max`.
BatchTooLarge { max: usize, actual: usize },
}
impl std::fmt::Display for ValidationError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ValidationError::Empty => write!(f, "Input cannot be empty"),
ValidationError::TooLong { max, actual } => {
write!(f, "Input too long: {} characters (max: {})", actual, max)
}
ValidationError::NonAscii => write!(f, "Input must contain only ASCII characters"),
ValidationError::InvalidFormat => write!(f, "Input does not match valid HGVS format"),
ValidationError::DangerousCharacters => {
write!(f, "Input contains potentially dangerous characters")
}
ValidationError::InvalidTimeout { min, max, actual } => {
write!(
f,
"Timeout {} is out of valid range ({}-{})",
actual, min, max
)
}
ValidationError::BatchTooLarge { max, actual } => {
write!(f, "Batch size {} exceeds maximum allowed ({})", actual, max)
}
}
}
}
// Marker impl with no overridden methods: lets `ValidationError` be used
// wherever a `std::error::Error` is expected.
impl std::error::Error for ValidationError {}
/// Validates a single HGVS variant string.
///
/// The checks run in the order below and the first failure is returned.
///
/// # Errors
///
/// * [`ValidationError::Empty`] if `input` is the empty string.
/// * [`ValidationError::TooLong`] if `input.len()` (a byte count) exceeds
///   `MAX_HGVS_LENGTH`.
/// * [`ValidationError::NonAscii`] if `input` contains a non-ASCII character.
/// * [`ValidationError::InvalidFormat`] if `input` does not match
///   `HGVS_PATTERN`.
/// * [`ValidationError::DangerousCharacters`] if `input` contains any of
///   `<`, `|`, `&`, `` ` ``, `$`, `{`, `}`, `\`, or any ASCII control
///   character.
pub fn validate_hgvs(input: &str) -> Result<(), ValidationError> {
    if input.is_empty() {
        return Err(ValidationError::Empty);
    }

    let byte_len = input.len();
    if byte_len > MAX_HGVS_LENGTH {
        return Err(ValidationError::TooLong {
            max: MAX_HGVS_LENGTH,
            actual: byte_len,
        });
    }

    if !input.is_ascii() {
        return Err(ValidationError::NonAscii);
    }

    if !HGVS_PATTERN.is_match(input) {
        return Err(ValidationError::InvalidFormat);
    }

    // The pattern's trailing `.+` accepts every character except `\n`, so
    // NUL, CR, TAB, ESC, DEL and other control characters would otherwise
    // pass. Reject them together with the deny-listed metacharacters.
    // `input` is known to be ASCII here, so scanning bytes is equivalent to
    // scanning chars.
    let is_dangerous = |byte: u8| {
        byte.is_ascii_control()
            || matches!(
                byte,
                b'<' | b'|' | b'&' | b'`' | b'$' | b'{' | b'}' | b'\\'
            )
    };
    if input.bytes().any(is_dangerous) {
        return Err(ValidationError::DangerousCharacters);
    }

    Ok(())
}
/// Validates a batch of HGVS variant strings.
///
/// The batch size is checked first; the entries are then validated in order
/// with `validate_hgvs`, stopping at the first one that fails.
///
/// # Errors
///
/// Returns [`ValidationError::BatchTooLarge`] when `variants` holds more than
/// `MAX_BATCH_SIZE` entries, otherwise the error of the first entry rejected
/// by `validate_hgvs`.
pub fn validate_hgvs_batch(variants: &[String]) -> Result<(), ValidationError> {
    let batch_len = variants.len();
    if batch_len > MAX_BATCH_SIZE {
        return Err(ValidationError::BatchTooLarge {
            max: MAX_BATCH_SIZE,
            actual: batch_len,
        });
    }

    // `try_for_each` short-circuits on the first `Err`, like `?` in a loop.
    variants
        .iter()
        .try_for_each(|variant| validate_hgvs(variant))
}
/// Checks that `timeout_seconds` lies within
/// `MIN_TIMEOUT_SECONDS..=MAX_TIMEOUT_SECONDS` (both ends inclusive).
///
/// # Errors
///
/// Returns [`ValidationError::InvalidTimeout`] carrying the bounds and the
/// offending value when it is outside that range.
pub fn validate_timeout(timeout_seconds: u32) -> Result<(), ValidationError> {
    match timeout_seconds {
        MIN_TIMEOUT_SECONDS..=MAX_TIMEOUT_SECONDS => Ok(()),
        out_of_range => Err(ValidationError::InvalidTimeout {
            min: MIN_TIMEOUT_SECONDS,
            max: MAX_TIMEOUT_SECONDS,
            actual: out_of_range,
        }),
    }
}
pub fn validate_optional_timeout(timeout_seconds: Option<u32>) -> Result<(), ValidationError> {
if let Some(timeout) = timeout_seconds {
validate_timeout(timeout)?;
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every string in `inputs` is accepted by `validate_hgvs`.
    fn assert_accepted(inputs: &[&str]) {
        for input in inputs {
            assert!(
                validate_hgvs(input).is_ok(),
                "Expected Ok for input: {input}"
            );
        }
    }

    /// Builds an owned batch from string literals.
    fn batch_of(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    // Common substitution, deletion, protein and intronic-offset forms.
    #[test]
    fn test_validate_hgvs_valid_inputs() {
        assert_accepted(&[
            "NM_000001.1:c.123A>G",
            "NC_000001.11:g.123456del",
            "NP_000001.1:p.Arg123Cys",
            "NM_000001.1:c.123+1G>A",
            "NM_000001.1:c.123-10_123-5del",
        ]);
    }

    #[test]
    fn test_validate_hgvs_empty_input() {
        assert_eq!(validate_hgvs(""), Err(ValidationError::Empty));
    }

    #[test]
    fn test_validate_hgvs_too_long() {
        let oversized = "A".repeat(MAX_HGVS_LENGTH + 1);
        assert!(matches!(
            validate_hgvs(&oversized),
            Err(ValidationError::TooLong { .. })
        ));
    }

    #[test]
    fn test_validate_hgvs_non_ascii() {
        assert_eq!(
            validate_hgvs("NM_000001.1:c.123A>🦀"),
            Err(ValidationError::NonAscii)
        );
    }

    // No accession, missing accession before the colon, unknown type letter.
    #[test]
    fn test_validate_hgvs_invalid_format() {
        for input in ["not-hgvs-format", ":c.123A>G", "NM_000001.1:x.123A>G"] {
            assert_eq!(validate_hgvs(input), Err(ValidationError::InvalidFormat));
        }
    }

    #[test]
    fn test_validate_hgvs_repeats() {
        assert_accepted(&[
            "NM_003820.4:c.495_500C[8]",
            "NC_000001.11:g.2560658_2560663C[8]",
            "NM_000001.1:c.100_105CTG[9]TTG[1]CTG[13]",
            "NM_000001.1:c.100_105CAG[10_15]",
        ]);
    }

    #[test]
    fn test_validate_hgvs_predicted_effects() {
        assert_accepted(&[
            "NP_000001.1:p.(Val600Glu)",
            "NP_000001.1:p.(=)",
            "NP_000001.1:p.(?)",
        ]);
    }

    #[test]
    fn test_validate_hgvs_uncertain_positions() {
        assert_accepted(&[
            "NM_000001.1:c.(100_200)del",
            "NC_000001.11:g.(?_100)_(200_?)del",
        ]);
    }

    #[test]
    fn test_validate_hgvs_allele_notation() {
        assert_accepted(&[
            "NM_000001.1:c.[123A>G;456C>T]",
            "NM_000001.1:c.[123A>G];[456C>T]",
        ]);
    }

    #[test]
    fn test_validate_hgvs_assembly_prefixed_accessions() {
        assert_accepted(&[
            "GRCh38(chr1):g.2560658_2560663C[8]",
            "GRCh37(chr1):g.12345A>G",
            "GRCh36(chr1):g.100del",
            "hg38(chr1):g.12345A>G",
            "hg19(chr1):g.12345A>G",
            "hg18(chr1):g.12345A>G",
        ]);
    }

    #[test]
    fn test_validate_hgvs_ensembl_accessions() {
        assert_accepted(&[
            "ENST00000123456.1:c.123A>G",
            "ENSG00000123456.1:g.100A>G",
            "ENSP00000123456.1:p.Val600Glu",
            "ENSE00000123456.1:g.100A>G",
            "ENSR00000123456.1:g.100A>G",
        ]);
    }

    #[test]
    fn test_validate_hgvs_lrg_accessions() {
        assert_accepted(&["LRG_1:g.12345A>G"]);
    }

    #[test]
    fn test_validate_hgvs_genbank_accessions() {
        assert_accepted(&["U12345.1:g.100A>G"]);
    }

    // Each input matches the HGVS shape but carries one deny-listed character.
    #[test]
    fn test_validate_hgvs_dangerous_characters() {
        let dangerous_inputs = [
            "NM_000001.1:c.123A<G",
            "NM_000001.1:c.123del|rm -rf /",
            "NM_000001.1:c.123del$(whoami)",
            "NM_000001.1:c.123del{backdoor}",
            "NM_000001.1:c.123del\\evil",
            "NM_000001.1:c.123del`whoami`",
            "NM_000001.1:c.123del&rm",
        ];
        for input in dangerous_inputs {
            assert!(
                matches!(
                    validate_hgvs(input),
                    Err(ValidationError::DangerousCharacters)
                ),
                "Expected DangerousCharacters for input: {input}"
            );
        }
    }

    #[test]
    fn test_validate_hgvs_batch() {
        let valid_batch = batch_of(&["NM_000001.1:c.123A>G", "NM_000002.2:c.456C>T"]);
        assert!(validate_hgvs_batch(&valid_batch).is_ok());

        let invalid_batch = batch_of(&["NM_000001.1:c.123A>G", "invalid-variant"]);
        assert!(validate_hgvs_batch(&invalid_batch).is_err());
    }

    #[test]
    fn test_validate_hgvs_batch_too_large() {
        let large_batch: Vec<String> = std::iter::repeat("NM_000001.1:c.123A>G".to_string())
            .take(MAX_BATCH_SIZE + 1)
            .collect();
        assert!(matches!(
            validate_hgvs_batch(&large_batch),
            Err(ValidationError::BatchTooLarge { .. })
        ));
    }

    // Both inclusive bounds are accepted; one step outside either is rejected.
    #[test]
    fn test_validate_timeout() {
        for accepted in [30, MIN_TIMEOUT_SECONDS, MAX_TIMEOUT_SECONDS] {
            assert!(validate_timeout(accepted).is_ok());
        }
        for rejected in [0, MAX_TIMEOUT_SECONDS + 1] {
            assert!(matches!(
                validate_timeout(rejected),
                Err(ValidationError::InvalidTimeout { .. })
            ));
        }
    }

    #[test]
    fn test_validate_optional_timeout() {
        for accepted in [None, Some(30)] {
            assert!(validate_optional_timeout(accepted).is_ok());
        }
        assert!(validate_optional_timeout(Some(0)).is_err());
    }
}