pub mod vcf;
use crate::error::FerroError;
use crate::reference::transcript::GenomeBuild;
/// One variant record associated with an rsID.
///
/// A single rsID may map to several of these records (see the `Vec<RsIdResult>`
/// returned by `RsIdLookup::lookup`).
#[derive(Debug, Clone)]
pub struct RsIdResult {
/// Identifier text, e.g. `"rs121913529"`. `InMemoryRsIdLookup::add` parses it
/// with `parse_rsid`, so it must be numeric with an optional `rs` prefix to be
/// stored there.
pub rsid: String,
/// Contig / chromosome name (the bundled test data uses names such as `chr7`).
pub contig: String,
/// Coordinate of the first base of `reference`; `to_hgvs` uses it directly as
/// the `g.` position. Presumably 1-based — confirm against the data source.
pub position: u64,
/// Reference allele sequence.
pub reference: String,
/// Alternate allele sequence. Length-changing variants appear to follow the
/// VCF convention of a shared leading anchor base (see the `GAG` -> `G` entry
/// in the test data) — confirm against the loader.
pub alternate: String,
/// Genome build associated with this record.
pub build: GenomeBuild,
/// Optional pre-computed HGVS expression (the test data stores values such as
/// `NC_000007.14:g.140753336A>T`); `new` leaves it as `None`.
pub hgvs: Option<String>,
/// Optional allele frequency; `new` leaves it as `None`.
pub allele_frequency: Option<f64>,
/// Optional clinical significance label (the test data uses `"pathogenic"`);
/// `new` leaves it as `None`.
pub clinical_significance: Option<String>,
}
impl RsIdResult {
    /// Creates a record from its mandatory fields.
    ///
    /// The optional annotations (`hgvs`, `allele_frequency`,
    /// `clinical_significance`) are initialised to `None`.
    pub fn new(
        rsid: String,
        contig: String,
        position: u64,
        reference: String,
        alternate: String,
        build: GenomeBuild,
    ) -> Self {
        Self {
            rsid,
            contig,
            position,
            reference,
            alternate,
            build,
            hgvs: None,
            allele_frequency: None,
            clinical_significance: None,
        }
    }

    /// Returns `true` when both alleles are exactly one byte long.
    pub fn is_snv(&self) -> bool {
        self.reference.len() == 1 && self.alternate.len() == 1
    }

    /// Returns `true` when the reference allele is longer than the alternate.
    ///
    /// This is a pure length comparison; it does not check that the alternate
    /// is a leading anchor of the reference.
    pub fn is_deletion(&self) -> bool {
        self.reference.len() > self.alternate.len()
    }

    /// Returns `true` when the alternate allele is longer than the reference.
    ///
    /// This is a pure length comparison; it does not check that the reference
    /// is a leading anchor of the alternate.
    pub fn is_insertion(&self) -> bool {
        self.reference.len() < self.alternate.len()
    }

    /// Renders the variant as a genomic HGVS description (`g.` prefix, no
    /// reference-sequence accession).
    ///
    /// * single-base substitution: `g.100A>G`
    /// * deletion where the alternate is a leading anchor of the reference:
    ///   `g.101del` or `g.101_102del`
    /// * insertion where the reference is a leading anchor of the alternate:
    ///   `g.100_101insG`
    /// * everything else (equal-length multi-base changes, or length changes
    ///   without a shared anchor): `g.100delinsT` / `g.100_101delinsTC`
    pub fn to_hgvs(&self) -> String {
        let pos = self.position;
        let ref_len = self.reference.len() as u64;
        let alt_len = self.alternate.len() as u64;

        if self.is_snv() {
            return format!("g.{}{}>{}", pos, self.reference, self.alternate);
        }

        // A plain deletion is only correct when the alternate is the anchor
        // that precedes the removed bases; otherwise bases were also changed
        // and the variant must be reported as a delins below.
        if self.is_deletion() && self.reference.starts_with(self.alternate.as_str()) {
            let start = pos + alt_len;
            let end = pos + ref_len - 1;
            return if start == end {
                format!("g.{}del", start)
            } else {
                format!("g.{}_{}del", start, end)
            };
        }

        // Same reasoning for insertions: the reference must be the anchor.
        // The prefix check also guarantees the slice below starts on a
        // character boundary.
        if self.is_insertion() && self.alternate.starts_with(self.reference.as_str()) {
            let inserted = &self.alternate[self.reference.len()..];
            let left = pos + ref_len - 1;
            return format!("g.{}_{}ins{}", left, left + 1, inserted);
        }

        // Deletion-insertion: the whole reference span is replaced by the
        // alternate. A range is required when more than one base is replaced.
        if ref_len > 1 {
            format!("g.{}_{}delins{}", pos, pos + ref_len - 1, self.alternate)
        } else {
            format!("g.{}delins{}", pos, self.alternate)
        }
    }
}
/// Descriptive information about an rsID data source, as returned by
/// `RsIdLookup::metadata`.
#[derive(Debug, Clone)]
pub struct RsIdMetadata {
/// Name of the data source (`InMemoryRsIdLookup` reports `"In-memory"`).
pub source: String,
/// Genome build associated with the source.
pub build: GenomeBuild,
/// Version label of the source (`InMemoryRsIdLookup` reports `"1.0"`).
pub version: String,
/// Number of variant records held by the source. `InMemoryRsIdLookup` keeps
/// this equal to the total number of stored `RsIdResult` records.
pub variant_count: u64,
}
/// Interface for back ends that resolve rsIDs to variant records.
pub trait RsIdLookup {
/// Returns the records stored for `rsid`.
///
/// The in-memory implementation in this file propagates the `parse_rsid`
/// error for malformed IDs and returns `FerroError::ReferenceNotFound` for
/// IDs it does not hold; other implementors presumably follow the same
/// contract — confirm per implementation.
fn lookup(&self, rsid: &str) -> Result<Vec<RsIdResult>, FerroError>;
/// Reports whether `rsid` is known to this source.
///
/// The in-memory implementation returns `false` for IDs that fail to parse.
fn contains(&self, rsid: &str) -> bool;
/// Returns the metadata describing this source.
fn metadata(&self) -> &RsIdMetadata;
}
/// Parses an rsID into its numeric part.
///
/// Accepts either the prefixed form (`"rs121913529"`) or the bare number
/// (`"121913529"`).
///
/// # Errors
///
/// Returns `FerroError::InvalidCoordinates` when the text after the optional
/// `rs` prefix is empty, contains anything other than ASCII digits, or does
/// not fit in a `u64`.
pub fn parse_rsid(rsid: &str) -> Result<u64, FerroError> {
    let invalid = || FerroError::InvalidCoordinates {
        msg: format!("Invalid rsID: {}", rsid),
    };

    let digits = rsid.strip_prefix("rs").unwrap_or(rsid);

    // `u64::from_str` tolerates a leading `+`, which is not a valid rsID, so
    // validate the characters explicitly before parsing.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return Err(invalid());
    }

    // Parsing can still fail on overflow.
    digits.parse().map_err(|_| invalid())
}
/// Renders a numeric rsID in its canonical textual form, i.e. the decimal
/// number preceded by `rs`.
pub fn format_rsid(rsid_num: u64) -> String {
    let digits = rsid_num.to_string();
    let mut text = String::with_capacity("rs".len() + digits.len());
    text.push_str("rs");
    text.push_str(&digits);
    text
}
/// `RsIdLookup` implementation backed by a `HashMap` held entirely in memory.
#[derive(Debug, Clone)]
pub struct InMemoryRsIdLookup {
// Records grouped by the numeric rsID produced by `parse_rsid`; one rsID may
// hold several records.
entries: std::collections::HashMap<u64, Vec<RsIdResult>>,
// Source description; `variant_count` is refreshed by `add`.
metadata: RsIdMetadata,
}
impl InMemoryRsIdLookup {
    /// Creates an empty lookup for `build`.
    ///
    /// The metadata starts with source `"In-memory"`, version `"1.0"` and a
    /// variant count of zero.
    pub fn new(build: GenomeBuild) -> Self {
        Self {
            entries: std::collections::HashMap::new(),
            metadata: RsIdMetadata {
                source: "In-memory".to_string(),
                build,
                version: "1.0".to_string(),
                variant_count: 0,
            },
        }
    }

    /// Stores `result` under the numeric value of its `rsid`.
    ///
    /// Records sharing an rsID accumulate in insertion order, and
    /// `metadata.variant_count` grows by one for every stored record.
    ///
    /// # Errors
    ///
    /// Propagates the `parse_rsid` error when `result.rsid` is not a valid
    /// rsID; nothing is stored in that case.
    pub fn add(&mut self, result: RsIdResult) -> Result<(), FerroError> {
        let rsid_num = parse_rsid(&result.rsid)?;
        self.entries.entry(rsid_num).or_default().push(result);
        // Exactly one record was appended, so bump the counter instead of
        // re-summing every bucket (which made bulk loading quadratic).
        self.metadata.variant_count += 1;
        Ok(())
    }

    /// Builds a GRCh38 lookup pre-populated with two example records:
    /// `rs121913529` (a single-base substitution on `chr7`) and `rs80357906`
    /// (a two-base deletion on `chr17`).
    pub fn with_test_data() -> Self {
        let mut lookup = Self::new(GenomeBuild::GRCh38);

        // Both rsIDs below are well-formed literals, so `add` cannot fail and
        // its result is deliberately discarded.
        let _ = lookup.add(RsIdResult {
            rsid: "rs121913529".to_string(),
            contig: "chr7".to_string(),
            position: 140753336,
            reference: "A".to_string(),
            alternate: "T".to_string(),
            build: GenomeBuild::GRCh38,
            hgvs: Some("NC_000007.14:g.140753336A>T".to_string()),
            allele_frequency: Some(0.0001),
            clinical_significance: Some("pathogenic".to_string()),
        });
        let _ = lookup.add(RsIdResult {
            rsid: "rs80357906".to_string(),
            contig: "chr17".to_string(),
            position: 43124027,
            reference: "GAG".to_string(),
            alternate: "G".to_string(),
            build: GenomeBuild::GRCh38,
            hgvs: Some("NC_000017.11:g.43124028_43124029del".to_string()),
            allele_frequency: Some(0.0001),
            clinical_significance: Some("pathogenic".to_string()),
        });

        lookup
    }
}
impl RsIdLookup for InMemoryRsIdLookup {
    /// Returns a copy of every record stored under `rsid`.
    ///
    /// Fails with the `parse_rsid` error when `rsid` is malformed, and with
    /// `FerroError::ReferenceNotFound` (carrying the ID as given) when no
    /// record is stored for it.
    fn lookup(&self, rsid: &str) -> Result<Vec<RsIdResult>, FerroError> {
        let key = parse_rsid(rsid)?;
        match self.entries.get(&key) {
            Some(records) => Ok(records.clone()),
            None => Err(FerroError::ReferenceNotFound {
                id: rsid.to_string(),
            }),
        }
    }

    /// Reports whether any record is stored under `rsid`; an ID that does not
    /// parse is reported as absent.
    fn contains(&self, rsid: &str) -> bool {
        match parse_rsid(rsid) {
            Ok(key) => self.entries.contains_key(&key),
            Err(_) => false,
        }
    }

    /// Borrows the metadata describing this lookup.
    fn metadata(&self) -> &RsIdMetadata {
        let Self { metadata, .. } = self;
        metadata
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a GRCh38 record for `rs1` on `chr1` at position 100 with the
    /// given alleles; every allele-shape test below uses these coordinates.
    fn variant_at_100(reference: &str, alternate: &str) -> RsIdResult {
        RsIdResult::new(
            "rs1".to_string(),
            "chr1".to_string(),
            100,
            reference.to_string(),
            alternate.to_string(),
            GenomeBuild::GRCh38,
        )
    }

    #[test]
    fn test_parse_rsid() {
        // Prefixed and bare forms parse to the same number.
        assert_eq!(parse_rsid("rs121913529").unwrap(), 121913529);
        assert_eq!(parse_rsid("121913529").unwrap(), 121913529);
        assert!(parse_rsid("rsABC").is_err());
    }

    #[test]
    fn test_format_rsid() {
        assert_eq!(format_rsid(121913529), "rs121913529");
    }

    #[test]
    fn test_rsid_result_is_snv() {
        let snv = variant_at_100("A", "G");
        assert!(snv.is_snv());
        assert!(!snv.is_deletion());
        assert!(!snv.is_insertion());
    }

    #[test]
    fn test_rsid_result_is_deletion() {
        let del = variant_at_100("AG", "A");
        assert!(!del.is_snv());
        assert!(del.is_deletion());
        assert!(!del.is_insertion());
    }

    #[test]
    fn test_rsid_result_is_insertion() {
        let ins = variant_at_100("A", "AG");
        assert!(!ins.is_snv());
        assert!(!ins.is_deletion());
        assert!(ins.is_insertion());
    }

    #[test]
    fn test_in_memory_lookup() {
        let lookup = InMemoryRsIdLookup::with_test_data();

        let snv_hits = lookup.lookup("rs121913529").unwrap();
        assert_eq!(snv_hits.len(), 1);
        assert_eq!(snv_hits[0].contig, "chr7");
        assert!(snv_hits[0].is_snv());

        let del_hits = lookup.lookup("rs80357906").unwrap();
        assert_eq!(del_hits.len(), 1);
        assert!(del_hits[0].is_deletion());

        // Unknown IDs are an error, not an empty list.
        assert!(lookup.lookup("rs999999999").is_err());
    }

    #[test]
    fn test_contains() {
        let lookup = InMemoryRsIdLookup::with_test_data();
        assert!(lookup.contains("rs121913529"));
        assert!(lookup.contains("121913529"));
        assert!(!lookup.contains("rs999999999"));
    }

    #[test]
    fn test_to_hgvs_snv() {
        let snv = variant_at_100("A", "G");
        assert_eq!(snv.to_hgvs(), "g.100A>G");
    }

    #[test]
    fn test_to_hgvs_insertion() {
        let ins = variant_at_100("A", "AG");
        assert_eq!(ins.to_hgvs(), "g.100_101insG");
    }

    #[test]
    fn test_metadata() {
        let lookup = InMemoryRsIdLookup::with_test_data();
        let meta = lookup.metadata();
        assert_eq!(meta.source, "In-memory");
        assert_eq!(meta.build, GenomeBuild::GRCh38);
        assert_eq!(meta.variant_count, 2);
    }
}