use std::collections::HashMap;
use std::time::Instant;
use indexmap::IndexMap;
use postgres::{Client, NoTls};
use biocommons_bioutils::assemblies::{Assembly, ASSEMBLY_INFOS};
use hgvs::data::error::Error as HgvsDataError;
use hgvs::data::interface::{
GeneInfoRecord, Provider, TxExonsRecord, TxForRegionRecord, TxIdentityInfo, TxInfoRecord,
TxMappingOptionsRecord, TxSimilarityRecord,
};
use seqrepo::{AliasOrSeqId, Interface as SeqRepoInterface, Query as SeqRepoQuery, SeqRepo};
/// Configuration for [`InMemoryProvider`]: where to find the UTA PostgreSQL
/// database and, optionally, a local SeqRepo directory.
#[derive(Debug, Clone)]
pub struct InMemoryProviderConfig {
    /// PostgreSQL connection URL for the UTA database.
    pub uta_db_url: String,
    /// Schema name inside the UTA database (e.g. `uta_20210129b`).
    pub uta_db_schema: String,
    /// Optional path to a SeqRepo instance directory; when set, sequences not
    /// present in UTA are additionally loaded from SeqRepo.
    pub seqrepo_path: Option<String>,
}
impl Default for InMemoryProviderConfig {
fn default() -> Self {
Self {
uta_db_url: "postgresql://anonymous:anonymous@localhost:5432/uta".to_string(),
uta_db_schema: "uta_20210129b".to_string(),
seqrepo_path: None,
}
}
}
/// Aggregated genomic span of one transcript alignment, used to answer
/// region-overlap queries (see `get_tx_for_region`).
#[derive(Debug, Clone)]
pub struct TxRegionRecord {
    /// Transcript accession.
    pub tx_ac: String,
    /// Reference (genomic) accession the transcript is aligned to.
    pub alt_ac: String,
    /// Strand of the alignment on the reference.
    pub alt_strand: i16,
    /// Alignment method (e.g. "splign", "transcript").
    pub alt_aln_method: String,
    /// Minimum exon start over the exon set (0-based).
    pub start_i: i32,
    /// Maximum exon end over the exon set.
    pub end_i: i32,
}
/// An hgvs `Provider` implementation that preloads all UTA tables into RAM so
/// that every lookup afterwards is a pure in-memory map access.
pub struct InMemoryProvider {
    /// UTA schema name, reported as the data version.
    pub data_version: String,
    /// Value of the `schema_version` key from `{schema}.meta`.
    pub schema_version: String,
    /// Per-assembly refseq accession -> sequence name maps.
    pub assembly_maps: HashMap<Assembly, IndexMap<String, String>>,
    /// HGNC symbol -> gene record.
    pub gene_info: HashMap<String, GeneInfoRecord>,
    /// Transcript accession -> protein accession.
    pub tx_to_pro: HashMap<String, String>,
    /// Accession -> full sequence text.
    pub sequences: HashMap<String, String>,
    /// seq_id (digest) -> all accessions annotated with it.
    pub seq_id_to_acs: HashMap<String, Vec<String>>,
    /// tx_ac -> similarity records (keyed by `tx_ac1`).
    pub similar_transcripts: HashMap<String, Vec<TxSimilarityRecord>>,
    /// (tx_ac, alt_ac, alt_aln_method) -> exon alignment records.
    pub tx_exons: HashMap<(String, String, String), Vec<TxExonsRecord>>,
    /// HGNC symbol -> transcript info records (non-"transcript" alignments).
    pub tx_for_gene: HashMap<String, Vec<TxInfoRecord>>,
    /// (tx_ac, alt_ac, alt_aln_method) -> transcript info record.
    pub tx_info: HashMap<(String, String, String), TxInfoRecord>,
    /// tx_ac -> identity info (first row per transcript wins).
    pub tx_identity_info: HashMap<String, TxIdentityInfo>,
    /// tx_ac -> sorted mapping options.
    pub tx_mapping_options: HashMap<String, Vec<TxMappingOptionsRecord>>,
    /// (alt_ac, alt_aln_method) -> aggregated alignment spans.
    pub tx_region_index: HashMap<(String, String), Vec<TxRegionRecord>>,
}
/// HGNC symbols of genes whose transcripts are translated with the
/// selenocysteine translation table (see `load_tx_identity_info`).
const SELENOPROTEIN_SYMBOLS: [&str; 25] = [
    "DIO1", "DIO3", "GPX1", "GPX2", "GPX3", "GPX4", "GPX6", "SELENOF", "SELENOH", "SELENOI",
    "SELENOK", "SELENOM", "SELENON", "SELENOO", "SELENOP", "MSRB1", "SELENOS", "SELENOT",
    "SELENOV", "SELENOW", "DIO2", "SEPHS2", "TXNRD1", "TXNRD2", "TXNRD3",
];
/// (accession -> sequence, seq_id -> accessions) — return of `load_sequences`.
type SequenceData = (HashMap<String, String>, HashMap<String, Vec<String>>);
/// (exon alignments, mapping options) — return of `load_tx_exon_alignments`.
type TxExonData = (
    HashMap<(String, String, String), Vec<TxExonsRecord>>,
    HashMap<String, Vec<TxMappingOptionsRecord>>,
);
/// (per-alignment info, per-gene info) — return of `load_transcript_info`.
type TxInfoData = (
    HashMap<(String, String, String), TxInfoRecord>,
    HashMap<String, Vec<TxInfoRecord>>,
);
impl InMemoryProvider {
/// Builds a fully in-memory provider by bulk-loading every needed table from
/// a UTA PostgreSQL instance and, if configured, merging in sequences from a
/// local SeqRepo.
///
/// Progress and timing are reported on stderr. Returns an error string
/// describing the first connection or query failure.
pub fn new(config: &InMemoryProviderConfig) -> Result<Self, String> {
    let total_start = Instant::now();
    eprintln!("Connecting to UTA at {} ...", config.uta_db_url);
    let mut conn = Client::connect(&config.uta_db_url, NoTls)
        .map_err(|e| format!("Failed to connect to UTA: {e}"))?;
    let schema = &config.uta_db_schema;
    eprintln!("Loading schema version ...");
    let schema_version = {
        let sql = format!("SELECT value FROM {schema}.meta WHERE key = 'schema_version'");
        let row = conn
            .query_one(&sql, &[])
            .map_err(|e| format!("Failed to query schema_version: {e}"))?;
        row.get::<_, String>("value")
    };
    eprintln!(" schema_version = {schema_version}");
    eprintln!("Building assembly maps ...");
    let assembly_maps = Self::build_assembly_maps();
    let gene_info = Self::load_gene_info(&mut conn, schema)?;
    let tx_to_pro = Self::load_tx_to_pro(&mut conn, schema)?;
    // Bind `sequences` mutably here so SeqRepo results can be merged below
    // (previously it was shadow-rebound with `let mut` after the fact).
    let (mut sequences, seq_id_to_acs) = Self::load_sequences(&mut conn, schema)?;
    let similar_transcripts = Self::load_similar_transcripts(&mut conn, schema)?;
    let tx_identity_info = Self::load_tx_identity_info(&mut conn, schema)?;
    let (tx_exons, tx_mapping_options) = Self::load_tx_exon_alignments(&mut conn, schema)?;
    let (tx_info, tx_for_gene) = Self::load_transcript_info(&mut conn, schema)?;
    let tx_region_index = Self::load_tx_region_index(&mut conn, schema)?;
    // Optionally supplement UTA sequences (mostly transcripts/proteins) with
    // SeqRepo sequences (e.g. genomic contigs).
    if let Some(ref seqrepo_path) = config.seqrepo_path {
        Self::load_seqrepo_sequences(seqrepo_path, &mut sequences)?;
    }
    let elapsed = total_start.elapsed();
    eprintln!(
        "InMemoryProvider loaded in {:.1}s (gene_info={}, sequences={}, \
         tx_identity={}, tx_exons_keys={}, tx_info_keys={}, region_index_keys={})",
        elapsed.as_secs_f64(),
        gene_info.len(),
        sequences.len(),
        tx_identity_info.len(),
        tx_exons.len(),
        tx_info.len(),
        tx_region_index.len(),
    );
    Ok(Self {
        data_version: schema.clone(),
        schema_version,
        assembly_maps,
        gene_info,
        tx_to_pro,
        sequences,
        seq_id_to_acs,
        similar_transcripts,
        tx_exons,
        tx_for_gene,
        tx_info,
        tx_identity_info,
        tx_mapping_options,
        tx_region_index,
    })
}
/// Builds, for each supported assembly, an ordered map from refseq accession
/// to sequence name, taken from the bundled `ASSEMBLY_INFOS` table.
fn build_assembly_maps() -> HashMap<Assembly, IndexMap<String, String>> {
    [Assembly::Grch37, Assembly::Grch37p10, Assembly::Grch38]
        .into_iter()
        .map(|assembly| {
            let names: IndexMap<String, String> = ASSEMBLY_INFOS[assembly]
                .sequences
                .iter()
                .map(|record| (record.refseq_ac.clone(), record.name.clone()))
                .collect();
            (assembly, names)
        })
        .collect()
}
/// Loads all rows of `{schema}.gene` into a map keyed by HGNC symbol.
///
/// The `aliases` column is split on commas. NOTE(review): an empty aliases
/// string yields a single empty-string alias — confirm callers tolerate this.
fn load_gene_info(
    conn: &mut Client,
    schema: &str,
) -> Result<HashMap<String, GeneInfoRecord>, String> {
    let t = Instant::now();
    eprintln!("Loading gene info ...");
    let sql = format!("SELECT hgnc, maploc, descr, summary, aliases, added FROM {schema}.gene");
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query gene table: {e}"))?;
    let mut map = HashMap::with_capacity(rows.len());
    for row in &rows {
        let hgnc: String = row.get("hgnc");
        let aliases_str: String = row.get("aliases");
        // Comma-separated alias list -> owned strings.
        let aliases: Vec<String> = aliases_str.split(',').map(|s| s.to_owned()).collect();
        let record = GeneInfoRecord {
            hgnc: hgnc.clone(),
            maploc: row.get("maploc"),
            descr: row.get("descr"),
            summary: row.get("summary"),
            aliases,
            added: row.get("added"),
        };
        // Last row wins if the table ever held duplicate symbols.
        map.insert(hgnc, record);
    }
    eprintln!(
        " gene info: {} records in {:.1}s",
        map.len(),
        t.elapsed().as_secs_f64()
    );
    Ok(map)
}
/// Loads transcript -> protein accession mappings from
/// `{schema}.associated_accessions`.
///
/// Rows arrive ordered by `tx_ac, pro_ac DESC`, and only the first `pro_ac`
/// per `tx_ac` is kept — i.e. the lexically greatest protein accession.
fn load_tx_to_pro(conn: &mut Client, schema: &str) -> Result<HashMap<String, String>, String> {
    let started = Instant::now();
    eprintln!("Loading associated accessions ...");
    let sql = format!(
        "SELECT tx_ac, pro_ac FROM {schema}.associated_accessions \
         ORDER BY tx_ac, pro_ac DESC"
    );
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query associated_accessions: {e}"))?;
    let mut map: HashMap<String, String> = HashMap::new();
    for row in &rows {
        // First row per tx_ac wins; later (smaller) pro_acs are ignored.
        map.entry(row.get("tx_ac"))
            .or_insert_with(|| row.get("pro_ac"));
    }
    eprintln!(
        " associated accessions: {} mappings in {:.1}s",
        map.len(),
        started.elapsed().as_secs_f64()
    );
    Ok(map)
}
/// Loads sequence data in three phases:
/// 1. `{schema}.seq_anno`: accession <-> seq_id mappings (both directions),
/// 2. `{schema}.seq`: seq_id -> sequence text (NULL sequences are skipped),
/// 3. join the two into an accession -> sequence map.
///
/// Returns (accession -> sequence, seq_id -> accessions).
fn load_sequences(conn: &mut Client, schema: &str) -> Result<SequenceData, String> {
    let t = Instant::now();
    eprintln!("Loading seq_anno -> seq_id mapping ...");
    let sql = format!("SELECT ac, seq_id FROM {schema}.seq_anno");
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query seq_anno: {e}"))?;
    let mut ac_to_seq_id: HashMap<String, String> = HashMap::with_capacity(rows.len());
    // Reverse index: a seq_id may be shared by several accessions.
    let mut seq_id_to_acs: HashMap<String, Vec<String>> = HashMap::new();
    for row in &rows {
        let ac: String = row.get("ac");
        let seq_id: String = row.get("seq_id");
        seq_id_to_acs
            .entry(seq_id.clone())
            .or_default()
            .push(ac.clone());
        ac_to_seq_id.insert(ac, seq_id);
    }
    eprintln!(
        " seq_anno: {} accessions, {} unique seq_ids in {:.1}s",
        ac_to_seq_id.len(),
        seq_id_to_acs.len(),
        t.elapsed().as_secs_f64(),
    );
    let t2 = Instant::now();
    eprintln!("Loading sequences ...");
    let sql = format!("SELECT seq_id, seq FROM {schema}.seq");
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query seq: {e}"))?;
    let mut seq_id_to_seq: HashMap<String, String> = HashMap::with_capacity(rows.len());
    for row in &rows {
        let seq_id: String = row.get("seq_id");
        // `seq` is nullable; rows without sequence text are dropped here.
        let seq: Option<String> = row.get("seq");
        if let Some(seq) = seq {
            seq_id_to_seq.insert(seq_id, seq);
        }
    }
    eprintln!(
        " seq: {} sequences in {:.1}s",
        seq_id_to_seq.len(),
        t2.elapsed().as_secs_f64(),
    );
    let t3 = Instant::now();
    eprintln!("Building accession -> sequence map ...");
    let mut sequences: HashMap<String, String> = HashMap::with_capacity(ac_to_seq_id.len());
    let mut missing = 0usize;
    // Join phase: accessions whose seq_id had no (non-NULL) sequence are
    // counted and reported, not treated as errors.
    for (ac, seq_id) in &ac_to_seq_id {
        if let Some(seq) = seq_id_to_seq.get(seq_id) {
            sequences.insert(ac.clone(), seq.clone());
        } else {
            missing += 1;
        }
    }
    if missing > 0 {
        eprintln!(" WARNING: {missing} accessions had no matching sequence");
    }
    eprintln!(
        " accession -> sequence: {} entries in {:.1}s",
        sequences.len(),
        t3.elapsed().as_secs_f64(),
    );
    Ok((sequences, seq_id_to_acs))
}
/// Loads transcript similarity records from `{schema}.tx_similarity_v`,
/// grouped by the first transcript accession (`tx_ac1`).
///
/// Boolean similarity flags can be NULL in the view; `try_get` maps NULL (or
/// any decode failure) to `false`.
fn load_similar_transcripts(
    conn: &mut Client,
    schema: &str,
) -> Result<HashMap<String, Vec<TxSimilarityRecord>>, String> {
    let t = Instant::now();
    eprintln!("Loading similar transcripts (tx_similarity_v) ...");
    let sql = format!(
        "SELECT tx_ac1, tx_ac2, hgnc_eq, cds_eq, es_fp_eq, \
         cds_es_fp_eq, cds_exon_lengths_fp_eq \
         FROM {schema}.tx_similarity_v \
         ORDER BY tx_ac1, tx_ac2"
    );
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query tx_similarity_v: {e}"))?;
    let mut map: HashMap<String, Vec<TxSimilarityRecord>> = HashMap::new();
    for row in &rows {
        let record = TxSimilarityRecord {
            tx_ac1: row.get("tx_ac1"),
            tx_ac2: row.get("tx_ac2"),
            hgnc_eq: row.try_get("hgnc_eq").unwrap_or(false),
            cds_eq: row.try_get("cds_eq").unwrap_or(false),
            es_fp_eq: row.try_get("es_fp_eq").unwrap_or(false),
            cds_es_fp_eq: row.try_get("cds_es_fp_eq").unwrap_or(false),
            cds_exon_lengths_fp_eq: row.try_get("cds_exon_lengths_fp_eq").unwrap_or(false),
        };
        // Reuse the accession already decoded into the record instead of
        // fetching "tx_ac1" from the row a second time.
        let key = record.tx_ac1.clone();
        map.entry(key).or_default().push(record);
    }
    eprintln!(
        " similar transcripts: {} keys, {} total records in {:.1}s",
        map.len(),
        map.values().map(|v| v.len()).sum::<usize>(),
        t.elapsed().as_secs_f64(),
    );
    Ok(map)
}
fn load_tx_identity_info(
conn: &mut Client,
schema: &str,
) -> Result<HashMap<String, TxIdentityInfo>, String> {
let t = Instant::now();
eprintln!("Loading tx identity info (tx_def_summary_v) — this may be slow ...");
let sql = format!(
"SELECT DISTINCT tx_ac, alt_ac, alt_aln_method, cds_start_i, \
cds_end_i, lengths, hgnc \
FROM {schema}.tx_def_summary_v \
ORDER BY tx_ac, alt_ac, alt_aln_method, cds_start_i, cds_end_i, lengths, hgnc"
);
let rows = conn
.query(&sql, &[])
.map_err(|e| format!("Failed to query tx_def_summary_v: {e}"))?;
let mut map: HashMap<String, TxIdentityInfo> = HashMap::new();
for row in &rows {
let tx_ac: String = row.get("tx_ac");
if map.contains_key(&tx_ac) {
continue;
}
let hgnc: &str = row.get("hgnc");
let is_selenoprotein = SELENOPROTEIN_SYMBOLS.contains(&hgnc);
let record = TxIdentityInfo {
tx_ac: tx_ac.clone(),
alt_ac: row.get("alt_ac"),
alt_aln_method: row.get("alt_aln_method"),
cds_start_i: row.get("cds_start_i"),
cds_end_i: row.get("cds_end_i"),
lengths: row.get("lengths"),
hgnc: hgnc.to_string(),
translation_table: if is_selenoprotein {
hgvs::sequences::TranslationTable::Selenocysteine
} else {
Default::default()
},
};
map.insert(tx_ac, record);
}
eprintln!(
" tx identity info: {} records in {:.1}s",
map.len(),
t.elapsed().as_secs_f64(),
);
Ok(map)
}
/// Loads per-exon alignment rows from `{schema}.tx_exon_aln_v` and derives:
/// - `(tx_ac, alt_ac, alt_aln_method)` -> exon records (in alt_start_i order),
/// - `tx_ac` -> sorted, de-duplicated mapping-option triples, restricted to
///   alignments having at least one non-NULL `exon_aln_id`.
fn load_tx_exon_alignments(conn: &mut Client, schema: &str) -> Result<TxExonData, String> {
    let t = Instant::now();
    eprintln!("Loading tx exon alignments (tx_exon_aln_v) — this may be slow ...");
    let sql = format!(
        "SELECT hgnc, tx_ac, alt_ac, alt_aln_method, alt_strand, ord, \
         tx_start_i, tx_end_i, alt_start_i, alt_end_i, cigar, \
         tx_aseq, alt_aseq, tx_exon_set_id, alt_exon_set_id, \
         tx_exon_id, alt_exon_id, exon_aln_id \
         FROM {schema}.tx_exon_aln_v \
         ORDER BY tx_ac, alt_ac, alt_aln_method, alt_start_i"
    );
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query tx_exon_aln_v: {e}"))?;
    let mut exons_map: HashMap<(String, String, String), Vec<TxExonsRecord>> = HashMap::new();
    let mut mapping_triples: HashMap<String, Vec<(String, String, String)>> = HashMap::new();
    // Set of alignment triples already recorded as mapping options.
    let mut mapping_seen: std::collections::HashSet<(String, String, String)> =
        std::collections::HashSet::new();
    for row in &rows {
        let tx_ac: String = row.get("tx_ac");
        let alt_ac: String = row.get("alt_ac");
        let alt_aln_method: String = row.get("alt_aln_method");
        // Nullable in the view; NULL becomes 0 in the record below.
        let exon_aln_id: Option<i32> = row.get("exon_aln_id");
        let record = TxExonsRecord {
            hgnc: row.get("hgnc"),
            tx_ac: tx_ac.clone(),
            alt_ac: alt_ac.clone(),
            alt_aln_method: alt_aln_method.clone(),
            alt_strand: row.get("alt_strand"),
            ord: row.get("ord"),
            tx_start_i: row.get("tx_start_i"),
            tx_end_i: row.get("tx_end_i"),
            alt_start_i: row.get("alt_start_i"),
            alt_end_i: row.get("alt_end_i"),
            cigar: row.get("cigar"),
            tx_aseq: row.get("tx_aseq"),
            alt_aseq: row.get("alt_aseq"),
            tx_exon_set_id: row.get("tx_exon_set_id"),
            alt_exon_set_id: row.get("alt_exon_set_id"),
            tx_exon_id: row.get("tx_exon_id"),
            alt_exon_id: row.get("alt_exon_id"),
            exon_aln_id: exon_aln_id.unwrap_or(0),
        };
        let key = (tx_ac.clone(), alt_ac.clone(), alt_aln_method.clone());
        exons_map.entry(key.clone()).or_default().push(record);
        // Record each distinct alignment triple once; reuse the accession
        // already fetched above rather than decoding "tx_ac" from the row
        // a second time.
        if exon_aln_id.is_some() && mapping_seen.insert(key) {
            mapping_triples
                .entry(tx_ac.clone())
                .or_default()
                .push((tx_ac, alt_ac, alt_aln_method));
        }
    }
    let mut tx_mapping_options: HashMap<String, Vec<TxMappingOptionsRecord>> = HashMap::new();
    for (tx_ac, triples) in mapping_triples {
        let mut records: Vec<TxMappingOptionsRecord> = triples
            .into_iter()
            .map(|(tx, alt, method)| TxMappingOptionsRecord {
                tx_ac: tx,
                alt_ac: alt,
                alt_aln_method: method,
            })
            .collect();
        // Deterministic per-transcript ordering of the options.
        records.sort_by(|a, b| {
            a.tx_ac
                .cmp(&b.tx_ac)
                .then(a.alt_ac.cmp(&b.alt_ac))
                .then(a.alt_aln_method.cmp(&b.alt_aln_method))
        });
        tx_mapping_options.insert(tx_ac, records);
    }
    eprintln!(
        " tx exon alignments: {} alignment keys, {} mapping-option keys in {:.1}s",
        exons_map.len(),
        tx_mapping_options.len(),
        t.elapsed().as_secs_f64(),
    );
    Ok((exons_map, tx_mapping_options))
}
/// Loads `{schema}.transcript` joined with `{schema}.exon_set`, producing:
/// - `(tx_ac, alt_ac, alt_aln_method)` -> transcript info (last row wins), and
/// - HGNC symbol -> transcript infos, excluding the self-alignment rows whose
///   alt_aln_method is "transcript".
fn load_transcript_info(conn: &mut Client, schema: &str) -> Result<TxInfoData, String> {
    let t = Instant::now();
    eprintln!("Loading transcript info (transcript + exon_set) ...");
    let sql = format!(
        "SELECT hgnc, cds_start_i, cds_end_i, tx_ac, alt_ac, alt_aln_method \
         FROM {schema}.transcript T \
         JOIN {schema}.exon_set ES ON T.ac = ES.tx_ac \
         ORDER BY hgnc, cds_start_i, cds_end_i, tx_ac, alt_ac, alt_aln_method"
    );
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query transcript+exon_set: {e}"))?;
    let mut info_map: HashMap<(String, String, String), TxInfoRecord> = HashMap::new();
    let mut gene_map: HashMap<String, Vec<TxInfoRecord>> = HashMap::new();
    for row in &rows {
        let hgnc: String = row.get("hgnc");
        let tx_ac: String = row.get("tx_ac");
        let alt_ac: String = row.get("alt_ac");
        let alt_aln_method: String = row.get("alt_aln_method");
        let record = TxInfoRecord {
            hgnc: hgnc.clone(),
            cds_start_i: row.get("cds_start_i"),
            cds_end_i: row.get("cds_end_i"),
            tx_ac: tx_ac.clone(),
            alt_ac: alt_ac.clone(),
            alt_aln_method: alt_aln_method.clone(),
        };
        let key = (tx_ac, alt_ac, alt_aln_method.clone());
        info_map.insert(key, record.clone());
        // Self-alignments ("transcript" method) are excluded from per-gene
        // listings.
        if alt_aln_method != "transcript" {
            gene_map.entry(hgnc).or_default().push(record);
        }
    }
    eprintln!(
        " transcript info: {} info keys, {} gene keys in {:.1}s",
        info_map.len(),
        gene_map.len(),
        t.elapsed().as_secs_f64(),
    );
    Ok((info_map, gene_map))
}
/// Builds the region-overlap index: for each `(alt_ac, alt_aln_method)` pair,
/// the aggregated [min exon start, max exon end] span of every aligned
/// transcript, taken from `{schema}.exon_set` joined with `{schema}.exon`.
fn load_tx_region_index(
    conn: &mut Client,
    schema: &str,
) -> Result<HashMap<(String, String), Vec<TxRegionRecord>>, String> {
    let started = Instant::now();
    eprintln!("Loading tx region index (exon_set + exon) ...");
    let sql = format!(
        "SELECT tx_ac, alt_ac, alt_strand, alt_aln_method, \
         MIN(start_i) AS start_i, MAX(end_i) AS end_i \
         FROM {schema}.exon_set es \
         JOIN {schema}.exon e ON es.exon_set_id = e.exon_set_id \
         GROUP BY tx_ac, alt_ac, alt_strand, alt_aln_method \
         ORDER BY tx_ac, alt_ac, alt_strand, alt_aln_method"
    );
    let rows = conn
        .query(&sql, &[])
        .map_err(|e| format!("Failed to query exon_set+exon for region index: {e}"))?;
    let mut index: HashMap<(String, String), Vec<TxRegionRecord>> = HashMap::new();
    for row in &rows {
        let record = TxRegionRecord {
            tx_ac: row.get("tx_ac"),
            alt_ac: row.get("alt_ac"),
            alt_strand: row.get("alt_strand"),
            alt_aln_method: row.get("alt_aln_method"),
            start_i: row.get("start_i"),
            end_i: row.get("end_i"),
        };
        // Group records under the (reference, method) pair they align to.
        let key = (record.alt_ac.clone(), record.alt_aln_method.clone());
        index.entry(key).or_default().push(record);
    }
    eprintln!(
        " tx region index: {} keys, {} total records in {:.1}s",
        index.len(),
        index.values().map(|v| v.len()).sum::<usize>(),
        started.elapsed().as_secs_f64(),
    );
    Ok(index)
}
/// Merges sequences from a local SeqRepo instance into `sequences`.
///
/// The path's final component is taken as the SeqRepo instance name and its
/// parent as the root directory. Aliases already present in `sequences`
/// (i.e. loaded from UTA) are never overwritten; a sequence is fetched only
/// if at least one of its aliases is still missing. Fetch failures are
/// reported as warnings, not errors.
fn load_seqrepo_sequences(
    seqrepo_path: &str,
    sequences: &mut HashMap<String, String>,
) -> Result<(), String> {
    let t = Instant::now();
    eprintln!("Loading sequences from SeqRepo at {seqrepo_path} ...");
    // Split "<root>/<instance>" into the two arguments SeqRepo::new expects.
    let path = std::path::Path::new(seqrepo_path);
    let instance = path
        .file_name()
        .ok_or_else(|| format!("Could not extract instance name from path: {seqrepo_path}"))?
        .to_str()
        .ok_or_else(|| format!("Non-UTF8 path component in: {seqrepo_path}"))?;
    let parent = path
        .parent()
        .ok_or_else(|| format!("Could not extract parent from path: {seqrepo_path}"))?;
    let repo = SeqRepo::new(parent, instance)
        .map_err(|e| format!("Failed to open SeqRepo at {seqrepo_path}: {e}"))?;
    // Default query — NOTE(review): presumed to enumerate all alias records;
    // confirm against the seqrepo crate's Query semantics.
    let query = SeqRepoQuery::default();
    let mut seqid_to_aliases: HashMap<String, Vec<String>> = HashMap::new();
    // The alias DB is iterated via callback; records that fail to decode are
    // silently skipped.
    repo.alias_db()
        .find(&query, |record_result| {
            if let Ok(record) = record_result {
                seqid_to_aliases
                    .entry(record.seqid.clone())
                    .or_default()
                    .push(record.alias);
            }
        })
        .map_err(|e| format!("Failed to query SeqRepo aliases: {e}"))?;
    eprintln!(
        " SeqRepo aliases: {} unique seq_ids, loading sequences ...",
        seqid_to_aliases.len(),
    );
    let mut loaded = 0usize;
    let mut skipped = 0usize;
    for (seqid, aliases) in &seqid_to_aliases {
        // Skip the (possibly expensive) fetch when every alias is covered.
        if aliases.iter().all(|a| sequences.contains_key(a)) {
            skipped += aliases.len();
            continue;
        }
        // None/None bounds fetch the full sequence.
        let seq =
            match repo.fetch_sequence_part(&AliasOrSeqId::SeqId(seqid.clone()), None, None) {
                Ok(s) => s,
                Err(e) => {
                    eprintln!(" WARNING: failed to fetch sequence for seq_id={seqid}: {e}");
                    continue;
                }
            };
        for alias in aliases {
            if !sequences.contains_key(alias) {
                sequences.insert(alias.clone(), seq.clone());
                loaded += 1;
            }
        }
    }
    eprintln!(
        " SeqRepo: loaded {} new alias->sequence mappings, skipped {} already present in {:.1}s",
        loaded,
        skipped,
        t.elapsed().as_secs_f64(),
    );
    Ok(())
}
}
impl Provider for InMemoryProvider {
/// Reports the UTA schema name as the data version.
fn data_version(&self) -> &str {
    &self.data_version
}
/// Reports the `schema_version` value read from `{schema}.meta` at load time.
fn schema_version(&self) -> &str {
    &self.schema_version
}
fn get_assembly_map(&self, assembly: Assembly) -> IndexMap<String, String> {
self.assembly_maps
.get(&assembly)
.cloned()
.unwrap_or_default()
}
/// Looks up a gene by HGNC symbol; absence is reported as `NoGeneFound`.
fn get_gene_info(&self, hgnc: &str) -> Result<GeneInfoRecord, HgvsDataError> {
    match self.gene_info.get(hgnc) {
        Some(record) => Ok(record.clone()),
        None => Err(HgvsDataError::NoGeneFound(hgnc.to_string())),
    }
}
/// Returns the protein accession for a transcript, or `Ok(None)` when the
/// transcript has no associated protein.
fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result<Option<String>, HgvsDataError> {
    Ok(self.tx_to_pro.get(tx_ac).cloned())
}
/// Returns the `[begin, end)` slice of the sequence for accession `ac`;
/// missing bounds default to the whole sequence.
///
/// NOTE(review): `seq[start..stop]` panics if `start > stop` or
/// `stop > seq.len()`; callers are presumed to pass in-range byte indices —
/// confirm, or switch to `seq.get(start..stop)` with an appropriate error.
fn get_seq_part(
    &self,
    ac: &str,
    begin: Option<usize>,
    end: Option<usize>,
) -> Result<String, HgvsDataError> {
    let seq = self
        .sequences
        .get(ac)
        .ok_or_else(|| HgvsDataError::NoSequenceRecord(ac.to_string()))?;
    let start = begin.unwrap_or(0);
    let stop = end.unwrap_or(seq.len());
    Ok(seq[start..stop].to_string())
}
/// Returns all accessions whose stored seq_id equals the MD5 of `seq`,
/// plus a synthetic `MD5_<digest>` accession appended last.
fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, HgvsDataError> {
    // NOTE(review): assumes seq_anno's seq_id column holds the same MD5
    // digest that `seq_md5` computes — confirm for the UTA build in use.
    let md5 = hgvs::sequences::seq_md5(seq, true)?;
    let mut acs = self.seq_id_to_acs.get(&md5).cloned().unwrap_or_default();
    acs.push(format!("MD5_{md5}"));
    Ok(acs)
}
/// Returns similarity records for `tx_ac`; no data means an empty list,
/// never an error.
fn get_similar_transcripts(
    &self,
    tx_ac: &str,
) -> Result<Vec<TxSimilarityRecord>, HgvsDataError> {
    let records = match self.similar_transcripts.get(tx_ac) {
        Some(records) => records.clone(),
        None => Vec::new(),
    };
    Ok(records)
}
/// Returns the exon alignment records for the given transcript/reference/
/// method triple; a missing alignment is an error (`NoTxExons`), unlike the
/// list-valued getters.
fn get_tx_exons(
    &self,
    tx_ac: &str,
    alt_ac: &str,
    alt_aln_method: &str,
) -> Result<Vec<TxExonsRecord>, HgvsDataError> {
    let key = (
        tx_ac.to_owned(),
        alt_ac.to_owned(),
        alt_aln_method.to_owned(),
    );
    match self.tx_exons.get(&key) {
        Some(records) => Ok(records.clone()),
        None => Err(HgvsDataError::NoTxExons(
            tx_ac.to_owned(),
            alt_ac.to_owned(),
            alt_aln_method.to_owned(),
        )),
    }
}
/// Returns transcript info records for a gene (excluding "transcript"
/// self-alignments, per the loader); unknown genes yield an empty list.
fn get_tx_for_gene(&self, gene: &str) -> Result<Vec<TxInfoRecord>, HgvsDataError> {
    Ok(self.tx_for_gene.get(gene).cloned().unwrap_or_default())
}
/// Returns transcripts whose aggregated alignment span overlaps
/// `[start_i, end_i)` on the given reference and alignment method.
///
/// NOTE(review): the test `start_i <= r.end_i` also matches regions that
/// merely touch at `start_i`; for half-open intervals a strict `<` would be
/// expected — confirm the intended boundary semantics.
fn get_tx_for_region(
    &self,
    alt_ac: &str,
    alt_aln_method: &str,
    start_i: i32,
    end_i: i32,
) -> Result<Vec<TxForRegionRecord>, HgvsDataError> {
    let key = (alt_ac.to_owned(), alt_aln_method.to_owned());
    let result = self
        .tx_region_index
        .get(&key)
        .map(|entries| {
            entries
                .iter()
                .filter(|r| r.start_i < end_i && start_i <= r.end_i)
                .map(|r| TxForRegionRecord {
                    tx_ac: r.tx_ac.clone(),
                    alt_ac: r.alt_ac.clone(),
                    alt_strand: r.alt_strand,
                    alt_aln_method: r.alt_aln_method.clone(),
                    start_i: r.start_i,
                    end_i: r.end_i,
                })
                .collect()
        })
        .unwrap_or_else(Vec::new);
    Ok(result)
}
/// Returns the identity record for a transcript; absence is reported as
/// `NoTranscriptFound`.
fn get_tx_identity_info(&self, tx_ac: &str) -> Result<TxIdentityInfo, HgvsDataError> {
    match self.tx_identity_info.get(tx_ac) {
        Some(info) => Ok(info.clone()),
        None => Err(HgvsDataError::NoTranscriptFound(tx_ac.to_string())),
    }
}
/// Returns the transcript info record for the given transcript/reference/
/// method triple; a missing entry is reported as `NoAlignmentFound`.
fn get_tx_info(
    &self,
    tx_ac: &str,
    alt_ac: &str,
    alt_aln_method: &str,
) -> Result<TxInfoRecord, HgvsDataError> {
    let key = (
        tx_ac.to_string(),
        alt_ac.to_string(),
        alt_aln_method.to_string(),
    );
    match self.tx_info.get(&key) {
        Some(record) => Ok(record.clone()),
        None => Err(HgvsDataError::NoAlignmentFound(
            tx_ac.to_string(),
            alt_ac.to_string(),
        )),
    }
}
/// Returns the (pre-sorted) mapping options for a transcript; unknown
/// transcripts yield an empty list.
fn get_tx_mapping_options(
    &self,
    tx_ac: &str,
) -> Result<Vec<TxMappingOptionsRecord>, HgvsDataError> {
    let options = self.tx_mapping_options.get(tx_ac);
    Ok(options.cloned().unwrap_or_default())
}
}
/// Integration tests. All are `#[ignore]`d because they require a live UTA
/// PostgreSQL instance (and, for most, a local SeqRepo checkout).
#[cfg(test)]
mod tests {
    use super::*;
    /// Smoke test: load everything from UTA and check each table is non-empty.
    #[test]
    #[ignore]
    fn test_load_from_uta() {
        let config = InMemoryProviderConfig::default();
        let provider = InMemoryProvider::new(&config)
            .expect("Failed to create InMemoryProvider — is UTA running?");
        assert!(
            !provider.schema_version.is_empty(),
            "schema_version should not be empty"
        );
        assert!(
            !provider.gene_info.is_empty(),
            "gene_info should not be empty"
        );
        assert!(
            !provider.sequences.is_empty(),
            "sequences should not be empty"
        );
        assert!(
            !provider.tx_identity_info.is_empty(),
            "tx_identity_info should not be empty"
        );
        assert!(
            !provider.tx_exons.is_empty(),
            "tx_exons should not be empty"
        );
        assert!(
            !provider.tx_mapping_options.is_empty(),
            "tx_mapping_options should not be empty"
        );
        assert!(!provider.tx_info.is_empty(), "tx_info should not be empty");
        assert!(
            !provider.tx_for_gene.is_empty(),
            "tx_for_gene should not be empty"
        );
        assert!(
            !provider.tx_region_index.is_empty(),
            "tx_region_index should not be empty"
        );
        assert_eq!(
            provider.assembly_maps.len(),
            3,
            "should have 3 assembly maps"
        );
        eprintln!("All InMemoryProvider sanity checks passed.");
    }
    /// Helper: provider with SeqRepo enabled at the benchmark data path.
    fn make_provider_with_seqrepo() -> InMemoryProvider {
        let config = InMemoryProviderConfig {
            seqrepo_path: Some("manuscript/benchmark/output/data/seqrepo/2021-01-29".to_string()),
            ..Default::default()
        };
        InMemoryProvider::new(&config)
            .expect("Failed to create InMemoryProvider — is UTA + SeqRepo available?")
    }
    /// Genomic contigs come from SeqRepo; transcripts come from UTA.
    #[test]
    #[ignore]
    fn test_seqrepo_loads_genomic_sequences() {
        let provider = make_provider_with_seqrepo();
        assert!(
            provider.sequences.contains_key("NM_000051.3"),
            "NM_000051.3 (transcript) should be loaded from UTA"
        );
        assert!(
            provider.sequences.contains_key("NC_000011.10"),
            "NC_000011.10 (genomic) should be loaded from SeqRepo"
        );
    }
    /// Slicing semantics of get_seq_part against the full sequence.
    #[test]
    #[ignore]
    fn test_get_seq_part() {
        let provider = make_provider_with_seqrepo();
        let full = provider.get_seq("NM_000051.3").expect("get_seq failed");
        assert!(
            full.len() > 1000,
            "NM_000051.3 sequence should be > 1000 bp"
        );
        let part = provider
            .get_seq_part("NM_000051.3", Some(100), Some(200))
            .expect("get_seq_part failed");
        assert_eq!(part.len(), 100, "slice [100..200] should be 100 bp");
        assert_eq!(&full[100..200], part, "slice content should match");
    }
    /// A known transcript must expose at least one mapping option.
    #[test]
    #[ignore]
    fn test_get_tx_mapping_options() {
        let provider = make_provider_with_seqrepo();
        let options = provider
            .get_tx_mapping_options("NM_000051.3")
            .expect("get_tx_mapping_options failed");
        assert!(
            !options.is_empty(),
            "NM_000051.3 should have mapping options"
        );
    }
    /// Identity info resolves the expected gene symbol.
    #[test]
    #[ignore]
    fn test_get_tx_identity_info() {
        let provider = make_provider_with_seqrepo();
        let info = provider
            .get_tx_identity_info("NM_000051.3")
            .expect("get_tx_identity_info failed");
        assert_eq!(info.hgnc, "ATM", "NM_000051.3 should map to ATM gene");
    }
    /// A splign alignment for a known transcript yields exon records.
    #[test]
    #[ignore]
    fn test_get_tx_exons() {
        let provider = make_provider_with_seqrepo();
        let options = provider
            .get_tx_mapping_options("NM_000051.3")
            .expect("get_tx_mapping_options failed");
        let opt = options
            .iter()
            .find(|o| o.alt_aln_method == "splign")
            .expect("should have a splign alignment for NM_000051.3");
        let exons = provider
            .get_tx_exons("NM_000051.3", &opt.alt_ac, "splign")
            .expect("get_tx_exons failed");
        assert!(
            !exons.is_empty(),
            "NM_000051.3 should have exon records for {}",
            opt.alt_ac,
        );
    }
}