use std::{
collections::HashMap,
fs::{File, OpenOptions},
io::BufReader,
path::Path,
sync::{Arc, Mutex},
};
use crate::repo::SeqRepo;
use crate::{
error::Error,
interface::{AliasOrSeqId, Interface},
};
/// Sequence repository wrapper that records every fetched sequence part in a
/// FASTA cache file while also keeping the entries in memory.
///
/// Lookups that hit the in-memory map are answered without touching the
/// wrapped [`SeqRepo`]; misses are delegated to it and then appended to the
/// cache file.
pub struct CacheWritingSeqRepo {
/// FASTA writer over the cache file (opened in append mode by `new`),
/// guarded by a mutex so records can be written through `&self`.
writer: Arc<Mutex<noodles::fasta::Writer<File>>>,
/// The wrapped repository that is queried on a cache miss.
repo: SeqRepo,
/// In-memory view of the cache: key (as produced by `build_key`) to sequence.
cache: Arc<Mutex<HashMap<String, String>>>,
}
impl CacheWritingSeqRepo {
    /// Create a caching wrapper around `repo` that persists to `cache_path`.
    ///
    /// If a file already exists at `cache_path`, its records are loaded into
    /// the in-memory cache first. The file is then opened in append mode
    /// (and created when missing) so that new records are added after any
    /// existing ones.
    ///
    /// # Errors
    ///
    /// * Any error produced while reading an existing cache file.
    /// * `Error::SeqSepoCacheOpenWrite` when the file cannot be opened for
    ///   appending.
    pub fn new<P>(repo: SeqRepo, cache_path: P) -> Result<Self, Error>
    where
        P: AsRef<Path>,
    {
        let path = cache_path.as_ref();

        // Seed the in-memory map from an existing cache file, if there is one.
        let entries = if path.exists() {
            CacheReadingSeqRepo::read_cache(path)?
        } else {
            HashMap::new()
        };

        let mut options = OpenOptions::new();
        options.create(true).append(true);
        let file = options
            .open(path)
            .map_err(|e| Error::SeqSepoCacheOpenWrite(e.to_string()))?;
        let writer = noodles::fasta::Writer::new(file);

        Ok(Self {
            repo,
            writer: Arc::new(Mutex::new(writer)),
            cache: Arc::new(Mutex::new(entries)),
        })
    }
}
impl Interface for CacheWritingSeqRepo {
    /// Fetch a sequence part, answering from the cache when possible and
    /// recording newly fetched parts in the cache file.
    ///
    /// The cache key is derived from `alias_or_seq_id`, `begin` and `end` via
    /// `build_key`. On a miss the wrapped repository is queried, the result is
    /// appended to the FASTA cache file and then stored in memory.
    ///
    /// Each key is written to the file at most once, even when several
    /// threads miss on the same key simultaneously, and an entry only becomes
    /// visible in memory after it has been written successfully.
    ///
    /// # Errors
    ///
    /// * Any error returned by the wrapped repository.
    /// * `Error::SeqSepoCacheWrite` when the record cannot be written.
    ///
    /// # Panics
    ///
    /// Panics if one of the internal mutexes is poisoned.
    fn fetch_sequence_part(
        &self,
        alias_or_seq_id: &AliasOrSeqId,
        begin: Option<usize>,
        end: Option<usize>,
    ) -> Result<String, Error> {
        let key = build_key(alias_or_seq_id, begin, end);

        // Fast path: the guard is a temporary of this statement, so the cache
        // lock is released before the (potentially slow) repository access.
        let cached = self
            .cache
            .lock()
            .expect("could not acquire lock")
            .get(&key)
            .cloned();
        if let Some(value) = cached {
            return Ok(value);
        }

        let value = self.repo.fetch_sequence_part(alias_or_seq_id, begin, end)?;

        // Serialize writers. Lock order is always writer -> cache, and the
        // fast path above never holds the cache lock while taking the writer
        // lock, so this cannot deadlock.
        let mut writer = self.writer.lock().expect("could not acquire lock");

        // Another thread may have fetched and persisted the same key while we
        // were querying the repository; re-check so the record is not
        // appended to the file twice.
        let already_persisted = self
            .cache
            .lock()
            .expect("could not acquire lock")
            .contains_key(&key);

        if !already_persisted {
            writer
                .write_record(&noodles::fasta::Record::new(
                    noodles::fasta::record::Definition::new(key.clone(), None),
                    noodles::fasta::record::Sequence::from(value.as_bytes().to_vec()),
                ))
                .map_err(|e| Error::SeqSepoCacheWrite(e.to_string()))?;

            // Publish in memory only after the write succeeded, so a failed
            // write is retried on the next request instead of being masked by
            // a memory-only entry.
            self.cache
                .lock()
                .expect("could not acquire lock")
                .insert(key, value.clone());
        }

        Ok(value)
    }
}
/// Read-only sequence source that answers requests solely from a map loaded
/// from a FASTA cache file; it never consults an underlying repository.
pub struct CacheReadingSeqRepo {
/// Key (as produced by `build_key`) to sequence, loaded once at construction.
cache: HashMap<String, String>,
}
impl CacheReadingSeqRepo {
pub fn new<P>(path: P) -> Result<Self, Error>
where
P: AsRef<Path>,
{
Ok(Self {
cache: Self::read_cache(path.as_ref())?,
})
}
fn read_cache(path: &Path) -> Result<HashMap<String, String>, Error> {
let mut reader = File::open(path)
.map(BufReader::new)
.map(noodles::fasta::Reader::new)
.map_err(|e| Error::SeqSepoCacheOpenRead(e.to_string()))?;
let mut result = HashMap::new();
for record in reader.records() {
let record = record.map_err(|e| Error::SeqSepoCacheRead(e.to_string()))?;
result.insert(
std::str::from_utf8(record.name().as_ref())
.map_err(|e| Error::SeqSepoCacheOpenRead(e.to_string()))?
.to_string(),
std::str::from_utf8(record.sequence().as_ref())
.map_err(|e| Error::SeqSepoCacheOpenRead(e.to_string()))?
.to_string(),
);
}
Ok(result)
}
}
impl Interface for CacheReadingSeqRepo {
fn fetch_sequence_part(
&self,
alias_or_seq_id: &AliasOrSeqId,
begin: Option<usize>,
end: Option<usize>,
) -> Result<String, Error> {
let key = build_key(alias_or_seq_id, begin, end);
if let Some(seq) = self.cache.get(&key) {
Ok(seq.clone())
} else {
Err(Error::SeqSepoCacheKey(key))
}
}
}
/// Build the cache key for a sequence request.
///
/// The key starts with the sequence name: `namespace:value` for an alias with
/// a non-empty namespace, the bare alias value otherwise, or the sequence id
/// itself. When at least one bound is given, `:<begin>-<end>` is appended,
/// with `?` standing in for a missing bound.
fn build_key(alias_or_seq_id: &AliasOrSeqId, begin: Option<usize>, end: Option<usize>) -> String {
    let name = match alias_or_seq_id {
        // Only a present, non-empty namespace contributes a prefix.
        AliasOrSeqId::Alias {
            value,
            namespace: Some(ns),
        } if !ns.is_empty() => format!("{ns}:{value}"),
        AliasOrSeqId::Alias { value, .. } => value.clone(),
        AliasOrSeqId::SeqId(seqid) => seqid.clone(),
    };

    match (begin, end) {
        (None, None) => name,
        (None, Some(stop)) => format!("{name}:?-{stop}"),
        (Some(start), None) => format!("{name}:{start}-?"),
        (Some(start), Some(stop)) => format!("{name}:{start}-{stop}"),
    }
}
// Tests for the cache-writing and cache-reading repositories. They rely on
// fixture data under `tests/data/`.
#[cfg(test)]
mod test {
use anyhow::Error;
// NOTE(review): with this import in scope the `#[test]` attributes below
// presumably resolve to `test_log::test` rather than the built-in one — confirm.
use test_log::test;
use std::{fs::read_to_string, path::PathBuf};
use pretty_assertions::assert_eq;
use temp_testdir::TempDir;
use crate::{AliasOrSeqId, CacheReadingSeqRepo, Interface, SeqRepo};
use super::CacheWritingSeqRepo;
// Compile-time check that both cache types are `Sync`; the test body has no
// runtime assertions.
#[test]
fn test_sync() {
fn is_sync<T: Sync>() {}
is_sync::<super::CacheReadingSeqRepo>();
is_sync::<super::CacheWritingSeqRepo>();
}
// Shared assertions run against any `Interface` implementation: fetch the
// whole sequence for alias `NM_001304430.2` (no namespace) and three
// sub-ranges of it.
fn test_fetch(sr: &impl Interface) -> Result<(), Error> {
let alias = "NM_001304430.2";
let aos = AliasOrSeqId::Alias {
value: alias.to_string(),
namespace: None,
};
// Expected full sequence; the trailing backslashes join the lines into a
// single string literal without embedded newlines.
assert_eq!(
sr.fetch_sequence(&aos)?,
"ACTGCTGAGCTGGGAGATGTCGGCGGCGTGTTGGGAGGAACCGTGGGGTCTTCCCGGCGGCTTT\
GCGAAGCGGGTCCTGGTGACCGGCGGTGCTGGTTTCATGTAGGTAATGGCGCCGCTAGCCAAGCA\
GTGGCTCCCCAGAAACCCCTACCTTTTCCCGCAGCTCTGCTTGCCCTAGTGCATCACATATGATT\
GTCTCTTTAGTGGAAGATTATCCAAACTATATGATCATAAATCTAGACAAGCTGGATTACTGTGC\
AAGCTTGAAGAATCTTGAAACCATTTCTAACAAACAGAACTACAAATTTATACAGGGTGACATAT\
GTGATTCTCACTTTGTGAAACTGCTTTTTGAAACAGAGAAAATAGATATAGTACTACATTTTGCC\
GCACAAACACATGTAGATCTTTCATTCGTACGTGCCTTTGAGTTTACCTATGTTAATGTTTATGG\
CACTCACGTTTTGGTAAGTGCTGCTCATGAAGCCAGAGTGGAGAAGTTTATTTATGTCAGCACAG\
ATGAAGTATATGGTGGCAGTCTTGATAAGGAATTTGATGAATCTTCACCCAAACAACCTACAAAT\
CCTTATGCATCATCTAAAGCAGCTGCTGAATGTTTTGTACAGTCTTACTGGGAACAATATAAGTT\
TCCAGTTGTCATCACAAGAAGCAGTAATGTTTATGGACCACATCAATATCCAGAAAAGGTTATTC\
CAAAATTTATATCTTTGCTACAGCACAACAGGAAATGTTGCATTCATGGGTCAGGGCTTCAAACA\
AGAAACTTCCTTTATGCTACTGATGTTGTAGAAGCATTTCTCACTGTCCTCAAAAAAGGGAAACC\
AGGTGAAATTTATAACATCGGAACCAATTTTGAAATGTCAGTTGTCCAGCTTGCCAAAGAACTAA\
TACAACTGATCAAAGAGACCAATTCAGAGTCTGAAATGGAAAATTGGGTTGATTATGTTAATGAT\
AGACCCACCAATGACATGAGATACCCAATGAAGTCAGAAAAAATACATGGCTTAGGATGGAGACC\
TAAAGTGCCTTGGAAAGAAGGAATAAAGAAAACAATTGAATGGTACAGAGAGAATTTTCACAACT\
GGAAGAATGTGGAAAAGGCATTAGAACCCTTTCCGGTATAATCACCATTTATATAGTCGAGACAG\
TTGTCAAAGAAGAAAGTTATCCTACCTCGCCAAGTGGTATGAAATTAAGTGACCAAATGAAGTGC\
ACTCTTTTCTTTTGGAATTAGATTCATGACTTTCTGTATAAAATTCAAATGCAGAATGCCTCAAT\
CTTTGGGAGAGTTTCAGTACTGGCATAGAATTTAAATGTCAAAATTCTTTCTGAAACCCTTTCTC\
CTAGAAACTAGGAAATAATAGGTGTAGAAGACTCTCCCTAAGGGTAGCCAGGAAGAAGTCTCCTG\
ATTCGGACAACCATGAGGGGTAGTGGTGCTAGGGAGAAGGCAACCTTCACTGGTTTTGAACTCAG\
TGCCTAAGAAAGTCTCTGAAATGTTCGTTTTTAGGCAATATAGGATGTCTTAGGCCCTAATTCAC\
CATTTCTTTTTTAAGATCTGATATGCTATCATTGCCTTAATAATGGAACAAAATAGAAGCATATC\
TAACACTTTTTAAATTGATAATTTTGTAAAATTGATTACGTTGAATGCTTTTTAAGAGAAGTGTG\
TAAAGTTTTTATATTTTCACAATTAACGTATGTAAAACCTTGTATCAGAAATTTATCATGTTTAC\
TGTTTAAAATGATTGTATTTATAAAATTGTCAATATCTTAATGTATTTAATGTAGAATATTGCTT\
TTTAAAATAATGTTTTTATTTTGCTGTAGAAAAATAAAAAAAAATTTGATTATA"
);
// Sub-range requests: open start, open end, and both bounds given.
assert_eq!(sr.fetch_sequence_part(&aos, None, Some(4))?, "ACTG");
assert_eq!(sr.fetch_sequence_part(&aos, Some(1869), None)?, "TATA");
assert_eq!(sr.fetch_sequence_part(&aos, Some(0), Some(4))?, "ACTG");
Ok(())
}
// Runs the shared fetch assertions through a `CacheWritingSeqRepo` backed by
// a fresh temporary cache file, then compares that file with the checked-in
// fixture `tests/data/cached/cache.fasta`.
#[test]
fn cache_writing() -> Result<(), Error> {
let temp = TempDir::default();
let sr = SeqRepo::new("tests/data/seqrepo", "latest")?;
let mut cache_path = PathBuf::from(temp.as_ref());
cache_path.push("cache.fasta");
{
// Inner scope: `cw` is dropped before the cache file is read back below.
let cw = CacheWritingSeqRepo::new(sr, &cache_path)?;
// The same requests are issued twice; the file comparison below then
// checks what ended up in the cache file after both passes.
test_fetch(&cw)?;
test_fetch(&cw)?;
}
assert_eq!(
read_to_string(&cache_path).unwrap(),
read_to_string("tests/data/cached/cache.fasta").unwrap(),
);
Ok(())
}
// Runs the shared fetch assertions against a `CacheReadingSeqRepo` loaded
// from the checked-in fixture cache file.
#[test]
fn cache_reading() -> Result<(), Error> {
let cr = CacheReadingSeqRepo::new("tests/data/cached/cache.fasta")?;
test_fetch(&cr)?;
Ok(())
}
}