use crate::FerroError;
use std::collections::HashSet;
use std::io::{BufRead, BufReader};
use std::path::Path;
#[derive(Debug, Clone, Default)]
pub struct AccessionSources {
pub transcripts: HashSet<String>,
pub supplemental: HashSet<String>,
pub genomic: HashSet<String>,
pub refseqgene: HashSet<String>,
pub lrg: HashSet<String>,
}
impl AccessionSources {
pub fn from_ferro_reference(ferro_ref: &Path) -> Result<Self, FerroError> {
Ok(Self {
transcripts: load_accessions_from_dir(&ferro_ref.join("transcripts"))?,
supplemental: load_accessions_from_dir(&ferro_ref.join("supplemental"))?,
genomic: load_accessions_from_dir(&ferro_ref.join("genome"))?,
refseqgene: load_accessions_from_dir(&ferro_ref.join("refseqgene"))?,
lrg: load_accessions_from_dir(&ferro_ref.join("lrg"))?,
})
}
pub fn contains(&self, accession: &str) -> bool {
self.transcripts.contains(accession)
|| self.supplemental.contains(accession)
|| self.genomic.contains(accession)
|| self.refseqgene.contains(accession)
|| self.lrg.contains(accession)
}
pub fn total_count(&self) -> usize {
let mut all = HashSet::new();
all.extend(self.transcripts.iter().cloned());
all.extend(self.supplemental.iter().cloned());
all.extend(self.genomic.iter().cloned());
all.extend(self.refseqgene.iter().cloned());
all.extend(self.lrg.iter().cloned());
all.len()
}
pub fn find_missing<'a>(&self, accessions: &'a [String]) -> Vec<&'a String> {
accessions
.iter()
.filter(|acc| !self.contains(acc))
.collect()
}
pub fn source_for(&self, accession: &str) -> Option<&'static str> {
if self.transcripts.contains(accession) {
Some("transcripts")
} else if self.supplemental.contains(accession) {
Some("supplemental")
} else if self.genomic.contains(accession) {
Some("genomic")
} else if self.refseqgene.contains(accession) {
Some("refseqgene")
} else if self.lrg.contains(accession) {
Some("lrg")
} else {
None
}
}
pub fn print_summary(&self) {
eprintln!(" Ferro reference contains:");
eprintln!(
" {} transcripts (NM_, NR_, XM_, XR_)",
self.transcripts.len()
);
eprintln!(" {} supplemental sequences", self.supplemental.len());
eprintln!(" {} genomic sequences (NC_)", self.genomic.len());
eprintln!(" {} RefSeqGene sequences (NG_)", self.refseqgene.len());
eprintln!(" {} LRG sequences", self.lrg.len());
eprintln!(" {} total unique accessions", self.total_count());
}
}
fn load_accessions_from_dir(dir: &Path) -> Result<HashSet<String>, FerroError> {
let mut accessions = HashSet::new();
if !dir.exists() {
return Ok(accessions);
}
for entry in std::fs::read_dir(dir).map_err(|e| FerroError::Io {
msg: format!("Failed to read directory {}: {}", dir.display(), e),
})? {
let path = entry
.map_err(|e| FerroError::Io {
msg: format!("Failed to read entry: {}", e),
})?
.path();
if path.extension().is_some_and(|e| e == "fai") {
let file = std::fs::File::open(&path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", path.display(), e),
})?;
for line in BufReader::new(file).lines() {
let line = line.map_err(|e| FerroError::Io {
msg: format!("Failed to read line: {}", e),
})?;
if let Some(acc) = line.split('\t').next() {
accessions.insert(acc.to_string());
}
}
}
}
Ok(accessions)
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::TempDir;
#[test]
fn test_load_accessions_from_dir() {
let dir = TempDir::new().unwrap();
let fai_path = dir.path().join("test.fna.fai");
let mut f = std::fs::File::create(&fai_path).unwrap();
writeln!(f, "NM_000001.1\t1000\t0\t80\t81").unwrap();
writeln!(f, "NM_000002.2\t2000\t1000\t80\t81").unwrap();
let accessions = load_accessions_from_dir(dir.path()).unwrap();
assert_eq!(accessions.len(), 2);
assert!(accessions.contains("NM_000001.1"));
assert!(accessions.contains("NM_000002.2"));
}
#[test]
fn test_accession_sources_contains() {
let mut sources = AccessionSources::default();
sources.transcripts.insert("NM_000001.1".to_string());
sources.supplemental.insert("U31929.1".to_string());
sources.genomic.insert("NC_000001.11".to_string());
assert!(sources.contains("NM_000001.1"));
assert!(sources.contains("U31929.1"));
assert!(sources.contains("NC_000001.11"));
assert!(!sources.contains("NM_999999.1"));
}
#[test]
fn test_find_missing() {
let mut sources = AccessionSources::default();
sources.transcripts.insert("NM_000001.1".to_string());
sources.transcripts.insert("NM_000002.2".to_string());
let to_check = vec![
"NM_000001.1".to_string(),
"NM_000002.2".to_string(),
"NM_000003.3".to_string(),
];
let missing = sources.find_missing(&to_check);
assert_eq!(missing.len(), 1);
assert_eq!(missing[0], "NM_000003.3");
}
#[test]
fn test_total_count_deduplicates() {
let mut sources = AccessionSources::default();
sources.transcripts.insert("NM_000001.1".to_string());
sources.supplemental.insert("NM_000001.1".to_string()); sources.supplemental.insert("U31929.1".to_string());
assert_eq!(sources.total_count(), 2);
}
}