use crate::FerroError;
use indicatif::{ProgressBar, ProgressStyle};
use std::collections::{HashMap, HashSet};
use std::fs::{self, File};
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::process::Command;
pub mod manifest;
pub use manifest::{check_references, ReferenceManifest};
/// Versioned legacy GenBank nucleotide accessions fetched unconditionally by
/// `prepare_references` (via NCBI efetch) into `supplemental/legacy_genbank.fna`.
/// These records are not part of the regular RefSeq transcript releases.
/// NOTE(review): the reason these five specific accessions are required is not
/// visible in this file — presumably they are referenced by downstream
/// variant data; confirm against callers before editing the list.
pub const LEGACY_GENBANK_ACCESSIONS: &[&str] = &[
"U31929.1", "M75126.1", "M22590.1", "AJ298105.1", "AF319569.1", ];
/// Options controlling which reference datasets `prepare_references`
/// downloads and where the results are written.
#[derive(Debug, Clone)]
pub struct PrepareConfig {
/// Directory that receives all downloaded and derived reference data.
pub output_dir: PathBuf,
/// Fetch the RefSeq transcript FASTA shards (`human.N.rna.fna.gz`).
pub download_transcripts: bool,
/// Fetch and index the GRCh38 genome FASTA.
pub download_genome: bool,
/// Fetch and index the GRCh37 genome FASTA (off by default; large).
pub download_genome_grch37: bool,
/// Fetch the RefSeqGene (NG_*) sequence shards.
pub download_refseqgene: bool,
/// Fetch LRG FASTA + XML files from EBI.
pub download_lrg: bool,
/// Fetch cdot transcript metadata for GRCh38.
pub download_cdot: bool,
/// Fetch cdot transcript metadata for GRCh37.
pub download_cdot_grch37: bool,
/// Skip downloads/decompression/indexing when the target file already exists.
pub skip_existing: bool,
/// Optional ClinVar file used to fetch missing transcripts (benchmark feature).
pub clinvar_file: Option<PathBuf>,
/// Optional patterns file used to detect/fetch legacy transcript versions
/// (benchmark feature).
pub patterns_file: Option<PathBuf>,
/// When true, report what would be fetched without performing network writes
/// for the legacy/supplemental fetch steps.
pub dry_run: bool,
}
impl Default for PrepareConfig {
fn default() -> Self {
Self {
output_dir: PathBuf::from("ferro-reference"),
download_transcripts: true,
download_genome: true,
download_genome_grch37: false,
download_refseqgene: false,
download_lrg: false,
download_cdot: true,
download_cdot_grch37: false,
skip_existing: true,
clinvar_file: None,
patterns_file: None,
dry_run: false,
}
}
}
/// Remote endpoints for every dataset `prepare_references` can fetch.
pub mod urls {
/// RefSeq human mRNA/protein shard URL prefix; shards are numbered
/// `human.1` .. `human.N`.
pub const REFSEQ_RNA_BASE: &str =
"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.";
/// Suffix appended after the shard number.
pub const REFSEQ_RNA_SUFFIX: &str = ".rna.fna.gz";
/// Upper bound on shard count probed; the download loop stops early on the
/// first 404 after shard 1.
pub const REFSEQ_RNA_COUNT: usize = 20;
/// GRCh38.p14 full genomic FASTA (gzipped).
pub const GRCH38_GENOME: &str = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz";
/// GRCh37.p13 full genomic FASTA (gzipped).
pub const GRCH37_GENOME: &str = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz";
/// RefSeqGene (NG_*) shard directory.
pub const REFSEQGENE_BASE: &str = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/";
/// Number of RefSeqGene shards probed.
pub const REFSEQGENE_COUNT: usize = 9;
/// cdot transcript metadata releases (pinned to data_v0.2.32).
pub const CDOT_REFSEQ_GRCH38: &str = "https://github.com/SACGF/cdot/releases/download/data_v0.2.32/cdot-0.2.32.refseq.GRCh38.json.gz";
pub const CDOT_REFSEQ_GRCH37: &str = "https://github.com/SACGF/cdot/releases/download/data_v0.2.32/cdot-0.2.32.refseq.GRCh37.json.gz";
/// EBI LRG FASTA and XML directories; individual files are `LRG_<n>.fasta`
/// / `LRG_<n>.xml`.
pub const LRG_FASTA_BASE: &str = "https://ftp.ebi.ac.uk/pub/databases/lrgex/fasta/";
pub const LRG_XML_BASE: &str = "https://ftp.ebi.ac.uk/pub/databases/lrgex/";
/// Highest LRG id probed; ids are sparse, so misses are tolerated.
pub const LRG_MAX_ID: usize = 1400;
/// Tab-separated LRG -> RefSeq transcript cross-reference table.
pub const LRG_REFSEQ_MAPPING: &str =
"https://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt";
}
/// Per-transcript metadata extracted from a fetched GenBank record
/// (see `parse_genbank_record`), serialized into `*.metadata.json`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LegacyTranscript {
/// Versioned accession, e.g. "NM_000546.4".
pub id: String,
/// Gene symbol from the last `/gene="..."` qualifier, if present.
pub gene_symbol: Option<String>,
/// CDS start coordinate (1-based, as written in the GenBank CDS feature).
pub cds_start: Option<u64>,
/// CDS end coordinate (1-based, inclusive per GenBank convention).
pub cds_end: Option<u64>,
/// Length of the extracted nucleotide sequence in bases.
pub sequence_length: usize,
}
/// Sidecar metadata file written next to a legacy-transcript FASTA,
/// mapping accession -> `LegacyTranscript`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LegacyMetadata {
/// RFC 3339 timestamp of when the file was generated.
pub generated_at: String,
/// Keyed by versioned accession.
pub transcripts: HashMap<String, LegacyTranscript>,
}
/// Download, decompress and index all reference data selected by `config`,
/// updating and saving the on-disk `ReferenceManifest` as it goes.
///
/// Steps run in a fixed order: transcripts -> GRCh38 genome -> GRCh37 genome
/// -> RefSeqGene -> LRG -> cdot (GRCh38, GRCh37) -> supplemental data
/// (benchmark feature only) -> hard-coded legacy GenBank records. Each step
/// is gated by its `config.download_*` flag; `skip_existing` makes every step
/// idempotent so an interrupted run can be resumed.
///
/// # Errors
/// Returns `FerroError::Io` on directory creation, download, decompression or
/// indexing failures. Some per-file failures (RefSeqGene shards, individual
/// LRG ids) are tolerated and only logged.
pub fn prepare_references(config: &PrepareConfig) -> Result<ReferenceManifest, FerroError> {
eprintln!(
"Preparing reference data in {}",
config.output_dir.display()
);
fs::create_dir_all(&config.output_dir).map_err(|e| FerroError::Io {
msg: format!(
"Failed to create directory {}: {}",
config.output_dir.display(),
e
),
})?;
// Resume from a prior run's manifest when one exists.
let mut manifest = ReferenceManifest::load_or_default(&config.output_dir)?;
// --- RefSeq transcript shards -------------------------------------------
if config.download_transcripts {
eprintln!("\n=== Downloading RefSeq transcripts ===");
let transcript_dir = config.output_dir.join("transcripts");
fs::create_dir_all(&transcript_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
for i in 1..=urls::REFSEQ_RNA_COUNT {
let filename = format!("human.{}.rna.fna.gz", i);
let url = format!("{}{}{}", urls::REFSEQ_RNA_BASE, i, urls::REFSEQ_RNA_SUFFIX);
let output_path = transcript_dir.join(&filename);
if config.skip_existing && output_path.exists() {
eprintln!("  Skipping {} (exists)", filename);
if !manifest.transcript_fastas.contains(&output_path) {
manifest.transcript_fastas.push(output_path);
}
continue;
}
match download_file(&url, &output_path) {
Ok(_) => {
eprintln!("  Downloaded {}", filename);
if !manifest.transcript_fastas.contains(&output_path) {
manifest.transcript_fastas.push(output_path);
}
}
Err(e) => {
// The shard count is an upper bound: the first missing shard after
// shard 1 marks the end of the series. A failure on shard 1 is a
// real error (nothing was downloadable at all).
if i > 1 {
eprintln!("  No more files after human.{}.rna.fna.gz", i - 1);
break;
}
return Err(e);
}
}
}
// Decompress and index each downloaded shard.
eprintln!("\n=== Processing transcript files ===");
for gz_path in &manifest.transcript_fastas {
// "human.N.rna.fna.gz" -> drop ".gz", then normalize to ".fna".
let fasta_path = gz_path.with_extension("").with_extension("fna");
if config.skip_existing && fasta_path.exists() {
eprintln!("  Skipping decompress {} (exists)", fasta_path.display());
} else {
decompress_gzip(gz_path, &fasta_path)?;
eprintln!("  Decompressed {}", fasta_path.display());
}
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
if config.skip_existing && fai_path.exists() {
eprintln!("  Skipping index {} (exists)", fai_path.display());
} else {
index_fasta(&fasta_path)?;
eprintln!("  Indexed {}", fasta_path.display());
}
}
let (count, prefixes) = count_transcripts(&manifest.transcript_fastas)?;
manifest.transcript_count = count;
manifest.available_prefixes = prefixes;
eprintln!("\n  Total transcripts: {}", count);
}
// --- GRCh38 genome -------------------------------------------------------
if config.download_genome {
eprintln!("\n=== Downloading GRCh38 genome (this may take a while) ===");
let genome_dir = config.output_dir.join("genome");
fs::create_dir_all(&genome_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
let gz_path = genome_dir.join("GRCh38.fna.gz");
let fasta_path = genome_dir.join("GRCh38.fna");
if config.skip_existing && fasta_path.exists() {
eprintln!("  Skipping genome download (exists)");
// The decompressed FASTA may exist without its index (interrupted run).
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
if !fai_path.exists() {
eprintln!("  Indexing existing genome...");
index_fasta(&fasta_path)?;
}
} else {
download_file(urls::GRCH38_GENOME, &gz_path)?;
decompress_gzip(&gz_path, &fasta_path)?;
index_fasta(&fasta_path)?;
}
manifest.genome_fasta = Some(fasta_path);
}
// --- GRCh37 genome (optional) -------------------------------------------
if config.download_genome_grch37 {
eprintln!("\n=== Downloading GRCh37 genome (this may take a while, ~900MB) ===");
let genome_dir = config.output_dir.join("genome");
fs::create_dir_all(&genome_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
let gz_path = genome_dir.join("GRCh37.fna.gz");
let fasta_path = genome_dir.join("GRCh37.fna");
if config.skip_existing && fasta_path.exists() {
eprintln!("  Skipping GRCh37 genome download (exists)");
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
if !fai_path.exists() {
eprintln!("  Indexing existing GRCh37 genome...");
index_fasta(&fasta_path)?;
}
} else {
eprintln!("  Downloading GRCh37 genome...");
download_file(urls::GRCH37_GENOME, &gz_path)?;
eprintln!("  Decompressing (~3GB)...");
decompress_gzip(&gz_path, &fasta_path)?;
eprintln!("  Indexing...");
index_fasta(&fasta_path)?;
eprintln!("  Done.");
}
manifest.genome_grch37_fasta = Some(fasta_path);
}
// --- RefSeqGene shards (optional) ---------------------------------------
if config.download_refseqgene {
eprintln!("\n=== Downloading RefSeqGene sequences (NG_* accessions, ~600MB) ===");
let refseqgene_dir = config.output_dir.join("refseqgene");
fs::create_dir_all(&refseqgene_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
for i in 1..=urls::REFSEQGENE_COUNT {
let filename = format!("refseqgene.{}.genomic.fna.gz", i);
let url = format!("{}refseqgene.{}.genomic.fna.gz", urls::REFSEQGENE_BASE, i);
let gz_path = refseqgene_dir.join(&filename);
let fasta_path = refseqgene_dir.join(format!("refseqgene.{}.genomic.fna", i));
if config.skip_existing && fasta_path.exists() {
eprintln!("  Skipping {} (exists)", filename);
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
if !fai_path.exists() {
index_fasta(&fasta_path)?;
}
continue;
}
eprintln!("  Downloading {}...", filename);
// Individual shard failures are tolerated (logged, not fatal).
match download_file(&url, &gz_path) {
Ok(_) => {
decompress_gzip(&gz_path, &fasta_path)?;
index_fasta(&fasta_path)?;
manifest.refseqgene_fastas.push(fasta_path);
}
Err(e) => {
eprintln!("  Warning: Failed to download {}: {}", filename, e);
}
}
}
eprintln!(
"  Downloaded {} RefSeqGene files",
manifest.refseqgene_fastas.len()
);
}
// --- LRG FASTA + XML (optional) -----------------------------------------
if config.download_lrg {
eprintln!("\n=== Preparing LRG sequences from EBI (~1325 files) ===");
let lrg_dir = config.output_dir.join("lrg");
fs::create_dir_all(&lrg_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
let mut fasta_downloaded = 0;
let mut fasta_skipped = 0;
let mut xml_downloaded = 0;
let mut xml_skipped = 0;
// LRG ids are sparse in 1..=LRG_MAX_ID; misses are expected and silently
// skipped (the partially-written file is removed).
for i in 1..=urls::LRG_MAX_ID {
let lrg_id = format!("LRG_{}", i);
let fasta_filename = format!("{}.fasta", lrg_id);
let xml_filename = format!("{}.xml", lrg_id);
let fasta_url = format!("{}{}.fasta", urls::LRG_FASTA_BASE, lrg_id);
let xml_url = format!("{}{}.xml", urls::LRG_XML_BASE, lrg_id);
let fasta_path = lrg_dir.join(&fasta_filename);
let xml_path = lrg_dir.join(&xml_filename);
let mut lrg_exists = false;
if config.skip_existing && fasta_path.exists() {
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
if !fai_path.exists() {
index_fasta(&fasta_path)?;
}
fasta_skipped += 1;
lrg_exists = true;
} else {
match download_file(&fasta_url, &fasta_path) {
Ok(_) => {
index_fasta(&fasta_path)?;
manifest.lrg_fastas.push(fasta_path.clone());
fasta_downloaded += 1;
lrg_exists = true;
if fasta_downloaded % 100 == 0 {
eprintln!("  Downloaded {} LRG FASTA files...", fasta_downloaded);
}
}
Err(_) => {
// This id does not exist; remove any partial download.
let _ = std::fs::remove_file(&fasta_path);
}
}
}
// Only fetch the XML when the FASTA exists for this id.
if lrg_exists {
if config.skip_existing && xml_path.exists() {
xml_skipped += 1;
} else {
match download_file(&xml_url, &xml_path) {
Ok(_) => {
manifest.lrg_xmls.push(xml_path);
xml_downloaded += 1;
if xml_downloaded % 100 == 0 {
eprintln!("  Downloaded {} LRG XML files...", xml_downloaded);
}
}
Err(_) => {
let _ = std::fs::remove_file(&xml_path);
}
}
}
}
}
if fasta_skipped > 0 {
eprintln!("  Skipped {} existing LRG FASTA files", fasta_skipped);
}
if fasta_downloaded > 0 {
eprintln!("  Downloaded {} new LRG FASTA files", fasta_downloaded);
}
if xml_skipped > 0 {
eprintln!("  Skipped {} existing LRG XML files", xml_skipped);
}
if xml_downloaded > 0 {
eprintln!("  Downloaded {} new LRG XML files", xml_downloaded);
}
eprintln!("  Indexed all LRG files");
let mapping_path = lrg_dir.join("lrg_refseq_mapping.txt");
if config.skip_existing && mapping_path.exists() {
eprintln!("  Skipping LRG-RefSeq mapping (exists)");
} else {
eprintln!("  Downloading LRG-RefSeq mapping...");
download_file(urls::LRG_REFSEQ_MAPPING, &mapping_path)?;
}
manifest.lrg_refseq_mapping = Some(mapping_path);
}
// --- cdot transcript metadata -------------------------------------------
if config.download_cdot {
eprintln!("\n=== Downloading GRCh38 cdot transcript metadata (~200MB) ===");
let cdot_dir = config.output_dir.join("cdot");
manifest.cdot_json = Some(download_cdot(
urls::CDOT_REFSEQ_GRCH38,
&cdot_dir,
config.skip_existing,
)?);
}
if config.download_cdot_grch37 {
eprintln!("\n=== Downloading GRCh37 cdot transcript metadata (~200MB) ===");
let cdot_dir = config.output_dir.join("cdot");
manifest.cdot_grch37_json = Some(download_cdot(
urls::CDOT_REFSEQ_GRCH37,
&cdot_dir,
config.skip_existing,
)?);
}
// --- Supplemental data (benchmark builds only) --------------------------
#[cfg(feature = "benchmark")]
if config.clinvar_file.is_some() || config.patterns_file.is_some() {
fetch_supplemental_data(config, &mut manifest)?;
}
#[cfg(not(feature = "benchmark"))]
if config.clinvar_file.is_some() || config.patterns_file.is_some() {
eprintln!("\n=== Warning: --clinvar and --patterns require benchmark feature ===");
eprintln!("  Build with: cargo build --features benchmark");
eprintln!("  Skipping supplemental data fetching.");
}
// --- Hard-coded legacy GenBank records (always attempted) ---------------
eprintln!("\n=== Fetching legacy GenBank sequences ===");
let supplemental_dir = config.output_dir.join("supplemental");
fs::create_dir_all(&supplemental_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create supplemental directory: {}", e),
})?;
let genbank_fasta = supplemental_dir.join("legacy_genbank.fna");
if config.skip_existing && genbank_fasta.exists() {
eprintln!("  Skipping (exists): {}", genbank_fasta.display());
manifest.legacy_genbank_fasta = Some(genbank_fasta.clone());
let metadata_path = genbank_fasta.with_extension("metadata.json");
if metadata_path.exists() {
manifest.legacy_genbank_metadata = Some(metadata_path);
}
} else {
let genbank_accessions: Vec<String> = LEGACY_GENBANK_ACCESSIONS
.iter()
.map(|s| s.to_string())
.collect();
let fetched = fetch_legacy_versions(&genbank_accessions, &genbank_fasta, config.dry_run)?;
// In dry-run mode (or when nothing was fetched) the manifest is left
// untouched so a later real run still performs the fetch.
if !config.dry_run && fetched > 0 {
eprintln!("  Indexing legacy GenBank FASTA...");
index_fasta(&genbank_fasta)?;
manifest.legacy_genbank_fasta = Some(genbank_fasta.clone());
let metadata_path = genbank_fasta.with_extension("metadata.json");
if metadata_path.exists() {
manifest.legacy_genbank_metadata = Some(metadata_path);
}
}
}
manifest.save()?;
eprintln!("\n=== Preparation complete ===");
eprintln!(
"Manifest: {}",
manifest.reference_dir.join("manifest.json").display()
);
Ok(manifest)
}
/// Fetch transcripts referenced by the configured ClinVar/patterns files but
/// missing from the local reference data (benchmark builds only).
///
/// Two passes: (1) accessions present in ClinVar/patterns but absent from all
/// indexed FASTAs are fetched into `supplemental/clinvar_transcripts.fna`;
/// (2) versioned accessions in the patterns file whose exact version is not
/// available locally are fetched (GenBank format, with CDS metadata) into
/// `supplemental/legacy_transcripts.fna`.
#[cfg(feature = "benchmark")]
fn fetch_supplemental_data(
config: &PrepareConfig,
manifest: &mut ReferenceManifest,
) -> Result<(), FerroError> {
eprintln!("\n=== Fetching missing transcripts ===");
// Everything already indexed (any subdirectory's .fai files) counts as present.
let existing = load_existing_accessions(&config.output_dir)?;
eprintln!(
"  Found {} existing accessions in reference data",
existing.len()
);
let mut all_accessions = HashSet::new();
if let Some(ref clinvar_file) = config.clinvar_file {
let accessions = crate::benchmark::cache::extract_clinvar_accessions(clinvar_file)?;
eprintln!("  Found {} accessions in ClinVar file", accessions.len());
all_accessions.extend(accessions);
}
if let Some(ref patterns_file) = config.patterns_file {
let accessions = crate::benchmark::cache::extract_all_accessions_from_file(patterns_file)?;
eprintln!("  Found {} accessions in patterns file", accessions.len());
all_accessions.extend(accessions);
}
eprintln!("  {} unique accessions total", all_accessions.len());
let missing: Vec<String> = all_accessions
.into_iter()
.filter(|acc| !existing.contains(acc))
.collect();
eprintln!(
"  {} accessions are missing from reference data",
missing.len()
);
if !missing.is_empty() {
let supplemental_dir = config.output_dir.join("supplemental");
fs::create_dir_all(&supplemental_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create supplemental directory: {}", e),
})?;
let supplemental_fasta = supplemental_dir.join("clinvar_transcripts.fna");
let fetched = crate::benchmark::cache::fetch_fasta_to_file(
&missing,
&supplemental_fasta,
config.dry_run,
)?;
if !config.dry_run && fetched > 0 {
eprintln!("  Indexing supplemental FASTA...");
index_fasta(&supplemental_fasta)?;
manifest.supplemental_fasta = Some(supplemental_fasta);
eprintln!("  Fetched {} supplemental transcripts", fetched);
}
}
// Pass 2: exact legacy *versions* named in the patterns file.
if let Some(ref patterns_path) = config.patterns_file {
eprintln!("\n=== Detecting legacy transcript versions ===");
let transcript_dir = config.output_dir.join("transcripts");
let supplemental_dir = config.output_dir.join("supplemental");
// The supplemental dir only exists if pass 1 (or a prior run) created it.
let supp_dir_opt = if supplemental_dir.exists() {
Some(supplemental_dir.as_path())
} else {
None
};
let legacy_versions = detect_legacy_versions(patterns_path, &transcript_dir, supp_dir_opt)?;
if !legacy_versions.is_empty() {
eprintln!("  Found {} legacy versions to fetch", legacy_versions.len());
let supplemental_dir = config.output_dir.join("supplemental");
fs::create_dir_all(&supplemental_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create supplemental directory: {}", e),
})?;
let legacy_fasta = supplemental_dir.join("legacy_transcripts.fna");
let fetched = fetch_legacy_versions(&legacy_versions, &legacy_fasta, config.dry_run)?;
if !config.dry_run && fetched > 0 {
eprintln!("  Indexing legacy transcripts FASTA...");
index_fasta(&legacy_fasta)?;
manifest.legacy_transcripts_fasta = Some(legacy_fasta.clone());
let metadata_path = legacy_fasta.with_extension("metadata.json");
if metadata_path.exists() {
manifest.legacy_transcripts_metadata = Some(metadata_path);
}
}
} else {
eprintln!("  No legacy versions needed");
}
}
Ok(())
}
/// Download one cdot release archive into `cdot_dir`, decompress it, and
/// convert the JSON to a bincode sidecar (`.bin`) for fast loading.
///
/// Returns the path of the decompressed JSON file. The `json_is_fresh` flag
/// tracks whether the JSON was (re)produced in this run: a freshly
/// decompressed JSON always forces the bincode conversion, while a
/// pre-existing JSON + `.bin` pair is left untouched when `skip_existing`.
fn download_cdot(url: &str, cdot_dir: &Path, skip_existing: bool) -> Result<PathBuf, FerroError> {
fs::create_dir_all(cdot_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
// Derive local filenames from the URL's last path component,
// e.g. "cdot-0.2.32.refseq.GRCh38.json.gz" -> ".json" -> ".bin".
let gz_filename = Path::new(url).file_name().ok_or_else(|| FerroError::Io {
msg: format!("cdot URL has no filename component: {}", url),
})?;
let gz_path = cdot_dir.join(gz_filename);
let json_path = gz_path.with_extension("");
let json_is_fresh = if skip_existing && json_path.exists() {
eprintln!("  Skipping cdot download (exists)");
false
} else {
// Reuse a previously downloaded archive when possible.
if skip_existing && gz_path.exists() {
eprintln!(
"  Reusing existing {}...",
gz_path.file_name().unwrap_or_default().to_string_lossy()
);
} else {
eprintln!(
"  Downloading {}...",
gz_path.file_name().unwrap_or_default().to_string_lossy()
);
download_file(url, &gz_path)?;
}
eprintln!("  Decompressing...");
decompress_gzip(&gz_path, &json_path)?;
eprintln!("  Done.");
true
};
let bin_path = json_path.with_extension("bin");
if !json_is_fresh && skip_existing && bin_path.exists() {
eprintln!("  Skipping cdot bincode conversion (exists)");
} else {
eprintln!("  Converting cdot JSON to bincode for fast loading...");
let mapper = crate::data::cdot::CdotMapper::from_json_file(&json_path).map_err(|e| {
FerroError::Io {
msg: format!("Failed to parse cdot JSON: {}", e),
}
})?;
mapper.to_bincode_file(&bin_path)?;
eprintln!("  Done.");
}
Ok(json_path)
}
/// Download `url` to `output`, preferring external tools.
///
/// Tries `curl` first, then `wget`; when neither is available or succeeds,
/// falls back to the built-in `reqwest`-based downloader.
fn download_file(url: &str, output: &Path) -> Result<(), FerroError> {
    let dest = output.to_str().ok_or_else(|| FerroError::Io {
        msg: format!("Path contains invalid UTF-8: {:?}", output),
    })?;

    // Run an external downloader and report whether it exited successfully.
    let tool_ok = |program: &str, args: &[&str]| -> bool {
        Command::new(program)
            .args(args)
            .output()
            .map(|out| out.status.success())
            .unwrap_or(false)
    };

    if tool_ok("curl", &["-fSL", "-o", dest, url]) {
        return Ok(());
    }
    if tool_ok("wget", &["-q", "-O", dest, url]) {
        return Ok(());
    }
    // Last resort: pure-Rust HTTP client.
    download_with_reqwest(url, output)
}
fn download_with_reqwest(url: &str, output: &Path) -> Result<(), FerroError> {
let client = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_secs(3600))
.build()
.map_err(|e| FerroError::Io {
msg: format!("Failed to create HTTP client: {}", e),
})?;
let response = client.get(url).send().map_err(|e| FerroError::Io {
msg: format!("Failed to download {}: {}", url, e),
})?;
if !response.status().is_success() {
return Err(FerroError::Io {
msg: format!("HTTP {} for {}", response.status(), url),
});
}
let mut file = File::create(output).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", output.display(), e),
})?;
let content = response.bytes().map_err(|e| FerroError::Io {
msg: format!("Failed to read response: {}", e),
})?;
file.write_all(&content).map_err(|e| FerroError::Io {
msg: format!("Failed to write file: {}", e),
})?;
Ok(())
}
fn decompress_gzip(input: &Path, output: &Path) -> Result<(), FerroError> {
use flate2::read::GzDecoder;
let input_file = File::open(input).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", input.display(), e),
})?;
let output_file = File::create(output).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", output.display(), e),
})?;
let mut decoder = GzDecoder::new(BufReader::new(input_file));
let mut writer = BufWriter::new(output_file);
std::io::copy(&mut decoder, &mut writer).map_err(|e| FerroError::Io {
msg: format!("Failed to decompress: {}", e),
})?;
Ok(())
}
/// Create a `.fai` index for `fasta_path`.
///
/// Prefers `samtools faidx` when it is installed and succeeds; otherwise
/// falls back to the pure-Rust indexer `build_fasta_index`.
pub fn index_fasta(fasta_path: &Path) -> Result<(), FerroError> {
    let path_str = fasta_path.to_str().ok_or_else(|| FerroError::Io {
        msg: format!("FASTA path contains invalid UTF-8: {:?}", fasta_path),
    })?;
    let samtools_worked = Command::new("samtools")
        .args(["faidx", path_str])
        .output()
        .map(|out| out.status.success())
        .unwrap_or(false);
    if samtools_worked {
        Ok(())
    } else {
        build_fasta_index(fasta_path)
    }
}
/// Pure-Rust fallback that writes a samtools-compatible `.fai` index:
/// one tab-separated line per sequence with
/// `name  length  offset  linebases  linewidth`.
///
/// NOTE(review): offsets assume Unix `\n` line endings — `BufRead::lines`
/// strips both `\n` and `\r\n`, so the `len() + 1` arithmetic below would be
/// off by one per line for CRLF input. Presumably all inputs are
/// NCBI/EBI-produced Unix files — TODO confirm.
/// Also assumes (like faidx) that all sequence lines of a record share the
/// geometry of the first line; only the first line's sizes are recorded.
fn build_fasta_index(fasta_path: &Path) -> Result<(), FerroError> {
let file = File::open(fasta_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", fasta_path.display(), e),
})?;
let reader = BufReader::new(file);
// Index is written next to the FASTA as "<name>.fai".
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
let fai_file = File::create(&fai_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", fai_path.display(), e),
})?;
let mut writer = BufWriter::new(fai_file);
// State for the record currently being scanned.
let mut current_name: Option<String> = None;
let mut seq_length: u64 = 0;      // total bases in the record
let mut seq_offset: u64 = 0;      // byte offset of the first sequence byte
let mut line_bases: u64 = 0;      // bases on the first sequence line
let mut line_bytes: u64 = 0;      // bytes (incl. newline) of that line
let mut byte_offset: u64 = 0;     // running offset of the current line
let mut first_seq_line = true;
for line in reader.lines() {
let line = line.map_err(|e| FerroError::Io {
msg: format!("Failed to read line: {}", e),
})?;
if let Some(header) = line.strip_prefix('>') {
// New record header: flush the previous record's index line first.
if let Some(name) = current_name.take() {
writeln!(
writer,
"{}\t{}\t{}\t{}\t{}",
name, seq_length, seq_offset, line_bases, line_bytes
)
.map_err(|e| FerroError::Io {
msg: format!("Failed to write index: {}", e),
})?;
}
// Sequence name is the first whitespace-delimited token after '>'.
let name = header.split_whitespace().next().unwrap_or("").to_string();
current_name = Some(name);
seq_length = 0;
// Sequence data starts on the byte after this header line's newline.
seq_offset = byte_offset + line.len() as u64 + 1;
first_seq_line = true;
} else if current_name.is_some() {
// trim_end so trailing whitespace does not count as bases.
let bases = line.trim_end().len() as u64;
seq_length += bases;
if first_seq_line {
// Record the geometry of the first sequence line only.
line_bases = bases;
line_bytes = line.len() as u64 + 1;
first_seq_line = false;
}
}
// +1 for the newline stripped by lines().
byte_offset += line.len() as u64 + 1;
}
// Flush the final record.
if let Some(name) = current_name {
writeln!(
writer,
"{}\t{}\t{}\t{}\t{}",
name, seq_length, seq_offset, line_bases, line_bytes
)
.map_err(|e| FerroError::Io {
msg: format!("Failed to write index: {}", e),
})?;
}
Ok(())
}
/// Count transcript records across the decompressed FASTA files and collect
/// the set of accession prefixes present (e.g. "NM_", "NR_").
///
/// `fasta_files` holds the downloaded `.fna.gz` paths; the corresponding
/// decompressed `.fna` files are scanned (missing ones are skipped).
fn count_transcripts(fasta_files: &[PathBuf]) -> Result<(usize, Vec<String>), FerroError> {
    let mut count = 0usize;
    let mut prefixes = HashSet::new();
    let pb = ProgressBar::new(fasta_files.len() as u64);
    pb.set_style(
        ProgressStyle::default_bar()
            .template("[{elapsed_precise}] {bar:40} {pos}/{len} {msg}")
            .unwrap(),
    );
    for fasta_path in fasta_files {
        // "human.N.rna.fna.gz" -> "human.N.rna.fna"
        let fasta_path = fasta_path.with_extension("").with_extension("fna");
        if !fasta_path.exists() {
            continue;
        }
        let file = File::open(&fasta_path).map_err(|e| FerroError::Io {
            msg: format!("Failed to open {}: {}", fasta_path.display(), e),
        })?;
        let reader = BufReader::new(file);
        for line in reader.lines() {
            let line = line.map_err(|e| FerroError::Io {
                msg: format!("Failed to read line: {}", e),
            })?;
            if let Some(header) = line.strip_prefix('>') {
                count += 1;
                // Fix: legacy NCBI headers look like ">gi|…|ref|NM_000546.6|desc",
                // where the accession is the 4th pipe field, but current RefSeq
                // FASTA uses plain ">NM_000546.6 desc" headers, for which the
                // old `split('|').nth(3)` lookup always failed and left
                // `available_prefixes` empty. Support both layouts.
                let accession = header
                    .split('|')
                    .nth(3)
                    .or_else(|| header.split_whitespace().next());
                if let Some(acc) = accession {
                    if let Some(prefix) = acc.split('_').next() {
                        // Only record genuine underscore-style prefixes; an
                        // accession without '_' would otherwise yield itself.
                        if prefix.len() < acc.len() {
                            prefixes.insert(format!("{}_", prefix));
                        }
                    }
                }
            }
        }
        pb.inc(1);
    }
    pb.finish_with_message(format!("{} transcripts", count));
    let mut prefix_vec: Vec<String> = prefixes.into_iter().collect();
    prefix_vec.sort();
    Ok((count, prefix_vec))
}
/// Collect every sequence name already present in the reference directory by
/// reading the `.fai` index files in each known data subdirectory.
#[cfg(feature = "benchmark")]
fn load_existing_accessions(reference_dir: &Path) -> Result<HashSet<String>, FerroError> {
    let mut accessions = HashSet::new();
    let subdir_names = ["transcripts", "genome", "refseqgene", "lrg", "supplemental"];
    for subdir in subdir_names {
        let dir = reference_dir.join(subdir);
        if !dir.exists() {
            continue;
        }
        let entries = fs::read_dir(&dir).map_err(|e| FerroError::Io {
            msg: format!("Failed to read directory {}: {}", dir.display(), e),
        })?;
        for entry in entries.flatten() {
            let path = entry.path();
            if path.extension().and_then(|e| e.to_str()) != Some("fai") {
                continue;
            }
            let file = File::open(&path).map_err(|e| FerroError::Io {
                msg: format!("Failed to open {}: {}", path.display(), e),
            })?;
            // First tab-separated column of each .fai line is the sequence name.
            for line in BufReader::new(file).lines().map_while(Result::ok) {
                if let Some(name) = line.split('\t').next() {
                    accessions.insert(name.to_string());
                }
            }
        }
    }
    Ok(accessions)
}
/// Gather all transcript accessions (with version) that are available locally
/// by reading sequence names out of `.fai` index files.
///
/// Scans `transcript_dir` and, when provided, `supplemental_dir`. All I/O
/// errors are deliberately ignored (best effort): a missing or unreadable
/// directory simply contributes nothing.
fn load_available_versions(
    transcript_dir: &Path,
    supplemental_dir: Option<&Path>,
) -> HashSet<String> {
    // Fix (DRY): the identical scan loop was previously duplicated verbatim
    // for the two directories; factor it into one helper.
    fn scan_dir(dir: &Path, versions: &mut HashSet<String>) {
        let Ok(entries) = fs::read_dir(dir) else {
            return;
        };
        for entry in entries.flatten() {
            let path = entry.path();
            if !path.extension().is_some_and(|e| e == "fai") {
                continue;
            }
            if let Ok(file) = File::open(&path) {
                // First tab-separated column of a .fai line is the accession.
                for line in BufReader::new(file).lines().map_while(Result::ok) {
                    if let Some(acc) = line.split('\t').next() {
                        versions.insert(acc.to_string());
                    }
                }
            }
        }
    }

    let mut versions = HashSet::new();
    scan_dir(transcript_dir, &mut versions);
    if let Some(supp_dir) = supplemental_dir {
        scan_dir(supp_dir, &mut versions);
    }
    versions
}
/// Extract versioned transcript accessions (e.g. "NM_000546.6") from a
/// patterns file whose lines look like `ACCESSION:c.change`.
///
/// Only NM_/NR_/XM_/XR_ accessions carrying an explicit version (a '.') are
/// kept; lines without a ':' and unreadable lines are skipped silently.
fn extract_versioned_accessions_from_patterns(
    patterns_path: &Path,
) -> Result<HashSet<String>, FerroError> {
    let file = File::open(patterns_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to open patterns file: {}", e),
    })?;
    let mut accessions = HashSet::new();
    for line in BufReader::new(file).lines().map_while(Result::ok) {
        // Everything before the first ':' is the accession.
        let Some(colon_pos) = line.find(':') else {
            continue;
        };
        let accession = &line[..colon_pos];
        let is_transcript = ["NM_", "NR_", "XM_", "XR_"]
            .iter()
            .any(|prefix| accession.starts_with(prefix));
        if is_transcript && accession.contains('.') {
            accessions.insert(accession.to_string());
        }
    }
    Ok(accessions)
}
/// Determine which versioned transcript accessions referenced by the patterns
/// file are absent from the locally available reference FASTAs.
///
/// Returns a sorted list of missing NM_/NR_/XM_/XR_ accessions to fetch.
pub fn detect_legacy_versions(
    patterns_path: &Path,
    transcript_dir: &Path,
    supplemental_dir: Option<&Path>,
) -> Result<Vec<String>, FerroError> {
    let available = load_available_versions(transcript_dir, supplemental_dir);
    eprintln!(" Loaded {} available transcript versions", available.len());

    let pattern_versions = extract_versioned_accessions_from_patterns(patterns_path)?;
    eprintln!(
        " Found {} versioned accessions in patterns",
        pattern_versions.len()
    );

    let mut legacy = Vec::new();
    for acc in pattern_versions {
        let is_transcript = acc.starts_with("NM_")
            || acc.starts_with("NR_")
            || acc.starts_with("XM_")
            || acc.starts_with("XR_");
        if is_transcript && !available.contains(&acc) {
            legacy.push(acc);
        }
    }
    legacy.sort();
    Ok(legacy)
}
/// Parse a single GenBank flat-file record.
///
/// Extracts the nucleotide sequence from the ORIGIN section (letters only,
/// uppercased), the CDS start/end coordinates — handling `complement(...)`,
/// `join(...)` and partial `<`/`>` markers — and the last `/gene="..."`
/// qualifier encountered.
///
/// Returns `None` when no sequence is found, otherwise
/// `(sequence, cds_start, cds_end, gene_name)`; `gene_name` is empty when
/// absent.
fn parse_genbank_record(genbank: &str) -> Option<(String, Option<u64>, Option<u64>, String)> {
    const GENE_TAG: &str = "/gene=\"";

    let mut sequence = String::new();
    let mut gene_name = String::new();
    let mut cds_start: Option<u64> = None;
    let mut cds_end: Option<u64> = None;
    let mut in_origin = false;

    for line in genbank.lines() {
        // "ORIGIN" opens the sequence block.
        if line.starts_with("ORIGIN") {
            in_origin = true;
            continue;
        }
        if in_origin {
            // "//" terminates the record.
            if line.starts_with("//") {
                break;
            }
            // Sequence lines interleave offsets and spacing; keep letters only.
            for c in line.chars() {
                if c.is_alphabetic() {
                    sequence.extend(c.to_uppercase());
                }
            }
        }
        // Remember the (last) /gene="..." qualifier seen.
        if let Some(pos) = line.find(GENE_TAG) {
            let rest = &line[pos + GENE_TAG.len()..];
            if let Some(quote) = rest.find('"') {
                gene_name = rest[..quote].to_string();
            }
        }
        // A bare "CDS <coords>" feature line (qualifier lines contain '/').
        let trimmed = line.trim();
        if trimmed.starts_with("CDS") && !trimmed.contains('/') {
            let coords = trimmed.trim_start_matches("CDS").trim();
            let coords = coords
                .trim_start_matches("complement(")
                .trim_end_matches(')');
            if coords.starts_with("join(") {
                // For joins, the CDS span is first segment start .. last
                // segment end.
                let inner = coords.trim_start_matches("join(").trim_end_matches(')');
                let segments: Vec<&str> = inner.split(',').collect();
                if let Some((start, _)) = segments.first().and_then(|s| s.split_once("..")) {
                    cds_start = start.trim().trim_start_matches('<').parse().ok();
                }
                if let Some((_, end)) = segments.last().and_then(|s| s.split_once("..")) {
                    cds_end = end.trim().trim_end_matches('>').parse().ok();
                }
            } else if let Some((start, end)) = coords.split_once("..") {
                cds_start = start.trim().trim_start_matches('<').parse().ok();
                cds_end = end.trim().trim_end_matches('>').parse().ok();
            }
        }
    }

    if sequence.is_empty() {
        None
    } else {
        Some((sequence, cds_start, cds_end, gene_name))
    }
}
/// Fetch the given accessions from NCBI (GenBank format, via efetch/curl),
/// write their sequences to `output_fasta`, and write CDS/gene metadata to a
/// sidecar `<output>.metadata.json`.
///
/// Returns the number of records successfully fetched and parsed. With
/// `dry_run` set, only prints what would be fetched and returns 0. Failures
/// for individual accessions are collected and reported as warnings, not
/// errors.
pub fn fetch_legacy_versions(
legacy_accessions: &[String],
output_fasta: &Path,
dry_run: bool,
) -> Result<usize, FerroError> {
if legacy_accessions.is_empty() {
return Ok(0);
}
if dry_run {
eprintln!(
"  [dry run] Would fetch {} legacy versions",
legacy_accessions.len()
);
// Show at most the first 10 accessions to keep output readable.
for acc in legacy_accessions.iter().take(10) {
eprintln!("    {}", acc);
}
if legacy_accessions.len() > 10 {
eprintln!("    ... and {} more", legacy_accessions.len() - 10);
}
return Ok(0);
}
eprintln!(
"  Fetching {} legacy transcript versions (GenBank format for CDS metadata)...",
legacy_accessions.len()
);
let mut fasta_file = File::create(output_fasta).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", output_fasta.display(), e),
})?;
let metadata_path = output_fasta.with_extension("metadata.json");
let mut metadata = LegacyMetadata {
generated_at: chrono::Utc::now().to_rfc3339(),
transcripts: HashMap::new(),
};
let mut fetched = 0;
let mut failed = Vec::new();
let pb = ProgressBar::new(legacy_accessions.len() as u64);
pb.set_style(
ProgressStyle::default_bar()
.template("  [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} {msg}")
.unwrap()
.progress_chars("##-"),
);
for acc in legacy_accessions {
pb.set_message(acc.clone());
// NCBI E-utilities efetch: GenBank flat file for a nuccore accession.
let url = format!(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=gb&retmode=text",
acc
);
// NOTE(review): this path shells out to curl directly (no wget/reqwest
// fallback like `download_file`) — a missing curl makes every fetch fail.
let output = Command::new("curl")
.args(["-s", "-L", &url])
.output()
.map_err(|e| FerroError::Io {
msg: format!("Failed to fetch {}: {}", acc, e),
})?;
let genbank_text = String::from_utf8_lossy(&output.stdout);
// "LOCUS" is a sanity check that we received a GenBank record rather
// than an efetch error page.
if output.status.success() && genbank_text.contains("LOCUS") {
if let Some((seq, cds_start, cds_end, gene_name)) = parse_genbank_record(&genbank_text)
{
// FASTA header: ">ACC" or ">ACC GENE" when a gene symbol is known.
let header = if gene_name.is_empty() {
acc.clone()
} else {
format!("{} {}", acc, gene_name)
};
writeln!(fasta_file, ">{}", header).map_err(|e| FerroError::Io {
msg: format!("Failed to write {}: {}", acc, e),
})?;
// Wrap sequence lines at 70 columns.
for chunk in seq.as_bytes().chunks(70) {
writeln!(fasta_file, "{}", std::str::from_utf8(chunk).unwrap_or("")).map_err(
|e| FerroError::Io {
msg: format!("Failed to write sequence: {}", e),
},
)?;
}
metadata.transcripts.insert(
acc.clone(),
LegacyTranscript {
id: acc.clone(),
gene_symbol: if gene_name.is_empty() {
None
} else {
Some(gene_name)
},
cds_start,
cds_end,
sequence_length: seq.len(),
},
);
fetched += 1;
} else {
failed.push(acc.clone());
}
} else {
failed.push(acc.clone());
}
pb.inc(1);
// Courtesy rate limit for NCBI E-utilities.
std::thread::sleep(std::time::Duration::from_millis(100));
}
pb.finish_and_clear();
let metadata_file = File::create(&metadata_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create metadata file: {}", e),
})?;
serde_json::to_writer_pretty(metadata_file, &metadata).map_err(|e| FerroError::Io {
msg: format!("Failed to write metadata: {}", e),
})?;
if !failed.is_empty() {
eprintln!("  Warning: Failed to fetch {} accessions:", failed.len());
for acc in failed.iter().take(5) {
eprintln!("    {}", acc);
}
if failed.len() > 5 {
eprintln!("    ... and {} more", failed.len() - 5);
}
}
eprintln!("  Fetched {} legacy transcripts with CDS metadata", fetched);
eprintln!("  Metadata written to: {}", metadata_path.display());
Ok(fetched)
}
/// Locate a patterns file (`*.txt`) under `<ferro_ref>/patterns`, if any.
///
/// Returns the first `.txt` entry found in directory order, or `None` when
/// the directory is missing, unreadable, or has no `.txt` file.
pub fn detect_patterns_from_ferro_reference(ferro_ref: &Path) -> Option<PathBuf> {
    let patterns_dir = ferro_ref.join("patterns");
    let entries = std::fs::read_dir(&patterns_dir).ok()?;
    for entry in entries.flatten() {
        let path = entry.path();
        if path.extension().is_some_and(|ext| ext == "txt") {
            return Some(path);
        }
    }
    None
}
/// Locate a ClinVar archive (`*.gz`) under `<ferro_ref>/clinvar`, if any.
///
/// Returns the first `.gz` entry found in directory order, or `None` when the
/// directory is missing, unreadable, or contains no `.gz` file.
pub fn detect_clinvar_from_ferro_reference(ferro_ref: &Path) -> Option<PathBuf> {
    let clinvar_dir = ferro_ref.join("clinvar");
    std::fs::read_dir(&clinvar_dir)
        .ok()?
        .flatten()
        .map(|entry| entry.path())
        .find(|p| p.extension().is_some_and(|ext| ext == "gz"))
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
#[test]
fn test_build_fasta_index() {
// Two records: seq1 spans two 8-base lines (16 bases total), seq2 one
// 8-base line. Verifies name and length columns of the generated .fai.
let dir = TempDir::new().unwrap();
let fasta_path = dir.path().join("test.fna");
let mut f = File::create(&fasta_path).unwrap();
writeln!(f, ">seq1 description").unwrap();
writeln!(f, "ATGCATGC").unwrap();
writeln!(f, "ATGCATGC").unwrap();
writeln!(f, ">seq2").unwrap();
writeln!(f, "GGGGCCCC").unwrap();
build_fasta_index(&fasta_path).unwrap();
let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
assert!(fai_path.exists());
let content = std::fs::read_to_string(&fai_path).unwrap();
let lines: Vec<&str> = content.lines().collect();
assert_eq!(lines.len(), 2);
// Header description must be dropped: name column is "seq1", not the
// full ">seq1 description" header.
assert!(lines[0].starts_with("seq1\t16\t"));
assert!(lines[1].starts_with("seq2\t8\t"));
}
#[test]
fn test_prepare_config_default() {
let config = PrepareConfig::default();
assert_eq!(config.output_dir, PathBuf::from("ferro-reference"));
assert!(config.download_transcripts);
assert!(config.download_genome);
assert!(!config.download_genome_grch37);
assert!(!config.download_refseqgene);
assert!(!config.download_lrg);
assert!(config.download_cdot);
assert!(!config.download_cdot_grch37);
assert!(config.skip_existing);
}
#[test]
fn test_detect_patterns_from_ferro_reference_with_patterns() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let patterns_dir = ferro_ref.join("patterns");
std::fs::create_dir_all(&patterns_dir).unwrap();
let patterns_file = patterns_dir.join("clinvar_patterns.txt");
std::fs::write(
&patterns_file,
"NM_000001.1:c.100A>G\nNM_000002.2:c.200del\n",
)
.unwrap();
let detected = detect_patterns_from_ferro_reference(ferro_ref);
assert!(detected.is_some());
assert_eq!(detected.unwrap(), patterns_file);
}
#[test]
fn test_detect_patterns_from_ferro_reference_no_patterns_dir() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let detected = detect_patterns_from_ferro_reference(ferro_ref);
assert!(detected.is_none());
}
#[test]
fn test_detect_patterns_from_ferro_reference_empty_patterns_dir() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let patterns_dir = ferro_ref.join("patterns");
std::fs::create_dir_all(&patterns_dir).unwrap();
let detected = detect_patterns_from_ferro_reference(ferro_ref);
assert!(detected.is_none());
}
#[test]
fn test_detect_patterns_from_ferro_reference_ignores_non_txt() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let patterns_dir = ferro_ref.join("patterns");
std::fs::create_dir_all(&patterns_dir).unwrap();
std::fs::write(patterns_dir.join("patterns.json"), "{}").unwrap();
std::fs::write(patterns_dir.join("readme.md"), "# Readme").unwrap();
let detected = detect_patterns_from_ferro_reference(ferro_ref);
assert!(detected.is_none());
}
#[test]
fn test_detect_patterns_finds_first_txt_file() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let patterns_dir = ferro_ref.join("patterns");
std::fs::create_dir_all(&patterns_dir).unwrap();
std::fs::write(patterns_dir.join("a_patterns.txt"), "pattern1").unwrap();
std::fs::write(patterns_dir.join("b_patterns.txt"), "pattern2").unwrap();
let detected = detect_patterns_from_ferro_reference(ferro_ref);
assert!(detected.is_some());
let path = detected.unwrap();
assert!(path.extension().map(|e| e == "txt").unwrap_or(false));
}
#[test]
fn test_detect_clinvar_from_ferro_reference_with_clinvar() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let clinvar_dir = ferro_ref.join("clinvar");
std::fs::create_dir_all(&clinvar_dir).unwrap();
let clinvar_file = clinvar_dir.join("hgvs4variation.txt.gz");
std::fs::write(&clinvar_file, "mock clinvar data").unwrap();
let detected = detect_clinvar_from_ferro_reference(ferro_ref);
assert!(detected.is_some());
assert_eq!(detected.unwrap(), clinvar_file);
}
#[test]
fn test_detect_clinvar_from_ferro_reference_no_clinvar_dir() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let detected = detect_clinvar_from_ferro_reference(ferro_ref);
assert!(detected.is_none());
}
#[test]
fn test_detect_clinvar_from_ferro_reference_empty_clinvar_dir() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let clinvar_dir = ferro_ref.join("clinvar");
std::fs::create_dir_all(&clinvar_dir).unwrap();
let detected = detect_clinvar_from_ferro_reference(ferro_ref);
assert!(detected.is_none());
}
#[test]
fn test_detect_clinvar_from_ferro_reference_ignores_non_gz() {
let temp_dir = TempDir::new().unwrap();
let ferro_ref = temp_dir.path();
let clinvar_dir = ferro_ref.join("clinvar");
std::fs::create_dir_all(&clinvar_dir).unwrap();
std::fs::write(clinvar_dir.join("data.txt"), "plain text").unwrap();
std::fs::write(clinvar_dir.join("readme.md"), "# Readme").unwrap();
let detected = detect_clinvar_from_ferro_reference(ferro_ref);
assert!(detected.is_none());
}
}