#![allow(clippy::type_complexity)]
use crate::benchmark::translate::derive_protein_from_transcript;
use crate::coords::hgvs_pos_to_index;
use crate::data::cdot::{CdotMapper, CdotTranscript};
use crate::FerroError;
use rayon::prelude::*;
use serde_json::json;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
/// Read the `NCBI_API_KEY` environment variable, treating an empty value
/// the same as an unset one.
fn get_ncbi_api_key() -> Option<String> {
    match std::env::var("NCBI_API_KEY") {
        Ok(key) if !key.is_empty() => Some(key),
        _ => None,
    }
}
fn build_efetch_url(db: &str, ids: &str, rettype: &str, retmode: &str) -> String {
let base = format!(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={}&id={}&rettype={}&retmode={}",
db, ids, rettype, retmode
);
if let Some(key) = get_ncbi_api_key() {
format!("{}&api_key={}", base, key)
} else {
base
}
}
#[allow(dead_code)]
fn build_efetch_url_with_history(
db: &str,
web_env: &str,
query_key: &str,
retstart: usize,
retmax: usize,
rettype: &str,
retmode: &str,
) -> String {
let base = format!(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={}&WebEnv={}&query_key={}&retstart={}&retmax={}&rettype={}&retmode={}",
db, web_env, query_key, retstart, retmax, rettype, retmode
);
if let Some(key) = get_ncbi_api_key() {
format!("{}&api_key={}", base, key)
} else {
base
}
}
/// POST a list of ids to NCBI `epost` via `curl` and return the
/// `(WebEnv, QueryKey)` pair identifying the uploaded set on the history
/// server, for later batched `efetch` calls.
#[allow(dead_code)]
fn epost_ids(db: &str, ids: &[&String]) -> Result<(String, String), FerroError> {
    use std::process::Command;
    // Minimal XML tag extractor: the text between `open` and `close`.
    fn tag_text(xml: &str, open: &str, close: &str, what: &str) -> Result<String, FerroError> {
        xml.split(open)
            .nth(1)
            .and_then(|rest| rest.split(close).next())
            .map(str::to_string)
            .ok_or_else(|| FerroError::Io {
                msg: format!("Failed to parse {} from epost response", what),
            })
    }
    let url = match get_ncbi_api_key() {
        Some(key) => format!(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi?db={}&api_key={}",
            db, key
        ),
        None => format!(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi?db={}",
            db
        ),
    };
    let joined: Vec<&str> = ids.iter().map(|s| s.as_str()).collect();
    let body = format!("id={}", joined.join(","));
    let output = Command::new("curl")
        .args(["-s", "-X", "POST", "-d", &body, &url])
        .output()
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to POST to NCBI epost: {}", e),
        })?;
    if !output.status.success() {
        return Err(FerroError::Io {
            msg: "NCBI epost request failed".to_string(),
        });
    }
    let response = String::from_utf8_lossy(&output.stdout);
    let web_env = tag_text(&response, "<WebEnv>", "</WebEnv>", "WebEnv")?;
    let query_key = tag_text(&response, "<QueryKey>", "</QueryKey>", "QueryKey")?;
    Ok((web_env, query_key))
}
/// Populate a mutalyzer-compatible file cache (`<acc>.sequence` +
/// `<acc>.annotations` pairs) from a prepared ferro reference directory.
///
/// Pipeline: load the manifest and cdot transcript metadata; gather
/// sequences from transcript FASTA files, the `supplemental/` directory,
/// genome-assembly FASTA (`genome/`), and RefSeqGene FASTA (`refseqgene/`);
/// fetch legacy GenBank records and legacy transcript versions (reusing the
/// ferro reference bundle where possible); write cache entries in parallel;
/// derive protein sequences for coding transcripts; and finally emit a
/// mutalyzer settings file pointing at `cache_dir`.
///
/// `filter_file` optionally restricts processing to a newline-separated
/// accession allow-list (`#` starts a comment line).
///
/// # Errors
/// Returns `FerroError` when the manifest, cdot data, or any required input
/// cannot be read/parsed, or when required output files cannot be written.
/// Per-sequence cache failures are counted and reported, not fatal.
pub fn populate_mutalyzer_cache<P: AsRef<Path>>(
reference_dir: P,
cache_dir: P,
filter_file: Option<P>,
) -> Result<CacheStats, FerroError> {
let reference_dir = reference_dir.as_ref();
let cache_dir = cache_dir.as_ref();
std::fs::create_dir_all(cache_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create cache dir {}: {}", cache_dir.display(), e),
})?;
// Optional accession allow-list; unreadable lines are silently skipped.
let filter_set: Option<HashSet<String>> = if let Some(filter_path) = filter_file {
let filter_path = filter_path.as_ref();
eprintln!("Loading filter list from {}...", filter_path.display());
let file = File::open(filter_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open filter file: {}", e),
})?;
let reader = BufReader::new(file);
let set: HashSet<String> = reader
.lines()
.map_while(Result::ok)
.map(|l| l.trim().to_string())
.filter(|l| !l.is_empty() && !l.starts_with('#'))
.collect();
eprintln!(" Loaded {} accessions to filter", set.len());
Some(set)
} else {
None
};
// The manifest written by 'ferro-benchmark prepare' locates all inputs.
let manifest_path = reference_dir.join("manifest.json");
if !manifest_path.exists() {
return Err(FerroError::Io {
msg: format!(
"Manifest not found at {}. Run 'ferro-benchmark prepare' first.",
manifest_path.display()
),
});
}
let manifest: serde_json::Value = {
let file = File::open(&manifest_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open manifest: {}", e),
})?;
serde_json::from_reader(file).map_err(|e| FerroError::Json {
msg: format!("Failed to parse manifest: {}", e),
})?
};
// cdot supplies transcript structure (exons, CDS bounds, protein ids).
let cdot_relative = manifest
.get("cdot_json")
.and_then(|v| v.as_str())
.ok_or_else(|| FerroError::Io {
msg: "No cdot_json in manifest. Run 'ferro-benchmark prepare' with cdot.".to_string(),
})?;
let cdot_path = reference_dir.join(cdot_relative);
eprintln!("Loading cdot from {}...", cdot_path.display());
let mut cdot = if cdot_relative.ends_with(".gz") {
CdotMapper::from_json_gz(&cdot_path)?
} else {
CdotMapper::from_json_file(&cdot_path)?
};
let transcript_count = cdot.transcript_count();
eprintln!(" Loaded {} transcripts", transcript_count);
// LRG -> RefSeq mapping is optional; absence is not an error.
if let Some(lrg_mapping_relative) = manifest.get("lrg_refseq_mapping").and_then(|v| v.as_str())
{
let path = reference_dir.join(lrg_mapping_relative);
if path.exists() {
let count = cdot.load_lrg_mapping(&path)?;
eprintln!(" Loaded {} LRG to RefSeq mappings", count);
}
}
eprintln!("Loading transcript sequences from FASTA files...");
let mut sequences = load_fasta_sequences_parallel(reference_dir)?;
eprintln!(" Loaded {} transcript sequences", sequences.len());
// Supplemental transcripts (not in cdot) carry their own CDS metadata in a
// side-car JSON file keyed by accession.
let supplemental_dir = reference_dir.join("supplemental");
let mut supplemental_cds: HashMap<String, (Option<u64>, Option<u64>, Option<String>)> =
HashMap::new();
if supplemental_dir.exists() {
let supplemental = load_fasta_sequences_parallel(&supplemental_dir)?;
let count = supplemental.len();
for (id, seq) in supplemental {
sequences.push((id, seq));
}
if count > 0 {
eprintln!(" Loaded {} supplemental transcript sequences", count);
}
let metadata_path = supplemental_dir.join("patterns_transcripts.metadata.json");
if metadata_path.exists() {
let file = File::open(&metadata_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", metadata_path.display(), e),
})?;
let metadata: SupplementalMetadata =
serde_json::from_reader(file).map_err(|e| FerroError::Json {
msg: format!("Failed to parse {}: {}", metadata_path.display(), e),
})?;
for (acc, info) in metadata.transcripts {
supplemental_cds.insert(acc, (info.cds_start, info.cds_end, info.gene_symbol));
}
eprintln!(
" Loaded CDS metadata for {} supplemental transcripts",
supplemental_cds.len()
);
}
}
// Genomic (NC_/NT_/NW_) and RefSeqGene (NG_) sequences join the same pool.
let genome_dir = reference_dir.join("genome");
if genome_dir.exists() {
eprintln!("Loading genomic sequences from {}...", genome_dir.display());
let genomic_seqs = load_genomic_sequences(&genome_dir)?;
eprintln!(" Loaded {} genomic sequences", genomic_seqs.len());
sequences.extend(genomic_seqs);
}
let refseqgene_dir = reference_dir.join("refseqgene");
if refseqgene_dir.exists() {
eprintln!(
"Loading RefSeqGene sequences from {}...",
refseqgene_dir.display()
);
let refseqgene_seqs = load_refseqgene_sequences(&refseqgene_dir)?;
eprintln!(" Loaded {} RefSeqGene sequences", refseqgene_seqs.len());
sequences.extend(refseqgene_seqs);
}
// Legacy sequences are written directly into cache_dir (not `sequences`),
// so they are unaffected by the filter below.
eprintln!("Fetching legacy GenBank sequences...");
let legacy_accessions: Vec<String> = LEGACY_GENBANK_ACCESSIONS
.iter()
.map(|s| s.to_string())
.collect();
let legacy_fetched =
fetch_legacy_genbank_sequences(&legacy_accessions, cache_dir, Some(reference_dir))?;
eprintln!(" Fetched {} legacy GenBank sequences", legacy_fetched);
eprintln!("Fetching legacy transcript versions...");
let legacy_transcripts_fetched =
fetch_legacy_transcript_versions(cache_dir, Some(reference_dir))?;
eprintln!(
" Fetched {} legacy transcript versions",
legacy_transcripts_fetched
);
// Apply the optional accession filter to the pooled sequences.
let sequences_to_process: Vec<_> = if let Some(ref filter) = filter_set {
sequences
.into_iter()
.filter(|(id, _)| filter.contains(id))
.collect()
} else {
sequences.into_iter().collect()
};
let total_to_process = sequences_to_process.len();
eprintln!(
"Creating {} mutalyzer cache entries ({} threads)...",
total_to_process,
rayon::current_num_threads()
);
// Cache-entry creation is embarrassingly parallel; progress and errors are
// tracked with atomic counters.
let created = AtomicUsize::new(0);
let errors = AtomicUsize::new(0);
sequences_to_process.par_iter().for_each(|(tx_id, seq)| {
let result = create_cache_entry(cache_dir, tx_id, seq, cdot.get_transcript(tx_id));
if result.is_ok() {
let count = created.fetch_add(1, Ordering::Relaxed) + 1;
if count.is_multiple_of(50000) {
eprintln!(" Created {} cache entries...", count);
}
} else {
errors.fetch_add(1, Ordering::Relaxed);
}
});
let cache_entries_created = created.load(Ordering::Relaxed);
let error_count = errors.load(Ordering::Relaxed);
eprintln!(
"Created {} cache entries in {} ({} errors)",
cache_entries_created,
cache_dir.display(),
error_count
);
// Derive protein sequences for coding transcripts (NM_/XM_) by translating
// the CDS; protein accession is taken from cdot or inferred by prefix swap.
eprintln!("Deriving protein sequences from transcripts...");
let mut proteins_derived = 0usize;
let mut proteins_skipped = 0usize;
for (tx_id, seq) in &sequences_to_process {
if !tx_id.starts_with("NM_") && !tx_id.starts_with("XM_") {
continue;
}
let (cds_start, cds_end, protein_id) = if let Some(tx) = cdot.get_transcript(tx_id) {
// cdot CDS bounds appear to be 0-based half-open already.
let start = tx.cds_start.unwrap_or(0) as usize;
let end = tx.cds_end.unwrap_or(seq.len() as u64) as usize;
let prot_id = tx
.protein
.clone()
.unwrap_or_else(|| tx_id.replace("NM_", "NP_").replace("XM_", "XP_"));
(start, end, prot_id)
} else if let Some((start, end, _gene)) = supplemental_cds.get(tx_id) {
// Supplemental cds_start appears to be 1-based (converted here);
// NOTE(review): `s - 1` would underflow if a metadata entry ever
// carried cds_start == 0 — TODO confirm the metadata invariant.
let start = start.map(|s| (s - 1) as usize).unwrap_or(0);
let end = end.map(|e| e as usize).unwrap_or(seq.len());
let prot_id = tx_id.replace("NM_", "NP_").replace("XM_", "XP_");
(start, end, prot_id)
} else {
continue; };
let prot_seq_path = cache_dir.join(format!("{}.sequence", protein_id));
if prot_seq_path.exists() {
proteins_skipped += 1;
continue;
}
if let Some(protein_seq) = derive_protein_from_transcript(seq, cds_start, cds_end) {
// Write failures for derived proteins are warnings, not fatal.
if let Err(e) = std::fs::write(&prot_seq_path, &protein_seq) {
eprintln!(
" Warning: Failed to write {}: {}",
prot_seq_path.display(),
e
);
continue;
}
let annotations = build_protein_annotations(&protein_id, protein_seq.len());
let ann_path = cache_dir.join(format!("{}.annotations", protein_id));
if let Err(e) = std::fs::write(
&ann_path,
serde_json::to_string(&annotations).unwrap_or_default(),
) {
eprintln!(" Warning: Failed to write {}: {}", ann_path.display(), e);
continue;
}
proteins_derived += 1;
}
}
eprintln!(
" Derived {} protein sequences ({} already cached)",
proteins_derived, proteins_skipped
);
// Settings file consumed by mutalyzer to locate this cache.
let settings_path = cache_dir.join("mutalyzer_settings.conf");
let settings_content = format!(
"MUTALYZER_CACHE_DIR = {}\nMUTALYZER_FILE_CACHE_ADD = false\n",
cache_dir.display()
);
std::fs::write(&settings_path, settings_content).map_err(|e| FerroError::Io {
msg: format!("Failed to write settings: {}", e),
})?;
eprintln!("Settings file: {}", settings_path.display());
Ok(CacheStats {
transcripts_processed: total_to_process,
sequences_matched: total_to_process,
cache_entries_created,
legacy_sequences_fetched: legacy_fetched + legacy_transcripts_fetched,
})
}
/// Write the `<tx_id>.sequence` / `<tx_id>.annotations` pair for one
/// transcript into `cache_dir`. Idempotent: when both files already exist
/// nothing is rewritten. Annotations come from cdot metadata when
/// available, otherwise a minimal record is synthesized.
fn create_cache_entry(
    cache_dir: &Path,
    tx_id: &str,
    seq: &str,
    tx_data: Option<&CdotTranscript>,
) -> Result<(), FerroError> {
    let seq_path = cache_dir.join(format!("{}.sequence", tx_id));
    let ann_path = cache_dir.join(format!("{}.annotations", tx_id));
    // A complete entry (both files present) is left untouched.
    if seq_path.exists() && ann_path.exists() {
        return Ok(());
    }
    std::fs::write(&seq_path, seq).map_err(|e| FerroError::Io {
        msg: format!("Failed to write {}: {}", seq_path.display(), e),
    })?;
    let annotations = match tx_data {
        Some(tx) => build_annotations_from_cdot(tx_id, seq.len(), tx),
        None => build_minimal_annotations(tx_id, seq.len()),
    };
    let ann_file = File::create(&ann_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to create {}: {}", ann_path.display(), e),
    })?;
    serde_json::to_writer(ann_file, &annotations).map_err(|e| FerroError::Io {
        msg: format!("Failed to write annotations: {}", e),
    })?;
    Ok(())
}
/// Summary counters returned by `populate_mutalyzer_cache`.
#[derive(Debug, Clone)]
pub struct CacheStats {
/// Sequences selected for processing (after any filter list was applied).
pub transcripts_processed: usize,
/// Currently always equal to `transcripts_processed`.
pub sequences_matched: usize,
/// Cache entries successfully ensured (written or already present).
pub cache_entries_created: usize,
/// Legacy GenBank records plus legacy transcript versions obtained.
pub legacy_sequences_fetched: usize,
}
/// Parse one FASTA file into an id -> sequence map. The id is the first
/// whitespace-separated token of each `>` header; records that end up with
/// an empty sequence are dropped, and duplicate ids keep the last record.
fn load_fasta_sequences(fasta_path: &Path) -> Result<HashMap<String, String>, FerroError> {
    let file = File::open(fasta_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to open {}: {}", fasta_path.display(), e),
    })?;
    let mut sequences = HashMap::new();
    // The record currently being accumulated: (id, sequence-so-far).
    let mut record: Option<(String, String)> = None;
    for line in BufReader::new(file).lines() {
        let line = line.map_err(|e| FerroError::Io {
            msg: format!("Failed to read line: {}", e),
        })?;
        match line.strip_prefix('>') {
            Some(header) => {
                // Flush the previous record before starting a new one.
                if let Some((id, seq)) = record.take() {
                    if !seq.is_empty() {
                        sequences.insert(id, seq);
                    }
                }
                record = header
                    .split_whitespace()
                    .next()
                    .map(|id| (id.to_string(), String::new()));
            }
            None => {
                // Body lines are ignored until a usable header is seen.
                if let Some((_, seq)) = record.as_mut() {
                    seq.push_str(line.trim());
                }
            }
        }
    }
    // Flush the final record of the file.
    if let Some((id, seq)) = record {
        if !seq.is_empty() {
            sequences.insert(id, seq);
        }
    }
    Ok(sequences)
}
/// Collect all FASTA sequences found in `dir` and its immediate
/// subdirectories into `(id, sequence)` pairs.
///
/// Files are recognized by extension (`.fna`, `.fa`, `.fasta`); record ids
/// are the first whitespace-separated token of each `>` header; records
/// with empty sequences are dropped and duplicate ids are deduplicated
/// (last record parsed wins). Despite the historical name, parsing is
/// sequential; callers parallelize over the returned pairs.
///
/// # Errors
/// Fails when `dir` itself cannot be listed, a FASTA file cannot be
/// opened, or a line cannot be read. Subdirectory listing errors are
/// ignored (matching previous behavior).
fn load_fasta_sequences_parallel<P: AsRef<Path>>(
    dir: P,
) -> Result<Vec<(String, String)>, FerroError> {
    // Shared extension check (previously duplicated in two scans).
    fn is_fasta(path: &Path) -> bool {
        path.extension()
            .is_some_and(|e| e == "fna" || e == "fa" || e == "fasta")
    }
    let dir = dir.as_ref();
    // Single directory scan: FASTA files at the top level, plus one level
    // of subdirectories. (The original listed `dir` twice.)
    let mut fasta_files: Vec<std::path::PathBuf> = Vec::new();
    for entry in std::fs::read_dir(dir).map_err(|e| FerroError::Io {
        msg: format!("Failed to read {}: {}", dir.display(), e),
    })? {
        let Ok(entry) = entry else { continue };
        let path = entry.path();
        if path.is_dir() {
            if let Ok(subdir) = std::fs::read_dir(&path) {
                fasta_files.extend(
                    subdir
                        .flatten()
                        .map(|sub| sub.path())
                        .filter(|p| is_fasta(p)),
                );
            }
        } else if is_fasta(&path) {
            fasta_files.push(path);
        }
    }
    // HashMap deduplicates ids across files.
    let mut sequences: HashMap<String, String> = HashMap::new();
    for fasta_path in fasta_files {
        let file = File::open(&fasta_path).map_err(|e| FerroError::Io {
            msg: format!("Failed to open {}: {}", fasta_path.display(), e),
        })?;
        let reader = BufReader::new(file);
        let mut current_id: Option<String> = None;
        let mut current_seq = String::new();
        for line in reader.lines() {
            let line = line.map_err(|e| FerroError::Io {
                msg: format!("Failed to read line: {}", e),
            })?;
            if let Some(header) = line.strip_prefix('>') {
                // Flush the previous record before starting a new one.
                if let Some(id) = current_id.take() {
                    if !current_seq.is_empty() {
                        sequences.insert(id, std::mem::take(&mut current_seq));
                    }
                }
                current_id = header.split_whitespace().next().map(|s| s.to_string());
            } else if current_id.is_some() {
                current_seq.push_str(line.trim());
            }
        }
        // Flush the final record of the file.
        if let Some(id) = current_id {
            if !current_seq.is_empty() {
                sequences.insert(id, current_seq);
            }
        }
    }
    Ok(sequences.into_iter().collect())
}
/// Build a mutalyzer-style annotation document for one transcript from cdot
/// metadata: a top-level `record` wrapping a `gene` feature, which wraps
/// the transcript feature, which holds an optional CDS followed by one
/// feature per exon. Everything is reported on the plus strand in
/// transcript coordinates.
fn build_annotations_from_cdot(
    tx_id: &str,
    seq_len: usize,
    tx: &CdotTranscript,
) -> serde_json::Value {
    let gene_name = tx.gene_name.clone().unwrap_or_else(|| tx_id.to_string());
    // Classify by accession prefix; unknown prefixes default to coding mRNA.
    let starts = |p: &str| tx_id.starts_with(p);
    let (mol_type, feature_type, is_coding) = if starts("NM_") || starts("XM_") {
        ("mRNA", "mRNA", true)
    } else if starts("NR_") || starts("XR_") {
        ("ncRNA", "ncRNA", false)
    } else if starts("NG_") || starts("NC_") || starts("LRG_") {
        ("genomic DNA", "gene", false)
    } else {
        ("mRNA", "mRNA", true)
    };
    // One feature per exon; exon[2]/exon[3] carry transcript-space start/end
    // (start converted from an HGVS position to a 0-based index).
    let exon_features: Vec<serde_json::Value> = tx
        .exons
        .iter()
        .enumerate()
        .map(|(i, exon)| {
            let tx_start = hgvs_pos_to_index(exon[2]);
            let tx_end = exon[3] as usize;
            json!({
                "type": "exon",
                "location": {
                    "type": "range",
                    "start": {"type": "point", "position": tx_start},
                    "end": {"type": "point", "position": tx_end},
                    "strand": 1
                },
                "id": format!("id-{}-{}", gene_name, i + 1)
            })
        })
        .collect();
    // The CDS (when coding and bounds are known) precedes the exon list.
    let mut mrna_features: Vec<serde_json::Value> = Vec::new();
    if is_coding && tx.cds_start.is_some() && tx.cds_end.is_some() {
        let cds_start = tx.cds_start.map(|p| p as usize).unwrap_or(0);
        let cds_end = tx.cds_end.map(|p| p as usize).unwrap_or(seq_len);
        let protein_id = tx
            .protein
            .clone()
            .unwrap_or_else(|| format!("{}_protein", tx_id));
        mrna_features.push(json!({
            "type": "CDS",
            "location": {
                "type": "range",
                "start": {"type": "point", "position": cds_start},
                "end": {"type": "point", "position": cds_end},
                "strand": 1
            },
            "id": protein_id,
            "features": []
        }));
    }
    mrna_features.extend(exon_features);
    json!({
        "id": tx_id,
        "type": "record",
        "location": {
            "type": "range",
            "start": {"type": "point", "position": 0},
            "end": {"type": "point", "position": seq_len}
        },
        "qualifiers": {
            "name": gene_name,
            "mol_type": mol_type
        },
        "features": [{
            "type": "gene",
            "location": {
                "type": "range",
                "start": {"type": "point", "position": 0},
                "end": {"type": "point", "position": seq_len},
                "strand": 1
            },
            "id": gene_name.clone(),
            "qualifiers": {"name": gene_name},
            "features": [{
                "id": tx_id,
                "type": feature_type,
                "location": {
                    "type": "range",
                    "start": {"type": "point", "position": 0},
                    "end": {"type": "point", "position": seq_len}
                },
                "features": mrna_features
            }]
        }]
    })
}
/// Stream genome-assembly FASTA files from `dir`, keeping only sequences
/// with a genomic accession (`NC_`/`NT_`/`NW_`; see `is_genomic_accession`).
/// Candidate files are selected by FASTA extension plus a filename
/// heuristic (`GCF_`/`GRCh` prefix or a "genomic" substring). Returns
/// `(accession, sequence)` pairs; skipped records are counted per file and
/// reported on stderr.
fn load_genomic_sequences<P: AsRef<Path>>(dir: P) -> Result<Vec<(String, String)>, FerroError> {
let dir = dir.as_ref();
let mut sequences = Vec::new();
let fasta_files: Vec<_> = std::fs::read_dir(dir)
.map_err(|e| FerroError::Io {
msg: format!("Failed to read {}: {}", dir.display(), e),
})?
.filter_map(|e| e.ok())
.map(|e| e.path())
.filter(|p| {
p.extension()
.is_some_and(|e| e == "fna" || e == "fa" || e == "fasta")
&& p.file_name().and_then(|n| n.to_str()).is_some_and(|n| {
n.starts_with("GCF_") || n.starts_with("GRCh") || n.contains("genomic")
})
})
.collect();
for fasta_path in fasta_files {
eprintln!(" Reading {}...", fasta_path.display());
let file = File::open(&fasta_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", fasta_path.display(), e),
})?;
// Large read buffer: assembly FASTA files are very large.
let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
let mut current_id: Option<String> = None;
let mut current_seq = String::new();
let mut skipped = 0usize;
for line in reader.lines() {
let line = line.map_err(|e| FerroError::Io {
msg: format!("Failed to read line: {}", e),
})?;
if let Some(header) = line.strip_prefix('>') {
// Flush the previous record; non-genomic ones are only counted.
if let Some(id) = current_id.take() {
if is_genomic_accession(&id) && !current_seq.is_empty() {
sequences.push((id, std::mem::take(&mut current_seq)));
} else {
skipped += 1;
current_seq.clear();
}
}
// Accumulate body lines only for genomic accessions; otherwise
// `current_id` stays `None` and the record body is ignored.
let id = header.split_whitespace().next().map(|s| s.to_string());
if let Some(ref acc) = id {
if is_genomic_accession(acc) {
current_id = id;
} else {
current_id = None;
}
}
} else if current_id.is_some() {
current_seq.push_str(line.trim());
}
}
// Flush the final record of the file.
if let Some(id) = current_id {
if is_genomic_accession(&id) && !current_seq.is_empty() {
sequences.push((id, current_seq));
} else {
skipped += 1;
}
}
if skipped > 0 {
eprintln!(" Skipped {} non-genomic sequences", skipped);
}
}
Ok(sequences)
}
/// True for RefSeq genomic accessions: chromosomes (`NC_`) and
/// contigs/scaffolds (`NT_`, `NW_`).
fn is_genomic_accession(acc: &str) -> bool {
    ["NC_", "NT_", "NW_"].iter().any(|p| acc.starts_with(p))
}
/// Ensure mutalyzer cache entries exist in `output_dir` for a set of legacy
/// GenBank accessions.
///
/// Resolution order per accession: (1) an existing `.sequence` file is
/// counted and skipped; (2) the sequence is copied from the ferro reference
/// bundle (`supplemental/legacy_genbank.fna`) when available; (3) the FASTA
/// is fetched from NCBI efetch via an external `curl` process, throttled to
/// roughly 3 requests/second. Minimal annotations are written next to each
/// sequence. Returns the number of accessions available afterwards; fetch
/// failures are warned about and skipped, not fatal.
pub fn fetch_legacy_genbank_sequences(
accessions: &[String],
output_dir: &Path,
ferro_reference: Option<&Path>,
) -> Result<usize, FerroError> {
use std::process::Command;
if accessions.is_empty() {
return Ok(0);
}
std::fs::create_dir_all(output_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create legacy dir: {}", e),
})?;
// Optional local source: the ferro reference may bundle these sequences,
// avoiding network fetches entirely.
let ferro_genbank_sequences: HashMap<String, String> = if let Some(ferro_ref) = ferro_reference
{
let genbank_fasta = ferro_ref.join("supplemental/legacy_genbank.fna");
if genbank_fasta.exists() {
load_fasta_sequences(&genbank_fasta).unwrap_or_default()
} else {
HashMap::new()
}
} else {
HashMap::new()
};
if !ferro_genbank_sequences.is_empty() {
eprintln!(
" Found {} GenBank sequences in ferro reference",
ferro_genbank_sequences.len()
);
}
let mut fetched = 0;
let mut from_ferro = 0;
for acc in accessions {
let seq_path = output_dir.join(format!("{}.sequence", acc));
// Already cached: count it and move on.
if seq_path.exists() {
fetched += 1;
continue;
}
// Prefer the bundled copy over a network fetch.
if let Some(sequence) = ferro_genbank_sequences.get(acc) {
std::fs::write(&seq_path, sequence).map_err(|e| FerroError::Io {
msg: format!("Failed to write {}: {}", acc, e),
})?;
let annotations = build_minimal_annotations(acc, sequence.len());
let ann_path = output_dir.join(format!("{}.annotations", acc));
let ann_file = std::fs::File::create(&ann_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create annotations for {}: {}", acc, e),
})?;
serde_json::to_writer(ann_file, &annotations).map_err(|e| FerroError::Io {
msg: format!("Failed to write annotations for {}: {}", acc, e),
})?;
from_ferro += 1;
fetched += 1;
continue;
}
// Last resort: fetch the FASTA from NCBI with curl.
eprintln!(" Fetching {} from NCBI...", acc);
let url = build_efetch_url("nuccore", acc, "fasta", "text");
let output = Command::new("curl")
.args(["-s", "-L", &url])
.output()
.map_err(|e| FerroError::Io {
msg: format!("Failed to fetch {}: {}", acc, e),
})?;
if !output.status.success() {
eprintln!(" Warning: Failed to fetch {}", acc);
continue;
}
// Concatenate sequence lines, dropping the FASTA header.
let fasta = String::from_utf8_lossy(&output.stdout);
let mut seq = String::new();
for line in fasta.lines() {
if !line.starts_with('>') && !line.is_empty() {
seq.push_str(line.trim());
}
}
if seq.is_empty() {
eprintln!(" Warning: Empty sequence for {}", acc);
continue;
}
std::fs::write(&seq_path, &seq).map_err(|e| FerroError::Io {
msg: format!("Failed to write {}: {}", seq_path.display(), e),
})?;
let ann_path = output_dir.join(format!("{}.annotations", acc));
let annotations = build_minimal_annotations(acc, seq.len());
let mut ann_file = File::create(&ann_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", ann_path.display(), e),
})?;
serde_json::to_writer(&mut ann_file, &annotations).map_err(|e| FerroError::Io {
msg: format!("Failed to write annotations: {}", e),
})?;
fetched += 1;
// Stay under NCBI's ~3 requests/second unauthenticated rate limit.
std::thread::sleep(std::time::Duration::from_millis(350));
}
if from_ferro > 0 {
eprintln!(
" Loaded {} GenBank sequences from ferro reference",
from_ferro
);
}
Ok(fetched)
}
/// Legacy (pre-RefSeq) GenBank accessions still referenced by published
/// HGVS descriptions; cached via `fetch_legacy_genbank_sequences`.
pub const LEGACY_GENBANK_ACCESSIONS: &[&str] = &[
"U31929.1",
"M75126.1",
"M22590.1",
"AJ298105.1",
"AF319569.1",
];
/// Superseded RefSeq transcript versions that benchmark patterns still cite,
/// as `(accession, optional fallback accession)` pairs. The fallback is
/// cached in place of the primary when NCBI no longer serves it
/// (see `fetch_legacy_transcript_versions`).
pub const LEGACY_TRANSCRIPT_VERSIONS: &[(&str, Option<&str>)] = &[
("NM_000051.3", None), ("NM_000174.4", None), ("NM_000246.3", None), ("NM_000267.3", None), ("NM_000280.5", None), ("NM_000426.3", None), ("NM_000449.3", None), ("NM_001127221.1", None), ("NM_001271223.2", None), ("NM_001458.4", None), ("NM_002529.3", None), ("NM_002878.3", None), ("NM_004006.2", None), ("NM_004082.4", None), ("NM_004321.7", None), ("NM_004364.4", None), ("NM_015125.4", None), ("NM_015243.2", None), ("NM_015247.2", None), ("NM_017890.4", None), ("NM_018400.3", None), ("NM_018993.3", None), ("NM_019023.4", None), ("NM_021946.4", None), ("NM_030962.3", None), ("NM_030984.5", None), ("NM_032790.3", None), ("NM_033517.2", Some("NM_033517.1")), ("NM_138387.3", None), ("NM_152383.4", None), ("NM_172201.1", None), ("NM_177438.2", None), ("NM_203447.3", None), ];
/// Ensure cache entries exist for every accession in
/// `LEGACY_TRANSCRIPT_VERSIONS`, fetching GenBank flat-file records (which
/// supply CDS bounds and gene symbol, unlike plain FASTA).
///
/// Per accession: reuse an existing `.sequence`; else copy from the ferro
/// reference bundle (`supplemental/legacy_transcripts.fna`); else fetch the
/// GenBank record from NCBI via `curl`. When a fetch fails and a fallback
/// accession is declared, the fallback is fetched/cached and the requested
/// accession is linked to it (symlink on unix, file copy elsewhere).
/// Network fetches are throttled (~3 req/s). Returns the number of
/// accessions made available; individual failures warn and continue.
pub fn fetch_legacy_transcript_versions(
output_dir: &Path,
ferro_reference: Option<&Path>,
) -> Result<usize, FerroError> {
use std::process::Command;
let mut fetched = 0;
let mut from_ferro = 0;
// Optional local source bundled with the ferro reference.
let ferro_legacy_sequences: HashMap<String, String> = if let Some(ferro_ref) = ferro_reference {
let legacy_fasta = ferro_ref.join("supplemental/legacy_transcripts.fna");
if legacy_fasta.exists() {
load_fasta_sequences(&legacy_fasta).unwrap_or_default()
} else {
HashMap::new()
}
} else {
HashMap::new()
};
if !ferro_legacy_sequences.is_empty() {
eprintln!(
" Found {} legacy transcripts in ferro reference",
ferro_legacy_sequences.len()
);
}
for (accession, fallback) in LEGACY_TRANSCRIPT_VERSIONS {
let seq_path = output_dir.join(format!("{}.sequence", accession));
// Already cached: count and skip.
if seq_path.exists() {
fetched += 1;
continue;
}
// Bundled copy: write sequence + minimal annotations, no network.
if let Some(sequence) = ferro_legacy_sequences.get(*accession) {
std::fs::write(&seq_path, sequence).map_err(|e| FerroError::Io {
msg: format!("Failed to write {}: {}", accession, e),
})?;
let annotations = build_minimal_annotations(accession, sequence.len());
let ann_path = output_dir.join(format!("{}.annotations", accession));
let ann_file = std::fs::File::create(&ann_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create annotations for {}: {}", accession, e),
})?;
serde_json::to_writer(ann_file, &annotations).map_err(|e| FerroError::Io {
msg: format!("Failed to write annotations for {}: {}", accession, e),
})?;
from_ferro += 1;
fetched += 1;
continue;
}
eprintln!(" Fetching {} from NCBI...", accession);
let url = build_efetch_url("nuccore", accession, "gb", "text");
let output = Command::new("curl")
.args(["-s", "-L", &url])
.output()
.map_err(|e| FerroError::Io {
msg: format!("Failed to fetch {}: {}", accession, e),
})?;
// Heuristic failure detection: curl status plus tell-tale strings in the
// body; a well-formed GenBank record always contains a LOCUS line.
let genbank_text = String::from_utf8_lossy(&output.stdout);
let fetch_failed = !output.status.success()
|| genbank_text.contains("Error")
|| genbank_text.contains("Failed to retrieve")
|| !genbank_text.contains("LOCUS");
if fetch_failed {
if let Some(fallback_acc) = fallback {
eprintln!(
" {} not available, using fallback {}",
accession, fallback_acc
);
// Cache the fallback itself first (if not already present).
let fallback_seq_path = output_dir.join(format!("{}.sequence", fallback_acc));
if !fallback_seq_path.exists() {
let fallback_url = build_efetch_url("nuccore", fallback_acc, "gb", "text");
let fallback_output = Command::new("curl")
.args(["-s", "-L", &fallback_url])
.output()
.map_err(|e| FerroError::Io {
msg: format!("Failed to fetch fallback {}: {}", fallback_acc, e),
})?;
let fallback_genbank = String::from_utf8_lossy(&fallback_output.stdout);
if let Some((seq, cds_start, cds_end, gene_name)) =
parse_genbank_record(&fallback_genbank)
{
write_cache_entry(
output_dir,
fallback_acc,
&seq,
cds_start,
cds_end,
&gene_name,
)?;
}
std::thread::sleep(std::time::Duration::from_millis(350));
}
// Point the requested accession at the fallback's files. Symlinks
// use relative targets so the cache directory stays relocatable;
// failures are deliberately ignored (best effort).
let ann_path = output_dir.join(format!("{}.annotations", accession));
let fallback_seq = format!("{}.sequence", fallback_acc);
let fallback_ann = format!("{}.annotations", fallback_acc);
#[cfg(unix)]
{
use std::os::unix::fs::symlink;
let _ = symlink(&fallback_seq, &seq_path);
let _ = symlink(&fallback_ann, &ann_path);
}
#[cfg(not(unix))]
{
let _ = std::fs::copy(output_dir.join(&fallback_seq), &seq_path);
let _ = std::fs::copy(output_dir.join(&fallback_ann), &ann_path);
}
fetched += 1;
} else {
eprintln!(" Warning: Failed to fetch {}", accession);
}
continue;
}
// Successful fetch: parse sequence + CDS + gene and write the entry.
if let Some((seq, cds_start, cds_end, gene_name)) = parse_genbank_record(&genbank_text) {
write_cache_entry(output_dir, accession, &seq, cds_start, cds_end, &gene_name)?;
fetched += 1;
} else {
eprintln!(" Warning: Failed to parse GenBank for {}", accession);
}
// Stay under NCBI's unauthenticated rate limit.
std::thread::sleep(std::time::Duration::from_millis(350));
}
if from_ferro > 0 {
eprintln!(
" Loaded {} legacy transcripts from ferro reference",
from_ferro
);
}
Ok(fetched)
}
/// Extract `(sequence, cds_start, cds_end, gene_name)` from a GenBank
/// flat-file record.
///
/// The sequence is the uppercased concatenation of the alphabetic chunks on
/// lines between `ORIGIN` and `//`. The gene symbol is the last
/// `/gene="..."` qualifier seen; the CDS bounds come from the last `CDS`
/// feature line (for `join(...)` locations, its first segment). Bounds are
/// converted from 1-based GenBank positions via `hgvs_pos_to_index`.
/// Returns `None` when no sequence lines are found.
fn parse_genbank_record(genbank: &str) -> Option<(String, Option<usize>, Option<usize>, String)> {
    // Parse a "123..456" span; numeric trimming drops `<`/`>` qualifiers.
    // Returns None (leaving previous bounds untouched) when ".." is absent.
    fn parse_span(loc: &str) -> Option<(Option<usize>, Option<usize>)> {
        let (s, e) = loc.split_once("..")?;
        Some((
            s.trim_matches(|c: char| !c.is_numeric()).parse().ok(),
            e.trim_matches(|c: char| !c.is_numeric()).parse().ok(),
        ))
    }
    let mut sequence = String::new();
    let mut in_origin = false;
    let mut gene_name = String::new();
    let mut cds_start: Option<usize> = None;
    let mut cds_end: Option<usize> = None;
    for line in genbank.lines() {
        // The ORIGIN marker switches us into sequence-reading mode.
        if line.starts_with("ORIGIN") {
            in_origin = true;
            continue;
        }
        if in_origin {
            if line.starts_with("//") {
                break;
            }
            // Sequence lines look like "   61 acgt acgt ..."; keep only the
            // purely alphabetic chunks, uppercased.
            for chunk in line.split_whitespace() {
                if chunk.chars().all(|c| c.is_ascii_alphabetic()) {
                    sequence.push_str(&chunk.to_uppercase());
                }
            }
            continue;
        }
        // Feature section: remember the most recent gene symbol.
        if line.contains("/gene=") {
            if let Some(start) = line.find("/gene=\"") {
                let rest = &line[start + 7..];
                if let Some(end) = rest.find('"') {
                    gene_name = rest[..end].to_string();
                }
            }
        }
        // CDS feature line: second whitespace field is the location.
        if line.trim_start().starts_with("CDS") {
            let fields: Vec<&str> = line.split_whitespace().collect();
            if let Some(&loc) = fields.get(1) {
                let span = if let Some(inner) = loc.strip_prefix("join(") {
                    inner.split(',').next().and_then(parse_span)
                } else {
                    parse_span(loc)
                };
                if let Some((s, e)) = span {
                    cds_start = s;
                    cds_end = e;
                }
            }
        }
    }
    if sequence.is_empty() {
        return None;
    }
    // Convert 1-based GenBank coordinates to 0-based indices.
    // NOTE(review): the inclusive end goes through the same shift as the
    // start — mirrors long-standing behavior; confirm intent if touched.
    let cds_start = cds_start.map(|p| hgvs_pos_to_index(p as u64));
    let cds_end = cds_end.map(|p| hgvs_pos_to_index(p as u64));
    Some((sequence, cds_start, cds_end, gene_name))
}
/// Write a `<accession>.sequence` file plus a mutalyzer-style annotation
/// tree (`record` > `gene` > `mRNA` > `CDS`) into `output_dir`, assuming a
/// plus-strand mRNA spanning the whole sequence. Missing CDS bounds default
/// to the full sequence; an empty `gene_name` falls back to the accession.
fn write_cache_entry(
output_dir: &Path,
accession: &str,
sequence: &str,
cds_start: Option<usize>,
cds_end: Option<usize>,
gene_name: &str,
) -> Result<(), FerroError> {
let seq_path = output_dir.join(format!("{}.sequence", accession));
std::fs::write(&seq_path, sequence).map_err(|e| FerroError::Io {
msg: format!("Failed to write {}: {}", seq_path.display(), e),
})?;
// Gene id defaults to the accession when no symbol was parsed.
let gene = if gene_name.is_empty() {
accession.to_string()
} else {
gene_name.to_string()
};
let seq_len = sequence.len();
// CDS defaults to the full sequence when bounds are unknown.
let cds_s = cds_start.unwrap_or(0);
let cds_e = cds_end.unwrap_or(seq_len);
let annotations = json!({
"id": accession,
"type": "record",
"location": {
"type": "range",
"start": {"type": "point", "position": 0},
"end": {"type": "point", "position": seq_len}
},
"qualifiers": {
"name": gene,
"mol_type": "mRNA"
},
"features": [{
"type": "gene",
"location": {
"type": "range",
"start": {"type": "point", "position": 0},
"end": {"type": "point", "position": seq_len},
"strand": 1
},
"id": gene,
"qualifiers": {"name": gene.clone()},
"features": [{
"id": accession,
"type": "mRNA",
"location": {
"type": "range",
"start": {"type": "point", "position": 0},
"end": {"type": "point", "position": seq_len}
},
"features": [{
"type": "CDS",
"id": format!("{}_protein", accession),
"location": {
"type": "range",
"start": {"type": "point", "position": cds_s},
"end": {"type": "point", "position": cds_e},
"strand": 1
},
"features": []
}]
}]
}]
});
let ann_path = output_dir.join(format!("{}.annotations", accession));
let ann_file = File::create(&ann_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", ann_path.display(), e),
})?;
serde_json::to_writer(ann_file, &annotations).map_err(|e| FerroError::Io {
msg: format!("Failed to write annotations: {}", e),
})?;
Ok(())
}
/// Stream RefSeqGene FASTA files from `dir`, keeping only `NG_` records.
/// Candidate files are selected by FASTA extension plus a filename
/// heuristic ("refseqgene" substring or `NG_` prefix). Returns
/// `(accession, sequence)` pairs.
fn load_refseqgene_sequences<P: AsRef<Path>>(dir: P) -> Result<Vec<(String, String)>, FerroError> {
let dir = dir.as_ref();
let mut sequences = Vec::new();
let fasta_files: Vec<_> = std::fs::read_dir(dir)
.map_err(|e| FerroError::Io {
msg: format!("Failed to read {}: {}", dir.display(), e),
})?
.filter_map(|e| e.ok())
.map(|e| e.path())
.filter(|p| {
p.extension()
.is_some_and(|e| e == "fna" || e == "fa" || e == "fasta")
&& p.file_name()
.and_then(|n| n.to_str())
.is_some_and(|n| n.contains("refseqgene") || n.starts_with("NG_"))
})
.collect();
for fasta_path in fasta_files {
eprintln!(
" Reading {}...",
fasta_path.file_name().unwrap_or_default().to_string_lossy()
);
let file = File::open(&fasta_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", fasta_path.display(), e),
})?;
// Buffered: RefSeqGene bundles are large multi-record files.
let reader = BufReader::with_capacity(4 * 1024 * 1024, file);
let mut current_id: Option<String> = None;
let mut current_seq = String::new();
for line in reader.lines() {
let line = line.map_err(|e| FerroError::Io {
msg: format!("Failed to read line: {}", e),
})?;
if let Some(header) = line.strip_prefix('>') {
// Flush the previous record (kept only if it is an NG_ accession).
if let Some(id) = current_id.take() {
if id.starts_with("NG_") && !current_seq.is_empty() {
sequences.push((id, std::mem::take(&mut current_seq)));
} else {
current_seq.clear();
}
}
// Accumulate body lines only for NG_ records; otherwise
// `current_id` stays `None` and the record body is ignored.
let id = header.split_whitespace().next().map(|s| s.to_string());
if let Some(ref acc) = id {
if acc.starts_with("NG_") {
current_id = id;
} else {
current_id = None;
}
}
} else if current_id.is_some() {
current_seq.push_str(line.trim());
}
}
// Flush the final record of the file.
if let Some(id) = current_id {
if id.starts_with("NG_") && !current_seq.is_empty() {
sequences.push((id, current_seq));
}
}
}
Ok(sequences)
}
/// Scan an HGVS pattern string for embedded sequence accessions and return
/// them in order of first appearance, deduplicated.
///
/// Recognized forms: versioned RefSeq accessions (NC_/NG_/NM_/NR_/NP_/XM_/
/// XR_/XP_/NT_/NW_/CM_ prefixes), LRG accessions (optionally with `t`/`p`
/// selectors), and legacy two-letter GenBank accessions (XX######.v). The
/// scan is byte-wise; non-ASCII bytes are skipped one at a time, which also
/// guarantees that every `&pattern[i..]` slice below starts on a UTF-8
/// character boundary (slicing only happens when `bytes[i]` is ASCII).
pub fn extract_accessions_from_pattern(pattern: &str) -> Vec<String> {
let mut accessions = Vec::new();
let bytes = pattern.as_bytes();
let len = bytes.len();
let mut i = 0;
while i < len {
if !bytes[i].is_ascii() {
i += 1;
continue;
}
// Three-letter RefSeq prefixes, e.g. "NM_000051.3".
if i + 3 <= len && bytes[i + 1].is_ascii() && bytes[i + 2].is_ascii() {
let prefix = &bytes[i..i + 3];
if matches!(
prefix,
b"NC_"
| b"NG_"
| b"NM_"
| b"NR_"
| b"NP_"
| b"XM_"
| b"XR_"
| b"XP_"
| b"NT_"
| b"NW_"
| b"CM_"
) {
if let Some(acc) = extract_refseq_accession(&pattern[i..]) {
accessions.push(acc);
// Advance past the prefix only; the digits cannot start
// another accession, so rescanning them is harmless.
i += 3;
continue;
}
}
}
// LRG accessions, e.g. "LRG_199t1".
if i + 4 <= len
&& bytes[i + 1].is_ascii()
&& bytes[i + 2].is_ascii()
&& bytes[i + 3].is_ascii()
&& &bytes[i..i + 4] == b"LRG_"
{
if let Some(acc) = extract_lrg_accession(&pattern[i..]) {
accessions.push(acc);
i += 4;
continue;
}
}
// Legacy two-letter GenBank accessions, e.g. "AJ298105.1": exactly two
// uppercase letters followed by a non-letter (the digits).
if i + 2 <= len
&& bytes[i].is_ascii_uppercase()
&& bytes[i + 1].is_ascii_uppercase()
&& (i + 2 >= len || !bytes[i + 2].is_ascii_alphabetic())
{
if let Some(acc) = extract_genbank_accession(&pattern[i..]) {
accessions.push(acc);
i += 2;
continue;
}
}
i += 1;
}
// Deduplicate while preserving first-appearance order.
let mut seen = std::collections::HashSet::new();
accessions.retain(|acc| seen.insert(acc.clone()));
accessions
}
fn extract_refseq_accession(s: &str) -> Option<String> {
let bytes = s.as_bytes();
if bytes.len() < 4 {
return None;
}
let mut i = 3;
let digit_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == digit_start {
return None; }
if i >= bytes.len() || bytes[i] != b'.' {
return None;
}
i += 1;
let version_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == version_start {
return None; }
Some(s[..i].to_string())
}
fn extract_lrg_accession(s: &str) -> Option<String> {
let bytes = s.as_bytes();
if bytes.len() < 5 {
return None;
}
let mut i = 4;
let digit_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == digit_start {
return None; }
if i < bytes.len() && (bytes[i] == b't' || bytes[i] == b'p') {
i += 1;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
}
Some(s[..i].to_string())
}
fn extract_genbank_accession(s: &str) -> Option<String> {
let bytes = s.as_bytes();
if bytes.len() < 10 {
return None;
}
let mut i = 2;
let digit_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i - digit_start != 6 {
return None; }
if i >= bytes.len() || bytes[i] != b'.' {
return None;
}
i += 1;
let version_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == version_start {
return None; }
Some(s[..i].to_string())
}
/// Read a pattern file (one variant pattern per line) and collect every
/// accession referenced anywhere in it, returned sorted and de-duplicated.
///
/// Blank lines and lines beginning with '#' are ignored.
pub fn extract_all_accessions_from_file<P: AsRef<Path>>(
    pattern_file: P,
) -> Result<Vec<String>, FerroError> {
    let file = File::open(pattern_file.as_ref()).map_err(|e| FerroError::Io {
        msg: format!("Failed to open pattern file: {}", e),
    })?;
    let mut unique = std::collections::HashSet::new();
    for line in BufReader::new(file).lines() {
        let line = line.map_err(|e| FerroError::Io {
            msg: format!("Failed to read line: {}", e),
        })?;
        let trimmed = line.trim();
        // Skip blanks and comment lines.
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        unique.extend(extract_accessions_from_pattern(trimmed));
    }
    let mut sorted: Vec<String> = unique.into_iter().collect();
    sorted.sort();
    Ok(sorted)
}
/// Ensure cache entries exist for every accession in `accessions`, fetching
/// missing nucleotide records from NCBI efetch in batches of `BATCH_SIZE`.
///
/// CM_ (redundant with NC_), LRG_ (fetched from EBI elsewhere), NC_ (covered
/// by genome FASTAs) and NT_/NW_ (draft scaffolds) accessions are skipped;
/// protein accessions are skipped because proteins are derived from the
/// transcript CDS instead. Returns the number of cache entries written.
pub fn fetch_missing_accessions(
    accessions: &[String],
    cache_dir: &Path,
) -> Result<usize, FerroError> {
    // Number of IDs bundled into a single efetch request.
    const BATCH_SIZE: usize = 200;
    let mut fetched = 0;
    let mut missing_nuccore: Vec<&String> = Vec::new();
    let mut missing_protein: Vec<&String> = Vec::new();
    let mut skipped_cm = 0usize;
    let mut skipped_lrg = 0usize;
    let mut skipped_nc = 0usize;
    let mut skipped_draft = 0usize;
    // Classify each accession; only those without an existing ".sequence"
    // cache file are queued for fetching.
    for acc in accessions {
        if acc.starts_with("CM_") {
            skipped_cm += 1;
            continue;
        }
        if acc.starts_with("LRG_") {
            skipped_lrg += 1;
            continue;
        }
        if acc.starts_with("NC_") {
            skipped_nc += 1;
            continue;
        }
        if acc.starts_with("NT_") || acc.starts_with("NW_") {
            skipped_draft += 1;
            continue;
        }
        let seq_path = cache_dir.join(format!("{}.sequence", acc));
        if !seq_path.exists() {
            if acc.starts_with("NP_") || acc.starts_with("XP_") || acc.starts_with("YP_") {
                missing_protein.push(acc);
            } else {
                missing_nuccore.push(acc);
            }
        }
    }
    if skipped_cm > 0 {
        eprintln!(
            " Skipped {} CM_ accessions (redundant with NC_ chromosomes)",
            skipped_cm
        );
    }
    if skipped_lrg > 0 {
        eprintln!(
            " Skipped {} LRG_ accessions (fetched separately from EBI)",
            skipped_lrg
        );
    }
    if skipped_nc > 0 {
        eprintln!(" Skipped {} NC_ accessions (in genome FASTAs)", skipped_nc);
    }
    if skipped_draft > 0 {
        eprintln!(
            " Skipped {} NT_/NW_ accessions (draft scaffolds)",
            skipped_draft
        );
    }
    if missing_nuccore.is_empty() && missing_protein.is_empty() {
        eprintln!(" All accessions already in cache");
        return Ok(0);
    }
    eprintln!(
        "\n=== Fetch Summary ===\n Nucleotide: {} missing\n Protein: {} missing\n Total: {} to fetch\n",
        missing_nuccore.len(),
        missing_protein.len(),
        missing_nuccore.len() + missing_protein.len()
    );
    // Fetch missing nucleotide accessions batch by batch, reporting progress.
    if !missing_nuccore.is_empty() {
        let total_batches = missing_nuccore.len().div_ceil(BATCH_SIZE);
        eprintln!(
            "Fetching {} nucleotide accessions in {} batches...",
            missing_nuccore.len(),
            total_batches
        );
        let mut nuc_fetched = 0usize;
        for (batch_idx, batch) in missing_nuccore.chunks(BATCH_SIZE).enumerate() {
            let batch_fetched = fetch_batch(cache_dir, "nuccore", batch)?;
            nuc_fetched += batch_fetched;
            fetched += batch_fetched;
            eprintln!(
                " Batch {}/{}: fetched {}/{} (total: {}/{})",
                batch_idx + 1,
                total_batches,
                batch_fetched,
                batch.len(),
                nuc_fetched,
                missing_nuccore.len()
            );
            // Stay under NCBI's anonymous rate limit between requests.
            std::thread::sleep(std::time::Duration::from_millis(350));
        }
        if nuc_fetched < missing_nuccore.len() {
            eprintln!(
                " Warning: {} nucleotide accessions could not be fetched",
                missing_nuccore.len() - nuc_fetched
            );
        }
    }
    // Proteins are intentionally not fetched; they are derived from the
    // transcript CDS instead.
    if !missing_protein.is_empty() {
        eprintln!(
            "\n Skipping {} protein accessions (derived from transcript CDS)",
            missing_protein.len()
        );
    }
    eprintln!("\n=== Fetch Complete ===");
    eprintln!(" Total fetched: {}", fetched);
    Ok(fetched)
}
/// Fetch a batch of accessions from NCBI efetch as GenBank records and write
/// a cache entry for each record that includes sequence data.
///
/// Records returned without an ORIGIN section are retried through
/// `fetch_fasta_fallback`. Transport failures are reported as `Ok(0)` so the
/// caller can continue with the next batch. Returns the number of cache
/// entries written.
fn fetch_batch(cache_dir: &Path, db: &str, accessions: &[&String]) -> Result<usize, FerroError> {
    use std::process::Command;
    if accessions.is_empty() {
        return Ok(0);
    }
    let ids: Vec<&str> = accessions.iter().map(|s| s.as_str()).collect();
    let id_list = ids.join(",");
    let url = build_efetch_url(db, &id_list, "gb", "text");
    let output = Command::new("curl")
        .args(["-s", "-L", &url])
        .output()
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to fetch batch: {}", e),
        })?;
    if !output.status.success() {
        eprintln!(" Warning: Batch fetch failed");
        return Ok(0);
    }
    let genbank_text = String::from_utf8_lossy(&output.stdout);
    let mut fetched = 0;
    let mut need_fasta: Vec<String> = Vec::new();
    // A multi-record response is a concatenation of GenBank entries.
    // Splitting on "\nLOCUS " strips the keyword from every chunk except the
    // first, so it is re-added for later chunks. Using the chunk index to
    // detect the first record fixes the previous heuristic
    // (`fetched == 0 && need_fasta.is_empty()`), which misidentified later
    // records as "first" whenever the real first record produced neither a
    // cache entry nor a FASTA retry.
    for (idx, record) in genbank_text.split("\nLOCUS ").enumerate() {
        let record = if idx == 0 || record.starts_with("LOCUS ") {
            record.to_string()
        } else {
            format!("LOCUS {}", record)
        };
        // The versioned accession is the second token of the VERSION line.
        let accession = record
            .lines()
            .find(|l| l.starts_with("VERSION"))
            .and_then(|l| l.split_whitespace().nth(1))
            .map(|s| s.to_string());
        if let Some(acc) = accession {
            if !record.contains("ORIGIN") {
                // No sequence section: queue for the FASTA fallback below.
                need_fasta.push(acc);
                continue;
            }
            if let Some((seq, cds_start, cds_end, gene_name)) = parse_genbank_record(&record) {
                if write_cache_entry(cache_dir, &acc, &seq, cds_start, cds_end, &gene_name).is_ok()
                {
                    fetched += 1;
                }
            }
        }
    }
    if !need_fasta.is_empty() {
        let fasta_fetched = fetch_fasta_fallback(cache_dir, db, &need_fasta)?;
        fetched += fasta_fetched;
    }
    Ok(fetched)
}
/// Fallback for records that came back without an ORIGIN section: fetch the
/// plain FASTA for `accessions` and write sequence-only cache entries (no
/// CDS metadata is available in FASTA format).
///
/// Returns the number of entries written; a failed download yields `Ok(0)`.
///
/// Note: the original text contained `¤t_seq` (HTML-entity corruption of
/// `&current_seq`), which does not compile; restored here.
fn fetch_fasta_fallback(
    cache_dir: &Path,
    db: &str,
    accessions: &[String],
) -> Result<usize, FerroError> {
    use std::process::Command;
    if accessions.is_empty() {
        return Ok(0);
    }
    let id_list = accessions.join(",");
    let url = build_efetch_url(db, &id_list, "fasta", "text");
    let output = Command::new("curl")
        .args(["-s", "-L", &url])
        .output()
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to fetch FASTA: {}", e),
        })?;
    if !output.status.success() {
        // Best-effort: a failed fallback is zero fetched, not an error.
        return Ok(0);
    }
    let fasta_text = String::from_utf8_lossy(&output.stdout);
    let mut fetched = 0;
    let mut current_acc: Option<String> = None;
    let mut current_seq = String::new();
    for line in fasta_text.lines() {
        if let Some(header) = line.strip_prefix('>') {
            // A new header terminates the previous record; write it out.
            if let Some(acc) = current_acc.take() {
                if !current_seq.is_empty()
                    && write_nucleotide_fasta_entry(cache_dir, &acc, &current_seq).is_ok()
                {
                    fetched += 1;
                }
            }
            // The accession is the first whitespace-delimited header token.
            current_acc = header.split_whitespace().next().map(|s| s.to_string());
            current_seq.clear();
        } else if !line.is_empty() {
            current_seq.push_str(line.trim());
        }
    }
    // Flush the final record after the loop.
    if let Some(acc) = current_acc {
        if !current_seq.is_empty()
            && write_nucleotide_fasta_entry(cache_dir, &acc, &current_seq).is_ok()
        {
            fetched += 1;
        }
    }
    Ok(fetched)
}
/// Metadata for one transcript appended to the supplemental FASTA file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SupplementalTranscript {
    /// Versioned accession as reported on the GenBank VERSION line.
    pub id: String,
    /// Gene symbol from the /gene qualifier or FASTA description, if any.
    pub gene_symbol: Option<String>,
    /// CDS start as printed in the GenBank feature table; None when the
    /// record was fetched via the plain-FASTA fallback.
    pub cds_start: Option<u64>,
    /// CDS end as printed in the GenBank feature table; None for FASTA-only.
    pub cds_end: Option<u64>,
    /// Length of the nucleotide sequence written to the FASTA file.
    pub sequence_length: usize,
}
/// Index of all transcripts appended to a supplemental FASTA, persisted as
/// "<fasta>.metadata.json" and used to skip already-fetched accessions.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SupplementalMetadata {
    /// RFC 3339 timestamp recorded when this metadata file was created.
    pub generated_at: String,
    /// Map from accession to its per-transcript metadata.
    pub transcripts: std::collections::HashMap<String, SupplementalTranscript>,
}
/// Fetch nucleotide sequences for `accessions` from NCBI efetch (GenBank
/// format, with a plain-FASTA retry per batch) and append them to
/// `output_file` as FASTA.
///
/// Per-transcript CDS coordinates and gene symbols are recorded in a sibling
/// "<output>.metadata.json" file; accessions already present there are
/// skipped, so repeated runs resume where they left off. LRG_/CM_ and
/// protein accessions are never fetched. With `dry_run` set, only reports
/// what would be fetched. Returns the number of sequences appended.
pub fn fetch_fasta_to_file(
    accessions: &[String],
    output_file: &Path,
    dry_run: bool,
) -> Result<usize, FerroError> {
    use std::io::Write;
    use std::process::Command;
    // Number of IDs bundled into a single efetch request.
    const BATCH_SIZE: usize = 200;
    if accessions.is_empty() {
        return Ok(0);
    }
    // Partition the input; only nucleotide accessions are fetched here.
    let mut nuccore: Vec<&String> = Vec::new();
    let mut protein: Vec<&String> = Vec::new();
    let mut skipped_lrg = 0usize;
    let mut skipped_cm = 0usize;
    for acc in accessions {
        if acc.starts_with("LRG_") {
            skipped_lrg += 1;
            continue;
        }
        if acc.starts_with("CM_") {
            skipped_cm += 1;
            continue;
        }
        if acc.starts_with("NP_") || acc.starts_with("XP_") || acc.starts_with("YP_") {
            protein.push(acc);
        } else {
            nuccore.push(acc);
        }
    }
    if skipped_lrg > 0 {
        eprintln!(
            " Skipped {} LRG_ accessions (not available from NCBI efetch)",
            skipped_lrg
        );
    }
    if skipped_cm > 0 {
        eprintln!(
            " Skipped {} CM_ accessions (redundant with NC_ chromosomes)",
            skipped_cm
        );
    }
    if !protein.is_empty() {
        eprintln!(
            " Skipped {} protein accessions (NP_/XP_/YP_) - not needed for ferro",
            protein.len()
        );
    }
    if nuccore.is_empty() {
        eprintln!(" No nucleotide accessions to fetch");
        return Ok(0);
    }
    if dry_run {
        eprintln!(
            " Would fetch {} nucleotide sequences with CDS metadata",
            nuccore.len()
        );
        return Ok(0);
    }
    // The output FASTA is opened in append mode so repeated runs accumulate.
    let mut fasta_file = std::fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(output_file)
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to open {}: {}", output_file.display(), e),
        })?;
    // Load (or initialize) the sidecar metadata used for resume/skip logic;
    // an unreadable file is silently replaced with a fresh index.
    let metadata_path = output_file.with_extension("metadata.json");
    let mut metadata = if metadata_path.exists() {
        let file = File::open(&metadata_path).map_err(|e| FerroError::Io {
            msg: format!("Failed to open {}: {}", metadata_path.display(), e),
        })?;
        serde_json::from_reader(file).unwrap_or_else(|_| SupplementalMetadata {
            generated_at: chrono::Utc::now().to_rfc3339(),
            transcripts: std::collections::HashMap::new(),
        })
    } else {
        SupplementalMetadata {
            generated_at: chrono::Utc::now().to_rfc3339(),
            transcripts: std::collections::HashMap::new(),
        }
    };
    // Drop accessions that were already fetched on a previous run.
    let original_count = nuccore.len();
    let nuccore: Vec<&String> = nuccore
        .into_iter()
        .filter(|acc| !metadata.transcripts.contains_key(*acc))
        .collect();
    if nuccore.is_empty() {
        eprintln!(
            " All {} accessions already fetched (skipping)",
            original_count
        );
        return Ok(0);
    }
    if original_count != nuccore.len() {
        eprintln!(
            " Skipping {} already-fetched accessions, {} remaining to fetch",
            original_count - nuccore.len(),
            nuccore.len()
        );
    }
    let mut fetched = 0usize;
    let total_batches = nuccore.len().div_ceil(BATCH_SIZE);
    // With an API key NCBI allows 10 req/sec; anonymous access is 3 req/sec.
    let has_api_key = get_ncbi_api_key().is_some();
    let rate_delay_ms = if has_api_key { 100 } else { 350 };
    eprintln!(
        " Fetching {} nucleotide sequences in {} batches (batch size: {})...",
        nuccore.len(),
        total_batches,
        BATCH_SIZE
    );
    if has_api_key {
        eprintln!(" Using NCBI API key (10 req/sec rate limit)");
    } else {
        eprintln!(" No NCBI_API_KEY set (3 req/sec rate limit - set key for 3x faster)");
    }
    let start_time = std::time::Instant::now();
    for (batch_idx, batch) in nuccore.chunks(BATCH_SIZE).enumerate() {
        let id_list: Vec<&str> = batch.iter().map(|s| s.as_str()).collect();
        let ids = id_list.join(",");
        let batch_accessions: std::collections::HashSet<&str> = id_list.iter().copied().collect();
        let mut fetched_accessions: std::collections::HashSet<String> =
            std::collections::HashSet::new();
        // Primary request: GenBank format, which carries CDS and gene info.
        let url = build_efetch_url("nuccore", &ids, "gb", "text");
        let output = Command::new("curl")
            .args(["-s", "-L", &url])
            .output()
            .map_err(|e| FerroError::Io {
                msg: format!("Failed to fetch batch: {}", e),
            })?;
        if !output.status.success() {
            eprintln!(
                " Warning: Batch {}/{} failed",
                batch_idx + 1,
                total_batches
            );
            continue;
        }
        let genbank_text = String::from_utf8_lossy(&output.stdout);
        let mut batch_count = 0;
        // Responses concatenate records; splitting on "\nLOCUS " removes the
        // keyword from all but the first record, so it is re-added below.
        for record in genbank_text.split("\nLOCUS ") {
            let record = if record.starts_with("LOCUS ")
                || (batch_count == 0 && record.contains("LOCUS "))
            {
                record.to_string()
            } else {
                format!("LOCUS {}", record)
            };
            // The versioned accession is the second token of the VERSION line.
            let accession = record
                .lines()
                .find(|l| l.starts_with("VERSION"))
                .and_then(|l| l.split_whitespace().nth(1))
                .map(|s| s.to_string());
            if let Some(acc) = accession {
                if let Some((seq, cds_start, cds_end, gene_name)) =
                    parse_genbank_record_full(&record)
                {
                    let header = if gene_name.is_empty() {
                        format!(">{}", acc)
                    } else {
                        format!(">{} {}", acc, gene_name)
                    };
                    writeln!(fasta_file, "{}", header).map_err(|e| FerroError::Io {
                        msg: format!("Failed to write to {}: {}", output_file.display(), e),
                    })?;
                    // Wrap sequence lines at 70 columns.
                    for chunk in seq.as_bytes().chunks(70) {
                        writeln!(fasta_file, "{}", std::str::from_utf8(chunk).unwrap()).map_err(
                            |e| FerroError::Io {
                                msg: format!("Failed to write to {}: {}", output_file.display(), e),
                            },
                        )?;
                    }
                    metadata.transcripts.insert(
                        acc.clone(),
                        SupplementalTranscript {
                            id: acc.clone(),
                            gene_symbol: if gene_name.is_empty() {
                                None
                            } else {
                                Some(gene_name)
                            },
                            cds_start,
                            cds_end,
                            sequence_length: seq.len(),
                        },
                    );
                    fetched_accessions.insert(acc);
                    batch_count += 1;
                }
            }
        }
        // Fallback: anything the GenBank pass missed is retried as plain
        // FASTA (no CDS metadata is available in that format).
        let missing_from_batch: Vec<&str> = batch_accessions
            .iter()
            .filter(|acc| !fetched_accessions.contains(**acc))
            .copied()
            .collect();
        if !missing_from_batch.is_empty() {
            let missing_ids = missing_from_batch.join(",");
            let fasta_url = build_efetch_url("nuccore", &missing_ids, "fasta", "text");
            if let Ok(fasta_output) = Command::new("curl").args(["-s", "-L", &fasta_url]).output() {
                if fasta_output.status.success() {
                    let fasta_text = String::from_utf8_lossy(&fasta_output.stdout);
                    let mut current_acc = String::new();
                    let mut current_seq = String::new();
                    let mut current_desc = String::new();
                    for line in fasta_text.lines() {
                        if let Some(header) = line.strip_prefix('>') {
                            // A new header flushes the previous record.
                            if !current_acc.is_empty() && !current_seq.is_empty() {
                                if current_desc.is_empty() {
                                    writeln!(fasta_file, ">{}", current_acc).ok();
                                } else {
                                    writeln!(fasta_file, ">{} {}", current_acc, current_desc).ok();
                                }
                                for chunk in current_seq.as_bytes().chunks(70) {
                                    writeln!(fasta_file, "{}", std::str::from_utf8(chunk).unwrap())
                                        .ok();
                                }
                                metadata.transcripts.insert(
                                    current_acc.clone(),
                                    SupplementalTranscript {
                                        id: current_acc.clone(),
                                        // Text before the first '(' in the
                                        // description is used as the symbol.
                                        gene_symbol: if current_desc.is_empty() {
                                            None
                                        } else {
                                            current_desc
                                                .split('(')
                                                .next()
                                                .map(|s| s.trim().to_string())
                                        },
                                        cds_start: None,
                                        cds_end: None,
                                        sequence_length: current_seq.len(),
                                    },
                                );
                                batch_count += 1;
                            }
                            let parts: Vec<&str> = header.splitn(2, ' ').collect();
                            current_acc = parts[0].to_string();
                            current_desc = parts.get(1).unwrap_or(&"").to_string();
                            current_seq.clear();
                        } else {
                            current_seq.push_str(line.trim());
                        }
                    }
                    // Flush the final record of the fallback response.
                    if !current_acc.is_empty() && !current_seq.is_empty() {
                        if current_desc.is_empty() {
                            writeln!(fasta_file, ">{}", current_acc).ok();
                        } else {
                            writeln!(fasta_file, ">{} {}", current_acc, current_desc).ok();
                        }
                        for chunk in current_seq.as_bytes().chunks(70) {
                            writeln!(fasta_file, "{}", std::str::from_utf8(chunk).unwrap()).ok();
                        }
                        metadata.transcripts.insert(
                            current_acc.clone(),
                            SupplementalTranscript {
                                id: current_acc.clone(),
                                gene_symbol: if current_desc.is_empty() {
                                    None
                                } else {
                                    current_desc.split('(').next().map(|s| s.trim().to_string())
                                },
                                cds_start: None,
                                cds_end: None,
                                sequence_length: current_seq.len(),
                            },
                        );
                        batch_count += 1;
                    }
                }
            }
            // Extra delay to cover the additional fallback request.
            std::thread::sleep(std::time::Duration::from_millis(rate_delay_ms));
        }
        fetched += batch_count;
        // Progress + ETA every 10 batches (and on the final batch).
        if (batch_idx + 1) % 10 == 0 || batch_idx + 1 == total_batches {
            let elapsed = start_time.elapsed().as_secs_f64();
            let batches_done = batch_idx + 1;
            let batches_remaining = total_batches - batches_done;
            let secs_per_batch = elapsed / batches_done as f64;
            let eta_secs = (batches_remaining as f64 * secs_per_batch) as u64;
            let eta_min = eta_secs / 60;
            let eta_sec = eta_secs % 60;
            eprintln!(
                " Batch {}/{} ({:.1}%): {} sequences fetched, ETA: {}m {}s",
                batches_done,
                total_batches,
                (batches_done as f64 / total_batches as f64) * 100.0,
                fetched,
                eta_min,
                eta_sec
            );
        }
        // Respect NCBI's rate limit between batches.
        if batch_idx < total_batches - 1 {
            std::thread::sleep(std::time::Duration::from_millis(rate_delay_ms));
        }
    }
    // Persist the updated metadata index next to the FASTA file.
    let metadata_file = File::create(&metadata_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to create {}: {}", metadata_path.display(), e),
    })?;
    serde_json::to_writer_pretty(metadata_file, &metadata).map_err(|e| FerroError::Io {
        msg: format!("Failed to write {}: {}", metadata_path.display(), e),
    })?;
    eprintln!(" Wrote metadata to {}", metadata_path.display());
    Ok(fetched)
}
/// Parse one GenBank flat-file record, returning the uppercase sequence from
/// the ORIGIN section, the CDS start/end coordinates as printed in the
/// feature table, and the gene symbol from the last /gene="…" qualifier
/// (empty string when absent).
///
/// Returns None when the record contains no sequence data.
fn parse_genbank_record_full(genbank: &str) -> Option<(String, Option<u64>, Option<u64>, String)> {
    let mut sequence = String::new();
    let mut gene_name = String::new();
    let mut cds_start: Option<u64> = None;
    let mut cds_end: Option<u64> = None;
    let mut in_origin = false;
    // Strip partial-location markers ("<", ">") and any other non-digit
    // characters from the edges of a coordinate before parsing it.
    let parse_coord =
        |raw: &str| raw.trim_matches(|c: char| !c.is_numeric()).parse::<u64>().ok();
    for line in genbank.lines() {
        if line.starts_with("ORIGIN") {
            in_origin = true;
            continue;
        }
        if in_origin {
            // Sequence lines look like "   1 atgcatgcat gcat…"; "//" ends
            // the record. Keep only the all-alphabetic tokens (the bases).
            if line.starts_with("//") {
                break;
            }
            for token in line.split_whitespace() {
                if token.bytes().all(|b| b.is_ascii_alphabetic()) {
                    sequence.push_str(&token.to_uppercase());
                }
            }
            continue;
        }
        // Feature-table side: remember the most recent /gene="…" qualifier.
        if let Some(tail) = line.find("/gene=\"").map(|p| &line[p + 7..]) {
            if let Some(close) = tail.find('"') {
                gene_name = tail[..close].to_string();
            }
        }
        // CDS location on the same line as the CDS keyword, e.g.
        // "CDS 10..90", "CDS complement(5..50)" or "CDS join(1..10,20..30)".
        if line.trim_start().starts_with("CDS") {
            if let Some(loc) = line.split_whitespace().nth(1) {
                let loc = loc
                    .strip_prefix("complement(")
                    .and_then(|s| s.strip_suffix(')'))
                    .unwrap_or(loc);
                if let Some(inner) = loc.strip_prefix("join(") {
                    // For joins, take the start of the first segment and the
                    // end of the last.
                    let inner = inner.strip_suffix(')').unwrap_or(inner);
                    let segments: Vec<&str> = inner.split(',').collect();
                    if let Some((s, _)) = segments.first().and_then(|f| f.split_once("..")) {
                        cds_start = parse_coord(s);
                    }
                    if let Some((_, e)) = segments.last().and_then(|l| l.split_once("..")) {
                        cds_end = parse_coord(e);
                    }
                } else if let Some((s, e)) = loc.split_once("..") {
                    cds_start = parse_coord(s);
                    cds_end = parse_coord(e);
                }
            }
        }
    }
    if sequence.is_empty() {
        None
    } else {
        Some((sequence, cds_start, cds_end, gene_name))
    }
}
/// Write a `.sequence` file plus a minimal `.annotations` JSON stub for a
/// nucleotide accession fetched via the FASTA fallback (no CDS metadata).
fn write_nucleotide_fasta_entry(
    cache_dir: &Path,
    accession: &str,
    sequence: &str,
) -> Result<(), FerroError> {
    // Raw sequence bytes go into "<acc>.sequence".
    let seq_path = cache_dir.join(format!("{}.sequence", accession));
    std::fs::write(&seq_path, sequence).map_err(|e| FerroError::Io {
        msg: format!("Failed to write {}: {}", seq_path.display(), e),
    })?;
    // A placeholder annotation document goes into "<acc>.annotations".
    let ann_path = cache_dir.join(format!("{}.annotations", accession));
    let ann_json =
        serde_json::to_string_pretty(&build_minimal_annotations(accession, sequence.len()))
            .map_err(|e| FerroError::Json {
                msg: format!("Failed to serialize annotations: {}", e),
            })?;
    std::fs::write(&ann_path, ann_json).map_err(|e| FerroError::Io {
        msg: format!("Failed to write {}: {}", ann_path.display(), e),
    })?;
    Ok(())
}
/// Fetch protein sequences for a batch of accessions from NCBI efetch
/// (FASTA format) and write a cache entry for each. Returns the number of
/// entries written; a failed download yields `Ok(0)`.
///
/// Note: the original text contained `¤t_seq` (HTML-entity corruption of
/// `&current_seq`), which does not compile; restored here.
#[allow(dead_code)]
fn fetch_protein_batch(cache_dir: &Path, accessions: &[&String]) -> Result<usize, FerroError> {
    use std::process::Command;
    if accessions.is_empty() {
        return Ok(0);
    }
    let ids: Vec<&str> = accessions.iter().map(|s| s.as_str()).collect();
    let id_list = ids.join(",");
    let url = build_efetch_url("protein", &id_list, "fasta", "text");
    let output = Command::new("curl")
        .args(["-s", "-L", &url])
        .output()
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to fetch protein batch: {}", e),
        })?;
    if !output.status.success() {
        eprintln!(" Warning: Protein batch fetch failed");
        return Ok(0);
    }
    let fasta_text = String::from_utf8_lossy(&output.stdout);
    let mut fetched = 0;
    let mut current_acc: Option<String> = None;
    let mut current_seq = String::new();
    for line in fasta_text.lines() {
        if let Some(header) = line.strip_prefix('>') {
            // A new header terminates the previous record; write it out.
            if let Some(acc) = current_acc.take() {
                if !current_seq.is_empty()
                    && write_protein_cache_entry(cache_dir, &acc, &current_seq).is_ok()
                {
                    fetched += 1;
                }
            }
            // The accession is the first whitespace-delimited header token.
            current_acc = header.split_whitespace().next().map(|s| s.to_string());
            current_seq.clear();
        } else if !line.is_empty() {
            current_seq.push_str(line.trim());
        }
    }
    // Flush the final record after the loop.
    if let Some(acc) = current_acc {
        if !current_seq.is_empty()
            && write_protein_cache_entry(cache_dir, &acc, &current_seq).is_ok()
        {
            fetched += 1;
        }
    }
    Ok(fetched)
}
/// Write a `.sequence` file plus a protein-flavored `.annotations` JSON stub
/// for a protein accession.
#[allow(dead_code)]
fn write_protein_cache_entry(
    cache_dir: &Path,
    accession: &str,
    sequence: &str,
) -> Result<(), FerroError> {
    // Raw amino-acid sequence goes into "<acc>.sequence".
    let seq_path = cache_dir.join(format!("{}.sequence", accession));
    std::fs::write(&seq_path, sequence).map_err(|e| FerroError::Io {
        msg: format!("Failed to write {}: {}", seq_path.display(), e),
    })?;
    // A whole-sequence CDS annotation document goes into "<acc>.annotations".
    let ann_path = cache_dir.join(format!("{}.annotations", accession));
    let ann_json =
        serde_json::to_string_pretty(&build_protein_annotations(accession, sequence.len()))
            .map_err(|e| FerroError::Json {
                msg: format!("Failed to serialize annotations: {}", e),
            })?;
    std::fs::write(&ann_path, ann_json).map_err(|e| FerroError::Io {
        msg: format!("Failed to write {}: {}", ann_path.display(), e),
    })?;
    Ok(())
}
/// Build a minimal annotation document for a sequence with no known
/// transcript structure: one gene feature spanning the whole record, with a
/// single mRNA/ncRNA/gene child spanning the same range.
fn build_minimal_annotations(tx_id: &str, seq_len: usize) -> serde_json::Value {
    // Classify the record by accession prefix; unrecognized prefixes are
    // treated as coding mRNA.
    let (mol_type, feature_type) = if tx_id.starts_with("NM_") || tx_id.starts_with("XM_") {
        ("mRNA", "mRNA")
    } else if tx_id.starts_with("NR_") || tx_id.starts_with("XR_") {
        ("ncRNA", "ncRNA")
    } else if tx_id.starts_with("NG_")
        || tx_id.starts_with("NC_")
        || tx_id.starts_with("LRG_")
        || tx_id.starts_with("chr")
    {
        ("genomic DNA", "gene")
    } else {
        ("mRNA", "mRNA")
    };
    // The record and the innermost feature share the same unstranded span.
    let whole_record = json!({
        "type": "range",
        "start": {"type": "point", "position": 0},
        "end": {"type": "point", "position": seq_len}
    });
    json!({
        "id": tx_id,
        "type": "record",
        "location": whole_record.clone(),
        "qualifiers": {
            "name": tx_id,
            "mol_type": mol_type
        },
        "features": [{
            "type": "gene",
            "location": {
                "type": "range",
                "start": {"type": "point", "position": 0},
                "end": {"type": "point", "position": seq_len},
                "strand": 1
            },
            "id": tx_id,
            "features": [{
                "id": tx_id,
                "type": feature_type,
                "location": whole_record,
                "features": []
            }]
        }]
    })
}
/// Build a minimal protein-record annotation document: a single CDS feature
/// spanning the entire amino-acid sequence.
fn build_protein_annotations(protein_id: &str, seq_len: usize) -> serde_json::Value {
    // Both the record and its CDS feature cover the whole sequence; the CDS
    // variant additionally carries a plus-strand marker.
    let full_span = |stranded: bool| -> serde_json::Value {
        let mut location = json!({
            "type": "range",
            "start": {"type": "point", "position": 0},
            "end": {"type": "point", "position": seq_len}
        });
        if stranded {
            location["strand"] = json!(1);
        }
        location
    };
    json!({
        "id": protein_id,
        "type": "record",
        "coordinate_system": "p",
        "location": full_span(false),
        "qualifiers": {
            "name": protein_id,
            "mol_type": "CDS"
        },
        "features": [{
            "type": "CDS",
            "location": full_span(true),
            "id": protein_id,
            "qualifiers": {
                "gbkey": "Prot"
            }
        }]
    })
}
/// Build an annotation document for an NC_ chromosome record, embedding each
/// cdot transcript as a gene → transcript → CDS/exon feature tree so that
/// transcript selectors can be resolved against the chromosome.
fn build_nc_annotations_with_selectors(
    nc_id: &str,
    seq_len: usize,
    transcripts: &[(&str, &CdotTranscript)],
) -> serde_json::Value {
    // Group transcripts by gene symbol, falling back to the transcript id
    // when cdot has no gene name.
    let mut genes: HashMap<String, Vec<(&str, &CdotTranscript)>> = HashMap::new();
    for (tx_id, tx) in transcripts {
        let gene_name = tx.gene_name.clone().unwrap_or_else(|| tx_id.to_string());
        genes.entry(gene_name).or_default().push((tx_id, tx));
    }
    let mut gene_features: Vec<serde_json::Value> = Vec::new();
    for (gene_name, gene_transcripts) in genes {
        // Gene span = envelope of all exons across its transcripts. The
        // strand is taken from the last transcript iterated; this assumes
        // all of a gene's transcripts lie on the same strand.
        let mut gene_start = u64::MAX;
        let mut gene_end = 0u64;
        let mut gene_strand = 1i32;
        for (_, tx) in &gene_transcripts {
            gene_strand = if tx.strand == crate::reference::Strand::Plus {
                1
            } else {
                -1
            };
            for exon in &tx.exons {
                gene_start = gene_start.min(exon[0]);
                gene_end = gene_end.max(exon[1]);
            }
        }
        let mut transcript_features: Vec<serde_json::Value> = Vec::new();
        for (tx_id, tx) in &gene_transcripts {
            // NM_/XM_ are coding mRNAs, NR_/XR_ non-coding; any other prefix
            // is treated as coding mRNA.
            let (tx_type, is_coding) = if tx_id.starts_with("NM_") || tx_id.starts_with("XM_") {
                ("mRNA", true)
            } else if tx_id.starts_with("NR_") || tx_id.starts_with("XR_") {
                ("ncRNA", false)
            } else {
                ("mRNA", true)
            };
            // Transcript span = envelope of its own exons.
            let mut tx_start = u64::MAX;
            let mut tx_end = 0u64;
            for exon in &tx.exons {
                tx_start = tx_start.min(exon[0]);
                tx_end = tx_end.max(exon[1]);
            }
            // One exon feature per exon row ([g_start, g_end, …]).
            let mut exon_features: Vec<serde_json::Value> = Vec::new();
            for (i, exon) in tx.exons.iter().enumerate() {
                let exon_start = exon[0] as usize;
                let exon_end = exon[1] as usize;
                exon_features.push(json!({
                    "type": "exon",
                    "id": format!("id-{}-{}", tx_id, i + 1),
                    "location": {
                        "type": "range",
                        "start": {"type": "point", "position": exon_start},
                        "end": {"type": "point", "position": exon_end},
                        "strand": gene_strand
                    }
                }));
            }
            // Coding transcripts additionally get a CDS child whose genomic
            // bounds are projected from transcript-space coordinates.
            let mut tx_inner_features: Vec<serde_json::Value> = Vec::new();
            if is_coding {
                if let (Some(cds_start), Some(cds_end)) = (tx.cds_start, tx.cds_end) {
                    let cds_genomic = calculate_genomic_cds(tx, cds_start, cds_end);
                    if let Some((cds_g_start, cds_g_end)) = cds_genomic {
                        let protein_id = tx
                            .protein
                            .clone()
                            .unwrap_or_else(|| format!("{}_protein", tx_id));
                        tx_inner_features.push(json!({
                            "type": "CDS",
                            "id": protein_id,
                            "location": {
                                "type": "range",
                                "start": {"type": "point", "position": cds_g_start},
                                "end": {"type": "point", "position": cds_g_end},
                                "strand": gene_strand
                            },
                            "features": []
                        }));
                    }
                }
            }
            tx_inner_features.extend(exon_features);
            transcript_features.push(json!({
                "type": tx_type,
                "id": *tx_id,
                "location": {
                    "type": "range",
                    "start": {"type": "point", "position": tx_start as usize},
                    "end": {"type": "point", "position": tx_end as usize},
                    "strand": gene_strand
                },
                "features": tx_inner_features
            }));
        }
        gene_features.push(json!({
            "type": "gene",
            "id": gene_name.clone(),
            "qualifiers": {"name": gene_name},
            "location": {
                "type": "range",
                "start": {"type": "point", "position": gene_start as usize},
                "end": {"type": "point", "position": gene_end as usize},
                "strand": gene_strand
            },
            "features": transcript_features
        }));
    }
    json!({
        "id": nc_id,
        "type": "record",
        "location": {
            "type": "range",
            "start": {"type": "point", "position": 0},
            "end": {"type": "point", "position": seq_len}
        },
        "qualifiers": {
            "name": nc_id,
            "mol_type": "genomic DNA"
        },
        "features": gene_features
    })
}
/// Project transcript-space CDS bounds onto genomic coordinates using the
/// transcript's exon table (rows of [g_start, g_end, t_start, t_end]).
///
/// Each bound is mapped inside the first exon whose transcript range
/// contains it. Returns the pair normalized so start < end, or None when
/// either bound falls outside every exon.
fn calculate_genomic_cds(
    tx: &CdotTranscript,
    cds_start: u64,
    cds_end: u64,
) -> Option<(usize, usize)> {
    // Walk exons in transcript order (sorted by transcript-space start).
    let mut ordered: Vec<_> = tx.exons.iter().collect();
    ordered.sort_by_key(|e| e[2]);
    let plus_strand = tx.strand == crate::reference::Strand::Plus;
    let mut genomic_start: Option<usize> = None;
    let mut genomic_end: Option<usize> = None;
    for exon in &ordered {
        let g_start = exon[0] as usize;
        let g_end = exon[1] as usize;
        let t_lo = hgvs_pos_to_index(exon[2]) as u64;
        let t_hi = hgvs_pos_to_index(exon[3]) as u64;
        if genomic_start.is_none() && (t_lo..=t_hi).contains(&cds_start) {
            // On the minus strand the exon's genomic end corresponds to its
            // transcript start, so offsets count backwards from g_end.
            let offset = (cds_start - t_lo) as usize;
            genomic_start = Some(if plus_strand {
                g_start + offset
            } else {
                g_end - offset
            });
        }
        if genomic_end.is_none() && (t_lo..=t_hi).contains(&cds_end) {
            let offset = (cds_end - t_lo) as usize;
            genomic_end = Some(if plus_strand {
                g_start + offset
            } else {
                g_end - offset
            });
        }
    }
    let (s, e) = (genomic_start?, genomic_end?);
    Some(if s < e { (s, e) } else { (e, s) })
}
/// Rewrite every cached NC_ ".annotations" file so it embeds the transcripts
/// that cdot places on that chromosome, via
/// `build_nc_annotations_with_selectors`. Files whose chromosome has no cdot
/// transcripts or no cached sequence are left untouched. Returns the number
/// of files rewritten.
pub fn enhance_nc_annotations<P: AsRef<Path>>(
    cache_dir: P,
    cdot: &CdotMapper,
) -> Result<usize, FerroError> {
    let cache_dir = cache_dir.as_ref();
    let mut enhanced = 0;
    // Collect all cached NC_ annotation files up front.
    let nc_files: Vec<_> = std::fs::read_dir(cache_dir)
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to read cache dir: {}", e),
        })?
        .filter_map(|e| e.ok())
        .filter(|e| {
            let name = e.file_name();
            let name = name.to_string_lossy();
            name.starts_with("NC_") && name.ends_with(".annotations")
        })
        .collect();
    eprintln!(
        "Enhancing {} NC_ annotation files with transcript selectors...",
        nc_files.len()
    );
    for entry in nc_files {
        let ann_path = entry.path();
        let filename = entry.file_name();
        // Recover the accession from the file name.
        let nc_id = filename
            .to_string_lossy()
            .trim_end_matches(".annotations")
            .to_string();
        // Nothing to embed if cdot has no transcripts on this chromosome.
        let tx_ids = cdot.transcripts_on_contig(&nc_id);
        if tx_ids.is_empty() {
            continue;
        }
        // The ".sequence" file holds raw sequence bytes, so its size is the
        // sequence length; skip chromosomes without a cached sequence.
        let seq_path = cache_dir.join(format!("{}.sequence", nc_id));
        let seq_len = if seq_path.exists() {
            std::fs::metadata(&seq_path)
                .map(|m| m.len() as usize)
                .unwrap_or(0)
        } else {
            continue;
        };
        let transcripts: Vec<_> = tx_ids
            .iter()
            .filter_map(|id| cdot.get_transcript(id).map(|tx| (*id, tx)))
            .collect();
        if transcripts.is_empty() {
            continue;
        }
        // Overwrite the annotation file with the transcript-aware document.
        let annotations = build_nc_annotations_with_selectors(&nc_id, seq_len, &transcripts);
        let ann_file = File::create(&ann_path).map_err(|e| FerroError::Io {
            msg: format!("Failed to create {}: {}", ann_path.display(), e),
        })?;
        serde_json::to_writer(ann_file, &annotations).map_err(|e| FerroError::Io {
            msg: format!("Failed to write annotations: {}", e),
        })?;
        enhanced += 1;
        if enhanced % 10 == 0 {
            eprintln!(
                " Enhanced {} NC_ files ({} transcripts for {})...",
                enhanced,
                transcripts.len(),
                nc_id
            );
        }
    }
    eprintln!("Enhanced {} NC_ annotation files", enhanced);
    Ok(enhanced)
}
/// Extract every accession mentioned in a tab-separated ClinVar export
/// (optionally gzip-compressed), scanning the nucleotide (column 6) and
/// protein (column 8) expression fields. Returns a sorted, de-duplicated
/// list.
pub fn extract_clinvar_accessions<P: AsRef<Path>>(
    clinvar_file: P,
) -> Result<Vec<String>, FerroError> {
    use flate2::read::GzDecoder;
    // Zero-based tab-separated columns holding accession expressions.
    const NUCLEOTIDE_COLUMN: usize = 6;
    const PROTEIN_COLUMN: usize = 8;
    let clinvar_file = clinvar_file.as_ref();
    let file = File::open(clinvar_file).map_err(|e| FerroError::Io {
        msg: format!(
            "Failed to open ClinVar file {}: {}",
            clinvar_file.display(),
            e
        ),
    })?;
    // Transparently decompress ".gz" inputs.
    let reader: Box<dyn BufRead> = if clinvar_file.extension().is_some_and(|e| e == "gz") {
        Box::new(BufReader::new(GzDecoder::new(file)))
    } else {
        Box::new(BufReader::new(file))
    };
    let mut found = HashSet::new();
    let mut data_lines = 0usize;
    for line in reader.lines() {
        let line = line.map_err(|e| FerroError::Io {
            msg: format!("Error reading line: {}", e),
        })?;
        if line.starts_with('#') {
            continue;
        }
        data_lines += 1;
        // Periodic progress output for very large files.
        if data_lines.is_multiple_of(500_000) {
            eprintln!(
                " Processed {} lines, found {} accessions...",
                data_lines,
                found.len()
            );
        }
        let fields: Vec<&str> = line.split('\t').collect();
        for &column in &[NUCLEOTIDE_COLUMN, PROTEIN_COLUMN] {
            if let Some(field) = fields.get(column) {
                // A field may hold several '|'-separated expressions.
                for pattern in field.split('|') {
                    let pattern = pattern.trim();
                    if pattern.is_empty() || pattern == "-" {
                        continue;
                    }
                    found.extend(extract_accessions_from_pattern(pattern));
                }
            }
        }
    }
    eprintln!(" Processed {} lines total", data_lines);
    let mut result: Vec<String> = found.into_iter().collect();
    result.sort();
    Ok(result)
}
/// Populate the cache with LRG reference sequences, preferring local copies
/// under `<ferro_reference>/lrg/` and falling back to the EBI FTP mirror.
///
/// LRG ids are probed densely from 1 to `LRG_MAX_ID`; ids with no published
/// FASTA simply fail the download and are skipped. Existing cache entries
/// are left untouched. Returns the number of sequences added
/// (copied + fetched).
pub fn fetch_lrg_sequences(
    cache_dir: &Path,
    ferro_reference: Option<&Path>,
) -> Result<usize, FerroError> {
    use std::process::Command;
    // Highest LRG id to probe.
    const LRG_MAX_ID: usize = 1400;
    const LRG_BASE_URL: &str = "https://ftp.ebi.ac.uk/pub/databases/lrgex/fasta/";
    let mut copied = 0usize;
    let mut fetched = 0usize;
    let mut skipped = 0usize;
    let ferro_lrg_dir = ferro_reference.map(|r| r.join("lrg"));
    for i in 1..=LRG_MAX_ID {
        let lrg_id = format!("LRG_{}", i);
        let seq_path = cache_dir.join(format!("{}.sequence", lrg_id));
        // Already cached: nothing to do.
        if seq_path.exists() {
            skipped += 1;
            continue;
        }
        // First choice: copy from a local ferro reference checkout.
        if let Some(ref lrg_dir) = ferro_lrg_dir {
            let ferro_fasta = lrg_dir.join(format!("{}.fasta", lrg_id));
            if ferro_fasta.exists() {
                if let Ok(fasta_content) = std::fs::read_to_string(&ferro_fasta) {
                    // Concatenate all non-header lines into one sequence.
                    let mut sequence = String::new();
                    for line in fasta_content.lines() {
                        if !line.starts_with('>') {
                            sequence.push_str(line.trim());
                        }
                    }
                    if !sequence.is_empty() {
                        if std::fs::write(&seq_path, &sequence).is_ok() {
                            // Sidecar annotations are best-effort.
                            let annotations = build_minimal_annotations(&lrg_id, sequence.len());
                            let annotations_path =
                                cache_dir.join(format!("{}.annotations", lrg_id));
                            if let Ok(json_str) = serde_json::to_string_pretty(&annotations) {
                                let _ = std::fs::write(&annotations_path, json_str);
                            }
                            copied += 1;
                            if copied.is_multiple_of(100) {
                                eprintln!(" Copied {} LRG sequences from ferro...", copied);
                            }
                            continue;
                        }
                    }
                }
            }
        }
        // Fallback: download the FASTA from the EBI FTP mirror. curl's "-f"
        // makes missing ids fail instead of saving an error page.
        let url = format!("{}LRG_{}.fasta", LRG_BASE_URL, i);
        let output = Command::new("curl").args(["-s", "-f", "-L", &url]).output();
        let output = match output {
            Ok(o) if o.status.success() => o,
            _ => continue,
        };
        let fasta_content = String::from_utf8_lossy(&output.stdout);
        if fasta_content.is_empty() || !fasta_content.starts_with('>') {
            continue;
        }
        let mut sequence = String::new();
        for line in fasta_content.lines() {
            if !line.starts_with('>') {
                sequence.push_str(line.trim());
            }
        }
        if sequence.is_empty() {
            continue;
        }
        if std::fs::write(&seq_path, &sequence).is_err() {
            continue;
        }
        let annotations = build_minimal_annotations(&lrg_id, sequence.len());
        let annotations_path = cache_dir.join(format!("{}.annotations", lrg_id));
        if let Ok(json_str) = serde_json::to_string_pretty(&annotations) {
            let _ = std::fs::write(&annotations_path, json_str);
        }
        fetched += 1;
        if fetched.is_multiple_of(100) {
            eprintln!(" Fetched {} LRG sequences from EBI...", fetched);
        }
        // Small pause between downloads to be polite to the EBI server.
        std::thread::sleep(std::time::Duration::from_millis(100));
    }
    if skipped > 0 {
        eprintln!(" Skipped {} existing LRG sequences", skipped);
    }
    if copied > 0 {
        eprintln!(" Copied {} LRG sequences from ferro reference", copied);
    }
    if fetched > 0 {
        eprintln!(" Fetched {} LRG sequences from EBI", fetched);
    }
    Ok(copied + fetched)
}
/// Replace minimal LRG annotations in `cache_dir` with the full gene /
/// transcript / exon structure parsed from each LRG XML record.
///
/// For every `LRG_<n>.sequence` already cached (n = 1..=1400), the XML is
/// taken from `ferro_reference`'s `lrg/` directory when present, otherwise
/// downloaded from the EBI FTP site via `curl`. Annotation files that already
/// contain exon features are left untouched.
///
/// Returns the number of LRG records whose annotations were (re)written.
///
/// # Errors
/// Currently always returns `Ok`; per-record failures are counted and
/// reported on stderr instead of aborting the whole run.
pub fn enhance_lrg_annotations(
    cache_dir: &Path,
    ferro_reference: Option<&Path>,
) -> Result<usize, FerroError> {
    use std::process::Command;
    // LRG ids are assigned sequentially; 1400 covers the released set
    // (same bound as the sequence-fetching pass above).
    const LRG_MAX_ID: usize = 1400;
    const LRG_XML_URL: &str = "https://ftp.ebi.ac.uk/pub/databases/lrgex/";
    let mut enhanced = 0usize;
    let mut skipped = 0usize;
    let mut from_ferro = 0usize;
    let mut from_ebi = 0usize;
    let mut failed = 0usize;
    let ferro_lrg_dir = ferro_reference.map(|r| r.join("lrg"));
    eprintln!("Enhancing LRG annotations...");
    for i in 1..=LRG_MAX_ID {
        let lrg_id = format!("LRG_{}", i);
        let seq_path = cache_dir.join(format!("{}.sequence", &lrg_id));
        // Only enhance records whose sequence is already cached.
        if !seq_path.exists() {
            continue;
        }
        let ann_path = cache_dir.join(format!("{}.annotations", &lrg_id));
        // An annotation file that already mentions exon features was
        // produced by a previous enhancement pass — skip it.
        if ann_path.exists() {
            if let Ok(content) = std::fs::read_to_string(&ann_path) {
                if content.contains("\"type\":\"exon\"") {
                    skipped += 1;
                    continue;
                }
            }
        }
        // Prefer XML shipped with the ferro reference, if available.
        let xml_content = if let Some(ref lrg_dir) = ferro_lrg_dir {
            let ferro_xml_path = lrg_dir.join(format!("{}.xml", lrg_id));
            if ferro_xml_path.exists() {
                match std::fs::read_to_string(&ferro_xml_path) {
                    // Sanity check: the file must look like an LRG record.
                    Ok(content) if content.contains("<lrg") => {
                        from_ferro += 1;
                        Some(content)
                    }
                    _ => None,
                }
            } else {
                None
            }
        } else {
            None
        };
        // Fall back to downloading the XML from EBI.
        let xml_content = match xml_content {
            Some(content) => content,
            None => {
                let url = format!("{}{}.xml", LRG_XML_URL, lrg_id);
                let output = Command::new("curl").args(["-s", "-f", "-L", &url]).output();
                match output {
                    Ok(o) if o.status.success() => {
                        let content = String::from_utf8_lossy(&o.stdout).to_string();
                        if content.is_empty() || !content.contains("<lrg") {
                            failed += 1;
                            continue;
                        }
                        from_ebi += 1;
                        // Be polite to the EBI server between downloads.
                        std::thread::sleep(std::time::Duration::from_millis(100));
                        content
                    }
                    _ => {
                        failed += 1;
                        continue;
                    }
                }
            }
        };
        match parse_lrg_xml_to_annotations(&lrg_id, &xml_content) {
            Ok((genomic_ann, transcript_anns)) => {
                // Overwrite the genomic annotation file...
                if let Ok(json_str) = serde_json::to_string_pretty(&genomic_ann) {
                    let _ = std::fs::write(&ann_path, json_str);
                }
                // ...and write one annotation (plus optional cDNA sequence)
                // file per transcript id (lrg id + transcript name).
                for (tx_id, tx_ann, tx_seq) in transcript_anns {
                    let tx_ann_path = cache_dir.join(format!("{}.annotations", tx_id));
                    let tx_seq_path = cache_dir.join(format!("{}.sequence", tx_id));
                    if let Ok(json_str) = serde_json::to_string_pretty(&tx_ann) {
                        let _ = std::fs::write(&tx_ann_path, json_str);
                    }
                    if let Some(seq) = tx_seq {
                        let _ = std::fs::write(&tx_seq_path, seq);
                    }
                }
                enhanced += 1;
                if enhanced.is_multiple_of(50) {
                    eprintln!(" Enhanced {} LRG annotations...", enhanced);
                }
            }
            Err(_) => {
                failed += 1;
            }
        }
    }
    if skipped > 0 {
        eprintln!(" Skipped {} already-enhanced LRG annotations", skipped);
    }
    if from_ferro > 0 {
        eprintln!(
            " Enhanced {} LRG annotations from ferro XML cache",
            from_ferro
        );
    }
    if from_ebi > 0 {
        eprintln!(" Enhanced {} LRG annotations from EBI", from_ebi);
    }
    if failed > 0 {
        eprintln!(" Failed to enhance {} LRG annotations", failed);
    }
    Ok(enhanced)
}
/// Convert one LRG XML record into annotation JSON structures.
///
/// Returns the genomic annotation for `lrg_id` plus, for every
/// `<transcript name="...">` block, a `(transcript_id, annotation,
/// optional cDNA sequence)` tuple, where `transcript_id` is `lrg_id` with
/// the transcript name appended.
///
/// Positions are taken from `<coordinates>` elements whose `coord_system`
/// attribute equals `lrg_id`.
///
/// # Errors
/// Never returns `Err` in the current implementation; transcripts whose
/// delimiting tags cannot be located are simply skipped.
#[allow(clippy::type_complexity)]
fn parse_lrg_xml_to_annotations(
    lrg_id: &str,
    xml_content: &str,
) -> Result<
    (
        serde_json::Value,
        Vec<(String, serde_json::Value, Option<String>)>,
    ),
    FerroError,
> {
    // Genomic sequence length comes from the first <sequence> element's text.
    let seq_len = extract_xml_value(xml_content, "sequence")
        .map(|s| s.len())
        .unwrap_or(0);
    let mol_type = extract_xml_attr(xml_content, "mol_type").unwrap_or_else(|| "dna".to_string());
    let mut transcript_features = Vec::new();
    let mut transcript_annotations = Vec::new();
    let mut pos = 0;
    // Walk every <transcript name="..."> ... </transcript> block in order.
    while let Some(tx_start) = xml_content[pos..].find("<transcript name=\"") {
        let tx_start = pos + tx_start;
        // `<transcript name="` is 18 bytes; the name runs to the next quote.
        let tx_name_start = tx_start + 18;
        let tx_name_end = match xml_content[tx_name_start..].find('"') {
            Some(e) => tx_name_start + e,
            None => break,
        };
        let tx_name = &xml_content[tx_name_start..tx_name_end];
        let tx_id = format!("{}{}", lrg_id, tx_name);
        // `</transcript>` is 13 bytes; include it in the block slice.
        let tx_end = match xml_content[tx_start..].find("</transcript>") {
            Some(e) => tx_start + e + 13,
            None => break,
        };
        let tx_block = &xml_content[tx_start..tx_end];
        // Transcript span in LRG genomic coordinates; defaults to the whole
        // record on the forward strand when no coordinates are found.
        let (tx_start_coord, tx_end_coord, tx_strand) =
            extract_lrg_coordinates(tx_block, lrg_id).unwrap_or((0, seq_len, 1));
        let cdna_seq = extract_xml_value(tx_block, "sequence");
        let (cds_start, cds_end) = extract_lrg_cds_coordinates(tx_block, lrg_id);
        let exons = extract_lrg_exons(tx_block, lrg_id);
        // A transcript with a coding region is mRNA, otherwise ncRNA.
        let tx_mol_type = if cds_start.is_some() { "mRNA" } else { "ncRNA" };
        // Feature describing this transcript inside the genomic record.
        let mut tx_feature = json!({
            "type": tx_mol_type,
            "id": tx_name,
            "location": {
                "type": "range",
                "start": {"type": "point", "position": tx_start_coord},
                "end": {"type": "point", "position": tx_end_coord},
                "strand": tx_strand
            },
            "features": []
        });
        let exon_features: Vec<serde_json::Value> = exons
            .iter()
            .map(|(label, start, end)| {
                json!({
                    "type": "exon",
                    "id": label,
                    "location": {
                        "type": "range",
                        "start": {"type": "point", "position": *start},
                        "end": {"type": "point", "position": *end},
                        "strand": tx_strand
                    }
                })
            })
            .collect();
        tx_feature["features"] = json!(exon_features);
        // Attach the CDS span (genomic coordinates) when present.
        if let (Some(cds_s), Some(cds_e)) = (cds_start, cds_end) {
            tx_feature["cds"] = json!({
                "location": {
                    "type": "range",
                    "start": {"type": "point", "position": cds_s},
                    "end": {"type": "point", "position": cds_e}
                }
            });
        }
        transcript_features.push(tx_feature);
        // cDNA length: prefer the embedded sequence, fall back to the
        // genomic span length.
        let cdna_len = cdna_seq
            .as_ref()
            .map(|s| s.len())
            .unwrap_or(tx_end_coord - tx_start_coord);
        // Project the genomic CDS span onto transcript (cDNA) coordinates.
        let tx_cds = if let (Some(cds_s), Some(cds_e)) = (cds_start, cds_end) {
            calculate_transcript_cds(&exons, cds_s, cds_e, tx_start_coord)
        } else {
            None
        };
        // Standalone annotation record for the transcript itself, with a
        // synthetic gene -> mRNA/ncRNA feature chain spanning the full cDNA.
        let mut tx_ann = json!({
            "type": "record",
            "id": tx_id,
            "location": {
                "type": "range",
                "start": {"type": "point", "position": 0},
                "end": {"type": "point", "position": cdna_len}
            },
            "qualifiers": {
                "mol_type": tx_mol_type,
                "name": tx_id
            },
            "features": [{
                "type": "gene",
                "id": tx_id,
                "location": {
                    "type": "range",
                    "start": {"type": "point", "position": 0},
                    "end": {"type": "point", "position": cdna_len},
                    "strand": 1
                },
                "features": [{
                    "type": tx_mol_type,
                    "id": tx_id,
                    "location": {
                        "type": "range",
                        "start": {"type": "point", "position": 0},
                        "end": {"type": "point", "position": cdna_len}
                    },
                    "features": []
                }]
            }]
        });
        // Nest the CDS feature (transcript coordinates) under the mRNA node.
        if let Some((cds_tx_start, cds_tx_end)) = tx_cds {
            if let Some(gene_features) = tx_ann["features"][0]["features"].as_array_mut() {
                if let Some(mrna) = gene_features.first_mut() {
                    mrna["features"] = json!([{
                        "type": "CDS",
                        "id": tx_id,
                        "location": {
                            "type": "range",
                            "start": {"type": "point", "position": cds_tx_start},
                            "end": {"type": "point", "position": cds_tx_end}
                        }
                    }]);
                }
            }
        }
        transcript_annotations.push((tx_id, tx_ann, cdna_seq));
        pos = tx_end;
    }
    // Genomic record: one synthetic gene wrapping all transcript features.
    let genomic_ann = json!({
        "type": "record",
        "id": lrg_id,
        "location": {
            "type": "range",
            "start": {"type": "point", "position": 0},
            "end": {"type": "point", "position": seq_len}
        },
        "qualifiers": {
            "mol_type": mol_type,
            "name": lrg_id
        },
        "features": [{
            "type": "gene",
            "id": lrg_id,
            "location": {
                "type": "range",
                "start": {"type": "point", "position": 0},
                "end": {"type": "point", "position": seq_len},
                "strand": 1
            },
            "features": transcript_features
        }]
    });
    Ok((genomic_ann, transcript_annotations))
}
/// Return the text between the first `<tag>` and the following `</tag>`,
/// or `None` when either delimiter is missing.
fn extract_xml_value(xml: &str, tag: &str) -> Option<String> {
    let open = format!("<{}>", tag);
    let close = format!("</{}>", tag);
    let body_start = xml.find(&open)? + open.len();
    let body_len = xml[body_start..].find(&close)?;
    Some(xml[body_start..body_start + body_len].to_owned())
}
/// NOTE(review): despite the name, this extracts *element text* exactly like
/// `extract_xml_value` (it searches for `<tag>...</tag>`, not a `tag="..."`
/// attribute). The caller passes element names such as `mol_type`, so that
/// behavior is preserved verbatim here — confirm before renaming or merging
/// the two helpers.
fn extract_xml_attr(xml: &str, tag: &str) -> Option<String> {
    let open_tag = format!("<{}>", tag);
    let close_tag = format!("</{}>", tag);
    let after_open = xml.find(&open_tag)? + open_tag.len();
    let len = xml[after_open..].find(&close_tag)?;
    Some(xml[after_open..after_open + len].to_string())
}
/// Extract `(start, end, strand)` from the element in `block` whose
/// `coord_system` attribute equals `coord_system`.
///
/// Returns `None` when no matching element, or its `start`/`end`
/// attributes, can be found. A missing or unparseable `strand` defaults
/// to the forward strand (`1`).
fn extract_lrg_coordinates(block: &str, coord_system: &str) -> Option<(usize, usize, i32)> {
    let search = format!("coord_system=\"{}\"", coord_system);
    let pos = block.find(&search)?;
    // Expand outwards from the attribute to the enclosing `<... >` element.
    let line_start = block[..pos].rfind('<')?;
    let line_end = block[pos..].find('>')?;
    let coord_line = &block[line_start..pos + line_end + 1];
    let start = extract_number_attr(coord_line, "start")?;
    let end = extract_number_attr(coord_line, "end")?;
    // Parse the strand as a *signed* integer: LRG uses strand="-1" for the
    // reverse strand, which the previous usize-based parse rejected and then
    // silently defaulted to +1, flipping every reverse-strand mapping.
    let strand = coord_line
        .find("strand=\"")
        .and_then(|p| {
            let s = p + "strand=\"".len();
            coord_line[s..]
                .find('"')
                .and_then(|q| coord_line[s..s + q].parse::<i32>().ok())
        })
        .unwrap_or(1);
    Some((start, end, strand))
}
/// Extract the CDS start/end (in `coord_system` coordinates) from the first
/// `<coding_region>` element of `block`. Returns `(None, None)` when there
/// is no coding region or its coordinates cannot be parsed.
fn extract_lrg_cds_coordinates(block: &str, coord_system: &str) -> (Option<usize>, Option<usize>) {
    let cr_start = match block.find("<coding_region>") {
        Some(p) => p,
        None => return (None, None),
    };
    // An unterminated element falls back to the rest of the block.
    let cr_len = block[cr_start..]
        .find("</coding_region>")
        .unwrap_or(block.len() - cr_start);
    let cr_block = &block[cr_start..cr_start + cr_len];
    match extract_lrg_coordinates(cr_block, coord_system) {
        Some((start, end, _)) => (Some(start), Some(end)),
        None => (None, None),
    }
}
/// Collect `(label, start, end)` triples for every `<exon label="...">`
/// element in `block`, using the coordinates for `coord_system`.
///
/// Handles both container exons (`<exon ...>...</exon>`) and self-closing
/// exons (`<exon ... />`). The previous implementation searched for
/// `</exon>` first, so a self-closing exon followed by a container exon was
/// merged into a single span and the second exon was dropped; the opening
/// tag is now inspected to decide which form applies.
fn extract_lrg_exons(block: &str, coord_system: &str) -> Vec<(String, usize, usize)> {
    let mut exons = Vec::new();
    let mut pos = 0;
    while let Some(rel) = block[pos..].find("<exon label=\"") {
        let exon_start = pos + rel;
        // The label text begins right after `<exon label="` (13 bytes).
        let label_start = exon_start + 13;
        let label_end = match block[label_start..].find('"') {
            Some(e) => label_start + e,
            None => break,
        };
        let label = block[label_start..label_end].to_string();
        // Find the end of the opening tag to distinguish `<exon .../>`
        // from `<exon ...>...</exon>`.
        let tag_close = match block[exon_start..].find('>') {
            Some(e) => exon_start + e,
            None => break,
        };
        let exon_end = if tag_close > exon_start && block.as_bytes()[tag_close - 1] == b'/' {
            // Self-closing exon element ends with its own tag.
            tag_close + 1
        } else {
            match block[exon_start..].find("</exon>") {
                Some(e) => exon_start + e + 7,
                // Malformed input: fall back to the next self-closing marker,
                // matching the previous best-effort behavior.
                None => match block[exon_start..].find("/>") {
                    Some(e) => exon_start + e + 2,
                    None => break,
                },
            }
        };
        let exon_block = &block[exon_start..exon_end];
        if let Some((start, end, _)) = extract_lrg_coordinates(exon_block, coord_system) {
            exons.push((label, start, end));
        }
        pos = exon_end;
    }
    exons
}
/// Parse the quoted value of `attr="..."` in `element` as a `usize`.
/// Returns `None` when the attribute is missing, unterminated, or not a
/// non-negative integer.
fn extract_number_attr(element: &str, attr: &str) -> Option<usize> {
    let marker = format!("{}=\"", attr);
    let value_start = element.find(&marker)? + marker.len();
    let value_len = element[value_start..].find('"')?;
    element[value_start..value_start + value_len].parse().ok()
}
/// Project genomic CDS boundaries onto transcript (cDNA) coordinates by
/// accumulating exon lengths. Returns `None` when either boundary does not
/// fall inside any exon (or when there are no exons at all).
fn calculate_transcript_cds(
    exons: &[(String, usize, usize)],
    cds_start: usize,
    cds_end: usize,
    _tx_start: usize,
) -> Option<(usize, usize)> {
    if exons.is_empty() {
        return None;
    }
    // Transcript-space offset of the current exon's first base.
    let mut offset = 0usize;
    let mut start_in_tx: Option<usize> = None;
    let mut end_in_tx: Option<usize> = None;
    for (_label, exon_start, exon_end) in exons {
        if start_in_tx.is_none() && (*exon_start..=*exon_end).contains(&cds_start) {
            start_in_tx = Some(offset + cds_start - exon_start);
        }
        if end_in_tx.is_none() && (*exon_start..=*exon_end).contains(&cds_end) {
            end_in_tx = Some(offset + cds_end - exon_start);
        }
        offset += exon_end - exon_start;
        if let (Some(s), Some(e)) = (start_in_tx, end_in_tx) {
            return Some((s, e));
        }
    }
    // Only a full pair of projected boundaries is meaningful.
    start_in_tx.zip(end_in_tx)
}
/// Provenance counters for transcript→chromosome mappings.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MappingStats {
    /// Mappings taken from the cdot transcript JSON.
    pub from_cdot: usize,
    /// Mappings added from NCBI gene2refseq that were absent from cdot.
    pub from_gene2refseq: usize,
    /// Total number of mappings produced.
    pub total: usize,
}
/// On-disk JSON format of the transcript→chromosome mapping file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TranscriptMapping {
    /// Assembly name the mappings were built for (the `assembly` argument
    /// passed to `build_transcript_chromosome_mapping`).
    pub assembly: String,
    /// Transcript accession → chromosome accession.
    pub mappings: HashMap<String, String>,
    /// Counts of where the mappings came from.
    pub stats: MappingStats,
}
pub fn extract_cdot_mappings<P: AsRef<Path>>(
cdot_path: P,
assembly: &str,
) -> Result<HashMap<String, String>, FerroError> {
let cdot_path = cdot_path.as_ref();
eprintln!("Loading cdot mappings from {}...", cdot_path.display());
let file = File::open(cdot_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open cdot file: {}", e),
})?;
let value: serde_json::Value = if cdot_path.extension().is_some_and(|e| e == "gz") {
let decoder = flate2::read::GzDecoder::new(file);
let reader = BufReader::new(decoder);
serde_json::from_reader(reader).map_err(|e| FerroError::Json {
msg: format!("Failed to parse cdot JSON: {}", e),
})?
} else {
let reader = BufReader::new(file);
serde_json::from_reader(reader).map_err(|e| FerroError::Json {
msg: format!("Failed to parse cdot JSON: {}", e),
})?
};
let mut mappings = HashMap::new();
if let Some(transcripts) = value.get("transcripts").and_then(|t| t.as_object()) {
for (acc, tx_data) in transcripts {
if let Some(contig) = tx_data
.get("genome_builds")
.and_then(|gb| gb.get(assembly))
.and_then(|asm| asm.get("contig"))
.and_then(|c| c.as_str())
{
mappings.insert(acc.clone(), contig.to_string());
}
}
}
eprintln!(
" Extracted {} transcript→chromosome mappings",
mappings.len()
);
Ok(mappings)
}
/// Download NCBI's `gene2refseq.gz` to `output_path` using `curl`.
/// The download is skipped when the file already exists.
///
/// # Errors
/// Returns `FerroError::Io` when curl cannot be spawned or exits non-zero.
pub fn download_gene2refseq<P: AsRef<Path>>(output_path: P) -> Result<(), FerroError> {
    const URL: &str = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz";
    let output_path = output_path.as_ref();
    if output_path.exists() {
        eprintln!("gene2refseq already exists at {}", output_path.display());
        return Ok(());
    }
    eprintln!("Downloading gene2refseq.gz from NCBI (this may take a while, ~2GB)...");
    let dest = output_path.display().to_string();
    let status = std::process::Command::new("curl")
        .args(["-o", &dest, "-L", "-f", URL])
        .status()
        .map_err(|e| FerroError::Io {
            msg: format!("Failed to run curl: {}", e),
        })?;
    if status.success() {
        eprintln!(" Downloaded to {}", output_path.display());
        Ok(())
    } else {
        Err(FerroError::Io {
            msg: format!(
                "Failed to download gene2refseq.gz (exit code: {:?})",
                status.code()
            ),
        })
    }
}
/// Extract human transcript→chromosome mappings for `assembly` from an NCBI
/// gene2refseq dump (plain or gzipped, detected by a `.gz` extension).
///
/// Only NM_/NR_ transcripts mapped to NC_ chromosome accessions are kept;
/// the first chromosome seen for each transcript wins.
///
/// # Errors
/// Returns `FerroError::Io` on open or read failures.
pub fn extract_gene2refseq_mappings<P: AsRef<Path>>(
    gene2refseq_path: P,
    assembly: &str,
) -> Result<HashMap<String, String>, FerroError> {
    let gene2refseq_path = gene2refseq_path.as_ref();
    eprintln!("Extracting {} mappings from gene2refseq...", assembly);
    let file = File::open(gene2refseq_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to open gene2refseq: {}", e),
    })?;
    let gzipped = gene2refseq_path.extension().is_some_and(|e| e == "gz");
    let reader: Box<dyn BufRead> = if gzipped {
        Box::new(BufReader::new(flate2::read::GzDecoder::new(file)))
    } else {
        Box::new(BufReader::new(file))
    };
    let mut mappings = HashMap::new();
    let mut lines_processed = 0u64;
    for line in reader.lines() {
        let line = line.map_err(|e| FerroError::Io {
            msg: format!("Error reading gene2refseq: {}", e),
        })?;
        // Header lines are not counted toward progress.
        if line.starts_with('#') {
            continue;
        }
        lines_processed += 1;
        if lines_processed.is_multiple_of(5_000_000) {
            eprintln!(
                " Processed {} million lines...",
                lines_processed / 1_000_000
            );
        }
        let cols: Vec<&str> = line.split('\t').collect();
        // Need tax_id(0), transcript(3), genomic(7) and assembly(12) columns,
        // and only human (taxon 9606) rows.
        if cols.len() < 13 || cols[0] != "9606" {
            continue;
        }
        let transcript = cols[3];
        let is_refseq_tx = transcript.starts_with("NM_") || transcript.starts_with("NR_");
        if transcript == "-" || !is_refseq_tx {
            continue;
        }
        let genomic = cols[7];
        if genomic == "-" || !genomic.starts_with("NC_") {
            continue;
        }
        if !cols[12].contains(assembly) {
            continue;
        }
        // First mapping seen for a transcript wins.
        mappings
            .entry(transcript.to_string())
            .or_insert_with(|| genomic.to_string());
    }
    eprintln!(
        " Extracted {} transcript→chromosome mappings from gene2refseq",
        mappings.len()
    );
    Ok(mappings)
}
/// Build the transcript→chromosome mapping file for `assembly`, combining
/// cdot (primary source) with an optional gene2refseq dump (gap filler),
/// and write it to `output_path` as pretty-printed JSON.
///
/// # Errors
/// Propagates I/O and JSON errors from the extraction and write steps.
pub fn build_transcript_chromosome_mapping<P: AsRef<Path>>(
    cdot_path: P,
    gene2refseq_path: Option<P>,
    output_path: P,
    assembly: &str,
) -> Result<MappingStats, FerroError> {
    let output_path = output_path.as_ref();
    // cdot is authoritative; gene2refseq only fills transcripts cdot lacks.
    let mut mappings = extract_cdot_mappings(&cdot_path, assembly)?;
    let from_cdot = mappings.len();
    let mut from_gene2refseq = 0usize;
    if let Some(g2r_path) = gene2refseq_path {
        for (transcript, chromosome) in extract_gene2refseq_mappings(g2r_path, assembly)? {
            if let std::collections::hash_map::Entry::Vacant(slot) = mappings.entry(transcript) {
                slot.insert(chromosome);
                from_gene2refseq += 1;
            }
        }
        eprintln!(
            " Added {} mappings from gene2refseq (not in cdot)",
            from_gene2refseq
        );
    }
    let stats = MappingStats {
        from_cdot,
        from_gene2refseq,
        total: mappings.len(),
    };
    let output = TranscriptMapping {
        assembly: assembly.to_string(),
        mappings,
        stats: stats.clone(),
    };
    let file = File::create(output_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to create mapping file: {}", e),
    })?;
    serde_json::to_writer_pretty(file, &output).map_err(|e| FerroError::Json {
        msg: format!("Failed to write mapping file: {}", e),
    })?;
    eprintln!(
        "\nMapping file written to {} ({} mappings)",
        output_path.display(),
        stats.total
    );
    Ok(stats)
}
/// Load the transcript→chromosome mapping table from a JSON file previously
/// produced by `build_transcript_chromosome_mapping`.
///
/// # Errors
/// Returns `FerroError::Io` on open failure and `FerroError::Json` on
/// parse failure.
pub fn load_transcript_mapping<P: AsRef<Path>>(
    path: P,
) -> Result<HashMap<String, String>, FerroError> {
    let file = File::open(path.as_ref()).map_err(|e| FerroError::Io {
        msg: format!("Failed to open mapping file: {}", e),
    })?;
    let reader = BufReader::new(file);
    let parsed: TranscriptMapping = serde_json::from_reader(reader).map_err(|e| FerroError::Json {
        msg: format!("Failed to parse mapping file: {}", e),
    })?;
    Ok(parsed.mappings)
}
/// Collect the unique protein accessions mentioned in column 9 of a ClinVar
/// tab-separated dump (plain or gzipped), returned sorted.
///
/// # Errors
/// Returns `FerroError::Io` on open or read failures.
pub fn extract_clinvar_protein_accessions<P: AsRef<Path>>(
    clinvar_path: P,
) -> Result<Vec<String>, FerroError> {
    use flate2::read::GzDecoder;
    let clinvar_path = clinvar_path.as_ref();
    eprintln!(
        "Extracting protein accessions from {}...",
        clinvar_path.display()
    );
    let file = File::open(clinvar_path).map_err(|e| FerroError::Io {
        msg: format!("Failed to open ClinVar file: {}", e),
    })?;
    let gz = clinvar_path.to_string_lossy().ends_with(".gz");
    let reader: Box<dyn BufRead> = if gz {
        Box::new(BufReader::new(GzDecoder::new(file)))
    } else {
        Box::new(BufReader::new(file))
    };
    let mut accessions = HashSet::new();
    let mut line_count = 0u64;
    for line in reader.lines() {
        let line = line.map_err(|e| FerroError::Io {
            msg: format!("Failed to read line: {}", e),
        })?;
        if line.starts_with('#') {
            continue;
        }
        line_count += 1;
        if line_count.is_multiple_of(1_000_000) {
            eprintln!(
                " Processed {} lines, found {} accessions...",
                line_count,
                accessions.len()
            );
        }
        // Column 8 (0-based) holds "accession:change"; rows without it are
        // marked "-" or simply too short.
        let protein_col = line.split('\t').nth(8).unwrap_or("-");
        if protein_col != "-" && !protein_col.is_empty() {
            if let Some(acc) = protein_col.split(':').next() {
                accessions.insert(acc.to_string());
            }
        }
    }
    eprintln!(
        " Found {} unique protein accessions from {} lines",
        accessions.len(),
        line_count
    );
    let mut result: Vec<String> = accessions.into_iter().collect();
    result.sort();
    Ok(result)
}
/// Outcome counters for one protein-cache population run.
#[derive(Debug, Clone, Default)]
pub struct ProteinCacheStats {
    /// Sequences fetched from NCBI efetch.
    pub ncbi_fetched: usize,
    /// Sequences fetched from the UniProt REST API.
    pub uniprot_fetched: usize,
    /// Accessions skipped because a cached sequence already existed.
    pub already_cached: usize,
    /// Accessions that were neither cached nor successfully fetched.
    pub failed: usize,
    /// Total accessions requested.
    pub total: usize,
}
/// Ensure cached protein sequences exist for every accession in `accessions`.
///
/// Accessions already present as `<parent-of-cache_dir>/<acc>.sequence` are
/// skipped; RefSeq protein accessions (NP_/XP_/YP_) are fetched from NCBI and
/// UniProt-shaped accessions from UniProt. Accessions matching neither
/// pattern are counted as failed.
///
/// # Errors
/// Returns `FerroError::Io` when the cache directory cannot be created or a
/// fetch/write step fails fatally.
pub fn populate_protein_cache<P: AsRef<Path>>(
    accessions: &[String],
    cache_dir: P,
) -> Result<ProteinCacheStats, FerroError> {
    let cache_dir = cache_dir.as_ref();
    std::fs::create_dir_all(cache_dir).map_err(|e| FerroError::Io {
        msg: format!("Failed to create protein cache dir: {}", e),
    })?;
    let mut stats = ProteinCacheStats {
        total: accessions.len(),
        ..Default::default()
    };
    let mut ncbi_accessions: Vec<&str> = Vec::new();
    let mut uniprot_accessions: Vec<&str> = Vec::new();
    // Sequences are cached as "<acc>.sequence" in the *parent* of `cache_dir`
    // (see `save_protein_fasta`), so the existence check looks there.
    let parent_dir = cache_dir.parent();
    for acc in accessions {
        let already_cached = parent_dir
            .map(|p| p.join(format!("{}.sequence", acc)).exists())
            .unwrap_or(false);
        if already_cached {
            stats.already_cached += 1;
            continue;
        }
        if acc.starts_with("NP_") || acc.starts_with("XP_") || acc.starts_with("YP_") {
            ncbi_accessions.push(acc);
        } else if is_uniprot_accession(acc) {
            uniprot_accessions.push(acc);
        }
        // Anything else is silently left for the `failed` tally below.
    }
    eprintln!(
        "Protein cache population: {} total, {} already cached",
        stats.total, stats.already_cached
    );
    eprintln!(
        " To fetch: {} NCBI, {} UniProt",
        ncbi_accessions.len(),
        uniprot_accessions.len()
    );
    if !ncbi_accessions.is_empty() {
        eprintln!(
            "Fetching {} protein sequences from NCBI...",
            ncbi_accessions.len()
        );
        let fetched = fetch_ncbi_protein_batch(&ncbi_accessions, cache_dir)?;
        stats.ncbi_fetched = fetched;
        eprintln!(" Fetched {} NCBI protein sequences", fetched);
    }
    if !uniprot_accessions.is_empty() {
        eprintln!(
            "Fetching {} protein sequences from UniProt...",
            uniprot_accessions.len()
        );
        let fetched = fetch_uniprot_batch(&uniprot_accessions, cache_dir)?;
        stats.uniprot_fetched = fetched;
        eprintln!(" Fetched {} UniProt protein sequences", fetched);
    }
    // The fetchers count FASTA records in the *responses*, which can exceed
    // the number of requested accessions (duplicate or extra records); the
    // previous bare subtraction could underflow and panic in debug builds.
    stats.failed = stats
        .total
        .saturating_sub(stats.already_cached)
        .saturating_sub(stats.ncbi_fetched)
        .saturating_sub(stats.uniprot_fetched);
    Ok(stats)
}
/// Heuristic check for a UniProt-shaped accession: 6–10 ASCII alphanumeric
/// characters. Versioned accessions (containing '.') and RefSeq-style IDs
/// (containing '_') fail the alphanumeric test and are rejected.
fn is_uniprot_accession(acc: &str) -> bool {
    (6..=10).contains(&acc.len()) && acc.chars().all(|c| c.is_ascii_alphanumeric())
}
/// Fetch protein sequences from NCBI efetch in batches of 100 accessions and
/// cache each FASTA record via `save_protein_fasta`. Returns the number of
/// records saved; a failed batch is logged and skipped, not fatal.
///
/// # Errors
/// Returns `FerroError::Io` when curl cannot be spawned or a cache write fails.
fn fetch_ncbi_protein_batch(accessions: &[&str], cache_dir: &Path) -> Result<usize, FerroError> {
    use std::process::Command;
    const BATCH_SIZE: usize = 100;
    let total_batches = accessions.len().div_ceil(BATCH_SIZE);
    let mut fetched = 0;
    for (batch_idx, batch) in accessions.chunks(BATCH_SIZE).enumerate() {
        let url = build_efetch_url("protein", &batch.join(","), "fasta", "text");
        let output = Command::new("curl")
            .args(["-s", "-L", &url])
            .output()
            .map_err(|e| FerroError::Io {
                msg: format!("Failed to fetch protein batch: {}", e),
            })?;
        if !output.status.success() {
            eprintln!(
                " Warning: Failed to fetch batch {}: {}",
                batch_idx + 1,
                String::from_utf8_lossy(&output.stderr)
            );
            continue;
        }
        let response = String::from_utf8_lossy(&output.stdout);
        // Walk the multi-record FASTA response; the in-flight record is
        // flushed when the next header (or end of input) is reached.
        let mut record: Option<(String, String)> = None;
        for line in response.lines() {
            if let Some(header) = line.strip_prefix('>') {
                if let Some((acc, seq)) = record.take() {
                    if !seq.is_empty() {
                        save_protein_fasta(cache_dir, &acc, &seq)?;
                        fetched += 1;
                    }
                }
                // The accession is the first whitespace-delimited token.
                record = header
                    .split_whitespace()
                    .next()
                    .map(|acc| (acc.to_string(), String::new()));
            } else if let Some((_, seq)) = record.as_mut() {
                seq.push_str(line.trim());
            }
        }
        if let Some((acc, seq)) = record {
            if !seq.is_empty() {
                save_protein_fasta(cache_dir, &acc, &seq)?;
                fetched += 1;
            }
        }
        eprintln!(
            " Batch {}/{}: fetched sequences",
            batch_idx + 1,
            total_batches
        );
        // Stay under NCBI's request-rate limit.
        std::thread::sleep(std::time::Duration::from_millis(350));
    }
    Ok(fetched)
}
/// Fetch protein sequences from the UniProt REST API in batches of 100
/// accessions and cache each FASTA record via `save_protein_fasta`.
/// Returns the number of records saved; failed batches are logged and
/// skipped.
///
/// # Errors
/// Returns `FerroError::Io` when curl cannot be spawned or a cache write fails.
fn fetch_uniprot_batch(accessions: &[&str], cache_dir: &Path) -> Result<usize, FerroError> {
    use std::process::Command;
    const BATCH_SIZE: usize = 100;
    let total_batches = accessions.len().div_ceil(BATCH_SIZE);
    let mut fetched = 0;
    for (batch_idx, batch) in accessions.chunks(BATCH_SIZE).enumerate() {
        let url = format!(
            "https://rest.uniprot.org/uniprotkb/accessions?accessions={}&format=fasta",
            batch.join(",")
        );
        let output = Command::new("curl")
            .args(["-s", "-L", &url])
            .output()
            .map_err(|e| FerroError::Io {
                msg: format!("Failed to fetch UniProt batch: {}", e),
            })?;
        if !output.status.success() {
            eprintln!(
                " Warning: Failed to fetch UniProt batch {}: {}",
                batch_idx + 1,
                String::from_utf8_lossy(&output.stderr)
            );
            continue;
        }
        let response = String::from_utf8_lossy(&output.stdout);
        // Walk the multi-record FASTA response; the in-flight record is
        // flushed when the next header (or end of input) is reached.
        let mut record: Option<(String, String)> = None;
        for line in response.lines() {
            if let Some(header) = line.strip_prefix('>') {
                if let Some((acc, seq)) = record.take() {
                    if !seq.is_empty() {
                        save_protein_fasta(cache_dir, &acc, &seq)?;
                        fetched += 1;
                    }
                }
                // Prefer the accession between the pipes of a `db|ACC|NAME`
                // header, falling back to the first whitespace token.
                record = header
                    .split('|')
                    .nth(1)
                    .or_else(|| header.split_whitespace().next())
                    .map(|acc| (acc.to_string(), String::new()));
            } else if let Some((_, seq)) = record.as_mut() {
                seq.push_str(line.trim());
            }
        }
        if let Some((acc, seq)) = record {
            if !seq.is_empty() {
                save_protein_fasta(cache_dir, &acc, &seq)?;
                fetched += 1;
            }
        }
        eprintln!(
            " Batch {}/{}: fetched sequences",
            batch_idx + 1,
            total_batches
        );
        // Rate-limit requests to the UniProt API.
        std::thread::sleep(std::time::Duration::from_millis(200));
    }
    Ok(fetched)
}
/// Write one protein sequence as `<cache_dir>/<acc>.fa` (FASTA, 60-column
/// wrapped) and, unless already present, as `<parent>/<acc>.sequence` plus a
/// minimal `<parent>/<acc>.annotations` JSON file.
///
/// # Errors
/// Returns `FerroError::Io` on any file creation or write failure.
fn save_protein_fasta(cache_dir: &Path, accession: &str, sequence: &str) -> Result<(), FerroError> {
    use std::io::Write;
    // File names must not contain path separators or shell-special characters.
    let safe_name = accession.replace(['/', '\\', ':', '*', '?', '"', '<', '>', '|'], "_");
    let path = cache_dir.join(format!("{}.fa", safe_name));
    let mut file = File::create(&path).map_err(|e| FerroError::Io {
        msg: format!("Failed to create protein cache file: {}", e),
    })?;
    writeln!(file, ">{}", accession).map_err(|e| FerroError::Io {
        msg: format!("Failed to write FASTA header: {}", e),
    })?;
    // Wrap the sequence at 60 columns, the conventional FASTA line width.
    // NOTE(review): byte chunking assumes an ASCII sequence (true for
    // amino-acid codes); a multi-byte char split across a chunk boundary
    // would be silently dropped by the unwrap_or("").
    for chunk in sequence.as_bytes().chunks(60) {
        let text = std::str::from_utf8(chunk).unwrap_or("");
        writeln!(file, "{}", text).map_err(|e| FerroError::Io {
            msg: format!("Failed to write sequence: {}", e),
        })?;
    }
    // Mirror the sequence into the parent-level cache used by lookups.
    if let Some(parent) = cache_dir.parent() {
        let seq_path = parent.join(format!("{}.sequence", accession));
        let ann_path = parent.join(format!("{}.annotations", accession));
        if !seq_path.exists() {
            std::fs::write(&seq_path, sequence).map_err(|e| FerroError::Io {
                msg: format!("Failed to write protein sequence cache: {}", e),
            })?;
            let annotations = build_protein_annotations(accession, sequence.len());
            std::fs::write(
                &ann_path,
                serde_json::to_string(&annotations).unwrap_or_default(),
            )
            .map_err(|e| FerroError::Io {
                msg: format!("Failed to write protein annotations: {}", e),
            })?;
        }
    }
    Ok(())
}