use crate::genome_io::GenomeIO;
use anyhow::{anyhow, Context, Result};
use flate2::read::MultiGzDecoder;
use ragc_common::Contig;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
#[cfg(feature = "indexed-fasta")]
use faigz_rs::{FastaFormat, FastaIndex, FastaReader};
pub trait ContigIterator {
fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>>;
fn reset(&mut self) -> Result<()>;
}
#[derive(Debug)]
struct SampleOrderInfo {
sample_contigs: HashMap<String, Vec<String>>,
sample_order: Vec<String>,
is_contiguous: bool,
}
impl SampleOrderInfo {
fn analyze_from_index(index_path: &str) -> Result<Self> {
use std::io::BufRead;
let file = File::open(index_path)
.with_context(|| format!("Failed to open index: {index_path}"))?;
let reader = BufReader::new(file);
let mut sample_contigs: HashMap<String, Vec<String>> = HashMap::new();
let mut sample_order = Vec::new();
let mut last_sample = None;
for line in reader.lines() {
let line = line?;
let full_header = line
.split('\t')
.next()
.ok_or_else(|| anyhow!("Invalid FAI format"))?
.to_string();
let sample_name = if let Some(parts) = full_header.split('#').nth(0) {
if let Some(hap) = full_header.split('#').nth(1) {
format!("{parts}#{hap}")
} else {
full_header.clone()
}
} else {
full_header.clone()
};
sample_contigs
.entry(sample_name.clone())
.or_default()
.push(full_header);
if last_sample.as_ref() != Some(&sample_name) {
sample_order.push(sample_name.clone());
last_sample = Some(sample_name);
}
}
let unique_samples: std::collections::HashSet<_> = sample_order.iter().collect();
let is_contiguous = unique_samples.len() == sample_order.len();
let final_sample_order = if is_contiguous {
sample_order
} else {
let mut sorted: Vec<_> = sample_contigs.keys().cloned().collect();
sorted.sort();
sorted
};
Ok(SampleOrderInfo {
sample_contigs,
sample_order: final_sample_order,
is_contiguous,
})
}
fn analyze_file(file_path: &Path) -> Result<Self> {
let index_path = format!("{}.fai", file_path.display());
if std::path::Path::new(&index_path).exists() {
return Self::analyze_from_index(&index_path);
}
let file = File::open(file_path)
.with_context(|| format!("Failed to open file: {}", file_path.display()))?;
let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
{
Box::new(MultiGzDecoder::new(BufReader::new(file)))
} else {
Box::new(BufReader::new(file))
};
let mut genome_io = GenomeIO::new(reader);
let mut sample_contigs: HashMap<String, Vec<String>> = HashMap::new();
let mut sample_order = Vec::new();
let mut last_sample = None;
while let Some((full_header, sample_name, _contig_name, _sequence)) =
genome_io.read_contig_with_sample()?
{
sample_contigs
.entry(sample_name.clone())
.or_default()
.push(full_header);
if last_sample.as_ref() != Some(&sample_name) {
sample_order.push(sample_name.clone());
last_sample = Some(sample_name);
}
}
let unique_samples: std::collections::HashSet<_> = sample_order.iter().collect();
let is_contiguous = unique_samples.len() == sample_order.len();
let final_sample_order = if is_contiguous {
sample_order
} else {
let mut sorted: Vec<_> = sample_contigs.keys().cloned().collect();
sorted.sort();
sorted
};
Ok(SampleOrderInfo {
sample_contigs,
sample_order: final_sample_order,
is_contiguous,
})
}
}
pub struct PansnFileIterator {
file_path: PathBuf,
reader: Option<GenomeIO<Box<dyn Read>>>,
}
impl PansnFileIterator {
pub fn new(file_path: &Path) -> Result<Self> {
let order_info = SampleOrderInfo::analyze_file(file_path)?;
if !order_info.is_contiguous {
let index_path = format!("{}.fai", file_path.display());
let has_index = std::path::Path::new(&index_path).exists();
#[cfg(feature = "indexed-fasta")]
{
if has_index {
return Err(anyhow!(
"Samples are not contiguously ordered in file: {}\n\
\n\
For C++ AGC compatibility, samples must appear in contiguous blocks.\n\
\n\
This file has an index, so you can use random access.\n\
Use IndexedPansnFileIterator::new() instead of PansnFileIterator::new()\n\
Or enable automatic detection in your code.",
file_path.display()
));
}
}
return Err(anyhow!(
"Samples are not contiguously ordered in file: {}\n\
\n\
For C++ AGC compatibility, samples must appear in contiguous blocks.\n\
\n\
Your file has samples scattered throughout (e.g., all chr1, then all chr2).\n\
\n\
Solutions:\n\
1. Reorder file by sample:\n \
ragc sort-fasta {} -o sorted.fa.gz\n\
\n\
2. Use bgzip compression with index for random access:\n \
gunzip {}\n \
bgzip {}\n \
samtools faidx {}.gz\n \
# Then use IndexedPansnFileIterator\n\
\n\
3. Split into per-sample files and compress together\n\
\n\
Found {} samples in non-contiguous order.",
file_path.display(),
file_path.display(),
file_path.display(),
file_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("input"),
file_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("input"),
order_info.sample_contigs.len()
));
}
let reader = Self::open_reader(file_path)?;
Ok(PansnFileIterator {
file_path: file_path.to_path_buf(),
reader: Some(reader),
})
}
fn open_reader(file_path: &Path) -> Result<GenomeIO<Box<dyn Read>>> {
let file = File::open(file_path)
.with_context(|| format!("Failed to open file: {}", file_path.display()))?;
let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
{
Box::new(MultiGzDecoder::new(BufReader::new(file)))
} else {
Box::new(BufReader::new(file))
};
Ok(GenomeIO::new(reader))
}
}
impl ContigIterator for PansnFileIterator {
fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
let reader = match &mut self.reader {
Some(r) => r,
None => return Ok(None),
};
match reader.read_contig_with_sample()? {
Some((full_header, sample_name, _contig_name, sequence)) => {
Ok(Some((sample_name, full_header, sequence)))
}
None => Ok(None),
}
}
fn reset(&mut self) -> Result<()> {
self.reader = Some(Self::open_reader(&self.file_path)?);
Ok(())
}
}
pub struct BufferedPansnFileIterator {
sample_contigs: HashMap<String, Vec<(String, Contig)>>,
sample_order: Vec<String>,
current_sample_idx: usize,
current_contig_idx: usize,
}
impl BufferedPansnFileIterator {
pub fn new(file_path: &Path) -> Result<Self> {
eprintln!("Reading entire file into memory for reordering...");
let file = File::open(file_path)
.with_context(|| format!("Failed to open file: {}", file_path.display()))?;
let reader: Box<dyn Read> = if file_path.to_string_lossy().ends_with(".gz") {
Box::new(BufReader::new(MultiGzDecoder::new(file)))
} else {
Box::new(BufReader::new(file))
};
let mut genome_io = GenomeIO::new(reader);
let mut sample_contigs: HashMap<String, Vec<(String, Contig)>> = HashMap::new();
let mut sample_order = Vec::new();
let mut seen_samples = std::collections::HashSet::new();
while let Some((header, contig)) = genome_io.read_contig_converted()? {
let sample_name = if let Some(parts) = header.split('#').nth(0) {
if let Some(hap) = header.split('#').nth(1) {
format!("{parts}#{hap}")
} else {
header.clone()
}
} else {
header.clone()
};
if !seen_samples.contains(&sample_name) {
sample_order.push(sample_name.clone());
seen_samples.insert(sample_name.clone());
}
sample_contigs
.entry(sample_name)
.or_default()
.push((header, contig));
}
eprintln!("Loaded {} samples into memory", sample_order.len());
Ok(BufferedPansnFileIterator {
sample_contigs,
sample_order,
current_sample_idx: 0,
current_contig_idx: 0,
})
}
}
impl ContigIterator for BufferedPansnFileIterator {
fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
if self.current_sample_idx >= self.sample_order.len() {
return Ok(None);
}
let sample_name = &self.sample_order[self.current_sample_idx];
let contigs = self
.sample_contigs
.get(sample_name)
.ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
if self.current_contig_idx >= contigs.len() {
self.current_sample_idx += 1;
self.current_contig_idx = 0;
return self.next_contig(); }
let (contig_name, contig) = &contigs[self.current_contig_idx];
self.current_contig_idx += 1;
Ok(Some((
sample_name.clone(),
contig_name.clone(),
contig.clone(),
)))
}
fn reset(&mut self) -> Result<()> {
self.current_sample_idx = 0;
self.current_contig_idx = 0;
Ok(())
}
}
#[cfg(feature = "indexed-fasta")]
pub struct IndexedPansnFileIterator {
#[allow(dead_code)]
index: FastaIndex, reader: FastaReader,
order_info: SampleOrderInfo,
current_sample_idx: usize,
current_contig_idx: usize,
}
#[cfg(feature = "indexed-fasta")]
impl IndexedPansnFileIterator {
pub fn new(file_path: &Path) -> Result<Self> {
let index_path = format!("{}.fai", file_path.display());
if !std::path::Path::new(&index_path).exists() {
return Err(anyhow!(
"Index file not found: {}\n\
To use indexed random access, create an index with:\n \
samtools faidx {}",
index_path,
file_path.display()
));
}
let order_info = SampleOrderInfo::analyze_file(file_path)?;
let index = FastaIndex::new(
file_path.to_str().ok_or_else(|| anyhow!("Invalid path"))?,
FastaFormat::Fasta,
)
.with_context(|| format!("Failed to load FASTA index for {}", file_path.display()))?;
let reader = FastaReader::new(&index).with_context(|| "Failed to create FASTA reader")?;
Ok(IndexedPansnFileIterator {
index, reader,
order_info,
current_sample_idx: 0,
current_contig_idx: 0,
})
}
}
#[cfg(feature = "indexed-fasta")]
impl ContigIterator for IndexedPansnFileIterator {
fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
loop {
if self.current_sample_idx >= self.order_info.sample_order.len() {
return Ok(None);
}
let sample_name = &self.order_info.sample_order[self.current_sample_idx];
let contigs = self
.order_info
.sample_contigs
.get(sample_name)
.ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
if self.current_contig_idx >= contigs.len() {
self.current_sample_idx += 1;
self.current_contig_idx = 0;
continue; }
let full_header = &contigs[self.current_contig_idx];
self.current_contig_idx += 1;
let sequence = self
.reader
.fetch_seq_all(full_header)
.with_context(|| format!("Failed to fetch sequence for {full_header}"))?;
use crate::genome_io::CNV_NUM;
let mut contig = Contig::with_capacity(sequence.len());
for byte in sequence.as_bytes() {
if (*byte as usize) < CNV_NUM.len() {
contig.push(CNV_NUM[*byte as usize]);
} else {
contig.push(4); }
}
return Ok(Some((sample_name.clone(), full_header.clone(), contig)));
}
}
fn reset(&mut self) -> Result<()> {
self.current_sample_idx = 0;
self.current_contig_idx = 0;
Ok(())
}
}
pub struct MultiFileIterator {
file_paths: Vec<PathBuf>,
current_file_idx: usize,
current_reader: Option<GenomeIO<Box<dyn Read>>>,
current_sample_name: String,
}
impl MultiFileIterator {
pub fn new(file_paths: Vec<PathBuf>) -> Result<Self> {
let mut iterator = MultiFileIterator {
file_paths,
current_file_idx: 0,
current_reader: None,
current_sample_name: String::new(),
};
if !iterator.file_paths.is_empty() {
iterator.advance_to_next_file()?;
}
Ok(iterator)
}
fn open_file(&self, file_path: &Path) -> Result<(GenomeIO<Box<dyn Read>>, String)> {
let file = File::open(file_path)
.with_context(|| format!("Failed to open file: {}", file_path.display()))?;
let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
{
Box::new(MultiGzDecoder::new(BufReader::new(file)))
} else {
Box::new(BufReader::new(file))
};
let sample_name = file_path
.file_stem()
.and_then(|s| s.to_str())
.map(|s| {
s.trim_end_matches(".fa")
.trim_end_matches(".fasta")
.to_string()
})
.unwrap_or_else(|| "unknown".to_string());
Ok((GenomeIO::new(reader), sample_name))
}
fn advance_to_next_file(&mut self) -> Result<bool> {
if self.current_file_idx >= self.file_paths.len() {
self.current_reader = None;
return Ok(false);
}
let file_path = &self.file_paths[self.current_file_idx];
let (reader, sample_name) = self.open_file(file_path)?;
self.current_reader = Some(reader);
self.current_sample_name = sample_name;
self.current_file_idx += 1;
Ok(true)
}
}
impl ContigIterator for MultiFileIterator {
fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
loop {
let reader = match &mut self.current_reader {
Some(r) => r,
None => return Ok(None),
};
match reader.read_contig_with_sample()? {
Some((full_header, sample_from_header, _contig_name, sequence)) => {
let sample_name = if sample_from_header != "unknown" {
sample_from_header
} else {
self.current_sample_name.clone()
};
return Ok(Some((sample_name, full_header, sequence)));
}
None => {
if !self.advance_to_next_file()? {
return Ok(None);
}
}
}
}
}
fn reset(&mut self) -> Result<()> {
self.current_file_idx = 0;
self.current_reader = None;
self.current_sample_name.clear();
if !self.file_paths.is_empty() {
self.advance_to_next_file()?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_pansn_file_iterator() {
let mut temp_file = NamedTempFile::new().unwrap();
writeln!(temp_file, ">sample1#1#chr1").unwrap();
writeln!(temp_file, "ACGTACGT").unwrap();
writeln!(temp_file, ">sample1#1#chr2").unwrap();
writeln!(temp_file, "TGCATGCA").unwrap();
writeln!(temp_file, ">sample2#0#chr1").unwrap();
writeln!(temp_file, "AAAACCCC").unwrap();
temp_file.flush().unwrap();
let mut iterator = PansnFileIterator::new(temp_file.path()).unwrap();
let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
assert_eq!(sample, "sample1#1");
assert_eq!(contig, "sample1#1#chr1");
let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
assert_eq!(sample, "sample1#1");
assert_eq!(contig, "sample1#1#chr2");
let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
assert_eq!(sample, "sample2#0");
assert_eq!(contig, "sample2#0#chr1");
assert!(iterator.next_contig().unwrap().is_none());
iterator.reset().unwrap();
let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
assert_eq!(sample, "sample1#1");
assert_eq!(contig, "sample1#1#chr1");
}
}