1use crate::genome_io::GenomeIO;
11use anyhow::{anyhow, Context, Result};
12use flate2::read::MultiGzDecoder;
13use ragc_common::Contig;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read};
17use std::path::{Path, PathBuf};
18
19#[cfg(feature = "indexed-fasta")]
20use faigz_rs::{FastaFormat, FastaIndex, FastaReader};
21
22pub trait ContigIterator {
24 fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>>;
27
28 fn reset(&mut self) -> Result<()>;
30}
31
32#[derive(Debug)]
34struct SampleOrderInfo {
35 sample_contigs: HashMap<String, Vec<String>>,
37 sample_order: Vec<String>,
39 is_contiguous: bool,
41}
42
43impl SampleOrderInfo {
44 fn analyze_from_index(index_path: &str) -> Result<Self> {
46 use std::io::BufRead;
47
48 let file = File::open(index_path)
49 .with_context(|| format!("Failed to open index: {index_path}"))?;
50 let reader = BufReader::new(file);
51
52 let mut sample_contigs: HashMap<String, Vec<String>> = HashMap::new();
53 let mut sample_order = Vec::new();
54 let mut last_sample = None;
55
56 for line in reader.lines() {
58 let line = line?;
59 let full_header = line
60 .split('\t')
61 .next()
62 .ok_or_else(|| anyhow!("Invalid FAI format"))?
63 .to_string();
64
65 let sample_name = if let Some(parts) = full_header.split('#').nth(0) {
67 if let Some(hap) = full_header.split('#').nth(1) {
68 format!("{parts}#{hap}")
69 } else {
70 full_header.clone()
71 }
72 } else {
73 full_header.clone()
74 };
75
76 sample_contigs
78 .entry(sample_name.clone())
79 .or_default()
80 .push(full_header);
81
82 if last_sample.as_ref() != Some(&sample_name) {
84 sample_order.push(sample_name.clone());
85 last_sample = Some(sample_name);
86 }
87 }
88
89 let unique_samples: std::collections::HashSet<_> = sample_order.iter().collect();
91 let is_contiguous = unique_samples.len() == sample_order.len();
92
93 let final_sample_order = if is_contiguous {
95 sample_order
96 } else {
97 let mut sorted: Vec<_> = sample_contigs.keys().cloned().collect();
98 sorted.sort();
99 sorted
100 };
101
102 Ok(SampleOrderInfo {
103 sample_contigs,
104 sample_order: final_sample_order,
105 is_contiguous,
106 })
107 }
108
109 fn analyze_file(file_path: &Path) -> Result<Self> {
112 let index_path = format!("{}.fai", file_path.display());
114 if std::path::Path::new(&index_path).exists() {
115 return Self::analyze_from_index(&index_path);
116 }
117
118 let file = File::open(file_path)
120 .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
121
122 let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
123 {
124 Box::new(MultiGzDecoder::new(BufReader::new(file)))
125 } else {
126 Box::new(BufReader::new(file))
127 };
128
129 let mut genome_io = GenomeIO::new(reader);
130 let mut sample_contigs: HashMap<String, Vec<String>> = HashMap::new();
131 let mut sample_order = Vec::new();
132 let mut last_sample = None;
133
134 while let Some((full_header, sample_name, _contig_name, _sequence)) =
136 genome_io.read_contig_with_sample()?
137 {
138 sample_contigs
140 .entry(sample_name.clone())
141 .or_default()
142 .push(full_header);
143
144 if last_sample.as_ref() != Some(&sample_name) {
146 sample_order.push(sample_name.clone());
147 last_sample = Some(sample_name);
148 }
149 }
150
151 let unique_samples: std::collections::HashSet<_> = sample_order.iter().collect();
153 let is_contiguous = unique_samples.len() == sample_order.len();
154
155 let final_sample_order = if is_contiguous {
157 sample_order
158 } else {
159 let mut sorted: Vec<_> = sample_contigs.keys().cloned().collect();
160 sorted.sort();
161 sorted
162 };
163
164 Ok(SampleOrderInfo {
165 sample_contigs,
166 sample_order: final_sample_order,
167 is_contiguous,
168 })
169 }
170}
171
172pub struct PansnFileIterator {
175 file_path: PathBuf,
176 reader: Option<GenomeIO<Box<dyn Read>>>,
177}
178
179impl PansnFileIterator {
180 pub fn new(file_path: &Path) -> Result<Self> {
189 let order_info = SampleOrderInfo::analyze_file(file_path)?;
191
192 if !order_info.is_contiguous {
193 let index_path = format!("{}.fai", file_path.display());
195 let has_index = std::path::Path::new(&index_path).exists();
196
197 #[cfg(feature = "indexed-fasta")]
198 {
199 if has_index {
200 return Err(anyhow!(
201 "Samples are not contiguously ordered in file: {}\n\
202 \n\
203 For C++ AGC compatibility, samples must appear in contiguous blocks.\n\
204 \n\
205 This file has an index, so you can use random access.\n\
206 Use IndexedPansnFileIterator::new() instead of PansnFileIterator::new()\n\
207 Or enable automatic detection in your code.",
208 file_path.display()
209 ));
210 }
211 }
212
213 return Err(anyhow!(
214 "Samples are not contiguously ordered in file: {}\n\
215 \n\
216 For C++ AGC compatibility, samples must appear in contiguous blocks.\n\
217 \n\
218 Your file has samples scattered throughout (e.g., all chr1, then all chr2).\n\
219 \n\
220 Solutions:\n\
221 1. Reorder file by sample:\n \
222 ragc sort-fasta {} -o sorted.fa.gz\n\
223 \n\
224 2. Use bgzip compression with index for random access:\n \
225 gunzip {}\n \
226 bgzip {}\n \
227 samtools faidx {}.gz\n \
228 # Then use IndexedPansnFileIterator\n\
229 \n\
230 3. Split into per-sample files and compress together\n\
231 \n\
232 Found {} samples in non-contiguous order.",
233 file_path.display(),
234 file_path.display(),
235 file_path.display(),
236 file_path
237 .file_stem()
238 .and_then(|s| s.to_str())
239 .unwrap_or("input"),
240 file_path
241 .file_stem()
242 .and_then(|s| s.to_str())
243 .unwrap_or("input"),
244 order_info.sample_contigs.len()
245 ));
246 }
247
248 let reader = Self::open_reader(file_path)?;
249 Ok(PansnFileIterator {
250 file_path: file_path.to_path_buf(),
251 reader: Some(reader),
252 })
253 }
254
255 fn open_reader(file_path: &Path) -> Result<GenomeIO<Box<dyn Read>>> {
256 let file = File::open(file_path)
257 .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
258
259 let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
260 {
261 Box::new(MultiGzDecoder::new(BufReader::new(file)))
262 } else {
263 Box::new(BufReader::new(file))
264 };
265
266 Ok(GenomeIO::new(reader))
267 }
268}
269
270impl ContigIterator for PansnFileIterator {
271 fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
272 let reader = match &mut self.reader {
273 Some(r) => r,
274 None => return Ok(None),
275 };
276
277 match reader.read_contig_with_sample()? {
278 Some((full_header, sample_name, _contig_name, sequence)) => {
279 Ok(Some((sample_name, full_header, sequence)))
284 }
285 None => Ok(None),
286 }
287 }
288
289 fn reset(&mut self) -> Result<()> {
290 self.reader = Some(Self::open_reader(&self.file_path)?);
291 Ok(())
292 }
293}
294
295pub struct BufferedPansnFileIterator {
298 sample_contigs: HashMap<String, Vec<(String, Contig)>>,
300 sample_order: Vec<String>,
301 current_sample_idx: usize,
302 current_contig_idx: usize,
303}
304
305impl BufferedPansnFileIterator {
306 pub fn new(file_path: &Path) -> Result<Self> {
308 eprintln!("Reading entire file into memory for reordering...");
309
310 let file = File::open(file_path)
312 .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
313 let reader: Box<dyn Read> = if file_path.to_string_lossy().ends_with(".gz") {
314 Box::new(BufReader::new(MultiGzDecoder::new(file)))
315 } else {
316 Box::new(BufReader::new(file))
317 };
318 let mut genome_io = GenomeIO::new(reader);
319
320 let mut sample_contigs: HashMap<String, Vec<(String, Contig)>> = HashMap::new();
322 let mut sample_order = Vec::new();
323 let mut seen_samples = std::collections::HashSet::new();
324
325 while let Some((header, contig)) = genome_io.read_contig_converted()? {
326 let sample_name = if let Some(parts) = header.split('#').nth(0) {
328 if let Some(hap) = header.split('#').nth(1) {
329 format!("{parts}#{hap}")
330 } else {
331 header.clone()
332 }
333 } else {
334 header.clone()
335 };
336
337 if !seen_samples.contains(&sample_name) {
339 sample_order.push(sample_name.clone());
340 seen_samples.insert(sample_name.clone());
341 }
342
343 sample_contigs
345 .entry(sample_name)
346 .or_default()
347 .push((header, contig));
348 }
349
350 eprintln!("Loaded {} samples into memory", sample_order.len());
351
352 Ok(BufferedPansnFileIterator {
353 sample_contigs,
354 sample_order,
355 current_sample_idx: 0,
356 current_contig_idx: 0,
357 })
358 }
359}
360
361impl ContigIterator for BufferedPansnFileIterator {
362 fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
363 if self.current_sample_idx >= self.sample_order.len() {
365 return Ok(None);
366 }
367
368 let sample_name = &self.sample_order[self.current_sample_idx];
369 let contigs = self
370 .sample_contigs
371 .get(sample_name)
372 .ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
373
374 if self.current_contig_idx >= contigs.len() {
376 self.current_sample_idx += 1;
378 self.current_contig_idx = 0;
379 return self.next_contig(); }
381
382 let (contig_name, contig) = &contigs[self.current_contig_idx];
384 self.current_contig_idx += 1;
385
386 Ok(Some((
387 sample_name.clone(),
388 contig_name.clone(),
389 contig.clone(),
390 )))
391 }
392
393 fn reset(&mut self) -> Result<()> {
394 self.current_sample_idx = 0;
395 self.current_contig_idx = 0;
396 Ok(())
397 }
398}
399
400#[cfg(feature = "indexed-fasta")]
403pub struct IndexedPansnFileIterator {
404 #[allow(dead_code)]
406 index: FastaIndex, reader: FastaReader,
408 order_info: SampleOrderInfo,
409 current_sample_idx: usize,
410 current_contig_idx: usize,
411}
412
413#[cfg(feature = "indexed-fasta")]
414impl IndexedPansnFileIterator {
415 pub fn new(file_path: &Path) -> Result<Self> {
418 let index_path = format!("{}.fai", file_path.display());
420 if !std::path::Path::new(&index_path).exists() {
421 return Err(anyhow!(
422 "Index file not found: {}\n\
423 To use indexed random access, create an index with:\n \
424 samtools faidx {}",
425 index_path,
426 file_path.display()
427 ));
428 }
429
430 let order_info = SampleOrderInfo::analyze_file(file_path)?;
432
433 let index = FastaIndex::new(
435 file_path.to_str().ok_or_else(|| anyhow!("Invalid path"))?,
436 FastaFormat::Fasta,
437 )
438 .with_context(|| format!("Failed to load FASTA index for {}", file_path.display()))?;
439
440 let reader = FastaReader::new(&index).with_context(|| "Failed to create FASTA reader")?;
442
443 Ok(IndexedPansnFileIterator {
444 index, reader,
446 order_info,
447 current_sample_idx: 0,
448 current_contig_idx: 0,
449 })
450 }
451}
452
453#[cfg(feature = "indexed-fasta")]
454impl ContigIterator for IndexedPansnFileIterator {
455 fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
456 loop {
458 if self.current_sample_idx >= self.order_info.sample_order.len() {
460 return Ok(None);
461 }
462
463 let sample_name = &self.order_info.sample_order[self.current_sample_idx];
464 let contigs = self
465 .order_info
466 .sample_contigs
467 .get(sample_name)
468 .ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
469
470 if self.current_contig_idx >= contigs.len() {
472 self.current_sample_idx += 1;
474 self.current_contig_idx = 0;
475 continue; }
477
478 let full_header = &contigs[self.current_contig_idx];
480 self.current_contig_idx += 1;
481
482 let sequence = self
484 .reader
485 .fetch_seq_all(full_header)
486 .with_context(|| format!("Failed to fetch sequence for {full_header}"))?;
487
488 use crate::genome_io::CNV_NUM;
490 let mut contig = Contig::with_capacity(sequence.len());
491 for byte in sequence.as_bytes() {
492 if (*byte as usize) < CNV_NUM.len() {
493 contig.push(CNV_NUM[*byte as usize]);
494 } else {
495 contig.push(4); }
497 }
498
499 return Ok(Some((sample_name.clone(), full_header.clone(), contig)));
500 }
501 }
502
503 fn reset(&mut self) -> Result<()> {
504 self.current_sample_idx = 0;
505 self.current_contig_idx = 0;
506 Ok(())
507 }
508}
509
510pub struct MultiFileIterator {
513 file_paths: Vec<PathBuf>,
514 current_file_idx: usize,
515 current_reader: Option<GenomeIO<Box<dyn Read>>>,
516 current_sample_name: String,
517}
518
519impl MultiFileIterator {
520 pub fn new(file_paths: Vec<PathBuf>) -> Result<Self> {
523 let mut iterator = MultiFileIterator {
524 file_paths,
525 current_file_idx: 0,
526 current_reader: None,
527 current_sample_name: String::new(),
528 };
529
530 if !iterator.file_paths.is_empty() {
532 iterator.advance_to_next_file()?;
533 }
534
535 Ok(iterator)
536 }
537
538 fn open_file(&self, file_path: &Path) -> Result<(GenomeIO<Box<dyn Read>>, String)> {
539 let file = File::open(file_path)
540 .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
541
542 let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
543 {
544 Box::new(MultiGzDecoder::new(BufReader::new(file)))
545 } else {
546 Box::new(BufReader::new(file))
547 };
548
549 let sample_name = file_path
551 .file_stem()
552 .and_then(|s| s.to_str())
553 .map(|s| {
554 s.trim_end_matches(".fa")
556 .trim_end_matches(".fasta")
557 .to_string()
558 })
559 .unwrap_or_else(|| "unknown".to_string());
560
561 Ok((GenomeIO::new(reader), sample_name))
562 }
563
564 fn advance_to_next_file(&mut self) -> Result<bool> {
565 if self.current_file_idx >= self.file_paths.len() {
566 self.current_reader = None;
567 return Ok(false);
568 }
569
570 let file_path = &self.file_paths[self.current_file_idx];
571 let (reader, sample_name) = self.open_file(file_path)?;
572
573 self.current_reader = Some(reader);
574 self.current_sample_name = sample_name;
575 self.current_file_idx += 1;
576
577 Ok(true)
578 }
579}
580
581impl ContigIterator for MultiFileIterator {
582 fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
583 loop {
584 let reader = match &mut self.current_reader {
585 Some(r) => r,
586 None => return Ok(None),
587 };
588
589 match reader.read_contig_with_sample()? {
590 Some((full_header, sample_from_header, _contig_name, sequence)) => {
591 let sample_name = if sample_from_header != "unknown" {
593 sample_from_header
594 } else {
595 self.current_sample_name.clone()
596 };
597 return Ok(Some((sample_name, full_header, sequence)));
599 }
600 None => {
601 if !self.advance_to_next_file()? {
603 return Ok(None);
604 }
605 }
607 }
608 }
609 }
610
611 fn reset(&mut self) -> Result<()> {
612 self.current_file_idx = 0;
613 self.current_reader = None;
614 self.current_sample_name.clear();
615
616 if !self.file_paths.is_empty() {
617 self.advance_to_next_file()?;
618 }
619
620 Ok(())
621 }
622}
623
624#[cfg(test)]
625mod tests {
626 use super::*;
627 use std::io::Write;
628 use tempfile::NamedTempFile;
629
630 #[test]
631 fn test_pansn_file_iterator() {
632 let mut temp_file = NamedTempFile::new().unwrap();
634 writeln!(temp_file, ">sample1#1#chr1").unwrap();
635 writeln!(temp_file, "ACGTACGT").unwrap();
636 writeln!(temp_file, ">sample1#1#chr2").unwrap();
637 writeln!(temp_file, "TGCATGCA").unwrap();
638 writeln!(temp_file, ">sample2#0#chr1").unwrap();
639 writeln!(temp_file, "AAAACCCC").unwrap();
640 temp_file.flush().unwrap();
641
642 let mut iterator = PansnFileIterator::new(temp_file.path()).unwrap();
643
644 let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
646 assert_eq!(sample, "sample1#1");
647 assert_eq!(contig, "sample1#1#chr1");
648
649 let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
651 assert_eq!(sample, "sample1#1");
652 assert_eq!(contig, "sample1#1#chr2");
653
654 let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
656 assert_eq!(sample, "sample2#0");
657 assert_eq!(contig, "sample2#0#chr1");
658
659 assert!(iterator.next_contig().unwrap().is_none());
661
662 iterator.reset().unwrap();
664 let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
665 assert_eq!(sample, "sample1#1");
666 assert_eq!(contig, "sample1#1#chr1");
667 }
668}