ragc_core/
contig_iterator.rs

1// Contig Iterator abstraction for unified input handling
2//
3// Provides a common interface for reading contigs from:
4// - Single pansn-format FASTA file (sample#hap#chr headers)
5//   - With sample ordering detection and indexed random access support
6// - Multiple per-sample FASTA files
7//
8// This allows the streaming compressor to handle both input formats identically.
9
10use crate::genome_io::GenomeIO;
11use anyhow::{anyhow, Context, Result};
12use flate2::read::MultiGzDecoder;
13use ragc_common::Contig;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read};
17use std::path::{Path, PathBuf};
18
19#[cfg(feature = "indexed-fasta")]
20use faigz_rs::{FastaFormat, FastaIndex, FastaReader};
21
22/// Trait for iterating over contigs from various input sources
23pub trait ContigIterator {
24    /// Get the next contig. Returns None when no more contigs.
25    /// Returns (sample_name, contig_name, sequence)
26    fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>>;
27
28    /// Reset the iterator to the beginning (for second pass)
29    fn reset(&mut self) -> Result<()>;
30}
31
32/// Information about sample ordering in a PanSN file
33#[derive(Debug)]
34struct SampleOrderInfo {
35    /// Map of sample -> list of (contig_name, file_byte_offset)
36    sample_contigs: HashMap<String, Vec<String>>,
37    /// Samples in the order they should be processed
38    sample_order: Vec<String>,
39    /// Whether samples appear in contiguous blocks (all of sample A, then all of sample B)
40    is_contiguous: bool,
41}
42
43impl SampleOrderInfo {
44    /// Analyze sample ordering from FAI index (fast - no decompression needed)
45    fn analyze_from_index(index_path: &str) -> Result<Self> {
46        use std::io::BufRead;
47
48        let file = File::open(index_path)
49            .with_context(|| format!("Failed to open index: {index_path}"))?;
50        let reader = BufReader::new(file);
51
52        let mut sample_contigs: HashMap<String, Vec<String>> = HashMap::new();
53        let mut sample_order = Vec::new();
54        let mut last_sample = None;
55
56        // Read headers from first column of FAI file
57        for line in reader.lines() {
58            let line = line?;
59            let full_header = line
60                .split('\t')
61                .next()
62                .ok_or_else(|| anyhow!("Invalid FAI format"))?
63                .to_string();
64
65            // Parse sample name (sample#hap format)
66            let sample_name = if let Some(parts) = full_header.split('#').nth(0) {
67                if let Some(hap) = full_header.split('#').nth(1) {
68                    format!("{parts}#{hap}")
69                } else {
70                    full_header.clone()
71                }
72            } else {
73                full_header.clone()
74            };
75
76            // Track which sample this contig belongs to
77            sample_contigs
78                .entry(sample_name.clone())
79                .or_default()
80                .push(full_header);
81
82            // Track when samples change (for contiguity check)
83            if last_sample.as_ref() != Some(&sample_name) {
84                sample_order.push(sample_name.clone());
85                last_sample = Some(sample_name);
86            }
87        }
88
89        // Check if samples are contiguous
90        let unique_samples: std::collections::HashSet<_> = sample_order.iter().collect();
91        let is_contiguous = unique_samples.len() == sample_order.len();
92
93        // If not contiguous, use sorted order
94        let final_sample_order = if is_contiguous {
95            sample_order
96        } else {
97            let mut sorted: Vec<_> = sample_contigs.keys().cloned().collect();
98            sorted.sort();
99            sorted
100        };
101
102        Ok(SampleOrderInfo {
103            sample_contigs,
104            sample_order: final_sample_order,
105            is_contiguous,
106        })
107    }
108
109    /// Scan a PanSN file to determine sample ordering
110    /// This does a fast header-only pass through the file
111    fn analyze_file(file_path: &Path) -> Result<Self> {
112        // Check if FAI index exists - if so, use it for instant header reading
113        let index_path = format!("{}.fai", file_path.display());
114        if std::path::Path::new(&index_path).exists() {
115            return Self::analyze_from_index(&index_path);
116        }
117
118        // Otherwise fall back to decompressing (slow)
119        let file = File::open(file_path)
120            .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
121
122        let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
123        {
124            Box::new(MultiGzDecoder::new(BufReader::new(file)))
125        } else {
126            Box::new(BufReader::new(file))
127        };
128
129        let mut genome_io = GenomeIO::new(reader);
130        let mut sample_contigs: HashMap<String, Vec<String>> = HashMap::new();
131        let mut sample_order = Vec::new();
132        let mut last_sample = None;
133
134        // Scan headers
135        while let Some((full_header, sample_name, _contig_name, _sequence)) =
136            genome_io.read_contig_with_sample()?
137        {
138            // Track which sample this contig belongs to
139            sample_contigs
140                .entry(sample_name.clone())
141                .or_default()
142                .push(full_header);
143
144            // Track when samples change (for contiguity check)
145            if last_sample.as_ref() != Some(&sample_name) {
146                sample_order.push(sample_name.clone());
147                last_sample = Some(sample_name);
148            }
149        }
150
151        // Check if samples are contiguous (each sample appears only once in order)
152        let unique_samples: std::collections::HashSet<_> = sample_order.iter().collect();
153        let is_contiguous = unique_samples.len() == sample_order.len();
154
155        // If not contiguous, use sorted order for processing
156        let final_sample_order = if is_contiguous {
157            sample_order
158        } else {
159            let mut sorted: Vec<_> = sample_contigs.keys().cloned().collect();
160            sorted.sort();
161            sorted
162        };
163
164        Ok(SampleOrderInfo {
165            sample_contigs,
166            sample_order: final_sample_order,
167            is_contiguous,
168        })
169    }
170}
171
172/// Iterator for a single pansn-format FASTA file
173/// Parses sample names from headers like: >sample#hap#chromosome
174pub struct PansnFileIterator {
175    file_path: PathBuf,
176    reader: Option<GenomeIO<Box<dyn Read>>>,
177}
178
179impl PansnFileIterator {
180    /// Create a new iterator for a single pansn-format FASTA file
181    ///
182    /// This iterator requires samples to appear in contiguous blocks for C++ AGC compatibility.
183    /// If your file has samples in non-contiguous order (e.g., all chr1, then all chr2),
184    /// use one of these solutions:
185    /// 1. Use IndexedPansnFileIterator with a bgzip+indexed file
186    /// 2. Reorder the file by sample using `ragc sort-fasta`
187    /// 3. Split into per-sample files
188    pub fn new(file_path: &Path) -> Result<Self> {
189        // Check if samples are contiguously ordered
190        let order_info = SampleOrderInfo::analyze_file(file_path)?;
191
192        if !order_info.is_contiguous {
193            // Check if an index exists
194            let index_path = format!("{}.fai", file_path.display());
195            let has_index = std::path::Path::new(&index_path).exists();
196
197            #[cfg(feature = "indexed-fasta")]
198            {
199                if has_index {
200                    return Err(anyhow!(
201                        "Samples are not contiguously ordered in file: {}\n\
202                        \n\
203                        For C++ AGC compatibility, samples must appear in contiguous blocks.\n\
204                        \n\
205                        This file has an index, so you can use random access.\n\
206                        Use IndexedPansnFileIterator::new() instead of PansnFileIterator::new()\n\
207                        Or enable automatic detection in your code.",
208                        file_path.display()
209                    ));
210                }
211            }
212
213            return Err(anyhow!(
214                "Samples are not contiguously ordered in file: {}\n\
215                \n\
216                For C++ AGC compatibility, samples must appear in contiguous blocks.\n\
217                \n\
218                Your file has samples scattered throughout (e.g., all chr1, then all chr2).\n\
219                \n\
220                Solutions:\n\
221                1. Reorder file by sample:\n   \
222                   ragc sort-fasta {} -o sorted.fa.gz\n\
223                \n\
224                2. Use bgzip compression with index for random access:\n   \
225                   gunzip {}\n   \
226                   bgzip {}\n   \
227                   samtools faidx {}.gz\n   \
228                   # Then use IndexedPansnFileIterator\n\
229                \n\
230                3. Split into per-sample files and compress together\n\
231                \n\
232                Found {} samples in non-contiguous order.",
233                file_path.display(),
234                file_path.display(),
235                file_path.display(),
236                file_path
237                    .file_stem()
238                    .and_then(|s| s.to_str())
239                    .unwrap_or("input"),
240                file_path
241                    .file_stem()
242                    .and_then(|s| s.to_str())
243                    .unwrap_or("input"),
244                order_info.sample_contigs.len()
245            ));
246        }
247
248        let reader = Self::open_reader(file_path)?;
249        Ok(PansnFileIterator {
250            file_path: file_path.to_path_buf(),
251            reader: Some(reader),
252        })
253    }
254
255    fn open_reader(file_path: &Path) -> Result<GenomeIO<Box<dyn Read>>> {
256        let file = File::open(file_path)
257            .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
258
259        let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
260        {
261            Box::new(MultiGzDecoder::new(BufReader::new(file)))
262        } else {
263            Box::new(BufReader::new(file))
264        };
265
266        Ok(GenomeIO::new(reader))
267    }
268}
269
270impl ContigIterator for PansnFileIterator {
271    fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
272        let reader = match &mut self.reader {
273            Some(r) => r,
274            None => return Ok(None),
275        };
276
277        match reader.read_contig_with_sample()? {
278            Some((full_header, sample_name, _contig_name, sequence)) => {
279                // IMPORTANT: Use full_header as contig name to preserve complete PanSN format
280                // Sample name comes from parsing (e.g., "AAA#0" from "AAA#0#chrI")
281                // Contig name is the full header (e.g., "AAA#0#chrI")
282                // This groups contigs by sample while preserving full headers
283                Ok(Some((sample_name, full_header, sequence)))
284            }
285            None => Ok(None),
286        }
287    }
288
289    fn reset(&mut self) -> Result<()> {
290        self.reader = Some(Self::open_reader(&self.file_path)?);
291        Ok(())
292    }
293}
294
295/// Iterator that buffers entire file in memory and outputs in sample-grouped order
296/// Much faster than random access for files with non-contiguous sample ordering
297pub struct BufferedPansnFileIterator {
298    // Map from sample name to vector of (contig_name, sequence)
299    sample_contigs: HashMap<String, Vec<(String, Contig)>>,
300    sample_order: Vec<String>,
301    current_sample_idx: usize,
302    current_contig_idx: usize,
303}
304
305impl BufferedPansnFileIterator {
306    /// Create a new buffered iterator that reads entire file into memory
307    pub fn new(file_path: &Path) -> Result<Self> {
308        eprintln!("Reading entire file into memory for reordering...");
309
310        // Open reader
311        let file = File::open(file_path)
312            .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
313        let reader: Box<dyn Read> = if file_path.to_string_lossy().ends_with(".gz") {
314            Box::new(BufReader::new(MultiGzDecoder::new(file)))
315        } else {
316            Box::new(BufReader::new(file))
317        };
318        let mut genome_io = GenomeIO::new(reader);
319
320        // Read all contigs and group by sample
321        let mut sample_contigs: HashMap<String, Vec<(String, Contig)>> = HashMap::new();
322        let mut sample_order = Vec::new();
323        let mut seen_samples = std::collections::HashSet::new();
324
325        while let Some((header, contig)) = genome_io.read_contig_converted()? {
326            // Parse sample name from header (sample#hap#chr format)
327            let sample_name = if let Some(parts) = header.split('#').nth(0) {
328                if let Some(hap) = header.split('#').nth(1) {
329                    format!("{parts}#{hap}")
330                } else {
331                    header.clone()
332                }
333            } else {
334                header.clone()
335            };
336
337            // Track sample order (first occurrence)
338            if !seen_samples.contains(&sample_name) {
339                sample_order.push(sample_name.clone());
340                seen_samples.insert(sample_name.clone());
341            }
342
343            // Store contig
344            sample_contigs
345                .entry(sample_name)
346                .or_default()
347                .push((header, contig));
348        }
349
350        eprintln!("Loaded {} samples into memory", sample_order.len());
351
352        Ok(BufferedPansnFileIterator {
353            sample_contigs,
354            sample_order,
355            current_sample_idx: 0,
356            current_contig_idx: 0,
357        })
358    }
359}
360
361impl ContigIterator for BufferedPansnFileIterator {
362    fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
363        // Check if we've exhausted all samples
364        if self.current_sample_idx >= self.sample_order.len() {
365            return Ok(None);
366        }
367
368        let sample_name = &self.sample_order[self.current_sample_idx];
369        let contigs = self
370            .sample_contigs
371            .get(sample_name)
372            .ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
373
374        // Check if we've exhausted contigs for this sample
375        if self.current_contig_idx >= contigs.len() {
376            // Move to next sample
377            self.current_sample_idx += 1;
378            self.current_contig_idx = 0;
379            return self.next_contig(); // Recursively get first contig of next sample
380        }
381
382        // Get the contig
383        let (contig_name, contig) = &contigs[self.current_contig_idx];
384        self.current_contig_idx += 1;
385
386        Ok(Some((
387            sample_name.clone(),
388            contig_name.clone(),
389            contig.clone(),
390        )))
391    }
392
393    fn reset(&mut self) -> Result<()> {
394        self.current_sample_idx = 0;
395        self.current_contig_idx = 0;
396        Ok(())
397    }
398}
399
400/// Iterator for indexed PanSN FASTA files with random access
401/// Uses faigz-rs to read contigs in sample-grouped order even if file is out-of-order
402#[cfg(feature = "indexed-fasta")]
403pub struct IndexedPansnFileIterator {
404    // Field is intentionally kept to prevent dangling pointer in reader
405    #[allow(dead_code)]
406    index: FastaIndex, // Must keep index alive for reader
407    reader: FastaReader,
408    order_info: SampleOrderInfo,
409    current_sample_idx: usize,
410    current_contig_idx: usize,
411}
412
413#[cfg(feature = "indexed-fasta")]
414impl IndexedPansnFileIterator {
415    /// Create a new indexed iterator for a PanSN FASTA file
416    /// Requires the file to be bgzip-compressed with a .fai index
417    pub fn new(file_path: &Path) -> Result<Self> {
418        // Check for index file
419        let index_path = format!("{}.fai", file_path.display());
420        if !std::path::Path::new(&index_path).exists() {
421            return Err(anyhow!(
422                "Index file not found: {}\n\
423                To use indexed random access, create an index with:\n  \
424                samtools faidx {}",
425                index_path,
426                file_path.display()
427            ));
428        }
429
430        // Analyze sample ordering FIRST (reads .fai file header-only, fast)
431        let order_info = SampleOrderInfo::analyze_file(file_path)?;
432
433        // Load the index
434        let index = FastaIndex::new(
435            file_path.to_str().ok_or_else(|| anyhow!("Invalid path"))?,
436            FastaFormat::Fasta,
437        )
438        .with_context(|| format!("Failed to load FASTA index for {}", file_path.display()))?;
439
440        // Create the reader once and reuse it for all fetches
441        let reader = FastaReader::new(&index).with_context(|| "Failed to create FASTA reader")?;
442
443        Ok(IndexedPansnFileIterator {
444            index, // Store index to keep it alive
445            reader,
446            order_info,
447            current_sample_idx: 0,
448            current_contig_idx: 0,
449        })
450    }
451}
452
453#[cfg(feature = "indexed-fasta")]
454impl ContigIterator for IndexedPansnFileIterator {
455    fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
456        // Loop to find next sample with contigs (handles empty samples)
457        loop {
458            // Check if we've exhausted all samples
459            if self.current_sample_idx >= self.order_info.sample_order.len() {
460                return Ok(None);
461            }
462
463            let sample_name = &self.order_info.sample_order[self.current_sample_idx];
464            let contigs = self
465                .order_info
466                .sample_contigs
467                .get(sample_name)
468                .ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
469
470            // Check if we've exhausted contigs for this sample
471            if self.current_contig_idx >= contigs.len() {
472                // Move to next sample
473                self.current_sample_idx += 1;
474                self.current_contig_idx = 0;
475                continue; // Loop to next sample instead of recursion
476            }
477
478            // Fetch the contig using random access
479            let full_header = &contigs[self.current_contig_idx];
480            self.current_contig_idx += 1;
481
482            // Use faigz-rs to fetch the sequence (reusing the reader)
483            let sequence = self
484                .reader
485                .fetch_seq_all(full_header)
486                .with_context(|| format!("Failed to fetch sequence for {full_header}"))?;
487
488            // Convert ASCII nucleotides to 0-3 encoding
489            use crate::genome_io::CNV_NUM;
490            let mut contig = Contig::with_capacity(sequence.len());
491            for byte in sequence.as_bytes() {
492                if (*byte as usize) < CNV_NUM.len() {
493                    contig.push(CNV_NUM[*byte as usize]);
494                } else {
495                    contig.push(4); // N for invalid characters
496                }
497            }
498
499            return Ok(Some((sample_name.clone(), full_header.clone(), contig)));
500        }
501    }
502
503    fn reset(&mut self) -> Result<()> {
504        self.current_sample_idx = 0;
505        self.current_contig_idx = 0;
506        Ok(())
507    }
508}
509
510/// Iterator for multiple FASTA files (one per sample)
511/// Each file contains all contigs for that sample
512pub struct MultiFileIterator {
513    file_paths: Vec<PathBuf>,
514    current_file_idx: usize,
515    current_reader: Option<GenomeIO<Box<dyn Read>>>,
516    current_sample_name: String,
517}
518
519impl MultiFileIterator {
520    /// Create a new iterator for multiple FASTA files
521    /// Each file is treated as a separate sample
522    pub fn new(file_paths: Vec<PathBuf>) -> Result<Self> {
523        let mut iterator = MultiFileIterator {
524            file_paths,
525            current_file_idx: 0,
526            current_reader: None,
527            current_sample_name: String::new(),
528        };
529
530        // Open first file
531        if !iterator.file_paths.is_empty() {
532            iterator.advance_to_next_file()?;
533        }
534
535        Ok(iterator)
536    }
537
538    fn open_file(&self, file_path: &Path) -> Result<(GenomeIO<Box<dyn Read>>, String)> {
539        let file = File::open(file_path)
540            .with_context(|| format!("Failed to open file: {}", file_path.display()))?;
541
542        let reader: Box<dyn Read> = if file_path.extension().and_then(|s| s.to_str()) == Some("gz")
543        {
544            Box::new(MultiGzDecoder::new(BufReader::new(file)))
545        } else {
546            Box::new(BufReader::new(file))
547        };
548
549        // Extract sample name from filename
550        let sample_name = file_path
551            .file_stem()
552            .and_then(|s| s.to_str())
553            .map(|s| {
554                // Remove .fa or .fasta extensions if present
555                s.trim_end_matches(".fa")
556                    .trim_end_matches(".fasta")
557                    .to_string()
558            })
559            .unwrap_or_else(|| "unknown".to_string());
560
561        Ok((GenomeIO::new(reader), sample_name))
562    }
563
564    fn advance_to_next_file(&mut self) -> Result<bool> {
565        if self.current_file_idx >= self.file_paths.len() {
566            self.current_reader = None;
567            return Ok(false);
568        }
569
570        let file_path = &self.file_paths[self.current_file_idx];
571        let (reader, sample_name) = self.open_file(file_path)?;
572
573        self.current_reader = Some(reader);
574        self.current_sample_name = sample_name;
575        self.current_file_idx += 1;
576
577        Ok(true)
578    }
579}
580
581impl ContigIterator for MultiFileIterator {
582    fn next_contig(&mut self) -> Result<Option<(String, String, Contig)>> {
583        loop {
584            let reader = match &mut self.current_reader {
585                Some(r) => r,
586                None => return Ok(None),
587            };
588
589            match reader.read_contig_with_sample()? {
590                Some((full_header, sample_from_header, _contig_name, sequence)) => {
591                    // Use sample name from header (PanSN format) if available, else fallback to filename
592                    let sample_name = if sample_from_header != "unknown" {
593                        sample_from_header
594                    } else {
595                        self.current_sample_name.clone()
596                    };
597                    // IMPORTANT: Use full_header as contig name to match C++ AGC expectations
598                    return Ok(Some((sample_name, full_header, sequence)));
599                }
600                None => {
601                    // Current file exhausted, move to next file
602                    if !self.advance_to_next_file()? {
603                        return Ok(None);
604                    }
605                    // Continue loop to read from new file
606                }
607            }
608        }
609    }
610
611    fn reset(&mut self) -> Result<()> {
612        self.current_file_idx = 0;
613        self.current_reader = None;
614        self.current_sample_name.clear();
615
616        if !self.file_paths.is_empty() {
617            self.advance_to_next_file()?;
618        }
619
620        Ok(())
621    }
622}
623
624#[cfg(test)]
625mod tests {
626    use super::*;
627    use std::io::Write;
628    use tempfile::NamedTempFile;
629
630    #[test]
631    fn test_pansn_file_iterator() {
632        // Create a temporary pansn-format FASTA file
633        let mut temp_file = NamedTempFile::new().unwrap();
634        writeln!(temp_file, ">sample1#1#chr1").unwrap();
635        writeln!(temp_file, "ACGTACGT").unwrap();
636        writeln!(temp_file, ">sample1#1#chr2").unwrap();
637        writeln!(temp_file, "TGCATGCA").unwrap();
638        writeln!(temp_file, ">sample2#0#chr1").unwrap();
639        writeln!(temp_file, "AAAACCCC").unwrap();
640        temp_file.flush().unwrap();
641
642        let mut iterator = PansnFileIterator::new(temp_file.path()).unwrap();
643
644        // First contig (contig name is now the full header for C++ AGC compatibility)
645        let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
646        assert_eq!(sample, "sample1#1");
647        assert_eq!(contig, "sample1#1#chr1");
648
649        // Second contig
650        let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
651        assert_eq!(sample, "sample1#1");
652        assert_eq!(contig, "sample1#1#chr2");
653
654        // Third contig
655        let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
656        assert_eq!(sample, "sample2#0");
657        assert_eq!(contig, "sample2#0#chr1");
658
659        // No more contigs
660        assert!(iterator.next_contig().unwrap().is_none());
661
662        // Test reset
663        iterator.reset().unwrap();
664        let (sample, contig, _seq) = iterator.next_contig().unwrap().unwrap();
665        assert_eq!(sample, "sample1#1");
666        assert_eq!(contig, "sample1#1#chr1");
667    }
668}