Skip to main content

fgumi_lib/
reference.rs

1//! Reference genome FASTA reading with all sequences loaded into memory.
2//!
3//! This module provides thread-safe access to reference genome sequences, which is needed
4//! for tasks like NM/UQ/MD tag calculation and variant calling.
5//!
6//! Following fgbio's approach, the entire reference is loaded into memory at startup
7//! to ensure O(1) lookup performance for each read during tag regeneration.
8//!
9//! Uses FAI index for fast raw-byte reading (htsjdk-style) instead of line-by-line parsing.
10//!
11//! # Memory Usage
12//!
13//! Reference sequences are stored as raw bytes, matching htsjdk's approach.
14//! For a typical human reference (~3GB), this uses approximately 3GB of memory
15//! but provides the fastest possible load times (~9s vs ~22s for compressed storage).
16//!
17//! # Future improvement
18//!
19//! The custom FAI-based raw-byte reading (`read_sequence_raw`) could be replaced with
20//! noodles' built-in indexed reader once <https://github.com/zaeleus/noodles/pull/365>
21//! is merged and released, which adds the same optimization to noodles.
22use crate::errors::FgumiError;
23use anyhow::{Context, Result};
24use log::debug;
25use noodles::core::Position;
26use noodles::fasta::fai;
27use std::collections::HashMap;
28use std::fs::File;
29use std::io::{Read, Seek, SeekFrom};
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32
33/// Read a sequence from a FASTA file using FAI index metadata.
34/// Uses raw byte reading with mathematical newline handling (htsjdk-style).
35///
36/// Optimized: Reads entire sequence bytes in one syscall, then strips newlines in memory.
37/// Fast path: If sequence fits in a single line, skip newline stripping entirely.
38#[allow(clippy::cast_possible_truncation)]
39fn read_sequence_raw(file: &mut File, record: &fai::Record) -> Result<Vec<u8>> {
40    let line_bases = record.line_bases() as usize;
41    let line_width = record.line_width() as usize;
42    let seq_len = record.length() as usize;
43    let offset = record.offset();
44
45    // Fast path: if sequence fits in a single line, no newlines to strip
46    if seq_len <= line_bases {
47        file.seek(SeekFrom::Start(offset))?;
48        let mut sequence = vec![0u8; seq_len];
49        file.read_exact(&mut sequence)?;
50        return Ok(sequence);
51    }
52
53    // Calculate number of complete lines and remaining bases
54    let complete_lines = seq_len / line_bases;
55    let remaining_bases = seq_len % line_bases;
56
57    // Total bytes = (complete_lines * line_width) + remaining_bases
58    // But last line might not have a terminator, so we calculate conservatively
59    let total_bytes = if remaining_bases > 0 {
60        complete_lines * line_width + remaining_bases
61    } else if complete_lines > 0 {
62        // All bases fit exactly in complete lines, last line has no terminator at end of seq
63        (complete_lines - 1) * line_width + line_bases
64    } else {
65        0
66    };
67
68    // Seek and read all bytes at once
69    file.seek(SeekFrom::Start(offset))?;
70    let mut raw_bytes = vec![0u8; total_bytes];
71    file.read_exact(&mut raw_bytes)?;
72
73    // Strip newlines in memory (much faster than seeking)
74    let mut sequence = Vec::with_capacity(seq_len);
75    let terminator_len = line_width - line_bases;
76
77    let mut pos = 0;
78    while sequence.len() < seq_len && pos < raw_bytes.len() {
79        // Read up to line_bases bytes
80        let bases_to_copy = (seq_len - sequence.len()).min(line_bases).min(raw_bytes.len() - pos);
81        sequence.extend_from_slice(&raw_bytes[pos..pos + bases_to_copy]);
82        pos += bases_to_copy;
83
84        // Skip line terminator if present and we need more bases
85        if sequence.len() < seq_len && pos < raw_bytes.len() {
86            pos += terminator_len;
87        }
88    }
89
90    Ok(sequence)
91}
92
93/// Find a sibling file for a FASTA file by trying two naming conventions:
94/// 1. Replace extension (e.g., `ref.fasta` → `ref.<replace_ext>`)
95/// 2. Append extension to full path (e.g., `ref.fa` → `ref.fa.<append_ext>`)
96fn find_sibling_file(fasta_path: &Path, replace_ext: &str, append_ext: &str) -> Option<PathBuf> {
97    let replaced = fasta_path.with_extension(replace_ext);
98    if replaced.exists() {
99        return Some(replaced);
100    }
101
102    let appended = PathBuf::from(format!("{}.{append_ext}", fasta_path.display()));
103    if appended.exists() {
104        return Some(appended);
105    }
106
107    None
108}
109
110/// Find FAI index path for a FASTA file.
111///
112/// Tries multiple naming conventions:
113/// 1. Replace extension with `.fa.fai` (e.g., `ref.fasta` → `ref.fa.fai`)
114/// 2. Append `.fai` to full path (e.g., `ref.fa` → `ref.fa.fai`, `ref.fasta` → `ref.fasta.fai`)
115fn find_fai_path(fasta_path: &Path) -> Option<PathBuf> {
116    find_sibling_file(fasta_path, "fa.fai", "fai")
117}
118
119/// Find sequence dictionary path for a FASTA file.
120///
121/// Tries multiple naming conventions used by different tools:
122/// 1. Replace extension with `.dict` (fgbio/HTSJDK/Picard convention: `ref.fa` → `ref.dict`)
123/// 2. Append `.dict` to full path (GATK convention: `ref.fa` → `ref.fa.dict`)
124///
125/// # Arguments
126/// * `fasta_path` - Path to the FASTA file
127///
128/// # Returns
129/// The path to the dictionary file if found, or `None` if not found.
130///
131/// # Examples
132/// ```no_run
133/// use std::path::Path;
134/// use fgumi_lib::reference::find_dict_path;
135///
136/// // Will find either "ref.dict" or "ref.fa.dict"
137/// if let Some(dict_path) = find_dict_path(Path::new("ref.fa")) {
138///     println!("Found dictionary: {}", dict_path.display());
139/// }
140/// ```
141#[must_use]
142pub fn find_dict_path(fasta_path: &Path) -> Option<PathBuf> {
143    find_sibling_file(fasta_path, "dict", "dict")
144}
145
146/// A thread-safe reference genome reader with all sequences preloaded into memory.
147///
148/// This reader loads the entire FASTA file into memory at construction time,
149/// providing O(1) lookup performance for sequence fetches. This approach matches
150/// fgbio's `nmUqMdTagRegeneratingWriter` which reads all contigs into a Map upfront.
151///
152/// For a typical human reference (e.g., hs38DH at ~3GB), this uses approximately
153/// 3GB of memory (raw byte storage like htsjdk) and provides the fastest load times.
154#[derive(Clone)]
155pub struct ReferenceReader {
156    /// All sequences loaded into memory as raw bytes, keyed by sequence name
157    sequences: Arc<HashMap<String, Vec<u8>>>,
158}
159
160impl ReferenceReader {
161    /// Creates a new reference reader, loading all sequences into memory.
162    ///
163    /// This reads the entire FASTA file into memory at construction time.
164    /// For a typical human reference (~3GB), this takes a few seconds but
165    /// provides O(1) lookup performance for all subsequent fetches.
166    ///
167    /// # Arguments
168    /// * `path` - Path to the reference FASTA file (may be gzipped)
169    ///
170    /// # Errors
171    /// Returns an error if:
172    /// - The file does not exist
173    /// - The file cannot be read or parsed as FASTA
174    ///
175    /// # Examples
176    /// ```no_run
177    /// use fgumi_lib::reference::ReferenceReader;
178    ///
179    /// let reader = ReferenceReader::new("reference.fasta")?;
180    /// # Ok::<(), anyhow::Error>(())
181    /// ```
182    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
183        let path = path.as_ref();
184
185        // Verify the file exists and is readable
186        if !path.exists() {
187            return Err(FgumiError::InvalidFileFormat {
188                file_type: "Reference FASTA".to_string(),
189                path: path.display().to_string(),
190                reason: "File does not exist".to_string(),
191            }
192            .into());
193        }
194
195        debug!("Reading reference FASTA into memory: {}", path.display());
196
197        // Try FAI-based fast reading first
198        if let Some(fai_path) = find_fai_path(path) {
199            debug!("Using FAI index for fast loading: {}", fai_path.display());
200            return Self::new_with_fai(path, &fai_path);
201        }
202
203        // Fall back to noodles for non-indexed files
204        debug!("No FAI index found, using sequential reading");
205        Self::new_sequential(path)
206    }
207
208    /// Load sequences using FAI index for fast raw-byte reading (htsjdk-style).
209    fn new_with_fai(fasta_path: &Path, fai_path: &Path) -> Result<Self> {
210        let index = fai::fs::read(fai_path)
211            .with_context(|| format!("Failed to read FAI index: {}", fai_path.display()))?;
212        let records: &[fai::Record] = index.as_ref();
213        let mut file = File::open(fasta_path)
214            .with_context(|| format!("Failed to open FASTA: {}", fasta_path.display()))?;
215
216        let mut sequences = HashMap::with_capacity(records.len());
217
218        for record in records {
219            let raw_sequence = read_sequence_raw(&mut file, record)?;
220            let name = String::from_utf8_lossy(record.name().as_ref()).into_owned();
221            sequences.insert(name, raw_sequence);
222        }
223
224        debug!("Loaded {} contigs into memory (FAI-indexed)", sequences.len());
225        Ok(Self { sequences: Arc::new(sequences) })
226    }
227
228    /// Load sequences using noodles sequential reading (fallback for non-indexed files).
229    fn new_sequential(path: &Path) -> Result<Self> {
230        use noodles::fasta;
231
232        let mut sequences = HashMap::new();
233        let mut reader = fasta::io::reader::Builder.build_from_path(path)?;
234
235        for result in reader.records() {
236            let record = result?;
237            let name = std::str::from_utf8(record.name())?.to_string();
238            let raw_sequence: Vec<u8> = record.sequence().as_ref().to_vec();
239            sequences.insert(name, raw_sequence);
240        }
241
242        debug!("Loaded {} contigs into memory (sequential)", sequences.len());
243        Ok(Self { sequences: Arc::new(sequences) })
244    }
245
246    /// Retrieves a subsequence from the reference genome.
247    ///
248    /// Since all sequences are preloaded into memory, this is an O(1) lookup
249    /// followed by a slice copy.
250    ///
251    /// # Arguments
252    /// * `chrom` - Chromosome/sequence name (e.g., "chr1", "1")
253    /// * `start` - Start position (1-based, inclusive)
254    /// * `end` - End position (1-based, inclusive)
255    ///
256    /// # Returns
257    /// The requested subsequence as a vector of bytes (preserving original case)
258    ///
259    /// # Errors
260    /// Returns an error if:
261    /// - The chromosome is not found in the reference
262    /// - The requested region exceeds the chromosome length
263    ///
264    /// # Examples
265    /// ```no_run
266    /// use fgumi_lib::reference::ReferenceReader;
267    /// use noodles::core::Position;
268    ///
269    /// let reader = ReferenceReader::new("reference.fasta")?;
270    ///
271    /// // Fetch first 100 bases of chr1
272    /// let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(100)?)?;
273    /// assert_eq!(seq.len(), 100);
274    /// # Ok::<(), anyhow::Error>(())
275    /// ```
276    pub fn fetch(&self, chrom: &str, start: Position, end: Position) -> Result<Vec<u8>> {
277        Ok(self.fetch_slice(chrom, start, end)?.to_vec())
278    }
279
280    /// Borrowed-slice variant of [`fetch`] that returns a reference into the
281    /// in-memory sequence — same lookup semantics, no allocation.
282    ///
283    /// Prefer this over [`fetch`] in hot loops (e.g., per-record reference
284    /// access in `fgumi zipper`'s `--restore-unconverted-bases` path) where
285    /// allocating a fresh `Vec<u8>` per call adds up to gigabytes of churn.
286    ///
287    /// # Errors
288    ///
289    /// Same as [`fetch`]: returns an error if the chromosome is missing or the
290    /// requested region exceeds the sequence length.
291    pub fn fetch_slice(&self, chrom: &str, start: Position, end: Position) -> Result<&[u8]> {
292        let sequence = self
293            .sequences
294            .get(chrom)
295            .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
296
297        // Convert from 1-based inclusive to 0-based [start, end) indexing
298        let start_idx = usize::from(start) - 1;
299        let end_idx = usize::from(end);
300
301        if end_idx > sequence.len() || start_idx >= end_idx {
302            return Err(FgumiError::InvalidParameter {
303                parameter: "region".to_string(),
304                reason: format!(
305                    "Requested region {}:{}-{} exceeds sequence length {}",
306                    chrom,
307                    start,
308                    end,
309                    sequence.len()
310                ),
311            }
312            .into());
313        }
314
315        Ok(&sequence[start_idx..end_idx])
316    }
317
318    /// Gets a single base from the reference at the specified position.
319    ///
320    /// This is a convenience method that delegates to `fetch()` with a single-base region.
321    ///
322    /// # Arguments
323    /// * `chrom` - Chromosome/sequence name (e.g., "chr1", "1")
324    /// * `pos` - Position (1-based)
325    ///
326    /// # Returns
327    /// The base at the specified position (preserving original case from FASTA)
328    ///
329    /// # Errors
330    /// Returns an error if:
331    /// - The chromosome is not found in the reference
332    /// - The position exceeds the chromosome length
333    ///
334    /// # Examples
335    /// ```no_run
336    /// use fgumi_lib::reference::ReferenceReader;
337    /// use noodles::core::Position;
338    ///
339    /// let reader = ReferenceReader::new("reference.fasta")?;
340    ///
341    /// // Get the base at position 1000 of chr1
342    /// let base = reader.base_at("chr1", Position::try_from(1000)?)?;
343    /// assert!(matches!(base, b'A' | b'C' | b'G' | b'T' | b'N'));
344    /// # Ok::<(), anyhow::Error>(())
345    /// ```
346    pub fn base_at(&self, chrom: &str, pos: Position) -> Result<u8> {
347        let sequence = self
348            .sequences
349            .get(chrom)
350            .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
351
352        // Convert from 1-based to 0-based indexing
353        let pos_idx = usize::from(pos) - 1;
354
355        sequence.get(pos_idx).copied().ok_or_else(|| {
356            FgumiError::InvalidParameter {
357                parameter: "position".to_string(),
358                reason: format!(
359                    "Position {}:{} exceeds sequence length {}",
360                    chrom,
361                    pos,
362                    sequence.len()
363                ),
364            }
365            .into()
366        })
367    }
368}
369
370impl fgumi_sam::ReferenceProvider for ReferenceReader {
371    fn fetch(
372        &self,
373        chrom: &str,
374        start: noodles::core::Position,
375        end: noodles::core::Position,
376    ) -> anyhow::Result<Vec<u8>> {
377        self.fetch(chrom, start, end)
378    }
379}
380
381#[cfg(feature = "simplex")]
382impl fgumi_consensus::methylation::RefBaseProvider for ReferenceReader {
383    fn base_at_0based(&self, chrom: &str, pos: u64) -> Option<u8> {
384        let sequence = self.sequences.get(chrom)?;
385        sequence.get(usize::try_from(pos).ok()?).copied()
386    }
387
388    fn sequence_for(&self, chrom: &str) -> Option<&[u8]> {
389        self.sequences.get(chrom).map(Vec::as_slice)
390    }
391}
392
393#[cfg(test)]
394mod tests {
395    use super::*;
396    use crate::sam::builder::create_default_test_fasta;
397
398    #[test]
399    fn test_fetch_subsequence() -> Result<()> {
400        let fasta = create_default_test_fasta()?;
401        let reader = ReferenceReader::new(fasta.path())?;
402
403        // Fetch from chr1
404        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
405        assert_eq!(seq, b"ACGT");
406
407        // Fetch from chr2
408        let seq = reader.fetch("chr2", Position::try_from(5)?, Position::try_from(8)?)?;
409        assert_eq!(seq, b"CCCC");
410
411        Ok(())
412    }
413
414    #[test]
415    fn test_fetch_slice_returns_borrowed_bytes() -> Result<()> {
416        let fasta = create_default_test_fasta()?;
417        let reader = ReferenceReader::new(fasta.path())?;
418
419        // Borrowed alternative to `fetch` — same bytes, no allocation.
420        let slice: &[u8] =
421            reader.fetch_slice("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
422        assert_eq!(slice, b"ACGT");
423
424        // Same bytes as `fetch` would return.
425        let owned = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
426        assert_eq!(slice, owned.as_slice());
427
428        // Bounds errors propagate just like `fetch`.
429        assert!(
430            reader
431                .fetch_slice("chr1", Position::try_from(1)?, Position::try_from(10_000)?)
432                .is_err()
433        );
434        assert!(
435            reader.fetch_slice("nope", Position::try_from(1)?, Position::try_from(2)?).is_err()
436        );
437        // Inverted interval (start > end) must also error.
438        assert!(
439            reader.fetch_slice("chr1", Position::try_from(5)?, Position::try_from(4)?).is_err()
440        );
441
442        Ok(())
443    }
444
445    #[test]
446    fn test_base_at() -> Result<()> {
447        let fasta = create_default_test_fasta()?;
448        let reader = ReferenceReader::new(fasta.path())?;
449
450        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
451        assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
452        assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'G');
453
454        Ok(())
455    }
456
457    #[test]
458    fn test_all_sequences_loaded() -> Result<()> {
459        let fasta = create_default_test_fasta()?;
460        let reader = ReferenceReader::new(fasta.path())?;
461
462        // All sequences should be available immediately after construction
463        let seq1 = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
464        assert_eq!(seq1, b"ACGT");
465
466        let seq2 = reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?;
467        assert_eq!(seq2, b"GGGG");
468
469        // Fetching chr1 again should still work (all in memory)
470        let seq1_again = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
471        assert_eq!(seq1_again, b"ACGT");
472
473        Ok(())
474    }
475
476    #[test]
477    fn test_nonexistent_sequence() {
478        let fasta = create_default_test_fasta().expect("creating test FASTA should succeed");
479        let reader =
480            ReferenceReader::new(fasta.path()).expect("creating reference reader should succeed");
481
482        let result = reader.fetch(
483            "chr999",
484            Position::try_from(1).expect("position conversion should succeed"),
485            Position::try_from(4).expect("position conversion should succeed"),
486        );
487        assert!(result.is_err());
488    }
489
490    #[test]
491    fn test_out_of_bounds() {
492        let fasta = create_default_test_fasta().expect("creating test FASTA should succeed");
493        let reader =
494            ReferenceReader::new(fasta.path()).expect("creating reference reader should succeed");
495
496        // chr1 is only 12 bases long
497        let result = reader.fetch(
498            "chr1",
499            Position::try_from(1).expect("position conversion should succeed"),
500            Position::try_from(100).expect("position conversion should succeed"),
501        );
502        assert!(result.is_err());
503    }
504
505    #[test]
506    fn test_reference_case_preserved() -> Result<()> {
507        // Test that FASTA case is preserved (important for MD tag generation)
508        // fgbio preserves case in reference sequences for proper MD tag output
509        use crate::sam::builder::create_test_fasta;
510
511        let file = create_test_fasta(&[("chr1", "AcGtNnAaCcGgTt")])?; // Mixed case sequence
512
513        let reader = ReferenceReader::new(file.path())?;
514
515        // Fetch the full sequence and verify case is preserved
516        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(14)?)?;
517        assert_eq!(seq, b"AcGtNnAaCcGgTt");
518
519        // Verify individual bases preserve case
520        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A'); // uppercase A
521        assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'c'); // lowercase c
522        assert_eq!(reader.base_at("chr1", Position::try_from(3)?)?, b'G'); // uppercase G
523        assert_eq!(reader.base_at("chr1", Position::try_from(4)?)?, b't'); // lowercase t
524        assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N'); // uppercase N
525        assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'n'); // lowercase n
526
527        Ok(())
528    }
529
530    #[test]
531    fn test_n_bases_at_various_positions() -> Result<()> {
532        use crate::sam::builder::create_test_fasta;
533
534        // N at start, middle, and end
535        let file = create_test_fasta(&[("chr1", "NACGTNACGTN")])?;
536        let reader = ReferenceReader::new(file.path())?;
537
538        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'N');
539        assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'N');
540        assert_eq!(reader.base_at("chr1", Position::try_from(11)?)?, b'N');
541
542        // Fetch range including N
543        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(6)?)?;
544        assert_eq!(seq, b"NACGTN");
545
546        Ok(())
547    }
548
549    #[test]
550    fn test_all_n_sequence() -> Result<()> {
551        use crate::sam::builder::create_test_fasta;
552
553        let file = create_test_fasta(&[("chrN", "NNNNNNNNNN")])?;
554        let reader = ReferenceReader::new(file.path())?;
555
556        let seq = reader.fetch("chrN", Position::try_from(1)?, Position::try_from(10)?)?;
557        assert_eq!(seq, b"NNNNNNNNNN");
558
559        for i in 1..=10 {
560            assert_eq!(reader.base_at("chrN", Position::try_from(i)?)?, b'N');
561        }
562
563        Ok(())
564    }
565
566    #[test]
567    fn test_long_sequence_boundaries() -> Result<()> {
568        use crate::sam::builder::create_test_fasta;
569
570        // Create a 100-base sequence with markers at specific positions
571        // We want to test fetching across the 32-base boundary (for bit-packed storage)
572        let mut seq = String::new();
573
574        // Positions 1-28: ACGT repeated (7 times)
575        for _ in 0..7 {
576            seq.push_str("ACGT");
577        }
578        // Positions 29-32: NNNN (straddles u64 boundary at position 32)
579        seq.push_str("NNNN");
580        // Positions 33-60: ACGT repeated (7 times)
581        for _ in 0..7 {
582            seq.push_str("ACGT");
583        }
584        // Positions 61-64: lowercase acgt (straddles second u64 boundary)
585        seq.push_str("acgt");
586        // Positions 65-100: ACGT repeated (9 times)
587        for _ in 0..9 {
588            seq.push_str("ACGT");
589        }
590
591        assert_eq!(seq.len(), 100);
592
593        let file = create_test_fasta(&[("chr1", &seq)])?;
594        let reader = ReferenceReader::new(file.path())?;
595
596        // Verify positions around first marker (NNNN at 29-32)
597        assert_eq!(reader.base_at("chr1", Position::try_from(28)?)?, b'T');
598        assert_eq!(reader.base_at("chr1", Position::try_from(29)?)?, b'N');
599        assert_eq!(reader.base_at("chr1", Position::try_from(32)?)?, b'N');
600        assert_eq!(reader.base_at("chr1", Position::try_from(33)?)?, b'A');
601
602        // Verify positions around second marker (acgt at 61-64)
603        assert_eq!(reader.base_at("chr1", Position::try_from(60)?)?, b'T');
604        assert_eq!(reader.base_at("chr1", Position::try_from(61)?)?, b'a');
605        assert_eq!(reader.base_at("chr1", Position::try_from(64)?)?, b't');
606        assert_eq!(reader.base_at("chr1", Position::try_from(65)?)?, b'A');
607
608        // Fetch across boundaries
609        let cross_first = reader.fetch("chr1", Position::try_from(27)?, Position::try_from(34)?)?;
610        assert_eq!(cross_first, b"GTNNNNAC");
611
612        let cross_second =
613            reader.fetch("chr1", Position::try_from(59)?, Position::try_from(66)?)?;
614        assert_eq!(cross_second, b"GTacgtAC");
615
616        Ok(())
617    }
618
619    #[test]
620    fn test_single_base_sequence() -> Result<()> {
621        use crate::sam::builder::create_test_fasta;
622
623        let file = create_test_fasta(&[("chr1", "A"), ("chr2", "N"), ("chr3", "g")])?;
624        let reader = ReferenceReader::new(file.path())?;
625
626        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
627        assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'N');
628        assert_eq!(reader.base_at("chr3", Position::try_from(1)?)?, b'g');
629
630        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
631        assert_eq!(seq, b"A");
632
633        Ok(())
634    }
635
636    #[test]
637    fn test_fetch_full_sequence() -> Result<()> {
638        use crate::sam::builder::create_test_fasta;
639
640        let original = "ACGTNacgtn";
641        let file = create_test_fasta(&[("chr1", original)])?;
642        let reader = ReferenceReader::new(file.path())?;
643
644        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(10)?)?;
645        assert_eq!(seq, original.as_bytes());
646
647        Ok(())
648    }
649
650    #[test]
651    fn test_fetch_last_base() -> Result<()> {
652        use crate::sam::builder::create_test_fasta;
653
654        let file = create_test_fasta(&[("chr1", "ACGTN")])?;
655        let reader = ReferenceReader::new(file.path())?;
656
657        // Fetch last base
658        let seq = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(5)?)?;
659        assert_eq!(seq, b"N");
660
661        assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N');
662
663        Ok(())
664    }
665
666    #[test]
667    fn test_multiple_chromosomes_isolation() -> Result<()> {
668        use crate::sam::builder::create_test_fasta;
669
670        let file = create_test_fasta(&[
671            ("chr1", "AAAA"),
672            ("chr2", "CCCC"),
673            ("chr3", "GGGG"),
674            ("chr4", "TTTT"),
675        ])?;
676        let reader = ReferenceReader::new(file.path())?;
677
678        // Verify each chromosome has its own sequence
679        assert_eq!(reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?, b"AAAA");
680        assert_eq!(reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?, b"CCCC");
681        assert_eq!(reader.fetch("chr3", Position::try_from(1)?, Position::try_from(4)?)?, b"GGGG");
682        assert_eq!(reader.fetch("chr4", Position::try_from(1)?, Position::try_from(4)?)?, b"TTTT");
683
684        Ok(())
685    }
686
687    #[test]
688    fn test_mixed_case_all_bases() -> Result<()> {
689        use crate::sam::builder::create_test_fasta;
690
691        // Test all 10 possible base values: A, C, G, T, N (upper and lower)
692        let file = create_test_fasta(&[("chr1", "ACGTNacgtn")])?;
693        let reader = ReferenceReader::new(file.path())?;
694
695        let expected = [b'A', b'C', b'G', b'T', b'N', b'a', b'c', b'g', b't', b'n'];
696        for (i, &expected_base) in expected.iter().enumerate() {
697            let pos = Position::try_from(i + 1)?;
698            assert_eq!(
699                reader.base_at("chr1", pos)?,
700                expected_base,
701                "Mismatch at position {}",
702                i + 1
703            );
704        }
705
706        Ok(())
707    }
708
709    #[test]
710    fn test_runs_of_n_bases() -> Result<()> {
711        use crate::sam::builder::create_test_fasta;
712
713        // Simulate masked regions with runs of N
714        let file = create_test_fasta(&[("chr1", "ACGTNNNNNNNNACGT")])?;
715        let reader = ReferenceReader::new(file.path())?;
716
717        // Fetch run of N's
718        let n_run = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(12)?)?;
719        assert_eq!(n_run, b"NNNNNNNN");
720
721        // Fetch across N boundary
722        let across = reader.fetch("chr1", Position::try_from(3)?, Position::try_from(14)?)?;
723        assert_eq!(across, b"GTNNNNNNNNAC");
724
725        Ok(())
726    }
727
728    #[test]
729    fn test_position_one_based() -> Result<()> {
730        use crate::sam::builder::create_test_fasta;
731
732        let file = create_test_fasta(&[("chr1", "ACGTN")])?;
733        let reader = ReferenceReader::new(file.path())?;
734
735        // Position 1 should be first base 'A', not second
736        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
737        assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
738
739        // fetch(1, 1) should return single base 'A'
740        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
741        assert_eq!(seq, b"A");
742
743        // fetch(1, 2) should return "AC"
744        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(2)?)?;
745        assert_eq!(seq, b"AC");
746
747        Ok(())
748    }
749
750    #[test]
751    fn test_find_dict_path_replacing_convention() -> Result<()> {
752        // Test the fgbio/HTSJDK/Picard convention: ref.fa -> ref.dict
753        let temp_dir = tempfile::tempdir()?;
754        let fasta_path = temp_dir.path().join("ref.fa");
755        let dict_path = temp_dir.path().join("ref.dict");
756
757        // Create empty files
758        std::fs::write(&fasta_path, "")?;
759        std::fs::write(&dict_path, "")?;
760
761        let found = find_dict_path(&fasta_path);
762        assert!(found.is_some());
763        assert_eq!(found.expect("should find dictionary path"), dict_path);
764
765        Ok(())
766    }
767
768    #[test]
769    fn test_find_dict_path_appending_convention() -> Result<()> {
770        // Test the GATK convention: ref.fa -> ref.fa.dict
771        let temp_dir = tempfile::tempdir()?;
772        let fasta_path = temp_dir.path().join("ref.fa");
773        let dict_path = temp_dir.path().join("ref.fa.dict");
774
775        // Create empty files
776        std::fs::write(&fasta_path, "")?;
777        std::fs::write(&dict_path, "")?;
778
779        let found = find_dict_path(&fasta_path);
780        assert!(found.is_some());
781        assert_eq!(found.expect("should find dictionary path"), dict_path);
782
783        Ok(())
784    }
785
786    #[test]
787    fn test_find_dict_path_prefers_replacing_convention() -> Result<()> {
788        // When both exist, prefer the fgbio/HTSJDK convention (ref.dict)
789        let temp_dir = tempfile::tempdir()?;
790        let fasta_path = temp_dir.path().join("ref.fa");
791        let dict_replacing = temp_dir.path().join("ref.dict");
792        let dict_appending = temp_dir.path().join("ref.fa.dict");
793
794        // Create all files
795        std::fs::write(&fasta_path, "")?;
796        std::fs::write(&dict_replacing, "")?;
797        std::fs::write(&dict_appending, "")?;
798
799        let found = find_dict_path(&fasta_path);
800        assert!(found.is_some());
801        // Should find ref.dict first (fgbio/HTSJDK convention)
802        assert_eq!(found.expect("should find dictionary path"), dict_replacing);
803
804        Ok(())
805    }
806
807    #[test]
808    fn test_find_dict_path_not_found() {
809        // When no dict file exists, return None
810        let temp_dir = tempfile::tempdir().expect("creating temp file/dir should succeed");
811        let fasta_path = temp_dir.path().join("ref.fa");
812
813        // Create only the FASTA file, no dict
814        std::fs::write(&fasta_path, "").expect("writing file should succeed");
815
816        let found = find_dict_path(&fasta_path);
817        assert!(found.is_none());
818    }
819
820    #[test]
821    fn test_find_dict_path_fasta_extension() -> Result<()> {
822        // Test with .fasta extension: ref.fasta -> ref.dict
823        let temp_dir = tempfile::tempdir()?;
824        let fasta_path = temp_dir.path().join("ref.fasta");
825        let dict_path = temp_dir.path().join("ref.dict");
826
827        std::fs::write(&fasta_path, "")?;
828        std::fs::write(&dict_path, "")?;
829
830        let found = find_dict_path(&fasta_path);
831        assert!(found.is_some());
832        assert_eq!(found.expect("should find dictionary path"), dict_path);
833
834        Ok(())
835    }
836
837    #[test]
838    fn test_find_dict_path_fasta_appending_convention() -> Result<()> {
839        // Test with .fasta extension: ref.fasta -> ref.fasta.dict
840        let temp_dir = tempfile::tempdir()?;
841        let fasta_path = temp_dir.path().join("ref.fasta");
842        let dict_path = temp_dir.path().join("ref.fasta.dict");
843
844        std::fs::write(&fasta_path, "")?;
845        std::fs::write(&dict_path, "")?;
846
847        let found = find_dict_path(&fasta_path);
848        assert!(found.is_some());
849        assert_eq!(found.expect("should find dictionary path"), dict_path);
850
851        Ok(())
852    }
853}