Skip to main content

fgumi_lib/
reference.rs

1//! Reference genome FASTA reading with all sequences loaded into memory.
2//!
3//! This module provides thread-safe access to reference genome sequences, which is needed
4//! for tasks like NM/UQ/MD tag calculation and variant calling.
5//!
6//! Following fgbio's approach, the entire reference is loaded into memory at startup
7//! to ensure O(1) lookup performance for each read during tag regeneration.
8//!
9//! Uses FAI index for fast raw-byte reading (htsjdk-style) instead of line-by-line parsing.
10//!
11//! # Memory Usage
12//!
13//! Reference sequences are stored as raw bytes, matching htsjdk's approach.
14//! For a typical human reference (~3GB), this uses approximately 3GB of memory
15//! but provides the fastest possible load times (~9s vs ~22s for compressed storage).
16//!
17//! # Future improvement
18//!
19//! The custom FAI-based raw-byte reading (`read_sequence_raw`) could be replaced with
20//! noodles' built-in indexed reader once <https://github.com/zaeleus/noodles/pull/365>
21//! is merged and released, which adds the same optimization to noodles.
22use crate::errors::FgumiError;
23use anyhow::{Context, Result};
24use log::debug;
25use noodles::core::Position;
26use noodles::fasta::fai;
27use std::collections::HashMap;
28use std::fs::File;
29use std::io::{Read, Seek, SeekFrom};
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32
33/// Read a sequence from a FASTA file using FAI index metadata.
34/// Uses raw byte reading with mathematical newline handling (htsjdk-style).
35///
36/// Optimized: Reads entire sequence bytes in one syscall, then strips newlines in memory.
37/// Fast path: If sequence fits in a single line, skip newline stripping entirely.
38#[allow(clippy::cast_possible_truncation)]
39fn read_sequence_raw(file: &mut File, record: &fai::Record) -> Result<Vec<u8>> {
40    let line_bases = record.line_bases() as usize;
41    let line_width = record.line_width() as usize;
42    let seq_len = record.length() as usize;
43    let offset = record.offset();
44
45    // Fast path: if sequence fits in a single line, no newlines to strip
46    if seq_len <= line_bases {
47        file.seek(SeekFrom::Start(offset))?;
48        let mut sequence = vec![0u8; seq_len];
49        file.read_exact(&mut sequence)?;
50        return Ok(sequence);
51    }
52
53    // Calculate number of complete lines and remaining bases
54    let complete_lines = seq_len / line_bases;
55    let remaining_bases = seq_len % line_bases;
56
57    // Total bytes = (complete_lines * line_width) + remaining_bases
58    // But last line might not have a terminator, so we calculate conservatively
59    let total_bytes = if remaining_bases > 0 {
60        complete_lines * line_width + remaining_bases
61    } else if complete_lines > 0 {
62        // All bases fit exactly in complete lines, last line has no terminator at end of seq
63        (complete_lines - 1) * line_width + line_bases
64    } else {
65        0
66    };
67
68    // Seek and read all bytes at once
69    file.seek(SeekFrom::Start(offset))?;
70    let mut raw_bytes = vec![0u8; total_bytes];
71    file.read_exact(&mut raw_bytes)?;
72
73    // Strip newlines in memory (much faster than seeking)
74    let mut sequence = Vec::with_capacity(seq_len);
75    let terminator_len = line_width - line_bases;
76
77    let mut pos = 0;
78    while sequence.len() < seq_len && pos < raw_bytes.len() {
79        // Read up to line_bases bytes
80        let bases_to_copy = (seq_len - sequence.len()).min(line_bases).min(raw_bytes.len() - pos);
81        sequence.extend_from_slice(&raw_bytes[pos..pos + bases_to_copy]);
82        pos += bases_to_copy;
83
84        // Skip line terminator if present and we need more bases
85        if sequence.len() < seq_len && pos < raw_bytes.len() {
86            pos += terminator_len;
87        }
88    }
89
90    Ok(sequence)
91}
92
93/// Find a sibling file for a FASTA file by trying two naming conventions:
94/// 1. Replace extension (e.g., `ref.fasta` → `ref.<replace_ext>`)
95/// 2. Append extension to full path (e.g., `ref.fa` → `ref.fa.<append_ext>`)
96fn find_sibling_file(fasta_path: &Path, replace_ext: &str, append_ext: &str) -> Option<PathBuf> {
97    let replaced = fasta_path.with_extension(replace_ext);
98    if replaced.exists() {
99        return Some(replaced);
100    }
101
102    let appended = PathBuf::from(format!("{}.{append_ext}", fasta_path.display()));
103    if appended.exists() {
104        return Some(appended);
105    }
106
107    None
108}
109
110/// Find FAI index path for a FASTA file.
111///
112/// Tries multiple naming conventions:
113/// 1. Replace extension with `.fa.fai` (e.g., `ref.fasta` → `ref.fa.fai`)
114/// 2. Append `.fai` to full path (e.g., `ref.fa` → `ref.fa.fai`, `ref.fasta` → `ref.fasta.fai`)
115fn find_fai_path(fasta_path: &Path) -> Option<PathBuf> {
116    find_sibling_file(fasta_path, "fa.fai", "fai")
117}
118
119/// Find sequence dictionary path for a FASTA file.
120///
121/// Tries multiple naming conventions used by different tools:
122/// 1. Replace extension with `.dict` (fgbio/HTSJDK/Picard convention: `ref.fa` → `ref.dict`)
123/// 2. Append `.dict` to full path (GATK convention: `ref.fa` → `ref.fa.dict`)
124///
125/// # Arguments
126/// * `fasta_path` - Path to the FASTA file
127///
128/// # Returns
129/// The path to the dictionary file if found, or `None` if not found.
130///
131/// # Examples
132/// ```no_run
133/// use std::path::Path;
134/// use fgumi_lib::reference::find_dict_path;
135///
136/// // Will find either "ref.dict" or "ref.fa.dict"
137/// if let Some(dict_path) = find_dict_path(Path::new("ref.fa")) {
138///     println!("Found dictionary: {}", dict_path.display());
139/// }
140/// ```
141#[must_use]
142pub fn find_dict_path(fasta_path: &Path) -> Option<PathBuf> {
143    find_sibling_file(fasta_path, "dict", "dict")
144}
145
146/// A thread-safe reference genome reader with all sequences preloaded into memory.
147///
148/// This reader loads the entire FASTA file into memory at construction time,
149/// providing O(1) lookup performance for sequence fetches. This approach matches
150/// fgbio's `nmUqMdTagRegeneratingWriter` which reads all contigs into a Map upfront.
151///
152/// For a typical human reference (e.g., hs38DH at ~3GB), this uses approximately
153/// 3GB of memory (raw byte storage like htsjdk) and provides the fastest load times.
154#[derive(Clone)]
155pub struct ReferenceReader {
156    /// All sequences loaded into memory as raw bytes, keyed by sequence name
157    sequences: Arc<HashMap<String, Vec<u8>>>,
158}
159
160impl ReferenceReader {
161    /// Creates a new reference reader, loading all sequences into memory.
162    ///
163    /// This reads the entire FASTA file into memory at construction time.
164    /// For a typical human reference (~3GB), this takes a few seconds but
165    /// provides O(1) lookup performance for all subsequent fetches.
166    ///
167    /// # Arguments
168    /// * `path` - Path to the reference FASTA file (may be gzipped)
169    ///
170    /// # Errors
171    /// Returns an error if:
172    /// - The file does not exist
173    /// - The file cannot be read or parsed as FASTA
174    ///
175    /// # Examples
176    /// ```no_run
177    /// use fgumi_lib::reference::ReferenceReader;
178    ///
179    /// let reader = ReferenceReader::new("reference.fasta")?;
180    /// # Ok::<(), anyhow::Error>(())
181    /// ```
182    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
183        let path = path.as_ref();
184
185        // Verify the file exists and is readable
186        if !path.exists() {
187            return Err(FgumiError::InvalidFileFormat {
188                file_type: "Reference FASTA".to_string(),
189                path: path.display().to_string(),
190                reason: "File does not exist".to_string(),
191            }
192            .into());
193        }
194
195        debug!("Reading reference FASTA into memory: {}", path.display());
196
197        // Try FAI-based fast reading first
198        if let Some(fai_path) = find_fai_path(path) {
199            debug!("Using FAI index for fast loading: {}", fai_path.display());
200            return Self::new_with_fai(path, &fai_path);
201        }
202
203        // Fall back to noodles for non-indexed files
204        debug!("No FAI index found, using sequential reading");
205        Self::new_sequential(path)
206    }
207
208    /// Load sequences using FAI index for fast raw-byte reading (htsjdk-style).
209    fn new_with_fai(fasta_path: &Path, fai_path: &Path) -> Result<Self> {
210        let index = fai::fs::read(fai_path)
211            .with_context(|| format!("Failed to read FAI index: {}", fai_path.display()))?;
212        let records: &[fai::Record] = index.as_ref();
213        let mut file = File::open(fasta_path)
214            .with_context(|| format!("Failed to open FASTA: {}", fasta_path.display()))?;
215
216        let mut sequences = HashMap::with_capacity(records.len());
217
218        for record in records {
219            let raw_sequence = read_sequence_raw(&mut file, record)?;
220            let name = String::from_utf8_lossy(record.name().as_ref()).into_owned();
221            sequences.insert(name, raw_sequence);
222        }
223
224        debug!("Loaded {} contigs into memory (FAI-indexed)", sequences.len());
225        Ok(Self { sequences: Arc::new(sequences) })
226    }
227
228    /// Load sequences using noodles sequential reading (fallback for non-indexed files).
229    fn new_sequential(path: &Path) -> Result<Self> {
230        use noodles::fasta;
231
232        let mut sequences = HashMap::new();
233        let mut reader = fasta::io::reader::Builder.build_from_path(path)?;
234
235        for result in reader.records() {
236            let record = result?;
237            let name = std::str::from_utf8(record.name())?.to_string();
238            let raw_sequence: Vec<u8> = record.sequence().as_ref().to_vec();
239            sequences.insert(name, raw_sequence);
240        }
241
242        debug!("Loaded {} contigs into memory (sequential)", sequences.len());
243        Ok(Self { sequences: Arc::new(sequences) })
244    }
245
246    /// Retrieves a subsequence from the reference genome.
247    ///
248    /// Since all sequences are preloaded into memory, this is an O(1) lookup
249    /// followed by a slice copy.
250    ///
251    /// # Arguments
252    /// * `chrom` - Chromosome/sequence name (e.g., "chr1", "1")
253    /// * `start` - Start position (1-based, inclusive)
254    /// * `end` - End position (1-based, inclusive)
255    ///
256    /// # Returns
257    /// The requested subsequence as a vector of bytes (preserving original case)
258    ///
259    /// # Errors
260    /// Returns an error if:
261    /// - The chromosome is not found in the reference
262    /// - The requested region exceeds the chromosome length
263    ///
264    /// # Examples
265    /// ```no_run
266    /// use fgumi_lib::reference::ReferenceReader;
267    /// use noodles::core::Position;
268    ///
269    /// let reader = ReferenceReader::new("reference.fasta")?;
270    ///
271    /// // Fetch first 100 bases of chr1
272    /// let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(100)?)?;
273    /// assert_eq!(seq.len(), 100);
274    /// # Ok::<(), anyhow::Error>(())
275    /// ```
276    pub fn fetch(&self, chrom: &str, start: Position, end: Position) -> Result<Vec<u8>> {
277        let sequence = self
278            .sequences
279            .get(chrom)
280            .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
281
282        // Convert from 1-based inclusive to 0-based [start, end) indexing
283        let start_idx = usize::from(start) - 1;
284        let end_idx = usize::from(end);
285
286        if end_idx > sequence.len() || start_idx > end_idx {
287            return Err(FgumiError::InvalidParameter {
288                parameter: "region".to_string(),
289                reason: format!(
290                    "Requested region {}:{}-{} exceeds sequence length {}",
291                    chrom,
292                    start,
293                    end,
294                    sequence.len()
295                ),
296            }
297            .into());
298        }
299
300        Ok(sequence[start_idx..end_idx].to_vec())
301    }
302
303    /// Gets a single base from the reference at the specified position.
304    ///
305    /// This is a convenience method that delegates to `fetch()` with a single-base region.
306    ///
307    /// # Arguments
308    /// * `chrom` - Chromosome/sequence name (e.g., "chr1", "1")
309    /// * `pos` - Position (1-based)
310    ///
311    /// # Returns
312    /// The base at the specified position (preserving original case from FASTA)
313    ///
314    /// # Errors
315    /// Returns an error if:
316    /// - The chromosome is not found in the reference
317    /// - The position exceeds the chromosome length
318    ///
319    /// # Examples
320    /// ```no_run
321    /// use fgumi_lib::reference::ReferenceReader;
322    /// use noodles::core::Position;
323    ///
324    /// let reader = ReferenceReader::new("reference.fasta")?;
325    ///
326    /// // Get the base at position 1000 of chr1
327    /// let base = reader.base_at("chr1", Position::try_from(1000)?)?;
328    /// assert!(matches!(base, b'A' | b'C' | b'G' | b'T' | b'N'));
329    /// # Ok::<(), anyhow::Error>(())
330    /// ```
331    pub fn base_at(&self, chrom: &str, pos: Position) -> Result<u8> {
332        let sequence = self
333            .sequences
334            .get(chrom)
335            .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
336
337        // Convert from 1-based to 0-based indexing
338        let pos_idx = usize::from(pos) - 1;
339
340        sequence.get(pos_idx).copied().ok_or_else(|| {
341            FgumiError::InvalidParameter {
342                parameter: "position".to_string(),
343                reason: format!(
344                    "Position {}:{} exceeds sequence length {}",
345                    chrom,
346                    pos,
347                    sequence.len()
348                ),
349            }
350            .into()
351        })
352    }
353}
354
355impl fgumi_sam::ReferenceProvider for ReferenceReader {
356    fn fetch(
357        &self,
358        chrom: &str,
359        start: noodles::core::Position,
360        end: noodles::core::Position,
361    ) -> anyhow::Result<Vec<u8>> {
362        self.fetch(chrom, start, end)
363    }
364}
365
366#[cfg(test)]
367mod tests {
368    use super::*;
369    use crate::sam::builder::create_default_test_fasta;
370
371    #[test]
372    fn test_fetch_subsequence() -> Result<()> {
373        let fasta = create_default_test_fasta()?;
374        let reader = ReferenceReader::new(fasta.path())?;
375
376        // Fetch from chr1
377        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
378        assert_eq!(seq, b"ACGT");
379
380        // Fetch from chr2
381        let seq = reader.fetch("chr2", Position::try_from(5)?, Position::try_from(8)?)?;
382        assert_eq!(seq, b"CCCC");
383
384        Ok(())
385    }
386
387    #[test]
388    fn test_base_at() -> Result<()> {
389        let fasta = create_default_test_fasta()?;
390        let reader = ReferenceReader::new(fasta.path())?;
391
392        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
393        assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
394        assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'G');
395
396        Ok(())
397    }
398
399    #[test]
400    fn test_all_sequences_loaded() -> Result<()> {
401        let fasta = create_default_test_fasta()?;
402        let reader = ReferenceReader::new(fasta.path())?;
403
404        // All sequences should be available immediately after construction
405        let seq1 = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
406        assert_eq!(seq1, b"ACGT");
407
408        let seq2 = reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?;
409        assert_eq!(seq2, b"GGGG");
410
411        // Fetching chr1 again should still work (all in memory)
412        let seq1_again = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
413        assert_eq!(seq1_again, b"ACGT");
414
415        Ok(())
416    }
417
418    #[test]
419    fn test_nonexistent_sequence() {
420        let fasta = create_default_test_fasta().unwrap();
421        let reader = ReferenceReader::new(fasta.path()).unwrap();
422
423        let result =
424            reader.fetch("chr999", Position::try_from(1).unwrap(), Position::try_from(4).unwrap());
425        assert!(result.is_err());
426    }
427
428    #[test]
429    fn test_out_of_bounds() {
430        let fasta = create_default_test_fasta().unwrap();
431        let reader = ReferenceReader::new(fasta.path()).unwrap();
432
433        // chr1 is only 12 bases long
434        let result =
435            reader.fetch("chr1", Position::try_from(1).unwrap(), Position::try_from(100).unwrap());
436        assert!(result.is_err());
437    }
438
439    #[test]
440    fn test_reference_case_preserved() -> Result<()> {
441        // Test that FASTA case is preserved (important for MD tag generation)
442        // fgbio preserves case in reference sequences for proper MD tag output
443        use crate::sam::builder::create_test_fasta;
444
445        let file = create_test_fasta(&[("chr1", "AcGtNnAaCcGgTt")])?; // Mixed case sequence
446
447        let reader = ReferenceReader::new(file.path())?;
448
449        // Fetch the full sequence and verify case is preserved
450        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(14)?)?;
451        assert_eq!(seq, b"AcGtNnAaCcGgTt");
452
453        // Verify individual bases preserve case
454        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A'); // uppercase A
455        assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'c'); // lowercase c
456        assert_eq!(reader.base_at("chr1", Position::try_from(3)?)?, b'G'); // uppercase G
457        assert_eq!(reader.base_at("chr1", Position::try_from(4)?)?, b't'); // lowercase t
458        assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N'); // uppercase N
459        assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'n'); // lowercase n
460
461        Ok(())
462    }
463
464    #[test]
465    fn test_n_bases_at_various_positions() -> Result<()> {
466        use crate::sam::builder::create_test_fasta;
467
468        // N at start, middle, and end
469        let file = create_test_fasta(&[("chr1", "NACGTNACGTN")])?;
470        let reader = ReferenceReader::new(file.path())?;
471
472        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'N');
473        assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'N');
474        assert_eq!(reader.base_at("chr1", Position::try_from(11)?)?, b'N');
475
476        // Fetch range including N
477        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(6)?)?;
478        assert_eq!(seq, b"NACGTN");
479
480        Ok(())
481    }
482
483    #[test]
484    fn test_all_n_sequence() -> Result<()> {
485        use crate::sam::builder::create_test_fasta;
486
487        let file = create_test_fasta(&[("chrN", "NNNNNNNNNN")])?;
488        let reader = ReferenceReader::new(file.path())?;
489
490        let seq = reader.fetch("chrN", Position::try_from(1)?, Position::try_from(10)?)?;
491        assert_eq!(seq, b"NNNNNNNNNN");
492
493        for i in 1..=10 {
494            assert_eq!(reader.base_at("chrN", Position::try_from(i)?)?, b'N');
495        }
496
497        Ok(())
498    }
499
500    #[test]
501    fn test_long_sequence_boundaries() -> Result<()> {
502        use crate::sam::builder::create_test_fasta;
503
504        // Create a 100-base sequence with markers at specific positions
505        // We want to test fetching across the 32-base boundary (for bit-packed storage)
506        let mut seq = String::new();
507
508        // Positions 1-28: ACGT repeated (7 times)
509        for _ in 0..7 {
510            seq.push_str("ACGT");
511        }
512        // Positions 29-32: NNNN (straddles u64 boundary at position 32)
513        seq.push_str("NNNN");
514        // Positions 33-60: ACGT repeated (7 times)
515        for _ in 0..7 {
516            seq.push_str("ACGT");
517        }
518        // Positions 61-64: lowercase acgt (straddles second u64 boundary)
519        seq.push_str("acgt");
520        // Positions 65-100: ACGT repeated (9 times)
521        for _ in 0..9 {
522            seq.push_str("ACGT");
523        }
524
525        assert_eq!(seq.len(), 100);
526
527        let file = create_test_fasta(&[("chr1", &seq)])?;
528        let reader = ReferenceReader::new(file.path())?;
529
530        // Verify positions around first marker (NNNN at 29-32)
531        assert_eq!(reader.base_at("chr1", Position::try_from(28)?)?, b'T');
532        assert_eq!(reader.base_at("chr1", Position::try_from(29)?)?, b'N');
533        assert_eq!(reader.base_at("chr1", Position::try_from(32)?)?, b'N');
534        assert_eq!(reader.base_at("chr1", Position::try_from(33)?)?, b'A');
535
536        // Verify positions around second marker (acgt at 61-64)
537        assert_eq!(reader.base_at("chr1", Position::try_from(60)?)?, b'T');
538        assert_eq!(reader.base_at("chr1", Position::try_from(61)?)?, b'a');
539        assert_eq!(reader.base_at("chr1", Position::try_from(64)?)?, b't');
540        assert_eq!(reader.base_at("chr1", Position::try_from(65)?)?, b'A');
541
542        // Fetch across boundaries
543        let cross_first = reader.fetch("chr1", Position::try_from(27)?, Position::try_from(34)?)?;
544        assert_eq!(cross_first, b"GTNNNNAC");
545
546        let cross_second =
547            reader.fetch("chr1", Position::try_from(59)?, Position::try_from(66)?)?;
548        assert_eq!(cross_second, b"GTacgtAC");
549
550        Ok(())
551    }
552
553    #[test]
554    fn test_single_base_sequence() -> Result<()> {
555        use crate::sam::builder::create_test_fasta;
556
557        let file = create_test_fasta(&[("chr1", "A"), ("chr2", "N"), ("chr3", "g")])?;
558        let reader = ReferenceReader::new(file.path())?;
559
560        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
561        assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'N');
562        assert_eq!(reader.base_at("chr3", Position::try_from(1)?)?, b'g');
563
564        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
565        assert_eq!(seq, b"A");
566
567        Ok(())
568    }
569
570    #[test]
571    fn test_fetch_full_sequence() -> Result<()> {
572        use crate::sam::builder::create_test_fasta;
573
574        let original = "ACGTNacgtn";
575        let file = create_test_fasta(&[("chr1", original)])?;
576        let reader = ReferenceReader::new(file.path())?;
577
578        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(10)?)?;
579        assert_eq!(seq, original.as_bytes());
580
581        Ok(())
582    }
583
584    #[test]
585    fn test_fetch_last_base() -> Result<()> {
586        use crate::sam::builder::create_test_fasta;
587
588        let file = create_test_fasta(&[("chr1", "ACGTN")])?;
589        let reader = ReferenceReader::new(file.path())?;
590
591        // Fetch last base
592        let seq = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(5)?)?;
593        assert_eq!(seq, b"N");
594
595        assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N');
596
597        Ok(())
598    }
599
600    #[test]
601    fn test_multiple_chromosomes_isolation() -> Result<()> {
602        use crate::sam::builder::create_test_fasta;
603
604        let file = create_test_fasta(&[
605            ("chr1", "AAAA"),
606            ("chr2", "CCCC"),
607            ("chr3", "GGGG"),
608            ("chr4", "TTTT"),
609        ])?;
610        let reader = ReferenceReader::new(file.path())?;
611
612        // Verify each chromosome has its own sequence
613        assert_eq!(reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?, b"AAAA");
614        assert_eq!(reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?, b"CCCC");
615        assert_eq!(reader.fetch("chr3", Position::try_from(1)?, Position::try_from(4)?)?, b"GGGG");
616        assert_eq!(reader.fetch("chr4", Position::try_from(1)?, Position::try_from(4)?)?, b"TTTT");
617
618        Ok(())
619    }
620
621    #[test]
622    fn test_mixed_case_all_bases() -> Result<()> {
623        use crate::sam::builder::create_test_fasta;
624
625        // Test all 10 possible base values: A, C, G, T, N (upper and lower)
626        let file = create_test_fasta(&[("chr1", "ACGTNacgtn")])?;
627        let reader = ReferenceReader::new(file.path())?;
628
629        let expected = [b'A', b'C', b'G', b'T', b'N', b'a', b'c', b'g', b't', b'n'];
630        for (i, &expected_base) in expected.iter().enumerate() {
631            let pos = Position::try_from(i + 1)?;
632            assert_eq!(
633                reader.base_at("chr1", pos)?,
634                expected_base,
635                "Mismatch at position {}",
636                i + 1
637            );
638        }
639
640        Ok(())
641    }
642
643    #[test]
644    fn test_runs_of_n_bases() -> Result<()> {
645        use crate::sam::builder::create_test_fasta;
646
647        // Simulate masked regions with runs of N
648        let file = create_test_fasta(&[("chr1", "ACGTNNNNNNNNACGT")])?;
649        let reader = ReferenceReader::new(file.path())?;
650
651        // Fetch run of N's
652        let n_run = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(12)?)?;
653        assert_eq!(n_run, b"NNNNNNNN");
654
655        // Fetch across N boundary
656        let across = reader.fetch("chr1", Position::try_from(3)?, Position::try_from(14)?)?;
657        assert_eq!(across, b"GTNNNNNNNNAC");
658
659        Ok(())
660    }
661
662    #[test]
663    fn test_position_one_based() -> Result<()> {
664        use crate::sam::builder::create_test_fasta;
665
666        let file = create_test_fasta(&[("chr1", "ACGTN")])?;
667        let reader = ReferenceReader::new(file.path())?;
668
669        // Position 1 should be first base 'A', not second
670        assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
671        assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
672
673        // fetch(1, 1) should return single base 'A'
674        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
675        assert_eq!(seq, b"A");
676
677        // fetch(1, 2) should return "AC"
678        let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(2)?)?;
679        assert_eq!(seq, b"AC");
680
681        Ok(())
682    }
683
684    #[test]
685    fn test_find_dict_path_replacing_convention() -> Result<()> {
686        // Test the fgbio/HTSJDK/Picard convention: ref.fa -> ref.dict
687        let temp_dir = tempfile::tempdir()?;
688        let fasta_path = temp_dir.path().join("ref.fa");
689        let dict_path = temp_dir.path().join("ref.dict");
690
691        // Create empty files
692        std::fs::write(&fasta_path, "")?;
693        std::fs::write(&dict_path, "")?;
694
695        let found = find_dict_path(&fasta_path);
696        assert!(found.is_some());
697        assert_eq!(found.unwrap(), dict_path);
698
699        Ok(())
700    }
701
702    #[test]
703    fn test_find_dict_path_appending_convention() -> Result<()> {
704        // Test the GATK convention: ref.fa -> ref.fa.dict
705        let temp_dir = tempfile::tempdir()?;
706        let fasta_path = temp_dir.path().join("ref.fa");
707        let dict_path = temp_dir.path().join("ref.fa.dict");
708
709        // Create empty files
710        std::fs::write(&fasta_path, "")?;
711        std::fs::write(&dict_path, "")?;
712
713        let found = find_dict_path(&fasta_path);
714        assert!(found.is_some());
715        assert_eq!(found.unwrap(), dict_path);
716
717        Ok(())
718    }
719
720    #[test]
721    fn test_find_dict_path_prefers_replacing_convention() -> Result<()> {
722        // When both exist, prefer the fgbio/HTSJDK convention (ref.dict)
723        let temp_dir = tempfile::tempdir()?;
724        let fasta_path = temp_dir.path().join("ref.fa");
725        let dict_replacing = temp_dir.path().join("ref.dict");
726        let dict_appending = temp_dir.path().join("ref.fa.dict");
727
728        // Create all files
729        std::fs::write(&fasta_path, "")?;
730        std::fs::write(&dict_replacing, "")?;
731        std::fs::write(&dict_appending, "")?;
732
733        let found = find_dict_path(&fasta_path);
734        assert!(found.is_some());
735        // Should find ref.dict first (fgbio/HTSJDK convention)
736        assert_eq!(found.unwrap(), dict_replacing);
737
738        Ok(())
739    }
740
741    #[test]
742    fn test_find_dict_path_not_found() {
743        // When no dict file exists, return None
744        let temp_dir = tempfile::tempdir().unwrap();
745        let fasta_path = temp_dir.path().join("ref.fa");
746
747        // Create only the FASTA file, no dict
748        std::fs::write(&fasta_path, "").unwrap();
749
750        let found = find_dict_path(&fasta_path);
751        assert!(found.is_none());
752    }
753
754    #[test]
755    fn test_find_dict_path_fasta_extension() -> Result<()> {
756        // Test with .fasta extension: ref.fasta -> ref.dict
757        let temp_dir = tempfile::tempdir()?;
758        let fasta_path = temp_dir.path().join("ref.fasta");
759        let dict_path = temp_dir.path().join("ref.dict");
760
761        std::fs::write(&fasta_path, "")?;
762        std::fs::write(&dict_path, "")?;
763
764        let found = find_dict_path(&fasta_path);
765        assert!(found.is_some());
766        assert_eq!(found.unwrap(), dict_path);
767
768        Ok(())
769    }
770
771    #[test]
772    fn test_find_dict_path_fasta_appending_convention() -> Result<()> {
773        // Test with .fasta extension: ref.fasta -> ref.fasta.dict
774        let temp_dir = tempfile::tempdir()?;
775        let fasta_path = temp_dir.path().join("ref.fasta");
776        let dict_path = temp_dir.path().join("ref.fasta.dict");
777
778        std::fs::write(&fasta_path, "")?;
779        std::fs::write(&dict_path, "")?;
780
781        let found = find_dict_path(&fasta_path);
782        assert!(found.is_some());
783        assert_eq!(found.unwrap(), dict_path);
784
785        Ok(())
786    }
787}