Skip to main content

ferro_hgvs/reference/
multi_fasta.rs

1//! Multi-FASTA reference sequence provider
2//!
3//! This module provides a reference provider that can load sequences from
4//! multiple FASTA files, useful for large reference datasets split across files.
5//!
6//! # Coordinate Systems
7//!
8//! This module handles coordinate conversions when loading sequences:
9//!
10//! | Source | Basis | Target | Notes |
11//! |--------|-------|--------|-------|
12//! | FASTA/FAI index | 0-based | Internal | Half-open `[start, end)` |
13//! | cdot genomic | 0-based | Transcript | `genome_start` is 0-based, converted to 1-based |
14//! | cdot tx | 1-based | Transcript | `tx_start`/`tx_end` used directly (already 1-based) |
15//! | cdot CDS | 0-based | Transcript | `cds_start` converted via `+ 1` to 1-based |
16//!
17//! **Important**: The `get_sequence()` method uses 0-based half-open coordinates
18//! consistent with FASTA index conventions.
19//!
20//! For type-safe coordinate handling, see [`crate::coords`].
21
22use std::collections::HashMap;
23use std::fs::File;
24use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
25use std::path::{Path, PathBuf};
26
27use log::warn;
28
29use crate::data::cdot::CdotMapper;
30use crate::error::FerroError;
31use crate::reference::provider::ReferenceProvider;
32use crate::reference::transcript::Transcript;
33
34/// Index entry for a sequence in a FASTA file
35#[derive(Debug, Clone)]
36struct FastaIndexEntry {
37    /// Path to the FASTA file containing this sequence
38    file_path: PathBuf,
39    /// Sequence name/accession
40    name: String,
41    /// Length of the sequence
42    length: u64,
43    /// Byte offset to the start of sequence data
44    offset: u64,
45    /// Number of bases per line
46    line_bases: u64,
47    /// Number of bytes per line (including newline)
48    line_bytes: u64,
49}
50
51/// Supplemental transcript info for a single transcript.
52#[derive(Debug, Clone)]
53pub struct SupplementalTranscriptInfo {
54    pub cds_start: Option<u64>,
55    pub cds_end: Option<u64>,
56    pub gene_symbol: Option<String>,
57    pub sequence_length: u64,
58}
59
60/// Supplemental transcript metadata for transcripts not in cdot.
61#[derive(Debug, Clone, Default)]
62pub struct SupplementalCdsInfo {
63    /// Mapping from accession to transcript info.
64    /// CDS coordinates are 1-based inclusive.
65    pub transcripts: HashMap<String, SupplementalTranscriptInfo>,
66}
67
68/// Multi-FASTA reference provider
69///
70/// Provides efficient random access to sequences across multiple FASTA files.
71/// Useful for reference datasets split across files (e.g., RefSeq transcripts).
72///
73/// # Example
74///
75/// ```ignore
76/// use ferro_hgvs::reference::multi_fasta::MultiFastaProvider;
77///
78/// let provider = MultiFastaProvider::from_directory("reference_data/transcripts")?;
79/// let seq = provider.get_sequence("NM_000546.6", 0, 100)?;
80/// ```
81pub struct MultiFastaProvider {
82    /// Index mapping sequence names to their locations
83    index: HashMap<String, FastaIndexEntry>,
84    /// Index mapping base accession (without version) to versioned accession
85    base_to_versioned: HashMap<String, String>,
86    /// Chromosome aliases (e.g., NC_000001 -> chr1)
87    aliases: HashMap<String, String>,
88    /// Optional cdot transcript metadata (CDS positions, exon coordinates)
89    cdot_mapper: Option<CdotMapper>,
90    /// Supplemental CDS info for transcripts not in cdot
91    supplemental_cds: SupplementalCdsInfo,
92}
93
94impl MultiFastaProvider {
95    /// Create a new multi-FASTA provider from a directory of FASTA files
96    ///
97    /// Scans the directory for .fna and .fa files with accompanying .fai indexes.
98    pub fn from_directory<P: AsRef<Path>>(dir: P) -> Result<Self, FerroError> {
99        let dir = dir.as_ref();
100
101        if !dir.exists() {
102            return Err(FerroError::Io {
103                msg: format!("Reference directory not found: {}", dir.display()),
104            });
105        }
106
107        let mut index = HashMap::new();
108        let mut base_to_versioned = HashMap::new();
109
110        // Find all FASTA files with indexes
111        let entries = std::fs::read_dir(dir).map_err(|e| FerroError::Io {
112            msg: format!("Failed to read directory {}: {}", dir.display(), e),
113        })?;
114
115        for entry in entries {
116            let entry = entry.map_err(|e| FerroError::Io {
117                msg: format!("Failed to read directory entry: {}", e),
118            })?;
119
120            let path = entry.path();
121            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
122
123            // Look for .fna, .fa, or .fasta files
124            if ext == "fna" || ext == "fa" || ext == "fasta" {
125                let fai_path = PathBuf::from(format!("{}.fai", path.display()));
126
127                if fai_path.exists() {
128                    // Load index for this file
129                    let file_index = load_fai_index(&path, &fai_path)?;
130
131                    for (name, entry) in file_index {
132                        // Map base accession (without version) to versioned
133                        if let Some(base) = name.split('.').next() {
134                            base_to_versioned.insert(base.to_string(), name.clone());
135                        }
136                        index.insert(name, entry);
137                    }
138                }
139            }
140        }
141
142        if index.is_empty() {
143            return Err(FerroError::Io {
144                msg: format!(
145                    "No indexed FASTA files found in {}. Run 'ferro-benchmark prepare' first.",
146                    dir.display()
147                ),
148            });
149        }
150
151        eprintln!(
152            "Loaded {} sequences from {} FASTA files",
153            index.len(),
154            count_unique_files(&index)
155        );
156
157        Ok(Self {
158            index,
159            base_to_versioned,
160            aliases: build_chromosome_aliases(),
161            cdot_mapper: None,
162            supplemental_cds: SupplementalCdsInfo::default(),
163        })
164    }
165
166    /// Create a provider from multiple directories (e.g., transcripts + genome)
167    pub fn from_directories<P: AsRef<Path>>(dirs: &[P]) -> Result<Self, FerroError> {
168        let mut combined_index = HashMap::new();
169        let mut base_to_versioned = HashMap::new();
170
171        for dir in dirs {
172            let dir = dir.as_ref();
173            if !dir.exists() {
174                continue;
175            }
176
177            let partial = Self::from_directory(dir)?;
178            combined_index.extend(partial.index);
179            base_to_versioned.extend(partial.base_to_versioned);
180        }
181
182        if combined_index.is_empty() {
183            return Err(FerroError::Io {
184                msg: "No indexed FASTA files found in any directory".to_string(),
185            });
186        }
187
188        Ok(Self {
189            index: combined_index,
190            base_to_versioned,
191            aliases: build_chromosome_aliases(),
192            cdot_mapper: None,
193            supplemental_cds: SupplementalCdsInfo::default(),
194        })
195    }
196
197    /// Create a provider from a manifest file
198    pub fn from_manifest<P: AsRef<Path>>(manifest_path: P) -> Result<Self, FerroError> {
199        let manifest_path = manifest_path.as_ref();
200
201        // Get the directory containing the manifest - paths are relative to this
202        let base_dir = manifest_path
203            .parent()
204            .unwrap_or(Path::new("."))
205            .to_path_buf();
206
207        // Helper to resolve a path relative to the manifest's directory
208        let resolve_path = |path_str: &str| -> PathBuf {
209            let path = PathBuf::from(path_str);
210            if path.is_absolute() {
211                path
212            } else {
213                base_dir.join(path)
214            }
215        };
216
217        let file = File::open(manifest_path).map_err(|e| FerroError::Io {
218            msg: format!("Failed to open manifest: {}", e),
219        })?;
220
221        let manifest: serde_json::Value =
222            serde_json::from_reader(file).map_err(|e| FerroError::Io {
223                msg: format!("Failed to parse manifest: {}", e),
224            })?;
225
226        let mut dirs = Vec::new();
227
228        // Get transcript directory (paths in manifest are relative to manifest location)
229        if let Some(fastas) = manifest.get("transcript_fastas").and_then(|v| v.as_array()) {
230            if let Some(first) = fastas.first().and_then(|v| v.as_str()) {
231                // Remove .gz extension if present
232                let path_str = first.strip_suffix(".gz").unwrap_or(first);
233                let path = resolve_path(path_str);
234                if let Some(transcript_dir) = path.parent() {
235                    dirs.push(transcript_dir.to_path_buf());
236                }
237            }
238        }
239
240        // Get genome directory (paths in manifest are relative to manifest location)
241        if let Some(genome) = manifest.get("genome_fasta").and_then(|v| v.as_str()) {
242            let path = resolve_path(genome);
243            if let Some(genome_dir) = path.parent() {
244                dirs.push(genome_dir.to_path_buf());
245            }
246        }
247
248        // Get RefSeqGene directory (NG_ accessions)
249        if let Some(fastas) = manifest.get("refseqgene_fastas").and_then(|v| v.as_array()) {
250            if let Some(first) = fastas.first().and_then(|v| v.as_str()) {
251                let path = resolve_path(first);
252                if let Some(refseqgene_dir) = path.parent() {
253                    if !dirs.contains(&refseqgene_dir.to_path_buf()) {
254                        dirs.push(refseqgene_dir.to_path_buf());
255                    }
256                }
257            }
258        }
259
260        // Get LRG directory (LRG_ accessions)
261        if let Some(fastas) = manifest.get("lrg_fastas").and_then(|v| v.as_array()) {
262            if let Some(first) = fastas.first().and_then(|v| v.as_str()) {
263                let path = resolve_path(first);
264                if let Some(lrg_dir) = path.parent() {
265                    if !dirs.contains(&lrg_dir.to_path_buf()) {
266                        dirs.push(lrg_dir.to_path_buf());
267                    }
268                }
269            }
270        }
271
272        // Get supplemental directory (missing ClinVar transcripts)
273        if let Some(supplemental) = manifest.get("supplemental_fasta").and_then(|v| v.as_str()) {
274            let path = resolve_path(supplemental);
275            if let Some(supplemental_dir) = path.parent() {
276                if !dirs.contains(&supplemental_dir.to_path_buf()) {
277                    dirs.push(supplemental_dir.to_path_buf());
278                }
279            }
280        }
281
282        if dirs.is_empty() {
283            return Err(FerroError::Io {
284                msg: "No FASTA directories found in manifest".to_string(),
285            });
286        }
287
288        let mut provider = Self::from_directories(&dirs)?;
289
290        // Load cdot transcript metadata if available
291        if let Some(cdot_path_str) = manifest.get("cdot_json").and_then(|v| v.as_str()) {
292            let cdot_path = resolve_path(cdot_path_str);
293            if cdot_path.exists() {
294                eprintln!(
295                    "Loading cdot transcript metadata from {}...",
296                    cdot_path.display()
297                );
298                match CdotMapper::from_json_file(&cdot_path) {
299                    Ok(mut mapper) => {
300                        eprintln!(
301                            "Loaded {} transcripts with CDS metadata",
302                            mapper.transcript_count()
303                        );
304
305                        // Load LRG to RefSeq mapping if available
306                        if let Some(lrg_mapping_path) =
307                            manifest.get("lrg_refseq_mapping").and_then(|v| v.as_str())
308                        {
309                            let lrg_path = resolve_path(lrg_mapping_path);
310                            if lrg_path.exists() {
311                                match mapper.load_lrg_mapping(&lrg_path) {
312                                    Ok(count) => {
313                                        eprintln!("Loaded {} LRG to RefSeq mappings", count);
314                                    }
315                                    Err(e) => {
316                                        eprintln!("Warning: Failed to load LRG mapping: {}", e);
317                                    }
318                                }
319                            }
320                        }
321
322                        provider.cdot_mapper = Some(mapper);
323                    }
324                    Err(e) => {
325                        eprintln!("Warning: Failed to load cdot data: {}", e);
326                    }
327                }
328            }
329        }
330
331        // Load supplemental metadata if available (for old/superseded transcripts)
332        if let Some(supplemental) = manifest.get("supplemental_fasta").and_then(|v| v.as_str()) {
333            let supplemental_path = resolve_path(supplemental);
334            // Metadata file is {fasta_basename}.metadata.json (e.g., clinvar_transcripts.metadata.json)
335            let metadata_path = supplemental_path.with_extension("metadata.json");
336            if metadata_path.exists() {
337                eprintln!(
338                    "Loading supplemental CDS metadata from {}...",
339                    metadata_path.display()
340                );
341                match std::fs::read_to_string(&metadata_path) {
342                    Ok(content) => match serde_json::from_str::<serde_json::Value>(&content) {
343                        Ok(metadata) => {
344                            if let Some(transcripts) =
345                                metadata.get("transcripts").and_then(|v| v.as_object())
346                            {
347                                for (accession, info) in transcripts {
348                                    let cds_start = info.get("cds_start").and_then(|v| v.as_u64());
349                                    let cds_end = info.get("cds_end").and_then(|v| v.as_u64());
350                                    let gene_symbol = info
351                                        .get("gene_symbol")
352                                        .and_then(|v| v.as_str())
353                                        .map(|s| s.to_string());
354                                    let sequence_length = info
355                                        .get("sequence_length")
356                                        .and_then(|v| v.as_u64())
357                                        .unwrap_or(0);
358                                    provider.supplemental_cds.transcripts.insert(
359                                        accession.clone(),
360                                        SupplementalTranscriptInfo {
361                                            cds_start,
362                                            cds_end,
363                                            gene_symbol,
364                                            sequence_length,
365                                        },
366                                    );
367                                }
368                                eprintln!(
369                                    "Loaded {} supplemental transcripts with CDS metadata",
370                                    provider.supplemental_cds.transcripts.len()
371                                );
372                            }
373                        }
374                        Err(e) => {
375                            eprintln!("Warning: Failed to parse supplemental metadata: {}", e);
376                        }
377                    },
378                    Err(e) => {
379                        eprintln!("Warning: Failed to read supplemental metadata: {}", e);
380                    }
381                }
382            }
383        }
384
385        Ok(provider)
386    }
387
388    /// Create a provider from a single FASTA file with cdot metadata
389    ///
390    /// This is the recommended method for normalizing intronic variants,
391    /// as it combines genomic sequence data (from FASTA) with transcript
392    /// metadata (from cdot) needed for coordinate conversions.
393    ///
394    /// # Arguments
395    ///
396    /// * `fasta_path` - Path to an indexed FASTA file (must have .fai index)
397    /// * `cdot_path` - Path to a cdot JSON file (can be gzipped)
398    ///
399    /// # Example
400    ///
401    /// ```ignore
402    /// let provider = MultiFastaProvider::with_cdot(
403    ///     "reference.fa",
404    ///     "cdot-0.2.32.refseq.GRCh38.json.gz",
405    /// )?;
406    /// ```
407    pub fn with_cdot<P: AsRef<Path>, Q: AsRef<Path>>(
408        fasta_path: P,
409        cdot_path: Q,
410    ) -> Result<Self, FerroError> {
411        let fasta_path = fasta_path.as_ref();
412        let cdot_path = cdot_path.as_ref();
413
414        // Load FASTA index
415        let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
416        if !fai_path.exists() {
417            return Err(FerroError::Io {
418                msg: format!(
419                    "FASTA index not found: {}. Run 'samtools faidx {}' to create it.",
420                    fai_path.display(),
421                    fasta_path.display()
422                ),
423            });
424        }
425
426        let index = load_fai_index(fasta_path, &fai_path)?;
427
428        // Build base to versioned mapping
429        let mut base_to_versioned = HashMap::new();
430        for name in index.keys() {
431            if let Some(base) = name.split('.').next() {
432                base_to_versioned.insert(base.to_string(), name.clone());
433            }
434        }
435
436        eprintln!("Loaded {} sequences from FASTA", index.len());
437
438        // Load cdot (auto-detect gzipped or plain JSON)
439        let cdot_mapper = if cdot_path.extension().map(|e| e == "gz").unwrap_or(false) {
440            CdotMapper::from_json_gz(cdot_path)?
441        } else {
442            CdotMapper::from_json_file(cdot_path)?
443        };
444        eprintln!(
445            "Loaded {} transcripts from cdot",
446            cdot_mapper.transcript_count()
447        );
448
449        Ok(Self {
450            index,
451            base_to_versioned,
452            aliases: build_chromosome_aliases(),
453            cdot_mapper: Some(cdot_mapper),
454            supplemental_cds: SupplementalCdsInfo::default(),
455        })
456    }
457
458    /// Resolve a sequence name, trying various forms
459    fn resolve_name(&self, name: &str) -> Option<String> {
460        // Direct lookup
461        if self.index.contains_key(name) {
462            return Some(name.to_string());
463        }
464
465        // Try chromosome alias FIRST (before version fallback)
466        // This ensures NC_000001.11 maps to chr1 rather than falling back to NC_000001.10
467        if let Some(aliased) = self.aliases.get(name) {
468            if self.index.contains_key(aliased) {
469                return Some(aliased.clone());
470            }
471        }
472
473        // Try adding/removing chr prefix
474        let alt_name = if name.starts_with("chr") {
475            name.strip_prefix("chr").unwrap().to_string()
476        } else {
477            format!("chr{}", name)
478        };
479
480        if self.index.contains_key(&alt_name) {
481            return Some(alt_name);
482        }
483
484        // Try without version (e.g., NM_000546 -> NM_000546.6)
485        // This is for transcript accessions, not chromosome accessions
486        if !name.contains('.') {
487            if let Some(versioned) = self.base_to_versioned.get(name) {
488                return Some(versioned.clone());
489            }
490        }
491
492        // Try with version stripped (fallback for transcripts with different versions)
493        // Skip for chromosome accessions that should have been resolved by aliases above
494        if let Some(base) = name.split('.').next() {
495            // Only use version fallback for transcript accessions, not chromosomes
496            // Chromosome accessions (NC_*) should use aliases, not version fallback
497            if !base.starts_with("NC_") {
498                if let Some(versioned) = self.base_to_versioned.get(base) {
499                    return Some(versioned.clone());
500                }
501            }
502        }
503
504        None
505    }
506
507    /// Get a sequence from the appropriate FASTA file
508    fn get_sequence_from_index(
509        &self,
510        entry: &FastaIndexEntry,
511        start: u64,
512        end: u64,
513    ) -> Result<String, FerroError> {
514        // Validate coordinates
515        if start >= entry.length {
516            return Err(FerroError::InvalidCoordinates {
517                msg: format!(
518                    "Start position {} exceeds sequence length {} for {}",
519                    start, entry.length, entry.name
520                ),
521            });
522        }
523
524        let actual_end = end.min(entry.length);
525        if start >= actual_end {
526            return Ok(String::new());
527        }
528
529        // Calculate file position
530        let line_start = start / entry.line_bases;
531        let byte_offset = start % entry.line_bases;
532        let file_offset = entry.offset + line_start * entry.line_bytes + byte_offset;
533
534        // Calculate how many bytes we need to read
535        let seq_len = actual_end - start;
536        let num_lines = (seq_len + byte_offset).div_ceil(entry.line_bases);
537        let bytes_to_read = seq_len + num_lines; // Extra for newlines
538
539        // Read from file
540        let mut file = File::open(&entry.file_path).map_err(|e| FerroError::Io {
541            msg: format!("Failed to open FASTA file: {}", e),
542        })?;
543
544        file.seek(SeekFrom::Start(file_offset))
545            .map_err(|e| FerroError::Io {
546                msg: format!("Failed to seek in FASTA file: {}", e),
547            })?;
548
549        let mut buffer = vec![0u8; bytes_to_read as usize];
550        file.read_exact(&mut buffer).map_err(|e| FerroError::Io {
551            msg: format!("Failed to read from FASTA file: {}", e),
552        })?;
553
554        // Filter out newlines and take exact length
555        let sequence: String = buffer
556            .iter()
557            .filter(|&&b| b != b'\n' && b != b'\r')
558            .take(seq_len as usize)
559            .map(|&b| b as char)
560            .collect();
561
562        Ok(sequence.to_uppercase())
563    }
564
565    /// Check if a sequence exists
566    pub fn has_sequence(&self, name: &str) -> bool {
567        self.resolve_name(name).is_some()
568    }
569
570    /// Get the length of a sequence
571    pub fn sequence_length(&self, name: &str) -> Option<u64> {
572        self.resolve_name(name)
573            .and_then(|n| self.index.get(&n))
574            .map(|e| e.length)
575    }
576
577    /// Get total number of sequences
578    pub fn sequence_count(&self) -> usize {
579        self.index.len()
580    }
581
582    /// Get CDS info from supplemental metadata for old/superseded transcripts.
583    /// Creates a synthetic single exon spanning the entire transcript since
584    /// mRNA transcripts are already spliced.
585    #[allow(clippy::type_complexity)]
586    fn get_supplemental_cds_info(
587        &self,
588        accession: &str,
589    ) -> (
590        Option<String>,
591        crate::reference::transcript::Strand,
592        Option<u64>,
593        Option<u64>,
594        Option<String>,
595        Vec<crate::reference::transcript::Exon>,
596        Option<u64>,
597        Option<u64>,
598    ) {
599        use crate::reference::transcript::{Exon, Strand};
600
601        if let Some(info) = self.supplemental_cds.transcripts.get(accession) {
602            // Create a synthetic single exon spanning the entire transcript.
603            // For mRNA transcripts, the entire sequence is exonic (already spliced).
604            let exons = if info.sequence_length > 0 {
605                vec![Exon {
606                    number: 1,
607                    start: 1,
608                    end: info.sequence_length,
609                    genomic_start: None,
610                    genomic_end: None,
611                }]
612            } else {
613                Vec::new()
614            };
615
616            (
617                info.gene_symbol.clone(),
618                Strand::Plus,   // Assume plus strand for supplemental transcripts
619                info.cds_start, // CDS coordinates are already 1-based from GenBank
620                info.cds_end,
621                None,  // No chromosome info
622                exons, // Synthetic single exon
623                None,
624                None,
625            )
626        } else {
627            (None, Strand::Plus, None, None, None, Vec::new(), None, None)
628        }
629    }
630}
631
632impl ReferenceProvider for MultiFastaProvider {
633    fn get_transcript(&self, id: &str) -> Result<Transcript, FerroError> {
634        use crate::reference::transcript::{Exon as TxExon, GenomeBuild, ManeStatus};
635        use std::sync::OnceLock;
636
637        if let Some(resolved) = self.resolve_name(id) {
638            if let Some(entry) = self.index.get(&resolved) {
639                // Get the full sequence
640                let sequence = self.get_sequence_from_index(entry, 0, entry.length)?;
641
642                // Try to get metadata from cdot
643                let (
644                    gene_symbol,
645                    strand,
646                    cds_start,
647                    cds_end,
648                    chromosome,
649                    exons,
650                    genomic_start,
651                    genomic_end,
652                ) = if let Some(ref cdot) = self.cdot_mapper {
653                    if let Some(tx) = cdot.get_transcript(&resolved) {
654                        // Convert cdot exons to transcript exons
655                        // cdot internal format: [genome_start, genome_end, tx_start, tx_end]
656                        // COORDINATE SYSTEMS:
657                        // - tx_start/tx_end: 1-based (use directly, no conversion needed)
658                        // - genome_start: 0-based, convert to 1-based by adding 1
659                        // - genome_end: 0-based exclusive = 1-based inclusive (no conversion)
660                        let exons: Vec<TxExon> = tx
661                            .exons
662                            .iter()
663                            .enumerate()
664                            .map(|(i, e)| TxExon {
665                                number: (i + 1) as u32,
666                                start: e[2], // tx_start: already 1-based
667                                end: e[3],   // tx_end: already 1-based
668                                genomic_start: Some(e[0] + 1), // genome_start: 0-based → 1-based
669                                genomic_end: Some(e[1]), // genome_end: 0-based excl = 1-based incl
670                            })
671                            .collect();
672
673                        // Validate exon continuity - for mRNA transcripts, tx coordinates
674                        // should be contiguous (no gaps). If there are gaps, the cdot data
675                        // may be corrupted, so fall back to supplemental data.
676                        let has_gaps = exons.windows(2).any(|w| w[1].start != w[0].end + 1);
677
678                        if has_gaps && self.supplemental_cds.transcripts.contains_key(&resolved) {
679                            // cdot has gaps, use supplemental data instead
680                            warn!(
681                                "cdot exon data for {} has gaps in transcript coordinates, \
682                                 using supplemental CDS metadata",
683                                resolved
684                            );
685                            self.get_supplemental_cds_info(&resolved)
686                        } else {
687                            // Compute transcript genomic bounds from exons
688                            let (genomic_start, genomic_end) = if !exons.is_empty() {
689                                let min_start = exons.iter().filter_map(|e| e.genomic_start).min();
690                                let max_end = exons.iter().filter_map(|e| e.genomic_end).max();
691                                (min_start, max_end)
692                            } else {
693                                (None, None)
694                            };
695
696                            (
697                                tx.gene_name.clone(),
698                                tx.strand,
699                                // CDS coordinates: cdot 0-based → transcript 1-based
700                                tx.cds_start.map(|s| s + 1), // 0-based → 1-based
701                                tx.cds_end, // 0-based exclusive = 1-based inclusive (no conversion)
702                                Some(tx.contig.clone()),
703                                exons,
704                                genomic_start,
705                                genomic_end,
706                            )
707                        }
708                    } else {
709                        // Check supplemental CDS for old/superseded transcripts
710                        if self.supplemental_cds.transcripts.contains_key(&resolved) {
711                            warn!(
712                                "{} not in cdot data, using supplemental CDS metadata",
713                                resolved
714                            );
715                        }
716                        self.get_supplemental_cds_info(&resolved)
717                    }
718                } else {
719                    // Check supplemental CDS for old/superseded transcripts (no cdot mapper)
720                    if self.supplemental_cds.transcripts.contains_key(&resolved) {
721                        warn!(
722                            "{} not in cdot data (no cdot mapper), using supplemental CDS metadata",
723                            resolved
724                        );
725                    }
726                    self.get_supplemental_cds_info(&resolved)
727                };
728
729                return Ok(Transcript {
730                    id: resolved,
731                    gene_symbol,
732                    strand,
733                    sequence,
734                    cds_start,
735                    cds_end,
736                    exons,
737                    chromosome,
738                    genomic_start,
739                    genomic_end,
740                    genome_build: GenomeBuild::GRCh38,
741                    mane_status: ManeStatus::default(),
742                    refseq_match: None,
743                    ensembl_match: None,
744                    cached_introns: OnceLock::new(),
745                });
746            }
747        }
748
749        Err(FerroError::ReferenceNotFound { id: id.to_string() })
750    }
751
752    fn get_sequence(&self, id: &str, start: u64, end: u64) -> Result<String, FerroError> {
753        let resolved = self
754            .resolve_name(id)
755            .ok_or_else(|| FerroError::ReferenceNotFound { id: id.to_string() })?;
756
757        let entry = self
758            .index
759            .get(&resolved)
760            .ok_or_else(|| FerroError::ReferenceNotFound { id: id.to_string() })?;
761
762        self.get_sequence_from_index(entry, start, end)
763    }
764
765    fn get_genomic_sequence(
766        &self,
767        contig: &str,
768        start: u64,
769        end: u64,
770    ) -> Result<String, FerroError> {
771        // Try to find the contig in the FASTA index
772        // First try exact match, then try with/without version
773        let resolved =
774            self.resolve_name(contig)
775                .ok_or_else(|| FerroError::GenomicReferenceNotAvailable {
776                    contig: contig.to_string(),
777                    start,
778                    end,
779                })?;
780
781        let entry =
782            self.index
783                .get(&resolved)
784                .ok_or_else(|| FerroError::GenomicReferenceNotAvailable {
785                    contig: contig.to_string(),
786                    start,
787                    end,
788                })?;
789
790        self.get_sequence_from_index(entry, start, end)
791    }
792
793    fn has_genomic_data(&self) -> bool {
794        // Check if we have chromosome/contig sequences in the index
795        // These may be named as NC_* (NCBI RefSeq) or chr* (UCSC style)
796        self.index
797            .keys()
798            .any(|k| k.starts_with("NC_") || k.starts_with("chr"))
799    }
800}
801
802/// Load a FASTA index (.fai) file
803///
804/// Note: This function canonicalizes the FASTA path to resolve symlinks and
805/// provide absolute paths, which helps prevent path traversal issues.
806fn load_fai_index<P: AsRef<Path>>(
807    fasta_path: P,
808    fai_path: P,
809) -> Result<HashMap<String, FastaIndexEntry>, FerroError> {
810    // Canonicalize the FASTA path to resolve symlinks and get absolute path
811    // This helps prevent path traversal vulnerabilities
812    let fasta_path = fasta_path
813        .as_ref()
814        .canonicalize()
815        .map_err(|e| FerroError::Io {
816            msg: format!(
817                "Failed to canonicalize FASTA path '{}': {}",
818                fasta_path.as_ref().display(),
819                e
820            ),
821        })?;
822    let fai_path = fai_path.as_ref();
823
824    let file = File::open(fai_path).map_err(|e| FerroError::Io {
825        msg: format!("Failed to open FAI file: {}", e),
826    })?;
827    let reader = BufReader::new(file);
828
829    let mut index = HashMap::new();
830
831    for line in reader.lines() {
832        let line = line.map_err(|e| FerroError::Io {
833            msg: format!("Failed to read FAI line: {}", e),
834        })?;
835
836        let fields: Vec<&str> = line.split('\t').collect();
837        if fields.len() < 5 {
838            continue;
839        }
840
841        // Parse index values with proper error handling instead of silent defaults
842        let name = fields[0].to_string();
843        let length: u64 = fields[1].parse().map_err(|_| FerroError::Io {
844            msg: format!(
845                "Invalid length '{}' in FAI for sequence '{}'",
846                fields[1], name
847            ),
848        })?;
849        let offset: u64 = fields[2].parse().map_err(|_| FerroError::Io {
850            msg: format!(
851                "Invalid offset '{}' in FAI for sequence '{}'",
852                fields[2], name
853            ),
854        })?;
855        let line_bases: u64 = fields[3].parse().map_err(|_| FerroError::Io {
856            msg: format!(
857                "Invalid line_bases '{}' in FAI for sequence '{}'",
858                fields[3], name
859            ),
860        })?;
861        let line_bytes: u64 = fields[4].parse().map_err(|_| FerroError::Io {
862            msg: format!(
863                "Invalid line_bytes '{}' in FAI for sequence '{}'",
864                fields[4], name
865            ),
866        })?;
867
868        // Validate that critical fields are non-zero to prevent divide-by-zero
869        if line_bases == 0 || line_bytes == 0 {
870            return Err(FerroError::Io {
871                msg: format!(
872                    "Invalid FAI entry for '{}': line_bases={}, line_bytes={} (must be > 0)",
873                    name, line_bases, line_bytes
874                ),
875            });
876        }
877
878        let entry = FastaIndexEntry {
879            file_path: fasta_path.clone(),
880            name: name.clone(),
881            length,
882            offset,
883            line_bases,
884            line_bytes,
885        };
886
887        index.insert(name, entry);
888    }
889
890    Ok(index)
891}
892
893/// Count unique files in the index
894fn count_unique_files(index: &HashMap<String, FastaIndexEntry>) -> usize {
895    let files: std::collections::HashSet<_> = index.values().map(|e| &e.file_path).collect();
896    files.len()
897}
898
899/// Build chromosome name aliases
900fn build_chromosome_aliases() -> HashMap<String, String> {
901    let mut aliases = HashMap::new();
902
903    // RefSeq to UCSC chromosome names (for genome)
904    let refseq_chroms = [
905        ("NC_000001.11", "chr1"),
906        ("NC_000002.12", "chr2"),
907        ("NC_000003.12", "chr3"),
908        ("NC_000004.12", "chr4"),
909        ("NC_000005.10", "chr5"),
910        ("NC_000006.12", "chr6"),
911        ("NC_000007.14", "chr7"),
912        ("NC_000008.11", "chr8"),
913        ("NC_000009.12", "chr9"),
914        ("NC_000010.11", "chr10"),
915        ("NC_000011.10", "chr11"),
916        ("NC_000012.12", "chr12"),
917        ("NC_000013.11", "chr13"),
918        ("NC_000014.9", "chr14"),
919        ("NC_000015.10", "chr15"),
920        ("NC_000016.10", "chr16"),
921        ("NC_000017.11", "chr17"),
922        ("NC_000018.10", "chr18"),
923        ("NC_000019.10", "chr19"),
924        ("NC_000020.11", "chr20"),
925        ("NC_000021.9", "chr21"),
926        ("NC_000022.11", "chr22"),
927        ("NC_000023.11", "chrX"),
928        ("NC_000024.10", "chrY"),
929        ("NC_012920.1", "chrM"),
930    ];
931
932    for (refseq, ucsc) in &refseq_chroms {
933        aliases.insert(refseq.to_string(), ucsc.to_string());
934        // Also map base accession (without version)
935        if let Some(base) = refseq.split('.').next() {
936            aliases.insert(base.to_string(), ucsc.to_string());
937        }
938    }
939
940    // Also map bare chromosome names
941    for i in 1..=22 {
942        aliases.insert(i.to_string(), format!("chr{}", i));
943    }
944    aliases.insert("X".to_string(), "chrX".to_string());
945    aliases.insert("Y".to_string(), "chrY".to_string());
946    aliases.insert("M".to_string(), "chrM".to_string());
947    aliases.insert("MT".to_string(), "chrM".to_string());
948
949    aliases
950}
951
952#[cfg(test)]
953mod tests {
954    use super::*;
955    use std::io::Write;
956    use tempfile::tempdir;
957
958    #[test]
959    fn test_load_fai_index() {
960        let dir = tempdir().unwrap();
961        let fasta_path = dir.path().join("test.fna");
962        let fai_path = dir.path().join("test.fna.fai");
963
964        // Create FASTA file
965        let mut fasta_file = File::create(&fasta_path).unwrap();
966        writeln!(fasta_file, ">NM_000001.1").unwrap();
967        writeln!(fasta_file, "ATGCATGCAT").unwrap();
968
969        // Create FAI file
970        let mut fai_file = File::create(&fai_path).unwrap();
971        writeln!(fai_file, "NM_000001.1\t10\t13\t10\t11").unwrap();
972
973        let index = load_fai_index(&fasta_path, &fai_path).unwrap();
974
975        assert!(index.contains_key("NM_000001.1"));
976        let entry = index.get("NM_000001.1").unwrap();
977        assert_eq!(entry.length, 10);
978        assert_eq!(entry.offset, 13);
979    }
980
981    #[test]
982    fn test_multi_fasta_provider_from_directory() {
983        let dir = tempdir().unwrap();
984
985        // Create FASTA file
986        let fasta_path = dir.path().join("test.fna");
987        let fai_path = dir.path().join("test.fna.fai");
988
989        let mut fasta_file = File::create(&fasta_path).unwrap();
990        writeln!(fasta_file, ">NM_000001.1").unwrap();
991        writeln!(fasta_file, "ATGCATGCAT").unwrap();
992
993        let mut fai_file = File::create(&fai_path).unwrap();
994        writeln!(fai_file, "NM_000001.1\t10\t13\t10\t11").unwrap();
995
996        let provider = MultiFastaProvider::from_directory(dir.path()).unwrap();
997
998        assert!(provider.has_sequence("NM_000001.1"));
999        assert!(provider.has_sequence("NM_000001")); // Without version
1000        assert_eq!(provider.sequence_length("NM_000001.1"), Some(10));
1001    }
1002
1003    #[test]
1004    fn test_get_sequence() {
1005        let dir = tempdir().unwrap();
1006
1007        let fasta_path = dir.path().join("test.fna");
1008        let fai_path = dir.path().join("test.fna.fai");
1009
1010        let mut fasta_file = File::create(&fasta_path).unwrap();
1011        writeln!(fasta_file, ">NM_000001.1").unwrap();
1012        writeln!(fasta_file, "ATGCATGCAT").unwrap();
1013
1014        let mut fai_file = File::create(&fai_path).unwrap();
1015        writeln!(fai_file, "NM_000001.1\t10\t13\t10\t11").unwrap();
1016
1017        let provider = MultiFastaProvider::from_directory(dir.path()).unwrap();
1018
1019        let seq = provider.get_sequence("NM_000001.1", 0, 4).unwrap();
1020        assert_eq!(seq, "ATGC");
1021
1022        let seq = provider.get_sequence("NM_000001", 0, 4).unwrap();
1023        assert_eq!(seq, "ATGC");
1024    }
1025
1026    #[test]
1027    fn test_chromosome_aliases() {
1028        let aliases = build_chromosome_aliases();
1029
1030        assert_eq!(aliases.get("NC_000001.11"), Some(&"chr1".to_string()));
1031        assert_eq!(aliases.get("NC_000001"), Some(&"chr1".to_string()));
1032        assert_eq!(aliases.get("1"), Some(&"chr1".to_string()));
1033        assert_eq!(aliases.get("X"), Some(&"chrX".to_string()));
1034    }
1035}