1use std::collections::HashMap;
23use std::fs::File;
24use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
25use std::path::{Path, PathBuf};
26
27use log::warn;
28
29use crate::data::cdot::CdotMapper;
30use crate::error::FerroError;
31use crate::reference::provider::ReferenceProvider;
32use crate::reference::transcript::Transcript;
33
34#[derive(Debug, Clone)]
36struct FastaIndexEntry {
37 file_path: PathBuf,
39 name: String,
41 length: u64,
43 offset: u64,
45 line_bases: u64,
47 line_bytes: u64,
49}
50
51#[derive(Debug, Clone)]
53pub struct SupplementalTranscriptInfo {
54 pub cds_start: Option<u64>,
55 pub cds_end: Option<u64>,
56 pub gene_symbol: Option<String>,
57 pub sequence_length: u64,
58}
59
60#[derive(Debug, Clone, Default)]
62pub struct SupplementalCdsInfo {
63 pub transcripts: HashMap<String, SupplementalTranscriptInfo>,
66}
67
68pub struct MultiFastaProvider {
82 index: HashMap<String, FastaIndexEntry>,
84 base_to_versioned: HashMap<String, String>,
86 aliases: HashMap<String, String>,
88 cdot_mapper: Option<CdotMapper>,
90 supplemental_cds: SupplementalCdsInfo,
92}
93
94impl MultiFastaProvider {
95 pub fn from_directory<P: AsRef<Path>>(dir: P) -> Result<Self, FerroError> {
99 let dir = dir.as_ref();
100
101 if !dir.exists() {
102 return Err(FerroError::Io {
103 msg: format!("Reference directory not found: {}", dir.display()),
104 });
105 }
106
107 let mut index = HashMap::new();
108 let mut base_to_versioned = HashMap::new();
109
110 let entries = std::fs::read_dir(dir).map_err(|e| FerroError::Io {
112 msg: format!("Failed to read directory {}: {}", dir.display(), e),
113 })?;
114
115 for entry in entries {
116 let entry = entry.map_err(|e| FerroError::Io {
117 msg: format!("Failed to read directory entry: {}", e),
118 })?;
119
120 let path = entry.path();
121 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
122
123 if ext == "fna" || ext == "fa" || ext == "fasta" {
125 let fai_path = PathBuf::from(format!("{}.fai", path.display()));
126
127 if fai_path.exists() {
128 let file_index = load_fai_index(&path, &fai_path)?;
130
131 for (name, entry) in file_index {
132 if let Some(base) = name.split('.').next() {
134 base_to_versioned.insert(base.to_string(), name.clone());
135 }
136 index.insert(name, entry);
137 }
138 }
139 }
140 }
141
142 if index.is_empty() {
143 return Err(FerroError::Io {
144 msg: format!(
145 "No indexed FASTA files found in {}. Run 'ferro-benchmark prepare' first.",
146 dir.display()
147 ),
148 });
149 }
150
151 eprintln!(
152 "Loaded {} sequences from {} FASTA files",
153 index.len(),
154 count_unique_files(&index)
155 );
156
157 Ok(Self {
158 index,
159 base_to_versioned,
160 aliases: build_chromosome_aliases(),
161 cdot_mapper: None,
162 supplemental_cds: SupplementalCdsInfo::default(),
163 })
164 }
165
166 pub fn from_directories<P: AsRef<Path>>(dirs: &[P]) -> Result<Self, FerroError> {
168 let mut combined_index = HashMap::new();
169 let mut base_to_versioned = HashMap::new();
170
171 for dir in dirs {
172 let dir = dir.as_ref();
173 if !dir.exists() {
174 continue;
175 }
176
177 let partial = Self::from_directory(dir)?;
178 combined_index.extend(partial.index);
179 base_to_versioned.extend(partial.base_to_versioned);
180 }
181
182 if combined_index.is_empty() {
183 return Err(FerroError::Io {
184 msg: "No indexed FASTA files found in any directory".to_string(),
185 });
186 }
187
188 Ok(Self {
189 index: combined_index,
190 base_to_versioned,
191 aliases: build_chromosome_aliases(),
192 cdot_mapper: None,
193 supplemental_cds: SupplementalCdsInfo::default(),
194 })
195 }
196
197 pub fn from_manifest<P: AsRef<Path>>(manifest_path: P) -> Result<Self, FerroError> {
199 let manifest_path = manifest_path.as_ref();
200
201 let base_dir = manifest_path
203 .parent()
204 .unwrap_or(Path::new("."))
205 .to_path_buf();
206
207 let resolve_path = |path_str: &str| -> PathBuf {
209 let path = PathBuf::from(path_str);
210 if path.is_absolute() {
211 path
212 } else {
213 base_dir.join(path)
214 }
215 };
216
217 let file = File::open(manifest_path).map_err(|e| FerroError::Io {
218 msg: format!("Failed to open manifest: {}", e),
219 })?;
220
221 let manifest: serde_json::Value =
222 serde_json::from_reader(file).map_err(|e| FerroError::Io {
223 msg: format!("Failed to parse manifest: {}", e),
224 })?;
225
226 let mut dirs = Vec::new();
227
228 if let Some(fastas) = manifest.get("transcript_fastas").and_then(|v| v.as_array()) {
230 if let Some(first) = fastas.first().and_then(|v| v.as_str()) {
231 let path_str = first.strip_suffix(".gz").unwrap_or(first);
233 let path = resolve_path(path_str);
234 if let Some(transcript_dir) = path.parent() {
235 dirs.push(transcript_dir.to_path_buf());
236 }
237 }
238 }
239
240 if let Some(genome) = manifest.get("genome_fasta").and_then(|v| v.as_str()) {
242 let path = resolve_path(genome);
243 if let Some(genome_dir) = path.parent() {
244 dirs.push(genome_dir.to_path_buf());
245 }
246 }
247
248 if let Some(fastas) = manifest.get("refseqgene_fastas").and_then(|v| v.as_array()) {
250 if let Some(first) = fastas.first().and_then(|v| v.as_str()) {
251 let path = resolve_path(first);
252 if let Some(refseqgene_dir) = path.parent() {
253 if !dirs.contains(&refseqgene_dir.to_path_buf()) {
254 dirs.push(refseqgene_dir.to_path_buf());
255 }
256 }
257 }
258 }
259
260 if let Some(fastas) = manifest.get("lrg_fastas").and_then(|v| v.as_array()) {
262 if let Some(first) = fastas.first().and_then(|v| v.as_str()) {
263 let path = resolve_path(first);
264 if let Some(lrg_dir) = path.parent() {
265 if !dirs.contains(&lrg_dir.to_path_buf()) {
266 dirs.push(lrg_dir.to_path_buf());
267 }
268 }
269 }
270 }
271
272 if let Some(supplemental) = manifest.get("supplemental_fasta").and_then(|v| v.as_str()) {
274 let path = resolve_path(supplemental);
275 if let Some(supplemental_dir) = path.parent() {
276 if !dirs.contains(&supplemental_dir.to_path_buf()) {
277 dirs.push(supplemental_dir.to_path_buf());
278 }
279 }
280 }
281
282 if dirs.is_empty() {
283 return Err(FerroError::Io {
284 msg: "No FASTA directories found in manifest".to_string(),
285 });
286 }
287
288 let mut provider = Self::from_directories(&dirs)?;
289
290 if let Some(cdot_path_str) = manifest.get("cdot_json").and_then(|v| v.as_str()) {
292 let cdot_path = resolve_path(cdot_path_str);
293 if cdot_path.exists() {
294 eprintln!(
295 "Loading cdot transcript metadata from {}...",
296 cdot_path.display()
297 );
298 match CdotMapper::from_json_file(&cdot_path) {
299 Ok(mut mapper) => {
300 eprintln!(
301 "Loaded {} transcripts with CDS metadata",
302 mapper.transcript_count()
303 );
304
305 if let Some(lrg_mapping_path) =
307 manifest.get("lrg_refseq_mapping").and_then(|v| v.as_str())
308 {
309 let lrg_path = resolve_path(lrg_mapping_path);
310 if lrg_path.exists() {
311 match mapper.load_lrg_mapping(&lrg_path) {
312 Ok(count) => {
313 eprintln!("Loaded {} LRG to RefSeq mappings", count);
314 }
315 Err(e) => {
316 eprintln!("Warning: Failed to load LRG mapping: {}", e);
317 }
318 }
319 }
320 }
321
322 provider.cdot_mapper = Some(mapper);
323 }
324 Err(e) => {
325 eprintln!("Warning: Failed to load cdot data: {}", e);
326 }
327 }
328 }
329 }
330
331 if let Some(supplemental) = manifest.get("supplemental_fasta").and_then(|v| v.as_str()) {
333 let supplemental_path = resolve_path(supplemental);
334 let metadata_path = supplemental_path.with_extension("metadata.json");
336 if metadata_path.exists() {
337 eprintln!(
338 "Loading supplemental CDS metadata from {}...",
339 metadata_path.display()
340 );
341 match std::fs::read_to_string(&metadata_path) {
342 Ok(content) => match serde_json::from_str::<serde_json::Value>(&content) {
343 Ok(metadata) => {
344 if let Some(transcripts) =
345 metadata.get("transcripts").and_then(|v| v.as_object())
346 {
347 for (accession, info) in transcripts {
348 let cds_start = info.get("cds_start").and_then(|v| v.as_u64());
349 let cds_end = info.get("cds_end").and_then(|v| v.as_u64());
350 let gene_symbol = info
351 .get("gene_symbol")
352 .and_then(|v| v.as_str())
353 .map(|s| s.to_string());
354 let sequence_length = info
355 .get("sequence_length")
356 .and_then(|v| v.as_u64())
357 .unwrap_or(0);
358 provider.supplemental_cds.transcripts.insert(
359 accession.clone(),
360 SupplementalTranscriptInfo {
361 cds_start,
362 cds_end,
363 gene_symbol,
364 sequence_length,
365 },
366 );
367 }
368 eprintln!(
369 "Loaded {} supplemental transcripts with CDS metadata",
370 provider.supplemental_cds.transcripts.len()
371 );
372 }
373 }
374 Err(e) => {
375 eprintln!("Warning: Failed to parse supplemental metadata: {}", e);
376 }
377 },
378 Err(e) => {
379 eprintln!("Warning: Failed to read supplemental metadata: {}", e);
380 }
381 }
382 }
383 }
384
385 Ok(provider)
386 }
387
388 pub fn with_cdot<P: AsRef<Path>, Q: AsRef<Path>>(
408 fasta_path: P,
409 cdot_path: Q,
410 ) -> Result<Self, FerroError> {
411 let fasta_path = fasta_path.as_ref();
412 let cdot_path = cdot_path.as_ref();
413
414 let fai_path = PathBuf::from(format!("{}.fai", fasta_path.display()));
416 if !fai_path.exists() {
417 return Err(FerroError::Io {
418 msg: format!(
419 "FASTA index not found: {}. Run 'samtools faidx {}' to create it.",
420 fai_path.display(),
421 fasta_path.display()
422 ),
423 });
424 }
425
426 let index = load_fai_index(fasta_path, &fai_path)?;
427
428 let mut base_to_versioned = HashMap::new();
430 for name in index.keys() {
431 if let Some(base) = name.split('.').next() {
432 base_to_versioned.insert(base.to_string(), name.clone());
433 }
434 }
435
436 eprintln!("Loaded {} sequences from FASTA", index.len());
437
438 let cdot_mapper = if cdot_path.extension().map(|e| e == "gz").unwrap_or(false) {
440 CdotMapper::from_json_gz(cdot_path)?
441 } else {
442 CdotMapper::from_json_file(cdot_path)?
443 };
444 eprintln!(
445 "Loaded {} transcripts from cdot",
446 cdot_mapper.transcript_count()
447 );
448
449 Ok(Self {
450 index,
451 base_to_versioned,
452 aliases: build_chromosome_aliases(),
453 cdot_mapper: Some(cdot_mapper),
454 supplemental_cds: SupplementalCdsInfo::default(),
455 })
456 }
457
458 fn resolve_name(&self, name: &str) -> Option<String> {
460 if self.index.contains_key(name) {
462 return Some(name.to_string());
463 }
464
465 if let Some(aliased) = self.aliases.get(name) {
468 if self.index.contains_key(aliased) {
469 return Some(aliased.clone());
470 }
471 }
472
473 let alt_name = if name.starts_with("chr") {
475 name.strip_prefix("chr").unwrap().to_string()
476 } else {
477 format!("chr{}", name)
478 };
479
480 if self.index.contains_key(&alt_name) {
481 return Some(alt_name);
482 }
483
484 if !name.contains('.') {
487 if let Some(versioned) = self.base_to_versioned.get(name) {
488 return Some(versioned.clone());
489 }
490 }
491
492 if let Some(base) = name.split('.').next() {
495 if !base.starts_with("NC_") {
498 if let Some(versioned) = self.base_to_versioned.get(base) {
499 return Some(versioned.clone());
500 }
501 }
502 }
503
504 None
505 }
506
507 fn get_sequence_from_index(
509 &self,
510 entry: &FastaIndexEntry,
511 start: u64,
512 end: u64,
513 ) -> Result<String, FerroError> {
514 if start >= entry.length {
516 return Err(FerroError::InvalidCoordinates {
517 msg: format!(
518 "Start position {} exceeds sequence length {} for {}",
519 start, entry.length, entry.name
520 ),
521 });
522 }
523
524 let actual_end = end.min(entry.length);
525 if start >= actual_end {
526 return Ok(String::new());
527 }
528
529 let line_start = start / entry.line_bases;
531 let byte_offset = start % entry.line_bases;
532 let file_offset = entry.offset + line_start * entry.line_bytes + byte_offset;
533
534 let seq_len = actual_end - start;
536 let num_lines = (seq_len + byte_offset).div_ceil(entry.line_bases);
537 let bytes_to_read = seq_len + num_lines; let mut file = File::open(&entry.file_path).map_err(|e| FerroError::Io {
541 msg: format!("Failed to open FASTA file: {}", e),
542 })?;
543
544 file.seek(SeekFrom::Start(file_offset))
545 .map_err(|e| FerroError::Io {
546 msg: format!("Failed to seek in FASTA file: {}", e),
547 })?;
548
549 let mut buffer = vec![0u8; bytes_to_read as usize];
550 file.read_exact(&mut buffer).map_err(|e| FerroError::Io {
551 msg: format!("Failed to read from FASTA file: {}", e),
552 })?;
553
554 let sequence: String = buffer
556 .iter()
557 .filter(|&&b| b != b'\n' && b != b'\r')
558 .take(seq_len as usize)
559 .map(|&b| b as char)
560 .collect();
561
562 Ok(sequence.to_uppercase())
563 }
564
565 pub fn has_sequence(&self, name: &str) -> bool {
567 self.resolve_name(name).is_some()
568 }
569
570 pub fn sequence_length(&self, name: &str) -> Option<u64> {
572 self.resolve_name(name)
573 .and_then(|n| self.index.get(&n))
574 .map(|e| e.length)
575 }
576
577 pub fn sequence_count(&self) -> usize {
579 self.index.len()
580 }
581
582 #[allow(clippy::type_complexity)]
586 fn get_supplemental_cds_info(
587 &self,
588 accession: &str,
589 ) -> (
590 Option<String>,
591 crate::reference::transcript::Strand,
592 Option<u64>,
593 Option<u64>,
594 Option<String>,
595 Vec<crate::reference::transcript::Exon>,
596 Option<u64>,
597 Option<u64>,
598 ) {
599 use crate::reference::transcript::{Exon, Strand};
600
601 if let Some(info) = self.supplemental_cds.transcripts.get(accession) {
602 let exons = if info.sequence_length > 0 {
605 vec![Exon {
606 number: 1,
607 start: 1,
608 end: info.sequence_length,
609 genomic_start: None,
610 genomic_end: None,
611 }]
612 } else {
613 Vec::new()
614 };
615
616 (
617 info.gene_symbol.clone(),
618 Strand::Plus, info.cds_start, info.cds_end,
621 None, exons, None,
624 None,
625 )
626 } else {
627 (None, Strand::Plus, None, None, None, Vec::new(), None, None)
628 }
629 }
630}
631
632impl ReferenceProvider for MultiFastaProvider {
633 fn get_transcript(&self, id: &str) -> Result<Transcript, FerroError> {
634 use crate::reference::transcript::{Exon as TxExon, GenomeBuild, ManeStatus};
635 use std::sync::OnceLock;
636
637 if let Some(resolved) = self.resolve_name(id) {
638 if let Some(entry) = self.index.get(&resolved) {
639 let sequence = self.get_sequence_from_index(entry, 0, entry.length)?;
641
642 let (
644 gene_symbol,
645 strand,
646 cds_start,
647 cds_end,
648 chromosome,
649 exons,
650 genomic_start,
651 genomic_end,
652 ) = if let Some(ref cdot) = self.cdot_mapper {
653 if let Some(tx) = cdot.get_transcript(&resolved) {
654 let exons: Vec<TxExon> = tx
661 .exons
662 .iter()
663 .enumerate()
664 .map(|(i, e)| TxExon {
665 number: (i + 1) as u32,
666 start: e[2], end: e[3], genomic_start: Some(e[0] + 1), genomic_end: Some(e[1]), })
671 .collect();
672
673 let has_gaps = exons.windows(2).any(|w| w[1].start != w[0].end + 1);
677
678 if has_gaps && self.supplemental_cds.transcripts.contains_key(&resolved) {
679 warn!(
681 "cdot exon data for {} has gaps in transcript coordinates, \
682 using supplemental CDS metadata",
683 resolved
684 );
685 self.get_supplemental_cds_info(&resolved)
686 } else {
687 let (genomic_start, genomic_end) = if !exons.is_empty() {
689 let min_start = exons.iter().filter_map(|e| e.genomic_start).min();
690 let max_end = exons.iter().filter_map(|e| e.genomic_end).max();
691 (min_start, max_end)
692 } else {
693 (None, None)
694 };
695
696 (
697 tx.gene_name.clone(),
698 tx.strand,
699 tx.cds_start.map(|s| s + 1), tx.cds_end, Some(tx.contig.clone()),
703 exons,
704 genomic_start,
705 genomic_end,
706 )
707 }
708 } else {
709 if self.supplemental_cds.transcripts.contains_key(&resolved) {
711 warn!(
712 "{} not in cdot data, using supplemental CDS metadata",
713 resolved
714 );
715 }
716 self.get_supplemental_cds_info(&resolved)
717 }
718 } else {
719 if self.supplemental_cds.transcripts.contains_key(&resolved) {
721 warn!(
722 "{} not in cdot data (no cdot mapper), using supplemental CDS metadata",
723 resolved
724 );
725 }
726 self.get_supplemental_cds_info(&resolved)
727 };
728
729 return Ok(Transcript {
730 id: resolved,
731 gene_symbol,
732 strand,
733 sequence,
734 cds_start,
735 cds_end,
736 exons,
737 chromosome,
738 genomic_start,
739 genomic_end,
740 genome_build: GenomeBuild::GRCh38,
741 mane_status: ManeStatus::default(),
742 refseq_match: None,
743 ensembl_match: None,
744 cached_introns: OnceLock::new(),
745 });
746 }
747 }
748
749 Err(FerroError::ReferenceNotFound { id: id.to_string() })
750 }
751
752 fn get_sequence(&self, id: &str, start: u64, end: u64) -> Result<String, FerroError> {
753 let resolved = self
754 .resolve_name(id)
755 .ok_or_else(|| FerroError::ReferenceNotFound { id: id.to_string() })?;
756
757 let entry = self
758 .index
759 .get(&resolved)
760 .ok_or_else(|| FerroError::ReferenceNotFound { id: id.to_string() })?;
761
762 self.get_sequence_from_index(entry, start, end)
763 }
764
765 fn get_genomic_sequence(
766 &self,
767 contig: &str,
768 start: u64,
769 end: u64,
770 ) -> Result<String, FerroError> {
771 let resolved =
774 self.resolve_name(contig)
775 .ok_or_else(|| FerroError::GenomicReferenceNotAvailable {
776 contig: contig.to_string(),
777 start,
778 end,
779 })?;
780
781 let entry =
782 self.index
783 .get(&resolved)
784 .ok_or_else(|| FerroError::GenomicReferenceNotAvailable {
785 contig: contig.to_string(),
786 start,
787 end,
788 })?;
789
790 self.get_sequence_from_index(entry, start, end)
791 }
792
793 fn has_genomic_data(&self) -> bool {
794 self.index
797 .keys()
798 .any(|k| k.starts_with("NC_") || k.starts_with("chr"))
799 }
800}
801
802fn load_fai_index<P: AsRef<Path>>(
807 fasta_path: P,
808 fai_path: P,
809) -> Result<HashMap<String, FastaIndexEntry>, FerroError> {
810 let fasta_path = fasta_path
813 .as_ref()
814 .canonicalize()
815 .map_err(|e| FerroError::Io {
816 msg: format!(
817 "Failed to canonicalize FASTA path '{}': {}",
818 fasta_path.as_ref().display(),
819 e
820 ),
821 })?;
822 let fai_path = fai_path.as_ref();
823
824 let file = File::open(fai_path).map_err(|e| FerroError::Io {
825 msg: format!("Failed to open FAI file: {}", e),
826 })?;
827 let reader = BufReader::new(file);
828
829 let mut index = HashMap::new();
830
831 for line in reader.lines() {
832 let line = line.map_err(|e| FerroError::Io {
833 msg: format!("Failed to read FAI line: {}", e),
834 })?;
835
836 let fields: Vec<&str> = line.split('\t').collect();
837 if fields.len() < 5 {
838 continue;
839 }
840
841 let name = fields[0].to_string();
843 let length: u64 = fields[1].parse().map_err(|_| FerroError::Io {
844 msg: format!(
845 "Invalid length '{}' in FAI for sequence '{}'",
846 fields[1], name
847 ),
848 })?;
849 let offset: u64 = fields[2].parse().map_err(|_| FerroError::Io {
850 msg: format!(
851 "Invalid offset '{}' in FAI for sequence '{}'",
852 fields[2], name
853 ),
854 })?;
855 let line_bases: u64 = fields[3].parse().map_err(|_| FerroError::Io {
856 msg: format!(
857 "Invalid line_bases '{}' in FAI for sequence '{}'",
858 fields[3], name
859 ),
860 })?;
861 let line_bytes: u64 = fields[4].parse().map_err(|_| FerroError::Io {
862 msg: format!(
863 "Invalid line_bytes '{}' in FAI for sequence '{}'",
864 fields[4], name
865 ),
866 })?;
867
868 if line_bases == 0 || line_bytes == 0 {
870 return Err(FerroError::Io {
871 msg: format!(
872 "Invalid FAI entry for '{}': line_bases={}, line_bytes={} (must be > 0)",
873 name, line_bases, line_bytes
874 ),
875 });
876 }
877
878 let entry = FastaIndexEntry {
879 file_path: fasta_path.clone(),
880 name: name.clone(),
881 length,
882 offset,
883 line_bases,
884 line_bytes,
885 };
886
887 index.insert(name, entry);
888 }
889
890 Ok(index)
891}
892
893fn count_unique_files(index: &HashMap<String, FastaIndexEntry>) -> usize {
895 let files: std::collections::HashSet<_> = index.values().map(|e| &e.file_path).collect();
896 files.len()
897}
898
899fn build_chromosome_aliases() -> HashMap<String, String> {
901 let mut aliases = HashMap::new();
902
903 let refseq_chroms = [
905 ("NC_000001.11", "chr1"),
906 ("NC_000002.12", "chr2"),
907 ("NC_000003.12", "chr3"),
908 ("NC_000004.12", "chr4"),
909 ("NC_000005.10", "chr5"),
910 ("NC_000006.12", "chr6"),
911 ("NC_000007.14", "chr7"),
912 ("NC_000008.11", "chr8"),
913 ("NC_000009.12", "chr9"),
914 ("NC_000010.11", "chr10"),
915 ("NC_000011.10", "chr11"),
916 ("NC_000012.12", "chr12"),
917 ("NC_000013.11", "chr13"),
918 ("NC_000014.9", "chr14"),
919 ("NC_000015.10", "chr15"),
920 ("NC_000016.10", "chr16"),
921 ("NC_000017.11", "chr17"),
922 ("NC_000018.10", "chr18"),
923 ("NC_000019.10", "chr19"),
924 ("NC_000020.11", "chr20"),
925 ("NC_000021.9", "chr21"),
926 ("NC_000022.11", "chr22"),
927 ("NC_000023.11", "chrX"),
928 ("NC_000024.10", "chrY"),
929 ("NC_012920.1", "chrM"),
930 ];
931
932 for (refseq, ucsc) in &refseq_chroms {
933 aliases.insert(refseq.to_string(), ucsc.to_string());
934 if let Some(base) = refseq.split('.').next() {
936 aliases.insert(base.to_string(), ucsc.to_string());
937 }
938 }
939
940 for i in 1..=22 {
942 aliases.insert(i.to_string(), format!("chr{}", i));
943 }
944 aliases.insert("X".to_string(), "chrX".to_string());
945 aliases.insert("Y".to_string(), "chrY".to_string());
946 aliases.insert("M".to_string(), "chrM".to_string());
947 aliases.insert("MT".to_string(), "chrM".to_string());
948
949 aliases
950}
951
952#[cfg(test)]
953mod tests {
954 use super::*;
955 use std::io::Write;
956 use tempfile::tempdir;
957
958 #[test]
959 fn test_load_fai_index() {
960 let dir = tempdir().unwrap();
961 let fasta_path = dir.path().join("test.fna");
962 let fai_path = dir.path().join("test.fna.fai");
963
964 let mut fasta_file = File::create(&fasta_path).unwrap();
966 writeln!(fasta_file, ">NM_000001.1").unwrap();
967 writeln!(fasta_file, "ATGCATGCAT").unwrap();
968
969 let mut fai_file = File::create(&fai_path).unwrap();
971 writeln!(fai_file, "NM_000001.1\t10\t13\t10\t11").unwrap();
972
973 let index = load_fai_index(&fasta_path, &fai_path).unwrap();
974
975 assert!(index.contains_key("NM_000001.1"));
976 let entry = index.get("NM_000001.1").unwrap();
977 assert_eq!(entry.length, 10);
978 assert_eq!(entry.offset, 13);
979 }
980
981 #[test]
982 fn test_multi_fasta_provider_from_directory() {
983 let dir = tempdir().unwrap();
984
985 let fasta_path = dir.path().join("test.fna");
987 let fai_path = dir.path().join("test.fna.fai");
988
989 let mut fasta_file = File::create(&fasta_path).unwrap();
990 writeln!(fasta_file, ">NM_000001.1").unwrap();
991 writeln!(fasta_file, "ATGCATGCAT").unwrap();
992
993 let mut fai_file = File::create(&fai_path).unwrap();
994 writeln!(fai_file, "NM_000001.1\t10\t13\t10\t11").unwrap();
995
996 let provider = MultiFastaProvider::from_directory(dir.path()).unwrap();
997
998 assert!(provider.has_sequence("NM_000001.1"));
999 assert!(provider.has_sequence("NM_000001")); assert_eq!(provider.sequence_length("NM_000001.1"), Some(10));
1001 }
1002
1003 #[test]
1004 fn test_get_sequence() {
1005 let dir = tempdir().unwrap();
1006
1007 let fasta_path = dir.path().join("test.fna");
1008 let fai_path = dir.path().join("test.fna.fai");
1009
1010 let mut fasta_file = File::create(&fasta_path).unwrap();
1011 writeln!(fasta_file, ">NM_000001.1").unwrap();
1012 writeln!(fasta_file, "ATGCATGCAT").unwrap();
1013
1014 let mut fai_file = File::create(&fai_path).unwrap();
1015 writeln!(fai_file, "NM_000001.1\t10\t13\t10\t11").unwrap();
1016
1017 let provider = MultiFastaProvider::from_directory(dir.path()).unwrap();
1018
1019 let seq = provider.get_sequence("NM_000001.1", 0, 4).unwrap();
1020 assert_eq!(seq, "ATGC");
1021
1022 let seq = provider.get_sequence("NM_000001", 0, 4).unwrap();
1023 assert_eq!(seq, "ATGC");
1024 }
1025
1026 #[test]
1027 fn test_chromosome_aliases() {
1028 let aliases = build_chromosome_aliases();
1029
1030 assert_eq!(aliases.get("NC_000001.11"), Some(&"chr1".to_string()));
1031 assert_eq!(aliases.get("NC_000001"), Some(&"chr1".to_string()));
1032 assert_eq!(aliases.get("1"), Some(&"chr1".to_string()));
1033 assert_eq!(aliases.get("X"), Some(&"chrX".to_string()));
1034 }
1035}