1use crate::errors::FgumiError;
23use anyhow::{Context, Result};
24use log::debug;
25use noodles::core::Position;
26use noodles::fasta::fai;
27use std::collections::HashMap;
28use std::fs::File;
29use std::io::{Read, Seek, SeekFrom};
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32
33#[allow(clippy::cast_possible_truncation)]
39fn read_sequence_raw(file: &mut File, record: &fai::Record) -> Result<Vec<u8>> {
40 let line_bases = record.line_bases() as usize;
41 let line_width = record.line_width() as usize;
42 let seq_len = record.length() as usize;
43 let offset = record.offset();
44
45 if seq_len <= line_bases {
47 file.seek(SeekFrom::Start(offset))?;
48 let mut sequence = vec![0u8; seq_len];
49 file.read_exact(&mut sequence)?;
50 return Ok(sequence);
51 }
52
53 let complete_lines = seq_len / line_bases;
55 let remaining_bases = seq_len % line_bases;
56
57 let total_bytes = if remaining_bases > 0 {
60 complete_lines * line_width + remaining_bases
61 } else if complete_lines > 0 {
62 (complete_lines - 1) * line_width + line_bases
64 } else {
65 0
66 };
67
68 file.seek(SeekFrom::Start(offset))?;
70 let mut raw_bytes = vec![0u8; total_bytes];
71 file.read_exact(&mut raw_bytes)?;
72
73 let mut sequence = Vec::with_capacity(seq_len);
75 let terminator_len = line_width - line_bases;
76
77 let mut pos = 0;
78 while sequence.len() < seq_len && pos < raw_bytes.len() {
79 let bases_to_copy = (seq_len - sequence.len()).min(line_bases).min(raw_bytes.len() - pos);
81 sequence.extend_from_slice(&raw_bytes[pos..pos + bases_to_copy]);
82 pos += bases_to_copy;
83
84 if sequence.len() < seq_len && pos < raw_bytes.len() {
86 pos += terminator_len;
87 }
88 }
89
90 Ok(sequence)
91}
92
93fn find_sibling_file(fasta_path: &Path, replace_ext: &str, append_ext: &str) -> Option<PathBuf> {
97 let replaced = fasta_path.with_extension(replace_ext);
98 if replaced.exists() {
99 return Some(replaced);
100 }
101
102 let appended = PathBuf::from(format!("{}.{append_ext}", fasta_path.display()));
103 if appended.exists() {
104 return Some(appended);
105 }
106
107 None
108}
109
110fn find_fai_path(fasta_path: &Path) -> Option<PathBuf> {
116 find_sibling_file(fasta_path, "fa.fai", "fai")
117}
118
119#[must_use]
142pub fn find_dict_path(fasta_path: &Path) -> Option<PathBuf> {
143 find_sibling_file(fasta_path, "dict", "dict")
144}
145
146#[derive(Clone)]
155pub struct ReferenceReader {
156 sequences: Arc<HashMap<String, Vec<u8>>>,
158}
159
160impl ReferenceReader {
161 pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
183 let path = path.as_ref();
184
185 if !path.exists() {
187 return Err(FgumiError::InvalidFileFormat {
188 file_type: "Reference FASTA".to_string(),
189 path: path.display().to_string(),
190 reason: "File does not exist".to_string(),
191 }
192 .into());
193 }
194
195 debug!("Reading reference FASTA into memory: {}", path.display());
196
197 if let Some(fai_path) = find_fai_path(path) {
199 debug!("Using FAI index for fast loading: {}", fai_path.display());
200 return Self::new_with_fai(path, &fai_path);
201 }
202
203 debug!("No FAI index found, using sequential reading");
205 Self::new_sequential(path)
206 }
207
208 fn new_with_fai(fasta_path: &Path, fai_path: &Path) -> Result<Self> {
210 let index = fai::fs::read(fai_path)
211 .with_context(|| format!("Failed to read FAI index: {}", fai_path.display()))?;
212 let records: &[fai::Record] = index.as_ref();
213 let mut file = File::open(fasta_path)
214 .with_context(|| format!("Failed to open FASTA: {}", fasta_path.display()))?;
215
216 let mut sequences = HashMap::with_capacity(records.len());
217
218 for record in records {
219 let raw_sequence = read_sequence_raw(&mut file, record)?;
220 let name = String::from_utf8_lossy(record.name().as_ref()).into_owned();
221 sequences.insert(name, raw_sequence);
222 }
223
224 debug!("Loaded {} contigs into memory (FAI-indexed)", sequences.len());
225 Ok(Self { sequences: Arc::new(sequences) })
226 }
227
228 fn new_sequential(path: &Path) -> Result<Self> {
230 use noodles::fasta;
231
232 let mut sequences = HashMap::new();
233 let mut reader = fasta::io::reader::Builder.build_from_path(path)?;
234
235 for result in reader.records() {
236 let record = result?;
237 let name = std::str::from_utf8(record.name())?.to_string();
238 let raw_sequence: Vec<u8> = record.sequence().as_ref().to_vec();
239 sequences.insert(name, raw_sequence);
240 }
241
242 debug!("Loaded {} contigs into memory (sequential)", sequences.len());
243 Ok(Self { sequences: Arc::new(sequences) })
244 }
245
246 pub fn fetch(&self, chrom: &str, start: Position, end: Position) -> Result<Vec<u8>> {
277 Ok(self.fetch_slice(chrom, start, end)?.to_vec())
278 }
279
280 pub fn fetch_slice(&self, chrom: &str, start: Position, end: Position) -> Result<&[u8]> {
292 let sequence = self
293 .sequences
294 .get(chrom)
295 .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
296
297 let start_idx = usize::from(start) - 1;
299 let end_idx = usize::from(end);
300
301 if end_idx > sequence.len() || start_idx >= end_idx {
302 return Err(FgumiError::InvalidParameter {
303 parameter: "region".to_string(),
304 reason: format!(
305 "Requested region {}:{}-{} exceeds sequence length {}",
306 chrom,
307 start,
308 end,
309 sequence.len()
310 ),
311 }
312 .into());
313 }
314
315 Ok(&sequence[start_idx..end_idx])
316 }
317
318 pub fn base_at(&self, chrom: &str, pos: Position) -> Result<u8> {
347 let sequence = self
348 .sequences
349 .get(chrom)
350 .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
351
352 let pos_idx = usize::from(pos) - 1;
354
355 sequence.get(pos_idx).copied().ok_or_else(|| {
356 FgumiError::InvalidParameter {
357 parameter: "position".to_string(),
358 reason: format!(
359 "Position {}:{} exceeds sequence length {}",
360 chrom,
361 pos,
362 sequence.len()
363 ),
364 }
365 .into()
366 })
367 }
368}
369
370impl fgumi_sam::ReferenceProvider for ReferenceReader {
371 fn fetch(
372 &self,
373 chrom: &str,
374 start: noodles::core::Position,
375 end: noodles::core::Position,
376 ) -> anyhow::Result<Vec<u8>> {
377 self.fetch(chrom, start, end)
378 }
379}
380
381#[cfg(feature = "simplex")]
382impl fgumi_consensus::methylation::RefBaseProvider for ReferenceReader {
383 fn base_at_0based(&self, chrom: &str, pos: u64) -> Option<u8> {
384 let sequence = self.sequences.get(chrom)?;
385 sequence.get(usize::try_from(pos).ok()?).copied()
386 }
387
388 fn sequence_for(&self, chrom: &str) -> Option<&[u8]> {
389 self.sequences.get(chrom).map(Vec::as_slice)
390 }
391}
392
393#[cfg(test)]
394mod tests {
395 use super::*;
396 use crate::sam::builder::create_default_test_fasta;
397
398 #[test]
399 fn test_fetch_subsequence() -> Result<()> {
400 let fasta = create_default_test_fasta()?;
401 let reader = ReferenceReader::new(fasta.path())?;
402
403 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
405 assert_eq!(seq, b"ACGT");
406
407 let seq = reader.fetch("chr2", Position::try_from(5)?, Position::try_from(8)?)?;
409 assert_eq!(seq, b"CCCC");
410
411 Ok(())
412 }
413
414 #[test]
415 fn test_fetch_slice_returns_borrowed_bytes() -> Result<()> {
416 let fasta = create_default_test_fasta()?;
417 let reader = ReferenceReader::new(fasta.path())?;
418
419 let slice: &[u8] =
421 reader.fetch_slice("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
422 assert_eq!(slice, b"ACGT");
423
424 let owned = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
426 assert_eq!(slice, owned.as_slice());
427
428 assert!(
430 reader
431 .fetch_slice("chr1", Position::try_from(1)?, Position::try_from(10_000)?)
432 .is_err()
433 );
434 assert!(
435 reader.fetch_slice("nope", Position::try_from(1)?, Position::try_from(2)?).is_err()
436 );
437 assert!(
439 reader.fetch_slice("chr1", Position::try_from(5)?, Position::try_from(4)?).is_err()
440 );
441
442 Ok(())
443 }
444
445 #[test]
446 fn test_base_at() -> Result<()> {
447 let fasta = create_default_test_fasta()?;
448 let reader = ReferenceReader::new(fasta.path())?;
449
450 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
451 assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
452 assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'G');
453
454 Ok(())
455 }
456
457 #[test]
458 fn test_all_sequences_loaded() -> Result<()> {
459 let fasta = create_default_test_fasta()?;
460 let reader = ReferenceReader::new(fasta.path())?;
461
462 let seq1 = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
464 assert_eq!(seq1, b"ACGT");
465
466 let seq2 = reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?;
467 assert_eq!(seq2, b"GGGG");
468
469 let seq1_again = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
471 assert_eq!(seq1_again, b"ACGT");
472
473 Ok(())
474 }
475
476 #[test]
477 fn test_nonexistent_sequence() {
478 let fasta = create_default_test_fasta().expect("creating test FASTA should succeed");
479 let reader =
480 ReferenceReader::new(fasta.path()).expect("creating reference reader should succeed");
481
482 let result = reader.fetch(
483 "chr999",
484 Position::try_from(1).expect("position conversion should succeed"),
485 Position::try_from(4).expect("position conversion should succeed"),
486 );
487 assert!(result.is_err());
488 }
489
490 #[test]
491 fn test_out_of_bounds() {
492 let fasta = create_default_test_fasta().expect("creating test FASTA should succeed");
493 let reader =
494 ReferenceReader::new(fasta.path()).expect("creating reference reader should succeed");
495
496 let result = reader.fetch(
498 "chr1",
499 Position::try_from(1).expect("position conversion should succeed"),
500 Position::try_from(100).expect("position conversion should succeed"),
501 );
502 assert!(result.is_err());
503 }
504
505 #[test]
506 fn test_reference_case_preserved() -> Result<()> {
507 use crate::sam::builder::create_test_fasta;
510
511 let file = create_test_fasta(&[("chr1", "AcGtNnAaCcGgTt")])?; let reader = ReferenceReader::new(file.path())?;
514
515 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(14)?)?;
517 assert_eq!(seq, b"AcGtNnAaCcGgTt");
518
519 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A'); assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'c'); assert_eq!(reader.base_at("chr1", Position::try_from(3)?)?, b'G'); assert_eq!(reader.base_at("chr1", Position::try_from(4)?)?, b't'); assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N'); assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'n'); Ok(())
528 }
529
530 #[test]
531 fn test_n_bases_at_various_positions() -> Result<()> {
532 use crate::sam::builder::create_test_fasta;
533
534 let file = create_test_fasta(&[("chr1", "NACGTNACGTN")])?;
536 let reader = ReferenceReader::new(file.path())?;
537
538 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'N');
539 assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'N');
540 assert_eq!(reader.base_at("chr1", Position::try_from(11)?)?, b'N');
541
542 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(6)?)?;
544 assert_eq!(seq, b"NACGTN");
545
546 Ok(())
547 }
548
549 #[test]
550 fn test_all_n_sequence() -> Result<()> {
551 use crate::sam::builder::create_test_fasta;
552
553 let file = create_test_fasta(&[("chrN", "NNNNNNNNNN")])?;
554 let reader = ReferenceReader::new(file.path())?;
555
556 let seq = reader.fetch("chrN", Position::try_from(1)?, Position::try_from(10)?)?;
557 assert_eq!(seq, b"NNNNNNNNNN");
558
559 for i in 1..=10 {
560 assert_eq!(reader.base_at("chrN", Position::try_from(i)?)?, b'N');
561 }
562
563 Ok(())
564 }
565
566 #[test]
567 fn test_long_sequence_boundaries() -> Result<()> {
568 use crate::sam::builder::create_test_fasta;
569
570 let mut seq = String::new();
573
574 for _ in 0..7 {
576 seq.push_str("ACGT");
577 }
578 seq.push_str("NNNN");
580 for _ in 0..7 {
582 seq.push_str("ACGT");
583 }
584 seq.push_str("acgt");
586 for _ in 0..9 {
588 seq.push_str("ACGT");
589 }
590
591 assert_eq!(seq.len(), 100);
592
593 let file = create_test_fasta(&[("chr1", &seq)])?;
594 let reader = ReferenceReader::new(file.path())?;
595
596 assert_eq!(reader.base_at("chr1", Position::try_from(28)?)?, b'T');
598 assert_eq!(reader.base_at("chr1", Position::try_from(29)?)?, b'N');
599 assert_eq!(reader.base_at("chr1", Position::try_from(32)?)?, b'N');
600 assert_eq!(reader.base_at("chr1", Position::try_from(33)?)?, b'A');
601
602 assert_eq!(reader.base_at("chr1", Position::try_from(60)?)?, b'T');
604 assert_eq!(reader.base_at("chr1", Position::try_from(61)?)?, b'a');
605 assert_eq!(reader.base_at("chr1", Position::try_from(64)?)?, b't');
606 assert_eq!(reader.base_at("chr1", Position::try_from(65)?)?, b'A');
607
608 let cross_first = reader.fetch("chr1", Position::try_from(27)?, Position::try_from(34)?)?;
610 assert_eq!(cross_first, b"GTNNNNAC");
611
612 let cross_second =
613 reader.fetch("chr1", Position::try_from(59)?, Position::try_from(66)?)?;
614 assert_eq!(cross_second, b"GTacgtAC");
615
616 Ok(())
617 }
618
619 #[test]
620 fn test_single_base_sequence() -> Result<()> {
621 use crate::sam::builder::create_test_fasta;
622
623 let file = create_test_fasta(&[("chr1", "A"), ("chr2", "N"), ("chr3", "g")])?;
624 let reader = ReferenceReader::new(file.path())?;
625
626 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
627 assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'N');
628 assert_eq!(reader.base_at("chr3", Position::try_from(1)?)?, b'g');
629
630 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
631 assert_eq!(seq, b"A");
632
633 Ok(())
634 }
635
636 #[test]
637 fn test_fetch_full_sequence() -> Result<()> {
638 use crate::sam::builder::create_test_fasta;
639
640 let original = "ACGTNacgtn";
641 let file = create_test_fasta(&[("chr1", original)])?;
642 let reader = ReferenceReader::new(file.path())?;
643
644 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(10)?)?;
645 assert_eq!(seq, original.as_bytes());
646
647 Ok(())
648 }
649
650 #[test]
651 fn test_fetch_last_base() -> Result<()> {
652 use crate::sam::builder::create_test_fasta;
653
654 let file = create_test_fasta(&[("chr1", "ACGTN")])?;
655 let reader = ReferenceReader::new(file.path())?;
656
657 let seq = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(5)?)?;
659 assert_eq!(seq, b"N");
660
661 assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N');
662
663 Ok(())
664 }
665
666 #[test]
667 fn test_multiple_chromosomes_isolation() -> Result<()> {
668 use crate::sam::builder::create_test_fasta;
669
670 let file = create_test_fasta(&[
671 ("chr1", "AAAA"),
672 ("chr2", "CCCC"),
673 ("chr3", "GGGG"),
674 ("chr4", "TTTT"),
675 ])?;
676 let reader = ReferenceReader::new(file.path())?;
677
678 assert_eq!(reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?, b"AAAA");
680 assert_eq!(reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?, b"CCCC");
681 assert_eq!(reader.fetch("chr3", Position::try_from(1)?, Position::try_from(4)?)?, b"GGGG");
682 assert_eq!(reader.fetch("chr4", Position::try_from(1)?, Position::try_from(4)?)?, b"TTTT");
683
684 Ok(())
685 }
686
687 #[test]
688 fn test_mixed_case_all_bases() -> Result<()> {
689 use crate::sam::builder::create_test_fasta;
690
691 let file = create_test_fasta(&[("chr1", "ACGTNacgtn")])?;
693 let reader = ReferenceReader::new(file.path())?;
694
695 let expected = [b'A', b'C', b'G', b'T', b'N', b'a', b'c', b'g', b't', b'n'];
696 for (i, &expected_base) in expected.iter().enumerate() {
697 let pos = Position::try_from(i + 1)?;
698 assert_eq!(
699 reader.base_at("chr1", pos)?,
700 expected_base,
701 "Mismatch at position {}",
702 i + 1
703 );
704 }
705
706 Ok(())
707 }
708
709 #[test]
710 fn test_runs_of_n_bases() -> Result<()> {
711 use crate::sam::builder::create_test_fasta;
712
713 let file = create_test_fasta(&[("chr1", "ACGTNNNNNNNNACGT")])?;
715 let reader = ReferenceReader::new(file.path())?;
716
717 let n_run = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(12)?)?;
719 assert_eq!(n_run, b"NNNNNNNN");
720
721 let across = reader.fetch("chr1", Position::try_from(3)?, Position::try_from(14)?)?;
723 assert_eq!(across, b"GTNNNNNNNNAC");
724
725 Ok(())
726 }
727
728 #[test]
729 fn test_position_one_based() -> Result<()> {
730 use crate::sam::builder::create_test_fasta;
731
732 let file = create_test_fasta(&[("chr1", "ACGTN")])?;
733 let reader = ReferenceReader::new(file.path())?;
734
735 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
737 assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
738
739 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
741 assert_eq!(seq, b"A");
742
743 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(2)?)?;
745 assert_eq!(seq, b"AC");
746
747 Ok(())
748 }
749
750 #[test]
751 fn test_find_dict_path_replacing_convention() -> Result<()> {
752 let temp_dir = tempfile::tempdir()?;
754 let fasta_path = temp_dir.path().join("ref.fa");
755 let dict_path = temp_dir.path().join("ref.dict");
756
757 std::fs::write(&fasta_path, "")?;
759 std::fs::write(&dict_path, "")?;
760
761 let found = find_dict_path(&fasta_path);
762 assert!(found.is_some());
763 assert_eq!(found.expect("should find dictionary path"), dict_path);
764
765 Ok(())
766 }
767
768 #[test]
769 fn test_find_dict_path_appending_convention() -> Result<()> {
770 let temp_dir = tempfile::tempdir()?;
772 let fasta_path = temp_dir.path().join("ref.fa");
773 let dict_path = temp_dir.path().join("ref.fa.dict");
774
775 std::fs::write(&fasta_path, "")?;
777 std::fs::write(&dict_path, "")?;
778
779 let found = find_dict_path(&fasta_path);
780 assert!(found.is_some());
781 assert_eq!(found.expect("should find dictionary path"), dict_path);
782
783 Ok(())
784 }
785
786 #[test]
787 fn test_find_dict_path_prefers_replacing_convention() -> Result<()> {
788 let temp_dir = tempfile::tempdir()?;
790 let fasta_path = temp_dir.path().join("ref.fa");
791 let dict_replacing = temp_dir.path().join("ref.dict");
792 let dict_appending = temp_dir.path().join("ref.fa.dict");
793
794 std::fs::write(&fasta_path, "")?;
796 std::fs::write(&dict_replacing, "")?;
797 std::fs::write(&dict_appending, "")?;
798
799 let found = find_dict_path(&fasta_path);
800 assert!(found.is_some());
801 assert_eq!(found.expect("should find dictionary path"), dict_replacing);
803
804 Ok(())
805 }
806
807 #[test]
808 fn test_find_dict_path_not_found() {
809 let temp_dir = tempfile::tempdir().expect("creating temp file/dir should succeed");
811 let fasta_path = temp_dir.path().join("ref.fa");
812
813 std::fs::write(&fasta_path, "").expect("writing file should succeed");
815
816 let found = find_dict_path(&fasta_path);
817 assert!(found.is_none());
818 }
819
820 #[test]
821 fn test_find_dict_path_fasta_extension() -> Result<()> {
822 let temp_dir = tempfile::tempdir()?;
824 let fasta_path = temp_dir.path().join("ref.fasta");
825 let dict_path = temp_dir.path().join("ref.dict");
826
827 std::fs::write(&fasta_path, "")?;
828 std::fs::write(&dict_path, "")?;
829
830 let found = find_dict_path(&fasta_path);
831 assert!(found.is_some());
832 assert_eq!(found.expect("should find dictionary path"), dict_path);
833
834 Ok(())
835 }
836
837 #[test]
838 fn test_find_dict_path_fasta_appending_convention() -> Result<()> {
839 let temp_dir = tempfile::tempdir()?;
841 let fasta_path = temp_dir.path().join("ref.fasta");
842 let dict_path = temp_dir.path().join("ref.fasta.dict");
843
844 std::fs::write(&fasta_path, "")?;
845 std::fs::write(&dict_path, "")?;
846
847 let found = find_dict_path(&fasta_path);
848 assert!(found.is_some());
849 assert_eq!(found.expect("should find dictionary path"), dict_path);
850
851 Ok(())
852 }
853}