1use crate::errors::FgumiError;
23use anyhow::{Context, Result};
24use log::debug;
25use noodles::core::Position;
26use noodles::fasta::fai;
27use std::collections::HashMap;
28use std::fs::File;
29use std::io::{Read, Seek, SeekFrom};
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32
33#[allow(clippy::cast_possible_truncation)]
39fn read_sequence_raw(file: &mut File, record: &fai::Record) -> Result<Vec<u8>> {
40 let line_bases = record.line_bases() as usize;
41 let line_width = record.line_width() as usize;
42 let seq_len = record.length() as usize;
43 let offset = record.offset();
44
45 if seq_len <= line_bases {
47 file.seek(SeekFrom::Start(offset))?;
48 let mut sequence = vec![0u8; seq_len];
49 file.read_exact(&mut sequence)?;
50 return Ok(sequence);
51 }
52
53 let complete_lines = seq_len / line_bases;
55 let remaining_bases = seq_len % line_bases;
56
57 let total_bytes = if remaining_bases > 0 {
60 complete_lines * line_width + remaining_bases
61 } else if complete_lines > 0 {
62 (complete_lines - 1) * line_width + line_bases
64 } else {
65 0
66 };
67
68 file.seek(SeekFrom::Start(offset))?;
70 let mut raw_bytes = vec![0u8; total_bytes];
71 file.read_exact(&mut raw_bytes)?;
72
73 let mut sequence = Vec::with_capacity(seq_len);
75 let terminator_len = line_width - line_bases;
76
77 let mut pos = 0;
78 while sequence.len() < seq_len && pos < raw_bytes.len() {
79 let bases_to_copy = (seq_len - sequence.len()).min(line_bases).min(raw_bytes.len() - pos);
81 sequence.extend_from_slice(&raw_bytes[pos..pos + bases_to_copy]);
82 pos += bases_to_copy;
83
84 if sequence.len() < seq_len && pos < raw_bytes.len() {
86 pos += terminator_len;
87 }
88 }
89
90 Ok(sequence)
91}
92
93fn find_sibling_file(fasta_path: &Path, replace_ext: &str, append_ext: &str) -> Option<PathBuf> {
97 let replaced = fasta_path.with_extension(replace_ext);
98 if replaced.exists() {
99 return Some(replaced);
100 }
101
102 let appended = PathBuf::from(format!("{}.{append_ext}", fasta_path.display()));
103 if appended.exists() {
104 return Some(appended);
105 }
106
107 None
108}
109
110fn find_fai_path(fasta_path: &Path) -> Option<PathBuf> {
116 find_sibling_file(fasta_path, "fa.fai", "fai")
117}
118
119#[must_use]
142pub fn find_dict_path(fasta_path: &Path) -> Option<PathBuf> {
143 find_sibling_file(fasta_path, "dict", "dict")
144}
145
146#[derive(Clone)]
155pub struct ReferenceReader {
156 sequences: Arc<HashMap<String, Vec<u8>>>,
158}
159
160impl ReferenceReader {
161 pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
183 let path = path.as_ref();
184
185 if !path.exists() {
187 return Err(FgumiError::InvalidFileFormat {
188 file_type: "Reference FASTA".to_string(),
189 path: path.display().to_string(),
190 reason: "File does not exist".to_string(),
191 }
192 .into());
193 }
194
195 debug!("Reading reference FASTA into memory: {}", path.display());
196
197 if let Some(fai_path) = find_fai_path(path) {
199 debug!("Using FAI index for fast loading: {}", fai_path.display());
200 return Self::new_with_fai(path, &fai_path);
201 }
202
203 debug!("No FAI index found, using sequential reading");
205 Self::new_sequential(path)
206 }
207
208 fn new_with_fai(fasta_path: &Path, fai_path: &Path) -> Result<Self> {
210 let index = fai::fs::read(fai_path)
211 .with_context(|| format!("Failed to read FAI index: {}", fai_path.display()))?;
212 let records: &[fai::Record] = index.as_ref();
213 let mut file = File::open(fasta_path)
214 .with_context(|| format!("Failed to open FASTA: {}", fasta_path.display()))?;
215
216 let mut sequences = HashMap::with_capacity(records.len());
217
218 for record in records {
219 let raw_sequence = read_sequence_raw(&mut file, record)?;
220 let name = String::from_utf8_lossy(record.name().as_ref()).into_owned();
221 sequences.insert(name, raw_sequence);
222 }
223
224 debug!("Loaded {} contigs into memory (FAI-indexed)", sequences.len());
225 Ok(Self { sequences: Arc::new(sequences) })
226 }
227
228 fn new_sequential(path: &Path) -> Result<Self> {
230 use noodles::fasta;
231
232 let mut sequences = HashMap::new();
233 let mut reader = fasta::io::reader::Builder.build_from_path(path)?;
234
235 for result in reader.records() {
236 let record = result?;
237 let name = std::str::from_utf8(record.name())?.to_string();
238 let raw_sequence: Vec<u8> = record.sequence().as_ref().to_vec();
239 sequences.insert(name, raw_sequence);
240 }
241
242 debug!("Loaded {} contigs into memory (sequential)", sequences.len());
243 Ok(Self { sequences: Arc::new(sequences) })
244 }
245
246 pub fn fetch(&self, chrom: &str, start: Position, end: Position) -> Result<Vec<u8>> {
277 let sequence = self
278 .sequences
279 .get(chrom)
280 .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
281
282 let start_idx = usize::from(start) - 1;
284 let end_idx = usize::from(end);
285
286 if end_idx > sequence.len() || start_idx > end_idx {
287 return Err(FgumiError::InvalidParameter {
288 parameter: "region".to_string(),
289 reason: format!(
290 "Requested region {}:{}-{} exceeds sequence length {}",
291 chrom,
292 start,
293 end,
294 sequence.len()
295 ),
296 }
297 .into());
298 }
299
300 Ok(sequence[start_idx..end_idx].to_vec())
301 }
302
303 pub fn base_at(&self, chrom: &str, pos: Position) -> Result<u8> {
332 let sequence = self
333 .sequences
334 .get(chrom)
335 .ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
336
337 let pos_idx = usize::from(pos) - 1;
339
340 sequence.get(pos_idx).copied().ok_or_else(|| {
341 FgumiError::InvalidParameter {
342 parameter: "position".to_string(),
343 reason: format!(
344 "Position {}:{} exceeds sequence length {}",
345 chrom,
346 pos,
347 sequence.len()
348 ),
349 }
350 .into()
351 })
352 }
353}
354
355impl fgumi_sam::ReferenceProvider for ReferenceReader {
356 fn fetch(
357 &self,
358 chrom: &str,
359 start: noodles::core::Position,
360 end: noodles::core::Position,
361 ) -> anyhow::Result<Vec<u8>> {
362 self.fetch(chrom, start, end)
363 }
364}
365
366#[cfg(test)]
367mod tests {
368 use super::*;
369 use crate::sam::builder::create_default_test_fasta;
370
371 #[test]
372 fn test_fetch_subsequence() -> Result<()> {
373 let fasta = create_default_test_fasta()?;
374 let reader = ReferenceReader::new(fasta.path())?;
375
376 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
378 assert_eq!(seq, b"ACGT");
379
380 let seq = reader.fetch("chr2", Position::try_from(5)?, Position::try_from(8)?)?;
382 assert_eq!(seq, b"CCCC");
383
384 Ok(())
385 }
386
387 #[test]
388 fn test_base_at() -> Result<()> {
389 let fasta = create_default_test_fasta()?;
390 let reader = ReferenceReader::new(fasta.path())?;
391
392 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
393 assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
394 assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'G');
395
396 Ok(())
397 }
398
399 #[test]
400 fn test_all_sequences_loaded() -> Result<()> {
401 let fasta = create_default_test_fasta()?;
402 let reader = ReferenceReader::new(fasta.path())?;
403
404 let seq1 = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
406 assert_eq!(seq1, b"ACGT");
407
408 let seq2 = reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?;
409 assert_eq!(seq2, b"GGGG");
410
411 let seq1_again = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
413 assert_eq!(seq1_again, b"ACGT");
414
415 Ok(())
416 }
417
418 #[test]
419 fn test_nonexistent_sequence() {
420 let fasta = create_default_test_fasta().unwrap();
421 let reader = ReferenceReader::new(fasta.path()).unwrap();
422
423 let result =
424 reader.fetch("chr999", Position::try_from(1).unwrap(), Position::try_from(4).unwrap());
425 assert!(result.is_err());
426 }
427
428 #[test]
429 fn test_out_of_bounds() {
430 let fasta = create_default_test_fasta().unwrap();
431 let reader = ReferenceReader::new(fasta.path()).unwrap();
432
433 let result =
435 reader.fetch("chr1", Position::try_from(1).unwrap(), Position::try_from(100).unwrap());
436 assert!(result.is_err());
437 }
438
439 #[test]
440 fn test_reference_case_preserved() -> Result<()> {
441 use crate::sam::builder::create_test_fasta;
444
445 let file = create_test_fasta(&[("chr1", "AcGtNnAaCcGgTt")])?; let reader = ReferenceReader::new(file.path())?;
448
449 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(14)?)?;
451 assert_eq!(seq, b"AcGtNnAaCcGgTt");
452
453 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A'); assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'c'); assert_eq!(reader.base_at("chr1", Position::try_from(3)?)?, b'G'); assert_eq!(reader.base_at("chr1", Position::try_from(4)?)?, b't'); assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N'); assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'n'); Ok(())
462 }
463
464 #[test]
465 fn test_n_bases_at_various_positions() -> Result<()> {
466 use crate::sam::builder::create_test_fasta;
467
468 let file = create_test_fasta(&[("chr1", "NACGTNACGTN")])?;
470 let reader = ReferenceReader::new(file.path())?;
471
472 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'N');
473 assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'N');
474 assert_eq!(reader.base_at("chr1", Position::try_from(11)?)?, b'N');
475
476 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(6)?)?;
478 assert_eq!(seq, b"NACGTN");
479
480 Ok(())
481 }
482
483 #[test]
484 fn test_all_n_sequence() -> Result<()> {
485 use crate::sam::builder::create_test_fasta;
486
487 let file = create_test_fasta(&[("chrN", "NNNNNNNNNN")])?;
488 let reader = ReferenceReader::new(file.path())?;
489
490 let seq = reader.fetch("chrN", Position::try_from(1)?, Position::try_from(10)?)?;
491 assert_eq!(seq, b"NNNNNNNNNN");
492
493 for i in 1..=10 {
494 assert_eq!(reader.base_at("chrN", Position::try_from(i)?)?, b'N');
495 }
496
497 Ok(())
498 }
499
500 #[test]
501 fn test_long_sequence_boundaries() -> Result<()> {
502 use crate::sam::builder::create_test_fasta;
503
504 let mut seq = String::new();
507
508 for _ in 0..7 {
510 seq.push_str("ACGT");
511 }
512 seq.push_str("NNNN");
514 for _ in 0..7 {
516 seq.push_str("ACGT");
517 }
518 seq.push_str("acgt");
520 for _ in 0..9 {
522 seq.push_str("ACGT");
523 }
524
525 assert_eq!(seq.len(), 100);
526
527 let file = create_test_fasta(&[("chr1", &seq)])?;
528 let reader = ReferenceReader::new(file.path())?;
529
530 assert_eq!(reader.base_at("chr1", Position::try_from(28)?)?, b'T');
532 assert_eq!(reader.base_at("chr1", Position::try_from(29)?)?, b'N');
533 assert_eq!(reader.base_at("chr1", Position::try_from(32)?)?, b'N');
534 assert_eq!(reader.base_at("chr1", Position::try_from(33)?)?, b'A');
535
536 assert_eq!(reader.base_at("chr1", Position::try_from(60)?)?, b'T');
538 assert_eq!(reader.base_at("chr1", Position::try_from(61)?)?, b'a');
539 assert_eq!(reader.base_at("chr1", Position::try_from(64)?)?, b't');
540 assert_eq!(reader.base_at("chr1", Position::try_from(65)?)?, b'A');
541
542 let cross_first = reader.fetch("chr1", Position::try_from(27)?, Position::try_from(34)?)?;
544 assert_eq!(cross_first, b"GTNNNNAC");
545
546 let cross_second =
547 reader.fetch("chr1", Position::try_from(59)?, Position::try_from(66)?)?;
548 assert_eq!(cross_second, b"GTacgtAC");
549
550 Ok(())
551 }
552
553 #[test]
554 fn test_single_base_sequence() -> Result<()> {
555 use crate::sam::builder::create_test_fasta;
556
557 let file = create_test_fasta(&[("chr1", "A"), ("chr2", "N"), ("chr3", "g")])?;
558 let reader = ReferenceReader::new(file.path())?;
559
560 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
561 assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'N');
562 assert_eq!(reader.base_at("chr3", Position::try_from(1)?)?, b'g');
563
564 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
565 assert_eq!(seq, b"A");
566
567 Ok(())
568 }
569
570 #[test]
571 fn test_fetch_full_sequence() -> Result<()> {
572 use crate::sam::builder::create_test_fasta;
573
574 let original = "ACGTNacgtn";
575 let file = create_test_fasta(&[("chr1", original)])?;
576 let reader = ReferenceReader::new(file.path())?;
577
578 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(10)?)?;
579 assert_eq!(seq, original.as_bytes());
580
581 Ok(())
582 }
583
584 #[test]
585 fn test_fetch_last_base() -> Result<()> {
586 use crate::sam::builder::create_test_fasta;
587
588 let file = create_test_fasta(&[("chr1", "ACGTN")])?;
589 let reader = ReferenceReader::new(file.path())?;
590
591 let seq = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(5)?)?;
593 assert_eq!(seq, b"N");
594
595 assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N');
596
597 Ok(())
598 }
599
600 #[test]
601 fn test_multiple_chromosomes_isolation() -> Result<()> {
602 use crate::sam::builder::create_test_fasta;
603
604 let file = create_test_fasta(&[
605 ("chr1", "AAAA"),
606 ("chr2", "CCCC"),
607 ("chr3", "GGGG"),
608 ("chr4", "TTTT"),
609 ])?;
610 let reader = ReferenceReader::new(file.path())?;
611
612 assert_eq!(reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?, b"AAAA");
614 assert_eq!(reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?, b"CCCC");
615 assert_eq!(reader.fetch("chr3", Position::try_from(1)?, Position::try_from(4)?)?, b"GGGG");
616 assert_eq!(reader.fetch("chr4", Position::try_from(1)?, Position::try_from(4)?)?, b"TTTT");
617
618 Ok(())
619 }
620
621 #[test]
622 fn test_mixed_case_all_bases() -> Result<()> {
623 use crate::sam::builder::create_test_fasta;
624
625 let file = create_test_fasta(&[("chr1", "ACGTNacgtn")])?;
627 let reader = ReferenceReader::new(file.path())?;
628
629 let expected = [b'A', b'C', b'G', b'T', b'N', b'a', b'c', b'g', b't', b'n'];
630 for (i, &expected_base) in expected.iter().enumerate() {
631 let pos = Position::try_from(i + 1)?;
632 assert_eq!(
633 reader.base_at("chr1", pos)?,
634 expected_base,
635 "Mismatch at position {}",
636 i + 1
637 );
638 }
639
640 Ok(())
641 }
642
643 #[test]
644 fn test_runs_of_n_bases() -> Result<()> {
645 use crate::sam::builder::create_test_fasta;
646
647 let file = create_test_fasta(&[("chr1", "ACGTNNNNNNNNACGT")])?;
649 let reader = ReferenceReader::new(file.path())?;
650
651 let n_run = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(12)?)?;
653 assert_eq!(n_run, b"NNNNNNNN");
654
655 let across = reader.fetch("chr1", Position::try_from(3)?, Position::try_from(14)?)?;
657 assert_eq!(across, b"GTNNNNNNNNAC");
658
659 Ok(())
660 }
661
662 #[test]
663 fn test_position_one_based() -> Result<()> {
664 use crate::sam::builder::create_test_fasta;
665
666 let file = create_test_fasta(&[("chr1", "ACGTN")])?;
667 let reader = ReferenceReader::new(file.path())?;
668
669 assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
671 assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
672
673 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
675 assert_eq!(seq, b"A");
676
677 let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(2)?)?;
679 assert_eq!(seq, b"AC");
680
681 Ok(())
682 }
683
684 #[test]
685 fn test_find_dict_path_replacing_convention() -> Result<()> {
686 let temp_dir = tempfile::tempdir()?;
688 let fasta_path = temp_dir.path().join("ref.fa");
689 let dict_path = temp_dir.path().join("ref.dict");
690
691 std::fs::write(&fasta_path, "")?;
693 std::fs::write(&dict_path, "")?;
694
695 let found = find_dict_path(&fasta_path);
696 assert!(found.is_some());
697 assert_eq!(found.unwrap(), dict_path);
698
699 Ok(())
700 }
701
702 #[test]
703 fn test_find_dict_path_appending_convention() -> Result<()> {
704 let temp_dir = tempfile::tempdir()?;
706 let fasta_path = temp_dir.path().join("ref.fa");
707 let dict_path = temp_dir.path().join("ref.fa.dict");
708
709 std::fs::write(&fasta_path, "")?;
711 std::fs::write(&dict_path, "")?;
712
713 let found = find_dict_path(&fasta_path);
714 assert!(found.is_some());
715 assert_eq!(found.unwrap(), dict_path);
716
717 Ok(())
718 }
719
720 #[test]
721 fn test_find_dict_path_prefers_replacing_convention() -> Result<()> {
722 let temp_dir = tempfile::tempdir()?;
724 let fasta_path = temp_dir.path().join("ref.fa");
725 let dict_replacing = temp_dir.path().join("ref.dict");
726 let dict_appending = temp_dir.path().join("ref.fa.dict");
727
728 std::fs::write(&fasta_path, "")?;
730 std::fs::write(&dict_replacing, "")?;
731 std::fs::write(&dict_appending, "")?;
732
733 let found = find_dict_path(&fasta_path);
734 assert!(found.is_some());
735 assert_eq!(found.unwrap(), dict_replacing);
737
738 Ok(())
739 }
740
741 #[test]
742 fn test_find_dict_path_not_found() {
743 let temp_dir = tempfile::tempdir().unwrap();
745 let fasta_path = temp_dir.path().join("ref.fa");
746
747 std::fs::write(&fasta_path, "").unwrap();
749
750 let found = find_dict_path(&fasta_path);
751 assert!(found.is_none());
752 }
753
754 #[test]
755 fn test_find_dict_path_fasta_extension() -> Result<()> {
756 let temp_dir = tempfile::tempdir()?;
758 let fasta_path = temp_dir.path().join("ref.fasta");
759 let dict_path = temp_dir.path().join("ref.dict");
760
761 std::fs::write(&fasta_path, "")?;
762 std::fs::write(&dict_path, "")?;
763
764 let found = find_dict_path(&fasta_path);
765 assert!(found.is_some());
766 assert_eq!(found.unwrap(), dict_path);
767
768 Ok(())
769 }
770
771 #[test]
772 fn test_find_dict_path_fasta_appending_convention() -> Result<()> {
773 let temp_dir = tempfile::tempdir()?;
775 let fasta_path = temp_dir.path().join("ref.fasta");
776 let dict_path = temp_dir.path().join("ref.fasta.dict");
777
778 std::fs::write(&fasta_path, "")?;
779 std::fs::write(&dict_path, "")?;
780
781 let found = find_dict_path(&fasta_path);
782 assert!(found.is_some());
783 assert_eq!(found.unwrap(), dict_path);
784
785 Ok(())
786 }
787}