use std::borrow::Cow;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use crate::error::VarEffectError;
/// Format version stamped into [`GenomeBinIndex::version`] by
/// [`write_genome_binary`]. [`FastaReader::open_with_patch_aliases`] rejects any
/// index whose stored version differs from this value.
pub const GENOME_BIN_INDEX_VERSION: u32 = 1;
// Refuse to build on anything but a 64-bit target: the reader casts `u64` byte
// offsets to `usize` when slicing the memory map.
#[cfg(not(target_pointer_width = "64"))]
compile_error!("vareffect requires a 64-bit target (genome files may exceed 4 GB)");
/// Sidecar index describing the layout of a flat genome binary.
///
/// Serialized with `rmp_serde` by [`write_genome_binary`] and read back by
/// [`FastaReader::open_with_patch_aliases`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenomeBinIndex {
/// Index format version; the reader requires [`GENOME_BIN_INDEX_VERSION`].
pub version: u32,
/// Caller-supplied build label, stored verbatim by the writer.
pub build: String,
/// Total number of sequence bytes in the binary. The reader compares this
/// against the length of the mapped file and refuses to open on mismatch.
pub expected_size: u64,
/// One entry per contig, in the order the contigs were written.
pub contigs: Vec<ContigEntry>,
}
/// Location of a single contig inside the flat genome binary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContigEntry {
/// Contig name exactly as supplied to the writer; used as the lookup key.
pub name: String,
/// Byte offset of the contig's first base from the start of the binary.
pub offset: u64,
/// Number of bases (bytes) in the contig.
pub length: u64,
}
pub fn write_genome_binary(
contigs: &[(&str, &[u8])],
build: &str,
bin_path: &Path,
idx_path: &Path,
) -> Result<(), VarEffectError> {
let bin_file = File::create(bin_path).map_err(|source| VarEffectError::Io {
path: bin_path.to_path_buf(),
source,
})?;
let mut writer = BufWriter::new(bin_file);
let mut entries = Vec::with_capacity(contigs.len());
let mut offset: u64 = 0;
let mut buf = [0u8; 64 * 1024];
for &(name, seq) in contigs {
let entry_offset = offset;
for chunk in seq.chunks(buf.len()) {
let n = chunk.len();
buf[..n].copy_from_slice(chunk);
buf[..n].make_ascii_uppercase();
for (i, &b) in buf[..n].iter().enumerate() {
if !is_iupac_nucleotide(b) {
return Err(VarEffectError::Malformed(format!(
"non-IUPAC byte 0x{:02X} ('{}') in contig {name} at offset {}",
chunk[i],
chunk[i] as char,
offset + i as u64,
)));
}
}
writer
.write_all(&buf[..n])
.map_err(|source| VarEffectError::Io {
path: bin_path.to_path_buf(),
source,
})?;
offset += n as u64;
}
entries.push(ContigEntry {
name: name.to_string(),
offset: entry_offset,
length: seq.len() as u64,
});
}
writer.flush().map_err(|source| VarEffectError::Io {
path: bin_path.to_path_buf(),
source,
})?;
writer
.into_inner()
.map_err(|e| VarEffectError::Io {
path: bin_path.to_path_buf(),
source: e.into_error(),
})?
.sync_all()
.map_err(|source| VarEffectError::Io {
path: bin_path.to_path_buf(),
source,
})?;
let index = GenomeBinIndex {
version: GENOME_BIN_INDEX_VERSION,
build: build.to_string(),
expected_size: offset,
contigs: entries,
};
let idx_bytes = rmp_serde::to_vec(&index).map_err(|e| VarEffectError::Io {
path: idx_path.to_path_buf(),
source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
})?;
let tmp_idx = idx_path.with_extension("tmp");
std::fs::write(&tmp_idx, &idx_bytes).map_err(|source| VarEffectError::Io {
path: idx_path.to_path_buf(),
source,
})?;
std::fs::rename(&tmp_idx, idx_path).map_err(|source| VarEffectError::Io {
path: idx_path.to_path_buf(),
source,
})?;
Ok(())
}
/// Returns `true` when `b` is one of the fifteen uppercase IUPAC nucleotide
/// codes: the four bases `A C G T`, the wildcard `N`, and the ambiguity codes
/// `R Y S W K M B D H V`.
///
/// Lowercase letters, gap characters and every other byte yield `false`.
#[inline]
pub fn is_iupac_nucleotide(b: u8) -> bool {
    const IUPAC_CODES: &[u8] = b"ACGTNRYSWKMBDHV";
    IUPAC_CODES.contains(&b)
}
/// Contig naming scheme detected from the index when a reader is opened; it
/// decides how caller-supplied chromosome names are translated before lookup.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ContigNaming {
/// At least one contig name starts with `NC_`. Queries are translated with
/// `crate::chrom::ucsc_to_refseq`, then with the optional patch-alias table.
NcbiRefSeq,
/// A contig name starts with `chr` and no `NC_` name was seen. Queries are
/// looked up unchanged.
UcscPrefixed,
/// Fallback when neither prefix is seen. `chrM` is looked up as `MT` and any
/// other leading `chr` is stripped from the query.
EnsemblBare,
}
/// Random-access reader over a flat genome binary produced by
/// [`write_genome_binary`].
///
/// The sequence bytes are memory-mapped; the map and the contig table sit
/// behind `Arc`s so `try_clone` can share them instead of copying.
pub struct FastaReader {
/// Memory map of the whole genome binary.
mmap: Arc<Mmap>,
/// Contig name -> `(byte offset, length)` within `mmap`.
contigs: Arc<HashMap<String, (u64, u64)>>,
/// Naming scheme detected from the contig names in the index.
naming: ContigNaming,
/// UCSC contig name -> RefSeq accession, loaded only when a CSV was supplied
/// and `naming` is `NcbiRefSeq`.
patch_aliases: Option<Arc<HashMap<String, String>>>,
/// Path of the genome binary this reader was opened from.
path: PathBuf,
}
impl FastaReader {
    /// Opens the genome binary at `path` (with its `<path>.idx` sidecar) without
    /// a patch-alias table.
    ///
    /// # Errors
    ///
    /// Same as [`FastaReader::open_with_patch_aliases`].
    pub fn open(path: &Path) -> Result<Self, VarEffectError> {
        Self::open_with_patch_aliases(path, None)
    }

    /// Opens the genome binary at `path`, optionally loading UCSC -> RefSeq
    /// patch-contig aliases from `patch_aliases_csv`.
    ///
    /// The index is read from `path` with `.idx` appended. The alias CSV is only
    /// read when the index uses NCBI RefSeq naming; otherwise it is ignored.
    ///
    /// # Errors
    ///
    /// * [`VarEffectError::IndexNotFound`] if the sidecar index does not exist.
    /// * [`VarEffectError::Io`] if the index, binary or alias CSV cannot be read,
    ///   mapped or decoded.
    /// * [`VarEffectError::Malformed`] if the index version is unsupported, the
    ///   binary's size differs from the size recorded in the index, or a contig
    ///   recorded in the index extends past the end of the binary.
    pub fn open_with_patch_aliases(
        path: &Path,
        patch_aliases_csv: Option<&Path>,
    ) -> Result<Self, VarEffectError> {
        let idx_path = append_idx_extension(path);
        if !idx_path.exists() {
            return Err(VarEffectError::IndexNotFound {
                path: idx_path.display().to_string(),
            });
        }
        let idx_bytes = std::fs::read(&idx_path).map_err(|source| VarEffectError::Io {
            path: idx_path.clone(),
            source,
        })?;
        let index: GenomeBinIndex =
            rmp_serde::from_slice(&idx_bytes).map_err(|e| VarEffectError::Io {
                path: idx_path.clone(),
                source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
            })?;
        if index.version != GENOME_BIN_INDEX_VERSION {
            return Err(VarEffectError::Malformed(format!(
                "unsupported genome binary index version {} (expected {})",
                index.version, GENOME_BIN_INDEX_VERSION,
            )));
        }

        let file = File::open(path).map_err(|source| VarEffectError::Io {
            path: path.to_path_buf(),
            source,
        })?;
        // SAFETY: the mapping is only ever read through `&[u8]`. Soundness relies
        // on the genome binary not being truncated or rewritten by another
        // process while it is mapped, which this code cannot enforce.
        let mmap = unsafe { memmap2::MmapOptions::new().map(&file) }.map_err(|source| {
            VarEffectError::Io {
                path: path.to_path_buf(),
                source,
            }
        })?;
        if mmap.len() as u64 != index.expected_size {
            return Err(VarEffectError::Malformed(format!(
                "genome binary size mismatch: expected {} bytes, got {} — \
                 file may be truncated or corrupt",
                index.expected_size,
                mmap.len(),
            )));
        }

        let mut contigs: HashMap<String, (u64, u64)> =
            HashMap::with_capacity(index.contigs.len());
        let mut naming = ContigNaming::EnsemblBare;
        for entry in &index.contigs {
            // Reject extents that leave the mapped file up front; otherwise a
            // corrupt index would surface later as a slice-index panic.
            let in_bounds = matches!(
                entry.offset.checked_add(entry.length),
                Some(end) if end <= index.expected_size
            );
            if !in_bounds {
                return Err(VarEffectError::Malformed(format!(
                    "contig {} (offset {}, length {}) extends past the end of the \
                     genome binary ({} bytes)",
                    entry.name, entry.offset, entry.length, index.expected_size,
                )));
            }
            // Any `NC_` name wins; `chr` only counts while no `NC_` name was seen.
            if entry.name.starts_with("NC_") {
                naming = ContigNaming::NcbiRefSeq;
            } else if entry.name.starts_with("chr") && naming != ContigNaming::NcbiRefSeq {
                naming = ContigNaming::UcscPrefixed;
            }
            contigs.insert(entry.name.clone(), (entry.offset, entry.length));
        }

        let patch_aliases = match (patch_aliases_csv, naming) {
            (Some(csv_path), ContigNaming::NcbiRefSeq) => {
                Some(Arc::new(load_ucsc_to_refseq_aliases(csv_path)?))
            }
            _ => None,
        };

        Ok(Self {
            mmap: Arc::new(mmap),
            contigs: Arc::new(contigs),
            naming,
            patch_aliases,
            path: path.to_path_buf(),
        })
    }

    /// Returns another reader that shares this reader's memory map, contig
    /// table and alias table (only the `Arc`s and the path are cloned).
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    pub fn try_clone(&self) -> Result<Self, VarEffectError> {
        Ok(Self {
            mmap: Arc::clone(&self.mmap),
            contigs: Arc::clone(&self.contigs),
            naming: self.naming,
            patch_aliases: self.patch_aliases.as_ref().map(Arc::clone),
            path: self.path.clone(),
        })
    }

    /// Returns the bases of `chrom` in the 0-based half-open range `start..end`.
    ///
    /// # Errors
    ///
    /// * [`VarEffectError::ChromNotFound`] if `chrom` does not resolve to a contig.
    /// * [`VarEffectError::CoordinateOutOfRange`] if `start >= end` or `end`
    ///   exceeds the contig length.
    pub fn fetch_sequence(
        &self,
        chrom: &str,
        start: u64,
        end: u64,
    ) -> Result<Vec<u8>, VarEffectError> {
        let (offset, length) = self.contig_extent(chrom)?;
        if start >= end || end > length {
            return Err(VarEffectError::CoordinateOutOfRange {
                chrom: chrom.to_string(),
                start,
                end,
                chrom_len: length,
            });
        }
        let slice_start = (offset + start) as usize;
        let slice_end = (offset + end) as usize;
        Ok(self.mmap[slice_start..slice_end].to_vec())
    }

    /// Currently identical to [`FastaReader::fetch_sequence`]; it simply
    /// delegates to it.
    pub fn fetch_sequence_raw(
        &self,
        chrom: &str,
        start: u64,
        end: u64,
    ) -> Result<Vec<u8>, VarEffectError> {
        self.fetch_sequence(chrom, start, end)
    }

    /// Returns the single base at 0-based position `pos` of `chrom`.
    ///
    /// # Errors
    ///
    /// * [`VarEffectError::ChromNotFound`] if `chrom` does not resolve to a contig.
    /// * [`VarEffectError::CoordinateOutOfRange`] if `pos` is at or past the end
    ///   of the contig.
    pub fn fetch_base(&self, chrom: &str, pos: u64) -> Result<u8, VarEffectError> {
        let (offset, length) = self.contig_extent(chrom)?;
        if pos >= length {
            return Err(VarEffectError::CoordinateOutOfRange {
                chrom: chrom.to_string(),
                start: pos,
                // Saturate so `pos == u64::MAX` cannot overflow while reporting.
                end: pos.saturating_add(1),
                chrom_len: length,
            });
        }
        Ok(self.mmap[(offset + pos) as usize])
    }

    /// Checks whether `ref_allele` matches the reference starting at 0-based
    /// position `pos` of `chrom`, comparing ASCII case-insensitively.
    ///
    /// An empty `ref_allele` yields `Ok(true)` without looking up `chrom`.
    ///
    /// # Errors
    ///
    /// * [`VarEffectError::ChromNotFound`] if `chrom` does not resolve to a contig.
    /// * [`VarEffectError::CoordinateOutOfRange`] if the allele would extend past
    ///   the end of the contig (including when `pos + len` overflows `u64`).
    pub fn verify_ref(
        &self,
        chrom: &str,
        pos: u64,
        ref_allele: &[u8],
    ) -> Result<bool, VarEffectError> {
        if ref_allele.is_empty() {
            return Ok(true);
        }
        let (offset, length) = self.contig_extent(chrom)?;
        let allele_len = ref_allele.len() as u64;
        // Checked addition: a wrapped `pos + len` would slip past the bounds
        // test in release builds and read bytes belonging to another contig.
        let end_pos = pos
            .checked_add(allele_len)
            .filter(|&end| end <= length)
            .ok_or_else(|| VarEffectError::CoordinateOutOfRange {
                chrom: chrom.to_string(),
                start: pos,
                end: pos.saturating_add(allele_len),
                chrom_len: length,
            })?;
        let start = (offset + pos) as usize;
        let end = (offset + end_pos) as usize;
        let slice = &self.mmap[start..end];
        Ok(slice
            .iter()
            .zip(ref_allele.iter())
            .all(|(a, b)| a.eq_ignore_ascii_case(b)))
    }

    /// Returns the length of `chrom` in bases, or `None` if it does not resolve
    /// to a contig.
    pub fn chrom_length(&self, chrom: &str) -> Option<u64> {
        let translated = self.translate_chrom(chrom);
        self.contigs
            .get(translated.as_ref())
            .map(|&(_, length)| length)
    }

    /// Resolves `chrom` (after name translation) to its `(offset, length)` extent
    /// in the mapped binary, or `ChromNotFound` carrying the caller's spelling.
    fn contig_extent(&self, chrom: &str) -> Result<(u64, u64), VarEffectError> {
        let translated = self.translate_chrom(chrom);
        self.contigs
            .get(translated.as_ref())
            .copied()
            .ok_or_else(|| VarEffectError::ChromNotFound {
                chrom: chrom.to_string(),
            })
    }

    /// Maps a caller-supplied chromosome name onto the naming scheme used by the
    /// opened binary. Names that need no translation are returned unchanged.
    fn translate_chrom<'a>(&'a self, chrom: &'a str) -> Cow<'a, str> {
        match self.naming {
            ContigNaming::NcbiRefSeq => {
                let primary = crate::chrom::ucsc_to_refseq(chrom);
                // A different pointer means the helper produced a translation.
                // NOTE(review): presumably `ucsc_to_refseq` hands back its own
                // argument for names it does not know — confirm in `crate::chrom`.
                if !std::ptr::eq(primary.as_ptr(), chrom.as_ptr()) {
                    return Cow::Borrowed(primary);
                }
                if let Some(refseq) = self
                    .patch_aliases
                    .as_ref()
                    .and_then(|aliases| aliases.get(chrom))
                {
                    // The alias table lives as long as `self`, so borrow from it
                    // rather than cloning on every lookup.
                    return Cow::Borrowed(refseq.as_str());
                }
                Cow::Borrowed(chrom)
            }
            ContigNaming::UcscPrefixed => Cow::Borrowed(chrom),
            ContigNaming::EnsemblBare => {
                if chrom == "chrM" {
                    Cow::Borrowed("MT")
                } else if let Some(stripped) = chrom.strip_prefix("chr") {
                    Cow::Borrowed(stripped)
                } else {
                    Cow::Borrowed(chrom)
                }
            }
        }
    }
}
fn load_ucsc_to_refseq_aliases(path: &Path) -> Result<HashMap<String, String>, VarEffectError> {
let file = File::open(path).map_err(|source| VarEffectError::Io {
path: path.to_path_buf(),
source,
})?;
let reader = BufReader::new(file);
let mut map: HashMap<String, String> = HashMap::new();
for line in reader.lines() {
let line = line.map_err(|source| VarEffectError::Io {
path: path.to_path_buf(),
source,
})?;
let trimmed = line.trim();
if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("refseq,") {
continue;
}
let Some((refseq, ucsc)) = trimmed.split_once(',') else {
continue;
};
let refseq = refseq.trim();
let ucsc = ucsc.trim();
if refseq.is_empty() || ucsc.is_empty() {
continue;
}
map.insert(ucsc.to_string(), refseq.to_string());
}
Ok(map)
}
/// Compact `Debug` output: prints the path and naming scheme, but summarizes
/// the alias table as a presence flag and the contig table as a count, and
/// omits the memory map entirely.
impl std::fmt::Debug for FastaReader {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let has_patch_aliases = self.patch_aliases.is_some();
        let contig_count = self.contigs.len();

        let mut out = f.debug_struct("FastaReader");
        out.field("path", &self.path);
        out.field("naming", &self.naming);
        out.field("patch_aliases", &has_patch_aliases);
        out.field("contigs", &contig_count);
        out.finish()
    }
}
/// Returns `path` with the literal suffix `.idx` appended.
///
/// The suffix is added to the whole path string, so an existing extension is
/// kept (`GRCh38.bin` becomes `GRCh38.bin.idx`) rather than replaced.
fn append_idx_extension(path: &Path) -> PathBuf {
    let mut suffixed = path.to_path_buf().into_os_string();
    suffixed.push(".idx");
    suffixed.into()
}
// Unit tests for the genome binary writer and `FastaReader`. Each test builds a
// tiny genome in a temporary directory and reads it back through the public API.
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
// Writes `contigs` to `test.bin` / `test.bin.idx` in a fresh temp dir and opens
// a reader on it. The `TempDir` is returned so the files outlive the call.
fn write_test_genome(contigs: &[(&str, &[u8])]) -> (TempDir, FastaReader) {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("test.bin");
let idx_path = tmp.path().join("test.bin.idx");
write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
let reader = FastaReader::open(&bin_path).unwrap();
(tmp, reader)
}
// Writes a small patch-alias CSV (comment line, header row, two alias rows)
// into `tmp` and returns its path.
fn write_patch_alias_csv(tmp: &TempDir) -> PathBuf {
let csv_path = tmp.path().join("patch_chrom_aliases.csv");
let contents = "\
# GRCh38 patch-contig aliases: RefSeq accession -> UCSC contig name.
refseq,ucsc
NW_009646194.1,chr1_KN196472v1_fix
NT_187633.1,chr22_KI270879v1_alt
";
std::fs::write(&csv_path, contents).unwrap();
csv_path
}
// Bare contig names ("1", "MT") are detected as Ensembl style; "chr1" and
// "chrM" queries are translated to them.
#[test]
fn chrom_name_translation_ensembl_style() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT"), ("MT", b"NNNNACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.naming, ContigNaming::EnsemblBare);
let seq = reader.fetch_sequence("chr1", 0, 4).unwrap();
assert_eq!(seq, b"ACGT");
let seq = reader.fetch_sequence("chrM", 4, 8).unwrap();
assert_eq!(seq, b"ACGT");
}
// "chr"-prefixed contig names are detected as UCSC style and looked up as-is.
#[test]
fn chrom_name_translation_ucsc_style() {
let contigs: &[(&str, &[u8])] = &[("chr1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.naming, ContigNaming::UcscPrefixed);
let seq = reader.fetch_sequence("chr1", 0, 4).unwrap();
assert_eq!(seq, b"ACGT");
}
// Detection must not depend on a "chr1" contig being present.
#[test]
fn ucsc_naming_detected_even_without_chr1() {
let contigs: &[(&str, &[u8])] = &[("chr22", b"ACGT"), ("chrY", b"TTTT")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(
reader.naming,
ContigNaming::UcscPrefixed,
"partial assembly should still be flagged UCSC-style",
);
assert_eq!(reader.fetch_base("chr22", 0).unwrap(), b'A');
assert_eq!(reader.fetch_base("chrY", 0).unwrap(), b'T');
}
// An "NC_" contig selects NCBI naming; "chr1" queries resolve to NC_000001.11.
#[test]
fn ncbi_naming_detected_with_nc_prefix() {
let contigs: &[(&str, &[u8])] = &[
("NC_000001.11", b"ACGTACGTACGTACGT"),
("NW_009646194.1", b"GGGGCCCC"),
];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
assert_eq!(reader.fetch_sequence("chr1", 0, 4).unwrap(), b"ACGT");
assert_eq!(reader.chrom_length("chr1"), Some(16));
}
// NW_/NT_ contigs alongside an NC_ contig still yield NCBI naming.
#[test]
fn mixed_ncbi_naming_nc_plus_nw() {
let contigs: &[(&str, &[u8])] = &[
("NC_000001.11", b"AAAA"),
("NW_009646194.1", b"CCCC"),
("NT_187633.1", b"GGGG"),
];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
}
// Empty (start == end) and inverted (start > end) ranges are rejected.
#[test]
fn coordinate_validation_start_ge_end() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let err = reader.fetch_sequence("chr1", 5, 5).unwrap_err();
assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
let err = reader.fetch_sequence("chr1", 10, 5).unwrap_err();
assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
}
// An end coordinate one past the contig length (17 on a 16-base contig) fails.
#[test]
fn coordinate_validation_out_of_range() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let err = reader.fetch_sequence("chr1", 0, 17).unwrap_err();
assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
}
// Position 16 on a 16-base contig is out of range for a single-base fetch.
#[test]
fn fetch_base_rejects_position_past_end_of_chrom() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let err = reader.fetch_base("chr1", 16).unwrap_err();
assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
}
// A name that resolves to no contig yields ChromNotFound.
#[test]
fn unknown_chrom_returns_err() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let err = reader.fetch_sequence("chrZZ", 0, 4).unwrap_err();
assert!(matches!(err, VarEffectError::ChromNotFound { .. }));
}
// Single-base fetches return the byte at the 0-based position.
#[test]
fn fetch_base_returns_single_byte() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
assert_eq!(reader.fetch_base("chr1", 3).unwrap(), b'T');
}
// Half-open ranges return the expected bases on both contigs.
#[test]
fn fetch_sequence_returns_correct_range() {
let contigs: &[(&str, &[u8])] = &[("chr1", b"ACGTACGTACGTACGT"), ("chrM", b"NNNNACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.fetch_sequence("chr1", 0, 4).unwrap(), b"ACGT");
assert_eq!(reader.fetch_sequence("chr1", 4, 8).unwrap(), b"ACGT");
assert_eq!(reader.fetch_sequence("chrM", 0, 4).unwrap(), b"NNNN");
assert_eq!(reader.fetch_sequence("chrM", 4, 8).unwrap(), b"ACGT");
}
// The writer uppercases input, so lowercase bases come back uppercase.
#[test]
fn fetch_sequence_uppercases_lowercase_input() {
let contigs: &[(&str, &[u8])] = &[("1", b"acgtacgt")];
let (_tmp, reader) = write_test_genome(contigs);
let seq = reader.fetch_sequence("chr1", 0, 8).unwrap();
assert_eq!(seq, b"ACGTACGT");
}
// The raw variant returns the same bytes as `fetch_sequence`.
#[test]
fn fetch_sequence_raw_returns_same_as_fetch_sequence() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let raw = reader.fetch_sequence_raw("chr1", 0, 8).unwrap();
let upper = reader.fetch_sequence("chr1", 0, 8).unwrap();
assert_eq!(raw, upper);
}
// Matching, mismatching, lowercase and empty alleles against the reference.
#[test]
fn verify_ref_match_and_mismatch() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
assert!(reader.verify_ref("chr1", 0, b"ACGT").unwrap());
assert!(!reader.verify_ref("chr1", 0, b"TTTT").unwrap());
assert!(reader.verify_ref("chr1", 0, b"acgt").unwrap());
assert!(reader.verify_ref("chr1", 0, b"").unwrap());
}
// An allele that runs past the contig end is an error, not a `false` result.
#[test]
fn verify_ref_out_of_bounds_returns_err() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let err = reader.verify_ref("chr1", 2, b"ACGT").unwrap_err();
assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
}
// Lengths are reported per contig; unknown names give `None`.
#[test]
fn chrom_length_reports_correct_values() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT"), ("MT", b"NNNNACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.chrom_length("chr1"), Some(16));
assert_eq!(reader.chrom_length("chrM"), Some(12));
assert_eq!(reader.chrom_length("chrZZ"), None);
}
// A cloned reader works on its own and shares the same `Arc`s as the original.
#[test]
fn try_clone_produces_independent_reader_sharing_mmap() {
let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
let (_tmp, reader) = write_test_genome(contigs);
let cloned = reader.try_clone().expect("try_clone must succeed");
assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
assert_eq!(cloned.fetch_base("chr1", 0).unwrap(), b'A');
assert!(
Arc::ptr_eq(&reader.mmap, &cloned.mmap),
"mmap Arc must be shared across clones"
);
assert!(
Arc::ptr_eq(&reader.contigs, &cloned.contigs),
"contigs Arc must be shared across clones"
);
}
// A binary without a sidecar index fails with IndexNotFound.
#[test]
fn missing_idx_reports_index_not_found() {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("no_index.bin");
std::fs::write(&bin_path, b"ACGT").unwrap();
let err = FastaReader::open(&bin_path).unwrap_err();
assert!(matches!(err, VarEffectError::IndexNotFound { .. }));
}
// The index claims 100 bytes but the binary holds 10: open must fail.
#[test]
fn truncated_bin_reports_malformed() {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("truncated.bin");
let idx_path = tmp.path().join("truncated.bin.idx");
let index = GenomeBinIndex {
version: GENOME_BIN_INDEX_VERSION,
build: "test".into(),
expected_size: 100,
contigs: vec![ContigEntry {
name: "chr1".into(),
offset: 0,
length: 100,
}],
};
let idx_bytes = rmp_serde::to_vec(&index).unwrap();
std::fs::write(&idx_path, &idx_bytes).unwrap();
std::fs::write(&bin_path, [b'A'; 10]).unwrap();
let err = FastaReader::open(&bin_path).unwrap_err();
assert!(matches!(err, VarEffectError::Malformed(_)));
}
// An index with an unknown version number (99) is rejected.
#[test]
fn wrong_index_version_reports_malformed() {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("badver.bin");
let idx_path = tmp.path().join("badver.bin.idx");
std::fs::write(&bin_path, b"ACGT").unwrap();
let index = GenomeBinIndex {
version: 99,
build: "test".into(),
expected_size: 4,
contigs: vec![ContigEntry {
name: "chr1".into(),
offset: 0,
length: 4,
}],
};
let idx_bytes = rmp_serde::to_vec(&index).unwrap();
std::fs::write(&idx_path, &idx_bytes).unwrap();
let err = FastaReader::open(&bin_path).unwrap_err();
assert!(matches!(err, VarEffectError::Malformed(_)));
}
// Reads at the end of chr1 and the start of chr2 must not bleed into each other.
#[test]
fn multi_contig_boundary_no_cross_contamination() {
let contigs: &[(&str, &[u8])] = &[("chr1", b"GGGGTTTT"), ("chr2", b"AAAACCCC")];
let (_tmp, reader) = write_test_genome(contigs);
assert_eq!(reader.fetch_base("chr1", 7).unwrap(), b'T');
assert_eq!(reader.fetch_base("chr2", 0).unwrap(), b'A');
assert_eq!(reader.fetch_sequence("chr1", 4, 8).unwrap(), b"TTTT");
assert_eq!(reader.fetch_sequence("chr2", 0, 4).unwrap(), b"AAAA");
}
// Without an alias CSV, a UCSC patch-contig name does not resolve.
#[test]
fn ncbi_fasta_rejects_ucsc_patch_without_alias_csv() {
let contigs: &[(&str, &[u8])] = &[
("NC_000001.11", b"ACGTACGTACGTACGT"),
("NW_009646194.1", b"GGGGCCCC"),
];
let (_tmp, reader) = write_test_genome(contigs);
let err = reader.fetch_base("chr1_KN196472v1_fix", 0).unwrap_err();
assert!(
matches!(err, VarEffectError::ChromNotFound { .. }),
"expected ChromNotFound, got {err:?}",
);
}
// With the alias CSV loaded, the UCSC patch name resolves to its NW_ contig.
#[test]
fn ncbi_fasta_resolves_ucsc_patch_via_alias_csv() {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("ncbi.bin");
let idx_path = tmp.path().join("ncbi.bin.idx");
let contigs: &[(&str, &[u8])] = &[
("NC_000001.11", b"ACGTACGTACGTACGT"),
("NW_009646194.1", b"GGGGCCCC"),
];
write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
let csv_path = write_patch_alias_csv(&tmp);
let reader = FastaReader::open_with_patch_aliases(&bin_path, Some(&csv_path)).unwrap();
assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
assert!(reader.patch_aliases.is_some());
assert_eq!(reader.fetch_base("chr1_KN196472v1_fix", 0).unwrap(), b'G');
assert_eq!(
reader.fetch_sequence("chr1_KN196472v1_fix", 0, 4).unwrap(),
b"GGGG",
);
}
// The alias CSV is ignored when the binary does not use NCBI naming.
#[test]
fn ncbi_fasta_ignores_alias_csv_for_non_ncbi_binary() {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("ens.bin");
let idx_path = tmp.path().join("ens.bin.idx");
let contigs: &[(&str, &[u8])] = &[("1", b"ACGT"), ("MT", b"NNNN")];
write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
let csv_path = write_patch_alias_csv(&tmp);
let reader = FastaReader::open_with_patch_aliases(&bin_path, Some(&csv_path)).unwrap();
assert_eq!(reader.naming, ContigNaming::EnsemblBare);
assert!(
reader.patch_aliases.is_none(),
"patch aliases must not load against a non-NCBI binary",
);
assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
}
// 'X' is not an IUPAC nucleotide code, so the writer refuses the sequence.
#[test]
fn builder_rejects_non_iupac_bytes() {
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("bad.bin");
let idx_path = tmp.path().join("bad.bin.idx");
let err =
write_genome_binary(&[("chr1", b"ACGTXN")], "test", &bin_path, &idx_path).unwrap_err();
assert!(matches!(err, VarEffectError::Malformed(_)));
}
// All fifteen IUPAC codes are accepted and round-trip unchanged.
#[test]
fn builder_accepts_iupac_ambiguity_codes() {
let seq = b"ACGTNRYSWKMBDHV";
let contigs: &[(&str, &[u8])] = &[("chr1", seq.as_slice())];
let (_tmp, reader) = write_test_genome(contigs);
let fetched = reader.fetch_sequence("chr1", 0, seq.len() as u64).unwrap();
assert_eq!(fetched, seq.as_slice());
}
// A plain A/C/G/T/N sequence reads back byte-for-byte.
#[test]
fn builder_round_trips_all_valid_bases() {
let seq = b"ACGTNNNTTTAAACCCGGG";
let contigs: &[(&str, &[u8])] = &[("chr1", seq.as_slice())];
let (_tmp, reader) = write_test_genome(contigs);
let fetched = reader.fetch_sequence("chr1", 0, seq.len() as u64).unwrap();
assert_eq!(fetched, seq.as_slice());
}
// ".idx" is appended after the existing ".bin" extension, not substituted for it.
#[test]
fn append_idx_extension_preserves_existing_ext() {
let out = append_idx_extension(Path::new("/tmp/GRCh38.bin"));
assert_eq!(out, PathBuf::from("/tmp/GRCh38.bin.idx"));
}
}