vareffect/fasta.rs
1//! Indexed reference genome reader backed by a flat memory-mapped binary.
2//!
3//! Thin wrapper around [`memmap2::Mmap`] that serves **0-based half-open**
4//! sequences by **UCSC-style chromosome name** on the **plus strand**. All
5//! three conventions match [`crate::types::TranscriptModel`] so downstream
6//! consumers never need to translate between coordinate systems.
7//!
8//! # On-disk format
9//!
10//! The reader expects a pair of files produced by `vareffect-cli setup`:
11//!
12//! - **`GRCh38.bin`** — flat binary genome. Concatenated, newline-stripped,
13//! uppercased chromosome sequences. One byte per base, standard IUPAC
14//! nucleotide codes (`A`/`C`/`G`/`T`/`N` plus ambiguity codes `R`/`Y`/`S`/
15//! `W`/`K`/`M`/`B`/`D`/`H`/`V`). No headers, no line breaks, no padding
16//! between contigs. ~3.1 GB for GRCh38.p14 (primary + patches).
17//! - **`GRCh38.bin.idx`** — MessagePack-serialized [`GenomeBinIndex`] mapping
18//! each contig name to its `(offset, length)` in the `.bin` file. ~10 KB.
19//!
20//! Use [`write_genome_binary`] to produce these files from raw contig data
21//! (used by `vareffect-cli setup` and by unit tests).
22//!
23//! # Coordinate convention
24//!
25//! All coordinates exposed by [`FastaReader`] are 0-based half-open
26//! `[start, end)` — identical to `TranscriptModel::tx_start` / `tx_end`.
27//! Converting from a VCF 1-based position is `vcf_pos - 1`.
28//!
29//! # Chromosome name translation
30//!
31//! Callers always use UCSC-style names (`chr1`, `chr17`, `chrX`, `chrY`,
32//! `chrM`, `chr9_KN196479v1_fix`, ...) to stay consistent with
33//! `TranscriptModel::chrom`. The on-disk binary may use any of three naming
34//! conventions (inherited from the source FASTA); the reader detects which
35//! one at open time by scanning the index entries:
36//!
37//! - **NCBI RefSeq** (`NC_000001.11`, `NW_*`, ...) — produced by
38//! `vareffect-cli setup` from the NCBI GRCh38.p14 assembly. The reader
39//! translates primary chroms via [`crate::chrom::ucsc_to_refseq`] and
40//! patch contigs via an optional runtime alias CSV loaded through
41//! [`FastaReader::open_with_patch_aliases`].
42//! - **UCSC-prefixed** (`chr1`, `chrM`, ...) — pass-through translation.
43//! - **Ensembl bare** (`1`, `MT`, ...) — the reader strips the `chr` prefix
44//! and maps `chrM -> MT`.
45//!
46//! Patch contigs (`chr9_KN196479v1_fix`, `chr22_KI270879v1_alt`, ...) can
47//! only be served against an NCBI-naming binary when a
48//! `patch_chrom_aliases.csv` is supplied via
49//! [`FastaReader::open_with_patch_aliases`].
50//!
51//! # Thread safety
52//!
53//! [`FastaReader`] is inherently `Send + Sync` — the underlying
54//! [`memmap2::Mmap`] derefs to `&[u8]` with no Mutex required. All threads
55//! can read from the same reader concurrently with zero contention.
56//! [`FastaReader::try_clone`] is retained for API compatibility but simply
57//! clones a handful of `Arc`s — it is no longer needed for parallel
58//! workloads.
59//!
60//! # Soft-masking
61//!
62//! The flat binary stores uppercase IUPAC nucleotide codes. Soft-mask
63//! information (Ensembl lowercase = repeat region) is destroyed at build
64//! time. This matches VEP's internal behavior (which uppercases all fetched
65//! bases) and GA4GH refget v2.0 (specifies uppercase IUPAC).
66//! [`FastaReader::fetch_sequence_raw`] is retained for API compatibility but
67//! returns the same uppercase bytes as [`FastaReader::fetch_sequence`].
68//!
69//! # Usage
70//!
71//! ```no_run
72//! use std::path::Path;
73//! use vareffect::FastaReader;
74//!
75//! // Open the flat binary genome produced by `vareffect-cli setup`.
76//! let reader = FastaReader::open(Path::new("data/vareffect/GRCh38.bin"))?;
77//!
78//! // TP53 c.742C>T lives at chr17:7674221 (1-based VCF) = chr17:7674220 (0-based).
79//! let base = reader.fetch_base("chr17", 7674220)?;
80//! assert_eq!(base, b'C');
81//!
82//! // Patch-contig reads against an NCBI-source binary need the alias CSV.
83//! let reader = FastaReader::open_with_patch_aliases(
84//! Path::new("data/vareffect/GRCh38.bin"),
85//! Some(Path::new("data/vareffect/patch_chrom_aliases.csv")),
86//! )?;
87//! # Ok::<(), vareffect::VarEffectError>(())
88//! ```
89
90use std::borrow::Cow;
91use std::collections::HashMap;
92use std::fs::File;
93use std::io::{BufRead, BufReader, BufWriter, Write};
94use std::path::{Path, PathBuf};
95use std::sync::Arc;
96
97use memmap2::Mmap;
98use serde::{Deserialize, Serialize};
99
100use crate::error::VarEffectError;
101
102// ---------------------------------------------------------------------------
103// Index types (serialized as MessagePack in .bin.idx)
104// ---------------------------------------------------------------------------
105
106/// Current format version for [`GenomeBinIndex`]. Increment on breaking
107/// changes to the on-disk layout. Public so `vareffect-cli` uses the same
108/// constant — there must be a single source of truth for reader/writer
109/// version agreement.
110pub const GENOME_BIN_INDEX_VERSION: u32 = 1;
111
112#[cfg(not(target_pointer_width = "64"))]
113compile_error!("vareffect requires a 64-bit target (genome files may exceed 4 GB)");
114
115/// Flat binary genome index.
116///
117/// Serialized as MessagePack and stored alongside the `.bin` file with an
118/// `.idx` suffix. Read once at [`FastaReader::open`] time.
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct GenomeBinIndex {
121 /// Format version. Must equal [`GENOME_BIN_INDEX_VERSION`] at open time.
122 pub version: u32,
123 /// Assembly build label (e.g. `"GRCh38.p14"`). Informational — not
124 /// currently validated at open time, but available for downstream
125 /// logging and safety checks.
126 pub build: String,
127 /// Expected `.bin` file size in bytes. Verified against the mmap length
128 /// at open time to catch truncation.
129 pub expected_size: u64,
130 /// Contigs in file order. Each entry maps a contig name to its byte
131 /// offset and length in the `.bin` file.
132 pub contigs: Vec<ContigEntry>,
133}
134
135/// One contig in the [`GenomeBinIndex`].
136#[derive(Debug, Clone, Serialize, Deserialize)]
137pub struct ContigEntry {
138 /// Raw contig name from the source FASTA (e.g. `"NC_000001.11"`).
139 pub name: String,
140 /// Byte offset in the `.bin` file where this contig's sequence starts.
141 pub offset: u64,
142 /// Sequence length in bytes (= number of bases).
143 pub length: u64,
144}
145
146// ---------------------------------------------------------------------------
147// Builder
148// ---------------------------------------------------------------------------
149
150/// Build a flat binary genome from raw contig sequences.
151///
152/// Writes two files:
153/// - `bin_path`: concatenated uppercase bases, one byte per base, no separators.
154/// - `idx_path`: MessagePack-serialized [`GenomeBinIndex`].
155///
156/// Used by `vareffect-cli setup` for production builds and by unit tests
157/// for synthetic genomes.
158///
159/// # Base validation
160///
161/// Every byte is uppercased and then checked against the standard IUPAC
162/// nucleotide alphabet (`A`, `C`, `G`, `T`, `N`, `R`, `Y`, `S`, `W`, `K`,
163/// `M`, `B`, `D`, `H`, `V`). Non-alphabetic or non-IUPAC bytes are rejected.
164/// The NCBI GRCh38.p14 assembly uses ambiguity codes (e.g. `M`, `R`, `Y`)
165/// in some patch-scaffold regions; these are preserved verbatim.
166///
167/// # Errors
168///
169/// * [`VarEffectError::Io`] on write failures.
170/// * [`VarEffectError::Malformed`] if any byte is not a valid IUPAC
171/// nucleotide code after uppercasing.
172pub fn write_genome_binary(
173 contigs: &[(&str, &[u8])],
174 build: &str,
175 bin_path: &Path,
176 idx_path: &Path,
177) -> Result<(), VarEffectError> {
178 let bin_file = File::create(bin_path).map_err(|source| VarEffectError::Io {
179 path: bin_path.to_path_buf(),
180 source,
181 })?;
182 let mut writer = BufWriter::new(bin_file);
183
184 let mut entries = Vec::with_capacity(contigs.len());
185 let mut offset: u64 = 0;
186
187 // 64 KB staging buffer for chunk-based uppercase + validation + write.
188 // Avoids per-byte function-call overhead on multi-GB genomes.
189 let mut buf = [0u8; 64 * 1024];
190
191 for &(name, seq) in contigs {
192 let entry_offset = offset;
193 for chunk in seq.chunks(buf.len()) {
194 let n = chunk.len();
195 buf[..n].copy_from_slice(chunk);
196 buf[..n].make_ascii_uppercase();
197 for (i, &b) in buf[..n].iter().enumerate() {
198 if !is_iupac_nucleotide(b) {
199 return Err(VarEffectError::Malformed(format!(
200 "non-IUPAC byte 0x{:02X} ('{}') in contig {name} at offset {}",
201 chunk[i],
202 chunk[i] as char,
203 offset + i as u64,
204 )));
205 }
206 }
207 writer
208 .write_all(&buf[..n])
209 .map_err(|source| VarEffectError::Io {
210 path: bin_path.to_path_buf(),
211 source,
212 })?;
213 offset += n as u64;
214 }
215 entries.push(ContigEntry {
216 name: name.to_string(),
217 offset: entry_offset,
218 length: seq.len() as u64,
219 });
220 }
221
222 writer.flush().map_err(|source| VarEffectError::Io {
223 path: bin_path.to_path_buf(),
224 source,
225 })?;
226 // Ensure data hits disk before writing the index.
227 writer
228 .into_inner()
229 .map_err(|e| VarEffectError::Io {
230 path: bin_path.to_path_buf(),
231 source: e.into_error(),
232 })?
233 .sync_all()
234 .map_err(|source| VarEffectError::Io {
235 path: bin_path.to_path_buf(),
236 source,
237 })?;
238
239 let index = GenomeBinIndex {
240 version: GENOME_BIN_INDEX_VERSION,
241 build: build.to_string(),
242 expected_size: offset,
243 contigs: entries,
244 };
245 let idx_bytes = rmp_serde::to_vec(&index).map_err(|e| VarEffectError::Io {
246 path: idx_path.to_path_buf(),
247 source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
248 })?;
249 // Atomic write: .tmp + rename so a crash never leaves a partial .bin.idx
250 // that the AlreadyPresent fast-path would accept.
251 let tmp_idx = idx_path.with_extension("tmp");
252 std::fs::write(&tmp_idx, &idx_bytes).map_err(|source| VarEffectError::Io {
253 path: idx_path.to_path_buf(),
254 source,
255 })?;
256 std::fs::rename(&tmp_idx, idx_path).map_err(|source| VarEffectError::Io {
257 path: idx_path.to_path_buf(),
258 source,
259 })?;
260
261 Ok(())
262}
263
264/// Check whether a byte is a valid uppercase IUPAC nucleotide code.
265///
266/// Accepts the 15 standard codes: the four canonical bases (`A`, `C`, `G`,
267/// `T`), the fully-ambiguous placeholder (`N`), and the ten two-/three-base
268/// ambiguity codes (`R`, `Y`, `S`, `W`, `K`, `M`, `B`, `D`, `H`, `V`).
269/// The NCBI GRCh38.p14 assembly uses several of these in patch-scaffold
270/// regions; rejecting them would prevent building a complete genome binary.
271#[inline]
272pub fn is_iupac_nucleotide(b: u8) -> bool {
273 matches!(
274 b,
275 b'A' | b'C'
276 | b'G'
277 | b'T'
278 | b'N'
279 | b'R'
280 | b'Y'
281 | b'S'
282 | b'W'
283 | b'K'
284 | b'M'
285 | b'B'
286 | b'D'
287 | b'H'
288 | b'V'
289 )
290}
291
292// ---------------------------------------------------------------------------
293// Contig naming detection
294// ---------------------------------------------------------------------------
295
296/// On-disk contig naming style. Detected by scanning the [`GenomeBinIndex`]
297/// contig names at [`FastaReader::open`] time; drives the translation ladder
298/// inside [`FastaReader::translate_chrom`].
299///
300/// Detection priority: `NC_*` -> `NcbiRefSeq` beats a later `chr*` ->
301/// `UcscPrefixed` so a mixed-naming source (e.g. `NC_*` primaries + `NW_*`
302/// patches) still resolves toward the translation-heavy branch.
303#[derive(Debug, Clone, Copy, PartialEq, Eq)]
304pub(crate) enum ContigNaming {
305 /// Contigs are NCBI RefSeq accessions (`NC_000001.11`, `NW_*`, `NT_*`).
306 NcbiRefSeq,
307 /// Contigs are UCSC-style (`chr1`, `chrM`, `chr9_KN196479v1_fix`).
308 UcscPrefixed,
309 /// Contigs are bare Ensembl names (`1`, `MT`, `X`, `Y`).
310 EnsemblBare,
311}
312
313// ---------------------------------------------------------------------------
314// FastaReader
315// ---------------------------------------------------------------------------
316
317/// Memory-mapped reference genome reader for random-access sequence retrieval.
318///
319/// Construct with [`FastaReader::open`] (or
320/// [`FastaReader::open_with_patch_aliases`] when patch-contig lookups against
321/// an NCBI-source binary are needed), then call
322/// [`FastaReader::fetch_sequence`] to pull bases by `(chrom, start, end)`.
323/// Coordinates are 0-based half-open; chromosome names are UCSC-style.
324///
325/// See the module docs for details on the on-disk format, coordinate
326/// conventions, chromosome name translation, and thread safety.
327pub struct FastaReader {
328 /// Memory-mapped genome binary. `Mmap` derefs to `&[u8]` and is
329 /// `Send + Sync`, so no Mutex is needed for concurrent reads.
330 mmap: Arc<Mmap>,
331
332 /// Per-contig `(byte_offset, length)` keyed by the **raw index name**
333 /// (the exact name stored in the `.bin.idx`). The translation ladder
334 /// in [`FastaReader::translate_chrom`] converts caller-side UCSC names
335 /// *into* raw index keys before probing this map.
336 contigs: Arc<HashMap<String, (u64, u64)>>,
337
338 /// Detected on-disk naming convention. See [`ContigNaming`].
339 naming: ContigNaming,
340
341 /// Optional UCSC -> RefSeq patch alias table. Only populated when the
342 /// caller supplies a `patch_chrom_aliases.csv` to
343 /// [`FastaReader::open_with_patch_aliases`] *and* the binary uses
344 /// [`ContigNaming::NcbiRefSeq`].
345 patch_aliases: Option<Arc<HashMap<String, String>>>,
346
347 /// Original `.bin` path, kept for diagnostic messages.
348 path: PathBuf,
349}
350
351impl FastaReader {
352 /// Open a flat binary genome file without a patch-contig alias table.
353 ///
354 /// This is the common-case constructor for callers that only need
355 /// primary-chromosome reads (`chr1`..`chrM`). If the binary uses NCBI
356 /// RefSeq naming and you need patch-contig reads as well, use
357 /// [`FastaReader::open_with_patch_aliases`] instead.
358 ///
359 /// The expected on-disk layout is:
360 /// - `path` — the `.bin` file (flat binary genome)
361 /// - `{path}.idx` — the MessagePack index (sibling file)
362 ///
363 /// # Errors
364 ///
365 /// * [`VarEffectError::IndexNotFound`] if the `.idx` sidecar is missing.
366 /// * [`VarEffectError::Io`] on I/O or deserialization errors.
367 /// * [`VarEffectError::Malformed`] if the index version is unsupported
368 /// or the `.bin` file size doesn't match the expected size.
369 pub fn open(path: &Path) -> Result<Self, VarEffectError> {
370 Self::open_with_patch_aliases(path, None)
371 }
372
373 /// Open a flat binary genome file, optionally with a patch-contig alias
374 /// table for NCBI-source binaries.
375 ///
376 /// `patch_aliases_csv` is a path to a `refseq,ucsc` CSV produced by
377 /// `vareffect-cli setup`. When supplied *and* the binary uses
378 /// NCBI RefSeq contig naming, the reader loads the CSV into a
379 /// UCSC -> RefSeq map for the second tier of the translation ladder.
380 /// For any other combination, the argument is silently ignored.
381 ///
382 /// See [`FastaReader::open`] for the expected on-disk layout and error
383 /// conditions.
384 pub fn open_with_patch_aliases(
385 path: &Path,
386 patch_aliases_csv: Option<&Path>,
387 ) -> Result<Self, VarEffectError> {
388 // 1. Locate and parse the .bin.idx sidecar.
389 let idx_path = append_idx_extension(path);
390 if !idx_path.exists() {
391 return Err(VarEffectError::IndexNotFound {
392 path: idx_path.display().to_string(),
393 });
394 }
395 let idx_bytes = std::fs::read(&idx_path).map_err(|source| VarEffectError::Io {
396 path: idx_path.clone(),
397 source,
398 })?;
399 let index: GenomeBinIndex =
400 rmp_serde::from_slice(&idx_bytes).map_err(|e| VarEffectError::Io {
401 path: idx_path.clone(),
402 source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
403 })?;
404
405 // 2. Validate index version.
406 if index.version != GENOME_BIN_INDEX_VERSION {
407 return Err(VarEffectError::Malformed(format!(
408 "unsupported genome binary index version {} (expected {})",
409 index.version, GENOME_BIN_INDEX_VERSION,
410 )));
411 }
412
413 // 3. Memory-map the .bin file.
414 let file = File::open(path).map_err(|source| VarEffectError::Io {
415 path: path.to_path_buf(),
416 source,
417 })?;
418 // SAFETY: The .bin file is opened read-only and is write-once
419 // (produced by `vareffect-cli setup`, never modified in place).
420 // Hot-reload uses ArcSwap to atomically swap entire FastaReader
421 // instances rather than modifying the underlying file.
422 let mmap = unsafe {
423 memmap2::MmapOptions::new()
424 .map(&file)
425 .map_err(|source| VarEffectError::Io {
426 path: path.to_path_buf(),
427 source,
428 })?
429 };
430
431 // 4. Truncation guard: verify the mmap length matches the expected
432 // size stored in the index.
433 if mmap.len() as u64 != index.expected_size {
434 return Err(VarEffectError::Malformed(format!(
435 "genome binary size mismatch: expected {} bytes, got {} — \
436 file may be truncated or corrupt",
437 index.expected_size,
438 mmap.len(),
439 )));
440 }
441
442 // 5. Build the contigs lookup and detect naming convention.
443 let mut contigs: HashMap<String, (u64, u64)> = HashMap::with_capacity(index.contigs.len());
444 let mut naming = ContigNaming::EnsemblBare;
445
446 for entry in &index.contigs {
447 // Detection: NC_* wins outright; chr* flips to UcscPrefixed
448 // only if we haven't already seen an NC_* record.
449 if entry.name.starts_with("NC_") {
450 naming = ContigNaming::NcbiRefSeq;
451 } else if entry.name.starts_with("chr") && naming != ContigNaming::NcbiRefSeq {
452 naming = ContigNaming::UcscPrefixed;
453 }
454
455 contigs.insert(entry.name.clone(), (entry.offset, entry.length));
456 }
457
458 // 6. Load the patch alias CSV only when it was provided *and* the
459 // binary actually uses NCBI RefSeq naming.
460 let patch_aliases = match (patch_aliases_csv, naming) {
461 (Some(csv_path), ContigNaming::NcbiRefSeq) => {
462 Some(Arc::new(load_ucsc_to_refseq_aliases(csv_path)?))
463 }
464 _ => None,
465 };
466
467 Ok(Self {
468 mmap: Arc::new(mmap),
469 contigs: Arc::new(contigs),
470 naming,
471 patch_aliases,
472 path: path.to_path_buf(),
473 })
474 }
475
476 /// Mint an additional reader handle for the same genome binary.
477 ///
478 /// With the memory-mapped backend this is trivially cheap — it clones
479 /// a handful of `Arc`s. The `Result` wrapper is retained for API
480 /// compatibility with callers written for the previous seek-based
481 /// reader; it always returns `Ok`.
482 ///
483 /// **Note:** With the mmap backend, `try_clone` is no longer needed for
484 /// parallel workloads. All threads can read from the same `FastaReader`
485 /// concurrently with zero contention.
486 pub fn try_clone(&self) -> Result<Self, VarEffectError> {
487 Ok(Self {
488 mmap: Arc::clone(&self.mmap),
489 contigs: Arc::clone(&self.contigs),
490 naming: self.naming,
491 patch_aliases: self.patch_aliases.as_ref().map(Arc::clone),
492 path: self.path.clone(),
493 })
494 }
495
496 /// Fetch a genomic sequence as uppercase ASCII bytes.
497 ///
498 /// Coordinates are 0-based half-open `[start, end)`. The returned
499 /// sequence is the plus-strand reference — strand-aware complementing
500 /// is the caller's responsibility (see `crate::types::Strand`).
501 ///
502 /// The flat binary stores uppercase bases only, so the returned bytes
503 /// are always uppercase IUPAC nucleotide codes (typically `A`/`C`/`G`/
504 /// `T`/`N`, occasionally ambiguity codes like `M`/`R`/`Y` in some
505 /// GRCh38 patch regions).
506 ///
507 /// # Errors
508 ///
509 /// * [`VarEffectError::ChromNotFound`] if `chrom` is not present in the
510 /// genome index.
511 /// * [`VarEffectError::CoordinateOutOfRange`] if `start >= end` or `end`
512 /// exceeds the chromosome length.
513 pub fn fetch_sequence(
514 &self,
515 chrom: &str,
516 start: u64,
517 end: u64,
518 ) -> Result<Vec<u8>, VarEffectError> {
519 let translated = self.translate_chrom(chrom);
520
521 let &(offset, length) =
522 self.contigs
523 .get(translated.as_ref())
524 .ok_or_else(|| VarEffectError::ChromNotFound {
525 chrom: chrom.to_string(),
526 })?;
527
528 if start >= end || end > length {
529 return Err(VarEffectError::CoordinateOutOfRange {
530 chrom: chrom.to_string(),
531 start,
532 end,
533 chrom_len: length,
534 });
535 }
536
537 // Bounds validated — mmap indexing cannot panic.
538 let slice_start = (offset + start) as usize;
539 let slice_end = (offset + end) as usize;
540 Ok(self.mmap[slice_start..slice_end].to_vec())
541 }
542
543 /// Fetch a genomic sequence as raw ASCII bytes.
544 ///
545 /// With the flat binary backend, this is functionally identical to
546 /// [`FastaReader::fetch_sequence`] — the binary stores uppercase bases
547 /// only, so soft-mask (lowercase) information is not preserved.
548 ///
549 /// Retained for API compatibility. Callers that need repeat-region
550 /// annotations should use a dedicated RepeatMasker store rather than
551 /// relying on FASTA soft-masking.
552 pub fn fetch_sequence_raw(
553 &self,
554 chrom: &str,
555 start: u64,
556 end: u64,
557 ) -> Result<Vec<u8>, VarEffectError> {
558 self.fetch_sequence(chrom, start, end)
559 }
560
561 /// Fetch a single base at the given 0-based position.
562 ///
563 /// Returns the uppercase ASCII byte. With the mmap backend, this is a
564 /// single pointer dereference (~5 ns).
565 ///
566 /// # Errors
567 ///
568 /// Same as [`FastaReader::fetch_sequence`].
569 pub fn fetch_base(&self, chrom: &str, pos: u64) -> Result<u8, VarEffectError> {
570 let translated = self.translate_chrom(chrom);
571 let &(offset, length) =
572 self.contigs
573 .get(translated.as_ref())
574 .ok_or_else(|| VarEffectError::ChromNotFound {
575 chrom: chrom.to_string(),
576 })?;
577
578 if pos >= length {
579 return Err(VarEffectError::CoordinateOutOfRange {
580 chrom: chrom.to_string(),
581 start: pos,
582 end: pos + 1,
583 chrom_len: length,
584 });
585 }
586
587 Ok(self.mmap[(offset + pos) as usize])
588 }
589
590 /// Verify that the reference allele at a genomic position matches the
591 /// genome binary. Returns `true` on match, `false` on mismatch.
592 ///
593 /// Both the stored bases and `ref_allele` are compared case-insensitively.
594 /// I/O or coordinate errors propagate as `Err` — only a clean byte-level
595 /// mismatch returns `Ok(false)`. This mirrors VCF `--check-ref` semantics.
596 ///
597 /// With the mmap backend, this is a zero-copy slice comparison — no
598 /// `Vec<u8>` allocation.
599 pub fn verify_ref(
600 &self,
601 chrom: &str,
602 pos: u64,
603 ref_allele: &[u8],
604 ) -> Result<bool, VarEffectError> {
605 if ref_allele.is_empty() {
606 return Ok(true);
607 }
608
609 let translated = self.translate_chrom(chrom);
610 let &(offset, length) =
611 self.contigs
612 .get(translated.as_ref())
613 .ok_or_else(|| VarEffectError::ChromNotFound {
614 chrom: chrom.to_string(),
615 })?;
616
617 let end_pos = pos + ref_allele.len() as u64;
618 if end_pos > length {
619 return Err(VarEffectError::CoordinateOutOfRange {
620 chrom: chrom.to_string(),
621 start: pos,
622 end: end_pos,
623 chrom_len: length,
624 });
625 }
626
627 // Bounds validated — mmap indexing cannot panic.
628 let start = (offset + pos) as usize;
629 let end = start + ref_allele.len();
630 let slice = &self.mmap[start..end];
631 Ok(slice
632 .iter()
633 .zip(ref_allele.iter())
634 .all(|(a, b)| a.eq_ignore_ascii_case(b)))
635 }
636
637 /// Return the length of `chrom` in bases, or `None` if the chromosome is
638 /// not present in the genome index.
639 ///
640 /// O(1) — reads from the contig table built at [`FastaReader::open`] time.
641 pub fn chrom_length(&self, chrom: &str) -> Option<u64> {
642 let translated = self.translate_chrom(chrom);
643 self.contigs
644 .get(translated.as_ref())
645 .map(|&(_, length)| length)
646 }
647
648 // ---------------------------------------------------------------
649 // Internal helpers
650 // ---------------------------------------------------------------
651
652 /// Translate a caller-supplied UCSC chromosome name into the raw index
653 /// name. The three-branch dispatcher covers every supported on-disk
654 /// naming; see [`ContigNaming`] for the detection logic.
655 ///
656 /// Returns a `Cow` so the common-case branches (NCBI primary chrom,
657 /// UCSC pass-through, Ensembl strip-`chr`) allocate nothing. The
658 /// NcbiRefSeq patch-alias tier is the only path that allocates: the
659 /// alias map stores owned `String` values, and returning a borrow of
660 /// that string would require the returned `Cow`'s lifetime to extend
661 /// beyond the caller-supplied `chrom`, which the function signature
662 /// doesn't allow.
663 fn translate_chrom<'a>(&'a self, chrom: &'a str) -> Cow<'a, str> {
664 match self.naming {
665 ContigNaming::NcbiRefSeq => {
666 // Tier 1: primary chroms via the 25-entry const table.
667 // `ucsc_to_refseq` returns `&'static str` on a hit, so the
668 // `Cow::Borrowed` here has a 'static lifetime — always safe
669 // to coerce into 'a.
670 let primary = crate::chrom::ucsc_to_refseq(chrom);
671 if !std::ptr::eq(primary.as_ptr(), chrom.as_ptr()) {
672 // `primary` points into `NC_TO_UCSC`, not into `chrom`
673 // — the pointer compare distinguishes "table hit" from
674 // "pass-through" without a second string compare.
675 return Cow::Borrowed(primary);
676 }
677 // Tier 2: patch contigs via the runtime alias map, if one
678 // was supplied at open time.
679 if let Some(aliases) = &self.patch_aliases
680 && let Some(refseq) = aliases.get(chrom)
681 {
682 return Cow::Owned(refseq.clone());
683 }
684 // Tier 3: unknown name — pass through. The caller will get
685 // a `ChromNotFound` from the contigs lookup, which is the
686 // desired failure mode for typos or unknown patches.
687 Cow::Borrowed(chrom)
688 }
689 ContigNaming::UcscPrefixed => {
690 // Binary uses UCSC names — no translation needed.
691 Cow::Borrowed(chrom)
692 }
693 ContigNaming::EnsemblBare => {
694 if chrom == "chrM" {
695 // Mitochondrial contig is special: Ensembl calls it `MT`.
696 Cow::Owned("MT".to_string())
697 } else if let Some(stripped) = chrom.strip_prefix("chr") {
698 Cow::Borrowed(stripped)
699 } else {
700 // Caller passed a name that doesn't start with `chr`;
701 // hand it through unchanged so tests and ad-hoc
702 // callers can still query by the raw index name.
703 Cow::Borrowed(chrom)
704 }
705 }
706 }
707 }
708}
709
710/// Load `patch_chrom_aliases.csv` into a UCSC -> RefSeq lookup map.
711///
712/// The on-disk CSV format is `refseq,ucsc` (see
713/// `vareffect-cli::builders::patch_chrom_aliases`), so this loader inverts
714/// the direction at parse time. The same tolerances as the builder-side
715/// loader apply: `#` comments, blank lines, the header row, and leading/
716/// trailing whitespace are skipped.
717///
718/// We deliberately duplicate ~25 LOC of CSV parsing here rather than
719/// depending on `vareffect-cli`: the `vareffect` crate is designed to be
720/// publishable standalone on crates.io, and a dependency on the workspace
721/// CLI would defeat that.
722fn load_ucsc_to_refseq_aliases(path: &Path) -> Result<HashMap<String, String>, VarEffectError> {
723 let file = File::open(path).map_err(|source| VarEffectError::Io {
724 path: path.to_path_buf(),
725 source,
726 })?;
727 let reader = BufReader::new(file);
728
729 let mut map: HashMap<String, String> = HashMap::new();
730 for line in reader.lines() {
731 let line = line.map_err(|source| VarEffectError::Io {
732 path: path.to_path_buf(),
733 source,
734 })?;
735 let trimmed = line.trim();
736 if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("refseq,") {
737 continue;
738 }
739 let Some((refseq, ucsc)) = trimmed.split_once(',') else {
740 continue;
741 };
742 let refseq = refseq.trim();
743 let ucsc = ucsc.trim();
744 if refseq.is_empty() || ucsc.is_empty() {
745 continue;
746 }
747 // Inverted direction: key on UCSC, value is RefSeq.
748 map.insert(ucsc.to_string(), refseq.to_string());
749 }
750 Ok(map)
751}
752
753impl std::fmt::Debug for FastaReader {
754 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
755 f.debug_struct("FastaReader")
756 .field("path", &self.path)
757 .field("naming", &self.naming)
758 .field("patch_aliases", &self.patch_aliases.is_some())
759 .field("contigs", &self.contigs.len())
760 .finish()
761 }
762}
763
764/// Derive the `.bin.idx` sidecar path from the `.bin` path by appending
765/// `.idx` to the full filename.
766fn append_idx_extension(path: &Path) -> PathBuf {
767 let mut os = path.as_os_str().to_os_string();
768 os.push(".idx");
769 PathBuf::from(os)
770}
771
772// ---------------------------------------------------------------------------
773// Tests
774// ---------------------------------------------------------------------------
775
776#[cfg(test)]
777mod tests {
778 use super::*;
779 use tempfile::TempDir;
780
781 /// Build a flat binary genome from contigs and open it as a `FastaReader`.
782 fn write_test_genome(contigs: &[(&str, &[u8])]) -> (TempDir, FastaReader) {
783 let tmp = TempDir::new().unwrap();
784 let bin_path = tmp.path().join("test.bin");
785 let idx_path = tmp.path().join("test.bin.idx");
786 write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
787 let reader = FastaReader::open(&bin_path).unwrap();
788 (tmp, reader)
789 }
790
791 /// Write a `refseq,ucsc` alias CSV to a tempdir and return the path.
792 fn write_patch_alias_csv(tmp: &TempDir) -> PathBuf {
793 let csv_path = tmp.path().join("patch_chrom_aliases.csv");
794 let contents = "\
795# GRCh38 patch-contig aliases: RefSeq accession -> UCSC contig name.
796refseq,ucsc
797NW_009646194.1,chr1_KN196472v1_fix
798NT_187633.1,chr22_KI270879v1_alt
799";
800 std::fs::write(&csv_path, contents).unwrap();
801 csv_path
802 }
803
804 // -- Naming detection --------------------------------------------------
805
806 #[test]
807 fn chrom_name_translation_ensembl_style() {
808 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT"), ("MT", b"NNNNACGTACGT")];
809 let (_tmp, reader) = write_test_genome(contigs);
810
811 assert_eq!(reader.naming, ContigNaming::EnsemblBare);
812
813 // chr1 -> 1
814 let seq = reader.fetch_sequence("chr1", 0, 4).unwrap();
815 assert_eq!(seq, b"ACGT");
816
817 // chrM -> MT, start at position 4 (the first `A`)
818 let seq = reader.fetch_sequence("chrM", 4, 8).unwrap();
819 assert_eq!(seq, b"ACGT");
820 }
821
822 #[test]
823 fn chrom_name_translation_ucsc_style() {
824 let contigs: &[(&str, &[u8])] = &[("chr1", b"ACGTACGTACGTACGT")];
825 let (_tmp, reader) = write_test_genome(contigs);
826
827 assert_eq!(reader.naming, ContigNaming::UcscPrefixed);
828
829 // UCSC name passes through unchanged.
830 let seq = reader.fetch_sequence("chr1", 0, 4).unwrap();
831 assert_eq!(seq, b"ACGT");
832 }
833
834 #[test]
835 fn ucsc_naming_detected_even_without_chr1() {
836 // A partial assembly without chr1 should still detect UCSC style.
837 let contigs: &[(&str, &[u8])] = &[("chr22", b"ACGT"), ("chrY", b"TTTT")];
838 let (_tmp, reader) = write_test_genome(contigs);
839
840 assert_eq!(
841 reader.naming,
842 ContigNaming::UcscPrefixed,
843 "partial assembly should still be flagged UCSC-style",
844 );
845 assert_eq!(reader.fetch_base("chr22", 0).unwrap(), b'A');
846 assert_eq!(reader.fetch_base("chrY", 0).unwrap(), b'T');
847 }
848
849 #[test]
850 fn ncbi_naming_detected_with_nc_prefix() {
851 let contigs: &[(&str, &[u8])] = &[
852 ("NC_000001.11", b"ACGTACGTACGTACGT"),
853 ("NW_009646194.1", b"GGGGCCCC"),
854 ];
855 let (_tmp, reader) = write_test_genome(contigs);
856
857 assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
858 // Caller supplies UCSC form — reader translates to NC_000001.11.
859 assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
860 assert_eq!(reader.fetch_sequence("chr1", 0, 4).unwrap(), b"ACGT");
861 assert_eq!(reader.chrom_length("chr1"), Some(16));
862 }
863
864 #[test]
865 fn mixed_ncbi_naming_nc_plus_nw() {
866 // Both NC_ and NW_ contigs — NW_ doesn't match NC_ or chr checks,
867 // naming should stay NcbiRefSeq.
868 let contigs: &[(&str, &[u8])] = &[
869 ("NC_000001.11", b"AAAA"),
870 ("NW_009646194.1", b"CCCC"),
871 ("NT_187633.1", b"GGGG"),
872 ];
873 let (_tmp, reader) = write_test_genome(contigs);
874 assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
875 }
876
877 // -- Coordinate validation ---------------------------------------------
878
879 #[test]
880 fn coordinate_validation_start_ge_end() {
881 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
882 let (_tmp, reader) = write_test_genome(contigs);
883
884 let err = reader.fetch_sequence("chr1", 5, 5).unwrap_err();
885 assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
886
887 let err = reader.fetch_sequence("chr1", 10, 5).unwrap_err();
888 assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
889 }
890
891 #[test]
892 fn coordinate_validation_out_of_range() {
893 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
894 let (_tmp, reader) = write_test_genome(contigs);
895
896 // Contig is 16 bases, so end > 16 is out of range.
897 let err = reader.fetch_sequence("chr1", 0, 17).unwrap_err();
898 assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
899 }
900
901 #[test]
902 fn fetch_base_rejects_position_past_end_of_chrom() {
903 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
904 let (_tmp, reader) = write_test_genome(contigs);
905 // 16 bases — valid positions are 0..16.
906 let err = reader.fetch_base("chr1", 16).unwrap_err();
907 assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
908 }
909
910 #[test]
911 fn unknown_chrom_returns_err() {
912 let contigs: &[(&str, &[u8])] = &[("1", b"ACGT")];
913 let (_tmp, reader) = write_test_genome(contigs);
914
915 let err = reader.fetch_sequence("chrZZ", 0, 4).unwrap_err();
916 assert!(matches!(err, VarEffectError::ChromNotFound { .. }));
917 }
918
919 // -- Fetch operations --------------------------------------------------
920
921 #[test]
922 fn fetch_base_returns_single_byte() {
923 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
924 let (_tmp, reader) = write_test_genome(contigs);
925
926 assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
927 assert_eq!(reader.fetch_base("chr1", 3).unwrap(), b'T');
928 }
929
930 #[test]
931 fn fetch_sequence_returns_correct_range() {
932 let contigs: &[(&str, &[u8])] = &[("chr1", b"ACGTACGTACGTACGT"), ("chrM", b"NNNNACGTACGT")];
933 let (_tmp, reader) = write_test_genome(contigs);
934
935 assert_eq!(reader.fetch_sequence("chr1", 0, 4).unwrap(), b"ACGT");
936 assert_eq!(reader.fetch_sequence("chr1", 4, 8).unwrap(), b"ACGT");
937 assert_eq!(reader.fetch_sequence("chrM", 0, 4).unwrap(), b"NNNN");
938 assert_eq!(reader.fetch_sequence("chrM", 4, 8).unwrap(), b"ACGT");
939 }
940
941 #[test]
942 fn fetch_sequence_uppercases_lowercase_input() {
943 // Builder uppercases — verify lowercase input survives.
944 let contigs: &[(&str, &[u8])] = &[("1", b"acgtacgt")];
945 let (_tmp, reader) = write_test_genome(contigs);
946 let seq = reader.fetch_sequence("chr1", 0, 8).unwrap();
947 assert_eq!(seq, b"ACGTACGT");
948 }
949
950 #[test]
951 fn fetch_sequence_raw_returns_same_as_fetch_sequence() {
952 // With flat binary, raw and uppercased are identical.
953 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGT")];
954 let (_tmp, reader) = write_test_genome(contigs);
955
956 let raw = reader.fetch_sequence_raw("chr1", 0, 8).unwrap();
957 let upper = reader.fetch_sequence("chr1", 0, 8).unwrap();
958 assert_eq!(raw, upper);
959 }
960
961 // -- verify_ref --------------------------------------------------------
962
963 #[test]
964 fn verify_ref_match_and_mismatch() {
965 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
966 let (_tmp, reader) = write_test_genome(contigs);
967
968 assert!(reader.verify_ref("chr1", 0, b"ACGT").unwrap());
969 assert!(!reader.verify_ref("chr1", 0, b"TTTT").unwrap());
970
971 // Case-insensitive: caller passes lowercase, binary is uppercase.
972 assert!(reader.verify_ref("chr1", 0, b"acgt").unwrap());
973
974 // Empty allele trivially verifies.
975 assert!(reader.verify_ref("chr1", 0, b"").unwrap());
976 }
977
978 #[test]
979 fn verify_ref_out_of_bounds_returns_err() {
980 let contigs: &[(&str, &[u8])] = &[("1", b"ACGT")];
981 let (_tmp, reader) = write_test_genome(contigs);
982
983 // pos + ref_allele.len() > length
984 let err = reader.verify_ref("chr1", 2, b"ACGT").unwrap_err();
985 assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
986 }
987
988 // -- chrom_length ------------------------------------------------------
989
990 #[test]
991 fn chrom_length_reports_correct_values() {
992 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT"), ("MT", b"NNNNACGTACGT")];
993 let (_tmp, reader) = write_test_genome(contigs);
994
995 assert_eq!(reader.chrom_length("chr1"), Some(16));
996 assert_eq!(reader.chrom_length("chrM"), Some(12));
997 assert_eq!(reader.chrom_length("chrZZ"), None);
998 }
999
1000 // -- try_clone ---------------------------------------------------------
1001
1002 #[test]
1003 fn try_clone_produces_independent_reader_sharing_mmap() {
1004 let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
1005 let (_tmp, reader) = write_test_genome(contigs);
1006 let cloned = reader.try_clone().expect("try_clone must succeed");
1007
1008 assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
1009 assert_eq!(cloned.fetch_base("chr1", 0).unwrap(), b'A');
1010
1011 // Shared Arc -> same allocation.
1012 assert!(
1013 Arc::ptr_eq(&reader.mmap, &cloned.mmap),
1014 "mmap Arc must be shared across clones"
1015 );
1016 assert!(
1017 Arc::ptr_eq(&reader.contigs, &cloned.contigs),
1018 "contigs Arc must be shared across clones"
1019 );
1020 }
1021
1022 // -- Index validation --------------------------------------------------
1023
1024 #[test]
1025 fn missing_idx_reports_index_not_found() {
1026 let tmp = TempDir::new().unwrap();
1027 let bin_path = tmp.path().join("no_index.bin");
1028 std::fs::write(&bin_path, b"ACGT").unwrap();
1029
1030 let err = FastaReader::open(&bin_path).unwrap_err();
1031 assert!(matches!(err, VarEffectError::IndexNotFound { .. }));
1032 }
1033
1034 #[test]
1035 fn truncated_bin_reports_malformed() {
1036 let tmp = TempDir::new().unwrap();
1037 let bin_path = tmp.path().join("truncated.bin");
1038 let idx_path = tmp.path().join("truncated.bin.idx");
1039
1040 // Write a valid index claiming 100 bytes...
1041 let index = GenomeBinIndex {
1042 version: GENOME_BIN_INDEX_VERSION,
1043 build: "test".into(),
1044 expected_size: 100,
1045 contigs: vec![ContigEntry {
1046 name: "chr1".into(),
1047 offset: 0,
1048 length: 100,
1049 }],
1050 };
1051 let idx_bytes = rmp_serde::to_vec(&index).unwrap();
1052 std::fs::write(&idx_path, &idx_bytes).unwrap();
1053 // ...but only write 10 bytes to the bin file.
1054 std::fs::write(&bin_path, [b'A'; 10]).unwrap();
1055
1056 let err = FastaReader::open(&bin_path).unwrap_err();
1057 assert!(matches!(err, VarEffectError::Malformed(_)));
1058 }
1059
1060 #[test]
1061 fn wrong_index_version_reports_malformed() {
1062 let tmp = TempDir::new().unwrap();
1063 let bin_path = tmp.path().join("badver.bin");
1064 let idx_path = tmp.path().join("badver.bin.idx");
1065
1066 std::fs::write(&bin_path, b"ACGT").unwrap();
1067 let index = GenomeBinIndex {
1068 version: 99,
1069 build: "test".into(),
1070 expected_size: 4,
1071 contigs: vec![ContigEntry {
1072 name: "chr1".into(),
1073 offset: 0,
1074 length: 4,
1075 }],
1076 };
1077 let idx_bytes = rmp_serde::to_vec(&index).unwrap();
1078 std::fs::write(&idx_path, &idx_bytes).unwrap();
1079
1080 let err = FastaReader::open(&bin_path).unwrap_err();
1081 assert!(matches!(err, VarEffectError::Malformed(_)));
1082 }
1083
1084 // -- Multi-contig boundary ---------------------------------------------
1085
1086 #[test]
1087 fn multi_contig_boundary_no_cross_contamination() {
1088 // chr1 ends with TTTT, chr2 starts with AAAA. Fetching the last
1089 // byte of chr1 and the first byte of chr2 must not cross over.
1090 let contigs: &[(&str, &[u8])] = &[("chr1", b"GGGGTTTT"), ("chr2", b"AAAACCCC")];
1091 let (_tmp, reader) = write_test_genome(contigs);
1092
1093 // Last byte of chr1
1094 assert_eq!(reader.fetch_base("chr1", 7).unwrap(), b'T');
1095 // First byte of chr2
1096 assert_eq!(reader.fetch_base("chr2", 0).unwrap(), b'A');
1097
1098 // Range fetch at the boundary
1099 assert_eq!(reader.fetch_sequence("chr1", 4, 8).unwrap(), b"TTTT");
1100 assert_eq!(reader.fetch_sequence("chr2", 0, 4).unwrap(), b"AAAA");
1101 }
1102
1103 // -- Patch aliases -----------------------------------------------------
1104
1105 #[test]
1106 fn ncbi_fasta_rejects_ucsc_patch_without_alias_csv() {
1107 let contigs: &[(&str, &[u8])] = &[
1108 ("NC_000001.11", b"ACGTACGTACGTACGT"),
1109 ("NW_009646194.1", b"GGGGCCCC"),
1110 ];
1111 let (_tmp, reader) = write_test_genome(contigs);
1112
1113 let err = reader.fetch_base("chr1_KN196472v1_fix", 0).unwrap_err();
1114 assert!(
1115 matches!(err, VarEffectError::ChromNotFound { .. }),
1116 "expected ChromNotFound, got {err:?}",
1117 );
1118 }
1119
1120 #[test]
1121 fn ncbi_fasta_resolves_ucsc_patch_via_alias_csv() {
1122 let tmp = TempDir::new().unwrap();
1123 let bin_path = tmp.path().join("ncbi.bin");
1124 let idx_path = tmp.path().join("ncbi.bin.idx");
1125 let contigs: &[(&str, &[u8])] = &[
1126 ("NC_000001.11", b"ACGTACGTACGTACGT"),
1127 ("NW_009646194.1", b"GGGGCCCC"),
1128 ];
1129 write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
1130
1131 let csv_path = write_patch_alias_csv(&tmp);
1132 let reader = FastaReader::open_with_patch_aliases(&bin_path, Some(&csv_path)).unwrap();
1133
1134 assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
1135 assert!(reader.patch_aliases.is_some());
1136
1137 // `chr1_KN196472v1_fix` -> `NW_009646194.1` via alias.
1138 assert_eq!(reader.fetch_base("chr1_KN196472v1_fix", 0).unwrap(), b'G');
1139 assert_eq!(
1140 reader.fetch_sequence("chr1_KN196472v1_fix", 0, 4).unwrap(),
1141 b"GGGG",
1142 );
1143 }
1144
1145 #[test]
1146 fn ncbi_fasta_ignores_alias_csv_for_non_ncbi_binary() {
1147 let tmp = TempDir::new().unwrap();
1148 let bin_path = tmp.path().join("ens.bin");
1149 let idx_path = tmp.path().join("ens.bin.idx");
1150 let contigs: &[(&str, &[u8])] = &[("1", b"ACGT"), ("MT", b"NNNN")];
1151 write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
1152
1153 let csv_path = write_patch_alias_csv(&tmp);
1154 let reader = FastaReader::open_with_patch_aliases(&bin_path, Some(&csv_path)).unwrap();
1155
1156 assert_eq!(reader.naming, ContigNaming::EnsemblBare);
1157 assert!(
1158 reader.patch_aliases.is_none(),
1159 "patch aliases must not load against a non-NCBI binary",
1160 );
1161 assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
1162 }
1163
1164 // -- Builder validation ------------------------------------------------
1165
1166 #[test]
1167 fn builder_rejects_non_iupac_bytes() {
1168 let tmp = TempDir::new().unwrap();
1169 let bin_path = tmp.path().join("bad.bin");
1170 let idx_path = tmp.path().join("bad.bin.idx");
1171
1172 // 'X' is not a valid IUPAC nucleotide code.
1173 let err =
1174 write_genome_binary(&[("chr1", b"ACGTXN")], "test", &bin_path, &idx_path).unwrap_err();
1175 assert!(matches!(err, VarEffectError::Malformed(_)));
1176 }
1177
1178 #[test]
1179 fn builder_accepts_iupac_ambiguity_codes() {
1180 // The NCBI GRCh38.p14 assembly uses ambiguity codes (M, R, Y, etc.)
1181 // in some patch-scaffold regions. The builder must accept all 15
1182 // standard IUPAC nucleotide codes.
1183 let seq = b"ACGTNRYSWKMBDHV";
1184 let contigs: &[(&str, &[u8])] = &[("chr1", seq.as_slice())];
1185 let (_tmp, reader) = write_test_genome(contigs);
1186 let fetched = reader.fetch_sequence("chr1", 0, seq.len() as u64).unwrap();
1187 assert_eq!(fetched, seq.as_slice());
1188 }
1189
1190 #[test]
1191 fn builder_round_trips_all_valid_bases() {
1192 let seq = b"ACGTNNNTTTAAACCCGGG";
1193 let contigs: &[(&str, &[u8])] = &[("chr1", seq.as_slice())];
1194 let (_tmp, reader) = write_test_genome(contigs);
1195 let fetched = reader.fetch_sequence("chr1", 0, seq.len() as u64).unwrap();
1196 assert_eq!(fetched, seq.as_slice());
1197 }
1198
1199 // -- append_idx_extension ----------------------------------------------
1200
1201 #[test]
1202 fn append_idx_extension_preserves_existing_ext() {
1203 let out = append_idx_extension(Path::new("/tmp/GRCh38.bin"));
1204 assert_eq!(out, PathBuf::from("/tmp/GRCh38.bin.idx"));
1205 }
1206}